view usr/src/uts/common/fs/udfs/udf_vnops.c @ 13603:19698f035619

2061 uts homebrew offsetofs cause various pointer-cast warnings Reviewed by: Joshua M. Clulow <josh@sysmgr.org> Reviewed by: Jason King <jason.brian.king@gmail.com> Reviewed by: Darren Reed <avalon@coombs.anu.edu.au> Approved by: Garrett D'Amore <garrett@damore.org>
author Richard Lowe <richlowe@richlowe.net>
date Mon, 30 Jan 2012 19:38:22 -0500
parents 515936598736
children
line wrap: on
line source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <sys/fbuf.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/statvfs.h>
#include <sys/mount.h>
#include <sys/sunddi.h>
#include <sys/bootconf.h>
#include <sys/policy.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/page.h>
#include <sys/swap.h>

#include <fs/fs_subr.h>

#include <sys/fs/udf_volume.h>
#include <sys/fs/udf_inode.h>

static int32_t udf_open(struct vnode **,
	int32_t, struct cred *, caller_context_t *);
static int32_t udf_close(struct vnode *,
	int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
static int32_t udf_read(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_write(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_ioctl(struct vnode *,
	int32_t, intptr_t, int32_t, struct cred *, int32_t *,
	caller_context_t *);
static int32_t udf_getattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_setattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_access(struct vnode *,
	int32_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_lookup(struct vnode *,
	char *, struct vnode **, struct pathname *,
	int32_t, struct vnode *, struct cred *,
	caller_context_t *, int *, pathname_t *);
static int32_t udf_create(struct vnode *,
	char *, struct vattr *, enum vcexcl,
	int32_t, struct vnode **, struct cred *, int32_t,
	caller_context_t *, vsecattr_t *);
static int32_t udf_remove(struct vnode *,
	char *, struct cred *, caller_context_t *, int);
static int32_t udf_link(struct vnode *,
	struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_rename(struct vnode *,
	char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_mkdir(struct vnode *,
	char *, struct vattr *, struct vnode **, struct cred *,
	caller_context_t *, int, vsecattr_t *);
static int32_t udf_rmdir(struct vnode *,
	char *, struct vnode *, struct cred *, caller_context_t *, int);
static int32_t udf_readdir(struct vnode *,
	struct uio *, struct cred *, int32_t *, caller_context_t *, int);
static int32_t udf_symlink(struct vnode *,
	char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_readlink(struct vnode *,
	struct uio *, struct cred *, caller_context_t *);
static int32_t udf_fsync(struct vnode *,
	int32_t, struct cred *, caller_context_t *);
static void udf_inactive(struct vnode *,
	struct cred *, caller_context_t *);
static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
	caller_context_t *);
static int32_t udf_frlock(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
	caller_context_t *);
static int32_t udf_space(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
static int32_t udf_getpage(struct vnode *, offset_t,
	size_t, uint32_t *, struct page **, size_t,
	struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
static int32_t udf_putpage(struct vnode *, offset_t,
	size_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_map(struct vnode *, offset_t, struct as *,
	caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_l_pathconf(struct vnode *, int32_t,
	ulong_t *, struct cred *, caller_context_t *);
static int32_t udf_pageio(struct vnode *, struct page *,
	u_offset_t, size_t, int32_t, struct cred *, caller_context_t *);

int32_t ud_getpage_miss(struct vnode *, u_offset_t,
	size_t, struct seg *, caddr_t, page_t *pl[],
	size_t, enum seg_rw, int32_t);
void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
int32_t ud_page_fill(struct ud_inode *, page_t *,
	u_offset_t, uint32_t, u_offset_t *);
int32_t ud_iodone(struct buf *);
int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
int32_t ud_slave_done(struct buf *);

/*
 * Structures to control multiple IO operations to get or put pages
 * that are backed by discontiguous blocks. The master struct is
 * a dummy that holds the original bp from pageio_setup. The
 * slave struct holds the working bp's to do the actual IO. Once
 * all the slave IOs complete, the master is processed as if a single
 * IO op has completed.
 */
/*
 * NOTE(review): master_index is not referenced in the visible part of
 * this file; presumably it is a running counter used to stamp mm_index
 * for debugging -- confirm against the multi-IO code.
 */
uint32_t master_index = 0;

/*
 * Bookkeeping for one logical page IO that is split into several
 * physical IOs.  One mio_master_t describes the whole request.
 */
typedef struct mio_master {
	kmutex_t	mm_mutex;	/* protect the fields below */
	int32_t		mm_size;
	buf_t		*mm_bp;		/* original bp */
	int32_t		mm_resid;	/* bytes remaining to transfer */
	int32_t		mm_error;	/* accumulated error from slaves */
	int32_t		mm_index;	/* XXX debugging */
} mio_master_t;

/*
 * One physical IO belonging to a master request.  The buf is embedded
 * (not a pointer) and is followed by a back pointer to the master.
 */
typedef struct mio_slave {
	buf_t		ms_buf;		/* working buffer for this IO chunk */
	mio_master_t	*ms_ptr;	/* pointer to master */
} mio_slave_t;

/*
 * Vnode operations vector for UDFS.
 * NOTE(review): presumably populated from udf_vnodeops_template at
 * module initialization; that code is not in this part of the file.
 */
struct vnodeops *udf_vnodeops;

/*
 * Table binding each vnode operation name to its UDFS implementation.
 * All entries point at the static udf_* functions declared above,
 * except VOPNAME_VNEVENT which uses the generic fs_vnevent_support.
 * The table is terminated by a NULL/NULL pair.
 */
const fs_operation_def_t udf_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = udf_open },
	VOPNAME_CLOSE,		{ .vop_close = udf_close },
	VOPNAME_READ,		{ .vop_read = udf_read },
	VOPNAME_WRITE,		{ .vop_write = udf_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = udf_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = udf_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = udf_setattr },
	VOPNAME_ACCESS,		{ .vop_access = udf_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = udf_lookup },
	VOPNAME_CREATE,		{ .vop_create = udf_create },
	VOPNAME_REMOVE,		{ .vop_remove = udf_remove },
	VOPNAME_LINK,		{ .vop_link = udf_link },
	VOPNAME_RENAME,		{ .vop_rename = udf_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = udf_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = udf_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = udf_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = udf_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = udf_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = udf_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = udf_inactive },
	VOPNAME_FID,		{ .vop_fid = udf_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = udf_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = udf_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = udf_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = udf_frlock },
	VOPNAME_SPACE,		{ .vop_space = udf_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = udf_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = udf_putpage },
	VOPNAME_MAP,		{ .vop_map = udf_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = udf_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = udf_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = udf_l_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = udf_pageio },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};

/*
 * VOP_OPEN entry point.  UDFS keeps no per-open state, so this only
 * emits the debug trace and always reports success.
 */
/* ARGSUSED */
static int32_t
udf_open(struct vnode **vpp, int32_t flag, struct cred *cr,
    caller_context_t *ct)
{
	ud_printf("udf_open\n");
	return (0);
}

/*
 * VOP_CLOSE entry point.
 *
 * Brings the inode timestamps up to date, drops any record locks and
 * share reservations owned by the calling process and, when this looks
 * like the last close, starts an asynchronous push of the delayed-write
 * cluster.  Always returns 0.
 */
/* ARGSUSED */
static int32_t
udf_close(struct vnode *vp, int32_t flag, int32_t count, offset_t offset,
    struct cred *cr, caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_close\n");

	ITIMES(ip);

	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);

	/*
	 * "Last close" can only be approximated, because the dnlc may be
	 * holding its own reference on the vnode; hence the threshold of
	 * two.  Anything busier than that, or a bad vnode, needs no push.
	 */
	if (vp->v_count > 2 || vp->v_type == VBAD) {
		return (0);
	}

	/*
	 * Push the partially filled cluster, if there is one.
	 */
	if (ip->i_delaylen) {
		(void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
		    B_ASYNC | B_FREE, cr);
		ip->i_delaylen = 0;
	}

	return (0);
}

/*
 * VOP_READ entry point.
 *
 * The caller must already hold ip->i_rwlock as reader (asserted).
 * If mandatory locking applies to the file, the requested range is
 * checked with chklock() first; the data transfer itself is done by
 * ud_rdip() with i_contents held as reader.
 *
 * Returns 0 on success or the error from chklock()/ud_rdip().
 */
/* ARGSUSED */
static int32_t
udf_read(struct vnode *vp, struct uio *uiop, int32_t ioflag,
    struct cred *cr, caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t error = 0;

	ud_printf("udf_read\n");

#ifdef	__lock_lint
	rw_enter(&ip->i_rwlock, RW_READER);
#endif

	ASSERT(RW_READ_HELD(&ip->i_rwlock));

	/*
	 * chklock ends up calling udf_getattr.
	 */
	if (MANDLOCK(vp, ip->i_char)) {
		error = chklock(vp, FREAD, uiop->uio_loffset,
		    uiop->uio_resid, uiop->uio_fmode, ct);
	}

	if (error == 0) {
		rw_enter(&ip->i_contents, RW_READER);
		error = ud_rdip(ip, uiop, ioflag, cr);
		rw_exit(&ip->i_contents);
	}

#ifdef	__lock_lint
	rw_exit(&ip->i_rwlock);
#endif

	return (error);
}


/*
 * Write throttling knobs consulted by udf_write().
 */
int32_t ud_WRITES = 1;		/* non-zero enables write throttling */
int32_t ud_HW = 96 * 1024;	/* writers wait while i_writes exceeds this */
/*
 * NOTE(review): ud_LW is not referenced in the visible code; presumably
 * the low-water mark at which throttled writers are woken -- confirm.
 */
int32_t ud_LW = 64 * 1024;
int32_t ud_throttles = 0;	/* count of times a writer had to wait */

/*
 * VOP_WRITE entry point.
 *
 * The caller must already hold ip->i_rwlock as writer (asserted).
 * After an optional mandatory-lock check the writer is throttled while
 * the inode has more than ud_HW outstanding (i_writes), then the data
 * is written by ud_wrip() with i_contents held as writer.  For regular
 * files opened with FAPPEND the offset is forced to the current size.
 *
 * Returns 0 on success or the error from chklock()/ud_wrip().
 */
/* ARGSUSED */
static int32_t
udf_write(struct vnode *vp, struct uio *uiop, int32_t ioflag,
    struct cred *cr, caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t error = 0;

	ud_printf("udf_write\n");

#ifdef	__lock_lint
	rw_enter(&ip->i_rwlock, RW_WRITER);
#endif

	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));

	/*
	 * chklock ends up calling udf_getattr.
	 */
	if (MANDLOCK(vp, ip->i_char)) {
		error = chklock(vp, FWRITE, uiop->uio_loffset,
		    uiop->uio_resid, uiop->uio_fmode, ct);
	}

	if (error == 0) {
		/*
		 * Throttle: sleep on i_wrcv until the amount of
		 * outstanding write data drops to the high-water mark.
		 */
		mutex_enter(&ip->i_tlock);
		if (ud_WRITES) {
			while (ip->i_writes > ud_HW) {
				ud_throttles++;
				cv_wait(&ip->i_wrcv, &ip->i_tlock);
			}
		}
		mutex_exit(&ip->i_tlock);

		/*
		 * Do the write; appends to regular files always start
		 * at the current end of file.
		 */
		rw_enter(&ip->i_contents, RW_WRITER);
		if ((ip->i_type == VREG) && (ioflag & FAPPEND) != 0) {
			uiop->uio_loffset = ip->i_size;
		}
		error = ud_wrip(ip, uiop, ioflag, cr);
		rw_exit(&ip->i_contents);
	}

#ifdef	__lock_lint
	rw_exit(&ip->i_rwlock);
#endif

	return (error);
}

/*
 * VOP_IOCTL entry point.  No ioctls are implemented for UDFS files, so
 * every request is rejected with ENOTTY.
 */
/* ARGSUSED */
static int32_t
udf_ioctl(struct vnode *vp, int32_t cmd, intptr_t arg, int32_t flag,
    struct cred *cr, int32_t *rvalp, caller_context_t *ct)
{
	return (ENOTTY);
}

/*
 * VOP_GETATTR entry point: fill *vap from the in-core inode.
 *
 * When the caller asks for nothing but the size (va_mask == AT_SIZE)
 * only va_size is filled in, without taking any lock.  Otherwise all
 * attributes are copied with i_contents held as reader, and the time
 * fields are refreshed and copied under i_tlock.  Always returns 0.
 */
/* ARGSUSED */
static int32_t
udf_getattr(struct vnode *vp, struct vattr *vap, int32_t flags,
    struct cred *cr, caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_getattr\n");

	/*
	 * Performance shortcut for size-only queries.
	 */
	if (vap->va_mask == AT_SIZE) {
		vap->va_size = ip->i_size;
		return (0);
	}

	rw_enter(&ip->i_contents, RW_READER);

	vap->va_type = vp->v_type;
	vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;

	vap->va_uid = ip->i_uid;
	vap->va_gid = ip->i_gid;
	vap->va_fsid = ip->i_dev;
	vap->va_nodeid = ip->i_icb_lbano;
	vap->va_nlink = ip->i_nlink;
	vap->va_size = ip->i_size;
	vap->va_seq = ip->i_seq;

	/*
	 * Only device nodes carry a meaningful rdev.
	 */
	vap->va_rdev = (vp->v_type == VCHR || vp->v_type == VBLK) ?
	    ip->i_rdev : 0;

	/*
	 * Bring the inode times up to date before copying them out.
	 */
	mutex_enter(&ip->i_tlock);
	ITIMES_NOLOCK(ip);
	vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
	vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
	vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
	vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
	vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
	vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
	mutex_exit(&ip->i_tlock);

	/*
	 * Devices report MAXBSIZE, everything else the logical block
	 * size of the file system.
	 */
	if (ip->i_type == VBLK || ip->i_type == VCHR) {
		vap->va_blksize = MAXBSIZE;
	} else {
		vap->va_blksize = ip->i_udf->udf_lbsize;
	}
	vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;

	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Access-check callback handed to secpolicy_vnode_setattr() by
 * udf_setattr().  Converts the vnode-style mode bits to the on-disk
 * permission form and forwards to ud_iaccess() with its last argument 0.
 */
static int
ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
{
	return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 0));
}

/*
 * VOP_SETATTR entry point: change mode, ownership, size and/or times
 * of a file according to vap->va_mask.
 *
 * Returns EINVAL for files using allocation strategy 4096 and when the
 * mask contains attributes that cannot be set (AT_NOSET).  Otherwise
 * the request is vetted by secpolicy_vnode_setattr() and applied with
 * both i_rwlock and i_contents held as writer.  Whatever the outcome
 * after the locks are taken, control reaches update_inode, which either
 * writes the inode synchronously (T_DONTPEND) or just refreshes its
 * timestamps, and then drops the locks.
 */
/*ARGSUSED4*/
static int32_t
udf_setattr(
	struct vnode *vp,
	struct vattr *vap,
	int32_t flags,
	struct cred *cr,
	caller_context_t *ct)
{
	int32_t error = 0;
	uint32_t mask = vap->va_mask;
	struct ud_inode *ip;
	timestruc_t now;
	struct vattr ovap;

	ud_printf("udf_setattr\n");

	ip = VTOI(vp);

	/*
	 * no updates allowed to strategy 4096 files
	 */
	if (ip->i_astrat == STRAT_TYPE4096) {
		return (EINVAL);
	}

	/*
	 * Cannot set these attributes
	 */
	if (mask & AT_NOSET) {
		return (EINVAL);
	}

	rw_enter(&ip->i_rwlock, RW_WRITER);
	rw_enter(&ip->i_contents, RW_WRITER);

	/* current owner and mode, needed by the policy check */
	ovap.va_uid = ip->i_uid;
	ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
	error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
	    ud_iaccess_vmode, ip);
	if (error)
		goto update_inode;

	/* re-read the mask after the policy check has seen vap */
	mask = vap->va_mask;
	/*
	 * Change file access modes.
	 */
	if (mask & AT_MODE) {
		ip->i_perm = VA2UD_PERM(vap->va_mode);
		ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
		mutex_enter(&ip->i_tlock);
		ip->i_flag |= ICHG;
		mutex_exit(&ip->i_tlock);
	}
	if (mask & (AT_UID|AT_GID)) {
		if (mask & AT_UID) {
			ip->i_uid = vap->va_uid;
		}
		if (mask & AT_GID) {
			ip->i_gid = vap->va_gid;
		}
		mutex_enter(&ip->i_tlock);
		ip->i_flag |= ICHG;
		mutex_exit(&ip->i_tlock);
	}
	/*
	 * Truncate file.  Must have write permission and not be a directory.
	 */
	if (mask & AT_SIZE) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto update_inode;
		}
		if (error = ud_iaccess(ip, IWRITE, cr, 0)) {
			goto update_inode;
		}
		if (vap->va_size > MAXOFFSET_T) {
			error = EFBIG;
			goto update_inode;
		}
		if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
			goto update_inode;
		}
	}
	/*
	 * Change file access or modified times.
	 */
	if (mask & (AT_ATIME|AT_MTIME)) {
		mutex_enter(&ip->i_tlock);
		if (mask & AT_ATIME) {
			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
			ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
			/* explicit atime: cancel any pending access mark */
			ip->i_flag &= ~IACC;
		}
		if (mask & AT_MTIME) {
			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
			ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
			/* setting mtime also stamps ctime with "now" */
			gethrestime(&now);
			ip->i_ctime.tv_sec = now.tv_sec;
			ip->i_ctime.tv_nsec = now.tv_nsec;
			ip->i_flag &= ~(IUPD|ICHG);
			ip->i_flag |= IMODTIME;
		}
		ip->i_flag |= IMOD;
		mutex_exit(&ip->i_tlock);
	}

update_inode:
	/* reached on success and on every failure after the locks are held */
	if (curthread->t_flag & T_DONTPEND) {
		ud_iupdat(ip, 1);
	} else {
		ITIMES_NOLOCK(ip);
	}
	rw_exit(&ip->i_contents);
	rw_exit(&ip->i_rwlock);

	return (error);
}

/*
 * VOP_ACCESS entry point.
 *
 * Returns EIO when the inode has no file system attached (i_udf is
 * NULL); otherwise translates the requested mode to on-disk permission
 * bits and returns the verdict of ud_iaccess().
 */
/* ARGSUSED */
static int32_t
udf_access(struct vnode *vp, int32_t mode, int32_t flags, struct cred *cr,
    caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_access\n");

	if (ip->i_udf != NULL) {
		return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1));
	}

	return (EIO);
}

/*
 * When non-zero, udf_lookup() sets VISSWAP on non-directory files that
 * have the sticky bit set and no execute permission.
 */
int32_t udfs_stickyhack = 1;

/*
 * VOP_LOOKUP entry point: look up component nm in directory dvp.
 *
 * On success *vpp is a held vnode for the entry and 0 is returned.
 * An empty name yields dvp itself.  The DNLC is consulted first (with
 * an explicit search-permission check on the directory); on a miss the
 * directory is searched with ud_dirlook().  Device nodes are replaced
 * by the vnode obtained from specvp().
 *
 * Returns 0, the error from ud_iaccess()/ud_dirlook(), or ENOSYS when
 * specvp() fails.
 */
/* ARGSUSED */
static int32_t
udf_lookup(
	struct vnode *dvp,
	char *nm,
	struct vnode **vpp,
	struct pathname *pnp,
	int32_t flags,
	struct vnode *rdir,
	struct cred *cr,
	caller_context_t *ct,
	int *direntflags,
	pathname_t *realpnp)
{
	int32_t error;
	struct vnode *vp;
	struct ud_inode *ip, *xip = NULL;

	ud_printf("udf_lookup\n");
	/*
	 * Null component name is a synonym for directory being searched.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		error = 0;
		goto out;
	}

	/*
	 * Fast path: Check the directory name lookup cache.
	 */
	ip = VTOI(dvp);
	if ((vp = dnlc_lookup(dvp, nm)) != NULL) {
		/*
		 * Check accessibility of directory.  Once the hold from
		 * dnlc_lookup() has been released the vnode must not be
		 * touched again, so the inode is only derived from it
		 * when the access check succeeds.
		 */
		if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) {
			VN_RELE(vp);
		} else {
			xip = VTOI(vp);
		}
	} else {
		error = ud_dirlook(ip, nm, &xip, cr, 1);
		ITIMES(ip);
	}

	if (error == 0) {
		ip = xip;
		*vpp = ITOV(ip);
		/*
		 * Sticky, non-executable, non-directory files are flagged
		 * VISSWAP when udfs_stickyhack is enabled.
		 */
		if ((ip->i_type != VDIR) &&
		    (ip->i_char & ISVTX) &&
		    ((ip->i_perm & IEXEC) == 0) &&
		    udfs_stickyhack) {
			mutex_enter(&(*vpp)->v_lock);
			(*vpp)->v_flag |= VISSWAP;
			mutex_exit(&(*vpp)->v_lock);
		}
		ITIMES(ip);
		/*
		 * If vnode is a device return special vnode instead.
		 */
		if (IS_DEVVP(*vpp)) {
			struct vnode *newvp;
			newvp = specvp(*vpp, (*vpp)->v_rdev,
			    (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (newvp == NULL) {
				error = ENOSYS;
			} else {
				*vpp = newvp;
			}
		}
	}
out:
	return (error);
}

/*
 * VOP_CREATE entry point: create (or open an existing) file called
 * name in directory dvp.
 *
 * The sticky bit is silently dropped from the requested mode unless
 * secpolicy_vnode_stky_modify() permits it.  An empty name refers to
 * the directory itself and is handled like an already existing entry.
 * Otherwise ud_direnter(DE_CREATE) is called under the directory's
 * i_rwlock.  For EEXIST with a non-exclusive create the requested
 * access is checked and, if a zero-size truncation of a regular file
 * was requested, the file is truncated.
 *
 * On success *vpp is the new or existing vnode (the specvp() vnode for
 * device nodes) and 0 is returned.
 */
/* ARGSUSED */
static int32_t
udf_create(
	struct vnode *dvp,
	char *name,
	struct vattr *vap,
	enum vcexcl excl,
	int32_t mode,
	struct vnode **vpp,
	struct cred *cr,
	int32_t flag,
	caller_context_t *ct,
	vsecattr_t *vsecp)
{
	int32_t error;
	struct ud_inode *ip = VTOI(dvp), *xip;

	ud_printf("udf_create\n");

	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		ITIMES(ip);
		error = EEXIST;
	} else {
		xip = NULL;
		rw_enter(&ip->i_rwlock, RW_WRITER);
		error = ud_direnter(ip, name, DE_CREATE,
		    (struct ud_inode *)0, (struct ud_inode *)0,
		    vap, &xip, cr, ct);
		rw_exit(&ip->i_rwlock);
		ITIMES(ip);
		/* from here on ip is the target inode; it stays NULL if none was returned */
		ip = xip;
	}
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, RW_WRITER);
#else
	if (ip != NULL) {
		rw_enter(&ip->i_contents, RW_WRITER);
	}
#endif

	/*
	 * If the file already exists and this is a non-exclusive create,
	 * check permissions and allow access for non-directories.
	 * Read-only create of an existing directory is also allowed.
	 * We fail an exclusive create of anything which already exists.
	 */
	if (error == EEXIST) {
		if (excl == NONEXCL) {
			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
				error = EISDIR;
			} else if (mode) {
				error = ud_iaccess(ip,
				    UD_UPERM2DPERM(mode), cr, 0);
			} else {
				error = 0;
			}
		}
		if (error) {
			/* drop the lock and the hold on the existing inode */
			rw_exit(&ip->i_contents);
			VN_RELE(ITOV(ip));
			goto out;
		} else if ((ip->i_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
			/*
			 * Truncate regular files, if requested by caller.
			 * Grab i_rwlock to make sure no one else is
			 * currently writing to the file (we promised
			 * bmap we would do this).
			 * Must get the locks in the correct order.
			 */
			if (ip->i_size == 0) {
				ip->i_flag |= ICHG | IUPD;
			} else {
				/* i_rwlock is taken before i_contents */
				rw_exit(&ip->i_contents);
				rw_enter(&ip->i_rwlock, RW_WRITER);
				rw_enter(&ip->i_contents, RW_WRITER);
				(void) ud_itrunc(ip, 0, 0, cr);
				rw_exit(&ip->i_rwlock);
			}
			vnevent_create(ITOV(ip), ct);
		}
	}

	if (error == 0) {
		*vpp = ITOV(ip);
		ITIMES(ip);
	}
#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#else
	if (ip != NULL) {
		rw_exit(&ip->i_contents);
	}
#endif
	if (error) {
		goto out;
	}

	/*
	 * If vnode is a device return special vnode instead.
	 */
	if (!error && IS_DEVVP(*vpp)) {
		struct vnode *newvp;

		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (newvp == NULL) {
			error = ENOSYS;
			goto out;
		}
		*vpp = newvp;
	}
out:
	return (error);
}

/*
 * VOP_REMOVE entry point: remove the entry nm from directory vp.
 *
 * The work is done by ud_dirremove(DR_REMOVE) with the directory's
 * i_rwlock held as writer; the directory timestamps are refreshed
 * afterwards.  Returns the result of ud_dirremove().
 */
/* ARGSUSED */
static int32_t
udf_remove(struct vnode *vp, char *nm, struct cred *cr,
    caller_context_t *ct, int flags)
{
	struct ud_inode *dir_ip = VTOI(vp);
	int32_t rc;

	ud_printf("udf_remove\n");

	rw_enter(&dir_ip->i_rwlock, RW_WRITER);
	rc = ud_dirremove(dir_ip, nm, (struct ud_inode *)0,
	    (struct vnode *)0, DR_REMOVE, cr, ct);
	rw_exit(&dir_ip->i_rwlock);

	ITIMES(dir_ip);

	return (rc);
}

/*
 * VOP_LINK entry point: create a hard link named tnm in directory tdvp
 * that refers to the file svp.
 *
 * Returns EPERM for directories and when the caller neither owns the
 * source file nor passes secpolicy_basic_link().  Otherwise returns the
 * result of ud_direnter(DE_LINK); on success a link vnode event is
 * posted for the source.
 */
/* ARGSUSED */
static int32_t
udf_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr,
    caller_context_t *ct, int flags)
{
	struct vnode *realvp;
	struct ud_inode *src_ip;
	struct ud_inode *tdir_ip;
	int32_t rc;

	ud_printf("udf_link\n");

	/*
	 * Operate on the underlying vnode when there is one.
	 */
	if (VOP_REALVP(svp, &realvp, ct) == 0) {
		svp = realvp;
	}

	/*
	 * Hard links to directories are refused.
	 */
	if (svp->v_type == VDIR) {
		return (EPERM);
	}

	/*
	 * Linking somebody else's file needs privilege.
	 */
	src_ip = VTOI(svp);
	if (src_ip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
		return (EPERM);
	}

	tdir_ip = VTOI(tdvp);

	rw_enter(&tdir_ip->i_rwlock, RW_WRITER);
	rc = ud_direnter(tdir_ip, tnm, DE_LINK, (struct ud_inode *)0,
	    src_ip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
	rw_exit(&tdir_ip->i_rwlock);

	ITIMES(src_ip);
	ITIMES(tdir_ip);

	if (rc == 0) {
		vnevent_link(svp, ct);
	}

	return (rc);
}

/*
 * VOP_RENAME entry point: rename snm in directory sdvp to tnm in
 * directory tdvp.
 *
 * The whole operation is serialized by the per-file-system
 * udf_rename_lck mutex.  The source is looked up (which returns it
 * held), permission to remove it is verified, the new name is entered
 * in the target directory with ud_direnter(DE_RENAME), and finally the
 * old name is removed with ud_dirremove(DR_RENAME).
 *
 * Returns 0 on success (also when ud_direnter() reports ESAME or
 * ud_dirremove() reports ENOENT), EBUSY if something is mounted on the
 * source, EINVAL for attempts to rename "." or "..", or the error from
 * the lookup, access checks or directory operations.
 */
/* ARGSUSED */
static int32_t
udf_rename(
	struct vnode *sdvp,
	char *snm,
	struct vnode *tdvp,
	char *tnm,
	struct cred *cr,
	caller_context_t *ct,
	int flags)
{
	int32_t error = 0;
	struct udf_vfs *udf_vfsp;
	struct ud_inode *sip;		/* source inode */
	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
	struct vnode *realvp;

	ud_printf("udf_rename\n");

	if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
		tdvp = realvp;
	}

	sdp = VTOI(sdvp);
	tdp = VTOI(tdvp);

	udf_vfsp = sdp->i_udf;

	mutex_enter(&udf_vfsp->udf_rename_lck);
	/*
	 * Look up inode of file we're supposed to rename.
	 */
	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
		/* no source inode is held yet, so bypass errout */
		mutex_exit(&udf_vfsp->udf_rename_lck);
		return (error);
	}
	/*
	 * be sure this is not a directory with another file system mounted
	 * over it.  If it is just give up the locks, and return with
	 * EBUSY
	 */
	if (vn_mountedvfs(ITOV(sip)) != NULL) {
		error = EBUSY;
		goto errout;
	}
	/*
	 * Make sure we can delete the source entry.  This requires
	 * write permission on the containing directory.  If that
	 * directory is "sticky" it further requires (except for
	 * privileged users) that the user own the directory or the
	 * source entry, or else have permission to write the source
	 * entry.
	 */
	rw_enter(&sdp->i_contents, RW_READER);
	rw_enter(&sip->i_contents, RW_READER);
	if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		ITIMES(sip);
		goto errout;
	}

	/*
	 * Check for renaming '.' or '..' or alias of '.'
	 */
	if ((strcmp(snm, ".") == 0) ||
	    (strcmp(snm, "..") == 0) ||
	    (sdp == sip)) {
		error = EINVAL;
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		goto errout;
	}
	rw_exit(&sip->i_contents);
	rw_exit(&sdp->i_contents);


	/*
	 * Link source to the target.
	 */
	rw_enter(&tdp->i_rwlock, RW_WRITER);
	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
	    (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
		/*
		 * ESAME isn't really an error; it indicates that the
		 * operation should not be done because the source and target
		 * are the same file, but that no error should be reported.
		 */
		if (error == ESAME) {
			error = 0;
		}
		rw_exit(&tdp->i_rwlock);
		goto errout;
	}
	vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
	rw_exit(&tdp->i_rwlock);

	rw_enter(&sdp->i_rwlock, RW_WRITER);
	/*
	 * Unlink the source.
	 * Remove the source entry.  ud_dirremove() checks that the entry
	 * still reflects sip, and returns an error if it doesn't.
	 * If the entry has changed just forget about it.  Release
	 * the source inode.
	 */
	if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
	    DR_RENAME, cr, ct)) == ENOENT) {
		error = 0;
	}
	rw_exit(&sdp->i_rwlock);
errout:
	/* common exit once sip is held: refresh times, drop hold and mutex */
	ITIMES(sdp);
	ITIMES(tdp);
	VN_RELE(ITOV(sip));
	mutex_exit(&udf_vfsp->udf_rename_lck);

	return (error);
}

/*
 * VOP_MKDIR entry point: create directory dirname inside dvp.
 *
 * The caller must supply both AT_TYPE and AT_MODE in vap (asserted).
 * The entry is made by ud_direnter(DE_MKDIR) under the parent's
 * i_rwlock.  On success *vpp is the new directory's vnode.  If the
 * name already exists the inode handed back by ud_direnter() is
 * released again and EEXIST is returned.
 */
/* ARGSUSED */
static int32_t
udf_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap,
    struct vnode **vpp, struct cred *cr, caller_context_t *ct, int flags,
    vsecattr_t *vsecp)
{
	struct ud_inode *parent = VTOI(dvp);
	struct ud_inode *xip;
	int32_t error;

	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	ud_printf("udf_mkdir\n");

	rw_enter(&parent->i_rwlock, RW_WRITER);
	error = ud_direnter(parent, dirname, DE_MKDIR,
	    (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct);
	rw_exit(&parent->i_rwlock);
	ITIMES(parent);

	if (error == EEXIST) {
		/*
		 * Name is taken: let go of the existing inode.
		 */
		ITIMES(xip);
		VN_RELE(ITOV(xip));
	} else if (error == 0) {
		*vpp = ITOV(xip);
		ITIMES(xip);
	}

	return (error);
}

/*
 * VOP_RMDIR entry point: remove directory nm from directory vp.
 *
 * The work is done by ud_dirremove(DR_RMDIR), which is also handed the
 * caller's current directory (cdir), with the parent's i_rwlock held as
 * writer.  Returns the result of ud_dirremove().
 */
/* ARGSUSED */
static int32_t
udf_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr,
    caller_context_t *ct, int flags)
{
	struct ud_inode *parent = VTOI(vp);
	int32_t rc;

	ud_printf("udf_rmdir\n");

	rw_enter(&parent->i_rwlock, RW_WRITER);
	rc = ud_dirremove(parent, nm, (struct ud_inode *)0, cdir, DR_RMDIR,
	    cr, ct);
	rw_exit(&parent->i_rwlock);

	ITIMES(parent);

	return (rc);
}

/*
 * VOP_READDIR entry point: return directory entries as dirent64
 * records.
 *
 * Entries are assembled in a kernel buffer the size of the first iovec
 * and copied out with a single uiomove().  uio_offset is used as the
 * byte offset of the next file identifier (FID) in the directory, with
 * one exception: "." has no FID of its own, so it is synthesized when
 * the offset is 0 and given the cookie 0x10; a later call with offset
 * 0x10 restarts the FID scan at 0 without emitting "." again.
 *
 * Returns 0 on success (with *eofp set if the end of the directory was
 * reached), EINVAL if the buffer cannot hold "." or ".." or cannot hold
 * a named entry when nothing has been emitted yet, or the error from
 * ud_get_next_fid(), ud_uncompress() or uiomove().
 */
/* ARGSUSED */
static int32_t
udf_readdir(
	struct vnode *vp,
	struct uio *uiop,
	struct cred *cr,
	int32_t *eofp,
	caller_context_t *ct,
	int flags)
{
	struct ud_inode *ip;
	struct dirent64 *nd;
	struct udf_vfs *udf_vfsp;
	int32_t error = 0, len, outcount = 0;
	uint32_t dirsiz, offset;
	uint32_t bufsize, ndlen, dummy;
	caddr_t outbuf;
	caddr_t outb, end_outb;
	struct iovec *iovp;

	uint8_t *dname;
	int32_t length;

	uint8_t *buf = NULL;

	struct fbuf *fbp = NULL;
	struct file_id *fid;
	uint8_t *name;


	ud_printf("udf_readdir\n");

	ip = VTOI(vp);
	udf_vfsp = ip->i_udf;

	/* at or past the end, or directory already unlinked: report EOF */
	dirsiz = ip->i_size;
	if ((uiop->uio_offset >= dirsiz) ||
	    (ip->i_nlink <= 0)) {
		if (eofp) {
			*eofp = 1;
		}
		return (0);
	}

	offset = uiop->uio_offset;
	iovp = uiop->uio_iov;
	bufsize = iovp->iov_len;

	/* staging buffer for the dirent64 records; nd is the write cursor */
	outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
	end_outb = outb + bufsize;
	nd = (struct dirent64 *)outbuf;

	/* dname receives uncompressed names, buf is scratch for FIDs */
	dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
	buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);

	if (offset == 0) {
		/* synthesize "." with the special continuation cookie 0x10 */
		len = DIRENT64_RECLEN(1);
		if (((caddr_t)nd + len) >= end_outb) {
			error = EINVAL;
			goto end;
		}
		nd->d_ino = ip->i_icb_lbano;
		nd->d_reclen = (uint16_t)len;
		nd->d_off = 0x10;
		nd->d_name[0] = '.';
		bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
		nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
		outcount++;
	} else if (offset == 0x10) {
		/* "." was already returned; start scanning FIDs at 0 */
		offset = 0;
	}

	while (offset < dirsiz) {
		error = ud_get_next_fid(ip, &fbp,
		    offset, &fid, &name, buf);
		if (error != 0) {
			break;
		}

		/* deleted entries are skipped but still advance the offset */
		if ((fid->fid_flags & FID_DELETED) == 0) {
			if (fid->fid_flags & FID_PARENT) {

				/* the parent FID is reported as ".." */
				len = DIRENT64_RECLEN(2);
				if (((caddr_t)nd + len) >= end_outb) {
					error = EINVAL;
					break;
				}

				nd->d_ino = ip->i_icb_lbano;
				nd->d_reclen = (uint16_t)len;
				nd->d_off = offset + FID_LEN(fid);
				nd->d_name[0] = '.';
				nd->d_name[1] = '.';
				bzero(&nd->d_name[2],
				    DIRENT64_NAMELEN(len) - 2);
				nd = (struct dirent64 *)
				    ((char *)nd + nd->d_reclen);
			} else {
				if ((error = ud_uncompress(fid->fid_idlen,
				    &length, name, dname)) != 0) {
					break;
				}
				/* entries whose name uncompresses to nothing are skipped */
				if (length == 0) {
					offset += FID_LEN(fid);
					continue;
				}
				len = DIRENT64_RECLEN(length);
				if (((caddr_t)nd + len) >= end_outb) {
					/*
					 * Buffer full.  Only an error if
					 * not even one entry fit.
					 */
					if (!outcount) {
						error = EINVAL;
					}
					break;
				}
				(void) strncpy(nd->d_name,
				    (caddr_t)dname, length);
				bzero(&nd->d_name[length],
				    DIRENT64_NAMELEN(len) - length);
				nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
				    SWAP_16(fid->fid_icb.lad_ext_prn),
				    SWAP_32(fid->fid_icb.lad_ext_loc), 1,
				    &dummy);
				nd->d_reclen = (uint16_t)len;
				nd->d_off = offset + FID_LEN(fid);
				nd = (struct dirent64 *)
				    ((char *)nd + nd->d_reclen);
			}
			outcount++;
		}

		offset += FID_LEN(fid);
	}

end:
	if (fbp != NULL) {
		fbrelse(fbp, S_OTHER);
	}
	/* number of bytes of dirent64 records assembled so far */
	ndlen = ((char *)nd - outbuf);
	/*
	 * In case of error do not call uiomove.
	 * Return the error to the caller.
	 */
	if ((error == 0) && (ndlen != 0)) {
		error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
		/* replace the byte count uiomove added with the directory cookie */
		uiop->uio_offset = offset;
	}
	kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
	kmem_free((caddr_t)dname, 1024);
	kmem_free(outbuf, (uint32_t)bufsize);
	if (eofp && error == 0) {
		*eofp = (uiop->uio_offset >= dirsiz);
	}
	return (error);
}

/*
 * VOP_SYMLINK entry point: create symbolic link linkname in directory
 * dvp pointing at target.
 *
 * A new VLNK inode is created with ud_direnter(DE_CREATE).  The target
 * path is then translated, component by component, into a sequence of
 * path_comp records (4 byte header followed by the identifier) which
 * is written as the file's data:
 *	type 2	leading "/"
 *	type 3	".."
 *	type 4	"."
 *	type 5	a name, stored in the form produced by ud_compress()
 * If the translation or the write fails the new inode is dropped and
 * the directory entry removed again.
 *
 * Returns 0 on success or the error from ud_direnter(), ud_compress()
 * or ud_rdwri().
 */
/* ARGSUSED */
static int32_t
udf_symlink(
	struct vnode *dvp,
	char *linkname,
	struct vattr *vap,
	char *target,
	struct cred *cr,
	caller_context_t *ct,
	int flags)
{
	int32_t error = 0, outlen;
	uint32_t ioflag = 0;
	struct ud_inode *ip, *dip = VTOI(dvp);

	struct path_comp *pc;
	int8_t *dname = NULL, *uname = NULL, *sp;

	ud_printf("udf_symlink\n");

	ip = (struct ud_inode *)0;
	vap->va_type = VLNK;
	vap->va_rdev = 0;

	rw_enter(&dip->i_rwlock, RW_WRITER);
	error = ud_direnter(dip, linkname, DE_CREATE,
	    (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct);
	rw_exit(&dip->i_rwlock);
	if (error == 0) {
		/*
		 * dname is scratch space for one compressed name, uname
		 * accumulates the path_comp records (zero filled, so
		 * header fields not set explicitly below remain 0).
		 */
		dname = kmem_zalloc(1024, KM_SLEEP);
		uname = kmem_zalloc(PAGESIZE, KM_SLEEP);

		pc = (struct path_comp *)uname;
		/*
		 * If the first character in target is "/"
		 * then skip it and create entry for it
		 */
		if (*target == '/') {
			pc->pc_type = 2;
			pc->pc_len = 0;
			pc = (struct path_comp *)(((char *)pc) + 4);
			while (*target == '/') {
				target++;
			}
		}

		/*
		 * target is a character string, so its end is detected
		 * by comparing against '\0' (not the pointer constant
		 * NULL).
		 */
		while (*target != '\0') {
			sp = target;
			while ((*target != '/') && (*target != '\0')) {
				target ++;
			}
			/*
			 * We got the next component of the
			 * path name. Create path_comp of
			 * appropriate type
			 */
			if (((target - sp) == 1) && (*sp == '.')) {
				/*
				 * Dot entry.
				 */
				pc->pc_type = 4;
				pc = (struct path_comp *)(((char *)pc) + 4);
			} else if (((target - sp) == 2) &&
			    (*sp == '.') && ((*(sp + 1)) == '.')) {
				/*
				 * DotDot entry.
				 */
				pc->pc_type = 3;
				pc = (struct path_comp *)(((char *)pc) + 4);
			} else {
				/*
				 * convert the user given name
				 * into appropriate form to be put
				 * on the media
				 */
				outlen = 1024;	/* set to size of dname */
				if (error = ud_compress(target - sp, &outlen,
				    (uint8_t *)sp, (uint8_t *)dname)) {
					break;
				}
				pc->pc_type = 5;
				/* LINTED */
				pc->pc_len = outlen;
				dname[outlen] = '\0';
				(void) strcpy((char *)pc->pc_id, dname);
				pc = (struct path_comp *)
				    (((char *)pc) + 4 + outlen);
			}
			/* collapse runs of separators between components */
			while (*target == '/') {
				target++;
			}
			if (*target == '\0') {
				break;
			}
		}

		rw_enter(&ip->i_contents, RW_WRITER);
		if (error == 0) {
			ioflag = FWRITE;
			if (curthread->t_flag & T_DONTPEND) {
				ioflag |= FDSYNC;
			}
			/* the link body is everything between uname and pc */
			error = ud_rdwri(UIO_WRITE, ioflag, ip,
			    uname, ((int8_t *)pc) - uname,
			    (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
		}
		if (error) {
			/*
			 * Undo: drop the new inode and remove the
			 * directory entry created above.
			 */
			ud_idrop(ip);
			rw_exit(&ip->i_contents);
			rw_enter(&dip->i_rwlock, RW_WRITER);
			(void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
			    (struct vnode *)0, DR_REMOVE, cr, ct);
			rw_exit(&dip->i_rwlock);
			goto update_inode;
		}
		rw_exit(&ip->i_contents);
	}

	if ((error == 0) || (error == EEXIST)) {
		VN_RELE(ITOV(ip));
	}

update_inode:
	ITIMES(VTOI(dvp));
	if (uname != NULL) {
		kmem_free(uname, PAGESIZE);
	}
	if (dname != NULL) {
		kmem_free(dname, 1024);
	}

	return (error);
}

/*
 * VOP_READLINK entry point: return the target of symbolic link vp.
 *
 * The link body is a sequence of path_comp records (4 byte header plus
 * pc_len bytes of identifier).  Each record is converted back to text
 * and appended to uname, followed by a "/":
 *	type 1	the mount point of this file system (udf_fsmnt)
 *	type 2	"/"
 *	type 3	".."
 *	type 4	"."
 *	type 5	a name, expanded with ud_uncompress()
 * The trailing "/" is not copied out, except when the whole target is
 * just "/".
 *
 * Returns EINVAL if vp is not a symlink or an unknown component type is
 * found, EIO if the link body is larger than a page, otherwise 0 or the
 * error from fbread(), ud_uncompress() or uiomove().
 */
/* ARGSUSED */
static int32_t
udf_readlink(
	struct vnode *vp,
	struct uio *uiop,
	struct cred *cr,
	caller_context_t *ct)
{
	int32_t error = 0, off, id_len, size, len;
	int8_t *dname = NULL, *uname = NULL;
	struct ud_inode *ip;
	struct fbuf *fbp = NULL;
	struct path_comp *pc;

	ud_printf("udf_readlink\n");

	if (vp->v_type != VLNK) {
		return (EINVAL);
	}

	ip = VTOI(vp);
	size = ip->i_size;
	if (size > PAGESIZE) {
		return (EIO);
	}

	if (size == 0) {
		return (0);
	}

	/* dname holds one uncompressed name, uname the assembled path */
	dname = kmem_zalloc(1024, KM_SLEEP);
	uname = kmem_zalloc(PAGESIZE, KM_SLEEP);

	rw_enter(&ip->i_contents, RW_READER);

	if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
		goto end;
	}

	off = 0;

	while (off < size) {
		pc = (struct path_comp *)(fbp->fb_addr + off);
		switch (pc->pc_type) {
			case 1 :
				(void) strcpy(uname, ip->i_udf->udf_fsmnt);
				(void) strcat(uname, "/");
				break;
			case 2 :
				/*
				 * NOTE(review): a root component with a
				 * non-zero length ends the call with
				 * error still 0 and nothing copied out;
				 * confirm whether EINVAL was intended.
				 */
				if (pc->pc_len != 0) {
					goto end;
				}
				uname[0] = '/';
				uname[1] = '\0';
				break;
			case 3 :
				(void) strcat(uname, "../");
				break;
			case 4 :
				(void) strcat(uname, "./");
				break;
			case 5 :
				/*
				 * A failure here must abandon the whole
				 * operation.  A plain "break" would only
				 * leave the switch, the loop would carry
				 * on and the error would be overwritten
				 * further down.
				 */
				if ((error = ud_uncompress(pc->pc_len, &id_len,
				    pc->pc_id, (uint8_t *)dname)) != 0) {
					goto end;
				}
				dname[id_len] = '\0';
				(void) strcat(uname, dname);
				(void) strcat(uname, "/");
				break;
			default :
				error = EINVAL;
				goto end;
		}
		off += 4 + pc->pc_len;
	}

	/*
	 * Every component handled above leaves a trailing '/' in uname;
	 * len indexes that last character and is also the length of the
	 * path without it.
	 */
	len = strlen(uname) - 1;
	if (uname[len] == '/') {
		if (len == 0) {
			/*
			 * special case link to /
			 */
			len = 1;
		} else {
			uname[len] = '\0';
		}
	}

	error = uiomove(uname, len, UIO_READ, uiop);

	ITIMES(ip);

end:
	if (fbp != NULL) {
		fbrelse(fbp, S_OTHER);
	}
	rw_exit(&ip->i_contents);
	if (uname != NULL) {
		kmem_free(uname, PAGESIZE);
	}
	if (dname != NULL) {
		kmem_free(dname, 1024);
	}
	return (error);
}

/*
 * Flush a file: with i_contents held as writer, push the inode's pages
 * with ud_syncip() (skipped for swap vnodes) and, if that succeeded,
 * call ud_sync_indir().  The first failure is returned.
 */
/* ARGSUSED */
static int32_t
udf_fsync(
	struct vnode *vp,
	int32_t syncflag,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t err;

	ud_printf("udf_fsync\n");

	rw_enter(&ip->i_contents, RW_WRITER);

	/* Do synchronous writes, except for swap vnodes */
	err = (IS_SWAPVP(vp)) ? 0 : ud_syncip(ip, 0, I_SYNC);
	if (err == 0) {
		err = ud_sync_indir(ip);
	}

	ITIMES(ip);		/* XXX: is this necessary ??? */
	rw_exit(&ip->i_contents);

	return (err);
}

/*
 * Inactive entry point: hand the vnode's inode to ud_iinactive().
 */
/* ARGSUSED */
static void
udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_iinactive\n");

	ud_iinactive(ip, cr);
}

/*
 * Fill in a file identifier for vp in the caller supplied fid buffer.
 *
 * The required length is sizeof (struct udf_fid) less one uint16_t.
 * When the caller's fid_len is smaller than that, fid_len is set to the
 * required length and ENOSPC is returned.  Otherwise the buffer is
 * zeroed and loaded from the inode (low 32 bits of i_uniqid, i_icb_prn
 * and i_icb_block) under the i_contents reader lock, and 0 is returned.
 */
/* ARGSUSED */
static int32_t
udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	struct udf_fid *ufid = (struct udf_fid *)fidp;

	ud_printf("udf_fid\n");

	if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
		/* Tell the caller how much room is needed */
		fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
		return (ENOSPC);
	}

	bzero((char *)ufid, sizeof (struct udf_fid));

	rw_enter(&ip->i_contents, RW_READER);
	ufid->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
	ufid->udfid_icb_lbn = ip->i_icb_block;
	ufid->udfid_prn = ip->i_icb_prn;
	ufid->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Take the inode's i_rwlock, as writer when write_lock is non-zero and
 * as reader otherwise.  The lock is left held for udf_rwunlock() to
 * drop; write_lock is handed back to the caller unchanged.
 */
/* ARGSUSED2 */
static int
udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
{
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_rwlock\n");

	rw_enter(&ip->i_rwlock, write_lock ? RW_WRITER : RW_READER);
#ifdef	__lock_lint
	rw_exit(&ip->i_rwlock);
#endif
	return (write_lock);
}

/*
 * Drop the inode's i_rwlock taken by udf_rwlock().  The write_lock
 * argument is not consulted; rw_exit() releases either mode.
 */
/* ARGSUSED */
static void
udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
{
	struct ud_inode *uip = VTOI(vp);

	ud_printf("udf_rwunlock\n");

#ifdef	__lock_lint
	rw_enter(&uip->i_rwlock, RW_WRITER);
#endif
	rw_exit(&uip->i_rwlock);
}

/*
 * Validate a new file offset: EINVAL if *noffp is negative or beyond
 * MAXOFFSET_T, 0 otherwise.  Nothing is modified.
 */
/* ARGSUSED */
static int32_t
udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
{
	offset_t newoff = *noffp;

	if (newoff < 0 || newoff > MAXOFFSET_T) {
		return (EINVAL);
	}
	return (0);
}

/*
 * File and record locking.  Refuses with EAGAIN when the file is mapped
 * and mandatory locking applies to it; everything else is delegated to
 * fs_frlock().
 */
static int32_t
udf_frlock(
	struct vnode *vp,
	int32_t cmd,
	struct flock64 *bfp,
	int32_t flag,
	offset_t offset,
	struct flk_callback *flk_cbp,
	cred_t *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_frlock\n");

	/*
	 * If file is being mapped, disallow frlock.
	 * XXX i_tlock is deliberately not held while checking i_mapcnt:
	 * the current locking strategy drops all locks before calling
	 * fs_frlock, so mapcnt could change before we enter fs_frlock,
	 * making it meaningless to have held tlock in the first place.
	 */
	if (ip->i_mapcnt > 0) {
		if (MANDLOCK(vp, ip->i_char)) {
			return (EAGAIN);
		}
	}

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}

/*
 * Free space in a file.  Only F_FREESP is supported (anything else is
 * EINVAL).  The flock64 is normalised with convoff() and then passed
 * to ud_freesp(); the first error encountered is returned.
 */
/*ARGSUSED6*/
static int32_t
udf_space(
	struct vnode *vp,
	int32_t cmd,
	struct flock64 *bfp,
	int32_t flag,
	offset_t offset,
	cred_t *cr,
	caller_context_t *ct)
{
	int32_t error;

	ud_printf("udf_space\n");

	if (cmd != F_FREESP) {
		return (EINVAL);
	}

	error = convoff(vp, bfp, 0, offset);
	if (error != 0) {
		return (error);
	}

	return (ud_freesp(vp, bfp, flag, cr));
}

/*
 * Return the pages covering <off, off + len) of vp in plarr, or start
 * read ahead only when plarr is NULL.
 *
 * Outline:
 *  - take i_contents (reader) unless this thread already owns it;
 *  - fail with EFAULT for requests beyond EOF that do not come from
 *    segkmap, and with ENOSYS for VNOMAP vnodes;
 *  - for S_WRITE/S_CREATE on a file with holes (or beyond EOF), upgrade
 *    to a writer lock and allocate backing blocks with ud_bmap_write();
 *  - look each page up in the page cache, calling ud_getpage_miss()
 *    for the ones that are absent;
 *  - opportunistically add further cached pages up to plsz;
 *  - on error, unlock every page collected so far;
 *  - finally update the access/modify flags under i_tlock.
 *
 * If protp is non-NULL it is set to PROT_ALL, less PROT_WRITE when the
 * file has holes and the access is not a write or create.
 */
/* ARGSUSED */
static int32_t
udf_getpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	uint32_t *protp,
	struct page **plarr,
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t error, has_holes, beyond_eof, seqmode, dolock;
	int32_t pgsize = PAGESIZE;
	struct udf_vfs *udf_vfsp = ip->i_udf;
	page_t **pl;
	u_offset_t pgoff, eoff, uoff;
	krw_t rwtype;
	caddr_t pgaddr;

	ud_printf("udf_getpage\n");

	uoff = (u_offset_t)off; /* type conversion */
	if (protp) {
		*protp = PROT_ALL;
	}
	if (vp->v_flag & VNOMAP) {
		return (ENOSYS);
	}
	/* Sequential if this request starts where the last one ended */
	seqmode = ip->i_nextr == uoff && rw != S_CREATE;

	rwtype = RW_READER;
	/* Only lock if this thread is not already the lock owner */
	dolock = (rw_owner(&ip->i_contents) != curthread);
retrylock:
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, rwtype);
#else
	if (dolock) {
		rw_enter(&ip->i_contents, rwtype);
	}
#endif

	/*
	 * We may be getting called as a side effect of a bmap using
	 * fbread() when the blocks might be being allocated and the
	 * size has not yet been up'ed.  In this case we want to be
	 * able to return zero pages if we get back UDF_HOLE from
	 * calling bmap for a non write case here.  We also might have
	 * to read some frags from the disk into a page if we are
	 * extending the number of frags for a given lbn in bmap().
	 */
	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
	if (beyond_eof && seg != segkmap) {
#ifdef	__lock_lint
		rw_exit(&ip->i_contents);
#else
		if (dolock) {
			rw_exit(&ip->i_contents);
		}
#endif
		return (EFAULT);
	}

	/*
	 * Must hold i_contents lock throughout the call to pvn_getpages
	 * since locked pages are returned from each call to ud_getapage.
	 * Must *not* return locked pages and then try for contents lock
	 * due to lock ordering requirements (inode > page)
	 */

	has_holes = ud_bmap_has_holes(ip);

	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
		int32_t	blk_size, count;
		u_offset_t offset;

		/*
		 * We must acquire the RW_WRITER lock in order to
		 * call bmap_write().  If the upgrade fails the lock is
		 * dropped and re-taken as writer from retrylock, which
		 * also recomputes beyond_eof and has_holes.
		 */
		if (dolock && rwtype == RW_READER) {
			rwtype = RW_WRITER;

			if (!rw_tryupgrade(&ip->i_contents)) {

				rw_exit(&ip->i_contents);

				goto retrylock;
			}
		}

		/*
		 * May be allocating disk blocks for holes here as
		 * a result of mmap faults. write(2) does the bmap_write
		 * in rdip/wrip, not here. We are not dealing with frags
		 * in this case.
		 */
		offset = uoff;
		while ((offset < uoff + len) &&
		    (offset < ip->i_size)) {
			/*
			 * Allocate one logical block per iteration,
			 * trimming the last one to end at i_size.
			 */

			blk_size = udf_vfsp->udf_lbsize;
			if ((offset + blk_size) > ip->i_size) {
				count = ip->i_size - offset;
			} else {
				count = blk_size;
			}
			error = ud_bmap_write(ip, offset, count, 0, cr);
			if (error) {
				goto update_inode;
			}
			offset += count; /* XXX - make this contig */
		}
	}

	/*
	 * Can be a reader from now on.
	 */
#ifdef	__lock_lint
	if (rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}
#else
	if (dolock && rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}
#endif

	/*
	 * We remove PROT_WRITE in cases when the file has UDF holes
	 * because we don't  want to call bmap_read() to check each
	 * page if it is backed with a disk block.
	 */
	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
		*protp &= ~PROT_WRITE;
	}

	error = 0;

	/*
	 * The loop looks up pages in the range <off, off + len).
	 * For each page, we first check if we should initiate an asynchronous
	 * read ahead before we call page_lookup (we may sleep in page_lookup
	 * for a previously initiated disk read).
	 */
	eoff = (uoff + len);
	for (pgoff = uoff, pgaddr = addr, pl = plarr;
	    pgoff < eoff; /* empty */) {
		page_t	*pp;
		u_offset_t	nextrio;
		se_t	se;

		se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);

		/*
		 * Handle async getpage (faultahead)
		 */
		if (plarr == NULL) {
			ip->i_nextrio = pgoff;
			ud_getpage_ra(vp, pgoff, seg, pgaddr);
			pgoff += pgsize;
			pgaddr += pgsize;
			continue;
		}

		/*
		 * Check if we should initiate read ahead of next cluster.
		 * We call page_exists only when we need to confirm that
		 * we have the current page before we initiate the read ahead.
		 */
		nextrio = ip->i_nextrio;
		if (seqmode &&
		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
		    nextrio < ip->i_size && page_exists(vp, pgoff))
			ud_getpage_ra(vp, pgoff, seg, pgaddr);

		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {

			/*
			 * We found the page in the page cache.
			 */
			*pl++ = pp;
			pgoff += pgsize;
			pgaddr += pgsize;
			len -= pgsize;
			plsz -= pgsize;
		} else  {

			/*
			 * We have to create the page, or read it from disk.
			 * ud_getpage_miss() may return 0 with an empty
			 * list when another thread entered the page; the
			 * loop then retries page_lookup for the same pgoff.
			 */
			if (error = ud_getpage_miss(vp, pgoff, len,
			    seg, pgaddr, pl, plsz, rw, seqmode)) {
				goto error_out;
			}

			/* Step over every page the miss handler returned */
			while (*pl != NULL) {
				pl++;
				pgoff += pgsize;
				pgaddr += pgsize;
				len -= pgsize;
				plsz -= pgsize;
			}
		}
	}

	/*
	 * Return pages up to plsz if they are in the page cache.
	 * We cannot return pages if there is a chance that they are
	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
	 */
	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {

		ASSERT((protp == NULL) ||
		    !(has_holes && (*protp & PROT_WRITE)));

		eoff = pgoff + plsz;
		while (pgoff < eoff) {
			page_t		*pp;

			if ((pp = page_lookup_nowait(vp, pgoff,
			    SE_SHARED)) == NULL)
				break;

			*pl++ = pp;
			pgoff += pgsize;
			plsz -= pgsize;
		}
	}

	if (plarr)
		*pl = NULL;			/* Terminate page list */
	/* Remember where this request ended for the seqmode test */
	ip->i_nextr = pgoff;

error_out:
	if (error && plarr) {
		/*
		 * Release any pages we have locked.
		 */
		while (pl > &plarr[0])
			page_unlock(*--pl);

		plarr[0] = NULL;
	}

update_inode:
#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif

	/*
	 * If the inode is not already marked for IACC (in rwip() for read)
	 * and the inode is not marked for no access time update (in rwip()
	 * for write) then update the inode access time and mod time now.
	 */
	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & (IACC | INOACC)) == 0) {
		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
			ip->i_flag |= IACC;
		}
		if (rw == S_WRITE) {
			ip->i_flag |= IUPD;
		}
		ITIMES_NOLOCK(ip);
	}
	mutex_exit(&ip->i_tlock);

	return (error);
}

/*
 * When non-zero, plain asynchronous putpage requests are not pushed
 * immediately; udf_putpage() gathers them into the inode's delayed
 * write cluster (i_delayoff/i_delaylen) instead.
 */
int32_t ud_delay = 1;

/*
 * Write back (or free/invalidate, depending on flags) the pages of vp
 * in <off, off + len).
 *
 * A request that is B_ASYNC, has a non-zero length and carries no flags
 * other than B_ASYNC, B_DONTNEED and B_FREE is clustered while ud_delay
 * is set:
 *  - no cluster pending: this request becomes the cluster;
 *  - pending cluster is full or not contiguous with this request: the
 *    old cluster is pushed with ud_putpages() and this request becomes
 *    the new cluster;
 *  - otherwise the pending cluster is extended by len.
 * All other requests go straight to ud_putpages().
 *
 * Returns EINVAL for a vnode with no holds, ENOSYS for VNOMAP vnodes,
 * or the result of ud_putpages().
 */
/* ARGSUSED */
static int32_t
udf_putpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	int32_t flags,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip;
	int32_t error = 0;

	ud_printf("udf_putpage\n");

	ip = VTOI(vp);
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, RW_WRITER);
#endif

	if (vp->v_count == 0) {
		cmn_err(CE_WARN, "ud_putpage : bad v_count");
		error = EINVAL;
	} else if (vp->v_flag & VNOMAP) {
		error = ENOSYS;
	} else if ((flags & B_ASYNC) && ud_delay && len &&
	    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
		mutex_enter(&ip->i_tlock);

		if (ip->i_delaylen == 0) {
			/*
			 * If nobody stalled, start a new cluster.
			 */
			ip->i_delayoff = off;
			ip->i_delaylen = len;
			mutex_exit(&ip->i_tlock);
		} else if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
		    ip->i_delayoff + ip->i_delaylen != off) {
			/*
			 * If we have a full cluster or they are not contig,
			 * then push last cluster and start over.
			 */
			u_offset_t doff = ip->i_delayoff;
			size_t dlen = ip->i_delaylen;

			ip->i_delayoff = off;
			ip->i_delaylen = len;
			mutex_exit(&ip->i_tlock);
			/* LMXXX - flags are new val, not old */
			error = ud_putpages(vp, doff, dlen, flags, cr);
		} else {
			/*
			 * There is something there, it's not full, and
			 * it is contig.
			 */
			ip->i_delaylen += len;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * Must have weird flags or we are not clustering.
		 */
		error = ud_putpages(vp, off, len, flags, cr);
	}

#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#endif
	return (error);
}

/*
 * Map a regular file into the address space "as".
 *
 * Rejects VNOMAP vnodes (ENOSYS), negative or wrapping ranges (EINVAL),
 * non-regular files (ENODEV) and files with mandatory locks (EAGAIN).
 * Otherwise an address is chosen with choose_addr() and a segvn segment
 * is created over it, both under the address space range lock.
 * Returns 0 or the error from choose_addr()/as_map().
 */
/* ARGSUSED */
static int32_t
udf_map(
	struct vnode *vp,
	offset_t off,
	struct as *as,
	caddr_t *addrp,
	size_t len,
	uint8_t prot,
	uint8_t maxprot,
	uint32_t flags,
	struct cred *cr,
	caller_context_t *ct)
{
	struct segvn_crargs vn_a;
	int32_t error = 0;

	ud_printf("udf_map\n");

	if (vp->v_flag & VNOMAP) {
		error = ENOSYS;
		goto end;
	}

	/*
	 * len is a size_t, so where size_t is as wide as offset_t the
	 * sum off + len is computed as an unsigned value and could never
	 * compare below zero.  Convert it back to offset_t so that a
	 * range running past the largest offset is actually rejected.
	 */
	if ((off < (offset_t)0) ||
	    ((offset_t)(off + len) < (offset_t)0)) {
		error = EINVAL;
		goto end;
	}

	if (vp->v_type != VREG) {
		error = ENODEV;
		goto end;
	}

	/*
	 * If file is being locked, disallow mapping.
	 */
	if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
		error = EAGAIN;
		goto end;
	}

	as_rangelock(as);
	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (error != 0) {
		as_rangeunlock(as);
		goto end;
	}

	/* Describe the mapping for segvn_create() */
	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
	as_rangeunlock(as);

end:
	return (error);
}

/*
 * Account for a new mapping of vp: add the number of pages spanned by
 * len to the inode's i_mapcnt under i_tlock.  Returns ENOSYS for
 * VNOMAP vnodes, 0 otherwise.
 */
/* ARGSUSED */
static int32_t
udf_addmap(struct vnode *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint8_t prot,
	uint8_t maxprot,
	uint32_t flags,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *uip = VTOI(vp);

	ud_printf("udf_addmap\n");

	if (!(vp->v_flag & VNOMAP)) {
		mutex_enter(&uip->i_tlock);
		uip->i_mapcnt += btopr(len);
		mutex_exit(&uip->i_tlock);

		return (0);
	}

	return (ENOSYS);
}

/*
 * Account for the removal of a mapping of vp: subtract the number of
 * pages spanned by len from the inode's i_mapcnt under i_tlock.
 * Returns ENOSYS for VNOMAP vnodes, 0 otherwise.
 */
/* ARGSUSED */
static int32_t
udf_delmap(
	struct vnode *vp, offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint32_t prot,
	uint32_t maxprot,
	uint32_t flags,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *uip = VTOI(vp);

	ud_printf("udf_delmap\n");

	if (!(vp->v_flag & VNOMAP)) {
		mutex_enter(&uip->i_tlock);
		/* Count released mappings; the total may never go negative */
		uip->i_mapcnt -= btopr(len);
		ASSERT(uip->i_mapcnt >= 0);
		mutex_exit(&uip->i_tlock);

		return (0);
	}

	return (ENOSYS);
}

/*
 * pathconf for udfs.  _PC_FILESIZEBITS and _PC_TIMESTAMP_RESOLUTION are
 * answered here; every other query is passed to fs_pathconf() and its
 * result returned.
 */
/* ARGSUSED */
static int32_t
udf_l_pathconf(
	struct vnode *vp,
	int32_t cmd,
	ulong_t *valp,
	struct cred *cr,
	caller_context_t *ct)
{
	int32_t error = 0;

	ud_printf("udf_l_pathconf\n");

	switch (cmd) {
	case _PC_FILESIZEBITS:
		/*
		 * udf supports 64 bits as file size
		 * but there are several other restrictions:
		 * it only supports 32-bit block numbers and
		 * daddr32_t is only an int32_t, so taking these
		 * into account we can stay just where ufs is
		 */
		*valp = 41;
		break;
	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		break;
	default:
		error = fs_pathconf(vp, cmd, valp, cr, ct);
		break;
	}

	return (error);
}

/* Counts of read and write chunks started by udf_pageio() */
uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
#ifndef	__lint
_NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
_NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
#endif
/*
 * Perform page I/O directly between the page list pp and the device,
 * for the file range <io_off, io_off + io_len).
 *
 * The request is split into chunks, one per contiguous run of disk
 * blocks reported by ud_bmap_read().  Holes are refused (EINVAL).
 * For synchronous requests the processed pages are re-assembled into a
 * single list for the caller; for B_ASYNC requests the pieces are left
 * to I/O completion and, on error, the unprocessed pages are finished
 * with pvn_read_done()/pvn_write_done(B_ERROR).
 *
 * Returns EINVAL for a NULL page list, EIO when the inode has no file
 * system attached, or the first mapping/I/O error.
 *
 * Assumption is that there will not be a pageio request
 * to an embedded file
 */
/* ARGSUSED */
static int32_t
udf_pageio(
	struct vnode *vp,
	struct page *pp,
	u_offset_t io_off,
	size_t io_len,
	int32_t flags,
	struct cred *cr,
	caller_context_t *ct)
{
	daddr_t bn;
	struct buf *bp;
	struct ud_inode *ip = VTOI(vp);
	int32_t dolock, error = 0, contig, multi_io;
	size_t done_len = 0, cur_len = 0;
	page_t *npp = NULL, *opp = NULL, *cpp = pp;

	if (pp == NULL) {
		return (EINVAL);
	}

	/* Only lock if this thread is not already the lock owner */
	dolock = (rw_owner(&ip->i_contents) != curthread);

	/*
	 * We need a better check.  Ideally, we would use another
	 * vnodeops so that hlocked and forcibly unmounted file
	 * systems would return EIO where appropriate and w/o the
	 * need for these checks.
	 */
	if (ip->i_udf == NULL) {
		return (EIO);
	}

#ifdef	__lock_lint
	rw_enter(&ip->i_contents, RW_READER);
#else
	if (dolock) {
		rw_enter(&ip->i_contents, RW_READER);
	}
#endif

	/*
	 * Break the io request into chunks, one for each contiguous
	 * stretch of disk blocks in the target file.
	 */
	while (done_len < io_len) {
		ASSERT(cpp);
		bp = NULL;
		contig = 0;
		/*
		 * multi_io is only decided below when the logical block
		 * is smaller than a page.  Start every chunk from "single
		 * I/O" so the test further down never reads an
		 * uninitialized value on file systems with larger blocks.
		 */
		multi_io = 0;
		if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
		    &bn, &contig)) {
			break;
		}

		if (bn == UDF_HOLE) {   /* No holey swapfiles */
			cmn_err(CE_WARN, "SWAP file has HOLES");
			error = EINVAL;
			break;
		}

		cur_len = MIN(io_len - done_len, contig);

		/*
		 * Check if more than one I/O is
		 * required to complete the given
		 * I/O operation
		 */
		if (ip->i_udf->udf_lbsize < PAGESIZE) {
			if (cur_len >= PAGESIZE) {
				multi_io = 0;
				cur_len &= PAGEMASK;
			} else {
				multi_io = 1;
				cur_len = MIN(io_len - done_len, PAGESIZE);
			}
		}
		/* Split off the pages for this chunk; the rest go to npp */
		page_list_break(&cpp, &npp, btop(cur_len));

		bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
		ASSERT(bp != NULL);

		bp->b_edev = ip->i_dev;
		bp->b_dev = cmpdev(ip->i_dev);
		bp->b_blkno = bn;
		bp->b_un.b_addr = (caddr_t)0;
		bp->b_file = vp;
		bp->b_offset = (offset_t)(io_off + done_len);

/*
 *		ub.ub_pageios.value.ul++;
 */
		if (multi_io == 0) {
			(void) bdev_strategy(bp);
		} else {
			error = ud_multi_strat(ip, cpp, bp,
			    (u_offset_t)(io_off + done_len));
			if (error != 0) {
				pageio_done(bp);
				break;
			}
		}
		if (flags & B_READ) {
			ud_pageio_reads++;
		} else {
			ud_pageio_writes++;
		}

		/*
		 * If the request is not B_ASYNC, wait for i/o to complete
		 * and re-assemble the page list to return to the caller.
		 * If it is B_ASYNC we leave the page list in pieces and
		 * cleanup() will dispose of them.
		 */
		if ((flags & B_ASYNC) == 0) {
			error = biowait(bp);
			pageio_done(bp);
			if (error) {
				break;
			}
			page_list_concat(&opp, &cpp);
		}
		cpp = npp;
		npp = NULL;
		done_len += cur_len;
	}

	ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
	if (error) {
		if (flags & B_ASYNC) {
			/* Cleanup unprocessed parts of list */
			page_list_concat(&cpp, &npp);
			if (flags & B_READ) {
				pvn_read_done(cpp, B_ERROR);
			} else {
				pvn_write_done(cpp, B_ERROR);
			}
		} else {
			/* Re-assemble list and let caller clean up */
			page_list_concat(&opp, &cpp);
			page_list_concat(&opp, &npp);
		}
	}

#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif
	return (error);
}




/* -------------------- local functions --------------------------- */



/*
 * Read or write len bytes at "offset" in inode ip to or from the single
 * buffer "base", by building a one-element uio and calling ud_rdip() or
 * ud_wrip().
 *
 * If aresid is non-NULL the residual count is stored there; otherwise
 * any residual (short transfer) is reported as EIO, replacing whatever
 * the transfer itself returned.
 */
int32_t
ud_rdwri(enum uio_rw rw, int32_t ioflag,
	struct ud_inode *ip, caddr_t base, int32_t len,
	offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
{
	struct iovec aiov;
	struct uio auio;
	int32_t error;

	ud_printf("ud_rdwri\n");

	bzero((caddr_t)&aiov, sizeof (iovec_t));
	bzero((caddr_t)&auio, sizeof (uio_t));

	/* One segment covering the whole transfer */
	aiov.iov_len = len;
	aiov.iov_base = base;

	auio.uio_iovcnt = 1;
	auio.uio_iov = &aiov;
	auio.uio_resid = len;
	auio.uio_loffset = offset;
	auio.uio_segflg = (int16_t)seg;

	if (rw != UIO_WRITE) {
		auio.uio_llimit = MAXOFFSET_T;
		auio.uio_extflg = UIO_COPY_CACHED;
		auio.uio_fmode = FREAD;
		error = ud_rdip(ip, &auio, ioflag, cr);
	} else {
		/* Writes honour the process file size limit */
		auio.uio_llimit = curproc->p_fsz_ctl;
		auio.uio_extflg = UIO_COPY_DEFAULT;
		auio.uio_fmode = FWRITE;
		error = ud_wrip(ip, &auio, ioflag, cr);
	}

	if (aresid != NULL) {
		*aresid = auio.uio_resid;
		return (error);
	}

	if (auio.uio_resid != 0) {
		error = EIO;
	}
	return (error);
}

/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 *
 * ud_rdip() considers freeing pages behind a read only when
 * ud_freebehind is non-zero, the read continues from where the last
 * getpage ended (i_nextr) and the offset is beyond ud_smallfile bytes.
 */
int32_t ud_freebehind = 1;
int32_t ud_smallfile = 32 * 1024;

/*
 * Supply the page at "off" of vp after a page cache miss, returning it
 * (and any pages klustered with it) through the NULL terminated list pl.
 *
 * For S_CREATE the page is simply created.  Otherwise a kluster is set
 * up with pvn_read_kluster() and filled from the file by
 * ud_page_fill(); i_nextrio is advanced past the data read and, for
 * sequential access short of EOF, read ahead is started.
 *
 * Returns 0 with pl[0] == NULL when another thread already entered the
 * page (the caller retries its lookup), EINVAL if the page could not be
 * created, or the error from ud_page_fill().
 */
/* ARGSUSED */
int32_t
ud_getpage_miss(struct vnode *vp, u_offset_t off,
	size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
	size_t plsz, enum seg_rw rw, int32_t seq)
{
	struct ud_inode *ip = VTOI(vp);
	page_t *pp;
	u_offset_t io_off;
	u_offset_t filled;
	size_t io_len;
	int32_t err = 0;

	pl[0] = NULL;

	/*
	 * Figure out whether the page must be read from the disk,
	 * or can simply be created
	 */
	if (rw != S_CREATE) {
		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
		    &io_len, off, PAGESIZE, 0);

		/*
		 * Some other thread has entered the page.
		 * ud_getpage will retry page_lookup.
		 */
		if (pp == NULL) {
			return (0);
		}

		/*
		 * Fill the page with as much data as we can from the file.
		 */
		err = ud_page_fill(ip, pp, off, B_READ, &filled);
		if (err != 0) {
			pvn_read_done(pp, B_ERROR);
			return (err);
		}

		/*
		 * XXX ??? ufs has io_len instead of the filled count below
		 */
		ip->i_nextrio = off + ((filled + PAGESIZE - 1) & PAGEMASK);

		/*
		 * If the file access is sequential, initiate read ahead
		 * of the next cluster.
		 */
		if (seq && ip->i_nextrio < ip->i_size) {
			ud_getpage_ra(vp, off, seg, addr);
		}
	} else {
		pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg, addr);
		if (pp == NULL) {
			cmn_err(CE_WARN, "ud_getpage_miss: page_create");
			return (EINVAL);
		}
		io_len = PAGESIZE;
	}

	pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
	return (err);
}

/*
 * Start an asynchronous read ahead at the inode's i_nextrio.
 *
 * Nothing is done when the read ahead address falls outside the
 * segment, when the offset cannot be mapped or maps to a hole, or when
 * another thread already owns the page.  Otherwise the page is filled
 * with B_READ|B_ASYNC and i_nextrio is moved past it.  Errors are not
 * reported to the caller.
 */
/* ARGSUSED */
void
ud_getpage_ra(struct vnode *vp,
	u_offset_t off, struct seg *seg, caddr_t addr)
{
	struct ud_inode *ip = VTOI(vp);
	u_offset_t ra_off = ip->i_nextrio;
	u_offset_t filled;
	caddr_t ra_addr = addr + (ra_off - off);
	size_t ra_len;
	int32_t contig = 0;
	daddr_t bn;
	page_t *pp;

	/*
	 * Is this test needed?
	 */
	if (ra_addr >= seg->s_base + seg->s_size) {
		return;
	}

	if (ud_bmap_read(ip, ra_off, &bn, &contig) != 0) {
		return;
	}
	if (bn == UDF_HOLE) {
		return;
	}

	pp = pvn_read_kluster(vp, ra_off, seg, ra_addr,
	    &ra_off, &ra_len, ra_off, PAGESIZE, 1);

	/*
	 * Some other thread has entered the page, so no read ahead is
	 * done here (i.e. we will have to wait for the read when the
	 * page is needed).
	 */
	if (pp == NULL) {
		return;
	}

	(void) ud_page_fill(ip, pp, ra_off, (B_READ|B_ASYNC), &filled);
	ip->i_nextrio =  ra_off + ((filled + PAGESIZE - 1) & PAGEMASK);
}

/*
 * Fill page pp with file data starting at offset "off" of inode ip.
 *
 * bflgs are the buf flags for the transfer (callers in this file pass
 * B_READ, optionally with B_ASYNC).  On return *pg_off holds the number
 * of bytes accounted for, as tracked in "contig"; callers use it to
 * advance i_nextrio.
 *
 * Two cases:
 *  - embedded file (ICB_FLAG_ONE_AD): the data lives inside the file
 *    entry block, which is read through the buffer cache and copied
 *    into the page;
 *  - otherwise the location is looked up with ud_bmap_read() and the
 *    page is read from the device, in one I/O or, when the run of
 *    blocks is shorter than what the page needs, through
 *    ud_multi_strat().
 *
 * Returns 0 or the error from ud_bmap_read(), ud_multi_strat() or
 * biowait().
 */
int
ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
	uint32_t bflgs, u_offset_t *pg_off)
{
	daddr_t bn;
	struct buf *bp;
	caddr_t kaddr, caddr;
	int32_t error = 0, contig = 0, multi_io = 0;
	int32_t lbsize = ip->i_udf->udf_lbsize;
	int32_t lbmask = ip->i_udf->udf_lbmask;
	uint64_t isize;

	/* File size rounded up to a whole logical block */
	isize = (ip->i_size + lbmask) & (~lbmask);
	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {

		/*
		 * Embedded file read file_entry
		 * from buffer cache and copy the required
		 * portions
		 *
		 * NOTE(review): a failed ud_bread() (b_error or b_resid
		 * set) skips the copy but still returns 0, and the
		 * kcopy() result is ignored -- confirm this is intended.
		 */
		bp = ud_bread(ip->i_dev,
		    ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
		if ((bp->b_error == 0) &&
		    (bp->b_resid == 0)) {

			caddr = bp->b_un.b_addr + ip->i_data_off;

			/*
			 * mapin to kvm
			 */
			kaddr = (caddr_t)ppmapin(pp,
			    PROT_READ | PROT_WRITE, (caddr_t)-1);
			(void) kcopy(caddr, kaddr, ip->i_size);

			/*
			 * mapout of kvm
			 */
			ppmapout(kaddr);
		}
		brelse(bp);
		contig = ip->i_size;
	} else {

		/*
		 * Get the continuous size and block number
		 * at offset "off"
		 */
		if (error = ud_bmap_read(ip, off, &bn, &contig))
			goto out;
		/* Clamp to one page, then round up to whole blocks */
		contig = MIN(contig, PAGESIZE);
		contig = (contig + lbmask) & (~lbmask);

		/*
		 * Zero part of the page which we are not
		 * going to read from the disk.
		 */

		if (bn == UDF_HOLE) {

			/*
			 * This is a HOLE. Just zero out
			 * the page
			 *
			 * NOTE(review): when neither condition holds the
			 * code falls through to the I/O path with
			 * bn == UDF_HOLE; presumably ud_multi_strat()
			 * copes with that -- verify.
			 */
			if (((off + contig) == isize) ||
			    (contig == PAGESIZE)) {
				pagezero(pp->p_prev, 0, PAGESIZE);
				goto out;
			}
		}

		if (contig < PAGESIZE) {
			uint64_t count;

			/* Bytes from "off" to the block-rounded EOF */
			count = isize - off;
			if (contig != count) {
				/* More data follows in another extent */
				multi_io = 1;
				contig = (int32_t)(MIN(count, PAGESIZE));
			} else {
				/* File ends here: zero the page's tail */
				pagezero(pp->p_prev, contig, PAGESIZE - contig);
			}
		}

		/*
		 * Get a bp and initialize it
		 */
		bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
		ASSERT(bp != NULL);

		bp->b_edev = ip->i_dev;
		bp->b_dev = cmpdev(ip->i_dev);
		bp->b_blkno = bn;
		bp->b_un.b_addr = 0;
		bp->b_file = ip->i_vnode;

		/*
		 * Start I/O
		 */
		if (multi_io == 0) {

			/*
			 * Single I/O is sufficient for this page
			 */
			(void) bdev_strategy(bp);
		} else {

			/*
			 * We need to do the I/O in
			 * piece's
			 *
			 * NOTE(review): on failure bp is not passed to
			 * pageio_done() here, unlike udf_pageio() --
			 * confirm who releases it.
			 */
			error = ud_multi_strat(ip, pp, bp, off);
			if (error != 0) {
				goto out;
			}
		}
		if ((bflgs & B_ASYNC) == 0) {

			/*
			 * Wait for i/o to complete.
			 */

			error = biowait(bp);
			pageio_done(bp);
			if (error) {
				goto out;
			}
		}
	}
	/* Do not report more than the file actually holds */
	if ((off + contig) >= ip->i_size) {
		contig = ip->i_size - off;
	}

out:
	*pg_off = contig;
	return (error);
}

/*
 * Push the dirty pages of vp in <off, off + len) through ud_putapage().
 * A len of 0 means "every page from off onwards" and also resets the
 * inode's delayed write cluster.
 *
 * i_contents is taken as reader for the duration unless the calling
 * thread already owns it.  When the whole file was pushed successfully
 * the IMODTIME flag is cleared.
 *
 * Returns EINVAL for a vnode with no holds, otherwise 0 or the first
 * error from pvn_vplist_dirty()/ud_putapage().
 */
int32_t
ud_putpages(struct vnode *vp, offset_t off,
	size_t len, int32_t flags, struct cred *cr)
{
	struct ud_inode *ip;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	u_offset_t eoff;
	int32_t err = 0;
	int32_t dolock;

	ud_printf("ud_putpages\n");

	if (vp->v_count == 0) {
		cmn_err(CE_WARN, "ud_putpages: bad v_count");
		return (EINVAL);
	}

	ip = VTOI(vp);

	/* Pushing everything: forget any pending delayed cluster */
	if (len == 0) {
		mutex_enter(&ip->i_tlock);
		ip->i_delaylen = 0;
		ip->i_delayoff = 0;
		mutex_exit(&ip->i_tlock);
	}

	/*
	 * Acquire the readers/write inode lock before locking
	 * any pages in this inode.
	 * The inode lock is held during i/o.
	 */
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, RW_READER);
#else
	dolock = (rw_owner(&ip->i_contents) != curthread);
	if (dolock) {
		rw_enter(&ip->i_contents, RW_READER);
	}
#endif

	/* With no cached pages there is nothing to do; err stays 0 */
	if (vn_has_cached_data(vp)) {
		if (len == 0) {
			/*
			 * Search the entire vp list for pages >= off.
			 */
			err = pvn_vplist_dirty(vp, (u_offset_t)off,
			    ud_putapage, flags, cr);
		} else {
			/*
			 * Loop over all offsets in the range looking for
			 * pages to deal with, stopping at the block
			 * rounded end of file when there is one.
			 */
			eoff = blkroundup(ip->i_udf, ip->i_size);
			if (eoff != 0) {
				eoff = MIN(off + len, eoff);
			} else {
				eoff = off + len;
			}

			for (io_off = off; io_off < eoff; io_off += io_len) {
				/*
				 * If we are not invalidating, synchronously
				 * freeing or writing pages, use the routine
				 * page_lookup_nowait() to prevent reclaiming
				 * them from the free list.
				 */
				if (!(flags & B_INVAL) && (flags & B_ASYNC)) {
					pp = page_lookup_nowait(vp, io_off,
					    (flags & B_FREE) ?
					    SE_EXCL : SE_SHARED);
				} else {
					pp = page_lookup(vp, io_off,
					    (flags & (B_INVAL | B_FREE)) ?
					    SE_EXCL : SE_SHARED);
				}

				if (pp != NULL &&
				    pvn_getdirty(pp, flags) != 0) {
					/*
					 * "io_off" and "io_len" are returned
					 * as the range of pages we actually
					 * wrote.  This allows us to skip
					 * ahead more quickly since several
					 * pages may've been dealt with by
					 * this iteration of the loop.
					 */
					err = ud_putapage(vp, pp,
					    &io_off, &io_len, flags, cr);
					if (err != 0) {
						break;
					}
				} else {
					/* Nothing to write; next page */
					io_len = PAGESIZE;
				}
			}
		}

		if (err == 0 && off == 0 &&
		    (len == 0 || len >= ip->i_size)) {
			/*
			 * We have just sync'ed back all the pages on
			 * the inode, turn off the IMODTIME flag.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
		}
	}

#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif
	return (err);
}

/*
 * Write out the dirty page pp of vp, together with whatever pages
 * pvn_write_kluster() gathers around it.
 *
 * The caller must hold i_contents.  The range dealt with is reported
 * through *offp and *lenp when those are non-NULL; they are always
 * given defined values, including when the block lookup fails.
 *
 * Two cases:
 *  - embedded file (ICB_FLAG_ONE_AD): the page contents are copied into
 *    the file entry block, the descriptor tag is regenerated and the
 *    block is written with bwrite();
 *  - otherwise the disk location comes from ud_bmap_read() and the
 *    pages are written to the device, in one I/O or through
 *    ud_multi_strat().  ud_iodone() is installed as the completion
 *    routine and the transfer size is added to i_writes.
 *
 * Returns 0, EIO/EINVAL when the file entry cannot be read or verified,
 * or the error from ud_bmap_read(), ud_multi_strat() or biowait().
 */
/* ARGSUSED */
int32_t
ud_putapage(struct vnode *vp,
	page_t *pp, u_offset_t *offp,
	size_t *lenp, int32_t flags, struct cred *cr)
{
	daddr_t bn;
	size_t io_len;
	struct ud_inode *ip;
	int32_t error = 0, contig, multi_io = 0;
	struct udf_vfs *udf_vfsp;
	u_offset_t off, io_off;
	caddr_t kaddr, caddr;
	struct buf *bp = NULL;
	int32_t lbmask;
	uint64_t isize;
	uint16_t crc_len;
	struct file_entry *fe;

	ud_printf("ud_putapage\n");

	ip = VTOI(vp);
	ASSERT(ip);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	lbmask = ip->i_udf->udf_lbmask;
	/* File size rounded up to a whole logical block */
	isize = (ip->i_size + lbmask) & (~lbmask);

	udf_vfsp = ip->i_udf;
	ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);

	/*
	 * If the modified time on the inode has not already been
	 * set elsewhere (e.g. for write/setattr) we set the time now.
	 * This gives us approximate modified times for mmap'ed files
	 * which are modified via stores in the user address space.
	 */
	if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
		mutex_enter(&ip->i_tlock);
		ip->i_flag |= IUPD;
		ITIMES_NOLOCK(ip);
		mutex_exit(&ip->i_tlock);
	}


	/*
	 * Align the request to a block boundary (for old file systems),
	 * and go ask bmap() how contiguous things are for this file.
	 */
	off = pp->p_offset & ~(offset_t)lbmask;
				/* block align it */

	/*
	 * Defaults for the range reported through offp/lenp.  They are
	 * replaced by pvn_write_kluster() below, but an early failure of
	 * ud_bmap_read() jumps to "out" before that happens and must not
	 * hand uninitialized values back to the caller.
	 */
	io_off = off;
	io_len = PAGESIZE;

	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
		ASSERT(ip->i_size <= ip->i_max_emb);

		pp = pvn_write_kluster(vp, pp, &io_off,
		    &io_len, off, PAGESIZE, flags);
		if (io_len == 0) {
			io_len = PAGESIZE;
		}

		bp = ud_bread(ip->i_dev,
		    ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
		    udf_vfsp->udf_lbsize);
		fe = (struct file_entry *)bp->b_un.b_addr;
		if ((bp->b_flags & B_ERROR) ||
		    (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
		    ip->i_icb_block,
		    1, udf_vfsp->udf_lbsize) != 0)) {
			if (pp != NULL)
				pvn_write_done(pp, B_ERROR | B_WRITE | flags);
			if (bp->b_flags & B_ERROR) {
				error = EIO;
			} else {
				error = EINVAL;
			}
			brelse(bp);
			return (error);
		}
		if ((bp->b_error == 0) &&
		    (bp->b_resid == 0)) {

			/* Copy the page into the embedded data area */
			caddr = bp->b_un.b_addr + ip->i_data_off;
			kaddr = (caddr_t)ppmapin(pp,
			    PROT_READ | PROT_WRITE, (caddr_t)-1);
			(void) kcopy(kaddr, caddr, ip->i_size);
			ppmapout(kaddr);
		}
		/* The tag covers the extended attributes and the data */
		crc_len = offsetof(struct file_entry, fe_spec) +
		    SWAP_32(fe->fe_len_ear);
		crc_len += ip->i_size;
		ud_make_tag(ip->i_udf, &fe->fe_tag,
		    UD_FILE_ENTRY, ip->i_icb_block, crc_len);

		bwrite(bp);

		if (flags & B_ASYNC) {
			pvn_write_done(pp, flags);
		}
		contig = ip->i_size;
	} else {

		if (error = ud_bmap_read(ip, off, &bn, &contig)) {
			goto out;
		}
		/* Clamp to one page, then round up to whole blocks */
		contig = MIN(contig, PAGESIZE);
		contig = (contig + lbmask) & (~lbmask);

		if (contig < PAGESIZE) {
			uint64_t count;

			/* Bytes from "off" to the block-rounded EOF */
			count = isize - off;
			if (contig != count) {
				/* More data follows in another extent */
				multi_io = 1;
				contig = (int32_t)(MIN(count, PAGESIZE));
			}
		}

		if ((off + contig) > isize) {
			contig = isize - off;
		}

		if (contig > PAGESIZE) {
			if (contig & PAGEOFFSET) {
				contig &= PAGEMASK;
			}
		}

		pp = pvn_write_kluster(vp, pp, &io_off,
		    &io_len, off, contig, flags);
		if (io_len == 0) {
			io_len = PAGESIZE;
		}

		bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
		ASSERT(bp != NULL);

		bp->b_edev = ip->i_dev;
		bp->b_dev = cmpdev(ip->i_dev);
		bp->b_blkno = bn;
		bp->b_un.b_addr = 0;
		bp->b_file = vp;
		bp->b_offset = (offset_t)off;


		/*
		 * write throttle
		 */
		ASSERT(bp->b_iodone == NULL);
		bp->b_iodone = ud_iodone;
		mutex_enter(&ip->i_tlock);
		ip->i_writes += bp->b_bcount;
		mutex_exit(&ip->i_tlock);

		if (multi_io == 0) {

			(void) bdev_strategy(bp);
		} else {
			/*
			 * NOTE(review): on failure bp is not passed to
			 * pageio_done() and i_writes is not backed out
			 * here -- confirm ud_multi_strat() handles both.
			 */
			error = ud_multi_strat(ip, pp, bp, off);
			if (error != 0) {
				goto out;
			}
		}

		if ((flags & B_ASYNC) == 0) {
			/*
			 * Wait for i/o to complete.
			 */
			error = biowait(bp);
			pageio_done(bp);
		}
	}

	if ((flags & B_ASYNC) == 0) {
		pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
	}

	/* The pages have been disposed of (or belong to the async I/O) */
	pp = NULL;

out:
	if (error != 0 && pp != NULL) {
		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
	}

	if (offp) {
		*offp = io_off;
	}
	if (lenp) {
		*lenp = io_len;
	}

	return (error);
}


/*
 * I/O completion routine installed by ud_putapage() for page writes.
 *
 * Subtracts the finished transfer from the inode's outstanding write
 * count (i_writes).  If that count was at or above ud_LW before and is
 * at or below it afterwards, and ud_WRITES is set, every thread waiting
 * on i_wrcv is woken.  Completion is then passed on to iodone().
 * Always returns 0.
 */
int32_t
ud_iodone(struct buf *bp)
{
	struct ud_inode *ip;
	int32_t was_over;

	ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));

	bp->b_iodone = NULL;

	ip = VTOI(bp->b_pages->p_vnode);

	mutex_enter(&ip->i_tlock);
	was_over = (ip->i_writes >= ud_LW);
	ip->i_writes -= bp->b_bcount;
	if (was_over && (ip->i_writes <= ud_LW) && ud_WRITES) {
		cv_broadcast(&ip->i_wrcv); /* wake all up */
	}
	mutex_exit(&ip->i_tlock);

	iodone(bp);
	return (0);
}

/*
 * Read from inode ip into the buffers described by uio.
 *
 * The caller must hold i_contents.  Data is moved one logical block (or
 * less) at a time through segkmap.  When i_contents is only held as
 * reader it is dropped around each segmap/uiomove step and re-taken
 * afterwards; a writer hold is kept throughout.
 *
 * Returns EIO for inodes that are not regular files, directories or
 * symlinks, EINVAL for a negative or wrapping offset, and 0 for reads
 * at or beyond MAXOFFSET_T or EOF.  If any data was transferred, an
 * error from a later step is discarded and 0 is returned.
 */
/* ARGSUSED3 */
int32_t
ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
{
	struct vnode *vp;
	struct udf_vfs *udf_vfsp;
	krw_t rwtype;
	caddr_t base;
	uint32_t flags;
	int32_t error, n, on, mapon, dofree;
	u_offset_t off;
	long oresid = uio->uio_resid;

	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	if ((ip->i_type != VREG) &&
	    (ip->i_type != VDIR) &&
	    (ip->i_type != VLNK)) {
		return (EIO);
	}

	if (uio->uio_loffset > MAXOFFSET_T) {
		return (0);
	}

	if ((uio->uio_loffset < (offset_t)0) ||
	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
		return (EINVAL);
	}
	if (uio->uio_resid == 0) {
		return (0);
	}

	vp = ITOV(ip);
	udf_vfsp = ip->i_udf;
	mutex_enter(&ip->i_tlock);
	ip->i_flag |= IACC;
	mutex_exit(&ip->i_tlock);

	/* Remember how the caller holds i_contents */
	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);

	do {
		offset_t diff;
		u_offset_t uoff = uio->uio_loffset;
		/* Start of the MAXBSIZE window and offset within it */
		off = uoff & (offset_t)MAXBMASK;
		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
		/* Offset within the logical block; bytes for this pass */
		on = (int)blkoff(udf_vfsp, uoff);
		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);

		/* Bytes left before EOF */
		diff = ip->i_size - uoff;

		if (diff <= (offset_t)0) {
			error = 0;
			goto out;
		}
		if (diff < (offset_t)n) {
			n = (int)diff;
		}
		/* Free behind only for sequential reads of larger files */
		dofree = ud_freebehind &&
		    ip->i_nextr == (off & PAGEMASK) &&
		    off > ud_smallfile;

#ifndef	__lock_lint
		/* Drop a reader hold across the segmap/uiomove step */
		if (rwtype == RW_READER) {
			rw_exit(&ip->i_contents);
		}
#endif

		base = segmap_getmapflt(segkmap, vp, (off + mapon),
		    (uint32_t)n, 1, S_READ);
		error = uiomove(base + mapon, (long)n, UIO_READ, uio);

		flags = 0;
		if (!error) {
			/*
			 * If read a whole block, or read to eof,
			 * won't need this buffer again soon.
			 */
			if (n + on == MAXBSIZE && ud_freebehind && dofree &&
			    freemem < lotsfree + pages_before_pager) {
				flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
			}
			/*
			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
			 * we want to make sure that the page which has
			 * been read, is written on disk if it is dirty.
			 * And corresponding indirect blocks should also
			 * be flushed out.
			 */
			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			error = segmap_release(segkmap, base, flags);
		} else    {
			(void) segmap_release(segkmap, base, flags);
		}

#ifndef __lock_lint
		if (rwtype == RW_READER) {
			rw_enter(&ip->i_contents, rwtype);
		}
#endif
	} while (error == 0 && uio->uio_resid > 0 && n != 0);
out:
	/*
	 * Inode is updated according to this table if FRSYNC is set.
	 *
	 *	FSYNC	FDSYNC(posix.4)
	 *	--------------------------
	 *	always	IATTCHG|IBDWRITE
	 *
	 * NOTE(review): i_contents is dropped and re-taken as writer
	 * here and is still held as writer on return, whatever mode the
	 * caller entered with -- confirm callers expect that.
	 */
	if (ioflag & FRSYNC) {
		if ((ioflag & FSYNC) ||
		    ((ioflag & FDSYNC) &&
		    (ip->i_flag & (IATTCHG|IBDWRITE)))) {
		rw_exit(&ip->i_contents);
		rw_enter(&ip->i_contents, RW_WRITER);
		ud_iupdat(ip, 1);
		}
	}
	/*
	 * If we've already done a partial read, terminate
	 * the read but return no error.
	 */
	if (oresid != uio->uio_resid) {
		error = 0;
	}
	ITIMES(ip);

	return (error);
}

/*
 * Write the data described by `uio' into the file behind `ip'.
 *
 * The caller must hold ip->i_contents as writer.  On every pass of the
 * copy loop the lock is dropped before the segmap work and re-entered as
 * RW_WRITER afterwards, so it is held as writer again on return.
 *
 * Data is moved in pieces of at most one logical block (udf_lbsize).
 * When a piece extends the file, blocks are allocated with
 * ud_bmap_write() and i_size is raised before the copy; if the copy or a
 * synchronous push then fails, the file is truncated back to the size it
 * had before that piece.
 *
 * Returns:
 *	EIO	the inode is not a regular file, directory or symlink
 *	EFBIG	the offset is at or beyond MAXOFFSET_T or the file size
 *		limit (uio_limit, capped at 2^40 - 1)
 *	EINVAL	negative offset, or offset + resid is negative
 *	0	zero-length request, or any request that moved data
 *		(a partial write is not reported as an error)
 *	else	the error from ud_bmap_write(), uiomove() or a
 *		synchronous segmap_release()
 */
int32_t
ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
{
	caddr_t base;
	struct vnode *vp;
	struct udf_vfs *udf_vfsp;
	uint32_t flags;
	int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
	int32_t pagecreate, newpage;
	uint64_t old_i_size;
	u_offset_t off;
	long start_resid = uio->uio_resid, premove_resid;
	rlim64_t limit = uio->uio_limit;


	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	if ((ip->i_type != VREG) &&
	    (ip->i_type != VDIR) &&
	    (ip->i_type != VLNK)) {
		return (EIO);
	}

	if (uio->uio_loffset >= MAXOFFSET_T) {
		return (EFBIG);
	}
	/*
	 * see udf_l_pathconf
	 *
	 * The effective limit is never allowed above 2^40 - 1.
	 */
	if (limit > (((uint64_t)1 << 40) - 1)) {
		limit = ((uint64_t)1 << 40) - 1;
	}
	if (uio->uio_loffset >= limit) {
		proc_t *p = ttoproc(curthread);

		/*
		 * Starting offset is already at the limit: run the
		 * RLIMIT_FSIZE resource-control action for the current
		 * process before failing the write.
		 */
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
		    p, RCA_UNSAFE_SIGINFO);
		mutex_exit(&p->p_lock);
		return (EFBIG);
	}
	if ((uio->uio_loffset < (offset_t)0) ||
	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
		return (EINVAL);
	}
	if (uio->uio_resid == 0) {
		return (0);
	}

	mutex_enter(&ip->i_tlock);
	ip->i_flag |= INOACC;

	/*
	 * iupdat_flag is assigned only on this synchronous path (and
	 * cleared further down for swap vnodes).  Its single read, after
	 * the out: label, is guarded by (ioflag & FSYNC), so it is never
	 * read without having been set here.
	 */
	if (ioflag & (FSYNC | FDSYNC)) {
		ip->i_flag |= ISYNC;
		iupdat_flag = 1;
	}
	mutex_exit(&ip->i_tlock);

	udf_vfsp = ip->i_udf;
	vp = ITOV(ip);

	do {
		/*
		 * off and mapon split the file offset on a MAXBSIZE
		 * boundary; on is blkoff() of the offset for this file
		 * system; n is the byte count for this pass, bounded by
		 * the rest of the logical block and by uio_resid.
		 */
		u_offset_t uoff = uio->uio_loffset;
		off = uoff & (offset_t)MAXBMASK;
		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
		on = (int)blkoff(udf_vfsp, uoff);
		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);

		/*
		 * For regular files, trim this piece so it does not
		 * reach the size limit; fail if nothing is left.
		 */
		if (ip->i_type == VREG && uoff + n >= limit) {
			if (uoff >= limit) {
				error = EFBIG;
				goto out;
			}
			n = (int)(limit - (rlim64_t)uoff);
		}
		if (uoff + n > ip->i_size) {
			/*
			 * We are extending the length of the file.
			 * bmap is used so that we are sure that
			 * if we need to allocate new blocks, that it
			 * is done here before we up the file size.
			 */
			error = ud_bmap_write(ip, uoff,
			    (int)(on + n), mapon == 0, cr);
			if (error) {
				break;
			}
			/*
			 * Remember the size before this piece so it can
			 * be restored if the copy fails.
			 */
			i_size_changed = 1;
			old_i_size = ip->i_size;
			ip->i_size = uoff + n;
			/*
			 * If we are writing from the beginning of
			 * the mapping, we can just create the
			 * pages without having to read them.
			 */
			pagecreate = (mapon == 0);
		} else if (n == MAXBSIZE) {
			/*
			 * Going to do a whole mappings worth,
			 * so we can just create the pages w/o
			 * having to read them in.  But before
			 * we do that, we need to make sure any
			 * needed blocks are allocated first.
			 */
			error = ud_bmap_write(ip, uoff,
			    (int)(on + n), 1, cr);
			if (error) {
				break;
			}
			pagecreate = 1;
		} else {
			pagecreate = 0;
		}

		/*
		 * Drop the contents lock for the duration of the
		 * segmap work; it is re-entered as writer below.
		 */
		rw_exit(&ip->i_contents);

		/*
		 * Touch the page and fault it in if it is not in
		 * core before segmap_getmapflt can lock it. This
		 * is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to
		 * write to.
		 */
		uio_prefaultpages((long)n, uio);

		base = segmap_getmapflt(segkmap, vp, (off + mapon),
		    (uint32_t)n, !pagecreate, S_WRITE);

		/*
		 * segmap_pagecreate() returns 1 if it calls
		 * page_create_va() to allocate any pages.
		 */
		newpage = 0;
		if (pagecreate) {
			newpage = segmap_pagecreate(segkmap, base,
			    (size_t)n, 0);
		}

		/*
		 * Save resid so it can be put back if a synchronous
		 * push of this piece fails after the copy succeeded.
		 */
		premove_resid = uio->uio_resid;
		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);

		if (pagecreate &&
		    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This happens on most EOF write cases and if
			 * we had some sort of error during the uiomove.
			 */
			int nzero, nmoved;

			nmoved = (int)(uio->uio_loffset - (off + mapon));
			ASSERT(nmoved >= 0 && nmoved <= n);
			nzero = roundup(on + n, PAGESIZE) - nmoved;
			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
		}

		/*
		 * Unlock the pages allocated by page_create_va()
		 * in segmap_pagecreate()
		 */
		if (newpage) {
			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
		}

		if (error) {
			/*
			 * If we failed on a write, we may have already
			 * allocated file blocks as well as pages.  It's
			 * hard to undo the block allocation, but we must
			 * be sure to invalidate any pages that may have
			 * been allocated.
			 */
			(void) segmap_release(segkmap, base, SM_INVAL);
		} else {
			flags = 0;
			/*
			 * Force write back for synchronous write cases.
			 * Directory writes are always treated as
			 * synchronous here.
			 */
			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
				/*
				 * If the sticky bit is set but the
				 * execute bit is not set, we do a
				 * synchronous write back and free
				 * the page when done.  We set up swap
				 * files to be handled this way to
				 * prevent servers from keeping around
				 * the client's swap pages too long.
				 * XXX - there ought to be a better way.
				 */
				if (IS_SWAPVP(vp)) {
					flags = SM_WRITE | SM_FREE |
					    SM_DONTNEED;
					iupdat_flag = 0;
				} else {
					flags = SM_WRITE;
				}
			} else if (((mapon + n) == MAXBSIZE) ||
			    IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write and
				 * mark the buffer to indicate that
				 * it won't be needed again soon.
				 */
				flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
			}
			error = segmap_release(segkmap, base, flags);

			/*
			 * If the operation failed and is synchronous,
			 * then we need to unwind what uiomove() last
			 * did so we can potentially return an error to
			 * the caller.  If this write operation was
			 * done in two pieces and the first succeeded,
			 * then we won't return an error for the second
			 * piece that failed.  However, we only want to
			 * return a resid value that reflects what was
			 * really done.
			 *
			 * Failures for non-synchronous operations can
			 * be ignored since the page subsystem will
			 * retry the operation until it succeeds or the
			 * file system is unmounted.
			 */
			if (error) {
				if ((ioflag & (FSYNC | FDSYNC)) ||
				    ip->i_type == VDIR) {
					uio->uio_resid = premove_resid;
				} else {
					error = 0;
				}
			}
		}

		/*
		 * Re-acquire contents lock.
		 */
		rw_enter(&ip->i_contents, RW_WRITER);
		/*
		 * If the uiomove() failed or if a synchronous
		 * page push failed, fix up i_size.
		 */
		if (error) {
			if (i_size_changed) {
				/*
				 * The uiomove failed, and we
				 * allocated blocks, so get rid
				 * of them.
				 */
				(void) ud_itrunc(ip, old_i_size, 0, cr);
			}
		} else {
			/*
			 * XXX - Can this be out of the loop?
			 */
			ip->i_flag |= IUPD | ICHG;
			if (i_size_changed) {
				ip->i_flag |= IATTCHG;
			}
			if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
			    (IEXEC >> 10))) != 0 &&
			    (ip->i_char & (ISUID | ISGID)) != 0 &&
			    secpolicy_vnode_setid_retain(cr,
			    (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
				/*
				 * Clear Set-UID & Set-GID bits on
				 * successful write if not privileged
				 * and at least one of the execute bits
				 * is set.  If we always clear Set-GID,
				 * mandatory file and record locking is
				 * unusable.
				 */
				ip->i_char &= ~(ISUID | ISGID);
			}
		}
	} while (error == 0 && uio->uio_resid > 0 && n != 0);

out:
	/*
	 * Inode is updated according to this table -
	 *
	 *	FSYNC	FDSYNC(posix.4)
	 *	--------------------------
	 *	always@	IATTCHG|IBDWRITE
	 *
	 * @ -  If we are doing synchronous write the only time we should
	 *	not be sync'ing the ip here is if we have the stickyhack
	 *	activated, the file is marked with the sticky bit and
	 *	no exec bit, the file length has not been changed and
	 *	no new blocks have been allocated during this write.
	 */
	if ((ip->i_flag & ISYNC) != 0) {
		/*
		 * we have eliminated nosync
		 */
		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
		    ((ioflag & FSYNC) && iupdat_flag)) {
			ud_iupdat(ip, 1);
		}
	}

	/*
	 * If we've already done a partial-write, terminate
	 * the write but return no error.
	 */
	if (start_resid != uio->uio_resid) {
		error = 0;
	}
	/* Clear the per-write state flags set at the top of this call. */
	ip->i_flag &= ~(INOACC | ISYNC);
	ITIMES_NOLOCK(ip);

	return (error);
}

/*
 * Carry out the transfer described by `bp' as a set of separate device
 * I/Os, one per allocated extent that ud_bmap_read() reports for the
 * range beginning at file offset `start' (which must be page aligned).
 *
 * Pass 1 walks the range to count the extents that are not holes; for a
 * read, each hole is zero-filled in `pp' on the spot.  If nothing needs
 * device I/O, or ud_bmap_read() reports a zero-length extent, the
 * function returns without starting any I/O.
 *
 * Pass 2 allocates one mio_master_t followed by io_count mio_slave_t
 * entries in a single block, clones `bp' into each slave with
 * ud_slave_done() as the completion routine, and adds each extent's
 * length to mm_resid.  Only after every clone is set up are the slaves
 * passed to bdev_strategy(); the allocation is released by
 * ud_slave_done() once the slaves have completed.
 *
 * NOTE(review): pass 1 clamps an extent to PAGESIZE - io_off while
 * pass 2 clamps to b_bcount - io_off; the two agree only when
 * b_bcount <= PAGESIZE -- confirm against the callers.
 *
 * Returns 0 on success or the error from ud_bmap_read().  On error,
 * B_ERROR and b_error are set on `bp' and any partially built
 * master/slave state is torn down here.
 */
int32_t
ud_multi_strat(struct ud_inode *ip,
	page_t *pp, struct buf *bp, u_offset_t start)
{
	daddr_t bn;
	int32_t error = 0, io_count, contig, alloc_sz, i;
	uint32_t io_off;
	mio_master_t *mm = NULL;
	mio_slave_t *ms = NULL;
	struct buf *rbp;

	ASSERT(!(start & PAGEOFFSET));

	/*
	 * Figure out how many buffers to allocate
	 */
	io_count = 0;
	for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
		contig = 0;
		error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
		    &bn, &contig);
		if (error != 0) {
			goto end;
		}
		if (contig == 0) {
			goto end;
		}
		contig = MIN(contig, PAGESIZE - io_off);
		if (bn != UDF_HOLE) {
			io_count++;
		} else if (bp->b_flags & B_READ) {
			/*
			 * This is a hole and is read
			 * it should be filled with 0's
			 */
			pagezero(pp, io_off, contig);
		}
	}

	if (io_count != 0) {

		/*
		 * Allocate memory for all the required number of
		 * buffers.  KM_SLEEP allocations block until they
		 * succeed, so no NULL check is needed.
		 */
		alloc_sz = sizeof (mio_master_t) +
		    (sizeof (mio_slave_t) * io_count);
		mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);

		/*
		 * initialize master
		 */
		mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
		mm->mm_size = alloc_sz;
		mm->mm_bp = bp;
		mm->mm_resid = 0;
		mm->mm_error = 0;
		mm->mm_index = master_index++;

		/* The slave array starts right after the master. */
		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));

		/*
		 * Initialize buffers.  io_count is restarted here and
		 * from now on counts the slaves that have been set up.
		 */
		io_count = 0;
		for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
			contig = 0;
			error = ud_bmap_read(ip,
			    (u_offset_t)(start + io_off),
			    &bn, &contig);
			if (error != 0) {
				goto end;
			}
			ASSERT(contig);
			if ((io_off + contig) > bp->b_bcount) {
				contig = bp->b_bcount - io_off;
			}
			if (bn != UDF_HOLE) {
				/*
				 * Clone the buffer
				 * and prepare to start I/O
				 */
				ms->ms_ptr = mm;
				bioinit(&ms->ms_buf);
				rbp = bioclone(bp, io_off, (size_t)contig,
				    bp->b_edev, bn, ud_slave_done,
				    &ms->ms_buf, KM_NOSLEEP);
				ASSERT(rbp == &ms->ms_buf);
				mm->mm_resid += contig;
				io_count++;
				ms++;
			}
		}

		/*
		 * Start I/O's
		 */
		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
		for (i = 0; i < io_count; i++) {
			(void) bdev_strategy(&ms->ms_buf);
			ms++;
		}
	}

end:
	if (error != 0) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
		if (mm != NULL) {
			/*
			 * mm is only non-NULL here when the second pass
			 * failed part way through, before any I/O was
			 * started.  Undo bioinit() for every slave that
			 * was already set up before freeing the memory
			 * that holds them.
			 */
			ms = (mio_slave_t *)(((caddr_t)mm) +
			    sizeof (mio_master_t));
			for (i = 0; i < io_count; i++) {
				biofini(&ms->ms_buf);
				ms++;
			}
			mutex_destroy(&mm->mm_mutex);
			kmem_free(mm, mm->mm_size);
		}
	}
	return (error);
}

/*
 * Completion routine for one cloned (slave) buffer created by
 * ud_multi_strat().
 *
 * Under the master's mutex, a slave error is copied into the master and
 * the slave's byte count is subtracted from the master's outstanding
 * total.  The slave buffer is then unmapped and finalized.  The slave
 * that brings the outstanding total to zero completes the original
 * buffer with biodone() and frees the master/slave allocation.
 *
 * Always returns 0.
 */
int32_t
ud_slave_done(struct buf *bp)
{
	mio_master_t *master;
	int32_t outstanding;

	ASSERT(SEMA_HELD(&bp->b_sem));
	ASSERT((bp->b_flags & B_DONE) == 0);

	/*
	 * The buf is converted straight back to its slave record; this
	 * assumes ms_buf is the first member of mio_slave_t.
	 */
	master = ((mio_slave_t *)bp)->ms_ptr;

	/*
	 * Fold this slave's outcome into the master.
	 */
	mutex_enter(&master->mm_mutex);
	if (bp->b_flags & B_ERROR) {
		/*
		 * Only one error can be handed back, so when several
		 * slaves fail the most recent one simply overwrites
		 * the earlier ones.
		 */
		master->mm_error = bp->b_error;
	}
	outstanding = (master->mm_resid -= bp->b_bcount);
	mutex_exit(&master->mm_mutex);

	/*
	 * Release what was set up for the cloned buffer.
	 */
	bp_mapout(bp);
	biofini(bp);

	if (outstanding != 0) {
		/* Other slaves are still in flight. */
		return (0);
	}

	/*
	 * This was the last slave: report any recorded error on the
	 * original buffer, complete it, and free the shared state.
	 */
	if (master->mm_error) {
		master->mm_bp->b_flags |= B_ERROR;
		master->mm_bp->b_error = master->mm_error;
	}
	biodone(master->mm_bp);
	mutex_destroy(&master->mm_mutex);
	kmem_free(master, master->mm_size);

	return (0);
}