changeset 3638:6b28ebc717aa

6496357 spec_fsync() is useless on devices that do write caching 6496359 zfs_read does unnecessary work on mmaped files with cached data 6496341 zvol performance can be improved 6496344 zvol async routines are synchronous 6496346 zfs_write with multiple iovs logs wrong data 6496347 zfs_write can be made much simpler thanks to range locks 6496350 zvol needs to log all writes to honor fsync 6496356 zvol should behave like a write-caching device 6489169 zvol: Incorrect ordering of biodone() and ensuring data is stable
author billm
date Mon, 12 Feb 2007 17:35:21 -0800
parents 526d8412c163
children 77dd70fd4e7b
files usr/src/uts/common/fs/specfs/specvnops.c usr/src/uts/common/fs/zfs/dmu.c usr/src/uts/common/fs/zfs/sys/dmu.h usr/src/uts/common/fs/zfs/sys/zfs_znode.h usr/src/uts/common/fs/zfs/zfs_ioctl.c usr/src/uts/common/fs/zfs/zfs_log.c usr/src/uts/common/fs/zfs/zfs_vnops.c usr/src/uts/common/fs/zfs/zvol.c usr/src/uts/common/sys/fs/snode.h
diffstat 9 files changed, 286 insertions(+), 315 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/fs/specfs/specvnops.c	Mon Feb 12 15:22:44 2007 -0800
+++ b/usr/src/uts/common/fs/specfs/specvnops.c	Mon Feb 12 17:35:21 2007 -0800
@@ -50,6 +50,7 @@
 #include <sys/conf.h>
 #include <sys/ddi.h>
 #include <sys/debug.h>
+#include <sys/dkio.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/fcntl.h>
@@ -1294,6 +1295,23 @@
 		(void) VOP_PUTPAGE(cvp, (offset_t)0, 0, 0, cr);
 
 	/*
+	 * For devices that support it, force write cache to stable storage.
+	 * We don't need the lock to check s_flags since we can treat
+	 * SNOFLUSH as a hint.
+	 */
+	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
+	    !(sp->s_flag & SNOFLUSH)) {
+		int rval, rc;
+		rc = cdev_ioctl(vp->v_rdev, DKIOCFLUSHWRITECACHE,
+		    NULL, FNATIVE|FKIOCTL, cr, &rval);
+		if (rc == ENOTSUP || rc == ENOTTY) {
+			mutex_enter(&sp->s_lock);
+			sp->s_flag |= SNOFLUSH;
+			mutex_exit(&sp->s_lock);
+		}
+	}
+
+	/*
 	 * If no real vnode to update, don't flush anything.
 	 */
 	if (realvp == NULL)
--- a/usr/src/uts/common/fs/zfs/dmu.c	Mon Feb 12 15:22:44 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Mon Feb 12 17:35:21 2007 -0800
@@ -360,13 +360,15 @@
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
 
-	/*
-	 * Deal with odd block sizes, where there can't be data past the
-	 * first block.
-	 */
 	err = dnode_hold(os->os, object, FTAG, &dn);
 	if (err)
 		return (err);
+
+	/*
+	 * Deal with odd block sizes, where there can't be data past the first
+	 * block.  If we ever do the tail block optimization, we will need to
+	 * handle that here as well.
+	 */
 	if (dn->dn_datablkshift == 0) {
 		int newsz = offset > dn->dn_datablksz ? 0 :
 		    MIN(size, dn->dn_datablksz - offset);
@@ -453,8 +455,45 @@
 
 #ifdef _KERNEL
 int
-dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
-    uio_t *uio, dmu_tx_t *tx)
+dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i, err;
+
+	/*
+	 * NB: we could do this block-at-a-time, but it's nice
+	 * to be reading in parallel.
+	 */
+	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
+	    &numbufs, &dbp);
+	if (err)
+		return (err);
+
+	for (i = 0; i < numbufs; i++) {
+		int tocpy;
+		int bufoff;
+		dmu_buf_t *db = dbp[i];
+
+		ASSERT(size > 0);
+
+		bufoff = uio->uio_loffset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+
+		err = uiomove((char *)db->db_data + bufoff, tocpy,
+		    UIO_READ, uio);
+		if (err)
+			break;
+
+		size -= tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+	return (err);
+}
+
+int
+dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
+    dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i;
@@ -463,7 +502,7 @@
 	if (size == 0)
 		return (0);
 
-	err = dmu_buf_hold_array(os, object, offset, size,
+	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
 	    FALSE, FTAG, &numbufs, &dbp);
 	if (err)
 		return (err);
@@ -475,7 +514,7 @@
 
 		ASSERT(size > 0);
 
-		bufoff = offset - db->db_offset;
+		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
@@ -500,7 +539,6 @@
 		if (err)
 			break;
 
-		offset += tocpy;
 		size -= tocpy;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Feb 12 15:22:44 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Feb 12 17:35:21 2007 -0800
@@ -424,8 +424,9 @@
 	void *buf);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
-int dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
-    struct uio *uio, dmu_tx_t *tx);
+int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
+    dmu_tx_t *tx);
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
 
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Mon Feb 12 15:22:44 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Mon Feb 12 17:35:21 2007 -0800
@@ -264,7 +264,7 @@
 extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
 extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-    znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio);
+    znode_t *zp, offset_t off, ssize_t len, int ioflag);
 extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, uint64_t off, uint64_t len);
 extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Feb 12 15:22:44 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Feb 12 17:35:21 2007 -0800
@@ -1617,8 +1617,8 @@
 	NULL,		/* streamtab */
 	D_NEW | D_MP | D_64BIT,		/* Driver compatibility flag */
 	CB_REV,		/* version */
-	zvol_aread,	/* async read */
-	zvol_awrite,	/* async write */
+	nodev,		/* async read */
+	nodev,		/* async write */
 };
 
 static struct dev_ops zfs_dev_ops = {
--- a/usr/src/uts/common/fs/zfs/zfs_log.c	Mon Feb 12 15:22:44 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c	Mon Feb 12 17:35:21 2007 -0800
@@ -208,13 +208,12 @@
 
 void
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-	znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio)
+	znode_t *zp, offset_t off, ssize_t len, int ioflag)
 {
 	itx_t *itx;
 	uint64_t seq;
 	lr_write_t *lr;
 	itx_wr_state_t write_state;
-	size_t dlen;
 	int err;
 
 	if (zilog == NULL || zp->z_unlinked)
@@ -237,34 +236,28 @@
 	 *    flush the write later then a buffer is allocated and
 	 *    we retrieve the data using the dmu.
 	 */
-	if (len > zfs_immediate_write_sz) {
-		dlen = 0;
+	if (len > zfs_immediate_write_sz)
 		write_state = WR_INDIRECT;
-	} else if (ioflag & FDSYNC) {
-		dlen = len;
+	else if (ioflag & FDSYNC)
 		write_state = WR_COPIED;
-	} else {
-		dlen = 0;
+	else
 		write_state = WR_NEED_COPY;
-	}
-	itx = zil_itx_create(txtype, sizeof (*lr) + dlen);
+
+	itx = zil_itx_create(txtype, sizeof (*lr) +
+	    (write_state == WR_COPIED ? len : 0));
+	lr = (lr_write_t *)&itx->itx_lr;
 	if (write_state == WR_COPIED) {
-		err = xcopyin(uio->uio_iov->iov_base - len,
-		    (char *)itx + offsetof(itx_t, itx_lr) + sizeof (*lr), len);
-		/*
-		 * xcopyin shouldn't error as we've already successfully
-		 * copied it to a dmu buffer. However if it does we'll get
-		 * the data from the dmu later.
-		 */
+		err = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1);
 		if (err) {
-			kmem_free(itx, offsetof(itx_t, itx_lr)
-			    + itx->itx_lr.lrc_reclen);
+			kmem_free(itx, offsetof(itx_t, itx_lr) +
+			    itx->itx_lr.lrc_reclen);
 			itx = zil_itx_create(txtype, sizeof (*lr));
+			lr = (lr_write_t *)&itx->itx_lr;
 			write_state = WR_NEED_COPY;
 		}
 	}
+
 	itx->itx_wr_state = write_state;
-	lr = (lr_write_t *)&itx->itx_lr;
 	lr->lr_foid = zp->z_id;
 	lr->lr_offset = off;
 	lr->lr_length = len;
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c	Mon Feb 12 15:22:44 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c	Mon Feb 12 17:35:21 2007 -0800
@@ -280,7 +280,7 @@
  *	the file is memory mapped.
  */
 static int
-mappedwrite(vnode_t *vp, uint64_t woff, int nbytes, uio_t *uio, dmu_tx_t *tx)
+mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
@@ -293,6 +293,7 @@
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		page_t *pp;
 		uint64_t bytes = MIN(PAGESIZE - off, len);
+		uint64_t woff = uio->uio_loffset;
 
 		/*
 		 * We don't want a new page to "appear" in the middle of
@@ -315,11 +316,10 @@
 			page_unlock(pp);
 		} else {
 			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
-			    woff, bytes, uio, tx);
+			    uio, bytes, tx);
 			rw_exit(&zp->z_map_lock);
 		}
 		len -= bytes;
-		woff += bytes;
 		off = 0;
 		if (error)
 			break;
@@ -338,9 +338,11 @@
  *	the file is memory mapped.
  */
 static int
-mappedread(vnode_t *vp, char *addr, int nbytes, uio_t *uio)
+mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 {
-	int64_t	start, off, bytes;
+	znode_t *zp = VTOZ(vp);
+	objset_t *os = zp->z_zfsvfs->z_os;
+	int64_t	start, off;
 	int len = nbytes;
 	int error = 0;
 
@@ -348,8 +350,8 @@
 	off = start & PAGEOFFSET;
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		page_t *pp;
-
-		bytes = MIN(PAGESIZE - off, len);
+		uint64_t bytes = MIN(PAGESIZE - off, len);
+
 		if (pp = page_lookup(vp, start, SE_SHARED)) {
 			caddr_t va;
 
@@ -358,11 +360,9 @@
 			ppmapout(va);
 			page_unlock(pp);
 		} else {
-			/* XXX use dmu_read here? */
-			error = uiomove(addr, bytes, UIO_READ, uio);
+			error = dmu_read_uio(os, zp->z_id, uio, bytes);
 		}
 		len -= bytes;
-		addr += bytes;
 		off = 0;
 		if (error)
 			break;
@@ -370,7 +370,7 @@
 	return (error);
 }
 
-uint_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
+offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 
 /*
  * Read bytes from specified file into supplied buffer.
@@ -395,10 +395,9 @@
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	uint64_t	delta;
-	ssize_t		n, size, cnt, ndone;
-	int		error, i, numbufs;
-	dmu_buf_t	*dbp, **dbpp;
+	objset_t	*os = zfsvfs->z_os;
+	ssize_t		n, nbytes;
+	int		error;
 	rl_t		*rl;
 
 	ZFS_ENTER(zfsvfs);
@@ -446,58 +445,27 @@
 	 * to the end; but we might still need to set atime.
 	 */
 	if (uio->uio_loffset >= zp->z_phys->zp_size) {
-		cnt = 0;
 		error = 0;
 		goto out;
 	}
 
-	cnt = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
-
-	for (ndone = 0; ndone < cnt; ndone += zfs_read_chunk_size) {
-		ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
-		n = MIN(zfs_read_chunk_size,
-		    zp->z_phys->zp_size - uio->uio_loffset);
-		n = MIN(n, cnt);
-		error = dmu_buf_hold_array_by_bonus(zp->z_dbuf,
-		    uio->uio_loffset, n, TRUE, FTAG, &numbufs, &dbpp);
+	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
+	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+
+	while (n > 0) {
+		nbytes = MIN(n, zfs_read_chunk_size -
+		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+
+		if (vn_has_cached_data(vp))
+			error = mappedread(vp, nbytes, uio);
+		else
+			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
 		if (error)
-			goto out;
-		/*
-		 * Compute the adjustment to align the dmu buffers
-		 * with the uio buffer.
-		 */
-		delta = uio->uio_loffset - dbpp[0]->db_offset;
-
-		for (i = 0; i < numbufs; i++) {
-			if (n < 0)
-				break;
-			dbp = dbpp[i];
-			size = dbp->db_size - delta;
-			/*
-			 * XXX -- this is correct, but may be suboptimal.
-			 * If the pages are all clean, we don't need to
-			 * go through mappedread().  Maybe the VMODSORT
-			 * stuff can help us here.
-			 */
-			if (vn_has_cached_data(vp)) {
-				error = mappedread(vp, (caddr_t)dbp->db_data +
-				    delta, (n < size ? n : size), uio);
-			} else {
-				error = uiomove((caddr_t)dbp->db_data + delta,
-					(n < size ? n : size), UIO_READ, uio);
-			}
-			if (error) {
-				dmu_buf_rele_array(dbpp, numbufs, FTAG);
-				goto out;
-			}
-			n -= dbp->db_size;
-			if (delta) {
-				n += delta;
-				delta = 0;
-			}
-		}
-		dmu_buf_rele_array(dbpp, numbufs, FTAG);
+			break;
+
+		n -= nbytes;
 	}
+
 out:
 	zfs_range_unlock(rl);
 
@@ -660,8 +628,9 @@
 	}
 
 	if (woff >= limit) {
-		error = EFBIG;
-		goto no_tx_done;
+		zfs_range_unlock(rl);
+		ZFS_EXIT(zfsvfs);
+		return (EFBIG);
 	}
 
 	if ((woff + n) > limit || woff > (limit - n))
@@ -671,114 +640,21 @@
 	 * Check for mandatory locks
 	 */
 	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
-	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0)
-		goto no_tx_done;
+	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
+		zfs_range_unlock(rl);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
 	end_size = MAX(zp->z_phys->zp_size, woff + n);
-top:
-	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_bonus(tx, zp->z_id);
-	dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
-	if (error) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
-		dmu_tx_abort(tx);
-		goto no_tx_done;
-	}
 
 	/*
-	 * If zfs_range_lock() over-locked we grow the blocksize
-	 * and then reduce the lock range.
-	 */
-	if (rl->r_len == UINT64_MAX) {
-		uint64_t new_blksz;
-
-		if (zp->z_blksz > max_blksz) {
-			ASSERT(!ISP2(zp->z_blksz));
-			new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
-		} else {
-			new_blksz = MIN(end_size, max_blksz);
-		}
-		zfs_grow_blocksize(zp, new_blksz, tx);
-		zfs_range_reduce(rl, woff, n);
-	}
-
-	/*
-	 * The file data does not fit in the znode "cache", so we
-	 * will be writing to the file block data buffers.
-	 * Each buffer will be written in a separate transaction;
-	 * this keeps the intent log records small and allows us
-	 * to do more fine-grained space accounting.
+	 * Write the file in reasonable size chunks.  Each chunk is written
+	 * in a separate transaction; this keeps the intent log records small
+	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
 		/*
-		 * XXX - should we really limit each write to z_max_blksz?
-		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
-		 */
-		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-		rw_enter(&zp->z_map_lock, RW_READER);
-
-		tx_bytes = uio->uio_resid;
-		if (vn_has_cached_data(vp)) {
-			rw_exit(&zp->z_map_lock);
-			error = mappedwrite(vp, woff, nbytes, uio, tx);
-		} else {
-			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
-			    woff, nbytes, uio, tx);
-			rw_exit(&zp->z_map_lock);
-		}
-		tx_bytes -= uio->uio_resid;
-
-		if (error) {
-			/* XXX - do we need to "clean up" the dmu buffer? */
-			break;
-		}
-
-		ASSERT(tx_bytes == nbytes);
-
-		/*
-		 * Clear Set-UID/Set-GID bits on successful write if not
-		 * privileged and at least one of the excute bits is set.
-		 *
-		 * It would be nice to to this after all writes have
-		 * been done, but that would still expose the ISUID/ISGID
-		 * to another app after the partial write is committed.
-		 */
-
-		mutex_enter(&zp->z_acl_lock);
-		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
-		    (S_IXUSR >> 6))) != 0 &&
-		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
-		    secpolicy_vnode_setid_retain(cr,
-		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
-		    zp->z_phys->zp_uid == 0) != 0) {
-			    zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
-		}
-		mutex_exit(&zp->z_acl_lock);
-
-		n -= nbytes;
-		if (n <= 0)
-			break;
-
-		/*
-		 * We have more work ahead of us, so wrap up this transaction
-		 * and start another.  Exact same logic as tx_done below.
-		 */
-		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) {
-			dmu_buf_will_dirty(zp->z_dbuf, tx);
-			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
-			    uio->uio_loffset);
-		}
-		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
-		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
-		    ioflag, uio);
-		dmu_tx_commit(tx);
-
-		/*
-		 * Start another transaction.
+		 * Start a transaction.
 		 */
 		woff = uio->uio_loffset;
 		tx = dmu_tx_create(zfsvfs->z_os);
@@ -790,33 +666,98 @@
 			    zfsvfs->z_assign == TXG_NOWAIT) {
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
-				goto top;
+				continue;
 			}
 			dmu_tx_abort(tx);
-			goto no_tx_done;
+			break;
+		}
+
+		/*
+		 * If zfs_range_lock() over-locked we grow the blocksize
+		 * and then reduce the lock range.  This will only happen
+		 * on the first iteration since zfs_range_reduce() will
+		 * shrink down r_len to the appropriate size.
+		 */
+		if (rl->r_len == UINT64_MAX) {
+			uint64_t new_blksz;
+
+			if (zp->z_blksz > max_blksz) {
+				ASSERT(!ISP2(zp->z_blksz));
+				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+			} else {
+				new_blksz = MIN(end_size, max_blksz);
+			}
+			zfs_grow_blocksize(zp, new_blksz, tx);
+			zfs_range_reduce(rl, woff, n);
+		}
+
+		/*
+		 * XXX - should we really limit each write to z_max_blksz?
+		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+		 */
+		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+		rw_enter(&zp->z_map_lock, RW_READER);
+
+		tx_bytes = uio->uio_resid;
+		if (vn_has_cached_data(vp)) {
+			rw_exit(&zp->z_map_lock);
+			error = mappedwrite(vp, nbytes, uio, tx);
+		} else {
+			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
+			    uio, nbytes, tx);
+			rw_exit(&zp->z_map_lock);
 		}
-	}
-
-tx_done:
-
-	if (tx_bytes != 0) {
+		tx_bytes -= uio->uio_resid;
+
+		/*
+		 * If we made no progress, we're done.  If we made even
+		 * partial progress, update the znode and ZIL accordingly.
+		 */
+		if (tx_bytes == 0) {
+			ASSERT(error != 0);
+			break;
+		}
+
 		/*
-		 * Update the file size if it has changed; account
-		 * for possible concurrent updates.
+		 * Clear Set-UID/Set-GID bits on successful write if not
+		 * privileged and at least one of the execute bits is set.
+		 *
+		 * It would be nice to do this after all writes have
+		 * been done, but that would still expose the ISUID/ISGID
+		 * to another app after the partial write is committed.
 		 */
-		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) {
-			dmu_buf_will_dirty(zp->z_dbuf, tx);
+		mutex_enter(&zp->z_acl_lock);
+		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
+		    (S_IXUSR >> 6))) != 0 &&
+		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
+		    secpolicy_vnode_setid_retain(cr,
+		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
+		    zp->z_phys->zp_uid == 0) != 0) {
+			    zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
+		}
+		mutex_exit(&zp->z_acl_lock);
+
+		/*
+		 * Update time stamp.  NOTE: This marks the bonus buffer as
+		 * dirty, so we don't have to do it again for zp_size.
+		 */
+		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+
+		/*
+		 * Update the file size (zp_size) if it has changed;
+		 * account for possible concurrent updates.
+		 */
+		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
 			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
 			    uio->uio_loffset);
-		}
-		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
-		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
-		    ioflag, uio);
+		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
+		dmu_tx_commit(tx);
+
+		if (error != 0)
+			break;
+		ASSERT(tx_bytes == nbytes);
+		n -= nbytes;
 	}
-	dmu_tx_commit(tx);
-
-
-no_tx_done:
 
 	zfs_range_unlock(rl);
 
@@ -863,7 +804,7 @@
 	dmu_buf_t *db;
 	rl_t *rl;
 	zgd_t *zgd;
-	int dlen = lr->lr_length;  		/* length of user data */
+	int dlen = lr->lr_length;		/* length of user data */
 	int error = 0;
 
 	ASSERT(zio);
@@ -2243,7 +2184,7 @@
 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t	*zl;
-	znode_t 	*zp = tdzp;
+	znode_t		*zp = tdzp;
 	uint64_t	rootid = zp->z_zfsvfs->z_root;
 	uint64_t	*oidp = &zp->z_id;
 	krwlock_t	*rwlp = &szp->z_parent_lock;
@@ -2923,8 +2864,7 @@
 
 	if (err == 0) {
 		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
-		(void) zfs_log_write(
-		    zilog, tx, TX_WRITE, zp, off, len, 0, NULL);
+		zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0);
 		dmu_tx_commit(tx);
 	}
 
--- a/usr/src/uts/common/fs/zfs/zvol.c	Mon Feb 12 15:22:44 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Mon Feb 12 17:35:21 2007 -0800
@@ -42,7 +42,6 @@
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/errno.h>
-#include <sys/aio_req.h>
 #include <sys/uio.h>
 #include <sys/buf.h>
 #include <sys/modctl.h>
@@ -110,7 +109,7 @@
  */
 int zvol_maxphys = DMU_MAX_ACCESS/2;
 
-int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
+static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
 
 static void
 zvol_size_changed(zvol_state_t *zv, dev_t dev)
@@ -679,29 +678,7 @@
 	return (0);
 }
 
-/*
- * Create and return an immediate write ZIL transaction.
- */
-itx_t *
-zvol_immediate_itx(offset_t off, ssize_t len, char *addr)
-{
-	itx_t *itx;
-	lr_write_t *lr;
-
-	itx = zil_itx_create(TX_WRITE, sizeof (*lr) + len);
-	lr = (lr_write_t *)&itx->itx_lr;
-	lr->lr_foid = ZVOL_OBJ;
-	lr->lr_offset = off;
-	lr->lr_length = len;
-	lr->lr_blkoff = 0;
-	BP_ZERO(&lr->lr_blkptr);
-	bcopy(addr, (char *)itx + offsetof(itx_t, itx_lr) +
-	    sizeof (*lr), len);
-	itx->itx_wr_state = WR_COPIED;
-	return (itx);
-}
-
-void
+static void
 zvol_get_done(dmu_buf_t *db, void *vzgd)
 {
 	zgd_t *zgd = (zgd_t *)vzgd;
@@ -714,19 +691,21 @@
 /*
  * Get data to generate a TX_WRITE intent log record.
  */
-int
+static int
 zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 {
 	zvol_state_t *zv = arg;
 	objset_t *os = zv->zv_objset;
 	dmu_buf_t *db;
 	zgd_t *zgd;
-	int dlen = lr->lr_length;  		/* length of user data */
 	int error;
 
 	ASSERT(zio);
-	ASSERT(dlen != 0);
-	ASSERT(buf == NULL);
+	ASSERT(lr->lr_length != 0);
+
+	if (buf != NULL)
+		return (dmu_read(os, ZVOL_OBJ,
+		    lr->lr_offset, lr->lr_length, buf));
 
 	zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_zilog = zv->zv_zilog;
@@ -742,10 +721,9 @@
 	error = dmu_sync(zio, db, &lr->lr_blkptr,
 	    lr->lr_common.lrc_txg, zvol_get_done, zgd);
 	rw_exit(&zv->zv_dslock);
-	if (error == 0) {
+	if (error == 0)
 		zil_add_vdev(zv->zv_zilog,
 		    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
-	}
 	/*
 	 * If we get EINPROGRESS, then we need to wait for a
 	 * write IO initiated by dmu_sync() to complete before
@@ -767,42 +745,27 @@
  */
 ssize_t zvol_immediate_write_sz = 32768;
 
-void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len,
-    char *addr)
+static void
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
 {
-	ssize_t nbytes;
-	itx_t *itx;
+	uint32_t blocksize = zv->zv_volblocksize;
 	lr_write_t *lr;
-	zilog_t *zilog = zv->zv_zilog;
-	uint64_t boff;
-	uint32_t blocksize;
-
-	/* handle common case */
-	if (len <= zvol_immediate_write_sz) {
-		itx = zvol_immediate_itx(off, len, addr);
-		(void) zil_itx_assign(zilog, itx, tx);
-	}
-
-	blocksize = zv->zv_volblocksize;
 
 	while (len) {
-		nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
-		if (nbytes <= zvol_immediate_write_sz) {
-			itx = zvol_immediate_itx(off, nbytes, addr);
-		} else {
-			boff = P2ALIGN_TYPED(off, blocksize, uint64_t);
-			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
-			itx->itx_wr_state = WR_INDIRECT;
-			itx->itx_private = zv;
-			lr = (lr_write_t *)&itx->itx_lr;
-			lr->lr_foid = ZVOL_OBJ;
-			lr->lr_offset = off;
-			lr->lr_length = nbytes;
-			lr->lr_blkoff = off - boff;
-			BP_ZERO(&lr->lr_blkptr);
-		}
-		(void) zil_itx_assign(zilog, itx, tx);
+		ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
+		itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+
+		itx->itx_wr_state =
+		    len > zvol_immediate_write_sz ?  WR_INDIRECT : WR_NEED_COPY;
+		itx->itx_private = zv;
+		lr = (lr_write_t *)&itx->itx_lr;
+		lr->lr_foid = ZVOL_OBJ;
+		lr->lr_offset = off;
+		lr->lr_length = nbytes;
+		lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
+		BP_ZERO(&lr->lr_blkptr);
+
+		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
 		len -= nbytes;
 		off += nbytes;
 	}
@@ -878,9 +841,7 @@
 				dmu_tx_abort(tx);
 			} else {
 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
-				/* add a log write transaction */
-				if (sync)
-					zvol_log_write(zv, tx, off, size, addr);
+				zvol_log_write(zv, tx, off, size);
 				dmu_tx_commit(tx);
 			}
 		}
@@ -895,10 +856,10 @@
 	if ((bp->b_resid = resid) == bp->b_bcount)
 		bioerror(bp, off > volsize ? EINVAL : error);
 
-	biodone(bp);
+	if (sync)
+		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
 
-	if (sync)
-		zil_commit(zv->zv_zilog, UINT64_MAX, 0);
+	biodone(bp);
 
 	return (0);
 }
@@ -920,32 +881,48 @@
 
 /*ARGSUSED*/
 int
-zvol_read(dev_t dev, uio_t *uiop, cred_t *cr)
+zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
 {
-	return (physio(zvol_strategy, NULL, dev, B_READ, zvol_minphys, uiop));
+	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(dev));
+	int error = 0;
+
+	while (uio->uio_resid > 0) {
+		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+
+		error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
+		if (error)
+			break;
+	}
+	return (error);
 }
 
 /*ARGSUSED*/
 int
-zvol_write(dev_t dev, uio_t *uiop, cred_t *cr)
-{
-	return (physio(zvol_strategy, NULL, dev, B_WRITE, zvol_minphys, uiop));
-}
-
-/*ARGSUSED*/
-int
-zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr)
+zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
 {
-	return (aphysio(zvol_strategy, anocancel, dev, B_READ, zvol_minphys,
-	    aio));
-}
+	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(dev));
+	int error = 0;
+
+	while (uio->uio_resid > 0) {
+		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+		uint64_t off = uio->uio_loffset;
 
-/*ARGSUSED*/
-int
-zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr)
-{
-	return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, zvol_minphys,
-	    aio));
+		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			break;
+		}
+		error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx);
+		if (error == 0)
+			zvol_log_write(zv, tx, off, bytes);
+		dmu_tx_commit(tx);
+
+		if (error)
+			break;
+	}
+	return (error);
 }
 
 /*
@@ -1068,6 +1045,10 @@
 		}
 		return (error);
 
+	case DKIOCFLUSHWRITECACHE:
+		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
+		break;
+
 	case DKIOCGGEOM:
 	case DKIOCGVTOC:
 		/* commands using these (like prtvtoc) expect ENOTSUP */
--- a/usr/src/uts/common/sys/fs/snode.h	Mon Feb 12 15:22:44 2007 -0800
+++ b/usr/src/uts/common/sys/fs/snode.h	Mon Feb 12 17:35:21 2007 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,11 +19,11 @@
  * CDDL HEADER END
  */
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
-/*	  All Rights Reserved  	*/
+/*	  All Rights Reserved	*/
 
 
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -118,6 +117,7 @@
 #define	SMUXED		0x1000		/* this snode is a stream that has */
 					/* been multiplexed */
 #define	SSELFCLONE	0x2000		/* represents a self cloning device */
+#define	SNOFLUSH	0x4000		/* do not flush device on fsync */
 
 #ifdef _KERNEL
 /*