Mercurial > illumos > illumos-gate
changeset 3638:6b28ebc717aa
6496357 spec_fsync() is useless on devices that do write caching
6496359 zfs_read does unnecessary work on mmaped files with cached data
6496341 zvol performance can be improved
6496344 zvol async routines are synchronous
6496346 zfs_write with multiple iovs logs wrong data
6496347 zfs_write can be made much simpler thanks to range locks
6496350 zvol needs to log all writes to honor fsync
6496356 zvol should behave like a write-caching device
6489169 zvol: Incorrect ordering of biodone() and ensuring data is stable
author | billm |
---|---|
date | Mon, 12 Feb 2007 17:35:21 -0800 |
parents | 526d8412c163 |
children | 77dd70fd4e7b |
files | usr/src/uts/common/fs/specfs/specvnops.c usr/src/uts/common/fs/zfs/dmu.c usr/src/uts/common/fs/zfs/sys/dmu.h usr/src/uts/common/fs/zfs/sys/zfs_znode.h usr/src/uts/common/fs/zfs/zfs_ioctl.c usr/src/uts/common/fs/zfs/zfs_log.c usr/src/uts/common/fs/zfs/zfs_vnops.c usr/src/uts/common/fs/zfs/zvol.c usr/src/uts/common/sys/fs/snode.h |
diffstat | 9 files changed, 286 insertions(+), 315 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/common/fs/specfs/specvnops.c Mon Feb 12 15:22:44 2007 -0800 +++ b/usr/src/uts/common/fs/specfs/specvnops.c Mon Feb 12 17:35:21 2007 -0800 @@ -50,6 +50,7 @@ #include <sys/conf.h> #include <sys/ddi.h> #include <sys/debug.h> +#include <sys/dkio.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/fcntl.h> @@ -1294,6 +1295,23 @@ (void) VOP_PUTPAGE(cvp, (offset_t)0, 0, 0, cr); /* + * For devices that support it, force write cache to stable storage. + * We don't need the lock to check s_flags since we can treat + * SNOFLUSH as a hint. + */ + if ((vp->v_type == VBLK || vp->v_type == VCHR) && + !(sp->s_flag & SNOFLUSH)) { + int rval, rc; + rc = cdev_ioctl(vp->v_rdev, DKIOCFLUSHWRITECACHE, + NULL, FNATIVE|FKIOCTL, cr, &rval); + if (rc == ENOTSUP || rc == ENOTTY) { + mutex_enter(&sp->s_lock); + sp->s_flag |= SNOFLUSH; + mutex_exit(&sp->s_lock); + } + } + + /* * If no real vnode to update, don't flush anything. */ if (realvp == NULL)
--- a/usr/src/uts/common/fs/zfs/dmu.c Mon Feb 12 15:22:44 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/dmu.c Mon Feb 12 17:35:21 2007 -0800 @@ -360,13 +360,15 @@ dmu_buf_t **dbp; int numbufs, i, err; - /* - * Deal with odd block sizes, where there can't be data past the - * first block. - */ err = dnode_hold(os->os, object, FTAG, &dn); if (err) return (err); + + /* + * Deal with odd block sizes, where there can't be data past the first + * block. If we ever do the tail block optimization, we will need to + * handle that here as well. + */ if (dn->dn_datablkshift == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); @@ -453,8 +455,45 @@ #ifdef _KERNEL int -dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - uio_t *uio, dmu_tx_t *tx) +dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) +{ + dmu_buf_t **dbp; + int numbufs, i, err; + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. + */ + err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, + &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio->uio_loffset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); + if (err) + break; + + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (err); +} + +int +dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, + dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; @@ -463,7 +502,7 @@ if (size == 0) return (0); - err = dmu_buf_hold_array(os, object, offset, size, + err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); @@ -475,7 +514,7 @@ ASSERT(size > 0); - bufoff = offset - db->db_offset; + bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); @@ -500,7 +539,6 @@ if (err) break; - offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Mon Feb 12 15:22:44 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Mon Feb 12 17:35:21 2007 -0800 @@ -424,8 +424,9 @@ void *buf); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); -int dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - struct uio *uio, dmu_tx_t *tx); +int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); +int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, + dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h Mon Feb 12 15:22:44 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h Mon Feb 12 17:35:21 2007 -0800 @@ -264,7 +264,7 @@ extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio); + znode_t *zp, offset_t off, ssize_t len, int ioflag); extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len); extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c Mon Feb 12 15:22:44 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c Mon Feb 12 17:35:21 2007 -0800 @@ -1617,8 +1617,8 @@ NULL, /* streamtab */ D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ CB_REV, /* version */ - zvol_aread, /* async read */ - zvol_awrite, /* async write */ + nodev, /* async read */ + nodev, /* async write */ }; static struct dev_ops zfs_dev_ops = {
--- a/usr/src/uts/common/fs/zfs/zfs_log.c Mon Feb 12 15:22:44 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/zfs_log.c Mon Feb 12 17:35:21 2007 -0800 @@ -208,13 +208,12 @@ void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio) + znode_t *zp, offset_t off, ssize_t len, int ioflag) { itx_t *itx; uint64_t seq; lr_write_t *lr; itx_wr_state_t write_state; - size_t dlen; int err; if (zilog == NULL || zp->z_unlinked) @@ -237,34 +236,28 @@ * flush the write later then a buffer is allocated and * we retrieve the data using the dmu. */ - if (len > zfs_immediate_write_sz) { - dlen = 0; + if (len > zfs_immediate_write_sz) write_state = WR_INDIRECT; - } else if (ioflag & FDSYNC) { - dlen = len; + else if (ioflag & FDSYNC) write_state = WR_COPIED; - } else { - dlen = 0; + else write_state = WR_NEED_COPY; - } - itx = zil_itx_create(txtype, sizeof (*lr) + dlen); + + itx = zil_itx_create(txtype, sizeof (*lr) + + (write_state == WR_COPIED ? len : 0)); + lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED) { - err = xcopyin(uio->uio_iov->iov_base - len, - (char *)itx + offsetof(itx_t, itx_lr) + sizeof (*lr), len); - /* - * xcopyin shouldn't error as we've already successfully - * copied it to a dmu buffer. However if it does we'll get - * the data from the dmu later. - */ + err = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1); if (err) { - kmem_free(itx, offsetof(itx_t, itx_lr) - + itx->itx_lr.lrc_reclen); + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; } } + itx->itx_wr_state = write_state; - lr = (lr_write_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; lr->lr_offset = off; lr->lr_length = len;
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Mon Feb 12 15:22:44 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Mon Feb 12 17:35:21 2007 -0800 @@ -280,7 +280,7 @@ * the file is memory mapped. */ static int -mappedwrite(vnode_t *vp, uint64_t woff, int nbytes, uio_t *uio, dmu_tx_t *tx) +mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; @@ -293,6 +293,7 @@ for (start &= PAGEMASK; len > 0; start += PAGESIZE) { page_t *pp; uint64_t bytes = MIN(PAGESIZE - off, len); + uint64_t woff = uio->uio_loffset; /* * We don't want a new page to "appear" in the middle of @@ -315,11 +316,10 @@ page_unlock(pp); } else { error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - woff, bytes, uio, tx); + uio, bytes, tx); rw_exit(&zp->z_map_lock); } len -= bytes; - woff += bytes; off = 0; if (error) break; @@ -338,9 +338,11 @@ * the file is memory mapped. */ static int -mappedread(vnode_t *vp, char *addr, int nbytes, uio_t *uio) +mappedread(vnode_t *vp, int nbytes, uio_t *uio) { - int64_t start, off, bytes; + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + int64_t start, off; int len = nbytes; int error = 0; @@ -348,8 +350,8 @@ off = start & PAGEOFFSET; for (start &= PAGEMASK; len > 0; start += PAGESIZE) { page_t *pp; - - bytes = MIN(PAGESIZE - off, len); + uint64_t bytes = MIN(PAGESIZE - off, len); + if (pp = page_lookup(vp, start, SE_SHARED)) { caddr_t va; @@ -358,11 +360,9 @@ ppmapout(va); page_unlock(pp); } else { - /* XXX use dmu_read here? */ - error = uiomove(addr, bytes, UIO_READ, uio); + error = dmu_read_uio(os, zp->z_id, uio, bytes); } len -= bytes; - addr += bytes; off = 0; if (error) break; @@ -370,7 +370,7 @@ return (error); } -uint_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ +offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. @@ -395,10 +395,9 @@ { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t delta; - ssize_t n, size, cnt, ndone; - int error, i, numbufs; - dmu_buf_t *dbp, **dbpp; + objset_t *os = zfsvfs->z_os; + ssize_t n, nbytes; + int error; rl_t *rl; ZFS_ENTER(zfsvfs); @@ -446,58 +445,27 @@ * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_phys->zp_size) { - cnt = 0; error = 0; goto out; } - cnt = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); - - for (ndone = 0; ndone < cnt; ndone += zfs_read_chunk_size) { - ASSERT(uio->uio_loffset < zp->z_phys->zp_size); - n = MIN(zfs_read_chunk_size, - zp->z_phys->zp_size - uio->uio_loffset); - n = MIN(n, cnt); - error = dmu_buf_hold_array_by_bonus(zp->z_dbuf, - uio->uio_loffset, n, TRUE, FTAG, &numbufs, &dbpp); + ASSERT(uio->uio_loffset < zp->z_phys->zp_size); + n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); + + while (n > 0) { + nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); + + if (vn_has_cached_data(vp)) + error = mappedread(vp, nbytes, uio); + else + error = dmu_read_uio(os, zp->z_id, uio, nbytes); if (error) - goto out; - /* - * Compute the adjustment to align the dmu buffers - * with the uio buffer. - */ - delta = uio->uio_loffset - dbpp[0]->db_offset; - - for (i = 0; i < numbufs; i++) { - if (n < 0) - break; - dbp = dbpp[i]; - size = dbp->db_size - delta; - /* - * XXX -- this is correct, but may be suboptimal. - * If the pages are all clean, we don't need to - * go through mappedread(). Maybe the VMODSORT - * stuff can help us here. - */ - if (vn_has_cached_data(vp)) { - error = mappedread(vp, (caddr_t)dbp->db_data + - delta, (n < size ? n : size), uio); - } else { - error = uiomove((caddr_t)dbp->db_data + delta, - (n < size ? n : size), UIO_READ, uio); - } - if (error) { - dmu_buf_rele_array(dbpp, numbufs, FTAG); - goto out; - } - n -= dbp->db_size; - if (delta) { - n += delta; - delta = 0; - } - } - dmu_buf_rele_array(dbpp, numbufs, FTAG); + break; + + n -= nbytes; } + out: zfs_range_unlock(rl); @@ -660,8 +628,9 @@ } if (woff >= limit) { - error = EFBIG; - goto no_tx_done; + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (EFBIG); } if ((woff + n) > limit || woff > (limit - n)) @@ -671,114 +640,21 @@ * Check for mandatory locks */ if (MANDMODE((mode_t)zp->z_phys->zp_mode) && - (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) - goto no_tx_done; + (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (error); + } end_size = MAX(zp->z_phys->zp_size, woff + n); -top: - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); - error = dmu_tx_assign(tx, zfsvfs->z_assign); - if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - goto no_tx_done; - } /* - * If zfs_range_lock() over-locked we grow the blocksize - * and then reduce the lock range. - */ - if (rl->r_len == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); - zfs_range_reduce(rl, woff, n); - } - - /* - * The file data does not fit in the znode "cache", so we - * will be writing to the file block data buffers. - * Each buffer will be written in a separate transaction; - * this keeps the intent log records small and allows us - * to do more fine-grained space accounting. + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. */ while (n > 0) { /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - rw_enter(&zp->z_map_lock, RW_READER); - - tx_bytes = uio->uio_resid; - if (vn_has_cached_data(vp)) { - rw_exit(&zp->z_map_lock); - error = mappedwrite(vp, woff, nbytes, uio, tx); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - woff, nbytes, uio, tx); - rw_exit(&zp->z_map_lock); - } - tx_bytes -= uio->uio_resid; - - if (error) { - /* XXX - do we need to "clean up" the dmu buffer? */ - break; - } - - ASSERT(tx_bytes == nbytes); - - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the excute bits is set. - * - * It would be nice to to this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - */ - - mutex_enter(&zp->z_acl_lock); - if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, - (zp->z_phys->zp_mode & S_ISUID) != 0 && - zp->z_phys->zp_uid == 0) != 0) { - zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); - } - mutex_exit(&zp->z_acl_lock); - - n -= nbytes; - if (n <= 0) - break; - - /* - * We have more work ahead of us, so wrap up this transaction - * and start another. Exact same logic as tx_done below. - */ - while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) { - dmu_buf_will_dirty(zp->z_dbuf, tx); - (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, - uio->uio_loffset); - } - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, - ioflag, uio); - dmu_tx_commit(tx); - - /* - * Start another transaction. + * Start a transaction. */ woff = uio->uio_loffset; tx = dmu_tx_create(zfsvfs->z_os); @@ -790,33 +666,98 @@ zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); - goto top; + continue; } dmu_tx_abort(tx); - goto no_tx_done; + break; + } + + /* + * If zfs_range_lock() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since zfs_range_reduce() will + * shrink down r_len to the appropriate size. + */ + if (rl->r_len == UINT64_MAX) { + uint64_t new_blksz; + + if (zp->z_blksz > max_blksz) { + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); + } else { + new_blksz = MIN(end_size, max_blksz); + } + zfs_grow_blocksize(zp, new_blksz, tx); + zfs_range_reduce(rl, woff, n); + } + + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? + */ + nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + rw_enter(&zp->z_map_lock, RW_READER); + + tx_bytes = uio->uio_resid; + if (vn_has_cached_data(vp)) { + rw_exit(&zp->z_map_lock); + error = mappedwrite(vp, nbytes, uio, tx); + } else { + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, + uio, nbytes, tx); + rw_exit(&zp->z_map_lock); } - } - -tx_done: - - if (tx_bytes != 0) { + tx_bytes -= uio->uio_resid; + + /* + * If we made no progress, we're done. If we made even + * partial progress, update the znode and ZIL accordingly. + */ + if (tx_bytes == 0) { + ASSERT(error != 0); + break; + } + /* - * Update the file size if it has changed; account - * for possible concurrent updates. + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the excute bits is set. + * + * It would be nice to to this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. */ - while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) { - dmu_buf_will_dirty(zp->z_dbuf, tx); + mutex_enter(&zp->z_acl_lock); + if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + (zp->z_phys->zp_mode & S_ISUID) != 0 && + zp->z_phys->zp_uid == 0) != 0) { + zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); + } + mutex_exit(&zp->z_acl_lock); + + /* + * Update time stamp. NOTE: This marks the bonus buffer as + * dirty, so we don't have to do it again for zp_size. + */ + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, uio->uio_loffset); - } - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, - ioflag, uio); + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); + dmu_tx_commit(tx); + + if (error != 0) + break; + ASSERT(tx_bytes == nbytes); + n -= nbytes; } - dmu_tx_commit(tx); - - -no_tx_done: zfs_range_unlock(rl); @@ -863,7 +804,7 @@ dmu_buf_t *db; rl_t *rl; zgd_t *zgd; - int dlen = lr->lr_length; /* length of user data */ + int dlen = lr->lr_length; /* length of user data */ int error = 0; ASSERT(zio); @@ -2243,7 +2184,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; - znode_t *zp = tdzp; + znode_t *zp = tdzp; uint64_t rootid = zp->z_zfsvfs->z_root; uint64_t *oidp = &zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; @@ -2923,8 +2864,7 @@ if (err == 0) { zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - (void) zfs_log_write( - zilog, tx, TX_WRITE, zp, off, len, 0, NULL); + zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0); dmu_tx_commit(tx); }
--- a/usr/src/uts/common/fs/zfs/zvol.c Mon Feb 12 15:22:44 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/zvol.c Mon Feb 12 17:35:21 2007 -0800 @@ -42,7 +42,6 @@ #include <sys/types.h> #include <sys/param.h> #include <sys/errno.h> -#include <sys/aio_req.h> #include <sys/uio.h> #include <sys/buf.h> #include <sys/modctl.h> @@ -110,7 +109,7 @@ */ int zvol_maxphys = DMU_MAX_ACCESS/2; -int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); +static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); static void zvol_size_changed(zvol_state_t *zv, dev_t dev) @@ -679,29 +678,7 @@ return (0); } -/* - * Create and return an immediate write ZIL transaction. - */ -itx_t * -zvol_immediate_itx(offset_t off, ssize_t len, char *addr) -{ - itx_t *itx; - lr_write_t *lr; - - itx = zil_itx_create(TX_WRITE, sizeof (*lr) + len); - lr = (lr_write_t *)&itx->itx_lr; - lr->lr_foid = ZVOL_OBJ; - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_blkoff = 0; - BP_ZERO(&lr->lr_blkptr); - bcopy(addr, (char *)itx + offsetof(itx_t, itx_lr) + - sizeof (*lr), len); - itx->itx_wr_state = WR_COPIED; - return (itx); -} - -void +static void zvol_get_done(dmu_buf_t *db, void *vzgd) { zgd_t *zgd = (zgd_t *)vzgd; @@ -714,19 +691,21 @@ /* * Get data to generate a TX_WRITE intent log record. */ -int +static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zvol_state_t *zv = arg; objset_t *os = zv->zv_objset; dmu_buf_t *db; zgd_t *zgd; - int dlen = lr->lr_length; /* length of user data */ int error; ASSERT(zio); - ASSERT(dlen != 0); - ASSERT(buf == NULL); + ASSERT(lr->lr_length != 0); + + if (buf != NULL) + return (dmu_read(os, ZVOL_OBJ, + lr->lr_offset, lr->lr_length, buf)); zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zv->zv_zilog; @@ -742,10 +721,9 @@ error = dmu_sync(zio, db, &lr->lr_blkptr, lr->lr_common.lrc_txg, zvol_get_done, zgd); rw_exit(&zv->zv_dslock); - if (error == 0) { + if (error == 0) zil_add_vdev(zv->zv_zilog, DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); - } /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before @@ -767,42 +745,27 @@ */ ssize_t zvol_immediate_write_sz = 32768; -void -zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len, - char *addr) +static void +zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) { - ssize_t nbytes; - itx_t *itx; + uint32_t blocksize = zv->zv_volblocksize; lr_write_t *lr; - zilog_t *zilog = zv->zv_zilog; - uint64_t boff; - uint32_t blocksize; - - /* handle common case */ - if (len <= zvol_immediate_write_sz) { - itx = zvol_immediate_itx(off, len, addr); - (void) zil_itx_assign(zilog, itx, tx); - } - - blocksize = zv->zv_volblocksize; while (len) { - nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); - if (nbytes <= zvol_immediate_write_sz) { - itx = zvol_immediate_itx(off, nbytes, addr); - } else { - boff = P2ALIGN_TYPED(off, blocksize, uint64_t); - itx = zil_itx_create(TX_WRITE, sizeof (*lr)); - itx->itx_wr_state = WR_INDIRECT; - itx->itx_private = zv; - lr = (lr_write_t *)&itx->itx_lr; - lr->lr_foid = ZVOL_OBJ; - lr->lr_offset = off; - lr->lr_length = nbytes; - lr->lr_blkoff = off - boff; - BP_ZERO(&lr->lr_blkptr); - } - (void) zil_itx_assign(zilog, itx, tx); + ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); + itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + + itx->itx_wr_state = + len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY; + itx->itx_private = zv; + lr = (lr_write_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = nbytes; + lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); + BP_ZERO(&lr->lr_blkptr); + + (void) zil_itx_assign(zv->zv_zilog, itx, tx); len -= nbytes; off += nbytes; } @@ -878,9 +841,7 @@ dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - /* add a log write transaction */ - if (sync) - zvol_log_write(zv, tx, off, size, addr); + zvol_log_write(zv, tx, off, size); dmu_tx_commit(tx); } } @@ -895,10 +856,10 @@ if ((bp->b_resid = resid) == bp->b_bcount) bioerror(bp, off > volsize ? EINVAL : error); - biodone(bp); + if (sync) + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); - if (sync) - zil_commit(zv->zv_zilog, UINT64_MAX, 0); + biodone(bp); return (0); } @@ -920,32 +881,48 @@ /*ARGSUSED*/ int -zvol_read(dev_t dev, uio_t *uiop, cred_t *cr) +zvol_read(dev_t dev, uio_t *uio, cred_t *cr) { - return (physio(zvol_strategy, NULL, dev, B_READ, zvol_minphys, uiop)); + zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(dev)); + int error = 0; + + while (uio->uio_resid > 0) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + + error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); + if (error) + break; + } + return (error); } /*ARGSUSED*/ int -zvol_write(dev_t dev, uio_t *uiop, cred_t *cr) -{ - return (physio(zvol_strategy, NULL, dev, B_WRITE, zvol_minphys, uiop)); -} - -/*ARGSUSED*/ -int -zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr) +zvol_write(dev_t dev, uio_t *uio, cred_t *cr) { - return (aphysio(zvol_strategy, anocancel, dev, B_READ, zvol_minphys, - aio)); -} + zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(dev)); + int error = 0; + + while (uio->uio_resid > 0) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + uint64_t off = uio->uio_loffset; -/*ARGSUSED*/ -int -zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr) -{ - return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, zvol_minphys, - aio)); + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx); + if (error == 0) + zvol_log_write(zv, tx, off, bytes); + dmu_tx_commit(tx); + + if (error) + break; + } + return (error); } /* @@ -1068,6 +1045,10 @@ } return (error); + case DKIOCFLUSHWRITECACHE: + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); + break; + case DKIOCGGEOM: case DKIOCGVTOC: /* commands using these (like prtvtoc) expect ENOTSUP */
--- a/usr/src/uts/common/sys/fs/snode.h Mon Feb 12 15:22:44 2007 -0800 +++ b/usr/src/uts/common/sys/fs/snode.h Mon Feb 12 17:35:21 2007 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,11 +19,11 @@ * CDDL HEADER END */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -118,6 +117,7 @@ #define SMUXED 0x1000 /* this snode is a stream that has */ /* been multiplexed */ #define SSELFCLONE 0x2000 /* represents a self cloning device */ +#define SNOFLUSH 0x4000 /* do not flush device on fsync */ #ifdef _KERNEL /*