Mercurial > illumos > illumos-gate
changeset 6992:20c04e18c58c
6573681 deleting a very large file can be slow
6706950 ((&dnp->dn_blkptr[0])->blk_birth == 0) || list_head(list) != 0L || dn->dn_next_blksz[txgoff]
line wrap: on
line diff
--- a/usr/src/cmd/zdb/zdb.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/cmd/zdb/zdb.c Tue Jul 01 12:01:12 2008 -0700 @@ -1093,13 +1093,13 @@ } for (;;) { - error = dnode_next_offset(dn, B_FALSE, &start, minlvl, - blkfill, 0); + error = dnode_next_offset(dn, + 0, &start, minlvl, blkfill, 0); if (error) break; end = start; - error = dnode_next_offset(dn, B_TRUE, &end, minlvl, - blkfill, 0); + error = dnode_next_offset(dn, + DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); nicenum(end - start, segsize); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start,
--- a/usr/src/uts/common/fs/zfs/dbuf.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/dbuf.c Tue Jul 01 12:01:12 2008 -0700 @@ -705,22 +705,50 @@ arc_release(dr->dt.dl.dr_data, db); } +/* + * Evict (if it's unreferenced) or clear (if it's referenced) any level-0 + * data blocks in the free range, so that any future readers will find + * empty blocks. Also, if we happen across any level-1 dbufs in the + * range that have not already been marked dirty, mark them dirty so + * they stay in memory. + */ void -dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t first_l1 = start >> epbs; + uint64_t last_l1 = end >> epbs; - dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); + if (end > dn->dn_maxblkid) { + end = dn->dn_maxblkid; + last_l1 = end >> epbs; + } + dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DB_BONUS_BLKID); + + if (db->db_level == 1 && + db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { + mutex_enter(&db->db_mtx); + if (db->db_last_dirty && + db->db_last_dirty->dr_txg < txg) { + dbuf_add_ref(db, FTAG); + mutex_exit(&db->db_mtx); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } else { + mutex_exit(&db->db_mtx); + } + } + if (db->db_level != 0) continue; dprintf_dbuf(db, "found buf %s\n", ""); - if (db->db_blkid < blkid || - db->db_blkid >= blkid+nblks) + if (db->db_blkid < start || db->db_blkid > end) continue; /* found a level 0 buffer in the range */ @@ -1161,7 +1189,7 @@ list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_level+1 == dn->dn_nlevels) { - ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+ ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); @@ -1976,7 +2004,7 @@ mutex_exit(&db->db_mtx); if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) - dsl_dataset_block_kill(os->os_dsl_dataset, + (void) dsl_dataset_block_kill(os->os_dsl_dataset, &zio_fake.io_bp_orig, dn->dn_zio, tx); dbuf_write_ready(&zio_fake, db->db_buf, db); @@ -2105,7 +2133,7 @@ if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0) zio_flags |= ZIO_FLAG_METADATA; if (BP_IS_OLDER(db->db_blkptr, txg)) - dsl_dataset_block_kill( + (void) dsl_dataset_block_kill( os->os_dsl_dataset, db->db_blkptr, zio, tx); dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress, @@ -2137,7 +2165,7 @@ dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); + (void) dsl_dataset_block_kill(ds, bp_orig, NULL, tx); ASSERT3U(db->db_blkptr->blk_fill, ==, 0); return; } @@ -2185,7 +2213,7 @@ dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); + (void) dsl_dataset_block_kill(ds, bp_orig, NULL, tx); dsl_dataset_block_born(ds, zio->io_bp, tx); } }
--- a/usr/src/uts/common/fs/zfs/dmu.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu.c Tue Jul 01 12:01:12 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -364,6 +364,152 @@ dnode_rele(dn, FTAG); } +static int +get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) +{ + uint64_t len = limit - *offset; + uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT; + uint64_t dn_used; + int err; + + ASSERT(limit <= *offset); + + dn_used = dn->dn_phys->dn_used << + (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES ? 0 : DEV_BSHIFT); + if (len <= chunk_len || dn_used <= chunk_len) { + *offset = limit; + return (0); + } + + while (*offset > limit) { + uint64_t initial_offset = *offset; + uint64_t delta; + + /* skip over allocated data */ + err = dnode_next_offset(dn, + DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); + if (err == ESRCH) + *offset = limit; + else if (err) + return (err); + + ASSERT3U(*offset, <=, initial_offset); + delta = initial_offset - *offset; + if (delta >= chunk_len) { + *offset += delta - chunk_len; + return (0); + } + chunk_len -= delta; + + /* skip over unallocated data */ + err = dnode_next_offset(dn, + DNODE_FIND_BACKWARDS, offset, 1, 1, 0); + if (err == ESRCH) + *offset = limit; + else if (err) + return (err); + + if (*offset < limit) + *offset = limit; + ASSERT3U(*offset, <, initial_offset); + } + return (0); +} + +static int +dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, + uint64_t length, boolean_t free_dnode) +{ + dmu_tx_t *tx; + uint64_t object_size, start, end, len; + boolean_t trunc = (length == DMU_OBJECT_END); + int align, err; + + align = 1 << dn->dn_datablkshift; + ASSERT(align > 0); + object_size = align == 1 ? 
dn->dn_datablksz : + (dn->dn_maxblkid + 1) << dn->dn_datablkshift; + + if (trunc || (end = offset + length) > object_size) + end = object_size; + if (end <= offset) + return (0); + length = end - offset; + + while (length) { + start = end; + err = get_next_chunk(dn, &start, offset); + if (err) + return (err); + len = trunc ? DMU_OBJECT_END : end - start; + + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, dn->dn_object, start, len); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + dnode_free_range(dn, start, trunc ? -1 : len, tx); + + if (start == 0 && trunc && free_dnode) + dnode_free(dn, tx); + + length -= end - start; + + dmu_tx_commit(tx); + end = start; + trunc = FALSE; + } + return (0); +} + +int +dmu_free_long_range(objset_t *os, uint64_t object, + uint64_t offset, uint64_t length) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err != 0) + return (err); + err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_free_object(objset_t *os, uint64_t object) +{ + dnode_t *dn; + dmu_tx_t *tx; + int err; + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err != 0) + return (err); + if (dn->dn_nlevels == 1) { + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); + dnode_free(dn, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } else { + err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); + } + dnode_rele(dn, FTAG); + return (err); +} + int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) @@ -912,7 +1058,7 @@ return (err); } - err = dnode_next_offset(dn, hole, off, 1, 1, 0); + err = dnode_next_offset(dn, (hole ? 
DNODE_FIND_HOLE : 0), off, 1, 1, 0); dnode_rele(dn, FTAG); return (err);
--- a/usr/src/uts/common/fs/zfs/dmu_object.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_object.c Tue Jul 01 12:01:12 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,7 +54,8 @@ if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? object << DNODE_SHIFT : 0; int error = dnode_next_offset(osi->os_meta_dnode, - B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); + DNODE_FIND_HOLE, + &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; if (error == 0) object = offset >> DNODE_SHIFT; @@ -139,6 +140,7 @@ return (err); ASSERT(dn->dn_type != DMU_OT_NONE); + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); @@ -152,7 +154,7 @@ int error; error = dnode_next_offset(os->os->os_meta_dnode, - hole, &offset, 0, DNODES_PER_BLOCK, txg); + (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT;
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Jul 01 12:01:12 2008 -0700 @@ -829,7 +829,7 @@ if (!DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(&zio->io_bp_orig))) { if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - dsl_dataset_block_kill(os->os_dsl_dataset, + (void) dsl_dataset_block_kill(os->os_dsl_dataset, &zio->io_bp_orig, NULL, os->os_synctx); dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); } @@ -878,7 +878,7 @@ zb.zb_level = -1; zb.zb_blkid = 0; if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) { - dsl_dataset_block_kill(os->os_dsl_dataset, + (void) dsl_dataset_block_kill(os->os_dsl_dataset, os->os_rootbp, pio, tx); } zio = arc_write(pio, os->os_spa, os->os_md_checksum,
--- a/usr/src/uts/common/fs/zfs/dmu_send.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_send.c Tue Jul 01 12:01:12 2008 -0700 @@ -877,23 +877,14 @@ for (obj = drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; (void) dmu_object_next(os, &obj, FALSE, 0)) { - dmu_tx_t *tx; int err; if (dmu_object_info(os, obj, NULL) != 0) continue; - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); + err = dmu_free_object(os, obj); + if (err) return (err); - } - err = dmu_object_free(os, obj, tx); - dmu_tx_commit(tx); - if (err && err != ENOENT) - return (EINVAL); } return (0); } @@ -939,7 +930,6 @@ restore_free(struct restorearg *ra, objset_t *os, struct drr_free *drrf) { - dmu_tx_t *tx; int err; if (drrf->drr_length != -1ULL && @@ -949,18 +939,8 @@ if (dmu_object_info(os, drrf->drr_object, NULL) != 0) return (EINVAL); - tx = dmu_tx_create(os); - - dmu_tx_hold_free(tx, drrf->drr_object, + err = dmu_free_long_range(os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_free_range(os, drrf->drr_object, - drrf->drr_offset, drrf->drr_length, tx); - dmu_tx_commit(tx); return (err); }
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c Tue Jul 01 12:01:12 2008 -0700 @@ -320,39 +320,25 @@ static void dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { - uint64_t blkid, nblks; - uint64_t space = 0, unref = 0; + uint64_t blkid, nblks, lastblk; + uint64_t space = 0, unref = 0, skipped = 0; dnode_t *dn = txh->txh_dnode; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; spa_t *spa = txh->txh_tx->tx_pool->dp_spa; - int dirty; + int epbs; - /* - * We don't need to use any locking to check for dirtyness - * because it's OK if we get stale data -- the dnode may become - * dirty immediately after our check anyway. This is just a - * means to avoid the expensive count when we aren't sure we - * need it. We need to be able to deal with a dirty dnode. - */ - dirty = list_link_active(&dn->dn_dirty_link[0]) | - list_link_active(&dn->dn_dirty_link[1]) | - list_link_active(&dn->dn_dirty_link[2]) | - list_link_active(&dn->dn_dirty_link[3]); - if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0) + if (dn->dn_nlevels == 0) return; /* - * the struct_rwlock protects us against dn_phys->dn_nlevels + * The struct_rwlock protects us against dn_nlevels * changing, in case (against all odds) we manage to dirty & * sync out the changes after we check for being dirty. - * also, dbuf_hold_impl() wants us to have the struct_rwlock. - * - * It's fine to use dn_datablkshift rather than the dn_phys - * equivalent because if it is changing, maxblkid==0 and we will - * bail. + * Also, dbuf_hold_level() wants us to have the struct_rwlock. 
*/ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_phys->dn_maxblkid == 0) { + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_maxblkid == 0) { if (off == 0 && len >= dn->dn_datablksz) { blkid = 0; nblks = 1; @@ -362,24 +348,21 @@ } } else { blkid = off >> dn->dn_datablkshift; - nblks = (off + len) >> dn->dn_datablkshift; + nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; - if (blkid >= dn->dn_phys->dn_maxblkid) { + if (blkid >= dn->dn_maxblkid) { rw_exit(&dn->dn_struct_rwlock); return; } - if (blkid + nblks > dn->dn_phys->dn_maxblkid) - nblks = dn->dn_phys->dn_maxblkid - blkid; + if (blkid + nblks > dn->dn_maxblkid) + nblks = dn->dn_maxblkid - blkid; - /* don't bother after 128,000 blocks */ - nblks = MIN(nblks, 128*1024); } - - if (dn->dn_phys->dn_nlevels == 1) { + if (dn->dn_nlevels == 1) { int i; for (i = 0; i < nblks; i++) { blkptr_t *bp = dn->dn_phys->dn_blkptr; - ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); + ASSERT3U(blkid + i, <, dn->dn_nblkptr); bp += blkid + i; if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); @@ -390,51 +373,86 @@ nblks = 0; } + /* + * Add in memory requirements of higher-level indirects + */ + if (nblks && dn->dn_nlevels > 2) { + uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); + int level = 2; + + while (level++ < dn->dn_nlevels) { + txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; + blkcnt = 1 + (blkcnt >> epbs); + } + ASSERT(blkcnt <= dn->dn_nblkptr); + } + + lastblk = blkid + nblks - 1; while (nblks) { dmu_buf_impl_t *dbuf; - int err, epbs, blkoff, tochk; - - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - blkoff = P2PHASE(blkid, 1<<epbs); - tochk = MIN((1<<epbs) - blkoff, nblks); - - err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf); - if (err == 0) { - int i; - blkptr_t *bp; + uint64_t ibyte, new_blkid; + int epb = 1 << epbs; + int err, i, blkoff, tochk; + blkptr_t *bp; - err = dbuf_read(dbuf, NULL, - DB_RF_HAVESTRUCT | DB_RF_CANFAIL); - 
if (err != 0) { - txh->txh_tx->tx_err = err; - dbuf_rele(dbuf, FTAG); - break; - } - - bp = dbuf->db.db_data; - bp += blkoff; - - for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, - bp[i].blk_birth)) { - dprintf_bp(&bp[i], - "can free old%s", ""); - space += bp_get_dasize(spa, &bp[i]); - } - unref += BP_GET_ASIZE(bp); - } - dbuf_rele(dbuf, FTAG); - } - if (err && err != ENOENT) { + ibyte = blkid << dn->dn_datablkshift; + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); + new_blkid = ibyte >> dn->dn_datablkshift; + if (err == ESRCH) + break; + if (err) { txh->txh_tx->tx_err = err; break; } + if (new_blkid > lastblk) + break; + + if (new_blkid > blkid) { + skipped += new_blkid - blkid - 1; + nblks -= new_blkid - blkid; + blkid = new_blkid; + } + blkoff = P2PHASE(blkid, epb); + tochk = MIN(epb - blkoff, nblks); + + dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); + + txh->txh_memory_tohold += dbuf->db.db_size; + if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { + txh->txh_tx->tx_err = E2BIG; + dbuf_rele(dbuf, FTAG); + break; + } + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); + if (err != 0) { + txh->txh_tx->tx_err = err; + dbuf_rele(dbuf, FTAG); + break; + } + + bp = dbuf->db.db_data; + bp += blkoff; + + for (i = 0; i < tochk; i++) { + if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { + dprintf_bp(&bp[i], "can free old%s", ""); + space += bp_get_dasize(spa, &bp[i]); + } + unref += BP_GET_ASIZE(bp); + } + dbuf_rele(dbuf, FTAG); blkid += tochk; nblks -= tochk; } rw_exit(&dn->dn_struct_rwlock); + /* account for new level 1 indirect blocks that might show up */ + if (skipped) { + skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); + txh->txh_memory_tohold += skipped << dn->dn_indblkshift; + } txh->txh_space_tofree += space; txh->txh_space_tounref += unref; } @@ -471,7 +489,7 @@ /* * For i/o error checking, read the first and last level-0 * blocks, and all the level-1 blocks. 
The above count_write's - * will take care of the level-0 blocks. + * have already taken care of the level-0 blocks. */ if (dn->dn_nlevels > 1) { shift = dn->dn_datablkshift + dn->dn_indblkshift - @@ -483,7 +501,7 @@ NULL, NULL, ZIO_FLAG_CANFAIL); for (i = start; i <= end; i++) { uint64_t ibyte = i << shift; - err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0); + err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH) break; @@ -706,12 +724,13 @@ match_offset = TRUE; break; case THT_FREE: - if (blkid == beginblk && - (txh->txh_arg1 != 0 || - dn->dn_maxblkid == 0)) - match_offset = TRUE; - if (blkid == endblk && - txh->txh_arg2 != DMU_OBJECT_END) + /* + * We will dirty all the level 1 blocks in + * the free range and perhaps the first and + * last level 0 block. + */ + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_BONUS: @@ -742,8 +761,8 @@ { dmu_tx_hold_t *txh; spa_t *spa = tx->tx_pool->dp_spa; - uint64_t lsize, asize, fsize, usize; - uint64_t towrite, tofree, tooverwrite, tounref; + uint64_t memory, asize, fsize, usize; + uint64_t towrite, tofree, tooverwrite, tounref, tohold; ASSERT3U(tx->tx_txg, ==, 0); @@ -776,7 +795,7 @@ * dmu_tx_unassign() logic. */ - towrite = tofree = tooverwrite = tounref = 0; + towrite = tofree = tooverwrite = tounref = tohold = 0; for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -797,6 +816,7 @@ tofree += txh->txh_space_tofree; tooverwrite += txh->txh_space_tooverwrite; tounref += txh->txh_space_tounref; + tohold += txh->txh_memory_tohold; } /* @@ -817,24 +837,27 @@ tooverwrite = tofree = 0; } - /* - * Convert logical size to worst-case allocated size. 
- */ + /* needed allocation: worst-case estimate of write space */ + asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); + /* freed space estimate: worst-case overwrite + free estimate */ fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; - lsize = towrite + tooverwrite; - asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); + /* convert unrefd space to worst-case estimate */ usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); + /* calculate memory footprint estimate */ + memory = towrite + tooverwrite + tohold; #ifdef ZFS_DEBUG - tx->tx_space_towrite = asize; + /* add in 'tohold' to account for our dirty holds on this memory */ + tx->tx_space_towrite = asize + + spa_get_asize(tx->tx_pool->dp_spa, tohold); tx->tx_space_tofree = tofree; tx->tx_space_tooverwrite = tooverwrite; tx->tx_space_tounref = tounref; #endif if (tx->tx_dir && asize != 0) { - int err = dsl_dir_tempreserve_space(tx->tx_dir, - lsize, asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); + int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, + asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); if (err) return (err); }
--- a/usr/src/uts/common/fs/zfs/dnode.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/dnode.c Tue Jul 01 12:01:12 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -780,7 +780,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; - int have_db0 = FALSE; + int err; if (size == 0) size = SPA_MINBLOCKSIZE; @@ -805,9 +805,7 @@ for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); - if (db->db_blkid == 0) { - have_db0 = TRUE; - } else if (db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } @@ -817,12 +815,12 @@ if (ibs && dn->dn_nlevels != 1) goto fail; - db = NULL; - if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) { - /* obtain the old block */ - db = dbuf_hold(dn, 0, FTAG); + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + if (err == 0) dbuf_new_size(db, size, tx); - } + else if (err != ENOENT) + goto fail; dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); @@ -831,7 +829,7 @@ dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; } - + /* rele after we have fixed the blocksize in the dnode */ if (db) dbuf_rele(db, FTAG); @@ -969,15 +967,15 @@ { dmu_buf_impl_t *db; uint64_t blkoff, blkid, nblks; - int blksz, head; + int blksz, blkshift, head, tail; int trunc = FALSE; + int epbs; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; + blkshift = dn->dn_datablkshift; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - /* If the range is past the end of the file, this is a no-op */ - if (off >= blksz * (dn->dn_maxblkid+1)) - goto out; if (len == -1ULL) { len = UINT64_MAX - off; trunc = TRUE; @@ -989,11 +987,18 @@ if (ISP2(blksz)) { head 
= P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { - /* Freeing the whole block; don't do any head. */ - head = 0; + /* Freeing the whole block; fast-track this request */ + blkid = 0; + nblks = 1; + goto done; + } else if (off > blkid) { + /* Freeing past end-of-data */ + goto out; } else { /* Freeing part of the block. */ head = blksz - off; @@ -1026,88 +1031,85 @@ } /* If the range was less than one block, we're done */ - if (len == 0 || off >= blksz * (dn->dn_maxblkid+1)) + if (len == 0) + goto out; + + ASSERT(ISP2(blksz)); + /* If the remaining range is past end of file, we're done */ + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; + + if (trunc) + tail = 0; + else + tail = P2PHASE(len, blksz); + + ASSERT3U(P2PHASE(off, blksz), ==, 0); + /* zero out any partial block data at the end of the range */ + if (tail) { + if (len < tail) + tail = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), + TRUE, FTAG, &db) == 0) { + /* don't dirty if not on disk and not dirty */ + if (db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dbuf_will_dirty(db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + bzero(db->db.db_data, tail); + } + dbuf_rele(db, FTAG); + } + len -= tail; + } + + /* If the range did not include a full block, we are done */ + if (len == 0) goto out; - if (!ISP2(blksz)) { - /* - * They are freeing the whole block of a - * non-power-of-two blocksize file. Skip all the messy - * math. 
- */ - ASSERT3U(off, ==, 0); - ASSERT3U(len, >=, blksz); - blkid = 0; - nblks = 1; - } else { - int tail; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - int blkshift = dn->dn_datablkshift; + ASSERT(IS_P2ALIGNED(off, blksz)); + ASSERT(trunc || IS_P2ALIGNED(len, blksz)); + blkid = off >> blkshift; + nblks = len >> blkshift; + if (trunc) + nblks += 1; - /* If the remaining range is past end of file, we're done */ - if (off > dn->dn_maxblkid << blkshift) - goto out; + /* + * Read in and mark all the level-1 indirects dirty, + * so that they will stay in memory until syncing phase. + */ + if (dn->dn_nlevels > 1) { + uint64_t i, first, last; + int shift = epbs + dn->dn_datablkshift; - if (off + len == UINT64_MAX) - tail = 0; + first = blkid >> epbs; + if (trunc) + last = dn->dn_maxblkid >> epbs; else - tail = P2PHASE(len, blksz); + last = (blkid + nblks - 1) >> epbs; + for (i = first; i <= last; i++) { + uint64_t ibyte = i << shift; + int err; - ASSERT3U(P2PHASE(off, blksz), ==, 0); - /* zero out any partial block data at the end of the range */ - if (tail) { - if (len < tail) - tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { - /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || - (db->db_blkptr && - !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); - rw_enter(&dn->dn_struct_rwlock, - RW_WRITER); - bzero(db->db.db_data, tail); - } + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0); + i = ibyte >> shift; + if (err == ESRCH || i > last) + break; + ASSERT(err == 0); + db = dbuf_hold_level(dn, 1, i, FTAG); + if (db) { + dbuf_will_dirty(db, tx); dbuf_rele(db, FTAG); } - len -= tail; } - /* If the range did not include a full block, we are done */ - if (len == 0) - goto out; - - /* dirty the left indirects */ - if (dn->dn_nlevels > 1 && off != 0) { - db = dbuf_hold_level(dn, 1, - (off - head) >> (blkshift + epbs), FTAG); - 
dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } - - /* dirty the right indirects */ - if (dn->dn_nlevels > 1 && !trunc) { - db = dbuf_hold_level(dn, 1, - (off + len + tail - 1) >> (blkshift + epbs), FTAG); - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } - - /* - * Finally, add this range to the dnode range list, we - * will finish up this free operation in the syncing phase. - */ - ASSERT(IS_P2ALIGNED(off, 1<<blkshift)); - ASSERT(off + len == UINT64_MAX || - IS_P2ALIGNED(len, 1<<blkshift)); - blkid = off >> blkshift; - nblks = len >> blkshift; - - if (trunc) - dn->dn_maxblkid = (blkid ? blkid - 1 : 0); } - +done: + /* + * Add this range to the dnode range list. + * We will finish up this free operation in the syncing phase. + */ mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, blkid, nblks, tx); { @@ -1127,9 +1129,12 @@ } mutex_exit(&dn->dn_mtx); - dbuf_free_range(dn, blkid, nblks, tx); + dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: + if (trunc && dn->dn_maxblkid >= (off >> blkshift)) + dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0); + rw_exit(&dn->dn_struct_rwlock); } @@ -1229,7 +1234,7 @@ } static int -dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, +dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; @@ -1237,11 +1242,15 @@ uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t epb = 1ULL << epbs; uint64_t minfill, maxfill; - int i, error, span; + boolean_t hole; + int i, inc, error, span; dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); + hole = flags & DNODE_FIND_HOLE; + inc = (flags & DNODE_FIND_BACKWARDS) ? 
-1 : 1; + if (lvl == dn->dn_phys->dn_nlevels) { error = 0; epb = dn->dn_phys->dn_nblkptr; @@ -1270,7 +1279,8 @@ span = DNODE_SHIFT; ASSERT(dn->dn_type == DMU_OT_DNODE); - for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) { + for (i = (*offset >> span) & (blkfill - 1); + i >= 0 && i < blkfill; i += inc) { boolean_t newcontents = B_TRUE; if (txg) { int j; @@ -1282,9 +1292,9 @@ } if (!dnp[i].dn_type == hole && newcontents) break; - *offset += 1ULL << span; + *offset += (1ULL << span) * inc; } - if (i == blkfill) + if (i < 0 || i == blkfill) error = ESRCH; } else { blkptr_t *bp = data; @@ -1298,14 +1308,14 @@ minfill++; for (i = (*offset >> span) & ((1ULL << epbs) - 1); - i < epb; i++) { + i >= 0 && i < epb; i += inc) { if (bp[i].blk_fill >= minfill && bp[i].blk_fill <= maxfill && bp[i].blk_birth > txg) break; - *offset += 1ULL << span; + *offset += (1ULL << span) * inc; } - if (i >= epb) + if (i < 0 || i == epb) error = ESRCH; } @@ -1324,64 +1334,66 @@ * * Examples: * - * dnode_next_offset(dn, hole, offset, 1, 1, 0); - * Finds the next hole/data in a file. + * dnode_next_offset(dn, flags, offset, 1, 1, 0); + * Finds the next/previous hole/data in a file. * Used in dmu_offset_next(). * - * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg); + * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); * Finds the next free/allocated dnode an objset's meta-dnode. * Only finds objects that have new contents since txg (ie. * bonus buffer changes and content removal are ignored). * Used in dmu_object_next(). * - * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0); + * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); * Finds the next L2 meta-dnode bp that's at most 1/4 full. * Used in dmu_object_alloc(). 
*/ int -dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset, +dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { + uint64_t initial_offset = *offset; int lvl, maxlvl; int error = 0; - uint64_t initial_offset = *offset; - rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { - rw_exit(&dn->dn_struct_rwlock); - return (ESRCH); + error = ESRCH; + goto out; } if (dn->dn_datablkshift == 0) { if (*offset < dn->dn_datablksz) { - if (hole) + if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { error = ESRCH; } - rw_exit(&dn->dn_struct_rwlock); - return (error); + goto out; } maxlvl = dn->dn_phys->dn_nlevels; for (lvl = minlvl; lvl <= maxlvl; lvl++) { error = dnode_next_offset_level(dn, - hole, offset, lvl, blkfill, txg); + flags, offset, lvl, blkfill, txg); if (error != ESRCH) break; } - while (--lvl >= minlvl && error == 0) { + while (error == 0 && --lvl >= minlvl) { error = dnode_next_offset_level(dn, - hole, offset, lvl, blkfill, txg); + flags, offset, lvl, blkfill, txg); } - rw_exit(&dn->dn_struct_rwlock); - - if (error == 0 && initial_offset > *offset) + if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? + initial_offset < *offset : initial_offset > *offset)) error = ESRCH; +out: + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_exit(&dn->dn_struct_rwlock); return (error); }
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Tue Jul 01 12:01:12 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -109,25 +109,26 @@ rw_exit(&dn->dn_struct_rwlock); } -static void +static int free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) { - objset_impl_t *os = dn->dn_objset; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; - int i; + int i, blocks_freed = 0; - dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num); + dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); for (i = 0; i < num; i++, bp++) { if (BP_IS_HOLE(bp)) continue; - bytesfreed += bp_get_dasize(os->os_spa, bp); + bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); - dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx); bzero(bp, sizeof (blkptr_t)); + blocks_freed += 1; } dnode_diduse_space(dn, -bytesfreed); + return (blocks_freed); } #ifdef ZFS_DEBUG @@ -205,6 +206,8 @@ } #endif +#define ALL -1 + static int free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, dmu_tx_t *tx) @@ -215,8 +218,18 @@ uint64_t start, end, dbstart, dbend, i; int epbs, shift, err; int all = TRUE; + int blocks_freed = 0; - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + /* + * There is a small possibility that this block will not be cached: + * 1 - if level > 1 and there are no children with level <= 1 + * 2 - if we didn't get a dirty hold (because this block had just + * finished being written -- and so had no holds), and then this + * block got evicted before we got here. 
+ */ + if (db->db_state != DB_CACHED) + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + arc_release(db->db_buf, db); bp = (blkptr_t *)db->db.db_data; @@ -240,10 +253,10 @@ if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); + blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); - ASSERT(all || db->db_last_dirty); - return (all); + ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + return (all ? ALL : blocks_freed); } for (i = start; i <= end; i++, bp++) { @@ -254,9 +267,9 @@ ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); - if (free_children(subdb, blkid, nblks, trunc, tx)) { + if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) { ASSERT3P(subdb->db_blkptr, ==, bp); - free_blocks(dn, bp, 1, tx); + blocks_freed += free_blocks(dn, bp, 1, tx); } else { all = FALSE; } @@ -273,8 +286,8 @@ ASSERT3U(bp->blk_birth, ==, 0); } #endif - ASSERT(all || db->db_last_dirty); - return (all); + ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + return (all ? ALL : blocks_freed); } /* @@ -304,15 +317,14 @@ return; } ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); - free_blocks(dn, bp + blkid, nblks, tx); + (void) free_blocks(dn, bp + blkid, nblks, tx); if (trunc) { uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, - 1, 1, 0) != 0); + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } return; } @@ -330,9 +342,9 @@ ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); - if (free_children(db, blkid, nblks, trunc, tx)) { + if (free_children(db, blkid, nblks, trunc, tx) == ALL) { ASSERT3P(db->db_blkptr, ==, bp); - free_blocks(dn, bp, 1, tx); + (void) free_blocks(dn, bp, 1, tx); } dbuf_rele(db, FTAG); } @@ -342,7 +354,7 @@ dn->dn_phys->dn_maxblkid = (blkid ? 
blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0); + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } } @@ -442,6 +454,13 @@ ASSERT(dmu_tx_is_syncing(tx)); + /* + * Our contents should have been freed in dnode_sync() by the + * free range record inserted by the caller of dnode_free(). + */ + ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); + ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); + dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); @@ -461,10 +480,6 @@ dn->dn_next_indblkshift[txgoff] = 0; dn->dn_next_blksz[txgoff] = 0; - /* free up all the blocks in the file. */ - dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx); - ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); - /* ASSERT(blkptrs are zero); */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(dn->dn_type != DMU_OT_NONE); @@ -541,7 +556,7 @@ ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || - list_head(list) != NULL || + dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec); dnp->dn_datablkszsec = @@ -575,22 +590,15 @@ mutex_exit(&dn->dn_mtx); /* process all the "freed" ranges in the file */ - if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) { - for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL; - rp = AVL_PREV(&dn->dn_ranges[txgoff], rp)) - dnode_sync_free_range(dn, - rp->fr_blkid, rp->fr_nblks, tx); + while (rp = avl_last(&dn->dn_ranges[txgoff])) { + dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); + /* grab the mutex so we don't race with dnode_block_freed() */ + mutex_enter(&dn->dn_mtx); + avl_remove(&dn->dn_ranges[txgoff], rp); + mutex_exit(&dn->dn_mtx); + kmem_free(rp, sizeof (free_range_t)); } - /* grab the mutex so we don't race with dnode_block_freed() */ - mutex_enter(&dn->dn_mtx); - 
for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) { - free_range_t *last = rp; - rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp); - avl_remove(&dn->dn_ranges[txgoff], last); - kmem_free(last, sizeof (free_range_t)); - } - mutex_exit(&dn->dn_mtx); if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { dnode_sync_free(dn, tx); return;
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c Tue Jul 01 12:01:12 2008 -0700 @@ -115,7 +115,7 @@ dsl_dir_diduse_space(ds->ds_dir, delta, compressed, uncompressed, tx); } -void +int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_tx_t *tx) { @@ -126,7 +126,7 @@ ASSERT(dmu_tx_is_syncing(tx)); /* No block pointer => nothing to free */ if (BP_IS_HOLE(bp)) - return; + return (0); ASSERT(used > 0); if (ds == NULL) { @@ -142,7 +142,7 @@ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, -used, -compressed, -uncompressed, tx); dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); - return; + return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); @@ -189,6 +189,8 @@ ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); ds->ds_phys->ds_uncompressed_bytes -= uncompressed; mutex_exit(&ds->ds_lock); + + return (used); } uint64_t @@ -957,21 +959,11 @@ */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, ds->ds_phys->ds_prev_snap_txg)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); - dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - /* - * Perhaps there is not enough disk - * space. Just deal with it from - * dsl_dataset_destroy_sync(). - */ - dmu_tx_abort(tx); - continue; - } - VERIFY(0 == dmu_object_free(os, obj, tx)); - dmu_tx_commit(tx); + /* + * Ignore errors, if there is not enough disk space + * we will deal with it in dsl_dataset_destroy_sync(). + */ + (void) dmu_free_object(os, obj); } dmu_objset_close(os);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Jul 01 12:01:12 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -271,7 +271,7 @@ void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, dmu_tx_t *tx); -void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks, +void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Tue Jul 01 12:01:12 2008 -0700 @@ -154,6 +154,7 @@ * operation, including metadata. */ #define DMU_MAX_ACCESS (10<<20) /* 10MB */ +#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ /* * Public routines to create, destroy, open, and close objsets. @@ -421,6 +422,9 @@ */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); +int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size); +int dmu_free_object(objset_t *os, uint64_t object); /* * Convenience functions.
--- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h Tue Jul 01 12:01:12 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -89,6 +89,7 @@ uint64_t txh_space_tofree; uint64_t txh_space_tooverwrite; uint64_t txh_space_tounref; + uint64_t txh_memory_tohold; #ifdef ZFS_DEBUG enum dmu_tx_hold_type txh_type; uint64_t txh_arg1;
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h Tue Jul 01 12:01:12 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,12 +41,19 @@ #endif /* - * Flags. + * dnode_hold() flags. */ #define DNODE_MUST_BE_ALLOCATED 1 #define DNODE_MUST_BE_FREE 2 /* + * dnode_next_offset() flags. + */ +#define DNODE_FIND_HOLE 1 +#define DNODE_FIND_BACKWARDS 2 +#define DNODE_FIND_HAVELOCK 4 + +/* * Fixed constants. */ #define DNODE_SHIFT 9 /* 512 bytes */ @@ -227,8 +234,8 @@ uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); void dnode_init(void); void dnode_fini(void); -int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl, - uint64_t blkfill, uint64_t txg); +int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, + int minlvl, uint64_t blkfill, uint64_t txg); void dnode_evict_dbufs(dnode_t *dn); #ifdef ZFS_DEBUG
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Tue Jul 01 12:01:12 2008 -0700 @@ -191,7 +191,7 @@ void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); -void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, +int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_tx_t *tx); int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c Tue Jul 01 12:01:12 2008 -0700 @@ -451,6 +451,21 @@ ASSERT3U(error, ==, 0); } +static void +zfs_unlinked_remove(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + char obj_name[17]; + int error; + + ASSERT(zp->z_unlinked); + ASSERT3U(zp->z_phys->zp_links, ==, 0); + + error = zap_remove(zfsvfs->z_os, zfsvfs->z_unlinkedobj, + zfs_unlinked_hexname(obj_name, zp->z_id), tx); + ASSERT3U(error, ==, 0); +} + /* * Clean up any znodes that had no links when we either crashed or * (force) umounted the file system. @@ -574,7 +589,6 @@ zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; znode_t *xzp = NULL; - char obj_name[17]; dmu_tx_t *tx; uint64_t acl_obj; int error; @@ -589,7 +603,7 @@ if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. - * Leave it on the unlinked set. + * Leave it in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); @@ -598,6 +612,19 @@ } /* + * Free up all the data in the file. + */ + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space. Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + + /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ @@ -609,7 +636,7 @@ acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; /* - * Set up the transaction. + * Set up the final transaction. */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); @@ -643,9 +670,7 @@ } /* Remove this znode from the unlinked set */ - error = zap_remove(os, zfsvfs->z_unlinkedobj, - zfs_unlinked_hexname(obj_name, zp->z_id), tx); - ASSERT3U(error, ==, 0); + zfs_unlinked_remove(zp, tx); zfs_znode_delete(zp, tx);
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Tue Jul 01 12:01:12 2008 -0700 @@ -1304,15 +1304,10 @@ */ if ((ZTOV(zp)->v_type == VREG) && (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { + /* we can't hold any locks when calling zfs_freesp() */ + zfs_dirent_unlock(dl); + dl = NULL; error = zfs_freesp(zp, 0, 0, mode, TRUE); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { - /* NB: we already did dmu_tx_wait() */ - zfs_dirent_unlock(dl); - VN_RELE(ZTOV(zp)); - goto top; - } - if (error == 0) { vnevent_create(ZTOV(zp), ct); } @@ -1379,7 +1374,7 @@ zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; - boolean_t unlinked; + boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; @@ -1442,8 +1437,13 @@ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); - if (may_delete_now) - dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + if (may_delete_now) { + toobig = + zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); + } /* are there any extended attributes? */ if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { @@ -1487,7 +1487,7 @@ if (unlinked) { mutex_enter(&vp->v_lock); - delete_now = may_delete_now && + delete_now = may_delete_now && !toobig && vp->v_count == 1 && !vn_has_cached_data(vp) && zp->z_phys->zp_xattr == xattr_obj && zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; @@ -1533,7 +1533,7 @@ if (!delete_now) { VN_RELE(vp); } else if (xzp) { - /* this rele delayed to prevent nesting transactions */ + /* this rele is delayed to prevent nesting transactions */ VN_RELE(ZTOV(xzp)); } @@ -2451,10 +2451,8 @@ * block if there are locks present... this * should be addressed in openat(). 
*/ - do { - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); - /* NB: we already did dmu_tx_wait() if necessary */ - } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); @@ -2725,6 +2723,7 @@ if (mask & AT_MTIME) ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE) zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); else if (mask != 0) @@ -4236,7 +4235,6 @@ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); -top: if (cmd != F_FREESP) { ZFS_EXIT(zfsvfs); return (EINVAL); @@ -4255,10 +4253,7 @@ off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ - do { - error = zfs_freesp(zp, off, len, flag, TRUE); - /* NB: we already did dmu_tx_wait() if necessary */ - } while (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); + error = zfs_freesp(zp, off, len, flag, TRUE); ZFS_EXIT(zfsvfs); return (error);
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c Tue Jul 01 12:01:12 2008 -0700 @@ -1046,14 +1046,14 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; + uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); - if (zp->z_phys->zp_acl.z_acl_extern_obj) { - VERIFY(0 == dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx)); - } - VERIFY(0 == dmu_object_free(zfsvfs->z_os, obj, tx)); + if (acl_obj) + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + VERIFY(0 == dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); zfs_znode_free(zp); @@ -1233,137 +1233,177 @@ } /* - * Free space in a file. + * Increase the file length * * IN: zp - znode of file to free data in. - * off - start of section to free. - * len - length of section to free (0 => to EOF). - * flag - current file open mode flags. + * end - new end-of-file * * RETURN: 0 if success * error code if failure */ -int -zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +static int +zfs_extend(znode_t *zp, uint64_t end) { - vnode_t *vp = ZTOV(zp); - dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + dmu_tx_t *tx; rl_t *rl; - uint64_t end = off + len; - uint64_t size, new_blksz; - uint64_t pflags = zp->z_phys->zp_flags; + uint64_t newblksz; int error; - if ((pflags & (ZFS_IMMUTABLE|ZFS_READONLY)) || - off < zp->z_phys->zp_size && (pflags & ZFS_APPENDONLY)) - return (EPERM); - - if (ZTOV(zp)->v_type == VFIFO) - return (0); - /* - * If we will change zp_size then lock the whole file, - * otherwise just lock the range being freed. + * We will change zp_size, lock the whole file. 
*/ - if (len == 0 || off + len > zp->z_phys->zp_size) { - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); - } else { - rl = zfs_range_lock(zp, off, len, RL_WRITER); - /* recheck, in case zp_size changed */ - if (off + len > zp->z_phys->zp_size) { - /* lost race: file size changed, lock whole file */ - zfs_range_unlock(rl); - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); - } - } + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ - size = zp->z_phys->zp_size; - if (len == 0 && size == off && off != 0) { + if (end <= zp->z_phys->zp_size) { zfs_range_unlock(rl); return (0); } - - /* - * Check for any locks in the region to be freed. - */ - if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) { - uint64_t start = off; - uint64_t extent = len; - - if (off > size) { - start = size; - extent += off - size; - } else if (len == 0) { - extent = size - off; - } - if (error = chklock(vp, FWRITE, start, extent, flag, NULL)) { - zfs_range_unlock(rl); - return (error); - } - } - +top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - new_blksz = 0; - if (end > size && + if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end, SPA_MAXBLOCKSIZE); + newblksz = MIN(end, SPA_MAXBLOCKSIZE); } else { - new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } - dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz)); - } else if (off < size) { - /* - * If len == 0, we are truncating the file. - */ - dmu_tx_hold_free(tx, zp->z_id, off, len ? 
len : DMU_OBJECT_END); + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } dmu_tx_abort(tx); zfs_range_unlock(rl); return (error); } - - if (new_blksz) - zfs_grow_blocksize(zp, new_blksz, tx); - - if (end > size || len == 0) - zp->z_phys->zp_size = end; - - if (off < size) { - objset_t *os = zfsvfs->z_os; - uint64_t rlen = len; + dmu_buf_will_dirty(zp->z_dbuf, tx); - if (len == 0) - rlen = -1; - else if (end > size) - rlen = size - off; - VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx)); - } + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); - if (log) { - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); - } + zp->z_phys->zp_size = end; zfs_range_unlock(rl); dmu_tx_commit(tx); + return (0); +} + +/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 if success + * error code if failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + rl_t *rl; + int error; + + /* + * Lock the range being freed. + */ + rl = zfs_range_lock(zp, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_phys->zp_size) { + zfs_range_unlock(rl); + return (0); + } + + if (off + len > zp->z_phys->zp_size) + len = zp->z_phys->zp_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + zfs_range_unlock(rl); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. 
+ * + * RETURN: 0 if success + * error code if failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + rl_t *rl; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_phys->zp_size) { + zfs_range_unlock(rl); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); + if (error) { + zfs_range_unlock(rl); + return (error); + } +top: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_range_unlock(rl); + return (error); + } + dmu_buf_will_dirty(zp->z_dbuf, tx); + + zp->z_phys->zp_size = end; + + dmu_tx_commit(tx); + + zfs_range_unlock(rl); + /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of @@ -1371,10 +1411,10 @@ * about to invalidate. */ rw_enter(&zp->z_map_lock, RW_WRITER); - if (off < size && vn_has_cached_data(vp)) { + if (vn_has_cached_data(vp)) { page_t *pp; - uint64_t start = off & PAGEMASK; - int poff = off & PAGEOFFSET; + uint64_t start = end & PAGEMASK; + int poff = end & PAGEOFFSET; if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) { /* @@ -1393,6 +1433,74 @@ return (0); } +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. 
+ * log - TRUE if this action should be logged + * + * RETURN: 0 if success + * error code if failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + int error; + + if (off > zp->z_phys->zp_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + else + return (error); + } + + /* + * Check for any locks in the region to be freed. + */ + if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) { + uint64_t length = (len ? len : zp->z_phys->zp_size - off); + if (error = chklock(vp, FWRITE, off, length, flag, NULL)) + return (error); + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_phys->zp_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + return (error); +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto log; + } + dmu_tx_abort(tx); + return (error); + } + + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + return (0); +} + void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) {
--- a/usr/src/uts/common/fs/zfs/zvol.c Tue Jul 01 11:24:56 2008 -0700 +++ b/usr/src/uts/common/fs/zfs/zvol.c Tue Jul 01 12:01:12 2008 -0700 @@ -774,24 +774,6 @@ return (0); } -static int -zvol_truncate(zvol_state_t *zv, uint64_t offset, uint64_t size) -{ - dmu_tx_t *tx; - int error; - - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_free(tx, ZVOL_OBJ, offset, size); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, offset, size, tx); - dmu_tx_commit(tx); - return (0); -} - int zvol_prealloc(zvol_state_t *zv) { @@ -823,7 +805,7 @@ if (error) { dmu_tx_abort(tx); kmem_free(data, SPA_MAXBLOCKSIZE); - (void) zvol_truncate(zv, 0, off); + (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off); return (error); } dmu_write(os, ZVOL_OBJ, off, bytes, data, tx); @@ -847,7 +829,6 @@ tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); @@ -859,7 +840,8 @@ dmu_tx_commit(tx); if (error == 0) - error = zvol_truncate(zv, volsize, DMU_OBJECT_END); + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, volsize, DMU_OBJECT_END); if (error == 0) { zv->zv_volsize = volsize; @@ -1651,7 +1633,6 @@ ASSERT(MUTEX_HELD(&zvol_state_lock)); tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, ZVOL_OBJ, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -1690,7 +1671,8 @@ /* Truncate the file */ if (!error) - error = zvol_truncate(zv, 0, DMU_OBJECT_END); + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, 0, DMU_OBJECT_END); if (error) return (error); @@ -1813,7 +1795,7 @@ (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); zvol_free_extents(zv); - (void) zvol_truncate(zv, 0, DMU_OBJECT_END); + (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); zv->zv_flags &= 
~ZVOL_DUMPIFIED; dmu_tx_commit(tx);