Mercurial > illumos > illumos-gate

--- a/usr/src/cmd/zdb/zdb.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/cmd/zdb/zdb.c	Tue Jul 01 12:01:12 2008 -0700
@@ -1093,13 +1093,13 @@
 		}

 		for (;;) {
-			error = dnode_next_offset(dn, B_FALSE, &start, minlvl,
-			    blkfill, 0);
+			error = dnode_next_offset(dn,
+			    0, &start, minlvl, blkfill, 0);
 			if (error)
 				break;
 			end = start;
-			error = dnode_next_offset(dn, B_TRUE, &end, minlvl,
-			    blkfill, 0);
+			error = dnode_next_offset(dn,
+			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
 			nicenum(end - start, segsize);
 			(void) printf("\t\tsegment [%016llx, %016llx)"
 			    " size %5s\n", (u_longlong_t)start,
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Tue Jul 01 12:01:12 2008 -0700
@@ -705,22 +705,50 @@
 	arc_release(dr->dt.dl.dr_data, db);
 }

+/*
+ * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks.  Also, if we happen across any level-1 dbufs in the
+ * range that have not already been marked dirty, mark them dirty so
+ * they stay in memory.
+ */
 void
-dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db, *db_next;
 	uint64_t txg = tx->tx_txg;
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	uint64_t first_l1 = start >> epbs;
+	uint64_t last_l1 = end >> epbs;

-	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+	if (end > dn->dn_maxblkid) {
+		end = dn->dn_maxblkid;
+		last_l1 = end >> epbs;
+	}
+	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
 	mutex_enter(&dn->dn_dbufs_mtx);
 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+		if (db->db_level == 1 &&
+		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
+			mutex_enter(&db->db_mtx);
+			if (db->db_last_dirty &&
+			    db->db_last_dirty->dr_txg < txg) {
+				dbuf_add_ref(db, FTAG);
+				mutex_exit(&db->db_mtx);
+				dbuf_will_dirty(db, tx);
+				dbuf_rele(db, FTAG);
+			} else {
+				mutex_exit(&db->db_mtx);
+			}
+		}
+
 		if (db->db_level != 0)
 			continue;
 		dprintf_dbuf(db, "found buf %s\n", "");
-		if (db->db_blkid < blkid ||
-		    db->db_blkid >= blkid+nblks)
+		if (db->db_blkid < start || db->db_blkid > end)
 			continue;

 		/* found a level 0 buffer in the range */
@@ -1161,7 +1189,7 @@
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_level+1 == dn->dn_nlevels) {
-		ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 		mutex_exit(&dn->dn_mtx);
@@ -1976,7 +2004,7 @@
 		mutex_exit(&db->db_mtx);

 		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
-			dsl_dataset_block_kill(os->os_dsl_dataset,
+			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
 			    &zio_fake.io_bp_orig, dn->dn_zio, tx);

 		dbuf_write_ready(&zio_fake, db->db_buf, db);
@@ -2105,7 +2133,7 @@
 	if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
 		zio_flags |= ZIO_FLAG_METADATA;
 	if (BP_IS_OLDER(db->db_blkptr, txg))
-		dsl_dataset_block_kill(
+		(void) dsl_dataset_block_kill(
 		    os->os_dsl_dataset, db->db_blkptr, zio, tx);

 	dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
@@ -2137,7 +2165,7 @@
 		dmu_tx_t *tx = os->os_synctx;

 		if (bp_orig->blk_birth == tx->tx_txg)
-			dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+			(void) dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
 		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
 		return;
 	}
@@ -2185,7 +2213,7 @@
 		dmu_tx_t *tx = os->os_synctx;

 		if (bp_orig->blk_birth == tx->tx_txg)
-			dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+			(void) dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
 		dsl_dataset_block_born(ds, zio->io_bp, tx);
 	}
 }
--- a/usr/src/uts/common/fs/zfs/dmu.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */

@@ -364,6 +364,152 @@
 	dnode_rele(dn, FTAG);
 }

+/*
+ * Walk backwards from *offset toward limit and leave *offset at the start of
+ * the next chunk to free; a chunk spans at most chunk_len allocated bytes.
+ */
+static int
+get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit)
+{
+	uint64_t len = *offset - limit;	/* limit <= *offset, asserted below */
+	uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT;
+	uint64_t dn_used;
+	int err;
+
+	ASSERT(limit <= *offset);
+	dn_used = dn->dn_phys->dn_used <<
+	    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES ? 0 : DEV_BSHIFT);
+	if (len <= chunk_len || dn_used <= chunk_len) {
+		*offset = limit;	/* small enough for a single chunk */
+		return (0);
+	}
+
+	while (*offset > limit) {
+		uint64_t initial_offset = *offset;
+		uint64_t delta;
+
+		/* skip over allocated data; on ESRCH jump straight to limit */
+		err = dnode_next_offset(dn,
+		    DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+		if (err == ESRCH)
+			*offset = limit;
+		else if (err)
+			return (err);
+		ASSERT3U(*offset, <=, initial_offset);
+		delta = initial_offset - *offset;
+		if (delta >= chunk_len) {
+			*offset += delta - chunk_len;	/* trim to the budget */
+			return (0);
+		}
+		chunk_len -= delta;
+		/* skip over unallocated data; it does not use up chunk_len */
+		err = dnode_next_offset(dn,
+		    DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+		if (err == ESRCH)
+			*offset = limit;
+		else if (err)
+			return (err);
+		if (*offset < limit)
+			*offset = limit;
+		ASSERT3U(*offset, <, initial_offset);
+	}
+	return (0);
+}
+
+/*
+ * Free [offset, offset + length) of dn, one chunk per transaction, working
+ * backwards from the end; length == DMU_OBJECT_END frees to end of object.
+ */
+static int
+dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
+    uint64_t length, boolean_t free_dnode)
+{
+	dmu_tx_t *tx;
+	uint64_t object_size, start, end, len;
+	boolean_t trunc = (length == DMU_OBJECT_END);
+	boolean_t free_all = (trunc && free_dnode);	/* outlives trunc */
+	int align, err;
+
+	align = 1 << dn->dn_datablkshift;
+	ASSERT(align > 0);
+	object_size = align == 1 ? dn->dn_datablksz :
+	    (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
+	if (trunc || (end = offset + length) > object_size)
+		end = object_size;
+	if (end <= offset)
+		return (0);
+	length = end - offset;
+
+	while (length) {
+		start = end;
+		err = get_next_chunk(dn, &start, offset);
+		if (err)
+			return (err);
+		len = trunc ? DMU_OBJECT_END : end - start;
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_free(tx, dn->dn_object, start, len);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+		dnode_free_range(dn, start, trunc ? -1 : len, tx);
+		/* free the dnode with the chunk at 0; trunc may be cleared */
+		if (start == 0 && free_all)
+			dnode_free(dn, tx);
+		length -= end - start;
+		dmu_tx_commit(tx);
+		end = start;
+		trunc = FALSE;	/* only the tail-most chunk truncates */
+	}
+	return (0);
+}
+
+/* Hold the dnode of object, free the given range in chunks, then release. */
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t length)
+{
+	dnode_t *dnp;
+	int error = dnode_hold(os->os, object, FTAG, &dnp);
+
+	if (error == 0) {
+		error = dmu_free_long_range_impl(os, dnp, offset, length, FALSE);
+		dnode_rele(dnp, FTAG);
+	}
+	return (error);
+}
+
+/* Free all of object's blocks and its dnode, chunking multi-level objects. */
+int
+dmu_free_object(objset_t *os, uint64_t object)
+{
+	dnode_t *dn;
+	int rc;
+
+	rc = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+	    FTAG, &dn);
+	if (rc != 0)
+		return (rc);
+	if (dn->dn_nlevels != 1) {
+		rc = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
+	} else {
+		dmu_tx_t *txp = dmu_tx_create(os);	/* one tx is enough */
+		dmu_tx_hold_bonus(txp, object);
+		dmu_tx_hold_free(txp, dn->dn_object, 0, DMU_OBJECT_END);
+		rc = dmu_tx_assign(txp, TXG_WAIT);
+		if (rc != 0) {
+			dmu_tx_abort(txp);
+		} else {
+			dnode_free_range(dn, 0, DMU_OBJECT_END, txp);
+			dnode_free(dn, txp);
+			dmu_tx_commit(txp);
+		}
+	}
+	dnode_rele(dn, FTAG);
+	return (rc);
+}
+
 int
 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
@@ -912,7 +1058,7 @@
 			return (err);
 	}

-	err = dnode_next_offset(dn, hole, off, 1, 1, 0);
+	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
 	dnode_rele(dn, FTAG);

 	return (err);
--- a/usr/src/uts/common/fs/zfs/dmu_object.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c	Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */

@@ -54,7 +54,8 @@
 		if (P2PHASE(object, L2_dnode_count) == 0) {
 			uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
 			int error = dnode_next_offset(osi->os_meta_dnode,
-			    B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0);
+			    DNODE_FIND_HOLE,
+			    &offset, 2, DNODES_PER_BLOCK >> 2, 0);
 			restarted = B_TRUE;
 			if (error == 0)
 				object = offset >> DNODE_SHIFT;
@@ -139,6 +140,7 @@
 		return (err);

 	ASSERT(dn->dn_type != DMU_OT_NONE);
+	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
 	dnode_free(dn, tx);
 	dnode_rele(dn, FTAG);

@@ -152,7 +154,7 @@
 	int error;

 	error = dnode_next_offset(os->os->os_meta_dnode,
-	    hole, &offset, 0, DNODES_PER_BLOCK, txg);
+	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

 	*objectp = offset >> DNODE_SHIFT;
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Tue Jul 01 12:01:12 2008 -0700
@@ -829,7 +829,7 @@
 	if (!DVA_EQUAL(BP_IDENTITY(bp),
 	    BP_IDENTITY(&zio->io_bp_orig))) {
 		if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
-			dsl_dataset_block_kill(os->os_dsl_dataset,
+			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
 			    &zio->io_bp_orig, NULL, os->os_synctx);
 		dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
 	}
@@ -878,7 +878,7 @@
 	zb.zb_level = -1;
 	zb.zb_blkid = 0;
 	if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
-		dsl_dataset_block_kill(os->os_dsl_dataset,
+		(void) dsl_dataset_block_kill(os->os_dsl_dataset,
 		    os->os_rootbp, pio, tx);
 	}
 	zio = arc_write(pio, os->os_spa, os->os_md_checksum,
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Tue Jul 01 12:01:12 2008 -0700
@@ -877,23 +877,14 @@
 	for (obj = drrfo->drr_firstobj;
 	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
 	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
-		dmu_tx_t *tx;
 		int err;

 		if (dmu_object_info(os, obj, NULL) != 0)
 			continue;

-		tx = dmu_tx_create(os);
-		dmu_tx_hold_bonus(tx, obj);
-		err = dmu_tx_assign(tx, TXG_WAIT);
-		if (err) {
-			dmu_tx_abort(tx);
+		err = dmu_free_object(os, obj);
+		if (err)
 			return (err);
-		}
-		err = dmu_object_free(os, obj, tx);
-		dmu_tx_commit(tx);
-		if (err && err != ENOENT)
-			return (EINVAL);
 	}
 	return (0);
 }
@@ -939,7 +930,6 @@
 restore_free(struct restorearg *ra, objset_t *os,
     struct drr_free *drrf)
 {
-	dmu_tx_t *tx;
 	int err;

 	if (drrf->drr_length != -1ULL &&
@@ -949,18 +939,8 @@
 	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
 		return (EINVAL);

-	tx = dmu_tx_create(os);
-
-	dmu_tx_hold_free(tx, drrf->drr_object,
+	err = dmu_free_long_range(os, drrf->drr_object,
 	    drrf->drr_offset, drrf->drr_length);
-	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
-		dmu_tx_abort(tx);
-		return (err);
-	}
-	err = dmu_free_range(os, drrf->drr_object,
-	    drrf->drr_offset, drrf->drr_length, tx);
-	dmu_tx_commit(tx);
 	return (err);
 }
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c	Tue Jul 01 12:01:12 2008 -0700
@@ -320,39 +320,25 @@
 static void
 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 {
-	uint64_t blkid, nblks;
-	uint64_t space = 0, unref = 0;
+	uint64_t blkid, nblks, lastblk;
+	uint64_t space = 0, unref = 0, skipped = 0;
 	dnode_t *dn = txh->txh_dnode;
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
-	int dirty;
+	int epbs;

-	/*
-	 * We don't need to use any locking to check for dirtyness
-	 * because it's OK if we get stale data -- the dnode may become
-	 * dirty immediately after our check anyway.  This is just a
-	 * means to avoid the expensive count when we aren't sure we
-	 * need it.  We need to be able to deal with a dirty dnode.
-	 */
-	dirty = list_link_active(&dn->dn_dirty_link[0]) |
-	    list_link_active(&dn->dn_dirty_link[1]) |
-	    list_link_active(&dn->dn_dirty_link[2]) |
-	    list_link_active(&dn->dn_dirty_link[3]);
-	if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0)
+	if (dn->dn_nlevels == 0)
 		return;

 	/*
-	 * the struct_rwlock protects us against dn_phys->dn_nlevels
+	 * The struct_rwlock protects us against dn_nlevels
 	 * changing, in case (against all odds) we manage to dirty &
 	 * sync out the changes after we check for being dirty.
-	 * also, dbuf_hold_impl() wants us to have the struct_rwlock.
-	 *
-	 * It's fine to use dn_datablkshift rather than the dn_phys
-	 * equivalent because if it is changing, maxblkid==0 and we will
-	 * bail.
+	 * Also, dbuf_hold_level() wants us to have the struct_rwlock.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	if (dn->dn_phys->dn_maxblkid == 0) {
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	if (dn->dn_maxblkid == 0) {
 		if (off == 0 && len >= dn->dn_datablksz) {
 			blkid = 0;
 			nblks = 1;
@@ -362,24 +348,21 @@
 		}
 	} else {
 		blkid = off >> dn->dn_datablkshift;
-		nblks = (off + len) >> dn->dn_datablkshift;
+		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;

-		if (blkid >= dn->dn_phys->dn_maxblkid) {
+		if (blkid >= dn->dn_maxblkid) {
 			rw_exit(&dn->dn_struct_rwlock);
 			return;
 		}
-		if (blkid + nblks > dn->dn_phys->dn_maxblkid)
-			nblks = dn->dn_phys->dn_maxblkid - blkid;
+		if (blkid + nblks > dn->dn_maxblkid)
+			nblks = dn->dn_maxblkid - blkid;

-		/* don't bother after 128,000 blocks */
-		nblks = MIN(nblks, 128*1024);
 	}
-
-	if (dn->dn_phys->dn_nlevels == 1) {
+	if (dn->dn_nlevels == 1) {
 		int i;
 		for (i = 0; i < nblks; i++) {
 			blkptr_t *bp = dn->dn_phys->dn_blkptr;
-			ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
+			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 			bp += blkid + i;
 			if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
 				dprintf_bp(bp, "can free old%s", "");
@@ -390,51 +373,86 @@
 		nblks = 0;
 	}

+	/*
+	 * Add in memory requirements of higher-level indirects
+	 */
+	if (nblks && dn->dn_nlevels > 2) {
+		uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
+		int level = 2;
+
+		while (level++ < dn->dn_nlevels) {
+			txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
+			blkcnt = 1 + (blkcnt >> epbs);
+		}
+		ASSERT(blkcnt <= dn->dn_nblkptr);
+	}
+
+	lastblk = blkid + nblks - 1;
 	while (nblks) {
 		dmu_buf_impl_t *dbuf;
-		int err, epbs, blkoff, tochk;
-
-		epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-		blkoff = P2PHASE(blkid, 1<<epbs);
-		tochk = MIN((1<<epbs) - blkoff, nblks);
-
-		err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
-		if (err == 0) {
-			int i;
-			blkptr_t *bp;
+		uint64_t ibyte, new_blkid;
+		int epb = 1 << epbs;
+		int err, i, blkoff, tochk;
+		blkptr_t *bp;

-			err = dbuf_read(dbuf, NULL,
-			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
-			if (err != 0) {
-				txh->txh_tx->tx_err = err;
-				dbuf_rele(dbuf, FTAG);
-				break;
-			}
-
-			bp = dbuf->db.db_data;
-			bp += blkoff;
-
-			for (i = 0; i < tochk; i++) {
-				if (dsl_dataset_block_freeable(ds,
-				    bp[i].blk_birth)) {
-					dprintf_bp(&bp[i],
-					    "can free old%s", "");
-					space += bp_get_dasize(spa, &bp[i]);
-				}
-				unref += BP_GET_ASIZE(bp);
-			}
-			dbuf_rele(dbuf, FTAG);
-		}
-		if (err && err != ENOENT) {
+		ibyte = blkid << dn->dn_datablkshift;
+		err = dnode_next_offset(dn,
+		    DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
+		new_blkid = ibyte >> dn->dn_datablkshift;
+		if (err == ESRCH)
+			break;
+		if (err) {
 			txh->txh_tx->tx_err = err;
 			break;
 		}
+		if (new_blkid > lastblk)
+			break;
+
+		if (new_blkid > blkid) {
+			skipped += new_blkid - blkid - 1;
+			nblks -= new_blkid - blkid;
+			blkid = new_blkid;
+		}
+		blkoff = P2PHASE(blkid, epb);
+		tochk = MIN(epb - blkoff, nblks);
+
+		dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);
+
+		txh->txh_memory_tohold += dbuf->db.db_size;
+		if (txh->txh_memory_tohold > DMU_MAX_ACCESS) {
+			txh->txh_tx->tx_err = E2BIG;
+			dbuf_rele(dbuf, FTAG);
+			break;
+		}
+		err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
+		if (err != 0) {
+			txh->txh_tx->tx_err = err;
+			dbuf_rele(dbuf, FTAG);
+			break;
+		}
+
+		bp = dbuf->db.db_data;
+		bp += blkoff;
+
+		for (i = 0; i < tochk; i++) {
+			if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
+				dprintf_bp(&bp[i], "can free old%s", "");
+				space += bp_get_dasize(spa, &bp[i]);
+			}
+			unref += BP_GET_ASIZE(bp);
+		}
+		dbuf_rele(dbuf, FTAG);

 		blkid += tochk;
 		nblks -= tochk;
 	}
 	rw_exit(&dn->dn_struct_rwlock);

+	/* account for new level 1 indirect blocks that might show up */
+	if (skipped) {
+		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
+		txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
+	}
 	txh->txh_space_tofree += space;
 	txh->txh_space_tounref += unref;
 }
@@ -471,7 +489,7 @@
 	/*
 	 * For i/o error checking, read the first and last level-0
 	 * blocks, and all the level-1 blocks.  The above count_write's
-	 * will take care of the level-0 blocks.
+	 * have already taken care of the level-0 blocks.
 	 */
 	if (dn->dn_nlevels > 1) {
 		shift = dn->dn_datablkshift + dn->dn_indblkshift -
@@ -483,7 +501,7 @@
 		    NULL, NULL, ZIO_FLAG_CANFAIL);
 		for (i = start; i <= end; i++) {
 			uint64_t ibyte = i << shift;
-			err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0);
+			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
 			if (err == ESRCH)
 				break;
@@ -706,12 +724,13 @@
 					match_offset = TRUE;
 				break;
 			case THT_FREE:
-				if (blkid == beginblk &&
-				    (txh->txh_arg1 != 0 ||
-				    dn->dn_maxblkid == 0))
-					match_offset = TRUE;
-				if (blkid == endblk &&
-				    txh->txh_arg2 != DMU_OBJECT_END)
+				/*
+				 * We will dirty all the level 1 blocks in
+				 * the free range and perhaps the first and
+				 * last level 0 block.
+				 */
+				if (blkid >= beginblk && (blkid <= endblk ||
+				    txh->txh_arg2 == DMU_OBJECT_END))
 					match_offset = TRUE;
 				break;
 			case THT_BONUS:
@@ -742,8 +761,8 @@
 {
 	dmu_tx_hold_t *txh;
 	spa_t *spa = tx->tx_pool->dp_spa;
-	uint64_t lsize, asize, fsize, usize;
-	uint64_t towrite, tofree, tooverwrite, tounref;
+	uint64_t memory, asize, fsize, usize;
+	uint64_t towrite, tofree, tooverwrite, tounref, tohold;

 	ASSERT3U(tx->tx_txg, ==, 0);

@@ -776,7 +795,7 @@
 	 * dmu_tx_unassign() logic.
 	 */

-	towrite = tofree = tooverwrite = tounref = 0;
+	towrite = tofree = tooverwrite = tounref = tohold = 0;
 	for (txh = list_head(&tx->tx_holds); txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
@@ -797,6 +816,7 @@
 		tofree += txh->txh_space_tofree;
 		tooverwrite += txh->txh_space_tooverwrite;
 		tounref += txh->txh_space_tounref;
+		tohold += txh->txh_memory_tohold;
 	}

 	/*
@@ -817,24 +837,27 @@
 		tooverwrite = tofree = 0;
 	}

-	/*
-	 * Convert logical size to worst-case allocated size.
-	 */
+	/* needed allocation: worst-case estimate of write space */
+	asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
+	/* freed space estimate: worst-case overwrite + free estimate */
 	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
-	lsize = towrite + tooverwrite;
-	asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+	/* convert unrefd space to worst-case estimate */
 	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
+	/* calculate memory footprint estimate */
+	memory = towrite + tooverwrite + tohold;

 #ifdef ZFS_DEBUG
-	tx->tx_space_towrite = asize;
+	/* add in 'tohold' to account for our dirty holds on this memory */
+	tx->tx_space_towrite = asize +
+	    spa_get_asize(tx->tx_pool->dp_spa, tohold);
 	tx->tx_space_tofree = tofree;
 	tx->tx_space_tooverwrite = tooverwrite;
 	tx->tx_space_tounref = tounref;
 #endif

 	if (tx->tx_dir && asize != 0) {
-		int err = dsl_dir_tempreserve_space(tx->tx_dir,
-		    lsize, asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
+		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
+		    asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
 		if (err)
 			return (err);
 	}
--- a/usr/src/uts/common/fs/zfs/dnode.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode.c	Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */

@@ -780,7 +780,7 @@
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db, *db_next;
-	int have_db0 = FALSE;
+	int err;

 	if (size == 0)
 		size = SPA_MINBLOCKSIZE;
@@ -805,9 +805,7 @@
 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);

-		if (db->db_blkid == 0) {
-			have_db0 = TRUE;
-		} else if (db->db_blkid != DB_BONUS_BLKID) {
+		if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) {
 			mutex_exit(&dn->dn_dbufs_mtx);
 			goto fail;
 		}
@@ -817,12 +815,12 @@
 	if (ibs && dn->dn_nlevels != 1)
 		goto fail;

-	db = NULL;
-	if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
-		/* obtain the old block */
-		db = dbuf_hold(dn, 0, FTAG);
+	/* resize the old block */
+	err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+	if (err == 0)
 		dbuf_new_size(db, size, tx);
-	}
+	else if (err != ENOENT)
+		goto fail;

 	dnode_setdblksz(dn, size);
 	dnode_setdirty(dn, tx);
@@ -831,7 +829,7 @@
 		dn->dn_indblkshift = ibs;
 		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
 	}
-
+	/* rele after we have fixed the blocksize in the dnode */
 	if (db)
 		dbuf_rele(db, FTAG);

@@ -969,15 +967,15 @@
 {
 	dmu_buf_impl_t *db;
 	uint64_t blkoff, blkid, nblks;
-	int blksz, head;
+	int blksz, blkshift, head, tail;
 	int trunc = FALSE;
+	int epbs;

 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	blksz = dn->dn_datablksz;
+	blkshift = dn->dn_datablkshift;
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

-	/* If the range is past the end of the file, this is a no-op */
-	if (off >= blksz * (dn->dn_maxblkid+1))
-		goto out;
 	if (len == -1ULL) {
 		len = UINT64_MAX - off;
 		trunc = TRUE;
@@ -989,11 +987,18 @@
 	if (ISP2(blksz)) {
 		head = P2NPHASE(off, blksz);
 		blkoff = P2PHASE(off, blksz);
+		if ((off >> blkshift) > dn->dn_maxblkid)
+			goto out;
 	} else {
 		ASSERT(dn->dn_maxblkid == 0);
 		if (off == 0 && len >= blksz) {
-			/* Freeing the whole block; don't do any head. */
-			head = 0;
+			/* Freeing the whole block; fast-track this request */
+			blkid = 0;
+			nblks = 1;
+			goto done;
+		} else if (off > blkid) {
+			/* Freeing past end-of-data */
+			goto out;
 		} else {
 			/* Freeing part of the block. */
 			head = blksz - off;
@@ -1026,88 +1031,85 @@
 	}

 	/* If the range was less than one block, we're done */
-	if (len == 0 || off >= blksz * (dn->dn_maxblkid+1))
+	if (len == 0)
+		goto out;
+
+	ASSERT(ISP2(blksz));
+	/* If the remaining range is past end of file, we're done */
+	if ((off >> blkshift) > dn->dn_maxblkid)
+		goto out;
+
+	if (trunc)
+		tail = 0;
+	else
+		tail = P2PHASE(len, blksz);
+
+	ASSERT3U(P2PHASE(off, blksz), ==, 0);
+	/* zero out any partial block data at the end of the range */
+	if (tail) {
+		if (len < tail)
+			tail = len;
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+		    TRUE, FTAG, &db) == 0) {
+			/* don't dirty if not on disk and not dirty */
+			if (db->db_last_dirty ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+				rw_exit(&dn->dn_struct_rwlock);
+				dbuf_will_dirty(db, tx);
+				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+				bzero(db->db.db_data, tail);
+			}
+			dbuf_rele(db, FTAG);
+		}
+		len -= tail;
+	}
+
+	/* If the range did not include a full block, we are done */
+	if (len == 0)
 		goto out;

-	if (!ISP2(blksz)) {
-		/*
-		 * They are freeing the whole block of a
-		 * non-power-of-two blocksize file.  Skip all the messy
-		 * math.
-		 */
-		ASSERT3U(off, ==, 0);
-		ASSERT3U(len, >=, blksz);
-		blkid = 0;
-		nblks = 1;
-	} else {
-		int tail;
-		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-		int blkshift = dn->dn_datablkshift;
+	ASSERT(IS_P2ALIGNED(off, blksz));
+	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
+	blkid = off >> blkshift;
+	nblks = len >> blkshift;
+	if (trunc)
+		nblks += 1;

-		/* If the remaining range is past end of file, we're done */
-		if (off > dn->dn_maxblkid << blkshift)
-			goto out;
+	/*
+	 * Read in and mark all the level-1 indirects dirty,
+	 * so that they will stay in memory until syncing phase.
+	 */
+	if (dn->dn_nlevels > 1) {
+		uint64_t i, first, last;
+		int shift = epbs + dn->dn_datablkshift;

-		if (off + len == UINT64_MAX)
-			tail = 0;
+		first = blkid >> epbs;
+		if (trunc)
+			last = dn->dn_maxblkid >> epbs;
 		else
-			tail = P2PHASE(len, blksz);
+			last = (blkid + nblks - 1) >> epbs;
+		for (i = first; i <= last; i++) {
+			uint64_t ibyte = i << shift;
+			int err;

-		ASSERT3U(P2PHASE(off, blksz), ==, 0);
-		/* zero out any partial block data at the end of the range */
-		if (tail) {
-			if (len < tail)
-				tail = len;
-			if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
-			    TRUE, FTAG, &db) == 0) {
-				/* don't dirty if not on disk and not dirty */
-				if (db->db_last_dirty ||
-				    (db->db_blkptr &&
-				    !BP_IS_HOLE(db->db_blkptr))) {
-					rw_exit(&dn->dn_struct_rwlock);
-					dbuf_will_dirty(db, tx);
-					rw_enter(&dn->dn_struct_rwlock,
-					    RW_WRITER);
-					bzero(db->db.db_data, tail);
-				}
+			err = dnode_next_offset(dn,
+			    DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
+			i = ibyte >> shift;
+			if (err == ESRCH || i > last)
+				break;
+			ASSERT(err == 0);
+			db = dbuf_hold_level(dn, 1, i, FTAG);
+			if (db) {
+				dbuf_will_dirty(db, tx);
 				dbuf_rele(db, FTAG);
 			}
-			len -= tail;
 		}
-		/* If the range did not include a full block, we are done */
-		if (len == 0)
-			goto out;
-
-		/* dirty the left indirects */
-		if (dn->dn_nlevels > 1 && off != 0) {
-			db = dbuf_hold_level(dn, 1,
-			    (off - head) >> (blkshift + epbs), FTAG);
-			dbuf_will_dirty(db, tx);
-			dbuf_rele(db, FTAG);
-		}
-
-		/* dirty the right indirects */
-		if (dn->dn_nlevels > 1 && !trunc) {
-			db = dbuf_hold_level(dn, 1,
-			    (off + len + tail - 1) >> (blkshift + epbs), FTAG);
-			dbuf_will_dirty(db, tx);
-			dbuf_rele(db, FTAG);
-		}
-
-		/*
-		 * Finally, add this range to the dnode range list, we
-		 * will finish up this free operation in the syncing phase.
-		 */
-		ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
-		ASSERT(off + len == UINT64_MAX ||
-		    IS_P2ALIGNED(len, 1<<blkshift));
-		blkid = off >> blkshift;
-		nblks = len >> blkshift;
-
-		if (trunc)
-			dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
 	}
-
+done:
+	/*
+	 * Add this range to the dnode range list.
+	 * We will finish up this free operation in the syncing phase.
+	 */
 	mutex_enter(&dn->dn_mtx);
 	dnode_clear_range(dn, blkid, nblks, tx);
 	{
@@ -1127,9 +1129,12 @@
 	}
 	mutex_exit(&dn->dn_mtx);

-	dbuf_free_range(dn, blkid, nblks, tx);
+	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
 	dnode_setdirty(dn, tx);
 out:
+	if (trunc && dn->dn_maxblkid >= (off >> blkshift))
+		dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
+
 	rw_exit(&dn->dn_struct_rwlock);
 }

@@ -1229,7 +1234,7 @@
 }

 static int
-dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
+dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 	int lvl, uint64_t blkfill, uint64_t txg)
 {
 	dmu_buf_impl_t *db = NULL;
@@ -1237,11 +1242,15 @@
 	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	uint64_t epb = 1ULL << epbs;
 	uint64_t minfill, maxfill;
-	int i, error, span;
+	boolean_t hole;
+	int i, inc, error, span;

 	dprintf("probing object %llu offset %llx level %d of %u\n",
 	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);

+	hole = flags & DNODE_FIND_HOLE;
+	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
+
 	if (lvl == dn->dn_phys->dn_nlevels) {
 		error = 0;
 		epb = dn->dn_phys->dn_nblkptr;
@@ -1270,7 +1279,8 @@
 		span = DNODE_SHIFT;
 		ASSERT(dn->dn_type == DMU_OT_DNODE);

-		for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
+		for (i = (*offset >> span) & (blkfill - 1);
+		    i >= 0 && i < blkfill; i += inc) {
 			boolean_t newcontents = B_TRUE;
 			if (txg) {
 				int j;
@@ -1282,9 +1292,9 @@
 			}
 			if (!dnp[i].dn_type == hole && newcontents)
 				break;
-			*offset += 1ULL << span;
+			*offset += (1ULL << span) * inc;
 		}
-		if (i == blkfill)
+		if (i < 0 || i == blkfill)
 			error = ESRCH;
 	} else {
 		blkptr_t *bp = data;
@@ -1298,14 +1308,14 @@
 			minfill++;

 		for (i = (*offset >> span) & ((1ULL << epbs) - 1);
-		    i < epb; i++) {
+		    i >= 0 && i < epb; i += inc) {
 			if (bp[i].blk_fill >= minfill &&
 			    bp[i].blk_fill <= maxfill &&
 			    bp[i].blk_birth > txg)
 				break;
-			*offset += 1ULL << span;
+			*offset += (1ULL << span) * inc;
 		}
-		if (i >= epb)
+		if (i < 0 || i == epb)
 			error = ESRCH;
 	}

@@ -1324,64 +1334,66 @@
  *
  * Examples:
  *
- * dnode_next_offset(dn, hole, offset, 1, 1, 0);
- *	Finds the next hole/data in a file.
+ * dnode_next_offset(dn, flags, offset, 1, 1, 0);
+ *	Finds the next/previous hole/data in a file.
  *	Used in dmu_offset_next().
  *
- * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg);
+ * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
  *	Finds the next free/allocated dnode an objset's meta-dnode.
  *	Only finds objects that have new contents since txg (ie.
  *	bonus buffer changes and content removal are ignored).
  *	Used in dmu_object_next().
  *
- * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
  *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
  *	Used in dmu_object_alloc().
  */
 int
-dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
+dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
     int minlvl, uint64_t blkfill, uint64_t txg)
 {
+	uint64_t initial_offset = *offset;
 	int lvl, maxlvl;
 	int error = 0;
-	uint64_t initial_offset = *offset;

-	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (!(flags & DNODE_FIND_HAVELOCK))
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);

 	if (dn->dn_phys->dn_nlevels == 0) {
-		rw_exit(&dn->dn_struct_rwlock);
-		return (ESRCH);
+		error = ESRCH;
+		goto out;
 	}

 	if (dn->dn_datablkshift == 0) {
 		if (*offset < dn->dn_datablksz) {
-			if (hole)
+			if (flags & DNODE_FIND_HOLE)
 				*offset = dn->dn_datablksz;
 		} else {
 			error = ESRCH;
 		}
-		rw_exit(&dn->dn_struct_rwlock);
-		return (error);
+		goto out;
 	}

 	maxlvl = dn->dn_phys->dn_nlevels;

 	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
 		error = dnode_next_offset_level(dn,
-		    hole, offset, lvl, blkfill, txg);
+		    flags, offset, lvl, blkfill, txg);
 		if (error != ESRCH)
 			break;
 	}

-	while (--lvl >= minlvl && error == 0) {
+	while (error == 0 && --lvl >= minlvl) {
 		error = dnode_next_offset_level(dn,
-		    hole, offset, lvl, blkfill, txg);
+		    flags, offset, lvl, blkfill, txg);
 	}

-	rw_exit(&dn->dn_struct_rwlock);
-
-	if (error == 0 && initial_offset > *offset)
+	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
+	    initial_offset < *offset : initial_offset > *offset))
 		error = ESRCH;
+out:
+	if (!(flags & DNODE_FIND_HAVELOCK))
+		rw_exit(&dn->dn_struct_rwlock);

 	return (error);
 }
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c	Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */

@@ -109,25 +109,26 @@
 	rw_exit(&dn->dn_struct_rwlock);
 }

-static void
+static int
 free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
 {
-	objset_impl_t *os = dn->dn_objset;
+	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	uint64_t bytesfreed = 0;
-	int i;
+	int i, blocks_freed = 0;

-	dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
+	dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);

 	for (i = 0; i < num; i++, bp++) {
 		if (BP_IS_HOLE(bp))
 			continue;

-		bytesfreed += bp_get_dasize(os->os_spa, bp);
+		bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx);
 		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
-		dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx);
 		bzero(bp, sizeof (blkptr_t));
+		blocks_freed += 1;
 	}
 	dnode_diduse_space(dn, -bytesfreed);
+	return (blocks_freed);
 }

 #ifdef ZFS_DEBUG
@@ -205,6 +206,8 @@
 }
 #endif

+#define	ALL -1
+
 static int
 free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
     dmu_tx_t *tx)
@@ -215,8 +218,18 @@
 	uint64_t start, end, dbstart, dbend, i;
 	int epbs, shift, err;
 	int all = TRUE;
+	int blocks_freed = 0;

-	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+	/*
+	 * There is a small possibility that this block will not be cached:
+	 *   1 - if level > 1 and there are no children with level <= 1
+	 *   2 - if we didn't get a dirty hold (because this block had just
+	 *	 finished being written -- and so had no holds), and then this
+	 *	 block got evicted before we got here.
+	 */
+	if (db->db_state != DB_CACHED)
+		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+
 	arc_release(db->db_buf, db);
 	bp = (blkptr_t *)db->db.db_data;

@@ -240,10 +253,10 @@

 	if (db->db_level == 1) {
 		FREE_VERIFY(db, start, end, tx);
-		free_blocks(dn, bp, end-start+1, tx);
+		blocks_freed = free_blocks(dn, bp, end-start+1, tx);
 		arc_buf_freeze(db->db_buf);
-		ASSERT(all || db->db_last_dirty);
-		return (all);
+		ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+		return (all ? ALL : blocks_freed);
 	}

 	for (i = start; i <= end; i++, bp++) {
@@ -254,9 +267,9 @@
 		ASSERT3U(err, ==, 0);
 		rw_exit(&dn->dn_struct_rwlock);

-		if (free_children(subdb, blkid, nblks, trunc, tx)) {
+		if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) {
 			ASSERT3P(subdb->db_blkptr, ==, bp);
-			free_blocks(dn, bp, 1, tx);
+			blocks_freed += free_blocks(dn, bp, 1, tx);
 		} else {
 			all = FALSE;
 		}
@@ -273,8 +286,8 @@
 		ASSERT3U(bp->blk_birth, ==, 0);
 	}
 #endif
-	ASSERT(all || db->db_last_dirty);
-	return (all);
+	ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+	return (all ? ALL : blocks_freed);
 }

 /*
@@ -304,15 +317,14 @@
 			return;
 		}
 		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
-		free_blocks(dn, bp + blkid, nblks, tx);
+		(void) free_blocks(dn, bp + blkid, nblks, tx);
 		if (trunc) {
 			uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
 			    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 			dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
 			ASSERT(off < dn->dn_phys->dn_maxblkid ||
 			    dn->dn_phys->dn_maxblkid == 0 ||
-			    dnode_next_offset(dn, FALSE, &off,
-			    1, 1, 0) != 0);
+			    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
 		}
 		return;
 	}
@@ -330,9 +342,9 @@
 		ASSERT3U(err, ==, 0);
 		rw_exit(&dn->dn_struct_rwlock);

-		if (free_children(db, blkid, nblks, trunc, tx)) {
+		if (free_children(db, blkid, nblks, trunc, tx) == ALL) {
 			ASSERT3P(db->db_blkptr, ==, bp);
-			free_blocks(dn, bp, 1, tx);
+			(void) free_blocks(dn, bp, 1, tx);
 		}
 		dbuf_rele(db, FTAG);
 	}
@@ -342,7 +354,7 @@
 		dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
 		ASSERT(off < dn->dn_phys->dn_maxblkid ||
 		    dn->dn_phys->dn_maxblkid == 0 ||
-		    dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
+		    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
 	}
 }

@@ -442,6 +454,13 @@

 	ASSERT(dmu_tx_is_syncing(tx));

+	/*
+	 * Our contents should have been freed in dnode_sync() by the
+	 * free range record inserted by the caller of dnode_free().
+	 */
+	ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
+	ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
+
 	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 	dnode_evict_dbufs(dn);
 	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
@@ -461,10 +480,6 @@
 	dn->dn_next_indblkshift[txgoff] = 0;
 	dn->dn_next_blksz[txgoff] = 0;

-	/* free up all the blocks in the file. */
-	dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
-	ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
-
 	/* ASSERT(blkptrs are zero); */
 	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
 	ASSERT(dn->dn_type != DMU_OT_NONE);
@@ -541,7 +556,7 @@
 		ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
 		    SPA_MINBLOCKSIZE) == 0);
 		ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
-		    list_head(list) != NULL ||
+		    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
 		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
 		    dnp->dn_datablkszsec);
 		dnp->dn_datablkszsec =
@@ -575,22 +590,15 @@
 	mutex_exit(&dn->dn_mtx);

 	/* process all the "freed" ranges in the file */
-	if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
-		for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL;
-		    rp = AVL_PREV(&dn->dn_ranges[txgoff], rp))
-			dnode_sync_free_range(dn,
-			    rp->fr_blkid, rp->fr_nblks, tx);
+	while (rp = avl_last(&dn->dn_ranges[txgoff])) {
+		dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
+		/* grab the mutex so we don't race with dnode_block_freed() */
+		mutex_enter(&dn->dn_mtx);
+		avl_remove(&dn->dn_ranges[txgoff], rp);
+		mutex_exit(&dn->dn_mtx);
+		kmem_free(rp, sizeof (free_range_t));
 	}
-	/* grab the mutex so we don't race with dnode_block_freed() */
-	mutex_enter(&dn->dn_mtx);
-	for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {

-		free_range_t *last = rp;
-		rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
-		avl_remove(&dn->dn_ranges[txgoff], last);
-		kmem_free(last, sizeof (free_range_t));
-	}
-	mutex_exit(&dn->dn_mtx);
 	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
 		dnode_sync_free(dn, tx);
 		return;
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Tue Jul 01 12:01:12 2008 -0700
@@ -115,7 +115,7 @@
 	dsl_dir_diduse_space(ds->ds_dir, delta, compressed, uncompressed, tx);
 }

-void
+int
 dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
     dmu_tx_t *tx)
 {
@@ -126,7 +126,7 @@
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* No block pointer => nothing to free */
 	if (BP_IS_HOLE(bp))
-		return;
+		return (0);

 	ASSERT(used > 0);
 	if (ds == NULL) {
@@ -142,7 +142,7 @@
 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
 		    -used, -compressed, -uncompressed, tx);
 		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
-		return;
+		return (used);
 	}
 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

@@ -189,6 +189,8 @@
 	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
 	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
 	mutex_exit(&ds->ds_lock);
+
+	return (used);
 }

 uint64_t
@@ -957,21 +959,11 @@
 	 */
 	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
 	    ds->ds_phys->ds_prev_snap_txg)) {
-		dmu_tx_t *tx = dmu_tx_create(os);
-		dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
-		dmu_tx_hold_bonus(tx, obj);
-		err = dmu_tx_assign(tx, TXG_WAIT);
-		if (err) {
-			/*
-			 * Perhaps there is not enough disk
-			 * space.  Just deal with it from
-			 * dsl_dataset_destroy_sync().
-			 */
-			dmu_tx_abort(tx);
-			continue;
-		}
-		VERIFY(0 == dmu_object_free(os, obj, tx));
-		dmu_tx_commit(tx);
+		/*
+		 * Ignore errors; if there is not enough disk space,
+		 * we will deal with it in dsl_dataset_destroy_sync().
+		 */
+		(void) dmu_free_object(os, obj);
 	}

 	dmu_objset_close(os);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h	Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */

@@ -271,7 +271,7 @@
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
 void dbuf_sync_list(list_t *list, dmu_tx_t *tx);

-void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
+void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
     struct dmu_tx *);

 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Tue Jul 01 12:01:12 2008 -0700
@@ -154,6 +154,7 @@
  * operation, including metadata.
  */
 #define	DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define	DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */

 /*
  * Public routines to create, destroy, open, and close objsets.
@@ -421,6 +422,9 @@
  */
 int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size, dmu_tx_t *tx);
+int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
+	uint64_t size);
+int dmu_free_object(objset_t *os, uint64_t object);

 /*
  * Convenience functions.
--- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h	Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */

@@ -89,6 +89,7 @@
 	uint64_t txh_space_tofree;
 	uint64_t txh_space_tooverwrite;
 	uint64_t txh_space_tounref;
+	uint64_t txh_memory_tohold;
 #ifdef ZFS_DEBUG
 	enum dmu_tx_hold_type txh_type;
 	uint64_t txh_arg1;
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h	Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */

@@ -41,12 +41,19 @@
 #endif

 /*
- * Flags.
+ * dnode_hold() flags.
  */
 #define	DNODE_MUST_BE_ALLOCATED	1
 #define	DNODE_MUST_BE_FREE	2

 /*
+ * dnode_next_offset() flags.
+ */
+#define	DNODE_FIND_HOLE		1
+#define	DNODE_FIND_BACKWARDS	2
+#define	DNODE_FIND_HAVELOCK	4
+
+/*
  * Fixed constants.
  */
 #define	DNODE_SHIFT		9	/* 512 bytes */
@@ -227,8 +234,8 @@
 uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
 void dnode_init(void);
 void dnode_fini(void);
-int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
-    uint64_t blkfill, uint64_t txg);
+int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
+    int minlvl, uint64_t blkfill, uint64_t txg);
 void dnode_evict_dbufs(dnode_t *dn);

 #ifdef ZFS_DEBUG
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Tue Jul 01 12:01:12 2008 -0700
@@ -191,7 +191,7 @@
 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);

 void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
     dmu_tx_t *tx);
 int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c	Tue Jul 01 12:01:12 2008 -0700
@@ -451,6 +451,21 @@
 	ASSERT3U(error, ==, 0);
 }

+static void
+zfs_unlinked_remove(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	char obj_name[17];
+	int error;
+
+	ASSERT(zp->z_unlinked);
+	ASSERT3U(zp->z_phys->zp_links, ==, 0);
+
+	error = zap_remove(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+	    zfs_unlinked_hexname(obj_name, zp->z_id), tx);
+	ASSERT3U(error, ==, 0);
+}
+
 /*
  * Clean up any znodes that had no links when we either crashed or
  * (force) umounted the file system.
@@ -574,7 +589,6 @@
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os = zfsvfs->z_os;
 	znode_t		*xzp = NULL;
-	char		obj_name[17];
 	dmu_tx_t	*tx;
 	uint64_t	acl_obj;
 	int		error;
@@ -589,7 +603,7 @@
 		if (zfs_purgedir(zp) != 0) {
 			/*
 			 * Not enough space to delete some xattrs.
-			 * Leave it on the unlinked set.
+			 * Leave it in the unlinked set.
 			 */
 			zfs_znode_dmu_fini(zp);
 			zfs_znode_free(zp);
@@ -598,6 +612,19 @@
 	}

 	/*
+	 * Free up all the data in the file.
+	 */
+	error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+	if (error) {
+		/*
+		 * Not enough space.  Leave the file in the unlinked set.
+		 */
+		zfs_znode_dmu_fini(zp);
+		zfs_znode_free(zp);
+		return;
+	}
+
+	/*
 	 * If the file has extended attributes, we're going to unlink
 	 * the xattr dir.
 	 */
@@ -609,7 +636,7 @@
 	acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;

 	/*
-	 * Set up the transaction.
+	 * Set up the final transaction.
 	 */
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
@@ -643,9 +670,7 @@
 	}

 	/* Remove this znode from the unlinked set */
-	error = zap_remove(os, zfsvfs->z_unlinkedobj,
-	    zfs_unlinked_hexname(obj_name, zp->z_id), tx);
-	ASSERT3U(error, ==, 0);
+	zfs_unlinked_remove(zp, tx);

 	zfs_znode_delete(zp, tx);
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c	Tue Jul 01 12:01:12 2008 -0700
@@ -1304,15 +1304,10 @@
 		 */
 		if ((ZTOV(zp)->v_type == VREG) &&
 		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
+			/* we can't hold any locks when calling zfs_freesp() */
+			zfs_dirent_unlock(dl);
+			dl = NULL;
 			error = zfs_freesp(zp, 0, 0, mode, TRUE);
-			if (error == ERESTART &&
-			    zfsvfs->z_assign == TXG_NOWAIT) {
-				/* NB: we already did dmu_tx_wait() */
-				zfs_dirent_unlock(dl);
-				VN_RELE(ZTOV(zp));
-				goto top;
-			}
-
 			if (error == 0) {
 				vnevent_create(ZTOV(zp), ct);
 			}
@@ -1379,7 +1374,7 @@
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	boolean_t	may_delete_now, delete_now = FALSE;
-	boolean_t	unlinked;
+	boolean_t	unlinked, toobig = FALSE;
 	uint64_t	txtype;
 	pathname_t	*realnmp = NULL;
 	pathname_t	realnm;
@@ -1442,8 +1437,13 @@
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_bonus(tx, zp->z_id);
-	if (may_delete_now)
-		dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+	if (may_delete_now) {
+		toobig =
+		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
+		/* if the file is too big, only hold_free a token amount */
+		dmu_tx_hold_free(tx, zp->z_id, 0,
+		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
+	}

 	/* are there any extended attributes? */
 	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
@@ -1487,7 +1487,7 @@

 	if (unlinked) {
 		mutex_enter(&vp->v_lock);
-		delete_now = may_delete_now &&
+		delete_now = may_delete_now && !toobig &&
 		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
 		    zp->z_phys->zp_xattr == xattr_obj &&
 		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
@@ -1533,7 +1533,7 @@
 	if (!delete_now) {
 		VN_RELE(vp);
 	} else if (xzp) {
-		/* this rele delayed to prevent nesting transactions */
+		/* this rele is delayed to prevent nesting transactions */
 		VN_RELE(ZTOV(xzp));
 	}

@@ -2451,10 +2451,8 @@
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
-		do {
-			err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
-			/* NB: we already did dmu_tx_wait() if necessary */
-		} while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
+		/* XXX - would it be OK to generate a log record here? */
+		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
@@ -2725,6 +2723,7 @@
 	if (mask & AT_MTIME)
 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);

+	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
 	if (mask & AT_SIZE)
 		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
 	else if (mask != 0)
@@ -4236,7 +4235,6 @@
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);

-top:
 	if (cmd != F_FREESP) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
@@ -4255,10 +4253,7 @@
 	off = bfp->l_start;
 	len = bfp->l_len; /* 0 means from off to end of file */

-	do {
-		error = zfs_freesp(zp, off, len, flag, TRUE);
-		/* NB: we already did dmu_tx_wait() if necessary */
-	} while (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
+	error = zfs_freesp(zp, off, len, flag, TRUE);

 	ZFS_EXIT(zfsvfs);
 	return (error);
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c	Tue Jul 01 12:01:12 2008 -0700
@@ -1046,14 +1046,14 @@
 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	objset_t *os = zfsvfs->z_os;
 	uint64_t obj = zp->z_id;
+	uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;

 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
-	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
-		VERIFY(0 == dmu_object_free(zfsvfs->z_os,
-		    zp->z_phys->zp_acl.z_acl_extern_obj, tx));
-	}
-	VERIFY(0 == dmu_object_free(zfsvfs->z_os, obj, tx));
+	if (acl_obj)
+		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+	VERIFY(0 == dmu_object_free(os, obj, tx));
 	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
 	zfs_znode_free(zp);
@@ -1233,137 +1233,177 @@
 }

 /*
- * Free space in a file.
+ * Increase the file length
  *
  *	IN:	zp	- znode of file to free data in.
- *		off	- start of section to free.
- *		len	- length of section to free (0 => to EOF).
- *		flag	- current file open mode flags.
+ *		end	- new end-of-file
  *
  * 	RETURN:	0 if success
  *		error code if failure
  */
-int
-zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+static int
+zfs_extend(znode_t *zp, uint64_t end)
 {
-	vnode_t *vp = ZTOV(zp);
-	dmu_tx_t *tx;
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	zilog_t *zilog = zfsvfs->z_log;
+	dmu_tx_t *tx;
 	rl_t *rl;
-	uint64_t end = off + len;
-	uint64_t size, new_blksz;
-	uint64_t pflags = zp->z_phys->zp_flags;
+	uint64_t newblksz;
 	int error;

-	if ((pflags & (ZFS_IMMUTABLE|ZFS_READONLY)) ||
-	    off < zp->z_phys->zp_size && (pflags & ZFS_APPENDONLY))
-		return (EPERM);
-
-	if (ZTOV(zp)->v_type == VFIFO)
-		return (0);
-
 	/*
-	 * If we will change zp_size then lock the whole file,
-	 * otherwise just lock the range being freed.
+	 * We will change zp_size, lock the whole file.
 	 */
-	if (len == 0 || off + len > zp->z_phys->zp_size) {
-		rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
-	} else {
-		rl = zfs_range_lock(zp, off, len, RL_WRITER);
-		/* recheck, in case zp_size changed */
-		if (off + len > zp->z_phys->zp_size) {
-			/* lost race: file size changed, lock whole file */
-			zfs_range_unlock(rl);
-			rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
-		}
-	}
+	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);

 	/*
 	 * Nothing to do if file already at desired length.
 	 */
-	size = zp->z_phys->zp_size;
-	if (len == 0 && size == off && off != 0) {
+	if (end <= zp->z_phys->zp_size) {
 		zfs_range_unlock(rl);
 		return (0);
 	}
-
-	/*
-	 * Check for any locks in the region to be freed.
-	 */
-	if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
-		uint64_t start = off;
-		uint64_t extent = len;
-
-		if (off > size) {
-			start = size;
-			extent += off - size;
-		} else if (len == 0) {
-			extent = size - off;
-		}
-		if (error = chklock(vp, FWRITE, start, extent, flag, NULL)) {
-			zfs_range_unlock(rl);
-			return (error);
-		}
-	}
-
+top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
-	new_blksz = 0;
-	if (end > size &&
+	if (end > zp->z_blksz &&
 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
 		/*
 		 * We are growing the file past the current block size.
 		 */
 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
 			ASSERT(!ISP2(zp->z_blksz));
-			new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
+			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
 		} else {
-			new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
 		}
-		dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
-	} else if (off < size) {
-		/*
-		 * If len == 0, we are truncating the file.
-		 */
-		dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
+		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+	} else {
+		newblksz = 0;
 	}

 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
 		dmu_tx_abort(tx);
 		zfs_range_unlock(rl);
 		return (error);
 	}
-
-	if (new_blksz)
-		zfs_grow_blocksize(zp, new_blksz, tx);
-
-	if (end > size || len == 0)
-		zp->z_phys->zp_size = end;
-
-	if (off < size) {
-		objset_t *os = zfsvfs->z_os;
-		uint64_t rlen = len;
+	dmu_buf_will_dirty(zp->z_dbuf, tx);

-		if (len == 0)
-			rlen = -1;
-		else if (end > size)
-			rlen = size - off;
-		VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
-	}
+	if (newblksz)
+		zfs_grow_blocksize(zp, newblksz, tx);

-	if (log) {
-		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
-		zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
-	}
+	zp->z_phys->zp_size = end;

 	zfs_range_unlock(rl);

 	dmu_tx_commit(tx);

+	return (0);
+}
+
+/*
+ * Free space in a file.
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		off	- start of section to free.
+ *		len	- length of section to free.
+ *
+ * 	RETURN:	0 if success
+ *		error code if failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	rl_t *rl;
+	int error;
+
+	/*
+	 * Lock the range being freed.
+	 */
+	rl = zfs_range_lock(zp, off, len, RL_WRITER);
+
+	/*
+	 * Nothing to do if the range starts at or beyond end-of-file.
+	 */
+	if (off >= zp->z_phys->zp_size) {
+		zfs_range_unlock(rl);
+		return (0);
+	}
+
+	if (off + len > zp->z_phys->zp_size)
+		len = zp->z_phys->zp_size - off;
+
+	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+	zfs_range_unlock(rl);
+
+	return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		end	- new end-of-file.
+ *
+ * 	RETURN:	0 if success
+ *		error code if failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	vnode_t *vp = ZTOV(zp);
+	dmu_tx_t *tx;
+	rl_t *rl;
+	int error;
+
+	/*
+	 * We will change zp_size, lock the whole file.
+	 */
+	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+
+	/*
+	 * Nothing to do if file already at desired length.
+	 */
+	if (end >= zp->z_phys->zp_size) {
+		zfs_range_unlock(rl);
+		return (0);
+	}
+
+	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,  -1);
+	if (error) {
+		zfs_range_unlock(rl);
+		return (error);
+	}
+top:
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		zfs_range_unlock(rl);
+		return (error);
+	}
+	dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+	zp->z_phys->zp_size = end;
+
+	dmu_tx_commit(tx);
+
+	zfs_range_unlock(rl);
+
 	/*
 	 * Clear any mapped pages in the truncated region.  This has to
 	 * happen outside of the transaction to avoid the possibility of
@@ -1371,10 +1411,10 @@
 	 * about to invalidate.
 	 */
 	rw_enter(&zp->z_map_lock, RW_WRITER);
-	if (off < size && vn_has_cached_data(vp)) {
+	if (vn_has_cached_data(vp)) {
 		page_t *pp;
-		uint64_t start = off & PAGEMASK;
-		int poff = off & PAGEOFFSET;
+		uint64_t start = end & PAGEMASK;
+		int poff = end & PAGEOFFSET;

 		if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
 			/*
@@ -1393,6 +1433,74 @@
 	return (0);
 }

+/*
+ * Free space in a file
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		off	- start of range
+ *		len	- length of range (0 => to EOF)
+ *		flag	- current file open mode flags.
+ *		log	- TRUE if this action should be logged
+ *
+ * 	RETURN:	0 if success
+ *		error code if failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+	vnode_t *vp = ZTOV(zp);
+	dmu_tx_t *tx;
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zilog_t *zilog = zfsvfs->z_log;
+	int error;
+
+	if (off > zp->z_phys->zp_size) {
+		error =  zfs_extend(zp, off+len);
+		if (error == 0 && log)
+			goto log;
+		else
+			return (error);
+	}
+
+	/*
+	 * Check for any locks in the region to be freed.
+	 */
+	if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
+		uint64_t length = (len ? len : zp->z_phys->zp_size - off);
+		if (error = chklock(vp, FWRITE, off, length, flag, NULL))
+			return (error);
+	}
+
+	if (len == 0) {
+		error = zfs_trunc(zp, off);
+	} else {
+		if ((error = zfs_free_range(zp, off, len)) == 0 &&
+		    off + len > zp->z_phys->zp_size)
+			error = zfs_extend(zp, off+len);
+	}
+	if (error || !log)
+		return (error);
+log:
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto log;
+		}
+		dmu_tx_abort(tx);
+		return (error);
+	}
+
+	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+	dmu_tx_commit(tx);
+	return (0);
+}
+
 void
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
--- a/usr/src/uts/common/fs/zfs/zvol.c	Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Tue Jul 01 12:01:12 2008 -0700
@@ -774,24 +774,6 @@
 	return (0);
 }

-static int
-zvol_truncate(zvol_state_t *zv, uint64_t offset, uint64_t size)
-{
-	dmu_tx_t *tx;
-	int error;
-
-	tx = dmu_tx_create(zv->zv_objset);
-	dmu_tx_hold_free(tx, ZVOL_OBJ, offset, size);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		dmu_tx_abort(tx);
-		return (error);
-	}
-	error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, offset, size, tx);
-	dmu_tx_commit(tx);
-	return (0);
-}
-
 int
 zvol_prealloc(zvol_state_t *zv)
 {
@@ -823,7 +805,7 @@
 		if (error) {
 			dmu_tx_abort(tx);
 			kmem_free(data, SPA_MAXBLOCKSIZE);
-			(void) zvol_truncate(zv, 0, off);
+			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
 			return (error);
 		}
 		dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
@@ -847,7 +829,6 @@

 	tx = dmu_tx_create(zv->zv_objset);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
-	dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
@@ -859,7 +840,8 @@
 	dmu_tx_commit(tx);

 	if (error == 0)
-		error = zvol_truncate(zv, volsize, DMU_OBJECT_END);
+		error = dmu_free_long_range(zv->zv_objset,
+		    ZVOL_OBJ, volsize, DMU_OBJECT_END);

 	if (error == 0) {
 		zv->zv_volsize = volsize;
@@ -1651,7 +1633,6 @@
 	ASSERT(MUTEX_HELD(&zvol_state_lock));

 	tx = dmu_tx_create(os);
-	dmu_tx_hold_free(tx, ZVOL_OBJ, 0, DMU_OBJECT_END);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
@@ -1690,7 +1671,8 @@

 	/* Truncate the file */
 	if (!error)
-		error = zvol_truncate(zv, 0, DMU_OBJECT_END);
+		error = dmu_free_long_range(zv->zv_objset,
+		    ZVOL_OBJ, 0, DMU_OBJECT_END);

 	if (error)
 		return (error);
@@ -1813,7 +1795,7 @@

 	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
 	zvol_free_extents(zv);
-	(void) zvol_truncate(zv, 0, DMU_OBJECT_END);
+	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
 	zv->zv_flags &= ~ZVOL_DUMPIFIED;
 	dmu_tx_commit(tx);