changeset 13782:8f78aae28a63

3104 eliminate empty bpobjs Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Christopher Siden <chris.siden@delphix.com> Reviewed by: Garrett D'Amore <garrett@damore.org> Approved by: Eric Schrock <eric.schrock@delphix.com>
author Matthew Ahrens <mahrens@delphix.com>
date Mon, 27 Aug 2012 03:58:14 -0700
parents 64a1ab954737
children b3333788688b
files usr/src/common/zfs/zfeature_common.c usr/src/common/zfs/zfeature_common.h usr/src/man/man5/zpool-features.5 usr/src/uts/common/fs/zfs/bpobj.c usr/src/uts/common/fs/zfs/dsl_deadlist.c usr/src/uts/common/fs/zfs/dsl_pool.c usr/src/uts/common/fs/zfs/sys/bpobj.h usr/src/uts/common/fs/zfs/sys/dmu.h usr/src/uts/common/fs/zfs/sys/dsl_pool.h usr/src/uts/common/fs/zfs/sys/zap.h usr/src/uts/common/fs/zfs/zap.c usr/src/uts/common/fs/zfs/zfeature.c
diffstat 12 files changed, 166 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/common/zfs/zfeature_common.c	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/common/zfs/zfeature_common.c	Mon Aug 27 03:58:14 2012 -0700
@@ -153,4 +153,7 @@
 	zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
 	    "com.delphix:async_destroy", "async_destroy",
 	    "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL);
+	zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
+	    "com.delphix:empty_bpobj", "empty_bpobj",
+	    "Snapshots use less space.", B_TRUE, B_FALSE, NULL);
 }
--- a/usr/src/common/zfs/zfeature_common.h	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/common/zfs/zfeature_common.h	Mon Aug 27 03:58:14 2012 -0700
@@ -51,6 +51,7 @@
 
 enum spa_feature {
 	SPA_FEATURE_ASYNC_DESTROY,
+	SPA_FEATURE_EMPTY_BPOBJ,	/* "com.delphix:empty_bpobj" */
 	SPA_FEATURES
 } spa_feature_t;
 
--- a/usr/src/man/man5/zpool-features.5	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/man/man5/zpool-features.5	Mon Aug 27 03:58:14 2012 -0700
@@ -169,5 +169,33 @@
 
 This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero.
 .RE
+
+.sp
+.ne 2
+.na
+\fB\fBempty_bpobj\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:empty_bpobj
+READ\-ONLY COMPATIBLE	yes
+DEPENDENCIES	none
+.TE
+
+This feature increases the performance of creating and using a large
+number of snapshots of a single filesystem or volume, and also reduces
+the disk space required.
+
+When there are many snapshots, each snapshot uses many Block Pointer
+Objects (bpobjs) to track blocks associated with that snapshot.
+However, in common use cases, most of these bpobjs are empty.  This
+feature allows us to create each bpobj on demand, thus eliminating the
+empty bpobjs.
+
+This feature is \fBactive\fR while there are any filesystems, volumes,
+or snapshots which were created after enabling this feature.
+.RE
+
 .SH "SEE ALSO"
 \fBzpool\fR(1M)
--- a/usr/src/uts/common/fs/zfs/bpobj.c	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/uts/common/fs/zfs/bpobj.c	Mon Aug 27 03:58:14 2012 -0700
@@ -20,13 +20,61 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/bpobj.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 #include <sys/dsl_pool.h>
+#include <sys/zfeature.h>
+#include <sys/zap.h>
+
+/*
+ * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
+ *
+ * If the empty_bpobj feature is enabled on the pool, return the object
+ * number of the pool's shared dummy bpobj and bump the feature's count by
+ * one (spa_feature_incr).  The dummy is created, and its object number is
+ * recorded in the pool directory under DMU_POOL_EMPTY_BPOBJ, the first time
+ * it is needed, i.e. while the feature is enabled but not yet active.  Note
+ * that "blocksize" is ignored in this path; the dummy is always allocated
+ * with SPA_MAXBLOCKSIZE.
+ *
+ * If the feature is not enabled, fall back to allocating a private bpobj
+ * with bpobj_alloc(os, blocksize, tx).
+ *
+ * A reference handed out for the dummy is dropped with bpobj_decr_empty().
+ */
+uint64_t
+bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+	zfeature_info_t *empty_bpobj_feat =
+	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
+	spa_t *spa = dmu_objset_spa(os);
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	if (spa_feature_is_enabled(spa, empty_bpobj_feat)) {
+		if (!spa_feature_is_active(spa, empty_bpobj_feat)) {
+			/*
+			 * Feature not active yet, so no dummy may exist.
+			 * Create it and record its object number in the
+			 * pool directory.
+			 */
+			ASSERT3U(dp->dp_empty_bpobj, ==, 0);
+			dp->dp_empty_bpobj =
+			    bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+			VERIFY(zap_add(os,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+			    &dp->dp_empty_bpobj, tx) == 0);
+		}
+		/* One feature count per reference handed out to a caller. */
+		spa_feature_incr(spa, empty_bpobj_feat, tx);
+		ASSERT(dp->dp_empty_bpobj != 0);
+		return (dp->dp_empty_bpobj);
+	} else {
+		return (bpobj_alloc(os, blocksize, tx));
+	}
+}
+
+/*
+ * Drop one reference on the shared dummy bpobj (dp_empty_bpobj) by
+ * decrementing the empty_bpobj feature count.  If the feature is no longer
+ * active afterwards, remove the DMU_POOL_EMPTY_BPOBJ entry from the pool
+ * directory, free the dummy object, and clear dp_empty_bpobj.
+ *
+ * NOTE(review): the directory entry is removed via dp->dp_meta_objset while
+ * the object is freed via "os"; presumably callers always pass the MOS as
+ * "os" -- confirm.
+ */
+void
+bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
+{
+	zfeature_info_t *empty_bpobj_feat =
+	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	spa_feature_decr(dmu_objset_spa(os), empty_bpobj_feat, tx);
+	if (!spa_feature_is_active(dmu_objset_spa(os), empty_bpobj_feat)) {
+		/* That was the last reference; tear the dummy down. */
+		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_EMPTY_BPOBJ, tx));
+		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
+		dp->dp_empty_bpobj = 0;
+	}
+}
 
 uint64_t
 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
@@ -53,6 +101,7 @@
 	int epb;
 	dmu_buf_t *dbuf = NULL;
 
+	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
 	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
 
 	mutex_enter(&bpo.bpo_lock);
@@ -320,6 +369,12 @@
 
 	ASSERT(bpo->bpo_havesubobj);
 	ASSERT(bpo->bpo_havecomp);
+	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
+		bpobj_decr_empty(bpo->bpo_os, tx);
+		return;
+	}
 
 	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
 	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
@@ -388,6 +443,7 @@
 	blkptr_t *bparray;
 
 	ASSERT(!BP_IS_HOLE(bp));
+	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
 	/* We never need the fill count. */
 	stored_bp.blk_fill = 0;
--- a/usr/src/uts/common/fs/zfs/dsl_deadlist.c	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/uts/common/fs/zfs/dsl_deadlist.c	Mon Aug 27 03:58:14 2012 -0700
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_dataset.h>
@@ -163,12 +163,49 @@
 
 	for (zap_cursor_init(&zc, os, dlobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
-	    zap_cursor_advance(&zc))
-		bpobj_free(os, za.za_first_integer, tx);
+	    zap_cursor_advance(&zc)) {
+		uint64_t obj = za.za_first_integer;
+		if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
+			bpobj_decr_empty(os, tx);
+		else
+			bpobj_free(os, obj, tx);
+	}
 	zap_cursor_fini(&zc);
 	VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
 }
 
+/*
+ * Enqueue "bp" on the bpobj of deadlist entry "dle".
+ *
+ * If the entry currently refers to the pool's shared dummy bpobj
+ * (dp_empty_bpobj), first give the entry a bpobj of its own: allocate a new
+ * bpobj, close the entry's handle, drop the entry's reference on the dummy,
+ * reopen the handle on the new object, and update the deadlist ZAP so that
+ * key dle_mintxg maps to the new object.
+ */
+static void
+dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    const blkptr_t *bp, dmu_tx_t *tx)
+{
+	if (dle->dle_bpobj.bpo_object ==
+	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+		/* Allocate the replacement before releasing the dummy. */
+		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+		bpobj_close(&dle->dle_bpobj);
+		bpobj_decr_empty(dl->dl_os, tx);
+		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+		    dle->dle_mintxg, obj, tx));
+	}
+	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+}
+
+/*
+ * Add bpobj "obj" to deadlist entry "dle".
+ *
+ * If the entry already has a bpobj of its own, "obj" is enqueued on it as a
+ * sub-object.  If the entry still refers to the pool's shared dummy bpobj
+ * (dp_empty_bpobj), there is nothing to merge into, so the entry adopts
+ * "obj" directly: the handle is closed, the entry's reference on the dummy
+ * is dropped, the handle is reopened on "obj", and the deadlist ZAP is
+ * updated so that key dle_mintxg maps to "obj".
+ */
+static void
+dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    uint64_t obj, dmu_tx_t *tx)
+{
+	if (dle->dle_bpobj.bpo_object !=
+	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+		bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+	} else {
+		/* Replace the dummy with "obj" instead of nesting it. */
+		bpobj_close(&dle->dle_bpobj);
+		bpobj_decr_empty(dl->dl_os, tx);
+		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+		    dle->dle_mintxg, obj, tx));
+	}
+}
+
 void
 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
 {
@@ -197,7 +234,7 @@
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 	else
 		dle = AVL_PREV(&dl->dl_tree, dle);
-	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+	dle_enqueue(dl, dle, bp, tx);
 }
 
 /*
@@ -217,7 +254,7 @@
 
 	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
 	dle->dle_mintxg = mintxg;
-	obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+	obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
 	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 	avl_add(&dl->dl_tree, dle);
 
@@ -243,8 +280,7 @@
 	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
 	dle_prev = AVL_PREV(&dl->dl_tree, dle);
 
-	bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
-	    dle->dle_bpobj.bpo_object, tx);
+	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
 
 	avl_remove(&dl->dl_tree, dle);
 	bpobj_close(&dle->dle_bpobj);
@@ -302,7 +338,7 @@
 		if (dle->dle_mintxg >= maxtxg)
 			break;
 
-		obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+		obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
 		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
 		    dle->dle_mintxg, obj, tx));
 	}
@@ -400,7 +436,7 @@
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
-	bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+	dle_enqueue_subobj(dl, dle, obj, tx);
 }
 
 static int
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Mon Aug 27 03:58:14 2012 -0700
@@ -182,6 +182,15 @@
 			goto out;
 	}
 
+	if (spa_feature_is_active(dp->dp_spa,
+	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+		    &dp->dp_empty_bpobj);
+		if (err != 0)
+			goto out;
+	}
+
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 	    &dp->dp_tmp_userrefs_obj);
--- a/usr/src/uts/common/fs/zfs/sys/bpobj.h	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/uts/common/fs/zfs/sys/bpobj.h	Mon Aug 27 03:58:14 2012 -0700
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_BPOBJ_H
@@ -67,7 +68,9 @@
 typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
 uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
 void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);
 
 int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
 void bpobj_close(bpobj_t *bpo);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Aug 27 03:58:14 2012 -0700
@@ -305,6 +305,7 @@
 #define	DMU_POOL_SCAN			"scan"
 #define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
 #define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
+#define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Mon Aug 27 03:58:14 2012 -0700
@@ -88,6 +88,7 @@
 	uint64_t dp_tmp_userrefs_obj;
 	bpobj_t dp_free_bpobj;
 	uint64_t dp_bptree_obj;
+	uint64_t dp_empty_bpobj;
 
 	struct dsl_scan *dp_scan;
 
--- a/usr/src/uts/common/fs/zfs/sys/zap.h	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h	Mon Aug 27 03:58:14 2012 -0700
@@ -300,6 +300,8 @@
 /* Here the key is an int and the value is a different int. */
 int zap_add_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_update_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
 int zap_lookup_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t *valuep);
 
--- a/usr/src/uts/common/fs/zfs/zap.c	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/uts/common/fs/zfs/zap.c	Mon Aug 27 03:58:14 2012 -0700
@@ -1094,6 +1094,16 @@
 }
 
+/*
+ * Update the entry in ZAP object "obj" whose name is "key" formatted as a
+ * lowercase hexadecimal string, storing "value" as a single 8-byte integer.
+ * Returns whatever zap_update() returns.
+ */
 int
+zap_update_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+	/* A 64-bit key is at most 16 hex digits plus the terminating NUL. */
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	return (zap_update(os, obj, name, 8, 1, &value, tx));
+}
+
+int
 zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 {
 	char name[20];
--- a/usr/src/uts/common/fs/zfs/zfeature.c	Sat Aug 25 02:44:53 2012 -0500
+++ b/usr/src/uts/common/fs/zfs/zfeature.c	Mon Aug 27 03:58:14 2012 -0700
@@ -221,7 +221,12 @@
 	uint64_t refcount;
 	uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
 
-	ASSERT(0 != zapobj);
+	/*
+	 * If the pool is currently being created, the feature objects may not
+	 * have been allocated yet.  Act as though all features are disabled.
+	 */
+	if (zapobj == 0)
+		return (ENOTSUP);
 
 	err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
 	    &refcount);