changeset 10974:32d689ba6466

6897958 ASSERT in metaslab_class_space_update() with 8+ exabyte pool
6898598 dsl needs to be more aware of dedup space
author Jeff Bonwick <Jeff.Bonwick@Sun.COM>
date Thu, 05 Nov 2009 18:44:56 -0800
parents 6969e719525a
children 9dd13a7cd2e3
files usr/src/uts/common/fs/zfs/dsl_dir.c usr/src/uts/common/fs/zfs/metaslab.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/metaslab.h usr/src/uts/common/fs/zfs/sys/metaslab_impl.h usr/src/uts/common/fs/zfs/sys/spa_impl.h usr/src/uts/common/fs/zfs/vdev.c
diffstat 8 files changed, 163 insertions(+), 133 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Thu Nov 05 18:44:56 2009 -0800
@@ -651,8 +651,7 @@
 		 * dsl_pool_adjustedsize()), something is very
 		 * wrong.
 		 */
-		ASSERT3U(used, <=, metaslab_class_get_space(
-		    spa_normal_class(dd->dd_pool->dp_spa)));
+		ASSERT3U(used, <=, spa_get_dspace(dd->dd_pool->dp_spa));
 	} else {
 		/*
 		 * the lesser of the space provided by our parent and
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Thu Nov 05 18:44:56 2009 -0800
@@ -77,60 +77,15 @@
 void
 metaslab_class_destroy(metaslab_class_t *mc)
 {
-	metaslab_group_t *mg;
-
-	while ((mg = mc->mc_rotor) != NULL) {
-		metaslab_class_remove(mc, mg);
-		metaslab_group_destroy(mg);
-	}
+	ASSERT(mc->mc_rotor == NULL);
+	ASSERT(mc->mc_alloc == 0);
+	ASSERT(mc->mc_deferred == 0);
+	ASSERT(mc->mc_space == 0);
+	ASSERT(mc->mc_dspace == 0);
 
 	kmem_free(mc, sizeof (metaslab_class_t));
 }
 
-void
-metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
-{
-	metaslab_group_t *mgprev, *mgnext;
-
-	ASSERT(mg->mg_class == NULL);
-
-	if ((mgprev = mc->mc_rotor) == NULL) {
-		mg->mg_prev = mg;
-		mg->mg_next = mg;
-	} else {
-		mgnext = mgprev->mg_next;
-		mg->mg_prev = mgprev;
-		mg->mg_next = mgnext;
-		mgprev->mg_next = mg;
-		mgnext->mg_prev = mg;
-	}
-	mc->mc_rotor = mg;
-	mg->mg_class = mc;
-}
-
-void
-metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
-{
-	metaslab_group_t *mgprev, *mgnext;
-
-	ASSERT(mg->mg_class == mc);
-
-	mgprev = mg->mg_prev;
-	mgnext = mg->mg_next;
-
-	if (mg == mgnext) {
-		mc->mc_rotor = NULL;
-	} else {
-		mc->mc_rotor = mgnext;
-		mgprev->mg_next = mgnext;
-		mgnext->mg_prev = mgprev;
-	}
-
-	mg->mg_prev = NULL;
-	mg->mg_next = NULL;
-	mg->mg_class = NULL;
-}
-
 int
 metaslab_class_validate(metaslab_class_t *mc)
 {
@@ -165,11 +120,6 @@
 	atomic_add_64(&mc->mc_deferred, defer_delta);
 	atomic_add_64(&mc->mc_space, space_delta);
 	atomic_add_64(&mc->mc_dspace, dspace_delta);
-
-	ASSERT((int64_t)mc->mc_alloc >= 0 &&
-	    (int64_t)mc->mc_deferred >= 0 &&
-	    (int64_t)mc->mc_space >= 0 &&
-	    (int64_t)mc->mc_dspace >= 0);
 }
 
 uint64_t
@@ -234,9 +184,9 @@
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
-	mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
 	mg->mg_vd = vd;
-	metaslab_class_add(mc, mg);
+	mg->mg_class = mc;
+	mg->mg_activation_count = 0;
 
 	return (mg);
 }
@@ -244,11 +194,77 @@
 void
 metaslab_group_destroy(metaslab_group_t *mg)
 {
+	ASSERT(mg->mg_prev == NULL);
+	ASSERT(mg->mg_next == NULL);
+	ASSERT(mg->mg_activation_count + mg->mg_vd->vdev_removing == 0);
+
 	avl_destroy(&mg->mg_metaslab_tree);
 	mutex_destroy(&mg->mg_lock);
 	kmem_free(mg, sizeof (metaslab_group_t));
 }
 
+void
+metaslab_group_activate(metaslab_group_t *mg)
+{
+	metaslab_class_t *mc = mg->mg_class;
+	metaslab_group_t *mgprev, *mgnext;
+
+	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+
+	ASSERT(mc->mc_rotor != mg);
+	ASSERT(mg->mg_prev == NULL);
+	ASSERT(mg->mg_next == NULL);
+	ASSERT(mg->mg_activation_count <= 0);
+
+	if (++mg->mg_activation_count <= 0)
+		return;
+
+	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+
+	if ((mgprev = mc->mc_rotor) == NULL) {
+		mg->mg_prev = mg;
+		mg->mg_next = mg;
+	} else {
+		mgnext = mgprev->mg_next;
+		mg->mg_prev = mgprev;
+		mg->mg_next = mgnext;
+		mgprev->mg_next = mg;
+		mgnext->mg_prev = mg;
+	}
+	mc->mc_rotor = mg;
+}
+
+void
+metaslab_group_passivate(metaslab_group_t *mg)
+{
+	metaslab_class_t *mc = mg->mg_class;
+	metaslab_group_t *mgprev, *mgnext;
+
+	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+
+	if (--mg->mg_activation_count != 0) {
+		ASSERT(mc->mc_rotor != mg);
+		ASSERT(mg->mg_prev == NULL);
+		ASSERT(mg->mg_next == NULL);
+		ASSERT(mg->mg_activation_count < 0);
+		return;
+	}
+
+	mgprev = mg->mg_prev;
+	mgnext = mg->mg_next;
+
+	if (mg == mgnext) {
+		mc->mc_rotor = NULL;
+	} else {
+		mc->mc_rotor = mgnext;
+		mgprev->mg_next = mgnext;
+		mgnext->mg_prev = mgprev;
+	}
+
+	mg->mg_prev = NULL;
+	mg->mg_next = NULL;
+}
+
 static void
 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
@@ -1052,7 +1068,7 @@
 		 * longer exists (i.e. removed). Consult the rotor when
 		 * all else fails.
 		 */
-		if (vd != NULL && vd->vdev_mg != NULL) {
+		if (vd != NULL) {
 			mg = vd->vdev_mg;
 
 			if (flags & METASLAB_HINTBP_AVOID &&
@@ -1069,15 +1085,18 @@
 	}
 
 	/*
-	 * If the hint put us into the wrong class, just follow the rotor.
+	 * If the hint put us into the wrong metaslab class, or into a
+	 * metaslab group that has been passivated, just follow the rotor.
 	 */
-	if (mg->mg_class != mc)
+	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
 		mg = mc->mc_rotor;
 
 	rotor = mg;
 top:
 	all_zero = B_TRUE;
 	do {
+		ASSERT(mg->mg_activation_count == 1);
+
 		vd = mg->mg_vd;
 
 		/*
--- a/usr/src/uts/common/fs/zfs/spa.c	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Thu Nov 05 18:44:56 2009 -0800
@@ -3789,32 +3789,13 @@
  */
 
 /*
- * Initial phase of device removal - stop future allocations from this device.
- */
-void
-spa_vdev_remove_start(spa_t *spa, vdev_t *vd)
-{
-	metaslab_group_t *mg = vd->vdev_mg;
-
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
-	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-	ASSERT(vd == vd->vdev_top);
-
-	/*
-	 * Remove our vdev from the allocatable vdevs
-	 */
-	if (mg)
-		metaslab_class_remove(mg->mg_class, mg);
-}
-
-/*
  * Evacuate the device.
  */
 int
 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 {
+	int error = 0;
 	uint64_t txg;
-	int error;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
@@ -3827,23 +3808,20 @@
 	 * should no longer have any blocks allocated on it.
 	 */
 	if (vd->vdev_islog) {
-		/*
-		 * Evacuate the device.
-		 */
-		if (error = dmu_objset_find(spa_name(spa),
-		    zil_vdev_offline, NULL, DS_FIND_CHILDREN)) {
-			uint64_t txg;
-
-			txg = spa_vdev_config_enter(spa);
-			metaslab_class_add(spa->spa_log_class,
-			    vd->vdev_mg);
-			return (spa_vdev_exit(spa, NULL, txg, error));
-		}
-		txg_wait_synced(spa_get_dsl(spa), 0);
+		error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+		    NULL, DS_FIND_CHILDREN);
+	} else {
+		error = ENOTSUP;	/* until we have bp rewrite */
 	}
 
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	if (error)
+		return (error);
+
 	/*
-	 * Remove any remaining MOS metadata associated with the device.
+	 * The evacuation succeeded.  Remove any remaining MOS metadata
+	 * associated with this vdev, and wait for these changes to sync.
 	 */
 	txg = spa_vdev_config_enter(spa);
 	vd->vdev_removing = B_TRUE;
@@ -3858,10 +3836,9 @@
  * Complete the removal by cleaning up the namespace.
  */
 void
-spa_vdev_remove_done(spa_t *spa, vdev_t *vd)
+spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
-	metaslab_group_t *mg = vd->vdev_mg;
 	uint64_t id = vd->vdev_id;
 	boolean_t last_vdev = (id == (rvd->vdev_children - 1));
 
@@ -3878,14 +3855,6 @@
 
 	vdev_free(vd);
 
-	/*
-	 * It's possible that another thread is trying todo a spa_vdev_add()
-	 * at the same time we're trying remove it. As a result the
-	 * added vdev may not have initialized its metaslabs yet.
-	 */
-	if (mg != NULL)
-		metaslab_group_destroy(mg);
-
 	if (last_vdev) {
 		vdev_compact_children(rvd);
 	} else {
@@ -3908,6 +3877,7 @@
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
 	vdev_t *vd;
+	metaslab_group_t *mg;
 	nvlist_t **spares, **l2cache, *nv;
 	uint64_t txg = 0;
 	uint_t nspares, nl2cache;
@@ -3955,13 +3925,12 @@
 		 * become the common case.
 		 */
 
+		mg = vd->vdev_mg;
+
 		/*
-		 * 1. Stop allocations
-		 * 2. Evacuate the device (i.e. kill off stubby and
-		 *    metadata) and wait for it to complete (i.e. sync).
-		 * 3. Cleanup the vdev namespace.
+		 * Stop allocating from this vdev.
 		 */
-		spa_vdev_remove_start(spa, vd);
+		metaslab_group_passivate(mg);
 
 		/*
 		 * Wait for the youngest allocations and frees to sync,
@@ -3970,11 +3939,25 @@
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
-		if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0)
-			return (error);
+		/*
+		 * Attempt to evacuate the vdev.
+		 */
+		error = spa_vdev_remove_evacuate(spa, vd);
+
 		txg = spa_vdev_config_enter(spa);
 
-		spa_vdev_remove_done(spa, vd);
+		/*
+		 * If we couldn't evacuate the vdev, unwind.
+		 */
+		if (error) {
+			metaslab_group_activate(mg);
+			return (spa_vdev_exit(spa, NULL, txg, error));
+		}
+
+		/*
+		 * Clean up the vdev namespace.
+		 */
+		spa_vdev_remove_from_namespace(spa, vd);
 
 	} else if (vd != NULL) {
 		/*
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Nov 05 18:44:56 2009 -0800
@@ -436,6 +436,8 @@
 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
@@ -456,8 +458,6 @@
 
 	avl_add(&spa_namespace_avl, spa);
 
-	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
-
 	/*
 	 * Set the alternate root, if there is one.
 	 */
@@ -533,6 +533,7 @@
 	mutex_destroy(&spa->spa_history_lock);
 	mutex_destroy(&spa->spa_props_lock);
 	mutex_destroy(&spa->spa_suspend_lock);
+	mutex_destroy(&spa->spa_vdev_top_lock);
 
 	kmem_free(spa, sizeof (spa_t));
 }
@@ -841,6 +842,7 @@
 spa_vdev_enter(spa_t *spa)
 {
 	mutex_enter(&spa_namespace_lock);
+	mutex_enter(&spa->spa_vdev_top_lock);
 	return (spa_vdev_config_enter(spa));
 }
 
@@ -936,6 +938,7 @@
 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 {
 	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
+	mutex_exit(&spa->spa_vdev_top_lock);
 	mutex_exit(&spa_namespace_lock);
 
 	return (error);
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h	Thu Nov 05 18:44:56 2009 -0800
@@ -57,8 +57,6 @@
 extern metaslab_class_t *metaslab_class_create(spa_t *spa,
     space_map_ops_t *ops);
 extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
-extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
 extern int metaslab_class_validate(metaslab_class_t *mc);
 
 extern void metaslab_class_space_update(metaslab_class_t *mc,
@@ -72,6 +70,8 @@
 extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
     vdev_t *vd);
 extern void metaslab_group_destroy(metaslab_group_t *mg);
+extern void metaslab_group_activate(metaslab_group_t *mg);
+extern void metaslab_group_passivate(metaslab_group_t *mg);
 
 #ifdef	__cplusplus
 }
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Thu Nov 05 18:44:56 2009 -0800
@@ -52,6 +52,7 @@
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
 	int64_t			mg_bias;
+	int64_t			mg_activation_count;
 	metaslab_class_t	*mg_class;
 	vdev_t			*mg_vd;
 	metaslab_group_t	*mg_prev;
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Thu Nov 05 18:44:56 2009 -0800
@@ -183,6 +183,7 @@
 	uint64_t	spa_dedup_ditto;	/* dedup ditto threshold */
 	uint64_t	spa_dedup_checksum;	/* default dedup checksum */
 	uint64_t	spa_dspace;		/* dspace in normal class */
+	kmutex_t	spa_vdev_top_lock;	/* dueling offline/remove */
 	boolean_t	spa_autoreplace;	/* autoreplace set in open */
 	int		spa_vdev_locks;		/* locks grabbed */
 	/*
--- a/usr/src/uts/common/fs/zfs/vdev.c	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Thu Nov 05 18:44:56 2009 -0800
@@ -491,6 +491,13 @@
 		    &vd->vdev_asize);
 	}
 
+	if (parent && !parent->vdev_parent) {
+		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+		    alloctype == VDEV_ALLOC_ADD);
+		vd->vdev_mg = metaslab_group_create(islog ?
+		    spa_log_class(spa) : spa_normal_class(spa), vd);
+	}
+
 	/*
 	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
@@ -578,8 +585,10 @@
 	/*
 	 * Discard allocation state.
 	 */
-	if (vd == vd->vdev_top)
+	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
+		metaslab_group_destroy(vd->vdev_mg);
+	}
 
 	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
 	ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
@@ -787,13 +796,14 @@
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
-	metaslab_class_t *mc;
 	uint64_t m;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 	metaslab_t **mspp;
 	int error;
 
+	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
 	/*
 	 * This vdev is not being allocated from yet or is a hole.
 	 */
@@ -813,14 +823,6 @@
 
 	ASSERT(oldc <= newc);
 
-	if (vd->vdev_islog)
-		mc = spa_log_class(spa);
-	else
-		mc = spa_normal_class(spa);
-
-	if (vd->vdev_mg == NULL)
-		vd->vdev_mg = metaslab_group_create(mc, vd);
-
 	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 
 	if (oldc != 0) {
@@ -855,6 +857,15 @@
 		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
 	}
 
+	if (txg == 0)
+		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
+
+	if (oldc == 0)
+		metaslab_group_activate(vd->vdev_mg);
+
+	if (txg == 0)
+		spa_config_exit(spa, SCL_ALLOC, FTAG);
+
 	return (0);
 }
 
@@ -865,6 +876,7 @@
 	uint64_t count = vd->vdev_ms_count;
 
 	if (vd->vdev_ms != NULL) {
+		metaslab_group_passivate(vd->vdev_mg);
 		for (m = 0; m < count; m++)
 			if (vd->vdev_ms[m] != NULL)
 				metaslab_fini(vd->vdev_ms[m]);
@@ -2134,8 +2146,8 @@
 	return (error);
 }
 
-int
-vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+static int
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *vd, *tvd;
 	int error = 0;
@@ -2178,7 +2190,7 @@
 			/*
 			 * Prevent any future allocations.
 			 */
-			metaslab_class_remove(spa->spa_log_class, mg);
+			metaslab_group_passivate(mg);
 			(void) spa_vdev_state_exit(spa, vd, 0);
 
 			error = vdev_offline_log(spa);
@@ -2189,7 +2201,7 @@
 			 * Check to see if the config has changed.
 			 */
 			if (error || generation != spa->spa_config_generation) {
-				metaslab_class_add(spa->spa_log_class, mg);
+				metaslab_group_activate(mg);
 				if (error)
 					return (spa_vdev_state_exit(spa,
 					    vd, error));
@@ -2220,7 +2232,7 @@
 		 * once we online the device it's open for business.
 		 */
 		if (tvd->vdev_islog && mg != NULL)
-			metaslab_class_add(spa->spa_log_class, mg);
+			metaslab_group_activate(mg);
 	}
 
 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
@@ -2228,6 +2240,18 @@
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+	int error;
+
+	mutex_enter(&spa->spa_vdev_top_lock);
+	error = vdev_offline_locked(spa, guid, flags);
+	mutex_exit(&spa->spa_vdev_top_lock);
+
+	return (error);
+}
+
 /*
  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
  * vdev_offline(), we assume the spa config is locked.  We also clear all