Mercurial > illumos > illumos-gate
changeset 10974:32d689ba6466
6897958 ASSERT in metaslab_class_space_update() with 8+ exabyte pool
6898598 dsl needs to be more aware of dedup space
author | Jeff Bonwick <Jeff.Bonwick@Sun.COM> |
---|---|
date | Thu, 05 Nov 2009 18:44:56 -0800 |
parents | 6969e719525a |
children | 9dd13a7cd2e3 |
files | usr/src/uts/common/fs/zfs/dsl_dir.c usr/src/uts/common/fs/zfs/metaslab.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/metaslab.h usr/src/uts/common/fs/zfs/sys/metaslab_impl.h usr/src/uts/common/fs/zfs/sys/spa_impl.h usr/src/uts/common/fs/zfs/vdev.c |
diffstat | 8 files changed, 163 insertions(+), 133 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Thu Nov 05 18:44:56 2009 -0800
@@ -651,8 +651,7 @@
 		 * dsl_pool_adjustedsize()), something is very
 		 * wrong.
 		 */
-		ASSERT3U(used, <=, metaslab_class_get_space(
-		    spa_normal_class(dd->dd_pool->dp_spa)));
+		ASSERT3U(used, <=, spa_get_dspace(dd->dd_pool->dp_spa));
 	} else {
 		/*
 		 * the lesser of the space provided by our parent and
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Thu Nov 05 18:44:56 2009 -0800
@@ -77,60 +77,15 @@
 void
 metaslab_class_destroy(metaslab_class_t *mc)
 {
-	metaslab_group_t *mg;
-
-	while ((mg = mc->mc_rotor) != NULL) {
-		metaslab_class_remove(mc, mg);
-		metaslab_group_destroy(mg);
-	}
+	ASSERT(mc->mc_rotor == NULL);
+	ASSERT(mc->mc_alloc == 0);
+	ASSERT(mc->mc_deferred == 0);
+	ASSERT(mc->mc_space == 0);
+	ASSERT(mc->mc_dspace == 0);
 
 	kmem_free(mc, sizeof (metaslab_class_t));
 }
 
-void
-metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
-{
-	metaslab_group_t *mgprev, *mgnext;
-
-	ASSERT(mg->mg_class == NULL);
-
-	if ((mgprev = mc->mc_rotor) == NULL) {
-		mg->mg_prev = mg;
-		mg->mg_next = mg;
-	} else {
-		mgnext = mgprev->mg_next;
-		mg->mg_prev = mgprev;
-		mg->mg_next = mgnext;
-		mgprev->mg_next = mg;
-		mgnext->mg_prev = mg;
-	}
-	mc->mc_rotor = mg;
-	mg->mg_class = mc;
-}
-
-void
-metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
-{
-	metaslab_group_t *mgprev, *mgnext;
-
-	ASSERT(mg->mg_class == mc);
-
-	mgprev = mg->mg_prev;
-	mgnext = mg->mg_next;
-
-	if (mg == mgnext) {
-		mc->mc_rotor = NULL;
-	} else {
-		mc->mc_rotor = mgnext;
-		mgprev->mg_next = mgnext;
-		mgnext->mg_prev = mgprev;
-	}
-
-	mg->mg_prev = NULL;
-	mg->mg_next = NULL;
-	mg->mg_class = NULL;
-}
-
 int
 metaslab_class_validate(metaslab_class_t *mc)
 {
@@ -165,11 +120,6 @@
 	atomic_add_64(&mc->mc_deferred, defer_delta);
 	atomic_add_64(&mc->mc_space, space_delta);
 	atomic_add_64(&mc->mc_dspace, dspace_delta);
-
-	ASSERT((int64_t)mc->mc_alloc >= 0 &&
-	    (int64_t)mc->mc_deferred >= 0 &&
-	    (int64_t)mc->mc_space >= 0 &&
-	    (int64_t)mc->mc_dspace >= 0);
 }
 
 uint64_t
@@ -234,9 +184,9 @@
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
-	mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
 	mg->mg_vd = vd;
-	metaslab_class_add(mc, mg);
+	mg->mg_class = mc;
+	mg->mg_activation_count = 0;
 
 	return (mg);
 }
@@ -244,11 +194,77 @@
 void
 metaslab_group_destroy(metaslab_group_t *mg)
 {
+	ASSERT(mg->mg_prev == NULL);
+	ASSERT(mg->mg_next == NULL);
+	ASSERT(mg->mg_activation_count + mg->mg_vd->vdev_removing == 0);
+
 	avl_destroy(&mg->mg_metaslab_tree);
 	mutex_destroy(&mg->mg_lock);
 	kmem_free(mg, sizeof (metaslab_group_t));
 }
 
+void
+metaslab_group_activate(metaslab_group_t *mg)
+{
+	metaslab_class_t *mc = mg->mg_class;
+	metaslab_group_t *mgprev, *mgnext;
+
+	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+
+	ASSERT(mc->mc_rotor != mg);
+	ASSERT(mg->mg_prev == NULL);
+	ASSERT(mg->mg_next == NULL);
+	ASSERT(mg->mg_activation_count <= 0);
+
+	if (++mg->mg_activation_count <= 0)
+		return;
+
+	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+
+	if ((mgprev = mc->mc_rotor) == NULL) {
+		mg->mg_prev = mg;
+		mg->mg_next = mg;
+	} else {
+		mgnext = mgprev->mg_next;
+		mg->mg_prev = mgprev;
+		mg->mg_next = mgnext;
+		mgprev->mg_next = mg;
+		mgnext->mg_prev = mg;
+	}
+	mc->mc_rotor = mg;
+}
+
+void
+metaslab_group_passivate(metaslab_group_t *mg)
+{
+	metaslab_class_t *mc = mg->mg_class;
+	metaslab_group_t *mgprev, *mgnext;
+
+	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+
+	if (--mg->mg_activation_count != 0) {
+		ASSERT(mc->mc_rotor != mg);
+		ASSERT(mg->mg_prev == NULL);
+		ASSERT(mg->mg_next == NULL);
+		ASSERT(mg->mg_activation_count < 0);
+		return;
+	}
+
+	mgprev = mg->mg_prev;
+	mgnext = mg->mg_next;
+
+	if (mg == mgnext) {
+		mc->mc_rotor = NULL;
+	} else {
+		mc->mc_rotor = mgnext;
+		mgprev->mg_next = mgnext;
+		mgnext->mg_prev = mgprev;
+	}
+
+	mg->mg_prev = NULL;
+	mg->mg_next = NULL;
+}
+
 static void
 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
@@ -1052,7 +1068,7 @@
 	 * longer exists (i.e. removed). Consult the rotor when
 	 * all else fails.
 	 */
-	if (vd != NULL && vd->vdev_mg != NULL) {
+	if (vd != NULL) {
 		mg = vd->vdev_mg;
 
 		if (flags & METASLAB_HINTBP_AVOID &&
@@ -1069,15 +1085,18 @@
 	}
 
 	/*
-	 * If the hint put us into the wrong class, just follow the rotor.
+	 * If the hint put us into the wrong metaslab class, or into a
+	 * metaslab group that has been passivated, just follow the rotor.
 	 */
-	if (mg->mg_class != mc)
+	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
 		mg = mc->mc_rotor;
 
 	rotor = mg;
 top:
 	all_zero = B_TRUE;
 	do {
+		ASSERT(mg->mg_activation_count == 1);
+
 		vd = mg->mg_vd;
 
 		/*
--- a/usr/src/uts/common/fs/zfs/spa.c	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Thu Nov 05 18:44:56 2009 -0800
@@ -3789,32 +3789,13 @@
  */
 
 /*
- * Initial phase of device removal - stop future allocations from this device.
- */
-void
-spa_vdev_remove_start(spa_t *spa, vdev_t *vd)
-{
-	metaslab_group_t *mg = vd->vdev_mg;
-
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
-	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
-	ASSERT(vd == vd->vdev_top);
-
-	/*
-	 * Remove our vdev from the allocatable vdevs
-	 */
-	if (mg)
-		metaslab_class_remove(mg->mg_class, mg);
-}
-
-/*
  * Evacuate the device.
  */
 int
 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 {
+	int error = 0;
 	uint64_t txg;
-	int error;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
@@ -3827,23 +3808,20 @@
 	 * should no longer have any blocks allocated on it.
 	 */
 	if (vd->vdev_islog) {
-		/*
-		 * Evacuate the device.
-		 */
-		if (error = dmu_objset_find(spa_name(spa),
-		    zil_vdev_offline, NULL, DS_FIND_CHILDREN)) {
-			uint64_t txg;
-
-			txg = spa_vdev_config_enter(spa);
-			metaslab_class_add(spa->spa_log_class,
-			    vd->vdev_mg);
-			return (spa_vdev_exit(spa, NULL, txg, error));
-		}
-		txg_wait_synced(spa_get_dsl(spa), 0);
+		error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+		    NULL, DS_FIND_CHILDREN);
+	} else {
+		error = ENOTSUP;	/* until we have bp rewrite */
 	}
 
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	if (error)
+		return (error);
+
 	/*
-	 * Remove any remaining MOS metadata associated with the device.
+	 * The evacuation succeeded. Remove any remaining MOS metadata
+	 * associated with this vdev, and wait for these changes to sync.
 	 */
 	txg = spa_vdev_config_enter(spa);
 	vd->vdev_removing = B_TRUE;
@@ -3858,10 +3836,9 @@
  * Complete the removal by cleaning up the namespace.
  */
 void
-spa_vdev_remove_done(spa_t *spa, vdev_t *vd)
+spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
-	metaslab_group_t *mg = vd->vdev_mg;
 	uint64_t id = vd->vdev_id;
 	boolean_t last_vdev = (id == (rvd->vdev_children - 1));
 
@@ -3878,14 +3855,6 @@
 
 	vdev_free(vd);
 
-	/*
-	 * It's possible that another thread is trying todo a spa_vdev_add()
-	 * at the same time we're trying remove it. As a result the
-	 * added vdev may not have initialized its metaslabs yet.
-	 */
-	if (mg != NULL)
-		metaslab_group_destroy(mg);
-
 	if (last_vdev) {
 		vdev_compact_children(rvd);
 	} else {
@@ -3908,6 +3877,7 @@
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
 	vdev_t *vd;
+	metaslab_group_t *mg;
 	nvlist_t **spares, **l2cache, *nv;
 	uint64_t txg = 0;
 	uint_t nspares, nl2cache;
@@ -3955,13 +3925,12 @@
 		 * become the common case.
 		 */
 
+		mg = vd->vdev_mg;
+
 		/*
-		 * 1. Stop allocations
-		 * 2. Evacuate the device (i.e. kill off stubby and
-		 *    metadata) and wait for it to complete (i.e. sync).
-		 * 3. Cleanup the vdev namespace.
+		 * Stop allocating from this vdev.
 		 */
-		spa_vdev_remove_start(spa, vd);
+		metaslab_group_passivate(mg);
 
 		/*
 		 * Wait for the youngest allocations and frees to sync,
@@ -3970,11 +3939,25 @@
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
-		if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0)
-			return (error);
+		/*
+		 * Attempt to evacuate the vdev.
+		 */
+		error = spa_vdev_remove_evacuate(spa, vd);
+
 		txg = spa_vdev_config_enter(spa);
 
-		spa_vdev_remove_done(spa, vd);
+		/*
+		 * If we couldn't evacuate the vdev, unwind.
+		 */
+		if (error) {
+			metaslab_group_activate(mg);
+			return (spa_vdev_exit(spa, NULL, txg, error));
+		}
+
+		/*
+		 * Clean up the vdev namespace.
+		 */
+		spa_vdev_remove_from_namespace(spa, vd);
 
 	} else if (vd != NULL) {
 		/*
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Nov 05 18:44:56 2009 -0800
@@ -436,6 +436,8 @@
 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
@@ -456,8 +458,6 @@
 
 	avl_add(&spa_namespace_avl, spa);
 
-	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
-
 	/*
 	 * Set the alternate root, if there is one.
 	 */
@@ -533,6 +533,7 @@
 	mutex_destroy(&spa->spa_history_lock);
 	mutex_destroy(&spa->spa_props_lock);
 	mutex_destroy(&spa->spa_suspend_lock);
+	mutex_destroy(&spa->spa_vdev_top_lock);
 
 	kmem_free(spa, sizeof (spa_t));
 }
@@ -841,6 +842,7 @@
 spa_vdev_enter(spa_t *spa)
 {
 	mutex_enter(&spa_namespace_lock);
+	mutex_enter(&spa->spa_vdev_top_lock);
 	return (spa_vdev_config_enter(spa));
 }
 
@@ -936,6 +938,7 @@
 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 {
 	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
+	mutex_exit(&spa->spa_vdev_top_lock);
 	mutex_exit(&spa_namespace_lock);
 
 	return (error);
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h	Thu Nov 05 18:44:56 2009 -0800
@@ -57,8 +57,6 @@
 extern metaslab_class_t *metaslab_class_create(spa_t *spa,
     space_map_ops_t *ops);
 extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
-extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
 extern int metaslab_class_validate(metaslab_class_t *mc);
 
 extern void metaslab_class_space_update(metaslab_class_t *mc,
@@ -72,6 +70,8 @@
 extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
     vdev_t *vd);
 extern void metaslab_group_destroy(metaslab_group_t *mg);
+extern void metaslab_group_activate(metaslab_group_t *mg);
+extern void metaslab_group_passivate(metaslab_group_t *mg);
 
 #ifdef	__cplusplus
 }
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Thu Nov 05 18:44:56 2009 -0800
@@ -52,6 +52,7 @@
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
 	int64_t			mg_bias;
+	int64_t			mg_activation_count;
 	metaslab_class_t	*mg_class;
 	vdev_t			*mg_vd;
 	metaslab_group_t	*mg_prev;
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Thu Nov 05 18:44:56 2009 -0800
@@ -183,6 +183,7 @@
 	uint64_t	spa_dedup_ditto;	/* dedup ditto threshold */
 	uint64_t	spa_dedup_checksum;	/* default dedup checksum */
 	uint64_t	spa_dspace;		/* dspace in normal class */
+	kmutex_t	spa_vdev_top_lock;	/* dueling offline/remove */
 	boolean_t	spa_autoreplace;	/* autoreplace set in open */
 	int		spa_vdev_locks;		/* locks grabbed */
 	/*
--- a/usr/src/uts/common/fs/zfs/vdev.c	Thu Nov 05 17:39:24 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Thu Nov 05 18:44:56 2009 -0800
@@ -491,6 +491,13 @@
 		    &vd->vdev_asize);
 	}
 
+	if (parent && !parent->vdev_parent) {
+		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+		    alloctype == VDEV_ALLOC_ADD);
+		vd->vdev_mg = metaslab_group_create(islog ?
+		    spa_log_class(spa) : spa_normal_class(spa), vd);
+	}
+
 	/*
 	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
@@ -578,8 +585,10 @@
 	/*
 	 * Discard allocation state.
 	 */
-	if (vd == vd->vdev_top)
+	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
+		metaslab_group_destroy(vd->vdev_mg);
+	}
 
 	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
 	ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
@@ -787,13 +796,14 @@
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
-	metaslab_class_t *mc;
 	uint64_t m;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 	metaslab_t **mspp;
 	int error;
 
+	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
 	/*
 	 * This vdev is not being allocated from yet or is a hole.
 	 */
@@ -813,14 +823,6 @@
 
 	ASSERT(oldc <= newc);
 
-	if (vd->vdev_islog)
-		mc = spa_log_class(spa);
-	else
-		mc = spa_normal_class(spa);
-
-	if (vd->vdev_mg == NULL)
-		vd->vdev_mg = metaslab_group_create(mc, vd);
-
 	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 
 	if (oldc != 0) {
@@ -855,6 +857,15 @@
 		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
 	}
 
+	if (txg == 0)
+		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
+
+	if (oldc == 0)
+		metaslab_group_activate(vd->vdev_mg);
+
+	if (txg == 0)
+		spa_config_exit(spa, SCL_ALLOC, FTAG);
+
 	return (0);
 }
 
@@ -865,6 +876,7 @@
 	uint64_t count = vd->vdev_ms_count;
 
 	if (vd->vdev_ms != NULL) {
+		metaslab_group_passivate(vd->vdev_mg);
 		for (m = 0; m < count; m++)
 			if (vd->vdev_ms[m] != NULL)
 				metaslab_fini(vd->vdev_ms[m]);
@@ -2134,8 +2146,8 @@
 	return (error);
 }
 
-int
-vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+static int
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *vd, *tvd;
 	int error = 0;
@@ -2178,7 +2190,7 @@
 			/*
 			 * Prevent any future allocations.
 			 */
-			metaslab_class_remove(spa->spa_log_class, mg);
+			metaslab_group_passivate(mg);
 			(void) spa_vdev_state_exit(spa, vd, 0);
 
 			error = vdev_offline_log(spa);
@@ -2189,7 +2201,7 @@
 			 * Check to see if the config has changed.
 			 */
 			if (error || generation != spa->spa_config_generation) {
-				metaslab_class_add(spa->spa_log_class, mg);
+				metaslab_group_activate(mg);
 				if (error)
 					return (spa_vdev_state_exit(spa,
 					    vd, error));
@@ -2220,7 +2232,7 @@
 		 * once we online the device it's open for business.
 		 */
 		if (tvd->vdev_islog && mg != NULL)
-			metaslab_class_add(spa->spa_log_class, mg);
+			metaslab_group_activate(mg);
 	}
 
 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
@@ -2228,6 +2240,18 @@
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+	int error;
+
+	mutex_enter(&spa->spa_vdev_top_lock);
+	error = vdev_offline_locked(spa, guid, flags);
+	mutex_exit(&spa->spa_vdev_top_lock);
+
+	return (error);
+}
+
 /*
  * Clear the error counts associated with this vdev. Unlike vdev_online() and
  * vdev_offline(), we assume the spa config is locked. We also clear all