Mercurial > illumos > illumos-gate
diff usr/src/uts/common/fs/zfs/spa.c @ 10921:8aac17999e4d
PSARC 2009/479 zpool recovery support
6667683 need a way to rollback to an uberblock from a previous txg
6885998 bad ASSERT() in traverse_zil_block()
author | Tim Haley <Tim.Haley@Sun.COM> |
---|---|
date | Fri, 30 Oct 2009 18:47:17 -0600 |
parents | 2a6b5dc1374c |
children | e2081f502306 |
line wrap: on
line diff
--- a/usr/src/uts/common/fs/zfs/spa.c Fri Oct 30 16:13:16 2009 -0700 +++ b/usr/src/uts/common/fs/zfs/spa.c Fri Oct 30 18:47:17 2009 -0600 @@ -1151,12 +1151,91 @@ spa_check_removed(sav->sav_vdevs[i]); } +typedef struct spa_load_error { + uint64_t sle_metadata_count; + uint64_t sle_data_count; +} spa_load_error_t; + +static void +spa_load_verify_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + spa_load_error_t *sle = zio->io_private; + dmu_object_type_t type = BP_GET_TYPE(bp); + int error = zio->io_error; + + if (error) { + if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && + type != DMU_OT_INTENT_LOG) + atomic_add_64(&sle->sle_metadata_count, 1); + else + atomic_add_64(&sle->sle_data_count, 1); + } + zio_data_buf_free(zio->io_data, zio->io_size); +} + +/*ARGSUSED*/ +static int +spa_load_verify_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) +{ + if (bp != NULL) { + zio_t *rio = arg; + size_t size = BP_GET_PSIZE(bp); + void *data = zio_data_buf_alloc(size); + + zio_nowait(zio_read(rio, spa, bp, data, size, + spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); + } + return (0); +} + +static int +spa_load_verify(spa_t *spa) +{ + zio_t *rio; + spa_load_error_t sle = { 0 }; + zpool_rewind_policy_t policy; + boolean_t verify_ok = B_FALSE; + int error; + + rio = zio_root(spa, NULL, &sle, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + error = traverse_pool(spa, spa_load_verify_cb, rio, + spa->spa_verify_min_txg); + + (void) zio_wait(rio); + + zpool_get_rewind_policy(spa->spa_config, &policy); + + spa->spa_load_meta_errors = sle.sle_metadata_count; + spa->spa_load_data_errors = sle.sle_data_count; + + if (!error && sle.sle_metadata_count <= policy.zrp_maxmeta && + sle.sle_data_count <= policy.zrp_maxdata) { + verify_ok = B_TRUE; + spa->spa_load_txg = spa->spa_uberblock.ub_txg; + spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + } + + if (error) { + if (error != ENXIO && error != EIO) + error = EIO; + return (error); + } + + return (verify_ok ? 0 : EIO); +} + /* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ static int -spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) +spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) { int error = 0; nvlist_t *nvconfig, *nvroot = NULL; @@ -1168,6 +1247,7 @@ uint64_t autoreplace = 0; int orig_mode = spa->spa_mode; char *ereport = FM_EREPORT_ZFS_POOL; + nvlist_t *config = spa->spa_config; /* * If this is an untrusted config, access the pool in read-only mode. @@ -1296,11 +1376,15 @@ */ spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; - spa->spa_first_txg = spa_last_synced_txg(spa) + 1; + spa->spa_verify_min_txg = spa->spa_extreme_rewind ? + TXG_INITIAL : spa_last_synced_txg(spa) - TXG_DEFER_SIZE; + spa->spa_first_txg = spa->spa_last_ubsync_txg ? + spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); + error = EIO; goto out; } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; @@ -1359,7 +1443,7 @@ spa_deactivate(spa); spa_activate(spa, orig_mode); - return (spa_load(spa, nvconfig, state, B_TRUE)); + return (spa_load(spa, state, B_TRUE)); } if (zap_lookup(spa->spa_meta_objset, @@ -1569,7 +1653,17 @@ goto out; } - if (spa_writeable(spa)) { + if (state != SPA_LOAD_TRYIMPORT) { + error = spa_load_verify(spa); + if (error) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + goto out; + } + } + + if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || + spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; int need_update = B_FALSE; @@ -1578,6 +1672,7 @@ /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. + * Price of rollback is that we abandon the log. */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), spa_first_txg(spa)); @@ -1602,7 +1697,8 @@ * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT || spa->spa_load_verbatim) + state == SPA_LOAD_IMPORT || spa->spa_load_verbatim || + state == SPA_LOAD_RECOVER) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) @@ -1636,6 +1732,7 @@ error = 0; out: + spa->spa_minref = refcount_count(&spa->spa_refcount); if (error && error != EBADF) zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); @@ -1645,6 +1742,76 @@ return (error); } +static int +spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) +{ + spa_unload(spa); + spa_deactivate(spa); + + spa->spa_load_max_txg--; + + spa_activate(spa, spa_mode_global); + spa_async_suspend(spa); + + return (spa_load(spa, state, mosconfig)); +} + +static int +spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, + uint64_t max_request, boolean_t extreme) +{ + nvlist_t *config = NULL; + int load_error, rewind_error; + uint64_t safe_rollback_txg; + uint64_t min_txg; + + if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) + spa->spa_load_max_txg = spa->spa_load_txg; + else + spa->spa_load_max_txg = max_request; + + load_error = rewind_error = spa_load(spa, state, mosconfig); + if (load_error == 0) + return (0); + + if (spa->spa_root_vdev != NULL) + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; + spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; + + /* specific txg requested */ + if (spa->spa_load_max_txg != UINT64_MAX && !extreme) { + nvlist_free(config); + return (load_error); + } + + /* Price of rolling back is discarding txgs, including log */ + if (state == SPA_LOAD_RECOVER) + spa->spa_log_state = SPA_LOG_CLEAR; + + spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; + safe_rollback_txg = spa->spa_uberblock.ub_txg - TXG_DEFER_SIZE; + + min_txg = extreme ? TXG_INITIAL : safe_rollback_txg; + while (rewind_error && (spa->spa_uberblock.ub_txg >= min_txg)) { + if (spa->spa_load_max_txg < safe_rollback_txg) + spa->spa_extreme_rewind = B_TRUE; + rewind_error = spa_load_retry(spa, state, mosconfig); + } + + if (config) + spa_rewind_data_to_nvlist(spa, config); + + spa->spa_extreme_rewind = B_FALSE; + spa->spa_load_max_txg = UINT64_MAX; + + if (config && (rewind_error || state != SPA_LOAD_RECOVER)) + spa_config_set(spa, config); + + return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); +} + /* * Pool Open/Import * @@ -1658,14 +1825,25 @@ * ambiguous state. */ static int -spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) +spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, + nvlist_t **config) { spa_t *spa; + boolean_t norewind; + boolean_t extreme; + zpool_rewind_policy_t policy; + spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; *spapp = NULL; + zpool_get_rewind_policy(nvpolicy, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + norewind = (policy.zrp_request == ZPOOL_NO_REWIND); + extreme = ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0); + /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends @@ -1682,11 +1860,26 @@ mutex_exit(&spa_namespace_lock); return (ENOENT); } + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { spa_activate(spa, spa_mode_global); - error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); + if (spa->spa_last_open_failed && norewind) { + if (config != NULL && spa->spa_config) + VERIFY(nvlist_dup(spa->spa_config, + config, KM_SLEEP) == 0); + spa_deactivate(spa); + if (locked) + mutex_exit(&spa_namespace_lock); + return (spa->spa_last_open_failed); + } + + if (state != SPA_LOAD_RECOVER) + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + + error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, + extreme); if (error == EBADF) { /* @@ -1711,38 +1904,49 @@ * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ - if (config != NULL && spa->spa_root_vdev != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, - B_TRUE); + if (config != NULL && spa->spa_config) + VERIFY(nvlist_dup(spa->spa_config, config, + KM_SLEEP) == 0); spa_unload(spa); spa_deactivate(spa); - spa->spa_last_open_failed = B_TRUE; + spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); - } else { - spa->spa_last_open_failed = B_FALSE; } + } spa_open_ref(spa, tag); + spa->spa_last_open_failed = 0; + + if (config != NULL) + *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + spa->spa_last_ubsync_txg = 0; + spa->spa_load_txg = 0; + if (locked) mutex_exit(&spa_namespace_lock); *spapp = spa; - if (config != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - return (0); } int +spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, + nvlist_t **config) +{ + return (spa_open_common(name, spapp, tag, policy, config)); +} + +int spa_open(const char *name, spa_t **spapp, void *tag) { - return (spa_open_common(name, spapp, tag, NULL)); + return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* @@ -1883,7 +2087,7 @@ spa_t *spa; *config = NULL; - error = spa_open_common(name, &spa, FTAG, config); + error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* @@ -2143,7 +2347,7 @@ */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); + spa = spa_add(pool, NULL, altroot); spa_activate(spa, spa_mode_global); spa->spa_uberblock.ub_txg = txg - 1; @@ -2450,7 +2654,7 @@ spa_remove(spa); } - spa = spa_add(pname, NULL); + spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; spa->spa_load_verbatim = B_TRUE; @@ -2529,6 +2733,7 @@ spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) { spa_t *spa; + zpool_rewind_policy_t policy; char *altroot = NULL; mutex_enter(&spa_namespace_lock); @@ -2539,12 +2744,13 @@ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); + spa = spa_add(pool, config, altroot); + + zpool_get_rewind_policy(config, &policy); + spa->spa_load_max_txg = policy.zrp_txg; spa->spa_load_verbatim = B_TRUE; - VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); - if (props != NULL) spa_configfile_set(spa, props, B_FALSE); @@ -2564,6 +2770,8 @@ { spa_t *spa; char *altroot = NULL; + spa_load_state_t state = SPA_LOAD_IMPORT; + zpool_rewind_policy_t policy; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; @@ -2578,12 +2786,16 @@ return (EEXIST); } + zpool_get_rewind_policy(config, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); + spa = spa_add(pool, config, altroot); spa_activate(spa, spa_mode_global); /* @@ -2596,7 +2808,16 @@ * because the user-supplied config is actually the one to trust when * doing an import. */ - error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); + if (state != SPA_LOAD_RECOVER) + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, + ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0)); + + /* + * Propagate anything learned about failing or best txgs + * back to caller + */ + spa_rewind_data_to_nvlist(spa, config); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* @@ -2726,7 +2947,7 @@ * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, NULL); + spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, FREAD); /* @@ -2734,7 +2955,7 @@ * Pass TRUE for mosconfig because the user-supplied config * is actually the one to trust when doing an import. */ - error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); + error = spa_load(spa, SPA_LOAD_TRYIMPORT, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. @@ -4531,6 +4752,8 @@ spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_handle_ignored_writes(spa); + /* * If any async tasks have been requested, kick them off. */