Mercurial > illumos > illumos-gate
diff usr/src/uts/common/fs/zfs/spa.c @ 2082:76b439ec3ac1
PSARC 2006/223 ZFS Hot Spares
PSARC 2006/303 ZFS Clone Promotion
6276916 support for "clone swap"
6288488 du reports misleading size on RAID-Z
6393490 libzfs should be a real library
6397148 fbufs debug code should be removed from buf_hash_insert()
6405966 Hot Spare support in ZFS
6409302 passing a non-root vdev via zpool_create() panics system
6415739 assertion failed: !(zio->io_flags & 0x00040)
6416759 ::dbufs does not find bonus buffers anymore
6417978 double parity RAID-Z a.k.a. RAID6
6424554 full block re-writes need not read data in
6425111 detaching an offline device can result in import confusion
author | eschrock |
---|---|
date | Tue, 30 May 2006 15:47:16 -0700 |
parents | 628267397204 |
children | 73de7a781492 |
line wrap: on
line diff
--- a/usr/src/uts/common/fs/zfs/spa.c Tue May 30 15:35:36 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/spa.c Tue May 30 15:47:16 2006 -0700 @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -185,33 +186,40 @@ * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ -static vdev_t * -spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) +static int +spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, + uint_t id, int atype) { nvlist_t **child; uint_t c, children; - vdev_t *vd; - - if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) - return (NULL); - - if (vd->vdev_ops->vdev_op_leaf) - return (vd); + int error; + + if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) + return (error); + + if ((*vdp)->vdev_ops->vdev_op_leaf) + return (0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { - vdev_free(vd); - return (NULL); + vdev_free(*vdp); + *vdp = NULL; + return (EINVAL); } for (c = 0; c < children; c++) { - if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { - vdev_free(vd); - return (NULL); + vdev_t *vd; + if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, + atype)) != 0) { + vdev_free(*vdp); + *vdp = NULL; + return (error); } } - return (vd); + ASSERT(*vdp != NULL); + + return (0); } /* @@ -220,6 +228,8 @@ static void spa_unload(spa_t *spa) { + int i; + /* * Stop async tasks. */ @@ -254,10 +264,117 @@ vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); + for (i = 0; i < spa->spa_nspares; i++) + vdev_free(spa->spa_spares[i]); + if (spa->spa_spares) { + kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); + spa->spa_spares = NULL; + } + if (spa->spa_sparelist) { + nvlist_free(spa->spa_sparelist); + spa->spa_sparelist = NULL; + } + spa->spa_async_suspended = 0; } /* + * Load (or re-load) the current list of vdevs describing the active spares for + * this pool. When this is called, we have some form of basic information in + * 'spa_sparelist'. We parse this into vdevs, try to open them, and then + * re-generate a more complete list including status information. + */ +static void +spa_load_spares(spa_t *spa) +{ + nvlist_t **spares; + uint_t nspares; + int i; + + /* + * First, close and free any existing spare vdevs. + */ + for (i = 0; i < spa->spa_nspares; i++) { + vdev_close(spa->spa_spares[i]); + vdev_free(spa->spa_spares[i]); + } + if (spa->spa_spares) + kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); + + if (spa->spa_sparelist == NULL) + nspares = 0; + else + VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + spa->spa_nspares = (int)nspares; + spa->spa_spares = NULL; + + if (nspares == 0) + return; + + /* + * Construct the array of vdevs, opening them to get status in the + * process. + */ + spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); + for (i = 0; i < spa->spa_nspares; i++) { + vdev_t *vd; + + VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, + VDEV_ALLOC_SPARE) == 0); + ASSERT(vd != NULL); + + spa->spa_spares[i] = vd; + + if (vdev_open(vd) != 0) + continue; + + vd->vdev_top = vd; + (void) vdev_validate_spare(vd); + } + + /* + * Recompute the stashed list of spares, with status information + * this time. + */ + VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + DATA_TYPE_NVLIST_ARRAY) == 0); + + spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); + for (i = 0; i < spa->spa_nspares; i++) + spares[i] = vdev_config_generate(spa, spa->spa_spares[i], + B_TRUE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + spares, spa->spa_nspares) == 0); + for (i = 0; i < spa->spa_nspares; i++) + nvlist_free(spares[i]); + kmem_free(spares, spa->spa_nspares * sizeof (void *)); +} + +static int +load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) +{ + dmu_buf_t *db; + char *packed = NULL; + size_t nvsize = 0; + int error; + *value = NULL; + + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); + nvsize = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + packed = kmem_alloc(nvsize, KM_SLEEP); + error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); + if (error == 0) + error = nvlist_unpack(packed, nvsize, value, 0); + kmem_free(packed, nvsize); + + return (error); +} + +/* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ @@ -270,6 +387,7 @@ uberblock_t *ub = &spa->spa_uberblock; uint64_t config_cache_txg = spa->spa_config_txg; uint64_t pool_guid; + uint64_t version; zio_t *zio; spa->spa_load_state = state; @@ -280,6 +398,13 @@ goto out; } + /* + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) + version = ZFS_VERSION_INITIAL; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); @@ -290,16 +415,17 @@ } /* - * Parse the configuration into a vdev tree. + * Parse the configuration into a vdev tree. We explicitly set the + * value that will be returned by spa_version() since parsing the + * configuration requires knowing the version number. */ spa_config_enter(spa, RW_WRITER, FTAG); - rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); + spa->spa_ubsync.ub_version = version; + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); spa_config_exit(spa, FTAG); - if (rvd == NULL) { - error = EINVAL; + if (error != 0) goto out; - } ASSERT(spa->spa_root_vdev == rvd); ASSERT(spa_guid(spa) == pool_guid); @@ -396,24 +522,9 @@ } if (!mosconfig) { - dmu_buf_t *db; - char *packed = NULL; - size_t nvsize = 0; - nvlist_t *newconfig = NULL; - - VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, - spa->spa_config_object, FTAG, &db)); - nvsize = *(uint64_t *)db->db_data; - dmu_buf_rele(db, FTAG); - - packed = kmem_alloc(nvsize, KM_SLEEP); - error = dmu_read(spa->spa_meta_objset, - spa->spa_config_object, 0, nvsize, packed); - if (error == 0) - error = nvlist_unpack(packed, nvsize, &newconfig, 0); - kmem_free(packed, nvsize); - - if (error) { + nvlist_t *newconfig; + + if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); error = EIO; @@ -421,7 +532,6 @@ } spa_config_set(spa, newconfig); - spa_unload(spa); spa_deactivate(spa); spa_activate(spa); @@ -439,6 +549,21 @@ } /* + * Load the bit that tells us to use the new accounting function + * (raid-z deflation). If we have an older pool, this will not + * be present. + */ + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + /* * Load the persistent error log. If we have an older pool, this will * not be present. */ @@ -463,6 +588,32 @@ } /* + * Load any hot spares for this pool. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + if (error == 0) { + ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); + if (load_nvlist(spa, spa->spa_spares_object, + &spa->spa_sparelist) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + spa_config_enter(spa, RW_WRITER, FTAG); + spa_load_spares(spa); + spa_config_exit(spa, FTAG); + } + + /* * Load the vdev state for all toplevel vdevs. */ vdev_load(rvd); @@ -527,7 +678,7 @@ error = 0; out: - if (error) + if (error && error != EBADF) zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); spa->spa_load_state = SPA_LOAD_NONE; spa->spa_ena = 0; @@ -587,6 +738,7 @@ * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ + zfs_post_ok(spa, NULL); spa_unload(spa); spa_deactivate(spa); spa_remove(spa); @@ -678,6 +830,48 @@ mutex_exit(&spa_namespace_lock); } +static void +spa_add_spares(spa_t *spa, nvlist_t *config) +{ + nvlist_t **spares; + uint_t i, nspares; + nvlist_t *nvroot; + uint64_t guid; + vdev_stat_t *vs; + uint_t vsc; + + if (spa->spa_nspares == 0) + return; + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + if (nspares != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + /* + * Go through and find any spares which have since been + * repurposed as an active spare. If this is the case, update + * their status appropriately. + */ + for (i = 0; i < nspares; i++) { + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + if (spa_spare_inuse(guid)) { + VERIFY(nvlist_lookup_uint64_array( + spares[i], ZPOOL_CONFIG_STATS, + (uint64_t **)&vs, &vsc) == 0); + vs->vs_state = VDEV_STATE_CANT_OPEN; + vs->vs_aux = VDEV_AUX_SPARED; + } + } + } +} + int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { @@ -687,10 +881,13 @@ *config = NULL; error = spa_open_common(name, &spa, FTAG, config); - if (spa && *config != NULL) + if (spa && *config != NULL) { VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); + spa_add_spares(spa, *config); + } + /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. @@ -717,6 +914,65 @@ } /* + * Validate that the 'spares' array is well formed. We must have an array of + * nvlists, each which describes a valid leaf vdev. + */ +static int +spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) +{ + nvlist_t **spares; + uint_t i, nspares; + vdev_t *vd; + int error; + + /* + * It's acceptable to have no spares specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) != 0) + return (0); + + if (nspares == 0) + return (EINVAL); + + /* + * Make sure the pool is formatted with a version that supports hot + * spares. + */ + if (spa_version(spa) < ZFS_VERSION_SPARES) + return (ENOTSUP); + + for (i = 0; i < nspares; i++) { + if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, + mode)) != 0) + return (error); + + if (!vd->vdev_ops->vdev_op_leaf) { + vdev_free(vd); + return (EINVAL); + } + + if ((error = vdev_open(vd)) != 0) { + vdev_free(vd); + return (error); + } + + vd->vdev_top = vd; + if ((error = vdev_label_spare(vd, crtxg)) != 0) { + vdev_free(vd); + return (error); + } + + VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + + vdev_free(vd); + } + + return (0); +} + +/* * Pool Creation */ int @@ -726,8 +982,10 @@ vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; - int c, error; + int c, error = 0; uint64_t txg = TXG_INITIAL; + nvlist_t **spares; + uint_t nspares; /* * If this pool already exists, return failure. @@ -753,23 +1011,26 @@ */ spa_config_enter(spa, RW_WRITER, FTAG); - rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); - - ASSERT(spa->spa_root_vdev == rvd); - - if (rvd == NULL) { + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); + + ASSERT(error != 0 || rvd != NULL); + ASSERT(error != 0 || spa->spa_root_vdev == rvd); + + if (error == 0 && rvd->vdev_children == 0) error = EINVAL; - } else { - if ((error = vdev_create(rvd, txg)) == 0) { - for (c = 0; c < rvd->vdev_children; c++) - vdev_init(rvd->vdev_child[c], txg); - vdev_config_dirty(rvd); - } + + if (error == 0 && + (error = vdev_create(rvd, txg, B_FALSE)) == 0 && + (error = spa_validate_spares(spa, nvroot, txg, + VDEV_ALLOC_ADD)) == 0) { + for (c = 0; c < rvd->vdev_children; c++) + vdev_init(rvd->vdev_child[c], txg); + vdev_config_dirty(rvd); } spa_config_exit(spa, FTAG); - if (error) { + if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); @@ -777,6 +1038,21 @@ return (error); } + /* + * Get the list of spares, if specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, RW_WRITER, FTAG); + spa_load_spares(spa); + spa_config_exit(spa, FTAG); + spa->spa_sync_spares = B_TRUE; + } + spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); spa->spa_meta_objset = dp->dp_meta_objset; @@ -795,6 +1071,14 @@ cmn_err(CE_PANIC, "failed to add pool config"); } + /* Newly created pools are always deflated. */ + spa->spa_deflate = TRUE; + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { + cmn_err(CE_PANIC, "failed to add deflate"); + } + /* * Create the deferred-free bplist object. Turn off compression * because sync-to-convergence takes longer if the blocksize @@ -838,6 +1122,9 @@ { spa_t *spa; int error; + nvlist_t *nvroot; + nvlist_t **spares; + uint_t nspares; if (!(spa_mode & FWRITE)) return (EROFS); @@ -864,7 +1151,25 @@ */ error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); - if (error) { + spa_config_enter(spa, RW_WRITER, FTAG); + /* + * Toss any existing sparelist, as it doesn't have any validity anymore, + * and conflicts with spa_has_spare(). + */ + if (spa->spa_sparelist) { + nvlist_free(spa->spa_sparelist); + spa->spa_sparelist = NULL; + spa_load_spares(spa); + } + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (error == 0) + error = spa_validate_spares(spa, nvroot, -1ULL, + VDEV_ALLOC_SPARE); + spa_config_exit(spa, FTAG); + + if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); @@ -873,6 +1178,26 @@ } /* + * Override any spares as specified by the user, as these may have + * correct device names/devids, etc. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + if (spa->spa_sparelist) + VERIFY(nvlist_remove(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_sparelist, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, RW_WRITER, FTAG); + spa_load_spares(spa); + spa_config_exit(spa, FTAG); + spa->spa_sync_spares = B_TRUE; + } + + /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); @@ -933,6 +1258,11 @@ poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) == 0); + + /* + * Add the list of hot spares. + */ + spa_add_spares(spa, config); } spa_unload(spa); @@ -1083,26 +1413,80 @@ int c, error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; + nvlist_t **spares; + uint_t i, nspares; txg = spa_vdev_enter(spa); - vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); - - if (vd == NULL) - return (spa_vdev_exit(spa, vd, txg, EINVAL)); - - if ((error = vdev_create(vd, txg)) != 0) + if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, + VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + if ((error = spa_validate_spares(spa, nvroot, txg, + VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); - /* - * Transfer each new top-level vdev from vd to rvd. - */ - for (c = 0; c < vd->vdev_children; c++) { - tvd = vd->vdev_child[c]; - vdev_remove_child(vd, tvd); - tvd->vdev_id = rvd->vdev_children; - vdev_add_child(rvd, tvd); - vdev_config_dirty(tvd); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) != 0) + nspares = 0; + + if (vd->vdev_children == 0 && nspares == 0) + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + + if (vd->vdev_children != 0) { + if ((error = vdev_create(vd, txg, B_FALSE)) != 0) + return (spa_vdev_exit(spa, vd, txg, error)); + + /* + * Transfer each new top-level vdev from vd to rvd. + */ + for (c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + vdev_remove_child(vd, tvd); + tvd->vdev_id = rvd->vdev_children; + vdev_add_child(rvd, tvd); + vdev_config_dirty(tvd); + } + } + + if (nspares != 0) { + if (spa->spa_sparelist != NULL) { + nvlist_t **oldspares; + uint_t oldnspares; + nvlist_t **newspares; + + VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); + + newspares = kmem_alloc(sizeof (void *) * + (nspares + oldnspares), KM_SLEEP); + for (i = 0; i < oldnspares; i++) + VERIFY(nvlist_dup(oldspares[i], + &newspares[i], KM_SLEEP) == 0); + for (i = 0; i < nspares; i++) + VERIFY(nvlist_dup(spares[i], + &newspares[i + oldnspares], + KM_SLEEP) == 0); + + VERIFY(nvlist_remove(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, newspares, + nspares + oldnspares) == 0); + for (i = 0; i < oldnspares + nspares; i++) + nvlist_free(newspares[i]); + kmem_free(newspares, (oldnspares + nspares) * + sizeof (void *)); + } else { + VERIFY(nvlist_alloc(&spa->spa_sparelist, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + } + + spa_load_spares(spa); + spa->spa_sync_spares = B_TRUE; } /* @@ -1147,7 +1531,7 @@ int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; - vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops; + vdev_ops_t *pvops; txg = spa_vdev_enter(spa); @@ -1161,18 +1545,8 @@ pvd = oldvd->vdev_parent; - /* - * The parent must be a mirror or the root, unless we're replacing; - * in that case, the parent can be anything but another replacing vdev. - */ - if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops && - (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); - - if (newrootvd == NULL || newrootvd->vdev_children != 1) + if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, + VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; @@ -1180,9 +1554,43 @@ if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); - if ((error = vdev_create(newrootvd, txg)) != 0) + if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); + if (!replacing) { + /* + * For attach, the only allowable parent is a mirror or the root + * vdev. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + pvops = &vdev_mirror_ops; + } else { + /* + * Active hot spares can only be replaced by inactive hot + * spares. + */ + if (pvd->vdev_ops == &vdev_spare_ops && + pvd->vdev_child[1] == oldvd && + !spa_has_spare(spa, newvd->vdev_guid)) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * If the source is a hot spare, and the parent isn't already a + * spare, then we want to create a new hot spare. Otherwise, we + * want to create a replacing vdev. + */ + if (pvd->vdev_ops == &vdev_replacing_ops) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + else if (pvd->vdev_ops != &vdev_spare_ops && + newvd->vdev_isspare) + pvops = &vdev_spare_ops; + else + pvops = &vdev_replacing_ops; + } + /* * Compare the new device size with the replaceable/attachable * device size. @@ -1214,8 +1622,8 @@ } /* - * If the parent is not a mirror, or if we're replacing, - * insert the new mirror/replacing vdev above oldvd. + * If the parent is not a mirror, or if we're replacing, insert the new + * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); @@ -1283,6 +1691,8 @@ int c, t, error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; + boolean_t unspare = B_FALSE; + uint64_t unspare_guid; txg = spa_vdev_enter(spa); @@ -1298,17 +1708,27 @@ /* * If replace_done is specified, only remove this device if it's - * the first child of a replacing vdev. + * the first child of a replacing vdev. For the 'spare' vdev, either + * disk can be removed. */ - if (replace_done && - (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + if (replace_done) { + if (pvd->vdev_ops == &vdev_replacing_ops) { + if (vd->vdev_id != 0) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } else if (pvd->vdev_ops != &vdev_spare_ops) { + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } + } + + ASSERT(pvd->vdev_ops != &vdev_spare_ops || + spa_version(spa) >= ZFS_VERSION_SPARES); /* - * Only mirror and replacing vdevs support detach. + * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && - pvd->vdev_ops != &vdev_mirror_ops) + pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* @@ -1339,10 +1759,25 @@ if (!dirty) break; } - if (c == pvd->vdev_children) + + /* + * If we are a replacing or spare vdev, then we can always detach the + * latter child, as that is how one cancels the operation. + */ + if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && + c == pvd->vdev_children) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* + * If we are detaching the original disk from a spare, then it implies + * that the spare should become a real disk, and be removed from the + * active spare list for the pool. + */ + if (pvd->vdev_ops == &vdev_spare_ops && + vd->vdev_id == 0) + unspare = B_TRUE; + + /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). @@ -1350,7 +1785,7 @@ * it may be that the unwritability of the disk is the reason * it's being detached! */ - error = vdev_label_init(vd, 0); + error = vdev_label_init(vd, 0, B_FALSE); if (error) dprintf("unable to erase labels on %s\n", vdev_description(vd)); @@ -1366,6 +1801,19 @@ cvd = pvd->vdev_child[0]; /* + * If we need to remove the remaining child from the list of hot spares, + * do it now, marking the vdev as no longer a spare in the process. We + * must do this before vdev_remove_parent(), because that can change the + * GUID if it creates a new toplevel GUID. + */ + if (unspare) { + ASSERT(cvd->vdev_isspare); + spa_spare_remove(cvd->vdev_guid); + cvd->vdev_isspare = B_FALSE; + unspare_guid = cvd->vdev_guid; + } + + /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ @@ -1408,7 +1856,104 @@ dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); - return (spa_vdev_exit(spa, vd, txg, 0)); + error = spa_vdev_exit(spa, vd, txg, 0); + + /* + * If we are supposed to remove the given vdev from the list of spares, + * iterate over all pools in the system and replace it if it's present. + */ + if (unspare) { + spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa->spa_state != POOL_STATE_ACTIVE) + continue; + + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + } + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +/* + * Remove a device from the pool. Currently, this supports removing only hot + * spares. + */ +int +spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) +{ + vdev_t *vd; + nvlist_t **spares, *nv, **newspares; + uint_t i, j, nspares; + int ret = 0; + + spa_config_enter(spa, RW_WRITER, FTAG); + + vd = spa_lookup_by_guid(spa, guid); + + nv = NULL; + if (spa->spa_spares != NULL && + nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + for (i = 0; i < nspares; i++) { + uint64_t theguid; + + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == guid) { + nv = spares[i]; + break; + } + } + } + + /* + * We only support removing a hot spare, and only if it's not currently + * in use in this pool. + */ + if (nv == NULL && vd == NULL) { + ret = ENOENT; + goto out; + } + + if (nv == NULL && vd != NULL) { + ret = ENOTSUP; + goto out; + } + + if (!unspare && nv != NULL && vd != NULL) { + ret = EBUSY; + goto out; + } + + if (nspares == 1) { + newspares = NULL; + } else { + newspares = kmem_alloc((nspares - 1) * sizeof (void *), + KM_SLEEP); + for (i = 0, j = 0; i < nspares; i++) { + if (spares[i] != nv) + VERIFY(nvlist_dup(spares[i], + &newspares[j++], KM_SLEEP) == 0); + } + } + + VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + newspares, nspares - 1) == 0); + for (i = 0; i < nspares - 1; i++) + nvlist_free(newspares[i]); + kmem_free(newspares, (nspares - 1) * sizeof (void *)); + spa_load_spares(spa); + spa->spa_sync_spares = B_TRUE; + +out: + spa_config_exit(spa, FTAG); + + return (ret); } /* @@ -1446,15 +1991,31 @@ spa_vdev_replace_done(spa_t *spa) { vdev_t *vd; + vdev_t *pvd; uint64_t guid; + uint64_t pguid = 0; spa_config_enter(spa, RW_READER, FTAG); while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { guid = vd->vdev_guid; + /* + * If we have just finished replacing a hot spared device, then + * we need to detach the parent's first child (the original hot + * spare) as well. + */ + pvd = vd->vdev_parent; + if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && + pvd->vdev_id == 0) { + ASSERT(pvd->vdev_ops == &vdev_replacing_ops); + ASSERT(pvd->vdev_parent->vdev_children == 2); + pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; + } spa_config_exit(spa, FTAG); if (spa_vdev_detach(spa, guid, B_TRUE) != 0) return; + if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) + return; spa_config_enter(spa, RW_READER, FTAG); } @@ -1475,8 +2036,36 @@ txg = spa_vdev_enter(spa); - if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { + /* + * Determine if this is a reference to a hot spare. In that + * case, update the path as stored in the spare list. + */ + nvlist_t **spares; + uint_t i, nspares; + if (spa->spa_sparelist != NULL) { + VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + for (i = 0; i < nspares; i++) { + uint64_t theguid; + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == guid) + break; + } + + if (i == nspares) + return (spa_vdev_exit(spa, NULL, txg, ENOENT)); + + VERIFY(nvlist_add_string(spares[i], + ZPOOL_CONFIG_PATH, newpath) == 0); + spa_load_spares(spa); + spa->spa_sync_spares = B_TRUE; + return (spa_vdev_exit(spa, NULL, txg, 0)); + } else { + return (spa_vdev_exit(spa, NULL, txg, ENOENT)); + } + } if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); @@ -2049,12 +2638,79 @@ } static void +spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) +{ + char *packed = NULL; + size_t nvsize = 0; + dmu_buf_t *db; + + VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); + + packed = kmem_alloc(nvsize, KM_SLEEP); + + VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, + KM_SLEEP) == 0); + + dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); + + kmem_free(packed, nvsize); + + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = nvsize; + dmu_buf_rele(db, FTAG); +} + +static void +spa_sync_spares(spa_t *spa, dmu_tx_t *tx) +{ + nvlist_t *nvroot; + nvlist_t **spares; + int i; + + if (!spa->spa_sync_spares) + return; + + /* + * Update the MOS nvlist describing the list of available spares. + * spa_validate_spares() will have already made sure this nvlist is + * valid and the vdevs are labelled appropriately. + */ + if (spa->spa_spares_object == 0) { + spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, 1 << 14, + DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); + VERIFY(zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, + sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (spa->spa_nspares == 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + NULL, 0) == 0); + } else { + spares = kmem_alloc(spa->spa_nspares * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_nspares; i++) + spares[i] = vdev_config_generate(spa, + spa->spa_spares[i], B_FALSE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + spares, spa->spa_nspares) == 0); + for (i = 0; i < spa->spa_nspares; i++) + nvlist_free(spares[i]); + kmem_free(spares, spa->spa_nspares * sizeof (void *)); + } + + spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); + + spa->spa_sync_spares = B_FALSE; +} + +static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; - char *packed = NULL; - size_t nvsize = 0; - dmu_buf_t *db; if (list_is_empty(&spa->spa_dirty_list)) return; @@ -2065,23 +2721,7 @@ nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; - VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); - - packed = kmem_alloc(nvsize, KM_SLEEP); - - VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, - KM_SLEEP) == 0); - - dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, - packed, tx); - - kmem_free(packed, nvsize); - - VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, - spa->spa_config_object, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = nvsize; - dmu_buf_rele(db, FTAG); + spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } /* @@ -2109,6 +2749,29 @@ VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); + tx = dmu_tx_create_assigned(dp, txg); + + /* + * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, + * set spa_deflate if we have no raid-z vdevs. + */ + if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && + spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { + int i; + + for (i = 0; i < rvd->vdev_children; i++) { + vd = rvd->vdev_child[i]; + if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) + break; + } + if (i == rvd->vdev_children) { + spa->spa_deflate = TRUE; + VERIFY(0 == zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx)); + } + } + /* * If anything has changed in this txg, push the deferred frees * from the previous txg. If not, leave them alone so that we @@ -2124,12 +2787,9 @@ do { spa->spa_sync_pass++; - tx = dmu_tx_create_assigned(dp, txg); spa_sync_config_object(spa, tx); - dmu_tx_commit(tx); - + spa_sync_spares(spa, tx); spa_errlog_sync(spa, txg); - dsl_pool_sync(dp, txg); dirty_vdevs = 0; @@ -2138,10 +2798,7 @@ dirty_vdevs++; } - tx = dmu_tx_create_assigned(dp, txg); bplist_sync(bpl, tx); - dmu_tx_commit(tx); - } while (dirty_vdevs); bplist_close(bpl); @@ -2175,6 +2832,8 @@ VERIFY(vdev_config_sync(rvd, txg) == 0); } + dmu_tx_commit(tx); + /* * Clear the dirty config list. */ @@ -2219,7 +2878,7 @@ /* * It had better be the case that we didn't dirty anything - * since spa_sync_labels(). + * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); @@ -2319,4 +2978,18 @@ vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, FTAG); + + txg_wait_synced(spa_get_dsl(spa), 0); } + +boolean_t +spa_has_spare(spa_t *spa, uint64_t guid) +{ + int i; + + for (i = 0; i < spa->spa_nspares; i++) + if (spa->spa_spares[i]->vdev_guid == guid) + return (B_TRUE); + + return (B_FALSE); +}