changeset 13049:2511753a6599

6782540 zpool cannot replace a replacing device
author Mark J Musante <Mark.Musante@Sun.COM>
date Fri, 06 Aug 2010 13:53:14 -0600
parents debbc43aae95
children 5279d6e2ec9e
files usr/src/cmd/zinject/zinject.c usr/src/cmd/zpool/zpool_main.c usr/src/grub/capability usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h usr/src/lib/libzfs/common/libzfs_pool.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/sys/vdev_impl.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/vdev_label.c usr/src/uts/common/sys/fs/zfs.h
diffstat 10 files changed, 180 insertions(+), 116 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/zinject/zinject.c	Fri Aug 06 12:31:49 2010 -0700
+++ b/usr/src/cmd/zinject/zinject.c	Fri Aug 06 13:53:14 2010 -0600
@@ -233,7 +233,7 @@
 	    "\t\tInject a fault into a particular device or the device's\n"
 	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n "
 	    "\t\t'pad1', or 'pad2'.\n"
-	    "\t\t'errno' can either be 'nxio' (the default) or 'io'.\n"
+	    "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
 	    "\n"
 	    "\tzinject -d device -A <degrade|fault> pool\n"
 	    "\t\tPerform a specific action on a particular device\n"
@@ -395,17 +395,25 @@
 static int
 print_all_handlers(void)
 {
-	int count = 0;
+	int count = 0, total = 0;
 
 	(void) iter_handlers(print_device_handler, &count);
-	(void) printf("\n");
-	count = 0;
+	if (count > 0) {
+		total += count;
+		(void) printf("\n");
+		count = 0;
+	}
+
 	(void) iter_handlers(print_data_handler, &count);
-	(void) printf("\n");
-	count = 0;
+	if (count > 0) {
+		total += count;
+		(void) printf("\n");
+		count = 0;
+	}
+
 	(void) iter_handlers(print_panic_handler, &count);
 
-	return (count);
+	return (count + total);
 }
 
 /* ARGSUSED */
@@ -627,6 +635,8 @@
 				error = ECKSUM;
 			} else if (strcasecmp(optarg, "nxio") == 0) {
 				error = ENXIO;
+			} else if (strcasecmp(optarg, "dtl") == 0) {
+				error = ECHILD;
 			} else {
 				(void) fprintf(stderr, "invalid error type "
 				    "'%s': must be 'io', 'checksum' or "
--- a/usr/src/cmd/zpool/zpool_main.c	Fri Aug 06 12:31:49 2010 -0700
+++ b/usr/src/cmd/zpool/zpool_main.c	Fri Aug 06 13:53:14 2010 -0600
@@ -4014,6 +4014,7 @@
 		    "performance\n"));
 		(void) printf(gettext(" 27  Improved snapshot creation "
 		    "performance\n"));
+		(void) printf(gettext(" 28  Multiple vdev replacements\n"));
 		(void) printf(gettext("\nFor more information on a particular "
 		    "version, including supported releases,\n"));
 		(void) printf(gettext("see the ZFS Administration Guide.\n\n"));
--- a/usr/src/grub/capability	Fri Aug 06 12:31:49 2010 -0700
+++ b/usr/src/grub/capability	Fri Aug 06 13:53:14 2010 -0600
@@ -39,7 +39,7 @@
 # This file and the associated version are Solaris specific and are
 # not a part of the open source distribution of GRUB.
 #
-VERSION=19
+VERSION=20
 dboot
 xVM
 zfs
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h	Fri Aug 06 12:31:49 2010 -0700
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h	Fri Aug 06 13:53:14 2010 -0600
@@ -26,7 +26,7 @@
 /*
  * On-disk version number.
  */
-#define	SPA_VERSION			27ULL
+#define	SPA_VERSION			28ULL
 
 /*
  * The following are configuration names used in the nvlist describing a pool's
--- a/usr/src/lib/libzfs/common/libzfs_pool.c	Fri Aug 06 12:31:49 2010 -0700
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c	Fri Aug 06 13:53:14 2010 -0600
@@ -997,13 +997,12 @@
 	char msg[1024];
 
 	if (zhp->zpool_state == POOL_STATE_ACTIVE &&
-	    (zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
-	    ZFS_TYPE_FILESYSTEM)) == NULL)
+	    (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
 		return (-1);
 
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 
-	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
 		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
 		    "cannot destroy '%s'"), zhp->zpool_name);
 
@@ -1086,7 +1085,7 @@
 		return (-1);
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 
-	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
 		switch (errno) {
 		case EBUSY:
 			/*
@@ -1578,7 +1577,7 @@
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 	zc.zc_cookie = func;
 
-	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 ||
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 ||
 	    (errno == ENOENT && func != POOL_SCAN_NONE))
 		return (0);
 
@@ -1670,26 +1669,17 @@
 	srchkey = nvpair_name(pair);
 
 	switch (nvpair_type(pair)) {
-	case DATA_TYPE_UINT64: {
-		uint64_t srchval, theguid, present;
-
-		verify(nvpair_value_uint64(pair, &srchval) == 0);
+	case DATA_TYPE_UINT64:
 		if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
-			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
-			    &present) == 0) {
-				/*
-				 * If the device has never been present since
-				 * import, the only reliable way to match the
-				 * vdev is by GUID.
-				 */
-				verify(nvlist_lookup_uint64(nv,
-				    ZPOOL_CONFIG_GUID, &theguid) == 0);
-				if (theguid == srchval)
-					return (nv);
-			}
+			uint64_t srchval, theguid;
+
+			verify(nvpair_value_uint64(pair, &srchval) == 0);
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+			    &theguid) == 0);
+			if (theguid == srchval)
+				return (nv);
 		}
 		break;
-	}
 
 	case DATA_TYPE_STRING: {
 		char *srchval, *val;
@@ -1871,6 +1861,8 @@
 	    &nvroot) == 0);
 
 	*avail_spare = B_FALSE;
+	*l2cache = B_FALSE;
+	*log = B_FALSE;
 	ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
 	nvlist_free(search);
 
@@ -2166,14 +2158,14 @@
 
 		if (wholedisk) {
 			pathname += strlen(DISK_ROOT) + 1;
-			(void) zpool_relabel_disk(zhp->zpool_hdl, pathname);
+			(void) zpool_relabel_disk(hdl, pathname);
 		}
 	}
 
 	zc.zc_cookie = VDEV_STATE_ONLINE;
 	zc.zc_obj = flags;
 
-	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
 		if (errno == EINVAL) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
 			    "from this pool into a new one.  Use '%s' "
@@ -2215,7 +2207,7 @@
 	zc.zc_cookie = VDEV_STATE_OFFLINE;
 	zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
 
-	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
 		return (0);
 
 	switch (errno) {
@@ -2255,7 +2247,7 @@
 	zc.zc_cookie = VDEV_STATE_FAULTED;
 	zc.zc_obj = aux;
 
-	if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
 		return (0);
 
 	switch (errno) {
@@ -2290,7 +2282,7 @@
 	zc.zc_cookie = VDEV_STATE_DEGRADED;
 	zc.zc_obj = aux;
 
-	if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
 		return (0);
 
 	return (zpool_standard_error(hdl, errno, msg));
@@ -2338,7 +2330,7 @@
 	nvlist_t *tgt;
 	boolean_t avail_spare, l2cache, islog;
 	uint64_t val;
-	char *path, *newname;
+	char *newname;
 	nvlist_t **child;
 	uint_t children;
 	nvlist_t *config_root;
@@ -2404,27 +2396,12 @@
 		return (zfs_error(hdl, EZFS_BADTARGET, msg));
 	}
 
-	/*
-	 * If we are attempting to replace a spare, it canot be applied to an
-	 * already spared device.
-	 */
-	if (replacing &&
-	    nvlist_lookup_string(child[0], ZPOOL_CONFIG_PATH, &path) == 0 &&
-	    zpool_find_vdev(zhp, newname, &avail_spare,
-	    &l2cache, NULL) != NULL && avail_spare &&
-	    is_replacing_spare(config_root, tgt, 0)) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "device has already been replaced with a spare"));
-		free(newname);
-		return (zfs_error(hdl, EZFS_BADTARGET, msg));
-	}
-
 	free(newname);
 
 	if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
 		return (-1);
 
-	ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ATTACH, &zc);
+	ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc);
 
 	zcmd_free_nvlists(&zc);
 
@@ -2447,9 +2424,16 @@
 		 * Can't attach to or replace this type of vdev.
 		 */
 		if (replacing) {
+			uint64_t version = zpool_get_prop_int(zhp,
+			    ZPOOL_PROP_VERSION, NULL);
+
 			if (islog)
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "cannot replace a log with a spare"));
+			else if (version >= SPA_VERSION_MULTI_REPLACE)
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "already in replacing/spare config; wait "
+				    "for completion or use 'zpool detach'"));
 			else
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "cannot replace a replacing device"));
@@ -2547,7 +2531,7 @@
 		 */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
 		    "applicable to mirror and replacing vdevs"));
-		(void) zfs_error(zhp->zpool_hdl, EZFS_BADTARGET, msg);
+		(void) zfs_error(hdl, EZFS_BADTARGET, msg);
 		break;
 
 	case EBUSY:
@@ -2908,7 +2892,7 @@
 	if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0)
 		return (-1);
 
-	if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, rewindnvl) != 0)
+	if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0)
 		return (-1);
 
 	while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 &&
--- a/usr/src/uts/common/fs/zfs/spa.c	Fri Aug 06 12:31:49 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Fri Aug 06 13:53:14 2010 -0600
@@ -116,6 +116,7 @@
 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport);
+static void spa_vdev_resilver_done(spa_t *spa);
 
 uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
 id_t		zio_taskq_psrset_bind = PS_NONE;
@@ -3226,7 +3227,8 @@
 	    !bvd->vdev_isspare) {
 		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
 		    "try booting from '%s'",
-		    bvd->vdev_parent->vdev_child[1]->vdev_path);
+		    bvd->vdev_parent->
+		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
 		error = EINVAL;
 		goto out;
 	}
@@ -3834,7 +3836,7 @@
 		 * spares.
 		 */
 		if (pvd->vdev_ops == &vdev_spare_ops &&
-		    pvd->vdev_child[1] == oldvd &&
+		    oldvd->vdev_isspare &&
 		    !spa_has_spare(spa, newvd->vdev_guid))
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
@@ -3846,13 +3848,15 @@
 		 * the same (spare replaces spare, non-spare replaces
 		 * non-spare).
 		 */
-		if (pvd->vdev_ops == &vdev_replacing_ops)
+		if (pvd->vdev_ops == &vdev_replacing_ops &&
+		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-		else if (pvd->vdev_ops == &vdev_spare_ops &&
-		    newvd->vdev_isspare != oldvd->vdev_isspare)
+		} else if (pvd->vdev_ops == &vdev_spare_ops &&
+		    newvd->vdev_isspare != oldvd->vdev_isspare) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-		else if (pvd->vdev_ops != &vdev_spare_ops &&
-		    newvd->vdev_isspare)
+		}
+
+		if (newvd->vdev_isspare)
 			pvops = &vdev_spare_ops;
 		else
 			pvops = &vdev_replacing_ops;
@@ -3887,6 +3891,9 @@
 		}
 	}
 
+	/* mark the device being resilvered */
+	newvd->vdev_resilvering = B_TRUE;
+
 	/*
 	 * If the parent is not a mirror, or if we're replacing, insert the new
 	 * mirror/replacing/spare vdev above oldvd.
@@ -3975,7 +3982,6 @@
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
 	uint64_t unspare_guid;
-	size_t len;
 	char *vdpath;
 
 	txg = spa_vdev_enter(spa);
@@ -4007,18 +4013,11 @@
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	/*
-	 * If replace_done is specified, only remove this device if it's
-	 * the first child of a replacing vdev.  For the 'spare' vdev, either
-	 * disk can be removed.
+	 * With replace_done, the parent must be a 'replacing' or 'spare' vdev.
 	 */
-	if (replace_done) {
-		if (pvd->vdev_ops == &vdev_replacing_ops) {
-			if (vd->vdev_id != 0)
-				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-		} else if (pvd->vdev_ops != &vdev_spare_ops) {
-			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-		}
-	}
+	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
+	    pvd->vdev_ops != &vdev_spare_ops)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
 	    spa_version(spa) >= SPA_VERSION_SPARES);
@@ -4045,16 +4044,22 @@
 	 * check to see if we changed the original vdev's path to have "/old"
 	 * at the end in spa_vdev_attach().  If so, undo that change now.
 	 */
-	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
-	    pvd->vdev_child[0]->vdev_path != NULL &&
-	    pvd->vdev_child[1]->vdev_path != NULL) {
-		ASSERT(pvd->vdev_child[1] == vd);
-		cvd = pvd->vdev_child[0];
-		len = strlen(vd->vdev_path);
-		if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
-		    strcmp(cvd->vdev_path + len, "/old") == 0) {
-			spa_strfree(cvd->vdev_path);
-			cvd->vdev_path = spa_strdup(vd->vdev_path);
+	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
+	    vd->vdev_path != NULL) {
+		size_t len = strlen(vd->vdev_path);
+
+		for (int c = 0; c < pvd->vdev_children; c++) {
+			cvd = pvd->vdev_child[c];
+
+			if (cvd == vd || cvd->vdev_path == NULL)
+				continue;
+
+			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
+			    strcmp(cvd->vdev_path + len, "/old") == 0) {
+				spa_strfree(cvd->vdev_path);
+				cvd->vdev_path = spa_strdup(vd->vdev_path);
+				break;
+			}
 		}
 	}
 
@@ -4064,7 +4069,8 @@
 	 * active spare list for the pool.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops &&
-	    vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
+	    vd->vdev_id == 0 &&
+	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
 		unspare = B_TRUE;
 
 	/*
@@ -4086,7 +4092,7 @@
 	/*
 	 * Remember one of the remaining children so we can get tvd below.
 	 */
-	cvd = pvd->vdev_child[0];
+	cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
@@ -4102,14 +4108,20 @@
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+		cvd->vdev_unspare = B_TRUE;
 	}
 
 	/*
 	 * If the parent mirror/replacing vdev only has one child,
 	 * the parent is no longer needed.  Remove it from the tree.
 	 */
-	if (pvd->vdev_children == 1)
+	if (pvd->vdev_children == 1) {
+		if (pvd->vdev_ops == &vdev_spare_ops)
+			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
+		cvd->vdev_resilvering = B_FALSE;
+	}
+
 
 	/*
 	 * We don't set tvd until now because the parent we just removed
@@ -4151,6 +4163,9 @@
 
 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 
+	/* hang on to the spa before we release the lock */
+	spa_open_ref(spa, FTAG);
+
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
@@ -4163,24 +4178,31 @@
 	 * list of every other pool.
 	 */
 	if (unspare) {
-		spa_t *myspa = spa;
-		spa = NULL;
+		spa_t *altspa = NULL;
+
 		mutex_enter(&spa_namespace_lock);
-		while ((spa = spa_next(spa)) != NULL) {
-			if (spa->spa_state != POOL_STATE_ACTIVE)
+		while ((altspa = spa_next(altspa)) != NULL) {
+			if (altspa->spa_state != POOL_STATE_ACTIVE ||
+			    altspa == spa)
 				continue;
-			if (spa == myspa)
-				continue;
-			spa_open_ref(spa, FTAG);
+
+			spa_open_ref(altspa, FTAG);
 			mutex_exit(&spa_namespace_lock);
-			(void) spa_vdev_remove(spa, unspare_guid,
-			    B_TRUE);
+			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
 			mutex_enter(&spa_namespace_lock);
-			spa_close(spa, FTAG);
+			spa_close(altspa, FTAG);
 		}
 		mutex_exit(&spa_namespace_lock);
+
+		/* search the rest of the vdevs for spares to remove */
+		spa_vdev_resilver_done(spa);
 	}
 
+	/* all done with the spa; OK to release */
+	mutex_enter(&spa_namespace_lock);
+	spa_close(spa, FTAG);
+	mutex_exit(&spa_namespace_lock);
+
 	return (error);
 }
 
@@ -4728,11 +4750,18 @@
 	}
 
 	/*
-	 * Check for a completed replacement.
+	 * Check for a completed replacement.  We always consider the first
+	 * vdev in the list to be the oldest vdev, and the last one to be
+	 * the newest (see spa_vdev_attach() for how that works).  In
+	 * the case where the newest vdev is faulted, we will not automatically
+	 * remove it after a resilver completes.  This is OK as it will require
+	 * user intervention to determine which disk the admin wishes to keep.
 	 */
-	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+	if (vd->vdev_ops == &vdev_replacing_ops) {
+		ASSERT(vd->vdev_children > 1);
+
+		newvd = vd->vdev_child[vd->vdev_children - 1];
 		oldvd = vd->vdev_child[0];
-		newvd = vd->vdev_child[1];
 
 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
@@ -4743,16 +4772,41 @@
 	/*
 	 * Check for a completed resilver with the 'unspare' flag set.
 	 */
-	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
-		newvd = vd->vdev_child[0];
-		oldvd = vd->vdev_child[1];
-
-		if (newvd->vdev_unspare &&
+	if (vd->vdev_ops == &vdev_spare_ops) {
+		vdev_t *first = vd->vdev_child[0];
+		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
+
+		if (last->vdev_unspare) {
+			oldvd = first;
+			newvd = last;
+		} else if (first->vdev_unspare) {
+			oldvd = last;
+			newvd = first;
+		} else {
+			oldvd = NULL;
+		}
+
+		if (oldvd != NULL &&
 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
-		    !vdev_dtl_required(oldvd)) {
-			newvd->vdev_unspare = 0;
+		    !vdev_dtl_required(oldvd))
 			return (oldvd);
+
+		/*
+		 * If there are two or more spares attached to a disk,
+		 * and those spares are not required, then we want to
+		 * attempt to free them up now so that they can be used
+		 * by other pools.  Once we're back down to a single
+		 * disk+spare, we stop removing them.
+		 */
+		if (vd->vdev_children > 2) {
+			newvd = vd->vdev_child[1];
+
+			if (newvd->vdev_isspare && last->vdev_isspare &&
+			    vdev_dtl_empty(last, DTL_MISSING) &&
+			    vdev_dtl_empty(last, DTL_OUTAGE) &&
+			    !vdev_dtl_required(newvd))
+				return (newvd);
 		}
 	}
 
@@ -4779,9 +4833,9 @@
 		 * we need to detach the parent's first child (the original hot
 		 * spare) as well.
 		 */
-		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
+		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
+		    ppvd->vdev_children == 2) {
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
-			ASSERT(ppvd->vdev_children == 2);
 			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
 		spa_config_exit(spa, SCL_ALL, FTAG);
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Fri Aug 06 12:31:49 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Fri Aug 06 13:53:14 2010 -0600
@@ -169,6 +169,7 @@
 	uint64_t	vdev_faulted;	/* persistent faulted state	*/
 	uint64_t	vdev_degraded;	/* persistent degraded state	*/
 	uint64_t	vdev_removed;	/* persistent removed state	*/
+	uint64_t	vdev_resilvering; /* persistent resilvering state */
 	uint64_t	vdev_nparity;	/* number of parity devices for raidz */
 	char		*vdev_path;	/* vdev path (if any)		*/
 	char		*vdev_devid;	/* vdev devid (if any)		*/
--- a/usr/src/uts/common/fs/zfs/vdev.c	Fri Aug 06 12:31:49 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Fri Aug 06 13:53:14 2010 -0600
@@ -518,6 +518,9 @@
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
 
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+		    &vd->vdev_resilvering);
+
 		/*
 		 * When importing a pool, we want to ignore the persistent fault
 		 * state, as the diagnosis made on another system may not be
@@ -1849,6 +1852,9 @@
 	vd->vdev_cant_read = cant_read;
 	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
 
+	if (!required && zio_injection_enabled)
+		required = !!zio_handle_device_injection(vd, NULL, ECHILD);
+
 	return (required);
 }
 
@@ -2940,12 +2946,13 @@
 		vd->vdev_removed = B_TRUE;
 	} else if (state == VDEV_STATE_CANT_OPEN) {
 		/*
-		 * If we fail to open a vdev during an import, we mark it as
-		 * "not available", which signifies that it was never there to
-		 * begin with.  Failure to open such a device is not considered
-		 * an error.
+		 * If we fail to open a vdev during an import or recovery, we
+		 * mark it as "not available", which signifies that it was
+		 * never there to begin with.  Failure to open such a device
+		 * is not considered an error.
 		 */
-		if (spa_load_state(spa) == SPA_LOAD_IMPORT &&
+		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
+		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
--- a/usr/src/uts/common/fs/zfs/vdev_label.c	Fri Aug 06 12:31:49 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c	Fri Aug 06 13:53:14 2010 -0600
@@ -353,6 +353,9 @@
 		if (vd->vdev_offline && !vd->vdev_tmpoffline)
 			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 			    B_TRUE) == 0);
+		if (vd->vdev_resilvering)
+			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+			    B_TRUE) == 0);
 		if (vd->vdev_faulted)
 			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
 			    B_TRUE) == 0);
--- a/usr/src/uts/common/sys/fs/zfs.h	Fri Aug 06 12:31:49 2010 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h	Fri Aug 06 13:53:14 2010 -0600
@@ -336,14 +336,16 @@
 #define	SPA_VERSION_25			25ULL
 #define	SPA_VERSION_26			26ULL
 #define	SPA_VERSION_27			27ULL
+#define	SPA_VERSION_28			28ULL
+
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
-#define	SPA_VERSION			SPA_VERSION_27
-#define	SPA_VERSION_STRING		"27"
+#define	SPA_VERSION			SPA_VERSION_28
+#define	SPA_VERSION_STRING		"28"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -393,6 +395,7 @@
 #define	SPA_VERSION_DIR_CLONES		SPA_VERSION_26
 #define	SPA_VERSION_DEADLISTS		SPA_VERSION_26
 #define	SPA_VERSION_FAST_SNAP		SPA_VERSION_27
+#define	SPA_VERSION_MULTI_REPLACE	SPA_VERSION_28
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
@@ -483,6 +486,7 @@
 #define	ZPOOL_CONFIG_SPLIT_GUID		"split_guid"
 #define	ZPOOL_CONFIG_SPLIT_LIST		"guid_list"
 #define	ZPOOL_CONFIG_REMOVING		"removing"
+#define	ZPOOL_CONFIG_RESILVERING	"resilvering"
 #define	ZPOOL_CONFIG_SUSPENDED		"suspended"	/* not stored on disk */
 #define	ZPOOL_CONFIG_TIMESTAMP		"timestamp"	/* not stored on disk */
 #define	ZPOOL_CONFIG_BOOTFS		"bootfs"	/* not stored on disk */