annotate usr/src/uts/common/fs/zfs/spa.c @ 1585:4ad213e858a9
6395480 ztest ASSERT: rbt.bt_objset == wbt.bt_objset, line 2041
6395485 ensure the gu in vdev_guid
6395487 config cache can become stale relative to mosconfig
6395488 vdev addition must sync to config cache before allocation begins
author | bonwick
date | Thu, 09 Mar 2006 16:56:05 -0800
parents | 938876158511
children | 438b928f80c7

rev | line source
789 | 1 /* |
2 * CDDL HEADER START | |
3 * | |
4 * The contents of this file are subject to the terms of the | |
1544 | 5 * Common Development and Distribution License (the "License"). |
6 * You may not use this file except in compliance with the License. | |
789 | 7 * |
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 * or http://www.opensolaris.org/os/licensing. | |
10 * See the License for the specific language governing permissions | |
11 * and limitations under the License. | |
12 * | |
13 * When distributing Covered Code, include this CDDL HEADER in each | |
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 * If applicable, add the following below this CDDL HEADER, with the | |
16 * fields enclosed by brackets "[]" replaced with your own identifying | |
17 * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 * | |
19 * CDDL HEADER END | |
20 */ | |
21 /* | |
1354 | 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. |
789 | 23 * Use is subject to license terms. |
24 */ | |
25 | |
26 #pragma ident "%Z%%M% %I% %E% SMI" | |
27 | |
28 /* | |
29 * This file contains all the routines used when modifying on-disk SPA state. | |
30 * This includes opening, importing, destroying, exporting a pool, and syncing a | |
31 * pool. | |
32 */ | |
33 | |
34 #include <sys/zfs_context.h> | |
1544 | 35 #include <sys/fm/fs/zfs.h> |
789 | 36 #include <sys/spa_impl.h> |
37 #include <sys/zio.h> | |
38 #include <sys/zio_checksum.h> | |
39 #include <sys/zio_compress.h> | |
40 #include <sys/dmu.h> | |
41 #include <sys/dmu_tx.h> | |
42 #include <sys/zap.h> | |
43 #include <sys/zil.h> | |
44 #include <sys/vdev_impl.h> | |
45 #include <sys/metaslab.h> | |
46 #include <sys/uberblock_impl.h> | |
47 #include <sys/txg.h> | |
48 #include <sys/avl.h> | |
49 #include <sys/dmu_traverse.h> | |
50 #include <sys/unique.h> | |
51 #include <sys/dsl_pool.h> | |
52 #include <sys/dsl_dir.h> | |
53 #include <sys/dsl_prop.h> | |
54 #include <sys/fs/zfs.h> | |
55 #include <sys/callb.h> | |
56 | |
57 static uint32_t spa_active_count; | |
58 | |
59 /* | |
60 * ========================================================================== | |
61 * SPA state manipulation (open/create/destroy/import/export) | |
62 * ========================================================================== | |
63 */ | |
64 | |
1544 | 65 static int |
66 spa_error_entry_compare(const void *a, const void *b) | |
67 { | |
68 spa_error_entry_t *sa = (spa_error_entry_t *)a; | |
69 spa_error_entry_t *sb = (spa_error_entry_t *)b; | |
70 int ret; | |
71 | |
72 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, | |
73 sizeof (zbookmark_t)); | |
74 | |
75 if (ret < 0) | |
76 return (-1); | |
77 else if (ret > 0) | |
78 return (1); | |
79 else | |
80 return (0); | |
81 } | |
82 | |
83 /* | |
84 * Utility function which retrieves copies of the current logs and | |
85 * re-initializes them in the process. | |
86 */ | |
87 void | |
88 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) | |
89 { | |
90 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); | |
91 | |
92 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); | |
93 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); | |
94 | |
95 avl_create(&spa->spa_errlist_scrub, | |
96 spa_error_entry_compare, sizeof (spa_error_entry_t), | |
97 offsetof(spa_error_entry_t, se_avl)); | |
98 avl_create(&spa->spa_errlist_last, | |
99 spa_error_entry_compare, sizeof (spa_error_entry_t), | |
100 offsetof(spa_error_entry_t, se_avl)); | |
101 } | |
102 | |
789 | 103 /* |
104 * Activate an uninitialized pool. | |
105 */ | |
106 static void | |
107 spa_activate(spa_t *spa) | |
108 { | |
109 int t; | |
110 | |
111 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); | |
112 | |
113 spa->spa_state = POOL_STATE_ACTIVE; | |
114 | |
115 spa->spa_normal_class = metaslab_class_create(); | |
116 | |
117 for (t = 0; t < ZIO_TYPES; t++) { | |
118 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", | |
119 8, maxclsyspri, 50, INT_MAX, | |
120 TASKQ_PREPOPULATE); | |
121 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", | |
122 8, maxclsyspri, 50, INT_MAX, | |
123 TASKQ_PREPOPULATE); | |
124 } | |
125 | |
126 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); | |
127 | |
128 list_create(&spa->spa_dirty_list, sizeof (vdev_t), | |
129 offsetof(vdev_t, vdev_dirty_node)); | |
130 | |
131 txg_list_create(&spa->spa_vdev_txg_list, | |
132 offsetof(struct vdev, vdev_txg_node)); | |
1544 | 133 |
134 avl_create(&spa->spa_errlist_scrub, | |
135 spa_error_entry_compare, sizeof (spa_error_entry_t), | |
136 offsetof(spa_error_entry_t, se_avl)); | |
137 avl_create(&spa->spa_errlist_last, | |
138 spa_error_entry_compare, sizeof (spa_error_entry_t), | |
139 offsetof(spa_error_entry_t, se_avl)); | |
789 | 140 } |
141 | |
142 /* | |
143 * Opposite of spa_activate(). | |
144 */ | |
145 static void | |
146 spa_deactivate(spa_t *spa) | |
147 { | |
148 int t; | |
149 | |
150 ASSERT(spa->spa_sync_on == B_FALSE); | |
151 ASSERT(spa->spa_dsl_pool == NULL); | |
152 ASSERT(spa->spa_root_vdev == NULL); | |
153 | |
154 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); | |
155 | |
156 txg_list_destroy(&spa->spa_vdev_txg_list); | |
157 | |
158 list_destroy(&spa->spa_dirty_list); | |
159 | |
160 rw_destroy(&spa->spa_traverse_lock); | |
161 | |
162 for (t = 0; t < ZIO_TYPES; t++) { | |
163 taskq_destroy(spa->spa_zio_issue_taskq[t]); | |
164 taskq_destroy(spa->spa_zio_intr_taskq[t]); | |
165 spa->spa_zio_issue_taskq[t] = NULL; | |
166 spa->spa_zio_intr_taskq[t] = NULL; | |
167 } | |
168 | |
169 metaslab_class_destroy(spa->spa_normal_class); | |
170 spa->spa_normal_class = NULL; | |
171 | |
1544 | 172 /* |
173 * If this was part of an import or the open otherwise failed, we may | |
174 * still have errors left in the queues. Empty them just in case. | |
175 */ | |
176 spa_errlog_drain(spa); | |
177 | |
178 avl_destroy(&spa->spa_errlist_scrub); | |
179 avl_destroy(&spa->spa_errlist_last); | |
180 | |
789 | 181 spa->spa_state = POOL_STATE_UNINITIALIZED; |
182 } | |
183 | |
184 /* | |
185 * Verify a pool configuration, and construct the vdev tree appropriately. This | |
186 * will create all the necessary vdevs in the appropriate layout, with each vdev | |
187 * in the CLOSED state. This will prep the pool before open/creation/import. | |
188 * All vdev validation is done by the vdev_alloc() routine. | |
189 */ | |
190 static vdev_t * | |
191 spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) | |
192 { | |
193 nvlist_t **child; | |
194 uint_t c, children; | |
195 vdev_t *vd; | |
196 | |
197 if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) | |
198 return (NULL); | |
199 | |
200 if (vd->vdev_ops->vdev_op_leaf) | |
201 return (vd); | |
202 | |
203 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, | |
204 &child, &children) != 0) { | |
205 vdev_free(vd); | |
206 return (NULL); | |
207 } | |
208 | |
209 for (c = 0; c < children; c++) { | |
210 if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { | |
211 vdev_free(vd); | |
212 return (NULL); | |
213 } | |
214 } | |
215 | |
216 return (vd); | |
217 } | |
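The nvlist that spa_config_parse() consumes mirrors the on-disk config: a root vdev nvlist whose ZPOOL_CONFIG_CHILDREN array nests one level per interior vdev, with leaves carrying a type and path. A minimal sketch of building one (device path hypothetical, error handling elided):

    nvlist_t *leaf, *root;

    /* leaf vdev: a plain disk */
    VERIFY(nvlist_alloc(&leaf, NV_UNIQUE_NAME, KM_SLEEP) == 0);
    VERIFY(nvlist_add_string(leaf, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) == 0);
    VERIFY(nvlist_add_string(leaf, ZPOOL_CONFIG_PATH,
        "/dev/dsk/c0t0d0s0") == 0);	/* hypothetical path */

    /* root vdev: children[] holds the top-level vdevs */
    VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
    VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
    VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN, &leaf, 1) == 0);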
218 | |
219 /* | |
220 * Opposite of spa_load(). | |
221 */ | |
222 static void | |
223 spa_unload(spa_t *spa) | |
224 { | |
225 /* | |
1544 | 226 * Stop async tasks. |
227 */ | |
228 spa_async_suspend(spa); | |
229 | |
230 /* | |
789 | 231 * Stop syncing. |
232 */ | |
233 if (spa->spa_sync_on) { | |
234 txg_sync_stop(spa->spa_dsl_pool); | |
235 spa->spa_sync_on = B_FALSE; | |
236 } | |
237 | |
238 /* | |
239 * Wait for any outstanding prefetch I/O to complete. | |
240 */ | |
1544 | 241 spa_config_enter(spa, RW_WRITER, FTAG); |
242 spa_config_exit(spa, FTAG); | |
789 | 243 |
244 /* | |
245 * Close the dsl pool. | |
246 */ | |
247 if (spa->spa_dsl_pool) { | |
248 dsl_pool_close(spa->spa_dsl_pool); | |
249 spa->spa_dsl_pool = NULL; | |
250 } | |
251 | |
252 /* | |
253 * Close all vdevs. | |
254 */ | |
1585 | 255 if (spa->spa_root_vdev) |
789 | 256 vdev_free(spa->spa_root_vdev); |
1585 | 257 ASSERT(spa->spa_root_vdev == NULL); |
1544 | 258 |
259 spa->spa_async_suspended = 0; | |
789 | 260 } |
261 | |
262 /* | |
263 * Load an existing storage pool, using the pool's builtin spa_config as a | |
1544 | 264 * source of configuration information. |
789 | 265 */ |
266 static int | |
1544 | 267 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) |
789 | 268 { |
269 int error = 0; | |
1585 | 270 uint64_t config_cache_txg = spa->spa_config_txg; |
789 | 271 nvlist_t *nvroot = NULL; |
272 vdev_t *rvd; | |
273 uberblock_t *ub = &spa->spa_uberblock; | |
274 uint64_t pool_guid; | |
275 zio_t *zio; | |
276 | |
1544 | 277 spa->spa_load_state = state; |
789 | 278 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || |
1544 | 279 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { |
280 error = EINVAL; | |
281 goto out; | |
282 } | |
789 | 283 |
284 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, | |
285 &spa->spa_config_txg); | |
286 | |
1544 | 287 if ((spa->spa_load_state == SPA_LOAD_IMPORT || |
288 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && | |
289 spa_guid_exists(pool_guid, 0)) { | |
290 error = EEXIST; | |
291 goto out; | |
292 } | |
789 | 293 |
294 /* | |
295 * Parse the configuration into a vdev tree. | |
296 */ | |
1544 | 297 spa_config_enter(spa, RW_WRITER, FTAG); |
789 | 298 rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); |
1544 | 299 spa_config_exit(spa, FTAG); |
789 | 300 |
1544 | 301 if (rvd == NULL) { |
302 error = EINVAL; | |
303 goto out; | |
304 } | |
789 | 305 |
1585 | 306 ASSERT(spa->spa_root_vdev == rvd); |
789 | 307 ASSERT(spa_guid(spa) == pool_guid); |
308 | |
309 /* | |
310 * Try to open all vdevs, loading each label in the process. | |
311 */ | |
1544 | 312 if (vdev_open(rvd) != 0) { |
313 error = ENXIO; | |
314 goto out; | |
315 } | |
789 | 316 |
317 /* | |
318 * Find the best uberblock. | |
319 */ | |
320 bzero(ub, sizeof (uberblock_t)); | |
321 | |
322 zio = zio_root(spa, NULL, NULL, | |
323 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); | |
324 vdev_uberblock_load(zio, rvd, ub); | |
325 error = zio_wait(zio); | |
326 | |
327 /* | |
328 * If we weren't able to find a single valid uberblock, return failure. | |
329 */ | |
330 if (ub->ub_txg == 0) { | |
1544 | 331 error = ENXIO; |
332 goto out; | |
333 } | |
334 | |
335 /* | |
336 * If the pool is newer than the code, we can't open it. | |
337 */ | |
338 if (ub->ub_version > UBERBLOCK_VERSION) { | |
339 error = ENOTSUP; | |
340 goto out; | |
789 | 341 } |
342 | |
343 /* | |
344 * If the vdev guid sum doesn't match the uberblock, we have an | |
345 * incomplete configuration. | |
346 */ | |
347 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { | |
1544 | 348 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, |
349 VDEV_AUX_BAD_GUID_SUM); | |
350 error = ENXIO; | |
351 goto out; | |
789 | 352 } |
353 | |
354 /* | |
355 * Initialize internal SPA structures. | |
356 */ | |
357 spa->spa_state = POOL_STATE_ACTIVE; | |
358 spa->spa_ubsync = spa->spa_uberblock; | |
359 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; | |
1544 | 360 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); |
361 if (error) { | |
362 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, | |
363 VDEV_AUX_CORRUPT_DATA); | |
364 goto out; | |
365 } | |
789 | 366 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; |
367 | |
1544 | 368 if (zap_lookup(spa->spa_meta_objset, |
789 | 369 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, |
1544 | 370 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { |
371 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, | |
372 VDEV_AUX_CORRUPT_DATA); | |
373 error = EIO; | |
374 goto out; | |
375 } | |
789 | 376 |
377 if (!mosconfig) { | |
378 dmu_buf_t *db; | |
379 char *packed = NULL; | |
380 size_t nvsize = 0; | |
381 nvlist_t *newconfig = NULL; | |
382 | |
1544 | 383 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, |
384 spa->spa_config_object, FTAG, &db)); | |
789 | 385 nvsize = *(uint64_t *)db->db_data; |
1544 | 386 dmu_buf_rele(db, FTAG); |
789 | 387 |
388 packed = kmem_alloc(nvsize, KM_SLEEP); | |
1544 | 389 error = dmu_read(spa->spa_meta_objset, |
789 | 390 spa->spa_config_object, 0, nvsize, packed); |
391 if (error == 0) | |
392 error = nvlist_unpack(packed, nvsize, &newconfig, 0); | |
393 kmem_free(packed, nvsize); | |
394 | |
1544 | 395 if (error) { |
396 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, | |
397 VDEV_AUX_CORRUPT_DATA); | |
398 error = EIO; | |
399 goto out; | |
400 } | |
789 | 401 |
402 spa_config_set(spa, newconfig); | |
403 | |
404 spa_unload(spa); | |
405 spa_deactivate(spa); | |
406 spa_activate(spa); | |
407 | |
1544 | 408 return (spa_load(spa, newconfig, state, B_TRUE)); |
409 } | |
410 | |
411 if (zap_lookup(spa->spa_meta_objset, | |
412 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, | |
413 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { | |
414 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, | |
415 VDEV_AUX_CORRUPT_DATA); | |
416 error = EIO; | |
417 goto out; | |
789 | 418 } |
419 | |
1544 | 420 /* |
421 * Load the persistent error log. If we have an older pool, this will | |
422 * not be present. | |
423 */ | |
424 error = zap_lookup(spa->spa_meta_objset, | |
425 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, | |
426 sizeof (uint64_t), 1, &spa->spa_errlog_last); | |
427 if (error != 0 && error != ENOENT) { | |
428 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, | |
429 VDEV_AUX_CORRUPT_DATA); | |
430 error = EIO; | |
431 goto out; | |
432 } | |
433 | |
434 error = zap_lookup(spa->spa_meta_objset, | |
435 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, | |
436 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); | |
437 if (error != 0 && error != ENOENT) { | |
438 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, | |
439 VDEV_AUX_CORRUPT_DATA); | |
440 error = EIO; | |
441 goto out; | |
442 } | |
789 | 443 |
444 /* | |
1544 | 445 * Load the vdev state for all top level vdevs. We need to grab the |
446 * config lock because all label I/O is done with the | |
447 * ZIO_FLAG_CONFIG_HELD flag. | |
789 | 448 */ |
1544 | 449 spa_config_enter(spa, RW_READER, FTAG); |
450 if ((error = vdev_load(rvd)) != 0) { | |
451 spa_config_exit(spa, FTAG); | |
452 goto out; | |
453 } | |
454 spa_config_exit(spa, FTAG); | |
789 | 455 |
456 /* | |
457 * Propagate the leaf DTLs we just loaded all the way up the tree. | |
458 */ | |
1544 | 459 spa_config_enter(spa, RW_WRITER, FTAG); |
789 | 460 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); |
1544 | 461 spa_config_exit(spa, FTAG); |
789 | 462 |
463 /* | |
464 * Check the state of the root vdev. If it can't be opened, it | |
465 * indicates one or more toplevel vdevs are faulted. | |
466 */ | |
1544 | 467 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { |
468 error = ENXIO; | |
469 goto out; | |
470 } | |
789 | 471 |
472 /* | |
473 * Claim log blocks that haven't been committed yet, and update all | |
474 * top-level vdevs to sync any config changes found in vdev_load(). | |
475 * This must all happen in a single txg. | |
476 */ | |
1544 | 477 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { |
1585 | 478 int c; |
789 | 479 dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), |
480 spa_first_txg(spa)); | |
481 dmu_objset_find(spa->spa_name, zil_claim, tx, 0); | |
482 vdev_config_dirty(rvd); | |
483 dmu_tx_commit(tx); | |
484 | |
485 spa->spa_sync_on = B_TRUE; | |
486 txg_sync_start(spa->spa_dsl_pool); | |
487 | |
488 /* | |
489 * Wait for all claims to sync. | |
490 */ | |
491 txg_wait_synced(spa->spa_dsl_pool, 0); | |
1585 | 492 |
493 /* | |
494 * If the config cache is stale relative to the mosconfig, | |
495 * sync the config cache. | |
496 */ | |
497 if (config_cache_txg != spa->spa_config_txg) | |
498 spa_config_sync(); | |
499 | |
500 /* | |
501 * If we have top-level vdevs that were added but have | |
502 * not yet been prepared for allocation, do that now. | |
503 * (It's safe now because the config cache is up to date, | |
504 * so it will be able to translate the new DVAs.) | |
505 * See comments in spa_vdev_add() for full details. | |
506 */ | |
507 for (c = 0; c < rvd->vdev_children; c++) { | |
508 vdev_t *tvd = rvd->vdev_child[c]; | |
509 if (tvd->vdev_ms_array == 0) { | |
510 uint64_t txg = spa_last_synced_txg(spa) + 1; | |
511 ASSERT(tvd->vdev_ms_shift == 0); | |
512 spa_config_enter(spa, RW_WRITER, FTAG); | |
513 vdev_init(tvd, txg); | |
514 vdev_config_dirty(tvd); | |
515 spa_config_set(spa, | |
516 spa_config_generate(spa, rvd, txg, 0)); | |
517 spa_config_exit(spa, FTAG); | |
518 txg_wait_synced(spa->spa_dsl_pool, txg); | |
519 ASSERT(tvd->vdev_ms_shift != 0); | |
520 ASSERT(tvd->vdev_ms_array != 0); | |
521 spa_config_sync(); | |
522 } | |
523 } | |
789 | 524 } |
525 | |
1544 | 526 error = 0; |
527 out: | |
528 if (error) | |
529 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); | |
530 spa->spa_load_state = SPA_LOAD_NONE; | |
531 spa->spa_ena = 0; | |
532 | |
533 return (error); | |
789 | 534 } |
535 | |
536 /* | |
537 * Pool Open/Import | |
538 * | |
539 * The import case is identical to an open except that the configuration is sent | |
540 * down from userland, instead of grabbed from the configuration cache. For the | |
541 * case of an open, the pool configuration will exist in the | |
542 * POOL_STATE_UNINITIALIZED state. | |
543 * | |
544 * The stats information (gen/count/ustats) is used to gather vdev statistics at | |
545 * the same time we open the pool, without having to keep around the spa_t in some | |
546 * ambiguous state. | |
547 */ | |
548 static int | |
549 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) | |
550 { | |
551 spa_t *spa; | |
552 int error; | |
553 int loaded = B_FALSE; | |
554 int locked = B_FALSE; | |
555 | |
556 *spapp = NULL; | |
557 | |
558 /* | |
559 * As disgusting as this is, we need to support recursive calls to this | |
560 * function because dsl_dir_open() is called during spa_load(), and ends | |
561 * up calling spa_open() again. The real fix is to figure out how to | |
562 * avoid dsl_dir_open() calling this in the first place. | |
563 */ | |
564 if (mutex_owner(&spa_namespace_lock) != curthread) { | |
565 mutex_enter(&spa_namespace_lock); | |
566 locked = B_TRUE; | |
567 } | |
568 | |
569 if ((spa = spa_lookup(pool)) == NULL) { | |
570 if (locked) | |
571 mutex_exit(&spa_namespace_lock); | |
572 return (ENOENT); | |
573 } | |
574 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { | |
575 | |
576 spa_activate(spa); | |
577 | |
578 error = spa_load(spa, spa->spa_config, | |
1544 | 579 SPA_LOAD_OPEN, B_FALSE); |
789 | 580 |
581 if (error == EBADF) { | |
582 /* | |
583 * If vdev_load() returns EBADF, it indicates that one | |
584 * of the vdevs indicates that the pool has been | |
585 * exported or destroyed. If this is the case, the | |
586 * config cache is out of sync and we should remove the | |
587 * pool from the namespace. | |
588 */ | |
589 spa_unload(spa); | |
590 spa_deactivate(spa); | |
591 spa_remove(spa); | |
592 spa_config_sync(); | |
593 if (locked) | |
594 mutex_exit(&spa_namespace_lock); | |
595 return (ENOENT); | |
1544 | 596 } |
597 | |
598 if (error) { | |
789 | 599 /* |
600 * We can't open the pool, but we still have useful | |
601 * information: the state of each vdev after the | |
602 * attempted vdev_open(). Return this to the user. | |
603 */ | |
604 if (config != NULL && spa->spa_root_vdev != NULL) | |
605 *config = spa_config_generate(spa, NULL, -1ULL, | |
606 B_TRUE); | |
607 spa_unload(spa); | |
608 spa_deactivate(spa); | |
1544 | 609 spa->spa_last_open_failed = B_TRUE; |
789 | 610 if (locked) |
611 mutex_exit(&spa_namespace_lock); | |
612 *spapp = NULL; | |
613 return (error); | |
1544 | 614 } else { |
615 zfs_post_ok(spa, NULL); | |
616 spa->spa_last_open_failed = B_FALSE; | |
789 | 617 } |
618 | |
619 loaded = B_TRUE; | |
620 } | |
621 | |
622 spa_open_ref(spa, tag); | |
623 if (locked) | |
624 mutex_exit(&spa_namespace_lock); | |
625 | |
626 *spapp = spa; | |
627 | |
628 if (config != NULL) { | |
1544 | 629 spa_config_enter(spa, RW_READER, FTAG); |
789 | 630 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); |
1544 | 631 spa_config_exit(spa, FTAG); |
789 | 632 } |
633 | |
634 /* | |
635 * If we just loaded the pool, resilver anything that's out of date. | |
636 */ | |
637 if (loaded && (spa_mode & FWRITE)) | |
638 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); | |
639 | |
640 return (0); | |
641 } | |
642 | |
643 int | |
644 spa_open(const char *name, spa_t **spapp, void *tag) | |
645 { | |
646 return (spa_open_common(name, spapp, tag, NULL)); | |
647 } | |
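A minimal caller-side sketch of the open/close pairing (pool name hypothetical); the reference taken via spa_open_ref() above is what blocks a concurrent export or destroy:

    spa_t *spa;

    if (spa_open("tank", &spa, FTAG) == 0) {
        /* the hold keeps the pool from being exported or destroyed */
        spa_close(spa, FTAG);
    }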
648 | |
1544 | 649 /* |
650 * Lookup the given spa_t, incrementing the inject count in the process, | |
651 * preventing it from being exported or destroyed. | |
652 */ | |
653 spa_t * | |
654 spa_inject_addref(char *name) | |
655 { | |
656 spa_t *spa; | |
657 | |
658 mutex_enter(&spa_namespace_lock); | |
659 if ((spa = spa_lookup(name)) == NULL) { | |
660 mutex_exit(&spa_namespace_lock); | |
661 return (NULL); | |
662 } | |
663 spa->spa_inject_ref++; | |
664 mutex_exit(&spa_namespace_lock); | |
665 | |
666 return (spa); | |
667 } | |
668 | |
669 void | |
670 spa_inject_delref(spa_t *spa) | |
671 { | |
672 mutex_enter(&spa_namespace_lock); | |
673 spa->spa_inject_ref--; | |
674 mutex_exit(&spa_namespace_lock); | |
675 } | |
676 | |
789 | 677 int |
1544 | 678 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) |
789 | 679 { |
680 int error; | |
681 spa_t *spa; | |
682 | |
683 *config = NULL; | |
684 error = spa_open_common(name, &spa, FTAG, config); | |
685 | |
1544 | 686 if (spa && *config != NULL) |
687 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, | |
688 spa_get_errlog_size(spa)) == 0); | |
689 | |
690 /* | |
691 * We want to get the alternate root even for faulted pools, so we cheat | |
692 * and call spa_lookup() directly. | |
693 */ | |
694 if (altroot) { | |
695 if (spa == NULL) { | |
696 mutex_enter(&spa_namespace_lock); | |
697 spa = spa_lookup(name); | |
698 if (spa) | |
699 spa_altroot(spa, altroot, buflen); | |
700 else | |
701 altroot[0] = '\0'; | |
702 spa = NULL; | |
703 mutex_exit(&spa_namespace_lock); | |
704 } else { | |
705 spa_altroot(spa, altroot, buflen); | |
706 } | |
707 } | |
708 | |
789 | 709 if (spa != NULL) |
710 spa_close(spa, FTAG); | |
711 | |
712 return (error); | |
713 } | |
714 | |
715 /* | |
716 * Pool Creation | |
717 */ | |
718 int | |
719 spa_create(const char *pool, nvlist_t *nvroot, char *altroot) | |
720 { | |
721 spa_t *spa; | |
722 dsl_pool_t *dp; | |
723 dmu_tx_t *tx; | |
724 int error; | |
725 uint64_t txg = TXG_INITIAL; | |
726 | |
727 /* | |
728 * If this pool already exists, return failure. | |
729 */ | |
730 mutex_enter(&spa_namespace_lock); | |
731 if (spa_lookup(pool) != NULL) { | |
732 mutex_exit(&spa_namespace_lock); | |
733 return (EEXIST); | |
734 } | |
735 spa = spa_add(pool); | |
736 | |
737 /* | |
738 * Allocate a new spa_t structure. | |
739 */ | |
740 spa_activate(spa); | |
741 | |
742 spa->spa_uberblock.ub_txg = txg - 1; | |
743 spa->spa_ubsync = spa->spa_uberblock; | |
744 | |
745 error = spa_vdev_add(spa, nvroot); | |
746 | |
747 if (error) { | |
748 spa_unload(spa); | |
749 spa_deactivate(spa); | |
750 spa_remove(spa); | |
751 mutex_exit(&spa_namespace_lock); | |
752 return (error); | |
753 } | |
754 | |
755 if (altroot != NULL) { | |
756 spa->spa_root = spa_strdup(altroot); | |
757 atomic_add_32(&spa_active_count, 1); | |
758 } | |
759 | |
760 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); | |
761 spa->spa_meta_objset = dp->dp_meta_objset; | |
762 | |
763 tx = dmu_tx_create_assigned(dp, txg); | |
764 | |
765 /* | |
766 * Create the pool config object. | |
767 */ | |
768 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, | |
769 DMU_OT_PACKED_NVLIST, 1 << 14, | |
770 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); | |
771 | |
1544 | 772 if (zap_add(spa->spa_meta_objset, |
789 | 773 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, |
1544 | 774 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { |
775 cmn_err(CE_PANIC, "failed to add pool config"); | |
776 } | |
789 | 777 |
778 /* | |
779 * Create the deferred-free bplist object. Turn off compression | |
780 * because sync-to-convergence takes longer if the blocksize | |
781 * keeps changing. | |
782 */ | |
783 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, | |
784 1 << 14, tx); | |
785 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, | |
786 ZIO_COMPRESS_OFF, tx); | |
787 | |
1544 | 788 if (zap_add(spa->spa_meta_objset, |
789 | 789 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, |
1544 | 790 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { |
791 cmn_err(CE_PANIC, "failed to add bplist"); | |
792 } | |
789 | 793 |
794 dmu_tx_commit(tx); | |
795 | |
796 spa->spa_sync_on = B_TRUE; | |
797 txg_sync_start(spa->spa_dsl_pool); | |
798 | |
799 /* | |
800 * We explicitly wait for the first transaction to complete so that our | |
801 * bean counters are appropriately updated. | |
802 */ | |
803 txg_wait_synced(spa->spa_dsl_pool, txg); | |
804 | |
805 spa_config_sync(); | |
806 | |
807 mutex_exit(&spa_namespace_lock); | |
808 | |
809 return (0); | |
810 } | |
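A caller-side sketch of pool creation (names hypothetical; nvroot built along the lines of the spa_config_parse() sketch earlier):

    int error;

    if ((error = spa_create("tank", nvroot, NULL)) == 0) {
        /* pool exists, txg sync is running, config cache is synced */
    }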
811 | |
812 /* | |
813 * Import the given pool into the system. We set up the necessary spa_t and | |
814 * then call spa_load() to do the dirty work. | |
815 */ | |
816 int | |
817 spa_import(const char *pool, nvlist_t *config, char *altroot) | |
818 { | |
819 spa_t *spa; | |
820 int error; | |
821 | |
822 if (!(spa_mode & FWRITE)) | |
823 return (EROFS); | |
824 | |
825 /* | |
826 * If a pool with this name exists, return failure. | |
827 */ | |
828 mutex_enter(&spa_namespace_lock); | |
829 if (spa_lookup(pool) != NULL) { | |
830 mutex_exit(&spa_namespace_lock); | |
831 return (EEXIST); | |
832 } | |
833 | |
834 /* | |
835 * Create and initialize the spa structure. | |
836 */ | |
837 spa = spa_add(pool); | |
838 spa_activate(spa); | |
839 | |
840 /* | |
841 * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig | |
842 * so that we don't try to open the pool if the config is damaged. | |
843 */ | |
1544 | 844 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); |
789 | 845 |
846 if (error) { | |
847 spa_unload(spa); | |
848 spa_deactivate(spa); | |
849 spa_remove(spa); | |
850 mutex_exit(&spa_namespace_lock); | |
851 return (error); | |
852 } | |
853 | |
854 /* | |
855 * Set the alternate root, if there is one. | |
856 */ | |
857 if (altroot != NULL) { | |
858 atomic_add_32(&spa_active_count, 1); | |
859 spa->spa_root = spa_strdup(altroot); | |
860 } | |
861 | |
862 /* | |
863 * Initialize the config based on the in-core state. | |
864 */ | |
865 config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0); | |
866 | |
867 spa_config_set(spa, config); | |
868 | |
869 /* | |
870 * Sync the configuration cache. | |
871 */ | |
872 spa_config_sync(); | |
873 | |
874 mutex_exit(&spa_namespace_lock); | |
875 | |
876 /* | |
877 * Resilver anything that's out of date. | |
878 */ | |
879 if (spa_mode & FWRITE) | |
880 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); | |
881 | |
882 return (0); | |
883 } | |
884 | |
885 /* | |
886 * This (illegal) pool name is used when temporarily importing a spa_t in order | |
887 * to get the vdev stats associated with the imported devices. | |
888 */ | |
889 #define TRYIMPORT_NAME "$import" | |
890 | |
891 nvlist_t * | |
892 spa_tryimport(nvlist_t *tryconfig) | |
893 { | |
894 nvlist_t *config = NULL; | |
895 char *poolname; | |
896 spa_t *spa; | |
897 uint64_t state; | |
898 | |
899 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) | |
900 return (NULL); | |
901 | |
902 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) | |
903 return (NULL); | |
904 | |
905 mutex_enter(&spa_namespace_lock); | |
906 spa = spa_add(TRYIMPORT_NAME); | |
907 | |
908 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); | |
909 | |
910 /* | |
911 * Initialize the spa_t structure. | |
912 */ | |
913 spa_activate(spa); | |
914 | |
915 /* | |
916 * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig | |
917 * so we don't try to open the pool if the config is damaged. | |
918 */ | |
1544 | 919 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); |
789 | 920 |
921 /* | |
922 * If 'tryconfig' was at least parsable, return the current config. | |
923 */ | |
924 if (spa->spa_root_vdev != NULL) { | |
925 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); | |
926 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, | |
927 poolname) == 0); | |
928 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, | |
929 state) == 0); | |
930 } | |
931 | |
932 spa_unload(spa); | |
933 spa_deactivate(spa); | |
934 spa_remove(spa); | |
935 mutex_exit(&spa_namespace_lock); | |
936 | |
937 return (config); | |
938 } | |
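An illustrative probe-then-import flow; label_config and pool_name stand in for hypothetical caller state (a config assembled from disk labels and a user-chosen name):

    nvlist_t *probed;

    if ((probed = spa_tryimport(label_config)) != NULL) {
        /* inspect the generated config, then import for real */
        (void) spa_import(pool_name, label_config, NULL);
        nvlist_free(probed);
    }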
939 | |
940 /* | |
941 * Pool export/destroy | |
942 * | |
943 * The act of destroying or exporting a pool is very simple. We make sure there | |
944 * is no more pending I/O and any references to the pool are gone. Then, we | |
945 * update the pool state and sync all the labels to disk, removing the | |
946 * configuration from the cache afterwards. | |
947 */ | |
948 static int | |
949 spa_export_common(char *pool, int new_state) | |
950 { | |
951 spa_t *spa; | |
952 | |
953 if (!(spa_mode & FWRITE)) | |
954 return (EROFS); | |
955 | |
956 mutex_enter(&spa_namespace_lock); | |
957 if ((spa = spa_lookup(pool)) == NULL) { | |
958 mutex_exit(&spa_namespace_lock); | |
959 return (ENOENT); | |
960 } | |
961 | |
962 /* | |
1544 | 963 * Put a hold on the pool, drop the namespace lock, stop async tasks, |
964 * reacquire the namespace lock, and see if we can export. | |
965 */ | |
966 spa_open_ref(spa, FTAG); | |
967 mutex_exit(&spa_namespace_lock); | |
968 spa_async_suspend(spa); | |
969 mutex_enter(&spa_namespace_lock); | |
970 spa_close(spa, FTAG); | |
971 | |
972 /* | |
789 | 973 * The pool will be in core if it's openable, |
974 * in which case we can modify its state. | |
975 */ | |
976 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { | |
977 /* | |
978 * Objsets may be open only because they're dirty, so we | |
979 * have to force it to sync before checking spa_refcnt. | |
980 */ | |
981 spa_scrub_suspend(spa); | |
982 txg_wait_synced(spa->spa_dsl_pool, 0); | |
983 | |
1544 | 984 /* |
985 * A pool cannot be exported or destroyed if there are active | |
986 * references. If we are resetting a pool, allow references by | |
987 * fault injection handlers. | |
988 */ | |
989 if (!spa_refcount_zero(spa) || | |
990 (spa->spa_inject_ref != 0 && | |
991 new_state != POOL_STATE_UNINITIALIZED)) { | |
789 | 992 spa_scrub_resume(spa); |
1544 | 993 spa_async_resume(spa); |
789 | 994 mutex_exit(&spa_namespace_lock); |
995 return (EBUSY); | |
996 } | |
997 | |
998 spa_scrub_resume(spa); | |
999 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); | |
1000 | |
1001 if (spa->spa_root != NULL) | |
1002 atomic_add_32(&spa_active_count, -1); | |
1003 | |
1004 /* | |
1005 * We want this to be reflected on every label, | |
1006 * so mark them all dirty. spa_unload() will do the | |
1007 * final sync that pushes these changes out. | |
1008 */ | |
1544 | 1009 if (new_state != POOL_STATE_UNINITIALIZED) { |
1010 spa->spa_state = new_state; | |
1011 vdev_config_dirty(spa->spa_root_vdev); | |
1012 } | |
789 | 1013 } |
1014 | |
1015 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { | |
1016 spa_unload(spa); | |
1017 spa_deactivate(spa); | |
1018 } | |
1019 | |
1544 | 1020 if (new_state != POOL_STATE_UNINITIALIZED) { |
1021 spa_remove(spa); | |
1022 spa_config_sync(); | |
1023 } | |
789 | 1024 mutex_exit(&spa_namespace_lock); |
1025 | |
1026 return (0); | |
1027 } | |
1028 | |
1029 /* | |
1030 * Destroy a storage pool. | |
1031 */ | |
1032 int | |
1033 spa_destroy(char *pool) | |
1034 { | |
1035 return (spa_export_common(pool, POOL_STATE_DESTROYED)); | |
1036 } | |
1037 | |
1038 /* | |
1039 * Export a storage pool. | |
1040 */ | |
1041 int | |
1042 spa_export(char *pool) | |
1043 { | |
1044 return (spa_export_common(pool, POOL_STATE_EXPORTED)); | |
1045 } | |
1046 | |
1047 /* | |
1544 | 1048 * Similar to spa_export(), this unloads the spa_t without actually removing it |
1049 * from the namespace in any way. | |
1050 */ | |
1051 int | |
1052 spa_reset(char *pool) | |
1053 { | |
1054 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED)); | |
1055 } | |
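For reference, the three entry points above differ only in the new_state they hand to spa_export_common() (pool name hypothetical):

    (void) spa_destroy("tank");	/* POOL_STATE_DESTROYED */
    (void) spa_export("tank");	/* POOL_STATE_EXPORTED */
    (void) spa_reset("tank");	/* POOL_STATE_UNINITIALIZED; stays in namespace */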
1056 | |
1057 | |
1058 /* | |
789 | 1059 * ========================================================================== |
1060 * Device manipulation | |
1061 * ========================================================================== | |
1062 */ | |
1063 | |
1064 /* | |
1065 * Add capacity to a storage pool. | |
1066 */ | |
1067 int | |
1068 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) | |
1069 { | |
1070 uint64_t txg; | |
1585 | 1071 int c, c0, children, error; |
789 | 1072 vdev_t *rvd = spa->spa_root_vdev; |
1585 | 1073 vdev_t *vd, *tvd; |
789 | 1074 |
1075 txg = spa_vdev_enter(spa); | |
1076 | |
1077 vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); | |
1078 | |
1079 if (vd == NULL) | |
1080 return (spa_vdev_exit(spa, vd, txg, EINVAL)); | |
1081 | |
1585 | 1082 if (rvd == NULL) { /* spa_create() */ |
1083 rvd = vd; | |
1084 c0 = 0; | |
1085 } else { | |
1086 c0 = rvd->vdev_children; | |
1087 } | |
1088 | |
1089 ASSERT(spa->spa_root_vdev == rvd); | |
789 | 1090 |
1091 if ((error = vdev_create(vd, txg)) != 0) | |
1092 return (spa_vdev_exit(spa, vd, txg, error)); | |
1093 | |
1585 | 1094 children = vd->vdev_children; |
1095 | |
789 | 1096 /* |
1585 | 1097 * Transfer each new top-level vdev from vd to rvd. |
789 | 1098 */ |
1585 | 1099 for (c = 0; c < children; c++) { |
1100 tvd = vd->vdev_child[c]; | |
789 | 1101 if (vd != rvd) { |
1102 vdev_remove_child(vd, tvd); | |
1585 | 1103 tvd->vdev_id = c0 + c; |
789 | 1104 vdev_add_child(rvd, tvd); |
1105 } | |
1106 vdev_config_dirty(tvd); | |
1107 } | |
1108 | |
1109 /* | |
1585 | 1110 * We have to be careful when adding new vdevs to an existing pool. |
1111 * If other threads start allocating from these vdevs before we | |
1112 * sync the config cache, and we lose power, then upon reboot we may | |
1113 * fail to open the pool because there are DVAs that the config cache | |
1114 * can't translate. Therefore, we first add the vdevs without | |
1115 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); | |
1116 * initialize the metaslabs; and sync the config cache again. | |
1117 * | |
1118 * spa_load() checks for added-but-not-initialized vdevs, so that | |
1119 * if we lose power at any point in this sequence, the remaining | |
1120 * steps will be completed the next time we load the pool. | |
789 | 1121 */ |
1585 | 1122 if (vd != rvd) { |
1123 (void) spa_vdev_exit(spa, vd, txg, 0); | |
1124 txg = spa_vdev_enter(spa); | |
1125 vd = NULL; | |
1126 } | |
1127 | |
1128 /* | |
1129 * Now that the config is safely on disk, we can use the new space. | |
1130 */ | |
1131 for (c = 0; c < children; c++) { | |
1132 tvd = rvd->vdev_child[c0 + c]; | |
1133 ASSERT(tvd->vdev_ms_array == 0); | |
1134 vdev_init(tvd, txg); | |
1135 vdev_config_dirty(tvd); | |
1136 } | |
789 | 1137 |
1138 return (spa_vdev_exit(spa, vd, txg, 0)); | |
1139 } | |
1140 | |
1141 /* | |
1142 * Attach a device to a mirror. The arguments are the path to any device | |
1143 * in the mirror, and the nvroot for the new device. If the path specifies | |
1144 * a device that is not mirrored, we automatically insert the mirror vdev. | |
1145 * | |
1146 * If 'replacing' is specified, the new device is intended to replace the | |
1147 * existing device; in this case the two devices are made into their own | |
1148 * mirror using the 'replacing' vdev, which is functionally identical to | |
1149 * the mirror vdev (it actually reuses all the same ops) but has a few | |
1150 * extra rules: you can't attach to it after it's been created, and upon | |
1151 * completion of resilvering, the first disk (the one being replaced) | |
1152 * is automatically detached. | |
1153 */ | |
1154 int | |
1544 | 1155 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) |
789 | 1156 { |
1157 uint64_t txg, open_txg; | |
1158 int error; | |
1159 vdev_t *rvd = spa->spa_root_vdev; | |
1160 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; | |
1161 vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops; | |
1162 | |
1163 txg = spa_vdev_enter(spa); | |
1164 | |
1544 | 1165 oldvd = vdev_lookup_by_guid(rvd, guid); |
789 | 1166 |
1167 if (oldvd == NULL) | |
1168 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); | |
1169 | |
1585 | 1170 if (!oldvd->vdev_ops->vdev_op_leaf) |
1171 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); | |
1172 | |
789 | 1173 pvd = oldvd->vdev_parent; |
1174 | |
1175 /* | |
1176 * The parent must be a mirror or the root, unless we're replacing; | |
1177 * in that case, the parent can be anything but another replacing vdev. | |
1178 */ | |
1179 if (pvd->vdev_ops != &vdev_mirror_ops && | |
1180 pvd->vdev_ops != &vdev_root_ops && | |
1181 (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) | |
1182 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); | |
1183 | |
1184 newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); | |
1185 | |
1186 if (newrootvd == NULL || newrootvd->vdev_children != 1) | |
1187 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); | |
1188 | |
1189 newvd = newrootvd->vdev_child[0]; | |
1190 | |
1191 if (!newvd->vdev_ops->vdev_op_leaf) | |
1192 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); | |
1193 | |
1194 if ((error = vdev_create(newrootvd, txg)) != 0) | |
1195 return (spa_vdev_exit(spa, newrootvd, txg, error)); | |
1196 | |
1175 | 1197 /* |
1198 * Compare the new device size with the replaceable/attachable | |
1199 * device size. | |
1200 */ | |
1201 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) | |
789 | 1202 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); |
1203 | |
1204 if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0) | |
1205 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); | |
1206 | |
1207 /* | |
1208 * If this is an in-place replacement, update oldvd's path and devid | |
1209 * to make it distinguishable from newvd, and unopenable from now on. | |
1210 */ | |
1211 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { | |
1212 spa_strfree(oldvd->vdev_path); | |
1213 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, | |
1214 KM_SLEEP); | |
1215 (void) sprintf(oldvd->vdev_path, "%s/%s", | |
1216 newvd->vdev_path, "old"); | |
1217 if (oldvd->vdev_devid != NULL) { | |
1218 spa_strfree(oldvd->vdev_devid); | |
1219 oldvd->vdev_devid = NULL; | |
1220 } | |
1221 } | |
1222 | |
1223 /* | |
1224 * If the parent is not a mirror, or if we're replacing, | |
1225 * insert the new mirror/replacing vdev above oldvd. | |
1226 */ | |
1227 if (pvd->vdev_ops != pvops) | |
1228 pvd = vdev_add_parent(oldvd, pvops); | |
1229 | |
1230 ASSERT(pvd->vdev_top->vdev_parent == rvd); | |
1231 ASSERT(pvd->vdev_ops == pvops); | |
1232 ASSERT(oldvd->vdev_parent == pvd); | |
1233 | |
1234 /* | |
1235 * Extract the new device from its root and add it to pvd. | |
1236 */ | |
1237 vdev_remove_child(newrootvd, newvd); | |
1238 newvd->vdev_id = pvd->vdev_children; | |
1239 vdev_add_child(pvd, newvd); | |
1240 | |
1544 | 1241 /* |
1242 * If newvd is smaller than oldvd, but larger than its rsize, | |
1243 * the addition of newvd may have decreased our parent's asize. | |
1244 */ | |
1245 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); | |
1246 | |
789 | 1247 tvd = newvd->vdev_top; |
1248 ASSERT(pvd->vdev_top == tvd); | |
1249 ASSERT(tvd->vdev_parent == rvd); | |
1250 | |
1251 vdev_config_dirty(tvd); | |
1252 | |
1253 /* | |
1254 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate | |
1255 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). | |
1256 */ | |
1257 open_txg = txg + TXG_CONCURRENT_STATES - 1; | |
1258 | |
1259 mutex_enter(&newvd->vdev_dtl_lock); | |
1260 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, | |
1261 open_txg - TXG_INITIAL + 1); | |
1262 mutex_exit(&newvd->vdev_dtl_lock); | |
1263 | |
1544 | 1264 dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); |
1265 | |
789 | 1266 /* |
1267 * Mark newvd's DTL dirty in this txg. | |
1268 */ | |
1269 vdev_dirty(tvd, VDD_DTL, txg); | |
1270 (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); | |
1271 | |
1272 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); | |
1273 | |
1274 /* | |
1275 * Kick off a resilver to update newvd. | |
1276 */ | |
1277 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); | |
1278 | |
1279 return (0); | |
1280 } | |
1281 | |
1282 /* | |
1283 * Detach a device from a mirror or replacing vdev. | |
1284 * If 'replace_done' is specified, only detach if the parent | |
1285 * is a replacing vdev. | |
1286 */ | |
1287 int | |
1544 | 1288 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) |
789 | 1289 { |
1290 uint64_t txg; | |
1291 int c, t, error; | |
1292 vdev_t *rvd = spa->spa_root_vdev; | |
1293 vdev_t *vd, *pvd, *cvd, *tvd; | |
1294 | |
1295 txg = spa_vdev_enter(spa); | |
1296 | |
1544 | 1297 vd = vdev_lookup_by_guid(rvd, guid); |
789 | 1298 |
1299 if (vd == NULL) | |
1300 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); | |
1301 | |
1585 | 1302 if (!vd->vdev_ops->vdev_op_leaf) |
1303 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); | |
1304 | |
789 | 1305 pvd = vd->vdev_parent; |
1306 | |
1307 /* | |
1308 * If replace_done is specified, only remove this device if it's | |
1309 * the first child of a replacing vdev. | |
1310 */ | |
1311 if (replace_done && | |
1312 (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) | |
1313 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); | |
1314 | |
1315 /* | |
1316 * Only mirror and replacing vdevs support detach. | |
1317 */ | |
1318 if (pvd->vdev_ops != &vdev_replacing_ops && | |
1319 pvd->vdev_ops != &vdev_mirror_ops) | |
1320 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); | |
1321 | |
1322 /* | |
1323 * If there's only one replica, you can't detach it. | |
1324 */ | |
1325 if (pvd->vdev_children <= 1) | |
1326 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); | |
1327 | |
1328 /* | |
1329 * If all siblings have non-empty DTLs, this device may have the only | |
1330 * valid copy of the data, which means we cannot safely detach it. | |
1331 * | |
1332 * XXX -- as in the vdev_offline() case, we really want a more | |
1333 * precise DTL check. | |
1334 */ | |
1335 for (c = 0; c < pvd->vdev_children; c++) { | |
1336 uint64_t dirty; | |
1337 | |
1338 cvd = pvd->vdev_child[c]; | |
1339 if (cvd == vd) | |
1340 continue; | |
1341 if (vdev_is_dead(cvd)) | |
1342 continue; | |
1343 mutex_enter(&cvd->vdev_dtl_lock); | |
1344 dirty = cvd->vdev_dtl_map.sm_space | | |
1345 cvd->vdev_dtl_scrub.sm_space; | |
1346 mutex_exit(&cvd->vdev_dtl_lock); | |
1347 if (!dirty) | |
1348 break; | |
1349 } | |
1350 if (c == pvd->vdev_children) | |
1351 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); | |
1352 | |
1353 /* | |
1354 * Erase the disk labels so the disk can be used for other things. | |
1355 * This must be done after all other error cases are handled, | |
1356 * but before we disembowel vd (so we can still do I/O to it). | |
1357 * But if we can't do it, don't treat the error as fatal -- | |
1358 * it may be that the unwritability of the disk is the reason | |
1359 * it's being detached! | |
1360 */ | |
1361 error = vdev_label_init(vd, 0); | |
1362 if (error) | |
1363 dprintf("unable to erase labels on %s\n", vdev_description(vd)); | |
1364 | |
1365 /* | |
1366 * Remove vd from its parent and compact the parent's children. | |
1367 */ | |
1368 vdev_remove_child(pvd, vd); | |
1369 vdev_compact_children(pvd); | |
1370 | |
1371 /* | |
1372 * Remember one of the remaining children so we can get tvd below. | |
1373 */ | |
1374 cvd = pvd->vdev_child[0]; | |
1375 | |
1376 /* | |
1377 * If the parent mirror/replacing vdev only has one child, | |
1378 * the parent is no longer needed. Remove it from the tree. | |
1379 */ | |
1380 if (pvd->vdev_children == 1) | |
1381 vdev_remove_parent(cvd); | |
1382 | |
1383 /* | |
1384 * We don't set tvd until now because the parent we just removed | |
1385 * may have been the previous top-level vdev. | |
1386 */ | |
1387 tvd = cvd->vdev_top; | |
1388 ASSERT(tvd->vdev_parent == rvd); | |
1389 | |
1390 /* | |
1391 * Reopen this top-level vdev to reassess health after detach. | |
1392 */ | |
1544 | 1393 vdev_reopen(tvd); |
789 | 1394 |
1395 /* | |
1396 * If the device we just detached was smaller than the others, | |
1544 | 1397 * it may be possible to add metaslabs (i.e. grow the pool). We ignore |
1398 * the error here because the detach still succeeded - we just weren't | |
1399 * able to reinitialize the metaslabs. This pool is in for a world of | |
1400 * hurt, in any case. | |
789 | 1401 */ |
1544 | 1402 (void) vdev_metaslab_init(tvd, txg); |
789 | 1403 |
1404 vdev_config_dirty(tvd); | |
1405 | |
1406 /* | |
1407 * Mark vd's DTL as dirty in this txg. | |
1408 * vdev_dtl_sync() will see that vd->vdev_detached is set | |
1409 * and free vd's DTL object in syncing context. | |
1410 * But first make sure we're not on any *other* txg's DTL list, | |
1411 * to prevent vd from being accessed after it's freed. | |
1412 */ | |
1413 vdev_dirty(tvd, VDD_DTL, txg); | |
1414 vd->vdev_detached = B_TRUE; | |
1415 for (t = 0; t < TXG_SIZE; t++) | |
1416 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); | |
1417 (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); | |
1418 | |
1544 | 1419 dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); |
789 | 1420 |
1421 return (spa_vdev_exit(spa, vd, txg, 0)); | |
1422 } | |
1423 | |
1424 /* | |
1544 | 1425 * Find any device that's done replacing, so we can detach it. |
789 | 1426 */ |
1544 | 1427 static vdev_t * |
1428 spa_vdev_replace_done_hunt(vdev_t *vd) | |
789 | 1429 { |
1544 | 1430 vdev_t *newvd, *oldvd; |
789 | 1431 int c; |
1432 | |
1544 | 1433 for (c = 0; c < vd->vdev_children; c++) { |
1434 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); | |
1435 if (oldvd != NULL) | |
1436 return (oldvd); | |
1437 } | |
789 | 1438 |
1439 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { | |
1544 | 1440 oldvd = vd->vdev_child[0]; |
1441 newvd = vd->vdev_child[1]; | |
789 | 1442 |
1544 | 1443 mutex_enter(&newvd->vdev_dtl_lock); |
1444 if (newvd->vdev_dtl_map.sm_space == 0 && | |
1445 newvd->vdev_dtl_scrub.sm_space == 0) { | |
1446 mutex_exit(&newvd->vdev_dtl_lock); | |
1447 return (oldvd); | |
1448 } | |
1449 mutex_exit(&newvd->vdev_dtl_lock); | |
1450 } | |
789 | 1451 |
1544 | 1452 return (NULL); |
789 | 1453 } |
1454 | |
1544 | 1455 static void |
789 | 1456 spa_vdev_replace_done(spa_t *spa) |
1457 { | |
1544 | 1458 vdev_t *vd; |
1459 uint64_t guid; | |
789 | 1460 |
1544 | 1461 spa_config_enter(spa, RW_READER, FTAG); |
789 | 1462 |
1544 | 1463 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { |
1464 guid = vd->vdev_guid; | |
1465 spa_config_exit(spa, FTAG); | |
1466 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) | |
1467 return; | |
1468 spa_config_enter(spa, RW_READER, FTAG); | |
789 | 1469 } |
1470 | |
1544 | 1471 spa_config_exit(spa, FTAG); |
789 | 1472 } |
1473 | |
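/*
 * [Editor's sketch, not part of the annotated source.]  spa_vdev_replace_done()
 * above shows a common pattern: while holding the config lock we record only a
 * stable identifier (the guid), drop the lock for the heavyweight detach, then
 * rescan from the top.  A minimal userland analogue with POSIX threads;
 * find_done_guid() and detach_by_guid() are hypothetical stubs, not real APIs.
 */
#include <pthread.h>
#include <stdint.h>

static pthread_rwlock_t cfg_lock = PTHREAD_RWLOCK_INITIALIZER;

static uint64_t
find_done_guid(void)
{
	return (0);		/* stub: nothing left to detach in this sketch */
}

static int
detach_by_guid(uint64_t guid)
{
	(void) guid;		/* stub: would retake cfg_lock as writer */
	return (0);
}

static void
replace_done(void)
{
	uint64_t guid;

	(void) pthread_rwlock_rdlock(&cfg_lock);
	while ((guid = find_done_guid()) != 0) {
		/*
		 * Remember the guid, not the pointer: once the lock is
		 * dropped, the vdev tree may change underneath us.
		 */
		(void) pthread_rwlock_unlock(&cfg_lock);
		if (detach_by_guid(guid) != 0)
			return;
		(void) pthread_rwlock_rdlock(&cfg_lock);
	}
	(void) pthread_rwlock_unlock(&cfg_lock);
}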
1474 /* | |
1354 | 1475 * Update the stored path for this vdev. Dirty the vdev configuration, relying |
1476 * on spa_vdev_enter/exit() to synchronize the labels and cache. | |
1477 */ | |
1478 int | |
1479 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) | |
1480 { | |
1481 vdev_t *rvd, *vd; | |
1482 uint64_t txg; | |
1483 | |
1484 rvd = spa->spa_root_vdev; | |
1485 | |
1486 txg = spa_vdev_enter(spa); | |
1487 | |
1488 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) | |
1489 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); | |
1490 | |
1585 | 1491 if (!vd->vdev_ops->vdev_op_leaf) |
1492 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); | |
1493 | |
1354 | 1494 spa_strfree(vd->vdev_path); |
1495 vd->vdev_path = spa_strdup(newpath); | |
1496 | |
1497 vdev_config_dirty(vd->vdev_top); | |
1498 | |
1499 return (spa_vdev_exit(spa, NULL, txg, 0)); | |
1500 } | |
1501 | |
1502 /* | |
789 | 1503 * ========================================================================== |
1504 * SPA Scrubbing | |
1505 * ========================================================================== | |
1506 */ | |
1507 | |
1544 | 1508 void |
1509 spa_scrub_throttle(spa_t *spa, int direction) | |
1510 { | |
1511 mutex_enter(&spa->spa_scrub_lock); | |
1512 spa->spa_scrub_throttled += direction; | |
1513 ASSERT(spa->spa_scrub_throttled >= 0); | |
1514 if (spa->spa_scrub_throttled == 0) | |
1515 cv_broadcast(&spa->spa_scrub_io_cv); | |
1516 mutex_exit(&spa->spa_scrub_lock); | |
1517 } | |
789 | 1518 |
1519 static void | |
1520 spa_scrub_io_done(zio_t *zio) | |
1521 { | |
1522 spa_t *spa = zio->io_spa; | |
1523 | |
1524 zio_buf_free(zio->io_data, zio->io_size); | |
1525 | |
1526 mutex_enter(&spa->spa_scrub_lock); | |
1544 | 1527 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { |
1528 vdev_t *vd = zio->io_vd; | |
789 | 1529 spa->spa_scrub_errors++; |
1530 mutex_enter(&vd->vdev_stat_lock); | |
1531 vd->vdev_stat.vs_scrub_errors++; | |
1532 mutex_exit(&vd->vdev_stat_lock); | |
1533 } | |
1544 | 1534 if (--spa->spa_scrub_inflight == 0) { |
1535 cv_broadcast(&spa->spa_scrub_io_cv); | |
1536 ASSERT(spa->spa_scrub_throttled == 0); | |
1537 } | |
1538 mutex_exit(&spa->spa_scrub_lock); | |
789 | 1539 } |
1540 | |
1541 static void | |
1544 | 1542 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, |
1543 zbookmark_t *zb) | |
789 | 1544 { |
1545 size_t size = BP_GET_LSIZE(bp); | |
1546 void *data = zio_buf_alloc(size); | |
1547 | |
1548 mutex_enter(&spa->spa_scrub_lock); | |
1549 spa->spa_scrub_inflight++; | |
1550 mutex_exit(&spa->spa_scrub_lock); | |
1551 | |
1544 | 1552 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) |
1553 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ | |
1554 | |
1555 flags |= ZIO_FLAG_CANFAIL; | |
1556 | |
789 | 1557 zio_nowait(zio_read(NULL, spa, bp, data, size, |
1544 | 1558 spa_scrub_io_done, NULL, priority, flags, zb)); |
789 | 1559 } |
1560 | |
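/*
 * [Editor's sketch, not part of the annotated source.]  spa_scrub_io_start()
 * and spa_scrub_io_done() above keep an in-flight count under a mutex and
 * broadcast a condition variable when the count drains to zero.  The same
 * shape in userland pthreads; all names here are hypothetical:
 */
#include <pthread.h>

static pthread_mutex_t scrub_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t scrub_io_cv = PTHREAD_COND_INITIALIZER;
static int scrub_inflight;

static void
scrub_io_start(void)
{
	pthread_mutex_lock(&scrub_lock);
	scrub_inflight++;		/* issued but not yet completed */
	pthread_mutex_unlock(&scrub_lock);
}

static void
scrub_io_done(void)
{
	pthread_mutex_lock(&scrub_lock);
	if (--scrub_inflight == 0)	/* last completion wakes any waiter */
		pthread_cond_broadcast(&scrub_io_cv);
	pthread_mutex_unlock(&scrub_lock);
}

static void
scrub_io_drain(void)		/* cf. the wait loops in spa_scrub_thread() */
{
	pthread_mutex_lock(&scrub_lock);
	while (scrub_inflight > 0)
		pthread_cond_wait(&scrub_io_cv, &scrub_lock);
	pthread_mutex_unlock(&scrub_lock);
}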
1561 /* ARGSUSED */ | |
1562 static int | |
1563 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) | |
1564 { | |
1565 blkptr_t *bp = &bc->bc_blkptr; | |
1566 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); | |
1567 | |
1568 if (bc->bc_errno || vd == NULL) { | |
1569 /* | |
1570 * We can't scrub this block, but we can continue to scrub | |
1571 * the rest of the pool. Note the error and move along. | |
1572 */ | |
1573 mutex_enter(&spa->spa_scrub_lock); | |
1574 spa->spa_scrub_errors++; | |
1575 mutex_exit(&spa->spa_scrub_lock); | |
1576 | |
1577 if (vd != NULL) { | |
1578 mutex_enter(&vd->vdev_stat_lock); | |
1579 vd->vdev_stat.vs_scrub_errors++; | |
1580 mutex_exit(&vd->vdev_stat_lock); | |
1581 } | |
1582 | |
1583 return (ERESTART); | |
1584 } | |
1585 | |
1586 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); | |
1587 | |
1588 /* | |
1589 * Keep track of how much data we've examined so that | |
1590 * zpool(1M) status can make useful progress reports. | |
1591 */ | |
1592 mutex_enter(&vd->vdev_stat_lock); | |
1593 vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); | |
1594 mutex_exit(&vd->vdev_stat_lock); | |
1595 | |
1596 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { | |
1597 if (DVA_GET_GANG(&bp->blk_dva[0])) { | |
1598 /* | |
1599 * Gang members may be spread across multiple vdevs, | |
1600 * so the best we can do is look at the pool-wide DTL. | |
1601 * XXX -- it would be better to change our allocation | |
1602 * policy to ensure that this can't happen. | |
1603 */ | |
1604 vd = spa->spa_root_vdev; | |
1605 } | |
1606 if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { | |
1607 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, | |
1544 | 1608 ZIO_FLAG_RESILVER, &bc->bc_bookmark); |
789 | 1609 } |
1610 } else { | |
1611 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, | |
1544 | 1612 ZIO_FLAG_SCRUB, &bc->bc_bookmark); |
789 | 1613 } |
1614 | |
1615 return (0); | |
1616 } | |
1617 | |
1618 static void | |
1619 spa_scrub_thread(spa_t *spa) | |
1620 { | |
1621 callb_cpr_t cprinfo; | |
1622 traverse_handle_t *th = spa->spa_scrub_th; | |
1623 vdev_t *rvd = spa->spa_root_vdev; | |
1624 pool_scrub_type_t scrub_type = spa->spa_scrub_type; | |
1625 int error = 0; | |
1626 boolean_t complete; | |
1627 | |
1628 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); | |
1629 | |
797 | 1630 /* |
1631 * If we're restarting due to a snapshot create/delete, | |
1632 * wait for that to complete. | |
1633 */ | |
1634 txg_wait_synced(spa_get_dsl(spa), 0); | |
1635 | |
1544 | 1636 dprintf("start %s mintxg=%llu maxtxg=%llu\n", |
1637 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", | |
1638 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); | |
1639 | |
1640 spa_config_enter(spa, RW_WRITER, FTAG); | |
1641 vdev_reopen(rvd); /* purge all vdev caches */ | |
789 | 1642 vdev_config_dirty(rvd); /* rewrite all disk labels */ |
1643 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); | |
1544 | 1644 spa_config_exit(spa, FTAG); |
789 | 1645 |
1646 mutex_enter(&spa->spa_scrub_lock); | |
1647 spa->spa_scrub_errors = 0; | |
1648 spa->spa_scrub_active = 1; | |
1544 | 1649 ASSERT(spa->spa_scrub_inflight == 0); |
1650 ASSERT(spa->spa_scrub_throttled == 0); | |
789 | 1651 |
1652 while (!spa->spa_scrub_stop) { | |
1653 CALLB_CPR_SAFE_BEGIN(&cprinfo); | |
1544 | 1654 while (spa->spa_scrub_suspended) { |
789 | 1655 spa->spa_scrub_active = 0; |
1656 cv_broadcast(&spa->spa_scrub_cv); | |
1657 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); | |
1658 spa->spa_scrub_active = 1; | |
1659 } | |
1660 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); | |
1661 | |
1662 if (spa->spa_scrub_restart_txg != 0) | |
1663 break; | |
1664 | |
1665 mutex_exit(&spa->spa_scrub_lock); | |
1666 error = traverse_more(th); | |
1667 mutex_enter(&spa->spa_scrub_lock); | |
1668 if (error != EAGAIN) | |
1669 break; | |
1544 | 1670 |
1671 while (spa->spa_scrub_throttled > 0) | |
1672 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); | |
789 | 1673 } |
1674 | |
1675 while (spa->spa_scrub_inflight) | |
1676 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); | |
1677 | |
1678 if (spa->spa_scrub_restart_txg != 0) | |
1679 error = ERESTART; | |
1680 | |
1544 | 1681 if (spa->spa_scrub_stop) |
1682 error = EINTR; | |
1683 | |
789 | 1684 spa->spa_scrub_active = 0; |
1685 cv_broadcast(&spa->spa_scrub_cv); | |
1686 | |
1687 /* | |
1544 | 1688 * Even if there were uncorrectable errors, we consider the scrub |
1689 * completed. The downside is that if there is a transient error during | |
1690 * a resilver, we won't resilver the data properly to the target. But | |
1691 * if the damage is permanent (more likely) we will resilver forever, | |
1692 * which isn't really acceptable. Since there is enough information for | |
1693 * the user to know what has failed and why, this seems like a more | |
1694 * tractable approach. | |
789 | 1695 */ |
1544 | 1696 complete = (error == 0); |
789 | 1697 |
1544 | 1698 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", |
1699 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", | |
789 | 1700 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", |
1701 error, spa->spa_scrub_errors, spa->spa_scrub_stop); | |
1702 | |
1703 mutex_exit(&spa->spa_scrub_lock); | |
1704 | |
1705 /* | |
1706 * If the scrub/resilver completed, update all DTLs to reflect this. | |
1707 * Whether it succeeded or not, vacate all temporary scrub DTLs. | |
1708 */ | |
1544 | 1709 spa_config_enter(spa, RW_WRITER, FTAG); |
789 | 1710 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, |
1711 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); | |
1712 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); | |
1544 | 1713 spa_errlog_rotate(spa); |
1714 spa_config_exit(spa, FTAG); | |
789 | 1715 |
1716 mutex_enter(&spa->spa_scrub_lock); | |
1717 | |
1544 | 1718 /* |
1719 * We may have finished replacing a device. | |
1720 * Let the async thread assess this and handle the detach. | |
1721 */ | |
1722 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); | |
789 | 1723 |
1724 /* | |
1725 * If we were told to restart, our final act is to start a new scrub. | |
1726 */ | |
1727 if (error == ERESTART) | |
1544 | 1728 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? |
1729 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); | |
789 | 1730 |
1544 | 1731 spa->spa_scrub_type = POOL_SCRUB_NONE; |
1732 spa->spa_scrub_active = 0; | |
1733 spa->spa_scrub_thread = NULL; | |
1734 cv_broadcast(&spa->spa_scrub_cv); | |
789 | 1735 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ |
1736 thread_exit(); | |
1737 } | |
1738 | |
1739 void | |
1740 spa_scrub_suspend(spa_t *spa) | |
1741 { | |
1742 mutex_enter(&spa->spa_scrub_lock); | |
1544 | 1743 spa->spa_scrub_suspended++; |
789 | 1744 while (spa->spa_scrub_active) { |
1745 cv_broadcast(&spa->spa_scrub_cv); | |
1746 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); | |
1747 } | |
1748 while (spa->spa_scrub_inflight) | |
1749 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); | |
1750 mutex_exit(&spa->spa_scrub_lock); | |
1751 } | |
1752 | |
1753 void | |
1754 spa_scrub_resume(spa_t *spa) | |
1755 { | |
1756 mutex_enter(&spa->spa_scrub_lock); | |
1544 | 1757 ASSERT(spa->spa_scrub_suspended != 0); |
1758 if (--spa->spa_scrub_suspended == 0) | |
789 | 1759 cv_broadcast(&spa->spa_scrub_cv); |
1760 mutex_exit(&spa->spa_scrub_lock); | |
1761 } | |
1762 | |
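/*
 * [Editor's sketch, not part of the annotated source.]  The suspend/resume
 * pair above is a counted suspend: each suspender bumps a count and waits for
 * the worker to park; only the resume that drops the count back to zero wakes
 * the worker.  A userland pthread analogue with hypothetical names (the
 * worker side, which clears s_active and broadcasts when parking, is omitted):
 */
#include <assert.h>
#include <pthread.h>

static pthread_mutex_t s_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t s_cv = PTHREAD_COND_INITIALIZER;
static int s_suspended;	/* outstanding suspend requests */
static int s_active;	/* nonzero while the worker is running */

static void
worker_suspend(void)
{
	pthread_mutex_lock(&s_lock);
	s_suspended++;
	while (s_active) {			/* wait for the worker to park */
		pthread_cond_broadcast(&s_cv);	/* nudge it to notice */
		pthread_cond_wait(&s_cv, &s_lock);
	}
	pthread_mutex_unlock(&s_lock);
}

static void
worker_resume(void)
{
	pthread_mutex_lock(&s_lock);
	assert(s_suspended != 0);
	if (--s_suspended == 0)			/* last resumer wakes worker */
		pthread_cond_broadcast(&s_cv);
	pthread_mutex_unlock(&s_lock);
}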
1763 void | |
1764 spa_scrub_restart(spa_t *spa, uint64_t txg) | |
1765 { | |
1766 /* | |
1767 * Something happened (e.g. snapshot create/delete) that means | |
1768 * we must restart any in-progress scrubs. The itinerary will | |
1769 * fix this properly. | |
1770 */ | |
1771 mutex_enter(&spa->spa_scrub_lock); | |
1772 spa->spa_scrub_restart_txg = txg; | |
1773 mutex_exit(&spa->spa_scrub_lock); | |
1774 } | |
1775 | |
1544 | 1776 int |
1777 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) | |
789 | 1778 { |
1779 space_seg_t *ss; | |
1780 uint64_t mintxg, maxtxg; | |
1781 vdev_t *rvd = spa->spa_root_vdev; | |
1544 | 1782 int advance = ADVANCE_PRE | ADVANCE_ZIL; |
789 | 1783 |
1784 if ((uint_t)type >= POOL_SCRUB_TYPES) | |
1785 return (ENOTSUP); | |
1786 | |
1544 | 1787 mutex_enter(&spa->spa_scrub_lock); |
1788 | |
789 | 1789 /* |
1790 * If there's a scrub or resilver already in progress, stop it. | |
1791 */ | |
1792 while (spa->spa_scrub_thread != NULL) { | |
1793 /* | |
1794 * Don't stop a resilver unless forced. | |
1795 */ | |
1544 | 1796 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { |
1797 mutex_exit(&spa->spa_scrub_lock); | |
789 | 1798 return (EBUSY); |
1544 | 1799 } |
789 | 1800 spa->spa_scrub_stop = 1; |
1801 cv_broadcast(&spa->spa_scrub_cv); | |
1802 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); | |
1803 } | |
1804 | |
1805 /* | |
1806 * Terminate the previous traverse. | |
1807 */ | |
1808 if (spa->spa_scrub_th != NULL) { | |
1809 traverse_fini(spa->spa_scrub_th); | |
1810 spa->spa_scrub_th = NULL; | |
1811 } | |
1812 | |
1544 | 1813 if (rvd == NULL) { |
1814 ASSERT(spa->spa_scrub_stop == 0); | |
1815 ASSERT(spa->spa_scrub_type == type); | |
1816 ASSERT(spa->spa_scrub_restart_txg == 0); | |
1817 mutex_exit(&spa->spa_scrub_lock); | |
1818 return (0); | |
1819 } | |
789 | 1820 |
1821 mintxg = TXG_INITIAL - 1; | |
1822 maxtxg = spa_last_synced_txg(spa) + 1; | |
1823 | |
1544 | 1824 mutex_enter(&rvd->vdev_dtl_lock); |
789 | 1825 |
1544 | 1826 if (rvd->vdev_dtl_map.sm_space == 0) { |
1827 /* | |
1828 * The pool-wide DTL is empty. | |
1829 * If this is a resilver, there's nothing to do. | |
1830 */ | |
1831 if (type == POOL_SCRUB_RESILVER) | |
1832 type = POOL_SCRUB_NONE; | |
1833 } else { | |
1834 /* | |
1835 * The pool-wide DTL is non-empty. | |
1836 * If this is a normal scrub, upgrade to a resilver instead. | |
1837 */ | |
1838 if (type == POOL_SCRUB_EVERYTHING) | |
1839 type = POOL_SCRUB_RESILVER; | |
1840 } | |
789 | 1841 |
1544 | 1842 if (type == POOL_SCRUB_RESILVER) { |
789 | 1843 /* |
1844 * Determine the resilvering boundaries. | |
1845 * | |
1846 * Note: (mintxg, maxtxg) is an open interval, | |
1847 * i.e. mintxg and maxtxg themselves are not included. | |
1848 * | |
1849 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 | |
1850 * so we don't claim to resilver a txg that's still changing. | |
1851 */ | |
1852 ss = avl_first(&rvd->vdev_dtl_map.sm_root); | |
1544 | 1853 mintxg = ss->ss_start - 1; |
789 | 1854 ss = avl_last(&rvd->vdev_dtl_map.sm_root); |
1544 | 1855 maxtxg = MIN(ss->ss_end, maxtxg); |
789 | 1856 |
1544 | 1857 advance |= ADVANCE_PRUNE; |
789 | 1858 } |
1859 | |
1544 | 1860 mutex_exit(&rvd->vdev_dtl_lock); |
1861 | |
1862 spa->spa_scrub_stop = 0; | |
1863 spa->spa_scrub_type = type; | |
1864 spa->spa_scrub_restart_txg = 0; | |
1865 | |
1866 if (type != POOL_SCRUB_NONE) { | |
1867 spa->spa_scrub_mintxg = mintxg; | |
789 | 1868 spa->spa_scrub_maxtxg = maxtxg; |
1869 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, | |
1870 advance, ZIO_FLAG_CANFAIL); | |
1871 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); | |
1872 spa->spa_scrub_thread = thread_create(NULL, 0, | |
1873 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); | |
1874 } | |
1875 | |
1544 | 1876 mutex_exit(&spa->spa_scrub_lock); |
1877 | |
789 | 1878 return (0); |
1879 } | |
1880 | |
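/*
 * [Editor's worked example, not part of the annotated source.]  How the
 * (mintxg, maxtxg) open interval above falls out of the DTL bounds.  The
 * numbers are made up; space-map segments are taken as half-open [start, end):
 */
#include <stdio.h>

int
main(void)
{
	/* Pretend the pool-wide DTL covers txgs 100-119 and 150-159. */
	unsigned long long first_start = 100;	/* first segment's ss_start */
	unsigned long long last_end = 160;	/* last segment's ss_end */
	unsigned long long last_synced = 200;	/* spa_last_synced_txg() */

	unsigned long long mintxg = first_start - 1;	/* 99 */
	unsigned long long maxtxg = last_end;		/* 160 */
	if (last_synced + 1 < maxtxg)	/* don't claim still-open txgs */
		maxtxg = last_synced + 1;

	/* txg t needs resilvering iff mintxg < t < maxtxg: here 100..159. */
	printf("resilver interval (%llu, %llu)\n", mintxg, maxtxg);
	return (0);
}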
1544 | 1881 /* |
1882 * ========================================================================== | |
1883 * SPA async task processing | |
1884 * ========================================================================== | |
1885 */ | |
1886 | |
1887 static void | |
1888 spa_async_reopen(spa_t *spa) | |
789 | 1889 { |
1544 | 1890 vdev_t *rvd = spa->spa_root_vdev; |
1891 vdev_t *tvd; | |
1892 int c; | |
1893 | |
1894 spa_config_enter(spa, RW_WRITER, FTAG); | |
1895 | |
1896 for (c = 0; c < rvd->vdev_children; c++) { | |
1897 tvd = rvd->vdev_child[c]; | |
1898 if (tvd->vdev_reopen_wanted) { | |
1899 tvd->vdev_reopen_wanted = 0; | |
1900 vdev_reopen(tvd); | |
1901 } | |
1902 } | |
789 | 1903 |
1544 | 1904 spa_config_exit(spa, FTAG); |
1905 } | |
1906 | |
1907 static void | |
1908 spa_async_thread(spa_t *spa) | |
1909 { | |
1910 int tasks; | |
1911 | |
1912 ASSERT(spa->spa_sync_on); | |
789 | 1913 |
1544 | 1914 mutex_enter(&spa->spa_async_lock); |
1915 tasks = spa->spa_async_tasks; | |
1916 spa->spa_async_tasks = 0; | |
1917 mutex_exit(&spa->spa_async_lock); | |
1918 | |
1919 /* | |
1920 * See if any devices need to be reopened. | |
1921 */ | |
1922 if (tasks & SPA_ASYNC_REOPEN) | |
1923 spa_async_reopen(spa); | |
1924 | |
1925 /* | |
1926 * If any devices are done replacing, detach them. | |
1927 */ | |
1928 if (tasks & SPA_ASYNC_REPLACE_DONE) | |
789 | 1929 spa_vdev_replace_done(spa); |
1930 | |
1544 | 1931 /* |
1932 * Kick off a scrub. | |
1933 */ | |
1934 if (tasks & SPA_ASYNC_SCRUB) | |
1935 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); | |
1936 | |
1937 /* | |
1938 * Kick off a resilver. | |
1939 */ | |
1940 if (tasks & SPA_ASYNC_RESILVER) | |
1941 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); | |
1942 | |
1943 /* | |
1944 * Let the world know that we're done. | |
1945 */ | |
1946 mutex_enter(&spa->spa_async_lock); | |
1947 spa->spa_async_thread = NULL; | |
1948 cv_broadcast(&spa->spa_async_cv); | |
1949 mutex_exit(&spa->spa_async_lock); | |
1950 thread_exit(); | |
1951 } | |
1952 | |
1953 void | |
1954 spa_async_suspend(spa_t *spa) | |
1955 { | |
1956 mutex_enter(&spa->spa_async_lock); | |
1957 spa->spa_async_suspended++; | |
1958 while (spa->spa_async_thread != NULL) | |
1959 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); | |
1960 mutex_exit(&spa->spa_async_lock); | |
1961 } | |
1962 | |
1963 void | |
1964 spa_async_resume(spa_t *spa) | |
1965 { | |
1966 mutex_enter(&spa->spa_async_lock); | |
1967 ASSERT(spa->spa_async_suspended != 0); | |
1968 spa->spa_async_suspended--; | |
1969 mutex_exit(&spa->spa_async_lock); | |
1970 } | |
1971 | |
1972 static void | |
1973 spa_async_dispatch(spa_t *spa) | |
1974 { | |
1975 mutex_enter(&spa->spa_async_lock); | |
1976 if (spa->spa_async_tasks && !spa->spa_async_suspended && | |
1977 spa->spa_async_thread == NULL) | |
1978 spa->spa_async_thread = thread_create(NULL, 0, | |
1979 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); | |
1980 mutex_exit(&spa->spa_async_lock); | |
1981 } | |
1982 | |
1983 void | |
1984 spa_async_request(spa_t *spa, int task) | |
1985 { | |
1986 mutex_enter(&spa->spa_async_lock); | |
1987 spa->spa_async_tasks |= task; | |
1988 mutex_exit(&spa->spa_async_lock); | |
789 | 1989 } |
1990 | |
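/*
 * [Editor's sketch, not part of the annotated source.]  The async machinery
 * above coalesces requests into a bitmask: requesters OR bits in under the
 * lock, and the worker snapshots and clears the mask in one critical section,
 * so nothing requested during processing is lost.  Hypothetical names:
 */
#include <pthread.h>

#define	TASK_REOPEN	0x01
#define	TASK_SCRUB	0x02

static pthread_mutex_t async_lock = PTHREAD_MUTEX_INITIALIZER;
static int async_tasks;

static void
async_request(int task)
{
	pthread_mutex_lock(&async_lock);
	async_tasks |= task;		/* coalesce with pending requests */
	pthread_mutex_unlock(&async_lock);
}

static int
async_take(void)
{
	int tasks;

	pthread_mutex_lock(&async_lock);
	tasks = async_tasks;		/* snapshot the pending work ... */
	async_tasks = 0;		/* ... and clear it atomically */
	pthread_mutex_unlock(&async_lock);
	return (tasks);			/* worker acts on the snapshot */
}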
1991 /* | |
1992 * ========================================================================== | |
1993 * SPA syncing routines | |
1994 * ========================================================================== | |
1995 */ | |
1996 | |
1997 static void | |
1998 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) | |
1999 { | |
2000 bplist_t *bpl = &spa->spa_sync_bplist; | |
2001 dmu_tx_t *tx; | |
2002 blkptr_t blk; | |
2003 uint64_t itor = 0; | |
2004 zio_t *zio; | |
2005 int error; | |
2006 uint8_t c = 1; | |
2007 | |
2008 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); | |
2009 | |
2010 while (bplist_iterate(bpl, &itor, &blk) == 0) | |
2011 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); | |
2012 | |
2013 error = zio_wait(zio); | |
2014 ASSERT3U(error, ==, 0); | |
2015 | |
2016 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); | |
2017 bplist_vacate(bpl, tx); | |
2018 | |
2019 /* | |
2020 * Pre-dirty the first block so we sync to convergence faster. | |
2021 * (Usually only the first block is needed.) | |
2022 */ | |
2023 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); | |
2024 dmu_tx_commit(tx); | |
2025 } | |
2026 | |
2027 static void | |
2028 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) | |
2029 { | |
2030 nvlist_t *config; | |
2031 char *packed = NULL; | |
2032 size_t nvsize = 0; | |
2033 dmu_buf_t *db; | |
2034 | |
2035 if (list_is_empty(&spa->spa_dirty_list)) | |
2036 return; | |
2037 | |
2038 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); | |
2039 | |
2040 spa_config_set(spa, config); | |
2041 | |
2042 VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); | |
2043 | |
2044 packed = kmem_alloc(nvsize, KM_SLEEP); | |
2045 | |
1544 | 2046 VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, |
2047 KM_SLEEP) == 0); | |
789 | 2048 |
2049 dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, | |
2050 packed, tx); | |
2051 | |
2052 kmem_free(packed, nvsize); | |
2053 | |
1544 | 2054 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, |
2055 spa->spa_config_object, FTAG, &db)); | |
789 | 2056 dmu_buf_will_dirty(db, tx); |
2057 *(uint64_t *)db->db_data = nvsize; | |
1544 | 2058 dmu_buf_rele(db, FTAG); |
789 | 2059 } |
2060 | |
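/*
 * [Editor's sketch, not part of the annotated source.]  The same
 * size-then-pack shape as spa_sync_config_object(), in userland via
 * libnvpair (link with -lnvpair on illumos).  The dmu_write() of the MOS
 * config object is omitted, and the toy pairs stand in for the real
 * config from spa_config_generate().
 */
#include <stdio.h>
#include <stdlib.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;

	if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
		return (1);
	(void) nvlist_add_string(config, "name", "tank");
	(void) nvlist_add_uint64(config, "txg", 1234);

	/* Ask for the XDR-encoded size first, then pack into that buffer. */
	if (nvlist_size(config, &nvsize, NV_ENCODE_XDR) != 0 ||
	    (packed = malloc(nvsize)) == NULL ||
	    nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) != 0) {
		free(packed);
		nvlist_free(config);
		return (1);
	}

	(void) printf("packed config: %zu bytes\n", nvsize);

	free(packed);
	nvlist_free(config);
	return (0);
}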
2061 /* | |
2062 * Sync the specified transaction group. New blocks may be dirtied as | |
2063 * part of the process, so we iterate until it converges. | |
2064 */ | |
2065 void | |
2066 spa_sync(spa_t *spa, uint64_t txg) | |
2067 { | |
2068 dsl_pool_t *dp = spa->spa_dsl_pool; | |
2069 objset_t *mos = spa->spa_meta_objset; | |
2070 bplist_t *bpl = &spa->spa_sync_bplist; | |
2071 vdev_t *vd; | |
2072 dmu_tx_t *tx; | |
2073 int dirty_vdevs; | |
2074 | |
2075 /* | |
2076 * Lock out configuration changes. | |
2077 */ | |
1544 | 2078 spa_config_enter(spa, RW_READER, FTAG); |
789 | 2079 |
2080 spa->spa_syncing_txg = txg; | |
2081 spa->spa_sync_pass = 0; | |
2082 | |
1544 | 2083 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); |
789 | 2084 |
2085 /* | |
2086 * If anything has changed in this txg, push the deferred frees | |
2087 * from the previous txg. If not, leave them alone so that we | |
2088 * don't generate work on an otherwise idle system. | |
2089 */ | |
2090 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || | |
2091 !txg_list_empty(&dp->dp_dirty_dirs, txg)) | |
2092 spa_sync_deferred_frees(spa, txg); | |
2093 | |
2094 /* | |
2095 * Iterate to convergence. | |
2096 */ | |
2097 do { | |
2098 spa->spa_sync_pass++; | |
2099 | |
2100 tx = dmu_tx_create_assigned(dp, txg); | |
2101 spa_sync_config_object(spa, tx); | |
2102 dmu_tx_commit(tx); | |
2103 | |
1544 | 2104 spa_errlog_sync(spa, txg); |
2105 | |
789 | 2106 dsl_pool_sync(dp, txg); |
2107 | |
2108 dirty_vdevs = 0; | |
2109 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { | |
2110 vdev_sync(vd, txg); | |
2111 dirty_vdevs++; | |
2112 } | |
2113 | |
2114 tx = dmu_tx_create_assigned(dp, txg); | |
2115 bplist_sync(bpl, tx); | |
2116 dmu_tx_commit(tx); | |
2117 | |
2118 } while (dirty_vdevs); | |
2119 | |
2120 bplist_close(bpl); | |
2121 | |
2122 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); | |
2123 | |
2124 /* | |
2125 * Rewrite the vdev configuration (which includes the uberblock) | |
2126 * to commit the transaction group. | |
2127 */ | |
1544 | 2128 VERIFY(0 == spa_sync_labels(spa, txg)); |
789 | 2129 |
2130 /* | |
2131 * Make a stable copy of the fully synced uberblock. | |
2132 * We use this as the root for pool traversals. | |
2133 */ | |
2134 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ | |
2135 | |
2136 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ | |
2137 | |
2138 rw_enter(&spa->spa_traverse_lock, RW_WRITER); | |
2139 spa->spa_traverse_wanted = 0; | |
2140 spa->spa_ubsync = spa->spa_uberblock; | |
2141 rw_exit(&spa->spa_traverse_lock); | |
2142 | |
2143 spa_scrub_resume(spa); /* resume scrub with new ubsync */ | |
2144 | |
2145 /* | |
2146 * Clean up the ZIL records for the synced txg. | |
2147 */ | |
2148 dsl_pool_zil_clean(dp); | |
2149 | |
2150 /* | |
2151 * Update usable space statistics. | |
2152 */ | |
2153 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) | |
2154 vdev_sync_done(vd, txg); | |
2155 | |
2156 /* | |
2157 * It had better be the case that we didn't dirty anything | |
2158 * since spa_sync_labels(). | |
2159 */ | |
2160 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); | |
2161 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); | |
2162 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); | |
2163 ASSERT(bpl->bpl_queue == NULL); | |
2164 | |
1544 | 2165 spa_config_exit(spa, FTAG); |
2166 | |
2167 /* | |
2168 * If any async tasks have been requested, kick them off. | |
2169 */ | |
2170 spa_async_dispatch(spa); | |
789 | 2171 } |
2172 | |
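/*
 * [Editor's toy model, not part of the annotated source.]  spa_sync()
 * iterates because syncing a pass can itself dirty new blocks; the loop
 * exits only when a pass finds no dirty vdevs.  The shape of that loop,
 * with made-up decay standing in for real convergence:
 */
#include <stdio.h>

int
main(void)
{
	int dirty = 5;		/* pretend five vdevs start out dirty */
	int pass = 0;

	do {
		pass++;
		dirty /= 2;	/* each pass dirties less than the last */
		printf("pass %d: %d vdevs still dirty\n", pass, dirty);
	} while (dirty != 0);	/* converged: nothing left to sync */

	return (0);
}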
2173 /* | |
2174 * Sync all pools. We don't want to hold the namespace lock across these | |
2175 * operations, so we take a reference on the spa_t and drop the lock during the | |
2176 * sync. | |
2177 */ | |
2178 void | |
2179 spa_sync_allpools(void) | |
2180 { | |
2181 spa_t *spa = NULL; | |
2182 mutex_enter(&spa_namespace_lock); | |
2183 while ((spa = spa_next(spa)) != NULL) { | |
2184 if (spa_state(spa) != POOL_STATE_ACTIVE) | |
2185 continue; | |
2186 spa_open_ref(spa, FTAG); | |
2187 mutex_exit(&spa_namespace_lock); | |
2188 txg_wait_synced(spa_get_dsl(spa), 0); | |
2189 mutex_enter(&spa_namespace_lock); | |
2190 spa_close(spa, FTAG); | |
2191 } | |
2192 mutex_exit(&spa_namespace_lock); | |
2193 } | |
2194 | |
2195 /* | |
2196 * ========================================================================== | |
2197 * Miscellaneous routines | |
2198 * ========================================================================== | |
2199 */ | |
2200 | |
2201 int | |
2202 spa_busy(void) | |
2203 { | |
2204 return (spa_active_count != 0); | |
2205 } | |
2206 | |
2207 /* | |
2208 * Remove all pools in the system. | |
2209 */ | |
2210 void | |
2211 spa_evict_all(void) | |
2212 { | |
2213 spa_t *spa; | |
2214 | |
2215 /* | |
2216 * Remove all cached state. All pools should be closed now, | |
2217 * so every spa in the AVL tree should be unreferenced. | |
2218 */ | |
2219 mutex_enter(&spa_namespace_lock); | |
2220 while ((spa = spa_next(NULL)) != NULL) { | |
2221 /* | |
1544 | 2222 * Stop async tasks. The async thread may need to detach |
2223 * a device that's been replaced, which requires grabbing | |
2224 * spa_namespace_lock, so we must drop it here. | |
789 | 2225 */ |
2226 spa_open_ref(spa, FTAG); | |
2227 mutex_exit(&spa_namespace_lock); | |
1544 | 2228 spa_async_suspend(spa); |
789 | 2229 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); |
2230 mutex_enter(&spa_namespace_lock); | |
2231 spa_close(spa, FTAG); | |
2232 | |
2233 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { | |
2234 spa_unload(spa); | |
2235 spa_deactivate(spa); | |
2236 } | |
2237 spa_remove(spa); | |
2238 } | |
2239 mutex_exit(&spa_namespace_lock); | |
2240 } | |
1544 | 2241 |
2242 vdev_t * | |
2243 spa_lookup_by_guid(spa_t *spa, uint64_t guid) | |
2244 { | |
2245 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); | |
2246 } |