Mercurial > illumos > illumos-gate
changeset 10685:931790026ac6
6846163 ZFS continues to use faulted logzilla, bringing system to a crawl
6872547 ztest LUN expansion test fails
6873635 zdb should be able to open a pool with a failed slog
6873654 system panics when a slog device is offlined
6875236 zdb should be able to dump the spa history
author | George Wilson <George.Wilson@Sun.COM> |
---|---|
date | Tue, 29 Sep 2009 07:29:35 -0700 |
parents | 5bf5dbdbb746 |
children | c2381d7785a7 |
files | usr/src/cmd/zdb/zdb.c usr/src/cmd/zinject/zinject.c usr/src/cmd/zpool/zpool_main.c usr/src/cmd/ztest/ztest.c usr/src/lib/libzfs/common/libzfs.h usr/src/lib/libzfs/common/libzfs_pool.c usr/src/lib/libzfs/common/mapfile-vers usr/src/uts/common/fs/zfs/dsl_scrub.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/spa.h usr/src/uts/common/fs/zfs/sys/spa_impl.h usr/src/uts/common/fs/zfs/sys/vdev.h usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/zfs_ioctl.c usr/src/uts/common/fs/zfs/zfs_vfsops.c usr/src/uts/common/fs/zfs/zil.c usr/src/uts/common/fs/zfs/zio.c usr/src/uts/common/fs/zfs/zio_inject.c usr/src/uts/common/sys/fs/zfs.h |
diffstat | 21 files changed, 380 insertions(+), 122 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/cmd/zdb/zdb.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/cmd/zdb/zdb.c Tue Sep 29 07:29:35 2009 -0700 @@ -100,6 +100,7 @@ (void) fprintf(stderr, " -u uberblock\n"); (void) fprintf(stderr, " -d datasets\n"); (void) fprintf(stderr, " -C cached pool configuration\n"); + (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -i intent logs\n"); (void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -m metaslabs\n"); @@ -504,7 +505,7 @@ char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" }; char prefix[256]; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); @@ -534,6 +535,67 @@ dump_dtl(vd->vdev_child[c], indent + 4); } +static void +dump_history(spa_t *spa) +{ + nvlist_t **events = NULL; + char buf[SPA_MAXBLOCKSIZE]; + uint64_t resid, off = 0; + uint64_t len = sizeof (buf); + uint_t num = 0; + int error; + time_t tsec; + struct tm t; + char tbuf[30]; + char internalstr[MAXPATHLEN]; + + do { + if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { + (void) fprintf(stderr, "Unable to read history: " + "error %d\n", error); + return; + } + + if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) + break; + + off -= resid; + } while (len != 0); + + (void) printf("\nHistory:\n"); + for (int i = 0; i < num; i++) { + uint64_t time, txg, ievent; + char *cmd, *intstr; + + if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, + &time) != 0) + continue; + if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, + &cmd) != 0) { + if (nvlist_lookup_uint64(events[i], + ZPOOL_HIST_INT_EVENT, &ievent) != 0) + continue; + verify(nvlist_lookup_uint64(events[i], + ZPOOL_HIST_TXG, &txg) == 0); + verify(nvlist_lookup_string(events[i], + ZPOOL_HIST_INT_STR, &intstr) == 0); + if (ievent >= LOG_END) + continue; + + (void) snprintf(internalstr, + sizeof (internalstr), + "[internal %s txg:%lld] %s", + hist_event_table[ievent], 
txg, + intstr); + cmd = internalstr; + } + tsec = time; + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + (void) printf("%s %s\n", tbuf, cmd); + } +} + /*ARGSUSED*/ static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) @@ -1791,6 +1853,9 @@ if (dump_opt['s']) show_pool_stats(spa); + if (dump_opt['h']) + dump_history(spa); + if (rc != 0) exit(rc); } @@ -2256,11 +2321,12 @@ dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "udibcmsvCLS:U:lRep:t:")) != -1) { + while ((c = getopt(argc, argv, "udhibcmsvCLS:U:lRep:t:")) != -1) { switch (c) { case 'u': case 'd': case 'i': + case 'h': case 'b': case 'c': case 'm': @@ -2415,6 +2481,23 @@ B_TRUE, FTAG, &os); } else { error = spa_open(argv[0], &spa, FTAG); + if (error) { + /* + * If we're missing the log device then + * try opening the pool after clearing the + * log state. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(argv[0])) != NULL && + spa->spa_log_state == SPA_LOG_MISSING) { + spa->spa_log_state = SPA_LOG_CLEAR; + error = 0; + } + mutex_exit(&spa_namespace_lock); + + if (!error) + error = spa_open(argv[0], &spa, FTAG); + } } }
--- a/usr/src/cmd/zinject/zinject.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/cmd/zinject/zinject.c Tue Sep 29 07:29:35 2009 -0700 @@ -227,11 +227,15 @@ "\t\tfunctions which call spa_vdev_config_exit(), or \n" "\t\tspa_vdev_exit() will trigger a panic.\n" "\n" - "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n" + "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F]\n" + "\t [-T <read|write|free|claim|all> pool\n" "\t\tInject a fault into a particular device or the device's\n" "\t\tlabel. Label injection can either be 'nvlist' or 'uber'.\n" "\t\t'errno' can either be 'nxio' (the default) or 'io'.\n" "\n" + "\tzinject -d device -A <degrade|fault> pool\n" + "\t\tPerform a specific action on a particular device\n" + "\n" "\tzinject -b objset:object:level:blkid pool\n" "\n" "\t\tInject an error into pool 'pool' with the numeric bookmark\n" @@ -497,6 +501,22 @@ } int +perform_action(const char *pool, zinject_record_t *record, int cmd) +{ + zfs_cmd_t zc; + + ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED); + (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); + zc.zc_guid = record->zi_guid; + zc.zc_cookie = cmd; + + if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + return (0); + + return (1); +} + +int main(int argc, char **argv) { int c; @@ -509,6 +529,8 @@ int quiet = 0; int error = 0; int domount = 0; + int io_type = ZIO_TYPES; + int action = VDEV_STATE_UNKNOWN; err_type_t type = TYPE_INVAL; err_type_t label = TYPE_INVAL; zinject_record_t record = { 0 }; @@ -546,11 +568,24 @@ return (0); } - while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:p:")) != -1) { + while ((c = getopt(argc, argv, + ":aA:b:d:f:Fqhc:t:T:l:mr:e:uL:p:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; break; + case 'A': + if (strcasecmp(optarg, "degrade") == 0) { + action = VDEV_STATE_DEGRADED; + } else if (strcasecmp(optarg, "fault") == 0) { + action = VDEV_STATE_FAULTED; + } else { + (void) fprintf(stderr, "invalid action '%s': " + 
"must be 'degrade' or 'fault'\n", optarg); + usage(); + return (1); + } + break; case 'b': raw = optarg; break; @@ -611,6 +646,25 @@ case 'r': range = optarg; break; + case 'T': + if (strcasecmp(optarg, "read") == 0) { + io_type = ZIO_TYPE_READ; + } else if (strcasecmp(optarg, "write") == 0) { + io_type = ZIO_TYPE_WRITE; + } else if (strcasecmp(optarg, "free") == 0) { + io_type = ZIO_TYPE_FREE; + } else if (strcasecmp(optarg, "claim") == 0) { + io_type = ZIO_TYPE_CLAIM; + } else if (strcasecmp(optarg, "all") == 0) { + io_type = ZIO_TYPES; + } else { + (void) fprintf(stderr, "invalid I/O type " + "'%s': must be 'read', 'write', 'free', " + "'claim' or 'all'\n", optarg); + usage(); + return (1); + } + break; case 't': if ((type = name_to_type(optarg)) == TYPE_INVAL && !MOS_TYPE(type)) { @@ -708,10 +762,15 @@ return (1); } + record.zi_iotype = io_type; if (translate_device(pool, device, label, &record) != 0) return (1); if (!error) error = ENXIO; + + if (action != VDEV_STATE_UNKNOWN) + return (perform_action(pool, &record, action)); + } else if (raw != NULL) { if (range != NULL || type != TYPE_INVAL || level != 0 || record.zi_func[0] != '\0') {
--- a/usr/src/cmd/zpool/zpool_main.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/cmd/zpool/zpool_main.c Tue Sep 29 07:29:35 2009 -0700 @@ -3624,49 +3624,6 @@ int internal; } hist_cbdata_t; -char *hist_event_table[LOG_END] = { - "invalid event", - "pool create", - "vdev add", - "pool remove", - "pool destroy", - "pool export", - "pool import", - "vdev attach", - "vdev replace", - "vdev detach", - "vdev online", - "vdev offline", - "vdev upgrade", - "pool clear", - "pool scrub", - "pool property set", - "create", - "clone", - "destroy", - "destroy_begin_sync", - "inherit", - "property set", - "quota set", - "permission update", - "permission remove", - "permission who remove", - "promote", - "receive", - "rename", - "reservation set", - "replay_inc_sync", - "replay_full_sync", - "rollback", - "snapshot", - "filesystem version upgrade", - "refquota set", - "refreservation set", - "pool scrub done", - "user hold", - "user release", -}; - /* * Print out the command history for a specific pool. */
--- a/usr/src/cmd/ztest/ztest.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/cmd/ztest/ztest.c Tue Sep 29 07:29:35 2009 -0700 @@ -203,13 +203,13 @@ ztest_info_t ztest_info[] = { { ztest_dmu_read_write, 1, &zopt_always }, - { ztest_dmu_read_write_zcopy, 1, &zopt_always }, { ztest_dmu_write_parallel, 30, &zopt_always }, { ztest_dmu_object_alloc_free, 1, &zopt_always }, { ztest_dmu_commit_callbacks, 10, &zopt_always }, { ztest_zap, 30, &zopt_always }, { ztest_fzap, 30, &zopt_always }, { ztest_zap_parallel, 100, &zopt_always }, + { ztest_dmu_read_write_zcopy, 1, &zopt_sometimes }, { ztest_dsl_prop_get_set, 1, &zopt_sometimes }, { ztest_dmu_objset_create_destroy, 1, &zopt_sometimes }, { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, @@ -1245,8 +1245,8 @@ { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; - vdev_t *pvd = vd->vdev_parent; uint64_t guid = vd->vdev_guid; + uint64_t generation = spa->spa_config_generation + 1; ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); @@ -1262,10 +1262,14 @@ * vdev may have been detached/replaced while we were * trying to online it. 
*/ - if (vd != vdev_lookup_by_guid(tvd, guid) || vd->vdev_parent != pvd) { - if (zopt_verbose >= 6) { - (void) printf("vdev %p has disappeared, was " - "guid %llu\n", (void *)vd, (u_longlong_t)guid); + if (generation != spa->spa_config_generation) { + if (zopt_verbose >= 5) { + (void) printf("vdev configuration has changed, " + "guid %llu, state %llu, expected gen %llu, " + "got gen %llu\n", (u_longlong_t)guid, + (u_longlong_t)tvd->vdev_state, + (u_longlong_t)generation, + (u_longlong_t)spa->spa_config_generation); } return (vd); } @@ -1309,7 +1313,6 @@ uint64_t spa_newsize, spa_cursize, ms_count; (void) mutex_lock(&ztest_shared->zs_vdev_lock); - mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_STATE, spa, RW_READER); while (tvd == NULL || tvd->vdev_islog) { @@ -1330,12 +1333,12 @@ psize = vd->vdev_psize; /* - * We only try to expand the vdev if it's less than 4x its - * original size and it has a valid psize. + * We only try to expand the vdev if it's healthy, less than 4x its + * original size, and it has a valid psize. 
*/ - if (psize == 0 || psize >= 4 * zopt_vdev_size) { + if (tvd->vdev_state != VDEV_STATE_HEALTHY || + psize == 0 || psize >= 4 * zopt_vdev_size) { spa_config_exit(spa, SCL_STATE, spa); - mutex_exit(&spa_namespace_lock); (void) mutex_unlock(&ztest_shared->zs_vdev_lock); return; } @@ -1361,16 +1364,14 @@ tvd->vdev_state != VDEV_STATE_HEALTHY) { if (zopt_verbose >= 5) { (void) printf("Could not expand LUN because " - "some vdevs were not healthy\n"); + "the vdev configuration changed.\n"); } (void) spa_config_exit(spa, SCL_STATE, spa); - mutex_exit(&spa_namespace_lock); (void) mutex_unlock(&ztest_shared->zs_vdev_lock); return; } (void) spa_config_exit(spa, SCL_STATE, spa); - mutex_exit(&spa_namespace_lock); /* * Expanding the LUN will update the config asynchronously, @@ -3486,6 +3487,7 @@ int maxfaults = zopt_maxfaults; vdev_t *vd0 = NULL; uint64_t guid0 = 0; + boolean_t islog = B_FALSE; ASSERT(leaves >= 1); @@ -3513,6 +3515,9 @@ zopt_dir, zopt_pool, top * leaves + leaf); vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); + if (vd0 != NULL && vd0->vdev_top->vdev_islog) + islog = B_TRUE; + if (vd0 != NULL && maxfaults != 1) { /* * Make vd0 explicitly claim to be unreadable, @@ -3558,22 +3563,38 @@ spa_config_exit(spa, SCL_STATE, FTAG); - if (maxfaults == 0) - return; - /* - * If we can tolerate two or more faults, randomly online/offline vd0. + * If we can tolerate two or more faults, or we're dealing + * with a slog, randomly online/offline vd0. */ - if (maxfaults >= 2 && guid0 != 0) { + if ((maxfaults >= 2 || islog) && guid0 != 0) { if (ztest_random(10) < 6) { int flags = (ztest_random(2) == 0 ? ZFS_OFFLINE_TEMPORARY : 0); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between offlining a slog and + * destroying a dataset. Offlining the slog will + * grab a reference on the dataset which may cause + * dmu_objset_destroy() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. 
+ */ + if (islog) + (void) rw_wrlock(&ztest_shared->zs_name_lock); + VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); + + if (islog) + (void) rw_unlock(&ztest_shared->zs_name_lock); } else { (void) vdev_online(spa, guid0, 0, NULL); } } + if (maxfaults == 0) + return; + /* * We have at least single-fault tolerance, so inject data corruption. */ @@ -3921,7 +3942,7 @@ ztest_resume(spa_t *spa) { if (spa_suspended(spa)) { - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); vdev_clear(spa, NULL); (void) spa_vdev_state_exit(spa, NULL, 0); (void) zio_resume(spa);
--- a/usr/src/lib/libzfs/common/libzfs.h Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/lib/libzfs/common/libzfs.h Tue Sep 29 07:29:35 2009 -0700 @@ -332,10 +332,14 @@ */ struct zfs_cmd; +extern const char *hist_event_table[LOG_END]; + extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, boolean_t verbose); extern int zpool_upgrade(zpool_handle_t *, uint64_t); extern int zpool_get_history(zpool_handle_t *, nvlist_t **); +extern int zpool_history_unpack(char *, uint64_t, uint64_t *, + nvlist_t ***, uint_t *); extern void zpool_set_history_str(const char *subcommand, int argc, char **argv, char *history_str); extern int zpool_stage_history(libzfs_handle_t *, const char *);
--- a/usr/src/lib/libzfs/common/libzfs_pool.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/lib/libzfs/common/libzfs_pool.c Tue Sep 29 07:29:35 2009 -0700 @@ -42,6 +42,49 @@ #include "zfs_prop.h" #include "libzfs_impl.h" +const char *hist_event_table[LOG_END] = { + "invalid event", + "pool create", + "vdev add", + "pool remove", + "pool destroy", + "pool export", + "pool import", + "vdev attach", + "vdev replace", + "vdev detach", + "vdev online", + "vdev offline", + "vdev upgrade", + "pool clear", + "pool scrub", + "pool property set", + "create", + "clone", + "destroy", + "destroy_begin_sync", + "inherit", + "property set", + "quota set", + "permission update", + "permission remove", + "permission who remove", + "promote", + "receive", + "rename", + "reservation set", + "replay_inc_sync", + "replay_full_sync", + "rollback", + "snapshot", + "filesystem version upgrade", + "refquota set", + "refreservation set", + "pool scrub done", + "user hold", + "user release", +}; + static int read_efi_label(nvlist_t *config, diskaddr_t *sb); #if defined(__i386) || defined(__amd64) @@ -2804,7 +2847,7 @@ * into 'records'. 'leftover' is set to the number of bytes that weren't * processed as there wasn't a complete record. */ -static int +int zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover, nvlist_t ***records, uint_t *numrecords) {
--- a/usr/src/lib/libzfs/common/mapfile-vers Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/lib/libzfs/common/mapfile-vers Tue Sep 29 07:29:35 2009 -0700 @@ -45,6 +45,7 @@ fletcher_4_byteswap; fletcher_4_incremental_native; fletcher_4_incremental_byteswap; + hist_event_table; libzfs_errno; libzfs_error_action; libzfs_error_description; @@ -170,6 +171,7 @@ zpool_get_prop_int; zpool_get_state; zpool_get_status; + zpool_history_unpack; zpool_import; zpool_import_props; zpool_import_status;
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c Tue Sep 29 07:29:35 2009 -0700 @@ -1033,7 +1033,7 @@ * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); spa->spa_scrub_reopen = B_TRUE; vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE;
--- a/usr/src/uts/common/fs/zfs/spa.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/spa.c Tue Sep 29 07:29:35 2009 -0700 @@ -3958,7 +3958,7 @@ * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); @@ -3977,7 +3977,7 @@ * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); }
--- a/usr/src/uts/common/fs/zfs/spa_misc.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/spa_misc.c Tue Sep 29 07:29:35 2009 -0700 @@ -880,6 +880,7 @@ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { dsl_pool_scrub_restart(spa->spa_dsl_pool); config_changed = B_TRUE; + spa->spa_config_generation++; } /* @@ -939,18 +940,24 @@ * Lock the given spa_t for the purpose of changing vdev state. */ void -spa_vdev_state_enter(spa_t *spa) +spa_vdev_state_enter(spa_t *spa, int oplocks) { - spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); + int locks = SCL_STATE_ALL | oplocks; + + spa_config_enter(spa, locks, spa, RW_WRITER); + spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { - if (vd != NULL) + if (vd != NULL) { vdev_state_dirty(vd->vdev_top); + spa->spa_config_generation++; + } - spa_config_exit(spa, SCL_STATE_ALL, spa); + ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); + spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that,
--- a/usr/src/uts/common/fs/zfs/sys/spa.h Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/sys/spa.h Tue Sep 29 07:29:35 2009 -0700 @@ -411,6 +411,7 @@ extern void spa_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); +#define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ @@ -436,7 +437,7 @@ extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ -extern void spa_vdev_state_enter(spa_t *spa); +extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); /* Accessor functions */
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h Tue Sep 29 07:29:35 2009 -0700 @@ -122,6 +122,7 @@ spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ uint64_t spa_sync_bplist_obj; /* object for deferred frees */ bplist_t spa_sync_bplist; /* deferred-free bplist */ @@ -172,6 +173,7 @@ spa_log_state_t spa_log_state; /* log state */ uint64_t spa_autoexpand; /* lun expansion on/off */ boolean_t spa_autoreplace; /* autoreplace set in open */ + int spa_vdev_locks; /* locks grabbed */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options.
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h Tue Sep 29 07:29:35 2009 -0700 @@ -80,7 +80,6 @@ extern void vdev_stat_update(zio_t *zio, uint64_t psize); extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete); -extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h Tue Sep 29 07:29:35 2009 -0700 @@ -118,6 +118,8 @@ uint32_t zi_freq; uint32_t zi_failfast; char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + uint32_t zi_pad; /* 64-bit alignment */ } zinject_record_t; #define ZINJECT_NULL 0x1
--- a/usr/src/uts/common/fs/zfs/vdev.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/vdev.c Tue Sep 29 07:29:35 2009 -0700 @@ -1935,7 +1935,7 @@ { vdev_t *vd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1955,7 +1955,8 @@ * unavailable, then back off and simply mark the vdev as degraded * instead. */ - if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + if (vdev_is_dead(vd->vdev_top) && !vd->vdev_islog && + vd->vdev_aux == NULL) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -1984,7 +1985,7 @@ { vdev_t *vd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -2017,7 +2018,7 @@ { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -2064,12 +2065,33 @@ } int +vdev_offline_log(spa_t *spa) +{ + int error = 0; + + if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN)) == 0) { + + /* + * We successfully offlined the log device, sync out the + * current txg so that the "stubby" block can be removed + * by zil_sync(). 
+ */ + txg_wait_synced(spa->spa_dsl_pool, 0); + } + return (error); +} + +int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; - int error; - - spa_vdev_state_enter(spa); + int error = 0; + uint64_t generation; + metaslab_group_t *mg; + +top: + spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -2078,6 +2100,8 @@ return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; + mg = tvd->vdev_mg; + generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. @@ -2093,6 +2117,38 @@ return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* + * If the top-level is a slog and it's had allocations + * then proceed. We check that the vdev's metaslab + * group is not NULL since it's possible that we may + * have just added this vdev and have not yet initialized + * its metaslabs. + */ + if (tvd->vdev_islog && mg != NULL) { + /* + * Prevent any future allocations. + */ + metaslab_class_remove(spa->spa_log_class, mg); + (void) spa_vdev_state_exit(spa, vd, 0); + + error = vdev_offline_log(spa); + + spa_vdev_state_enter(spa, SCL_ALLOC); + + /* + * Check to see if the config has changed. + */ + if (error || generation != spa->spa_config_generation) { + metaslab_class_add(spa->spa_log_class, mg); + if (error) + return (spa_vdev_state_exit(spa, + vd, error)); + (void) spa_vdev_state_exit(spa, vd, 0); + goto top; + } + ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0); + } + + /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level @@ -2107,28 +2163,18 @@ vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } + + /* + * Add the device back into the metaslab rotor so that + * once we online the device it's open for business. 
+ */ + if (tvd->vdev_islog && mg != NULL) + metaslab_class_add(spa->spa_log_class, mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - if (!tvd->vdev_islog || !vdev_is_dead(tvd)) - return (spa_vdev_state_exit(spa, vd, 0)); - - (void) spa_vdev_state_exit(spa, vd, 0); - - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN); - if (error) { - (void) vdev_online(spa, guid, 0, NULL); - return (error); - } - /* - * If we successfully offlined the log device then we need to - * sync out the current txg so that the "stubby" block can be - * removed by zil_sync(). - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - return (0); + return (spa_vdev_state_exit(spa, vd, 0)); } /* @@ -2356,6 +2402,14 @@ !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; + /* + * Intent log writes won't propagate their error to the root + * I/O so don't mark these types of failures as pool-level + * errors. + */ + if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + return; + mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM)
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c Tue Sep 29 07:29:35 2009 -0700 @@ -2983,7 +2983,7 @@ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL;
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c Tue Sep 29 07:29:35 2009 -0700 @@ -951,7 +951,7 @@ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); if (zil_disable) { - zil_destroy(zfsvfs->z_log, 0); + zil_destroy(zfsvfs->z_log, B_FALSE); zfsvfs->z_log = NULL; }
--- a/usr/src/uts/common/fs/zfs/zil.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/zil.c Tue Sep 29 07:29:35 2009 -0700 @@ -77,6 +77,8 @@ static kmem_cache_t *zil_lwb_cache; +static boolean_t zil_empty(zilog_t *zilog); + static int zil_dva_compare(const void *x1, const void *x2) { @@ -436,23 +438,12 @@ mutex_enter(&zilog->zl_lock); - /* - * It is possible for the ZIL to get the previously mounted zilog - * structure of the same dataset if quickly remounted and the dbuf - * eviction has not completed. In this case we can see a non - * empty lwb list and keep_first will be set. We fix this by - * clearing the keep_first. This will be slower but it's very rare. - */ - if (!list_is_empty(&zilog->zl_lwb_list) && keep_first) - keep_first = B_FALSE; - ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; - zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); - ASSERT(!keep_first); + zilog->zl_keep_first = B_FALSE; while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { list_remove(&zilog->zl_lwb_list, lwb); if (lwb->lwb_buf != NULL) @@ -461,9 +452,23 @@ kmem_cache_free(zil_lwb_cache, lwb); } } else { - if (!keep_first) { + zilog->zl_keep_first = keep_first; + if (zh->zh_flags & ZIL_REPLAY_NEEDED) { + ASSERT(!keep_first); (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, zh->zh_claim_txg); + } else { + /* + * Would like to assert zil_empty() but that + * would force us to read the log chain which + * requires us to do I/O to the log. This is + * overkill since we really just want to destroy + * the chain anyway. 
+ */ + if (!keep_first) { + blkptr_t bp = zh->zh_log; + zio_free_blk(zilog->zl_spa, &bp, txg); + } } } mutex_exit(&zilog->zl_lock); @@ -746,7 +751,7 @@ lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb); + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); } }
--- a/usr/src/uts/common/fs/zfs/zio.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/zio.c Tue Sep 29 07:29:35 2009 -0700 @@ -2196,8 +2196,9 @@ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); - if ((zio->io_error == EIO || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) { + if ((zio->io_error == EIO || !(zio->io_flags & + (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && + zio == lio) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport.
--- a/usr/src/uts/common/fs/zfs/zio_inject.c Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/fs/zfs/zio_inject.c Tue Sep 29 07:29:35 2009 -0700 @@ -184,7 +184,7 @@ int label; int ret = 0; - if (offset + zio->io_size > VDEV_LABEL_START_SIZE && + if (offset >= VDEV_LABEL_START_SIZE && offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); @@ -226,6 +226,18 @@ inject_handler_t *handler; int ret = 0; + /* + * We skip over faults in the labels unless it's during + * device open (i.e. zio == NULL). + */ + if (zio != NULL) { + uint64_t offset = zio->io_offset; + + if (offset < VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) + return (0); + } + rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; @@ -243,6 +255,12 @@ continue; } + /* Handle type specific I/O failures */ + if (zio != NULL && + handler->zi_record.zi_iotype != ZIO_TYPES && + handler->zi_record.zi_iotype != zio->io_type) + continue; + if (handler->zi_record.zi_error == error) { /* * For a failed open, pretend like the device
--- a/usr/src/uts/common/sys/fs/zfs.h Tue Sep 29 10:20:35 2009 +0200 +++ b/usr/src/uts/common/sys/fs/zfs.h Tue Sep 29 07:29:35 2009 -0700 @@ -692,7 +692,7 @@ /* * Note: This is encoded on-disk, so new events must be added to the * end, and unused events can not be removed. Be sure to edit - * zpool_main.c: hist_event_table[]. + * libzfs_pool.c: hist_event_table[]. */ typedef enum history_internal_events { LOG_NO_EVENT = 0,