Mercurial > illumos > illumos-gate
changeset 11146:7e58f40bcb1c
6826241 Sync write IOPS drops dramatically during TXG sync
6869229 zfs should switch to shiny new metaslabs more frequently
author | George Wilson <George.Wilson@Sun.COM> |
---|---|
date | Sat, 21 Nov 2009 22:51:29 -0800 |
parents | c1f49419a884 |
children | 74e8c05021f1 |
files | usr/src/cmd/zdb/zdb.c usr/src/uts/common/fs/zfs/metaslab.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/space_map.c usr/src/uts/common/fs/zfs/sys/metaslab.h usr/src/uts/common/fs/zfs/sys/metaslab_impl.h usr/src/uts/common/fs/zfs/sys/spa_impl.h usr/src/uts/common/fs/zfs/sys/space_map.h usr/src/uts/common/fs/zfs/sys/zio.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/vdev_queue.c usr/src/uts/common/fs/zfs/zio.c |
diffstat | 12 files changed, 442 insertions(+), 189 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/cmd/zdb/zdb.c Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/cmd/zdb/zdb.c Sat Nov 21 22:51:29 2009 -0800 @@ -453,33 +453,37 @@ static void dump_metaslab(metaslab_t *msp) { - char freebuf[5]; - space_map_obj_t *smo = &msp->ms_smo; vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; - - nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf); + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; + char freebuf[5]; + + nicenum(sm->sm_size - smo->smo_alloc, freebuf); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", - (u_longlong_t)(msp->ms_map.sm_start / msp->ms_map.sm_size), - (u_longlong_t)msp->ms_map.sm_start, (u_longlong_t)smo->smo_object, - freebuf); + (u_longlong_t)(sm->sm_start / sm->sm_size), + (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf); if (dump_opt['m'] > 1 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); - VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops, - SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0); - dump_metaslab_stats(msp); - space_map_unload(&msp->ms_map); + space_map_load_wait(sm); + if (!sm->sm_loaded && + (smo->smo_object != 0 || dump_opt['m'] > 2)) { + VERIFY(space_map_load(sm, zfs_metaslab_ops, + SM_FREE, smo, spa->spa_meta_objset) == 0); + dump_metaslab_stats(msp); + space_map_unload(sm); + } mutex_exit(&msp->ms_lock); } if (dump_opt['d'] > 5 || dump_opt['m'] > 2) { - ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift)); + ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift)); mutex_enter(&msp->ms_lock); - dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map); + dump_spacemap(spa->spa_meta_objset, smo, sm); mutex_exit(&msp->ms_lock); } } @@ -2843,6 +2847,8 @@ error = 0; target = argv[0]; + VERIFY(nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) == 0); + if (dump_opt['e']) { nvlist_t *cfg = NULL; char *name = find_zpool(&target, &cfg, nsearch, searchdirs); @@ -2853,8 +2859,7 @@ (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } - if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || - nvlist_add_uint64(policy, + if (nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 || nvlist_add_nvlist(cfg, ZPOOL_REWIND_POLICY, policy) != 0) { @@ -2863,13 +2868,16 @@ } if ((error = spa_import(name, cfg, NULL)) != 0) error = spa_import_verbatim(name, cfg, NULL); - nvlist_free(policy); } + } else { + VERIFY(nvlist_add_uint64(policy, ZPOOL_REWIND_META_THRESH, + UINT64_MAX) == 0); } if (error == 0) { if (strpbrk(target, "/@") == NULL || dump_opt['R']) { - error = spa_open(target, &spa, FTAG); + error = spa_open_rewind(target, &spa, FTAG, policy, + NULL); if (error) { /* * If we're missing the log device then @@ -2884,14 +2892,18 @@ } mutex_exit(&spa_namespace_lock); - if (!error) - error = spa_open(target, &spa, FTAG); + if (!error) { + error = spa_open_rewind(target, &spa, + FTAG, policy, NULL); + } } } else { error = dmu_objset_own(target, DMU_OST_ANY, B_TRUE, FTAG, &os); } } + nvlist_free(policy); + if (error) fatal("can't open '%s': %s", target, strerror(error));
--- a/usr/src/uts/common/fs/zfs/metaslab.c Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/metaslab.c Sat Nov 21 22:51:29 2009 -0800 @@ -41,7 +41,7 @@ /* * Minimum size which forces the dynamic allocator to change - * it's allocation strategy. Once the space map cannot satisfy + * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ @@ -53,7 +53,23 @@ * Once the space_map's free space drops below this level we dynamically * switch to using best-fit allocations. */ -int metaslab_df_free_pct = 30; +int metaslab_df_free_pct = 4; + +/* + * A metaslab is considered "free" if it contains a contiguous + * segment which is greater than metaslab_min_alloc_size. + */ +uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; + +/* + * Max number of space_maps to prefetch. + */ +int metaslab_prefetch_limit = SPA_DVAS_PER_BP; + +/* + * Percentage bonus multiplier for metaslabs that are in the bonus area. + */ +int metaslab_smo_bonus_pct = 150; /* * ========================================================================== @@ -310,6 +326,32 @@ } /* + * ========================================================================== + * Common allocator routines + * ========================================================================== + */ +static int +metaslab_segsize_compare(const void *x1, const void *x2) +{ + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + uint64_t ss_size1 = s1->ss_end - s1->ss_start; + uint64_t ss_size2 = s2->ss_end - s2->ss_start; + + if (ss_size1 < ss_size2) + return (-1); + if (ss_size1 > ss_size2) + return (1); + + if (s1->ss_start < s2->ss_start) + return (-1); + if (s1->ss_start > s2->ss_start) + return (1); + + return (0); +} + +/* * This is a helper function that can be used by the allocator to find * a suitable block to allocate. This will search the specified AVL * tree looking for a block that matches the specified criteria. @@ -349,101 +391,8 @@ return (metaslab_block_picker(t, cursor, size, align)); } -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ static void -metaslab_ff_load(space_map_t *sm) -{ - ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); - sm->sm_pp_root = NULL; -} - -static void -metaslab_ff_unload(space_map_t *sm) -{ - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); - sm->sm_ppd = NULL; -} - -static uint64_t -metaslab_ff_alloc(space_map_t *sm, uint64_t size) -{ - avl_tree_t *t = &sm->sm_root; - uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; - - return (metaslab_block_picker(t, cursor, size, align)); -} - -/* ARGSUSED */ -static void -metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) -{ - /* No need to update cursor */ -} - -/* ARGSUSED */ -static void -metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) -{ - /* No need to update cursor */ -} - -static space_map_ops_t metaslab_ff_ops = { - metaslab_ff_load, - metaslab_ff_unload, - metaslab_ff_alloc, - metaslab_ff_claim, - metaslab_ff_free, - NULL /* maxsize */ -}; - -/* - * Dynamic block allocator - - * Uses the first fit allocation scheme until space get low and then - * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold - * and metaslab_df_free_pct to determine when to switch the allocation scheme. - */ - -uint64_t -metaslab_df_maxsize(space_map_t *sm) -{ - avl_tree_t *t = sm->sm_pp_root; - space_seg_t *ss; - - if (t == NULL || (ss = avl_last(t)) == NULL) - return (0ULL); - - return (ss->ss_end - ss->ss_start); -} - -static int -metaslab_df_seg_compare(const void *x1, const void *x2) -{ - const space_seg_t *s1 = x1; - const space_seg_t *s2 = x2; - uint64_t ss_size1 = s1->ss_end - s1->ss_start; - uint64_t ss_size2 = s2->ss_end - s2->ss_start; - - if (ss_size1 < ss_size2) - return (-1); - if (ss_size1 > ss_size2) - return (1); - - if (s1->ss_start < s2->ss_start) - return (-1); - if (s1->ss_start > s2->ss_start) - return (1); - - return (0); -} - -static void -metaslab_df_load(space_map_t *sm) +metaslab_pp_load(space_map_t *sm) { space_seg_t *ss; @@ -451,7 +400,7 @@ sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - avl_create(sm->sm_pp_root, metaslab_df_seg_compare, + avl_create(sm->sm_pp_root, metaslab_segsize_compare, sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) @@ -459,7 +408,7 @@ } static void -metaslab_df_unload(space_map_t *sm) +metaslab_pp_unload(space_map_t *sm) { void *cookie = NULL; @@ -475,13 +424,82 @@ sm->sm_pp_root = NULL; } +/* ARGSUSED */ +static void +metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +/* ARGSUSED */ +static void +metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +/* + * Return the maximum contiguous segment within the metaslab. + */ +uint64_t +metaslab_pp_maxsize(space_map_t *sm) +{ + avl_tree_t *t = sm->sm_pp_root; + space_seg_t *ss; + + if (t == NULL || (ss = avl_last(t)) == NULL) + return (0ULL); + + return (ss->ss_end - ss->ss_start); +} + +/* + * ========================================================================== + * The first-fit block allocator + * ========================================================================== + */ +static uint64_t +metaslab_ff_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + + return (metaslab_block_picker(t, cursor, size, align)); +} + +/* ARGSUSED */ +boolean_t +metaslab_ff_fragmented(space_map_t *sm) +{ + return (B_TRUE); +} + +static space_map_ops_t metaslab_ff_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_ff_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ff_fragmented +}; + +/* + * ========================================================================== + * Dynamic block allocator - + * Uses the first fit allocation scheme until space get low and then + * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold + * and metaslab_df_free_pct to determine when to switch the allocation scheme. + * ========================================================================== + */ static uint64_t metaslab_df_alloc(space_map_t *sm, uint64_t size) { avl_tree_t *t = &sm->sm_root; uint64_t align = size & -size; uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; - uint64_t max_size = metaslab_df_maxsize(sm); + uint64_t max_size = metaslab_pp_maxsize(sm); int free_pct = sm->sm_space * 100 / sm->sm_size; ASSERT(MUTEX_HELD(sm->sm_lock)); @@ -503,27 +521,154 @@ return (metaslab_block_picker(t, cursor, size, 1ULL)); } -/* ARGSUSED */ -static void -metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size) +static boolean_t +metaslab_df_fragmented(space_map_t *sm) { - /* No need to update cursor */ -} + uint64_t max_size = metaslab_pp_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; -/* ARGSUSED */ -static void -metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size) -{ - /* No need to update cursor */ + if (max_size >= metaslab_df_alloc_threshold && + free_pct >= metaslab_df_free_pct) + return (B_FALSE); + + return (B_TRUE); } static space_map_ops_t metaslab_df_ops = { - metaslab_df_load, - metaslab_df_unload, + metaslab_pp_load, + metaslab_pp_unload, metaslab_df_alloc, - metaslab_df_claim, - metaslab_df_free, - metaslab_df_maxsize + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_df_fragmented +}; + +/* + * ========================================================================== + * Other experimental allocators + * ========================================================================== + */ +static uint64_t +metaslab_cdf_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t *cursor = (uint64_t *)sm->sm_ppd; + uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + uint64_t rsize = size; + uint64_t offset = 0; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + ASSERT3U(*extent_end, >=, *cursor); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). + */ + if ((*cursor + size) > *extent_end) { + + t = sm->sm_pp_root; + *cursor = *extent_end = 0; + + if (max_size > 2 * SPA_MAXBLOCKSIZE) + rsize = MIN(metaslab_min_alloc_size, max_size); + offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); + if (offset != -1) + *cursor = offset + size; + } else { + offset = metaslab_block_picker(t, cursor, rsize, 1ULL); + } + ASSERT3U(*cursor, <=, *extent_end); + return (offset); +} + +static boolean_t +metaslab_cdf_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + + if (max_size > (metaslab_min_alloc_size * 10)) + return (B_FALSE); + return (B_TRUE); +} + +static space_map_ops_t metaslab_cdf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_cdf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_cdf_fragmented +}; + +static uint64_t +metaslab_ndf_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + avl_index_t where; + space_seg_t *ss, ssearch; + uint64_t *cursor = (uint64_t *)sm->sm_ppd; + uint64_t max_size = metaslab_pp_maxsize(sm); + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + ssearch.ss_start = *cursor; + ssearch.ss_end = *cursor + size; + + ss = avl_find(t, &ssearch, &where); + if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { + t = sm->sm_pp_root; + + if (max_size > 2 * SPA_MAXBLOCKSIZE) + size = MIN(metaslab_min_alloc_size, max_size); + + ssearch.ss_start = 0; + ssearch.ss_end = size; + ss = avl_find(t, &ssearch, &where); + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + ASSERT(ss != NULL); + } + + if (ss != NULL) { + if (ss->ss_start + size <= ss->ss_end) { + *cursor = ss->ss_start + size; + return (ss->ss_start); + } + } + return (-1ULL); +} + +static boolean_t +metaslab_ndf_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + + if (max_size > (metaslab_min_alloc_size * 10)) + return (B_FALSE); + return (B_TRUE); +} + + +static space_map_ops_t metaslab_ndf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_ndf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ndf_fragmented }; space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; @@ -616,7 +761,6 @@ #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) -#define METASLAB_SMO_BONUS_MULTIPLIER 2 static uint64_t metaslab_weight(metaslab_t *msp) @@ -649,25 +793,60 @@ ASSERT(weight >= space && weight <= 2 * space); /* - * For locality, assign higher weight to metaslabs we've used before. + * For locality, assign higher weight to metaslabs which have + * a lower offset than what we've already activated. */ - if (smo->smo_object != 0) - weight *= METASLAB_SMO_BONUS_MULTIPLIER; + if (sm->sm_start <= mg->mg_bonus_area) + weight *= (metaslab_smo_bonus_pct / 100); ASSERT(weight >= space && - weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); + weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); + + if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { + /* + * If this metaslab is one we're actively using, adjust its + * weight to make it preferable to any inactive metaslab so + * we'll polish it off. + */ + weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + } + return (weight); +} + +static void +metaslab_prefetch(metaslab_group_t *mg) +{ + spa_t *spa = mg->mg_vd->vdev_spa; + metaslab_t *msp; + avl_tree_t *t = &mg->mg_metaslab_tree; + int m; + + mutex_enter(&mg->mg_lock); /* - * If this metaslab is one we're actively using, adjust its weight to - * make it preferable to any inactive metaslab so we'll polish it off. + * Prefetch the next potential metaslabs */ - weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; + + /* If we have reached our prefetch limit then we're done */ + if (m >= metaslab_prefetch_limit) + break; - return (weight); + if (!sm->sm_loaded && smo->smo_object != 0) { + mutex_exit(&mg->mg_lock); + dmu_prefetch(spa_meta_objset(spa), smo->smo_object, + 0ULL, smo->smo_objsize); + mutex_enter(&mg->mg_lock); + } + } + mutex_exit(&mg->mg_lock); } static int metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) { + metaslab_group_t *mg = msp->ms_group; space_map_t *sm = &msp->ms_map; space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; @@ -679,13 +858,23 @@ int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo, spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); - if (error) { + if (error) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } for (int t = 0; t < TXG_DEFER_SIZE; t++) space_map_walk(&msp->ms_defermap[t], space_map_claim, sm); + + } + + /* + * Track the bonus area as we activate new metaslabs. + */ + if (sm->sm_start > mg->mg_bonus_area) { + mutex_enter(&mg->mg_lock); + mg->mg_bonus_area = sm->sm_start; + mutex_exit(&mg->mg_lock); } /* @@ -712,9 +901,7 @@ * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ -#if 0 ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); -#endif metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } @@ -908,6 +1095,32 @@ mutex_exit(&msp->ms_lock); } +void +metaslab_sync_reassess(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + + /* + * Re-evaluate all metaslabs which have lower offsets than the + * bonus area. + */ + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_map.sm_start > mg->mg_bonus_area) + break; + + mutex_enter(&msp->ms_lock); + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + mutex_exit(&msp->ms_lock); + } + + /* + * Prefetch the next potential metaslabs + */ + metaslab_prefetch(mg); +} + static uint64_t metaslab_distance(metaslab_t *msp, dva_t *dva) { @@ -1003,7 +1216,7 @@ if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) break; - metaslab_passivate(msp, size - 1); + metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); mutex_exit(&msp->ms_lock); }
--- a/usr/src/uts/common/fs/zfs/spa.c Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/spa.c Sat Nov 21 22:51:29 2009 -0800 @@ -73,35 +73,38 @@ zti_mode_fixed, /* value is # of threads (min 1) */ zti_mode_online_percent, /* value is % of online CPUs */ zti_mode_tune, /* fill from zio_taskq_tune_* */ + zti_mode_null, /* don't create a taskq */ zti_nmodes }; -#define ZTI_THREAD_FIX(n) { zti_mode_fixed, (n) } -#define ZTI_THREAD_PCT(n) { zti_mode_online_percent, (n) } -#define ZTI_THREAD_TUNE { zti_mode_tune, 0 } - -#define ZTI_THREAD_ONE ZTI_THREAD_FIX(1) +#define ZTI_FIX(n) { zti_mode_fixed, (n) } +#define ZTI_PCT(n) { zti_mode_online_percent, (n) } +#define ZTI_TUNE { zti_mode_tune, 0 } +#define ZTI_NULL { zti_mode_null, 0 } + +#define ZTI_ONE ZTI_FIX(1) typedef struct zio_taskq_info { - const char *zti_name; - struct { - enum zti_modes zti_mode; - uint_t zti_value; - } zti_nthreads[ZIO_TASKQ_TYPES]; + enum zti_modes zti_mode; + uint_t zti_value; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { - "issue", "intr" + "issue", "issue_high", "intr", "intr_high" }; -const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = { - /* ISSUE INTR */ - { "spa_zio_null", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, - { "spa_zio_read", { ZTI_THREAD_FIX(8), ZTI_THREAD_TUNE } }, - { "spa_zio_write", { ZTI_THREAD_TUNE, ZTI_THREAD_FIX(8) } }, - { "spa_zio_free", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, - { "spa_zio_claim", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, - { "spa_zio_ioctl", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, +/* + * Define the taskq threads for the following I/O types: + * NULL, READ, WRITE, FREE, CLAIM, and IOCTL + */ +const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { + /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_FIX(8), ZTI_NULL, ZTI_TUNE, ZTI_NULL }, + { ZTI_TUNE, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent; @@ -596,14 +599,14 @@ spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); for (int t = 0; t < ZIO_TYPES; t++) { - const zio_taskq_info_t *ztip = &zio_taskqs[t]; for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - enum zti_modes mode = ztip->zti_nthreads[q].zti_mode; - uint_t value = ztip->zti_nthreads[q].zti_value; + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; char name[32]; (void) snprintf(name, sizeof (name), - "%s_%s", ztip->zti_name, zio_taskq_types[q]); + "%s_%s", zio_type_name[t], zio_taskq_types[q]); if (mode == zti_mode_tune) { mode = zio_taskq_tune_mode; @@ -628,6 +631,10 @@ TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); break; + case zti_mode_null: + spa->spa_zio_taskq[t][q] = NULL; + break; + case zti_mode_tune: default: panic("unrecognized mode for " @@ -674,7 +681,8 @@ for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - taskq_destroy(spa->spa_zio_taskq[t][q]); + if (spa->spa_zio_taskq[t][q] != NULL) + taskq_destroy(spa->spa_zio_taskq[t][q]); spa->spa_zio_taskq[t][q] = NULL; } }
--- a/usr/src/uts/common/fs/zfs/space_map.c Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/space_map.c Sat Nov 21 22:51:29 2009 -0800 @@ -367,10 +367,8 @@ uint64_t space_map_maxsize(space_map_t *sm) { - if (sm->sm_loaded && sm->sm_ops != NULL) - return (sm->sm_ops->smop_max(sm)); - else - return (-1ULL); + ASSERT(sm->sm_ops != NULL); + return (sm->sm_ops->smop_max(sm)); } uint64_t
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h Sat Nov 21 22:51:29 2009 -0800 @@ -43,6 +43,7 @@ extern void metaslab_fini(metaslab_t *msp); extern void metaslab_sync(metaslab_t *msp, uint64_t txg); extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); +extern void metaslab_sync_reassess(metaslab_group_t *mg); #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h Sat Nov 21 22:51:29 2009 -0800 @@ -51,6 +51,7 @@ kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; + uint64_t mg_bonus_area; int64_t mg_bias; int64_t mg_activation_count; metaslab_class_t *mg_class;
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h Sat Nov 21 22:51:29 2009 -0800 @@ -80,7 +80,9 @@ enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES };
--- a/usr/src/uts/common/fs/zfs/sys/space_map.h Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/space_map.h Sat Nov 21 22:51:29 2009 -0800 @@ -77,6 +77,7 @@ void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); uint64_t (*smop_max)(space_map_t *sm); + boolean_t (*smop_fragmented)(space_map_t *sm); }; /*
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/zio.h Sat Nov 21 22:51:29 2009 -0800 @@ -120,14 +120,15 @@ #define ZIO_PRIORITY_NOW (zio_priority_table[0]) #define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) #define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) -#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3]) -#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4]) -#define ZIO_PRIORITY_FREE (zio_priority_table[5]) -#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6]) -#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7]) -#define ZIO_PRIORITY_RESILVER (zio_priority_table[8]) -#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) -#define ZIO_PRIORITY_TABLE_SIZE 10 +#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3]) +#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4]) +#define ZIO_PRIORITY_AGG (zio_priority_table[5]) +#define ZIO_PRIORITY_FREE (zio_priority_table[6]) +#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7]) +#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8]) +#define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) +#define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) +#define ZIO_PRIORITY_TABLE_SIZE 11 #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101
--- a/usr/src/uts/common/fs/zfs/vdev.c Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev.c Sat Nov 21 22:51:29 2009 -0800 @@ -1939,11 +1939,15 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; + boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(!vd->vdev_ishole); while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); + + if (reassess) + metaslab_sync_reassess(vd->vdev_mg); } void
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c Sat Nov 21 22:51:29 2009 -0800 @@ -285,7 +285,7 @@ ASSERT(size <= zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, - zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW, + zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL);
--- a/usr/src/uts/common/fs/zfs/zio.c Sat Nov 21 01:05:40 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/zio.c Sat Nov 21 22:51:29 2009 -0800 @@ -45,11 +45,12 @@ 0, /* ZIO_PRIORITY_NOW */ 0, /* ZIO_PRIORITY_SYNC_READ */ 0, /* ZIO_PRIORITY_SYNC_WRITE */ - 6, /* ZIO_PRIORITY_ASYNC_READ */ - 4, /* ZIO_PRIORITY_ASYNC_WRITE */ + 0, /* ZIO_PRIORITY_LOG_WRITE */ + 1, /* ZIO_PRIORITY_CACHE_FILL */ + 1, /* ZIO_PRIORITY_AGG */ 4, /* ZIO_PRIORITY_FREE */ - 0, /* ZIO_PRIORITY_CACHE_FILL */ - 0, /* ZIO_PRIORITY_LOG_WRITE */ + 4, /* ZIO_PRIORITY_ASYNC_WRITE */ + 6, /* ZIO_PRIORITY_ASYNC_READ */ 10, /* ZIO_PRIORITY_RESILVER */ 20, /* ZIO_PRIORITY_SCRUB */ }; @@ -60,7 +61,9 @@ * ========================================================================== */ char *zio_type_name[ZIO_TYPES] = { - "null", "read", "write", "free", "claim", "ioctl" }; + "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", + "zio_ioctl" +}; /* * ========================================================================== @@ -1023,6 +1026,7 @@ static void zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) { + spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; /* @@ -1039,7 +1043,15 @@ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; - (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q], + /* + * If this is a high priority I/O, then use the high priority taskq. + */ + if (zio->io_priority == ZIO_PRIORITY_NOW && + spa->spa_zio_taskq[t][q + 1] != NULL) + q++; + + ASSERT3U(q, <, ZIO_TASKQ_TYPES); + (void) taskq_dispatch(spa->spa_zio_taskq[t][q], (task_func_t *)zio_execute, zio, TQ_SLEEP); }