Mercurial > illumos > illumos-gate
changeset 1732:9e3ae798af31
6280668 pluggable block allocation policy
6399301 initial read of space maps is super slow
6407365 large-sector disk support in ZFS
6407366 ADVANCE_NOLOCK gathers MOS
6407367 three-way deadlock between db_mtx, dbuf_hash[], and ms_lock
line wrap: on
line diff
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c Sun Apr 02 00:47:06 2006 -0800 @@ -405,7 +405,6 @@ blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { blkptr_t bp; - dva_t *dva; dmu_object_type_info_t *doti; zio_compress_info_t *zct; zio_checksum_info_t *zci; @@ -439,17 +438,20 @@ } for (i = 0; i < SPA_DVAS_PER_BP; i++) { - dva = &bp.blk_dva[i]; - mdb_printf("DVA[%d]: vdev_id %lld / %llx\n", i, - DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva)); - mdb_printf("DVA[%d]: GRID: %04x\t" - "ASIZE: %llx\n", i, DVA_GET_GRID(dva), DVA_GET_ASIZE(dva)); + dva_t *dva = &bp.blk_dva[i]; + mdb_printf("DVA[%d]: GANG: %-5s GRID: %2x ASIZE: %5x " + "vdev %llu offset %llx\n", + i, + DVA_GET_GANG(dva) ? "TRUE" : "FALSE", + DVA_GET_GRID(dva), + DVA_GET_ASIZE(dva), + DVA_GET_VDEV(dva), + DVA_GET_OFFSET(dva)); } mdb_printf("LSIZE: %-16llx\t\tPSIZE: %llx\n", BP_GET_LSIZE(&bp), BP_GET_PSIZE(&bp)); - mdb_printf("ENDIAN: %6s GANG: %-5s\tTYPE: %s\n", + mdb_printf("ENDIAN: %-6s TYPE: %s\n", BP_GET_BYTEORDER(&bp) ? "LITTLE" : "BIG", - DVA_GET_GANG(dva) ? 
"TRUE" : "FALSE", doti[BP_GET_TYPE(&bp)].ot_name); mdb_printf("BIRTH: %-16llx LEVEL: %-2d\tFILL: %llx\n", bp.blk_birth, BP_GET_LEVEL(&bp), bp.blk_fill); @@ -1146,7 +1148,7 @@ space_map_t ms_allocmap[TXG_SIZE]; space_map_t ms_freemap[TXG_SIZE]; space_map_t ms_map; - uint64_t ms_usable_space; + space_map_obj_t ms_smo; } mdb_metaslab_t; /* @@ -1170,7 +1172,7 @@ uint64_t ms_allocmap[TXG_SIZE] = {0, 0, 0, 0}; uint64_t ms_freemap[TXG_SIZE] = {0, 0, 0, 0}; uint64_t ms_map = 0; - uint64_t ms_usable_space = 0; + uint64_t avail = 0; int i, j; int havecompressed = TRUE; int shift = 20; @@ -1282,7 +1284,7 @@ GETMEMB(vdev_ms[j], struct metaslab, ms_map, ms.ms_map) || GETMEMB(vdev_ms[j], struct metaslab, - ms_usable_space, ms.ms_usable_space)) { + ms_smo, ms.ms_smo)) { return (DCMD_ERR); } @@ -1295,7 +1297,7 @@ ms_freemap[2] += ms.ms_freemap[2].sm_space; ms_freemap[3] += ms.ms_freemap[3].sm_space; ms_map += ms.ms_map.sm_space; - ms_usable_space += ms.ms_usable_space; + avail += ms.ms_map.sm_size - ms.ms_smo.smo_alloc; } } @@ -1310,8 +1312,7 @@ ms_freemap[2] >> shift, suffix, ms_freemap[3] >> shift, suffix); mdb_printf("ms_map = %llu%s\n", ms_map >> shift, suffix); - mdb_printf("ms_usable_space = %llu%s\n", - ms_usable_space >> shift, suffix); + mdb_printf("avail = %llu%s\n", avail >> shift, suffix); return (DCMD_OK); }
--- a/usr/src/cmd/zdb/zdb.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/cmd/zdb/zdb.c Sun Apr 02 00:47:06 2006 -0800 @@ -366,7 +366,7 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm) { uint64_t alloc, offset, entry; - int mapshift = sm->sm_shift; + uint8_t mapshift = sm->sm_shift; uint64_t mapstart = sm->sm_start; char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID" }; @@ -412,7 +412,7 @@ dump_metaslab(metaslab_t *msp) { char freebuf[5]; - space_map_obj_t *smo = msp->ms_smo; + space_map_obj_t *smo = &msp->ms_smo; vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; @@ -921,13 +921,13 @@ dnode_t *dn; void *bonus = NULL; size_t bsize = 0; - char iblk[6], dblk[6], lsize[6], psize[6], bonus_size[6], segsize[6]; + char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], segsize[6]; char aux[50]; int error; if (*print_header) { (void) printf("\n Object lvl iblk dblk lsize" - " psize type\n"); + " asize type\n"); *print_header = 0; } @@ -948,7 +948,7 @@ nicenum(doi.doi_data_block_size, dblk); nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1), lsize); - nicenum(doi.doi_physical_blks << 9, psize); + nicenum(doi.doi_physical_blks << 9, asize); nicenum(doi.doi_bonus_size, bonus_size); aux[0] = '\0'; @@ -963,7 +963,7 @@ (void) printf("%10lld %3u %5s %5s %5s %5s %s%s\n", (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize, - psize, dmu_ot[doi.doi_type].ot_name, aux); + asize, dmu_ot[doi.doi_type].ot_name, aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { (void) printf("%10s %3s %5s %5s %5s %5s %s\n", @@ -1214,11 +1214,9 @@ vd = rvd->vdev_child[c]; for (m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - space_map_t *sm = &msp->ms_allocmap[0]; mutex_enter(&msp->ms_lock); - error = space_map_load(sm, msp->ms_smo, SM_ALLOC, - spa->spa_meta_objset, msp->ms_usable_end, - sm->sm_size - msp->ms_usable_space); + error = space_map_load(&msp->ms_allocmap[0], NULL, + SM_ALLOC, &msp->ms_smo, 
spa->spa_meta_objset); mutex_exit(&msp->ms_lock); if (error) fatal("%s bad space map #%d, error %d", @@ -1314,7 +1312,7 @@ } static void -zdb_space_map_vacate(spa_t *spa) +zdb_space_map_unload(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; @@ -1327,6 +1325,7 @@ mutex_enter(&msp->ms_lock); space_map_vacate(&msp->ms_allocmap[0], zdb_leak, &msp->ms_allocmap[0]); + space_map_unload(&msp->ms_allocmap[0]); space_map_vacate(&msp->ms_freemap[0], NULL, NULL); mutex_exit(&msp->ms_lock); } @@ -1534,7 +1533,7 @@ th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags); th->th_noread = zdb_noread; - traverse_add_pool(th, 0, spa_first_txg(spa)); + traverse_add_pool(th, 0, spa_first_txg(spa) + TXG_CONCURRENT_STATES); while (traverse_more(th) == EAGAIN) continue; @@ -1556,7 +1555,7 @@ * Report any leaked segments. */ if (!dump_opt['L']) - zdb_space_map_vacate(spa); + zdb_space_map_unload(spa); if (dump_opt['L']) (void) printf("\n\n *** Live pool traversal; "
--- a/usr/src/cmd/ztest/ztest.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/cmd/ztest/ztest.c Sun Apr 02 00:47:06 2006 -0800 @@ -110,10 +110,11 @@ static uint64_t zopt_vdevs = 5; static uint64_t zopt_vdevtime; +static int zopt_ashift = SPA_MINBLOCKSHIFT; static int zopt_mirrors = 2; static int zopt_raidz = 4; static size_t zopt_vdev_size = SPA_MINDEVSIZE; -static int zopt_dirs = 7; +static int zopt_datasets = 7; static int zopt_threads = 23; static uint64_t zopt_passtime = 60; /* 60 seconds */ static uint64_t zopt_killrate = 70; /* 70% kill rate */ @@ -341,6 +342,7 @@ (void) printf("Usage: %s\n" "\t[-v vdevs (default: %llu)]\n" "\t[-s size_of_each_vdev (default: %s)]\n" + "\t[-a alignment_shift (default: %d) (use 0 for random)]\n" "\t[-m mirror_copies (default: %d)]\n" "\t[-r raidz_disks (default: %d)]\n" "\t[-d datasets (default: %d)]\n" @@ -351,17 +353,17 @@ "\t[-p pool_name (default: %s)]\n" "\t[-f file directory for vdev files (default: %s)]\n" "\t[-V(erbose)] (use multiple times for ever more blather)\n" - "\t[-E(xisting)] (use existing pool instead of creating new one\n" - "\t[-I(mport)] (discover and import existing pools)\n" + "\t[-E(xisting)] (use existing pool instead of creating new one)\n" "\t[-T time] total run time (default: %llu sec)\n" "\t[-P passtime] time per pass (default: %llu sec)\n" "", cmdname, (u_longlong_t)zopt_vdevs, /* -v */ nice_vdev_size, /* -s */ + zopt_ashift, /* -a */ zopt_mirrors, /* -m */ zopt_raidz, /* -r */ - zopt_dirs, /* -d */ + zopt_datasets, /* -d */ zopt_threads, /* -t */ nice_gang_bang, /* -g */ zopt_init, /* -i */ @@ -404,14 +406,14 @@ zio_gang_bang = 32 << 10; while ((opt = getopt(argc, argv, - "v:s:m:r:c:d:t:g:i:k:p:f:VEIT:P:S")) != EOF) { + "v:s:a:m:r:d:t:g:i:k:p:f:VET:P:")) != EOF) { value = 0; switch (opt) { case 'v': case 's': + case 'a': case 'm': case 'r': - case 'c': case 'd': case 't': case 'g': @@ -428,6 +430,9 @@ case 's': zopt_vdev_size = MAX(SPA_MINDEVSIZE, value); break; + case 'a': + zopt_ashift = value; + 
break; case 'm': zopt_mirrors = value; break; @@ -435,7 +440,7 @@ zopt_raidz = MAX(1, value); break; case 'd': - zopt_dirs = MAX(1, value); + zopt_datasets = MAX(1, value); break; case 't': zopt_threads = MAX(1, value); @@ -478,11 +483,20 @@ zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz >= 2 ? 2 : 1) - 1; } +static uint64_t +ztest_get_ashift(void) +{ + if (zopt_ashift == 0) + return (SPA_MINBLOCKSHIFT + ztest_random(3)); + return (zopt_ashift); +} + static nvlist_t * make_vdev_file(size_t size) { char dev_name[MAXPATHLEN]; uint64_t vdev; + uint64_t ashift = ztest_get_ashift(); int fd; nvlist_t *file; @@ -505,6 +519,7 @@ VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0); + VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); return (file); } @@ -828,7 +843,6 @@ return (NULL); } - /* * Verify that we can attach and detach devices. 
*/ @@ -841,6 +855,7 @@ nvlist_t *root, *file; uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; uint64_t leaf, top; + uint64_t ashift = ztest_get_ashift(); size_t oldsize, newsize; char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; int replacing; @@ -917,6 +932,8 @@ expected_error = EBUSY; else if (newsize < oldsize) expected_error = EOVERFLOW; + else if (ashift > oldvd->vdev_top->vdev_ashift) + expected_error = EDOM; else expected_error = 0; @@ -940,6 +957,7 @@ VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, newpath) == 0); + VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); @@ -2691,6 +2709,7 @@ nvlist_t *file, *root; int error; uint64_t guid; + uint64_t ashift = ztest_get_ashift(); vdev_t *vd; (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); @@ -2701,6 +2720,7 @@ VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0); + VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); @@ -2714,7 +2734,11 @@ guid = vd->vdev_guid; spa_config_exit(spa, FTAG); error = spa_vdev_attach(spa, guid, root, B_TRUE); - if (error != 0 && error != EBUSY && error != ENOTSUP && error != ENODEV) + if (error != 0 && + error != EBUSY && + error != ENOTSUP && + error != ENODEV && + error != EDOM) fatal(0, "spa_vdev_attach(in-place) = %d", error); nvlist_free(file); @@ -3032,8 +3056,8 @@ za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC); for (t = 0; t < zopt_threads; t++) { - d = t % zopt_dirs; - if (t < zopt_dirs) { + d = t % 
zopt_datasets; + if (t < zopt_datasets) { ztest_replay_t zr; (void) rw_rdlock(&ztest_shared->zs_name_lock); (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); @@ -3082,7 +3106,7 @@ fatal(0, "thr_join(%d) = %d", t, error); if (za[t].za_th) traverse_fini(za[t].za_th); - if (t < zopt_dirs) { + if (t < zopt_datasets) { zil_close(za[t].za_zilog); dmu_objset_close(za[t].za_os); } @@ -3105,7 +3129,7 @@ if (zs->zs_enospc_count != 0) { (void) rw_rdlock(&ztest_shared->zs_name_lock); (void) snprintf(name, 100, "%s/%s_%d", pool, pool, - (int)ztest_random(zopt_dirs)); + (int)ztest_random(zopt_datasets)); if (zopt_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); dmu_objset_find(name, ztest_destroy_cb, NULL, @@ -3226,7 +3250,7 @@ if (zopt_verbose >= 1) { (void) printf("%llu vdevs, %d datasets, %d threads," " %llu seconds...\n", - (u_longlong_t)zopt_vdevs, zopt_dirs, zopt_threads, + (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads, (u_longlong_t)zopt_time); }
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c Sun Apr 02 00:47:06 2006 -0800 @@ -283,17 +283,19 @@ if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg) return (0); - if (bc->bc_errno == 0) { + /* + * Debugging: verify that the order we visit things agrees with the + * order defined by compare_bookmark(). We don't check this for + * log blocks because there's no defined ordering for them; they're + * always visited (or not) as part of visiting the objset_phys_t. + */ + if (bc->bc_errno == 0 && bc != &th->th_zil_cache) { zbookmark_t *zb = &bc->bc_bookmark; zbookmark_t *szb = &zseg->seg_start; zbookmark_t *ezb = &zseg->seg_end; zbookmark_t *lzb = &th->th_lastcb; dnode_phys_t *dnp = bc->bc_dnode; - /* - * Debugging: verify that the order we visit things - * agrees with the order defined by compare_bookmark(). - */ ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0); ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0); ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 || @@ -477,15 +479,14 @@ zbookmark_t *zb = &bc->bc_bookmark; zseg_t *zseg = list_head(&th->th_seglist); - if (bp->blk_birth <= zseg->seg_mintxg || - bp->blk_birth >= zseg->seg_maxtxg) + if (bp->blk_birth <= zseg->seg_mintxg) return; if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) { zb->zb_object = 0; zb->zb_blkid = bp->blk_cksum.zc_word[3]; bc->bc_blkptr = *bp; - (void) th->th_func(bc, th->th_spa, th->th_arg); + (void) traverse_callback(th, zseg, bc); } } @@ -502,15 +503,14 @@ lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; - if (bp->blk_birth <= zseg->seg_mintxg || - bp->blk_birth >= zseg->seg_maxtxg) + if (bp->blk_birth <= zseg->seg_mintxg) return; if (claim_txg != 0 && bp->blk_birth >= claim_txg) { zb->zb_object = lr->lr_foid; zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); bc->bc_blkptr = *bp; - (void) th->th_func(bc, th->th_spa, th->th_arg); + 
(void) traverse_callback(th, zseg, bc); } } } @@ -589,6 +589,20 @@ SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0); + /* + * If we're traversing an open snapshot, we know that it + * can't be deleted (because it's open) and it can't change + * (because it's a snapshot). Therefore, once we've gotten + * from the uberblock down to the snapshot's objset_phys_t, + * we no longer need to synchronize with spa_sync(); we're + * traversing a completely static block tree from here on. + */ + if (th->th_advance & ADVANCE_NOLOCK) { + ASSERT(th->th_locked); + rw_exit(spa_traverse_rwlock(th->th_spa)); + th->th_locked = 0; + } + rc = traverse_read(th, bc, &dsp->ds_bp, dn); if (rc != 0) { @@ -669,7 +683,7 @@ /* * Give spa_sync() a chance to run. */ - if (spa_traverse_wanted(th->th_spa)) { + if (th->th_locked && spa_traverse_wanted(th->th_spa)) { th->th_syncs++; return (EAGAIN); } @@ -723,14 +737,15 @@ save_txg = zseg->seg_mintxg; - if (!(th->th_advance & ADVANCE_NOLOCK)) - rw_enter(rw, RW_READER); + rw_enter(rw, RW_READER); + th->th_locked = 1; rc = traverse_segment(th, zseg, mosbp); ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR); - if (!(th->th_advance & ADVANCE_NOLOCK)) + if (th->th_locked) rw_exit(rw); + th->th_locked = 0; zseg->seg_mintxg = save_txg;
--- a/usr/src/uts/common/fs/zfs/metaslab.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/metaslab.c Sun Apr 02 00:47:06 2006 -0800 @@ -161,18 +161,18 @@ kmem_free(mg, sizeof (metaslab_group_t)); } -void -metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +static void +metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == NULL); msp->ms_group = mg; - msp->ms_weight = weight; + msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); } -void +static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { mutex_enter(&mg->mg_lock); @@ -182,9 +182,11 @@ mutex_exit(&mg->mg_lock); } -void +static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { + ASSERT(MUTEX_HELD(&msp->ms_lock)); + mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); @@ -195,277 +197,32 @@ /* * ========================================================================== - * Metaslabs + * The first-fit block allocator * ========================================================================== */ -void -metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp, - uint64_t start, uint64_t size, uint64_t txg) +static void +metaslab_ff_load(space_map_t *sm) { - vdev_t *vd = mg->mg_vd; - metaslab_t *msp; - int fm; - - msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); - - msp->ms_smo = smo; - - space_map_create(&msp->ms_map, start, size, vd->vdev_ashift, - &msp->ms_lock); - - for (fm = 0; fm < TXG_SIZE; fm++) { - space_map_create(&msp->ms_allocmap[fm], start, size, - vd->vdev_ashift, &msp->ms_lock); - space_map_create(&msp->ms_freemap[fm], start, size, - vd->vdev_ashift, &msp->ms_lock); - } - - /* - * If we're opening an existing pool (txg == 0) or creating - * a new one (txg == TXG_INITIAL), all space is available now. 
- * If we're adding space to an existing pool, the new space - * does not become available until after this txg has synced. - * We enforce this by assigning an initial weight of 0 to new space. - * - * (Transactional allocations for this txg would actually be OK; - * it's intent log allocations that cause trouble. If we wrote - * a log block in this txg and lost power, the log replay would be - * based on the DVA translations that had been synced in txg - 1. - * Those translations would not include this metaslab's vdev.) - */ - metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size); - - if (txg == 0) { - /* - * We're opening the pool. Make the metaslab's - * free space available immediately. - */ - vdev_space_update(vd, size, smo->smo_alloc); - metaslab_sync_done(msp, 0); - } else { - /* - * We're adding a new metaslab to an already-open pool. - * Declare all of the metaslab's space to be free. - * - * Note that older transaction groups cannot allocate - * from this metaslab until its existence is committed, - * because we set ms_last_alloc to the current txg. 
- */ - smo->smo_alloc = 0; - msp->ms_usable_space = size; - mutex_enter(&msp->ms_lock); - space_map_add(&msp->ms_map, start, size); - msp->ms_map_incore = 1; - mutex_exit(&msp->ms_lock); - - /* XXX -- we'll need a call to picker_init here */ - msp->ms_dirty[txg & TXG_MASK] |= MSD_ADD; - msp->ms_last_alloc = txg; - vdev_dirty(vd, VDD_ADD, txg); - (void) txg_list_add(&vd->vdev_ms_list, msp, txg); - } - - *mspp = msp; -} - -void -metaslab_fini(metaslab_t *msp) -{ - int fm; - metaslab_group_t *mg = msp->ms_group; - - vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo->smo_alloc); - - metaslab_group_remove(mg, msp); - - /* XXX -- we'll need a call to picker_fini here */ - - mutex_enter(&msp->ms_lock); - - space_map_vacate(&msp->ms_map, NULL, NULL); - msp->ms_map_incore = 0; - space_map_destroy(&msp->ms_map); - - for (fm = 0; fm < TXG_SIZE; fm++) { - space_map_destroy(&msp->ms_allocmap[fm]); - space_map_destroy(&msp->ms_freemap[fm]); - } - - mutex_exit(&msp->ms_lock); - - kmem_free(msp, sizeof (metaslab_t)); + ASSERT(sm->sm_ppd == NULL); + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); } -/* - * Write a metaslab to disk in the context of the specified transaction group. 
- */ -void -metaslab_sync(metaslab_t *msp, uint64_t txg) +static void +metaslab_ff_unload(space_map_t *sm) { - vdev_t *vd = msp->ms_group->mg_vd; - spa_t *spa = vd->vdev_spa; - objset_t *os = spa->spa_meta_objset; - space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; - space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_obj_t *smo = msp->ms_smo; - uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK]; - uint64_t alloc_delta; - dmu_buf_t *db; - dmu_tx_t *tx; - - dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start); - - mutex_enter(&msp->ms_lock); - - if (*dirty & MSD_ADD) - vdev_space_update(vd, msp->ms_map.sm_size, 0); - - if (*dirty & (MSD_ALLOC | MSD_FREE)) { - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - - if (smo->smo_object == 0) { - ASSERT(smo->smo_objsize == 0); - ASSERT(smo->smo_alloc == 0); - smo->smo_object = dmu_object_alloc(os, - DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, - DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); - ASSERT(smo->smo_object != 0); - dmu_write(os, vd->vdev_ms_array, sizeof (uint64_t) * - (msp->ms_map.sm_start >> vd->vdev_ms_shift), - sizeof (uint64_t), &smo->smo_object, tx); - } - - alloc_delta = allocmap->sm_space - freemap->sm_space; - vdev_space_update(vd, 0, alloc_delta); - smo->smo_alloc += alloc_delta; - - if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 && - (*dirty & MSD_CONDENSE) == 0) { - space_map_t *sm = &msp->ms_map; - space_map_t *tsm; - int i; - - ASSERT(msp->ms_map_incore); - - space_map_merge(freemap, freed_map); - space_map_vacate(allocmap, NULL, NULL); - - /* - * Write out the current state of the allocation - * world. The current metaslab is full, minus - * stuff that's been freed this txg (freed_map), - * minus allocations from txgs in the future. 
- */ - space_map_add(sm, sm->sm_start, sm->sm_size); - for (i = 1; i < TXG_CONCURRENT_STATES; i++) { - tsm = &msp->ms_allocmap[(txg + i) & TXG_MASK]; - space_map_iterate(tsm, space_map_remove, sm); - } - space_map_iterate(freed_map, space_map_remove, sm); - - space_map_write(sm, smo, os, tx); - - ASSERT(sm->sm_space == 0); - ASSERT(freemap->sm_space == 0); - ASSERT(allocmap->sm_space == 0); - - *dirty |= MSD_CONDENSE; - } else { - space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx); - space_map_sync(freemap, freed_map, smo, SM_FREE, - os, tx); - } - - VERIFY(0 == dmu_bonus_hold(os, smo->smo_object, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(smo, db->db_data, db->db_size); - dmu_buf_rele(db, FTAG); - - dmu_tx_commit(tx); - } - - *dirty &= ~(MSD_ALLOC | MSD_FREE | MSD_ADD); - - mutex_exit(&msp->ms_lock); - - (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); + sm->sm_ppd = NULL; } -/* - * Called after a transaction group has completely synced to mark - * all of the metaslab's free space as usable. 
- */ -void -metaslab_sync_done(metaslab_t *msp, uint64_t txg) -{ - uint64_t weight; - uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK]; - space_map_obj_t *smo = msp->ms_smo; - - dprintf("%s offset %llx txg %llu\n", - vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg); - - mutex_enter(&msp->ms_lock); - - ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0); - - msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc; - msp->ms_usable_end = smo->smo_objsize; - - weight = msp->ms_usable_space; - - if (txg != 0) { - space_map_t *freed_map = - &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - - /* XXX -- we'll need a call to picker_fini here */ - - /* If we're empty, don't bother sticking around */ - if (msp->ms_usable_space == 0) { - space_map_vacate(&msp->ms_map, NULL, NULL); - msp->ms_map_incore = 0; - ASSERT3U(freed_map->sm_space, ==, 0); - weight = 0; - } else { - /* Add the freed blocks to the available space map */ - if (msp->ms_map_incore) - space_map_merge(freed_map, &msp->ms_map); - else - space_map_vacate(freed_map, NULL, NULL); - weight += msp->ms_map.sm_size; - } - - if (msp->ms_last_alloc == txg) - /* Safe to use for allocation now */ - msp->ms_last_alloc = 0; - - *dirty = 0; - } - - mutex_exit(&msp->ms_lock); - - metaslab_group_sort(msp->ms_group, msp, weight); -} - -/* - * The first-fit block picker. No picker_init or picker_fini, - * this is just an experiment to see how it feels to separate out - * the block selection policy from the map updates. - * Note: the 'cursor' argument is a form of PPD. 
- */ static uint64_t -metaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor) +metaslab_ff_alloc(space_map_t *sm, uint64_t size) { avl_tree_t *t = &sm->sm_root; uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; space_seg_t *ss, ssearch; avl_index_t where; - int tried_once = 0; -again: ssearch.ss_start = *cursor; ssearch.ss_end = *cursor + size; @@ -483,35 +240,351 @@ ss = AVL_NEXT(t, ss); } - /* If we couldn't find a block after cursor, search again */ - if (tried_once == 0) { - tried_once = 1; - *cursor = 0; - goto again; + /* + * If we know we've searched the whole map (*cursor == 0), give up. + * Otherwise, reset the cursor to the beginning and try again. + */ + if (*cursor == 0) + return (-1ULL); + + *cursor = 0; + return (metaslab_ff_alloc(sm, size)); +} + +/* ARGSUSED */ +static void +metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +/* ARGSUSED */ +static void +metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +static space_map_ops_t metaslab_ff_ops = { + metaslab_ff_load, + metaslab_ff_unload, + metaslab_ff_alloc, + metaslab_ff_claim, + metaslab_ff_free +}; + +/* + * ========================================================================== + * Metaslabs + * ========================================================================== + */ +metaslab_t * +metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, + uint64_t start, uint64_t size, uint64_t txg) +{ + vdev_t *vd = mg->mg_vd; + metaslab_t *msp; + + msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); + + msp->ms_smo_syncing = *smo; + + /* + * We create the main space map here, but we don't create the + * allocmaps and freemaps until metaslab_sync_done(). 
This serves + * two purposes: it allows metaslab_sync_done() to detect the + * addition of new space; and for debugging, it ensures that we'd + * data fault on any attempt to use this metaslab before it's ready. + */ + space_map_create(&msp->ms_map, start, size, + vd->vdev_ashift, &msp->ms_lock); + + metaslab_group_add(mg, msp); + + /* + * If we're opening an existing pool (txg == 0) or creating + * a new one (txg == TXG_INITIAL), all space is available now. + * If we're adding space to an existing pool, the new space + * does not become available until after this txg has synced. + */ + if (txg <= TXG_INITIAL) + metaslab_sync_done(msp, 0); + + if (txg != 0) { + /* + * The vdev is dirty, but the metaslab isn't -- it just needs + * to have metaslab_sync_done() invoked from vdev_sync_done(). + * [We could just dirty the metaslab, but that would cause us + * to allocate a space map object for it, which is wasteful + * and would mess up the locality logic in metaslab_weight().] + */ + ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa)); + vdev_dirty(vd, 0, NULL, txg); + vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg)); } - return (-1ULL); + return (msp); +} + +void +metaslab_fini(metaslab_t *msp) +{ + metaslab_group_t *mg = msp->ms_group; + int t; + + vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, + -msp->ms_smo.smo_alloc); + + metaslab_group_remove(mg, msp); + + mutex_enter(&msp->ms_lock); + + space_map_unload(&msp->ms_map); + space_map_destroy(&msp->ms_map); + + for (t = 0; t < TXG_SIZE; t++) { + space_map_destroy(&msp->ms_allocmap[t]); + space_map_destroy(&msp->ms_freemap[t]); + } + + mutex_exit(&msp->ms_lock); + + kmem_free(msp, sizeof (metaslab_t)); } +#define METASLAB_ACTIVE_WEIGHT (1ULL << 63) + static uint64_t -metaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg) +metaslab_weight(metaslab_t *msp) +{ + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; + vdev_t *vd = msp->ms_group->mg_vd; + uint64_t weight, space; + + 
ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* + * The baseline weight is the metaslab's free space. + */ + space = sm->sm_size - smo->smo_alloc; + weight = space; + + /* + * Modern disks have uniform bit density and constant angular velocity. + * Therefore, the outer recording zones are faster (higher bandwidth) + * than the inner zones by the ratio of outer to inner track diameter, + * which is typically around 2:1. We account for this by assigning + * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). + * In effect, this means that we'll select the metaslab with the most + * free bandwidth rather than simply the one with the most free space. + */ + weight = 2 * weight - + ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; + ASSERT(weight >= space && weight <= 2 * space); + + /* + * For locality, assign higher weight to metaslabs we've used before. + */ + if (smo->smo_object != 0) + weight *= 2; + ASSERT(weight >= space && weight <= 4 * space); + + /* + * If this metaslab is one we're actively using, adjust its weight to + * make it preferable to any inactive metaslab so we'll polish it off. 
+ */ + weight |= (msp->ms_weight & METASLAB_ACTIVE_WEIGHT); + + return (weight); +} + +static int +metaslab_activate(metaslab_t *msp) { space_map_t *sm = &msp->ms_map; - vdev_t *vd = msp->ms_group->mg_vd; - uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(msp->ms_map_incore); - ASSERT(sm->sm_space != 0); - ASSERT(P2PHASE(size, 1ULL << vd->vdev_ashift) == 0); + + if (msp->ms_weight < METASLAB_ACTIVE_WEIGHT) { + int error = space_map_load(sm, &metaslab_ff_ops, + SM_FREE, &msp->ms_smo, + msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); + if (error) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + metaslab_group_sort(msp->ms_group, msp, + msp->ms_weight | METASLAB_ACTIVE_WEIGHT); + } + ASSERT(sm->sm_loaded); + ASSERT(msp->ms_weight >= METASLAB_ACTIVE_WEIGHT); + + return (0); +} + +static void +metaslab_passivate(metaslab_t *msp, uint64_t size) +{ + metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size - 1)); + ASSERT(msp->ms_weight < METASLAB_ACTIVE_WEIGHT); +} + +/* + * Write a metaslab to disk in the context of the specified transaction group. + */ +void +metaslab_sync(metaslab_t *msp, uint64_t txg) +{ + vdev_t *vd = msp->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; + space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; + space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo_syncing; + dmu_buf_t *db; + dmu_tx_t *tx; + int t; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + /* + * The only state that can actually be changing concurrently with + * metaslab_sync() is the metaslab's ms_map. No other thread can + * be modifying this txg's allocmap, freemap, freed_map, or smo. + * Therefore, we only hold ms_lock to satisfy space_map ASSERTs. 
+ * We drop it whenever we call into the DMU, because the DMU + * can call down to us (e.g. via zio_free()) at any time. + */ + mutex_enter(&msp->ms_lock); + + if (smo->smo_object == 0) { + ASSERT(smo->smo_objsize == 0); + ASSERT(smo->smo_alloc == 0); + mutex_exit(&msp->ms_lock); + smo->smo_object = dmu_object_alloc(mos, + DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, + DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); + ASSERT(smo->smo_object != 0); + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + (sm->sm_start >> vd->vdev_ms_shift), + sizeof (uint64_t), &smo->smo_object, tx); + mutex_enter(&msp->ms_lock); + } + + space_map_walk(freemap, space_map_add, freed_map); + + if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= + 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { + /* + * The in-core space map representation is twice as compact + * as the on-disk one, so it's time to condense the latter + * by generating a pure allocmap from first principles. + * + * This metaslab is 100% allocated, + * minus the content of the in-core map (sm), + * minus what's been freed this txg (freed_map), + * minus allocations from txgs in the future + * (because they haven't been committed yet). 
+ */ + space_map_vacate(allocmap, NULL, NULL); + space_map_vacate(freemap, NULL, NULL); + + space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); - offset = metaslab_pick_block(sm, size, - &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]); - if (offset != -1ULL) { - space_map_remove(sm, offset, size); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + space_map_walk(sm, space_map_remove, allocmap); + space_map_walk(freed_map, space_map_remove, allocmap); + + for (t = 1; t < TXG_CONCURRENT_STATES; t++) + space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], + space_map_remove, allocmap); + + mutex_exit(&msp->ms_lock); + space_map_truncate(smo, mos, tx); + mutex_enter(&msp->ms_lock); } - return (offset); + + space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); + space_map_sync(freemap, SM_FREE, smo, mos, tx); + + mutex_exit(&msp->ms_lock); + + VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + ASSERT3U(db->db_size, ==, sizeof (*smo)); + bcopy(smo, db->db_data, db->db_size); + dmu_buf_rele(db, FTAG); + + dmu_tx_commit(tx); +} + +/* + * Called after a transaction group has completely synced to mark + * all of the metaslab's free space as usable. + */ +void +metaslab_sync_done(metaslab_t *msp, uint64_t txg) +{ + space_map_obj_t *smo = &msp->ms_smo; + space_map_obj_t *smosync = &msp->ms_smo_syncing; + space_map_t *sm = &msp->ms_map; + space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + int t; + + mutex_enter(&msp->ms_lock); + + /* + * If this metaslab is just becoming available, initialize its + * allocmaps and freemaps and add its capacity to the vdev. 
+ */ + if (freed_map->sm_size == 0) { + for (t = 0; t < TXG_SIZE; t++) { + space_map_create(&msp->ms_allocmap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + space_map_create(&msp->ms_freemap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + } + vdev_space_update(vd, sm->sm_size, 0); + } + + vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc); + + ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); + ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); + + /* + * If there's a space_map_load() in progress, wait for it to complete + * so that we have a consistent view of the in-core space map. + * Then, add everything we freed in this txg to the map. + */ + space_map_load_wait(sm); + space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm); + + *smo = *smosync; + + /* + * If the map is loaded but no longer active, evict it as soon as all + * future allocations have synced. (If we unloaded it now and then + * loaded a moment later, the map wouldn't reflect those allocations.) 
+ */ + if (sm->sm_loaded && msp->ms_weight < METASLAB_ACTIVE_WEIGHT) { + int evictable = 1; + + for (t = 1; t < TXG_CONCURRENT_STATES; t++) + if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) + evictable = 0; + + if (evictable) + space_map_unload(sm); + } + + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + + mutex_exit(&msp->ms_lock); } /* @@ -526,11 +599,8 @@ uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); - objset_t *os = spa->spa_meta_objset; vdev_t *vd; metaslab_t *msp; - space_map_t *sm; - space_map_obj_t *smo; int error; if ((vd = vdev_lookup_top(spa, vdev)) == NULL) @@ -540,123 +610,69 @@ return (ENXIO); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - sm = &msp->ms_map; - smo = msp->ms_smo; if (DVA_GET_GANG(dva)) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); mutex_enter(&msp->ms_lock); - if (msp->ms_map_incore == 0) { - error = space_map_load(sm, smo, SM_FREE, os, - msp->ms_usable_end, sm->sm_size - msp->ms_usable_space); - ASSERT(error == 0); - if (error) { - mutex_exit(&msp->ms_lock); - return (error); - } - msp->ms_map_incore = 1; - /* XXX -- we'll need a call to picker_init here */ - bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor)); + error = metaslab_activate(msp); + if (error) { + mutex_exit(&msp->ms_lock); + return (error); } - space_map_remove(sm, offset, size); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); - if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) { - msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC; - msp->ms_last_alloc = txg; - vdev_dirty(vd, VDD_ALLOC, txg); - (void) txg_list_add(&vd->vdev_ms_list, msp, txg); - } + space_map_claim(&msp->ms_map, offset, size); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); mutex_exit(&msp->ms_lock); return (0); } -static int -metaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg) -{ - 
/* - * Enforce segregation across transaction groups. - */ - /* XXX -- We should probably not assume we know what ms_weight means */ - if (msp->ms_last_alloc == txg) - return (msp->ms_map.sm_space >= size && msp->ms_weight >= size); - - if (msp->ms_last_alloc != 0) - return (0); - - if (msp->ms_map.sm_space >= size && msp->ms_weight >= size) - return (1); - - /* XXX -- the weight test should be in terms of MINFREE */ - return (msp->ms_usable_space >= size && msp->ms_weight >= size); -} - static metaslab_t * -metaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg) +metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t *offp, + uint64_t txg) { - metaslab_t *msp; - avl_tree_t *t = &mg->mg_metaslab_tree; - - mutex_enter(&mg->mg_lock); - for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) - if (metaslab_usable(msp, size, txg)) - break; - mutex_exit(&mg->mg_lock); - - return (msp); -} + metaslab_t *msp = NULL; + uint64_t offset = -1ULL; -static metaslab_t * -metaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size, - uint64_t *offp, uint64_t txg) -{ - metaslab_t *msp; - int error; + for (;;) { + mutex_enter(&mg->mg_lock); + msp = avl_first(&mg->mg_metaslab_tree); + if (msp == NULL || msp->ms_weight < size) { + mutex_exit(&mg->mg_lock); + return (NULL); + } + mutex_exit(&mg->mg_lock); - while ((msp = metaslab_pick(mg, size, txg)) != NULL) { - space_map_obj_t *smo = msp->ms_smo; mutex_enter(&msp->ms_lock); - if (!metaslab_usable(msp, size, txg)) { + + if (metaslab_activate(msp) != 0) { mutex_exit(&msp->ms_lock); continue; } - if (msp->ms_map_incore == 0) { - error = space_map_load(&msp->ms_map, smo, SM_FREE, - spa->spa_meta_objset, msp->ms_usable_end, - msp->ms_map.sm_size - msp->ms_usable_space); - ASSERT(error == 0); - if (error) { - mutex_exit(&msp->ms_lock); - metaslab_group_sort(mg, msp, 0); - continue; - } - msp->ms_map_incore = 1; - /* XXX -- we'll need a call to picker_init here */ - bzero(msp->ms_map_cursor, sizeof 
(msp->ms_map_cursor)); - } - *offp = metaslab_getblock(msp, size, txg); - if (*offp != -1ULL) { - if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) { - vdev_t *vd = mg->mg_vd; - msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC; - msp->ms_last_alloc = txg; - vdev_dirty(vd, VDD_ALLOC, txg); - (void) txg_list_add(&vd->vdev_ms_list, - msp, txg); - } - mutex_exit(&msp->ms_lock); - return (msp); - } + + if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) + break; + + metaslab_passivate(msp, size); + mutex_exit(&msp->ms_lock); - metaslab_group_sort(msp->ms_group, msp, size - 1); } - return (NULL); + if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + + mutex_exit(&msp->ms_lock); + + *offp = offset; + return (msp); } /* @@ -686,7 +702,7 @@ asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - msp = metaslab_group_alloc(spa, mg, asize, &offset, txg); + msp = metaslab_group_alloc(mg, asize, &offset, txg); if (msp != NULL) { ASSERT(offset != -1ULL); @@ -716,8 +732,6 @@ */ mg->mg_bias = ((su - vu) * (int64_t)mg->mg_aliquot) / (1024 * 4); - - dprintf("bias = %lld\n", mg->mg_bias); } if (atomic_add_64_nv(&mc->mc_allocated, asize) >= @@ -737,8 +751,6 @@ mc->mc_allocated = 0; } while ((mg = mg->mg_next) != rotor); - dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg); - DVA_SET_VDEV(dva, 0); DVA_SET_OFFSET(dva, 0); DVA_SET_GANG(dva, 0); @@ -751,7 +763,7 @@ * transaction group. 
*/ void -metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg) +metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); @@ -783,13 +795,15 @@ mutex_enter(&msp->ms_lock); - if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) { - msp->ms_dirty[txg & TXG_MASK] |= MSD_FREE; - vdev_dirty(vd, VDD_FREE, txg); - (void) txg_list_add(&vd->vdev_ms_list, msp, txg); + if (now) { + space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], + offset, size); + space_map_free(&msp->ms_map, offset, size); + } else { + if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); } - space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); - mutex_exit(&msp->ms_lock); }
--- a/usr/src/uts/common/fs/zfs/spa.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/spa.c Sun Apr 02 00:47:06 2006 -0800 @@ -341,8 +341,7 @@ * If the vdev guid sum doesn't match the uberblock, we have an * incomplete configuration. */ - if (rvd->vdev_guid_sum != ub->ub_guid_sum && (mosconfig || - state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT)) { + if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_GUID_SUM); error = ENXIO; @@ -842,8 +841,10 @@ /* * Pass off the heavy lifting to spa_load(). + * Pass TRUE for mosconfig because the user-supplied config + * is actually the one to trust when doing an import. */ - error = spa_load(spa, config, SPA_LOAD_IMPORT, B_FALSE); + error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); if (error) { spa_unload(spa); @@ -898,8 +899,10 @@ /* * Pass off the heavy lifting to spa_load(). + * Pass TRUE for mosconfig because the user-supplied config + * is actually the one to trust when doing an import. */ - (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_FALSE); + (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. @@ -1163,7 +1166,11 @@ if (newvd->vdev_psize < vdev_get_rsize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); - if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0) + /* + * The new device cannot have a higher alignment requirement + * than the top-level vdev. + */ + if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); /* @@ -1228,8 +1235,7 @@ /* * Mark newvd's DTL dirty in this txg. 
*/ - vdev_dirty(tvd, VDD_DTL, txg); - (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); + vdev_dirty(tvd, VDD_DTL, newvd, txg); (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); @@ -1356,12 +1362,11 @@ /* * If the device we just detached was smaller than the others, - * it may be possible to add metaslabs (i.e. grow the pool). We ignore - * the error here because the detach still succeeded - we just weren't - * able to reinitialize the metaslabs. This pool is in for a world of - * hurt, in any case. + * it may be possible to add metaslabs (i.e. grow the pool). + * vdev_metaslab_init() can't fail because the existing metaslabs + * are already in core, so there's nothing to read from disk. */ - (void) vdev_metaslab_init(tvd, txg); + VERIFY(vdev_metaslab_init(tvd, txg) == 0); vdev_config_dirty(tvd); @@ -1372,11 +1377,10 @@ * But first make sure we're not on any *other* txg's DTL list, * to prevent vd from being accessed after it's freed. */ - vdev_dirty(tvd, VDD_DTL, txg); - vd->vdev_detached = B_TRUE; for (t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); - (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); + vd->vdev_detached = B_TRUE; + vdev_dirty(tvd, VDD_DTL, vd, txg); dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); @@ -1798,10 +1802,13 @@ if (rvd->vdev_dtl_map.sm_space == 0) { /* * The pool-wide DTL is empty. - * If this is a resilver, there's nothing to do. + * If this is a resilver, there's nothing to do except + * check whether any in-progress replacements have completed. */ - if (type == POOL_SCRUB_RESILVER) + if (type == POOL_SCRUB_RESILVER) { type = POOL_SCRUB_NONE; + spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); + } } else { /* * The pool-wide DTL is non-empty.
--- a/usr/src/uts/common/fs/zfs/space_map.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/space_map.c Sun Apr 02 00:47:06 2006 -0800 @@ -28,6 +28,7 @@ #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/dmu.h> +#include <sys/zio.h> #include <sys/space_map.h> /* @@ -54,22 +55,24 @@ } void -space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint64_t shift, +space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp) { + bzero(sm, sizeof (*sm)); + avl_create(&sm->sm_root, space_map_seg_compare, sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); + sm->sm_start = start; - sm->sm_end = start + size; sm->sm_size = size; sm->sm_shift = shift; - sm->sm_space = 0; sm->sm_lock = lp; } void space_map_destroy(space_map_t *sm) { + ASSERT(!sm->sm_loaded && !sm->sm_loading); VERIFY3U(sm->sm_space, ==, 0); avl_destroy(&sm->sm_root); } @@ -85,7 +88,7 @@ ASSERT(MUTEX_HELD(sm->sm_lock)); VERIFY(size != 0); VERIFY3U(start, >=, sm->sm_start); - VERIFY3U(end, <=, sm->sm_end); + VERIFY3U(end, <=, sm->sm_start + sm->sm_size); VERIFY(sm->sm_space + size <= sm->sm_size); VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); @@ -201,7 +204,7 @@ } void -space_map_iterate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) +space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) { space_seg_t *ss; @@ -210,12 +213,6 @@ } void -space_map_merge(space_map_t *src, space_map_t *dest) -{ - space_map_vacate(src, space_map_add, dest); -} - -void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size) { avl_tree_t *t = &sm->sm_root; @@ -266,25 +263,57 @@ } } +/* + * Wait for any in-progress space_map_load() to complete. 
+ */ +void +space_map_load_wait(space_map_t *sm) +{ + ASSERT(MUTEX_HELD(sm->sm_lock)); + + while (sm->sm_loading) + cv_wait(&sm->sm_load_cv, sm->sm_lock); +} + +/* + * Note: space_map_load() will drop sm_lock across dmu_read() calls. + * The caller must be OK with this. + */ int -space_map_load(space_map_t *sm, space_map_obj_t *smo, uint8_t maptype, - objset_t *os, uint64_t end, uint64_t space) +space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, + space_map_obj_t *smo, objset_t *os) { uint64_t *entry, *entry_map, *entry_map_end; uint64_t bufsize, size, offset; uint64_t mapstart = sm->sm_start; + uint64_t end = smo->smo_objsize; + uint64_t space = smo->smo_alloc; ASSERT(MUTEX_HELD(sm->sm_lock)); - VERIFY3U(sm->sm_space, ==, 0); + + space_map_load_wait(sm); - bufsize = MIN(end, SPACE_MAP_CHUNKSIZE); - entry_map = kmem_alloc(bufsize, KM_SLEEP); + if (sm->sm_loaded) + return (0); + + sm->sm_loading = B_TRUE; + + ASSERT(sm->sm_ops == NULL); + VERIFY3U(sm->sm_space, ==, 0); if (maptype == SM_FREE) { space_map_add(sm, sm->sm_start, sm->sm_size); space = sm->sm_size - space; } + bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT; + entry_map = zio_buf_alloc(bufsize); + + mutex_exit(sm->sm_lock); + if (end > bufsize) + dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize); + mutex_enter(sm->sm_lock); + for (offset = 0; offset < end; offset += bufsize) { size = MIN(end - offset, bufsize); VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); @@ -292,8 +321,11 @@ dprintf("object=%llu offset=%llx size=%llx\n", smo->smo_object, offset, size); - VERIFY(0 == dmu_read(os, smo->smo_object, offset, size, - entry_map)); + + mutex_exit(sm->sm_lock); + VERIFY3U(dmu_read(os, smo->smo_object, offset, size, + entry_map), ==, 0); + mutex_enter(sm->sm_lock); entry_map_end = entry_map + (size / sizeof (uint64_t)); for (entry = entry_map; entry < entry_map_end; entry++) { @@ -310,14 +342,65 @@ } VERIFY3U(sm->sm_space, ==, space); - kmem_free(entry_map, bufsize); + zio_buf_free(entry_map, 
bufsize); + + sm->sm_loading = B_FALSE; + sm->sm_loaded = B_TRUE; + sm->sm_ops = ops; + + cv_broadcast(&sm->sm_load_cv); + + if (ops != NULL) + ops->smop_load(sm); return (0); } void -space_map_sync(space_map_t *sm, space_map_t *dest, space_map_obj_t *smo, - uint8_t maptype, objset_t *os, dmu_tx_t *tx) +space_map_unload(space_map_t *sm) +{ + ASSERT(MUTEX_HELD(sm->sm_lock)); + + if (sm->sm_loaded && sm->sm_ops != NULL) + sm->sm_ops->smop_unload(sm); + + sm->sm_loaded = B_FALSE; + sm->sm_ops = NULL; + + space_map_vacate(sm, NULL, NULL); +} + +uint64_t +space_map_alloc(space_map_t *sm, uint64_t size) +{ + uint64_t start; + + start = sm->sm_ops->smop_alloc(sm, size); + if (start != -1ULL) + space_map_remove(sm, start, size); + return (start); +} + +void +space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ + sm->sm_ops->smop_claim(sm, start, size); + space_map_remove(sm, start, size); +} + +void +space_map_free(space_map_t *sm, uint64_t start, uint64_t size) +{ + space_map_add(sm, start, size); + sm->sm_ops->smop_free(sm, start, size); +} + +/* + * Note: space_map_sync() will drop sm_lock across dmu_write() calls. + */ +void +space_map_sync(space_map_t *sm, uint8_t maptype, + space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(os); void *cookie = NULL; @@ -335,9 +418,14 @@ maptype == SM_ALLOC ? 
'A' : 'F', avl_numnodes(&sm->sm_root), sm->sm_space); + if (maptype == SM_ALLOC) + smo->smo_alloc += sm->sm_space; + else + smo->smo_alloc -= sm->sm_space; + bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t); - bufsize = MIN(bufsize, SPACE_MAP_CHUNKSIZE); - entry_map = kmem_alloc(bufsize, KM_SLEEP); + bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT); + entry_map = zio_buf_alloc(bufsize); entry_map_end = entry_map + (bufsize / sizeof (uint64_t)); entry = entry_map; @@ -350,9 +438,6 @@ size = ss->ss_end - ss->ss_start; start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; - if (dest) - space_map_add(dest, ss->ss_start, size); - sm->sm_space -= size; size >>= sm->sm_shift; @@ -360,8 +445,10 @@ run_len = MIN(size, SM_RUN_MAX); if (entry == entry_map_end) { + mutex_exit(sm->sm_lock); dmu_write(os, smo->smo_object, smo->smo_objsize, bufsize, entry_map, tx); + mutex_enter(sm->sm_lock); smo->smo_objsize += bufsize; entry = entry_map; } @@ -378,30 +465,23 @@ if (entry != entry_map) { size = (entry - entry_map) * sizeof (uint64_t); + mutex_exit(sm->sm_lock); dmu_write(os, smo->smo_object, smo->smo_objsize, size, entry_map, tx); + mutex_enter(sm->sm_lock); smo->smo_objsize += size; } - kmem_free(entry_map, bufsize); + zio_buf_free(entry_map, bufsize); VERIFY3U(sm->sm_space, ==, 0); } void -space_map_write(space_map_t *sm, space_map_obj_t *smo, objset_t *os, - dmu_tx_t *tx) +space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) { - uint64_t oldsize = smo->smo_objsize; - - VERIFY(0 == dmu_free_range(os, smo->smo_object, 0, - smo->smo_objsize, tx)); + VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0); smo->smo_objsize = 0; - - VERIFY3U(sm->sm_space, ==, smo->smo_alloc); - space_map_sync(sm, NULL, smo, SM_ALLOC, os, tx); - - dprintf("write sm object %llu from %llu to %llu bytes in txg %llu\n", - smo->smo_object, oldsize, smo->smo_objsize, dmu_tx_get_txg(tx)); + smo->smo_alloc = 0; }
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h Sun Apr 02 00:47:06 2006 -0800 @@ -82,7 +82,8 @@ spa_t *th_spa; blkptr_cb_t *th_func; void *th_arg; - int th_advance; + uint16_t th_advance; + uint16_t th_locked; int th_zio_flags; list_t th_seglist; traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h Sun Apr 02 00:47:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,14 +41,14 @@ typedef struct metaslab_class metaslab_class_t; typedef struct metaslab_group metaslab_group_t; -extern void metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, - metaslab_t **mspp, uint64_t offset, uint64_t size, uint64_t txg); +extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, + uint64_t start, uint64_t size, uint64_t txg); extern void metaslab_fini(metaslab_t *msp); extern void metaslab_sync(metaslab_t *msp, uint64_t txg); extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); extern int metaslab_alloc(spa_t *spa, uint64_t size, dva_t *dva, uint64_t txg); -extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg); +extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg); extern metaslab_class_t *metaslab_class_create(void); @@ -60,11 +59,6 @@ extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd); extern void metaslab_group_destroy(metaslab_group_t *mg); -extern void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, - 
uint64_t weight); -extern void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp); -extern void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, - uint64_t weight); #ifdef __cplusplus }
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h Sun Apr 02 00:47:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -56,68 +55,25 @@ }; /* - * Each metaslab's free block list is kept in its own DMU object in the - * metaslab freelist dataset. To minimize space consumption, the list - * is circular. - * - * Allocations and frees can happen in multiple transaction groups at - * the same time, which makes it a bit challening to keep the metaslab - * consistent. For example, we cannot allow frees from different - * transaction groups to be interleaved in the metaslab's free block list. - * - * We address this in several ways: - * - * We don't allow allocations from the same metaslab in concurrent - * transaction groups. metaslab_alloc() enforces this by checking - * the ms_last_alloc field, which specifies the last txg in which - * the metaslab was used for allocations. - * - * We can't segregate frees this way because we can't choose which - * DVAs someone wants to free. So we keep separate in-core freelists - * for each active transaction group. 
This in-core data is only - * written to the metaslab's on-disk freelist in metaslab_sync(), - * which solves the interleave problem: we only append frees from - * the syncing txg to the on-disk freelist, so the appends all occur - * in txg order. - * - * We cannot allow a block which was freed in a given txg to be - * allocated again until that txg has closed; otherwise, if we - * failed to sync that txg and had to roll back to txg - 1, - * changes in txg + 1 could have overwritten the data. Therefore, - * we partition the free blocks into "available" and "limbo" states. - * A block is available if the txg in which it was freed has closed; - * until then, the block is in limbo. Each time metaslab_sync() runs, - * if first adds any limbo blocks to the avail list, clears the limbo - * list, and starts writing the new limbo blocks (i.e. the ones that - * were freed in the syncing txg). + * Each metaslab's free space is tracked in space map object in the MOS, + * which is only updated in syncing context. Each time we sync a txg, + * we append the allocs and frees from that txg to the space map object. + * When the txg is done syncing, metaslab_sync_done() updates ms_smo + * to ms_smo_syncing. Everything in ms_smo is always safe to allocate. */ - struct metaslab { kmutex_t ms_lock; /* metaslab lock */ - space_map_obj_t *ms_smo; /* space map object */ - uint64_t ms_last_alloc; /* txg of last alloc */ - uint64_t ms_usable_end; /* end of free_obj at last sync */ - uint64_t ms_usable_space; /* usable space at last sync */ - metaslab_group_t *ms_group; /* metaslab group */ - avl_node_t ms_group_node; /* node in metaslab group tree */ - uint64_t ms_weight; /* weight vs. 
others in group */ - uint8_t ms_dirty[TXG_SIZE]; /* per-txg dirty flags */ + space_map_obj_t ms_smo; /* synced space map object */ + space_map_obj_t ms_smo_syncing; /* syncing space map object */ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ + space_map_t ms_map; /* in-core free space map */ + uint64_t ms_weight; /* weight vs. others in group */ + metaslab_group_t *ms_group; /* metaslab group */ + avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ - space_map_t ms_map; /* in-core free space map */ - uint8_t ms_map_incore; /* space map contents are valid */ - uint64_t ms_map_cursor[SPA_ASIZEBITS]; /* XXX -- PPD */ }; -/* - * ms_dirty[] flags - */ -#define MSD_ALLOC 0x01 /* allocated from in this txg */ -#define MSD_FREE 0x02 /* freed to in this txg */ -#define MSD_ADD 0x04 /* added to the pool in this txg */ -#define MSD_CONDENSE 0x08 /* condensed in this txg */ - #ifdef __cplusplus } #endif
--- a/usr/src/uts/common/fs/zfs/sys/space_map.h Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/space_map.h Sun Apr 02 00:47:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,13 +35,20 @@ extern "C" { #endif +typedef struct space_map_ops space_map_ops_t; + typedef struct space_map { - avl_tree_t sm_root; /* Root of the AVL tree */ - uint64_t sm_start; /* Start of map (inclusive) */ - uint64_t sm_end; /* End of map (exclusive) */ - uint64_t sm_size; /* Size of map (end - start) */ - uint64_t sm_shift; /* Unit shift */ - uint64_t sm_space; /* Sum of all segments in the map */ + avl_tree_t sm_root; /* AVL tree of map segments */ + uint64_t sm_space; /* sum of all segments in the map */ + uint64_t sm_start; /* start of map */ + uint64_t sm_size; /* size of map */ + uint8_t sm_shift; /* unit shift */ + uint8_t sm_pad[3]; /* unused */ + uint8_t sm_loaded; /* map loaded? */ + uint8_t sm_loading; /* map loading? 
*/ + kcondvar_t sm_load_cv; /* map load completion */ + space_map_ops_t *sm_ops; /* space map block picker ops vector */ + void *sm_ppd; /* picker-private data */ kmutex_t *sm_lock; /* pointer to lock that protects map */ } space_map_t; @@ -58,6 +64,14 @@ uint64_t smo_alloc; /* space allocated from the map */ } space_map_obj_t; +struct space_map_ops { + void (*smop_load)(space_map_t *sm); + void (*smop_unload)(space_map_t *sm); + uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); + void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); + void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); +}; + /* * debug entry * @@ -112,29 +126,33 @@ */ #define SPACE_MAP_BLOCKSHIFT 12 -#define SPACE_MAP_CHUNKSIZE (1<<20) - typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, - uint64_t shift, kmutex_t *lp); + uint8_t shift, kmutex_t *lp); extern void space_map_destroy(space_map_t *sm); extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); -extern void space_map_iterate(space_map_t *sm, +extern void space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); -extern void space_map_merge(space_map_t *dest, space_map_t *src); extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_union(space_map_t *smd, space_map_t *sms); -extern int space_map_load(space_map_t *sm, space_map_obj_t *smo, - uint8_t maptype, objset_t *os, uint64_t end, uint64_t space); -extern void space_map_sync(space_map_t *sm, space_map_t *dest, - space_map_obj_t *smo, uint8_t maptype, objset_t *os, dmu_tx_t *tx); -extern void space_map_write(space_map_t *sm, 
space_map_obj_t *smo, +extern void space_map_load_wait(space_map_t *sm); +extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, + uint8_t maptype, space_map_obj_t *smo, objset_t *os); +extern void space_map_unload(space_map_t *sm); + +extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); +extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); + +extern void space_map_sync(space_map_t *sm, uint8_t maptype, + space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); +extern void space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); #ifdef __cplusplus
--- a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h Sun Apr 02 00:47:06 2006 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,13 +44,9 @@ * version mismatch. If the ub_magic field is moved, applications that * expect the magic number in the first word won't work. */ - -#define UBERBLOCK_SHIFT (10) -#define UBERBLOCK_SIZE (1ULL << UBERBLOCK_SHIFT) - #define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ - #define UBERBLOCK_VERSION 1ULL +#define UBERBLOCK_SHIFT 10 /* up to 1K */ struct uberblock { uint64_t ub_magic; /* UBERBLOCK_MAGIC */ @@ -62,13 +57,6 @@ blkptr_t ub_rootbp; /* MOS objset_phys_t */ }; -typedef struct uberblock_phys { - uberblock_t ubp_uberblock; - char ubp_pad[UBERBLOCK_SIZE - sizeof (uberblock_t) - - sizeof (zio_block_tail_t)]; - zio_block_tail_t ubp_zbt; -} uberblock_phys_t; - #ifdef __cplusplus } #endif
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Sun Apr 02 00:47:06 2006 -0800 @@ -147,12 +147,9 @@ uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ - space_map_obj_t *vdev_smo; /* metaslab space map array */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ - uint8_t vdev_dirty[TXG_SIZE]; /* per-txg dirty flags */ - uint8_t vdev_is_dirty; /* on config dirty list? */ uint8_t vdev_reopen_wanted; /* async reopen wanted? */ list_node_t vdev_dirty_node; /* config dirty list */ @@ -163,13 +160,13 @@ space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ + uint64_t vdev_offline; /* device taken offline? */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ uint64_t vdev_fault_arg; /* fault injection paramater */ int vdev_fault_mask; /* zio types to fault */ uint8_t vdev_fault_mode; /* fault injection mode */ uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */ - uint8_t vdev_offline; /* device taken offline? */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ @@ -185,14 +182,21 @@ * incorrect. 
*/ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ - kmutex_t vdev_dirty_lock; /* vdev_dirty[] */ kmutex_t vdev_stat_lock; /* vdev_stat */ }; #define VDEV_SKIP_SIZE (8 << 10) #define VDEV_BOOT_HEADER_SIZE (8 << 10) #define VDEV_PHYS_SIZE (112 << 10) -#define VDEV_UBERBLOCKS ((128 << 10) >> UBERBLOCK_SHIFT) +#define VDEV_UBERBLOCK_RING (128 << 10) + +#define VDEV_UBERBLOCK_SHIFT(vd) \ + MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) +#define VDEV_UBERBLOCK_COUNT(vd) \ + (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) +#define VDEV_UBERBLOCK_OFFSET(vd, n) \ + offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) +#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_BOOT_MAGIC 0x2f5b007b10c /* ZFS boot block */ #define VDEV_BOOT_VERSION 1 /* version number */ @@ -211,13 +215,19 @@ } vdev_phys_t; typedef struct vdev_label { - char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ - vdev_boot_header_t vl_boot_header; /* 8K */ - vdev_phys_t vl_vdev_phys; /* 112K */ - uberblock_phys_t vl_uberblock[VDEV_UBERBLOCKS]; /* 128K */ + char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ + vdev_boot_header_t vl_boot_header; /* 8K */ + vdev_phys_t vl_vdev_phys; /* 112K */ + char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ /* + * vdev_dirty() flags + */ +#define VDD_METASLAB 0x01 +#define VDD_DTL 0x02 + +/* * Size and offset of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. */ @@ -225,14 +235,6 @@ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ /* - * vdev_dirty[] flags - */ -#define VDD_ALLOC 0x01 /* allocated from in this txg */ -#define VDD_FREE 0x02 /* freed to in this txg */ -#define VDD_ADD 0x04 /* added to the pool in this txg */ -#define VDD_DTL 0x08 /* dirty time log entry in this txg */ - -/* * Size of label regions at the start and end of each leaf device. 
*/ #define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) @@ -264,7 +266,7 @@ extern int vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); -extern void vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg); +extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); /* * Available vdev types.
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/zio.h Sun Apr 02 00:47:06 2006 -0800 @@ -125,6 +125,7 @@ #define ZIO_FLAG_RESILVER 0x01000 #define ZIO_FLAG_SCRUB 0x02000 +#define ZIO_FLAG_SUBBLOCK 0x04000 #define ZIO_FLAG_NOBOOKMARK 0x10000
--- a/usr/src/uts/common/fs/zfs/vdev.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev.c Sun Apr 02 00:47:06 2006 -0800 @@ -77,7 +77,7 @@ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { - uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift); + uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; uint64_t c; @@ -299,7 +299,6 @@ vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; - mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); @@ -328,13 +327,12 @@ txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); - space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); + space_map_unload(&vd->vdev_dtl_map); space_map_destroy(&vd->vdev_dtl_map); space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); space_map_destroy(&vd->vdev_dtl_scrub); mutex_exit(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_dirty_lock); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; @@ -352,7 +350,7 @@ { vdev_ops_t *ops; char *type; - uint64_t guid = 0, offline = 0; + uint64_t guid = 0; vdev_t *vd; ASSERT(spa_config_held(spa, RW_WRITER)); @@ -401,6 +399,11 @@ &vd->vdev_not_present); /* + * Get the alignment requirement. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + + /* * If we're a top-level vdev, try to load the allocation parameters. 
*/ if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { @@ -408,24 +411,18 @@ &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, - &vd->vdev_ashift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); } /* - * If we're a leaf vdev, try to load the DTL object - * and the offline state. + * If we're a leaf vdev, try to load the DTL object and offline state. */ - vd->vdev_offline = B_FALSE; if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl.smo_object); - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &offline) - == 0) - vd->vdev_offline = offline; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, + &vd->vdev_offline); } /* @@ -447,7 +444,7 @@ */ vdev_close(vd); - ASSERT(!vd->vdev_is_dirty); + ASSERT(!list_link_active(&vd->vdev_dirty_node)); /* * Free all children. 
@@ -499,13 +496,13 @@ svd->vdev_ms_count = 0; tvd->vdev_mg = svd->vdev_mg; - tvd->vdev_mg->mg_vd = tvd; tvd->vdev_ms = svd->vdev_ms; - tvd->vdev_smo = svd->vdev_smo; svd->vdev_mg = NULL; svd->vdev_ms = NULL; - svd->vdev_smo = NULL; + + if (tvd->vdev_mg != NULL) + tvd->vdev_mg->mg_vd = tvd; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; @@ -520,11 +517,9 @@ (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); - tvd->vdev_dirty[t] = svd->vdev_dirty[t]; - svd->vdev_dirty[t] = 0; } - if (svd->vdev_is_dirty) { + if (list_link_active(&svd->vdev_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } @@ -560,16 +555,17 @@ ASSERT(spa_config_held(spa, RW_WRITER)); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); + + mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_ashift = cvd->vdev_ashift; + mvd->vdev_state = cvd->vdev_state; + vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); - mvd->vdev_asize = cvd->vdev_asize; - mvd->vdev_ashift = cvd->vdev_ashift; - mvd->vdev_state = cvd->vdev_state; - if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); @@ -590,6 +586,7 @@ ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops); + cvd->vdev_ashift = mvd->vdev_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); @@ -608,13 +605,13 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; metaslab_class_t *mc = spa_metaslab_class_select(spa); - uint64_t c; + uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; - space_map_obj_t *smo = vd->vdev_smo; - metaslab_t **mspp = vd->vdev_ms; - int ret; + metaslab_t **mspp; + int error; 
if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ return (0); @@ -623,77 +620,43 @@ ASSERT(oldc <= newc); - vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP); - vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); + if (vd->vdev_mg == NULL) + vd->vdev_mg = metaslab_group_create(mc, vd); + + mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); + + if (oldc != 0) { + bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); + kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); + } + + vd->vdev_ms = mspp; vd->vdev_ms_count = newc; - if (vd->vdev_mg == NULL) { + for (m = oldc; m < newc; m++) { + space_map_obj_t smo = { 0, 0, 0 }; if (txg == 0) { - dmu_buf_t *db; - uint64_t *ms_array; - - ms_array = kmem_zalloc(newc * sizeof (uint64_t), - KM_SLEEP); - - if ((ret = dmu_read(spa->spa_meta_objset, - vd->vdev_ms_array, 0, - newc * sizeof (uint64_t), ms_array)) != 0) { - kmem_free(ms_array, newc * sizeof (uint64_t)); - goto error; - } - - for (c = 0; c < newc; c++) { - if (ms_array[c] == 0) - continue; - if ((ret = dmu_bonus_hold( - spa->spa_meta_objset, ms_array[c], - FTAG, &db)) != 0) { - kmem_free(ms_array, - newc * sizeof (uint64_t)); - goto error; - } - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(db->db_data, &vd->vdev_smo[c], - db->db_size); - ASSERT3U(vd->vdev_smo[c].smo_object, ==, - ms_array[c]); + uint64_t object = 0; + error = dmu_read(mos, vd->vdev_ms_array, + m * sizeof (uint64_t), sizeof (uint64_t), &object); + if (error) + return (error); + if (object != 0) { + dmu_buf_t *db; + error = dmu_bonus_hold(mos, object, FTAG, &db); + if (error) + return (error); + ASSERT3U(db->db_size, ==, sizeof (smo)); + bcopy(db->db_data, &smo, db->db_size); + ASSERT3U(smo.smo_object, ==, object); dmu_buf_rele(db, FTAG); } - kmem_free(ms_array, newc * sizeof (uint64_t)); } - vd->vdev_mg = metaslab_group_create(mc, vd); - } - - for (c = 0; c < oldc; c++) { - vd->vdev_smo[c] = smo[c]; - vd->vdev_ms[c] = mspp[c]; - mspp[c]->ms_smo = &vd->vdev_smo[c]; - } - - for 
(c = oldc; c < newc; c++) - metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c], - c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); - - if (oldc != 0) { - kmem_free(smo, oldc * sizeof (*smo)); - kmem_free(mspp, oldc * sizeof (*mspp)); + vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, + m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); } return (0); - -error: - /* - * On error, undo any partial progress we may have made, and restore the - * old metaslab values. - */ - kmem_free(vd->vdev_smo, newc * sizeof (*smo)); - kmem_free(vd->vdev_ms, newc * sizeof (*mspp)); - - vd->vdev_smo = smo; - vd->vdev_ms = mspp; - vd->vdev_ms_count = oldc; - - return (ret); } void @@ -704,15 +667,11 @@ if (vd->vdev_ms != NULL) { for (m = 0; m < count; m++) - metaslab_fini(vd->vdev_ms[m]); + if (vd->vdev_ms[m] != NULL) + metaslab_fini(vd->vdev_ms[m]); kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; } - - if (vd->vdev_smo != NULL) { - kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t)); - vd->vdev_smo = NULL; - } } /* @@ -726,7 +685,7 @@ int c; uint64_t osize = 0; uint64_t asize, psize; - uint64_t ashift = -1ULL; + uint64_t ashift = 0; ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || @@ -793,7 +752,7 @@ psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { - if (osize < SPA_MINDEVSIZE - + if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); @@ -808,14 +767,15 @@ if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. + * For testing purposes, a higher ashift can be requested. */ vd->vdev_asize = asize; - vd->vdev_ashift = ashift; + vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); } else { /* * Make sure the alignment requirement hasn't increased. 
*/ - if (ashift > vd->vdev_ashift) { + if (ashift > vd->vdev_top->vdev_ashift) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (EINVAL); @@ -965,17 +925,18 @@ } void -vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg) +vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { - vdev_t *tvd = vd->vdev_top; + ASSERT(vd == vd->vdev_top); + ASSERT(ISP2(flags)); - mutex_enter(&tvd->vdev_dirty_lock); - if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) { - tvd->vdev_dirty[txg & TXG_MASK] |= flags; - (void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list, - tvd, txg); - } - mutex_exit(&tvd->vdev_dirty_lock); + if (flags & VDD_METASLAB) + (void) txg_list_add(&vd->vdev_ms_list, arg, txg); + + if (flags & VDD_DTL) + (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); + + (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void @@ -1031,11 +992,8 @@ if (scrub_done) space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); mutex_exit(&vd->vdev_dtl_lock); - if (txg != 0) { - vdev_t *tvd = vd->vdev_top; - vdev_dirty(tvd, VDD_DTL, txg); - (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); - } + if (txg != 0) + vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } @@ -1068,6 +1026,7 @@ { spa_t *spa = vd->vdev_spa; space_map_obj_t *smo = &vd->vdev_dtl; + objset_t *mos = spa->spa_meta_objset; dmu_buf_t *db; int error; @@ -1076,16 +1035,15 @@ if (smo->smo_object == 0) return (0); - if ((error = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object, - FTAG, &db)) != 0) + if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) return (error); + ASSERT3U(db->db_size, ==, sizeof (*smo)); bcopy(db->db_data, smo, db->db_size); dmu_buf_rele(db, FTAG); mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC, - spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc); + error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); mutex_exit(&vd->vdev_dtl_lock); return (error); @@ -1097,10 +1055,9 
@@ spa_t *spa = vd->vdev_spa; space_map_obj_t *smo = &vd->vdev_dtl; space_map_t *sm = &vd->vdev_dtl_map; + objset_t *mos = spa->spa_meta_objset; space_map_t smsync; kmutex_t smlock; - avl_tree_t *t = &sm->sm_root; - space_seg_t *ss; dmu_buf_t *db; dmu_tx_t *tx; @@ -1111,28 +1068,26 @@ if (vd->vdev_detached) { if (smo->smo_object != 0) { - int err = dmu_object_free(spa->spa_meta_objset, - smo->smo_object, tx); + int err = dmu_object_free(mos, smo->smo_object, tx); ASSERT3U(err, ==, 0); smo->smo_object = 0; } dmu_tx_commit(tx); + dprintf("detach %s committed in txg %llu\n", + vdev_description(vd), txg); return; } if (smo->smo_object == 0) { ASSERT(smo->smo_objsize == 0); ASSERT(smo->smo_alloc == 0); - smo->smo_object = dmu_object_alloc(spa->spa_meta_objset, + smo->smo_object = dmu_object_alloc(mos, DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); ASSERT(smo->smo_object != 0); vdev_config_dirty(vd->vdev_top); } - VERIFY(0 == dmu_free_range(spa->spa_meta_objset, smo->smo_object, - 0, smo->smo_objsize, tx)); - mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, @@ -1141,21 +1096,18 @@ mutex_enter(&smlock); mutex_enter(&vd->vdev_dtl_lock); - for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) - space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start); + space_map_walk(sm, space_map_add, &smsync); mutex_exit(&vd->vdev_dtl_lock); - smo->smo_objsize = 0; - smo->smo_alloc = smsync.sm_space; + space_map_truncate(smo, mos, tx); + space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); - space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx); space_map_destroy(&smsync); mutex_exit(&smlock); mutex_destroy(&smlock); - VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object, - FTAG, &db)); + VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, ==, sizeof (*smo)); bcopy(smo, db->db_data, 
db->db_size); @@ -1297,45 +1249,30 @@ } void -vdev_add_sync(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - ASSERT(vd == vd->vdev_top); - - if (vd->vdev_ms_array == 0) - vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); - - ASSERT(vd->vdev_ms_array != 0); - - vdev_config_dirty(vd); - - dmu_tx_commit(tx); -} - -void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; - uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK]; - uint8_t dirty = *dirtyp; - - mutex_enter(&vd->vdev_dirty_lock); - *dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL); - mutex_exit(&vd->vdev_dirty_lock); + dmu_tx_t *tx; dprintf("%s txg %llu pass %d\n", vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); - if (dirty & VDD_ADD) - vdev_add_sync(vd, txg); + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { + ASSERT(vd == vd->vdev_top); + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); + ASSERT(vd->vdev_ms_array != 0); + vdev_config_dirty(vd); + dmu_tx_commit(tx); + } - while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); + (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); + } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); @@ -1425,36 +1362,37 @@ dprintf("OFFLINE: %s\n", vdev_description(vd)); - /* vdev is already offlined, do nothing */ - if (vd->vdev_offline) - return (spa_vdev_exit(spa, NULL, txg, 0)); - /* - * If this device's top-level vdev has a non-empty DTL, - * don't allow the device to be offlined. - * - * XXX -- we should make this more precise by allowing the offline - * as long as the remaining devices don't have any DTL holes. 
+ * If the device isn't already offline, try to offline it. */ - if (vd->vdev_top->vdev_dtl_map.sm_space != 0) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (!vd->vdev_offline) { + /* + * If this device's top-level vdev has a non-empty DTL, + * don't allow the device to be offlined. + * + * XXX -- make this more precise by allowing the offline + * as long as the remaining devices don't have any DTL holes. + */ + if (vd->vdev_top->vdev_dtl_map.sm_space != 0) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - /* - * Set this device to offline state and reopen its top-level vdev. - * If this action results in the top-level vdev becoming unusable, - * undo it and fail the request. - */ - vd->vdev_offline = B_TRUE; - vdev_reopen(vd->vdev_top); - if (vdev_is_dead(vd->vdev_top)) { - vd->vdev_offline = B_FALSE; + /* + * Offline this device and reopen its top-level vdev. + * If this action results in the top-level vdev becoming + * unusable, undo it and fail the request. + */ + vd->vdev_offline = B_TRUE; vdev_reopen(vd->vdev_top); - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (vdev_is_dead(vd->vdev_top)) { + vd->vdev_offline = B_FALSE; + vdev_reopen(vd->vdev_top); + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + } } vd->vdev_tmpoffline = istmp; - if (!istmp) - vdev_config_dirty(vd->vdev_top); + + vdev_config_dirty(vd->vdev_top); return (spa_vdev_exit(spa, NULL, txg, 0)); } @@ -1613,11 +1551,9 @@ vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); } if (!(flags & ZIO_FLAG_IO_REPAIR)) { - vdev_t *tvd = vd->vdev_top; if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) return; - vdev_dirty(tvd, VDD_DTL, txg); - (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); + vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); } @@ -1788,10 +1724,8 @@ } else { ASSERT(vd == vd->vdev_top); - if (!vd->vdev_is_dirty) { + if (!list_link_active(&vd->vdev_dirty_node)) 
list_insert_head(&spa->spa_dirty_list, vd); - vd->vdev_is_dirty = B_TRUE; - } } } @@ -1803,10 +1737,8 @@ ASSERT(spa_config_held(spa, RW_WRITER) || dsl_pool_sync_context(spa_get_dsl(spa))); - ASSERT(vd->vdev_is_dirty); - + ASSERT(list_link_active(&vd->vdev_dirty_node)); list_remove(&spa->spa_dirty_list, vd); - vd->vdev_is_dirty = B_FALSE; } /*
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c Sun Apr 02 00:47:06 2006 -0800 @@ -48,6 +48,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) { vdev_disk_t *dvd; + struct dk_minfo dkm; int error; /* @@ -153,27 +154,25 @@ return (EINVAL); } - *ashift = SPA_MINBLOCKSHIFT; - - + /* + * If we own the whole disk, try to enable disk write caching. + * We ignore errors because it's OK if we can't do it. + */ if (vd->vdev_wholedisk == 1) { - - int wce, rc; + int wce = 1; + (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, + FKIOCTL, kcred, NULL); + } - /* - * Enable disk write caching if we own the whole disk. - * Ignore errors as this is a performance optimization, - * we work just fine w/o it. - */ - error = 0; - wce = 1; - rc = ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, - FKIOCTL, kcred, &error); + /* + * Determine the device's minimum transfer size. + * If the ioctl isn't supported, assume DEV_BSIZE. + */ + if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm, + FKIOCTL, kcred, NULL) != 0) + dkm.dki_lbsize = DEV_BSIZE; - if (rc || error) - dprintf("%s: DKIOCSETWCE failed %d,%d", - vdev_description(vd), rc, error); - } + *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1; return (0); }
--- a/usr/src/uts/common/fs/zfs/vdev_label.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_label.c Sun Apr 02 00:47:06 2006 -0800 @@ -152,6 +152,8 @@ uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset) { + ASSERT(offset < sizeof (vdev_label_t)); + return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); } @@ -253,14 +255,12 @@ kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); } else { - if (!vd->vdev_tmpoffline) { - if (vd->vdev_offline) + if (vd->vdev_offline && !vd->vdev_tmpoffline) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, - B_TRUE) == 0); - else + B_TRUE) == 0); + else (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE, - DATA_TYPE_UINT64); - } + DATA_TYPE_UINT64); } return (nv); @@ -314,7 +314,7 @@ nvlist_t *label; vdev_phys_t *vp; vdev_boot_header_t *vb; - uberblock_phys_t *ubphys; + uberblock_t *ub; zio_t *zio; int l, c, n; char *buf; @@ -411,10 +411,10 @@ /* * Initialize uberblock template. */ - ubphys = zio_buf_alloc(sizeof (uberblock_phys_t)); - bzero(ubphys, sizeof (uberblock_phys_t)); - ubphys->ubp_uberblock = spa->spa_uberblock; - ubphys->ubp_uberblock.ub_txg = 0; + ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); + bzero(ub, VDEV_UBERBLOCK_SIZE(vd)); + *ub = spa->spa_uberblock; + ub->ub_txg = 0; /* * Write everything in parallel. 
@@ -432,19 +432,17 @@ offsetof(vdev_label_t, vl_boot_header), sizeof (vdev_boot_header_t), NULL, NULL); - for (n = 0; n < VDEV_UBERBLOCKS; n++) { - - vdev_label_write(zio, vd, l, ubphys, - offsetof(vdev_label_t, vl_uberblock[n]), - sizeof (uberblock_phys_t), NULL, NULL); - + for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + vdev_label_write(zio, vd, l, ub, + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), NULL, NULL); } } error = zio_wait(zio); nvlist_free(label); - zio_buf_free(ubphys, sizeof (uberblock_phys_t)); + zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); zio_buf_free(vb, sizeof (vdev_boot_header_t)); zio_buf_free(vp, sizeof (vdev_phys_t)); @@ -486,12 +484,11 @@ static void vdev_uberblock_load_done(zio_t *zio) { - uberblock_phys_t *ubphys = zio->io_data; - uberblock_t *ub = &ubphys->ubp_uberblock; + uberblock_t *ub = zio->io_data; uberblock_t *ubbest = zio->io_private; spa_t *spa = zio->io_spa; - ASSERT3U(zio->io_size, ==, sizeof (uberblock_phys_t)); + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&spa->spa_uberblock_lock); @@ -518,11 +515,11 @@ return; for (l = 0; l < VDEV_LABELS; l++) { - for (n = 0; n < VDEV_UBERBLOCKS; n++) { + for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, - zio_buf_alloc(sizeof (uberblock_phys_t)), - offsetof(vdev_label_t, vl_uberblock[n]), - sizeof (uberblock_phys_t), + zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, ubbest); } } @@ -542,13 +539,12 @@ } static void -vdev_uberblock_sync(zio_t *zio, uberblock_phys_t *ubphys, vdev_t *vd, - uint64_t txg) +vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg) { int l, c, n; for (c = 0; c < vd->vdev_children; c++) - vdev_uberblock_sync(zio, ubphys, vd->vdev_child[c], txg); + vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg); if (!vd->vdev_ops->vdev_op_leaf) return; @@ -556,36 
+552,38 @@ if (vdev_is_dead(vd)) return; - n = txg & (VDEV_UBERBLOCKS - 1); + n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - ASSERT(ubphys->ubp_uberblock.ub_txg == txg); + ASSERT(ub->ub_txg == txg); for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ubphys, - offsetof(vdev_label_t, vl_uberblock[n]), - sizeof (uberblock_phys_t), vdev_uberblock_sync_done, NULL); + vdev_label_write(zio, vd, l, ub, + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_sync_done, NULL); dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg); } static int -vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *uvd, uint64_t txg) +vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg) { - uberblock_phys_t *ubphys; + uberblock_t *ubbuf; + size_t size = vd->vdev_top ? VDEV_UBERBLOCK_SIZE(vd) : SPA_MAXBLOCKSIZE; uint64_t *good_writes; zio_t *zio; int error; - ubphys = zio_buf_alloc(sizeof (uberblock_phys_t)); - bzero(ubphys, sizeof (uberblock_phys_t)); - ubphys->ubp_uberblock = *ub; + ubbuf = zio_buf_alloc(size); + bzero(ubbuf, size); + *ubbuf = *ub; good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); zio = zio_root(spa, NULL, good_writes, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - vdev_uberblock_sync(zio, ubphys, uvd, txg); + vdev_uberblock_sync(zio, ubbuf, vd, txg); error = zio_wait(zio); @@ -602,7 +600,7 @@ error = EIO; kmem_free(good_writes, sizeof (uint64_t)); - zio_buf_free(ubphys, sizeof (uberblock_phys_t)); + zio_buf_free(ubbuf, size); return (error); }
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c Sun Apr 02 00:47:06 2006 -0800 @@ -80,7 +80,7 @@ } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *ashift = cvd->vdev_ashift; + *ashift = MAX(*ashift, cvd->vdev_ashift); } if (numerrors == vd->vdev_children) { @@ -129,6 +129,13 @@ mm->mm_skipped = 0; } +static void +vdev_mirror_repair_done(zio_t *zio) +{ + ASSERT(zio->io_private == zio->io_parent); + vdev_mirror_map_free(zio->io_private); +} + /* * Try to find a child whose DTL doesn't contain the block we want to read. * If we can't, try the read on any vdev we haven't already tried. @@ -341,9 +348,18 @@ if (good_copies && (spa_mode & FWRITE) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + zio_t *rio; + /* * Use the good data we have in hand to repair damaged children. + * + * We issue all repair I/Os as children of 'rio' to arrange + * that vdev_mirror_map_free(zio) will be invoked after all + * repairs complete, but before we advance to the next stage. */ + rio = zio_null(zio, zio->io_spa, + vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL); + for (c = 0; c < vd->vdev_children; c++) { /* * Don't rewrite known good children. @@ -368,12 +384,16 @@ vdev_description(cvd), zio->io_offset, mm[c].mm_error); - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd, + zio_nowait(zio_vdev_child_io(rio, zio->io_bp, cvd, zio->io_offset, zio->io_data, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority, ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); } + + zio_nowait(rio); + zio_wait_children_done(zio); + return; } vdev_mirror_map_free(zio);
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c Sun Apr 02 00:47:06 2006 -0800 @@ -206,7 +206,7 @@ } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *ashift = cvd->vdev_ashift; + *ashift = MAX(*ashift, cvd->vdev_ashift); } *asize *= vd->vdev_children; @@ -232,11 +232,12 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) { uint64_t asize; + uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t cols = vd->vdev_children; - asize = psize >> vd->vdev_ashift; + asize = ((psize - 1) >> ashift) + 1; asize += (asize + cols - 2) / (cols - 1); - asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift; + asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << ashift; return (asize); } @@ -254,28 +255,28 @@ static void vdev_raidz_repair_done(zio_t *zio) { - zio_buf_free(zio->io_data, zio->io_size); + ASSERT(zio->io_private == zio->io_parent); + vdev_raidz_map_free(zio->io_private); } static void vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; vdev_t *cvd; blkptr_t *bp = zio->io_bp; raidz_map_t *rm; raidz_col_t *rc; int c; - rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children); + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children); if (DVA_GET_GANG(ZIO_GET_DVA(zio))) { ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); - ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); } else { ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio))); - ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); } if (zio->io_type == ZIO_TYPE_WRITE) { @@ -549,34 +550,40 @@ if (zio->io_error == 0 && (spa_mode & FWRITE) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + zio_t *rio; + /* * Use the good data we have in hand to repair damaged children. + * + * We issue all repair I/Os as children of 'rio' to arrange + * that vdev_raidz_map_free(zio) will be invoked after all + * repairs complete, but before we advance to the next stage. 
*/ + rio = zio_null(zio, zio->io_spa, + vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); + for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_col]; - if (rc->rc_error) { - /* - * Make a copy of the data because we're - * going to free the RAID-Z map below. - */ - void *data = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, data, rc->rc_size); + if (rc->rc_error == 0) + continue; + + dprintf("%s resilvered %s @ 0x%llx error %d\n", + vdev_description(vd), + vdev_description(cvd), + zio->io_offset, rc->rc_error); - dprintf("%s resilvered %s @ 0x%llx error %d\n", - vdev_description(vd), - vdev_description(cvd), - zio->io_offset, rc->rc_error); + zio_nowait(zio_vdev_child_io(rio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + ZIO_TYPE_WRITE, zio->io_priority, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); + } - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, data, rc->rc_size, - ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE, - vdev_raidz_repair_done, NULL)); - } - } + zio_nowait(rio); + zio_wait_children_done(zio); + return; } vdev_raidz_map_free(zio);
--- a/usr/src/uts/common/fs/zfs/vdev_root.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_root.c Sun Apr 02 00:47:06 2006 -0800 @@ -54,14 +54,14 @@ lasterror = error; continue; } - - *asize += cvd->vdev_asize; - *ashift = MAX(*ashift, cvd->vdev_ashift); } if (lasterror) vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + *asize = 0; + *ashift = 0; + return (lasterror); }
--- a/usr/src/uts/common/fs/zfs/zio.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/zio.c Sun Apr 02 00:47:06 2006 -0800 @@ -762,10 +762,9 @@ * at the block level. We ignore these errors if the * device is currently unavailable. */ - if (zio->io_error != ECKSUM && zio->io_vd && - !vdev_is_dead(zio->io_vd)) + if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, - zio->io_spa, zio->io_vd, zio, 0, 0); + zio->io_spa, vd, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && @@ -1238,7 +1237,7 @@ ASSERT(!BP_IS_HOLE(bp)); - metaslab_free(zio->io_spa, dva, zio->io_txg); + metaslab_free(zio->io_spa, dva, zio->io_txg, B_FALSE); BP_ZERO(bp); @@ -1288,9 +1287,11 @@ zio_vdev_io_setup(zio_t *zio) { vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + uint64_t align = 1ULL << tvd->vdev_ashift; /* XXPOLICY */ - if (zio->io_retries == 0 && vd == vd->vdev_top) + if (zio->io_retries == 0 && vd == tvd) zio->io_flags |= ZIO_FLAG_FAILFAST; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) { @@ -1298,6 +1299,19 @@ zio->io_offset += VDEV_LABEL_START_SIZE; } + if (P2PHASE(zio->io_size, align) != 0) { + uint64_t asize = P2ROUNDUP(zio->io_size, align); + char *abuf = zio_buf_alloc(asize); + ASSERT(vd == tvd); + if (zio->io_type == ZIO_TYPE_WRITE) { + bcopy(zio->io_data, abuf, zio->io_size); + bzero(abuf + zio->io_size, asize - zio->io_size); + } + zio_push_transform(zio, abuf, asize, asize); + ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); + zio->io_flags |= ZIO_FLAG_SUBBLOCK; + } + zio_next_stage(zio); } @@ -1305,10 +1319,12 @@ zio_vdev_io_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; + uint64_t align = 1ULL << zio->io_vd->vdev_top->vdev_ashift; - ASSERT(P2PHASE(zio->io_offset, 1ULL << zio->io_vd->vdev_ashift) == 0); - ASSERT(P2PHASE(zio->io_size, 1ULL << zio->io_vd->vdev_ashift) == 0); - ASSERT(bp == NULL || ZIO_GET_IOSIZE(zio) == zio->io_size); + 
ASSERT(P2PHASE(zio->io_offset, align) == 0); + ASSERT(P2PHASE(zio->io_size, align) == 0); + ASSERT(bp == NULL || + P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); vdev_io_start(zio); @@ -1350,6 +1366,17 @@ ASSERT(zio->io_vsd == NULL); + if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { + void *abuf; + uint64_t asize; + ASSERT(vd == tvd); + zio_pop_transform(zio, &abuf, &asize, &asize); + if (zio->io_type == ZIO_TYPE_READ) + bcopy(abuf, zio->io_data, zio->io_size); + zio_buf_free(abuf, asize); + zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; + } + if (zio_injection_enabled && !zio->io_error) zio->io_error = zio_handle_fault_injection(zio, EIO); @@ -1660,7 +1687,7 @@ spa_config_enter(spa, RW_READER, FTAG); - metaslab_free(spa, BP_IDENTITY(bp), txg); + metaslab_free(spa, BP_IDENTITY(bp), txg, B_FALSE); spa_config_exit(spa, FTAG); }
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c Sat Apr 01 21:50:51 2006 -0800 +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c Sun Apr 02 00:47:06 2006 -0800 @@ -128,7 +128,7 @@ BP_GET_CHECKSUM(bp); int byteswap = BP_SHOULD_BYTESWAP(bp); void *data = zio->io_data; - uint64_t size = zio->io_size; + uint64_t size = ZIO_GET_IOSIZE(zio); zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum;