Mercurial > illumos > illumos-gate
changeset 1807:35c8b566d7af
6410711 intent log blocks don't get invited to pool parties
line wrap: on
line diff
--- a/usr/src/cmd/zdb/zdb.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/cmd/zdb/zdb.c Thu Apr 13 16:15:06 2006 -0700 @@ -1468,8 +1468,6 @@ ASSERT(!BP_IS_HOLE(bp)); - zdb_count_block(spa, zcb, bp, type); - if (dump_opt['b'] >= 4) { sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); (void) printf("objset %llu object %llu offset 0x%llx %s\n", @@ -1480,6 +1478,8 @@ blkbuf); } + zdb_count_block(spa, zcb, bp, type); + return (0); }
--- a/usr/src/cmd/zdb/zdb_il.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/cmd/zdb/zdb_il.c Thu Apr 13 16:15:06 2006 -0700 @@ -43,7 +43,7 @@ extern uint8_t dump_opt[256]; static void -print_log_bp(blkptr_t *bp, const char *prefix) +print_log_bp(const blkptr_t *bp, const char *prefix) { char blkbuf[BP_SPRINTF_LEN]; @@ -130,13 +130,13 @@ } else { zbookmark_t zb; - ASSERT3U(bp->blk_cksum.zc_word[2], ==, + ASSERT3U(bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ==, dmu_objset_id(zilog->zl_os)); - zb.zb_objset = bp->blk_cksum.zc_word[2]; + zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET]; zb.zb_object = 0; zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[3]; + zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, BP_GET_LSIZE(bp), NULL, NULL, @@ -300,7 +300,7 @@ claim = "won't claim"; (void) printf("\tBlock seqno %llu, %s%s\n", - (u_longlong_t)bp->blk_cksum.zc_word[3], claim, blkbuf); + (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf); } static void @@ -329,7 +329,7 @@ void dump_intent_log(zilog_t *zilog) { - zil_header_t *zh = zilog->zl_header; + const zil_header_t *zh = zilog->zl_header; int verbose = MAX(dump_opt['d'], dump_opt['i']); int i; @@ -347,7 +347,7 @@ if (verbose >= 2) { (void) printf("\n"); - zil_parse(zilog, print_log_block, print_log_record, NULL, + (void) zil_parse(zilog, print_log_block, print_log_record, NULL, zh->zh_claim_txg); print_log_stats(verbose); }
--- a/usr/src/cmd/ztest/ztest.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/cmd/ztest/ztest.c Thu Apr 13 16:15:06 2006 -0700 @@ -1216,7 +1216,7 @@ /* * Put a random number of objects in there. */ - objects = ztest_random(50); + objects = ztest_random(20); seq = 0; while (objects-- != 0) { uint64_t object; @@ -1237,7 +1237,7 @@ if (ztest_random(5) == 0) { zil_commit(zilog, seq, FSYNC); } - if (ztest_random(5) == 0) { + if (ztest_random(100) == 0) { error = zil_suspend(zilog); if (error == 0) { zil_resume(zilog); @@ -2670,13 +2670,14 @@ ztest_obliterate_one_disk(uint64_t vdev) { int fd; - char dev_name[MAXPATHLEN]; + char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN]; size_t fsize; if (zopt_maxfaults < 2) return; (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); + (void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name); fd = open(dev_name, O_RDWR); @@ -2687,12 +2688,13 @@ * Determine the size. */ fsize = lseek(fd, 0, SEEK_END); + (void) close(fd); /* - * Remove it. + * Rename the old device to dev_name.old (useful for debugging). */ - VERIFY(remove(dev_name) == 0); + VERIFY(rename(dev_name, copy_name) == 0); /* * Create a new one.
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Thu Apr 13 16:15:06 2006 -0700 @@ -541,7 +541,7 @@ */ error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os); if (error == 0) { - zil_destroy(dmu_objset_zil(os)); + zil_destroy(dmu_objset_zil(os), B_FALSE); dmu_objset_close(os); }
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c Thu Apr 13 16:15:06 2006 -0700 @@ -484,7 +484,7 @@ if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) { zb->zb_object = 0; - zb->zb_blkid = bp->blk_cksum.zc_word[3]; + zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; bc->bc_blkptr = *bp; (void) traverse_callback(th, zseg, bc); } @@ -539,7 +539,7 @@ zilog = zil_alloc(dp->dp_meta_objset, zh); - zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, + (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, claim_txg); zil_free(zilog);
--- a/usr/src/uts/common/fs/zfs/metaslab.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/metaslab.c Thu Apr 13 16:15:06 2006 -0700 @@ -593,52 +593,6 @@ mutex_exit(&msp->ms_lock); } -/* - * Intent log support: upon opening the pool after a crash, notify the SPA - * of blocks that the intent log has allocated for immediate write, but - * which are still considered free by the SPA because the last transaction - * group didn't commit yet. - */ -int -metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg) -{ - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd; - metaslab_t *msp; - int error; - - if ((vd = vdev_lookup_top(spa, vdev)) == NULL) - return (ENXIO); - - if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) - return (ENXIO); - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - if (DVA_GET_GANG(dva)) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - mutex_enter(&msp->ms_lock); - - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error) { - mutex_exit(&msp->ms_lock); - return (error); - } - - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - - space_map_claim(&msp->ms_map, offset, size); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); - - mutex_exit(&msp->ms_lock); - - return (0); -} - static uint64_t metaslab_distance(metaslab_t *msp, dva_t *dva) { @@ -735,7 +689,7 @@ * Allocate a block for the specified i/o. */ static int -metaslab_alloc_one(spa_t *spa, uint64_t psize, dva_t *dva, int d, +metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg) { metaslab_group_t *mg, *rotor; @@ -747,6 +701,8 @@ uint64_t asize; uint64_t distance; + ASSERT(!DVA_IS_VALID(&dva[d])); + mc = spa_metaslab_class_select(spa); /* @@ -854,41 +810,12 @@ return (ENOSPC); } -int -metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ncopies, - uint64_t txg, blkptr_t *hintbp) -{ - int d, error; - dva_t *dva = bp->blk_dva; - dva_t *hintdva = hintbp->blk_dva; - - ASSERT(ncopies > 0 && ncopies <= spa_max_replication(spa)); - ASSERT(BP_GET_NDVAS(bp) == 0); - ASSERT(hintbp == NULL || ncopies <= BP_GET_NDVAS(hintbp)); - - for (d = 0; d < ncopies; d++) { - error = metaslab_alloc_one(spa, psize, dva, d, hintdva, txg); - if (error) { - for (d--; d >= 0; d--) { - ASSERT(DVA_IS_VALID(&dva[d])); - metaslab_free(spa, &dva[d], txg, B_TRUE); - bzero(&dva[d], sizeof (dva_t)); - } - return (ENOSPC); - } - } - ASSERT(error == 0); - ASSERT(BP_GET_NDVAS(bp) == ncopies); - - return (0); -} - /* * Free the block represented by DVA in the context of the specified * transaction group. */ -void -metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now) +static void +metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); @@ -896,19 +823,15 @@ vdev_t *vd; metaslab_t *msp; + ASSERT(DVA_IS_VALID(dva)); + if (txg > spa_freeze_txg(spa)) return; - if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { - cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu", - (u_longlong_t)vdev); - ASSERT(0); - return; - } - - if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { - cmn_err(CE_WARN, "metaslab_free(): bad offset %llu", - (u_longlong_t)offset); + if ((vd = vdev_lookup_top(spa, vdev)) == NULL || + (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { + cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", + (u_longlong_t)vdev, (u_longlong_t)offset); ASSERT(0); return; } @@ -932,3 +855,108 @@ mutex_exit(&msp->ms_lock); } + +/* + * Intent log support: upon opening the pool after a crash, notify the SPA + * of blocks that the intent log has allocated for immediate write, but + * which are still considered free by the SPA because the last transaction + * group didn't commit yet. + */ +static int +metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd; + metaslab_t *msp; + int error; + + ASSERT(DVA_IS_VALID(dva)); + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL || + (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) + return (ENXIO); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + if (DVA_GET_GANG(dva)) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + mutex_enter(&msp->ms_lock); + + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + if (error) { + mutex_exit(&msp->ms_lock); + return (error); + } + + if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + + space_map_claim(&msp->ms_map, offset, size); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + + mutex_exit(&msp->ms_lock); + + return (0); +} + +int +metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas, + uint64_t txg, blkptr_t *hintbp) +{ + dva_t *dva = bp->blk_dva; + dva_t *hintdva = hintbp->blk_dva; + int d; + int error = 0; + + ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); + ASSERT(BP_GET_NDVAS(bp) == 0); + ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); + + for (d = 0; d < ndvas; d++) { + error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, txg); + if (error) { + for (d--; d >= 0; d--) { + metaslab_free_dva(spa, &dva[d], txg, B_TRUE); + bzero(&dva[d], sizeof (dva_t)); + } + return (error); + } + } + ASSERT(error == 0); + ASSERT(BP_GET_NDVAS(bp) == ndvas); + + return (0); +} + +void +metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + int d; + + ASSERT(!BP_IS_HOLE(bp)); + + for (d = 0; d < ndvas; d++) + metaslab_free_dva(spa, &dva[d], txg, now); +} + +int +metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + int d, error; + int last_error = 0; + + ASSERT(!BP_IS_HOLE(bp)); + + for (d = 0; d < ndvas; d++) + if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) + last_error = error; + + return (last_error); +}
--- a/usr/src/uts/common/fs/zfs/spa.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/spa.c Thu Apr 13 16:15:06 2006 -0700 @@ -426,7 +426,7 @@ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, &spa->spa_errlog_last); - if (error != 0 &&error != ENOENT) { + if (error != 0 && error != ENOENT) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); error = EIO; @@ -1530,7 +1530,7 @@ if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ - flags |= ZIO_FLAG_CANFAIL; + flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; zio_nowait(zio_read(NULL, spa, bp, data, size, spa_scrub_io_done, NULL, priority, flags, zb));
--- a/usr/src/uts/common/fs/zfs/spa_misc.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/spa_misc.c Thu Apr 13 16:15:06 2006 -0700 @@ -616,7 +616,7 @@ } void -sprintf_blkptr(char *buf, int len, blkptr_t *bp) +sprintf_blkptr(char *buf, int len, const blkptr_t *bp) { int d; @@ -637,7 +637,7 @@ (u_longlong_t)BP_GET_PSIZE(bp)); for (d = 0; d < BP_GET_NDVAS(bp); d++) { - dva_t *dva = &bp->blk_dva[d]; + const dva_t *dva = &bp->blk_dva[d]; (void) snprintf(buf + strlen(buf), len - strlen(buf), "DVA[%d]=<%llu:%llx:%llx> ", d, (u_longlong_t)DVA_GET_VDEV(dva),
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h Thu Apr 13 16:15:06 2006 -0700 @@ -49,8 +49,9 @@ extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp); -extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now); -extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg); +extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, + boolean_t now); +extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); extern metaslab_class_t *metaslab_class_create(void); extern void metaslab_class_destroy(metaslab_class_t *mc);
--- a/usr/src/uts/common/fs/zfs/sys/spa.h Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/spa.h Thu Apr 13 16:15:06 2006 -0700 @@ -407,7 +407,7 @@ extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); -extern void sprintf_blkptr(char *buf, int len, blkptr_t *bp); +extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern void spa_upgrade(spa_t *spa); extern void spa_evict_all(void);
--- a/usr/src/uts/common/fs/zfs/sys/zil.h Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/zil.h Thu Apr 13 16:15:06 2006 -0700 @@ -57,7 +57,8 @@ uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ uint64_t zh_replay_seq; /* highest replayed sequence number */ blkptr_t zh_log; /* log chain */ - uint64_t zit_pad[6]; + uint64_t zh_claim_seq; /* highest claimed sequence number */ + uint64_t zh_pad[5]; } zil_header_t; /* @@ -80,6 +81,14 @@ #define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t)) /* + * The words of a log block checksum. + */ +#define ZIL_ZC_GUID_0 0 +#define ZIL_ZC_GUID_1 1 +#define ZIL_ZC_OBJSET 2 +#define ZIL_ZC_SEQ 3 + +/* * Intent log transaction types and record structures */ #define TX_CREATE 1 /* Create file */ @@ -208,7 +217,7 @@ typedef int zil_replay_func_t(); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf); -extern void zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, +extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); extern void zil_init(void); @@ -222,7 +231,7 @@ extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_wait)(void *)); -extern void zil_destroy(zilog_t *zilog); +extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); extern itx_t *zil_itx_create(int txtype, size_t lrsize); extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h Thu Apr 13 16:15:06 2006 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -75,7 +74,7 @@ kmutex_t zl_lock; /* protects most zilog_t fields */ struct dsl_pool *zl_dmu_pool; /* DSL pool */ spa_t *zl_spa; /* handle for read/write log */ - zil_header_t *zl_header; /* log header buffer */ + const zil_header_t *zl_header; /* log header buffer */ objset_t *zl_os; /* object set we're logging */ zil_get_data_t *zl_get_data; /* callback to get object content */ uint64_t zl_itx_seq; /* itx sequence number */ @@ -85,6 +84,9 @@ uint32_t zl_suspend; /* log suspend count */ kcondvar_t zl_cv_write; /* for waiting to write to log */ kcondvar_t zl_cv_seq; /* for committing a sequence */ + kcondvar_t zl_cv_suspend; /* log suspend completion */ + uint8_t zl_suspending; /* log is currently suspending */ + uint8_t zl_keep_first; /* keep first log block in destroy */ uint8_t zl_stop_replay; /* don't replay any further */ uint8_t zl_stop_sync; /* for debugging */ uint8_t zl_writer; /* boolean: write setup in progress */ @@ -97,7 +99,6 @@ list_t zl_vdev_list; /* list of [vdev, seq] pairs */ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ avl_tree_t zl_dva_tree; /* track DVAs during log parse */ - kmutex_t zl_destroy_lock; /* serializes zil_destroy() calls */ }; typedef struct zil_dva_node {
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/zio.h Thu Apr 13 16:15:06 2006 -0700 @@ -125,7 +125,8 @@ #define ZIO_FLAG_RESILVER 0x01000 #define ZIO_FLAG_SCRUB 0x02000 -#define ZIO_FLAG_SUBBLOCK 0x04000 +#define ZIO_FLAG_SCRUB_THREAD 0x04000 +#define ZIO_FLAG_SUBBLOCK 0x08000 #define ZIO_FLAG_NOBOOKMARK 0x10000 @@ -137,7 +138,8 @@ ZIO_FLAG_IO_REPAIR | \ ZIO_FLAG_SPECULATIVE | \ ZIO_FLAG_RESILVER | \ - ZIO_FLAG_SCRUB) + ZIO_FLAG_SCRUB | \ + ZIO_FLAG_SCRUB_THREAD) #define ZIO_FLAG_VDEV_INHERIT \ (ZIO_FLAG_GANG_INHERIT | \ @@ -282,8 +284,7 @@ uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, int priority, int flags); -extern int zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, - blkptr_t *bp, uint64_t txg); +extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg); extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); extern int zio_wait(zio_t *zio);
--- a/usr/src/uts/common/fs/zfs/vdev.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/vdev.c Thu Apr 13 16:15:06 2006 -0700 @@ -1502,7 +1502,7 @@ if ((flags & ZIO_FLAG_IO_REPAIR) && zio->io_delegate_list == NULL) { mutex_enter(&vd->vdev_stat_lock); - if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) + if (flags & ZIO_FLAG_SCRUB_THREAD) vs->vs_scrub_repaired += zio->io_size; else vs->vs_self_healed += zio->io_size; @@ -1530,7 +1530,7 @@ if (type == ZIO_TYPE_WRITE) { if (txg == 0 || vd->vdev_children != 0) return; - if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { + if (flags & ZIO_FLAG_SCRUB_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c Thu Apr 13 16:15:06 2006 -0700 @@ -389,7 +389,9 @@ ASSERT(zio->io_error != 0); if (good_copies && (spa_mode & FWRITE) && - (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + (unexpected_errors || + (zio->io_flags & ZIO_FLAG_RESILVER) || + ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { zio_t *rio; /* @@ -415,7 +417,8 @@ if (mc->mc_error == 0) { if (mc->mc_tried) continue; - if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, + if (!(zio->io_flags & ZIO_FLAG_SCRUB) && + !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, zio->io_txg, 1)) continue; mc->mc_error = ESTALE;
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c Thu Apr 13 16:15:06 2006 -0700 @@ -118,7 +118,7 @@ avl_add(&vq->vq_deadline_tree, zio); avl_add(zio->io_vdev_tree, zio); - if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) && + if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) && ++vq->vq_scrub_count >= vq->vq_scrub_limit) spa_scrub_throttle(zio->io_spa, 1); } @@ -126,7 +126,7 @@ static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) && + if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) && vq->vq_scrub_count-- >= vq->vq_scrub_limit) spa_scrub_throttle(zio->io_spa, -1);
--- a/usr/src/uts/common/fs/zfs/zil.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/zil.c Thu Apr 13 16:15:06 2006 -0700 @@ -127,76 +127,94 @@ return (0); } +static zil_header_t * +zil_header_in_syncing_context(zilog_t *zilog) +{ + return ((zil_header_t *)zilog->zl_header); +} + +static void +zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) +{ + zio_cksum_t *zc = &bp->blk_cksum; + + zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); + zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); + zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); + zc->zc_word[ZIL_ZC_SEQ] = 1ULL; +} + /* * Read a log block, make sure it's valid, and byteswap it if necessary. */ static int -zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf) +zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) { - uint64_t blksz = BP_GET_LSIZE(bp); - zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1; - zio_cksum_t cksum; + blkptr_t blk = *bp; zbookmark_t zb; int error; - zb.zb_objset = bp->blk_cksum.zc_word[2]; + zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET]; zb.zb_object = 0; zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[3]; + zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + + *abufpp = NULL; + + error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array, + arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, ARC_WAIT, &zb); + + if (error == 0) { + char *data = (*abufpp)->b_data; + uint64_t blksz = BP_GET_LSIZE(bp); + zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1; + zio_cksum_t cksum = bp->blk_cksum; - error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz, - NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); - if (error) { - dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ", - zilog, bp, error); - return (error); + /* + * Sequence numbers should be... sequential. The checksum + * verifier for the next block should be bp's checksum plus 1. + */ + cksum.zc_word[ZIL_ZC_SEQ]++; + + if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum))) + error = ESTALE; + else if (BP_IS_HOLE(&ztp->zit_next_blk)) + error = ENOENT; + else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) + error = EOVERFLOW; + + if (error) { + VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1); + *abufpp = NULL; + } } - if (BP_SHOULD_BYTESWAP(bp)) - byteswap_uint64_array(buf, blksz); - - /* - * Sequence numbers should be... sequential. The checksum verifier for - * the next block should be: <logid[0], logid[1], objset id, seq + 1>. - */ - cksum = bp->blk_cksum; - cksum.zc_word[3]++; - if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) { - dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp); - return (ESTALE); - } + dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid); - if (BP_IS_HOLE(&ztp->zit_next_blk)) { - dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp); - return (ENOENT); - } - - if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) { - dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp); - return (EOVERFLOW); - } - - dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp); - - return (0); + return (error); } /* * Parse the intent log, and call parse_func for each valid record within. + * Return the highest sequence number. */ -void +uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) { - blkptr_t blk; + const zil_header_t *zh = zilog->zl_header; + uint64_t claim_seq = zh->zh_claim_seq; + uint64_t seq = 0; + uint64_t max_seq = 0; + blkptr_t blk = zh->zh_log; + arc_buf_t *abuf; char *lrbuf, *lrp; zil_trailer_t *ztp; int reclen, error; - blk = zilog->zl_header->zh_log; if (BP_IS_HOLE(&blk)) - return; + return (max_seq); /* * Starting at the block pointed to by zh_log we read the log chain. @@ -204,11 +222,20 @@ * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). + * If the log has been claimed, stop if we encounter a sequence + * number greater than the highest claimed sequence number. */ zil_dva_tree_init(&zilog->zl_dva_tree); - lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); for (;;) { - error = zil_read_log_block(zilog, &blk, lrbuf); + seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + + if (claim_seq != 0 && seq > claim_seq) + break; + + ASSERT(max_seq < seq); + max_seq = seq; + + error = zil_read_log_block(zilog, &blk, &abuf); if (parse_blk_func != NULL) parse_blk_func(zilog, &blk, arg, txg); @@ -216,11 +243,14 @@ if (error) break; + lrbuf = abuf->b_data; ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; blk = ztp->zit_next_blk; - if (parse_lr_func == NULL) + if (parse_lr_func == NULL) { + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); continue; + } for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) { lr_t *lr = (lr_t *)lrp; @@ -228,9 +258,11 @@ ASSERT3U(reclen, >=, sizeof (lr_t)); parse_lr_func(zilog, lr, arg, txg); } + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); } - zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); zil_dva_tree_fini(&zilog->zl_dva_tree); + + return (max_seq); } /* ARGSUSED */ @@ -240,8 +272,6 @@ spa_t *spa = zilog->zl_spa; int err; - dprintf_bp(bp, "first_txg %llu: ", first_txg); - /* * Claim log block if not already committed and not already claimed. */ @@ -291,44 +321,42 @@ static void zil_create(zilog_t *zilog) { + const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; - uint64_t txg; - dmu_tx_t *tx; + uint64_t txg = 0; + dmu_tx_t *tx = NULL; blkptr_t blk; - int error; - int no_blk; - - ASSERT(zilog->zl_header->zh_claim_txg == 0); - ASSERT(zilog->zl_header->zh_replay_seq == 0); + int error = 0; /* - * Initialize the log header block. + * Wait for any previous destroy to complete. */ - tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + + ASSERT(zh->zh_claim_txg == 0); + ASSERT(zh->zh_replay_seq == 0); + + blk = zh->zh_log; /* - * If we don't have a log block already then - * allocate the first log block and assign its checksum verifier. + * If we don't already have an initial log block, allocate one now. */ - no_blk = BP_IS_HOLE(&zilog->zl_header->zh_log); - if (no_blk) { - error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG, - ZIL_MIN_BLKSZ, &blk, txg); - } else { - blk = zilog->zl_header->zh_log; - error = 0; + if (BP_IS_HOLE(&blk)) { + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, txg); + + if (error == 0) + zil_init_log_chain(zilog, &blk); } + + /* + * Allocate a log write buffer (lwb) for the first log block. + */ if (error == 0) { - ZIO_SET_CHECKSUM(&blk.blk_cksum, - spa_get_random(-1ULL), spa_get_random(-1ULL), - dmu_objset_id(zilog->zl_os), 1ULL); - - /* - * Allocate a log write buffer (lwb) for the first log block. - */ lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; lwb->lwb_blk = blk; @@ -343,43 +371,81 @@ mutex_exit(&zilog->zl_lock); } - dmu_tx_commit(tx); - if (no_blk) + /* + * If we just allocated the first log block, commit our transaction + * and wait for zil_sync() to stuff the block poiner into zh_log. + * (zh is part of the MOS, so we cannot modify it in open context.) + */ + if (tx != NULL) { + dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); + } + + ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); } /* * In one tx, free all log blocks and clear the log header. + * If keep_first is set, then we're replaying a log with no content. + * We want to keep the first block, however, so that the first + * synchronous transaction doesn't require a txg_wait_synced() + * in zil_create(). We don't need to txg_wait_synced() here either + * when keep_first is set, because both zil_create() and zil_destroy() + * will wait for any in-progress destroys to complete. */ void -zil_destroy(zilog_t *zilog) +zil_destroy(zilog_t *zilog, boolean_t keep_first) { + const zil_header_t *zh = zilog->zl_header; + lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; - mutex_enter(&zilog->zl_destroy_lock); + /* + * Wait for any previous destroy to complete. + */ + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - if (BP_IS_HOLE(&zilog->zl_header->zh_log)) { - mutex_exit(&zilog->zl_destroy_lock); + if (BP_IS_HOLE(&zh->zh_log)) return; - } tx = dmu_tx_create(zilog->zl_os); (void) dmu_tx_assign(tx, TXG_WAIT); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); - zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, - zilog->zl_header->zh_claim_txg); - /* - * zil_sync clears the zil header as soon as the zl_destroy_txg commits - */ + mutex_enter(&zilog->zl_lock); + + ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; + zilog->zl_keep_first = keep_first; + + if (!list_is_empty(&zilog->zl_lwb_list)) { + ASSERT(zh->zh_claim_txg == 0); + ASSERT(!keep_first); + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + list_remove(&zilog->zl_lwb_list, lwb); + if (lwb->lwb_buf != NULL) + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg); + kmem_cache_free(zil_lwb_cache, lwb); + } + mutex_exit(&zilog->zl_lock); + } else { + mutex_exit(&zilog->zl_lock); + if (!keep_first) { + (void) zil_parse(zilog, zil_free_log_block, + zil_free_log_record, tx, zh->zh_claim_txg); + } + } dmu_tx_commit(tx); - txg_wait_synced(zilog->zl_dmu_pool, txg); - mutex_exit(&zilog->zl_destroy_lock); + if (keep_first) /* no need to wait in this case */ + return; + + txg_wait_synced(zilog->zl_dmu_pool, txg); + ASSERT(BP_IS_HOLE(&zh->zh_log)); } void @@ -399,18 +465,23 @@ } zilog = dmu_objset_zil(os); - zh = zilog->zl_header; + zh = zil_header_in_syncing_context(zilog); /* - * Claim all log blocks if we haven't already done so. + * Claim all log blocks if we haven't already done so, and remember + * the highest claimed sequence number. This ensures that if we can + * read only part of the log now (e.g. due to a missing device), + * but we can read the entire log later, we will not try to replay + * or destroy beyond the last block we successfully claimed. */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { zh->zh_claim_txg = first_txg; - zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, - tx, first_txg); + zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block, + zil_claim_log_record, tx, first_txg); dsl_dataset_dirty(dmu_objset_ds(os), tx); } + ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); dmu_objset_close(os); } @@ -555,6 +626,8 @@ { lwb_t *nlwb; zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1; + spa_t *spa = zilog->zl_spa; + blkptr_t *bp = &ztp->zit_next_blk; uint64_t txg; uint64_t zil_blksz; zbookmark_t zb; @@ -583,8 +656,7 @@ if (zil_blksz > ZIL_MAX_BLKSZ) zil_blksz = ZIL_MAX_BLKSZ; - error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG, - zil_blksz, &ztp->zit_next_blk, txg); + error = zio_alloc_blk(spa, zil_blksz, bp, txg); if (error) { /* * Reinitialise the lwb. @@ -599,12 +671,12 @@ return (NULL); } - ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg); + ASSERT3U(bp->blk_birth, ==, txg); ztp->zit_pad = 0; ztp->zit_nused = lwb->lwb_nused; ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; - ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum; - ztp->zit_next_blk.blk_cksum.zc_word[3]++; + bp->blk_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; /* * Allocate a new log write buffer (lwb). @@ -612,7 +684,7 @@ nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); nlwb->lwb_zilog = zilog; - nlwb->lwb_blk = ztp->zit_next_blk; + nlwb->lwb_blk = *bp; nlwb->lwb_nused = 0; nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk); nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz); @@ -633,14 +705,12 @@ /* * write the old log block */ - dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg); - - zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[2]; + zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET]; zb.zb_object = 0; zb.zb_level = -1; - zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[3]; + zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; - zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0, + zio_nowait(zio_rewrite(NULL, spa, ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb)); @@ -949,21 +1019,40 @@ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { + zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; lwb_t *lwb; + mutex_enter(&zilog->zl_lock); + ASSERT(zilog->zl_stop_sync == 0); - zilog->zl_header->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; + zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; if (zilog->zl_destroy_txg == txg) { - bzero(zilog->zl_header, sizeof (zil_header_t)); + blkptr_t blk = zh->zh_log; + + ASSERT(list_head(&zilog->zl_lwb_list) == NULL); + ASSERT(spa_sync_pass(spa) == 1); + + bzero(zh, sizeof (zil_header_t)); bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); - zilog->zl_destroy_txg = 0; + + if (zilog->zl_keep_first) { + /* + * If this block was part of log chain that couldn't + * be claimed because a device was missing during + * zil_claim(), but that device later returns, + * then this block could erroneously appear valid. + * To guard against this, assign a new GUID to the new + * log chain so it doesn't matter what blk points to. + */ + zil_init_log_chain(zilog, &blk); + zh->zh_log = blk; + } } - mutex_enter(&zilog->zl_lock); for (;;) { lwb = list_head(&zilog->zl_lwb_list); if (lwb == NULL) { @@ -976,7 +1065,7 @@ zio_free_blk(spa, &lwb->lwb_blk, txg); kmem_cache_free(zil_lwb_cache, lwb); } - zilog->zl_header->zh_log = lwb->lwb_blk; + zh->zh_log = lwb->lwb_blk; mutex_exit(&zilog->zl_lock); } @@ -1004,6 +1093,7 @@ zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); + zilog->zl_destroy_txg = TXG_INITIAL - 1; list_create(&zilog->zl_itx_list, sizeof (itx_t), offsetof(itx_t, itx_node)); @@ -1051,18 +1141,17 @@ static int zil_empty(zilog_t *zilog) { - blkptr_t blk; - char *lrbuf; - int error; + const zil_header_t *zh = zilog->zl_header; + arc_buf_t *abuf = NULL; - blk = zilog->zl_header->zh_log; - if (BP_IS_HOLE(&blk)) + if (BP_IS_HOLE(&zh->zh_log)) return (1); - lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); - error = zil_read_log_block(zilog, &blk, lrbuf); - zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); - return (error ? 1 : 0); + if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) + return (1); + + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + return (0); } /* @@ -1086,8 +1175,20 @@ void zil_close(zilog_t *zilog) { - if (!zil_is_committed(zilog)) - txg_wait_synced(zilog->zl_dmu_pool, 0); + /* + * If the log isn't already committed, mark the objset dirty + * (so zil_sync() will be called) and wait for that txg to sync. + */ + if (!zil_is_committed(zilog)) { + uint64_t txg; + dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + } + taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; zilog->zl_get_data = NULL; @@ -1105,38 +1206,55 @@ int zil_suspend(zilog_t *zilog) { + const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; mutex_enter(&zilog->zl_lock); - if (zilog->zl_header->zh_claim_txg != 0) { /* unplayed log */ + if (zh->zh_claim_txg != 0) { /* unplayed log */ mutex_exit(&zilog->zl_lock); return (EBUSY); } - zilog->zl_suspend++; + if (zilog->zl_suspend++ != 0) { + /* + * Someone else already began a suspend. + * Just wait for them to finish. + */ + while (zilog->zl_suspending) + cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); + ASSERT(BP_IS_HOLE(&zh->zh_log)); + mutex_exit(&zilog->zl_lock); + return (0); + } + zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); zil_commit(zilog, UINT64_MAX, FSYNC); mutex_enter(&zilog->zl_lock); - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { - if (lwb->lwb_buf != NULL) { - /* - * Wait for the buffer if it's in the process of - * being written. - */ - if ((lwb->lwb_seq != 0) && - (lwb->lwb_state != SEQ_COMPLETE)) { - cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock); - continue; - } - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - } - list_remove(&zilog->zl_lwb_list, lwb); - kmem_cache_free(zil_lwb_cache, lwb); + for (;;) { + /* + * Wait for any in-flight log writes to complete. + */ + for (lwb = list_head(&zilog->zl_lwb_list); lwb != NULL; + lwb = list_next(&zilog->zl_lwb_list, lwb)) + if (lwb->lwb_seq != 0 && lwb->lwb_state != SEQ_COMPLETE) + break; + + if (lwb == NULL) + break; + + cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock); } + mutex_exit(&zilog->zl_lock); - zil_destroy(zilog); + zil_destroy(zilog, B_FALSE); + + mutex_enter(&zilog->zl_lock); + ASSERT(BP_IS_HOLE(&zh->zh_log)); + zilog->zl_suspending = B_FALSE; + cv_broadcast(&zilog->zl_cv_suspend); + mutex_exit(&zilog->zl_lock); return (0); } @@ -1164,7 +1282,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; - zil_header_t *zh = zilog->zl_header; + const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; int pass, error; @@ -1310,15 +1428,11 @@ zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg)) { zilog_t *zilog = dmu_objset_zil(os); - zil_replay_arg_t zr; + const zil_header_t *zh = zilog->zl_header; + zil_replay_arg_t zr; if (zil_empty(zilog)) { - /* - * Initialise the log header but don't free the log block - * which will get reused. - */ - zilog->zl_header->zh_claim_txg = 0; - zilog->zl_header->zh_replay_seq = 0; + zil_destroy(zilog, B_TRUE); return; } @@ -1327,7 +1441,7 @@ zr.zr_arg = arg; zr.zr_rm_sync = rm_sync; zr.zr_txgp = txgp; - zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log); + zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* @@ -1338,11 +1452,11 @@ txg_wait_synced(zilog->zl_dmu_pool, 0); zilog->zl_stop_replay = 0; - zil_parse(zilog, NULL, zil_replay_log_record, &zr, - zilog->zl_header->zh_claim_txg); + (void) zil_parse(zilog, NULL, zil_replay_log_record, &zr, + zh->zh_claim_txg); kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE); - zil_destroy(zilog); + zil_destroy(zilog, B_FALSE); } /* @@ -1353,7 +1467,7 @@ { lwb_t *lwb; - if (zilog == NULL || list_head(&zilog->zl_itx_list)) + if (!list_is_empty(&zilog->zl_itx_list)) return (B_FALSE); /*
--- a/usr/src/uts/common/fs/zfs/zio.c Thu Apr 13 15:37:22 2006 -0700 +++ b/usr/src/uts/common/fs/zfs/zio.c Thu Apr 13 16:15:06 2006 -0700 @@ -1263,13 +1263,8 @@ zio_dva_free(zio_t *zio) { blkptr_t *bp = zio->io_bp; - dva_t *dva = bp->blk_dva; - int d; - ASSERT(!BP_IS_HOLE(bp)); - - for (d = 0; d < BP_GET_NDVAS(bp); d++) - metaslab_free(zio->io_spa, &dva[d], zio->io_txg, B_FALSE); + metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); BP_ZERO(bp); @@ -1279,18 +1274,7 @@ static void zio_dva_claim(zio_t *zio) { - blkptr_t *bp = zio->io_bp; - dva_t *dva = bp->blk_dva; - int error = 0; - int d; - - ASSERT(!BP_IS_HOLE(bp)); - - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - error = metaslab_claim(zio->io_spa, &dva[d], zio->io_txg); - if (error) - zio->io_error = error; - } + zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); zio_next_stage(zio); } @@ -1669,8 +1653,7 @@ * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, - uint64_t txg) +zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg) { int error; @@ -1681,10 +1664,10 @@ error = metaslab_alloc(spa, size, bp, 1, txg, NULL); if (error == 0) { - BP_SET_CHECKSUM(bp, checksum); BP_SET_LSIZE(bp, size); BP_SET_PSIZE(bp, size); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); @@ -1705,11 +1688,9 @@ { ASSERT(!BP_IS_GANG(bp)); - dprintf_bp(bp, "txg %llu: ", txg); - spa_config_enter(spa, RW_READER, FTAG); - metaslab_free(spa, BP_IDENTITY(bp), txg, B_FALSE); + metaslab_free(spa, bp, txg, B_FALSE); spa_config_exit(spa, FTAG); }