Mercurial > illumos > illumos-gate
changeset 7638:5505e89fa6c8
6741237 zfs hang in txg_wait_open() on boot
author | Neil Perrin <Neil.Perrin@Sun.COM> |
---|---|
date | Thu, 18 Sep 2008 17:18:10 -0600 |
parents | ad2546323274 |
children | 059edfcf3fa1 |
files | usr/src/cmd/ztest/ztest.c usr/src/uts/common/fs/zfs/sys/zil.h usr/src/uts/common/fs/zfs/zfs_dir.c usr/src/uts/common/fs/zfs/zfs_vfsops.c usr/src/uts/common/fs/zfs/zil.c usr/src/uts/common/fs/zfs/zvol.c |
diffstat | 6 files changed, 35 insertions(+), 41 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/cmd/ztest/ztest.c Thu Sep 18 16:58:59 2008 -0600 +++ b/usr/src/cmd/ztest/ztest.c Thu Sep 18 17:18:10 2008 -0600 @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * The objective of this program is to provide a DMU/ZAP/SPA stress test * that runs entirely in userland, is easy to use, and easy to extend. @@ -1221,7 +1219,7 @@ if (ztest_random(2) == 0 && dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) { zr.zr_os = os; - zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector); + zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector, NULL); dmu_objset_close(os); } @@ -3247,7 +3245,7 @@ ztest_dmu_check_future_leak(&za[t]); zr.zr_os = za[d].za_os; zil_replay(zr.zr_os, &zr, &zr.zr_assign, - ztest_replay_vector); + ztest_replay_vector, NULL); za[d].za_zilog = zil_open(za[d].za_os, NULL); }
--- a/usr/src/uts/common/fs/zfs/sys/zil.h Thu Sep 18 16:58:59 2008 -0600 +++ b/usr/src/uts/common/fs/zfs/sys/zil.h Thu Sep 18 17:18:10 2008 -0600 @@ -26,8 +26,6 @@ #ifndef _SYS_ZIL_H #define _SYS_ZIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/spa.h> #include <sys/zio.h> @@ -337,6 +335,7 @@ typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(); +typedef void zil_replay_cleaner_t(); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, @@ -352,7 +351,8 @@ extern void zil_close(zilog_t *zilog); extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE]); + zil_replay_func_t *replay_func[TX_MAX_TYPE], + zil_replay_cleaner_t *replay_cleaner); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c Thu Sep 18 16:58:59 2008 -0600 +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c Thu Sep 18 17:18:10 2008 -0600 @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -564,6 +562,24 @@ ASSERT(zp->z_phys->zp_links == 0); /* + * If this is a ZIL replay then leave the object in the unlinked set. + * Otherwise we can get a deadlock, because the delete can be + * quite large and span multiple tx's and txgs, but each replay + * creates a tx to atomically run the replay function and mark the + * replay record as complete. We deadlock trying to start a tx in + * a new txg to further the deletion but can't because the replay + * tx hasn't finished. + * + * We actually delete the object if we get a failure to create an + * object in zil_replay_log_record(), or after calling zil_replay(). + */ + if (zfsvfs->z_assign >= TXG_INITIAL) { + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + + /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c Thu Sep 18 16:58:59 2008 -0600 +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c Thu Sep 18 17:18:10 2008 -0600 @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/systm.h> @@ -559,7 +557,6 @@ static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { - uint_t readonly; int error; error = zfs_register_callbacks(zfsvfs->z_vfs); @@ -579,44 +576,22 @@ * operations out since we closed the ZIL. */ if (mounting) { + boolean_t readonly; + /* * During replay we remove the read only flag to * allow replays to succeed. */ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; - if (readonly != 0) - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - else - zfs_unlinked_drain(zfsvfs); + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; /* * Parse and replay the intent log. - * - * Because of ziltest, this must be done after - * zfs_unlinked_drain(). (Further note: ziltest doesn't - * use readonly mounts, where zfs_unlinked_drain() isn't - * called.) This is because ziltest causes spa_sync() - * to think it's committed, but actually it is not, so - * the intent log contains many txg's worth of changes. - * - * In particular, if object N is in the unlinked set in - * the last txg to actually sync, then it could be - * actually freed in a later txg and then reallocated in - * a yet later txg. This would write a "create object - * N" record to the intent log. Normally, this would be - * fine because the spa_sync() would have written out - * the fact that object N is free, before we could write - * the "create object N" intent log record. - * - * But when we are in ziltest mode, we advance the "open - * txg" without actually spa_sync()-ing the changes to - * disk. So we would see that object N is still - * allocated and in the unlinked set, and there is an - * intent log record saying to allocate it. */ zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, - zfs_replay_vector); + zfs_replay_vector, zfs_unlinked_drain); + zfs_unlinked_drain(zfsvfs); zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ }
--- a/usr/src/uts/common/fs/zfs/zil.c Thu Sep 18 16:58:59 2008 -0600 +++ b/usr/src/uts/common/fs/zfs/zil.c Thu Sep 18 17:18:10 2008 -0600 @@ -1453,6 +1453,7 @@ typedef struct zil_replay_arg { objset_t *zr_os; zil_replay_func_t **zr_replay; + zil_replay_cleaner_t *zr_replay_cleaner; void *zr_arg; uint64_t *zr_txgp; boolean_t zr_byteswap; @@ -1583,6 +1584,8 @@ * transaction. */ if (error != ERESTART && !sunk) { + if (zr->zr_replay_cleaner) + zr->zr_replay_cleaner(zr->zr_arg); txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); sunk = B_TRUE; continue; /* retry */ @@ -1621,7 +1624,8 @@ */ void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE]) + zil_replay_func_t *replay_func[TX_MAX_TYPE], + zil_replay_cleaner_t *replay_cleaner) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; @@ -1634,6 +1638,7 @@ zr.zr_os = os; zr.zr_replay = replay_func; + zr.zr_replay_cleaner = replay_cleaner; zr.zr_arg = arg; zr.zr_txgp = txgp; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
--- a/usr/src/uts/common/fs/zfs/zvol.c Thu Sep 18 16:58:59 2008 -0600 +++ b/usr/src/uts/common/fs/zfs/zvol.c Thu Sep 18 17:18:10 2008 -0600 @@ -713,7 +713,7 @@ ASSERT(error == 0); zv->zv_volblocksize = doi.doi_data_block_size; - zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector); + zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL); zvol_size_changed(zv, maj); /* XXX this should handle the possible i/o error */