changeset 7638:5505e89fa6c8

6741237 zfs hang in txg_wait_open() on boot
author Neil Perrin <Neil.Perrin@Sun.COM>
date Thu, 18 Sep 2008 17:18:10 -0600
parents ad2546323274
children 059edfcf3fa1
files usr/src/cmd/ztest/ztest.c usr/src/uts/common/fs/zfs/sys/zil.h usr/src/uts/common/fs/zfs/zfs_dir.c usr/src/uts/common/fs/zfs/zfs_vfsops.c usr/src/uts/common/fs/zfs/zil.c usr/src/uts/common/fs/zfs/zvol.c
diffstat 6 files changed, 35 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/ztest/ztest.c	Thu Sep 18 16:58:59 2008 -0600
+++ b/usr/src/cmd/ztest/ztest.c	Thu Sep 18 17:18:10 2008 -0600
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * The objective of this program is to provide a DMU/ZAP/SPA stress test
  * that runs entirely in userland, is easy to use, and easy to extend.
@@ -1221,7 +1219,7 @@
 	if (ztest_random(2) == 0 &&
 	    dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) {
 		zr.zr_os = os;
-		zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector);
+		zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector, NULL);
 		dmu_objset_close(os);
 	}
 
@@ -3247,7 +3245,7 @@
 				ztest_dmu_check_future_leak(&za[t]);
 			zr.zr_os = za[d].za_os;
 			zil_replay(zr.zr_os, &zr, &zr.zr_assign,
-			    ztest_replay_vector);
+			    ztest_replay_vector, NULL);
 			za[d].za_zilog = zil_open(za[d].za_os, NULL);
 		}
 
--- a/usr/src/uts/common/fs/zfs/sys/zil.h	Thu Sep 18 16:58:59 2008 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h	Thu Sep 18 17:18:10 2008 -0600
@@ -26,8 +26,6 @@
 #ifndef	_SYS_ZIL_H
 #define	_SYS_ZIL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
@@ -337,6 +335,7 @@
 typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
     uint64_t txg);
 typedef int zil_replay_func_t();
+typedef void zil_replay_cleaner_t();
 typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
 
 extern uint64_t	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
@@ -352,7 +351,8 @@
 extern void	zil_close(zilog_t *zilog);
 
 extern void	zil_replay(objset_t *os, void *arg, uint64_t *txgp,
-    zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+    zil_replay_func_t *replay_func[TX_MAX_TYPE],
+    zil_replay_cleaner_t *replay_cleaner);
 extern void	zil_destroy(zilog_t *zilog, boolean_t keep_first);
 extern void	zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
 
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c	Thu Sep 18 16:58:59 2008 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c	Thu Sep 18 17:18:10 2008 -0600
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
@@ -564,6 +562,24 @@
 	ASSERT(zp->z_phys->zp_links == 0);
 
 	/*
+	 * If this is a ZIL replay then leave the object in the unlinked set.
+	 * Otherwise we can get a deadlock, because the delete can be
+	 * quite large and span multiple tx's and txgs, but each replay
+	 * creates a tx to atomically run the replay function and mark the
+	 * replay record as complete. We deadlock trying to start a tx in
+	 * a new txg to further the deletion but can't because the replay
+	 * tx hasn't finished.
+	 *
+	 * zfs_unlinked_drain() actually deletes the object, either when a
+	 * create fails in zil_replay_log_record() or after zil_replay().
+	 */
+	if (zfsvfs->z_assign >= TXG_INITIAL) {
+		zfs_znode_dmu_fini(zp);
+		zfs_znode_free(zp);
+		return;
+	}
+
+	/*
 	 * If this is an attribute directory, purge its contents.
 	 */
 	if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Thu Sep 18 16:58:59 2008 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Thu Sep 18 17:18:10 2008 -0600
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -559,7 +557,6 @@
 static int
 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 {
-	uint_t readonly;
 	int error;
 
 	error = zfs_register_callbacks(zfsvfs->z_vfs);
@@ -579,44 +576,22 @@
 	 * operations out since we closed the ZIL.
 	 */
 	if (mounting) {
+		boolean_t readonly;
+
 		/*
 		 * During replay we remove the read only flag to
 		 * allow replays to succeed.
 		 */
 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
-		if (readonly != 0)
-			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
-		else
-			zfs_unlinked_drain(zfsvfs);
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 
 		/*
 		 * Parse and replay the intent log.
-		 *
-		 * Because of ziltest, this must be done after
-		 * zfs_unlinked_drain().  (Further note: ziltest doesn't
-		 * use readonly mounts, where zfs_unlinked_drain() isn't
-		 * called.)  This is because ziltest causes spa_sync()
-		 * to think it's committed, but actually it is not, so
-		 * the intent log contains many txg's worth of changes.
-		 *
-		 * In particular, if object N is in the unlinked set in
-		 * the last txg to actually sync, then it could be
-		 * actually freed in a later txg and then reallocated in
-		 * a yet later txg.  This would write a "create object
-		 * N" record to the intent log.  Normally, this would be
-		 * fine because the spa_sync() would have written out
-		 * the fact that object N is free, before we could write
-		 * the "create object N" intent log record.
-		 *
-		 * But when we are in ziltest mode, we advance the "open
-		 * txg" without actually spa_sync()-ing the changes to
-		 * disk.  So we would see that object N is still
-		 * allocated and in the unlinked set, and there is an
-		 * intent log record saying to allocate it.
 		 */
 		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
-		    zfs_replay_vector);
+		    zfs_replay_vector, zfs_unlinked_drain);
 
+		zfs_unlinked_drain(zfsvfs);
 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
 	}
 
--- a/usr/src/uts/common/fs/zfs/zil.c	Thu Sep 18 16:58:59 2008 -0600
+++ b/usr/src/uts/common/fs/zfs/zil.c	Thu Sep 18 17:18:10 2008 -0600
@@ -1453,6 +1453,7 @@
 typedef struct zil_replay_arg {
 	objset_t	*zr_os;
 	zil_replay_func_t **zr_replay;
+	zil_replay_cleaner_t *zr_replay_cleaner;
 	void		*zr_arg;
 	uint64_t	*zr_txgp;
 	boolean_t	zr_byteswap;
@@ -1583,6 +1584,8 @@
 		 * transaction.
 		 */
 		if (error != ERESTART && !sunk) {
+			if (zr->zr_replay_cleaner)
+				zr->zr_replay_cleaner(zr->zr_arg);
 			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
 			sunk = B_TRUE;
 			continue; /* retry */
@@ -1621,7 +1624,8 @@
  */
 void
 zil_replay(objset_t *os, void *arg, uint64_t *txgp,
-	zil_replay_func_t *replay_func[TX_MAX_TYPE])
+	zil_replay_func_t *replay_func[TX_MAX_TYPE],
+	zil_replay_cleaner_t *replay_cleaner)
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *zh = zilog->zl_header;
@@ -1634,6 +1638,7 @@
 
 	zr.zr_os = os;
 	zr.zr_replay = replay_func;
+	zr.zr_replay_cleaner = replay_cleaner;
 	zr.zr_arg = arg;
 	zr.zr_txgp = txgp;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
--- a/usr/src/uts/common/fs/zfs/zvol.c	Thu Sep 18 16:58:59 2008 -0600
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Thu Sep 18 17:18:10 2008 -0600
@@ -713,7 +713,7 @@
 	ASSERT(error == 0);
 	zv->zv_volblocksize = doi.doi_data_block_size;
 
-	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
+	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);
 	zvol_size_changed(zv, maj);
 
 	/* XXX this should handle the possible i/o error */