changeset 13975:ef6409bc370f

3582 zfs_delay() should support a variable resolution 3584 DTrace sdt probes for ZFS txg states Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Dan McDonald <danmcd@nexenta.com> Reviewed by: Richard Elling <richard.elling@dey-sys.com> Approved by: Garrett D'Amore <garrett@damore.org>
author Adam Leventhal <ahl@delphix.com>
date Fri, 01 Mar 2013 15:46:07 -0800
parents 9eec6e773689
children 6910539f4a4c
files usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c usr/src/lib/libzpool/common/kernel.c usr/src/lib/libzpool/common/sys/zfs_context.h usr/src/uts/common/conf/param.c usr/src/uts/common/fs/zfs/dsl_dir.c usr/src/uts/common/fs/zfs/dsl_pool.c usr/src/uts/common/fs/zfs/dsl_scan.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/txg.h usr/src/uts/common/fs/zfs/sys/txg_impl.h usr/src/uts/common/fs/zfs/txg.c usr/src/uts/common/os/condvar.c usr/src/uts/common/sys/condvar.h usr/src/uts/common/sys/time.h
diffstat 14 files changed, 94 insertions(+), 43 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c	Fri Mar 01 15:46:07 2013 -0800
@@ -35,8 +35,6 @@
 
 #define	IPADDRL		sizeof (struct in_addr)
 #define	RARPRETRIES	5
-#define	MSEC2NSEC(msec)	((msec) * 1000000)
-#define	NSEC2MSEC(nsec)	((nsec) / 1000000)
 
 /*
  * The following value (8) is determined to work reliably in switched 10/100MB
--- a/usr/src/lib/libzpool/common/kernel.c	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/lib/libzpool/common/kernel.c	Fri Mar 01 15:46:07 2013 -0800
@@ -329,6 +329,41 @@
 	return (1);
 }
 
+/*ARGSUSED*/
+clock_t
+cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+    int flag)
+{
+	int error;
+	timestruc_t ts;
+	hrtime_t delta;
+
+	ASSERT(flag == 0);
+
+top:
+	delta = tim - gethrtime();
+	if (delta <= 0)
+		return (-1);
+
+	ts.tv_sec = delta / NANOSEC;
+	ts.tv_nsec = delta % NANOSEC;
+
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;
+	error = cond_reltimedwait(cv, &mp->m_lock, &ts);
+	mp->m_owner = curthread;
+
+	if (error == ETIME)
+		return (-1);
+
+	if (error == EINTR)
+		goto top;
+
+	ASSERT(error == 0);
+
+	return (1);
+}
+
 void
 cv_signal(kcondvar_t *cv)
 {
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h	Fri Mar 01 15:46:07 2013 -0800
@@ -254,6 +254,8 @@
 extern void cv_destroy(kcondvar_t *cv);
 extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
 extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
+extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+    hrtime_t res, int flag);
 extern void cv_signal(kcondvar_t *cv);
 extern void cv_broadcast(kcondvar_t *cv);
 
--- a/usr/src/uts/common/conf/param.c	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/conf/param.c	Fri Mar 01 15:46:07 2013 -0800
@@ -695,10 +695,10 @@
 	 * should re-evaluate their usage and specify the appropriate
 	 * resolution.
 	 */
-	time_res[TR_NANOSEC] = SEC;
-	time_res[TR_MICROSEC] = MILLISEC;
-	time_res[TR_MILLISEC] = MICROSEC;
-	time_res[TR_SEC] = NANOSEC;
+	time_res[TR_NANOSEC] = NANOSEC / NANOSEC;
+	time_res[TR_MICROSEC] = NANOSEC / MICROSEC;
+	time_res[TR_MILLISEC] = NANOSEC / MILLISEC;
+	time_res[TR_SEC] = NANOSEC / SEC;
 	time_res[TR_CLOCK_TICK] = nsec_per_tick;
 }
 
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Fri Mar 01 15:46:07 2013 -0800
@@ -738,7 +738,8 @@
 		err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
 	} else {
 		if (err == EAGAIN) {
-			txg_delay(dd->dd_pool, tx->tx_txg, 1);
+			txg_delay(dd->dd_pool, tx->tx_txg,
+			    MSEC2NSEC(10), MSEC2NSEC(10));
 			err = ERESTART;
 		}
 		dsl_pool_memory_pressure(dd->dd_pool);
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Fri Mar 01 15:46:07 2013 -0800
@@ -58,6 +58,9 @@
 
 static pgcnt_t old_physmem = 0;
 
+hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
+hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
+
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
@@ -511,12 +514,13 @@
 	 * Weight the throughput calculation towards the current value:
 	 * 	thru = 3/4 old_thru + 1/4 new_thru
 	 *
-	 * Note: write_time is in nanosecs, so write_time/MICROSEC
-	 * yields millisecs
+	 * Note: write_time is in nanosecs while dp_throughput is expressed in
+	 * bytes per millisecond.
 	 */
 	ASSERT(zfs_write_limit_min > 0);
-	if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
-		uint64_t throughput = data_written / (write_time / MICROSEC);
+	if (data_written > zfs_write_limit_min / 8 &&
+	    write_time > MSEC2NSEC(1)) {
+		uint64_t throughput = data_written / NSEC2MSEC(write_time);
 
 		if (dp->dp_throughput)
 			dp->dp_throughput = throughput / 4 +
@@ -614,8 +618,10 @@
 	 * the caller 1 clock tick.  This will slow down the "fill"
 	 * rate until the sync process can catch up with us.
 	 */
-	if (reserved && reserved > (write_limit - (write_limit >> 3)))
-		txg_delay(dp, tx->tx_txg, 1);
+	if (reserved && reserved > (write_limit - (write_limit >> 3))) {
+		txg_delay(dp, tx->tx_txg, zfs_throttle_delay,
+		    zfs_throttle_resolution);
+	}
 
 	return (0);
 }
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c	Fri Mar 01 15:46:07 2013 -0800
@@ -403,7 +403,7 @@
 	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
 	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
-	    (elapsed_nanosecs / MICROSEC > mintime &&
+	    (NSEC2MSEC(elapsed_nanosecs) > mintime &&
 	    txg_sync_waiting(scn->scn_dp)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa)) {
 		if (zb) {
@@ -1308,7 +1308,7 @@
 
 	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
-	    (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+	    (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
 	    txg_sync_waiting(scn->scn_dp)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa));
 }
@@ -1433,7 +1433,7 @@
 			    "free_bpobj/bptree txg %llu",
 			    (longlong_t)scn->scn_visited_this_txg,
 			    (longlong_t)
-			    (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+			    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
 			    (longlong_t)tx->tx_txg);
 			scn->scn_visited_this_txg = 0;
 			/*
@@ -1481,7 +1481,7 @@
 
 	zfs_dbgmsg("visited %llu blocks in %llums",
 	    (longlong_t)scn->scn_visited_this_txg,
-	    (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+	    (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
 
 	if (!scn->scn_pausing) {
 		/* finished with scan. */
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Fri Mar 01 15:46:07 2013 -0800
@@ -499,8 +499,8 @@
 	hdlr.cyh_arg = spa;
 	hdlr.cyh_level = CY_LOW_LEVEL;
 
-	spa->spa_deadman_synctime = zfs_deadman_synctime *
-	    zfs_txg_synctime_ms * MICROSEC;
+	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
+	    zfs_txg_synctime_ms);
 
 	/*
 	 * This determines how often we need to check for hung I/Os after
@@ -508,7 +508,7 @@
 	 * an expensive operation we don't want to check too frequently.
 	 * Instead wait for 5 synctimes before checking again.
 	 */
-	when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC;
+	when.cyt_interval = MSEC2NSEC(5 * zfs_txg_synctime_ms);
 	when.cyt_when = CY_INFINITY;
 	mutex_enter(&cpu_lock);
 	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
--- a/usr/src/uts/common/fs/zfs/sys/txg.h	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/txg.h	Fri Mar 01 15:46:07 2013 -0800
@@ -74,13 +74,8 @@
 extern void txg_rele_to_sync(txg_handle_t *txghp);
 extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
 
-/*
- * Delay the caller by the specified number of ticks or until
- * the txg closes (whichever comes first).  This is intended
- * to be used to throttle writers when the system nears its
- * capacity.
- */
-extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks);
+extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
+    hrtime_t resolution);
 
 /*
  * Wait until the given transaction group has finished syncing.
--- a/usr/src/uts/common/fs/zfs/sys/txg_impl.h	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h	Fri Mar 01 15:46:07 2013 -0800
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_TXG_IMPL_H
 #define	_SYS_TXG_IMPL_H
 
@@ -36,14 +40,14 @@
 struct tx_cpu {
 	kmutex_t	tc_lock;
 	kcondvar_t	tc_cv[TXG_SIZE];
-	uint64_t	tc_count[TXG_SIZE];
+	uint64_t	tc_count[TXG_SIZE];	/* tx hold count on each txg */
 	list_t		tc_callbacks[TXG_SIZE]; /* commit cb list */
-	char		tc_pad[16];
+	char		tc_pad[16];		/* pad to fill 3 cache lines */
 };
 
 typedef struct tx_state {
-	tx_cpu_t	*tx_cpu;	/* protects right to enter txg	*/
-	kmutex_t	tx_sync_lock;	/* protects tx_state_t */
+	tx_cpu_t	*tx_cpu;	/* protects access to tx_open_txg */
+	kmutex_t	tx_sync_lock;	/* protects the rest of this struct */
 	uint64_t	tx_open_txg;	/* currently open txg id */
 	uint64_t	tx_quiesced_txg; /* quiesced txg waiting for sync */
 	uint64_t	tx_syncing_txg;	/* currently syncing txg id */
--- a/usr/src/uts/common/fs/zfs/txg.c	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/fs/zfs/txg.c	Fri Mar 01 15:46:07 2013 -0800
@@ -232,7 +232,7 @@
 }
 
 static void
-txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
 {
 	CALLB_CPR_SAFE_BEGIN(cpr);
 
@@ -353,6 +353,9 @@
 	ASSERT(txg == tx->tx_open_txg);
 	tx->tx_open_txg++;
 
+	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
+	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
+
 	/*
 	 * Now that we've incremented tx_open_txg, we can let threads
 	 * enter the next transaction group.
@@ -475,6 +478,7 @@
 		txg = tx->tx_quiesced_txg;
 		tx->tx_quiesced_txg = 0;
 		tx->tx_syncing_txg = txg;
+		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
 		cv_broadcast(&tx->tx_quiesce_more_cv);
 
 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -488,6 +492,7 @@
 		mutex_enter(&tx->tx_sync_lock);
 		tx->tx_synced_txg = txg;
 		tx->tx_syncing_txg = 0;
+		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
 		cv_broadcast(&tx->tx_sync_done_cv);
 
 		/*
@@ -536,21 +541,22 @@
 		 */
 		dprintf("quiesce done, handing off txg %llu\n", txg);
 		tx->tx_quiesced_txg = txg;
+		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
 		cv_broadcast(&tx->tx_sync_more_cv);
 		cv_broadcast(&tx->tx_quiesce_done_cv);
 	}
 }
 
 /*
- * Delay this thread by 'ticks' if we are still in the open transaction
- * group and there is already a waiting txg quiesing or quiesced.  Abort
- * the delay if this txg stalls or enters the quiesing state.
+ * Delay this thread by delay nanoseconds if we are still in the open
+ * transaction group and there is already a waiting txg quiescing or quiesced.
+ * Abort the delay if this txg stalls or enters the quiescing state.
  */
 void
-txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
 {
 	tx_state_t *tx = &dp->dp_tx;
-	clock_t timeout = ddi_get_lbolt() + ticks;
+	hrtime_t start = gethrtime();
 
 	/* don't delay if this txg could transition to quiesing immediately */
 	if (tx->tx_open_txg > txg ||
@@ -563,10 +569,11 @@
 		return;
 	}
 
-	while (ddi_get_lbolt() < timeout &&
-	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
-		(void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
-		    timeout);
+	while (gethrtime() - start < delay &&
+	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
+		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
+		    &tx->tx_sync_lock, delay, resolution, 0);
+	}
 
 	mutex_exit(&tx->tx_sync_lock);
 }
--- a/usr/src/uts/common/os/condvar.c	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/os/condvar.c	Fri Mar 01 15:46:07 2013 -0800
@@ -43,8 +43,6 @@
 #include <sys/sdt.h>
 #include <sys/callo.h>
 
-clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t, int);
-
 /*
  * CV_MAX_WAITERS is the maximum number of waiters we track; once
  * the number becomes higher than that, we look at the sleepq to
--- a/usr/src/uts/common/sys/condvar.h	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/sys/condvar.h	Fri Mar 01 15:46:07 2013 -0800
@@ -94,6 +94,8 @@
 extern	void	cv_wait(kcondvar_t *, kmutex_t *);
 extern	void	cv_wait_stop(kcondvar_t *, kmutex_t *, int);
 extern	clock_t	cv_timedwait(kcondvar_t *, kmutex_t *, clock_t);
+extern	clock_t	cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t,
+    int);
 extern	clock_t	cv_reltimedwait(kcondvar_t *, kmutex_t *, clock_t, time_res_t);
 extern	int	cv_wait_sig(kcondvar_t *, kmutex_t *);
 extern	clock_t	cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t);
--- a/usr/src/uts/common/sys/time.h	Thu Feb 28 13:02:24 2013 -0800
+++ b/usr/src/uts/common/sys/time.h	Fri Mar 01 15:46:07 2013 -0800
@@ -236,6 +236,9 @@
 #define	MICROSEC	1000000
 #define	NANOSEC		1000000000
 
+#define	MSEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / MILLISEC))
+#define	NSEC2MSEC(n)	((n) / (NANOSEC / MILLISEC))
+
 #endif /* !defined(__XOPEN_OR_POSIX) || defined(__EXTENSIONS__) */
 
 #ifndef	_ASM