changeset 10797:8e4cf0dbd8ca

6883663 CPUs observed not downclocking when system is otherwise idle
author Eric Saxe <Eric.Saxe@Sun.COM>
date Wed, 14 Oct 2009 14:54:01 -0700
parents c03a117618e2
children 209f077e2786
files usr/src/uts/common/os/cpu_pm.c usr/src/uts/common/sys/cpu_pm.h usr/src/uts/common/sys/time.h usr/src/uts/i86pc/os/mp_machdep.c usr/src/uts/i86pc/os/timestamp.c usr/src/uts/intel/ia32/os/archdep.c usr/src/uts/intel/sys/archsystm.h usr/src/uts/sun4/io/cbe.c
diffstat 8 files changed, 131 insertions(+), 157 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/os/cpu_pm.c	Wed Oct 14 11:48:20 2009 -0700
+++ b/usr/src/uts/common/os/cpu_pm.c	Wed Oct 14 14:54:01 2009 -0700
@@ -25,6 +25,7 @@
 
 #include <sys/cpu_pm.h>
 #include <sys/cmn_err.h>
+#include <sys/time.h>
 #include <sys/sdt.h>
 
 /*
@@ -69,7 +70,7 @@
  *
  * Avoiding state thrashing in the presence of transient periods of utilization
  * and idleness while still being responsive to non-transient periods is key.
- * The power manager implmeents several "governors" that are used to throttle
+ * The power manager implements a "governor" that is used to throttle
  * state transitions when a significant amount of transient idle or transient
  * work is detected.
  *
@@ -81,6 +82,28 @@
  * wait for an event elsewhere in the system. Where the idle period is short
  * enough, the overhead associated with making the state transition doesn't
  * justify the power savings.
+ *
+ * The following is the state machine for the governor implemented by
+ * cpupm_utilization_event():
+ *
+ *         ----->---tw---->-----
+ *        /                     \
+ *      (I)-<-ti-<-     -<-ntw-<(W)
+ *       |         \   /         |
+ *       \          \ /          /
+ *        >-nti/rm->(D)--->-tw->-
+ * Key:
+ *
+ * States
+ * - (D): Default (ungoverned)
+ * - (W): Transient work governed
+ * - (I): Transient idle governed
+ * State Transitions
+ * - tw: transient work
+ * - ti: transient idleness
+ * - ntw: non-transient work
+ * - nti: non-transient idleness
+ * - rm: thread remain event
  */
 
 static cpupm_domain_t *cpupm_domains = NULL;
@@ -109,39 +132,35 @@
 /*
  * Number of mispredictions after which future transitions will be governed.
  */
-int cpupm_mispredict_thresh = 2;
+int cpupm_mispredict_thresh = 4;
 
 /*
  * Likewise, the number of mispredicted governed transitions after which the
  * governor will be removed.
  */
-int cpupm_mispredict_gov_thresh = 10;
+int cpupm_mispredict_gov_thresh = 4;
 
 /*
- * The transient work and transient idle prediction intervals are initialized
- * to be some multiple of the amount of time it takes to transition a power
- * domain from the highest to the lowest power state, and back again, which
- * is measured.
- *
- * The default values of those multiples are specified here. Tuning them higher
- * will result in the transient work, and transient idle governors being used
- * more aggresively, which limits the frequency of state transitions at the
- * expense of performance and power savings, respectively.
+ * The transient work and transient idle prediction intervals are specified
+ * here. Tuning them higher will result in the transient work, and transient
+ * idle governors being used more aggressively, which limits the frequency of
+ * state transitions at the expense of performance and power savings,
+ * respectively. The intervals are specified in nanoseconds.
+ */
+/*
+ * 400 usec
  */
-#define	CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
-#define	CPUPM_TW_GOV_DEFAULT_MULTIPLE 25
-
+#define	CPUPM_DEFAULT_TI_INTERVAL	400000
 /*
- * Number of high=>low=>high measurements performed, of which the average
- * is taken.
+ * 400 usec
  */
-#define	CPUPM_BENCHMARK_ITERS 5
+#define	CPUPM_DEFAULT_TW_INTERVAL	400000
 
-int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
-int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;
+hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL;
+hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL;
 
 
-static int	cpupm_governor_initialize(void);
+static void	cpupm_governor_initialize(void);
 static void	cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
 
 cpupm_policy_t
@@ -201,23 +220,15 @@
 			break;
 		}
 
-		pause_cpus(NULL);
 		/*
-		 * Attempt to initialize the governor parameters the first
-		 * time through.
+		 * Initialize the governor parameters the first time through.
 		 */
 		if (gov_init == 0) {
-			result = cpupm_governor_initialize();
-			if (result == 0) {
-				gov_init = 1;
-			} else {
-				/*
-				 * Failed to initialize the governor parameters
-				 */
-				start_cpus();
-				break;
-			}
+			cpupm_governor_initialize();
+			gov_init = 1;
 		}
+
+		pause_cpus(NULL);
 		cpupm_policy = CPUPM_POLICY_ELASTIC;
 		start_cpus();
 
@@ -398,7 +409,7 @@
 	 * If the utilization has dropped to zero, then transition the
 	 * domain to its lowest power state.
 	 *
-	 * Statistics are maintained to implement governors to reduce state
+	 * Statistics are maintained to implement a governor to reduce state
 	 * transitions resulting from either transient work, or periods of
 	 * transient idleness on the domain.
 	 */
@@ -415,8 +426,8 @@
 		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
 			new_state =
 			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
-			if (dom->cpd_tw_governed == B_TRUE) {
-				dom->cpd_tw_governed = B_FALSE;
+			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
+				dom->cpd_governor = CPUPM_GOV_DISENGAGED;
 				dom->cpd_tw = 0;
 			}
 		}
@@ -437,10 +448,17 @@
 			/*
 			 * There's non-zero utilization, and the domain is
 			 * running in the lower power state. Before we
-			 * consider raising power, perform some book keeping
-			 * for the transient idle governor.
+			 * consider raising power, check if the preceding
+			 * idle period was transient in duration.
+			 *
+			 * If the domain is already transient work governed,
+			 * then we don't bother maintaining transient idle
+			 * statistics, as the presence of enough transient work
+			 * can also make the domain frequently transiently idle.
+			 * In this case, we still want to remain transient work
+			 * governed.
 			 */
-			if (dom->cpd_ti_governed == B_FALSE) {
+			if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
 				if ((now - last) < cpupm_ti_predict_interval) {
 					/*
 					 * We're raising the domain power and
@@ -448,18 +466,8 @@
 					 * this a mispredicted power state
 					 * transition due to a transient
 					 * idle period.
-					 *
-					 * Note: The presence of enough
-					 * transient work across the domain can
-					 * result in frequent transient idle
-					 * periods. We don't want the ti
-					 * governor being installed as a side
-					 * effect of transient work, so the ti
-					 * governor is left alone if the tw
-					 * governor is already installed.
 					 */
-					if (dom->cpd_tw_governed == B_FALSE &&
-					    ++dom->cpd_ti >=
+					if (++dom->cpd_ti >=
 					    cpupm_mispredict_thresh) {
 						/*
 						 * There's enough transient
@@ -467,7 +475,8 @@
 						 * justify governing future
 						 * lowering requests.
 						 */
-						dom->cpd_ti_governed = B_TRUE;
+						dom->cpd_governor =
+						    CPUPM_GOV_TRANS_IDLE;
 						dom->cpd_ti = 0;
 						DTRACE_PROBE1(
 						    cpupm__ti__governed,
@@ -481,7 +490,7 @@
 					dom->cpd_ti = 0;
 				}
 			}
-			if (dom->cpd_tw_governed == B_TRUE) {
+			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
 				/*
 				 * Raise requests are governed due to
 				 * transient work.
@@ -489,22 +498,6 @@
 				DTRACE_PROBE1(cpupm__raise__governed,
 				    cpupm_domain_t *, dom);
 
-				/*
-				 * It's likely that we'll be governed for a
-				 * while. If the transient idle governor is
-				 * also in place, examine the preceeding idle
-				 * interval to see if that still makes sense.
-				 */
-				if (dom->cpd_ti_governed == B_TRUE &&
-				    ((now - last) >=
-				    cpupm_ti_predict_interval)) {
-					if (++dom->cpd_ti >=
-					    cpupm_mispredict_gov_thresh) {
-						dom->cpd_ti_governed =
-						    B_FALSE;
-						dom->cpd_ti = 0;
-					}
-				}
 				return;
 			}
 			/*
@@ -521,7 +514,8 @@
 			 * perform some book keeping if the last lowering
 			 * request was governed.
 			 */
-			if (dom->cpd_ti_governed == B_TRUE) {
+			if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
+
 				if ((now - last) >= cpupm_ti_predict_interval) {
 					/*
 					 * The domain is transient idle
@@ -535,7 +529,8 @@
 						 * idle periods to justify
 						 * removing the governor.
 						 */
-						dom->cpd_ti_governed = B_FALSE;
+						dom->cpd_governor =
+						    CPUPM_GOV_DISENGAGED;
 						dom->cpd_ti = 0;
 						DTRACE_PROBE1(
 						    cpupm__ti__ungoverned,
@@ -570,7 +565,7 @@
 			 * perform some book keeping for the transient work
 			 * governor.
 			 */
-			if (dom->cpd_tw_governed == B_FALSE) {
+			if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
 				if ((now - last) < cpupm_tw_predict_interval) {
 					/*
 					 * We're lowering the domain power and
@@ -581,12 +576,13 @@
 					if (++dom->cpd_tw >=
 					    cpupm_mispredict_thresh) {
 						/*
-						 * There's enough transient idle
+						 * There's enough transient work
 						 * transitions to justify
-						 * governing future lowering
+						 * governing future raise
 						 * requests.
 						 */
-						dom->cpd_tw_governed = B_TRUE;
+						dom->cpd_governor =
+						    CPUPM_GOV_TRANS_WORK;
 						dom->cpd_tw = 0;
 						DTRACE_PROBE1(
 						    cpupm__tw__governed,
@@ -600,7 +596,7 @@
 					dom->cpd_tw = 0;
 				}
 			}
-			if (dom->cpd_ti_governed == B_TRUE) {
+			if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
 				/*
 				 * Lowering requests are governed due to
 				 * transient idleness.
@@ -608,22 +604,6 @@
 				DTRACE_PROBE1(cpupm__lowering__governed,
 				    cpupm_domain_t *, dom);
 
-				/*
-				 * It's likely that we'll be governed for a
-				 * while. If the transient work governor is
-				 * also in place, examine the preceeding busy
-				 * interval to see if that still makes sense.
-				 */
-				if (dom->cpd_tw_governed == B_TRUE &&
-				    ((now - last) >=
-				    cpupm_tw_predict_interval)) {
-					if (++dom->cpd_tw >=
-					    cpupm_mispredict_gov_thresh) {
-						dom->cpd_tw_governed =
-						    B_FALSE;
-						dom->cpd_tw = 0;
-					}
-				}
 				return;
 			}
 
@@ -642,7 +622,7 @@
 			 * perform some book keeping if the last raising
 			 * request was governed.
 			 */
-			if (dom->cpd_tw_governed == B_TRUE) {
+			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
 				if ((now - last) >= cpupm_tw_predict_interval) {
 					/*
 					 * The domain is transient work
@@ -656,7 +636,8 @@
 						 * work to justify removing
 						 * the governor.
 						 */
-						dom->cpd_tw_governed = B_FALSE;
+						dom->cpd_governor =
+						    CPUPM_GOV_DISENGAGED;
 						dom->cpd_tw = 0;
 						DTRACE_PROBE1(
 						    cpupm__tw__ungoverned,
@@ -741,62 +722,18 @@
 }
 
 /*
- * Benchmark some power state transitions and use the transition latencies as
- * a basis for initializing parameters for the transient idle and transient
- * work governors.
- *
- * Returns 0 on success or -1 if the governor parameters could not be
- * initialized.
+ * Initialize the parameters for the transience governor state machine
  */
-static int
+static void
 cpupm_governor_initialize(void)
 {
-	cpu_t		*cp = CPU;
-	cpupm_domain_t	*dom;
-	cpupm_state_t	*low, *high;
-	id_t		did;
-	hrtime_t	start, delta, deltas = 0;
-	int		iterations;
-
-	did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
-	if (did == CPUPM_NO_DOMAIN)
-		return (-1);
-
-	dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
-	if (dom == NULL)
-		return (-1);
-
-	low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
-	high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
-
-	for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) {
-
-		/*
-		 * Measure the amount of time it takes to transition the
-		 * domain down to the lowest, and back to the highest power
-		 * state.
-		 */
-		start = gethrtime_unscaled();
-		(void) cpupm_change_state(cp, dom, low);
-		(void) cpupm_change_state(cp, dom, high);
-		delta = gethrtime_unscaled() - start;
-
-		DTRACE_PROBE1(cpupm__benchmark__latency,
-		    hrtime_t, delta);
-
-		deltas += delta;
-	}
-
 	/*
-	 * Figure the average latency, and tune the transient work and
-	 * transient idle prediction intervals accordingly.
+	 * The default prediction intervals are specified in nanoseconds.
+	 * Convert these to the equivalent in unscaled hrtime, which is the
+	 * format of the timestamps passed to cpupm_utilization_event()
 	 */
-	delta = deltas / iterations;
-
-	cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
-	cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;
-
-	return (0);
+	cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval);
+	cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval);
 }
 
 /*
--- a/usr/src/uts/common/sys/cpu_pm.h	Wed Oct 14 11:48:20 2009 -0700
+++ b/usr/src/uts/common/sys/cpu_pm.h	Wed Oct 14 14:54:01 2009 -0700
@@ -65,6 +65,15 @@
 } cpupm_state_name_t;
 
 /*
+ * Possible states for the domain's transience governor
+ */
+typedef enum cpupm_gov_state_t {
+	CPUPM_GOV_DISENGAGED,
+	CPUPM_GOV_TRANS_IDLE,	/* Transient idleness, lowerings disabled */
+	CPUPM_GOV_TRANS_WORK	/* Transient work, raises disabled */
+} cpupm_gov_state_t;
+
+/*
  * Utilization events delivered by the dispatcher.
  */
 typedef enum cpupm_util_event {
@@ -95,10 +104,9 @@
 	cpupm_state_t		*cpd_named_states[CPUPM_STATE_NAMES];
 	hrtime_t		cpd_last_raise;	/* Last raise request time */
 	hrtime_t		cpd_last_lower;	/* last lower request time */
+	int			cpd_ti;		/* transient idle history */
 	int			cpd_tw;		/* transient work history */
-	int			cpd_ti;		/* transient idle history */
-	boolean_t		cpd_ti_governed; /* transient idle governor */
-	boolean_t		cpd_tw_governed; /* transient work governor */
+	cpupm_gov_state_t	cpd_governor;   /* transience governor */
 	struct cpupm_domain	*cpd_next;
 } cpupm_domain_t;
 
--- a/usr/src/uts/common/sys/time.h	Wed Oct 14 11:48:20 2009 -0700
+++ b/usr/src/uts/common/sys/time.h	Wed Oct 14 14:54:01 2009 -0700
@@ -9,7 +9,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -344,6 +344,7 @@
 extern	hrtime_t	gethrtime_max(void);
 extern	hrtime_t	gethrtime_waitfree(void);
 extern	void		scalehrtime(hrtime_t *);
+extern	uint64_t	unscalehrtime(hrtime_t);
 extern	void 		gethrestime(timespec_t *);
 extern	time_t 		gethrestime_sec(void);
 extern	void		gethrestime_lasttick(timespec_t *);
--- a/usr/src/uts/i86pc/os/mp_machdep.c	Wed Oct 14 11:48:20 2009 -0700
+++ b/usr/src/uts/i86pc/os/mp_machdep.c	Wed Oct 14 14:54:01 2009 -0700
@@ -85,6 +85,7 @@
 static void mach_notify_error(int level, char *errmsg);
 static hrtime_t dummy_hrtime(void);
 static void dummy_scalehrtime(hrtime_t *);
+static uint64_t dummy_unscalehrtime(hrtime_t);
 void cpu_idle(void);
 static void cpu_wakeup(cpu_t *, int);
 #ifndef __xpv
@@ -133,6 +134,7 @@
 hrtime_t (*gethrtimef)(void)	= dummy_hrtime;
 hrtime_t (*gethrtimeunscaledf)(void)	= dummy_hrtime;
 void (*scalehrtimef)(hrtime_t *)	= dummy_scalehrtime;
+uint64_t (*unscalehrtimef)(hrtime_t)	= dummy_unscalehrtime;
 int (*psm_translate_irq)(dev_info_t *, int) = mach_translate_irq;
 void (*gethrestimef)(timestruc_t *) = pc_gethrestime;
 void (*psm_notify_error)(int, char *) = (void (*)(int, char *))NULL;
@@ -372,6 +374,12 @@
 dummy_scalehrtime(hrtime_t *ticks)
 {}
 
+static uint64_t
+dummy_unscalehrtime(hrtime_t nsecs)
+{
+	return ((uint64_t)nsecs);
+}
+
 /*
  * Supports Deep C-State power saving idle loop.
  */
--- a/usr/src/uts/i86pc/os/timestamp.c	Wed Oct 14 11:48:20 2009 -0700
+++ b/usr/src/uts/i86pc/os/timestamp.c	Wed Oct 14 14:54:01 2009 -0700
@@ -96,6 +96,7 @@
 #define	NSEC_SHIFT 5
 
 static uint_t nsec_scale;
+static uint_t nsec_unscale;
 
 /*
  * These two variables used to be grouped together inside of a structure that
@@ -341,6 +342,20 @@
 	return (tsc);
 }
 
+/*
+ * Convert a nanosecond-based timestamp to unscaled TSC ticks
+ */
+uint64_t
+tsc_unscalehrtime(hrtime_t nsec)
+{
+	hrtime_t tsc;
+
+	if (tsc_gethrtime_enable) {
+		TSC_CONVERT(nsec, tsc, nsec_unscale);
+		return (tsc);
+	}
+	return ((uint64_t)nsec);
+}
 
 /* Convert a tsc timestamp to nanoseconds */
 void
@@ -603,6 +618,8 @@
 	ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
 	nsec_scale =
 	    (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
+	nsec_unscale =
+	    (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC);
 
 	flags = clear_int_flag();
 	tsc = tsc_read();
@@ -612,6 +629,7 @@
 	gethrtimef = tsc_gethrtime;
 	gethrtimeunscaledf = tsc_gethrtimeunscaled;
 	scalehrtimef = tsc_scalehrtime;
+	unscalehrtimef = tsc_unscalehrtime;
 	hrtime_tick = tsc_tick;
 	gethrtime_hires = 1;
 	/*
--- a/usr/src/uts/intel/ia32/os/archdep.c	Wed Oct 14 11:48:20 2009 -0700
+++ b/usr/src/uts/intel/ia32/os/archdep.c	Wed Oct 14 14:54:01 2009 -0700
@@ -19,16 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved  	*/
 
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/vmparam.h>
@@ -1353,6 +1350,12 @@
 	scalehrtimef(hrt);
 }
 
+uint64_t
+unscalehrtime(hrtime_t nsecs)
+{
+	return (unscalehrtimef(nsecs));
+}
+
 void
 gethrestime(timespec_t *tp)
 {
--- a/usr/src/uts/intel/sys/archsystm.h	Wed Oct 14 11:48:20 2009 -0700
+++ b/usr/src/uts/intel/sys/archsystm.h	Wed Oct 14 14:54:01 2009 -0700
@@ -169,6 +169,7 @@
 extern hrtime_t (*gethrtimef)(void);
 extern hrtime_t (*gethrtimeunscaledf)(void);
 extern void (*scalehrtimef)(hrtime_t *);
+extern uint64_t (*unscalehrtimef)(hrtime_t);
 extern void (*gethrestimef)(timestruc_t *);
 
 extern void av_dispatch_softvect(uint_t);
--- a/usr/src/uts/sun4/io/cbe.c	Wed Oct 14 11:48:20 2009 -0700
+++ b/usr/src/uts/sun4/io/cbe.c	Wed Oct 14 14:54:01 2009 -0700
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
@@ -56,7 +54,7 @@
 	return (q * sys_tick_freq + ((r * sys_tick_freq) / NANOSEC));
 }
 
-static uint64_t
+uint64_t
 unscalehrtime(hrtime_t ts)
 {
 	uint64_t unscale = 0;