# HG changeset patch
# User pm145316
# Date 1175119283 25200
# Node ID 9f2fcd00d0606f63b2dfcb48715265c126a61c86
# Parent  661924bac5685d1604b76a8c168071888bd738ec
6486343 poor mutex performance on large OPL machines

diff -r 661924bac568 -r 9f2fcd00d060 usr/src/uts/common/os/mutex.c
--- a/usr/src/uts/common/os/mutex.c	Wed Mar 28 13:15:12 2007 -0700
+++ b/usr/src/uts/common/os/mutex.c	Wed Mar 28 15:01:23 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -212,8 +212,14 @@
  * throughput was observed with the given values.  For cases where
  * more than 20 threads were waiting on the same lock, lock throughput
  * increased by a factor of 5 or more using the backoff algorithm.
+ *
+ * Some platforms may provide their own platform specific delay code,
+ * using plat_lock_delay(backoff).  If it is available, plat_lock_delay
+ * is executed instead of the default delay code.
  */
 
+#pragma weak plat_lock_delay
+
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/cpuvar.h>
@@ -307,7 +313,11 @@
 
 	CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);
 
-	backoff = BACKOFF_BASE;
+	if (&plat_lock_delay) {
+		backoff = 0;
+	} else {
+		backoff = BACKOFF_BASE;
+	}
 
 	for (;;) {
 spin:
@@ -318,16 +328,20 @@
 		 * the spin_count test and call to nulldev are to prevent
 		 * the compiler optimizer from eliminating the delay loop.
 		 */
-		for (backctr = backoff; backctr; backctr--) {
-			if (!spin_count) (void) nulldev();
-		};    /* delay */
-		backoff = backoff << 1;			/* double it */
-		if (backoff > BACKOFF_CAP) {
-			backoff = BACKOFF_CAP;
+		if (&plat_lock_delay) {
+			plat_lock_delay(&backoff);
+		} else {
+			for (backctr = backoff; backctr; backctr--) {
+				if (!spin_count) (void) nulldev();
+			};    /* delay */
+			backoff = backoff << 1;			/* double it */
+			if (backoff > BACKOFF_CAP) {
+				backoff = BACKOFF_CAP;
+			}
+
+			SMT_PAUSE();
 		}
 
-		SMT_PAUSE();
-
 		if (panicstr)
 			return;
 
@@ -579,7 +593,12 @@
 	if (ncpus == 1)
 		panic("lock_set: %p lock held and only one CPU", lp);
 
-	backoff = BACKOFF_BASE;
+	if (&plat_lock_delay) {
+		backoff = 0;
+	} else {
+		backoff = BACKOFF_BASE;
+	}
+
 	while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
 		if (panicstr)
 			return;
@@ -590,15 +609,20 @@
 		 * the spin_count test and call to nulldev are to prevent
 		 * the compiler optimizer from eliminating the delay loop.
 		 */
-		for (backctr = backoff; backctr; backctr--) {	/* delay */
-			if (!spin_count) (void) nulldev();
-		}
+		if (&plat_lock_delay) {
+			plat_lock_delay(&backoff);
+		} else {
+			/* delay */
+			for (backctr = backoff; backctr; backctr--) {
+				if (!spin_count) (void) nulldev();
+			}
 
-		backoff = backoff << 1;		/* double it */
-		if (backoff > BACKOFF_CAP) {
-			backoff = BACKOFF_CAP;
+			backoff = backoff << 1;		/* double it */
+			if (backoff > BACKOFF_CAP) {
+				backoff = BACKOFF_CAP;
+			}
+			SMT_PAUSE();
 		}
-		SMT_PAUSE();
 	}
 
 	if (spin_count) {
@@ -623,7 +647,11 @@
 
 	ASSERT(new_pil > LOCK_LEVEL);
 
-	backoff = BACKOFF_BASE;
+	if (&plat_lock_delay) {
+		backoff = 0;
+	} else {
+		backoff = BACKOFF_BASE;
+	}
 	do {
 		splx(old_pil);
 		while (LOCK_HELD(lp)) {
@@ -638,15 +666,19 @@
 			 * spin_count test and call to nulldev are to prevent
 			 * compiler optimizer from eliminating the delay loop.
 			 */
-			for (backctr = backoff; backctr; backctr--) {
-				if (!spin_count) (void) nulldev();
+			if (&plat_lock_delay) {
+				plat_lock_delay(&backoff);
+			} else {
+				for (backctr = backoff; backctr; backctr--) {
+					if (!spin_count) (void) nulldev();
+				}
+				backoff = backoff << 1;		/* double it */
+				if (backoff > BACKOFF_CAP) {
+					backoff = BACKOFF_CAP;
+				}
+
+				SMT_PAUSE();
 			}
-			backoff = backoff << 1;		/* double it */
-			if (backoff > BACKOFF_CAP) {
-				backoff = BACKOFF_CAP;
-			}
-
-			SMT_PAUSE();
 		}
 		old_pil = splr(new_pil);
 	} while (!lock_spin_try(lp));
diff -r 661924bac568 -r 9f2fcd00d060 usr/src/uts/common/sys/mutex.h
--- a/usr/src/uts/common/sys/mutex.h	Wed Mar 28 13:15:12 2007 -0700
+++ b/usr/src/uts/common/sys/mutex.h	Wed Mar 28 15:01:23 2007 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1991-1998 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #ifndef _SYS_MUTEX_H
@@ -84,6 +83,7 @@
 extern	void	mutex_exit(kmutex_t *);
 extern	int	mutex_owned(kmutex_t *);
 extern	struct _kthread *mutex_owner(kmutex_t *);
+extern	void	plat_lock_delay(int *);
 
 #endif	/* _KERNEL */
 
diff -r 661924bac568 -r 9f2fcd00d060 usr/src/uts/sun4u/opl/os/opl.c
--- a/usr/src/uts/sun4u/opl/os/opl.c	Wed Mar 28 13:15:12 2007 -0700
+++ b/usr/src/uts/sun4u/opl/os/opl.c	Wed Mar 28 15:01:23 2007 -0700
@@ -46,6 +46,8 @@
 #include <sys/lgrp.h>
 #include <sys/memnode.h>
 #include <sys/sysmacros.h>
+#include <sys/time.h>
+#include <sys/cpu.h>
 #include <vm/vm_dep.h>
 
 int (*opl_get_mem_unum)(int, uint64_t, char *, int, int *);
@@ -88,6 +90,24 @@
 
 static struct memlist *opl_memlist_per_board(struct memlist *ml);
 
+/*
+ * Note: the FF/DC out-of-order instruction engine takes only a
+ * single cycle to execute each spin loop;
+ * for comparison, Panther takes 6 cycles for the same loop.
+ * The OPL sleep instruction takes approximately 1500 nsec.
+ * If spin count = OPL_BOFF_SLEEP * OPL_BOFF_SPIN, then
+ * spin time should be equal to OPL_BOFF_TM nsecs.
+ * Listed values are tuned for 2.15GHz to 2.4GHz systems.
+ * Values may change for future systems.
+ */
+#define	OPL_BOFF_SPIN 720	/* spin iterations per backoff unit */
+#define	OPL_BOFF_BASE 1	/* backoff used when caller passes 0 */
+#define	OPL_BOFF_SLEEP 5	/* backoff units covered by one sleep */
+#define	OPL_BOFF_CAP1 20	/* below this, backoff grows by 1 */
+#define	OPL_BOFF_CAP2 60	/* below this, grows by OPL_BOFF_SLEEP */
+#define	OPL_BOFF_MAX (40 * OPL_BOFF_SLEEP)	/* upper bound on backoff */
+#define	OPL_BOFF_TM 1500	/* nsec budgeted per sleep */
+
 int
 set_platform_max_ncpus(void)
 {
@@ -998,3 +1018,96 @@
 	}
 	return (opl_get_mem_addr(unum, sid, offset, addrp));
 }
+
+void
+plat_lock_delay(int *backoff)
+{
+	int i;
+	int cnt;
+	int flag;
+	int ctr;
+	hrtime_t delay_start;
+	/*
+	 * Platform specific lock delay code for OPL.  Delays according to
+	 * *backoff, then advances *backoff in staged linear steps (capped
+	 * at OPL_BOFF_MAX); a *backoff of 0 starts at OPL_BOFF_BASE.
+	 * The sleep instruction is the preferred method of delay,
+	 * but is too coarse-grained for the initial backoff.
+	 */
+
+	if (*backoff == 0) *backoff = OPL_BOFF_BASE;
+
+	flag = !*backoff;	/* always 0: *backoff is nonzero here */
+
+	if (*backoff < OPL_BOFF_CAP1) {
+		/*
+		 * If desired backoff is long enough,
+		 * use sleep for most of it
+		 */
+		for (cnt = *backoff;
+			cnt >= OPL_BOFF_SLEEP;
+			cnt -= OPL_BOFF_SLEEP) {
+			cpu_smt_pause();
+		}
+		/*
+		 * spin for small remainder of backoff
+		 *
+		 * fake call to nulldev included to prevent
+		 * compiler from optimizing out the spin loop
+		 */
+		for (ctr = cnt * OPL_BOFF_SPIN; ctr; ctr--) {
+			if (flag) (void) nulldev();
+		}
+	} else {
+		/* backoff is very large.  Fill it by sleeping */
+		delay_start = gethrtime();
+		cnt = *backoff/OPL_BOFF_SLEEP;
+		/*
+		 * use sleep instructions for delay
+		 */
+		for (i = 0; i < cnt; i++) {
+			cpu_smt_pause();
+		}
+
+		/*
+		 * Note: if the other strand executes a sleep instruction,
+		 * then the sleep ends immediately with a minimum time of
+		 * 42 clocks.  We check gethrtime to ensure we have
+		 * waited long enough.  And we include both a short
+		 * spin loop and a sleep for any final delay time.
+		 */
+
+		while ((gethrtime() - delay_start) < cnt * OPL_BOFF_TM) {
+			cpu_smt_pause();
+			for (ctr = OPL_BOFF_SPIN; ctr; ctr--) {
+				if (flag) (void) nulldev();
+			}
+		}
+	}
+
+	/*
+	 * We adjust the backoff in three linear stages
+	 * The initial stage has small increases as this phase
+	 * usually handles locks with light contention.  We don't want
+	 * to have a long backoff on a lock that is available.
+	 *
+	 * In the second stage, we are in transition, unsure whether
+	 * the lock is under heavy contention.  As the failures to
+	 * obtain the lock increase, we back off further.
+	 *
+	 * For the final stage, we are in a heavily contended or
+	 * long held lock, so we want to reduce the number of tries.
+	 */
+	if (*backoff < OPL_BOFF_CAP1) {
+		*backoff += 1;
+	} else {
+		if (*backoff < OPL_BOFF_CAP2) {
+			*backoff += OPL_BOFF_SLEEP;
+		} else {
+			*backoff += 2 * OPL_BOFF_SLEEP;
+		}
+		if (*backoff > OPL_BOFF_MAX) {
+			*backoff = OPL_BOFF_MAX;
+		}
+	}
+}