changeset 4878:d39bbc62dfd1

4875742 PAGE_SE_MUTEX() macro needs maintenance; 6517224 pse_mutex show scaling issues with tpc-h
author blakej
date Thu, 16 Aug 2007 17:46:42 -0700
parents 5744980c78cf
children ced5e1e3b491
files usr/src/uts/common/vm/page_lock.c usr/src/uts/i86pc/os/startup.c usr/src/uts/sun4/os/startup.c
diffstat 3 files changed, 77 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/vm/page_lock.c	Thu Aug 16 16:52:20 2007 -0700
+++ b/usr/src/uts/common/vm/page_lock.c	Thu Aug 16 17:46:42 2007 -0700
@@ -36,6 +36,7 @@
 #include <sys/vnode.h>
 #include <sys/bitmap.h>
 #include <sys/lockstat.h>
+#include <sys/sysmacros.h>
 #include <sys/condvar_impl.h>
 #include <vm/page.h>
 #include <vm/seg_enum.h>
@@ -74,34 +75,34 @@
  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
  * when given a pointer to a page_t.
  *
- * PSE_TABLE_SIZE must be a power of two.  One could argue that we
+ * PIO_TABLE_SIZE must be a power of two.  One could argue that we
  * should go to the trouble of setting it up at run time and base it
  * on memory size rather than the number of compile time CPUs.
  *
- * XX64	We should be using physmem size to calculate PSE_TABLE_SIZE,
- *	PSE_SHIFT, PIO_SHIFT.
+ * XX64	We should be using physmem size to calculate PIO_SHIFT.
  *
  *	These might break in 64 bit world.
  */
-#define	PSE_SHIFT	7		/* log2(PSE_TABLE_SIZE) */
-
-#define	PSE_TABLE_SIZE	128		/* number of mutexes to have */
-
-#define	PIO_SHIFT	PSE_SHIFT	/* next power of 2 bigger than page_t */
-#define	PIO_TABLE_SIZE	PSE_TABLE_SIZE	/* number of io mutexes to have */
+#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
+#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */
 
 pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
-pad_mutex_t	pse_mutex[PSE_TABLE_SIZE];
 kmutex_t	pio_mutex[PIO_TABLE_SIZE];
 
-#define	PAGE_SE_MUTEX(pp) \
-	    &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
-		((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
-		(PSE_TABLE_SIZE - 1))].pad_mutex
-
 #define	PAGE_IO_MUTEX(pp) \
 	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
 
+/*
+ * pse_mutex[] is allocated in the platform startup code based on the size of
+ * the machine; pse_table_size must be a power of two for the mask below.
+ */
+extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
+extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
+extern int pse_shift;			/* log2(pse_table_size) */
+#define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
+	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
+	(pse_table_size - 1)].pad_mutex
+
 #define	PSZC_MTX_TABLE_SIZE	128
 #define	PSZC_MTX_TABLE_SHIFT	7
 
@@ -163,6 +164,31 @@
 }
 
 /*
+ * Return a value for pse_shift (log2 of the pse_mutex[] table size) based on
+ * npg (the number of physical pages) and ncpu (the maximum number of CPUs).
+ * This is called by platform startup code.
+ *
+ * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
+ * locks grew approximately as the square of the number of threads executing,
+ * so the primary scaling factor used is NCPU^2 (the code uses 2 * ncpu^2).
+ * The size of the machine in megabytes is used as an upper bound, particularly
+ * for sun4v machines which all claim to have 256 CPUs maximum, and the old
+ * value of PSE_TABLE_SIZE (128) is used as a minimum.  The table size must be
+ * a power of two, so the calculated size is rounded up to the next one.
+ */
+/*ARGSUSED*/
+int
+size_pse_array(pgcnt_t npg, int ncpu)
+{
+	size_t size;
+	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
+
+	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
+	size += (1 << (highbit(size) - 1)) - 1;
+	return (highbit(size) - 1);
+}
+
+/*
  * At present we only use page ownership to aid debugging, so it's
  * OK if the owner field isn't exact.  In the 32-bit world two thread ids
  * can map to the same owner because we just 'or' in 0x80000000 and
--- a/usr/src/uts/i86pc/os/startup.c	Thu Aug 16 16:52:20 2007 -0700
+++ b/usr/src/uts/i86pc/os/startup.c	Thu Aug 16 17:46:42 2007 -0700
@@ -122,6 +122,8 @@
 extern void progressbar_start(void);
 extern void brand_init(void);
 
+extern int size_pse_array(pgcnt_t, int);
+
 /*
  * XXX make declaration below "static" when drivers no longer use this
  * interface.
@@ -250,6 +252,9 @@
 long page_hashsz;		/* Size of page hash table (power of two) */
 struct page *pp_base;		/* Base of initial system page struct array */
 struct page **page_hash;	/* Page hash table */
+pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
+size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
+int pse_shift;			/* log2(pse_table_size) */
 struct seg ktextseg;		/* Segment used for kernel executable image */
 struct seg kvalloc;		/* Segment used for "valloc" mapping */
 struct seg kpseg;		/* Segment used for pageable kernel virt mem */
@@ -844,6 +849,7 @@
 	size_t pagecolor_memsz;
 	caddr_t page_ctrs_mem;
 	size_t page_ctrs_size;
+	size_t pse_table_alloc_size;
 	struct memlist *current;
 	extern void startup_build_mem_nodes(struct memlist *);
 
@@ -1020,6 +1026,14 @@
 	ADD_TO_ALLOCATIONS(page_ctrs_mem, page_ctrs_size);
 	PRM_DEBUG(page_ctrs_size);
 
+	/*
+	 * Allocate the array that protects pp->p_selock.
+	 */
+	pse_shift = size_pse_array(physmem, max_ncpus);
+	pse_table_size = 1 << pse_shift;
+	pse_table_alloc_size = pse_table_size * sizeof (pad_mutex_t);
+	ADD_TO_ALLOCATIONS(pse_mutex, pse_table_alloc_size);
+
 #if defined(__amd64)
 	valloc_sz = ROUND_UP_LPAGE(valloc_sz);
 	valloc_base = VALLOC_BASE;
--- a/usr/src/uts/sun4/os/startup.c	Thu Aug 16 16:52:20 2007 -0700
+++ b/usr/src/uts/sun4/os/startup.c	Thu Aug 16 17:46:42 2007 -0700
@@ -94,6 +94,7 @@
 extern void memseg_remap_init(void);
 
 extern void mach_kpm_init(void);
+extern int size_pse_array(pgcnt_t, int);
 
 /*
  * External Data:
@@ -181,6 +182,9 @@
 struct page *pp_base;		/* Base of system page struct array */
 size_t pp_sz;			/* Size in bytes of page struct array */
 struct page **page_hash;	/* Page hash table */
+pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
+size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
+int pse_shift;			/* log2(pse_table_size) */
 struct seg ktextseg;		/* Segment used for kernel executable image */
 struct seg kvalloc;		/* Segment used for "valloc" mapping */
 struct seg kpseg;		/* Segment used for pageable kernel virt mem */
@@ -1355,6 +1359,17 @@
 			    kpm_pp_sz;
 	}
 
+	/*
+	 * Allocate the array that protects pp->p_selock.
+	 */
+	pse_shift = size_pse_array(physmem, max_ncpus);
+	pse_table_size = 1 << pse_shift;
+	pse_mutex = ndata_alloc(&ndata, pse_table_size * sizeof (pad_mutex_t),
+	    ecache_alignsize);
+	if (pse_mutex == NULL)
+		alloc_sz = roundup(alloc_sz, ecache_alignsize) +
+		    pse_table_size * sizeof (pad_mutex_t);
+
 	if (alloc_sz > 0) {
 		uintptr_t bop_base;
 
@@ -1394,6 +1409,13 @@
 			    ecache_alignsize);
 		}
 
+		if (pse_mutex == NULL) {
+			pse_mutex = (pad_mutex_t *)bop_base;
+			bop_base = roundup(bop_base +
+			    pse_table_size * sizeof (pad_mutex_t),
+			    ecache_alignsize);
+		}
+
 		ASSERT(bop_base <= (uintptr_t)alloc_base);
 	}