Mercurial > illumos > illumos-gate
changeset 4878:d39bbc62dfd1
4875742 PAGE_SE_MUTEX() macro needs maintenance
6517224 pse_mutex show scaling issues with tpc-h
author | blakej |
---|---|
date | Thu, 16 Aug 2007 17:46:42 -0700 |
parents | 5744980c78cf |
children | ced5e1e3b491 |
files | usr/src/uts/common/vm/page_lock.c usr/src/uts/i86pc/os/startup.c usr/src/uts/sun4/os/startup.c |
diffstat | 3 files changed, 77 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/common/vm/page_lock.c Thu Aug 16 16:52:20 2007 -0700 +++ b/usr/src/uts/common/vm/page_lock.c Thu Aug 16 17:46:42 2007 -0700 @@ -36,6 +36,7 @@ #include <sys/vnode.h> #include <sys/bitmap.h> #include <sys/lockstat.h> +#include <sys/sysmacros.h> #include <sys/condvar_impl.h> #include <vm/page.h> #include <vm/seg_enum.h> @@ -74,34 +75,34 @@ * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex * when given a pointer to a page_t. * - * PSE_TABLE_SIZE must be a power of two. One could argue that we + * PIO_TABLE_SIZE must be a power of two. One could argue that we * should go to the trouble of setting it up at run time and base it * on memory size rather than the number of compile time CPUs. * - * XX64 We should be using physmem size to calculate PSE_TABLE_SIZE, - * PSE_SHIFT, PIO_SHIFT. + * XX64 We should be using physmem size to calculate PIO_SHIFT. * * These might break in 64 bit world. */ -#define PSE_SHIFT 7 /* log2(PSE_TABLE_SIZE) */ - -#define PSE_TABLE_SIZE 128 /* number of mutexes to have */ - -#define PIO_SHIFT PSE_SHIFT /* next power of 2 bigger than page_t */ -#define PIO_TABLE_SIZE PSE_TABLE_SIZE /* number of io mutexes to have */ +#define PIO_SHIFT 7 /* log2(sizeof(page_t)) */ +#define PIO_TABLE_SIZE 128 /* number of io mutexes to have */ pad_mutex_t ph_mutex[PH_TABLE_SIZE]; -pad_mutex_t pse_mutex[PSE_TABLE_SIZE]; kmutex_t pio_mutex[PIO_TABLE_SIZE]; -#define PAGE_SE_MUTEX(pp) \ - &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \ - ((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \ - (PSE_TABLE_SIZE - 1))].pad_mutex - #define PAGE_IO_MUTEX(pp) \ &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)] +/* + * The pse_mutex[] array is allocated in the platform startup code + * based on the size of the machine at startup. + */ +extern pad_mutex_t *pse_mutex; /* Locks protecting pp->p_selock */ +extern size_t pse_table_size; /* Number of mutexes in pse_mutex[] */ +extern int pse_shift; /* log2(pse_table_size) */ +#define PAGE_SE_MUTEX(pp) &pse_mutex[ \ + ((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) & \ + (pse_table_size - 1)].pad_mutex + #define PSZC_MTX_TABLE_SIZE 128 #define PSZC_MTX_TABLE_SHIFT 7 @@ -163,6 +164,31 @@ } /* + * Return a value for pse_shift based on npg (the number of physical pages) + * and ncpu (the maximum number of CPUs). This is called by platform startup + * code. + * + * Lockstat data from TPC-H runs showed that contention on the pse_mutex[] + * locks grew approximately as the square of the number of threads executing. + * So the primary scaling factor used is NCPU^2. The size of the machine in + * megabytes is used as an upper bound, particularly for sun4v machines which + * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE + * (128) is used as a minimum. Since the size of the table has to be a power + * of two, the calculated size is rounded up to the next power of two. + */ +/*ARGSUSED*/ +int +size_pse_array(pgcnt_t npg, int ncpu) +{ + size_t size; + pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE; + + size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu)); + size += (1 << (highbit(size) - 1)) - 1; + return (highbit(size) - 1); +} + +/* * At present we only use page ownership to aid debugging, so it's * OK if the owner field isn't exact. In the 32-bit world two thread ids * can map to the same owner because we just 'or' in 0x80000000 and
--- a/usr/src/uts/i86pc/os/startup.c Thu Aug 16 16:52:20 2007 -0700 +++ b/usr/src/uts/i86pc/os/startup.c Thu Aug 16 17:46:42 2007 -0700 @@ -122,6 +122,8 @@ extern void progressbar_start(void); extern void brand_init(void); +extern int size_pse_array(pgcnt_t, int); + /* * XXX make declaration below "static" when drivers no longer use this * interface. @@ -250,6 +252,9 @@ long page_hashsz; /* Size of page hash table (power of two) */ struct page *pp_base; /* Base of initial system page struct array */ struct page **page_hash; /* Page hash table */ +pad_mutex_t *pse_mutex; /* Locks protecting pp->p_selock */ +size_t pse_table_size; /* Number of mutexes in pse_mutex[] */ +int pse_shift; /* log2(pse_table_size) */ struct seg ktextseg; /* Segment used for kernel executable image */ struct seg kvalloc; /* Segment used for "valloc" mapping */ struct seg kpseg; /* Segment used for pageable kernel virt mem */ @@ -844,6 +849,7 @@ size_t pagecolor_memsz; caddr_t page_ctrs_mem; size_t page_ctrs_size; + size_t pse_table_alloc_size; struct memlist *current; extern void startup_build_mem_nodes(struct memlist *); @@ -1020,6 +1026,14 @@ ADD_TO_ALLOCATIONS(page_ctrs_mem, page_ctrs_size); PRM_DEBUG(page_ctrs_size); + /* + * Allocate the array that protects pp->p_selock. + */ + pse_shift = size_pse_array(physmem, max_ncpus); + pse_table_size = 1 << pse_shift; + pse_table_alloc_size = pse_table_size * sizeof (pad_mutex_t); + ADD_TO_ALLOCATIONS(pse_mutex, pse_table_alloc_size); + #if defined(__amd64) valloc_sz = ROUND_UP_LPAGE(valloc_sz); valloc_base = VALLOC_BASE;
--- a/usr/src/uts/sun4/os/startup.c Thu Aug 16 16:52:20 2007 -0700 +++ b/usr/src/uts/sun4/os/startup.c Thu Aug 16 17:46:42 2007 -0700 @@ -94,6 +94,7 @@ extern void memseg_remap_init(void); extern void mach_kpm_init(void); +extern int size_pse_array(pgcnt_t, int); /* * External Data: @@ -181,6 +182,9 @@ struct page *pp_base; /* Base of system page struct array */ size_t pp_sz; /* Size in bytes of page struct array */ struct page **page_hash; /* Page hash table */ +pad_mutex_t *pse_mutex; /* Locks protecting pp->p_selock */ +size_t pse_table_size; /* Number of mutexes in pse_mutex[] */ +int pse_shift; /* log2(pse_table_size) */ struct seg ktextseg; /* Segment used for kernel executable image */ struct seg kvalloc; /* Segment used for "valloc" mapping */ struct seg kpseg; /* Segment used for pageable kernel virt mem */ @@ -1355,6 +1359,17 @@ kpm_pp_sz; } + /* + * Allocate the array that protects pp->p_selock. + */ + pse_shift = size_pse_array(physmem, max_ncpus); + pse_table_size = 1 << pse_shift; + pse_mutex = ndata_alloc(&ndata, pse_table_size * sizeof (pad_mutex_t), + ecache_alignsize); + if (pse_mutex == NULL) + alloc_sz = roundup(alloc_sz, ecache_alignsize) + + pse_table_size * sizeof (pad_mutex_t); + if (alloc_sz > 0) { uintptr_t bop_base; @@ -1394,6 +1409,13 @@ ecache_alignsize); } + if (pse_mutex == NULL) { + pse_mutex = (pad_mutex_t *)bop_base; + bop_base = roundup(bop_base + + pse_table_size * sizeof (pad_mutex_t), + ecache_alignsize); + } + ASSERT(bop_base <= (uintptr_t)alloc_base); }