Mercurial > illumos > illumos-gate
changeset 10710:b9f4a7af952b
6861737 Assertion in lgrpplat.c hit on platform with >32 cores.
author | Jonathan Chew <jonathan.chew@sun.com> |
---|---|
date | Sat, 03 Oct 2009 12:16:34 -0700 |
parents | 62c8735e37aa |
children | f3a7a77cc1a6 |
files | usr/src/uts/common/os/lgrp.c usr/src/uts/common/os/main.c usr/src/uts/common/sys/lgrp.h usr/src/uts/i86pc/os/lgrpplat.c usr/src/uts/i86pc/os/mlsetup.c usr/src/uts/sun4/os/lgrpplat.c usr/src/uts/sun4/os/mlsetup.c |
diffstat | 7 files changed, 487 insertions(+), 372 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/common/os/lgrp.c Fri Oct 02 17:27:26 2009 -0700 +++ b/usr/src/uts/common/os/lgrp.c Sat Oct 03 12:16:34 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -225,7 +225,13 @@ static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); static void lgrp_part_del_cpu(struct cpu *); +/* + * lgroup framework initialization + */ +static void lgrp_main_init(void); +static void lgrp_main_mp_init(void); static void lgrp_root_init(void); +static void lgrp_setup(void); /* * lpl topology @@ -284,7 +290,7 @@ } /* - * Build full lgroup topology + * Setup root lgroup */ static void lgrp_root_init(void) @@ -352,28 +358,52 @@ /* * Initialize the lgroup framework and allow the platform to do the same + * + * This happens in stages during boot and is all funnelled through this routine + * (see definition of lgrp_init_stages_t to see what happens at each stage and + * when) */ void -lgrp_init(void) +lgrp_init(lgrp_init_stages_t stage) { /* * Initialize the platform */ - lgrp_plat_init(); - - /* - * Set max number of lgroups supported on this platform which must be - * less than the max number of lgroups supported by the common lgroup - * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.) - */ - nlgrpsmax = lgrp_plat_max_lgrps(); - ASSERT(nlgrpsmax <= NLGRPS_MAX); + lgrp_plat_init(stage); + + switch (stage) { + case LGRP_INIT_STAGE1: + /* + * Set max number of lgroups supported on this platform which + * must be less than the max number of lgroups supported by the + * common lgroup framework (eg. NLGRPS_MAX is max elements in + * lgrp_table[], etc.) + */ + nlgrpsmax = lgrp_plat_max_lgrps(); + ASSERT(nlgrpsmax <= NLGRPS_MAX); + break; + + case LGRP_INIT_STAGE2: + lgrp_setup(); + break; + + case LGRP_INIT_STAGE4: + lgrp_main_init(); + break; + + case LGRP_INIT_STAGE5: + lgrp_main_mp_init(); + break; + + default: + break; + } } /* * Create the root and cpu0's lgroup, and set t0's home. */ -void +static void lgrp_setup(void) { /* @@ -389,16 +419,6 @@ } /* - * Lgroup initialization is split in two parts. The first part - * (lgrp_main_init()) is called right before start_other_cpus() in main. The - * second part (lgrp_main_mp_init()) is called right after start_other_cpus() - * when all CPUs are brought online and all distance information is available. - * - * When lgrp_main_init() is complete it sets lgrp_initialized. The - * lgrp_main_mp_init() sets lgrp_topo_initialized. - */ - -/* * true when lgrp initialization has been completed. */ int lgrp_initialized = 0; @@ -412,7 +432,7 @@ * Init routine called after startup(), /etc/system has been processed, * and cpu0 has been added to an lgroup. */ -void +static void lgrp_main_init(void) { cpu_t *cp = CPU; @@ -488,7 +508,6 @@ lgrp_kstat_create(cp); mutex_exit(&cpu_lock); - lgrp_plat_main_init(); lgrp_initialized = 1; } @@ -496,7 +515,7 @@ * Finish lgrp initialization after all CPUS are brought on-line. * This routine is called after start_other_cpus(). */ -void +static void lgrp_main_mp_init(void) { klgrpset_t changed;
--- a/usr/src/uts/common/os/main.c Fri Oct 02 17:27:26 2009 -0700 +++ b/usr/src/uts/common/os/main.c Sat Oct 03 12:16:34 2009 -0700 @@ -370,8 +370,6 @@ extern int pm_adjust_timestamps(dev_info_t *, void *); extern void start_other_cpus(int); extern void sysevent_evc_thrinit(); - extern void lgrp_main_init(void); - extern void lgrp_main_mp_init(void); #if defined(__x86) extern void fastboot_post_startup(void); #endif @@ -388,9 +386,9 @@ ASSERT_STACK_ALIGNED(); /* - * Setup the first lgroup, and home t0 + * Setup root lgroup and leaf lgroup for CPU 0 */ - lgrp_setup(); + lgrp_init(LGRP_INIT_STAGE2); /* * Once 'startup()' completes, the thread_reaper() daemon would be @@ -419,8 +417,10 @@ /* * May need to probe to determine latencies from CPU 0 after * gethrtime() comes alive in cbe_init() and before enabling interrupts + * and copy and release any temporary memory allocated with BOP_ALLOC() + * before release_bootstrap() frees boot memory */ - lgrp_plat_probe(); + lgrp_init(LGRP_INIT_STAGE3); /* * Call all system initialization functions. @@ -529,11 +529,10 @@ sysevent_evc_thrinit(); /* - * main lgroup initialization - * This must be done after post_startup(), but before + * This must be done after post_startup() but before * start_other_cpus() */ - lgrp_main_init(); + lgrp_init(LGRP_INIT_STAGE4); /* * Perform MP initialization, if any. @@ -551,7 +550,7 @@ /* * Finish lgrp initialization after all CPUS are brought online. */ - lgrp_main_mp_init(); + lgrp_init(LGRP_INIT_STAGE5); /* * After mp_init(), number of cpus are known (this is
--- a/usr/src/uts/common/sys/lgrp.h Fri Oct 02 17:27:26 2009 -0700 +++ b/usr/src/uts/common/sys/lgrp.h Sat Oct 03 12:16:34 2009 -0700 @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -295,6 +295,32 @@ } lgrp_config_flag_t; /* + * Stages of lgroup framework initialization (done through lgrp_init()): + * + * 1) Initialize common and platform specific code (called in mlsetup()) + * + * 2) Setup root lgroup and add CPU 0 to lgroup(s) (called near beginning of + * main() before startup()) + * + * 3) Probe from CPU 0 and copy and release any BOP_ALLOC-ed memory temporarily + * allocated before kernel memory allocator is setup (called in main() + * after startup(), gethrtime() is setup, and before interrupts enabled) + * + * 4) Check for null proc LPA on Starcat, collapse lgroup topology (if + * necessary), setup lgroup kstats, etc. (called before start_other_cpus()) + * + * 5) Finish any lgroup initialization needed including updating lgroup + * topology after all CPUs started (called after start_other_cpus()) + */ +typedef enum lgrp_init_stages { + LGRP_INIT_STAGE1, + LGRP_INIT_STAGE2, + LGRP_INIT_STAGE3, + LGRP_INIT_STAGE4, + LGRP_INIT_STAGE5 +} lgrp_init_stages_t; + +/* * Memory allocation policies */ typedef enum lgrp_mem_policy { @@ -510,8 +536,7 @@ * lgroup management */ int lgrp_optimizations(void); -void lgrp_init(void); -void lgrp_setup(void); +void lgrp_init(lgrp_init_stages_t); lgrp_t *lgrp_create(void); void lgrp_destroy(lgrp_t *); void lgrp_config(lgrp_config_flag_t, uintptr_t, uintptr_t); @@ -588,8 +613,7 @@ /* platform interfaces */ -void lgrp_plat_init(void); -void lgrp_plat_main_init(void); +void lgrp_plat_init(lgrp_init_stages_t); lgrp_t *lgrp_plat_alloc(lgrp_id_t lgrpid); void lgrp_plat_config(lgrp_config_flag_t, uintptr_t); lgrp_handle_t lgrp_plat_cpu_to_hand(processorid_t); @@ -598,7 +622,6 @@ pgcnt_t lgrp_plat_mem_size(lgrp_handle_t, lgrp_mem_query_t); int lgrp_plat_latency(lgrp_handle_t, lgrp_handle_t); lgrp_handle_t lgrp_plat_root_hand(void); -void lgrp_plat_probe(void); extern uint32_t lgrp_expand_proc_thresh; extern uint32_t lgrp_expand_proc_diff;
--- a/usr/src/uts/i86pc/os/lgrpplat.c Fri Oct 02 17:27:26 2009 -0700 +++ b/usr/src/uts/i86pc/os/lgrpplat.c Sat Oct 03 12:16:34 2009 -0700 @@ -254,9 +254,11 @@ static int lgrp_plat_apic_ncpus = 0; /* - * CPU to node ID mapping table (only used for SRAT) + * CPU to node ID mapping table (only used for SRAT) and its max number of + * entries */ -static cpu_node_map_t lgrp_plat_cpu_node[NCPU]; +static cpu_node_map_t *lgrp_plat_cpu_node = NULL; +static uint_t lgrp_plat_cpu_node_nentries = 0; /* * Latency statistics @@ -385,12 +387,10 @@ lgrp_handle_t lgrp_plat_cpu_to_hand(processorid_t id); -void lgrp_plat_init(void); +void lgrp_plat_init(lgrp_init_stages_t stage); int lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to); -void lgrp_plat_main_init(void); - int lgrp_plat_max_lgrps(void); pgcnt_t lgrp_plat_mem_size(lgrp_handle_t plathand, @@ -412,11 +412,14 @@ int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain); -static int lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node); +static int lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node, + int cpu_node_nentries); static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt, uint32_t domain); +static void lgrp_plat_get_numa_config(void); + static void lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats); @@ -424,6 +427,8 @@ static int lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats); +static void lgrp_plat_main_init(void); + static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t); static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain, @@ -438,9 +443,8 @@ node_phys_addr_map_t *node_memory); static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, - lgrp_plat_probe_mem_config_t *probe_mem_config, - lgrp_plat_latency_stats_t *lat_stats, - lgrp_plat_probe_stats_t *probe_stats); + int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config, + lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats); static int lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node); @@ -452,6 +456,8 @@ cpu_node_map_t *cpu_node, int cpu_count, node_phys_addr_map_t *node_memory); +static void lgrp_plat_release_bootstrap(void); + static int lgrp_plat_srat_domains(struct srat *tp, uint32_t *prox_domain_min); @@ -728,7 +734,7 @@ return (LGRP_DEFAULT_HANDLE); hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id], - lgrp_plat_cpu_node); + lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries); ASSERT(hand != (lgrp_handle_t)-1); if (hand == (lgrp_handle_t)-1) @@ -742,166 +748,63 @@ * Platform-specific initialization of lgroups */ void -lgrp_plat_init(void) +lgrp_plat_init(lgrp_init_stages_t stage) { #if defined(__xpv) - /* - * XXPV For now, the hypervisor treats all memory equally. - */ - lgrp_plat_node_cnt = max_mem_nodes = 1; #else /* __xpv */ - uint_t probe_op; u_longlong_t value; - - /* - * Get boot property for lgroup topology height limit - */ - if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0) - (void) lgrp_topo_ht_limit_set((int)value); - - /* - * Get boot property for enabling/disabling SRAT - */ - if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0) - lgrp_plat_srat_enable = (int)value; - - /* - * Get boot property for enabling/disabling SLIT - */ - if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0) - lgrp_plat_slit_enable = (int)value; - - /* - * Initialize as a UMA machine - */ - if (lgrp_topo_ht_limit() == 1) { +#endif /* __xpv */ + + switch (stage) { + case LGRP_INIT_STAGE1: +#if defined(__xpv) + /* + * XXPV For now, the hypervisor treats all memory equally. + */ lgrp_plat_node_cnt = max_mem_nodes = 1; - return; - } - - /* - * Read boot property with CPU to APIC ID mapping table/array and fill - * in CPU to node ID mapping table with APIC ID for each CPU - */ - lgrp_plat_apic_ncpus = - lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node); - - /* - * Determine which CPUs and memory are local to each other and number - * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT) - */ - if (lgrp_plat_apic_ncpus > 0) { - int retval; - - retval = lgrp_plat_process_srat(srat_ptr, - &lgrp_plat_prox_domain_min, - lgrp_plat_node_domain, lgrp_plat_cpu_node, - lgrp_plat_apic_ncpus, lgrp_plat_node_memory); - if (retval <= 0) { - lgrp_plat_srat_error = retval; - lgrp_plat_node_cnt = 1; - } else { - lgrp_plat_srat_error = 0; - lgrp_plat_node_cnt = retval; +#else /* __xpv */ + /* + * Get boot property for lgroup topology height limit + */ + if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0) + (void) lgrp_topo_ht_limit_set((int)value); + + /* + * Get boot property for enabling/disabling SRAT + */ + if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0) + lgrp_plat_srat_enable = (int)value; + + /* + * Get boot property for enabling/disabling SLIT + */ + if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0) + lgrp_plat_slit_enable = (int)value; + + /* + * Initialize as a UMA machine + */ + if (lgrp_topo_ht_limit() == 1) { + lgrp_plat_node_cnt = max_mem_nodes = 1; + return; } + + lgrp_plat_get_numa_config(); +#endif /* __xpv */ + break; + + case LGRP_INIT_STAGE3: + lgrp_plat_probe(); + lgrp_plat_release_bootstrap(); + break; + + case LGRP_INIT_STAGE4: + lgrp_plat_main_init(); + break; + + default: + break; } - - /* - * Try to use PCI config space registers on Opteron if there's an error - * processing CPU to APIC ID mapping or SRAT - */ - if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) && - is_opteron()) - opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv, - lgrp_plat_node_memory); - - /* - * Don't bother to setup system for multiple lgroups and only use one - * memory node when memory is interleaved between any nodes or there is - * only one NUMA node - * - * NOTE: May need to change this for Dynamic Reconfiguration (DR) - * when and if it happens for x86/x64 - */ - if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) { - lgrp_plat_node_cnt = max_mem_nodes = 1; - (void) lgrp_topo_ht_limit_set(1); - return; - } - - /* - * Leaf lgroups on x86/x64 architectures contain one physical - * processor chip. Tune lgrp_expand_proc_thresh and - * lgrp_expand_proc_diff so that lgrp_choose() will spread - * things out aggressively. - */ - lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; - lgrp_expand_proc_diff = 0; - - /* - * There should be one memnode (physical page free list(s)) for - * each node - */ - max_mem_nodes = lgrp_plat_node_cnt; - - /* - * Initialize min and max latency before reading SLIT or probing - */ - lgrp_plat_lat_stats.latency_min = -1; - lgrp_plat_lat_stats.latency_max = 0; - - /* - * Determine how far each NUMA node is from each other by - * reading ACPI System Locality Information Table (SLIT) if it - * exists - */ - lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr, - lgrp_plat_node_cnt, lgrp_plat_node_memory, - &lgrp_plat_lat_stats); - if (lgrp_plat_slit_error == 0) - return; - - /* - * Probe to determine latency between NUMA nodes when SLIT - * doesn't exist or make sense - */ - lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE; - - /* - * Specify whether to probe using vendor ID register or page copy - * if hasn't been specified already or is overspecified - */ - probe_op = lgrp_plat_probe_flags & - (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); - - if (probe_op == 0 || - probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) { - lgrp_plat_probe_flags &= - ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); - if (is_opteron()) - lgrp_plat_probe_flags |= - LGRP_PLAT_PROBE_VENDOR; - else - lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY; - } - - /* - * Probing errors can mess up the lgroup topology and - * force us fall back to a 2 level lgroup topology. - * Here we bound how tall the lgroup topology can grow - * in hopes of avoiding any anamolies in probing from - * messing up the lgroup topology by limiting the - * accuracy of the latency topology. - * - * Assume that nodes will at least be configured in a - * ring, so limit height of lgroup topology to be less - * than number of nodes on a system with 4 or more - * nodes - */ - if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() == - lgrp_topo_ht_limit_default()) - (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); -#endif /* __xpv */ } @@ -943,7 +846,8 @@ * Probe from current CPU if its lgroup latencies haven't been set yet * and we are trying to get latency from current CPU to some node */ - node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); + node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, + lgrp_plat_cpu_node_nentries); ASSERT(node >= 0 && node < lgrp_plat_node_cnt); if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src) lgrp_plat_probe(); @@ -953,117 +857,6 @@ /* - * Platform-specific initialization - */ -void -lgrp_plat_main_init(void) -{ - int curnode; - int ht_limit; - int i; - - /* - * Print a notice that MPO is disabled when memory is interleaved - * across nodes....Would do this when it is discovered, but can't - * because it happens way too early during boot.... - */ - if (lgrp_plat_mem_intrlv) - cmn_err(CE_NOTE, - "MPO disabled because memory is interleaved\n"); - - /* - * Don't bother to do any probing if it is disabled, there is only one - * node, or the height of the lgroup topology less than or equal to 2 - */ - ht_limit = lgrp_topo_ht_limit(); - if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || - max_mem_nodes == 1 || ht_limit <= 2) { - /* - * Setup lgroup latencies for 2 level lgroup topology - * (ie. local and remote only) if they haven't been set yet - */ - if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 && - lgrp_plat_lat_stats.latency_max == 0) - lgrp_plat_2level_setup(lgrp_plat_node_memory, - &lgrp_plat_lat_stats); - return; - } - - if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { - /* - * Should have been able to probe from CPU 0 when it was added - * to lgroup hierarchy, but may not have been able to then - * because it happens so early in boot that gethrtime() hasn't - * been initialized. (:-( - */ - curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); - ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt); - if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0) - lgrp_plat_probe(); - - return; - } - - /* - * When probing memory, use one page for every sample to determine - * lgroup topology and taking multiple samples - */ - if (lgrp_plat_probe_mem_config.probe_memsize == 0) - lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE * - lgrp_plat_probe_nsamples; - - /* - * Map memory in each node needed for probing to determine latency - * topology - */ - for (i = 0; i < lgrp_plat_node_cnt; i++) { - int mnode; - - /* - * Skip this node and leave its probe page NULL - * if it doesn't have any memory - */ - mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i); - if (!mem_node_config[mnode].exists) { - lgrp_plat_probe_mem_config.probe_va[i] = NULL; - continue; - } - - /* - * Allocate one kernel virtual page - */ - lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena, - lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP); - if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) { - cmn_err(CE_WARN, - "lgrp_plat_main_init: couldn't allocate memory"); - return; - } - - /* - * Get PFN for first page in each node - */ - lgrp_plat_probe_mem_config.probe_pfn[i] = - mem_node_config[mnode].physbase; - - /* - * Map virtual page to first page in node - */ - hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i], - lgrp_plat_probe_mem_config.probe_memsize, - lgrp_plat_probe_mem_config.probe_pfn[i], - PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, - HAT_LOAD_NOCONSIST); - } - - /* - * Probe from current CPU - */ - lgrp_plat_probe(); -} - - -/* * Return the maximum number of lgrps supported by the platform. * Before lgrp topology is known it returns an estimate based on the number of * nodes. Once topology is known it returns the actual maximim number of lgrps @@ -1189,7 +982,8 @@ /* * Determine ID of node containing current CPU */ - from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); + from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, + lgrp_plat_cpu_node_nentries); ASSERT(from >= 0 && from < lgrp_plat_node_cnt); if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error) ASSERT(lgrp_plat_node_domain[from].exists); @@ -1215,8 +1009,9 @@ * probed yet or don't have memory */ probe_time = lgrp_plat_probe_time(to, - lgrp_plat_cpu_node, &lgrp_plat_probe_mem_config, - &lgrp_plat_lat_stats, &lgrp_plat_probe_stats); + lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries, + &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats, + &lgrp_plat_probe_stats); if (probe_time == 0) continue; @@ -1343,7 +1138,8 @@ * Get node ID for given CPU */ static int -lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node) +lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node, + int cpu_node_nentries) { processorid_t cpuid; @@ -1369,7 +1165,7 @@ * Return -1 when CPU to node ID mapping entry doesn't exist for given * CPU */ - if (!cpu_node[cpuid].exists) + if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists) return (-1); return (cpu_node[cpuid].node); @@ -1403,6 +1199,159 @@ /* + * Get NUMA configuration of machine + */ +static void +lgrp_plat_get_numa_config(void) +{ + uint_t probe_op; + + /* + * Read boot property with CPU to APIC ID mapping table/array to + * determine number of CPUs + */ + lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL); + + /* + * Determine which CPUs and memory are local to each other and number + * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT) + */ + if (lgrp_plat_apic_ncpus > 0) { + int retval; + + /* + * Temporarily allocate boot memory to use for CPU to node + * mapping since kernel memory allocator isn't alive yet + */ + lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops, + NULL, lgrp_plat_apic_ncpus * sizeof (cpu_node_map_t), + sizeof (int)); + + ASSERT(lgrp_plat_cpu_node != NULL); + if (lgrp_plat_cpu_node) { + lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus; + bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries * + sizeof (cpu_node_map_t)); + } + + /* + * Fill in CPU to node ID mapping table with APIC ID for each + * CPU + */ + (void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node); + + retval = lgrp_plat_process_srat(srat_ptr, + &lgrp_plat_prox_domain_min, + lgrp_plat_node_domain, lgrp_plat_cpu_node, + lgrp_plat_apic_ncpus, lgrp_plat_node_memory); + if (retval <= 0) { + lgrp_plat_srat_error = retval; + lgrp_plat_node_cnt = 1; + } else { + lgrp_plat_srat_error = 0; + lgrp_plat_node_cnt = retval; + } + } + + /* + * Try to use PCI config space registers on Opteron if there's an error + * processing CPU to APIC ID mapping or SRAT + */ + if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) && + is_opteron()) + opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv, + lgrp_plat_node_memory); + + /* + * Don't bother to setup system for multiple lgroups and only use one + * memory node when memory is interleaved between any nodes or there is + * only one NUMA node + * + * NOTE: May need to change this for Dynamic Reconfiguration (DR) + * when and if it happens for x86/x64 + */ + if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) { + lgrp_plat_node_cnt = max_mem_nodes = 1; + (void) lgrp_topo_ht_limit_set(1); + return; + } + + /* + * Leaf lgroups on x86/x64 architectures contain one physical + * processor chip. Tune lgrp_expand_proc_thresh and + * lgrp_expand_proc_diff so that lgrp_choose() will spread + * things out aggressively. + */ + lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; + lgrp_expand_proc_diff = 0; + + /* + * There should be one memnode (physical page free list(s)) for + * each node + */ + max_mem_nodes = lgrp_plat_node_cnt; + + /* + * Initialize min and max latency before reading SLIT or probing + */ + lgrp_plat_lat_stats.latency_min = -1; + lgrp_plat_lat_stats.latency_max = 0; + + /* + * Determine how far each NUMA node is from each other by + * reading ACPI System Locality Information Table (SLIT) if it + * exists + */ + lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr, + lgrp_plat_node_cnt, lgrp_plat_node_memory, + &lgrp_plat_lat_stats); + if (lgrp_plat_slit_error == 0) + return; + + /* + * Probe to determine latency between NUMA nodes when SLIT + * doesn't exist or make sense + */ + lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE; + + /* + * Specify whether to probe using vendor ID register or page copy + * if hasn't been specified already or is overspecified + */ + probe_op = lgrp_plat_probe_flags & + (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); + + if (probe_op == 0 || + probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) { + lgrp_plat_probe_flags &= + ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); + if (is_opteron()) + lgrp_plat_probe_flags |= + LGRP_PLAT_PROBE_VENDOR; + else + lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY; + } + + /* + * Probing errors can mess up the lgroup topology and + * force us fall back to a 2 level lgroup topology. + * Here we bound how tall the lgroup topology can grow + * in hopes of avoiding any anamolies in probing from + * messing up the lgroup topology by limiting the + * accuracy of the latency topology. + * + * Assume that nodes will at least be configured in a + * ring, so limit height of lgroup topology to be less + * than number of nodes on a system with 4 or more + * nodes + */ + if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() == + lgrp_topo_ht_limit_default()) + (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); +} + + +/* * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to * be considered same */ @@ -1749,6 +1698,118 @@ /* + * Platform-specific initialization + */ +static void +lgrp_plat_main_init(void) +{ + int curnode; + int ht_limit; + int i; + + /* + * Print a notice that MPO is disabled when memory is interleaved + * across nodes....Would do this when it is discovered, but can't + * because it happens way too early during boot.... + */ + if (lgrp_plat_mem_intrlv) + cmn_err(CE_NOTE, + "MPO disabled because memory is interleaved\n"); + + /* + * Don't bother to do any probing if it is disabled, there is only one + * node, or the height of the lgroup topology less than or equal to 2 + */ + ht_limit = lgrp_topo_ht_limit(); + if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || + max_mem_nodes == 1 || ht_limit <= 2) { + /* + * Setup lgroup latencies for 2 level lgroup topology + * (ie. local and remote only) if they haven't been set yet + */ + if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 && + lgrp_plat_lat_stats.latency_max == 0) + lgrp_plat_2level_setup(lgrp_plat_node_memory, + &lgrp_plat_lat_stats); + return; + } + + if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { + /* + * Should have been able to probe from CPU 0 when it was added + * to lgroup hierarchy, but may not have been able to then + * because it happens so early in boot that gethrtime() hasn't + * been initialized. (:-( + */ + curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, + lgrp_plat_cpu_node_nentries); + ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt); + if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0) + lgrp_plat_probe(); + + return; + } + + /* + * When probing memory, use one page for every sample to determine + * lgroup topology and taking multiple samples + */ + if (lgrp_plat_probe_mem_config.probe_memsize == 0) + lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE * + lgrp_plat_probe_nsamples; + + /* + * Map memory in each node needed for probing to determine latency + * topology + */ + for (i = 0; i < lgrp_plat_node_cnt; i++) { + int mnode; + + /* + * Skip this node and leave its probe page NULL + * if it doesn't have any memory + */ + mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i); + if (!mem_node_config[mnode].exists) { + lgrp_plat_probe_mem_config.probe_va[i] = NULL; + continue; + } + + /* + * Allocate one kernel virtual page + */ + lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena, + lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP); + if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) { + cmn_err(CE_WARN, + "lgrp_plat_main_init: couldn't allocate memory"); + return; + } + + /* + * Get PFN for first page in each node + */ + lgrp_plat_probe_mem_config.probe_pfn[i] = + mem_node_config[mnode].physbase; + + /* + * Map virtual page to first page in node + */ + hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i], + lgrp_plat_probe_mem_config.probe_memsize, + lgrp_plat_probe_mem_config.probe_pfn[i], + PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, + HAT_LOAD_NOCONSIST); + } + + /* + * Probe from current CPU + */ + lgrp_plat_probe(); +} + + +/* * Return the number of free, allocatable, or installed * pages in an lgroup * This is a copy of the MAX_MEM_NODES == 1 version of the routine @@ -2026,7 +2087,7 @@ * Return time needed to probe from current CPU to memory in given node */ static hrtime_t -lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, +lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config, lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) { @@ -2044,7 +2105,7 @@ /* * Determine ID of node containing current CPU */ - from = lgrp_plat_cpu_to_node(CPU, cpu_node); + from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries); ASSERT(from >= 0 && from < lgrp_plat_node_cnt); /* @@ -2139,7 +2200,8 @@ /* * Read boot property with CPU to APIC ID array, fill in CPU to node ID - * mapping table with APIC ID for each CPU, and return number of CPU APIC IDs. + * mapping table with APIC ID for each CPU (if pointer to table isn't NULL), + * and return number of CPU APIC IDs. * * NOTE: This code assumes that CPU IDs are assigned in order that they appear * in in cpu_apicid_array boot property which is based on and follows @@ -2157,17 +2219,11 @@ int n; /* - * Nothing to do when no array to fill in or not enough CPUs - */ - if (cpu_node == NULL) - return (-1); - - /* * Check length of property value */ boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name); if (boot_prop_len <= 0 || boot_prop_len > sizeof (cpu_apicid_array)) - return (-2); + return (-1); /* * Calculate number of entries in array and return when there's just @@ -2175,13 +2231,20 @@ */ n = boot_prop_len / sizeof (uint8_t); if (n == 1) - return (-3); + return (-2); /* * Get CPU to APIC ID property value */ if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0) - return (-4); + return (-3); + + /* + * Just return number of CPU APIC IDs if CPU to node mapping table is + * NULL + */ + if (cpu_node == NULL) + return (n); /* * Fill in CPU to node ID mapping table with APIC ID for each CPU @@ -2404,6 +2467,27 @@ /* + * Allocate permanent memory for any temporary memory that we needed to + * allocate using BOP_ALLOC() before kmem_alloc() and VM system were + * initialized and copy everything from temporary to permanent memory since + * temporary boot memory will eventually be released during boot + */ +static void +lgrp_plat_release_bootstrap(void) +{ + void *buf; + size_t size; + + if (lgrp_plat_cpu_node_nentries > 0) { + size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t); + buf = kmem_alloc(size, KM_SLEEP); + bcopy(lgrp_plat_cpu_node, buf, size); + lgrp_plat_cpu_node = buf; + } +} + + +/* * Return number of proximity domains given in ACPI SRAT */ static int
--- a/usr/src/uts/i86pc/os/mlsetup.c Fri Oct 02 17:27:26 2009 -0700 +++ b/usr/src/uts/i86pc/os/mlsetup.c Sat Oct 03 12:16:34 2009 -0700 @@ -367,7 +367,7 @@ /* * Initialize the lgrp framework */ - lgrp_init(); + lgrp_init(LGRP_INIT_STAGE1); if (boothowto & RB_HALT) { prom_printf("unix: kernel halted by -h flag\n");
--- a/usr/src/uts/sun4/os/lgrpplat.c Fri Oct 02 17:27:26 2009 -0700 +++ b/usr/src/uts/sun4/os/lgrpplat.c Sat Oct 03 12:16:34 2009 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,14 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - - #include <sys/cpuvar.h> #include <sys/lgrp.h> #include <sys/memnode.h> @@ -100,34 +97,35 @@ lgrp_handle_t lgrp_default_handle = LGRP_DEFAULT_HANDLE; void -lgrp_plat_init(void) +lgrp_plat_init(lgrp_init_stages_t stage) { int i; - /* - * Initialize lookup tables to invalid values so we catch - * any illegal use of them. - */ - for (i = 0; i < MAX_MEM_NODES; i++) { - memnode_to_lgrphand[i] = -1; - lgrphand_to_memnode[i] = -1; - } + switch (stage) { + case LGRP_INIT_STAGE1: + /* + * Initialize lookup tables to invalid values so we catch + * any illegal use of them. + */ + for (i = 0; i < MAX_MEM_NODES; i++) { + memnode_to_lgrphand[i] = -1; + lgrphand_to_memnode[i] = -1; + } - if (lgrp_topo_ht_limit() == 1) { - max_mem_nodes = 1; - return; - } - - if (&plat_lgrp_cpu_to_hand) - max_mem_nodes = MAX_MEM_NODES; + if (lgrp_topo_ht_limit() == 1) { + max_mem_nodes = 1; + return; + } - if (&plat_lgrp_init) - plat_lgrp_init(); -} + if (&plat_lgrp_cpu_to_hand) + max_mem_nodes = MAX_MEM_NODES; -void -lgrp_plat_main_init(void) -{ + if (&plat_lgrp_init) + plat_lgrp_init(); + break; + default: + break; + } } /* ARGSUSED */ @@ -352,11 +350,3 @@ return (NULL); return (lgrp); } - -/* - * Probe memory in each node from current CPU to determine latency topology - */ -void -lgrp_plat_probe(void) -{ -}
--- a/usr/src/uts/sun4/os/mlsetup.c Fri Oct 02 17:27:26 2009 -0700 +++ b/usr/src/uts/sun4/os/mlsetup.c Sat Oct 03 12:16:34 2009 -0700 @@ -273,7 +273,7 @@ * lgroup framework initialization. This must be done prior * to devices being mapped. */ - lgrp_init(); + lgrp_init(LGRP_INIT_STAGE1); cpu_setup();