changeset 10710:b9f4a7af952b

6861737 Assertion in lgrpplat.c hit on platform with >32 cores.
author Jonathan Chew <jonathan.chew@sun.com>
date Sat, 03 Oct 2009 12:16:34 -0700
parents 62c8735e37aa
children f3a7a77cc1a6
files usr/src/uts/common/os/lgrp.c usr/src/uts/common/os/main.c usr/src/uts/common/sys/lgrp.h usr/src/uts/i86pc/os/lgrpplat.c usr/src/uts/i86pc/os/mlsetup.c usr/src/uts/sun4/os/lgrpplat.c usr/src/uts/sun4/os/mlsetup.c
diffstat 7 files changed, 487 insertions(+), 372 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/os/lgrp.c	Fri Oct 02 17:27:26 2009 -0700
+++ b/usr/src/uts/common/os/lgrp.c	Sat Oct 03 12:16:34 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -225,7 +225,13 @@
 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
 static void	lgrp_part_del_cpu(struct cpu *);
 
+/*
+ * lgroup framework initialization
+ */
+static void	lgrp_main_init(void);
+static void	lgrp_main_mp_init(void);
 static void	lgrp_root_init(void);
+static void	lgrp_setup(void);
 
 /*
  * lpl topology
@@ -284,7 +290,7 @@
 }
 
 /*
- * Build full lgroup topology
+ * Setup root lgroup
  */
 static void
 lgrp_root_init(void)
@@ -352,28 +358,52 @@
 
 /*
  * Initialize the lgroup framework and allow the platform to do the same
+ *
+ * This happens in stages during boot and is all funnelled through this routine
+ * (see definition of lgrp_init_stages_t to see what happens at each stage and
+ * when)
  */
 void
-lgrp_init(void)
+lgrp_init(lgrp_init_stages_t stage)
 {
 	/*
 	 * Initialize the platform
 	 */
-	lgrp_plat_init();
-
-	/*
-	 * Set max number of lgroups supported on this platform which must be
-	 * less than the max number of lgroups supported by the common lgroup
-	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
-	 */
-	nlgrpsmax = lgrp_plat_max_lgrps();
-	ASSERT(nlgrpsmax <= NLGRPS_MAX);
+	lgrp_plat_init(stage);
+
+	switch (stage) {
+	case LGRP_INIT_STAGE1:
+		/*
+		 * Set max number of lgroups supported on this platform which
+		 * must be less than the max number of lgroups supported by the
+		 * common lgroup framework (eg. NLGRPS_MAX is max elements in
+		 * lgrp_table[], etc.)
+		 */
+		nlgrpsmax = lgrp_plat_max_lgrps();
+		ASSERT(nlgrpsmax <= NLGRPS_MAX);
+		break;
+
+	case LGRP_INIT_STAGE2:
+		lgrp_setup();
+		break;
+
+	case LGRP_INIT_STAGE4:
+		lgrp_main_init();
+		break;
+
+	case LGRP_INIT_STAGE5:
+		lgrp_main_mp_init();
+		break;
+
+	default:
+		break;
+	}
 }
 
 /*
  * Create the root and cpu0's lgroup, and set t0's home.
  */
-void
+static void
 lgrp_setup(void)
 {
 	/*
@@ -389,16 +419,6 @@
 }
 
 /*
- * Lgroup initialization is split in two parts. The first part
- * (lgrp_main_init()) is called right before start_other_cpus() in main. The
- * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
- * when all CPUs are brought online and all distance information is available.
- *
- * When lgrp_main_init() is complete it sets lgrp_initialized. The
- * lgrp_main_mp_init() sets lgrp_topo_initialized.
- */
-
-/*
  * true when lgrp initialization has been completed.
  */
 int	lgrp_initialized = 0;
@@ -412,7 +432,7 @@
  * Init routine called after startup(), /etc/system has been processed,
  * and cpu0 has been added to an lgroup.
  */
-void
+static void
 lgrp_main_init(void)
 {
 	cpu_t		*cp = CPU;
@@ -488,7 +508,6 @@
 	lgrp_kstat_create(cp);
 	mutex_exit(&cpu_lock);
 
-	lgrp_plat_main_init();
 	lgrp_initialized = 1;
 }
 
@@ -496,7 +515,7 @@
  * Finish lgrp initialization after all CPUS are brought on-line.
  * This routine is called after start_other_cpus().
  */
-void
+static void
 lgrp_main_mp_init(void)
 {
 	klgrpset_t changed;
--- a/usr/src/uts/common/os/main.c	Fri Oct 02 17:27:26 2009 -0700
+++ b/usr/src/uts/common/os/main.c	Sat Oct 03 12:16:34 2009 -0700
@@ -370,8 +370,6 @@
 	extern int	pm_adjust_timestamps(dev_info_t *, void *);
 	extern void	start_other_cpus(int);
 	extern void	sysevent_evc_thrinit();
-	extern void	lgrp_main_init(void);
-	extern void	lgrp_main_mp_init(void);
 #if defined(__x86)
 	extern void	fastboot_post_startup(void);
 #endif
@@ -388,9 +386,9 @@
 	ASSERT_STACK_ALIGNED();
 
 	/*
-	 * Setup the first lgroup, and home t0
+	 * Setup root lgroup and leaf lgroup for CPU 0
 	 */
-	lgrp_setup();
+	lgrp_init(LGRP_INIT_STAGE2);
 
 	/*
 	 * Once 'startup()' completes, the thread_reaper() daemon would be
@@ -419,8 +417,10 @@
 	/*
 	 * May need to probe to determine latencies from CPU 0 after
 	 * gethrtime() comes alive in cbe_init() and before enabling interrupts
+	 * and copy and release any temporary memory allocated with BOP_ALLOC()
+	 * before release_bootstrap() frees boot memory
 	 */
-	lgrp_plat_probe();
+	lgrp_init(LGRP_INIT_STAGE3);
 
 	/*
 	 * Call all system initialization functions.
@@ -529,11 +529,10 @@
 	sysevent_evc_thrinit();
 
 	/*
-	 * main lgroup initialization
-	 * This must be done after post_startup(), but before
+	 * This must be done after post_startup() but before
 	 * start_other_cpus()
 	 */
-	lgrp_main_init();
+	lgrp_init(LGRP_INIT_STAGE4);
 
 	/*
 	 * Perform MP initialization, if any.
@@ -551,7 +550,7 @@
 	/*
 	 * Finish lgrp initialization after all CPUS are brought online.
 	 */
-	lgrp_main_mp_init();
+	lgrp_init(LGRP_INIT_STAGE5);
 
 	/*
 	 * After mp_init(), number of cpus are known (this is
--- a/usr/src/uts/common/sys/lgrp.h	Fri Oct 02 17:27:26 2009 -0700
+++ b/usr/src/uts/common/sys/lgrp.h	Sat Oct 03 12:16:34 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -295,6 +295,32 @@
 } lgrp_config_flag_t;
 
 /*
+ * Stages of lgroup framework initialization (done through lgrp_init()):
+ *
+ * 1) Initialize common and platform specific code (called in mlsetup())
+ *
+ * 2) Setup root lgroup and add CPU 0 to lgroup(s) (called near beginning of
+ *    main() before startup())
+ *
+ * 3) Probe from CPU 0 and copy and release any BOP_ALLOC-ed memory temporarily
+ *    allocated before kernel memory allocator is setup (called in main()
+ *    after startup(), gethrtime() is setup, and before interrupts enabled)
+ *
+ * 4) Check for null proc LPA on Starcat, collapse lgroup topology (if
+ *    necessary), setup lgroup kstats, etc. (called before start_other_cpus())
+ *
+ * 5) Finish any lgroup initialization needed including updating lgroup
+ *    topology after all CPUs started (called after start_other_cpus())
+ */
+typedef enum lgrp_init_stages {
+	LGRP_INIT_STAGE1,
+	LGRP_INIT_STAGE2,
+	LGRP_INIT_STAGE3,
+	LGRP_INIT_STAGE4,
+	LGRP_INIT_STAGE5
+} lgrp_init_stages_t;
+
+/*
  * Memory allocation policies
  */
 typedef enum lgrp_mem_policy {
@@ -510,8 +536,7 @@
  * lgroup management
  */
 int	lgrp_optimizations(void);
-void	lgrp_init(void);
-void	lgrp_setup(void);
+void	lgrp_init(lgrp_init_stages_t);
 lgrp_t	*lgrp_create(void);
 void	lgrp_destroy(lgrp_t *);
 void	lgrp_config(lgrp_config_flag_t, uintptr_t, uintptr_t);
@@ -588,8 +613,7 @@
 
 
 /* platform interfaces */
-void	lgrp_plat_init(void);
-void	lgrp_plat_main_init(void);
+void	lgrp_plat_init(lgrp_init_stages_t);
 lgrp_t	*lgrp_plat_alloc(lgrp_id_t lgrpid);
 void	lgrp_plat_config(lgrp_config_flag_t, uintptr_t);
 lgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t);
@@ -598,7 +622,6 @@
 pgcnt_t	lgrp_plat_mem_size(lgrp_handle_t, lgrp_mem_query_t);
 int	lgrp_plat_latency(lgrp_handle_t, lgrp_handle_t);
 lgrp_handle_t	lgrp_plat_root_hand(void);
-void	lgrp_plat_probe(void);
 
 extern uint32_t		lgrp_expand_proc_thresh;
 extern uint32_t		lgrp_expand_proc_diff;
--- a/usr/src/uts/i86pc/os/lgrpplat.c	Fri Oct 02 17:27:26 2009 -0700
+++ b/usr/src/uts/i86pc/os/lgrpplat.c	Sat Oct 03 12:16:34 2009 -0700
@@ -254,9 +254,11 @@
 static int				lgrp_plat_apic_ncpus = 0;
 
 /*
- * CPU to node ID mapping table (only used for SRAT)
+ * CPU to node ID mapping table (only used for SRAT) and its max number of
+ * entries
  */
-static cpu_node_map_t			lgrp_plat_cpu_node[NCPU];
+static cpu_node_map_t			*lgrp_plat_cpu_node = NULL;
+static uint_t				lgrp_plat_cpu_node_nentries = 0;
 
 /*
  * Latency statistics
@@ -385,12 +387,10 @@
 
 lgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t id);
 
-void		lgrp_plat_init(void);
+void		lgrp_plat_init(lgrp_init_stages_t stage);
 
 int		lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);
 
-void		lgrp_plat_main_init(void);
-
 int		lgrp_plat_max_lgrps(void);
 
 pgcnt_t		lgrp_plat_mem_size(lgrp_handle_t plathand,
@@ -412,11 +412,14 @@
     int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
     uint32_t domain);
 
-static int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node);
+static int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
+    int cpu_node_nentries);
 
 static int	lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
     int node_cnt, uint32_t domain);
 
+static void	lgrp_plat_get_numa_config(void);
+
 static void	lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
     lgrp_plat_latency_stats_t *lat_stats,
     lgrp_plat_probe_stats_t *probe_stats);
@@ -424,6 +427,8 @@
 static int	lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory,
     lgrp_plat_latency_stats_t *lat_stats);
 
+static void	lgrp_plat_main_init(void);
+
 static pgcnt_t	lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);
 
 static int	lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
@@ -438,9 +443,8 @@
     node_phys_addr_map_t *node_memory);
 
 static hrtime_t	lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
-    lgrp_plat_probe_mem_config_t *probe_mem_config,
-    lgrp_plat_latency_stats_t *lat_stats,
-    lgrp_plat_probe_stats_t *probe_stats);
+    int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config,
+    lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats);
 
 static int	lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node);
 
@@ -452,6 +456,8 @@
     cpu_node_map_t *cpu_node, int cpu_count,
     node_phys_addr_map_t *node_memory);
 
+static void	lgrp_plat_release_bootstrap(void);
+
 static int	lgrp_plat_srat_domains(struct srat *tp,
     uint32_t *prox_domain_min);
 
@@ -728,7 +734,7 @@
 		return (LGRP_DEFAULT_HANDLE);
 
 	hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
-	    lgrp_plat_cpu_node);
+	    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries);
 
 	ASSERT(hand != (lgrp_handle_t)-1);
 	if (hand == (lgrp_handle_t)-1)
@@ -742,166 +748,63 @@
  * Platform-specific initialization of lgroups
  */
 void
-lgrp_plat_init(void)
+lgrp_plat_init(lgrp_init_stages_t stage)
 {
 #if defined(__xpv)
-	/*
-	 * XXPV	For now, the hypervisor treats all memory equally.
-	 */
-	lgrp_plat_node_cnt = max_mem_nodes = 1;
 #else	/* __xpv */
-	uint_t		probe_op;
 	u_longlong_t	value;
-
-	/*
-	 * Get boot property for lgroup topology height limit
-	 */
-	if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0)
-		(void) lgrp_topo_ht_limit_set((int)value);
-
-	/*
-	 * Get boot property for enabling/disabling SRAT
-	 */
-	if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0)
-		lgrp_plat_srat_enable = (int)value;
-
-	/*
-	 * Get boot property for enabling/disabling SLIT
-	 */
-	if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0)
-		lgrp_plat_slit_enable = (int)value;
-
-	/*
-	 * Initialize as a UMA machine
-	 */
-	if (lgrp_topo_ht_limit() == 1) {
+#endif	/* __xpv */
+
+	switch (stage) {
+	case LGRP_INIT_STAGE1:
+#if defined(__xpv)
+		/*
+		 * XXPV	For now, the hypervisor treats all memory equally.
+		 */
 		lgrp_plat_node_cnt = max_mem_nodes = 1;
-		return;
-	}
-
-	/*
-	 * Read boot property with CPU to APIC ID mapping table/array and fill
-	 * in CPU to node ID mapping table with APIC ID for each CPU
-	 */
-	lgrp_plat_apic_ncpus =
-	    lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node);
-
-	/*
-	 * Determine which CPUs and memory are local to each other and number
-	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
-	 */
-	if (lgrp_plat_apic_ncpus > 0) {
-		int	retval;
-
-		retval = lgrp_plat_process_srat(srat_ptr,
-		    &lgrp_plat_prox_domain_min,
-		    lgrp_plat_node_domain, lgrp_plat_cpu_node,
-		    lgrp_plat_apic_ncpus, lgrp_plat_node_memory);
-		if (retval <= 0) {
-			lgrp_plat_srat_error = retval;
-			lgrp_plat_node_cnt = 1;
-		} else {
-			lgrp_plat_srat_error = 0;
-			lgrp_plat_node_cnt = retval;
+#else	/* __xpv */
+		/*
+		 * Get boot property for lgroup topology height limit
+		 */
+		if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0)
+			(void) lgrp_topo_ht_limit_set((int)value);
+
+		/*
+		 * Get boot property for enabling/disabling SRAT
+		 */
+		if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0)
+			lgrp_plat_srat_enable = (int)value;
+
+		/*
+		 * Get boot property for enabling/disabling SLIT
+		 */
+		if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0)
+			lgrp_plat_slit_enable = (int)value;
+
+		/*
+		 * Initialize as a UMA machine
+		 */
+		if (lgrp_topo_ht_limit() == 1) {
+			lgrp_plat_node_cnt = max_mem_nodes = 1;
+			return;
 		}
+
+		lgrp_plat_get_numa_config();
+#endif	/* __xpv */
+		break;
+
+	case LGRP_INIT_STAGE3:
+		lgrp_plat_probe();
+		lgrp_plat_release_bootstrap();
+		break;
+
+	case LGRP_INIT_STAGE4:
+		lgrp_plat_main_init();
+		break;
+
+	default:
+		break;
 	}
-
-	/*
-	 * Try to use PCI config space registers on Opteron if there's an error
-	 * processing CPU to APIC ID mapping or SRAT
-	 */
-	if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) &&
-	    is_opteron())
-		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
-		    lgrp_plat_node_memory);
-
-	/*
-	 * Don't bother to setup system for multiple lgroups and only use one
-	 * memory node when memory is interleaved between any nodes or there is
-	 * only one NUMA node
-	 *
-	 * NOTE: May need to change this for Dynamic Reconfiguration (DR)
-	 *	 when and if it happens for x86/x64
-	 */
-	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
-		lgrp_plat_node_cnt = max_mem_nodes = 1;
-		(void) lgrp_topo_ht_limit_set(1);
-		return;
-	}
-
-	/*
-	 * Leaf lgroups on x86/x64 architectures contain one physical
-	 * processor chip. Tune lgrp_expand_proc_thresh and
-	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
-	 * things out aggressively.
-	 */
-	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
-	lgrp_expand_proc_diff = 0;
-
-	/*
-	 * There should be one memnode (physical page free list(s)) for
-	 * each node
-	 */
-	max_mem_nodes = lgrp_plat_node_cnt;
-
-	/*
-	 * Initialize min and max latency before reading SLIT or probing
-	 */
-	lgrp_plat_lat_stats.latency_min = -1;
-	lgrp_plat_lat_stats.latency_max = 0;
-
-	/*
-	 * Determine how far each NUMA node is from each other by
-	 * reading ACPI System Locality Information Table (SLIT) if it
-	 * exists
-	 */
-	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
-	    lgrp_plat_node_cnt, lgrp_plat_node_memory,
-	    &lgrp_plat_lat_stats);
-	if (lgrp_plat_slit_error == 0)
-		return;
-
-	/*
-	 * Probe to determine latency between NUMA nodes when SLIT
-	 * doesn't exist or make sense
-	 */
-	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
-
-	/*
-	 * Specify whether to probe using vendor ID register or page copy
-	 * if hasn't been specified already or is overspecified
-	 */
-	probe_op = lgrp_plat_probe_flags &
-	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
-
-	if (probe_op == 0 ||
-	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
-		lgrp_plat_probe_flags &=
-		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
-		if (is_opteron())
-			lgrp_plat_probe_flags |=
-			    LGRP_PLAT_PROBE_VENDOR;
-		else
-			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
-	}
-
-	/*
-	 * Probing errors can mess up the lgroup topology and
-	 * force us fall back to a 2 level lgroup topology.
-	 * Here we bound how tall the lgroup topology can grow
-	 * in hopes of avoiding any anamolies in probing from
-	 * messing up the lgroup topology by limiting the
-	 * accuracy of the latency topology.
-	 *
-	 * Assume that nodes will at least be configured in a
-	 * ring, so limit height of lgroup topology to be less
-	 * than number of nodes on a system with 4 or more
-	 * nodes
-	 */
-	if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
-	    lgrp_topo_ht_limit_default())
-		(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
-#endif	/* __xpv */
 }
 
 
@@ -943,7 +846,8 @@
 	 * Probe from current CPU if its lgroup latencies haven't been set yet
 	 * and we are trying to get latency from current CPU to some node
 	 */
-	node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
+	node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
+	    lgrp_plat_cpu_node_nentries);
 	ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
 	if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src)
 		lgrp_plat_probe();
@@ -953,117 +857,6 @@
 
 
 /*
- * Platform-specific initialization
- */
-void
-lgrp_plat_main_init(void)
-{
-	int	curnode;
-	int	ht_limit;
-	int	i;
-
-	/*
-	 * Print a notice that MPO is disabled when memory is interleaved
-	 * across nodes....Would do this when it is discovered, but can't
-	 * because it happens way too early during boot....
-	 */
-	if (lgrp_plat_mem_intrlv)
-		cmn_err(CE_NOTE,
-		    "MPO disabled because memory is interleaved\n");
-
-	/*
-	 * Don't bother to do any probing if it is disabled, there is only one
-	 * node, or the height of the lgroup topology less than or equal to 2
-	 */
-	ht_limit = lgrp_topo_ht_limit();
-	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
-	    max_mem_nodes == 1 || ht_limit <= 2) {
-		/*
-		 * Setup lgroup latencies for 2 level lgroup topology
-		 * (ie. local and remote only) if they haven't been set yet
-		 */
-		if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
-		    lgrp_plat_lat_stats.latency_max == 0)
-			lgrp_plat_2level_setup(lgrp_plat_node_memory,
-			    &lgrp_plat_lat_stats);
-		return;
-	}
-
-	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
-		/*
-		 * Should have been able to probe from CPU 0 when it was added
-		 * to lgroup hierarchy, but may not have been able to then
-		 * because it happens so early in boot that gethrtime() hasn't
-		 * been initialized.  (:-(
-		 */
-		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
-		ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
-		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
-			lgrp_plat_probe();
-
-		return;
-	}
-
-	/*
-	 * When probing memory, use one page for every sample to determine
-	 * lgroup topology and taking multiple samples
-	 */
-	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
-		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
-		    lgrp_plat_probe_nsamples;
-
-	/*
-	 * Map memory in each node needed for probing to determine latency
-	 * topology
-	 */
-	for (i = 0; i < lgrp_plat_node_cnt; i++) {
-		int	mnode;
-
-		/*
-		 * Skip this node and leave its probe page NULL
-		 * if it doesn't have any memory
-		 */
-		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
-		if (!mem_node_config[mnode].exists) {
-			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
-			continue;
-		}
-
-		/*
-		 * Allocate one kernel virtual page
-		 */
-		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
-		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
-		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
-			cmn_err(CE_WARN,
-			    "lgrp_plat_main_init: couldn't allocate memory");
-			return;
-		}
-
-		/*
-		 * Get PFN for first page in each node
-		 */
-		lgrp_plat_probe_mem_config.probe_pfn[i] =
-		    mem_node_config[mnode].physbase;
-
-		/*
-		 * Map virtual page to first page in node
-		 */
-		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
-		    lgrp_plat_probe_mem_config.probe_memsize,
-		    lgrp_plat_probe_mem_config.probe_pfn[i],
-		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
-		    HAT_LOAD_NOCONSIST);
-	}
-
-	/*
-	 * Probe from current CPU
-	 */
-	lgrp_plat_probe();
-}
-
-
-/*
  * Return the maximum number of lgrps supported by the platform.
  * Before lgrp topology is known it returns an estimate based on the number of
  * nodes. Once topology is known it returns the actual maximim number of lgrps
@@ -1189,7 +982,8 @@
 	/*
 	 * Determine ID of node containing current CPU
 	 */
-	from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
+	from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
+	    lgrp_plat_cpu_node_nentries);
 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
 	if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
 		ASSERT(lgrp_plat_node_domain[from].exists);
@@ -1215,8 +1009,9 @@
 			 * probed yet or don't have memory
 			 */
 			probe_time = lgrp_plat_probe_time(to,
-			    lgrp_plat_cpu_node, &lgrp_plat_probe_mem_config,
-			    &lgrp_plat_lat_stats, &lgrp_plat_probe_stats);
+			    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries,
+			    &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats,
+			    &lgrp_plat_probe_stats);
 			if (probe_time == 0)
 				continue;
 
@@ -1343,7 +1138,8 @@
  * Get node ID for given CPU
  */
 static int
-lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node)
+lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
+    int cpu_node_nentries)
 {
 	processorid_t	cpuid;
 
@@ -1369,7 +1165,7 @@
 	 * Return -1 when CPU to node ID mapping entry doesn't exist for given
 	 * CPU
 	 */
-	if (!cpu_node[cpuid].exists)
+	if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists)
 		return (-1);
 
 	return (cpu_node[cpuid].node);
@@ -1403,6 +1199,159 @@
 
 
 /*
+ * Get NUMA configuration of machine
+ */
+static void
+lgrp_plat_get_numa_config(void)
+{
+	uint_t		probe_op;
+
+	/*
+	 * Read boot property with CPU to APIC ID mapping table/array to
+	 * determine number of CPUs
+	 */
+	lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL);
+
+	/*
+	 * Determine which CPUs and memory are local to each other and number
+	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
+	 */
+	if (lgrp_plat_apic_ncpus > 0) {
+		int	retval;
+
+		/*
+		 * Temporarily allocate boot memory to use for CPU to node
+		 * mapping since kernel memory allocator isn't alive yet
+		 */
+		lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops,
+		    NULL, lgrp_plat_apic_ncpus * sizeof (cpu_node_map_t),
+		    sizeof (int));
+
+		ASSERT(lgrp_plat_cpu_node != NULL);
+		if (lgrp_plat_cpu_node) {
+			lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus;
+			bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries *
+			    sizeof (cpu_node_map_t));
+		}
+
+		/*
+		 * Fill in CPU to node ID mapping table with APIC ID for each
+		 * CPU
+		 */
+		(void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node);
+
+		retval = lgrp_plat_process_srat(srat_ptr,
+		    &lgrp_plat_prox_domain_min,
+		    lgrp_plat_node_domain, lgrp_plat_cpu_node,
+		    lgrp_plat_apic_ncpus, lgrp_plat_node_memory);
+		if (retval <= 0) {
+			lgrp_plat_srat_error = retval;
+			lgrp_plat_node_cnt = 1;
+		} else {
+			lgrp_plat_srat_error = 0;
+			lgrp_plat_node_cnt = retval;
+		}
+	}
+
+	/*
+	 * Try to use PCI config space registers on Opteron if there's an error
+	 * processing CPU to APIC ID mapping or SRAT
+	 */
+	if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) &&
+	    is_opteron())
+		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
+		    lgrp_plat_node_memory);
+
+	/*
+	 * Don't bother to setup system for multiple lgroups and only use one
+	 * memory node when memory is interleaved between any nodes or there is
+	 * only one NUMA node
+	 *
+	 * NOTE: May need to change this for Dynamic Reconfiguration (DR)
+	 *	 when and if it happens for x86/x64
+	 */
+	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
+		lgrp_plat_node_cnt = max_mem_nodes = 1;
+		(void) lgrp_topo_ht_limit_set(1);
+		return;
+	}
+
+	/*
+	 * Leaf lgroups on x86/x64 architectures contain one physical
+	 * processor chip. Tune lgrp_expand_proc_thresh and
+	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
+	 * things out aggressively.
+	 */
+	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
+	lgrp_expand_proc_diff = 0;
+
+	/*
+	 * There should be one memnode (physical page free list(s)) for
+	 * each node
+	 */
+	max_mem_nodes = lgrp_plat_node_cnt;
+
+	/*
+	 * Initialize min and max latency before reading SLIT or probing
+	 */
+	lgrp_plat_lat_stats.latency_min = -1;
+	lgrp_plat_lat_stats.latency_max = 0;
+
+	/*
+	 * Determine how far each NUMA node is from each other by
+	 * reading ACPI System Locality Information Table (SLIT) if it
+	 * exists
+	 */
+	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
+	    lgrp_plat_node_cnt, lgrp_plat_node_memory,
+	    &lgrp_plat_lat_stats);
+	if (lgrp_plat_slit_error == 0)
+		return;
+
+	/*
+	 * Probe to determine latency between NUMA nodes when SLIT
+	 * doesn't exist or make sense
+	 */
+	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
+
+	/*
+	 * Specify whether to probe using vendor ID register or page copy
+	 * if hasn't been specified already or is overspecified
+	 */
+	probe_op = lgrp_plat_probe_flags &
+	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
+
+	if (probe_op == 0 ||
+	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
+		lgrp_plat_probe_flags &=
+		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
+		if (is_opteron())
+			lgrp_plat_probe_flags |=
+			    LGRP_PLAT_PROBE_VENDOR;
+		else
+			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
+	}
+
+	/*
+	 * Probing errors can mess up the lgroup topology and
+	 * force us to fall back to a 2 level lgroup topology.
+	 * Here we bound how tall the lgroup topology can grow
+	 * in hopes of avoiding any anomalies in probing from
+	 * messing up the lgroup topology by limiting the
+	 * accuracy of the latency topology.
+	 *
+	 * Assume that nodes will at least be configured in a
+	 * ring, so limit height of lgroup topology to be less
+	 * than number of nodes on a system with 4 or more
+	 * nodes
+	 */
+	if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
+	    lgrp_topo_ht_limit_default())
+		(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
+}
+
+
+/*
  * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
  * be considered same
  */
@@ -1749,6 +1698,118 @@
 
 
 /*
+ * Platform-specific initialization
+ */
+static void
+lgrp_plat_main_init(void)
+{
+	int	curnode;
+	int	ht_limit;
+	int	i;
+
+	/*
+	 * Print a notice that MPO is disabled when memory is interleaved
+	 * across nodes....Would do this when it is discovered, but can't
+	 * because it happens way too early during boot....
+	 */
+	if (lgrp_plat_mem_intrlv)
+		cmn_err(CE_NOTE,
+		    "MPO disabled because memory is interleaved\n");
+
+	/*
+	 * Don't bother to do any probing if it is disabled, there is only one
+	 * node, or the height of the lgroup topology less than or equal to 2
+	 */
+	ht_limit = lgrp_topo_ht_limit();
+	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
+	    max_mem_nodes == 1 || ht_limit <= 2) {
+		/*
+		 * Setup lgroup latencies for 2 level lgroup topology
+		 * (ie. local and remote only) if they haven't been set yet
+		 */
+		if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
+		    lgrp_plat_lat_stats.latency_max == 0)
+			lgrp_plat_2level_setup(lgrp_plat_node_memory,
+			    &lgrp_plat_lat_stats);
+		return;
+	}
+
+	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
+		/*
+		 * Should have been able to probe from CPU 0 when it was added
+		 * to lgroup hierarchy, but may not have been able to then
+		 * because it happens so early in boot that gethrtime() hasn't
+		 * been initialized.  (:-(
+		 */
+		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
+		    lgrp_plat_cpu_node_nentries);
+		ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
+		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
+			lgrp_plat_probe();
+
+		return;
+	}
+
+	/*
+	 * When probing memory, use one page for every sample to determine
+	 * lgroup topology and taking multiple samples
+	 */
+	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
+		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
+		    lgrp_plat_probe_nsamples;
+
+	/*
+	 * Map memory in each node needed for probing to determine latency
+	 * topology
+	 */
+	for (i = 0; i < lgrp_plat_node_cnt; i++) {
+		int	mnode;
+
+		/*
+		 * Skip this node and leave its probe page NULL
+		 * if it doesn't have any memory
+		 */
+		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
+		if (!mem_node_config[mnode].exists) {
+			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
+			continue;
+		}
+
+		/*
+		 * Allocate one kernel virtual page
+		 */
+		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
+		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
+		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
+			cmn_err(CE_WARN,
+			    "lgrp_plat_main_init: couldn't allocate memory");
+			return;
+		}
+
+		/*
+		 * Get PFN for first page in each node
+		 */
+		lgrp_plat_probe_mem_config.probe_pfn[i] =
+		    mem_node_config[mnode].physbase;
+
+		/*
+		 * Map virtual page to first page in node
+		 */
+		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
+		    lgrp_plat_probe_mem_config.probe_memsize,
+		    lgrp_plat_probe_mem_config.probe_pfn[i],
+		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
+		    HAT_LOAD_NOCONSIST);
+	}
+
+	/*
+	 * Probe from current CPU
+	 */
+	lgrp_plat_probe();
+}
+
+
+/*
  * Return the number of free, allocatable, or installed
  * pages in an lgroup
  * This is a copy of the MAX_MEM_NODES == 1 version of the routine
@@ -2026,7 +2087,7 @@
  * Return time needed to probe from current CPU to memory in given node
  */
 static hrtime_t
-lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
+lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries,
     lgrp_plat_probe_mem_config_t *probe_mem_config,
     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
 {
@@ -2044,7 +2105,7 @@
 	/*
 	 * Determine ID of node containing current CPU
 	 */
-	from = lgrp_plat_cpu_to_node(CPU, cpu_node);
+	from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries);
 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
 
 	/*
@@ -2139,7 +2200,8 @@
 
 /*
  * Read boot property with CPU to APIC ID array, fill in CPU to node ID
- * mapping table with APIC ID for each CPU, and return number of CPU APIC IDs.
+ * mapping table with APIC ID for each CPU (if pointer to table isn't NULL),
+ * and return number of CPU APIC IDs.
  *
  * NOTE: This code assumes that CPU IDs are assigned in order that they appear
  *       in in cpu_apicid_array boot property which is based on and follows
@@ -2157,17 +2219,11 @@
 	int	n;
 
 	/*
-	 * Nothing to do when no array to fill in or not enough CPUs
-	 */
-	if (cpu_node == NULL)
-		return (-1);
-
-	/*
 	 * Check length of property value
 	 */
 	boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name);
 	if (boot_prop_len <= 0 || boot_prop_len > sizeof (cpu_apicid_array))
-		return (-2);
+		return (-1);
 
 	/*
 	 * Calculate number of entries in array and return when there's just
@@ -2175,13 +2231,20 @@
 	 */
 	n = boot_prop_len / sizeof (uint8_t);
 	if (n == 1)
-		return (-3);
+		return (-2);
 
 	/*
 	 * Get CPU to APIC ID property value
 	 */
 	if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0)
-		return (-4);
+		return (-3);
+
+	/*
+	 * Just return number of CPU APIC IDs if CPU to node mapping table is
+	 * NULL
+	 */
+	if (cpu_node == NULL)
+		return (n);
 
 	/*
 	 * Fill in CPU to node ID mapping table with APIC ID for each CPU
@@ -2404,6 +2467,27 @@
 
 
 /*
+ * Allocate permanent memory for any temporary memory that we needed to
+ * allocate using BOP_ALLOC() before kmem_alloc() and VM system were
+ * initialized and copy everything from temporary to permanent memory since
+ * temporary boot memory will eventually be released during boot
+ */
+static void
+lgrp_plat_release_bootstrap(void)
+{
+	void	*buf;
+	size_t	size;
+
+	if (lgrp_plat_cpu_node_nentries > 0) {
+		size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t);
+		buf = kmem_alloc(size, KM_SLEEP);
+		bcopy(lgrp_plat_cpu_node, buf, size);
+		lgrp_plat_cpu_node = buf;
+	}
+}
+
+
+/*
  * Return number of proximity domains given in ACPI SRAT
  */
 static int
--- a/usr/src/uts/i86pc/os/mlsetup.c	Fri Oct 02 17:27:26 2009 -0700
+++ b/usr/src/uts/i86pc/os/mlsetup.c	Sat Oct 03 12:16:34 2009 -0700
@@ -367,7 +367,7 @@
 	/*
 	 * Initialize the lgrp framework
 	 */
-	lgrp_init();
+	lgrp_init(LGRP_INIT_STAGE1);
 
 	if (boothowto & RB_HALT) {
 		prom_printf("unix: kernel halted by -h flag\n");
--- a/usr/src/uts/sun4/os/lgrpplat.c	Fri Oct 02 17:27:26 2009 -0700
+++ b/usr/src/uts/sun4/os/lgrpplat.c	Sat Oct 03 12:16:34 2009 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,14 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-
 #include <sys/cpuvar.h>
 #include <sys/lgrp.h>
 #include <sys/memnode.h>
@@ -100,34 +97,35 @@
 lgrp_handle_t lgrp_default_handle = LGRP_DEFAULT_HANDLE;
 
 void
-lgrp_plat_init(void)
+lgrp_plat_init(lgrp_init_stages_t stage)
 {
 	int i;
 
-	/*
-	 * Initialize lookup tables to invalid values so we catch
-	 * any illegal use of them.
-	 */
-	for (i = 0; i < MAX_MEM_NODES; i++) {
-		memnode_to_lgrphand[i] = -1;
-		lgrphand_to_memnode[i] = -1;
-	}
+	switch (stage) {
+	case LGRP_INIT_STAGE1:
+		/*
+		 * Initialize lookup tables to invalid values so we catch
+		 * any illegal use of them.
+		 */
+		for (i = 0; i < MAX_MEM_NODES; i++) {
+			memnode_to_lgrphand[i] = -1;
+			lgrphand_to_memnode[i] = -1;
+		}
 
-	if (lgrp_topo_ht_limit() == 1) {
-		max_mem_nodes = 1;
-		return;
-	}
-
-	if (&plat_lgrp_cpu_to_hand)
-		max_mem_nodes = MAX_MEM_NODES;
+		if (lgrp_topo_ht_limit() == 1) {
+			max_mem_nodes = 1;
+			return;
+		}
 
-	if (&plat_lgrp_init)
-		plat_lgrp_init();
-}
+		if (&plat_lgrp_cpu_to_hand)
+			max_mem_nodes = MAX_MEM_NODES;
 
-void
-lgrp_plat_main_init(void)
-{
+		if (&plat_lgrp_init)
+			plat_lgrp_init();
+		break;
+	default:
+		break;
+	}
 }
 
 /* ARGSUSED */
@@ -352,11 +350,3 @@
 		return (NULL);
 	return (lgrp);
 }
-
-/*
- * Probe memory in each node from current CPU to determine latency topology
- */
-void
-lgrp_plat_probe(void)
-{
-}
--- a/usr/src/uts/sun4/os/mlsetup.c	Fri Oct 02 17:27:26 2009 -0700
+++ b/usr/src/uts/sun4/os/mlsetup.c	Sat Oct 03 12:16:34 2009 -0700
@@ -273,7 +273,7 @@
 	 * lgroup framework initialization. This must be done prior
 	 * to devices being mapped.
 	 */
-	lgrp_init();
+	lgrp_init(LGRP_INIT_STAGE1);
 
 	cpu_setup();