Mercurial > illumos > illumos-gate
changeset 8906:e559381f1e2b
PSARC 2008/777 cpupm keyword mode extensions
PSARC 2008/663 CPU Deep Idle Keyword
6567156 bring CPU power awareness to the dispatcher
6700904 deeper C-State support required on follow-ons to Intel Penryn processor generation microarchitecture
6805661 cmt_root may contain duplicates on UMA systems
line wrap: on
line diff
--- a/usr/src/cmd/mdb/common/modules/genunix/pg.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/cmd/mdb/common/modules/genunix/pg.c Wed Feb 25 21:04:18 2009 -0800 @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Display processor group information */ @@ -34,6 +32,7 @@ #include <mdb/mdb_modapi.h> #include <sys/pghw.h> +#include <sys/cmt.h> /* * PG hardware types indexed by hardware ID @@ -46,6 +45,8 @@ "mpipe", "chip", "memory", + "active_pwr", + "idle_pwr", }; #define A_CNT(arr) (sizeof (arr) / sizeof (arr[0])) @@ -70,8 +71,10 @@ { pg_t pg; pghw_t pghw; + pg_cmt_t pg_cmt; pg_class_t pg_class; int opt_q = 0; /* display only address. */ + int is_cmt = 0; /* This is CMT pg */ /* Should provide an address */ if (! (flags & DCMD_ADDRSPEC)) @@ -86,13 +89,14 @@ opt_q = B_TRUE; if (DCMD_HDRSPEC(flags) && !opt_q) { - mdb_printf("%6s %?s %6s %7s %9s %5s\n", + mdb_printf("%6s %?s %6s %7s %11s %5s %5s\n", "PGID", "ADDR", "PHYSID", "CLASS", "HARDWARE", - "#CPUs"); + "#CPUs", + "LOAD"); } /* @@ -111,6 +115,14 @@ return (DCMD_OK); } + if (strcmp(pg_class.pgc_name, "cmt") == 0) { + if (mdb_vread(&pg_cmt, sizeof (pg_cmt_t), addr) == -1) { + mdb_warn("unable to read 'cmt pg' at %p", addr); + return (DCMD_ERR); + } + is_cmt = 1; + } + if (mdb_vread(&pg_class, sizeof (struct pg_class), (uintptr_t)pg.pg_class) == -1) { mdb_warn("unable to read 'pg_class' at %p", pg.pg_class); @@ -125,10 +137,11 @@ /* * Display the physical PG info. */ - mdb_printf("%6d %?p %6d %7s %9s %5d\n", + mdb_printf("%6d %?p %6d %7s %11s %5d %5d\n", pg.pg_id, addr, pghw.pghw_instance, pg_class.pgc_name, pg_hw_name(pghw.pghw_hw), - pg.pg_cpus.grp_size); + pg.pg_cpus.grp_size, + is_cmt ? pg_cmt.cmt_utilization : 0); } else { /* * Display the basic PG info.
--- a/usr/src/cmd/power/handlers.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/cmd/power/handlers.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "pmconfig.h" #include <sys/mkdev.h> #include <sys/syslog.h> @@ -120,14 +118,66 @@ int cpupm(void) { + struct bmtoc { + char *behavior; + char *mode; + int cmd; + int Errno; + }; + + static struct bmtoc bmlist[] = { + "disable", "\0", PM_STOP_CPUPM, EINVAL, + "enable", "poll-mode", PM_START_CPUPM_POLL, EBUSY, + "enable", "event-mode", PM_START_CPUPM_EV, EBUSY, + "enable", "\0", PM_START_CPUPM, EBUSY, + NULL, 0, 0, 0 + }; + struct bmtoc *bp; + char *behavior; + char *mode; + + behavior = LINEARG(1); + if ((mode = LINEARG(2)) == NULL) + mode = "\0"; + + for (bp = bmlist; bp->cmd; bp++) { + if (strcmp(behavior, bp->behavior) == 0 && + strcmp(mode, bp->mode) == 0) { + break; + } + } + if (bp->cmd == 0) { + if (LINEARG(2) == NULL) { + mesg(MERR, "invalid cpupm behavior \"%s\"\n", behavior); + } else { + mesg(MERR, "invalid cpupm behavior \"%s %s\"\n", + behavior, mode); + } + return (NOUP); + } + if (ioctl(pm_fd, bp->cmd, NULL) == -1 && errno != bp->Errno) { + mesg(MERR, "cpupm %s failed, %s\n", + behavior, strerror(errno)); + return (NOUP); + } + return (OKUP); +} + +/* + * Check for valid cpu_deep_idle option and communicate it to the kernel. 
+ */ +int +cpuidle(void) +{ struct btoc { char *behavior; int cmd; int Errno; }; static struct btoc blist[] = { - "disable", PM_STOP_CPUPM, EINVAL, - "enable", PM_START_CPUPM, EBUSY, + "disable", PM_DISABLE_CPU_DEEP_IDLE, EINVAL, + "enable", PM_ENABLE_CPU_DEEP_IDLE, EBUSY, + "default", PM_DEFAULT_CPU_DEEP_IDLE, EBUSY, NULL, 0, 0 }; struct btoc *bp; @@ -138,18 +188,17 @@ break; } if (bp->cmd == 0) { - mesg(MERR, "invalid cpupm behavior \"%s\"\n", behavior); + mesg(MERR, "invalid cpu_deep_idle behavior \"%s\"\n", behavior); return (NOUP); } if (ioctl(pm_fd, bp->cmd, NULL) == -1 && errno != bp->Errno) { - mesg(MERR, "cpupm %s failed, %s\n", + mesg(MERR, "cpu_deep_idle %s failed, %s\n", behavior, strerror(errno)); return (NOUP); } return (OKUP); } - /* * Two decisions are identical except for the list names and ioctl commands * inputs: whitelist, blacklist, yes, no
--- a/usr/src/cmd/power/parse.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/cmd/power/parse.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "pmconfig.h" #include <deflt.h> #include <pwd.h> @@ -58,7 +56,8 @@ "autopm", autopm, &pm_status, NULL, 2, 0, 1, "autoshutdown", autosd, &cpr_status, as_cmt, 5, 0, 1, "cpu-threshold", cputhr, &pm_status, NULL, 2, 0, 1, - "cpupm", cpupm, &pm_status, NULL, 2, 0, 1, + "cpu_deep_idle", cpuidle, &pm_status, NULL, 2, 0, 1, + "cpupm", cpupm, &pm_status, NULL, 2, 1, 1, "device-dependency-property", ddprop, &pm_status, NULL, 3, 1, 1, "device-dependency", devdep, &pm_status, NULL, 3, 1, 1,
--- a/usr/src/cmd/power/pmconfig.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/cmd/power/pmconfig.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _PMCONFIG_H #define _PMCONFIG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -126,6 +124,7 @@ extern int autopm(void); extern int autosd(void); extern int cpupm(void); +extern int cpuidle(void); extern int cputhr(void); extern int ddprop(void); extern int devdep(void);
--- a/usr/src/cmd/powertop/cpufreq.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/cmd/powertop/cpufreq.c Wed Feb 25 21:04:18 2009 -0800 @@ -71,18 +71,18 @@ "}" "" ":::cpu-change-speed" -"/last[((cpudrv_devstate_t *)arg0)->cpu_id] != 0/" +"/last[(processorid_t)arg0] != 0/" "{" -" this->cpu = ((cpudrv_devstate_t *)arg0)->cpu_id;" -" this->oldspeed = ((cpudrv_pm_t *)arg1)->cur_spd->speed;" +" this->cpu = (processorid_t)arg0;" +" this->oldspeed = (uint32_t)(arg1/1000000);" " @times[this->cpu, this->oldspeed] = sum(timestamp - last[this->cpu]);" " last[this->cpu] = timestamp;" "}" ":::cpu-change-speed" -"/last[((cpudrv_devstate_t *)arg0)->cpu_id] == 0/" +"/last[(processorid_t)arg0] == 0/" "{" -" this->cpu = ((cpudrv_devstate_t *)arg0)->cpu_id;" -" this->oldspeed = ((cpudrv_pm_t *)arg1)->cur_spd->speed;" +" this->cpu = (processorid_t)arg0;" +" this->oldspeed = (uint32_t)(arg1/1000000);" " @times[this->cpu, this->oldspeed] = sum(timestamp - begin);" " last[this->cpu] = timestamp;" "}";
--- a/usr/src/pkgdefs/SUNWhea/prototype_com Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/pkgdefs/SUNWhea/prototype_com Wed Feb 25 21:04:18 2009 -0800 @@ -711,6 +711,7 @@ f none usr/include/sys/cpu.h 644 root bin f none usr/include/sys/cpupart.h 644 root bin f none usr/include/sys/cpuvar.h 644 root bin +f none usr/include/sys/cpu_pm.h 644 root bin f none usr/include/sys/crc32.h 644 root bin f none usr/include/sys/cred.h 644 root bin f none usr/include/sys/cred_impl.h 644 root bin
--- a/usr/src/uts/common/Makefile.files Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/Makefile.files Wed Feb 25 21:04:18 2009 -0800 @@ -41,8 +41,10 @@ brand.o \ cpucaps.o \ cmt.o \ + cmt_policy.o \ cpu.o \ cpu_intr.o \ + cpu_pm.o \ cpupart.o \ disp.o \ group.o \
--- a/usr/src/uts/common/conf/param.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/conf/param.c Wed Feb 25 21:04:18 2009 -0800 @@ -190,9 +190,6 @@ extern void clock_timer_init(void); extern void clock_realtime_init(void); extern void clock_highres_init(void); -extern void pg_init(void); -extern void pg_cmt_class_init(void); -extern void pg_cpu0_init(void); extern void clock_tick_mp_init(void); extern void callout_mp_init(void); extern void cpu_seq_tbl_init(void); @@ -214,9 +211,6 @@ segvn_init, flk_init, cpu_seq_tbl_init, - pg_init, - pg_cmt_class_init, - pg_cpu0_init, schedctl_init, fdb_init, deadman_init,
--- a/usr/src/uts/common/disp/cmt.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/disp/cmt.c Wed Feb 25 21:04:18 2009 -0800 @@ -39,6 +39,7 @@ #include <sys/bitset.h> #include <sys/lgrp.h> #include <sys/cmt.h> +#include <sys/cpu_pm.h> /* * CMT scheduler / dispatcher support @@ -58,11 +59,12 @@ * * The scheduler/dispatcher leverages knowledge of the performance * relevant CMT sharing relationships existing between cpus to implement - * optimized affinity and load balancing policies. + * optimized affinity, load balancing, and coalescence policies. * * Load balancing policy seeks to improve performance by minimizing - * contention over shared processor resources / facilities, while the - * affinity policies seek to improve cache and TLB utilization. + * contention over shared processor resources / facilities, Affinity + * policies seek to improve cache and TLB utilization. Coalescence + * policies improve resource utilization and ultimately power efficiency. * * The CMT PGs created by this class are already arranged into a * hierarchy (which is done in the pghw layer). To implement the top-down @@ -79,25 +81,24 @@ * balancng across the CMT PGs within their respective (per lgroup) top level * groups. */ -typedef struct cmt_lgrp { - group_t cl_pgs; /* Top level group of active CMT PGs */ - int cl_npgs; /* # of top level PGs in the lgroup */ - lgrp_handle_t cl_hand; /* lgroup's platform handle */ - struct cmt_lgrp *cl_next; /* next cmt_lgrp */ -} cmt_lgrp_t; - static cmt_lgrp_t *cmt_lgrps = NULL; /* cmt_lgrps list head */ static cmt_lgrp_t *cpu0_lgrp = NULL; /* boot CPU's initial lgrp */ /* used for null_proc_lpa */ -static cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */ +cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */ static int is_cpu0 = 1; /* true if this is boot CPU context */ /* + * Array of hardware sharing relationships that are blacklisted. + * PGs won't be instantiated for blacklisted hardware sharing relationships. 
+ */ +static int cmt_hw_blacklisted[PGHW_NUM_COMPONENTS]; + +/* * Set this to non-zero to disable CMT scheduling * This must be done via kmdb -d, as /etc/system will be too late */ -static int cmt_sched_disabled = 0; +int cmt_sched_disabled = 0; static pg_cid_t pg_cmt_class_id; /* PG class id */ @@ -109,11 +110,19 @@ static void pg_cmt_cpu_inactive(cpu_t *); static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *); static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *); -static void pg_cmt_hier_pack(void **, int); +static char *pg_cmt_policy_name(pg_t *); +static void pg_cmt_hier_sort(pg_cmt_t **, int); +static pg_cmt_t *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *); static int pg_cmt_cpu_belongs(pg_t *, cpu_t *); static int pg_cmt_hw(pghw_type_t); static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t); static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t); +static int pg_cmt_lineage_validate(pg_cmt_t **, int *); +static void cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t, + kthread_t *, kthread_t *); +static void cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t, + kthread_t *, kthread_t *); +static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *); /* * Macro to test if PG is managed by the CMT PG class @@ -121,6 +130,29 @@ #define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id) /* + * Status codes for CMT lineage validation + * See cmt_lineage_validate() below + */ +typedef enum cmt_lineage_validation { + CMT_LINEAGE_VALID, + CMT_LINEAGE_NON_CONCENTRIC, + CMT_LINEAGE_REPAIRED, + CMT_LINEAGE_UNRECOVERABLE +} cmt_lineage_validation_t; + +/* + * Status of the current lineage under construction. + * One must be holding cpu_lock to change this. + */ +static cmt_lineage_validation_t cmt_lineage_status = CMT_LINEAGE_VALID; + +/* + * Power domain definitions (on x86) are defined by ACPI, and + * therefore may be subject to BIOS bugs. 
+ */ +#define PG_CMT_HW_SUSPECT(hw) PGHW_IS_PM_DOMAIN(hw) + +/* * CMT PG ops */ struct pg_ops pg_ops_cmt = { @@ -134,6 +166,7 @@ NULL, /* cpupart_out */ pg_cmt_cpupart_move, pg_cmt_cpu_belongs, + pg_cmt_policy_name, }; /* @@ -156,25 +189,8 @@ void pg_cmt_cpu_startup(cpu_t *cp) { - PG_NRUN_UPDATE(cp, 1); -} - -/* - * Adjust the CMT load in the CMT PGs in which the CPU belongs - * Note that "n" can be positive in the case of increasing - * load, or negative in the case of decreasing load. - */ -void -pg_cmt_load(cpu_t *cp, int n) -{ - pg_cmt_t *pg; - - pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage; - while (pg != NULL) { - ASSERT(IS_CMT_PG(pg)); - atomic_add_32(&pg->cmt_nrunning, n); - pg = pg->cmt_parent; - } + pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread, + cp->cpu_thread); } /* @@ -212,14 +228,219 @@ } /* - * Return 1 if CMT scheduling policies should be impelmented - * for the specified hardware sharing relationship. + * Given a hardware sharing relationship, return which dispatcher + * policies should be implemented to optimize performance and efficiency + */ +static pg_cmt_policy_t +pg_cmt_policy(pghw_type_t hw) +{ + pg_cmt_policy_t p; + + /* + * Give the platform a chance to override the default + */ + if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY) + return (p); + + switch (hw) { + case PGHW_IPIPE: + case PGHW_FPU: + case PGHW_CHIP: + return (CMT_BALANCE); + case PGHW_CACHE: + return (CMT_AFFINITY); + case PGHW_POW_ACTIVE: + case PGHW_POW_IDLE: + return (CMT_BALANCE); + default: + return (CMT_NO_POLICY); + } +} + +/* + * Rank the importance of optimizing for the pg1 relationship vs. + * the pg2 relationship. + */ +static pg_cmt_t * +pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2) +{ + pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw; + pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw; + + /* + * A power domain is only important if CPUPM is enabled. 
+ */ + if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) { + if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2)) + return (pg2); + if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1)) + return (pg1); + } + + /* + * Otherwise, ask the platform + */ + if (pg_plat_hw_rank(hw1, hw2) == hw1) + return (pg1); + else + return (pg2); +} + +/* + * Initialize CMT callbacks for the given PG + */ +static void +cmt_callback_init(pg_t *pg) +{ + switch (((pghw_t *)pg)->pghw_hw) { + case PGHW_POW_ACTIVE: + pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr; + pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr; + break; + default: + pg->pg_cb.thread_swtch = cmt_ev_thread_swtch; + + } +} + +/* + * Promote PG above it's current parent. + * This is only legal if PG has an equal or greater number of CPUs + * than it's parent. */ -static int -pg_cmt_hw(pghw_type_t hw) +static void +cmt_hier_promote(pg_cmt_t *pg) { - return (pg_plat_cmt_load_bal_hw(hw) || - pg_plat_cmt_affinity_hw(hw)); + pg_cmt_t *parent; + group_t *children; + cpu_t *cpu; + group_iter_t iter; + pg_cpu_itr_t cpu_iter; + int r; + int err; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + parent = pg->cmt_parent; + if (parent == NULL) { + /* + * Nothing to do + */ + return; + } + + ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent)); + + /* + * We're changing around the hierarchy, which is actively traversed + * by the dispatcher. Pause CPUS to ensure exclusivity. + */ + pause_cpus(NULL); + + /* + * If necessary, update the parent's sibling set, replacing parent + * with PG. + */ + if (parent->cmt_siblings) { + if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE) + != -1) { + r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE); + ASSERT(r != -1); + } + } + + /* + * If the parent is at the top of the hierarchy, replace it's entry + * in the root lgroup's group of top level PGs. 
+ */ + if (parent->cmt_parent == NULL && + parent->cmt_siblings != &cmt_root->cl_pgs) { + if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE) + != -1) { + r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE); + ASSERT(r != -1); + } + } + + /* + * We assume (and therefore assert) that the PG being promoted is an + * only child of it's parent. Update the parent's children set + * replacing PG's entry with the parent (since the parent is becoming + * the child). Then have PG and the parent swap children sets. + */ + ASSERT(GROUP_SIZE(parent->cmt_children) <= 1); + if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) { + r = group_add(parent->cmt_children, parent, GRP_NORESIZE); + ASSERT(r != -1); + } + + children = pg->cmt_children; + pg->cmt_children = parent->cmt_children; + parent->cmt_children = children; + + /* + * Update the sibling references for PG and it's parent + */ + pg->cmt_siblings = parent->cmt_siblings; + parent->cmt_siblings = pg->cmt_children; + + /* + * Update any cached lineages in the per CPU pg data. + */ + PG_CPU_ITR_INIT(pg, cpu_iter); + while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { + int idx; + group_t *pgs; + pg_cmt_t *cpu_pg; + + /* + * Iterate over the CPU's PGs updating the children + * of the PG being promoted, since they have a new parent. + */ + pgs = &cpu->cpu_pg->pgs; + group_iter_init(&iter); + while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) { + if (cpu_pg->cmt_parent == pg) { + cpu_pg->cmt_parent = parent; + } + } + + /* + * Update the CMT load balancing lineage + */ + pgs = &cpu->cpu_pg->cmt_pgs; + if ((idx = group_find(pgs, (void *)pg)) == -1) { + /* + * Unless this is the CPU who's lineage is being + * constructed, the PG being promoted should be + * in the lineage. 
+ */ + ASSERT(GROUP_SIZE(pgs) == 0); + continue; + } + + ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent); + ASSERT(idx > 0); + + /* + * Have the child and the parent swap places in the CPU's + * lineage + */ + group_remove_at(pgs, idx); + group_remove_at(pgs, idx - 1); + err = group_add_at(pgs, parent, idx); + ASSERT(err == 0); + err = group_add_at(pgs, pg, idx - 1); + ASSERT(err == 0); + } + + /* + * Update the parent references for PG and it's parent + */ + pg->cmt_parent = parent->cmt_parent; + parent->cmt_parent = pg; + + start_cpus(); } /* @@ -230,7 +451,7 @@ { pg_cmt_t *pg; group_t *cmt_pgs; - int level, max_level, nlevels; + int levels, level; pghw_type_t hw; pg_t *pg_cache = NULL; pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS]; @@ -239,26 +460,42 @@ ASSERT(MUTEX_HELD(&cpu_lock)); + if (cmt_sched_disabled) + return; + /* * A new CPU is coming into the system. * Interrogate the platform to see if the CPU - * has any performance relevant CMT sharing - * relationships + * has any performance or efficiency relevant + * sharing relationships */ cmt_pgs = &cp->cpu_pg->cmt_pgs; cp->cpu_pg->cmt_lineage = NULL; bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier)); - max_level = nlevels = 0; + levels = 0; for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) { + pg_cmt_policy_t policy; + /* - * We're only interested in CMT hw sharing relationships + * We're only interested in the hw sharing relationships + * for which we know how to optimize. */ - if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0) + policy = pg_cmt_policy(hw); + if (policy == CMT_NO_POLICY || + pg_plat_hw_shared(cp, hw) == 0) continue; /* + * Continue if the hardware sharing relationship has been + * blacklisted. + */ + if (cmt_hw_blacklisted[hw]) { + continue; + } + + /* * Find (or create) the PG associated with * the hw sharing relationship in which cp * belongs. @@ -281,6 +518,11 @@ * ... and CMT specific portions of the * structure. 
*/ + pg->cmt_policy = policy; + + /* CMT event callbacks */ + cmt_callback_init((pg_t *)pg); + bitset_init(&pg->cmt_cpus_actv_set); group_create(&pg->cmt_cpus_actv); } else { @@ -303,14 +545,10 @@ } /* - * Build a lineage of CMT PGs for load balancing + * Build a lineage of CMT PGs for load balancing / coalescence */ - if (pg_plat_cmt_load_bal_hw(hw)) { - level = pghw_level(hw); - cpu_cmt_hier[level] = pg; - if (level > max_level) - max_level = level; - nlevels++; + if (policy & (CMT_BALANCE | CMT_COALESCE)) { + cpu_cmt_hier[levels++] = pg; } /* Cache this for later */ @@ -318,44 +556,73 @@ pg_cache = (pg_t *)pg; } - /* - * Pack out any gaps in the constructed lineage, - * then size it out. - * - * Gaps may exist where the architecture knows - * about a hardware sharing relationship, but such a - * relationship either isn't relevant for load - * balancing or doesn't exist between CPUs on the system. - */ - pg_cmt_hier_pack((void **)cpu_cmt_hier, max_level + 1); - group_expand(cmt_pgs, nlevels); - + group_expand(cmt_pgs, levels); if (cmt_root == NULL) cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand()); /* - * Find the lgrp that encapsulates this CPU's CMT hierarchy. - * and locate/create a suitable cmt_lgrp_t. + * Find the lgrp that encapsulates this CPU's CMT hierarchy */ lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id); if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL) lgrp = pg_cmt_lgrp_create(lgrp_handle); /* + * Ascendingly sort the PGs in the lineage by number of CPUs + */ + pg_cmt_hier_sort(cpu_cmt_hier, levels); + + /* + * Examine the lineage and validate it. + * This routine will also try to fix the lineage along with the + * rest of the PG hierarchy should it detect an issue. + * + * If it returns -1, an unrecoverable error has happened and we + * need to return. 
+ */ + if (pg_cmt_lineage_validate(cpu_cmt_hier, &levels) < 0) + return; + + /* + * For existing PGs in the lineage, verify that the parent is + * correct, as the generation in the lineage may have changed + * as a result of the sorting. Start the traversal at the top + * of the lineage, moving down. + */ + for (level = levels - 1; level >= 0; ) { + int reorg; + + reorg = 0; + pg = cpu_cmt_hier[level]; + + /* + * Promote PGs at an incorrect generation into place. + */ + while (pg->cmt_parent && + pg->cmt_parent != cpu_cmt_hier[level + 1]) { + cmt_hier_promote(pg); + reorg++; + } + if (reorg > 0) + level = levels - 1; + else + level--; + } + + /* * For each of the PGs in the CPU's lineage: - * - Add an entry in the CPU's CMT PG group - * which is used by the dispatcher to implement load balancing - * policy. + * - Add an entry in the CPU sorted CMT PG group + * which is used for top down CMT load balancing * - Tie the PG into the CMT hierarchy by connecting * it to it's parent and siblings. 
*/ - for (level = 0; level < nlevels; level++) { + for (level = 0; level < levels; level++) { uint_t children; int err; pg = cpu_cmt_hier[level]; - err = group_add_at(cmt_pgs, pg, nlevels - level - 1); + err = group_add_at(cmt_pgs, pg, levels - level - 1); ASSERT(err == 0); if (level == 0) @@ -371,12 +638,13 @@ continue; } - if ((level + 1) == nlevels) { + if ((level + 1) == levels) { pg->cmt_parent = NULL; pg->cmt_siblings = &lgrp->cl_pgs; children = ++lgrp->cl_npgs; - cmt_root->cl_npgs++; + if (cmt_root != lgrp) + cmt_root->cl_npgs++; } else { pg->cmt_parent = cpu_cmt_hier[level + 1]; @@ -436,6 +704,9 @@ lgrp_handle_t lgrp_handle; cmt_lgrp_t *lgrp; + if (cmt_sched_disabled) + return; + pgs = &cp->cpu_pg->pgs; cmt_pgs = &cp->cpu_pg->cmt_pgs; @@ -544,6 +815,9 @@ ASSERT(MUTEX_HELD(&cpu_lock)); + if (cmt_sched_disabled) + return; + pgs = &cp->cpu_pg->pgs; /* @@ -576,6 +850,9 @@ ASSERT(MUTEX_HELD(&cpu_lock)); + if (cmt_sched_disabled) + return; + pgs = &cp->cpu_pg->pgs; group_iter_init(&pg_iter); @@ -627,6 +904,9 @@ ASSERT(MUTEX_HELD(&cpu_lock)); + if (cmt_sched_disabled) + return; + pgs = &cp->cpu_pg->pgs; group_iter_init(&i); @@ -648,15 +928,16 @@ * for balancing with it's siblings. */ if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 && - pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) { + (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) { err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE); ASSERT(err == 0); /* * If this is a top level PG, add it as a balancing - * candidate when balancing within the root lgroup + * candidate when balancing within the root lgroup. */ - if (pg->cmt_parent == NULL) { + if (pg->cmt_parent == NULL && + pg->cmt_siblings != &cmt_root->cl_pgs) { err = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE); ASSERT(err == 0); @@ -691,6 +972,9 @@ ASSERT(MUTEX_HELD(&cpu_lock)); + if (cmt_sched_disabled) + return; + pgs = &cp->cpu_pg->pgs; group_iter_init(&i); @@ -713,11 +997,12 @@ * load was balanced, remove it as a balancing candidate. 
*/ if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 && - pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) { + (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) { err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE); ASSERT(err == 0); - if (pg->cmt_parent == NULL) { + if (pg->cmt_parent == NULL && + pg->cmt_siblings != &cmt_root->cl_pgs) { err = group_remove(&cmt_root->cl_pgs, pg, GRP_NORESIZE); ASSERT(err == 0); @@ -776,26 +1061,47 @@ } /* - * Hierarchy packing utility routine. The hierarchy order is preserved. + * Sort the CPUs CMT hierarchy, where "size" is the number of levels. */ static void -pg_cmt_hier_pack(void *hier[], int sz) +pg_cmt_hier_sort(pg_cmt_t **hier, int size) { - int i, j; - - for (i = 0; i < sz; i++) { - if (hier[i] != NULL) - continue; + int i, j, inc; + pg_t *tmp; + pg_t **h = (pg_t **)hier; - for (j = i; j < sz; j++) { - if (hier[j] != NULL) { - hier[i] = hier[j]; - hier[j] = NULL; - break; + /* + * First sort by number of CPUs + */ + inc = size / 2; + while (inc > 0) { + for (i = inc; i < size; i++) { + j = i; + tmp = h[i]; + while ((j >= inc) && + (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) { + h[j] = h[j - inc]; + j = j - inc; } + h[j] = tmp; } - if (j == sz) - break; + if (inc == 2) + inc = 1; + else + inc = (inc * 5) / 11; + } + + /* + * Break ties by asking the platform. + * Determine if h[i] outranks h[i + 1] and if so, swap them. + */ + for (i = 0; i < size - 1; i++) { + if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) && + pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) { + tmp = h[i]; + h[i] = h[i + 1]; + h[i + 1] = tmp; + } } } @@ -840,134 +1146,492 @@ } /* - * Perform multi-level CMT load balancing of running threads. - * - * tp is the thread being enqueued. - * cp is a hint CPU, against which CMT load balancing will be performed. + * Interfaces to enable and disable power aware dispatching + * The caller must be holding cpu_lock. * - * Returns cp, or a CPU better than cp with respect to balancing - * running thread load. 
+ * Return 0 on success and -1 on failure. */ -cpu_t * -cmt_balance(kthread_t *tp, cpu_t *cp) +int +cmt_pad_enable(pghw_type_t type) { - int hint, i, cpu, nsiblings; - int self = 0; - group_t *cmt_pgs, *siblings; - pg_cmt_t *pg, *pg_tmp, *tpg = NULL; - int pg_nrun, tpg_nrun; - int level = 0; - cpu_t *newcp; + group_t *hwset; + group_iter_t iter; + pg_cmt_t *pg; + + ASSERT(PGHW_IS_PM_DOMAIN(type)); + ASSERT(MUTEX_HELD(&cpu_lock)); - ASSERT(THREAD_LOCK_HELD(tp)); - - cmt_pgs = &cp->cpu_pg->cmt_pgs; - - if (GROUP_SIZE(cmt_pgs) == 0) - return (cp); /* nothing to do */ - - if (tp == curthread) - self = 1; + if ((hwset = pghw_set_lookup(type)) == NULL || + cmt_hw_blacklisted[type]) { + /* + * Unable to find any instances of the specified type + * of power domain, or the power domains have been blacklisted. + */ + return (-1); + } /* - * Balance across siblings in the CPUs CMT lineage - * If the thread is homed to the root lgroup, perform - * top level balancing against other top level PGs - * in the system. Otherwise, start with the default - * top level siblings group, which is within the leaf lgroup + * Iterate over the power domains, setting the default dispatcher + * policy for power/performance optimization. + * + * Simply setting the policy isn't enough in the case where the power + * domain is an only child of another PG. Because the dispatcher walks + * the PG hierarchy in a top down fashion, the higher up PG's policy + * will dominate. So promote the power domain above it's parent if both + * PG and it's parent have the same CPUs to ensure it's policy + * dominates. */ - pg = GROUP_ACCESS(cmt_pgs, level); - if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) - siblings = &cmt_root->cl_pgs; - else - siblings = pg->cmt_siblings; + group_iter_init(&iter); + while ((pg = group_iterate(hwset, &iter)) != NULL) { + /* + * If the power domain is an only child to a parent + * not implementing the same policy, promote the child + * above the parent to activate the policy. 
+ */ + pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw); + while ((pg->cmt_parent != NULL) && + (pg->cmt_parent->cmt_policy != pg->cmt_policy) && + (PG_NUM_CPUS((pg_t *)pg) == + PG_NUM_CPUS((pg_t *)pg->cmt_parent))) { + cmt_hier_promote(pg); + } + } + + return (0); +} +int +cmt_pad_disable(pghw_type_t type) +{ + group_t *hwset; + group_iter_t iter; + pg_cmt_t *pg; + pg_cmt_t *child; + + ASSERT(PGHW_IS_PM_DOMAIN(type)); + ASSERT(MUTEX_HELD(&cpu_lock)); + + if ((hwset = pghw_set_lookup(type)) == NULL) { + /* + * Unable to find any instances of the specified type of + * power domain. + */ + return (-1); + } /* - * Traverse down the lineage until we find a level that needs - * balancing, or we get to the end. + * Iterate over the power domains, setting the default dispatcher + * policy for performance optimization (load balancing). */ - for (;;) { - nsiblings = GROUP_SIZE(siblings); /* self inclusive */ - if (nsiblings == 1) - goto next_level; - - pg_nrun = pg->cmt_nrunning; - if (self && - bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid)) - pg_nrun--; /* Ignore curthread's effect */ - - hint = CPU_PSEUDO_RANDOM() % nsiblings; + group_iter_init(&iter); + while ((pg = group_iterate(hwset, &iter)) != NULL) { /* - * Find a balancing candidate from among our siblings - * "hint" is a hint for where to start looking + * If the power domain has an only child that implements + * policy other than load balancing, promote the child + * above the power domain to ensure it's policy dominates. 
*/ - i = hint; - do { - ASSERT(i < nsiblings); - pg_tmp = GROUP_ACCESS(siblings, i); + if (GROUP_SIZE(pg->cmt_children) == 1) { + child = GROUP_ACCESS(pg->cmt_children, 0); + if ((child->cmt_policy & CMT_BALANCE) == 0) { + cmt_hier_promote(child); + } + } + pg->cmt_policy = CMT_BALANCE; + } + return (0); +} + +/* ARGSUSED */ +static void +cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, + kthread_t *new) +{ + pg_cmt_t *cmt_pg = (pg_cmt_t *)pg; + + if (old == cp->cpu_idle_thread) { + atomic_add_32(&cmt_pg->cmt_utilization, 1); + } else if (new == cp->cpu_idle_thread) { + atomic_add_32(&cmt_pg->cmt_utilization, -1); + } +} + +/* + * Macro to test whether a thread is currently runnable on a CPU in a PG. + */ +#define THREAD_RUNNABLE_IN_PG(t, pg) \ + ((t)->t_state == TS_RUN && \ + (t)->t_disp_queue->disp_cpu && \ + bitset_in_set(&(pg)->cmt_cpus_actv_set, \ + (t)->t_disp_queue->disp_cpu->cpu_seqid)) + +static void +cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, + kthread_t *new) +{ + pg_cmt_t *cmt = (pg_cmt_t *)pg; + cpupm_domain_t *dom; + uint32_t u; + + if (old == cp->cpu_idle_thread) { + ASSERT(new != cp->cpu_idle_thread); + u = atomic_add_32_nv(&cmt->cmt_utilization, 1); + if (u == 1) { + /* + * Notify the CPU power manager that the domain + * is non-idle. + */ + dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; + cpupm_utilization_event(cp, now, dom, + CPUPM_DOM_BUSY_FROM_IDLE); + } + } else if (new == cp->cpu_idle_thread) { + ASSERT(old != cp->cpu_idle_thread); + u = atomic_add_32_nv(&cmt->cmt_utilization, -1); + if (u == 0) { + /* + * The domain is idle, notify the CPU power + * manager. + * + * Avoid notifying if the thread is simply migrating + * between CPUs in the domain. 
+ */ + if (!THREAD_RUNNABLE_IN_PG(old, cmt)) { + dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; + cpupm_utilization_event(cp, now, dom, + CPUPM_DOM_IDLE_FROM_BUSY); + } + } + } +} + +/* ARGSUSED */ +static void +cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t) +{ + pg_cmt_t *cmt = (pg_cmt_t *)pg; + cpupm_domain_t *dom; + + dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; + cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY); +} + +/* + * Return the name of the CMT scheduling policy + * being implemented across this PG + */ +static char * +pg_cmt_policy_name(pg_t *pg) +{ + pg_cmt_policy_t policy; + + policy = ((pg_cmt_t *)pg)->cmt_policy; + + if (policy & CMT_AFFINITY) { + if (policy & CMT_BALANCE) + return ("Load Balancing & Affinity"); + else if (policy & CMT_COALESCE) + return ("Load Coalescence & Affinity"); + else + return ("Affinity"); + } else { + if (policy & CMT_BALANCE) + return ("Load Balancing"); + else if (policy & CMT_COALESCE) + return ("Load Coalescence"); + else + return ("None"); + } +} + +/* + * Prune PG, and all other instances of PG's hardware sharing relationship + * from the PG hierarchy. + */ +static int +pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz) +{ + group_t *hwset, *children; + int i, j, r, size = *sz; + group_iter_t hw_iter, child_iter; + pg_cpu_itr_t cpu_iter; + pg_cmt_t *pg, *child; + cpu_t *cpu; + int cap_needed; + pghw_type_t hw; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + hw = ((pghw_t *)pg_bad)->pghw_hw; + + if (hw == PGHW_POW_ACTIVE) { + cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. " + "Event Based CPUPM Unavailable"); + } else if (hw == PGHW_POW_IDLE) { + cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. " + "Dispatcher assisted CPUPM disabled."); + } + + /* + * Find and eliminate the PG from the lineage. 
+ */ + for (i = 0; i < size; i++) { + if (lineage[i] == pg_bad) { + for (j = i; j < size - 1; j++) + lineage[j] = lineage[j + 1]; + *sz = size - 1; + break; + } + } + + /* + * We'll prune all instances of the hardware sharing relationship + * represented by pg. But before we do that (and pause CPUs) we need + * to ensure the hierarchy's groups are properly sized. + */ + hwset = pghw_set_lookup(hw); + + /* + * Blacklist the hardware so that future groups won't be created. + */ + cmt_hw_blacklisted[hw] = 1; + + /* + * For each of the PGs being pruned, ensure sufficient capacity in + * the siblings set for the PG's children + */ + group_iter_init(&hw_iter); + while ((pg = group_iterate(hwset, &hw_iter)) != NULL) { + /* + * PG is being pruned, but if it is bringing up more than + * one child, ask for more capacity in the siblings group. + */ + cap_needed = 0; + if (pg->cmt_children && + GROUP_SIZE(pg->cmt_children) > 1) { + cap_needed = GROUP_SIZE(pg->cmt_children) - 1; + + group_expand(pg->cmt_siblings, + GROUP_SIZE(pg->cmt_siblings) + cap_needed); /* - * The candidate must not be us, and must - * have some CPU resources in the thread's - * partition + * If this is a top level group, also ensure the + * capacity in the root lgrp level CMT grouping. */ - if (pg_tmp != pg && - bitset_in_set(&tp->t_cpupart->cp_cmt_pgs, - ((pg_t *)pg_tmp)->pg_id)) { - tpg = pg_tmp; - break; + if (pg->cmt_parent == NULL && + pg->cmt_siblings != &cmt_root->cl_pgs) { + group_expand(&cmt_root->cl_pgs, + GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed); } + } + } - if (++i >= nsiblings) - i = 0; - } while (i != hint); + /* + * We're operating on the PG hierarchy. Pause CPUs to ensure + * exclusivity with respect to the dispatcher. + */ + pause_cpus(NULL); - if (!tpg) - goto next_level; /* no candidates at this level */ + /* + * Prune all PG instances of the hardware sharing relationship + * represented by pg. 
+ */ + group_iter_init(&hw_iter); + while ((pg = group_iterate(hwset, &hw_iter)) != NULL) { /* - * Check if the balancing target is underloaded - * Decide to balance if the target is running fewer - * threads, or if it's running the same number of threads - * with more online CPUs + * Remove PG from it's group of siblings, if it's there. + */ + if (pg->cmt_siblings) { + (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE); + } + if (pg->cmt_parent == NULL && + pg->cmt_siblings != &cmt_root->cl_pgs) { + (void) group_remove(&cmt_root->cl_pgs, pg, + GRP_NORESIZE); + } + /* + * Add PGs children to it's group of siblings. + */ + if (pg->cmt_children != NULL) { + children = pg->cmt_children; + + group_iter_init(&child_iter); + while ((child = group_iterate(children, &child_iter)) + != NULL) { + /* + * Transplant child from it's siblings set to + * PGs. + */ + if (pg->cmt_siblings != NULL && + child->cmt_siblings != NULL && + group_remove(child->cmt_siblings, child, + GRP_NORESIZE) != -1) { + r = group_add(pg->cmt_siblings, child, + GRP_NORESIZE); + ASSERT(r == 0); + } + } + } + + /* + * Reset the callbacks to the defaults + */ + pg_callback_set_defaults((pg_t *)pg); + + /* + * Update all the CPU lineages in each of PG's CPUs */ - tpg_nrun = tpg->cmt_nrunning; - if (pg_nrun > tpg_nrun || - (pg_nrun == tpg_nrun && - (GROUP_SIZE(&tpg->cmt_cpus_actv) > - GROUP_SIZE(&pg->cmt_cpus_actv)))) { - break; + PG_CPU_ITR_INIT(pg, cpu_iter); + while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { + group_t *pgs; + pg_cmt_t *cpu_pg; + group_iter_t liter; /* Iterator for the lineage */ + + /* + * Iterate over the CPU's PGs updating the children + * of the PG being promoted, since they have a new + * parent and siblings set. 
+ */ + pgs = &cpu->cpu_pg->pgs; + group_iter_init(&liter); + while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) { + if (cpu_pg->cmt_parent == pg) { + cpu_pg->cmt_parent = pg->cmt_parent; + cpu_pg->cmt_siblings = pg->cmt_siblings; + } + } + + /* + * Update the CPU's lineages + */ + pgs = &cpu->cpu_pg->cmt_pgs; + (void) group_remove(pgs, pg, GRP_NORESIZE); + pgs = &cpu->cpu_pg->pgs; + (void) group_remove(pgs, pg, GRP_NORESIZE); } - tpg = NULL; + } + start_cpus(); + return (0); +} + +/* + * Disable CMT scheduling + */ +static void +pg_cmt_disable(void) +{ + cpu_t *cpu; + + pause_cpus(NULL); + cpu = cpu_list; + + do { + if (cpu->cpu_pg) + group_empty(&cpu->cpu_pg->cmt_pgs); + } while ((cpu = cpu->cpu_next) != cpu_list); + + cmt_sched_disabled = 1; + start_cpus(); + cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable"); +} -next_level: - if (++level == GROUP_SIZE(cmt_pgs)) - break; +static int +pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz) +{ + int i, size; + pg_cmt_t *pg, *parent, *pg_bad; + cpu_t *cp; + pg_cpu_itr_t cpu_iter; + + ASSERT(MUTEX_HELD(&cpu_lock)); + +revalidate: + size = *sz; + pg_bad = NULL; + for (i = 0; i < size - 1; i++) { + + pg = lineage[i]; + parent = lineage[i + 1]; - pg = GROUP_ACCESS(cmt_pgs, level); - siblings = pg->cmt_siblings; + /* + * We assume that the lineage has already been sorted + * by the number of CPUs. In fact, we depend on it. + */ + ASSERT(PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)parent)); + + /* + * Walk each of the CPUs in the PGs group, and verify that + * the next larger PG contains at least the CPUs in this one. 
+ */ + PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter); + while ((cp = pg_cpu_next(&cpu_iter)) != NULL) { + if (pg_cpu_find((pg_t *)parent, cp) == B_FALSE) { + cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC; + goto handle_error; + } + } } - if (tpg) { - uint_t tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv); - +handle_error: + switch (cmt_lineage_status) { + case CMT_LINEAGE_VALID: + case CMT_LINEAGE_REPAIRED: + break; + case CMT_LINEAGE_NON_CONCENTRIC: /* - * Select an idle CPU from the target + * We've detected a non-concentric PG lineage. + * + * This can happen when some of the CPU grouping information + * is derived from buggy sources (for example, incorrect ACPI + * tables on x86 systems). + * + * We attempt to recover from this by pruning out the + * illegal groupings from the PG hierarchy, which means that + * we won't optimize for those levels, but we will for the + * remaining ones. + * + * If a given level has CPUs not found in it's parent, then + * we examine the PG and it's parent to see if either grouping + * is enumerated from potentially buggy sources. + * + * If one has less CPUs than the other, and contains CPUs + * not found in the parent, and it is an untrusted enumeration, + * then prune it. If both have the same number of CPUs, then + * prune the one that is untrusted. + * + * This process repeats until we have a concentric lineage, + * or we would have to prune out level derived from what we + * thought was a reliable source, in which case CMT scheduling + * is disabled all together. 
*/ - hint = CPU_PSEUDO_RANDOM() % tgt_size; - cpu = hint; - do { - newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu); - if (newcp->cpu_part == tp->t_cpupart && - newcp->cpu_dispatch_pri == -1) { - cp = newcp; - break; + if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)parent)) && + (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) { + pg_bad = pg; + } else if (PG_NUM_CPUS((pg_t *)pg) == + PG_NUM_CPUS((pg_t *)parent)) { + if (PG_CMT_HW_SUSPECT(((pghw_t *)parent)->pghw_hw)) { + pg_bad = parent; + } else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) { + pg_bad = pg; + } + } + if (pg_bad) { + if (pg_cmt_prune(pg_bad, lineage, sz) == 0) { + cmt_lineage_status = CMT_LINEAGE_REPAIRED; + goto revalidate; } - if (++cpu == tgt_size) - cpu = 0; - } while (cpu != hint); + } + /*FALLTHROUGH*/ + default: + /* + * If we're here, something has gone wrong in trying to + * recover from a illegal PG hierarchy, or we've encountered + * a validation error for which we don't know how to recover. + * In this case, disable CMT scheduling all together. + */ + pg_cmt_disable(); + cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE; + return (-1); } - - return (cp); + return (0); }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/disp/cmt_policy.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,229 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/cpupart.h> +#include <sys/cmn_err.h> +#include <sys/disp.h> +#include <sys/group.h> +#include <sys/bitset.h> +#include <sys/lgrp.h> +#include <sys/cmt.h> + +/* + * CMT dispatcher policies + * + * This file implements CMT dispatching policies using Processor Groups. + * + * The scheduler/dispatcher leverages knowledge of the performance + * relevant CMT sharing relationships existing between CPUs to implement + * load balancing, and coalescence thread placement policies. + * + * Load balancing policy seeks to improve performance by minimizing + * contention over shared processor resources / facilities. Coalescence + * policies improve resource utilization and ultimately power efficiency. 
+ * + * On NUMA systems, the dispatcher will generally perform load balancing and + * coalescence within (and not across) lgroups. This is because there isn't + * much sense in trying to correct an imbalance by sending a thread outside + * of its home, if it would attempt to return home a short while later. + * The dispatcher will implement CMT policy across lgroups however, if + * it can do so with a thread homed to the root lgroup, since root homed + * threads have no lgroup affinity. + */ + +/* + * Return non-zero if, given the policy, we should migrate from running + * somewhere "here" to somewhere "there". + */ +static int +cmt_should_migrate(pg_cmt_t *here, pg_cmt_t *there, pg_cmt_policy_t policy, + int self) +{ + uint32_t here_util, there_util; + + here_util = here->cmt_utilization; + there_util = there->cmt_utilization; + + /* + * This assumes that curthread's utilization is "1" + */ + if (self && bitset_in_set(&here->cmt_cpus_actv_set, CPU->cpu_seqid)) + here_util--; /* Ignore curthread's effect */ + + /* + * Load balancing and coalescence are conflicting policies + */ + ASSERT((policy & (CMT_BALANCE|CMT_COALESCE)) != + (CMT_BALANCE|CMT_COALESCE)); + + if (policy & CMT_BALANCE) { + /* + * Balance utilization + * + * If the target is comparatively underutilized + * (either in an absolute sense, or scaled by capacity), + * then choose to balance. + */ + if ((here_util > there_util) || + (here_util == there_util && + (CMT_CAPACITY(there) > CMT_CAPACITY(here)))) { + return (1); + } + } else if (policy & CMT_COALESCE) { + /* + * Attempt to drive group utilization up to capacity + */ + if (there_util > here_util && + there_util < CMT_CAPACITY(there)) + return (1); + } + return (0); +} + +/* + * Perform multi-level CMT load balancing of running threads. + * + * tp is the thread being enqueued. + * cp is a hint CPU, against which CMT load balancing will be performed. + * + * Returns cp, or a CPU better than cp with respect to balancing + * running thread load. 
+ */ +cpu_t * +cmt_balance(kthread_t *tp, cpu_t *cp) +{ + int hint, i, cpu, nsiblings; + int self = 0; + group_t *cmt_pgs, *siblings; + pg_cmt_t *pg, *pg_tmp, *tpg = NULL; + int level = 0; + cpu_t *newcp; + extern cmt_lgrp_t *cmt_root; + + ASSERT(THREAD_LOCK_HELD(tp)); + + cmt_pgs = &cp->cpu_pg->cmt_pgs; + + if (GROUP_SIZE(cmt_pgs) == 0) + return (cp); /* nothing to do */ + + if (tp == curthread) + self = 1; + + /* + * Balance across siblings in the CPUs CMT lineage + * If the thread is homed to the root lgroup, perform + * top level balancing against other top level PGs + * in the system. Otherwise, start with the default + * top level siblings group, which is within the leaf lgroup + */ + pg = GROUP_ACCESS(cmt_pgs, level); + if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) + siblings = &cmt_root->cl_pgs; + else + siblings = pg->cmt_siblings; + + /* + * Traverse down the lineage until we find a level that needs + * balancing, or we get to the end. + */ + for (;;) { + nsiblings = GROUP_SIZE(siblings); /* self inclusive */ + if (nsiblings == 1) + goto next_level; + + hint = CPU_PSEUDO_RANDOM() % nsiblings; + + /* + * Find a balancing candidate from among our siblings + * "hint" is a hint for where to start looking + */ + i = hint; + do { + ASSERT(i < nsiblings); + pg_tmp = GROUP_ACCESS(siblings, i); + + /* + * The candidate must not be us, and must + * have some CPU resources in the thread's + * partition + */ + if (pg_tmp != pg && + bitset_in_set(&tp->t_cpupart->cp_cmt_pgs, + ((pg_t *)pg_tmp)->pg_id)) { + tpg = pg_tmp; + break; + } + + if (++i >= nsiblings) + i = 0; + } while (i != hint); + + if (!tpg) + goto next_level; /* no candidates at this level */ + + /* + * Decide if we should migrate from the current PG to a + * target PG given a policy + */ + if (cmt_should_migrate(pg, tpg, pg->cmt_policy, self)) + break; + tpg = NULL; + +next_level: + if (++level == GROUP_SIZE(cmt_pgs)) + break; + + pg = GROUP_ACCESS(cmt_pgs, level); + siblings = pg->cmt_siblings; + } + + if 
(tpg) { + uint_t tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv); + + /* + * Select an idle CPU from the target + */ + hint = CPU_PSEUDO_RANDOM() % tgt_size; + cpu = hint; + do { + newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu); + if (newcp->cpu_part == tp->t_cpupart && + newcp->cpu_dispatch_pri == -1) { + cp = newcp; + break; + } + if (++cpu == tgt_size) + cpu = 0; + } while (cpu != hint); + } + + return (cp); +}
--- a/usr/src/uts/common/disp/disp.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/disp/disp.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -890,11 +890,10 @@ cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; if (next != t) { - if (t == cp->cpu_idle_thread) { - PG_NRUN_UPDATE(cp, 1); - } else if (next == cp->cpu_idle_thread) { - PG_NRUN_UPDATE(cp, -1); - } + hrtime_t now; + + now = gethrtime_unscaled(); + pg_ev_thread_swtch(cp, now, t, next); /* * If t was previously in the TS_ONPROC state, @@ -904,7 +903,7 @@ * queue. */ if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) { - t->t_waitrq = gethrtime_unscaled(); + t->t_waitrq = now; } /* @@ -929,6 +928,8 @@ if (t->t_flag & T_INTR_THREAD) cpu_intr_swtch_exit(t); + pg_ev_thread_remain(cp, t); + DTRACE_SCHED(remain__cpu); TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end"); (void) spl0(); @@ -960,8 +961,7 @@ ASSERT(next != curthread); TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); - if (next == cpu->cpu_idle_thread) - PG_NRUN_UPDATE(cpu, -1); + pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next); restore_mstate(next); @@ -1055,6 +1055,7 @@ swtch_to(kthread_t *next) { cpu_t *cp = CPU; + hrtime_t now; TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); @@ -1065,8 +1066,8 @@ TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); - if (curthread == cp->cpu_idle_thread) - PG_NRUN_UPDATE(cp, 1); + now = gethrtime_unscaled(); + pg_ev_thread_swtch(cp, now, curthread, next); /* OK to steal anything left on run queue */ cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; @@ -1081,7 +1082,7 @@ * queue. 
*/ if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) { - curthread->t_waitrq = gethrtime_unscaled(); + curthread->t_waitrq = now; } /* restore next thread to previously running microstate */ @@ -1098,8 +1099,6 @@ */ } - - #define CPU_IDLING(pri) ((pri) == -1) static void
--- a/usr/src/uts/common/io/cpudrv.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/io/cpudrv.c Wed Feb 25 21:04:18 2009 -0800 @@ -43,7 +43,7 @@ #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/sdt.h> - +#include <sys/epm.h> #include <sys/machsystm.h> #include <sys/x_call.h> #include <sys/cpudrv_mach.h> @@ -110,23 +110,25 @@ /* * Function prototypes */ -static int cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp); -static void cpudrv_pm_free(cpudrv_devstate_t *cpudsp); -static int cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp); -static void cpudrv_pm_monitor_disp(void *arg); -static void cpudrv_pm_monitor(void *arg); +static int cpudrv_init(cpudrv_devstate_t *cpudsp); +static void cpudrv_free(cpudrv_devstate_t *cpudsp); +static int cpudrv_comp_create(cpudrv_devstate_t *cpudsp); +static void cpudrv_monitor_disp(void *arg); +static void cpudrv_monitor(void *arg); /* * Driver global variables */ uint_t cpudrv_debug = 0; void *cpudrv_state; -static uint_t cpudrv_pm_idle_hwm = CPUDRV_PM_IDLE_HWM; -static uint_t cpudrv_pm_idle_lwm = CPUDRV_PM_IDLE_LWM; -static uint_t cpudrv_pm_idle_buf_zone = CPUDRV_PM_IDLE_BUF_ZONE; -static uint_t cpudrv_pm_idle_bhwm_cnt_max = CPUDRV_PM_IDLE_BHWM_CNT_MAX; -static uint_t cpudrv_pm_idle_blwm_cnt_max = CPUDRV_PM_IDLE_BLWM_CNT_MAX; -static uint_t cpudrv_pm_user_hwm = CPUDRV_PM_USER_HWM; +static uint_t cpudrv_idle_hwm = CPUDRV_IDLE_HWM; +static uint_t cpudrv_idle_lwm = CPUDRV_IDLE_LWM; +static uint_t cpudrv_idle_buf_zone = CPUDRV_IDLE_BUF_ZONE; +static uint_t cpudrv_idle_bhwm_cnt_max = CPUDRV_IDLE_BHWM_CNT_MAX; +static uint_t cpudrv_idle_blwm_cnt_max = CPUDRV_IDLE_BLWM_CNT_MAX; +static uint_t cpudrv_user_hwm = CPUDRV_USER_HWM; + +boolean_t cpudrv_enabled = B_TRUE; /* * cpudrv_direct_pm allows user applications to directly control the @@ -154,13 +156,13 @@ * Arranges for the handler function to be called at the interval suitable * for current speed. 
*/ -#define CPUDRV_PM_MONITOR_INIT(cpudsp) { \ - if (CPUDRV_PM_POWER_ENABLED(cpudsp)) { \ +#define CPUDRV_MONITOR_INIT(cpudsp) { \ + if (cpudrv_is_enabled(cpudsp)) { \ ASSERT(mutex_owned(&(cpudsp)->lock)); \ (cpudsp)->cpudrv_pm.timeout_id = \ - timeout(cpudrv_pm_monitor_disp, \ + timeout(cpudrv_monitor_disp, \ (cpudsp), (((cpudsp)->cpudrv_pm.cur_spd == NULL) ? \ - CPUDRV_PM_QUANT_CNT_OTHR : \ + CPUDRV_QUANT_CNT_OTHR : \ (cpudsp)->cpudrv_pm.cur_spd->quant_cnt)); \ } \ } @@ -168,7 +170,7 @@ /* * Arranges for the handler function not to be called back. */ -#define CPUDRV_PM_MONITOR_FINI(cpudsp) { \ +#define CPUDRV_MONITOR_FINI(cpudsp) { \ timeout_id_t tmp_tid; \ ASSERT(mutex_owned(&(cpudsp)->lock)); \ tmp_tid = (cpudsp)->cpudrv_pm.timeout_id; \ @@ -203,7 +205,7 @@ /* * Callbacks used by the PPM driver. */ - CPUDRV_PM_SET_PPM_CALLBACKS(); + CPUDRV_SET_PPM_CALLBACKS(); return (error); } @@ -242,13 +244,13 @@ case DDI_ATTACH: DPRINTF(D_ATTACH, ("cpudrv_attach: instance %d: " "DDI_ATTACH called\n", instance)); - if (CPUDRV_PM_DISABLED()) + if (!cpudrv_is_enabled(NULL)) return (DDI_FAILURE); if (ddi_soft_state_zalloc(cpudrv_state, instance) != DDI_SUCCESS) { cmn_err(CE_WARN, "cpudrv_attach: instance %d: " "can't allocate state", instance); - CPUDRV_PM_DISABLE(); + cpudrv_enabled = B_FALSE; return (DDI_FAILURE); } if ((cpudsp = ddi_get_soft_state(cpudrv_state, instance)) == @@ -256,7 +258,7 @@ cmn_err(CE_WARN, "cpudrv_attach: instance %d: " "can't get state", instance); ddi_soft_state_free(cpudrv_state, instance); - CPUDRV_PM_DISABLE(); + cpudrv_enabled = B_FALSE; return (DDI_FAILURE); } cpudsp->dip = dip; @@ -264,36 +266,36 @@ /* * Find CPU number for this dev_info node. 
*/ - if (!cpudrv_pm_get_cpu_id(dip, &(cpudsp->cpu_id))) { + if (!cpudrv_get_cpu_id(dip, &(cpudsp->cpu_id))) { cmn_err(CE_WARN, "cpudrv_attach: instance %d: " "can't convert dip to cpu_id", instance); ddi_soft_state_free(cpudrv_state, instance); - CPUDRV_PM_DISABLE(); + cpudrv_enabled = B_FALSE; + return (DDI_FAILURE); + } + if (!cpudrv_mach_init(cpudsp)) { + cpudrv_enabled = B_FALSE; return (DDI_FAILURE); } - if (!cpudrv_mach_pm_init(cpudsp)) { - ddi_soft_state_free(cpudrv_state, instance); - CPUDRV_PM_DISABLE(); - return (DDI_FAILURE); - } + mutex_init(&cpudsp->lock, NULL, MUTEX_DRIVER, NULL); - if (CPUDRV_PM_POWER_ENABLED(cpudsp)) { - if (cpudrv_pm_init_power(cpudsp) != DDI_SUCCESS) { - CPUDRV_PM_DISABLE(); - cpudrv_pm_free(cpudsp); + if (cpudrv_is_enabled(cpudsp)) { + if (cpudrv_init(cpudsp) != DDI_SUCCESS) { + cpudrv_enabled = B_FALSE; + cpudrv_free(cpudsp); ddi_soft_state_free(cpudrv_state, instance); return (DDI_FAILURE); } - if (cpudrv_pm_comp_create(cpudsp) != DDI_SUCCESS) { - CPUDRV_PM_DISABLE(); - cpudrv_pm_free(cpudsp); + if (cpudrv_comp_create(cpudsp) != DDI_SUCCESS) { + cpudrv_enabled = B_FALSE; + cpudrv_free(cpudsp); ddi_soft_state_free(cpudrv_state, instance); return (DDI_FAILURE); } if (ddi_prop_update_string(DDI_DEV_T_NONE, dip, "pm-class", "CPU") != DDI_PROP_SUCCESS) { - CPUDRV_PM_DISABLE(); - cpudrv_pm_free(cpudsp); + cpudrv_enabled = B_FALSE; + cpudrv_free(cpudsp); ddi_soft_state_free(cpudrv_state, instance); return (DDI_FAILURE); } @@ -303,10 +305,10 @@ * activities. */ cpudsp->cpudrv_pm.tq = taskq_create_instance( - "cpudrv_pm_monitor", - ddi_get_instance(dip), CPUDRV_PM_TASKQ_THREADS, - (maxclsyspri - 1), CPUDRV_PM_TASKQ_MIN, - CPUDRV_PM_TASKQ_MAX, + "cpudrv_monitor", + ddi_get_instance(dip), CPUDRV_TASKQ_THREADS, + (maxclsyspri - 1), CPUDRV_TASKQ_MIN, + CPUDRV_TASKQ_MAX, TASKQ_PREPOPULATE|TASKQ_CPR_SAFE); mutex_init(&cpudsp->cpudrv_pm.timeout_lock, NULL, @@ -321,7 +323,7 @@ * is full speed for us. 
*/ /* - * We need to take the lock because cpudrv_pm_monitor() + * We need to take the lock because cpudrv_monitor() * will start running in parallel with attach(). */ mutex_enter(&cpudsp->lock); @@ -335,12 +337,12 @@ * unknown speed and moves CPU to top speed when it * has been initialized. */ - CPUDRV_PM_MONITOR_INIT(cpudsp); + CPUDRV_MONITOR_INIT(cpudsp); mutex_exit(&cpudsp->lock); } - CPUDRV_PM_INSTALL_MAX_CHANGE_HANDLER(cpudsp, dip); + CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpudsp); ddi_report_dev(dip); return (DDI_SUCCESS); @@ -355,7 +357,7 @@ /* * Nothing to do for resume, if not doing active PM. */ - if (!CPUDRV_PM_POWER_ENABLED(cpudsp)) + if (!cpudrv_is_enabled(cpudsp)) return (DDI_SUCCESS); mutex_enter(&cpudsp->lock); @@ -365,9 +367,9 @@ * that the needed speed is full speed for us. */ cpudsp->cpudrv_pm.cur_spd = NULL; - CPUDRV_PM_MONITOR_INIT(cpudsp); + CPUDRV_MONITOR_INIT(cpudsp); mutex_exit(&cpudsp->lock); - CPUDRV_PM_REDEFINE_TOPSPEED(dip); + CPUDRV_REDEFINE_TOPSPEED(dip); return (DDI_SUCCESS); default: @@ -409,7 +411,7 @@ /* * Nothing to do for suspend, if not doing active PM. 
*/ - if (!CPUDRV_PM_POWER_ENABLED(cpudsp)) + if (!cpudrv_is_enabled(cpudsp)) return (DDI_SUCCESS); /* @@ -427,18 +429,18 @@ DPRINTF(D_DETACH, ("cpudrv_detach: instance %d: DDI_SUSPEND - " "cur_spd %d, topspeed %d\n", instance, cpupm->cur_spd->pm_level, - CPUDRV_PM_TOPSPEED(cpupm)->pm_level)); + CPUDRV_TOPSPEED(cpupm)->pm_level)); - CPUDRV_PM_MONITOR_FINI(cpudsp); + CPUDRV_MONITOR_FINI(cpudsp); if (!cpudrv_direct_pm && (cpupm->cur_spd != - CPUDRV_PM_TOPSPEED(cpupm))) { + CPUDRV_TOPSPEED(cpupm))) { if (cpupm->pm_busycnt < 1) { - if ((pm_busy_component(dip, CPUDRV_PM_COMP_NUM) + if ((pm_busy_component(dip, CPUDRV_COMP_NUM) == DDI_SUCCESS)) { cpupm->pm_busycnt++; } else { - CPUDRV_PM_MONITOR_INIT(cpudsp); + CPUDRV_MONITOR_INIT(cpudsp); mutex_exit(&cpudsp->lock); cmn_err(CE_WARN, "cpudrv_detach: " "instance %d: can't busy CPU " @@ -447,16 +449,16 @@ } } mutex_exit(&cpudsp->lock); - if (pm_raise_power(dip, CPUDRV_PM_COMP_NUM, - CPUDRV_PM_TOPSPEED(cpupm)->pm_level) != + if (pm_raise_power(dip, CPUDRV_COMP_NUM, + CPUDRV_TOPSPEED(cpupm)->pm_level) != DDI_SUCCESS) { mutex_enter(&cpudsp->lock); - CPUDRV_PM_MONITOR_INIT(cpudsp); + CPUDRV_MONITOR_INIT(cpudsp); mutex_exit(&cpudsp->lock); cmn_err(CE_WARN, "cpudrv_detach: instance %d: " "can't raise CPU power level to %d", instance, - CPUDRV_PM_TOPSPEED(cpupm)->pm_level); + CPUDRV_TOPSPEED(cpupm)->pm_level); return (DDI_FAILURE); } else { return (DDI_SUCCESS); @@ -483,7 +485,7 @@ { int instance; cpudrv_devstate_t *cpudsp; - cpudrv_pm_t *cpupm; + cpudrv_pm_t *cpudrvpm; cpudrv_pm_spd_t *new_spd; boolean_t is_ready; int ret; @@ -492,14 +494,15 @@ DPRINTF(D_POWER, ("cpudrv_power: instance %d: level %d\n", instance, level)); + if ((cpudsp = ddi_get_soft_state(cpudrv_state, instance)) == NULL) { - cmn_err(CE_WARN, "cpudrv_power: instance %d: can't get state", - instance); + cmn_err(CE_WARN, "cpudrv_power: instance %d: can't " + "get state", instance); return (DDI_FAILURE); } mutex_enter(&cpudsp->lock); - cpupm = &(cpudsp->cpudrv_pm); + 
cpudrvpm = &(cpudsp->cpudrv_pm); /* * In normal operation, we fail if we are busy and request is @@ -507,21 +510,22 @@ * is in special direct pm mode. On x86, we also let this through * if the change is due to a request to govern the max speed. */ - if (!cpudrv_direct_pm && (cpupm->pm_busycnt >= 1) && - !cpudrv_pm_is_governor_thread(cpupm)) { - if ((cpupm->cur_spd != NULL) && - (level < cpupm->cur_spd->pm_level)) { + if (!cpudrv_direct_pm && (cpudrvpm->pm_busycnt >= 1) && + !cpudrv_is_governor_thread(cpudrvpm)) { + if ((cpudrvpm->cur_spd != NULL) && + (level < cpudrvpm->cur_spd->pm_level)) { mutex_exit(&cpudsp->lock); return (DDI_FAILURE); } } - for (new_spd = cpupm->head_spd; new_spd; new_spd = new_spd->down_spd) { + for (new_spd = cpudrvpm->head_spd; new_spd; new_spd = + new_spd->down_spd) { if (new_spd->pm_level == level) break; } if (!new_spd) { - CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm); + CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm); mutex_exit(&cpudsp->lock); cmn_err(CE_WARN, "cpudrv_power: instance %d: " "can't locate new CPU speed", instance); @@ -538,105 +542,66 @@ * That's because we don't know what the CPU domains look like * until all instances have been initialized. */ - is_ready = CPUDRV_PM_XCALL_IS_READY(cpudsp->cpu_id); + is_ready = CPUDRV_XCALL_IS_READY(cpudsp->cpu_id); if (!is_ready) { DPRINTF(D_POWER, ("cpudrv_power: instance %d: " "CPU not ready for x-calls\n", instance)); - } else if (!(is_ready = cpudrv_pm_power_ready())) { + } else if (!(is_ready = cpudrv_power_ready())) { DPRINTF(D_POWER, ("cpudrv_power: instance %d: " - "waiting for all CPUs to be power manageable\n", instance)); + "waiting for all CPUs to be power manageable\n", + instance)); } if (!is_ready) { - CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm); - mutex_exit(&cpudsp->lock); - return (DDI_FAILURE); - } - - /* - * Execute CPU specific routine on the requested CPU to change its - * speed to normal-speed/divisor. 
- */ - if ((ret = cpudrv_pm_change_speed(cpudsp, new_spd)) != DDI_SUCCESS) { - cmn_err(CE_WARN, "cpudrv_power: cpudrv_pm_change_speed() " - "return = %d", ret); + CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm); mutex_exit(&cpudsp->lock); return (DDI_FAILURE); } /* - * DTrace probe point for CPU speed change transition + * Execute CPU specific routine on the requested CPU to + * change its speed to normal-speed/divisor. */ - DTRACE_PROBE3(cpu__change__speed, cpudrv_devstate_t *, cpudsp, - cpudrv_pm_t *, cpupm, cpudrv_pm_spd_t *, new_spd); + if ((ret = cpudrv_change_speed(cpudsp, new_spd)) != DDI_SUCCESS) { + cmn_err(CE_WARN, "cpudrv_power: " + "cpudrv_change_speed() return = %d", ret); + mutex_exit(&cpudsp->lock); + return (DDI_FAILURE); + } /* * Reset idle threshold time for the new power level. */ - if ((cpupm->cur_spd != NULL) && (level < cpupm->cur_spd->pm_level)) { - if (pm_idle_component(dip, CPUDRV_PM_COMP_NUM) == + if ((cpudrvpm->cur_spd != NULL) && (level < + cpudrvpm->cur_spd->pm_level)) { + if (pm_idle_component(dip, CPUDRV_COMP_NUM) == DDI_SUCCESS) { - if (cpupm->pm_busycnt >= 1) - cpupm->pm_busycnt--; - } else - cmn_err(CE_WARN, "cpudrv_power: instance %d: can't " - "idle CPU component", ddi_get_instance(dip)); + if (cpudrvpm->pm_busycnt >= 1) + cpudrvpm->pm_busycnt--; + } else { + cmn_err(CE_WARN, "cpudrv_power: instance %d: " + "can't idle CPU component", + ddi_get_instance(dip)); + } } /* * Reset various parameters because we are now running at new speed. 
*/ - cpupm->lastquan_mstate[CMS_IDLE] = 0; - cpupm->lastquan_mstate[CMS_SYSTEM] = 0; - cpupm->lastquan_mstate[CMS_USER] = 0; - cpupm->lastquan_ticks = 0; - cpupm->cur_spd = new_spd; - CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm); + cpudrvpm->lastquan_mstate[CMS_IDLE] = 0; + cpudrvpm->lastquan_mstate[CMS_SYSTEM] = 0; + cpudrvpm->lastquan_mstate[CMS_USER] = 0; + cpudrvpm->lastquan_ticks = 0; + cpudrvpm->cur_spd = new_spd; + CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm); mutex_exit(&cpudsp->lock); return (DDI_SUCCESS); } /* - * Initialize the field that will be used for reporting - * the supported_frequencies_Hz cpu_info kstat. - */ -static void -set_supp_freqs(cpu_t *cp, cpudrv_pm_t *cpupm) -{ - char *supp_freqs; - char *sfptr; - uint64_t *speeds; - cpudrv_pm_spd_t *spd; - int i; -#define UINT64_MAX_STRING (sizeof ("18446744073709551615")) - - speeds = kmem_zalloc(cpupm->num_spd * sizeof (uint64_t), KM_SLEEP); - for (i = cpupm->num_spd - 1, spd = cpupm->head_spd; spd; - i--, spd = spd->down_spd) { - speeds[i] = - CPUDRV_PM_SPEED_HZ(cp->cpu_type_info.pi_clock, spd->speed); - } - - supp_freqs = kmem_zalloc((UINT64_MAX_STRING * cpupm->num_spd), - KM_SLEEP); - sfptr = supp_freqs; - for (i = 0; i < cpupm->num_spd; i++) { - if (i == cpupm->num_spd - 1) { - (void) sprintf(sfptr, "%"PRIu64, speeds[i]); - } else { - (void) sprintf(sfptr, "%"PRIu64":", speeds[i]); - sfptr = supp_freqs + strlen(supp_freqs); - } - } - cpu_set_supp_freqs(cp, supp_freqs); - kmem_free(supp_freqs, (UINT64_MAX_STRING * cpupm->num_spd)); - kmem_free(speeds, cpupm->num_spd * sizeof (uint64_t)); -} - -/* * Initialize power management data. 
*/ static int -cpudrv_pm_init_power(cpudrv_devstate_t *cpudsp) +cpudrv_init(cpudrv_devstate_t *cpudsp) { cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm); cpudrv_pm_spd_t *cur_spd; @@ -647,10 +612,10 @@ int user_cnt_percent; int i; - CPUDRV_PM_GET_SPEEDS(cpudsp, speeds, nspeeds); + CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds); if (nspeeds < 2) { /* Need at least two speeds to power manage */ - CPUDRV_PM_FREE_SPEEDS(speeds, nspeeds); + CPUDRV_FREE_SPEEDS(speeds, nspeeds); return (DDI_FAILURE); } cpupm->num_spd = nspeeds; @@ -685,15 +650,15 @@ cur_spd->speed = speeds[i]; if (i == 0) { /* normal speed */ cpupm->head_spd = cur_spd; - CPUDRV_PM_TOPSPEED(cpupm) = cur_spd; - cur_spd->quant_cnt = CPUDRV_PM_QUANT_CNT_NORMAL; + CPUDRV_TOPSPEED(cpupm) = cur_spd; + cur_spd->quant_cnt = CPUDRV_QUANT_CNT_NORMAL; cur_spd->idle_hwm = - (cpudrv_pm_idle_hwm * cur_spd->quant_cnt) / 100; + (cpudrv_idle_hwm * cur_spd->quant_cnt) / 100; /* can't speed anymore */ cur_spd->idle_lwm = 0; cur_spd->user_hwm = UINT_MAX; } else { - cur_spd->quant_cnt = CPUDRV_PM_QUANT_CNT_OTHR; + cur_spd->quant_cnt = CPUDRV_QUANT_CNT_OTHR; ASSERT(prev_spd != NULL); prev_spd->down_spd = cur_spd; cur_spd->up_spd = cpupm->head_spd; @@ -711,14 +676,14 @@ * that there is at least a buffer zone seperation * between the idle_lwm and idle_hwm values. */ - idle_cnt_percent = CPUDRV_PM_IDLE_CNT_PERCENT( - cpudrv_pm_idle_hwm, speeds, i); + idle_cnt_percent = CPUDRV_IDLE_CNT_PERCENT( + cpudrv_idle_hwm, speeds, i); idle_cnt_percent = max(idle_cnt_percent, - (cpudrv_pm_idle_lwm + cpudrv_pm_idle_buf_zone)); + (cpudrv_idle_lwm + cpudrv_idle_buf_zone)); cur_spd->idle_hwm = (idle_cnt_percent * cur_spd->quant_cnt) / 100; cur_spd->idle_lwm = - (cpudrv_pm_idle_lwm * cur_spd->quant_cnt) / 100; + (cpudrv_idle_lwm * cur_spd->quant_cnt) / 100; /* * The lwm for user threads are determined such that @@ -727,10 +692,10 @@ * user_hwm in the new speed. This is to prevent * the quick jump back up to higher speed. 
*/ - cur_spd->user_hwm = (cpudrv_pm_user_hwm * + cur_spd->user_hwm = (cpudrv_user_hwm * cur_spd->quant_cnt) / 100; - user_cnt_percent = CPUDRV_PM_USER_CNT_PERCENT( - cpudrv_pm_user_hwm, speeds, i); + user_cnt_percent = CPUDRV_USER_CNT_PERCENT( + cpudrv_user_hwm, speeds, i); prev_spd->user_lwm = (user_cnt_percent * prev_spd->quant_cnt) / 100; } @@ -740,11 +705,11 @@ cur_spd->idle_hwm = UINT_MAX; cur_spd->user_lwm = -1; #ifdef DEBUG - DPRINTF(D_PM_INIT, ("cpudrv_pm_init: instance %d: head_spd spd %d, " + DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: head_spd spd %d, " "num_spd %d\n", ddi_get_instance(cpudsp->dip), cpupm->head_spd->speed, cpupm->num_spd)); for (cur_spd = cpupm->head_spd; cur_spd; cur_spd = cur_spd->down_spd) { - DPRINTF(D_PM_INIT, ("cpudrv_pm_init: instance %d: speed %d, " + DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: speed %d, " "down_spd spd %d, idle_hwm %d, user_lwm %d, " "up_spd spd %d, idle_lwm %d, user_hwm %d, " "quant_cnt %d\n", ddi_get_instance(cpudsp->dip), @@ -756,7 +721,7 @@ cur_spd->quant_cnt)); } #endif /* DEBUG */ - CPUDRV_PM_FREE_SPEEDS(speeds, nspeeds); + CPUDRV_FREE_SPEEDS(speeds, nspeeds); return (DDI_SUCCESS); } @@ -764,7 +729,7 @@ * Free CPU power management data. */ static void -cpudrv_pm_free(cpudrv_devstate_t *cpudsp) +cpudrv_free(cpudrv_devstate_t *cpudsp) { cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm); cpudrv_pm_spd_t *cur_spd, *next_spd; @@ -776,14 +741,13 @@ cur_spd = next_spd; } bzero(cpupm, sizeof (cpudrv_pm_t)); - cpudrv_mach_pm_free(cpudsp); } /* * Create pm-components property. 
*/ static int -cpudrv_pm_comp_create(cpudrv_devstate_t *cpudsp) +cpudrv_comp_create(cpudrv_devstate_t *cpudsp) { cpudrv_pm_t *cpupm = &(cpudsp->cpudrv_pm); cpudrv_pm_spd_t *cur_spd; @@ -795,9 +759,9 @@ int result = DDI_FAILURE; pmc = kmem_zalloc((cpupm->num_spd + 1) * sizeof (char *), KM_SLEEP); - size = CPUDRV_PM_COMP_SIZE(); - if (cpupm->num_spd > CPUDRV_PM_COMP_MAX_VAL) { - cmn_err(CE_WARN, "cpudrv_pm_comp_create: instance %d: " + size = CPUDRV_COMP_SIZE(); + if (cpupm->num_spd > CPUDRV_COMP_MAX_VAL) { + cmn_err(CE_WARN, "cpudrv_comp_create: instance %d: " "number of speeds exceeded limits", ddi_get_instance(cpudsp->dip)); kmem_free(pmc, (cpupm->num_spd + 1) * sizeof (char *)); @@ -808,9 +772,9 @@ i--, cur_spd = cur_spd->down_spd) { cur_spd->pm_level = i; pmc[i] = kmem_zalloc((size * sizeof (char)), KM_SLEEP); - comp_spd = CPUDRV_PM_COMP_SPEED(cpupm, cur_spd); - if (comp_spd > CPUDRV_PM_COMP_MAX_VAL) { - cmn_err(CE_WARN, "cpudrv_pm_comp_create: " + comp_spd = CPUDRV_COMP_SPEED(cpupm, cur_spd); + if (comp_spd > CPUDRV_COMP_MAX_VAL) { + cmn_err(CE_WARN, "cpudrv_comp_create: " "instance %d: speed exceeded limits", ddi_get_instance(cpudsp->dip)); for (j = cpupm->num_spd; j >= i; j--) { @@ -820,14 +784,14 @@ sizeof (char *)); return (result); } - CPUDRV_PM_COMP_SPRINT(pmc[i], cpupm, cur_spd, comp_spd) - DPRINTF(D_PM_COMP_CREATE, ("cpudrv_pm_comp_create: " + CPUDRV_COMP_SPRINT(pmc[i], cpupm, cur_spd, comp_spd) + DPRINTF(D_PM_COMP_CREATE, ("cpudrv_comp_create: " "instance %d: pm-components power level %d string '%s'\n", ddi_get_instance(cpudsp->dip), i, pmc[i])); } pmc[0] = kmem_zalloc(sizeof (name), KM_SLEEP); (void) strcat(pmc[0], name); - DPRINTF(D_PM_COMP_CREATE, ("cpudrv_pm_comp_create: instance %d: " + DPRINTF(D_PM_COMP_CREATE, ("cpudrv_comp_create: instance %d: " "pm-components component name '%s'\n", ddi_get_instance(cpudsp->dip), pmc[0])); @@ -835,7 +799,7 @@ "pm-components", pmc, cpupm->num_spd + 1) == DDI_PROP_SUCCESS) { result = DDI_SUCCESS; } else { - 
cmn_err(CE_WARN, "cpudrv_pm_comp_create: instance %d: " + cmn_err(CE_WARN, "cpudrv_comp_create: instance %d: " "can't create pm-components property", ddi_get_instance(cpudsp->dip)); } @@ -851,16 +815,16 @@ /* * Mark a component idle. */ -#define CPUDRV_PM_MONITOR_PM_IDLE_COMP(dip, cpupm) { \ +#define CPUDRV_MONITOR_PM_IDLE_COMP(dip, cpupm) { \ if ((cpupm)->pm_busycnt >= 1) { \ - if (pm_idle_component((dip), CPUDRV_PM_COMP_NUM) == \ + if (pm_idle_component((dip), CPUDRV_COMP_NUM) == \ DDI_SUCCESS) { \ - DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: " \ + DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: " \ "instance %d: pm_idle_component called\n", \ ddi_get_instance((dip)))); \ (cpupm)->pm_busycnt--; \ } else { \ - cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: " \ + cmn_err(CE_WARN, "cpudrv_monitor: instance %d: " \ "can't idle CPU component", \ ddi_get_instance((dip))); \ } \ @@ -870,16 +834,16 @@ /* * Marks a component busy in both PM framework and driver state structure. */ -#define CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm) { \ +#define CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm) { \ if ((cpupm)->pm_busycnt < 1) { \ - if (pm_busy_component((dip), CPUDRV_PM_COMP_NUM) == \ + if (pm_busy_component((dip), CPUDRV_COMP_NUM) == \ DDI_SUCCESS) { \ - DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: " \ + DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: " \ "instance %d: pm_busy_component called\n", \ ddi_get_instance((dip)))); \ (cpupm)->pm_busycnt++; \ } else { \ - cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: " \ + cmn_err(CE_WARN, "cpudrv_monitor: instance %d: " \ "can't busy CPU component", \ ddi_get_instance((dip))); \ } \ @@ -889,19 +853,19 @@ /* * Marks a component busy and calls pm_raise_power(). 
*/ -#define CPUDRV_PM_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, new_level) { \ +#define CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, new_level) { \ /* \ * Mark driver and PM framework busy first so framework doesn't try \ * to bring CPU to lower speed when we need to be at higher speed. \ */ \ - CPUDRV_PM_MONITOR_PM_BUSY_COMP((dip), (cpupm)); \ + CPUDRV_MONITOR_PM_BUSY_COMP((dip), (cpupm)); \ mutex_exit(&(cpudsp)->lock); \ - DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: " \ + DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: " \ "pm_raise_power called to %d\n", ddi_get_instance((dip)), \ (new_level))); \ - if (pm_raise_power((dip), CPUDRV_PM_COMP_NUM, (new_level)) != \ + if (pm_raise_power((dip), CPUDRV_COMP_NUM, (new_level)) != \ DDI_SUCCESS) { \ - cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: can't " \ + cmn_err(CE_WARN, "cpudrv_monitor: instance %d: can't " \ "raise CPU power level", ddi_get_instance((dip))); \ } \ mutex_enter(&(cpudsp)->lock); \ @@ -913,7 +877,7 @@ * We dispatch a taskq to do that job. */ static void -cpudrv_pm_monitor_disp(void *arg) +cpudrv_monitor_disp(void *arg) { cpudrv_devstate_t *cpudsp = (cpudrv_devstate_t *)arg; @@ -922,13 +886,13 @@ * The queue should be empty at this time. 
*/ mutex_enter(&cpudsp->cpudrv_pm.timeout_lock); - if (!taskq_dispatch(cpudsp->cpudrv_pm.tq, cpudrv_pm_monitor, arg, + if (!taskq_dispatch(cpudsp->cpudrv_pm.tq, cpudrv_monitor, arg, TQ_NOSLEEP)) { mutex_exit(&cpudsp->cpudrv_pm.timeout_lock); - DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor_disp: failed to " - "dispatch the cpudrv_pm_monitor taskq\n")); + DPRINTF(D_PM_MONITOR, ("cpudrv_monitor_disp: failed to " + "dispatch the cpudrv_monitor taskq\n")); mutex_enter(&cpudsp->lock); - CPUDRV_PM_MONITOR_INIT(cpudsp); + CPUDRV_MONITOR_INIT(cpudsp); mutex_exit(&cpudsp->lock); return; } @@ -940,17 +904,16 @@ * Monitors each CPU for the amount of time idle thread was running in the * last quantum and arranges for the CPU to go to the lower or higher speed. * Called at the time interval appropriate for the current speed. The - * time interval for normal speed is CPUDRV_PM_QUANT_CNT_NORMAL. The time + * time interval for normal speed is CPUDRV_QUANT_CNT_NORMAL. The time * interval for other speeds (including unknown speed) is - * CPUDRV_PM_QUANT_CNT_OTHR. + * CPUDRV_QUANT_CNT_OTHR. */ static void -cpudrv_pm_monitor(void *arg) +cpudrv_monitor(void *arg) { cpudrv_devstate_t *cpudsp = (cpudrv_devstate_t *)arg; cpudrv_pm_t *cpupm; cpudrv_pm_spd_t *cur_spd, *new_spd; - cpu_t *cp; dev_info_t *dip; uint_t idle_cnt, user_cnt, system_cnt; clock_t ticks; @@ -984,12 +947,12 @@ * That's because we don't know what the CPU domains look like * until all instances have been initialized. 
*/ - is_ready = CPUDRV_PM_XCALL_IS_READY(cpudsp->cpu_id); + is_ready = CPUDRV_XCALL_IS_READY(cpudsp->cpu_id); if (!is_ready) { - DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: " + DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: " "CPU not ready for x-calls\n", ddi_get_instance(dip))); - } else if (!(is_ready = cpudrv_pm_power_ready())) { - DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: " + } else if (!(is_ready = cpudrv_power_ready())) { + DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: " "waiting for all CPUs to be power manageable\n", ddi_get_instance(dip))); } @@ -998,8 +961,8 @@ * Make sure that we are busy so that framework doesn't * try to bring us down in this situation. */ - CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm); - CPUDRV_PM_MONITOR_INIT(cpudsp); + CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm); + CPUDRV_MONITOR_INIT(cpudsp); mutex_exit(&cpudsp->lock); goto do_return; } @@ -1008,35 +971,36 @@ * Make sure that we are still not at unknown power level. */ if (cur_spd == NULL) { - DPRINTF(D_PM_MONITOR, ("cpudrv_pm_monitor: instance %d: " + DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: " "cur_spd is unknown\n", ddi_get_instance(dip))); - CPUDRV_PM_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, - CPUDRV_PM_TOPSPEED(cpupm)->pm_level); + CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, + CPUDRV_TOPSPEED(cpupm)->pm_level); /* * We just changed the speed. Wait till at least next * call to this routine before proceeding ahead. 
*/ - CPUDRV_PM_MONITOR_INIT(cpudsp); + CPUDRV_MONITOR_INIT(cpudsp); mutex_exit(&cpudsp->lock); goto do_return; } mutex_enter(&cpu_lock); - if ((cp = cpu_get(cpudsp->cpu_id)) == NULL) { + if (cpudsp->cp == NULL && + (cpudsp->cp = cpu_get(cpudsp->cpu_id)) == NULL) { mutex_exit(&cpu_lock); - CPUDRV_PM_MONITOR_INIT(cpudsp); + CPUDRV_MONITOR_INIT(cpudsp); mutex_exit(&cpudsp->lock); - cmn_err(CE_WARN, "cpudrv_pm_monitor: instance %d: can't get " + cmn_err(CE_WARN, "cpudrv_monitor: instance %d: can't get " "cpu_t", ddi_get_instance(dip)); goto do_return; } if (!cpupm->pm_started) { cpupm->pm_started = B_TRUE; - set_supp_freqs(cp, cpupm); + cpudrv_set_supp_freqs(cpudsp); } - get_cpu_mstate(cp, msnsecs); + get_cpu_mstate(cpudsp->cp, msnsecs); GET_CPU_MSTATE_CNT(CMS_IDLE, idle_cnt); GET_CPU_MSTATE_CNT(CMS_USER, user_cnt); GET_CPU_MSTATE_CNT(CMS_SYSTEM, system_cnt); @@ -1048,7 +1012,7 @@ if (cpupm->lastquan_ticks == 0) { cpupm->lastquan_ticks = NSEC_TO_TICK(gethrtime()); mutex_exit(&cpu_lock); - CPUDRV_PM_MONITOR_INIT(cpudsp); + CPUDRV_MONITOR_INIT(cpudsp); mutex_exit(&cpudsp->lock); goto do_return; } @@ -1071,10 +1035,10 @@ * Time taken between recording the current counts and * arranging the next call of this routine is an error in our * calculation. We minimize the error by calling - * CPUDRV_PM_MONITOR_INIT() here instead of end of this routine. + * CPUDRV_MONITOR_INIT() here instead of end of this routine. */ - CPUDRV_PM_MONITOR_INIT(cpudsp); - DPRINTF(D_PM_MONITOR_VERBOSE, ("cpudrv_pm_monitor: instance %d: " + CPUDRV_MONITOR_INIT(cpudsp); + DPRINTF(D_PM_MONITOR_VERBOSE, ("cpudrv_monitor: instance %d: " "idle count %d, user count %d, system count %d, pm_level %d, " "pm_busycnt %d\n", ddi_get_instance(dip), idle_cnt, user_cnt, system_cnt, cur_spd->pm_level, cpupm->pm_busycnt)); @@ -1089,7 +1053,7 @@ * DPRINTFs changes the timing. 
*/ if (tick_cnt > cur_spd->quant_cnt) { - DPRINTF(D_PM_MONITOR_DELAY, ("cpudrv_pm_monitor: instance %d: " + DPRINTF(D_PM_MONITOR_DELAY, ("cpudrv_monitor: instance %d: " "tick count %d > quantum_count %u\n", ddi_get_instance(dip), tick_cnt, cur_spd->quant_cnt)); } @@ -1102,7 +1066,7 @@ user_cnt = (user_cnt * cur_spd->quant_cnt) / tick_cnt; if ((user_cnt > cur_spd->user_hwm) || (idle_cnt < cur_spd->idle_lwm && - cur_spd->idle_blwm_cnt >= cpudrv_pm_idle_blwm_cnt_max)) { + cur_spd->idle_blwm_cnt >= cpudrv_idle_blwm_cnt_max)) { cur_spd->idle_blwm_cnt = 0; cur_spd->idle_bhwm_cnt = 0; /* @@ -1111,21 +1075,21 @@ * at the current speed. */ if (cur_spd == cur_spd->up_spd || cpudrv_direct_pm) { - CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm); + CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm); } else { new_spd = cur_spd->up_spd; - CPUDRV_PM_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, + CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, new_spd->pm_level); } } else if ((user_cnt <= cur_spd->user_lwm) && - (idle_cnt >= cur_spd->idle_hwm) || !CPU_ACTIVE(cp)) { + (idle_cnt >= cur_spd->idle_hwm) || !CPU_ACTIVE(cpudsp->cp)) { cur_spd->idle_blwm_cnt = 0; cur_spd->idle_bhwm_cnt = 0; /* * Arrange to go to next lower speed by informing our idle * status to the power management framework. */ - CPUDRV_PM_MONITOR_PM_IDLE_COMP(dip, cpupm); + CPUDRV_MONITOR_PM_IDLE_COMP(dip, cpupm); } else { /* * If we are between the idle water marks and have not @@ -1134,7 +1098,7 @@ */ if ((idle_cnt < cur_spd->idle_hwm) && (idle_cnt >= cur_spd->idle_lwm) && - (cur_spd->idle_bhwm_cnt < cpudrv_pm_idle_bhwm_cnt_max)) { + (cur_spd->idle_bhwm_cnt < cpudrv_idle_bhwm_cnt_max)) { cur_spd->idle_blwm_cnt = 0; cur_spd->idle_bhwm_cnt++; mutex_exit(&cpudsp->lock); @@ -1147,7 +1111,7 @@ /* * Arranges to stay at the current speed. */ - CPUDRV_PM_MONITOR_PM_BUSY_COMP(dip, cpupm); + CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm); } mutex_exit(&cpudsp->lock); do_return:
--- a/usr/src/uts/common/io/pm.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/io/pm.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,11 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - /* * pm This driver now only handles the ioctl interface. The scanning * and policy stuff now lives in common/os/sunpm.c. @@ -33,6 +32,7 @@ #include <sys/types.h> #include <sys/errno.h> #include <sys/modctl.h> +#include <sys/callb.h> /* callback registration for cpu_deep_idle */ #include <sys/conf.h> /* driver flags and functions */ #include <sys/open.h> /* OTYP_CHR definition */ #include <sys/stat.h> /* S_IFCHR definition */ @@ -53,6 +53,7 @@ #include <sys/note.h> #include <sys/taskq.h> #include <sys/policy.h> +#include <sys/cpu_pm.h> /* * Minor number is instance<<8 + clone minor from range 1-254; (0 reserved @@ -73,6 +74,7 @@ extern kmutex_t pm_clone_lock; /* protects pm_clones array */ extern int autopm_enabled; extern pm_cpupm_t cpupm; +extern pm_cpupm_t cpupm_default_mode; extern int pm_default_idle_threshold; extern int pm_system_idle_threshold; extern int pm_cpu_idle_threshold; @@ -444,6 +446,10 @@ {PM_ADD_DEPENDENT_PROPERTY, "PM_ADD_DEPENDENT_PROPERTY", 1, PM_REQ, INWHO | INDATASTRING, NODIP, DEP, SU}, {PM_START_CPUPM, "PM_START_CPUPM", 1, NOSTRUCT, 0, 0, 0, SU}, + {PM_START_CPUPM_EV, "PM_START_CPUPM_EV", 1, NOSTRUCT, 0, + 0, 0, SU}, + {PM_START_CPUPM_POLL, "PM_START_CPUPM_POLL", 1, NOSTRUCT, 0, + 0, 0, SU}, {PM_STOP_CPUPM, "PM_STOP_CPUPM", 1, NOSTRUCT, 0, 0, 0, SU}, {PM_GET_CPU_THRESHOLD, "PM_GET_CPU_THRESHOLD", 1, NOSTRUCT}, {PM_SET_CPU_THRESHOLD, "PM_SET_CPU_THRESHOLD", 1, NOSTRUCT, @@ -457,6 +463,12 @@ {PM_SEARCH_LIST, "PM_SEARCH_LIST", 1, PM_SRCH, 0, 0, 0, SU}, {PM_GET_CMD_NAME, "PM_GET_CMD_NAME", 1, PM_REQ, INDATAOUT, NODIP, NODEP, 0}, + {PM_DISABLE_CPU_DEEP_IDLE, "PM_DISABLE_CPU_DEEP_IDLE", 1, NOSTRUCT, 0, + 0, 0, 
SU}, + {PM_ENABLE_CPU_DEEP_IDLE, "PM_START_CPU_DEEP_IDLE", 1, NOSTRUCT, 0, + 0, 0, SU}, + {PM_DEFAULT_CPU_DEEP_IDLE, "PM_DFLT_CPU_DEEP_IDLE", 1, NOSTRUCT, 0, + 0, 0, SU}, {0, NULL} }; @@ -500,16 +512,17 @@ switch (cmd) { case PM_START_CPUPM: + case PM_START_CPUPM_POLL: if (!PM_ISCPU(dip)) return (DDI_WALK_CONTINUE); mutex_enter(&pm_scan_lock); - if (!PM_CPUPM_DISABLED) + if (!PM_CPUPM_DISABLED && !PM_EVENT_CPUPM) pm_scan_init(dip); mutex_exit(&pm_scan_lock); break; case PM_START_PM: mutex_enter(&pm_scan_lock); - if (PM_ISCPU(dip) && PM_CPUPM_DISABLED) { + if (PM_ISCPU(dip) && (PM_CPUPM_DISABLED || PM_EVENT_CPUPM)) { mutex_exit(&pm_scan_lock); return (DDI_WALK_CONTINUE); } @@ -552,7 +565,7 @@ * stop them as part of PM_STOP_PM. Only stop them as part of * PM_STOP_CPUPM and PM_RESET_PM. */ - if (PM_ISCPU(dip) && PM_CPUPM_ENABLED) + if (PM_ISCPU(dip) && PM_POLLING_CPUPM) return (DDI_WALK_CONTINUE); break; case PM_STOP_CPUPM: @@ -2662,22 +2675,74 @@ switch (cmd) { case PM_START_PM: case PM_START_CPUPM: + case PM_START_CPUPM_EV: + case PM_START_CPUPM_POLL: { + pm_cpupm_t new_mode = PM_CPUPM_NOTSET; + pm_cpupm_t old_mode = PM_CPUPM_NOTSET; + int r; + mutex_enter(&pm_scan_lock); if ((cmd == PM_START_PM && autopm_enabled) || - (cmd == PM_START_CPUPM && PM_CPUPM_ENABLED)) { + (cmd == PM_START_CPUPM && PM_DEFAULT_CPUPM) || + (cmd == PM_START_CPUPM_EV && PM_EVENT_CPUPM) || + (cmd == PM_START_CPUPM_POLL && PM_POLLING_CPUPM)) { mutex_exit(&pm_scan_lock); - PMD(PMD_ERROR, ("ioctl: %s: EBUSY\n", - cmdstr)) + PMD(PMD_ERROR, ("ioctl: %s: EBUSY\n", cmdstr)) ret = EBUSY; break; } - if (cmd == PM_START_PM) + + if (cmd == PM_START_PM) { autopm_enabled = 1; - else - cpupm = PM_CPUPM_ENABLE; + } else if (cmd == PM_START_CPUPM) { + old_mode = cpupm; + new_mode = cpupm = cpupm_default_mode; + } else if (cmd == PM_START_CPUPM_EV) { + old_mode = cpupm; + new_mode = cpupm = PM_CPUPM_EVENT; + } else if (cmd == PM_START_CPUPM_POLL) { + old_mode = cpupm; + new_mode = cpupm = PM_CPUPM_POLLING; + } 
+ mutex_exit(&pm_scan_lock); - ddi_walk_devs(ddi_root_node(), pm_start_pm_walk, &cmd); + + /* + * If we are changing CPUPM modes, and it is active, + * then stop it from operating in the old mode. + */ + if (old_mode == PM_CPUPM_POLLING) { + int c = PM_STOP_CPUPM; + ddi_walk_devs(ddi_root_node(), pm_stop_pm_walk, + &c); + } else if (old_mode == PM_CPUPM_EVENT) { + r = cpupm_set_policy(CPUPM_POLICY_DISABLED); + + /* + * Disabling CPUPM policy should always + * succeed + */ + ASSERT(r == 0); + } + + /* + * If we are changing to event based CPUPM, enable it. + * In the event it's not supported, fall back to + * polling based CPUPM. + */ + if (new_mode == PM_CPUPM_EVENT && + cpupm_set_policy(CPUPM_POLICY_ELASTIC) < 0) { + mutex_enter(&pm_scan_lock); + new_mode = cpupm = PM_CPUPM_POLLING; + cmd = PM_START_CPUPM_POLL; + mutex_exit(&pm_scan_lock); + } + if (new_mode == PM_CPUPM_POLLING || + cmd == PM_START_PM) { + ddi_walk_devs(ddi_root_node(), pm_start_pm_walk, + &cmd); + } ret = 0; break; } @@ -2687,6 +2752,7 @@ case PM_STOP_CPUPM: { extern void pm_discard_thresholds(void); + pm_cpupm_t old_mode = PM_CPUPM_NOTSET; mutex_enter(&pm_scan_lock); if ((cmd == PM_STOP_PM && !autopm_enabled) || @@ -2697,22 +2763,30 @@ ret = EINVAL; break; } + if (cmd == PM_STOP_PM) { autopm_enabled = 0; pm_S3_enabled = 0; autoS3_enabled = 0; } else if (cmd == PM_STOP_CPUPM) { + old_mode = cpupm; cpupm = PM_CPUPM_DISABLE; } else { autopm_enabled = 0; autoS3_enabled = 0; + old_mode = cpupm; cpupm = PM_CPUPM_NOTSET; } mutex_exit(&pm_scan_lock); /* * bring devices to full power level, stop scan + * If CPUPM was operating in event driven mode, disable + * that. 
*/ + if (old_mode == PM_CPUPM_EVENT) { + (void) cpupm_set_policy(CPUPM_POLICY_DISABLED); + } ddi_walk_devs(ddi_root_node(), pm_stop_pm_walk, &cmd); ret = 0; if (cmd == PM_STOP_PM || cmd == PM_STOP_CPUPM) @@ -2796,7 +2870,7 @@ case PM_GET_CPUPM_STATE: { - if (PM_CPUPM_ENABLED) + if (PM_POLLING_CPUPM || PM_EVENT_CPUPM) *rval_p = PM_CPU_PM_ENABLED; else if (PM_CPUPM_DISABLED) *rval_p = PM_CPU_PM_DISABLED; @@ -2881,6 +2955,34 @@ break; } + case PM_ENABLE_CPU_DEEP_IDLE: + { + if (callb_execute_class(CB_CL_CPU_DEEP_IDLE, + PM_ENABLE_CPU_DEEP_IDLE) == NULL) + ret = 0; + else + ret = EBUSY; + break; + } + case PM_DISABLE_CPU_DEEP_IDLE: + { + if (callb_execute_class(CB_CL_CPU_DEEP_IDLE, + PM_DISABLE_CPU_DEEP_IDLE) == NULL) + ret = 0; + else + ret = EINVAL; + break; + } + case PM_DEFAULT_CPU_DEEP_IDLE: + { + if (callb_execute_class(CB_CL_CPU_DEEP_IDLE, + PM_DEFAULT_CPU_DEEP_IDLE) == NULL) + ret = 0; + else + ret = EBUSY; + break; + } + default: /* * Internal error, invalid ioctl description @@ -2896,7 +2998,7 @@ break; } - default: +default: /* * Internal error, invalid ioctl description * force debug entry even if pm_debug not set
--- a/usr/src/uts/common/os/cpu.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/os/cpu.c Wed Feb 25 21:04:18 2009 -0800 @@ -56,6 +56,7 @@ #include <sys/msacct.h> #include <sys/time.h> #include <sys/archsystm.h> +#include <sys/sdt.h> #if defined(__x86) || defined(__amd64) #include <sys/x86_archext.h> #endif @@ -2163,6 +2164,8 @@ kstat_named_t ci_pkg_core_id; kstat_named_t ci_ncpuperchip; kstat_named_t ci_ncoreperchip; + kstat_named_t ci_max_cstates; + kstat_named_t ci_curr_cstate; #endif } cpu_info_template = { { "state", KSTAT_DATA_CHAR }, @@ -2189,6 +2192,8 @@ { "pkg_core_id", KSTAT_DATA_LONG }, { "ncpu_per_chip", KSTAT_DATA_INT32 }, { "ncore_per_chip", KSTAT_DATA_INT32 }, + { "supported_max_cstates", KSTAT_DATA_INT32 }, + { "current_cstate", KSTAT_DATA_INT32 }, #endif }; @@ -2258,6 +2263,8 @@ cpu_info_template.ci_ncoreperchip.value.l = cpuid_get_ncore_per_chip(cp); cpu_info_template.ci_pkg_core_id.value.l = cpuid_get_pkgcoreid(cp); + cpu_info_template.ci_max_cstates.value.l = cp->cpu_m.max_cstates; + cpu_info_template.ci_curr_cstate.value.l = cp->cpu_m.curr_cstate; #endif return (0); @@ -2960,6 +2967,25 @@ } /* + * Indicate the current CPU's clock freqency (in Hz). + * The calling context must be such that CPU references are safe. + */ +void +cpu_set_curr_clock(uint64_t new_clk) +{ + uint64_t old_clk; + + old_clk = CPU->cpu_curr_clock; + CPU->cpu_curr_clock = new_clk; + + /* + * The cpu-change-speed DTrace probe exports the frequency in Hz + */ + DTRACE_PROBE3(cpu__change__speed, processorid_t, CPU->cpu_id, + uint64_t, old_clk, uint64_t, new_clk); +} + +/* * processor_info(2) and p_online(2) status support functions * The constants returned by the cpu_get_state() and cpu_get_state_str() are * for use in communicating processor state information to userland. Kernel
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/os/cpu_pm.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,840 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/cpu_pm.h> +#include <sys/cmn_err.h> +#include <sys/sdt.h> + +/* + * Solaris Event Based CPU Power Manager + * + * This file implements platform independent event based CPU power management. + * When CPUs are configured into the system, the CMT scheduling subsystem will + * query the platform to determine if the CPU belongs to any power management + * domains. That is, sets of CPUs that share power management states. + * + * Active Power Management domains represent a group of CPUs across which the + * Operating System can request speed changes (which may in turn result + * in voltage changes). This allows the operating system to trade off + * performance for power savings. + * + * Idle Power Management domains can enter power savings states when they are + * unutilized. 
These states allow the Operating System to trade off power + * for performance (in the form of latency to transition from the idle state + * to an active one). + * + * For each active and idle power domain the CMT subsystem instantiates, a + * cpupm_domain_t structure is created. As the dispatcher schedules threads + * to run on the system's CPUs, it will also track the utilization of the + * enumerated power domains. Significant changes in utilization will result + * in the dispatcher sending the power manager events that relate to the + * utilization of the power domain. The power manager recieves the events, + * and in the context of the policy objectives in force, may decide to request + * the domain's power/performance state be changed. + * + * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power + * manager will request the CPUs in the domain run at their fastest (and most + * power consuming) state. When the domain becomes idle (utilization at zero), + * the power manager will request that the CPUs run at a speed that saves the + * most power. + * + * The advantage of this scheme, is that the CPU power manager working with the + * dispatcher can be extremely responsive to changes in utilization. Optimizing + * for performance in the presence of utilization, and power savings in the + * presence of idleness. Such close collaboration with the dispatcher has other + * benefits that will play out in the form of more sophisticated power / + * performance policy in the near future. + * + * Avoiding state thrashing in the presence of transient periods of utilization + * and idleness while still being responsive to non-transient periods is key. + * The power manager implmeents several "governors" that are used to throttle + * state transitions when a significant amount of transient idle or transient + * work is detected. + * + * Kernel background activity (e.g. taskq threads) are by far the most common + * form of transient utilization. 
Ungoverned in the face of this utililzation, + * hundreds of state transitions per second would result on an idle system. + * + * Transient idleness is common when a thread briefly yields the CPU to + * wait for an event elsewhere in the system. Where the idle period is short + * enough, the overhead associated with making the state transition doesn't + * justify the power savings. + */ + +static cpupm_domain_t *cpupm_domains = NULL; + +/* + * Uninitialized state of CPU power management is disabled + */ +cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED; + +/* + * Periods of utilization lasting less than this time interval are characterized + * as transient. State changes associated with transient work are considered + * to be mispredicted. That is, it's not worth raising and lower power states + * where the utilization lasts for less than this interval. + */ +hrtime_t cpupm_tw_predict_interval; + +/* + * Periods of idleness lasting less than this time interval are characterized + * as transient. State changes associated with transient idle are considered + * to be mispredicted. That is, it's not worth lowering and raising power + * states where the idleness lasts for less than this interval. + */ +hrtime_t cpupm_ti_predict_interval; + +/* + * Number of mispredictions after which future transitions will be governed. + */ +int cpupm_mispredict_thresh = 2; + +/* + * Likewise, the number of mispredicted governed transitions after which the + * governor will be removed. + */ +int cpupm_mispredict_gov_thresh = 10; + +/* + * The transient work and transient idle prediction intervals are initialized + * to be some multiple of the amount of time it takes to transition a power + * domain from the highest to the lowest power state, and back again, which + * is measured. + * + * The default values of those multiples are specified here. 
Tuning them higher + * will result in the transient work, and transient idle governors being used + * more aggresively, which limits the frequency of state transitions at the + * expense of performance and power savings, respectively. + */ +#define CPUPM_TI_GOV_DEFAULT_MULTIPLE 600 +#define CPUPM_TW_GOV_DEFAULT_MULTIPLE 25 + +/* + * Number of high=>low=>high measurements performed, of which the average + * is taken. + */ +#define CPUPM_BENCHMARK_ITERS 5 + +int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE; +int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE; + + +static int cpupm_governor_initialize(void); +static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t); + +cpupm_policy_t +cpupm_get_policy(void) +{ + return (cpupm_policy); +} + +int +cpupm_set_policy(cpupm_policy_t new_policy) +{ + static int gov_init = 0; + int result = 0; + + mutex_enter(&cpu_lock); + if (new_policy == cpupm_policy) { + mutex_exit(&cpu_lock); + return (result); + } + + /* + * Pausing CPUs causes a high priority thread to be scheduled + * on all other CPUs (besides the current one). This locks out + * other CPUs from making CPUPM state transitions. + */ + switch (new_policy) { + case CPUPM_POLICY_DISABLED: + pause_cpus(NULL); + cpupm_policy = CPUPM_POLICY_DISABLED; + start_cpus(); + + result = cmt_pad_disable(PGHW_POW_ACTIVE); + + /* + * Once PAD has been enabled, it should always be possible + * to disable it. + */ + ASSERT(result == 0); + + /* + * Bring all the active power domains to the maximum + * performance state. + */ + cpupm_state_change_global(CPUPM_DTYPE_ACTIVE, + CPUPM_STATE_MAX_PERF); + + break; + case CPUPM_POLICY_ELASTIC: + + result = cmt_pad_enable(PGHW_POW_ACTIVE); + if (result < 0) { + /* + * Failed to enable PAD across the active power + * domains, which may well be because none were + * enumerated. + */ + break; + } + + pause_cpus(NULL); + /* + * Attempt to initialize the governor parameters the first + * time through. 
+ */ + if (gov_init == 0) { + result = cpupm_governor_initialize(); + if (result == 0) { + gov_init = 1; + } else { + /* + * Failed to initialize the governor parameters + */ + start_cpus(); + break; + } + } + cpupm_policy = CPUPM_POLICY_ELASTIC; + start_cpus(); + + break; + default: + cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n", + new_policy); + ASSERT(0); + break; + } + mutex_exit(&cpu_lock); + + return (result); +} + +/* + * Look for an existing power domain + */ +static cpupm_domain_t * +cpupm_domain_find(id_t id, cpupm_dtype_t type) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + + cpupm_domain_t *dom; + + dom = cpupm_domains; + while (dom != NULL) { + if (id == dom->cpd_id && type == dom->cpd_type) + return (dom); + dom = dom->cpd_next; + } + return (NULL); +} + +/* + * Create a new domain + */ +static cpupm_domain_t * +cpupm_domain_create(id_t id, cpupm_dtype_t type) +{ + cpupm_domain_t *dom; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP); + dom->cpd_id = id; + dom->cpd_type = type; + + /* Link into the known domain list */ + dom->cpd_next = cpupm_domains; + cpupm_domains = dom; + + return (dom); +} + +static void +cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom) +{ + /* + * In the envent we're enumerating because the domain's state + * configuration has changed, toss any existing states. + */ + if (dom->cpd_nstates > 0) { + kmem_free(dom->cpd_states, + sizeof (cpupm_state_t) * dom->cpd_nstates); + dom->cpd_nstates = 0; + } + + /* + * Query to determine the number of states, allocate storage + * large enough to hold the state information, and pass it back + * to the platform driver to complete the enumeration. 
+ */ + dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL); + + if (dom->cpd_nstates == 0) + return; + + dom->cpd_states = + kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP); + (void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states); +} + +/* + * Initialize the specified type of power domain on behalf of the CPU + */ +cpupm_domain_t * +cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type) +{ + cpupm_domain_t *dom; + id_t did; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * Instantiate the domain if it doesn't already exist + * and enumerate its power states. + */ + did = cpupm_domain_id(cp, type); + dom = cpupm_domain_find(did, type); + if (dom == NULL) { + dom = cpupm_domain_create(did, type); + cpupm_domain_state_enum(cp, dom); + } + + /* + * Named state initialization + */ + if (type == CPUPM_DTYPE_ACTIVE) { + /* + * For active power domains, the highest performance + * state is defined as first state returned from + * the domain enumeration. + */ + dom->cpd_named_states[CPUPM_STATE_MAX_PERF] = + &dom->cpd_states[0]; + dom->cpd_named_states[CPUPM_STATE_LOW_POWER] = + &dom->cpd_states[dom->cpd_nstates - 1]; + + /* + * Begin by assuming CPU is running at the max perf state. 
+ */ + dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF]; + } + + return (dom); +} + +/* + * Return the id associated with the given type of domain + * to which cp belongs + */ +id_t +cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type) +{ + return (cpupm_plat_domain_id(cp, type)); +} + +/* + * Initiate a state change for the specified domain on behalf of cp + */ +int +cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state) +{ + if (cpupm_plat_change_state(cp, state) < 0) + return (-1); + + DTRACE_PROBE2(cpupm__change__state, + cpupm_domain_t *, dom, + cpupm_state_t *, state); + + dom->cpd_state = state; + return (0); +} + +/* + * Interface into the CPU power manager to indicate a significant change + * in utilization of the specified active power domain + */ +void +cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom, + cpupm_util_event_t event) +{ + cpupm_state_t *new_state = NULL; + hrtime_t last; + + if (cpupm_policy == CPUPM_POLICY_DISABLED) { + return; + } + + /* + * What follows is a simple elastic power state management policy. + * + * If the utilization has become non-zero, and the domain was + * previously at it's lowest power state, then transition it + * to the highest state in the spirit of "race to idle". + * + * If the utilization has dropped to zero, then transition the + * domain to its lowest power state. + * + * Statistics are maintained to implement governors to reduce state + * transitions resulting from either transient work, or periods of + * transient idleness on the domain. + */ + switch (event) { + case CPUPM_DOM_REMAIN_BUSY: + + /* + * We've received an event that the domain is running a thread + * that's made it to the end of it's time slice. If we are at + * low power, then raise it. If the transient work governor + * is engaged, then remove it. 
+		 */
+		if (dom->cpd_state ==
+		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+			new_state =
+			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+			if (dom->cpd_tw_governed == B_TRUE) {
+				dom->cpd_tw_governed = B_FALSE;
+				dom->cpd_tw = 0;
+			}
+		}
+		break;
+
+	case CPUPM_DOM_BUSY_FROM_IDLE:
+		last = dom->cpd_last_lower;
+		dom->cpd_last_raise = now;
+
+		DTRACE_PROBE3(cpupm__raise__req,
+		    cpupm_domain_t *, dom,
+		    hrtime_t, last,
+		    hrtime_t, now);
+
+		if (dom->cpd_state ==
+		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+
+			/*
+			 * There's non-zero utilization, and the domain is
+			 * running in the lower power state. Before we
+			 * consider raising power, perform some book keeping
+			 * for the transient idle governor.
+			 */
+			if (dom->cpd_ti_governed == B_FALSE) {
+				if ((now - last) < cpupm_ti_predict_interval) {
+					/*
+					 * We're raising the domain power and
+					 * we *just* lowered it. Consider
+					 * this a mispredicted power state
+					 * transition due to a transient
+					 * idle period.
+					 */
+					if (++dom->cpd_ti >=
+					    cpupm_mispredict_thresh) {
+						/*
+						 * There's enough transient
+						 * idle transitions to
+						 * justify governing future
+						 * lowering requests.
+						 */
+						dom->cpd_ti_governed = B_TRUE;
+						dom->cpd_ti = 0;
+						DTRACE_PROBE1(
+						    cpupm__ti__governed,
+						    cpupm_domain_t *, dom);
+					}
+				} else {
+					/*
+					 * We correctly predicted the last
+					 * lowering.
+					 */
+					dom->cpd_ti = 0;
+				}
+			}
+			if (dom->cpd_tw_governed == B_TRUE) {
+				/*
+				 * Raise requests are governed due to
+				 * transient work.
+				 */
+				DTRACE_PROBE1(cpupm__raise__governed,
+				    cpupm_domain_t *, dom);
+
+				/*
+				 * It's likely that we'll be governed for a
+				 * while. If the transient idle governor is
+				 * also in place, examine the preceding idle
+				 * interval to see if that still makes sense.
+ */ + if (dom->cpd_ti_governed == B_TRUE && + ((now - last) >= + cpupm_ti_predict_interval)) { + if (++dom->cpd_ti >= + cpupm_mispredict_gov_thresh) { + dom->cpd_ti_governed = + B_FALSE; + dom->cpd_ti = 0; + } + } + return; + } + /* + * Prepare to transition to the higher power state + */ + new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF]; + + } else if (dom->cpd_state == + dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) { + + /* + * Utilization is non-zero, and we're already running + * in the higher power state. Take this opportunity to + * perform some book keeping if the last lowering + * request was governed. + */ + if (dom->cpd_ti_governed == B_TRUE) { + if ((now - last) >= cpupm_ti_predict_interval) { + /* + * The domain is transient idle + * governed, and we mispredicted + * governing the last lowering request. + */ + if (++dom->cpd_ti >= + cpupm_mispredict_gov_thresh) { + /* + * There's enough non-transient + * idle periods to justify + * removing the governor. + */ + dom->cpd_ti_governed = B_FALSE; + dom->cpd_ti = 0; + DTRACE_PROBE1( + cpupm__ti__ungoverned, + cpupm_domain_t *, dom); + } + } else { + /* + * Correctly predicted governing the + * last lowering request. + */ + dom->cpd_ti = 0; + } + } + } + break; + + case CPUPM_DOM_IDLE_FROM_BUSY: + last = dom->cpd_last_raise; + dom->cpd_last_lower = now; + + DTRACE_PROBE3(cpupm__lower__req, + cpupm_domain_t *, dom, + hrtime_t, last, + hrtime_t, now); + + if (dom->cpd_state == + dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) { + + /* + * The domain is idle, and is running in the highest + * performance state. Before we consider lowering power, + * perform some book keeping for the transient work + * governor. + */ + if (dom->cpd_tw_governed == B_FALSE) { + if ((now - last) < cpupm_tw_predict_interval) { + /* + * We're lowering the domain power and + * we *just* raised it. Consider the + * last raise mispredicted due to + * transient work. 
+					 */
+					if (++dom->cpd_tw >=
+					    cpupm_mispredict_thresh) {
+						/*
+						 * There's enough transient idle
+						 * transitions to justify
+						 * governing future lowering
+						 * requests.
+						 */
+						dom->cpd_tw_governed = B_TRUE;
+						dom->cpd_tw = 0;
+						DTRACE_PROBE1(
+						    cpupm__tw__governed,
+						    cpupm_domain_t *, dom);
+					}
+				} else {
+					/*
+					 * We correctly predicted during the
+					 * last raise.
+					 */
+					dom->cpd_tw = 0;
+				}
+			}
+			if (dom->cpd_ti_governed == B_TRUE) {
+				/*
+				 * Lowering requests are governed due to
+				 * transient idleness.
+				 */
+				DTRACE_PROBE1(cpupm__lowering__governed,
+				    cpupm_domain_t *, dom);
+
+				/*
+				 * It's likely that we'll be governed for a
+				 * while. If the transient work governor is
+				 * also in place, examine the preceding busy
+				 * interval to see if that still makes sense.
+				 */
+				if (dom->cpd_tw_governed == B_TRUE &&
+				    ((now - last) >=
+				    cpupm_tw_predict_interval)) {
+					if (++dom->cpd_tw >=
+					    cpupm_mispredict_gov_thresh) {
+						dom->cpd_tw_governed =
+						    B_FALSE;
+						dom->cpd_tw = 0;
+					}
+				}
+				return;
+			}
+
+			/*
+			 * Prepare to transition to a lower power state.
+			 */
+			new_state =
+			    dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
+
+		} else if (dom->cpd_state ==
+		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
+
+			/*
+			 * The domain is idle, and we're already running in
+			 * the lower power state. Take this opportunity to
+			 * perform some book keeping if the last raising
+			 * request was governed.
+			 */
+			if (dom->cpd_tw_governed == B_TRUE) {
+				if ((now - last) >= cpupm_tw_predict_interval) {
+					/*
+					 * The domain is transient work
+					 * governed, and we mispredicted
+					 * governing the last raising request.
+					 */
+					if (++dom->cpd_tw >=
+					    cpupm_mispredict_gov_thresh) {
+						/*
+						 * There's enough non-transient
+						 * work to justify removing
+						 * the governor.
+						 */
+						dom->cpd_tw_governed = B_FALSE;
+						dom->cpd_tw = 0;
+						DTRACE_PROBE1(
+						    cpupm__tw__ungoverned,
+						    cpupm_domain_t *, dom);
+					}
+				} else {
+					/*
+					 * We correctly predicted governing
+					 * the last raise.
+					 */
+					dom->cpd_tw = 0;
+				}
+			}
+		}
+		break;
+	}
+	/*
+	 * Change the power state
+	 * Not much currently done if this doesn't succeed
+	 */
+	if (new_state)
+		(void) cpupm_change_state(cp, dom, new_state);
+}
+
+
+/*
+ * Interface called by platforms to dynamically change the
+ * MAX performance cpupm state
+ */
+void
+cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
+{
+	cpupm_domain_t *dom;
+	id_t did;
+	cpupm_dtype_t type = CPUPM_DTYPE_ACTIVE;
+	boolean_t change_state = B_FALSE;
+	cpupm_state_t *new_state = NULL;
+
+	did = cpupm_domain_id(cp, type);
+	mutex_enter(&cpu_lock);
+	dom = cpupm_domain_find(did, type);
+	mutex_exit(&cpu_lock);
+
+	/*
+	 * Can use a lock to avoid changing the power state of the cpu when
+	 * CPUPM_STATE_MAX_PERF is getting changed.
+	 * Since the occurrence of events to change MAX_PERF is not frequent,
+	 * it may not be a good idea to overburden with locks. In the worst
+	 * case, for one cycle the power may not get changed to the required
+	 * level
+	 */
+	if (dom != NULL) {
+		if (dom->cpd_state ==
+		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
+			change_state = B_TRUE;
+		}
+
+		/*
+		 * If an out of range level is passed, use the lowest supported
+		 * speed.
+		 */
+		if (max_perf_level >= dom->cpd_nstates &&
+		    dom->cpd_nstates > 1) {
+			max_perf_level = dom->cpd_nstates - 1;
+		}
+
+		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
+		    &dom->cpd_states[max_perf_level];
+
+		/*
+		 * If the current state is MAX_PERF, change the current state
+		 * to the new MAX_PERF
+		 */
+		if (change_state) {
+			new_state =
+			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
+			if (new_state) {
+				(void) cpupm_change_state(cp, dom, new_state);
+			}
+		}
+	}
+}
+
+/*
+ * Benchmark some power state transitions and use the transition latencies as
+ * a basis for initializing parameters for the transient idle and transient
+ * work governors.
+ *
+ * Returns 0 on success or -1 if the governor parameters could not be
+ * initialized.
+ */ +static int +cpupm_governor_initialize(void) +{ + cpu_t *cp = CPU; + cpupm_domain_t *dom; + cpupm_state_t *low, *high; + id_t did; + hrtime_t start, delta, deltas = 0; + int iterations; + + did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE); + if (did == CPUPM_NO_DOMAIN) + return (-1); + + dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE); + if (dom == NULL) + return (-1); + + low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER]; + high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF]; + + for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) { + + /* + * Measure the amount of time it takes to transition the + * domain down to the lowest, and back to the highest power + * state. + */ + start = gethrtime_unscaled(); + (void) cpupm_change_state(cp, dom, low); + (void) cpupm_change_state(cp, dom, high); + delta = gethrtime_unscaled() - start; + + DTRACE_PROBE1(cpupm__benchmark__latency, + hrtime_t, delta); + + deltas += delta; + } + + /* + * Figure the average latency, and tune the transient work and + * transient idle prediction intervals accordingly. + */ + delta = deltas / iterations; + + cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple; + cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple; + + return (0); +} + +/* + * Initiate a state change in all CPUPM domain instances of the specified type + */ +static void +cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state) +{ + cpu_t *cp; + pg_cmt_t *pwr_pg; + cpupm_domain_t *dom; + group_t *hwset; + group_iter_t giter; + pg_cpu_itr_t cpu_iter; + pghw_type_t hw; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + switch (type) { + case CPUPM_DTYPE_ACTIVE: + hw = PGHW_POW_ACTIVE; + break; + default: + /* + * Power domain types other than "active" unsupported. 
+ */ + ASSERT(type == CPUPM_DTYPE_ACTIVE); + return; + } + + if ((hwset = pghw_set_lookup(hw)) == NULL) + return; + + /* + * Iterate over the power domains + */ + group_iter_init(&giter); + while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) { + + dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle; + + /* + * Iterate over the CPUs in each domain + */ + PG_CPU_ITR_INIT(pwr_pg, cpu_iter); + while ((cp = pg_cpu_next(&cpu_iter)) != NULL) { + (void) cpupm_change_state(cp, dom, + dom->cpd_named_states[state]); + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/os/cpupm.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/sunddi.h> +#include <sys/cpupm.h> + +/* + * Initialize the field that will be used for reporting + * the supported_frequencies_Hz cpu_info kstat. 
+ */ +void +cpupm_set_supp_freqs(cpu_t *cp, int *speeds, uint_t nspeeds) +{ + char *supp_freqs = NULL; + char *sfptr; + uint64_t *hzspeeds; + int i; + int j; +#define UINT64_MAX_STRING (sizeof ("18446744073709551615")) + + if (speeds == NULL) { + cpu_set_supp_freqs(cp, supp_freqs); + return; + } + + hzspeeds = kmem_zalloc(nspeeds * sizeof (uint64_t), KM_SLEEP); + for (i = nspeeds - 1, j = 0; i >= 0; i--, j++) { + hzspeeds[i] = CPUPM_SPEED_HZ(cp->cpu_type_info.pi_clock, + speeds[j]); + } + + supp_freqs = kmem_zalloc((UINT64_MAX_STRING * nspeeds), KM_SLEEP); + sfptr = supp_freqs; + for (i = 0; i < nspeeds; i++) { + if (i == nspeeds - 1) { + (void) sprintf(sfptr, "%"PRIu64, hzspeeds[i]); + } else { + (void) sprintf(sfptr, "%"PRIu64":", hzspeeds[i]); + sfptr = supp_freqs + strlen(supp_freqs); + } + } + cpu_set_supp_freqs(cp, supp_freqs); + kmem_free(supp_freqs, (UINT64_MAX_STRING * nspeeds)); + kmem_free(hzspeeds, nspeeds * sizeof (uint64_t)); +}
--- a/usr/src/uts/common/os/group.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/os/group.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/systm.h> #include <sys/param.h> #include <sys/debug.h> @@ -64,6 +62,21 @@ } /* + * Empty a group_t + * Capacity is preserved. + */ +void +group_empty(group_t *g) +{ + int i; + int sz = g->grp_size; + + g->grp_size = 0; + for (i = 0; i < sz; i++) + g->grp_set[i] = NULL; +} + +/* * Add element "e" to group "g" * * Returns -1 if addition would result in overcapacity, and @@ -312,7 +325,7 @@ } /* - * Remove the entry at the specified index + * Remove the element at the specified index */ void group_remove_at(group_t *g, uint_t idx) @@ -320,3 +333,19 @@ ASSERT(idx < g->grp_capacity); g->grp_set[idx] = NULL; } + +/* + * Find an element in the group, and return its index + * Returns -1 if the element could not be found. + */ +uint_t +group_find(group_t *g, void *e) +{ + uint_t idx; + + for (idx = 0; idx < g->grp_capacity; idx++) { + if (g->grp_set[idx] == e) + return (idx); + } + return ((uint_t)-1); +}
--- a/usr/src/uts/common/os/pg.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/os/pg.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/systm.h> #include <sys/types.h> #include <sys/param.h> @@ -99,6 +97,7 @@ static pg_t *pg_alloc_default(pg_class_t); static void pg_free_default(pg_t *); +static void pg_null_op(); /* * Bootstrap CPU specific PG data @@ -127,6 +126,12 @@ NULL, /* cpupart_out */ NULL, /* cpupart_move */ NULL, /* cpu_belongs */ + NULL, /* policy_name */ +}; + +static struct pg_cb_ops pg_cb_ops_default = { + pg_null_op, /* thread_swtch */ + pg_null_op, /* thread_remain */ }; /* @@ -144,6 +149,13 @@ /* + * Class specific PG policy name + */ +#define PG_POLICY_NAME(pg) \ + ((pg)->pg_class->pgc_ops->policy_name ? \ + (pg)->pg_class->pgc_ops->policy_name(pg) : NULL) \ + +/* * Class specific membership test callback */ #define PG_CPU_BELONGS(pg, cp) \ @@ -206,13 +218,22 @@ static pg_cid_t pg_default_cid; /* - * Initialze common PG subsystem. Perform CPU 0 initialization + * Initialze common PG subsystem. 
*/ void pg_init(void) { + extern void pg_cmt_class_init(); + pg_default_cid = pg_class_register("default", &pg_ops_default, PGR_LOGICAL); + + /* + * Initialize classes to allow them to register with the framework + */ + pg_cmt_class_init(); + + pg_cpu0_init(); } /* @@ -282,7 +303,7 @@ classes_old = pg_classes; pg_classes = kmem_zalloc(sizeof (pg_class_t) * (pg_nclasses + 1), - KM_SLEEP); + KM_SLEEP); (void) kcopy(classes_old, pg_classes, sizeof (pg_class_t) * pg_nclasses); kmem_free(classes_old, sizeof (pg_class_t) * pg_nclasses); @@ -339,6 +360,27 @@ } /* + * Test if a given PG contains a given CPU + */ +boolean_t +pg_cpu_find(pg_t *pg, cpu_t *cp) +{ + if (group_find(&pg->pg_cpus, cp) == (uint_t)-1) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Set the PGs callbacks to the default + */ +void +pg_callback_set_defaults(pg_t *pg) +{ + bcopy(&pg_cb_ops_default, &pg->pg_cb, sizeof (struct pg_cb_ops)); +} + +/* * Create a PG of a given class. * This routine may block. */ @@ -374,6 +416,11 @@ */ group_create(&pg->pg_cpus); + /* + * Initialize the events ops vector + */ + pg_callback_set_defaults(pg); + return (pg); } @@ -620,6 +667,20 @@ } /* + * Return a class specific string describing a policy implemented + * across this PG + */ +char * +pg_policy_name(pg_t *pg) +{ + char *str; + if ((str = PG_POLICY_NAME(pg)) != NULL) + return (str); + + return ("N/A"); +} + +/* * Provide the specified CPU a bootstrap pg * This is needed to allow sane behaviour if any PG consuming * code needs to deal with a partially initialized CPU @@ -643,3 +704,52 @@ { kmem_free(pg, sizeof (pg_t)); } + +static void +pg_null_op() +{ +} + +/* + * Invoke the "thread switch" callback for each of the CPU's PGs + * This is invoked from the dispatcher swtch() routine, which is called + * when a thread running an a CPU should switch to another thread. 
+ * "cp" is the CPU on which the thread switch is happening + * "now" is an unscaled hrtime_t timestamp taken in swtch() + * "old" and "new" are the outgoing and incoming threads, respectively. + */ +void +pg_ev_thread_swtch(struct cpu *cp, hrtime_t now, kthread_t *old, kthread_t *new) +{ + int i, sz; + group_t *grp; + pg_t *pg; + + grp = &cp->cpu_pg->pgs; + sz = GROUP_SIZE(grp); + for (i = 0; i < sz; i++) { + pg = GROUP_ACCESS(grp, i); + pg->pg_cb.thread_swtch(pg, cp, now, old, new); + } +} + +/* + * Invoke the "thread remain" callback for each of the CPU's PGs. + * This is called from the dispatcher's swtch() routine when a thread + * running on the CPU "cp" is switching to itself, which can happen as an + * artifact of the thread's timeslice expiring. + */ +void +pg_ev_thread_remain(struct cpu *cp, kthread_t *t) +{ + int i, sz; + group_t *grp; + pg_t *pg; + + grp = &cp->cpu_pg->pgs; + sz = GROUP_SIZE(grp); + for (i = 0; i < sz; i++) { + pg = GROUP_ACCESS(grp, i); + pg->pg_cb.thread_remain(pg, cp, t); + } +}
--- a/usr/src/uts/common/os/pghw.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/os/pghw.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/systm.h> #include <sys/types.h> #include <sys/param.h> @@ -35,6 +33,7 @@ #include <sys/group.h> #include <sys/pg.h> #include <sys/pghw.h> +#include <sys/cpu_pm.h> /* * Processor Groups: Hardware sharing relationship layer @@ -99,7 +98,7 @@ * (the CPU's chip, cache, lgroup, etc.). * * The hwsets are created dynamically as new hardware sharing relationship types - * are instantiated. They are never destroyed, as once a given relathionship + * are instantiated. They are never destroyed, as once a given relationship * type appears in the system, it is quite likely that at least one instance of * that relationship will always persist as long as the system is running. 
*/ @@ -107,11 +106,6 @@ static group_t *pg_hw; /* top level pg hw group */ /* - * Lookup table mapping hardware sharing relationships with hierarchy levels - */ -static int pghw_level_table[PGHW_NUM_COMPONENTS]; - -/* * Physical PG kstats */ struct pghw_kstat { @@ -120,12 +114,14 @@ kstat_named_t pg_ncpus; kstat_named_t pg_instance_id; kstat_named_t pg_hw; + kstat_named_t pg_policy; } pghw_kstat = { { "id", KSTAT_DATA_UINT64 }, { "pg_class", KSTAT_DATA_STRING }, { "ncpus", KSTAT_DATA_UINT64 }, { "instance_id", KSTAT_DATA_UINT64 }, { "hardware", KSTAT_DATA_STRING }, + { "policy", KSTAT_DATA_STRING }, }; kmutex_t pghw_kstat_lock; @@ -138,7 +134,7 @@ static void pghw_set_remove(group_t *, pghw_t *); /* - * Initialize the physical portion of a physical PG + * Initialize the physical portion of a hardware PG */ void pghw_init(pghw_t *pg, cpu_t *cp, pghw_type_t hw) @@ -157,6 +153,22 @@ pg->pghw_instance = pg_plat_hw_instance_id(cp, hw); pghw_kstat_create(pg); + + /* + * Hardware sharing relationship specific initialization + */ + switch (pg->pghw_hw) { + case PGHW_POW_ACTIVE: + pg->pghw_handle = + (pghw_handle_t)cpupm_domain_init(cp, CPUPM_DTYPE_ACTIVE); + break; + case PGHW_POW_IDLE: + pg->pghw_handle = + (pghw_handle_t)cpupm_domain_init(cp, CPUPM_DTYPE_IDLE); + break; + default: + pg->pghw_handle = (pghw_handle_t)NULL; + } } /* @@ -262,16 +274,6 @@ } /* - * Return a sequential level identifier for the specified - * hardware sharing relationship - */ -int -pghw_level(pghw_type_t hw) -{ - return (pg_plat_hw_level(hw)); -} - -/* * Create a new, empty hwset. * This routine may block, and must not be called from any * paused CPU context. 
@@ -303,13 +305,6 @@ ret = group_add_at(pg_hw, g, (uint_t)hw); ASSERT(ret == 0); - /* - * Update the table that maps hardware sharing relationships - * to hierarchy levels - */ - ASSERT(pghw_level_table[hw] == NULL); - pghw_level_table[hw] = pg_plat_hw_level(hw); - return (g); } @@ -353,24 +348,26 @@ /* * Return a string name given a pg_hw sharing type */ -#define PGHW_TYPE_NAME_MAX 8 - static char * pghw_type_string(pghw_type_t hw) { switch (hw) { case PGHW_IPIPE: - return ("ipipe"); + return ("Integer Pipeline"); case PGHW_CACHE: - return ("cache"); + return ("Cache"); case PGHW_FPU: - return ("fpu"); + return ("Floating Point Unit"); case PGHW_MPIPE: - return ("mpipe"); + return ("Data Pipe to memory"); case PGHW_CHIP: - return ("chip"); + return ("Socket"); case PGHW_MEMORY: - return ("memory"); + return ("Memory"); + case PGHW_POW_ACTIVE: + return ("CPU PM Active Power Domain"); + case PGHW_POW_IDLE: + return ("CPU PM Idle Power Domain"); default: return ("unknown"); } @@ -393,8 +390,10 @@ "pg", "pg", KSTAT_TYPE_NAMED, sizeof (pghw_kstat) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) != NULL) { + /* Class string, hw string, and policy string */ pg->pghw_kstat->ks_data_size += PG_CLASS_NAME_MAX; - pg->pghw_kstat->ks_data_size += PGHW_TYPE_NAME_MAX; + pg->pghw_kstat->ks_data_size += PGHW_KSTAT_STR_LEN_MAX; + pg->pghw_kstat->ks_data_size += PGHW_KSTAT_STR_LEN_MAX; pg->pghw_kstat->ks_lock = &pghw_kstat_lock; pg->pghw_kstat->ks_data = &pghw_kstat; pg->pghw_kstat->ks_update = pghw_kstat_update; @@ -417,6 +416,6 @@ pgsp->pg_instance_id.value.ui64 = (uint64_t)pg->pghw_instance; kstat_named_setstr(&pgsp->pg_class, ((pg_t *)pg)->pg_class->pgc_name); kstat_named_setstr(&pgsp->pg_hw, pghw_type_string(pg->pghw_hw)); - + kstat_named_setstr(&pgsp->pg_policy, pg_policy_name((pg_t *)pg)); return (0); }
--- a/usr/src/uts/common/os/sunpm.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/os/sunpm.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -348,6 +348,13 @@ pm_cpupm_t cpupm = PM_CPUPM_NOTSET; /* + * Defines the default mode of operation for CPU power management, + * either the polling implementation, or the event based dispatcher driven + * implementation. + */ +pm_cpupm_t cpupm_default_mode = PM_CPUPM_EVENT; + +/* * AutoS3 depends on autopm being enabled, and must be enabled by * PM_START_AUTOS3 command. */ @@ -2568,7 +2575,7 @@ PMD(PMD_FAIL, ("%s: %s@%s(%s#%d) %s%s%s%s\n", pmf, PM_DEVICE(dip), !autopm_enabled ? "!autopm_enabled " : "", - !PM_CPUPM_ENABLED ? "!cpupm_enabled " : "", + !PM_POLLING_CPUPM ? "!cpupm_polling " : "", PM_CPUPM_DISABLED ? "cpupm_disabled " : "", pm_noinvol(dip) ? "pm_noinvol()" : "")) return (DDI_SUCCESS);
--- a/usr/src/uts/common/sys/Makefile Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/sys/Makefile Wed Feb 25 21:04:18 2009 -0800 @@ -139,6 +139,7 @@ cpr.h \ cpupart.h \ cpuvar.h \ + cpu_pm.h \ crc32.h \ cred.h \ cred_impl.h \
--- a/usr/src/uts/common/sys/callb.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/sys/callb.h Wed Feb 25 21:04:18 2009 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_CALLB_H #define _SYS_CALLB_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/t_lock.h> #include <sys/thread.h> @@ -69,7 +66,8 @@ #define CB_CL_MDBOOT CB_CL_UADMIN #define CB_CL_ENTER_DEBUGGER 14 #define CB_CL_CPR_POST_KERNEL 15 -#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */ +#define CB_CL_CPU_DEEP_IDLE 16 +#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */ /* * CB_CL_CPR_DAEMON class specific definitions are given below:
--- a/usr/src/uts/common/sys/cmt.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/sys/cmt.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,9 +37,20 @@ #if (defined(_KERNEL) || defined(_KMEMUSER)) #include <sys/group.h> #include <sys/pghw.h> +#include <sys/lgrp.h> #include <sys/types.h> /* + * CMT related dispatcher policies + */ +#define CMT_NO_POLICY 0x0 +#define CMT_BALANCE 0x1 +#define CMT_COALESCE 0x2 +#define CMT_AFFINITY 0x4 + +typedef uint_t pg_cmt_policy_t; + +/* * CMT pg structure */ typedef struct pg_cmt { @@ -47,26 +58,67 @@ struct group *cmt_siblings; /* CMT PGs to balance with */ struct pg_cmt *cmt_parent; /* Parent CMT PG */ struct group *cmt_children; /* Active children CMT PGs */ + pg_cmt_policy_t cmt_policy; /* Dispatcher policies to use */ + uint32_t cmt_utilization; /* Group's utilization */ int cmt_nchildren; /* # of children CMT PGs */ - uint32_t cmt_nrunning; /* # of running threads */ + int cmt_hint; /* hint for balancing */ struct group cmt_cpus_actv; struct bitset cmt_cpus_actv_set; /* bitset of active CPUs */ } pg_cmt_t; /* + * CMT lgroup structure + */ +typedef struct cmt_lgrp { + group_t cl_pgs; /* Top level group of active CMT PGs */ + int cl_npgs; /* # of top level PGs in the lgroup */ + lgrp_handle_t cl_hand; /* lgroup's platform handle */ + struct cmt_lgrp *cl_next; /* next cmt_lgrp */ +} cmt_lgrp_t; + +/* * Change the number of running threads on the pg */ -#define PG_NRUN_UPDATE(cp, n) (pg_cmt_load((cp), (n))) +#define PG_NRUN_UPDATE(cp, n) (pg_cmt_load((cp), (n))) + +/* + * Indicate that the given logical CPU is (or isn't) currently utilized + */ +#define CMT_CPU_UTILIZED(cp) (pg_cmt_load((cp), 1)) +#define CMT_CPU_NOT_UTILIZED(cp) (pg_cmt_load((cp), -1)) + +/* + * CMT PG's capacity + * + * Currently, this is defined to be the 
number of active + * logical CPUs in the group. + * + * This will be used in conjunction with the utilization, which is defined + * to be the number of threads actively running on CPUs in the group. + */ +#define CMT_CAPACITY(pg) (GROUP_SIZE(&((pg_cmt_t *)pg)->cmt_cpus_actv)) void pg_cmt_load(cpu_t *, int); void pg_cmt_cpu_startup(cpu_t *); int pg_cmt_can_migrate(cpu_t *, cpu_t *); -int pg_plat_cmt_load_bal_hw(pghw_type_t); -int pg_plat_cmt_affinity_hw(pghw_type_t); +/* + * CMT platform interfaces + */ +pg_cmt_policy_t pg_plat_cmt_policy(pghw_type_t); +int pg_plat_cmt_rank(pg_cmt_t *, pg_cmt_t *); +/* + * CMT dispatcher policy + */ cpu_t *cmt_balance(kthread_t *, cpu_t *); +/* + * Power Aware Dispatcher Interfaces + */ +int cmt_pad_enable(pghw_type_t); +int cmt_pad_disable(pghw_type_t); + #endif /* !_KERNEL && !_KMEMUSER */ #ifdef __cplusplus
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/cpu_pm.h Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,139 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _CPU_PM_H +#define _CPU_PM_H + +#ifdef __cplusplus +extern "C" { +#endif + +#if (defined(_KERNEL) || defined(_KMEMUSER)) +#include <sys/cpuvar.h> +#include <sys/processor.h> +#include <sys/types.h> +#include <sys/kstat.h> +#include <sys/cmt.h> + +/* + * CPU Power Manager Policies + */ +typedef enum cpupm_policy { + CPUPM_POLICY_ELASTIC, + CPUPM_POLICY_DISABLED, + CPUPM_NUM_POLICIES +} cpupm_policy_t; + +/* + * Power Managable CPU Domain Types + */ +typedef enum cpupm_dtype { + CPUPM_DTYPE_ACTIVE, /* Active Power Domain */ + CPUPM_DTYPE_IDLE /* Idle Power Domain */ +} cpupm_dtype_t; + +/* + * CPUPM state names for policy implementation. + * The last element is used to size the enumeration. + */ +typedef enum cpupm_state_name { + CPUPM_STATE_LOW_POWER, + CPUPM_STATE_MAX_PERF, + CPUPM_STATE_NAMES +} cpupm_state_name_t; + +/* + * Utilization events delivered by the dispatcher. 
+ */ +typedef enum cpupm_util_event { + CPUPM_DOM_BUSY_FROM_IDLE, + CPUPM_DOM_IDLE_FROM_BUSY, + CPUPM_DOM_REMAIN_BUSY +} cpupm_util_event_t; + +typedef uintptr_t cpupm_handle_t; /* Platform handle */ + +/* + * CPU Power Domain State + */ +typedef struct cpupm_state { + uint32_t cps_speed; + cpupm_handle_t cps_handle; +} cpupm_state_t; + +/* + * CPU Power Domain + */ +typedef struct cpupm_domain { + id_t cpd_id; /* Domain ID */ + cpupm_dtype_t cpd_type; /* Active or Idle */ + cpupm_state_t *cpd_states; /* Available Power States */ + cpupm_state_t *cpd_state; /* Current State */ + uint_t cpd_nstates; /* Number of States */ + cpupm_state_t *cpd_named_states[CPUPM_STATE_NAMES]; + hrtime_t cpd_last_raise; /* Last raise request time */ + hrtime_t cpd_last_lower; /* last lower request time */ + int cpd_tw; /* transient work history */ + int cpd_ti; /* transient idle history */ + boolean_t cpd_ti_governed; /* transient idle governor */ + boolean_t cpd_tw_governed; /* transient work governor */ + struct cpupm_domain *cpd_next; +} cpupm_domain_t; + +#define CPUPM_NO_DOMAIN ((id_t)-1) + +/* + * CPU power manager domain management interfaces + */ +cpupm_domain_t *cpupm_domain_init(struct cpu *, cpupm_dtype_t); +id_t cpupm_domain_id(struct cpu *, cpupm_dtype_t); +int cpupm_change_state(struct cpu *, cpupm_domain_t *, + cpupm_state_t *); +extern void cpupm_redefine_max_activepwr_state(struct cpu *, int); + +/* + * CPU power manager policy engine interfaces + */ +int cpupm_set_policy(cpupm_policy_t); +cpupm_policy_t cpupm_get_policy(void); +void cpupm_utilization_event(struct cpu *, hrtime_t, + cpupm_domain_t *, cpupm_util_event_t); + +/* + * CPU power platform driver interfaces + */ +id_t cpupm_plat_domain_id(struct cpu *, cpupm_dtype_t); +uint_t cpupm_plat_state_enumerate(struct cpu *, cpupm_dtype_t, + cpupm_state_t *); +int cpupm_plat_change_state(struct cpu *, cpupm_state_t *); + + +#endif /* !_KERNEL && !_KMEMUSER */ + +#ifdef __cplusplus +} +#endif + +#endif /* _CPU_PM_H */
--- a/usr/src/uts/common/sys/cpudrv.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/sys/cpudrv.h Wed Feb 25 21:04:18 2009 -0800 @@ -76,10 +76,10 @@ cpudrv_pm_spd_t *cur_spd; /* ptr to current speed */ uint_t num_spd; /* number of speeds */ hrtime_t lastquan_mstate[NCMSTATES]; /* last quantum's mstate */ - clock_t lastquan_ticks; /* last quantum's clock tick */ + clock_t lastquan_ticks; /* last quantum's clock tick */ int pm_busycnt; /* pm_busy_component() count */ taskq_t *tq; /* taskq handler for CPU monitor */ - timeout_id_t timeout_id; /* cpudrv_pm_monitor()'s timeout_id */ + timeout_id_t timeout_id; /* cpudrv_monitor()'s timeout_id */ int timeout_count; /* count dispatched timeouts */ kmutex_t timeout_lock; /* protect timeout_count */ kcondvar_t timeout_cv; /* wait on timeout_count change */ @@ -94,31 +94,31 @@ * Idle & user threads water marks in percentage */ #if defined(__x86) -#define CPUDRV_PM_IDLE_HWM 85 /* idle high water mark */ -#define CPUDRV_PM_IDLE_LWM 70 /* idle low water mark */ -#define CPUDRV_PM_IDLE_BLWM_CNT_MAX 1 /* # of iters idle can be < lwm */ -#define CPUDRV_PM_IDLE_BHWM_CNT_MAX 1 /* # of iters idle can be < hwm */ +#define CPUDRV_IDLE_HWM 85 /* idle high water mark */ +#define CPUDRV_IDLE_LWM 70 /* idle low water mark */ +#define CPUDRV_IDLE_BLWM_CNT_MAX 1 /* # of iters idle can be < lwm */ +#define CPUDRV_IDLE_BHWM_CNT_MAX 1 /* # of iters idle can be < hwm */ #else -#define CPUDRV_PM_IDLE_HWM 98 /* idle high water mark */ -#define CPUDRV_PM_IDLE_LWM 8 /* idle low water mark */ -#define CPUDRV_PM_IDLE_BLWM_CNT_MAX 2 /* # of iters idle can be < lwm */ -#define CPUDRV_PM_IDLE_BHWM_CNT_MAX 2 /* # of iters idle can be < hwm */ +#define CPUDRV_IDLE_HWM 98 /* idle high water mark */ +#define CPUDRV_IDLE_LWM 8 /* idle low water mark */ +#define CPUDRV_IDLE_BLWM_CNT_MAX 2 /* # of iters idle can be < lwm */ +#define CPUDRV_IDLE_BHWM_CNT_MAX 2 /* # of iters idle can be < hwm */ #endif -#define CPUDRV_PM_USER_HWM 20 /* user high water 
mark */ -#define CPUDRV_PM_IDLE_BUF_ZONE 4 /* buffer zone when going down */ +#define CPUDRV_USER_HWM 20 /* user high water mark */ +#define CPUDRV_IDLE_BUF_ZONE 4 /* buffer zone when going down */ /* * Maximums for creating 'pm-components' property */ -#define CPUDRV_PM_COMP_MAX_DIG 4 /* max digits in power level */ +#define CPUDRV_COMP_MAX_DIG 4 /* max digits in power level */ /* or divisor */ -#define CPUDRV_PM_COMP_MAX_VAL 9999 /* max value in above digits */ +#define CPUDRV_COMP_MAX_VAL 9999 /* max value in above digits */ /* * Component number for calls to PM framework */ -#define CPUDRV_PM_COMP_NUM 0 /* first component is 0 */ +#define CPUDRV_COMP_NUM 0 /* first component is 0 */ /* * Quantum counts for normal and other clock speeds in terms of ticks. @@ -132,26 +132,26 @@ * that we monitor less frequently. * * We reach a tradeoff between these two requirements by monitoring - * more frequently when we are in low speed mode (CPUDRV_PM_QUANT_CNT_OTHR) + * more frequently when we are in low speed mode (CPUDRV_QUANT_CNT_OTHR) * so we can bring the CPU up without user noticing it. Moreover, at low * speed we are not using CPU much so extra code execution should be fine. * Since we are in no hurry to bring CPU down and at normal speed and we * might really be using the CPU fully, we monitor less frequently - * (CPUDRV_PM_QUANT_CNT_NORMAL). + * (CPUDRV_QUANT_CNT_NORMAL). 
*/ #if defined(__x86) -#define CPUDRV_PM_QUANT_CNT_NORMAL (hz * 1) /* 1 sec */ +#define CPUDRV_QUANT_CNT_NORMAL (hz * 1) /* 1 sec */ #else -#define CPUDRV_PM_QUANT_CNT_NORMAL (hz * 5) /* 5 sec */ +#define CPUDRV_QUANT_CNT_NORMAL (hz * 5) /* 5 sec */ #endif -#define CPUDRV_PM_QUANT_CNT_OTHR (hz * 1) /* 1 sec */ +#define CPUDRV_QUANT_CNT_OTHR (hz * 1) /* 1 sec */ /* * Taskq parameters */ -#define CPUDRV_PM_TASKQ_THREADS 1 /* # threads to run CPU monitor */ -#define CPUDRV_PM_TASKQ_MIN 2 /* min # of taskq entries */ -#define CPUDRV_PM_TASKQ_MAX 2 /* max # of taskq entries */ +#define CPUDRV_TASKQ_THREADS 1 /* # threads to run CPU monitor */ +#define CPUDRV_TASKQ_MIN 2 /* min # of taskq entries */ +#define CPUDRV_TASKQ_MAX 2 /* max # of taskq entries */ /* @@ -159,13 +159,14 @@ */ typedef struct cpudrv_devstate { dev_info_t *dip; /* devinfo handle */ + cpu_t *cp; /* CPU data for this node */ processorid_t cpu_id; /* CPU number for this node */ cpudrv_pm_t cpudrv_pm; /* power management data */ kmutex_t lock; /* protects state struct */ - void *mach_state; /* machine specific state */ } cpudrv_devstate_t; extern void *cpudrv_state; +extern boolean_t cpudrv_enabled; /* * Debugging definitions @@ -191,12 +192,13 @@ #define DPRINTF(flag, args) #endif /* DEBUG */ -extern int cpudrv_pm_change_speed(cpudrv_devstate_t *, cpudrv_pm_spd_t *); -extern boolean_t cpudrv_pm_get_cpu_id(dev_info_t *, processorid_t *); -extern boolean_t cpudrv_pm_power_ready(void); -extern boolean_t cpudrv_pm_is_governor_thread(cpudrv_pm_t *); -extern boolean_t cpudrv_mach_pm_init(cpudrv_devstate_t *); -extern void cpudrv_mach_pm_free(cpudrv_devstate_t *); +extern int cpudrv_change_speed(cpudrv_devstate_t *, cpudrv_pm_spd_t *); +extern boolean_t cpudrv_get_cpu_id(dev_info_t *, processorid_t *); +extern boolean_t cpudrv_is_governor_thread(cpudrv_pm_t *); +extern boolean_t cpudrv_mach_init(cpudrv_devstate_t *); +extern boolean_t cpudrv_power_ready(void); +extern boolean_t 
cpudrv_is_enabled(cpudrv_devstate_t *); +extern void cpudrv_set_supp_freqs(cpudrv_devstate_t *); #endif /* _KERNEL */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/cpupm.h Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,43 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _CPUPM_H +#define _CPUPM_H + +#include <sys/types.h> +#include <sys/cpuvar.h> +#include <sys/cpupm_mach.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern void cpupm_set_supp_freqs(cpu_t *, int *, uint_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _CPUPM_H */
--- a/usr/src/uts/common/sys/cpuvar.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/sys/cpuvar.h Wed Feb 25 21:04:18 2009 -0800 @@ -366,7 +366,6 @@ #define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */ #define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */ - #endif /* _KERNEL || _KMEMUSER */ #if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) @@ -673,6 +672,7 @@ const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */ +void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */ void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */ /* frequencies */
--- a/usr/src/uts/common/sys/epm.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/sys/epm.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -227,7 +227,8 @@ typedef enum pm_cpupm { PM_CPUPM_NOTSET, /* no specific treatment of CPU devices */ - PM_CPUPM_ENABLE, /* power manage CPU devices */ + PM_CPUPM_POLLING, /* CPUPM enabled: polling mode */ + PM_CPUPM_EVENT, /* CPUPM enabled: event driven mode */ PM_CPUPM_DISABLE /* do not power manage CPU devices */ } pm_cpupm_t; @@ -609,9 +610,19 @@ #define PM_ISCPU(dip) (DEVI(dip)->devi_pm_flags & PMC_CPU_DEVICE) /* - * Returns true if cpupm is enabled. + * Returns true if cpupm is enabled in event driven mode. + */ +#define PM_EVENT_CPUPM (cpupm == PM_CPUPM_EVENT) + +/* + * Returns true if cpupm is enabled in polling mode. */ -#define PM_CPUPM_ENABLED (cpupm == PM_CPUPM_ENABLE) +#define PM_POLLING_CPUPM (cpupm == PM_CPUPM_POLLING) + +/* + * Returns true if cpupm operating using the default mode. + */ +#define PM_DEFAULT_CPUPM (cpupm == cpupm_default_mode) /* * Returns true if is disabled. @@ -619,12 +630,14 @@ #define PM_CPUPM_DISABLED (cpupm == PM_CPUPM_DISABLE) /* - * If (autopm is enabled and - * (CPUs are not disabled, or it isn't a cpu)) OR - * (CPUs are enabled and it is one) + * If ((autopm is enabled and + * (CPUPM is not disabled and we're not in event mode, or it isn't a cpu)) + * OR + * (CPUPM are enabled and it is one)) */ #define PM_SCANABLE(dip) ((autopm_enabled && \ -(!PM_CPUPM_DISABLED || !PM_ISCPU(dip))) || (PM_CPUPM_ENABLED && PM_ISCPU(dip))) + ((!PM_CPUPM_DISABLED && !PM_EVENT_CPUPM) || !PM_ISCPU(dip))) || \ + (PM_POLLING_CPUPM && PM_ISCPU(dip))) #define PM_NOT_ALL_LOWEST 0x0 /* not all components are at lowest */ #define PM_ALL_LOWEST 0x1 /* all components are at lowest lvl */
--- a/usr/src/uts/common/sys/group.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/sys/group.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _GROUP_H #define _GROUP_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Group Abstraction */ @@ -79,13 +77,14 @@ * Group element iteration */ void group_iter_init(group_iter_t *); -void *group_iterate(group_t *, uint_t *); +void *group_iterate(group_t *, group_iter_t *); /* - * Add / remove an element from the group + * Add / remove an element (or elements) from the group */ int group_add(group_t *, void *, int); int group_remove(group_t *, void *, int); +void group_empty(group_t *); /* * Add / remove / access an element at a specified index. @@ -95,6 +94,13 @@ int group_add_at(group_t *, void *, uint_t); void group_remove_at(group_t *, uint_t); +/* + * Search for an element in a group. + * Returns an index that may be used with the *_at() + * routines above to add or remove the element. + */ +uint_t group_find(group_t *, void *); + #endif /* !_KERNEL && !_KMEMUSER */ #ifdef __cplusplus
--- a/usr/src/uts/common/sys/pg.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/sys/pg.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _PG_H #define _PG_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Processor Groups */ @@ -48,6 +46,8 @@ typedef uint_t pgid_t; /* processor group id */ typedef uint_t pg_cid_t; /* processor group class id */ +struct pg; + /* * Nature of CPU relationships */ @@ -57,13 +57,26 @@ } pg_relation_t; /* + * Processor Group callbacks ops vector + * These provide a mechanism allowing per PG routines to invoked + * in response to events. + */ +typedef struct pg_cb_ops { + void (*thread_swtch)(struct pg *, struct cpu *, hrtime_t, + kthread_t *, kthread_t *); + void (*thread_remain)(struct pg *, struct cpu *, + kthread_t *); +} pg_cb_ops_t; + +/* * Processor group structure */ typedef struct pg { - pgid_t pg_id; /* seq id */ - pg_relation_t pg_relation; /* grouping relationship */ - struct pg_class *pg_class; /* pg class */ - struct group pg_cpus; /* group of CPUs */ + pgid_t pg_id; /* seq id */ + pg_relation_t pg_relation; /* grouping relationship */ + struct pg_class *pg_class; /* pg class */ + struct group pg_cpus; /* group of CPUs */ + pg_cb_ops_t pg_cb; /* pg events ops vector */ } pg_t; /* @@ -81,6 +94,7 @@ void (*cpupart_move)(struct cpu *, struct cpupart *, struct cpupart *); int (*cpu_belongs)(struct pg *, struct cpu *); + char *(*policy_name)(struct pg *); }; #define PG_CLASS_NAME_MAX 32 @@ -130,6 +144,12 @@ GROUP_ACCESS(&((pg_t *)pgrp)->pg_cpus, 0) : NULL) /* + * Return the number of CPUs in a PG + */ +#define PG_NUM_CPUS(pgrp) \ + (GROUP_SIZE(&(pgrp)->pg_cpus)) + +/* * Framework routines */ void pg_init(void); @@ -162,7 +182,19 @@ void pg_cpu_delete(pg_t *, cpu_t *); pg_t *pg_cpu_find_pg(cpu_t *, group_t *); cpu_t 
*pg_cpu_next(pg_cpu_itr_t *); +boolean_t pg_cpu_find(pg_t *, cpu_t *); +/* + * PG Event callbacks + */ +void pg_callback_set_defaults(pg_t *); +void pg_ev_thread_swtch(cpu_t *, hrtime_t, kthread_t *, kthread_t *); +void pg_ev_thread_remain(cpu_t *, kthread_t *); + +/* + * PG Observability interfaces + */ +char *pg_policy_name(pg_t *); #endif /* !_KERNEL && !_KMEMUSER */
--- a/usr/src/uts/common/sys/pghw.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/sys/pghw.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,16 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _PGHW_H #define _PGHW_H -#pragma ident "%Z%%M% %I% %E% SMI" - - #ifdef __cplusplus extern "C" { #endif @@ -48,27 +45,47 @@ */ typedef enum pghw_type { PGHW_START, - PGHW_IPIPE, - PGHW_CACHE, - PGHW_FPU, - PGHW_MPIPE, - PGHW_CHIP, + PGHW_IPIPE, /* Instruction Pipeline */ + PGHW_CACHE, /* Cache (generally last level) */ + PGHW_FPU, /* Floating Point Unit / Pipeline */ + PGHW_MPIPE, /* Pipe to Memory */ + PGHW_CHIP, /* Socket */ PGHW_MEMORY, + PGHW_POW_ACTIVE, /* Active Power Management Domain */ + PGHW_POW_IDLE, /* Idle Power Management Domain */ PGHW_NUM_COMPONENTS } pghw_type_t; /* + * Returns true if the hardware is a type of power management domain + */ +#define PGHW_IS_PM_DOMAIN(hw) \ + (hw == PGHW_POW_ACTIVE || hw == PGHW_POW_IDLE) + +/* * Anonymous instance id */ #define PGHW_INSTANCE_ANON ((id_t)0xdecafbad) /* + * Max length of PGHW kstat strings + */ +#define PGHW_KSTAT_STR_LEN_MAX 32 + + +/* + * Platform specific handle + */ +typedef uintptr_t pghw_handle_t; + +/* * Processor Group (physical sharing relationship) */ typedef struct pghw { pg_t pghw_pg; /* processor group */ pghw_type_t pghw_hw; /* HW sharing relationship */ id_t pghw_instance; /* sharing instance identifier */ + pghw_handle_t pghw_handle; /* hw specific opaque handle */ kstat_t *pghw_kstat; /* physical kstats exported */ } pghw_t; @@ -102,16 +119,14 @@ pghw_t *pghw_find_by_instance(id_t, pghw_type_t); group_t *pghw_set_lookup(pghw_type_t); -int pghw_level(pghw_type_t); - void pghw_kstat_create(pghw_t *); int pghw_kstat_update(kstat_t *, int); /* Hardware sharing relationship platform interfaces */ int pg_plat_hw_shared(cpu_t *, pghw_type_t); int 
pg_plat_cpus_share(cpu_t *, cpu_t *, pghw_type_t); -int pg_plat_hw_level(pghw_type_t); id_t pg_plat_hw_instance_id(cpu_t *, pghw_type_t); +pghw_type_t pg_plat_hw_rank(pghw_type_t, pghw_type_t); /* * What comprises a "core" may vary across processor implementations,
--- a/usr/src/uts/common/sys/pm.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/common/sys/pm.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_PM_H #define _SYS_PM_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -92,6 +90,8 @@ PM_GET_DEFAULT_SYSTEM_THRESHOLD, PM_ADD_DEPENDENT_PROPERTY, PM_START_CPUPM, + PM_START_CPUPM_EV, + PM_START_CPUPM_POLL, PM_STOP_CPUPM, PM_GET_CPU_THRESHOLD, PM_SET_CPU_THRESHOLD, @@ -104,7 +104,10 @@ PM_SEARCH_LIST, /* search S3 enable/disable list */ PM_GET_AUTOS3_STATE, PM_GET_S3_SUPPORT_STATE, - PM_GET_CMD_NAME + PM_GET_CMD_NAME, + PM_DISABLE_CPU_DEEP_IDLE, + PM_ENABLE_CPU_DEEP_IDLE, + PM_DEFAULT_CPU_DEEP_IDLE } pm_cmds; /*
--- a/usr/src/uts/i86pc/Makefile.files Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/Makefile.files Wed Feb 25 21:04:18 2009 -0800 @@ -40,9 +40,15 @@ cmi_hw.o \ cms.o \ confunix.o \ + cpu_idle.o \ cpuid.o \ cpuid_subr.o \ cpupm.o \ + cpupm_mach.o \ + cpupm_amd.o \ + cpupm_intel.o \ + cpupm_throttle.o \ + cpu_acpi.o \ dis_tables.o \ ddi_impl.o \ dtrace_subr.o \ @@ -93,6 +99,8 @@ pci_orion.o \ pmem.o \ ppage.o \ + pwrnow.o \ + speedstep.o \ startup.o \ timestamp.o \ todpc_subr.o \ @@ -169,19 +177,14 @@ PCI_E_NEXUS_OBJS += npe.o npe_misc.o PCI_E_NEXUS_OBJS += pci_common.o pci_kstats.o pci_tools.o PCINEXUS_OBJS += pci.o pci_common.o pci_kstats.o pci_tools.o -PCPLUSMP_OBJS += apic.o apic_regops.o psm_common.o apic_introp.o mp_platform_common.o +PCPLUSMP_OBJS += apic.o apic_regops.o psm_common.o apic_introp.o \ + mp_platform_common.o hpet_acpi.o ACPI_DRV_OBJS += acpi_drv.o acpi_video.o CPUDRV_OBJS += \ cpudrv.o \ - cpudrv_amd.o \ - cpudrv_intel.o \ - cpudrv_mach.o \ - cpudrv_throttle.o \ - cpu_acpi.o \ - speedstep.o \ - pwrnow.o + cpudrv_mach.o PPM_OBJS += ppm_subr.o ppm.o ppm_plat.o
--- a/usr/src/uts/i86pc/Makefile.rules Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/Makefile.rules Wed Feb 25 21:04:18 2009 -0800 @@ -67,10 +67,6 @@ $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) -$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/cpudrv/%.c - $(COMPILE.c) -o $@ $< - $(CTFCONVERT_O) - $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/ioat/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -115,6 +111,10 @@ $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/os/cpupm/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/boot/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -270,9 +270,6 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/acpi_drv/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) -$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/cpudrv/%.c - @($(LHEAD) $(LINT.c) $< $(LTAIL)) - $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/ioat/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -309,6 +306,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/os/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/os/cpupm/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/boot/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL))
--- a/usr/src/uts/i86pc/io/cpudrv/cpu_acpi.c Wed Feb 25 20:53:30 2009 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,792 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/cpu_acpi.h> - -/* - * List of the processor ACPI object types that are being used. - */ -typedef enum cpu_acpi_obj { - PDC_OBJ = 0, - PCT_OBJ, - PSS_OBJ, - PSD_OBJ, - PPC_OBJ, - PTC_OBJ, - TSS_OBJ, - TSD_OBJ, - TPC_OBJ -} cpu_acpi_obj_t; - -/* - * Container to store object name. - * Other attributes can be added in the future as necessary. - */ -typedef struct cpu_acpi_obj_attr { - char *name; -} cpu_acpi_obj_attr_t; - -/* - * List of object attributes. - * NOTE: Please keep the ordering of the list as same as cpu_acpi_obj_t. - */ -static cpu_acpi_obj_attr_t cpu_acpi_obj_attrs[] = { - {"_PDC"}, - {"_PCT"}, - {"_PSS"}, - {"_PSD"}, - {"_PPC"}, - {"_PTC"}, - {"_TSS"}, - {"_TSD"}, - {"_TPC"} -}; - -/* - * Cache the ACPI CPU control data objects. 
- */ -static int -cpu_acpi_cache_ctrl_regs(cpu_acpi_handle_t handle, cpu_acpi_obj_t objtype, - cpu_acpi_ctrl_regs_t *regs) -{ - ACPI_BUFFER abuf; - ACPI_OBJECT *obj; - AML_RESOURCE_GENERIC_REGISTER *greg; - int ret = -1; - int i; - - /* - * Fetch the control registers (if present) for the CPU node. - * Since they are optional, non-existence is not a failure - * (we just consider it a fixed hardware case). - */ - abuf.Length = ACPI_ALLOCATE_BUFFER; - abuf.Pointer = NULL; - if (ACPI_FAILURE(AcpiEvaluateObjectTyped(handle->cs_handle, - cpu_acpi_obj_attrs[objtype].name, NULL, &abuf, - ACPI_TYPE_PACKAGE))) { - regs[0].cr_addrspace_id = ACPI_ADR_SPACE_FIXED_HARDWARE; - regs[1].cr_addrspace_id = ACPI_ADR_SPACE_FIXED_HARDWARE; - return (1); - } - - obj = abuf.Pointer; - if (obj->Package.Count != 2) { - cmn_err(CE_NOTE, "!cpu_acpi: %s package bad count %d.", - cpu_acpi_obj_attrs[objtype].name, obj->Package.Count); - goto out; - } - - /* - * Does the package look coherent? - */ - for (i = 0; i < obj->Package.Count; i++) { - if (obj->Package.Elements[i].Type != ACPI_TYPE_BUFFER) { - cmn_err(CE_NOTE, "!cpu_acpi: " - "Unexpected data in %s package.", - cpu_acpi_obj_attrs[objtype].name); - goto out; - } - - greg = (AML_RESOURCE_GENERIC_REGISTER *) - obj->Package.Elements[i].Buffer.Pointer; - if (greg->DescriptorType != - ACPI_RESOURCE_NAME_GENERIC_REGISTER) { - cmn_err(CE_NOTE, "!cpu_acpi: " - "%s package has format error.", - cpu_acpi_obj_attrs[objtype].name); - goto out; - } - if (greg->ResourceLength != - ACPI_AML_SIZE_LARGE(AML_RESOURCE_GENERIC_REGISTER)) { - cmn_err(CE_NOTE, "!cpu_acpi: " - "%s package not right size.", - cpu_acpi_obj_attrs[objtype].name); - goto out; - } - if (greg->AddressSpaceId != ACPI_ADR_SPACE_FIXED_HARDWARE && - greg->AddressSpaceId != ACPI_ADR_SPACE_SYSTEM_IO) { - cmn_err(CE_NOTE, "!cpu_apci: %s contains unsupported " - "address space type %x", - cpu_acpi_obj_attrs[objtype].name, - greg->AddressSpaceId); - goto out; - } - } - - /* - * Looks good! 
- */ - for (i = 0; i < obj->Package.Count; i++) { - greg = (AML_RESOURCE_GENERIC_REGISTER *) - obj->Package.Elements[i].Buffer.Pointer; - regs[i].cr_addrspace_id = greg->AddressSpaceId; - regs[i].cr_width = greg->BitWidth; - regs[i].cr_offset = greg->BitOffset; - regs[i].cr_asize = greg->AccessSize; - regs[i].cr_address = greg->Address; - } - ret = 0; -out: - AcpiOsFree(abuf.Pointer); - return (ret); -} - -/* - * Cache the ACPI _PCT data. The _PCT data defines the interface to use - * when making power level transitions (i.e., system IO ports, fixed - * hardware port, etc). - */ -static int -cpu_acpi_cache_pct(cpu_acpi_handle_t handle) -{ - cpu_acpi_pct_t *pct; - int ret; - - CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_PCT_CACHED); - pct = &CPU_ACPI_PCT(handle)[0]; - if ((ret = cpu_acpi_cache_ctrl_regs(handle, PCT_OBJ, pct)) == 0) - CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_PCT_CACHED); - return (ret); -} - -/* - * Cache the ACPI _PTC data. The _PTC data defines the interface to use - * when making T-state transitions (i.e., system IO ports, fixed - * hardware port, etc). - */ -static int -cpu_acpi_cache_ptc(cpu_acpi_handle_t handle) -{ - cpu_acpi_ptc_t *ptc; - int ret; - - CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_PTC_CACHED); - ptc = &CPU_ACPI_PTC(handle)[0]; - if ((ret = cpu_acpi_cache_ctrl_regs(handle, PTC_OBJ, ptc)) == 0) - CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_PTC_CACHED); - return (ret); -} - -/* - * Cache the ACPI CPU state dependency data objects. - */ -static int -cpu_acpi_cache_state_dependencies(cpu_acpi_handle_t handle, - cpu_acpi_obj_t objtype, cpu_acpi_state_dependency_t *sd) -{ - ACPI_BUFFER abuf; - ACPI_OBJECT *pkg, *elements; - int ret = -1; - - /* - * Fetch the dependencies (if present) for the CPU node. - * Since they are optional, non-existence is not a failure - * (it's up to the caller to determine how to handle non-existence). 
- */ - abuf.Length = ACPI_ALLOCATE_BUFFER; - abuf.Pointer = NULL; - if (ACPI_FAILURE(AcpiEvaluateObjectTyped(handle->cs_handle, - cpu_acpi_obj_attrs[objtype].name, NULL, &abuf, - ACPI_TYPE_PACKAGE))) { - return (1); - } - - pkg = abuf.Pointer; - if (pkg->Package.Count != 1) { - cmn_err(CE_NOTE, "!cpu_acpi: %s unsupported package " - "count %d.", cpu_acpi_obj_attrs[objtype].name, - pkg->Package.Count); - goto out; - } - - if (pkg->Package.Elements[0].Type != ACPI_TYPE_PACKAGE || - pkg->Package.Elements[0].Package.Count != 5) { - cmn_err(CE_NOTE, "!cpu_acpi: Unexpected data in %s package.", - cpu_acpi_obj_attrs[objtype].name); - goto out; - } - elements = pkg->Package.Elements[0].Package.Elements; - if (elements[0].Integer.Value != 5 || elements[1].Integer.Value != 0) { - cmn_err(CE_NOTE, "!cpu_acpi: Unexpected %s revision.", - cpu_acpi_obj_attrs[objtype].name); - goto out; - } - - sd->sd_entries = elements[0].Integer.Value; - sd->sd_revision = elements[1].Integer.Value; - sd->sd_domain = elements[2].Integer.Value; - sd->sd_type = elements[3].Integer.Value; - sd->sd_num = elements[4].Integer.Value; - - ret = 0; -out: - AcpiOsFree(abuf.Pointer); - return (ret); -} - -/* - * Cache the ACPI _PSD data. The _PSD data defines P-state CPU dependencies - * (think CPU domains). - */ -static int -cpu_acpi_cache_psd(cpu_acpi_handle_t handle) -{ - cpu_acpi_psd_t *psd; - int ret; - - CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_PSD_CACHED); - psd = &CPU_ACPI_PSD(handle); - ret = cpu_acpi_cache_state_dependencies(handle, PSD_OBJ, psd); - if (ret == 0) - CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_PSD_CACHED); - return (ret); - -} - -/* - * Cache the ACPI _TSD data. The _TSD data defines T-state CPU dependencies - * (think CPU domains). 
- */ -static int -cpu_acpi_cache_tsd(cpu_acpi_handle_t handle) -{ - cpu_acpi_tsd_t *tsd; - int ret; - - CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_TSD_CACHED); - tsd = &CPU_ACPI_TSD(handle); - ret = cpu_acpi_cache_state_dependencies(handle, TSD_OBJ, tsd); - if (ret == 0) - CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_TSD_CACHED); - return (ret); - -} - -static void -cpu_acpi_cache_pstate(cpu_acpi_handle_t handle, ACPI_OBJECT *obj, int cnt) -{ - cpu_acpi_pstate_t *pstate; - ACPI_OBJECT *q, *l; - int i, j; - - CPU_ACPI_PSTATES_COUNT(handle) = cnt; - CPU_ACPI_PSTATES(handle) = kmem_zalloc(CPU_ACPI_PSTATES_SIZE(cnt), - KM_SLEEP); - pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle); - for (i = 0, l = NULL; i < obj->Package.Count && cnt > 0; i++, l = q) { - uint32_t *up; - - q = obj->Package.Elements[i].Package.Elements; - - /* - * Skip duplicate entries. - */ - if (l != NULL && l[0].Integer.Value == q[0].Integer.Value) - continue; - - up = (uint32_t *)pstate; - for (j = 0; j < CPU_ACPI_PSS_CNT; j++) - up[j] = q[j].Integer.Value; - pstate++; - cnt--; - } -} - -static void -cpu_acpi_cache_tstate(cpu_acpi_handle_t handle, ACPI_OBJECT *obj, int cnt) -{ - cpu_acpi_tstate_t *tstate; - ACPI_OBJECT *q, *l; - int i, j; - - CPU_ACPI_TSTATES_COUNT(handle) = cnt; - CPU_ACPI_TSTATES(handle) = kmem_zalloc(CPU_ACPI_TSTATES_SIZE(cnt), - KM_SLEEP); - tstate = (cpu_acpi_tstate_t *)CPU_ACPI_TSTATES(handle); - for (i = 0, l = NULL; i < obj->Package.Count && cnt > 0; i++, l = q) { - uint32_t *up; - - q = obj->Package.Elements[i].Package.Elements; - - /* - * Skip duplicate entries. - */ - if (l != NULL && l[0].Integer.Value == q[0].Integer.Value) - continue; - - up = (uint32_t *)tstate; - for (j = 0; j < CPU_ACPI_TSS_CNT; j++) - up[j] = q[j].Integer.Value; - tstate++; - cnt--; - } -} - -/* - * Cache the _PSS or _TSS data. 
- */ -static int -cpu_acpi_cache_supported_states(cpu_acpi_handle_t handle, - cpu_acpi_obj_t objtype, int fcnt) -{ - ACPI_BUFFER abuf; - ACPI_OBJECT *obj, *q, *l; - boolean_t eot = B_FALSE; - int ret = -1; - int cnt; - int i, j; - - /* - * Fetch the data (if present) for the CPU node. - */ - abuf.Length = ACPI_ALLOCATE_BUFFER; - abuf.Pointer = NULL; - if (ACPI_FAILURE(AcpiEvaluateObjectTyped(handle->cs_handle, - cpu_acpi_obj_attrs[objtype].name, NULL, &abuf, - ACPI_TYPE_PACKAGE))) { - cmn_err(CE_NOTE, "!cpu_acpi: %s package not found.", - cpu_acpi_obj_attrs[objtype].name); - return (1); - } - obj = abuf.Pointer; - if (obj->Package.Count < 2) { - cmn_err(CE_NOTE, "!cpu_acpi: %s package bad count %d.", - cpu_acpi_obj_attrs[objtype].name, obj->Package.Count); - goto out; - } - - /* - * Does the package look coherent? - */ - cnt = 0; - for (i = 0, l = NULL; i < obj->Package.Count; i++, l = q) { - if (obj->Package.Elements[i].Type != ACPI_TYPE_PACKAGE || - obj->Package.Elements[i].Package.Count != fcnt) { - cmn_err(CE_NOTE, "!cpu_acpi: " - "Unexpected data in %s package.", - cpu_acpi_obj_attrs[objtype].name); - goto out; - } - - q = obj->Package.Elements[i].Package.Elements; - for (j = 0; j < fcnt; j++) { - if (q[j].Type != ACPI_TYPE_INTEGER) { - cmn_err(CE_NOTE, "!cpu_acpi: " - "%s element invalid (type)", - cpu_acpi_obj_attrs[objtype].name); - goto out; - } - } - - /* - * Ignore duplicate entries. - */ - if (l != NULL && l[0].Integer.Value == q[0].Integer.Value) - continue; - - /* - * Some supported state tables are larger than required - * and unused elements are filled with patterns - * of 0xff. Simply check here for frequency = 0xffff - * and stop counting if found. - */ - if (q[0].Integer.Value == 0xffff) { - eot = B_TRUE; - continue; - } - - /* - * We should never find a valid entry after we've hit - * an the end-of-table entry. 
- */ - if (eot) { - cmn_err(CE_NOTE, "!cpu_acpi: " - "Unexpected data in %s package after eot.", - cpu_acpi_obj_attrs[objtype].name); - goto out; - } - - /* - * states must be defined in order from highest to lowest. - */ - if (l != NULL && l[0].Integer.Value < q[0].Integer.Value) { - cmn_err(CE_NOTE, "!cpu_acpi: " - "%s package state definitions out of order.", - cpu_acpi_obj_attrs[objtype].name); - goto out; - } - - /* - * This entry passes. - */ - cnt++; - } - if (cnt == 0) - goto out; - - /* - * Yes, fill in the structure. - */ - ASSERT(objtype == PSS_OBJ || objtype == TSS_OBJ); - (objtype == PSS_OBJ) ? cpu_acpi_cache_pstate(handle, obj, cnt) : - cpu_acpi_cache_tstate(handle, obj, cnt); - - ret = 0; -out: - AcpiOsFree(abuf.Pointer); - return (ret); -} - -/* - * Cache the _PSS data. The _PSS data defines the different power levels - * supported by the CPU and the attributes associated with each power level - * (i.e., frequency, voltage, etc.). The power levels are number from - * highest to lowest. That is, the highest power level is _PSS entry 0 - * and the lowest power level is the last _PSS entry. - */ -static int -cpu_acpi_cache_pstates(cpu_acpi_handle_t handle) -{ - int ret; - - CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_PSS_CACHED); - ret = cpu_acpi_cache_supported_states(handle, PSS_OBJ, - CPU_ACPI_PSS_CNT); - if (ret == 0) - CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_PSS_CACHED); - return (ret); -} - -/* - * Cache the _TSS data. The _TSS data defines the different freq throttle - * levels supported by the CPU and the attributes associated with each - * throttle level (i.e., frequency throttle percentage, voltage, etc.). - * The throttle levels are number from highest to lowest. 
- */ -static int -cpu_acpi_cache_tstates(cpu_acpi_handle_t handle) -{ - int ret; - - CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_TSS_CACHED); - ret = cpu_acpi_cache_supported_states(handle, TSS_OBJ, - CPU_ACPI_TSS_CNT); - if (ret == 0) - CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_TSS_CACHED); - return (ret); -} - -/* - * Cache the ACPI CPU present capabilities data objects. - */ -static int -cpu_acpi_cache_present_capabilities(cpu_acpi_handle_t handle, - cpu_acpi_obj_t objtype, cpu_acpi_present_capabilities_t *pc) - -{ - ACPI_BUFFER abuf; - ACPI_OBJECT *obj; - - /* - * Fetch the present capabilites object (if present) for the CPU node. - * Since they are optional, non-existence is not a failure. - */ - abuf.Length = ACPI_ALLOCATE_BUFFER; - abuf.Pointer = NULL; - if (ACPI_FAILURE(AcpiEvaluateObject(handle->cs_handle, - cpu_acpi_obj_attrs[objtype].name, NULL, &abuf)) || - abuf.Length == 0) { - *pc = 0; - return (1); - } - - obj = (ACPI_OBJECT *)abuf.Pointer; - *pc = obj->Integer.Value; - AcpiOsFree(abuf.Pointer); - return (0); -} - -/* - * Cache the _PPC data. The _PPC simply contains an integer value which - * represents the highest power level that a CPU should transition to. - * That is, it's an index into the array of _PSS entries and will be - * greater than or equal to zero. - */ -void -cpu_acpi_cache_ppc(cpu_acpi_handle_t handle) -{ - cpu_acpi_ppc_t *ppc; - int ret; - - CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_PPC_CACHED); - ppc = &CPU_ACPI_PPC(handle); - ret = cpu_acpi_cache_present_capabilities(handle, PPC_OBJ, ppc); - if (ret == 0) - CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_PPC_CACHED); -} - -/* - * Cache the _TPC data. The _TPC simply contains an integer value which - * represents the throttle level that a CPU should transition to. - * That is, it's an index into the array of _TSS entries and will be - * greater than or equal to zero. 
- */ -void -cpu_acpi_cache_tpc(cpu_acpi_handle_t handle) -{ - cpu_acpi_tpc_t *tpc; - int ret; - - CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_TPC_CACHED); - tpc = &CPU_ACPI_TPC(handle); - ret = cpu_acpi_cache_present_capabilities(handle, TPC_OBJ, tpc); - if (ret == 0) - CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_TPC_CACHED); -} - -/* - * Cache the _PCT, _PSS, _PSD and _PPC data. - */ -int -cpu_acpi_cache_pstate_data(cpu_acpi_handle_t handle) -{ - if (cpu_acpi_cache_pct(handle) < 0) { - cmn_err(CE_WARN, "!cpu_acpi: error parsing _PCT for " - "CPU instance %d", ddi_get_instance(handle->cs_dip)); - return (-1); - } - - if (cpu_acpi_cache_pstates(handle) != 0) { - cmn_err(CE_WARN, "!cpu_acpi: error parsing _PSS for " - "CPU instance %d", ddi_get_instance(handle->cs_dip)); - return (-1); - } - - if (cpu_acpi_cache_psd(handle) < 0) { - cmn_err(CE_WARN, "!cpu_acpi: error parsing _PSD for " - "CPU instance %d", ddi_get_instance(handle->cs_dip)); - return (-1); - } - - cpu_acpi_cache_ppc(handle); - - return (0); -} - -void -cpu_acpi_free_pstate_data(cpu_acpi_handle_t handle) -{ - if (handle != NULL) { - if (CPU_ACPI_PSTATES(handle)) { - kmem_free(CPU_ACPI_PSTATES(handle), - CPU_ACPI_PSTATES_SIZE( - CPU_ACPI_PSTATES_COUNT(handle))); - CPU_ACPI_PSTATES(handle) = NULL; - } - } -} - -/* - * Cache the _PTC, _TSS, _TSD and _TPC data. 
- */ -int -cpu_acpi_cache_tstate_data(cpu_acpi_handle_t handle) -{ - if (cpu_acpi_cache_ptc(handle) < 0) { - cmn_err(CE_WARN, "!cpu_acpi: error parsing _PTC for " - "CPU instance %d", ddi_get_instance(handle->cs_dip)); - return (-1); - } - - if (cpu_acpi_cache_tstates(handle) != 0) { - cmn_err(CE_WARN, "!cpu_acpi: error parsing _TSS for " - "CPU instance %d", ddi_get_instance(handle->cs_dip)); - return (-1); - } - - if (cpu_acpi_cache_tsd(handle) < 0) { - cmn_err(CE_WARN, "!cpu_acpi: error parsing _TSD for " - "CPU instance %d", ddi_get_instance(handle->cs_dip)); - return (-1); - } - - cpu_acpi_cache_tpc(handle); - - return (0); -} - -void -cpu_acpi_free_tstate_data(cpu_acpi_handle_t handle) -{ - if (handle != NULL) { - if (CPU_ACPI_TSTATES(handle)) { - kmem_free(CPU_ACPI_TSTATES(handle), - CPU_ACPI_TSTATES_SIZE( - CPU_ACPI_TSTATES_COUNT(handle))); - CPU_ACPI_TSTATES(handle) = NULL; - } - } -} - -/* - * Register a handler for processor change notifications. - */ -void -cpu_acpi_install_notify_handler(cpu_acpi_handle_t handle, - ACPI_NOTIFY_HANDLER handler, dev_info_t *dip) -{ - char path[MAXNAMELEN]; - if (ACPI_FAILURE(AcpiInstallNotifyHandler(handle->cs_handle, - ACPI_DEVICE_NOTIFY, handler, dip))) - cmn_err(CE_NOTE, "!cpu_acpi: Unable to register " - "notify handler for %s", ddi_pathname(dip, path)); -} - -/* - * Write _PDC. - */ -int -cpu_acpi_write_pdc(cpu_acpi_handle_t handle, uint32_t revision, uint32_t count, - uint32_t *capabilities) -{ - ACPI_OBJECT obj; - ACPI_OBJECT_LIST list = { 1, &obj}; - uint32_t *buffer; - uint32_t *bufptr; - uint32_t bufsize; - int i; - - bufsize = (count + 2) * sizeof (uint32_t); - buffer = kmem_zalloc(bufsize, KM_SLEEP); - buffer[0] = revision; - buffer[1] = count; - bufptr = &buffer[2]; - for (i = 0; i < count; i++) - *bufptr++ = *capabilities++; - - obj.Type = ACPI_TYPE_BUFFER; - obj.Buffer.Length = bufsize; - obj.Buffer.Pointer = (void *)buffer; - - /* - * _PDC is optional, so don't log failure. 
- */ - if (ACPI_FAILURE(AcpiEvaluateObject(handle->cs_handle, "_PDC", - &list, NULL))) { - kmem_free(buffer, bufsize); - return (-1); - } - - kmem_free(buffer, bufsize); - return (0); -} - -/* - * Write to system IO port. - */ -int -cpu_acpi_write_port(ACPI_IO_ADDRESS address, uint32_t value, uint32_t width) -{ - if (ACPI_FAILURE(AcpiOsWritePort(address, value, width))) { - cmn_err(CE_NOTE, "cpu_acpi: error writing system IO port " - "%lx.", (long)address); - return (-1); - } - return (0); -} - -/* - * Read from a system IO port. - */ -int -cpu_acpi_read_port(ACPI_IO_ADDRESS address, uint32_t *value, uint32_t width) -{ - if (ACPI_FAILURE(AcpiOsReadPort(address, value, width))) { - cmn_err(CE_NOTE, "cpu_acpi: error reading system IO port " - "%lx.", (long)address); - return (-1); - } - return (0); -} - -/* - * Return supported frequencies. - */ -uint_t -cpu_acpi_get_speeds(cpu_acpi_handle_t handle, int **speeds) -{ - cpu_acpi_pstate_t *pstate; - int *hspeeds; - uint_t nspeeds; - int i; - - nspeeds = CPU_ACPI_PSTATES_COUNT(handle); - pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle); - hspeeds = kmem_zalloc(nspeeds * sizeof (int), KM_SLEEP); - for (i = 0; i < nspeeds; i++) { - hspeeds[i] = CPU_ACPI_FREQ(pstate); - pstate++; - } - *speeds = hspeeds; - return (nspeeds); -} - -/* - * Free resources allocated by cpu_acpi_get_speeds(). - */ -void -cpu_acpi_free_speeds(int *speeds, uint_t nspeeds) -{ - kmem_free(speeds, nspeeds * sizeof (int)); -} - -/* - * Map the dip to an ACPI handle for the device. - */ -cpu_acpi_handle_t -cpu_acpi_init(dev_info_t *dip) -{ - cpu_acpi_handle_t handle; - - handle = kmem_zalloc(sizeof (cpu_acpi_state_t), KM_SLEEP); - - if (ACPI_FAILURE(acpica_get_handle(dip, &handle->cs_handle))) { - kmem_free(handle, sizeof (cpu_acpi_state_t)); - return (NULL); - } - handle->cs_dip = dip; - return (handle); -} - -/* - * Free any resources. 
- */ -void -cpu_acpi_fini(cpu_acpi_handle_t handle) -{ - if (handle) - kmem_free(handle, sizeof (cpu_acpi_state_t)); -}
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_amd.c Wed Feb 25 20:53:30 2009 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * AMD specific CPU power management support. - */ - -#include <sys/x86_archext.h> -#include <sys/cpudrv_mach.h> -#include <sys/cpu_acpi.h> -#include <sys/pwrnow.h> - -boolean_t -cpudrv_amd_init(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - - /* AMD? */ - if (x86_vendor != X86_VENDOR_AMD) - return (B_FALSE); - - /* - * If we support PowerNow! on this processor, then set the - * correct pstate_ops for the processor. - */ - mach_state->cpupm_pstate_ops = pwrnow_supported() ? &pwrnow_ops : NULL; - - return (B_TRUE); -}
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_intel.c Wed Feb 25 20:53:30 2009 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,95 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Intel specific CPU power management support. - */ - -#include <sys/x86_archext.h> -#include <sys/cpudrv_mach.h> -#include <sys/cpu_acpi.h> -#include <sys/speedstep.h> -#include <sys/cpudrv_throttle.h> - -/* - * The Intel Processor Driver Capabilities (_PDC). - * See Intel Processor Vendor-Specific ACPI Interface Specification - * for details. 
- */ -#define CPUDRV_INTEL_PDC_REVISION 0x1 -#define CPUDRV_INTEL_PDC_PS_MSR 0x0001 -#define CPUDRV_INTEL_PDC_C1_HALT 0x0002 -#define CPUDRV_INTEL_PDC_TS_MSR 0x0004 -#define CPUDRV_INTEL_PDC_MP 0x0008 -#define CPUDRV_INTEL_PDC_SW_PSD 0x0020 -#define CPUDRV_INTEL_PDC_TSD 0x0080 -#define CPUDRV_INTEL_PDC_HW_PSD 0x0800 - -static uint32_t cpudrv_intel_pdccap = 0; - -boolean_t -cpudrv_intel_init(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - uint_t family; - uint_t model; - - if (x86_vendor != X86_VENDOR_Intel) - return (B_FALSE); - - family = cpuid_getfamily(CPU); - model = cpuid_getmodel(CPU); - - /* - * If we support SpeedStep on this processor, then set the - * correct pstate_ops for the processor and enable appropriate - * _PDC bits. - */ - if (speedstep_supported(family, model)) { - mach_state->cpupm_pstate_ops = &speedstep_ops; - cpudrv_intel_pdccap = CPUDRV_INTEL_PDC_PS_MSR | - CPUDRV_INTEL_PDC_C1_HALT | CPUDRV_INTEL_PDC_MP | - CPUDRV_INTEL_PDC_SW_PSD | CPUDRV_INTEL_PDC_HW_PSD; - } else { - mach_state->cpupm_pstate_ops = NULL; - } - - /* - * Set the correct tstate_ops for the processor and - * enable appropriate _PDC bits. - */ - mach_state->cpupm_tstate_ops = &cpudrv_throttle_ops; - cpudrv_intel_pdccap |= CPUDRV_INTEL_PDC_TS_MSR | - CPUDRV_INTEL_PDC_TSD; - - /* - * _PDC support is optional and the driver should - * function even if the _PDC write fails. - */ - (void) cpu_acpi_write_pdc(mach_state->acpi_handle, - CPUDRV_INTEL_PDC_REVISION, 1, &cpudrv_intel_pdccap); - - return (B_TRUE); -}
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_mach.c Wed Feb 25 20:53:30 2009 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,516 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * CPU power management driver support for i86pc. - */ - -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/cpupm.h> -#include <sys/cpudrv_mach.h> -#include <sys/machsystm.h> - -/* - * Constants used by the Processor Device Notification handler - * that identify what kind of change has occurred. We currently - * only handle PPC_CHANGE_NOTIFICATION. The other two are - * ignored. - */ -#define PPC_CHANGE_NOTIFICATION 0x80 -#define CST_CHANGE_NOTIFICATION 0x81 -#define TPC_CHANGE_NOTIFICATION 0x82 - -/* - * Note that our driver numbers the power levels from lowest to - * highest starting at 1 (i.e., the lowest power level is 1 and - * the highest power level is cpupm->num_spd). 
The x86 modules get - * their power levels from ACPI which numbers power levels from - * highest to lowest starting at 0 (i.e., the lowest power level - * is (cpupm->num_spd - 1) and the highest power level is 0). So to - * map one of our driver power levels to one understood by ACPI we - * simply subtract our driver power level from cpupm->num_spd. Likewise, - * to map an ACPI power level to the proper driver power level, we - * subtract the ACPI power level from cpupm->num_spd. - */ -#define PM_2_PLAT_LEVEL(cpupm, pm_level) (cpupm->num_spd - pm_level) -#define PLAT_2_PM_LEVEL(cpupm, plat_level) (cpupm->num_spd - plat_level) - -extern boolean_t cpudrv_intel_init(cpudrv_devstate_t *); -extern boolean_t cpudrv_amd_init(cpudrv_devstate_t *); - -typedef struct cpudrv_mach_vendor { - boolean_t (*cpuv_init)(cpudrv_devstate_t *); -} cpudrv_mach_vendor_t; - -/* - * Table of supported vendors. - */ -static cpudrv_mach_vendor_t cpudrv_vendors[] = { - cpudrv_intel_init, - cpudrv_amd_init, - NULL -}; - -uint_t -cpudrv_pm_get_speeds(cpudrv_devstate_t *cpudsp, int **speeds) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - return (cpu_acpi_get_speeds(mach_state->acpi_handle, speeds)); -} - -void -cpudrv_pm_free_speeds(int *speeds, uint_t nspeeds) -{ - cpu_acpi_free_speeds(speeds, nspeeds); -} - -/* - * Change CPU speed using interface provided by module. - */ -int -cpudrv_pm_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpudrv_pm_t *cpupm; - uint32_t plat_level; - int ret; - - if (!(mach_state->caps & CPUDRV_P_STATES)) - return (DDI_FAILURE); - ASSERT(mach_state->cpupm_pstate_ops != NULL); - cpupm = &(cpudsp->cpudrv_pm); - plat_level = PM_2_PLAT_LEVEL(cpupm, new_spd->pm_level); - ret = mach_state->cpupm_pstate_ops->cpups_power(cpudsp, plat_level); - if (ret != 0) - return (DDI_FAILURE); - return (DDI_SUCCESS); -} - -/* - * Determine the cpu_id for the CPU device. 
- */ -boolean_t -cpudrv_pm_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id) -{ - return ((*cpu_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip, - DDI_PROP_DONTPASS, "reg", -1)) != -1); - -} - -/* - * All CPU instances have been initialized successfully. - */ -boolean_t -cpudrv_pm_power_ready(void) -{ - return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready()); -} - -/* - * All CPU instances have been initialized successfully. - */ -boolean_t -cpudrv_pm_throttle_ready(void) -{ - return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready()); -} - -/* - * Is the current thread the thread that is handling the - * PPC change notification? - */ -boolean_t -cpudrv_pm_is_governor_thread(cpudrv_pm_t *cpupm) -{ - return (curthread == cpupm->pm_governor_thread); -} - -/* - * Initialize the machine. - * See if a module exists for managing power for this CPU. - */ -boolean_t -cpudrv_mach_pm_init(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_vendor_t *vendors; - cpudrv_mach_state_t *mach_state; - int ret; - - mach_state = cpudsp->mach_state = - kmem_zalloc(sizeof (cpudrv_mach_state_t), KM_SLEEP); - mach_state->caps = CPUDRV_NO_STATES; - - mach_state->acpi_handle = cpu_acpi_init(cpudsp->dip); - if (mach_state->acpi_handle == NULL) { - cpudrv_mach_pm_free(cpudsp); - cmn_err(CE_WARN, "!cpudrv_mach_pm_init: instance %d: " - "unable to get ACPI handle", - ddi_get_instance(cpudsp->dip)); - cmn_err(CE_NOTE, "!CPU power management will not function."); - return (B_FALSE); - } - - /* - * Loop through the CPU management module table and see if - * any of the modules implement CPU power management - * for this CPU. - */ - for (vendors = cpudrv_vendors; vendors->cpuv_init != NULL; vendors++) { - if (vendors->cpuv_init(cpudsp)) - break; - } - - /* - * Nope, we can't power manage this CPU. - */ - if (vendors == NULL) { - cpudrv_mach_pm_free(cpudsp); - return (B_FALSE); - } - - /* - * If P-state support exists for this system, then initialize it. 
- */ - if (mach_state->cpupm_pstate_ops != NULL) { - ret = mach_state->cpupm_pstate_ops->cpups_init(cpudsp); - if (ret != 0) { - cmn_err(CE_WARN, "!cpudrv_mach_pm_init: instance %d:" - " unable to initialize P-state support", - ddi_get_instance(cpudsp->dip)); - mach_state->cpupm_pstate_ops = NULL; - cpupm_disable(CPUPM_P_STATES); - } else { - mach_state->caps |= CPUDRV_P_STATES; - } - } - - if (mach_state->cpupm_tstate_ops != NULL) { - ret = mach_state->cpupm_tstate_ops->cputs_init(cpudsp); - if (ret != 0) { - cmn_err(CE_WARN, "!cpudrv_mach_pm_init: instance %d:" - " unable to initialize T-state support", - ddi_get_instance(cpudsp->dip)); - mach_state->cpupm_tstate_ops = NULL; - cpupm_disable(CPUPM_T_STATES); - } else { - mach_state->caps |= CPUDRV_T_STATES; - } - } - - if (mach_state->caps == CPUDRV_NO_STATES) { - cpudrv_mach_pm_free(cpudsp); - return (B_FALSE); - } - - return (B_TRUE); -} - -/* - * Free any resources allocated by cpudrv_mach_pm_init(). - */ -void -cpudrv_mach_pm_free(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - - if (mach_state == NULL) - return; - if (mach_state->cpupm_pstate_ops != NULL) { - mach_state->cpupm_pstate_ops->cpups_fini(cpudsp); - mach_state->cpupm_pstate_ops = NULL; - } - - if (mach_state->cpupm_tstate_ops != NULL) { - mach_state->cpupm_tstate_ops->cputs_fini(cpudsp); - mach_state->cpupm_tstate_ops = NULL; - } - - if (mach_state->acpi_handle != NULL) { - cpu_acpi_fini(mach_state->acpi_handle); - mach_state->acpi_handle = NULL; - } - - kmem_free(mach_state, sizeof (cpudrv_mach_state_t)); - cpudsp->mach_state = NULL; -} - -/* - * This routine changes the top speed to which the CPUs can transition by: - * - * - Resetting the up_spd for all speeds lower than the new top speed - * to point to the new top speed. - * - Updating the framework with a new "normal" (maximum power) for this - * device. 
- */ -void -cpudrv_pm_set_topspeed(void *ctx, int plat_level) -{ - cpudrv_devstate_t *cpudsp; - cpudrv_pm_t *cpupm; - cpudrv_pm_spd_t *spd; - cpudrv_pm_spd_t *top_spd; - dev_info_t *dip; - int pm_level; - int instance; - int i; - - dip = ctx; - instance = ddi_get_instance(dip); - cpudsp = ddi_get_soft_state(cpudrv_state, instance); - ASSERT(cpudsp != NULL); - - mutex_enter(&cpudsp->lock); - cpupm = &(cpudsp->cpudrv_pm); - pm_level = PLAT_2_PM_LEVEL(cpupm, plat_level); - for (i = 0, spd = cpupm->head_spd; spd; i++, spd = spd->down_spd) { - /* - * Don't mess with speeds that are higher than the new - * top speed. They should be out of range anyway. - */ - if (spd->pm_level > pm_level) - continue; - /* - * This is the new top speed. - */ - if (spd->pm_level == pm_level) - top_spd = spd; - - spd->up_spd = top_spd; - } - cpupm->top_spd = top_spd; - - cpupm->pm_governor_thread = curthread; - - mutex_exit(&cpudsp->lock); - - (void) pm_update_maxpower(dip, 0, top_spd->pm_level); -} - -/* - * This routine reads the ACPI _PPC object. It's accessed as a callback - * by the ppm driver whenever a _PPC change notification is received. - */ -int -cpudrv_pm_get_topspeed(void *ctx) -{ - cpudrv_mach_state_t *mach_state; - cpu_acpi_handle_t handle; - cpudrv_devstate_t *cpudsp; - cpudrv_pm_t *cpupm; - dev_info_t *dip; - int instance; - int plat_level; - int max_level; - - dip = ctx; - instance = ddi_get_instance(dip); - cpudsp = ddi_get_soft_state(cpudrv_state, instance); - ASSERT(cpudsp != NULL); - cpupm = &(cpudsp->cpudrv_pm); - mach_state = cpudsp->mach_state; - handle = mach_state->acpi_handle; - - cpu_acpi_cache_ppc(handle); - plat_level = CPU_ACPI_PPC(handle); - max_level = cpupm->num_spd - 1; - if ((plat_level < 0) || (plat_level > max_level)) { - cmn_err(CE_NOTE, "!cpudrv_pm_get_topspeed: instance %d: " - "_PPC out of range %d", instance, plat_level); - - plat_level = 0; - } - return (plat_level); -} - -/* - * This routine reads the ACPI _TPC object. 
It's accessed as a callback - * by the cpu driver whenever a _TPC change notification is received. - */ -int -cpudrv_pm_get_topthrottle(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state; - cpu_acpi_handle_t handle; - int throtl_level; - - mach_state = cpudsp->mach_state; - handle = mach_state->acpi_handle; - - cpu_acpi_cache_tpc(handle); - throtl_level = CPU_ACPI_TPC(handle); - return (throtl_level); -} - -/* - * Take care of CPU throttling when _TPC notification arrives - */ -void -cpudrv_pm_throttle_instance(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state; - uint32_t new_level; - int ret; - - ASSERT(cpudsp != NULL); - mach_state = cpudsp->mach_state; - if (!(mach_state->caps & CPUDRV_T_STATES)) - return; - ASSERT(mach_state->cpupm_tstate_ops != NULL); - - /* - * Get the new T-State support level - */ - new_level = cpudrv_pm_get_topthrottle(cpudsp); - - /* - * Change the cpu throttling to the new level - */ - ret = mach_state->cpupm_tstate_ops->cputs_throttle(cpudsp, new_level); - if (ret != 0) { - cmn_err(CE_WARN, "Cannot change the cpu throttling to the new" - " level: %d, Instance: %d", new_level, cpudsp->cpu_id); - } -} - -/* - * Take care of CPU throttling when _TPC notification arrives - */ -void -cpudrv_pm_manage_throttling(void *ctx) -{ - cpudrv_devstate_t *cpudsp; - cpudrv_mach_state_t *mach_state; - cpudrv_tstate_domain_t *domain; - cpudrv_tstate_domain_node_t *domain_node; - int instance; - boolean_t is_ready; - - instance = ddi_get_instance((dev_info_t *)ctx); - cpudsp = ddi_get_soft_state(cpudrv_state, instance); - ASSERT(cpudsp != NULL); - - /* - * We currently refuse to power manage if the CPU is not ready to - * take cross calls (cross calls fail silently if CPU is not ready - * for it). - * - * Additionally, for x86 platforms we cannot power manage - * any one instance, until all instances have been initialized. 
- * That's because we don't know what the CPU domains look like - * until all instances have been initialized. - */ - is_ready = CPUDRV_PM_XCALL_IS_READY(cpudsp->cpu_id); - if (!is_ready) { - DPRINTF(D_POWER, ("cpudrv_power: instance %d: " - "CPU not ready for x-calls\n", instance)); - } else if (!(is_ready = cpudrv_pm_throttle_ready())) { - DPRINTF(D_POWER, ("cpudrv_power: instance %d: " - "waiting for all CPUs to be ready\n", instance)); - } - if (!is_ready) { - return; - } - - mach_state = cpudsp->mach_state; - domain_node = mach_state->tstate_domain_node; - domain = domain_node->tdn_domain; - - switch (domain->td_type) { - case CPU_ACPI_SW_ANY: - /* - * Just throttle the current instance and all other instances - * under the same domain will get throttled to the same level - */ - cpudrv_pm_throttle_instance(cpudsp); - break; - case CPU_ACPI_HW_ALL: - case CPU_ACPI_SW_ALL: - /* - * Along with the current instance, throttle all the CPU's that - * belong to the same domain - */ - mutex_enter(&domain->td_lock); - for (domain_node = domain->td_node; domain_node != NULL; - domain_node = domain_node->tdn_next) - cpudrv_pm_throttle_instance(domain_node->tdn_cpudsp); - mutex_exit(&domain->td_lock); - break; - - default: - cmn_err(CE_WARN, "Not a valid coordination type (%x) to" - " throttle cpu", domain->td_domain); - break; - } -} - -/* - * This notification handler is called whenever the ACPI _PPC - * object changes. The _PPC is a sort of governor on power levels. - * It sets an upper threshold on which, _PSS defined, power levels - * are usuable. The _PPC value is dynamic and may change as properties - * (i.e., thermal or AC source) of the system change. - */ -/* ARGSUSED */ -static void -cpudrv_pm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx) -{ - /* - * We only handle _PPC change notifications. 
- */ - if (val == PPC_CHANGE_NOTIFICATION) - cpudrv_pm_redefine_topspeed(ctx); - else if (val == TPC_CHANGE_NOTIFICATION) { - cpudrv_pm_manage_throttling(ctx); - } -} - -void -cpudrv_pm_install_notify_handler(cpudrv_devstate_t *cpudsp, dev_info_t *dip) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_install_notify_handler(mach_state->acpi_handle, - cpudrv_pm_notify_handler, dip); -} - -void -cpudrv_pm_redefine_topspeed(void *ctx) -{ - /* - * This should never happen, unless ppm does not get loaded. - */ - if (cpupm_redefine_topspeed == NULL) { - cmn_err(CE_WARN, "cpudrv_pm_redefine_topspeed: " - "cpupm_redefine_topspeed has not been initialized - " - "ignoring notification"); - return; - } - - /* - * ppm callback needs to handle redefinition for all CPUs in - * the domain. - */ - (*cpupm_redefine_topspeed)(ctx); -}
--- a/usr/src/uts/i86pc/io/cpudrv/cpudrv_throttle.c Wed Feb 25 20:53:30 2009 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,350 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <sys/x86_archext.h> -#include <sys/machsystm.h> -#include <sys/x_call.h> -#include <sys/cpu_acpi.h> -#include <sys/cpudrv_throttle.h> -#include <sys/dtrace.h> -#include <sys/sdt.h> - -static int cpudrv_throttle_init(cpudrv_devstate_t *); -static void cpudrv_throttle_fini(cpudrv_devstate_t *); -static int cpudrv_throttle(cpudrv_devstate_t *, uint32_t); - -cpudrv_tstate_ops_t cpudrv_throttle_ops = { - "Generic ACPI T-state Support", - cpudrv_throttle_init, - cpudrv_throttle_fini, - cpudrv_throttle -}; - -/* - * Error returns - */ -#define THROTTLE_RET_SUCCESS 0x00 -#define THROTTLE_RET_INCOMPLETE_DATA 0x01 -#define THROTTLE_RET_UNSUP_STATE 0x02 -#define THROTTLE_RET_TRANS_INCOMPLETE 0x03 - -#define THROTTLE_LATENCY_WAIT 1 - -/* - * MSR register for clock modulation - */ -#define IA32_CLOCK_MODULATION_MSR 0x19A - -/* - * Debugging support - */ -#ifdef DEBUG -volatile int cpudrv_throttle_debug = 0; -#define CTDEBUG(arglist) if (cpudrv_throttle_debug) printf arglist; -#else -#define CTDEBUG(arglist) -#endif - -cpudrv_tstate_domain_t *cpudrv_tstate_domains = NULL; - -/* - * Allocate a new domain node. 
- */ -static void -cpudrv_alloc_tstate_domain(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_handle_t handle = mach_state->acpi_handle; - cpudrv_tstate_domain_t *dptr; - cpudrv_tstate_domain_node_t *nptr; - uint32_t domain; - uint32_t type; - cpu_t *cp; - - if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) { - domain = CPU_ACPI_TSD(handle).sd_domain; - type = CPU_ACPI_TSD(handle).sd_type; - } else { - mutex_enter(&cpu_lock); - cp = cpu[CPU->cpu_id]; - domain = cpuid_get_chipid(cp); - mutex_exit(&cpu_lock); - type = CPU_ACPI_SW_ALL; - } - - for (dptr = cpudrv_tstate_domains; dptr != NULL; - dptr = dptr->td_next) { - if (dptr->td_domain == domain) - break; - } - - /* new domain is created and linked at the head */ - if (dptr == NULL) { - dptr = kmem_zalloc(sizeof (cpudrv_tstate_domain_t), KM_SLEEP); - dptr->td_domain = domain; - dptr->td_type = type; - dptr->td_next = cpudrv_tstate_domains; - mutex_init(&dptr->td_lock, NULL, MUTEX_DRIVER, NULL); - cpudrv_tstate_domains = dptr; - } - - /* new domain node is created and linked at the head of the domain */ - nptr = kmem_zalloc(sizeof (cpudrv_tstate_domain_node_t), KM_SLEEP); - nptr->tdn_cpudsp = cpudsp; - nptr->tdn_domain = dptr; - nptr->tdn_next = dptr->td_node; - dptr->td_node = nptr; - mach_state->tstate_domain_node = nptr; -} - -static void -cpudrv_free_tstate_domains() -{ - cpudrv_tstate_domain_t *this_domain, *next_domain; - cpudrv_tstate_domain_node_t *this_node, *next_node; - - this_domain = cpudrv_tstate_domains; - while (this_domain != NULL) { - next_domain = this_domain->td_next; - - /* discard CPU node chain */ - this_node = this_domain->td_node; - while (this_node != NULL) { - next_node = this_node->tdn_next; - kmem_free((void *)this_node, - sizeof (cpudrv_tstate_domain_node_t)); - this_node = next_node; - } - mutex_destroy(&this_domain->td_lock); - kmem_free((void *)this_domain, - sizeof (cpudrv_tstate_domain_t)); - this_domain = next_domain; - } - 
cpudrv_tstate_domains = NULL; -} - -/* - * Write the _PTC ctrl register. How it is written, depends upon the _PTC - * APCI object value. - */ -static int -write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl) -{ - cpu_acpi_ptc_t *ptc_ctrl; - uint64_t reg; - int ret = 0; - - ptc_ctrl = CPU_ACPI_PTC_CTRL(handle); - - switch (ptc_ctrl->cr_addrspace_id) { - case ACPI_ADR_SPACE_FIXED_HARDWARE: - /* - * Read current thermal state because reserved bits must be - * preserved, compose new value, and write it.The writable - * bits are 4:1 (1 to 4). - * Bits 3:1 => On-Demand Clock Modulation Duty Cycle - * Bit 4 => On-Demand Clock Modulation Enable - * Left shift ctrl by 1 to allign with bits 1-4 of MSR - */ - reg = rdmsr(IA32_CLOCK_MODULATION_MSR); - reg &= ~((uint64_t)0x1E); - reg |= ctrl; - wrmsr(IA32_CLOCK_MODULATION_MSR, reg); - break; - - case ACPI_ADR_SPACE_SYSTEM_IO: - ret = cpu_acpi_write_port(ptc_ctrl->cr_address, ctrl, - ptc_ctrl->cr_width); - break; - - default: - DTRACE_PROBE1(throttle_ctrl_unsupported_type, uint8_t, - ptc_ctrl->cr_addrspace_id); - - ret = -1; - } - - DTRACE_PROBE1(throttle_ctrl_write, uint32_t, ctrl); - DTRACE_PROBE1(throttle_ctrl_write_err, int, ret); - - return (ret); -} - -static int -read_status(cpu_acpi_handle_t handle, uint32_t *stat) -{ - cpu_acpi_ptc_t *ptc_stat; - uint64_t reg; - int ret = 0; - - ptc_stat = CPU_ACPI_PTC_STATUS(handle); - - switch (ptc_stat->cr_addrspace_id) { - case ACPI_ADR_SPACE_FIXED_HARDWARE: - reg = rdmsr(IA32_CLOCK_MODULATION_MSR); - *stat = reg & 0x1E; - ret = 0; - break; - - case ACPI_ADR_SPACE_SYSTEM_IO: - ret = cpu_acpi_read_port(ptc_stat->cr_address, stat, - ptc_stat->cr_width); - break; - - default: - DTRACE_PROBE1(throttle_status_unsupported_type, uint8_t, - ptc_stat->cr_addrspace_id); - - return (-1); - } - - DTRACE_PROBE1(throttle_status_read, uint32_t, *stat); - DTRACE_PROBE1(throttle_status_read_err, int, ret); - - return (ret); -} - -/* - * Transition the current processor to the requested throttling 
state. - */ -static void -cpudrv_tstate_transition(int *ret, cpudrv_devstate_t *cpudsp, - uint32_t req_state) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_handle_t handle = mach_state->acpi_handle; - cpu_acpi_tstate_t *req_tstate; - uint32_t ctrl; - uint32_t stat; - int i; - - req_tstate = (cpu_acpi_tstate_t *)CPU_ACPI_TSTATES(handle); - req_tstate += req_state; - DTRACE_PROBE1(throttle_transition, uint32_t, - CPU_ACPI_FREQPER(req_tstate)); - - /* - * Initiate the processor t-state change. - */ - ctrl = CPU_ACPI_TSTATE_CTRL(req_tstate); - if (write_ctrl(handle, ctrl) != 0) { - *ret = THROTTLE_RET_UNSUP_STATE; - return; - } - - /* - * If status is zero, then transition is synchronous and - * no status value comparison is required. - */ - if (CPU_ACPI_TSTATE_STAT(req_tstate) == 0) { - *ret = THROTTLE_RET_SUCCESS; - return; - } - - /* Wait until switch is complete, but bound the loop just in case. */ - for (i = CPU_ACPI_TSTATE_TRANSLAT(req_tstate) * 2; i >= 0; - i -= THROTTLE_LATENCY_WAIT) { - if (read_status(handle, &stat) == 0 && - CPU_ACPI_TSTATE_STAT(req_tstate) == stat) - break; - drv_usecwait(THROTTLE_LATENCY_WAIT); - } - - if (CPU_ACPI_TSTATE_STAT(req_tstate) != stat) { - DTRACE_PROBE(throttle_transition_incomplete); - *ret = THROTTLE_RET_TRANS_INCOMPLETE; - } else { - *ret = THROTTLE_RET_SUCCESS; - } -} - -static int -cpudrv_throttle(cpudrv_devstate_t *cpudsp, uint32_t throtl_lvl) -{ - cpuset_t cpus; - int ret; - - /* - * If thread is already running on target CPU then just - * make the transition request. Otherwise, we'll need to - * make a cross-call. 
- */ - kpreempt_disable(); - if (cpudsp->cpu_id == CPU->cpu_id) { - cpudrv_tstate_transition(&ret, cpudsp, throtl_lvl); - } else { - CPUSET_ONLY(cpus, cpudsp->cpu_id); - xc_call((xc_arg_t)&ret, (xc_arg_t)cpudsp, (xc_arg_t)throtl_lvl, - X_CALL_HIPRI, cpus, (xc_func_t)cpudrv_tstate_transition); - } - kpreempt_enable(); - - return (ret); -} - -static int -cpudrv_throttle_init(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_handle_t handle = mach_state->acpi_handle; - cpu_acpi_ptc_t *ptc_stat; - - if (cpu_acpi_cache_tstate_data(handle) != 0) { - CTDEBUG(("Failed to cache T-state ACPI data\n")); - cpudrv_throttle_fini(cpudsp); - return (THROTTLE_RET_INCOMPLETE_DATA); - } - - /* - * Check the address space used for transitions - */ - ptc_stat = CPU_ACPI_PTC_STATUS(handle); - switch (ptc_stat->cr_addrspace_id) { - case ACPI_ADR_SPACE_FIXED_HARDWARE: - CTDEBUG(("T-State transitions will use fixed hardware\n")); - break; - case ACPI_ADR_SPACE_SYSTEM_IO: - CTDEBUG(("T-State transitions will use System IO\n")); - break; - default: - cmn_err(CE_WARN, "!_PTC conifgured for unsupported " - "address space type = %d.", ptc_stat->cr_addrspace_id); - return (THROTTLE_RET_INCOMPLETE_DATA); - } - - cpudrv_alloc_tstate_domain(cpudsp); - - return (THROTTLE_RET_SUCCESS); -} - -static void -cpudrv_throttle_fini(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_handle_t handle = mach_state->acpi_handle; - - cpudrv_free_tstate_domains(); - cpu_acpi_free_tstate_data(handle); -}
--- a/usr/src/uts/i86pc/io/cpudrv/pwrnow.c Wed Feb 25 20:53:30 2009 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,270 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/x86_archext.h> -#include <sys/machsystm.h> -#include <sys/x_call.h> -#include <sys/acpi/acpi.h> -#include <sys/acpica.h> -#include <sys/cpudrv_mach.h> -#include <sys/pwrnow.h> -#include <sys/cpu_acpi.h> -#include <sys/cpupm.h> -#include <sys/dtrace.h> -#include <sys/sdt.h> - -static int pwrnow_init(cpudrv_devstate_t *); -static void pwrnow_fini(cpudrv_devstate_t *); -static int pwrnow_power(cpudrv_devstate_t *, uint32_t); - -/* - * Interfaces for modules implementing AMD's PowerNow!. - */ -cpudrv_pstate_ops_t pwrnow_ops = { - "PowerNow! Technology", - pwrnow_init, - pwrnow_fini, - pwrnow_power -}; - -/* - * Error returns - */ -#define PWRNOW_RET_SUCCESS 0x00 -#define PWRNOW_RET_NO_PM 0x01 -#define PWRNOW_RET_UNSUP_STATE 0x02 -#define PWRNOW_RET_TRANS_INCOMPLETE 0x03 - -#define PWRNOW_LATENCY_WAIT 10 - -/* - * MSR registers for changing and reading processor power state. 
- */ -#define PWRNOW_PERF_CTL_MSR 0xC0010062 -#define PWRNOW_PERF_STATUS_MSR 0xC0010063 - -#define AMD_CPUID_PSTATE_HARDWARE (1<<7) -#define AMD_CPUID_TSC_CONSTANT (1<<8) - -/* - * Debugging support - */ -#ifdef DEBUG -volatile int pwrnow_debug = 0; -#define PWRNOW_DEBUG(arglist) if (pwrnow_debug) printf arglist; -#else -#define PWRNOW_DEBUG(arglist) -#endif - -/* - * Write the ctrl register. - */ -static int -write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl) -{ - cpu_acpi_pct_t *pct_ctrl; - uint64_t reg; - int ret = 0; - - pct_ctrl = CPU_ACPI_PCT_CTRL(handle); - - switch (pct_ctrl->cr_addrspace_id) { - case ACPI_ADR_SPACE_FIXED_HARDWARE: - reg = ctrl; - wrmsr(PWRNOW_PERF_CTL_MSR, reg); - ret = 0; - break; - - default: - DTRACE_PROBE1(pwrnow_ctrl_unsupported_type, uint8_t, - pct_ctrl->cr_addrspace_id); - return (-1); - } - - DTRACE_PROBE1(pwrnow_ctrl_write, uint32_t, ctrl); - DTRACE_PROBE1(pwrnow_ctrl_write_err, int, ret); - - return (ret); -} - -/* - * Transition the current processor to the requested state. - */ -static void -pwrnow_pstate_transition(int *ret, cpudrv_devstate_t *cpudsp, - uint32_t req_state) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_handle_t handle = mach_state->acpi_handle; - cpu_acpi_pstate_t *req_pstate; - uint32_t ctrl; - - req_pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle); - req_pstate += req_state; - DTRACE_PROBE1(pwrnow_transition_freq, uint32_t, - CPU_ACPI_FREQ(req_pstate)); - - /* - * Initiate the processor p-state change. - */ - ctrl = CPU_ACPI_PSTATE_CTRL(req_pstate); - if (write_ctrl(handle, ctrl) != 0) { - *ret = PWRNOW_RET_UNSUP_STATE; - return; - } - - mach_state->pstate = req_state; - CPU->cpu_curr_clock = ((uint64_t) - CPU_ACPI_FREQ(req_pstate) * 1000000); - - *ret = PWRNOW_RET_SUCCESS; -} - -static int -pwrnow_power(cpudrv_devstate_t *cpudsp, uint32_t req_state) -{ - cpuset_t cpus; - int ret; - - /* - * If thread is already running on target CPU then just - * make the transition request. 
Otherwise, we'll need to - * make a cross-call. - */ - kpreempt_disable(); - if (cpudsp->cpu_id == CPU->cpu_id) { - pwrnow_pstate_transition(&ret, cpudsp, req_state); - } else { - CPUSET_ONLY(cpus, cpudsp->cpu_id); - xc_call((xc_arg_t)&ret, (xc_arg_t)cpudsp, (xc_arg_t)req_state, - X_CALL_HIPRI, cpus, (xc_func_t)pwrnow_pstate_transition); - } - kpreempt_enable(); - - return (ret); -} - -/* - * Validate that this processor supports PowerNow! and if so, - * get the P-state data from ACPI and cache it. - */ -static int -pwrnow_init(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_handle_t handle = mach_state->acpi_handle; - cpu_acpi_pct_t *pct_stat; - cpu_t *cp; - int domain; - - PWRNOW_DEBUG(("pwrnow_init: instance %d\n", - ddi_get_instance(cpudsp->dip))); - - /* - * Cache the P-state specific ACPI data. - */ - if (cpu_acpi_cache_pstate_data(handle) != 0) { - PWRNOW_DEBUG(("Failed to cache ACPI data\n")); - pwrnow_fini(cpudsp); - return (PWRNOW_RET_NO_PM); - } - - pct_stat = CPU_ACPI_PCT_STATUS(handle); - switch (pct_stat->cr_addrspace_id) { - case ACPI_ADR_SPACE_FIXED_HARDWARE: - PWRNOW_DEBUG(("Transitions will use fixed hardware\n")); - break; - default: - cmn_err(CE_WARN, "!_PCT configured for unsupported " - "addrspace = %d.", pct_stat->cr_addrspace_id); - cmn_err(CE_NOTE, "!CPU power management will not function."); - pwrnow_fini(cpudsp); - return (PWRNOW_RET_NO_PM); - } - - if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) - domain = CPU_ACPI_PSD(handle).sd_domain; - else { - cp = cpu[CPU->cpu_id]; - domain = cpuid_get_chipid(cp); - } - cpupm_add_cpu2dependency(cpudsp->dip, domain); - - PWRNOW_DEBUG(("Instance %d succeeded.\n", - ddi_get_instance(cpudsp->dip))); - return (PWRNOW_RET_SUCCESS); -} - -/* - * Free resources allocated by pwrnow_init(). 
- */ -static void -pwrnow_fini(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_handle_t handle = mach_state->acpi_handle; - - cpupm_free_cpu_dependencies(); - cpu_acpi_free_pstate_data(handle); -} - -boolean_t -pwrnow_supported() -{ - struct cpuid_regs cpu_regs; - - /* Required features */ - if (!(x86_feature & X86_CPUID) || - !(x86_feature & X86_MSR)) { - PWRNOW_DEBUG(("No CPUID or MSR support.")); - return (B_FALSE); - } - - /* - * Get the Advanced Power Management Information. - */ - cpu_regs.cp_eax = 0x80000007; - (void) __cpuid_insn(&cpu_regs); - - /* - * We currently only support CPU power management of - * processors that are P-state TSC invariant - */ - if (!(cpu_regs.cp_edx & AMD_CPUID_TSC_CONSTANT)) { - PWRNOW_DEBUG(("No support for CPUs that are not P-state " - "TSC invariant.\n")); - return (B_FALSE); - } - - /* - * We only support the "Fire and Forget" style of PowerNow! (i.e., - * single MSR write to change speed). - */ - if (!(cpu_regs.cp_edx & AMD_CPUID_PSTATE_HARDWARE)) { - PWRNOW_DEBUG(("Hardware P-State control is not supported.\n")); - return (B_FALSE); - } - return (B_TRUE); -}
--- a/usr/src/uts/i86pc/io/cpudrv/speedstep.c Wed Feb 25 20:53:30 2009 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,287 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/x86_archext.h> -#include <sys/machsystm.h> -#include <sys/x_call.h> -#include <sys/acpi/acpi.h> -#include <sys/acpica.h> -#include <sys/cpudrv_mach.h> -#include <sys/speedstep.h> -#include <sys/cpu_acpi.h> -#include <sys/cpupm.h> -#include <sys/dtrace.h> -#include <sys/sdt.h> - -static int speedstep_init(cpudrv_devstate_t *); -static void speedstep_fini(cpudrv_devstate_t *); -static int speedstep_power(cpudrv_devstate_t *, uint32_t); - -/* - * Interfaces for modules implementing Intel's Enhanced SpeedStep. - */ -cpudrv_pstate_ops_t speedstep_ops = { - "Enhanced SpeedStep Technology", - speedstep_init, - speedstep_fini, - speedstep_power -}; - -/* - * Error returns - */ -#define ESS_RET_SUCCESS 0x00 -#define ESS_RET_NO_PM 0x01 -#define ESS_RET_UNSUP_STATE 0x02 - -/* - * MSR registers for changing and reading processor power state. 
- */ -#define IA32_PERF_STAT_MSR 0x198 -#define IA32_PERF_CTL_MSR 0x199 - -#define IA32_CPUID_TSC_CONSTANT 0xF30 -#define IA32_MISC_ENABLE_MSR 0x1A0 -#define IA32_MISC_ENABLE_EST (1<<16) -#define IA32_MISC_ENABLE_CXE (1<<25) -/* - * Debugging support - */ -#ifdef DEBUG -volatile int ess_debug = 0; -#define ESSDEBUG(arglist) if (ess_debug) printf arglist; -#else -#define ESSDEBUG(arglist) -#endif - -/* - * Write the ctrl register. How it is written, depends upon the _PCT - * APCI object value. - */ -static int -write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl) -{ - cpu_acpi_pct_t *pct_ctrl; - uint64_t reg; - int ret = 0; - - pct_ctrl = CPU_ACPI_PCT_CTRL(handle); - - switch (pct_ctrl->cr_addrspace_id) { - case ACPI_ADR_SPACE_FIXED_HARDWARE: - /* - * Read current power state because reserved bits must be - * preserved, compose new value, and write it. - */ - reg = rdmsr(IA32_PERF_CTL_MSR); - reg &= ~((uint64_t)0xFFFF); - reg |= ctrl; - wrmsr(IA32_PERF_CTL_MSR, reg); - ret = 0; - break; - - case ACPI_ADR_SPACE_SYSTEM_IO: - ret = cpu_acpi_write_port(pct_ctrl->cr_address, ctrl, - pct_ctrl->cr_width); - break; - - default: - DTRACE_PROBE1(ess_ctrl_unsupported_type, uint8_t, - pct_ctrl->cr_addrspace_id); - return (-1); - } - - DTRACE_PROBE1(ess_ctrl_write, uint32_t, ctrl); - DTRACE_PROBE1(ess_ctrl_write_err, int, ret); - - return (ret); -} - -/* - * Transition the current processor to the requested state. - */ -void -speedstep_pstate_transition(int *ret, cpudrv_devstate_t *cpudsp, - uint32_t req_state) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_handle_t handle = mach_state->acpi_handle; - cpu_acpi_pstate_t *req_pstate; - uint32_t ctrl; - - req_pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle); - req_pstate += req_state; - DTRACE_PROBE1(ess_transition, uint32_t, CPU_ACPI_FREQ(req_pstate)); - - /* - * Initiate the processor p-state change. 
- */ - ctrl = CPU_ACPI_PSTATE_CTRL(req_pstate); - if (write_ctrl(handle, ctrl) != 0) { - *ret = ESS_RET_UNSUP_STATE; - return; - } - - mach_state->pstate = req_state; - CPU->cpu_curr_clock = - (((uint64_t)CPU_ACPI_FREQ(req_pstate) * 1000000)); - *ret = ESS_RET_SUCCESS; -} - -static int -speedstep_power(cpudrv_devstate_t *cpudsp, uint32_t req_state) -{ - cpuset_t cpus; - int ret; - - /* - * If thread is already running on target CPU then just - * make the transition request. Otherwise, we'll need to - * make a cross-call. - */ - kpreempt_disable(); - if (cpudsp->cpu_id == CPU->cpu_id) { - speedstep_pstate_transition(&ret, cpudsp, req_state); - } else { - CPUSET_ONLY(cpus, cpudsp->cpu_id); - xc_call((xc_arg_t)&ret, (xc_arg_t)cpudsp, (xc_arg_t)req_state, - X_CALL_HIPRI, cpus, (xc_func_t)speedstep_pstate_transition); - } - kpreempt_enable(); - - return (ret); -} - -/* - * Validate that this processor supports Speedstep and if so, - * get the P-state data from ACPI and cache it. - */ -static int -speedstep_init(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_handle_t handle = mach_state->acpi_handle; - cpu_acpi_pct_t *pct_stat; - cpu_t *cp; - int dependency; - - ESSDEBUG(("speedstep_init: instance %d\n", - ddi_get_instance(cpudsp->dip))); - - /* - * Cache the P-state specific ACPI data. 
- */ - if (cpu_acpi_cache_pstate_data(handle) != 0) { - ESSDEBUG(("Failed to cache ACPI data\n")); - speedstep_fini(cpudsp); - return (ESS_RET_NO_PM); - } - - pct_stat = CPU_ACPI_PCT_STATUS(handle); - switch (pct_stat->cr_addrspace_id) { - case ACPI_ADR_SPACE_FIXED_HARDWARE: - ESSDEBUG(("Transitions will use fixed hardware\n")); - break; - case ACPI_ADR_SPACE_SYSTEM_IO: - ESSDEBUG(("Transitions will use system IO\n")); - break; - default: - cmn_err(CE_WARN, "!_PCT conifgured for unsupported " - "addrspace = %d.", pct_stat->cr_addrspace_id); - cmn_err(CE_NOTE, "!CPU power management will not function."); - speedstep_fini(cpudsp); - return (ESS_RET_NO_PM); - } - - if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) - dependency = CPU_ACPI_PSD(handle).sd_domain; - else { - mutex_enter(&cpu_lock); - cp = cpu[CPU->cpu_id]; - dependency = cpuid_get_chipid(cp); - mutex_exit(&cpu_lock); - } - cpupm_add_cpu2dependency(cpudsp->dip, dependency); - - ESSDEBUG(("Instance %d succeeded.\n", ddi_get_instance(cpudsp->dip))); - return (ESS_RET_SUCCESS); -} - -/* - * Free resources allocated by speedstep_init(). - */ -static void -speedstep_fini(cpudrv_devstate_t *cpudsp) -{ - cpudrv_mach_state_t *mach_state = cpudsp->mach_state; - cpu_acpi_handle_t handle = mach_state->acpi_handle; - - cpupm_free_cpu_dependencies(); - cpu_acpi_free_pstate_data(handle); -} - -boolean_t -speedstep_supported(uint_t family, uint_t model) -{ - struct cpuid_regs cpu_regs; - uint64_t reg; - - /* Required features */ - if (!(x86_feature & X86_CPUID) || - !(x86_feature & X86_MSR)) { - return (B_FALSE); - } - - /* - * We only support family/model combinations which - * are P-state TSC invariant. - */ - if (!((family == 0xf && model >= 0x3) || - (family == 0x6 && model >= 0xe))) { - return (B_FALSE); - } - - /* - * Enhanced SpeedStep supported? 
- */ - cpu_regs.cp_eax = 0x1; - (void) __cpuid_insn(&cpu_regs); - if (!(cpu_regs.cp_ecx & CPUID_INTC_ECX_EST)) { - return (B_FALSE); - } - - /* - * If Enhanced SpeedStep has not been enabled on the system, - * then we probably should not override the BIOS setting. - */ - reg = rdmsr(IA32_MISC_ENABLE_MSR); - if (! (reg & IA32_MISC_ENABLE_EST)) { - cmn_err(CE_NOTE, "!Enhanced Intel SpeedStep not enabled."); - cmn_err(CE_NOTE, "!CPU power management will not function."); - return (B_FALSE); - } - - return (B_TRUE); -}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/io/cpudrv_mach.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,287 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * CPU power management driver support for i86pc. + */ + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cpupm.h> +#include <sys/cpudrv_mach.h> +#include <sys/machsystm.h> +#include <sys/cpu_pm.h> +#include <sys/cpuvar.h> +#include <sys/sdt.h> +#include <sys/cpu_idle.h> + +/* + * Note that our driver numbers the power levels from lowest to + * highest starting at 1 (i.e., the lowest power level is 1 and + * the highest power level is cpupm->num_spd). The x86 modules get + * their power levels from ACPI which numbers power levels from + * highest to lowest starting at 0 (i.e., the lowest power level + * is (cpupm->num_spd - 1) and the highest power level is 0). So to + * map one of our driver power levels to one understood by ACPI we + * simply subtract our driver power level from cpupm->num_spd. 
Likewise, + * to map an ACPI power level to the proper driver power level, we + * subtract the ACPI power level from cpupm->num_spd. + */ +#define PM_2_PLAT_LEVEL(cpupm, pm_level) (cpupm->num_spd - pm_level) +#define PLAT_2_PM_LEVEL(cpupm, plat_level) (cpupm->num_spd - plat_level) + +/* + * Change CPU speed using interface provided by module. + */ +int +cpudrv_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd) +{ + cpu_t *cp = cpudsp->cp; + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + cpudrv_pm_t *cpupm; + cpuset_t set; + uint32_t plat_level; + + if (!(mach_state->ms_caps & CPUPM_P_STATES)) + return (DDI_FAILURE); + ASSERT(mach_state->ms_pstate.cma_ops != NULL); + cpupm = &(cpudsp->cpudrv_pm); + plat_level = PM_2_PLAT_LEVEL(cpupm, new_spd->pm_level); + CPUSET_ONLY(set, cp->cpu_id); + mach_state->ms_pstate.cma_ops->cpus_change(set, plat_level); + + return (DDI_SUCCESS); +} + +/* + * Determine the cpu_id for the CPU device. + */ +boolean_t +cpudrv_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id) +{ + return ((*cpu_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "reg", -1)) != -1); + +} + +boolean_t +cpudrv_is_enabled(cpudrv_devstate_t *cpudsp) +{ + cpupm_mach_state_t *mach_state; + + if (!cpupm_is_enabled(CPUPM_P_STATES) || !cpudrv_enabled) + return (B_FALSE); + + /* + * Only check the instance specific setting it exists. + */ + if (cpudsp != NULL && cpudsp->cp != NULL && + cpudsp->cp->cpu_m.mcpu_pm_mach_state != NULL) { + mach_state = + (cpupm_mach_state_t *)cpudsp->cp->cpu_m.mcpu_pm_mach_state; + return (mach_state->ms_caps & CPUPM_P_STATES); + } + + return (B_TRUE); +} + +/* + * Is the current thread the thread that is handling the + * PPC change notification? 
+ */ +boolean_t +cpudrv_is_governor_thread(cpudrv_pm_t *cpupm) +{ + return (curthread == cpupm->pm_governor_thread); +} + +/* + * This routine changes the top speed to which the CPUs can transition by: + * + * - Resetting the up_spd for all speeds lower than the new top speed + * to point to the new top speed. + * - Updating the framework with a new "normal" (maximum power) for this + * device. + */ +void +cpudrv_set_topspeed(void *ctx, int plat_level) +{ + cpudrv_devstate_t *cpudsp; + cpudrv_pm_t *cpupm; + cpudrv_pm_spd_t *spd; + cpudrv_pm_spd_t *top_spd; + dev_info_t *dip; + int pm_level; + int instance; + int i; + + dip = ctx; + instance = ddi_get_instance(dip); + cpudsp = ddi_get_soft_state(cpudrv_state, instance); + ASSERT(cpudsp != NULL); + + mutex_enter(&cpudsp->lock); + cpupm = &(cpudsp->cpudrv_pm); + pm_level = PLAT_2_PM_LEVEL(cpupm, plat_level); + for (i = 0, spd = cpupm->head_spd; spd; i++, spd = spd->down_spd) { + /* + * Don't mess with speeds that are higher than the new + * top speed. They should be out of range anyway. + */ + if (spd->pm_level > pm_level) + continue; + /* + * This is the new top speed. + */ + if (spd->pm_level == pm_level) + top_spd = spd; + + spd->up_spd = top_spd; + } + cpupm->top_spd = top_spd; + + cpupm->pm_governor_thread = curthread; + + mutex_exit(&cpudsp->lock); + + (void) pm_update_maxpower(dip, 0, top_spd->pm_level); +} + +/* + * This routine reads the ACPI _PPC object. It's accessed as a callback + * by the ppm driver whenever a _PPC change notification is received. + */ +int +cpudrv_get_topspeed(void *ctx) +{ + cpu_t *cp; + cpudrv_devstate_t *cpudsp; + dev_info_t *dip; + int instance; + int plat_level; + + dip = ctx; + instance = ddi_get_instance(dip); + cpudsp = ddi_get_soft_state(cpudrv_state, instance); + ASSERT(cpudsp != NULL); + cp = cpudsp->cp; + plat_level = cpupm_get_top_speed(cp); + + return (plat_level); +} + + +/* + * This notification handler is called whenever the ACPI _PPC + * object changes. 
The _PPC is a sort of governor on power levels. + * It sets an upper threshold on which, _PSS defined, power levels + * are usuable. The _PPC value is dynamic and may change as properties + * (i.e., thermal or AC source) of the system change. + */ +/* ARGSUSED */ +static void +cpudrv_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx) +{ + extern pm_cpupm_t cpupm; + + /* + * We only handle _PPC change notifications. + */ + if (val == CPUPM_PPC_CHANGE_NOTIFICATION && !PM_EVENT_CPUPM) + cpudrv_redefine_topspeed(ctx); +} + +void +cpudrv_install_notify_handler(cpudrv_devstate_t *cpudsp) +{ + cpu_t *cp = cpudsp->cp; + cpupm_add_notify_handler(cp, cpudrv_notify_handler, + cpudsp->dip); +} + +void +cpudrv_redefine_topspeed(void *ctx) +{ + /* + * This should never happen, unless ppm does not get loaded. + */ + if (cpupm_redefine_topspeed == NULL) { + cmn_err(CE_WARN, "cpudrv_redefine_topspeed: " + "cpupm_redefine_topspeed has not been initialized - " + "ignoring notification"); + return; + } + + /* + * ppm callback needs to handle redefinition for all CPUs in + * the domain. 
+ */ + (*cpupm_redefine_topspeed)(ctx); +} + +boolean_t +cpudrv_mach_init(cpudrv_devstate_t *cpudsp) +{ + cpupm_mach_state_t *mach_state; + + mutex_enter(&cpu_lock); + cpudsp->cp = cpu_get(cpudsp->cpu_id); + mutex_exit(&cpu_lock); + if (cpudsp->cp == NULL) { + cmn_err(CE_WARN, "cpudrv_mach_pm_init: instance %d: " + "can't get cpu_t", ddi_get_instance(cpudsp->dip)); + return (B_FALSE); + } + + mach_state = (cpupm_mach_state_t *) + (cpudsp->cp->cpu_m.mcpu_pm_mach_state); + mach_state->ms_dip = cpudsp->dip; + return (B_TRUE); +} + +uint_t +cpudrv_get_speeds(cpudrv_devstate_t *cpudsp, int **speeds) +{ + return (cpupm_get_speeds(cpudsp->cp, speeds)); +} + +void +cpudrv_free_speeds(int *speeds, uint_t nspeeds) +{ + cpupm_free_speeds(speeds, nspeeds); +} + +boolean_t +cpudrv_power_ready(void) +{ + return (cpupm_power_ready()); +} + +/* ARGSUSED */ +void +cpudrv_set_supp_freqs(cpudrv_devstate_t *cpudsp) +{ +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/io/hpet_acpi.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,1388 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/hpet_acpi.h> +#include <sys/hpet.h> +#include <sys/bitmap.h> +#include <sys/inttypes.h> +#include <sys/time.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/apic.h> +#include <sys/callb.h> +#include <sys/clock.h> +#include <sys/archsystm.h> +#include <sys/cpupart.h> + +/* + * hpet_state_lock is used to synchronize disabling/enabling deep c-states + * and to synchronize suspend/resume. 
+ */ +static kmutex_t hpet_state_lock; +static struct hpet_state { + boolean_t proxy_installed; /* CBE proxy interrupt setup */ + boolean_t cpr; /* currently in CPR */ + boolean_t cpu_deep_idle; /* user enable/disable */ + boolean_t uni_cstate; /* disable if only one cstate */ +} hpet_state = { B_FALSE, B_FALSE, B_TRUE, B_TRUE}; + +uint64_t hpet_spin_check = HPET_SPIN_CHECK; +uint64_t hpet_spin_timeout = HPET_SPIN_TIMEOUT; +uint64_t hpet_idle_spin_timeout = HPET_SPIN_TIMEOUT; +uint64_t hpet_isr_spin_timeout = HPET_SPIN_TIMEOUT; + +static kmutex_t hpet_proxy_lock; /* lock for lAPIC proxy data */ +/* + * hpet_proxy_users is a per-cpu array. + */ +static hpet_proxy_t *hpet_proxy_users; /* one per CPU */ + + +ACPI_TABLE_HPET *hpet_table; /* ACPI HPET table */ +hpet_info_t hpet_info; /* Human readable Information */ + +/* + * Provide HPET access from unix.so. + * Set up pointers to access symbols in pcplusmp. + */ +static void +hpet_establish_hooks(void) +{ + hpet.install_proxy = &hpet_install_proxy; + hpet.callback = &hpet_callback; + hpet.use_hpet_timer = &hpet_use_hpet_timer; + hpet.use_lapic_timer = &hpet_use_lapic_timer; +} + +/* + * Get the ACPI "HPET" table. + * acpi_probe() calls this function from mp_startup before drivers are loaded. + * acpi_probe() verified the system is using ACPI before calling this. + * + * There may be more than one ACPI HPET table (Itanium only?). + * Intel's HPET spec defines each timer block to have up to 32 counters and + * be 1024 bytes long. There can be more than one timer block of 32 counters. + * Each timer block would have an additional ACPI HPET table. + * Typical x86 systems today only have 1 HPET with 3 counters. + * On x86 we only consume HPET table "1" for now. 
+ */ +int +hpet_acpi_init(int *hpet_vect, iflag_t *hpet_flags) +{ + extern hrtime_t tsc_read(void); + extern int idle_cpu_no_deep_c; + extern int cpuid_deep_cstates_supported(void); + void *la; + uint64_t ret; + uint_t num_timers; + uint_t ti; + + (void) memset(&hpet_info, 0, sizeof (hpet_info)); + hpet.supported = HPET_NO_SUPPORT; + + if (idle_cpu_no_deep_c) + return (DDI_FAILURE); + + if (!cpuid_deep_cstates_supported()) + return (DDI_FAILURE); + + hpet_establish_hooks(); + + /* + * Get HPET ACPI table 1. + */ + if (ACPI_FAILURE(AcpiGetTable(ACPI_SIG_HPET, HPET_TABLE_1, + (ACPI_TABLE_HEADER **)&hpet_table))) { + cmn_err(CE_NOTE, "!hpet_acpi: unable to get ACPI HPET table"); + return (DDI_FAILURE); + } + + if (hpet_validate_table(hpet_table) != AE_OK) { + cmn_err(CE_NOTE, "!hpet_acpi: invalid HPET table"); + return (DDI_FAILURE); + } + + la = hpet_memory_map(hpet_table); + if (la == NULL) { + cmn_err(CE_NOTE, "!hpet_acpi: memory map HPET failed"); + return (DDI_FAILURE); + } + hpet_info.logical_address = la; + + ret = hpet_read_gen_cap(&hpet_info); + hpet_info.gen_cap.counter_clk_period = HPET_GCAP_CNTR_CLK_PERIOD(ret); + hpet_info.gen_cap.vendor_id = HPET_GCAP_VENDOR_ID(ret); + hpet_info.gen_cap.leg_route_cap = HPET_GCAP_LEG_ROUTE_CAP(ret); + hpet_info.gen_cap.count_size_cap = HPET_GCAP_CNT_SIZE_CAP(ret); + /* + * Hardware contains the last timer's number. + * Add 1 to get the number of timers. 
+ */ + hpet_info.gen_cap.num_tim_cap = HPET_GCAP_NUM_TIM_CAP(ret) + 1; + hpet_info.gen_cap.rev_id = HPET_GCAP_REV_ID(ret); + + if (hpet_info.gen_cap.counter_clk_period > HPET_MAX_CLK_PERIOD) { + cmn_err(CE_NOTE, "!hpet_acpi: COUNTER_CLK_PERIOD 0x%lx > 0x%lx", + (long)hpet_info.gen_cap.counter_clk_period, + (long)HPET_MAX_CLK_PERIOD); + return (DDI_FAILURE); + } + + num_timers = (uint_t)hpet_info.gen_cap.num_tim_cap; + if ((num_timers < 3) || (num_timers > 32)) { + cmn_err(CE_NOTE, "!hpet_acpi: invalid number of HPET timers " + "%lx", (long)num_timers); + return (DDI_FAILURE); + } + hpet_info.timer_n_config = (hpet_TN_conf_cap_t *)kmem_zalloc( + num_timers * sizeof (uint64_t), KM_SLEEP); + + ret = hpet_read_gen_config(&hpet_info); + hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret); + hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret); + + /* + * Solaris does not use the HPET Legacy Replacement Route capabilities. + * This feature has been off by default on test systems. + * The HPET spec does not specify if Legacy Replacement Route is + * on or off by default, so we explicitely set it off here. + * It should not matter which mode the HPET is in since we use + * the first available non-legacy replacement timer: timer 2. + */ + (void) hpet_set_leg_rt_cnf(&hpet_info, 0); + + ret = hpet_read_gen_config(&hpet_info); + hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret); + hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret); + + hpet_info.gen_intrpt_stat = hpet_read_gen_intrpt_stat(&hpet_info); + hpet_info.main_counter_value = hpet_read_main_counter_value(&hpet_info); + + for (ti = 0; ti < num_timers; ++ti) { + ret = hpet_read_timer_N_config(&hpet_info, ti); + /* + * Make sure no timers are enabled (think fast reboot or + * virtual hardware). 
+ */ + if (ret & HPET_TIMER_N_INT_ENB_CNF_BIT) { + hpet_disable_timer(&hpet_info, ti); + ret &= ~HPET_TIMER_N_INT_ENB_CNF_BIT; + } + + hpet_info.timer_n_config[ti] = hpet_convert_timer_N_config(ret); + } + + /* + * Be aware the Main Counter may need to be initialized in the future + * if it is used for more than just Deep C-State support. + * The HPET's Main Counter does not need to be initialize to a specific + * value before starting it for use to wake up CPUs from Deep C-States. + */ + if (hpet_start_main_counter(&hpet_info) != AE_OK) { + cmn_err(CE_NOTE, "!hpet_acpi: hpet_start_main_counter failed"); + return (DDI_FAILURE); + } + + hpet_info.period = hpet_info.gen_cap.counter_clk_period; + /* + * Read main counter twice to record HPET latency for debugging. + */ + hpet_info.tsc[0] = tsc_read(); + hpet_info.hpet_main_counter_reads[0] = + hpet_read_main_counter_value(&hpet_info); + hpet_info.tsc[1] = tsc_read(); + hpet_info.hpet_main_counter_reads[1] = + hpet_read_main_counter_value(&hpet_info); + hpet_info.tsc[2] = tsc_read(); + + ret = hpet_read_gen_config(&hpet_info); + hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret); + hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret); + + /* + * HPET main counter reads are supported now. + */ + hpet.supported = HPET_TIMER_SUPPORT; + + return (hpet_init_proxy(hpet_vect, hpet_flags)); +} + +void +hpet_acpi_fini(void) +{ + if (hpet.supported == HPET_NO_SUPPORT) + return; + if (hpet.supported >= HPET_TIMER_SUPPORT) + hpet_stop_main_counter(&hpet_info); + if (hpet.supported > HPET_TIMER_SUPPORT) + hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer); +} + +/* + * Do initial setup to use a HPET timer as a proxy for Deep C-state stalled + * LAPIC Timers. Get a free HPET timer that supports I/O APIC routed interrupt. + * Setup data to handle the timer's ISR, and add the timer's interrupt. + * + * The ddi cannot be use to allocate the HPET timer's interrupt. 
+ * ioapic_init_intr() in mp_platform_common() later sets up the I/O APIC + * to handle the HPET timer's interrupt. + * + * Note: FSB (MSI) interrupts are not currently supported by Intel HPETs as of + * ICH9. The HPET spec allows for MSI. In the future MSI may be prefered. + */ +static int +hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags) +{ + if (hpet_get_IOAPIC_intr_capable_timer(&hpet_info) == -1) { + cmn_err(CE_WARN, "!hpet_acpi: get ioapic intr failed."); + return (DDI_FAILURE); + } + + hpet_init_proxy_data(); + + if (hpet_install_interrupt_handler(&hpet_isr, + hpet_info.cstate_timer.intr) != AE_OK) { + cmn_err(CE_WARN, "!hpet_acpi: install interrupt failed."); + return (DDI_FAILURE); + } + *hpet_vect = hpet_info.cstate_timer.intr; + hpet_flags->intr_el = INTR_EL_LEVEL; + hpet_flags->intr_po = INTR_PO_ACTIVE_HIGH; + hpet_flags->bustype = BUS_PCI; /* we *do* conform to PCI */ + + /* + * Avoid a possibly stuck interrupt by programing the HPET's timer here + * before the I/O APIC is programmed to handle this interrupt. + */ + hpet_timer_set_up(&hpet_info, hpet_info.cstate_timer.timer, + hpet_info.cstate_timer.intr); + + /* + * All HPET functionality is supported. + */ + hpet.supported = HPET_FULL_SUPPORT; + return (DDI_SUCCESS); +} + +/* + * Called by kernel if it can support Deep C-States. + */ +static boolean_t +hpet_install_proxy(void) +{ + if (hpet_state.proxy_installed == B_TRUE) + return (B_TRUE); + + if (hpet.supported != HPET_FULL_SUPPORT) + return (B_FALSE); + + hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer); + hpet_state.proxy_installed = B_TRUE; + + return (B_TRUE); +} + +/* + * Remove the interrupt that was added with add_avintr() in + * hpet_install_interrupt_handler(). 
+ */ +static void +hpet_uninstall_interrupt_handler(void) +{ + rem_avintr(NULL, CBE_HIGH_PIL, (avfunc)&hpet_isr, + hpet_info.cstate_timer.intr); +} + +static int +hpet_validate_table(ACPI_TABLE_HPET *hpet_table) +{ + ACPI_TABLE_HEADER *table_header = (ACPI_TABLE_HEADER *)hpet_table; + + if (table_header->Length != sizeof (ACPI_TABLE_HPET)) { + cmn_err(CE_WARN, "!hpet_validate_table: Length %lx != sizeof (" + "ACPI_TABLE_HPET) %lx.", + (unsigned long)((ACPI_TABLE_HEADER *)hpet_table)->Length, + (unsigned long)sizeof (ACPI_TABLE_HPET)); + return (AE_ERROR); + } + + if (!ACPI_COMPARE_NAME(table_header->Signature, ACPI_SIG_HPET)) { + cmn_err(CE_WARN, "!hpet_validate_table: Invalid HPET table " + "signature"); + return (AE_ERROR); + } + + if (!hpet_checksum_table((unsigned char *)hpet_table, + (unsigned int)table_header->Length)) { + cmn_err(CE_WARN, "!hpet_validate_table: Invalid HPET checksum"); + return (AE_ERROR); + } + + /* + * Sequence should be table number - 1. We are using table 1. + */ + if (hpet_table->Sequence != HPET_TABLE_1 - 1) { + cmn_err(CE_WARN, "!hpet_validate_table: Invalid Sequence %lx", + (long)hpet_table->Sequence); + return (AE_ERROR); + } + + return (AE_OK); +} + +static boolean_t +hpet_checksum_table(unsigned char *table, unsigned int length) +{ + unsigned char checksum = 0; + int i; + + for (i = 0; i < length; ++i, ++table) + checksum += *table; + + return (checksum == 0); +} + +static void * +hpet_memory_map(ACPI_TABLE_HPET *hpet_table) +{ + return (AcpiOsMapMemory(hpet_table->Address.Address, HPET_SIZE)); +} + +static int +hpet_start_main_counter(hpet_info_t *hip) +{ + uint64_t *gcr_ptr; + uint64_t gcr; + + gcr_ptr = (uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address); + gcr = *gcr_ptr; + + gcr |= HPET_GCFR_ENABLE_CNF; + *gcr_ptr = gcr; + gcr = *gcr_ptr; + + return (gcr & HPET_GCFR_ENABLE_CNF ? 
AE_OK : ~AE_OK); +} + +static int +hpet_stop_main_counter(hpet_info_t *hip) +{ + uint64_t *gcr_ptr; + uint64_t gcr; + + gcr_ptr = (uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address); + gcr = *gcr_ptr; + + gcr &= ~HPET_GCFR_ENABLE_CNF; + *gcr_ptr = gcr; + gcr = *gcr_ptr; + + return (gcr & HPET_GCFR_ENABLE_CNF ? ~AE_OK : AE_OK); +} + +/* + * Set the Legacy Replacement Route bit. + * This should be called before setting up timers. + * The HPET specification is silent regarding setting this after timers are + * programmed. + */ +static uint64_t +hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value) +{ + uint64_t gen_conf = hpet_read_gen_config(hip); + + switch (new_value) { + case 0: + gen_conf &= ~HPET_GCFR_LEG_RT_CNF; + break; + + case HPET_GCFR_LEG_RT_CNF: + gen_conf |= HPET_GCFR_LEG_RT_CNF; + break; + + default: + ASSERT(new_value == 0 || new_value == HPET_GCFR_LEG_RT_CNF); + break; + } + hpet_write_gen_config(hip, gen_conf); + return (gen_conf); +} + +static uint64_t +hpet_read_gen_cap(hpet_info_t *hip) +{ + return (*(uint64_t *)HPET_GEN_CAP_ADDRESS(hip->logical_address)); +} + +static uint64_t +hpet_read_gen_config(hpet_info_t *hip) +{ + return (*(uint64_t *) + HPET_GEN_CONFIG_ADDRESS(hip->logical_address)); +} + +static uint64_t +hpet_read_gen_intrpt_stat(hpet_info_t *hip) +{ + hip->gen_intrpt_stat = *(uint64_t *)HPET_GEN_INTR_STAT_ADDRESS( + hip->logical_address); + return (hip->gen_intrpt_stat); +} + +static uint64_t +hpet_read_timer_N_config(hpet_info_t *hip, uint_t n) +{ + uint64_t conf = *(uint64_t *)HPET_TIMER_N_CONF_ADDRESS( + hip->logical_address, n); + hip->timer_n_config[n] = hpet_convert_timer_N_config(conf); + return (conf); +} + +static hpet_TN_conf_cap_t +hpet_convert_timer_N_config(uint64_t conf) +{ + hpet_TN_conf_cap_t cc = { 0 }; + + cc.int_route_cap = HPET_TIMER_N_INT_ROUTE_CAP(conf); + cc.fsb_int_del_cap = HPET_TIMER_N_FSB_INT_DEL_CAP(conf); + cc.fsb_int_en_cnf = HPET_TIMER_N_FSB_EN_CNF(conf); + cc.int_route_cnf = 
HPET_TIMER_N_INT_ROUTE_CNF(conf); + cc.mode32_cnf = HPET_TIMER_N_MODE32_CNF(conf); + cc.val_set_cnf = HPET_TIMER_N_VAL_SET_CNF(conf); + cc.size_cap = HPET_TIMER_N_SIZE_CAP(conf); + cc.per_int_cap = HPET_TIMER_N_PER_INT_CAP(conf); + cc.type_cnf = HPET_TIMER_N_TYPE_CNF(conf); + cc.int_enb_cnf = HPET_TIMER_N_INT_ENB_CNF(conf); + cc.int_type_cnf = HPET_TIMER_N_INT_TYPE_CNF(conf); + + return (cc); +} + +static uint64_t +hpet_read_timer_N_comp(hpet_info_t *hip, uint_t n) +{ + if (hip->timer_n_config[n].size_cap == 1) + return (*(uint64_t *) + HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n)); + else + return (*(uint32_t *) + HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n)); +} + +static uint64_t +hpet_read_main_counter_value(hpet_info_t *hip) +{ + uint64_t value; + uint32_t *counter; + uint32_t high1, high2, low; + + counter = (uint32_t *)HPET_MAIN_COUNTER_ADDRESS(hip->logical_address); + + /* + * 32-bit main counters + */ + if (hip->gen_cap.count_size_cap == 0) { + value = (uint64_t)*counter; + hip->main_counter_value = value; + return (value); + } + + /* + * HPET spec claims a 64-bit read can be split into two 32-bit reads + * by the hardware connection to the HPET. 
+ */ + high2 = counter[1]; + do { + high1 = high2; + low = counter[0]; + high2 = counter[1]; + } while (high2 != high1); + + value = ((uint64_t)high1 << 32) | low; + hip->main_counter_value = value; + return (value); +} + +static void +hpet_write_gen_cap(hpet_info_t *hip, uint64_t l) +{ + *(uint64_t *)HPET_GEN_CAP_ADDRESS(hip->logical_address) = l; +} + +static void +hpet_write_gen_config(hpet_info_t *hip, uint64_t l) +{ + *(uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address) = l; +} + +static void +hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l) +{ + *(uint64_t *)HPET_GEN_INTR_STAT_ADDRESS(hip->logical_address) = l; +} + +static void +hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t l) +{ + if (hip->timer_n_config[n].size_cap == 1) + *(uint64_t *)HPET_TIMER_N_CONF_ADDRESS( + hip->logical_address, n) = l; + else + *(uint32_t *)HPET_TIMER_N_CONF_ADDRESS( + hip->logical_address, n) = (uint32_t)(0xFFFFFFFF & l); +} + +static void +hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l) +{ + *(uint64_t *)HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n) = l; +} + +static void +hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n) +{ + uint64_t l; + + l = hpet_read_timer_N_config(hip, timer_n); + l &= ~HPET_TIMER_N_INT_ENB_CNF_BIT; + hpet_write_timer_N_config(hip, timer_n, l); +} + +static void +hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n) +{ + uint64_t l; + + l = hpet_read_timer_N_config(hip, timer_n); + l |= HPET_TIMER_N_INT_ENB_CNF_BIT; + hpet_write_timer_N_config(hip, timer_n, l); +} + +static void +hpet_write_main_counter_value(hpet_info_t *hip, uint64_t l) +{ + uint32_t *address; + + /* + * HPET spec 1.0a states main counter register should be halted before + * it is written to. 
+ */ + ASSERT(!(hpet_read_gen_config(hip) & HPET_GCFR_ENABLE_CNF)); + + if (hip->gen_cap.count_size_cap == 1) { + *(uint64_t *)HPET_MAIN_COUNTER_ADDRESS(hip->logical_address) + = l; + } else { + address = (uint32_t *)HPET_MAIN_COUNTER_ADDRESS( + hip->logical_address); + + address[0] = (uint32_t)(l & 0xFFFFFFFF); + } +} + +/* + * Add the interrupt handler for I/O APIC interrupt number (interrupt line). + * + * The I/O APIC line (vector) is programmed in ioapic_init_intr() called + * from apic_picinit() psm_ops apic_ops entry point after we return from + * apic_init() psm_ops entry point. + */ +static uint32_t +hpet_install_interrupt_handler(uint_t (*func)(char *), int vector) +{ + uint32_t retval; + + retval = add_avintr(NULL, CBE_HIGH_PIL, (avfunc)func, "HPET Timer", + vector, NULL, NULL, NULL, NULL); + if (retval == 0) { + cmn_err(CE_WARN, "!hpet_acpi: add_avintr() failed"); + return (AE_BAD_PARAMETER); + } + return (AE_OK); +} + +/* + * The HPET timers specify which I/O APIC interrupts they can be routed to. + * Find the first available non-legacy-replacement timer and its I/O APIC irq. + * Supported I/O APIC IRQs are specified in the int_route_cap bitmap in each + * timer's timer_n_config register. + */ +static int +hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip) +{ + int timer; + int intr; + + for (timer = HPET_FIRST_NON_LEGACY_TIMER; + timer < hip->gen_cap.num_tim_cap; ++timer) { + + if (!hpet_timer_available(hip->allocated_timers, timer)) + continue; + + intr = lowbit(hip->timer_n_config[timer].int_route_cap) - 1; + if (intr >= 0) { + hpet_timer_alloc(&hip->allocated_timers, timer); + hip->cstate_timer.timer = timer; + hip->cstate_timer.intr = intr; + return (timer); + } + } + + return (-1); +} + +/* + * Mark this timer as used. + */ +static void +hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n) +{ + *allocated_timers |= 1 << n; +} + +/* + * Check if this timer is available. + * No mutual exclusion because only one thread uses this. 
+ */ +static int +hpet_timer_available(uint32_t allocated_timers, uint32_t n) +{ + return ((allocated_timers & (1 << n)) == 0); +} + +/* + * Setup timer N to route its interrupt to I/O APIC. + */ +static void +hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n, uint32_t interrupt) +{ + uint64_t conf; + + conf = hpet_read_timer_N_config(hip, timer_n); + + /* + * Caller is required to verify this interrupt route is supported. + */ + ASSERT(HPET_TIMER_N_INT_ROUTE_CAP(conf) & (1 << interrupt)); + + conf &= ~HPET_TIMER_N_FSB_EN_CNF_BIT; /* use IOAPIC */ + conf |= HPET_TIMER_N_INT_ROUTE_SHIFT(interrupt); + conf &= ~HPET_TIMER_N_TYPE_CNF_BIT; /* non periodic */ + conf &= ~HPET_TIMER_N_INT_ENB_CNF_BIT; /* disabled */ + conf |= HPET_TIMER_N_INT_TYPE_CNF_BIT; /* Level Triggered */ + + hpet_write_timer_N_config(hip, timer_n, conf); +} + +/* + * The HPET's Main Counter is not stopped before programming an HPET timer. + * This will allow the HPET to be used as a time source. + * The programmed timer interrupt may occur before this function returns. + * Callers must block interrupts before calling this function if they must + * guarantee the interrupt is handled after this function returns. + * + * Return 0 if main counter is less than timer after enabling timer. + * The interrupt was programmed, but it may fire before this returns. + * Return !0 if main counter is greater than timer after enabling timer. + * In other words: the timer will not fire, and we do not know if it did fire. + * + * delta is in HPET ticks. + * + * Writing a 64-bit value to a 32-bit register will "wrap around". + * A 32-bit HPET timer will wrap around in a little over 5 minutes. 
 + */
+int
+hpet_timer_program(hpet_info_t *hip, uint32_t timer, uint64_t delta)
+{
+ uint64_t time, program;
+
+ program = hpet_read_main_counter_value(hip);
+ program += delta;
+ hpet_write_timer_N_comp(hip, timer, program);
+
+ time = hpet_read_main_counter_value(hip);
+ if (time < program)
+ return (AE_OK);
+
+ return (AE_TIME);
+}
+
+/*
+ * CPR and power policy-change callback entry point.
+ */
+boolean_t
+hpet_callback(int code)
+{
+ switch (code) {
+ case PM_DEFAULT_CPU_DEEP_IDLE:
+ /*FALLTHROUGH*/
+ case PM_ENABLE_CPU_DEEP_IDLE:
+ /*FALLTHROUGH*/
+ case PM_DISABLE_CPU_DEEP_IDLE:
+ return (hpet_deep_idle_config(code));
+
+ case CB_CODE_CPR_RESUME:
+ /*FALLTHROUGH*/
+ case CB_CODE_CPR_CHKPT:
+ return (hpet_cpr(code));
+
+ case CST_EVENT_MULTIPLE_CSTATES:
+ hpet_cst_callback(CST_EVENT_MULTIPLE_CSTATES);
+ return (B_TRUE);
+
+ case CST_EVENT_ONE_CSTATE:
+ hpet_cst_callback(CST_EVENT_ONE_CSTATE);
+ return (B_TRUE);
+
+ default:
+ cmn_err(CE_NOTE, "!hpet_callback: invalid code %d\n", code);
+ return (B_FALSE);
+ }
+}
+
+/*
+ * According to the HPET spec 1.0a: the Operating System must save and restore
+ * HPET event timer hardware context through ACPI sleep state transitions.
+ * Timer registers (including the main counter) may not be preserved through
+ * ACPI S3, S4, or S5 sleep states. This code does not support S1 or S2.
+ *
+ * Current HPET state is already in hpet.supported and
+ * hpet_state.proxy_installed. hpet_info contains the proxy interrupt HPET
+ * Timer state.
+ *
+ * Future projects beware: the HPET Main Counter is undefined after ACPI S3 or
+ * S4, and it is not saved/restored here. Future projects cannot expect the
+ * Main Counter to be monotonically (or accurately) increasing across CPR.
+ *
+ * Note: the CPR Checkpoint path later calls pause_cpus() which ensures all
+ * CPUs are awake and in a spin loop before the system suspends. The HPET is
+ * not needed for Deep C-state wakeup when CPUs are in cpu_pause(). 
+ * It is safe to leave the HPET running as the system suspends; we just + * disable the timer from generating interrupts here. + */ +static boolean_t +hpet_cpr(int code) +{ + ulong_t intr, dead_count = 0; + hrtime_t dead = gethrtime() + hpet_spin_timeout; + boolean_t ret = B_TRUE; + + mutex_enter(&hpet_state_lock); + switch (code) { + case CB_CODE_CPR_CHKPT: + if (hpet_state.proxy_installed == B_FALSE) + break; + + hpet_state.cpr = B_TRUE; + + intr = intr_clear(); + while (!mutex_tryenter(&hpet_proxy_lock)) { + /* + * spin + */ + intr_restore(intr); + if (dead_count++ > hpet_spin_check) { + dead_count = 0; + if (gethrtime() > dead) { + hpet_state.cpr = B_FALSE; + mutex_exit(&hpet_state_lock); + cmn_err(CE_NOTE, "!hpet_cpr: deadman"); + return (B_FALSE); + } + } + intr = intr_clear(); + } + hpet_expire_all(); + mutex_exit(&hpet_proxy_lock); + intr_restore(intr); + + hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer); + break; + + case CB_CODE_CPR_RESUME: + if (hpet_resume() == B_TRUE) + hpet_state.cpr = B_FALSE; + else + cmn_err(CE_NOTE, "!hpet_resume failed."); + break; + + default: + cmn_err(CE_NOTE, "!hpet_cpr: invalid code %d\n", code); + ret = B_FALSE; + break; + } + mutex_exit(&hpet_state_lock); + return (ret); +} + +/* + * Assume the HPET stopped in Suspend state and timer state was lost. + */ +static boolean_t +hpet_resume(void) +{ + if (hpet.supported != HPET_TIMER_SUPPORT) + return (B_TRUE); + + /* + * The HPET spec does not specify if Legacy Replacement Route is + * on or off by default, so we set it off here. 
+ */ + (void) hpet_set_leg_rt_cnf(&hpet_info, 0); + + if (hpet_start_main_counter(&hpet_info) != AE_OK) { + cmn_err(CE_NOTE, "!hpet_resume: start main counter failed"); + hpet.supported = HPET_NO_SUPPORT; + if (hpet_state.proxy_installed == B_TRUE) { + hpet_state.proxy_installed = B_FALSE; + hpet_uninstall_interrupt_handler(); + } + return (B_FALSE); + } + + if (hpet_state.proxy_installed == B_FALSE) + return (B_TRUE); + + hpet_timer_set_up(&hpet_info, hpet_info.cstate_timer.timer, + hpet_info.cstate_timer.intr); + if (hpet_state.cpu_deep_idle == B_TRUE) + hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer); + + return (B_TRUE); +} + +/* + * Callback to enable/disable Deep C-States based on power.conf setting. + */ +static boolean_t +hpet_deep_idle_config(int code) +{ + ulong_t intr, dead_count = 0; + hrtime_t dead = gethrtime() + hpet_spin_timeout; + boolean_t ret = B_TRUE; + + mutex_enter(&hpet_state_lock); + switch (code) { + case PM_DEFAULT_CPU_DEEP_IDLE: + /*FALLTHROUGH*/ + case PM_ENABLE_CPU_DEEP_IDLE: + + if (hpet_state.cpu_deep_idle == B_TRUE) + break; + + if (hpet_state.proxy_installed == B_FALSE) { + ret = B_FALSE; /* Deep C-States not supported */ + break; + } + + hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer); + hpet_state.cpu_deep_idle = B_TRUE; + break; + + case PM_DISABLE_CPU_DEEP_IDLE: + + if ((hpet_state.cpu_deep_idle == B_FALSE) || + (hpet_state.proxy_installed == B_FALSE)) + break; + + /* + * The order of these operations is important to avoid + * lost wakeups: Set a flag to refuse all future LAPIC Timer + * proxy requests, then wake up all CPUs from deep C-state, + * and finally disable the HPET interrupt-generating timer. 
+ */ + hpet_state.cpu_deep_idle = B_FALSE; + + intr = intr_clear(); + while (!mutex_tryenter(&hpet_proxy_lock)) { + /* + * spin + */ + intr_restore(intr); + if (dead_count++ > hpet_spin_check) { + dead_count = 0; + if (gethrtime() > dead) { + hpet_state.cpu_deep_idle = B_TRUE; + mutex_exit(&hpet_state_lock); + cmn_err(CE_NOTE, + "!hpet_deep_idle_config: deadman"); + return (B_FALSE); + } + } + intr = intr_clear(); + } + hpet_expire_all(); + mutex_exit(&hpet_proxy_lock); + intr_restore(intr); + + hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer); + break; + + default: + cmn_err(CE_NOTE, "!hpet_deep_idle_config: invalid code %d\n", + code); + ret = B_FALSE; + break; + } + mutex_exit(&hpet_state_lock); + + return (ret); +} + +/* + * Callback for _CST c-state change notifications. + */ +static void +hpet_cst_callback(uint32_t code) +{ + ulong_t intr, dead_count = 0; + hrtime_t dead = gethrtime() + hpet_spin_timeout; + + switch (code) { + case CST_EVENT_ONE_CSTATE: + hpet_state.uni_cstate = B_TRUE; + intr = intr_clear(); + while (!mutex_tryenter(&hpet_proxy_lock)) { + /* + * spin + */ + intr_restore(intr); + if (dead_count++ > hpet_spin_check) { + dead_count = 0; + if (gethrtime() > dead) { + hpet_expire_all(); + cmn_err(CE_NOTE, + "!hpet_cst_callback: deadman"); + return; + } + } + intr = intr_clear(); + } + hpet_expire_all(); + mutex_exit(&hpet_proxy_lock); + intr_restore(intr); + break; + + case CST_EVENT_MULTIPLE_CSTATES: + hpet_state.uni_cstate = B_FALSE; + break; + + default: + cmn_err(CE_NOTE, "!hpet_cst_callback: invalid code %d\n", code); + break; + } +} + +/* + * Interrupt Service Routine for HPET I/O-APIC-generated interrupts. + * Used to wakeup CPUs from Deep C-state when their Local APIC Timer stops. + * This ISR runs on one CPU which pokes other CPUs out of Deep C-state as + * needed. 
 + */
+/* ARGSUSED */
+static uint_t
+hpet_isr(char *arg)
+{
+ uint64_t timer_status;
+ uint64_t timer_mask;
+ ulong_t intr, dead_count = 0;
+ hrtime_t dead = gethrtime() + hpet_isr_spin_timeout;
+
+ timer_mask = HPET_INTR_STATUS_MASK(hpet_info.cstate_timer.timer);
+
+ /*
+ * We are using a level-triggered interrupt.
+ * HPET sets timer's General Interrupt Status Register bit N.
+ * ISR checks this bit to see if it needs servicing.
+ * ISR then clears this bit by writing 1 to that bit.
+ */
+ timer_status = hpet_read_gen_intrpt_stat(&hpet_info);
+ if (!(timer_status & timer_mask))
+ return (DDI_INTR_UNCLAIMED);
+ hpet_write_gen_intrpt_stat(&hpet_info, timer_mask);
+
+ /*
+ * Do not touch ISR data structures before checking the HPET's General
+ * Interrupt Status register. The General Interrupt Status register
+ * will not be set by hardware until after timer interrupt generation
+ * is enabled by software. Software allocates necessary data
+ * structures before enabling timer interrupts. ASSERT the software
+ * data structures required to handle this interrupt are initialized.
+ */
+ ASSERT(hpet_proxy_users != NULL);
+
+ /*
+ * CPUs in deep c-states do not enable interrupts until after
+ * performing idle cleanup which includes descheduling themselves from
+ * the HPET. The CPU running this ISR will NEVER find itself in the
+ * proxy list. A lost wakeup may occur if this is false.
+ */
+ ASSERT(hpet_proxy_users[CPU->cpu_id] == HPET_INFINITY);
+
+ /*
+ * Higher level interrupts may deadlock with CPUs going idle if this
+ * ISR is preempted while holding hpet_proxy_lock. 
+ */ + intr = intr_clear(); + while (!mutex_tryenter(&hpet_proxy_lock)) { + /* + * spin + */ + intr_restore(intr); + if (dead_count++ > hpet_spin_check) { + dead_count = 0; + if (gethrtime() > dead) { + hpet_expire_all(); + return (DDI_INTR_CLAIMED); + } + } + intr = intr_clear(); + } + (void) hpet_guaranteed_schedule(HPET_INFINITY); + mutex_exit(&hpet_proxy_lock); + intr_restore(intr); + + return (DDI_INTR_CLAIMED); +} + +/* + * Used when disabling the HPET Timer interrupt. CPUs in Deep C-state must be + * woken up because they can no longer rely on the HPET's Timer to wake them. + * We do not need to wait for CPUs to wakeup. + */ +static void +hpet_expire_all(void) +{ + processorid_t id; + + for (id = 0; id < ncpus; ++id) { + if (hpet_proxy_users[id] != HPET_INFINITY) { + hpet_proxy_users[id] = HPET_INFINITY; + if (id != CPU->cpu_id) + poke_cpu(id); + } + } +} + +/* + * To avoid missed wakeups this function must guarantee either the HPET timer + * was successfully programmed to the next expire time or there are no waiting + * CPUs. + * + * Callers cannot enter C2 or deeper if the HPET could not be programmed to + * generate its next interrupt to happen at required_wakeup_time or sooner. + * Returns B_TRUE if the HPET was programmed to interrupt by + * required_wakeup_time, B_FALSE if not. + */ +static boolean_t +hpet_guaranteed_schedule(hrtime_t required_wakeup_time) +{ + hrtime_t now, next_proxy_time; + processorid_t id, next_proxy_id; + int proxy_timer = hpet_info.cstate_timer.timer; + boolean_t done = B_FALSE; + + ASSERT(mutex_owned(&hpet_proxy_lock)); + + /* + * Loop until we successfully program the HPET, + * or no CPUs are scheduled to use the HPET as a proxy. + */ + do { + /* + * Wake all CPUs that expired before now. + * Find the next CPU to wake up and next HPET program time. 
+ */ + now = gethrtime(); + next_proxy_time = HPET_INFINITY; + next_proxy_id = CPU->cpu_id; + for (id = 0; id < ncpus; ++id) { + if (hpet_proxy_users[id] < now) { + hpet_proxy_users[id] = HPET_INFINITY; + if (id != CPU->cpu_id) + poke_cpu(id); + } else if (hpet_proxy_users[id] < next_proxy_time) { + next_proxy_time = hpet_proxy_users[id]; + next_proxy_id = id; + } + } + + if (next_proxy_time == HPET_INFINITY) { + done = B_TRUE; + /* + * There are currently no CPUs using the HPET's Timer + * as a proxy for their LAPIC Timer. The HPET's Timer + * does not need to be programmed. + * + * Letting the HPET timer wrap around to the current + * time is the longest possible timeout. + * A 64-bit timer will wrap around in ~ 2^44 seconds. + * A 32-bit timer will wrap around in ~ 2^12 seconds. + * + * Disabling the HPET's timer interrupt requires a + * (relatively expensive) write to the HPET. + * Instead we do nothing. + * + * We are gambling some CPU will attempt to enter a + * deep c-state before the timer wraps around. + * We assume one spurious interrupt in a little over an + * hour has less performance impact than writing to the + * HPET's timer disable bit every time all CPUs wakeup + * from deep c-state. + */ + + } else { + /* + * Idle CPUs disable interrupts before programming the + * HPET to prevent a lost wakeup if the HPET + * interrupts the idle cpu before it can enter a + * Deep C-State. + */ + if (hpet_timer_program(&hpet_info, proxy_timer, + HRTIME_TO_HPET_TICKS(next_proxy_time - gethrtime())) + != AE_OK) { + /* + * We could not program the HPET to wakeup the + * next CPU. We must wake the CPU ourself to + * avoid a lost wakeup. + */ + hpet_proxy_users[next_proxy_id] = HPET_INFINITY; + if (next_proxy_id != CPU->cpu_id) + poke_cpu(next_proxy_id); + } else { + done = B_TRUE; + } + } + + } while (!done); + + return (next_proxy_time <= required_wakeup_time); +} + +/* + * Use an HPET timer to act as this CPU's proxy local APIC timer. 
 + * Used in deep c-states C2 and above while the CPU's local APIC timer stalls.
+ * Called by the idle thread with interrupts enabled.
+ * Always returns with interrupts disabled.
+ *
+ * There are 3 possible outcomes from this function:
+ * 1. The Local APIC Timer was already disabled before this function was called.
+ * LAPIC TIMER : disabled
+ * HPET : not scheduled to wake this CPU
+ * *lapic_expire : (hrtime_t)HPET_INFINITY
+ * Returns : B_TRUE
+ * 2. Successfully programmed the HPET to act as a LAPIC Timer proxy.
+ * LAPIC TIMER : disabled
+ * HPET : scheduled to wake this CPU
+ * *lapic_expire : hrtime_t when LAPIC timer would have expired
+ * Returns : B_TRUE
+ * 3. Failed to program the HPET to act as a LAPIC Timer proxy.
+ * LAPIC TIMER : enabled
+ * HPET : not scheduled to wake this CPU
+ * *lapic_expire : (hrtime_t)HPET_INFINITY
+ * Returns : B_FALSE
+ *
+ * The idle thread cannot enter Deep C-State in case 3.
+ * The idle thread must re-enable & re-program the LAPIC_TIMER in case 2.
+ */
+static boolean_t
+hpet_use_hpet_timer(hrtime_t *lapic_expire)
+{
+ extern hrtime_t apic_timer_stop_count(void);
+ extern void apic_timer_restart(hrtime_t);
+ hrtime_t now, expire, dead;
+ uint64_t lapic_count, dead_count;
+ cpupart_t *cpu_part;
+ processorid_t cpu_sid;
+ processorid_t cpu_id = CPU->cpu_id;
+ processorid_t id;
+ boolean_t rslt;
+ boolean_t hset_update;
+
+ cpu_part = CPU->cpu_part;
+ cpu_sid = CPU->cpu_seqid;
+
+ ASSERT(CPU->cpu_thread == CPU->cpu_idle_thread);
+ ASSERT(interrupts_enabled());
+
+ /*
+ * A critical section exists between when the HPET is programmed
+ * to interrupt the CPU and when this CPU enters an idle state.
+ * Interrupts must be blocked during that time to prevent lost
+ * CBE wakeup interrupts from either LAPIC or HPET.
+ *
+ * Must block interrupts before acquiring hpet_proxy_lock to prevent
+ * a deadlock with the ISR if the ISR runs on this CPU after the
+ * idle thread acquires the mutex but before it clears interrupts. 
+ */ + cli(); + + lapic_count = apic_timer_stop_count(); + now = gethrtime(); + dead = now + hpet_idle_spin_timeout; + *lapic_expire = expire = now + lapic_count; + if (lapic_count == (hrtime_t)-1) { + /* + * LAPIC timer is currently disabled. + * Will not use the HPET as a LAPIC Timer proxy. + */ + *lapic_expire = (hrtime_t)HPET_INFINITY; + return (B_TRUE); + } + + /* + * Serialize hpet_proxy data structure manipulation. + */ + dead_count = 0; + while (!mutex_tryenter(&hpet_proxy_lock)) { + /* + * spin + */ + apic_timer_restart(expire); + sti(); + cli(); + + if (dead_count++ > hpet_spin_check) { + dead_count = 0; + hset_update = (((CPU->cpu_flags & CPU_OFFLINE) == 0) && + (ncpus > 1)); + if (hset_update && + !bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) { + *lapic_expire = (hrtime_t)HPET_INFINITY; + return (B_FALSE); + } + } + + lapic_count = apic_timer_stop_count(); + now = gethrtime(); + *lapic_expire = expire = now + lapic_count; + if (lapic_count == (hrtime_t)-1) { + /* + * LAPIC timer is currently disabled. + * Will not use the HPET as a LAPIC Timer proxy. + */ + *lapic_expire = (hrtime_t)HPET_INFINITY; + return (B_TRUE); + } + if (now > dead) { + apic_timer_restart(expire); + *lapic_expire = (hrtime_t)HPET_INFINITY; + return (B_FALSE); + } + } + + if ((hpet_state.cpr == B_TRUE) || + (hpet_state.cpu_deep_idle == B_FALSE) || + (hpet_state.proxy_installed == B_FALSE) || + (hpet_state.uni_cstate == B_TRUE)) { + mutex_exit(&hpet_proxy_lock); + apic_timer_restart(expire); + *lapic_expire = (hrtime_t)HPET_INFINITY; + return (B_FALSE); + } + + hpet_proxy_users[cpu_id] = expire; + + /* + * We are done if another cpu is scheduled on the HPET with an + * expire time before us. The next HPET interrupt has been programmed + * to fire before our expire time. + */ + for (id = 0; id < ncpus; ++id) { + if ((hpet_proxy_users[id] <= expire) && (id != cpu_id)) { + mutex_exit(&hpet_proxy_lock); + return (B_TRUE); + } + } + + /* + * We are the next lAPIC to expire. 
+ * Program the HPET with our expire time. + */ + rslt = hpet_guaranteed_schedule(expire); + mutex_exit(&hpet_proxy_lock); + + if (rslt == B_FALSE) { + apic_timer_restart(expire); + *lapic_expire = (hrtime_t)HPET_INFINITY; + } + + return (rslt); +} + +/* + * Called by the idle thread when waking up from Deep C-state before enabling + * interrupts. With an array data structure it is faster to always remove + * ourself from the array without checking if the HPET ISR already removed. + * + * We use a lazy algorithm for removing CPUs from the HPET's schedule. + * We do not reprogram the HPET here because this CPU has real work to do. + * On a idle system the CPU was probably woken up by the HPET's ISR. + * On a heavily loaded system CPUs are not going into Deep C-state. + * On a moderately loaded system another CPU will usually enter Deep C-state + * and reprogram the HPET before the HPET fires with our wakeup. + */ +static void +hpet_use_lapic_timer(hrtime_t expire) +{ + extern void apic_timer_restart(hrtime_t); + processorid_t cpu_id = CPU->cpu_id; + + ASSERT(CPU->cpu_thread == CPU->cpu_idle_thread); + ASSERT(!interrupts_enabled()); + + hpet_proxy_users[cpu_id] = HPET_INFINITY; + + /* + * Do not enable a LAPIC Timer that was initially disabled. + */ + if (expire != HPET_INFINITY) + apic_timer_restart(expire); + + sti(); +} + +/* + * Initialize data structure to keep track of CPUs using HPET as a proxy for + * their stalled local APIC timer. For now this is just an array. + */ +static void +hpet_init_proxy_data(void) +{ + processorid_t id; + + /* + * Use apic_nproc because we are in boot before max_ncpus has been + * initialized. + */ + hpet_proxy_users = kmem_zalloc(apic_nproc * sizeof (*hpet_proxy_users), + KM_SLEEP); + + /* + * Unused entries always contain HPET_INFINITY. + */ + for (id = 0; id < apic_nproc; ++id) + hpet_proxy_users[id] = HPET_INFINITY; +}
--- a/usr/src/uts/i86pc/io/mp_platform_common.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/io/mp_platform_common.c Wed Feb 25 21:04:18 2009 -0800 @@ -62,7 +62,10 @@ #include <sys/note.h> #include <sys/pci_intr_lib.h> #include <sys/sunndi.h> - +#if !defined(__xpv) +#include <sys/hpet.h> +#include <sys/clock.h> +#endif /* * Local Function Prototypes @@ -103,6 +106,12 @@ int apic_sci_vect = -1; iflag_t apic_sci_flags; +#if !defined(__xpv) +/* ACPI HPET interrupt configuration; -1 if HPET not used */ +int apic_hpet_vect = -1; +iflag_t apic_hpet_flags; +#endif + /* * psm name pointer */ @@ -892,6 +901,17 @@ cmn_err(CE_CONT, "?Using ACPI for CPU/IOAPIC information ONLY\n"); } + +#if !defined(__xpv) + /* + * probe ACPI for hpet information here which is used later + * in apic_picinit(). + */ + if (hpet_acpi_init(&apic_hpet_vect, &apic_hpet_flags) < 0) { + cmn_err(CE_NOTE, "!ACPI HPET table query failed\n"); + } +#endif + return (PSM_SUCCESS); } /* if setting APIC mode failed above, we fall through to cleanup */ @@ -1324,6 +1344,40 @@ irqptr->airq_share++; } + +#if !defined(__xpv) + /* + * Hack alert: deal with ACPI HPET interrupt chicken/egg here. + */ + if (apic_hpet_vect > 0) { + /* + * hpet has already done add_avintr(); we just need + * to finish the job by mimicing translate_irq() + * + * Fake up an intrspec and setup the tables + */ + ispec.intrspec_vec = apic_hpet_vect; + ispec.intrspec_pri = CBE_HIGH_PIL; + + if (apic_setup_irq_table(NULL, apic_hpet_vect, NULL, + &ispec, &apic_hpet_flags, DDI_INTR_TYPE_FIXED) < 0) { + cmn_err(CE_WARN, "!apic: HPET setup failed"); + return; + } + irqptr = apic_irq_table[apic_hpet_vect]; + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + /* Program I/O APIC */ + (void) apic_setup_io_intr(irqptr, apic_hpet_vect, B_FALSE); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + irqptr->airq_share++; + } +#endif /* !defined(__xpv) */ } /*
--- a/usr/src/uts/i86pc/io/pcplusmp/apic.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/io/pcplusmp/apic.c Wed Feb 25 21:04:18 2009 -0800 @@ -68,6 +68,7 @@ #include <sys/sunddi.h> #include <sys/x_call.h> #include <sys/reboot.h> +#include <sys/hpet.h> /* * Local Function Prototypes @@ -1650,6 +1651,8 @@ uchar_t byte; ulong_t iflag; + hpet_acpi_fini(); + /* Send NMI to all CPUs except self to do per processor shutdown */ iflag = intr_clear(); #ifdef DEBUG @@ -2039,6 +2042,41 @@ (apic_clkvect + APIC_BASE_VECT) | AV_MASK); } +/* + * Set timer far into the future and return timer + * current Count in nanoseconds. + */ +hrtime_t +apic_timer_stop_count(void) +{ + hrtime_t ns_val; + int enable_val, count_val; + + /* + * Should be called with interrupts disabled. + */ + ASSERT(!interrupts_enabled()); + + enable_val = apic_reg_ops->apic_read(APIC_LOCAL_TIMER); + if ((enable_val & AV_MASK) == AV_MASK) + return ((hrtime_t)-1); /* timer is disabled */ + + count_val = apic_reg_ops->apic_read(APIC_CURR_COUNT); + ns_val = APIC_TICKS_TO_NSECS(count_val); + + apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); + + return (ns_val); +} + +/* + * Reprogram timer after Deep C-State. + */ +void +apic_timer_restart(hrtime_t time) +{ + apic_timer_reprogram(time); +} ddi_periodic_t apic_periodic_id;
--- a/usr/src/uts/i86pc/io/ppm_plat.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/io/ppm_plat.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Platform Power Management master pseudo driver platform support. */ @@ -49,14 +47,17 @@ ppm_rebuild_cpu_domains(void) { char *str = "ppm_rebuild_cpu_domains"; - cpupm_cpu_dependency_t *dep; - cpupm_cpu_dependency_t *dep_next; - cpupm_cpu_node_t *cpu_next; + cpupm_state_domains_t *dep; + cpupm_state_domains_t *dep_next; struct ppm_domit *domit_p; ppm_domain_t *domp_old; ppm_domain_t *domp; ppm_dev_t *devp; ppm_db_t *dbp; + uint_t cpu_id; + cpuset_t dom_cpu_set; + int result; + dev_info_t *cpu_dip; /* * Get the CPU domain data @@ -100,7 +101,7 @@ * leave the domain as it is (which is unmanageable since * PPM_CPU_READY is off). */ - dep = cpupm_get_cpu_dependencies(); + dep = cpupm_pstate_domains; if (dep == NULL) { PPMD(D_CPU, ("%s: No CPU dependency info!\n", str)); return; @@ -112,11 +113,11 @@ */ mutex_enter(&domp_old->lock); domp_old->dflags |= PPMD_OFFLINE; - for (dep_next = dep; dep_next; dep_next = dep_next->cd_next) { + for (dep_next = dep; dep_next; dep_next = dep_next->pm_next) { domp = kmem_zalloc(sizeof (*domp), KM_SLEEP); domp->name = kmem_zalloc(MAXNAMELEN, KM_SLEEP); (void) snprintf(domp->name, MAXNAMELEN, "acpi_cpu_domain_%d", - dep_next->cd_dependency_id); + dep_next->pm_domain); mutex_init(&domp->lock, NULL, MUTEX_DRIVER, NULL); mutex_enter(&domp->lock); domp->dflags = domit_p->dflags | PPMD_CPU_READY; @@ -135,18 +136,27 @@ * build the "conflist" for the domain. But conveniently, the * "conflist" data is easily obtainable from the "devlist". 
*/ - for (cpu_next = dep_next->cd_cpu; cpu_next; - cpu_next = cpu_next->cn_next) { - devp = PPM_GET_PRIVATE(cpu_next->cn_dip); + dom_cpu_set = dep_next->pm_cpus; + do { + CPUSET_FIND(dom_cpu_set, cpu_id); + if (cpu_id == CPUSET_NOTINSET) + break; + + ASSERT(cpu_id < NCPU); + cpu_dip = ((cpupm_mach_state_t *) + (cpu[cpu_id]->cpu_m.mcpu_pm_mach_state))->ms_dip; + devp = PPM_GET_PRIVATE(cpu_dip); ASSERT(devp && devp->domp == domp_old); - devp = ppm_add_dev(cpu_next->cn_dip, domp); + devp = ppm_add_dev(cpu_dip, domp); dbp = kmem_zalloc(sizeof (struct ppm_db), KM_SLEEP); dbp->name = kmem_zalloc((strlen(devp->path) + 1), KM_SLEEP); (void) strcpy(dbp->name, devp->path); dbp->next = domp->conflist; domp->conflist = dbp; - } + + CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result); + } while (result == 0); /* * Note that we do not bother creating a "dc" list as there @@ -165,7 +175,6 @@ mutex_exit(&domp->lock); } mutex_exit(&domp_old->lock); - cpupm_free_cpu_dependencies(); } /* @@ -176,7 +185,7 @@ ppm_set_topspeed(ppm_dev_t *cpup, int speed) { for (cpup = cpup->domp->devlist; cpup != NULL; cpup = cpup->next) - (*cpupm_set_topspeed)(cpup->dip, speed); + (*cpupm_set_topspeed_callb)(cpup->dip, speed); } /* @@ -197,7 +206,8 @@ cpup = PPM_GET_PRIVATE((dev_info_t *)ctx); - if (cpupm_get_topspeed == NULL || cpupm_set_topspeed == NULL) { + if (cpupm_get_topspeed_callb == NULL || + cpupm_set_topspeed_callb == NULL) { cmn_err(CE_WARN, "%s: Cannot process request for instance %d " "since cpupm interfaces are not initialized", str, ddi_get_instance(cpup->dip)); @@ -215,7 +225,7 @@ * Process each CPU in the domain. */ for (ncpup = cpup->domp->devlist; ncpup != NULL; ncpup = ncpup->next) { - topspeed = (*cpupm_get_topspeed)(ncpup->dip); + topspeed = (*cpupm_get_topspeed_callb)(ncpup->dip); if (newspeed == -1 || topspeed < newspeed) newspeed = topspeed; }
--- a/usr/src/uts/i86pc/os/cpuid.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/os/cpuid.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1024,6 +1024,22 @@ cpi->cpi_ncore_per_chip = 1; break; } + + /* + * Get CPUID data about TSC Invariance in Deep C-State. + */ + switch (cpi->cpi_vendor) { + case X86_VENDOR_Intel: + if (cpi->cpi_maxeax >= 7) { + cp = &cpi->cpi_extd[7]; + cp->cp_eax = 0x80000007; + cp->cp_ecx = 0; + (void) __cpuid_insn(cp); + } + break; + default: + break; + } } else { cpi->cpi_ncore_per_chip = 1; } @@ -3847,6 +3863,36 @@ } } +int +cpuid_deep_cstates_supported(void) +{ + struct cpuid_info *cpi; + struct cpuid_regs regs; + + ASSERT(cpuid_checkpass(CPU, 1)); + + cpi = CPU->cpu_m.mcpu_cpi; + + if (!(x86_feature & X86_CPUID)) + return (0); + + switch (cpi->cpi_vendor) { + case X86_VENDOR_Intel: + if (cpi->cpi_xmaxeax < 0x80000007) + return (0); + + /* + * TSC run at a constant rate in all ACPI C-states? + */ + regs.cp_eax = 0x80000007; + (void) __cpuid_insn(®s); + return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE); + + default: + return (0); + } +} + #if defined(__amd64) && !defined(__xpv) /* * Patch in versions of bcopy for high performance Intel Nhm processors
--- a/usr/src/uts/i86pc/os/cpupm.c Wed Feb 25 20:53:30 2009 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,247 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/cpupm.h> - -/* - * This callback is used to build the PPM CPU domains once - * all the CPU devices have been started. The callback is - * initialized by the PPM driver to point to a routine that - * will build the domains. - */ -void (*cpupm_rebuild_cpu_domains)(void); - -/* - * This callback is used to reset the topspeed for all the - * CPU devices. The callback is initialized by the PPM driver to - * point to a routine that will reinitialize all the CPU devices - * once all the CPU devices have been started and the CPU domains - * built. - */ -void (*cpupm_init_topspeed)(void); - -/* - * This callback is used to redefine the topspeed for a CPU device. - * Since all CPUs in a domain should have identical properties, this - * callback is initialized by the PPM driver to point to a routine - * that will redefine the topspeed for all devices in a CPU domain. 
- * This callback is exercised whenever an ACPI _PPC change notification - * is received by the CPU driver. - */ -void (*cpupm_redefine_topspeed)(void *); - -/* - * This callback is used by the PPM driver to call into the CPU driver - * to find a CPU's current topspeed (i.e., it's current ACPI _PPC value). - */ -void (*cpupm_set_topspeed)(void *, int); - -/* - * This callback is used by the PPM driver to call into the CPU driver - * to set a new topspeed for a CPU. - */ -int (*cpupm_get_topspeed)(void *); - -/* - * Used to dynamically keep track of the CPU dependencies as CPU - * devices attach. Will subsequently be used by the PPM driver - * to build PPM CPU domains. - */ -static cpupm_cpu_dependency_t *cpupm_cpu_dependencies = NULL; - -/* - * If we are unable to correctly identify a dependency for any CPU, then - * we punt and all CPUs are managed as one domain. - */ -static boolean_t cpupm_dependencies_valid = B_TRUE; - -/* - * If any CPU fails to attach, then cpupm is disabled for all CPUs. - */ -static uint32_t cpupm_enabled = CPUPM_P_STATES | CPUPM_T_STATES; - -/* - * Until all CPUs have succesfully attached, we do not allow - * power management. - */ -static boolean_t cpupm_ready = B_FALSE; - -/* - * Print the CPU dependencies. - */ -static void -cpupm_print_cpu_dependencies() -{ - cpupm_cpu_dependency_t *dptr; - cpupm_cpu_node_t *nptr; - - for (dptr = cpupm_cpu_dependencies; dptr != NULL; - dptr = dptr->cd_next) { - for (nptr = dptr->cd_cpu; nptr != NULL; nptr = nptr->cn_next) { - int instance = ddi_get_instance(nptr->cn_dip); - cmn_err(CE_NOTE, - "print_cpu_dependencies: dependency %d " - "instance %d\n", dptr->cd_dependency_id, instance); - } - } -} - -/* - * Used to retrieve the dependencies built during CPUs attaching. - */ -cpupm_cpu_dependency_t * -cpupm_get_cpu_dependencies() -{ - return (cpupm_cpu_dependencies); -} - -/* - * Build dependencies as CPUs attach. 
Note that we don't need to worry - * about locking the dependency lists as concurrency is not an issue. - * This routine relies on the fact that the CPU devices are attached - * sequentially by a single thread. - */ -void -cpupm_add_cpu2dependency(dev_info_t *dip, int cpu_dependency) -{ - cpupm_cpu_dependency_t *dptr; - cpupm_cpu_node_t *nptr; - - if (!cpupm_dependencies_valid) - return; - - if (cpu_dependency == -1) { - cpupm_free_cpu_dependencies(); - return; - } - - for (dptr = cpupm_cpu_dependencies; dptr != NULL; - dptr = dptr->cd_next) { - if (dptr->cd_dependency_id == cpu_dependency) - break; - } - - /* new dependency is created and linked at the head */ - if (dptr == NULL) { - dptr = kmem_zalloc(sizeof (cpupm_cpu_dependency_t), KM_SLEEP); - dptr->cd_dependency_id = cpu_dependency; - dptr->cd_next = cpupm_cpu_dependencies; - cpupm_cpu_dependencies = dptr; - } - - /* new cpu is created and linked at head of dependency */ - nptr = kmem_zalloc(sizeof (cpupm_cpu_node_t), KM_SLEEP); - nptr->cn_dip = dip; - nptr->cn_next = dptr->cd_cpu; - dptr->cd_cpu = nptr; -} - -/* - * Free the CPU dependencies. - */ -void -cpupm_free_cpu_dependencies() -{ - cpupm_cpu_dependency_t *this_dependency, *next_dependency; - cpupm_cpu_node_t *this_node, *next_node; - - cpupm_dependencies_valid = B_FALSE; - this_dependency = cpupm_cpu_dependencies; - while (this_dependency != NULL) { - next_dependency = this_dependency->cd_next; - - /* discard CPU node chain */ - this_node = this_dependency->cd_cpu; - while (this_node != NULL) { - next_node = this_node->cn_next; - kmem_free((void *)this_node, - sizeof (cpupm_cpu_node_t)); - this_node = next_node; - } - kmem_free((void *)this_dependency, - sizeof (cpupm_cpu_dependency_t)); - this_dependency = next_dependency; - } - cpupm_cpu_dependencies = NULL; -} - -/* - * If all CPUs have attached successfully, then the CPUs are - * ready for power management. 
- */ -boolean_t -cpupm_is_ready() -{ -#ifndef __xpv - if (cpupm_enabled == CPUPM_NO_STATES) - return (B_FALSE); - return (cpupm_ready); -#else - return (B_FALSE); -#endif -} - -boolean_t -cpupm_is_enabled(uint32_t state) -{ - return ((cpupm_enabled & state) == state); -} - -/* - * By default, all states are enabled. But if there are any errors attaching - * any of the CPU devices, then they are disabled. - */ -void -cpupm_disable(uint32_t state) -{ - cpupm_enabled &= ~state; - if (state & CPUPM_P_STATES) - cpupm_free_cpu_dependencies(); -} - -/* - * Once all CPUs have been started, the PPM driver should build CPU - * domains and initialize the topspeed for all CPU devices. - */ -void -cpupm_post_startup() -{ -#ifndef __xpv - /* - * The CPU domain built by the PPM during CPUs attaching - * should be rebuilt with the information retrieved from - * ACPI. - */ - if (cpupm_rebuild_cpu_domains != NULL) - (*cpupm_rebuild_cpu_domains)(); - - /* - * Only initialize the topspeed if P-states are enabled. - */ - if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL) - (*cpupm_init_topspeed)(); -#endif - cpupm_ready = B_TRUE; -}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,1018 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/cpu_acpi.h> + +/* + * List of the processor ACPI object types that are being used. + */ +typedef enum cpu_acpi_obj { + PDC_OBJ = 0, + PCT_OBJ, + PSS_OBJ, + PSD_OBJ, + PPC_OBJ, + PTC_OBJ, + TSS_OBJ, + TSD_OBJ, + TPC_OBJ, + CSD_OBJ, +} cpu_acpi_obj_t; + +/* + * Container to store object name. + * Other attributes can be added in the future as necessary. + */ +typedef struct cpu_acpi_obj_attr { + char *name; +} cpu_acpi_obj_attr_t; + +/* + * List of object attributes. + * NOTE: Please keep the ordering of the list as same as cpu_acpi_obj_t. + */ +static cpu_acpi_obj_attr_t cpu_acpi_obj_attrs[] = { + {"_PDC"}, + {"_PCT"}, + {"_PSS"}, + {"_PSD"}, + {"_PPC"}, + {"_PTC"}, + {"_TSS"}, + {"_TSD"}, + {"_TPC"}, + {"_CSD"} +}; + +/* + * Cache the ACPI CPU control data objects. 
+ */ +static int +cpu_acpi_cache_ctrl_regs(cpu_acpi_handle_t handle, cpu_acpi_obj_t objtype, + cpu_acpi_ctrl_regs_t *regs) +{ + ACPI_BUFFER abuf; + ACPI_OBJECT *obj; + AML_RESOURCE_GENERIC_REGISTER *greg; + int ret = -1; + int i; + + /* + * Fetch the control registers (if present) for the CPU node. + * Since they are optional, non-existence is not a failure + * (we just consider it a fixed hardware case). + */ + abuf.Length = ACPI_ALLOCATE_BUFFER; + abuf.Pointer = NULL; + if (ACPI_FAILURE(AcpiEvaluateObjectTyped(handle->cs_handle, + cpu_acpi_obj_attrs[objtype].name, NULL, &abuf, + ACPI_TYPE_PACKAGE))) { + regs[0].cr_addrspace_id = ACPI_ADR_SPACE_FIXED_HARDWARE; + regs[1].cr_addrspace_id = ACPI_ADR_SPACE_FIXED_HARDWARE; + return (1); + } + + obj = abuf.Pointer; + if (obj->Package.Count != 2) { + cmn_err(CE_NOTE, "!cpu_acpi: %s package bad count %d.", + cpu_acpi_obj_attrs[objtype].name, obj->Package.Count); + goto out; + } + + /* + * Does the package look coherent? + */ + for (i = 0; i < obj->Package.Count; i++) { + if (obj->Package.Elements[i].Type != ACPI_TYPE_BUFFER) { + cmn_err(CE_NOTE, "!cpu_acpi: " + "Unexpected data in %s package.", + cpu_acpi_obj_attrs[objtype].name); + goto out; + } + + greg = (AML_RESOURCE_GENERIC_REGISTER *) + obj->Package.Elements[i].Buffer.Pointer; + if (greg->DescriptorType != + ACPI_RESOURCE_NAME_GENERIC_REGISTER) { + cmn_err(CE_NOTE, "!cpu_acpi: " + "%s package has format error.", + cpu_acpi_obj_attrs[objtype].name); + goto out; + } + if (greg->ResourceLength != + ACPI_AML_SIZE_LARGE(AML_RESOURCE_GENERIC_REGISTER)) { + cmn_err(CE_NOTE, "!cpu_acpi: " + "%s package not right size.", + cpu_acpi_obj_attrs[objtype].name); + goto out; + } + if (greg->AddressSpaceId != ACPI_ADR_SPACE_FIXED_HARDWARE && + greg->AddressSpaceId != ACPI_ADR_SPACE_SYSTEM_IO) { + cmn_err(CE_NOTE, "!cpu_apci: %s contains unsupported " + "address space type %x", + cpu_acpi_obj_attrs[objtype].name, + greg->AddressSpaceId); + goto out; + } + } + + /* + * Looks good! 
+ */ + for (i = 0; i < obj->Package.Count; i++) { + greg = (AML_RESOURCE_GENERIC_REGISTER *) + obj->Package.Elements[i].Buffer.Pointer; + regs[i].cr_addrspace_id = greg->AddressSpaceId; + regs[i].cr_width = greg->BitWidth; + regs[i].cr_offset = greg->BitOffset; + regs[i].cr_asize = greg->AccessSize; + regs[i].cr_address = greg->Address; + } + ret = 0; +out: + AcpiOsFree(abuf.Pointer); + return (ret); +} + +/* + * Cache the ACPI _PCT data. The _PCT data defines the interface to use + * when making power level transitions (i.e., system IO ports, fixed + * hardware port, etc). + */ +static int +cpu_acpi_cache_pct(cpu_acpi_handle_t handle) +{ + cpu_acpi_pct_t *pct; + int ret; + + CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_PCT_CACHED); + pct = &CPU_ACPI_PCT(handle)[0]; + if ((ret = cpu_acpi_cache_ctrl_regs(handle, PCT_OBJ, pct)) == 0) + CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_PCT_CACHED); + return (ret); +} + +/* + * Cache the ACPI _PTC data. The _PTC data defines the interface to use + * when making T-state transitions (i.e., system IO ports, fixed + * hardware port, etc). + */ +static int +cpu_acpi_cache_ptc(cpu_acpi_handle_t handle) +{ + cpu_acpi_ptc_t *ptc; + int ret; + + CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_PTC_CACHED); + ptc = &CPU_ACPI_PTC(handle)[0]; + if ((ret = cpu_acpi_cache_ctrl_regs(handle, PTC_OBJ, ptc)) == 0) + CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_PTC_CACHED); + return (ret); +} + +/* + * Cache the ACPI CPU state dependency data objects. + */ +static int +cpu_acpi_cache_state_dependencies(cpu_acpi_handle_t handle, + cpu_acpi_obj_t objtype, cpu_acpi_state_dependency_t *sd) +{ + ACPI_BUFFER abuf; + ACPI_OBJECT *pkg, *elements; + int number; + int ret = -1; + + if (objtype == CSD_OBJ) { + number = 6; + } else { + number = 5; + } + /* + * Fetch the dependencies (if present) for the CPU node. + * Since they are optional, non-existence is not a failure + * (it's up to the caller to determine how to handle non-existence). 
+ */ + abuf.Length = ACPI_ALLOCATE_BUFFER; + abuf.Pointer = NULL; + if (ACPI_FAILURE(AcpiEvaluateObjectTyped(handle->cs_handle, + cpu_acpi_obj_attrs[objtype].name, NULL, &abuf, + ACPI_TYPE_PACKAGE))) { + return (1); + } + + pkg = abuf.Pointer; + + if (((objtype != CSD_OBJ) && (pkg->Package.Count != 1)) || + ((objtype == CSD_OBJ) && (pkg->Package.Count != 1) && + (pkg->Package.Count != 2))) { + cmn_err(CE_NOTE, "!cpu_acpi: %s unsupported package " + "count %d.", cpu_acpi_obj_attrs[objtype].name, + pkg->Package.Count); + goto out; + } + + /* + * For C-state domain, we assume C2 and C3 have the same + * domain information + */ + if (pkg->Package.Elements[0].Type != ACPI_TYPE_PACKAGE || + pkg->Package.Elements[0].Package.Count != number) { + cmn_err(CE_NOTE, "!cpu_acpi: Unexpected data in %s package.", + cpu_acpi_obj_attrs[objtype].name); + goto out; + } + elements = pkg->Package.Elements[0].Package.Elements; + if (elements[0].Integer.Value != number || + elements[1].Integer.Value != 0) { + cmn_err(CE_NOTE, "!cpu_acpi: Unexpected %s revision.", + cpu_acpi_obj_attrs[objtype].name); + goto out; + } + + sd->sd_entries = elements[0].Integer.Value; + sd->sd_revision = elements[1].Integer.Value; + sd->sd_domain = elements[2].Integer.Value; + sd->sd_type = elements[3].Integer.Value; + sd->sd_num = elements[4].Integer.Value; + if (objtype == CSD_OBJ) { + sd->sd_index = elements[5].Integer.Value; + } + + ret = 0; +out: + AcpiOsFree(abuf.Pointer); + return (ret); +} + +/* + * Cache the ACPI _PSD data. The _PSD data defines P-state CPU dependencies + * (think CPU domains). + */ +static int +cpu_acpi_cache_psd(cpu_acpi_handle_t handle) +{ + cpu_acpi_psd_t *psd; + int ret; + + CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_PSD_CACHED); + psd = &CPU_ACPI_PSD(handle); + ret = cpu_acpi_cache_state_dependencies(handle, PSD_OBJ, psd); + if (ret == 0) + CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_PSD_CACHED); + return (ret); + +} + +/* + * Cache the ACPI _TSD data. 
The _TSD data defines T-state CPU dependencies + * (think CPU domains). + */ +static int +cpu_acpi_cache_tsd(cpu_acpi_handle_t handle) +{ + cpu_acpi_tsd_t *tsd; + int ret; + + CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_TSD_CACHED); + tsd = &CPU_ACPI_TSD(handle); + ret = cpu_acpi_cache_state_dependencies(handle, TSD_OBJ, tsd); + if (ret == 0) + CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_TSD_CACHED); + return (ret); + +} + +/* + * Cache the ACPI _CSD data. The _CSD data defines C-state CPU dependencies + * (think CPU domains). + */ +static int +cpu_acpi_cache_csd(cpu_acpi_handle_t handle) +{ + cpu_acpi_csd_t *csd; + int ret; + + CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_CSD_CACHED); + csd = &CPU_ACPI_CSD(handle); + ret = cpu_acpi_cache_state_dependencies(handle, CSD_OBJ, csd); + if (ret == 0) + CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_CSD_CACHED); + return (ret); + +} + +static void +cpu_acpi_cache_pstate(cpu_acpi_handle_t handle, ACPI_OBJECT *obj, int cnt) +{ + cpu_acpi_pstate_t *pstate; + ACPI_OBJECT *q, *l; + int i, j; + + CPU_ACPI_PSTATES_COUNT(handle) = cnt; + CPU_ACPI_PSTATES(handle) = kmem_zalloc(CPU_ACPI_PSTATES_SIZE(cnt), + KM_SLEEP); + pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle); + for (i = 0, l = NULL; i < obj->Package.Count && cnt > 0; i++, l = q) { + uint32_t *up; + + q = obj->Package.Elements[i].Package.Elements; + + /* + * Skip duplicate entries. 
+ */ + if (l != NULL && l[0].Integer.Value == q[0].Integer.Value) + continue; + + up = (uint32_t *)pstate; + for (j = 0; j < CPU_ACPI_PSS_CNT; j++) + up[j] = q[j].Integer.Value; + pstate++; + cnt--; + } +} + +static void +cpu_acpi_cache_tstate(cpu_acpi_handle_t handle, ACPI_OBJECT *obj, int cnt) +{ + cpu_acpi_tstate_t *tstate; + ACPI_OBJECT *q, *l; + int i, j; + + CPU_ACPI_TSTATES_COUNT(handle) = cnt; + CPU_ACPI_TSTATES(handle) = kmem_zalloc(CPU_ACPI_TSTATES_SIZE(cnt), + KM_SLEEP); + tstate = (cpu_acpi_tstate_t *)CPU_ACPI_TSTATES(handle); + for (i = 0, l = NULL; i < obj->Package.Count && cnt > 0; i++, l = q) { + uint32_t *up; + + q = obj->Package.Elements[i].Package.Elements; + + /* + * Skip duplicate entries. + */ + if (l != NULL && l[0].Integer.Value == q[0].Integer.Value) + continue; + + up = (uint32_t *)tstate; + for (j = 0; j < CPU_ACPI_TSS_CNT; j++) + up[j] = q[j].Integer.Value; + tstate++; + cnt--; + } +} + +/* + * Cache the _PSS or _TSS data. + */ +static int +cpu_acpi_cache_supported_states(cpu_acpi_handle_t handle, + cpu_acpi_obj_t objtype, int fcnt) +{ + ACPI_BUFFER abuf; + ACPI_OBJECT *obj, *q, *l; + boolean_t eot = B_FALSE; + int ret = -1; + int cnt; + int i, j; + + /* + * Fetch the data (if present) for the CPU node. + */ + abuf.Length = ACPI_ALLOCATE_BUFFER; + abuf.Pointer = NULL; + if (ACPI_FAILURE(AcpiEvaluateObjectTyped(handle->cs_handle, + cpu_acpi_obj_attrs[objtype].name, NULL, &abuf, + ACPI_TYPE_PACKAGE))) { + cmn_err(CE_NOTE, "!cpu_acpi: %s package not found.", + cpu_acpi_obj_attrs[objtype].name); + return (1); + } + obj = abuf.Pointer; + if (obj->Package.Count < 2) { + cmn_err(CE_NOTE, "!cpu_acpi: %s package bad count %d.", + cpu_acpi_obj_attrs[objtype].name, obj->Package.Count); + goto out; + } + + /* + * Does the package look coherent? 
+ */ + cnt = 0; + for (i = 0, l = NULL; i < obj->Package.Count; i++, l = q) { + if (obj->Package.Elements[i].Type != ACPI_TYPE_PACKAGE || + obj->Package.Elements[i].Package.Count != fcnt) { + cmn_err(CE_NOTE, "!cpu_acpi: " + "Unexpected data in %s package.", + cpu_acpi_obj_attrs[objtype].name); + goto out; + } + + q = obj->Package.Elements[i].Package.Elements; + for (j = 0; j < fcnt; j++) { + if (q[j].Type != ACPI_TYPE_INTEGER) { + cmn_err(CE_NOTE, "!cpu_acpi: " + "%s element invalid (type)", + cpu_acpi_obj_attrs[objtype].name); + goto out; + } + } + + /* + * Ignore duplicate entries. + */ + if (l != NULL && l[0].Integer.Value == q[0].Integer.Value) + continue; + + /* + * Some supported state tables are larger than required + * and unused elements are filled with patterns + * of 0xff. Simply check here for frequency = 0xffff + * and stop counting if found. + */ + if (q[0].Integer.Value == 0xffff) { + eot = B_TRUE; + continue; + } + + /* + * We should never find a valid entry after we've hit + * an the end-of-table entry. + */ + if (eot) { + cmn_err(CE_NOTE, "!cpu_acpi: " + "Unexpected data in %s package after eot.", + cpu_acpi_obj_attrs[objtype].name); + goto out; + } + + /* + * states must be defined in order from highest to lowest. + */ + if (l != NULL && l[0].Integer.Value < q[0].Integer.Value) { + cmn_err(CE_NOTE, "!cpu_acpi: " + "%s package state definitions out of order.", + cpu_acpi_obj_attrs[objtype].name); + goto out; + } + + /* + * This entry passes. + */ + cnt++; + } + if (cnt == 0) + goto out; + + /* + * Yes, fill in the structure. + */ + ASSERT(objtype == PSS_OBJ || objtype == TSS_OBJ); + (objtype == PSS_OBJ) ? cpu_acpi_cache_pstate(handle, obj, cnt) : + cpu_acpi_cache_tstate(handle, obj, cnt); + + ret = 0; +out: + AcpiOsFree(abuf.Pointer); + return (ret); +} + +/* + * Cache the _PSS data. The _PSS data defines the different power levels + * supported by the CPU and the attributes associated with each power level + * (i.e., frequency, voltage, etc.). 
The power levels are number from + * highest to lowest. That is, the highest power level is _PSS entry 0 + * and the lowest power level is the last _PSS entry. + */ +static int +cpu_acpi_cache_pstates(cpu_acpi_handle_t handle) +{ + int ret; + + CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_PSS_CACHED); + ret = cpu_acpi_cache_supported_states(handle, PSS_OBJ, + CPU_ACPI_PSS_CNT); + if (ret == 0) + CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_PSS_CACHED); + return (ret); +} + +/* + * Cache the _TSS data. The _TSS data defines the different freq throttle + * levels supported by the CPU and the attributes associated with each + * throttle level (i.e., frequency throttle percentage, voltage, etc.). + * The throttle levels are number from highest to lowest. + */ +static int +cpu_acpi_cache_tstates(cpu_acpi_handle_t handle) +{ + int ret; + + CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_TSS_CACHED); + ret = cpu_acpi_cache_supported_states(handle, TSS_OBJ, + CPU_ACPI_TSS_CNT); + if (ret == 0) + CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_TSS_CACHED); + return (ret); +} + +/* + * Cache the ACPI CPU present capabilities data objects. + */ +static int +cpu_acpi_cache_present_capabilities(cpu_acpi_handle_t handle, + cpu_acpi_obj_t objtype, cpu_acpi_present_capabilities_t *pc) + +{ + ACPI_BUFFER abuf; + ACPI_OBJECT *obj; + + /* + * Fetch the present capabilites object (if present) for the CPU node. + * Since they are optional, non-existence is not a failure. + */ + abuf.Length = ACPI_ALLOCATE_BUFFER; + abuf.Pointer = NULL; + if (ACPI_FAILURE(AcpiEvaluateObject(handle->cs_handle, + cpu_acpi_obj_attrs[objtype].name, NULL, &abuf)) || + abuf.Length == 0) { + *pc = 0; + return (1); + } + + obj = (ACPI_OBJECT *)abuf.Pointer; + *pc = obj->Integer.Value; + AcpiOsFree(abuf.Pointer); + return (0); +} + +/* + * Cache the _PPC data. The _PPC simply contains an integer value which + * represents the highest power level that a CPU should transition to. 
+ * That is, it's an index into the array of _PSS entries and will be + * greater than or equal to zero. + */ +void +cpu_acpi_cache_ppc(cpu_acpi_handle_t handle) +{ + cpu_acpi_ppc_t *ppc; + int ret; + + CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_PPC_CACHED); + ppc = &CPU_ACPI_PPC(handle); + ret = cpu_acpi_cache_present_capabilities(handle, PPC_OBJ, ppc); + if (ret == 0) + CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_PPC_CACHED); +} + +/* + * Cache the _TPC data. The _TPC simply contains an integer value which + * represents the throttle level that a CPU should transition to. + * That is, it's an index into the array of _TSS entries and will be + * greater than or equal to zero. + */ +void +cpu_acpi_cache_tpc(cpu_acpi_handle_t handle) +{ + cpu_acpi_tpc_t *tpc; + int ret; + + CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_TPC_CACHED); + tpc = &CPU_ACPI_TPC(handle); + ret = cpu_acpi_cache_present_capabilities(handle, TPC_OBJ, tpc); + if (ret == 0) + CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_TPC_CACHED); +} + +int +cpu_acpi_verify_cstate(cpu_acpi_cstate_t *cstate) +{ + uint32_t addrspaceid = cstate->cs_addrspace_id; + + if ((addrspaceid != ACPI_ADR_SPACE_FIXED_HARDWARE) && + (addrspaceid != ACPI_ADR_SPACE_SYSTEM_IO)) { + cmn_err(CE_WARN, "!_CST: unsupported address space id" + ":C%d, type: %d\n", cstate->cs_type, addrspaceid); + return (1); + } + return (0); +} + +int +cpu_acpi_cache_cst(cpu_acpi_handle_t handle) +{ + ACPI_BUFFER abuf; + ACPI_OBJECT *obj; + ACPI_INTEGER cnt; + cpu_acpi_cstate_t *cstate, *p; + int i, count; + + CPU_ACPI_OBJ_IS_NOT_CACHED(handle, CPU_ACPI_CST_CACHED); + + abuf.Length = ACPI_ALLOCATE_BUFFER; + abuf.Pointer = NULL; + + if (ACPI_FAILURE(AcpiEvaluateObject(handle->cs_handle, "_CST", + NULL, &abuf))) { + cmn_err(CE_NOTE, "!cpu_acpi: _CST evaluate failure"); + return (-1); + } + obj = (ACPI_OBJECT *)abuf.Pointer; + if (obj->Package.Count < 2) { + cmn_err(CE_NOTE, "!cpu_acpi: _CST package bad count %d.", + obj->Package.Count); + 
AcpiOsFree(abuf.Pointer); + return (-1); + } + + /* + * Does the package look coherent? + */ + cnt = obj->Package.Elements[0].Integer.Value; + if (cnt < 1 || cnt != obj->Package.Count - 1) { + cmn_err(CE_NOTE, "!cpu_acpi: _CST invalid element count %d != " + "Package count %d\n", + (int)cnt, (int)obj->Package.Count - 1); + AcpiOsFree(abuf.Pointer); + return (-1); + } + + CPU_ACPI_CSTATES_COUNT(handle) = (uint32_t)cnt; + CPU_ACPI_CSTATES(handle) = kmem_zalloc(CPU_ACPI_CSTATES_SIZE(cnt), + KM_SLEEP); + CPU_ACPI_BM_INFO(handle) = 0; + cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle); + p = cstate; + + for (i = 1, count = 1; i <= cnt; i++) { + ACPI_OBJECT *pkg; + AML_RESOURCE_GENERIC_REGISTER *reg; + ACPI_OBJECT *element; + + pkg = &(obj->Package.Elements[i]); + reg = (AML_RESOURCE_GENERIC_REGISTER *) + pkg->Package.Elements[0].Buffer.Pointer; + cstate->cs_addrspace_id = reg->AddressSpaceId; + cstate->cs_address = reg->Address; + element = &(pkg->Package.Elements[1]); + cstate->cs_type = element->Integer.Value; + element = &(pkg->Package.Elements[2]); + cstate->cs_latency = element->Integer.Value; + element = &(pkg->Package.Elements[3]); + cstate->cs_power = element->Integer.Value; + + if (cpu_acpi_verify_cstate(cstate)) { + /* + * ignore this entry if it's not valid + */ + continue; + } + if (cstate == p) { + cstate++; + } else if (p->cs_type == cstate->cs_type) { + /* + * if there are duplicate entries, we keep the + * last one. This fixes: + * 1) some buggy BIOS have total duplicate entries. + * 2) ACPI Spec allows the same cstate entry with + * different power and latency, we use the one + * with more power saving. 
+ */ + (void) memcpy(p, cstate, sizeof (cpu_acpi_cstate_t)); + } else { + /* + * we got a valid entry, cache it to the + * cstate structure + */ + p = cstate++; + count++; + } + } + + if (count < 2) { + cmn_err(CE_NOTE, "!cpu_acpi: _CST invalid count %d < 2\n", + count); + AcpiOsFree(abuf.Pointer); + return (-1); + } + + if (count != cnt) + CPU_ACPI_CSTATES_COUNT(handle) = (uint32_t)count; + + AcpiOsFree(abuf.Pointer); + CPU_ACPI_OBJ_IS_CACHED(handle, CPU_ACPI_CST_CACHED); + return (0); +} + +/* + * Cache the _PCT, _PSS, _PSD and _PPC data. + */ +int +cpu_acpi_cache_pstate_data(cpu_acpi_handle_t handle) +{ + if (cpu_acpi_cache_pct(handle) < 0) { + cmn_err(CE_WARN, "!cpu_acpi: error parsing _PCT for " + "CPU %d", handle->cs_id); + return (-1); + } + + if (cpu_acpi_cache_pstates(handle) != 0) { + cmn_err(CE_WARN, "!cpu_acpi: error parsing _PSS for " + "CPU %d", handle->cs_id); + return (-1); + } + + if (cpu_acpi_cache_psd(handle) < 0) { + cmn_err(CE_WARN, "!cpu_acpi: error parsing _PSD for " + "CPU %d", handle->cs_id); + return (-1); + } + + cpu_acpi_cache_ppc(handle); + + return (0); +} + +void +cpu_acpi_free_pstate_data(cpu_acpi_handle_t handle) +{ + if (handle != NULL) { + if (CPU_ACPI_PSTATES(handle)) { + kmem_free(CPU_ACPI_PSTATES(handle), + CPU_ACPI_PSTATES_SIZE( + CPU_ACPI_PSTATES_COUNT(handle))); + CPU_ACPI_PSTATES(handle) = NULL; + } + } +} + +/* + * Cache the _PTC, _TSS, _TSD and _TPC data. 
+ */ +int +cpu_acpi_cache_tstate_data(cpu_acpi_handle_t handle) +{ + if (cpu_acpi_cache_ptc(handle) < 0) { + cmn_err(CE_WARN, "!cpu_acpi: error parsing _PTC for " + "CPU %d", handle->cs_id); + return (-1); + } + + if (cpu_acpi_cache_tstates(handle) != 0) { + cmn_err(CE_WARN, "!cpu_acpi: error parsing _TSS for " + "CPU %d", handle->cs_id); + return (-1); + } + + if (cpu_acpi_cache_tsd(handle) < 0) { + cmn_err(CE_WARN, "!cpu_acpi: error parsing _TSD for " + "CPU %d", handle->cs_id); + return (-1); + } + + cpu_acpi_cache_tpc(handle); + + return (0); +} + +void +cpu_acpi_free_tstate_data(cpu_acpi_handle_t handle) +{ + if (handle != NULL) { + if (CPU_ACPI_TSTATES(handle)) { + kmem_free(CPU_ACPI_TSTATES(handle), + CPU_ACPI_TSTATES_SIZE( + CPU_ACPI_TSTATES_COUNT(handle))); + CPU_ACPI_TSTATES(handle) = NULL; + } + } +} + +/* + * Cache the _CST data. + */ +int +cpu_acpi_cache_cstate_data(cpu_acpi_handle_t handle) +{ + if (cpu_acpi_cache_cst(handle) < 0) { + cmn_err(CE_WARN, "!cpu_acpi: error parsing _CST for " + "CPU %d", handle->cs_id); + return (-1); + } + + if (cpu_acpi_cache_csd(handle) < 0) { + cmn_err(CE_WARN, "!cpu_acpi: error parsing _CSD for " + "CPU %d", handle->cs_id); + return (-1); + } + + return (0); +} + +void +cpu_acpi_free_cstate_data(cpu_acpi_handle_t handle) +{ + if (handle != NULL) { + if (CPU_ACPI_CSTATES(handle)) { + kmem_free(CPU_ACPI_CSTATES(handle), + CPU_ACPI_CSTATES_SIZE( + CPU_ACPI_CSTATES_COUNT(handle))); + CPU_ACPI_CSTATES(handle) = NULL; + } + } +} + +/* + * Register a handler for processor change notifications. + */ +void +cpu_acpi_install_notify_handler(cpu_acpi_handle_t handle, + ACPI_NOTIFY_HANDLER handler, void *ctx) +{ + if (ACPI_FAILURE(AcpiInstallNotifyHandler(handle->cs_handle, + ACPI_DEVICE_NOTIFY, handler, ctx))) + cmn_err(CE_NOTE, "!cpu_acpi: Unable to register " + "notify handler for CPU"); +} + +/* + * Remove a handler for processor change notifications. 
+ */ +void +cpu_acpi_remove_notify_handler(cpu_acpi_handle_t handle, + ACPI_NOTIFY_HANDLER handler) +{ + if (ACPI_FAILURE(AcpiRemoveNotifyHandler(handle->cs_handle, + ACPI_DEVICE_NOTIFY, handler))) + cmn_err(CE_NOTE, "!cpu_acpi: Unable to remove " + "notify handler for CPU"); +} + +/* + * Write _PDC. + */ +int +cpu_acpi_write_pdc(cpu_acpi_handle_t handle, uint32_t revision, uint32_t count, + uint32_t *capabilities) +{ + ACPI_OBJECT obj; + ACPI_OBJECT_LIST list = { 1, &obj}; + uint32_t *buffer; + uint32_t *bufptr; + uint32_t bufsize; + int i; + + bufsize = (count + 2) * sizeof (uint32_t); + buffer = kmem_zalloc(bufsize, KM_SLEEP); + buffer[0] = revision; + buffer[1] = count; + bufptr = &buffer[2]; + for (i = 0; i < count; i++) + *bufptr++ = *capabilities++; + + obj.Type = ACPI_TYPE_BUFFER; + obj.Buffer.Length = bufsize; + obj.Buffer.Pointer = (void *)buffer; + + /* + * _PDC is optional, so don't log failure. + */ + if (ACPI_FAILURE(AcpiEvaluateObject(handle->cs_handle, "_PDC", + &list, NULL))) { + kmem_free(buffer, bufsize); + return (-1); + } + + kmem_free(buffer, bufsize); + return (0); +} + +/* + * Write to system IO port. + */ +int +cpu_acpi_write_port(ACPI_IO_ADDRESS address, uint32_t value, uint32_t width) +{ + if (ACPI_FAILURE(AcpiOsWritePort(address, value, width))) { + cmn_err(CE_NOTE, "cpu_acpi: error writing system IO port " + "%lx.", (long)address); + return (-1); + } + return (0); +} + +/* + * Read from a system IO port. + */ +int +cpu_acpi_read_port(ACPI_IO_ADDRESS address, uint32_t *value, uint32_t width) +{ + if (ACPI_FAILURE(AcpiOsReadPort(address, value, width))) { + cmn_err(CE_NOTE, "cpu_acpi: error reading system IO port " + "%lx.", (long)address); + return (-1); + } + return (0); +} + +/* + * Return supported frequencies. 
+ */ +uint_t +cpu_acpi_get_speeds(cpu_acpi_handle_t handle, int **speeds) +{ + cpu_acpi_pstate_t *pstate; + int *hspeeds; + uint_t nspeeds; + int i; + + nspeeds = CPU_ACPI_PSTATES_COUNT(handle); + pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle); + hspeeds = kmem_zalloc(nspeeds * sizeof (int), KM_SLEEP); + for (i = 0; i < nspeeds; i++) { + hspeeds[i] = CPU_ACPI_FREQ(pstate); + pstate++; + } + *speeds = hspeeds; + return (nspeeds); +} + +/* + * Free resources allocated by cpu_acpi_get_speeds(). + */ +void +cpu_acpi_free_speeds(int *speeds, uint_t nspeeds) +{ + kmem_free(speeds, nspeeds * sizeof (int)); +} + +uint_t +cpu_acpi_get_max_cstates(cpu_acpi_handle_t handle) +{ + if (CPU_ACPI_CSTATES(handle)) + return (CPU_ACPI_CSTATES_COUNT(handle)); + else + return (1); +} + +void +cpu_acpi_set_register(uint32_t bitreg, uint32_t value) +{ + AcpiSetRegister(bitreg, value); +} + +void +cpu_acpi_get_register(uint32_t bitreg, uint32_t *value) +{ + AcpiGetRegister(bitreg, value); +} + +/* + * Map the dip to an ACPI handle for the device. + */ +cpu_acpi_handle_t +cpu_acpi_init(cpu_t *cp) +{ + cpu_acpi_handle_t handle; + + handle = kmem_zalloc(sizeof (cpu_acpi_state_t), KM_SLEEP); + + if (ACPI_FAILURE(acpica_get_handle_cpu(cp->cpu_id, + &handle->cs_handle))) { + kmem_free(handle, sizeof (cpu_acpi_state_t)); + return (NULL); + } + handle->cs_id = cp->cpu_id; + return (handle); +} + +/* + * Free any resources. + */ +void +cpu_acpi_fini(cpu_acpi_handle_t handle) +{ + if (handle) + kmem_free(handle, sizeof (cpu_acpi_state_t)); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/os/cpupm/cpu_idle.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,877 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/x86_archext.h> +#include <sys/machsystm.h> +#include <sys/x_call.h> +#include <sys/stat.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/cpu_acpi.h> +#include <sys/cpu_idle.h> +#include <sys/cpupm.h> +#include <sys/hpet.h> +#include <sys/archsystm.h> +#include <vm/hat_i86.h> +#include <sys/dtrace.h> +#include <sys/sdt.h> +#include <sys/callb.h> + +extern void cpu_idle_adaptive(void); + +static int cpu_idle_init(cpu_t *); +static void cpu_idle_fini(cpu_t *); +static boolean_t cpu_deep_idle_callb(void *arg, int code); +static boolean_t cpu_idle_cpr_callb(void *arg, int code); +static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate); +static void cpuidle_set_cstate_latency(cpu_t *cp); + +/* + * Interfaces for modules implementing Intel's deep c-state. 
+ */ +cpupm_state_ops_t cpu_idle_ops = { + "Generic ACPI C-state Support", + cpu_idle_init, + cpu_idle_fini, + NULL +}; + +static kmutex_t cpu_idle_callb_mutex; +static callb_id_t cpu_deep_idle_callb_id; +static callb_id_t cpu_idle_cpr_callb_id; +static uint_t cpu_idle_cfg_state; + +static kmutex_t cpu_idle_mutex; + +cpu_idle_kstat_t cpu_idle_kstat = { + { "address_space_id", KSTAT_DATA_STRING }, + { "latency", KSTAT_DATA_UINT32 }, + { "power", KSTAT_DATA_UINT32 }, +}; + +/* + * kstat update function of the c-state info + */ +static int +cpu_idle_kstat_update(kstat_t *ksp, int flag) +{ + cpu_acpi_cstate_t *cstate = ksp->ks_private; + + if (flag == KSTAT_WRITE) { + return (EACCES); + } + + if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { + kstat_named_setstr(&cpu_idle_kstat.addr_space_id, + "FFixedHW"); + } else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) { + kstat_named_setstr(&cpu_idle_kstat.addr_space_id, + "SystemIO"); + } else { + kstat_named_setstr(&cpu_idle_kstat.addr_space_id, + "Unsupported"); + } + + cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency; + cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power; + + return (0); +} + +/* + * c-state wakeup function. + * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals + * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State. + */ +void +cstate_wakeup(cpu_t *cp, int bound) +{ + struct machcpu *mcpu = &(cp->cpu_m); + volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait; + cpupart_t *cpu_part; + uint_t cpu_found; + processorid_t cpu_sid; + + cpu_part = cp->cpu_part; + cpu_sid = cp->cpu_seqid; + /* + * Clear the halted bit for that CPU since it will be woken up + * in a moment. + */ + if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) { + /* + * Clear the halted bit for that CPU since it will be + * poked in a moment. 
+ */ + bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid); + + /* + * We may find the current CPU present in the halted cpuset + * if we're in the context of an interrupt that occurred + * before we had a chance to clear our bit in cpu_idle(). + * Waking ourself is obviously unnecessary, since if + * we're here, we're not halted. + */ + if (cp != CPU) { + /* + * Use correct wakeup mechanism + */ + if ((mcpu_mwait != NULL) && + (*mcpu_mwait == MWAIT_HALTED)) + MWAIT_WAKEUP(cp); + else + poke_cpu(cp->cpu_id); + } + return; + } else { + /* + * This cpu isn't halted, but it's idle or undergoing a + * context switch. No need to awaken anyone else. + */ + if (cp->cpu_thread == cp->cpu_idle_thread || + cp->cpu_disp_flags & CPU_DISP_DONTSTEAL) + return; + } + + /* + * No need to wake up other CPUs if the thread we just enqueued + * is bound. + */ + if (bound) + return; + + + /* + * See if there's any other halted CPUs. If there are, then + * select one, and awaken it. + * It's possible that after we find a CPU, somebody else + * will awaken it before we get the chance. + * In that case, look again. + */ + do { + cpu_found = bitset_find(&cpu_part->cp_haltset); + if (cpu_found == (uint_t)-1) + return; + + } while (bitset_atomic_test_and_del(&cpu_part->cp_haltset, + cpu_found) < 0); + + /* + * Must use correct wakeup mechanism to avoid lost wakeup of + * alternate cpu. 
+ */ + if (cpu_found != CPU->cpu_seqid) { + mcpu_mwait = cpu[cpu_found]->cpu_m.mcpu_mwait; + if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED)) + MWAIT_WAKEUP(cpu_seq[cpu_found]); + else + poke_cpu(cpu_seq[cpu_found]->cpu_id); + } +} + +/* + * enter deep c-state handler + */ +static void +acpi_cpu_cstate(cpu_acpi_cstate_t *cstate) +{ + volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait; + cpu_t *cpup = CPU; + processorid_t cpu_sid = cpup->cpu_seqid; + cpupart_t *cp = cpup->cpu_part; + hrtime_t lapic_expire; + uint8_t type = cstate->cs_addrspace_id; + uint32_t cs_type = cstate->cs_type; + int hset_update = 1; + boolean_t using_hpet_timer; + + /* + * Set our mcpu_mwait here, so we can tell if anyone tries to + * wake us between now and when we call mwait. No other cpu will + * attempt to set our mcpu_mwait until we add ourself to the haltset. + */ + if (mcpu_mwait) { + if (type == ACPI_ADR_SPACE_SYSTEM_IO) + *mcpu_mwait = MWAIT_WAKEUP_IPI; + else + *mcpu_mwait = MWAIT_HALTED; + } + + /* + * If this CPU is online, and there are multiple CPUs + * in the system, then we should note our halting + * by adding ourselves to the partition's halted CPU + * bitmap. This allows other CPUs to find/awaken us when + * work becomes available. + */ + if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1) + hset_update = 0; + + /* + * Add ourselves to the partition's halted CPUs bitmask + * and set our HALTED flag, if necessary. + * + * When a thread becomes runnable, it is placed on the queue + * and then the halted cpuset is checked to determine who + * (if anyone) should be awakened. We therefore need to first + * add ourselves to the halted cpuset, and and then check if there + * is any work available. + * + * Note that memory barriers after updating the HALTED flag + * are not necessary since an atomic operation (updating the bitmap) + * immediately follows. On x86 the atomic operation acts as a + * memory barrier for the update of cpu_disp_flags. 
+ */ + if (hset_update) { + cpup->cpu_disp_flags |= CPU_DISP_HALTED; + bitset_atomic_add(&cp->cp_haltset, cpu_sid); + } + + /* + * Check to make sure there's really nothing to do. + * Work destined for this CPU may become available after + * this check. We'll be notified through the clearing of our + * bit in the halted CPU bitmask, and a write to our mcpu_mwait. + * + * disp_anywork() checks disp_nrunnable, so we do not have to later. + */ + if (disp_anywork()) { + if (hset_update) { + cpup->cpu_disp_flags &= ~CPU_DISP_HALTED; + bitset_atomic_del(&cp->cp_haltset, cpu_sid); + } + return; + } + + /* + * We're on our way to being halted. + * + * The local APIC timer can stop in ACPI C2 and deeper c-states. + * Program the HPET hardware to substitute for this CPU's lAPIC timer. + * hpet.use_hpet_timer() disables the LAPIC Timer. Make sure to + * start the LAPIC Timer again before leaving this function. + * + * hpet.use_hpet_timer disables interrupts, so we will awaken + * immediately after halting if someone tries to poke us between now + * and the time we actually halt. + */ + using_hpet_timer = hpet.use_hpet_timer(&lapic_expire); + + /* + * We check for the presence of our bit after disabling interrupts. + * If it's cleared, we'll return. If the bit is cleared after + * we check then the cstate_wakeup() will pop us out of the halted + * state. + * + * This means that the ordering of the cstate_wakeup() and the clearing + * of the bit by cpu_wakeup is important. + * cpu_wakeup() must clear our mc_haltset bit, and then call + * cstate_wakeup(). + * acpi_cpu_cstate() must disable interrupts, then check for the bit. + */ + if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) { + hpet.use_lapic_timer(lapic_expire); + cpup->cpu_disp_flags &= ~CPU_DISP_HALTED; + return; + } + + /* + * The check for anything locally runnable is here for performance + * and isn't needed for correctness. 
disp_nrunnable ought to be + * in our cache still, so it's inexpensive to check, and if there + * is anything runnable we won't have to wait for the poke. + */ + if (cpup->cpu_disp->disp_nrunnable != 0) { + hpet.use_lapic_timer(lapic_expire); + if (hset_update) { + cpup->cpu_disp_flags &= ~CPU_DISP_HALTED; + bitset_atomic_del(&cp->cp_haltset, cpu_sid); + } + return; + } + + if (using_hpet_timer == B_FALSE) { + + hpet.use_lapic_timer(lapic_expire); + + /* + * We are currently unable to program the HPET to act as this + * CPU's proxy lAPIC timer. This CPU cannot enter C2 or deeper + * because no timer is set to wake it up while its lAPIC timer + * stalls in deep C-States. + * Enter C1 instead. + * + * cstate_wake_cpu() will wake this CPU with an IPI which + * works with MWAIT. + */ + i86_monitor(mcpu_mwait, 0, 0); + if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) { + cpu_dtrace_idle_probe(CPU_ACPI_C1); + + tlb_going_idle(); + i86_mwait(0, 0); + tlb_service(); + + cpu_dtrace_idle_probe(CPU_ACPI_C0); + } + + /* + * We're no longer halted + */ + if (hset_update) { + cpup->cpu_disp_flags &= ~CPU_DISP_HALTED; + bitset_atomic_del(&cp->cp_haltset, cpu_sid); + } + return; + } + + cpu_dtrace_idle_probe((uint_t)cs_type); + + if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) { + /* + * We're on our way to being halted. + * To avoid a lost wakeup, arm the monitor before checking + * if another cpu wrote to mcpu_mwait to wake us up. 
+ */ + i86_monitor(mcpu_mwait, 0, 0); + if (*mcpu_mwait == MWAIT_HALTED) { + uint32_t eax = cstate->cs_address; + uint32_t ecx = 1; + + tlb_going_idle(); + i86_mwait(eax, ecx); + tlb_service(); + } + } else if (type == ACPI_ADR_SPACE_SYSTEM_IO) { + uint32_t value; + ACPI_TABLE_FADT *gbl_FADT; + + if (*mcpu_mwait == MWAIT_WAKEUP_IPI) { + tlb_going_idle(); + (void) cpu_acpi_read_port(cstate->cs_address, + &value, 8); + acpica_get_global_FADT(&gbl_FADT); + (void) cpu_acpi_read_port( + gbl_FADT->XPmTimerBlock.Address, &value, 32); + tlb_service(); + } + } else { + cmn_err(CE_WARN, "!_CST: cs_type %lx bad asid type %lx\n", + (long)cs_type, (long)type); + } + + /* + * The lAPIC timer may have stopped in deep c-state. + * Reprogram this CPU's lAPIC here before enabling interrupts. + */ + hpet.use_lapic_timer(lapic_expire); + + cpu_dtrace_idle_probe(CPU_ACPI_C0); + + /* + * We're no longer halted + */ + if (hset_update) { + cpup->cpu_disp_flags &= ~CPU_DISP_HALTED; + bitset_atomic_del(&cp->cp_haltset, cpu_sid); + } +} + +/* + * indicate when bus masters are active + */ +static uint32_t +cpu_acpi_bm_sts(void) +{ + uint32_t bm_sts = 0; + + cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_sts); + + if (bm_sts) + cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); + + return (bm_sts); +} + +/* + * Idle the present CPU, deep c-state is supported + */ +void +cpu_acpi_idle(void) +{ + cpu_t *cp = CPU; + uint16_t cs_type; + cpu_acpi_handle_t handle; + cma_c_state_t *cs_data; + cpu_acpi_cstate_t *cstate; + hrtime_t start, end; + int cpu_max_cstates; + + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + handle = mach_state->ms_acpi_handle; + ASSERT(CPU_ACPI_CSTATES(handle) != NULL); + + cs_data = mach_state->ms_cstate.cma_state.cstate; + cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle); + ASSERT(cstate != NULL); + cpu_max_cstates = cpu_acpi_get_max_cstates(handle); + if (cpu_max_cstates > CPU_MAX_CSTATES) + cpu_max_cstates = 
CPU_MAX_CSTATES; + + start = gethrtime_unscaled(); + + cs_type = cpupm_next_cstate(cs_data, start); + + /* + * OSPM uses the BM_STS bit to determine the power state to enter + * when considering a transition to or from the C2/C3 power state. + * if C3 is determined, bus master activity demotes the power state + * to C2. + */ + if ((cs_type >= CPU_ACPI_C3) && cpu_acpi_bm_sts()) + cs_type = CPU_ACPI_C2; + + /* + * BM_RLD determines if the Cx power state was exited as a result of + * bus master requests. Set this bit when using a C3 power state, and + * clear it when using a C1 or C2 power state. + */ + if ((CPU_ACPI_BM_INFO(handle) & BM_RLD) && (cs_type < CPU_ACPI_C3)) { + cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0); + CPU_ACPI_BM_INFO(handle) &= ~BM_RLD; + } + + if ((!(CPU_ACPI_BM_INFO(handle) & BM_RLD)) && + (cs_type >= CPU_ACPI_C3)) { + cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1); + CPU_ACPI_BM_INFO(handle) |= BM_RLD; + } + + cstate += cs_type - 1; + + switch (cs_type) { + default: + /* FALLTHROUGH */ + case CPU_ACPI_C1: + (*non_deep_idle_cpu)(); + break; + + case CPU_ACPI_C2: + acpi_cpu_cstate(cstate); + break; + + case CPU_ACPI_C3: + /* + * recommended in ACPI spec, providing hardware mechanisms + * to prevent master from writing to memory (UP-only) + */ + if ((ncpus_online == 1) && + (CPU_ACPI_BM_INFO(handle) & BM_CTL)) { + cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1); + CPU_ACPI_BM_INFO(handle) |= BM_ARB_DIS; + /* + * Today all Intel's processor support C3 share cache. 
+ */ + } else if (x86_vendor != X86_VENDOR_Intel) { + __acpi_wbinvd(); + } + acpi_cpu_cstate(cstate); + if (CPU_ACPI_BM_INFO(handle) & BM_ARB_DIS) { + cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0); + CPU_ACPI_BM_INFO(handle) &= ~BM_ARB_DIS; + } + break; + } + + end = gethrtime_unscaled(); + + /* + * Update statistics + */ + cpupm_wakeup_cstate_data(cs_data, end); +} + +boolean_t +cpu_deep_cstates_supported(void) +{ + extern int idle_cpu_no_deep_c; + + if (idle_cpu_no_deep_c) + return (B_FALSE); + + if (!cpuid_deep_cstates_supported()) + return (B_FALSE); + + if ((hpet.supported != HPET_FULL_SUPPORT) || !hpet.install_proxy()) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Validate that this processor supports deep cstate and if so, + * get the c-state data from ACPI and cache it. + */ +static int +cpu_idle_init(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + cpu_acpi_cstate_t *cstate; + char name[KSTAT_STRLEN]; + int cpu_max_cstates, i; + ACPI_TABLE_FADT *gbl_FADT; + + /* + * Cache the C-state specific ACPI data. + */ + if (cpu_acpi_cache_cstate_data(handle) != 0) { + cmn_err(CE_NOTE, + "!cpu_idle_init: Failed to cache ACPI C-state data\n"); + cpu_idle_fini(cp); + return (-1); + } + + /* + * Check the bus master arbitration control ability. 
+ */ + acpica_get_global_FADT(&gbl_FADT); + if (gbl_FADT->Pm2ControlBlock && gbl_FADT->Pm2ControlLength) + CPU_ACPI_BM_INFO(handle) |= BM_CTL; + + cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle); + + cpu_max_cstates = cpu_acpi_get_max_cstates(handle); + + for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) { + (void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type); + /* + * Allocate, initialize and install cstate kstat + */ + cstate->cs_ksp = kstat_create("cstate", CPU->cpu_id, + name, "misc", + KSTAT_TYPE_NAMED, + sizeof (cpu_idle_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (cstate->cs_ksp == NULL) { + cmn_err(CE_NOTE, "kstat_create(c_state) fail"); + } else { + cstate->cs_ksp->ks_data = &cpu_idle_kstat; + cstate->cs_ksp->ks_lock = &cpu_idle_mutex; + cstate->cs_ksp->ks_update = cpu_idle_kstat_update; + cstate->cs_ksp->ks_data_size += MAXNAMELEN; + cstate->cs_ksp->ks_private = cstate; + kstat_install(cstate->cs_ksp); + cstate++; + } + } + + cpupm_alloc_domains(cp, CPUPM_C_STATES); + cpupm_alloc_ms_cstate(cp); + cpuidle_set_cstate_latency(cp); + + if (cpu_deep_cstates_supported()) { + mutex_enter(&cpu_idle_callb_mutex); + if (cpu_deep_idle_callb_id == (callb_id_t)0) + cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb, + (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle"); + if (cpu_idle_cpr_callb_id == (callb_id_t)0) + cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb, + (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr"); + mutex_exit(&cpu_idle_callb_mutex); + } + + return (0); +} + +/* + * Free resources allocated by cpu_idle_init(). 
+ */ +static void +cpu_idle_fini(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + cpu_acpi_cstate_t *cstate; + uint_t cpu_max_cstates, i; + + /* + * idle cpu points back to the generic one + */ + idle_cpu = CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu; + disp_enq_thread = non_deep_idle_disp_enq_thread; + + cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle); + if (cstate) { + cpu_max_cstates = cpu_acpi_get_max_cstates(handle); + + for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) { + if (cstate->cs_ksp != NULL) + kstat_delete(cstate->cs_ksp); + cstate++; + } + } + + cpupm_free_ms_cstate(cp); + cpupm_free_domains(&cpupm_cstate_domains); + cpu_acpi_free_cstate_data(handle); + + mutex_enter(&cpu_idle_callb_mutex); + if (cpu_deep_idle_callb_id != (callb_id_t)0) { + (void) callb_delete(cpu_deep_idle_callb_id); + cpu_deep_idle_callb_id = (callb_id_t)0; + } + if (cpu_idle_cpr_callb_id != (callb_id_t)0) { + (void) callb_delete(cpu_idle_cpr_callb_id); + cpu_idle_cpr_callb_id = (callb_id_t)0; + } + mutex_exit(&cpu_idle_callb_mutex); +} + +/*ARGSUSED*/ +static boolean_t +cpu_deep_idle_callb(void *arg, int code) +{ + boolean_t rslt = B_TRUE; + + mutex_enter(&cpu_idle_callb_mutex); + switch (code) { + case PM_DEFAULT_CPU_DEEP_IDLE: + /* + * Default policy is same as enable + */ + /*FALLTHROUGH*/ + case PM_ENABLE_CPU_DEEP_IDLE: + if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0) + break; + + if (hpet.callback(PM_ENABLE_CPU_DEEP_IDLE)) { + disp_enq_thread = cstate_wakeup; + idle_cpu = cpu_idle_adaptive; + cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG; + } else { + rslt = B_FALSE; + } + break; + + case PM_DISABLE_CPU_DEEP_IDLE: + if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) + break; + + idle_cpu = non_deep_idle_cpu; + if (hpet.callback(PM_DISABLE_CPU_DEEP_IDLE)) { + disp_enq_thread = non_deep_idle_disp_enq_thread; + cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG; + } + break; + 
+ default: + cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n", + code); + break; + } + mutex_exit(&cpu_idle_callb_mutex); + return (rslt); +} + +/*ARGSUSED*/ +static boolean_t +cpu_idle_cpr_callb(void *arg, int code) +{ + boolean_t rslt = B_TRUE; + + mutex_enter(&cpu_idle_callb_mutex); + switch (code) { + case CB_CODE_CPR_RESUME: + if (hpet.callback(CB_CODE_CPR_RESUME)) { + /* + * Do not enable dispatcher hooks if disabled by user. + */ + if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) + break; + + disp_enq_thread = cstate_wakeup; + idle_cpu = cpu_idle_adaptive; + } else { + rslt = B_FALSE; + } + break; + + case CB_CODE_CPR_CHKPT: + idle_cpu = non_deep_idle_cpu; + disp_enq_thread = non_deep_idle_disp_enq_thread; + hpet.callback(CB_CODE_CPR_CHKPT); + break; + + default: + cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code); + break; + } + mutex_exit(&cpu_idle_callb_mutex); + return (rslt); +} + +/* + * handle _CST notification + */ +void +cpuidle_cstate_instance(cpu_t *cp) +{ +#ifndef __xpv + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + cpu_acpi_handle_t handle; + struct machcpu *mcpu; + cpuset_t dom_cpu_set; + kmutex_t *pm_lock; + int result = 0; + processorid_t cpu_id; + + if (mach_state == NULL) { + return; + } + + ASSERT(mach_state->ms_cstate.cma_domain != NULL); + dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus; + pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock; + + /* + * Do for all the CPU's in the domain + */ + mutex_enter(pm_lock); + do { + CPUSET_FIND(dom_cpu_set, cpu_id); + if (cpu_id == CPUSET_NOTINSET) + break; + + ASSERT(cpu_id >= 0 && cpu_id < NCPU); + cp = cpu[cpu_id]; + mach_state = (cpupm_mach_state_t *) + cp->cpu_m.mcpu_pm_mach_state; + if (!(mach_state->ms_caps & CPUPM_C_STATES)) { + mutex_exit(pm_lock); + return; + } + handle = mach_state->ms_acpi_handle; + ASSERT(handle != NULL); + + /* + * re-evaluate cstate object + */ + if (cpu_acpi_cache_cstate_data(handle) != 0) { + 
cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state" + " object Instance: %d", cpu_id); + } + mutex_enter(&cpu_lock); + mcpu = &(cp->cpu_m); + mcpu->max_cstates = cpu_acpi_get_max_cstates(handle); + if (mcpu->max_cstates > CPU_ACPI_C1) { + hpet.callback(CST_EVENT_MULTIPLE_CSTATES); + disp_enq_thread = cstate_wakeup; + cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle; + cpuidle_set_cstate_latency(cp); + } else if (mcpu->max_cstates == CPU_ACPI_C1) { + disp_enq_thread = non_deep_idle_disp_enq_thread; + cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu; + hpet.callback(CST_EVENT_ONE_CSTATE); + } + mutex_exit(&cpu_lock); + + CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result); + mutex_exit(pm_lock); + } while (result < 0); +#endif +} + +/* + * handle the number or the type of available processor power states change + */ +void +cpuidle_manage_cstates(void *ctx) +{ + cpu_t *cp = ctx; + processorid_t cpu_id = cp->cpu_id; + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + boolean_t is_ready; + + if (mach_state == NULL) { + return; + } + + /* + * We currently refuse to power manage if the CPU is not ready to + * take cross calls (cross calls fail silently if CPU is not ready + * for it). + * + * Additionally, for x86 platforms we cannot power manage + * any one instance, until all instances have been initialized. + * That's because we don't know what the CPU domains look like + * until all instances have been initialized. 
+ */ + is_ready = CPUPM_XCALL_IS_READY(cpu_id) && cpupm_cstate_ready(); + if (!is_ready) + return; + + cpuidle_cstate_instance(cp); +} + +static void +cpuidle_set_cstate_latency(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + cpu_acpi_handle_t handle; + cpu_acpi_cstate_t *acpi_cstates; + cma_c_state_t *cpupm_cdata; + uint32_t i, cnt; + + cpupm_cdata = mach_state->ms_cstate.cma_state.cstate; + + ASSERT(cpupm_cdata != 0); + ASSERT(mach_state != NULL); + handle = mach_state->ms_acpi_handle; + ASSERT(handle != NULL); + + cnt = CPU_ACPI_CSTATES_COUNT(handle); + acpi_cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle); + + cpupm_cdata->cs_C2_latency = CPU_CSTATE_LATENCY_UNDEF; + cpupm_cdata->cs_C3_latency = CPU_CSTATE_LATENCY_UNDEF; + + for (i = 1; i <= cnt; ++i, ++acpi_cstates) { + if ((cpupm_cdata->cs_C2_latency == CPU_CSTATE_LATENCY_UNDEF) && + (acpi_cstates->cs_type == CPU_ACPI_C2)) + cpupm_cdata->cs_C2_latency = acpi_cstates->cs_latency; + + if ((cpupm_cdata->cs_C3_latency == CPU_CSTATE_LATENCY_UNDEF) && + (acpi_cstates->cs_type == CPU_ACPI_C3)) + cpupm_cdata->cs_C3_latency = acpi_cstates->cs_latency; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/os/cpupm/cpupm_amd.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * AMD specific CPU power management support. + */ + +#include <sys/x86_archext.h> +#include <sys/cpu_acpi.h> +#include <sys/pwrnow.h> + +boolean_t +cpupm_amd_init(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + + /* AMD? */ + if (x86_vendor != X86_VENDOR_AMD) + return (B_FALSE); + + /* + * If we support PowerNow! on this processor, then set the + * correct cma_ops for the processor. + */ + mach_state->ms_pstate.cma_ops = pwrnow_supported() ? + &pwrnow_ops : NULL; + + return (B_TRUE); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/os/cpupm/cpupm_intel.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Intel specific CPU power management support. + */ + +#include <sys/x86_archext.h> +#include <sys/cpu_acpi.h> +#include <sys/speedstep.h> +#include <sys/cpupm_throttle.h> +#include <sys/cpu_idle.h> + +/* + * The Intel Processor Driver Capabilities (_PDC). + * See Intel Processor Vendor-Specific ACPI Interface Specification + * for details. 
+ */ +#define CPUPM_INTEL_PDC_REVISION 0x1 +#define CPUPM_INTEL_PDC_PS_MSR 0x0001 +#define CPUPM_INTEL_PDC_C1_HALT 0x0002 +#define CPUPM_INTEL_PDC_TS_MSR 0x0004 +#define CPUPM_INTEL_PDC_MP 0x0008 +#define CPUPM_INTEL_PDC_C2C3_MP 0x0010 +#define CPUPM_INTEL_PDC_SW_PSD 0x0020 +#define CPUPM_INTEL_PDC_TSD 0x0080 +#define CPUPM_INTEL_PDC_C1_FFH 0x0100 +#define CPUPM_INTEL_PDC_HW_PSD 0x0800 + +static uint32_t cpupm_intel_pdccap = 0; + +boolean_t +cpupm_intel_init(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + uint_t family; + uint_t model; + + if (x86_vendor != X86_VENDOR_Intel) + return (B_FALSE); + + family = cpuid_getfamily(CPU); + model = cpuid_getmodel(CPU); + + cpupm_intel_pdccap = CPUPM_INTEL_PDC_MP; + + /* + * If we support SpeedStep on this processor, then set the + * correct cma_ops for the processor and enable appropriate + * _PDC bits. + */ + if (speedstep_supported(family, model)) { + mach_state->ms_pstate.cma_ops = &speedstep_ops; + cpupm_intel_pdccap |= CPUPM_INTEL_PDC_PS_MSR | + CPUPM_INTEL_PDC_C1_HALT | CPUPM_INTEL_PDC_SW_PSD | + CPUPM_INTEL_PDC_HW_PSD; + } else { + mach_state->ms_pstate.cma_ops = NULL; + } + + /* + * Set the correct tstate_ops for the processor and + * enable appropriate _PDC bits. + */ + mach_state->ms_tstate.cma_ops = &cpupm_throttle_ops; + cpupm_intel_pdccap |= CPUPM_INTEL_PDC_TS_MSR | + CPUPM_INTEL_PDC_TSD; + + /* + * If we support deep cstates on this processor, then set the + * correct cstate_ops for the processor and enable appropriate + * _PDC bits. + */ + mach_state->ms_cstate.cma_ops = &cpu_idle_ops; + cpupm_intel_pdccap |= CPUPM_INTEL_PDC_C1_HALT | + CPUPM_INTEL_PDC_C2C3_MP | CPUPM_INTEL_PDC_C1_FFH; + + /* + * _PDC support is optional and the driver should + * function even if the _PDC write fails. + */ + (void) cpu_acpi_write_pdc(mach_state->ms_acpi_handle, + CPUPM_INTEL_PDC_REVISION, 1, &cpupm_intel_pdccap); + + return (B_TRUE); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,928 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/cpu_pm.h> +#include <sys/x86_archext.h> +#include <sys/sdt.h> +#include <sys/spl.h> +#include <sys/machsystm.h> +#include <sys/hpet.h> +#include <sys/cpupm.h> +#include <sys/cpu_idle.h> +#include <sys/cpu_acpi.h> +#include <sys/cpupm_throttle.h> + +/* + * This callback is used to build the PPM CPU domains once + * all the CPU devices have been started. The callback is + * initialized by the PPM driver to point to a routine that + * will build the domains. + */ +void (*cpupm_rebuild_cpu_domains)(void); + +/* + * This callback is used to reset the topspeed for all the + * CPU devices. The callback is initialized by the PPM driver to + * point to a routine that will reinitialize all the CPU devices + * once all the CPU devices have been started and the CPU domains + * built. 
+ */ +void (*cpupm_init_topspeed)(void); + +/* + * This callback is used to redefine the topspeed for a CPU device. + * Since all CPUs in a domain should have identical properties, this + * callback is initialized by the PPM driver to point to a routine + * that will redefine the topspeed for all devices in a CPU domain. + * This callback is exercised whenever an ACPI _PPC change notification + * is received by the CPU driver. + */ +void (*cpupm_redefine_topspeed)(void *); + +/* + * This callback is used by the PPM driver to call into the CPU driver + * to set a new topspeed for a CPU. + */ +void (*cpupm_set_topspeed_callb)(void *, int); + +/* + * This callback is used by the PPM driver to call into the CPU driver + * to find a CPU's current topspeed (i.e., its current ACPI _PPC value). + */ +int (*cpupm_get_topspeed_callb)(void *); + +static void cpupm_event_notify_handler(ACPI_HANDLE, UINT32, void *); +static void cpupm_free_notify_handlers(cpu_t *); + +/* + * Until proven otherwise, all power states are manageable. + */ +static uint32_t cpupm_enabled = CPUPM_ALL_STATES; + +/* + * Until all CPUs have started, we do not allow + * power management. + */ +static boolean_t cpupm_ready = B_FALSE; + +cpupm_state_domains_t *cpupm_pstate_domains = NULL; +cpupm_state_domains_t *cpupm_tstate_domains = NULL; +cpupm_state_domains_t *cpupm_cstate_domains = NULL; + +/* + * c-state tunables + * + * cpupm_cs_idle_cost_tunable is the ratio of time CPU spends executing + idle + * divided by time spent in the idle state transitions. + * A value of 10 means the CPU will not spend more than 1/10 of its time + * in idle latency. The worst case performance will be 90% of non Deep C-state + * kernel. + * + * cpupm_cs_idle_save_tunable is how long we must stay in a deeper C-state + * before it is worth going there. Expressed as a multiple of latency. 
+ */ +uint32_t cpupm_cs_sample_tunable = 5; /* samples in decision period */ +uint32_t cpupm_cs_idle_cost_tunable = 10; /* work time / latency cost */ +uint32_t cpupm_cs_idle_save_tunable = 2; /* idle power savings */ +uint16_t cpupm_C2_idle_pct_tunable = 70; +uint16_t cpupm_C3_idle_pct_tunable = 80; + +#ifndef __xpv +extern boolean_t cpupm_intel_init(cpu_t *); +extern boolean_t cpupm_amd_init(cpu_t *); + +typedef struct cpupm_vendor { + boolean_t (*cpuv_init)(cpu_t *); +} cpupm_vendor_t; + +/* + * Table of supported vendors. + */ +static cpupm_vendor_t cpupm_vendors[] = { + cpupm_intel_init, + cpupm_amd_init, + NULL +}; +#endif + +/* + * Initialize the machine. + * See if a module exists for managing power for this CPU. + */ +/*ARGSUSED*/ +void +cpupm_init(cpu_t *cp) +{ +#ifndef __xpv + cpupm_vendor_t *vendors; + cpupm_mach_state_t *mach_state; + struct machcpu *mcpu = &(cp->cpu_m); + int *speeds; + uint_t nspeeds; + int ret; + + cpupm_set_supp_freqs(cp, NULL, 1); + + mach_state = cp->cpu_m.mcpu_pm_mach_state = + kmem_zalloc(sizeof (cpupm_mach_state_t), KM_SLEEP); + mach_state->ms_caps = CPUPM_NO_STATES; + mutex_init(&mach_state->ms_lock, NULL, MUTEX_DRIVER, NULL); + + mach_state->ms_acpi_handle = cpu_acpi_init(cp); + if (mach_state->ms_acpi_handle == NULL) { + cpupm_free(cp); + cmn_err(CE_WARN, "!cpupm_init: processor %d: " + "unable to get ACPI handle", cp->cpu_id); + cmn_err(CE_NOTE, "!CPU power management will not function."); + CPUPM_DISABLE(); + return; + } + + /* + * Loop through the CPU management module table and see if + * any of the modules implement CPU power management + * for this CPU. + */ + for (vendors = cpupm_vendors; vendors->cpuv_init != NULL; vendors++) { + if (vendors->cpuv_init(cp)) + break; + } + + /* + * Nope, we can't power manage this CPU. + */ + if (vendors == NULL) { + cpupm_free(cp); + CPUPM_DISABLE(); + return; + } + + /* + * If P-state support exists for this system, then initialize it. 
+ */ + if (mach_state->ms_pstate.cma_ops != NULL) { + ret = mach_state->ms_pstate.cma_ops->cpus_init(cp); + if (ret != 0) { + cmn_err(CE_WARN, "!cpupm_init: processor %d:" + " unable to initialize P-state support", + cp->cpu_id); + mach_state->ms_pstate.cma_ops = NULL; + cpupm_disable(CPUPM_P_STATES); + } else { + nspeeds = cpupm_get_speeds(cp, &speeds); + if (nspeeds == 0) { + cmn_err(CE_WARN, "!cpupm_init: processor %d:" + " no speeds to manage", cp->cpu_id); + } else { + cpupm_set_supp_freqs(cp, speeds, nspeeds); + cpupm_free_speeds(speeds, nspeeds); + mach_state->ms_caps |= CPUPM_P_STATES; + } + } + } + + if (mach_state->ms_tstate.cma_ops != NULL) { + ret = mach_state->ms_tstate.cma_ops->cpus_init(cp); + if (ret != 0) { + cmn_err(CE_WARN, "!cpupm_init: processor %d:" + " unable to initialize T-state support", + cp->cpu_id); + mach_state->ms_tstate.cma_ops = NULL; + cpupm_disable(CPUPM_T_STATES); + } else { + mach_state->ms_caps |= CPUPM_T_STATES; + } + } + + /* + * If C-states support exists for this system, then initialize it. 
+ */ + if (mach_state->ms_cstate.cma_ops != NULL) { + ret = mach_state->ms_cstate.cma_ops->cpus_init(cp); + if (ret != 0) { + cmn_err(CE_WARN, "!cpupm_init: processor %d:" + " unable to initialize C-state support", + cp->cpu_id); + mach_state->ms_cstate.cma_ops = NULL; + mcpu->max_cstates = CPU_ACPI_C1; + cpupm_disable(CPUPM_C_STATES); + idle_cpu = non_deep_idle_cpu; + disp_enq_thread = non_deep_idle_disp_enq_thread; + } else if (cpu_deep_cstates_supported()) { + mcpu->max_cstates = cpu_acpi_get_max_cstates( + mach_state->ms_acpi_handle); + if (mcpu->max_cstates > CPU_ACPI_C1) { + hpet.callback(CST_EVENT_MULTIPLE_CSTATES); + CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle; + mcpu->mcpu_idle_type = CPU_ACPI_C1; + disp_enq_thread = cstate_wakeup; + } else { + hpet.callback(CST_EVENT_ONE_CSTATE); + } + mach_state->ms_caps |= CPUPM_C_STATES; + } else { + mcpu->max_cstates = CPU_ACPI_C1; + idle_cpu = non_deep_idle_cpu; + disp_enq_thread = non_deep_idle_disp_enq_thread; + } + } + + + if (mach_state->ms_caps == CPUPM_NO_STATES) { + cpupm_free(cp); + CPUPM_DISABLE(); + return; + } + + if ((mach_state->ms_caps & CPUPM_T_STATES) || + (mach_state->ms_caps & CPUPM_P_STATES) || + (mach_state->ms_caps & CPUPM_C_STATES)) + cpupm_add_notify_handler(cp, cpupm_event_notify_handler, cp); +#endif +} + +/* + * Free any resources allocated by cpupm_init(). 
+ */ +/*ARGSUSED*/ +void +cpupm_free(cpu_t *cp) +{ +#ifndef __xpv + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + + if (mach_state == NULL) + return; + if (mach_state->ms_pstate.cma_ops != NULL) { + mach_state->ms_pstate.cma_ops->cpus_fini(cp); + mach_state->ms_pstate.cma_ops = NULL; + } + + if (mach_state->ms_tstate.cma_ops != NULL) { + mach_state->ms_tstate.cma_ops->cpus_fini(cp); + mach_state->ms_tstate.cma_ops = NULL; + } + + if (mach_state->ms_cstate.cma_ops != NULL) { + mach_state->ms_cstate.cma_ops->cpus_fini(cp); + mach_state->ms_cstate.cma_ops = NULL; + } + + cpupm_free_notify_handlers(cp); + + if (mach_state->ms_acpi_handle != NULL) { + cpu_acpi_fini(mach_state->ms_acpi_handle); + mach_state->ms_acpi_handle = NULL; + } + + mutex_destroy(&mach_state->ms_lock); + kmem_free(mach_state, sizeof (cpupm_mach_state_t)); + cp->cpu_m.mcpu_pm_mach_state = NULL; +#endif +} + +/* + * If all CPUs have started and at least one power state is manageable, + * then the CPUs are ready for power management. + */ +boolean_t +cpupm_is_ready() +{ +#ifndef __xpv + if (cpupm_enabled == CPUPM_NO_STATES) + return (B_FALSE); + return (cpupm_ready); +#else + return (B_FALSE); +#endif + +} + +boolean_t +cpupm_is_enabled(uint32_t state) +{ + return ((cpupm_enabled & state) == state); +} + +/* + * By default, all states are enabled. + */ +void +cpupm_disable(uint32_t state) +{ + + if (state & CPUPM_P_STATES) { + cpupm_free_domains(&cpupm_pstate_domains); + } + if (state & CPUPM_T_STATES) { + cpupm_free_domains(&cpupm_tstate_domains); + } + if (state & CPUPM_C_STATES) { + cpupm_free_domains(&cpupm_cstate_domains); + } + cpupm_enabled &= ~state; +} + +/* + * Once all CPUs have been started, the PPM driver should build CPU + * domains and initialize the topspeed for all CPU devices. 
+ */ +void +cpupm_post_startup() +{ +#ifndef __xpv + /* + * The CPU domain built by the PPM during CPUs attaching + * should be rebuilt with the information retrieved from + * ACPI. + */ + if (cpupm_rebuild_cpu_domains != NULL) + (*cpupm_rebuild_cpu_domains)(); + + /* + * Only initialize the topspeed if P-states are enabled. + */ + if (cpupm_enabled & CPUPM_P_STATES && cpupm_init_topspeed != NULL) + (*cpupm_init_topspeed)(); +#endif + cpupm_ready = B_TRUE; +} + +/* + * Allocate power domains for C,P and T States + */ +void +cpupm_alloc_domains(cpu_t *cp, int state) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + cpupm_state_domains_t **dom_ptr; + cpupm_state_domains_t *dptr; + cpupm_state_domains_t **mach_dom_state_ptr; + uint32_t domain; + uint32_t type; + + switch (state) { + case CPUPM_P_STATES: + if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_PSD_CACHED)) { + domain = CPU_ACPI_PSD(handle).sd_domain; + type = CPU_ACPI_PSD(handle).sd_type; + } else { + mutex_enter(&cpu_lock); + domain = cpuid_get_chipid(cp); + mutex_exit(&cpu_lock); + type = CPU_ACPI_HW_ALL; + } + dom_ptr = &cpupm_pstate_domains; + mach_dom_state_ptr = &mach_state->ms_pstate.cma_domain; + break; + case CPUPM_T_STATES: + if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_TSD_CACHED)) { + domain = CPU_ACPI_TSD(handle).sd_domain; + type = CPU_ACPI_TSD(handle).sd_type; + } else { + mutex_enter(&cpu_lock); + domain = cpuid_get_chipid(cp); + mutex_exit(&cpu_lock); + type = CPU_ACPI_HW_ALL; + } + dom_ptr = &cpupm_tstate_domains; + mach_dom_state_ptr = &mach_state->ms_tstate.cma_domain; + break; + case CPUPM_C_STATES: + if (CPU_ACPI_IS_OBJ_CACHED(handle, CPU_ACPI_CSD_CACHED)) { + domain = CPU_ACPI_CSD(handle).sd_domain; + type = CPU_ACPI_CSD(handle).sd_type; + } else { + mutex_enter(&cpu_lock); + domain = cpuid_get_coreid(cp); + mutex_exit(&cpu_lock); + type = CPU_ACPI_HW_ALL; + } + dom_ptr = 
&cpupm_cstate_domains; + mach_dom_state_ptr = &mach_state->ms_cstate.cma_domain; + break; + default: + return; + } + + for (dptr = *dom_ptr; dptr != NULL; dptr = dptr->pm_next) { + if (dptr->pm_domain == domain) + break; + } + + /* new domain is created and linked at the head */ + if (dptr == NULL) { + dptr = kmem_zalloc(sizeof (cpupm_state_domains_t), KM_SLEEP); + dptr->pm_domain = domain; + dptr->pm_type = type; + dptr->pm_next = *dom_ptr; + mutex_init(&dptr->pm_lock, NULL, MUTEX_SPIN, + (void *)ipltospl(DISP_LEVEL)); + CPUSET_ZERO(dptr->pm_cpus); + *dom_ptr = dptr; + } + CPUSET_ADD(dptr->pm_cpus, cp->cpu_id); + *mach_dom_state_ptr = dptr; +} + +/* + * Free C, P or T state power domains + */ +void +cpupm_free_domains(cpupm_state_domains_t **dom_ptr) +{ + cpupm_state_domains_t *this_domain, *next_domain; + + this_domain = *dom_ptr; + while (this_domain != NULL) { + next_domain = this_domain->pm_next; + mutex_destroy(&this_domain->pm_lock); + kmem_free((void *)this_domain, + sizeof (cpupm_state_domains_t)); + this_domain = next_domain; + } + *dom_ptr = NULL; +} + +void +cpupm_alloc_ms_cstate(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state; + cpupm_mach_acpi_state_t *ms_cstate; + + mach_state = (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + ms_cstate = &mach_state->ms_cstate; + ASSERT(ms_cstate->cma_state.cstate == NULL); + ms_cstate->cma_state.cstate = kmem_zalloc(sizeof (cma_c_state_t), + KM_SLEEP); + ms_cstate->cma_state.cstate->cs_next_cstate = CPU_ACPI_C1; +} + +void +cpupm_free_ms_cstate(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + cpupm_mach_acpi_state_t *ms_cstate = &mach_state->ms_cstate; + + if (ms_cstate->cma_state.cstate != NULL) { + kmem_free(ms_cstate->cma_state.cstate, sizeof (cma_c_state_t)); + ms_cstate->cma_state.cstate = NULL; + } +} + +void +cpupm_state_change(cpu_t *cp, int level, int state) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t 
*)(cp->cpu_m.mcpu_pm_mach_state); + cpupm_state_ops_t *state_ops; + cpupm_state_domains_t *state_domain; + cpuset_t set; + + DTRACE_PROBE2(cpupm__state__change, cpu_t *, cp, int, level); + + if (mach_state == NULL) { + return; + } + + switch (state) { + case CPUPM_P_STATES: + state_ops = mach_state->ms_pstate.cma_ops; + state_domain = mach_state->ms_pstate.cma_domain; + break; + case CPUPM_T_STATES: + state_ops = mach_state->ms_tstate.cma_ops; + state_domain = mach_state->ms_tstate.cma_domain; + break; + default: + break; + } + + switch (state_domain->pm_type) { + case CPU_ACPI_SW_ANY: + /* + * A request on any CPU in the domain transitions the domain + */ + CPUSET_ONLY(set, cp->cpu_id); + state_ops->cpus_change(set, level); + break; + case CPU_ACPI_SW_ALL: + /* + * All CPUs in the domain must request the transition + */ + case CPU_ACPI_HW_ALL: + /* + * P/T-state transitions are coordinated by the hardware + * For now, request the transition on all CPUs in the domain, + * but looking ahead we can probably be smarter about this. 
+ */ + mutex_enter(&state_domain->pm_lock); + state_ops->cpus_change(state_domain->pm_cpus, level); + mutex_exit(&state_domain->pm_lock); + break; + default: + cmn_err(CE_WARN, "Unknown domain coordination type: %d", + state_domain->pm_type); + } +} + +/* + * CPU PM interfaces exposed to the CPU power manager + */ +/*ARGSUSED*/ +id_t +cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + + if ((mach_state == NULL) || (!cpupm_is_enabled(CPUPM_P_STATES) && + !cpupm_is_enabled(CPUPM_C_STATES))) { + return (CPUPM_NO_DOMAIN); + } + if (type == CPUPM_DTYPE_ACTIVE) { + /* + * Return P-State domain for the specified CPU + */ + if (mach_state->ms_pstate.cma_domain) { + return (mach_state->ms_pstate.cma_domain->pm_domain); + } + } else if (type == CPUPM_DTYPE_IDLE) { + /* + * Return C-State domain for the specified CPU + */ + if (mach_state->ms_cstate.cma_domain) { + return (mach_state->ms_cstate.cma_domain->pm_domain); + } + } + return (CPUPM_NO_DOMAIN); +} + +/*ARGSUSED*/ +uint_t +cpupm_plat_state_enumerate(cpu_t *cp, cpupm_dtype_t type, + cpupm_state_t *states) +{ + int *speeds; + uint_t nspeeds, i; + + /* + * Idle domain support unimplemented + */ + if (type != CPUPM_DTYPE_ACTIVE) { + return (0); + } + nspeeds = cpupm_get_speeds(cp, &speeds); + + /* + * If the caller passes NULL for states, just return the + * number of states. 
+ */ + if (states != NULL) { + for (i = 0; i < nspeeds; i++) { + states[i].cps_speed = speeds[i]; + states[i].cps_handle = (cpupm_handle_t)i; + } + } + cpupm_free_speeds(speeds, nspeeds); + return (nspeeds); +} + +/*ARGSUSED*/ +int +cpupm_plat_change_state(cpu_t *cp, cpupm_state_t *state) +{ + if (!cpupm_is_ready()) + return (-1); + + cpupm_state_change(cp, (int)state->cps_handle, CPUPM_P_STATES); + + return (0); +} + +/*ARGSUSED*/ +/* + * Note: It is the responsibility of the users of + * cpupm_get_speeds() to free the memory allocated + * for speeds using cpupm_free_speeds() + */ +uint_t +cpupm_get_speeds(cpu_t *cp, int **speeds) +{ +#ifndef __xpv + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + return (cpu_acpi_get_speeds(mach_state->ms_acpi_handle, speeds)); +#else + return (0); +#endif +} + +/*ARGSUSED*/ +void +cpupm_free_speeds(int *speeds, uint_t nspeeds) +{ +#ifndef __xpv + cpu_acpi_free_speeds(speeds, nspeeds); +#endif +} + +/* + * All CPU instances have been initialized successfully. + */ +boolean_t +cpupm_power_ready(void) +{ + return (cpupm_is_enabled(CPUPM_P_STATES) && cpupm_is_ready()); +} + +/* + * All CPU instances have been initialized successfully. + */ +boolean_t +cpupm_throttle_ready(void) +{ + return (cpupm_is_enabled(CPUPM_T_STATES) && cpupm_is_ready()); +} + +/* + * All CPU instances have been initialized successfully. 
+ */ +boolean_t +cpupm_cstate_ready(void) +{ + return (cpupm_is_enabled(CPUPM_C_STATES) && cpupm_is_ready()); +} + +void +cpupm_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx) +{ + cpu_t *cp = ctx; + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + cpupm_notification_t *entry; + + mutex_enter(&mach_state->ms_lock); + for (entry = mach_state->ms_handlers; entry != NULL; + entry = entry->nq_next) { + entry->nq_handler(obj, val, entry->nq_ctx); + } + mutex_exit(&mach_state->ms_lock); +} + +/*ARGSUSED*/ +void +cpupm_add_notify_handler(cpu_t *cp, CPUPM_NOTIFY_HANDLER handler, void *ctx) +{ +#ifndef __xpv + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + cpupm_notification_t *entry; + + entry = kmem_zalloc(sizeof (cpupm_notification_t), KM_SLEEP); + entry->nq_handler = handler; + entry->nq_ctx = ctx; + mutex_enter(&mach_state->ms_lock); + if (mach_state->ms_handlers == NULL) { + entry->nq_next = NULL; + mach_state->ms_handlers = entry; + cpu_acpi_install_notify_handler(mach_state->ms_acpi_handle, + cpupm_notify_handler, cp); + + } else { + entry->nq_next = mach_state->ms_handlers; + mach_state->ms_handlers = entry; + } + mutex_exit(&mach_state->ms_lock); +#endif +} + +/*ARGSUSED*/ +static void +cpupm_free_notify_handlers(cpu_t *cp) +{ +#ifndef __xpv + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + cpupm_notification_t *entry; + cpupm_notification_t *next; + + mutex_enter(&mach_state->ms_lock); + if (mach_state->ms_handlers == NULL) { + mutex_exit(&mach_state->ms_lock); + return; + } + if (mach_state->ms_acpi_handle != NULL) { + cpu_acpi_remove_notify_handler(mach_state->ms_acpi_handle, + cpupm_notify_handler); + } + entry = mach_state->ms_handlers; + while (entry != NULL) { + next = entry->nq_next; + kmem_free(entry, sizeof (cpupm_notification_t)); + entry = next; + } + mach_state->ms_handlers = NULL; + mutex_exit(&mach_state->ms_lock); 
+#endif +} + +/* + * Get the current max speed from the ACPI _PPC object + */ +/*ARGSUSED*/ +int +cpupm_get_top_speed(cpu_t *cp) +{ +#ifndef __xpv + cpupm_mach_state_t *mach_state; + cpu_acpi_handle_t handle; + int plat_level; + uint_t nspeeds; + int max_level; + + mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + handle = mach_state->ms_acpi_handle; + + cpu_acpi_cache_ppc(handle); + plat_level = CPU_ACPI_PPC(handle); + + nspeeds = CPU_ACPI_PSTATES_COUNT(handle); + + max_level = nspeeds - 1; + if ((plat_level < 0) || (plat_level > max_level)) { + cmn_err(CE_NOTE, "!cpupm_get_top_speed: CPU %d: " + "_PPC out of range %d", cp->cpu_id, plat_level); + plat_level = 0; + } + + return (plat_level); +#else + return (0); +#endif +} + +/* + * This notification handler is called whenever the ACPI _PPC + * object changes. The _PPC is a sort of governor on power levels. + * It sets an upper threshold on which _PSS-defined power levels + * are usable. The _PPC value is dynamic and may change as properties + * (i.e., thermal or AC source) of the system change. + */ + +static void +cpupm_power_manage_notifications(void *ctx) +{ + cpu_t *cp = ctx; + int top_speed; + + top_speed = cpupm_get_top_speed(cp); + cpupm_redefine_max_activepwr_state(cp, top_speed); +} + +/* ARGSUSED */ +static void +cpupm_event_notify_handler(ACPI_HANDLE obj, UINT32 val, void *ctx) +{ +#ifndef __xpv + /* + * Currently, we handle _TPC,_CST and _PPC change notifications. + */ + if (val == CPUPM_TPC_CHANGE_NOTIFICATION) { + cpupm_throttle_manage_notification(ctx); + } else if (val == CPUPM_CST_CHANGE_NOTIFICATION) { + cpuidle_manage_cstates(ctx); + } else if (val == CPUPM_PPC_CHANGE_NOTIFICATION) { + cpupm_power_manage_notifications(ctx); + } +#endif +} + +/* + * Update cpupm cstate data each time CPU exits idle. + */ +void +cpupm_wakeup_cstate_data(cma_c_state_t *cs_data, hrtime_t end) +{ + cs_data->cs_idle_exit = end; +} + +/* + * Determine next cstate based on cpupm data. 
+ * Update cpupm cstate data each time CPU goes idle. + * Do as much as possible in the idle state bookkeeping function because the + * performance impact while idle is minimal compared to in the wakeup function + * when there is real work to do. + */ +uint32_t +cpupm_next_cstate(cma_c_state_t *cs_data, hrtime_t start) +{ + hrtime_t duration; + hrtime_t ave_interval; + hrtime_t ave_idle_time; + + duration = cs_data->cs_idle_exit - cs_data->cs_idle_enter; + scalehrtime(&duration); + cs_data->cs_idle += duration; + cs_data->cs_idle_enter = start; + + ++cs_data->cs_cnt; + if (cs_data->cs_cnt > cpupm_cs_sample_tunable) { + cs_data->cs_smpl_len = start - cs_data->cs_smpl_start; + scalehrtime(&cs_data->cs_smpl_len); + cs_data->cs_smpl_len |= 1; /* protect from DIV 0 */ + cs_data->cs_smpl_idle = cs_data->cs_idle; + cs_data->cs_idle = 0; + cs_data->cs_smpl_idle_pct = ((100 * cs_data->cs_smpl_idle) / + cs_data->cs_smpl_len); + + cs_data->cs_smpl_start = start; + cs_data->cs_cnt = 0; + + /* + * Strand level C-state policy + */ + cs_data->cs_next_cstate = CPU_ACPI_C3; + + /* + * Will CPU be idle long enough to save power? + */ + ave_idle_time = (cs_data->cs_smpl_idle / + cpupm_cs_sample_tunable) / 1000; + if (ave_idle_time < (cs_data->cs_C2_latency * + cpupm_cs_idle_save_tunable)) { + cs_data->cs_next_cstate = CPU_ACPI_C1; + DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU, + int, 1); + return (cs_data->cs_next_cstate); + } else if (ave_idle_time < (cs_data->cs_C3_latency * + cpupm_cs_idle_save_tunable)) { + cs_data->cs_next_cstate = CPU_ACPI_C2; + DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU, + int, 2); + } + + /* + * Wakeup often (even when non-idle time is very short)? + * Some producer/consumer type loads fall into this category. 
+ */ + ave_interval = (cs_data->cs_smpl_len / cpupm_cs_sample_tunable) + / 1000; + if (ave_interval <= + (cs_data->cs_C2_latency * cpupm_cs_idle_cost_tunable)) { + cs_data->cs_next_cstate = CPU_ACPI_C1; + DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU, + int, 3); + return (cs_data->cs_next_cstate); + } else if (ave_interval <= + (cs_data->cs_C3_latency * cpupm_cs_idle_cost_tunable)) { + cs_data->cs_next_cstate = CPU_ACPI_C2; + DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU, + int, 4); + } + + /* + * Idle percent + */ + if (cs_data->cs_smpl_idle_pct < cpupm_C2_idle_pct_tunable) { + cs_data->cs_next_cstate = CPU_ACPI_C1; + DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU, + int, 5); + return (cs_data->cs_next_cstate); + } else if ((cs_data->cs_next_cstate > CPU_ACPI_C2) && + (cs_data->cs_smpl_idle_pct < cpupm_C3_idle_pct_tunable)) { + cs_data->cs_next_cstate = CPU_ACPI_C2; + DTRACE_PROBE2(cpupm__next__cstate, cpu_t *, CPU, + int, 6); + } + } + + return (cs_data->cs_next_cstate); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/os/cpupm/cpupm_throttle.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,345 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/x86_archext.h> +#include <sys/machsystm.h> +#include <sys/x_call.h> +#include <sys/cpu_acpi.h> +#include <sys/cpupm_throttle.h> +#include <sys/dtrace.h> +#include <sys/sdt.h> + +static int cpupm_throttle_init(cpu_t *); +static void cpupm_throttle_fini(cpu_t *); +static void cpupm_throttle(cpuset_t, uint32_t); + +cpupm_state_ops_t cpupm_throttle_ops = { + "Generic ACPI T-state Support", + cpupm_throttle_init, + cpupm_throttle_fini, + cpupm_throttle +}; + +/* + * Error returns + */ +#define THROTTLE_RET_SUCCESS 0x00 +#define THROTTLE_RET_INCOMPLETE_DATA 0x01 +#define THROTTLE_RET_UNSUP_STATE 0x02 +#define THROTTLE_RET_TRANS_INCOMPLETE 0x03 + +#define THROTTLE_LATENCY_WAIT 1 + +/* + * MSR register for clock modulation + */ +#define IA32_CLOCK_MODULATION_MSR 0x19A + +/* + * Debugging support + */ +#ifdef DEBUG +volatile int cpupm_throttle_debug = 0; +#define CTDEBUG(arglist) if (cpupm_throttle_debug) printf arglist; +#else +#define CTDEBUG(arglist) +#endif + +/* + * Write the _PTC ctrl register. How it is written depends upon the _PTC + * ACPI object value. + */ +static int +write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl) +{ + cpu_acpi_ptc_t *ptc_ctrl; + uint64_t reg; + int ret = 0; + + ptc_ctrl = CPU_ACPI_PTC_CTRL(handle); + + switch (ptc_ctrl->cr_addrspace_id) { + case ACPI_ADR_SPACE_FIXED_HARDWARE: + /* + * Read current thermal state because reserved bits must be + * preserved, compose new value, and write it. The writable + * bits are 4:1 (1 to 4). 
+ * Bits 3:1 => On-Demand Clock Modulation Duty Cycle + * Bit 4 => On-Demand Clock Modulation Enable + * Left shift ctrl by 1 to align with bits 1-4 of MSR + */ + reg = rdmsr(IA32_CLOCK_MODULATION_MSR); + reg &= ~((uint64_t)0x1E); + reg |= ctrl; + wrmsr(IA32_CLOCK_MODULATION_MSR, reg); + break; + + case ACPI_ADR_SPACE_SYSTEM_IO: + ret = cpu_acpi_write_port(ptc_ctrl->cr_address, ctrl, + ptc_ctrl->cr_width); + break; + + default: + DTRACE_PROBE1(throttle_ctrl_unsupported_type, uint8_t, + ptc_ctrl->cr_addrspace_id); + + ret = -1; + } + + DTRACE_PROBE1(throttle_ctrl_write, uint32_t, ctrl); + DTRACE_PROBE1(throttle_ctrl_write_err, int, ret); + + return (ret); +} + +static int +read_status(cpu_acpi_handle_t handle, uint32_t *stat) +{ + cpu_acpi_ptc_t *ptc_stat; + uint64_t reg; + int ret = 0; + + ptc_stat = CPU_ACPI_PTC_STATUS(handle); + + switch (ptc_stat->cr_addrspace_id) { + case ACPI_ADR_SPACE_FIXED_HARDWARE: + reg = rdmsr(IA32_CLOCK_MODULATION_MSR); + *stat = reg & 0x1E; + ret = 0; + break; + + case ACPI_ADR_SPACE_SYSTEM_IO: + ret = cpu_acpi_read_port(ptc_stat->cr_address, stat, + ptc_stat->cr_width); + break; + + default: + DTRACE_PROBE1(throttle_status_unsupported_type, uint8_t, + ptc_stat->cr_addrspace_id); + + return (-1); + } + + DTRACE_PROBE1(throttle_status_read, uint32_t, *stat); + DTRACE_PROBE1(throttle_status_read_err, int, ret); + + return (ret); +} + +/* + * Transition the current processor to the requested throttling state. + */ +static void +cpupm_tstate_transition(uint32_t req_state) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)CPU->cpu_m.mcpu_pm_mach_state; + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + cpu_acpi_tstate_t *req_tstate; + uint32_t ctrl; + uint32_t stat; + int i; + + req_tstate = (cpu_acpi_tstate_t *)CPU_ACPI_TSTATES(handle); + req_tstate += req_state; + DTRACE_PROBE1(throttle_transition, uint32_t, + CPU_ACPI_FREQPER(req_tstate)); + + /* + * Initiate the processor t-state change. 
+ */ + ctrl = CPU_ACPI_TSTATE_CTRL(req_tstate); + if (write_ctrl(handle, ctrl) != 0) { + return; + } + + /* + * If status is zero, then transition is synchronous and + * no status value comparison is required. + */ + if (CPU_ACPI_TSTATE_STAT(req_tstate) == 0) { + return; + } + + /* Wait until switch is complete, but bound the loop just in case. */ + for (i = CPU_ACPI_TSTATE_TRANSLAT(req_tstate) * 2; i >= 0; + i -= THROTTLE_LATENCY_WAIT) { + if (read_status(handle, &stat) == 0 && + CPU_ACPI_TSTATE_STAT(req_tstate) == stat) + break; + drv_usecwait(THROTTLE_LATENCY_WAIT); + } + + if (CPU_ACPI_TSTATE_STAT(req_tstate) != stat) { + DTRACE_PROBE(throttle_transition_incomplete); + } +} + +static void +cpupm_throttle(cpuset_t set, uint32_t throtl_lvl) +{ + /* + * If thread is already running on target CPU then just + * make the transition request. Otherwise, we'll need to + * make a cross-call. + */ + kpreempt_disable(); + if (CPU_IN_SET(set, CPU->cpu_id)) { + cpupm_tstate_transition(throtl_lvl); + CPUSET_DEL(set, CPU->cpu_id); + } + if (!CPUSET_ISNULL(set)) { + xc_call((xc_arg_t)throtl_lvl, NULL, NULL, X_CALL_HIPRI, + set, (xc_func_t)cpupm_tstate_transition); + } + kpreempt_enable(); +} + +static int +cpupm_throttle_init(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + cpu_acpi_ptc_t *ptc_stat; + + if (cpu_acpi_cache_tstate_data(handle) != 0) { + CTDEBUG(("Failed to cache T-state ACPI data\n")); + cpupm_throttle_fini(cp); + return (THROTTLE_RET_INCOMPLETE_DATA); + } + + /* + * Check the address space used for transitions + */ + ptc_stat = CPU_ACPI_PTC_STATUS(handle); + switch (ptc_stat->cr_addrspace_id) { + case ACPI_ADR_SPACE_FIXED_HARDWARE: + CTDEBUG(("T-State transitions will use fixed hardware\n")); + break; + case ACPI_ADR_SPACE_SYSTEM_IO: + CTDEBUG(("T-State transitions will use System IO\n")); + break; + default: + cmn_err(CE_WARN, "!_PTC conifgured for 
unsupported " + "address space type = %d.", ptc_stat->cr_addrspace_id); + return (THROTTLE_RET_INCOMPLETE_DATA); + } + + cpupm_alloc_domains(cp, CPUPM_T_STATES); + + return (THROTTLE_RET_SUCCESS); +} + +static void +cpupm_throttle_fini(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + + cpupm_free_domains(&cpupm_tstate_domains); + cpu_acpi_free_tstate_data(handle); +} + +/* + * This routine reads the ACPI _TPC object. It's accessed as a callback + * by the cpu driver whenever a _TPC change notification is received. + */ +static int +cpupm_throttle_get_max(processorid_t cpu_id) +{ + cpu_t *cp = cpu[cpu_id]; + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + cpu_acpi_handle_t handle; + int throtl_level; + int max_throttle_lvl; + uint_t num_throtl; + + if (mach_state == NULL) { + return (-1); + } + + handle = mach_state->ms_acpi_handle; + ASSERT(handle != NULL); + + cpu_acpi_cache_tpc(handle); + throtl_level = CPU_ACPI_TPC(handle); + + num_throtl = CPU_ACPI_TSTATES_COUNT(handle); + + max_throttle_lvl = num_throtl - 1; + if ((throtl_level < 0) || (throtl_level > max_throttle_lvl)) { + cmn_err(CE_NOTE, "!cpupm_throttle_get_max: CPU %d: " + "_TPC out of range %d", cp->cpu_id, throtl_level); + throtl_level = 0; + } + + return (throtl_level); +} + +/* + * Take care of CPU throttling when _TPC notification arrives + */ +void +cpupm_throttle_manage_notification(void *ctx) +{ + cpu_t *cp = ctx; + processorid_t cpu_id = cp->cpu_id; + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + boolean_t is_ready; + int new_level; + + if (mach_state == NULL) { + return; + } + + /* + * We currently refuse to power-manage if the CPU is not ready to + * take cross calls (cross calls fail silently if CPU is not ready + * for it). 
+ * + * Additionally, for x86 platforms we cannot power-manage + * any one instance, until all instances have been initialized. + * That's because we don't know what the CPU domains look like + * until all instances have been initialized. + */ + is_ready = CPUPM_XCALL_IS_READY(cpu_id) && cpupm_throttle_ready(); + if (!is_ready) + return; + + if (!(mach_state->ms_caps & CPUPM_T_STATES)) + return; + ASSERT(mach_state->ms_tstate.cma_ops != NULL); + + /* + * Get the new T-State support level + */ + new_level = cpupm_throttle_get_max(cpu_id); + + cpupm_state_change(cp, new_level, CPUPM_T_STATES); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/os/cpupm/pwrnow.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,247 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/x86_archext.h> +#include <sys/machsystm.h> +#include <sys/x_call.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/pwrnow.h> +#include <sys/cpu_acpi.h> +#include <sys/cpupm.h> +#include <sys/dtrace.h> +#include <sys/sdt.h> + +static int pwrnow_init(cpu_t *); +static void pwrnow_fini(cpu_t *); +static void pwrnow_power(cpuset_t, uint32_t); + +/* + * Interfaces for modules implementing AMD's PowerNow!. + */ +cpupm_state_ops_t pwrnow_ops = { + "PowerNow! Technology", + pwrnow_init, + pwrnow_fini, + pwrnow_power +}; + +/* + * Error returns + */ +#define PWRNOW_RET_SUCCESS 0x00 +#define PWRNOW_RET_NO_PM 0x01 +#define PWRNOW_RET_UNSUP_STATE 0x02 +#define PWRNOW_RET_TRANS_INCOMPLETE 0x03 + +#define PWRNOW_LATENCY_WAIT 10 + +/* + * MSR registers for changing and reading processor power state. 
+ */ +#define PWRNOW_PERF_CTL_MSR 0xC0010062 +#define PWRNOW_PERF_STATUS_MSR 0xC0010063 + +#define AMD_CPUID_PSTATE_HARDWARE (1<<7) +#define AMD_CPUID_TSC_CONSTANT (1<<8) + +/* + * Debugging support + */ +#ifdef DEBUG +volatile int pwrnow_debug = 0; +#define PWRNOW_DEBUG(arglist) if (pwrnow_debug) printf arglist; +#else +#define PWRNOW_DEBUG(arglist) +#endif + +/* + * Write the ctrl register. + */ +static void +write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl) +{ + cpu_acpi_pct_t *pct_ctrl; + uint64_t reg; + + pct_ctrl = CPU_ACPI_PCT_CTRL(handle); + + switch (pct_ctrl->cr_addrspace_id) { + case ACPI_ADR_SPACE_FIXED_HARDWARE: + reg = ctrl; + wrmsr(PWRNOW_PERF_CTL_MSR, reg); + break; + + default: + DTRACE_PROBE1(pwrnow_ctrl_unsupported_type, uint8_t, + pct_ctrl->cr_addrspace_id); + return; + } + + DTRACE_PROBE1(pwrnow_ctrl_write, uint32_t, ctrl); +} + +/* + * Transition the current processor to the requested state. + */ +static void +pwrnow_pstate_transition(uint32_t req_state) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)CPU->cpu_m.mcpu_pm_mach_state; + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + cpu_acpi_pstate_t *req_pstate; + uint32_t ctrl; + + req_pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle); + req_pstate += req_state; + + DTRACE_PROBE1(pwrnow_transition_freq, uint32_t, + CPU_ACPI_FREQ(req_pstate)); + + /* + * Initiate the processor p-state change. + */ + ctrl = CPU_ACPI_PSTATE_CTRL(req_pstate); + write_ctrl(handle, ctrl); + + mach_state->ms_pstate.cma_state.pstate = req_state; + cpu_set_curr_clock((uint64_t)CPU_ACPI_FREQ(req_pstate) * 1000000); +} + +static void +pwrnow_power(cpuset_t set, uint32_t req_state) +{ + /* + * If thread is already running on target CPU then just + * make the transition request. Otherwise, we'll need to + * make a cross-call. 
+ */ + kpreempt_disable(); + if (CPU_IN_SET(set, CPU->cpu_id)) { + pwrnow_pstate_transition(req_state); + CPUSET_DEL(set, CPU->cpu_id); + } + if (!CPUSET_ISNULL(set)) { + xc_call((xc_arg_t)req_state, NULL, NULL, X_CALL_HIPRI, + set, (xc_func_t)pwrnow_pstate_transition); + } + kpreempt_enable(); +} + +/* + * Validate that this processor supports PowerNow! and if so, + * get the P-state data from ACPI and cache it. + */ +static int +pwrnow_init(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + cpu_acpi_pct_t *pct_stat; + + PWRNOW_DEBUG(("pwrnow_init: processor %d\n", cp->cpu_id)); + + /* + * Cache the P-state specific ACPI data. + */ + if (cpu_acpi_cache_pstate_data(handle) != 0) { + PWRNOW_DEBUG(("Failed to cache ACPI data\n")); + pwrnow_fini(cp); + return (PWRNOW_RET_NO_PM); + } + + pct_stat = CPU_ACPI_PCT_STATUS(handle); + switch (pct_stat->cr_addrspace_id) { + case ACPI_ADR_SPACE_FIXED_HARDWARE: + PWRNOW_DEBUG(("Transitions will use fixed hardware\n")); + break; + default: + cmn_err(CE_WARN, "!_PCT configured for unsupported " + "addrspace = %d.", pct_stat->cr_addrspace_id); + cmn_err(CE_NOTE, "!CPU power management will not function."); + pwrnow_fini(cp); + return (PWRNOW_RET_NO_PM); + } + + cpupm_alloc_domains(cp, CPUPM_P_STATES); + + PWRNOW_DEBUG(("Processor %d succeeded.\n", cp->cpu_id)) + return (PWRNOW_RET_SUCCESS); +} + +/* + * Free resources allocated by pwrnow_init(). 
+ */ +static void +pwrnow_fini(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + + cpupm_free_domains(&cpupm_pstate_domains); + cpu_acpi_free_pstate_data(handle); +} + +boolean_t +pwrnow_supported() +{ + struct cpuid_regs cpu_regs; + + /* Required features */ + if (!(x86_feature & X86_CPUID) || + !(x86_feature & X86_MSR)) { + PWRNOW_DEBUG(("No CPUID or MSR support.")); + return (B_FALSE); + } + + /* + * Get the Advanced Power Management Information. + */ + cpu_regs.cp_eax = 0x80000007; + (void) __cpuid_insn(&cpu_regs); + + /* + * We currently only support CPU power management of + * processors that are P-state TSC invariant + */ + if (!(cpu_regs.cp_edx & AMD_CPUID_TSC_CONSTANT)) { + PWRNOW_DEBUG(("No support for CPUs that are not P-state " + "TSC invariant.\n")); + return (B_FALSE); + } + + /* + * We only support the "Fire and Forget" style of PowerNow! (i.e., + * single MSR write to change speed). + */ + if (!(cpu_regs.cp_edx & AMD_CPUID_PSTATE_HARDWARE)) { + PWRNOW_DEBUG(("Hardware P-State control is not supported.\n")); + return (B_FALSE); + } + return (B_TRUE); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/os/cpupm/speedstep.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,252 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/x86_archext.h> +#include <sys/machsystm.h> +#include <sys/x_call.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/speedstep.h> +#include <sys/cpu_acpi.h> +#include <sys/cpupm.h> +#include <sys/dtrace.h> +#include <sys/sdt.h> + +static int speedstep_init(cpu_t *); +static void speedstep_fini(cpu_t *); +static void speedstep_power(cpuset_t, uint32_t); + +/* + * Interfaces for modules implementing Intel's Enhanced SpeedStep. + */ +cpupm_state_ops_t speedstep_ops = { + "Enhanced SpeedStep Technology", + speedstep_init, + speedstep_fini, + speedstep_power +}; + +/* + * Error returns + */ +#define ESS_RET_SUCCESS 0x00 +#define ESS_RET_NO_PM 0x01 +#define ESS_RET_UNSUP_STATE 0x02 + +/* + * MSR registers for changing and reading processor power state. 
+ */ +#define IA32_PERF_STAT_MSR 0x198 +#define IA32_PERF_CTL_MSR 0x199 + +#define IA32_CPUID_TSC_CONSTANT 0xF30 +#define IA32_MISC_ENABLE_MSR 0x1A0 +#define IA32_MISC_ENABLE_EST (1<<16) +#define IA32_MISC_ENABLE_CXE (1<<25) +/* + * Debugging support + */ +#ifdef DEBUG +volatile int ess_debug = 0; +#define ESSDEBUG(arglist) if (ess_debug) printf arglist; +#else +#define ESSDEBUG(arglist) +#endif + +/* + * Write the ctrl register. How it is written, depends upon the _PCT + * ACPI object value. + */ +static void +write_ctrl(cpu_acpi_handle_t handle, uint32_t ctrl) +{ + cpu_acpi_pct_t *pct_ctrl; + uint64_t reg; + + pct_ctrl = CPU_ACPI_PCT_CTRL(handle); + + switch (pct_ctrl->cr_addrspace_id) { + case ACPI_ADR_SPACE_FIXED_HARDWARE: + /* + * Read current power state because reserved bits must be + * preserved, compose new value, and write it. + */ + reg = rdmsr(IA32_PERF_CTL_MSR); + reg &= ~((uint64_t)0xFFFF); + reg |= ctrl; + wrmsr(IA32_PERF_CTL_MSR, reg); + break; + + case ACPI_ADR_SPACE_SYSTEM_IO: + (void) cpu_acpi_write_port(pct_ctrl->cr_address, ctrl, + pct_ctrl->cr_width); + break; + + default: + DTRACE_PROBE1(ess_ctrl_unsupported_type, uint8_t, + pct_ctrl->cr_addrspace_id); + return; + } + + DTRACE_PROBE1(ess_ctrl_write, uint32_t, ctrl); +} + +/* + * Transition the current processor to the requested state. + */ +void +speedstep_pstate_transition(uint32_t req_state) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)CPU->cpu_m.mcpu_pm_mach_state; + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + cpu_acpi_pstate_t *req_pstate; + uint32_t ctrl; + + req_pstate = (cpu_acpi_pstate_t *)CPU_ACPI_PSTATES(handle); + req_pstate += req_state; + + DTRACE_PROBE1(ess_transition, uint32_t, CPU_ACPI_FREQ(req_pstate)); + + /* + * Initiate the processor p-state change. 
+ */ + ctrl = CPU_ACPI_PSTATE_CTRL(req_pstate); + write_ctrl(handle, ctrl); + + mach_state->ms_pstate.cma_state.pstate = req_state; + cpu_set_curr_clock(((uint64_t)CPU_ACPI_FREQ(req_pstate) * 1000000)); +} + +static void +speedstep_power(cpuset_t set, uint32_t req_state) +{ + /* + * If thread is already running on target CPU then just + * make the transition request. Otherwise, we'll need to + * make a cross-call. + */ + kpreempt_disable(); + if (CPU_IN_SET(set, CPU->cpu_id)) { + speedstep_pstate_transition(req_state); + CPUSET_DEL(set, CPU->cpu_id); + } + if (!CPUSET_ISNULL(set)) { + xc_call((xc_arg_t)req_state, NULL, NULL, X_CALL_HIPRI, set, + (xc_func_t)speedstep_pstate_transition); + } + kpreempt_enable(); +} + +/* + * Validate that this processor supports Speedstep and if so, + * get the P-state data from ACPI and cache it. + */ +static int +speedstep_init(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + cpu_acpi_pct_t *pct_stat; + + ESSDEBUG(("speedstep_init: processor %d\n", cp->cpu_id)); + + /* + * Cache the P-state specific ACPI data. 
+ */ + if (cpu_acpi_cache_pstate_data(handle) != 0) { + ESSDEBUG(("Failed to cache ACPI data\n")); + speedstep_fini(cp); + return (ESS_RET_NO_PM); + } + + pct_stat = CPU_ACPI_PCT_STATUS(handle); + switch (pct_stat->cr_addrspace_id) { + case ACPI_ADR_SPACE_FIXED_HARDWARE: + ESSDEBUG(("Transitions will use fixed hardware\n")); + break; + case ACPI_ADR_SPACE_SYSTEM_IO: + ESSDEBUG(("Transitions will use system IO\n")); + break; + default: + cmn_err(CE_WARN, "!_PCT configured for unsupported " + "addrspace = %d.", pct_stat->cr_addrspace_id); + cmn_err(CE_NOTE, "!CPU power management will not function."); + speedstep_fini(cp); + return (ESS_RET_NO_PM); + } + + cpupm_alloc_domains(cp, CPUPM_P_STATES); + + ESSDEBUG(("Processor %d succeeded.\n", cp->cpu_id)) + return (ESS_RET_SUCCESS); +} + +/* + * Free resources allocated by speedstep_init(). + */ +static void +speedstep_fini(cpu_t *cp) +{ + cpupm_mach_state_t *mach_state = + (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state); + cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; + + cpupm_free_domains(&cpupm_pstate_domains); + cpu_acpi_free_pstate_data(handle); +} + +boolean_t +speedstep_supported(uint_t family, uint_t model) +{ + struct cpuid_regs cpu_regs; + + /* Required features */ + if (!(x86_feature & X86_CPUID) || + !(x86_feature & X86_MSR)) { + return (B_FALSE); + } + + /* + * We only support family/model combinations which + * are P-state TSC invariant. + */ + if (!((family == 0xf && model >= 0x3) || + (family == 0x6 && model >= 0xe))) { + return (B_FALSE); + } + + /* + * Enhanced SpeedStep supported? + */ + cpu_regs.cp_eax = 0x1; + (void) __cpuid_insn(&cpu_regs); + if (!(cpu_regs.cp_ecx & CPUID_INTC_ECX_EST)) { + return (B_FALSE); + } + + return (B_TRUE); +}
--- a/usr/src/uts/i86pc/os/mlsetup.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/os/mlsetup.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -295,6 +295,8 @@ */ cpu_list_init(CPU); + pg_cpu_bootstrap(CPU); + /* * Now that we have taken over the GDT, IDT and have initialized * active CPU list it's time to inform kmdb if present.
--- a/usr/src/uts/i86pc/os/mp_machdep.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/os/mp_machdep.c Wed Feb 25 21:04:18 2009 -0800 @@ -45,6 +45,7 @@ #include <sys/memlist.h> #include <sys/param.h> #include <sys/promif.h> +#include <sys/cpu_pm.h> #if defined(__xpv) #include <sys/hypervisor.h> #endif @@ -52,6 +53,7 @@ #include <vm/hat_i86.h> #include <sys/kdi_machimpl.h> #include <sys/sdt.h> +#include <sys/hpet.h> #define OFFSETOF(s, m) (size_t)(&(((s *)0)->m)) @@ -76,10 +78,10 @@ static void mach_notify_error(int level, char *errmsg); static hrtime_t dummy_hrtime(void); static void dummy_scalehrtime(hrtime_t *); -static void cpu_idle(void); +void cpu_idle(void); static void cpu_wakeup(cpu_t *, int); #ifndef __xpv -static void cpu_idle_mwait(void); +void cpu_idle_mwait(void); static void cpu_wakeup_mwait(cpu_t *, int); #endif /* @@ -184,7 +186,23 @@ */ int idle_cpu_assert_cflush_monitor = 1; -#endif +/* + * If non-zero, idle cpus will not use power saving Deep C-States idle loop. + */ +int idle_cpu_no_deep_c = 0; +/* + * Non-power saving idle loop and wakeup pointers. + * Allows user to toggle Deep Idle power saving feature on/off. + */ +void (*non_deep_idle_cpu)() = cpu_idle; +void (*non_deep_idle_disp_enq_thread)(cpu_t *, int); + +/* + * Object for the kernel to access the HPET. 
+ */ +hpet_t hpet; + +#endif /* ifndef __xpv */ /*ARGSUSED*/ int @@ -210,6 +228,16 @@ return (1); else return (0); + case PGHW_POW_ACTIVE: + if (cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE) != (id_t)-1) + return (1); + else + return (0); + case PGHW_POW_IDLE: + if (cpupm_domain_id(cp, CPUPM_DTYPE_IDLE) != (id_t)-1) + return (1); + else + return (0); default: return (0); } @@ -247,58 +275,63 @@ return (cpuid_get_last_lvl_cacheid(cpu)); case PGHW_CHIP: return (cpuid_get_chipid(cpu)); + case PGHW_POW_ACTIVE: + return (cpupm_domain_id(cpu, CPUPM_DTYPE_ACTIVE)); + case PGHW_POW_IDLE: + return (cpupm_domain_id(cpu, CPUPM_DTYPE_IDLE)); default: return (-1); } } -int -pg_plat_hw_level(pghw_type_t hw) +/* + * Express preference for optimizing for sharing relationship + * hw1 vs hw2 + */ +pghw_type_t +pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2) { - int i; + int i, rank1, rank2; + static pghw_type_t hw_hier[] = { PGHW_IPIPE, PGHW_CACHE, PGHW_CHIP, + PGHW_POW_IDLE, + PGHW_POW_ACTIVE, PGHW_NUM_COMPONENTS }; for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) { - if (hw_hier[i] == hw) - return (i); + if (hw_hier[i] == hw1) + rank1 = i; + if (hw_hier[i] == hw2) + rank2 = i; } - return (-1); + + if (rank1 > rank2) + return (hw1); + else + return (hw2); } /* - * Return 1 if CMT load balancing policies should be - * implemented across instances of the specified hardware - * sharing relationship. + * Override the default CMT dispatcher policy for the specified + * hardware sharing relationship */ -int -pg_plat_cmt_load_bal_hw(pghw_type_t hw) +pg_cmt_policy_t +pg_plat_cmt_policy(pghw_type_t hw) { - if (hw == PGHW_IPIPE || - hw == PGHW_FPU || - hw == PGHW_CHIP || - hw == PGHW_CACHE) - return (1); - else - return (0); -} - - -/* - * Return 1 if thread affinity polices should be implemented - * for instances of the specifed hardware sharing relationship. 
- */ -int -pg_plat_cmt_affinity_hw(pghw_type_t hw) -{ - if (hw == PGHW_CACHE) - return (1); - else - return (0); + /* + * For shared caches, also load balance across them to + * maximize aggregate cache capacity + */ + switch (hw) { + case PGHW_CACHE: + return (CMT_BALANCE|CMT_AFFINITY); + default: + return (CMT_NO_POLICY); + } } id_t @@ -329,9 +362,28 @@ {} /* + * Supports Deep C-State power saving idle loop. + */ +void +cpu_idle_adaptive(void) +{ + (*CPU->cpu_m.mcpu_idle_cpu)(); +} + +void +cpu_dtrace_idle_probe(uint_t cstate) +{ + cpu_t *cpup = CPU; + struct machcpu *mcpu = &(cpup->cpu_m); + + mcpu->curr_cstate = cstate; + DTRACE_PROBE1(idle__state__transition, uint_t, cstate); +} + +/* * Idle the present CPU until awoken via an interrupt */ -static void +void cpu_idle(void) { cpu_t *cpup = CPU; @@ -427,11 +479,11 @@ return; } - DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C1); + cpu_dtrace_idle_probe(IDLE_STATE_C1); mach_cpu_idle(); - DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C0); + cpu_dtrace_idle_probe(IDLE_STATE_C0); /* * We're no longer halted @@ -510,7 +562,7 @@ /* * Idle the present CPU until awoken via touching its monitored line */ -static void +void cpu_idle_mwait(void) { volatile uint32_t *mcpu_mwait = CPU->cpu_m.mcpu_mwait; @@ -520,7 +572,7 @@ int hset_update = 1; /* - * Set our mcpu_mwait here, so we can tell if anyone trys to + * Set our mcpu_mwait here, so we can tell if anyone tries to * wake us between now and when we call mwait. No other cpu will * attempt to set our mcpu_mwait until we add ourself to the halted * CPU bitmap. @@ -529,7 +581,7 @@ /* * If this CPU is online, and there's multiple CPUs - * in the system, then we should notate our halting + * in the system, then we should note our halting * by adding ourselves to the partition's halted CPU * bitmap. This allows other CPUs to find/awaken us when * work becomes available. 
@@ -543,7 +595,7 @@ * * When a thread becomes runnable, it is placed on the queue * and then the halted CPU bitmap is checked to determine who - * (if anyone) should be awoken. We therefore need to first + * (if anyone) should be awakened. We therefore need to first * add ourselves to the bitmap, and and then check if there * is any work available. * @@ -580,13 +632,13 @@ */ i86_monitor(mcpu_mwait, 0, 0); if (*mcpu_mwait == MWAIT_HALTED) { - DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C1); + cpu_dtrace_idle_probe(IDLE_STATE_C1); tlb_going_idle(); i86_mwait(0, 0); tlb_service(); - DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C0); + cpu_dtrace_idle_probe(IDLE_STATE_C0); } /* @@ -858,14 +910,23 @@ (*pops->psm_softinit)(); /* - * Initialize the dispatcher's function hooks - * to enable CPU halting when idle. + * Initialize the dispatcher's function hooks to enable CPU halting + * when idle. Set both the deep-idle and non-deep-idle hooks. + * + * Assume we can use power saving deep-idle loop cpu_idle_adaptive. + * Platform deep-idle driver will reset our idle loop to + * non_deep_idle_cpu if power saving deep-idle feature is not available. + * * Do not use monitor/mwait if idle_cpu_use_hlt is not set(spin idle) * or idle_cpu_prefer_mwait is not set. * Allocate monitor/mwait buffer for cpu0. 
*/ +#ifndef __xpv + non_deep_idle_disp_enq_thread = disp_enq_thread; +#endif if (idle_cpu_use_hlt) { - idle_cpu = cpu_idle; + idle_cpu = cpu_idle_adaptive; + CPU->cpu_m.mcpu_idle_cpu = cpu_idle; #ifndef __xpv if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) { CPU->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU); @@ -878,12 +939,20 @@ "handle cpu 0 mwait size."); #endif idle_cpu_prefer_mwait = 0; - idle_cpu = cpu_idle; + CPU->cpu_m.mcpu_idle_cpu = cpu_idle; } else { - idle_cpu = cpu_idle_mwait; + CPU->cpu_m.mcpu_idle_cpu = cpu_idle_mwait; } } else { - idle_cpu = cpu_idle; + CPU->cpu_m.mcpu_idle_cpu = cpu_idle; + } + non_deep_idle_cpu = CPU->cpu_m.mcpu_idle_cpu; + + /* + * Disable power saving deep idle loop? + */ + if (idle_cpu_no_deep_c) { + idle_cpu = non_deep_idle_cpu; } #endif } @@ -970,6 +1039,7 @@ #ifndef __xpv if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) disp_enq_thread = cpu_wakeup_mwait; + non_deep_idle_disp_enq_thread = disp_enq_thread; #endif }
--- a/usr/src/uts/i86pc/os/mp_startup.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/os/mp_startup.c Wed Feb 25 21:04:18 2009 -0800 @@ -120,11 +120,6 @@ */ cp->cpu_curr_clock = cpu_freq_hz; - /* - * Supported frequencies. - */ - cpu_set_supp_freqs(cp, NULL); - (void) strcpy(pi->pi_processor_type, "i386"); if (fpu_exists) (void) strcpy(pi->pi_fputypes, "i387 compatible"); @@ -236,8 +231,10 @@ proc_t *procp; #if !defined(__xpv) extern int idle_cpu_prefer_mwait; + extern void cpu_idle_mwait(); #endif extern void idle(); + extern void cpu_idle(); #ifdef TRAPTRACE trap_trace_ctl_t *ttc = &trap_trace_ctl[cpun]; @@ -247,9 +244,12 @@ cp = kmem_zalloc(sizeof (*cp), KM_SLEEP); #if !defined(__xpv) - if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) + if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) { cp->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU); + cp->cpu_m.mcpu_idle_cpu = cpu_idle_mwait; + } else #endif + cp->cpu_m.mcpu_idle_cpu = cpu_idle; procp = curthread->t_procp; @@ -1463,6 +1463,9 @@ { struct cpu *cp = CPU; uint_t new_x86_feature; +#ifndef __xpv + extern void cpupm_init(cpu_t *); +#endif /* * We need to get TSC on this proc synced (i.e., any delta @@ -1558,14 +1561,6 @@ init_cpu_info(cp); mutex_enter(&cpu_lock); - /* - * Processor group initialization for this CPU is dependent on the - * cpuid probing, which must be done in the context of the current - * CPU. - */ - pghw_physid_create(cp); - pg_cpu_init(cp); - pg_cmt_cpu_startup(cp); cp->cpu_flags |= CPU_RUNNING | CPU_READY | CPU_EXISTS; @@ -1597,15 +1592,30 @@ ASSERT(cp->cpu_base_spl == ipltospl(LOCK_LEVEL)); set_base_spl(); /* Restore the spl to its proper value */ +#ifndef __xpv + cpupm_init(cp); +#endif + add_cpunode2devtree(cp->cpu_id, cp->cpu_m.mcpu_cpi); + + /* + * Processor group initialization for this CPU is dependent on the + * cpuid probing, which must be done in the context of the current + * CPU, as well as the CPU's device node initialization (for ACPI). 
+ */ + mutex_enter(&cpu_lock); + pghw_physid_create(cp); + pg_cpu_init(cp); + pg_cmt_cpu_startup(cp); + mutex_exit(&cpu_lock); + /* Enable interrupts */ (void) spl0(); + mutex_enter(&cpu_lock); cpu_enable_intr(cp); cpu_add_active(cp); mutex_exit(&cpu_lock); - add_cpunode2devtree(cp->cpu_id, cp->cpu_m.mcpu_cpi); - #ifndef __xpv { /*
--- a/usr/src/uts/i86pc/os/startup.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/os/startup.c Wed Feb 25 21:04:18 2009 -0800 @@ -137,6 +137,7 @@ extern void progressbar_start(void); extern void brand_init(void); extern void pcf_init(void); +extern void pg_init(void); extern int size_pse_array(pgcnt_t, int); @@ -2128,6 +2129,8 @@ void post_startup(void) { + extern void cpupm_init(cpu_t *); + /* * Set the system wide, processor-specific flags to be passed * to userland via the aux vector for performance hints and @@ -2186,7 +2189,11 @@ maxmem = freemem; + cpupm_init(CPU); + add_cpunode2devtree(CPU->cpu_id, CPU->cpu_m.mcpu_cpi); + + pg_init(); } static int
--- a/usr/src/uts/i86pc/sys/cpu_acpi.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/sys/cpu_acpi.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,13 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _CPU_ACPI_H #define _CPU_ACPI_H +#include <sys/cpuvar.h> #include <sys/acpi/acpi.h> #include <sys/acpi/acresrc.h> #include <sys/acpi/acglobal.h> @@ -66,15 +67,25 @@ #define CPU_ACPI_TSTATE_CTRL(tstate) tstate->ts_ctrl #define CPU_ACPI_TSTATE_STAT(tstate) tstate->ts_state -#define CPU_ACPI_NONE_CACHED 0x00 -#define CPU_ACPI_PCT_CACHED 0x01 -#define CPU_ACPI_PSS_CACHED 0x02 -#define CPU_ACPI_PSD_CACHED 0x04 -#define CPU_ACPI_PPC_CACHED 0x08 -#define CPU_ACPI_PTC_CACHED 0x10 -#define CPU_ACPI_TSS_CACHED 0x20 -#define CPU_ACPI_TSD_CACHED 0x40 -#define CPU_ACPI_TPC_CACHED 0x80 +/* + * C-state related macros + */ +#define CPU_ACPI_CSD(sp) sp->cs_csd +#define CPU_ACPI_BM_INFO(sp) sp->bm_info +#define CPU_ACPI_CSTATES(sp) sp->cs_cstates.ss_states +#define CPU_ACPI_CSTATES_COUNT(sp) sp->cs_cstates.ss_count + +#define CPU_ACPI_NONE_CACHED 0x0000 +#define CPU_ACPI_PCT_CACHED 0x0001 +#define CPU_ACPI_PSS_CACHED 0x0002 +#define CPU_ACPI_PSD_CACHED 0x0004 +#define CPU_ACPI_PPC_CACHED 0x0008 +#define CPU_ACPI_PTC_CACHED 0x0010 +#define CPU_ACPI_TSS_CACHED 0x0020 +#define CPU_ACPI_TSD_CACHED 0x0040 +#define CPU_ACPI_TPC_CACHED 0x0080 +#define CPU_ACPI_CST_CACHED 0x0100 +#define CPU_ACPI_CSD_CACHED 0x0200 #define CPU_ACPI_IS_OBJ_CACHED(sp, obj) (sp->cpu_acpi_cached & obj) #define CPU_ACPI_OBJ_IS_CACHED(sp, obj) (sp->cpu_acpi_cached |= obj) @@ -84,7 +95,8 @@ #define CPU_ACPI_PSS_CNT (sizeof (cpu_acpi_pstate_t) / sizeof (uint32_t)) #define CPU_ACPI_TSTATES_SIZE(cnt) (cnt * sizeof (cpu_acpi_tstate_t)) #define CPU_ACPI_TSS_CNT (sizeof (cpu_acpi_tstate_t) / sizeof (uint32_t)) - +#define CPU_ACPI_CSTATES_SIZE(cnt) (cnt * sizeof 
(cpu_acpi_cstate_t)) +#define CPU_ACPI_CST_CNT (sizeof (cpu_acpi_cstate_t) / sizeof (uint32_t)) /* * CPU Domain Coordination Types */ @@ -102,10 +114,12 @@ uint32_t sd_domain; uint32_t sd_type; uint32_t sd_num; + uint32_t sd_index; } cpu_acpi_state_dependency_t; typedef cpu_acpi_state_dependency_t cpu_acpi_psd_t; typedef cpu_acpi_state_dependency_t cpu_acpi_tsd_t; +typedef cpu_acpi_state_dependency_t cpu_acpi_csd_t; /* * Container for ACPI processor control register information @@ -148,6 +162,21 @@ } cpu_acpi_tstate_t; +/* + * Container for _CST information + */ +typedef struct cpu_acpi_cstate +{ + uint32_t cs_addrspace_id; + uint32_t cs_address; + uint32_t cs_type; + uint32_t cs_latency; + uint32_t cs_power; + uint32_t promotion; + uint32_t demotion; + kstat_t *cs_ksp; +} cpu_acpi_cstate_t; + typedef struct cpu_acpi_supported_states { void *ss_states; uint32_t ss_count; @@ -155,6 +184,7 @@ typedef cpu_acpi_supported_states_t cpu_acpi_pstates_t; typedef cpu_acpi_supported_states_t cpu_acpi_tstates_t; +typedef cpu_acpi_supported_states_t cpu_acpi_cstates_t; typedef int cpu_acpi_present_capabilities_t; typedef int cpu_acpi_ppc_t; @@ -165,7 +195,7 @@ */ typedef struct cpu_acpi_state { ACPI_HANDLE cs_handle; - dev_info_t *cs_dip; + int cs_id; uint_t cpu_acpi_cached; cpu_acpi_pstates_t cs_pstates; cpu_acpi_pct_t cs_pct[2]; @@ -175,6 +205,9 @@ cpu_acpi_ptc_t cs_ptc[2]; cpu_acpi_tsd_t cs_tsd; cpu_acpi_tpc_t cs_tpc; + cpu_acpi_cstates_t cs_cstates; + cpu_acpi_csd_t cs_csd; + uint_t bm_info; } cpu_acpi_state_t; typedef cpu_acpi_state_t *cpu_acpi_handle_t; @@ -185,15 +218,22 @@ extern void cpu_acpi_free_pstate_data(cpu_acpi_handle_t); extern int cpu_acpi_cache_tstate_data(cpu_acpi_handle_t); extern void cpu_acpi_free_tstate_data(cpu_acpi_handle_t); +extern int cpu_acpi_cache_cstate_data(cpu_acpi_handle_t); +extern void cpu_acpi_free_cstate_data(cpu_acpi_handle_t); extern void cpu_acpi_install_notify_handler(cpu_acpi_handle_t, - ACPI_NOTIFY_HANDLER, dev_info_t *); + 
ACPI_NOTIFY_HANDLER, void *); +extern void cpu_acpi_remove_notify_handler(cpu_acpi_handle_t, + ACPI_NOTIFY_HANDLER); extern int cpu_acpi_write_pdc(cpu_acpi_handle_t, uint32_t, uint32_t, uint32_t *); extern int cpu_acpi_write_port(ACPI_IO_ADDRESS, uint32_t, uint32_t); extern int cpu_acpi_read_port(ACPI_IO_ADDRESS, uint32_t *, uint32_t); +extern void cpu_acpi_set_register(uint32_t, uint32_t); +extern void cpu_acpi_get_register(uint32_t, uint32_t *); extern uint_t cpu_acpi_get_speeds(cpu_acpi_handle_t, int **); +extern uint_t cpu_acpi_get_max_cstates(cpu_acpi_handle_t); extern void cpu_acpi_free_speeds(int *, uint_t); -extern cpu_acpi_handle_t cpu_acpi_init(dev_info_t *); +extern cpu_acpi_handle_t cpu_acpi_init(cpu_t *); extern void cpu_acpi_fini(cpu_acpi_handle_t); #ifdef __cplusplus
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/sys/cpu_idle.h Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _CPUIDLE_H +#define _CPUIDLE_H + +#include <sys/cpupm.h> + +#ifdef __cplusplus +extern "C" { +#endif +#define CPU_MAX_CSTATES 8 + +#define CPU_ACPI_C0 0 +#define CPU_ACPI_C1 1 +#define CPU_ACPI_C2 2 +#define CPU_ACPI_C3 3 + +#define BM_CTL 0x1 +#define BM_RLD 0x2 +#define BM_ARB_DIS 0x4 + +#define CPUID_TSC_INVARIANCE 0x100 + +#define CPU_IDLE_DEEP_CFG (0x1) /* Deep Idle disabled by user */ +#define CPU_IDLE_CPR_CFG (0x2) /* In CPR */ + +#define CPU_CSTATE_LATENCY_UNDEF (1000000) /* ACPI info missing */ + +typedef struct cpu_idle_kstat_s { + struct kstat_named addr_space_id; /* register address space id */ + struct kstat_named cs_latency; /* worst latency */ + struct kstat_named cs_power; /* average power consumption */ +} cpu_idle_kstat_t; + +extern cpupm_state_ops_t cpu_idle_ops; + +extern void cpu_acpi_idle(void); +extern void cstate_wakeup(cpu_t *, int); +extern boolean_t cpu_deep_cstates_supported(void); +extern void cpu_wakeup(cpu_t *, int); +extern void cpu_wakeup_mwait(cpu_t *, int); +extern void cpu_dtrace_idle_probe(uint_t); +extern void cpuidle_manage_cstates(void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _CPUIDLE_H */
--- a/usr/src/uts/i86pc/sys/cpudrv_mach.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/sys/cpudrv_mach.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,31 +42,12 @@ * for it). */ extern cpuset_t cpu_ready_set; -#define CPUDRV_PM_XCALL_IS_READY(cpuid) CPU_IN_SET(cpu_ready_set, (cpuid)) - -/* - * An error attaching any of the devices results in disabling - * CPU power management. - */ -#define CPUDRV_PM_DISABLE() cpupm_disable(CPUPM_ALL_STATES) - -/* - * If no power management states are enabled, then CPU power - * management is disabled. - */ -#define CPUDRV_PM_DISABLED() \ - (!cpupm_is_enabled(CPUPM_P_STATES) && !cpupm_is_enabled(CPUPM_T_STATES)) - -/* - * Is P-state management enabled? - */ -#define CPUDRV_PM_POWER_ENABLED(cpudsp) \ - (((cpudrv_mach_state_t *)cpudsp->mach_state)->caps & CPUDRV_P_STATES) +#define CPUDRV_XCALL_IS_READY(cpuid) CPU_IN_SET(cpu_ready_set, (cpuid)) /* * We're about to exit the _PPC thread so reset tag. */ -#define CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm) { \ +#define CPUDRV_RESET_GOVERNOR_THREAD(cpupm) { \ if (curthread == cpupm->pm_governor_thread) \ cpupm->pm_governor_thread = NULL; \ } @@ -74,50 +55,51 @@ /* * The current top speed as defined by the _PPC. */ -#define CPUDRV_PM_TOPSPEED(cpupm) (cpupm)->top_spd +#define CPUDRV_TOPSPEED(cpupm) (cpupm)->top_spd /* * Install a _PPC/_TPC change notification handler. */ -#define CPUDRV_PM_INSTALL_MAX_CHANGE_HANDLER(cpudsp, dip) \ - cpudrv_pm_install_notify_handler(cpudsp, dip); +#define CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpudsp) \ + cpudrv_install_notify_handler(cpudsp); /* * Redefine the topspeed. 
*/ -#define CPUDRV_PM_REDEFINE_TOPSPEED(dip) cpudrv_pm_redefine_topspeed(dip) +#define CPUDRV_REDEFINE_TOPSPEED(dip) cpudrv_redefine_topspeed(dip) /* * Set callbacks so that PPM can callback into CPUDRV */ -#define CPUDRV_PM_SET_PPM_CALLBACKS() { \ - cpupm_get_topspeed = cpudrv_pm_get_topspeed; \ - cpupm_set_topspeed = cpudrv_pm_set_topspeed; \ +#define CPUDRV_SET_PPM_CALLBACKS() { \ + cpupm_get_topspeed_callb = cpudrv_get_topspeed; \ + cpupm_set_topspeed_callb = cpudrv_set_topspeed; \ } /* * ACPI provides the supported speeds. */ -#define CPUDRV_PM_GET_SPEEDS(cpudsp, speeds, nspeeds) \ - nspeeds = cpudrv_pm_get_speeds(cpudsp, &speeds); -#define CPUDRV_PM_FREE_SPEEDS(speeds, nspeeds) \ - cpudrv_pm_free_speeds(speeds, nspeeds); +#define CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds) \ + nspeeds = cpudrv_get_speeds(cpudsp, &speeds); +#define CPUDRV_FREE_SPEEDS(speeds, nspeeds) \ + cpudrv_free_speeds(speeds, nspeeds); /* - * Convert speed to Hz. + * ACPI provides the supported C-states. */ -#define CPUDRV_PM_SPEED_HZ(unused, mhz) ((uint64_t)mhz * 1000000) +#define CPUDRV_GET_MAX_CSTATES(handle) \ + cpu_acpi_get_max_cstates(handle); /* * Compute the idle cnt percentage for a given speed. */ -#define CPUDRV_PM_IDLE_CNT_PERCENT(hwm, speeds, i) \ +#define CPUDRV_IDLE_CNT_PERCENT(hwm, speeds, i) \ (100 - (((100 - hwm) * speeds[0]) / speeds[i])) /* * Compute the user cnt percentage for a given speed. 
*/ -#define CPUDRV_PM_USER_CNT_PERCENT(hwm, speeds, i) \ +#define CPUDRV_USER_CNT_PERCENT(hwm, speeds, i) \ ((hwm * speeds[i]) / speeds[i - 1]); /* @@ -133,82 +115,21 @@ * The amount of memory needed for each string is: * digits for power level + '=' + digits for freq + 'MHz' + '\0' */ -#define CPUDRV_PM_COMP_SIZE() \ - (CPUDRV_PM_COMP_MAX_DIG + 1 + CPUDRV_PM_COMP_MAX_DIG + 3 + 1); -#define CPUDRV_PM_COMP_SPEED(cpupm, cur_spd) cur_spd->speed; -#define CPUDRV_PM_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) \ +#define CPUDRV_COMP_SIZE() \ + (CPUDRV_COMP_MAX_DIG + 1 + CPUDRV_COMP_MAX_DIG + 3 + 1); +#define CPUDRV_COMP_SPEED(cpupm, cur_spd) cur_spd->speed; +#define CPUDRV_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) \ (void) sprintf(pmc, "%d=%dMHz", cur_spd->pm_level, comp_spd); -/* - * T-State domain list - */ -typedef struct cpudrv_tstate_domain_node { - struct cpudrv_tstate_domain_node *tdn_next; - struct cpudrv_tstate_domain *tdn_domain; - cpudrv_devstate_t *tdn_cpudsp; -} cpudrv_tstate_domain_node_t; - -typedef struct cpudrv_tstate_domain { - struct cpudrv_tstate_domain *td_next; - cpudrv_tstate_domain_node_t *td_node; - uint32_t td_domain; - uint32_t td_type; - kmutex_t td_lock; -} cpudrv_tstate_domain_t; - -extern cpudrv_tstate_domain_t *cpudrv_tstate_domains; +extern void cpudrv_set_topspeed(void *, int); +extern int cpudrv_get_topspeed(void *); +extern int cpudrv_get_topthrottle(cpu_t *); +extern void cpudrv_manage_throttling(void *); +extern void cpudrv_install_notify_handler(cpudrv_devstate_t *); +extern void cpudrv_redefine_topspeed(void *); +extern uint_t cpudrv_get_speeds(cpudrv_devstate_t *, int **); +extern void cpudrv_free_speeds(int *, uint_t); -/* - * Different processor families have their own technologies for supporting - * CPU power management (i.e., Intel has Enhanced SpeedStep for some of it's - * processors and AMD has PowerNow! for some of it's processors). 
We support - * these different technologies via modules that export the interfaces - * described below. - * - * If a module implements the technology that should be used to manage - * the current CPU device, then the cpups_init() module should return - * succesfully (i.e., return code of 0) and perform any initialization - * such that future power transistions can be performed by calling - * the cpups_power() interface(). And the cpups_fini() interface can be - * used to free any resources allocated by cpups_init(). - */ -typedef struct cpudrv_pstate_ops { - char *cpups_label; - int (*cpups_init)(cpudrv_devstate_t *); - void (*cpups_fini)(cpudrv_devstate_t *); - int (*cpups_power)(cpudrv_devstate_t *, uint32_t); -} cpudrv_pstate_ops_t; - -/* - * T-state support. - */ -typedef struct cpudrv_tstate_ops { - char *cputs_label; - int (*cputs_init)(cpudrv_devstate_t *); - void (*cputs_fini)(cpudrv_devstate_t *); - int (*cputs_throttle)(cpudrv_devstate_t *, uint32_t); -} cpudrv_tstate_ops_t; - -typedef struct cpudrv_mach_state { - void *acpi_handle; - cpudrv_pstate_ops_t *cpupm_pstate_ops; - cpudrv_tstate_ops_t *cpupm_tstate_ops; - cpudrv_tstate_domain_node_t *tstate_domain_node; - uint32_t pstate; - uint32_t tstate; - uint32_t caps; -} cpudrv_mach_state_t; - -#define CPUDRV_NO_STATES 0x00 -#define CPUDRV_P_STATES 0x01 -#define CPUDRV_T_STATES 0x02 - -extern uint_t cpudrv_pm_get_speeds(cpudrv_devstate_t *, int **); -extern void cpudrv_pm_free_speeds(int *, uint_t); -extern void cpudrv_pm_set_topspeed(void *, int); -extern int cpudrv_pm_get_topspeed(void *); -extern void cpudrv_pm_redefine_topspeed(void *); -extern void cpudrv_pm_install_notify_handler(cpudrv_devstate_t *, dev_info_t *); #ifdef __cplusplus } #endif
--- a/usr/src/uts/i86pc/sys/cpudrv_throttle.h Wed Feb 25 20:53:30 2009 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _CPUDRV_THROTTLE_H -#define _CPUDRV_THROTTLE_H - -#include <sys/cpudrv_mach.h> - -#ifdef __cplusplus -extern "C" { -#endif - -cpudrv_tstate_ops_t cpudrv_throttle_ops; - -#ifdef __cplusplus -} -#endif - -#endif /* _CPUDRV_THROTTLE_H */
--- a/usr/src/uts/i86pc/sys/cpupm.h Wed Feb 25 20:53:30 2009 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _CPUPM_H -#define _CPUPM_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include <sys/ddi.h> -#include <sys/sunddi.h> - -/* - * Simple structures used to temporarily keep track of CPU - * dependencies until the PPM driver can build PPM CPU domains. - */ -typedef struct cpupm_cpu_node { - struct cpupm_cpu_node *cn_next; - dev_info_t *cn_dip; -} cpupm_cpu_node_t; - -typedef struct cpupm_cpu_dependency { - struct cpupm_cpu_dependency *cd_next; - cpupm_cpu_node_t *cd_cpu; - int cd_dependency_id; -} cpupm_cpu_dependency_t; - -/* - * If any states are added, then make sure to add them to - * CPUPM_ALL_STATES. - */ -#define CPUPM_NO_STATES 0x00 -#define CPUPM_P_STATES 0x01 -#define CPUPM_T_STATES 0x02 -#define CPUPM_ALL_STATES (CPUPM_P_STATES | CPUPM_T_STATES) - -/* - * Callbacks used for CPU power management. 
- */ -extern void (*cpupm_rebuild_cpu_domains)(void); -extern void (*cpupm_init_topspeed)(void); -extern void (*cpupm_redefine_topspeed)(void *); -extern int (*cpupm_get_topspeed)(void *); -extern void (*cpupm_set_topspeed)(void *, int); - -/* - * Routines used to manage temporary CPU dependencies. - */ -extern cpupm_cpu_dependency_t *cpupm_get_cpu_dependencies(); -extern void cpupm_add_cpu2dependency(dev_info_t *, int); -extern void cpupm_free_cpu_dependencies(); - -/* - * Routines to track overall status of CPU power management readiness. - * - */ -extern boolean_t cpupm_is_ready(); -extern boolean_t cpupm_is_enabled(uint32_t); -extern void cpupm_disable(uint32_t); -extern void cpupm_post_startup(); - -#ifdef __cplusplus -} -#endif - -#endif /* _CPUPM_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/sys/cpupm_mach.h Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,197 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _CPUPM_MACH_H +#define _CPUPM_MACH_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cpuvar.h> +#include <sys/ksynch.h> +#include <sys/cpu_pm.h> + +/* + * CPU power domains + */ +typedef struct cpupm_state_domains { + struct cpupm_state_domains *pm_next; + uint32_t pm_domain; + uint32_t pm_type; + cpuset_t pm_cpus; + kmutex_t pm_lock; +} cpupm_state_domains_t; + +extern cpupm_state_domains_t *cpupm_pstate_domains; +extern cpupm_state_domains_t *cpupm_tstate_domains; +extern cpupm_state_domains_t *cpupm_cstate_domains; + +/* + * Different processor families have their own technologies for supporting + * CPU power management (i.e., Intel has Enhanced SpeedStep for some of its + * processors and AMD has PowerNow! for some of its processors). 
We support + * these different technologies via modules that export the interfaces + * described below. + * + * If a module implements the technology that should be used to manage + * the current CPU device, then the cpus_init() module should return + * succesfully (i.e., return code of 0) and perform any initialization + * such that future power transistions can be performed by calling + * the cpus_change() interface. And the cpups_fini() interface can be + * used to free any resources allocated by cpus_init(). + */ +typedef struct cpupm_state_ops { + char *cpups_label; + int (*cpus_init)(cpu_t *); + void (*cpus_fini)(cpu_t *); + void (*cpus_change)(cpuset_t, uint32_t); +} cpupm_state_ops_t; + +/* + * Data kept for each C-state power-domain. + */ +typedef struct cma_c_state { + uint32_t cs_next_cstate; /* computed best C-state */ + + uint32_t cs_cnt; /* times accessed */ + uint32_t cs_type; /* current ACPI idle type */ + + hrtime_t cs_idle_enter; /* entered idle */ + hrtime_t cs_idle_exit; /* left idle */ + + hrtime_t cs_smpl_start; /* accounting sample began */ + hrtime_t cs_idle; /* time idle */ + hrtime_t cs_smpl_len; /* sample duration */ + hrtime_t cs_smpl_idle; /* idle time in last sample */ + uint64_t cs_smpl_idle_pct; /* % idle time in last smpl */ + + hrtime_t cs_C2_latency; /* C2 round trip latency */ + hrtime_t cs_C3_latency; /* C3 round trip latency */ +} cma_c_state_t; + +typedef union cma_state { + cma_c_state_t *cstate; + uint32_t pstate; +} cma_state_t; + +typedef struct cpupm_mach_acpi_state { + cpupm_state_ops_t *cma_ops; + cpupm_state_domains_t *cma_domain; + cma_state_t cma_state; +} cpupm_mach_acpi_state_t; + +typedef struct cpupm_mach_state { + void *ms_acpi_handle; + cpupm_mach_acpi_state_t ms_pstate; + cpupm_mach_acpi_state_t ms_cstate; + cpupm_mach_acpi_state_t ms_tstate; + uint32_t ms_caps; + dev_info_t *ms_dip; + kmutex_t ms_lock; + struct cpupm_notification *ms_handlers; +} cpupm_mach_state_t; + +/* + * Constants used by the Processor 
Device Notification handler + * that identify what kind of change has occurred. + */ +#define CPUPM_PPC_CHANGE_NOTIFICATION 0x80 +#define CPUPM_CST_CHANGE_NOTIFICATION 0x81 +#define CPUPM_TPC_CHANGE_NOTIFICATION 0x82 + +typedef void (*CPUPM_NOTIFY_HANDLER)(void *handle, uint32_t val, + void *ctx); + +typedef struct cpupm_notification { + struct cpupm_notification *nq_next; + CPUPM_NOTIFY_HANDLER nq_handler; + void *nq_ctx; +} cpupm_notification_t; + +/* + * If any states are added, then make sure to add them to + * CPUPM_ALL_STATES. + */ +#define CPUPM_NO_STATES 0x00 +#define CPUPM_P_STATES 0x01 +#define CPUPM_T_STATES 0x02 +#define CPUPM_C_STATES 0x04 +#define CPUPM_ALL_STATES (CPUPM_P_STATES \ + | CPUPM_T_STATES \ + | CPUPM_C_STATES) + +#define CPUPM_XCALL_IS_READY(cpuid) CPU_IN_SET(cpu_ready_set, (cpuid)) + +/* + * An error in initializing any of the CPU PM results in disabling + * CPU power management. + */ +#define CPUPM_DISABLE() cpupm_disable(CPUPM_ALL_STATES) + +#define CPUPM_SPEED_HZ(unused, mhz) ((uint64_t)mhz * 1000000) + +/* + * Callbacks used for CPU power management. 
+ */ +extern void (*cpupm_rebuild_cpu_domains)(void); +extern void (*cpupm_init_topspeed)(void); +extern void (*cpupm_redefine_topspeed)(void *); +extern int (*cpupm_get_topspeed_callb)(void *); +extern void (*cpupm_set_topspeed_callb)(void *, int); + +extern void cpupm_init(cpu_t *); +extern void cpupm_free(cpu_t *); +extern boolean_t cpupm_is_ready(); +extern boolean_t cpupm_is_enabled(uint32_t); +extern void cpupm_disable(uint32_t); +extern void cpupm_post_startup(); +extern void cpupm_alloc_domains(cpu_t *, int); +extern void cpupm_free_domains(cpupm_state_domains_t **); +extern void cpupm_alloc_ms_cstate(cpu_t *cp); +extern void cpupm_free_ms_cstate(cpu_t *cp); +extern void cpupm_state_change(cpu_t *, int, int); +extern id_t cpupm_plat_domain_id(cpu_t *cp, cpupm_dtype_t type); +extern uint_t cpupm_plat_state_enumerate(cpu_t *, cpupm_dtype_t, + cpupm_state_t *); +extern int cpupm_plat_change_state(cpu_t *, cpupm_state_t *); +extern uint_t cpupm_get_speeds(cpu_t *, int **); +extern void cpupm_free_speeds(int *, uint_t); +extern boolean_t cpupm_power_ready(void); +extern boolean_t cpupm_throttle_ready(void); +extern boolean_t cpupm_cstate_ready(void); +extern void cpupm_add_notify_handler(cpu_t *, CPUPM_NOTIFY_HANDLER, void *); +extern int cpupm_get_top_speed(cpu_t *); +extern uint32_t cpupm_next_cstate(cma_c_state_t *, hrtime_t); +extern void cpupm_idle_cstate_data(cma_c_state_t *, int); +extern void cpupm_wakeup_cstate_data(cma_c_state_t *, hrtime_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _CPUPM_MACH_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/sys/cpupm_throttle.h Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,43 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _CPUPM_THROTTLE_H +#define _CPUPM_THROTTLE_H + +#include <sys/cpupm.h> + +#ifdef __cplusplus +extern "C" { +#endif + +cpupm_state_ops_t cpupm_throttle_ops; + +extern void cpupm_throttle_manage_notification(void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _CPUPM_THROTTLE_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/sys/hpet.h Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _HPET_H +#define _HPET_H + +#include <sys/hpet_acpi.h> + +/* + * Interface for HPET access. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * HPET_INFINITY is used for timers that will never expire. + */ +#define HPET_INFINITY (INT64_MAX) + +/* + * State of initialization. + */ +#define HPET_NO_SUPPORT (0) +#define HPET_TIMER_SUPPORT (1) /* supports main counter reads */ +#define HPET_INTERRUPT_SUPPORT (2) /* supports interrupt/timer */ +#define HPET_FULL_SUPPORT (3) /* supports counter and timer intr */ + +typedef struct hpet { + uint_t supported; + boolean_t (*install_proxy)(void); + boolean_t (*callback)(int); + /* + * Next two function pointers allow CPUs to use the HPET's timer + * as a proxy for their LAPIC timers which stop during Deep C-State. 
+ */ + boolean_t (*use_hpet_timer)(hrtime_t *); + void (*use_lapic_timer)(hrtime_t); +} hpet_t; + +#define CST_EVENT_MULTIPLE_CSTATES (128) /* callbacks for _CST changes */ +#define CST_EVENT_ONE_CSTATE (129) + +/* + * unix access to the HPET is done through the hpet structure. + */ +extern hpet_t hpet; + +int hpet_acpi_init(int *hpet_vect, iflag_t *hpet_flags); +void hpet_acpi_fini(void); +uint32_t hpet_proxy_ipl(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _HPET_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/sys/hpet_acpi.h Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,334 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _HPET_ACPI_H +#define _HPET_ACPI_H + +#if defined(_KERNEL) +#include <sys/acpi/acpi.h> +#include <sys/acpi/actbl1.h> +#include <sys/acpica.h> +#endif /* defined(_KERNEL) */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Solaris uses an HPET Timer to generate interrupts for CPUs in Deep C-state + * with stalled LAPIC Timers. All CPUs use one HPET timer. The timer's + * interrupt targets one CPU (via the I/O APIC). The one CPU that receives + * the HPET's interrupt wakes up other CPUs as needed during the HPET Interrupt + * Service Routing. The HPET ISR uses poke_cpus to wake up other CPUs with an + * Inter Processor Interrupt. + * + * Please see the Intel Programmer's guides. Interrupts are disabled before + * a CPU Halts into Deep C-state. (This allows CPU-hardware-specific cleanup + * before servicing interrupts.) 
When a Deep C-state CPU wakes up (due to + * an externally generated interrupt), it resume execution where it halted. + * The CPU returning from Deep C-state must enable interrupts before it will + * handle the pending interrupt that woke it from Deep C-state. + * + * + * HPET bits as defined in the Intel IA-PC HPET Specification Rev 1.0a. + * + * The physical address space layout of the memory mapped HPET looks like this: + * + * struct hpet { + * uint64_t gen_cap; + * uint64_t res1; + * uint64_t gen_config; + * uint64_t res2; + * uint64_t gen_inter_stat; + * uint64_t res3; + * uint64_t main_counter_value; + * uint64_t res4; + * stuct hpet_timer { + * uint64_t config_and_capability; + * uint64_t comparator_value; + * uint64_t FSB_interrupt_route; + * uint64_t reserved; + * } timers[32]; + * } + * + * There are 32 possible timers in an hpet. Only the first 3 timers are + * required. The other 29 timers are optional. + * + * HPETs can have 64-bit or 32-bit timers. Timers/compare registers can + * be 64-bit or 32-bit and can be a mixture of both. + * The first two timers are not used. The HPET spec intends the first two + * timers to be used as "legacy replacement" for the PIT and RTC timers. + * + * Solaris uses the first available non-legacy replacement timer as a proxy + * timer for processor Local APIC Timers that stop in deep idle C-states. + */ + +/* + * We only use HPET table 1 on x86. Typical x86 systems only have 1 HPET. + * ACPI allows for multiple HPET tables to describe multiple HPETs. + */ +#define HPET_TABLE_1 (1) + +/* + * HPET Specification 1.0a defines the HPET to occupy 1024 bytes regardless of + * the number of counters (3 to 32) in this implementation. + */ +#define HPET_SIZE (1024) + +/* + * Offsets of hpet registers and macros to access them from HPET base address. 
+ */ +#define HPET_GEN_CAP_OFFSET (0) +#define HPET_GEN_CONFIG_OFFSET (0x10) +#define HPET_GEN_INTR_STAT_OFFSET (0x20) +#define HPET_MAIN_COUNTER_OFFSET (0xF0) +#define HPET_TIMER_N_CONF_OFFSET(n) (0x100 + (n * 0x20)) +#define HPET_TIMER_N_COMP_OFFSET(n) (0x108 + (n * 0x20)) + +#define OFFSET_ADDR(a, o) (((uintptr_t)(a)) + (o)) +#define HPET_GEN_CAP_ADDRESS(la) \ + OFFSET_ADDR(la, HPET_GEN_CAP_OFFSET) +#define HPET_GEN_CONFIG_ADDRESS(la) \ + OFFSET_ADDR(la, HPET_GEN_CONFIG_OFFSET) +#define HPET_GEN_INTR_STAT_ADDRESS(la) \ + OFFSET_ADDR(la, HPET_GEN_INTR_STAT_OFFSET) +#define HPET_MAIN_COUNTER_ADDRESS(la) \ + OFFSET_ADDR(la, HPET_MAIN_COUNTER_OFFSET) +#define HPET_TIMER_N_CONF_ADDRESS(la, n) \ + OFFSET_ADDR(la, HPET_TIMER_N_CONF_OFFSET(n)) +#define HPET_TIMER_N_COMP_ADDRESS(la, n) \ + OFFSET_ADDR(la, HPET_TIMER_N_COMP_OFFSET(n)) + +/* + * HPET General Capabilities and ID Register + */ +typedef struct hpet_gen_cap { + uint32_t counter_clk_period; /* period in femtoseconds */ + uint32_t vendor_id :16; /* vendor */ + uint32_t leg_route_cap :1; /* 1=LegacyReplacemnt support */ + uint32_t res1 :1; /* reserved */ + uint32_t count_size_cap :1; /* 0=32bit, 1=64bit wide */ + uint32_t num_tim_cap :5; /* number of timers -1 */ + uint32_t rev_id :8; /* revision number */ +} hpet_gen_cap_t; + +/* + * Macros to parse fields of the hpet General Capabilities and ID Register. + */ +#define HPET_GCAP_CNTR_CLK_PERIOD(l) (l >> 32) +#define HPET_GCAP_VENDOR_ID(l) BITX(l, 31, 16) +#define HPET_GCAP_LEG_ROUTE_CAP(l) BITX(l, 15, 15) +#define HPET_GCAP_CNT_SIZE_CAP(l) BITX(l, 13, 13) +#define HPET_GCAP_NUM_TIM_CAP(l) BITX(l, 12, 8) +#define HPET_GCAP_REV_ID(l) BITX(l, 7, 0) + +/* + * From HPET spec "The value in this field must be less than or equal to": + */ +#define HPET_MAX_CLK_PERIOD (0x5F5E100) + +/* + * Femto seconds in a second. 
+ */ +#if defined(__i386) +#define HPET_FEMTO_TO_NANO (1000000LL) +#define HRTIME_TO_HPET_TICKS(t) (((t) * HPET_FEMTO_TO_NANO) / hpet_info.period) +#else +#define HPET_FEMTO_TO_NANO (1000000L) +#define HRTIME_TO_HPET_TICKS(t) (((t) * HPET_FEMTO_TO_NANO) / hpet_info.period) +#endif /* (__i386) */ + +/* + * HPET General Configuration Register + */ +typedef struct hpet_gen_config_bitfield { + uint32_t leg_rt_cnf :1; /* legacy replacement route */ + uint32_t enable_cnf :1; /* overal enable */ +} hpet_gen_conf_t; + +/* + * General Configuration Register fields. + */ +#define HPET_GCFR_LEG_RT_CNF (0x2) /* bit field value */ +#define HPET_GCFR_ENABLE_CNF (0x1) /* bit field value */ +#define HPET_GCFR_LEG_RT_CNF_BITX(l) BITX(l, 1, 1) +#define HPET_GCFR_ENABLE_CNF_BITX(l) BITX(l, 0, 0) + +/* + * General Interrupt Status Register. + */ +#define HPET_GIS_T2_INT_STS(l) BITX(l, 2, 2) +#define HPET_GIS_T1_INT_STS(l) BITX(l, 1, 1) +#define HPET_GIS_T0_INT_STS(l) BITX(l, 0, 0) +#define HPET_GIS_TN_INT_STS(l, n) BITX(l, n, n) + +#define HPET_INTR_STATUS_MASK(timer) ((uint64_t)1 << (timer)) + +/* + * HPET Timer N Configuration and Capabilities Register + */ +typedef struct hpet_TN_conf_cap { + uint32_t int_route_cap; /* available I/O APIC intrups */ + uint32_t res1 :16; /* reserved */ + uint32_t fsb_int_del_cap :1; /* FSB interrupt supported */ + uint32_t fsb_int_en_cnf :1; /* Set FSB intr delivery */ + uint32_t int_route_cnf :5; /* I/O APIC interrupt to use */ + uint32_t mode32_cnf :1; /* Force 32-bit mode */ + uint32_t res2 :1; /* reserved */ + uint32_t val_set_cnf :1; /* Set periodic mode accumula */ + uint32_t size_cap :1; /* 1=64bit, 0=32bit timer */ + uint32_t per_int_cap :1; /* 1=periodic mode supported */ + uint32_t type_cnf :1; /* Enable periodic mode */ + uint32_t int_enb_cnf :1; /* Enable interrupt generat */ + uint32_t int_type_cnf :1; /* 0=edge, 1=level triggered */ + uint32_t res3 :1; /* reserved */ +} hpet_TN_conf_cap_t; + +/* + * There are 3 to 32 timers on each 
HPET. + */ +#define HPET_TIMER_N_INT_ROUTE_CAP(l) (l >> 32) +#define HPET_TIMER_N_INT_TYPE_CNF(l) BITX(l, 1, 1) +#define HPET_TIMER_N_INT_ENB_CNF(l) BITX(l, 2, 2) +#define HPET_TIMER_N_TYPE_CNF(l) BITX(l, 3, 3) +#define HPET_TIMER_N_PER_INT_CAP(l) BITX(l, 4, 4) +#define HPET_TIMER_N_SIZE_CAP(l) BITX(l, 5, 5) +#define HPET_TIMER_N_VAL_SET_CNF(l) BITX(l, 6, 6) +#define HPET_TIMER_N_MODE32_CNF(l) BITX(l, 8, 8) +#define HPET_TIMER_N_INT_ROUTE_CNF(l) BITX(l, 13, 9) +#define HPET_TIMER_N_FSB_EN_CNF(l) BITX(l, 14, 14) +#define HPET_TIMER_N_FSB_INT_DEL_CAP(l) BITX(l, 15, 15) + +#define HPET_TIMER_N_INT_TYPE_CNF_BIT (1 << 1) +#define HPET_TIMER_N_INT_ENB_CNF_BIT (1 << 2) +#define HPET_TIMER_N_TYPE_CNF_BIT (1 << 3) +#define HPET_TIMER_N_FSB_EN_CNF_BIT (1 << 14) +#define HPET_TIMER_N_INT_ROUTE_SHIFT(i) (i << 9) + +/* + * HPET Spec reserves timers 0 and 1 for legacy timer replacement (PIT and RTC). + * Available timers for other use such as LACPI proxy during Deep C-State + * start at timer 2. + */ +#define HPET_FIRST_NON_LEGACY_TIMER (2) + +/* + * HPET timer and interrupt used as LAPIC proxy during deep C-State. + */ +typedef struct cstate_timer { + int timer; + int intr; +} cstate_timer_t; + +/* + * Data structure of useful HPET device information. + */ +typedef struct hpet_info { + hpet_gen_cap_t gen_cap; + hpet_gen_conf_t gen_config; + uint64_t gen_intrpt_stat; + uint64_t main_counter_value; + void *logical_address; /* HPET VA memory map */ + hpet_TN_conf_cap_t *timer_n_config; /* N Timer config and cap */ + uint32_t num_timers; /* number of timers */ + uint32_t allocated_timers; /* bitmap of timers in use */ + cstate_timer_t cstate_timer; /* HPET Timer used for LAPIC proxy */ + uint64_t hpet_main_counter_reads[2]; + hrtime_t tsc[3]; + hrtime_t period; /* counter_clk_period in Femto Secs */ +} hpet_info_t; + +#if defined(_KERNEL) + +/* + * Spin mutexes are used in several places because idle threads cannot block. 
+ * These defines provide a mechanism to break out of spin loops to prevent + * system hangs if a CPU can never get the lock (due to an unknown + * hardware/software bug). 100 microsecond was chosen after extensive stress + * testing. + */ +#define HPET_SPIN_CHECK (1000) +#define HPET_SPIN_TIMEOUT (100000) + +/* + * There is one of these per CPU using the HPET as a proxy for its stalled + * local APIC while in c-state >= C2. + */ +typedef hrtime_t hpet_proxy_t; + +extern ACPI_TABLE_HPET *hpet_table; +extern hpet_info_t hpet_info; + +static int hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags); +static boolean_t hpet_install_proxy(void); +static boolean_t hpet_callback(int code); +static boolean_t hpet_cpr(int code); +static boolean_t hpet_resume(void); +static void hpet_cst_callback(uint32_t code); +static boolean_t hpet_deep_idle_config(int code); +static int hpet_validate_table(ACPI_TABLE_HPET *hpet_table); +static boolean_t hpet_checksum_table(unsigned char *table, unsigned int len); +static void *hpet_memory_map(ACPI_TABLE_HPET *hpet_table); +static int hpet_start_main_counter(hpet_info_t *hip); +static int hpet_stop_main_counter(hpet_info_t *hip); +static uint64_t hpet_read_main_counter_value(hpet_info_t *hip); +static uint64_t hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value); +static uint64_t hpet_read_gen_cap(hpet_info_t *hip); +static uint64_t hpet_read_gen_config(hpet_info_t *hip); +static uint64_t hpet_read_gen_intrpt_stat(hpet_info_t *hip); +static uint64_t hpet_read_timer_N_config(hpet_info_t *hip, uint_t n); +static hpet_TN_conf_cap_t hpet_convert_timer_N_config(uint64_t conf); +static uint64_t hpet_read_timer_N_comp(hpet_info_t *hip, uint_t n); +static void hpet_write_gen_cap(hpet_info_t *hip, uint64_t l); +static void hpet_write_gen_config(hpet_info_t *hip, uint64_t l); +static void hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l); +static void hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t l); +static void 
hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l); +static void hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n); +static void hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n); +static void hpet_write_main_counter_value(hpet_info_t *hip, uint64_t l); +static int hpet_get_FSB_intr_capable_timer(hpet_info_t *hip, uint32_t mask); +static int hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip); +static int hpet_timer_available(uint32_t allocated_timers, uint32_t n); +static void hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n); +static void hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n, + uint32_t interrupt); +static uint_t hpet_isr(char *arg); +static uint32_t hpet_install_interrupt_handler(uint_t (*func)(char *), + int vector); +static void hpet_uninstall_interrupt_handler(void); +static void hpet_expire_all(void); +static boolean_t hpet_guaranteed_schedule(hrtime_t required_wakeup_time); +static boolean_t hpet_use_hpet_timer(hrtime_t *expire); +static void hpet_use_lapic_timer(hrtime_t expire); +static void hpet_init_proxy_data(void); + +#endif /* defined(_KERNEL) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _HPET_ACPI_H */
--- a/usr/src/uts/i86pc/sys/machcpuvar.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/sys/machcpuvar.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_MACHCPUVAR_H #define _SYS_MACHCPUVAR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -128,14 +126,21 @@ struct xen_evt_data *mcpu_evt_pend; /* hypervisor: pending events */ volatile uint32_t *mcpu_mwait; /* MONITOR/MWAIT buffer */ + void (*mcpu_idle_cpu)(void); /* idle function */ + uint16_t mcpu_idle_type; /* CPU next idle type */ + uint16_t max_cstates; /* supported max cstates */ + uint32_t curr_cstate; /* current cstate */ struct cpu_ucode_info *mcpu_ucode_info; + + void *mcpu_pm_mach_state; }; #define NINTR_THREADS (LOCK_LEVEL-1) /* number of interrupt threads */ #define MWAIT_HALTED (1) /* mcpu_mwait set when halting */ #define MWAIT_RUNNING (0) /* mcpu_mwait set to wakeup */ -#define MWAIT_WAKEUP(cpu) (*((cpu)->cpu_m.mcpu_mwait) = MWAIT_RUNNING); +#define MWAIT_WAKEUP_IPI (2) /* need IPI to wakeup */ +#define MWAIT_WAKEUP(cpu) (*((cpu)->cpu_m.mcpu_mwait) = MWAIT_RUNNING) #endif /* _ASM */
--- a/usr/src/uts/i86pc/sys/machsystm.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/sys/machsystm.h Wed Feb 25 21:04:18 2009 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -102,6 +102,14 @@ extern void do_interrupt(struct regs *, trap_trace_rec_t *); extern void memscrub_disable(void); +/* + * Dispatcher hooks. + */ +void (*idle_cpu)(); +void (*non_deep_idle_cpu)(); +void (*disp_enq_thread)(cpu_t *, int); +void (*non_deep_idle_disp_enq_thread)(cpu_t *, int); + #ifndef __xpv extern unsigned int microdata; #endif
--- a/usr/src/uts/i86pc/sys/pwrnow.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/sys/pwrnow.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,22 +19,22 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _PWRNOW_H #define _PWRNOW_H -#include <sys/cpudrv_mach.h> +#include <sys/cpupm.h> #ifdef __cplusplus extern "C" { #endif -boolean_t pwrnow_supported(); +extern boolean_t pwrnow_supported(); -cpudrv_pstate_ops_t pwrnow_ops; +extern cpupm_state_ops_t pwrnow_ops; #ifdef __cplusplus }
--- a/usr/src/uts/i86pc/sys/speedstep.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86pc/sys/speedstep.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,22 +19,22 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SPEEDSTEP_H #define _SPEEDSTEP_H -#include <sys/cpudrv_mach.h> +#include <sys/cpupm.h> #ifdef __cplusplus extern "C" { #endif -boolean_t speedstep_supported(uint_t, uint_t); +extern boolean_t speedstep_supported(uint_t, uint_t); -cpudrv_pstate_ops_t speedstep_ops; +extern cpupm_state_ops_t speedstep_ops; #ifdef __cplusplus }
--- a/usr/src/uts/i86xpv/Makefile.files Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/i86xpv/Makefile.files Wed Feb 25 21:04:18 2009 -0800 @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -44,6 +44,7 @@ cpuid.o \ cpuid_subr.o \ cpupm.o \ + cpupm_mach.o \ dis_tables.o \ ddi_impl.o \ dtrace_subr.o \
--- a/usr/src/uts/intel/ia32/ml/modstubs.s Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/intel/ia32/ml/modstubs.s Wed Feb 25 21:04:18 2009 -0800 @@ -1315,6 +1315,26 @@ END_MODULE(dcopy); #endif +/* + * Stubs for acpica + */ +#ifndef ACPICA_MODULE + MODULE(acpica,misc); + NO_UNLOAD_STUB(acpica, AcpiOsReadPort, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiOsWritePort, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiInstallNotifyHandler, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiRemoveNotifyHandler, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiEvaluateObject, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiEvaluateObjectTyped, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiSetRegister, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiGetRegister, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiOsFree, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, acpica_get_handle_cpu, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, acpica_get_global_FADT, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, __acpi_wbinvd, nomod_minus_one) ; + END_MODULE(acpica); +#endif + #ifndef IPNET_MODULE MODULE(ipnet,drv); STUB(ipnet, ipnet_if_getdev, nomod_zero);
--- a/usr/src/uts/intel/io/acpica/osl.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/intel/io/acpica/osl.c Wed Feb 25 21:04:18 2009 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -474,8 +474,16 @@ AcpiOsAcquireLock(ACPI_HANDLE Handle) { - mutex_enter((kmutex_t *)Handle); - return (0); + + if (Handle == NULL) + return (AE_BAD_PARAMETER); + + if (curthread == CPU->cpu_idle_thread) { + while (!mutex_tryenter((kmutex_t *)Handle)) + /* spin */; + } else + mutex_enter((kmutex_t *)Handle); + return (AE_OK); } void @@ -1365,24 +1373,8 @@ * Return the ACPI device node matching the CPU dev_info node. */ ACPI_STATUS -acpica_get_handle_cpu(dev_info_t *dip, ACPI_HANDLE *rh) +acpica_get_handle_cpu(int cpu_id, ACPI_HANDLE *rh) { - char *device_type_prop; - int cpu_id; - - /* - * if "device_type" != "cpu", error - */ - if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, - "device_type", &device_type_prop) != DDI_PROP_SUCCESS) - return (AE_ERROR); - - if (strcmp("cpu", device_type_prop) != 0) { - ddi_prop_free(device_type_prop); - return (AE_ERROR); - } - ddi_prop_free(device_type_prop); - /* * if cpu_map itself is NULL, we're a uppc system and * acpica_build_processor_map() hasn't been called yet. 
@@ -1394,19 +1386,10 @@ return (AE_ERROR); } - /* - * get 'reg' and get obj from cpu_map - */ - cpu_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, - "reg", -1); if ((cpu_id < 0) || (cpu_map[cpu_id] == NULL) || (cpu_map[cpu_id]->obj == NULL)) return (AE_ERROR); - /* - * tag devinfo and obj - */ - (void) acpica_tag_devinfo(dip, cpu_map[cpu_id]->obj); *rh = cpu_map[cpu_id]->obj; return (AE_OK); } @@ -1689,7 +1672,7 @@ if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "acpi-namespace", &acpiname) != DDI_PROP_SUCCESS) { - return (acpica_get_handle_cpu(dip, rh)); + return (AE_ERROR); } status = AcpiGetHandle(NULL, acpiname, rh); @@ -1793,3 +1776,9 @@ ASSERT(status == AE_OK); cpu_map_built = 1; } + +void +acpica_get_global_FADT(ACPI_TABLE_FADT **gbl_FADT) +{ + *gbl_FADT = &AcpiGbl_FADT; +}
--- a/usr/src/uts/intel/sys/acpica.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/intel/sys/acpica.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ACPICA_H #define _SYS_ACPICA_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -125,11 +123,13 @@ extern int acpica_get_bdf(dev_info_t *, int *, int *, int *); extern ACPI_STATUS acpica_get_devinfo(ACPI_HANDLE, dev_info_t **); extern ACPI_STATUS acpica_get_handle(dev_info_t *, ACPI_HANDLE *); +extern ACPI_STATUS acpica_get_handle_cpu(int, ACPI_HANDLE *); extern ACPI_STATUS acpica_eval_int(ACPI_HANDLE, char *, int *); extern void acpica_map_cpu(processorid_t, UINT32); extern void acpica_build_processor_map(); extern void acpica_ddi_save_resources(dev_info_t *); extern void acpica_ddi_restore_resources(dev_info_t *); +extern void acpica_get_global_FADT(ACPI_TABLE_FADT **); #ifdef __cplusplus }
--- a/usr/src/uts/intel/sys/x86_archext.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/intel/sys/x86_archext.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -357,6 +357,11 @@ "\10mmx\7cmov\6de\5pge\4mtrr\3msr\2tsc\1lgpg" /* + * Intel Deep C-State invariant TSC in leaf 0x80000007. + */ +#define CPUID_TSC_CSTATE_INVARIANCE (0x100) + +/* * x86_type is a legacy concept; this is supplanted * for most purposes by x86_feature; modern CPUs * should be X86_TYPE_OTHER @@ -605,6 +610,7 @@ #if !defined(__xpv) extern uint32_t *cpuid_mwait_alloc(struct cpu *); extern void cpuid_mwait_free(struct cpu *); +extern int cpuid_deep_cstates_supported(void); #endif struct cpu_ucode_info;
--- a/usr/src/uts/sun4/Makefile.files Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4/Makefile.files Wed Feb 25 21:04:18 2009 -0800 @@ -20,11 +20,9 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This Makefile defines all file modules for the directory uts/sun4 # and it's children. These are the source files which are common # between sun4u and sun4r. @@ -38,6 +36,7 @@ CORE_OBJS += cbe.o CORE_OBJS += confunix.o CORE_OBJS += copy.o +CORE_OBJS += cpupm_mach.o CORE_OBJS += cpu_states.o CORE_OBJS += ddi_impl.o CORE_OBJS += dmv.o
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/sun4/os/cpupm_mach.c Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/cpu_pm.h> + +/* + * CPU PM interfaces exposed to the CPU power manager + */ +/*ARGSUSED*/ +id_t +cpupm_plat_domain_id(struct cpu *cp, cpupm_dtype_t type) +{ + return (CPUPM_NO_DOMAIN); +} + +/*ARGSUSED*/ +uint_t +cpupm_plat_state_enumerate(struct cpu *cp, cpupm_dtype_t type, + cpupm_state_t *states) +{ + return (0); +} + +/*ARGSUSED*/ +int +cpupm_plat_change_state(struct cpu *cp, cpupm_state_t *state) +{ + return (-1); +}
--- a/usr/src/uts/sun4/os/mlsetup.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4/os/mlsetup.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/archsystm.h> @@ -216,6 +214,8 @@ cpu_vm_data_init(CPU); + pg_cpu_bootstrap(CPU); + (void) prom_set_preprom(kern_splr_preprom); (void) prom_set_postprom(kern_splx_postprom); PRM_INFO("mlsetup: now ok to call prom_printf");
--- a/usr/src/uts/sun4/os/startup.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4/os/startup.c Wed Feb 25 21:04:18 2009 -0800 @@ -94,6 +94,7 @@ extern void mach_kpm_init(void); extern void pcf_init(); extern int size_pse_array(pgcnt_t, int); +extern void pg_init(); /* * External Data: @@ -2222,6 +2223,8 @@ maxmem = freemem; + pg_init(); + #ifdef PTL1_PANIC_DEBUG init_ptl1_thread(); #endif /* PTL1_PANIC_DEBUG */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/sun4/sys/cpupm_mach.h Wed Feb 25 21:04:18 2009 -0800 @@ -0,0 +1,42 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _CPUPM_MACH_H +#define _CPUPM_MACH_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Convert speed to Hz. + */ +#define CPUPM_SPEED_HZ(mhz, divisor) (((uint64_t)mhz * 1000000) / divisor) + +#ifdef __cplusplus +} +#endif + +#endif /* _CPUPM_MACH_H */
--- a/usr/src/uts/sun4u/Makefile.files Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4u/Makefile.files Wed Feb 25 21:04:18 2009 -0800 @@ -20,7 +20,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This Makefile defines all file modules for the directory uts/sun4u @@ -36,6 +36,7 @@ CORE_OBJS += cmp.o CORE_OBJS += cpc_hwreg.o CORE_OBJS += cpc_subr.o +CORE_OBJS += cpupm.o CORE_OBJS += mach_cpu_states.o CORE_OBJS += mach_ddi_impl.o CORE_OBJS += ecc.o
--- a/usr/src/uts/sun4u/cpu/spitfire.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4u/cpu/spitfire.c Wed Feb 25 21:04:18 2009 -0800 @@ -2904,8 +2904,7 @@ CHANGE_REFRESH_COUNT(HB_SPEED_UP, cur_divisor, new_divisor); } CPU->cpu_m.divisor = (uchar_t)new_divisor; - CPU->cpu_curr_clock = - (((uint64_t)pi->pi_clock * 1000000) / new_divisor); + cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) / new_divisor); #endif }
--- a/usr/src/uts/sun4u/cpu/us3_cheetah.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4u/cpu/us3_cheetah.c Wed Feb 25 21:04:18 2009 -0800 @@ -570,8 +570,8 @@ reg |= bceclk->mask; set_safari_config(reg); CPU->cpu_m.divisor = (uchar_t)divisor; - CPU->cpu_curr_clock = - (((uint64_t)pi->pi_clock * 1000000) / divisor); + cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) / + divisor); return; } /*
--- a/usr/src/uts/sun4u/cpu/us3_cheetahplus.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4u/cpu/us3_cheetahplus.c Wed Feb 25 21:04:18 2009 -0800 @@ -774,8 +774,8 @@ reg |= bceclk->mask; set_safari_config(reg); CPU->cpu_m.divisor = (uchar_t)divisor; - CPU->cpu_curr_clock = - (((uint64_t)pi->pi_clock * 1000000) / divisor); + cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) / + divisor); return; } /*
--- a/usr/src/uts/sun4u/cpu/us3_jalapeno.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4u/cpu/us3_jalapeno.c Wed Feb 25 21:04:18 2009 -0800 @@ -792,8 +792,8 @@ (void) get_mcu_ctl_reg1(); } CPU->cpu_m.divisor = (uchar_t)divisor; - CPU->cpu_curr_clock = - (((uint64_t)pi->pi_clock * 1000000) / divisor); + cpu_set_curr_clock(((uint64_t)pi->pi_clock * 1000000) / + divisor); return; } /*
--- a/usr/src/uts/sun4u/io/cpudrv_mach.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4u/io/cpudrv_mach.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -28,16 +28,15 @@ */ #include <sys/ddi.h> #include <sys/sunddi.h> +#include <sys/cpupm.h> #include <sys/cpudrv_mach.h> #include <sys/machsystm.h> -boolean_t cpudrv_enabled = B_TRUE; - /* * Change CPU speed. */ int -cpudrv_pm_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd) +cpudrv_change_speed(cpudrv_devstate_t *cpudsp, cpudrv_pm_spd_t *new_spd) { xc_one(cpudsp->cpu_id, (xcfunc_t *)cpu_change_speed, \ (uint64_t)new_spd->speed, 0); @@ -48,7 +47,7 @@ * Determine the cpu_id for the CPU device. */ boolean_t -cpudrv_pm_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id) +cpudrv_get_cpu_id(dev_info_t *dip, processorid_t *cpu_id) { return (dip_to_cpu_id(dip, cpu_id) == DDI_SUCCESS); } @@ -57,7 +56,7 @@ * A noop for this machine type. */ boolean_t -cpudrv_pm_power_ready(void) +cpudrv_power_ready(void) { return (B_TRUE); } @@ -67,7 +66,7 @@ */ /* ARGSUSED */ boolean_t -cpudrv_pm_is_governor_thread(cpudrv_pm_t *cpupm) +cpudrv_is_governor_thread(cpudrv_pm_t *cpupm) { return (B_FALSE); } @@ -77,26 +76,31 @@ */ /*ARGSUSED*/ boolean_t -cpudrv_mach_pm_init(cpudrv_devstate_t *cpudsp) +cpudrv_mach_init(cpudrv_devstate_t *cpudsp) { return (B_TRUE); } /* - * A noop for this machine type. + * On SPARC all instances support power management unless attach fails. + * In the case of attach failure, cpudrv_enabled will be false. */ /*ARGSUSED*/ -void -cpudrv_mach_pm_free(cpudrv_devstate_t *cpudsp) +boolean_t +cpudrv_is_enabled(cpudrv_devstate_t *cpudsp) { + return (cpudrv_enabled); } -/* - * On SPARC all instances support power management unless attach fails. - * In the case of attach failure, cpupm_enabled will be false. 
- */ -boolean_t -cpudrv_pm_enabled() +void +cpudrv_set_supp_freqs(cpudrv_devstate_t *cpudsp) { - return (B_TRUE); + int *speeds; + uint_t nspeeds; + + CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds); + if (nspeeds == 0) + return; + cpupm_set_supp_freqs(cpudsp->cp, speeds, nspeeds); + CPUDRV_FREE_SPEEDS(speeds, nspeeds); }
--- a/usr/src/uts/sun4u/os/cmp.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4u/os/cmp.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/machsystm.h> #include <sys/x_call.h> @@ -224,10 +222,16 @@ } } -int -pg_plat_hw_level(pghw_type_t hw) +/* + * Rank the relative importance of optimizing for hw1 or hw2 + */ +pghw_type_t +pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2) { int i; + int rank1 = 0; + int rank2 = 0; + static pghw_type_t hw_hier[] = { PGHW_IPIPE, PGHW_CHIP, @@ -236,40 +240,28 @@ }; for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) { - if (hw_hier[i] == hw) - return (i); + if (hw_hier[i] == hw1) + rank1 = i; + if (hw_hier[i] == hw2) + rank2 = i; } - return (-1); + + if (rank1 > rank2) + return (hw1); + else + return (hw2); } /* - * Return 1 if CMT load balancing policies should be - * implemented across instances of the specified hardware - * sharing relationship. + * Override the default CMT dispatcher policy for the specified + * hardware sharing relationship */ -int -pg_plat_cmt_load_bal_hw(pghw_type_t hw) +/* ARGSUSED */ +pg_cmt_policy_t +pg_plat_cmt_policy(pghw_type_t hw) { - if (hw == PGHW_IPIPE || - hw == PGHW_FPU || - hw == PGHW_CHIP) - return (1); - else - return (0); -} - - -/* - * Return 1 if thread affinity polices should be implemented - * for instances of the specifed hardware sharing relationship. - */ -int -pg_plat_cmt_affinity_hw(pghw_type_t hw) -{ - if (hw == PGHW_CACHE) - return (1); - else - return (0); + /* Accept the default polices */ + return (CMT_NO_POLICY); } id_t
--- a/usr/src/uts/sun4u/os/mach_startup.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4u/os/mach_startup.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -403,7 +403,7 @@ } void -mach_cpu_halt_idle() +mach_cpu_halt_idle(void) { if (enable_halt_idle_cpus) { if (&cpu_halt_cpu) {
--- a/usr/src/uts/sun4u/sys/cpudrv_mach.h Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4u/sys/cpudrv_mach.h Wed Feb 25 21:04:18 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,44 +38,32 @@ * take cross calls (cross calls fail silently if CPU is not ready * for it). */ -#define CPUDRV_PM_XCALL_IS_READY(cpuid) (CPU_XCALL_READY(cpuid)) - -/* - * If a failure occurs during attach(), then CPU power management - * is disabled. - */ -extern boolean_t cpudrv_enabled; - -#define CPUDRV_PM_DISABLE() (cpudrv_enabled = B_FALSE) - -#define CPUDRV_PM_DISABLED() (!cpudrv_enabled) - -#define CPUDRV_PM_POWER_ENABLED(cpudsp) cpudrv_pm_enabled() +#define CPUDRV_XCALL_IS_READY(cpuid) (CPU_XCALL_READY(cpuid)) /* * Currently, there is no governor on sun4u, */ -#define CPUDRV_PM_RESET_GOVERNOR_THREAD(cpupm) +#define CPUDRV_RESET_GOVERNOR_THREAD(cpupm) /* * Currently, there is no need for a handler on sun4u. */ -#define CPUDRV_PM_INSTALL_MAX_CHANGE_HANDLER(cpudsp, dip) +#define CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpuid) /* * Topspeed is always the head speed. */ -#define CPUDRV_PM_TOPSPEED(cpupm) (cpupm)->head_spd +#define CPUDRV_TOPSPEED(cpupm) (cpupm)->head_spd /* * There is no notion of changing topspeed on sun4u. */ -#define CPUDRV_PM_REDEFINE_TOPSPEED(dip) +#define CPUDRV_REDEFINE_TOPSPEED(dip) /* * There are no PPM callbacks for sun4u. */ -#define CPUDRV_PM_SET_PPM_CALLBACKS() +#define CPUDRV_SET_PPM_CALLBACKS() /* * clock-divisors property tells the supported speeds @@ -84,33 +72,36 @@ * property value of "1, 2, 32" represents full, 1/2 and 1/32 * speeds. 
*/ -#define CPUDRV_PM_GET_SPEEDS(cpudsp, speeds, nspeeds) { \ +#define CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds) { \ if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, cpudsp->dip, \ DDI_PROP_DONTPASS, "clock-divisors", &speeds, \ &nspeeds) != DDI_PROP_SUCCESS) { \ - DPRINTF(D_PM_INIT, ("cpudrv_pm_init: instance %d: " \ + nspeeds = 0; \ + DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: " \ "clock-divisors property not defined\n", \ - ddi_get_instance(cpudsp->dip))); \ - return (DDI_FAILURE); \ + ddi_get_instance(cpudsp->dip))); \ } \ } -#define CPUDRV_PM_FREE_SPEEDS(speeds, unused) ddi_prop_free(speeds); +#define CPUDRV_FREE_SPEEDS(speeds, nspeeds) { \ + if (nspeeds > 0) \ + ddi_prop_free(speeds); \ +} /* * Convert speed to Hz. */ -#define CPUDRV_PM_SPEED_HZ(mhz, divisor) (((uint64_t)mhz * 1000000) / divisor) +#define CPUDRV_SPEED_HZ(mhz, divisor) (((uint64_t)mhz * 1000000) / divisor) /* * Compute the idle cnt percentage for a given speed. */ -#define CPUDRV_PM_IDLE_CNT_PERCENT(hwm, speeds, i) \ +#define CPUDRV_IDLE_CNT_PERCENT(hwm, speeds, i) \ (100 - ((100 - hwm) * speeds[i])) /* * Compute the user cnt percentage for a given speed. */ -#define CPUDRV_PM_USER_CNT_PERCENT(hwm, speeds, i) \ +#define CPUDRV_USER_CNT_PERCENT(hwm, speeds, i) \ ((hwm * speeds[i - 1]) / speeds[i]) /* @@ -128,23 +119,21 @@ * digits for power level + '=' + '1/' + digits for speed + * description text + '\0' */ -#define CPUDRV_PM_COMP_NORMAL "Normal" -#define CPUDRV_PM_COMP_OTHER " of Normal" -#define CPUDRV_PM_COMP_SIZE() \ - (CPUDRV_PM_COMP_MAX_DIG + 1 + 2 + CPUDRV_PM_COMP_MAX_DIG + \ - sizeof (CPUDRV_PM_COMP_OTHER) + 1); -#define CPUDRV_PM_COMP_SPEED(cpupm, cur_spd) \ +#define CPUDRV_COMP_NORMAL "Normal" +#define CPUDRV_COMP_OTHER " of Normal" +#define CPUDRV_COMP_SIZE() \ + (CPUDRV_COMP_MAX_DIG + 1 + 2 + CPUDRV_COMP_MAX_DIG + \ + sizeof (CPUDRV_COMP_OTHER) + 1); +#define CPUDRV_COMP_SPEED(cpupm, cur_spd) \ ((cur_spd == cpupm->head_spd) ? 
cur_spd->pm_level : cur_spd->speed) -#define CPUDRV_PM_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) { \ +#define CPUDRV_COMP_SPRINT(pmc, cpupm, cur_spd, comp_spd) { \ if (cur_spd == cpupm->head_spd) \ - (void) sprintf(pmc, "%d=%s", comp_spd, CPUDRV_PM_COMP_NORMAL);\ + (void) sprintf(pmc, "%d=%s", comp_spd, CPUDRV_COMP_NORMAL);\ else \ (void) sprintf(pmc, "%d=1/%d%s", cur_spd->pm_level, \ - comp_spd, CPUDRV_PM_COMP_OTHER); \ + comp_spd, CPUDRV_COMP_OTHER); \ } -extern boolean_t cpudrv_pm_enabled(void); - #ifdef __cplusplus } #endif
--- a/usr/src/uts/sun4v/os/cmp.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4v/os/cmp.c Wed Feb 25 21:04:18 2009 -0800 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/machsystm.h> #include <sys/cmp.h> @@ -132,16 +130,15 @@ } /* - * Order the relevant hw sharing relationships - * from least, to greatest physical scope. - * - * The hierarchy *must* be defined for all hw that - * pg_plat_hw_shared() returns non-zero. + * Rank the relative importance of optimizing for hw1 or hw2 */ -int -pg_plat_hw_level(pghw_type_t hw) +pghw_type_t +pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2) { int i; + int rank1 = 0; + int rank2 = 0; + static pghw_type_t hw_hier[] = { PGHW_IPIPE, PGHW_FPU, @@ -150,40 +147,27 @@ }; for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) { - if (hw_hier[i] == hw) - return (i); + if (hw_hier[i] == hw1) + rank1 = i; + if (hw_hier[i] == hw2) + rank2 = i; } - return (-1); + if (rank1 > rank2) + return (hw1); + else + return (hw2); } /* - * Return 1 if CMT load balancing policies should be - * implemented across instances of the specified hardware - * sharing relationship. + * Override the default CMT dispatcher policy for the specified + * hardware sharing relationship */ -int -pg_plat_cmt_load_bal_hw(pghw_type_t hw) +/* ARGSUSED */ +pg_cmt_policy_t +pg_plat_cmt_policy(pghw_type_t hw) { - if (hw == PGHW_IPIPE || - hw == PGHW_FPU || - hw == PGHW_MPIPE) - return (1); - else - return (0); -} - - -/* - * Return 1 if thread affinity polices should be implemented - * for instances of the specifed hardware sharing relationship. 
- */ -int -pg_plat_cmt_affinity_hw(pghw_type_t hw) -{ - if (hw == PGHW_CACHE) - return (1); - else - return (0); + /* Accept the default policies */ + return (CMT_NO_POLICY); } id_t @@ -213,7 +197,7 @@ return (0); } /* - * Return 1 if thread affinity polices should be implemented + * Return 1 if thread affinity policies should be implemented * for instances of the specifed hardware sharing relationship. */ int
--- a/usr/src/uts/sun4v/os/mach_startup.c Wed Feb 25 20:53:30 2009 -0800 +++ b/usr/src/uts/sun4v/os/mach_startup.c Wed Feb 25 21:04:18 2009 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -270,7 +270,7 @@ } void -mach_cpu_halt_idle() +mach_cpu_halt_idle(void) { if (enable_halt_idle_cpus) { idle_cpu = cpu_halt;