Mercurial > illumos > illumos-gate
changeset 4759:3a228be89946
PSARC/2007/307 Victoria Falls CPU/memory FMA
6532872 Incorrect fault name reported for store buffer
6536478 anchored page retire for T5140/T5240
6536482 diagnose FBR and FBU errors to branch
6545057 on T5140/T5240, diagnose mem UE as L2 cache data UE if C2C bit is set
6545604 Enhance CPU/Mem DE to support T2plus
6545632 add US-T2plus support to CPU/Mem error injector
line wrap: on
line diff
--- a/usr/src/cmd/fm/dicts/SUN4V.dict Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/dicts/SUN4V.dict Mon Jul 30 12:41:05 2007 -0700 @@ -66,3 +66,19 @@ fault.io.n2.crossbar=36 fault.io.fire.fw-epkt fault.io.fire.sw-epkt fault.io.fire.sw-fw-mismatch=37 fault.io.vf.ncx=38 +fault.memory.link-f=39 +fault.cpu.ultraSPARC-T2plus.ireg=40 +fault.cpu.ultraSPARC-T2plus.freg=41 +fault.cpu.ultraSPARC-T2plus.misc_reg=42 +fault.cpu.ultraSPARC-T2plus.itlb=43 +fault.cpu.ultraSPARC-T2plus.dtlb=44 +fault.cpu.ultraSPARC-T2plus.icache=45 +fault.cpu.ultraSPARC-T2plus.dcache=46 +fault.cpu.ultraSPARC-T2plus.mau=47 +fault.cpu.ultraSPARC-T2plus.l2data-c=48 +fault.cpu.ultraSPARC-T2plus.l2cachetag=49 +fault.cpu.ultraSPARC-T2plus.l2cachectl=50 +fault.cpu.ultraSPARC-T2plus.l2data-u=51 +fault.cpu.ultraSPARC-T2plus.lfu-f=52 +fault.cpu.ultraSPARC-T2plus.lfu-p=53 +fault.cpu.ultraSPARC-T2plus.lfu-u=54
--- a/usr/src/cmd/fm/dicts/SUN4V.po Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/dicts/SUN4V.po Mon Jul 30 12:41:05 2007 -0700 @@ -633,3 +633,259 @@ msgstr "Loss of services provided by the device\ninstances associated with this fault\n" msgid "SUN4V-8001-64.action" msgstr "Schedule a repair procedure to replace the affected\ndevice if necessary, or contact Sun for support.\n" +# +# code: SUN4V-8001-7R +# keys: fault.memory.link-f +# +msgid "SUN4V-8001-7R.type" +msgstr "Fault" +msgid "SUN4V-8001-7R.severity" +msgstr "Major" +msgid "SUN4V-8001-7R.description" +msgstr "A problem was detected in the interconnect between a memory DIMM module and\nits memory controller. A lane failover has taken place.\n Refer to %s for more information." +msgid "SUN4V-8001-7R.response" +msgstr "No automated response.\n" +msgid "SUN4V-8001-7R.impact" +msgstr "System performance may be impacted.\n" +msgid "SUN4V-8001-7R.action" +msgstr "At convenient time, try reseating the memory module(s). If problem persists,\ncontact Sun to schedule part replacement.\n" +# +# code: SUN4V-8001-8H +# keys: fault.cpu.ultraSPARC-T2plus.ireg +# +msgid "SUN4V-8001-8H.type" +msgstr "Fault" +msgid "SUN4V-8001-8H.severity" +msgstr "Minor" +msgid "SUN4V-8001-8H.description" +msgstr "The number of integer register errors associated with this thread has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-8H.response" +msgstr "The fault manager will attempt to remove the affected thread\nfrom service.\n" +msgid "SUN4V-8001-8H.impact" +msgstr "System performance may be affected. \n" +msgid "SUN4V-8001-8H.action" +msgstr "Schedule a repair procedure to replace the affected CPU, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-9D +# keys: fault.cpu.ultraSPARC-T2plus.freg +# +msgid "SUN4V-8001-9D.type" +msgstr "Fault" +msgid "SUN4V-8001-9D.severity" +msgstr "Minor" +msgid "SUN4V-8001-9D.description" +msgstr "The number of floating register errors associated with this thread has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-9D.response" +msgstr "The fault manager will attempt to remove the affected thread\nfrom service.\n" +msgid "SUN4V-8001-9D.impact" +msgstr "System performance may be affected. \n" +msgid "SUN4V-8001-9D.action" +msgstr "Schedule a repair procedure to replace the affected CPU, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-AY +# keys: fault.cpu.ultraSPARC-T2plus.misc_reg +# +msgid "SUN4V-8001-AY.type" +msgstr "Fault" +msgid "SUN4V-8001-AY.severity" +msgstr "Minor" +msgid "SUN4V-8001-AY.description" +msgstr "The number of ancillary register errors associated with this thread has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-AY.response" +msgstr "The fault manager will attempt to remove the affected thread\nfrom service.\n" +msgid "SUN4V-8001-AY.impact" +msgstr "System performance may be affected. \n" +msgid "SUN4V-8001-AY.action" +msgstr "Schedule a repair procedure to replace the affected CPU, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-C3 +# keys: fault.cpu.ultraSPARC-T2plus.itlb +# +msgid "SUN4V-8001-C3.type" +msgstr "Fault" +msgid "SUN4V-8001-C3.severity" +msgstr "Major" +msgid "SUN4V-8001-C3.description" +msgstr "The number of ITLB errors associated with this thread has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-C3.response" +msgstr "The fault manager will attempt to remove all threads associated with\nthis resource from service.\n" +msgid "SUN4V-8001-C3.impact" +msgstr "System performance is likely to be affected. \n" +msgid "SUN4V-8001-C3.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-DS +# keys: fault.cpu.ultraSPARC-T2plus.dtlb +# +msgid "SUN4V-8001-DS.type" +msgstr "Fault" +msgid "SUN4V-8001-DS.severity" +msgstr "Major" +msgid "SUN4V-8001-DS.description" +msgstr "The number of DTLB errors associated with this thread has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-DS.response" +msgstr "The fault manager will attempt to remove all threads associated with\nthis resource from service.\n" +msgid "SUN4V-8001-DS.impact" +msgstr "System performance is likely to be affected. \n" +msgid "SUN4V-8001-DS.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-E5 +# keys: fault.cpu.ultraSPARC-T2plus.icache +# +msgid "SUN4V-8001-E5.type" +msgstr "Fault" +msgid "SUN4V-8001-E5.severity" +msgstr "Major" +msgid "SUN4V-8001-E5.description" +msgstr "The number of I-cache errors associated with this thread has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-E5.response" +msgstr "The fault manager will attempt to remove all threads associated with\nthis resource from service.\n" +msgid "SUN4V-8001-E5.impact" +msgstr "System performance is likely to be affected. \n" +msgid "SUN4V-8001-E5.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-FP +# keys: fault.cpu.ultraSPARC-T2plus.dcache +# +msgid "SUN4V-8001-FP.type" +msgstr "Fault" +msgid "SUN4V-8001-FP.severity" +msgstr "Major" +msgid "SUN4V-8001-FP.description" +msgstr "The number of D-cache errors associated with this thread has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-FP.response" +msgstr "The fault manager will attempt to remove all threads associated with\nthis resource from service.\n" +msgid "SUN4V-8001-FP.impact" +msgstr "System performance is likely to be affected. \n" +msgid "SUN4V-8001-FP.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-GA +# keys: fault.cpu.ultraSPARC-T2plus.mau +# +msgid "SUN4V-8001-GA.type" +msgstr "Fault" +msgid "SUN4V-8001-GA.severity" +msgstr "Major" +msgid "SUN4V-8001-GA.description" +msgstr "The number of modular arithmetic unit errors associated with this thread has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-GA.response" +msgstr "Cryptographic software will not use this modular arithmetic unit.\n\n" +msgid "SUN4V-8001-GA.impact" +msgstr "System performance is likely to be affected. \n" +msgid "SUN4V-8001-GA.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-HJ +# keys: fault.cpu.ultraSPARC-T2plus.l2data-c +# +msgid "SUN4V-8001-HJ.type" +msgstr "Fault" +msgid "SUN4V-8001-HJ.severity" +msgstr "Critical" +msgid "SUN4V-8001-HJ.description" +msgstr "The number of level 2 cache correctable data errors has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-HJ.response" +msgstr "No automated response.\n" +msgid "SUN4V-8001-HJ.impact" +msgstr "System performance is likely to be affected. \n" +msgid "SUN4V-8001-HJ.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-JE +# keys: fault.cpu.ultraSPARC-T2plus.l2cachetag +# +msgid "SUN4V-8001-JE.type" +msgstr "Fault" +msgid "SUN4V-8001-JE.severity" +msgstr "Critical" +msgid "SUN4V-8001-JE.description" +msgstr "The number of level 2 cache tag errors has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-JE.response" +msgstr "No automated response.\n" +msgid "SUN4V-8001-JE.impact" +msgstr "System performance is likely to be affected. \n" +msgid "SUN4V-8001-JE.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-KX +# keys: fault.cpu.ultraSPARC-T2plus.l2cachectl +# +msgid "SUN4V-8001-KX.type" +msgstr "Fault" +msgid "SUN4V-8001-KX.severity" +msgstr "Critical" +msgid "SUN4V-8001-KX.description" +msgstr "The number of level 2 cache control errors has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-KX.response" +msgstr "No automated response.\n" +msgid "SUN4V-8001-KX.impact" +msgstr "System performance is likely to be affected. System may be unstable.\n" +msgid "SUN4V-8001-KX.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-L2 +# keys: fault.cpu.ultraSPARC-T2plus.l2data-u +# +msgid "SUN4V-8001-L2.type" +msgstr "Fault" +msgid "SUN4V-8001-L2.severity" +msgstr "Critical" +msgid "SUN4V-8001-L2.description" +msgstr "The number of level 2 cache uncorrectable data errors has exceeded acceptable levels.\n Refer to %s for more information." +msgid "SUN4V-8001-L2.response" +msgstr "No automated response.\n" +msgid "SUN4V-8001-L2.impact" +msgstr "System performance is likely to be affected. \n" +msgid "SUN4V-8001-L2.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-MR +# keys: fault.cpu.ultraSPARC-T2plus.lfu-f +# +msgid "SUN4V-8001-MR.type" +msgstr "Fault" +msgid "SUN4V-8001-MR.severity" +msgstr "Major" +msgid "SUN4V-8001-MR.description" +msgstr "A CPU chip's Link Framing Unit has stopped using a bad lane.\n Refer to %s for more information." +msgid "SUN4V-8001-MR.response" +msgstr "No other automated response.\n" +msgid "SUN4V-8001-MR.impact" +msgstr "The system's capacity to correct transmission errors between CPU chips has been reduced.\n" +msgid "SUN4V-8001-MR.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-N4 +# keys: fault.cpu.ultraSPARC-T2plus.lfu-p +# +msgid "SUN4V-8001-N4.type" +msgstr "Fault" +msgid "SUN4V-8001-N4.severity" +msgstr "Major" +msgid "SUN4V-8001-N4.description" +msgstr "A CPU chip's Link Framing Unit has encountered a protocol error.\n Refer to %s for more information." +msgid "SUN4V-8001-N4.response" +msgstr "No automated response.\n" +msgid "SUN4V-8001-N4.impact" +msgstr "The system has most likely taken a fatal reset.\n" +msgid "SUN4V-8001-N4.action" +msgstr "Schedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n" +# +# code: SUN4V-8001-PQ +# keys: fault.cpu.ultraSPARC-T2plus.lfu-u +# +msgid "SUN4V-8001-PQ.type" +msgstr "Fault" +msgid "SUN4V-8001-PQ.severity" +msgstr "Major" +msgid "SUN4V-8001-PQ.description" +msgstr "A CPU chip's Link Framing Unit has encountered an unrecoverable lane failure.\n Refer to %s for more information." +msgid "SUN4V-8001-PQ.response" +msgstr "No automated response.\n" +msgid "SUN4V-8001-PQ.impact" +msgstr "The system's integrity is seriously compromised.\n" +msgid "SUN4V-8001-PQ.action" +msgstr "Do not rely on this system for mission-critical tasks.\nSchedule a repair procedure to replace the affected resource, the identity of which can be determined using fmdump -v -u <EVENT_ID>.\n"
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd.h Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd.h Mon Jul 30 12:41:05 2007 -0700 @@ -158,6 +158,9 @@ fmd_stat_t dp_ignored_ue; /* # of UEs ignored due to DP fault */ fmd_stat_t dp_deferred_ue; /* # of UEs deferred due to DP error */ #endif +#ifdef sun4v + fmd_stat_t branch_creat; /* # of branch state structs created */ +#endif } cmd_stat_t; typedef struct cmd_serd { @@ -195,6 +198,9 @@ #ifdef sun4u uint16_t cmd_dp_flag; /* datapath error in progress if set */ #endif +#ifdef sun4v + cmd_list_t cmd_branches; /* List of branches state structures */ +#endif } cmd_t; extern cmd_t cmd;
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_cpu.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_cpu.c Mon Jul 30 12:41:05 2007 -0700 @@ -70,7 +70,8 @@ "ultraSPARC-IIIiplus", "ultraSPARC-T1", "SPARC64-VI", - "ultraSPARC-T2" + "ultraSPARC-T2", + "ultraSPARC-T2plus" }; /* @@ -179,6 +180,7 @@ return; } case CPU_ULTRASPARC_T2: + case CPU_ULTRASPARC_T2plus: switch (level) { case CMD_CPU_LEVEL_CORE: *cpuinit = core * UST2_CPUS_PER_CORE; @@ -238,6 +240,7 @@ return (cpuid); } case CPU_ULTRASPARC_T2: + case CPU_ULTRASPARC_T2plus: switch (level) { case CMD_CPU_LEVEL_CORE: return (cpuid/UST2_CPUS_PER_CORE); @@ -2016,6 +2019,9 @@ cpu_case_restore(hdl, cpu, &cpu->cpu_misc_regs, cp, "misc_regs"); break; + case CMD_PTR_CPU_LFU: + cpu_case_restore(hdl, cpu, &cpu->cpu_lfu, cp, "lfu"); + break; #ifdef sun4u case CMD_PTR_CPU_INV_SFSR: cpu_case_restore(hdl, cpu, &cpu->cpu_opl_invsfsr, cp,
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_cpu.h Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_cpu.h Mon Jul 30 12:41:05 2007 -0700 @@ -100,7 +100,8 @@ CPU_ULTRASPARC_IIIiplus, CPU_ULTRASPARC_T1, CPU_SPARC64_VI, - CPU_ULTRASPARC_T2 + CPU_ULTRASPARC_T2, + CPU_ULTRASPARC_T2plus } cmd_cpu_type_t; typedef struct cmd_cpu_cases { @@ -122,6 +123,7 @@ /* Tick compare (TC) */ /* Store buffer (SBD) */ /* Trap stack array errors (TSA) */ + cmd_case_t cpuc_lfu; /* Coherency link error (LFU) */ #ifdef sun4u cmd_case_t cpuc_opl_invsfsr; /* Olympus-C cpu inv-sfsr errors */ cmd_case_t cpuc_oplue_detcpu; /* Olympus-C cpu det. ue (eid=CPU) */ @@ -424,6 +426,7 @@ #define cpu_mau cpu_cases.cpuc_mau #define cpu_l2ctl cpu_cases.cpuc_l2ctl #define cpu_misc_regs cpu_cases.cpuc_misc_regs +#define cpu_lfu cpu_cases.cpuc_lfu #ifdef sun4u #define cpu_opl_invsfsr cpu_cases.cpuc_opl_invsfsr #define cpu_oplue_detcpu cpu_cases.cpuc_oplue_detcpu @@ -656,6 +659,37 @@ const char *, cmd_errcl_t); /* + * Type Fault + * --------------------------------------------------------------------- + * LFU-RTF uncorrectable link retrain fail error fault.cpu.T2plus.lfu-u + * LFU-TTO uncorrectable training timeout error + * LFU-CTO uncorrectable config timeout error + * LFU-MLF uncorrectable multi lanes link fail error + * LFU-SLF correctable single lane failover fault.cpu.T2plus.lfu-f + * + * The expected resolution of lfu faults is the repair of the indicated CPU. + */ +extern cmd_evdisp_t cmd_lfu_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *, + const char *, cmd_errcl_t); +extern cmd_evdisp_t cmd_lfu_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *, + const char *, cmd_errcl_t); +/* + * Type Fault + * --------------------------------------------------------------------- + * Coherency link protocol errors + * to Transaction timed out fault.cpu.T2plus.lfu-p + * frack Invalid or redundant request ack + * fsr Invalid or redundant snoop response + * fdr Invalid or redundant data return + * snptyp Invalid snoop type received from + * coherency link + * + * The expected resolution of lfu faults is the repair of the indicated CPU. + */ +extern cmd_evdisp_t cmd_lfu_pe(fmd_hdl_t *, fmd_event_t *, nvlist_t *, + const char *, cmd_errcl_t); + +/* * CPUs are described by FMRIs. This routine will retrieve the CPU state * structure (creating a new one if necessary) described by the detector * FMRI in the passed ereport.
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_cpuerr.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_cpuerr.c Mon Jul 30 12:41:05 2007 -0700 @@ -128,7 +128,7 @@ CMD_CPU_SIMPLEHANDLER(frc, freg, CMD_PTR_CPU_FREG, "freg", "freg") CMD_CPU_SIMPLEHANDLER(mau, mau, CMD_PTR_CPU_MAU, "mau", "mau") CMD_CPU_SIMPLEHANDLER(miscregs_ce, misc_regs, CMD_PTR_CPU_MISC_REGS, - "misc_regs", "misc_regs") + "misc_regs", "misc_reg") CMD_CPU_SIMPLEHANDLER(l2c, l2data, CMD_PTR_CPU_L2DATA, "l2data", "l2data-c") CMD_CPU_SIMPLEHANDLER(fpu, fpu, CMD_PTR_CPU_FPU, "", "fpu") @@ -136,8 +136,11 @@ CMD_CPU_SIMPLEHANDLER(iru, ireg, CMD_PTR_CPU_IREG, "", "ireg") CMD_CPU_SIMPLEHANDLER(fru, freg, CMD_PTR_CPU_FREG, "", "freg") CMD_CPU_SIMPLEHANDLER(miscregs_ue, misc_regs, CMD_PTR_CPU_MISC_REGS, - "", "misc_regs") + "", "misc_reg") CMD_CPU_SIMPLEHANDLER(l2u, l2data, CMD_PTR_CPU_L2DATA, "", "l2data-u") +CMD_CPU_SIMPLEHANDLER(lfu_ue, lfu, CMD_PTR_CPU_LFU, "", "lfu-u") +CMD_CPU_SIMPLEHANDLER(lfu_ce, lfu, CMD_PTR_CPU_LFU, "", "lfu-f") +CMD_CPU_SIMPLEHANDLER(lfu_pe, lfu, CMD_PTR_CPU_LFU, "", "lfu-p") #ifdef sun4u
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.c Mon Jul 30 12:41:05 2007 -0700 @@ -45,6 +45,7 @@ #include <sys/nvpair.h> #ifdef sun4v #include <cmd_hc_sun4v.h> +#include <cmd_branch.h> #endif /* sun4v */ /* @@ -99,6 +100,9 @@ { cmd_case_t *cc = &dimm->dimm_case; +#ifdef sun4v + cmd_branch_t *branch; +#endif if (cc->cc_cp != NULL) { cmd_case_fini(hdl, cc->cc_cp, destroy); if (cc->cc_serdnm != NULL) { @@ -111,6 +115,11 @@ if (dimm->dimm_bank != NULL) cmd_bank_remove_dimm(hdl, dimm->dimm_bank, dimm); +#ifdef sun4v + branch = cmd_branch_lookup_by_unum(hdl, dimm->dimm_unum); + if (branch != NULL) + cmd_branch_remove_dimm(hdl, branch, dimm); +#endif cmd_fmri_fini(hdl, &dimm->dimm_asru, destroy);
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_main.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_main.c Mon Jul 30 12:41:05 2007 -0700 @@ -308,8 +308,8 @@ CMD_CPU_LEVEL_CHIP }, { "ereport.cpu.*.lvu", cmd_l2ctl }, { "ereport.cpu.*.lru", cmd_l2ctl }, - { "ereport.cpu.*.fbr", cmd_nop }, - { "ereport.cpu.*.fbu", cmd_ue, CMD_ERRCL_DAU }, + { "ereport.cpu.*.fbr", cmd_fb }, + { "ereport.cpu.*.fbu", cmd_fb }, { "ereport.cpu.*.dac", cmd_ce, CMD_ERRCL_DAC }, { "ereport.cpu.*.dsc", cmd_ce, CMD_ERRCL_DSC }, { "ereport.cpu.*.dau", cmd_ue, CMD_ERRCL_DAU }, @@ -342,6 +342,19 @@ CMD_ERRCL_SBDPC | CMD_CPU_LEVEL_THREAD }, { "ereport.cpu.*.tsau", cmd_miscregs_ue, CMD_CPU_LEVEL_THREAD }, + { "ereport.cpu.*.cbce", cmd_xxc, CMD_CPU_LEVEL_CHIP }, + { "ereport.cpu.*.dce", cmd_nop }, + { "ereport.cpu.*.wbue", cmd_nop }, + { "ereport.cpu.*.lfu-slf", cmd_lfu_ce, CMD_CPU_LEVEL_CHIP }, + { "ereport.cpu.*.lfu-rtf", cmd_lfu_ue, CMD_CPU_LEVEL_CHIP }, + { "ereport.cpu.*.lfu-tto", cmd_lfu_ue, CMD_CPU_LEVEL_CHIP }, + { "ereport.cpu.*.lfu-cto", cmd_lfu_ue, CMD_CPU_LEVEL_CHIP }, + { "ereport.cpu.*.lfu-mlf", cmd_lfu_ue, CMD_CPU_LEVEL_CHIP }, + { "ereport.cpu.*.frack", cmd_lfu_pe, CMD_CPU_LEVEL_CHIP }, + { "ereport.cpu.*.fsr", cmd_lfu_pe, CMD_CPU_LEVEL_CHIP }, + { "ereport.cpu.*.fdr", cmd_lfu_pe, CMD_CPU_LEVEL_CHIP }, + { "ereport.cpu.*.to", cmd_lfu_pe, CMD_CPU_LEVEL_CHIP }, + { "ereport.cpu.*.snptyp", cmd_lfu_pe, CMD_CPU_LEVEL_CHIP }, #endif /* sun4u */ { "ereport.cpu.*.fpu.hwcopy", cmd_fpu }, { NULL, NULL } @@ -456,6 +469,9 @@ { "dp_deferred_ue", FMD_TYPE_UINT64, "memory UEs deferred due to DP error" }, #endif +#ifdef sun4v + { "branch_creat", FMD_TYPE_UINT64, "created new mem branch structure" }, +#endif }; static const fmd_prop_t fmd_props[] = { @@ -507,6 +523,10 @@ { "thresh_abs_badrw", FMD_TYPE_UINT64, "128" }, { "max_perm_ce_dimm", FMD_TYPE_UINT32, "128" }, { "miscregs_trdelay", FMD_TYPE_TIME, "45s"}, +#ifdef sun4v + { "fbr_n", FMD_TYPE_UINT32, "14" }, + { "fbr_t", FMD_TYPE_TIME, "30min"}, +#endif { NULL, 0, NULL } }; @@ -588,6 +608,7 @@ fmd_hdl_subscribe(hdl, "ereport.cpu.ultraSPARC-IV.*"); fmd_hdl_subscribe(hdl, "ereport.cpu.ultraSPARC-IVplus.*"); fmd_hdl_subscribe(hdl, "ereport.cpu.ultraSPARC-T2.*"); + fmd_hdl_subscribe(hdl, "ereport.cpu.ultraSPARC-T2plus.*"); fmd_hdl_subscribe(hdl, "ereport.cpu.ultraSPARC-T1.*"); fmd_hdl_subscribe(hdl, "ereport.io.tom.ecc.drce");
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_mem.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_mem.c Mon Jul 30 12:41:05 2007 -0700 @@ -36,6 +36,9 @@ #ifdef sun4u #include <cmd_dp.h> #endif +#ifdef sun4v +#include <cmd_branch.h> +#endif #include <errno.h> #include <strings.h> @@ -253,6 +256,9 @@ { cmd_dimm_gc(hdl); cmd_bank_gc(hdl); +#ifdef sun4v + cmd_branch_gc(hdl); +#endif } void @@ -262,6 +268,9 @@ cmd_dimm_fini(hdl); cmd_bank_fini(hdl); +#ifdef sun4v + cmd_branch_fini(hdl); +#endif while ((rf = cmd_list_next(&cmd.cmd_iorxefrx)) != NULL) cmd_iorxefrx_free(hdl, rf);
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_mem.h Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_mem.h Mon Jul 30 12:41:05 2007 -0700 @@ -78,6 +78,9 @@ typedef struct cmd_dimm cmd_dimm_t; typedef struct cmd_bank cmd_bank_t; +#ifdef sun4v +typedef struct cmd_branch cmd_branch_t; +#endif /* * Correctable and Uncorrectable memory errors @@ -129,6 +132,11 @@ uint16_t, uint8_t, cmd_cpu_t *); extern void cmd_dimm_close(fmd_hdl_t *, void *); extern void cmd_bank_close(fmd_hdl_t *, void *); +#ifdef sun4v +extern void cmd_branch_close(fmd_hdl_t *, void *); +extern cmd_evdisp_t cmd_fb(fmd_hdl_t *, fmd_event_t *, nvlist_t *, + const char *, cmd_errcl_t); +#endif /* * US-IIIi I/O, Remote and Foreign Read memory errors
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_state.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_state.c Mon Jul 30 12:41:05 2007 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -40,6 +40,9 @@ #endif #include <cmd_bank.h> #include <cmd.h> +#ifdef sun4v +#include <cmd_branch.h> +#endif #include <string.h> #include <fm/fmd_api.h> @@ -98,7 +101,13 @@ cmd_cpuerr_close, /* CMD_PTR_CPU_UGESR_DAE */ cmd_cpuerr_close, /* CMD_PTR_CPU_UGESR_IAE */ cmd_cpuerr_close, /* CMD_PTR_CPU_UGESR_UGE */ - cmd_cpuerr_close /* CMD_PTR_CPU_MISC_REGS */ + cmd_cpuerr_close, /* CMD_PTR_CPU_MISC_REGS */ + cmd_cpuerr_close, /* CMD_PTR_CPU_LFU */ +#ifdef sun4v + cmd_branch_close /* CMD_PTR_BRANCH_CASE */ +#else + NULL +#endif }; fmd_case_t * @@ -171,6 +180,9 @@ #ifdef sun4u cmd_dp_restore /* CMD_NT_DP */ #endif +#ifdef sun4v + cmd_branch_restore /* CMD_NT_BRANCH */ +#endif }; int @@ -222,6 +234,9 @@ cmd_dp_validate(hdl); #endif cmd_page_validate(hdl); +#ifdef sun4v + cmd_branch_validate(hdl); +#endif return (0); }
--- a/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_state.h Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_state.h Mon Jul 30 12:41:05 2007 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -71,7 +71,12 @@ CMD_NT_DIMM, CMD_NT_BANK, CMD_NT_PAGE, +#ifdef sun4u CMD_NT_DP +#endif +#ifdef sun4v + CMD_NT_BRANCH +#endif } cmd_nodetype_t; /* @@ -123,7 +128,9 @@ CMD_PTR_CPU_UGESR_DAE, CMD_PTR_CPU_UGESR_IAE, CMD_PTR_CPU_UGESR_UGE, - CMD_PTR_CPU_MISC_REGS + CMD_PTR_CPU_MISC_REGS, + CMD_PTR_CPU_LFU, + CMD_PTR_BRANCH_CASE } cmd_ptrsubtype_t; /*
--- a/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/Makefile Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/Makefile Mon Jul 30 12:41:05 2007 -0700 @@ -26,7 +26,8 @@ ARCH = sun4v -sun4v_SRCS = cmd_hc_sun4v.c +sun4v_SRCS = cmd_hc_sun4v.c \ + cmd_branch.c INCDIRS = $(SRC)/uts/sun4v \ $(ROOT)/usr/platform/sun4v/include
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_branch.c Mon Jul 30 12:41:05 2007 -0700 @@ -0,0 +1,453 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <cmd_mem.h> +#include <cmd_branch.h> +#include <cmd_dimm.h> +#include <cmd.h> +#include <cmd_hc_sun4v.h> + +#include <errno.h> +#include <string.h> +#include <strings.h> +#include <fcntl.h> +#include <unistd.h> +#include <fm/fmd_api.h> +#include <sys/fm/protocol.h> +#include <sys/mem.h> +#include <sys/nvpair.h> + +#define BUF_SIZE 120 + +void +cmd_branch_add_dimm(fmd_hdl_t *hdl, cmd_branch_t *branch, cmd_dimm_t *dimm) +{ + cmd_branch_memb_t *bm; + + if (dimm == NULL) + return; + + fmd_hdl_debug(hdl, "Attaching dimm %s to branch %s\n", + dimm->dimm_unum, branch->branch_unum); + bm = fmd_hdl_zalloc(hdl, sizeof (cmd_branch_memb_t), FMD_SLEEP); + bm->dimm = dimm; + cmd_list_append(&branch->branch_dimms, bm); +} + +void +cmd_branch_remove_dimm(fmd_hdl_t *hdl, cmd_branch_t *branch, cmd_dimm_t *dimm) +{ + cmd_branch_memb_t *bm; + + fmd_hdl_debug(hdl, "Detaching dimm %s from branch %s\n", + dimm->dimm_unum, branch->branch_unum); + + for (bm = cmd_list_next(&branch->branch_dimms); bm != NULL; + bm = cmd_list_next(bm)) { + if (bm->dimm == dimm) { + cmd_list_delete(&branch->branch_dimms, bm); + fmd_hdl_free(hdl, bm, sizeof (cmd_branch_memb_t)); + return; + } + } + + fmd_hdl_abort(hdl, + "Attempt to disconnect dimm from non-parent branch\n"); +} + +static cmd_dimm_t * +branch_dimm_create(fmd_hdl_t *hdl, char *dimm_unum) +{ + nvlist_t *fmri; + cmd_dimm_t *dimm; + + fmri = cmd_mem_fmri_create(dimm_unum); + + if (fmri != NULL && (fmd_nvl_fmri_expand(hdl, fmri) == 0)) { + dimm = cmd_dimm_create(hdl, fmri); + if (dimm != NULL) { + nvlist_free(fmri); + return (dimm); + } + } + + nvlist_free(fmri); + return (NULL); +} + +/* + * The cmd_dimm_t structure created for a DIMM in a branch never has a + * Jxxx in its unum; the cmd_dimm_t structure created for a DIMM containing + * a page, or in a bank (i.e. for ECC errors)-always-has a Jxxx in its + * unum. Therefore the set of cmd_dimm_t's created for a branch is always + * disjoint from the set of cmd_dimm_t's created for pages and/or banks, so + * the cmd_dimm_create will never link a 'branch' cmd_dimm_t into bank. + * Faulting a DIMM for ECC will not prevent subsequent faulting of "same" + * dimm for FBR/FBU and vice versa + */ +static int +branch_dimmlist_create(fmd_hdl_t *hdl, cmd_branch_t *branch) +{ + int channel, d; + char dimm_unum[BUF_SIZE]; + cmd_dimm_t *dimm; + int dimm_count = 0; + + for (channel = 0; channel < MAX_CHANNELS_ON_CHIP; channel++) { + for (d = 0; d < MAX_DIMMS_IN_CHANNEL; d++) { + (void) snprintf(dimm_unum, BUF_SIZE, + "%s/CH%1d/D%1d", branch->branch_unum, channel, d); + dimm = branch_dimm_create(hdl, dimm_unum); + if (dimm != NULL) { + cmd_branch_add_dimm(hdl, branch, dimm); + dimm_count++; + } + } + } + return (dimm_count); +} + +void +cmd_branch_create_fault(fmd_hdl_t *hdl, cmd_branch_t *branch, + const char *fltnm, nvlist_t *asru) +{ + nvlist_t *flt; + cmd_branch_memb_t *bm; + cmd_dimm_t *dimm; + int dimm_count; + uint_t cert = 0; + + /* attach the dimms to the branch */ + dimm_count = branch_dimmlist_create(hdl, branch); + + if (dimm_count != 0) + cert = (100 - CMD_MBFAULT_CERT) / dimm_count; + + /* create motherboard fault */ + flt = cmd_motherboard_create_fault(hdl, asru, fltnm, CMD_MBFAULT_CERT); + fmd_case_add_suspect(hdl, branch->branch_case.cc_cp, flt); + + /* create dimm faults */ + for (bm = cmd_list_next(&branch->branch_dimms); bm != NULL; + bm = cmd_list_next(bm)) { + dimm = bm->dimm; + if (dimm != NULL) { + dimm->dimm_flags |= CMD_MEM_F_FAULTING; + cmd_dimm_dirty(hdl, dimm); + flt = cmd_dimm_create_fault(hdl, dimm, fltnm, cert); + fmd_case_add_suspect(hdl, branch->branch_case.cc_cp, + flt); + } + } +} + +cmd_branch_t * +cmd_branch_create(fmd_hdl_t *hdl, nvlist_t *asru) +{ + cmd_branch_t *branch; + const char *b_unum; + + if ((b_unum = cmd_fmri_get_unum(asru)) == NULL) { + CMD_STAT_BUMP(bad_mem_asru); + return (NULL); + } + + fmd_hdl_debug(hdl, "branch_create: creating new branch %s\n", b_unum); + CMD_STAT_BUMP(branch_creat); + + branch = fmd_hdl_zalloc(hdl, sizeof (cmd_branch_t), FMD_SLEEP); + branch->branch_nodetype = CMD_NT_BRANCH; + branch->branch_version = CMD_BRANCH_VERSION; + + cmd_bufname(branch->branch_bufname, sizeof (branch->branch_bufname), + "branch_%s", b_unum); + cmd_fmri_init(hdl, &branch->branch_asru, asru, "branch_asru_%s", + b_unum); + + (void) nvlist_lookup_string(branch->branch_asru_nvl, FM_FMRI_MEM_UNUM, + (char **)&branch->branch_unum); + + cmd_list_append(&cmd.cmd_branches, branch); + cmd_branch_dirty(hdl, branch); + + return (branch); +} + +cmd_branch_t * +cmd_branch_lookup_by_unum(fmd_hdl_t *hdl, const char *unum) +{ + cmd_branch_t *branch; + + fmd_hdl_debug(hdl, "branch_lookup: dimm_unum %s", unum); + /* + * fbr/fbu unum dimm does not have a J number + */ + if (strstr(unum, "J") != NULL) + return (NULL); + + for (branch = cmd_list_next(&cmd.cmd_branches); branch != NULL; + branch = cmd_list_next(branch)) { + if (strncmp(branch->branch_unum, unum, BRANCH_UNUM_LEN) == 0) + return (branch); + } + + fmd_hdl_debug(hdl, "branch_lookup_by_unum: no branch is found\n"); + return (NULL); +} + +cmd_branch_t * +cmd_branch_lookup(fmd_hdl_t *hdl, nvlist_t *asru) +{ + cmd_branch_t *branch; + const char *unum; + + if ((unum = cmd_fmri_get_unum(asru)) == NULL) { + CMD_STAT_BUMP(bad_mem_asru); + return (NULL); + } + + for (branch = cmd_list_next(&cmd.cmd_branches); branch != NULL; + branch = cmd_list_next(branch)) { + if (strncmp(branch->branch_unum, unum, BRANCH_UNUM_LEN) == 0) + return (branch); + } + + fmd_hdl_debug(hdl, "cmd_branch_lookup: discarding old \n"); + return (NULL); +} + +static cmd_branch_t * +branch_wrapv0(fmd_hdl_t *hdl, cmd_branch_pers_t *pers, size_t psz) +{ + cmd_branch_t *branch; + + if (psz != sizeof (cmd_branch_pers_t)) { + fmd_hdl_abort(hdl, "size of state doesn't match size of " + "version 0 state (%u bytes).\n", + sizeof (cmd_branch_pers_t)); + } + + branch = fmd_hdl_zalloc(hdl, sizeof (cmd_branch_t), FMD_SLEEP); + bcopy(pers, branch, sizeof (cmd_branch_pers_t)); + fmd_hdl_free(hdl, pers, psz); + return (branch); +} + +void * +cmd_branch_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr) +{ + cmd_branch_t *branch; + size_t branchsz; + + + for (branch = cmd_list_next(&cmd.cmd_branches); branch != NULL; + branch = cmd_list_next(branch)) { + if (strcmp(branch->branch_bufname, ptr->ptr_name) == 0) + break; + } + + if (branch == NULL) { + fmd_hdl_debug(hdl, "restoring branch from %s\n", ptr->ptr_name); + + if ((branchsz = fmd_buf_size(hdl, NULL, ptr->ptr_name)) == 0) { + fmd_hdl_abort(hdl, "branch referenced by case %s does " + "not exist in saved state\n", + fmd_case_uuid(hdl, cp)); + } else if (branchsz > CMD_BRANCH_MAXSIZE || + branchsz < CMD_BRANCH_MINSIZE) { + fmd_hdl_abort(hdl, + "branch buffer referenced by case %s " + "is out of bounds (is %u bytes, max %u, min %u)\n", + fmd_case_uuid(hdl, cp), branchsz, + CMD_BRANCH_MAXSIZE, CMD_BRANCH_MINSIZE); + } + + if ((branch = cmd_buf_read(hdl, NULL, ptr->ptr_name, + branchsz)) == NULL) { + fmd_hdl_abort(hdl, "failed to read branch buf %s", + ptr->ptr_name); + } + + fmd_hdl_debug(hdl, "found %d in version field\n", + branch->branch_version); + + switch (branch->branch_version) { + case CMD_BRANCH_VERSION_0: + branch = branch_wrapv0(hdl, + (cmd_branch_pers_t *)branch, branchsz); + break; + default: + fmd_hdl_abort(hdl, "unknown version (found %d) " + "for branch state referenced by case %s.\n", + branch->branch_version, fmd_case_uuid(hdl, + cp)); + break; + } + + cmd_fmri_restore(hdl, &branch->branch_asru); + + if ((errno = nvlist_lookup_string(branch->branch_asru_nvl, + FM_FMRI_MEM_UNUM, (char **)&branch->branch_unum)) != 0) + fmd_hdl_abort(hdl, "failed to retrieve unum from asru"); + + + cmd_list_append(&cmd.cmd_branches, branch); + } + + switch (ptr->ptr_subtype) { + case CMD_PTR_BRANCH_CASE: + cmd_mem_case_restore(hdl, &branch->branch_case, cp, "branch", + branch->branch_unum); + break; + default: + fmd_hdl_abort(hdl, "invalid %s subtype %d\n", + ptr->ptr_name, ptr->ptr_subtype); + } + + return (branch); +} + +void +cmd_branch_dirty(fmd_hdl_t *hdl, cmd_branch_t *branch) +{ + if (fmd_buf_size(hdl, NULL, branch->branch_bufname) != + sizeof (cmd_branch_pers_t)) + fmd_buf_destroy(hdl, NULL, branch->branch_bufname); + + /* No need to rewrite the FMRIs in the branch - they don't change */ + fmd_buf_write(hdl, NULL, branch->branch_bufname, &branch->branch_pers, + sizeof (cmd_branch_pers_t)); +} + +static void +branch_dimmlist_free(fmd_hdl_t *hdl, cmd_branch_t *branch) +{ + cmd_branch_memb_t *bm; + + while ((bm = cmd_list_next(&branch->branch_dimms)) != NULL) { + cmd_list_delete(&branch->branch_dimms, bm); + fmd_hdl_free(hdl, bm, sizeof (cmd_branch_memb_t)); + } +} + +static void +branch_free(fmd_hdl_t *hdl, cmd_branch_t *branch, int destroy) +{ + fmd_hdl_debug(hdl, "Free branch %s\n", branch->branch_unum); + if (branch->branch_case.cc_cp != NULL) { + if (destroy) { + if (branch->branch_case.cc_serdnm != NULL) { + fmd_serd_destroy(hdl, + branch->branch_case.cc_serdnm); + fmd_hdl_strfree(hdl, + branch->branch_case.cc_serdnm); + branch->branch_case.cc_serdnm = NULL; + } + } + cmd_case_fini(hdl, branch->branch_case.cc_cp, destroy); + } + + branch_dimmlist_free(hdl, branch); + cmd_fmri_fini(hdl, &branch->branch_asru, destroy); + + if (destroy) + fmd_buf_destroy(hdl, NULL, branch->branch_bufname); + cmd_list_delete(&cmd.cmd_branches, branch); + fmd_hdl_free(hdl, branch, sizeof (cmd_branch_t)); +} + +void +cmd_branch_destroy(fmd_hdl_t *hdl, cmd_branch_t *branch) +{ + branch_free(hdl, branch, FMD_B_TRUE); +} + +int +branch_exist(fmd_hdl_t *hdl, cmd_branch_t *branch) +{ + char dimm_unum[BUF_SIZE]; + int channel, d; + nvlist_t *fmri; + + fmd_hdl_debug(hdl, "branch_exist"); + for (channel = 0; channel < MAX_CHANNELS_ON_CHIP; channel++) { + for (d = 0; d < MAX_DIMMS_IN_CHANNEL; d++) { + (void) snprintf(dimm_unum, BUF_SIZE, "%s/CH%1d/D%1d", + branch->branch_unum, channel, d); + fmri = cmd_mem_fmri_create(dimm_unum); + if (fmri != NULL && + (fmd_nvl_fmri_expand(hdl, fmri) == 0)) { + nvlist_free(fmri); + return (1); + } + nvlist_free(fmri); + } + } + fmd_hdl_debug(hdl, "branch %s does not exist\n", branch->branch_unum); + return (0); +} + +/* + * If the case has been solved, don't need to check the dimmlist + * If the case has not been solved, the branch is valid if there is least one + * existing dimm in the branch + */ +void +cmd_branch_validate(fmd_hdl_t *hdl) +{ + cmd_branch_t *branch, *next; + + fmd_hdl_debug(hdl, "cmd_branch_validate\n"); + + for (branch = cmd_list_next(&cmd.cmd_branches); branch != NULL; + branch = next) { + next = cmd_list_next(branch); + if (branch->branch_case.cc_cp != NULL && + fmd_case_solved(hdl, branch->branch_case.cc_cp)) + continue; + if (branch_exist(hdl, branch)) + continue; + cmd_branch_destroy(hdl, branch); + } +} + +void +cmd_branch_gc(fmd_hdl_t *hdl) +{ + fmd_hdl_debug(hdl, "cmd_branch_gc\n"); + cmd_branch_validate(hdl); +} + +void +cmd_branch_fini(fmd_hdl_t *hdl) +{ + cmd_branch_t *branch; + fmd_hdl_debug(hdl, "cmd_branch_fini\n"); + + while ((branch = cmd_list_next(&cmd.cmd_branches)) != NULL) + branch_free(hdl, branch, FMD_B_FALSE); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_branch.h Mon Jul 30 12:41:05 2007 -0700 @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _CMD_BRANCH_H +#define _CMD_BRANCH_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Memory modules are described by the cmd_branch general-purpose state + * structure. This structure is used to track FBR errors + * + * Data structures: + * + * ,--------. ,--------. + * |branch | <---- |case_ptr| (CMD_PTR_BRANCH_CASE) + * | | `--------' + * |,-------| ,-------------. + * ,->||asru_t | ----> |packed nvlist| + * | |`-------| `-------------' + * `--| | + * | dimms | ----> cmd_branch_memb_t -----> cmd_branch_memb_t -----> ... + * `--------' | | + * cmd_dimm_t cmd_dimm_t + * + * Data structure P? Case? Notes + * ---------------- --- ----- ---------------------------------------------- + * cmd_branch_pers_t Yes No Name is derived from the unum ("branch_%s") + * cmd_case_ptr_t Yes Yes Name is case's UUID + * branch_asru Yes No Name is derived from the unum + * ("branch_asru_%d") + * branch_unum No No Pointer into ASRU - relinked during restore + * branch_dimms No No Recreated during restore + */ + +#include <cmd_mem.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_CHANNELS_ON_CHIP 4 +#define MAX_DIMMS_IN_CHANNEL 4 +#define BRANCH_UNUM_LEN 11 +#define CMD_MBFAULT_CERT 30 + +typedef struct cmd_branch_memb { + cmd_list_t bm_list; + cmd_dimm_t *dimm; +} cmd_branch_memb_t; + +#define CMD_BRANCH_VERSION_0 0 +#define CMD_BRANCH_VERSION CMD_BRANCH_VERSION_0 + +typedef struct cmd_branch_pers { + cmd_header_t branchp_header; /* Nodetype must be CMD_NT_BRANCH */ + uint_t branchp_version; + cmd_fmri_t branchp_asru; /* ASRU for this BRANCH */ + uint_t branchp_flags; /* CMD_MEM_F_* */ +} cmd_branch_pers_t; + +struct cmd_branch { + cmd_branch_pers_t branch_pers; + const char *branch_unum; /* This BRANCH's name */ + cmd_case_t branch_case; /* Open link errors case against */ + /* this BRANCH */ + cmd_list_t branch_dimms; /* This BRANCH's dimms */ +}; + +#define CMD_BRANCH_MAXSIZE sizeof (cmd_branch_pers_t) +#define CMD_BRANCH_MINSIZE sizeof (cmd_branch_pers_t) + +#define branch_header branch_pers.branchp_header +#define branch_nodetype branch_pers.branchp_header.hdr_nodetype +#define branch_bufname branch_pers.branchp_header.hdr_bufname +#define branch_version branch_pers.branchp_version +#define branch_asru branch_pers.branchp_asru +#define branch_asru_nvl branch_pers.branchp_asru.fmri_nvl +#define branch_flags branch_pers.branchp_flags + +extern cmd_branch_t *cmd_branch_lookup(fmd_hdl_t *, nvlist_t *); +extern cmd_branch_t *cmd_branch_create(fmd_hdl_t *, nvlist_t *); +extern cmd_branch_t *cmd_branch_lookup_by_unum(fmd_hdl_t *, const char *); + +extern void cmd_branch_create_fault(fmd_hdl_t *, cmd_branch_t *, + const char *, nvlist_t *); +extern void cmd_branch_add_dimm(fmd_hdl_t *, cmd_branch_t *, cmd_dimm_t *); +extern void cmd_branch_remove_dimm(fmd_hdl_t *, cmd_branch_t *, cmd_dimm_t *); + + +extern void *cmd_branch_restore(fmd_hdl_t *, fmd_case_t *, cmd_case_ptr_t *); +extern void cmd_branch_destroy(fmd_hdl_t *, cmd_branch_t *); +extern void cmd_branch_validate(fmd_hdl_t *); +extern void cmd_branch_gc(fmd_hdl_t *); +extern void cmd_branch_fini(fmd_hdl_t *); +extern void cmd_branch_dirty(fmd_hdl_t *, cmd_branch_t *); + + +#ifdef __cplusplus +} +#endif + +#endif /* _CMD_BRANCH_H */
--- a/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_hc_sun4v.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_hc_sun4v.c Mon Jul 30 12:41:05 2007 -0700 @@ -39,3 +39,60 @@ fmd_hdl_error(hdl, "unable to alloc location for fault\n"); return (flt); } + +nvlist_t * +cmd_motherboard_fru_create(fmd_hdl_t *hdl, nvlist_t *asru) +{ + nvlist_t *fru, *hcelem; + char *serialstr, *partstr; + + if (nvlist_lookup_string(asru, FM_FMRI_HC_SERIAL_ID, &serialstr) != 0) + serialstr = NULL; + if (nvlist_lookup_string(asru, FM_FMRI_HC_PART, &partstr) != 0) + partstr = NULL; + + if (nvlist_alloc(&hcelem, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + if (nvlist_add_string(hcelem, FM_FMRI_HC_NAME, "motherboard") != 0 || + nvlist_add_string(hcelem, FM_FMRI_HC_ID, "0") != 0) { + nvlist_free(hcelem); + return (NULL); + } + + if (nvlist_alloc(&fru, NV_UNIQUE_NAME, 0) != 0) { + fmd_hdl_debug(hdl, "Failed to allocate memory"); + nvlist_free(hcelem); + return (NULL); + } + + if (nvlist_add_uint8(fru, FM_VERSION, FM_HC_SCHEME_VERSION) != 0 || + nvlist_add_string(fru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0 || + nvlist_add_string(fru, FM_FMRI_HC_ROOT, "/") != 0 || + nvlist_add_uint32(fru, FM_FMRI_HC_LIST_SZ, 1) != 0 || + nvlist_add_nvlist_array(fru, FM_FMRI_HC_LIST, &hcelem, 1) != 0 || + (serialstr != NULL && + nvlist_add_string(fru, FM_FMRI_HC_SERIAL_ID, serialstr) != 0) || + (partstr != NULL && + nvlist_add_string(fru, FM_FMRI_HC_PART, partstr) != 0)) { + nvlist_free(hcelem); + nvlist_free(fru); + return (NULL); + } + nvlist_free(hcelem); + return (fru); +} + +nvlist_t * +cmd_motherboard_create_fault(fmd_hdl_t *hdl, nvlist_t *asru, const char *fltnm, + uint_t cert) +{ + nvlist_t *mb_fru, *flt; + + mb_fru = cmd_motherboard_fru_create(hdl, asru); + flt = fmd_nvl_create_fault(hdl, fltnm, cert, mb_fru, mb_fru, NULL); + flt = cmd_fault_add_location(hdl, flt, "MB"); + if (mb_fru != NULL) + nvlist_free(mb_fru); + return (flt); +}
--- a/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_hc_sun4v.h Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_hc_sun4v.h Mon Jul 30 12:41:05 2007 -0700 @@ -36,6 +36,9 @@ #endif extern nvlist_t *cmd_fault_add_location(fmd_hdl_t *, nvlist_t *, const char *); +extern nvlist_t *cmd_motherboard_fru_create(fmd_hdl_t *, nvlist_t *); +extern nvlist_t *cmd_motherboard_create_fault(fmd_hdl_t *, nvlist_t *, + const char *, uint_t); #ifdef __cplusplus }
--- a/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_memerr_arch.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4v/cpumem-diagnosis/cmd_memerr_arch.c Mon Jul 30 12:41:05 2007 -0700 @@ -34,6 +34,8 @@ #include <cmd_bank.h> #include <cmd_page.h> #include <cmd_cpu.h> +#include <cmd_branch.h> +#include <cmd_state.h> #include <cmd.h> #include <assert.h> @@ -53,6 +55,11 @@ #include <sys/fm/ldom.h> #include <ctype.h> +#define VF_TS3_FCR 0x000000000000FFFFULL +#define VF_L2ESYR_C2C 0x8000000000000000ULL +#define UTS2_CPUS_PER_CHIP 64 +#define FBR_ERROR ".fbr" + extern ldom_hdl_t *cpumem_diagnosis_lhp; static fmd_hdl_t *cpumem_hdl = NULL; @@ -96,6 +103,73 @@ return (CMD_EVD_OK); } +static int +cpu_present(fmd_hdl_t *hdl, nvlist_t *asru, uint32_t *cpuid) +{ + nvlist_t *cp_asru; + uint32_t i; + + if (nvlist_dup(asru, &cp_asru, 0) != 0) { + fmd_hdl_debug(hdl, "unable to alloc asru for thread\n"); + return (-1); + } + + for (i = *cpuid; i < *cpuid + UTS2_CPUS_PER_CHIP; i++) { + + (void) nvlist_remove_all(cp_asru, FM_FMRI_CPU_ID); + + if (nvlist_add_uint32(cp_asru, FM_FMRI_CPU_ID, i) == 0) { + if (fmd_nvl_fmri_present(hdl, cp_asru) && + !fmd_nvl_fmri_unusable(hdl, cp_asru)) { + nvlist_free(cp_asru); + *cpuid = i; + return (0); + } + } + } + nvlist_free(cp_asru); + return (-1); +} + +/*ARGSUSED*/ +cmd_evdisp_t +cmd_c2c(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class, + cmd_errcl_t clcode) +{ + uint32_t cpuid; + nvlist_t *det; + int rc; + + (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, &det); + if (nvlist_lookup_uint32(det, FM_FMRI_CPU_ID, &cpuid) == 0) { + /* + * If the c2c bit is set, the sending cache of the + * cpu must be faulted instead of the memory. + * If the detector is chip0, the cache of the chip1 + * is faulted and vice versa. + */ + if (cpuid < UTS2_CPUS_PER_CHIP) + cpuid = UTS2_CPUS_PER_CHIP; + else + cpuid = 0; + + rc = cpu_present(hdl, det, &cpuid); + + if (rc != -1) { + (void) nvlist_remove(det, FM_FMRI_CPU_ID, + DATA_TYPE_UINT32); + if (nvlist_add_uint32(det, + FM_FMRI_CPU_ID, cpuid) == 0) { + clcode |= CMD_CPU_LEVEL_CHIP; + return (cmd_l2u(hdl, ep, nvl, class, clcode)); + } + + } + } + fmd_hdl_debug(hdl, "cmd_c2c: no cpuid discarding C2C error"); + return (CMD_EVD_BAD); +} + /* * sun4v's xe_common routine has an extra argument, clcode, compared * to routine of same name in sun4u. @@ -106,7 +180,7 @@ const char *class, cmd_errcl_t clcode, cmd_xe_handler_f *hdlr) { uint64_t afar, l2_afar, dram_afar; - uint64_t l2_afsr, dram_afsr; + uint64_t l2_afsr, dram_afsr, l2_esyr; uint16_t synd; uint8_t afar_status, synd_status; nvlist_t *rsrc; @@ -176,6 +250,13 @@ afar_status = ((l2_afsr & NI_L2AFSR_P05) == 0) ? AFLT_STAT_VALID : AFLT_STAT_INVALID; synd_status = AFLT_STAT_VALID; + + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_L2_ESYR, + &l2_esyr) == 0) { + if (l2_esyr & VF_L2ESYR_C2C) { + return (cmd_c2c(hdl, ep, nvl, class, clcode)); + } + } break; case CMD_ERRCL_DSU: afar = dram_afar; @@ -191,6 +272,7 @@ synd_status, cmd_mem_name2type(typenm, minorvers), disp, rsrc)); } + /*ARGSUSED*/ cmd_evdisp_t cmd_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class, @@ -215,6 +297,110 @@ return (CMD_EVD_UNUSED); } + +/*ARGSUSED*/ +cmd_evdisp_t +cmd_fb(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class, + cmd_errcl_t clcode) +{ + cmd_branch_t *branch; + const char *uuid; + nvlist_t *asru, *det; + uint64_t ts3_fcr; + + if (nvlist_lookup_nvlist(nvl, FM_RSRC_RESOURCE, &asru) < 0) { + CMD_STAT_BUMP(bad_mem_asru); + return (NULL); + } + + if (nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, &det) < 0) { + CMD_STAT_BUMP(bad_mem_asru); + return (NULL); + } + + if (fmd_nvl_fmri_expand(hdl, det) < 0) { + fmd_hdl_debug(hdl, "Failed to expand detector"); + return (NULL); + } + + branch = cmd_branch_lookup(hdl, asru); + if (branch == NULL) { + if ((branch = cmd_branch_create(hdl, asru)) == NULL) + return (CMD_EVD_UNUSED); + } + + if (branch->branch_case.cc_cp != NULL && + fmd_case_solved(hdl, branch->branch_case.cc_cp)) { + fmd_hdl_debug(hdl, "Case solved\n"); + return (CMD_EVD_REDUND); + } + + if (branch->branch_case.cc_cp == NULL) { + branch->branch_case.cc_cp = cmd_case_create(hdl, + &branch->branch_header, CMD_PTR_BRANCH_CASE, &uuid); + } + + if (strcmp(strrchr(class, '.'), FBR_ERROR) == 0) { + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_TS3_FCR, + &ts3_fcr) == 0 && (ts3_fcr != VF_TS3_FCR)) { + fmd_hdl_debug(hdl, + "Processing fbr with lane failover\n"); + cmd_branch_create_fault(hdl, branch, + "fault.memory.link-f", det); + + } else { + fmd_hdl_debug(hdl, "Adding fbr event to serd engine\n"); + if (branch->branch_case.cc_serdnm == NULL) { + branch->branch_case.cc_serdnm = + cmd_mem_serdnm_create(hdl, + "branch", branch->branch_unum); + + fmd_serd_create(hdl, + branch->branch_case.cc_serdnm, + fmd_prop_get_int32(hdl, "fbr_n"), + fmd_prop_get_int64(hdl, "fbr_t")); + } + + if (fmd_serd_record(hdl, + branch->branch_case.cc_serdnm, ep) == FMD_B_FALSE) + return (CMD_EVD_OK); /* engine hasn't fired */ + + fmd_hdl_debug(hdl, "fbr serd fired\n"); + + fmd_case_add_serd(hdl, branch->branch_case.cc_cp, + branch->branch_case.cc_serdnm); + + cmd_branch_create_fault(hdl, branch, + "fault.memory.link-c", det); + } + } else { + fmd_hdl_debug(hdl, "Processing fbu event"); + cmd_branch_create_fault(hdl, branch, "fault.memory.link-u", + det); + } + + branch->branch_flags |= CMD_MEM_F_FAULTING; + + if (branch->branch_case.cc_serdnm != NULL) { + fmd_serd_destroy(hdl, branch->branch_case.cc_serdnm); + fmd_hdl_strfree(hdl, branch->branch_case.cc_serdnm); + branch->branch_case.cc_serdnm = NULL; + } + + fmd_case_add_ereport(hdl, branch->branch_case.cc_cp, ep); + fmd_case_solve(hdl, branch->branch_case.cc_cp); + cmd_branch_dirty(hdl, branch); + + return (CMD_EVD_OK); +} + +void +cmd_branch_close(fmd_hdl_t *hdl, void *arg) +{ + cmd_branch_destroy(hdl, arg); +} + + /*ARGSUSED*/ ulong_t cmd_mem_get_phys_pages(fmd_hdl_t *hdl) @@ -250,10 +436,10 @@ } listp = (mde_cookie_t *)cpumem_alloc(sizeof (mde_cookie_t) * - num_nodes); + num_nodes); nmblocks = md_scan_dag(mdp, MDE_INVAL_ELEM_COOKIE, - md_find_name(mdp, "mblock"), - md_find_name(mdp, "fwd"), listp); + md_find_name(mdp, "mblock"), + md_find_name(mdp, "fwd"), listp); for (i = 0; i < nmblocks; i++) { if (md_get_prop_val(mdp, listp[i], "size", &bmem) < 0) { physmem = 0; @@ -429,7 +615,7 @@ namebuf[namelen] = '\0'; if ((j = map_name(namebuf)) < 0) - continue; /* skip names that don't map */ + continue; /* skip names that don't map */ if (instlen == 0) { (void) strncpy(instbuf, "0", 2);
--- a/usr/src/cmd/fm/modules/sun4v/cpumem-retire/cma_main.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/modules/sun4v/cpumem-retire/cma_main.c Mon Jul 30 12:41:05 2007 -0700 @@ -78,6 +78,12 @@ NULL }, { "fault.memory.datapath", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, NULL }, + { "fault.memory.link-c", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, + NULL }, + { "fault.memory.link-u", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, + NULL }, + { "fault.memory.link-f", FM_FMRI_SCHEME_MEM, FM_MEM_SCHEME_VERSION, + NULL }, /* * The following ultraSPARC-T1/T2 faults do NOT retire a cpu thread, @@ -96,6 +102,12 @@ FM_CPU_SCHEME_VERSION, NULL }, { "fault.cpu.*.mau", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, NULL }, + { "fault.cpu.*.lfu-u", FM_FMRI_SCHEME_CPU, + FM_CPU_SCHEME_VERSION, NULL }, + { "fault.cpu.*.lfu-f", FM_FMRI_SCHEME_CPU, + FM_CPU_SCHEME_VERSION, NULL }, + { "fault.cpu.*.lfu-p", FM_FMRI_SCHEME_CPU, + FM_CPU_SCHEME_VERSION, NULL }, { "fault.cpu.*", FM_FMRI_SCHEME_CPU, FM_CPU_SCHEME_VERSION, cma_cpu_retire }, { NULL, NULL, 0, NULL }
--- a/usr/src/cmd/fm/schemes/mem/i386/mem_disc.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/schemes/mem/i386/mem_disc.c Mon Jul 30 12:41:05 2007 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,12 +20,15 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" +#include <mem.h> +#include <fm/fmd_fmri.h> + /* * We do not yet support DIMM enumeration in the x86 mem scheme because our * diagnosis is using the new libtopo functionality and hopefully won't need @@ -37,3 +39,21 @@ { return (0); } + +/* + * The following two routines are stubs for corresponding SPARC-only code. + */ + +/*ARGSUSED*/ +int +mem_get_serids_by_unum(const char *unum, char ***seridsp, size_t *nseridsp) +{ + errno = ENOTSUP; + return (-1); +} + +/*ARGSUSED*/ +void +mem_expand_opt(nvlist_t *nvl, char *unum, char **serids) +{ +}
--- a/usr/src/cmd/fm/schemes/mem/mem.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/schemes/mem/mem.c Mon Jul 30 12:41:05 2007 -0700 @@ -29,11 +29,8 @@ #include <mem.h> #include <fm/fmd_fmri.h> -#include <fcntl.h> -#include <unistd.h> #include <string.h> #include <strings.h> -#include <time.h> #include <sys/mem.h> #ifdef sparc @@ -43,358 +40,6 @@ mem_t mem; -#ifdef sparc - -extern int mem_update_mdesc(void); - -/* - * Retry values for handling the case where the kernel is not yet ready - * to provide DIMM serial ids. Some platforms acquire DIMM serial id - * information from their System Controller via a mailbox interface. - * The values chosen are for 10 retries 3 seconds apart to approximate the - * possible 30 second timeout length of a mailbox message request. - */ -#define MAX_MEM_SID_RETRIES 10 -#define MEM_SID_RETRY_WAIT 3 - -static mem_dimm_map_t * -dm_lookup(const char *name) -{ - mem_dimm_map_t *dm; - - for (dm = mem.mem_dm; dm != NULL; dm = dm->dm_next) { - if (strcmp(name, dm->dm_label) == 0) - return (dm); - } - - return (NULL); -} - -/* - * Returns 0 with serial numbers if found, -1 (with errno set) for errors. If - * the unum (or a component of same) wasn't found, -1 is returned with errno - * set to ENOENT. If the kernel doesn't have support for serial numbers, - * -1 is returned with errno set to ENOTSUP. - */ -static int -mem_get_serids_from_kernel(const char *unum, char ***seridsp, size_t *nseridsp) -{ - char **dimms, **serids; - size_t ndimms, nserids; - int i, rc = 0; - int fd; - int retries = MAX_MEM_SID_RETRIES; - mem_name_t mn; - struct timespec rqt; - - if ((fd = open("/dev/mem", O_RDONLY)) < 0) - return (-1); - - if (mem_unum_burst(unum, &dimms, &ndimms) < 0) { - (void) close(fd); - return (-1); /* errno is set for us */ - } - - serids = fmd_fmri_zalloc(sizeof (char *) * ndimms); - nserids = ndimms; - - bzero(&mn, sizeof (mn)); - - for (i = 0; i < ndimms; i++) { - mn.m_namelen = strlen(dimms[i]) + 1; - mn.m_sidlen = MEM_SERID_MAXLEN; - - mn.m_name = fmd_fmri_alloc(mn.m_namelen); - mn.m_sid = fmd_fmri_alloc(mn.m_sidlen); - - (void) strcpy(mn.m_name, dimms[i]); - - do { - rc = ioctl(fd, MEM_SID, &mn); - - if (rc >= 0 || errno != EAGAIN) - break; - - if (retries == 0) { - errno = ETIMEDOUT; - break; - } - - /* - * EAGAIN indicates the kernel is - * not ready to provide DIMM serial - * ids. Sleep MEM_SID_RETRY_WAIT seconds - * and try again. - * nanosleep() is used instead of sleep() - * to avoid interfering with fmd timers. - */ - rqt.tv_sec = MEM_SID_RETRY_WAIT; - rqt.tv_nsec = 0; - (void) nanosleep(&rqt, NULL); - - } while (retries--); - - if (rc < 0) { - /* - * ENXIO can happen if the kernel memory driver - * doesn't have the MEM_SID ioctl (e.g. if the - * kernel hasn't been patched to provide the - * support). - * - * If the MEM_SID ioctl is available but the - * particular platform doesn't support providing - * serial ids, ENOTSUP will be returned by the ioctl. - */ - if (errno == ENXIO) - errno = ENOTSUP; - fmd_fmri_free(mn.m_name, mn.m_namelen); - fmd_fmri_free(mn.m_sid, mn.m_sidlen); - mem_strarray_free(serids, nserids); - mem_strarray_free(dimms, ndimms); - (void) close(fd); - return (-1); - } - - serids[i] = fmd_fmri_strdup(mn.m_sid); - - fmd_fmri_free(mn.m_name, mn.m_namelen); - fmd_fmri_free(mn.m_sid, mn.m_sidlen); - } - - mem_strarray_free(dimms, ndimms); - - (void) close(fd); - - *seridsp = serids; - *nseridsp = nserids; - - return (0); -} - -/* - * Returns 0 with serial numbers if found, -1 (with errno set) for errors. If - * the unum (or a component of same) wasn't found, -1 is returned with errno - * set to ENOENT. - */ -static int -mem_get_serids_from_cache(const char *unum, char ***seridsp, size_t *nseridsp) -{ - uint64_t drgen = fmd_fmri_get_drgen(); - char **dimms, **serids; - size_t ndimms, nserids; - mem_dimm_map_t *dm; - int i, rc = 0; - - if (mem_unum_burst(unum, &dimms, &ndimms) < 0) - return (-1); /* errno is set for us */ - - serids = fmd_fmri_zalloc(sizeof (char *) * ndimms); - nserids = ndimms; - - for (i = 0; i < ndimms; i++) { - if ((dm = dm_lookup(dimms[i])) == NULL) { - rc = fmd_fmri_set_errno(EINVAL); - break; - } - - if (*dm->dm_serid == '\0' || dm->dm_drgen != drgen) { - /* - * We don't have a cached copy, or the copy we've got is - * out of date. Look it up again. - */ - if (mem_get_serid(dm->dm_device, dm->dm_serid, - sizeof (dm->dm_serid)) < 0) { - rc = -1; /* errno is set for us */ - break; - } - - dm->dm_drgen = drgen; - } - - serids[i] = fmd_fmri_strdup(dm->dm_serid); - } - - mem_strarray_free(dimms, ndimms); - - if (rc == 0) { - *seridsp = serids; - *nseridsp = nserids; - } else { - mem_strarray_free(serids, nserids); - } - - return (rc); -} - -/* - * Returns 0 with serial numbers if found, -1 (with errno set) for errors. If - * the unum (or a component of same) wasn't found, -1 is returned with errno - * set to ENOENT. - */ -static int -mem_get_serids_from_mdesc(const char *unum, char ***seridsp, size_t *nseridsp) -{ - uint64_t drgen = fmd_fmri_get_drgen(); - char **dimms, **serids; - size_t ndimms, nserids; - mem_dimm_map_t *dm; - int i, rc = 0; - - if (mem_unum_burst(unum, &dimms, &ndimms) < 0) - return (-1); /* errno is set for us */ - - serids = fmd_fmri_zalloc(sizeof (char *) * ndimms); - nserids = ndimms; - - /* - * first go through dimms and see if dm_drgen entries are outdated - */ - for (i = 0; i < ndimms; i++) { - if ((dm = dm_lookup(dimms[i])) == NULL || - dm->dm_drgen != drgen) - break; - } - - if (i < ndimms && mem_update_mdesc() != 0) { - mem_strarray_free(dimms, ndimms); - return (-1); - } - - /* - * get to this point if an up-to-date mdesc (and corresponding - * entries in the global mem list) exists - */ - for (i = 0; i < ndimms; i++) { - if ((dm = dm_lookup(dimms[i])) == NULL) { - rc = fmd_fmri_set_errno(EINVAL); - break; - } - - if (dm->dm_drgen != drgen) - dm->dm_drgen = drgen; - - /* - * mdesc and dm entry was updated by an earlier call to - * mem_update_mdesc, so we go ahead and dup the serid - */ - serids[i] = fmd_fmri_strdup(dm->dm_serid); - } - - mem_strarray_free(dimms, ndimms); - - if (rc == 0) { - *seridsp = serids; - *nseridsp = nserids; - } else { - mem_strarray_free(serids, nserids); - } - - return (rc); -} - -/* - * Returns 0 with part numbers if found, returns -1 for errors. - */ -static int -mem_get_parts_from_mdesc(const char *unum, char ***partsp, size_t *npartsp) -{ - uint64_t drgen = fmd_fmri_get_drgen(); - char **dimms, **parts; - size_t ndimms, nparts; - mem_dimm_map_t *dm; - int i, rc = 0; - - if (mem_unum_burst(unum, &dimms, &ndimms) < 0) - return (-1); /* errno is set for us */ - - parts = fmd_fmri_zalloc(sizeof (char *) * ndimms); - nparts = ndimms; - - /* - * first go through dimms and see if dm_drgen entries are outdated - */ - for (i = 0; i < ndimms; i++) { - if ((dm = dm_lookup(dimms[i])) == NULL || - dm->dm_drgen != drgen) - break; - } - - if (i < ndimms && mem_update_mdesc() != 0) { - mem_strarray_free(dimms, ndimms); - mem_strarray_free(parts, nparts); - return (-1); - } - - /* - * get to this point if an up-to-date mdesc (and corresponding - * entries in the global mem list) exists - */ - for (i = 0; i < ndimms; i++) { - if ((dm = dm_lookup(dimms[i])) == NULL) { - rc = fmd_fmri_set_errno(EINVAL); - break; - } - - if (dm->dm_drgen != drgen) - dm->dm_drgen = drgen; - - /* - * mdesc and dm entry was updated by an earlier call to - * mem_update_mdesc, so we go ahead and dup the part - */ - if (dm->dm_part == NULL) { - rc = -1; - break; - } - parts[i] = fmd_fmri_strdup(dm->dm_part); - } - - mem_strarray_free(dimms, ndimms); - - if (rc == 0) { - *partsp = parts; - *npartsp = nparts; - } else { - mem_strarray_free(parts, nparts); - } - - return (rc); -} - -static int -mem_get_parts_by_unum(const char *unum, char ***partp, size_t *npartp) -{ - if (mem.mem_dm == NULL) - return (-1); - else - return (mem_get_parts_from_mdesc(unum, partp, npartp)); -} - -#endif /* sparc */ - -/*ARGSUSED*/ - -static int -mem_get_serids_by_unum(const char *unum, char ***seridsp, size_t *nseridsp) -{ - /* - * Some platforms do not support the caching of serial ids by the - * mem scheme plugin but instead support making serial ids available - * via the kernel. - */ -#ifdef sparc - if (mem.mem_dm == NULL) - return (mem_get_serids_from_kernel(unum, seridsp, nseridsp)); - else if (mem_get_serids_from_mdesc(unum, seridsp, nseridsp) == 0) - return (0); - else - return (mem_get_serids_from_cache(unum, seridsp, nseridsp)); -#else - errno = ENOTSUP; - return (-1); -#endif /* sparc */ -} - static int mem_fmri_get_unum(nvlist_t *nvl, char **unump) { @@ -492,19 +137,16 @@ char *unum, **serids; uint_t nnvlserids; size_t nserids; -#ifdef sparc - char **parts; - size_t nparts; -#endif int rc; if ((mem_fmri_get_unum(nvl, &unum) < 0) || (*unum == '\0')) return (fmd_fmri_set_errno(EINVAL)); if ((rc = nvlist_lookup_string_array(nvl, FM_FMRI_MEM_SERIAL_ID, - &serids, &nnvlserids)) == 0) - return (0); /* fmri is already expanded */ - else if (rc != ENOENT) + &serids, &nnvlserids)) == 0) { /* already have serial #s */ + mem_expand_opt(nvl, unum, serids); + return (0); + } else if (rc != ENOENT) return (fmd_fmri_set_errno(EINVAL)); if (mem_get_serids_by_unum(unum, &serids, &nserids) < 0) { @@ -517,24 +159,14 @@ rc = nvlist_add_string_array(nvl, FM_FMRI_MEM_SERIAL_ID, serids, nserids); + mem_expand_opt(nvl, unum, serids); mem_strarray_free(serids, nserids); if (rc != 0) return (fmd_fmri_set_errno(EINVAL)); - -#ifdef sparc - /* - * Continue with the process if there are no part numbers. - */ - if (mem_get_parts_by_unum(unum, &parts, &nparts) < 0) + else return (0); - - rc = nvlist_add_string_array(nvl, FM_FMRI_HC_PART, parts, nparts); - - mem_strarray_free(parts, nparts); -#endif - return (0); } static int @@ -726,13 +358,19 @@ fmd_fmri_fini(void) { mem_dimm_map_t *dm, *em; + mem_seg_map_t *sm, *tm; for (dm = mem.mem_dm; dm != NULL; dm = em) { em = dm->dm_next; fmd_fmri_strfree(dm->dm_label); + fmd_fmri_strfree(dm->dm_part); fmd_fmri_strfree(dm->dm_device); fmd_fmri_free(dm, sizeof (mem_dimm_map_t)); } + for (sm = mem.mem_seg; sm != NULL; sm = tm) { + tm = sm->sm_next; + fmd_fmri_free(sm, sizeof (mem_seg_map_t)); + } #ifdef sparc ldom_fini(mem_scheme_lhp); #endif /* sparc */
--- a/usr/src/cmd/fm/schemes/mem/mem.h Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/schemes/mem/mem.h Mon Jul 30 12:41:05 2007 -0700 @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -80,9 +80,8 @@ * generated. No attempt is made to determine whether or not the named * item is still present in the system. * - * - expand: At the time of this writing, no platforms include bank or DIMM - * serial numbers in their ereports. As such, the serial number(s) must - * be added by the diagnosis engine. This entry point will read the + * - expand: For platforms which do not include bank or DIMM + * serial numbers in their ereports, this entry point will read the * serial number(s) for the named item, and will add it/them to the passed * FMRI. Errors will be returned if the FMRI (unum) was unparseable, or if * the serial number could not be retrieved. @@ -108,6 +107,15 @@ */ #define MEM_SERID_MAXLEN 64 +typedef struct mem_seg_map { + struct mem_seg_map *sm_next; /* the next segment map */ + uint64_t sm_base; /* base address for this segment */ + uint64_t sm_size; /* size for this segment */ + uint64_t sm_mask; /* mask denoting dimm selection bits */ + uint64_t sm_match; /* value selecting this set of DIMMs */ + uint16_t sm_shift; /* dimms-per-reference shift */ +} mem_seg_map_t; + typedef struct mem_dimm_map { struct mem_dimm_map *dm_next; /* The next DIMM map */ char *dm_label; /* The UNUM for this DIMM */ @@ -115,18 +123,19 @@ char dm_serid[MEM_SERID_MAXLEN]; /* Cached serial number */ char *dm_part; /* DIMM part number */ uint64_t dm_drgen; /* DR gen count for cached S/N */ + mem_seg_map_t *dm_seg; /* segment for this DIMM */ } mem_dimm_map_t; typedef struct mem { mem_dimm_map_t *mem_dm; /* List supported DIMMs */ uint64_t mem_memconfig; /* HV memory-configuration-id# */ - uint64_t mem_rank_mask; /* "rank" bit */ - int mem_ch_shift; /* # bits for "CH" */ - const char *mem_rank_str; /* string denoting "rank" */ + mem_seg_map_t *mem_seg; /* list of defined segments */ } mem_t; extern int mem_discover(void); extern int mem_get_serid(const char *, char *, size_t); +extern int mem_get_serids_by_unum(const char *, char ***, size_t *); +extern void mem_expand_opt(nvlist_t *, char *, char **); extern int mem_unum_burst(const char *, char ***, size_t *); extern int mem_unum_contains(const char *, const char *);
--- a/usr/src/cmd/fm/schemes/mem/mem_unum.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/schemes/mem/mem_unum.c Mon Jul 30 12:41:05 2007 -0700 @@ -95,7 +95,9 @@ { "/MBU_B/MEMB%*d/%n%nMEM%*d%*1c%n", " MEM%*d%*1c%n" }, { "/CMU%*2d/%n%nMEM%*2d%*1c%n", " MEM%*2d%*1c%n" }, { "MB/CMP%*d/BR%*d%n:%n%n", " CH%*d/D%*d/J%*4d%n", "/" }, - { "MB/CMP%*d/BR%*d%n%n%n", "/CH%*d/D%*d/J%*4d%n" }, + { "%n%nMB/CMP%*d/BR%*d/CH%*d/D%*d/J%*4d%n", + "MB/CMP%*d/BR%*d/CH%*d/D%*d/J%*4d%n" }, + { "%n%nMB/CMP%*d/BR%*d/CH%*d/D%*d%n", "MB/CMP%*d/BR%*d/CH%*d/D%*d%n" }, { NULL } }; @@ -179,7 +181,6 @@ (void) sscanf(pat, bd->bd_pat, &replace, &start, &matched); if (matched == -1) continue; - (void) strlcpy(dimmname, pat, sizeof (dimmname)); if (bd->bd_subst != NULL) { (void) strlcpy(dimmname+replace, bd->bd_subst,
--- a/usr/src/cmd/fm/schemes/mem/sparc/mem_disc.c Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/cmd/fm/schemes/mem/sparc/mem_disc.c Mon Jul 30 12:41:05 2007 -0700 @@ -50,11 +50,15 @@ #include <mem.h> #include <fm/fmd_fmri.h> +#include <fcntl.h> +#include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <strings.h> #include <errno.h> +#include <time.h> +#include <sys/mem.h> #include <sys/fm/ldom.h> extern ldom_hdl_t *mem_scheme_lhp; @@ -344,144 +348,106 @@ } } -int -mem_discover_mdesc(md_t *mdp, size_t mdbufsz) +uint16_t +mem_log2(uint64_t v) { - mde_cookie_t *listp; - int num_nodes, idx, mdesc_dimm_count, unique_ch; - mem_dimm_map_t *dm; - uint64_t sysmem_size, i, drgen = fmd_fmri_get_drgen(); - char curr_ch; - int num_comps = 0; - char *unum, *serial, *part, *dash; - - num_nodes = md_node_count(mdp); - listp = fmd_fmri_alloc(sizeof (mde_cookie_t) * num_nodes); - - num_comps = md_scan_dag(mdp, - MDE_INVAL_ELEM_COOKIE, - md_find_name(mdp, "component"), - md_find_name(mdp, "fwd"), - listp); - if (num_comps == 0) { - - /* - * Find first 'memory' node -- there should only be one. - * Extract 'memory-generation-id#' value from it. - */ - mdesc_dimm_count = md_scan_dag(mdp, - MDE_INVAL_ELEM_COOKIE, md_find_name(mdp, "memory"), - md_find_name(mdp, "fwd"), listp); - - if (md_get_prop_val(mdp, listp[0], "memory-generation-id#", - &mem.mem_memconfig)) - mem.mem_memconfig = 0; - - mdesc_dimm_count = md_scan_dag(mdp, - MDE_INVAL_ELEM_COOKIE, md_find_name(mdp, "dimm_data"), - md_find_name(mdp, "fwd"), listp); - - for (idx = 0; idx < mdesc_dimm_count; idx++) { - - if (md_get_prop_str(mdp, listp[idx], "nac", &unum) < 0) - unum = ""; - if (md_get_prop_str(mdp, listp[idx], "serial#", - &serial) < 0) - serial = ""; - if (md_get_prop_str(mdp, listp[idx], "part#", - &part) < 0) - part = ""; - - dm = fmd_fmri_zalloc(sizeof (mem_dimm_map_t)); - dm->dm_label = fmd_fmri_strdup(unum); - (void) strncpy(dm->dm_serid, serial, - MEM_SERID_MAXLEN - 1); - dm->dm_part = fmd_fmri_strdup(part); - dm->dm_drgen = drgen; + uint16_t i; + for (i = 0; v > 1; i++) { + v = v >> 1; + } + return (i); +} - dm->dm_next = mem.mem_dm; - mem.mem_dm = dm; - } - } else { - char *type, *sp, *jnum, *nac; - size_t ss; - for (idx = 0; idx < num_comps; idx++) { - if (md_get_prop_str(mdp, listp[idx], "type", &type) < 0) - continue; - if (strcmp(type, "dimm") == 0) { - if (md_get_prop_str(mdp, listp[idx], "nac", - &nac) < 0) - nac = ""; - if (md_get_prop_str(mdp, listp[idx], "label", - &jnum) < 0) - jnum = ""; - if (md_get_prop_str(mdp, listp[idx], - "serial_number", &serial) < 0) - serial = ""; - if (md_get_prop_str(mdp, listp[idx], - "part_number", &part) < 0) - part = ""; - if (md_get_prop_str(mdp, listp[idx], - "dash_number", &dash) < 0) - dash = ""; +static mem_dimm_map_t * +get_dimm_by_sn(char *sn) +{ + mem_dimm_map_t *dp; - ss = strlen(part) + strlen(dash) + 1; - sp = fmd_fmri_alloc(ss); - sp = strcpy(sp, part); - sp = strncat(sp, dash, strlen(dash) + 1); - - dm = fmd_fmri_zalloc(sizeof (mem_dimm_map_t)); - - if ((strcmp(nac, "") != 0) && - (strcmp(jnum, "") != 0)) { - ss = strlen(nac) + strlen(jnum) + 2; - unum = fmd_fmri_alloc(ss); - (void) snprintf(unum, ss, "%s/%s", nac, - jnum); - dm->dm_label = unum; - } else { - unum = ""; - dm->dm_label = fmd_fmri_strdup(unum); - } - - (void) strncpy(dm->dm_serid, serial, - MEM_SERID_MAXLEN - 1); - dm->dm_part = sp; - dm->dm_drgen = drgen; - - dm->dm_next = mem.mem_dm; - mem.mem_dm = dm; - } - } + for (dp = mem.mem_dm; dp != NULL; dp = dp->dm_next) { + if (strcmp(sn, dp->dm_serid) == 0) + return (dp); } - if (strstr(mem.mem_dm->dm_label, "BR") != NULL) { /* N2 */ - mem.mem_rank_str = "CH"; - } else { /* Niagara-1 */ - mem.mem_rank_str = "/R"; + return (NULL); +} + +#define MEM_BYTES_PER_CACHELINE 64 + +static void +mdesc_init_n1(md_t *mdp, mde_cookie_t *listp) +{ + int idx, mdesc_dimm_count; + mem_dimm_map_t *dm, *d; + uint64_t sysmem_size, i, drgen = fmd_fmri_get_drgen(); + int dimms, min_chan, max_chan, min_rank, max_rank; + int chan, rank, dimm, chans, chan_step; + uint64_t mask, chan_mask, chan_value; + uint64_t rank_mask, rank_value; + char *unum, *serial, *part; + mem_seg_map_t *seg; + char s[20]; + + /* + * Find first 'memory' node -- there should only be one. + * Extract 'memory-generation-id#' value from it. + */ + mdesc_dimm_count = md_scan_dag(mdp, + MDE_INVAL_ELEM_COOKIE, md_find_name(mdp, "memory"), + md_find_name(mdp, "fwd"), listp); + + if (md_get_prop_val(mdp, listp[0], "memory-generation-id#", + &mem.mem_memconfig)) + mem.mem_memconfig = 0; + + mdesc_dimm_count = md_scan_dag(mdp, + MDE_INVAL_ELEM_COOKIE, md_find_name(mdp, "dimm_data"), + md_find_name(mdp, "fwd"), listp); + + for (idx = 0; idx < mdesc_dimm_count; idx++) { + + if (md_get_prop_str(mdp, listp[idx], "nac", &unum) < 0) + unum = ""; + if (md_get_prop_str(mdp, listp[idx], "serial#", + &serial) < 0) + serial = ""; + if (md_get_prop_str(mdp, listp[idx], "part#", + &part) < 0) + part = ""; + + dm = fmd_fmri_zalloc(sizeof (mem_dimm_map_t)); + dm->dm_label = fmd_fmri_strdup(unum); + (void) strncpy(dm->dm_serid, serial, + MEM_SERID_MAXLEN - 1); + dm->dm_part = fmd_fmri_strdup(part); + dm->dm_drgen = drgen; + + dm->dm_next = mem.mem_dm; + mem.mem_dm = dm; + } + /* N1 (MD) specific segment initialization */ + + dimms = 0; + min_chan = 99; + max_chan = -1; + min_rank = 99; + max_rank = -1; + + for (d = mem.mem_dm; d != NULL; d = d->dm_next) { + if (sscanf(d->dm_label, "MB/CMP0/CH%d/R%d/D%d", + &chan, &rank, &dimm) != 3) /* didn't scan all 3 values */ + return; + min_chan = MIN(min_chan, chan); + max_chan = MAX(max_chan, chan); + min_rank = MIN(min_rank, rank); + max_rank = MAX(max_rank, rank); + dimms++; } - curr_ch = '\0'; - unique_ch = 0; - for (dm = mem.mem_dm; dm != NULL; dm = dm->dm_next) { - char my_ch; - if (mem.mem_rank_str == "CH") - my_ch = *(strstr(dm->dm_label, "BR") + 2); - else my_ch = *(strstr(dm->dm_label, "CH") + 2); - if (curr_ch != my_ch) { - unique_ch++; - curr_ch = my_ch; - } - } - - if (unique_ch == 1) mem.mem_ch_shift = 0; - else if (unique_ch == 2) mem.mem_ch_shift = 1; - else mem.mem_ch_shift = 2; - mdesc_dimm_count = md_scan_dag(mdp, - MDE_INVAL_ELEM_COOKIE, md_find_name(mdp, "mblock"), - md_find_name(mdp, "fwd"), listp); - + MDE_INVAL_ELEM_COOKIE, + md_find_name(mdp, "mblock"), + md_find_name(mdp, "fwd"), + listp); sysmem_size = 0; for (idx = 0; idx < mdesc_dimm_count; idx++) { uint64_t size = 0; @@ -489,8 +455,180 @@ sysmem_size += size; } - for (i = 1 << 30; i < sysmem_size; i <<= 1); /* round up to 2^i */ - mem.mem_rank_mask = i >> 1; /* PA high order bit */ + for (i = 1 << 30; i < sysmem_size; i = i << 1) + ; + if (max_rank > min_rank) { + chans = dimms/4; + rank_mask = i >> 1; + } else { + chans = dimms/2; + rank_mask = 0; + } + + chan_mask = (uint64_t)((chans - 1) * MEM_BYTES_PER_CACHELINE); + mask = rank_mask | chan_mask; + + if (chans > 2) + chan_step = 1; + else + chan_step = max_chan - min_chan; + + for (rank = min_rank, rank_value = 0; + rank <= max_rank; + rank++, rank_value += rank_mask) { + for (chan = min_chan, chan_value = 0; + chan <= max_chan; + chan += chan_step, + chan_value += MEM_BYTES_PER_CACHELINE) { + seg = fmd_fmri_zalloc(sizeof (mem_seg_map_t)); + seg->sm_next = mem.mem_seg; + mem.mem_seg = seg; + seg->sm_base = 0; + seg->sm_size = sysmem_size; + seg->sm_mask = mask; + seg->sm_match = chan_value | rank_value; + seg->sm_shift = 1; + (void) sprintf(s, "MB/CMP0/CH%1d/R%1d", chan, rank); + for (d = mem.mem_dm; d != NULL; d = d->dm_next) { + if (strncmp(s, d->dm_label, strlen(s)) == 0) + d->dm_seg = seg; + } + } + } +} + +static void +mdesc_init_n2(md_t *mdp, mde_cookie_t *listp, int num_comps) +{ + mde_cookie_t *dl, t; + int idx, mdesc_dimm_count, mdesc_bank_count; + mem_dimm_map_t *dm, *dp; + uint64_t i, drgen = fmd_fmri_get_drgen(); + int n; + uint64_t mask, match, base, size; + char *unum, *serial, *part, *dash; + mem_seg_map_t *smp; + char *type, *sp, *jnum, *nac; + size_t ss; + + mdesc_dimm_count = 0; + for (idx = 0; idx < num_comps; idx++) { + if (md_get_prop_str(mdp, listp[idx], "type", &type) < 0) + continue; + if (strcmp(type, "dimm") == 0) { + mdesc_dimm_count++; + if (md_get_prop_str(mdp, listp[idx], "nac", + &nac) < 0) + nac = ""; + if (md_get_prop_str(mdp, listp[idx], "label", + &jnum) < 0) + jnum = ""; + if (md_get_prop_str(mdp, listp[idx], + "serial_number", &serial) < 0) + serial = ""; + if (md_get_prop_str(mdp, listp[idx], + "part_number", &part) < 0) + part = ""; + if (md_get_prop_str(mdp, listp[idx], + "dash_number", &dash) < 0) + dash = ""; + + ss = strlen(part) + strlen(dash) + 1; + sp = fmd_fmri_alloc(ss); + sp = strcpy(sp, part); + sp = strncat(sp, dash, strlen(dash) + 1); + + dm = fmd_fmri_zalloc(sizeof (mem_dimm_map_t)); + + if ((strcmp(nac, "") != 0) && + (strcmp(jnum, "") != 0)) { + ss = strlen(nac) + strlen(jnum) + 2; + unum = fmd_fmri_alloc(ss); + (void) snprintf(unum, ss, "%s/%s", nac, + jnum); + dm->dm_label = unum; + } else { + unum = ""; + dm->dm_label = fmd_fmri_strdup(unum); + } + + (void) strncpy(dm->dm_serid, serial, + MEM_SERID_MAXLEN - 1); + dm->dm_part = sp; + dm->dm_drgen = drgen; + + dm->dm_next = mem.mem_dm; + mem.mem_dm = dm; + } + } + + /* N2 (PRI) specific segment initialization occurs here */ + + mdesc_bank_count = md_scan_dag(mdp, MDE_INVAL_ELEM_COOKIE, + md_find_name(mdp, "memory-bank"), + md_find_name(mdp, "fwd"), + listp); + + dl = fmd_fmri_zalloc(mdesc_dimm_count * sizeof (mde_cookie_t)); + + for (idx = 0; idx < mdesc_bank_count; idx++) { + if (md_get_prop_val(mdp, listp[idx], "mask", &mask) < 0) + mask = 0; + if (md_get_prop_val(mdp, listp[idx], "match", &match) < 0) + match = 0; + n = md_scan_dag(mdp, listp[idx], + md_find_name(mdp, "memory-segment"), + md_find_name(mdp, "back"), + &t); /* only 1 "back" arc, so n must equal 1 here */ + if (md_get_prop_val(mdp, t, "base", &base) < 0) + base = 0; + if (md_get_prop_val(mdp, t, "size", &size) < 0) + size = 0; + smp = fmd_fmri_zalloc(sizeof (mem_seg_map_t)); + smp->sm_next = mem.mem_seg; + mem.mem_seg = smp; + smp->sm_base = base; + smp->sm_size = size; + smp->sm_mask = mask; + smp->sm_match = match; + + n = md_scan_dag(mdp, listp[idx], + md_find_name(mdp, "component"), + md_find_name(mdp, "fwd"), + dl); + smp->sm_shift = mem_log2(n); + + for (i = 0; i < n; i++) { + if (md_get_prop_str(mdp, dl[i], + "serial_number", &serial) < 0) + continue; + if ((dp = get_dimm_by_sn(serial)) == NULL) + continue; + dp->dm_seg = smp; + } + } + fmd_fmri_free(dl, mdesc_dimm_count * sizeof (mde_cookie_t)); +} + +int +mem_discover_mdesc(md_t *mdp, size_t mdbufsz) +{ + mde_cookie_t *listp; + int num_nodes; + int num_comps = 0; + + num_nodes = md_node_count(mdp); + listp = fmd_fmri_alloc(sizeof (mde_cookie_t) * num_nodes); + + num_comps = md_scan_dag(mdp, + MDE_INVAL_ELEM_COOKIE, + md_find_name(mdp, "component"), + md_find_name(mdp, "fwd"), + listp); + if (num_comps == 0) + mdesc_init_n1(mdp, listp); + else + mdesc_init_n2(mdp, listp, num_comps); fmd_fmri_free(listp, sizeof (mde_cookie_t) * num_nodes); fmd_fmri_free(*mdp, mdbufsz); @@ -593,6 +731,7 @@ for (dm = mem.mem_dm; dm != NULL; dm = next) { next = dm->dm_next; fmd_fmri_strfree(dm->dm_label); + fmd_fmri_strfree(dm->dm_part); fmd_fmri_free(dm, sizeof (mem_dimm_map_t)); } mem.mem_dm = NULL; @@ -600,3 +739,461 @@ return (mem_discover_mdesc(mdp, mdbufsz)); } } + +/* + * Retry values for handling the case where the kernel is not yet ready + * to provide DIMM serial ids. Some platforms acquire DIMM serial id + * information from their System Controller via a mailbox interface. + * The values chosen are for 10 retries 3 seconds apart to approximate the + * possible 30 second timeout length of a mailbox message request. + */ +#define MAX_MEM_SID_RETRIES 10 +#define MEM_SID_RETRY_WAIT 3 + +/* + * The comparison is asymmetric. It compares up to the length of the + * argument unum. + */ +static mem_dimm_map_t * +dm_lookup(const char *name) +{ + mem_dimm_map_t *dm; + + for (dm = mem.mem_dm; dm != NULL; dm = dm->dm_next) { + if (strncmp(name, dm->dm_label, strlen(name)) == 0) + return (dm); + } + + return (NULL); +} + +/* + * Returns 0 with serial numbers if found, -1 (with errno set) for errors. If + * the unum (or a component of same) wasn't found, -1 is returned with errno + * set to ENOENT. If the kernel doesn't have support for serial numbers, + * -1 is returned with errno set to ENOTSUP. + */ +static int +mem_get_serids_from_kernel(const char *unum, char ***seridsp, size_t *nseridsp) +{ + char **dimms, **serids; + size_t ndimms, nserids; + int i, rc = 0; + int fd; + int retries = MAX_MEM_SID_RETRIES; + mem_name_t mn; + struct timespec rqt; + + if ((fd = open("/dev/mem", O_RDONLY)) < 0) + return (-1); + + if (mem_unum_burst(unum, &dimms, &ndimms) < 0) { + (void) close(fd); + return (-1); /* errno is set for us */ + } + + serids = fmd_fmri_zalloc(sizeof (char *) * ndimms); + nserids = ndimms; + + bzero(&mn, sizeof (mn)); + + for (i = 0; i < ndimms; i++) { + mn.m_namelen = strlen(dimms[i]) + 1; + mn.m_sidlen = MEM_SERID_MAXLEN; + + mn.m_name = fmd_fmri_alloc(mn.m_namelen); + mn.m_sid = fmd_fmri_alloc(mn.m_sidlen); + + (void) strcpy(mn.m_name, dimms[i]); + + do { + rc = ioctl(fd, MEM_SID, &mn); + + if (rc >= 0 || errno != EAGAIN) + break; + + if (retries == 0) { + errno = ETIMEDOUT; + break; + } + + /* + * EAGAIN indicates the kernel is + * not ready to provide DIMM serial + * ids. Sleep MEM_SID_RETRY_WAIT seconds + * and try again. + * nanosleep() is used instead of sleep() + * to avoid interfering with fmd timers. + */ + rqt.tv_sec = MEM_SID_RETRY_WAIT; + rqt.tv_nsec = 0; + (void) nanosleep(&rqt, NULL); + + } while (retries--); + + if (rc < 0) { + /* + * ENXIO can happen if the kernel memory driver + * doesn't have the MEM_SID ioctl (e.g. if the + * kernel hasn't been patched to provide the + * support). + * + * If the MEM_SID ioctl is available but the + * particular platform doesn't support providing + * serial ids, ENOTSUP will be returned by the ioctl. + */ + if (errno == ENXIO) + errno = ENOTSUP; + fmd_fmri_free(mn.m_name, mn.m_namelen); + fmd_fmri_free(mn.m_sid, mn.m_sidlen); + mem_strarray_free(serids, nserids); + mem_strarray_free(dimms, ndimms); + (void) close(fd); + return (-1); + } + + serids[i] = fmd_fmri_strdup(mn.m_sid); + + fmd_fmri_free(mn.m_name, mn.m_namelen); + fmd_fmri_free(mn.m_sid, mn.m_sidlen); + } + + mem_strarray_free(dimms, ndimms); + + (void) close(fd); + + *seridsp = serids; + *nseridsp = nserids; + + return (0); +} + +/* + * Returns 0 with serial numbers if found, -1 (with errno set) for errors. If + * the unum (or a component of same) wasn't found, -1 is returned with errno + * set to ENOENT. + */ +static int +mem_get_serids_from_cache(const char *unum, char ***seridsp, size_t *nseridsp) +{ + uint64_t drgen = fmd_fmri_get_drgen(); + char **dimms, **serids; + size_t ndimms, nserids; + mem_dimm_map_t *dm; + int i, rc = 0; + + if (mem_unum_burst(unum, &dimms, &ndimms) < 0) + return (-1); /* errno is set for us */ + + serids = fmd_fmri_zalloc(sizeof (char *) * ndimms); + nserids = ndimms; + + for (i = 0; i < ndimms; i++) { + if ((dm = dm_lookup(dimms[i])) == NULL) { + rc = fmd_fmri_set_errno(EINVAL); + break; + } + + if (*dm->dm_serid == '\0' || dm->dm_drgen != drgen) { + /* + * We don't have a cached copy, or the copy we've got is + * out of date. Look it up again. + */ + if (mem_get_serid(dm->dm_device, dm->dm_serid, + sizeof (dm->dm_serid)) < 0) { + rc = -1; /* errno is set for us */ + break; + } + + dm->dm_drgen = drgen; + } + + serids[i] = fmd_fmri_strdup(dm->dm_serid); + } + + mem_strarray_free(dimms, ndimms); + + if (rc == 0) { + *seridsp = serids; + *nseridsp = nserids; + } else { + mem_strarray_free(serids, nserids); + } + + return (rc); +} + +/* + * Returns 0 with serial numbers if found, -1 (with errno set) for errors. If + * the unum (or a component of same) wasn't found, -1 is returned with errno + * set to ENOENT. + */ +static int +mem_get_serids_from_mdesc(const char *unum, char ***seridsp, size_t *nseridsp) +{ + uint64_t drgen = fmd_fmri_get_drgen(); + char **dimms, **serids; + size_t ndimms, nserids; + mem_dimm_map_t *dm; + int i, rc = 0; + + if (mem_unum_burst(unum, &dimms, &ndimms) < 0) + return (-1); /* errno is set for us */ + + serids = fmd_fmri_zalloc(sizeof (char *) * ndimms); + nserids = ndimms; + + /* + * first go through dimms and see if dm_drgen entries are outdated + */ + for (i = 0; i < ndimms; i++) { + if ((dm = dm_lookup(dimms[i])) == NULL || + dm->dm_drgen != drgen) + break; + } + + if (i < ndimms && mem_update_mdesc() != 0) { + mem_strarray_free(dimms, ndimms); + return (-1); + } + + /* + * get to this point if an up-to-date mdesc (and corresponding + * entries in the global mem list) exists + */ + for (i = 0; i < ndimms; i++) { + if ((dm = dm_lookup(dimms[i])) == NULL) { + rc = fmd_fmri_set_errno(EINVAL); + break; + } + + if (dm->dm_drgen != drgen) + dm->dm_drgen = drgen; + + /* + * mdesc and dm entry was updated by an earlier call to + * mem_update_mdesc, so we go ahead and dup the serid + */ + serids[i] = fmd_fmri_strdup(dm->dm_serid); + } + + mem_strarray_free(dimms, ndimms); + + if (rc == 0) { + *seridsp = serids; + *nseridsp = nserids; + } else { + mem_strarray_free(serids, nserids); + } + + return (rc); +} + +/* + * Returns 0 with part numbers if found, returns -1 for errors. + */ +static int +mem_get_parts_from_mdesc(const char *unum, char ***partsp, uint_t *npartsp) +{ + uint64_t drgen = fmd_fmri_get_drgen(); + char **dimms, **parts; + size_t ndimms, nparts; + mem_dimm_map_t *dm; + int i, rc = 0; + + if (mem_unum_burst(unum, &dimms, &ndimms) < 0) + return (-1); /* errno is set for us */ + + parts = fmd_fmri_zalloc(sizeof (char *) * ndimms); + nparts = ndimms; + + /* + * first go through dimms and see if dm_drgen entries are outdated + */ + for (i = 0; i < ndimms; i++) { + if ((dm = dm_lookup(dimms[i])) == NULL || + dm->dm_drgen != drgen) + break; + } + + if (i < ndimms && mem_update_mdesc() != 0) { + mem_strarray_free(dimms, ndimms); + mem_strarray_free(parts, nparts); + return (-1); + } + + /* + * get to this point if an up-to-date mdesc (and corresponding + * entries in the global mem list) exists + */ + for (i = 0; i < ndimms; i++) { + if ((dm = dm_lookup(dimms[i])) == NULL) { + rc = fmd_fmri_set_errno(EINVAL); + break; + } + + if (dm->dm_drgen != drgen) + dm->dm_drgen = drgen; + + /* + * mdesc and dm entry was updated by an earlier call to + * mem_update_mdesc, so we go ahead and dup the part + */ + if (dm->dm_part == NULL) { + rc = -1; + break; + } + parts[i] = fmd_fmri_strdup(dm->dm_part); + } + + mem_strarray_free(dimms, ndimms); + + if (rc == 0) { + *partsp = parts; + *npartsp = nparts; + } else { + mem_strarray_free(parts, nparts); + } + + return (rc); +} + +static int +mem_get_parts_by_unum(const char *unum, char ***partp, uint_t *npartp) +{ + if (mem.mem_dm == NULL) + return (-1); + else + return (mem_get_parts_from_mdesc(unum, partp, npartp)); +} + +static int +get_seg_by_sn(char *sn, mem_seg_map_t **segmap) +{ + mem_dimm_map_t *dm; + + for (dm = mem.mem_dm; dm != NULL; dm = dm->dm_next) { + if (strcmp(sn, dm->dm_serid) == 0) { + *segmap = dm->dm_seg; + return (0); + } + } + return (-1); +} + +/* + * Niagara-1, Niagara-2, and Victoria Falls all have physical address + * spaces of 40 bits. + */ + +#define MEM_PHYS_ADDRESS_LIMIT 0x10000000000ULL + +/* + * The 'mask' argument to extract_bits has 1's in those bit positions of + * the physical address used to select the DIMM (or set of DIMMs) which will + * store the contents of the physical address. If we extract those bits, ie. + * remove them and collapse the holes, the result is the 'address' within the + * DIMM or set of DIMMs where the contents are stored. + */ + +static uint64_t +extract_bits(uint64_t paddr, uint64_t mask) +{ + uint64_t from, to; + uint64_t result = 0; + + to = 1; + for (from = 1; from <= MEM_PHYS_ADDRESS_LIMIT; from <<= 1) { + if ((from & mask) == 0) { + if ((from & paddr) != 0) + result |= to; + to <<= 1; + } + } + return (result); +} + +/* + * insert_bits is the reverse operation to extract_bits. Where extract_bits + * removes from the physical address those bits which select a DIMM or set + * of DIMMs, insert_bits reconstitutes a physical address given the DIMM + * selection 'mask' and the 'value' for the address bits denoted by 1s in + * the 'mask'. + */ +static uint64_t +insert_bits(uint64_t offset, uint64_t mask, uint64_t value) +{ + uint64_t result = 0; + uint64_t from, to; + + from = 1; + for (to = 1; to <= MEM_PHYS_ADDRESS_LIMIT; to <<= 1) { + if ((to & mask) == 0) { + if ((offset & from) != 0) + result |= to; + from <<= 1; + } else { + result |= to & value; + } + } + return (result); +} + +int +mem_get_serids_by_unum(const char *unum, char ***seridsp, size_t *nseridsp) +{ + /* + * Some platforms do not support the caching of serial ids by the + * mem scheme plugin but instead support making serial ids available + * via the kernel. + */ + if (mem.mem_dm == NULL) + return (mem_get_serids_from_kernel(unum, seridsp, nseridsp)); + else if (mem_get_serids_from_mdesc(unum, seridsp, nseridsp) == 0) + return (0); + else + return (mem_get_serids_from_cache(unum, seridsp, nseridsp)); +} + +void +mem_expand_opt(nvlist_t *nvl, char *unum, char **serids) +{ + mem_seg_map_t *seg; + uint64_t offset, physaddr; + char **parts; + uint_t nparts; + + /* + * The following additional expansions are all optional. + * Failure to retrieve a data value, or failure to add it + * successfully to the FMRI, does NOT cause a failure of + * fmd_fmri_expand. All optional expansions will be attempted + * once expand_opt is entered. + */ + + if ((mem.mem_seg != NULL) && + (get_seg_by_sn(*serids, &seg) == 0)) { + + if (nvlist_lookup_uint64(nvl, + FM_FMRI_MEM_OFFSET, &offset) == 0) { + physaddr = insert_bits((offset<<seg->sm_shift), + seg->sm_mask, seg->sm_match); + (void) nvlist_add_uint64(nvl, FM_FMRI_MEM_PHYSADDR, + physaddr); /* displaces any previous physaddr */ + } else if (nvlist_lookup_uint64(nvl, + FM_FMRI_MEM_PHYSADDR, &physaddr) == 0) { + offset = extract_bits(physaddr, + seg->sm_mask) >> seg->sm_shift; + (void) (nvlist_add_uint64(nvl, FM_FMRI_MEM_OFFSET, + offset)); + } + } + + if ((nvlist_lookup_string_array(nvl, FM_FMRI_HC_PART, + &parts, &nparts) < 0) && + (mem_get_parts_by_unum(unum, &parts, &nparts) == 0)) { + (void) nvlist_add_string_array(nvl, + FM_FMRI_HC_PART, parts, nparts); + mem_strarray_free(parts, nparts); + } +}
--- a/usr/src/pkgdefs/SUNWonmtst.v/prototype_sparc Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/pkgdefs/SUNWonmtst.v/prototype_sparc Mon Jul 30 12:41:05 2007 -0700 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #pragma ident "%Z%%M% %I% %E% SMI" @@ -61,4 +61,5 @@ f none usr/platform/sun4v/include/sys/memtestio_v.h 644 root bin f none usr/platform/sun4v/include/sys/memtestio_ni.h 644 root bin f none usr/platform/sun4v/include/sys/memtestio_n2.h 644 root bin +f none usr/platform/sun4v/include/sys/memtestio_vf.h 644 root bin f none usr/bin/mtst 555 root bin
--- a/usr/src/uts/sparc/sys/fm/cpu/UltraSPARC-T1.h Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/uts/sparc/sys/fm/cpu/UltraSPARC-T1.h Mon Jul 30 12:41:05 2007 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,6 +42,7 @@ #define FM_EREPORT_PAYLOAD_NAME_L2_SYND "l2-synd" #define FM_EREPORT_PAYLOAD_NAME_L2_ESR "l2-esr" #define FM_EREPORT_PAYLOAD_NAME_L2_EAR "l2-ear" +#define FM_EREPORT_PAYLOAD_NAME_L2_ESYR "l2-esyr" #define FM_EREPORT_PAYLOAD_NAME_DRAM_AFSR "dram-afsr" #define FM_EREPORT_PAYLOAD_NAME_DRAM_AFAR "dram-afar" @@ -50,6 +51,8 @@ #define FM_EREPORT_PAYLOAD_NAME_DRAM_ESR "dram-esr" #define FM_EREPORT_PAYLOAD_NAME_DRAM_EAR "dram-ear" +#define FM_EREPORT_PAYLOAD_NAME_TS3_FCR "ts3-fcr" + #define FM_EREPORT_CPU_UST1_DAU "dau" #define FM_EREPORT_CPU_UST1_DAC "dac" #define FM_EREPORT_CPU_UST1_DSU "dsu"
--- a/usr/src/uts/sun4v/Makefile.files Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/uts/sun4v/Makefile.files Mon Jul 30 12:41:05 2007 -0700 @@ -131,7 +131,8 @@ MEMTEST_OBJS += memtest.o memtest_asm.o \ memtest_v.o memtest_v_asm.o \ memtest_ni.o memtest_ni_asm.o \ - memtest_n2.o memtest_n2_asm.o + memtest_n2.o memtest_n2_asm.o \ + memtest_vf.o # # sun4v virtual devices
--- a/usr/src/uts/sun4v/sys/Makefile Mon Jul 30 11:30:55 2007 -0700 +++ b/usr/src/uts/sun4v/sys/Makefile Mon Jul 30 12:41:05 2007 -0700 @@ -96,7 +96,8 @@ CLOSED_HDRS= \ memtestio_ni.h \ memtestio_n2.h \ - memtestio_v.h + memtestio_v.h \ + memtestio_vf.h ROOTHDRS= $(HDRS:%=$(USR_PSM_ISYS_DIR)/%) $(CLOSED_BUILD)ROOTHDRS += $(CLOSED_HDRS:%=$(USR_PSM_ISYS_DIR)/%)