Mercurial > illumos > illumos-gate
changeset 6693:944dc748ba76
6511374 confusing mc-opl error message after memory reconfiguration
6647128 MI-CEs that are not at an address 0 mod 64 also produce a PTRL-CE ereport
6671901 The fix for ICE handling in 6556139 causes problems for XSCF in Fujitsu Service Mode.
6677120 mc-opl driver causes domain panic if 2 CE errors occur on a memory bank with CE rewrite timeout
6695855 PCE page may not be retired in Quad mode.
author | wh31274 |
---|---|
date | Thu, 22 May 2008 20:47:36 -0700 |
parents | a6d8ea2756e6 |
children | d1503f9c5de3 |
files | usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_oplerr.c usr/src/uts/sun4u/opl/io/mc-opl.c usr/src/uts/sun4u/opl/sys/mc-opl.h |
diffstat | 3 files changed, 295 insertions(+), 16 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_oplerr.c Thu May 22 14:07:07 2008 -0700 +++ b/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_oplerr.c Thu May 22 20:47:36 2008 -0700 @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,6 +37,10 @@ #include <cmd_opl.h> #include <string.h> #include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <dirent.h> +#include <sys/stat.h> #include <sys/fm/protocol.h> #include <sys/fm/io/opl_mc_fm.h> @@ -44,6 +48,9 @@ #include <sys/opl_olympus_regs.h> #include <sys/fm/cpu/SPARC64-VI.h> #include <sys/int_const.h> +#include <sys/mutex.h> +#include <sys/dditypes.h> +#include <opl/sys/mc-opl.h> /* * The following is the common function for handling @@ -264,13 +271,110 @@ } /* + * Notify fault page information (pa and errlog) to XSCF via mc-opl + */ +#define MC_PHYDEV_DIR "/devices" +#define MC_PHYPREFIX "pseudo-mc@" +static int +opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl) +{ + uint32_t *eadd, *elog; + uint_t n; + uint64_t pa; + char path[MAXPATHLEN]; + char *unum; + nvlist_t *rsrc; + DIR *mcdir; + struct dirent *dp; + mc_flt_page_t flt_page; + cmd_page_t *page; + struct stat statbuf; + + /* + * Extract ereport. + * Sanity check of pa is already done at cmd_opl_mac_common(). + * mc-opl sets only one entry for MC_OPL_ERR_ADD, MC_OPL_ERR_LOG, + * and MC_OPL_BANK. 
+ */ + if ((nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0) || + (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &n) != 0) || + (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &n) != 0)) { + fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n"); + return (-1); + } + if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE, + &rsrc) != 0) { + fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n"); + return (-1); + } + if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) { + fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n"); + return (-1); + } + + page = cmd_page_lookup(pa); + if (page != NULL && page->page_flags & CMD_MEM_F_FAULTING) { + /* + * fault.memory.page will not be created. + */ + return (0); + } + + flt_page.err_add = eadd[0]; + flt_page.err_log = elog[0]; + flt_page.fmri_addr = (uint64_t)(uint32_t)unum; + flt_page.fmri_sz = strlen(unum) + 1; + + fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%d)\n", + unum, strlen(unum) + 1); + fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n", + pa, eadd[0], elog[0]); + + if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) { + while ((dp = readdir(mcdir)) != NULL) { + int fd; + + if (strncmp(dp->d_name, MC_PHYPREFIX, + strlen(MC_PHYPREFIX)) != 0) + continue; + + (void) snprintf(path, sizeof (path), + "%s/%s", MC_PHYDEV_DIR, dp->d_name); + + if (stat(path, &statbuf) != 0 || + (statbuf.st_mode & S_IFCHR) == 0) { + /* skip if not a character device */ + continue; + } + + if ((fd = open(path, O_RDONLY)) < 0) + continue; + + if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) { + fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n", + path); + (void) close(fd); + (void) closedir(mcdir); + return (0); + } + (void) close(fd); + } + (void) closedir(mcdir); + } + + fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n"); + + return (-1); +} + +/* * This is the common function for processing MAC detected * Intermittent and Permanent CEs. 
*/ cmd_evdisp_t cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class, - nvlist_t *asru, nvlist_t *fru, uint64_t pa) + nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl) { cmd_dimm_t *dimm; const char *uuid; @@ -310,6 +414,7 @@ dimm->dimm_case.cc_serdnm); fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm); + (void) opl_scf_log(hdl, nvl); } else { CMD_STAT_BUMP(ce_sticky); } @@ -388,7 +493,7 @@ strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) { cmd_evdisp_t ret; - ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa); + ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl); nvlist_free(asru); nvlist_free(fru); if (ret != CMD_EVD_OK) {
--- a/usr/src/uts/sun4u/opl/io/mc-opl.c Thu May 22 14:07:07 2008 -0700 +++ b/usr/src/uts/sun4u/opl/io/mc-opl.c Thu May 22 20:47:36 2008 -0700 @@ -60,6 +60,7 @@ #include <vm/hat_sfmmu.h> #include <sys/vmsystm.h> #include <sys/membar.h> +#include <sys/mem.h> /* * Function prototypes @@ -103,6 +104,7 @@ static void mc_clear_rewrite(mc_opl_t *mcp, int i); static void mc_set_rewrite(mc_opl_t *mcp, int bank, uint32_t addr, int state); +static int mc_scf_log_event(mc_flt_page_t *flt_pag); #ifdef DEBUG static int mc_ioctl_debug(dev_t, int, intptr_t, int, cred_t *, int *); @@ -508,6 +510,12 @@ if (ddi_soft_state_zalloc(mc_statep, instance) != DDI_SUCCESS) return (DDI_FAILURE); + if (ddi_create_minor_node(devi, "mc-opl", S_IFCHR, instance, + "ddi_mem_ctrl", 0) != DDI_SUCCESS) { + MC_LOG("mc_attach: create_minor_node failed\n"); + return (DDI_FAILURE); + } + if ((mcp = ddi_get_soft_state(mc_statep, instance)) == NULL) { goto bad; } @@ -542,6 +550,7 @@ return (DDI_SUCCESS); bad: + ddi_remove_minor_node(devi, NULL); ddi_soft_state_free(mc_statep, instance); return (DDI_FAILURE); } @@ -575,6 +584,8 @@ return (DDI_FAILURE); } + ddi_remove_minor_node(devi, NULL); + /* free up the soft state */ ddi_soft_state_free(mc_statep, instance); @@ -600,10 +611,22 @@ mc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { + mc_flt_page_t flt_page; + + if (cmd == MCIOC_FAULT_PAGE) { + if (arg == NULL) + return (EINVAL); + + if (ddi_copyin((const void *)arg, (void *)&flt_page, + sizeof (mc_flt_page_t), 0) < 0) + return (EFAULT); + + return (mc_scf_log_event(&flt_page)); + } #ifdef DEBUG return (mc_ioctl_debug(dev, cmd, arg, mode, credp, rvalp)); #else - return (ENXIO); + return (ENOTTY); #endif } @@ -698,7 +721,7 @@ maddr1.ma_dimm_addr)) { return (0); } else { - cmn_err(CE_WARN, "Translation error source /LSB%d/B%d/%x, " + MC_LOG("Translation error source /LSB%d/B%d/%x, " "PA %lx, target /LSB%d/B%d/%x\n", maddr->ma_bd, bank, maddr->ma_dimm_addr, *pa, maddr1.ma_bd, 
maddr1.ma_bank, maddr1.ma_dimm_addr); @@ -1419,7 +1442,27 @@ retry = mc_retry_info_get(&bankp->mcb_retry_freelist); - ASSERT(retry != NULL); + if (retry == NULL) { + mc_addr_t maddr; + uint64_t paddr; + /* + * previous rewrite request has not completed yet. + * So we discard this rewrite request. + */ + maddr.ma_bd = mcp->mc_board_num; + maddr.ma_bank = bank; + maddr.ma_dimm_addr = addr; + if (mcaddr_to_pa(mcp, &maddr, &paddr) == 0) { + cmn_err(CE_WARN, "Discard CE rewrite request" + " for 0x%lx (/LSB%d/B%d/%x).\n", + paddr, mcp->mc_board_num, bank, addr); + } else { + cmn_err(CE_WARN, "Discard CE rewrite request" + " for /LSB%d/B%d/%x.\n", + mcp->mc_board_num, bank, addr); + } + return; + } retry->ri_addr = addr; retry->ri_state = state; @@ -1477,8 +1520,7 @@ } else { if ((++mc_pce_dropped & 0xff) == 0) { cmn_err(CE_WARN, "Cannot " - "report Permanent CE to " - "SCF\n"); + "report CE to SCF\n"); } } } @@ -1497,7 +1539,7 @@ if (mcp->mc_scf_total[bank] >= mc_max_scf_logs) { if ((++mc_pce_dropped & 0xff) == 0) { - cmn_err(CE_WARN, "Too many Permanent CE requests.\n"); + cmn_err(CE_WARN, "Too many CE requests.\n"); } return; } @@ -1918,11 +1960,13 @@ if ((((flt_stat[0].mf_cntl & MAC_CNTL_PTRL_ERRS) >> MAC_CNTL_PTRL_ERR_SHIFT) == ((mi_flt_stat[0].mf_cntl & MAC_CNTL_MI_ERRS) >> MAC_CNTL_MI_ERR_SHIFT)) && - (flt_stat[0].mf_err_add == mi_flt_stat[0].mf_err_add) && + (flt_stat[0].mf_err_add == + ROUNDDOWN(mi_flt_stat[0].mf_err_add, MC_BOUND_BYTE)) && (((flt_stat[1].mf_cntl & MAC_CNTL_PTRL_ERRS) >> MAC_CNTL_PTRL_ERR_SHIFT) == ((mi_flt_stat[1].mf_cntl & MAC_CNTL_MI_ERRS) >> MAC_CNTL_MI_ERR_SHIFT)) && - (flt_stat[1].mf_err_add == mi_flt_stat[1].mf_err_add)) { + (flt_stat[1].mf_err_add == + ROUNDDOWN(mi_flt_stat[1].mf_err_add, MC_BOUND_BYTE))) { #ifdef DEBUG MC_LOG("discarding PTRL error because " "it is the same as MI\n"); @@ -2032,7 +2076,8 @@ if ((((flt_stat.mf_cntl & MAC_CNTL_PTRL_ERRS) >> MAC_CNTL_PTRL_ERR_SHIFT) == ((mi_flt_stat.mf_cntl & MAC_CNTL_MI_ERRS) >> 
MAC_CNTL_MI_ERR_SHIFT)) && - (flt_stat.mf_err_add == mi_flt_stat.mf_err_add)) { + (flt_stat.mf_err_add == + ROUNDDOWN(mi_flt_stat.mf_err_add, MC_BOUND_BYTE))) { #ifdef DEBUG MC_LOG("discarding PTRL error because " "it is the same as MI\n"); @@ -3703,6 +3748,8 @@ MC_LOG("mc_get_mem_addr: " "mcaddr_to_pa failed\n"); ret = ENODEV; + mutex_exit(&mcp->mc_lock); + continue; } mutex_exit(&mcp->mc_lock); break; @@ -3815,6 +3862,112 @@ return (dimm_list); } +static int +mc_get_mem_fmri(mc_flt_page_t *fpag, char **unum) +{ + if (fpag->fmri_addr == 0 || fpag->fmri_sz > MEM_FMRI_MAX_BUFSIZE) + return (EINVAL); + + *unum = kmem_alloc(fpag->fmri_sz, KM_SLEEP); + if (copyin((void *)fpag->fmri_addr, *unum, fpag->fmri_sz) != 0) { + kmem_free(*unum, fpag->fmri_sz); + return (EFAULT); + } + return (0); +} + +static int +mc_scf_log_event(mc_flt_page_t *flt_pag) +{ + mc_opl_t *mcp; + int board, bank, slot; + int len, rv = 0; + char *unum, *sid; + char dname[MCOPL_MAX_DIMMNAME + 1]; + size_t sid_sz; + uint64_t pa; + mc_flt_stat_t flt_stat; + + if ((sid_sz = cpu_get_name_bufsize()) == 0) + return (ENOTSUP); + + if ((rv = mc_get_mem_fmri(flt_pag, &unum)) != 0) { + MC_LOG("mc_scf_log_event: mc_get_mem_fmri failed\n"); + return (rv); + } + + sid = kmem_zalloc(sid_sz, KM_SLEEP); + + if ((rv = mc_get_mem_sid(unum, sid, sid_sz, &len)) != 0) { + MC_LOG("mc_scf_log_event: mc_get_mem_sid failed\n"); + goto out; + } + + if ((rv = mc_get_mem_addr(unum, sid, (uint64_t)flt_pag->err_add, + &pa)) != 0) { + MC_LOG("mc_scf_log_event: mc_get_mem_addr failed\n"); + goto out; + } + + if (parse_unum_memory(unum, &board, dname) != 0) { + MC_LOG("mc_scf_log_event: parse_unum_memory failed\n"); + rv = EINVAL; + goto out; + } + + if (board < 0) { + MC_LOG("mc_scf_log_event: Invalid board=%d dimm=%s\n", + board, dname); + rv = EINVAL; + goto out; + } + + if (dname_to_bankslot(dname, &bank, &slot) != 0) { + MC_LOG("mc_scf_log_event: dname_to_bankslot failed\n"); + rv = EINVAL; + goto out; + } + + 
mutex_enter(&mcmutex); + + flt_stat.mf_err_add = flt_pag->err_add; + flt_stat.mf_err_log = flt_pag->err_log; + flt_stat.mf_flt_paddr = pa; + + if ((mcp = mc_pa_to_mcp(pa)) == NULL) { + mutex_exit(&mcmutex); + MC_LOG("mc_scf_log_event: invalid pa\n"); + rv = EINVAL; + goto out; + } + + MC_LOG("mc_scf_log_event: DIMM%s, /LSB%d/B%d/%x, pa %lx elog %x\n", + unum, mcp->mc_board_num, bank, flt_pag->err_add, pa, + flt_pag->err_log); + + mutex_enter(&mcp->mc_lock); + + if (!pa_is_valid(mcp, pa)) { + mutex_exit(&mcp->mc_lock); + mutex_exit(&mcmutex); + rv = EINVAL; + goto out; + } + + rv = 0; + + mc_queue_scf_log(mcp, &flt_stat, bank); + + mutex_exit(&mcp->mc_lock); + mutex_exit(&mcmutex); + +out: + kmem_free(unum, flt_pag->fmri_sz); + kmem_free(sid, sid_sz); + + return (rv); +} + #ifdef DEBUG void mc_dump_dimm(char *buf, int dnamesz, int serialsz, int partnumsz) @@ -3868,7 +4021,7 @@ mc_ioctl_debug(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { - caddr_t buf; + caddr_t buf, kbuf; uint64_t pa; int rv = 0; int i; @@ -3937,9 +4090,9 @@ */ cmn_err(CE_NOTE, "Allocating kmem %d MB\n", flags * 512); for (i = 0; i < flags; i++) { - buf = kmem_alloc(512 * 1024 * 1024, KM_SLEEP); + kbuf = kmem_alloc(512 * 1024 * 1024, KM_SLEEP); cmn_err(CE_NOTE, "kmem buf %llx PA %llx\n", - (u_longlong_t)buf, (u_longlong_t)va_to_pa(buf)); + (u_longlong_t)kbuf, (u_longlong_t)va_to_pa(kbuf)); } break; case MCI_SUSPEND: @@ -3951,6 +4104,9 @@ default: rv = ENXIO; } + if (buf) + kmem_free(buf, PAGESIZE); + return (rv); }
--- a/usr/src/uts/sun4u/opl/sys/mc-opl.h Thu May 22 14:07:07 2008 -0700 +++ b/usr/src/uts/sun4u/opl/sys/mc-opl.h Thu May 22 20:47:36 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -190,6 +190,13 @@ mc_flt_stat_t *mflt_stat[2]; /* fault status */ } mc_aflt_t; +typedef struct mc_flt_page { + uint32_t err_add; /* MAC_BANKm_{PTRL|MI}_ERR_ADD reg */ + uint32_t err_log; /* MAC_BANKm_{PTRL|MI}_ERR_LOG reg */ + uint64_t fmri_addr; /* FRU name string */ + uint32_t fmri_sz; /* length of FRU name +1 */ +} mc_flt_page_t; + #define MAC_PTRL_STAT(mcp, i) (mcp->mc_bank[i].mcb_reg_base) #define MAC_PTRL_CNTL(mcp, i) (mcp->mc_bank[i].mcb_reg_base + 0x10) #define MAC_PTRL_ERR_ADD(mcp, i) (mcp->mc_bank[i].mcb_reg_base + 0x20) @@ -206,6 +213,14 @@ /* use PA[37:6] */ #define MAC_RESTART_PA(pa) ((pa >> 6) & 0xffffffff) + +/* + * This is for changing MI_ERR_ADDR accuracy. + * Last two bits of PTRL_ERR_ADDR are always 0. + */ +#define ROUNDDOWN(a, n) (((a) & ~((n) - 1))) +#define MC_BOUND_BYTE 4 + /* * MAC_BANKm_PTRL_STAT_Register */ @@ -399,6 +414,9 @@ #define MC_INJECT_FLAG_ST 0x40 #define MC_INJECT_FLAG_PATH 0x80 +#define MCIOC ('M' << 8) +#define MCIOC_FAULT_PAGE (MCIOC|1) + #ifdef DEBUG #define MCI_NOP 0x0