changeset 6693:944dc748ba76

6511374 confusing mc-opl error message after memory reconfiguration
6647128 MI-CEs that are not at an address 0 mod 64 also produce a PTRL-CE ereport
6671901 The fix for ICE handling in 6556139 causes problems for XSCF in Fujitsu Service Mode.
6677120 mc-opl driver cause domain panic if 2 CE errors occur on a memory bank w/ CE rewrite timeout
6695855 PCE page may not be retired on Quad mode.
author wh31274
date Thu, 22 May 2008 20:47:36 -0700
parents a6d8ea2756e6
children d1503f9c5de3
files usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_oplerr.c usr/src/uts/sun4u/opl/io/mc-opl.c usr/src/uts/sun4u/opl/sys/mc-opl.h
diffstat 3 files changed, 295 insertions(+), 16 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_oplerr.c	Thu May 22 14:07:07 2008 -0700
+++ b/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_oplerr.c	Thu May 22 20:47:36 2008 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -37,6 +37,10 @@
 #include <cmd_opl.h>
 #include <string.h>
 #include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/stat.h>
 
 #include <sys/fm/protocol.h>
 #include <sys/fm/io/opl_mc_fm.h>
@@ -44,6 +48,9 @@
 #include <sys/opl_olympus_regs.h>
 #include <sys/fm/cpu/SPARC64-VI.h>
 #include <sys/int_const.h>
+#include <sys/mutex.h>
+#include <sys/dditypes.h>
+#include <opl/sys/mc-opl.h>
 
 /*
  * The following is the common function for handling
@@ -264,13 +271,110 @@
 }
 
 /*
+ * Notify fault page information (pa and errlog) to XSCF via mc-opl
+ */
+#define	MC_PHYDEV_DIR	"/devices"
+#define	MC_PHYPREFIX	"pseudo-mc@"
+static int
+opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl)
+{
+	uint32_t *eadd, *elog;
+	uint_t na, nl;
+	uint64_t pa;
+	char path[MAXPATHLEN];
+	char *unum;
+	nvlist_t *rsrc;
+	DIR *mcdir;
+	struct dirent *dp;
+	mc_flt_page_t flt_page;
+	cmd_page_t *page;
+	struct stat statbuf;
+
+	/*
+	 * Pass the page's error address/log to mc-opl.  Returns 0 if a node
+	 * accepted the ioctl or the page is already faulting, else -1.
+	 * Sanity check of pa is already done at cmd_opl_mac_common().
+	 * mc-opl sets only one entry for MC_OPL_ERR_ADD and MC_OPL_ERR_LOG;
+	 * an empty array is refused because element 0 is read below.
+	 */
+	if (nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0 ||
+	    nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &na) != 0 ||
+	    nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &nl) != 0 ||
+	    na == 0 || nl == 0) {
+		fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n");
+		return (-1);
+	}
+	if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE,
+	    &rsrc) != 0) {
+		fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n");
+		return (-1);
+	}
+	if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) {
+		fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n");
+		return (-1);
+	}
+
+	page = cmd_page_lookup(pa);
+	if (page != NULL && (page->page_flags & CMD_MEM_F_FAULTING)) {
+		/* fault.memory.page will not be created. */
+		return (0);
+	}
+
+	flt_page.err_add = eadd[0];
+	flt_page.err_log = elog[0];
+	flt_page.fmri_addr = (uint64_t)(uintptr_t)unum;
+	flt_page.fmri_sz = strlen(unum) + 1;
+
+	fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%u)\n",
+	    unum, flt_page.fmri_sz);
+	fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n",
+	    (u_longlong_t)pa, eadd[0], elog[0]);
+
+	if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) {
+		while ((dp = readdir(mcdir)) != NULL) {
+			int fd;
+
+			if (strncmp(dp->d_name, MC_PHYPREFIX,
+			    strlen(MC_PHYPREFIX)) != 0)
+				continue;
+
+			(void) snprintf(path, sizeof (path),
+			    "%s/%s", MC_PHYDEV_DIR, dp->d_name);
+
+			if (stat(path, &statbuf) != 0 ||
+			    !S_ISCHR(statbuf.st_mode)) {
+				/* skip if not a character device */
+				continue;
+			}
+
+			if ((fd = open(path, O_RDONLY)) < 0)
+				continue;
+
+			if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) {
+				fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n",
+				    path);
+				(void) close(fd);
+				(void) closedir(mcdir);
+				return (0);
+			}
+			(void) close(fd);
+		}
+		(void) closedir(mcdir);
+	}
+
+	fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n");
+
+	return (-1);
+}
+
+/*
  * This is the common function for processing MAC detected
  * Intermittent and Permanent CEs.
  */
 
 cmd_evdisp_t
 cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class,
-    nvlist_t *asru, nvlist_t *fru, uint64_t pa)
+    nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl)
 {
 	cmd_dimm_t *dimm;
 	const char *uuid;
@@ -310,6 +414,7 @@
 		    dimm->dimm_case.cc_serdnm);
 		fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm);
 
+		(void) opl_scf_log(hdl, nvl);
 	} else {
 		CMD_STAT_BUMP(ce_sticky);
 	}
@@ -388,7 +493,7 @@
 	    strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
 		cmd_evdisp_t ret;
 
-		ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa);
+		ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl);
 		nvlist_free(asru);
 		nvlist_free(fru);
 		if (ret != CMD_EVD_OK) {
--- a/usr/src/uts/sun4u/opl/io/mc-opl.c	Thu May 22 14:07:07 2008 -0700
+++ b/usr/src/uts/sun4u/opl/io/mc-opl.c	Thu May 22 20:47:36 2008 -0700
@@ -60,6 +60,7 @@
 #include <vm/hat_sfmmu.h>
 #include <sys/vmsystm.h>
 #include <sys/membar.h>
+#include <sys/mem.h>
 
 /*
  * Function prototypes
@@ -103,6 +104,7 @@
 
 static void mc_clear_rewrite(mc_opl_t *mcp, int i);
 static void mc_set_rewrite(mc_opl_t *mcp, int bank, uint32_t addr, int state);
+static int mc_scf_log_event(mc_flt_page_t *flt_pag);
 
 #ifdef	DEBUG
 static int mc_ioctl_debug(dev_t, int, intptr_t, int, cred_t *, int *);
@@ -508,6 +510,12 @@
 	if (ddi_soft_state_zalloc(mc_statep, instance) != DDI_SUCCESS)
 		return (DDI_FAILURE);
 
+	if (ddi_create_minor_node(devi, "mc-opl", S_IFCHR, instance,
+	    "ddi_mem_ctrl", 0) != DDI_SUCCESS) {
+		MC_LOG("mc_attach: create_minor_node failed\n");
+		return (DDI_FAILURE);
+	}
+
 	if ((mcp = ddi_get_soft_state(mc_statep, instance)) == NULL) {
 		goto bad;
 	}
@@ -542,6 +550,7 @@
 	return (DDI_SUCCESS);
 
 bad:
+	ddi_remove_minor_node(devi, NULL);
 	ddi_soft_state_free(mc_statep, instance);
 	return (DDI_FAILURE);
 }
@@ -575,6 +584,8 @@
 		return (DDI_FAILURE);
 	}
 
+	ddi_remove_minor_node(devi, NULL);
+
 	/* free up the soft state */
 	ddi_soft_state_free(mc_statep, instance);
 
@@ -600,10 +611,22 @@
 mc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
 	int *rvalp)
 {
+	mc_flt_page_t flt_page;
+
+	if (cmd == MCIOC_FAULT_PAGE) {
+		if (arg == NULL)
+			return (EINVAL);
+
+		if (ddi_copyin((const void *)arg, (void *)&flt_page,
+		    sizeof (mc_flt_page_t), 0) < 0)
+			return (EFAULT);
+
+		return (mc_scf_log_event(&flt_page));
+	}
 #ifdef DEBUG
 	return (mc_ioctl_debug(dev, cmd, arg, mode, credp, rvalp));
 #else
-	return (ENXIO);
+	return (ENOTTY);
 #endif
 }
 
@@ -698,7 +721,7 @@
 	    maddr1.ma_dimm_addr)) {
 		return (0);
 	} else {
-		cmn_err(CE_WARN, "Translation error source /LSB%d/B%d/%x, "
+		MC_LOG("Translation error source /LSB%d/B%d/%x, "
 		    "PA %lx, target /LSB%d/B%d/%x\n", maddr->ma_bd, bank,
 		    maddr->ma_dimm_addr, *pa, maddr1.ma_bd, maddr1.ma_bank,
 		    maddr1.ma_dimm_addr);
@@ -1419,7 +1442,27 @@
 
 	retry = mc_retry_info_get(&bankp->mcb_retry_freelist);
 
-	ASSERT(retry != NULL);
+	if (retry == NULL) {
+		mc_addr_t maddr;
+		uint64_t paddr;
+		/*
+		 * previous rewrite request has not completed yet.
+		 * So we discard this rewrite request.
+		 */
+		maddr.ma_bd = mcp->mc_board_num;
+		maddr.ma_bank =  bank;
+		maddr.ma_dimm_addr = addr;
+		if (mcaddr_to_pa(mcp, &maddr, &paddr) == 0) {
+			cmn_err(CE_WARN, "Discard CE rewrite request"
+			    " for 0x%lx (/LSB%d/B%d/%x).\n",
+			    paddr, mcp->mc_board_num, bank, addr);
+		} else {
+			cmn_err(CE_WARN, "Discard CE rewrite request"
+			    " for /LSB%d/B%d/%x.\n",
+			    mcp->mc_board_num, bank, addr);
+		}
+		return;
+	}
 
 	retry->ri_addr = addr;
 	retry->ri_state = state;
@@ -1477,8 +1520,7 @@
 			} else {
 				if ((++mc_pce_dropped & 0xff) == 0) {
 					cmn_err(CE_WARN, "Cannot "
-					    "report Permanent CE to "
-					    "SCF\n");
+					    "report CE to SCF\n");
 				}
 			}
 		}
@@ -1497,7 +1539,7 @@
 
 	if (mcp->mc_scf_total[bank] >= mc_max_scf_logs) {
 		if ((++mc_pce_dropped & 0xff) == 0) {
-			cmn_err(CE_WARN, "Too many Permanent CE requests.\n");
+			cmn_err(CE_WARN, "Too many CE requests.\n");
 		}
 		return;
 	}
@@ -1918,11 +1960,13 @@
 	if ((((flt_stat[0].mf_cntl & MAC_CNTL_PTRL_ERRS) >>
 	    MAC_CNTL_PTRL_ERR_SHIFT) == ((mi_flt_stat[0].mf_cntl &
 	    MAC_CNTL_MI_ERRS) >> MAC_CNTL_MI_ERR_SHIFT)) &&
-	    (flt_stat[0].mf_err_add == mi_flt_stat[0].mf_err_add) &&
+	    (flt_stat[0].mf_err_add ==
+	    ROUNDDOWN(mi_flt_stat[0].mf_err_add, MC_BOUND_BYTE)) &&
 	    (((flt_stat[1].mf_cntl & MAC_CNTL_PTRL_ERRS) >>
 	    MAC_CNTL_PTRL_ERR_SHIFT) == ((mi_flt_stat[1].mf_cntl &
 	    MAC_CNTL_MI_ERRS) >> MAC_CNTL_MI_ERR_SHIFT)) &&
-	    (flt_stat[1].mf_err_add == mi_flt_stat[1].mf_err_add)) {
+	    (flt_stat[1].mf_err_add ==
+	    ROUNDDOWN(mi_flt_stat[1].mf_err_add, MC_BOUND_BYTE))) {
 #ifdef DEBUG
 		MC_LOG("discarding PTRL error because "
 		    "it is the same as MI\n");
@@ -2032,7 +2076,8 @@
 	if ((((flt_stat.mf_cntl & MAC_CNTL_PTRL_ERRS) >>
 	    MAC_CNTL_PTRL_ERR_SHIFT) == ((mi_flt_stat.mf_cntl &
 	    MAC_CNTL_MI_ERRS) >> MAC_CNTL_MI_ERR_SHIFT)) &&
-	    (flt_stat.mf_err_add == mi_flt_stat.mf_err_add)) {
+	    (flt_stat.mf_err_add ==
+	    ROUNDDOWN(mi_flt_stat.mf_err_add, MC_BOUND_BYTE))) {
 #ifdef DEBUG
 		MC_LOG("discarding PTRL error because "
 		    "it is the same as MI\n");
@@ -3703,6 +3748,8 @@
 				MC_LOG("mc_get_mem_addr: "
 				    "mcaddr_to_pa failed\n");
 				ret = ENODEV;
+				mutex_exit(&mcp->mc_lock);
+				continue;
 			}
 			mutex_exit(&mcp->mc_lock);
 			break;
@@ -3815,6 +3862,112 @@
 	return (dimm_list);
 }
 
+/*
+ * Copy in the unum string that fpag describes (user address and size).
+ * On success *unum is a NUL-terminated buffer of fpag->fmri_sz bytes that
+ * the caller must kmem_free(); otherwise EINVAL or EFAULT is returned.
+ */
+static int
+mc_get_mem_fmri(mc_flt_page_t *fpag, char **unum)
+{
+	if (fpag->fmri_addr == 0 || fpag->fmri_sz == 0 ||
+	    fpag->fmri_sz > MEM_FMRI_MAX_BUFSIZE)
+		return (EINVAL);
+
+	*unum = kmem_alloc(fpag->fmri_sz, KM_SLEEP);
+	if (copyin((void *)fpag->fmri_addr, *unum, fpag->fmri_sz) != 0) {
+		kmem_free(*unum, fpag->fmri_sz);
+		return (EFAULT);
+	}
+	(*unum)[fpag->fmri_sz - 1] = '\0';	/* force termination */
+	return (0);
+}
+
+/*
+ * Queue an SCF log for the fault page described by flt_pag.  Returns 0 once
+ * the record has been passed to mc_queue_scf_log(), else an errno value.
+ */
+static int
+mc_scf_log_event(mc_flt_page_t *flt_pag)
+{
+	mc_opl_t *mcp;
+	int board, bank, slot, len, rv;
+	char *unum, *sid;
+	char dname[MCOPL_MAX_DIMMNAME + 1];
+	size_t sid_sz;
+	uint64_t pa;
+	mc_flt_stat_t flt_stat;
+
+	if ((sid_sz = cpu_get_name_bufsize()) == 0)
+		return (ENOTSUP);
+
+	if ((rv = mc_get_mem_fmri(flt_pag, &unum)) != 0) {
+		MC_LOG("mc_scf_log_event: mc_get_mem_fmri failed\n");
+		return (rv);
+	}
+
+	sid = kmem_zalloc(sid_sz, KM_SLEEP);
+	if ((rv = mc_get_mem_sid(unum, sid, sid_sz, &len)) != 0) {
+		MC_LOG("mc_scf_log_event: mc_get_mem_sid failed\n");
+		goto out;
+	}
+
+	if ((rv = mc_get_mem_addr(unum, sid, (uint64_t)flt_pag->err_add,
+	    &pa)) != 0) {
+		MC_LOG("mc_scf_log_event: mc_get_mem_addr failed\n");
+		goto out;
+	}
+
+	if (parse_unum_memory(unum, &board, dname) != 0) {
+		MC_LOG("mc_scf_log_event: parse_unum_memory failed\n");
+		rv = EINVAL;
+		goto out;
+	}
+
+	if (board < 0) {
+		MC_LOG("mc_scf_log_event: Invalid board=%d dimm=%s\n",
+		    board, dname);
+		rv = EINVAL;
+		goto out;
+	}
+
+	if (dname_to_bankslot(dname, &bank, &slot) != 0) {
+		MC_LOG("mc_scf_log_event: dname_to_bankslot failed\n");
+		rv = EINVAL;
+		goto out;
+	}
+
+	mutex_enter(&mcmutex);
+	flt_stat.mf_err_add = flt_pag->err_add;
+	flt_stat.mf_err_log = flt_pag->err_log;
+	flt_stat.mf_flt_paddr = pa;
+
+	if ((mcp = mc_pa_to_mcp(pa)) == NULL) {
+		mutex_exit(&mcmutex);
+		MC_LOG("mc_scf_log_event: invalid pa\n");
+		rv = EINVAL;
+		goto out;
+	}
+
+	MC_LOG("mc_scf_log_event: DIMM%s, /LSB%d/B%d/%x, pa %lx elog %x\n",
+	    unum, mcp->mc_board_num, bank, flt_pag->err_add, pa,
+	    flt_pag->err_log);
+
+	/* rv is still 0 here; only an invalid pa turns it into an error */
+	mutex_enter(&mcp->mc_lock);
+	if (pa_is_valid(mcp, pa))
+		mc_queue_scf_log(mcp, &flt_stat, bank);
+	else
+		rv = EINVAL;
+	mutex_exit(&mcp->mc_lock);
+	mutex_exit(&mcmutex);
+
+out:
+	kmem_free(unum, flt_pag->fmri_sz);
+	kmem_free(sid, sid_sz);
+	return (rv);
+}
+
 #ifdef DEBUG
 void
 mc_dump_dimm(char *buf, int dnamesz, int serialsz, int partnumsz)
@@ -3868,7 +4021,7 @@
 mc_ioctl_debug(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
 	int *rvalp)
 {
-	caddr_t	buf;
+	caddr_t	buf, kbuf;
 	uint64_t pa;
 	int rv = 0;
 	int i;
@@ -3937,9 +4090,9 @@
 		 */
 		cmn_err(CE_NOTE, "Allocating kmem %d MB\n", flags * 512);
 		for (i = 0; i < flags; i++) {
-			buf = kmem_alloc(512 * 1024 * 1024, KM_SLEEP);
+			kbuf = kmem_alloc(512 * 1024 * 1024, KM_SLEEP);
 			cmn_err(CE_NOTE, "kmem buf %llx PA %llx\n",
-			    (u_longlong_t)buf, (u_longlong_t)va_to_pa(buf));
+			    (u_longlong_t)kbuf, (u_longlong_t)va_to_pa(kbuf));
 		}
 		break;
 	case MCI_SUSPEND:
@@ -3951,6 +4104,9 @@
 	default:
 		rv = ENXIO;
 	}
+	if (buf)
+		kmem_free(buf, PAGESIZE);
+
 	return (rv);
 }
 
--- a/usr/src/uts/sun4u/opl/sys/mc-opl.h	Thu May 22 14:07:07 2008 -0700
+++ b/usr/src/uts/sun4u/opl/sys/mc-opl.h	Thu May 22 20:47:36 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -190,6 +190,13 @@
 	mc_flt_stat_t *mflt_stat[2];	/* fault status */
 } mc_aflt_t;
 
+typedef struct mc_flt_page {
+	uint32_t err_add;		/* MAC_BANKm_{PTRL|MI}_ERR_ADD reg */
+	uint32_t err_log;		/* MAC_BANKm_{PTRL|MI}_ERR_LOG reg */
+	uint64_t fmri_addr;		/* user address of FRU name string */
+	uint32_t fmri_sz;		/* FRU name length + 1 (incl. NUL) */
+} mc_flt_page_t;
+
 #define	MAC_PTRL_STAT(mcp, i)		(mcp->mc_bank[i].mcb_reg_base)
 #define	MAC_PTRL_CNTL(mcp, i)		(mcp->mc_bank[i].mcb_reg_base + 0x10)
 #define	MAC_PTRL_ERR_ADD(mcp, i)	(mcp->mc_bank[i].mcb_reg_base + 0x20)
@@ -206,6 +213,14 @@
 
 /* use PA[37:6] */
 #define	MAC_RESTART_PA(pa)		((pa >> 6) & 0xffffffff)
+
+/*
+ * This is for changing MI_ERR_ADDR accuracy.
+ * Last two bits of PTRL_ERR_ADDR are always 0.
+ */
+#define	ROUNDDOWN(a, n) (((a) & ~((n) - 1)))
+#define	MC_BOUND_BYTE   4
+
 /*
  * MAC_BANKm_PTRL_STAT_Register
  */
@@ -399,6 +414,9 @@
 #define	MC_INJECT_FLAG_ST	0x40
 #define	MC_INJECT_FLAG_PATH	0x80
 
+#define	MCIOC			('M' << 8)
+#define	MCIOC_FAULT_PAGE	(MCIOC|1)
+
 #ifdef DEBUG
 
 #define	MCI_NOP		0x0