changeset 3712:881021ac3355

6520990 DR panic if XSCF is BUSY
author bm42561
date Sun, 25 Feb 2007 14:16:28 -0800
parents 2226ffbe7873
children 00e75dc8b749
files usr/src/uts/sun4u/opl/io/dr_mem.c usr/src/uts/sun4u/opl/io/drmach.c usr/src/uts/sun4u/opl/ml/drmach_asm.s usr/src/uts/sun4u/opl/sys/drmach.h usr/src/uts/sun4u/sys/sbd_ioctl.h
diffstat 5 files changed, 128 insertions(+), 105 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/sun4u/opl/io/dr_mem.c	Sun Feb 25 07:50:49 2007 -0800
+++ b/usr/src/uts/sun4u/opl/io/dr_mem.c	Sun Feb 25 14:16:28 2007 -0800
@@ -299,7 +299,7 @@
 		}
 
 		if (e_code != ESBD_NOERROR) {
-			dr_dev_err(CE_IGNORE, &mp->sbm_cm, e_code);
+			dr_dev_err(CE_WARN, &mp->sbm_cm, e_code);
 		}
 	}
 }
@@ -1770,10 +1770,6 @@
 	 */
 /* XXX Can we know that sbdev_error was encountered during release? */
 	if (s_mp->sbm_cm.sbdev_error != NULL) {
-		cmn_err(CE_WARN, "%s: %s: error %d noted\n",
-			f,
-			s_mp->sbm_cm.sbdev_path,
-			s_mp->sbm_cm.sbdev_error->e_code);
 
 		if (t_mp != NULL) {
 			ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
--- a/usr/src/uts/sun4u/opl/io/drmach.c	Sun Feb 25 07:50:49 2007 -0800
+++ b/usr/src/uts/sun4u/opl/io/drmach.c	Sun Feb 25 14:16:28 2007 -0800
@@ -2390,7 +2390,9 @@
 drmach_mem_get_memlist(drmachid_t id, struct memlist **ml)
 {
 	drmach_mem_t	*mem;
+#ifdef	DEBUG
 	int		rv;
+#endif
 	struct memlist	*mlist;
 
 	if (!DRMACH_IS_MEM_ID(id))
@@ -2881,9 +2883,10 @@
 		rv = -1;
 		goto logexit;
 	}
-	if (verbose)
+	if (verbose) {
 		DRMACH_PR("drmach_log_sysevent: %s %s, flag: %d, verbose: %d\n",
 			attach_pnt, hint, flag, verbose);
+	}
 
 	if ((ev = sysevent_alloc(EC_DR, ESC_DR_AP_STATE_CHANGE,
 		SUNW_KERN_PUB"dr", km_flag)) == NULL) {
@@ -3055,11 +3058,41 @@
 /*
  * We multiply this to system_clock_frequency so we
  * are setting a delay of fmem_timeout second for
- * the rename command.  The spec says 15 second is
- * enough but the Fujitsu HW team suggested 17 sec.
+ * the rename command.
+ *
+ * FMEM command itself should complete within 15 sec.
+ * We add 2 more sec to be conservative.
+ *
+ * Note that there is also an SCF BUSY bit check
+ * in drmach_asm.s right before the FMEM command is
+ * issued.  XSCF sets the SCF BUSY bit when the
+ * other domain on the same PSB reboots, in which
+ * case it will not be able to service the FMEM
+ * command within 15 sec.  After setting the SCF
+ * BUSY bit, XSCF will wait a while before servicing
+ * another reboot command, so there is no race
+ * condition.
  */
+
 static int	fmem_timeout = 17;
-static int	min_copy_size_per_sec = 20 * 1024 * 1024;
+
+/*
+ *	Empirical data on some OPL systems shows that
+ *	we can copy 250 MB per second.  We set it to
+ *	80 MB per second to be conservative.  In the normal
+ *	case, this timeout does not affect anything.
+ */
+
+static int	min_copy_size_per_sec = 80 * 1024 * 1024;
+
+/*
+ *	This is the timeout value for the xcall synchronization
+ *	to get all the CPUs ready to do the parallel copying.
+ *	Even on a fully loaded system, 10 sec. should be long
+ *	enough.
+ */
+
+static int	cpu_xcall_delay = 10;
 int drmach_disable_mcopy = 0;
 
 /*
@@ -3115,7 +3148,7 @@
 
 	if (prog->data->cpuid == cpuid) {
 		limit = drmach_get_stick_il();
-		limit += prog->critical->delay;
+		limit += cpu_xcall_delay * system_clock_freq;
 		for (i = 0; i < NCPU; i++) {
 			if (CPU_IN_SET(prog->data->cpu_slave_set, i)) {
 			/* wait for all CPU's to be ready */
@@ -3129,8 +3162,8 @@
 			    curr = drmach_get_stick_il();
 			    if (curr > limit) {
 				prog->data->fmem_status.error =
-					FMEM_XC_TIMEOUT;
-				return (FMEM_XC_TIMEOUT);
+					EOPL_FMEM_XC_TIMEOUT;
+				return (EOPL_FMEM_XC_TIMEOUT);
 			    }
 			}
 		}
@@ -3144,8 +3177,9 @@
 				break;
 			}
 			if (prog->data->fmem_status.error) {
-				prog->data->error[cpuid] = FMEM_TERMINATE;
-				return (FMEM_TERMINATE);
+				prog->data->error[cpuid] =
+					EOPL_FMEM_TERMINATE;
+				return (EOPL_FMEM_TERMINATE);
 			}
 			DR_DELAY_IL(1, prog->data->stick_freq);
 		}
@@ -3165,9 +3199,10 @@
 
 		while (nbytes != 0ull) {
 			/* If the master has detected error, we just bail out */
-			if (prog->data->fmem_status.error) {
-				prog->data->error[cpuid] = FMEM_TERMINATE;
-				return (FMEM_TERMINATE);
+			if (prog->data->fmem_status.error != ESBD_NOERROR) {
+				prog->data->error[cpuid] =
+					EOPL_FMEM_TERMINATE;
+				return (EOPL_FMEM_TERMINATE);
 			}
 			/*
 			 * This copy does NOT use an ASI
@@ -3224,11 +3259,11 @@
 					break;
 				}
 				/* got error traps */
-				if (prog->critical->stat[i] ==
-					FMEM_COPY_ERROR) {
+				if (prog->data->error[i] ==
+					EOPL_FMEM_COPY_ERROR) {
 					prog->data->fmem_status.error =
-						FMEM_COPY_ERROR;
-					return (FMEM_COPY_ERROR);
+						EOPL_FMEM_COPY_ERROR;
+					return (EOPL_FMEM_COPY_ERROR);
 				}
 				/* if we have not reached limit, wait more */
 				curr = drmach_get_stick_il();
@@ -3252,19 +3287,20 @@
 						FMEM_LOOP_FMEM_READY)
 						break;
 					/* copy error */
-					if (prog->critical->stat[i] ==
-						FMEM_COPY_ERROR) {
+					if (prog->data->error[i] ==
+						EOPL_FMEM_COPY_ERROR) {
 						prog->data->fmem_status.error =
-							FMEM_COPY_ERROR;
-						return (FMEM_COPY_ERROR);
+							EOPL_FMEM_COPY_ERROR;
+						return (EOPL_FMEM_COPY_ERROR);
 					}
 					prog->data->fmem_status.error =
-					    FMEM_COPY_TIMEOUT;
-					return (FMEM_COPY_TIMEOUT);
+					    EOPL_FMEM_COPY_TIMEOUT;
+					return (EOPL_FMEM_COPY_TIMEOUT);
 				}
 			    }
 			}
 		}
+
 		prog->critical->stat[cpuid] = FMEM_LOOP_FMEM_READY;
 		prog->data->fmem_status.stat  = FMEM_LOOP_FMEM_READY;
 
@@ -3583,6 +3619,8 @@
 		(ulong_t)drmach_fmem_loop_script);
 	prog->critical->loop_rtn = (void (*)()) (wp+len);
 
+	prog->data->fmem_status.error = ESBD_NOERROR;
+
 	/* now we are committed, call SCF, soft suspend mac patrol */
 	if ((*scf_fmem_start)(s_bd, t_bd)) {
 		err = drerr_new(1, EOPL_SCF_FMEM_START, NULL);
@@ -3592,6 +3630,7 @@
 	prog->data->scf_fmem_cancel = scf_fmem_cancel;
 	prog->data->scf_get_base_addr = scf_get_base_addr;
 	prog->data->fmem_status.op |= OPL_FMEM_SCF_START;
+
 	/* soft suspend mac patrol */
 	(*mc_suspend)();
 	prog->data->fmem_status.op |= OPL_FMEM_MC_SUSPEND;
@@ -3722,6 +3761,7 @@
 	drmach_copy_rename_program_t	*prog = id;
 	sbd_error_t			*err = NULL;
 	int				rv;
+	uint_t				fmem_error;
 
 	/*
 	 * Note that we have to delay calling SCF to find out the
@@ -3750,12 +3790,17 @@
 				prog->data->fmem_status.op);
 	}
 
+	fmem_error = prog->data->fmem_status.error;
+	if (fmem_error != ESBD_NOERROR) {
+		err = drerr_new(1, fmem_error, NULL);
+	}
+
 	/* possible ops are SCF_START, MC_SUSPEND */
 	if (prog->critical->fmem_issued) {
-		if (prog->data->fmem_status.error != FMEM_NO_ERROR)
-			cmn_err(CE_PANIC, "scf fmem request failed. "
-				"error code = 0x%x.",
-				prog->data->fmem_status.error);
+		if (fmem_error != ESBD_NOERROR) {
+		    cmn_err(CE_PANIC, "Irrecoverable FMEM error %d\n",
+			fmem_error);
+		}
 		rv = (*prog->data->scf_fmem_end)();
 		if (rv) {
 			cmn_err(CE_PANIC, "scf_fmem_end() failed rv=%d", rv);
@@ -3767,18 +3812,12 @@
 		drmach_swap_pa((drmach_mem_t *)prog->data->s_mem,
 			(drmach_mem_t *)prog->data->t_mem);
 	} else {
-		if (prog->data->fmem_status.error != 0) {
-			cmn_err(CE_WARN, "Kernel Migration fails. 0x%x",
-				prog->data->fmem_status.error);
-			err = drerr_new(1, EOPL_FMEM_ERROR, "FMEM error = 0x%x",
-				prog->data->fmem_status.error);
-		}
 		rv = (*prog->data->scf_fmem_cancel)();
 		if (rv) {
 		    cmn_err(CE_WARN, "scf_fmem_cancel() failed rv=0x%x", rv);
 		    if (!err)
 			err = drerr_new(1, EOPL_SCF_FMEM_CANCEL,
-			    "rv = 0x%x", rv);
+			    "scf_fmem_cancel() failed. rv = 0x%x", rv);
 		}
 	}
 	/* soft resume mac patrol */
@@ -3808,7 +3847,7 @@
 
 	if (on_trap(&otd, OT_DATA_EC)) {
 		no_trap();
-		prog->data->error[cpuid] = FMEM_COPY_ERROR;
+		prog->data->error[cpuid] = EOPL_FMEM_COPY_ERROR;
 		prog->critical->stat[cpuid] = FMEM_LOOP_EXIT;
 		drmach_flush_icache();
 		membar_sync_il();
@@ -3908,7 +3947,7 @@
 
 	if (prog->critical->scf_reg_base == (uint64_t)-1 ||
 		prog->critical->scf_reg_base == NULL) {
-		prog->data->fmem_status.error = FMEM_SCF_ERR;
+		prog->data->fmem_status.error = EOPL_FMEM_SCF_ERR;
 		drmach_unlock_critical((caddr_t)prog);
 		return;
 	}
@@ -3918,7 +3957,7 @@
 	for (cpuid = 0; cpuid < NCPU; cpuid++) {
 		if (CPU_IN_SET(cpuset, cpuid)) {
 			prog->critical->stat[cpuid] = FMEM_LOOP_START;
-			prog->data->error[cpuid] = FMEM_NO_ERROR;
+			prog->data->error[cpuid] = ESBD_NOERROR;
 		}
 	}
 
@@ -3943,7 +3982,7 @@
 	xt_sync(cpuset);
 
 	if (on_trap(&otd, OT_DATA_EC)) {
-		rtn = FMEM_COPY_ERROR;
+		rtn = EOPL_FMEM_COPY_ERROR;
 		drmach_flush_icache();
 		goto done;
 	}
@@ -3959,7 +3998,7 @@
 
 done:
 	no_trap();
-	if (rtn == FMEM_HW_ERROR) {
+	if (rtn == EOPL_FMEM_HW_ERROR) {
 		kpreempt_enable();
 		prom_panic("URGENT_ERROR_TRAP is "
 			"detected during FMEM.\n");
@@ -4017,7 +4056,7 @@
 			}
 			last = now;
 		}
-		if (prog->data->error[cpuid] == FMEM_HW_ERROR) {
+		if (prog->data->error[cpuid] == EOPL_FMEM_HW_ERROR) {
 			prom_panic("URGENT_ERROR_TRAP is "
 				"detected during FMEM.\n");
 		}
@@ -4050,7 +4089,7 @@
 
 	(void) drmach_lock_critical((caddr_t)prog_kmem, (caddr_t)prog);
 
-	if (prog->data->fmem_status.error == 0)
+	if (prog->data->fmem_status.error == ESBD_NOERROR)
 		prog->data->fmem_status.error = rtn;
 
 	if (prog->data->copy_wait_time > 0) {
--- a/usr/src/uts/sun4u/opl/ml/drmach_asm.s	Sun Feb 25 07:50:49 2007 -0800
+++ b/usr/src/uts/sun4u/opl/ml/drmach_asm.s	Sun Feb 25 14:16:28 2007 -0800
@@ -50,6 +50,7 @@
 #include <sys/intreg.h>
 #include <sys/cheetahregs.h>
 #include <sys/drmach.h>
+#include <sys/sbd_ioctl.h>
 
 #if !defined(lint)
 
@@ -127,7 +128,7 @@
 	btst	%o3, %o4
 	bz,pn	%xcc, 3f
 	 mov	%g0, %o4
-	set	FMEM_HW_ERROR, %o4
+	set	EOPL_FMEM_HW_ERROR, %o4
 
 	/* set error code and stat code */
 3:
@@ -235,7 +236,7 @@
 	btst	%l1, %l2
 	bz,pn	%xcc, 2f
 	 nop
-	mov	FMEM_HW_ERROR, %o4
+	set	EOPL_FMEM_HW_ERROR, %o4
 2:
 	/* restore all locals */
 	add	%o0, SAVE_LOCAL, %o1
@@ -287,8 +288,9 @@
 	btst	%o2, %o3
 	be	%xcc, 6f
 	 nop
+	set	EOPL_FMEM_SCF_BUSY, %o4
 	ba	1b
-	 mov	FMEM_SCF_BUSY, %o4
+	 nop
 
 	/* clear STATUS bit */
 6:
@@ -332,14 +334,18 @@
 	 * we read the data back after the write to verify
 	 * we write 2 bytes at a time.
 	 * If the data read is not the same as data written
-	 * we retry up to a limit of FMEM_RETRY_OUT
+	 * we retry up to a limit of SCF_RETRY_CNT
 	 */
 9:
 	stha	%o3, [%o1]ASI_IO
 	lduha	[%o1]ASI_IO, %o2
 	sub	%o5, 1, %o5
-	brz,a	%o5, 1b
-	 mov	FMEM_RETRY_OUT, %o4
+	brnz	%o5, 7f
+	 nop
+	set	EOPL_FMEM_RETRY_OUT, %o4
+	ba	1b
+	 nop
+7:
 	cmp	%o2, %o3
 	bne,a	9b
 	 nop
@@ -373,6 +379,23 @@
 	ldxa	[%o1]ASI_IO, %o2
 	stx	%o2, [%o0+SCF_TD+8]
 
+	/* The following code conforms to the FMEM
+	   sequence (4) as described in the Columbus2
+	   logical spec section 4.6
+	*/
+
+	/* read from SCF SB INFO register */
+	sethi	%hi(SCF_SB_INFO_OFFSET), %o2
+	or	%o2, %lo(SCF_SB_INFO_OFFSET), %o2
+	add	%l0, %o2, %o1
+	lduba	[%o1]ASI_IO, %o2
+
+	/* If BUSY bit is set, abort */
+	or	%g0, (SCF_SB_INFO_BUSY), %o1
+	btst	%o1, %o2
+	set	EOPL_FMEM_SCF_BUSY, %o4
+	bne	1b
+	 nop
 
 	rd	STICK, %l1
 	add	%l5, %l1, %l5
@@ -406,9 +429,8 @@
 	be	%xcc, 5f		! CMD_COMPLETE is not set
 	 nop
 	stha	%o3, [%o1]ASI_IO	! Now we are done and clear it
-	mov	FMEM_NO_ERROR, %o4
 	ba	%xcc, 6f
-	 nop
+	 mov	ESBD_NOERROR, %o4
 
 	/* timeout delay checking */
 5:
@@ -416,7 +438,7 @@
 	cmp	%l5, %l2
 	bge	%xcc, 3b
 	 nop
-	mov	FMEM_TIMEOUT, %o4
+	set	EOPL_FMEM_TIMEOUT, %o4
 
 	/* we are done or timed out */
 6:
--- a/usr/src/uts/sun4u/opl/sys/drmach.h	Sun Feb 25 07:50:49 2007 -0800
+++ b/usr/src/uts/sun4u/opl/sys/drmach.h	Sun Feb 25 14:16:28 2007 -0800
@@ -72,17 +72,6 @@
 #define	FMEM_LOOP_DONE		6
 #define	FMEM_LOOP_EXIT		7
 
-#define	FMEM_NO_ERROR		0
-#define	FMEM_XC_TIMEOUT		1
-#define	FMEM_COPY_TIMEOUT	2
-#define	FMEM_SCF_BUSY		3
-#define	FMEM_RETRY_OUT		4
-#define	FMEM_TIMEOUT		5
-#define	FMEM_HW_ERROR		6
-#define	FMEM_TERMINATE		7
-#define	FMEM_COPY_ERROR		8
-#define	FMEM_SCF_ERR		9
-
 #define	SCF_CMD_BUSY		0x8000
 #define	SCF_STATUS_READY	0x8000
 #define	SCF_STATUS_SHUTDOWN	0x4000
@@ -116,6 +105,9 @@
 #define	MH_MIN_ALIGNMENT	(4 * 1024 * 1024)
 #define	rounddown(x, y)		((x) & ~(y - 1))
 
+#define	SCF_SB_INFO_OFFSET	0x80020
+#define	SCF_SB_INFO_BUSY	0x40
+
 #ifndef _ASM
 
 /*
@@ -201,7 +193,6 @@
 } drmach_scf_regs_t;
 
 
-
 typedef struct {
 	volatile uint_t	stat;
 	volatile uint_t	error;
@@ -236,7 +227,7 @@
 	cpuset_t		cpu_copy_set;
 	processorid_t		cpuid;
 	drmach_fmem_mbox_t	fmem_status;
-	volatile uchar_t 	error[NCPU];
+	volatile ushort_t 	error[NCPU];
 	struct memlist		*c_ml;
 	struct memlist		*cpu_ml[NCPU];
 	void			(*mc_resume)(void);
@@ -269,44 +260,6 @@
 #define	DRMACH_FMEM_MLIST_PAGE		2
 #define	DRMACH_FMEM_STAT_PAGE		3
 
-/*
- * layout of the FMEM buffers:
- * 1st 8k page
- * +--------------------------------+
- * |drmach_copy_rename_program_t    |
- * +--------------------------------+
- * |drmach_copy_rename_data_t       |
- * |                                |
- * +--------------------------------+
- *
- * 2nd 8k page
- * +--------------------------------+
- * |drmach_copy_rename_critical_t   |
- * |                                |
- * +--------------------------------+
- * |run (drmach_copy_rename_prog__relocatable)
- * |(roundup boundary to 1K)        |
- * +--------------------------------+
- * | fmem_script                    |
- * |(roundup boundary to 1K)        |
- * +--------------------------------+
- * |loop_script                     |
- * |                                |
- * +--------------------------------+
- * |at least 1K NOP/0's             |
- * |                                |
- * +--------------------------------+
- *
- * 3rd 8k page
- * +--------------------------------+
- * |memlist_buffer (free_mlist)     |
- * |                                |
- * +--------------------------------+
- *
- * 4th 8k page - drmach_cr_stat_t.
- *
- */
-
 typedef struct {
 	boolean_t	assigned;
 	boolean_t	powered;
--- a/usr/src/uts/sun4u/sys/sbd_ioctl.h	Sun Feb 25 07:50:49 2007 -0800
+++ b/usr/src/uts/sun4u/sys/sbd_ioctl.h	Sun Feb 25 14:16:28 2007 -0800
@@ -28,15 +28,18 @@
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
+#ifndef	_ASM
 #include <sys/types.h>
 #include <sys/obpdefs.h>
 #include <sys/processor.h>
 #include <sys/param.h>
+#endif
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
+#ifndef	_ASM
 typedef enum {
 	SBD_COMP_NONE,
 	SBD_COMP_CPU,
@@ -405,6 +408,7 @@
 } sbd_etab32_t;
 
 #endif	/* _SYSCALL32 */
+#endif	/* _ASM */
 
 /* Common error codes */
 
@@ -602,6 +606,15 @@
 #define	EOPL_SCF_FMEM_START	5018	/* scf_fmem_start error */
 #define	EOPL_FMEM_ERROR		5019	/* FMEM error */
 #define	EOPL_SCF_FMEM_CANCEL	5020	/* scf_fmem_cancel error */
+#define	EOPL_FMEM_XC_TIMEOUT	5021	/* xcall timeout */
+#define	EOPL_FMEM_COPY_TIMEOUT	5022	/* DR parallel copy timeout */
+#define	EOPL_FMEM_SCF_BUSY	5023	/* SCF busy */
+#define	EOPL_FMEM_RETRY_OUT	5024	/* SCF IO Retry Error */
+#define	EOPL_FMEM_TIMEOUT	5025	/* FMEM command timeout */
+#define	EOPL_FMEM_HW_ERROR	5026	/* Hardware error */
+#define	EOPL_FMEM_TERMINATE	5027	/* FMEM operation Terminated */
+#define	EOPL_FMEM_COPY_ERROR	5028	/* Memory copy error */
+#define	EOPL_FMEM_SCF_ERR	5029	/* SCF error */
 
 #ifdef	__cplusplus
 }