Mercurial > illumos > illumos-gate
changeset 3712:881021ac3355
6520990 DR panic if XSCF is BUSY
author | bm42561 |
---|---|
date | Sun, 25 Feb 2007 14:16:28 -0800 |
parents | 2226ffbe7873 |
children | 00e75dc8b749 |
files | usr/src/uts/sun4u/opl/io/dr_mem.c usr/src/uts/sun4u/opl/io/drmach.c usr/src/uts/sun4u/opl/ml/drmach_asm.s usr/src/uts/sun4u/opl/sys/drmach.h usr/src/uts/sun4u/sys/sbd_ioctl.h |
diffstat | 5 files changed, 128 insertions(+), 105 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/sun4u/opl/io/dr_mem.c Sun Feb 25 07:50:49 2007 -0800 +++ b/usr/src/uts/sun4u/opl/io/dr_mem.c Sun Feb 25 14:16:28 2007 -0800 @@ -299,7 +299,7 @@ } if (e_code != ESBD_NOERROR) { - dr_dev_err(CE_IGNORE, &mp->sbm_cm, e_code); + dr_dev_err(CE_WARN, &mp->sbm_cm, e_code); } } } @@ -1770,10 +1770,6 @@ */ /* XXX Can we know that sbdev_error was encountered during release? */ if (s_mp->sbm_cm.sbdev_error != NULL) { - cmn_err(CE_WARN, "%s: %s: error %d noted\n", - f, - s_mp->sbm_cm.sbdev_path, - s_mp->sbm_cm.sbdev_error->e_code); if (t_mp != NULL) { ASSERT(t_mp->sbm_del_mlist == t_mp->sbm_mlist);
--- a/usr/src/uts/sun4u/opl/io/drmach.c Sun Feb 25 07:50:49 2007 -0800 +++ b/usr/src/uts/sun4u/opl/io/drmach.c Sun Feb 25 14:16:28 2007 -0800 @@ -2390,7 +2390,9 @@ drmach_mem_get_memlist(drmachid_t id, struct memlist **ml) { drmach_mem_t *mem; +#ifdef DEBUG int rv; +#endif struct memlist *mlist; if (!DRMACH_IS_MEM_ID(id)) @@ -2881,9 +2883,10 @@ rv = -1; goto logexit; } - if (verbose) + if (verbose) { DRMACH_PR("drmach_log_sysevent: %s %s, flag: %d, verbose: %d\n", attach_pnt, hint, flag, verbose); + } if ((ev = sysevent_alloc(EC_DR, ESC_DR_AP_STATE_CHANGE, SUNW_KERN_PUB"dr", km_flag)) == NULL) { @@ -3055,11 +3058,41 @@ /* * We multiply this to system_clock_frequency so we * are setting a delay of fmem_timeout second for - * the rename command. The spec says 15 second is - * enough but the Fujitsu HW team suggested 17 sec. + * the rename command. + * + * FMEM command itself should complete within 15 sec. + * We add 2 more sec to be conservative. + * + * Note that there is also a SCF BUSY bit checking + * in drmach_asm.s right before FMEM command is + * issued. XSCF sets the SCF BUSY bit when the + * other domain on the same PSB reboots and it + * will not be able to service the FMEM command + * within 15 sec. After setting the SCF BUSY + * bit, XSCF will wait a while before servicing + * other reboot command so there is no race + * condition. */ + static int fmem_timeout = 17; -static int min_copy_size_per_sec = 20 * 1024 * 1024; + +/* + * The empirical data on some OPL system shows that + * we can copy 250 MB per second. We set it to + * 80 MB to be conservative. In normal case, + * this timeout does not affect anything. + */ + +static int min_copy_size_per_sec = 80 * 1024 * 1024; + +/* + * This is the timeout value for the xcall synchronization + * to get all the CPU ready to do the parallel copying. + * Even on a fully loaded system, 10 sec. should be long + * enough. + */ + +static int cpu_xcall_delay = 10; int drmach_disable_mcopy = 0; /* @@ -3115,7 +3148,7 @@ if (prog->data->cpuid == cpuid) { limit = drmach_get_stick_il(); - limit += prog->critical->delay; + limit += cpu_xcall_delay * system_clock_freq; for (i = 0; i < NCPU; i++) { if (CPU_IN_SET(prog->data->cpu_slave_set, i)) { /* wait for all CPU's to be ready */ @@ -3129,8 +3162,8 @@ curr = drmach_get_stick_il(); if (curr > limit) { prog->data->fmem_status.error = - FMEM_XC_TIMEOUT; - return (FMEM_XC_TIMEOUT); + EOPL_FMEM_XC_TIMEOUT; + return (EOPL_FMEM_XC_TIMEOUT); } } } @@ -3144,8 +3177,9 @@ break; } if (prog->data->fmem_status.error) { - prog->data->error[cpuid] = FMEM_TERMINATE; - return (FMEM_TERMINATE); + prog->data->error[cpuid] = + EOPL_FMEM_TERMINATE; + return (EOPL_FMEM_TERMINATE); } DR_DELAY_IL(1, prog->data->stick_freq); } @@ -3165,9 +3199,10 @@ while (nbytes != 0ull) { /* If the master has detected error, we just bail out */ - if (prog->data->fmem_status.error) { - prog->data->error[cpuid] = FMEM_TERMINATE; - return (FMEM_TERMINATE); + if (prog->data->fmem_status.error != ESBD_NOERROR) { + prog->data->error[cpuid] = + EOPL_FMEM_TERMINATE; + return (EOPL_FMEM_TERMINATE); } /* * This copy does NOT use an ASI @@ -3224,11 +3259,11 @@ break; } /* got error traps */ - if (prog->critical->stat[i] == - FMEM_COPY_ERROR) { + if (prog->data->error[i] == + EOPL_FMEM_COPY_ERROR) { prog->data->fmem_status.error = - FMEM_COPY_ERROR; - return (FMEM_COPY_ERROR); + EOPL_FMEM_COPY_ERROR; + return (EOPL_FMEM_COPY_ERROR); } /* if we have not reached limit, wait more */ curr = drmach_get_stick_il(); @@ -3252,19 +3287,20 @@ FMEM_LOOP_FMEM_READY) break; /* copy error */ - if (prog->critical->stat[i] == - FMEM_COPY_ERROR) { + if (prog->data->error[i] == + EOPL_FMEM_COPY_ERROR) { prog->data->fmem_status.error = - FMEM_COPY_ERROR; - return (FMEM_COPY_ERROR); + EOPL_FMEM_COPY_ERROR; + return (EOPL_FMEM_COPY_ERROR); } prog->data->fmem_status.error = - FMEM_COPY_TIMEOUT; - return (FMEM_COPY_TIMEOUT); + EOPL_FMEM_COPY_TIMEOUT; + return (EOPL_FMEM_COPY_TIMEOUT); } } } } + prog->critical->stat[cpuid] = FMEM_LOOP_FMEM_READY; prog->data->fmem_status.stat = FMEM_LOOP_FMEM_READY; @@ -3583,6 +3619,8 @@ (ulong_t)drmach_fmem_loop_script); prog->critical->loop_rtn = (void (*)()) (wp+len); + prog->data->fmem_status.error = ESBD_NOERROR; + /* now we are committed, call SCF, soft suspend mac patrol */ if ((*scf_fmem_start)(s_bd, t_bd)) { err = drerr_new(1, EOPL_SCF_FMEM_START, NULL); @@ -3592,6 +3630,7 @@ prog->data->scf_fmem_cancel = scf_fmem_cancel; prog->data->scf_get_base_addr = scf_get_base_addr; prog->data->fmem_status.op |= OPL_FMEM_SCF_START; + /* soft suspend mac patrol */ (*mc_suspend)(); prog->data->fmem_status.op |= OPL_FMEM_MC_SUSPEND; @@ -3722,6 +3761,7 @@ drmach_copy_rename_program_t *prog = id; sbd_error_t *err = NULL; int rv; + uint_t fmem_error; /* * Note that we have to delay calling SCF to find out the @@ -3750,12 +3790,17 @@ prog->data->fmem_status.op); } + fmem_error = prog->data->fmem_status.error; + if (fmem_error != ESBD_NOERROR) { + err = drerr_new(1, fmem_error, NULL); + } + /* possible ops are SCF_START, MC_SUSPEND */ if (prog->critical->fmem_issued) { - if (prog->data->fmem_status.error != FMEM_NO_ERROR) - cmn_err(CE_PANIC, "scf fmem request failed. " - "error code = 0x%x.", - prog->data->fmem_status.error); + if (fmem_error != ESBD_NOERROR) { + cmn_err(CE_PANIC, "Irrecoverable FMEM error %d\n", + fmem_error); + } rv = (*prog->data->scf_fmem_end)(); if (rv) { cmn_err(CE_PANIC, "scf_fmem_end() failed rv=%d", rv); @@ -3767,18 +3812,12 @@ drmach_swap_pa((drmach_mem_t *)prog->data->s_mem, (drmach_mem_t *)prog->data->t_mem); } else { - if (prog->data->fmem_status.error != 0) { - cmn_err(CE_WARN, "Kernel Migration fails. 0x%x", - prog->data->fmem_status.error); - err = drerr_new(1, EOPL_FMEM_ERROR, "FMEM error = 0x%x", - prog->data->fmem_status.error); - } rv = (*prog->data->scf_fmem_cancel)(); if (rv) { cmn_err(CE_WARN, "scf_fmem_cancel() failed rv=0x%x", rv); if (!err) err = drerr_new(1, EOPL_SCF_FMEM_CANCEL, - "rv = 0x%x", rv); + "scf_fmem_cancel() failed. rv = 0x%x", rv); } } /* soft resume mac patrol */ @@ -3808,7 +3847,7 @@ if (on_trap(&otd, OT_DATA_EC)) { no_trap(); - prog->data->error[cpuid] = FMEM_COPY_ERROR; + prog->data->error[cpuid] = EOPL_FMEM_COPY_ERROR; prog->critical->stat[cpuid] = FMEM_LOOP_EXIT; drmach_flush_icache(); membar_sync_il(); @@ -3908,7 +3947,7 @@ if (prog->critical->scf_reg_base == (uint64_t)-1 || prog->critical->scf_reg_base == NULL) { - prog->data->fmem_status.error = FMEM_SCF_ERR; + prog->data->fmem_status.error = EOPL_FMEM_SCF_ERR; drmach_unlock_critical((caddr_t)prog); return; } @@ -3918,7 +3957,7 @@ for (cpuid = 0; cpuid < NCPU; cpuid++) { if (CPU_IN_SET(cpuset, cpuid)) { prog->critical->stat[cpuid] = FMEM_LOOP_START; - prog->data->error[cpuid] = FMEM_NO_ERROR; + prog->data->error[cpuid] = ESBD_NOERROR; } } @@ -3943,7 +3982,7 @@ xt_sync(cpuset); if (on_trap(&otd, OT_DATA_EC)) { - rtn = FMEM_COPY_ERROR; + rtn = EOPL_FMEM_COPY_ERROR; drmach_flush_icache(); goto done; } @@ -3959,7 +3998,7 @@ done: no_trap(); - if (rtn == FMEM_HW_ERROR) { + if (rtn == EOPL_FMEM_HW_ERROR) { kpreempt_enable(); prom_panic("URGENT_ERROR_TRAP is " "detected during FMEM.\n"); @@ -4017,7 +4056,7 @@ } last = now; } - if (prog->data->error[cpuid] == FMEM_HW_ERROR) { + if (prog->data->error[cpuid] == EOPL_FMEM_HW_ERROR) { prom_panic("URGENT_ERROR_TRAP is " "detected during FMEM.\n"); } @@ -4050,7 +4089,7 @@ (void) drmach_lock_critical((caddr_t)prog_kmem, (caddr_t)prog); - if (prog->data->fmem_status.error == 0) + if (prog->data->fmem_status.error == ESBD_NOERROR) prog->data->fmem_status.error = rtn; if (prog->data->copy_wait_time > 0) {
--- a/usr/src/uts/sun4u/opl/ml/drmach_asm.s Sun Feb 25 07:50:49 2007 -0800 +++ b/usr/src/uts/sun4u/opl/ml/drmach_asm.s Sun Feb 25 14:16:28 2007 -0800 @@ -50,6 +50,7 @@ #include <sys/intreg.h> #include <sys/cheetahregs.h> #include <sys/drmach.h> +#include <sys/sbd_ioctl.h> #if !defined(lint) @@ -127,7 +128,7 @@ btst %o3, %o4 bz,pn %xcc, 3f mov %g0, %o4 - set FMEM_HW_ERROR, %o4 + set EOPL_FMEM_HW_ERROR, %o4 /* set error code and stat code */ 3: @@ -235,7 +236,7 @@ btst %l1, %l2 bz,pn %xcc, 2f nop - mov FMEM_HW_ERROR, %o4 + set EOPL_FMEM_HW_ERROR, %o4 2: /* restore all locals */ add %o0, SAVE_LOCAL, %o1 @@ -287,8 +288,9 @@ btst %o2, %o3 be %xcc, 6f nop + set EOPL_FMEM_SCF_BUSY, %o4 ba 1b - mov FMEM_SCF_BUSY, %o4 + nop /* clear STATUS bit */ 6: @@ -332,14 +334,18 @@ * we read the data back after the write to verify * we write 2 bytes at a time. * If the data read is not the same as data written - * we retry up to a limit of FMEM_RETRY_OUT + * we retry up to a limit of SCF_RETRY_CNT */ 9: stha %o3, [%o1]ASI_IO lduha [%o1]ASI_IO, %o2 sub %o5, 1, %o5 - brz,a %o5, 1b - mov FMEM_RETRY_OUT, %o4 + brnz %o5, 7f + nop + set EOPL_FMEM_RETRY_OUT, %o4 + ba 1b + nop +7: cmp %o2, %o3 bne,a 9b nop @@ -373,6 +379,23 @@ ldxa [%o1]ASI_IO, %o2 stx %o2, [%o0+SCF_TD+8] + /* The following code conforms to the FMEM + sequence (4) as described in the Columbus2 + logical spec section 4.6 + */ + + /* read from SCF SB INFO register */ + sethi %hi(SCF_SB_INFO_OFFSET), %o2 + or %o2, %lo(SCF_SB_INFO_OFFSET), %o2 + add %l0, %o2, %o1 + lduba [%o1]ASI_IO, %o2 + + /* If BUSY bit is set, abort */ + or %g0, (SCF_SB_INFO_BUSY), %o1 + btst %o1, %o2 + set EOPL_FMEM_SCF_BUSY, %o4 + bne 1b + nop rd STICK, %l1 add %l5, %l1, %l5 @@ -406,9 +429,8 @@ be %xcc, 5f ! CMD_COMPLETE is not set nop stha %o3, [%o1]ASI_IO ! Now we are done and clear it - mov FMEM_NO_ERROR, %o4 ba %xcc, 6f - nop + mov ESBD_NOERROR, %o4 /* timeout delay checking */ 5: @@ -416,7 +438,7 @@ cmp %l5, %l2 bge %xcc, 3b nop - mov FMEM_TIMEOUT, %o4 + set EOPL_FMEM_TIMEOUT, %o4 /* we are done or timed out */ 6:
--- a/usr/src/uts/sun4u/opl/sys/drmach.h Sun Feb 25 07:50:49 2007 -0800 +++ b/usr/src/uts/sun4u/opl/sys/drmach.h Sun Feb 25 14:16:28 2007 -0800 @@ -72,17 +72,6 @@ #define FMEM_LOOP_DONE 6 #define FMEM_LOOP_EXIT 7 -#define FMEM_NO_ERROR 0 -#define FMEM_XC_TIMEOUT 1 -#define FMEM_COPY_TIMEOUT 2 -#define FMEM_SCF_BUSY 3 -#define FMEM_RETRY_OUT 4 -#define FMEM_TIMEOUT 5 -#define FMEM_HW_ERROR 6 -#define FMEM_TERMINATE 7 -#define FMEM_COPY_ERROR 8 -#define FMEM_SCF_ERR 9 - #define SCF_CMD_BUSY 0x8000 #define SCF_STATUS_READY 0x8000 #define SCF_STATUS_SHUTDOWN 0x4000 @@ -116,6 +105,9 @@ #define MH_MIN_ALIGNMENT (4 * 1024 * 1024) #define rounddown(x, y) ((x) & ~(y - 1)) +#define SCF_SB_INFO_OFFSET 0x80020 +#define SCF_SB_INFO_BUSY 0x40 + #ifndef _ASM /* @@ -201,7 +193,6 @@ } drmach_scf_regs_t; - typedef struct { volatile uint_t stat; volatile uint_t error; @@ -236,7 +227,7 @@ cpuset_t cpu_copy_set; processorid_t cpuid; drmach_fmem_mbox_t fmem_status; - volatile uchar_t error[NCPU]; + volatile ushort_t error[NCPU]; struct memlist *c_ml; struct memlist *cpu_ml[NCPU]; void (*mc_resume)(void); @@ -269,44 +260,6 @@ #define DRMACH_FMEM_MLIST_PAGE 2 #define DRMACH_FMEM_STAT_PAGE 3 -/* - * layout of the FMEM buffers: - * 1st 8k page - * +--------------------------------+ - * |drmach_copy_rename_program_t | - * +--------------------------------+ - * |drmach_copy_rename_data_t | - * | | - * +--------------------------------+ - * - * 2nd 8k page - * +--------------------------------+ - * |drmach_copy_rename_critical_t | - * | | - * +--------------------------------+ - * |run (drmach_copy_rename_prog__relocatable) - * |(roundup boundary to 1K) | - * +--------------------------------+ - * | fmem_script | - * |(roundup boundary to 1K) | - * +--------------------------------+ - * |loop_script | - * | | - * +--------------------------------+ - * |at least 1K NOP/0's | - * | | - * +--------------------------------+ - * - * 3rd 8k page - * +--------------------------------+ - * |memlist_buffer (free_mlist) | - * | | - * +--------------------------------+ - * - * 4th 8k page - drmach_cr_stat_t. - * - */ - typedef struct { boolean_t assigned; boolean_t powered;
--- a/usr/src/uts/sun4u/sys/sbd_ioctl.h Sun Feb 25 07:50:49 2007 -0800 +++ b/usr/src/uts/sun4u/sys/sbd_ioctl.h Sun Feb 25 14:16:28 2007 -0800 @@ -28,15 +28,18 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#ifndef _ASM #include <sys/types.h> #include <sys/obpdefs.h> #include <sys/processor.h> #include <sys/param.h> +#endif #ifdef __cplusplus extern "C" { #endif +#ifndef _ASM typedef enum { SBD_COMP_NONE, SBD_COMP_CPU, @@ -405,6 +408,7 @@ } sbd_etab32_t; #endif /* _SYSCALL32 */ +#endif /* _ASM */ /* Common error codes */ @@ -602,6 +606,15 @@ #define EOPL_SCF_FMEM_START 5018 /* scf_fmem_start error */ #define EOPL_FMEM_ERROR 5019 /* FMEM error */ #define EOPL_SCF_FMEM_CANCEL 5020 /* scf_fmem_cancel error */ +#define EOPL_FMEM_XC_TIMEOUT 5021 /* xcall timeout */ +#define EOPL_FMEM_COPY_TIMEOUT 5022 /* DR parellel copy timeout */ +#define EOPL_FMEM_SCF_BUSY 5023 /* SCF busy */ +#define EOPL_FMEM_RETRY_OUT 5024 /* SCF IO Retry Error */ +#define EOPL_FMEM_TIMEOUT 5025 /* FMEM command timeout */ +#define EOPL_FMEM_HW_ERROR 5026 /* Hardware error */ +#define EOPL_FMEM_TERMINATE 5027 /* FMEM operation Terminated */ +#define EOPL_FMEM_COPY_ERROR 5028 /* Memory copy error */ +#define EOPL_FMEM_SCF_ERR 5029 /* SCF error */ #ifdef __cplusplus }