Mercurial > illumos > illumos-gate
changeset 960:cede29d5c3d0
5089580 panicking cpu should log syndrome 71 errors in other cpus logout area
author | rscott |
---|---|
date | Mon, 21 Nov 2005 12:56:14 -0800 |
parents | c2f9e2889520 |
children | 6cc00d26f27c |
files | usr/src/uts/sun4u/cpu/us3_common.c |
diffstat | 1 files changed, 132 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/sun4u/cpu/us3_common.c	Mon Nov 21 09:45:49 2005 -0800
+++ b/usr/src/uts/sun4u/cpu/us3_common.c	Mon Nov 21 12:56:14 2005 -0800
@@ -1177,6 +1177,124 @@
 }
 
 /*
+ * Tunable to disable the checking of other cpu logout areas during panic for
+ * potential syndrome 71 generating errors.
+ */
+int enable_check_other_cpus_logout = 1;
+
+/*
+ * Check other cpus logout area for potential synd 71 generating
+ * errors.
+ */
+static void
+cpu_check_cpu_logout(int cpuid, caddr_t tpc, int tl, int ecc_type,
+    ch_cpu_logout_t *clop)
+{
+	struct async_flt *aflt;
+	ch_async_flt_t ch_flt;
+	uint64_t t_afar, t_afsr, t_afsr_ext, t_afsr_errs;
+
+	if (clop == NULL || clop->clo_data.chd_afar == LOGOUT_INVALID) {
+		return;
+	}
+
+	bzero(&ch_flt, sizeof (ch_async_flt_t));
+
+	t_afar = clop->clo_data.chd_afar;
+	t_afsr = clop->clo_data.chd_afsr;
+	t_afsr_ext = clop->clo_data.chd_afsr_ext;
+#if defined(SERRANO)
+	ch_flt.afar2 = clop->clo_data.chd_afar2;
+#endif /* SERRANO */
+
+	/*
+	 * In order to simplify code, we maintain this afsr_errs
+	 * variable which holds the aggregate of AFSR and AFSR_EXT
+	 * sticky bits.
+	 */
+	t_afsr_errs = (t_afsr_ext & C_AFSR_EXT_ALL_ERRS) |
+	    (t_afsr & C_AFSR_ALL_ERRS);
+
+	/* Setup the async fault structure */
+	aflt = (struct async_flt *)&ch_flt;
+	aflt->flt_id = gethrtime_waitfree();
+	ch_flt.afsr_ext = t_afsr_ext;
+	ch_flt.afsr_errs = t_afsr_errs;
+	aflt->flt_stat = t_afsr;
+	aflt->flt_addr = t_afar;
+	aflt->flt_bus_id = cpuid;
+	aflt->flt_inst = cpuid;
+	aflt->flt_pc = tpc;
+	aflt->flt_prot = AFLT_PROT_NONE;
+	aflt->flt_class = CPU_FAULT;
+	aflt->flt_priv = ((t_afsr & C_AFSR_PRIV) != 0);
+	aflt->flt_tl = tl;
+	aflt->flt_status = ecc_type;
+	aflt->flt_panic = C_AFSR_PANIC(t_afsr_errs);
+
+	/*
+	 * Queue events on the async event queue, one event per error bit.
+	 * If no events are queued, queue an event to complain.
+	 */
+	if (cpu_queue_events(&ch_flt, NULL, t_afsr_errs, clop) == 0) {
+		ch_flt.flt_type = CPU_INV_AFSR;
+		cpu_errorq_dispatch(FM_EREPORT_CPU_USIII_INVALID_AFSR,
+		    (void *)&ch_flt, sizeof (ch_async_flt_t), ue_queue,
+		    aflt->flt_panic);
+	}
+
+	/*
+	 * Zero out + invalidate CPU logout.
+	 */
+	bzero(clop, sizeof (ch_cpu_logout_t));
+	clop->clo_data.chd_afar = LOGOUT_INVALID;
+}
+
+/*
+ * Check the logout areas of all other cpus for unlogged errors.
+ */
+static void
+cpu_check_other_cpus_logout(void)
+{
+	int i, j;
+	processorid_t myid;
+	struct cpu *cp;
+	ch_err_tl1_data_t *cl1p;
+
+	myid = CPU->cpu_id;
+	for (i = 0; i < NCPU; i++) {
+		cp = cpu[i];
+
+		if ((cp == NULL) || !(cp->cpu_flags & CPU_EXISTS) ||
+		    (cp->cpu_id == myid) || (CPU_PRIVATE(cp) == NULL)) {
+			continue;
+		}
+
+		/*
+		 * Check each of the tl>0 logout areas
+		 */
+		cl1p = CPU_PRIVATE_PTR(cp, chpr_tl1_err_data[0]);
+		for (j = 0; j < CH_ERR_TL1_TLMAX; j++, cl1p++) {
+			if (cl1p->ch_err_tl1_flags == 0)
+				continue;
+
+			cpu_check_cpu_logout(i, (caddr_t)cl1p->ch_err_tl1_tpc,
+			    1, ECC_F_TRAP, &cl1p->ch_err_tl1_logout);
+		}
+
+		/*
+		 * Check each of the remaining logout areas
+		 */
+		cpu_check_cpu_logout(i, NULL, 0, ECC_F_TRAP,
+		    CPU_PRIVATE_PTR(cp, chpr_fecctl0_logout));
+		cpu_check_cpu_logout(i, NULL, 0, ECC_C_TRAP,
+		    CPU_PRIVATE_PTR(cp, chpr_cecc_logout));
+		cpu_check_cpu_logout(i, NULL, 0, ECC_D_TRAP,
+		    CPU_PRIVATE_PTR(cp, chpr_async_logout));
+	}
+}
+
+/*
  * The fast_ecc_err handler transfers control here for UCU, UCC events.
  * Note that we flush Ecache twice, once in the fast_ecc_err handler to
  * flush the error that caused the UCU/UCC, then again here at the end to
@@ -3512,6 +3630,20 @@
 cpu_disable_errors(void)
 {
 	xt_all(set_error_enable_tl1, EN_REG_DISABLE, EER_SET_ABSOLUTE);
+
+	/*
+	 * With error detection now turned off, check the other cpus
+	 * logout areas for any unlogged errors.
+	 */
+	if (enable_check_other_cpus_logout) {
+		cpu_check_other_cpus_logout();
+		/*
+		 * Make a second pass over the logout areas, in case
+		 * there is a failing CPU in an error-trap loop which
+		 * will write to the logout area once it is emptied.
+		 */
+		cpu_check_other_cpus_logout();
+	}
 }
 
 /*