Mercurial > illumos > illumos-gate
changeset 3139:289de3a0984a
5092743 pcplusmp: IOAPIC cannot deliver interrupt to cpu when RDT entry is masked
6335195 psm modules should workaround incorrect interrupt polarity information from ACPI
6437077 pcplusmp: Fix for 6425990 still allows some deadlocks
6483272 apic_check_stuck_interrupt must not call functions that may lead to preemption; induces hang
6490462 pcplusmp still assumes CPU 0 is always available
author: sethg
date: Fri, 17 Nov 2006 19:17:15 -0800
parents: 7bbdcbfa4cd5
children: 0555fdc68b85
files: usr/src/uts/i86pc/io/pcplusmp/apic.c usr/src/uts/i86pc/io/pcplusmp/apic.h usr/src/uts/i86pc/io/pcplusmp/apic_introp.c
diffstat: 3 files changed, 831 insertions(+), 449 deletions(-)
line wrap: on
line diff
--- a/usr/src/uts/i86pc/io/pcplusmp/apic.c Fri Nov 17 16:14:57 2006 -0800 +++ b/usr/src/uts/i86pc/io/pcplusmp/apic.c Fri Nov 17 19:17:15 2006 -0800 @@ -62,6 +62,9 @@ #include <sys/cyclic.h> #include <sys/note.h> #include <sys/pci_intr_lib.h> +#include <sys/sunndi.h> + +struct ioapic_reprogram_data; /* * Local Function Prototypes @@ -89,9 +92,8 @@ static void apic_reprogram_timeout_handler(void *arg); static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, int new_bind_cpu, volatile int32_t *ioapic, int intin_no, int which_irq, - int iflag, boolean_t *restore_intrp); -static int apic_setup_io_intr(apic_irq_t *irqptr, int irq); -static int apic_setup_io_intr_deferred(apic_irq_t *irqptr, int irq); + struct ioapic_reprogram_data *drep); +static int apic_setup_io_intr(void *p, int irq, boolean_t deferred); static void apic_record_rdt_entry(apic_irq_t *irqptr, int irq); static struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid); static int apic_find_intin(uchar_t ioapic, uchar_t intin); @@ -105,13 +107,17 @@ static void apic_nmi_intr(caddr_t arg); uchar_t apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin); -static int apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, int acquire_lock, - int when); -int apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu, int safe); +static int apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, + struct ioapic_reprogram_data *drep); +int apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu); static void apic_intr_redistribute(); static void apic_cleanup_busy(); static void apic_set_pwroff_method_from_mpcnfhdr(struct apic_mp_cnf_hdr *hdrp); int apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type); +static void apic_try_deferred_reprogram(int ipl, int vect); +static void delete_defer_repro_ent(int which_irq); +static void apic_ioapic_wait_pending_clear(volatile int32_t *ioapic, + int intin_no); /* ACPI support routines */ static int acpi_probe(void); @@ -184,21 +190,11 @@ int 
apic_cpcovf_vect; int apic_enable_cpcovf_intr = 1; -/* Max wait time (in microsecs) for flags to clear in an RDT entry. */ -static int apic_max_usecs_clear_pending = 1000; - -/* Amt of usecs to wait before checking if RDT flags have reset. */ -#define APIC_USECS_PER_WAIT_INTERVAL 100 - -/* Maximum number of times to retry reprogramming via the timeout */ -#define APIC_REPROGRAM_MAX_TIMEOUTS 10 - -/* timeout delay for IOAPIC delayed reprogramming */ -#define APIC_REPROGRAM_TIMEOUT_DELAY 5 /* microseconds */ - -/* Parameter to apic_rebind(): Should reprogramming be done now or later? */ -#define DEFERRED 1 -#define IMMEDIATE 0 +/* Max wait time (in repetitions) for flags to clear in an RDT entry. */ +static int apic_max_reps_clear_pending = 1000; + +/* Maximum number of times to retry reprogramming at apic_intr_exit time */ +#define APIC_REPROGRAM_MAX_TRIES 10000 /* * number of bits per byte, from <sys/param.h> @@ -505,11 +501,29 @@ lock_t apic_ioapic_lock; /* - * apic_ioapic_reprogram_lock prevents a CPU from exiting - * apic_intr_exit before IOAPIC reprogramming information - * is collected. + * apic_defer_reprogram_lock ensures that only one processor is handling + * deferred interrupt programming at apic_intr_exit time. + */ +static lock_t apic_defer_reprogram_lock; + +/* + * The current number of deferred reprogrammings outstanding */ -static lock_t apic_ioapic_reprogram_lock; +uint_t apic_reprogram_outstanding = 0; + +#ifdef DEBUG +/* + * Counters that keep track of deferred reprogramming stats + */ +uint_t apic_intr_deferrals = 0; +uint_t apic_intr_deliver_timeouts = 0; +uint_t apic_last_ditch_reprogram_failures = 0; +uint_t apic_deferred_setup_failures = 0; +uint_t apic_defer_repro_total_retries = 0; +uint_t apic_defer_repro_successes = 0; +uint_t apic_deferred_spurious_enters = 0; +#endif + static int apic_io_max = 0; /* no. 
of i/o apics enabled */ static struct apic_io_intr *apic_io_intrp = 0; @@ -566,19 +580,29 @@ static uchar_t *apic_oldvec_to_newvec; static uchar_t *apic_newvec_to_oldvec; -/* Ensures that the IOAPIC-reprogramming timeout is not reentrant */ -static kmutex_t apic_reprogram_timeout_mutex; - static struct ioapic_reprogram_data { - int valid; /* This entry is valid */ - int bindcpu; /* The CPU to which the int will be bound */ - unsigned timeouts; /* # times the reprogram timeout was called */ -} apic_reprogram_info[APIC_MAX_VECTOR+1]; + boolean_t done; + apic_irq_t *irqp; + /* The CPU to which the int will be bound */ + int bindcpu; + /* # times the reprogram timeout was called */ + unsigned tries; + +/* The irq # is implicit in the array index: */ +} apic_reprogram_info[APIC_MAX_VECTOR + 1]; + /* * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. apic_reprogram_info * is indexed by IRQ number, NOT by vector number. */ +typedef struct prs_irq_list_ent { + int list_prio; + int32_t irq; + iflag_t intrflags; + acpi_prs_private_t prsprv; + struct prs_irq_list_ent *next; +} prs_irq_list_t; /* * The following added to identify a software poweroff method if available. @@ -1520,9 +1544,12 @@ apic_level_intr[i] = 0; *iptr++ = NULL; apic_vector_to_irq[i] = APIC_RESV_IRQ; - apic_reprogram_info[i].valid = 0; + + /* These *must* be initted to B_TRUE! 
*/ + apic_reprogram_info[i].done = B_TRUE; + apic_reprogram_info[i].irqp = NULL; + apic_reprogram_info[i].tries = 0; apic_reprogram_info[i].bindcpu = 0; - apic_reprogram_info[i].timeouts = 0; } /* @@ -1535,7 +1562,6 @@ kmem_zalloc(sizeof (apic_irq_t), KM_NOSLEEP); mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&apic_reprogram_timeout_mutex, NULL, MUTEX_DEFAULT, NULL); #if defined(__amd64) /* * Make cpu-specific interrupt info point to cr8pri vector @@ -1724,7 +1750,7 @@ static void apic_picinit(void) { - int i, j; + int i, j, iflag; uint_t isr; volatile int32_t *ioapic; apic_irq_t *irqptr; @@ -1753,7 +1779,7 @@ LOCK_INIT_CLEAR(&apic_gethrtime_lock); LOCK_INIT_CLEAR(&apic_ioapic_lock); LOCK_INIT_CLEAR(&apic_revector_lock); - LOCK_INIT_CLEAR(&apic_ioapic_reprogram_lock); + LOCK_INIT_CLEAR(&apic_defer_reprogram_lock); LOCK_INIT_CLEAR(&apic_error_lock); picsetup(); /* initialise the 8259 */ @@ -1806,8 +1832,14 @@ } irqptr = apic_irq_table[apic_sci_vect]; + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + /* Program I/O APIC */ - (void) apic_setup_io_intr(irqptr, apic_sci_vect); + (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); irqptr->airq_share++; } @@ -2325,6 +2357,7 @@ if ((irqindex == -1) || (!apic_irq_table[irqindex])) return (PSM_FAILURE); + mutex_enter(&airq_mutex); irqptr = irqheadptr = apic_irq_table[irqindex]; DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x " @@ -2338,6 +2371,8 @@ } irqptr->airq_share++; + mutex_exit(&airq_mutex); + /* return if it is not hardware interrupt */ if (irqptr->airq_mps_intr_index == RESERVE_INDEX) return (PSM_SUCCESS); @@ -2354,8 +2389,6 @@ if (!apic_flag) return (PSM_SUCCESS); - iflag = intr_clear(); - /* * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate, * return failure. 
Not very elegant, but then we hope the @@ -2364,7 +2397,6 @@ if (irqptr->airq_ipl != max_ipl) { vector = apic_allocate_vector(max_ipl, irqindex, 1); if (vector == 0) { - intr_restore(iflag); irqptr->airq_share--; return (PSM_FAILURE); } @@ -2380,17 +2412,31 @@ if ((VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) { apic_record_rdt_entry(irqptr, irqindex); - (void) apic_setup_io_intr(irqptr, irqindex); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + (void) apic_setup_io_intr(irqptr, irqindex, + B_FALSE); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); } irqptr = irqptr->airq_next; } - intr_restore(iflag); return (PSM_SUCCESS); } ASSERT(irqptr); - (void) apic_setup_io_intr(irqptr, irqindex); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE); + + lock_clear(&apic_ioapic_lock); intr_restore(iflag); + return (PSM_SUCCESS); } @@ -2412,6 +2458,7 @@ volatile int32_t *ioapic; apic_irq_t *irqptr, *irqheadptr; + mutex_enter(&airq_mutex); irqindex = IRQINDEX(irqno); irqptr = irqheadptr = apic_irq_table[irqindex]; @@ -2428,6 +2475,8 @@ irqptr->airq_share--; + mutex_exit(&airq_mutex); + if (ipl < max_ipl) return (PSM_SUCCESS); @@ -2460,8 +2509,15 @@ irqp->airq_ipl = (uchar_t)max_ipl; if (irqp->airq_temp_cpu != IRQ_UNINIT) { apic_record_rdt_entry(irqp, irqindex); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + (void) apic_setup_io_intr(irqp, - irqindex); + irqindex, B_FALSE); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); } irqp = irqp->airq_next; } @@ -2509,10 +2565,10 @@ } else apic_cpus[bind_cpu].aci_temp_bound--; } + irqptr->airq_temp_cpu = IRQ_UNINIT; + irqptr->airq_mps_intr_index = FREE_INDEX; lock_clear(&apic_ioapic_lock); intr_restore(iflag); - irqptr->airq_temp_cpu = IRQ_UNINIT; - irqptr->airq_mps_intr_index = FREE_INDEX; apic_free_vector(irqptr->airq_vector); return (PSM_SUCCESS); } @@ -2545,6 +2601,7 @@ 
irqptr->airq_temp_cpu = IRQ_UNINIT; irqptr->airq_mps_intr_index = FREE_INDEX; + return (PSM_SUCCESS); } @@ -2570,7 +2627,7 @@ static int apic_post_cpu_start() { - int i, cpun; + int i, cpun, iflag; apic_irq_t *irq_ptr; apic_init_intr(); @@ -2585,6 +2642,7 @@ apic_ret(); cpun = psm_get_cpu_id(); + apic_cpus[cpun].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE; for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) { @@ -2594,12 +2652,20 @@ continue; while (irq_ptr) { - if (irq_ptr->airq_temp_cpu != IRQ_UNINIT) - (void) apic_rebind(irq_ptr, cpun, 1, IMMEDIATE); + if (irq_ptr->airq_temp_cpu != IRQ_UNINIT) { + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + (void) apic_rebind(irq_ptr, cpun, NULL); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + } irq_ptr = irq_ptr->airq_next; } } + apicadr[APIC_DIVIDE_REG] = apic_divide_reg_init; return (PSM_SUCCESS); } @@ -3016,10 +3082,25 @@ iflag = intr_clear(); lock_set(&apic_ioapic_lock); + + for (i = 0; i <= APIC_MAX_VECTOR; i++) { + if (apic_reprogram_info[i].done == B_FALSE) { + if (apic_reprogram_info[i].bindcpu == cpun) { + /* + * CPU is busy -- it's the target of + * a pending reprogramming attempt + */ + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + return (PSM_FAILURE); + } + } + } + apic_cpus[cpun].aci_status &= ~APIC_CPU_INTR_ENABLE; - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); + apic_cpus[cpun].aci_curipl = 0; + i = apic_min_device_irq; for (; i <= apic_max_device_irq; i++) { /* @@ -3046,10 +3127,14 @@ bind_cpu = 0; } - } while (apic_rebind_all(irq_ptr, bind_cpu, 1)); + } while (apic_rebind_all(irq_ptr, bind_cpu)); } } } + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + if (hardbound) { cmn_err(CE_WARN, "Could not disable interrupts on %d" "due to user bound interrupts", cpun); @@ -3067,25 +3152,27 @@ iflag = intr_clear(); lock_set(&apic_ioapic_lock); + apic_cpus[cpun].aci_status |= APIC_CPU_INTR_ENABLE; - lock_clear(&apic_ioapic_lock); - 
intr_restore(iflag); i = apic_min_device_irq; for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) { if ((irq_ptr = apic_irq_table[i]) != NULL) { if ((irq_ptr->airq_cpu & ~IRQ_USER_BOUND) == cpun) { (void) apic_rebind_all(irq_ptr, - irq_ptr->airq_cpu, 1); + irq_ptr->airq_cpu); } } } + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); } /* * apic_introp_xlate() replaces apic_translate_irq() and is * called only from apic_intr_ops(). With the new ADII framework, - * the priority can no longer be retrived through i_ddi_get_intrspec(). + * the priority can no longer be retrieved through i_ddi_get_intrspec(). * It has to be passed in from the caller. */ int @@ -4199,55 +4286,86 @@ irqptr->airq_rdt_entry = level|io_po|vector; } +static processorid_t +apic_find_cpu(int flag) +{ + processorid_t acid = 0; + int i; + + /* Find the first CPU with the passed-in flag set */ + for (i = 0; i < apic_nproc; i++) { + if (apic_cpus[i].aci_status & flag) { + acid = i; + break; + } + } + + ASSERT((apic_cpus[acid].aci_status & flag) != 0); + return (acid); +} + /* * Call rebind to do the actual programming. + * Must be called with interrupts disabled and apic_ioapic_lock held + * 'p' is polymorphic -- if this function is called to process a deferred + * reprogramming, p is of type 'struct ioapic_reprogram_data *', from which + * the irq pointer is retrieved. If not doing deferred reprogramming, + * p is of the type 'apic_irq_t *'. + * + * apic_ioapic_lock must be held across this call, as it protects apic_rebind + * and it protects apic_find_cpu() from a race in which a CPU can be taken + * offline after a cpu is selected, but before apic_rebind is called to + * bind interrupts to it. 
*/ static int -apic_setup_io_intr(apic_irq_t *irqptr, int irq) +apic_setup_io_intr(void *p, int irq, boolean_t deferred) { + apic_irq_t *irqptr; + struct ioapic_reprogram_data *drep = NULL; int rv; - if (rv = apic_rebind(irqptr, apic_irq_table[irq]->airq_cpu, 1, - IMMEDIATE)) - /* CPU is not up or interrupt is disabled. Fall back to 0 */ - rv = apic_rebind(irqptr, 0, 1, IMMEDIATE); + if (deferred) { + drep = (struct ioapic_reprogram_data *)p; + ASSERT(drep != NULL); + irqptr = drep->irqp; + } else + irqptr = (apic_irq_t *)p; + + ASSERT(irqptr != NULL); + + rv = apic_rebind(irqptr, apic_irq_table[irq]->airq_cpu, drep); + if (rv) { + /* + * CPU is not up or interrupts are disabled. Fall back to + * the first available CPU + */ + rv = apic_rebind(irqptr, apic_find_cpu(APIC_CPU_INTR_ENABLE), + drep); + } return (rv); } /* - * Deferred reprogramming: Call apic_rebind to do the real work. + * Bind interrupt corresponding to irq_ptr to bind_cpu. + * Must be called with interrupts disabled and apic_ioapic_lock held */ static int -apic_setup_io_intr_deferred(apic_irq_t *irqptr, int irq) +apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, + struct ioapic_reprogram_data *drep) { - int rv; - - if (rv = apic_rebind(irqptr, apic_irq_table[irq]->airq_cpu, 1, - DEFERRED)) - /* CPU is not up or interrupt is disabled. Fall back to 0 */ - rv = apic_rebind(irqptr, 0, 1, DEFERRED); - - return (rv); -} - -/* - * Bind interrupt corresponding to irq_ptr to bind_cpu. acquire_lock - * if false (0) means lock is already held (e.g: in rebind_all). 
- */ -static int -apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, int acquire_lock, int when) -{ - int intin_no; + int ioapicindex, intin_no; volatile int32_t *ioapic; uchar_t airq_temp_cpu; apic_cpus_info_t *cpu_infop; - int iflag; - int which_irq = apic_vector_to_irq[irq_ptr->airq_vector]; - boolean_t restore_iflag = B_TRUE; + uint32_t rdt_entry; + int which_irq; + + which_irq = apic_vector_to_irq[irq_ptr->airq_vector]; intin_no = irq_ptr->airq_intin_no; - ioapic = apicioadr[irq_ptr->airq_ioapicindex]; + ioapicindex = irq_ptr->airq_ioapicindex; + ioapic = apicioadr[ioapicindex]; airq_temp_cpu = irq_ptr->airq_temp_cpu; if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) { if (airq_temp_cpu & IRQ_USER_BOUND) @@ -4257,52 +4375,29 @@ ASSERT(airq_temp_cpu < apic_nproc); } - iflag = intr_clear(); - - if (acquire_lock) - lock_set(&apic_ioapic_lock); - /* - * Can't bind to a CPU that's not online: + * Can't bind to a CPU that's not accepting interrupts: */ cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND]; - if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE)) { - - if (acquire_lock) - lock_clear(&apic_ioapic_lock); - - intr_restore(iflag); + if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE)) return (1); - } /* - * If this is a deferred reprogramming attempt, ensure we have - * not been passed stale data: + * If we are about to change the interrupt vector for this interrupt, + * and this interrupt is level-triggered, attached to an IOAPIC, + * has been delivered to a CPU and that CPU has not handled it + * yet, we cannot reprogram the IOAPIC now. 
*/ - if ((when == DEFERRED) && - (apic_reprogram_info[which_irq].valid == 0)) { - /* stale info, so just return */ - if (acquire_lock) - lock_clear(&apic_ioapic_lock); - - intr_restore(iflag); - return (0); - } - - /* - * If this interrupt has been delivered to a CPU and that CPU - * has not handled it yet, we cannot reprogram the IOAPIC now: - */ - if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index) && - apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu, bind_cpu, - ioapic, intin_no, which_irq, iflag, &restore_iflag) != 0) { - - if (acquire_lock) - lock_clear(&apic_ioapic_lock); - - if (restore_iflag) - intr_restore(iflag); - return (0); + if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { + + rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no); + + if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) && + apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu, + bind_cpu, ioapic, intin_no, which_irq, drep) != 0) { + + return (0); + } } /* @@ -4313,6 +4408,9 @@ */ if ((uchar_t)bind_cpu == IRQ_UNBOUND) { + + rdt_entry = AV_LDEST | AV_LOPRI | irq_ptr->airq_rdt_entry; + /* Write the RDT entry -- no specific CPU binding */ WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapic, intin_no, AV_TOALL); @@ -4320,12 +4418,9 @@ apic_cpus[airq_temp_cpu].aci_temp_bound--; /* Write the vector, trigger, and polarity portion of the RDT */ - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no, - AV_LDEST | AV_LOPRI | irq_ptr->airq_rdt_entry); - if (acquire_lock) - lock_clear(&apic_ioapic_lock); + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no, rdt_entry); + irq_ptr->airq_temp_cpu = IRQ_UNBOUND; - intr_restore(iflag); return (0); } @@ -4344,15 +4439,18 @@ apic_cpus[airq_temp_cpu].aci_temp_bound--; } if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { + + rdt_entry = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry; + /* Write the vector, trigger, and polarity portion of the RDT */ - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no, - AV_PDEST | AV_FIXED | 
irq_ptr->airq_rdt_entry); + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no, rdt_entry); + } else { int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ? DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX; (void) apic_pci_msi_disable_mode(irq_ptr->airq_dip, type, - irq_ptr->airq_ioapicindex); - if (irq_ptr->airq_ioapicindex == irq_ptr->airq_origirq) { + ioapicindex); + if (ioapicindex == irq_ptr->airq_origirq) { /* first one */ DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " "apic_pci_msi_enable_vector\n")); @@ -4365,7 +4463,7 @@ "returned PSM_FAILURE"); } } - if ((irq_ptr->airq_ioapicindex + irq_ptr->airq_intin_no - 1) == + if ((ioapicindex + irq_ptr->airq_intin_no - 1) == irq_ptr->airq_origirq) { /* last one */ DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " "pci_msi_enable_mode\n")); @@ -4378,42 +4476,204 @@ } } } - if (acquire_lock) - lock_clear(&apic_ioapic_lock); irq_ptr->airq_temp_cpu = (uchar_t)bind_cpu; apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND)); - intr_restore(iflag); return (0); } +static void +apic_last_ditch_clear_remote_irr(volatile int32_t *ioapic, int intin_no) +{ + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no) + & AV_REMOTE_IRR) != 0) { + /* + * Trying to clear the bit through normal + * channels has failed. So as a last-ditch + * effort, try to set the trigger mode to + * edge, then to level. This has been + * observed to work on many systems. + */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, + intin_no, + READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, + intin_no) & ~AV_LEVEL); + + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, + intin_no, + READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, + intin_no) | AV_LEVEL); + + /* + * If the bit's STILL set, this interrupt may + * be hosed. 
+ */ + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, + intin_no) & AV_REMOTE_IRR) != 0) { + + prom_printf("pcplusmp: Remote IRR still " + "not clear for IOAPIC %p intin %d.\n" + "\tInterrupts to this pin may cease " + "functioning.\n", ioapic, intin_no); +#ifdef DEBUG + apic_last_ditch_reprogram_failures++; +#endif + } + } +} + /* - * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR - * bit set. Sets up a timeout to perform the reprogramming at a later time - * if it cannot wait for the Remote IRR bit to clear (or if waiting did not - * result in the bit's clearing). - * - * This function will mask the RDT entry if the Remote IRR bit is set. - * - * Returns non-zero if the caller should defer IOAPIC reprogramming. + * This function is protected by apic_ioapic_lock coupled with the + * fact that interrupts are disabled. + */ +static void +delete_defer_repro_ent(int which_irq) +{ + ASSERT(which_irq >= 0); + ASSERT(which_irq <= 255); + + if (apic_reprogram_info[which_irq].done) + return; + + apic_reprogram_info[which_irq].done = B_TRUE; + +#ifdef DEBUG + apic_defer_repro_total_retries += + apic_reprogram_info[which_irq].tries; + + apic_defer_repro_successes++; +#endif + + if (--apic_reprogram_outstanding == 0) { + + setlvlx = apic_intr_exit; + } +} + + +/* + * Interrupts must be disabled during this function to prevent + * self-deadlock. Interrupts are disabled because this function + * is called from apic_check_stuck_interrupt(), which is called + * from apic_rebind(), which requires its caller to disable interrupts. 
*/ -static int -apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, - int new_bind_cpu, volatile int32_t *ioapic, int intin_no, int which_irq, - int iflag, boolean_t *intr_restorep) +static void +add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu) +{ + ASSERT(which_irq >= 0); + ASSERT(which_irq <= 255); + + /* + * On the off-chance that there's already a deferred + * reprogramming on this irq, check, and if so, just update the + * CPU and irq pointer to which the interrupt is targeted, then return. + */ + if (!apic_reprogram_info[which_irq].done) { + apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; + apic_reprogram_info[which_irq].irqp = irq_ptr; + return; + } + + apic_reprogram_info[which_irq].irqp = irq_ptr; + apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; + apic_reprogram_info[which_irq].tries = 0; + /* + * This must be the last thing set, since we're not + * grabbing any locks, apic_try_deferred_reprogram() will + * make its decision about using this entry iff done + * is false. 
+ */ + apic_reprogram_info[which_irq].done = B_FALSE; + + /* + * If there were previously no deferred reprogrammings, change + * setlvlx to call apic_try_deferred_reprogram() + */ + if (++apic_reprogram_outstanding == 1) { + + setlvlx = apic_try_deferred_reprogram; + } +} + +static void +apic_try_deferred_reprogram(int prev_ipl, int irq) { - int32_t rdt_entry; - int waited; - - /* Mask the RDT entry, but only if it's a level-triggered interrupt */ - rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no); - if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) { - - /* Mask it */ - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no, - AV_MASK | rdt_entry); + int reproirq, iflag; + struct ioapic_reprogram_data *drep; + + apic_intr_exit(prev_ipl, irq); + + if (!lock_try(&apic_defer_reprogram_lock)) { + return; + } + + /* + * Acquire the apic_ioapic_lock so that any other operations that + * may affect the apic_reprogram_info state are serialized. + * It's still possible for the last deferred reprogramming to clear + * between the time we entered this function and the time we get to + * the for loop below. In that case, *setlvlx will have been set + * back to apic_intr_exit and drep will be NULL. (There's no way to + * stop that from happening -- we would need to grab a lock before + * calling *setlvlx, which is neither realistic nor prudent). + */ + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + /* + * For each deferred RDT entry, try to reprogram it now. Note that + * there is no lock acquisition to read apic_reprogram_info because + * '.done' is set only after the other fields in the structure are set. + */ + + drep = NULL; + for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) { + if (apic_reprogram_info[reproirq].done == B_FALSE) { + drep = &apic_reprogram_info[reproirq]; + break; + } } /* + * Either we found a deferred action to perform, or + * we entered this function spuriously, after *setlvlx + * was restored to point to apic_intr_enter. 
Any other + * permutation is invalid. + */ + ASSERT(drep != NULL || *setlvlx == apic_intr_exit); + + /* + * Though we can't really do anything about errors + * at this point, keep track of them for reporting. + * Note that it is very possible for apic_setup_io_intr + * to re-register this very timeout if the Remote IRR bit + * has not yet cleared. + */ + +#ifdef DEBUG + if (drep != NULL) { + if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) { + apic_deferred_setup_failures++; + } + } else { + apic_deferred_spurious_enters++; + } +#else + if (drep != NULL) + (void) apic_setup_io_intr(drep, reproirq, B_TRUE); +#endif + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + lock_clear(&apic_defer_reprogram_lock); +} + +static void +apic_ioapic_wait_pending_clear(volatile int32_t *ioapic, int intin_no) +{ + int waited; + + /* * Wait for the delivery pending bit to clear. */ if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no) & @@ -4425,24 +4685,81 @@ * a very small amount of time, but include a timeout just in * case). */ - for (waited = 0; waited < apic_max_usecs_clear_pending; - waited += APIC_USECS_PER_WAIT_INTERVAL) { + for (waited = 0; waited < apic_max_reps_clear_pending; + waited++) { if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no) & AV_PENDING) == 0) { break; } - drv_usecwait(APIC_USECS_PER_WAIT_INTERVAL); - } - - if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no) & - AV_PENDING) != 0) { - cmn_err(CE_WARN, "!IOAPIC %d intin %d: Could not " - "deliver interrupt to local APIC within " - "%d usecs.", irq_ptr->airq_ioapicindex, - irq_ptr->airq_intin_no, - apic_max_usecs_clear_pending); } } +} + +/* + * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR + * bit set. Calls functions that modify the function that setlvlx points to, + * so that the reprogramming can be retried very shortly. + * + * This function will mask the RDT entry if the interrupt is level-triggered. 
+ * (The caller is responsible for unmasking the RDT entry.) + * + * Returns non-zero if the caller should defer IOAPIC reprogramming. + */ +static int +apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, + int new_bind_cpu, volatile int32_t *ioapic, int intin_no, int which_irq, + struct ioapic_reprogram_data *drep) +{ + int32_t rdt_entry; + int waited; + int reps = 0; + + /* + * Wait for the delivery pending bit to clear. + */ + do { + ++reps; + + apic_ioapic_wait_pending_clear(ioapic, intin_no); + + /* + * Mask the RDT entry, but only if it's a level-triggered + * interrupt + */ + rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no); + if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) { + + /* Mask it */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no, + AV_MASK | rdt_entry); + } + + if ((rdt_entry & AV_LEVEL) == AV_LEVEL) { + /* + * If there was a race and an interrupt was injected + * just before we masked, check for that case here. + * Then, unmask the RDT entry and try again. If we're + * on our last try, don't unmask (because we want the + * RDT entry to remain masked for the rest of the + * function). + */ + rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, + intin_no); + if ((rdt_entry & AV_PENDING) && + (reps < apic_max_reps_clear_pending)) { + /* Unmask it */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, + intin_no, rdt_entry & ~AV_MASK); + } + } + + } while ((rdt_entry & AV_PENDING) && + (reps < apic_max_reps_clear_pending)); + +#ifdef DEBUG + if (rdt_entry & AV_PENDING) + apic_intr_deliver_timeouts++; +#endif /* * If the remote IRR bit is set, then the interrupt has been sent @@ -4461,188 +4778,79 @@ * may have been delivered to the current CPU so handle that * case by deferring the reprogramming (below). 
*/ - kpreempt_disable(); if ((old_bind_cpu != IRQ_UNBOUND) && (old_bind_cpu != IRQ_UNINIT) && (old_bind_cpu != psm_get_cpu_id())) { - for (waited = 0; waited < apic_max_usecs_clear_pending; - waited += APIC_USECS_PER_WAIT_INTERVAL) { + for (waited = 0; waited < apic_max_reps_clear_pending; + waited++) { if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no) & AV_REMOTE_IRR) == 0) { - /* Clear the reprogramming state: */ - lock_set(&apic_ioapic_reprogram_lock); - - apic_reprogram_info[which_irq].valid - = 0; - apic_reprogram_info[which_irq].bindcpu - = 0; - apic_reprogram_info[which_irq].timeouts - = 0; - - lock_clear(&apic_ioapic_reprogram_lock); + delete_defer_repro_ent(which_irq); /* Remote IRR has cleared! */ - kpreempt_enable(); return (0); } - drv_usecwait(APIC_USECS_PER_WAIT_INTERVAL); } } - kpreempt_enable(); /* * If we waited and the Remote IRR bit is still not cleared, * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS - * times for this interrupt, try the last-ditch workarounds: + * times for this interrupt, try the last-ditch workaround: */ - if (apic_reprogram_info[which_irq].timeouts >= - APIC_REPROGRAM_MAX_TIMEOUTS) { - - if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, intin_no) - & AV_REMOTE_IRR) != 0) { - /* - * Trying to clear the bit through normal - * channels has failed. So as a last-ditch - * effort, try to set the trigger mode to - * edge, then to level. This has been - * observed to work on many systems. - */ - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, - intin_no, - READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, - intin_no) & ~AV_LEVEL); - - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, - intin_no, - READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, - intin_no) | AV_LEVEL); - - /* - * If the bit's STILL set, declare total and - * utter failure - */ - if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic, - intin_no) & AV_REMOTE_IRR) != 0) { - cmn_err(CE_WARN, "!IOAPIC %d intin %d: " - "Remote IRR failed to reset " - "within %d usecs. 
Interrupts to " - "this pin may cease to function.", - irq_ptr->airq_ioapicindex, - irq_ptr->airq_intin_no, - apic_max_usecs_clear_pending); - } - } - /* Clear the reprogramming state: */ - lock_set(&apic_ioapic_reprogram_lock); - - apic_reprogram_info[which_irq].valid = 0; - apic_reprogram_info[which_irq].bindcpu = 0; - apic_reprogram_info[which_irq].timeouts = 0; - - lock_clear(&apic_ioapic_reprogram_lock); + if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) { + + apic_last_ditch_clear_remote_irr(ioapic, intin_no); + + /* Mark this one as reprogrammed: */ + delete_defer_repro_ent(which_irq); + + return (0); } else { #ifdef DEBUG - cmn_err(CE_WARN, "Deferring reprogramming of irq %d", - which_irq); -#endif /* DEBUG */ + apic_intr_deferrals++; +#endif + /* * If waiting for the Remote IRR bit (above) didn't - * allow it to clear, defer the reprogramming: + * allow it to clear, defer the reprogramming. + * Add a new deferred-programming entry if the + * caller passed a NULL one (and update the existing one + * in case anything changed). 
*/ - lock_set(&apic_ioapic_reprogram_lock); - - apic_reprogram_info[which_irq].valid = 1; - apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; - apic_reprogram_info[which_irq].timeouts++; - - lock_clear(&apic_ioapic_reprogram_lock); - - *intr_restorep = B_FALSE; - intr_restore(iflag); - - /* Fire up a timeout to handle this later */ - (void) timeout(apic_reprogram_timeout_handler, - (void *) 0, - drv_usectohz(APIC_REPROGRAM_TIMEOUT_DELAY)); + add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu); + if (drep) + drep->tries++; /* Inform caller to defer IOAPIC programming: */ return (1); } + } + + /* Remote IRR is clear */ + delete_defer_repro_ent(which_irq); + return (0); } /* - * Timeout handler that performs the APIC reprogramming - */ -/*ARGSUSED*/ -static void -apic_reprogram_timeout_handler(void *arg) -{ - /*LINTED: set but not used in function*/ - int i, result; - - /* Serialize access to this function */ - mutex_enter(&apic_reprogram_timeout_mutex); - - /* - * For each entry in the reprogramming state that's valid, - * try the reprogramming again: - */ - for (i = 0; i < APIC_MAX_VECTOR; i++) { - if (apic_reprogram_info[i].valid == 0) - continue; - /* - * Though we can't really do anything about errors - * at this point, keep track of them for reporting. - * Note that it is very possible for apic_setup_io_intr - * to re-register this very timeout if the Remote IRR bit - * has not yet cleared. - */ - result = apic_setup_io_intr_deferred(apic_irq_table[i], i); - -#ifdef DEBUG - if (result) - cmn_err(CE_WARN, "apic_reprogram_timeout: " - "apic_setup_io_intr returned nonzero for " - "irq=%d!", i); -#endif /* DEBUG */ - } - - mutex_exit(&apic_reprogram_timeout_mutex); -} - - -/* - * Called to migrate all interrupts at an irq to another cpu. safe - * if true means we are not being called from an interrupt - * context and hence it is safe to do a lock_set. 
If false - * do only a lock_try and return failure ( non 0 ) if we cannot get it + * Called to migrate all interrupts at an irq to another cpu. + * Must be called with interrupts disabled and apic_ioapic_lock held */ int -apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu, int safe) +apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu) { apic_irq_t *irqptr = irq_ptr; int retval = 0; - int iflag; - - iflag = intr_clear(); - if (!safe) { - if (lock_try(&apic_ioapic_lock) == 0) { - intr_restore(iflag); - return (1); - } - } else - lock_set(&apic_ioapic_lock); while (irqptr) { if (irqptr->airq_temp_cpu != IRQ_UNINIT) - retval |= apic_rebind(irqptr, bind_cpu, 0, IMMEDIATE); + retval |= apic_rebind(irqptr, bind_cpu, NULL); irqptr = irqptr->airq_next; } - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); + return (retval); } @@ -4666,8 +4874,8 @@ int busiest_cpu, most_free_cpu; int cpu_free, cpu_busy, max_busy, min_busy; int min_free, diff; - int average_busy, cpus_online; - int i, busy; + int average_busy, cpus_online; + int i, busy, iflag; apic_cpus_info_t *cpu_infop; apic_irq_t *min_busy_irq = NULL; apic_irq_t *max_busy_irq = NULL; @@ -4792,10 +5000,18 @@ max_busy_irq->airq_vector, most_free_cpu); } #endif /* DEBUG */ - if (apic_rebind_all(max_busy_irq, most_free_cpu, 0) - == 0) - /* Make change permenant */ - max_busy_irq->airq_cpu = (uchar_t)most_free_cpu; + iflag = intr_clear(); + if (lock_try(&apic_ioapic_lock)) { + if (apic_rebind_all(max_busy_irq, + most_free_cpu) == 0) { + /* Make change permenant */ + max_busy_irq->airq_cpu = + (uchar_t)most_free_cpu; + } + lock_clear(&apic_ioapic_lock); + } + intr_restore(iflag); + } else if (min_busy_irq != NULL) { #ifdef DEBUG if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) { @@ -4804,10 +5020,18 @@ } #endif /* DEBUG */ - if (apic_rebind_all(min_busy_irq, most_free_cpu, 0) == - 0) - /* Make change permenant */ - min_busy_irq->airq_cpu = (uchar_t)most_free_cpu; + iflag = intr_clear(); + if (lock_try(&apic_ioapic_lock)) { + if 
(apic_rebind_all(min_busy_irq, + most_free_cpu) == 0) { + /* Make change permenant */ + min_busy_irq->airq_cpu = + (uchar_t)most_free_cpu; + } + lock_clear(&apic_ioapic_lock); + } + intr_restore(iflag); + } else { if (cpu_busy != (1 << busiest_cpu)) { apic_redist_cpu_skip |= 1 << busiest_cpu; @@ -5088,6 +5312,190 @@ } /* + * Adds an entry to the irq list passed in, and returns the new list. + * Entries are added in priority order (lower numerical priorities are + * placed closer to the head of the list) + */ +static prs_irq_list_t * +acpi_insert_prs_irq_ent(prs_irq_list_t *listp, int priority, int irq, + iflag_t *iflagp, acpi_prs_private_t *prsprvp) +{ + struct prs_irq_list_ent *newent, *prevp = NULL, *origlistp; + + newent = kmem_zalloc(sizeof (struct prs_irq_list_ent), KM_SLEEP); + + newent->list_prio = priority; + newent->irq = irq; + newent->intrflags = *iflagp; + newent->prsprv = *prsprvp; + /* ->next is NULL from kmem_zalloc */ + + /* + * New list -- return the new entry as the list. + */ + if (listp == NULL) + return (newent); + + /* + * Save original list pointer for return (since we're not modifying + * the head) + */ + origlistp = listp; + + /* + * Insertion sort, with entries with identical keys stored AFTER + * existing entries (the less-than-or-equal test of priority does + * this for us). + */ + while (listp != NULL && listp->list_prio <= priority) { + prevp = listp; + listp = listp->next; + } + + newent->next = listp; + + if (prevp == NULL) { /* Add at head of list (newent is the new head) */ + return (newent); + } else { + prevp->next = newent; + return (origlistp); + } +} + +/* + * Frees the list passed in, deallocating all memory and leaving *listpp + * set to NULL. 
+ */ +static void +acpi_destroy_prs_irq_list(prs_irq_list_t **listpp) +{ + struct prs_irq_list_ent *nextp; + + ASSERT(listpp != NULL); + + while (*listpp != NULL) { + nextp = (*listpp)->next; + kmem_free(*listpp, sizeof (struct prs_irq_list_ent)); + *listpp = nextp; + } +} + +/* + * apic_choose_irqs_from_prs returns a list of irqs selected from the list of + * irqs returned by the link device's _PRS method. The irqs are chosen + * to minimize contention in situations where the interrupt link device + * can be programmed to steer interrupts to different interrupt controller + * inputs (some of which may already be in use). The list is sorted in order + * of irqs to use, with the highest priority given to interrupt controller + * inputs that are not shared. When an interrupt controller input + * must be shared, apic_choose_irqs_from_prs adds the possible irqs to the + * returned list in the order that minimizes sharing (thereby ensuring lowest + * possible latency from interrupt trigger time to ISR execution time). + */ +static prs_irq_list_t * +apic_choose_irqs_from_prs(acpi_irqlist_t *irqlistent, dev_info_t *dip, + int crs_irq) +{ + int32_t irq; + int i; + prs_irq_list_t *prsirqlistp = NULL; + iflag_t iflags; + + while (irqlistent != NULL) { + irqlistent->intr_flags.bustype = BUS_PCI; + + for (i = 0; i < irqlistent->num_irqs; i++) { + + irq = irqlistent->irqs[i]; + + if (irq <= 0) { + /* invalid irq number */ + continue; + } + + if ((irq < 16) && (apic_reserved_irqlist[irq])) + continue; + + if ((apic_irq_table[irq] == NULL) || + (apic_irq_table[irq]->airq_dip == dip)) { + + prsirqlistp = acpi_insert_prs_irq_ent( + prsirqlistp, 0 /* Highest priority */, irq, + &irqlistent->intr_flags, + &irqlistent->acpi_prs_prv); + + /* + * If we do not prefer the current irq from _CRS + * or if we do and this irq is the same as the + * current irq from _CRS, this is the one + * to pick. 
+ */ + if (!(apic_prefer_crs) || (irq == crs_irq)) { + return (prsirqlistp); + } + continue; + } + + /* + * Edge-triggered interrupts cannot be shared + */ + if (irqlistent->intr_flags.intr_el == INTR_EL_EDGE) + continue; + + /* + * To work around BIOSes that contain incorrect + * interrupt polarity information in interrupt + * descriptors returned by _PRS, we assume that + * the polarity of the other device sharing this + * interrupt controller input is compatible. + * If it's not, the caller will catch it when + * the caller invokes the link device's _CRS method + * (after invoking its _SRS method). + */ + iflags = irqlistent->intr_flags; + iflags.intr_po = + apic_irq_table[irq]->airq_iflag.intr_po; + + if (!acpi_intr_compatible(iflags, + apic_irq_table[irq]->airq_iflag)) + continue; + + /* + * If we prefer the irq from _CRS, no need + * to search any further (and make sure + * to add this irq with the highest priority + * so it's tried first). + */ + if (crs_irq == irq && apic_prefer_crs) { + + return (acpi_insert_prs_irq_ent( + prsirqlistp, + 0 /* Highest priority */, + irq, &iflags, + &irqlistent->acpi_prs_prv)); + } + + /* + * Priority is equal to the share count (lower + * share count is higher priority). Note that + * the intr flags passed in here are the ones we + * changed above -- if incorrect, it will be + * caught by the caller's _CRS flags comparison. + */ + prsirqlistp = acpi_insert_prs_irq_ent( + prsirqlistp, + apic_irq_table[irq]->airq_share, irq, + &iflags, &irqlistent->acpi_prs_prv); + } + + /* Go to the next irqlist entry */ + irqlistent = irqlistent->next; + } + + return (prsirqlistp); +} + +/* * Configures the irq for the interrupt link device identified by * acpipsmlnkp. 
* @@ -5123,13 +5531,13 @@ int *pci_irqp, iflag_t *dipintr_flagp) { - int i, min_share, foundnow, done = 0; int32_t irq; - int32_t share_irq = -1; - int32_t chosen_irq = -1; int cur_irq = -1; acpi_irqlist_t *irqlistp; - acpi_irqlist_t *irqlistent; + prs_irq_list_t *prs_irq_listp, *prs_irq_entp; + boolean_t found_irq = B_FALSE; + + dipintr_flagp->bustype = BUS_PCI; if ((acpi_get_possible_irq_resources(acpipsmlnkp, &irqlistp)) == ACPI_PSM_FAILURE) { @@ -5153,9 +5561,9 @@ if (acpi_irqlist_find_irq(irqlistp, cur_irq, NULL) == ACPI_PSM_SUCCESS) { - acpi_free_irqlist(irqlistp); ASSERT(pci_irqp != NULL); *pci_irqp = cur_irq; + acpi_free_irqlist(irqlistp); return (ACPI_PSM_SUCCESS); } @@ -5166,128 +5574,91 @@ ddi_get_instance(dip))); } - irqlistent = irqlistp; - min_share = 255; - - while (irqlistent != NULL) { - irqlistent->intr_flags.bustype = BUS_PCI; - - for (foundnow = 0, i = 0; i < irqlistent->num_irqs; i++) { - - irq = irqlistent->irqs[i]; - - if ((irq < 16) && (apic_reserved_irqlist[irq])) - continue; - - if (irq == 0) { - /* invalid irq number */ - continue; - } - - if ((apic_irq_table[irq] == NULL) || - (apic_irq_table[irq]->airq_dip == dip)) { - chosen_irq = irq; - foundnow = 1; - /* - * If we do not prefer current irq from crs - * or if we do and this irq is the same as - * current irq from crs, this is the one - * to pick. - */ - if (!(apic_prefer_crs) || (irq == cur_irq)) { - done = 1; - break; - } - continue; - } - - if (irqlistent->intr_flags.intr_el == INTR_EL_EDGE) - continue; - - if (!acpi_intr_compatible(irqlistent->intr_flags, - apic_irq_table[irq]->airq_iflag)) - continue; - - if ((apic_irq_table[irq]->airq_share < min_share) || - ((apic_irq_table[irq]->airq_share == min_share) && - (cur_irq == irq) && (apic_prefer_crs))) { - min_share = apic_irq_table[irq]->airq_share; - share_irq = irq; - foundnow = 1; - } - } - - /* - * If we found an IRQ in the inner loop this time, save the - * details from the irqlist for later use. 
- */ - if (foundnow && ((chosen_irq != -1) || (share_irq != -1))) { - /* - * Copy the acpi_prs_private_t and flags from this - * irq list entry, since we found an irq from this - * entry. - */ - acpipsmlnkp->acpi_prs_prv = irqlistent->acpi_prs_prv; - *dipintr_flagp = irqlistent->intr_flags; - } - - if (done) - break; - - /* Go to the next irqlist entry */ - irqlistent = irqlistent->next; - } - - - acpi_free_irqlist(irqlistp); - if (chosen_irq != -1) - irq = chosen_irq; - else if (share_irq != -1) - irq = share_irq; - else { + if ((prs_irq_listp = apic_choose_irqs_from_prs(irqlistp, dip, + cur_irq)) == NULL) { + APIC_VERBOSE_IRQ((CE_WARN, "!pcplusmp: Could not find a " "suitable irq from the list of possible irqs for device " "%s, instance #%d in ACPI's list of possible irqs", ddi_get_name(dip), ddi_get_instance(dip))); + + acpi_free_irqlist(irqlistp); return (ACPI_PSM_FAILURE); } - APIC_VERBOSE_IRQ((CE_CONT, "!pcplusmp: Setting irq %d for device %s " - "instance #%d\n", irq, ddi_get_name(dip), ddi_get_instance(dip))); - - if ((acpi_set_irq_resource(acpipsmlnkp, irq)) == ACPI_PSM_SUCCESS) { - /* - * setting irq was successful, check to make sure CRS - * reflects that. If CRS does not agree with what we - * set, return the irq that was set. 
- */ - - if (acpi_get_current_irq_resource(acpipsmlnkp, &cur_irq, - dipintr_flagp) == ACPI_PSM_SUCCESS) { - - if (cur_irq != irq) - APIC_VERBOSE_IRQ((CE_WARN, "!pcplusmp: " - "IRQ resource set (irqno %d) for device %s " - "instance #%d, differs from current " - "setting irqno %d", - irq, ddi_get_name(dip), - ddi_get_instance(dip), cur_irq)); + acpi_free_irqlist(irqlistp); + + for (prs_irq_entp = prs_irq_listp; + prs_irq_entp != NULL && found_irq == B_FALSE; + prs_irq_entp = prs_irq_entp->next) { + + acpipsmlnkp->acpi_prs_prv = prs_irq_entp->prsprv; + irq = prs_irq_entp->irq; + + APIC_VERBOSE_IRQ((CE_CONT, "!pcplusmp: Setting irq %d for " + "device %s instance #%d\n", irq, ddi_get_name(dip), + ddi_get_instance(dip))); + + if ((acpi_set_irq_resource(acpipsmlnkp, irq)) + == ACPI_PSM_SUCCESS) { + /* + * setting irq was successful, check to make sure CRS + * reflects that. If CRS does not agree with what we + * set, return the irq that was set. + */ + + if (acpi_get_current_irq_resource(acpipsmlnkp, &cur_irq, + dipintr_flagp) == ACPI_PSM_SUCCESS) { + + if (cur_irq != irq) + APIC_VERBOSE_IRQ((CE_WARN, + "!pcplusmp: IRQ resource set " + "(irqno %d) for device %s " + "instance #%d, differs from " + "current setting irqno %d", + irq, ddi_get_name(dip), + ddi_get_instance(dip), cur_irq)); + } else { + /* + * On at least one system, there was a bug in + * a DSDT method called by _STA, causing _STA to + * indicate that the link device was disabled + * (when, in fact, it was enabled). Since _SRS + * succeeded, assume that _CRS is lying and use + * the iflags from this _PRS interrupt choice. + * If we're wrong about the flags, the polarity + * will be incorrect and we may get an interrupt + * storm, but there's not much else we can do + * at this point. 
+ */ + *dipintr_flagp = prs_irq_entp->intrflags; + } + + /* + * Return the irq that was set, and not what _CRS + * reports, since _CRS has been seen to return + * different IRQs than what was passed to _SRS on some + * systems (and just not return successfully on others). + */ + cur_irq = irq; + found_irq = B_TRUE; + } else { + APIC_VERBOSE_IRQ((CE_WARN, "!pcplusmp: set resource " + "irq %d failed for device %s instance #%d", + irq, ddi_get_name(dip), ddi_get_instance(dip))); + + if (cur_irq == -1) { + acpi_destroy_prs_irq_list(&prs_irq_listp); + return (ACPI_PSM_FAILURE); + } } - - /* - * return the irq that was set, and not what CRS reports, - * since CRS has been seen to be bogus on some systems - */ - cur_irq = irq; - } else { - APIC_VERBOSE_IRQ((CE_WARN, "!pcplusmp: set resource irq %d " - "failed for device %s instance #%d", - irq, ddi_get_name(dip), ddi_get_instance(dip))); - - if (cur_irq == -1) - return (ACPI_PSM_FAILURE); } + acpi_destroy_prs_irq_list(&prs_irq_listp); + + if (!found_irq) + return (ACPI_PSM_FAILURE); + ASSERT(pci_irqp != NULL); *pci_irqp = cur_irq; return (ACPI_PSM_SUCCESS);
--- a/usr/src/uts/i86pc/io/pcplusmp/apic.h Fri Nov 17 16:14:57 2006 -0800 +++ b/usr/src/uts/i86pc/io/pcplusmp/apic.h Fri Nov 17 19:17:15 2006 -0800 @@ -280,6 +280,7 @@ #define AV_LDEST 0x800 /* IO & Local APIC Bit Definitions */ +#define RDT_VECTOR(x) ((uchar_t)((x) & 0xFF)) #define AV_PENDING 0x1000 #define AV_ACTIVE_LOW 0x2000 /* only for integrated APIC */ #define AV_REMOTE_IRR 0x4000 /* IOAPIC RDT-specific */ @@ -566,6 +567,12 @@ #define APIC_NSECS_TO_TICKS(nsecs) (((int64_t)(nsecs) * \ apic_ticks_per_SFnsecs + (SF/2)) / SF) +extern uchar_t apic_bind_intr(dev_info_t *, int, uchar_t, uchar_t); +extern int apic_allocate_irq(int); +extern int apic_introp_xlate(dev_info_t *, struct intrspec *, int); +extern int apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu); +extern boolean_t apic_cpu_in_range(int cpu); + #ifdef __cplusplus } #endif
--- a/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c Fri Nov 17 16:14:57 2006 -0800 +++ b/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c Fri Nov 17 19:17:15 2006 -0800 @@ -61,11 +61,6 @@ extern int intr_clear(void); extern void intr_restore(uint_t); -extern uchar_t apic_bind_intr(dev_info_t *, int, uchar_t, uchar_t); -extern int apic_allocate_irq(int); -extern int apic_introp_xlate(dev_info_t *, struct intrspec *, int); -extern int apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu, int safe); -extern boolean_t apic_cpu_in_range(int cpu); /* * MSI support flag: @@ -823,6 +818,7 @@ int cpu; int old_priority; int new_priority; + int iflag; apic_irq_t *irqp; struct intrspec *ispec, intr_spec; @@ -946,17 +942,25 @@ return (PSM_FAILURE); } - mutex_enter(&airq_mutex); /* Convert the vector to the irq using vector_to_irq table. */ + mutex_enter(&airq_mutex); irqp = apic_irq_table[apic_vector_to_irq[hdlp->ih_vector]]; + mutex_exit(&airq_mutex); + if (irqp == NULL) { - mutex_exit(&airq_mutex); *result = ENXIO; return (PSM_FAILURE); } - ret = apic_rebind_all(irqp, cpu, 1); - mutex_exit(&airq_mutex); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + ret = apic_rebind_all(irqp, cpu); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + if (ret) { *result = EIO; return (PSM_FAILURE);