Mercurial > illumos > illumos-gate
changeset 12822:7a89035df76c
6910752 mpt_sas driver performance can be improved
6968206 mptsas_ioctl should idle component after busy_component
author | Brian Xu - Sun Microsystems - Beijing China <Brian.Xu@Sun.COM> |
---|---|
date | Tue, 13 Jul 2010 11:05:20 +0800 |
parents | e444775eb30d |
children | dec8caad2567 |
files | usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_impl.c usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h |
diffstat | 4 files changed, 1326 insertions(+), 559 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c Tue Jul 13 10:36:11 2010 +0800 +++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c Tue Jul 13 11:05:20 2010 +0800 @@ -66,6 +66,7 @@ #include <sys/scsi/scsi.h> #include <sys/pci.h> #include <sys/file.h> +#include <sys/cpuvar.h> #include <sys/policy.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> @@ -93,6 +94,7 @@ #include <sys/scsi/adapters/mpt_sas/mptsas_var.h> #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h> #include <sys/scsi/adapters/mpt_sas/mptsas_smhba.h> + #include <sys/raidioctl.h> #include <sys/fs/dv_node.h> /* devfs_clean */ @@ -106,6 +108,21 @@ #include <sys/fm/io/ddi.h> /* + * For anyone who would modify the code in mptsas_driver, be aware + * that from snv_145 where CR6910752(mpt_sas driver performance can be + * improved) is integrated, the per_instance mutex m_mutex is not held + * in the key IO code path, including mptsas_scsi_start(), mptsas_intr() + * and all of the recursive functions called in them, so don't + * take it for granted that all operations are sync/exclude correctly. Before + * doing any modification in key code path, and even other code path such as + * DR, watchsubr, ioctl, passthrough etc, make sure the elements modified have + * no relationship to elements shown in the fastpath + * (function mptsas_handle_io_fastpath()) in ISR and its recursive functions. + * Otherwise, you have to use the newly introduced mutex to protect them. + * As to how to do this correctly, refer to the comments in mptsas_intr(). + */ + +/* * autoconfiguration data and routines. 
*/ static int mptsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); @@ -198,8 +215,6 @@ static int mptsas_prepare_pkt(mptsas_cmd_t *cmd); static int mptsas_accept_pkt(mptsas_t *mpt, mptsas_cmd_t *sp); -static int mptsas_accept_txwq_and_pkt(mptsas_t *mpt, mptsas_cmd_t *sp); -static void mptsas_accept_tx_waitq(mptsas_t *mpt); static int mptsas_do_detach(dev_info_t *dev); static int mptsas_do_scsi_reset(mptsas_t *mpt, uint16_t devhdl); @@ -225,6 +240,7 @@ static uint_t mptsas_intr(caddr_t arg1, caddr_t arg2); static void mptsas_process_intr(mptsas_t *mpt, pMpi2ReplyDescriptorsUnion_t reply_desc_union); +static int mptsas_handle_io_fastpath(mptsas_t *mpt, uint16_t SMID); static void mptsas_handle_scsi_io_success(mptsas_t *mpt, pMpi2ReplyDescriptorsUnion_t reply_desc); static void mptsas_handle_address_reply(mptsas_t *mpt, @@ -291,12 +307,13 @@ static int mptsas_alloc_active_slots(mptsas_t *mpt, int flag); static void mptsas_free_active_slots(mptsas_t *mpt); static int mptsas_start_cmd(mptsas_t *mpt, mptsas_cmd_t *cmd); +static int mptsas_start_cmd0(mptsas_t *mpt, mptsas_cmd_t *cmd); static void mptsas_restart_hba(mptsas_t *mpt); -static void mptsas_restart_waitq(mptsas_t *mpt); static void mptsas_deliver_doneq_thread(mptsas_t *mpt); static void mptsas_doneq_add(mptsas_t *mpt, mptsas_cmd_t *cmd); +static inline void mptsas_doneq_add0(mptsas_t *mpt, mptsas_cmd_t *cmd); static void mptsas_doneq_mv(mptsas_t *mpt, uint64_t t); static mptsas_cmd_t *mptsas_doneq_thread_rm(mptsas_t *mpt, uint64_t t); @@ -305,15 +322,13 @@ static mptsas_cmd_t *mptsas_waitq_rm(mptsas_t *mpt); static void mptsas_waitq_delete(mptsas_t *mpt, mptsas_cmd_t *cmd); -static mptsas_cmd_t *mptsas_tx_waitq_rm(mptsas_t *mpt); -static void mptsas_tx_waitq_delete(mptsas_t *mpt, mptsas_cmd_t *cmd); - static void mptsas_start_watch_reset_delay(); static void mptsas_setup_bus_reset_delay(mptsas_t *mpt); static void mptsas_watch_reset_delay(void *arg); static int mptsas_watch_reset_delay_subr(mptsas_t *mpt); 
+static int mptsas_outstanding_cmds_n(mptsas_t *mpt); /* * helper functions */ @@ -345,6 +360,7 @@ static int mptsas_get_target_device_info(mptsas_t *mpt, uint32_t page_address, uint16_t *handle, mptsas_target_t **pptgt); static void mptsas_update_phymask(mptsas_t *mpt); +static inline void mptsas_remove_cmd0(mptsas_t *mpt, mptsas_cmd_t *cmd); static int mptsas_send_sep(mptsas_t *mpt, mptsas_target_t *ptgt, uint32_t *status, uint8_t cmd); @@ -414,7 +430,7 @@ static void * mptsas_hash_traverse(mptsas_hash_table_t *hashtab, int pos); mptsas_target_t *mptsas_tgt_alloc(mptsas_hash_table_t *, uint16_t, uint64_t, - uint32_t, mptsas_phymask_t, uint8_t); + uint32_t, mptsas_phymask_t, uint8_t, mptsas_t *); static mptsas_smp_t *mptsas_smp_alloc(mptsas_hash_table_t *hashtab, mptsas_smp_t *data); static void mptsas_smp_free(mptsas_hash_table_t *hashtab, uint64_t wwid, @@ -1179,7 +1195,7 @@ mutex_init(&mpt->m_mutex, NULL, MUTEX_DRIVER, DDI_INTR_PRI(mpt->m_intr_pri)); mutex_init(&mpt->m_passthru_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&mpt->m_tx_waitq_mutex, NULL, MUTEX_DRIVER, + mutex_init(&mpt->m_intr_mutex, NULL, MUTEX_DRIVER, DDI_INTR_PRI(mpt->m_intr_pri)); for (i = 0; i < MPTSAS_MAX_PHYS; i++) { mutex_init(&mpt->m_phy_info[i].smhba_info.phy_mutex, @@ -1282,8 +1298,6 @@ */ mpt->m_donetail = &mpt->m_doneq; mpt->m_waitqtail = &mpt->m_waitq; - mpt->m_tx_waitqtail = &mpt->m_tx_waitq; - mpt->m_tx_draining = 0; /* * ioc cmd queue initialize @@ -1443,7 +1457,7 @@ ddi_taskq_destroy(mpt->m_dr_taskq); } if (mutex_init_done) { - mutex_destroy(&mpt->m_tx_waitq_mutex); + mutex_destroy(&mpt->m_intr_mutex); mutex_destroy(&mpt->m_passthru_mutex); mutex_destroy(&mpt->m_mutex); for (i = 0; i < MPTSAS_MAX_PHYS; i++) { @@ -1861,7 +1875,7 @@ mpt->m_instance); } - mutex_destroy(&mpt->m_tx_waitq_mutex); + mutex_destroy(&mpt->m_intr_mutex); mutex_destroy(&mpt->m_passthru_mutex); mutex_destroy(&mpt->m_mutex); for (i = 0; i < MPTSAS_MAX_PHYS; i++) { @@ -2225,7 +2239,9 @@ return (DDI_FAILURE); 
} } + mutex_enter(&mpt->m_intr_mutex); mpt->m_power_level = PM_LEVEL_D0; + mutex_exit(&mpt->m_intr_mutex); break; case PM_LEVEL_D3: NDBG11(("mptsas%d: turning power OFF.", mpt->m_instance)); @@ -2981,147 +2997,29 @@ * and which scsi_pkt is the currently-running command so the * interrupt handler can refer to the pkt to set completion * status, call the target driver back through pkt_comp, etc. - * - * If the instance lock is held by other thread, don't spin to wait - * for it. Instead, queue the cmd and next time when the instance lock - * is not held, accept all the queued cmd. A extra tx_waitq is - * introduced to protect the queue. - * - * The polled cmd will not be queud and accepted as usual. - * - * Under the tx_waitq mutex, record whether a thread is draining - * the tx_waitq. An IO requesting thread that finds the instance - * mutex contended appends to the tx_waitq and while holding the - * tx_wait mutex, if the draining flag is not set, sets it and then - * proceeds to spin for the instance mutex. This scheme ensures that - * the last cmd in a burst be processed. - * - * we enable this feature only when the helper threads are enabled, - * at which we think the loads are heavy. - * - * per instance mutex m_tx_waitq_mutex is introduced to protect the - * m_tx_waitqtail, m_tx_waitq, m_tx_draining. - */ - - if (mpt->m_doneq_thread_n) { - if (mutex_tryenter(&mpt->m_mutex) != 0) { - rval = mptsas_accept_txwq_and_pkt(mpt, cmd); - mutex_exit(&mpt->m_mutex); - } else if (cmd->cmd_pkt_flags & FLAG_NOINTR) { - mutex_enter(&mpt->m_mutex); - rval = mptsas_accept_txwq_and_pkt(mpt, cmd); - mutex_exit(&mpt->m_mutex); - } else { - mutex_enter(&mpt->m_tx_waitq_mutex); - /* - * ptgt->m_dr_flag is protected by m_mutex or - * m_tx_waitq_mutex. In this case, m_tx_waitq_mutex - * is acquired. 
- */ - if (ptgt->m_dr_flag == MPTSAS_DR_INTRANSITION) { - if (cmd->cmd_pkt_flags & FLAG_NOQUEUE) { - /* - * The command should be allowed to - * retry by returning TRAN_BUSY to - * to stall the I/O's which come from - * scsi_vhci since the device/path is - * in unstable state now. - */ - mutex_exit(&mpt->m_tx_waitq_mutex); - return (TRAN_BUSY); - } else { - /* - * The device is offline, just fail the - * command by returning - * TRAN_FATAL_ERROR. - */ - mutex_exit(&mpt->m_tx_waitq_mutex); - return (TRAN_FATAL_ERROR); - } - } - if (mpt->m_tx_draining) { - cmd->cmd_flags |= CFLAG_TXQ; - *mpt->m_tx_waitqtail = cmd; - mpt->m_tx_waitqtail = &cmd->cmd_linkp; - mutex_exit(&mpt->m_tx_waitq_mutex); - } else { /* drain the queue */ - mpt->m_tx_draining = 1; - mutex_exit(&mpt->m_tx_waitq_mutex); - mutex_enter(&mpt->m_mutex); - rval = mptsas_accept_txwq_and_pkt(mpt, cmd); - mutex_exit(&mpt->m_mutex); - } - } - } else { - mutex_enter(&mpt->m_mutex); - /* - * ptgt->m_dr_flag is protected by m_mutex or m_tx_waitq_mutex - * in this case, m_mutex is acquired. - */ - if (ptgt->m_dr_flag == MPTSAS_DR_INTRANSITION) { - if (cmd->cmd_pkt_flags & FLAG_NOQUEUE) { - /* - * commands should be allowed to retry by - * returning TRAN_BUSY to stall the I/O's - * which come from scsi_vhci since the device/ - * path is in unstable state now. - */ - mutex_exit(&mpt->m_mutex); - return (TRAN_BUSY); - } else { - /* - * The device is offline, just fail the - * command by returning TRAN_FATAL_ERROR. - */ - mutex_exit(&mpt->m_mutex); - return (TRAN_FATAL_ERROR); - } - } - rval = mptsas_accept_pkt(mpt, cmd); - mutex_exit(&mpt->m_mutex); - } - - return (rval); -} - -/* - * Accept all the queued cmds(if any) before accept the current one. 
- */ -static int -mptsas_accept_txwq_and_pkt(mptsas_t *mpt, mptsas_cmd_t *cmd) -{ - int rval; - mptsas_target_t *ptgt = cmd->cmd_tgt_addr; - - ASSERT(mutex_owned(&mpt->m_mutex)); - /* - * The call to mptsas_accept_tx_waitq() must always be performed - * because that is where mpt->m_tx_draining is cleared. - */ - mutex_enter(&mpt->m_tx_waitq_mutex); - mptsas_accept_tx_waitq(mpt); - mutex_exit(&mpt->m_tx_waitq_mutex); - /* - * ptgt->m_dr_flag is protected by m_mutex or m_tx_waitq_mutex - * in this case, m_mutex is acquired. - */ + */ + + mutex_enter(&ptgt->m_tgt_intr_mutex); if (ptgt->m_dr_flag == MPTSAS_DR_INTRANSITION) { if (cmd->cmd_pkt_flags & FLAG_NOQUEUE) { /* - * The command should be allowed to retry by returning - * TRAN_BUSY to stall the I/O's which come from - * scsi_vhci since the device/path is in unstable state - * now. - */ + * commands should be allowed to retry by + * returning TRAN_BUSY to stall the I/O's + * which come from scsi_vhci since the device/ + * path is in unstable state now. + */ + mutex_exit(&ptgt->m_tgt_intr_mutex); return (TRAN_BUSY); } else { /* - * The device is offline, just fail the command by - * return TRAN_FATAL_ERROR. - */ + * The device is offline, just fail the + * command by returning TRAN_FATAL_ERROR. 
+ */ + mutex_exit(&ptgt->m_tgt_intr_mutex); return (TRAN_FATAL_ERROR); } } + mutex_exit(&ptgt->m_tgt_intr_mutex); rval = mptsas_accept_pkt(mpt, cmd); return (rval); @@ -3135,8 +3033,6 @@ NDBG1(("mptsas_accept_pkt: cmd=0x%p", (void *)cmd)); - ASSERT(mutex_owned(&mpt->m_mutex)); - if ((cmd->cmd_flags & CFLAG_PREPARED) == 0) { rval = mptsas_prepare_pkt(cmd); if (rval != TRAN_ACCEPT) { @@ -3148,6 +3044,7 @@ /* * reset the throttle if we were draining */ + mutex_enter(&ptgt->m_tgt_intr_mutex); if ((ptgt->m_t_ncmds == 0) && (ptgt->m_t_throttle == DRAIN_THROTTLE)) { NDBG23(("reset throttle")); @@ -3156,24 +3053,6 @@ } /* - * If HBA is being reset, the DevHandles are being re-initialized, - * which means that they could be invalid even if the target is still - * attached. Check if being reset and if DevHandle is being - * re-initialized. If this is the case, return BUSY so the I/O can be - * retried later. - */ - if ((ptgt->m_devhdl == MPTSAS_INVALID_DEVHDL) && mpt->m_in_reset) { - mptsas_set_pkt_reason(mpt, cmd, CMD_RESET, STAT_BUS_RESET); - if (cmd->cmd_flags & CFLAG_TXQ) { - mptsas_doneq_add(mpt, cmd); - mptsas_doneq_empty(mpt); - return (rval); - } else { - return (TRAN_BUSY); - } - } - - /* * If device handle has already been invalidated, just * fail the command. In theory, command from scsi_vhci * client is impossible send down command with invalid @@ -3183,36 +3062,66 @@ if (ptgt->m_devhdl == MPTSAS_INVALID_DEVHDL) { NDBG20(("rejecting command, it might because invalid devhdl " "request.")); + mutex_exit(&ptgt->m_tgt_intr_mutex); + mutex_enter(&mpt->m_mutex); + /* + * If HBA is being reset, the DevHandles are being + * re-initialized, which means that they could be invalid + * even if the target is still attached. Check if being reset + * and if DevHandle is being re-initialized. If this is the + * case, return BUSY so the I/O can be retried later. 
+ */ + if (mpt->m_in_reset) { + mptsas_set_pkt_reason(mpt, cmd, CMD_RESET, + STAT_BUS_RESET); + if (cmd->cmd_flags & CFLAG_TXQ) { + mptsas_doneq_add(mpt, cmd); + mptsas_doneq_empty(mpt); + mutex_exit(&mpt->m_mutex); + return (rval); + } else { + mutex_exit(&mpt->m_mutex); + return (TRAN_BUSY); + } + } mptsas_set_pkt_reason(mpt, cmd, CMD_DEV_GONE, STAT_TERMINATED); if (cmd->cmd_flags & CFLAG_TXQ) { mptsas_doneq_add(mpt, cmd); mptsas_doneq_empty(mpt); + mutex_exit(&mpt->m_mutex); return (rval); } else { + mutex_exit(&mpt->m_mutex); return (TRAN_FATAL_ERROR); } } + mutex_exit(&ptgt->m_tgt_intr_mutex); /* * The first case is the normal case. mpt gets a command from the * target driver and starts it. * Since SMID 0 is reserved and the TM slot is reserved, the actual max * commands is m_max_requests - 2. */ - if ((mpt->m_ncmds <= (mpt->m_max_requests - 2)) && - (ptgt->m_t_throttle > HOLD_THROTTLE) && + mutex_enter(&ptgt->m_tgt_intr_mutex); + if ((ptgt->m_t_throttle > HOLD_THROTTLE) && (ptgt->m_t_ncmds < ptgt->m_t_throttle) && (ptgt->m_reset_delay == 0) && (ptgt->m_t_nwait == 0) && ((cmd->cmd_pkt_flags & FLAG_NOINTR) == 0)) { + mutex_exit(&ptgt->m_tgt_intr_mutex); if (mptsas_save_cmd(mpt, cmd) == TRUE) { - (void) mptsas_start_cmd(mpt, cmd); + (void) mptsas_start_cmd0(mpt, cmd); } else { + mutex_enter(&mpt->m_mutex); mptsas_waitq_add(mpt, cmd); + mutex_exit(&mpt->m_mutex); } } else { /* * Add this pkt to the work queue */ + mutex_exit(&ptgt->m_tgt_intr_mutex); + mutex_enter(&mpt->m_mutex); mptsas_waitq_add(mpt, cmd); if (cmd->cmd_pkt_flags & FLAG_NOINTR) { @@ -3227,6 +3136,7 @@ mptsas_doneq_empty(mpt); } } + mutex_exit(&mpt->m_mutex); } return (rval); } @@ -3237,8 +3147,9 @@ mptsas_slots_t *slots; int slot; mptsas_target_t *ptgt = cmd->cmd_tgt_addr; - - ASSERT(mutex_owned(&mpt->m_mutex)); + mptsas_slot_free_e_t *pe; + int qn, qn_first; + slots = mpt->m_active; /* @@ -3246,67 +3157,100 @@ */ ASSERT(slots->m_n_slots == (mpt->m_max_requests - 2)); - /* - * m_tags is 
equivalent to the SMID when sending requests. Since the - * SMID cannot be 0, start out at one if rolling over past the size - * of the request queue depth. Also, don't use the last SMID, which is - * reserved for TM requests. - */ - slot = (slots->m_tags)++; - if (slots->m_tags > slots->m_n_slots) { - slots->m_tags = 1; - } - -alloc_tag: - /* Validate tag, should never fail. */ - if (slots->m_slot[slot] == NULL) { - /* - * Make sure SMID is not using reserved value of 0 - * and the TM request slot. - */ - ASSERT((slot > 0) && (slot <= slots->m_n_slots)); - cmd->cmd_slot = slot; - slots->m_slot[slot] = cmd; - mpt->m_ncmds++; - - /* - * only increment per target ncmds if this is not a - * command that has no target associated with it (i.e. a - * event acknoledgment) - */ - if ((cmd->cmd_flags & CFLAG_CMDIOC) == 0) { - ptgt->m_t_ncmds++; - } - cmd->cmd_active_timeout = cmd->cmd_pkt->pkt_time; - - /* - * If initial timout is less than or equal to one tick, bump - * the timeout by a tick so that command doesn't timeout before - * its allotted time. - */ - if (cmd->cmd_active_timeout <= mptsas_scsi_watchdog_tick) { - cmd->cmd_active_timeout += mptsas_scsi_watchdog_tick; - } - return (TRUE); - } else { - int i; - - /* - * If slot in use, scan until a free one is found. Don't use 0 - * or final slot, which is reserved for TM requests. - */ - for (i = 0; i < slots->m_n_slots; i++) { - slot = slots->m_tags; - if (++(slots->m_tags) > slots->m_n_slots) { - slots->m_tags = 1; - } - if (slots->m_slot[slot] == NULL) { - NDBG22(("found free slot %d", slot)); - goto alloc_tag; - } - } - } - return (FALSE); + qn = qn_first = CPU->cpu_seqid & (mpt->m_slot_freeq_pair_n - 1); + +qpair_retry: + ASSERT(qn < mpt->m_slot_freeq_pair_n); + mutex_enter(&mpt->m_slot_freeq_pairp[qn].m_slot_allocq.s.m_fq_mutex); + pe = list_head(&mpt->m_slot_freeq_pairp[qn].m_slot_allocq. + s.m_fq_list); + if (!pe) { /* switch the allocq and releq */ + mutex_enter(&mpt->m_slot_freeq_pairp[qn].m_slot_releq. 
+ s.m_fq_mutex); + if (mpt->m_slot_freeq_pairp[qn].m_slot_releq.s.m_fq_n) { + mpt->m_slot_freeq_pairp[qn]. + m_slot_allocq.s.m_fq_n = + mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_n; + mpt->m_slot_freeq_pairp[qn]. + m_slot_allocq.s.m_fq_list.list_head.list_next = + mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_list.list_head.list_next; + mpt->m_slot_freeq_pairp[qn]. + m_slot_allocq.s.m_fq_list.list_head.list_prev = + mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_list.list_head.list_prev; + mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_list.list_head.list_prev-> + list_next = + &mpt->m_slot_freeq_pairp[qn]. + m_slot_allocq.s.m_fq_list.list_head; + mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_list.list_head.list_next-> + list_prev = + &mpt->m_slot_freeq_pairp[qn]. + m_slot_allocq.s.m_fq_list.list_head; + + mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_list.list_head.list_next = + mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_list.list_head.list_prev = + &mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_list.list_head; + mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_n = 0; + } else { + mutex_exit(&mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_mutex); + mutex_exit(&mpt->m_slot_freeq_pairp[qn]. + m_slot_allocq.s.m_fq_mutex); + qn = (qn + 1) & (mpt->m_slot_freeq_pair_n - 1); + if (qn == qn_first) + return (FALSE); + else + goto qpair_retry; + } + mutex_exit(&mpt->m_slot_freeq_pairp[qn]. + m_slot_releq.s.m_fq_mutex); + pe = list_head(&mpt->m_slot_freeq_pairp[qn]. + m_slot_allocq.s.m_fq_list); + ASSERT(pe); + } + list_remove(&mpt->m_slot_freeq_pairp[qn]. + m_slot_allocq.s.m_fq_list, pe); + slot = pe->slot; + /* + * Make sure SMID is not using reserved value of 0 + * and the TM request slot. 
+ */ + ASSERT((slot > 0) && (slot <= slots->m_n_slots) && + mpt->m_slot_freeq_pairp[qn].m_slot_allocq.s.m_fq_n > 0); + cmd->cmd_slot = slot; + mpt->m_slot_freeq_pairp[qn].m_slot_allocq.s.m_fq_n--; + ASSERT(mpt->m_slot_freeq_pairp[qn].m_slot_allocq.s.m_fq_n >= 0); + + mutex_exit(&mpt->m_slot_freeq_pairp[qn].m_slot_allocq.s.m_fq_mutex); + /* + * only increment per target ncmds if this is not a + * command that has no target associated with it (i.e. a + * event acknoledgment) + */ + if ((cmd->cmd_flags & CFLAG_CMDIOC) == 0) { + mutex_enter(&ptgt->m_tgt_intr_mutex); + ptgt->m_t_ncmds++; + mutex_exit(&ptgt->m_tgt_intr_mutex); + } + cmd->cmd_active_timeout = cmd->cmd_pkt->pkt_time; + + /* + * If initial timout is less than or equal to one tick, bump + * the timeout by a tick so that command doesn't timeout before + * its allotted time. + */ + if (cmd->cmd_active_timeout <= mptsas_scsi_watchdog_tick) { + cmd->cmd_active_timeout += mptsas_scsi_watchdog_tick; + } + return (TRUE); } /* @@ -3374,7 +3318,9 @@ mptsas_cmd_t *cmd, *new_cmd; mptsas_t *mpt = ADDR2MPT(ap); int failure = 1; +#ifndef __sparc uint_t oldcookiec; +#endif /* __sparc */ mptsas_target_t *ptgt = NULL; int rval; mptsas_tgt_private_t *tgt_private; @@ -3412,6 +3358,9 @@ ddi_dma_handle_t save_arq_dma_handle; struct buf *save_arq_bp; ddi_dma_cookie_t save_arqcookie; +#ifdef __sparc + mptti_t *save_sg; +#endif /* __sparc */ cmd = kmem_cache_alloc(mpt->m_kmem_cache, kf); @@ -3420,12 +3369,17 @@ save_arq_dma_handle = cmd->cmd_arqhandle; save_arq_bp = cmd->cmd_arq_buf; save_arqcookie = cmd->cmd_arqcookie; +#ifdef __sparc + save_sg = cmd->cmd_sg; +#endif /* __sparc */ bzero(cmd, sizeof (*cmd) + scsi_pkt_size()); cmd->cmd_dmahandle = save_dma_handle; cmd->cmd_arqhandle = save_arq_dma_handle; cmd->cmd_arq_buf = save_arq_bp; cmd->cmd_arqcookie = save_arqcookie; - +#ifdef __sparc + cmd->cmd_sg = save_sg; +#endif /* __sparc */ pkt = (void *)((uchar_t *)cmd + sizeof (struct mptsas_cmd)); pkt->pkt_ha_private = (opaque_t)cmd; 
@@ -3468,9 +3422,11 @@ } +#ifndef __sparc /* grab cmd->cmd_cookiec here as oldcookiec */ oldcookiec = cmd->cmd_cookiec; +#endif /* __sparc */ /* * If the dma was broken up into PARTIAL transfers cmd_nwin will be @@ -3620,7 +3576,7 @@ * We check cmd->cmd_cookiec against oldcookiec so * the scatter-gather list is correctly allocated */ - +#ifndef __sparc if (oldcookiec != cmd->cmd_cookiec) { if (cmd->cmd_sg != (mptti_t *)NULL) { kmem_free(cmd->cmd_sg, sizeof (mptti_t) * @@ -3649,7 +3605,7 @@ return ((struct scsi_pkt *)NULL); } } - +#endif /* __sparc */ dmap = cmd->cmd_sg; ASSERT(cmd->cmd_cookie.dmac_size != 0); @@ -3729,12 +3685,12 @@ (void) ddi_dma_unbind_handle(cmd->cmd_dmahandle); cmd->cmd_flags &= ~CFLAG_DMAVALID; } - +#ifndef __sparc if (cmd->cmd_sg) { kmem_free(cmd->cmd_sg, sizeof (mptti_t) * cmd->cmd_cookiec); cmd->cmd_sg = NULL; } - +#endif /* __sparc */ mptsas_free_extra_sgl_frame(mpt, cmd); if ((cmd->cmd_flags & @@ -3812,6 +3768,16 @@ cmd->cmd_arq_buf = NULL; return (-1); } + /* + * In sparc, the sgl length in most of the cases would be 1, so we + * pre-allocate it in cache. On x86, the max number would be 256, + * pre-allocate a maximum would waste a lot of memory especially + * when many cmds are put onto waitq. + */ +#ifdef __sparc + cmd->cmd_sg = kmem_alloc((size_t)(sizeof (mptti_t)* + MPTSAS_MAX_CMD_SEGS), KM_SLEEP); +#endif /* __sparc */ return (0); } @@ -3839,6 +3805,12 @@ ddi_dma_free_handle(&cmd->cmd_dmahandle); cmd->cmd_dmahandle = NULL; } +#ifdef __sparc + if (cmd->cmd_sg) { + kmem_free(cmd->cmd_sg, sizeof (mptti_t)* MPTSAS_MAX_CMD_SEGS); + cmd->cmd_sg = NULL; + } +#endif /* __sparc */ } static int @@ -4500,6 +4472,24 @@ NDBG5(("mptsas_poll: cmd=0x%p", (void *)poll_cmd)); + /* + * In order to avoid using m_mutex in ISR(a new separate mutex + * m_intr_mutex is introduced) and keep the same lock logic, + * the m_intr_mutex should be used to protect the getting and + * setting of the ReplyDescriptorIndex. 
+ * + * Since the m_intr_mutex would be released during processing the poll + * cmd, so we should set the poll flag earlier here to make sure the + * polled cmd be handled in this thread/context. A side effect is other + * cmds during the period between the flag set and reset are also + * handled in this thread and not the ISR. Since the poll cmd is not + * so common, so the performance degradation in this case is not a big + * issue. + */ + mutex_enter(&mpt->m_intr_mutex); + mpt->m_polled_intr = 1; + mutex_exit(&mpt->m_intr_mutex); + if ((poll_cmd->cmd_flags & CFLAG_TM_CMD) == 0) { mptsas_restart_hba(mpt); } @@ -4526,6 +4516,10 @@ } } + mutex_enter(&mpt->m_intr_mutex); + mpt->m_polled_intr = 0; + mutex_exit(&mpt->m_intr_mutex); + if (rval == FALSE) { /* @@ -4562,11 +4556,12 @@ { int cnt; pMpi2ReplyDescriptorsUnion_t reply_desc_union; + Mpi2ReplyDescriptorsUnion_t reply_desc_union_v; uint32_t int_mask; + uint8_t reply_type; NDBG5(("mptsas_wait_intr")); - mpt->m_polled_intr = 1; /* * Get the current interrupt mask and disable interrupts. When @@ -4579,6 +4574,7 @@ * Keep polling for at least (polltime * 1000) seconds */ for (cnt = 0; cnt < polltime; cnt++) { + mutex_enter(&mpt->m_intr_mutex); (void) ddi_dma_sync(mpt->m_dma_post_queue_hdl, 0, 0, DDI_DMA_SYNC_FORCPU); @@ -4589,15 +4585,37 @@ &reply_desc_union->Words.Low) == 0xFFFFFFFF || ddi_get32(mpt->m_acc_post_queue_hdl, &reply_desc_union->Words.High) == 0xFFFFFFFF) { + mutex_exit(&mpt->m_intr_mutex); drv_usecwait(1000); continue; } - /* - * The reply is valid, process it according to its - * type. 
- */ - mptsas_process_intr(mpt, reply_desc_union); + reply_type = ddi_get8(mpt->m_acc_post_queue_hdl, + &reply_desc_union->Default.ReplyFlags); + reply_type &= MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK; + reply_desc_union_v.Default.ReplyFlags = reply_type; + if (reply_type == MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS) { + reply_desc_union_v.SCSIIOSuccess.SMID = + ddi_get16(mpt->m_acc_post_queue_hdl, + &reply_desc_union->SCSIIOSuccess.SMID); + } else if (reply_type == + MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY) { + reply_desc_union_v.AddressReply.ReplyFrameAddress = + ddi_get32(mpt->m_acc_post_queue_hdl, + &reply_desc_union->AddressReply.ReplyFrameAddress); + reply_desc_union_v.AddressReply.SMID = + ddi_get16(mpt->m_acc_post_queue_hdl, + &reply_desc_union->AddressReply.SMID); + } + /* + * Clear the reply descriptor for re-use and increment + * index. + */ + ddi_put64(mpt->m_acc_post_queue_hdl, + &((uint64_t *)(void *)mpt->m_post_queue)[mpt->m_post_index], + 0xFFFFFFFFFFFFFFFF); + (void) ddi_dma_sync(mpt->m_dma_post_queue_hdl, 0, 0, + DDI_DMA_SYNC_FORDEV); if (++mpt->m_post_index == mpt->m_post_queue_depth) { mpt->m_post_index = 0; @@ -4608,7 +4626,14 @@ */ ddi_put32(mpt->m_datap, &mpt->m_reg->ReplyPostHostIndex, mpt->m_post_index); - mpt->m_polled_intr = 0; + mutex_exit(&mpt->m_intr_mutex); + + /* + * The reply is valid, process it according to its + * type. + */ + mptsas_process_intr(mpt, &reply_desc_union_v); + /* * Re-enable interrupts and quit. @@ -4622,11 +4647,121 @@ /* * Clear polling flag, re-enable interrupts and quit. */ - mpt->m_polled_intr = 0; ddi_put32(mpt->m_datap, &mpt->m_reg->HostInterruptMask, int_mask); return (FALSE); } +/* + * For fastpath, the m_intr_mutex should be held from the begining to the end, + * so we only treat those cmds that need not release m_intr_mutex(even just for + * a moment) as candidate for fast processing. 
otherwise, we don't handle them + * and just return, then in ISR, those cmds would be handled later with m_mutex + * held and m_intr_mutex not held. + */ +static int +mptsas_handle_io_fastpath(mptsas_t *mpt, + uint16_t SMID) +{ + mptsas_slots_t *slots = mpt->m_active; + mptsas_cmd_t *cmd = NULL; + struct scsi_pkt *pkt; + + /* + * This is a success reply so just complete the IO. First, do a sanity + * check on the SMID. The final slot is used for TM requests, which + * would not come into this reply handler. + */ + if ((SMID == 0) || (SMID > slots->m_n_slots)) { + mptsas_log(mpt, CE_WARN, "?Received invalid SMID of %d\n", + SMID); + ddi_fm_service_impact(mpt->m_dip, DDI_SERVICE_UNAFFECTED); + return (TRUE); + } + + cmd = slots->m_slot[SMID]; + + /* + * print warning and return if the slot is empty + */ + if (cmd == NULL) { + mptsas_log(mpt, CE_WARN, "?NULL command for successful SCSI IO " + "in slot %d", SMID); + return (TRUE); + } + + pkt = CMD2PKT(cmd); + pkt->pkt_state |= (STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD | + STATE_GOT_STATUS); + if (cmd->cmd_flags & CFLAG_DMAVALID) { + pkt->pkt_state |= STATE_XFERRED_DATA; + } + pkt->pkt_resid = 0; + + /* + * If the cmd is a IOC, or a passthrough, then we don't process it in + * fastpath, and later it would be handled by mptsas_process_intr() + * with m_mutex protected. + */ + if (cmd->cmd_flags & (CFLAG_PASSTHRU | CFLAG_CMDIOC)) { + return (FALSE); + } else { + mptsas_remove_cmd0(mpt, cmd); + } + + if (cmd->cmd_flags & CFLAG_RETRY) { + /* + * The target returned QFULL or busy, do not add this + * pkt to the doneq since the hba will retry + * this cmd. + * + * The pkt has already been resubmitted in + * mptsas_handle_qfull() or in mptsas_check_scsi_io_error(). + * Remove this cmd_flag here. 
+ */ + cmd->cmd_flags &= ~CFLAG_RETRY; + } else { + mptsas_doneq_add0(mpt, cmd); + } + + /* + * In fastpath, the cmd should only be a context reply, so just check + * the post queue of the reply descriptor and the dmahandle of the cmd + * is enough. No sense data in this case and no need to check the dma + * handle where sense data dma info is saved, the dma handle of the + * reply frame, and the dma handle of the reply free queue. + * For the dma handle of the request queue. Check fma here since we + * are sure the request must have already been sent/DMAed correctly. + * otherwise checking in mptsas_scsi_start() is not correct since + * at that time the dma may not start. + */ + if ((mptsas_check_dma_handle(mpt->m_dma_req_frame_hdl) != + DDI_SUCCESS) || + (mptsas_check_dma_handle(mpt->m_dma_post_queue_hdl) != + DDI_SUCCESS)) { + ddi_fm_service_impact(mpt->m_dip, + DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + if (cmd->cmd_dmahandle && + (mptsas_check_dma_handle(cmd->cmd_dmahandle) != DDI_SUCCESS)) { + ddi_fm_service_impact(mpt->m_dip, DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + if ((cmd->cmd_extra_frames && + ((mptsas_check_dma_handle(cmd->cmd_extra_frames->m_dma_hdl) != + DDI_SUCCESS) || + (mptsas_check_acc_handle(cmd->cmd_extra_frames->m_acc_hdl) != + DDI_SUCCESS)))) { + ddi_fm_service_impact(mpt->m_dip, DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + + return (TRUE); +} + static void mptsas_handle_scsi_io_success(mptsas_t *mpt, pMpi2ReplyDescriptorsUnion_t reply_desc) @@ -4637,10 +4772,8 @@ mptsas_cmd_t *cmd = NULL; struct scsi_pkt *pkt; - ASSERT(mutex_owned(&mpt->m_mutex)); - scsi_io_success = (pMpi2SCSIIOSuccessReplyDescriptor_t)reply_desc; - SMID = ddi_get16(mpt->m_acc_post_queue_hdl, &scsi_io_success->SMID); + SMID = scsi_io_success->SMID; /* * This is a success reply so just complete the IO. 
First, do a sanity @@ -4715,10 +4848,9 @@ ASSERT(mutex_owned(&mpt->m_mutex)); address_reply = (pMpi2AddressReplyDescriptor_t)reply_desc; - reply_addr = ddi_get32(mpt->m_acc_post_queue_hdl, - &address_reply->ReplyFrameAddress); - SMID = ddi_get16(mpt->m_acc_post_queue_hdl, &address_reply->SMID); - + + reply_addr = address_reply->ReplyFrameAddress; + SMID = address_reply->SMID; /* * If reply frame is not in the proper range we should ignore this * message and exit the interrupt handler. @@ -4978,10 +5110,12 @@ MPI2_IOCSTATUS_SCSI_DEVICE_NOT_THERE)) { pkt->pkt_reason = CMD_INCOMPLETE; pkt->pkt_state |= STATE_GOT_BUS; + mutex_enter(&ptgt->m_tgt_intr_mutex); if (ptgt->m_reset_delay == 0) { mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE); } + mutex_exit(&ptgt->m_tgt_intr_mutex); return; } @@ -5081,9 +5215,11 @@ case MPI2_IOCSTATUS_SCSI_DEVICE_NOT_THERE: pkt->pkt_reason = CMD_DEV_GONE; pkt->pkt_state |= STATE_GOT_BUS; + mutex_enter(&ptgt->m_tgt_intr_mutex); if (ptgt->m_reset_delay == 0) { mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE); } + mutex_exit(&ptgt->m_tgt_intr_mutex); NDBG31(("lost disk for target%d, command:%x", Tgt(cmd), pkt->pkt_cdbp[0])); break; @@ -5130,7 +5266,9 @@ ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_FIRST); while (ptgt != NULL) { + mutex_enter(&ptgt->m_tgt_intr_mutex); mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE); + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); @@ -5142,7 +5280,9 @@ cmd->cmd_flags |= CFLAG_RETRY; cmd->cmd_pkt_flags |= FLAG_HEAD; + mutex_exit(&mpt->m_mutex); (void) mptsas_accept_pkt(mpt, cmd); + mutex_enter(&mpt->m_mutex); break; default: mptsas_log(mpt, CE_WARN, @@ -5257,7 +5397,6 @@ mutex_exit(&mpt->m_doneq_mutex); } - /* * mpt interrupt handler. 
*/ @@ -5267,10 +5406,189 @@ mptsas_t *mpt = (void *)arg1; pMpi2ReplyDescriptorsUnion_t reply_desc_union; uchar_t did_reply = FALSE; + int i = 0, j; + uint8_t reply_type; + uint16_t SMID; NDBG1(("mptsas_intr: arg1 0x%p arg2 0x%p", (void *)arg1, (void *)arg2)); - mutex_enter(&mpt->m_mutex); + /* + * 1. + * To avoid using m_mutex in the ISR(ISR refers not only to mptsas_intr, + * but all of the recursively called functions in it. the same below), + * separate mutexes are introduced to protect the elements shown in ISR. + * 3 types of mutex are involved here: + * a)per instance mutex m_intr_mutex. + * b)per target mutex m_tgt_intr_mutex. + * c)mutex that protect the free slot. + * + * a)per instance mutex m_intr_mutex: + * used to protect m_options, m_power, m_waitq, etc that would be + * checked/modified in ISR; protect the getting and setting the reply + * descriptor index; protect the m_slots[]; + * + * b)per target mutex m_tgt_intr_mutex: + * used to protect per target element which has relationship to ISR. + * contention for the new per target mutex is just as high as it is in + * sd(7d) driver. + * + * c)mutexes that protect the free slots: + * those mutexes are introduced to minimize the mutex contentions + * between the IO request threads where free slots are allocated + * for sending cmds and ISR where slots holding outstanding cmds + * are returned to the free pool. + * the idea is like this: + * 1) Partition all of the free slot into NCPU groups. For example, + * In system where we have 15 slots, and 4 CPU, then slot s1,s5,s9,s13 + * are marked belonging to CPU1, s2,s6,s10,s14 to CPU2, s3,s7,s11,s15 + * to CPU3, and s4,s8,s12 to CPU4. + * 2) In each of the group, an alloc/release queue pair is created, + * and both the allocq and the releaseq have a dedicated mutex. + * 3) When init, all of the slots in a CPU group are inserted into the + * allocq of its CPU's pair. 
+ * 4) When doing IO, + * mptsas_scsi_start() + * { + * cpuid = the cpu NO of the cpu where this thread is running on + * retry: + * mutex_enter(&allocq[cpuid]); + * if (get free slot = success) { + * remove the slot from the allocq + * mutex_exit(&allocq[cpuid]); + * return(success); + * } else { // exchange allocq and releaseq and try again + * mutex_enter(&releq[cpuid]); + * exchange the allocq and releaseq of this pair; + * mutex_exit(&releq[cpuid]); + * if (try to get free slot again = success) { + * remove the slot from the allocq + * mutex_exit(&allocq[cpuid]); + * return(success); + * } else { + * MOD(cpuid)++; + * goto retry; + * if (all CPU groups tried) + * mutex_exit(&allocq[cpuid]); + * return(failure); + * } + * } + * } + * ISR() + * { + * cpuid = the CPU group id where the slot sending the + * cmd belongs; + * mutex_enter(&releq[cpuid]); + * remove the slot from the releaseq + * mutex_exit(&releq[cpuid]); + * } + * This way, only when the queue pair doing exchange have mutex + * contentions. + * + * For mutex m_intr_mutex and m_tgt_intr_mutex, there are 2 scenarios: + * + * a)If the elements are only checked but not modified in the ISR, then + * only the places where those elements are modifed(outside of ISR) + * need to be protected by the new introduced mutex. + * For example, data A is only read/checked in ISR, then we need do + * like this: + * In ISR: + * { + * mutex_enter(&new_mutex); + * read(A); + * mutex_exit(&new_mutex); + * //the new_mutex here is either the m_tgt_intr_mutex or + * //the m_intr_mutex. 
+ * } + * In non-ISR + * { + * mutex_enter(&m_mutex); //the stock driver already did this + * mutex_enter(&new_mutex); + * write(A); + * mutex_exit(&new_mutex); + * mutex_exit(&m_mutex); //the stock driver already did this + * + * read(A); + * // read(A) in non-ISR is not required to be protected by new + * // mutex since 'A' has already been protected by m_mutex + * // outside of the ISR + * } + * + * Those fields in mptsas_target_t/ptgt which are only read in ISR + * fall into this category. So they, together with the fields which + * are never read in ISR, do not need to be protected by + * m_tgt_intr_mutex. + * Checking of m_waitq also falls into this category, so in all of the + * places outside of ISR where the m_waitq is modified, such as in + * mptsas_waitq_add(), mptsas_waitq_delete(), mptsas_waitq_rm(), + * m_intr_mutex should be used. + * + * b)If the elements are modified in the ISR, then each place where + * those elements are referenced (outside of ISR) needs to be protected + * by the newly introduced mutex. Of course, if those elements only + * appear in the non-key code path, that is, they don't affect + * performance, then the m_mutex can still be used as before. + * For example, data B is modified in key code path in ISR, and data C + * is modified in non-key code path in ISR, then we can do like this: + * In ISR: + * { + * mutex_enter(&new_mutex); + * write(B); + * mutex_exit(&new_mutex); + * if (seldom happen) { + * mutex_enter(&m_mutex); + * write(C); + * mutex_exit(&m_mutex); + * } + * //the new_mutex here is either the m_tgt_intr_mutex or + * //the m_intr_mutex. 
+ * } + * In non-ISR + * { + * mutex_enter(&new_mutex); + * write(B); + * mutex_exit(&new_mutex); + * + * mutex_enter(&new_mutex); + * read(B); + * mutex_exit(&new_mutex); + * // both write(B) and read(B) in non-ISR are required to be + * // protected by new mutex outside of the ISR + * + * mutex_enter(&m_mutex); //the stock driver already did this + * read(C); + * write(C); + * mutex_exit(&m_mutex); //the stock driver already did this + * // both write(C) and read(C) in non-ISR have already + * // been protected by m_mutex outside of the ISR + * } + * + * For example, ptgt->m_t_ncmds falls into 'B' of this category, and + * elements shown in address reply, restart_hba, passthrough, IOC + * fall into 'C' of this category. + * + * In any case where mutexes are nested, make sure they are acquired in + * the following order: + * m_mutex -> m_intr_mutex -> m_tgt_intr_mutex + * m_intr_mutex -> m_tgt_intr_mutex + * m_mutex -> m_intr_mutex + * m_mutex -> m_tgt_intr_mutex + * + * 2. + * Make sure at any time, getting the ReplyDescriptor by m_post_index + * and setting m_post_index to the ReplyDescriptorIndex register are + * atomic. Since m_mutex is not used for this purpose in ISR, the new + * mutex m_intr_mutex must play this role. So mptsas_poll(), where this + * kind of getting/setting is also performed, must use m_intr_mutex. + * Note, since context reply in ISR/process_intr is the only code path + * which affects performance, a fast path is introduced to only handle + * the read/write IO having context reply. For other IOs such as + * passthrough and IOC with context reply and all address reply, we + * use the as-is process_intr() to handle them. In order to keep the + * same semantics in process_intr(), make sure any new mutex is not held + * before entering it. 
+ */ + + mutex_enter(&mpt->m_intr_mutex); /* * If interrupts are shared by two channels then check whether this @@ -5279,7 +5597,7 @@ */ if ((mpt->m_options & MPTSAS_OPT_PM) && (mpt->m_power_level != PM_LEVEL_D0)) { - mutex_exit(&mpt->m_mutex); + mutex_exit(&mpt->m_intr_mutex); return (DDI_INTR_UNCLAIMED); } @@ -5290,7 +5608,7 @@ * return with interrupt unclaimed. */ if (mpt->m_polled_intr) { - mutex_exit(&mpt->m_mutex); + mutex_exit(&mpt->m_intr_mutex); mptsas_log(mpt, CE_WARN, "mpt_sas: Unclaimed interrupt"); return (DDI_INTR_UNCLAIMED); } @@ -5325,7 +5643,40 @@ */ did_reply = TRUE; - mptsas_process_intr(mpt, reply_desc_union); + reply_type = ddi_get8(mpt->m_acc_post_queue_hdl, + &reply_desc_union->Default.ReplyFlags); + reply_type &= MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK; + mpt->m_reply[i].Default.ReplyFlags = reply_type; + if (reply_type == + MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS) { + SMID = ddi_get16(mpt->m_acc_post_queue_hdl, + &reply_desc_union->SCSIIOSuccess.SMID); + if (mptsas_handle_io_fastpath(mpt, SMID) != + TRUE) { + mpt->m_reply[i].SCSIIOSuccess.SMID = + SMID; + i++; + } + } else if (reply_type == + MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY) { + mpt->m_reply[i].AddressReply.ReplyFrameAddress = + ddi_get32(mpt->m_acc_post_queue_hdl, + &reply_desc_union->AddressReply. + ReplyFrameAddress); + mpt->m_reply[i].AddressReply.SMID = + ddi_get16(mpt->m_acc_post_queue_hdl, + &reply_desc_union->AddressReply.SMID); + i++; + } + /* + * Clear the reply descriptor for re-use and increment + * index. + */ + ddi_put64(mpt->m_acc_post_queue_hdl, + &((uint64_t *)(void *)mpt->m_post_queue) + [mpt->m_post_index], 0xFFFFFFFFFFFFFFFF); + (void) ddi_dma_sync(mpt->m_dma_post_queue_hdl, 0, 0, + DDI_DMA_SYNC_FORDEV); /* * Increment post index and roll over if needed. 
@@ -5333,6 +5684,8 @@ if (++mpt->m_post_index == mpt->m_post_queue_depth) { mpt->m_post_index = 0; } + if (i >= MPI_ADDRESS_COALSCE_MAX) + break; } /* @@ -5342,12 +5695,43 @@ if (did_reply) { ddi_put32(mpt->m_datap, &mpt->m_reg->ReplyPostHostIndex, mpt->m_post_index); - } - } else { - mutex_exit(&mpt->m_mutex); + + /* + * For fma, only check the PIO is required and enough + * here. Those cases where fastpath is not hit, the + * mptsas_fma_check() check all of the types of + * fma. That is not necessary and sometimes not + * correct. fma check should only be done after + * the PIO and/or dma is performed. + */ + if ((mptsas_check_acc_handle(mpt->m_datap) != + DDI_SUCCESS)) { + ddi_fm_service_impact(mpt->m_dip, + DDI_SERVICE_UNAFFECTED); + } + + } + } else { + mutex_exit(&mpt->m_intr_mutex); return (DDI_INTR_UNCLAIMED); } NDBG1(("mptsas_intr complete")); + mutex_exit(&mpt->m_intr_mutex); + + /* + * Since most of the cmds(read and write IO with success return.) + * have already been processed in fast path in which the m_mutex + * is not held, handling here the address reply and other context reply + * such as passthrough and IOC cmd with m_mutex held should not be a big + * issue for performance. + * If holding m_mutex to process these cmds was still an obvious issue, + * we can process them in a taskq. + */ + for (j = 0; j < i; j++) { + mutex_enter(&mpt->m_mutex); + mptsas_process_intr(mpt, &mpt->m_reply[j]); + mutex_exit(&mpt->m_mutex); + } /* * If no helper threads are created, process the doneq in ISR. If @@ -5357,40 +5741,55 @@ * This measurement has some limitations, although it is simple and * straightforward and works well for most of the cases at present. 
*/ - if (!mpt->m_doneq_thread_n || - (mpt->m_doneq_len <= mpt->m_doneq_length_threshold)) { + if (!mpt->m_doneq_thread_n) { mptsas_doneq_empty(mpt); } else { - mptsas_deliver_doneq_thread(mpt); + int helper = 1; + mutex_enter(&mpt->m_intr_mutex); + if (mpt->m_doneq_len <= mpt->m_doneq_length_threshold) + helper = 0; + mutex_exit(&mpt->m_intr_mutex); + if (helper) { + mptsas_deliver_doneq_thread(mpt); + } else { + mptsas_doneq_empty(mpt); + } } /* * If there are queued cmd, start them now. */ + mutex_enter(&mpt->m_intr_mutex); if (mpt->m_waitq != NULL) { - mptsas_restart_waitq(mpt); - } - - mutex_exit(&mpt->m_mutex); + mutex_exit(&mpt->m_intr_mutex); + mutex_enter(&mpt->m_mutex); + mptsas_restart_hba(mpt); + mutex_exit(&mpt->m_mutex); + return (DDI_INTR_CLAIMED); + } + mutex_exit(&mpt->m_intr_mutex); return (DDI_INTR_CLAIMED); } +/* + * In ISR, the successfully completed read and write IO are processed in a + * fast path. This function is only used to handle non-fastpath IO, including + * all of the address reply, and the context reply for IOC cmd, passthrough, + * etc. + * This function is also used to process polled cmd. + */ static void mptsas_process_intr(mptsas_t *mpt, pMpi2ReplyDescriptorsUnion_t reply_desc_union) { uint8_t reply_type; - ASSERT(mutex_owned(&mpt->m_mutex)); - /* * The reply is valid, process it according to its * type. Also, set a flag for updated the reply index * after they've all been processed. 
*/ - reply_type = ddi_get8(mpt->m_acc_post_queue_hdl, - &reply_desc_union->Default.ReplyFlags); - reply_type &= MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK; + reply_type = reply_desc_union->Default.ReplyFlags; if (reply_type == MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS) { mptsas_handle_scsi_io_success(mpt, reply_desc_union); } else if (reply_type == MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY) { @@ -5399,16 +5798,6 @@ mptsas_log(mpt, CE_WARN, "?Bad reply type %x", reply_type); ddi_fm_service_impact(mpt->m_dip, DDI_SERVICE_UNAFFECTED); } - - /* - * Clear the reply descriptor for re-use and increment - * index. - */ - ddi_put64(mpt->m_acc_post_queue_hdl, - &((uint64_t *)(void *)mpt->m_post_queue)[mpt->m_post_index], - 0xFFFFFFFFFFFFFFFF); - (void) ddi_dma_sync(mpt->m_dma_post_queue_hdl, 0, 0, - DDI_DMA_SYNC_FORDEV); } /* @@ -5430,18 +5819,24 @@ * to kick in. We do this by having pkt_reason * as CMD_CMPLT and pkt_scbp as STATUS_QFULL. */ + mutex_enter(&ptgt->m_tgt_intr_mutex); mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE); - } else { + mutex_exit(&ptgt->m_tgt_intr_mutex); + } else { + mutex_enter(&ptgt->m_tgt_intr_mutex); if (ptgt->m_reset_delay == 0) { ptgt->m_t_throttle = max((ptgt->m_t_ncmds - 2), 0); } + mutex_exit(&ptgt->m_tgt_intr_mutex); cmd->cmd_pkt_flags |= FLAG_HEAD; cmd->cmd_flags &= ~(CFLAG_TRANFLAG); cmd->cmd_flags |= CFLAG_RETRY; + mutex_exit(&mpt->m_mutex); (void) mptsas_accept_pkt(mpt, cmd); + mutex_enter(&mpt->m_mutex); /* * when target gives queue full status with no commands @@ -5450,6 +5845,7 @@ * (see psarc/1994/313); if there are commands outstanding, * throttle is set to (m_t_ncmds - 2) */ + mutex_enter(&ptgt->m_tgt_intr_mutex); if (ptgt->m_t_throttle == HOLD_THROTTLE) { /* * By setting throttle to QFULL_THROTTLE, we @@ -5464,6 +5860,7 @@ ptgt->m_qfull_retry_interval); } } + mutex_exit(&ptgt->m_tgt_intr_mutex); } } @@ -6062,11 +6459,11 @@ * PHCI driver since failover finished. 
* Invalidate the devhdl */ + mutex_enter(&ptgt->m_tgt_intr_mutex); ptgt->m_devhdl = MPTSAS_INVALID_DEVHDL; ptgt->m_tgt_unconfigured = 0; - mutex_enter(&mpt->m_tx_waitq_mutex); ptgt->m_dr_flag = MPTSAS_DR_INACTIVE; - mutex_exit(&mpt->m_tx_waitq_mutex); + mutex_exit(&ptgt->m_tgt_intr_mutex); } /* @@ -6612,15 +7009,14 @@ /* * Update DR flag immediately avoid I/O failure * before failover finish. Pay attention to the - * mutex protect, we need grab m_tx_waitq_mutex - * during set m_dr_flag because we won't add - * the following command into waitq, instead, - * we need return TRAN_BUSY in the tran_start - * context. + * mutex protect, we need grab the per target + * mutex during set m_dr_flag because the + * m_mutex would not be held all the time in + * mptsas_scsi_start(). */ - mutex_enter(&mpt->m_tx_waitq_mutex); + mutex_enter(&ptgt->m_tgt_intr_mutex); ptgt->m_dr_flag = MPTSAS_DR_INTRANSITION; - mutex_exit(&mpt->m_tx_waitq_mutex); + mutex_exit(&ptgt->m_tgt_intr_mutex); topo_node = kmem_zalloc( sizeof (mptsas_topo_change_list_t), @@ -6853,9 +7249,9 @@ /* * Update DR flag immediately avoid I/O failure */ - mutex_enter(&mpt->m_tx_waitq_mutex); + mutex_enter(&ptgt->m_tgt_intr_mutex); ptgt->m_dr_flag = MPTSAS_DR_INTRANSITION; - mutex_exit(&mpt->m_tx_waitq_mutex); + mutex_exit(&ptgt->m_tgt_intr_mutex); topo_node = kmem_zalloc( sizeof (mptsas_topo_change_list_t), @@ -6887,9 +7283,9 @@ /* * Update DR flag immediately avoid I/O failure */ - mutex_enter(&mpt->m_tx_waitq_mutex); + mutex_enter(&ptgt->m_tgt_intr_mutex); ptgt->m_dr_flag = MPTSAS_DR_INTRANSITION; - mutex_exit(&mpt->m_tx_waitq_mutex); + mutex_exit(&ptgt->m_tgt_intr_mutex); topo_node = kmem_zalloc( sizeof (mptsas_topo_change_list_t), @@ -7544,12 +7940,14 @@ ptgt = (mptsas_target_t *)mptsas_hash_traverse(&mpt->m_active->m_tgttbl, MPTSAS_HASH_FIRST); while (ptgt != NULL) { + mutex_enter(&ptgt->m_tgt_intr_mutex); if (ptgt->m_reset_delay == 0) { if (ptgt->m_t_throttle == QFULL_THROTTLE) { mptsas_set_throttle(mpt, ptgt, 
MAX_THROTTLE); } } + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); @@ -7558,13 +7956,40 @@ mutex_exit(&mpt->m_mutex); } +/* + * mptsas_remove_cmd0 is similar to mptsas_remove_cmd except that it is called + * where m_intr_mutex has already been held. + */ void mptsas_remove_cmd(mptsas_t *mpt, mptsas_cmd_t *cmd) { + ASSERT(mutex_owned(&mpt->m_mutex)); + + /* + * With new fine-grained lock mechanism, the outstanding cmd is only + * linked to m_active before the dma is triggerred(MPTSAS_START_CMD) + * to send it. that is, mptsas_save_cmd() doesn't link the outstanding + * cmd now. So when mptsas_remove_cmd is called, a mptsas_save_cmd must + * have been called, but the cmd may have not been linked. + * For mptsas_remove_cmd0, the cmd must have been linked. + * In order to keep the same semantic, we link the cmd to the + * outstanding cmd list. + */ + mpt->m_active->m_slot[cmd->cmd_slot] = cmd; + + mutex_enter(&mpt->m_intr_mutex); + mptsas_remove_cmd0(mpt, cmd); + mutex_exit(&mpt->m_intr_mutex); +} + +static inline void +mptsas_remove_cmd0(mptsas_t *mpt, mptsas_cmd_t *cmd) +{ int slot; mptsas_slots_t *slots = mpt->m_active; int t; mptsas_target_t *ptgt = cmd->cmd_tgt_addr; + mptsas_slot_free_e_t *pe; ASSERT(cmd != NULL); ASSERT(cmd->cmd_queued == FALSE); @@ -7579,37 +8004,52 @@ t = Tgt(cmd); slot = cmd->cmd_slot; + pe = mpt->m_slot_free_ae + slot - 1; + ASSERT(cmd == slots->m_slot[slot]); + ASSERT((slot > 0) && slot < (mpt->m_max_requests - 1)); /* * remove the cmd. */ - if (cmd == slots->m_slot[slot]) { - NDBG31(("mptsas_remove_cmd: removing cmd=0x%p", (void *)cmd)); - slots->m_slot[slot] = NULL; - mpt->m_ncmds--; - - /* - * only decrement per target ncmds if command - * has a target associated with it. 
- */ - if ((cmd->cmd_flags & CFLAG_CMDIOC) == 0) { - ptgt->m_t_ncmds--; - /* - * reset throttle if we just ran an untagged command - * to a tagged target - */ - if ((ptgt->m_t_ncmds == 0) && - ((cmd->cmd_pkt_flags & FLAG_TAGMASK) == 0)) { - mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE); - } - } - + mutex_enter(&mpt->m_slot_freeq_pairp[pe->cpuid]. + m_slot_releq.s.m_fq_mutex); + NDBG31(("mptsas_remove_cmd0: removing cmd=0x%p", (void *)cmd)); + slots->m_slot[slot] = NULL; + ASSERT(pe->slot == slot); + list_insert_tail(&mpt->m_slot_freeq_pairp[pe->cpuid]. + m_slot_releq.s.m_fq_list, pe); + mpt->m_slot_freeq_pairp[pe->cpuid].m_slot_releq.s.m_fq_n++; + ASSERT(mpt->m_slot_freeq_pairp[pe->cpuid]. + m_slot_releq.s.m_fq_n <= mpt->m_max_requests - 2); + mutex_exit(&mpt->m_slot_freeq_pairp[pe->cpuid]. + m_slot_releq.s.m_fq_mutex); + + /* + * only decrement per target ncmds if command + * has a target associated with it. + */ + if ((cmd->cmd_flags & CFLAG_CMDIOC) == 0) { + mutex_enter(&ptgt->m_tgt_intr_mutex); + ptgt->m_t_ncmds--; + /* + * reset throttle if we just ran an untagged command + * to a tagged target + */ + if ((ptgt->m_t_ncmds == 0) && + ((cmd->cmd_pkt_flags & FLAG_TAGMASK) == 0)) { + mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE); + } + mutex_exit(&ptgt->m_tgt_intr_mutex); } /* * This is all we need to do for ioc commands. + * The ioc cmds would never be handled in fastpath in ISR, so we make + * sure the mptsas_return_to_pool() would always be called with + * m_mutex protected. */ if (cmd->cmd_flags & CFLAG_CMDIOC) { + ASSERT(mutex_owned(&mpt->m_mutex)); mptsas_return_to_pool(mpt, cmd); return; } @@ -7626,6 +8066,7 @@ * going to take a while... * Add 1 to m_n_slots to account for TM request. 
*/ + mutex_enter(&ptgt->m_tgt_intr_mutex); if (cmd->cmd_pkt->pkt_time == ptgt->m_timebase) { if (--(ptgt->m_dups) == 0) { if (ptgt->m_t_ncmds) { @@ -7659,40 +8100,19 @@ ptgt->m_timeout = ptgt->m_timebase; ASSERT(cmd != slots->m_slot[cmd->cmd_slot]); -} - -/* - * accept all cmds on the tx_waitq if any and then + mutex_exit(&ptgt->m_tgt_intr_mutex); +} + +/* * start a fresh request from the top of the device queue. - * - * since there are always cmds queued on the tx_waitq, and rare cmds on - * the instance waitq, so this function should not be invoked in the ISR, - * the mptsas_restart_waitq() is invoked in the ISR instead. otherwise, the - * burden belongs to the IO dispatch CPUs is moved the interrupt CPU. */ static void mptsas_restart_hba(mptsas_t *mpt) { - ASSERT(mutex_owned(&mpt->m_mutex)); - - mutex_enter(&mpt->m_tx_waitq_mutex); - if (mpt->m_tx_waitq) { - mptsas_accept_tx_waitq(mpt); - } - mutex_exit(&mpt->m_tx_waitq_mutex); - mptsas_restart_waitq(mpt); -} - -/* - * start a fresh request from the top of the device queue - */ -static void -mptsas_restart_waitq(mptsas_t *mpt) -{ mptsas_cmd_t *cmd, *next_cmd; mptsas_target_t *ptgt = NULL; - NDBG1(("mptsas_restart_waitq: mpt=0x%p", (void *)mpt)); + NDBG1(("mptsas_restart_hba: mpt=0x%p", (void *)mpt)); ASSERT(mutex_owned(&mpt->m_mutex)); @@ -7747,59 +8167,30 @@ } ptgt = cmd->cmd_tgt_addr; - if (ptgt && (ptgt->m_t_throttle == DRAIN_THROTTLE) && - (ptgt->m_t_ncmds == 0)) { - mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE); - } - if ((mpt->m_ncmds <= (mpt->m_max_requests - 2)) && - (ptgt && (ptgt->m_reset_delay == 0)) && - (ptgt && (ptgt->m_t_ncmds < - ptgt->m_t_throttle))) { - if (mptsas_save_cmd(mpt, cmd) == TRUE) { - mptsas_waitq_delete(mpt, cmd); - (void) mptsas_start_cmd(mpt, cmd); - } - } + if (ptgt) { + mutex_enter(&mpt->m_intr_mutex); + mutex_enter(&ptgt->m_tgt_intr_mutex); + if ((ptgt->m_t_throttle == DRAIN_THROTTLE) && + (ptgt->m_t_ncmds == 0)) { + mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE); + } + if 
((ptgt->m_reset_delay == 0) && + (ptgt->m_t_ncmds < ptgt->m_t_throttle)) { + mutex_exit(&ptgt->m_tgt_intr_mutex); + mutex_exit(&mpt->m_intr_mutex); + if (mptsas_save_cmd(mpt, cmd) == TRUE) { + mptsas_waitq_delete(mpt, cmd); + (void) mptsas_start_cmd(mpt, cmd); + } + goto out; + } + mutex_exit(&ptgt->m_tgt_intr_mutex); + mutex_exit(&mpt->m_intr_mutex); + } +out: cmd = next_cmd; } } -/* - * Cmds are queued if tran_start() doesn't get the m_mutexlock(no wait). - * Accept all those queued cmds before new cmd is accept so that the - * cmds are sent in order. - */ -static void -mptsas_accept_tx_waitq(mptsas_t *mpt) -{ - mptsas_cmd_t *cmd; - - ASSERT(mutex_owned(&mpt->m_mutex)); - ASSERT(mutex_owned(&mpt->m_tx_waitq_mutex)); - - /* - * A Bus Reset could occur at any time and flush the tx_waitq, - * so we cannot count on the tx_waitq to contain even one cmd. - * And when the m_tx_waitq_mutex is released and run - * mptsas_accept_pkt(), the tx_waitq may be flushed. - */ - cmd = mpt->m_tx_waitq; - for (;;) { - if ((cmd = mpt->m_tx_waitq) == NULL) { - mpt->m_tx_draining = 0; - break; - } - if ((mpt->m_tx_waitq = cmd->cmd_linkp) == NULL) { - mpt->m_tx_waitqtail = &mpt->m_tx_waitq; - } - cmd->cmd_linkp = NULL; - mutex_exit(&mpt->m_tx_waitq_mutex); - if (mptsas_accept_pkt(mpt, cmd) != TRAN_ACCEPT) - cmn_err(CE_WARN, "mpt: mptsas_accept_tx_waitq: failed " - "to accept cmd on queue\n"); - mutex_enter(&mpt->m_tx_waitq_mutex); - } -} - /* * mpt tag type lookup @@ -7807,8 +8198,21 @@ static char mptsas_tag_lookup[] = {0, MSG_HEAD_QTAG, MSG_ORDERED_QTAG, 0, MSG_SIMPLE_QTAG}; -static int -mptsas_start_cmd(mptsas_t *mpt, mptsas_cmd_t *cmd) +/* + * mptsas_start_cmd0 is similar to mptsas_start_cmd, except that, it is called + * without ANY mutex protected, while, mptsas_start_cmd is called with m_mutex + * protected. + * + * the relevant field in ptgt should be protected by m_tgt_intr_mutex in both + * functions. 
+ * + * before the cmds are linked on the slot for monitor as outstanding cmds, they + * are accessed as slab objects, so slab framework ensures the exclusive access, + * and no other mutex is requireed. Linking for monitor and the trigger of dma + * must be done exclusively. + */ +static int +mptsas_start_cmd0(mptsas_t *mpt, mptsas_cmd_t *cmd) { struct scsi_pkt *pkt = CMD2PKT(cmd); uint32_t control = 0; @@ -7821,7 +8225,7 @@ uint16_t SMID, io_flags = 0; uint32_t request_desc_low, request_desc_high; - NDBG1(("mptsas_start_cmd: cmd=0x%p", (void *)cmd)); + NDBG1(("mptsas_start_cmd0: cmd=0x%p", (void *)cmd)); /* * Set SMID and increment index. Rollover to 1 instead of 0 if index @@ -7843,6 +8247,7 @@ * then drain before submitting this cmd; SCSI-2 allows RQSENSE * to be untagged */ + mutex_enter(&ptgt->m_tgt_intr_mutex); if (((cmd->cmd_pkt_flags & FLAG_TAGMASK) == 0) && (ptgt->m_t_ncmds > 1) && ((cmd->cmd_flags & CFLAG_TM_CMD) == 0) && @@ -7854,13 +8259,19 @@ if (ptgt->m_reset_delay == 0) { mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE); } - + mutex_exit(&ptgt->m_tgt_intr_mutex); + + mutex_enter(&mpt->m_mutex); mptsas_remove_cmd(mpt, cmd); cmd->cmd_pkt_flags |= FLAG_HEAD; mptsas_waitq_add(mpt, cmd); - } - return (DDI_FAILURE); - } + mutex_exit(&mpt->m_mutex); + return (DDI_FAILURE); + } + mutex_exit(&ptgt->m_tgt_intr_mutex); + return (DDI_FAILURE); + } + mutex_exit(&ptgt->m_tgt_intr_mutex); /* * Set correct tag bits. @@ -7944,11 +8355,16 @@ */ request_desc_low = (SMID << 16) + MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO; request_desc_high = ptgt->m_devhdl << 16; + + mutex_enter(&mpt->m_mutex); + mpt->m_active->m_slot[cmd->cmd_slot] = cmd; MPTSAS_START_CMD(mpt, request_desc_low, request_desc_high); + mutex_exit(&mpt->m_mutex); /* * Start timeout. 
*/ + mutex_enter(&ptgt->m_tgt_intr_mutex); #ifdef MPTSAS_TEST /* * Temporarily set timebase = 0; needed for @@ -7980,6 +8396,198 @@ ptgt->m_timebase = 60; } #endif + mutex_exit(&ptgt->m_tgt_intr_mutex); + + if ((mptsas_check_dma_handle(dma_hdl) != DDI_SUCCESS) || + (mptsas_check_acc_handle(acc_hdl) != DDI_SUCCESS)) { + ddi_fm_service_impact(mpt->m_dip, DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + return (DDI_SUCCESS); +} + +static int +mptsas_start_cmd(mptsas_t *mpt, mptsas_cmd_t *cmd) +{ + struct scsi_pkt *pkt = CMD2PKT(cmd); + uint32_t control = 0; + int n; + caddr_t mem; + pMpi2SCSIIORequest_t io_request; + ddi_dma_handle_t dma_hdl = mpt->m_dma_req_frame_hdl; + ddi_acc_handle_t acc_hdl = mpt->m_acc_req_frame_hdl; + mptsas_target_t *ptgt = cmd->cmd_tgt_addr; + uint16_t SMID, io_flags = 0; + uint32_t request_desc_low, request_desc_high; + + NDBG1(("mptsas_start_cmd: cmd=0x%p", (void *)cmd)); + + /* + * Set SMID and increment index. Rollover to 1 instead of 0 if index + * is at the max. 0 is an invalid SMID, so we call the first index 1. + */ + SMID = cmd->cmd_slot; + + /* + * It is possible for back to back device reset to + * happen before the reset delay has expired. That's + * ok, just let the device reset go out on the bus. 
+ */ + if ((cmd->cmd_pkt_flags & FLAG_NOINTR) == 0) { + ASSERT(ptgt->m_reset_delay == 0); + } + + /* + * if a non-tagged cmd is submitted to an active tagged target + * then drain before submitting this cmd; SCSI-2 allows RQSENSE + * to be untagged + */ + mutex_enter(&ptgt->m_tgt_intr_mutex); + if (((cmd->cmd_pkt_flags & FLAG_TAGMASK) == 0) && + (ptgt->m_t_ncmds > 1) && + ((cmd->cmd_flags & CFLAG_TM_CMD) == 0) && + (*(cmd->cmd_pkt->pkt_cdbp) != SCMD_REQUEST_SENSE)) { + if ((cmd->cmd_pkt_flags & FLAG_NOINTR) == 0) { + NDBG23(("target=%d, untagged cmd, start draining\n", + ptgt->m_devhdl)); + + if (ptgt->m_reset_delay == 0) { + mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE); + } + mutex_exit(&ptgt->m_tgt_intr_mutex); + + mptsas_remove_cmd(mpt, cmd); + cmd->cmd_pkt_flags |= FLAG_HEAD; + mptsas_waitq_add(mpt, cmd); + return (DDI_FAILURE); + } + mutex_exit(&ptgt->m_tgt_intr_mutex); + return (DDI_FAILURE); + } + mutex_exit(&ptgt->m_tgt_intr_mutex); + + /* + * Set correct tag bits. + */ + if (cmd->cmd_pkt_flags & FLAG_TAGMASK) { + switch (mptsas_tag_lookup[((cmd->cmd_pkt_flags & + FLAG_TAGMASK) >> 12)]) { + case MSG_SIMPLE_QTAG: + control |= MPI2_SCSIIO_CONTROL_SIMPLEQ; + break; + case MSG_HEAD_QTAG: + control |= MPI2_SCSIIO_CONTROL_HEADOFQ; + break; + case MSG_ORDERED_QTAG: + control |= MPI2_SCSIIO_CONTROL_ORDEREDQ; + break; + default: + mptsas_log(mpt, CE_WARN, "mpt: Invalid tag type\n"); + break; + } + } else { + if (*(cmd->cmd_pkt->pkt_cdbp) != SCMD_REQUEST_SENSE) { + ptgt->m_t_throttle = 1; + } + control |= MPI2_SCSIIO_CONTROL_SIMPLEQ; + } + + if (cmd->cmd_pkt_flags & FLAG_TLR) { + control |= MPI2_SCSIIO_CONTROL_TLR_ON; + } + + mem = mpt->m_req_frame + (mpt->m_req_frame_size * SMID); + io_request = (pMpi2SCSIIORequest_t)mem; + + bzero(io_request, sizeof (Mpi2SCSIIORequest_t)); + ddi_put8(acc_hdl, &io_request->SGLOffset0, offsetof + (MPI2_SCSI_IO_REQUEST, SGL) / 4); + mptsas_init_std_hdr(acc_hdl, io_request, ptgt->m_devhdl, Lun(cmd), 0, + 
MPI2_FUNCTION_SCSI_IO_REQUEST); + + (void) ddi_rep_put8(acc_hdl, (uint8_t *)pkt->pkt_cdbp, + io_request->CDB.CDB32, cmd->cmd_cdblen, DDI_DEV_AUTOINCR); + + io_flags = cmd->cmd_cdblen; + ddi_put16(acc_hdl, &io_request->IoFlags, io_flags); + /* + * setup the Scatter/Gather DMA list for this request + */ + if (cmd->cmd_cookiec > 0) { + mptsas_sge_setup(mpt, cmd, &control, io_request, acc_hdl); + } else { + ddi_put32(acc_hdl, &io_request->SGL.MpiSimple.FlagsLength, + ((uint32_t)MPI2_SGE_FLAGS_LAST_ELEMENT | + MPI2_SGE_FLAGS_END_OF_BUFFER | + MPI2_SGE_FLAGS_SIMPLE_ELEMENT | + MPI2_SGE_FLAGS_END_OF_LIST) << MPI2_SGE_FLAGS_SHIFT); + } + + /* + * save ARQ information + */ + ddi_put8(acc_hdl, &io_request->SenseBufferLength, cmd->cmd_rqslen); + if ((cmd->cmd_flags & (CFLAG_SCBEXTERN | CFLAG_EXTARQBUFVALID)) == + (CFLAG_SCBEXTERN | CFLAG_EXTARQBUFVALID)) { + ddi_put32(acc_hdl, &io_request->SenseBufferLowAddress, + cmd->cmd_ext_arqcookie.dmac_address); + } else { + ddi_put32(acc_hdl, &io_request->SenseBufferLowAddress, + cmd->cmd_arqcookie.dmac_address); + } + + ddi_put32(acc_hdl, &io_request->Control, control); + + NDBG31(("starting message=0x%p, with cmd=0x%p", + (void *)(uintptr_t)mpt->m_req_frame_dma_addr, (void *)cmd)); + + (void) ddi_dma_sync(dma_hdl, 0, 0, DDI_DMA_SYNC_FORDEV); + + /* + * Build request descriptor and write it to the request desc post reg. + */ + request_desc_low = (SMID << 16) + MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO; + request_desc_high = ptgt->m_devhdl << 16; + + mpt->m_active->m_slot[cmd->cmd_slot] = cmd; + MPTSAS_START_CMD(mpt, request_desc_low, request_desc_high); + + /* + * Start timeout. + */ + mutex_enter(&ptgt->m_tgt_intr_mutex); +#ifdef MPTSAS_TEST + /* + * Temporarily set timebase = 0; needed for + * timeout torture test. 
+ */ + if (mptsas_test_timeouts) { + ptgt->m_timebase = 0; + } +#endif + n = pkt->pkt_time - ptgt->m_timebase; + + if (n == 0) { + (ptgt->m_dups)++; + ptgt->m_timeout = ptgt->m_timebase; + } else if (n > 0) { + ptgt->m_timeout = + ptgt->m_timebase = pkt->pkt_time; + ptgt->m_dups = 1; + } else if (n < 0) { + ptgt->m_timeout = ptgt->m_timebase; + } +#ifdef MPTSAS_TEST + /* + * Set back to a number higher than + * mptsas_scsi_watchdog_tick + * so timeouts will happen in mptsas_watchsubr + */ + if (mptsas_test_timeouts) { + ptgt->m_timebase = 60; + } +#endif + mutex_exit(&ptgt->m_tgt_intr_mutex); if ((mptsas_check_dma_handle(dma_hdl) != DDI_SUCCESS) || (mptsas_check_acc_handle(acc_hdl) != DDI_SUCCESS)) { @@ -8026,7 +8634,7 @@ } /* - * move the current global doneq to the doneq of thead[t] + * move the current global doneq to the doneq of thread[t] */ static void mptsas_doneq_mv(mptsas_t *mpt, uint64_t t) @@ -8035,6 +8643,7 @@ mptsas_doneq_thread_list_t *item = &mpt->m_doneq_thread_id[t]; ASSERT(mutex_owned(&item->mutex)); + mutex_enter(&mpt->m_intr_mutex); while ((cmd = mpt->m_doneq) != NULL) { if ((mpt->m_doneq = cmd->cmd_linkp) == NULL) { mpt->m_donetail = &mpt->m_doneq; @@ -8045,6 +8654,7 @@ mpt->m_doneq_len--; item->len++; } + mutex_exit(&mpt->m_intr_mutex); } void @@ -8119,6 +8729,33 @@ } /* + * mptsas_doneq_add0 is similar to mptsas_doneq_add except that it is called + * where m_intr_mutex has already been held. + */ +static inline void +mptsas_doneq_add0(mptsas_t *mpt, mptsas_cmd_t *cmd) +{ + struct scsi_pkt *pkt = CMD2PKT(cmd); + + NDBG31(("mptsas_doneq_add0: cmd=0x%p", (void *)cmd)); + + ASSERT((cmd->cmd_flags & CFLAG_COMPLETED) == 0); + cmd->cmd_linkp = NULL; + cmd->cmd_flags |= CFLAG_FINISHED; + cmd->cmd_flags &= ~CFLAG_IN_TRANSPORT; + + /* + * only add scsi pkts that have completion routines to + * the doneq. no intr cmds do not have callbacks. 
+ */ + if (pkt && (pkt->pkt_comp)) { + *mpt->m_donetail = cmd; + mpt->m_donetail = &cmd->cmd_linkp; + mpt->m_doneq_len++; + } +} + +/* * These routines manipulate the queue of commands that * are waiting for their completion routines to be called. * The queue is usually in FIFO order but on an MP system @@ -8130,26 +8767,13 @@ static void mptsas_doneq_add(mptsas_t *mpt, mptsas_cmd_t *cmd) { - struct scsi_pkt *pkt = CMD2PKT(cmd); - - NDBG31(("mptsas_doneq_add: cmd=0x%p", (void *)cmd)); - - ASSERT((cmd->cmd_flags & CFLAG_COMPLETED) == 0); - cmd->cmd_linkp = NULL; - cmd->cmd_flags |= CFLAG_FINISHED; - cmd->cmd_flags &= ~CFLAG_IN_TRANSPORT; + ASSERT(mutex_owned(&mpt->m_mutex)); mptsas_fma_check(mpt, cmd); - /* - * only add scsi pkts that have completion routines to - * the doneq. no intr cmds do not have callbacks. - */ - if (pkt && (pkt->pkt_comp)) { - *mpt->m_donetail = cmd; - mpt->m_donetail = &cmd->cmd_linkp; - mpt->m_doneq_len++; - } + mutex_enter(&mpt->m_intr_mutex); + mptsas_doneq_add0(mpt, cmd); + mutex_exit(&mpt->m_intr_mutex); } static mptsas_cmd_t * @@ -8174,6 +8798,7 @@ static void mptsas_doneq_empty(mptsas_t *mpt) { + mutex_enter(&mpt->m_intr_mutex); if (mpt->m_doneq && !mpt->m_in_callback) { mptsas_cmd_t *cmd, *next; struct scsi_pkt *pkt; @@ -8184,7 +8809,14 @@ mpt->m_donetail = &mpt->m_doneq; mpt->m_doneq_len = 0; - mutex_exit(&mpt->m_mutex); + mutex_exit(&mpt->m_intr_mutex); + + /* + * ONLY in ISR, is it called without m_mutex held, otherwise, + * it is always called with m_mutex held. 
+ */ + if ((curthread->t_flag & T_INTR_THREAD) == 0) + mutex_exit(&mpt->m_mutex); /* * run the completion routines of all the * completed commands @@ -8198,9 +8830,12 @@ mptsas_pkt_comp(pkt, cmd); cmd = next; } - mutex_enter(&mpt->m_mutex); + if ((curthread->t_flag & T_INTR_THREAD) == 0) + mutex_enter(&mpt->m_mutex); mpt->m_in_callback = 0; - } + return; + } + mutex_exit(&mpt->m_intr_mutex); } /* @@ -8215,10 +8850,12 @@ if (ptgt) ptgt->m_t_nwait++; if (cmd->cmd_pkt_flags & FLAG_HEAD) { + mutex_enter(&mpt->m_intr_mutex); if ((cmd->cmd_linkp = mpt->m_waitq) == NULL) { mpt->m_waitqtail = &cmd->cmd_linkp; } mpt->m_waitq = cmd; + mutex_exit(&mpt->m_intr_mutex); } else { cmd->cmd_linkp = NULL; *(mpt->m_waitqtail) = cmd; @@ -8233,7 +8870,9 @@ mptsas_target_t *ptgt; NDBG7(("mptsas_waitq_rm")); + mutex_enter(&mpt->m_intr_mutex); MPTSAS_WAITQ_RM(mpt, cmd); + mutex_exit(&mpt->m_intr_mutex); NDBG7(("mptsas_waitq_rm: cmd=0x%p", (void *)cmd)); if (cmd) { @@ -8263,8 +8902,10 @@ } if (prevp == cmd) { + mutex_enter(&mpt->m_intr_mutex); if ((mpt->m_waitq = cmd->cmd_linkp) == NULL) mpt->m_waitqtail = &mpt->m_waitq; + mutex_exit(&mpt->m_intr_mutex); cmd->cmd_linkp = NULL; cmd->cmd_queued = FALSE; @@ -8289,57 +8930,6 @@ cmn_err(CE_PANIC, "mpt: mptsas_waitq_delete: queue botch"); } -static mptsas_cmd_t * -mptsas_tx_waitq_rm(mptsas_t *mpt) -{ - mptsas_cmd_t *cmd; - NDBG7(("mptsas_tx_waitq_rm")); - - MPTSAS_TX_WAITQ_RM(mpt, cmd); - - NDBG7(("mptsas_tx_waitq_rm: cmd=0x%p", (void *)cmd)); - - return (cmd); -} - -/* - * remove specified cmd from the middle of the tx_waitq. 
- */ -static void -mptsas_tx_waitq_delete(mptsas_t *mpt, mptsas_cmd_t *cmd) -{ - mptsas_cmd_t *prevp = mpt->m_tx_waitq; - - NDBG7(("mptsas_tx_waitq_delete: mpt=0x%p cmd=0x%p", - (void *)mpt, (void *)cmd)); - - if (prevp == cmd) { - if ((mpt->m_tx_waitq = cmd->cmd_linkp) == NULL) - mpt->m_tx_waitqtail = &mpt->m_tx_waitq; - - cmd->cmd_linkp = NULL; - cmd->cmd_queued = FALSE; - NDBG7(("mptsas_tx_waitq_delete: mpt=0x%p cmd=0x%p", - (void *)mpt, (void *)cmd)); - return; - } - - while (prevp != NULL) { - if (prevp->cmd_linkp == cmd) { - if ((prevp->cmd_linkp = cmd->cmd_linkp) == NULL) - mpt->m_tx_waitqtail = &prevp->cmd_linkp; - - cmd->cmd_linkp = NULL; - cmd->cmd_queued = FALSE; - NDBG7(("mptsas_tx_waitq_delete: mpt=0x%p cmd=0x%p", - (void *)mpt, (void *)cmd)); - return; - } - prevp = prevp->cmd_linkp; - } - cmn_err(CE_PANIC, "mpt: mptsas_tx_waitq_delete: queue botch"); -} - /* * device and bus reset handling * @@ -8503,9 +9093,11 @@ * and target/lun for abort task set. * Account for TM requests, which use the last SMID. 
*/ + mutex_enter(&mpt->m_intr_mutex); for (slot = 0; slot <= mpt->m_active->m_n_slots; slot++) { - if ((cmd = slots->m_slot[slot]) == NULL) + if ((cmd = slots->m_slot[slot]) == NULL) { continue; + } reason = CMD_RESET; stat = STAT_DEV_RESET; switch (tasktype) { @@ -8515,9 +9107,9 @@ "NULL cmd in slot %d, tasktype 0x%x", slot, tasktype)); mptsas_dump_cmd(mpt, cmd); - mptsas_remove_cmd(mpt, cmd); + mptsas_remove_cmd0(mpt, cmd); mptsas_set_pkt_reason(mpt, cmd, reason, stat); - mptsas_doneq_add(mpt, cmd); + mptsas_doneq_add0(mpt, cmd); } break; case MPI2_SCSITASKMGMT_TASKTYPE_ABRT_TASK_SET: @@ -8531,19 +9123,20 @@ "NULL cmd in slot %d, tasktype 0x%x", slot, tasktype)); mptsas_dump_cmd(mpt, cmd); - mptsas_remove_cmd(mpt, cmd); + mptsas_remove_cmd0(mpt, cmd); mptsas_set_pkt_reason(mpt, cmd, reason, stat); - mptsas_doneq_add(mpt, cmd); + mptsas_doneq_add0(mpt, cmd); } break; default: break; } } - - /* - * Flush the waitq and tx_waitq of this target's cmds + mutex_exit(&mpt->m_intr_mutex); + + /* + * Flush the waitq of this target's cmds */ cmd = mpt->m_waitq; @@ -8562,21 +9155,6 @@ } cmd = next_cmd; } - mutex_enter(&mpt->m_tx_waitq_mutex); - cmd = mpt->m_tx_waitq; - while (cmd != NULL) { - next_cmd = cmd->cmd_linkp; - if (Tgt(cmd) == target) { - mptsas_tx_waitq_delete(mpt, cmd); - mutex_exit(&mpt->m_tx_waitq_mutex); - mptsas_set_pkt_reason(mpt, cmd, - reason, stat); - mptsas_doneq_add(mpt, cmd); - mutex_enter(&mpt->m_tx_waitq_mutex); - } - cmd = next_cmd; - } - mutex_exit(&mpt->m_tx_waitq_mutex); break; case MPI2_SCSITASKMGMT_TASKTYPE_ABRT_TASK_SET: reason = CMD_ABORTED; @@ -8593,21 +9171,6 @@ } cmd = next_cmd; } - mutex_enter(&mpt->m_tx_waitq_mutex); - cmd = mpt->m_tx_waitq; - while (cmd != NULL) { - next_cmd = cmd->cmd_linkp; - if ((Tgt(cmd) == target) && (Lun(cmd) == lun)) { - mptsas_tx_waitq_delete(mpt, cmd); - mutex_exit(&mpt->m_tx_waitq_mutex); - mptsas_set_pkt_reason(mpt, cmd, - reason, stat); - mptsas_doneq_add(mpt, cmd); - mutex_enter(&mpt->m_tx_waitq_mutex); - } 
- cmd = next_cmd; - } - mutex_exit(&mpt->m_tx_waitq_mutex); break; default: mptsas_log(mpt, CE_WARN, "Unknown task management type %d.", @@ -8635,9 +9198,11 @@ * sure all commands have been flushed. * Account for TM request, which use the last SMID. */ + mutex_enter(&mpt->m_intr_mutex); for (slot = 0; slot <= mpt->m_active->m_n_slots; slot++) { - if ((cmd = slots->m_slot[slot]) == NULL) + if ((cmd = slots->m_slot[slot]) == NULL) { continue; + } if (cmd->cmd_flags & CFLAG_CMDIOC) { /* @@ -8665,10 +9230,11 @@ slot)); mptsas_dump_cmd(mpt, cmd); - mptsas_remove_cmd(mpt, cmd); + mptsas_remove_cmd0(mpt, cmd); mptsas_set_pkt_reason(mpt, cmd, CMD_RESET, STAT_BUS_RESET); - mptsas_doneq_add(mpt, cmd); - } + mptsas_doneq_add0(mpt, cmd); + } + mutex_exit(&mpt->m_intr_mutex); /* * Flush the waitq. @@ -8686,18 +9252,6 @@ mptsas_doneq_add(mpt, cmd); } } - - /* - * Flush the tx_waitq - */ - mutex_enter(&mpt->m_tx_waitq_mutex); - while ((cmd = mptsas_tx_waitq_rm(mpt)) != NULL) { - mutex_exit(&mpt->m_tx_waitq_mutex); - mptsas_set_pkt_reason(mpt, cmd, CMD_RESET, STAT_BUS_RESET); - mptsas_doneq_add(mpt, cmd); - mutex_enter(&mpt->m_tx_waitq_mutex); - } - mutex_exit(&mpt->m_tx_waitq_mutex); } /* @@ -8746,8 +9300,10 @@ ptgt = (mptsas_target_t *)mptsas_hash_traverse(&mpt->m_active->m_tgttbl, MPTSAS_HASH_FIRST); while (ptgt != NULL) { + mutex_enter(&ptgt->m_tgt_intr_mutex); mptsas_set_throttle(mpt, ptgt, HOLD_THROTTLE); ptgt->m_reset_delay = mpt->m_scsi_reset_delay; + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); @@ -8805,6 +9361,7 @@ ptgt = (mptsas_target_t *)mptsas_hash_traverse(&mpt->m_active->m_tgttbl, MPTSAS_HASH_FIRST); while (ptgt != NULL) { + mutex_enter(&ptgt->m_tgt_intr_mutex); if (ptgt->m_reset_delay != 0) { ptgt->m_reset_delay -= MPTSAS_WATCH_RESET_DELAY_TICK; @@ -8817,6 +9374,7 @@ done = -1; } } + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( 
&mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); @@ -8912,8 +9470,9 @@ /* * Have mpt firmware abort this command */ - + mutex_enter(&mpt->m_intr_mutex); if (slots->m_slot[sp->cmd_slot] != NULL) { + mutex_exit(&mpt->m_intr_mutex); rval = mptsas_ioc_task_management(mpt, MPI2_SCSITASKMGMT_TASKTYPE_ABORT_TASK, target, lun, NULL, 0, 0); @@ -8927,6 +9486,7 @@ rval = FALSE; goto done; } + mutex_exit(&mpt->m_intr_mutex); } /* @@ -9038,6 +9598,7 @@ mptsas_t *mpt = ADDR2MPT(ap); int ckey; int rval = FALSE; + mptsas_target_t *ptgt; NDBG24(("mptsas_scsi_setcap: target=%d, cap=%s value=%x tgtonly=%x", ap->a_target, cap, value, tgtonly)); @@ -9077,9 +9638,11 @@ } break; case SCSI_CAP_TAGGED_QING: - mptsas_set_throttle(mpt, ((mptsas_tgt_private_t *) - (ap->a_hba_tran->tran_tgt_private))->t_private, - MAX_THROTTLE); + ptgt = ((mptsas_tgt_private_t *) + (ap->a_hba_tran->tran_tgt_private))->t_private; + mutex_enter(&ptgt->m_tgt_intr_mutex); + mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE); + mutex_exit(&ptgt->m_tgt_intr_mutex); rval = TRUE; break; case SCSI_CAP_QFULL_RETRIES: @@ -9123,13 +9686,13 @@ mptsas_slots_t *old_active = mpt->m_active; mptsas_slots_t *new_active; size_t size; - int rval = -1, i; - - /* - * if there are active commands, then we cannot - * change size of active slots array. - */ - ASSERT(mpt->m_ncmds == 0); + int rval = -1, nslot, i; + mptsas_slot_free_e_t *pe; + + if (mptsas_outstanding_cmds_n(mpt)) { + NDBG9(("cannot change size of active slots array")); + return (rval); + } size = MPTSAS_SLOTS_SIZE(mpt); new_active = kmem_zalloc(size, flag); @@ -9142,9 +9705,10 @@ * number of slots that can be used at any one time is * m_max_requests - 2. 
*/ - new_active->m_n_slots = (mpt->m_max_requests - 2); + new_active->m_n_slots = nslot = (mpt->m_max_requests - 2); new_active->m_size = size; new_active->m_tags = 1; + if (old_active) { new_active->m_tgttbl = old_active->m_tgttbl; new_active->m_smptbl = old_active->m_smptbl; @@ -9156,6 +9720,62 @@ } mptsas_free_active_slots(mpt); } + + if (max_ncpus & (max_ncpus - 1)) { + mpt->m_slot_freeq_pair_n = (1 << highbit(max_ncpus)); + } else { + mpt->m_slot_freeq_pair_n = max_ncpus; + } + mpt->m_slot_freeq_pairp = kmem_zalloc( + mpt->m_slot_freeq_pair_n * + sizeof (mptsas_slot_freeq_pair_t), KM_SLEEP); + for (i = 0; i < mpt->m_slot_freeq_pair_n; i++) { + list_create(&mpt->m_slot_freeq_pairp[i]. + m_slot_allocq.s.m_fq_list, + sizeof (mptsas_slot_free_e_t), + offsetof(mptsas_slot_free_e_t, node)); + list_create(&mpt->m_slot_freeq_pairp[i]. + m_slot_releq.s.m_fq_list, + sizeof (mptsas_slot_free_e_t), + offsetof(mptsas_slot_free_e_t, node)); + mpt->m_slot_freeq_pairp[i].m_slot_allocq.s.m_fq_n = 0; + mpt->m_slot_freeq_pairp[i].m_slot_releq.s.m_fq_n = 0; + mutex_init(&mpt->m_slot_freeq_pairp[i]. + m_slot_allocq.s.m_fq_mutex, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(mpt->m_intr_pri)); + mutex_init(&mpt->m_slot_freeq_pairp[i]. + m_slot_releq.s.m_fq_mutex, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(mpt->m_intr_pri)); + } + pe = mpt->m_slot_free_ae = kmem_zalloc(nslot * + sizeof (mptsas_slot_free_e_t), KM_SLEEP); + /* + * An array of Mpi2ReplyDescriptorsUnion_t is defined here. + * We are trying to eliminate the m_mutex in the context + * reply code path in the ISR. Since the read of the + * ReplyDescriptor and update/write of the ReplyIndex must + * be atomic (since the poll thread may also update them at + * the same time) so we first read out of the ReplyDescriptor + * into this array and update the ReplyIndex register with a + * separate mutex m_intr_mutex protected, and then release the + * mutex and process all of them. 
the length of the array is + * defined as max as 128(128*64=8k), which is + * assumed as the maxmium depth of the interrupt coalese. + */ + mpt->m_reply = kmem_zalloc(MPI_ADDRESS_COALSCE_MAX * + sizeof (Mpi2ReplyDescriptorsUnion_t), KM_SLEEP); + for (i = 0; i < nslot; i++, pe++) { + pe->slot = i + 1; /* SMID 0 is reserved */ + pe->cpuid = i % mpt->m_slot_freeq_pair_n; + list_insert_tail(&mpt->m_slot_freeq_pairp + [i % mpt->m_slot_freeq_pair_n] + .m_slot_allocq.s.m_fq_list, pe); + mpt->m_slot_freeq_pairp[i % mpt->m_slot_freeq_pair_n] + .m_slot_allocq.s.m_fq_n++; + mpt->m_slot_freeq_pairp[i % mpt->m_slot_freeq_pair_n] + .m_slot_allocq.s.m_fq_n_init++; + } + mpt->m_active = new_active; rval = 0; @@ -9167,9 +9787,44 @@ { mptsas_slots_t *active = mpt->m_active; size_t size; + mptsas_slot_free_e_t *pe; + int i; if (active == NULL) return; + + if (mpt->m_slot_freeq_pairp) { + for (i = 0; i < mpt->m_slot_freeq_pair_n; i++) { + while ((pe = list_head(&mpt->m_slot_freeq_pairp + [i].m_slot_allocq.s.m_fq_list)) != NULL) { + list_remove(&mpt->m_slot_freeq_pairp[i] + .m_slot_allocq.s.m_fq_list, pe); + } + list_destroy(&mpt->m_slot_freeq_pairp + [i].m_slot_allocq.s.m_fq_list); + while ((pe = list_head(&mpt->m_slot_freeq_pairp + [i].m_slot_releq.s.m_fq_list)) != NULL) { + list_remove(&mpt->m_slot_freeq_pairp[i] + .m_slot_releq.s.m_fq_list, pe); + } + list_destroy(&mpt->m_slot_freeq_pairp + [i].m_slot_releq.s.m_fq_list); + mutex_destroy(&mpt->m_slot_freeq_pairp + [i].m_slot_allocq.s.m_fq_mutex); + mutex_destroy(&mpt->m_slot_freeq_pairp + [i].m_slot_releq.s.m_fq_mutex); + } + kmem_free(mpt->m_slot_freeq_pairp, mpt->m_slot_freeq_pair_n * + sizeof (mptsas_slot_freeq_pair_t)); + } + if (mpt->m_slot_free_ae) + kmem_free(mpt->m_slot_free_ae, mpt->m_active->m_n_slots * + sizeof (mptsas_slot_free_e_t)); + + if (mpt->m_reply) + kmem_free(mpt->m_reply, MPI_ADDRESS_COALSCE_MAX * + sizeof (Mpi2ReplyDescriptorsUnion_t)); + size = active->m_size; kmem_free(active, size); mpt->m_active = NULL; @@ 
-9316,6 +9971,7 @@ * Check for commands stuck in active slot * Account for TM requests, which use the last SMID. */ + mutex_enter(&mpt->m_intr_mutex); for (i = 0; i <= mpt->m_active->m_n_slots; i++) { if ((cmd = mpt->m_active->m_slot[i]) != NULL) { if ((cmd->cmd_flags & CFLAG_CMDIOC) == 0) { @@ -9326,9 +9982,11 @@ * There seems to be a command stuck * in the active slot. Drain throttle. */ - mptsas_set_throttle(mpt, - cmd->cmd_tgt_addr, + ptgt = cmd->cmd_tgt_addr; + mutex_enter(&ptgt->m_tgt_intr_mutex); + mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE); + mutex_exit(&ptgt->m_tgt_intr_mutex); } } if ((cmd->cmd_flags & CFLAG_PASSTHRU) || @@ -9349,11 +10007,19 @@ } } } + mutex_exit(&mpt->m_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse(&mpt->m_active->m_tgttbl, MPTSAS_HASH_FIRST); while (ptgt != NULL) { /* + * In order to avoid using m_mutex in the key code path in ISR, + * separate mutexs are introduced to protect those elements + * shown in ISR. + */ + mutex_enter(&ptgt->m_tgt_intr_mutex); + + /* * If we were draining due to a qfull condition, * go back to full throttle. 
*/ @@ -9371,6 +10037,7 @@ mptsas_scsi_watchdog_tick) { ptgt->m_timebase += mptsas_scsi_watchdog_tick; + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); continue; @@ -9379,6 +10046,7 @@ ptgt->m_timeout -= mptsas_scsi_watchdog_tick; if (ptgt->m_timeout < 0) { + mutex_exit(&ptgt->m_tgt_intr_mutex); mptsas_cmd_timeout(mpt, ptgt->m_devhdl); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); @@ -9392,7 +10060,7 @@ DRAIN_THROTTLE); } } - + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); } @@ -9461,14 +10129,18 @@ ptgt = (mptsas_target_t *)mptsas_hash_traverse(&mpt->m_active->m_tgttbl, MPTSAS_HASH_FIRST); while (ptgt != NULL) { + mutex_enter(&ptgt->m_tgt_intr_mutex); mptsas_set_throttle(mpt, ptgt, HOLD_THROTTLE); + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); } /* If there are any outstanding commands in the queue */ - if (mpt->m_ncmds) { + mutex_enter(&mpt->m_intr_mutex); + if (mptsas_outstanding_cmds_n(mpt)) { + mutex_exit(&mpt->m_intr_mutex); mpt->m_softstate |= MPTSAS_SS_DRAINING; mpt->m_quiesce_timeid = timeout(mptsas_ncmds_checkdrain, mpt, (MPTSAS_QUIESCE_TIMEOUT * drv_usectohz(1000000))); @@ -9480,7 +10152,9 @@ ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_FIRST); while (ptgt != NULL) { + mutex_enter(&ptgt->m_tgt_intr_mutex); mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE); + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); @@ -9504,6 +10178,7 @@ return (0); } } + mutex_exit(&mpt->m_intr_mutex); /* Bus was not busy - QUIESCED */ mutex_exit(&mpt->m_mutex); @@ -9521,7 +10196,9 @@ ptgt = (mptsas_target_t *)mptsas_hash_traverse(&mpt->m_active->m_tgttbl, 
MPTSAS_HASH_FIRST); while (ptgt != NULL) { + mutex_enter(&ptgt->m_tgt_intr_mutex); mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE); + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); @@ -9540,10 +10217,9 @@ mutex_enter(&mpt->m_mutex); if (mpt->m_softstate & MPTSAS_SS_DRAINING) { mpt->m_quiesce_timeid = 0; - if (mpt->m_ncmds == 0) { - /* Command queue has been drained */ - cv_signal(&mpt->m_cv); - } else { + mutex_enter(&mpt->m_intr_mutex); + if (mptsas_outstanding_cmds_n(mpt)) { + mutex_exit(&mpt->m_intr_mutex); /* * The throttle may have been reset because * of a SCSI bus reset @@ -9551,7 +10227,9 @@ ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_FIRST); while (ptgt != NULL) { + mutex_enter(&ptgt->m_tgt_intr_mutex); mptsas_set_throttle(mpt, ptgt, HOLD_THROTTLE); + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); @@ -9560,6 +10238,10 @@ mpt->m_quiesce_timeid = timeout(mptsas_ncmds_checkdrain, mpt, (MPTSAS_QUIESCE_TIMEOUT * drv_usectohz(1000000))); + } else { + mutex_exit(&mpt->m_intr_mutex); + /* Command queue has been drained */ + cv_signal(&mpt->m_cv); } } mutex_exit(&mpt->m_mutex); @@ -9723,6 +10405,7 @@ (void) ddi_dma_sync(dma_hdl, 0, 0, DDI_DMA_SYNC_FORDEV); request_desc_low = (cmd->cmd_slot << 16) + desc_type; cmd->cmd_rfm = NULL; + mpt->m_active->m_slot[cmd->cmd_slot] = cmd; MPTSAS_START_CMD(mpt, request_desc_low, request_desc_high); if ((mptsas_check_dma_handle(dma_hdl) != DDI_SUCCESS) || (mptsas_check_acc_handle(acc_hdl) != DDI_SUCCESS)) { @@ -10153,6 +10836,7 @@ request_desc_low = (cmd->cmd_slot << 16) + MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cmd->cmd_rfm = NULL; + mpt->m_active->m_slot[cmd->cmd_slot] = cmd; MPTSAS_START_CMD(mpt, request_desc_low, 0); if ((mptsas_check_dma_handle(mpt->m_dma_req_frame_hdl) != DDI_SUCCESS) || @@ -11535,6 +12219,8 @@ } out: + if 
(mpt->m_options & MPTSAS_OPT_PM) + (void) pm_idle_component(mpt->m_dip, 0); return (status); } @@ -11561,7 +12247,9 @@ ptgt = (mptsas_target_t *)mptsas_hash_traverse(&mpt->m_active->m_tgttbl, MPTSAS_HASH_FIRST); while (ptgt != NULL) { + mutex_enter(&ptgt->m_tgt_intr_mutex); mptsas_set_throttle(mpt, ptgt, HOLD_THROTTLE); + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); @@ -11573,8 +12261,7 @@ MPTSAS_DISABLE_INTR(mpt); /* - * Abort all commands: outstanding commands, commands in waitq and - * tx_waitq. + * Abort all commands: outstanding commands, commands in waitq */ mptsas_flush_hba(mpt); @@ -11603,7 +12290,9 @@ ptgt = (mptsas_target_t *)mptsas_hash_traverse(&mpt->m_active->m_tgttbl, MPTSAS_HASH_FIRST); while (ptgt != NULL) { + mutex_enter(&ptgt->m_tgt_intr_mutex); mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE); + mutex_exit(&ptgt->m_tgt_intr_mutex); ptgt = (mptsas_target_t *)mptsas_hash_traverse( &mpt->m_active->m_tgttbl, MPTSAS_HASH_NEXT); @@ -11913,7 +12602,9 @@ pmc[0] = pmc_name; if (ddi_prop_update_string_array(DDI_DEV_T_NONE, mpt->m_dip, "pm-components", pmc, 3) != DDI_PROP_SUCCESS) { + mutex_enter(&mpt->m_intr_mutex); mpt->m_options &= ~MPTSAS_OPT_PM; + mutex_exit(&mpt->m_intr_mutex); mptsas_log(mpt, CE_WARN, "mptsas%d: pm-component property creation failed.", mpt->m_instance); @@ -11936,7 +12627,9 @@ mptsas_log(mpt, CE_WARN, "pm_power_has_changed failed"); return (DDI_FAILURE); } + mutex_enter(&mpt->m_intr_mutex); mpt->m_power_level = PM_LEVEL_D0; + mutex_exit(&mpt->m_intr_mutex); /* * Set pm idle delay. 
*/ @@ -12377,7 +13070,7 @@ phymask = mptsas_physport_to_phymask(mpt, physport); *pptgt = mptsas_tgt_alloc(&slots->m_tgttbl, *dev_handle, sas_wwn, - dev_info, phymask, phynum); + dev_info, phymask, phynum, mpt); if (*pptgt == NULL) { mptsas_log(mpt, CE_WARN, "Failed to allocated target" "structure!"); @@ -15082,7 +15775,7 @@ mptsas_target_t * mptsas_tgt_alloc(mptsas_hash_table_t *hashtab, uint16_t devhdl, uint64_t wwid, - uint32_t devinfo, mptsas_phymask_t phymask, uint8_t phynum) + uint32_t devinfo, mptsas_phymask_t phymask, uint8_t phynum, mptsas_t *mpt) { mptsas_target_t *tmp_tgt = NULL; @@ -15108,6 +15801,8 @@ tmp_tgt->m_qfull_retry_interval = drv_usectohz(QFULL_RETRY_INTERVAL * 1000); tmp_tgt->m_t_throttle = MAX_THROTTLE; + mutex_init(&tmp_tgt->m_tgt_intr_mutex, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(mpt->m_intr_pri)); mptsas_hash_add(hashtab, tmp_tgt); @@ -15123,6 +15818,7 @@ if (tmp_tgt == NULL) { cmn_err(CE_WARN, "Tgt not found, nothing to free"); } else { + mutex_destroy(&tmp_tgt->m_tgt_intr_mutex); kmem_free(tmp_tgt, sizeof (struct mptsas_target)); } } @@ -15464,3 +16160,25 @@ ddi_dma_free_handle(dma_hdp); dma_hdp = NULL; } + +static int +mptsas_outstanding_cmds_n(mptsas_t *mpt) +{ + int n = 0, i; + for (i = 0; i < mpt->m_slot_freeq_pair_n; i++) { + mutex_enter(&mpt->m_slot_freeq_pairp[i]. + m_slot_allocq.s.m_fq_mutex); + mutex_enter(&mpt->m_slot_freeq_pairp[i]. + m_slot_releq.s.m_fq_mutex); + n += (mpt->m_slot_freeq_pairp[i].m_slot_allocq.s.m_fq_n_init - + mpt->m_slot_freeq_pairp[i].m_slot_allocq.s.m_fq_n - + mpt->m_slot_freeq_pairp[i].m_slot_releq.s.m_fq_n); + mutex_exit(&mpt->m_slot_freeq_pairp[i]. + m_slot_releq.s.m_fq_mutex); + mutex_exit(&mpt->m_slot_freeq_pairp[i]. + m_slot_allocq.s.m_fq_mutex); + } + if (mpt->m_max_requests - 2 < n) + panic("mptsas: free slot allocq and releq crazy"); + return (n); +}
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_impl.c Tue Jul 13 10:36:11 2010 +0800 +++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_impl.c Tue Jul 13 11:05:20 2010 +0800 @@ -281,6 +281,7 @@ request_desc_low = (cmd->cmd_slot << 16) + MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cmd->cmd_rfm = NULL; + mpt->m_active->m_slot[cmd->cmd_slot] = cmd; MPTSAS_START_CMD(mpt, request_desc_low, 0); if ((mptsas_check_dma_handle(mpt->m_dma_req_frame_hdl) != DDI_SUCCESS) || @@ -1314,6 +1315,7 @@ request_desc_low = (cmd->cmd_slot << 16) + MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cmd->cmd_rfm = NULL; + mpt->m_active->m_slot[cmd->cmd_slot] = cmd; MPTSAS_START_CMD(mpt, request_desc_low, 0); rvalue = 0;
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c Tue Jul 13 10:36:11 2010 +0800 +++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c Tue Jul 13 11:05:20 2010 +0800 @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -92,7 +91,7 @@ extern int mptsas_check_dma_handle(ddi_dma_handle_t handle); extern int mptsas_check_acc_handle(ddi_acc_handle_t handle); extern mptsas_target_t *mptsas_tgt_alloc(mptsas_hash_table_t *, uint16_t, - uint64_t, uint32_t, mptsas_phymask_t, uint8_t); + uint64_t, uint32_t, mptsas_phymask_t, uint8_t, mptsas_t *); static int mptsas_raidconf_page_0_cb(mptsas_t *mpt, caddr_t page_memp, @@ -217,7 +216,7 @@ * RAID uses phymask of 0. */ ptgt = mptsas_tgt_alloc(&slots->m_tgttbl, - voldevhandle, raidwwn, 0, 0, 0); + voldevhandle, raidwwn, 0, 0, 0, mpt); raidconfig->m_raidvol[vol].m_raidtgt = ptgt;
--- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h Tue Jul 13 10:36:11 2010 +0800 +++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h Tue Jul 13 11:05:20 2010 +0800 @@ -208,6 +208,14 @@ uint16_t m_slot_num; uint32_t m_tgt_unconfigured; + /* + * For the common case, the elements in this structure are + * protected by the per hba instance mutex. In order to make + * the key code path in ISR lockless, a separate mutex is + * introdeced to protect those shown in ISR. + */ + kmutex_t m_tgt_intr_mutex; + } mptsas_target_t; typedef struct mptsas_smp { @@ -592,6 +600,32 @@ mpt->m_max_requests)) #define MPTSAS_TM_SLOT(mpt) (mpt->m_max_requests - 1) +typedef struct mptsas_slot_free_e { + processorid_t cpuid; + int slot; + list_node_t node; +} mptsas_slot_free_e_t; + +/* + * each of the allocq and releaseq in all CPU groups resides in separate + * cacheline(64 bytes). Multiple mutex in the same cacheline is not good + * for performance. + */ +typedef union mptsas_slot_freeq { + struct { + kmutex_t m_fq_mutex; + list_t m_fq_list; + int m_fq_n; + int m_fq_n_init; + } s; + char pad[64]; +} mptsas_slot_freeq_t; + +typedef struct mptsas_slot_freeq_pair { + mptsas_slot_freeq_t m_slot_allocq; + mptsas_slot_freeq_t m_slot_releq; +} mptsas_slot_freeq_pair_t; + /* * Macro for phy_flags */ @@ -657,9 +691,7 @@ scsi_hba_tran_t *m_tran; smp_hba_tran_t *m_smptran; kmutex_t m_mutex; - kmutex_t m_passthru_mutex; kcondvar_t m_cv; - kcondvar_t m_passthru_cv; kcondvar_t m_fw_cv; kcondvar_t m_config_cv; kcondvar_t m_fw_diag_cv; @@ -675,14 +707,11 @@ mptsas_cmd_t *m_waitq; /* cmd queue for active request */ mptsas_cmd_t **m_waitqtail; /* wait queue tail ptr */ - kmutex_t m_tx_waitq_mutex; - mptsas_cmd_t *m_tx_waitq; /* TX cmd queue for active request */ - mptsas_cmd_t **m_tx_waitqtail; /* tx_wait queue tail ptr */ - int m_tx_draining; /* TX queue draining flag */ - mptsas_cmd_t *m_doneq; /* queue of completed commands */ mptsas_cmd_t **m_donetail; /* queue tail ptr */ + 
kmutex_t m_passthru_mutex; + kcondvar_t m_passthru_cv; /* * variables for helper threads (fan-out interrupts) */ @@ -721,6 +750,13 @@ ddi_acc_handle_t m_acc_post_queue_hdl; /* + * Try the best to make the key code path in the ISR lockless. + * so avoid to use the per instance mutex m_mutex in the ISR. Introduce + * a separate mutex to protect the elements shown in ISR. + */ + kmutex_t m_intr_mutex; + + /* * list of reset notification requests */ struct scsi_reset_notify_entry *m_reset_notify_listf; @@ -812,8 +848,11 @@ * MPI handshake protocol. only one handshake cmd can run at a time. */ ddi_dma_handle_t m_hshk_dma_hdl; + ddi_acc_handle_t m_hshk_acc_hdl; + caddr_t m_hshk_memp; + size_t m_hshk_dma_size; /* Firmware version on the card at boot time */ @@ -884,6 +923,15 @@ uint8_t m_ir_capable; /* + * release and alloc queue for slot + */ + int m_slot_freeq_pair_n; + mptsas_slot_freeq_pair_t *m_slot_freeq_pairp; + mptsas_slot_free_e_t *m_slot_free_ae; +#define MPI_ADDRESS_COALSCE_MAX 128 + pMpi2ReplyDescriptorsUnion_t m_reply; + + /* * Is HBA processing a diag reset? */ uint8_t m_in_reset;