changeset 10852:51a4dcf550d9

6858031 ibd: one receive memory region is required to fix performance and scaling problems
6884097 ibt_map_mem_iov() fails (hermon reverses SLEEP and NOSLEEP)
6886372 hermon should support 4K IB MTU
6894485 ibd is not lint clean
author Bill Taylor <Bill.Taylor@Sun.COM>
date Fri, 23 Oct 2009 15:06:39 -0700
parents a45d13319506
children 05b7c23148c1
files usr/src/uts/common/io/ib/adapters/hermon/hermon.c usr/src/uts/common/io/ib/adapters/hermon/hermon_ci.c usr/src/uts/common/io/ib/clients/ibd/ibd.c usr/src/uts/common/sys/ib/adapters/hermon/hermon_hw.h usr/src/uts/common/sys/ib/clients/ibd/ibd.h
diffstat 5 files changed, 1043 insertions(+), 905 deletions(-)
--- a/usr/src/uts/common/io/ib/adapters/hermon/hermon.c	Fri Oct 23 15:00:42 2009 -0700
+++ b/usr/src/uts/common/io/ib/adapters/hermon/hermon.c	Fri Oct 23 15:06:39 2009 -0700
@@ -3374,6 +3374,10 @@
 			goto init_ports_fail;
 		}
 
+		/* Set mtu_cap to 4096 bytes */
+		initport->mmc = 1;	/* set the change bit */
+		initport->mtu_cap = 5;	/* for 4096 bytes */
+
 		/* Validate the max port width */
 		maxval  = state->hs_queryport.ib_port_wid;
 		val	= cfgprof->cp_max_port_width;
@@ -3388,6 +3392,10 @@
 			goto init_ports_fail;
 		}
 
+		/* Since we're doing mtu_cap, cut vl_cap down */
+		initport->mvc = 1;	/* set this change bit */
+		initport->vl_cap = 3;	/* 3 means vl0-vl3, 4 total */
+
 		/* Validate max GID table size */
 		maxval  = ((uint64_t)1 << state->hs_queryport.log_max_gid);
 		val	= ((uint64_t)1 << cfgprof->cp_log_max_gidtbl);
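
For reference, the mtu_cap value programmed above follows the InfiniBand PortInfo MTU encoding, in which each code from 1 to 5 doubles the payload size starting at 256 bytes. A minimal illustrative sketch of that mapping (the enum and helper below are for illustration only and are not part of the hermon source):

	/* IB MTU codes as written into initport->mtu_cap above */
	enum ib_mtu_code {
		IB_MTU_256  = 1,
		IB_MTU_512  = 2,
		IB_MTU_1024 = 3,
		IB_MTU_2048 = 4,
		IB_MTU_4096 = 5		/* the value this change programs */
	};

	/* Convert an IB MTU code to bytes: 256 << (code - 1) */
	static unsigned int
	ib_mtu_code_to_bytes(enum ib_mtu_code code)
	{
		return (256u << (code - 1));
	}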
--- a/usr/src/uts/common/io/ib/adapters/hermon/hermon_ci.c	Fri Oct 23 15:00:42 2009 -0700
+++ b/usr/src/uts/common/io/ib/adapters/hermon/hermon_ci.c	Fri Oct 23 15:06:39 2009 -0700
@@ -2399,7 +2399,7 @@
     ibt_all_wr_t *wr, ibc_mi_hdl_t *mi_hdl_p)
 {
 	int			status;
-	int			i, nds, max_nds;
+	int			i, j, nds, max_nds;
 	uint_t			len;
 	ibt_status_t		ibt_status;
 	ddi_dma_handle_t	dmahdl;
@@ -2431,7 +2431,7 @@
 		max_nds -= (iov_attr->iov_lso_hdr_sz + sizeof (uint32_t) +
 		    0xf) >> 4;	/* 0xf is for rounding up to a multiple of 16 */
 	rsvd_lkey = state->hs_devlim.rsv_lkey;
-	if (iov_attr->iov_flags & IBT_IOV_NOSLEEP) {
+	if ((iov_attr->iov_flags & IBT_IOV_NOSLEEP) == 0) {
 		kmflag = KM_SLEEP;
 		callback = DDI_DMA_SLEEP;
 	} else {
@@ -2490,11 +2490,19 @@
 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgl))
 
 	len = iov_attr->iov_list_len;
+	for (i = 0, j = 0; j < len; j++) {
+		if (iov_attr->iov[j].iov_len == 0)
+			continue;
+		i++;
+	}
 	mi_hdl = kmem_alloc(sizeof (*mi_hdl) +
-	    (len - 1) * sizeof (ddi_dma_handle_t), kmflag);
+	    (i - 1) * sizeof (ddi_dma_handle_t), kmflag);
 	if (mi_hdl == NULL)
 		return (IBT_INSUFF_RESOURCE);
-	for (i = 0; i < len; i++) {
+	mi_hdl->imh_len = i;
+	for (i = 0, j = 0; j < len; j++) {
+		if (iov_attr->iov[j].iov_len == 0)
+			continue;
 		status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
 		    callback, NULL, &dmahdl);
 		if (status != DDI_SUCCESS) {
@@ -2502,7 +2510,7 @@
 			goto fail2;
 		}
 		status = ddi_dma_addr_bind_handle(dmahdl, iov_attr->iov_as,
-		    iov_attr->iov[i].iov_addr, iov_attr->iov[i].iov_len,
+		    iov_attr->iov[j].iov_addr, iov_attr->iov[j].iov_len,
 		    DDI_DMA_RDWR | DDI_DMA_CONSISTENT, callback, NULL,
 		    &dmacookie, &cookie_cnt);
 		if (status != DDI_DMA_MAPPED) {
@@ -2522,13 +2530,13 @@
 				ddi_dma_nextcookie(dmahdl, &dmacookie);
 		}
 		mi_hdl->imh_dmahandle[i] = dmahdl;
+		i++;
 	}
 
 	if (iov_attr->iov_flags & IBT_IOV_RECV)
 		wr->recv.wr_nds = nds;
 	else
 		wr->send.wr_nds = nds;
-	mi_hdl->imh_len = len;
 	*mi_hdl_p = mi_hdl;
 	return (IBT_SUCCESS);
 
--- a/usr/src/uts/common/io/ib/clients/ibd/ibd.c	Fri Oct 23 15:00:42 2009 -0700
+++ b/usr/src/uts/common/io/ib/clients/ibd/ibd.c	Fri Oct 23 15:06:39 2009 -0700
@@ -64,7 +64,7 @@
 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
 
 /*
- * Per-interface tunables
+ * Per-interface tunables (for developers)
  *
  * ibd_tx_copy_thresh
  *     This sets the threshold at which ibd will attempt to do a bcopy of the
@@ -102,17 +102,6 @@
  * ibd_hash_size
  *     Hash table size for the active AH list
  *
- * ibd_separate_cqs
- * ibd_txcomp_poll
- *     These boolean variables (1 or 0) may be used to tune the behavior of
- *     ibd in managing the send and receive completion queues and in deciding
- *     whether or not transmit completions should be polled or interrupt
- *     driven (when the completion queues are separate). If both the completion
- *     queues are interrupt driven, it may not be possible for the handlers to
- *     be invoked concurrently, depending on how the interrupts are tied on
- *     the PCI intr line.  Note that some combination of these two parameters
- *     may not be meaningful (and therefore not allowed).
- *
  * ibd_tx_softintr
  * ibd_rx_softintr
  *     The softintr mechanism allows ibd to avoid event queue overflows if
@@ -130,8 +119,6 @@
 uint_t ibd_num_lso_bufs = 0x400;
 uint_t ibd_num_ah = 64;
 uint_t ibd_hash_size = 32;
-uint_t ibd_separate_cqs = 1;
-uint_t ibd_txcomp_poll = 0;
 uint_t ibd_rx_softintr = 1;
 uint_t ibd_tx_softintr = 1;
 uint_t ibd_create_broadcast_group = 1;
@@ -151,16 +138,16 @@
 #endif
 
 /*
- * Receive CQ moderation parameters: NOT tunables
+ * Receive CQ moderation parameters: tunable (for developers)
  */
-static uint_t ibd_rxcomp_count = 4;
-static uint_t ibd_rxcomp_usec = 10;
+uint_t ibd_rxcomp_count = 4;
+uint_t ibd_rxcomp_usec = 10;
 
 /*
- * Send CQ moderation parameters: NOT tunables
+ * Send CQ moderation parameters: tunable (for developers)
  */
-#define	IBD_TXCOMP_COUNT		10
-#define	IBD_TXCOMP_USEC			300
+uint_t ibd_txcomp_count = 16;
+uint_t ibd_txcomp_usec = 300;
 
 /*
  * Thresholds
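
Since these CQ moderation parameters are now plain global uint_t variables rather than compile-time constants, a developer could also override them at boot through the usual /etc/system mechanism without recompiling (the values below are purely illustrative, not recommendations or documented tunables):

	set ibd:ibd_rxcomp_count = 8
	set ibd:ibd_rxcomp_usec = 20
	set ibd:ibd_txcomp_count = 16
	set ibd:ibd_txcomp_usec = 300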
@@ -176,13 +163,23 @@
 #define	IBD_TX_POLL_THRESH		80
 
 /*
- * When doing multiple-send-wr or multiple-recv-wr posts, this value
- * determines how many to do at a time (in a single ibt_post_send/recv).
+ * When doing multiple-send-wr, this value determines how many to do at
+ * a time (in a single ibt_post_send).
  */
-#define	IBD_MAX_POST_MULTIPLE		4
+#define	IBD_MAX_TX_POST_MULTIPLE	4
+
+/* Post IBD_RX_POST_CNT receive work requests at a time. */
+#define	IBD_RX_POST_CNT			16
+
+/* Hash into (1 << IBD_LOG_RX_POST) rx post queues */
+#define	IBD_LOG_RX_POST			3
+
+/* Minimum number of receive work requests the driver needs to always have */
+#define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
 
 /*
- * Maximum length for returning chained mps back to crossbow
+ * Maximum length for returning chained mps back to crossbow.
+ * Also used as the maximum number of rx wc's polled at a time.
  */
 #define	IBD_MAX_RX_MP_LEN		16
 
@@ -196,10 +193,8 @@
 /*
  * Completion queue polling control
  */
-#define	IBD_RX_CQ_POLLING		0x1
-#define	IBD_TX_CQ_POLLING		0x2
-#define	IBD_REDO_RX_CQ_POLLING		0x4
-#define	IBD_REDO_TX_CQ_POLLING		0x8
+#define	IBD_CQ_POLLING			0x1
+#define	IBD_REDO_CQ_POLLING		0x2
 
 /*
  * Flag bits for resources to reap
@@ -337,6 +332,7 @@
 static void ibd_fini_txlist(ibd_state_t *);
 static void ibd_fini_rxlist(ibd_state_t *);
 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
+static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
 static void ibd_acache_fini(ibd_state_t *);
 #ifdef IBD_LOGGING
 static void ibd_log_fini(void);
@@ -345,23 +341,21 @@
 /*
  * Allocation/acquire/map routines
  */
-static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t);
-static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
 static int ibd_alloc_tx_copybufs(ibd_state_t *);
+static int ibd_alloc_rx_copybufs(ibd_state_t *);
 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
-static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **);
+static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
     uint32_t *);
 
 /*
  * Free/release/unmap routines
  */
-static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
-static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *);
 static void ibd_free_tx_copybufs(ibd_state_t *);
+static void ibd_free_rx_copybufs(ibd_state_t *);
 static void ibd_free_tx_lsobufs(ibd_state_t *);
-static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *);
+static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
@@ -369,12 +363,14 @@
 /*
  * Handlers/callback routines
  */
-static uint_t ibd_intr(char *);
-static uint_t ibd_tx_recycle(char *);
+static uint_t ibd_intr(caddr_t);
+static uint_t ibd_tx_recycle(caddr_t);
 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
-static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
-static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t);
+static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
+static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
+static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
+static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
 static void ibd_freemsg_cb(char *);
 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
     ibt_async_event_t *);
@@ -386,9 +382,8 @@
  */
 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
-static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t);
-static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
-static void ibd_flush_rx(ibd_state_t *, mblk_t *);
+static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
+static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
 
 /*
  * Threads
@@ -428,6 +423,7 @@
 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
+static void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
 
@@ -451,7 +447,7 @@
  */
 static int ibd_sched_poll(ibd_state_t *, int, int);
 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
-static int ibd_resume_transmission(ibd_state_t *);
+static void ibd_resume_transmission(ibd_state_t *);
 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
 static void *list_get_head(list_t *);
@@ -542,7 +538,7 @@
 }
 #define	DPRINT		debug_print
 #else
-#define	DPRINT
+#define	DPRINT		0 &&
 #endif
 
 /*
@@ -584,13 +580,14 @@
 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
     ibd_state_t::id_lso))
 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
+_NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
 
 /*
- * id_cq_poll_lock
+ * id_scq_poll_lock
  */
-_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock,
-    ibd_state_t::id_cq_poll_busy))
+_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
+    ibd_state_t::id_scq_poll_busy))
 
 /*
  * id_txpost_lock
@@ -599,18 +596,6 @@
     ibd_state_t::id_tx_head))
 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
     ibd_state_t::id_tx_busy))
-_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
-    ibd_state_t::id_tx_tailp))
-
-/*
- * id_rxpost_lock
- */
-_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
-    ibd_state_t::id_rx_head))
-_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
-    ibd_state_t::id_rx_busy))
-_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
-    ibd_state_t::id_rx_tailp))
 
 /*
  * id_acache_req_lock
@@ -619,6 +604,8 @@
     ibd_state_t::id_acache_req_cv))
 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 
     ibd_state_t::id_req_list))
+_NOTE(SCHEME_PROTECTS_DATA("atomic",
+    ibd_acache_s::ac_ref))
 
 /*
  * id_ac_mutex
@@ -640,6 +627,8 @@
     ibd_state_t::id_ah_op))
 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
     ibd_state_t::id_ah_error))
+_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
+    ibd_state_t::id_ac_hot_ace))
 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
 
 /*
@@ -680,26 +669,21 @@
 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
     ibd_state_t::id_link_speed))
+_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
 
 /*
  * id_tx_list.dl_mutex
  */
-_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 
+_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
     ibd_state_t::id_tx_list.dl_head))
-_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 
-    ibd_state_t::id_tx_list.dl_tail))
-_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
+_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
     ibd_state_t::id_tx_list.dl_pending_sends))
-_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
+_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
     ibd_state_t::id_tx_list.dl_cnt))
 
 /*
  * id_rx_list.dl_mutex
  */
-_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 
-    ibd_state_t::id_rx_list.dl_head))
-_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 
-    ibd_state_t::id_rx_list.dl_tail))
 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
     ibd_state_t::id_rx_list.dl_bufs_outstanding))
 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
@@ -743,24 +727,39 @@
     mac_capab_lso_s
     msgb::b_next
     msgb::b_rptr
-    msgb::b_wptr))
+    msgb::b_wptr
+    ibd_state_s::id_bgroup_created
+    ibd_state_s::id_mac_state
+    ibd_state_s::id_mtu
+    ibd_state_s::id_num_rwqe
+    ibd_state_s::id_num_swqe
+    ibd_state_s::id_qpnum
+    ibd_state_s::id_rcq_hdl
+    ibd_state_s::id_rx_buf_sz
+    ibd_state_s::id_rx_bufs
+    ibd_state_s::id_rx_mr_hdl
+    ibd_state_s::id_rx_wqes
+    ibd_state_s::id_rxwcs
+    ibd_state_s::id_rxwcs_size
+    ibd_state_s::id_rx_nqueues
+    ibd_state_s::id_rx_queues
+    ibd_state_s::id_scope
+    ibd_state_s::id_scq_hdl
+    ibd_state_s::id_tx_buf_sz
+    ibd_state_s::id_tx_bufs
+    ibd_state_s::id_tx_mr_hdl
+    ibd_state_s::id_tx_rel_list.dl_cnt
+    ibd_state_s::id_tx_wqes
+    ibd_state_s::id_txwcs
+    ibd_state_s::id_txwcs_size))
 
 int
 _init()
 {
 	int status;
 
-	/*
-	 * Sanity check some parameter settings. Tx completion polling
-	 * only makes sense with separate CQs for Tx and Rx.
-	 */
-	if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) {
-		cmn_err(CE_NOTE, "!ibd: %s",
-		    "Setting ibd_txcomp_poll = 0 for combined CQ");
-		ibd_txcomp_poll = 0;
-	}
-
-	status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0);
+	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
+	    PAGESIZE), 0);
 	if (status != 0) {
 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
 		return (status);
@@ -957,9 +956,12 @@
 	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
 	ASSERT(_ret_ == 0);					\
+	state->id_ac_hot_ace = ce;				\
 }
 #define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
 	list_remove(&state->id_ah_active, ce);			\
+	if (state->id_ac_hot_ace == ce)				\
+		state->id_ac_hot_ace = NULL;			\
 	(void) mod_hash_remove(state->id_ah_active_hash,	\
 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
 }
@@ -982,7 +984,7 @@
  * of membership must be present before initiating the transmit.
  * This list is also emptied during driver detach, since sendonly
  * membership acquired during transmit is dropped at detach time
- * alongwith ipv4 broadcast full membership. Insert/deletes to
+ * along with ipv4 broadcast full membership. Insert/deletes to
  * this list are done only by the async thread, but it is also
  * searched in program context (see multicast disable case), thus
  * the id_mc_mutex protects the list. The driver detach path also
@@ -1094,7 +1096,7 @@
  * trap delivery. Querying the SA to establish presence/absence of the
  * mcg is also racy at best. Thus, the driver just prints a warning
  * message when it can not rejoin after receiving a create trap, although
- * this might be (on rare occassions) a mis-warning if the create trap is
+ * this might be (on rare occasions) a mis-warning if the create trap is
  * received after the mcg was deleted.
  */
 
@@ -1353,6 +1355,7 @@
 
 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
+	mutex_enter(&state->id_ac_mutex);
 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
 	    offsetof(ibd_ace_t, ac_list));
 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
@@ -1366,12 +1369,14 @@
 	    offsetof(ibd_mce_t, mc_list));
 	list_create(&state->id_req_list, sizeof (ibd_req_t),
 	    offsetof(ibd_req_t, rq_list));
+	state->id_ac_hot_ace = NULL;
 
 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
 	    IBD_NUM_AH, KM_SLEEP);
 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
+			mutex_exit(&state->id_ac_mutex);
 			ibd_acache_fini(state);
 			return (DDI_FAILURE);
 		} else {
@@ -1380,6 +1385,7 @@
 			IBD_ACACHE_INSERT_FREE(state, ce);
 		}
 	}
+	mutex_exit(&state->id_ac_mutex);
 	return (DDI_SUCCESS);
 }
 
@@ -1463,7 +1469,14 @@
 
 	mutex_enter(&state->id_ac_mutex);
 
-	if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) {
+	if (((ptr = state->id_ac_hot_ace) != NULL) &&
+	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
+		INC_REF(ptr, numwqe);
+		mutex_exit(&state->id_ac_mutex);
+		return (ptr);
+	}
+	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
+		state->id_ac_hot_ace = ptr;
 		mutex_exit(&state->id_ac_mutex);
 		return (ptr);
 	}
@@ -1869,7 +1882,7 @@
 	 * this on a link down, since we will be unable to do SA operations,
 	 * defaulting to the lowest speed. Also notice that we update our
 	 * notion of speed before calling mac_link_update(), which will do
-	 * neccesary higher level notifications for speed changes.
+	 * necessary higher level notifications for speed changes.
 	 */
 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
@@ -2074,6 +2087,7 @@
 		 * index is the same as before; finally check to see if the
 		 * pkey has been relocated to a different index in the table.
 		 */
+		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
 		if (bcmp(port_infop->p_sgid_tbl,
 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
 
@@ -2098,7 +2112,7 @@
 			 * marked both the start and stop 'in-progress' flags,
 			 * so it is ok to go ahead and do this restart.
 			 */
-			ibd_undo_start(state, LINK_STATE_DOWN);
+			(void) ibd_undo_start(state, LINK_STATE_DOWN);
 			if ((ret = ibd_start(state)) != 0) {
 				DPRINT(10, "ibd_restart: cannot restart, "
 				    "ret=%d", ret);
@@ -2108,6 +2122,7 @@
 		} else {
 			new_link_state = LINK_STATE_DOWN;
 		}
+		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
 	}
 
 update_link_state:
@@ -2284,6 +2299,8 @@
 	ibt_hca_attr_t hca_attrs;
 	ibt_status_t ibt_status;
 
+	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
+
 	/*
 	 * Query the HCA and fetch its attributes
 	 */
@@ -2344,6 +2361,14 @@
 	}
 
 	/*
+	 * Translating the virtual address regions into physical regions
+	 * for using the Reserved LKey feature results in a wr sgl that
+	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
+	 * we'll fix a high-water mark (65%) for when we should stop.
+	 */
+	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
+
+	/*
 	 * 5. Set number of recv and send wqes after checking hca maximum
 	 *    channel size
 	 */
@@ -2352,11 +2377,13 @@
 	} else {
 		state->id_num_rwqe = IBD_NUM_RWQE;
 	}
+	state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN;
 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
 	} else {
 		state->id_num_swqe = IBD_NUM_SWQE;
 	}
+	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
 
 	return (DDI_SUCCESS);
 }
@@ -2563,6 +2590,7 @@
 
 attach_fail:
 	(void) ibd_unattach(state, dip);
+	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
 	return (DDI_FAILURE);
 }
 
@@ -2613,26 +2641,32 @@
 	state->id_trap_stop = B_TRUE;
 	state->id_trap_inprog = 0;
 
-	mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
 	state->id_dip = dip;
 
 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
 
+	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
+	mutex_enter(&state->id_tx_list.dl_mutex);
 	state->id_tx_list.dl_head = NULL;
-	state->id_tx_list.dl_tail = NULL;
 	state->id_tx_list.dl_pending_sends = B_FALSE;
 	state->id_tx_list.dl_cnt = 0;
-	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
+	mutex_exit(&state->id_tx_list.dl_mutex);
+	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
+	mutex_enter(&state->id_tx_rel_list.dl_mutex);
+	state->id_tx_rel_list.dl_head = NULL;
+	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
+	state->id_tx_rel_list.dl_cnt = 0;
+	mutex_exit(&state->id_tx_rel_list.dl_mutex);
 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
 	state->id_tx_busy = 0;
-
-	state->id_rx_list.dl_head = NULL;
-	state->id_rx_list.dl_tail = NULL;
+	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
+
 	state->id_rx_list.dl_bufs_outstanding = 0;
 	state->id_rx_list.dl_cnt = 0;
 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
-	mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL);
-
+	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
 	    0, NULL, NULL, NULL, NULL, NULL, 0);
@@ -2654,14 +2688,17 @@
 
 	kmem_cache_destroy(state->id_req_kmc);
 
-	mutex_destroy(&state->id_rxpost_lock);
 	mutex_destroy(&state->id_rx_list.dl_mutex);
+	mutex_destroy(&state->id_rx_free_list.dl_mutex);
 
 	mutex_destroy(&state->id_txpost_lock);
 	mutex_destroy(&state->id_tx_list.dl_mutex);
+	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
+	mutex_destroy(&state->id_lso_lock);
 
 	mutex_destroy(&state->id_sched_lock);
-	mutex_destroy(&state->id_cq_poll_lock);
+	mutex_destroy(&state->id_scq_poll_lock);
+	mutex_destroy(&state->id_rcq_poll_lock);
 
 	cv_destroy(&state->id_trap_cv);
 	mutex_destroy(&state->id_trap_lock);
@@ -2955,7 +2992,7 @@
 /*
  * This code handles delayed Tx completion cleanups for mcg's to which
  * disable_multicast has been issued, regular mcg related cleanups during
- * disable_multicast, disable_promiscous and mcg traps, as well as
+ * disable_multicast, disable_promiscuous and mcg traps, as well as
  * cleanups during driver detach time. Depending on the join state,
  * it deletes the mce from the appropriate list and issues the IBA
  * leave/detach; except in the disable_multicast case when the mce
@@ -3121,7 +3158,9 @@
 query_bcast_grp:
 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
 	mcg_attr.mc_pkey = state->id_pkey;
+	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
+	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
 
 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
 		state->id_scope = mcg_attr.mc_scope = scopes[i];
@@ -3129,11 +3168,13 @@
 		/*
 		 * Look for the IPoIB broadcast group.
 		 */
+		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
 		state->id_mgid.gid_prefix =
 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
 		    ((uint64_t)state->id_scope << 48) |
 		    ((uint32_t)(state->id_pkey << 16)));
 		mcg_attr.mc_mgid = state->id_mgid;
+		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
 			found = B_TRUE;
@@ -3165,11 +3206,13 @@
 			mcg_attr.mc_flow = 0;
 			mcg_attr.mc_sl = 0;
 			mcg_attr.mc_tclass = 0;
+			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
 			state->id_mgid.gid_prefix =
 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
 			    ((uint32_t)(state->id_pkey << 16)));
 			mcg_attr.mc_mgid = state->id_mgid;
+			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
 
 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
@@ -3228,6 +3271,9 @@
 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
 	    state->id_tx_buf_sz, KM_SLEEP);
 
+	state->id_tx_wqes = kmem_zalloc(state->id_num_swqe *
+	    sizeof (ibd_swqe_t), KM_SLEEP);
+
 	/*
 	 * Do one memory registration on the entire txbuf area
 	 */
@@ -3238,6 +3284,8 @@
 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
+		kmem_free(state->id_tx_wqes,
+		    state->id_num_swqe * sizeof (ibd_swqe_t));
 		kmem_free(state->id_tx_bufs,
 		    state->id_num_swqe * state->id_tx_buf_sz);
 		state->id_tx_bufs = NULL;
@@ -3283,6 +3331,8 @@
 		return (DDI_FAILURE);
 	}
 
+	mutex_enter(&state->id_lso_lock);
+
 	/*
 	 * Now allocate the buflist.  Note that the elements in the buflist and
 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
@@ -3319,6 +3369,7 @@
 	bktp->bkt_nfree = bktp->bkt_nelem;
 
 	state->id_lso = bktp;
+	mutex_exit(&state->id_lso_lock);
 
 	return (DDI_SUCCESS);
 }
@@ -3332,6 +3383,8 @@
 	ibd_swqe_t *swqe;
 	ibt_lkey_t lkey;
 	int i;
+	uint_t len;
+	uint8_t *bufaddr;
 
 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
 		return (DDI_FAILURE);
@@ -3345,27 +3398,35 @@
 	 * Allocate and setup the swqe list
 	 */
 	lkey = state->id_tx_mr_desc.md_lkey;
-	for (i = 0; i < state->id_num_swqe; i++) {
-		if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) {
-			DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed");
-			ibd_fini_txlist(state);
-			return (DDI_FAILURE);
-		}
+	bufaddr = state->id_tx_bufs;
+	len = state->id_tx_buf_sz;
+	swqe = state->id_tx_wqes;
+	mutex_enter(&state->id_tx_list.dl_mutex);
+	for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) {
+		swqe->swqe_type = IBD_WQE_SEND;
+		swqe->swqe_next = NULL;
+		swqe->swqe_im_mblk = NULL;
+
+		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
+		    bufaddr;
+		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
+		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
+
+		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
+		swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
+		swqe->w_swr.wr_trans = IBT_UD_SRV;
+
+		/* These are set in send */
+		swqe->w_swr.wr_nds = 0;
+		swqe->w_swr.wr_sgl = NULL;
+		swqe->w_swr.wr_opcode = IBT_WRC_SEND;
 
 		/* add to list */
 		state->id_tx_list.dl_cnt++;
-		if (state->id_tx_list.dl_head == NULL) {
-			swqe->swqe_prev = NULL;
-			swqe->swqe_next = NULL;
-			state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
-			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
-		} else {
-			swqe->swqe_prev = state->id_tx_list.dl_tail;
-			swqe->swqe_next = NULL;
-			state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
-			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
-		}
-	}
+		swqe->swqe_next = state->id_tx_list.dl_head;
+		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
+	}
+	mutex_exit(&state->id_tx_list.dl_mutex);
 
 	return (DDI_SUCCESS);
 }
@@ -3503,7 +3564,9 @@
 	/*
 	 * Free txbuf memory
 	 */
+	kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t));
 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
+	state->id_tx_wqes = NULL;
 	state->id_tx_bufs = NULL;
 }
 
@@ -3563,124 +3626,175 @@
 		state->id_tx_list.dl_head = node->swqe_next;
 		ASSERT(state->id_tx_list.dl_cnt > 0);
 		state->id_tx_list.dl_cnt--;
-		ibd_free_swqe(state, node);
-	}
+	}
+	ASSERT(state->id_tx_list.dl_cnt == 0);
 	mutex_exit(&state->id_tx_list.dl_mutex);
+	mutex_enter(&state->id_tx_rel_list.dl_mutex);
+	while (state->id_tx_rel_list.dl_head != NULL) {
+		node = WQE_TO_SWQE(state->id_tx_rel_list.dl_head);
+		state->id_tx_rel_list.dl_head = node->swqe_next;
+		ASSERT(state->id_tx_rel_list.dl_cnt > 0);
+		state->id_tx_rel_list.dl_cnt--;
+	}
+	ASSERT(state->id_tx_rel_list.dl_cnt == 0);
+	mutex_exit(&state->id_tx_rel_list.dl_mutex);
 
 	ibd_free_tx_lsobufs(state);
 	ibd_free_tx_copybufs(state);
 }
 
-/*
- * Allocate a single send wqe and register it so it is almost
- * ready to be posted to the hardware.
- */
-static int
-ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey)
+static void
+ibd_post_recv_task(ibd_rwqe_t *rwqe, ibd_rwqe_t *tail)
 {
-	ibd_swqe_t *swqe;
-
-	swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP);
-	*wqe = swqe;
-
-	swqe->swqe_type = IBD_WQE_SEND;
-	swqe->swqe_next = NULL;
-	swqe->swqe_prev = NULL;
-	swqe->swqe_im_mblk = NULL;
-
-	swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
-	    (state->id_tx_bufs + ndx * state->id_tx_buf_sz);
-	swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
-	swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
-
-	swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
-	swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
-	swqe->w_swr.wr_trans = IBT_UD_SRV;
-
-	/* These are set in send */
-	swqe->w_swr.wr_nds = 0;
-	swqe->w_swr.wr_sgl = NULL;
-	swqe->w_swr.wr_opcode = IBT_WRC_SEND;
-
-	return (DDI_SUCCESS);
+	uint_t		i;
+	uint_t		num_posted;
+	ibt_status_t	ibt_status;
+	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];
+	ibd_state_t	*state = rwqe->w_state;
+
+	mutex_enter(&state->id_rx_post_lock);
+	if (state->id_rx_post_busy) {
+		tail->rwqe_next = state->id_rx_post_head;
+		state->id_rx_post_head = RWQE_TO_WQE(rwqe);
+		mutex_exit(&state->id_rx_post_lock);
+		return;
+	}
+	state->id_rx_post_busy = 1;
+	mutex_exit(&state->id_rx_post_lock);
+
+loop:
+	/* Post IBD_RX_POST_CNT receive work requests from the list at rwqe. */
+	for (i = 0; i < IBD_RX_POST_CNT; i++) {
+		wrs[i] = rwqe->w_rwr;
+		rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
+	}
+
+	/*
+	 * If posting fails for some reason, we'll never receive
+	 * completion intimation, so we'll need to cleanup. But
+	 * we need to make sure we don't clean up nodes whose
+	 * wrs have been successfully posted. We assume that the
+	 * hca driver returns on the first failure to post and
+	 * therefore the first 'num_posted' entries don't need
+	 * cleanup here.
+	 */
+	atomic_add_32(&state->id_rx_list.dl_cnt, IBD_RX_POST_CNT);
+
+	num_posted = 0;
+	ibt_status = ibt_post_recv(state->id_chnl_hdl,
+	    wrs, IBD_RX_POST_CNT, &num_posted);
+	if (ibt_status != IBT_SUCCESS) {
+		ibd_print_warn(state, "ibd_post_recv: FATAL: "
+		    "posting multiple wrs failed: "
+		    "requested=%d, done=%d, ret=%d",
+		    IBD_RX_POST_CNT, num_posted, ibt_status);
+		atomic_add_32(&state->id_rx_list.dl_cnt,
+		    -(IBD_RX_POST_CNT - num_posted));
+		/* This cannot happen! */
+	}
+	if (rwqe != NULL)	/* more rwqes on our list? */
+		goto loop;
+
+	/* check if we have a new list */
+	mutex_enter(&state->id_rx_post_lock);
+	if ((rwqe = WQE_TO_RWQE(state->id_rx_post_head)) != NULL) {
+		state->id_rx_post_head = NULL;
+		mutex_exit(&state->id_rx_post_lock);
+		goto loop;
+	}
+	state->id_rx_post_busy = 0;
+	mutex_exit(&state->id_rx_post_lock);
 }
 
-/*
- * Free an allocated send wqe.
- */
-/*ARGSUSED*/
-static void
-ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
-{
-	kmem_free(swqe, sizeof (ibd_swqe_t));
-}
+/* macro explained below */
+#define	RX_QUEUE_HASH(rwqe) \
+	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
 
 /*
- * Post a rwqe to the hardware and add it to the Rx list. The
- * "recycle" parameter indicates whether an old rwqe is being
- * recycled, or this is a new one.
+ * Add a rwqe to one of the Rx lists.  If the list is large enough
+ * (exactly IBD_RX_POST_CNT), post the list to the hardware.
+ *
+ * Note: one of 2^N lists is chosen via a hash.  This is done
+ * because a single list suffers lock contention.  If the first list is busy
+ * (mutex_tryenter fails), use a second list (just call mutex_enter).
+ *
+ * The number 8 in RX_QUEUE_HASH is a random choice that provides
+ * even distribution of mapping rwqes to the 2^N queues.
  */
-static int
-ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
+static void
+ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
 {
-	ibt_status_t ibt_status;
-
-	if (recycle == B_FALSE) {
-		mutex_enter(&state->id_rx_list.dl_mutex);
-		if (state->id_rx_list.dl_head == NULL) {
-			rwqe->rwqe_prev = NULL;
-			rwqe->rwqe_next = NULL;
-			state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe);
-			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
-		} else {
-			rwqe->rwqe_prev = state->id_rx_list.dl_tail;
-			rwqe->rwqe_next = NULL;
-			state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe);
-			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
-		}
-		mutex_exit(&state->id_rx_list.dl_mutex);
-	}
-
-	mutex_enter(&state->id_rxpost_lock);
-	if (state->id_rx_busy) {
-		rwqe->w_post_link = NULL;
-		if (state->id_rx_head)
-			*(state->id_rx_tailp) = (ibd_wqe_t *)rwqe;
-		else
-			state->id_rx_head = rwqe;
-		state->id_rx_tailp = &(rwqe->w_post_link);
+	ibd_rx_queue_t	*rxp;
+	ibd_rwqe_t	*tail;
+
+	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
+
+	if (!mutex_tryenter(&rxp->rx_post_lock)) {
+		/* Failed.  Try a different queue ("ptr + 16" ensures that). */
+		rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
+		mutex_enter(&rxp->rx_post_lock);
+	}
+	rwqe->rwqe_next = rxp->rx_head;
+	if (rxp->rx_cnt == 0)
+		rxp->rx_tail = RWQE_TO_WQE(rwqe);
+	if (++rxp->rx_cnt == IBD_RX_POST_CNT) {
+		rxp->rx_head = NULL;
+		tail = WQE_TO_RWQE(rxp->rx_tail);
+		rxp->rx_cnt = 0;
 	} else {
-		state->id_rx_busy = 1;
-		do {
-			mutex_exit(&state->id_rxpost_lock);
-
-			/*
-			 * Here we should add dl_cnt before post recv, because
-			 * we would have to make sure dl_cnt is updated before
-			 * the corresponding ibd_process_rx() is called.
-			 */
-			atomic_add_32(&state->id_rx_list.dl_cnt, 1);
-
-			ibt_status = ibt_post_recv(state->id_chnl_hdl,
-			    &rwqe->w_rwr, 1, NULL);
-			if (ibt_status != IBT_SUCCESS) {
-				(void) atomic_add_32_nv(
-				    &state->id_rx_list.dl_cnt, -1);
-				ibd_print_warn(state, "ibd_post_recv: "
-				    "posting failed, ret=%d", ibt_status);
-				return (DDI_FAILURE);
-			}
-
-			mutex_enter(&state->id_rxpost_lock);
-			rwqe = state->id_rx_head;
-			if (rwqe) {
-				state->id_rx_head =
-				    (ibd_rwqe_t *)(rwqe->w_post_link);
-			}
-		} while (rwqe);
-		state->id_rx_busy = 0;
-	}
-	mutex_exit(&state->id_rxpost_lock);
+		rxp->rx_head = RWQE_TO_WQE(rwqe);
+		rwqe = NULL;
+	}
+	rxp->rx_stat++;
+	mutex_exit(&rxp->rx_post_lock);
+	if (rwqe) {
+		ibd_post_recv_task(rwqe, tail);
+	}
+}
+
+static int
+ibd_alloc_rx_copybufs(ibd_state_t *state)
+{
+	ibt_mr_attr_t mem_attr;
+	int i;
+
+	/*
+	 * Allocate one big chunk for all regular rx copy bufs
+	 */
+	state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
+
+	state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe *
+	    state->id_rx_buf_sz, KM_SLEEP);
+
+	state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe *
+	    sizeof (ibd_rwqe_t), KM_SLEEP);
+
+	state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
+	state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
+	    sizeof (ibd_rx_queue_t), KM_SLEEP);
+	for (i = 0; i < state->id_rx_nqueues; i++) {
+		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
+		mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
+	}
+
+	/*
+	 * Do one memory registration on the entire rxbuf area
+	 */
+	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
+	mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz;
+	mem_attr.mr_as = NULL;
+	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
+	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
+	    &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
+		kmem_free(state->id_rx_wqes,
+		    state->id_num_rwqe * sizeof (ibd_rwqe_t));
+		kmem_free(state->id_rx_bufs,
+		    state->id_num_rwqe * state->id_rx_buf_sz);
+		state->id_rx_bufs = NULL;
+		state->id_rx_wqes = NULL;
+		return (DDI_FAILURE);
+	}
 
 	return (DDI_SUCCESS);
 }
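
The RX_QUEUE_HASH scheme in the hunk above spreads rwqes over 1 << IBD_LOG_RX_POST (i.e. eight) rx post queues using address bits of the rwqe itself. A small stand-alone model of that idea (names hypothetical, not part of the driver):

	#include <stdint.h>

	#define	LOG_NQUEUES	3			/* mirrors IBD_LOG_RX_POST */
	#define	NQUEUES		(1 << LOG_NQUEUES)

	/*
	 * rwqes are carved out of one contiguous array, so shifting the
	 * address right by a small arbitrary amount (8 here, as in the
	 * driver) and masking spreads the rwqes evenly across the queues.
	 */
	static unsigned int
	rx_queue_hash(const void *rwqe)
	{
		return (((uintptr_t)rwqe >> 8) & (NQUEUES - 1));
	}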
@@ -3692,24 +3806,82 @@
 ibd_init_rxlist(ibd_state_t *state)
 {
 	ibd_rwqe_t *rwqe;
+	ibt_lkey_t lkey;
 	int i;
-
-	for (i = 0; i < state->id_num_rwqe; i++) {
-		if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) {
+	uint_t len;
+	uint8_t *bufaddr;
+
+	if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	/*
+	 * Allocate and setup the rwqe list
+	 */
+	lkey = state->id_rx_mr_desc.md_lkey;
+	rwqe = state->id_rx_wqes;
+	bufaddr = state->id_rx_bufs;
+	len = state->id_rx_buf_sz;
+	for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) {
+		rwqe->rwqe_type = IBD_WQE_RECV;
+		rwqe->w_state = state;
+		rwqe->w_freeing_wqe = B_FALSE;
+		rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
+		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
+
+		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
+
+		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
+		    &rwqe->w_freemsg_cb)) == NULL) {
+			DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
+			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
 			ibd_fini_rxlist(state);
 			return (DDI_FAILURE);
 		}
 
-		if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) {
-			ibd_free_rwqe(state, rwqe);
-			ibd_fini_rxlist(state);
-			return (DDI_FAILURE);
-		}
+		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
+		rwqe->rwqe_copybuf.ic_sgl.ds_va =
+		    (ib_vaddr_t)(uintptr_t)bufaddr;
+		rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
+		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
+		rwqe->w_rwr.wr_nds = 1;
+		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
+
+		ibd_post_recv(state, rwqe);
 	}
 
 	return (DDI_SUCCESS);
 }
 
+static void
+ibd_free_rx_copybufs(ibd_state_t *state)
+{
+	int i;
+
+	/*
+	 * Unregister rxbuf mr
+	 */
+	if (ibt_deregister_mr(state->id_hca_hdl,
+	    state->id_rx_mr_hdl) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
+	}
+	state->id_rx_mr_hdl = NULL;
+
+	/*
+	 * Free rxbuf memory
+	 */
+	for (i = 0; i < state->id_rx_nqueues; i++) {
+		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
+		mutex_destroy(&rxp->rx_post_lock);
+	}
+	kmem_free(state->id_rx_queues, state->id_rx_nqueues *
+	    sizeof (ibd_rx_queue_t));
+	kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t));
+	kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz);
+	state->id_rx_queues = NULL;
+	state->id_rx_wqes = NULL;
+	state->id_rx_bufs = NULL;
+}
+
 /*
  * Free the statically allocated Rx buffer list.
  *
@@ -3717,141 +3889,48 @@
 static void
 ibd_fini_rxlist(ibd_state_t *state)
 {
-	ibd_rwqe_t *node;
+	ibd_rwqe_t *rwqe;
+	int i;
 
 	mutex_enter(&state->id_rx_list.dl_mutex);
-	while (state->id_rx_list.dl_head != NULL) {
-		node = WQE_TO_RWQE(state->id_rx_list.dl_head);
-		state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
-		ASSERT(state->id_rx_list.dl_cnt > 0);
-		state->id_rx_list.dl_cnt--;
-
-		ibd_free_rwqe(state, node);
+	rwqe = state->id_rx_wqes;
+	for (i = 0; i < state->id_num_rwqe; i++, rwqe++) {
+		if (rwqe->rwqe_im_mblk != NULL) {
+			rwqe->w_freeing_wqe = B_TRUE;
+			freemsg(rwqe->rwqe_im_mblk);
+		}
 	}
 	mutex_exit(&state->id_rx_list.dl_mutex);
-}
-
-/*
- * Allocate a single recv wqe and register it so it is almost
- * ready to be posted to the hardware.
- */
-static int
-ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe)
-{
-	ibt_mr_attr_t mem_attr;
-	ibd_rwqe_t *rwqe;
-
-	if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) {
-		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
-		return (DDI_FAILURE);
-	}
-	*wqe = rwqe;
-	rwqe->rwqe_type = IBD_WQE_RECV;
-	rwqe->w_state = state;
-	rwqe->rwqe_next = NULL;
-	rwqe->rwqe_prev = NULL;
-	rwqe->w_freeing_wqe = B_FALSE;
-	rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
-	rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
-
-	rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu +
-	    IPOIB_GRH_SIZE, KM_NOSLEEP);
-	if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) {
-		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
-		kmem_free(rwqe, sizeof (ibd_rwqe_t));
-		return (DDI_FAILURE);
-	}
-
-	if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
-	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) ==
-	    NULL) {
-		DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()");
-		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
-		    state->id_mtu + IPOIB_GRH_SIZE);
-		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
-		kmem_free(rwqe, sizeof (ibd_rwqe_t));
-		return (DDI_FAILURE);
-	}
-
-	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
-	mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE;
-	mem_attr.mr_as = NULL;
-	mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
-	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
-	    &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) !=
-	    IBT_SUCCESS) {
-		DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()");
-		rwqe->w_freeing_wqe = B_TRUE;
-		freemsg(rwqe->rwqe_im_mblk);
-		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
-		    state->id_mtu + IPOIB_GRH_SIZE);
-		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
-		kmem_free(rwqe, sizeof (ibd_rwqe_t));
-		return (DDI_FAILURE);
-	}
-
-	rwqe->rwqe_copybuf.ic_sgl.ds_va =
-	    (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
-	rwqe->rwqe_copybuf.ic_sgl.ds_key =
-	    rwqe->rwqe_copybuf.ic_mr_desc.md_lkey;
-	rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE;
-	rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
-	rwqe->w_rwr.wr_nds = 1;
-	rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
-
-	return (DDI_SUCCESS);
+
+	ibd_free_rx_copybufs(state);
 }
 
 /*
  * Free an allocated recv wqe.
  */
+/* ARGSUSED */
 static void
 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
 {
-	if (ibt_deregister_mr(state->id_hca_hdl,
-	    rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
-		DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()");
-		return;
-	}
-
-	/*
-	 * Indicate to the callback function that this rwqe/mblk
-	 * should not be recycled. The freemsg() will invoke
-	 * ibd_freemsg_cb().
-	 */
-	if (rwqe->rwqe_im_mblk != NULL) {
-		rwqe->w_freeing_wqe = B_TRUE;
-		freemsg(rwqe->rwqe_im_mblk);
-	}
-	kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
-	    state->id_mtu + IPOIB_GRH_SIZE);
-	rwqe->rwqe_copybuf.ic_bufaddr = NULL;
-	kmem_free(rwqe, sizeof (ibd_rwqe_t));
+	/*
+	 * desballoc() failed (no memory).
+	 *
+	 * This rwqe is placed on a free list so that it
+	 * can be reinstated when memory is available.
+	 *
+	 * NOTE: no code currently exists to reinstate
+	 * these "lost" rwqes.
+	 */
+	mutex_enter(&state->id_rx_free_list.dl_mutex);
+	state->id_rx_free_list.dl_cnt++;
+	rwqe->rwqe_next = state->id_rx_free_list.dl_head;
+	state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
+	mutex_exit(&state->id_rx_free_list.dl_mutex);
 }
 
 /*
- * Delete the rwqe being freed from the rx list.
- */
-static void
-ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
-{
-	mutex_enter(&state->id_rx_list.dl_mutex);
-	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
-		state->id_rx_list.dl_head = rwqe->rwqe_next;
-	else
-		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
-	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
-		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
-	else
-		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
-	mutex_exit(&state->id_rx_list.dl_mutex);
-}
-
-/*
- * IBA Rx/Tx completion queue handler. Guaranteed to be single
- * threaded and nonreentrant for this CQ. When using combined CQ,
- * this handles Tx and Rx completions. With separate CQs, this handles
- * only Rx completions.
+ * IBA Rx completion queue handler. Guaranteed to be single
+ * threaded and nonreentrant for this CQ.
  */
 /* ARGSUSED */
 static void
@@ -3861,14 +3940,22 @@
 
 	atomic_add_64(&state->id_num_intrs, 1);
 
-	if (ibd_rx_softintr == 1)
-		ddi_trigger_softintr(state->id_rx);
-	else
-		(void) ibd_intr((char *)state);
+	if (ibd_rx_softintr == 1) {
+		mutex_enter(&state->id_rcq_poll_lock);
+		if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
+			state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
+			mutex_exit(&state->id_rcq_poll_lock);
+			return;
+		} else {
+			mutex_exit(&state->id_rcq_poll_lock);
+			ddi_trigger_softintr(state->id_rx);
+		}
+	} else
+		(void) ibd_intr((caddr_t)state);
 }
 
 /*
- * Separate CQ handler for Tx completions, when the Tx CQ is in
+ * CQ handler for Tx completions, when the Tx CQ is in
  * interrupt driven mode.
  */
 /* ARGSUSED */
@@ -3879,10 +3966,18 @@
 
 	atomic_add_64(&state->id_num_intrs, 1);
 
-	if (ibd_tx_softintr == 1)
-		ddi_trigger_softintr(state->id_tx);
-	else
-		(void) ibd_tx_recycle((char *)state);
+	if (ibd_tx_softintr == 1) {
+		mutex_enter(&state->id_scq_poll_lock);
+		if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
+			state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
+			mutex_exit(&state->id_scq_poll_lock);
+			return;
+		} else {
+			mutex_exit(&state->id_scq_poll_lock);
+			ddi_trigger_softintr(state->id_tx);
+		}
+	} else
+		(void) ibd_tx_recycle((caddr_t)state);
 }
 
 /*
@@ -3901,14 +3996,16 @@
 
 	/*
 	 * The trap handler will get invoked once for every event for
-	 * evert port. The input "gid" is the GID0 of the port the
+	 * every port. The input "gid" is the GID0 of the port the
 	 * trap came in on; we just need to act on traps that came
 	 * to our port, meaning the port on which the ipoib interface
 	 * resides. Since ipoib uses GID0 of the port, we just match
 	 * the gids to check whether we need to handle the trap.
 	 */
+	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
 		return;
+	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
 
 	DPRINT(10, "ibd_notices_handler : %d\n", code);
 
@@ -4101,7 +4198,9 @@
 	}
 
 	state->id_mtu = (128 << port_infop->p_mtu);
+	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
 	state->id_sgid = *port_infop->p_sgid_tbl;
+	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
 	state->id_link_state = LINK_STATE_UP;
 
 	mutex_exit(&state->id_link_mutex);
@@ -4129,7 +4228,7 @@
 	/*
 	 * Allocate Rx/combined CQ:
 	 * Theoretically, there is no point in having more than #rwqe
-	 * plus #swqe cqe's, except that the CQ will be signalled for
+	 * plus #swqe cqe's, except that the CQ will be signaled for
 	 * overflow when the last wqe completes, if none of the previous
 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
 	 * to make sure such overflow does not occur.
@@ -4137,93 +4236,62 @@
 	cq_attr.cq_sched = NULL;
 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
 
-	if (ibd_separate_cqs == 1) {
-		/*
-		 * Allocate Receive CQ.
-		 */
-		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
-			cq_attr.cq_size = state->id_num_rwqe + 1;
-		} else {
-			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
-			state->id_num_rwqe = cq_attr.cq_size - 1;
-		}
-
-		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
-		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
-			    "failed, ret=%d\n", ret);
-			return (DDI_FAILURE);
-		}
-
-		if ((ret = ibt_modify_cq(state->id_rcq_hdl,
-		    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
-			    "moderation failed, ret=%d\n", ret);
-		}
-
-		state->id_rxwcs_size = state->id_num_rwqe + 1;
-		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
-		    state->id_rxwcs_size, KM_SLEEP);
-
-		/*
-		 * Allocate Send CQ.
-		 */
-		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
-			cq_attr.cq_size = state->id_num_swqe + 1;
-		} else {
-			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
-			state->id_num_swqe = cq_attr.cq_size - 1;
-		}
-
-		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
-		    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
-			    "failed, ret=%d\n", ret);
-			kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
-			    state->id_rxwcs_size);
-			(void) ibt_free_cq(state->id_rcq_hdl);
-			return (DDI_FAILURE);
-		}
-		if ((ret = ibt_modify_cq(state->id_scq_hdl,
-		    IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
-			    "moderation failed, ret=%d\n", ret);
-		}
-
-		state->id_txwcs_size = state->id_num_swqe + 1;
-		state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
-		    state->id_txwcs_size, KM_SLEEP);
+	/*
+	 * Allocate Receive CQ.
+	 */
+	if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
+		cq_attr.cq_size = state->id_num_rwqe + 1;
 	} else {
-		/*
-		 * Allocate combined Send/Receive CQ.
-		 */
-		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
-		    state->id_num_swqe + 1)) {
-			cq_attr.cq_size = state->id_num_rwqe +
-			    state->id_num_swqe + 1;
-		} else {
-			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
-			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
-			    state->id_num_rwqe) / (state->id_num_rwqe +
-			    state->id_num_swqe);
-			state->id_num_swqe = cq_attr.cq_size - 1 -
-			    state->id_num_rwqe;
-		}
-
-		state->id_rxwcs_size = cq_attr.cq_size;
-		state->id_txwcs_size = state->id_rxwcs_size;
-
-		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
-		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) "
-			    "failed, ret=%d\n", ret);
-			return (DDI_FAILURE);
-		}
-		state->id_scq_hdl = state->id_rcq_hdl;
-		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
-		    state->id_rxwcs_size, KM_SLEEP);
-		state->id_txwcs = state->id_rxwcs;
-	}
+		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
+		state->id_num_rwqe = cq_attr.cq_size - 1;
+	}
+
+	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
+	    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
+		    "failed, ret=%d\n", ret);
+		return (DDI_FAILURE);
+	}
+
+	if ((ret = ibt_modify_cq(state->id_rcq_hdl,
+	    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
+		    "moderation failed, ret=%d\n", ret);
+	}
+
+	/* make the #rx wc's the same as max rx chain size */
+	state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
+	state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
+	    state->id_rxwcs_size, KM_SLEEP);
+
+	/*
+	 * Allocate Send CQ.
+	 */
+	if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
+		cq_attr.cq_size = state->id_num_swqe + 1;
+	} else {
+		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
+		state->id_num_swqe = cq_attr.cq_size - 1;
+	}
+
+	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
+	    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
+		    "failed, ret=%d\n", ret);
+		kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
+		    state->id_rxwcs_size);
+		(void) ibt_free_cq(state->id_rcq_hdl);
+		return (DDI_FAILURE);
+	}
+	if ((ret = ibt_modify_cq(state->id_scq_hdl,
+	    ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
+		    "moderation failed, ret=%d\n", ret);
+	}
+
+	state->id_txwcs_size = IBD_TX_POLL_THRESH;
+	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
+	    state->id_txwcs_size, KM_SLEEP);
 
 	/*
 	 * Print message in case we could not allocate as many wqe's
@@ -4248,7 +4316,7 @@
 	ibt_ud_chan_query_attr_t ud_chan_attr;
 	ibt_status_t ret;
 
-	ud_alloc_attr.ud_flags  = IBT_WR_SIGNALED;
+	ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
 	if (state->id_hca_res_lkey_capab)
 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
 	if (state->id_lso_policy && state->id_lso_capable)
@@ -4341,7 +4409,7 @@
 				 */
 				DPRINT(2, "ibd_undo_start: "
 				    "reclaiming failed");
-				ibd_poll_compq(state, state->id_rcq_hdl);
+				ibd_poll_rcq(state, state->id_rcq_hdl);
 				ibt_set_cq_handler(state->id_rcq_hdl,
 				    ibd_rcq_handler, state);
 				return (DDI_FAILURE);
@@ -4383,10 +4451,8 @@
 		 * ibt_set_cq_handler() returns, the old handler is
 		 * guaranteed not to be invoked anymore.
 		 */
-		if (ibd_separate_cqs == 1) {
-			ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
-		}
-		ibd_poll_compq(state, state->id_scq_hdl);
+		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
+		ibd_poll_scq(state, state->id_scq_hdl);
 
 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
 	}
@@ -4456,14 +4522,12 @@
 	}
 
 	if (progress & IBD_DRV_CQS_ALLOCD) {
-		if (ibd_separate_cqs == 1) {
-			kmem_free(state->id_txwcs,
-			    sizeof (ibt_wc_t) * state->id_txwcs_size);
-			if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
-			    IBT_SUCCESS) {
-				DPRINT(10, "ibd_undo_start: free_cq(scq) "
-				    "failed, ret=%d", ret);
-			}
+		kmem_free(state->id_txwcs,
+		    sizeof (ibt_wc_t) * state->id_txwcs_size);
+		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
+		    IBT_SUCCESS) {
+			DPRINT(10, "ibd_undo_start: free_cq(scq) "
+			    "failed, ret=%d", ret);
 		}
 
 		kmem_free(state->id_rxwcs,
@@ -4482,7 +4546,9 @@
 	}
 
 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
+		mutex_enter(&state->id_ac_mutex);
 		mod_hash_destroy_hash(state->id_ah_active_hash);
+		mutex_exit(&state->id_ac_mutex);
 		ibd_acache_fini(state);
 
 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
@@ -4626,19 +4692,17 @@
 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
 
 	/*
-	 * If we have separate cqs, create the send cq handler here
-	 */
-	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
-		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
-		if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
-		    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
-			    "failed, ret=%d", ret);
-			err = EINVAL;
-			goto start_fail;
-		}
-		state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
-	}
+	 * Create the send cq handler here
+	 */
+	ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
+	if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
+	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
+		    "failed, ret=%d", ret);
+		err = EINVAL;
+		goto start_fail;
+	}
+	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
 
 	/*
 	 * Allocate and initialize the rx buffer list
@@ -4665,7 +4729,9 @@
 	 */
 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
 	    TS_RUN, minclsyspri);
+	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
 	state->id_async_thrid = kht->t_did;
+	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
 
 	/*
@@ -4680,7 +4746,7 @@
 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
 
-	mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE);
+	(void) mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE);
 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
 
 	/*
@@ -4789,7 +4855,7 @@
 
 	if (op == IBD_ASYNC_JOIN) {
 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
-			ibd_print_warn(state, "Joint multicast group failed :"
+			ibd_print_warn(state, "Join multicast group failed :"
 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
 		}
 	} else {
@@ -4835,7 +4901,7 @@
 	/*
 	 * Check validity of MCG address. We could additionally check
 	 * that a enable/disable is not being issued on the "broadcast"
-	 * mcg, but since this operation is only invokable by priviledged
+	 * mcg, but since this operation is only invokable by privileged
 	 * programs anyway, we allow the flexibility to those dlpi apps.
 	 * Note that we do not validate the "scope" of the IBA mcg.
 	 */
@@ -5046,124 +5112,100 @@
 static void
 ibd_async_txsched(ibd_state_t *state)
 {
-	ibd_req_t *req;
-	int ret;
-
-	if (ibd_txcomp_poll)
-		ibd_poll_compq(state, state->id_scq_hdl);
-
-	ret = ibd_resume_transmission(state);
-	if (ret && ibd_txcomp_poll) {
-		if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP))
-			ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
-		else {
-			ibd_print_warn(state, "ibd_async_txsched: "
-			    "no memory, can't schedule work slot");
-		}
-	}
+	ibd_resume_transmission(state);
 }
 
-static int
+static void
 ibd_resume_transmission(ibd_state_t *state)
 {
 	int flag;
 	int met_thresh = 0;
+	int thresh = 0;
 	int ret = -1;
 
 	mutex_enter(&state->id_sched_lock);
 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
-		met_thresh = (state->id_tx_list.dl_cnt >
-		    IBD_FREE_SWQES_THRESH);
+		mutex_enter(&state->id_tx_list.dl_mutex);
+		mutex_enter(&state->id_tx_rel_list.dl_mutex);
+		met_thresh = state->id_tx_list.dl_cnt +
+		    state->id_tx_rel_list.dl_cnt;
+		mutex_exit(&state->id_tx_rel_list.dl_mutex);
+		mutex_exit(&state->id_tx_list.dl_mutex);
+		thresh = IBD_FREE_SWQES_THRESH;
 		flag = IBD_RSRC_SWQE;
 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
 		ASSERT(state->id_lso != NULL);
-		met_thresh = (state->id_lso->bkt_nfree >
-		    IBD_FREE_LSOS_THRESH);
+		mutex_enter(&state->id_lso_lock);
+		met_thresh = state->id_lso->bkt_nfree;
+		thresh = IBD_FREE_LSOS_THRESH;
+		mutex_exit(&state->id_lso_lock);
 		flag = IBD_RSRC_LSOBUF;
-	}
-	if (met_thresh) {
+		if (met_thresh > thresh)
+			state->id_sched_lso_cnt++;
+	}
+	if (met_thresh > thresh) {
 		state->id_sched_needed &= ~flag;
+		state->id_sched_cnt++;
 		ret = 0;
 	}
 	mutex_exit(&state->id_sched_lock);
 
 	if (ret == 0)
 		mac_tx_update(state->id_mh);
-
-	return (ret);
 }
 
 /*
  * Release the send wqe back into free list.
  */
 static void
-ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
+ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
 {
 	/*
 	 * Add back on Tx list for reuse.
 	 */
-	swqe->swqe_next = NULL;
-	mutex_enter(&state->id_tx_list.dl_mutex);
-	if (state->id_tx_list.dl_pending_sends) {
-		state->id_tx_list.dl_pending_sends = B_FALSE;
-	}
-	if (state->id_tx_list.dl_head == NULL) {
-		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
-	} else {
-		state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
-	}
-	state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
-	state->id_tx_list.dl_cnt++;
-	mutex_exit(&state->id_tx_list.dl_mutex);
+	ASSERT(tail->swqe_next == NULL);
+	mutex_enter(&state->id_tx_rel_list.dl_mutex);
+	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
+	tail->swqe_next = state->id_tx_rel_list.dl_head;
+	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
+	state->id_tx_rel_list.dl_cnt += n;
+	mutex_exit(&state->id_tx_rel_list.dl_mutex);
 }
 
 /*
  * Acquire a send wqe from free list.
  * Returns error number and send wqe pointer.
  */
-static int
-ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe)
+static ibd_swqe_t *
+ibd_acquire_swqe(ibd_state_t *state)
 {
-	int rc = 0;
 	ibd_swqe_t *wqe;
 
-	/*
-	 * Check and reclaim some of the completed Tx requests.
-	 * If someone else is already in this code and pulling Tx
-	 * completions, no need to poll, since the current lock holder
-	 * will do the work anyway. Normally, we poll for completions
-	 * every few Tx attempts, but if we are short on Tx descriptors,
-	 * we always try to poll.
-	 */
-	if ((ibd_txcomp_poll == 1) &&
-	    (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) {
-		ibd_poll_compq(state, state->id_scq_hdl);
-	}
-
-	/*
-	 * Grab required transmit wqes.
-	 */
-	mutex_enter(&state->id_tx_list.dl_mutex);
-	wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
-	if (wqe != NULL) {
+	mutex_enter(&state->id_tx_rel_list.dl_mutex);
+	if (state->id_tx_rel_list.dl_head != NULL) {
+		/* transfer id_tx_rel_list to id_tx_list */
+		state->id_tx_list.dl_head =
+		    state->id_tx_rel_list.dl_head;
+		state->id_tx_list.dl_cnt =
+		    state->id_tx_rel_list.dl_cnt;
+		state->id_tx_list.dl_pending_sends = B_FALSE;
+
+		/* clear id_tx_rel_list */
+		state->id_tx_rel_list.dl_head = NULL;
+		state->id_tx_rel_list.dl_cnt = 0;
+		mutex_exit(&state->id_tx_rel_list.dl_mutex);
+
+		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
 		state->id_tx_list.dl_cnt -= 1;
 		state->id_tx_list.dl_head = wqe->swqe_next;
-		if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe))
-			state->id_tx_list.dl_tail = NULL;
-	} else {
-		/*
-		 * If we did not find the number we were looking for, flag
-		 * no resource. Adjust list appropriately in either case.
-		 */
-		rc = ENOENT;
+	} else {	/* no free swqe */
+		mutex_exit(&state->id_tx_rel_list.dl_mutex);
 		state->id_tx_list.dl_pending_sends = B_TRUE;
 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
-		atomic_add_64(&state->id_tx_short, 1);
-	}
-	mutex_exit(&state->id_tx_list.dl_mutex);
-	*swqe = wqe;
-
-	return (rc);
+		state->id_tx_short++;
+		wqe = NULL;
+	}
+	return (wqe);
 }
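
The reworked acquire/release pair above splits the send-wqe free pool across two lists so the send path and the completion path rarely contend on one mutex: completions prepend a whole chain onto id_tx_rel_list, and a sender only touches that list (transferring it wholesale) when id_tx_list runs dry. A minimal user-space sketch of the same two-list pattern, using pthread mutexes and illustrative names rather than the driver's types:

#include <pthread.h>
#include <stddef.h>

typedef struct swqe {
	struct swqe	*next;
} swqe_t;

typedef struct wqe_list {
	pthread_mutex_t	lock;
	swqe_t		*head;
	int		cnt;
} wqe_list_t;

/* tx_list feeds senders; tx_rel_list is refilled by the completion path. */
static wqe_list_t tx_list = { PTHREAD_MUTEX_INITIALIZER, NULL, 0 };
static wqe_list_t tx_rel_list = { PTHREAD_MUTEX_INITIALIZER, NULL, 0 };

/* Completion path: return a pre-linked chain of n wqes in one lock hold. */
void
release_swqes(swqe_t *head, swqe_t *tail, int n)
{
	(void) pthread_mutex_lock(&tx_rel_list.lock);
	tail->next = tx_rel_list.head;
	tx_rel_list.head = head;
	tx_rel_list.cnt += n;
	(void) pthread_mutex_unlock(&tx_rel_list.lock);
}

/* Send path slow case: tx_list.lock is held and tx_list is empty. */
swqe_t *
acquire_swqe_locked(void)
{
	swqe_t *wqe = NULL;

	(void) pthread_mutex_lock(&tx_rel_list.lock);
	if (tx_rel_list.head != NULL) {
		/* Take the whole release list in one shot, then pop one. */
		tx_list.head = tx_rel_list.head;
		tx_list.cnt = tx_rel_list.cnt;
		tx_rel_list.head = NULL;
		tx_rel_list.cnt = 0;

		wqe = tx_list.head;
		tx_list.head = wqe->next;
		tx_list.cnt--;
	}
	(void) pthread_mutex_unlock(&tx_rel_list.lock);
	return (wqe);
}

The lock order in the sketch (tx_list before tx_rel_list) matches what the driver does in ibd_send() and ibd_resume_transmission().
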
 
 static int
@@ -5283,60 +5325,44 @@
 	uint_t		num_posted;
 	uint_t		n_wrs;
 	ibt_status_t	ibt_status;
-	ibt_send_wr_t	wrs[IBD_MAX_POST_MULTIPLE];
-	ibd_swqe_t	*elem;
-	ibd_swqe_t	*nodes[IBD_MAX_POST_MULTIPLE];
-
-	node->swqe_next = NULL;
-
-	mutex_enter(&state->id_txpost_lock);
-
-	/*
-	 * Enqueue the new node in chain of wqes to send
-	 */
-	if (state->id_tx_head) {
-		*(state->id_tx_tailp) = (ibd_wqe_t *)node;
-	} else {
-		state->id_tx_head = node;
-	}
-	state->id_tx_tailp = &(node->swqe_next);
-
-	/*
-	 * If someone else is helping out with the sends,
-	 * just go back
-	 */
-	if (state->id_tx_busy) {
-		mutex_exit(&state->id_txpost_lock);
-		return;
-	}
-
-	/*
-	 * Otherwise, mark the flag to indicate that we'll be
-	 * doing the dispatch of what's there in the wqe chain
-	 */
-	state->id_tx_busy = 1;
-
-	while (state->id_tx_head) {
+	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
+	ibd_swqe_t	*tx_head, *elem;
+	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
+
+	/* post the one request, then check for more */
+	ibt_status = ibt_post_send(state->id_chnl_hdl,
+	    &node->w_swr, 1, NULL);
+	if (ibt_status != IBT_SUCCESS) {
+		ibd_print_warn(state, "ibd_post_send: "
+		    "posting one wr failed: ret=%d", ibt_status);
+		ibd_tx_cleanup(state, node);
+	}
+
+	tx_head = NULL;
+	for (;;) {
+		if (tx_head == NULL) {
+			mutex_enter(&state->id_txpost_lock);
+			tx_head = state->id_tx_head;
+			if (tx_head == NULL) {
+				state->id_tx_busy = 0;
+				mutex_exit(&state->id_txpost_lock);
+				return;
+			}
+			state->id_tx_head = NULL;
+			mutex_exit(&state->id_txpost_lock);
+		}
+
 		/*
-		 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs
+		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
 		 * at a time if possible, and keep posting them.
 		 */
-		for (n_wrs = 0, elem = state->id_tx_head;
-		    (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE);
+		for (n_wrs = 0, elem = tx_head;
+		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
-
 			nodes[n_wrs] = elem;
 			wrs[n_wrs] = elem->w_swr;
 		}
-		state->id_tx_head = elem;
-
-		/*
-		 * Release the txpost lock before posting the
-		 * send request to the hca; if the posting fails
-		 * for some reason, we'll never receive completion
-		 * intimation, so we'll need to cleanup.
-		 */
-		mutex_exit(&state->id_txpost_lock);
+		tx_head = elem;
 
 		ASSERT(n_wrs != 0);
 
@@ -5353,7 +5379,6 @@
 		ibt_status = ibt_post_send(state->id_chnl_hdl,
 		    wrs, n_wrs, &num_posted);
 		if (ibt_status != IBT_SUCCESS) {
-
 			ibd_print_warn(state, "ibd_post_send: "
 			    "posting multiple wrs failed: "
 			    "requested=%d, done=%d, ret=%d",
@@ -5362,15 +5387,7 @@
 			for (i = num_posted; i < n_wrs; i++)
 				ibd_tx_cleanup(state, nodes[i]);
 		}
-
-		/*
-		 * Grab the mutex before we go and check the tx Q again
-		 */
-		mutex_enter(&state->id_txpost_lock);
-	}
-
-	state->id_tx_busy = 0;
-	mutex_exit(&state->id_txpost_lock);
+	}
 }
 
 static int
@@ -5388,7 +5405,6 @@
 	uint_t pktsize;
 	uint_t frag_len;
 	uint_t pending_hdr;
-	uint_t hiwm;
 	int nmblks;
 	int i;
 
@@ -5420,21 +5436,13 @@
 	pktsize -= pending_hdr;
 
 	/*
-	 * Translating the virtual address regions into physical regions
-	 * for using the Reserved LKey feature results in a wr sgl that
-	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
-	 * we'll fix a high-water mark (65%) for when we should stop.
-	 */
-	hiwm = (state->id_max_sqseg * 65) / 100;
-
-	/*
 	 * We only do ibt_map_mem_iov() if the pktsize is above the
 	 * "copy-threshold", and if the number of mp fragments is less than
 	 * the maximum acceptable.
 	 */
 	if ((state->id_hca_res_lkey_capab) &&
 	    (pktsize > IBD_TX_COPY_THRESH) &&
-	    (nmblks < hiwm)) {
+	    (nmblks < state->id_max_sqseg_hiwm)) {
 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
 		ibt_iov_attr_t iov_attr;
 
@@ -5591,14 +5599,22 @@
 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
 		return (B_FALSE);
 
-	node = NULL;
-	if (ibd_acquire_swqe(state, &node) != 0) {
+	mutex_enter(&state->id_tx_list.dl_mutex);
+	node = WQE_TO_SWQE(state->id_tx_list.dl_head);
+	if (node != NULL) {
+		state->id_tx_list.dl_cnt -= 1;
+		state->id_tx_list.dl_head = node->swqe_next;
+	} else {
+		node = ibd_acquire_swqe(state);
+	}
+	mutex_exit(&state->id_tx_list.dl_mutex);
+	if (node == NULL) {
 		/*
 		 * If we don't have an swqe available, schedule a transmit
 		 * completion queue cleanup and hold off on sending more
 		 * packets until we have some free swqes
 		 */
-		if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0)
+		if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0)
 			return (B_FALSE);
 
 		/*
@@ -5650,14 +5666,6 @@
 		node->w_ahandle = NULL;
 
 		/*
-		 * for the poll mode, it is probably some cqe pending in the
-		 * cq. So ibd has to poll cq here, otherwise acache probably
-		 * may not be recycled.
-		 */
-		if (ibd_txcomp_poll == 1)
-			ibd_poll_compq(state, state->id_scq_hdl);
-
-		/*
 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
 		 * can not find a path for the specific dest address. We
 		 * should get rid of this kind of packet.  We also should get
@@ -5781,7 +5789,23 @@
 	 * post instead of doing it serially, we cannot assume anything
 	 * about the 'node' after ibd_post_send() returns.
 	 */
-	ibd_post_send(state, node);
+	node->swqe_next = NULL;
+
+	mutex_enter(&state->id_txpost_lock);
+	if (state->id_tx_busy) {
+		if (state->id_tx_head) {
+			state->id_tx_tail->swqe_next =
+			    SWQE_TO_WQE(node);
+		} else {
+			state->id_tx_head = node;
+		}
+		state->id_tx_tail = node;
+		mutex_exit(&state->id_txpost_lock);
+	} else {
+		state->id_tx_busy = 1;
+		mutex_exit(&state->id_txpost_lock);
+		ibd_post_send(state, node);
+	}
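
This block is the other half of the reworked ibd_post_send(): a sender either appends to id_tx_head while another thread holds id_tx_busy, or claims the busy flag and does the posting itself, draining whatever piles up behind it before clearing the flag. A user-space sketch of that enqueue-or-dispatch pattern (pthread mutex, hypothetical names; dispatch_batch() stands in for the actual batched ibt_post_send()):

#include <pthread.h>
#include <stddef.h>

typedef struct work {
	struct work	*next;
} work_t;

static pthread_mutex_t	q_lock = PTHREAD_MUTEX_INITIALIZER;
static work_t		*q_head, *q_tail;
static int		q_busy;

/* Stand-in for posting a chain of requests to the hardware. */
static void
dispatch_batch(work_t *chain)
{
	(void) chain;
}

void
submit(work_t *w)
{
	w->next = NULL;
	(void) pthread_mutex_lock(&q_lock);
	if (q_busy) {
		/* Another thread is posting; append and let it pick this up. */
		if (q_head != NULL)
			q_tail->next = w;
		else
			q_head = w;
		q_tail = w;
		(void) pthread_mutex_unlock(&q_lock);
		return;
	}
	q_busy = 1;
	(void) pthread_mutex_unlock(&q_lock);

	/* We are the dispatcher: post ours first, then drain the backlog. */
	dispatch_batch(w);
	for (;;) {
		work_t *chain;

		(void) pthread_mutex_lock(&q_lock);
		chain = q_head;
		if (chain == NULL) {
			q_busy = 0;	/* nothing left; give up the role */
			(void) pthread_mutex_unlock(&q_lock);
			return;
		}
		q_head = NULL;
		(void) pthread_mutex_unlock(&q_lock);
		dispatch_batch(chain);
	}
}

The lock is held only for pointer manipulation; the potentially slow post happens outside it, so no sender blocks behind a post already in progress.
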
 
 	return (B_TRUE);
 
@@ -5831,65 +5855,118 @@
  * only Rx completions.
  */
 static uint_t
-ibd_intr(char *arg)
+ibd_intr(caddr_t arg)
 {
 	ibd_state_t *state = (ibd_state_t *)arg;
 
-	ibd_poll_compq(state, state->id_rcq_hdl);
+	ibd_poll_rcq(state, state->id_rcq_hdl);
 
 	return (DDI_INTR_CLAIMED);
 }
 
 /*
- * Poll and drain the cq
+ * Poll and fully drain the send cq
  */
-static uint_t
-ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs,
-    uint_t numwcs)
+static void
+ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
 {
+	ibt_wc_t *wcs = state->id_txwcs;
+	uint_t numwcs = state->id_txwcs_size;
 	ibd_wqe_t *wqe;
+	ibd_swqe_t *head, *tail;
 	ibt_wc_t *wc;
-	uint_t total_polled = 0;
 	uint_t num_polled;
 	int i;
 
 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
-		total_polled += num_polled;
+		head = tail = NULL;
 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
-			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
-			    (wqe->w_type == IBD_WQE_RECV));
+			ASSERT(wqe->w_type == IBD_WQE_SEND);
 			if (wc->wc_status != IBT_WC_SUCCESS) {
 				/*
 				 * Channel being torn down.
 				 */
 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
-					DPRINT(5, "ibd_drain_cq: flush error");
+					DPRINT(5, "ibd_drain_scq: flush error");
 					/*
 					 * Only invoke the Tx handler to
 					 * release possibly held resources
-					 * like AH refcount etc. Can not
-					 * invoke Rx handler because it might
-					 * try adding buffers to the Rx pool
+					 * like AH refcount etc.
+					 */
+					DPRINT(10, "ibd_drain_scq: Bad "
+					    "status %d", wc->wc_status);
+				}
+				return;	/* give up; no need to clean up */
+			}
+			/*
+			 * Add this swqe to the list to be cleaned up.
+			 */
+			if (head)
+				tail->swqe_next = wqe;
+			else
+				head = WQE_TO_SWQE(wqe);
+			tail = WQE_TO_SWQE(wqe);
+		}
+		tail->swqe_next = NULL;
+		ibd_tx_cleanup_list(state, head, tail);
+
+		/*
+		 * Resume any blocked transmissions if possible
+		 */
+		ibd_resume_transmission(state);
+	}
+}
+
+/*
+ * Poll and fully drain the receive cq
+ */
+static void
+ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
+{
+	ibt_wc_t *wcs = state->id_rxwcs;
+	uint_t numwcs = state->id_rxwcs_size;
+	ibd_wqe_t *wqe;
+	ibt_wc_t *wc;
+	uint_t num_polled;
+	int i;
+	mblk_t *head, *tail, *mp;
+
+	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
+		head = tail = NULL;
+		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
+			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
+			ASSERT(wqe->w_type == IBD_WQE_RECV);
+			if (wc->wc_status != IBT_WC_SUCCESS) {
+				/*
+				 * Channel being torn down.
+				 */
+				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
+					DPRINT(5, "ibd_drain_rcq: flush error");
+					/*
+					 * Do not invoke Rx handler because
+					 * it might add buffers to the Rx pool
 					 * when we are trying to deinitialize.
 					 */
-					if (wqe->w_type == IBD_WQE_RECV) {
-						continue;
-					} else {
-						DPRINT(10, "ibd_drain_cq: Bad "
-						    "status %d", wc->wc_status);
-					}
+					continue;
 				}
 			}
-			if (wqe->w_type == IBD_WQE_SEND) {
-				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
-			} else {
-				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
-			}
+			mp = ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
+			if (mp == NULL)
+				continue;
+
+			/*
+			 * Add this mp to the list to send to the nw layer.
+			 */
+			if (head)
+				tail->b_next = mp;
+			else
+				head = mp;
+			tail = mp;
 		}
-	}
-
-	return (total_polled);
+		if (head)
+			mac_rx(state->id_mh, state->id_rh, head);
+	}
 }
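
Both drain routines above turn one ibt_poll_cq() batch into a single downstream call: send completions are chained through swqe_next and handed to ibd_tx_cleanup_list() once, received packets are chained through b_next and handed to mac_rx() once. A compact sketch of that chain-building loop over a polled batch (hypothetical item type; deliver_chain() stands in for the single hand-off call):

#include <stddef.h>

typedef struct item {
	struct item	*next;
	int		ok;		/* stand-in for wc_status == SUCCESS */
} item_t;

/* Single hand-off for the whole chain (think ibd_tx_cleanup_list/mac_rx). */
static void
deliver_chain(item_t *head)
{
	(void) head;
}

/*
 * polled[] holds one batch of completions, the way wcs[] holds the
 * ibt_wc_t entries filled in by ibt_poll_cq().
 */
void
drain_batch(item_t **polled, int num_polled)
{
	item_t *head = NULL, *tail = NULL;
	int i;

	for (i = 0; i < num_polled; i++) {
		item_t *it = polled[i];

		if (!it->ok)
			continue;	/* e.g. flushed while tearing down */
		if (head != NULL)
			tail->next = it;
		else
			head = it;
		tail = it;
	}
	if (head != NULL) {
		tail->next = NULL;
		deliver_chain(head);	/* one call per batch, not per entry */
	}
}
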
 
 /*
@@ -5897,64 +5974,92 @@
  * for all completed wqe's while detaching.
  */
 static void
-ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
+ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
 {
-	ibt_wc_t *wcs;
-	uint_t numwcs;
 	int flag, redo_flag;
 	int redo = 1;
-	uint_t num_polled = 0;
-
-	if (ibd_separate_cqs == 1) {
-		if (cq_hdl == state->id_rcq_hdl) {
-			flag = IBD_RX_CQ_POLLING;
-			redo_flag = IBD_REDO_RX_CQ_POLLING;
-		} else {
-			flag = IBD_TX_CQ_POLLING;
-			redo_flag = IBD_REDO_TX_CQ_POLLING;
-		}
-	} else {
-		flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING;
-		redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING;
-	}
-
-	mutex_enter(&state->id_cq_poll_lock);
-	if (state->id_cq_poll_busy & flag) {
-		state->id_cq_poll_busy |= redo_flag;
-		mutex_exit(&state->id_cq_poll_lock);
+
+	flag = IBD_CQ_POLLING;
+	redo_flag = IBD_REDO_CQ_POLLING;
+
+	mutex_enter(&state->id_scq_poll_lock);
+	if (state->id_scq_poll_busy & flag) {
+		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
+		state->id_scq_poll_busy |= redo_flag;
+		mutex_exit(&state->id_scq_poll_lock);
 		return;
 	}
-	state->id_cq_poll_busy |= flag;
-	mutex_exit(&state->id_cq_poll_lock);
+	state->id_scq_poll_busy |= flag;
+	mutex_exit(&state->id_scq_poll_lock);
 
 	/*
 	 * In some cases (eg detaching), this code can be invoked on
 	 * any cpu after disabling cq notification (thus no concurrency
 	 * exists). Apart from that, the following applies normally:
-	 * The receive completion handling is always on the Rx interrupt
-	 * cpu. Transmit completion handling could be from any cpu if
+	 * Transmit completion handling could be from any cpu if
 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
-	 * is interrupt driven. Combined completion handling is always
-	 * on the interrupt cpu. Thus, lock accordingly and use the
-	 * proper completion array.
-	 */
-	if (ibd_separate_cqs == 1) {
-		if (cq_hdl == state->id_rcq_hdl) {
-			wcs = state->id_rxwcs;
-			numwcs = state->id_rxwcs_size;
-		} else {
-			wcs = state->id_txwcs;
-			numwcs = state->id_txwcs_size;
-		}
-	} else {
-		wcs = state->id_rxwcs;
-		numwcs = state->id_rxwcs_size;
-	}
+	 * is interrupt driven.
+	 */
 
 	/*
 	 * Poll and drain the CQ
 	 */
-	num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs);
+	ibd_drain_scq(state, cq_hdl);
+
+	/*
+	 * Enable CQ notifications and redrain the cq to catch any
+	 * completions we might have missed after the ibd_drain_scq()
+	 * above and before the ibt_enable_cq_notify() that follows.
+	 * Finally, service any new requests to poll the cq that
+	 * could've come in after the ibt_enable_cq_notify().
+	 */
+	do {
+		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
+		    IBT_SUCCESS) {
+			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
+		}
+
+		ibd_drain_scq(state, cq_hdl);
+
+		mutex_enter(&state->id_scq_poll_lock);
+		if (state->id_scq_poll_busy & redo_flag)
+			state->id_scq_poll_busy &= ~redo_flag;
+		else {
+			state->id_scq_poll_busy &= ~flag;
+			redo = 0;
+		}
+		mutex_exit(&state->id_scq_poll_lock);
+
+	} while (redo);
+}
+
+/*
+ * Common code for interrupt handling as well as for polling
+ * for all completed wqe's while detaching.
+ */
+static void
+ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
+{
+	int flag, redo_flag;
+	int redo = 1;
+
+	flag = IBD_CQ_POLLING;
+	redo_flag = IBD_REDO_CQ_POLLING;
+
+	mutex_enter(&state->id_rcq_poll_lock);
+	if (state->id_rcq_poll_busy & flag) {
+		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
+		state->id_rcq_poll_busy |= redo_flag;
+		mutex_exit(&state->id_rcq_poll_lock);
+		return;
+	}
+	state->id_rcq_poll_busy |= flag;
+	mutex_exit(&state->id_rcq_poll_lock);
+
+	/*
+	 * Poll and drain the CQ
+	 */
+	ibd_drain_rcq(state, rcq);
 
 	/*
 	 * Enable CQ notifications and redrain the cq to catch any
@@ -5964,31 +6069,23 @@
 	 * could've come in after the ibt_enable_cq_notify().
 	 */
 	do {
-		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
+		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
 		    IBT_SUCCESS) {
 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
 		}
 
-		num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs);
-
-		mutex_enter(&state->id_cq_poll_lock);
-		if (state->id_cq_poll_busy & redo_flag)
-			state->id_cq_poll_busy &= ~redo_flag;
+		ibd_drain_rcq(state, rcq);
+
+		mutex_enter(&state->id_rcq_poll_lock);
+		if (state->id_rcq_poll_busy & redo_flag)
+			state->id_rcq_poll_busy &= ~redo_flag;
 		else {
-			state->id_cq_poll_busy &= ~flag;
+			state->id_rcq_poll_busy &= ~flag;
 			redo = 0;
 		}
-		mutex_exit(&state->id_cq_poll_lock);
+		mutex_exit(&state->id_rcq_poll_lock);
 
 	} while (redo);
-
-	/*
-	 * If we polled the receive cq and found anything, we need to flush
-	 * it out to the nw layer here.
-	 */
-	if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) {
-		ibd_flush_rx(state, NULL);
-	}
 }
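
Both poll routines follow the same discipline: drain, re-arm the CQ with ibt_enable_cq_notify(), then drain again, because a completion that lands between the first drain and the re-arm would otherwise be invisible (no interrupt pending for it, and not yet polled). The redo flag handles the second race, where another thread asks for a poll while this one is finishing up. A user-space sketch of that arm-and-redrain loop, assuming hypothetical cq_poll()/cq_arm() primitives:

#include <pthread.h>
#include <stdbool.h>

typedef struct cq cq_t;

/* Hypothetical stand-ins for ibt_poll_cq() / ibt_enable_cq_notify(). */
static void
cq_poll(cq_t *cq)
{
	(void) cq;
}

static void
cq_arm(cq_t *cq)
{
	(void) cq;
}

static pthread_mutex_t	poll_lock = PTHREAD_MUTEX_INITIALIZER;
static bool		polling, redo_req;

void
poll_cq_exclusive(cq_t *cq)
{
	bool redo = true;

	(void) pthread_mutex_lock(&poll_lock);
	if (polling) {
		/* Someone else is draining; ask them for one more pass. */
		redo_req = true;
		(void) pthread_mutex_unlock(&poll_lock);
		return;
	}
	polling = true;
	(void) pthread_mutex_unlock(&poll_lock);

	cq_poll(cq);
	do {
		/*
		 * Re-arm first, drain second: anything that completed after
		 * the drain above but before the arm gets picked up here
		 * instead of being lost with no notification outstanding.
		 */
		cq_arm(cq);
		cq_poll(cq);

		(void) pthread_mutex_lock(&poll_lock);
		if (redo_req) {
			redo_req = false;	/* someone asked; go around */
		} else {
			polling = false;
			redo = false;
		}
		(void) pthread_mutex_unlock(&poll_lock);
	} while (redo);
}
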
 
 /*
@@ -6012,6 +6109,65 @@
 	swqe->w_swr.wr_nds = 0;
 }
 
+static void
+ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
+{
+	/*
+	 * The recycling logic can be eliminated from here
+	 * and put into the async thread if we create another
+	 * list to hold ACE's for unjoined mcg's.
+	 */
+	if (DEC_REF_DO_CYCLE(ace)) {
+		ibd_mce_t *mce;
+
+		/*
+		 * Check with the lock taken: we decremented
+		 * reference count without the lock, and some
+		 * transmitter might already have bumped the
+		 * reference count (possible in case of multicast
+		 * disable when we leave the AH on the active
+		 * list). If not still 0, get out, leaving the
+		 * recycle bit intact.
+		 *
+		 * Atomically transition the AH from active
+		 * to free list, and queue a work request to
+		 * leave the group and destroy the mce. No
+		 * transmitter can be looking at the AH or
+		 * the MCE in between, since we have the
+		 * ac_mutex lock. In the SendOnly reap case,
+		 * it is not necessary to hold the ac_mutex
+		 * and recheck the ref count (since the AH was
+		 * taken off the active list), we just do it
+		 * to have uniform processing with the Full
+		 * reap case.
+		 */
+		mutex_enter(&state->id_ac_mutex);
+		mce = ace->ac_mce;
+		if (GET_REF_CYCLE(ace) == 0) {
+			CLEAR_REFCYCLE(ace);
+			/*
+			 * Identify the case of fullmember reap as
+			 * opposed to mcg trap reap. Also, port up
+			 * might set ac_mce to NULL to indicate Tx
+			 * cleanup should do no more than put the
+			 * AH in the free list (see ibd_async_link).
+			 */
+			if (mce != NULL) {
+				ace->ac_mce = NULL;
+				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
+				/*
+				 * mc_req was initialized at mce
+				 * creation time.
+				 */
+				ibd_queue_work_slot(state,
+				    &mce->mc_req, IBD_ASYNC_REAP);
+			}
+			IBD_ACACHE_INSERT_FREE(state, ace);
+		}
+		mutex_exit(&state->id_ac_mutex);
+	}
+}
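
ibd_dec_ref_ace() depends on the AH reference count and the recycle request living in one atomically updated word (the DEC_REF_DO_CYCLE / GET_REF_CYCLE macros defined in ibd.h, not visible in this hunk), so a transmitter can drop its hold without the cache lock and only the thread whose decrement leaves a marked-for-recycle entry at zero takes the slow path and re-checks under id_ac_mutex. A sketch of one plausible encoding of that idea, not necessarily the driver's, using C11 atomics:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Top bit: "recycle when the last hold drops"; low bits: the hold count. */
#define	RECYCLE_BIT	0x80000000u

typedef struct ace {
	_Atomic uint32_t	ac_ref;
} ace_t;

/* Multicast-disable path: mark the cache entry for recycling. */
void
request_recycle(ace_t *ace)
{
	(void) atomic_fetch_or(&ace->ac_ref, RECYCLE_BIT);
}

/*
 * Transmit-completion path: drop one hold.  Returns true only for the
 * caller whose decrement leaves just the recycle bit standing; that
 * caller must still re-check under the cache mutex, since another
 * sender may have taken a new hold in the meantime.
 */
bool
dec_ref_do_cycle(ace_t *ace)
{
	uint32_t newval;

	newval = atomic_fetch_sub(&ace->ac_ref, 1) - 1;
	return (newval == RECYCLE_BIT);
}
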
+
 /*
  * Common code that deals with clean ups after a successful or
  * erroneous transmission attempt.
@@ -6051,89 +6207,66 @@
 	 * ibd_send() error path.
 	 */
 	if (ace != NULL) {
-		/*
-		 * The recycling logic can be eliminated from here
-		 * and put into the async thread if we create another
-		 * list to hold ACE's for unjoined mcg's.
-		 */
-		if (DEC_REF_DO_CYCLE(ace)) {
-			ibd_mce_t *mce;
-
-			/*
-			 * Check with the lock taken: we decremented
-			 * reference count without the lock, and some
-			 * transmitter might alreay have bumped the
-			 * reference count (possible in case of multicast
-			 * disable when we leave the AH on the active
-			 * list). If not still 0, get out, leaving the
-			 * recycle bit intact.
-			 *
-			 * Atomically transition the AH from active
-			 * to free list, and queue a work request to
-			 * leave the group and destroy the mce. No
-			 * transmitter can be looking at the AH or
-			 * the MCE in between, since we have the
-			 * ac_mutex lock. In the SendOnly reap case,
-			 * it is not neccesary to hold the ac_mutex
-			 * and recheck the ref count (since the AH was
-			 * taken off the active list), we just do it
-			 * to have uniform processing with the Full
-			 * reap case.
-			 */
-			mutex_enter(&state->id_ac_mutex);
-			mce = ace->ac_mce;
-			if (GET_REF_CYCLE(ace) == 0) {
-				CLEAR_REFCYCLE(ace);
-				/*
-				 * Identify the case of fullmember reap as
-				 * opposed to mcg trap reap. Also, port up
-				 * might set ac_mce to NULL to indicate Tx
-				 * cleanup should do no more than put the
-				 * AH in the free list (see ibd_async_link).
-				 */
-				if (mce != NULL) {
-					ace->ac_mce = NULL;
-					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
-					/*
-					 * mc_req was initialized at mce
-					 * creation time.
-					 */
-					ibd_queue_work_slot(state,
-					    &mce->mc_req, IBD_ASYNC_REAP);
-				}
-				IBD_ACACHE_INSERT_FREE(state, ace);
-			}
-			mutex_exit(&state->id_ac_mutex);
-		}
+		ibd_dec_ref_ace(state, ace);
 	}
 
 	/*
 	 * Release the send wqe for reuse.
 	 */
-	ibd_release_swqe(state, swqe);
+	swqe->swqe_next = NULL;
+	ibd_release_swqe(state, swqe, swqe, 1);
 }
 
-/*
- * Hand off the processed rx mp chain to mac_rx()
- */
 static void
-ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
+ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
 {
-	if (mpc == NULL) {
-		mutex_enter(&state->id_rx_lock);
-
-		mpc = state->id_rx_mp;
-
-		state->id_rx_mp = NULL;
-		state->id_rx_mp_tail = NULL;
-		state->id_rx_mp_len = 0;
-
-		mutex_exit(&state->id_rx_lock);
-	}
-
-	if (mpc) {
-		mac_rx(state->id_mh, state->id_rh, mpc);
-	}
+	ibd_ace_t *ace;
+	ibd_swqe_t *swqe;
+	int n = 0;
+
+	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
+
+	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
+
+		/*
+		 * If this was a dynamic mapping in ibd_send(), we need to
+		 * unmap here. If this was an lso buffer we'd used for sending,
+		 * we need to release the lso buf to the pool, since the
+		 * resource is scarce. However, if this was simply a normal
+		 * send using the copybuf (present in each swqe), we don't need
+		 * to release it.
+		 */
+		if (swqe->swqe_im_mblk != NULL) {
+			if (swqe->w_buftype == IBD_WQE_MAPPED) {
+				ibd_unmap_mem(state, swqe);
+			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
+				ibd_release_lsobufs(state,
+				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
+			}
+			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
+			freemsg(swqe->swqe_im_mblk);
+			swqe->swqe_im_mblk = NULL;
+		}
+
+		/*
+		 * Drop the reference count on the AH; it can be reused
+		 * now for a different destination if there are no more
+		 * posted sends that will use it. This can be eliminated
+		 * if we can always associate each Tx buffer with an AH.
+		 * The ace can be null if we are cleaning up from the
+		 * ibd_send() error path.
+		 */
+		ace = swqe->w_ahandle;
+		if (ace != NULL) {
+			ibd_dec_ref_ace(state, ace);
+		}
+		n++;
+	}
+
+	/*
+	 * Release the send wqes for reuse.
+	 */
+	ibd_release_swqe(state, head, tail, n);
 }
 
 /*
@@ -6141,30 +6274,48 @@
  * in the format expected by GLD.  The received packet has this
  * format: 2b sap :: 00 :: data.
  */
-static void
+static mblk_t *
 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
 {
 	ib_header_info_t *phdr;
 	mblk_t *mp;
-	mblk_t *mpc = NULL;
 	ipoib_hdr_t *ipibp;
 	ipha_t *iphap;
 	ip6_t *ip6h;
-	int rxcnt, len;
+	int len;
+	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
+	uint32_t bufs;
+
+	atomic_add_32(&state->id_rx_list.dl_cnt, -1);
 
 	/*
 	 * Track number handed to upper layer, and number still
 	 * available to receive packets.
 	 */
-	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
-	ASSERT(rxcnt >= 0);
-	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
+	bufs = atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 1);
+
+	/* Never run out of rwqes, use allocb when running low */
+	if (bufs >= state->id_rx_bufs_outstanding_limit) {
+		atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
+		atomic_inc_32(&state->id_rx_allocb);
+		mp = allocb(pkt_len, BPRI_HI);
+		if (mp) {
+			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
+			ibd_post_recv(state, rwqe);
+		} else {	/* no memory */
+			atomic_inc_32(&state->id_rx_allocb_failed);
+			ibd_post_recv(state, rwqe);
+			return (NULL);
+		}
+	} else {
+		mp = rwqe->rwqe_im_mblk;
+	}
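
The block above is the new flow control on loaned-up receive buffers: once dl_bufs_outstanding crosses id_rx_bufs_outstanding_limit, the packet is copied into an allocb()'d mblk and the rwqe is reposted immediately, so the receive ring can never be drained dry by buffers the stack is still holding. A user-space sketch of that copy-versus-loan decision (illustrative limit and names; malloc/memcpy stand in for allocb/bcopy):

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define	OUTSTANDING_LIMIT	512U	/* illustrative cap, not the driver's */

static _Atomic uint32_t	bufs_outstanding;

/* Stand-in for posting the receive buffer back onto the hardware ring. */
static void
repost_rx_buf(void *ringbuf)
{
	(void) ringbuf;
}

/*
 * Returns the buffer to hand upstream: the ring buffer itself (loaned and
 * counted against the limit) or a private copy when too many are out.
 */
void *
process_rx(void *ringbuf, size_t pkt_len, int *loaned)
{
	uint32_t outstanding;
	void *copy;

	outstanding = atomic_fetch_add(&bufs_outstanding, 1) + 1;
	if (outstanding < OUTSTANDING_LIMIT) {
		*loaned = 1;
		return (ringbuf);	/* returned later via the free callback */
	}

	/* Too many loaned out: copy, undo the count, repost right away. */
	(void) atomic_fetch_sub(&bufs_outstanding, 1);
	copy = malloc(pkt_len);
	if (copy != NULL)
		(void) memcpy(copy, ringbuf, pkt_len);
	repost_rx_buf(ringbuf);
	*loaned = 0;
	return (copy);			/* NULL if the allocation failed */
}
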
+
 
 	/*
 	 * Adjust write pointer depending on how much data came in.
 	 */
-	mp = rwqe->rwqe_im_mblk;
-	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
+	mp->b_wptr = mp->b_rptr + pkt_len;
 
 	/*
 	 * Make sure this is NULL or we're in trouble.
@@ -6192,7 +6343,7 @@
 		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
 		    IPOIB_ADDRL) == 0) {
 			freemsg(mp);
-			return;
+			return (NULL);
 		}
 
 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
@@ -6220,32 +6371,9 @@
 	 */
 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
-		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
-			if (!pullupmsg(mp, IPV6_HDR_LEN +
-			    sizeof (ipoib_hdr_t))) {
-				DPRINT(10, "ibd_process_rx: pullupmsg failed");
-				freemsg(mp);
-				return;
-			}
-			ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
-			    sizeof (ipoib_pgrh_t));
-		}
 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
 		len = ntohs(ip6h->ip6_plen);
 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
-			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
-			    IPV6_HDR_LEN + len) {
-				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
-				    IPV6_HDR_LEN + len)) {
-					DPRINT(10, "ibd_process_rx: pullupmsg"
-					    " failed");
-					freemsg(mp);
-					return;
-				}
-				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
-				    sizeof (ipoib_pgrh_t) +
-				    sizeof (ipoib_hdr_t));
-			}
 			/* LINTED: E_CONSTANT_CONDITION */
 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
 		}
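
The deleted pullupmsg() calls guarded against the IPv6 header or ICMPv6 payload not being wholly contained in the first data block; presumably, with each received packet now delivered in (or copied into) a single contiguous buffer sized for the full MTU, that case no longer needs repairing, and a plain length check before reading the headers is the defensive equivalent. A generic sketch of that check-then-read idiom for a linear buffer (hypothetical header layout and names):

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <arpa/inet.h>

/* Minimal IPv6 fixed header layout (40 bytes), for illustration only. */
typedef struct ip6_fixed {
	uint32_t	vcf;
	uint16_t	plen;		/* payload length, network order */
	uint8_t		nxt;
	uint8_t		hlim;
	uint8_t		src[16];
	uint8_t		dst[16];
} ip6_fixed_t;

/*
 * Reads the IPv6 payload length from a linear packet buffer, returning -1
 * if the buffer is too short to contain the header at the given offset
 * (the case the old pullupmsg() path existed to repair).
 */
int
get_ip6_plen(const uint8_t *pkt, size_t pkt_len, size_t l3_off, uint16_t *plen)
{
	ip6_fixed_t hdr;

	if (pkt_len < l3_off + sizeof (hdr))
		return (-1);
	(void) memcpy(&hdr, pkt + l3_off, sizeof (hdr));
	*plen = ntohs(hdr.plen);
	return (0);
}
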
@@ -6254,7 +6382,7 @@
 	/*
 	 * Update statistics
 	 */
-	atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
+	atomic_add_64(&state->id_rcv_bytes, pkt_len);
 	atomic_inc_64(&state->id_rcv_pkt);
 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
 		atomic_inc_64(&state->id_brd_rcv);
@@ -6278,35 +6406,7 @@
 		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
 	}
 
-	/*
-	 * Add this mp to the list of processed mp's to send to
-	 * the nw layer
-	 */
-	mutex_enter(&state->id_rx_lock);
-	if (state->id_rx_mp) {
-		ASSERT(state->id_rx_mp_tail != NULL);
-		state->id_rx_mp_tail->b_next = mp;
-	} else {
-		ASSERT(state->id_rx_mp_tail == NULL);
-		state->id_rx_mp = mp;
-	}
-
-	state->id_rx_mp_tail = mp;
-	state->id_rx_mp_len++;
-
-	if (state->id_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
-		mpc = state->id_rx_mp;
-
-		state->id_rx_mp = NULL;
-		state->id_rx_mp_tail = NULL;
-		state->id_rx_mp_len = 0;
-	}
-
-	mutex_exit(&state->id_rx_lock);
-
-	if (mpc) {
-		ibd_flush_rx(state, mpc);
-	}
+	return (mp);
 }
 
 /*
@@ -6325,47 +6425,30 @@
 	if (rwqe->w_freeing_wqe == B_TRUE) {
 		DPRINT(6, "ibd_freemsg: wqe being freed");
 		return;
-	} else {
-		/*
-		 * Upper layer has released held mblk, so we have
-		 * no more use for keeping the old pointer in
-		 * our rwqe.
-		 */
-		rwqe->rwqe_im_mblk = NULL;
 	}
 
 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
 	if (rwqe->rwqe_im_mblk == NULL) {
-		ibd_delete_rwqe(state, rwqe);
 		ibd_free_rwqe(state, rwqe);
 		DPRINT(6, "ibd_freemsg: desballoc failed");
 		return;
 	}
 
-	if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) {
-		ibd_delete_rwqe(state, rwqe);
-		ibd_free_rwqe(state, rwqe);
-		return;
-	}
+	ibd_post_recv(state, rwqe);
 
 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
 }
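
The rwqe free-message callback above (attached to each loaned rx mblk through w_freemsg_cb) is what closes the loan: when the network stack finally frees the message, the callback re-wraps the same pre-registered copy buffer with desballoc(), reposts the rwqe, and drops dl_bufs_outstanding. A user-space sketch of that loan/return cycle built around a function-pointer free routine (illustrative types; only the shape of the recycling is meant to match):

#include <stddef.h>

/* A loaned buffer carries the routine that reclaims it, like an mblk's frtn_t. */
typedef struct loanbuf {
	void	*data;
	size_t	size;
	void	(*free_fn)(void *arg);
	void	*free_arg;
} loanbuf_t;

typedef struct rwqe {
	void		*copybuf;	/* pre-registered receive buffer */
	size_t		bufsize;
	loanbuf_t	loan;		/* wrapper currently loaned upstream */
} rwqe_t;

/* Stand-in for re-posting the rwqe onto the receive queue. */
static void
repost_recv(rwqe_t *rwqe)
{
	(void) rwqe;
}

/* Free routine invoked by the consumer once it is done with the data. */
static void
rx_freemsg_cb(void *arg)
{
	rwqe_t *rwqe = arg;

	/* Re-wrap the same buffer (the desballoc() step) and give it back. */
	rwqe->loan.data = rwqe->copybuf;
	rwqe->loan.size = rwqe->bufsize;
	rwqe->loan.free_fn = rx_freemsg_cb;
	rwqe->loan.free_arg = rwqe;
	repost_recv(rwqe);
}

/* Consumer side: releasing the wrapper fires the attached free routine. */
void
loanbuf_free(loanbuf_t *lb)
{
	lb->free_fn(lb->free_arg);
}
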
 
 static uint_t
-ibd_tx_recycle(char *arg)
+ibd_tx_recycle(caddr_t arg)
 {
 	ibd_state_t *state = (ibd_state_t *)arg;
 
 	/*
 	 * Poll for completed entries
 	 */
-	ibd_poll_compq(state, state->id_scq_hdl);
-
-	/*
-	 * Resume any blocked transmissions if possible
-	 */
-	(void) ibd_resume_transmission(state);
+	ibd_poll_scq(state, state->id_scq_hdl);
 
 	return (DDI_INTR_CLAIMED);
 }
--- a/usr/src/uts/common/sys/ib/adapters/hermon/hermon_hw.h	Fri Oct 23 15:00:42 2009 -0700
+++ b/usr/src/uts/common/sys/ib/adapters/hermon/hermon_hw.h	Fri Oct 23 15:06:39 2009 -0700
@@ -1151,13 +1151,19 @@
 	uint32_t	cap_mask;
 
 	uint32_t	rqk		:1;	/* reset qkey violation cntr */
-	uint32_t			:15;
+	uint32_t	rcm		:1;	/* reset capability mask */
+	uint32_t			:2;
+	uint32_t	vl_cap		:4;
+	uint32_t			:4;
+	uint32_t	mtu_cap		:4;
 	uint32_t	g0		:1;	/* set port GUID0 */
 	uint32_t	ng		:1;	/* set node GUID (all ports) */
 	uint32_t	sig		:1;	/* set sys image */
 	uint32_t	mg		:1;	/* change GID table */
 	uint32_t	mp		:1;	/* change pkey table size */
-	uint32_t			:11;
+	uint32_t	mvc		:1;	/* change vl_cap */
+	uint32_t	mmc		:1;	/* change mtu_cap */
+	uint32_t			:9;
 
 	uint64_t	sys_img_guid;
 
@@ -1185,13 +1191,19 @@
 };
 #else	/* BIG ENDIAN */
 struct hermon_hw_set_port_s {
-	uint32_t			:11;
+	uint32_t			:9;
+	uint32_t	mmc		:1;	/* change mtu_cap */
+	uint32_t	mvc		:1;	/* change vl_cap */
 	uint32_t	mp		:1;	/* change pkey table size */
 	uint32_t	mg		:1;	/* change GID table size */
 	uint32_t	sig		:1;	/* set sys image GUID */
 	uint32_t	ng		:1;	/* set node GUID (all ports) */
 	uint32_t	g0		:1;	/* set port GUID0 */
-	uint32_t			:15;
+	uint32_t	mtu_cap		:4;
+	uint32_t			:4;
+	uint32_t	vl_cap		:4;
+	uint32_t			:2;
+	uint32_t	rcm		:1;	/* reset capability mask */
 	uint32_t	rqk		:1;	/* reset qkey violation cntr */
 
 	uint32_t	cap_mask;
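
For reference, the new 4-bit mtu_cap field (like the MTUCap/NeighborMTU fields in PortInfo generally) encodes the port's maximum MTU as a small enumerated code rather than a byte count, and vl_cap is likewise an enumeration of supported VL ranges rather than a plain VL count. A lookup of the MTU codes, taken from the InfiniBand spec rather than from this header:

#include <stdint.h>

/* IBA MTU enumeration used by MTUCap/NeighborMTU-style fields. */
static const uint32_t ib_mtu_bytes[] = {
	0,	/* 0: reserved */
	256,	/* 1 */
	512,	/* 2 */
	1024,	/* 3 */
	2048,	/* 4 */
	4096,	/* 5: the largest IB MTU */
};

uint32_t
ib_mtu_enum_to_bytes(uint32_t mtu_code)
{
	if (mtu_code < 1 || mtu_code > 5)
		return (0);
	return (ib_mtu_bytes[mtu_code]);
}
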
--- a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h	Fri Oct 23 15:00:42 2009 -0700
+++ b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h	Fri Oct 23 15:06:39 2009 -0700
@@ -144,15 +144,12 @@
  * Pre-registered copybuf used for send and receive
  */
 typedef struct ibd_copybuf_s {
-	ibt_mr_hdl_t		ic_mr_hdl;
 	ibt_wr_ds_t		ic_sgl;
-	ibt_mr_desc_t		ic_mr_desc;
 	uint8_t			*ic_bufaddr;
 } ibd_copybuf_t;
 
 typedef struct ibd_wqe_s {
 	struct ibd_wqe_s	*w_next;
-	struct ibd_wqe_s	*w_prev;
 	ibd_wqe_type_t		w_type;
 	ibd_copybuf_t		w_copybuf;
 	mblk_t			*im_mblk;
@@ -171,7 +168,6 @@
 } ibd_swqe_t;
 
 #define	swqe_next		w_ibd_swqe.w_next
-#define	swqe_prev		w_ibd_swqe.w_prev
 #define	swqe_type		w_ibd_swqe.w_type
 #define	swqe_copybuf		w_ibd_swqe.w_copybuf
 #define	swqe_im_mblk		w_ibd_swqe.im_mblk
@@ -187,11 +183,9 @@
 	ibt_recv_wr_t		w_rwr;
 	boolean_t		w_freeing_wqe;
 	frtn_t			w_freemsg_cb;
-	ibd_wqe_t		*w_post_link;
 } ibd_rwqe_t;
 
 #define	rwqe_next		w_ibd_rwqe.w_next
-#define	rwqe_prev		w_ibd_rwqe.w_prev
 #define	rwqe_type		w_ibd_rwqe.w_type
 #define	rwqe_copybuf		w_ibd_rwqe.w_copybuf
 #define	rwqe_im_mblk		w_ibd_rwqe.im_mblk
@@ -199,14 +193,13 @@
 #define	WQE_TO_RWQE(wqe)	(ibd_rwqe_t *)wqe
 
 typedef struct ibd_list_s {
+	kmutex_t		dl_mutex;
 	ibd_wqe_t		*dl_head;
-	ibd_wqe_t		*dl_tail;
 	union {
 		boolean_t	pending_sends;
 		uint32_t	bufs_outstanding;
 	} ustat;
 	uint32_t		dl_cnt;
-	kmutex_t		dl_mutex;
 } ibd_list_t;
 
 #define	dl_pending_sends	ustat.pending_sends
@@ -240,6 +233,25 @@
 } ibd_lsobkt_t;
 
 /*
+ * Posting to a single software rx post queue is contentious,
+ * so break it out into an array of multiple queues.
+ *
+ * Try to ensure rx_queue structs fall in different cache lines using a filler.
+ * Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes.
+ */
+#define	RX_QUEUE_CACHE_LINE \
+	(64 - ((sizeof (kmutex_t) + 2 * sizeof (ibd_wqe_t *) + \
+	2 * sizeof (uint32_t))))
+typedef struct ibd_rx_queue_s {
+	kmutex_t		rx_post_lock;
+	ibd_wqe_t		*rx_head;
+	ibd_wqe_t		*rx_tail;
+	uint32_t		rx_stat;
+	uint32_t		rx_cnt;
+	uint8_t			rx_cache_filler[RX_QUEUE_CACHE_LINE];
+} ibd_rx_queue_t;
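
The filler keeps each per-queue structure on its own 64-byte cache line, which is why the comment above warns that RX_QUEUE_CACHE_LINE must be recomputed whenever the fields change. A stand-alone sketch of the same trick with a compile-time size check added (the assert is an illustration, not something this header does; a uint64_t stands in for the kmutex_t so the arithmetic stays portable):

#include <stdint.h>

#define	CACHE_LINE	64

typedef struct rx_queue {
	uint64_t	rx_post_lock;	/* stand-in for the kmutex_t */
	void		*rx_head;
	void		*rx_tail;
	uint32_t	rx_stat;
	uint32_t	rx_cnt;
	uint8_t		rx_filler[CACHE_LINE -
	    (sizeof (uint64_t) + 2 * sizeof (void *) + 2 * sizeof (uint32_t))];
} rx_queue_t;

/* Fails the build if the filler arithmetic ever goes stale. */
_Static_assert(sizeof (rx_queue_t) == CACHE_LINE,
    "rx_queue_t must occupy exactly one cache line");
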
+
+/*
  * This structure maintains information per port per HCA
  * (per network interface).
  */
@@ -250,47 +262,59 @@
 	ibt_pd_hdl_t		id_pd_hdl;
 	kmem_cache_t		*id_req_kmc;
 
+	ibd_list_t		id_tx_rel_list;
+
 	uint32_t		id_max_sqseg;
+	uint32_t		id_max_sqseg_hiwm;
 	ibd_list_t		id_tx_list;
 	ddi_softintr_t		id_tx;
 	uint32_t		id_tx_sends;
 
+	kmutex_t		id_txpost_lock;
+	ibd_swqe_t		*id_tx_head;
+	ibd_swqe_t		*id_tx_tail;
+	int			id_tx_busy;
+
+	uint_t			id_tx_buf_sz;
 	uint8_t			*id_tx_bufs;
+	ibd_swqe_t		*id_tx_wqes;
 	ibt_mr_hdl_t		id_tx_mr_hdl;
 	ibt_mr_desc_t		id_tx_mr_desc;
-	uint_t			id_tx_buf_sz;
 
 	kmutex_t		id_lso_lock;
 	ibd_lsobkt_t		*id_lso;
 
-	kmutex_t		id_cq_poll_lock;
-	int			id_cq_poll_busy;
+	kmutex_t		id_scq_poll_lock;
+	int			id_scq_poll_busy;
 
 	ibt_cq_hdl_t		id_scq_hdl;
 	ibt_wc_t		*id_txwcs;
 	uint32_t		id_txwcs_size;
 
-	kmutex_t		id_txpost_lock;
-	ibd_swqe_t		*id_tx_head;
-	ibd_wqe_t		**id_tx_tailp;
-	int			id_tx_busy;
+	kmutex_t		id_rx_post_lock;
+	int			id_rx_post_busy;
+	int			id_rx_nqueues;
+	ibd_rx_queue_t		*id_rx_queues;
+	ibd_wqe_t		*id_rx_post_head;
 
-	kmutex_t		id_rxpost_lock;
-	ibd_rwqe_t		*id_rx_head;
-	ibd_wqe_t		**id_rx_tailp;
-	int			id_rx_busy;
-
-	kmutex_t		id_rx_lock;
-	mblk_t			*id_rx_mp;
-	mblk_t			*id_rx_mp_tail;
-	uint32_t		id_rx_mp_len;
-
+	ibd_rwqe_t		*id_rx_wqes;
+	uint8_t			*id_rx_bufs;
+	ibt_mr_hdl_t		id_rx_mr_hdl;
+	ibt_mr_desc_t		id_rx_mr_desc;
+	uint_t			id_rx_buf_sz;
 	uint32_t		id_num_rwqe;
 	ibd_list_t		id_rx_list;
 	ddi_softintr_t		id_rx;
-	ibt_cq_hdl_t		id_rcq_hdl;
+	uint32_t		id_rx_bufs_outstanding_limit;
+	uint32_t		id_rx_allocb;
+	uint32_t		id_rx_allocb_failed;
+	ibd_list_t		id_rx_free_list;
+
+	kmutex_t		id_rcq_poll_lock;
+	int			id_rcq_poll_busy;
+	uint32_t		id_rxwcs_size;
 	ibt_wc_t		*id_rxwcs;
-	uint32_t		id_rxwcs_size;
+	ibt_cq_hdl_t		id_rcq_hdl;
 
 	ibt_channel_hdl_t	id_chnl_hdl;
 	ib_pkey_t		id_pkey;
@@ -315,6 +339,7 @@
 	kt_did_t		id_async_thrid;
 
 	kmutex_t		id_ac_mutex;
+	ibd_ace_t		*id_ac_hot_ace;
 	struct list		id_ah_active;
 	struct list		id_ah_free;
 	ipoib_mac_t		id_ah_addr;
@@ -337,6 +362,8 @@
 
 	kmutex_t		id_sched_lock;
 	int			id_sched_needed;
+	int			id_sched_cnt;
+	int			id_sched_lso_cnt;
 
 	kmutex_t		id_link_mutex;
 	link_state_t		id_link_state;