changeset 9985:f0a94a205b15

6845344 IBD driver should attach if PKEY is Valid with fullmembership bit set
author Venkatakrishnan Rajagopalan <Venkatakrishnan.Rajagopalan@Sun.COM>
date Mon, 29 Jun 2009 02:56:22 -0700
parents cc88cd2a9d42
children 4d51e0eb2206
files usr/src/uts/common/io/ib/clients/ibd/ibd.c usr/src/uts/common/sys/ib/clients/ibd/ibd.h usr/src/uts/intel/ibd/Makefile usr/src/uts/sparc/ibd/Makefile
diffstat 4 files changed, 1009 insertions(+), 822 deletions(-)
--- a/usr/src/uts/common/io/ib/clients/ibd/ibd.c	Mon Jun 29 02:30:05 2009 -0700
+++ b/usr/src/uts/common/io/ib/clients/ibd/ibd.c	Mon Jun 29 02:56:22 2009 -0700
@@ -157,6 +157,12 @@
 static uint_t ibd_rxcomp_usec = 10;
 
 /*
+ * Send CQ moderation parameters: NOT tunables
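+ * (with these values, the send CQ interrupt fires after IBD_TXCOMP_COUNT
+ * completions or IBD_TXCOMP_USEC microseconds, whichever occurs first)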
+ */
+#define	IBD_TXCOMP_COUNT		10
+#define	IBD_TXCOMP_USEC			300
+
+/*
  * Thresholds
  *
  * When waiting for resources (swqes or lso buffers) to become available,
@@ -225,11 +231,36 @@
 #define	IBD_OP_ROUTERED			4
 
 /*
+ * State of IBD driver initialization during attach/m_start
+ */
+#define	IBD_DRV_STATE_INITIALIZED	0x00001
+#define	IBD_DRV_RXINTR_ADDED		0x00002
+#define	IBD_DRV_TXINTR_ADDED		0x00004
+#define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
+#define	IBD_DRV_HCA_OPENED		0x00010
+#define	IBD_DRV_PD_ALLOCD		0x00020
+#define	IBD_DRV_MAC_REGISTERED		0x00040
+#define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
+#define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
+#define	IBD_DRV_ACACHE_INITIALIZED	0x00200
+#define	IBD_DRV_CQS_ALLOCD		0x00400
+#define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
+#define	IBD_DRV_TXLIST_ALLOCD		0x01000
+#define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
+#define	IBD_DRV_RXLIST_ALLOCD		0x04000
+#define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
+#define	IBD_DRV_ASYNC_THR_CREATED	0x10000
+#define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
+#define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
+#define	IBD_DRV_STARTED			0x80000
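+
+/*
+ * Each IBD_DRV_* bit above is set in id_mac_state as the corresponding
+ * attach/start step completes; ibd_unattach() and ibd_undo_m_start()
+ * test these bits so that a partially completed attach or start can
+ * be unwound precisely, in reverse order.
+ */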
+
+/*
  * Miscellaneous constants
  */
 #define	IBD_SEND			0
 #define	IBD_RECV			1
 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
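+/* default SDU: the 2048-byte default IPoIB MTU less the 4-byte header */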
+#define	IBD_DEF_MAX_SDU			2044
 #ifdef IBD_LOGGING
 #define	IBD_DMAX_LINE			100
 #endif
@@ -283,7 +314,6 @@
  * Initialization
  */
 static int ibd_state_init(ibd_state_t *, dev_info_t *);
-static int ibd_drv_init(ibd_state_t *);
 static int ibd_init_txlist(ibd_state_t *);
 static int ibd_init_rxlist(ibd_state_t *);
 static int ibd_acache_init(ibd_state_t *);
@@ -295,7 +325,6 @@
  * Termination/cleanup
  */
 static void ibd_state_fini(ibd_state_t *);
-static void ibd_drv_fini(ibd_state_t *);
 static void ibd_fini_txlist(ibd_state_t *);
 static void ibd_fini_rxlist(ibd_state_t *);
 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
@@ -348,7 +377,7 @@
  */
 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
-static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t);
+static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t);
 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
 static void ibd_flush_rx(ibd_state_t *, mblk_t *);
 
@@ -384,7 +413,6 @@
 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
 static uint64_t ibd_get_portspeed(ibd_state_t *);
-static int ibd_get_portpkey(ibd_state_t *, ib_guid_t *);
 static boolean_t ibd_async_safe(ibd_state_t *);
 static void ibd_async_done(ibd_state_t *);
 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
@@ -394,6 +422,18 @@
 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
 
 /*
+ * Helpers for attach/start routines
+ */
+static int ibd_register_mac(ibd_state_t *, dev_info_t *);
+static int ibd_record_capab(ibd_state_t *, dev_info_t *);
+static int ibd_unattach(ibd_state_t *, dev_info_t *);
+static int ibd_get_port_details(ibd_state_t *);
+static int ibd_alloc_cqs(ibd_state_t *);
+static int ibd_setup_ud_channel(ibd_state_t *);
+static int ibd_undo_m_start(ibd_state_t *);
+
+/*
  * Miscellaneous helpers
  */
 static int ibd_sched_poll(ibd_state_t *, int, int);
@@ -439,7 +479,7 @@
  * GLDv3 entry points
  */
 #define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
-static mac_callbacks_t ib_m_callbacks = {
+static mac_callbacks_t ibd_m_callbacks = {
 	IBD_M_CALLBACK_FLAGS,
 	ibd_m_stat,
 	ibd_m_start,
@@ -625,7 +665,7 @@
 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 
     ibd_state_t::id_link_state))
 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
-_NOTE(SCHEME_PROTECTS_DATA("only async thr and drv init",
+_NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
     ibd_state_t::id_link_speed))
 
 /*
@@ -1236,7 +1276,7 @@
 }
 
 /*
- * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet
+ * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
  * trap or event handling to complete to kill the async thread and deconstruct
  * the mcg/ace list.
  */
@@ -1660,7 +1700,7 @@
 	ibt_path_attr_t path_attr;
 	ibt_path_info_t path_info;
 	ib_gid_t destgid;
-	int ret = IBD_OP_NOTSTARTED;
+	char ret = IBD_OP_NOTSTARTED;
 
 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
@@ -1954,7 +1994,7 @@
 	mutex_enter(&state->id_link_mutex);
 
 	/*
-	 * If the init code in ibd_drv_init hasn't yet set up the
+	 * If the init code in ibd_m_start hasn't yet set up the
 	 * pkey/gid, nothing to do; that code will set the link state.
 	 */
 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
@@ -2071,8 +2111,8 @@
 		 * done ibt_open_hca() but not yet done ibt_close_hca().
 		 * Only need to do work for our port; IBTF will deliver
 		 * events for other ports on the hca we have ibt_open_hca'ed
-		 * too. Note that ibd_drv_init() initializes id_port before
-		 * doing ibt_open_hca().
+		 * too. Note that ibd_attach() initializes id_port
+		 * before doing ibt_open_hca().
 		 */
 		ASSERT(state->id_hca_hdl == hca_hdl);
 		if (state->id_port != event->ev_port)
@@ -2091,8 +2131,8 @@
 		 * done ibt_open_hca() but not yet done ibt_close_hca().
 		 * Only need to do work for our port; IBTF will deliver
 		 * events for other ports on the hca we have ibt_open_hca'ed
-		 * too. Note that ibd_drv_init() initializes id_port before
-		 * doing ibt_open_hca().
+		 * too. Note that ibd_attach() initializes id_port
+		 * before doing ibt_open_hca().
 		 */
 		ASSERT(state->id_hca_hdl == hca_hdl);
 		if (state->id_port != event->ev_port)
@@ -2121,160 +2161,330 @@
 	}
 }
 
+static int
+ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
+{
+	mac_register_t *macp;
+	int ret;
+
+	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
+		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * Note that when we register with mac during attach, we don't
+	 * have the id_macaddr yet, so we'll simply be registering a
+	 * zero macaddr that we'll overwrite later during plumb (in
+	 * ibd_m_start()). Similar is the case with id_mtu - we'll
+	 * update the mac layer with the correct mtu during plumb.
+	 */
+	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
+	macp->m_driver = state;
+	macp->m_dip = dip;
+	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
+	macp->m_callbacks = &ibd_m_callbacks;
+	macp->m_min_sdu = 0;
+	macp->m_max_sdu = IBD_DEF_MAX_SDU;
+
+	/*
+	 *  Register ourselves with the GLDv3 interface
+	 */
+	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
+		mac_free(macp);
+		DPRINT(10,
+		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
+		return (DDI_FAILURE);
+	}
+
+	mac_free(macp);
+	return (DDI_SUCCESS);
+}
+
+static int
+ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
+{
+	ibt_hca_attr_t hca_attrs;
+	ibt_status_t ibt_status;
+
+	/*
+	 * Query the HCA and fetch its attributes
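+	 * (this is expected to succeed for a valid, open HCA handle,
+	 * hence the ASSERT below)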
+	 */
+	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
+	ASSERT(ibt_status == IBT_SUCCESS);
+
+	/*
+	 * 1. Set the Hardware Checksum capability. Currently we only consider
+	 *    full checksum offload.
+	 */
+	if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) {
+		state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
+	}
+
+	/*
+	 * 2. Set LSO policy, capability and maximum length
+	 */
+	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
+		state->id_lso_policy = B_TRUE;
+	} else {
+		state->id_lso_policy = B_FALSE;
+	}
+	if (hca_attrs.hca_max_lso_size > 0) {
+		state->id_lso_capable = B_TRUE;
+		if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
+			state->id_lso_maxlen = IBD_LSO_MAXLEN;
+		else
+			state->id_lso_maxlen = hca_attrs.hca_max_lso_size;
+	} else {
+		state->id_lso_capable = B_FALSE;
+		state->id_lso_maxlen = 0;
+	}
+
+	/*
+	 * 3. Set Reserved L_Key capability
+	 */
+	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
+		state->id_hca_res_lkey_capab = 1;
+		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
+	}
+
+	/*
+	 * 4. Set maximum sqseg value after checking to see if extended sgl
+	 *    size information is provided by the hca
+	 */
+	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
+		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
+	} else {
+		state->id_max_sqseg = hca_attrs.hca_max_sgl;
+	}
+	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
+		state->id_max_sqseg = IBD_MAX_SQSEG;
+	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
+		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
+		    state->id_max_sqseg, IBD_MAX_SQSEG);
+	}
+
+	/*
+	 * 5. Set number of recv and send wqes after checking hca maximum
+	 *    channel size
+	 */
+	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
+		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
+	} else {
+		state->id_num_rwqe = IBD_NUM_RWQE;
+	}
+	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
+		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
+	} else {
+		state->id_num_swqe = IBD_NUM_SWQE;
+	}
+
+	return (DDI_SUCCESS);
+}
+
+static int
+ibd_unattach(ibd_state_t *state, dev_info_t *dip)
+{
+	int instance;
+	uint32_t progress = state->id_mac_state;
+	ibt_status_t ret;
+
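+	/*
+	 * Undo only those attach steps whose IBD_DRV_* bits are set in
+	 * id_mac_state, in the reverse order of their setup
+	 */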
+	if (progress & IBD_DRV_MAC_REGISTERED) {
+		(void) mac_unregister(state->id_mh);
+		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
+	}
+
+	if (progress & IBD_DRV_PD_ALLOCD) {
+		if ((ret = ibt_free_pd(state->id_hca_hdl,
+		    state->id_pd_hdl)) != IBT_SUCCESS) {
+			ibd_print_warn(state, "failed to free "
+			    "protection domain, ret=%d", ret);
+		}
+		state->id_pd_hdl = NULL;
+		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
+	}
+
+	if (progress & IBD_DRV_HCA_OPENED) {
+		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
+		    IBT_SUCCESS) {
+			ibd_print_warn(state, "failed to close "
+			    "HCA device, ret=%d", ret);
+		}
+		state->id_hca_hdl = NULL;
+		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
+	}
+
+	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
+		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
+			ibd_print_warn(state,
+			    "ibt_detach() failed, ret=%d", ret);
+		}
+		state->id_ibt_hdl = NULL;
+		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
+	}
+
+	if (progress & IBD_DRV_TXINTR_ADDED) {
+		ddi_remove_softintr(state->id_tx);
+		state->id_tx = NULL;
+		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
+	}
+
+	if (progress & IBD_DRV_RXINTR_ADDED) {
+		ddi_remove_softintr(state->id_rx);
+		state->id_rx = NULL;
+		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
+	}
+
+	if (progress & IBD_DRV_STATE_INITIALIZED) {
+		ibd_state_fini(state);
+		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
+	}
+
+	instance = ddi_get_instance(dip);
+	ddi_soft_state_free(ibd_list, instance);
+
+	return (DDI_SUCCESS);
+}
+
 /*
  * Attach device to the IO framework.
  */
 static int
 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 {
-	mac_register_t *macp;
-	ibd_state_t *state;
+	ibd_state_t *state = NULL;
+	ib_guid_t hca_guid;
 	int instance;
-	int err;
-
-	switch (cmd) {
-		case DDI_ATTACH:
-			break;
-		case DDI_RESUME:
-			/* This driver does not support resume */
-		default:
-			return (DDI_FAILURE);
-	}
+	ibt_status_t ret;
+	int rv;
 
 	/*
-	 * Allocate soft device data structure
+	 * IBD doesn't support suspend/resume
+	 */
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	/*
+	 * Allocate softstate structure
 	 */
 	instance = ddi_get_instance(dip);
 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
 		return (DDI_FAILURE);
 	state = ddi_get_soft_state(ibd_list, instance);
 
-	/* pre ibt_attach() soft state initialization */
+	/*
+	 * Initialize mutexes and condition variables
+	 */
 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
-		DPRINT(10, "ibd_attach : failed in ibd_state_init()");
-		goto attach_fail_state_init;
-	}
-
-	/* alloc rx soft intr */
-	if ((ibd_rx_softintr == 1) &&
-	    ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
-	    NULL, NULL, ibd_intr, (caddr_t)state) != DDI_SUCCESS) {
-		DPRINT(10, "ibd_attach : failed in ddi_add_softintr()");
-		goto attach_fail_ddi_add_rx_softintr;
-	}
-
-	/* alloc tx soft intr */
-	if ((ibd_tx_softintr == 1) &&
-	    ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
-	    NULL, NULL, ibd_tx_recycle, (caddr_t)state) != DDI_SUCCESS) {
-		DPRINT(10, "ibd_attach : failed in ddi_add_softintr()");
-		goto attach_fail_ddi_add_tx_softintr;
-	}
-
-	/* "attach" to IBTL */
-	if (ibt_attach(&ibd_clnt_modinfo, dip, state,
-	    &state->id_ibt_hdl) != IBT_SUCCESS) {
-		DPRINT(10, "ibd_attach : failed in ibt_attach()");
-		goto attach_fail_ibt_attach;
-	}
-
-	/* Finish initializing this driver */
-	if (ibd_drv_init(state) != DDI_SUCCESS) {
-		DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n");
-		goto attach_fail_drv_init;
-	}
+		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
+		goto attach_fail;
+	}
+	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
 
 	/*
-	 * Initialize pointers to device specific functions which will be
-	 * used by the generic layer.
+	 * Allocate rx,tx softintr
 	 */
-	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
-		DPRINT(10, "ibd_attach : failed in mac_alloc()");
-		goto attach_fail_drv_init;
-	}
-
-	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
-	macp->m_driver = state;
-	macp->m_dip = state->id_dip;
-	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
-	macp->m_callbacks = &ib_m_callbacks;
-	macp->m_min_sdu = 0;
-	macp->m_max_sdu = state->id_mtu - IPOIB_HDRSIZE;
-
-	/*
-	 *  Register ourselves with the GLDv3 interface
-	 */
-	err = mac_register(macp, &state->id_mh);
-	mac_free(macp);
-	if (err != 0) {
-		DPRINT(10, "ibd_attach : failed in mac_register()");
-		goto attach_fail_mac_register;
+	if (ibd_rx_softintr == 1) {
+		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
+		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
+			DPRINT(10, "ibd_attach: failed in "
+			    "ddi_add_softintr(id_rx),  ret=%d", rv);
+			goto attach_fail;
+		}
+		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
+	}
+	if (ibd_tx_softintr == 1) {
+		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
+		    NULL, NULL, ibd_tx_recycle,
+		    (caddr_t)state)) != DDI_SUCCESS) {
+			DPRINT(10, "ibd_attach: failed in "
+			    "ddi_add_softintr(id_tx), ret=%d", rv);
+			goto attach_fail;
+		}
+		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
 	}
 
 	/*
-	 * Setup the handler we will use for regular DLPI stuff. Its important
-	 * to setup the recv handler after registering with gldv3.
+	 * Obtain IBA P_Key, port number and HCA guid and validate
+	 * them (for P_Key, only full members are allowed as per
+	 * IPoIB specification; neither port number nor HCA guid
+	 * can be zero)
 	 */
-	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
-	if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) !=
-	    IBT_SUCCESS) {
-		DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n");
-		goto attach_fail_setup_handler;
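+	/*
+	 * A full-member P_Key has the membership (high) bit set; any
+	 * value at or below IB_PKEY_INVALID_FULL lacks it or is
+	 * otherwise invalid, and is rejected here.
+	 */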
+	if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
+	    "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
+		DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
+		    state->id_pkey);
+		goto attach_fail;
+	}
+	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
+	    "port-number", 0)) == 0) {
+		DPRINT(10, "ibd_attach: invalid port number (%d)",
+		    state->id_port);
+		goto attach_fail;
+	}
+	if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
+	    "hca-guid", 0)) == 0) {
+		DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
+		    hca_guid);
+		goto attach_fail;
 	}
 
 	/*
-	 * Setup the subnet notices handler after we initialize the a/mcaches
-	 * and start the async thread, both of which are required for the
-	 * trap handler to function properly. Enable the trap handler to
-	 * queue requests to the async thread after the mac_register, because
-	 * the async daemon invokes mac_tx_update(), which must be done after
-	 * mac_register().
+	 * Attach to IBTL
+	 */
+	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
+	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
+		goto attach_fail;
+	}
+	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
+
+	/*
+	 * Open the HCA
 	 */
-	ibt_register_subnet_notices(state->id_ibt_hdl,
-	    ibd_snet_notices_handler, state);
-	mutex_enter(&state->id_trap_lock);
-	state->id_trap_stop = B_FALSE;
-	mutex_exit(&state->id_trap_lock);
+	if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
+	    &state->id_hca_hdl)) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
+		goto attach_fail;
+	}
+	state->id_mac_state |= IBD_DRV_HCA_OPENED;
+
+	/*
+	 * Record capabilities
+	 */
+	(void) ibd_record_capab(state, dip);
 
 	/*
-	 * Indicate link status to GLDv3 and higher layers. By default,
-	 * we assume we are in up state (which must have been true at
-	 * least at the time the broadcast mcg's were probed); if there
-	 * were any up/down transitions till the time we come here, the
-	 * async handler will have updated last known state, which we
-	 * use to tell GLDv3. The async handler will not send any
-	 * notifications to GLDv3 till we reach here in the initialization
-	 * sequence.
+	 * Allocate a protection domain on the HCA
+	 */
+	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
+	    &state->id_pd_hdl)) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
+		goto attach_fail;
+	}
+	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
+
+	/*
+	 * Register ibd interfaces with the Nemo framework
 	 */
-	mac_link_update(state->id_mh, state->id_link_state);
-
+	if (ibd_register_mac(state, dip) != DDI_SUCCESS) {
+		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
+		goto attach_fail;
+	}
+	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
+
+	/*
+	 * We're done with everything we could do to make the attach
+	 * succeed.  All the buffer allocations and IPoIB broadcast
+	 * group joins are deferred to when the interface instance
+	 * is actually plumbed to avoid wasting memory.
+	 */
 	return (DDI_SUCCESS);
 
-	/* Attach failure points, cleanup */
-attach_fail_setup_handler:
-	(void) mac_unregister(state->id_mh);
-
-attach_fail_mac_register:
-	ibd_drv_fini(state);
-
-attach_fail_drv_init:
-	if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS)
-		ibd_print_warn(state, "failed to free IB resources");
-
-attach_fail_ibt_attach:
-	if (ibd_tx_softintr == 1)
-		ddi_remove_softintr(state->id_tx);
-
-attach_fail_ddi_add_tx_softintr:
-	if (ibd_rx_softintr == 1)
-		ddi_remove_softintr(state->id_rx);
-
-attach_fail_ddi_add_rx_softintr:
-	ibd_state_fini(state);
-
-attach_fail_state_init:
-	ddi_soft_state_free(ibd_list, instance);
-
+attach_fail:
	(void) ibd_unattach(state, dip);
 	return (DDI_FAILURE);
 }
 
@@ -2285,69 +2495,28 @@
 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 {
 	ibd_state_t *state;
-	int status;
 	int instance;
 
-	switch (cmd) {
-		case DDI_DETACH:
-			break;
-		case DDI_SUSPEND:
-		default:
-			return (DDI_FAILURE);
-	}
-
+	/*
+	 * IBD doesn't support suspend/resume
+	 */
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	/*
+	 * Get the instance softstate
+	 */
 	instance = ddi_get_instance(dip);
 	state = ddi_get_soft_state(ibd_list, instance);
 
 	/*
-	 * First, stop receive interrupts; this stops the
-	 * driver from handing up buffers to higher layers.
-	 * Wait for receive buffers to be returned; give up
-	 * after 5 seconds.
+	 * Release all resources we're still holding.  Note that if we'd
+	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
+	 * so far, we should find all the flags we need in id_mac_state.
 	 */
-	ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
-	status = 50;
-	while (state->id_rx_list.dl_bufs_outstanding > 0) {
-		delay(drv_usectohz(100000));
-		if (--status == 0) {
-			DPRINT(2, "ibd_detach : reclaiming failed");
-			goto failed;
-		}
-	}
-
-	if (mac_unregister(state->id_mh) != DDI_SUCCESS) {
-		DPRINT(10, "ibd_detach : failed in mac_unregister()");
-		goto failed;
-	}
-
-	if (ibd_rx_softintr == 1)
-		ddi_remove_softintr(state->id_rx);
-
-	if (ibd_tx_softintr == 1)
-		ddi_remove_softintr(state->id_tx);
-
-	ibd_drv_fini(state);
-
-	if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS)
-		ibd_print_warn(state, "failed to free all IB resources at "
-		    "driver detach time");
-
-	ibd_state_fini(state);
-	ddi_soft_state_free(ibd_list, instance);
+	(void) ibd_unattach(state, dip);
+
 	return (DDI_SUCCESS);
-
-failed:
-	/*
-	 * Reap all the Tx/Rx completions that were posted since we
-	 * turned off the notification. Turn on notifications. There
-	 * is a race in that we do not reap completions that come in
-	 * after the poll and before notifications get turned on. That
-	 * is okay, the next rx/tx packet will trigger a completion
-	 * that will reap any missed completions.
-	 */
-	ibd_poll_compq(state, state->id_rcq_hdl);
-	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
-	return (DDI_FAILURE);
 }
 
 /*
@@ -2424,48 +2593,6 @@
 }
 
 /*
- * Fetch IBA parameters for the network device from IB nexus.
- */
-static int
-ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid)
-{
-	/*
-	 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec.
-	 * Note that the default partition is also allowed.
-	 */
-	state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
-	    0, "port-pkey", IB_PKEY_INVALID_LIMITED);
-	if (state->id_pkey <= IB_PKEY_INVALID_FULL) {
-		DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong"
-		    "partition\n");
-		return (DDI_FAILURE);
-	}
-
-	/*
-	 * ... the IBA port ...
-	 */
-	state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
-	    0, "port-number", 0);
-	if (state->id_port == 0) {
-		DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n");
-		return (DDI_FAILURE);
-	}
-
-	/*
-	 * ... and HCA GUID.
-	 */
-	*hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
-	    0, "hca-guid", 0);
-	if (*hca_guid == 0) {
-		DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong "
-		    "guid\n");
-		return (DDI_FAILURE);
-	}
-
-	return (DDI_SUCCESS);
-}
-
-/*
  * Fetch link speed from SA for snmp ifspeed reporting.
  */
 static uint64_t
@@ -2951,427 +3078,6 @@
 	return (IBT_SUCCESS);
 }
 
-/*
- * Post ibt_attach() initialization.
- */
-static int
-ibd_drv_init(ibd_state_t *state)
-{
-	kthread_t *kht;
-	ibt_ud_chan_alloc_args_t ud_alloc_attr;
-	ibt_ud_chan_query_attr_t ud_chan_attr;
-	ibt_hca_portinfo_t *port_infop;
-	ibt_hca_attr_t hca_attrs;
-	ibt_status_t ibt_status;
-	ibt_cq_attr_t cq_attr;
-	ib_guid_t hca_guid;
-	uint32_t real_size;
-	uint32_t *ptr;
-	char pathname[OBP_MAXPATHLEN];
-	uint_t psize, port_infosz;
-
-	/*
-	 * Initialize id_port before ibt_open_hca because of
-	 * ordering requirements in port up/down handling.
-	 */
-	if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS)
-		return (DDI_FAILURE);
-
-	if (ibt_open_hca(state->id_ibt_hdl, hca_guid,
-	    &state->id_hca_hdl) != IBT_SUCCESS) {
-		DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n");
-		return (DDI_FAILURE);
-	}
-
-	mutex_enter(&state->id_link_mutex);
-	ibt_status = ibt_query_hca_ports(state->id_hca_hdl,
-	    state->id_port, &port_infop, &psize,
-	    &port_infosz);
-	if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
-		mutex_exit(&state->id_link_mutex);
-		DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n");
-		(void) ibt_close_hca(state->id_hca_hdl);
-		return (DDI_FAILURE);
-	}
-
-	/*
-	 * If the link already went down by the time we get here, give up;
-	 * we can not even get the gid since that is not valid. We would
-	 * fail in ibd_find_bgroup() anyway.
-	 */
-	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
-		mutex_exit(&state->id_link_mutex);
-		ibt_free_portinfo(port_infop, port_infosz);
-		(void) ibt_close_hca(state->id_hca_hdl);
-		ibd_print_warn(state, "Port is not active");
-		return (DDI_FAILURE);
-	}
-
-	/*
-	 * This verifies the Pkey ibnexus handed us is still valid.
-	 * This is also the point from which the pkey table for the
-	 * port must hold the exact pkey value at the exact index
-	 * across port up/downs.
-	 */
-	if (ibt_pkey2index(state->id_hca_hdl, state->id_port,
-	    state->id_pkey, &state->id_pkix) != IBT_SUCCESS) {
-		mutex_exit(&state->id_link_mutex);
-		ibt_free_portinfo(port_infop, port_infosz);
-		DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n");
-		(void) ibt_close_hca(state->id_hca_hdl);
-		return (DDI_FAILURE);
-	}
-
-	state->id_mtu = (128 << port_infop->p_mtu);
-	state->id_sgid = *port_infop->p_sgid_tbl;
-	state->id_link_state = LINK_STATE_UP;
-	mutex_exit(&state->id_link_mutex);
-
-	ibt_free_portinfo(port_infop, port_infosz);
-
-	state->id_link_speed = ibd_get_portspeed(state);
-
-	/*
-	 * Read drv conf and record what the policy is on enabling LSO
-	 */
-	if (ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip,
-	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
-		state->id_lso_policy = B_TRUE;
-	} else {
-		state->id_lso_policy = B_FALSE;
-	}
-
-	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
-	ASSERT(ibt_status == IBT_SUCCESS);
-
-	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
-		DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n");
-		goto drv_init_fail_find_bgroup;
-	}
-
-	if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
-	    &state->id_pd_hdl) != IBT_SUCCESS) {
-		DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n");
-		goto drv_init_fail_alloc_pd;
-	}
-
-	/* Initialize the parallel ARP cache and AHs */
-	if (ibd_acache_init(state) != DDI_SUCCESS) {
-		DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n");
-		goto drv_init_fail_acache;
-	}
-
-	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
-		state->id_hca_res_lkey_capab = 1;
-		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
-	}
-
-	/*
-	 * Check various tunable limits.
-	 */
-
-	/*
-	 * See if extended sgl size information is provided by the hca; if yes,
-	 * use the correct one and set the maximum sqseg value.
-	 */
-	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO)
-		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
-	else
-		state->id_max_sqseg = hca_attrs.hca_max_sgl;
-
-	/*
-	 * Set LSO capability and maximum length
-	 */
-	if (hca_attrs.hca_max_lso_size > 0) {
-		state->id_lso_capable = B_TRUE;
-		if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
-			state->id_lso_maxlen = IBD_LSO_MAXLEN;
-		else
-			state->id_lso_maxlen = hca_attrs.hca_max_lso_size;
-	} else {
-		state->id_lso_capable = B_FALSE;
-		state->id_lso_maxlen = 0;
-	}
-
-
-	/*
-	 * Check #r/s wqes against max channel size.
-	 */
-	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE)
-		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
-	else
-		state->id_num_rwqe = IBD_NUM_RWQE;
-
-	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE)
-		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
-	else
-		state->id_num_swqe = IBD_NUM_SWQE;
-
-	/*
-	 * Check the hardware checksum capability. Currently we only consider
-	 * full checksum offload.
-	 */
-	if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) {
-		state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
-	}
-
-	/*
-	 * Allocate Rx/combined CQ:
-	 * Theoretically, there is no point in having more than #rwqe
-	 * plus #swqe cqe's, except that the CQ will be signalled for
-	 * overflow when the last wqe completes, if none of the previous
-	 * cqe's have been polled. Thus, we allocate just a few less wqe's
-	 * to make sure such overflow does not occur.
-	 */
-	cq_attr.cq_sched = NULL;
-	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
-
-	if (ibd_separate_cqs == 1) {
-		/*
-		 * Allocate Receive CQ.
-		 */
-		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
-			cq_attr.cq_size = state->id_num_rwqe + 1;
-		} else {
-			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
-			state->id_num_rwqe = cq_attr.cq_size - 1;
-		}
-
-		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
-		    &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
-			goto drv_init_fail_alloc_rcq;
-		}
-
-		if (ibt_modify_cq(state->id_rcq_hdl,
-		    ibd_rxcomp_count, ibd_rxcomp_usec, 0) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_drv_init: Receive CQ interrupt "
-			    "moderation failed\n");
-		}
-
-		state->id_rxwcs_size = state->id_num_rwqe + 1;
-		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
-		    state->id_rxwcs_size, KM_SLEEP);
-
-		/*
-		 * Allocate Send CQ.
-		 */
-		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
-			cq_attr.cq_size = state->id_num_swqe + 1;
-		} else {
-			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
-			state->id_num_swqe = cq_attr.cq_size - 1;
-		}
-
-		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
-		    &state->id_scq_hdl, &real_size) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
-			goto drv_init_fail_alloc_scq;
-		}
-		if (ibt_modify_cq(state->id_scq_hdl,
-		    10, 300, 0) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_drv_init: Send CQ interrupt "
-			    "moderation failed\n");
-		}
-
-		state->id_txwcs_size = state->id_num_swqe + 1;
-		state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
-		    state->id_txwcs_size, KM_SLEEP);
-	} else {
-		/*
-		 * Allocate combined Send/Receive CQ.
-		 */
-		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
-		    state->id_num_swqe + 1)) {
-			cq_attr.cq_size = state->id_num_rwqe +
-			    state->id_num_swqe + 1;
-		} else {
-			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
-			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
-			    state->id_num_rwqe) / (state->id_num_rwqe +
-			    state->id_num_swqe);
-			state->id_num_swqe = cq_attr.cq_size - 1 -
-			    state->id_num_rwqe;
-		}
-
-		state->id_rxwcs_size = cq_attr.cq_size;
-		state->id_txwcs_size = state->id_rxwcs_size;
-
-		if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
-		    &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n");
-			goto drv_init_fail_alloc_rcq;
-		}
-		state->id_scq_hdl = state->id_rcq_hdl;
-		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
-		    state->id_rxwcs_size, KM_SLEEP);
-		state->id_txwcs = state->id_rxwcs;
-	}
-
-	/*
-	 * Print message in case we could not allocate as many wqe's
-	 * as was requested. Note that in the combined CQ case, we will
-	 * get the following message.
-	 */
-	if (state->id_num_rwqe != IBD_NUM_RWQE)
-		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
-		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
-	if (state->id_num_swqe != IBD_NUM_SWQE)
-		ibd_print_warn(state, "Setting #swqe = %d instead of default "
-		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
-
-	ud_alloc_attr.ud_flags  = IBT_WR_SIGNALED;
-	if (state->id_hca_res_lkey_capab)
-		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
-	if (state->id_lso_policy && state->id_lso_capable)
-		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
-
-	ud_alloc_attr.ud_hca_port_num	= state->id_port;
-	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
-	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
-	ud_alloc_attr.ud_sizes.cs_sq	= state->id_num_swqe;
-	ud_alloc_attr.ud_sizes.cs_rq	= state->id_num_rwqe;
-	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
-	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
-	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
-	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
-	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
-	ud_alloc_attr.ud_clone_chan	= NULL;
-
-	if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
-	    &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) {
-		DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()"
-		    "\n");
-		goto drv_init_fail_alloc_chan;
-	}
-
-	if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) !=
-	    DDI_SUCCESS) {
-		DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()");
-		goto drv_init_fail_query_chan;
-	}
-
-	state->id_qpnum = ud_chan_attr.ud_qpn;
-	/* state->id_max_sqseg = ud_chan_attr.ud_chan_sizes.cs_sq_sgl; */
-
-	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
-		state->id_max_sqseg = IBD_MAX_SQSEG;
-	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
-		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
-		    state->id_max_sqseg, IBD_MAX_SQSEG);
-	}
-
-	/* Initialize the Transmit buffer list */
-	if (ibd_init_txlist(state) != DDI_SUCCESS) {
-		DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n");
-		goto drv_init_fail_txlist_init;
-	}
-
-	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
-		/*
-		 * Setup the handler we will use for regular DLPI stuff
-		 */
-		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
-		if (ibt_enable_cq_notify(state->id_scq_hdl,
-		    IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
-			DPRINT(10, "ibd_drv_init : failed in"
-			    " ibt_enable_cq_notify()\n");
-			goto drv_init_fail_cq_notify;
-		}
-	}
-
-	/* Initialize the Receive buffer list */
-	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
-		DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n");
-		goto drv_init_fail_rxlist_init;
-	}
-
-	/* Join to IPoIB broadcast group as required by IPoIB */
-	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
-		DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n");
-		goto drv_init_fail_join_group;
-	}
-
-	/*
-	 * Create the async thread; thread_create never fails.
-	 */
-	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
-	    TS_RUN, minclsyspri);
-
-	state->id_async_thrid = kht->t_did;
-
-	/*
-	 * The local mac address is now known. Create the IPoIB
-	 * address.
-	 */
-	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
-	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
-	/*
-	 * Similarly, program in the broadcast mac address.
-	 */
-	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix,
-	    state->id_mgid.gid_guid);
-
-	ptr = (uint32_t *)&state->id_macaddr;
-	DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n",
-	    *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4));
-	ptr = (uint32_t *)&state->id_bcaddr;
-	DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n",
-	    *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4));
-	DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n",
-	    state->id_pkey, state->id_mgid.gid_prefix,
-	    state->id_mgid.gid_guid);
-	DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n",
-	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
-	DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey);
-	DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu);
-	(void) ddi_pathname(state->id_dip, pathname);
-	DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname);
-
-	return (DDI_SUCCESS);
-
-drv_init_fail_join_group:
-	ibd_fini_rxlist(state);
-
-drv_init_fail_rxlist_init:
-drv_init_fail_cq_notify:
-	ibd_fini_txlist(state);
-
-drv_init_fail_txlist_init:
-drv_init_fail_query_chan:
-	if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS)
-		DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()");
-
-drv_init_fail_alloc_chan:
-	if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) !=
-	    IBT_SUCCESS))
-		DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()");
-
-	if (ibd_separate_cqs == 1)
-		kmem_free(state->id_txwcs, sizeof (ibt_wc_t) *
-		    state->id_txwcs_size);
-
-drv_init_fail_alloc_scq:
-	if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS)
-		DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()");
-	kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size);
-
-drv_init_fail_alloc_rcq:
-	ibd_acache_fini(state);
-drv_init_fail_acache:
-	if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS)
-		DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()");
-
-drv_init_fail_alloc_pd:
-	ibt_free_mcg_info(state->id_mcinfo, 1);
-drv_init_fail_find_bgroup:
-	if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS)
-		DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()");
-
-	return (DDI_FAILURE);
-}
-
-
 static int
 ibd_alloc_tx_copybufs(ibd_state_t *state)
 {
@@ -3722,8 +3428,8 @@
 	while (state->id_tx_list.dl_head != NULL) {
 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
 		state->id_tx_list.dl_head = node->swqe_next;
+		ASSERT(state->id_tx_list.dl_cnt > 0);
 		state->id_tx_list.dl_cnt--;
-		ASSERT(state->id_tx_list.dl_cnt >= 0);
 		ibd_free_swqe(state, node);
 	}
 	mutex_exit(&state->id_tx_list.dl_mutex);
@@ -3782,7 +3488,7 @@
  * recycled, or this is a new one.
  */
 static int
-ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
+ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
 {
 	ibt_status_t ibt_status;
 
@@ -3827,7 +3533,7 @@
 			if (ibt_status != IBT_SUCCESS) {
 				(void) atomic_add_32_nv(
 				    &state->id_rx_list.dl_cnt, -1);
-				ibd_print_warn(state, "ibd_post_rwqe: "
+				ibd_print_warn(state, "ibd_post_recv: "
 				    "posting failed, ret=%d", ibt_status);
 				return (DDI_FAILURE);
 			}
@@ -3861,7 +3567,7 @@
 			return (DDI_FAILURE);
 		}
 
-		if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) {
+		if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) {
 			ibd_free_rwqe(state, rwqe);
 			ibd_fini_rxlist(state);
 			return (DDI_FAILURE);
@@ -3884,8 +3590,8 @@
 	while (state->id_rx_list.dl_head != NULL) {
 		node = WQE_TO_RWQE(state->id_rx_list.dl_head);
 		state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
+		ASSERT(state->id_rx_list.dl_cnt > 0);
 		state->id_rx_list.dl_cnt--;
-		ASSERT(state->id_rx_list.dl_cnt >= 0);
 
 		ibd_free_rwqe(state, node);
 	}
@@ -4009,150 +3715,6 @@
 }
 
 /*
- * Pre ibt_detach() deconstruction.
- */
-static void
-ibd_drv_fini(ibd_state_t *state)
-{
-	ib_gid_t mgid;
-	ibd_mce_t *mce;
-	ibt_status_t status;
-	uint8_t jstate;
-
-	/*
-	 * Desubscribe from trap notices; we will be tearing down
-	 * the mcg lists soon. Make sure the trap handler does nothing
-	 * even if it is invoked (ie till we invoke ibt_detach()).
-	 */
-	ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
-	mutex_enter(&state->id_trap_lock);
-	state->id_trap_stop = B_TRUE;
-	while (state->id_trap_inprog > 0)
-		cv_wait(&state->id_trap_cv, &state->id_trap_lock);
-	mutex_exit(&state->id_trap_lock);
-
-	/*
-	 * Flushing the channel ensures that all pending WQE's
-	 * are marked with flush_error and handed to the CQ. It
-	 * does not guarantee the invocation of the CQ handler.
-	 * This call is guaranteed to return successfully for UD QPNs.
-	 */
-	status = ibt_flush_channel(state->id_chnl_hdl);
-	ASSERT(status == IBT_SUCCESS);
-
-	/*
-	 * We possibly need a loop here to wait for all the Tx
-	 * callbacks to happen. The Tx handlers will retrieve
-	 * held resources like AH ac_ref count, registered memory
-	 * and possibly IBD_ASYNC_REAP requests. Rx interrupts were already
-	 * turned off (in ibd_detach()); turn off Tx interrupts and
-	 * poll. By the time the polling returns an empty indicator,
-	 * we are sure we have seen all pending Tx callbacks. Note
-	 * that after the ibt_set_cq_handler() returns, the old handler
-	 * is guaranteed not to be invoked anymore.
-	 */
-	if (ibd_separate_cqs == 1)
-		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
-	ibd_poll_compq(state, state->id_scq_hdl);
-
-	/*
-	 * No more async requests will be posted since the device has been
-	 * unregistered; completion handlers have been turned off, so Tx
-	 * handler will not cause any more IBD_ASYNC_REAP requests. Queue a
-	 * request for the async thread to exit, which will be serviced
-	 * after any pending ones. This can take a while, specially if the
-	 * SM is unreachable, since IBMF will slowly timeout each SM request
-	 * issued by the async thread. Reap the thread before continuing on,
-	 * we do not want it to be lingering in modunloaded code.
-	 */
-	ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
-	thread_join(state->id_async_thrid);
-
-	/*
-	 * We can not be in promiscuous mode anymore, upper layers
-	 * would have made a request to disable it (if ever set previously)
-	 * before the detach is allowed to progress to this point; and the
-	 * aysnc thread would have processed that request by now. Thus the
-	 * nonmember list is guaranteed empty at this point.
-	 */
-	ASSERT(state->id_prom_op != IBD_OP_COMPLETED);
-
-	/*
-	 * Drop all residual full/non membership. This includes full
-	 * membership to the broadcast group, and any nonmembership
-	 * acquired during transmits. We do this after the Tx completion
-	 * handlers are done, since those might result in some late
-	 * leaves; this also eliminates a potential race with that
-	 * path wrt the mc full list insert/delete. Trap handling
-	 * has also been suppressed at this point. Thus, no locks
-	 * are required while traversing the mc full list.
-	 */
-	DPRINT(2, "ibd_drv_fini : clear full cache entries");
-	mce = list_head(&state->id_mc_full);
-	while (mce != NULL) {
-		mgid = mce->mc_info.mc_adds_vect.av_dgid;
-		jstate = mce->mc_jstate;
-		mce = list_next(&state->id_mc_full, mce);
-		ibd_leave_group(state, mgid, jstate);
-	}
-
-	ibt_free_mcg_info(state->id_mcinfo, 1);
-
-	/*
-	 * Kill the channel now; guaranteed to return successfully
-	 * for UD QPNs.
-	 */
-	status = ibt_free_channel(state->id_chnl_hdl);
-	ASSERT(status == IBT_SUCCESS);
-
-	/*
-	 * Kill the CQ; all completion handlers are guaranteed to
-	 * have terminated by the time this returns. Since we killed
-	 * the QPN above, we can not receive the IBT_CQ_BUSY error.
-	 */
-	status = ibt_free_cq(state->id_rcq_hdl);
-	ASSERT(status == IBT_SUCCESS);
-	kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size);
-
-	if (ibd_separate_cqs == 1) {
-		status = ibt_free_cq(state->id_scq_hdl);
-		ASSERT(status == IBT_SUCCESS);
-		kmem_free(state->id_txwcs, sizeof (ibt_wc_t) *
-		    state->id_txwcs_size);
-	}
-
-	/*
-	 * Since these following will act on the Rx/Tx list, which
-	 * is also looked at by the Rx/Tx handlers, keep them around
-	 * till all handlers are guaranteed to have completed.
-	 */
-	ibd_fini_rxlist(state);
-	ibd_fini_txlist(state);
-
-	/*
-	 * Clean up the active AH hash list.
-	 */
-	mod_hash_destroy_hash(state->id_ah_active_hash);
-
-	/*
-	 * Free parallel ARP cache and AHs; we are sure all of these
-	 * resources have been released by the Tx completion handler.
-	 */
-	ibd_acache_fini(state);
-
-	/*
-	 * We freed the QPN, all the MRs and AHs. This step should not
-	 * fail; print a warning message if it does fail, due to a bug
-	 * in the driver.
-	 */
-	if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS)
-		ibd_print_warn(state, "failed to free protection domain");
-
-	if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS)
-		ibd_print_warn(state, "failed to close HCA device");
-}
-
-/*
  * IBA Rx/Tx completion queue handler. Guaranteed to be single
  * threaded and nonreentrant for this CQ. When using combined CQ,
  * this handles Tx and Rx completions. With separate CQs, this handles
@@ -4249,7 +3811,7 @@
 			 * [de]initialized; back off then, without doing
 			 * anything more, since we are not sure if the
 			 * async thread is around, or whether we might
-			 * be racing with the detach code in ibd_drv_fini()
+			 * be racing with the detach code in ibd_m_stop()
 			 * that scans the mcg list.
 			 */
 			if (!ibd_async_safe(state))
@@ -4361,6 +3923,422 @@
 	return (B_TRUE);
 }
 
+static int
+ibd_get_port_details(ibd_state_t *state)
+{
+	ibt_hca_portinfo_t *port_infop;
+	ibt_status_t ret;
+	uint_t psize, port_infosz;
+
+	mutex_enter(&state->id_link_mutex);
+
+	/*
+	 * Query for port information
+	 */
+	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
+	    &port_infop, &psize, &port_infosz);
+	if ((ret != IBT_SUCCESS) || (psize != 1)) {
+		mutex_exit(&state->id_link_mutex);
+		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
+		    "failed, ret=%d", ret);
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * If the link already went down by the time we get here,
+	 * give up
+	 */
+	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
+		mutex_exit(&state->id_link_mutex);
+		ibt_free_portinfo(port_infop, port_infosz);
+		DPRINT(10, "ibd_get_port_details: port is not active");
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * If the link is active, verify the pkey
+	 */
+	if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
+	    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
+		mutex_exit(&state->id_link_mutex);
+		ibt_free_portinfo(port_infop, port_infosz);
+		DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
+		    "failed, ret=%d", ret);
+		return (DDI_FAILURE);
+	}
+
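+	/*
+	 * p_mtu is the IB-encoded MTU code (1 => 256 bytes through
+	 * 5 => 4096 bytes), so 128 << p_mtu yields the MTU in bytes
+	 */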
+	state->id_mtu = (128 << port_infop->p_mtu);
+	state->id_sgid = *port_infop->p_sgid_tbl;
+	state->id_link_state = LINK_STATE_UP;
+
+	mutex_exit(&state->id_link_mutex);
+	ibt_free_portinfo(port_infop, port_infosz);
+
+	/*
+	 * Now that the port is active, record the port speed
+	 */
+	state->id_link_speed = ibd_get_portspeed(state);
+
+	return (DDI_SUCCESS);
+}
+
+static int
+ibd_alloc_cqs(ibd_state_t *state)
+{
+	ibt_hca_attr_t hca_attrs;
+	ibt_cq_attr_t cq_attr;
+	ibt_status_t ret;
+	uint32_t real_size;
+
+	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
+	ASSERT(ret == IBT_SUCCESS);
+
+	/*
+	 * Allocate Rx/combined CQ:
+	 * Theoretically, there is no point in having more than #rwqe
+	 * plus #swqe cqe's, except that the CQ will be signalled for
+	 * overflow when the last wqe completes, if none of the previous
+	 * cqe's have been polled. Thus, we allocate just a few less wqe's
+	 * to make sure such overflow does not occur.
+	 */
+	cq_attr.cq_sched = NULL;
+	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
+
+	if (ibd_separate_cqs == 1) {
+		/*
+		 * Allocate Receive CQ.
+		 */
+		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
+			cq_attr.cq_size = state->id_num_rwqe + 1;
+		} else {
+			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
+			state->id_num_rwqe = cq_attr.cq_size - 1;
+		}
+
+		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
+		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
+			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
+			    "failed, ret=%d\n", ret);
+			return (DDI_FAILURE);
+		}
+
+		if ((ret = ibt_modify_cq(state->id_rcq_hdl,
+		    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
+			DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
+			    "moderation failed, ret=%d\n", ret);
+		}
+
+		state->id_rxwcs_size = state->id_num_rwqe + 1;
+		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
+		    state->id_rxwcs_size, KM_SLEEP);
+
+		/*
+		 * Allocate Send CQ.
+		 */
+		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
+			cq_attr.cq_size = state->id_num_swqe + 1;
+		} else {
+			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
+			state->id_num_swqe = cq_attr.cq_size - 1;
+		}
+
+		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
+		    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
+			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
+			    "failed, ret=%d\n", ret);
+			kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
+			    state->id_rxwcs_size);
+			(void) ibt_free_cq(state->id_rcq_hdl);
+			return (DDI_FAILURE);
+		}
+		if ((ret = ibt_modify_cq(state->id_scq_hdl,
+		    IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) {
+			DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
+			    "moderation failed, ret=%d\n", ret);
+		}
+
+		state->id_txwcs_size = state->id_num_swqe + 1;
+		state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
+		    state->id_txwcs_size, KM_SLEEP);
+	} else {
+		/*
+		 * Allocate combined Send/Receive CQ.
+		 */
+		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
+		    state->id_num_swqe + 1)) {
+			cq_attr.cq_size = state->id_num_rwqe +
+			    state->id_num_swqe + 1;
+		} else {
+			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
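+			/*
+			 * Scale #rwqe and #swqe down proportionally
+			 * so that together they still fit within the
+			 * cq_size - 1 entries available
+			 */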
+			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
+			    state->id_num_rwqe) / (state->id_num_rwqe +
+			    state->id_num_swqe);
+			state->id_num_swqe = cq_attr.cq_size - 1 -
+			    state->id_num_rwqe;
+		}
+
+		state->id_rxwcs_size = cq_attr.cq_size;
+		state->id_txwcs_size = state->id_rxwcs_size;
+
+		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
+		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
+			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) "
+			    "failed, ret=%d\n", ret);
+			return (DDI_FAILURE);
+		}
+		state->id_scq_hdl = state->id_rcq_hdl;
+		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
+		    state->id_rxwcs_size, KM_SLEEP);
+		state->id_txwcs = state->id_rxwcs;
+	}
+
+	/*
+	 * Print message in case we could not allocate as many wqe's
+	 * as was requested.
+	 */
+	if (state->id_num_rwqe != IBD_NUM_RWQE) {
+		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
+		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
+	}
+	if (state->id_num_swqe != IBD_NUM_SWQE) {
+		ibd_print_warn(state, "Setting #swqe = %d instead of default "
+		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
+	}
+
+	return (DDI_SUCCESS);
+}
+
+static int
+ibd_setup_ud_channel(ibd_state_t *state)
+{
+	ibt_ud_chan_alloc_args_t ud_alloc_attr;
+	ibt_ud_chan_query_attr_t ud_chan_attr;
+	ibt_status_t ret;
+
+	ud_alloc_attr.ud_flags  = IBT_WR_SIGNALED;
+	if (state->id_hca_res_lkey_capab)
+		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
+	if (state->id_lso_policy && state->id_lso_capable)
+		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
+
+	ud_alloc_attr.ud_hca_port_num	= state->id_port;
+	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
+	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
+	ud_alloc_attr.ud_sizes.cs_sq    = state->id_num_swqe;
+	ud_alloc_attr.ud_sizes.cs_rq    = state->id_num_rwqe;
+	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
+	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
+	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
+	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
+	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
+	ud_alloc_attr.ud_clone_chan	= NULL;
+
+	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
+	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
+		    "failed, ret=%d\n", ret);
+		return (DDI_FAILURE);
+	}
+
+	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
+	    &ud_chan_attr)) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
+		    "failed, ret=%d\n", ret);
+		(void) ibt_free_channel(state->id_chnl_hdl);
+		return (DDI_FAILURE);
+	}
+
+	state->id_qpnum = ud_chan_attr.ud_qpn;
+
+	return (DDI_SUCCESS);
+}
+
+static int
+ibd_undo_m_start(ibd_state_t *state)
+{
+	uint32_t progress = state->id_mac_state;
+	uint_t attempts;
+	ibt_status_t ret;
+	ib_gid_t mgid;
+	ibd_mce_t *mce;
+	uint8_t jstate;
+
+	/*
+	 * Before we try to stop/undo whatever we did in ibd_m_start(),
+	 * we need to mark the link state as unknown to prevent the
+	 * network layer from using this instance for any new transfers.
+	 */
+	if (progress & IBD_DRV_PORT_DETAILS_OBTAINED) {
+		state->id_link_state = LINK_STATE_UNKNOWN;
+		mac_link_update(state->id_mh, state->id_link_state);
+
+		state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
+	}
+
+	if (progress & IBD_DRV_STARTED) {
+		state->id_mac_state &= (~IBD_DRV_STARTED);
+	}
+
+	/*
+	 * First, stop receive interrupts; this stops the driver from
+	 * handing up buffers to higher layers.  Wait for receive buffers
+	 * to be returned and give up after 5 seconds.
+	 */
+	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
+		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
+		attempts = 50;
+		while (state->id_rx_list.dl_bufs_outstanding > 0) {
+			delay(drv_usectohz(100000));
+			if (--attempts == 0) {
+				/*
+				 * There are pending bufs with the network
+				 * layer and we have no choice but to wait
+				 * for them to be returned. Reap all the
+				 * Tx/Rx completions that were posted since
+				 * we turned off the notification and
+				 * return failure.
+				 */
+				DPRINT(2, "ibd_undo_m_start: "
+				    "reclaiming failed");
+				ibd_poll_compq(state, state->id_rcq_hdl);
+				ibt_set_cq_handler(state->id_rcq_hdl,
+				    ibd_rcq_handler, state);
+				return (DDI_FAILURE);
+			}
+		}
+		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
+	}
+
+	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
+		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
+
+		mutex_enter(&state->id_trap_lock);
+		state->id_trap_stop = B_TRUE;
+		while (state->id_trap_inprog > 0)
+			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
+		mutex_exit(&state->id_trap_lock);
+
+		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
+	}
+
+	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
+		/*
+		 * Flushing the channel ensures that all pending WQE's
+		 * are marked with flush_error and handed to the CQ. It
+		 * does not guarantee the invocation of the CQ handler.
+		 * This call is guaranteed to return successfully for
+		 * UD QPNs.
+		 */
+		ret = ibt_flush_channel(state->id_chnl_hdl);
+		ASSERT(ret == IBT_SUCCESS);
+
+		/*
+		 * Turn off Tx interrupts and poll. By the time the polling
+		 * returns an empty indicator, we are sure we have seen all
+		 * pending Tx callbacks. Note that after the call to
+		 * ibt_set_cq_handler() returns, the old handler is
+		 * guaranteed not to be invoked anymore.
+		 */
+		if (ibd_separate_cqs == 1)
+			ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
+		ibd_poll_compq(state, state->id_scq_hdl);
+
+		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
+	}
+
+	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
+		/*
+		 * No new async requests will be posted since the device
+		 * link state has been marked as unknown; completion handlers
+		 * have been turned off, so Tx handler will not cause any
+		 * more IBD_ASYNC_REAP requests.
+		 *
+		 * Queue a request for the async thread to exit, which will
+		 * be serviced after any pending ones. This can take a while,
+		 * especially if the SM is unreachable, since IBMF will slowly
+		 * timeout each SM request issued by the async thread.  Reap
+		 * the thread before continuing on, we do not want it to be
+		 * lingering in modunloaded code (or we could move the reap
+		 * to ibd_detach(), provided we keep track of the current
+		 * id_async_thrid somewhere safe).
+		 */
+		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
+		thread_join(state->id_async_thrid);
+
+		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
+	}
+
+	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
+		/*
+		 * Drop all residual full/non membership. This includes full
+		 * membership to the broadcast group, and any nonmembership
+		 * acquired during transmits. We do this after the Tx completion
+		 * handlers are done, since those might result in some late
+		 * leaves; this also eliminates a potential race with that
+		 * path wrt the mc full list insert/delete. Trap handling
+		 * has also been suppressed at this point. Thus, no locks
+		 * are required while traversing the mc full list.
+		 */
+		DPRINT(2, "ibd_undo_m_start: clear full cache entries");
+		mce = list_head(&state->id_mc_full);
+		while (mce != NULL) {
+			mgid = mce->mc_info.mc_adds_vect.av_dgid;
+			jstate = mce->mc_jstate;
+			mce = list_next(&state->id_mc_full, mce);
+			ibd_leave_group(state, mgid, jstate);
+		}
+		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
+	}
+
+	if (progress & IBD_DRV_RXLIST_ALLOCD) {
+		ibd_fini_rxlist(state);
+		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
+	}
+
+	if (progress & IBD_DRV_TXLIST_ALLOCD) {
+		ibd_fini_txlist(state);
+		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
+	}
+
+	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
+		(void) ibt_free_channel(state->id_chnl_hdl);
+		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
+	}
+
+	if (progress & IBD_DRV_CQS_ALLOCD) {
+		if (ibd_separate_cqs == 1) {
+			kmem_free(state->id_txwcs,
+			    sizeof (ibt_wc_t) * state->id_txwcs_size);
+			(void) ibt_free_cq(state->id_scq_hdl);
+		}
+
+		kmem_free(state->id_rxwcs,
+		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
+		(void) ibt_free_cq(state->id_rcq_hdl);
+
+		state->id_txwcs = NULL;
+		state->id_rxwcs = NULL;
+		state->id_scq_hdl = NULL;
+		state->id_rcq_hdl = NULL;
+
+		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
+	}
+
+	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
+		mod_hash_destroy_hash(state->id_ah_active_hash);
+		ibd_acache_fini(state);
+
+		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
+	}
+
+	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
+		ibt_free_mcg_info(state->id_mcinfo, 1);
+		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
+	}
+
+	return (DDI_SUCCESS);
+}
+
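The SCQ quiesce in the teardown above -- flush the channel, unhook the CQ
handler, then poll -- is what guarantees that no Tx completion is lost or
handled twice. A minimal sketch of the idiom, using the IBTF calls seen in
the hunk and a hypothetical drain routine drv_poll_compq() standing in for
the driver's own CQ poll loop (illustrative only, not ibd's actual code):

	#include <sys/ib/ibtl/ibti.h>

	/* hypothetical drain routine for completions already on the CQ */
	static void drv_poll_compq(ibt_cq_hdl_t cq);

	static void
	drv_quiesce_tx(ibt_channel_hdl_t chan, ibt_cq_hdl_t scq)
	{
		/*
		 * Complete all outstanding WQEs with a flush error;
		 * guaranteed to succeed for UD channels.
		 */
		(void) ibt_flush_channel(chan);

		/*
		 * Unhook the completion handler; once ibt_set_cq_handler()
		 * returns, the old handler will not be invoked again.
		 */
		ibt_set_cq_handler(scq, 0, 0);

		/* pick up any completions the handler would have seen */
		drv_poll_compq(scq);
	}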
 /*
  * GLDv3 entry point to start hardware.
  */
@@ -4368,7 +4346,185 @@
 static int
 ibd_m_start(void *arg)
 {
-	return (0);
+	ibd_state_t *state = arg;
+	kthread_t *kht;
+	int err;
+
+	if (state->id_mac_state & IBD_DRV_STARTED)
+		return (DDI_SUCCESS);
+
+	/*
+	 * Get port details; if we fail here, it is most likely because
+	 * the port state is inactive or the pkey can't be verified
+	 */
+	if (ibd_get_port_details(state) != DDI_SUCCESS) {
+		DPRINT(10, "ibd_m_start: ibd_get_port_details() failed");
+		return (EAGAIN);
+	}
+	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
+
+	/*
+	 * Find the IPoIB broadcast group
+	 */
+	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_m_start: ibd_find_bgroup() failed");
+		err = ENOENT;
+		goto m_start_fail;
+	}
+	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
+
+	/*
+	 * Initialize per-interface caches and lists; if we fail here,
+	 * it is most likely due to a lack of resources
+	 */
+	if (ibd_acache_init(state) != DDI_SUCCESS) {
+		DPRINT(10, "ibd_m_start: ibd_acache_init() failed");
+		err = ENOMEM;
+		goto m_start_fail;
+	}
+	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
+
+	/*
+	 * Allocate send and receive completion queues
+	 */
+	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
+		DPRINT(10, "ibd_m_start: ibd_alloc_cqs() failed");
+		err = ENOMEM;
+		goto m_start_fail;
+	}
+	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
+
+	/*
+	 * Set up a UD channel
+	 */
+	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
+		DPRINT(10, "ibd_m_start: ibd_setup_ud_channel() failed");
+		err = ENOMEM;
+		goto m_start_fail;
+	}
+	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
+
+	/*
+	 * Allocate and initialize the tx buffer list
+	 */
+	if (ibd_init_txlist(state) != DDI_SUCCESS) {
+		DPRINT(10, "ibd_m_start: ibd_init_txlist() failed");
+		err = ENOMEM;
+		goto m_start_fail;
+	}
+	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
+
+	/*
+	 * If we have separate cqs, install the send cq handler here
+	 */
+	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
+		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
+		if (ibt_enable_cq_notify(state->id_scq_hdl,
+		    IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
+			DPRINT(10,
+			    "ibd_m_start: ibt_enable_cq_notify(scq) failed");
+			err = EINVAL;
+			goto m_start_fail;
+		}
+		state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
+	}
+
+	/*
+	 * Allocate and initialize the rx buffer list
+	 */
+	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
+		DPRINT(10, "ibd_m_start: ibd_init_rxlist() failed");
+		err = ENOMEM;
+		goto m_start_fail;
+	}
+	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
+
+	/*
+	 * Join IPoIB broadcast group
+	 */
+	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
+		DPRINT(10, "ibd_m_start: ibd_join_group() failed");
+		err = EINVAL;
+		goto m_start_fail;
+	}
+	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
+
+	/*
+	 * Create the async thread; thread_create never fails.
+	 */
+	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
+	    TS_RUN, minclsyspri);
+	state->id_async_thrid = kht->t_did;
+	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
+
+	/*
+	 * When we did mac_register() in ibd_attach(), we didn't register
+	 * the real macaddr and we didn't have the true port mtu. Now that
+	 * we're almost ready, set the local mac address and the broadcast
+	 * address, and notify GLDv3 of the real values of these
+	 * parameters.
+	 */
+	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
+	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
+	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
+	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
+
+	mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE);
+	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
+
+	/*
+	 * Set up the receive cq handler
+	 */
+	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
+	if (ibt_enable_cq_notify(state->id_rcq_hdl,
+	    IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
+		DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(rcq) failed");
+		err = EINVAL;
+		goto m_start_fail;
+	}
+	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
+
+	/*
+	 * Set up the subnet notices handler after we've initialized the acache/
+	 * mcache and started the async thread, both of which are required for
+	 * the trap handler to function properly.
+	 *
+	 * Now that the async thread has been started (and we've already done
+	 * a mac_register() during attach so mac_tx_update() can be called
+	 * if necessary without any problem), we can enable the trap handler
+	 * to queue requests to the async thread.
+	 */
+	ibt_register_subnet_notices(state->id_ibt_hdl,
+	    ibd_snet_notices_handler, state);
+	mutex_enter(&state->id_trap_lock);
+	state->id_trap_stop = B_FALSE;
+	mutex_exit(&state->id_trap_lock);
+	state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
+
+	/*
+	 * Indicate link status to GLDv3 and higher layers. By default,
+	 * we assume we are in the up state (which must have been true at
+	 * least at the time the broadcast mcgs were probed); if there
+	 * were any up/down transitions before we got here, the async
+	 * handler will have updated the last known state, which we then
+	 * report to GLDv3. The async handler will not send any
+	 * notifications to GLDv3 until we reach this point in the
+	 * initialization sequence.
+	 */
+	state->id_mac_state |= IBD_DRV_STARTED;
+	mac_link_update(state->id_mh, state->id_link_state);
+
+	return (DDI_SUCCESS);
+
+m_start_fail:
+	/*
+	 * If we ran into a problem during ibd_m_start() and then hit
+	 * another problem while undoing our partial work, there isn't
+	 * much we can do about it.  Ignore any errors we might get from
+	 * ibd_undo_m_start() and just return the original error we got.
+	 */
+	(void) ibd_undo_m_start(state);
+	return (err);
 }
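ibd_m_start() above follows a record-as-you-go pattern: each successful
step sets a progress bit in id_mac_state, and the single failure path hands
whatever has accumulated to ibd_undo_m_start(). A condensed sketch of the
idiom under hypothetical names (drv_state_t, drv_setup_a() and friends are
illustrative, not the driver's identifiers):

	#include <sys/types.h>

	#define	DRV_RES_A	0x1	/* first resource acquired */
	#define	DRV_RES_B	0x2	/* second resource acquired */
	#define	DRV_STARTED	0x4	/* fully started */

	typedef struct drv_state {
		uint32_t st_progress;	/* analogous to id_mac_state */
	} drv_state_t;

	static int drv_setup_a(drv_state_t *);
	static int drv_setup_b(drv_state_t *);
	static int drv_undo_start(drv_state_t *);

	static int
	drv_start(drv_state_t *sp)
	{
		int err;

		if ((err = drv_setup_a(sp)) != 0)
			goto start_fail;
		sp->st_progress |= DRV_RES_A;

		if ((err = drv_setup_b(sp)) != 0)
			goto start_fail;
		sp->st_progress |= DRV_RES_B;

		sp->st_progress |= DRV_STARTED;
		return (0);

	start_fail:
		/* undo whatever was recorded; keep the original error */
		(void) drv_undo_start(sp);
		return (err);
	}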
 
 /*
@@ -4378,6 +4534,15 @@
 static void
 ibd_m_stop(void *arg)
 {
+	ibd_state_t *state = arg;
+
+	/*
+	 * Since ibd_m_stop() doesn't return a value, we cannot report
+	 * failure even if we run into a problem with ibd_undo_m_start().
+	 * The best we can do is to leave the device in as sane a state
+	 * as possible, so that perhaps a future unplumb will succeed.
+	 */
+	(void) ibd_undo_m_start(state);
 }
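ibd_undo_m_start() is the mirror image of the start sequence: it tests each
progress bit, releases the corresponding resource in reverse order of
acquisition, and clears the bit. That is what makes it safe to call from
both the ibd_m_start() failure path and ibd_m_stop(): a second call finds
no bits set and falls straight through. Continuing the hypothetical sketch
from above:

	static void drv_teardown_a(drv_state_t *);
	static void drv_teardown_b(drv_state_t *);

	static int
	drv_undo_start(drv_state_t *sp)
	{
		uint32_t progress = sp->st_progress;

		/* tear down in reverse order of acquisition */
		if (progress & DRV_RES_B) {
			drv_teardown_b(sp);
			sp->st_progress &= (~DRV_RES_B);
		}
		if (progress & DRV_RES_A) {
			drv_teardown_a(sp);
			sp->st_progress &= (~DRV_RES_A);
		}

		sp->st_progress &= (~DRV_STARTED);
		return (0);
	}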
 
 /*
@@ -4387,9 +4552,15 @@
 static int
 ibd_m_unicst(void *arg, const uint8_t *macaddr)
 {
-	ibd_state_t *state;
-
-	state = (ibd_state_t *)arg;
+	ibd_state_t *state = arg;
+
+	/*
+	 * Don't bother even comparing the macaddr if we haven't
+	 * completed ibd_m_start().
+	 */
+	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
+		return (0);
+
 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
 		return (0);
 	else
@@ -4407,7 +4578,6 @@
 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
 
 	if (op == IBD_ASYNC_JOIN) {
-
 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
 			ibd_print_warn(state, "Join multicast group failed :"
 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
@@ -4435,6 +4605,14 @@
 	ibd_req_t *req;
 
 	/*
+	 * If we haven't completed ibd_m_start(), the async thread won't
+	 * have been started and id_bcaddr won't be set, so there's
+	 * no point in continuing.
+	 */
+	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
+		return (0);
+
+	/*
 	 * The incoming multicast address might not be aligned properly
 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
 	 * it to look like one though, to get the offsets of the mc gid,
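On SPARC, loading a 4-byte field through a pointer that isn't 4-byte
aligned can fault, which is why the comment above is careful about treating
the incoming address as an ipoib_mac_t. One common defensive idiom -- shown
here with a purely hypothetical type, not what ibd actually does with the
gid offsets -- is to bcopy() the unaligned bytes into a naturally aligned
local before touching its fields:

	#include <sys/types.h>
	#include <sys/systm.h>		/* bcopy() */

	typedef struct hypo_mac {
		uint32_t hm_qpn;	/* illustrative fields only */
		uint32_t hm_gid_hi;
	} hypo_mac_t;

	static uint32_t
	read_qpn(const uint8_t *unaligned)
	{
		hypo_mac_t local;	/* naturally aligned local copy */

		bcopy(unaligned, &local, sizeof (local));
		return (local.hm_qpn);
	}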
@@ -4461,8 +4639,8 @@
 
 	/*
 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
-	 * nothing (ie we stay JOINed to the broadcast group done in
-	 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically
+	 * nothing (i.e. we stay JOINed to the broadcast group done in
+	 * nothing (i.e. we stay JOINed to the broadcast group we joined
+	 * in ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
 	 * depends on this.
@@ -4524,7 +4702,8 @@
 	ibt_mcg_info_t *mcg_info;
 	ib_gid_t mgid;
 	uint_t numg;
-	int i, ret = IBD_OP_COMPLETED;
+	int i;
+	char ret = IBD_OP_COMPLETED;
 
 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
 
@@ -4580,6 +4759,13 @@
 	ibd_state_t *state = (ibd_state_t *)arg;
 	ibd_req_t *req;
 
+	/*
+	 * The async thread won't have been started if we haven't
+	 * completed ibd_m_start()
+	 */
+	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
+		return (0);
+
 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 	if (req == NULL)
 		return (ENOMEM);
@@ -5188,6 +5374,13 @@
 	boolean_t dofree = B_FALSE;
 	boolean_t rc;
 
+	/*
+	 * If we aren't done with the device initialization and start,
+	 * we shouldn't be here.
+	 */
+	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
+		return (B_FALSE);
+
 	node = NULL;
 	if (ibd_acquire_swqe(state, &node) != 0) {
 		/*
@@ -5935,7 +6128,7 @@
 		return;
 	}
 
-	if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) {
+	if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) {
 		ibd_delete_rwqe(state, rwqe);
 		ibd_free_rwqe(state, rwqe);
 		return;
--- a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h	Mon Jun 29 02:30:05 2009 -0700
+++ b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h	Mon Jun 29 02:56:22 2009 -0700
@@ -361,6 +361,8 @@
 	uint_t			id_lso_maxlen;
 	int			id_hca_res_lkey_capab;
 	ibt_lkey_t		id_res_lkey;
+
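+	/* progress of the start-up sequence; see IBD_DRV_* bits in ibd.c */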
+	uint32_t		id_mac_state;
 } ibd_state_t;
 
 #endif /* _KERNEL && !_BOOT */
--- a/usr/src/uts/intel/ibd/Makefile	Mon Jun 29 02:30:05 2009 -0700
+++ b/usr/src/uts/intel/ibd/Makefile	Mon Jun 29 02:56:22 2009 -0700
@@ -19,11 +19,9 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-#ident	"%Z%%M%	%I%	%E% SMI"
-#
 #
 
 #
@@ -59,10 +57,8 @@
 # to investigate and remove these for maximum lint coverage.
 # Please do not carry these forward to new Makefiles.
 #
-LINTTAGS	+= -erroff=E_SUSPICIOUS_COMPARISON
 LINTTAGS	+= -erroff=E_BAD_PTR_CAST_ALIGN
 LINTTAGS	+= -erroff=E_PTRDIFF_OVERFLOW
-LINTTAGS	+= -erroff=E_ASSIGN_NARROW_CONV
 
 #
 #	Default build targets.
--- a/usr/src/uts/sparc/ibd/Makefile	Mon Jun 29 02:30:05 2009 -0700
+++ b/usr/src/uts/sparc/ibd/Makefile	Mon Jun 29 02:56:22 2009 -0700
@@ -19,11 +19,9 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-#ident	"%Z%%M%	%I%	%E% SMI"
-#
 #
 
 #
@@ -73,8 +71,6 @@
 #
 LINTTAGS	+= -erroff=E_BAD_PTR_CAST_ALIGN
 LINTTAGS	+= -erroff=E_PTRDIFF_OVERFLOW
-LINTTAGS	+= -erroff=E_ASSIGN_NARROW_CONV
-LINTTAGS	+= -erroff=E_SUSPICIOUS_COMPARISON
 
 #
 #	Default build targets.