Mercurial > illumos > illumos-gate
changeset 13113:da7b13ec3a28
PSARC/2010/259 Ethernet over IB
6891335 Driver for supporting "Ethernet over InfiniBand" protocol
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/pkg/manifests/driver-network-eoib.mf Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,49 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# The default for payload-bearing actions in this package is to appear in the +# global zone only. See the include file for greater detail, as well as +# information about overriding the defaults. 
+# +<include global_zone_only_component> +set name=pkg.fmri value=pkg:/driver/network/eoib@$(PKGVERS) +set name=pkg.description value="Solaris Drivers for Ethernet over InfiniBand" +set name=pkg.summary value="Solaris Ethernet over InfiniBand" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Hardware +set name=variant.arch value=$(ARCH) +dir path=kernel group=sys +dir path=kernel/drv group=sys +dir path=kernel/drv/$(ARCH64) group=sys +driver name=eibnx perms="* 0666 root sys" +driver name=eoib clone_perms="eoib 0666 root sys" perms="* 0666 root sys" +file path=kernel/drv/$(ARCH64)/eibnx group=sys +file path=kernel/drv/$(ARCH64)/eoib group=sys +$(i386_ONLY)file path=kernel/drv/eibnx group=sys +file path=kernel/drv/eibnx.conf group=sys +$(i386_ONLY)file path=kernel/drv/eoib group=sys +license cr_Sun license=cr_Sun +license lic_CDDL license=lic_CDDL
--- a/usr/src/uts/common/Makefile.files Fri Aug 13 14:44:26 2010 +0800 +++ b/usr/src/uts/common/Makefile.files Fri Aug 13 07:02:57 2010 -0400 @@ -1734,6 +1734,13 @@ IBD_OBJS += ibd.o ibd_cm.o +EIBNX_OBJS += enx_main.o enx_hdlrs.o enx_ibt.o enx_log.o enx_fip.o \ + enx_misc.o enx_q.o enx_ctl.o + +EOIB_OBJS += eib_adm.o eib_chan.o eib_cmn.o eib_ctl.o eib_data.o \ + eib_fip.o eib_ibt.o eib_log.o eib_mac.o eib_main.o \ + eib_rsrc.o eib_svc.o eib_vnic.o + DLPISTUB_OBJS += dlpistub.o SDP_OBJS += sdpddi.o
--- a/usr/src/uts/common/Makefile.rules Fri Aug 13 14:44:26 2010 +0800 +++ b/usr/src/uts/common/Makefile.rules Fri Aug 13 07:02:57 2010 -0400 @@ -760,6 +760,10 @@ $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ib/clients/eoib/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ib/clients/of/sol_ofs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -2070,6 +2074,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/ibd/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/eoib/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/of/sol_ofs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_adm.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,487 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static int eib_adm_setup_cq(eib_t *); +static int eib_adm_setup_ud_channel(eib_t *); +static void eib_adm_comp_intr(ibt_cq_hdl_t, void *); +static void eib_adm_rx_comp(eib_t *, eib_wqe_t *); +static void eib_adm_tx_comp(eib_t *, eib_wqe_t *); +static void eib_adm_err_comp(eib_t *, eib_wqe_t *, ibt_wc_t *); +static void eib_rb_adm_setup_cq(eib_t *); +static void eib_rb_adm_setup_ud_channel(eib_t *); + +int +eib_adm_setup_qp(eib_t *ss, int *err) +{ + eib_chan_t *chan; + ibt_status_t ret; + uint16_t pkey_ix; + + /* + * Verify pkey + */ + ret = ibt_pkey2index(ss->ei_hca_hdl, ss->ei_props->ep_port_num, + EIB_ADMIN_PKEY, &pkey_ix); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_qp: " + "ibt_pkey2index() failed, port_num=0x%x, " + "pkey=0x%x, ret=%d", ss->ei_props->ep_port_num, + EIB_ADMIN_PKEY, ret); + *err = ENONET; + goto adm_setup_qp_fail; + } + + /* + * Allocate a eib_chan_t to store stuff about admin qp and + * initialize some basic stuff + */ + ss->ei_admin_chan = eib_chan_init(); + + chan = ss->ei_admin_chan; + chan->ch_pkey = EIB_ADMIN_PKEY; + chan->ch_pkey_ix = pkey_ix; + chan->ch_vnic_inst = -1; + + /* + * Setup a combined CQ and completion handler + */ + if (eib_adm_setup_cq(ss) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_qp: " + "eib_adm_setup_cq() failed"); + *err = ENOMEM; + goto adm_setup_qp_fail; + } + + /* + * Setup UD channel + */ + if (eib_adm_setup_ud_channel(ss) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_qp: " + "eib_adm_setup_ud_channel() failed"); + *err = ENOMEM; + goto adm_setup_qp_fail; + } + + /* + * Post initial set of rx buffers to the HCA + */ + if (eib_chan_post_rx(ss, chan, NULL) != EIB_E_SUCCESS) { + 
EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_qp: " + "eib_chan_post_rx() failed"); + *err = ENOMEM; + goto adm_setup_qp_fail; + } + + return (EIB_E_SUCCESS); + +adm_setup_qp_fail: + eib_rb_adm_setup_qp(ss); + return (EIB_E_FAILURE); +} + +/*ARGSUSED*/ +uint_t +eib_adm_comp_handler(caddr_t arg1, caddr_t arg2) +{ + eib_t *ss = (eib_t *)(void *)arg1; + eib_chan_t *chan = ss->ei_admin_chan; + ibt_wc_t *wc; + eib_wqe_t *wqe; + ibt_status_t ret; + uint_t polled; + int i; + + /* + * Re-arm the notification callback before we start polling + * the completion queue. There's nothing much we can do if the + * enable_cq_notify fails - we issue a warning and move on. + */ + ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_adm_comp_handler: " + "ibt_enable_cq_notify() failed, ret=%d", ret); + } + + /* + * Handle tx and rx completions + */ + while ((ret = ibt_poll_cq(chan->ch_cq_hdl, chan->ch_wc, chan->ch_cq_sz, + &polled)) == IBT_SUCCESS) { + for (wc = chan->ch_wc, i = 0; i < polled; i++, wc++) { + wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id; + if (wc->wc_status != IBT_WC_SUCCESS) { + eib_adm_err_comp(ss, wqe, wc); + } else if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) { + eib_adm_rx_comp(ss, wqe); + } else { + eib_adm_tx_comp(ss, wqe); + } + } + } + + return (DDI_INTR_CLAIMED); +} + +void +eib_rb_adm_setup_qp(eib_t *ss) +{ + eib_rb_adm_setup_ud_channel(ss); + + eib_rb_adm_setup_cq(ss); + + eib_chan_fini(ss->ei_admin_chan); + ss->ei_admin_chan = NULL; +} + +static int +eib_adm_setup_cq(eib_t *ss) +{ + eib_chan_t *chan = ss->ei_admin_chan; + ibt_cq_attr_t cq_attr; + ibt_status_t ret; + uint_t sz; + int rv; + + /* + * Allocate the admin completion queue for sending vnic logins and + * logouts and receiving vnic login acks. 
+ */ + cq_attr.cq_sched = NULL; + cq_attr.cq_flags = IBT_CQ_NO_FLAGS; + if (ss->ei_hca_attrs->hca_max_cq_sz < EIB_ADMIN_CQ_SIZE) + cq_attr.cq_size = ss->ei_hca_attrs->hca_max_cq_sz; + else + cq_attr.cq_size = EIB_ADMIN_CQ_SIZE; + + ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_cq_hdl, &sz); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_cq: " + "ibt_alloc_cq(cq_sz=0x%lx) failed, ret=%d", + cq_attr.cq_size, ret); + goto adm_setup_cq_fail; + } + + /* + * Set up other parameters for collecting completion information + */ + chan->ch_cq_sz = sz; + chan->ch_wc = kmem_zalloc(sizeof (ibt_wc_t) * sz, KM_SLEEP); + + /* + * Allocate soft interrupt for the admin channel cq handler and + * set up the handler as well. + */ + if ((rv = ddi_intr_add_softint(ss->ei_dip, &ss->ei_admin_si_hdl, + EIB_SOFTPRI_ADM, eib_adm_comp_handler, ss)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_cq: " + "ddi_intr_add_softint() failed for adm qp, ret=%d", rv); + goto adm_setup_cq_fail; + } + + /* + * Now, set up the admin completion queue handler. 
+ */ + ibt_set_cq_handler(chan->ch_cq_hdl, eib_adm_comp_intr, ss); + + ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_cq: " + "ibt_enable_cq_notify() failed, ret=%d", ret); + goto adm_setup_cq_fail; + } + + return (EIB_E_SUCCESS); + +adm_setup_cq_fail: + eib_rb_adm_setup_cq(ss); + return (EIB_E_FAILURE); +} + +static int +eib_adm_setup_ud_channel(eib_t *ss) +{ + eib_chan_t *chan = ss->ei_admin_chan; + ibt_ud_chan_alloc_args_t alloc_attr; + ibt_ud_chan_query_attr_t query_attr; + ibt_status_t ret; + + bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t)); + bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t)); + + alloc_attr.ud_flags = IBT_ALL_SIGNALED; + alloc_attr.ud_hca_port_num = ss->ei_props->ep_port_num; + alloc_attr.ud_pkey_ix = chan->ch_pkey_ix; + alloc_attr.ud_sizes.cs_sq = EIB_ADMIN_MAX_SWQE; + alloc_attr.ud_sizes.cs_rq = EIB_ADMIN_MAX_RWQE; + alloc_attr.ud_sizes.cs_sq_sgl = 1; + alloc_attr.ud_sizes.cs_rq_sgl = 1; + alloc_attr.ud_sizes.cs_inline = 0; + + alloc_attr.ud_qkey = EIB_FIP_QKEY; + alloc_attr.ud_scq = chan->ch_cq_hdl; + alloc_attr.ud_rcq = chan->ch_cq_hdl; + alloc_attr.ud_pd = ss->ei_pd_hdl; + + ret = ibt_alloc_ud_channel(ss->ei_hca_hdl, IBT_ACHAN_NO_FLAGS, + &alloc_attr, &chan->ch_chan, NULL); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_ud_channel: " + "ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x) " + "failed, ret=%d", alloc_attr.ud_hca_port_num, + chan->ch_pkey_ix, ret); + goto adm_setup_ud_channel_fail; + } + + ret = ibt_query_ud_channel(chan->ch_chan, &query_attr); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_ud_channel: " + "ibt_query_ud_channel() failed, ret=%d", ret); + goto adm_setup_ud_channel_fail; + } + + chan->ch_qpn = query_attr.ud_qpn; + chan->ch_max_swqes = query_attr.ud_chan_sizes.cs_sq; + chan->ch_max_rwqes = query_attr.ud_chan_sizes.cs_rq; + chan->ch_lwm_rwqes = 
chan->ch_max_rwqes >> 2; + chan->ch_rwqe_bktsz = chan->ch_max_rwqes; + chan->ch_ip_hdr_align = 0; + chan->ch_alloc_mp = B_FALSE; + chan->ch_tear_down = B_FALSE; + + return (EIB_E_SUCCESS); + +adm_setup_ud_channel_fail: + eib_rb_adm_setup_ud_channel(ss); + return (EIB_E_FAILURE); +} + +static void +eib_adm_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg) +{ + eib_t *ss = arg; + eib_chan_t *chan = ss->ei_admin_chan; + + if (cq_hdl != chan->ch_cq_hdl) { + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_adm_comp_intr: " + "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), " + "ignoring completion", cq_hdl, chan->ch_cq_hdl); + return; + } + + ASSERT(ss->ei_admin_si_hdl != NULL); + + (void) ddi_intr_trigger_softint(ss->ei_admin_si_hdl, NULL); +} + +static void +eib_adm_rx_comp(eib_t *ss, eib_wqe_t *wqe) +{ + eib_chan_t *chan = ss->ei_admin_chan; + eib_login_data_t ld; + uint8_t *pkt = (uint8_t *)(uintptr_t)(wqe->qe_sgl.ds_va); + ibt_status_t ret; + + /* + * Skip the GRH and parse the login ack message in the packet + */ + if (eib_fip_parse_login_ack(ss, pkt + EIB_GRH_SZ, &ld) == EIB_E_SUCCESS) + eib_vnic_login_ack(ss, &ld); + + /* + * Try to repost the rwqe. For admin channel, we can take the shortcut + * and not go through eib_chan_post_recv(), since we know that the + * qe_info flag, qe_chan and qe_vinst are all already set correctly; we + * just took this out of the rx queue, so the ch_rx_posted will be ok + * if we just posted it back. And there are no mblk allocation or + * buffer alignment restrictions for this channel as well. 
+ */ + if (chan->ch_tear_down) { + eib_rsrc_return_rwqe(ss, wqe, chan); + } else { + ret = ibt_post_recv(chan->ch_chan, &(wqe->qe_wr.recv), 1, NULL); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_rx_comp: " + "ibt_post_recv() failed, ret=%d", ret); + eib_rsrc_return_rwqe(ss, wqe, chan); + } + } +} + +static void +eib_adm_tx_comp(eib_t *ss, eib_wqe_t *wqe) +{ + eib_rsrc_return_swqe(ss, wqe, ss->ei_admin_chan); +} + +/*ARGSUSED*/ +static void +eib_adm_err_comp(eib_t *ss, eib_wqe_t *wqe, ibt_wc_t *wc) +{ + /* + * Currently, all we do is report + */ + switch (wc->wc_status) { + case IBT_WC_WR_FLUSHED_ERR: + break; + + case IBT_WC_LOCAL_CHAN_OP_ERR: + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_err_comp: " + "IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ", + wqe->qe_info); + break; + + case IBT_WC_LOCAL_PROTECT_ERR: + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_err_comp: " + "IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ", + wqe->qe_info); + break; + } + + /* + * When a wc indicates error, we do not attempt to repost but + * simply return it to the wqe pool. 
+ */ + if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) + eib_rsrc_return_rwqe(ss, wqe, ss->ei_admin_chan); + else + eib_rsrc_return_swqe(ss, wqe, ss->ei_admin_chan); +} + +static void +eib_rb_adm_setup_cq(eib_t *ss) +{ + eib_chan_t *chan = ss->ei_admin_chan; + ibt_status_t ret; + + if (chan == NULL) + return; + + /* + * Reset any completion handler we may have set up + */ + if (chan->ch_cq_hdl) + ibt_set_cq_handler(chan->ch_cq_hdl, NULL, NULL); + + /* + * Remove any softint we may have allocated for the admin cq + */ + if (ss->ei_admin_si_hdl) { + (void) ddi_intr_remove_softint(ss->ei_admin_si_hdl); + ss->ei_admin_si_hdl = NULL; + } + + /* + * Release any work completion buffers we may have allocated + */ + if (chan->ch_wc && chan->ch_cq_sz) + kmem_free(chan->ch_wc, sizeof (ibt_wc_t) * chan->ch_cq_sz); + + chan->ch_cq_sz = 0; + chan->ch_wc = NULL; + + /* + * Free any completion queue we may have allocated + */ + if (chan->ch_cq_hdl) { + ret = ibt_free_cq(chan->ch_cq_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_adm_setup_cq: " + "ibt_free_cq() failed, ret=%d", ret); + } + chan->ch_cq_hdl = NULL; + } +} + +static void +eib_rb_adm_setup_ud_channel(eib_t *ss) +{ + eib_chan_t *chan = ss->ei_admin_chan; + ibt_status_t ret; + + if (chan == NULL) + return; + + if (chan->ch_chan) { + /* + * We're trying to tear down this UD channel. Make sure that + * we don't attempt to refill (repost) at any point from now on. + */ + chan->ch_tear_down = B_TRUE; + if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_adm_setup_ud_channel: " + "ibt_flush_channel() failed, ret=%d", ret); + } + + /* + * Wait until all posted tx wqes on this channel are back with + * the wqe pool. 
+ */ + mutex_enter(&chan->ch_tx_lock); + while (chan->ch_tx_posted > 0) + cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock); + mutex_exit(&chan->ch_tx_lock); + + /* + * Wait until all posted rx wqes on this channel are back with + * the wqe pool. + */ + mutex_enter(&chan->ch_rx_lock); + while (chan->ch_rx_posted > 0) + cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock); + mutex_exit(&chan->ch_rx_lock); + + /* + * Now we're ready to free this channel + */ + if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_adm_setup_ud_channel: " + "ibt_free_channel() failed, ret=%d", ret); + } + + chan->ch_alloc_mp = B_FALSE; + chan->ch_ip_hdr_align = 0; + chan->ch_rwqe_bktsz = 0; + chan->ch_lwm_rwqes = 0; + chan->ch_max_rwqes = 0; + chan->ch_max_swqes = 0; + chan->ch_qpn = 0; + chan->ch_chan = NULL; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_chan.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,216 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +eib_chan_t * +eib_chan_init(void) +{ + eib_chan_t *chan; + + /* + * Allocate a eib_chan_t to store stuff about admin qp and + * initialize some basic stuff + */ + chan = kmem_zalloc(sizeof (eib_chan_t), KM_SLEEP); + + mutex_init(&chan->ch_pkey_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&chan->ch_cep_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&chan->ch_tx_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&chan->ch_rx_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&chan->ch_vhub_lock, NULL, MUTEX_DRIVER, NULL); + + cv_init(&chan->ch_cep_cv, NULL, CV_DEFAULT, NULL); + cv_init(&chan->ch_tx_cv, NULL, CV_DEFAULT, NULL); + cv_init(&chan->ch_rx_cv, NULL, CV_DEFAULT, NULL); + + return (chan); +} + +void +eib_chan_fini(eib_chan_t *chan) +{ + if (chan) { + cv_destroy(&chan->ch_rx_cv); + cv_destroy(&chan->ch_tx_cv); + cv_destroy(&chan->ch_cep_cv); + + mutex_destroy(&chan->ch_vhub_lock); + mutex_destroy(&chan->ch_rx_lock); + mutex_destroy(&chan->ch_tx_lock); + mutex_destroy(&chan->ch_cep_lock); + mutex_destroy(&chan->ch_pkey_lock); + + kmem_free(chan, sizeof (eib_chan_t)); + } +} + +int +eib_chan_post_rx(eib_t *ss, eib_chan_t *chan, uint_t *n_posted) +{ + eib_wqe_t *rwqes[EIB_RWR_CHUNK_SZ]; + ibt_status_t ret; + uint_t n_got = 0; + uint_t n_good = 0; + uint_t limit = 0; + uint_t room = 0; + uint_t chunk_sz; + int wndx; + int i; + + /* + * We don't want to post beyond the maximum rwqe size for this channel + */ + room = chan->ch_max_rwqes - chan->ch_rx_posted; + limit = (room > chan->ch_rwqe_bktsz) ? chan->ch_rwqe_bktsz : room; + + for (wndx = 0; wndx < limit; wndx += chunk_sz) { + /* + * Grab a chunk of rwqes + */ + chunk_sz = ((limit - wndx) < EIB_RWR_CHUNK_SZ) ? 
+ (limit - wndx) : EIB_RWR_CHUNK_SZ; + + /* + * When eib_chan_post_rx() is called to post a bunch of rwqes, + * it is either during the vnic setup or when we're refilling + * the data channel. Neither situation is important enough for + * us to grab the wqes reserved for sending keepalives of + * previously established vnics. + */ + ret = eib_rsrc_grab_rwqes(ss, rwqes, chunk_sz, &n_got, + EIB_WPRI_LO); + if (ret != EIB_E_SUCCESS) + break; + + /* + * Post work requests from the rwqes we just grabbed + */ + for (i = 0; i < n_got; i++) { + eib_wqe_t *rwqe = rwqes[i]; + + ret = eib_chan_post_recv(ss, chan, rwqe); + if (ret == EIB_E_SUCCESS) { + n_good++; + } else if (rwqe->qe_mp) { + freemsg(rwqe->qe_mp); + } else { + eib_rsrc_return_rwqe(ss, rwqe, NULL); + } + } + + /* + * If we got less rwqes than we asked for during the grab + * earlier, we'll stop asking for more and quit now. + */ + if (n_got < chunk_sz) + break; + } + + /* + * If we posted absolutely nothing, we return failure; otherwise + * return success. 
+ */ + if (n_good == 0) + return (EIB_E_FAILURE); + + if (n_posted) + *n_posted = n_good; + + return (EIB_E_SUCCESS); +} + +/*ARGSUSED*/ +int +eib_chan_post_recv(eib_t *ss, eib_chan_t *chan, eib_wqe_t *rwqe) +{ + ibt_status_t ret; + uint8_t *mp_base; + size_t mp_len; + + rwqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)rwqe->qe_cpbuf; + rwqe->qe_sgl.ds_len = rwqe->qe_bufsz; + + /* + * If this channel has receive buffer alignment restrictions, make + * sure the requirements are met + */ + if (chan->ch_ip_hdr_align) { + rwqe->qe_sgl.ds_va += chan->ch_ip_hdr_align; + rwqe->qe_sgl.ds_len -= chan->ch_ip_hdr_align; + } + + /* + * If the receive buffer for this channel needs to have an mblk + * allocated, do it + */ + if (chan->ch_alloc_mp) { + mp_base = (uint8_t *)(uintptr_t)(rwqe->qe_sgl.ds_va); + mp_len = rwqe->qe_sgl.ds_len; + + rwqe->qe_mp = desballoc(mp_base, mp_len, 0, &rwqe->qe_frp); + if (rwqe->qe_mp == NULL) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_chan_post_recv: " + "desballoc(base=0x%llx, len=0x%llx) failed", + mp_base, mp_len); + return (EIB_E_FAILURE); + } + } + + /* + * Check if the recv queue is already full or if we can post one more + */ + mutex_enter(&chan->ch_rx_lock); + if (chan->ch_rx_posted > (chan->ch_max_rwqes - 1)) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_chan_post_recv: " + "too many rwqes posted already, posted=0x%lx, max=0x%lx", + chan->ch_rx_posted, chan->ch_max_rwqes); + mutex_exit(&chan->ch_rx_lock); + return (EIB_E_FAILURE); + } + + rwqe->qe_vnic_inst = chan->ch_vnic_inst; + rwqe->qe_chan = chan; + rwqe->qe_info |= EIB_WQE_FLG_POSTED_TO_HCA; + + ret = ibt_post_recv(chan->ch_chan, &(rwqe->qe_wr.recv), 1, NULL); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_chan_post_recv: " + "ibt_post_recv() failed, ret=%d", ret); + mutex_exit(&chan->ch_rx_lock); + return (EIB_E_FAILURE); + } + chan->ch_rx_posted++; + mutex_exit(&chan->ch_rx_lock); + + return (EIB_E_SUCCESS); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_cmn.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,394 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Definitions private to this file + */ +ib_gid_t eib_reserved_gid; + +uint8_t eib_zero_mac[] = { + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 +}; + +uint8_t eib_broadcast_mac[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +int eib_setbit_mod67[] = { + -1, 0, 1, 39, 2, 15, 40, 23, + 3, 12, 16, 59, 41, 19, 24, 54, + 4, -1, 13, 10, 17, 62, 60, 28, + 42, 30, 20, 51, 25, 44, 55, 47, + 5, 32, -1, 38, 14, 22, 11, 58, + 18, 53, 63, 9, 61, 27, 29, 50, + 43, 46, 31, 37, 21, 57, 52, 8, + 26, 49, 45, 36, 56, 7, 48, 35, + 6, 34, 33 +}; + +char *eib_pvt_props[] = { + EIB_DLPROP_GW_EPORT_STATE, + EIB_DLPROP_HCA_GUID, + EIB_DLPROP_PORT_GUID, + NULL +}; + +#define eib_prop_get_and_test(inst, dp, propname, propval) \ +{ \ + (propval) = ddi_prop_get_int(DDI_DEV_T_ANY, (dp), \ + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, (propname), -1); \ + if ((propval) == -1) { \ + EIB_DPRINTF_WARN((inst), "eib_get_props: " \ + "ddi_prop_get_int() could not find " \ + "property '%s'", (propname)); \ + goto get_props_fail; \ + } \ +} + +#define eib_prop64_get_and_test(inst, dp, propname, propval) \ +{ \ + (propval) = ddi_prop_get_int64(DDI_DEV_T_ANY, (dp), \ + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, (propname), -1); \ + if ((propval) == -1) { \ + EIB_DPRINTF_WARN((inst), "eib_get_props: " \ + "ddi_prop_get_int64() could not find " \ + "property '%s'", (propname)); \ + goto get_props_fail; \ + } \ +} + +#define eib_propstr_get_and_test(inst, dp, propname, propval_p) \ +{ \ + int rv; \ + \ + *(propval_p) = NULL; \ + \ + rv = ddi_prop_lookup_string(DDI_DEV_T_ANY, (dp), \ + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, (propname), \ + (propval_p)); \ + if (rv != DDI_PROP_SUCCESS) { \ + EIB_DPRINTF_WARN((inst), "eib_get_props: " \ + "ddi_prop_lookup_string() could not find " \ + "property '%s'", (propname)); \ + goto 
get_props_fail; \ + } \ +} + +/* + * HW/FW workarounds + */ + +/* + * 1. Verification of descriptor list length in the received packets is + * disabled, since experimentation shows that BX does not set the desc + * list length correctly. True for EoIB nexus as well. + */ +int eib_wa_no_desc_list_len = 1; + +/* + * 2. LSO/Checksum_Offload for EoIB packets does not seem to be supported + * currently, so we'll disable both temporarily. + */ +int eib_wa_no_cksum_offload = 1; +int eib_wa_no_lso = 1; + +/* + * 3. The "multicast entry" types are not clearly defined in the spec + * at the moment. The current BX software/firmware appears to ignore + * the type of the context table entries, so we will treat these + * addresses just like regular vnic addresses. + */ +int eib_wa_no_mcast_entries = 1; + +/* + * 4. VHUB updates from the gateways provide us with destination LIDs, + * and we will hand-create these address vectors. + */ +int eib_wa_no_av_discover = 1; + +/* + * 5. The older BX software does not seem to set the VP flag correctly + * in the login acknowledgements even when it successfully allocates + * a vlan, so we will ignore it for now. + */ +int eib_wa_no_good_vp_flag = 1; + +/* + * 6. Each vhub table is expected to carry a checksum at the end to + * verify the contents of the received vhub table. The current BX + * software/firmware does not seem to fill this field with the + * correct value (and/or the spec description is ambiguous). We + * will ignore the vhub table checksum verification for now. 
+ */ +int eib_wa_no_good_vhub_cksum = 1; + +int +eib_get_props(eib_t *ss) +{ + int val; + int64_t val64; + char *str; + clock_t gw_ka_usecs; + clock_t vnic_ka_usecs; + + ss->ei_gw_props = kmem_zalloc(sizeof (eib_gw_props_t), KM_SLEEP); + ss->ei_props = kmem_zalloc(sizeof (eib_props_t), KM_SLEEP); + + mutex_init(&ss->ei_gw_props->pp_gw_lock, NULL, MUTEX_DRIVER, NULL); + + /* + * The interface speed is currently set to 10Gb/s, since we don't + * have a way yet to figure this virtual-wire specific data from + * the gateway. The rest of the properties are handed over to us + * by the EoIB nexus. + */ + ss->ei_props->ep_ifspeed = 10000000000; + + eib_prop64_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_HCA_GUID, val64); + ss->ei_props->ep_hca_guid = (ib_guid_t)val64; + + eib_prop64_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_GW_SYS_GUID, val64); + ss->ei_gw_props->pp_gw_system_guid = (ib_guid_t)val64; + + eib_prop64_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_GW_GUID, val64); + ss->ei_gw_props->pp_gw_guid = (ib_guid_t)val64; + + eib_prop64_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_GW_SN_PREFIX, val64); + ss->ei_gw_props->pp_gw_sn_prefix = (ib_sn_prefix_t)val64; + + eib_prop_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_GW_ADV_PERIOD, val); + ss->ei_gw_props->pp_gw_adv_period = (uint_t)val; + + eib_prop_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_GW_KA_PERIOD, val); + ss->ei_gw_props->pp_gw_ka_period = (uint_t)val; + + gw_ka_usecs = ss->ei_gw_props->pp_gw_ka_period * 1000; + gw_ka_usecs = ((gw_ka_usecs << 2) + gw_ka_usecs) >> 1; + ss->ei_gw_props->pp_gw_ka_ticks = drv_usectohz(gw_ka_usecs); + + eib_prop_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_VNIC_KA_PERIOD, val); + ss->ei_gw_props->pp_vnic_ka_period = (uint_t)val; + + vnic_ka_usecs = ss->ei_gw_props->pp_vnic_ka_period * 1000; + ss->ei_gw_props->pp_vnic_ka_ticks = drv_usectohz(vnic_ka_usecs); + + eib_prop_get_and_test(ss->ei_instance, ss->ei_dip, + 
/*
 * NOTE(review): this span opens mid-way through eib_get_props(); the
 * function head and the eib_prop_get_and_test()/eib_propstr_get_and_test()
 * macros are outside this excerpt.  The macros presumably jump to
 * get_props_fail on lookup failure -- TODO confirm against eib_impl.h.
 */
	    EIB_PROP_GW_CTRL_QPN, val);
	ss->ei_gw_props->pp_gw_ctrl_qpn = (ib_qpn_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_LID, val);
	ss->ei_gw_props->pp_gw_lid = (ib_lid_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_PORTID, val);
	ss->ei_gw_props->pp_gw_portid = (uint16_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_NUM_NET_VNICS, val);
	ss->ei_gw_props->pp_gw_num_net_vnics = (uint16_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_AVAILABLE, val);
	ss->ei_gw_props->pp_gw_flag_available = (uint8_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_HOST_VNICS, val);
	ss->ei_gw_props->pp_gw_is_host_adm_vnics = (uint8_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_SL, val);
	ss->ei_gw_props->pp_gw_sl = (uint8_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_N_RSS_QPN, val);
	ss->ei_gw_props->pp_gw_n_rss_qpn = (uint8_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_HCA_PORTNUM, val);
	ss->ei_props->ep_port_num = (uint8_t)val;

	/*
	 * String-valued properties: the looked-up strings are retained in
	 * the gw props and ddi_prop_free()d later by eib_rb_get_props()
	 * (or replaced by eib_update_props()).
	 */
	eib_propstr_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_SYS_NAME, &str);
	ss->ei_gw_props->pp_gw_system_name = (uint8_t *)str;

	eib_propstr_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_PORT_NAME, &str);
	ss->ei_gw_props->pp_gw_port_name = (uint8_t *)str;

	eib_propstr_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_VENDOR_ID, &str);
	ss->ei_gw_props->pp_gw_vendor_id = (uint8_t *)str;

	return (EIB_E_SUCCESS);

get_props_fail:
	/* Undo any partial setup (frees strings, locks and prop structs) */
	eib_rb_get_props(ss);
	return (EIB_E_FAILURE);
}

/*
 * eib_update_props - refresh the cached gateway properties from the
 * newly received gateway info and republish every value as a devinfo
 * node property via ddi_prop_update_*().  For the three string-valued
 * properties, the freshly looked-up copy replaces (and frees) the one
 * cached previously.  The whole update runs under pp_gw_lock.
 */
void
eib_update_props(eib_t *ss, eib_gw_info_t *new_gw_info)
{
	eib_gw_props_t *gwp = ss->ei_gw_props;
	dev_info_t *dip = ss->ei_dip;
	char *str;

	ASSERT(gwp != NULL && dip != NULL);

	mutex_enter(&gwp->pp_gw_lock);

	/* 64-bit properties */
	gwp->pp_gw_system_guid = new_gw_info->gi_system_guid;
	(void) ddi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SYS_GUID,
	    gwp->pp_gw_system_guid);

	gwp->pp_gw_guid = new_gw_info->gi_guid;
	(void) ddi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_GW_GUID,
	    gwp->pp_gw_guid);

	gwp->pp_gw_sn_prefix = new_gw_info->gi_sn_prefix;
	(void) ddi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SN_PREFIX,
	    gwp->pp_gw_sn_prefix);

	/* 32-bit (and narrower) properties, published as ints */
	gwp->pp_gw_adv_period = new_gw_info->gi_adv_period;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_ADV_PERIOD,
	    gwp->pp_gw_adv_period);

	gwp->pp_gw_ka_period = new_gw_info->gi_ka_period;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_KA_PERIOD,
	    gwp->pp_gw_ka_period);

	gwp->pp_vnic_ka_period = new_gw_info->gi_vnic_ka_period;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_VNIC_KA_PERIOD,
	    gwp->pp_vnic_ka_period);

	gwp->pp_gw_ctrl_qpn = new_gw_info->gi_ctrl_qpn;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_CTRL_QPN,
	    gwp->pp_gw_ctrl_qpn);

	gwp->pp_gw_lid = new_gw_info->gi_lid;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_LID,
	    gwp->pp_gw_lid);

	gwp->pp_gw_portid = new_gw_info->gi_portid;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_PORTID,
	    gwp->pp_gw_portid);

	gwp->pp_gw_num_net_vnics = new_gw_info->gi_num_net_vnics;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip,
	    EIB_PROP_GW_NUM_NET_VNICS, gwp->pp_gw_num_net_vnics);

	gwp->pp_gw_flag_available = new_gw_info->gi_flag_available;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_AVAILABLE,
	    gwp->pp_gw_flag_available);

	gwp->pp_gw_is_host_adm_vnics = new_gw_info->gi_is_host_adm_vnics;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_HOST_VNICS,
	    gwp->pp_gw_is_host_adm_vnics);

	gwp->pp_gw_sl = new_gw_info->gi_sl;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SL,
	    gwp->pp_gw_sl);

	gwp->pp_gw_n_rss_qpn = new_gw_info->gi_n_rss_qpn;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_N_RSS_QPN,
	    gwp->pp_gw_n_rss_qpn);

	/*
	 * String properties: publish the new value, look our own copy back
	 * up (DONTPASS|NOTPROM restricts the lookup to this node's software
	 * property list), free the previously cached string and cache the
	 * new one.
	 */
	(void) ddi_prop_update_string(DDI_DEV_T_NONE, dip,
	    EIB_PROP_GW_SYS_NAME, (char *)(new_gw_info->gi_system_name));
	(void) ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, EIB_PROP_GW_SYS_NAME, &str);
	if (gwp->pp_gw_system_name) {
		ddi_prop_free(gwp->pp_gw_system_name);
	}
	gwp->pp_gw_system_name = (uint8_t *)str;

	(void) ddi_prop_update_string(DDI_DEV_T_NONE, dip,
	    EIB_PROP_GW_PORT_NAME, (char *)(new_gw_info->gi_port_name));
	(void) ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, EIB_PROP_GW_PORT_NAME, &str);
	if (gwp->pp_gw_port_name) {
		ddi_prop_free(gwp->pp_gw_port_name);
	}
	gwp->pp_gw_port_name = (uint8_t *)str;

	(void) ddi_prop_update_string(DDI_DEV_T_NONE, dip,
	    EIB_PROP_GW_VENDOR_ID, (char *)(new_gw_info->gi_vendor_id));
	(void) ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, EIB_PROP_GW_VENDOR_ID, &str);
	if (gwp->pp_gw_vendor_id) {
		ddi_prop_free(gwp->pp_gw_vendor_id);
	}
	gwp->pp_gw_vendor_id = (uint8_t *)str;

	mutex_exit(&gwp->pp_gw_lock);
}

/*
 * eib_rb_get_props - rollback for eib_get_props().  Frees the three
 * looked-up string properties, destroys the gw props lock and releases
 * the eib_props_t/eib_gw_props_t allocations.  Callers must not touch
 * ss->ei_props / ss->ei_gw_props afterwards (both are NULLed here).
 */
void
eib_rb_get_props(eib_t *ss)
{
	/*
	 * Free any allocations
	 */
	if (ss->ei_gw_props->pp_gw_vendor_id) {
		ddi_prop_free(ss->ei_gw_props->pp_gw_vendor_id);
		ss->ei_gw_props->pp_gw_vendor_id = NULL;
	}
	if (ss->ei_gw_props->pp_gw_port_name) {
		ddi_prop_free(ss->ei_gw_props->pp_gw_port_name);
		ss->ei_gw_props->pp_gw_port_name = NULL;
	}
	if (ss->ei_gw_props->pp_gw_system_name) {
		ddi_prop_free(ss->ei_gw_props->pp_gw_system_name);
		ss->ei_gw_props->pp_gw_system_name = NULL;
	}

	mutex_destroy(&ss->ei_gw_props->pp_gw_lock);

	/*
	 * Free space allocated for holding the props
	 */
	kmem_free(ss->ei_props, sizeof (eib_props_t));
	kmem_free(ss->ei_gw_props, sizeof (eib_gw_props_t));

	ss->ei_props = NULL;
	ss->ei_gw_props = NULL;
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_ctl.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,469 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
 */

/*
 * eib_ctl.c - setup, completion handling and teardown of the per-vnic
 * EoIB "control" UD channel (vhub table requests, vhub-update and
 * vnic-alive FIP messages exchanged with the gateway).
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>

#include <sys/ib/clients/eoib/eib_impl.h>

/*
 * Declarations private to this file
 */
static int eib_ctl_setup_cq(eib_t *, eib_vnic_t *);
static int eib_ctl_setup_ud_channel(eib_t *, eib_vnic_t *);
static void eib_ctl_comp_intr(ibt_cq_hdl_t, void *);
static void eib_ctl_rx_comp(eib_vnic_t *, eib_wqe_t *);
static void eib_ctl_tx_comp(eib_vnic_t *, eib_wqe_t *);
static void eib_ctl_err_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_rb_ctl_setup_cq(eib_t *, eib_vnic_t *);
static void eib_rb_ctl_setup_ud_channel(eib_t *, eib_vnic_t *);

/*
 * eib_ctl_create_qp - create the control qp for a vnic: channel state,
 * a combined tx/rx CQ with its softint handler, and the UD channel.
 * On failure, *err is set to ENOMEM and all partial setup is undone.
 * Returns EIB_E_SUCCESS/EIB_E_FAILURE.
 */
int
eib_ctl_create_qp(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	eib_chan_t *chan = NULL;

	/*
	 * Allocate an eib_chan_t to store stuff about this vnic's ctl qp
	 * and initialize it with default admin qp pkey parameters. We'll
	 * re-associate this with the pkey we receive from the gw once we
	 * receive the login ack.
	 */
	vnic->vn_ctl_chan = eib_chan_init();

	chan = vnic->vn_ctl_chan;
	chan->ch_pkey = ss->ei_admin_chan->ch_pkey;
	chan->ch_pkey_ix = ss->ei_admin_chan->ch_pkey_ix;
	chan->ch_vnic_inst = vnic->vn_instance;

	/*
	 * Setup a combined CQ and completion handler
	 */
	if (eib_ctl_setup_cq(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_create_qp: "
		    "eib_ctl_setup_cq() failed");
		*err = ENOMEM;
		goto ctl_create_qp_fail;
	}

	/*
	 * Setup UD channel
	 */
	if (eib_ctl_setup_ud_channel(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_create_qp: "
		    "eib_ctl_setup_ud_channel() failed");
		*err = ENOMEM;
		goto ctl_create_qp_fail;
	}

	return (EIB_E_SUCCESS);

ctl_create_qp_fail:
	eib_rb_ctl_create_qp(ss, vnic);
	return (EIB_E_FAILURE);
}

/*
 * eib_ctl_comp_handler - softint handler for the control channel CQ.
 * Drains the CQ and dispatches each work completion to the rx, tx or
 * error completion routine.  Always returns DDI_INTR_CLAIMED.
 */
/*ARGSUSED*/
uint_t
eib_ctl_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_chan_t *chan = vnic->vn_ctl_chan;
	eib_t *ss = vnic->vn_ss;
	ibt_wc_t *wc;
	eib_wqe_t *wqe;
	ibt_status_t ret;
	uint_t polled;
	int i;

	/*
	 * Re-arm the notification callback before we start polling
	 * the completion queue.  There's nothing much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ctl_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * Handle tx and rx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_cq_hdl, chan->ch_wc, chan->ch_cq_sz,
	    &polled)) == IBT_SUCCESS) {
		for (wc = chan->ch_wc, i = 0; i < polled; i++, wc++) {
			/* wc_id carries the wqe pointer we posted */
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				eib_ctl_err_comp(vnic, wqe, wc);
			} else if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) {
				eib_ctl_rx_comp(vnic, wqe);
			} else {
				eib_ctl_tx_comp(vnic, wqe);
			}
		}
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * eib_rb_ctl_create_qp - rollback for eib_ctl_create_qp(); tears down
 * the UD channel and CQ, then frees the channel state.
 */
void
eib_rb_ctl_create_qp(eib_t *ss, eib_vnic_t *vnic)
{
	eib_rb_ctl_setup_ud_channel(ss, vnic);

	eib_rb_ctl_setup_cq(ss, vnic);

	eib_chan_fini(vnic->vn_ctl_chan);
	vnic->vn_ctl_chan = NULL;
}

/*
 * eib_ctl_setup_cq - allocate the control channel's CQ (size capped at
 * the HCA maximum), the work-completion array, a softint for completion
 * processing, and arm the CQ.  Rolls itself back on failure.
 */
static int
eib_ctl_setup_cq(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	ibt_cq_attr_t cq_attr;
	ibt_status_t ret;
	uint_t sz;
	int rv;

	/*
	 * Allocate a completion queue for sending vhub table request
	 * and vhub-update/vnic-alive messages and responses from the
	 * gateway
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	if (ss->ei_hca_attrs->hca_max_cq_sz < EIB_CTL_CQ_SIZE)
		cq_attr.cq_size = ss->ei_hca_attrs->hca_max_cq_sz;
	else
		cq_attr.cq_size = EIB_CTL_CQ_SIZE;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_cq_hdl, &sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_setup_cq: "
		    "ibt_alloc_cq(cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto ctl_setup_cq_fail;
	}

	/*
	 * Set up other parameters for collecting completion information;
	 * ch_cq_sz is the real (possibly rounded-up) size ibt_alloc_cq
	 * gave us, and sizes the wc array used by the poll loop.
	 */
	chan->ch_cq_sz = sz;
	chan->ch_wc = kmem_zalloc(sizeof (ibt_wc_t) * sz, KM_SLEEP);

	/*
	 * Allocate soft interrupt for this vnic's control channel cq
	 * handler and set up the IBTL cq handler.
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_ctl_si_hdl,
	    EIB_SOFTPRI_CTL, eib_ctl_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_setup_cq: "
		    "ddi_intr_add_softint() failed for vnic %d ctl qp, ret=%d",
		    vnic->vn_instance, rv);
		goto ctl_setup_cq_fail;
	}

	/*
	 * Now, set up this vnic's control channel completion queue handler
	 */
	ibt_set_cq_handler(chan->ch_cq_hdl, eib_ctl_comp_intr, vnic);

	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_setup_cq: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
		goto ctl_setup_cq_fail;
	}

	return (EIB_E_SUCCESS);

ctl_setup_cq_fail:
	eib_rb_ctl_setup_cq(ss, vnic);
	return (EIB_E_FAILURE);
}

/*
 * eib_ctl_setup_ud_channel - allocate the UD channel for the control qp
 * (single-SGL, all-signaled, FIP qkey, send and recv sharing the one CQ)
 * and cache the queried channel attributes in the eib_chan_t.
 */
static int
eib_ctl_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	ibt_ud_chan_alloc_args_t alloc_attr;
	ibt_ud_chan_query_attr_t query_attr;
	ibt_status_t ret;

	bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t));
	bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t));

	alloc_attr.ud_flags = IBT_ALL_SIGNALED;
	alloc_attr.ud_hca_port_num = ss->ei_props->ep_port_num;
	alloc_attr.ud_pkey_ix = chan->ch_pkey_ix;
	alloc_attr.ud_sizes.cs_sq = EIB_CTL_MAX_SWQE;
	alloc_attr.ud_sizes.cs_rq = EIB_CTL_MAX_RWQE;
	alloc_attr.ud_sizes.cs_sq_sgl = 1;
	alloc_attr.ud_sizes.cs_rq_sgl = 1;
	alloc_attr.ud_sizes.cs_inline = 0;

	alloc_attr.ud_qkey = EIB_FIP_QKEY;
	alloc_attr.ud_scq = chan->ch_cq_hdl;
	alloc_attr.ud_rcq = chan->ch_cq_hdl;
	alloc_attr.ud_pd = ss->ei_pd_hdl;

	ret = ibt_alloc_ud_channel(ss->ei_hca_hdl, IBT_ACHAN_NO_FLAGS,
	    &alloc_attr, &chan->ch_chan, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_setup_ud_channel: "
		    "ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x) "
		    "failed, ret=%d", alloc_attr.ud_hca_port_num,
		    chan->ch_pkey_ix, ret);
		goto ctl_setup_ud_channel_fail;
	}

	ret = ibt_query_ud_channel(chan->ch_chan, &query_attr);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_setup_ud_channel: "
		    "ibt_query_ud_channel() failed, ret=%d", ret);
		goto ctl_setup_ud_channel_fail;
	}

	/* Cache the real channel attributes (may exceed what we asked) */
	chan->ch_qpn = query_attr.ud_qpn;
	chan->ch_max_swqes = query_attr.ud_chan_sizes.cs_sq;
	chan->ch_max_rwqes = query_attr.ud_chan_sizes.cs_rq;
	chan->ch_lwm_rwqes = chan->ch_max_rwqes >> 2;	/* low-water: 1/4 */
	chan->ch_rwqe_bktsz = chan->ch_max_rwqes;
	chan->ch_ip_hdr_align = 0;
	chan->ch_alloc_mp = B_FALSE;
	chan->ch_tear_down = B_FALSE;

	return (EIB_E_SUCCESS);

ctl_setup_ud_channel_fail:
	eib_rb_ctl_setup_ud_channel(ss, vnic);
	return (EIB_E_FAILURE);
}

/*
 * eib_ctl_comp_intr - IBTL CQ notification callback; simply triggers
 * the softint, where the real completion processing happens.
 */
static void
eib_ctl_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
{
	eib_vnic_t *vnic = arg;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_ctl_chan;

	if (cq_hdl != chan->ch_cq_hdl) {
		EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_ctl_comp_intr: "
		    "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), "
		    "ignoring completion", cq_hdl, chan->ch_cq_hdl);
		return;
	}

	ASSERT(vnic->vn_ctl_si_hdl != NULL);

	(void) ddi_intr_trigger_softint(vnic->vn_ctl_si_hdl, NULL);
}

/*
 * eib_ctl_rx_comp - process one received control message: parse the
 * FIP payload (past the GRH) and repost the rwqe, unless the channel
 * is being torn down or the repost fails, in which case the rwqe goes
 * back to the pool.
 */
static void
eib_ctl_rx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe)
{
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_ctl_chan;
	uint8_t *pkt = (uint8_t *)(uintptr_t)(wqe->qe_sgl.ds_va);
	ibt_status_t ret;

	/*
	 * Skip the GRH and parse the message in the packet
	 */
	(void) eib_fip_parse_ctl_pkt(pkt + EIB_GRH_SZ, vnic);

	/*
	 * Try to repost the rwqe.  For control channels, we take the
	 * shortcut and don't go through eib_chan_post_recv(), since we
	 * know that the qe_info flag, qe_chan and qe_vinst are all already
	 * set correctly; we just took this out of the rx queue, so the
	 * ch_rx_posted will be ok if we just posted it back.  And there
	 * are no mblk allocation or buffer alignment restrictions for
	 * this channel as well.
	 */
	if (chan->ch_tear_down) {
		eib_rsrc_return_rwqe(ss, wqe, chan);
	} else {
		ret = ibt_post_recv(chan->ch_chan, &(wqe->qe_wr.recv), 1, NULL);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_rx_comp: "
			    "ibt_post_recv() failed, ret=%d", ret);
			eib_rsrc_return_rwqe(ss, wqe, chan);
		}
	}
}

/*
 * eib_ctl_tx_comp - a control-channel send finished; return the swqe
 * to the pool.
 */
static void
eib_ctl_tx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe)
{
	eib_rsrc_return_swqe(vnic->vn_ss, wqe, vnic->vn_ctl_chan);
}

/*
 * eib_ctl_err_comp - handle an errored work completion: log anything
 * other than a flush (which is expected during teardown) and return
 * the wqe to its pool without reposting.
 */
static void
eib_ctl_err_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc)
{
	eib_t *ss = vnic->vn_ss;

	/*
	 * Currently, all we do is report
	 */
	switch (wc->wc_status) {
	case IBT_WC_WR_FLUSHED_ERR:
		break;

	case IBT_WC_LOCAL_CHAN_OP_ERR:
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_err_comp: "
		    "IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ",
		    wqe->qe_info);
		break;

	case IBT_WC_LOCAL_PROTECT_ERR:
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_err_comp: "
		    "IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ",
		    wqe->qe_info);
		break;
	}

	/*
	 * When a wc indicates error, we do not attempt to repost but
	 * simply return it to the wqe pool.
	 */
	if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX)
		eib_rsrc_return_rwqe(ss, wqe, vnic->vn_ctl_chan);
	else
		eib_rsrc_return_swqe(ss, wqe, vnic->vn_ctl_chan);
}

/*
 * eib_rb_ctl_setup_cq - rollback for eib_ctl_setup_cq(): unhook the
 * CQ handler, remove the softint, free the wc array and the CQ itself.
 * Safe to call with partially initialized state.
 */
/*ARGSUSED*/
static void
eib_rb_ctl_setup_cq(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	/*
	 * Reset any completion handler we may have set up
	 */
	if (chan->ch_cq_hdl)
		ibt_set_cq_handler(chan->ch_cq_hdl, NULL, NULL);

	/*
	 * Remove any softint we may have allocated for this cq
	 */
	if (vnic->vn_ctl_si_hdl) {
		(void) ddi_intr_remove_softint(vnic->vn_ctl_si_hdl);
		vnic->vn_ctl_si_hdl = NULL;
	}

	/*
	 * Release any work completion buffers we may have allocated
	 */
	if (chan->ch_wc && chan->ch_cq_sz)
		kmem_free(chan->ch_wc, sizeof (ibt_wc_t) * chan->ch_cq_sz);

	chan->ch_cq_sz = 0;
	chan->ch_wc = NULL;

	/*
	 * Free any completion queue we may have allocated
	 */
	if (chan->ch_cq_hdl) {
		ret = ibt_free_cq(chan->ch_cq_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_ctl_setup_cq: "
			    "ibt_free_cq() failed, ret=%d", ret);
		}
		chan->ch_cq_hdl = NULL;
	}
}

/*
 * eib_rb_ctl_setup_ud_channel - rollback for eib_ctl_setup_ud_channel().
 * Marks the channel as tearing down (so nothing reposts), flushes it,
 * waits for every posted tx and rx wqe to come back to the pool, then
 * frees the channel and clears the cached attributes.
 */
/*ARGSUSED*/
static void
eib_rb_ctl_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	if (chan->ch_chan) {
		/*
		 * We're trying to tear down this UD channel.  Make sure that
		 * we don't attempt to refill (repost) at any point from now
		 * on.
		 */
		chan->ch_tear_down = B_TRUE;
		if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_ctl_setup_ud_channel: "
			    "ibt_flush_channel() failed, ret=%d", ret);
		}

		/*
		 * Wait until all posted tx wqes on this channel are back with
		 * the wqe pool.
		 */
		mutex_enter(&chan->ch_tx_lock);
		while (chan->ch_tx_posted > 0)
			cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock);
		mutex_exit(&chan->ch_tx_lock);

		/*
		 * Wait until all posted rx wqes on this channel are back with
		 * the wqe pool.
		 */
		mutex_enter(&chan->ch_rx_lock);
		while (chan->ch_rx_posted > 0)
			cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock);
		mutex_exit(&chan->ch_rx_lock);

		/*
		 * Now we're ready to free this channel
		 */
		if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_ctl_setup_ud_channel: "
			    "ibt_free_channel() failed, ret=%d", ret);
		}

		chan->ch_alloc_mp = B_FALSE;
		chan->ch_ip_hdr_align = 0;
		chan->ch_rwqe_bktsz = 0;
		chan->ch_lwm_rwqes = 0;
		chan->ch_max_rwqes = 0;
		chan->ch_max_swqes = 0;
		chan->ch_qpn = 0;
		chan->ch_chan = NULL;
	}
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_data.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1496 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
 */

/*
 * eib_data.c - setup, completion handling and teardown of the per-vnic
 * EoIB "data" UD channel (the Ethernet-frame-carrying path plumbed to
 * the MAC layer).
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/pattr.h>		/* HCK_* */
#include <inet/ip.h>		/* ipha_t */
#include <inet/tcp.h>		/* tcph_t */
#include <sys/mac_provider.h>	/* mac_* */
#include <sys/strsun.h>		/* MBLKL */

#include <sys/ib/clients/eoib/eib_impl.h>

/*
 * Declarations private to this file
 */
static int eib_data_setup_cqs(eib_t *, eib_vnic_t *);
static int eib_data_setup_ud_channel(eib_t *, eib_vnic_t *);
static void eib_data_setup_lso(eib_wqe_t *, mblk_t *, uint32_t,
    eib_ether_hdr_t *);
static int eib_data_prepare_sgl(eib_vnic_t *, eib_wqe_t *, mblk_t *);
static int eib_data_is_mcast_pkt_ok(eib_vnic_t *, uint8_t *, uint64_t *,
    uint64_t *);
static void eib_data_rx_comp_intr(ibt_cq_hdl_t, void *);
static void eib_data_tx_comp_intr(ibt_cq_hdl_t, void *);
static mblk_t *eib_data_rx_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_data_tx_comp(eib_vnic_t *, eib_wqe_t *, eib_chan_t *);
static void eib_data_err_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_rb_data_setup_cqs(eib_t *, eib_vnic_t *);
static void eib_rb_data_setup_ud_channel(eib_t *, eib_vnic_t *);


/*
 * eib_data_create_qp - create the data qp for a vnic: channel state,
 * separate tx and rx CQs with handlers, and the UD channel.  On
 * failure, *err is set to ENOMEM and all partial setup is undone.
 */
int
eib_data_create_qp(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	eib_chan_t *chan = NULL;

	/*
	 * Allocate an eib_chan_t to store stuff about this vnic's data qp
	 * and initialize it with default admin qp pkey parameters. We'll
	 * re-associate this with the pkey we receive from the gw once we
	 * receive the login ack.
	 */
	vnic->vn_data_chan = eib_chan_init();

	chan = vnic->vn_data_chan;
	chan->ch_pkey = ss->ei_admin_chan->ch_pkey;
	chan->ch_pkey_ix = ss->ei_admin_chan->ch_pkey_ix;
	chan->ch_vnic_inst = vnic->vn_instance;

	/*
	 * Setup tx/rx CQs and completion handlers
	 */
	if (eib_data_setup_cqs(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
		    "eib_data_setup_cqs(vn_inst=0x%x) failed",
		    vnic->vn_instance);
		*err = ENOMEM;
		goto data_create_qp_fail;
	}

	/*
	 * Setup UD channel
	 */
	if (eib_data_setup_ud_channel(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
		    "eib_data_setup_ud_channel(vn_inst=0x%x) failed",
		    vnic->vn_instance);
		*err = ENOMEM;
		goto data_create_qp_fail;
	}

	return (EIB_E_SUCCESS);

data_create_qp_fail:
	eib_rb_data_create_qp(ss, vnic);
	return (EIB_E_FAILURE);
}

/*
 * eib_data_rx_comp_handler - softint handler for the data rx CQ.
 * Polls completions in bounded batches, builds an mblk chain from the
 * good ones, hands it to the MAC layer via mac_rx(), and batches the
 * statistics updates once per poll cycle.  Retriggers itself if too
 * many packets were handled in one invocation.
 */
/*ARGSUSED*/
uint_t
eib_data_rx_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_stats_t *stats = ss->ei_stats;
	ibt_wc_t *wc;
	eib_wqe_t *wqe;
	mblk_t *mp;
	mblk_t *head = NULL;
	mblk_t *tail = NULL;
	ibt_status_t ret;
	uint_t pkts_per_call = 0;
	uint_t polled;
	uint_t rbytes;
	uint_t ipkts;
	uint_t num_wc;
	int i;

	/*
	 * Re-arm the rx notification callback before we start polling
	 * the completion queue.  There's nothing much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * We don't want to be stuck in receive processing for too long
	 * without giving others a chance.
	 */
	num_wc = (chan->ch_rcv_cq_sz < EIB_MAX_RX_PKTS_ONINTR) ?
	    chan->ch_rcv_cq_sz : EIB_MAX_RX_PKTS_ONINTR;

	/*
	 * Handle rx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_rcv_cq_hdl, chan->ch_rcv_wc,
	    num_wc, &polled)) == IBT_SUCCESS) {

		rbytes = ipkts = 0;
		head = tail = NULL;

		for (wc = chan->ch_rcv_wc, i = 0; i < polled; i++, wc++) {
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;

			ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX);

			/*
			 * Clear the posted-to-hca flag and reduce the number
			 * of posted-rwqes count
			 */
			wqe->qe_info &= (~EIB_WQE_FLG_POSTED_TO_HCA);
			eib_rsrc_decr_posted_rwqe(ss, chan);

			rbytes += wc->wc_bytes_xfer;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				EIB_INCR_COUNTER(&stats->st_ierrors);
				eib_data_err_comp(vnic, wqe, wc);
			} else {
				ipkts++;
				mp = eib_data_rx_comp(vnic, wqe, wc);
				if (mp == NULL) {
					continue;
				} else {
					/*
					 * Add this mp to the list to
					 * send it to the nw layer. Note
					 * that the wqe could've been
					 * returned to the pool if we're
					 * running low, so don't process
					 * wqe after this point.
					 */
					if (head)
						tail->b_next = mp;
					else
						head = mp;
					tail = mp;
				}
			}
		}

		/*
		 * We reduce the number of atomic updates to key statistics
		 * by pooling them here, once per ibt_poll_cq(). The accuracy
		 * and consistency of the published statistics within a cq
		 * polling cycle will be compromised a little bit, but that
		 * should be ok, given that we probably gain a little bit by
		 * not having to do these atomic operations per packet.
		 */
		EIB_UPDATE_COUNTER(&stats->st_rbytes, rbytes);
		EIB_UPDATE_COUNTER(&stats->st_ipkts, ipkts);

		pkts_per_call += ipkts;

		if (head) {
			mac_rx(ss->ei_mac_hdl, NULL, head);
		}

		/*
		 * If we have processed too many packets in one attempt, we'll
		 * have to come back here later.
		 */
		if (pkts_per_call >= EIB_MAX_RX_PKTS_ONINTR) {
			(void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl,
			    NULL);
			break;
		}

		/*
		 * NOTE(review): the remaining poll budget shrinks by the
		 * amount polled each cycle; since only good packets count
		 * towards pkts_per_call, num_wc could in principle reach 0
		 * before the break above trips -- confirm ibt_poll_cq()'s
		 * behavior for a zero-sized wc array.
		 */
		num_wc -= polled;
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * eib_data_tx_comp_handler - softint handler for the data tx CQ.
 * Drains the CQ, returning each swqe to the pool (or to the error
 * path).  Always returns DDI_INTR_CLAIMED.
 */
/*ARGSUSED*/
uint_t
eib_data_tx_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_stats_t *stats = ss->ei_stats;
	ibt_wc_t *wc;
	eib_wqe_t *wqe;
	ibt_status_t ret;
	uint_t polled;
	int i;

	/*
	 * Re-arm the tx notification callback before we start polling
	 * the completion queue.  There's nothing much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_tx_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * Handle tx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_cq_hdl, chan->ch_wc, chan->ch_cq_sz,
	    &polled)) == IBT_SUCCESS) {
		for (wc = chan->ch_wc, i = 0; i < polled; i++, wc++) {
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;

			ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_TX);

			if (wc->wc_status != IBT_WC_SUCCESS) {
				EIB_INCR_COUNTER(&stats->st_oerrors);
				eib_data_err_comp(vnic, wqe, wc);
			} else {
				eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan);
			}
		}
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * eib_data_rx_recycle - esballoc-style free callback for rx buffers
 * loaned to the network layer.  Reposts the rwqe when it is safe to do
 * so, otherwise returns it to the wqe pool.
 */
void
eib_data_rx_recycle(caddr_t arg)
{
	eib_wqe_t *rwqe = (eib_wqe_t *)(void *)arg;
	eib_t *ss = rwqe->qe_pool->wp_ss;
	eib_chan_t *vn_chan;
	uint_t nic_state;
	int ret;

	/*
	 * We come here from three places - (a) from the nw layer if the
	 * rx mblk we handed to it has been done with and the nw layer is
	 * calling the freemsg() (b) from eib_data_rx_comp() if the rx
	 * completion processing discovers that the received EoIB packet
	 * has a problem and (c) from eib_data_err_comp() if we're tearing
	 * down this channel.  We only need to repost the rwqe if we're
	 * being called back from the nw layer.  For the other two cases,
	 * we'll simply return the rwqe to the pool.  Also, since we would've
	 * already updated the ch_rx_posted counters in the rx completion
	 * handler, we don't pass the chan pointer to eib_rsrc_return_rwqe
	 * from within this routine.
	 */
	rwqe->qe_mp = NULL;
	if ((rwqe->qe_info & EIB_WQE_FLG_WITH_NW) == 0) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	rwqe->qe_info &= (~EIB_WQE_FLG_WITH_NW);

	/*
	 * If the buffers are being returned by nw layer after a long
	 * time, this eoib instance could've even been stopped by now.
	 * If so, simply return the rwqe to the pool.
	 */
	nic_state = eib_mac_get_nic_state(ss);
	if ((nic_state & EIB_NIC_STARTED) != EIB_NIC_STARTED) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	/*
	 * Or it could've taken even longer, and the nic has even been
	 * restarted.  Only thing we can do is to make sure that the
	 * original channel pointer we passed corresponds to what's in
	 * the instance of the vnic currently.
	 */
	vn_chan = eib_vnic_get_data_chan(ss, rwqe->qe_vnic_inst);
	if (vn_chan == NULL || vn_chan != rwqe->qe_chan) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	/*
	 * Try to repost the rwqe if we're not tearing down this channel
	 */
	if (vn_chan->ch_tear_down) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
	} else {
		ret = eib_chan_post_recv(ss, vn_chan, rwqe);
		if (ret != EIB_E_SUCCESS) {
			/*
			 * NOTE(review): on repost failure, a non-NULL qe_mp
			 * is freed (presumably re-entering this callback for
			 * the final return to pool) -- confirm against
			 * eib_chan_post_recv()'s mp-allocation behavior.
			 */
			if (rwqe->qe_mp)
				freemsg(rwqe->qe_mp);
			else
				eib_rsrc_return_rwqe(ss, rwqe, NULL);
		}
	}
}

/*
 * eib_data_post_tx - queue a prepared swqe on the channel's tx list and
 * post the list to the HCA in batches of EIB_MAX_POST_MULTIPLE.  Only
 * one thread drains the list at a time (ch_tx_busy); others just append
 * and leave.  Failed posts are retried singly, then dropped with their
 * resources reclaimed and st_oerrors updated.
 */
void
eib_data_post_tx(eib_vnic_t *vnic, eib_wqe_t *swqe)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;
	eib_stats_t *stats = vnic->vn_ss->ei_stats;
	ibt_send_wr_t wrs[EIB_MAX_POST_MULTIPLE];
	eib_wqe_t *wqes[EIB_MAX_POST_MULTIPLE];
	eib_wqe_t *elem;
	ibt_status_t ret;
	uint_t n_wrs;
	uint_t n_posted;
	uint_t total_failed = 0;
	uint_t n_failed = 0;
	uint_t i;

	/*
	 * See if we have room for this wqe and then add it to the
	 * list of tx wrs to post in this channel.
	 */
	mutex_enter(&chan->ch_tx_lock);

	if ((chan->ch_tx_posted + 1) >= (chan->ch_max_swqes - 1)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
		    "too many swqes posted already, posted=0x%lx, "
		    "max=0x%lx", chan->ch_tx_posted, chan->ch_max_swqes);
		mutex_exit(&chan->ch_tx_lock);
		return;
	}

	swqe->qe_nxt_post = NULL;
	if (chan->ch_tx) {
		chan->ch_tx_tail->qe_nxt_post = swqe;
	} else {
		chan->ch_tx = swqe;
	}
	chan->ch_tx_tail = swqe;
	chan->ch_tx_posted++;		/* pre-increment */

	/*
	 * If someone's already posting tx wqes in this channel, let
	 * them post ours as well.
	 */
	if (chan->ch_tx_busy == B_TRUE) {
		mutex_exit(&chan->ch_tx_lock);
		return;
	}
	chan->ch_tx_busy = B_TRUE;

	while (chan->ch_tx) {
		/*
		 * Post EIB_MAX_POST_MULTIPLE wrs at a time
		 */
		for (n_wrs = 0, elem = chan->ch_tx;
		    (elem) && (n_wrs < EIB_MAX_POST_MULTIPLE);
		    elem = elem->qe_nxt_post, n_wrs++) {
			wqes[n_wrs] = elem;
			wrs[n_wrs] = (elem->qe_wr).send;
		}
		chan->ch_tx = elem;
		if (elem == NULL) {
			chan->ch_tx_tail = NULL;
		}
		mutex_exit(&chan->ch_tx_lock);

		ASSERT(n_wrs != 0);

		/*
		 * If multiple wrs posting fails for some reason, we'll try
		 * posting the unposted ones one by one.  If even that fails,
		 * we'll release any mappings/buffers/mblks associated with
		 * this wqe and return it to the pool.
		 */
		n_posted = n_failed = 0;
		ret = ibt_post_send(chan->ch_chan, wrs, n_wrs, &n_posted);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
			    "ibt_post_send(n_wrs=0x%lx, n_posted=0x%lx) "
			    "failed, ret=%d", n_wrs, n_posted, ret);

			for (i = n_posted; i < n_wrs; i++) {
				ret = ibt_post_send(chan->ch_chan, &wrs[i],
				    1, NULL);
				if (ret != IBT_SUCCESS) {
					n_failed++;
					eib_data_tx_comp(vnic, wqes[i], chan);

					EIB_DPRINTF_WARN(ss->ei_instance,
					    "eib_data_post_tx: "
					    "ibt_post_send(n_wrs=1) failed, "
					    "ret=%d", ret);
				}
			}
		}
		total_failed += n_failed;

		mutex_enter(&chan->ch_tx_lock);
	}

	chan->ch_tx_busy = B_FALSE;
	mutex_exit(&chan->ch_tx_lock);

	/*
	 * If we failed to post something, update error stats
	 */
	if (total_failed) {
		EIB_UPDATE_COUNTER(&stats->st_oerrors, total_failed);
	}
}

/*
 * eib_data_parse_ether_hdr - decompose the (possibly VLAN-tagged)
 * ethernet header at the head of mp into the eib_ether_hdr_t.
 */
void
eib_data_parse_ether_hdr(mblk_t *mp, eib_ether_hdr_t *evh)
{
	struct ether_vlan_header *vl_hdr;
	struct ether_header *hdr;

	/*
	 * Assume that the ether header (with or without vlan tag) is
	 * contained in one fragment
	 */
	hdr = (struct ether_header *)(void *)mp->b_rptr;
	vl_hdr = (struct ether_vlan_header *)(void *)mp->b_rptr;

	evh->eh_ether_type = ntohs(hdr->ether_type);
	if (evh->eh_ether_type != ETHERTYPE_VLAN) {
		evh->eh_tagless = 1;
		evh->eh_vlan = 0;
		ether_copy((void *)hdr->ether_dhost.ether_addr_octet,
		    (void *)evh->eh_dmac);
		ether_copy((void *)hdr->ether_shost.ether_addr_octet,
		    (void *)evh->eh_smac);
	} else {
		/* Tagged frame: real ethertype follows the VLAN tci */
		evh->eh_ether_type = ntohs(vl_hdr->ether_type);
		evh->eh_tagless = 0;
		evh->eh_vlan = VLAN_ID(ntohs(vl_hdr->ether_tci));
		ether_copy((void *)vl_hdr->ether_dhost.ether_addr_octet,
		    (void *)evh->eh_dmac);
		ether_copy((void *)vl_hdr->ether_shost.ether_addr_octet,
		    (void *)evh->eh_smac);
	}
}

/*
 * eib_data_lookup_vnic - find the active vnic matching {mac, vlan}.
 * Optionally reports (via *failed) whether a creation request for this
 * very {mac, vlan} has previously failed.  Returns EIB_E_SUCCESS with
 * *vnicp set on a hit, EIB_E_FAILURE otherwise.
 */
int
eib_data_lookup_vnic(eib_t *ss, uint8_t *mac, uint16_t vlan, eib_vnic_t **vnicp,
    boolean_t *failed)
{
	eib_vnic_t *vnic;
	eib_vnic_req_t *vrq;
	uint8_t *vn_mac;
	uint16_t vn_vlan;
	uint64_t av;
	int inst = 0;

	if (mac == NULL)
		return (EIB_E_FAILURE);

	/*
	 * For now, a simple search (but only what we've allocated). Note that
	 * if we're in the process of creating a vnic, the instance might've
	 * been allocated, but the vnic entry would be NULL.
	 */
	mutex_enter(&ss->ei_vnic_lock);
	av = ss->ei_active_vnics;
	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
		if ((vnic = ss->ei_vnic[inst]) != NULL) {
			vn_mac = vnic->vn_login_data.ld_assigned_mac;
			vn_vlan = vnic->vn_login_data.ld_assigned_vlan;

			if ((vn_vlan == vlan) &&
			    (bcmp(vn_mac, mac, ETHERADDRL) == 0)) {
				if (vnicp) {
					*vnicp = vnic;
				}
				mutex_exit(&ss->ei_vnic_lock);
				return (EIB_E_SUCCESS);
			}
		}

		/* Clear this instance's bit and keep scanning */
		av &= (~((uint64_t)1 << inst));
	}
	mutex_exit(&ss->ei_vnic_lock);

	/*
	 * If we haven't been able to locate a vnic for this {mac,vlan} tuple,
	 * see if we've already failed a creation request for this vnic, and
	 * return that information.
	 */
	if (failed) {
		mutex_enter(&ss->ei_vnic_req_lock);
		*failed = B_FALSE;
		for (vrq = ss->ei_failed_vnic_req; vrq; vrq = vrq->vr_next) {
			if ((vrq->vr_vlan == vlan) &&
			    (bcmp(vrq->vr_mac, mac, ETHERADDRL) == 0)) {
				*failed = B_TRUE;
			}
		}
		mutex_exit(&ss->ei_vnic_req_lock);
	}

	return (EIB_E_FAILURE);
}

/*
 * eib_data_prepare_frame - finish preparing a tx swqe for the frame in
 * mp: set up LSO if requested, propagate the checksum-offload flag into
 * the work request, and build the SGL.  Returns EIB_E_FAILURE if the
 * SGL cannot be prepared.
 */
int
eib_data_prepare_frame(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp,
    eib_ether_hdr_t *evh)
{
	uint32_t mss;
	uint32_t lsoflags;
	uint32_t hckflags;

	/*
	 * The swqe defaults are set to use the regular ud work request
	 * member and the IBT_WRC_SEND opcode, so we don't need to do
	 * anything here if this isn't an LSO packet.
	 */
	mac_lso_get(mp, &mss, &lsoflags);
	if ((lsoflags & HW_LSO) == HW_LSO)
		eib_data_setup_lso(swqe, mp, mss, evh);

	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) {
		swqe->qe_wr.send.wr_flags |= IBT_WR_SEND_CKSUM;
	} else {
		swqe->qe_wr.send.wr_flags &= (~IBT_WR_SEND_CKSUM);
	}

	if (eib_data_prepare_sgl(vnic, swqe, mp) != 0)
		return (EIB_E_FAILURE);

	/* The mblk is freed at tx completion, so hold on to it here */
	swqe->qe_mp = mp;

	return (EIB_E_SUCCESS);
}

/*
 * eib_rb_data_create_qp - rollback for eib_data_create_qp(); tears down
 * the UD channel and CQs, then frees the channel state.
 */
void
eib_rb_data_create_qp(eib_t *ss, eib_vnic_t *vnic)
{
	eib_rb_data_setup_ud_channel(ss, vnic);

	eib_rb_data_setup_cqs(ss, vnic);

	eib_chan_fini(vnic->vn_data_chan);
	vnic->vn_data_chan = NULL;
}

/*
 * eib_data_setup_cqs - allocate the separate tx and rx CQs (with
 * interrupt-moderation hints via ibt_modify_cq, non-fatal on failure),
 * their wc arrays, softints and handlers, and arm both CQs.  Rolls
 * itself back on failure.
 */
static int
eib_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_cq_attr_t cq_attr;
	ibt_status_t ret;
	uint_t snd_sz;
	uint_t rcv_sz;
	int rv;

	/*
	 * Allocate send completion queue.  Note that we've already verified
	 * that cp_max_swqe and cp_max_rwqe meet the max cq size requirements
	 * of the hca.
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	cq_attr.cq_size = ss->ei_caps->cp_max_swqe + 1;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_cq_hdl, &snd_sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_alloc_cq(snd_cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto setup_data_cqs_fail;
	}
	/* Interrupt-moderation hint only; a failure here is non-fatal */
	ret = ibt_modify_cq(chan->ch_cq_hdl, EIB_TX_COMP_COUNT,
	    EIB_TX_COMP_USEC, 0);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_modify_cq(snd_comp_count=0x%lx, snd_comp_usec=0x%lx) "
		    "failed, ret=%d",
		    EIB_TX_COMP_COUNT, EIB_TX_COMP_USEC, ret);
	}

	/*
	 * Allocate receive completion queue
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	cq_attr.cq_size = ss->ei_caps->cp_max_rwqe + 1;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_rcv_cq_hdl,
	    &rcv_sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_alloc_cq(rcv_cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto setup_data_cqs_fail;
	}
	ret = ibt_modify_cq(chan->ch_rcv_cq_hdl, EIB_RX_COMP_COUNT,
	    EIB_RX_COMP_USEC, 0);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_modify_cq(rcv_comp_count=0x%lx, rcv_comp_usec=0x%lx) "
		    "failed, ret=%d",
		    EIB_RX_COMP_COUNT, EIB_RX_COMP_USEC, ret);
	}

	/*
	 * Set up parameters for collecting tx and rx completion information
	 */
	chan->ch_cq_sz = snd_sz;
	chan->ch_wc = kmem_zalloc(sizeof (ibt_wc_t) * snd_sz, KM_SLEEP);
	chan->ch_rcv_cq_sz = rcv_sz;
	chan->ch_rcv_wc = kmem_zalloc(sizeof (ibt_wc_t) * rcv_sz, KM_SLEEP);

	/*
	 * Set up the vnic's data tx completion queue handler and allocate
	 * a softint for it as well.
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_tx_si_hdl,
	    EIB_SOFTPRI_DATA, eib_data_tx_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ddi_intr_add_softint() failed for data tx qp, ret=%d", rv);
		goto setup_data_cqs_fail;
	}
	ibt_set_cq_handler(chan->ch_cq_hdl, eib_data_tx_comp_intr, vnic);
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_enable_cq_notify() failed for tx cq, ret=%d", ret);
		goto setup_data_cqs_fail;
	}

	/*
	 * And then the data rx completion queue handler
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_rx_si_hdl,
	    EIB_SOFTPRI_DATA, eib_data_rx_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ddi_intr_add_softint() failed for data rx qp, ret=%d", rv);
		goto setup_data_cqs_fail;
	}
	ibt_set_cq_handler(chan->ch_rcv_cq_hdl, eib_data_rx_comp_intr, vnic);
	ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_enable_cq_notify() failed for rx cq, ret=%d", ret);
		goto setup_data_cqs_fail;
	}

	return (EIB_E_SUCCESS);

setup_data_cqs_fail:
	eib_rb_data_setup_cqs(ss, vnic);
	return (EIB_E_FAILURE);
}

/*
 * eib_data_setup_ud_channel - allocate the data UD channel, enabling
 * reserved-lkey and LSO usage when the HCA capabilities allow.
 *
 * NOTE(review): this definition continues past the end of this excerpt;
 * only its head is visible (and reproduced) here.
 */
static int
eib_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_ud_chan_alloc_args_t alloc_attr;
	ibt_ud_chan_query_attr_t query_attr;
	ibt_status_t ret;

	bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t));
	bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t));

	alloc_attr.ud_flags = IBT_ALL_SIGNALED;
	if (ss->ei_caps->cp_resv_lkey_capab)
		alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
	if (ss->ei_caps->cp_lso_maxlen)
		alloc_attr.ud_flags |= IBT_USES_LSO;

	alloc_attr.ud_hca_port_num = ss->ei_props->ep_port_num;
+ alloc_attr.ud_pkey_ix = chan->ch_pkey_ix; + alloc_attr.ud_sizes.cs_sq = ss->ei_caps->cp_max_swqe; + alloc_attr.ud_sizes.cs_rq = ss->ei_caps->cp_max_rwqe; + alloc_attr.ud_sizes.cs_sq_sgl = ss->ei_caps->cp_max_sgl; + alloc_attr.ud_sizes.cs_rq_sgl = 1; + alloc_attr.ud_sizes.cs_inline = 0; + + alloc_attr.ud_qkey = EIB_DATA_QKEY; + alloc_attr.ud_scq = chan->ch_cq_hdl; + alloc_attr.ud_rcq = chan->ch_rcv_cq_hdl; + alloc_attr.ud_pd = ss->ei_pd_hdl; + + ret = ibt_alloc_ud_channel(ss->ei_hca_hdl, IBT_ACHAN_NO_FLAGS, + &alloc_attr, &chan->ch_chan, NULL); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: " + "ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x, " + "cs_sq=0x%lx, cs_rq=0x%lx, sq_sgl=0x%lx) failed, ret=%d", + alloc_attr.ud_hca_port_num, chan->ch_pkey_ix, + alloc_attr.ud_sizes.cs_sq, alloc_attr.ud_sizes.cs_rq, + alloc_attr.ud_sizes.cs_sq_sgl, ret); + + goto setup_data_ud_channel_fail; + } + + ret = ibt_query_ud_channel(chan->ch_chan, &query_attr); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: " + "ibt_query_ud_channel() failed, ret=%d", ret); + goto setup_data_ud_channel_fail; + } + + chan->ch_qpn = query_attr.ud_qpn; + chan->ch_max_swqes = query_attr.ud_chan_sizes.cs_sq; + chan->ch_max_rwqes = query_attr.ud_chan_sizes.cs_rq; + chan->ch_lwm_rwqes = chan->ch_max_rwqes >> 2; + chan->ch_rwqe_bktsz = (chan->ch_max_rwqes < EIB_DATA_RWQE_BKT) ? 
+ chan->ch_max_rwqes : EIB_DATA_RWQE_BKT; + chan->ch_ip_hdr_align = EIB_IP_HDR_ALIGN; + chan->ch_alloc_mp = B_TRUE; + chan->ch_tear_down = B_FALSE; + + return (EIB_E_SUCCESS); + +setup_data_ud_channel_fail: + eib_rb_data_setup_ud_channel(ss, vnic); + return (EIB_E_FAILURE); +} + +static void +eib_data_setup_lso(eib_wqe_t *swqe, mblk_t *mp, uint32_t mss, + eib_ether_hdr_t *evh) +{ + ibt_wr_lso_t *lso; + mblk_t *nmp; + uint8_t *dst; + uintptr_t ip_start; + uintptr_t tcp_start; + uint_t pending; + uint_t mblen; + uint_t eth_hdr_len; + uint_t ip_hdr_len; + uint_t tcp_hdr_len; + + /* + * When the swqe was grabbed, it would've had its wr_opcode and + * wr.ud.udwr_dest set to default values. Since we're now going + * to use LSO, we need to change these. + */ + swqe->qe_wr.send.wr_opcode = IBT_WRC_SEND_LSO; + lso = &(swqe->qe_wr.send.wr.ud_lso); + lso->lso_ud_dest = swqe->qe_dest; + lso->lso_mss = mss; + + /* + * Details on the ethernet header in the mp is already known to us + */ + eth_hdr_len = (evh->eh_tagless) ? (sizeof (struct ether_header)) : + (sizeof (struct ether_vlan_header)); + + /* + * Calculate the LSO header size and set it in the UD LSO structure. + * Note that the only assumption we make is that each of the Ethernet, + * IP and TCP headers will be contained in a single mblk fragment; + * together, the headers may span multiple mblk fragments. Note also + * that since the EoIB encapsulation header is not part of the message + * block we receive, we'll need to account space for inserting it later. 
+ */ + nmp = mp; + ip_start = (uintptr_t)(nmp->b_rptr) + eth_hdr_len; + if (ip_start >= (uintptr_t)(nmp->b_wptr)) { + ip_start = (uintptr_t)nmp->b_cont->b_rptr + + (ip_start - (uintptr_t)(nmp->b_wptr)); + nmp = nmp->b_cont; + } + ip_hdr_len = IPH_HDR_LENGTH((ipha_t *)ip_start); + + tcp_start = ip_start + ip_hdr_len; + if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { + tcp_start = (uintptr_t)nmp->b_cont->b_rptr + + (tcp_start - (uintptr_t)(nmp->b_wptr)); + nmp = nmp->b_cont; + } + tcp_hdr_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); + + /* + * Since the passed mp fragment never contains the EoIB encapsulation + * header, we always have to copy the lso header. Sigh. + */ + lso->lso_hdr = swqe->qe_payload_hdr; + lso->lso_hdr_sz = EIB_ENCAP_HDR_SZ + eth_hdr_len + + ip_hdr_len + tcp_hdr_len; + + /* + * We already have the EoIB encapsulation header written at the + * start of wqe->qe_payload_hdr during swqe acquisition. Only + * copy the remaining headers. + */ + dst = lso->lso_hdr + EIB_ENCAP_HDR_SZ; + pending = lso->lso_hdr_sz - EIB_ENCAP_HDR_SZ; + + for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { + mblen = MBLKL(nmp); + if (pending > mblen) { + bcopy(nmp->b_rptr, dst, mblen); + dst += mblen; + pending -= mblen; + } else { + bcopy(nmp->b_rptr, dst, pending); + break; + } + } +} + +static int +eib_data_prepare_sgl(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp) +{ + eib_t *ss = vnic->vn_ss; + eib_stats_t *stats = vnic->vn_ss->ei_stats; + ibt_iov_t iov_arr[EIB_MAX_SGL]; + ibt_iov_attr_t iov_attr; + ibt_wr_ds_t *sgl; + ibt_status_t ret; + mblk_t *nmp; + mblk_t *data_mp; + uchar_t *bufp; + size_t blksize; + size_t skip; + size_t avail; + uint_t lsohdr_sz; + uint_t pktsz; + ptrdiff_t frag_len; + uint_t pending_hdr; + uint_t nblks; + uint_t i; + + /* + * Let's skip ahead to the TCP data if this is LSO. Note that while + * the lso header size in the swqe includes the EoIB encapsulation + * header size, that encapsulation header itself won't be found in + * the mblk. 
+ */ + lsohdr_sz = (swqe->qe_wr.send.wr_opcode == IBT_WRC_SEND) ? 0 : + swqe->qe_wr.send.wr.ud_lso.lso_hdr_sz; + + data_mp = mp; + pending_hdr = 0; + if (lsohdr_sz) { + pending_hdr = lsohdr_sz - EIB_ENCAP_HDR_SZ; + for (nmp = mp; nmp; nmp = nmp->b_cont) { + frag_len = + (uintptr_t)nmp->b_wptr - (uintptr_t)nmp->b_rptr; + if (frag_len > pending_hdr) + break; + pending_hdr -= frag_len; + } + data_mp = nmp; /* start of data past lso header */ + ASSERT(data_mp != NULL); + } + + /* + * If this is an LSO packet, we want pktsz to hold the size of the + * data following the eoib/ethernet/tcp/ip headers. If this is a + * non-LSO packet, we want pktsz to refer to the size of the entire + * packet with all the headers, and nblks to hold the number of + * mappings we'll need to iov map this (for reserved lkey request). + */ + if (lsohdr_sz == 0) { + nblks = 1; + pktsz = EIB_ENCAP_HDR_SZ; + } else { + nblks = 0; + pktsz = 0; + } + for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { + pktsz += MBLKL(nmp); + nblks++; + } + pktsz -= pending_hdr; + + EIB_UPDATE_COUNTER(&stats->st_obytes, pktsz); + EIB_INCR_COUNTER(&stats->st_opkts); + + /* + * We only do ibt_map_mem_iov() if the pktsz is above the tx copy + * threshold and if the number of mp fragments is less than the + * maximum acceptable. 
+ */ + if ((ss->ei_caps->cp_resv_lkey_capab) && (pktsz > EIB_TX_COPY_THRESH) && + (nblks < ss->ei_caps->cp_hiwm_sgl)) { + + iov_attr.iov_as = NULL; + iov_attr.iov = iov_arr; + iov_attr.iov_buf = NULL; + iov_attr.iov_list_len = nblks; + iov_attr.iov_wr_nds = ss->ei_caps->cp_max_sgl; + iov_attr.iov_lso_hdr_sz = lsohdr_sz; + iov_attr.iov_flags = IBT_IOV_SLEEP; + + i = 0; + if (lsohdr_sz == 0) { + iov_arr[i].iov_addr = (caddr_t)swqe->qe_payload_hdr; + iov_arr[i].iov_len = EIB_ENCAP_HDR_SZ; + i++; + } + for (nmp = data_mp; i < nblks; i++, nmp = nmp->b_cont) { + iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; + iov_arr[i].iov_len = MBLKL(nmp); + if (nmp == data_mp) { + iov_arr[i].iov_addr += pending_hdr; + iov_arr[i].iov_len -= pending_hdr; + } + } + swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_MAPPED; + swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl; + + ret = ibt_map_mem_iov(ss->ei_hca_hdl, &iov_attr, + &swqe->qe_wr, &swqe->qe_iov_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_data_prepare_sgl: " + "ibt_map_mem_iov(nblks=0x%lx) failed, ret=%d ", + "attempting to use copy path", nblks, ret); + goto prepare_sgl_copy_path; + } + + return (EIB_E_SUCCESS); + } + +prepare_sgl_copy_path: + if (pktsz <= swqe->qe_bufsz) { + swqe->qe_wr.send.wr_nds = 1; + swqe->qe_wr.send.wr_sgl = &swqe->qe_sgl; + swqe->qe_sgl.ds_len = pktsz; + + /* + * Even though this is the copy path for transfers less than + * qe_bufsz, it could still be an LSO packet. If so, we only + * have to write the data following all the headers into the + * work request buffer, since we'll be sending the lso header + * itself separately. If this is not an LSO send (but pkt size + * greater than mtu, say for a jumbo frame), then we need + * to write all the headers including EoIB encapsulation, + * into the work request buffer. 
+ */ + bufp = (uchar_t *)(uintptr_t)swqe->qe_sgl.ds_va; + if (lsohdr_sz == 0) { + *(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR); + bufp += EIB_ENCAP_HDR_SZ; + } + for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { + blksize = MBLKL(nmp) - pending_hdr; + bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); + bufp += blksize; + pending_hdr = 0; + } + + /* + * If the ethernet frame we're going to send is less than + * ETHERMIN, pad up the buffer to ETHERMIN (with zeros) + */ + if ((pktsz + lsohdr_sz) < (ETHERMIN + EIB_ENCAP_HDR_SZ)) { + bzero(bufp, (ETHERMIN + EIB_ENCAP_HDR_SZ) - + (pktsz + lsohdr_sz)); + swqe->qe_sgl.ds_len = ETHERMIN + EIB_ENCAP_HDR_SZ; + } + return (EIB_E_SUCCESS); + } + + /* + * Copy path for transfers greater than swqe->qe_bufsz + */ + swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl; + if (eib_rsrc_grab_lsobufs(ss, pktsz, swqe->qe_wr.send.wr_sgl, + &(swqe->qe_wr.send.wr_nds)) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_prepare_sgl: " + "eib_rsrc_grab_lsobufs() failed"); + return (EIB_E_FAILURE); + } + swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_LSO; + + /* + * Copy the larger-than-qe_buf_sz packet into a set of fixed-sized, + * pre-mapped LSO buffers. Note that we might need to skip part of + * the LSO header in the first fragment as before. + */ + nmp = data_mp; + skip = pending_hdr; + for (i = 0; i < swqe->qe_wr.send.wr_nds; i++) { + sgl = swqe->qe_wr.send.wr_sgl + i; + bufp = (uchar_t *)(uintptr_t)sgl->ds_va; + avail = EIB_LSO_BUFSZ; + + /* + * If this is a non-LSO packet (perhaps a jumbo frame?) + * we may still need to prefix the EoIB header in the + * wr buffer. 
+ */ + if ((i == 0) && (lsohdr_sz == 0)) { + *(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR); + bufp += EIB_ENCAP_HDR_SZ; + avail -= EIB_ENCAP_HDR_SZ; + } + + while (nmp && avail) { + blksize = MBLKL(nmp) - skip; + if (blksize > avail) { + bcopy(nmp->b_rptr + skip, bufp, avail); + skip += avail; + avail = 0; + } else { + bcopy(nmp->b_rptr + skip, bufp, blksize); + skip = 0; + bufp += blksize; + avail -= blksize; + nmp = nmp->b_cont; + } + } + } + + return (EIB_E_SUCCESS); +} + +/*ARGSUSED*/ +static int +eib_data_is_mcast_pkt_ok(eib_vnic_t *vnic, uint8_t *macaddr, uint64_t *brdcst, + uint64_t *multicst) +{ + /* + * If the dmac is a broadcast packet, let it through. Otherwise, either + * we should be in promiscuous mode or the dmac should be in our list of + * joined multicast addresses. Currently we only update the stat + * counters and always let things through. + */ + if (bcmp(macaddr, eib_broadcast_mac, ETHERADDRL) == 0) + EIB_INCR_COUNTER(brdcst); + else + EIB_INCR_COUNTER(multicst); + + return (1); +} + +static void +eib_data_rx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg) +{ + eib_vnic_t *vnic = arg; + eib_chan_t *chan = vnic->vn_data_chan; + eib_t *ss = vnic->vn_ss; + + if (cq_hdl != chan->ch_rcv_cq_hdl) { + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_rx_comp_intr: " + "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), " + "ignoring completion", cq_hdl, chan->ch_cq_hdl); + return; + } + + ASSERT(vnic->vn_data_rx_si_hdl != NULL); + + (void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl, NULL); +} + +static void +eib_data_tx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg) +{ + eib_vnic_t *vnic = arg; + eib_chan_t *chan = vnic->vn_data_chan; + eib_t *ss = vnic->vn_ss; + + if (cq_hdl != chan->ch_cq_hdl) { + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_tx_comp_intr: " + "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), " + "ignoring completion", cq_hdl, chan->ch_cq_hdl); + return; + } + + ASSERT(vnic->vn_data_tx_si_hdl != NULL); + + (void) 
ddi_intr_trigger_softint(vnic->vn_data_tx_si_hdl, NULL); +} + +static mblk_t * +eib_data_rx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc) +{ + eib_t *ss = vnic->vn_ss; + eib_chan_t *chan = vnic->vn_data_chan; + eib_login_data_t *ld = &vnic->vn_login_data; + eib_stats_t *stats = ss->ei_stats; + eib_ether_hdr_t evh; + mblk_t *mp; + boolean_t allocd_mp = B_FALSE; + uint_t ec_hdr; + uint_t ec_sign; + uint_t ec_ver; + uint_t ec_tu_cs; + uint_t ec_ip_cs; + + /* + * Before we process this mblk and send it up to network layer, see + * if we're running low on rwqes in the wqe pool. If so, allocate a + * new mblk, copy the received data into it and send it up (and return + * the current rwqe back to the pool immediately by calling freemsg() + * on the original mblk). + */ + if (!eib_rsrc_rxpool_low(wqe)) { + mp = wqe->qe_mp; + } else { + if ((mp = allocb(wc->wc_bytes_xfer, BPRI_HI)) != NULL) { + bcopy(wqe->qe_mp->b_rptr, mp->b_rptr, + wc->wc_bytes_xfer); + freemsg(wqe->qe_mp); + allocd_mp = B_TRUE; + } else { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "wqe level below watermark, dropping rx pkt"); + EIB_INCR_COUNTER(&stats->st_norcvbuf); + freemsg(wqe->qe_mp); + return (NULL); + } + } + + /* + * Adjust write pointer depending on how much data came in. Note that + * since the nw layer will expect us to hand over the mp with the + * ethernet header starting at mp->b_rptr, update the b_rptr as well. + */ + mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; + + /* + * We have a problem if this really happens! + */ + if (mp->b_next != NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "received packet's b_next not NULL, possible dup from cq"); + mp->b_next = NULL; + } + + /* + * Drop loopback packets ? 
+ */ + if ((wc->wc_slid == ss->ei_props->ep_blid) && + (wc->wc_qpn == chan->ch_qpn)) { + goto data_rx_comp_fail; + } + + mp->b_rptr += EIB_GRH_SZ; + + /* + * Since the recv buffer has been aligned for IP header to start on + * a word boundary, it is safe to say that the EoIB and ethernet + * headers won't start on a word boundary. + */ + bcopy(mp->b_rptr, &ec_hdr, EIB_ENCAP_HDR_SZ); + + /* + * Check EoIB signature and version + */ + ec_hdr = ntohl(ec_hdr); + + ec_sign = (ec_hdr >> EIB_ENCAP_SIGN_SHIFT) & EIB_ENCAP_SIGN_MASK; + if (ec_sign != EIB_EH_SIGNATURE) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "EoIB encapsulation header signature (0x%lx) unknown", + ec_sign); + goto data_rx_comp_fail; + } + + ec_ver = (ec_hdr >> EIB_ENCAP_VER_SHIFT) & EIB_ENCAP_VER_MASK; + if (ec_ver != EIB_EH_VERSION) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "EoIB encapsulation header version (0x%lx) unknown", + ec_ver); + goto data_rx_comp_fail; + } + + /* + * Check TCP/UDP and IP checksum + */ + ec_tu_cs = (ec_hdr >> EIB_ENCAP_TCPCHK_SHIFT) & EIB_ENCAP_TCPCHK_MASK; + ec_ip_cs = (ec_hdr >> EIB_ENCAP_IPCHK_SHIFT) & EIB_ENCAP_IPCHK_MASK; + + if ((ec_tu_cs == EIB_EH_UDPCSUM_OK || ec_tu_cs == EIB_EH_TCPCSUM_OK) && + (ec_ip_cs == EIB_EH_IPCSUM_OK)) { + mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK); + } else if (ec_tu_cs == EIB_EH_CSUM_BAD || ec_ip_cs == EIB_EH_CSUM_BAD) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "EoIB encapsulation header tcp/udp checksum (0x%lx) or" + "ip checksum (0x%lx) is bad", ec_tu_cs, ec_ip_cs); + } + + /* + * Update the message block's b_rptr to the start of ethernet header + * and parse the header information + */ + mp->b_rptr += EIB_ENCAP_HDR_SZ; + eib_data_parse_ether_hdr(mp, &evh); + + /* + * If the incoming packet is vlan-tagged, but the tag doesn't match + * this vnic's vlan, drop it. 
+ */ + if ((evh.eh_tagless == 0) && (evh.eh_vlan != ld->ld_assigned_vlan)) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "received packet's vlan unknown, expected=0x%x, got=0x%x", + ld->ld_assigned_vlan, evh.eh_vlan); + goto data_rx_comp_fail; + } + + /* + * Final checks to see if the unicast destination is indeed correct + * and to see if the multicast address is ok for us. + */ + if (EIB_UNICAST_MAC(evh.eh_dmac)) { + if (bcmp(evh.eh_dmac, ld->ld_assigned_mac, ETHERADDRL) != 0) { + uint8_t *exp; + uint8_t *got; + + exp = ld->ld_assigned_mac; + got = evh.eh_dmac; + + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "received packet's macaddr mismatch, " + "expected=%x:%x:%x:%x:%x:%x, got=%x:%x:%x:%x:%x:%x", + exp[0], exp[1], exp[2], exp[3], exp[4], exp[5], + got[0], got[1], got[2], got[3], got[4], got[5]); + + goto data_rx_comp_fail; + } + } else { + if (!eib_data_is_mcast_pkt_ok(vnic, evh.eh_dmac, + &stats->st_brdcstrcv, &stats->st_multircv)) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "multicast packet not ok"); + goto data_rx_comp_fail; + } + } + + /* + * Strip ethernet FCS if present in the packet. ConnectX-2 doesn't + * support ethernet FCS, so this shouldn't happen anyway. + */ + if ((ec_hdr >> EIB_ENCAP_FCS_B_SHIFT) & 0x1) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "ethernet FCS present (ec_hdr=0%lx), ignoring", + ec_hdr); + + mp->b_wptr -= ETHERFCSL; + } + + /* + * If this is the same mp as was in the original rwqe (i.e. we didn't + * do any allocb()), then mark the rwqe flag so we know that its mblk + * is with the network layer. 
+ */ + if (!allocd_mp) { + wqe->qe_info |= EIB_WQE_FLG_WITH_NW; + } + + return (mp); + +data_rx_comp_fail: + freemsg(mp); + return (NULL); +} + +static void +eib_data_tx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, eib_chan_t *chan) +{ + eib_t *ss = vnic->vn_ss; + ibt_status_t ret; + + if (wqe->qe_mp) { + if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_MAPPED) { + ret = ibt_unmap_mem_iov(ss->ei_hca_hdl, + wqe->qe_iov_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_data_tx_comp: " + "ibt_unmap_mem_iov() failed, ret=%d", ret); + } + wqe->qe_iov_hdl = NULL; + } else if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_LSO) { + eib_rsrc_return_lsobufs(ss, wqe->qe_big_sgl, + wqe->qe_wr.send.wr_nds); + } + freemsg(wqe->qe_mp); + wqe->qe_mp = NULL; + } + + eib_rsrc_return_swqe(ss, wqe, chan); +} + +static void +eib_data_err_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc) +{ + eib_t *ss = vnic->vn_ss; + + /* + * Currently, all we do is report + */ + switch (wc->wc_status) { + case IBT_WC_WR_FLUSHED_ERR: + break; + + case IBT_WC_LOCAL_CHAN_OP_ERR: + EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: " + "IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ", + wqe->qe_info); + break; + + case IBT_WC_LOCAL_PROTECT_ERR: + EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: " + "IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ", + wqe->qe_info); + break; + } + + /* + * When a wc indicates error, we do not attempt to repost the + * rwqe but simply return it to the wqe pool. Also for rwqes, + * attempting to free the mblk in the wqe invokes the + * eib_data_rx_recycle() callback. For tx wqes, error handling + * is the same as successful completion handling. We still + * have to unmap iov/free lsobufs/free mblk and then return the + * swqe to the pool. 
+ */ + if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) { + ASSERT(wqe->qe_mp != NULL); + freemsg(wqe->qe_mp); + } else { + eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan); + } +} + +/*ARGSUSED*/ +static void +eib_rb_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic) +{ + eib_chan_t *chan = vnic->vn_data_chan; + ibt_status_t ret; + + if (chan == NULL) + return; + + /* + * Reset any completion handlers we may have set up + */ + if (chan->ch_rcv_cq_hdl) { + ibt_set_cq_handler(chan->ch_rcv_cq_hdl, NULL, NULL); + } + if (chan->ch_cq_hdl) { + ibt_set_cq_handler(chan->ch_cq_hdl, NULL, NULL); + } + + /* + * Remove any softints that were added + */ + if (vnic->vn_data_rx_si_hdl) { + (void) ddi_intr_remove_softint(vnic->vn_data_rx_si_hdl); + vnic->vn_data_rx_si_hdl = NULL; + } + if (vnic->vn_data_tx_si_hdl) { + (void) ddi_intr_remove_softint(vnic->vn_data_tx_si_hdl); + vnic->vn_data_tx_si_hdl = NULL; + } + + /* + * Release any work completion buffers we may have allocated + */ + if (chan->ch_rcv_wc && chan->ch_rcv_cq_sz) { + kmem_free(chan->ch_rcv_wc, + sizeof (ibt_wc_t) * chan->ch_rcv_cq_sz); + } + chan->ch_rcv_cq_sz = 0; + chan->ch_rcv_wc = NULL; + + if (chan->ch_wc && chan->ch_cq_sz) { + kmem_free(chan->ch_wc, sizeof (ibt_wc_t) * chan->ch_cq_sz); + } + chan->ch_cq_sz = 0; + chan->ch_wc = NULL; + + /* + * Free any completion queues we may have allocated + */ + if (chan->ch_rcv_cq_hdl) { + ret = ibt_free_cq(chan->ch_rcv_cq_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_data_setup_cqs: " + "ibt_free_cq(rcv_cq) failed, ret=%d", ret); + } + chan->ch_rcv_cq_hdl = NULL; + } + if (chan->ch_cq_hdl) { + ret = ibt_free_cq(chan->ch_cq_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_data_setup_cqs: " + "ibt_free_cq(snd_cq) failed, ret=%d", ret); + } + chan->ch_cq_hdl = NULL; + } +} + +/*ARGSUSED*/ +static void +eib_rb_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic) +{ + eib_chan_t *chan = vnic->vn_data_chan; + ibt_status_t 
ret; + + if (chan == NULL) + return; + + if (chan->ch_chan) { + /* + * We're trying to tear down this UD channel. Make sure that + * we don't attempt to refill (repost) at any point from now on. + */ + chan->ch_tear_down = B_TRUE; + if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_data_setup_ud_channel: " + "ibt_flush_channel() failed, ret=%d", ret); + } + + /* + * Wait until all posted tx wqes on this channel are back with + * the wqe pool. + */ + mutex_enter(&chan->ch_tx_lock); + while (chan->ch_tx_posted > 0) + cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock); + mutex_exit(&chan->ch_tx_lock); + + /* + * Wait until all posted rx wqes on this channel are back with + * the wqe pool. + */ + mutex_enter(&chan->ch_rx_lock); + while (chan->ch_rx_posted > 0) + cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock); + mutex_exit(&chan->ch_rx_lock); + + /* + * Now we're ready to free this channel + */ + if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_data_setup_ud_channel: " + "ibt_free_channel() failed, ret=%d", ret); + } + + chan->ch_alloc_mp = B_FALSE; + chan->ch_ip_hdr_align = 0; + chan->ch_rwqe_bktsz = 0; + chan->ch_lwm_rwqes = 0; + chan->ch_max_rwqes = 0; + chan->ch_max_swqes = 0; + chan->ch_qpn = 0; + chan->ch_chan = NULL; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_fip.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1504 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/byteorder.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static int eib_fip_make_login(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_make_update(eib_t *, eib_vnic_t *, eib_wqe_t *, int, int *); +static int eib_fip_make_table(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_make_ka(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_make_logout(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); + +static int eib_fip_send_login(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_send_update(eib_t *, eib_vnic_t *, eib_wqe_t *, + uint_t, int *); +static int eib_fip_send_table(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_send_ka(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_send_logout(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); + +static int eib_fip_parse_vhub_table(uint8_t *, eib_vnic_t *); +static int eib_fip_parse_vhub_update(uint8_t *, eib_vnic_t *); +static void eib_fip_update_eport_state(eib_t *, eib_vhub_table_t *, + eib_vhub_update_t *, boolean_t, uint8_t); +static void eib_fip_queue_tbl_entry(eib_vhub_table_t *, eib_vhub_map_t *, + uint32_t, uint8_t); +static void eib_fip_queue_upd_entry(eib_vhub_update_t *, eib_vhub_map_t *, + uint32_t, uint8_t); +static void eib_fip_queue_gw_entry(eib_vnic_t *, eib_vhub_table_t *, uint32_t, + uint8_t); +static int eib_fip_apply_updates(eib_t *, eib_vhub_table_t *, + eib_vhub_update_t *); +static void eib_fip_dequeue_tbl_entry(eib_vhub_table_t *, uint8_t *, uint32_t, + uint8_t); +static eib_vhub_map_t *eib_fip_get_vhub_map(void); + +/* + * Definitions private to this file + */ +const char eib_vendor_mellanox[] = { + 0x4d, 0x65, 0x6c, 0x6c, 0x61, 0x6e, 0x6f, 0x78 +}; + +/* + * The three requests to the gateway - request a vHUB table, request a + * vHUB update (aka keepalive) 
and vNIC logout - all need the same + * vnic identity descriptor to be sent with different flag settings. + * + * vHUB table: R=1, U=0, TUSN=last, subcode=KEEPALIVE + * keepalive/vHUB update: R=0, U=1, TUSN=last, subcode=KEEPALIVE + * vNIC logout: R=0, U=0, TUSN=0, subcode=LOGOUT + */ +#define EIB_UPD_REQ_TABLE 1 +#define EIB_UPD_REQ_KA 2 +#define EIB_UPD_REQ_LOGOUT 3 + +int +eib_fip_login(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + eib_wqe_t *swqe; + int ret; + int ntries = 0; + + do { + if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_LO)) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_login: " + "no swqe available, not sending " + "vnic login request"); + *err = ENOMEM; + return (EIB_E_FAILURE); + } + + ret = eib_fip_make_login(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + ret = eib_fip_send_login(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + ret = eib_vnic_wait_for_login_ack(ss, vnic, err); + if (ret == EIB_E_SUCCESS) + break; + + } while ((*err == ETIME) && (ntries++ < EIB_MAX_LOGIN_ATTEMPTS)); + + return (ret); +} + +int +eib_fip_vhub_table(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + eib_wqe_t *swqe; + int ret; + int ntries = 0; + + do { + if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_LO)) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_vhub_table: " + "no swqe available, not sending " + "vhub table request"); + *err = ENOMEM; + return (EIB_E_FAILURE); + } + + ret = eib_fip_make_table(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + ret = eib_fip_send_table(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + ret = eib_vnic_wait_for_table(ss, vnic, err); + if (ret == EIB_E_SUCCESS) { + return (EIB_E_SUCCESS); + } + + /* + * If we'd failed in 
constructing a proper vhub table above, + * the vnic login state would be set to EIB_LOGIN_TBL_FAILED. + * We need to clean up any pending entries from the vhub + * table and vhub update structures and reset the vnic state + * to EIB_LOGIN_ACK_RCVD before we can try again. + */ + eib_vnic_fini_tables(ss, vnic, B_FALSE); + mutex_enter(&vnic->vn_lock); + vnic->vn_state = EIB_LOGIN_ACK_RCVD; + mutex_exit(&vnic->vn_lock); + + } while ((*err == ETIME) && (ntries++ < EIB_MAX_VHUB_TBL_ATTEMPTS)); + + return (EIB_E_FAILURE); +} + +int +eib_fip_heartbeat(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + eib_wqe_t *swqe; + int ntries = 0; + int ret; + + /* + * Even if we're running low on the wqe resource, we want to be + * able to grab a wqe to send the keepalive, to avoid getting + * logged out by the gateway, so we use EIB_WPRI_HI. + */ + if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_HI)) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_heartbeat: " + "no swqe available, not sending heartbeat"); + return (EIB_E_FAILURE); + } + + while (ntries++ < EIB_MAX_KA_ATTEMPTS) { + ret = eib_fip_make_ka(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) + continue; + + ret = eib_fip_send_ka(ss, vnic, swqe, err); + if (ret == EIB_E_SUCCESS) + break; + } + + if (ret != EIB_E_SUCCESS) + eib_rsrc_return_swqe(ss, swqe, NULL); + + return (ret); +} + +int +eib_fip_logout(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + eib_wqe_t *swqe; + int ret; + + /* + * This routine is only called after the vnic has successfully + * logged in to the gateway. If that's really the case, there + * is nothing in terms of resources we need to release: the swqe + * that was acquired during login has already been posted, the + * work has been completed and the swqe has also been reaped back + * into the free pool. The only thing we need to rollback is the + * fact that we're logged in to the gateway at all -- and the way + * to do this is to send a logout request. 
+ */ + if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_LO)) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_logout: " + "no swqe available, not sending logout"); + return (EIB_E_FAILURE); + } + + ret = eib_fip_make_logout(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + ret = eib_fip_send_logout(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + return (EIB_E_SUCCESS); +} + +int +eib_fip_parse_login_ack(eib_t *ss, uint8_t *pkt, eib_login_data_t *ld) +{ + fip_login_ack_t *ack; + fip_basic_hdr_t *hdr; + fip_desc_iba_t *iba; + fip_desc_vnic_login_t *login; + fip_desc_partition_t *partition; + ib_guid_t guid; + uint32_t syn_ctl_qpn; + uint16_t sl_portid; + uint16_t flags_vlan; + uint16_t opcode; + uint8_t subcode; + + /* + * Note that 'pkt' is always atleast double-word aligned + * when it is passed to us, so we can cast it without any + * problems. + */ + ack = (fip_login_ack_t *)(void *)pkt; + hdr = &(ack->ak_fip_header); + + /* + * Verify that the opcode is EoIB + */ + if ((opcode = ntohs(hdr->hd_opcode)) != FIP_OPCODE_EOIB) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "unsupported opcode 0x%x in login ack, ignoring", + opcode); + return (EIB_E_FAILURE); + } + + /* + * The admin qp in the EoIB driver should receive only the login + * acknowledgements + */ + subcode = hdr->hd_subcode; + if (subcode != FIP_SUBCODE_G_VNIC_LOGIN_ACK) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "unexpected subcode 0x%x received by adm qp, ignoring", + subcode); + return (EIB_E_FAILURE); + } + + /* + * Verify if the descriptor list length in the received packet is + * valid if the workaround to disable it explicitly is absent. 
+ */ + if (!eib_wa_no_desc_list_len) { + uint_t pkt_data_sz; + + pkt_data_sz = (ntohs(hdr->hd_desc_list_len) + 2) << 2; + if (pkt_data_sz < sizeof (fip_login_ack_t)) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_login_ack: " + "login ack desc list len (0x%lx) too small " + "(min 0x%lx)", + pkt_data_sz, sizeof (fip_login_ack_t)); + return (EIB_E_FAILURE); + } + } + + /* + * Validate all the header and descriptor types and lengths + */ + if (hdr->hd_type != FIP_DESC_TYPE_VENDOR_ID || + hdr->hd_len != FIP_DESC_LEN_VENDOR_ID) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "invalid type/len in basic hdr: expected (0x%x,0x%x), " + "got (0x%x,0x%x)", FIP_DESC_TYPE_VENDOR_ID, + FIP_DESC_LEN_VENDOR_ID, hdr->hd_type, hdr->hd_len); + return (EIB_E_FAILURE); + } + iba = &(ack->ak_iba); + if (iba->ia_type != FIP_DESC_TYPE_IBA || + iba->ia_len != FIP_DESC_LEN_IBA) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "invalid type/len in iba desc: expected (0x%x,0x%x), " + "got (0x%x,0x%x)", FIP_DESC_TYPE_IBA, FIP_DESC_LEN_IBA, + iba->ia_type, iba->ia_len); + return (EIB_E_FAILURE); + } + login = &(ack->ak_vnic_login); + if (login->vl_type != FIP_DESC_TYPE_VNIC_LOGIN || + login->vl_len != FIP_DESC_LEN_VNIC_LOGIN) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "invalid type/len in login desc: expected (0x%x,0x%x), " + "got (0x%x,0x%x)", FIP_DESC_TYPE_VNIC_LOGIN, + FIP_DESC_LEN_VNIC_LOGIN, login->vl_type, login->vl_len); + return (EIB_E_FAILURE); + } + partition = &(ack->ak_vhub_partition); + if (partition->pn_type != FIP_DESC_TYPE_PARTITION || + partition->pn_len != FIP_DESC_LEN_PARTITION) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "invalid type/len in partition desc: expected (0x%x,0x%x), " + "got (0x%x,0x%x)", FIP_DESC_TYPE_PARTITION, + FIP_DESC_LEN_PARTITION, partition->pn_type, + partition->pn_len); + return (EIB_E_FAILURE); + } + + /* + * Note that we'll return the vnic id as-is. 
The msb is not actually + * part of the vnic id in our internal records, so we'll mask it out + * later before we do our searches. + */ + ld->ld_vnic_id = ntohs(login->vl_vnic_id); + + syn_ctl_qpn = ntohl(login->vl_syndrome_ctl_qpn); + + /* + * If the syndrome indicates a nack, we're done. No need to collect + * any more information + */ + ld->ld_syndrome = (uint8_t)((syn_ctl_qpn & FIP_VL_SYN_MASK) >> + FIP_VL_SYN_SHIFT); + if (ld->ld_syndrome) { + return (EIB_E_SUCCESS); + } + + /* + * Let's get the rest of the information out of the login ack + */ + sl_portid = ntohs(iba->ia_sl_portid); + ld->ld_gw_port_id = sl_portid & FIP_IBA_PORTID_MASK; + ld->ld_gw_sl = (sl_portid & FIP_IBA_SL_MASK) >> FIP_IBA_SL_SHIFT; + + ld->ld_gw_data_qpn = ntohl(iba->ia_qpn) & FIP_IBA_QPN_MASK; + ld->ld_gw_lid = ntohs(iba->ia_lid); + + bcopy(iba->ia_guid, &guid, sizeof (ib_guid_t)); + ld->ld_gw_guid = ntohll(guid); + ld->ld_vhub_mtu = ntohs(login->vl_mtu); + bcopy(login->vl_mac, ld->ld_assigned_mac, ETHERADDRL); + bcopy(login->vl_gw_mgid_prefix, ld->ld_gw_mgid_prefix, + FIP_MGID_PREFIX_LEN); + ld->ld_n_rss_mcgid = login->vl_flags_rss & FIP_VL_N_RSS_MCGID_MASK; + ld->ld_n_mac_mcgid = login->vl_n_mac_mcgid & FIP_VL_N_MAC_MCGID_MASK; + ld->ld_gw_ctl_qpn = (syn_ctl_qpn & FIP_VL_CTL_QPN_MASK); + + flags_vlan = ntohs(login->vl_flags_vlan); + ld->ld_assigned_vlan = flags_vlan & FIP_VL_VLAN_MASK; + ld->ld_vlan_in_packets = (flags_vlan & FIP_VL_FLAGS_VP) ? 1 : 0; + bcopy(login->vl_vnic_name, ld->ld_vnic_name, FIP_VNIC_NAME_LEN); + + ld->ld_vhub_pkey = ntohs(partition->pn_pkey); + + return (EIB_E_SUCCESS); +} + +int +eib_fip_parse_ctl_pkt(uint8_t *pkt, eib_vnic_t *vnic) +{ + eib_t *ss = vnic->vn_ss; + fip_vhub_pkt_t *vhb; + fip_basic_hdr_t *hdr; + uint16_t opcode; + uint8_t subcode; + uint_t vnic_state; + int ret = EIB_E_FAILURE; + + /* + * Note that 'pkt' is always atleast double-word aligned when it is + * passed to us, so we can cast it without any problems. 
+ */ + vhb = (fip_vhub_pkt_t *)(void *)pkt; + hdr = &(vhb->hb_fip_header); + + /* + * Verify that the opcode is EoIB + */ + if ((opcode = ntohs(hdr->hd_opcode)) != FIP_OPCODE_EOIB) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_ctl_pkt: " + "unsupported opcode 0x%x in ctl pkt, ignoring", + opcode); + return (EIB_E_FAILURE); + } + + mutex_enter(&vnic->vn_lock); + vnic_state = vnic->vn_state; + mutex_exit(&vnic->vn_lock); + + /* + * The ctl qp in the EoIB driver should receive only vHUB messages + */ + subcode = hdr->hd_subcode; + if (subcode == FIP_SUBCODE_G_VHUB_UPDATE) { + if (vnic_state != EIB_LOGIN_TBL_WAIT && + vnic_state != EIB_LOGIN_TBL_INPROG && + vnic_state != EIB_LOGIN_TBL_DONE && + vnic_state != EIB_LOGIN_DONE) { + + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_ctl_pkt: unexpected vnic state " + "(0x%lx) for subcode (VHUB_UPDATE 0x%x)", + vnic_state, subcode); + return (EIB_E_FAILURE); + } + + ret = eib_fip_parse_vhub_update(pkt, vnic); + + } else if (subcode == FIP_SUBCODE_G_VHUB_TABLE) { + if ((vnic_state != EIB_LOGIN_TBL_WAIT) && + (vnic_state != EIB_LOGIN_TBL_INPROG)) { + + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_ctl_pkt: unexpected vnic state " + "(0x%lx) for subcode (VHUB_TABLE 0x%x)", + vnic_state, subcode); + return (EIB_E_FAILURE); + } + + ret = eib_fip_parse_vhub_table(pkt, vnic); + + } else { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_ctl_pkt: " + "unexpected subcode 0x%x for ctl pkt", subcode); + } + + if (ret == EIB_E_SUCCESS) { + /* + * Update last gateway heartbeat received time and + * gateway eport state. The eport state should only + * be updated if the vnic's vhub table has been fully + * constructed. 
+ */ + mutex_enter(&ss->ei_vnic_lock); + ss->ei_gw_last_heartbeat = ddi_get_lbolt64(); + if (vnic_state == EIB_LOGIN_TBL_DONE || + vnic_state == EIB_LOGIN_DONE) { + ss->ei_gw_eport_state = + vnic->vn_vhub_table->tb_eport_state; + } + mutex_exit(&ss->ei_vnic_lock); + } + + return (ret); +} + +static int +eib_fip_make_login(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + fip_login_t *login; + fip_proto_t *proto; + fip_basic_hdr_t *hdr; + fip_desc_iba_t *iba; + fip_desc_vnic_login_t *vlg; + ib_gid_t port_gid; + ib_guid_t port_guid; + uint16_t sl_portid; + uint16_t flags_vlan; + + uint16_t gw_portid = ss->ei_gw_props->pp_gw_portid; + uint16_t sl = ss->ei_gw_props->pp_gw_sl; + uint8_t *pkt = (uint8_t *)(uintptr_t)(swqe->qe_sgl.ds_va); + uint_t pktsz = swqe->qe_sgl.ds_len; + uint_t login_sz = sizeof (fip_login_t); + + if (pktsz < login_sz) { + *err = EINVAL; + + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_make_login: " + "send buffer size (0x%lx) too small to send" + "login request (min 0x%lx)", + pktsz, login_sz); + return (EIB_E_FAILURE); + } + + /* + * Lint complains that there may be an alignment issue here, + * but we know that the "pkt" is atleast double-word aligned, + * so it's ok. 
+ */ + login = (fip_login_t *)(void *)pkt; + bzero(pkt, login_sz); + + /* + * Fill in the FIP protocol version + */ + proto = &login->lg_proto_version; + proto->pr_version = FIP_PROTO_VERSION; + + /* + * Fill in the basic header + */ + hdr = &login->lg_fip_header; + hdr->hd_opcode = htons(FIP_OPCODE_EOIB); + hdr->hd_subcode = FIP_SUBCODE_H_VNIC_LOGIN; + hdr->hd_desc_list_len = htons((login_sz >> 2) - 2); + hdr->hd_flags = 0; + hdr->hd_type = FIP_DESC_TYPE_VENDOR_ID; + hdr->hd_len = FIP_DESC_LEN_VENDOR_ID; + bcopy(eib_vendor_mellanox, hdr->hd_vendor_id, FIP_VENDOR_LEN); + + /* + * Fill in the Infiniband Address descriptor + */ + iba = &login->lg_iba; + iba->ia_type = FIP_DESC_TYPE_IBA; + iba->ia_len = FIP_DESC_LEN_IBA; + bcopy(eib_vendor_mellanox, iba->ia_vendor_id, FIP_VENDOR_LEN); + iba->ia_qpn = htonl(vnic->vn_data_chan->ch_qpn); + + sl_portid = (gw_portid & FIP_IBA_PORTID_MASK) | + ((sl << FIP_IBA_SL_SHIFT) & FIP_IBA_SL_MASK); + iba->ia_sl_portid = htons(sl_portid); + + iba->ia_lid = htons(ss->ei_props->ep_blid); + + port_gid = ss->ei_props->ep_sgid; + port_guid = htonll(port_gid.gid_guid); + bcopy(&port_guid, iba->ia_guid, FIP_GUID_LEN); + + /* + * Now, fill in the vNIC Login descriptor + */ + + vlg = &login->lg_vnic_login; + vlg->vl_type = FIP_DESC_TYPE_VNIC_LOGIN; + vlg->vl_len = FIP_DESC_LEN_VNIC_LOGIN; + bcopy(eib_vendor_mellanox, vlg->vl_vendor_id, FIP_VENDOR_LEN); + + /* + * Only for the physlink instance 0, we ask the gateway to assign + * the mac address and a VLAN (tagless, actually). For this vnic + * only, we do not set the H bit. All other vnics are created by + * Solaris admin and will have the H bit set. Note also that we + * need to clear the vnic id's most significant bit for those that + * are administered by the gateway, so vnic0's vnic_id's msb should + * be 0 as well. 
+ */ + if (vnic->vn_instance == 0) { + vlg->vl_vnic_id = htons(vnic->vn_id); + flags_vlan = vnic->vn_vlan & FIP_VL_VLAN_MASK; + } else { + vlg->vl_vnic_id = htons(vnic->vn_id | FIP_VL_VNIC_ID_MSBIT); + flags_vlan = (vnic->vn_vlan & FIP_VL_VLAN_MASK) | + FIP_VL_FLAGS_H | FIP_VL_FLAGS_M; + + if (vnic->vn_vlan & FIP_VL_VLAN_MASK) + flags_vlan |= (FIP_VL_FLAGS_V | FIP_VL_FLAGS_VP); + } + + vlg->vl_flags_vlan = htons(flags_vlan); + bcopy(vnic->vn_macaddr, vlg->vl_mac, ETHERADDRL); + + /* + * We aren't ready to enable rss, so we set the RSS bit and + * the n_rss_mcgid field to 0. Set the mac mcgid to 0 as well. + */ + vlg->vl_flags_rss = 0; + vlg->vl_n_mac_mcgid = 0; + + /* + * Set the syndrome to 0 and pass the control qpn + */ + vlg->vl_syndrome_ctl_qpn = + htonl(vnic->vn_ctl_chan->ch_qpn & FIP_VL_CTL_QPN_MASK); + + /* + * Try to set as unique a name as possible for this vnic + */ + (void) snprintf((char *)(vlg->vl_vnic_name), FIP_VNIC_NAME_LEN, + "eoib_%02x_%02x", ss->ei_instance, vnic->vn_instance); + + /* + * Adjust the ds_len in the sgl to indicate the size of this + * request before returning + */ + swqe->qe_sgl.ds_len = login_sz; + + return (EIB_E_SUCCESS); +} + +static int +eib_fip_make_update(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int req, + int *err) +{ + fip_keep_alive_t *ka; + fip_proto_t *proto; + fip_basic_hdr_t *hdr; + fip_desc_vnic_identity_t *vid; + ib_gid_t port_gid; + ib_guid_t port_guid; + uint32_t flags_vhub_id; + + uint8_t *pkt = (uint8_t *)(uintptr_t)(swqe->qe_sgl.ds_va); + uint_t pktsz = swqe->qe_sgl.ds_len; + uint_t ka_sz = sizeof (fip_keep_alive_t); + + if (pktsz < ka_sz) { + *err = EINVAL; + + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_make_update: " + "send buffer size (0x%lx) too small to send" + "keepalive/update request (min 0x%lx)", + pktsz, ka_sz); + return (EIB_E_FAILURE); + } + + /* + * Lint complains that there may be an alignment issue here, + * but we know that the "pkt" is atleast double-word aligned, + * so it's ok. 
+ */ + ka = (fip_keep_alive_t *)(void *)pkt; + bzero(pkt, ka_sz); + + /* + * Fill in the FIP protocol version + */ + proto = &ka->ka_proto_version; + proto->pr_version = FIP_PROTO_VERSION; + + /* + * Fill in the basic header + */ + hdr = &ka->ka_fip_header; + hdr->hd_opcode = htons(FIP_OPCODE_EOIB); + hdr->hd_subcode = (req == EIB_UPD_REQ_LOGOUT) ? + FIP_SUBCODE_H_VNIC_LOGOUT : FIP_SUBCODE_H_KEEP_ALIVE; + hdr->hd_desc_list_len = htons((ka_sz >> 2) - 2); + hdr->hd_flags = 0; + hdr->hd_type = FIP_DESC_TYPE_VENDOR_ID; + hdr->hd_len = FIP_DESC_LEN_VENDOR_ID; + bcopy(eib_vendor_mellanox, hdr->hd_vendor_id, FIP_VENDOR_LEN); + + /* + * Fill in the vNIC Identity descriptor + */ + vid = &ka->ka_vnic_identity; + + vid->vi_type = FIP_DESC_TYPE_VNIC_IDENTITY; + vid->vi_len = FIP_DESC_LEN_VNIC_IDENTITY; + bcopy(eib_vendor_mellanox, vid->vi_vendor_id, FIP_VENDOR_LEN); + + flags_vhub_id = vnic->vn_login_data.ld_vhub_id; + if (vnic->vn_login_data.ld_vlan_in_packets) { + flags_vhub_id |= FIP_VI_FLAG_VP; + } + if (req == EIB_UPD_REQ_TABLE) { + flags_vhub_id |= FIP_VI_FLAG_R; + } else if (req == EIB_UPD_REQ_KA) { + flags_vhub_id |= FIP_VI_FLAG_U; + } + vid->vi_flags_vhub_id = htonl(flags_vhub_id); + + vid->vi_tusn = (req != EIB_UPD_REQ_LOGOUT) ? 
+ htonl(vnic->vn_vhub_table->tb_tusn) : 0; + + vid->vi_vnic_id = htons(vnic->vn_login_data.ld_vnic_id); + bcopy(vnic->vn_login_data.ld_assigned_mac, vid->vi_mac, ETHERADDRL); + + port_gid = ss->ei_props->ep_sgid; + port_guid = htonll(port_gid.gid_guid); + bcopy(&port_guid, vid->vi_port_guid, FIP_GUID_LEN); + bcopy(vnic->vn_login_data.ld_vnic_name, vid->vi_vnic_name, + FIP_VNIC_NAME_LEN); + + /* + * Adjust the ds_len in the sgl to indicate the size of this + * request before returning + */ + swqe->qe_sgl.ds_len = ka_sz; + + return (EIB_E_SUCCESS); +} + +static int +eib_fip_make_table(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_make_update(ss, vnic, swqe, EIB_UPD_REQ_TABLE, err)); +} + +static int +eib_fip_make_ka(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_make_update(ss, vnic, swqe, EIB_UPD_REQ_KA, err)); +} + +static int +eib_fip_make_logout(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_make_update(ss, vnic, swqe, EIB_UPD_REQ_LOGOUT, err)); +} + +static int +eib_fip_send_login(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + eib_avect_t *av; + eib_chan_t *chan = ss->ei_admin_chan; + ibt_status_t ret; + + /* + * Get an address vector for this destination + */ + if ((av = eib_ibt_hold_avect(ss, ss->ei_gw_props->pp_gw_lid, + ss->ei_gw_props->pp_gw_sl)) == NULL) { + *err = ENOMEM; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_login: " + "eib_ibt_hold_avect(gw_lid=0x%x, sl=0x%x) failed", + ss->ei_gw_props->pp_gw_lid, ss->ei_gw_props->pp_gw_sl); + return (EIB_E_FAILURE); + } + + /* + * Modify the UD destination handle to the gateway + */ + ret = ibt_modify_ud_dest(swqe->qe_dest, EIB_FIP_QKEY, + ss->ei_gw_props->pp_gw_ctrl_qpn, &av->av_vect); + + eib_ibt_release_avect(ss, av); + if (ret != IBT_SUCCESS) { + *err = EINVAL; + + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_login: " + "ibt_modify_ud_dest(gw_ctl_qpn=0x%lx, qkey=0x%lx) failed, " + "ret=%d", 
ss->ei_gw_props->pp_gw_ctrl_qpn, + EIB_FIP_QKEY, ret); + return (EIB_E_FAILURE); + } + + /* + * Send the login packet to the destination gateway. Posting + * the login and setting the login state to wait-for-ack should + * ideally be atomic to avoid race. + */ + mutex_enter(&vnic->vn_lock); + ret = ibt_post_send(chan->ch_chan, &(swqe->qe_wr.send), 1, NULL); + if (ret != IBT_SUCCESS) { + mutex_exit(&vnic->vn_lock); + *err = EINVAL; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_login: " + "ibt_post_send() failed for vnic id 0x%x, ret=%d", + vnic->vn_id, ret); + return (EIB_E_FAILURE); + } + vnic->vn_state = EIB_LOGIN_ACK_WAIT; + + mutex_enter(&chan->ch_tx_lock); + chan->ch_tx_posted++; + mutex_exit(&chan->ch_tx_lock); + + mutex_exit(&vnic->vn_lock); + + return (EIB_E_SUCCESS); +} + +static int +eib_fip_send_update(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, + uint_t nxt_state, int *err) +{ + eib_login_data_t *ld = &vnic->vn_login_data; + eib_chan_t *chan = vnic->vn_ctl_chan; + eib_avect_t *av; + ibt_status_t ret; + + /* + * Get an address vector for this destination + */ + if ((av = eib_ibt_hold_avect(ss, ld->ld_gw_lid, + ld->ld_gw_sl)) == NULL) { + *err = ENOMEM; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_update: " + "eib_ibt_hold_avect(gw_lid=0x%x, sl=0x%x) failed", + ld->ld_gw_lid, ld->ld_gw_sl); + return (EIB_E_FAILURE); + } + + /* + * Modify the UD destination handle to the destination appropriately + */ + ret = ibt_modify_ud_dest(swqe->qe_dest, EIB_FIP_QKEY, + ld->ld_gw_ctl_qpn, &av->av_vect); + + eib_ibt_release_avect(ss, av); + if (ret != IBT_SUCCESS) { + *err = EINVAL; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_update: " + "ibt_modify_ud_dest(gw_ctl_qpn=0x%lx, qkey=0x%lx) failed, " + "ret=%d", ld->ld_gw_ctl_qpn, EIB_FIP_QKEY, ret); + return (EIB_E_FAILURE); + } + + /* + * Send the update packet to the destination. Posting the update request + * and setting the login state to wait-for-vhub_table needs to be atomic + * to avoid race. 
+ */ + mutex_enter(&vnic->vn_lock); + ret = ibt_post_send(chan->ch_chan, &(swqe->qe_wr.send), 1, NULL); + if (ret != IBT_SUCCESS) { + mutex_exit(&vnic->vn_lock); + *err = EINVAL; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_update: " + "ibt_post_send() failed for vnic id 0x%x, ret=%d", + vnic->vn_id, ret); + return (EIB_E_FAILURE); + } + vnic->vn_state = nxt_state; + + mutex_enter(&chan->ch_tx_lock); + chan->ch_tx_posted++; + mutex_exit(&chan->ch_tx_lock); + + mutex_exit(&vnic->vn_lock); + + return (EIB_E_SUCCESS); +} + +static int +eib_fip_send_table(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_send_update(ss, vnic, swqe, EIB_LOGIN_TBL_WAIT, err)); +} + +static int +eib_fip_send_ka(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_send_update(ss, vnic, swqe, EIB_LOGIN_DONE, err)); +} + +static int +eib_fip_send_logout(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_send_update(ss, vnic, swqe, EIB_LOGOUT_DONE, err)); +} + +static int +eib_fip_parse_vhub_table(uint8_t *pkt, eib_vnic_t *vnic) +{ + fip_vhub_table_t *tbl; + fip_desc_vhub_table_t *desc_tbl; + fip_vhub_table_entry_t *entry; + fip_basic_hdr_t *hdr; + eib_t *ss = vnic->vn_ss; + eib_login_data_t *ld = &vnic->vn_login_data; + eib_vhub_table_t *etbl = vnic->vn_vhub_table; + eib_vhub_update_t *eupd = vnic->vn_vhub_update; + eib_vhub_map_t *newmap; + + uint32_t *ipkt; + uint32_t init_checksum = 0; + uint32_t tusn; + uint32_t vhub_id; + uint_t entries_in_pkt; + uint_t ndx; + uint_t i; + + /* + * If we're here receiving vhub table messages, we certainly should + * have the vhub table structure allocated and present at this point. 
+ */ + if (etbl == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "vhub table missing for vnic id 0x%x", vnic->vn_id); + return (EIB_E_FAILURE); + } + + /* + * Note that 'pkt' is always atleast double-word aligned when it is + * passed to us, so we can cast it without any problems. + */ + ipkt = (uint32_t *)(void *)pkt; + tbl = (fip_vhub_table_t *)(void *)pkt; + hdr = &(tbl->vt_fip_header); + + /* + * Validate all the header and descriptor types and lengths + */ + if (hdr->hd_type != FIP_DESC_TYPE_VENDOR_ID || + hdr->hd_len != FIP_DESC_LEN_VENDOR_ID) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "invalid type/len in fip basic header, " + "exp (0x%x,0x%x), got (0x%x,0x%x)", + FIP_DESC_TYPE_VENDOR_ID, FIP_DESC_LEN_VENDOR_ID, + hdr->hd_type, hdr->hd_len); + return (EIB_E_FAILURE); + } + desc_tbl = &(tbl->vt_vhub_table); + if (desc_tbl->tb_type != FIP_DESC_TYPE_VHUB_TABLE) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "invalid type in vhub desc, exp 0x%x, got 0x%x", + FIP_DESC_TYPE_VHUB_TABLE, desc_tbl->tb_type); + return (EIB_E_FAILURE); + } + + /* + * Verify that the vhub id is ok for this vnic + */ + vhub_id = ntohl(desc_tbl->tb_flags_vhub_id) & FIP_TB_VHUB_ID_MASK; + if (vhub_id != ld->ld_vhub_id) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "invalid vhub id in vhub table pkt: exp 0x%x, got 0x%x", + ld->ld_vhub_id, vhub_id); + return (EIB_E_FAILURE); + } + + /* + * Count the number of vhub table entries in this packet + */ + entries_in_pkt = (desc_tbl->tb_len - FIP_DESC_VHUB_TABLE_WORDS) / + FIP_VHUB_TABLE_ENTRY_WORDS; + + /* + * While we're here, also compute the 32-bit 2's complement carry- + * discarded checksum of the vHUB table descriptor in this packet + * till the first vhub table entry. 
+ */ + for (i = 0; i < FIP_DESC_VHUB_TABLE_WORDS; i++) + init_checksum += ipkt[i]; + + /* + * Initialize the vhub's Table Update Sequence Number (tusn), + * checksum and record the total number of entries in the table + * if this is the first pkt of the table. + */ + tusn = ntohl(desc_tbl->tb_tusn); + if (desc_tbl->tb_hdr & FIP_TB_HDR_FIRST) { + etbl->tb_entries_in_table = ntohs(desc_tbl->tb_table_size); + etbl->tb_tusn = tusn; + etbl->tb_checksum = 0; + + mutex_enter(&vnic->vn_lock); + vnic->vn_state = EIB_LOGIN_TBL_INPROG; + mutex_exit(&vnic->vn_lock); + } + + /* + * First, middle or last, the current table TUSN we have must match this + * packet's TUSN. + */ + if (etbl->tb_tusn != tusn) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "unexpected TUSN (0x%lx) during vhub table construction, " + "expected 0x%lx", etbl->tb_tusn, tusn); + goto vhub_table_fail; + } + + /* + * See if we've overrun/underrun our original entries count + */ + if ((etbl->tb_entries_seen + entries_in_pkt) > + etbl->tb_entries_in_table) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "vhub table overrun, total_exp=%d, so_far=%d, this_pkt=%d", + etbl->tb_entries_in_table, etbl->tb_entries_seen, + entries_in_pkt); + goto vhub_table_fail; + } else if (((etbl->tb_entries_seen + entries_in_pkt) < + etbl->tb_entries_in_table) && + (desc_tbl->tb_hdr & FIP_TB_HDR_LAST)) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "vhub table underrun, total_exp=%d, so_far=%d, last_pkt=%d", + etbl->tb_entries_in_table, etbl->tb_entries_seen, + entries_in_pkt); + goto vhub_table_fail; + } + + /* + * Process and add the entries we have in this packet + */ + etbl->tb_checksum += init_checksum; + entry = (fip_vhub_table_entry_t *)(void *) + ((uint8_t *)desc_tbl + FIP_DESC_VHUB_TABLE_SZ); + + for (ndx = 0; ndx < entries_in_pkt; ndx++, entry++) { + /* + * Allocate a eib_vhub_map_t, copy the current entry details + * and chain it to the appropriate 
queue. + */ + if ((newmap = eib_fip_get_vhub_map()) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_vhub_table: no memory for vhub " + "table entry, ignoring this vhub table packet"); + goto vhub_table_fail; + } + + ASSERT((entry->te_v_rss_type & FIP_TE_VALID) == FIP_TE_VALID); + newmap->mp_v_rss_type = entry->te_v_rss_type; + bcopy(entry->te_mac, newmap->mp_mac, ETHERADDRL); + newmap->mp_qpn = (ntohl(entry->te_qpn) & FIP_TE_QPN_MASK); + newmap->mp_sl = (entry->te_sl & FIP_TE_SL_MASK); + newmap->mp_lid = ntohs(entry->te_lid); + newmap->mp_tusn = tusn; + newmap->mp_next = NULL; + + /* + * The vhub table messages do not provide status on eport + * state, so we'll simply assume that the eport is up. + */ + eib_fip_queue_tbl_entry(etbl, newmap, tusn, FIP_EPORT_UP); + + /* + * Update table checksum with this entry's computed checksum + */ + ipkt = (uint32_t *)entry; + for (i = 0; i < FIP_VHUB_TABLE_ENTRY_WORDS; i++) + etbl->tb_checksum += ipkt[i]; + } + etbl->tb_entries_seen += entries_in_pkt; + + /* + * If this is the last packet of this vhub table, complete vhub + * table by verifying checksum and applying all the vhub updates + * that may have come in while we were constructing this table. + */ + if (desc_tbl->tb_hdr & FIP_TB_HDR_LAST) { + + ipkt = (uint32_t *)entry; + if (!eib_wa_no_good_vhub_cksum) { + if (*ipkt != etbl->tb_checksum) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_fip_parse_vhub_table: " + "vhub table checksum invalid, " + "computed=0x%lx, found=0x%lx", + etbl->tb_checksum, *ipkt); + } + } + + /* + * Per the EoIB specification, the gateway is supposed to + * include its address information for data messages in the + * vhub table. But we've observed that it doesn't do this + * (with the current version). If this is the case, we'll + * hand-create and add a vhub map for the gateway from the + * information we got in login ack. 
+ */ + if (etbl->tb_gateway == NULL) + eib_fip_queue_gw_entry(vnic, etbl, tusn, FIP_EPORT_UP); + + /* + * Apply pending vhub updates and reset table counters needed + * during table construction. + */ + if (eib_fip_apply_updates(ss, etbl, eupd) != EIB_E_SUCCESS) + goto vhub_table_fail; + + etbl->tb_entries_seen = 0; + etbl->tb_entries_in_table = 0; + + eib_vnic_vhub_table_done(vnic, EIB_LOGIN_TBL_DONE); + } + + return (EIB_E_SUCCESS); + +vhub_table_fail: + eib_vnic_vhub_table_done(vnic, EIB_LOGIN_TBL_FAILED); + return (EIB_E_FAILURE); +} + +static int +eib_fip_parse_vhub_update(uint8_t *pkt, eib_vnic_t *vnic) +{ + fip_vhub_update_t *upd; + fip_desc_vhub_update_t *desc_upd; + fip_vhub_table_entry_t *entry; + fip_basic_hdr_t *hdr; + eib_t *ss = vnic->vn_ss; + eib_login_data_t *ld = &vnic->vn_login_data; + eib_vhub_table_t *etbl = vnic->vn_vhub_table; + eib_vhub_update_t *eupd = vnic->vn_vhub_update; + eib_vhub_map_t *newmap; + boolean_t vhub_tbl_done; + uint32_t eport_vp_vhub_id; + uint32_t vhub_id; + uint32_t tusn; + uint32_t prev_tusn; + uint8_t eport_state; + + /* + * We should have the vhub table allocated as long as we're receiving + * vhub control messages. + */ + if (etbl == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_update: " + "vhub table missing for vnic id 0x%x", vnic->vn_id); + return (EIB_E_FAILURE); + } + + mutex_enter(&vnic->vn_lock); + vhub_tbl_done = ((vnic->vn_state == EIB_LOGIN_TBL_DONE) || + (vnic->vn_state == EIB_LOGIN_DONE)) ? B_TRUE : B_FALSE; + mutex_exit(&vnic->vn_lock); + + /* + * Note that 'pkt' is always atleast double-word aligned when it is + * passed to us, so we can cast it without any problems. 
+ */ + upd = (fip_vhub_update_t *)(void *)pkt; + hdr = &(upd->vu_fip_header); + + /* + * Validate all the header and descriptor types and lengths + */ + if (hdr->hd_type != FIP_DESC_TYPE_VENDOR_ID || + hdr->hd_len != FIP_DESC_LEN_VENDOR_ID) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_update: " + "invalid type/len in fip basic header, " + "exp (0x%x,0x%x), got (0x%x,0x%x)", + FIP_DESC_TYPE_VENDOR_ID, FIP_DESC_LEN_VENDOR_ID, + hdr->hd_type, hdr->hd_len); + return (EIB_E_FAILURE); + } + desc_upd = &(upd->vu_vhub_update); + if (desc_upd->up_type != FIP_DESC_TYPE_VHUB_UPDATE || + desc_upd->up_len != FIP_DESC_LEN_VHUB_UPDATE) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_update: " + "invalid type/len in vhub update desc: " + "exp (0x%x,0x%x), got (0x%x,0x%x)", + FIP_DESC_TYPE_VHUB_UPDATE, FIP_DESC_LEN_VHUB_UPDATE, + desc_upd->up_type, desc_upd->up_len); + return (EIB_E_FAILURE); + } + + /* + * Verify that the vhub id is ok for this vnic and save the eport state + */ + eport_vp_vhub_id = ntohl(desc_upd->up_eport_vp_vhub_id); + + vhub_id = eport_vp_vhub_id & FIP_UP_VHUB_ID_MASK; + if (vhub_id != ld->ld_vhub_id) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_update: " + "invalid vhub id in vhub update pkt: exp 0x%x, got 0x%x", + ld->ld_vhub_id, vhub_id); + return (EIB_E_FAILURE); + } + eport_state = (uint8_t)((eport_vp_vhub_id >> FIP_UP_EPORT_STATE_SHIFT) & + FIP_UP_EPORT_STATE_MASK); + + /* + * If this is the first update we receive, any tusn is ok. Otherwise, + * make sure the tusn we see in the packet is appropriate. + */ + tusn = ntohl(desc_upd->up_tusn); + prev_tusn = vhub_tbl_done ? 
etbl->tb_tusn : eupd->up_tusn; + + if (prev_tusn != 0) { + if (tusn == prev_tusn) { + eib_fip_update_eport_state(ss, etbl, eupd, + vhub_tbl_done, eport_state); + return (EIB_E_SUCCESS); + } + if (tusn != (prev_tusn + 1)) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_vhub_update: " + "out of order TUSN received (exp 0x%lx, " + "got 0x%lx), dropping pkt", prev_tusn + 1, tusn); + return (EIB_E_FAILURE); + } + } + + /* + * EoIB expects only type 0 (vnic address) entries to maintain the + * context table + */ + entry = &(desc_upd->up_tbl_entry); + ASSERT((entry->te_v_rss_type & FIP_TE_TYPE_MASK) == FIP_TE_TYPE_VNIC); + + /* + * If the vHUB table has already been fully constructed and if we've + * now received a notice to remove a vnic entry from it, do it. + */ + if ((vhub_tbl_done) && + ((entry->te_v_rss_type & FIP_TE_VALID) == 0)) { + eib_fip_dequeue_tbl_entry(etbl, entry->te_mac, + tusn, eport_state); + + if (bcmp(entry->te_mac, ld->ld_assigned_mac, ETHERADDRL) == 0) { + uint8_t *mymac; + + mymac = entry->te_mac; + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_vhub_update: " + "vhub update pkt received to kill self " + "(%x:%x:%x:%x:%x:%x)", mymac[0], mymac[1], mymac[2], + mymac[3], mymac[4], mymac[5]); + + return (EIB_E_FAILURE); + } + return (EIB_E_SUCCESS); + } + + /* + * Otherwise, allocate a new eib_vhub_map_t and fill it in with + * the details of the new entry + */ + if ((newmap = eib_fip_get_vhub_map()) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_update: " + "no memory for vhub update entry, will be ignoring" + "this vhub update packet"); + return (EIB_E_FAILURE); + } + + newmap->mp_v_rss_type = entry->te_v_rss_type; + bcopy(entry->te_mac, newmap->mp_mac, ETHERADDRL); + newmap->mp_qpn = (ntohl(entry->te_qpn) & FIP_TE_QPN_MASK); + newmap->mp_sl = (entry->te_sl & FIP_TE_SL_MASK); + newmap->mp_lid = ntohs(entry->te_lid); + newmap->mp_tusn = tusn; + newmap->mp_next = NULL; + + /* + * Update the full vhub table or chain it to 
the list of pending + * updates depending on if the vhub table construction is over + * or not. + */ + if (vhub_tbl_done) { + eib_fip_queue_tbl_entry(etbl, newmap, tusn, eport_state); + } else { + eib_fip_queue_upd_entry(eupd, newmap, tusn, eport_state); + } + + return (EIB_E_SUCCESS); +} + +static void +eib_fip_update_eport_state(eib_t *ss, eib_vhub_table_t *tbl, + eib_vhub_update_t *upd, boolean_t tbl_done, uint8_t eport_state) +{ + if (tbl_done) { + mutex_enter(&tbl->tb_lock); + if (tbl->tb_eport_state != eport_state) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_fip_update_eport_state: " + "eport state changing from %d to %d", + tbl->tb_eport_state, eport_state); + tbl->tb_eport_state = eport_state; + } + mutex_exit(&tbl->tb_lock); + } else { + mutex_enter(&upd->up_lock); + if (upd->up_eport_state != eport_state) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_fip_update_eport_state: " + "eport state changing from %d to %d", + upd->up_eport_state, eport_state); + upd->up_eport_state = eport_state; + } + mutex_exit(&upd->up_lock); + } +} + +static void +eib_fip_queue_tbl_entry(eib_vhub_table_t *tbl, eib_vhub_map_t *map, + uint32_t tusn, uint8_t eport_state) +{ + uint8_t bkt; + + mutex_enter(&tbl->tb_lock); + + switch (map->mp_v_rss_type & FIP_TE_TYPE_MASK) { + case FIP_TE_TYPE_GATEWAY: + if (tbl->tb_gateway) { + kmem_free(tbl->tb_gateway, + sizeof (eib_vhub_map_t)); + } + tbl->tb_gateway = map; + break; + + case FIP_TE_TYPE_UNICAST_MISS: + if (tbl->tb_unicast_miss) { + kmem_free(tbl->tb_unicast_miss, + sizeof (eib_vhub_map_t)); + } + tbl->tb_unicast_miss = map; + break; + + case FIP_TE_TYPE_VHUB_MULTICAST: + if (tbl->tb_vhub_multicast) { + kmem_free(tbl->tb_vhub_multicast, + sizeof (eib_vhub_map_t)); + } + tbl->tb_vhub_multicast = map; + break; + + case FIP_TE_TYPE_MULTICAST_ENTRY: + /* + * If multicast entry types are not to be specially + * processed, treat them like regular vnic addresses. 
+ */ + if (!eib_wa_no_mcast_entries) { + bkt = (map->mp_mac[ETHERADDRL-1]) % EIB_TB_NBUCKETS; + map->mp_next = tbl->tb_mcast_entry[bkt]; + tbl->tb_mcast_entry[bkt] = map; + break; + } + /*FALLTHROUGH*/ + + case FIP_TE_TYPE_VNIC: + bkt = (map->mp_mac[ETHERADDRL-1]) % EIB_TB_NBUCKETS; + map->mp_next = tbl->tb_vnic_entry[bkt]; + tbl->tb_vnic_entry[bkt] = map; + break; + } + + tbl->tb_tusn = tusn; + tbl->tb_eport_state = eport_state; + + mutex_exit(&tbl->tb_lock); +} + +static void +eib_fip_queue_upd_entry(eib_vhub_update_t *upd, eib_vhub_map_t *map, + uint32_t tusn, uint8_t eport_state) +{ + eib_vhub_map_t *tail; + + /* + * The eib_vhub_update_t list is only touched/traversed when the + * control cq handler is parsing either update or table message, + * or by the table cleanup routine when we aren't attached to any + * control mcgs. Bottom line is that this list traversal is always + * single-threaded and we could probably do away with the lock. + */ + mutex_enter(&upd->up_lock); + for (tail = upd->up_vnic_entry; tail != NULL; tail = tail->mp_next) { + if (tail->mp_next == NULL) + break; + } + if (tail) { + tail->mp_next = map; + } else { + upd->up_vnic_entry = map; + } + + upd->up_tusn = tusn; + upd->up_eport_state = eport_state; + + mutex_exit(&upd->up_lock); +} + +static void +eib_fip_queue_gw_entry(eib_vnic_t *vnic, eib_vhub_table_t *tbl, uint32_t tusn, + uint8_t eport_state) +{ + eib_t *ss = vnic->vn_ss; + eib_vhub_map_t *newmap; + eib_login_data_t *ld = &vnic->vn_login_data; + + if ((newmap = eib_fip_get_vhub_map()) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_queue_gw_entry: " + "no memory to queue gw entry, transactions could fail"); + return; + } + + newmap->mp_v_rss_type = FIP_TE_VALID | FIP_TE_TYPE_GATEWAY; + bcopy(eib_zero_mac, newmap->mp_mac, ETHERADDRL); + newmap->mp_qpn = ld->ld_gw_data_qpn; + newmap->mp_sl = ld->ld_gw_sl; + newmap->mp_lid = ld->ld_gw_lid; + newmap->mp_tusn = tusn; + newmap->mp_next = NULL; + + eib_fip_queue_tbl_entry(tbl, 
newmap, tusn, eport_state);
+}
+
+/*
+ * Apply the queued vnic-entry updates to the vhub table. Updates with a
+ * tusn older than (or equal to) the table's tusn are stale and discarded;
+ * a gap between the table tusn and the first update tusn means we lost
+ * updates and must fail so the caller can rebuild the table.
+ */
+static int
+eib_fip_apply_updates(eib_t *ss, eib_vhub_table_t *tbl, eib_vhub_update_t *upd)
+{
+	eib_vhub_map_t *list;
+	eib_vhub_map_t *map;
+	eib_vhub_map_t *nxt;
+	uint32_t tbl_tusn = tbl->tb_tusn;
+
+	/*
+	 * Take the update list out
+	 */
+	mutex_enter(&upd->up_lock);
+	list = upd->up_vnic_entry;
+	upd->up_vnic_entry = NULL;
+	mutex_exit(&upd->up_lock);
+
+	/*
+	 * Skip any updates with older/same tusn as our vhub table
+	 */
+	nxt = NULL;
+	for (map = list; (map) && (map->mp_tusn <= tbl_tusn); map = nxt) {
+		nxt = map->mp_next;
+		kmem_free(map, sizeof (eib_vhub_map_t));
+	}
+
+	if (map == NULL)
+		return (EIB_E_SUCCESS);
+
+	/*
+	 * If we missed any updates between table tusn and the first
+	 * update tusn we got, we need to fail.
+	 *
+	 * Note: the tusn values are uint32_t, so use %x (not %lx, which
+	 * would consume 64 bits per vararg on LP64 and misprint both).
+	 */
+	if (map->mp_tusn > (tbl_tusn + 1)) {
+		EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_apply_updates: "
+		    "vhub update missed tusn(s), expected=0x%x, got=0x%x",
+		    (tbl_tusn + 1), map->mp_tusn);
+		for (; map != NULL; map = nxt) {
+			nxt = map->mp_next;
+			kmem_free(map, sizeof (eib_vhub_map_t));
+		}
+		return (EIB_E_FAILURE);
+	}
+
+	/*
+	 * If everything is fine, apply all the updates we received
+	 */
+	for (; map != NULL; map = nxt) {
+		nxt = map->mp_next;
+		map->mp_next = NULL;
+
+		if (map->mp_v_rss_type & FIP_TE_VALID) {
+			eib_fip_queue_tbl_entry(tbl, map, upd->up_tusn,
+			    upd->up_eport_state);
+		} else {
+			eib_fip_dequeue_tbl_entry(tbl, map->mp_mac,
+			    upd->up_tusn, upd->up_eport_state);
+			kmem_free(map, sizeof (eib_vhub_map_t));
+		}
+	}
+
+	return (EIB_E_SUCCESS);
+}
+
+/*
+ * Remove the vnic entry with the given mac from the vhub table and
+ * record the new tusn/eport state.
+ */
+static void
+eib_fip_dequeue_tbl_entry(eib_vhub_table_t *tbl, uint8_t *mac, uint32_t tusn,
+    uint8_t eport_state)
+{
+	uint8_t bkt;
+	eib_vhub_map_t *prev;
+	eib_vhub_map_t *elem;
+
+	bkt = (mac[ETHERADDRL-1]) % EIB_TB_NBUCKETS;
+
+	mutex_enter(&tbl->tb_lock);
+
+	/*
+	 * Note that for EoIB, the vhub table is maintained using only
+	 * vnic entry updates
+	 */
+	prev = NULL;
+	for (elem = tbl->tb_vnic_entry[bkt]; elem; elem = elem->mp_next)
{ + if (bcmp(elem->mp_mac, mac, ETHERADDRL) == 0) + break; + prev = elem; + } + + if (prev && elem) { + prev->mp_next = elem->mp_next; + kmem_free(elem, sizeof (eib_vhub_map_t)); + } + + tbl->tb_tusn = tusn; + tbl->tb_eport_state = eport_state; + + mutex_exit(&tbl->tb_lock); +} + +static eib_vhub_map_t * +eib_fip_get_vhub_map(void) +{ + return (kmem_zalloc(sizeof (eib_vhub_map_t), KM_NOSLEEP)); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_ibt.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1004 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/dlpi.h> /* HCKSUM_INET_FULL_V4 */ +#include <sys/pattr.h> /* HCK_FULLCKSUM */ +#include <sys/ib/mgt/sm_attr.h> /* SM_INIT_TYPE_REPLY_... 
*/ + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static void eib_ibt_reset_partitions(eib_t *); +static void eib_ibt_wakeup_sqd_waiters(eib_t *, ibt_channel_hdl_t); +static int eib_ibt_chan_pkey(eib_t *, eib_chan_t *, ib_pkey_t, boolean_t, + boolean_t *); +static boolean_t eib_ibt_has_chan_pkey_changed(eib_t *, eib_chan_t *); +static boolean_t eib_ibt_has_any_pkey_changed(eib_t *); +static int eib_ibt_fill_avect(eib_t *, eib_avect_t *, ib_lid_t); +static void eib_ibt_record_srate(eib_t *); + +/* + * Definitions private to this file + */ + +/* + * SM's init type reply flags + */ +#define EIB_PORT_ATTR_LOADED(itr) \ + (((itr) & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) +#define EIB_PORT_ATTR_NOT_PRESERVED(itr) \ + (((itr) & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) +#define EIB_PORT_PRES_NOT_PRESERVED(itr) \ + (((itr) & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 0) + +/* + * eib_ibt_hca_init() initialization progress flags + */ +#define EIB_HCAINIT_HCA_OPENED 0x01 +#define EIB_HCAINIT_ATTRS_ALLOCD 0x02 +#define EIB_HCAINIT_HCA_PORTS_QUERIED 0x04 +#define EIB_HCAINIT_PD_ALLOCD 0x08 +#define EIB_HCAINIT_CAPAB_RECORDED 0x10 + +int +eib_ibt_hca_init(eib_t *ss) +{ + ibt_status_t ret; + ibt_hca_portinfo_t *pi; + uint_t num_pi; + uint_t sz_pi; + uint_t progress = 0; + + if (ss->ei_hca_hdl) + return (EIB_E_SUCCESS); + + /* + * Open the HCA + */ + ret = ibt_open_hca(ss->ei_ibt_hdl, ss->ei_props->ep_hca_guid, + &ss->ei_hca_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_open_hca(hca_guid=0x%llx) " + "failed, ret=%d", ss->ei_props->ep_hca_guid, ret); + goto ibt_hca_init_fail; + } + progress |= EIB_HCAINIT_HCA_OPENED; + + /* + * Query and store HCA attributes + */ + ss->ei_hca_attrs = kmem_zalloc(sizeof (ibt_hca_attr_t), KM_SLEEP); + progress |= EIB_HCAINIT_ATTRS_ALLOCD; + + ret = ibt_query_hca(ss->ei_hca_hdl, ss->ei_hca_attrs); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + 
"ibt_query_hca(hca_hdl=0x%llx, " + "hca_guid=0x%llx) failed, ret=%d", + ss->ei_hca_hdl, ss->ei_props->ep_hca_guid, ret); + goto ibt_hca_init_fail; + } + + /* + * At this point, we don't even care about the linkstate, we only want + * to record our invariant base port guid and mtu + */ + ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num, + &pi, &num_pi, &sz_pi); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_query_hca_ports(hca_hdl=0x%llx, " + "port=0x%x) failed, ret=%d", ss->ei_hca_hdl, + ss->ei_props->ep_port_num, ret); + goto ibt_hca_init_fail; + } + if (num_pi != 1) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_query_hca_ports(hca_hdl=0x%llx, " + "port=0x%x) returned num_pi=%d", ss->ei_hca_hdl, + ss->ei_props->ep_port_num, num_pi); + ibt_free_portinfo(pi, sz_pi); + goto ibt_hca_init_fail; + } + + ss->ei_props->ep_sgid = pi->p_sgid_tbl[0]; + ss->ei_props->ep_mtu = (128 << pi->p_mtu); + ibt_free_portinfo(pi, sz_pi); + + progress |= EIB_HCAINIT_HCA_PORTS_QUERIED; + + /* + * Allocate a protection domain for all our transactions + */ + ret = ibt_alloc_pd(ss->ei_hca_hdl, IBT_PD_NO_FLAGS, &ss->ei_pd_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_alloc_pd(hca_hdl=0x%llx, " + "hca_guid=0x%llx) failed, ret=%d", + ss->ei_hca_hdl, ss->ei_props->ep_hca_guid, ret); + goto ibt_hca_init_fail; + } + progress |= EIB_HCAINIT_PD_ALLOCD; + + /* + * Finally, record the capabilities + */ + ss->ei_caps = kmem_zalloc(sizeof (eib_caps_t), KM_SLEEP); + eib_ibt_record_capab(ss, ss->ei_hca_attrs, ss->ei_caps); + eib_ibt_record_srate(ss); + + progress |= EIB_HCAINIT_CAPAB_RECORDED; + + return (EIB_E_SUCCESS); + +ibt_hca_init_fail: + eib_rb_ibt_hca_init(ss, progress); + return (EIB_E_FAILURE); +} + +void +eib_ibt_link_mod(eib_t *ss) +{ + eib_node_state_t *ns = ss->ei_node_state; + ibt_hca_portinfo_t *pi; + ibt_status_t ret; + uint8_t vn0_mac[ETHERADDRL]; + boolean_t all_zombies = B_FALSE; + boolean_t all_need_rejoin = 
B_FALSE; + uint_t num_pi; + uint_t sz_pi; + uint8_t itr; + + if (ns->ns_link_state == LINK_STATE_UNKNOWN) + return; + + /* + * See if we can get the port attributes or we're as good as down. + */ + ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num, + &pi, &num_pi, &sz_pi); + if ((ret != IBT_SUCCESS) || (pi->p_linkstate != IBT_PORT_ACTIVE)) { + ibt_free_portinfo(pi, sz_pi); + eib_mac_link_down(ss, B_FALSE); + return; + } + + /* + * If the SM re-initialized the port attributes, but did not preserve + * the old attributes, we need to check more. + */ + itr = pi->p_init_type_reply; + if (EIB_PORT_ATTR_LOADED(itr) && EIB_PORT_ATTR_NOT_PRESERVED(itr)) { + /* + * We're just coming back up; if we see that our base lid + * or sgid table has changed, we'll update these and try to + * restart all active vnics. If any of the vnic pkeys have + * changed, we'll reset the affected channels to the new pkey. + */ + if (bcmp(pi->p_sgid_tbl, &ss->ei_props->ep_sgid, + sizeof (ib_gid_t)) != 0) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_link_mod: port sgid table changed " + "(old %llx.%llx != new %llx.%llx), " + "all vnics are zombies now.", + ss->ei_props->ep_sgid.gid_prefix, + ss->ei_props->ep_sgid.gid_guid, + pi->p_sgid_tbl[0].gid_prefix, + pi->p_sgid_tbl[0].gid_guid); + + ss->ei_props->ep_sgid = pi->p_sgid_tbl[0]; + all_zombies = B_TRUE; + + } else if (ss->ei_props->ep_blid != pi->p_base_lid) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_link_mod: port base lid changed " + "(old 0x%x != new 0x%x), " + "all vnics are zombies now.", + ss->ei_props->ep_blid, pi->p_base_lid); + + ss->ei_props->ep_blid = pi->p_base_lid; + all_zombies = B_TRUE; + + } else if (eib_ibt_has_any_pkey_changed(ss)) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_link_mod: pkey has changed for vnic(s), " + "resetting all partitions"); + + eib_ibt_reset_partitions(ss); + } + } + + if (pi) { + ibt_free_portinfo(pi, sz_pi); + } + + /* + * If the SM hasn't preserved our presence 
in MCGs, we need to + * rejoin all of them. + */ + if (EIB_PORT_PRES_NOT_PRESERVED(itr)) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: " + "hca_guid=0x%llx, port=0x%x presence not preserved in SM, " + "rejoining all mcgs", ss->ei_props->ep_hca_guid, + ss->ei_props->ep_port_num); + + all_need_rejoin = B_TRUE; + } + + /* + * Before we do the actual work of restarting/rejoining, we need to + * see if the GW is reachable at this point of time. If not, we + * still continue to keep our link "down." Whenever the GW becomes + * reachable again, we'll restart/rejoin all the vnics that we've + * just marked. + */ + mutex_enter(&ss->ei_vnic_lock); + if (all_zombies) { + ss->ei_zombie_vnics = ss->ei_active_vnics; + } + if (all_need_rejoin) { + ss->ei_rejoin_vnics = ss->ei_active_vnics; + } + if (ss->ei_gw_unreachable) { + mutex_exit(&ss->ei_vnic_lock); + + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_link_mod: " + "gateway (gw_port=0x%x) unreachable for " + "hca_guid=0x%llx, port=0x%x, link state down", + ss->ei_gw_props->pp_gw_portid, ss->ei_props->ep_hca_guid, + ss->ei_props->ep_port_num); + + eib_mac_link_down(ss, B_FALSE); + return; + } + mutex_exit(&ss->ei_vnic_lock); + + /* + * Try to awaken the dead if possible + */ + bcopy(eib_zero_mac, vn0_mac, ETHERADDRL); + if (all_zombies) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: " + "hca_guid=0x%llx, hca_port=0x%x, gw_port=0x%x, " + "attempting to resurrect zombies", + ss->ei_props->ep_hca_guid, ss->ei_props->ep_port_num, + ss->ei_gw_props->pp_gw_portid); + + eib_vnic_resurrect_zombies(ss, vn0_mac); + } + + /* + * Re-join the mcgs if we need to + */ + if (all_need_rejoin) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: " + "hca_guid=0x%llx, hca_port=0x%x, gw_port=0x%x, " + "attempting to rejoin mcgs", + ss->ei_props->ep_hca_guid, ss->ei_props->ep_port_num, + ss->ei_gw_props->pp_gw_portid); + + eib_vnic_rejoin_mcgs(ss); + } + + /* + * If we've restarted the zombies because the gateway 
went down and + * came back, it is possible our unicast mac address changed from + * what it was earlier. If so, we need to update our unicast address + * with the mac layer before marking the link up. + */ + if (bcmp(vn0_mac, eib_zero_mac, ETHERADDRL) != 0) + mac_unicst_update(ss->ei_mac_hdl, vn0_mac); + + /* + * Notify the link state up if required + */ + eib_mac_link_up(ss, B_FALSE); +} + +int +eib_ibt_modify_chan_pkey(eib_t *ss, eib_chan_t *chan, ib_pkey_t pkey) +{ + /* + * Make sure the channel pkey and index are set to what we need + */ + return (eib_ibt_chan_pkey(ss, chan, pkey, B_TRUE, NULL)); +} + +eib_avect_t * +eib_ibt_hold_avect(eib_t *ss, ib_lid_t dlid, uint8_t sl) +{ + uint_t ndx = dlid % EIB_AV_NBUCKETS; /* simple hashing */ + eib_avect_t *av; + eib_avect_t *prev; + int ret; + + mutex_enter(&ss->ei_av_lock); + + /* + * See if we have the address vector + */ + prev = NULL; + for (av = ss->ei_av[ndx]; av; av = av->av_next) { + prev = av; + if ((av->av_vect).av_dlid == dlid) + break; + } + + /* + * If we don't have it, create a new one and chain it to + * the same bucket + */ + if (av == NULL) { + av = kmem_zalloc(sizeof (eib_avect_t), KM_NOSLEEP); + if (av == NULL) { + mutex_exit(&ss->ei_av_lock); + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_hold_avect: " + "no memory, could not allocate address vector"); + return (NULL); + } + + ret = EIB_E_FAILURE; + if (!eib_wa_no_av_discover) + ret = eib_ibt_fill_avect(ss, av, dlid); + + if (ret != EIB_E_SUCCESS) { + (av->av_vect).av_srate = IBT_SRATE_10; + (av->av_vect).av_srvl = sl; + (av->av_vect).av_port_num = ss->ei_props->ep_port_num; + (av->av_vect).av_send_grh = B_FALSE; + (av->av_vect).av_dlid = dlid; + (av->av_vect).av_src_path = 0; /* we use base lid */ + } + + if (prev) + prev->av_next = av; + else + ss->ei_av[ndx] = av; + } + + /* + * Increment the address vector reference count before returning + */ + (av->av_ref)++; + + mutex_exit(&ss->ei_av_lock); + + return (av); +} + +static int 
+eib_ibt_fill_avect(eib_t *ss, eib_avect_t *av, ib_lid_t dlid)
+{
+	ibt_node_info_t ni;
+	ibt_path_attr_t attr;
+	ibt_path_info_t path;
+	ibt_status_t ret;
+	ib_gid_t dgid;
+
+	if ((ret = ibt_lid_to_node_info(dlid, &ni)) != IBT_SUCCESS) {
+		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_fill_avect: "
+		    "ibt_lid_to_node_info(dlid=0x%x) failed, ret=%d",
+		    dlid, ret);
+		return (EIB_E_FAILURE);
+	}
+	dgid.gid_prefix = ss->ei_gw_props->pp_gw_sn_prefix;
+	dgid.gid_guid = ni.n_port_guid;
+
+	/*
+	 * Get the reversible path information for this destination.
+	 * Note: zero attr with its own type's size; using the (larger)
+	 * ibt_path_info_t size here would overwrite adjacent stack.
+	 */
+	bzero(&attr, sizeof (ibt_path_attr_t));
+	attr.pa_sgid = ss->ei_props->ep_sgid;
+	attr.pa_dgids = &dgid;
+	attr.pa_num_dgids = 1;
+
+	bzero(&path, sizeof (ibt_path_info_t));
+	ret = ibt_get_paths(ss->ei_ibt_hdl, IBT_PATH_NO_FLAGS,
+	    &attr, 1, &path, NULL);
+	if ((ret != IBT_SUCCESS) || (path.pi_hca_guid == 0)) {
+		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_fill_avect: "
+		    "ibt_get_paths(dgid=%llx.%llx) failed, ret=%d",
+		    dgid.gid_prefix, dgid.gid_guid, ret);
+		return (EIB_E_FAILURE);
+	}
+
+	/*
+	 * Fill in the address vector
+	 */
+	bcopy(&path.pi_prim_cep_path.cep_adds_vect, &av->av_vect,
+	    sizeof (ibt_adds_vect_t));
+
+	return (EIB_E_SUCCESS);
+}
+
+/*
+ * Drop a reference on an address vector obtained via eib_ibt_hold_avect().
+ */
+void
+eib_ibt_release_avect(eib_t *ss, eib_avect_t *av)
+{
+	mutex_enter(&ss->ei_av_lock);
+
+	ASSERT(av->av_ref > 0);
+	(av->av_ref)--;
+
+	mutex_exit(&ss->ei_av_lock);
+}
+
+/*
+ * Free all cached address vectors; callers must have released every
+ * reference (asserted below) before this is invoked.
+ */
+void
+eib_ibt_free_avects(eib_t *ss)
+{
+	eib_avect_t *av;
+	eib_avect_t *av_next;
+	int ndx;
+
+	mutex_enter(&ss->ei_av_lock);
+	for (ndx = 0; ndx < EIB_AV_NBUCKETS; ndx++) {
+		for (av = ss->ei_av[ndx]; av; av = av_next) {
+			av_next = av->av_next;
+
+			ASSERT(av->av_ref == 0);
+			kmem_free(av, sizeof (eib_avect_t));
+		}
+		ss->ei_av[ndx] = NULL;
+	}
+	mutex_exit(&ss->ei_av_lock);
+}
+
+/*ARGSUSED*/
+void
+eib_ibt_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
+    ibt_async_code_t code, ibt_async_event_t *event)
+{
+	eib_t *ss = (eib_t *)clnt_private;
+	eib_event_t *evi;
+	uint_t ev_code;
+
+	ev_code
= EIB_EV_NONE; + + switch (code) { + case IBT_EVENT_SQD: + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: got IBT_EVENT_SQD"); + eib_ibt_wakeup_sqd_waiters(ss, event->ev_chan_hdl); + break; + + case IBT_EVENT_PORT_UP: + if (event->ev_port == ss->ei_props->ep_port_num) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: got IBT_EVENT_PORT_UP"); + ev_code = EIB_EV_PORT_UP; + } + break; + + case IBT_ERROR_PORT_DOWN: + if (event->ev_port == ss->ei_props->ep_port_num) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: got IBT_ERROR_PORT_DOWN"); + ev_code = EIB_EV_PORT_DOWN; + } + break; + + case IBT_CLNT_REREG_EVENT: + if (event->ev_port == ss->ei_props->ep_port_num) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: got IBT_CLNT_REREG_EVENT"); + ev_code = EIB_EV_CLNT_REREG; + } + break; + + case IBT_PORT_CHANGE_EVENT: + if ((event->ev_port == ss->ei_props->ep_port_num) && + (event->ev_port_flags & IBT_PORT_CHANGE_PKEY)) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: " + "got IBT_PORT_CHANGE_EVENT(PKEY_CHANGE)"); + ev_code = EIB_EV_PKEY_CHANGE; + } else if ((event->ev_port == ss->ei_props->ep_port_num) && + (event->ev_port_flags & IBT_PORT_CHANGE_SGID)) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: " + "got IBT_PORT_CHANGE_EVENT(SGID_CHANGE)"); + ev_code = EIB_EV_SGID_CHANGE; + } + break; + + case IBT_HCA_ATTACH_EVENT: + /* + * For HCA attach, after a new HCA is plugged in and + * configured using cfgadm, an explicit plumb will need + * to be run, so we don't need to do anything here. + */ + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_async_handler: " + "got IBT_HCA_ATTACH_EVENT"); + break; + + case IBT_HCA_DETACH_EVENT: + /* + * Before an HCA unplug, cfgadm is expected to trigger + * any rcm scripts to unplumb the EoIB instances on the + * card. If so, we should not be holding any hca resource, + * since we don't do ibt_open_hca() until plumb time. 
However, + * if an earlier unplumb hadn't cleaned up the hca resources + * properly because the network layer hadn't returned the + * buffers at that time, we could be holding hca resources. + * We'll try to release them here, and protect the code from + * racing with some other plumb/unplumb operation. + */ + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_async_handler: " + "got IBT_HCA_DETACH_EVENT"); + + eib_mac_set_nic_state(ss, EIB_NIC_STOPPING); + eib_rb_rsrc_setup_bufs(ss, B_FALSE); + if (ss->ei_tx || ss->ei_rx || ss->ei_lso) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_events_handler: nw layer still holding " + "hca resources, could not detach HCA"); + } else if (ss->ei_hca_hdl) { + eib_rb_ibt_hca_init(ss, ~0); + } + eib_mac_clr_nic_state(ss, EIB_NIC_STOPPING); + + break; + } + + if (ev_code != EIB_EV_NONE) { + evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP); + if (evi == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_ibt_async_handler: " + "no memory, could not handle event 0x%lx", ev_code); + } else { + evi->ev_code = ev_code; + evi->ev_arg = NULL; + eib_svc_enqueue_event(ss, evi); + } + } +} + +/*ARGSUSED*/ +void +eib_ibt_record_capab(eib_t *ss, ibt_hca_attr_t *hca_attrs, eib_caps_t *caps) +{ + uint_t max_swqe = EIB_DATA_MAX_SWQE; + uint_t max_rwqe = EIB_DATA_MAX_RWQE; + + /* + * Checksum + */ + caps->cp_cksum_flags = 0; + if ((!eib_wa_no_cksum_offload) && + (hca_attrs->hca_flags & IBT_HCA_CKSUM_FULL)) { + caps->cp_cksum_flags = + HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; + /* HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM; */ + } + + /* + * Reserved L-Key + */ + if (hca_attrs->hca_flags2 & IBT_HCA2_RES_LKEY) { + caps->cp_resv_lkey_capab = 1; + caps->cp_resv_lkey = hca_attrs->hca_reserved_lkey; + } + + /* + * LSO + */ + caps->cp_lso_maxlen = 0; + if (!eib_wa_no_lso) { + if (hca_attrs->hca_max_lso_size > EIB_LSO_MAXLEN) { + caps->cp_lso_maxlen = EIB_LSO_MAXLEN; + } else { + caps->cp_lso_maxlen = hca_attrs->hca_max_lso_size; + } + } + + /* + * SGL + * + * 
Translating virtual address regions into physical regions + * for using the Reserved LKey feature results in a wr sgl that + * is a little longer. Since failing ibt_map_mem_iov() is costly, + * we'll record a high-water mark (65%) when we should stop + * trying to use Reserved LKey + */ + if (hca_attrs->hca_flags & IBT_HCA_WQE_SIZE_INFO) { + caps->cp_max_sgl = hca_attrs->hca_ud_send_sgl_sz; + } else { + caps->cp_max_sgl = hca_attrs->hca_max_sgl; + } + if (caps->cp_max_sgl > EIB_MAX_SGL) { + caps->cp_max_sgl = EIB_MAX_SGL; + } + caps->cp_hiwm_sgl = (caps->cp_max_sgl * 65) / 100; + + /* + * SWQE/RWQE: meet max chan size and max cq size limits (leave room + * to avoid cq overflow event) + */ + if (max_swqe > hca_attrs->hca_max_chan_sz) + max_swqe = hca_attrs->hca_max_chan_sz; + if (max_swqe > (hca_attrs->hca_max_cq_sz - 1)) + max_swqe = hca_attrs->hca_max_cq_sz - 1; + caps->cp_max_swqe = max_swqe; + + if (max_rwqe > hca_attrs->hca_max_chan_sz) + max_rwqe = hca_attrs->hca_max_chan_sz; + if (max_rwqe > (hca_attrs->hca_max_cq_sz - 1)) + max_rwqe = hca_attrs->hca_max_cq_sz - 1; + caps->cp_max_rwqe = max_rwqe; +} + +void +eib_rb_ibt_hca_init(eib_t *ss, uint_t progress) +{ + ibt_status_t ret; + + if (progress & EIB_HCAINIT_CAPAB_RECORDED) { + if (ss->ei_caps) { + kmem_free(ss->ei_caps, sizeof (eib_caps_t)); + ss->ei_caps = NULL; + } + } + + if (progress & EIB_HCAINIT_PD_ALLOCD) { + if (ss->ei_pd_hdl) { + ret = ibt_free_pd(ss->ei_hca_hdl, ss->ei_pd_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_ibt_hca_init: " + "ibt_free_pd(hca_hdl=0x%lx, pd_hdl=0x%lx) " + "failed, ret=%d", ss->ei_hca_hdl, + ss->ei_pd_hdl, ret); + } + ss->ei_pd_hdl = NULL; + } + } + + if (progress & EIB_HCAINIT_HCA_PORTS_QUERIED) { + ss->ei_props->ep_mtu = 0; + bzero(&ss->ei_props->ep_sgid, sizeof (ib_gid_t)); + } + + if (progress & EIB_HCAINIT_ATTRS_ALLOCD) { + kmem_free(ss->ei_hca_attrs, sizeof (ibt_hca_attr_t)); + ss->ei_hca_attrs = NULL; + } + + if (progress & 
EIB_HCAINIT_HCA_OPENED) { + ret = ibt_close_hca(ss->ei_hca_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "ibt_close_hca(hca_hdl=0x%lx) failed, " + "ret=%d", ss->ei_hca_hdl, ret); + } + ss->ei_hca_hdl = NULL; + } +} + +static void +eib_ibt_reset_partitions(eib_t *ss) +{ + eib_vnic_t *vnic; + eib_chan_t *chan = NULL; + uint64_t av; + int inst = 0; + + /* + * We already have the vhub pkey recorded in our eib_chan_t. + * We only need to make sure our pkey index still matches it. + * If not, modify the channel appropriately and update our + * records. + */ + if ((chan = ss->ei_admin_chan) != NULL) + (void) eib_ibt_modify_chan_pkey(ss, chan, chan->ch_pkey); + + mutex_enter(&ss->ei_vnic_lock); + av = ss->ei_active_vnics; + while ((inst = EIB_FIND_LSB_SET(av)) != -1) { + if ((vnic = ss->ei_vnic[inst]) != NULL) { + if ((chan = vnic->vn_ctl_chan) != NULL) { + (void) eib_ibt_modify_chan_pkey(ss, chan, + chan->ch_pkey); + } + if ((chan = vnic->vn_data_chan) != NULL) { + (void) eib_ibt_modify_chan_pkey(ss, chan, + chan->ch_pkey); + } + } + av &= (~((uint64_t)1 << inst)); + } + mutex_exit(&ss->ei_vnic_lock); +} + +static void +eib_ibt_wakeup_sqd_waiters(eib_t *ss, ibt_channel_hdl_t ev_chan_hdl) +{ + eib_vnic_t *vnic; + eib_chan_t *chan = NULL; + uint64_t av; + int inst = 0; + + /* + * See if this channel has been waiting for its queue to drain. + * + * Note that since this is especially likely to be called during + * logging in to the gateway, we also need to check the vnic + * currently being created. 
+ */ + mutex_enter(&ss->ei_vnic_lock); + + if ((vnic = ss->ei_vnic_pending) != NULL) { + chan = vnic->vn_ctl_chan; + if ((chan) && (chan->ch_chan == ev_chan_hdl)) + goto wakeup_sqd_waiters; + + chan = vnic->vn_data_chan; + if ((chan) && (chan->ch_chan == ev_chan_hdl)) + goto wakeup_sqd_waiters; + } + + av = ss->ei_active_vnics; + while ((inst = EIB_FIND_LSB_SET(av)) != -1) { + if ((vnic = ss->ei_vnic[inst]) != NULL) { + chan = vnic->vn_ctl_chan; + if (chan->ch_chan == ev_chan_hdl) + break; + + chan = vnic->vn_data_chan; + if (chan->ch_chan == ev_chan_hdl) + break; + } + av &= (~((uint64_t)1 << inst)); + } + +wakeup_sqd_waiters: + if (chan) { + mutex_enter(&chan->ch_cep_lock); + chan->ch_cep_state = IBT_STATE_SQD; + cv_broadcast(&chan->ch_cep_cv); + mutex_exit(&chan->ch_cep_lock); + } + + mutex_exit(&ss->ei_vnic_lock); +} + +static int +eib_ibt_chan_pkey(eib_t *ss, eib_chan_t *chan, ib_pkey_t new_pkey, + boolean_t set, boolean_t *pkey_changed) +{ + ibt_qp_info_t qp_attr; + ibt_status_t ret; + uint16_t new_pkey_ix; + + ret = ibt_pkey2index(ss->ei_hca_hdl, ss->ei_props->ep_port_num, + new_pkey, &new_pkey_ix); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: " + "ibt_pkey2index(hca_hdl=0x%llx, port_num=0x%x, " + "pkey=0x%x) failed, ret=%d", + ss->ei_hca_hdl, ss->ei_props->ep_port_num, new_pkey, ret); + return (EIB_E_FAILURE); + } + + /* + * If the pkey and the pkey index we have already matches the + * new one, nothing to do. + */ + mutex_enter(&chan->ch_pkey_lock); + if ((chan->ch_pkey == new_pkey) && (chan->ch_pkey_ix == new_pkey_ix)) { + if (pkey_changed) { + *pkey_changed = B_FALSE; + } + mutex_exit(&chan->ch_pkey_lock); + return (EIB_E_SUCCESS); + } + if (pkey_changed) { + *pkey_changed = B_TRUE; + } + mutex_exit(&chan->ch_pkey_lock); + + /* + * Otherwise, if we're asked only to test if the pkey index + * supplied matches the one recorded in the channel, return + * success, but don't set the pkey. 
+ */ + if (!set) { + return (EIB_E_SUCCESS); + } + + /* + * Otherwise, we need to change channel pkey. Pause the + * channel sendq first. + */ + ret = ibt_pause_sendq(chan->ch_chan, IBT_CEP_SET_SQD_EVENT); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: " + "ibt_pause_sendq(chan_hdl=0x%llx) failed, ret=%d", + chan->ch_chan, ret); + return (EIB_E_FAILURE); + } + + /* + * Wait for the channel to enter the IBT_STATE_SQD state + */ + mutex_enter(&chan->ch_cep_lock); + while (chan->ch_cep_state != IBT_STATE_SQD) + cv_wait(&chan->ch_cep_cv, &chan->ch_cep_lock); + mutex_exit(&chan->ch_cep_lock); + + /* + * Modify the qp with the supplied pkey index and unpause the channel + * If either of these operations fail, we'll leave the channel in + * the paused state and fail. + */ + bzero(&qp_attr, sizeof (ibt_qp_info_t)); + + qp_attr.qp_trans = IBT_UD_SRV; + qp_attr.qp_current_state = IBT_STATE_SQD; + qp_attr.qp_state = IBT_STATE_SQD; + qp_attr.qp_transport.ud.ud_pkey_ix = new_pkey_ix; + + /* + * Modify the qp to set the new pkey index, then unpause the + * channel and put it in RTS state and update the new values + * in our records + */ + mutex_enter(&chan->ch_pkey_lock); + + ret = ibt_modify_qp(chan->ch_chan, + IBT_CEP_SET_STATE | IBT_CEP_SET_PKEY_IX, &qp_attr, NULL); + if (ret != IBT_SUCCESS) { + mutex_exit(&chan->ch_pkey_lock); + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: " + "ibt_modify_qp(chan_hdl=0x%llx, IBT_CEP_SET_PKEY_IX) " + "failed for new_pkey_ix=0x%x, ret=%d", + chan->ch_chan, new_pkey_ix, ret); + return (EIB_E_FAILURE); + } + + if ((ret = ibt_unpause_sendq(chan->ch_chan)) != IBT_SUCCESS) { + mutex_exit(&chan->ch_pkey_lock); + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: " + "ibt_unpause_sendq(chan_hdl=0x%llx) failed, ret=%d", + chan->ch_chan, ret); + return (EIB_E_FAILURE); + } + + chan->ch_pkey = new_pkey; + chan->ch_pkey_ix = new_pkey_ix; + mutex_exit(&chan->ch_pkey_lock); + + return (EIB_E_SUCCESS); +} + 
+static boolean_t +eib_ibt_has_chan_pkey_changed(eib_t *ss, eib_chan_t *chan) +{ + boolean_t changed; + int ret; + + /* + * Don't modify the pkey, just ask if the pkey index for the channel's + * pkey has changed for any reason. If we fail, assume that the pkey + * has changed. + */ + ret = eib_ibt_chan_pkey(ss, chan, chan->ch_pkey, B_FALSE, &changed); + if (ret != EIB_E_SUCCESS) + changed = B_TRUE; + + return (changed); +} + +static boolean_t +eib_ibt_has_any_pkey_changed(eib_t *ss) +{ + eib_vnic_t *vnic; + eib_chan_t *chan = NULL; + uint64_t av; + int inst = 0; + + /* + * Return true if the pkey index of any our pkeys (of the channels + * of all active vnics) has changed. + */ + + chan = ss->ei_admin_chan; + if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan))) + return (B_TRUE); + + mutex_enter(&ss->ei_vnic_lock); + av = ss->ei_active_vnics; + while ((inst = EIB_FIND_LSB_SET(av)) != -1) { + if ((vnic = ss->ei_vnic[inst]) != NULL) { + chan = vnic->vn_ctl_chan; + if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan))) + return (B_TRUE); + + chan = vnic->vn_data_chan; + if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan))) + return (B_TRUE); + } + av &= (~((uint64_t)1 << inst)); + } + mutex_exit(&ss->ei_vnic_lock); + + return (B_FALSE); +} + +/* + * This routine is currently used simply to derive and record the port + * speed from the loopback path information (for debug purposes). For + * EoIB, currently the srate used in address vectors to IB neighbors + * and the gateway is fixed at IBT_SRATE_10. Eventually though, this + * information (and sl) has to come from the gateway for all destinations + * in the vhub table. 
+ */ +static void +eib_ibt_record_srate(eib_t *ss) +{ + ib_gid_t sgid = ss->ei_props->ep_sgid; + ibt_srate_t srate = IBT_SRATE_10; + ibt_path_info_t path; + ibt_path_attr_t path_attr; + ibt_status_t ret; + uint8_t num_paths; + + bzero(&path_attr, sizeof (path_attr)); + path_attr.pa_dgids = &sgid; + path_attr.pa_num_dgids = 1; + path_attr.pa_sgid = sgid; + + ret = ibt_get_paths(ss->ei_ibt_hdl, IBT_PATH_NO_FLAGS, + &path_attr, 1, &path, &num_paths); + if (ret == IBT_SUCCESS && num_paths >= 1) { + switch (srate = path.pi_prim_cep_path.cep_adds_vect.av_srate) { + case IBT_SRATE_2: + case IBT_SRATE_10: + case IBT_SRATE_30: + case IBT_SRATE_5: + case IBT_SRATE_20: + case IBT_SRATE_40: + case IBT_SRATE_60: + case IBT_SRATE_80: + case IBT_SRATE_120: + break; + default: + srate = IBT_SRATE_10; + } + } + + ss->ei_props->ep_srate = srate; + + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_ibt_record_srate: " + "srate = %d", srate); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_log.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,304 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/varargs.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Defaults + */ +uint_t eib_log_size = EIB_LOGSZ_DEFAULT; +int eib_log_level = EIB_MSGS_DEFAULT | EIB_MSGS_DEBUG; +int eib_log_timestamps = 0; + +/* + * Debug variables, should not be tunables so allocated debug buffer + * and its size remain consistent. 
+ */ +static kmutex_t eib_debug_buf_lock; +static uint8_t *eib_debug_buf; +static uint32_t eib_debug_buf_ndx; +static uint_t eib_debug_buf_sz = 0; + +/* + * Local declarations + */ +static void eib_log(char *); + +void +eib_debug_init(void) +{ + eib_debug_buf_ndx = 0; + eib_debug_buf_sz = eib_log_size; + eib_debug_buf = kmem_zalloc(eib_debug_buf_sz, KM_SLEEP); + + mutex_init(&eib_debug_buf_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +eib_debug_fini(void) +{ + mutex_destroy(&eib_debug_buf_lock); + + if (eib_debug_buf && eib_debug_buf_sz) { + kmem_free(eib_debug_buf, eib_debug_buf_sz); + eib_debug_buf = NULL; + } + eib_debug_buf_sz = 0; + eib_debug_buf_ndx = 0; +} + +void +eib_log(char *msg) +{ + uint32_t off; + int msglen; + char msgbuf[EIB_MAX_LINE]; + + if (eib_debug_buf == NULL) + return; + + if (eib_log_timestamps) { + msglen = snprintf(msgbuf, EIB_MAX_LINE, "%llx: %s", + (unsigned long long)ddi_get_lbolt64(), msg); + } else { + msglen = snprintf(msgbuf, EIB_MAX_LINE, "%s", msg); + } + + if (msglen < 0) + return; + else if (msglen >= EIB_MAX_LINE) + msglen = EIB_MAX_LINE - 1; + + mutex_enter(&eib_debug_buf_lock); + if ((eib_debug_buf_ndx == 0) || + (eib_debug_buf[eib_debug_buf_ndx-1] != '\n')) { + eib_debug_buf[eib_debug_buf_ndx] = '\n'; + eib_debug_buf_ndx++; + } + + off = eib_debug_buf_ndx; /* current msg should go here */ + + eib_debug_buf_ndx += msglen; /* next msg should start here */ + eib_debug_buf[eib_debug_buf_ndx] = 0; /* terminate current msg */ + + if (eib_debug_buf_ndx >= (eib_debug_buf_sz - 2 * EIB_MAX_LINE)) + eib_debug_buf_ndx = 0; + + mutex_exit(&eib_debug_buf_lock); + + bcopy(msgbuf, eib_debug_buf+off, msglen); /* no lock needed */ +} + +#ifdef EIB_DEBUG +void +eib_dprintf_verbose(int inst, const char *fmt, ...) 
+{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_VERBOSE) != EIB_MSGS_VERBOSE) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + } +} + +void +eib_dprintf_pkt(int inst, uint8_t *pkt, uint_t sz) +{ + char msgbuf[EIB_MAX_LINE]; + char *bufp; + uint8_t *p = pkt; + uint_t len; + uint_t i; + + if ((eib_log_level & EIB_MSGS_PKT) != EIB_MSGS_PKT) + return; + + while (sz >= 16) { + (void) snprintf(msgbuf, EIB_MAX_LINE, + "eoib%02d__%02x %02x %02x %02x %02x %02x %02x %02x " + "%02x %02x %02x %02x %02x %02x %02x %02x\n", inst, + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); + + eib_log(msgbuf); + + p += 16; + sz -= 16; + } + + len = EIB_MAX_LINE; + bufp = msgbuf; + for (i = 0; i < sz; i++) { + if (i == 0) { + (void) snprintf(bufp, len, "eoib%02d__%02x ", + inst, p[i]); + len -= 11; + bufp += 11; + } else if (i < (sz - 1)) { + (void) snprintf(bufp, len, "%02x ", p[i]); + len -= 3; + bufp += 3; + } else { + (void) snprintf(bufp, len, "%02x\n", p[i]); + len -= 3; + bufp += 3; + } + } + + eib_log(msgbuf); +} + +void +eib_dprintf_args(int inst, const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_ARGS) != EIB_MSGS_ARGS) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + } +} + +void +eib_dprintf_debug(int inst, const char *fmt, ...) 
+{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_DEBUG) != EIB_MSGS_DEBUG) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + } +} +#endif + +void +eib_dprintf_warn(int inst, const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_WARN) != EIB_MSGS_WARN) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + } +} + +void +eib_dprintf_err(int inst, const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_ERR) != EIB_MSGS_ERR) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + cmn_err(CE_WARN, "!%s\n", msgbuf); + } +} + +void +eib_dprintf_crit(int inst, const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_CRIT) != EIB_MSGS_CRIT) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + cmn_err(CE_PANIC, "!%s\n", msgbuf); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_mac.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,532 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/mac_provider.h> +#include <sys/mac_ether.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static void eib_rb_mac_start(eib_t *, eib_vnic_t *); + +/* + * This set of routines are used to set/clear the condition that the + * caller is about to do something that affects the state of the nic. + * If there's already someone doing either a start or a stop (possibly + * due to the async handler, a plumb or a dlpi_open happening, or an + * unplumb or dlpi_close coming in), we wait until that's done. 
 */
/*
 * eib_mac_set_nic_state() -- mark the nic as entering a state transition.
 * Blocks (cv_wait on ns_cv) while any other thread has STARTING or
 * STOPPING set, so start/stop sequences never interleave, then ORs the
 * caller's flags into ns_nic_state.
 */
void
eib_mac_set_nic_state(eib_t *ss, uint_t flags)
{
	eib_node_state_t *ns = ss->ei_node_state;

	mutex_enter(&ns->ns_lock);

	while ((ns->ns_nic_state & EIB_NIC_STARTING) ||
	    (ns->ns_nic_state & EIB_NIC_STOPPING)) {
		cv_wait(&ns->ns_cv, &ns->ns_lock);
	}
	ns->ns_nic_state |= flags;

	mutex_exit(&ns->ns_lock);
}

/*
 * eib_mac_clr_nic_state() -- clear the given state flags and wake any
 * threads blocked in eib_mac_set_nic_state() waiting for the transition
 * to finish.
 */
void
eib_mac_clr_nic_state(eib_t *ss, uint_t flags)
{
	eib_node_state_t *ns = ss->ei_node_state;

	mutex_enter(&ns->ns_lock);

	ns->ns_nic_state &= (~flags);

	cv_broadcast(&ns->ns_cv);
	mutex_exit(&ns->ns_lock);
}

/*
 * eib_mac_upd_nic_state() -- atomically clear one set of flags and set
 * another (e.g. STARTING -> STARTED) under ns_lock, then wake waiters.
 * Unlike eib_mac_set_nic_state(), this does not wait for in-progress
 * transitions; callers use it to complete their own transition.
 */
void
eib_mac_upd_nic_state(eib_t *ss, uint_t clr_flags, uint_t set_flags)
{
	eib_node_state_t *ns = ss->ei_node_state;

	mutex_enter(&ns->ns_lock);

	ns->ns_nic_state &= (~clr_flags);
	ns->ns_nic_state |= set_flags;

	cv_broadcast(&ns->ns_cv);
	mutex_exit(&ns->ns_lock);
}

/*
 * eib_mac_get_nic_state() -- return a locked snapshot of ns_nic_state.
 * The value may of course be stale by the time the caller examines it.
 */
uint_t
eib_mac_get_nic_state(eib_t *ss)
{
	eib_node_state_t *ns = ss->ei_node_state;
	uint_t nic_state;

	mutex_enter(&ns->ns_lock);
	nic_state = ns->ns_nic_state;
	mutex_exit(&ns->ns_lock);

	return (nic_state);
}

/*
 * eib_mac_link_state() -- record a link state change and, if the tracked
 * state actually changed, notify the mac layer via mac_link_update().
 */
void
eib_mac_link_state(eib_t *ss, link_state_t new_link_state,
    boolean_t force)
{
	eib_node_state_t *ns = ss->ei_node_state;
	boolean_t state_changed = B_FALSE;

	mutex_enter(&ns->ns_lock);

	/*
	 * We track the link state only if the current link state is
	 * not unknown. Obviously therefore, the first calls to set
	 * the link state from eib_mac_start() have to pass an explicit
	 * 'force' flag to force the state change tracking.
+ */ + if (ns->ns_link_state != LINK_STATE_UNKNOWN) + force = B_TRUE; + + if ((force) && (new_link_state != ns->ns_link_state)) { + ns->ns_link_state = new_link_state; + state_changed = B_TRUE; + } + mutex_exit(&ns->ns_lock); + + if (state_changed) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_mac_link_state: changing link state to %d", + new_link_state); + + mac_link_update(ss->ei_mac_hdl, new_link_state); + } else { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_mac_link_state: link state already %d", + new_link_state); + } +} + +void +eib_mac_link_up(eib_t *ss, boolean_t force) +{ + eib_mac_link_state(ss, LINK_STATE_UP, force); +} + +void +eib_mac_link_down(eib_t *ss, boolean_t force) +{ + eib_mac_link_state(ss, LINK_STATE_DOWN, force); +} + +int +eib_mac_start(eib_t *ss) +{ + eib_vnic_t *vnic0 = NULL; + eib_login_data_t *ld; + int err; + + /* + * Perform HCA related initializations + */ + if (eib_ibt_hca_init(ss) != EIB_E_SUCCESS) + goto start_fail; + + /* + * Make sure port is up. Also record the port base lid if it's up. + */ + if (eib_mac_hca_portstate(ss, &ss->ei_props->ep_blid, + &err) != EIB_E_SUCCESS) { + goto start_fail; + } + + /* + * Set up tx and rx buffer pools + */ + if (eib_rsrc_setup_bufs(ss, &err) != EIB_E_SUCCESS) + goto start_fail; + + /* + * Set up admin qp for logins and logouts + */ + if (eib_adm_setup_qp(ss, &err) != EIB_E_SUCCESS) + goto start_fail; + + /* + * Create the vnic for physlink (instance 0) + */ + if (eib_vnic_create(ss, 0, 0, &vnic0, &err) != EIB_E_SUCCESS) + goto start_fail; + + /* + * Update the mac layer about the correct values for MTU and + * unicast MAC address. Note that we've already verified that the + * vhub mtu (plus the eoib encapsulation header) is not greater + * than our port mtu, so we can go ahead and report the vhub mtu + * (of vnic0) directly. 
+ */ + ld = &(vnic0->vn_login_data); + (void) mac_maxsdu_update(ss->ei_mac_hdl, ld->ld_vhub_mtu); + mac_unicst_update(ss->ei_mac_hdl, ld->ld_assigned_mac); + + /* + * Report that the link is up and ready + */ + eib_mac_link_up(ss, B_TRUE); + return (0); + +start_fail: + eib_rb_mac_start(ss, vnic0); + eib_mac_link_down(ss, B_TRUE); + return (err); +} + +void +eib_mac_stop(eib_t *ss) +{ + eib_vnic_t *vnic; + link_state_t cur_link_state = ss->ei_node_state->ns_link_state; + int ndx; + + /* + * Stopping an EoIB device instance is somewhat different from starting + * it. Between the time the device instance was started and the call to + * eib_m_stop() now, a number of vnics could've been created. All of + * these will need to be destroyed before we can stop the device. + */ + for (ndx = EIB_MAX_VNICS - 1; ndx >= 0; ndx--) { + if ((vnic = ss->ei_vnic[ndx]) != NULL) + eib_vnic_delete(ss, vnic); + } + + /* + * And now, to undo the things we did in start (other than creation + * of vnics itself) + */ + eib_rb_mac_start(ss, NULL); + + /* + * Now that we're completed stopped, there's no mac address assigned + * to us. Update the mac layer with this information. Note that we + * can let the old max mtu information remain as-is, since we're likely + * to get that same mtu on a later plumb. + */ + mac_unicst_update(ss->ei_mac_hdl, eib_zero_mac); + + /* + * If our link state was up when the eib_m_stop() callback was called, + * we'll mark the link state as unknown now. Otherwise, we'll leave + * the link state as-is (down). + */ + if (cur_link_state == LINK_STATE_UP) + eib_mac_link_state(ss, LINK_STATE_UNKNOWN, B_TRUE); +} + +int +eib_mac_multicast(eib_t *ss, boolean_t add, uint8_t *mcast_mac) +{ + int ret = EIB_E_SUCCESS; + int err = 0; + + /* + * If it's a broadcast group join, each vnic needs to and is always + * joined to the broadcast address, so we return success immediately. + * If it's a broadcast group leave, we fail immediately for the same + * reason as above. 
+ */ + if (bcmp(mcast_mac, eib_broadcast_mac, ETHERADDRL) == 0) { + if (add) + return (0); + else + return (EINVAL); + } + + if (ss->ei_vnic[0]) { + if (add) { + ret = eib_vnic_join_data_mcg(ss, ss->ei_vnic[0], + mcast_mac, B_FALSE, &err); + } else { + eib_vnic_leave_data_mcg(ss, ss->ei_vnic[0], mcast_mac); + ret = EIB_E_SUCCESS; + } + } + + if (ret == EIB_E_SUCCESS) + return (0); + else + return (err); +} + +int +eib_mac_promisc(eib_t *ss, boolean_t set) +{ + int ret = EIB_E_SUCCESS; + int err = 0; + + if (ss->ei_vnic[0]) { + if (set) { + ret = eib_vnic_join_data_mcg(ss, ss->ei_vnic[0], + eib_zero_mac, B_FALSE, &err); + } else { + eib_vnic_leave_data_mcg(ss, ss->ei_vnic[0], + eib_zero_mac); + ret = EIB_E_SUCCESS; + } + } + + if (ret == EIB_E_SUCCESS) + return (0); + else + return (err); +} + +int +eib_mac_tx(eib_t *ss, mblk_t *mp) +{ + eib_ether_hdr_t evh; + eib_vnic_t *vnic = NULL; + eib_wqe_t *swqe = NULL; + boolean_t failed_vnic; + int found; + int ret; + + /* + * Grab a send wqe. If we cannot get one, wake up a service + * thread to monitor the swqe status and let the mac layer know + * as soon as we have enough tx wqes to start the traffic again. + */ + if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_LO)) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_mac_tx: " + "no swqe available, holding tx until resource " + "becomes available"); + eib_rsrc_txwqes_needed(ss); + return (EIB_E_FAILURE); + } + + /* + * Determine dmac, smac and vlan information + */ + eib_data_parse_ether_hdr(mp, &evh); + + /* + * Lookup the {smac, vlan} tuple in our vnic list. If it isn't + * there, this is obviously a new packet on a vnic/vlan that + * we haven't been informed about. So go ahead and file a request + * to create a new vnic. This is obviously not a clean thing to + * do - we should be informed when a vnic/vlan is being created + * and should be given a proper opportunity to login to the gateway + * and do the creation. 
But we don't have that luxury now, and + * this is the next best thing to do. Note that we return failure + * from here, so tx flow control should prevent further packets + * from coming in until the vnic creation has completed. + */ + found = eib_data_lookup_vnic(ss, evh.eh_smac, evh.eh_vlan, &vnic, + &failed_vnic); + if (found != EIB_E_SUCCESS) { + uint8_t *m = evh.eh_smac; + + /* + * Return the swqe back to the pool + */ + eib_rsrc_return_swqe(ss, swqe, NULL); + + /* + * If we had previously tried creating this vnic and had + * failed, we'll simply drop the packets on this vnic. + * Otherwise, we'll queue up a request to create this vnic. + */ + if (failed_vnic) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_mac_tx: " + "vnic creation for mac=%x:%x:%x:%x:%x:%x " + "vlan=0x%x failed previously, dropping pkt", + m[0], m[1], m[2], m[3], m[4], m[5], evh.eh_vlan); + return (EIB_E_SUCCESS); + } else { + eib_vnic_need_new(ss, evh.eh_smac, evh.eh_vlan); + return (EIB_E_FAILURE); + } + } + + /* + * We'll try to setup the destination in the swqe for this dmac + * and vlan. If we don't succeed, there's no need to undo any + * vnic-creation we might've made above (if we didn't find the + * vnic corresponding to the {smac, vlan} originally). Note that + * this is not a resource issue, so we'll issue a warning and + * drop the packet, but won't return failure from here. + */ + ret = eib_vnic_setup_dest(vnic, swqe, evh.eh_dmac, evh.eh_vlan); + if (ret != EIB_E_SUCCESS) { + uint8_t *dmac; + + dmac = evh.eh_dmac; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_mac_tx: " + "eib_vnic_setup_dest() failed for mac=%x:%x:%x:%x:%x:%x, " + "vlan=0x%x, dropping pkt", dmac[0], dmac[1], dmac[2], + dmac[3], dmac[4], dmac[5]); + + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_SUCCESS); + } + + /* + * The only reason why this would fail is if we needed LSO buffer(s) + * to prepare this frame and couldn't find enough of those. 
+ */ + ret = eib_data_prepare_frame(vnic, swqe, mp, &evh); + if (ret != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_mac_tx: " + "eib_data_prepare_frame() failed (no LSO bufs?), " + "holding tx until resource becomes available"); + + eib_rsrc_return_swqe(ss, swqe, NULL); + eib_rsrc_lsobufs_needed(ss); + return (EIB_E_FAILURE); + } + + eib_data_post_tx(vnic, swqe); + + return (EIB_E_SUCCESS); +} + +int +eib_mac_hca_portstate(eib_t *ss, ib_lid_t *blid, int *err) +{ + ibt_hca_portinfo_t *pi; + ibt_status_t ret; + uint_t num_pi; + uint_t sz_pi; + + ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num, + &pi, &num_pi, &sz_pi); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_query_hca_ports(hca_hdl=0x%llx, " + "port=0x%x) failed, ret=%d", ss->ei_hca_hdl, + ss->ei_props->ep_port_num, ret); + goto mac_hca_portstate_fail; + } + if (num_pi != 1) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_query_hca_ports(hca_hdl=0x%llx, " + "port=0x%x) returned num_pi=%d", ss->ei_hca_hdl, + ss->ei_props->ep_port_num, num_pi); + goto mac_hca_portstate_fail; + } + + if (pi->p_linkstate != IBT_PORT_ACTIVE) + goto mac_hca_portstate_fail; + + /* + * Return the port's base lid if asked + */ + if (blid) { + *blid = pi->p_base_lid; + } + + ibt_free_portinfo(pi, sz_pi); + return (EIB_E_SUCCESS); + +mac_hca_portstate_fail: + if (pi) { + ibt_free_portinfo(pi, sz_pi); + } + if (err) { + *err = ENETDOWN; + } + return (EIB_E_FAILURE); +} + +static void +eib_rb_mac_start(eib_t *ss, eib_vnic_t *vnic0) +{ + int ntries; + + /* + * If vnic0 is non-null, delete it + */ + if (vnic0) { + eib_rb_vnic_create(ss, vnic0, ~0); + } + + /* + * At this point, we're pretty much done with all communication that + * we need to do for vnic-logout, etc. so we can get rid of any address + * vectors we might've allocated to send control/data packets. 
+ */ + eib_ibt_free_avects(ss); + + /* + * Tear down the rest of it + */ + if (ss->ei_admin_chan) { + eib_rb_adm_setup_qp(ss); + } + + /* + * If (say) the network layer has been holding onto our rx buffers, we + * wait a reasonable time for it to hand them back to us. If we don't + * get it still, we have nothing to do but avoid rolling back hca init + * since we cannot unregister the memory, release the pd or close the + * hca. We'll try to reuse it if there's a plumb again. + */ + for (ntries = 0; ntries < EIB_MAX_ATTEMPTS; ntries++) { + eib_rb_rsrc_setup_bufs(ss, B_FALSE); + if ((ss->ei_tx == NULL) && (ss->ei_rx == NULL) && + (ss->ei_lso == NULL)) { + break; + } + + delay(drv_usectohz(EIB_DELAY_HALF_SECOND)); + } + + if (ntries == EIB_MAX_ATTEMPTS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_rb_mac_start: " + "bufs outstanding, tx=0x%llx, rx=0x%llx, lso=0x%llx", + ss->ei_tx, ss->ei_rx, ss->ei_lso); + } else if (ss->ei_hca_hdl) { + eib_rb_ibt_hca_init(ss, ~0); + } + ss->ei_props->ep_blid = 0; + + /* + * Pending vnic creation requests (and failed-vnic records) will have + * to be cleaned up in any case + */ + eib_flush_vnic_reqs(ss); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_main.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,977 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +/* + * The Ethernet Over Infiniband driver + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#include <sys/mac_provider.h> +#include <sys/mac_ether.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Driver entry point declarations + */ +static int eib_attach(dev_info_t *, ddi_attach_cmd_t); +static int eib_detach(dev_info_t *, ddi_detach_cmd_t); + +/* + * MAC callbacks + */ +static int eib_m_stat(void *, uint_t, uint64_t *); +static int eib_m_start(void *); +static void eib_m_stop(void *); +static int eib_m_promisc(void *, boolean_t); +static int eib_m_multicast(void *, boolean_t, const uint8_t *); +static int eib_m_unicast(void *, const uint8_t *); +static mblk_t *eib_m_tx(void *, mblk_t *); +static boolean_t eib_m_getcapab(void *, mac_capab_t, void *); +static int eib_m_setprop(void *, const char *, mac_prop_id_t, uint_t, + const void *); +static int eib_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); +static void eib_m_propinfo(void *, const char *, mac_prop_id_t, + mac_prop_info_handle_t); + +/* + * Devops definition + */ +DDI_DEFINE_STREAM_OPS(eib_ops, nulldev, nulldev, eib_attach, eib_detach, + nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); + +/* + * Module Driver Info + */ +static struct modldrv eib_modldrv = { + &mod_driverops, /* Driver module */ + "EoIB Driver", /* Driver name and version */ + &eib_ops, /* Driver ops */ +}; + +/* + * Module Linkage + */ +static struct modlinkage eib_modlinkage = { + MODREV_1, (void *)&eib_modldrv, NULL +}; + +/* + * GLDv3 entry points + */ +#define EIB_M_CALLBACK_FLAGS \ + (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) +static mac_callbacks_t eib_m_callbacks = { + EIB_M_CALLBACK_FLAGS, + eib_m_stat, + eib_m_start, + eib_m_stop, + eib_m_promisc, + eib_m_multicast, + eib_m_unicast, + eib_m_tx, + NULL, + NULL, + 
eib_m_getcapab, + NULL, + NULL, + eib_m_setprop, + eib_m_getprop, + eib_m_propinfo +}; + +/* + * Async handler callback for ibt events + */ +static ibt_clnt_modinfo_t eib_clnt_modinfo = { + IBTI_V_CURR, + IBT_NETWORK, + eib_ibt_async_handler, + NULL, + EIB_DRV_NAME +}; + +/* + * Driver State Pointer + */ +void *eib_state; + +/* + * Declarations private to this file + */ +static int eib_state_init(eib_t *); +static int eib_add_event_callbacks(eib_t *); +static int eib_register_with_mac(eib_t *, dev_info_t *); +static void eib_rb_attach(eib_t *, uint_t); +static void eib_rb_state_init(eib_t *); +static void eib_rb_add_event_callbacks(eib_t *); +static void eib_rb_register_with_mac(eib_t *); + +/* + * Definitions private to this file + */ +#define EIB_ATTACH_STATE_ALLOCD 0x01 +#define EIB_ATTACH_PROPS_PARSED 0x02 +#define EIB_ATTACH_STATE_INIT_DONE 0x04 +#define EIB_ATTACH_IBT_ATT_DONE 0x08 +#define EIB_ATTACH_EV_CBS_ADDED 0x10 +#define EIB_ATTACH_REGISTER_MAC_DONE 0x20 + +int +_init() +{ + int ret; + + if (ddi_name_to_major(EIB_DRV_NAME) == (major_t)-1) + return (ENODEV); + + if ((ret = ddi_soft_state_init(&eib_state, sizeof (eib_t), 0)) != 0) + return (ret); + + mac_init_ops(&eib_ops, EIB_DRV_NAME); + if ((ret = mod_install(&eib_modlinkage)) != 0) { + mac_fini_ops(&eib_ops); + ddi_soft_state_fini(&eib_state); + return (ret); + } + + eib_debug_init(); + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&eib_modlinkage, modinfop)); +} + +int +_fini() +{ + int ret; + + if ((ret = mod_remove(&eib_modlinkage)) != 0) + return (ret); + + eib_debug_fini(); + + mac_fini_ops(&eib_ops); + ddi_soft_state_fini(&eib_state); + + return (ret); +} + +static int +eib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + eib_t *ss; + ibt_status_t ret; + int instance; + uint_t progress = 0; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Allocate softstate for this instance + */ + instance = ddi_get_instance(dip); + if 
(ddi_soft_state_zalloc(eib_state, instance) == DDI_FAILURE) + goto attach_fail; + + progress |= EIB_ATTACH_STATE_ALLOCD; + + ss = ddi_get_soft_state(eib_state, instance); + ss->ei_dip = dip; + ss->ei_instance = (uint_t)instance; + + /* + * Parse the node properties and get the gateway parameters + * for this instance + */ + if (eib_get_props(ss) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_attach: eib_get_props() failed"); + goto attach_fail; + } + progress |= EIB_ATTACH_PROPS_PARSED; + + /* + * Do per-state initialization + */ + if (eib_state_init(ss) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_attach: eib_state_init() failed"); + goto attach_fail; + } + progress |= EIB_ATTACH_STATE_INIT_DONE; + + /* + * Attach to IBTL + */ + if ((ret = ibt_attach(&eib_clnt_modinfo, ss->ei_dip, ss, + &ss->ei_ibt_hdl)) != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_attach: ibt_attach() failed, ret=%d", ret); + goto attach_fail; + } + progress |= EIB_ATTACH_IBT_ATT_DONE; + + /* + * Register NDI event callbacks with EoIB nexus + */ + if (eib_add_event_callbacks(ss) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_attach: eib_add_event_callbacks() failed"); + goto attach_fail; + } + progress |= EIB_ATTACH_EV_CBS_ADDED; + + /* + * Register with mac layer + */ + if (eib_register_with_mac(ss, dip) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_attach: eib_register_with_mac() failed"); + goto attach_fail; + } + progress |= EIB_ATTACH_REGISTER_MAC_DONE; + + return (DDI_SUCCESS); + +attach_fail: + eib_rb_attach(ss, progress); + return (DDI_FAILURE); +} + +static int +eib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + eib_t *ss; + int instance; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + ss = ddi_get_soft_state(eib_state, instance); + + /* + * If we had not cleaned up rx buffers (and hca resources) during + * unplumb because they were stuck with the nw layer at the 
time, + * we can try to clean them up now before doing the detach. + */ + eib_mac_set_nic_state(ss, EIB_NIC_STOPPING); + + eib_rb_rsrc_setup_bufs(ss, B_FALSE); + if (ss->ei_tx || ss->ei_rx || ss->ei_lso) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_detach: buffers still not returned " + "(tx=0x%llx, rx=0x%llx, lso=0x%llx), could " + "not detach", ss->ei_tx, ss->ei_rx, ss->ei_lso); + eib_mac_clr_nic_state(ss, EIB_NIC_STOPPING); + return (DDI_FAILURE); + } + if (ss->ei_hca_hdl) { + eib_rb_ibt_hca_init(ss, ~0); + } + eib_mac_clr_nic_state(ss, EIB_NIC_STOPPING); + + eib_rb_attach(ss, ~0); + + return (DDI_SUCCESS); +} + +static int +eib_m_stat(void *arg, uint_t stat, uint64_t *val) +{ + eib_t *ss = arg; + eib_stats_t *stats = ss->ei_stats; + + switch (stat) { + case MAC_STAT_IFSPEED: + *val = ss->ei_props->ep_ifspeed; + break; + + case MAC_STAT_OBYTES: + *val = stats->st_obytes; + break; + + case MAC_STAT_OPACKETS: + *val = stats->st_opkts; + break; + + case MAC_STAT_BRDCSTXMT: + *val = stats->st_brdcstxmit; + break; + + case MAC_STAT_MULTIXMT: + *val = stats->st_multixmit; + break; + + case MAC_STAT_OERRORS: + *val = stats->st_oerrors; + break; + + case MAC_STAT_NOXMTBUF: + *val = stats->st_noxmitbuf; + break; + + case MAC_STAT_RBYTES: + *val = stats->st_rbytes; + break; + + case MAC_STAT_IPACKETS: + *val = stats->st_ipkts; + break; + + case MAC_STAT_BRDCSTRCV: + *val = stats->st_brdcstrcv; + break; + + case MAC_STAT_MULTIRCV: + *val = stats->st_multircv; + break; + + case MAC_STAT_IERRORS: + *val = stats->st_ierrors; + break; + + case MAC_STAT_NORCVBUF: + *val = stats->st_norcvbuf; + break; + + case ETHER_STAT_LINK_DUPLEX: + *val = LINK_DUPLEX_FULL; + break; + + default: + return (ENOTSUP); + } + + return (0); +} + +static int +eib_m_start(void *arg) +{ + eib_t *ss = arg; + int ret = -1; + + eib_mac_set_nic_state(ss, EIB_NIC_STARTING); + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) + ret = eib_mac_start(ss); + + if (ret == 0) + 
eib_mac_upd_nic_state(ss, EIB_NIC_STARTING, EIB_NIC_STARTED); + else + eib_mac_clr_nic_state(ss, EIB_NIC_STARTING); + + return (ret); +} + +static void +eib_m_stop(void *arg) +{ + eib_t *ss = arg; + + eib_mac_set_nic_state(ss, EIB_NIC_STOPPING); + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) != 0) + eib_mac_stop(ss); + + eib_mac_clr_nic_state(ss, EIB_NIC_STARTED|EIB_NIC_STOPPING); +} + +static int +eib_m_promisc(void *arg, boolean_t flag) +{ + eib_t *ss = arg; + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) + return (0); + + return (eib_mac_promisc(ss, flag)); +} + +static int +eib_m_multicast(void *arg, boolean_t add, const uint8_t *mcast_mac) +{ + eib_t *ss = arg; + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) + return (0); + + /* + * We don't have any knowledge which of the vnics built on top of + * the physlink is this multicast group relevant for. We'll join + * it for vnic0 for now. + * + * Since the tx routine in EoIB currently piggy backs all multicast + * traffic over the broadcast channel, and all vnics are joined to + * the broadcast address when they're created, everyone should receive + * all multicast traffic anyway. + * + * On the rx side, we'll check if the incoming multicast address is + * either on the vnic's list of mcgs joined to (which will only be the + * broadcast address) or on vnic0's list of mcgs. If we find a match, + * we let the packet come through. + * + * This isn't perfect, but it's the best we can do given that we don't + * have any vlan information corresponding to this multicast address. + * + * Also, for now we'll use the synchronous multicast joins and + * leaves instead of the asynchronous mechanism provided by + * ibt_join_mcg() since that involves additional complexity for failed + * joins and removals. 
+ */ + return (eib_mac_multicast(ss, add, (uint8_t *)mcast_mac)); +} + +static int +eib_m_unicast(void *arg, const uint8_t *macaddr) +{ + eib_t *ss = arg; + eib_vnic_t *vnic; + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) + return (0); + + mutex_enter(&ss->ei_vnic_lock); + + vnic = ss->ei_vnic[0]; + if (bcmp(macaddr, vnic->vn_login_data.ld_assigned_mac, + ETHERADDRL) == 0) { + mutex_exit(&ss->ei_vnic_lock); + return (0); + } + + mutex_exit(&ss->ei_vnic_lock); + + return (EINVAL); +} + +static mblk_t * +eib_m_tx(void *arg, mblk_t *mp) +{ + eib_t *ss = arg; + mblk_t *next; + + /* + * If the nic hasn't been started, drop the message(s) + */ + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) { + freemsgchain(mp); + return (NULL); + } + + for (; mp != NULL; mp = next) { + /* + * Detach this message from the message chain + */ + next = mp->b_next; + mp->b_next = NULL; + + /* + * Attempt to send the message; if we fail (likely due + * to lack of resources), reattach this message to the + * chain and return the unsent chain back. When we're + * ready to send again, we'll issue a mac_tx_update(). 
+ */ + if (eib_mac_tx(ss, mp) != EIB_E_SUCCESS) { + mp->b_next = next; + break; + } + } + + return (mp); +} + +static boolean_t +eib_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) +{ + eib_t *ss = arg; + eib_caps_t *caps = ss->ei_caps; + eib_caps_t s_caps; + ibt_hca_attr_t hca_attrs; + ibt_status_t ret; + + /* + * If we haven't been plumbed yet, try getting the hca attributes + * and figure out the capabilities now + */ + if (caps == NULL) { + ASSERT(ss->ei_props != NULL); + + ret = ibt_query_hca_byguid(ss->ei_props->ep_hca_guid, + &hca_attrs); + if (ret == IBT_SUCCESS) { + eib_ibt_record_capab(ss, &hca_attrs, &s_caps); + caps = &s_caps; + } + } + + if ((caps != NULL) && (cap == MAC_CAPAB_HCKSUM)) { + uint32_t *tx_flags = cap_data; + + if (caps->cp_cksum_flags == 0) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_m_getcapab: hw cksum disabled, cksum_flags=0"); + return (B_FALSE); + } + + *tx_flags = caps->cp_cksum_flags; + + return (B_TRUE); + + } else if ((caps != NULL) && (cap == MAC_CAPAB_LSO)) { + mac_capab_lso_t *cap_lso = cap_data; + + /* + * If the HCA supports LSO, it will advertise a non-zero + * "max lso size" parameter. Also, LSO relies on hw + * checksum being available. Finally, if the HCA + * doesn't provide the reserved-lkey capability, LSO + * will adversely affect the performance. So, we'll + * enable LSO only if we have a non-zero max lso size, + * support checksum offload and provide reserved lkey. 
+ */ + if (caps->cp_lso_maxlen == 0 || + caps->cp_cksum_flags == 0 || + caps->cp_resv_lkey_capab == 0) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_m_getcapab: " + "LSO disabled, lso_maxlen=0x%lx, " + "cksum_flags=0x%lx, resv_lkey_capab=%d", + caps->cp_lso_maxlen, + caps->cp_cksum_flags, + caps->cp_resv_lkey_capab); + return (B_FALSE); + } + + cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; + cap_lso->lso_basic_tcp_ipv4.lso_max = caps->cp_lso_maxlen - 1; + + return (B_TRUE); + } + + return (B_FALSE); +} + +/*ARGSUSED*/ +static int +eib_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + return (ENOTSUP); +} + +static int +eib_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + eib_t *ss = arg; + link_duplex_t duplex = LINK_DUPLEX_FULL; + uint64_t speed = ss->ei_props->ep_ifspeed; + int err = 0; + + switch (pr_num) { + case MAC_PROP_DUPLEX: + ASSERT(pr_valsize >= sizeof (link_duplex_t)); + bcopy(&duplex, pr_val, sizeof (link_duplex_t)); + break; + + case MAC_PROP_SPEED: + ASSERT(pr_valsize >= sizeof (uint64_t)); + bcopy(&speed, pr_val, sizeof (speed)); + break; + + case MAC_PROP_PRIVATE: + if (strcmp(pr_name, EIB_DLPROP_GW_EPORT_STATE) == 0) { + if (ss->ei_gw_eport_state == FIP_EPORT_UP) { + (void) snprintf(pr_val, pr_valsize, + "%s", "up"); + } else { + (void) snprintf(pr_val, pr_valsize, + "%s", "down"); + } + } else if (strcmp(pr_name, EIB_DLPROP_HCA_GUID) == 0) { + (void) snprintf(pr_val, pr_valsize, "%llX", + (u_longlong_t)ss->ei_props->ep_hca_guid); + + } else if (strcmp(pr_name, EIB_DLPROP_PORT_GUID) == 0) { + (void) snprintf(pr_val, pr_valsize, "%llX", + (u_longlong_t)((ss->ei_props->ep_sgid).gid_guid)); + } + break; + + default: + err = ENOTSUP; + break; + } + + return (err); +} + +/*ARGSUSED*/ +static void +eib_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + switch (pr_num) { + case MAC_PROP_DUPLEX: 
+ case MAC_PROP_SPEED: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + break; + + case MAC_PROP_MTU: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_range_uint32(prh, ETHERMTU, ETHERMTU); + break; + + case MAC_PROP_PRIVATE: + if (strcmp(pr_name, EIB_DLPROP_GW_EPORT_STATE) == 0) { + mac_prop_info_set_default_str(prh, "up "); + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } else if (strcmp(pr_name, EIB_DLPROP_HCA_GUID) == 0) { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } else if (strcmp(pr_name, EIB_DLPROP_PORT_GUID) == 0) { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + } +} + +static int +eib_state_init(eib_t *ss) +{ + kthread_t *kt; + + /* + * Initialize synchronization primitives + */ + mutex_init(&ss->ei_vnic_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->ei_av_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->ei_ev_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->ei_rxpost_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->ei_vnic_req_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->ei_ka_vnics_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ss->ei_vnic_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ss->ei_ev_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ss->ei_rxpost_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ss->ei_vnic_req_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ss->ei_ka_vnics_cv, NULL, CV_DEFAULT, NULL); + + /* + * Create a node state structure and initialize + */ + ss->ei_node_state = kmem_zalloc(sizeof (eib_node_state_t), KM_SLEEP); + ss->ei_node_state->ns_link_state = LINK_STATE_UNKNOWN; + mutex_init(&ss->ei_node_state->ns_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ss->ei_node_state->ns_cv, NULL, CV_DEFAULT, NULL); + + /* + * Allocate for gathering statistics + */ + ss->ei_stats = kmem_zalloc(sizeof (eib_stats_t), KM_SLEEP); + + /* + * Start up service threads + */ + kt = thread_create(NULL, 0, eib_events_handler, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_events_handler = kt->t_did; + + kt = 
thread_create(NULL, 0, eib_refill_rwqes, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_rwqes_refiller = kt->t_did; + + kt = thread_create(NULL, 0, eib_vnic_creator, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_vnic_creator = kt->t_did; + + kt = thread_create(NULL, 0, eib_manage_keepalives, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_keepalives_manager = kt->t_did; + + /* + * Set default state of gw eport + */ + ss->ei_gw_eport_state = FIP_EPORT_UP; + + /* + * Do static initializations of common structures + */ + eib_reserved_gid.gid_prefix = 0; + eib_reserved_gid.gid_guid = 0; + + return (EIB_E_SUCCESS); +} + +static int +eib_add_event_callbacks(eib_t *ss) +{ + int ret; + ddi_eventcookie_t login_ack_evc; + ddi_eventcookie_t gw_alive_evc; + ddi_eventcookie_t gw_info_evc; + + /* + * Add callback for receiving vnic login acks from the gateway + */ + if ((ret = ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_LOGIN_ACK, + &login_ack_evc)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_add_event_callbacks: " + "ddi_get_eventcookie(LOGIN_ACK) failed, ret=%d", ret); + return (EIB_E_FAILURE); + } + if ((ret = ddi_add_event_handler(ss->ei_dip, login_ack_evc, + eib_login_ack_cb, ss, &ss->ei_login_ack_cb)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_add_event_callbacks: " + "ddi_add_event_handler(LOGIN_ACK) failed, ret=%d", ret); + return (EIB_E_FAILURE); + } + + /* + * Add callback for receiving status on gateway transitioning from + * not-available to available + */ + if ((ret = ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_GW_AVAILABLE, + &gw_alive_evc)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_add_event_callbacks: " + "ddi_get_eventcookie(GW_AVAILABLE) failed, ret=%d", ret); + (void) ddi_remove_event_handler(ss->ei_login_ack_cb); + return (EIB_E_FAILURE); + } + if ((ret = ddi_add_event_handler(ss->ei_dip, gw_alive_evc, + eib_gw_alive_cb, ss, &ss->ei_gw_alive_cb)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, 
"eib_add_event_callbacks: " + "ddi_add_event_handler(GW_AVAILABLE) failed, ret=%d", ret); + (void) ddi_remove_event_handler(ss->ei_login_ack_cb); + return (EIB_E_FAILURE); + } + + /* + * Add callback for receiving gateway info update + */ + if ((ret = ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_GW_INFO_UPDATE, + &gw_info_evc)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_add_event_callbacks: " + "ddi_get_eventcookie(GW_INFO_UPDATE) failed, ret=%d", ret); + (void) ddi_remove_event_handler(ss->ei_gw_alive_cb); + (void) ddi_remove_event_handler(ss->ei_login_ack_cb); + return (EIB_E_FAILURE); + } + if ((ret = ddi_add_event_handler(ss->ei_dip, gw_info_evc, + eib_gw_info_cb, ss, &ss->ei_gw_info_cb)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_add_event_callbacks: " + "ddi_add_event_handler(GW_INFO) failed, ret=%d", ret); + (void) ddi_remove_event_handler(ss->ei_gw_alive_cb); + (void) ddi_remove_event_handler(ss->ei_login_ack_cb); + return (EIB_E_FAILURE); + } + + return (EIB_E_SUCCESS); +} + +static int +eib_register_with_mac(eib_t *ss, dev_info_t *dip) +{ + mac_register_t *macp; + int ret; + + if ((macp = mac_alloc(MAC_VERSION)) == NULL) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_register_with_mac: " + "mac_alloc(MAC_VERSION=%d) failed", MAC_VERSION); + return (EIB_E_FAILURE); + } + + /* + * Note that when we register with mac during attach, we don't + * have the mac address yet (we'll get that after we login into + * the gateway) so we'll simply register a zero macaddr that + * we'll overwrite later during plumb, in eib_m_start(). Likewise, + * we'll also update the max-sdu with the correct MTU after we + * figure it out when we login to the gateway during plumb. 
+ */ + macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; + macp->m_driver = ss; + macp->m_dip = dip; + macp->m_src_addr = eib_zero_mac; + macp->m_callbacks = &eib_m_callbacks; + macp->m_min_sdu = 0; + macp->m_max_sdu = ETHERMTU; + macp->m_margin = VLAN_TAGSZ; + macp->m_priv_props = eib_pvt_props; + + ret = mac_register(macp, &ss->ei_mac_hdl); + mac_free(macp); + + if (ret != 0) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_register_with_mac: " + "mac_register() failed, ret=%d", ret); + return (EIB_E_FAILURE); + } + + return (EIB_E_SUCCESS); +} + +static void +eib_rb_attach(eib_t *ss, uint_t progress) +{ + ibt_status_t ret; + int instance; + + if (progress & EIB_ATTACH_REGISTER_MAC_DONE) + eib_rb_register_with_mac(ss); + + if (progress & EIB_ATTACH_EV_CBS_ADDED) + eib_rb_add_event_callbacks(ss); + + if (progress & EIB_ATTACH_IBT_ATT_DONE) { + ret = ibt_detach(ss->ei_ibt_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_rb_attach: " + "ibt_detach() failed, ret=%d", ret); + } + ss->ei_ibt_hdl = NULL; + } + + if (progress & EIB_ATTACH_STATE_INIT_DONE) + eib_rb_state_init(ss); + + if (progress & EIB_ATTACH_PROPS_PARSED) + eib_rb_get_props(ss); + + if (progress & EIB_ATTACH_STATE_ALLOCD) { + instance = ddi_get_instance(ss->ei_dip); + ddi_soft_state_free(eib_state, instance); + } +} + +static void +eib_rb_state_init(eib_t *ss) +{ + /* + * Terminate service threads + */ + if (ss->ei_keepalives_manager) { + eib_stop_manage_keepalives(ss); + ss->ei_keepalives_manager = 0; + } + if (ss->ei_vnic_creator) { + eib_stop_vnic_creator(ss); + ss->ei_vnic_creator = 0; + } + if (ss->ei_rwqes_refiller) { + eib_stop_refill_rwqes(ss); + ss->ei_rwqes_refiller = 0; + } + if (ss->ei_events_handler) { + eib_stop_events_handler(ss); + ss->ei_events_handler = 0; + } + + /* + * Remove space allocated for gathering statistics + */ + if (ss->ei_stats) { + kmem_free(ss->ei_stats, sizeof (eib_stats_t)); + ss->ei_stats = NULL; + } + + /* + * Remove space allocated for keeping node 
state + */ + if (ss->ei_node_state) { + cv_destroy(&ss->ei_node_state->ns_cv); + mutex_destroy(&ss->ei_node_state->ns_lock); + kmem_free(ss->ei_node_state, sizeof (eib_node_state_t)); + ss->ei_node_state = NULL; + } + + /* + * Finally, destroy all synchronization resources + */ + cv_destroy(&ss->ei_ka_vnics_cv); + cv_destroy(&ss->ei_vnic_req_cv); + cv_destroy(&ss->ei_rxpost_cv); + cv_destroy(&ss->ei_ev_cv); + cv_destroy(&ss->ei_vnic_cv); + mutex_destroy(&ss->ei_ka_vnics_lock); + mutex_destroy(&ss->ei_vnic_req_lock); + mutex_destroy(&ss->ei_rxpost_lock); + mutex_destroy(&ss->ei_ev_lock); + mutex_destroy(&ss->ei_av_lock); + mutex_destroy(&ss->ei_vnic_lock); +} + +static void +eib_rb_add_event_callbacks(eib_t *ss) +{ + ddi_eventcookie_t evc; + + if (ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_GW_INFO_UPDATE, + &evc) == DDI_SUCCESS) { + (void) ddi_remove_event_handler(ss->ei_gw_info_cb); + ss->ei_gw_info_cb = NULL; + } + + if (ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_GW_AVAILABLE, + &evc) == DDI_SUCCESS) { + (void) ddi_remove_event_handler(ss->ei_gw_alive_cb); + ss->ei_gw_alive_cb = NULL; + } + + if (ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_LOGIN_ACK, + &evc) == DDI_SUCCESS) { + (void) ddi_remove_event_handler(ss->ei_login_ack_cb); + ss->ei_login_ack_cb = NULL; + } +} + +static void +eib_rb_register_with_mac(eib_t *ss) +{ + int ret; + + if ((ret = mac_unregister(ss->ei_mac_hdl)) != 0) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_register_with_mac: " + "mac_unregister() failed, ret=%d", ret); + } + + ss->ei_mac_hdl = NULL; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_rsrc.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1233 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static int eib_rsrc_setup_txbufs(eib_t *, int *); +static int eib_rsrc_setup_rxbufs(eib_t *, int *); +static int eib_rsrc_setup_lsobufs(eib_t *, int *); +static void eib_rsrc_init_wqe_pool(eib_t *, eib_wqe_pool_t **, + ib_memlen_t, int); +static void eib_rsrc_fini_wqe_pool(eib_t *, eib_wqe_pool_t **); +static boolean_t eib_rsrc_ok_to_free_pool(eib_t *, eib_wqe_pool_t *, boolean_t); +static int eib_rsrc_grab_wqes(eib_t *, eib_wqe_pool_t *, eib_wqe_t **, uint_t, + uint_t *, int); +static void eib_rsrc_return_wqes(eib_t *, eib_wqe_pool_t *, eib_wqe_t **, + uint_t); + +static void eib_rb_rsrc_setup_txbufs(eib_t *, boolean_t); +static void eib_rb_rsrc_setup_rxbufs(eib_t *, boolean_t); +static void eib_rb_rsrc_setup_lsobufs(eib_t *, boolean_t); + +/* + * Definitions private to this file + */ +static uint_t eib_lso_num_bufs = EIB_LSO_NUM_BUFS; /* tunable? 
*/ + +int +eib_rsrc_setup_bufs(eib_t *ss, int *err) +{ + if (eib_rsrc_setup_txbufs(ss, err) != EIB_E_SUCCESS) + return (EIB_E_FAILURE); + + if (ss->ei_caps->cp_lso_maxlen && ss->ei_caps->cp_cksum_flags && + ss->ei_caps->cp_resv_lkey_capab) { + if (eib_rsrc_setup_lsobufs(ss, err) != EIB_E_SUCCESS) { + eib_rb_rsrc_setup_txbufs(ss, B_FALSE); + return (EIB_E_FAILURE); + } + } + + if (eib_rsrc_setup_rxbufs(ss, err) != EIB_E_SUCCESS) { + eib_rb_rsrc_setup_lsobufs(ss, B_FALSE); + eib_rb_rsrc_setup_txbufs(ss, B_FALSE); + return (EIB_E_FAILURE); + } + + return (EIB_E_SUCCESS); +} + +int +eib_rsrc_grab_swqes(eib_t *ss, eib_wqe_t **wqes, uint_t n_req, uint_t *actual, + int pri) +{ + eib_wqe_t *wqe; + uint32_t *encap_hdr; + int ret; + int i; + + ASSERT(ss->ei_tx != NULL); + + ret = eib_rsrc_grab_wqes(ss, ss->ei_tx, wqes, n_req, actual, pri); + if (ret != EIB_E_SUCCESS) + return (EIB_E_FAILURE); + + /* + * See note for eib_rsrc_grab_swqe() + */ + for (i = 0; i < (*actual); i++) { + wqe = wqes[i]; + wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; + wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest; + wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; + wqe->qe_wr.send.wr_nds = 1; + wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl; + wqe->qe_nxt_post = NULL; + wqe->qe_iov_hdl = NULL; + + encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr; + *encap_hdr = htonl(EIB_TX_ENCAP_HDR); + } + + return (EIB_E_SUCCESS); +} + +int +eib_rsrc_grab_rwqes(eib_t *ss, eib_wqe_t **wqes, uint_t n_req, uint_t *actual, + int pri) +{ + ASSERT(ss->ei_rx != NULL); + + return (eib_rsrc_grab_wqes(ss, ss->ei_rx, wqes, n_req, actual, pri)); +} + +int +eib_rsrc_grab_lsobufs(eib_t *ss, uint_t req_sz, ibt_wr_ds_t *sgl, uint32_t *nds) +{ + eib_lsobkt_t *bkt = ss->ei_lso; + eib_lsobuf_t *elem; + eib_lsobuf_t *nxt; + uint_t frag_sz; + uint_t num_needed; + int i; + + ASSERT(req_sz != 0); + ASSERT(sgl != NULL); + ASSERT(nds != NULL); + + /* + * Determine how many bufs we'd need for the size requested + */ + num_needed = req_sz / 
EIB_LSO_BUFSZ; + if ((frag_sz = req_sz % EIB_LSO_BUFSZ) != 0) + num_needed++; + + if (bkt == NULL) + return (EIB_E_FAILURE); + + /* + * If we don't have enough lso bufs, return failure + */ + mutex_enter(&bkt->bk_lock); + if (bkt->bk_nfree < num_needed) { + mutex_exit(&bkt->bk_lock); + return (EIB_E_FAILURE); + } + + /* + * Pick the first "num_needed" bufs from the free list + */ + elem = bkt->bk_free_head; + for (i = 0; i < num_needed; i++) { + ASSERT(elem->lb_isfree != 0); + ASSERT(elem->lb_buf != NULL); + + nxt = elem->lb_next; + + sgl[i].ds_va = (ib_vaddr_t)(uintptr_t)elem->lb_buf; + sgl[i].ds_key = bkt->bk_lkey; + sgl[i].ds_len = EIB_LSO_BUFSZ; + + elem->lb_isfree = 0; + elem->lb_next = NULL; + + elem = nxt; + } + bkt->bk_free_head = elem; + + /* + * If the requested size is not a multiple of EIB_LSO_BUFSZ, we need + * to adjust the last sgl entry's length. Since we know we need atleast + * one, the i-1 use below is ok. + */ + if (frag_sz) { + sgl[i-1].ds_len = frag_sz; + } + + /* + * Update nfree count and return + */ + bkt->bk_nfree -= num_needed; + + mutex_exit(&bkt->bk_lock); + + *nds = num_needed; + + return (EIB_E_SUCCESS); +} + +eib_wqe_t * +eib_rsrc_grab_swqe(eib_t *ss, int pri) +{ + eib_wqe_t *wqe = NULL; + uint32_t *encap_hdr; + + ASSERT(ss->ei_tx != NULL); + (void) eib_rsrc_grab_wqes(ss, ss->ei_tx, &wqe, 1, NULL, pri); + + /* + * Let's reset the swqe basic wr parameters to default. We need + * to do this because this swqe could've previously been used + * for a checksum offload (when the flags would've been set) + * or for an LSO send (in which case the opcode would've been set + * to a different value), or been iov mapped (in which case the + * sgl/nds could've been set to different values). We'll make + * it easy and initialize it here, so simple transactions can + * go through without any special effort by the caller. 
+ * + * Note that even though the wqe structure is common for both + * send and recv, they're in two independent pools and the wqe + * type remains the same throughout its lifetime. So we don't + * have to worry about resetting any other field. + */ + if (wqe) { + wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; + wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest; + wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; + wqe->qe_wr.send.wr_nds = 1; + wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl; + wqe->qe_nxt_post = NULL; + wqe->qe_iov_hdl = NULL; + + encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr; + *encap_hdr = htonl(EIB_TX_ENCAP_HDR); + } + + return (wqe); +} + +eib_wqe_t * +eib_rsrc_grab_rwqe(eib_t *ss, int pri) +{ + eib_wqe_t *wqe = NULL; + + ASSERT(ss->ei_rx != NULL); + (void) eib_rsrc_grab_wqes(ss, ss->ei_rx, &wqe, 1, NULL, pri); + + return (wqe); +} + +void +eib_rsrc_return_swqe(eib_t *ss, eib_wqe_t *wqe, eib_chan_t *chan) +{ + ASSERT(ss->ei_tx != NULL); + + eib_rsrc_return_wqes(ss, ss->ei_tx, &wqe, 1); + if (chan) { + eib_rsrc_decr_posted_swqe(ss, chan); + } +} + + +void +eib_rsrc_return_rwqe(eib_t *ss, eib_wqe_t *wqe, eib_chan_t *chan) +{ + ASSERT(ss->ei_rx != NULL); + + eib_rsrc_return_wqes(ss, ss->ei_rx, &wqe, 1); + if (chan) { + eib_rsrc_decr_posted_rwqe(ss, chan); + } +} + +void +eib_rsrc_return_lsobufs(eib_t *ss, ibt_wr_ds_t *sgl_p, uint32_t nds) +{ + eib_lsobkt_t *bkt = ss->ei_lso; + eib_lsobuf_t *elem; + uint8_t *va; + ptrdiff_t ndx; + int i; + + /* + * Nowhere to return the buffers to ?? 
+ */ + if (bkt == NULL) + return; + + mutex_enter(&bkt->bk_lock); + + for (i = 0; i < nds; i++) { + va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; + + ASSERT(va >= bkt->bk_mem); + ASSERT(va < (bkt->bk_mem + bkt->bk_nelem * EIB_LSO_BUFSZ)); + + /* + * Figure out the buflist element this sgl buffer corresponds + * to and put it back at the head + */ + ndx = ((uintptr_t)va - (uintptr_t)bkt->bk_mem) / EIB_LSO_BUFSZ; + elem = bkt->bk_bufl + ndx; + + ASSERT(elem->lb_isfree == 0); + ASSERT(elem->lb_buf == va); + + elem->lb_isfree = 1; + elem->lb_next = bkt->bk_free_head; + bkt->bk_free_head = elem; + } + bkt->bk_nfree += nds; + + /* + * If the number of available lso buffers just crossed the + * threshold, wakeup anyone who may be sleeping on the event. + */ + if (((bkt->bk_nfree - nds) < EIB_LSO_FREE_BUFS_THRESH) && + (bkt->bk_nfree >= EIB_LSO_FREE_BUFS_THRESH)) { + cv_broadcast(&bkt->bk_cv); + } + + mutex_exit(&bkt->bk_lock); +} + +/*ARGSUSED*/ +void +eib_rsrc_decr_posted_swqe(eib_t *ss, eib_chan_t *chan) +{ + ASSERT(chan != NULL); + + mutex_enter(&chan->ch_tx_lock); + + chan->ch_tx_posted--; + if ((chan->ch_tear_down) && (chan->ch_tx_posted == 0)) { + cv_signal(&chan->ch_tx_cv); + } + + mutex_exit(&chan->ch_tx_lock); +} + +void +eib_rsrc_decr_posted_rwqe(eib_t *ss, eib_chan_t *chan) +{ + eib_chan_t *tail; + boolean_t queue_for_refill = B_FALSE; + + ASSERT(chan != NULL); + + /* + * Decrement the ch_rx_posted count. If we are tearing this channel + * down, signal the waiter when the count reaches 0. If we aren't + * tearing the channel down, see if the count has gone below the low + * water mark. If it has, and if this channel isn't already being + * refilled, queue the channel up with the service thread for a + * rwqe refill. 
+ */ + mutex_enter(&chan->ch_rx_lock); + chan->ch_rx_posted--; + if (chan->ch_tear_down) { + if (chan->ch_rx_posted == 0) + cv_signal(&chan->ch_rx_cv); + } else if (chan->ch_rx_posted < chan->ch_lwm_rwqes) { + if (chan->ch_rx_refilling == B_FALSE) { + chan->ch_rx_refilling = B_TRUE; + queue_for_refill = B_TRUE; + } + } + mutex_exit(&chan->ch_rx_lock); + + if (queue_for_refill) { + mutex_enter(&ss->ei_rxpost_lock); + + chan->ch_rxpost_next = NULL; + for (tail = ss->ei_rxpost; tail; tail = tail->ch_rxpost_next) { + if (tail->ch_rxpost_next == NULL) + break; + } + if (tail) { + tail->ch_rxpost_next = chan; + } else { + ss->ei_rxpost = chan; + } + + cv_signal(&ss->ei_rxpost_cv); + mutex_exit(&ss->ei_rxpost_lock); + } +} + +void +eib_rsrc_txwqes_needed(eib_t *ss) +{ + eib_wqe_pool_t *wp = ss->ei_tx; + + EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf); + + mutex_enter(&wp->wp_lock); + if ((wp->wp_status & EIB_TXWQE_SHORT) == 0) { + wp->wp_status |= EIB_TXWQE_SHORT; + cv_broadcast(&wp->wp_cv); + } + mutex_exit(&wp->wp_lock); +} + +void +eib_rsrc_lsobufs_needed(eib_t *ss) +{ + eib_lsobkt_t *bkt = ss->ei_lso; + + EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf); + + if (bkt == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rsrc_lsobufs_needed: " + "lso bufs seem to be needed even though " + "LSO support was not advertised"); + return; + } + + mutex_enter(&bkt->bk_lock); + if ((bkt->bk_status & EIB_LBUF_SHORT) == 0) { + bkt->bk_status |= EIB_LBUF_SHORT; + cv_broadcast(&bkt->bk_cv); + } + mutex_exit(&bkt->bk_lock); +} + +boolean_t +eib_rsrc_rxpool_low(eib_wqe_t *wqe) +{ + eib_wqe_pool_t *wp = wqe->qe_pool; + boolean_t ret = B_FALSE; + + /* + * Set the EIB_RXWQE_SHORT flag when the number of free wqes + * in the rx pool falls below the low threshold for rwqes and + * clear it only when the number of free wqes gets back above + * the high water mark. 
+ */ + mutex_enter(&wp->wp_lock); + + if (wp->wp_nfree <= EIB_NFREE_RWQES_LOW) { + wp->wp_status |= (EIB_RXWQE_SHORT); + } else if (wp->wp_nfree >= EIB_NFREE_RWQES_HWM) { + wp->wp_status &= (~EIB_RXWQE_SHORT); + } + + if ((wp->wp_status & EIB_RXWQE_SHORT) == EIB_RXWQE_SHORT) + ret = B_TRUE; + + mutex_exit(&wp->wp_lock); + + return (ret); +} + +void +eib_rb_rsrc_setup_bufs(eib_t *ss, boolean_t force) +{ + eib_rb_rsrc_setup_rxbufs(ss, force); + eib_rb_rsrc_setup_lsobufs(ss, force); + eib_rb_rsrc_setup_txbufs(ss, force); +} + +static int +eib_rsrc_setup_txbufs(eib_t *ss, int *err) +{ + eib_wqe_pool_t *tx; + eib_wqe_t *wqe; + ibt_ud_dest_hdl_t dest; + ibt_mr_attr_t attr; + ibt_mr_desc_t desc; + ibt_status_t ret; + kthread_t *kt; + uint32_t *encap_hdr; + uint8_t *buf; + uint_t mtu = ss->ei_props->ep_mtu; + uint_t tx_bufsz; + uint_t blk; + uint_t ndx; + uint_t i; + int lso_enabled; + + /* + * Try to allocate and initialize the tx wqe pool + */ + if (ss->ei_tx != NULL) + return (EIB_E_SUCCESS); + + /* + * If we keep the tx buffers as mtu-sized, then potentially every + * LSO request that cannot be satisfactorily mapped, will use up + * the 8K large (default size) lso buffers. This may be inadvisable + * given that lso buffers are a scarce resource. Instead, we'll + * slightly raise the size of the copy buffers in the send wqes + * (say to EIB_TX_COPY_THRESH) so that requests that cannot be + * mapped could still avoid using the 8K LSO buffers if they're + * less than the copy threshold size. + */ + lso_enabled = ss->ei_caps->cp_lso_maxlen && + ss->ei_caps->cp_cksum_flags && ss->ei_caps->cp_resv_lkey_capab; + tx_bufsz = ((lso_enabled) && (EIB_TX_COPY_THRESH > mtu)) ? 
+ EIB_TX_COPY_THRESH : mtu; + + eib_rsrc_init_wqe_pool(ss, &ss->ei_tx, tx_bufsz, EIB_WP_TYPE_TX); + tx = ss->ei_tx; + + /* + * Register the TX memory region with IBTF for use + */ + attr.mr_vaddr = tx->wp_vaddr; + attr.mr_len = tx->wp_memsz; + attr.mr_as = NULL; + attr.mr_flags = IBT_MR_SLEEP; + + ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr, + &tx->wp_mr, &desc); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_txbufs: " + "ibt_register_mr() failed for tx " + "region (0x%llx, 0x%llx) with ret=%d", + attr.mr_vaddr, attr.mr_len, ret); + + *err = EINVAL; + goto rsrc_setup_txbufs_fail; + } + tx->wp_lkey = desc.md_lkey; + + /* + * Now setup the tx wqes + */ + buf = (uint8_t *)(uintptr_t)(tx->wp_vaddr); + for (i = 0, blk = 0; blk < EIB_BLKS_PER_POOL; blk++) { + for (ndx = 0; ndx < EIB_WQES_PER_BLK; ndx++, i++) { + wqe = &tx->wp_wqe[i]; + /* + * Allocate a UD destination handle + */ + ret = ibt_alloc_ud_dest(ss->ei_hca_hdl, + IBT_UD_DEST_NO_FLAGS, ss->ei_pd_hdl, &dest); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_rsrc_setup_txbufs: " + "ibt_alloc_ud_dest(hca_hdl=0x%llx) " + "failed, ret=%d", ss->ei_hca_hdl, ret); + + *err = ENOMEM; + goto rsrc_setup_txbufs_fail; + } + + /* + * These parameters should remain fixed throughout the + * lifetime of this wqe. + */ + wqe->qe_pool = tx; + wqe->qe_cpbuf = buf; + wqe->qe_bufsz = tx_bufsz; + + /* + * The qe_dest and qe_payload_hdr are specific to tx + * only, but remain unchanged throughout the lifetime + * of the wqe. + * + * The payload header is normally used when we have an + * LSO packet to send. Since the EoIB encapsulation + * header won't be part of the message we get from the + * network layer, we'll need to copy the lso header into + * a new buffer every time before we hand over the LSO + * send request to the hca driver. 
+ */ + wqe->qe_dest = dest; + wqe->qe_payload_hdr = + kmem_zalloc(EIB_MAX_PAYLOAD_HDR_SZ, KM_SLEEP); + + /* + * The encapsulation header is at the start of the + * payload header and is initialized to the default + * encapsulation header we use (no multiple segments, + * no FCS). This part of the header is not expected + * to change. + */ + encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr; + *encap_hdr = htonl(EIB_TX_ENCAP_HDR); + + /* + * The parameter set below are used in tx and rx paths. + * These parameters (except ds_key) are reset to these + * default values in eib_rsrc_return_wqes(). + */ + wqe->qe_sgl.ds_key = tx->wp_lkey; + wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf; + wqe->qe_sgl.ds_len = wqe->qe_bufsz; + wqe->qe_mp = NULL; + wqe->qe_info = + ((blk & EIB_WQEBLK_MASK) << EIB_WQEBLK_SHIFT) | + ((ndx & EIB_WQENDX_MASK) << EIB_WQENDX_SHIFT) | + ((uint_t)EIB_WQE_TX << EIB_WQETYP_SHIFT); + + /* + * These tx-specific parameters (except wr_id and + * wr_trans) are reset in eib_rsrc_grab_swqes() to make + * sure any freshly acquired swqe from the pool has + * these default settings for the caller. + */ + wqe->qe_wr.send.wr_id = (ibt_wrid_t)(uintptr_t)wqe; + wqe->qe_wr.send.wr_trans = IBT_UD_SRV; + wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; + wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest; + wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; + wqe->qe_wr.send.wr_nds = 1; + wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl; + wqe->qe_nxt_post = NULL; + wqe->qe_iov_hdl = NULL; + + buf += wqe->qe_bufsz; + } + } + + /* + * Before returning, create a kernel thread to monitor the status + * of wqes in the tx wqe pool. Note that this thread cannot be + * created from eib_state_init() during attach(), since the thread + * expects the wqe pool to be allocated and ready when it starts, + * and the tx bufs initialization only happens during eib_m_start(). 
+ */ + kt = thread_create(NULL, 0, eib_monitor_tx_wqes, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_txwqe_monitor = kt->t_did; + + return (EIB_E_SUCCESS); + +rsrc_setup_txbufs_fail: + eib_rb_rsrc_setup_txbufs(ss, B_FALSE); + return (EIB_E_FAILURE); +} + +static int +eib_rsrc_setup_rxbufs(eib_t *ss, int *err) +{ + eib_wqe_pool_t *rx; + eib_wqe_t *wqe; + ibt_mr_attr_t attr; + ibt_mr_desc_t desc; + ibt_status_t ret; + uint8_t *buf; + uint_t mtu = ss->ei_props->ep_mtu; + uint_t blk; + uint_t ndx; + uint_t i; + + /* + * Try to allocate and initialize the wqe pool. When this is called + * during a plumb via the mac m_start callback, we need to make + * sure there is a need to allocate a wqe pool afresh. If during a + * previous unplumb we didn't free the wqe pool because the nw layer + * was holding on to some rx buffers, we don't need to allocate new + * pool and set up the buffers again; we'll just start re-using the + * previous one. + */ + if (ss->ei_rx != NULL) + return (EIB_E_SUCCESS); + + /* + * The receive buffer has to work for all channels, specifically the + * data qp of the vnics. This means that the buffer must be large + * enough to hold MTU sized IB payload (including the EoIB and ethernet + * headers) plus the GRH. In addition, because the ethernet header is + * either 14 or 18 bytes (tagless or vlan tagged), we should have the + * buffer filled in such a way that the IP header starts at atleast a + * 4-byte aligned address. In order to do this, we need to have some + * additional room. 
+ */ + eib_rsrc_init_wqe_pool(ss, &ss->ei_rx, + mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM, EIB_WP_TYPE_RX); + rx = ss->ei_rx; + + /* + * Register the RX memory region with IBTF for use + */ + attr.mr_vaddr = rx->wp_vaddr; + attr.mr_len = rx->wp_memsz; + attr.mr_as = NULL; + attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; + + ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr, + &rx->wp_mr, &desc); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_rxbufs: " + "ibt_register_mr() failed for rx " + "region (0x%llx, 0x%llx) with ret=%d", + attr.mr_vaddr, attr.mr_len, ret); + + *err = EINVAL; + goto rsrc_setup_rxbufs_fail; + } + rx->wp_lkey = desc.md_lkey; + + /* + * Now setup the rx wqes + */ + buf = (uint8_t *)(uintptr_t)(rx->wp_vaddr); + for (i = 0, blk = 0; blk < EIB_BLKS_PER_POOL; blk++) { + for (ndx = 0; ndx < EIB_WQES_PER_BLK; ndx++, i++) { + wqe = &rx->wp_wqe[i]; + + /* + * These parameters should remain fixed throughout the + * lifetime of this recv wqe. The qe_frp will only be + * used by the data channel of vnics and will remain + * unused by other channels. + */ + wqe->qe_pool = rx; + wqe->qe_cpbuf = buf; + wqe->qe_bufsz = mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM; + wqe->qe_wr.recv.wr_id = (ibt_wrid_t)(uintptr_t)wqe; + wqe->qe_wr.recv.wr_nds = 1; + wqe->qe_wr.recv.wr_sgl = &wqe->qe_sgl; + wqe->qe_frp.free_func = eib_data_rx_recycle; + wqe->qe_frp.free_arg = (caddr_t)wqe; + + /* + * The parameter set below are used in tx and rx paths. + * These parameters (except ds_key) are reset to these + * default values in eib_rsrc_return_wqes(). 
+ */ + wqe->qe_sgl.ds_key = rx->wp_lkey; + wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf; + wqe->qe_sgl.ds_len = wqe->qe_bufsz; + wqe->qe_mp = NULL; + wqe->qe_info = + ((blk & EIB_WQEBLK_MASK) << EIB_WQEBLK_SHIFT) | + ((ndx & EIB_WQENDX_MASK) << EIB_WQENDX_SHIFT) | + ((uint_t)EIB_WQE_RX << EIB_WQETYP_SHIFT); + + /* + * These rx-specific parameters are also reset to + * these default values in eib_rsrc_return_wqes(). + */ + wqe->qe_chan = NULL; + wqe->qe_vnic_inst = -1; + + buf += (mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM); + } + } + + return (EIB_E_SUCCESS); + +rsrc_setup_rxbufs_fail: + eib_rb_rsrc_setup_rxbufs(ss, B_FALSE); + return (EIB_E_FAILURE); +} + +static int +eib_rsrc_setup_lsobufs(eib_t *ss, int *err) +{ + eib_lsobkt_t *bkt; + eib_lsobuf_t *elem; + eib_lsobuf_t *tail; + ibt_mr_attr_t attr; + ibt_mr_desc_t desc; + kthread_t *kt; + + uint8_t *lsomem; + uint8_t *memp; + ibt_status_t ret; + int i; + + /* + * Allocate the lso bucket and space for buffers + */ + bkt = kmem_zalloc(sizeof (eib_lsobkt_t), KM_SLEEP); + lsomem = kmem_zalloc(eib_lso_num_bufs * EIB_LSO_BUFSZ, KM_SLEEP); + + /* + * Register lso memory and save the lkey + */ + attr.mr_vaddr = (uint64_t)(uintptr_t)lsomem; + attr.mr_len = eib_lso_num_bufs * EIB_LSO_BUFSZ; + attr.mr_as = NULL; + attr.mr_flags = IBT_MR_SLEEP; + + ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr, + &bkt->bk_mr_hdl, &desc); + if (ret != IBT_SUCCESS) { + *err = EINVAL; + EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_lsobufs: " + "ibt_register_mr() failed for LSO " + "region (0x%llx, 0x%llx) with ret=%d", + attr.mr_vaddr, attr.mr_len, ret); + + kmem_free(lsomem, eib_lso_num_bufs * EIB_LSO_BUFSZ); + kmem_free(bkt, sizeof (eib_lsobkt_t)); + + return (EIB_E_FAILURE); + } + bkt->bk_lkey = desc.md_lkey; + + /* + * Now allocate the buflist. 
Note that the elements in the buflist and + * the buffers in the lso memory have a permanent 1-1 relation, so we + * can always derive the address of a buflist entry from the address of + * an lso buffer. + */ + bkt->bk_bufl = kmem_zalloc(eib_lso_num_bufs * sizeof (eib_lsobuf_t), + KM_SLEEP); + + /* + * Set up the lso buf chain + */ + memp = lsomem; + elem = bkt->bk_bufl; + for (i = 0; i < eib_lso_num_bufs; i++) { + elem->lb_isfree = 1; + elem->lb_buf = memp; + elem->lb_next = elem + 1; + + tail = elem; + + memp += EIB_LSO_BUFSZ; + elem++; + } + tail->lb_next = NULL; + + /* + * Set up the LSO buffer information in eib state + */ + bkt->bk_free_head = bkt->bk_bufl; + bkt->bk_mem = lsomem; + bkt->bk_nelem = eib_lso_num_bufs; + bkt->bk_nfree = bkt->bk_nelem; + + mutex_init(&bkt->bk_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&bkt->bk_cv, NULL, CV_DEFAULT, NULL); + + ss->ei_lso = bkt; + + /* + * Before returning, create a kernel thread to monitor the status + * of lso bufs + */ + kt = thread_create(NULL, 0, eib_monitor_lso_bufs, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_lsobufs_monitor = kt->t_did; + + return (EIB_E_SUCCESS); +} + +static void +eib_rsrc_init_wqe_pool(eib_t *ss, eib_wqe_pool_t **wpp, ib_memlen_t bufsz, + int wp_type) +{ + eib_wqe_pool_t *wp; + uint_t wp_wqesz; + int i; + + ASSERT(wpp != NULL); + ASSERT(*wpp == NULL); + + /* + * Allocate the wqe pool, wqes and bufs + */ + wp = kmem_zalloc(sizeof (eib_wqe_pool_t), KM_SLEEP); + wp_wqesz = EIB_WQES_PER_POOL * sizeof (eib_wqe_t); + wp->wp_wqe = (eib_wqe_t *)kmem_zalloc(wp_wqesz, KM_SLEEP); + wp->wp_memsz = EIB_WQES_PER_POOL * bufsz; + wp->wp_vaddr = (ib_vaddr_t)(uintptr_t)kmem_zalloc(wp->wp_memsz, + KM_SLEEP); + wp->wp_ss = ss; + wp->wp_type = wp_type; + wp->wp_nfree_lwm = (wp_type == EIB_WP_TYPE_TX) ? 
+ EIB_NFREE_SWQES_LWM : EIB_NFREE_RWQES_LWM; + + /* + * Initialize the lock and bitmaps: everything is available at first, + * but note that if the number of blocks per pool is less than 64, we + * need to initialize those extra bits as "unavailable" - these will + * remain unavailable throughout. + */ + mutex_init(&wp->wp_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&wp->wp_cv, NULL, CV_DEFAULT, NULL); + + wp->wp_nfree = EIB_WQES_PER_POOL; + wp->wp_free_blks = (EIB_BLKS_PER_POOL >= 64) ? (~0) : + (((uint64_t)1 << EIB_BLKS_PER_POOL) - 1); + for (i = 0; i < EIB_BLKS_PER_POOL; i++) + wp->wp_free_wqes[i] = ~0; + + *wpp = wp; +} + +/*ARGSUSED*/ +static void +eib_rsrc_fini_wqe_pool(eib_t *ss, eib_wqe_pool_t **wpp) +{ + eib_wqe_pool_t *wp; + + ASSERT(wpp != NULL); + + wp = *wpp; + ASSERT(*wpp != NULL); + + cv_destroy(&wp->wp_cv); + mutex_destroy(&wp->wp_lock); + + kmem_free((void *)(uintptr_t)(wp->wp_vaddr), wp->wp_memsz); + kmem_free(wp->wp_wqe, EIB_WQES_PER_POOL * sizeof (eib_wqe_t)); + kmem_free(wp, sizeof (eib_wqe_pool_t)); + + *wpp = NULL; +} + +/*ARGSUSED*/ +static boolean_t +eib_rsrc_ok_to_free_pool(eib_t *ss, eib_wqe_pool_t *wp, boolean_t force) +{ + uint64_t free_blks; + int i; + + /* + * See if we can release all memory allocated for buffers, wqes and + * the pool. Note that in the case of data channel rx buffers, some + * of the buffers may not be free if the nw layer is holding on to + * them still. If this is the case, we cannot free the wqe pool now + * or a subsequent access by the nw layer to the buffers will cause + * a panic. + */ + ASSERT(wp != NULL); + + /* + * If force-free flag is set, we can always release the memory. + * Note that this flag is unused currently, and should be removed. + */ + if (force == B_TRUE) + return (B_TRUE); + + mutex_enter(&wp->wp_lock); + + /* + * If a whole block remains allocated, obviously we cannot free + * the pool + */ + free_blks = (EIB_BLKS_PER_POOL >= 64) ? 
(~0) : + (((uint64_t)1 << EIB_BLKS_PER_POOL) - 1); + if (wp->wp_free_blks != free_blks) { + mutex_exit(&wp->wp_lock); + return (B_FALSE); + } + + /* + * If even a single wqe within any one block remains in-use, we + * cannot free the pool + */ + for (i = 0; i < EIB_BLKS_PER_POOL; i++) { + if (wp->wp_free_wqes[i] != (~0)) { + mutex_exit(&wp->wp_lock); + return (B_FALSE); + } + } + + mutex_exit(&wp->wp_lock); + + return (B_TRUE); +} + +/*ARGSUSED*/ +static int +eib_rsrc_grab_wqes(eib_t *ss, eib_wqe_pool_t *wp, eib_wqe_t **wqes, + uint_t n_req, uint_t *actual, int pri) +{ + uint_t n_allocd = 0; + int blk; + int ndx; + int wqe_ndx; + + ASSERT(wp != NULL); + ASSERT(wqes != NULL); + + mutex_enter(&wp->wp_lock); + + /* + * If this is a low priority request, adjust the number requested + * so we don't allocate beyond the low-water-mark + */ + if (pri == EIB_WPRI_LO) { + if (wp->wp_nfree <= wp->wp_nfree_lwm) + n_req = 0; + else if ((wp->wp_nfree - n_req) < wp->wp_nfree_lwm) + n_req = wp->wp_nfree - wp->wp_nfree_lwm; + } + + for (n_allocd = 0; n_allocd < n_req; n_allocd++) { + /* + * If the entire pool is unavailable, quit + */ + if (wp->wp_free_blks == 0) + break; + + /* + * Find the first wqe that's available + */ + blk = EIB_FIND_LSB_SET(wp->wp_free_blks); + ASSERT(blk != -1); + ndx = EIB_FIND_LSB_SET(wp->wp_free_wqes[blk]); + ASSERT(ndx != -1); + + /* + * Mark the wqe as allocated + */ + wp->wp_free_wqes[blk] &= (~((uint64_t)1 << ndx)); + + /* + * If this was the last free wqe in this block, mark + * the block itself as unavailable + */ + if (wp->wp_free_wqes[blk] == 0) + wp->wp_free_blks &= (~((uint64_t)1 << blk)); + + /* + * Return this wqe to the caller + */ + wqe_ndx = blk * EIB_WQES_PER_BLK + ndx; + wqes[n_allocd] = &(wp->wp_wqe[wqe_ndx]); + } + + wp->wp_nfree -= n_allocd; + + mutex_exit(&wp->wp_lock); + + if (n_allocd == 0) + return (EIB_E_FAILURE); + + if (actual) { + *actual = n_allocd; + } + + return (EIB_E_SUCCESS); +} + +/*ARGSUSED*/ +static void 
+eib_rsrc_return_wqes(eib_t *ss, eib_wqe_pool_t *wp, eib_wqe_t **wqes, + uint_t n_wqes) +{ + eib_wqe_t *wqe; + uint_t n_freed = 0; + uint_t blk; + uint_t ndx; + + ASSERT(wp != NULL); + ASSERT(wqes != NULL); + + mutex_enter(&wp->wp_lock); + for (n_freed = 0; n_freed < n_wqes; n_freed++) { + wqe = wqes[n_freed]; + + /* + * This wqe is being returned back to the pool, so clear + * any wqe flags and reset buffer address and size in the + * single segment sgl back to what they were initially. + * Also erase any mblk pointer and callback function ptrs. + */ + wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)wqe->qe_cpbuf; + wqe->qe_sgl.ds_len = wqe->qe_bufsz; + wqe->qe_mp = NULL; + wqe->qe_chan = NULL; + wqe->qe_vnic_inst = -1; + wqe->qe_info &= (~EIB_WQEFLGS_MASK); + + /* + * Mark the wqe free in its block + */ + blk = EIB_WQE_BLK(wqe->qe_info); + ndx = EIB_WQE_NDX(wqe->qe_info); + + wp->wp_free_wqes[blk] |= ((uint64_t)1 << ndx); + + /* + * This block now has atleast one wqe free, so mark + * the block itself as available and move on to the + * next wqe to free + */ + wp->wp_free_blks |= ((uint64_t)1 << blk); + } + + wp->wp_nfree += n_freed; + + /* + * If the number of available wqes in the pool has just crossed + * the high-water-mark, wakeup anyone who may be sleeping on it. + */ + if ((wp->wp_type == EIB_WP_TYPE_TX) && + ((wp->wp_nfree - n_freed) < EIB_NFREE_SWQES_HWM) && + (wp->wp_nfree >= EIB_NFREE_SWQES_HWM)) { + cv_broadcast(&wp->wp_cv); + } + + mutex_exit(&wp->wp_lock); +} + +static void +eib_rb_rsrc_setup_txbufs(eib_t *ss, boolean_t force) +{ + eib_wqe_pool_t *wp = ss->ei_tx; + eib_wqe_t *wqe; + ibt_ud_dest_hdl_t dest; + ibt_status_t ret; + uint8_t *plhdr; + int i; + + if (wp == NULL) + return; + + /* + * Check if it's ok to free the tx wqe pool (i.e. all buffers have + * been reclaimed) and if so, stop the txwqe monitor thread (and wait + * for it to die), release the UD destination handles, deregister + * memory and fini the wqe pool. 
+ */ + if (eib_rsrc_ok_to_free_pool(ss, wp, force)) { + eib_stop_monitor_tx_wqes(ss); + + for (i = 0; i < EIB_WQES_PER_POOL; i++) { + wqe = &wp->wp_wqe[i]; + if ((plhdr = wqe->qe_payload_hdr) != NULL) { + kmem_free(plhdr, EIB_MAX_PAYLOAD_HDR_SZ); + } + if ((dest = wqe->qe_dest) != NULL) { + ret = ibt_free_ud_dest(dest); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_rsrc_setup_txbufs: " + "ibt_free_ud_dest() failed, ret=%d", + ret); + } + } + } + if (wp->wp_mr) { + if ((ret = ibt_deregister_mr(ss->ei_hca_hdl, + wp->wp_mr)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_rsrc_setup_txbufs: " + "ibt_deregister_mr() failed, ret=%d", ret); + } + wp->wp_mr = NULL; + } + eib_rsrc_fini_wqe_pool(ss, &ss->ei_tx); + } +} + +void +eib_rb_rsrc_setup_rxbufs(eib_t *ss, boolean_t force) +{ + eib_wqe_pool_t *rx = ss->ei_rx; + ibt_status_t ret; + + if (rx == NULL) + return; + + /* + * Check if it's ok to free the rx wqe pool (i.e. all buffers have + * been reclaimed) and if so, deregister memory and fini the wqe pool. + */ + if (eib_rsrc_ok_to_free_pool(ss, rx, force)) { + if (rx->wp_mr) { + if ((ret = ibt_deregister_mr(ss->ei_hca_hdl, + rx->wp_mr)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_rsrc_setup_rxbufs: " + "ibt_deregister_mr() failed, ret=%d", ret); + } + rx->wp_mr = NULL; + } + + eib_rsrc_fini_wqe_pool(ss, &ss->ei_rx); + } +} + +static void +eib_rb_rsrc_setup_lsobufs(eib_t *ss, boolean_t force) +{ + eib_lsobkt_t *bkt; + ibt_status_t ret; + + /* + * Remove the lso bucket from the state + */ + if ((bkt = ss->ei_lso) == NULL) + return; + + /* + * Try to stop the lso bufs monitor thread. If we fail, we simply + * return. We'll have another shot at it later from detach() with + * the force flag set. 
+ */ + if (eib_stop_monitor_lso_bufs(ss, force) != EIB_E_SUCCESS) + return; + + /* + * Free the buflist + */ + if (bkt->bk_bufl) { + kmem_free(bkt->bk_bufl, bkt->bk_nelem * sizeof (eib_lsobuf_t)); + bkt->bk_bufl = NULL; + } + + /* + * Deregister LSO memory and free it + */ + if (bkt->bk_mr_hdl) { + if ((ret = ibt_deregister_mr(ss->ei_hca_hdl, + bkt->bk_mr_hdl)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_rsrc_setup_lsobufs: " + "ibt_deregister_mr() failed, ret=%d", ret); + } + bkt->bk_mr_hdl = NULL; + } + if (bkt->bk_mem) { + kmem_free(bkt->bk_mem, bkt->bk_nelem * EIB_LSO_BUFSZ); + bkt->bk_mem = NULL; + } + + /* + * Destroy the mutex and condvar + */ + cv_destroy(&bkt->bk_cv); + mutex_destroy(&bkt->bk_lock); + + /* + * Finally, free the lso bucket itself + */ + kmem_free(bkt, sizeof (eib_lsobkt_t)); + ss->ei_lso = NULL; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_svc.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1001 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/callb.h> +#include <sys/mac_provider.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Thread to handle EoIB events asynchronously + */ +void +eib_events_handler(eib_t *ss) +{ + eib_event_t *evi; + eib_event_t *nxt; + kmutex_t ci_lock; + callb_cpr_t ci; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_EVENTS_HDLR); + +wait_for_event: + mutex_enter(&ss->ei_ev_lock); + while ((evi = ss->ei_event) == NULL) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&ss->ei_ev_cv, &ss->ei_ev_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + /* + * Are we being asked to die ? 
+ */ + if (evi->ev_code == EIB_EV_SHUTDOWN) { + while (evi) { + nxt = evi->ev_next; + kmem_free(evi, sizeof (eib_event_t)); + evi = nxt; + } + ss->ei_event = NULL; + mutex_exit(&ss->ei_ev_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + /* + * Otherwise, pull out the first entry from our work queue + */ + ss->ei_event = evi->ev_next; + evi->ev_next = NULL; + + mutex_exit(&ss->ei_ev_lock); + + /* + * Process this event + * + * Note that we don't want to race with plumb/unplumb in this + * handler, since we may have to restart vnics or do stuff that + * may get re-initialized or released if we allowed plumb/unplumb + * to happen in parallel. + */ + eib_mac_set_nic_state(ss, EIB_NIC_RESTARTING); + + switch (evi->ev_code) { + case EIB_EV_PORT_DOWN: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_PORT_DOWN"); + + eib_mac_link_down(ss, B_FALSE); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_PORT_DOWN"); + break; + + case EIB_EV_PORT_UP: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_PORT_UP"); + + eib_ibt_link_mod(ss); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_PORT_UP"); + break; + + case EIB_EV_PKEY_CHANGE: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_PKEY_CHANGE"); + + eib_ibt_link_mod(ss); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_PKEY_CHANGE"); + break; + + case EIB_EV_SGID_CHANGE: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_SGID_CHANGE"); + + eib_ibt_link_mod(ss); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_SGID_CHANGE"); + break; + + case EIB_EV_CLNT_REREG: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_CLNT_REREG"); + + eib_ibt_link_mod(ss); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_CLNT_REREG"); + break; + + case EIB_EV_GW_UP: + 
EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_GW_UP"); + + /* + * EoIB nexus has notified us that our gateway is now + * reachable. Unless we already think it is reachable, + * mark it so in our records and try to resurrect dead + * vnics. + */ + mutex_enter(&ss->ei_vnic_lock); + if (ss->ei_gw_unreachable == B_FALSE) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: gw reachable"); + mutex_exit(&ss->ei_vnic_lock); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_GW_UP"); + break; + } + ss->ei_gw_unreachable = B_FALSE; + mutex_exit(&ss->ei_vnic_lock); + + /* + * If we've not even started yet, we have nothing to do. + */ + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_GW_UP"); + break; + } + + if (eib_mac_hca_portstate(ss, NULL, NULL) != EIB_E_SUCCESS) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: " + "HCA portstate failed, marking link down"); + + eib_mac_link_down(ss, B_FALSE); + } else { + uint8_t vn0_mac[ETHERADDRL]; + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: " + "HCA portstate ok, resurrecting zombies"); + + bcopy(eib_zero_mac, vn0_mac, ETHERADDRL); + eib_vnic_resurrect_zombies(ss, vn0_mac); + + /* + * If we've resurrected the zombies because the gateway + * went down and came back, it is possible our unicast + * mac address changed from what it was earlier. If + * so, we need to update our unicast address with the + * mac layer before marking the link up. 
+ */ + if (bcmp(vn0_mac, eib_zero_mac, ETHERADDRL) != 0) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: updating unicast " + "addr to %x:%x:%x:%x:%x:%x", vn0_mac[0], + vn0_mac[1], vn0_mac[2], vn0_mac[3], + vn0_mac[4], vn0_mac[5]); + + mac_unicst_update(ss->ei_mac_hdl, vn0_mac); + } + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: eib_mac_link_up(B_FALSE)"); + + eib_mac_link_up(ss, B_FALSE); + } + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_GW_UP"); + break; + + case EIB_EV_GW_INFO_UPDATE: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_GW_INFO_UPDATE"); + + if (evi->ev_arg) { + eib_update_props(ss, (eib_gw_info_t *)(evi->ev_arg)); + kmem_free(evi->ev_arg, sizeof (eib_gw_info_t)); + } + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_GW_INFO_UPDATE"); + break; + + case EIB_EV_MCG_DELETED: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin-End EIB_EV_MCG_DELETED"); + break; + + case EIB_EV_MCG_CREATED: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin-End EIB_EV_MCG_CREATED"); + break; + + case EIB_EV_GW_EPORT_DOWN: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin-End EIB_EV_GW_EPORT_DOWN"); + break; + + case EIB_EV_GW_DOWN: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin-End EIB_EV_GW_DOWN"); + break; + } + + eib_mac_clr_nic_state(ss, EIB_NIC_RESTARTING); + + kmem_free(evi, sizeof (eib_event_t)); + goto wait_for_event; + + /*NOTREACHED*/ +} + +void +eib_svc_enqueue_event(eib_t *ss, eib_event_t *evi) +{ + eib_event_t *elem = NULL; + eib_event_t *tail = NULL; + + mutex_enter(&ss->ei_ev_lock); + + /* + * Notice to shutdown has a higher priority than the + * rest and goes to the head of the list. Everything + * else goes at the end. 
+ */ + if (evi->ev_code == EIB_EV_SHUTDOWN) { + evi->ev_next = ss->ei_event; + ss->ei_event = evi; + } else { + for (elem = ss->ei_event; elem; elem = elem->ev_next) + tail = elem; + + if (tail) + tail->ev_next = evi; + else + ss->ei_event = evi; + } + + cv_signal(&ss->ei_ev_cv); + mutex_exit(&ss->ei_ev_lock); +} + +/* + * Thread to refill channels with rwqes whenever they get low. + */ +void +eib_refill_rwqes(eib_t *ss) +{ + eib_chan_t *chan; + kmutex_t ci_lock; + callb_cpr_t ci; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_RWQES_REFILLER); + +wait_for_refill_work: + mutex_enter(&ss->ei_rxpost_lock); + + while ((ss->ei_rxpost == NULL) && (ss->ei_rxpost_die == 0)) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&ss->ei_rxpost_cv, &ss->ei_rxpost_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + /* + * Discard all requests for refill if we're being asked to die + */ + if (ss->ei_rxpost_die) { + ss->ei_rxpost = NULL; + mutex_exit(&ss->ei_rxpost_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + ASSERT(ss->ei_rxpost != NULL); + + /* + * Take the first element out of the queue + */ + chan = ss->ei_rxpost; + ss->ei_rxpost = chan->ch_rxpost_next; + chan->ch_rxpost_next = NULL; + + mutex_exit(&ss->ei_rxpost_lock); + + /* + * Try to post a bunch of recv wqes into this channel. If we + * fail, it means that we haven't even been able to post a + * single recv wqe. This is alarming, but there's nothing + * we can do. We just move on to the next channel needing + * our service. 
+ */ + if (eib_chan_post_rx(ss, chan, NULL) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_refill_rwqes: eib_chan_post_rx() failed"); + } + + /* + * Mark it to indicate that the refilling is done + */ + mutex_enter(&chan->ch_rx_lock); + chan->ch_rx_refilling = B_FALSE; + mutex_exit(&chan->ch_rx_lock); + + goto wait_for_refill_work; + + /*NOTREACHED*/ +} + +/* + * Thread to create or restart vnics when required + */ +void +eib_vnic_creator(eib_t *ss) +{ + eib_vnic_req_t *vrq; + eib_vnic_req_t *elem; + eib_vnic_req_t *nxt; + kmutex_t ci_lock; + callb_cpr_t ci; + uint_t vr_req; + uint8_t *vr_mac; + int ret; + int err; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_VNIC_CREATOR); + +wait_for_vnic_req: + mutex_enter(&ss->ei_vnic_req_lock); + + while ((vrq = ss->ei_vnic_req) == NULL) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&ss->ei_vnic_req_cv, &ss->ei_vnic_req_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + /* + * Pull out the first request + */ + ss->ei_vnic_req = vrq->vr_next; + vrq->vr_next = NULL; + + vr_req = vrq->vr_req; + vr_mac = vrq->vr_mac; + + switch (vr_req) { + case EIB_CR_REQ_DIE: + case EIB_CR_REQ_FLUSH: + /* + * Cleanup all pending reqs and failed reqs + */ + for (elem = ss->ei_vnic_req; elem; elem = nxt) { + nxt = elem->vr_next; + kmem_free(elem, sizeof (eib_vnic_req_t)); + } + for (elem = ss->ei_failed_vnic_req; elem; elem = nxt) { + nxt = elem->vr_next; + kmem_free(elem, sizeof (eib_vnic_req_t)); + } + ss->ei_vnic_req = NULL; + ss->ei_failed_vnic_req = NULL; + ss->ei_pending_vnic_req = NULL; + mutex_exit(&ss->ei_vnic_req_lock); + + break; + + case EIB_CR_REQ_NEW_VNIC: + ss->ei_pending_vnic_req = vrq; + mutex_exit(&ss->ei_vnic_req_lock); + + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_vnic_creator: " + "new vnic creation request for %x:%x:%x:%x:%x:%x, 0x%x", + vr_mac[0], 
vr_mac[1], vr_mac[2], vr_mac[3], vr_mac[4], + vr_mac[5], vrq->vr_vlan); + + /* + * Make sure we don't race with the plumb/unplumb code. If + * the eoib instance has been unplumbed already, we ignore any + * creation requests that may have been pending. + */ + eib_mac_set_nic_state(ss, EIB_NIC_STARTING); + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) != + EIB_NIC_STARTED) { + mutex_enter(&ss->ei_vnic_req_lock); + ss->ei_pending_vnic_req = NULL; + mutex_exit(&ss->ei_vnic_req_lock); + eib_mac_clr_nic_state(ss, EIB_NIC_STARTING); + break; + } + + /* + * Try to create a new vnic with the supplied parameters. + */ + err = 0; + if ((ret = eib_vnic_create(ss, vrq->vr_mac, vrq->vr_vlan, + NULL, &err)) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_creator: " + "eib_vnic_create(mac=%x:%x:%x:%x:%x:%x, vlan=0x%x) " + "failed, ret=%d", vr_mac[0], vr_mac[1], vr_mac[2], + vr_mac[3], vr_mac[4], vr_mac[5], vrq->vr_vlan, err); + } + + /* + * If we failed, add this vnic req to our failed list (unless + * it already exists there), so we won't try to create this + * vnic again. Whether we fail or succeed, we're done with + * processing this req, so clear the pending req. + */ + mutex_enter(&ss->ei_vnic_req_lock); + if ((ret != EIB_E_SUCCESS) && (err != EEXIST)) { + vrq->vr_next = ss->ei_failed_vnic_req; + ss->ei_failed_vnic_req = vrq; + vrq = NULL; + } + ss->ei_pending_vnic_req = NULL; + mutex_exit(&ss->ei_vnic_req_lock); + + /* + * Notify the mac layer that it should retry its tx again. If we + * had created the vnic successfully, we'll be able to send the + * packets; if we had not been successful, we'll drop packets on + * this vnic. 
+ */ + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_vnic_creator: calling mac_tx_update()"); + mac_tx_update(ss->ei_mac_hdl); + + eib_mac_clr_nic_state(ss, EIB_NIC_STARTING); + break; + + default: + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_vnic_creator: " + "unknown request 0x%lx, ignoring", vrq->vr_req); + break; + } + + /* + * Free the current req and quit if we have to + */ + if (vrq) { + kmem_free(vrq, sizeof (eib_vnic_req_t)); + } + + if (vr_req == EIB_CR_REQ_DIE) { + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + goto wait_for_vnic_req; + /*NOTREACHED*/ +} + +/* + * Thread to monitor tx wqes and update the mac layer when needed. + * Note that this thread can only be started after the tx wqe pool + * has been allocated and initialized. + */ +void +eib_monitor_tx_wqes(eib_t *ss) +{ + eib_wqe_pool_t *wp = ss->ei_tx; + kmutex_t ci_lock; + callb_cpr_t ci; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_TXWQES_MONITOR); + + ASSERT(wp != NULL); + +monitor_wqe_status: + mutex_enter(&wp->wp_lock); + + /* + * Wait till someone falls short of wqes + */ + while (wp->wp_status == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&wp->wp_cv, &wp->wp_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + /* + * Have we been asked to die ? 
+ */ + if (wp->wp_status & EIB_TXWQE_MONITOR_DIE) { + mutex_exit(&wp->wp_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + ASSERT((wp->wp_status & EIB_TXWQE_SHORT) != 0); + + /* + * Start monitoring free wqes till they cross min threshold + */ + while ((wp->wp_nfree < EIB_NFREE_SWQES_HWM) && + ((wp->wp_status & EIB_TXWQE_MONITOR_DIE) == 0)) { + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&wp->wp_cv, &wp->wp_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + /* + * Have we been asked to die ? + */ + if (wp->wp_status & EIB_TXWQE_MONITOR_DIE) { + mutex_exit(&wp->wp_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + ASSERT(wp->wp_nfree >= EIB_NFREE_SWQES_HWM); + wp->wp_status &= (~EIB_TXWQE_SHORT); + + mutex_exit(&wp->wp_lock); + + /* + * Inform the mac layer that tx resources are now available + * and go back to monitoring + */ + if (ss->ei_mac_hdl) { + mac_tx_update(ss->ei_mac_hdl); + } + goto monitor_wqe_status; + + /*NOTREACHED*/ +} + +/* + * Thread to monitor lso bufs and update the mac layer as needed. + * Note that this thread can only be started after the lso buckets + * have been allocated and initialized. 
+ */ +void +eib_monitor_lso_bufs(eib_t *ss) +{ + eib_lsobkt_t *bkt = ss->ei_lso; + kmutex_t ci_lock; + callb_cpr_t ci; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_LSOBUFS_MONITOR); + + ASSERT(bkt != NULL); + +monitor_lso_status: + mutex_enter(&bkt->bk_lock); + + /* + * Wait till someone falls short of LSO buffers or we're asked + * to die + */ + while (bkt->bk_status == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&bkt->bk_cv, &bkt->bk_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + if (bkt->bk_status & EIB_LBUF_MONITOR_DIE) { + mutex_exit(&bkt->bk_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + ASSERT((bkt->bk_status & EIB_LBUF_SHORT) != 0); + + /* + * Start monitoring free LSO buffers till there are enough + * free buffers available + */ + while ((bkt->bk_nfree < EIB_LSO_FREE_BUFS_THRESH) && + ((bkt->bk_status & EIB_LBUF_MONITOR_DIE) == 0)) { + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&bkt->bk_cv, &bkt->bk_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + if (bkt->bk_status & EIB_LBUF_MONITOR_DIE) { + mutex_exit(&bkt->bk_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + /* + * We have enough lso buffers available now + */ + ASSERT(bkt->bk_nfree >= EIB_LSO_FREE_BUFS_THRESH); + bkt->bk_status &= (~EIB_LBUF_SHORT); + + mutex_exit(&bkt->bk_lock); + + /* + * Inform the mac layer that tx lso resources are now available + * and go back to monitoring + */ + if (ss->ei_mac_hdl) { + mac_tx_update(ss->ei_mac_hdl); + } + goto monitor_lso_status; + + /*NOTREACHED*/ +} + +/* + * Thread to manage the keepalive requirements for vnics and the gateway. 
+ */ +void +eib_manage_keepalives(eib_t *ss) +{ + eib_ka_vnics_t *elem; + eib_ka_vnics_t *nxt; + clock_t deadline; + int64_t lbolt64; + int err; + kmutex_t ci_lock; + callb_cpr_t ci; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_EVENTS_HDLR); + + mutex_enter(&ss->ei_ka_vnics_lock); + +periodic_keepalive: + deadline = ddi_get_lbolt() + ss->ei_gw_props->pp_vnic_ka_ticks; + + while ((ss->ei_ka_vnics_event & + (EIB_KA_VNICS_DIE | EIB_KA_VNICS_TIMED_OUT)) == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + if (cv_timedwait(&ss->ei_ka_vnics_cv, &ss->ei_ka_vnics_lock, + deadline) == -1) { + ss->ei_ka_vnics_event |= EIB_KA_VNICS_TIMED_OUT; + } + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + if (ss->ei_ka_vnics_event & EIB_KA_VNICS_DIE) { + for (elem = ss->ei_ka_vnics; elem; elem = nxt) { + nxt = elem->ka_next; + kmem_free(elem, sizeof (eib_ka_vnics_t)); + } + ss->ei_ka_vnics = NULL; + mutex_exit(&ss->ei_ka_vnics_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + /* + * Are there any vnics that need keepalive management ? + */ + ss->ei_ka_vnics_event &= ~EIB_KA_VNICS_TIMED_OUT; + if (ss->ei_ka_vnics == NULL) + goto periodic_keepalive; + + /* + * Ok, we need to send vnic keepalives to our gateway. But first + * check if the gateway heartbeat is good as of this moment. Note + * that we need do get the lbolt value after acquiring ei_vnic_lock + * to ensure that ei_gw_last_heartbeat does not change before the + * comparison (to avoid a negative value in the comparison result + * causing us to incorrectly assume that the gateway heartbeat has + * stopped). 
+ */ + mutex_enter(&ss->ei_vnic_lock); + + lbolt64 = ddi_get_lbolt64(); + + if (ss->ei_gw_last_heartbeat != 0) { + if ((lbolt64 - ss->ei_gw_last_heartbeat) > + ss->ei_gw_props->pp_gw_ka_ticks) { + + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_manage_keepalives: no keepalives from gateway " + "0x%x for hca_guid=0x%llx, port=0x%x, " + "last_gw_ka=0x%llx", ss->ei_gw_props->pp_gw_portid, + ss->ei_props->ep_hca_guid, + ss->ei_props->ep_port_num, + ss->ei_gw_last_heartbeat); + + for (elem = ss->ei_ka_vnics; elem; elem = nxt) { + nxt = elem->ka_next; + ss->ei_zombie_vnics |= + ((uint64_t)1 << elem->ka_vnic->vn_instance); + kmem_free(elem, sizeof (eib_ka_vnics_t)); + } + ss->ei_ka_vnics = NULL; + ss->ei_gw_unreachable = B_TRUE; + mutex_exit(&ss->ei_vnic_lock); + + eib_mac_link_down(ss, B_FALSE); + + goto periodic_keepalive; + } + } + mutex_exit(&ss->ei_vnic_lock); + + for (elem = ss->ei_ka_vnics; elem; elem = elem->ka_next) + (void) eib_fip_heartbeat(ss, elem->ka_vnic, &err); + + goto periodic_keepalive; + /*NOTREACHED*/ +} + +void +eib_stop_events_handler(eib_t *ss) +{ + eib_event_t *evi; + + evi = kmem_zalloc(sizeof (eib_event_t), KM_SLEEP); + evi->ev_code = EIB_EV_SHUTDOWN; + evi->ev_arg = NULL; + + eib_svc_enqueue_event(ss, evi); + + thread_join(ss->ei_events_handler); +} + +void +eib_stop_refill_rwqes(eib_t *ss) +{ + mutex_enter(&ss->ei_rxpost_lock); + + ss->ei_rxpost_die = 1; + + cv_signal(&ss->ei_rxpost_cv); + mutex_exit(&ss->ei_rxpost_lock); + + thread_join(ss->ei_rwqes_refiller); +} + +void +eib_stop_vnic_creator(eib_t *ss) +{ + eib_vnic_req_t *vrq; + + vrq = kmem_zalloc(sizeof (eib_vnic_req_t), KM_SLEEP); + vrq->vr_req = EIB_CR_REQ_DIE; + vrq->vr_next = NULL; + + eib_vnic_enqueue_req(ss, vrq); + + thread_join(ss->ei_vnic_creator); +} + +void +eib_stop_monitor_tx_wqes(eib_t *ss) +{ + eib_wqe_pool_t *wp = ss->ei_tx; + + mutex_enter(&wp->wp_lock); + + wp->wp_status |= EIB_TXWQE_MONITOR_DIE; + + cv_signal(&wp->wp_cv); + mutex_exit(&wp->wp_lock); + + 
thread_join(ss->ei_txwqe_monitor); +} + +int +eib_stop_monitor_lso_bufs(eib_t *ss, boolean_t force) +{ + eib_lsobkt_t *bkt = ss->ei_lso; + + mutex_enter(&bkt->bk_lock); + + /* + * If there are some buffers still not reaped and the force + * flag is not set, return without doing anything. Otherwise, + * stop the lso bufs monitor and wait for it to die. + */ + if ((bkt->bk_nelem != bkt->bk_nfree) && (force == B_FALSE)) { + mutex_exit(&bkt->bk_lock); + return (EIB_E_FAILURE); + } + + bkt->bk_status |= EIB_LBUF_MONITOR_DIE; + + cv_signal(&bkt->bk_cv); + mutex_exit(&bkt->bk_lock); + + thread_join(ss->ei_lsobufs_monitor); + return (EIB_E_SUCCESS); +} + +void +eib_stop_manage_keepalives(eib_t *ss) +{ + mutex_enter(&ss->ei_ka_vnics_lock); + + ss->ei_ka_vnics_event |= EIB_KA_VNICS_DIE; + + cv_signal(&ss->ei_ka_vnics_cv); + mutex_exit(&ss->ei_ka_vnics_lock); + + thread_join(ss->ei_keepalives_manager); +} + +void +eib_flush_vnic_reqs(eib_t *ss) +{ + eib_vnic_req_t *vrq; + + vrq = kmem_zalloc(sizeof (eib_vnic_req_t), KM_SLEEP); + vrq->vr_req = EIB_CR_REQ_FLUSH; + vrq->vr_next = NULL; + + eib_vnic_enqueue_req(ss, vrq); +} + +/*ARGSUSED*/ +void +eib_gw_alive_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg, + void *impl_data) +{ + eib_t *ss = (eib_t *)arg; + eib_event_t *evi; + + evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP); + if (evi == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_alive_cb: " + "no memory, ignoring this gateway alive event"); + } else { + evi->ev_code = EIB_EV_GW_UP; + evi->ev_arg = NULL; + eib_svc_enqueue_event(ss, evi); + } +} + +/*ARGSUSED*/ +void +eib_login_ack_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg, + void *impl_data) +{ + eib_t *ss = (eib_t *)arg; + uint8_t *pkt = (uint8_t *)impl_data; + eib_login_data_t ld; + + /* + * We have received a login ack message from the gateway via the EoIB + * nexus (solicitation qpn). The packet is passed to us raw (unparsed) + * and we have to figure out if this is a vnic login ack. 
+ */ + if (eib_fip_parse_login_ack(ss, pkt + EIB_GRH_SZ, &ld) == EIB_E_SUCCESS) + eib_vnic_login_ack(ss, &ld); +} + +/*ARGSUSED*/ +void +eib_gw_info_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg, + void *impl_data) +{ + eib_t *ss = (eib_t *)arg; + eib_event_t *evi; + + evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP); + if (evi == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_info_cb: " + "no memory, ignoring this gateway props update event"); + return; + } + evi->ev_arg = kmem_zalloc(sizeof (eib_gw_info_t), KM_NOSLEEP); + if (evi->ev_arg == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_info_cb: " + "no memory, ignoring this gateway props update event"); + kmem_free(evi, sizeof (eib_event_t)); + return; + } + bcopy(impl_data, evi->ev_arg, sizeof (eib_gw_info_t)); + evi->ev_code = EIB_EV_GW_INFO_UPDATE; + + eib_svc_enqueue_event(ss, evi); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_vnic.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,2228 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static int eib_vnic_get_instance(eib_t *, int *); +static void eib_vnic_ret_instance(eib_t *, int); +static void eib_vnic_modify_enter(eib_t *, uint_t); +static void eib_vnic_modify_exit(eib_t *, uint_t); +static int eib_vnic_create_common(eib_t *, eib_vnic_t *, int *); +static int eib_vnic_set_partition(eib_t *, eib_vnic_t *, int *); +static void eib_vnic_make_vhub_mgid(uint8_t *, uint8_t, uint8_t *, uint8_t, + uint8_t, uint32_t, ib_gid_t *); +static int eib_vnic_attach_ctl_mcgs(eib_t *, eib_vnic_t *, int *); +static int eib_vnic_attach_vhub_table(eib_t *, eib_vnic_t *); +static int eib_vnic_attach_vhub_update(eib_t *, eib_vnic_t *); +static void eib_vnic_start_keepalives(eib_t *, eib_vnic_t *); +static int eib_vnic_lookup_dest(eib_vnic_t *, uint8_t *, uint16_t, + eib_vhub_map_t *, ibt_mcg_info_t *, int *); +static void eib_vnic_leave_all_data_mcgs(eib_t *, eib_vnic_t *); +static void eib_vnic_rejoin_data_mcgs(eib_t *, eib_vnic_t *); +static void eib_vnic_reattach_ctl_mcgs(eib_t *, eib_vnic_t *); +static void eib_rb_vnic_create_common(eib_t *, eib_vnic_t *, uint_t); +static void eib_rb_vnic_attach_ctl_mcgs(eib_t *, eib_vnic_t *); +static void eib_rb_vnic_attach_vhub_table(eib_t *, eib_vnic_t *); +static void eib_rb_vnic_attach_vhub_update(eib_t *, eib_vnic_t *); +static void eib_rb_vnic_start_keepalives(eib_t *, eib_vnic_t *); +static void eib_rb_vnic_join_data_mcg(eib_t *, eib_vnic_t *, uint8_t *); + +/* + * Definitions private to this file + */ +#define EIB_VNIC_STRUCT_ALLOCD 0x0001 +#define EIB_VNIC_GOT_INSTANCE 0x0002 +#define EIB_VNIC_CREATE_COMMON_DONE 0x0004 +#define EIB_VNIC_CTLQP_CREATED 0x0008 +#define EIB_VNIC_DATAQP_CREATED 0x0010 +#define EIB_VNIC_LOGIN_DONE 0x0020 +#define EIB_VNIC_PARTITION_SET 0x0040 +#define 
EIB_VNIC_RX_POSTED_TO_CTLQP	0x0080
#define	EIB_VNIC_RX_POSTED_TO_DATAQP	0x0100
#define	EIB_VNIC_ATTACHED_TO_CTL_MCGS	0x0200
#define	EIB_VNIC_GOT_VHUB_TABLE		0x0400
#define	EIB_VNIC_KEEPALIVES_STARTED	0x0800
#define	EIB_VNIC_BROADCAST_JOINED	0x1000

/*
 * Destination type (as classified by eib_vnic_lookup_dest())
 */
#define	EIB_TX_UNICAST		1
#define	EIB_TX_MULTICAST	2
#define	EIB_TX_BROADCAST	3

/*
 * Create (and log into the gateway) a new vnic for the given
 * {macaddr, vlan} tuple. A NULL macaddr is allowed (the gateway then
 * assigns the address). On success the new vnic is optionally returned
 * through vnicp. On failure, *err carries the errno-style reason
 * (EMFILE: no free vnic instance; EEXIST: an earlier creation for this
 * tuple failed; or whatever eib_vnic_create_common() reports) and all
 * partial state is rolled back via eib_rb_vnic_create().
 *
 * Serialized against other vnic modifications through
 * eib_vnic_modify_enter/exit(EIB_VN_BEING_CREATED).
 */
int
eib_vnic_create(eib_t *ss, uint8_t *macaddr, uint16_t vlan, eib_vnic_t **vnicp,
    int *err)
{
	eib_vnic_t *vnic = NULL;
	boolean_t failed_vnic = B_FALSE;
	uint_t progress = 0;

	eib_vnic_modify_enter(ss, EIB_VN_BEING_CREATED);

	/*
	 * When a previously created vnic is being resurrected due to a
	 * gateway reboot, there's a race possible where a creation request
	 * for the existing vnic could get filed with the vnic creator
	 * thread. So, before we go ahead with the creation of this vnic,
	 * make sure we already don't have the vnic.
	 */
	if (macaddr) {
		if (eib_data_lookup_vnic(ss, macaddr, vlan, vnicp,
		    &failed_vnic) == EIB_E_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_create: "
			    "vnic for mac=%x:%x:%x:%x:%x:%x, vlan=0x%x "
			    "already there, no duplicate creation", macaddr[0],
			    macaddr[1], macaddr[2], macaddr[3], macaddr[4],
			    macaddr[5], vlan);

			eib_vnic_modify_exit(ss, EIB_VN_BEING_CREATED);
			return (EIB_E_SUCCESS);
		} else if (failed_vnic) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_create: "
			    "vnic for mac=%x:%x:%x:%x:%x:%x, vlan=0x%x "
			    "failed earlier, shouldn't be here at all",
			    macaddr[0], macaddr[1], macaddr[2], macaddr[3],
			    macaddr[4], macaddr[5], vlan);

			*err = EEXIST;

			eib_vnic_modify_exit(ss, EIB_VN_BEING_CREATED);
			return (EIB_E_FAILURE);
		}
	}

	/*
	 * Allocate a vnic structure for this instance
	 */
	vnic = kmem_zalloc(sizeof (eib_vnic_t), KM_SLEEP);
	vnic->vn_ss = ss;
	vnic->vn_instance = -1;
	mutex_init(&vnic->vn_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vnic->vn_cv, NULL, CV_DEFAULT, NULL);

	progress |= EIB_VNIC_STRUCT_ALLOCD;

	/*
	 * Get a vnic instance
	 */
	if (eib_vnic_get_instance(ss, &vnic->vn_instance) != EIB_E_SUCCESS) {
		*err = EMFILE;
		goto vnic_create_fail;
	}
	progress |= EIB_VNIC_GOT_INSTANCE;

	/*
	 * Initialize vnic's basic parameters. Note that we set the 15-bit
	 * vnic id to send to gw during a login to be a 2-tuple of
	 * {devi_instance#, eoib_vnic_instance#}.
	 */
	vnic->vn_vlan = vlan;
	if (macaddr) {
		bcopy(macaddr, vnic->vn_macaddr, sizeof (vnic->vn_macaddr));
	}
	vnic->vn_id = (uint16_t)EIB_VNIC_ID(ss->ei_instance, vnic->vn_instance);

	/*
	 * Start up this vnic instance
	 */
	if (eib_vnic_create_common(ss, vnic, err) != EIB_E_SUCCESS)
		goto vnic_create_fail;

	progress |= EIB_VNIC_CREATE_COMMON_DONE;

	/*
	 * Return the created vnic
	 */
	if (vnicp) {
		*vnicp = vnic;
	}

	eib_vnic_modify_exit(ss, EIB_VN_BEING_CREATED);
	return (EIB_E_SUCCESS);

vnic_create_fail:
	/* Undo exactly what was done, per the progress bits */
	eib_rb_vnic_create(ss, vnic, progress);
	eib_vnic_modify_exit(ss, EIB_VN_BEING_CREATED);
	return (EIB_E_FAILURE);
}

/*
 * Tear down a vnic completely (rollback with all progress bits set),
 * serialized against other vnic modifications.
 */
void
eib_vnic_delete(eib_t *ss, eib_vnic_t *vnic)
{
	eib_vnic_modify_enter(ss, EIB_VN_BEING_DELETED);
	eib_rb_vnic_create(ss, vnic, ~0);
	eib_vnic_modify_exit(ss, EIB_VN_BEING_DELETED);
}

/*
 * Block until the vnic's login is acked/nacked by the gateway or until
 * EIB_LOGIN_TIMEOUT_USEC elapses. On failure, *err is set to ETIME
 * (timeout) or ECANCELED (nack or other state change).
 */
/*ARGSUSED*/
int
eib_vnic_wait_for_login_ack(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	clock_t deadline;
	int ret = EIB_E_SUCCESS;

	deadline = ddi_get_lbolt() + drv_usectohz(EIB_LOGIN_TIMEOUT_USEC);

	/*
	 * Wait for login ack/nack or wait time to get over. If we wake up
	 * with a login failure, record the reason.
	 */
	mutex_enter(&vnic->vn_lock);
	while (vnic->vn_state == EIB_LOGIN_ACK_WAIT) {
		if (cv_timedwait(&vnic->vn_cv, &vnic->vn_lock,
		    deadline) == -1) {
			/* timed out: mark it so, which ends the loop */
			if (vnic->vn_state == EIB_LOGIN_ACK_WAIT)
				vnic->vn_state = EIB_LOGIN_TIMED_OUT;
		}
	}

	if (vnic->vn_state != EIB_LOGIN_ACK_RCVD) {
		ret = EIB_E_FAILURE;
		*err = (vnic->vn_state == EIB_LOGIN_TIMED_OUT) ?
		    ETIME : ECANCELED;
	}
	mutex_exit(&vnic->vn_lock);

	return (ret);
}

/*
 * Process a login ack received from the gateway and wake up the
 * waiter in eib_vnic_wait_for_login_ack(). The ack is matched against
 * the single pending vnic (ei_vnic_pending); it is silently dropped if
 * there is no pending vnic, if the vnic id doesn't match, or if the
 * waiter is no longer in EIB_LOGIN_ACK_WAIT. The waiter is NACKed
 * (EIB_LOGIN_NACK_RCVD) on a set syndrome or on any mismatch between
 * what we asked for and what the gateway assigned; otherwise the login
 * data is saved into the vnic and the waiter is ACKed.
 */
void
eib_vnic_login_ack(eib_t *ss, eib_login_data_t *ld)
{
	eib_vnic_t *vnic;
	uint_t vnic_instance;
	uint_t hdrs_sz;
	uint16_t vnic_id;
	int nack = 1;

	/*
	 * The msb in the vnic id in login ack message is not
	 * part of our vNIC id.
	 */
	vnic_id = ld->ld_vnic_id & (~FIP_VL_VNIC_ID_MSBIT);

	/*
	 * Now, we deconstruct the vnic id and determine the vnic
	 * instance number. If this vnic_instance number isn't
	 * valid or the vnic_id of the vnic for this instance
	 * number doesn't match in our records, we quit.
	 */
	vnic_instance = EIB_VNIC_INSTANCE(vnic_id);
	if (vnic_instance >= EIB_MAX_VNICS)
		return;

	/*
	 * At this point, we haven't fully created the vnic, so
	 * this vnic should be present as ei_vnic_pending.
	 */
	mutex_enter(&ss->ei_vnic_lock);
	if ((vnic = ss->ei_vnic_pending) == NULL) {
		mutex_exit(&ss->ei_vnic_lock);
		return;
	} else if (vnic->vn_id != vnic_id) {
		mutex_exit(&ss->ei_vnic_lock);
		return;
	}
	mutex_exit(&ss->ei_vnic_lock);

	/*
	 * First check if the vnic is still sleeping, waiting
	 * for login ack. If not, we might as well quit now.
	 */
	mutex_enter(&vnic->vn_lock);
	if (vnic->vn_state != EIB_LOGIN_ACK_WAIT) {
		mutex_exit(&vnic->vn_lock);
		return;
	}

	/*
	 * We NACK the waiter under these conditions:
	 *
	 * . syndrome was set
	 * . vhub mtu is bigger than our max mtu (minus eoib/eth hdrs sz)
	 * . assigned vlan is different from requested vlan (except
	 *   when we didn't request a specific vlan)
	 * . when the assigned mac is different from the requested mac
	 *   (except when we didn't request a specific mac)
	 * . when the VP bit indicates that vlan tag should be used
	 *   but we had not specified a vlan tag in our request
	 * . when the VP bit indicates that vlan tag should not be
	 *   present and we'd specified a vlan tag in our request
	 *
	 * The last case is interesting: if we had not specified any vlan id
	 * in our request, but the gateway has assigned a vlan and asks us
	 * to use/expect that tag on every packet dealt by this vnic, it
	 * means effectively the EoIB driver has to insert/remove vlan
	 * tagging on this vnic traffic, since the nw layer on Solaris
	 * won't be using/expecting any tag on traffic for this vnic. This
	 * feature is not supported currently.
	 */
	hdrs_sz = EIB_ENCAP_HDR_SZ + sizeof (struct ether_header) + VLAN_TAGSZ;
	if (ld->ld_syndrome) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
		    "non-zero syndrome 0x%lx, NACK", ld->ld_syndrome);

	} else if (ld->ld_vhub_mtu > (ss->ei_props->ep_mtu - hdrs_sz)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
		    "vhub mtu (0x%x) bigger than port mtu (0x%x), NACK",
		    ld->ld_vhub_mtu, ss->ei_props->ep_mtu);

	} else if ((vnic->vn_vlan) && (vnic->vn_vlan != ld->ld_assigned_vlan)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
		    "assigned vlan (0x%x) different from asked (0x%x), "
		    "for vnic id 0x%x, NACK", ld->ld_assigned_vlan,
		    vnic->vn_vlan, vnic->vn_id);

	} else if (bcmp(vnic->vn_macaddr, eib_zero_mac, ETHERADDRL) &&
	    bcmp(vnic->vn_macaddr, ld->ld_assigned_mac, ETHERADDRL)) {
		uint8_t *asked, *got;

		asked = vnic->vn_macaddr;
		got = ld->ld_assigned_mac;

		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
		    "assigned mac (%x:%x:%x:%x:%x:%x) different from "
		    "asked (%x:%x:%x:%x:%x:%x) for vnic id 0x%x, NACK",
		    got[0], got[1], got[2], got[3], got[4], got[5], asked[0],
		    asked[1], asked[2], asked[3], asked[4], asked[5]);

	} else if ((vnic->vn_vlan == 0) && (ld->ld_vlan_in_packets)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
		    "asked for tagless vlan, but VP flag is set "
		    "for vnic id 0x%x, NACK", vnic->vn_id);

	} else if ((vnic->vn_vlan) && (!ld->ld_vlan_in_packets)) {
		/*
		 * Gateway assigned our vlan but didn't set the VP flag;
		 * optionally work around such (buggy) gateways.
		 */
		if (eib_wa_no_good_vp_flag) {
			ld->ld_vlan_in_packets = 1;
			ld->ld_vhub_id = EIB_VHUB_ID(ld->ld_gw_port_id,
			    ld->ld_assigned_vlan);
			nack = 0;
		} else {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
			    "vlan was assigned correctly, but VP flag is not "
			    "set for vnic id 0x%x, NACK", vnic->vn_id);
		}
	} else {
		ld->ld_vhub_id = EIB_VHUB_ID(ld->ld_gw_port_id,
		    ld->ld_assigned_vlan);
		nack = 0;
	}

	/*
	 * ACK/NACK the waiter
	 */
	if (nack) {
		vnic->vn_state = EIB_LOGIN_NACK_RCVD;
	} else {
		bcopy(ld, &vnic->vn_login_data, sizeof (eib_login_data_t));
		vnic->vn_state = EIB_LOGIN_ACK_RCVD;
	}

	cv_signal(&vnic->vn_cv);
	mutex_exit(&vnic->vn_lock);
}

/*
 * Block until the vnic's vhub table has been fully constructed (or the
 * construction fails/times out). On failure, *err is set to ETIME or
 * ECANCELED.
 */
int
eib_vnic_wait_for_table(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	clock_t deadline;
	int ret = EIB_E_SUCCESS;

	/*
	 * The EoIB spec does not detail exactly within what time a vhub table
	 * request is expected to be answered. However, it does mention that
	 * in the worst case, the vhub update messages from the gateway must
	 * be seen atleast once in 2.5 * GW_KA_PERIOD (already saved in
	 * pp_gw_ka_ticks), so we'll settle for that limit.
	 */
	deadline = ddi_get_lbolt() + ss->ei_gw_props->pp_gw_ka_ticks;

	/*
	 * Wait for vhub table to be constructed. If we wake up with a
	 * vhub table construction failure, record the reason.
	 */
	mutex_enter(&vnic->vn_lock);
	while (vnic->vn_state == EIB_LOGIN_TBL_WAIT) {
		if (cv_timedwait(&vnic->vn_cv, &vnic->vn_lock,
		    deadline) == -1) {
			if (vnic->vn_state == EIB_LOGIN_TBL_WAIT)
				vnic->vn_state = EIB_LOGIN_TIMED_OUT;
		}
	}

	if (vnic->vn_state != EIB_LOGIN_TBL_DONE) {
		ret = EIB_E_FAILURE;
		*err = (vnic->vn_state == EIB_LOGIN_TIMED_OUT) ?
		    ETIME : ECANCELED;
	}
	mutex_exit(&vnic->vn_lock);

	return (ret);
}

/*
 * Mark vhub table construction for the vnic as finished (successfully
 * or not) and wake the waiter in eib_vnic_wait_for_table().
 */
void
eib_vnic_vhub_table_done(eib_vnic_t *vnic, uint_t result_state)
{
	ASSERT(result_state == EIB_LOGIN_TBL_DONE ||
	    result_state == EIB_LOGIN_TBL_FAILED);

	/*
	 * Construction of vhub table for the vnic is done one way or
	 * the other. Set the login wait state appropriately and signal
	 * the waiter. If it's a vhub table failure, we shouldn't parse
	 * any more vhub table or vhub update packets until the vnic state
	 * is changed.
	 */
	mutex_enter(&vnic->vn_lock);
	vnic->vn_state = result_state;
	cv_signal(&vnic->vn_cv);
	mutex_exit(&vnic->vn_lock);
}

/*
 * Join (and attach the vnic's data channel to) the vhub data multicast
 * group for the given multicast mac, adding it to the channel's
 * ch_vhub_data list. If the mcg is already on the list, the duplicate
 * join is resolved: with `rejoin' set the old membership is dropped in
 * favor of the new one, otherwise the new join is dropped. On failure,
 * *err is set to ENOMEM or EINVAL.
 */
int
eib_vnic_join_data_mcg(eib_t *ss, eib_vnic_t *vnic, uint8_t *mcast_mac,
    boolean_t rejoin, int *err)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_mcg_t *mcg;
	eib_mcg_t *elem;
	eib_mcg_t *tail;
	ibt_mcg_info_t *mcg_info;
	ibt_mcg_attr_t mcg_attr;
	ibt_status_t ret;

	/*
	 * Compose the multicast MGID to join
	 */
	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));

	eib_vnic_make_vhub_mgid(ld->ld_gw_mgid_prefix,
	    (uint8_t)EIB_MGID_VHUB_DATA, mcast_mac, ld->ld_n_mac_mcgid, 0,
	    ld->ld_vhub_id, &(mcg_attr.mc_mgid));
	mcg_attr.mc_pkey = (ib_pkey_t)ld->ld_vhub_pkey;
	mcg_attr.mc_qkey = (ib_qkey_t)EIB_DATA_QKEY;

	/*
	 * Allocate for and prepare the mcg to add to our list
	 */
	mcg_info = kmem_zalloc(sizeof (ibt_mcg_info_t), KM_NOSLEEP);
	if (mcg_info == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_join_data_mcg: "
		    "no memory, failed to join mcg (mac=%x:%x:%x:%x:%x:%x)",
		    mcast_mac[0], mcast_mac[1], mcast_mac[2],
		    mcast_mac[3], mcast_mac[4], mcast_mac[5]);

		*err = ENOMEM;
		goto vnic_join_data_mcg_fail;
	}
	mcg = kmem_zalloc(sizeof (eib_mcg_t), KM_NOSLEEP);
	if (mcg == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_join_data_mcg: "
		    "no memory, failed to join mcg (mac=%x:%x:%x:%x:%x:%x)",
		    mcast_mac[0], mcast_mac[1], mcast_mac[2],
		    mcast_mac[3], mcast_mac[4], mcast_mac[5]);

		*err = ENOMEM;
		goto vnic_join_data_mcg_fail;
	}
	mcg->mg_next = NULL;
	mcg->mg_rgid = ss->ei_props->ep_sgid;
	mcg->mg_mgid = mcg_attr.mc_mgid;
	mcg->mg_join_state = IB_MC_JSTATE_FULL;
	mcg->mg_mcginfo = mcg_info;
	bcopy(mcast_mac, mcg->mg_mac, ETHERADDRL);

	/*
	 * Join the multicast group
	 *
	 * Should we query for the mcg and join instead of attempting to
	 * join directly ?
	 */
	mcg_attr.mc_join_state = mcg->mg_join_state;
	mcg_attr.mc_flow = 0;
	mcg_attr.mc_tclass = 0;
	mcg_attr.mc_sl = 0;
	mcg_attr.mc_scope = 0;	/* IB_MC_SCOPE_SUBNET_LOCAL perhaps ? */

	ret = ibt_join_mcg(mcg->mg_rgid, &mcg_attr, mcg_info, NULL, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_join_data_mcg: "
		    "ibt_join_mcg(mgid=%llx.%llx, pkey=0x%x, qkey=0x%lx, "
		    "jstate=0x%x) failed, ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey,
		    mcg_attr.mc_qkey, mcg_attr.mc_join_state, ret);

		*err = EINVAL;
		goto vnic_join_data_mcg_fail;
	}

	/*
	 * Attach to the group to receive multicast messages
	 */
	ret = ibt_attach_mcg(chan->ch_chan, mcg_info);
	if (ret != IBT_SUCCESS) {
		*err = EINVAL;

		/*
		 * NOTE(review): ibt_leave_mcg() returns an ibt_status_t,
		 * but it is compared against EIB_E_SUCCESS here — confirm
		 * the two success values coincide, else this warning may
		 * fire (or not) incorrectly.
		 */
		ret = ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		if (ret != EIB_E_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_join_data_mcg: "
			    "ibt_leave_mcg(mgid=%llx.%llx, jstate=0x%x) "
			    "failed, ret=%d", mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, mcg->mg_join_state, ret);
		}

		goto vnic_join_data_mcg_fail;
	}

	mutex_enter(&chan->ch_vhub_lock);

	/* Scan the data mcg list for a duplicate, remembering the tail */
	tail = NULL;
	for (elem = chan->ch_vhub_data; elem != NULL; elem = elem->mg_next) {
		if ((elem->mg_mgid.gid_prefix == mcg_attr.mc_mgid.gid_prefix) &&
		    (elem->mg_mgid.gid_guid == mcg_attr.mc_mgid.gid_guid)) {
			break;
		}
		tail = elem;
	}

	/*
	 * If we hadn't already joined to this mcg, add the newly joined mcg
	 * to the tail and return success
	 */
	if (elem == NULL) {
		if (tail)
			tail->mg_next = mcg;
		else
			chan->ch_vhub_data = mcg;
		mutex_exit(&chan->ch_vhub_lock);
		return (EIB_E_SUCCESS);
	}

	/*
	 * Duplicate. We need to leave one of the two joins. If "rejoin"
	 * was requested, leave the old join, otherwise leave the new join.
	 *
	 * Note that we must not detach the qp from the mcg, since if this
	 * was a dup, a second ibt_attach_mcg() above would've simply been
	 * a nop.
	 *
	 * Note also that the leave may not be successful here if our presence
	 * has been removed by the SM, but we need to do this to prevent leaks
	 * in ibtf.
	 */
	if (rejoin) {
		ASSERT(elem->mg_mcginfo != NULL);
		kmem_free(elem->mg_mcginfo, sizeof (ibt_mcg_info_t));
		(void) ibt_leave_mcg(elem->mg_rgid, elem->mg_mgid,
		    eib_reserved_gid, elem->mg_join_state);
		/*
		 * Copy the new mcg over the old one (including the new
		 * mg_mcginfo), but preserve the link to the next element
		 * on the list
		 */
		mcg->mg_next = elem->mg_next;
		bcopy(mcg, elem, sizeof (eib_mcg_t));
	} else {
		ASSERT(mcg->mg_mcginfo != NULL);
		kmem_free(mcg->mg_mcginfo, sizeof (ibt_mcg_info_t));
		(void) ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
	}
	mutex_exit(&chan->ch_vhub_lock);

	kmem_free(mcg, sizeof (eib_mcg_t));
	return (EIB_E_SUCCESS);

vnic_join_data_mcg_fail:
	if (mcg) {
		kmem_free(mcg, sizeof (eib_mcg_t));
	}
	if (mcg_info) {
		kmem_free(mcg_info, sizeof (ibt_mcg_info_t));
	}
	return (EIB_E_FAILURE);
}

/*
 * Resolve the destination {dmac, vlan} of an outgoing packet and
 * program the swqe's UD destination accordingly: a unicast hit uses a
 * (cached) address vector for the destination's lid/sl, a multicast or
 * broadcast hit uses the address vector from the mcg info. Updates the
 * broadcast/multicast xmit counters for the latter.
 */
int
eib_vnic_setup_dest(eib_vnic_t *vnic, eib_wqe_t *swqe, uint8_t *dmac,
    uint16_t vlan)
{
	eib_t *ss = vnic->vn_ss;
	eib_stats_t *stats = ss->ei_stats;
	eib_avect_t *av;
	eib_vhub_map_t ucast;
	ibt_mcg_info_t mcast;
	ibt_status_t ret;
	int dtype;
	int rv;

	/*
	 * Lookup the destination in the vhub table or in our mcg list
	 */
	rv = eib_vnic_lookup_dest(vnic, dmac, vlan, &ucast, &mcast, &dtype);
	if (rv != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_setup_dest: "
		    "eib_vnic_lookup_dest(dmac=%x:%x:%x:%x:%x:%x, vlan=0x%x) "
		    "failed", dmac[0], dmac[1], dmac[2], dmac[3], dmac[4],
		    dmac[5], vlan);

		return (EIB_E_FAILURE);
	}

	/*
	 * If we found a unicast address, get an address vector for the lid
	 * and sl, modify the ud dest based on the address vector and return.
	 * If we found a multicast address, use the address vector in the
	 * mcg info to modify the ud dest and return.
	 */
	if (dtype == EIB_TX_UNICAST) {
		if ((av = eib_ibt_hold_avect(ss, ucast.mp_lid,
		    ucast.mp_sl)) == NULL) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_setup_dest: "
			    "eib_ibt_hold_avect(lid=0x%x, sl=0x%x) failed",
			    ucast.mp_lid, ucast.mp_sl);

			return (EIB_E_FAILURE);
		}
		ret = ibt_modify_ud_dest(swqe->qe_dest, EIB_DATA_QKEY,
		    ucast.mp_qpn, &av->av_vect);

		eib_ibt_release_avect(ss, av);

		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_setup_dest: "
			    "ibt_modify_ud_dest(qpn=0x%lx, qkey=0x%lx) "
			    "failed, ret=%d", ucast.mp_qpn, EIB_DATA_QKEY, ret);
			return (EIB_E_FAILURE);
		}
	} else {
		ret = ibt_modify_ud_dest(swqe->qe_dest, EIB_DATA_QKEY,
		    IB_MC_QPN, &(mcast.mc_adds_vect));

		if (dtype == EIB_TX_BROADCAST)
			EIB_INCR_COUNTER(&stats->st_brdcstxmit);
		else
			EIB_INCR_COUNTER(&stats->st_multixmit);

		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_setup_dest: "
			    "ibt_modify_ud_dest(mc_qpn=0x%lx, qkey=0x%lx) "
			    "failed, ret=%d", IB_MC_QPN, EIB_DATA_QKEY, ret);
			return (EIB_E_FAILURE);
		}
	}

	return (EIB_E_SUCCESS);
}

/*
 * Leave the vhub data mcg corresponding to mcast_mac (thin wrapper
 * around the rollback of eib_vnic_join_data_mcg()).
 */
void
eib_vnic_leave_data_mcg(eib_t *ss, eib_vnic_t *vnic, uint8_t *mcast_mac)
{
	eib_rb_vnic_join_data_mcg(ss, vnic, mcast_mac);
}

/*
 * Allocate and install (under vn_lock) fresh vhub table and vhub
 * update structures for the vnic. The eport state starts out as UP.
 */
/*ARGSUSED*/
void
eib_vnic_init_tables(eib_t *ss, eib_vnic_t *vnic)
{
	eib_vhub_table_t *tbl;
	eib_vhub_update_t *upd;

	tbl = kmem_zalloc(sizeof (eib_vhub_table_t), KM_SLEEP);
	mutex_init(&tbl->tb_lock, NULL, MUTEX_DRIVER, NULL);
	tbl->tb_eport_state = FIP_EPORT_UP;

	upd = kmem_zalloc(sizeof (eib_vhub_update_t), KM_SLEEP);
	mutex_init(&upd->up_lock, NULL, MUTEX_DRIVER, NULL);

	mutex_enter(&vnic->vn_lock);
	vnic->vn_vhub_table = tbl;
	vnic->vn_vhub_update = upd;
	mutex_exit(&vnic->vn_lock);
}

/*
 * Empty out (and, if `clobber' is set, also destroy and detach from
 * the vnic) the vnic's vhub table and vhub update structures. With
 * clobber clear, the structures themselves survive for reuse.
 */
/*ARGSUSED*/
void
eib_vnic_fini_tables(eib_t *ss, eib_vnic_t *vnic, boolean_t clobber)
{
	eib_vhub_update_t *upd;
	eib_vhub_table_t *tbl;
	eib_vhub_map_t *elem;
	eib_vhub_map_t *nxt;
	int i;

	/*
	 * We come here only when we've either completely detached from
	 * the vhub multicast groups and so cannot receive anymore table
	 * or update control messages, or we've had a recent vhub table
	 * construction failure and the vnic state is currently
	 * EIB_LOGIN_TBL_FAILED and so won't parse any table or update
	 * control messages. Also, since we haven't completed the vnic
	 * creation, no one from the tx path will be accessing the
	 * vn_vhub_table entries either. All said, we're free to play
	 * around with the vnic's vn_vhub_table and vn_vhub_update here.
	 */

	mutex_enter(&vnic->vn_lock);
	upd = vnic->vn_vhub_update;
	tbl = vnic->vn_vhub_table;
	if (clobber) {
		vnic->vn_vhub_update = NULL;
		vnic->vn_vhub_table = NULL;
	}
	mutex_exit(&vnic->vn_lock);

	/*
	 * Destroy the vhub update entries if any
	 */
	if (upd) {
		/*
		 * Wipe clean the list of vnic entries accumulated via
		 * vhub updates so far. Release eib_vhub_update_t only
		 * if explicitly asked to do so
		 */
		mutex_enter(&upd->up_lock);
		for (elem = upd->up_vnic_entry; elem != NULL; elem = nxt) {
			nxt = elem->mp_next;
			kmem_free(elem, sizeof (eib_vhub_map_t));
		}
		upd->up_vnic_entry = NULL;
		upd->up_tusn = 0;
		upd->up_eport_state = 0;
		mutex_exit(&upd->up_lock);

		if (clobber) {
			mutex_destroy(&upd->up_lock);
			kmem_free(upd, sizeof (eib_vhub_update_t));
		}
	}

	/*
	 * Destroy the vhub table entries
	 */
	if (tbl == NULL)
		return;

	/*
	 * Wipe clean the list of entries in the vhub table collected so
	 * far. Release eib_vhub_table_t only if explicitly asked to do so.
	 */
	mutex_enter(&tbl->tb_lock);

	if (tbl->tb_gateway) {
		kmem_free(tbl->tb_gateway, sizeof (eib_vhub_map_t));
		tbl->tb_gateway = NULL;
	}

	if (tbl->tb_unicast_miss) {
		kmem_free(tbl->tb_unicast_miss, sizeof (eib_vhub_map_t));
		tbl->tb_unicast_miss = NULL;
	}

	if (tbl->tb_vhub_multicast) {
		kmem_free(tbl->tb_vhub_multicast, sizeof (eib_vhub_map_t));
		tbl->tb_vhub_multicast = NULL;
	}

	if (!eib_wa_no_mcast_entries) {
		for (i = 0; i < EIB_TB_NBUCKETS; i++) {
			for (elem = tbl->tb_mcast_entry[i]; elem != NULL;
			    elem = nxt) {
				nxt = elem->mp_next;
				kmem_free(elem, sizeof (eib_vhub_map_t));
			}
			tbl->tb_mcast_entry[i] = NULL;
		}
	}

	for (i = 0; i < EIB_TB_NBUCKETS; i++) {
		for (elem = tbl->tb_vnic_entry[i]; elem != NULL; elem = nxt) {
			nxt = elem->mp_next;
			kmem_free(elem, sizeof (eib_vhub_map_t));
		}
		tbl->tb_vnic_entry[i] = NULL;
	}

	tbl->tb_tusn = 0;
	tbl->tb_eport_state = 0;
	tbl->tb_entries_seen = 0;
	tbl->tb_entries_in_table = 0;
	tbl->tb_checksum = 0;

	mutex_exit(&tbl->tb_lock);

	/*
	 * Don't throw away space created for holding vhub table if we haven't
	 * been explicitly asked to do so
	 */
	if (clobber) {
		mutex_destroy(&tbl->tb_lock);
		kmem_free(tbl, sizeof (eib_vhub_table_t));
	}
}

/*
 * Return the data channel of the vnic at instance `vinst', or NULL if
 * the instance is out of range or no vnic exists there.
 */
eib_chan_t *
eib_vnic_get_data_chan(eib_t *ss, int vinst)
{
	eib_vnic_t *vnic;
	eib_chan_t *chan = NULL;

	if (vinst >= 0 && vinst < EIB_MAX_VNICS) {
		mutex_enter(&ss->ei_vnic_lock);
		if ((vnic = ss->ei_vnic[vinst]) != NULL)
			chan = vnic->vn_data_chan;
		mutex_exit(&ss->ei_vnic_lock);
	}

	return (chan);
}

/*
 * Called from the tx path when a packet is addressed to a {mac, vlan}
 * for which no vnic exists yet: count the dropped packet and file a
 * creation request with the vnic creator thread (best-effort; dropped
 * with a warning if the KM_NOSLEEP allocation fails).
 */
void
eib_vnic_need_new(eib_t *ss, uint8_t *mac, uint16_t vlan)
{
	eib_vnic_req_t *vrq;

	EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf);

	/*
	 * Create a new vnic request for this {mac,vlan} tuple
	 */
	vrq = kmem_zalloc(sizeof (eib_vnic_req_t), KM_NOSLEEP);
	if (vrq == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_need_new: "
		    "no memory, failed to queue new vnic creation request");
		return;
	}
	vrq->vr_next = NULL;
	vrq->vr_req = EIB_CR_REQ_NEW_VNIC;
	bcopy(mac, vrq->vr_mac, ETHERADDRL);
	vrq->vr_vlan = vlan;

	eib_vnic_enqueue_req(ss, vrq);
}

/*
 * Enqueue a request with the vnic creator thread and signal it.
 * DIE/FLUSH requests go to the head of the queue (DIE has highest
 * priority: once queued, all further requests are discarded). A
 * NEW_VNIC request is dropped as a duplicate if a creation request
 * for the same {mac, vlan} is either currently being processed
 * (ei_pending_vnic_req) or already waiting in the queue; otherwise it
 * is appended at the tail. Takes ownership of vrq: it is either queued
 * or freed here.
 */
void
eib_vnic_enqueue_req(eib_t *ss, eib_vnic_req_t *vrq)
{
	eib_vnic_req_t *elem = NULL;
	uint8_t *m;

	/*
	 * Enqueue this new vnic request with the vnic creator and
	 * signal it.
	 */
	m = vrq->vr_mac;
	EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_vnic_enqueue_req: "
	    "BEGIN file request for creation of %x:%x:%x:%x:%x:%x, 0x%x",
	    m[0], m[1], m[2], m[3], m[4], m[5], vrq->vr_vlan);


	mutex_enter(&ss->ei_vnic_req_lock);

	/*
	 * Death request has the highest priority. If we've already been asked
	 * to die, we don't entertain any more requests.
	 */
	if (ss->ei_vnic_req) {
		if (ss->ei_vnic_req->vr_req == EIB_CR_REQ_DIE) {
			mutex_exit(&ss->ei_vnic_req_lock);
			kmem_free(vrq, sizeof (eib_vnic_req_t));
			return;
		}
	}

	if (vrq->vr_req == EIB_CR_REQ_DIE || vrq->vr_req == EIB_CR_REQ_FLUSH) {
		/* DIE/FLUSH jump to the head of the queue */
		vrq->vr_next = ss->ei_vnic_req;
		ss->ei_vnic_req = vrq;
	} else {
		/*
		 * If there's already a creation request for this vnic that's
		 * being processed, return immediately without adding a new
		 * request.
		 */
		if ((elem = ss->ei_pending_vnic_req) != NULL) {
			EIB_DPRINTF_DEBUG(ss->ei_instance,
			    "eib_vnic_enqueue_req: "
			    "ei_pending_vnic_req not NULL");

			if ((elem->vr_vlan == vrq->vr_vlan) &&
			    (bcmp(elem->vr_mac, vrq->vr_mac,
			    ETHERADDRL) == 0)) {
				EIB_DPRINTF_DEBUG(ss->ei_instance,
				    "eib_vnic_enqueue_req: "
				    "pending request already present for "
				    "%x:%x:%x:%x:%x:%x, 0x%x", m[0], m[1], m[2],
				    m[3], m[4], m[5], vrq->vr_vlan);

				mutex_exit(&ss->ei_vnic_req_lock);
				kmem_free(vrq, sizeof (eib_vnic_req_t));

				EIB_DPRINTF_DEBUG(ss->ei_instance,
				    "eib_vnic_enqueue_req: "
				    "END file request");
				return;
			}

			EIB_DPRINTF_DEBUG(ss->ei_instance,
			    "eib_vnic_enqueue_req: "
			    "NO pending request for %x:%x:%x:%x:%x:%x, 0x%x",
			    m[0], m[1], m[2], m[3], m[4], m[5], vrq->vr_vlan);
		}

		/*
		 * Or if there's one waiting in the queue for processing, do
		 * the same thing
		 */
		for (elem = ss->ei_vnic_req; elem; elem = elem->vr_next) {
			/*
			 * If there's already a create request for this vnic
			 * waiting in the queue, return immediately
			 */
			if (elem->vr_req == EIB_CR_REQ_NEW_VNIC) {
				if ((elem->vr_vlan == vrq->vr_vlan) &&
				    (bcmp(elem->vr_mac, vrq->vr_mac,
				    ETHERADDRL) == 0)) {

					EIB_DPRINTF_DEBUG(ss->ei_instance,
					    "eib_vnic_enqueue_req: "
					    "request already present for "
					    "%x:%x:%x:%x:%x:%x, 0x%x", m[0],
					    m[1], m[2], m[3], m[4], m[5],
					    vrq->vr_vlan);

					mutex_exit(&ss->ei_vnic_req_lock);
					kmem_free(vrq, sizeof (eib_vnic_req_t));

					EIB_DPRINTF_DEBUG(ss->ei_instance,
					    "eib_vnic_enqueue_req: "
					    "END file request");
					return;
				}
			}

			/* loop exits with elem pointing at the tail */
			if (elem->vr_next == NULL) {
				EIB_DPRINTF_DEBUG(ss->ei_instance,
				    "eib_vnic_enqueue_req: "
				    "request not found, filing afresh");
				break;
			}
		}

		/*
		 * Otherwise queue up this new creation request and signal the
		 * service thread.
		 */
		if (elem) {
			elem->vr_next = vrq;
		} else {
			ss->ei_vnic_req = vrq;
		}
	}

	cv_signal(&ss->ei_vnic_req_cv);
	mutex_exit(&ss->ei_vnic_req_lock);

	EIB_DPRINTF_DEBUG(ss->ei_instance,
	    "eib_vnic_enqueue_req: END file request");
}

/*
 * After a vnic restart changed its assigned mac: remove the new
 * {mac, vlan} from the "failed vnic req" list if present (it is no
 * longer failed), and add the old {mac, vlan} to that list so we don't
 * try to recreate the vnic we just explicitly discarded. Best-effort:
 * if the KM_NOSLEEP allocation fails, the old address simply isn't
 * recorded.
 */
void
eib_vnic_update_failed_macs(eib_t *ss, uint8_t *old_mac, uint16_t old_vlan,
    uint8_t *new_mac, uint16_t new_vlan)
{
	eib_vnic_req_t *vrq;
	eib_vnic_req_t *elem;
	eib_vnic_req_t *prev;

	vrq = kmem_zalloc(sizeof (eib_vnic_req_t), KM_NOSLEEP);
	if (vrq == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_update_failed_macs: "
		    "no memory, failed to drop old mac");
	} else {
		vrq->vr_next = NULL;
		vrq->vr_req = 0;	/* unused */
		bcopy(old_mac, vrq->vr_mac, ETHERADDRL);
		vrq->vr_vlan = old_vlan;
	}

	mutex_enter(&ss->ei_vnic_req_lock);

	/*
	 * We'll search the failed vnics list to see if the new {mac,vlan}
	 * tuple is in there and remove it if present (since the new address
	 * is no longer "failed").
	 */
	prev = NULL;
	for (elem = ss->ei_failed_vnic_req; elem; elem = elem->vr_next) {
		if ((bcmp(elem->vr_mac, new_mac, ETHERADDRL) == 0) &&
		    (elem->vr_vlan == new_vlan)) {
			if (prev) {
				prev->vr_next = elem->vr_next;
			} else {
				ss->ei_failed_vnic_req = elem->vr_next;
			}
			elem->vr_next = NULL;
			break;
		}
	}
	if (elem) {
		kmem_free(elem, sizeof (eib_vnic_req_t));
	}

	/*
	 * We'll also insert the old {mac,vlan} tuple to the "failed vnic req"
	 * list (it shouldn't be there already), to avoid trying to recreate
	 * the vnic we just explicitly discarded.
	 */
	if (vrq) {
		vrq->vr_next = ss->ei_failed_vnic_req;
		ss->ei_failed_vnic_req = vrq;
	}

	mutex_exit(&ss->ei_vnic_req_lock);
}

/*
 * Restart every vnic marked as a zombie (ei_zombie_vnics bitmap),
 * reusing each vnic's original id and instance. If vnic instance 0's
 * mac changes across the restart, the new mac is returned through
 * vn0_mac (see eib_vnic_restart()).
 */
void
eib_vnic_resurrect_zombies(eib_t *ss, uint8_t *vn0_mac)
{
	int inst;

	/*
	 * We want to restart/relogin each vnic instance with the gateway,
	 * but with the same vnic id and instance as before.
+ */ + while ((inst = EIB_FIND_LSB_SET(ss->ei_zombie_vnics)) != -1) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_vnic_resurrect_zombies: " + "calling eib_vnic_restart(vn_inst=%d)", inst); + + eib_vnic_restart(ss, inst, vn0_mac); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_vnic_resurrect_zombies: " + "eib_vnic_restart(vn_inst=%d) done", inst); + } +} + +void +eib_vnic_restart(eib_t *ss, int inst, uint8_t *vn0_mac) +{ + eib_vnic_t *vnic; + eib_login_data_t *ld; + uint8_t old_mac[ETHERADDRL]; + int ret; + int err; + + if (inst < 0 || inst >= EIB_MAX_VNICS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_restart: " + "vnic instance (%d) invalid", inst); + return; + } + + eib_vnic_modify_enter(ss, EIB_VN_BEING_MODIFIED); + if ((vnic = ss->ei_vnic[inst]) != NULL) { + /* + * Remember what mac was allocated for this vnic last time + */ + bcopy(vnic->vn_login_data.ld_assigned_mac, old_mac, ETHERADDRL); + + /* + * Tear down and restart this vnic instance + */ + eib_rb_vnic_create_common(ss, vnic, ~0); + ret = eib_vnic_create_common(ss, vnic, &err); + if (ret != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_restart: " + "eib_vnic_create_common(vnic_inst=%d) failed, " + "ret=%d", inst, err); + } + + /* + * If this is vnic instance 0 and if our current assigned mac is + * different from what was assigned last time, we need to pass + * this information back to the caller, so the mac layer can be + * appropriately informed. We will also queue up the old mac + * and vlan in the "failed vnic req" list, so any future packets + * to this address on this interface will be dropped. 
+ */ + ld = &vnic->vn_login_data; + if ((inst == 0) && + (bcmp(ld->ld_assigned_mac, old_mac, ETHERADDRL) != 0)) { + uint8_t *m = ld->ld_assigned_mac; + + if (vn0_mac != NULL) { + bcopy(ld->ld_assigned_mac, vn0_mac, + ETHERADDRL); + } + + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_vnic_restart: updating failed macs list " + "old=%x:%x:%x:%x:%x:%x, new=%x:%x:%x:%x:%x:%x, " + "vlan=0x%x", old_mac[0], old_mac[1], old_mac[2], + old_mac[3], old_mac[4], old_mac[5], m[0], m[1], + m[2], m[3], m[4], m[5], vnic->vn_vlan); + + eib_vnic_update_failed_macs(ss, old_mac, vnic->vn_vlan, + ld->ld_assigned_mac, vnic->vn_vlan); + } + + /* + * No longer a zombie or need to rejoin mcgs + */ + mutex_enter(&ss->ei_vnic_lock); + ss->ei_zombie_vnics &= (~((uint64_t)1 << inst)); + ss->ei_rejoin_vnics &= (~((uint64_t)1 << inst)); + mutex_exit(&ss->ei_vnic_lock); + } + eib_vnic_modify_exit(ss, EIB_VN_BEING_MODIFIED); +} + +void +eib_vnic_rejoin_mcgs(eib_t *ss) +{ + eib_vnic_t *vnic; + int inst; + + /* + * For each vnic that still requires re-join, go through the + * control channels and data channel and reattach/rejoin mcgs. 
+ */ + mutex_enter(&ss->ei_vnic_lock); + while ((inst = EIB_FIND_LSB_SET(ss->ei_rejoin_vnics)) != -1) { + if ((vnic = ss->ei_vnic[inst]) != NULL) { + eib_vnic_reattach_ctl_mcgs(ss, vnic); + eib_vnic_rejoin_data_mcgs(ss, vnic); + } + ss->ei_rejoin_vnics &= (~((uint64_t)1 << inst)); + } + mutex_exit(&ss->ei_vnic_lock); +} + +void +eib_rb_vnic_create(eib_t *ss, eib_vnic_t *vnic, uint_t progress) +{ + if (progress & EIB_VNIC_CREATE_COMMON_DONE) { + eib_rb_vnic_create_common(ss, vnic, ~0); + } + + if (progress & EIB_VNIC_GOT_INSTANCE) { + eib_vnic_ret_instance(ss, vnic->vn_instance); + vnic->vn_instance = -1; + } + + if (progress & EIB_VNIC_STRUCT_ALLOCD) { + cv_destroy(&vnic->vn_cv); + mutex_destroy(&vnic->vn_lock); + kmem_free(vnic, sizeof (eib_vnic_t)); + } +} + +/* + * Currently, we only allow 64 vnics per eoib device instance, for + * reasons described in eib.h (see EIB_VNIC_ID() definition), so we + * could use a simple bitmap to assign the vnic instance numbers. + * Once we start allowing more vnics per device instance, this + * allocation scheme will need to be changed. + */ +static int +eib_vnic_get_instance(eib_t *ss, int *vinst) +{ + int bitpos; + uint64_t nval; + + mutex_enter(&ss->ei_vnic_lock); + + /* + * What we have is the active vnics list -- the in-use vnics are + * indicated by a 1 in the bit position, and the free ones are + * indicated by 0. We need to find the least significant '0' bit + * to get the first free vnic instance. Or we could bit-reverse + * the active list and locate the least significant '1'. + */ + nval = ~(ss->ei_active_vnics); + if (nval == 0) + return (EIB_E_FAILURE); + + /* + * The single bit-position values in a 64-bit integer are relatively + * prime with 67, so performing a modulus division with 67 guarantees + * a unique number between 0 and 63 for each value (setbit_mod67[]). 
+ */ + bitpos = EIB_FIND_LSB_SET(nval); + if (bitpos == -1) + return (EIB_E_FAILURE); + + ss->ei_active_vnics |= ((uint64_t)1 << bitpos); + *vinst = bitpos; + + mutex_exit(&ss->ei_vnic_lock); + + return (EIB_E_SUCCESS); +} + +static void +eib_vnic_ret_instance(eib_t *ss, int vinst) +{ + mutex_enter(&ss->ei_vnic_lock); + + if (vinst >= EIB_MAX_VNICS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_ret_instance: " + "vnic instance (%d) invalid", vinst); + } else if ((ss->ei_active_vnics & ((uint64_t)1 << vinst)) == 0) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_ret_instance: " + "vnic instance (%d) not active!", vinst); + } else { + ss->ei_active_vnics &= (~((uint64_t)1 << vinst)); + } + + mutex_exit(&ss->ei_vnic_lock); +} + +static void +eib_vnic_modify_enter(eib_t *ss, uint_t op) +{ + mutex_enter(&ss->ei_vnic_lock); + while (ss->ei_vnic_state & EIB_VN_BEING_MODIFIED) + cv_wait(&ss->ei_vnic_cv, &ss->ei_vnic_lock); + + ss->ei_vnic_state |= op; + mutex_exit(&ss->ei_vnic_lock); +} + +static void +eib_vnic_modify_exit(eib_t *ss, uint_t op) +{ + mutex_enter(&ss->ei_vnic_lock); + ss->ei_vnic_state &= (~op); + cv_broadcast(&ss->ei_vnic_cv); + mutex_exit(&ss->ei_vnic_lock); +} + +static int +eib_vnic_create_common(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + uint_t progress = 0; + + /* + * When we receive login acks within this vnic creation + * routine we need a way to retrieve the vnic structure + * from the vnic instance, so store this somewhere. Note + * that there can be only one outstanding vnic creation + * at any point of time, so we only need one vnic struct. 
+ */ + mutex_enter(&ss->ei_vnic_lock); + ASSERT(ss->ei_vnic_pending == NULL); + ss->ei_vnic_pending = vnic; + mutex_exit(&ss->ei_vnic_lock); + + /* + * Create a control qp for this vnic + */ + if (eib_ctl_create_qp(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_ctl_create_qp(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_CTLQP_CREATED; + + /* + * Create a data qp for this vnic + */ + if (eib_data_create_qp(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_data_create_qp(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_DATAQP_CREATED; + + /* + * Login to the gateway with this vnic's parameters + */ + if (eib_fip_login(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_fip_login(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_LOGIN_DONE; + + /* + * Associate the control and data qps for the vnic with the + * vHUB partition + */ + if (eib_vnic_set_partition(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_vnic_set_partition(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_PARTITION_SET; + + /* + * Post initial set of rx buffers on the control qp to the HCA + */ + if (eib_chan_post_rx(ss, vnic->vn_ctl_chan, NULL) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_chan_post_rx(vn_id=0x%x, CTL_QP) failed, ret=%d", + vnic->vn_id, *err); + + *err = ENOMEM; + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_RX_POSTED_TO_CTLQP; + + /* + * Post initial set of rx buffers on the data qp to the HCA + */ + if (eib_chan_post_rx(ss, vnic->vn_data_chan, NULL) != 
EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_chan_post_rx(vn_id=0x%x, DATA_QP) failed, ret=%d", + vnic->vn_id, *err); + + *err = ENOMEM; + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_RX_POSTED_TO_DATAQP; + + /* + * Attach to the vHUB table and vHUB update multicast groups + */ + if (eib_vnic_attach_ctl_mcgs(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_vnic_attach_ctl_mcgs(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_ATTACHED_TO_CTL_MCGS; + + /* + * Send the vHUB table request and construct the vhub table + */ + if (eib_fip_vhub_table(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_fip_vhub_table(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_GOT_VHUB_TABLE; + + /* + * Detach from the vHUB table mcg (we no longer need the vHUB + * table messages) and start the keepalives for this vnic. + */ + eib_vnic_start_keepalives(ss, vnic); + eib_rb_vnic_attach_vhub_table(ss, vnic); + + progress |= EIB_VNIC_KEEPALIVES_STARTED; + + /* + * All ethernet vnics are automatically members of the broadcast + * group for the vlan they are participating in, so join the + * ethernet broadcast group. Note that when we restart vnics, + * we rejoin the mcgs, so we pass B_TRUE to eib_vnic_join_data_mcg(). 
+ */ + if (eib_vnic_join_data_mcg(ss, vnic, eib_broadcast_mac, B_TRUE, + err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_vnic_join_data_mcg(vn_id=0x%x, BCAST_GROUP) failed, " + "ret=%d", vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_BROADCAST_JOINED; + + mutex_enter(&ss->ei_vnic_lock); + if (ss->ei_vnic[vnic->vn_instance] == NULL) { + ss->ei_vnic[vnic->vn_instance] = vnic; + } + ss->ei_vnic_pending = NULL; + mutex_exit(&ss->ei_vnic_lock); + + return (EIB_E_SUCCESS); + +vnic_create_common_fail: + eib_rb_vnic_create_common(ss, vnic, progress); + return (EIB_E_FAILURE); +} + +static int +eib_vnic_set_partition(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + int ret; + + /* + * Associate the control channel with the vhub partition + */ + ret = eib_ibt_modify_chan_pkey(ss, vnic->vn_ctl_chan, + vnic->vn_login_data.ld_vhub_pkey); + if (ret != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_set_partition: " + "eib_ibt_modify_chan_pkey(vn_id=0x%x, CTL_CHAN, " + "vhub_pkey=0x%x) failed", vnic->vn_id, + vnic->vn_login_data.ld_vhub_pkey); + *err = EINVAL; + return (EIB_E_FAILURE); + } + + /* + * Now, do the same thing for the data channel. Note that if a + * failure happens, the channel state(s) are left as-is, since + * it is pointless to try to change them back using the same + * interfaces that have just failed. 
	 */
	ret = eib_ibt_modify_chan_pkey(ss, vnic->vn_data_chan,
	    vnic->vn_login_data.ld_vhub_pkey);
	if (ret != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_set_partition: "
		    "eib_ibt_modify_chan_pkey(vn_id=0x%x, DATA_CHAN, "
		    "vhub_pkey=0x%x) failed", vnic->vn_id,
		    vnic->vn_login_data.ld_vhub_pkey);
		*err = EINVAL;
		return (EIB_E_FAILURE);
	}

	return (EIB_E_SUCCESS);
}

/*
 * Assemble an EoIB vHUB MGID from the gateway's mgid prefix, the mgid
 * type (table/update/data), n_mac bits of the multicast mac, the rss
 * hash and the vhub id, and convert the result into the host-order
 * ib_gid_t form that IBTF expects.
 */
static void
eib_vnic_make_vhub_mgid(uint8_t *mg_prefix, uint8_t mg_type,
    uint8_t *mcast_mac, uint8_t n_mac, uint8_t rss_hash, uint32_t vhub_id,
    ib_gid_t *mgid)
{
	eib_mgid_t em;
	uint64_t dmac_mask;
	uint64_t dmac = 0;	/* mac staged in the low 6 bytes (offset 2) */
	uint8_t *dmac_str = (uint8_t *)&dmac;
	uint_t vhub_id_nw;
	uint8_t *vhub_id_str = (uint8_t *)&vhub_id_nw;

	/*
	 * Copy mgid prefix and type
	 */
	bcopy(mg_prefix, em.gd_spec.sp_mgid_prefix, FIP_MGID_PREFIX_LEN);
	em.gd_spec.sp_type = mg_type;

	/*
	 * Take n_mac bits from mcast_mac and copy dmac.
	 * NOTE(review): the mask is built host-order then htonll()'d and
	 * applied to the raw byte image of the mac -- assumed to select
	 * the low-order n_mac bits per the EoIB spec; TODO confirm the
	 * orientation for n_mac < 48 on both endiannesses.
	 */
	bcopy(mcast_mac, dmac_str + 2, ETHERADDRL);
	dmac_mask = ((uint64_t)1 << n_mac) - 1;
	dmac_mask = htonll(dmac_mask);
	dmac &= dmac_mask;
	bcopy(dmac_str + 2, em.gd_spec.sp_dmac, ETHERADDRL);

	/*
	 * Copy rss hash and prepare vhub id from gw port id and vlan
	 */
	em.gd_spec.sp_rss_hash = rss_hash;

	/* low 3 bytes of the network-order vhub id (FIP_VHUBID_LEN) */
	vhub_id_nw = htonl(vhub_id);
	bcopy(vhub_id_str + 1, em.gd_spec.sp_vhub_id, FIP_VHUBID_LEN);

	/*
	 * Ok, now we've assembled the mgid as per EoIB spec. We now have to
	 * represent it in the way Solaris IBTF wants it and return (sigh).
	 */
	mgid->gid_prefix = ntohll(em.gd_sol.gid_prefix);
	mgid->gid_guid = ntohll(em.gd_sol.gid_guid);
}

/*
 * Attach this vnic's control channel to the vHUB update and vHUB table
 * multicast groups, initializing the local tables first.  On any failure
 * the earlier steps are rolled back before returning EIB_E_FAILURE.
 */
static int
eib_vnic_attach_ctl_mcgs(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	/*
	 * Get tb_vhub_table and tb_vhub_update allocated and ready before
	 * attaching to the vhub table and vhub update mcgs
	 */
	eib_vnic_init_tables(ss, vnic);

	if (eib_vnic_attach_vhub_update(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_ctl_mcgs: "
		    "eib_vnic_attach_vhub_update(vn_id=0x%x) failed",
		    vnic->vn_id);

		*err = EINVAL;
		eib_vnic_fini_tables(ss, vnic, B_TRUE);
		return (EIB_E_FAILURE);
	}

	if (eib_vnic_attach_vhub_table(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_ctl_mcgs: "
		    "eib_vnic_attach_vhub_table(vn_id=0x%x) failed",
		    vnic->vn_id);

		*err = EINVAL;
		eib_rb_vnic_attach_vhub_update(ss, vnic);
		eib_vnic_fini_tables(ss, vnic, B_TRUE);
		return (EIB_E_FAILURE);
	}

	return (EIB_E_SUCCESS);
}

/*
 * Locate, join and attach the control channel to the vHUB table mcg so
 * we can receive the gateway's vhub table messages.
 */
static int
eib_vnic_attach_vhub_table(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_mcg_t *mcg;
	ibt_mcg_info_t *tbl_mcginfo;
	ibt_mcg_attr_t mcg_attr;
	ibt_status_t ret;
	uint_t entries;

	/*
	 * Compose the MGID for receiving VHUB table
	 */
	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));

	eib_vnic_make_vhub_mgid(ld->ld_gw_mgid_prefix,
	    (uint8_t)EIB_MGID_VHUB_TABLE, eib_broadcast_mac, ld->ld_n_mac_mcgid,
	    0, ld->ld_vhub_id, &(mcg_attr.mc_mgid));
	mcg_attr.mc_pkey = (ib_pkey_t)ld->ld_vhub_pkey;
	mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY;

	/*
	 * Locate the multicast group for receiving vhub table
	 */
	ret = ibt_query_mcg(ss->ei_props->ep_sgid, &mcg_attr, 1,
	    &tbl_mcginfo, &entries);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_table: "
		    "ibt_query_mcg(mgid=%llx.%llx, pkey=0x%x) failed, "
		    "ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey, ret);
		return (EIB_E_FAILURE);
	}

	/*
	 * Allocate for and prepare the mcg to add to our list
	 */
	mcg = kmem_zalloc(sizeof (eib_mcg_t), KM_NOSLEEP);
	if (mcg == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_table: "
		    "no memory, failed to attach to vhub table "
		    "(mgid=%llx.%llx, pkey=0x%x)", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey);
		ibt_free_mcg_info(tbl_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	mcg->mg_next = NULL;
	mcg->mg_rgid = ss->ei_props->ep_sgid;
	mcg->mg_mgid = mcg_attr.mc_mgid;
	mcg->mg_join_state = IB_MC_JSTATE_FULL;
	mcg->mg_mcginfo = tbl_mcginfo;
	bcopy(eib_broadcast_mac, mcg->mg_mac, ETHERADDRL);

	/*
	 * Join the multicast group, using the flow/tclass/sl values the
	 * SA returned for the group in tbl_mcginfo.
	 */
	mcg_attr.mc_join_state = mcg->mg_join_state;
	mcg_attr.mc_flow = tbl_mcginfo->mc_adds_vect.av_flow;
	mcg_attr.mc_tclass = tbl_mcginfo->mc_adds_vect.av_tclass;
	mcg_attr.mc_sl = tbl_mcginfo->mc_adds_vect.av_srvl;
	mcg_attr.mc_scope = 0;	/* IB_MC_SCOPE_SUBNET_LOCAL perhaps ? */

	ret = ibt_join_mcg(mcg->mg_rgid, &mcg_attr, tbl_mcginfo, NULL, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_table: "
		    "ibt_join_mcg(mgid=%llx.%llx, pkey=0x%x, jstate=0x%x) "
		    "failed, ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey,
		    mcg_attr.mc_join_state, ret);

		kmem_free(mcg, sizeof (eib_mcg_t));
		ibt_free_mcg_info(tbl_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	/*
	 * Attach to the multicast group to receive tbl multicasts
	 */
	ret = ibt_attach_mcg(chan->ch_chan, tbl_mcginfo);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_table: "
		    "ibt_attach_mcg(mgid=%llx.%llx, pkey=0x%x) "
		    "failed, ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey);

		(void) ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		kmem_free(mcg, sizeof (eib_mcg_t));
		ibt_free_mcg_info(tbl_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	/* Publish the joined mcg on the control channel */
	mutex_enter(&chan->ch_vhub_lock);
	chan->ch_vhub_table = mcg;
	mutex_exit(&chan->ch_vhub_lock);

	return (EIB_E_SUCCESS);
}

/*
 * Locate, join and attach the control channel to the vHUB update mcg so
 * we can receive incremental vhub update messages from the gateway.
 */
static int
eib_vnic_attach_vhub_update(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_mcg_t *mcg;
	ibt_mcg_info_t *upd_mcginfo;
	ibt_mcg_attr_t mcg_attr;
	ibt_status_t ret;
	uint_t entries;

	/*
	 * Compose the MGID for receiving VHUB updates
	 */
	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));

	eib_vnic_make_vhub_mgid(ld->ld_gw_mgid_prefix,
	    (uint8_t)EIB_MGID_VHUB_UPDATE, eib_broadcast_mac,
	    ld->ld_n_mac_mcgid, 0, ld->ld_vhub_id, &(mcg_attr.mc_mgid));
	mcg_attr.mc_pkey = (ib_pkey_t)ld->ld_vhub_pkey;
	mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY;

	/*
	 * Locate the multicast group for receiving vhub updates
	 */
	ret = ibt_query_mcg(ss->ei_props->ep_sgid, &mcg_attr, 1,
	    &upd_mcginfo, &entries);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_update: "
		    "ibt_query_mcg(mgid=%llx.%llx, pkey=0x%x) failed, "
		    "ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey, ret);
		return (EIB_E_FAILURE);
	}

	/*
	 * Allocate for and prepare the mcg to add to our list
	 */
	mcg = kmem_zalloc(sizeof (eib_mcg_t), KM_NOSLEEP);
	if (mcg == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_update: "
		    "no memory, failed to attach to vhub update "
		    "(mgid=%llx.%llx, pkey=0x%x)", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey);

		ibt_free_mcg_info(upd_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	mcg->mg_next = NULL;
	mcg->mg_rgid = ss->ei_props->ep_sgid;
	mcg->mg_mgid = mcg_attr.mc_mgid;
	mcg->mg_join_state = IB_MC_JSTATE_FULL;
	mcg->mg_mcginfo = upd_mcginfo;
	bcopy(eib_broadcast_mac, mcg->mg_mac, ETHERADDRL);

	/*
	 * Join the multicast group, using the flow/tclass/sl values the
	 * SA returned for the group in upd_mcginfo.
	 */
	mcg_attr.mc_join_state = mcg->mg_join_state;
	mcg_attr.mc_flow = upd_mcginfo->mc_adds_vect.av_flow;
	mcg_attr.mc_tclass = upd_mcginfo->mc_adds_vect.av_tclass;
	mcg_attr.mc_sl = upd_mcginfo->mc_adds_vect.av_srvl;
	mcg_attr.mc_scope = 0;	/* IB_MC_SCOPE_SUBNET_LOCAL perhaps ? */

	ret = ibt_join_mcg(mcg->mg_rgid, &mcg_attr, upd_mcginfo, NULL, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_update: "
		    "ibt_join_mcg(mgid=%llx.%llx, pkey=0x%x, jstate=0x%x) "
		    "failed, ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey,
		    mcg_attr.mc_join_state, ret);

		kmem_free(mcg, sizeof (eib_mcg_t));
		ibt_free_mcg_info(upd_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	/*
	 * Attach to the multicast group to receive upd multicasts
	 */
	ret = ibt_attach_mcg(chan->ch_chan, upd_mcginfo);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_update: "
		    "ibt_attach_mcg(mgid=%llx.%llx, pkey=0x%x) "
		    "failed, ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey);

		(void) ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		kmem_free(mcg, sizeof (eib_mcg_t));
		ibt_free_mcg_info(upd_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	/* Publish the joined mcg on the control channel */
	mutex_enter(&chan->ch_vhub_lock);
	chan->ch_vhub_update = mcg;
	mutex_exit(&chan->ch_vhub_lock);

	return (EIB_E_SUCCESS);
}

/*
 * Send the first keepalive heartbeat for this vnic and append it to the
 * tail of the keepalives manager's list.
 */
static void
eib_vnic_start_keepalives(eib_t *ss, eib_vnic_t *vnic)
{
	eib_ka_vnics_t *kav;
	eib_ka_vnics_t *elem;
	int err;

	kav = kmem_zalloc(sizeof (eib_ka_vnics_t), KM_SLEEP);
	kav->ka_vnic = vnic;
	kav->ka_next = NULL;

	/*
	 * Send the first keepalive and then queue this vnic up with
	 * the keepalives manager
	 */
	(void) eib_fip_heartbeat(ss, vnic, &err);

	/* walk to the tail of the list; append (or set the head if empty) */
	mutex_enter(&ss->ei_ka_vnics_lock);
	for (elem = ss->ei_ka_vnics; elem; elem = elem->ka_next) {
		if (elem->ka_next == NULL)
			break;
	}
	if (elem) {
		elem->ka_next = kav;
	} else {
		ss->ei_ka_vnics = kav;
	}
	mutex_exit(&ss->ei_ka_vnics_lock);
}

/*
 * Resolve the destination for an outbound frame: for a unicast dmac,
 * fill *ucast from the vhub table (falling back to the gateway entry);
 * for broadcast/multicast, fill *mcast from the joined data mcg.
 * *dtype is set to EIB_TX_UNICAST/BROADCAST/MULTICAST accordingly.
 */
/*ARGSUSED*/
static int
eib_vnic_lookup_dest(eib_vnic_t *vnic, uint8_t *dmac, uint16_t vlan,
    eib_vhub_map_t *ucast, ibt_mcg_info_t *mcast, int *dtype)
{
	eib_t *ss = vnic->vn_ss;
	eib_vhub_map_t *elem;
	eib_mcg_t *mcg;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_vhub_map_t *gw;
	eib_vhub_table_t *tbl;
	/* hash bucket keyed on the last byte of the destination mac */
	uint8_t bkt = (dmac[ETHERADDRL-1]) % EIB_TB_NBUCKETS;
	ib_gid_t mgid;

	/*
	 * If this was a unicast dmac, locate the vhub entry matching the
	 * unicast dmac in our vhub table. If it's not found, return the
	 * gateway entry
	 */
	if (EIB_UNICAST_MAC(dmac)) {

		mutex_enter(&vnic->vn_lock);
		if ((tbl = vnic->vn_vhub_table) == NULL) {
			mutex_exit(&vnic->vn_lock);
			return (EIB_E_FAILURE);
		}

		mutex_enter(&tbl->tb_lock);
		gw = tbl->tb_gateway;
		for (elem = tbl->tb_vnic_entry[bkt]; elem != NULL;
		    elem = elem->mp_next) {
			if (bcmp(elem->mp_mac, dmac, ETHERADDRL) == 0)
				break;
		}
		mutex_exit(&tbl->tb_lock);

		if ((elem == NULL) && (gw == NULL)) {
			mutex_exit(&vnic->vn_lock);
			return (EIB_E_FAILURE);
		}

		/*
		 * NOTE(review): elem/gw contents are copied after tb_lock
		 * is dropped -- assumed to be protected from teardown by
		 * the still-held vn_lock; TODO confirm against the table
		 * update paths.
		 */
		*dtype = EIB_TX_UNICAST;
		if (elem) {
			bcopy(elem, ucast, sizeof (eib_vhub_map_t));
		} else {
			bcopy(gw, ucast, sizeof (eib_vhub_map_t));
		}
		mutex_exit(&vnic->vn_lock);

		return (EIB_E_SUCCESS);
	}

	/*
	 * Is it a broadcast ?
	 */
	*dtype = (bcmp(dmac, eib_broadcast_mac, ETHERADDRL) == 0) ?
	    EIB_TX_BROADCAST : EIB_TX_MULTICAST;

	/*
	 * If this was a multicast dmac, prepare the mgid and look for it
	 * in the list of mcgs we've joined and use the address vector from
	 * the mcginfo stored there.
	 *
	 * Note that since we don't have a way to associate each vlan with
	 * the mcg (see eib_m_multicast()), we'll prepare the mgid to use
	 * the broadcast channel all the time.
	 */
	eib_vnic_make_vhub_mgid(ld->ld_gw_mgid_prefix,
	    (uint8_t)EIB_MGID_VHUB_DATA, eib_broadcast_mac, ld->ld_n_mac_mcgid,
	    0, ld->ld_vhub_id, &mgid);

	mutex_enter(&chan->ch_vhub_lock);
	for (mcg = chan->ch_vhub_data; mcg; mcg = mcg->mg_next) {
		if ((mcg->mg_mgid.gid_prefix == mgid.gid_prefix) &&
		    (mcg->mg_mgid.gid_guid == mgid.gid_guid)) {
			break;
		}
	}
	if (mcg == NULL) {
		mutex_exit(&chan->ch_vhub_lock);

		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_lookup_dest: "
		    "could not find mgid %llx.%llx",
		    mgid.gid_prefix, mgid.gid_guid);

		return (EIB_E_FAILURE);
	}

	bcopy(mcg->mg_mcginfo, mcast, sizeof (ibt_mcg_info_t));
	mutex_exit(&chan->ch_vhub_lock);

	return (EIB_E_SUCCESS);
}

/*
 * Detach the data qp from every data mcg on the channel, leave each
 * group and free the associated bookkeeping.  IBTF failures are logged
 * and the teardown continues (best-effort cleanup).
 */
/*ARGSUSED*/
static void
eib_vnic_leave_all_data_mcgs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_mcg_t *mcglist;
	eib_mcg_t *mcg;
	eib_mcg_t *nxt = NULL;
	ibt_status_t ret;

	/*
	 * First, take the ch_vhub_data mcg chain out of chan
	 */
	mutex_enter(&chan->ch_vhub_lock);
	mcglist = chan->ch_vhub_data;
	chan->ch_vhub_data = NULL;
	mutex_exit(&chan->ch_vhub_lock);

	/*
	 * Go through the chain of mcgs we've joined, detach the qp from the
	 * mcg, leave the group and free all associated stuff
	 */
	for (mcg = mcglist; mcg != NULL; mcg = nxt) {
		nxt = mcg->mg_next;

		ret = ibt_detach_mcg(chan->ch_chan, mcg->mg_mcginfo);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_leave_all_data_mcgs: "
			    "ibt_detach_mcg(chan_hdl=0x%llx, mcinfo=0x%llx, "
			    "mgid=%llx.%llx) failed, ret=%d", chan->ch_chan,
			    mcg->mg_mcginfo, mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, ret);
		}

		ret = ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_leave_all_data_mcgs: "
			    "ibt_leave_mcg(mgid=%llx.%llx, jstate=0x%x) "
			    "failed, ret=%d", mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid,
			    mcg->mg_join_state, ret);
		}

		if (mcg->mg_mcginfo)
			kmem_free(mcg->mg_mcginfo, sizeof (ibt_mcg_info_t));

		kmem_free(mcg, sizeof (eib_mcg_t));
	}
}

/*
 * Leave and rejoin every data mcg currently on the channel -- used after
 * events (e.g. port changes) that may have invalidated our SM membership.
 * The old list is unhooked from the channel first; each successful
 * eib_vnic_join_data_mcg() re-adds a fresh entry.
 */
static void
eib_vnic_rejoin_data_mcgs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_mcg_t *mcglist;
	eib_mcg_t *mcg;
	eib_mcg_t *next;
	int err;

	/*
	 * Grab the current list of mcgs
	 */
	mutex_enter(&chan->ch_vhub_lock);
	mcglist = chan->ch_vhub_data;
	chan->ch_vhub_data = NULL;
	mutex_exit(&chan->ch_vhub_lock);

	/*
	 * When rejoin data mcgs is called, we may not even be marked as
	 * joined in SM's records. But we still have to leave the old
	 * one first to prevent leaks in ibtf.
	 */
	for (mcg = mcglist; mcg != NULL; mcg = next) {
		next = mcg->mg_next;
		mcg->mg_next = NULL;

		(void) ibt_detach_mcg(chan->ch_chan, mcg->mg_mcginfo);
		(void) ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);

		if (eib_vnic_join_data_mcg(ss, vnic, mcg->mg_mac, B_TRUE,
		    &err) != EIB_E_SUCCESS) {
			uint8_t *m;

			m = mcg->mg_mac;
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_rejoin_data_mcgs: "
			    "eib_vnic_join_data_mcg(mcmac=%x:%x:%x:%x:%x:%x) "
			    "failed, ret=%d", m[0], m[1], m[2], m[3],
			    m[4], m[5], err);
		}
		if (mcg->mg_mcginfo) {
			kmem_free(mcg->mg_mcginfo, sizeof (ibt_mcg_info_t));
		}
		kmem_free(mcg, sizeof (eib_mcg_t));
	}
}

/*
 * Detach and re-attach the control channel's vHUB table and vHUB update
 * mcgs without reinitializing the locally constructed tables.
 */
static void
eib_vnic_reattach_ctl_mcgs(eib_t *ss, eib_vnic_t *vnic)
{
	/*
	 * For reattaching to control mcgs, we will not reinitialize the
	 * vhub table/vhub update we've constructed. We'll simply detach
	 * from the table and update mcgs and reattach to them. Hopefully,
	 * we wouldn't have missed any updates and won't have to restart
	 * the vnic.
	 */
	eib_rb_vnic_attach_vhub_table(ss, vnic);
	eib_rb_vnic_attach_vhub_update(ss, vnic);

	if (eib_vnic_attach_vhub_update(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_reattach_ctl_mcgs: "
		    "eib_vnic_attach_vhub_update(vn_id=0x%x) failed",
		    vnic->vn_id);
	}

	if (eib_vnic_attach_vhub_table(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_reattach_ctl_mcgs: "
		    "eib_vnic_attach_vhub_table(vn_id=0x%x) failed",
		    vnic->vn_id);

		eib_rb_vnic_attach_vhub_update(ss, vnic);
	}
}

/*
 * Undo eib_vnic_create_common() up to the point recorded in "progress"
 * (reverse order of creation).  Also unpublishes the vnic from
 * ss->ei_vnic[] and clears the pending slot.
 */
static void
eib_rb_vnic_create_common(eib_t *ss, eib_vnic_t *vnic, uint_t progress)
{
	int err;

	mutex_enter(&ss->ei_vnic_lock);
	ss->ei_vnic[vnic->vn_instance] = NULL;
	ss->ei_vnic_pending = NULL;
	mutex_exit(&ss->ei_vnic_lock);

	if (progress & EIB_VNIC_BROADCAST_JOINED) {
		eib_vnic_leave_all_data_mcgs(ss, vnic);
	}

	if (progress & EIB_VNIC_KEEPALIVES_STARTED) {
		eib_rb_vnic_start_keepalives(ss, vnic);
	}

	if (progress & EIB_VNIC_ATTACHED_TO_CTL_MCGS) {
		eib_rb_vnic_attach_ctl_mcgs(ss, vnic);
	}

	if (progress & EIB_VNIC_LOGIN_DONE) {
		(void) eib_fip_logout(ss, vnic, &err);
	}

	if (progress & EIB_VNIC_DATAQP_CREATED) {
		eib_rb_data_create_qp(ss, vnic);
	}

	if (progress & EIB_VNIC_CTLQP_CREATED) {
		eib_rb_ctl_create_qp(ss, vnic);
	}
}

static void
eib_rb_vnic_attach_ctl_mcgs(eib_t *ss, eib_vnic_t *vnic)
{
	/*
	 * Detach from the vhub table and vhub update mcgs before blowing
	 * up vn_vhub_table and vn_vhub_update, since these are assumed to
	 * be available by the control cq handler.
	 */
	eib_rb_vnic_attach_vhub_table(ss, vnic);
	eib_rb_vnic_attach_vhub_update(ss, vnic);
	eib_vnic_fini_tables(ss, vnic, B_TRUE);
}

/*
 * Detach the control qp from the vHUB table mcg, leave the group and
 * free the mcg bookkeeping.  IBTF failures are logged and cleanup
 * continues (best-effort).
 */
/*ARGSUSED*/
static void
eib_rb_vnic_attach_vhub_table(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	eib_mcg_t *mcg;
	ibt_channel_hdl_t chan_hdl;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	/* Unhook the mcg from the channel under the lock, free it after */
	mutex_enter(&chan->ch_vhub_lock);
	chan_hdl = chan->ch_chan;
	mcg = chan->ch_vhub_table;
	chan->ch_vhub_table = NULL;
	mutex_exit(&chan->ch_vhub_lock);

	if (chan_hdl && mcg) {
		ret = ibt_detach_mcg(chan_hdl, mcg->mg_mcginfo);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_vnic_attach_vhub_table: "
			    "ibt_detach_mcg(chan_hdl=0x%llx, mcinfo=0x%llx, "
			    "mgid=%llx.%llx) failed, ret=%d", chan_hdl,
			    mcg->mg_mcginfo, mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, ret);
		}

		ret = ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_vnic_attach_vhub_table: "
			    "ibt_leave_mcg(mgid=%llx.%llx, jstate=0x%x) "
			    "failed, ret=%d", mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, mcg->mg_join_state, ret);
		}

		/* mcginfo came from ibt_query_mcg(); free it via IBTF */
		if (mcg->mg_mcginfo) {
			ibt_free_mcg_info(mcg->mg_mcginfo, 1);
		}
		kmem_free(mcg, sizeof (eib_mcg_t));
	}
}

/*
 * Detach the control qp from the vHUB update mcg, leave the group and
 * free the mcg bookkeeping.  Mirrors eib_rb_vnic_attach_vhub_table().
 */
/*ARGSUSED*/
static void
eib_rb_vnic_attach_vhub_update(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	eib_mcg_t *mcg;
	ibt_channel_hdl_t chan_hdl;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	/* Unhook the mcg from the channel under the lock, free it after */
	mutex_enter(&chan->ch_vhub_lock);
	chan_hdl = chan->ch_chan;
	mcg = chan->ch_vhub_update;
	chan->ch_vhub_update = NULL;
	mutex_exit(&chan->ch_vhub_lock);

	if (chan_hdl && mcg) {
		ret = ibt_detach_mcg(chan_hdl, mcg->mg_mcginfo);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_vnic_attach_vhub_update: "
			    "ibt_detach_mcg(chan_hdl=0x%llx, mcinfo=0x%llx, "
			    "mgid=%llx.%llx) failed, ret=%d", chan_hdl,
			    mcg->mg_mcginfo, mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, ret);
		}

		ret = ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_vnic_attach_vhub_update: "
			    "ibt_leave_mcg(mgid=%llx.%llx, jstate=0x%x) "
			    "failed, ret=%d", mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, mcg->mg_join_state, ret);
		}

		/* mcginfo came from ibt_query_mcg(); free it via IBTF */
		if (mcg->mg_mcginfo) {
			ibt_free_mcg_info(mcg->mg_mcginfo, 1);
		}
		kmem_free(mcg, sizeof (eib_mcg_t));
	}
}

/*
 * Remove this vnic's entry from the keepalives manager list (the inverse
 * of eib_vnic_start_keepalives()).
 */
/*ARGSUSED*/
static void
eib_rb_vnic_start_keepalives(eib_t *ss, eib_vnic_t *vnic)
{
	eib_ka_vnics_t *prev;
	eib_ka_vnics_t *elem;

	/*
	 * We only need to locate and remove the vnic entry from the
	 * keepalives manager list
	 */

	mutex_enter(&ss->ei_ka_vnics_lock);

	prev = NULL;
	for (elem = ss->ei_ka_vnics; elem; elem = elem->ka_next) {
		if (elem->ka_vnic == vnic)
			break;

		prev = elem;
	}
	if (elem == NULL) {
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_rb_vnic_start_keepalives: no keepalive element found "
		    "for vnic 0x%llx (vn_inst=%d) with keepalive manager",
		    vnic, vnic->vn_instance);
	} else {
		if (prev) {
			prev->ka_next = elem->ka_next;
		} else {
			ss->ei_ka_vnics = elem->ka_next;
		}
		kmem_free(elem, sizeof (eib_ka_vnics_t));
	}
	mutex_exit(&ss->ei_ka_vnics_lock);
}

/*
 * Undo a single eib_vnic_join_data_mcg(): find the mcg for "mcast_mac"
 * on the data channel, unlink it, detach the qp, leave the group and
 * free the bookkeeping.  Silently returns if the mac is not found.
 */
/*ARGSUSED*/
static void
eib_rb_vnic_join_data_mcg(eib_t *ss, eib_vnic_t *vnic, uint8_t *mcast_mac)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_mcg_t *prev;
	eib_mcg_t *mcg;
	ibt_status_t ret;

	/*
	 * Search our list and remove the item if found
	 */
	mutex_enter(&chan->ch_vhub_lock);

	prev = NULL;
	for (mcg = chan->ch_vhub_data; mcg != NULL; mcg = mcg->mg_next) {
		if (bcmp(mcg->mg_mac, mcast_mac, ETHERADDRL) == 0)
			break;
		prev = mcg;
	}

	if (mcg == NULL) {
		mutex_exit(&chan->ch_vhub_lock);
		return;
	}

	if (prev != NULL)
		prev->mg_next = mcg->mg_next;
	else
		chan->ch_vhub_data = mcg->mg_next;

	mcg->mg_next = NULL;

	mutex_exit(&chan->ch_vhub_lock);

	/*
	 * Detach data channel qp from the mcg, leave the group and free
	 * all associated stuff
	 */
	ret = ibt_detach_mcg(chan->ch_chan, mcg->mg_mcginfo);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_rb_vnic_join_data_mcg: "
		    "ibt_detach_mcg(chan_hdl=0x%llx, mcinfo=0x%llx, "
		    "mgid=%llx.%llx) failed, ret=%d", chan->ch_chan,
		    mcg->mg_mcginfo, mcg->mg_mgid.gid_prefix,
		    mcg->mg_mgid.gid_guid, ret);
	}

	ret = ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid, eib_reserved_gid,
	    mcg->mg_join_state);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_rb_vnic_join_data_mcg: "
		    "ibt_leave_mcg(mgid=%llx.%llx, jstate=0x%x) "
		    "failed, ret=%d", mcg->mg_mgid.gid_prefix,
		    mcg->mg_mgid.gid_guid, mcg->mg_join_state, ret);
	}

	if (mcg->mg_mcginfo)
		kmem_free(mcg->mg_mcginfo, sizeof (ibt_mcg_info_t));

	kmem_free(mcg, sizeof (eib_mcg_t));
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eibnx.conf Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,29 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# Configuration file for the EoIB nexus driver +# +name="eibnx" parent="ib" unit-address="0";
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/io/ib/clients/eoib/enx_ctl.c	Fri Aug 13 07:02:57 2010 -0400
@@ -0,0 +1,59 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/cred.h>
#include <sys/file.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>

#include <sys/ib/clients/eoib/enx_impl.h>

/*
 * Devctl cbops: open, close, ioctl
 *
 * These are intentionally empty stubs: the nexus driver exposes a devctl
 * node but implements no device control operations of its own, so each
 * entry point simply reports success.
 */

/* open(9E) stub for the devctl minor node; always succeeds */
/*ARGSUSED*/
int
eibnx_devctl_open(dev_t *devp, int flags, int otyp, cred_t *credp)
{
	return (0);
}

/* close(9E) stub for the devctl minor node; always succeeds */
/*ARGSUSED*/
int
eibnx_devctl_close(dev_t dev, int flags, int otyp, cred_t *credp)
{
	return (0);
}

/* ioctl(9E) stub; no commands are handled, everything returns success */
/*ARGSUSED*/
int
eibnx_devctl_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
    cred_t *cred_p, int *rval_p)
{
	return (0);
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_fip.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,605 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/byteorder.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +const char fip_vendor_mellanox[] = { + 0x4d, 0x65, 0x6c, 0x6c, 0x61, 0x6e, 0x6f, 0x78 +}; + +/* + * HW/FW workaround + * + * Verification of descriptor list length in the received packets is + * disabled, since experimentation shows that BX does not set the desc + * list length correctly. 
+ */ +int enx_wa_no_desc_list_len = 1; + +/* + * Static function declarations + */ +static int eibnx_fip_make_solicit_pkt(eibnx_thr_info_t *, eibnx_wqe_t *); +static int eibnx_fip_send_solicit_pkt(eibnx_thr_info_t *, eibnx_wqe_t *, + eibnx_gw_addr_t *); +static int eibnx_fip_parse_advt_pkt(uint8_t *, eibnx_gw_msg_t *); +static void eibnx_rb_fip_make_solicit_pkt(eibnx_wqe_t *); + +/* + * Prepare and send a solicit multicast packet to the All-EoIB-GWs-GID + */ +int +eibnx_fip_solicit_mcast(eibnx_thr_info_t *info) +{ + eibnx_wqe_t *swqe; + int ret; + + if ((swqe = eibnx_acquire_swqe(info, KM_SLEEP)) == NULL) + return (ENX_E_FAILURE); + + ret = eibnx_fip_make_solicit_pkt(info, swqe); + if (ret != ENX_E_SUCCESS) { + eibnx_release_swqe(swqe); + return (ENX_E_FAILURE); + } + + ret = eibnx_fip_send_solicit_pkt(info, swqe, NULL); + if (ret != ENX_E_SUCCESS) { + eibnx_rb_fip_make_solicit_pkt(swqe); + eibnx_release_swqe(swqe); + return (ENX_E_FAILURE); + } + + return (ENX_E_SUCCESS); +} + +/* + * Go through the list of already discovered gateways and send + * a unicast solicitation to each gateway. This is required by + * the EoIB specification ostensibly to receive updated + * advertisements. + */ +int +eibnx_fip_solicit_ucast(eibnx_thr_info_t *info, clock_t *solicit_period_ticks) +{ + eibnx_gw_info_t *gw; + eibnx_wqe_t *swqe; + clock_t min_solicit_period_msec; + int ret; + + /* + * We want to read the gwlist and send a unicast to each + * destination. Now, the only places where the gw list pointers + * are updated are when we're adding a new gw item to the list + * and when the list is being torn down and freed. + * + * Since new GWs are always inserted at the head of the list, + * we're guaranteed that any tail subchain of the list will + * not change by the addition of a new gw item coming into + * the list. + * + * Also, since the gw list is torn down only by the port-monitor + * thread (i.e. 
ourselves), we are also protected against the + * list itself going away while we're here. + * + * Given these two constraints, we can safely read the list + * of gateways without the gw list lock in this routine. + */ + min_solicit_period_msec = drv_hztousec(*solicit_period_ticks) / 1000; + for (gw = info->ti_gw; gw; gw = gw->gw_next) { + + if (eibnx_is_gw_dead(gw)) + continue; + + swqe = gw->gw_swqe; + ASSERT(swqe != NULL); + + mutex_enter(&swqe->qe_lock); + if (swqe->qe_type != ENX_QETYP_SWQE) { + ENX_DPRINTF_DEBUG("eibnx_fip_solicit_ucast: " + "gw wqe type (0x%lx) indicates this is not an " + "swqe!, cannot send solicitation to gw", + swqe->qe_type); + mutex_exit(&swqe->qe_lock); + continue; + } else if ((swqe->qe_flags & ENX_QEFL_INUSE) != + ENX_QEFL_INUSE) { + ENX_DPRINTF_DEBUG("eibnx_fip_solicit_ucast: " + "gw swqe flags (0x%lx) indicate swqe is free!, " + "cannot send solicitation to gw", swqe->qe_flags); + mutex_exit(&swqe->qe_lock); + continue; + } else if ((swqe->qe_flags & ENX_QEFL_POSTED) == + ENX_QEFL_POSTED) { + ENX_DPRINTF_DEBUG("eibnx_fip_solicit_ucast: gw swqe " + "flags (0x%lx) indicate swqe is still with HCA!, " + "cannot send solicitation to gw", swqe->qe_flags); + mutex_exit(&swqe->qe_lock); + continue; + } + mutex_exit(&swqe->qe_lock); + + /* + * EoIB spec requires that each host send solicitation + * to discovered gateways atleast every 4 * GW_ADV_PERIOD. + * We make sure we send a solicitation to all gateways + * every 4 * GW_ADV_PERIOD of the smallest value of + * GW_ADV_PERIOD that we have in our gw list. 
+ */ + if ((gw->gw_adv_period * 4) < min_solicit_period_msec) + min_solicit_period_msec = gw->gw_adv_period * 4; + + ret = eibnx_fip_make_solicit_pkt(info, swqe); + if (ret != ENX_E_SUCCESS) + continue; + + ret = eibnx_fip_send_solicit_pkt(info, swqe, &gw->gw_addr); + if (ret != ENX_E_SUCCESS) + eibnx_rb_fip_make_solicit_pkt(swqe); + } + + *solicit_period_ticks = drv_usectohz(min_solicit_period_msec * 1000); + + return (ENX_E_SUCCESS); +} + +/* + * Given a send wqe and an eibnx_thr_info_t pointer, fill in the + * send buffer with a solicit packet in the network byte order. + */ +static int +eibnx_fip_make_solicit_pkt(eibnx_thr_info_t *info, eibnx_wqe_t *swqe) +{ + fip_solicit_t *solicit; + fip_proto_t *proto; + fip_basic_hdr_t *hdr; + fip_desc_iba_t *iba; + ib_gid_t port_gid; + ib_guid_t port_guid; + + uint8_t *pkt = (uint8_t *)(uintptr_t)(swqe->qe_sgl.ds_va); + uint_t pktsz = swqe->qe_sgl.ds_len; + uint_t solicit_sz = sizeof (fip_solicit_t); + + if (pktsz < solicit_sz) { + ENX_DPRINTF_ERR("swqe bufsize too small for pkt, " + "pktsz=%x < expsz=%x", pktsz, solicit_sz); + return (ENX_E_FAILURE); + } + + /* + * Lint complains that there may be an alignment issue here, + * but we know that the "pkt" is atleast double-word aligned, + * so it's ok. 
+ */ + solicit = (fip_solicit_t *)pkt; + + /* + * Fill in the FIP protocol version + */ + proto = &solicit->sl_proto_version; + proto->pr_version = FIP_PROTO_VERSION; + + /* + * Fill in the basic header + */ + hdr = &solicit->sl_fip_hdr; + hdr->hd_opcode = htons(FIP_OPCODE_EOIB); + hdr->hd_subcode = FIP_SUBCODE_H_SOLICIT; + hdr->hd_desc_list_len = htons((solicit_sz >> 2) - 2); + hdr->hd_flags = 0; + hdr->hd_type = FIP_DESC_TYPE_VENDOR_ID; + hdr->hd_len = FIP_DESC_LEN_VENDOR_ID; + bcopy(fip_vendor_mellanox, hdr->hd_vendor_id, FIP_VENDOR_LEN); + + /* + * Fill in the Infiniband Address descriptor + */ + iba = &solicit->sl_iba; + iba->ia_type = FIP_DESC_TYPE_IBA; + iba->ia_len = FIP_DESC_LEN_IBA; + bcopy(fip_vendor_mellanox, iba->ia_vendor_id, FIP_VENDOR_LEN); + iba->ia_qpn = htonl(info->ti_qpn); + iba->ia_sl_portid = 0; + iba->ia_lid = htons(info->ti_pi->p_base_lid); + port_gid = info->ti_pi->p_sgid_tbl[0]; + port_guid = htonll(port_gid.gid_guid); + bcopy(&port_guid, iba->ia_guid, FIP_GUID_LEN); + + /* + * Adjust the ds_len in the sgl to indicate the size of the + * solicit pkt before returning + */ + swqe->qe_sgl.ds_len = solicit_sz; + + return (ENX_E_SUCCESS); +} + +static int +eibnx_setup_ud_dest(eibnx_thr_info_t *info, eibnx_wqe_t *swqe, + eibnx_gw_addr_t *gw_addr) +{ + eibnx_t *ss = enx_global_ss; + ibt_path_attr_t attr; + ibt_path_info_t path; + ibt_status_t ret; + + /* + * If this a multicast send, we'll have the gateway address NULL, + * and we'll need to modify the UD destination to send to the + * solicit mcg. 
+ */ + if (gw_addr == NULL) { + ret = ibt_modify_ud_dest(swqe->qe_wr.send.wr.ud.udwr_dest, + info->ti_solicit_mcg->mc_qkey, IB_MC_QPN, + &info->ti_solicit_mcg->mc_adds_vect); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_modify_ud_dest() failed with " + "ret=%d, qkey=%x, qpn=%x", ret, + info->ti_solicit_mcg->mc_qkey, IB_MC_QPN); + return (ENX_E_FAILURE); + } + + return (ENX_E_SUCCESS); + } + + /* + * If this is a unicast send, but we already have the gw address + * vector, the ud destination handle has already been set up for + * this gateway, so we can return. + */ + if (gw_addr->ga_vect) + return (ENX_E_SUCCESS); + + /* + * Get the reversible path information for this gateway + */ + bzero(&attr, sizeof (ibt_path_info_t)); + attr.pa_dgids = &gw_addr->ga_gid; + attr.pa_num_dgids = 1; + attr.pa_sgid = info->ti_pi->p_sgid_tbl[0]; + attr.pa_pkey = gw_addr->ga_pkey; + + bzero(&path, sizeof (ibt_path_info_t)); + ret = ibt_get_paths(ss->nx_ibt_hdl, IBT_PATH_PKEY, + &attr, 1, &path, NULL); + if ((ret != IBT_SUCCESS) || (path.pi_hca_guid == 0)) { + ENX_DPRINTF_ERR("ibt_get_paths() failed with " + "ret=%d, gid_prefix=%llx, gid_guid=%llx", ret, + gw_addr->ga_gid.gid_prefix, gw_addr->ga_gid.gid_guid); + return (ENX_E_FAILURE); + } + + /* + * And save the address vector + */ + gw_addr->ga_vect = kmem_zalloc(sizeof (ibt_adds_vect_t), KM_SLEEP); + bcopy(&path.pi_prim_cep_path.cep_adds_vect, gw_addr->ga_vect, + sizeof (ibt_adds_vect_t)); + + /* + * Modify the UD destination handle on this swqe entry to address + * this gateway + */ + ret = ibt_modify_ud_dest(swqe->qe_wr.send.wr.ud.udwr_dest, + gw_addr->ga_qkey, gw_addr->ga_qpn, gw_addr->ga_vect); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_modify_ud_dest() failed with " + "ret=%d, qkey=%x, qpn=%x", ret, gw_addr->ga_qkey, + gw_addr->ga_qpn); + kmem_free(gw_addr->ga_vect, sizeof (ibt_adds_vect_t)); + gw_addr->ga_vect = NULL; + return (ENX_E_FAILURE); + } + + return (ENX_E_SUCCESS); +} + +/* + * Send a solicit packet to 
the appropriate destination: if the + * destination gw addr is specified, send a unicast message to it; + * if not, send a multicast using the solicit mcg address. + */ +static int +eibnx_fip_send_solicit_pkt(eibnx_thr_info_t *info, eibnx_wqe_t *swqe, + eibnx_gw_addr_t *gw_addr) +{ + ibt_status_t ret; + + if (eibnx_setup_ud_dest(info, swqe, gw_addr) != ENX_E_SUCCESS) + return (ENX_E_FAILURE); + + mutex_enter(&swqe->qe_lock); + + /* + * Note that if the post send fails, we don't really need to undo + * anything we did in setting up the ud destination; we can always + * use it for the next time. + */ + ret = ibt_post_send(info->ti_chan, &(swqe->qe_wr.send), 1, NULL); + if (ret != IBT_SUCCESS) { + mutex_exit(&swqe->qe_lock); + ENX_DPRINTF_ERR("ibt_post_send() failed for solicit, " + "ret=%d", ret); + return (ENX_E_FAILURE); + } + + /* + * Set the 'posted' flag for the send wqe. If this is an unicast + * send, the wqe is attached to a specific gw entry and we should + * not release the wqe back to the pool on the send completion. + */ + swqe->qe_flags |= ENX_QEFL_POSTED; + if (gw_addr == NULL) { + swqe->qe_flags |= ENX_QEFL_RELONCOMP; + info->ti_mcast_done = 1; + } + + mutex_exit(&swqe->qe_lock); + + return (ENX_E_SUCCESS); +} + +/* + * Parse a received packet from the gateway into the + * eibnx_gw_msg_t argument. Note that at this point, this + * driver only expects to receive advertisements from the + * GW, nothing else. + */ +int +eibnx_fip_parse_pkt(uint8_t *pkt, eibnx_gw_msg_t *msg) +{ + fip_basic_hdr_t *hdr; + uint16_t opcode; + uint8_t subcode; + int ret = ENX_E_FAILURE; + + /* + * Lint complains about potential alignment problem here, + * but the fip_* structures are all packed and each of them + * is aligned on a word boundary, so we're ok. 
+ */ + hdr = (fip_basic_hdr_t *)(pkt + sizeof (fip_proto_t)); + + /* + * Verify that the opcode is EoIB + */ + if ((opcode = ntohs(hdr->hd_opcode)) != FIP_OPCODE_EOIB) { + ENX_DPRINTF_WARN("unsupported opcode (%x) found in " + "gw advertisement, ignoring", opcode); + return (ENX_E_FAILURE); + } + + /* + * We only handle GW advertisements in the eibnx driver code. However, + * the BridgeX gateway software currently sends login acknowledgements + * to the one who did the solicitation instead of the one who actually + * made the login request, so we need to do something about this as + * well. + */ + subcode = hdr->hd_subcode; + switch (subcode) { + case FIP_SUBCODE_G_ADVERTISE: + ret = eibnx_fip_parse_advt_pkt(pkt, msg); + break; + + case FIP_SUBCODE_G_VNIC_LOGIN_ACK: + msg->gm_type = FIP_VNIC_LOGIN_ACK; + ret = ENX_E_SUCCESS; + break; + + default: + ENX_DPRINTF_WARN("unsupported subcode (%x) found in " + "gw advertisement, ignoring", subcode); + ret = ENX_E_FAILURE; + break; + } + + return (ret); +} + +/* + * Parse and validate a packet known to be an advertisement from + * the GW. + */ +static int +eibnx_fip_parse_advt_pkt(uint8_t *pkt, eibnx_gw_msg_t *msg) +{ + fip_advertise_t *advertise; + fip_basic_hdr_t *hdr; + fip_desc_iba_t *desc_iba; + fip_desc_gwinfo_t *desc_gwinfo; + fip_desc_gwid_t *desc_gwid; + fip_desc_keepalive_t *desc_ka; + eibnx_gw_info_t *gwi; + ib_guid_t guid; + uint16_t rss_qpn_num_net_vnics; + uint16_t sl_portid; + uint16_t flags; + + /* + * Lint complains about potential alignment problem here, + * but we know that "pkt" is always atleast double-word + * aligned when it's passed to us, so we're ok. + */ + advertise = (fip_advertise_t *)pkt; + + /* + * Verify if the descriptor list length in the received + * packet is valid. Currently disabled. + * + * Experimentation shows that BX doesn't set the desc list + * length correctly, so we also simply ignore it and move + * on. 
If and when BX fixes this problem, we'll need to + * enable the warning+failure below. + */ + hdr = &(advertise->ad_fip_header); + if (!enx_wa_no_desc_list_len) { + uint_t pkt_data_sz; + + pkt_data_sz = (ntohs(hdr->hd_desc_list_len) + 2) << 2; + if (pkt_data_sz < sizeof (fip_advertise_t)) { + ENX_DPRINTF_WARN("advertisement from gw too small; " + "expected %x, got %x", sizeof (fip_advertise_t), + pkt_data_sz); + return (ENX_E_FAILURE); + } + } + + /* + * Validate all the header and descriptor types and lengths + */ + + if (hdr->hd_type != FIP_DESC_TYPE_VENDOR_ID || + hdr->hd_len != FIP_DESC_LEN_VENDOR_ID) { + ENX_DPRINTF_WARN("invalid type/len in fip basic header; " + "expected (%x,%x), got (%x,%x)", FIP_DESC_TYPE_VENDOR_ID, + FIP_DESC_LEN_VENDOR_ID, hdr->hd_type, hdr->hd_len); + return (ENX_E_FAILURE); + } + + desc_iba = &(advertise->ad_iba); + if (desc_iba->ia_type != FIP_DESC_TYPE_IBA || + desc_iba->ia_len != FIP_DESC_LEN_IBA) { + ENX_DPRINTF_WARN("invalid type/len in fip iba desc; " + "expected (%x,%x), got (%x,%x)", FIP_DESC_TYPE_IBA, + FIP_DESC_LEN_IBA, desc_iba->ia_type, desc_iba->ia_len); + return (ENX_E_FAILURE); + } + + desc_gwinfo = &(advertise->ad_gwinfo); + if (desc_gwinfo->gi_type != FIP_DESC_TYPE_EOIB_GW_INFO || + desc_gwinfo->gi_len != FIP_DESC_LEN_EOIB_GW_INFO) { + ENX_DPRINTF_WARN("invalid type/len in fip gwinfo desc; " + "expected (%x,%x), got (%x,%x)", + FIP_DESC_TYPE_EOIB_GW_INFO, FIP_DESC_LEN_EOIB_GW_INFO, + desc_gwinfo->gi_type, desc_gwinfo->gi_len); + return (ENX_E_FAILURE); + } + + desc_gwid = &(advertise->ad_gwid); + if (desc_gwid->id_type != FIP_DESC_TYPE_GW_ID || + desc_gwid->id_len != FIP_DESC_LEN_GW_ID) { + ENX_DPRINTF_WARN("invalid type/len in fip gwid desc; " + "expected (%x,%x), got (%x,%x)", + FIP_DESC_TYPE_GW_ID, FIP_DESC_LEN_GW_ID, + desc_gwid->id_type, desc_gwid->id_len); + return (ENX_E_FAILURE); + } + + desc_ka = &(advertise->ad_keep_alive); + if (desc_ka->ka_type != FIP_DESC_TYPE_KEEP_ALIVE || + desc_ka->ka_len != 
FIP_DESC_LEN_KEEP_ALIVE) { + ENX_DPRINTF_WARN("invalid type/len in fip ka desc; " + "expected (%x,%x), got (%x,%x)", + FIP_DESC_TYPE_KEEP_ALIVE, FIP_DESC_LEN_KEEP_ALIVE, + desc_ka->ka_type, desc_ka->ka_len); + return (ENX_E_FAILURE); + } + + /* + * Record if the gw is available for login ('A' bit in the header) + */ + flags = ntohs(hdr->hd_flags); + gwi = &(msg->u.gm_info); + gwi->gw_flag_available = (flags & FIP_BHFLAG_GWAVAIL) ? 1 : 0; + + /* + * Record if this was in response to a solicit request (unicast + * advertisement) or not ('S' bit in the header) + */ + gwi->gw_flag_ucast_advt = (flags & FIP_BHFLAG_SLCTMSG) ? 1 : 0; + msg->gm_type = (gwi->gw_flag_ucast_advt) ? + FIP_GW_ADVERTISE_UCAST : FIP_GW_ADVERTISE_MCAST; + + /* + * Record all info from the Infiniband Address descriptor + */ + gwi->gw_ctrl_qpn = (ntohl(desc_iba->ia_qpn) & FIP_IBA_QPN_MASK); + + sl_portid = ntohs(desc_iba->ia_sl_portid); + gwi->gw_portid = (sl_portid & FIP_IBA_PORTID_MASK); + gwi->gw_sl = ((sl_portid & FIP_IBA_SL_MASK) >> FIP_IBA_SL_SHIFT); + + gwi->gw_lid = ntohs(desc_iba->ia_lid); + + bcopy(desc_iba->ia_guid, &guid, sizeof (ib_guid_t)); + gwi->gw_guid = ntohll(guid); + + /* + * Record all info from the EoIB GW Information descriptor + */ + if (desc_gwinfo->gi_flags & FIP_GWI_HOST_ADMIND_VNICS_MASK) + gwi->gw_is_host_adm_vnics = 1; + else + gwi->gw_is_host_adm_vnics = 0; + + rss_qpn_num_net_vnics = ntohs(desc_gwinfo->gi_rss_qpn_num_net_vnics); + gwi->gw_num_net_vnics = (rss_qpn_num_net_vnics & + FIP_GWI_NUM_NET_VNICS_MASK); + gwi->gw_n_rss_qpn = ((rss_qpn_num_net_vnics & + FIP_GWI_RSS_QPN_MASK) >> FIP_GWI_RSS_QPN_SHIFT); + bcopy(desc_gwinfo->gi_vendor_id, gwi->gw_vendor_id, FIP_VENDOR_LEN); + (gwi->gw_vendor_id)[FIP_VENDOR_LEN] = '\0'; + + /* + * Record all info from the Gateway Identifier descriptor + */ + bcopy(desc_gwid->id_guid, &guid, sizeof (ib_guid_t)); + gwi->gw_system_guid = ntohll(guid); + bcopy(desc_gwid->id_sysname, gwi->gw_system_name, FIP_SYSNAME_LEN); + 
(gwi->gw_system_name)[FIP_SYSNAME_LEN] = '\0'; + bcopy(desc_gwid->id_portname, gwi->gw_port_name, FIP_PORTNAME_LEN); + (gwi->gw_port_name)[FIP_PORTNAME_LEN] = '\0'; + + /* + * Record all info from the Keep Alive descriptor + */ + gwi->gw_adv_period = ntohl(desc_ka->ka_gw_adv_period); + gwi->gw_ka_period = ntohl(desc_ka->ka_gw_ka_period); + gwi->gw_vnic_ka_period = ntohl(desc_ka->ka_vnic_ka_period); + + gwi->gw_next = NULL; + + return (ENX_E_SUCCESS); +} + +/* + * Rollback whatever we did for making a solicit packet + */ +static void +eibnx_rb_fip_make_solicit_pkt(eibnx_wqe_t *swqe) +{ + uint8_t *pkt = (uint8_t *)(uintptr_t)(swqe->qe_sgl.ds_va); + + bzero(pkt, sizeof (fip_solicit_t)); + swqe->qe_sgl.ds_len = swqe->qe_bufsz; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_hdlrs.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/ksynch.h> +#include <sys/callb.h> +#include <sys/ib/mgt/sm_attr.h> /* SM_INIT_TYPE_REPLY_... 
*/ + +#include <sys/ib/clients/eoib/enx_impl.h> + +/* + * Static function declarations + */ +static void eibnx_gw_is_alive(eibnx_gw_info_t *); +static void eibnx_gw_is_aware(eibnx_thr_info_t *, eibnx_gw_info_t *, boolean_t); +static void eibnx_process_rx(eibnx_thr_info_t *, ibt_wc_t *, eibnx_wqe_t *); +static void eibnx_handle_wcerr(uint8_t, eibnx_wqe_t *, eibnx_thr_info_t *); +static void eibnx_handle_login_ack(eibnx_thr_info_t *, uint8_t *); +static void eibnx_handle_gw_rebirth(eibnx_thr_info_t *, uint16_t); +static void eibnx_handle_gw_info_update(eibnx_thr_info_t *, uint16_t, void *); +static int eibnx_replace_portinfo(eibnx_thr_info_t *, ibt_hca_portinfo_t *, + uint_t); +static void eibnx_handle_port_events(ibt_hca_hdl_t, uint8_t); +static void eibnx_handle_hca_attach(ib_guid_t); +static void eibnx_handle_hca_detach(ib_guid_t); + +/* + * NDI event handle we need + */ +extern ndi_event_hdl_t enx_ndi_event_hdl; + +/* + * SM's init type reply flags + */ +#define ENX_PORT_ATTR_LOADED(itr) \ + (((itr) & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) +#define ENX_PORT_ATTR_NOT_PRESERVED(itr) \ + (((itr) & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) +#define ENX_PORT_PRES_NOT_PRESERVED(itr) \ + (((itr) & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 0) + +/* + * Port monitor progress flags (all flag values should be non-zero) + */ +#define ENX_MON_LINKSTATE_UP 0x01 +#define ENX_MON_FOUND_MCGS 0x02 +#define ENX_MON_SETUP_CQ 0x04 +#define ENX_MON_SETUP_UD_CHAN 0x08 +#define ENX_MON_SETUP_BUFS 0x10 +#define ENX_MON_SETUP_CQ_HDLR 0x20 +#define ENX_MON_JOINED_MCGS 0x40 +#define ENX_MON_MULTICAST_SLCT 0x80 +#define ENX_MON_MAX 0xFF + +/* + * Per-port thread to solicit, monitor and discover EoIB gateways + * and create the corresponding EoIB driver instances on the host. 
+ */ +void +eibnx_port_monitor(eibnx_thr_info_t *info) +{ + clock_t solicit_period_ticks; + clock_t deadline; + kmutex_t ci_lock; + callb_cpr_t ci; + char thr_name[MAXNAMELEN]; + + (void) snprintf(thr_name, MAXNAMELEN, ENX_PORT_MONITOR, + info->ti_pi->p_port_num); + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, thr_name); + + info->ti_progress = 0; + + /* + * If the port is not active yet, wait for a port up event. The + * async handler, when it sees a port-up event, is expected to + * update the port_monitor's portinfo structure's p_linkstate + * and wake us up with ENX_EVENT_LINK_UP. + */ + while (info->ti_pi->p_linkstate != IBT_PORT_ACTIVE) { + mutex_enter(&info->ti_event_lock); + while ((info->ti_event & + (ENX_EVENT_LINK_UP | ENX_EVENT_DIE)) == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&info->ti_event_cv, &info->ti_event_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + if (info->ti_event & ENX_EVENT_DIE) { + mutex_exit(&info->ti_event_lock); + goto port_monitor_exit; + } + info->ti_event &= (~ENX_EVENT_LINK_UP); + mutex_exit(&info->ti_event_lock); + } + info->ti_progress |= ENX_MON_LINKSTATE_UP; + + /* + * Locate the multicast groups for sending solicit requests + * to the GW and receiving advertisements from the GW. If + * either of the mcg is not present, wait for them to be + * created by the GW. 
+ */ + while (eibnx_find_mgroups(info) != ENX_E_SUCCESS) { + mutex_enter(&info->ti_event_lock); + while ((info->ti_event & + (ENX_EVENT_MCGS_AVAILABLE | ENX_EVENT_DIE)) == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&info->ti_event_cv, &info->ti_event_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + if (info->ti_event & ENX_EVENT_DIE) { + mutex_exit(&info->ti_event_lock); + goto port_monitor_exit; + } + info->ti_event &= (~ENX_EVENT_MCGS_AVAILABLE); + mutex_exit(&info->ti_event_lock); + } + info->ti_progress |= ENX_MON_FOUND_MCGS; + + /* + * Setup a shared CQ + */ + if (eibnx_setup_cq(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_setup_cq() failed, terminating " + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_SETUP_CQ; + + /* + * Setup UD channel + */ + if (eibnx_setup_ud_channel(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_setup_ud_channel() failed, terminating " + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_SETUP_UD_CHAN; + + /* + * Allocate/initialize any tx/rx buffers + */ + if (eibnx_setup_bufs(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_setup_bufs() failed, terminating " + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_SETUP_BUFS; + + /* + * Setup completion handler + */ + if (eibnx_setup_cq_handler(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_setup_cq_handler() failed, terminating " + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_SETUP_CQ_HDLR; + + /* + * Join EoIB multicast groups 
+ */ + if (eibnx_join_mcgs(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_join_mcgs() failed, terminating ", + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_JOINED_MCGS; + + /* + * Send SOLICIT pkt to the EoIB multicast group + */ + if (eibnx_fip_solicit_mcast(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_fip_solicit_mcast() failed, terminating " + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_MULTICAST_SLCT; + + mutex_enter(&info->ti_event_lock); + + solicit_period_ticks = drv_usectohz(ENX_DFL_SOLICIT_PERIOD_USEC); + +periodic_solicit: + deadline = ddi_get_lbolt() + solicit_period_ticks; + while ((info->ti_event & (ENX_EVENT_TIMED_OUT | ENX_EVENT_DIE)) == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + if (cv_timedwait(&info->ti_event_cv, &info->ti_event_lock, + deadline) == -1) { + info->ti_event |= ENX_EVENT_TIMED_OUT; + } + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + if (info->ti_event & ENX_EVENT_DIE) { + mutex_exit(&info->ti_event_lock); + goto port_monitor_exit; + } + + if (info->ti_event & ENX_EVENT_TIMED_OUT) { + if (eibnx_fip_solicit_ucast(info, + &solicit_period_ticks) != ENX_E_SUCCESS) { + ENX_DPRINTF_WARN("failed to send solicit ucast to " + "gateways (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + } + info->ti_event &= ~ENX_EVENT_TIMED_OUT; + } + + goto periodic_solicit; + +port_monitor_exit: + if (info->ti_progress & ENX_MON_MULTICAST_SLCT) { + eibnx_cleanup_port_nodes(info); + info->ti_progress &= (~ENX_MON_MULTICAST_SLCT); + } + if (info->ti_progress & ENX_MON_JOINED_MCGS) { + eibnx_rb_join_mcgs(info); + info->ti_progress &= (~ENX_MON_JOINED_MCGS); + } + if (info->ti_progress & 
ENX_MON_SETUP_CQ_HDLR) { + eibnx_rb_setup_cq_handler(info); + info->ti_progress &= (~ENX_MON_SETUP_CQ_HDLR); + } + if (info->ti_progress & ENX_MON_SETUP_BUFS) { + eibnx_rb_setup_bufs(info); + info->ti_progress &= (~ENX_MON_SETUP_BUFS); + } + if (info->ti_progress & ENX_MON_SETUP_UD_CHAN) { + eibnx_rb_setup_ud_channel(info); + info->ti_progress &= (~ENX_MON_SETUP_UD_CHAN); + } + if (info->ti_progress & ENX_MON_SETUP_CQ) { + eibnx_rb_setup_cq(info); + info->ti_progress &= (~ENX_MON_SETUP_CQ); + } + if (info->ti_progress & ENX_MON_FOUND_MCGS) { + eibnx_rb_find_mgroups(info); + info->ti_progress &= (~ENX_MON_FOUND_MCGS); + } + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); +} + +/* + * Async subnet notices handler registered with IBTF + */ +/*ARGSUSED*/ +void +eibnx_subnet_notices_handler(void *arg, ib_gid_t gid, + ibt_subnet_event_code_t sn_evcode, ibt_subnet_event_t *sn_event) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + ib_gid_t notice_gid; + + switch (sn_evcode) { + case IBT_SM_EVENT_MCG_CREATED: + notice_gid = sn_event->sm_notice_gid; + + if ((notice_gid.gid_prefix == enx_solicit_mgid.gid_prefix && + notice_gid.gid_guid == enx_solicit_mgid.gid_guid) || + (notice_gid.gid_prefix == enx_advertise_mgid.gid_prefix && + notice_gid.gid_guid == enx_advertise_mgid.gid_guid)) { + + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + mutex_enter(&ti->ti_event_lock); + ti->ti_event |= ENX_EVENT_MCGS_AVAILABLE; + cv_broadcast(&ti->ti_event_cv); + mutex_exit(&ti->ti_event_lock); + } + mutex_exit(&ss->nx_lock); + } + break; + + case IBT_SM_EVENT_MCG_DELETED: + break; + + default: + break; + } +} + +/* + * Async event handler registered with IBTF + */ +/*ARGSUSED*/ +void +eibnx_async_handler(void *clnt_pvt, ibt_hca_hdl_t hca, + ibt_async_code_t code, ibt_async_event_t *event) +{ + switch (code) { + case IBT_ERROR_CATASTROPHIC_CHAN: + case IBT_ERROR_INVALID_REQUEST_CHAN: + case 
IBT_ERROR_ACCESS_VIOLATION_CHAN: + case IBT_ERROR_CQ: + case IBT_ERROR_CATASTROPHIC_SRQ: + ENX_DPRINTF_ERR("ibt ERROR event 0x%x received " + "(hca_guid=0x%llx)", code, event->ev_hca_guid); + break; + + case IBT_ERROR_PORT_DOWN: + ENX_DPRINTF_WARN("ibt PORT_DOWN event received " + "(hca_guid=0x%llx, port_num=0x%x)", + event->ev_hca_guid, event->ev_port); + break; + + case IBT_EVENT_PORT_UP: + ENX_DPRINTF_WARN("ibt PORT_UP event received " + "(hca_guid=0x%llx, port_num=0x%x)", + event->ev_hca_guid, event->ev_port); + eibnx_handle_port_events(hca, event->ev_port); + break; + + case IBT_PORT_CHANGE_EVENT: + ENX_DPRINTF_WARN("ibt PORT_CHANGE event received " + "(hca_guid=0x%llx, port_num=0x%x)", + event->ev_hca_guid, event->ev_port); + eibnx_handle_port_events(hca, event->ev_port); + break; + + case IBT_CLNT_REREG_EVENT: + ENX_DPRINTF_WARN("ibt CLNT_REREG event received " + "(hca_guid=0x%llx, port_num=0x%x)", + event->ev_hca_guid, event->ev_port); + eibnx_handle_port_events(hca, event->ev_port); + break; + + case IBT_HCA_ATTACH_EVENT: + ENX_DPRINTF_VERBOSE("ibt HCA_ATTACH event received " + "(new hca_guid=0x%llx)", event->ev_hca_guid); + eibnx_handle_hca_attach(event->ev_hca_guid); + break; + + case IBT_HCA_DETACH_EVENT: + ENX_DPRINTF_VERBOSE("ibt HCA_DETACH event received " + "(target hca_guid=0x%llx)", event->ev_hca_guid); + eibnx_handle_hca_detach(event->ev_hca_guid); + break; + + default: + ENX_DPRINTF_VERBOSE("ibt UNSUPPORTED event 0x%x received " + "(hca_guid=0x%llx)", code, event->ev_hca_guid); + break; + } +} + +boolean_t +eibnx_is_gw_dead(eibnx_gw_info_t *gwi) +{ + int64_t cur_lbolt; + + cur_lbolt = ddi_get_lbolt64(); + + mutex_enter(&gwi->gw_adv_lock); + if ((cur_lbolt - gwi->gw_adv_last_lbolt) > gwi->gw_adv_timeout_ticks) { + gwi->gw_adv_flag = ENX_GW_DEAD; + mutex_exit(&gwi->gw_adv_lock); + return (B_TRUE); + } + mutex_exit(&gwi->gw_adv_lock); + + return (B_FALSE); +} + +static void +eibnx_gw_is_alive(eibnx_gw_info_t *gwi) +{ + /* + * We've just received a 
multicast advertisement from this + * gateway. Multicast or unicast, this means that the gateway + * is alive. Record this timestamp (in ticks). + */ + mutex_enter(&gwi->gw_adv_lock); + gwi->gw_adv_last_lbolt = ddi_get_lbolt64(); + if (gwi->gw_adv_flag == ENX_GW_DEAD) { + gwi->gw_adv_flag = ENX_GW_ALIVE; + } + mutex_exit(&gwi->gw_adv_lock); +} + +static void +eibnx_gw_is_aware(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi, + boolean_t gwi_changed) +{ + eib_gw_info_t eib_gwi; + boolean_t post_rebirth_event = B_FALSE; + + /* + * We're here when we receive a unicast advertisement from a + * gateway. If this gateway was discovered earlier but was in + * a dead state, this means it has come back alive and become + * aware of us. We may need to inform any EoIB children + * waiting for notification. Note that if this gateway is + * being discovered for the first time now, we wouldn't have + * created the binding eoib node for it (we will do that when + * we return from this routine), so the "rebirth" and "gw info + * update" event postings will be NOPs. + */ + mutex_enter(&gwi->gw_adv_lock); + gwi->gw_adv_last_lbolt = ddi_get_lbolt64(); + if (gwi->gw_adv_flag != ENX_GW_AWARE) { + post_rebirth_event = B_TRUE; + } + gwi->gw_adv_flag = ENX_GW_AWARE; + mutex_exit(&gwi->gw_adv_lock); + + /* + * If we have a gateway information update event, we post that + * first, so any rebirth event processed later will have the + * correct gateway information. 
+ */
+	if (gwi_changed) {
+		eib_gwi.gi_system_guid = gwi->gw_system_guid;
+		eib_gwi.gi_guid = gwi->gw_guid;
+		eib_gwi.gi_sn_prefix = gwi->gw_addr.ga_gid.gid_prefix;
+		eib_gwi.gi_adv_period = gwi->gw_adv_period;
+		eib_gwi.gi_ka_period = gwi->gw_ka_period;
+		eib_gwi.gi_vnic_ka_period = gwi->gw_vnic_ka_period;
+		eib_gwi.gi_ctrl_qpn = gwi->gw_ctrl_qpn;
+		eib_gwi.gi_lid = gwi->gw_lid;
+		eib_gwi.gi_portid = gwi->gw_portid;
+		eib_gwi.gi_num_net_vnics = gwi->gw_num_net_vnics;
+		eib_gwi.gi_flag_available = gwi->gw_flag_available;
+		eib_gwi.gi_is_host_adm_vnics = gwi->gw_is_host_adm_vnics;
+		eib_gwi.gi_sl = gwi->gw_sl;
+		eib_gwi.gi_n_rss_qpn = gwi->gw_n_rss_qpn;
+		bcopy(gwi->gw_system_name, eib_gwi.gi_system_name,
+		    EIB_GW_SYSNAME_LEN);
+		bcopy(gwi->gw_port_name, eib_gwi.gi_port_name,
+		    EIB_GW_PORTNAME_LEN);
+		bcopy(gwi->gw_vendor_id, eib_gwi.gi_vendor_id,
+		    EIB_GW_VENDOR_LEN);
+
+		eibnx_handle_gw_info_update(info, eib_gwi.gi_portid, &eib_gwi);
+	}
+	if (post_rebirth_event) {
+		eibnx_handle_gw_rebirth(info, gwi->gw_portid);
+	}
+}
+
+/*
+ * Thread to create eoib nodes and online instances
+ */
+void
+eibnx_create_eoib_node(void)
+{
+	eibnx_t *ss = enx_global_ss;
+	eibnx_nodeq_t *node;
+	kmutex_t ci_lock;
+	callb_cpr_t ci;
+
+	mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
+	CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, ENX_NODE_CREATOR);
+
+wait_for_node_to_create:
+	mutex_enter(&ss->nx_nodeq_lock);
+
+	while ((ss->nx_nodeq == NULL) && (ss->nx_nodeq_thr_die == 0)) {
+		mutex_enter(&ci_lock);
+		CALLB_CPR_SAFE_BEGIN(&ci);
+		mutex_exit(&ci_lock);
+
+		cv_wait(&ss->nx_nodeq_cv, &ss->nx_nodeq_lock);
+
+		mutex_enter(&ci_lock);
+		CALLB_CPR_SAFE_END(&ci, &ci_lock);
+		mutex_exit(&ci_lock);
+	}
+
+	/*
+	 * If this is not really a work item, but a request for us to
+	 * die, throw away all pending work requests and just die. 
+ */ + if (ss->nx_nodeq_thr_die) { + while (ss->nx_nodeq) { + node = ss->nx_nodeq; + ss->nx_nodeq = node->nc_next; + node->nc_next = NULL; + + kmem_free(node, sizeof (eibnx_nodeq_t)); + } + mutex_exit(&ss->nx_nodeq_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + /* + * Grab the first node entry from the queue + */ + ASSERT(ss->nx_nodeq != NULL); + node = ss->nx_nodeq; + ss->nx_nodeq = node->nc_next; + node->nc_next = NULL; + + mutex_exit(&ss->nx_nodeq_lock); + + (void) eibnx_configure_node(node->nc_info, node->nc_gwi, NULL); + + kmem_free(node, sizeof (eibnx_nodeq_t)); + goto wait_for_node_to_create; + + /*NOTREACHED*/ +} + +/* + * Tx and Rx completion interrupt handler. Guaranteed to be single + * threaded and nonreentrant for this CQ. + */ +void +eibnx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg) +{ + eibnx_thr_info_t *info = arg; + + if (info->ti_cq_hdl != cq_hdl) { + ENX_DPRINTF_DEBUG("eibnx_comp_intr: " + "cq_hdl(0x%llx) != info->ti_cq_hdl(0x%llx), " + "ignoring completion", cq_hdl, info->ti_cq_hdl); + return; + } + + ASSERT(info->ti_softint_hdl != NULL); + + (void) ddi_intr_trigger_softint(info->ti_softint_hdl, NULL); +} + +/* + * Send and Receive completion handler functions for EoIB nexus + */ + +/*ARGSUSED*/ +uint_t +eibnx_comp_handler(caddr_t arg1, caddr_t arg2) +{ + eibnx_thr_info_t *info = (eibnx_thr_info_t *)arg1; + ibt_wc_t *wc; + eibnx_wqe_t *wqe; + ibt_status_t ret; + uint_t polled; + int i; + + /* + * Make sure the port monitor isn't killed if we're in the completion + * handler. If the port monitor thread is already being killed, we'll + * stop processing completions. 
+ */ + mutex_enter(&info->ti_event_lock); + if (info->ti_event & (ENX_EVENT_DIE | ENX_EVENT_COMPLETION)) { + mutex_exit(&info->ti_event_lock); + return ((uint_t)ENX_E_SUCCESS); + } + info->ti_event |= ENX_EVENT_COMPLETION; + mutex_exit(&info->ti_event_lock); + + /* + * Re-arm the notification callback before we start polling + * the completion queue. There's nothing much we can do if the + * enable_cq_notify fails - we issue a warning and move on. + */ + ret = ibt_enable_cq_notify(info->ti_cq_hdl, IBT_NEXT_COMPLETION); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_enable_cq_notify(cq_hdl=0x%llx) " + "failed, ret=%d", info->ti_cq_hdl, ret); + } + + /* + * Handle tx and rx completions + */ + while ((ret = ibt_poll_cq(info->ti_cq_hdl, info->ti_wc, info->ti_cq_sz, + &polled)) == IBT_SUCCESS) { + for (wc = info->ti_wc, i = 0; i < polled; i++, wc++) { + wqe = (eibnx_wqe_t *)(uintptr_t)wc->wc_id; + if (wc->wc_status != IBT_WC_SUCCESS) { + eibnx_handle_wcerr(wc->wc_status, wqe, info); + } else if (wqe->qe_type == ENX_QETYP_RWQE) { + eibnx_process_rx(info, wc, wqe); + eibnx_return_rwqe(info, wqe); + } else { + eibnx_return_swqe(wqe); + } + } + } + + /* + * On the way out, make sure we wake up any pending death requestor + * for the port-monitor thread. Note that we need to do a cv_broadcast() + * here since there could be multiple threads sleeping on the event cv + * and we want to make sure all waiters get a chance to see if it's + * their turn. 
+ */ + mutex_enter(&info->ti_event_lock); + info->ti_event &= (~ENX_EVENT_COMPLETION); + cv_broadcast(&info->ti_event_cv); + mutex_exit(&info->ti_event_lock); + + return (DDI_INTR_CLAIMED); +} + +/* + * Rx processing code + */ +static void +eibnx_process_rx(eibnx_thr_info_t *info, ibt_wc_t *wc, eibnx_wqe_t *wqe) +{ + eibnx_gw_msg_t msg; + eibnx_gw_info_t *gwi; + eibnx_gw_info_t *orig_gwi; + eibnx_gw_info_t *new_gwi; + uint_t orig_gw_state; + uint8_t *pkt = (uint8_t *)(uintptr_t)(wqe->qe_sgl.ds_va); + boolean_t gwi_changed; + + /* + * We'll simply drop any packet (including broadcast advertisements + * from gws) we receive before we've done our solicitation broadcast. + */ + if (info->ti_mcast_done == 0) { + return; + } + + /* + * Skip the GRH and parse the message in the packet + */ + if (eibnx_fip_parse_pkt(pkt + ENX_GRH_SZ, &msg) != ENX_E_SUCCESS) { + return; + } + + /* + * If it was a login ack for one of our children, we need to pass + * it on to the child + */ + if (msg.gm_type == FIP_VNIC_LOGIN_ACK) { + eibnx_handle_login_ack(info, pkt); + return; + } + + /* + * Other than that, we only handle gateway advertisements + */ + if (msg.gm_type != FIP_GW_ADVERTISE_MCAST && + msg.gm_type != FIP_GW_ADVERTISE_UCAST) { + return; + } + + gwi = &msg.u.gm_info; + + /* + * State machine to create eoib instances. Whether this advertisement + * is from a new gateway or an old gateway that we already know about, + * if this was a unicast response to our earlier solicitation and it's + * the first time we're receiving it from this gateway, we're ready to + * login, so we create the EoIB instance for it. 
+ */ + orig_gwi = eibnx_find_gw_in_gwlist(info, gwi); + if (orig_gwi == NULL) { + if (gwi->gw_flag_available == 0) { + gwi->gw_state = ENX_GW_STATE_UNAVAILABLE; + gwi->gw_adv_flag = ENX_GW_ALIVE; + (void) eibnx_add_gw_to_gwlist(info, gwi, wc, pkt); + } else if (gwi->gw_flag_ucast_advt == 0) { + gwi->gw_state = ENX_GW_STATE_AVAILABLE; + gwi->gw_adv_flag = ENX_GW_ALIVE; + (void) eibnx_add_gw_to_gwlist(info, gwi, wc, pkt); + } else { + gwi->gw_state = ENX_GW_STATE_READY_TO_LOGIN; + gwi->gw_adv_flag = ENX_GW_AWARE; + if ((new_gwi = eibnx_add_gw_to_gwlist(info, gwi, + wc, pkt)) != NULL) { + eibnx_queue_for_creation(info, new_gwi); + } + } + } else { + orig_gw_state = orig_gwi->gw_state; + if (gwi->gw_flag_available == 0) { + gwi->gw_state = ENX_GW_STATE_UNAVAILABLE; + eibnx_replace_gw_in_gwlist(info, orig_gwi, gwi, + wc, pkt, NULL); + eibnx_gw_is_alive(orig_gwi); + + } else if (gwi->gw_flag_ucast_advt == 0) { + if (orig_gw_state == ENX_GW_STATE_UNAVAILABLE) { + gwi->gw_state = ENX_GW_STATE_AVAILABLE; + } else { + gwi->gw_state = orig_gw_state; + } + eibnx_replace_gw_in_gwlist(info, orig_gwi, gwi, + wc, pkt, NULL); + eibnx_gw_is_alive(orig_gwi); + + } else { + gwi->gw_state = ENX_GW_STATE_READY_TO_LOGIN; + eibnx_replace_gw_in_gwlist(info, orig_gwi, gwi, + wc, pkt, &gwi_changed); + eibnx_gw_is_aware(info, orig_gwi, gwi_changed); + + if (orig_gw_state != ENX_GW_STATE_READY_TO_LOGIN) + eibnx_queue_for_creation(info, orig_gwi); + } + } +} + +/*ARGSUSED*/ +static void +eibnx_handle_wcerr(uint8_t wcerr, eibnx_wqe_t *wqe, eibnx_thr_info_t *info) +{ + /* + * Currently, all we do is report + */ + switch (wcerr) { + case IBT_WC_WR_FLUSHED_ERR: + ENX_DPRINTF_VERBOSE("IBT_WC_WR_FLUSHED_ERR seen " + "(hca_guid=0x%llx, port_num=0x%x, wqe_type=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num, wqe->qe_type); + break; + + case IBT_WC_LOCAL_CHAN_OP_ERR: + ENX_DPRINTF_ERR("IBT_WC_LOCAL_CHAN_OP_ERR seen " + "(hca_guid=0x%llx, port_num=0x%x, wqe_type=0x%x)", + info->ti_hca_guid, 
info->ti_pi->p_port_num, wqe->qe_type); + break; + + case IBT_WC_LOCAL_PROTECT_ERR: + ENX_DPRINTF_ERR("IBT_WC_LOCAL_PROTECT_ERR seen " + "(hca_guid=0x%llx, port_num=0x%x, wqe_type=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num, wqe->qe_type); + break; + } +} + +static void +eibnx_handle_login_ack(eibnx_thr_info_t *info, uint8_t *pkt) +{ + eibnx_t *ss = enx_global_ss; + fip_login_ack_t *ack; + fip_desc_vnic_login_t *login; + ddi_eventcookie_t cookie; + dev_info_t *rdip; + uint16_t vnic_id; + uint16_t inst; + int ret; + + /* + * When we get login acknowledgements, we simply invoke the + * appropriate EoIB driver callback to process it on behalf + * of the driver instance. We will let the callback do error + * checks. + */ + ack = (fip_login_ack_t *)(pkt + ENX_GRH_SZ); + login = &(ack->ak_vnic_login); + vnic_id = ntohs(login->vl_vnic_id); + inst = EIB_DEVI_INSTANCE(vnic_id); + + if ((rdip = eibnx_find_child_dip_by_inst(info, inst)) == NULL) { + ENX_DPRINTF_DEBUG("no eoib child with instance 0x%x found " + "for (hca_guid=0x%llx, port_num=0x%x)", inst, + info->ti_hca_guid, info->ti_pi->p_port_num); + return; + } + + ret = ndi_event_retrieve_cookie(enx_ndi_event_hdl, rdip, + EIB_NDI_EVENT_LOGIN_ACK, &cookie, NDI_EVENT_NOPASS); + if (ret != NDI_SUCCESS) { + ENX_DPRINTF_WARN("no login-ack cookie for (hca_guid=0x%llx, " + "port_num=0x%x, eoib_inst=0x%x), ret=%d", info->ti_hca_guid, + info->ti_pi->p_port_num, inst, ret); + return; + } + + (void) ndi_post_event(ss->nx_dip, rdip, cookie, (void *)pkt); +} + +static void +eibnx_handle_gw_rebirth(eibnx_thr_info_t *info, uint16_t portid) +{ + eibnx_t *ss = enx_global_ss; + ddi_eventcookie_t cookie; + dev_info_t *rdip; + int ret; + + if ((rdip = eibnx_find_child_dip_by_gw(info, portid)) == NULL) { + ENX_DPRINTF_WARN("no eoib child bound to gw portid 0x%x " + "found for (hca_guid=0x%llx, port_num=0x%x)", + portid, info->ti_hca_guid, info->ti_pi->p_port_num); + return; + } + + ret = ndi_event_retrieve_cookie(enx_ndi_event_hdl, 
rdip, + EIB_NDI_EVENT_GW_AVAILABLE, &cookie, NDI_EVENT_NOPASS); + if (ret != NDI_SUCCESS) { + ENX_DPRINTF_WARN("no gw-available cookie for (hca_guid=0x%llx, " + "port_num=0x%x, gw_portid=0x%x), ret=%d", info->ti_hca_guid, + info->ti_pi->p_port_num, portid, ret); + return; + } + + (void) ndi_post_event(ss->nx_dip, rdip, cookie, NULL); +} + +static void +eibnx_handle_gw_info_update(eibnx_thr_info_t *info, uint16_t portid, + void *new_gw_info) +{ + eibnx_t *ss = enx_global_ss; + ddi_eventcookie_t cookie; + dev_info_t *rdip; + int ret; + + if ((rdip = eibnx_find_child_dip_by_gw(info, portid)) == NULL) { + ENX_DPRINTF_WARN("no eoib child bound to gw portid 0x%x " + "found for (hca_guid=0x%llx, port_num=0x%x)", + portid, info->ti_hca_guid, info->ti_pi->p_port_num); + return; + } + + ret = ndi_event_retrieve_cookie(enx_ndi_event_hdl, rdip, + EIB_NDI_EVENT_GW_INFO_UPDATE, &cookie, NDI_EVENT_NOPASS); + if (ret != NDI_SUCCESS) { + ENX_DPRINTF_WARN("no gw-info-update cookie for " + "(hca_guid=0x%llx, port_num=0x%x, gw_portid=0x%x), " + "ret=%d", info->ti_hca_guid, info->ti_pi->p_port_num, + portid, ret); + return; + } + + (void) ndi_post_event(ss->nx_dip, rdip, cookie, new_gw_info); +} + +static int +eibnx_replace_portinfo(eibnx_thr_info_t *ti, ibt_hca_portinfo_t *new_pi, + uint_t new_size_pi) +{ + eibnx_t *ss = enx_global_ss; + eibnx_hca_t *hca; + eibnx_port_t *port; + + mutex_enter(&ss->nx_lock); + + for (hca = ss->nx_hca; hca; hca = hca->hc_next) { + if (hca->hc_hdl == ti->ti_hca) + break; + } + + if (hca == NULL) { + ENX_DPRINTF_WARN("hca hdl (0x%llx) not found in hca list", + ti->ti_hca); + mutex_exit(&ss->nx_lock); + return (ENX_E_FAILURE); + } + + for (port = hca->hc_port; port; port = port->po_next) { + if (port->po_pi == ti->ti_pi) { + ibt_free_portinfo(port->po_pi, port->po_pi_size); + port->po_pi = new_pi; + port->po_pi_size = new_size_pi; + ti->ti_pi = port->po_pi; + break; + } + } + + if (port == NULL) { + ENX_DPRINTF_WARN("portinfo (0x%llx) not found in hca 
list", + ti->ti_pi); + mutex_exit(&ss->nx_lock); + return (ENX_E_FAILURE); + } + + mutex_exit(&ss->nx_lock); + + return (ENX_E_SUCCESS); +} + +static void +eibnx_handle_port_events(ibt_hca_hdl_t ev_hca, uint8_t ev_portnum) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + ibt_hca_portinfo_t *pi; + ibt_status_t ret; + uint_t num_pi; + uint_t size_pi; + uint8_t itr; + + /* + * Find the port monitor thread that matches the event hca and + * portnum + */ + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + if ((ti->ti_hca == ev_hca) && + (ti->ti_pi->p_port_num == ev_portnum)) { + break; + } + } + mutex_exit(&ss->nx_lock); + + if (ti == NULL) + return; + + /* + * See if we need to rejoin the mcgs for this port and do so if true + */ + ret = ibt_query_hca_ports(ev_hca, ev_portnum, &pi, &num_pi, &size_pi); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_query_hca_ports() failed with %d", ret); + return; + } else if (num_pi != 1 || pi->p_linkstate != IBT_PORT_ACTIVE) { + ENX_DPRINTF_WARN("ibt_query_hca_ports(port_num=%d) failed, " + "num_pi=%d, linkstate=0x%x", ev_portnum, num_pi, + pi->p_linkstate); + ibt_free_portinfo(pi, size_pi); + return; + } + + itr = pi->p_init_type_reply; + if (ENX_PORT_ATTR_LOADED(itr) && ENX_PORT_ATTR_NOT_PRESERVED(itr)) { + /* + * If our port's base lid has changed, we need to replace + * the saved portinfo in our lists with the new one before + * going further. + */ + if (ti->ti_pi->p_base_lid != pi->p_base_lid) { + if (eibnx_replace_portinfo(ti, pi, size_pi) == + ENX_E_SUCCESS) { + pi = NULL; + size_pi = 0; + } + } + } + + /* + * If the port monitor was stuck waiting for the link to come up, + * let it know that it is up now. 
+ */ + mutex_enter(&ti->ti_event_lock); + if ((ti->ti_progress & ENX_MON_LINKSTATE_UP) != ENX_MON_LINKSTATE_UP) { + ti->ti_pi->p_linkstate = IBT_PORT_ACTIVE; + ti->ti_event |= ENX_EVENT_LINK_UP; + cv_broadcast(&ti->ti_event_cv); + } + mutex_exit(&ti->ti_event_lock); + + if (ENX_PORT_PRES_NOT_PRESERVED(itr)) { + if (ti->ti_progress & ENX_MON_JOINED_MCGS) + (void) eibnx_rejoin_mcgs(ti); + } + + if (pi != NULL) + ibt_free_portinfo(pi, size_pi); +} + +static void +eibnx_handle_hca_attach(ib_guid_t new_hca_guid) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + eibnx_hca_t *hca; + eibnx_port_t *port; + + /* + * All we need to do is to start a port monitor for all the ports + * on the new HCA. To do this, go through our current port monitors + * and see if we already have a monitor for this HCA - if so, print + * a warning and return. + */ + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + if (ti->ti_hca_guid == new_hca_guid) { + ENX_DPRINTF_VERBOSE("hca (guid=0x%llx) already " + "attached", new_hca_guid); + mutex_exit(&ss->nx_lock); + return; + } + } + mutex_exit(&ss->nx_lock); + + /* + * If we don't have it in our list, process the HCA and start the + * port monitors + */ + if ((hca = eibnx_prepare_hca(new_hca_guid)) != NULL) { + mutex_enter(&ss->nx_lock); + + hca->hc_next = ss->nx_hca; + ss->nx_hca = hca; + + for (port = hca->hc_port; port; port = port->po_next) { + ti = eibnx_start_port_monitor(hca, port); + + ti->ti_next = ss->nx_thr_info; + ss->nx_thr_info = ti; + } + mutex_exit(&ss->nx_lock); + } +} + +static void +eibnx_handle_hca_detach(ib_guid_t del_hca_guid) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + eibnx_thr_info_t *ti_stop_list = NULL; + eibnx_thr_info_t *ti_prev; + eibnx_thr_info_t *ti_next; + eibnx_hca_t *hca; + eibnx_hca_t *hca_prev; + + /* + * We need to locate all monitor threads for this HCA and stop them + */ + mutex_enter(&ss->nx_lock); + ti_prev = NULL; + for (ti = ss->nx_thr_info; ti; ti 
= ti_next) { + ti_next = ti->ti_next; + + if (ti->ti_hca_guid != del_hca_guid) { + ti_prev = ti; + } else { + /* + * Take it out from the good list + */ + if (ti_prev) + ti_prev->ti_next = ti_next; + else + ss->nx_thr_info = ti_next; + + /* + * And put it in the to-stop list + */ + ti->ti_next = ti_stop_list; + ti_stop_list = ti; + } + } + mutex_exit(&ss->nx_lock); + + /* + * Ask all the port_monitor threads to die. + */ + for (ti = ti_stop_list; ti; ti = ti_next) { + ti_next = ti->ti_next; + eibnx_stop_port_monitor(ti); + } + + /* + * Now, locate the HCA in our list and release all HCA related + * resources. + */ + mutex_enter(&ss->nx_lock); + hca_prev = NULL; + for (hca = ss->nx_hca; hca; hca = hca->hc_next) { + if (hca->hc_guid != del_hca_guid) { + hca_prev = hca; + } else { + if (hca_prev) { + hca_prev->hc_next = hca->hc_next; + } else { + ss->nx_hca = hca->hc_next; + } + hca->hc_next = NULL; + break; + } + } + mutex_exit(&ss->nx_lock); + + if (hca) { + (void) eibnx_cleanup_hca(hca); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_ibt.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1261 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +/* + * Module (static) info passed to IBTL during ibt_attach + */ +static ibt_clnt_modinfo_t eibnx_clnt_modinfo = { + IBTI_V_CURR, + IBT_GENERIC, + eibnx_async_handler, + NULL, + "EoIB Nexus" +}; + +ib_gid_t enx_advertise_mgid; +ib_gid_t enx_solicit_mgid; + +/* + * Static function declarations + */ +static int eibnx_state_init(void); +static int eibnx_setup_txbufs(eibnx_thr_info_t *); +static int eibnx_setup_rxbufs(eibnx_thr_info_t *); +static int eibnx_join_solicit_mcg(eibnx_thr_info_t *); +static int eibnx_join_advertise_mcg(eibnx_thr_info_t *); +static int eibnx_rb_ibt_init(eibnx_t *); +static void eibnx_rb_state_init(void); +static void eibnx_rb_setup_txbufs(eibnx_thr_info_t *); +static void eibnx_rb_setup_rxbufs(eibnx_thr_info_t *); +static void eibnx_rb_join_solicit_mcg(eibnx_thr_info_t *); +static void eibnx_rb_join_advertise_mcg(eibnx_thr_info_t *); + +/* + * eibnx_ibt_init() is expected to be called during the nexus driver's + * attach time; given that there is only one instance of the nexus + * driver allowed, and no threads are active before the initialization + * is complete, we don't really have to acquire any driver specific mutex + * within this routine. 
+ */ +int +eibnx_ibt_init(eibnx_t *ss) +{ + eibnx_hca_t *hca_list; + eibnx_hca_t *hca_tail; + eibnx_hca_t *hca; + uint_t num_hcas; + ib_guid_t *hca_guids; + ibt_status_t ret; + int i; + + /* + * Do per-state initialization + */ + (void) eibnx_state_init(); + + /* + * Attach to IBTL + */ + if ((ret = ibt_attach(&eibnx_clnt_modinfo, ss->nx_dip, ss, + &ss->nx_ibt_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_attach() failed, ret=%d", ret); + eibnx_rb_state_init(); + return (ENX_E_FAILURE); + } + + /* + * Get the list of HCA guids on the system + */ + if ((num_hcas = ibt_get_hca_list(&hca_guids)) == 0) { + ENX_DPRINTF_VERBOSE("no HCAs found on the system"); + if ((ret = ibt_detach(ss->nx_ibt_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_detach() failed, ret=%d", ret); + } + ss->nx_ibt_hdl = NULL; + return (ENX_E_FAILURE); + } + + /* + * Open the HCAs and store the handles + */ + hca_list = hca_tail = NULL; + for (i = 0; i < num_hcas; i++) { + /* + * If we cannot open a HCA, allocate a protection domain + * on it or get portinfo on it, print an error and move on + * to the next HCA. 
Otherwise, queue it up in our hca list + */ + if ((hca = eibnx_prepare_hca(hca_guids[i])) == NULL) + continue; + + if (hca_tail) { + hca_tail->hc_next = hca; + } else { + hca_list = hca; + } + hca_tail = hca; + } + + /* + * Free the HCA guid list we've allocated via ibt_get_hca_list() + */ + ibt_free_hca_list(hca_guids, num_hcas); + + /* + * Put the hca list in the state structure + */ + mutex_enter(&ss->nx_lock); + ss->nx_hca = hca_list; + mutex_exit(&ss->nx_lock); + + /* + * Register for subnet notices + */ + ibt_register_subnet_notices(ss->nx_ibt_hdl, + eibnx_subnet_notices_handler, ss); + + return (ENX_E_SUCCESS); +} + +static int +eibnx_state_init(void) +{ + eibnx_t *ss = enx_global_ss; + kthread_t *kt; + + /* + * Initialize synchronization primitives + */ + mutex_init(&ss->nx_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->nx_nodeq_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ss->nx_nodeq_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&ss->nx_busop_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ss->nx_busop_cv, NULL, CV_DEFAULT, NULL); + + /* + * Initialize well-known mgids: there must be a better way to + * do this instead of having to express every single gid as a + * tuple of two 8-byte integer quantities. + */ + enx_solicit_mgid.gid_prefix = EIB_GUID_SOLICIT_PREFIX; + enx_solicit_mgid.gid_guid = 0; + enx_advertise_mgid.gid_prefix = EIB_GUID_ADVERTISE_PREFIX; + enx_advertise_mgid.gid_guid = 0; + + /* + * Start up the eoib node creation thread + */ + kt = thread_create(NULL, 0, eibnx_create_eoib_node, NULL, 0, + &p0, TS_RUN, minclsyspri); + ss->nx_nodeq_kt_did = kt->t_did; + + return (ENX_E_SUCCESS); +} + +/* + * Locate the two multicast groups: the All-EoIB-GWs-GID and + * All-EoIB-ENodes-GID. Make sure the MTU is something that + * we can work with and Qkey is as expected. 
+ */ +int +eibnx_find_mgroups(eibnx_thr_info_t *info) +{ + ibt_hca_portinfo_t *pi = info->ti_pi; + ibt_mcg_attr_t mcg_attr; + ib_gid_t rgid; + ibt_status_t ret; + uint_t entries; + + mutex_enter(&info->ti_mcg_lock); + + if ((info->ti_mcg_status & ENX_MCGS_FOUND) == ENX_MCGS_FOUND) { + mutex_exit(&info->ti_mcg_lock); + return (ENX_E_SUCCESS); + } + + /* + * Request GID defining this port + */ + rgid = pi->p_sgid_tbl[0]; + + /* + * First, locate the multicast group to use for sending solicit + * requests to the GW + */ + bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); + mcg_attr.mc_mgid = enx_solicit_mgid; + mcg_attr.mc_pkey = (ib_pkey_t)EIB_ADMIN_PKEY; + mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY; + + if ((ret = ibt_query_mcg(rgid, &mcg_attr, 1, &info->ti_solicit_mcg, + &entries)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("solicit mcg (gid=%llx.%llx) not found, " + "ibt_query_mcg() returned %d", enx_solicit_mgid.gid_prefix, + enx_solicit_mgid.gid_guid, ret); + goto find_mgroups_fail; + } + + /* + * Make sure the multicast mtu isn't bigger than the port mtu + * and the multicast group's qkey is the same as EIB_FIP_QKEY. 
+ */ + if (info->ti_solicit_mcg->mc_mtu > pi->p_mtu) { + ENX_DPRINTF_WARN("solicit mcg (gid=%llx.%llx) mtu too big, " + "0x%x > 0x%x", enx_solicit_mgid.gid_prefix, + enx_solicit_mgid.gid_guid, info->ti_solicit_mcg->mc_mtu, + pi->p_mtu); + goto find_mgroups_fail; + } + if (info->ti_solicit_mcg->mc_qkey != EIB_FIP_QKEY) { + ENX_DPRINTF_WARN("solicit mcg (gid=%llx.%llx) qkey bad, " + "actual=0x%x, expected=0x%x", enx_solicit_mgid.gid_prefix, + enx_solicit_mgid.gid_guid, info->ti_solicit_mcg->mc_qkey, + EIB_FIP_QKEY); + goto find_mgroups_fail; + } + + /* + * Now, locate the multicast group for receiving discover + * advertisements from the GW + */ + bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); + mcg_attr.mc_mgid = enx_advertise_mgid; + mcg_attr.mc_pkey = (ib_pkey_t)EIB_ADMIN_PKEY; + mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY; + + if ((ret = ibt_query_mcg(rgid, &mcg_attr, 1, &info->ti_advertise_mcg, + &entries)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("advertise mcg (gid=%llx.%llx) not found, " + "ibt_query_mcg() returned %d", + enx_advertise_mgid.gid_prefix, + enx_advertise_mgid.gid_guid, ret); + goto find_mgroups_fail; + } + + /* + * Verify the multicast group's mtu and qkey as before + */ + if (info->ti_advertise_mcg->mc_mtu > pi->p_mtu) { + ENX_DPRINTF_WARN("advertise mcg (gid=%llx.%llx) mtu too big, " + "0x%x > 0x%x", enx_advertise_mgid.gid_prefix, + enx_advertise_mgid.gid_guid, + info->ti_advertise_mcg->mc_mtu, pi->p_mtu); + goto find_mgroups_fail; + } + if (info->ti_advertise_mcg->mc_qkey != EIB_FIP_QKEY) { + ENX_DPRINTF_WARN("advertise mcg (gid=%llx.%llx) qkey bad, " + "actual=0x%x, expected=0x%x", + enx_advertise_mgid.gid_prefix, enx_advertise_mgid.gid_guid, + info->ti_advertise_mcg->mc_qkey, EIB_FIP_QKEY); + goto find_mgroups_fail; + } + + info->ti_mcg_status |= ENX_MCGS_FOUND; + mutex_exit(&info->ti_mcg_lock); + + return (ENX_E_SUCCESS); + +find_mgroups_fail: + if (info->ti_advertise_mcg) { + ibt_free_mcg_info(info->ti_advertise_mcg, 1); + info->ti_advertise_mcg = 
NULL; + } + if (info->ti_solicit_mcg) { + ibt_free_mcg_info(info->ti_solicit_mcg, 1); + info->ti_solicit_mcg = NULL; + } + mutex_exit(&info->ti_mcg_lock); + + return (ENX_E_FAILURE); +} + +/* + * Allocate and setup a single completion queue for tx and rx + */ +int +eibnx_setup_cq(eibnx_thr_info_t *info) +{ + ibt_hca_attr_t hca_attr; + ibt_cq_attr_t cq_attr; + ibt_status_t ret; + uint_t sz; + + /* + * Get this HCA's attributes + */ + ret = ibt_query_hca(info->ti_hca, &hca_attr); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_query_hca(hca_hdl=0x%llx) failed, ret=%d", + info->ti_hca, ret); + return (ENX_E_FAILURE); + } + + /* + * Allocate a completion queue for our sends and receives + */ + cq_attr.cq_sched = NULL; + cq_attr.cq_flags = IBT_CQ_NO_FLAGS; + cq_attr.cq_size = (hca_attr.hca_max_cq_sz < ENX_CQ_SIZE) ? + hca_attr.hca_max_cq_sz : ENX_CQ_SIZE; + + ret = ibt_alloc_cq(info->ti_hca, &cq_attr, &info->ti_cq_hdl, &sz); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_alloc_cq(hca_hdl=0x%llx, cq_sz=0x%lx) " + "failed, ret=%d", info->ti_hca, cq_attr.cq_size, ret); + return (ENX_E_FAILURE); + } + + /* + * Set up other parameters for collecting completion information + */ + info->ti_cq_sz = sz; + info->ti_wc = kmem_zalloc(sizeof (ibt_wc_t) * sz, KM_SLEEP); + + return (ENX_E_SUCCESS); +} + +/* + * Allocate and setup the UD channel parameters + */ +int +eibnx_setup_ud_channel(eibnx_thr_info_t *info) +{ + ibt_ud_chan_alloc_args_t alloc_attr; + ibt_ud_chan_query_attr_t query_attr; + ibt_status_t ret; + + /* + * Protect against arbitrary additions to the chan_alloc_args + * and chan_query_attr structures (make sure the ones we don't + * use are zero'd). + */ + bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t)); + bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t)); + + /* + * This ud channel is not going to be used by the nexus driver + * to send any LSO packets, so we won't need the IBT_USES_LSO flag. 
+ */ + alloc_attr.ud_flags = IBT_ALL_SIGNALED; + alloc_attr.ud_hca_port_num = info->ti_pi->p_port_num; + + ret = ibt_pkey2index(info->ti_hca, info->ti_pi->p_port_num, + (ib_pkey_t)EIB_ADMIN_PKEY, &(alloc_attr.ud_pkey_ix)); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_pkey2index(hca_hdl=0x%llx, " + "port_num=0x%x, pkey=0x%x) failed, ret=%d", + info->ti_hca, info->ti_pi->p_port_num, + EIB_ADMIN_PKEY, ret); + return (ENX_E_FAILURE); + } + + alloc_attr.ud_sizes.cs_sq = ENX_NUM_SWQE; + alloc_attr.ud_sizes.cs_rq = ENX_NUM_RWQE; + alloc_attr.ud_sizes.cs_sq_sgl = 1; + alloc_attr.ud_sizes.cs_rq_sgl = 1; + alloc_attr.ud_sizes.cs_inline = 0; + + alloc_attr.ud_qkey = EIB_FIP_QKEY; + alloc_attr.ud_scq = info->ti_cq_hdl; + alloc_attr.ud_rcq = info->ti_cq_hdl; + alloc_attr.ud_pd = info->ti_pd; + + ret = ibt_alloc_ud_channel(info->ti_hca, IBT_ACHAN_NO_FLAGS, + &alloc_attr, &info->ti_chan, NULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_alloc_ud_channel(hca_hdl=0x%llx, " + "cs_sq=0x%lx, cs_rq=0x%lx) failed, ret=%d", + info->ti_hca, alloc_attr.ud_sizes.cs_sq, + alloc_attr.ud_sizes.cs_rq, ret); + return (ENX_E_FAILURE); + } + + ret = ibt_query_ud_channel(info->ti_chan, &query_attr); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_query_ud_channel(chan_hdl=0x%llx) " + "failed, ret=%d", info->ti_chan, ret); + if ((ret = ibt_free_channel(info->ti_chan)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_free_channel(chan_hdl=0x%llx) " + "failed, ret=%d", info->ti_chan, ret); + } + info->ti_chan = NULL; + return (ENX_E_FAILURE); + } + info->ti_qpn = query_attr.ud_qpn; + + return (ENX_E_SUCCESS); +} + +/* + * Set up the transmit buffers for communicating with the gateway. Since + * the EoIB Nexus driver only exchanges control messages with the + * gateway, we don't really need too much space. 
+ */ +static int +eibnx_setup_txbufs(eibnx_thr_info_t *info) +{ + eibnx_tx_t *snd_p = &info->ti_snd; + eibnx_wqe_t *swqe; + ibt_mr_attr_t attr; + ibt_mr_desc_t desc; + ib_memlen_t tx_bufsz; + ibt_status_t ret; + ibt_ud_dest_hdl_t dest; + uint8_t *buf; + uint_t mtu = (128 << info->ti_pi->p_mtu); + int i; + + /* + * Allocate for the tx buf + */ + tx_bufsz = ENX_NUM_SWQE * mtu; + snd_p->tx_vaddr = (ib_vaddr_t)(uintptr_t)kmem_zalloc(tx_bufsz, + KM_SLEEP); + + /* + * Register the memory region with IBTF for use + */ + attr.mr_vaddr = snd_p->tx_vaddr; + attr.mr_len = tx_bufsz; + attr.mr_as = NULL; + attr.mr_flags = IBT_MR_SLEEP; + if ((ret = ibt_register_mr(info->ti_hca, info->ti_pd, &attr, + &snd_p->tx_mr, &desc)) != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_register_mr() failed for tx " + "region (0x%llx, 0x%llx) with ret=%d", + attr.mr_vaddr, attr.mr_len, ret); + kmem_free((void *)(uintptr_t)(snd_p->tx_vaddr), tx_bufsz); + return (ENX_E_FAILURE); + } + snd_p->tx_lkey = desc.md_lkey; + + /* + * Now setup the send wqes + */ + buf = (uint8_t *)(uintptr_t)(snd_p->tx_vaddr); + for (i = 0; i < ENX_NUM_SWQE; i++) { + swqe = &snd_p->tx_wqe[i]; + + /* + * Allocate a UD destination handle + */ + ret = ibt_alloc_ud_dest(info->ti_hca, IBT_UD_DEST_NO_FLAGS, + info->ti_pd, &dest); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_alloc_ud_dest(hca_hdl=0x%llx) " + "failed, ret=%d", info->ti_hca, ret); + eibnx_rb_setup_txbufs(info); + return (ENX_E_FAILURE); + } + + /* + * We set up everything in the send wqes except initialize + * the UD destination and the state of the entry. The ds_len + * should also be adjusted correctly. All this should be + * done later in the appropriate routines, before posting. 
+ */ + swqe->qe_type = ENX_QETYP_SWQE; + swqe->qe_bufsz = mtu; + swqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf; + swqe->qe_sgl.ds_key = snd_p->tx_lkey; + swqe->qe_sgl.ds_len = swqe->qe_bufsz; + swqe->qe_wr.send.wr_id = (ibt_wrid_t)(uintptr_t)swqe; + swqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; + swqe->qe_wr.send.wr_trans = IBT_UD_SRV; + swqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; + swqe->qe_wr.send.wr_nds = 1; + swqe->qe_wr.send.wr_sgl = &swqe->qe_sgl; + swqe->qe_wr.send.wr.ud.udwr_dest = dest; + + mutex_init(&swqe->qe_lock, NULL, MUTEX_DRIVER, NULL); + swqe->qe_flags = 0; + + buf += mtu; + } + + return (ENX_E_SUCCESS); +} + +/* + * Set up bufs for receiving gateway advertisements + */ +static int +eibnx_setup_rxbufs(eibnx_thr_info_t *info) +{ + eibnx_rx_t *rcv_p = &info->ti_rcv; + eibnx_wqe_t *rwqe; + ibt_mr_attr_t attr; + ibt_mr_desc_t desc; + ib_memlen_t rx_bufsz; + ibt_status_t ret; + uint8_t *buf; + uint_t mtu = (128 << info->ti_pi->p_mtu); + int i; + + /* + * Allocate for the rx buf + */ + rx_bufsz = ENX_NUM_RWQE * (mtu + ENX_GRH_SZ); + rcv_p->rx_vaddr = (ib_vaddr_t)(uintptr_t)kmem_zalloc(rx_bufsz, + KM_SLEEP); + + attr.mr_vaddr = rcv_p->rx_vaddr; + attr.mr_len = rx_bufsz; + attr.mr_as = NULL; + attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; + if ((ret = ibt_register_mr(info->ti_hca, info->ti_pd, &attr, + &rcv_p->rx_mr, &desc)) != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_register_mr() failed for rx " + "region (0x%llx, 0x%llx) with ret=%d", + attr.mr_vaddr, attr.mr_len, ret); + kmem_free((void *)(uintptr_t)(rcv_p->rx_vaddr), rx_bufsz); + return (ENX_E_FAILURE); + } + rcv_p->rx_lkey = desc.md_lkey; + + buf = (uint8_t *)(uintptr_t)(rcv_p->rx_vaddr); + for (i = 0; i < ENX_NUM_RWQE; i++) { + rwqe = &rcv_p->rx_wqe[i]; + + rwqe->qe_type = ENX_QETYP_RWQE; + rwqe->qe_bufsz = mtu + ENX_GRH_SZ; + rwqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf; + rwqe->qe_sgl.ds_key = rcv_p->rx_lkey; + rwqe->qe_sgl.ds_len = rwqe->qe_bufsz; + rwqe->qe_wr.recv.wr_id = 
(ibt_wrid_t)(uintptr_t)rwqe; + rwqe->qe_wr.recv.wr_nds = 1; + rwqe->qe_wr.recv.wr_sgl = &rwqe->qe_sgl; + + mutex_init(&rwqe->qe_lock, NULL, MUTEX_DRIVER, NULL); + rwqe->qe_flags = 0; + + buf += (mtu + ENX_GRH_SZ); + } + + return (ENX_E_SUCCESS); +} + +/* + * Set up transmit and receive buffers and post the receive buffers + */ +int +eibnx_setup_bufs(eibnx_thr_info_t *info) +{ + eibnx_rx_t *rcv_p = &info->ti_rcv; + eibnx_wqe_t *rwqe; + ibt_status_t ret; + int i; + + if (eibnx_setup_txbufs(info) != ENX_E_SUCCESS) + return (ENX_E_FAILURE); + + if (eibnx_setup_rxbufs(info) != ENX_E_SUCCESS) { + eibnx_rb_setup_txbufs(info); + return (ENX_E_FAILURE); + } + + for (i = 0; i < ENX_NUM_RWQE; i++) { + rwqe = &rcv_p->rx_wqe[i]; + + mutex_enter(&rwqe->qe_lock); + + rwqe->qe_flags |= (ENX_QEFL_INUSE | ENX_QEFL_POSTED); + ret = ibt_post_recv(info->ti_chan, &(rwqe->qe_wr.recv), 1, + NULL); + + mutex_exit(&rwqe->qe_lock); + + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_post_recv(chan_hdl=0x%llx) " + "failed, ret=%d", info->ti_chan, ret); + + ret = ibt_flush_channel(info->ti_chan); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_flush_channel" + "(chan_hdl=0x%llx) failed, ret=%d", + info->ti_chan, ret); + } + + eibnx_rb_setup_rxbufs(info); + eibnx_rb_setup_txbufs(info); + return (ENX_E_FAILURE); + } + } + + return (ENX_E_SUCCESS); +} + +/* + * Set up the completion queue handler. While we don't quit if we cannot + * use soft interrupts, that path is really unreliable and untested. + */ +int +eibnx_setup_cq_handler(eibnx_thr_info_t *info) +{ + eibnx_t *ss = enx_global_ss; + ibt_status_t ret; + int rv; + + /* + * We'll try to use a softintr if possible. If not, it's not + * fatal, we'll try and use the completion handler directly from + * the interrupt handler. 
+ */ + + rv = ddi_intr_add_softint(ss->nx_dip, &info->ti_softint_hdl, + EIB_SOFTPRI_ADM, eibnx_comp_handler, info); + if (rv != DDI_SUCCESS) { + ENX_DPRINTF_WARN("ddi_intr_add_softint(dip=0x%llx) " + "failed, ret=%d", ss->nx_dip, rv); + } + + ibt_set_cq_handler(info->ti_cq_hdl, eibnx_comp_intr, info); + + ret = ibt_enable_cq_notify(info->ti_cq_hdl, IBT_NEXT_COMPLETION); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_enable_cq_notify(cq_hdl=0x%llx) " + "failed, ret=%d", info->ti_cq_hdl, ret); + if (info->ti_softint_hdl) { + (void) ddi_intr_remove_softint(info->ti_softint_hdl); + info->ti_softint_hdl = NULL; + } + return (ENX_E_FAILURE); + } + + return (ENX_E_SUCCESS); +} + +/* + * Join the solicit multicast group (All-EoIB-GWs-GID) as a full member + */ +static int +eibnx_join_solicit_mcg(eibnx_thr_info_t *info) +{ + ib_gid_t rgid = info->ti_pi->p_sgid_tbl[0]; + ibt_mcg_attr_t mcg_attr; + ibt_mcg_info_t mcg_info; + ibt_status_t ret; + + bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); + + mcg_attr.mc_mgid = enx_solicit_mgid; + mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY; + mcg_attr.mc_pkey = (ib_pkey_t)EIB_ADMIN_PKEY; + mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; + mcg_attr.mc_flow = info->ti_solicit_mcg->mc_adds_vect.av_flow; + mcg_attr.mc_tclass = info->ti_solicit_mcg->mc_adds_vect.av_tclass; + mcg_attr.mc_sl = info->ti_solicit_mcg->mc_adds_vect.av_srvl; + mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; + + /* + * We only need to send to solicit mcg, so we only need to join + * the multicast group, no need to attach our qp to it + */ + ret = ibt_join_mcg(rgid, &mcg_attr, &mcg_info, NULL, NULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_join_mcg() failed for solicit " + "mgid=%llx.%llx, ret=%x", enx_solicit_mgid.gid_prefix, + enx_solicit_mgid.gid_guid, ret); + return (ENX_E_FAILURE); + } + + /* + * We can throw away the old mcg info we got when we queried + * for the mcg and use the new one. They both should be the + * same, really. 
+ */ + if (info->ti_solicit_mcg) { + bcopy(&mcg_info, info->ti_solicit_mcg, + sizeof (ibt_mcg_info_t)); + } + + return (ENX_E_SUCCESS); +} + +/* + * Join and attach to the advertise multicast group (All-EoIB-ENodes-GID) + * to receive unsolicited advertisements from the gateways. + */ +static int +eibnx_join_advertise_mcg(eibnx_thr_info_t *info) +{ + ib_gid_t rgid = info->ti_pi->p_sgid_tbl[0]; + ibt_mcg_attr_t mcg_attr; + ibt_mcg_info_t mcg_info; + ibt_status_t ret; + + if (info->ti_chan == NULL) + return (ENX_E_FAILURE); + + bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); + + mcg_attr.mc_mgid = enx_advertise_mgid; + mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY; + mcg_attr.mc_pkey = (ib_pkey_t)EIB_ADMIN_PKEY; + mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; + mcg_attr.mc_flow = info->ti_advertise_mcg->mc_adds_vect.av_flow; + mcg_attr.mc_tclass = info->ti_advertise_mcg->mc_adds_vect.av_tclass; + mcg_attr.mc_sl = info->ti_advertise_mcg->mc_adds_vect.av_srvl; + mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; + + ret = ibt_join_mcg(rgid, &mcg_attr, &mcg_info, NULL, NULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_join_mcg() failed for advertise " + "mgid=%llx.%llx, ret=%x", enx_advertise_mgid.gid_prefix, + enx_advertise_mgid.gid_guid, ret); + return (ENX_E_FAILURE); + } + + /* + * We can throw away the old mcg info we got when we queried + * for the mcg and use the new one. They both should be the + * same, really. 
+ */ + if (info->ti_advertise_mcg) { + bcopy(&mcg_info, info->ti_advertise_mcg, + sizeof (ibt_mcg_info_t)); + } + + /* + * Since we need to receive advertisements, we'll attach our qp + * to the advertise mcg + */ + ret = ibt_attach_mcg(info->ti_chan, info->ti_advertise_mcg); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_attach_mcg(chan_hdl=0x%llx, " + "advt_mcg=0x%llx) failed, ret=%d", info->ti_chan, + info->ti_advertise_mcg, ret); + return (ENX_E_FAILURE); + } + + return (ENX_E_SUCCESS); +} + +/* + * Join the multicast groups we're interested in + */ +int +eibnx_join_mcgs(eibnx_thr_info_t *info) +{ + mutex_enter(&info->ti_mcg_lock); + + /* + * We should've located the mcg first + */ + if ((info->ti_mcg_status & ENX_MCGS_FOUND) == 0) { + mutex_exit(&info->ti_mcg_lock); + return (ENX_E_FAILURE); + } + + /* + * If we're already joined to the mcgs, we must leave first + */ + if ((info->ti_mcg_status & ENX_MCGS_JOINED) == ENX_MCGS_JOINED) { + mutex_exit(&info->ti_mcg_lock); + return (ENX_E_FAILURE); + } + + /* + * Join the two mcgs + */ + if (eibnx_join_advertise_mcg(info) != ENX_E_SUCCESS) { + mutex_exit(&info->ti_mcg_lock); + return (ENX_E_FAILURE); + } + if (eibnx_join_solicit_mcg(info) != ENX_E_SUCCESS) { + eibnx_rb_join_advertise_mcg(info); + mutex_exit(&info->ti_mcg_lock); + return (ENX_E_FAILURE); + } + + info->ti_mcg_status |= ENX_MCGS_JOINED; + mutex_exit(&info->ti_mcg_lock); + + return (ENX_E_SUCCESS); +} + +int +eibnx_rejoin_mcgs(eibnx_thr_info_t *info) +{ + /* + * Lookup the MCGs again and join them + */ + eibnx_rb_join_mcgs(info); + eibnx_rb_find_mgroups(info); + + if (eibnx_find_mgroups(info) != ENX_E_SUCCESS) + return (ENX_E_FAILURE); + + if (eibnx_join_mcgs(info) != ENX_E_SUCCESS) + return (ENX_E_FAILURE); + + return (ENX_E_SUCCESS); +} + +int +eibnx_ibt_fini(eibnx_t *ss) +{ + return (eibnx_rb_ibt_init(ss)); +} + +static int +eibnx_rb_ibt_init(eibnx_t *ss) +{ + eibnx_hca_t *hca; + eibnx_hca_t *hca_next; + eibnx_hca_t *hca_list; + ibt_status_t ret; 
+ + /* + * Disable subnet notices callbacks + */ + ibt_register_subnet_notices(ss->nx_ibt_hdl, NULL, NULL); + + /* + * Remove the hca list from the state structure + */ + mutex_enter(&ss->nx_lock); + hca_list = ss->nx_hca; + ss->nx_hca = NULL; + mutex_exit(&ss->nx_lock); + + /* + * For each HCA in the list, free up the portinfo/port structs, + * free the pd, close the hca handle and release the hca struct. + * If something goes wrong, try to put back whatever good remains + * back on the hca list and return failure. + */ + for (hca = hca_list; hca; hca = hca_next) { + hca_next = hca->hc_next; + if (eibnx_cleanup_hca(hca) != ENX_E_SUCCESS) { + mutex_enter(&ss->nx_lock); + ss->nx_hca = hca_next; + mutex_exit(&ss->nx_lock); + return (ENX_E_FAILURE); + } + } + + if ((ret = ibt_detach(ss->nx_ibt_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_detach(ibt_hdl=0x%llx) " + "failed, ret=%d", ss->nx_ibt_hdl, ret); + return (ENX_E_FAILURE); + } + ss->nx_ibt_hdl = NULL; + + eibnx_rb_state_init(); + + return (ENX_E_SUCCESS); +} + +static void +eibnx_rb_state_init(void) +{ + eibnx_t *ss = enx_global_ss; + kt_did_t thr_id; + + /* + * Ask the eoib node creation thread to die and wait for + * it to happen + */ + mutex_enter(&ss->nx_nodeq_lock); + + thr_id = ss->nx_nodeq_kt_did; + ss->nx_nodeq_thr_die = 1; + ss->nx_nodeq_kt_did = 0; + + cv_signal(&ss->nx_nodeq_cv); + mutex_exit(&ss->nx_nodeq_lock); + + if (thr_id) { + thread_join(thr_id); + } + + cv_destroy(&ss->nx_busop_cv); + mutex_destroy(&ss->nx_busop_lock); + cv_destroy(&ss->nx_nodeq_cv); + mutex_destroy(&ss->nx_nodeq_lock); + mutex_destroy(&ss->nx_lock); +} + +void +eibnx_rb_find_mgroups(eibnx_thr_info_t *info) +{ + mutex_enter(&info->ti_mcg_lock); + if ((info->ti_mcg_status & ENX_MCGS_FOUND) == ENX_MCGS_FOUND) { + if (info->ti_advertise_mcg) { + ibt_free_mcg_info(info->ti_advertise_mcg, 1); + info->ti_advertise_mcg = NULL; + } + if (info->ti_solicit_mcg) { + ibt_free_mcg_info(info->ti_solicit_mcg, 1); + info->ti_solicit_mcg = 
NULL; + } + info->ti_mcg_status &= (~ENX_MCGS_FOUND); + } + mutex_exit(&info->ti_mcg_lock); +} + +void +eibnx_rb_setup_cq(eibnx_thr_info_t *info) +{ + ibt_status_t ret; + + if (info->ti_wc && info->ti_cq_sz) + kmem_free(info->ti_wc, sizeof (ibt_wc_t) * info->ti_cq_sz); + + info->ti_cq_sz = 0; + info->ti_wc = NULL; + + if (info->ti_cq_hdl) { + ret = ibt_free_cq(info->ti_cq_hdl); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_free_cq(cq_hdl=0x%llx) " + "failed, ret=%d", info->ti_cq_hdl, ret); + } + info->ti_cq_hdl = NULL; + } +} + +void +eibnx_rb_setup_ud_channel(eibnx_thr_info_t *info) +{ + ibt_status_t ret; + + if ((ret = ibt_free_channel(info->ti_chan)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_free_channel(chan=0x%llx) " + "failed, ret=%d", info->ti_chan, ret); + } + info->ti_chan = NULL; + info->ti_qpn = 0; +} + +static void +eibnx_rb_setup_txbufs(eibnx_thr_info_t *info) +{ + eibnx_tx_t *snd_p = &info->ti_snd; + eibnx_wqe_t *swqe; + ibt_status_t ret; + int i; + uint_t mtu = (128 << info->ti_pi->p_mtu); + + /* + * Release any UD destination handle we may have allocated. Note that + * the per swqe lock would've been initialized only if we were able to + * allocate the UD dest handle. 
+ */ + for (i = 0; i < ENX_NUM_SWQE; i++) { + swqe = &snd_p->tx_wqe[i]; + + if (swqe->qe_wr.send.wr.ud.udwr_dest) { + mutex_destroy(&swqe->qe_lock); + + ret = + ibt_free_ud_dest(swqe->qe_wr.send.wr.ud.udwr_dest); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_free_ud_dest(dest=0x%llx)" + " failed, ret=%d", + swqe->qe_wr.send.wr.ud.udwr_dest, ret); + } + } + } + + /* + * Clear all the workq entries + */ + bzero(snd_p->tx_wqe, sizeof (eibnx_wqe_t) * ENX_NUM_SWQE); + + /* + * Clear Lkey and deregister any memory region we may have + * registered earlier + */ + snd_p->tx_lkey = 0; + if (snd_p->tx_mr) { + if ((ret = ibt_deregister_mr(info->ti_hca, + snd_p->tx_mr)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_deregister_TXmr(hca_hdl=0x%llx," + "mr=0x%llx) failed, ret=%d", info->ti_hca, + snd_p->tx_mr, ret); + } + snd_p->tx_mr = NULL; + } + + /* + * Release any memory allocated for the tx bufs + */ + if (snd_p->tx_vaddr) { + kmem_free((void *)(uintptr_t)(snd_p->tx_vaddr), + ENX_NUM_SWQE * mtu); + snd_p->tx_vaddr = 0; + } + +} + +static void +eibnx_rb_setup_rxbufs(eibnx_thr_info_t *info) +{ + eibnx_rx_t *rcv_p = &info->ti_rcv; + eibnx_wqe_t *rwqe; + ibt_status_t ret; + uint_t mtu = (128 << info->ti_pi->p_mtu); + int i; + + for (i = 0; i < ENX_NUM_RWQE; i++) { + rwqe = &rcv_p->rx_wqe[i]; + mutex_destroy(&rwqe->qe_lock); + } + bzero(rcv_p->rx_wqe, sizeof (eibnx_wqe_t) * ENX_NUM_RWQE); + + rcv_p->rx_lkey = 0; + + if ((ret = ibt_deregister_mr(info->ti_hca, + rcv_p->rx_mr)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_deregister_RXmr(hca_hdl=0x%llx," + "mr=0x%llx) failed, ret=%d", info->ti_hca, + rcv_p->rx_mr, ret); + } + rcv_p->rx_mr = NULL; + + kmem_free((void *)(uintptr_t)(rcv_p->rx_vaddr), + ENX_NUM_RWQE * (mtu + ENX_GRH_SZ)); + rcv_p->rx_vaddr = 0; +} + +void +eibnx_rb_setup_bufs(eibnx_thr_info_t *info) +{ + ibt_status_t ret; + + if ((ret = ibt_flush_channel(info->ti_chan)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_flush_channel(chan_hdl=0x%llx) " + "failed, ret=%d", 
info->ti_chan, ret); + } + + eibnx_rb_setup_rxbufs(info); + + eibnx_rb_setup_txbufs(info); +} + +void +eibnx_rb_setup_cq_handler(eibnx_thr_info_t *info) +{ + ibt_set_cq_handler(info->ti_cq_hdl, NULL, NULL); + + if (info->ti_softint_hdl) { + (void) ddi_intr_remove_softint(info->ti_softint_hdl); + info->ti_softint_hdl = NULL; + } +} + +static void +eibnx_rb_join_solicit_mcg(eibnx_thr_info_t *info) +{ + ib_gid_t rgid = info->ti_pi->p_sgid_tbl[0]; + ib_gid_t rsvd_gid; + ibt_status_t ret; + + rsvd_gid.gid_prefix = 0; + rsvd_gid.gid_guid = 0; + + ret = ibt_leave_mcg(rgid, enx_solicit_mgid, + rsvd_gid, IB_MC_JSTATE_FULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_leave_mcg(slct_mgid=%llx.%llx) " + "failed, ret=%d", enx_solicit_mgid.gid_prefix, + enx_solicit_mgid.gid_guid, ret); + } +} + +static void +eibnx_rb_join_advertise_mcg(eibnx_thr_info_t *info) +{ + ib_gid_t rgid = info->ti_pi->p_sgid_tbl[0]; + ib_gid_t rsvd_gid; + ibt_status_t ret; + + ret = ibt_detach_mcg(info->ti_chan, info->ti_advertise_mcg); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_detach_mcg(chan_hdl=0x%llx, " + "advt_mcg=0x%llx) failed, ret=%d", + info->ti_chan, info->ti_advertise_mcg, ret); + } + + rsvd_gid.gid_prefix = 0; + rsvd_gid.gid_guid = 0; + + ret = ibt_leave_mcg(rgid, enx_advertise_mgid, + rsvd_gid, IB_MC_JSTATE_FULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_leave_mcg(advt_mgid=%llx.%llx) " + "failed, ret=%d", enx_advertise_mgid.gid_prefix, + enx_advertise_mgid.gid_guid, ret); + } +} + +void +eibnx_rb_join_mcgs(eibnx_thr_info_t *info) +{ + mutex_enter(&info->ti_mcg_lock); + if ((info->ti_mcg_status & ENX_MCGS_JOINED) == ENX_MCGS_JOINED) { + eibnx_rb_join_solicit_mcg(info); + eibnx_rb_join_advertise_mcg(info); + + info->ti_mcg_status &= (~ENX_MCGS_JOINED); + } + mutex_exit(&info->ti_mcg_lock); +} + +eibnx_hca_t * +eibnx_prepare_hca(ib_guid_t hca_guid) +{ + eibnx_t *ss = enx_global_ss; + eibnx_hca_t *hca; + eibnx_port_t *port; + eibnx_port_t *port_tail; + 
ibt_hca_hdl_t hca_hdl; + ibt_pd_hdl_t pd_hdl; + ibt_hca_portinfo_t *pi; + uint_t num_pi; + uint_t size_pi; + ibt_hca_attr_t hca_attr; + ibt_status_t ret; + int i; + + ret = ibt_open_hca(ss->nx_ibt_hdl, hca_guid, &hca_hdl); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_open_hca(hca_guid=0x%llx) " + "failed, ret=%d", hca_guid, ret); + return (NULL); + } + + bzero(&hca_attr, sizeof (ibt_hca_attr_t)); + if ((ret = ibt_query_hca(hca_hdl, &hca_attr)) != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_query_hca(hca_hdl=0x%llx, " + "hca_guid=0x%llx) failed, ret=%d", + hca_hdl, hca_guid, ret); + + if ((ret = ibt_close_hca(hca_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_close_hca(hca_hdl=0x%llx) " + "failed, ret=%d", hca_hdl, ret); + } + return (NULL); + } + + ret = ibt_alloc_pd(hca_hdl, IBT_PD_NO_FLAGS, &pd_hdl); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_alloc_pd(hca_hdl=0x%llx, " + "hca_guid=0x%llx) failed, ret=%d", + hca_hdl, hca_guid, ret); + + if ((ret = ibt_close_hca(hca_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_close_hca(hca_hdl=0x%llx) " + "failed, ret=%d", hca_hdl, ret); + } + return (NULL); + } + + /* + * We have all the information we want about this hca, create + * a new struct and return it. 
+ */ + hca = kmem_zalloc(sizeof (eibnx_hca_t), KM_SLEEP); + hca->hc_next = NULL; + hca->hc_guid = hca_guid; + hca->hc_hdl = hca_hdl; + hca->hc_pd = pd_hdl; + hca->hc_port = port_tail = NULL; + + for (i = 0; i < hca_attr.hca_nports; i++) { + ret = ibt_query_hca_ports(hca_hdl, i + 1, &pi, + &num_pi, &size_pi); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_query_hca_ports(hca_hdl=0x%llx, " + "port=0x%x) failed, ret=%d", hca_hdl, i + 1, ret); + } else { + port = kmem_zalloc(sizeof (eibnx_port_t), KM_SLEEP); + port->po_next = NULL; + port->po_pi = pi; + port->po_pi_size = size_pi; + + if (port_tail) { + port_tail->po_next = port; + } else { + hca->hc_port = port; + } + port_tail = port; + } + } + + /* + * If we couldn't query about any ports on the HCA, return failure + */ + if (hca->hc_port == NULL) { + ENX_DPRINTF_ERR("all hca port queries failed for " + "hca_guid=0x%llx", hca_guid); + (void) eibnx_cleanup_hca(hca); + return (NULL); + } + + return (hca); +} + +int +eibnx_cleanup_hca(eibnx_hca_t *hca) +{ + eibnx_port_t *port; + eibnx_port_t *port_next; + ibt_status_t ret; + + for (port = hca->hc_port; port; port = port_next) { + port_next = port->po_next; + + ibt_free_portinfo(port->po_pi, port->po_pi_size); + kmem_free(port, sizeof (eibnx_port_t)); + } + + if ((ret = ibt_free_pd(hca->hc_hdl, hca->hc_pd)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_free_pd(hca_hdl=0x%lx, pd_hd=0x%lx) " + "failed, ret=%d", hca->hc_hdl, hca->hc_pd, ret); + return (ENX_E_FAILURE); + } + + if ((ret = ibt_close_hca(hca->hc_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_close_hca(hca_hdl=0x%lx) failed, " + "ret=%d", hca->hc_hdl, ret); + return (ENX_E_FAILURE); + } + + kmem_free(hca, sizeof (eibnx_hca_t)); + + return (ENX_E_SUCCESS); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_log.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,252 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/varargs.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +/* + * Defaults + */ +uint_t enx_log_size = ENX_LOGSZ_DEFAULT; +int enx_log_level = ENX_MSGS_DEFAULT | ENX_MSGS_DEBUG; +int enx_log_timestamps = 0; + +/* + * Debug variables, should not be tunables so allocated debug buffer + * and its size remain consistent. 
+ */ +static kmutex_t enx_debug_buf_lock; +static uint8_t *enx_debug_buf; +static uint32_t enx_debug_buf_ndx; +static uint_t enx_debug_buf_sz; + +static void eibnx_log(char *); + +void +eibnx_debug_init(void) +{ + enx_debug_buf_ndx = 0; + enx_debug_buf_sz = enx_log_size; + enx_debug_buf = kmem_zalloc(enx_debug_buf_sz, KM_SLEEP); + + mutex_init(&enx_debug_buf_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +eibnx_debug_fini(void) +{ + mutex_destroy(&enx_debug_buf_lock); + + if (enx_debug_buf && enx_debug_buf_sz) { + kmem_free(enx_debug_buf, enx_debug_buf_sz); + enx_debug_buf = NULL; + } + enx_debug_buf_sz = 0; + enx_debug_buf_ndx = 0; +} + +void +eibnx_log(char *msg) +{ + uint32_t off; + int msglen; + char msgbuf[ENX_MAX_LINE]; + + if (enx_debug_buf == NULL) + return; + + if (enx_log_timestamps) { + msglen = snprintf(msgbuf, ENX_MAX_LINE, "%llx: %s", + (unsigned long long)ddi_get_lbolt64(), msg); + } else { + msglen = snprintf(msgbuf, ENX_MAX_LINE, "%s", msg); + } + + if (msglen < 0) + return; + else if (msglen >= ENX_MAX_LINE) + msglen = ENX_MAX_LINE - 1; + + mutex_enter(&enx_debug_buf_lock); + + if ((enx_debug_buf_ndx == 0) || + (enx_debug_buf[enx_debug_buf_ndx-1] != '\n')) { + enx_debug_buf[enx_debug_buf_ndx] = '\n'; + enx_debug_buf_ndx++; + } + + off = enx_debug_buf_ndx; /* current msg should go here */ + + enx_debug_buf_ndx += msglen; /* next msg should start here */ + enx_debug_buf[enx_debug_buf_ndx] = 0; /* terminate current msg */ + + if (enx_debug_buf_ndx >= (enx_debug_buf_sz - 2 * ENX_MAX_LINE)) + enx_debug_buf_ndx = 0; + + mutex_exit(&enx_debug_buf_lock); + + bcopy(msgbuf, enx_debug_buf+off, msglen); /* no lock needed */ +} + +#ifdef ENX_DEBUG +void +eibnx_dprintf_verbose(const char *fmt, ...) 
+{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + char newfmt[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_VERBOSE) != ENX_MSGS_VERBOSE) + return; + + (void) snprintf(newfmt, ENX_MAX_LINE, "..........%s", fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + } +} + +void +eibnx_dprintf_args(const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + char newfmt[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_ARGS) != ENX_MSGS_ARGS) + return; + + (void) snprintf(newfmt, ENX_MAX_LINE, "........%s", fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + } +} + +void +eibnx_dprintf_debug(const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + char newfmt[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_DEBUG) != ENX_MSGS_DEBUG) + return; + + (void) snprintf(newfmt, ENX_MAX_LINE, "......%s", fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + } +} +#endif + +void +eibnx_dprintf_warn(const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + char newfmt[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_WARN) != ENX_MSGS_WARN) + return; + + (void) snprintf(newfmt, ENX_MAX_LINE, "....%s", fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + } +} + +void +eibnx_dprintf_err(const char *fmt, ...) 
+{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + char newfmt[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_ERR) != ENX_MSGS_ERR) + return; + + (void) snprintf(newfmt, ENX_MAX_LINE, "..%s", fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + cmn_err(CE_WARN, "!%s\n", msgbuf); + } +} + +void +eibnx_dprintf_crit(const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_CRIT) != ENX_MSGS_CRIT) + return; + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, fmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + cmn_err(CE_PANIC, "!%s\n", msgbuf); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_main.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,638 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * The Ethernet Over Infiniband Nexus driver is a bus nexus driver + * that enumerates all the EoIB nodes. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +/* + * Global per-instance EoIB Nexus data. 
Only one instance + * of EoIB Nexus is supported + */ +eibnx_t *enx_global_ss = NULL; + +/* + * Static function declarations + */ +static int eibnx_attach(dev_info_t *, ddi_attach_cmd_t); +static int eibnx_detach(dev_info_t *, ddi_detach_cmd_t); +static int eibnx_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int eibnx_bus_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, + void *, void *); + +static int eibnx_get_eventcookie(dev_info_t *, dev_info_t *, char *, + ddi_eventcookie_t *); +static int eibnx_add_eventcall(dev_info_t *, dev_info_t *, ddi_eventcookie_t, + void (*)(dev_info_t *, ddi_eventcookie_t, void *, void *), + void *, ddi_callback_id_t *); +static int eibnx_remove_eventcall(dev_info_t *, ddi_callback_id_t); +static int eibnx_post_event(dev_info_t *, dev_info_t *, + ddi_eventcookie_t, void *); + +static int eibnx_bus_config(dev_info_t *, uint_t, ddi_bus_config_op_t, + void *, dev_info_t **); +static int eibnx_bus_unconfig(dev_info_t *, uint_t, ddi_bus_config_op_t, + void *); +static int eibnx_config_all_children(dev_info_t *); +static void eibnx_unconfig_all_children(dev_info_t *); +static int eibnx_config_child(char *, dev_info_t **); +static int eibnx_unconfig_child(char *); + +/* + * Cbops + */ +static struct cb_ops enx_cb_ops = { + eibnx_devctl_open, /* cb_open */ + eibnx_devctl_close, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + eibnx_devctl_ioctl, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_str */ + D_MP, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +/* + * Busops + */ +static struct bus_ops enx_bus_ops = { + BUSO_REV, + nullbusmap, /* bus_map */ + NULL, /* bus_get_intrspec */ + NULL, /* bus_add_intrspec */ + NULL, /* bus_remove_intrspec */ + i_ddi_map_fault, /* bus_map_fault */ 
+ ddi_no_dma_map, /* bus_dma_map */ + NULL, /* bus_dma_allochdl */ + NULL, /* bus_dma_freehdl */ + NULL, /* bus_dma_bindhdl */ + NULL, /* bus_dma_unbindhdl */ + NULL, /* bus_dma_flush */ + NULL, /* bus_dma_win */ + NULL, /* bus_dma_ctl */ + eibnx_bus_ctl, /* bus_ctl */ + ddi_bus_prop_op, /* bus_prop_op */ + eibnx_get_eventcookie, /* bus_get_eventcookie */ + eibnx_add_eventcall, /* bus_add_eventcall */ + eibnx_remove_eventcall, /* bus_remove_eventcall */ + eibnx_post_event, /* bus_post_event */ + NULL, /* bus_intr_ctl */ + eibnx_bus_config, /* bus_config */ + eibnx_bus_unconfig, /* bus_unconfig */ +}; + +/* + * Nexus ops + */ +static struct dev_ops enx_ops = { + DEVO_REV, /* devo_rev, */ + 0, /* devo_refcnt */ + eibnx_getinfo, /* devo_info */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + eibnx_attach, /* devo_attach */ + eibnx_detach, /* devo_detach */ + nodev, /* devo_reset */ + &enx_cb_ops, /* devo_cb_ops */ + &enx_bus_ops, /* devo_bus_ops */ + nulldev, /* devo_power */ + ddi_quiesce_not_needed /* devo_quiesce */ +}; + +/* + * Module linkage information for the kernel + */ +static struct modldrv enx_modldrv = { + &mod_driverops, /* Driver module */ + "EoIB Nexus", /* Driver name and version */ + &enx_ops, /* Driver ops */ +}; + +static struct modlinkage enx_modlinkage = { + MODREV_1, (void *)&enx_modldrv, NULL +}; + +/* + * EoIB NDI events + */ +static ndi_event_definition_t enx_ndi_event_defs[] = { + { ENX_EVENT_TAG_GW_INFO_UPDATE, EIB_NDI_EVENT_GW_INFO_UPDATE, + EPL_KERNEL, NDI_EVENT_POST_TO_TGT }, + { ENX_EVENT_TAG_GW_AVAILABLE, EIB_NDI_EVENT_GW_AVAILABLE, + EPL_KERNEL, NDI_EVENT_POST_TO_TGT }, + { ENX_EVENT_TAG_LOGIN_ACK, EIB_NDI_EVENT_LOGIN_ACK, + EPL_KERNEL, NDI_EVENT_POST_TO_TGT } +}; +#define ENX_NUM_NDI_EVENTS \ + (sizeof (enx_ndi_event_defs) / sizeof (enx_ndi_event_defs[0])) + +static ndi_event_set_t enx_ndi_events = { + NDI_EVENTS_REV1, + ENX_NUM_NDI_EVENTS, + enx_ndi_event_defs +}; +ndi_event_hdl_t enx_ndi_event_hdl; + + +/* + * Common 
loadable module entry points _init, _fini, _info + */ + +int +_init(void) +{ + int ret; + + if ((ret = mod_install(&enx_modlinkage)) == 0) + eibnx_debug_init(); + + return (ret); +} + +int +_fini(void) +{ + int ret; + + if ((ret = mod_remove(&enx_modlinkage)) == 0) + eibnx_debug_fini(); + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&enx_modlinkage, modinfop)); +} + +/* + * Autoconfiguration entry points: attach, detach, getinfo + */ + +static int +eibnx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + eibnx_t *ss; + int instance; + + if (cmd == DDI_RESUME) + return (DDI_SUCCESS); + else if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Don't allow more than one instance to attach + */ + if (enx_global_ss) + return (DDI_FAILURE); + + /* + * Alloc this instance's softstate + */ + ss = kmem_zalloc(sizeof (eibnx_t), KM_SLEEP); + ss->nx_dip = dip; + + enx_global_ss = ss; + + /* + * Allocate our NDI event handle and bind our event set + */ + if (ndi_event_alloc_hdl(dip, 0, &enx_ndi_event_hdl, + NDI_SLEEP) != NDI_SUCCESS) { + ENX_DPRINTF_ERR("ndi_event_alloc_hdl(dip=0x%llx) " + "failed", dip); + + kmem_free(enx_global_ss, sizeof (eibnx_t)); + enx_global_ss = NULL; + return (DDI_FAILURE); + } + if (ndi_event_bind_set(enx_ndi_event_hdl, &enx_ndi_events, + NDI_SLEEP) != NDI_SUCCESS) { + ENX_DPRINTF_ERR("ndi_event_bind_set(ndi_event_hdl=0x%llx) " + "failed", enx_ndi_event_hdl); + + (void) ndi_event_free_hdl(enx_ndi_event_hdl); + enx_ndi_event_hdl = NULL; + kmem_free(enx_global_ss, sizeof (eibnx_t)); + enx_global_ss = NULL; + return (DDI_FAILURE); + } + + /* + * Create "devctl" minor node for general ioctl interface to the + * eoib nexus. If we cannot, it isn't fatal - we'll operate without + * the support for devctl (but issue a warning). 
+ */ + instance = ddi_get_instance(dip); + if (ddi_create_minor_node(dip, "devctl", S_IFCHR, instance, + DDI_NT_NEXUS, 0) != DDI_SUCCESS) { + ENX_DPRINTF_WARN("could not create devctl minor node " + "for instance %d", instance); + } + + /* + * Do IBTF related initializations. If we fail, we cannot operate, + * so fail the attach. + */ + if (eibnx_ibt_init(ss) != ENX_E_SUCCESS) { + (void) ddi_remove_minor_node(dip, NULL); + (void) ndi_event_unbind_set(enx_ndi_event_hdl, + &enx_ndi_events, NDI_SLEEP); + (void) ndi_event_free_hdl(enx_ndi_event_hdl); + enx_ndi_event_hdl = NULL; + kmem_free(enx_global_ss, sizeof (eibnx_t)); + enx_global_ss = NULL; + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +eibnx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + eibnx_t *ss = enx_global_ss; + + if (cmd == DDI_SUSPEND) + return (DDI_SUCCESS); + else if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + /* + * If there's no instance of eibnx attached, fail + */ + if (ss == NULL) + return (DDI_FAILURE); + + /* + * Before we do anything, we need to stop the port monitors + * we may have started earlier. + */ + eibnx_terminate_monitors(); + + /* + * If eibnx_ibt_fini() fails, it could be because one of the + * HCA's pd could not be freed, the hca could not be closed + * or the IBTF detach wasn't successful. If this is the case, + * we have to return failure, but cannot do much about the + * port monitors we've already terminated. + */ + if (eibnx_ibt_fini(ss) == ENX_E_FAILURE) + return (DDI_FAILURE); + + /* + * Cleanup any devctl minor node we may have created, unbind and + * free ndi event handle and free the instance softstate. 
+ */ + (void) ddi_remove_minor_node(dip, NULL); + (void) ndi_event_unbind_set(enx_ndi_event_hdl, + &enx_ndi_events, NDI_SLEEP); + (void) ndi_event_free_hdl(enx_ndi_event_hdl); + enx_ndi_event_hdl = NULL; + kmem_free(enx_global_ss, sizeof (eibnx_t)); + enx_global_ss = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +eibnx_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) +{ + eibnx_t *ss = enx_global_ss; + int ret; + + if (cmd == DDI_INFO_DEVT2DEVINFO) { + *resultp = (ss) ? ss->nx_dip : NULL; + ret = (ss) ? DDI_SUCCESS : DDI_FAILURE; + } else if (cmd == DDI_INFO_DEVT2INSTANCE) { + *resultp = 0; + ret = DDI_SUCCESS; + } else { + ret = DDI_FAILURE; + } + + return (ret); +} + +/* + * Busops: bus_ctl, bus_config, bus_unconfig + */ + +/*ARGSUSED*/ +static int +eibnx_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop, + void *arg, void *result) +{ + dev_info_t *child = arg; + int ret; + char name[MAXNAMELEN]; + + switch (ctlop) { + case DDI_CTLOPS_REPORTDEV: + ENX_DPRINTF_DEBUG("EoIB device: %s@%s, %s%d", + ddi_node_name(rdip), ddi_get_name_addr(rdip), + ddi_driver_name(rdip), ddi_get_instance(rdip)); + /*FALLTHROUGH*/ + + case DDI_CTLOPS_ATTACH: + case DDI_CTLOPS_DETACH: + case DDI_CTLOPS_POWER: + case DDI_CTLOPS_SIDDEV: + case DDI_CTLOPS_IOMIN: + ret = DDI_SUCCESS; + break; + + case DDI_CTLOPS_INITCHILD: + if ((ret = eibnx_name_child(child, name, + sizeof (name))) == DDI_SUCCESS) { + ddi_set_name_addr(child, name); + } + break; + + case DDI_CTLOPS_UNINITCHILD: + ddi_set_name_addr(child, NULL); + ret = DDI_SUCCESS; + break; + + default: + ret = ddi_ctlops(dip, rdip, ctlop, arg, result); + break; + } + + return (ret); +} + +/*ARGSUSED*/ +static int +eibnx_bus_config(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg, dev_info_t **childp) +{ + eibnx_t *ss = enx_global_ss; + int ret = NDI_SUCCESS; + + switch (op) { + case BUS_CONFIG_ONE: + eibnx_busop_inprog_enter(ss); + ret = eibnx_config_child(arg, 
childp); + eibnx_busop_inprog_exit(ss); + break; + + case BUS_CONFIG_ALL: + case BUS_CONFIG_DRIVER: + eibnx_busop_inprog_enter(ss); + if ((ss->nx_busop_flags & NX_FL_BUSCFG_COMPLETE) == 0) { + ret = eibnx_config_all_children(parent); + if (ret == NDI_SUCCESS) + ss->nx_busop_flags |= NX_FL_BUSCFG_COMPLETE; + } + eibnx_busop_inprog_exit(ss); + break; + + default: + ret = NDI_FAILURE; + } + + if (ret == NDI_SUCCESS) + ret = ndi_busop_bus_config(parent, flags, op, arg, childp, 0); + + return (ret); +} + +static int +eibnx_bus_unconfig(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg) +{ + eibnx_t *ss = enx_global_ss; + int ret; + + ret = ndi_busop_bus_unconfig(parent, flags, op, arg); + if (ret != NDI_SUCCESS) + return (ret); + + switch (op) { + case BUS_UNCONFIG_ONE: + if (flags & (NDI_UNCONFIG | NDI_DEVI_REMOVE)) { + eibnx_busop_inprog_enter(ss); + + if ((ret = eibnx_unconfig_child(arg)) == ENX_E_SUCCESS) + ss->nx_busop_flags &= (~NX_FL_BUSCFG_COMPLETE); + else { + ENX_DPRINTF_DEBUG("eibnx_bus_config: " + "unconfig child %s failed", (char *)arg); + } + + eibnx_busop_inprog_exit(ss); + } + break; + + case BUS_UNCONFIG_ALL: + case BUS_UNCONFIG_DRIVER: + if (flags & (NDI_UNCONFIG | NDI_DEVI_REMOVE)) { + eibnx_busop_inprog_enter(ss); + + eibnx_unconfig_all_children(parent); + ss->nx_busop_flags &= (~NX_FL_BUSCFG_COMPLETE); + + eibnx_busop_inprog_exit(ss); + } + break; + + default: + break; + } + + return (ret); +} + +/* + * Event Handling: bus_get_eventcookie, bus_add_eventcall, bus_remove_eventcall + * and bus_post_event + */ + +/*ARGSUSED*/ +static int +eibnx_get_eventcookie(dev_info_t *dip, dev_info_t *rdip, + char *name, ddi_eventcookie_t *cookiep) +{ + return (ndi_event_retrieve_cookie(enx_ndi_event_hdl, rdip, name, + cookiep, NDI_EVENT_NOPASS)); +} + +/*ARGSUSED*/ +static int +eibnx_add_eventcall(dev_info_t *dip, dev_info_t *rdip, ddi_eventcookie_t cookie, + void (*callback)(dev_info_t *cb_dip, ddi_eventcookie_t cb_cookie, + void *cb_arg, void 
*cb_impl_data), + void *arg, ddi_callback_id_t *cb_id) +{ + return (ndi_event_add_callback(enx_ndi_event_hdl, rdip, cookie, + callback, arg, NDI_SLEEP, cb_id)); +} + +/*ARGSUSED*/ +static int +eibnx_remove_eventcall(dev_info_t *dip, ddi_callback_id_t cb_id) +{ + return (ndi_event_remove_callback(enx_ndi_event_hdl, cb_id)); +} + +/*ARGSUSED*/ +static int +eibnx_post_event(dev_info_t *dip, dev_info_t *rdip, + ddi_eventcookie_t cookie, void *impl_data) +{ + return (ndi_event_run_callbacks(enx_ndi_event_hdl, rdip, cookie, + impl_data)); +} + +/* + * Routines to configure/unconfigure EoIB node(s) on a system. + */ + +/*ARGSUSED*/ +static int +eibnx_config_all_children(dev_info_t *parent) +{ + eibnx_t *ss = enx_global_ss; + eibnx_hca_t *hca; + eibnx_port_t *port; + eibnx_thr_info_t *ti; + eibnx_thr_info_t *ti_tail; + eibnx_gw_info_t *gwi; + + /* + * Go through each port of each hca and create a thread to solicit, + * monitor, receive advertisements, create eoib nodes and attach eoib + * driver instances. + */ + mutex_enter(&ss->nx_lock); + if (!ss->nx_monitors_up) { + ss->nx_thr_info = ti_tail = NULL; + for (hca = ss->nx_hca; hca; hca = hca->hc_next) { + for (port = hca->hc_port; port; port = port->po_next) { + ti = eibnx_start_port_monitor(hca, port); + if (ti_tail) { + ti_tail->ti_next = ti; + } else { + ss->nx_thr_info = ti; + } + ti_tail = ti; + } + } + + ss->nx_monitors_up = B_TRUE; + mutex_exit(&ss->nx_lock); + + return (NDI_SUCCESS); + } + mutex_exit(&ss->nx_lock); + + while (eibnx_locate_unconfigured_node(&ti, &gwi) == ENX_E_SUCCESS) + (void) eibnx_configure_node(ti, gwi, NULL); + + return (NDI_SUCCESS); +} + +/* + * Routine to unconfigure all the EoIB nodes on a system. This terminates + * all the per-port monitor threads and releases any resources allocated. 
+ */ + +/*ARGSUSED*/ +static void +eibnx_unconfig_all_children(dev_info_t *parent) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + eibnx_child_t *ch; + + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + mutex_enter(&ti->ti_child_lock); + for (ch = ti->ti_child; ch; ch = ch->ch_next) { + ch->ch_dip = NULL; + } + mutex_exit(&ti->ti_child_lock); + } + mutex_exit(&ss->nx_lock); +} + +/*ARGSUSED*/ +static int +eibnx_config_child(char *devname, dev_info_t **childp) +{ + eibnx_thr_info_t *ti; + eibnx_gw_info_t *gwi; + + if (eibnx_locate_node_name(devname, &ti, &gwi) == ENX_E_FAILURE) { + ENX_DPRINTF_DEBUG("eibnx_config_child: invalid eoib " + "nodename %s, no such address", devname); + return (ENX_E_FAILURE); + } + + return (eibnx_configure_node(ti, gwi, childp)); +} + +/*ARGSUSED*/ +static int +eibnx_unconfig_child(char *devname) +{ + eibnx_thr_info_t *ti; + eibnx_gw_info_t *gwi; + + if (eibnx_locate_node_name(devname, &ti, &gwi) == ENX_E_FAILURE) { + ENX_DPRINTF_DEBUG("eibnx_unconfig_child: invalid eoib " + "nodename %s, no such address", devname); + return (ENX_E_FAILURE); + } + + return (eibnx_unconfigure_node(ti, gwi)); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_misc.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,627 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +static char *eibnx_make_nodename(eibnx_thr_info_t *, uint16_t); + +/* + * This routine is only called when the port-monitor thread is + * about to die. Between the time the first mcast solicitation + * was done by the port-monitor thread and the time it is asked + * to die, a lot of things could've happened and we need to + * cleanup all of it. 
+ */ +void +eibnx_cleanup_port_nodes(eibnx_thr_info_t *info) +{ + eibnx_t *ss = enx_global_ss; + eibnx_nodeq_t *node; + eibnx_nodeq_t *prev; + eibnx_gw_info_t *gwi; + eibnx_gw_info_t *gw_list; + eibnx_gw_info_t *nxt_gwi; + eibnx_child_t *child; + eibnx_child_t *nxt_child; + eibnx_child_t *children; + + /* + * Since we would've already stopped processing completions for + * this thread's work queue, we don't have to worry about requests + * coming in for creation of new eoib nodes. However, there may + * be pending node creation requests for this port (thr_info) + * that we will have to drop. + */ + mutex_enter(&ss->nx_nodeq_lock); + prev = NULL; + for (node = ss->nx_nodeq; node; node = node->nc_next) { + if (node->nc_info != info) { + prev = node; + } else { + if (prev == NULL) { + ss->nx_nodeq = node->nc_next; + } else { + prev->nc_next = node->nc_next; + } + kmem_free(node, sizeof (eibnx_nodeq_t)); + } + } + mutex_exit(&ss->nx_nodeq_lock); + + /* + * Now go through the list of all children and free up any + * resource we might've allocated; note that the child dips + * could've been offlined/removed by now, so we don't do + * anything with them. + */ + mutex_enter(&info->ti_child_lock); + children = info->ti_child; + info->ti_child = NULL; + mutex_exit(&info->ti_child_lock); + + for (child = children; child; child = nxt_child) { + nxt_child = child->ch_next; + + if (child->ch_node_name) { + kmem_free(child->ch_node_name, MAXNAMELEN); + } + kmem_free(child, sizeof (eibnx_child_t)); + } + + /* + * Return all the swqes we've acquired for the gateway unicast + * solicitations, free any address vectors we've allocated and + * finally free the gw entries from the list. 
+ */ + mutex_enter(&info->ti_gw_lock); + gw_list = info->ti_gw; + info->ti_gw = NULL; + mutex_exit(&info->ti_gw_lock); + + for (gwi = gw_list; gwi; gwi = nxt_gwi) { + nxt_gwi = gwi->gw_next; + + eibnx_release_swqe((eibnx_wqe_t *)(gwi->gw_swqe)); + if ((gwi->gw_addr).ga_vect) { + kmem_free((gwi->gw_addr).ga_vect, + sizeof (ibt_adds_vect_t)); + (gwi->gw_addr).ga_vect = NULL; + } + mutex_destroy(&gwi->gw_adv_lock); + + kmem_free(gwi, sizeof (eibnx_gw_info_t)); + } +} + +/* + * Communicate all the details we received about the gateway (via the + * advertisement control message) to the eoib instance we're creating. + */ +void +eibnx_create_node_props(dev_info_t *dip, eibnx_thr_info_t *info, + eibnx_gw_info_t *gwi) +{ + int ret; + + ret = ndi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_HCA_GUID, + info->ti_hca_guid); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int64() failed to set " + "%s property to 0x%llx for child dip 0x%llx, ret=%d", + EIB_PROP_HCA_GUID, info->ti_hca_guid, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_HCA_PORTNUM, + info->ti_pi->p_port_num); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_HCA_PORTNUM, info->ti_pi->p_port_num, dip, ret); + } + + ret = ndi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SYS_GUID, + gwi->gw_system_guid); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int64() failed to set " + "%s property to 0x%llx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_SYS_GUID, gwi->gw_system_guid, dip, ret); + } + + ret = ndi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_GW_GUID, + gwi->gw_guid); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int64() failed to set " + "%s property to 0x%llx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_GUID, gwi->gw_guid, dip, ret); + } + + ret = ndi_prop_update_int64(DDI_DEV_T_NONE, dip, 
EIB_PROP_GW_SN_PREFIX, + (gwi->gw_addr).ga_gid.gid_prefix); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int64() failed to set " + "%s property to 0x%llx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_SN_PREFIX, (gwi->gw_addr).ga_gid.gid_prefix, + dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_ADV_PERIOD, + gwi->gw_adv_period); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_ADV_PERIOD, gwi->gw_adv_period, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_KA_PERIOD, + gwi->gw_ka_period); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_KA_PERIOD, gwi->gw_ka_period, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_VNIC_KA_PERIOD, + gwi->gw_vnic_ka_period); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_VNIC_KA_PERIOD, gwi->gw_vnic_ka_period, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_CTRL_QPN, + gwi->gw_ctrl_qpn); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_CTRL_QPN, gwi->gw_ctrl_qpn, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_LID, + gwi->gw_lid); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_LID, gwi->gw_lid, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_PORTID, + gwi->gw_portid); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 
0x%llx, ret=%d", + EIB_PROP_GW_PORTID, gwi->gw_portid, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, + EIB_PROP_GW_NUM_NET_VNICS, gwi->gw_num_net_vnics); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_NUM_NET_VNICS, gwi->gw_num_net_vnics, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_AVAILABLE, + gwi->gw_flag_available); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_AVAILABLE, gwi->gw_flag_available, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_HOST_VNICS, + gwi->gw_is_host_adm_vnics); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_HOST_VNICS, gwi->gw_is_host_adm_vnics, + dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SL, + gwi->gw_sl); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_SL, gwi->gw_sl, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_N_RSS_QPN, + gwi->gw_n_rss_qpn); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_N_RSS_QPN, gwi->gw_n_rss_qpn, dip, ret); + } + + ret = ndi_prop_update_string(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SYS_NAME, + (char *)(gwi->gw_system_name)); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_string() failed to set " + "%s property to '%s' for child dip 0x%llx, ret=%d", + EIB_PROP_GW_SYS_NAME, gwi->gw_system_name, dip, ret); + } + + ret = ndi_prop_update_string(DDI_DEV_T_NONE, dip, EIB_PROP_GW_PORT_NAME, + (char 
*)(gwi->gw_port_name)); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_string() failed to set " + "%s property to '%s' for child dip 0x%llx, ret=%d", + EIB_PROP_GW_PORT_NAME, gwi->gw_port_name, dip, ret); + } + + ret = ndi_prop_update_string(DDI_DEV_T_NONE, dip, EIB_PROP_GW_VENDOR_ID, + (char *)(gwi->gw_vendor_id)); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_string() failed to set " + "%s property to '%s' for child dip 0x%llx, ret=%d", + EIB_PROP_GW_VENDOR_ID, gwi->gw_vendor_id, dip, ret); + } +} + +int +eibnx_name_child(dev_info_t *child, char *name, size_t namesz) +{ + char *node_name; + + if ((node_name = ddi_get_parent_data(child)) == NULL) { + ENX_DPRINTF_ERR("ddi_get_parent_data(child=0x%llx) " + "returned NULL", child); + return (DDI_NOT_WELL_FORMED); + } + + /* + * Skip the name and "@" part in the eoib node path and copy the + * address part out to the caller. + */ + (void) strlcpy(name, node_name + strlen(EIB_DRV_NAME) + 1, namesz); + + return (DDI_SUCCESS); +} + +/* + * Synchronization functions to mark/clear the in-progress status of + * bus config/unconfig operations + */ + +void +eibnx_busop_inprog_enter(eibnx_t *ss) +{ + mutex_enter(&ss->nx_busop_lock); + + while (ss->nx_busop_flags & NX_FL_BUSOP_INPROG) + cv_wait(&ss->nx_busop_cv, &ss->nx_busop_lock); + + ss->nx_busop_flags |= NX_FL_BUSOP_INPROG; + + mutex_exit(&ss->nx_busop_lock); +} + +void +eibnx_busop_inprog_exit(eibnx_t *ss) +{ + mutex_enter(&ss->nx_busop_lock); + + ss->nx_busop_flags &= (~NX_FL_BUSOP_INPROG); + + cv_broadcast(&ss->nx_busop_cv); + mutex_exit(&ss->nx_busop_lock); +} + +eibnx_thr_info_t * +eibnx_start_port_monitor(eibnx_hca_t *hca, eibnx_port_t *port) +{ + eibnx_thr_info_t *ti; + kthread_t *kt; + dev_info_t *hca_dip; + const char *hca_drv_name; + int hca_drv_inst; + + ti = kmem_zalloc(sizeof (eibnx_thr_info_t), KM_SLEEP); + + mutex_init(&ti->ti_mcg_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ti->ti_gw_lock, NULL, MUTEX_DRIVER, 
NULL); + mutex_init(&ti->ti_child_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ti->ti_event_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ti->ti_event_cv, NULL, CV_DEFAULT, NULL); + + ti->ti_next = NULL; + ti->ti_hca_guid = hca->hc_guid; + ti->ti_hca = hca->hc_hdl; + ti->ti_pd = hca->hc_pd; + ti->ti_pi = port->po_pi; + ti->ti_ident = kmem_zalloc(MAXNAMELEN, KM_SLEEP); + + /* + * Prepare the "ident" for EoIB nodes from this port monitor. To + * associate eoib instances with the corresponding HCA nodes easily, + * and to make sure eoib instance numbers do not change when + * like-for-like HCA replacements are made, tie up the ident to + * HCA driver name, HCA driver instance and the HCA port number. + * The eoib node address is later composed using this ident and + * the gateway port ids after discovery. + */ + if ((hca_dip = ibtl_ibnex_hcaguid2dip(ti->ti_hca_guid)) == NULL) { + ENX_DPRINTF_WARN("ibtl_ibnex_hcaguid2dip(hca_guid=0x%llx) " + "returned NULL", ti->ti_hca_guid); + } else if ((hca_drv_name = ddi_driver_name(hca_dip)) == NULL) { + ENX_DPRINTF_WARN("hca driver name NULL for " + "hca_guid=0x%llx, hca_dip=0x%llx", + ti->ti_hca_guid, hca_dip); + } else if ((hca_drv_inst = ddi_get_instance(hca_dip)) < 0) { + ENX_DPRINTF_ERR("hca driver instance (%d) invalid for " + "hca_guid=0x%llx, hca_dip=0x%llx", + ti->ti_hca_guid, hca_dip); + } else { + (void) snprintf(ti->ti_ident, MAXNAMELEN, "%s%d,%x", + hca_drv_name, hca_drv_inst, ti->ti_pi->p_port_num); + } + + kt = thread_create(NULL, 0, eibnx_port_monitor, + ti, 0, &p0, TS_RUN, minclsyspri); + + ti->ti_kt_did = kt->t_did; + + return (ti); +} + +void +eibnx_stop_port_monitor(eibnx_thr_info_t *ti) +{ + /* + * Tell the port monitor thread to stop and wait for it to + * happen. Before marking it for death, make sure there + * aren't any completions being processed. 
+ */ + mutex_enter(&ti->ti_event_lock); + while (ti->ti_event & ENX_EVENT_COMPLETION) { + cv_wait(&ti->ti_event_cv, &ti->ti_event_lock); + } + ti->ti_event |= ENX_EVENT_DIE; + cv_broadcast(&ti->ti_event_cv); + mutex_exit(&ti->ti_event_lock); + + thread_join(ti->ti_kt_did); + + /* + * Destroy synchronization primitives initialized for this ti + */ + cv_destroy(&ti->ti_event_cv); + mutex_destroy(&ti->ti_event_lock); + mutex_destroy(&ti->ti_child_lock); + mutex_destroy(&ti->ti_gw_lock); + mutex_destroy(&ti->ti_mcg_lock); + + kmem_free(ti->ti_ident, MAXNAMELEN); + kmem_free(ti, sizeof (eibnx_thr_info_t)); +} + +void +eibnx_terminate_monitors(void) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti_list; + eibnx_thr_info_t *ti; + eibnx_thr_info_t *ti_next; + + mutex_enter(&ss->nx_lock); + ti_list = ss->nx_thr_info; + ss->nx_thr_info = NULL; + mutex_exit(&ss->nx_lock); + + /* + * Ask all the port_monitor threads to die. Before marking them + * for death, make sure there aren't any completions being + * processed by the thread. 
+ */ + for (ti = ti_list; ti; ti = ti_next) { + ti_next = ti->ti_next; + eibnx_stop_port_monitor(ti); + } + + mutex_enter(&ss->nx_lock); + ss->nx_monitors_up = B_FALSE; + mutex_exit(&ss->nx_lock); +} + +int +eibnx_configure_node(eibnx_thr_info_t *ti, eibnx_gw_info_t *gwi, + dev_info_t **childp) +{ + eibnx_t *ss = enx_global_ss; + dev_info_t *child_dip; + char *node_name; + int circular; + int ret; + + /* + * Prepare the new node's name + */ + if ((node_name = eibnx_make_nodename(ti, gwi->gw_portid)) == NULL) + return (ENX_E_FAILURE); + + ndi_devi_enter(ss->nx_dip, &circular); + + if (child_dip = ndi_devi_findchild(ss->nx_dip, node_name)) { + ret = eibnx_update_child(ti, gwi, child_dip); + if (ret == ENX_E_SUCCESS) { + ndi_devi_exit(ss->nx_dip, circular); + kmem_free(node_name, MAXNAMELEN); + + if (childp) { + *childp = child_dip; + } + return (ENX_E_SUCCESS); + } + } + + /* + * If the node does not already exist, we may need to create it + */ + if (child_dip == NULL) { + ndi_devi_alloc_sleep(ss->nx_dip, EIB_DRV_NAME, + (pnode_t)DEVI_SID_NODEID, &child_dip); + + ddi_set_parent_data(child_dip, node_name); + eibnx_create_node_props(child_dip, ti, gwi); + } + + /* + * Whether there was no devinfo node at all for the given node name or + * we had a devinfo node, but it wasn't in our list of eoib children, + * we'll try to online the instance here. 
+ */ + ENX_DPRINTF_DEBUG("onlining %s", node_name); + ret = ndi_devi_online(child_dip, 0); + if (ret != NDI_SUCCESS) { + ENX_DPRINTF_ERR("ndi_devi_online(node_name=%s) failed " + "with ret=0x%x", node_name, ret); + + ddi_set_parent_data(child_dip, NULL); + (void) ndi_devi_free(child_dip); + + ndi_devi_exit(ss->nx_dip, circular); + kmem_free(node_name, MAXNAMELEN); + + return (ENX_E_FAILURE); + } + + eibnx_enqueue_child(ti, gwi, node_name, child_dip); + + ndi_devi_exit(ss->nx_dip, circular); + + if (childp) { + *childp = child_dip; + } + + return (ENX_E_SUCCESS); +} + +int +eibnx_unconfigure_node(eibnx_thr_info_t *ti, eibnx_gw_info_t *gwi) +{ + /* + * To unconfigure an eoib node, we only need to set the child's + * dip to NULL. When the node gets configured again, we either + * find the dip for the pathname and set it in this child, or + * allocate a new dip and set it in this child. + */ + return (eibnx_update_child(ti, gwi, NULL)); +} + +int +eibnx_locate_node_name(char *devname, eibnx_thr_info_t **ti_p, + eibnx_gw_info_t **gwi_p) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + eibnx_gw_info_t *gwi; + char name[MAXNAMELEN]; + + /* + * Locate the port monitor thread info and gateway info + * that corresponds to the supplied devname. 
+ */ + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + if (ti->ti_ident[0] == '\0') + continue; + + mutex_enter(&ti->ti_gw_lock); + for (gwi = ti->ti_gw; gwi; gwi = gwi->gw_next) { + (void) snprintf(name, MAXNAMELEN, + "%s@%s,%x", EIB_DRV_NAME, ti->ti_ident, + gwi->gw_portid); + + if (strcmp(name, devname) == 0) + break; + } + mutex_exit(&ti->ti_gw_lock); + + if (gwi) { + break; + } + } + mutex_exit(&ss->nx_lock); + + if (ti == NULL || gwi == NULL) { + return (ENX_E_FAILURE); + } + + *ti_p = ti; + *gwi_p = gwi; + + return (ENX_E_SUCCESS); +} + +int +eibnx_locate_unconfigured_node(eibnx_thr_info_t **ti_p, eibnx_gw_info_t **gwi_p) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + eibnx_child_t *ch; + + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + mutex_enter(&ti->ti_child_lock); + for (ch = ti->ti_child; ch; ch = ch->ch_next) { + if (ch->ch_dip == NULL) { + *ti_p = ti; + *gwi_p = ch->ch_gwi; + + mutex_exit(&ti->ti_child_lock); + mutex_exit(&ss->nx_lock); + + return (ENX_E_SUCCESS); + } + } + mutex_exit(&ti->ti_child_lock); + } + mutex_exit(&ss->nx_lock); + + return (ENX_E_FAILURE); +} + +static char * +eibnx_make_nodename(eibnx_thr_info_t *info, uint16_t gw_portid) +{ + char *name; + + if (info->ti_ident[0] == NULL) + return (NULL); + + name = kmem_zalloc(MAXNAMELEN, KM_SLEEP); + (void) snprintf(name, MAXNAMELEN, "%s@%s,%x", EIB_DRV_NAME, + info->ti_ident, gw_portid); + + return (name); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_q.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,644 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +/* + * Acquire an SWQE + */ + +/*ARGSUSED*/ +eibnx_wqe_t * +eibnx_acquire_swqe(eibnx_thr_info_t *info, int flag) +{ + eibnx_wqe_t *wqe = NULL; + eibnx_tx_t *snd_p = &info->ti_snd; + int i; + + for (i = 0; i < ENX_NUM_SWQE; i++) { + wqe = &(snd_p->tx_wqe[i]); + + mutex_enter(&wqe->qe_lock); + if ((wqe->qe_flags & ENX_QEFL_INUSE) == 0) { + wqe->qe_flags |= ENX_QEFL_INUSE; + mutex_exit(&wqe->qe_lock); + break; + } + mutex_exit(&wqe->qe_lock); + } + + /* + * We probably have enough swqe entries for doing our solicitations. + * If we find it not enough in practice, we need to implement some + * sort of dynamic allocation. + */ + if (i == ENX_NUM_SWQE) + wqe = NULL; + + return (wqe); +} + +/* + * Return a SWQE from completion. 
We may have to release + * it or keep it. + */ +void +eibnx_return_swqe(eibnx_wqe_t *wqe) +{ + ASSERT(wqe->qe_type == ENX_QETYP_SWQE); + + mutex_enter(&wqe->qe_lock); + + /* + * This send wqe is from the completion queue. We need to + * clear the 'posted' flag first. + */ + ASSERT((wqe->qe_flags & ENX_QEFL_POSTED) == ENX_QEFL_POSTED); + wqe->qe_flags &= (~ENX_QEFL_POSTED); + + /* + * See if we need to release this send wqe back to the pool + * on completion. We may not need to do so if, for example, + * this were a swqe acquired specifically for a particular gw. + */ + if (wqe->qe_flags & ENX_QEFL_RELONCOMP) { + wqe->qe_sgl.ds_len = wqe->qe_bufsz; + wqe->qe_flags &= (~ENX_QEFL_INUSE); + + wqe->qe_flags &= (~ENX_QEFL_RELONCOMP); + } + + mutex_exit(&wqe->qe_lock); +} + +/* + * Return a RWQE from completion. We probably have to repost it. + */ +void +eibnx_return_rwqe(eibnx_thr_info_t *info, eibnx_wqe_t *wqe) +{ + ibt_status_t ret; + + ASSERT(wqe->qe_type == ENX_QETYP_RWQE); + + mutex_enter(&wqe->qe_lock); + + /* + * We should never need to free an rwqe on completion. + */ + ASSERT((wqe->qe_flags & ENX_QEFL_RELONCOMP) == 0); + + /* + * An rwqe is always in-use and posted, so we only need to make + * sure the ds_len is adjusted back to the value it's supposed + * to have. + */ + wqe->qe_sgl.ds_len = wqe->qe_bufsz; + + /* + * Repost the recv wqe + */ + ret = ibt_post_recv(info->ti_chan, &(wqe->qe_wr.recv), 1, NULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_post_recv(chan_hdl=0x%llx) failed, " + "ret=%d", info->ti_chan, ret); + } + + mutex_exit(&wqe->qe_lock); +} + +/* + * Release an SWQE that was acquired earlier. + */ +void +eibnx_release_swqe(eibnx_wqe_t *wqe) +{ + ASSERT(wqe->qe_type == ENX_QETYP_SWQE); + + mutex_enter(&wqe->qe_lock); + + /* + * Make sure this swqe is in use. Since this routine may also be + * called when we're trying to cleanup the eoib nodes, we + * should clear all flag bits. 
+ */ + ASSERT((wqe->qe_flags & ENX_QEFL_INUSE) == ENX_QEFL_INUSE); + wqe->qe_flags = 0; + + mutex_exit(&wqe->qe_lock); +} + +/* + * Insert the passed child to the head of the queue + */ +void +eibnx_enqueue_child(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi, + char *node_name, dev_info_t *dip) +{ + eibnx_child_t *ch; + eibnx_child_t *new_ch; + + new_ch = kmem_zalloc(sizeof (eibnx_child_t), KM_SLEEP); + new_ch->ch_dip = dip; + new_ch->ch_node_name = node_name; + new_ch->ch_gwi = gwi; + + mutex_enter(&info->ti_child_lock); + + /* + * Search existing children to see if we already have this + * child. If so, simply update its dip and node_name + */ + for (ch = info->ti_child; ch; ch = ch->ch_next) { + if (ch->ch_gwi->gw_portid == gwi->gw_portid) { + ch->ch_dip = dip; + if (ch->ch_node_name) { + kmem_free(ch->ch_node_name, MAXNAMELEN); + } + ch->ch_node_name = node_name; + kmem_free(new_ch, sizeof (eibnx_child_t)); + return; + } + } + + /* + * If not, add the new child to the list of children + */ + new_ch->ch_next = info->ti_child; + info->ti_child = new_ch; + + mutex_exit(&info->ti_child_lock); +} + +int +eibnx_update_child(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi, + dev_info_t *dip) +{ + eibnx_child_t *ch; + + mutex_enter(&info->ti_child_lock); + for (ch = info->ti_child; ch; ch = ch->ch_next) { + if (ch->ch_gwi->gw_portid == gwi->gw_portid) { + if (ch->ch_dip != dip) { + ENX_DPRINTF_DEBUG("updating child dip for " + "gw portid 0x%x to 0x%llx", + gwi->gw_portid, dip); + ch->ch_dip = dip; + } + mutex_exit(&info->ti_child_lock); + + return (ENX_E_SUCCESS); + } + } + mutex_exit(&info->ti_child_lock); + + return (ENX_E_FAILURE); +} + +dev_info_t * +eibnx_find_child_dip_by_inst(eibnx_thr_info_t *info, int inst) +{ + eibnx_child_t *ch; + dev_info_t *dip = NULL; + + mutex_enter(&info->ti_child_lock); + for (ch = info->ti_child; ch != NULL; ch = ch->ch_next) { + dip = ch->ch_dip; + if (ddi_get_instance(dip) == inst) + break; + } + mutex_exit(&info->ti_child_lock); + + 
return (dip); +} + +dev_info_t * +eibnx_find_child_dip_by_gw(eibnx_thr_info_t *info, uint16_t gw_portid) +{ + eibnx_child_t *ch; + dev_info_t *dip = NULL; + + mutex_enter(&info->ti_child_lock); + for (ch = info->ti_child; ch != NULL; ch = ch->ch_next) { + dip = ch->ch_dip; + if (ch->ch_gwi->gw_portid == gw_portid) + break; + } + mutex_exit(&info->ti_child_lock); + + return (dip); +} + +/* + * See if the passed gateway is already found in our list. Note + * that we assume that the gateway port id uniquely identifies each + * gateway. + */ +eibnx_gw_info_t * +eibnx_find_gw_in_gwlist(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi) +{ + eibnx_gw_info_t *lgw = NULL; + + mutex_enter(&info->ti_gw_lock); + for (lgw = info->ti_gw; lgw; lgw = lgw->gw_next) { + if (lgw->gw_portid == gwi->gw_portid) + break; + } + mutex_exit(&info->ti_gw_lock); + + return (lgw); +} + +/* + * Add a newly discovered gateway to the gateway list. Since we'll + * need to send unicast solicitations to this gateway soon, we'll + * also grab a swqe entry, and initialize basic gw adress parameters + * such as the gid, qpn, qkey and pkey of the GW. When we eventually + * get to sending the unicast to this gateway for the first time, + * we'll discover the path to this gateway using these parameters + * and modify the ud destination handle appropriately. 
+ */ +eibnx_gw_info_t * +eibnx_add_gw_to_gwlist(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi, + ibt_wc_t *wc, uint8_t *recv_buf) +{ + eibnx_gw_info_t *new_gwi; + eibnx_wqe_t *wqe; + ib_grh_t *grh; + ib_gid_t sgid; + clock_t timeout_usecs; + + /* + * For now, we'll simply do KM_NOSLEEP allocation, since this code + * is called from within rx processing + */ + new_gwi = kmem_zalloc(sizeof (eibnx_gw_info_t), KM_NOSLEEP); + if (new_gwi == NULL) { + ENX_DPRINTF_WARN("no memory, gw port_id 0x%x " + "will be ignored by hca_guid=0x%llx, port=0x%x", + gwi->gw_portid, info->ti_hca_guid, + info->ti_pi->p_port_num); + return (NULL); + } + + /* + * We also need to acquire a send wqe to do unicast solicitations + * to this gateway later on. We should've enough pre-allocated swqes + * to do this without sleeping. + */ + if ((wqe = eibnx_acquire_swqe(info, KM_NOSLEEP)) == NULL) { + ENX_DPRINTF_WARN("no swqe available, gw port_id 0x%x " + "will be ignored by hca_guid=0x%llx, port=0x%x", + gwi->gw_portid, info->ti_hca_guid, + info->ti_pi->p_port_num); + kmem_free(new_gwi, sizeof (eibnx_gw_info_t)); + return (NULL); + } + + /* + * Initialize gw state and wqe information. + */ + new_gwi->gw_next = NULL; + new_gwi->gw_swqe = wqe; + new_gwi->gw_state = gwi->gw_state; + + /* + * Set up gateway advertisement monitoring parameters. Since we + * always need to check against a timeout value of 2.5 * gw_adv_period, + * we'll keep this pre-calculated value as well. + */ + mutex_init(&new_gwi->gw_adv_lock, NULL, MUTEX_DRIVER, NULL); + new_gwi->gw_adv_flag = gwi->gw_adv_flag; + new_gwi->gw_adv_last_lbolt = ddi_get_lbolt64(); + timeout_usecs = gwi->gw_adv_period * 1000; + timeout_usecs = ((timeout_usecs << 2) + timeout_usecs) >> 1; + new_gwi->gw_adv_timeout_ticks = drv_usectohz(timeout_usecs); + + /* + * Initialize gateway address information. Note that if the message has + * a GRH, we'll use the subnet prefix, otherwise we'll assume that the + * gateway is in the same subnet as ourselves. 
+ */ + new_gwi->gw_addr.ga_vect = NULL; + if (wc->wc_flags & IBT_WC_GRH_PRESENT) { + grh = (ib_grh_t *)(uintptr_t)recv_buf; + new_gwi->gw_addr.ga_gid.gid_prefix = + ntohll(grh->SGID.gid_prefix); + } else { + sgid = info->ti_pi->p_sgid_tbl[0]; + new_gwi->gw_addr.ga_gid.gid_prefix = + sgid.gid_prefix; + } + new_gwi->gw_addr.ga_gid.gid_guid = gwi->gw_guid; + new_gwi->gw_addr.ga_qpn = gwi->gw_ctrl_qpn; + new_gwi->gw_addr.ga_qkey = EIB_FIP_QKEY; + new_gwi->gw_addr.ga_pkey = EIB_ADMIN_PKEY; + + /* + * Initialize gateway parameters received via the advertisement + */ + new_gwi->gw_system_guid = gwi->gw_system_guid; + new_gwi->gw_guid = gwi->gw_guid; + new_gwi->gw_adv_period = gwi->gw_adv_period; + new_gwi->gw_ka_period = gwi->gw_ka_period; + new_gwi->gw_vnic_ka_period = gwi->gw_vnic_ka_period; + new_gwi->gw_ctrl_qpn = gwi->gw_ctrl_qpn; + new_gwi->gw_lid = gwi->gw_lid; + new_gwi->gw_portid = gwi->gw_portid; + new_gwi->gw_num_net_vnics = gwi->gw_num_net_vnics; + new_gwi->gw_is_host_adm_vnics = gwi->gw_is_host_adm_vnics; + new_gwi->gw_sl = gwi->gw_sl; + new_gwi->gw_n_rss_qpn = gwi->gw_n_rss_qpn; + new_gwi->gw_flag_ucast_advt = gwi->gw_flag_ucast_advt; + new_gwi->gw_flag_available = gwi->gw_flag_available; + bcopy(gwi->gw_system_name, new_gwi->gw_system_name, + sizeof (new_gwi->gw_system_name)); + bcopy(gwi->gw_port_name, new_gwi->gw_port_name, + sizeof (new_gwi->gw_port_name)); + bcopy(gwi->gw_vendor_id, new_gwi->gw_vendor_id, + sizeof (new_gwi->gw_vendor_id)); + + /* + * Queue up the new gwi and return it + */ + mutex_enter(&info->ti_gw_lock); + new_gwi->gw_next = info->ti_gw; + info->ti_gw = new_gwi; + mutex_exit(&info->ti_gw_lock); + + return (new_gwi); +} + +/* + * Update old data for the gateway in our list with the new data. 
+ */ +void +eibnx_replace_gw_in_gwlist(eibnx_thr_info_t *info, eibnx_gw_info_t *orig_gwi, + eibnx_gw_info_t *new_gwi, ibt_wc_t *wc, uint8_t *recv_buf, + boolean_t *gwi_changed) +{ + ib_sn_prefix_t new_gw_sn_prefix; + ib_grh_t *grh; + ib_gid_t sgid; + boolean_t changed = B_FALSE; + boolean_t gw_addr_changed = B_TRUE; + + /* + * We'll update all info received in the new advertisement in + * the original gwi and also move the gw_state to that of the state + * in the new gwi. + */ + mutex_enter(&info->ti_gw_lock); + + orig_gwi->gw_state = new_gwi->gw_state; + + /* + * The guids shouldn't really change for the "same" gateway + */ + if (new_gwi->gw_system_guid != orig_gwi->gw_system_guid) { + ENX_DPRINTF_WARN("gateway system guid changed for the " + "*same* gateway from 0x%llx to 0x%llx", + orig_gwi->gw_system_guid, new_gwi->gw_system_guid); + + orig_gwi->gw_system_guid = new_gwi->gw_system_guid; + changed = B_TRUE; + } + if (new_gwi->gw_guid != orig_gwi->gw_guid) { + ENX_DPRINTF_WARN("gateway guid changed for the " + "*same* gateway from 0x%llx to 0x%llx", + orig_gwi->gw_guid, new_gwi->gw_guid); + + orig_gwi->gw_guid = new_gwi->gw_guid; + changed = B_TRUE; + gw_addr_changed = B_TRUE; + } + + if (new_gwi->gw_adv_period != orig_gwi->gw_adv_period) { + ENX_DPRINTF_DEBUG("gateway adv period changed " + "from 0x%lx to 0x%lx", orig_gwi->gw_adv_period, + new_gwi->gw_adv_period); + + orig_gwi->gw_adv_period = new_gwi->gw_adv_period; + changed = B_TRUE; + } + if (new_gwi->gw_ka_period != orig_gwi->gw_ka_period) { + ENX_DPRINTF_DEBUG("gateway ka period changed " + "from 0x%lx to 0x%lx", orig_gwi->gw_ka_period, + new_gwi->gw_ka_period); + + orig_gwi->gw_ka_period = new_gwi->gw_ka_period; + changed = B_TRUE; + } + if (new_gwi->gw_vnic_ka_period != orig_gwi->gw_vnic_ka_period) { + ENX_DPRINTF_DEBUG("vnic ka period changed " + "from 0x%lx to 0x%lx", orig_gwi->gw_vnic_ka_period, + new_gwi->gw_vnic_ka_period); + + orig_gwi->gw_vnic_ka_period = new_gwi->gw_vnic_ka_period; + changed = 
B_TRUE; + } + if (new_gwi->gw_ctrl_qpn != orig_gwi->gw_ctrl_qpn) { + ENX_DPRINTF_DEBUG("gateway control qpn changed " + "from 0x%lx to 0x%lx", orig_gwi->gw_ctrl_qpn, + new_gwi->gw_ctrl_qpn); + + orig_gwi->gw_ctrl_qpn = new_gwi->gw_ctrl_qpn; + changed = B_TRUE; + } + if (new_gwi->gw_lid != orig_gwi->gw_lid) { + ENX_DPRINTF_DEBUG("gateway lid changed from 0x%x to 0x%x", + orig_gwi->gw_lid, new_gwi->gw_lid); + + orig_gwi->gw_lid = new_gwi->gw_lid; + changed = B_TRUE; + gw_addr_changed = B_TRUE; + } + + /* + * The identity of the gateway is currently defined by its portid, + * so this cannot be different or eibnx_find_gw_in_gwlist() wouldn't + * have thought it's the same. For now though, we'll treat it + * like any other parameter, and flag it if we find this different. + */ + if (new_gwi->gw_portid != orig_gwi->gw_portid) { + ENX_DPRINTF_WARN("gateway portid changed for the *same* " + "gateway from 0x%x to 0x%x", orig_gwi->gw_portid, + new_gwi->gw_portid); + + orig_gwi->gw_portid = new_gwi->gw_portid; + changed = B_TRUE; + } + + if (new_gwi->gw_is_host_adm_vnics != orig_gwi->gw_is_host_adm_vnics) { + ENX_DPRINTF_DEBUG("host adm vnics changed from 0x%x to 0x%x", + orig_gwi->gw_is_host_adm_vnics, + new_gwi->gw_is_host_adm_vnics); + + orig_gwi->gw_is_host_adm_vnics = new_gwi->gw_is_host_adm_vnics; + changed = B_TRUE; + } + if (new_gwi->gw_sl != orig_gwi->gw_sl) { + ENX_DPRINTF_DEBUG("gateway sl changed from 0x%x to 0x%x", + orig_gwi->gw_sl, new_gwi->gw_sl); + + orig_gwi->gw_sl = new_gwi->gw_sl; + changed = B_TRUE; + } + if (new_gwi->gw_n_rss_qpn != orig_gwi->gw_n_rss_qpn) { + ENX_DPRINTF_DEBUG("gateway n_rss_qpn changed from 0x%x to 0x%x", + orig_gwi->gw_n_rss_qpn, new_gwi->gw_n_rss_qpn); + + orig_gwi->gw_n_rss_qpn = new_gwi->gw_n_rss_qpn; + changed = B_TRUE; + } + + /* + * The gw_flag_ucast_advt and gw_flag_available are expected to + * change over time (and even gw_num_net_vnics could change, but + * it's of no use to us presently), and we shouldn't trigger any + * 
flag for these + */ + orig_gwi->gw_flag_ucast_advt = new_gwi->gw_flag_ucast_advt; + orig_gwi->gw_flag_available = new_gwi->gw_flag_available; + orig_gwi->gw_num_net_vnics = new_gwi->gw_num_net_vnics; + + if (strncmp((const char *)new_gwi->gw_system_name, + (const char *)orig_gwi->gw_system_name, EIB_GW_SYSNAME_LEN) != 0) { + ENX_DPRINTF_DEBUG("gateway system name changed from %s to %s", + orig_gwi->gw_system_name, new_gwi->gw_system_name); + + bcopy(new_gwi->gw_system_name, orig_gwi->gw_system_name, + EIB_GW_SYSNAME_LEN); + changed = B_TRUE; + } + if (strncmp((const char *)new_gwi->gw_port_name, + (const char *)orig_gwi->gw_port_name, EIB_GW_PORTNAME_LEN) != 0) { + ENX_DPRINTF_DEBUG("gateway port name changed from %s to %s", + orig_gwi->gw_port_name, new_gwi->gw_port_name); + + bcopy(new_gwi->gw_port_name, orig_gwi->gw_port_name, + EIB_GW_PORTNAME_LEN); + changed = B_TRUE; + } + if (strncmp((const char *)new_gwi->gw_vendor_id, + (const char *)orig_gwi->gw_vendor_id, EIB_GW_VENDOR_LEN) != 0) { + ENX_DPRINTF_DEBUG("vendor id changed from %s to %s", + orig_gwi->gw_vendor_id, new_gwi->gw_vendor_id); + + bcopy(new_gwi->gw_vendor_id, orig_gwi->gw_vendor_id, + EIB_GW_VENDOR_LEN); + changed = B_TRUE; + } + + /* + * See if the subnet prefix for the gateway has changed + */ + if (wc->wc_flags & IBT_WC_GRH_PRESENT) { + grh = (ib_grh_t *)(uintptr_t)recv_buf; + new_gw_sn_prefix = ntohll(grh->SGID.gid_prefix); + } else { + sgid = info->ti_pi->p_sgid_tbl[0]; + new_gw_sn_prefix = sgid.gid_prefix; + } + if (new_gw_sn_prefix != orig_gwi->gw_addr.ga_gid.gid_prefix) { + ENX_DPRINTF_WARN("subnet prefix changed from 0x%llx to 0x%llx", + orig_gwi->gw_addr.ga_gid.gid_prefix, new_gw_sn_prefix); + + changed = B_TRUE; + gw_addr_changed = B_TRUE; + } + + /* + * If the gateway address has changed in any way, clear the current + * address vector and update the gateway guid and gateway qpn. The + * address vector will be created the next time a unicast solicit + * is attempted for this gateway. 
+ */ + if (gw_addr_changed) { + if (orig_gwi->gw_addr.ga_vect != NULL) { + kmem_free(orig_gwi->gw_addr.ga_vect, + sizeof (ibt_adds_vect_t)); + orig_gwi->gw_addr.ga_vect = NULL; + } + orig_gwi->gw_addr.ga_gid.gid_prefix = new_gw_sn_prefix; + orig_gwi->gw_addr.ga_gid.gid_guid = new_gwi->gw_guid; + orig_gwi->gw_addr.ga_qpn = new_gwi->gw_ctrl_qpn; + orig_gwi->gw_addr.ga_qkey = EIB_FIP_QKEY; + orig_gwi->gw_addr.ga_pkey = EIB_ADMIN_PKEY; + } + + mutex_exit(&info->ti_gw_lock); + + if (gwi_changed) { + *gwi_changed = changed; + } +} + +/* + * Queue up a node for EoIB instantiation and wake up the thread + * that creates eoib nodes. + */ +void +eibnx_queue_for_creation(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi) +{ + eibnx_t *ss = enx_global_ss; + eibnx_nodeq_t *new_node; + + /* + * For now, we'll simply do KM_NOSLEEP allocation, since this + * code is called from within rx processing + */ + new_node = kmem_zalloc(sizeof (eibnx_nodeq_t), KM_NOSLEEP); + if (new_node == NULL) { + ENX_DPRINTF_WARN("no memory, eoib node will not be " + "created for hca_guid=0x%llx, hca_port=0x%x, " + "gw_port_id=0x%x", info->ti_hca_guid, + info->ti_pi->p_port_num, gwi->gw_portid); + return; + } + new_node->nc_info = info; + new_node->nc_gwi = gwi; + + /* + * If the eoib node creation thread is dying (or dead), don't + * queue up any more requests for creation + */ + mutex_enter(&ss->nx_nodeq_lock); + if (ss->nx_nodeq_thr_die) { + kmem_free(new_node, sizeof (eibnx_nodeq_t)); + } else { + new_node->nc_next = ss->nx_nodeq; + ss->nx_nodeq = new_node; + cv_signal(&ss->nx_nodeq_cv); + } + mutex_exit(&ss->nx_nodeq_lock); +}
--- a/usr/src/uts/common/io/ib/ibtl/ibtl_ibnex.c Fri Aug 13 14:44:26 2010 +0800 +++ b/usr/src/uts/common/io/ib/ibtl/ibtl_ibnex.c Fri Aug 13 07:02:57 2010 -0400 @@ -18,9 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/systm.h> @@ -517,9 +517,10 @@ * For a given pdip, of Port/VPPA devices, match it against all the * registered HCAs's dip. If match found return IBT_SUCCESS, * else IBT_NO_HCAS_AVAILABLE. + * * For IOC/Pseudo devices check if the given pdip is that of - * the ib(7d) nexus. If yes return IBT_SUCCESS, - * else IBT_NO_HCAS_AVAILABLE. + * the ib(7d) nexus or that of the eoib(7d) nexus. If yes + * return IBT_SUCCESS, else IBT_NO_HCAS_AVAILABLE. */ ibt_status_t ibtl_ibnex_valid_hca_parent(dev_info_t *pdip) @@ -530,9 +531,10 @@ pdip); /* For Pseudo devices and IOCs */ - if (strncmp(ddi_node_name(pdip), "ib", 2) == 0) + if (strncmp(ddi_node_name(pdip), "ib", 2) == 0 || + strncmp(ddi_node_name(pdip), "eibnx", 5) == 0) { return (IBT_SUCCESS); - else { + } else { /* For Port devices and VPPAs */ mutex_enter(&ibtl_clnt_list_mutex); hca_devp = ibtl_hca_list;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/ib/clients/eoib/eib.h Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,189 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_IB_EOIB_EIB_H +#define _SYS_IB_EOIB_EIB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * + * EoIB Encapsulation Header Layout + * + * 31 30 29 28 27 26 25 24 22 21 20 ... 
16 15 0 + +-----+-----+-----+-----+--+---+--+---------+-------------------------+ + | sig | ver | TCP | IP | |fcs|ms| segment | segment id | + | | | chk | chk | | | | offset | | + +-----+-----+-----+-----+--+---+--+---------+-------------------------+ + * + */ +#define EIB_ENCAP_HDR_SZ 4 + +#define EIB_ENCAP_SIGN_MASK 0x3 +#define EIB_ENCAP_SIGN_SHIFT 30 +#define EIB_ENCAP_VER_MASK 0x3 +#define EIB_ENCAP_VER_SHIFT 28 +#define EIB_ENCAP_TCPCHK_MASK 0x3 +#define EIB_ENCAP_TCPCHK_SHIFT 26 +#define EIB_ENCAP_IPCHK_MASK 0x3 +#define EIB_ENCAP_IPCHK_SHIFT 24 +#define EIB_ENCAP_FCS_B_SHIFT 22 +#define EIB_ENCAP_MS_B_SHIFT 21 +#define EIB_ENCAP_SEGOFF_MASK 0x1F +#define EIB_ENCAP_SEGOFF_SHIFT 16 +#define EIB_ENCAP_SEGID_MASK 0xFFFF + +/* + * Bit fields values definitions + */ +#define EIB_EH_SIGNATURE 3 +#define EIB_EH_VERSION 0 +#define EIB_EH_CSUM_UNKNOWN 0 +#define EIB_EH_TCPCSUM_OK 1 +#define EIB_EH_UDPCSUM_OK 2 +#define EIB_EH_CSUM_BAD 3 +#define EIB_EH_IPCSUM_OK 1 + +/* + * Some shortcuts + */ +#define EIB_TX_ENCAP_HDR 0xC0000000 +#define EIB_RX_ENCAP_TCPIP_OK 0xC5000000 +#define EIB_RX_ENCAP_UDPIP_OK 0xC9000000 + +/* + * Driver name + */ +#define EIB_DRV_NAME "eoib" + +/* + * Currently, the gateway responds to login requests on the qpn that carried + * the solicitation request, rather than on the qpn that carried the login + * request. This means that EoIB nexus receives the acknowledgements from + * gateways to login requests made by the individual EoIB instances, and must + * pass this login ack information back to the appropriate EoIB instance. + * + * Now, the only field in the login ack packet that could identify the + * individual EoIB instance is the vNIC id field, but this is a 16-bit field, + * with the MSB reserved to indicate whether the mac/vlan is host-managed + * or gateway-managed. This leaves us with just 15-bits to encode the EoIB + * device instance and its Solaris vnic instance. 
For now, we divide this + * field as a 6-bit vnic instance number (max Solaris vnics is 64) and a + * 9-bit device instance number (max EoIB pseudo-NICs in a system is 512). + * + * The long-term solution is to get the gateway to respond directly to the + * login requestor, so the requestor can use all 15-bits to identify its + * Solaris vnic instance (max 32K) and leave the device instance limit to + * the system limit. + */ +#define EIB_DVI_SHIFT 6 +#define EIB_DVI_MASK 0x1FF +#define EIB_VNI_MASK 0x03F + +#define EIB_VNIC_INSTANCE(id) ((id) & EIB_VNI_MASK) +#define EIB_DEVI_INSTANCE(id) (((id) >> EIB_DVI_SHIFT) & EIB_DVI_MASK) +#define EIB_VNIC_ID(dvi, vni) \ + ((((dvi) & EIB_DVI_MASK) << EIB_DVI_SHIFT) | ((vni) & EIB_VNI_MASK)) + +/* + * Making VHUB_ID from vlan and portid + */ +#define EIB_VHUB_ID(portid, vlan) \ + ((((uint_t)(portid) & 0xfff) << 12) | ((uint_t)(vlan) & 0xfff)) + +/* + * NDI Events that individual EoIB instance will be interested in + */ +#define EIB_NDI_EVENT_GW_AVAILABLE "SUNW,eoib:gateway-available" +#define EIB_NDI_EVENT_LOGIN_ACK "SUNW,eoib:vnic-login-ack" +#define EIB_NDI_EVENT_GW_INFO_UPDATE "SUNW,eoib:gateway-info-update" + +/* + * Properties for each eoib node created + */ +#define EIB_PROP_HCA_GUID "hca-guid" +#define EIB_PROP_HCA_PORTNUM "hca-port#" +#define EIB_PROP_GW_SYS_GUID "gw-system-guid" +#define EIB_PROP_GW_GUID "gw-guid" +#define EIB_PROP_GW_SN_PREFIX "gw-sn-prefix" +#define EIB_PROP_GW_ADV_PERIOD "gw-adv-period" +#define EIB_PROP_GW_KA_PERIOD "gw-ka-period" +#define EIB_PROP_VNIC_KA_PERIOD "vnic-ka-period" +#define EIB_PROP_GW_CTRL_QPN "gw-ctrl-qpn" +#define EIB_PROP_GW_LID "gw-lid" +#define EIB_PROP_GW_PORTID "gw-portid" +#define EIB_PROP_GW_NUM_NET_VNICS "gw-num-net-vnics" +#define EIB_PROP_GW_AVAILABLE "gw-available?" +#define EIB_PROP_GW_HOST_VNICS "gw-host-vnics?" 
+#define EIB_PROP_GW_SL "gw-sl" +#define EIB_PROP_GW_N_RSS_QPN "gw-n-rss-qpn" +#define EIB_PROP_GW_SYS_NAME "gw-system-name" +#define EIB_PROP_GW_PORT_NAME "gw-port-name" +#define EIB_PROP_GW_VENDOR_ID "gw-vendor-id" + +/* + * Gateway information passed by eibnx to eoib. The lengths of character + * strings should be longer than what is defined for these objects in fip.h, + * to accommodate the terminating null. + */ +#define EIB_GW_SYSNAME_LEN 40 +#define EIB_GW_PORTNAME_LEN 12 +#define EIB_GW_VENDOR_LEN 12 + +typedef struct eib_gw_info_s { + ib_guid_t gi_system_guid; + ib_guid_t gi_guid; + ib_sn_prefix_t gi_sn_prefix; + uint32_t gi_adv_period; + uint32_t gi_ka_period; + uint32_t gi_vnic_ka_period; + ib_qpn_t gi_ctrl_qpn; + ib_lid_t gi_lid; + uint16_t gi_portid; + uint16_t gi_num_net_vnics; + uint8_t gi_flag_available; + uint8_t gi_is_host_adm_vnics; + uint8_t gi_sl; + uint8_t gi_n_rss_qpn; + uint8_t gi_system_name[EIB_GW_SYSNAME_LEN]; + uint8_t gi_port_name[EIB_GW_PORTNAME_LEN]; + uint8_t gi_vendor_id[EIB_GW_VENDOR_LEN]; +} eib_gw_info_t; + +/* + * Softint priority levels to use for data and control/admin cq handling + * in EoIB leaf and nexus drivers + */ +#define EIB_SOFTPRI_DATA (DDI_INTR_SOFTPRI_MIN) +#define EIB_SOFTPRI_CTL (DDI_INTR_SOFTPRI_MIN + 1) +#define EIB_SOFTPRI_ADM (DDI_INTR_SOFTPRI_MIN + 1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_IB_EOIB_EIB_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/ib/clients/eoib/eib_impl.h Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,991 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_IB_EOIB_EIB_IMPL_H +#define _SYS_IB_EOIB_EIB_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/ddi.h> +#include <sys/mac.h> +#include <sys/sunddi.h> +#include <sys/varargs.h> +#include <sys/vlan.h> +#include <sys/ib/ibtl/ibti.h> +#include <sys/ib/ibtl/ibvti.h> +#include <sys/ib/ib_pkt_hdrs.h> + +#include <sys/ib/clients/eoib/fip.h> +#include <sys/ib/clients/eoib/eib.h> + +/* + * Driver specific constants + */ +#define EIB_E_SUCCESS 0 +#define EIB_E_FAILURE -1 +#define EIB_MAX_LINE 128 +#define EIB_MAX_SGL 59 +#define EIB_MAX_POST_MULTIPLE 4 +#define EIB_MAX_PAYLOAD_HDR_SZ 160 +#define EIB_TX_COPY_THRESH 4096 /* greater than mtu */ +#define EIB_MAX_VNICS 64 /* do not change this */ +#define EIB_LOGIN_TIMEOUT_USEC 8000000 +#define EIB_RWR_CHUNK_SZ 8 +#define EIB_IPHDR_ALIGN_ROOM 32 +#define EIB_IP_HDR_ALIGN 2 +#define EIB_MAX_RX_PKTS_ONINTR 0x800 +#define EIB_MAX_LOGIN_ATTEMPTS 3 +#define EIB_MAX_VHUB_TBL_ATTEMPTS 3 +#define EIB_MAX_KA_ATTEMPTS 3 +#define EIB_MAX_ATTEMPTS 10 +#define EIB_DELAY_HALF_SECOND 500000 +#define EIB_GRH_SZ (sizeof (ib_grh_t)) + +/* + * Debug messages + */ +#define EIB_MSGS_CRIT 0x01 +#define EIB_MSGS_ERR 0x02 +#define EIB_MSGS_WARN 0x04 +#define EIB_MSGS_DEBUG 0x08 +#define EIB_MSGS_ARGS 0x10 +#define EIB_MSGS_PKT 0x20 +#define EIB_MSGS_VERBOSE 0x40 +#define EIB_MSGS_DEFAULT (EIB_MSGS_CRIT | EIB_MSGS_ERR | EIB_MSGS_WARN) + +#define EIB_LOGSZ_DEFAULT 0x20000 + +#define EIB_DPRINTF_CRIT eib_dprintf_crit +#define EIB_DPRINTF_ERR eib_dprintf_err +#define EIB_DPRINTF_WARN eib_dprintf_warn +#ifdef EIB_DEBUG +#define EIB_DPRINTF_DEBUG eib_dprintf_debug +#define EIB_DPRINTF_ARGS eib_dprintf_args +#define EIB_DPRINTF_PKT eib_dprintf_pkt +#define EIB_DPRINTF_VERBOSE eib_dprintf_verbose +#else +#define EIB_DPRINTF_DEBUG 0 && +#define EIB_DPRINTF_ARGS 0 && +#define EIB_DPRINTF_PKT 0 && +#define EIB_DPRINTF_VERBOSE 0 && +#endif + +/* + * EoIB threads to provide various services + */ +#define EIB_EVENTS_HDLR 
"eib_events_handler" +#define EIB_RWQES_REFILLER "eib_rwqes_refiller" +#define EIB_VNIC_CREATOR "eib_vnic_creator" +#define EIB_TXWQES_MONITOR "eib_txwqe_monitor" +#define EIB_LSOBUFS_MONITOR "eib_lsobufs_monitor" + +/* + * Macro for finding the least significant bit set in a 64-bit unsigned int + */ +#define EIB_FIND_LSB_SET(val64) eib_setbit_mod67[((-(val64) & (val64)) % 67)] + +/* + * LSO buffers + * + * Under normal circumstances we should never need to use any buffer + * that's larger than MTU. Unfortunately, IB HCA has limitations + * on the length of SGL that are much smaller than those for regular + * ethernet NICs. Since the network layer doesn't care to limit the + * number of mblk fragments in any send mp chain, we end up having to + * use these larger buffers occasionally. + */ +#define EIB_LSO_MAXLEN 65536 +#define EIB_LSO_BUFSZ 8192 +#define EIB_LSO_NUM_BUFS 1024 +#define EIB_LSO_FREE_BUFS_THRESH (EIB_LSO_NUM_BUFS >> 5) + +typedef struct eib_lsobuf_s { + struct eib_lsobuf_s *lb_next; + uint8_t *lb_buf; + int lb_isfree; +} eib_lsobuf_t; + +typedef struct eib_lsobkt_s { + kmutex_t bk_lock; + kcondvar_t bk_cv; + uint_t bk_status; + uint8_t *bk_mem; + eib_lsobuf_t *bk_bufl; + eib_lsobuf_t *bk_free_head; + ibt_mr_hdl_t bk_mr_hdl; + ibt_lkey_t bk_lkey; + uint_t bk_nelem; + uint_t bk_nfree; +} eib_lsobkt_t; + +#define EIB_LBUF_SHORT 0x1 +#define EIB_LBUF_MONITOR_DIE 0x2 + +/* + * The admin partition is only used for sending login and logout messages + * and receiving login acknowledgements from the gateway. While packets + * going out on several vlans at the same time could result in multiple + * vnic creations happening at the same time (and therefore multiple login + * packets), we serialize the vnic creation via the vnic creator thread, so + * we shouldn't need a lot of send wqes or receive wqes. 
Note also that we + * keep the cq size request to slightly less than a 2^n boundary to allow + * the alloc cq routine to return the closest 2^n boundary as the real cq + * size without wasting too much memory. + */ +#define EIB_ADMIN_MAX_SWQE 30 +#define EIB_ADMIN_MAX_RWQE 30 +#define EIB_ADMIN_CQ_SIZE (EIB_ADMIN_MAX_SWQE + EIB_ADMIN_MAX_RWQE + 1) + +/* + * The control qp is per vhub partition, and is used to send and receive + * vhub control messages such as vhub table request/response, vhub + * update response and vnic alive messages. While the vhub table response + * and vhub update messages might take a few rwqes, the vhub table request + * is made only once per vnic, and the vnic alive message is periodic + * and uses a single swqe as well. Per vnic, we should certainly not need + * too many swqes/rwqes. + */ +#define EIB_CTL_MAX_SWQE 30 +#define EIB_CTL_MAX_RWQE 30 +#define EIB_CTL_CQ_SIZE (EIB_CTL_MAX_SWQE + EIB_CTL_MAX_RWQE + 1) + +/* + * For the vNIC's data channel, there are three items that are of importance: + * the constraints defined below, the hca_max_chan_sz attribute and the value of + * (hca_max_cq_sz - 1). The maximum limit on swqe/rwqe is set to the minimum + * of these three values. + * + * While the total number of RWQEs posted to the data channel of any vNIC will + * not exceed EIB_DATA_MAX_RWQE, we also do not want to acquire and post all of + * it during the data channel initialization, since that is a lot of wqes for + * one vnic to consume when we don't even know if the vnic will need it at all. + * We post an initial set of EIB_DATA_RWQE_BKT rwqes, and slowly post more and + * more sets as we see them being consumed, until we hit the hard limit of + * EIB_DATA_MAX_RWQE. 
+ */ +#define EIB_DATA_MAX_SWQE 4000 +#define EIB_DATA_MAX_RWQE 4000 +#define EIB_DATA_RWQE_BKT 512 + +/* + * vNIC data channel CQ moderation parameters + */ +#define EIB_TX_COMP_COUNT 10 +#define EIB_TX_COMP_USEC 300 +#define EIB_RX_COMP_COUNT 4 +#define EIB_RX_COMP_USEC 10 + +/* + * qe_info masks (blk:ndx:type:flags) + */ +#define EIB_WQEBLK_SHIFT 24 +#define EIB_WQEBLK_MASK 0xFF +#define EIB_WQENDX_SHIFT 16 +#define EIB_WQENDX_MASK 0xFF +#define EIB_WQETYP_SHIFT 8 +#define EIB_WQETYP_MASK 0xFF +#define EIB_WQEFLGS_SHIFT 0 +#define EIB_WQEFLGS_MASK 0xFF + +/* + * Macros to get the bit fields from qe_info + */ +#define EIB_WQE_BLK(info) (((info) >> EIB_WQEBLK_SHIFT) & EIB_WQEBLK_MASK) +#define EIB_WQE_NDX(info) (((info) >> EIB_WQENDX_SHIFT) & EIB_WQENDX_MASK) +#define EIB_WQE_TYPE(info) (((info) >> EIB_WQETYP_SHIFT) & EIB_WQETYP_MASK) +#define EIB_WQE_FLAGS(info) ((info) & EIB_WQEFLGS_MASK) + +/* + * Values for type and flags in qe_info + */ +#define EIB_WQE_TX 0x1 +#define EIB_WQE_RX 0x2 + +/* + * Flags for rx wqes/buffers + */ +#define EIB_WQE_FLG_POSTED_TO_HCA 0x1 +#define EIB_WQE_FLG_WITH_NW 0x2 + +/* + * Flags for tx wqes/buffers + */ +#define EIB_WQE_FLG_BUFTYPE_LSO 0x4 +#define EIB_WQE_FLG_BUFTYPE_MAPPED 0x8 + +/* + * Send/Recv workq entries + */ +typedef struct eib_wqe_s { + struct eib_wqe_pool_s *qe_pool; + uint8_t *qe_cpbuf; + uint8_t *qe_payload_hdr; + uint_t qe_bufsz; + uint_t qe_info; + int qe_vnic_inst; + ibt_ud_dest_hdl_t qe_dest; + frtn_t qe_frp; + + mblk_t *qe_mp; + ibt_mi_hdl_t qe_iov_hdl; + ibt_all_wr_t qe_wr; + ibt_wr_ds_t qe_sgl; + ibt_wr_ds_t qe_big_sgl[EIB_MAX_SGL]; + struct eib_wqe_s *qe_nxt_post; + struct eib_chan_s *qe_chan; +} eib_wqe_t; + +/* + * The wqe in-use/free status in EoIB is managed via a 2-level bitmap + * logic. + * + * Each set of 64 wqes (a "wqe block") is managed by a single 64-bit + * integer bitmap. 
The free status of a set of 64 such wqe blocks (a + * "wqe pool") is managed by one 64-bit integer bitmap (if any wqe in + * the wqe block is free, the bit in the map is 1, otherwise it is 0). + * + * The maximum pool size is 4096 wqes, but this can easily be extended + * to support more wqes using additional pools of wqes. + * + * Note that an entire pool of wqes is allocated via a single allocation, + * the wqe addresses in a pool are all contiguous. The tx/rx copy buffers + * for a wqe pool are also allocated via a single allocation. + */ +#define EIB_BLKS_PER_POOL 64 +#define EIB_WQES_PER_BLK 64 /* do not change this */ +#define EIB_WQES_PER_POOL (EIB_BLKS_PER_POOL * EIB_WQES_PER_BLK) + +#define EIB_WQE_SZ (sizeof (eib_wqe_t)) +#define EIB_WQEBLK_SZ (EIB_WQES_PER_BLK * EIB_WQE_SZ) + +typedef struct eib_wqe_pool_s { + struct eib_wqe_pool_s *wp_next; + struct eib_s *wp_ss; + ib_vaddr_t wp_vaddr; + ib_memlen_t wp_memsz; + ibt_mr_hdl_t wp_mr; + ibt_lkey_t wp_lkey; + uint_t wp_nfree_lwm; + int wp_type; + + kmutex_t wp_lock; + kcondvar_t wp_cv; + uint_t wp_status; + uint_t wp_nfree; + uint64_t wp_free_blks; + uint64_t wp_free_wqes[EIB_BLKS_PER_POOL]; + struct eib_wqe_s *wp_wqe; +} eib_wqe_pool_t; + +/* + * Values for wp_type + */ +#define EIB_WP_TYPE_TX 0x1 +#define EIB_WP_TYPE_RX 0x2 + +/* + * Values for wp_status (bit fields) + */ +#define EIB_TXWQE_SHORT 0x1 /* only for tx wqe pool */ +#define EIB_TXWQE_MONITOR_DIE 0x2 /* only for tx wqe pool */ + +#define EIB_RXWQE_SHORT 0x1 /* only for rx wqe pool */ + +/* + * The low-water-mark is an indication of when wqe grabs for low-priority + * qps should start to get refused (swqe grabs for control messages such + * as keepalives and rwqe grabs for posting back to control qps will still + * be allowed). The high-water-mark is an indication of when normal + * behavior should resume. 
+ */ +#define EIB_NFREE_SWQES_LWM (EIB_WQES_PER_POOL / 64) /* 1/64 */ +#define EIB_NFREE_SWQES_HWM (EIB_WQES_PER_POOL / 32) /* 1/32 */ +#define EIB_NFREE_RWQES_LWM (EIB_WQES_PER_POOL / 10) /* 10% */ +#define EIB_NFREE_RWQES_HWM (EIB_WQES_PER_POOL / 5) /* 20% */ + +/* + * The "rwqes low" is used to determine when we should start using allocb() + * to copy and send received mblks in the rx path. It should be a little + * above the rwqes low-water-mark, but less than the high-water-mark. + */ +#define EIB_NFREE_RWQES_LOW \ + ((EIB_NFREE_RWQES_LWM + EIB_NFREE_RWQES_HWM) / 2) + +#define EIB_WPRI_HI 1 /* for keepalive posts */ +#define EIB_WPRI_LO 2 /* for all other posts */ + +/* + * Multicast GID Layout: the multicast gid is specified in big-endian + * representation, as a collection of different-sized fields in the + * EoIB specification. On Solaris, the multicast gid is represented + * as a collection of two 8-byte fields (in ib_gid_t). + */ +typedef struct eib_mgid_spec_s { + uint8_t sp_mgid_prefix[FIP_MGID_PREFIX_LEN]; + uint8_t sp_type; + uint8_t sp_dmac[ETHERADDRL]; + uint8_t sp_rss_hash; + uint8_t sp_vhub_id[FIP_VHUBID_LEN]; +} eib_mgid_spec_t; + +/* + * Values for sp_type in mgid as per EoIB specification + */ +#define EIB_MGID_VHUB_DATA 0x0 +#define EIB_MGID_VHUB_UPDATE 0x2 +#define EIB_MGID_VHUB_TABLE 0x3 + +typedef union eib_mgid_s { + eib_mgid_spec_t gd_spec; + ib_gid_t gd_sol; +} eib_mgid_t; + +/* + * Gateway properties handed over to us by the EoIB nexus + */ +typedef struct eib_gw_props_s { + kmutex_t pp_gw_lock; + + ib_guid_t pp_gw_system_guid; + ib_guid_t pp_gw_guid; + ib_sn_prefix_t pp_gw_sn_prefix; + + uint_t pp_gw_adv_period; + uint_t pp_gw_ka_period; + uint_t pp_vnic_ka_period; + + ib_qpn_t pp_gw_ctrl_qpn; + ib_lid_t pp_gw_lid; + uint16_t pp_gw_portid; + + uint16_t pp_gw_num_net_vnics; + uint8_t pp_gw_flag_available; + uint8_t pp_gw_is_host_adm_vnics; + uint8_t pp_gw_sl; + uint8_t pp_gw_n_rss_qpn; + + uint8_t *pp_gw_system_name; + uint8_t 
*pp_gw_port_name; + uint8_t *pp_gw_vendor_id; + + clock_t pp_gw_ka_ticks; /* 2.5 x gw_ka_period */ + clock_t pp_vnic_ka_ticks; /* vnic_ka_period */ +} eib_gw_props_t; + +/* + * Port-specific properties + */ +typedef struct eib_props_s { + uint64_t ep_ifspeed; + ib_guid_t ep_hca_guid; + uint8_t ep_port_num; + ib_gid_t ep_sgid; + ib_lid_t ep_blid; + uint16_t ep_mtu; + ibt_srate_t ep_srate; +} eib_props_t; + +/* + * Capabilities derived from HCA attributes + */ +typedef struct eib_caps_s { + uint_t cp_lso_maxlen; + uint32_t cp_cksum_flags; + int cp_resv_lkey_capab; + ibt_lkey_t cp_resv_lkey; + + uint_t cp_max_swqe; + uint_t cp_max_rwqe; + uint_t cp_max_sgl; + uint_t cp_hiwm_sgl; +} eib_caps_t; + +/* + * List of multicast groups the vnic joined + */ +typedef struct eib_mcg_s { + struct eib_mcg_s *mg_next; + ib_gid_t mg_rgid; + ib_gid_t mg_mgid; + uint8_t mg_join_state; + uint8_t mg_mac[ETHERADDRL]; + ibt_mcg_info_t *mg_mcginfo; +} eib_mcg_t; + +/* + * Admin/control/data channel information + */ +typedef struct eib_chan_s { + ibt_channel_hdl_t ch_chan; + ib_qpn_t ch_qpn; + + ibt_wc_t *ch_wc; + ibt_cq_hdl_t ch_cq_hdl; + uint_t ch_cq_sz; + + ibt_wc_t *ch_rcv_wc; + ibt_cq_hdl_t ch_rcv_cq_hdl; + uint_t ch_rcv_cq_sz; + + int ch_vnic_inst; + uint_t ch_max_swqes; + uint_t ch_max_rwqes; + uint_t ch_lwm_rwqes; + uint_t ch_rwqe_bktsz; + uint_t ch_ip_hdr_align; + boolean_t ch_alloc_mp; + boolean_t ch_tear_down; + + kmutex_t ch_pkey_lock; + ib_pkey_t ch_pkey; + uint16_t ch_pkey_ix; + + kmutex_t ch_cep_lock; + kcondvar_t ch_cep_cv; + ibt_cep_state_t ch_cep_state; + + kmutex_t ch_tx_lock; + kcondvar_t ch_tx_cv; + uint_t ch_tx_posted; + boolean_t ch_tx_busy; + struct eib_wqe_s *ch_tx; + struct eib_wqe_s *ch_tx_tail; + + kmutex_t ch_rx_lock; + kcondvar_t ch_rx_cv; + uint_t ch_rx_posted; + boolean_t ch_rx_refilling; + + kmutex_t ch_vhub_lock; + struct eib_mcg_s *ch_vhub_table; + struct eib_mcg_s *ch_vhub_update; + struct eib_mcg_s *ch_vhub_data; + + struct eib_chan_s *ch_rxpost_next; +} 
eib_chan_t; + +/* + * States for vNIC state machine during login + */ +#define EIB_LOGIN_INIT 0 +#define EIB_LOGIN_ACK_WAIT 1 +#define EIB_LOGIN_ACK_RCVD 2 +#define EIB_LOGIN_NACK_RCVD 3 +#define EIB_LOGIN_TBL_WAIT 4 +#define EIB_LOGIN_TBL_INPROG 5 +#define EIB_LOGIN_TBL_DONE 6 +#define EIB_LOGIN_TBL_FAILED 7 +#define EIB_LOGIN_DONE 8 +#define EIB_LOGIN_TIMED_OUT 9 +#define EIB_LOGOUT_DONE 10 + +typedef struct eib_login_data_s { + ib_guid_t ld_gw_guid; + ib_lid_t ld_gw_lid; + uint_t ld_syndrome; + uint16_t ld_gw_port_id; + ib_qpn_t ld_gw_data_qpn; + ib_qpn_t ld_gw_ctl_qpn; + uint16_t ld_vnic_id; /* includes set msbit */ + uint16_t ld_vhub_mtu; + uint16_t ld_vhub_pkey; + uint16_t ld_assigned_vlan; + uint8_t ld_gw_sl; + uint8_t ld_n_rss_mcgid; + uint8_t ld_n_mac_mcgid; + uint8_t ld_vnic_name[FIP_VNIC_NAME_LEN]; + uint8_t ld_assigned_mac[ETHERADDRL]; + uint8_t ld_gw_mgid_prefix[FIP_MGID_PREFIX_LEN]; + uint8_t ld_vlan_in_packets; + uint32_t ld_vhub_id; +} eib_login_data_t; + +#define EIB_UNICAST_MAC(mac) (((mac)[0] & 0x01) == 0) + +/* + * Map to translate between DMAC and {qpn, lid, sl} + */ +typedef struct eib_vhub_map_s { + struct eib_vhub_map_s *mp_next; + uint32_t mp_tusn; + ib_qpn_t mp_qpn; + ib_lid_t mp_lid; + uint8_t mp_mac[ETHERADDRL]; + uint8_t mp_sl; + uint8_t mp_v_rss_type; +} eib_vhub_map_t; + +/* + * Per-vNIC vHUB Table + */ +#define EIB_TB_NBUCKETS 13 +typedef struct eib_vhub_table_s { + kmutex_t tb_lock; + struct eib_vhub_map_s *tb_gateway; + struct eib_vhub_map_s *tb_unicast_miss; + struct eib_vhub_map_s *tb_vhub_multicast; + struct eib_vhub_map_s *tb_vnic_entry[EIB_TB_NBUCKETS]; + struct eib_vhub_map_s *tb_mcast_entry[EIB_TB_NBUCKETS]; + + uint32_t tb_tusn; + uint8_t tb_eport_state; + + uint16_t tb_entries_seen; + uint16_t tb_entries_in_table; + uint32_t tb_checksum; +} eib_vhub_table_t; + +typedef struct eib_vhub_update_s { + kmutex_t up_lock; + eib_vhub_map_t *up_vnic_entry; + uint32_t up_tusn; + uint8_t up_eport_state; +} eib_vhub_update_t; + 
+typedef struct eib_ether_hdr_s { + int eh_tagless; + uint16_t eh_ether_type; + uint16_t eh_vlan; + uint8_t eh_dmac[ETHERADDRL]; + uint8_t eh_smac[ETHERADDRL]; +} eib_ether_hdr_t; + +/* + * vNIC Information + */ +typedef struct eib_vnic_s { + struct eib_s *vn_ss; + eib_chan_t *vn_ctl_chan; + eib_chan_t *vn_data_chan; + int vn_instance; + uint16_t vn_vlan; + uint16_t vn_id; + uint8_t vn_macaddr[ETHERADDRL]; + struct eib_login_data_s vn_login_data; + + kmutex_t vn_lock; + kcondvar_t vn_cv; + uint_t vn_state; + struct eib_vhub_table_s *vn_vhub_table; + struct eib_vhub_update_s *vn_vhub_update; + + ddi_softint_handle_t vn_ctl_si_hdl; + ddi_softint_handle_t vn_data_tx_si_hdl; + ddi_softint_handle_t vn_data_rx_si_hdl; +} eib_vnic_t; + + +/* + * Base NIC's mac state flags. The lock protects the starting/stopping + * bits. Access to the rest of the mac state is protected by these + * two bits. + */ +#define EIB_NIC_STARTING 0x01 +#define EIB_NIC_STOPPING 0x02 +#define EIB_NIC_STARTED 0x80 +#define EIB_NIC_RESTARTING (EIB_NIC_STARTING | EIB_NIC_STOPPING) + +typedef struct eib_node_state_s { + kmutex_t ns_lock; + kcondvar_t ns_cv; + uint_t ns_nic_state; + link_state_t ns_link_state; +} eib_node_state_t; + +/* + * MIB-II statistics to report to the mac layer + */ +typedef struct eib_stats_s { + uint64_t st_obytes; /* bytes sent out */ + uint64_t st_opkts; /* pkts sent out */ + uint64_t st_brdcstxmit; /* broadcast pkts transmitted */ + uint64_t st_multixmit; /* multicast pkts transmitted */ + uint64_t st_oerrors; /* transmit errors */ + uint64_t st_noxmitbuf; /* transmit pkts discarded */ + + uint64_t st_rbytes; /* bytes received */ + uint64_t st_ipkts; /* pkts received */ + uint64_t st_brdcstrcv; /* broadcast pkts received */ + uint64_t st_multircv; /* multicast pkts received */ + uint64_t st_ierrors; /* receive errors */ + uint64_t st_norcvbuf; /* receive pkts discarded */ +} eib_stats_t; + +#define EIB_UPDATE_COUNTER(addr, val) (atomic_add_64((addr), (val))) +#define 
EIB_INCR_COUNTER(addr) (atomic_inc_64((addr))) +#define EIB_DECR_COUNTER(addr) (atomic_dec_64((addr))) + +/* + * Cache of address vectors with dlid as the key. Currently we use + * eib state structure's ei_lock to protect the individual address + * vector's fields. This is a lock granularity that's slightly + * bigger than ideal, but it should do for now. + */ +#define EIB_AV_NBUCKETS 17 +typedef struct eib_avect_s { + struct eib_avect_s *av_next; + ibt_adds_vect_t av_vect; + uint_t av_ref; +} eib_avect_t; + +/* + * vNIC creation and deletion are serialized by a non-zero value + * to the ei_vnic_state member (i.e. only one vnic may be created + * or deleted at a time). The code makes sure to access/update + * the ei_active_vnics member only after a successful setting of + * ei_vnic_state. + */ +#define EIB_VN_BEING_CREATED 0x01 +#define EIB_VN_BEING_DELETED 0x02 +#define EIB_VN_BEING_MODIFIED (EIB_VN_BEING_CREATED | EIB_VN_BEING_DELETED) + +/* + * All possible EoIB event work items that need to be handled + */ +#define EIB_EV_NONE 0 +#define EIB_EV_PORT_DOWN 1 +#define EIB_EV_PORT_UP 2 +#define EIB_EV_PKEY_CHANGE 3 +#define EIB_EV_SGID_CHANGE 4 +#define EIB_EV_CLNT_REREG 5 +#define EIB_EV_GW_EPORT_DOWN 6 +#define EIB_EV_GW_DOWN 7 +#define EIB_EV_GW_UP 8 +#define EIB_EV_GW_INFO_UPDATE 9 +#define EIB_EV_MCG_DELETED 10 +#define EIB_EV_MCG_CREATED 11 +#define EIB_EV_SHUTDOWN 12 + +typedef struct eib_event_s { + struct eib_event_s *ev_next; + uint_t ev_code; + void *ev_arg; +} eib_event_t; + +/* + * Work element for new vnic creation + */ +typedef struct eib_vnic_req_s { + struct eib_vnic_req_s *vr_next; + uint_t vr_req; + uint8_t vr_mac[ETHERADDRL]; + uint16_t vr_vlan; +} eib_vnic_req_t; + +/* + * Values for vr_req + */ +#define EIB_CR_REQ_NEW_VNIC 1 +#define EIB_CR_REQ_FLUSH 2 +#define EIB_CR_REQ_DIE 3 + +/* + * Work element for vnics kept alive by the keepalive manager thread + * and bitfield values for ei_ka_vnics_event. 
+ */ +typedef struct eib_ka_vnics_s { + struct eib_ka_vnics_s *ka_next; + struct eib_vnic_s *ka_vnic; +} eib_ka_vnics_t; + +#define EIB_KA_VNICS_DIE 0x1 +#define EIB_KA_VNICS_TIMED_OUT 0x2 + +/* + * EoIB per-instance state + */ +typedef struct eib_s { + ibt_clnt_hdl_t ei_ibt_hdl; + ibt_hca_hdl_t ei_hca_hdl; + ibt_pd_hdl_t ei_pd_hdl; + mac_handle_t ei_mac_hdl; + + ddi_softint_handle_t ei_admin_si_hdl; + ddi_callback_id_t ei_login_ack_cb; + ddi_callback_id_t ei_gw_alive_cb; + ddi_callback_id_t ei_gw_info_cb; + + ibt_hca_attr_t *ei_hca_attrs; + dev_info_t *ei_dip; + uint_t ei_instance; + + struct eib_gw_props_s *ei_gw_props; + struct eib_props_s *ei_props; + struct eib_caps_s *ei_caps; + struct eib_stats_s *ei_stats; + + struct eib_node_state_s *ei_node_state; + struct eib_chan_s *ei_admin_chan; + + struct eib_wqe_pool_s *ei_tx; + struct eib_wqe_pool_s *ei_rx; + struct eib_lsobkt_s *ei_lso; + + kmutex_t ei_vnic_lock; + kcondvar_t ei_vnic_cv; + uint_t ei_vnic_state; + uint64_t ei_active_vnics; + uint64_t ei_zombie_vnics; + uint64_t ei_rejoin_vnics; + struct eib_vnic_s *ei_vnic[EIB_MAX_VNICS]; + struct eib_vnic_s *ei_vnic_pending; + int64_t ei_gw_last_heartbeat; + boolean_t ei_gw_unreachable; + uint8_t ei_gw_eport_state; + + kmutex_t ei_av_lock; + struct eib_avect_s *ei_av[EIB_AV_NBUCKETS]; + + kmutex_t ei_ev_lock; + kcondvar_t ei_ev_cv; + struct eib_event_s *ei_event; + + kmutex_t ei_rxpost_lock; + kcondvar_t ei_rxpost_cv; + uint_t ei_rxpost_die; + struct eib_chan_s *ei_rxpost; + + kmutex_t ei_vnic_req_lock; + kcondvar_t ei_vnic_req_cv; + struct eib_vnic_req_s *ei_vnic_req; + struct eib_vnic_req_s *ei_failed_vnic_req; + struct eib_vnic_req_s *ei_pending_vnic_req; + + kmutex_t ei_ka_vnics_lock; + kcondvar_t ei_ka_vnics_cv; + uint_t ei_ka_vnics_event; + struct eib_ka_vnics_s *ei_ka_vnics; + + kt_did_t ei_txwqe_monitor; + kt_did_t ei_lsobufs_monitor; + kt_did_t ei_rwqes_refiller; + kt_did_t ei_vnic_creator; + kt_did_t ei_events_handler; + kt_did_t ei_keepalives_manager; 
+} eib_t; + +/* + * Private read-only datalink properties + */ +#define EIB_DLPROP_GW_EPORT_STATE "_eib_eport_state" +#define EIB_DLPROP_HCA_GUID "_eib_hca_guid" +#define EIB_DLPROP_PORT_GUID "_eib_port_guid" + +/* + * FUNCTION PROTOTYPES FOR CROSS-FILE LINKAGE + */ + +/* + * FIP protocol related + */ +extern int eib_fip_login(eib_t *, eib_vnic_t *, int *); +extern int eib_fip_heartbeat(eib_t *, eib_vnic_t *, int *); +extern int eib_fip_vhub_table(eib_t *, eib_vnic_t *, int *); +extern int eib_fip_logout(eib_t *, eib_vnic_t *, int *); +extern int eib_fip_parse_login_ack(eib_t *, uint8_t *, eib_login_data_t *); +extern int eib_fip_parse_ctl_pkt(uint8_t *, eib_vnic_t *); + +/* + * Service threads and other handlers + */ +extern void eib_events_handler(eib_t *); +extern void eib_svc_enqueue_event(eib_t *, eib_event_t *); +extern void eib_refill_rwqes(eib_t *); +extern void eib_vnic_creator(eib_t *); +extern void eib_monitor_tx_wqes(eib_t *); +extern void eib_monitor_lso_bufs(eib_t *); +extern void eib_manage_keepalives(eib_t *); +extern void eib_stop_events_handler(eib_t *); +extern void eib_stop_refill_rwqes(eib_t *); +extern void eib_stop_vnic_creator(eib_t *); +extern void eib_stop_monitor_tx_wqes(eib_t *); +extern int eib_stop_monitor_lso_bufs(eib_t *, boolean_t); +extern void eib_stop_manage_keepalives(eib_t *); +extern void eib_flush_vnic_reqs(eib_t *); +extern void eib_gw_info_cb(dev_info_t *, ddi_eventcookie_t, void *, void *); +extern void eib_gw_alive_cb(dev_info_t *, ddi_eventcookie_t, void *, void *); +extern void eib_login_ack_cb(dev_info_t *, ddi_eventcookie_t, void *, void *); + +/* + * Admin QP related + */ +extern int eib_adm_setup_qp(eib_t *, int *); +extern uint_t eib_adm_comp_handler(caddr_t, caddr_t); +extern void eib_rb_adm_setup_qp(eib_t *); + +/* + * Control QP related + */ +extern int eib_ctl_create_qp(eib_t *, eib_vnic_t *, int *); +extern uint_t eib_ctl_comp_handler(caddr_t, caddr_t); +extern void eib_rb_ctl_create_qp(eib_t *, eib_vnic_t *); 
+ +/* + * Data QP related + */ +extern int eib_data_create_qp(eib_t *, eib_vnic_t *, int *); +extern uint_t eib_data_rx_comp_handler(caddr_t, caddr_t); +extern uint_t eib_data_tx_comp_handler(caddr_t, caddr_t); +extern void eib_data_rx_recycle(caddr_t); +extern void eib_data_post_tx(eib_vnic_t *, eib_wqe_t *); +extern void eib_data_parse_ether_hdr(mblk_t *, eib_ether_hdr_t *); +extern int eib_data_lookup_vnic(eib_t *, uint8_t *, uint16_t, eib_vnic_t **, + boolean_t *); +extern int eib_data_prepare_frame(eib_vnic_t *, eib_wqe_t *, mblk_t *, + eib_ether_hdr_t *); +extern void eib_rb_data_create_qp(eib_t *, eib_vnic_t *); + +/* + * Resource related + */ +extern int eib_rsrc_setup_bufs(eib_t *, int *); +extern int eib_rsrc_grab_swqes(eib_t *, eib_wqe_t **, uint_t, uint_t *, int); +extern int eib_rsrc_grab_rwqes(eib_t *, eib_wqe_t **, uint_t, uint_t *, int); +extern int eib_rsrc_grab_lsobufs(eib_t *, uint_t, ibt_wr_ds_t *, uint32_t *); +extern eib_wqe_t *eib_rsrc_grab_swqe(eib_t *, int); +extern eib_wqe_t *eib_rsrc_grab_rwqe(eib_t *, int); +extern void eib_rsrc_return_swqe(eib_t *, eib_wqe_t *, eib_chan_t *); +extern void eib_rsrc_return_rwqe(eib_t *, eib_wqe_t *, eib_chan_t *); +extern void eib_rsrc_return_lsobufs(eib_t *, ibt_wr_ds_t *, uint32_t); +extern void eib_rsrc_decr_posted_swqe(eib_t *, eib_chan_t *); +extern void eib_rsrc_decr_posted_rwqe(eib_t *, eib_chan_t *); +extern void eib_rsrc_txwqes_needed(eib_t *); +extern void eib_rsrc_lsobufs_needed(eib_t *); +extern boolean_t eib_rsrc_rxpool_low(eib_wqe_t *); +extern void eib_rb_rsrc_setup_bufs(eib_t *, boolean_t); + +/* + * IBT related + */ +extern int eib_ibt_hca_init(eib_t *); +extern void eib_ibt_link_mod(eib_t *); +extern int eib_ibt_modify_chan_pkey(eib_t *, eib_chan_t *, ib_pkey_t); +extern eib_avect_t *eib_ibt_hold_avect(eib_t *, ib_lid_t, uint8_t); +extern void eib_ibt_release_avect(eib_t *, eib_avect_t *); +extern void eib_ibt_free_avects(eib_t *); +extern void eib_ibt_async_handler(void *, 
ibt_hca_hdl_t, ibt_async_code_t, + ibt_async_event_t *); +extern void eib_ibt_record_capab(eib_t *, ibt_hca_attr_t *, eib_caps_t *); +extern void eib_rb_ibt_hca_init(eib_t *, uint_t); + +/* + * Chan related + */ +extern eib_chan_t *eib_chan_init(void); +extern void eib_chan_fini(eib_chan_t *); +extern int eib_chan_post_rx(eib_t *, eib_chan_t *, uint_t *); +extern int eib_chan_post_recv(eib_t *, eib_chan_t *, eib_wqe_t *); + +/* + * Mac layer related + */ +extern void eib_mac_set_nic_state(eib_t *, uint_t); +extern void eib_mac_clr_nic_state(eib_t *, uint_t); +extern void eib_mac_upd_nic_state(eib_t *, uint_t, uint_t); +extern uint_t eib_mac_get_nic_state(eib_t *); +extern void eib_mac_link_state(eib_t *, link_state_t, boolean_t); +extern void eib_mac_link_down(eib_t *, boolean_t); +extern void eib_mac_link_up(eib_t *, boolean_t); +extern int eib_mac_start(eib_t *); +extern void eib_mac_stop(eib_t *); +extern int eib_mac_multicast(eib_t *, boolean_t, uint8_t *); +extern int eib_mac_promisc(eib_t *, boolean_t); +extern int eib_mac_tx(eib_t *, mblk_t *); +extern int eib_mac_hca_portstate(eib_t *, ib_lid_t *, int *); + +/* + * VNIC related + */ +extern int eib_vnic_create(eib_t *, uint8_t *, uint16_t, eib_vnic_t **, int *); +extern void eib_vnic_delete(eib_t *, eib_vnic_t *); +extern int eib_vnic_wait_for_login_ack(eib_t *, eib_vnic_t *, int *); +extern void eib_vnic_login_ack(eib_t *, eib_login_data_t *); +extern int eib_vnic_wait_for_table(eib_t *, eib_vnic_t *, int *); +extern void eib_vnic_vhub_table_done(eib_vnic_t *, uint_t); +extern int eib_vnic_join_data_mcg(eib_t *, eib_vnic_t *, uint8_t *, + boolean_t, int *); +extern int eib_vnic_setup_dest(eib_vnic_t *, eib_wqe_t *, uint8_t *, uint16_t); +extern void eib_vnic_leave_data_mcg(eib_t *, eib_vnic_t *, uint8_t *); +extern void eib_vnic_init_tables(eib_t *, eib_vnic_t *); +extern void eib_vnic_fini_tables(eib_t *, eib_vnic_t *, boolean_t); +extern eib_chan_t *eib_vnic_get_data_chan(eib_t *, int); +extern void 
eib_vnic_need_new(eib_t *, uint8_t *, uint16_t); +extern void eib_vnic_enqueue_req(eib_t *, eib_vnic_req_t *); +extern void eib_vnic_resurrect_zombies(eib_t *, uint8_t *); +extern void eib_vnic_restart(eib_t *, int, uint8_t *); +extern void eib_vnic_rejoin_mcgs(eib_t *); +extern void eib_rb_vnic_create(eib_t *, eib_vnic_t *, uint_t); + +/* + * Logging and other stuff + */ +extern void eib_debug_init(void); +extern void eib_debug_fini(void); +extern void eib_dprintf_crit(int, const char *fmt, ...); +extern void eib_dprintf_err(int, const char *fmt, ...); +extern void eib_dprintf_warn(int, const char *fmt, ...); +#ifdef EIB_DEBUG +extern void eib_dprintf_debug(int, const char *fmt, ...); +extern void eib_dprintf_args(int, const char *fmt, ...); +extern void eib_dprintf_pkt(int, uint8_t *, uint_t); +extern void eib_dprintf_verbose(int, const char *fmt, ...); +#endif +extern int eib_get_props(eib_t *); +extern void eib_update_props(eib_t *, eib_gw_info_t *); +extern void eib_rb_get_props(eib_t *); + +/* + * EoIB specific global variables + */ +extern ib_gid_t eib_reserved_gid; +extern uint8_t eib_zero_mac[]; +extern uint8_t eib_broadcast_mac[]; +extern int eib_setbit_mod67[]; +extern char *eib_pvt_props[]; + +/* + * HW/FW workarounds + */ +extern int eib_wa_no_desc_list_len; +extern int eib_wa_no_cksum_offload; +extern int eib_wa_no_lso; +extern int eib_wa_no_mcast_entries; +extern int eib_wa_no_av_discover; +extern int eib_wa_no_good_vp_flag; +extern int eib_wa_no_good_vhub_cksum; + +/* + * Miscellaneous externs + */ +extern void freemsgchain(mblk_t *); +extern pri_t minclsyspri; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_IB_EOIB_EIB_IMPL_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/ib/clients/eoib/enx_impl.h Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,532 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_IB_EOIB_ENX_IMPL_H +#define _SYS_IB_EOIB_ENX_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/varargs.h> +#include <sys/ib/ibtl/ibti.h> +#include <sys/ib/ibtl/ibvti.h> +#include <sys/ib/ib_pkt_hdrs.h> +#include <sys/ib/ibtl/impl/ibtl_ibnex.h> +#include <sys/ib/mgt/sm_attr.h> + +#include <sys/ib/clients/eoib/fip.h> +#include <sys/ib/clients/eoib/eib.h> + +/* + * Driver specific constants + */ +#define ENX_E_SUCCESS 0 +#define ENX_E_FAILURE -1 +#define ENX_MAX_LINE 128 +#define ENX_GRH_SZ (sizeof (ib_grh_t)) + +/* + * Debug messages + */ +#define ENX_MSGS_CRIT 0x01 +#define ENX_MSGS_ERR 0x02 +#define ENX_MSGS_WARN 0x04 +#define ENX_MSGS_DEBUG 0x08 +#define ENX_MSGS_ARGS 0x10 +#define ENX_MSGS_VERBOSE 0x20 +#define ENX_MSGS_DEFAULT (ENX_MSGS_CRIT | ENX_MSGS_ERR | ENX_MSGS_WARN) + +#define ENX_LOGSZ_DEFAULT 0x20000 + +#define ENX_DPRINTF_CRIT eibnx_dprintf_crit +#define ENX_DPRINTF_ERR eibnx_dprintf_err +#define ENX_DPRINTF_WARN eibnx_dprintf_warn +#ifdef ENX_DEBUG +#define ENX_DPRINTF_DEBUG eibnx_dprintf_debug +#define ENX_DPRINTF_ARGS eibnx_dprintf_args +#define ENX_DPRINTF_VERBOSE eibnx_dprintf_verbose +#else +#define ENX_DPRINTF_DEBUG 0 && +#define ENX_DPRINTF_ARGS 0 && +#define ENX_DPRINTF_VERBOSE 0 && +#endif + +/* + * EoIB Nexus service threads + */ +#define ENX_PORT_MONITOR "eibnx_port_%d_monitor" +#define ENX_NODE_CREATOR "eibnx_node_creator" + +/* + * Default period (us) for unicast solicitations to discovered gateways. + * EoIB specification requires that hosts send solicitation atleast every + * 4 * GW_ADV_PERIOD. 
+ */ +#define ENX_DFL_SOLICIT_PERIOD_USEC 32000000 + +/* + * Portinfo list per HCA + */ +typedef struct eibnx_port_s { + struct eibnx_port_s *po_next; + ibt_hca_portinfo_t *po_pi; + uint_t po_pi_size; +} eibnx_port_t; + +/* + * HCA details + */ +typedef struct eibnx_hca_s { + struct eibnx_hca_s *hc_next; + ib_guid_t hc_guid; + ibt_hca_hdl_t hc_hdl; + ibt_pd_hdl_t hc_pd; + eibnx_port_t *hc_port; +} eibnx_hca_t; + +/* + * The port_monitor thread in EoIB nexus driver only sends two types of + * packets: multicast solicitation the first time around, and periodic + * unicast solicitations later to gateways that have been discovered. So + * we need a couple of send wqes for the multicast solicitation and + * probably as many send wqes as the number of gateways that may be + * discovered from each port, for sending the unicast solicitations. + * For unicast solicitations though, the UD destination needs to be set + * up at the time we receive the advertisement from the gateway, using + * ibt_modify_reply_ud_dest(), so we'll assign one send wqe for each + * gateway that we discover. This means that we need to acquire these + * send wqe entries during rx processing in the completion handler, which + * means we must avoid sleeping in trying to acquire the swqe. Therefore, + * we'll pre-allocate these unicast solication send wqes to be atleast + * twice the number of recv wqes. + * + * The receive packets expected by the EoIB nexus driver are the multicast + * and unicast messages on the SOLICIT and ADVERTISE groups. These + * shouldn't be too many, and should be tuned as we gain experience on + * the traffic pattern. We'll start with 16. + */ +#define ENX_NUM_SWQE 46 +#define ENX_NUM_RWQE 16 +#define ENX_CQ_SIZE (ENX_NUM_SWQE + ENX_NUM_RWQE + 2) + +/* + * qe_type values + */ +#define ENX_QETYP_RWQE 0x1 +#define ENX_QETYP_SWQE 0x2 + +/* + * qe_flags bitmasks (protected by qe_lock). None of the + * flag values may be zero. 
+ */ +#define ENX_QEFL_INUSE 0x01 +#define ENX_QEFL_POSTED 0x02 +#define ENX_QEFL_RELONCOMP 0x04 + +/* + * Recv and send workq entries + */ +typedef struct eibnx_wqe_s { + uint_t qe_type; + uint_t qe_bufsz; + ibt_wr_ds_t qe_sgl; + ibt_all_wr_t qe_wr; + kmutex_t qe_lock; + uint_t qe_flags; +} eibnx_wqe_t; + +/* + * Tx descriptor + */ +typedef struct eibnx_tx_s { + ib_vaddr_t tx_vaddr; + ibt_mr_hdl_t tx_mr; + ibt_lkey_t tx_lkey; + eibnx_wqe_t tx_wqe[ENX_NUM_SWQE]; +} eibnx_tx_t; + +/* + * Rx descriptor + */ +typedef struct eibnx_rx_s { + ib_vaddr_t rx_vaddr; + ibt_mr_hdl_t rx_mr; + ibt_lkey_t rx_lkey; + eibnx_wqe_t rx_wqe[ENX_NUM_RWQE]; +} eibnx_rx_t; + +/* + * Details about the address of each gateway we discover. + */ +typedef struct eibnx_gw_addr_s { + ibt_adds_vect_t *ga_vect; + ib_gid_t ga_gid; + ib_qpn_t ga_qpn; + ib_qkey_t ga_qkey; + ib_pkey_t ga_pkey; +} eibnx_gw_addr_t; + +/* + * States for each GW + */ +#define ENX_GW_STATE_UNAVAILABLE 1 /* GW nackd availability */ +#define ENX_GW_STATE_AVAILABLE 2 /* GW mcasted availability */ +#define ENX_GW_STATE_READY_TO_LOGIN 3 /* GW ucasted availability */ + +typedef struct eibnx_gw_info_s { + struct eibnx_gw_info_s *gw_next; + eibnx_wqe_t *gw_swqe; + uint_t gw_state; + + kmutex_t gw_adv_lock; + uint_t gw_adv_flag; + int64_t gw_adv_last_lbolt; + int64_t gw_adv_timeout_ticks; + + eibnx_gw_addr_t gw_addr; + + ib_guid_t gw_system_guid; + ib_guid_t gw_guid; + + uint32_t gw_adv_period; + uint32_t gw_ka_period; + uint32_t gw_vnic_ka_period; + ib_qpn_t gw_ctrl_qpn; + + ib_lid_t gw_lid; + uint16_t gw_portid; + uint16_t gw_num_net_vnics; + + uint8_t gw_is_host_adm_vnics; + uint8_t gw_sl; + uint8_t gw_n_rss_qpn; + uint8_t gw_flag_ucast_advt; + uint8_t gw_flag_available; + + uint8_t gw_system_name[EIB_GW_SYSNAME_LEN]; + uint8_t gw_port_name[EIB_GW_PORTNAME_LEN]; + uint8_t gw_vendor_id[EIB_GW_VENDOR_LEN]; +} eibnx_gw_info_t; + +/* + * Values for gw_adv_flag (non-zero only) + */ +#define ENX_GW_DEAD 1 +#define ENX_GW_ALIVE 2 
+#define ENX_GW_AWARE 3 + +/* + * Currently, we only expect the advertisement type of packets + * from the gw. But we do get login acks from the gateway also + * here in the nexus, so we'll need an identifier for that. + */ +typedef enum { + FIP_GW_ADVERTISE_MCAST = 0, + FIP_GW_ADVERTISE_UCAST, + FIP_VNIC_LOGIN_ACK +} eibnx_gw_pkt_type_t; + +/* + * Currently, the only gw response handled by the eibnx driver + * are the ucast/mcast advertisements. Information collected from + * both these responses may be packed into a eibnx_gw_info_t. + * In the future, if we decide to handle other types of responses + * from the gw, we could simply add the new types to the union. + */ +typedef struct eibnx_gw_msg_s { + eibnx_gw_pkt_type_t gm_type; + union { + eibnx_gw_info_t gm_info; + } u; +} eibnx_gw_msg_t; + +/* + * List to hold the devinfo nodes of eoib instances + */ +typedef struct eibnx_child_s { + struct eibnx_child_s *ch_next; + dev_info_t *ch_dip; + eibnx_gw_info_t *ch_gwi; + char *ch_node_name; +} eibnx_child_t; + +/* + * Event bitmasks for the port-monitor to wait on. None of these flags + * may be zero. + */ +#define ENX_EVENT_LINK_UP 0x01 +#define ENX_EVENT_MCGS_AVAILABLE 0x02 +#define ENX_EVENT_TIMED_OUT 0x04 +#define ENX_EVENT_DIE 0x08 +#define ENX_EVENT_COMPLETION 0x10 + +/* + * MCG Query/Join status + */ +#define ENX_MCGS_FOUND 0x1 +#define ENX_MCGS_JOINED 0x2 + +/* + * Information that each port-monitor thread cares about + */ +typedef struct eibnx_thr_info_s { + struct eibnx_thr_info_s *ti_next; + uint_t ti_progress; + + /* + * Our kernel thread id + */ + kt_did_t ti_kt_did; + + /* + * HCA, port and protection domain information + */ + ib_guid_t ti_hca_guid; + ibt_hca_hdl_t ti_hca; + ibt_pd_hdl_t ti_pd; + ibt_hca_portinfo_t *ti_pi; + char *ti_ident; + + /* + * Well-known multicast groups for solicitations + * and advertisements. 
+ */ + kmutex_t ti_mcg_lock; + uint_t ti_mcg_status; + ibt_mcg_info_t *ti_advertise_mcg; + ibt_mcg_info_t *ti_solicit_mcg; + uint_t ti_mcast_done; + + /* + * Completion queue stuff + */ + ibt_cq_hdl_t ti_cq_hdl; + uint_t ti_cq_sz; + ibt_wc_t *ti_wc; + ddi_softint_handle_t ti_softint_hdl; + + /* + * Channel related + */ + ibt_channel_hdl_t ti_chan; + ib_qpn_t ti_qpn; + + /* + * Transmit/Receive stuff + */ + eibnx_tx_t ti_snd; + eibnx_rx_t ti_rcv; + + /* + * GW related stuff + */ + kmutex_t ti_gw_lock; + eibnx_gw_info_t *ti_gw; + + /* + * Devinfo nodes for the eoib children + */ + kmutex_t ti_child_lock; + eibnx_child_t *ti_child; + + /* + * Events that we wait on and/or handle + */ + kmutex_t ti_event_lock; + kcondvar_t ti_event_cv; + uint_t ti_event; +} eibnx_thr_info_t; + +/* + * Workq entry for creation of eoib nodes + */ +typedef struct eibnx_nodeq_s { + struct eibnx_nodeq_s *nc_next; + eibnx_thr_info_t *nc_info; + eibnx_gw_info_t *nc_gwi; +} eibnx_nodeq_t; + +/* + * Bus config status flags. The in-prog is protected by + * nx_lock, and the rest of the flags (currently only + * buscfg-complete) is protected by the in-prog bit itself. 
+ */ +#define NX_FL_BUSOP_INPROG 0x1 +#define NX_FL_BUSCFG_COMPLETE 0x2 +#define NX_FL_BUSOP_MASK 0x3 + +/* + * EoIB nexus per-instance state + */ +typedef struct eibnx_s { + dev_info_t *nx_dip; + ibt_clnt_hdl_t nx_ibt_hdl; + + kmutex_t nx_lock; + eibnx_hca_t *nx_hca; + eibnx_thr_info_t *nx_thr_info; + boolean_t nx_monitors_up; + + kmutex_t nx_nodeq_lock; + kcondvar_t nx_nodeq_cv; + eibnx_nodeq_t *nx_nodeq; + kt_did_t nx_nodeq_kt_did; + uint_t nx_nodeq_thr_die; + + kmutex_t nx_busop_lock; + kcondvar_t nx_busop_cv; + uint_t nx_busop_flags; +} eibnx_t; + + +/* + * Event tags for EoIB Nexus events delivered to EoIB instances + */ +#define ENX_EVENT_TAG_GW_INFO_UPDATE 0 +#define ENX_EVENT_TAG_GW_AVAILABLE 1 +#define ENX_EVENT_TAG_LOGIN_ACK 2 + +/* + * FUNCTION PROTOTYPES FOR CROSS-FILE LINKAGE + */ + +/* + * Threads and Event Handlers + */ +void eibnx_port_monitor(eibnx_thr_info_t *); +void eibnx_subnet_notices_handler(void *, ib_gid_t, ibt_subnet_event_code_t, + ibt_subnet_event_t *); +void eibnx_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, + ibt_async_event_t *); +boolean_t eibnx_is_gw_dead(eibnx_gw_info_t *); +void eibnx_create_eoib_node(void); +void eibnx_comp_intr(ibt_cq_hdl_t, void *); +uint_t eibnx_comp_handler(caddr_t, caddr_t); + +/* + * IBT related functions + */ +int eibnx_ibt_init(eibnx_t *); +int eibnx_find_mgroups(eibnx_thr_info_t *); +int eibnx_setup_cq(eibnx_thr_info_t *); +int eibnx_setup_ud_channel(eibnx_thr_info_t *); +int eibnx_setup_bufs(eibnx_thr_info_t *); +int eibnx_setup_cq_handler(eibnx_thr_info_t *); +int eibnx_join_mcgs(eibnx_thr_info_t *); +int eibnx_rejoin_mcgs(eibnx_thr_info_t *); +int eibnx_ibt_fini(eibnx_t *); + +void eibnx_rb_find_mgroups(eibnx_thr_info_t *); +void eibnx_rb_setup_cq(eibnx_thr_info_t *); +void eibnx_rb_setup_ud_channel(eibnx_thr_info_t *); +void eibnx_rb_setup_bufs(eibnx_thr_info_t *); +void eibnx_rb_setup_cq_handler(eibnx_thr_info_t *); +void eibnx_rb_join_mcgs(eibnx_thr_info_t *); + +eibnx_hca_t 
*eibnx_prepare_hca(ib_guid_t); +int eibnx_cleanup_hca(eibnx_hca_t *); + +/* + * FIP packetizing related functions + */ +int eibnx_fip_solicit_mcast(eibnx_thr_info_t *); +int eibnx_fip_solicit_ucast(eibnx_thr_info_t *, clock_t *); +int eibnx_fip_parse_pkt(uint8_t *, eibnx_gw_msg_t *); + +/* + * Queue and List related routines + */ +eibnx_wqe_t *eibnx_acquire_swqe(eibnx_thr_info_t *, int); +void eibnx_return_swqe(eibnx_wqe_t *); +void eibnx_return_rwqe(eibnx_thr_info_t *, eibnx_wqe_t *); +void eibnx_release_swqe(eibnx_wqe_t *); + +void eibnx_enqueue_child(eibnx_thr_info_t *, eibnx_gw_info_t *, char *, + dev_info_t *); +int eibnx_update_child(eibnx_thr_info_t *, eibnx_gw_info_t *, dev_info_t *); +dev_info_t *eibnx_find_child_dip_by_inst(eibnx_thr_info_t *, int); +dev_info_t *eibnx_find_child_dip_by_gw(eibnx_thr_info_t *, uint16_t); + +eibnx_gw_info_t *eibnx_find_gw_in_gwlist(eibnx_thr_info_t *, eibnx_gw_info_t *); +eibnx_gw_info_t *eibnx_add_gw_to_gwlist(eibnx_thr_info_t *, eibnx_gw_info_t *, + ibt_wc_t *, uint8_t *); +void eibnx_replace_gw_in_gwlist(eibnx_thr_info_t *, eibnx_gw_info_t *, + eibnx_gw_info_t *, ibt_wc_t *, uint8_t *, boolean_t *); +void eibnx_queue_for_creation(eibnx_thr_info_t *, eibnx_gw_info_t *); + +/* + * Logging and Error reporting routines + */ +void eibnx_debug_init(void); +void eibnx_debug_fini(void); +void eibnx_dprintf_crit(const char *fmt, ...); +void eibnx_dprintf_err(const char *fmt, ...); +void eibnx_dprintf_warn(const char *fmt, ...); +#ifdef ENX_DEBUG +void eibnx_dprintf_debug(const char *fmt, ...); +void eibnx_dprintf_args(const char *fmt, ...); +void eibnx_dprintf_verbose(const char *fmt, ...); +#endif + +/* + * Miscellaneous + */ +void eibnx_cleanup_port_nodes(eibnx_thr_info_t *); +void eibnx_create_node_props(dev_info_t *, eibnx_thr_info_t *, + eibnx_gw_info_t *); +int eibnx_name_child(dev_info_t *, char *, size_t); +void eibnx_busop_inprog_enter(eibnx_t *); +void eibnx_busop_inprog_exit(eibnx_t *); +eibnx_thr_info_t 
*eibnx_start_port_monitor(eibnx_hca_t *, eibnx_port_t *); +void eibnx_stop_port_monitor(eibnx_thr_info_t *); +void eibnx_terminate_monitors(void); +int eibnx_configure_node(eibnx_thr_info_t *, eibnx_gw_info_t *, dev_info_t **); +int eibnx_unconfigure_node(eibnx_thr_info_t *, eibnx_gw_info_t *); +int eibnx_locate_node_name(char *, eibnx_thr_info_t **, eibnx_gw_info_t **); +int eibnx_locate_unconfigured_node(eibnx_thr_info_t **, eibnx_gw_info_t **); + +/* + * Devctl cbops (currently dummy) + */ +int eibnx_devctl_open(dev_t *, int, int, cred_t *); +int eibnx_devctl_close(dev_t, int, int, cred_t *); +int eibnx_devctl_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +/* + * External variable references + */ +extern pri_t minclsyspri; +extern eibnx_t *enx_global_ss; +extern ib_gid_t enx_solicit_mgid; +extern ib_gid_t enx_advertise_mgid; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_IB_EOIB_ENX_IMPL_H */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#ifndef	_SYS_IB_EOIB_FIP_H
#define	_SYS_IB_EOIB_FIP_H

/*
 * On-the-wire layouts for the EoIB FIP control protocol: the messages a
 * host exchanges with an EoIB gateway to discover it (solicit/advertise),
 * bring vNICs up and down (login/login ack/logout), learn and track the
 * vHUB address table (vhub table/update) and keep sessions alive
 * (keep alive).  Subcodes named FIP_SUBCODE_H_* are sent by the host,
 * FIP_SUBCODE_G_* by the gateway.
 *
 * NOTE(review): these structures mirror wire formats, so multi-byte
 * fields are presumably carried in network byte order and the structs
 * must not be reordered or padded -- confirm against the message
 * builders/parsers in the EoIB drivers before changing anything here.
 */

#ifdef __cplusplus
extern "C" {
#endif

#include <sys/ethernet.h>
#include <sys/ib/ib_types.h>

/*
 * Sizes of various objects in FIP headers
 */
#define	FIP_VENDOR_LEN			8
#define	FIP_GUID_LEN			8
#define	FIP_SYSNAME_LEN			32
#define	FIP_PORTNAME_LEN		8
#define	FIP_MGID_PREFIX_LEN		5
#define	FIP_VNIC_NAME_LEN		16
#define	FIP_VHUBID_LEN			3

/*
 * EoIB Pkeys and Qkeys
 */
#define	EIB_ADMIN_PKEY			0xFFFF
#define	EIB_FIP_QKEY			0x80020002
#define	EIB_DATA_QKEY			0x80020003

/*
 * EoIB Advertise and Solicit MCG GUIDs
 */
#define	EIB_GUID_ADVERTISE_PREFIX	0xFF12E01B00060000
#define	EIB_GUID_SOLICIT_PREFIX		0xFF12E01B00070000

/*
 * FIP_Protocol_Version
 */
#define	FIP_PROTO_VERSION		0
typedef struct fip_proto_s {
	uint8_t		pr_version;
	uint8_t		pr_reserved[3];
} fip_proto_t;

/*
 * Basic FIP Header: Opcodes and subcodes for EoIB
 */
#define	FIP_OPCODE_EOIB			0xFFF9

#define	FIP_SUBCODE_H_SOLICIT		0x1
#define	FIP_SUBCODE_G_ADVERTISE		0x2
#define	FIP_SUBCODE_H_VNIC_LOGIN	0x3
#define	FIP_SUBCODE_G_VNIC_LOGIN_ACK	0x4
#define	FIP_SUBCODE_H_VNIC_LOGOUT	0x5
#define	FIP_SUBCODE_G_VHUB_UPDATE	0x6
#define	FIP_SUBCODE_G_VHUB_TABLE	0x7
#define	FIP_SUBCODE_H_KEEP_ALIVE	0x8

/*
 * Basic FIP Header: Flags relevant to EoIB
 */
#define	FIP_BHFLAG_GWAVAIL		0x4
#define	FIP_BHFLAG_SLCTMSG		0x2

/*
 * FIP_Basic_Header
 *
 * Note that the FIP_DESC_LEN_* values used throughout this file count
 * 4-byte words, not bytes: each fip_desc_*_t below is exactly
 * 4 * FIP_DESC_LEN_* bytes long.
 */
#define	FIP_DESC_TYPE_VENDOR_ID		13
#define	FIP_DESC_LEN_VENDOR_ID		3
typedef struct fip_basic_hdr_s {
	uint16_t	hd_opcode;
	uint8_t		hd_reserved1;
	uint8_t		hd_subcode;
	uint16_t	hd_desc_list_len;
	uint16_t	hd_flags;
	uint8_t		hd_type;
	uint8_t		hd_len;
	uint8_t		hd_reserved2[2];
	uint8_t		hd_vendor_id[FIP_VENDOR_LEN];
} fip_basic_hdr_t;

#define	FIP_IBA_QPN_MASK		0x00FFFFFF
#define	FIP_IBA_PORTID_MASK		0x0FFF
#define	FIP_IBA_SL_MASK			0xF000
#define	FIP_IBA_SL_SHIFT		12

/*
 * FIP_Descriptor_Infiniband_Address
 */
#define	FIP_DESC_TYPE_IBA		240
#define	FIP_DESC_LEN_IBA		7
typedef struct fip_desc_iba_s {
	uint8_t		ia_type;
	uint8_t		ia_len;
	uint8_t		ia_reserved[2];
	uint8_t		ia_vendor_id[FIP_VENDOR_LEN];
	uint32_t	ia_qpn;
	/* SL in bits 15-12, port id in bits 11-0 (FIP_IBA_SL_*, PORTID) */
	uint16_t	ia_sl_portid;
	uint16_t	ia_lid;
	uint8_t		ia_guid[FIP_GUID_LEN];
} fip_desc_iba_t;

/*
 * FIP Solicitation Control Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_Infiniband_Address
 */
typedef struct fip_solicit_s {
	fip_proto_t	sl_proto_version;
	fip_basic_hdr_t	sl_fip_hdr;
	fip_desc_iba_t	sl_iba;
} fip_solicit_t;

/*
 * FIP_Descriptor_EoIB_Gateway_Information
 */
#define	FIP_DESC_TYPE_EOIB_GW_INFO	241
#define	FIP_DESC_LEN_EOIB_GW_INFO	4
typedef struct fip_desc_gwinfo_s {
	uint8_t		gi_type;
	uint8_t		gi_len;
	uint8_t		gi_reserved1[2];
	uint8_t		gi_vendor_id[FIP_VENDOR_LEN];
	uint8_t		gi_flags;
	uint8_t		gi_reserved2;
	/* RSS QPN in bits 15-12, number of net vNICs in bits 11-0 */
	uint16_t	gi_rss_qpn_num_net_vnics;
} fip_desc_gwinfo_t;

#define	FIP_GWI_HOST_ADMIND_VNICS_MASK	0x80
#define	FIP_GWI_NUM_NET_VNICS_MASK	0x0FFF
#define	FIP_GWI_RSS_QPN_MASK		0xF000
#define	FIP_GWI_RSS_QPN_SHIFT		12

/*
 * FIP_Descriptor_Gateway_Identifier
 */
#define	FIP_DESC_TYPE_GW_ID		248
#define	FIP_DESC_LEN_GW_ID		15
typedef struct fip_desc_gwid_s {
	uint8_t		id_type;
	uint8_t		id_len;
	uint8_t		id_reserved[2];
	uint8_t		id_vendor_id[FIP_VENDOR_LEN];
	uint8_t		id_guid[FIP_GUID_LEN];
	uint8_t		id_sysname[FIP_SYSNAME_LEN];
	uint8_t		id_portname[FIP_PORTNAME_LEN];
} fip_desc_gwid_t;

/*
 * FIP_Descriptor_Keep_Alive_Parameters
 */
#define	FIP_DESC_TYPE_KEEP_ALIVE	249
#define	FIP_DESC_LEN_KEEP_ALIVE		6
typedef struct fip_desc_keepalive_s {
	uint8_t		ka_type;
	uint8_t		ka_len;
	uint8_t		ka_reserved[2];
	uint8_t		ka_vendor_id[FIP_VENDOR_LEN];
	uint32_t	ka_gw_adv_period;
	uint32_t	ka_gw_ka_period;
	uint32_t	ka_vnic_ka_period;
} fip_desc_keepalive_t;

/*
 * FIP Advertise Control Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_Infiniband_Address
 *	FIP_Descriptor_EoIB_Gateway_Information
 *	FIP_Descriptor_Gateway_Identifier
 *	FIP_Descriptor_Keep_Alive_Parameters
 */
typedef struct fip_advertise_s {
	fip_proto_t		ad_proto_version;
	fip_basic_hdr_t		ad_fip_header;
	fip_desc_iba_t		ad_iba;
	fip_desc_gwinfo_t	ad_gwinfo;
	fip_desc_gwid_t		ad_gwid;
	fip_desc_keepalive_t	ad_keep_alive;
} fip_advertise_t;

/*
 * FIP_Descriptor_vNIC_Login
 */
#define	FIP_DESC_TYPE_VNIC_LOGIN	242
#define	FIP_DESC_LEN_VNIC_LOGIN		13
typedef struct fip_desc_vnic_login_s {
	uint8_t		vl_type;
	uint8_t		vl_len;
	uint8_t		vl_reserved1[2];
	uint8_t		vl_vendor_id[FIP_VENDOR_LEN];
	uint16_t	vl_mtu;
	uint16_t	vl_vnic_id;
	/* V/M/VP/H flags in the top nibble, VLAN in bits 11-0 */
	uint16_t	vl_flags_vlan;
	uint8_t		vl_mac[ETHERADDRL];
	uint8_t		vl_gw_mgid_prefix[FIP_MGID_PREFIX_LEN];
	uint8_t		vl_reserved2;
	uint8_t		vl_flags_rss;
	uint8_t		vl_n_mac_mcgid;
	/* syndrome in bits 31-24 (FIP_VL_SYN_*), ctl QPN in bits 23-0 */
	uint32_t	vl_syndrome_ctl_qpn;
	uint8_t		vl_vnic_name[FIP_VNIC_NAME_LEN];
} fip_desc_vnic_login_t;

/*
 * Flags, masks and error codes for FIP_Descriptor_vNIC_Login
 */
#define	FIP_VL_VNIC_ID_MSBIT		0x8000
#define	FIP_VL_FLAGS_V			0x8000
#define	FIP_VL_FLAGS_M			0x4000
#define	FIP_VL_FLAGS_VP			0x2000
#define	FIP_VL_FLAGS_H			0x1000
#define	FIP_VL_VLAN_MASK		0x0FFF
#define	FIP_VL_RSS_MASK			0x10
#define	FIP_VL_N_RSS_MCGID_MASK		0x0F
#define	FIP_VL_N_MAC_MCGID_MASK		0x3F
#define	FIP_VL_CTL_QPN_MASK		0x00FFFFFF

#define	FIP_VL_SYN_MASK			0xFF000000
#define	FIP_VL_SYN_SHIFT		24

#define	FIP_VL_SYN_SUCCESS		0
#define	FIP_VL_SYN_REJECTED		1
#define	FIP_VL_SYN_GW_NO_RESOURCE	2
#define	FIP_VL_SYN_NO_MORE_NWK_ADDRS	3
#define	FIP_VL_SYN_UNKNOWN_HOST		4
#define	FIP_VL_SYN_UNSUPP_PARAM		5

/*
 * FIP_Descriptor_Partition
 */
#define	FIP_DESC_TYPE_PARTITION		246
#define	FIP_DESC_LEN_PARTITION		4
typedef struct fip_desc_partition_s {
	uint8_t		pn_type;
	uint8_t		pn_len;
	uint8_t		pn_reserved1[2];
	uint8_t		pn_vendor_id[FIP_VENDOR_LEN];
	uint8_t		pn_reserved2[2];
	uint16_t	pn_pkey;
} fip_desc_partition_t;

/*
 * FIP Login Control Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_Infiniband_Address
 *	FIP_Descriptor_vNIC_Login
 */
typedef struct fip_login_s {
	fip_proto_t		lg_proto_version;
	fip_basic_hdr_t		lg_fip_header;
	fip_desc_iba_t		lg_iba;
	fip_desc_vnic_login_t	lg_vnic_login;
} fip_login_t;

/*
 * FIP Login ACK Control Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_Infiniband_Address
 *	FIP_Descriptor_vNIC_Login
 *	FIP_Descriptor_Partition
 */
typedef struct fip_login_ack_s {
	fip_proto_t		ak_proto_version;
	fip_basic_hdr_t		ak_fip_header;
	fip_desc_iba_t		ak_iba;
	fip_desc_vnic_login_t	ak_vnic_login;
	fip_desc_partition_t	ak_vhub_partition;
} fip_login_ack_t;

/*
 * FIP_Descriptor_vNIC_Identity
 */
#define	FIP_DESC_TYPE_VNIC_IDENTITY	245
#define	FIP_DESC_LEN_VNIC_IDENTITY	13
typedef struct fip_desc_vnic_identity_s {
	uint8_t		vi_type;
	uint8_t		vi_len;
	uint8_t		vi_reserved1[2];
	uint8_t		vi_vendor_id[FIP_VENDOR_LEN];
	/* U/R/VP flags (FIP_VI_FLAG_*) plus the vHUB id */
	uint32_t	vi_flags_vhub_id;
	uint32_t	vi_tusn;
	uint16_t	vi_vnic_id;
	uint8_t		vi_mac[ETHERADDRL];
	uint8_t		vi_port_guid[FIP_GUID_LEN];
	uint8_t		vi_vnic_name[FIP_VNIC_NAME_LEN];
} fip_desc_vnic_identity_t;

#define	FIP_VI_FLAG_U			0x80000000
#define	FIP_VI_FLAG_R			0x40000000
#define	FIP_VI_FLAG_VP			0x01000000

/*
 * FIP Keep Alive Control Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_vNIC_Identity
 */
typedef struct fip_keep_alive_s {
	fip_proto_t			ka_proto_version;
	fip_basic_hdr_t			ka_fip_header;
	fip_desc_vnic_identity_t	ka_vnic_identity;
} fip_keep_alive_t;

/*
 * FIP_vHUB_Table_Entry
 */
typedef struct fip_vhub_table_entry_s {
	/* valid bit, RSS bit and entry type (see FIP_TE_* below) */
	uint8_t		te_v_rss_type;
	uint8_t		te_reserved1;
	uint8_t		te_mac[ETHERADDRL];
	uint32_t	te_qpn;
	uint8_t		te_reserved2;
	uint8_t		te_sl;
	uint16_t	te_lid;
} fip_vhub_table_entry_t;

#define	FIP_TE_VALID			0x80
#define	FIP_TE_RSS			0x40

#define	FIP_TE_TYPE_MASK		0x0F
#define	FIP_TE_TYPE_VNIC		0x00
#define	FIP_TE_TYPE_GATEWAY		0x01
#define	FIP_TE_TYPE_UNICAST_MISS	0x02
#define	FIP_TE_TYPE_MULTICAST_ENTRY	0x03
#define	FIP_TE_TYPE_VHUB_MULTICAST	0x04

#define	FIP_TE_SL_MASK			0x0F
#define	FIP_TE_QPN_MASK			0x00FFFFFF

/* Entry size in bytes and in 32-bit words */
#define	FIP_VHUB_TABLE_ENTRY_SZ		(sizeof (fip_vhub_table_entry_t))
#define	FIP_VHUB_TABLE_ENTRY_WORDS	(FIP_VHUB_TABLE_ENTRY_SZ >> 2)

/*
 * FIP_Descriptor_vHUB_Update
 */
#define	FIP_DESC_TYPE_VHUB_UPDATE	243
#define	FIP_DESC_LEN_VHUB_UPDATE	9
typedef struct fip_desc_vhub_update_s {
	uint8_t		up_type;
	uint8_t		up_len;
	uint8_t		up_reserved1[2];
	uint8_t		up_vendor_id[FIP_VENDOR_LEN];
	/* eport state in bits 29-28, VP bit 24, vHUB id in bits 23-0 */
	uint32_t	up_eport_vp_vhub_id;
	uint32_t	up_tusn;
	fip_vhub_table_entry_t	up_tbl_entry;
} fip_desc_vhub_update_t;

#define	FIP_UP_VP_SHIFT			24
#define	FIP_UP_VP_MASK			0x1
#define	FIP_UP_EPORT_STATE_SHIFT	28
#define	FIP_UP_EPORT_STATE_MASK		0x3
#define	FIP_UP_VHUB_ID_MASK		0x00FFFFFF

#define	FIP_EPORT_DOWN			0x0
#define	FIP_EPORT_UP			0x1

/*
 * FIP_Descriptor_vHUB_Table
 *
 * Variable length descriptor (no FIP_DESC_LEN_* constant): the fixed
 * part below is followed on the wire by tb_table_size worth of
 * FIP_vHUB_Table_Entry records and a trailing 32-bit checksum.
 */
#define	FIP_DESC_TYPE_VHUB_TABLE	244
typedef struct fip_desc_vhub_table_s {
	uint8_t		tb_type;
	uint8_t		tb_len;
	uint8_t		tb_reserved1[2];
	uint8_t		tb_vendor_id[FIP_VENDOR_LEN];
	uint32_t	tb_flags_vhub_id;
	uint32_t	tb_tusn;
	/* table segmentation: FIRST/MIDDLE/LAST/ONLY (FIP_TB_HDR_*) */
	uint8_t		tb_hdr;
	uint8_t		tb_reserved2;
	uint16_t	tb_table_size;
	/*
	 * FIP_vHUB_Table_Entry
	 * FIP_vHUB_Table_Entry
	 * .
	 * .
	 * .
	 * uint32_t	Checksum
	 */
} fip_desc_vhub_table_t;

#define	FIP_TB_FLAGS_VP_SHIFT		24
#define	FIP_TB_FLAGS_VP_MASK		0x1

#define	FIP_TB_VHUB_ID_MASK		0x00FFFFFF

#define	FIP_TB_HDR_MIDDLE		0x00
#define	FIP_TB_HDR_FIRST		0x40
#define	FIP_TB_HDR_LAST			0x80
#define	FIP_TB_HDR_ONLY			0xC0

/* Fixed-part size in bytes and in 32-bit words */
#define	FIP_DESC_VHUB_TABLE_SZ		(sizeof (fip_desc_vhub_table_t))
#define	FIP_DESC_VHUB_TABLE_WORDS	(FIP_DESC_VHUB_TABLE_SZ >> 2)

/*
 * FIP vHUB Table Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_vHUB_Table
 */
typedef struct fip_vhub_table_s {
	fip_proto_t		vt_proto_version;
	fip_basic_hdr_t		vt_fip_header;
	fip_desc_vhub_table_t	vt_vhub_table;
} fip_vhub_table_t;

/*
 * FIP vHUB Update Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_vHUB_Update
 */
typedef struct fip_vhub_update_s {
	fip_proto_t		vu_proto_version;
	fip_basic_hdr_t		vu_fip_header;
	fip_desc_vhub_update_t	vu_vhub_update;
} fip_vhub_update_t;

/*
 * Just a generic container to handle either type of VHUB
 * messages
 */
typedef struct fip_vhub_pkt_s {
	fip_proto_t	hb_proto_version;
	fip_basic_hdr_t	hb_fip_header;
} fip_vhub_pkt_t;

#ifdef __cplusplus
}
#endif

#endif	/* _SYS_IB_EOIB_FIP_H */
#
# InfiniBand pseudo drivers
# (eibnx is the EoIB nexus driver, eoib the EoIB driver; PSARC/2010/259)
#
DRV_KMODS	+= ib ibp eibnx eoib rdsib sdp iser daplt hermon tavor sol_ucma sol_uverbs
DRV_KMODS	+= sol_umad

#
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# This makefile drives the production of the EoIB Nexus driver (eibnx)
#
# intel architecture dependent
#

#
# Path to the base of the uts directory tree (usually /usr/src/uts)
#
UTSBASE		= ../..

#
# Define the module and object file sets
#
MODULE		= eibnx
OBJECTS		= $(EIBNX_OBJS:%=$(OBJS_DIR)/%)
LINTS		= $(EIBNX_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
CONF_SRCDIR	= $(UTSBASE)/common/io/ib/clients/eoib
WARLOCK_OUT	= $(EIBNX_OBJS:%.o=%.ll)
WARLOCK_OK	= $(MODULE).ok
WLCMD_DIR	= $(UTSBASE)/common/io/warlock

#
# Include common rules
#
include $(UTSBASE)/intel/Makefile.intel

#
# Define targets
#
ALL_TARGET	= $(BINARY) $(SRC_CONFILE)
LINT_TARGET	= $(MODULE).lint
INSTALL_TARGET	= $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)

# Module specific debug flag
#
CPPFLAGS	+= -DENX_DEBUG

#
# Lint pass one enforcement
#
CFLAGS		+= $(CCVERBOSE)

#
# Depends on misc/ibcm and misc/ibtl
#
LDFLAGS		+= -dy -Nmisc/ibcm -Nmisc/ibtl

#
# The only lint flag we should need
#
LINTTAGS	+= -erroff=E_BAD_PTR_CAST_ALIGN

#
# Default build targets
#
.KEEP_STATE:

def:		$(DEF_DEPS)

all:		$(ALL_DEPS)

clean:		$(CLEAN_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

clobber:	$(CLOBBER_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

lint:		$(LINT_DEPS)

modlintlib:	$(MODLINTLIB_DEPS)

clean.lint:	$(CLEAN_LINT_DEPS)

install:	$(INSTALL_DEPS)

#
# Include common targets
#
include $(UTSBASE)/intel/Makefile.targ

#
# Defines for local commands used by the warlock (static lock-analysis)
# targets below; these are not part of the normal build
#
WARLOCK		= warlock
WLCC		= wlcc
TOUCH		= touch
TEST		= test

warlock: $(WARLOCK_OK)

$(WARLOCK_OK): $(WARLOCK_OUT) $(WLCMD_DIR)/eibnx.wlcmd warlock_ddi.files
	$(WARLOCK) -c $(WLCMD_DIR)/eibnx.wlcmd $(WARLOCK_OUT) \
	    -l ../warlock/ddi_dki_impl.ll
	$(TOUCH) $@

%.ll: $(UTSBASE)/common/io/ib/clients/eoib/enx_main.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_hdlrs.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_ibt.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_log.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_fip.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_misc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_q.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_ctl.c \
	$(UTSBASE)/common/sys/ib/clients/eoib/fip.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/enx_impl.h
	$(WLCC) $(CPPFLAGS) -DDEBUG -o $@ $<

warlock_ddi.files:
	@cd ../warlock; pwd; $(MAKE) warlock
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# This makefile drives the production of the EoIB driver (eoib).
# (The nexus driver, eibnx, is built by a separate makefile.)
#
# intel architecture dependent
#

#
# Path to the base of the uts directory tree (usually /usr/src/uts)
#
UTSBASE		= ../..

#
# Define the module and object file sets
#
MODULE		= eoib
OBJECTS		= $(EOIB_OBJS:%=$(OBJS_DIR)/%)
LINTS		= $(EOIB_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
WARLOCK_OUT	= $(EOIB_OBJS:%.o=%.ll)
WARLOCK_OK	= $(MODULE).ok
WLCMD_DIR	= $(UTSBASE)/common/io/warlock

#
# Include common rules
#
include $(UTSBASE)/intel/Makefile.intel

#
# Define targets
#
ALL_TARGET	= $(BINARY)
LINT_TARGET	= $(MODULE).lint
INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)

# Module specific debug flag
#
CPPFLAGS	+= -DEIB_DEBUG

#
# Lint pass one enforcement
#
CFLAGS		+= $(CCVERBOSE)

#
# Depends on misc/mac, misc/ibtl, misc/ibcm and misc/ibmf
#
LDFLAGS		+= -dy -Nmisc/mac -Nmisc/ibtl -Nmisc/ibcm -Nmisc/ibmf

#
# Default build targets
#
.KEEP_STATE:

def:		$(DEF_DEPS)

all:		$(ALL_DEPS)

clean:		$(CLEAN_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

clobber:	$(CLOBBER_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

lint:		$(LINT_DEPS)

modlintlib:	$(MODLINTLIB_DEPS)

clean.lint:	$(CLEAN_LINT_DEPS)

install:	$(INSTALL_DEPS)

#
# Include common targets
#
include $(UTSBASE)/intel/Makefile.targ

#
# Defines for local commands used by the warlock (static lock-analysis)
# targets below; these are not part of the normal build
#
WARLOCK		= warlock
WLCC		= wlcc
TOUCH		= touch
TEST		= test

warlock: $(WARLOCK_OK)

$(WARLOCK_OK): $(WARLOCK_OUT) $(WLCMD_DIR)/eoib.wlcmd warlock_ddi.files
	$(WARLOCK) -c $(WLCMD_DIR)/eoib.wlcmd $(WARLOCK_OUT) \
	    -l ../warlock/ddi_dki_impl.ll
	$(TOUCH) $@

%.ll: $(UTSBASE)/common/io/ib/clients/eoib/eib_adm.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_chan.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_cmn.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_ctl.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_data.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_fip.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_ibt.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_log.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_mac.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_main.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_rsrc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_svc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_vnic.c \
	$(UTSBASE)/common/sys/ib/clients/eoib/fip.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib_impl.h
	$(WLCC) $(CPPFLAGS) -DDEBUG -o $@ $<

warlock_ddi.files:
	@cd ../warlock; pwd; $(MAKE) warlock
DRV_KMODS	+= usbecm
DRV_KMODS	+= hci1394 av1394 scsa1394 dcam1394
DRV_KMODS	+= sbp2
# eibnx is the EoIB nexus driver, eoib the EoIB driver (PSARC/2010/259)
DRV_KMODS	+= ib ibp eibnx eoib rdsib sdp iser daplt hermon tavor sol_ucma sol_uverbs
DRV_KMODS	+= sol_umad
DRV_KMODS	+= pci_pci pcieb pcieb_bcm
DRV_KMODS	+= i8042 kb8042 mouse8042
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# This makefile drives the production of the EoIB Nexus driver (eibnx)
#
# sparc architecture dependent
#

#
# Path to the base of the uts directory tree (usually /usr/src/uts)
#
UTSBASE		= ../..

#
# Define the module and object file sets
#
MODULE		= eibnx
OBJECTS		= $(EIBNX_OBJS:%=$(OBJS_DIR)/%)
LINTS		= $(EIBNX_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
CONF_SRCDIR	= $(UTSBASE)/common/io/ib/clients/eoib
WARLOCK_OUT	= $(EIBNX_OBJS:%.o=%.ll)
WARLOCK_OK	= $(MODULE).ok
WLCMD_DIR	= $(UTSBASE)/common/io/warlock

#
# Include common rules
#
include $(UTSBASE)/sparc/Makefile.sparc

#
# Define targets
#
ALL_TARGET	= $(BINARY) $(SRC_CONFILE)
LINT_TARGET	= $(MODULE).lint
INSTALL_TARGET	= $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)

#
# Overrides: this module is built 64-bit only on sparc.
#
ALL_BUILDS	= $(ALL_BUILDSONLY64)
DEF_BUILDS	= $(DEF_BUILDSONLY64)

# Module specific debug flag
#
CPPFLAGS	+= -DENX_DEBUG

#
# Lint pass one enforcement
#
CFLAGS		+= $(CCVERBOSE)

#
# Depends on misc/ibcm and misc/ibtl
#
LDFLAGS		+= -dy -Nmisc/ibcm -Nmisc/ibtl

#
# The only lint flag we should need
#
LINTTAGS	+= -erroff=E_BAD_PTR_CAST_ALIGN

#
# Default build targets
#
.KEEP_STATE:

def:		$(DEF_DEPS)

all:		$(ALL_DEPS)

clean:		$(CLEAN_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

clobber:	$(CLOBBER_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

lint:		$(LINT_DEPS)

modlintlib:	$(MODLINTLIB_DEPS) lint32

clean.lint:	$(CLEAN_LINT_DEPS)

install:	$(INSTALL_DEPS)

#
# Include common targets
#
include $(UTSBASE)/sparc/Makefile.targ

#
# Defines for local commands used by the warlock (static lock-analysis)
# targets below; these are not part of the normal build
#
WARLOCK		= warlock
WLCC		= wlcc
TOUCH		= touch
TEST		= test

warlock: $(WARLOCK_OK)

$(WARLOCK_OK): $(WARLOCK_OUT) $(WLCMD_DIR)/eibnx.wlcmd warlock_ddi.files
	$(WARLOCK) -c $(WLCMD_DIR)/eibnx.wlcmd $(WARLOCK_OUT) \
	    -l ../warlock/ddi_dki_impl.ll
	$(TOUCH) $@

%.ll: $(UTSBASE)/common/io/ib/clients/eoib/enx_main.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_hdlrs.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_ibt.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_log.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_fip.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_misc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_q.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_ctl.c \
	$(UTSBASE)/common/sys/ib/clients/eoib/fip.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/enx_impl.h
	$(WLCC) $(CPPFLAGS) -DDEBUG -o $@ $<

warlock_ddi.files:
	@cd ../warlock; pwd; $(MAKE) warlock
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# This makefile drives the production of the EoIB driver (eoib).
# (The nexus driver, eibnx, is built by a separate makefile.)
#
# sparc architecture dependent
#

#
# Path to the base of the uts directory tree (usually /usr/src/uts)
#
UTSBASE		= ../..

#
# Define the module and object file sets
#
MODULE		= eoib
OBJECTS		= $(EOIB_OBJS:%=$(OBJS_DIR)/%)
LINTS		= $(EOIB_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
WARLOCK_OUT	= $(EOIB_OBJS:%.o=%.ll)
WARLOCK_OK	= $(MODULE).ok
WLCMD_DIR	= $(UTSBASE)/common/io/warlock

#
# Include common rules
#
include $(UTSBASE)/sparc/Makefile.sparc

#
# Define targets
#
ALL_TARGET	= $(BINARY)
LINT_TARGET	= $(MODULE).lint
INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)

#
# Overrides: this module is built 64-bit only on sparc.
#
ALL_BUILDS	= $(ALL_BUILDSONLY64)
DEF_BUILDS	= $(DEF_BUILDSONLY64)

# Module specific debug flag
#
CPPFLAGS	+= -DEIB_DEBUG

#
# Lint pass one enforcement
#
CFLAGS		+= $(CCVERBOSE)

#
# Depends on misc/mac, misc/ibtl, misc/ibcm and misc/ibmf
#
LDFLAGS		+= -dy -Nmisc/mac -Nmisc/ibtl -Nmisc/ibcm -Nmisc/ibmf

#
# Default build targets
#
.KEEP_STATE:

def:		$(DEF_DEPS)

all:		$(ALL_DEPS)

clean:		$(CLEAN_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

clobber:	$(CLOBBER_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

lint:		$(LINT_DEPS)

modlintlib:	$(MODLINTLIB_DEPS) lint32

clean.lint:	$(CLEAN_LINT_DEPS)

install:	$(INSTALL_DEPS)

#
# Include common targets
#
include $(UTSBASE)/sparc/Makefile.targ

#
# Defines for local commands used by the warlock (static lock-analysis)
# targets below; these are not part of the normal build
#
WARLOCK		= warlock
WLCC		= wlcc
TOUCH		= touch
TEST		= test

warlock: $(WARLOCK_OK)

$(WARLOCK_OK): $(WARLOCK_OUT) $(WLCMD_DIR)/eoib.wlcmd warlock_ddi.files
	$(WARLOCK) -c $(WLCMD_DIR)/eoib.wlcmd $(WARLOCK_OUT) \
	    -l ../warlock/ddi_dki_impl.ll
	$(TOUCH) $@

%.ll: $(UTSBASE)/common/io/ib/clients/eoib/eib_adm.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_chan.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_cmn.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_ctl.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_data.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_fip.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_ibt.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_log.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_mac.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_main.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_rsrc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_svc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_vnic.c \
	$(UTSBASE)/common/sys/ib/clients/eoib/fip.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib_impl.h
	$(WLCC) $(CPPFLAGS) -DDEBUG -o $@ $<

warlock_ddi.files:
	@cd ../warlock; pwd; $(MAKE) warlock