Mercurial > illumos > illumos-gate
changeset 13113:da7b13ec3a28
PSARC/2010/259 Ethernet over IB
6891335 Driver for supporting "Ethernet over InfiniBand" protocol
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/pkg/manifests/driver-network-eoib.mf Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,49 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# The default for payload-bearing actions in this package is to appear in the +# global zone only. See the include file for greater detail, as well as +# information about overriding the defaults. 
+# +<include global_zone_only_component> +set name=pkg.fmri value=pkg:/driver/network/eoib@$(PKGVERS) +set name=pkg.description value="Solaris Drivers for Ethernet over InfiniBand" +set name=pkg.summary value="Solaris Ethernet over InfiniBand" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Hardware +set name=variant.arch value=$(ARCH) +dir path=kernel group=sys +dir path=kernel/drv group=sys +dir path=kernel/drv/$(ARCH64) group=sys +driver name=eibnx perms="* 0666 root sys" +driver name=eoib clone_perms="eoib 0666 root sys" perms="* 0666 root sys" +file path=kernel/drv/$(ARCH64)/eibnx group=sys +file path=kernel/drv/$(ARCH64)/eoib group=sys +$(i386_ONLY)file path=kernel/drv/eibnx group=sys +file path=kernel/drv/eibnx.conf group=sys +$(i386_ONLY)file path=kernel/drv/eoib group=sys +license cr_Sun license=cr_Sun +license lic_CDDL license=lic_CDDL
--- a/usr/src/uts/common/Makefile.files Fri Aug 13 14:44:26 2010 +0800 +++ b/usr/src/uts/common/Makefile.files Fri Aug 13 07:02:57 2010 -0400 @@ -1734,6 +1734,13 @@ IBD_OBJS += ibd.o ibd_cm.o +EIBNX_OBJS += enx_main.o enx_hdlrs.o enx_ibt.o enx_log.o enx_fip.o \ + enx_misc.o enx_q.o enx_ctl.o + +EOIB_OBJS += eib_adm.o eib_chan.o eib_cmn.o eib_ctl.o eib_data.o \ + eib_fip.o eib_ibt.o eib_log.o eib_mac.o eib_main.o \ + eib_rsrc.o eib_svc.o eib_vnic.o + DLPISTUB_OBJS += dlpistub.o SDP_OBJS += sdpddi.o
--- a/usr/src/uts/common/Makefile.rules Fri Aug 13 14:44:26 2010 +0800 +++ b/usr/src/uts/common/Makefile.rules Fri Aug 13 07:02:57 2010 -0400 @@ -760,6 +760,10 @@ $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ib/clients/eoib/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ib/clients/of/sol_ofs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -2070,6 +2074,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/ibd/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/eoib/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ib/clients/of/sol_ofs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_adm.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,487 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static int eib_adm_setup_cq(eib_t *); +static int eib_adm_setup_ud_channel(eib_t *); +static void eib_adm_comp_intr(ibt_cq_hdl_t, void *); +static void eib_adm_rx_comp(eib_t *, eib_wqe_t *); +static void eib_adm_tx_comp(eib_t *, eib_wqe_t *); +static void eib_adm_err_comp(eib_t *, eib_wqe_t *, ibt_wc_t *); +static void eib_rb_adm_setup_cq(eib_t *); +static void eib_rb_adm_setup_ud_channel(eib_t *); + +int +eib_adm_setup_qp(eib_t *ss, int *err) +{ + eib_chan_t *chan; + ibt_status_t ret; + uint16_t pkey_ix; + + /* + * Verify pkey + */ + ret = ibt_pkey2index(ss->ei_hca_hdl, ss->ei_props->ep_port_num, + EIB_ADMIN_PKEY, &pkey_ix); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_qp: " + "ibt_pkey2index() failed, port_num=0x%x, " + "pkey=0x%x, ret=%d", ss->ei_props->ep_port_num, + EIB_ADMIN_PKEY, ret); + *err = ENONET; + goto adm_setup_qp_fail; + } + + /* + * Allocate a eib_chan_t to store stuff about admin qp and + * initialize some basic stuff + */ + ss->ei_admin_chan = eib_chan_init(); + + chan = ss->ei_admin_chan; + chan->ch_pkey = EIB_ADMIN_PKEY; + chan->ch_pkey_ix = pkey_ix; + chan->ch_vnic_inst = -1; + + /* + * Setup a combined CQ and completion handler + */ + if (eib_adm_setup_cq(ss) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_qp: " + "eib_adm_setup_cq() failed"); + *err = ENOMEM; + goto adm_setup_qp_fail; + } + + /* + * Setup UD channel + */ + if (eib_adm_setup_ud_channel(ss) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_qp: " + "eib_adm_setup_ud_channel() failed"); + *err = ENOMEM; + goto adm_setup_qp_fail; + } + + /* + * Post initial set of rx buffers to the HCA + */ + if (eib_chan_post_rx(ss, chan, NULL) != EIB_E_SUCCESS) { + 
EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_qp: " + "eib_chan_post_rx() failed"); + *err = ENOMEM; + goto adm_setup_qp_fail; + } + + return (EIB_E_SUCCESS); + +adm_setup_qp_fail: + eib_rb_adm_setup_qp(ss); + return (EIB_E_FAILURE); +} + +/*ARGSUSED*/ +uint_t +eib_adm_comp_handler(caddr_t arg1, caddr_t arg2) +{ + eib_t *ss = (eib_t *)(void *)arg1; + eib_chan_t *chan = ss->ei_admin_chan; + ibt_wc_t *wc; + eib_wqe_t *wqe; + ibt_status_t ret; + uint_t polled; + int i; + + /* + * Re-arm the notification callback before we start polling + * the completion queue. There's nothing much we can do if the + * enable_cq_notify fails - we issue a warning and move on. + */ + ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_adm_comp_handler: " + "ibt_enable_cq_notify() failed, ret=%d", ret); + } + + /* + * Handle tx and rx completions + */ + while ((ret = ibt_poll_cq(chan->ch_cq_hdl, chan->ch_wc, chan->ch_cq_sz, + &polled)) == IBT_SUCCESS) { + for (wc = chan->ch_wc, i = 0; i < polled; i++, wc++) { + wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id; + if (wc->wc_status != IBT_WC_SUCCESS) { + eib_adm_err_comp(ss, wqe, wc); + } else if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) { + eib_adm_rx_comp(ss, wqe); + } else { + eib_adm_tx_comp(ss, wqe); + } + } + } + + return (DDI_INTR_CLAIMED); +} + +void +eib_rb_adm_setup_qp(eib_t *ss) +{ + eib_rb_adm_setup_ud_channel(ss); + + eib_rb_adm_setup_cq(ss); + + eib_chan_fini(ss->ei_admin_chan); + ss->ei_admin_chan = NULL; +} + +static int +eib_adm_setup_cq(eib_t *ss) +{ + eib_chan_t *chan = ss->ei_admin_chan; + ibt_cq_attr_t cq_attr; + ibt_status_t ret; + uint_t sz; + int rv; + + /* + * Allocate the admin completion queue for sending vnic logins and + * logouts and receiving vnic login acks. 
+ */ + cq_attr.cq_sched = NULL; + cq_attr.cq_flags = IBT_CQ_NO_FLAGS; + if (ss->ei_hca_attrs->hca_max_cq_sz < EIB_ADMIN_CQ_SIZE) + cq_attr.cq_size = ss->ei_hca_attrs->hca_max_cq_sz; + else + cq_attr.cq_size = EIB_ADMIN_CQ_SIZE; + + ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_cq_hdl, &sz); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_cq: " + "ibt_alloc_cq(cq_sz=0x%lx) failed, ret=%d", + cq_attr.cq_size, ret); + goto adm_setup_cq_fail; + } + + /* + * Set up other parameters for collecting completion information + */ + chan->ch_cq_sz = sz; + chan->ch_wc = kmem_zalloc(sizeof (ibt_wc_t) * sz, KM_SLEEP); + + /* + * Allocate soft interrupt for the admin channel cq handler and + * set up the handler as well. + */ + if ((rv = ddi_intr_add_softint(ss->ei_dip, &ss->ei_admin_si_hdl, + EIB_SOFTPRI_ADM, eib_adm_comp_handler, ss)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_cq: " + "ddi_intr_add_softint() failed for adm qp, ret=%d", rv); + goto adm_setup_cq_fail; + } + + /* + * Now, set up the admin completion queue handler. 
+ */ + ibt_set_cq_handler(chan->ch_cq_hdl, eib_adm_comp_intr, ss); + + ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_cq: " + "ibt_enable_cq_notify() failed, ret=%d", ret); + goto adm_setup_cq_fail; + } + + return (EIB_E_SUCCESS); + +adm_setup_cq_fail: + eib_rb_adm_setup_cq(ss); + return (EIB_E_FAILURE); +} + +static int +eib_adm_setup_ud_channel(eib_t *ss) +{ + eib_chan_t *chan = ss->ei_admin_chan; + ibt_ud_chan_alloc_args_t alloc_attr; + ibt_ud_chan_query_attr_t query_attr; + ibt_status_t ret; + + bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t)); + bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t)); + + alloc_attr.ud_flags = IBT_ALL_SIGNALED; + alloc_attr.ud_hca_port_num = ss->ei_props->ep_port_num; + alloc_attr.ud_pkey_ix = chan->ch_pkey_ix; + alloc_attr.ud_sizes.cs_sq = EIB_ADMIN_MAX_SWQE; + alloc_attr.ud_sizes.cs_rq = EIB_ADMIN_MAX_RWQE; + alloc_attr.ud_sizes.cs_sq_sgl = 1; + alloc_attr.ud_sizes.cs_rq_sgl = 1; + alloc_attr.ud_sizes.cs_inline = 0; + + alloc_attr.ud_qkey = EIB_FIP_QKEY; + alloc_attr.ud_scq = chan->ch_cq_hdl; + alloc_attr.ud_rcq = chan->ch_cq_hdl; + alloc_attr.ud_pd = ss->ei_pd_hdl; + + ret = ibt_alloc_ud_channel(ss->ei_hca_hdl, IBT_ACHAN_NO_FLAGS, + &alloc_attr, &chan->ch_chan, NULL); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_ud_channel: " + "ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x) " + "failed, ret=%d", alloc_attr.ud_hca_port_num, + chan->ch_pkey_ix, ret); + goto adm_setup_ud_channel_fail; + } + + ret = ibt_query_ud_channel(chan->ch_chan, &query_attr); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_setup_ud_channel: " + "ibt_query_ud_channel() failed, ret=%d", ret); + goto adm_setup_ud_channel_fail; + } + + chan->ch_qpn = query_attr.ud_qpn; + chan->ch_max_swqes = query_attr.ud_chan_sizes.cs_sq; + chan->ch_max_rwqes = query_attr.ud_chan_sizes.cs_rq; + chan->ch_lwm_rwqes = 
chan->ch_max_rwqes >> 2; + chan->ch_rwqe_bktsz = chan->ch_max_rwqes; + chan->ch_ip_hdr_align = 0; + chan->ch_alloc_mp = B_FALSE; + chan->ch_tear_down = B_FALSE; + + return (EIB_E_SUCCESS); + +adm_setup_ud_channel_fail: + eib_rb_adm_setup_ud_channel(ss); + return (EIB_E_FAILURE); +} + +static void +eib_adm_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg) +{ + eib_t *ss = arg; + eib_chan_t *chan = ss->ei_admin_chan; + + if (cq_hdl != chan->ch_cq_hdl) { + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_adm_comp_intr: " + "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), " + "ignoring completion", cq_hdl, chan->ch_cq_hdl); + return; + } + + ASSERT(ss->ei_admin_si_hdl != NULL); + + (void) ddi_intr_trigger_softint(ss->ei_admin_si_hdl, NULL); +} + +static void +eib_adm_rx_comp(eib_t *ss, eib_wqe_t *wqe) +{ + eib_chan_t *chan = ss->ei_admin_chan; + eib_login_data_t ld; + uint8_t *pkt = (uint8_t *)(uintptr_t)(wqe->qe_sgl.ds_va); + ibt_status_t ret; + + /* + * Skip the GRH and parse the login ack message in the packet + */ + if (eib_fip_parse_login_ack(ss, pkt + EIB_GRH_SZ, &ld) == EIB_E_SUCCESS) + eib_vnic_login_ack(ss, &ld); + + /* + * Try to repost the rwqe. For admin channel, we can take the shortcut + * and not go through eib_chan_post_recv(), since we know that the + * qe_info flag, qe_chan and qe_vinst are all already set correctly; we + * just took this out of the rx queue, so the ch_rx_posted will be ok + * if we just posted it back. And there are no mblk allocation or + * buffer alignment restrictions for this channel as well. 
+ */ + if (chan->ch_tear_down) { + eib_rsrc_return_rwqe(ss, wqe, chan); + } else { + ret = ibt_post_recv(chan->ch_chan, &(wqe->qe_wr.recv), 1, NULL); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_rx_comp: " + "ibt_post_recv() failed, ret=%d", ret); + eib_rsrc_return_rwqe(ss, wqe, chan); + } + } +} + +static void +eib_adm_tx_comp(eib_t *ss, eib_wqe_t *wqe) +{ + eib_rsrc_return_swqe(ss, wqe, ss->ei_admin_chan); +} + +/*ARGSUSED*/ +static void +eib_adm_err_comp(eib_t *ss, eib_wqe_t *wqe, ibt_wc_t *wc) +{ + /* + * Currently, all we do is report + */ + switch (wc->wc_status) { + case IBT_WC_WR_FLUSHED_ERR: + break; + + case IBT_WC_LOCAL_CHAN_OP_ERR: + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_err_comp: " + "IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ", + wqe->qe_info); + break; + + case IBT_WC_LOCAL_PROTECT_ERR: + EIB_DPRINTF_ERR(ss->ei_instance, "eib_adm_err_comp: " + "IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ", + wqe->qe_info); + break; + } + + /* + * When a wc indicates error, we do not attempt to repost but + * simply return it to the wqe pool. 
+ */ + if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) + eib_rsrc_return_rwqe(ss, wqe, ss->ei_admin_chan); + else + eib_rsrc_return_swqe(ss, wqe, ss->ei_admin_chan); +} + +static void +eib_rb_adm_setup_cq(eib_t *ss) +{ + eib_chan_t *chan = ss->ei_admin_chan; + ibt_status_t ret; + + if (chan == NULL) + return; + + /* + * Reset any completion handler we may have set up + */ + if (chan->ch_cq_hdl) + ibt_set_cq_handler(chan->ch_cq_hdl, NULL, NULL); + + /* + * Remove any softint we may have allocated for the admin cq + */ + if (ss->ei_admin_si_hdl) { + (void) ddi_intr_remove_softint(ss->ei_admin_si_hdl); + ss->ei_admin_si_hdl = NULL; + } + + /* + * Release any work completion buffers we may have allocated + */ + if (chan->ch_wc && chan->ch_cq_sz) + kmem_free(chan->ch_wc, sizeof (ibt_wc_t) * chan->ch_cq_sz); + + chan->ch_cq_sz = 0; + chan->ch_wc = NULL; + + /* + * Free any completion queue we may have allocated + */ + if (chan->ch_cq_hdl) { + ret = ibt_free_cq(chan->ch_cq_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_adm_setup_cq: " + "ibt_free_cq() failed, ret=%d", ret); + } + chan->ch_cq_hdl = NULL; + } +} + +static void +eib_rb_adm_setup_ud_channel(eib_t *ss) +{ + eib_chan_t *chan = ss->ei_admin_chan; + ibt_status_t ret; + + if (chan == NULL) + return; + + if (chan->ch_chan) { + /* + * We're trying to tear down this UD channel. Make sure that + * we don't attempt to refill (repost) at any point from now on. + */ + chan->ch_tear_down = B_TRUE; + if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_adm_setup_ud_channel: " + "ibt_flush_channel() failed, ret=%d", ret); + } + + /* + * Wait until all posted tx wqes on this channel are back with + * the wqe pool. 
+ */ + mutex_enter(&chan->ch_tx_lock); + while (chan->ch_tx_posted > 0) + cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock); + mutex_exit(&chan->ch_tx_lock); + + /* + * Wait until all posted rx wqes on this channel are back with + * the wqe pool. + */ + mutex_enter(&chan->ch_rx_lock); + while (chan->ch_rx_posted > 0) + cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock); + mutex_exit(&chan->ch_rx_lock); + + /* + * Now we're ready to free this channel + */ + if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_adm_setup_ud_channel: " + "ibt_free_channel() failed, ret=%d", ret); + } + + chan->ch_alloc_mp = B_FALSE; + chan->ch_ip_hdr_align = 0; + chan->ch_rwqe_bktsz = 0; + chan->ch_lwm_rwqes = 0; + chan->ch_max_rwqes = 0; + chan->ch_max_swqes = 0; + chan->ch_qpn = 0; + chan->ch_chan = NULL; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_chan.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,216 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +eib_chan_t * +eib_chan_init(void) +{ + eib_chan_t *chan; + + /* + * Allocate a eib_chan_t to store stuff about admin qp and + * initialize some basic stuff + */ + chan = kmem_zalloc(sizeof (eib_chan_t), KM_SLEEP); + + mutex_init(&chan->ch_pkey_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&chan->ch_cep_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&chan->ch_tx_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&chan->ch_rx_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&chan->ch_vhub_lock, NULL, MUTEX_DRIVER, NULL); + + cv_init(&chan->ch_cep_cv, NULL, CV_DEFAULT, NULL); + cv_init(&chan->ch_tx_cv, NULL, CV_DEFAULT, NULL); + cv_init(&chan->ch_rx_cv, NULL, CV_DEFAULT, NULL); + + return (chan); +} + +void +eib_chan_fini(eib_chan_t *chan) +{ + if (chan) { + cv_destroy(&chan->ch_rx_cv); + cv_destroy(&chan->ch_tx_cv); + cv_destroy(&chan->ch_cep_cv); + + mutex_destroy(&chan->ch_vhub_lock); + mutex_destroy(&chan->ch_rx_lock); + mutex_destroy(&chan->ch_tx_lock); + mutex_destroy(&chan->ch_cep_lock); + mutex_destroy(&chan->ch_pkey_lock); + + kmem_free(chan, sizeof (eib_chan_t)); + } +} + +int +eib_chan_post_rx(eib_t *ss, eib_chan_t *chan, uint_t *n_posted) +{ + eib_wqe_t *rwqes[EIB_RWR_CHUNK_SZ]; + ibt_status_t ret; + uint_t n_got = 0; + uint_t n_good = 0; + uint_t limit = 0; + uint_t room = 0; + uint_t chunk_sz; + int wndx; + int i; + + /* + * We don't want to post beyond the maximum rwqe size for this channel + */ + room = chan->ch_max_rwqes - chan->ch_rx_posted; + limit = (room > chan->ch_rwqe_bktsz) ? chan->ch_rwqe_bktsz : room; + + for (wndx = 0; wndx < limit; wndx += chunk_sz) { + /* + * Grab a chunk of rwqes + */ + chunk_sz = ((limit - wndx) < EIB_RWR_CHUNK_SZ) ? 
+ (limit - wndx) : EIB_RWR_CHUNK_SZ; + + /* + * When eib_chan_post_rx() is called to post a bunch of rwqes, + * it is either during the vnic setup or when we're refilling + * the data channel. Neither situation is important enough for + * us to grab the wqes reserved for sending keepalives of + * previously established vnics. + */ + ret = eib_rsrc_grab_rwqes(ss, rwqes, chunk_sz, &n_got, + EIB_WPRI_LO); + if (ret != EIB_E_SUCCESS) + break; + + /* + * Post work requests from the rwqes we just grabbed + */ + for (i = 0; i < n_got; i++) { + eib_wqe_t *rwqe = rwqes[i]; + + ret = eib_chan_post_recv(ss, chan, rwqe); + if (ret == EIB_E_SUCCESS) { + n_good++; + } else if (rwqe->qe_mp) { + freemsg(rwqe->qe_mp); + } else { + eib_rsrc_return_rwqe(ss, rwqe, NULL); + } + } + + /* + * If we got less rwqes than we asked for during the grab + * earlier, we'll stop asking for more and quit now. + */ + if (n_got < chunk_sz) + break; + } + + /* + * If we posted absolutely nothing, we return failure; otherwise + * return success. 
+ */ + if (n_good == 0) + return (EIB_E_FAILURE); + + if (n_posted) + *n_posted = n_good; + + return (EIB_E_SUCCESS); +} + +/*ARGSUSED*/ +int +eib_chan_post_recv(eib_t *ss, eib_chan_t *chan, eib_wqe_t *rwqe) +{ + ibt_status_t ret; + uint8_t *mp_base; + size_t mp_len; + + rwqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)rwqe->qe_cpbuf; + rwqe->qe_sgl.ds_len = rwqe->qe_bufsz; + + /* + * If this channel has receive buffer alignment restrictions, make + * sure the requirements are met + */ + if (chan->ch_ip_hdr_align) { + rwqe->qe_sgl.ds_va += chan->ch_ip_hdr_align; + rwqe->qe_sgl.ds_len -= chan->ch_ip_hdr_align; + } + + /* + * If the receive buffer for this channel needs to have an mblk + * allocated, do it + */ + if (chan->ch_alloc_mp) { + mp_base = (uint8_t *)(uintptr_t)(rwqe->qe_sgl.ds_va); + mp_len = rwqe->qe_sgl.ds_len; + + rwqe->qe_mp = desballoc(mp_base, mp_len, 0, &rwqe->qe_frp); + if (rwqe->qe_mp == NULL) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_chan_post_recv: " + "desballoc(base=0x%llx, len=0x%llx) failed", + mp_base, mp_len); + return (EIB_E_FAILURE); + } + } + + /* + * Check if the recv queue is already full or if we can post one more + */ + mutex_enter(&chan->ch_rx_lock); + if (chan->ch_rx_posted > (chan->ch_max_rwqes - 1)) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_chan_post_recv: " + "too many rwqes posted already, posted=0x%lx, max=0x%lx", + chan->ch_rx_posted, chan->ch_max_rwqes); + mutex_exit(&chan->ch_rx_lock); + return (EIB_E_FAILURE); + } + + rwqe->qe_vnic_inst = chan->ch_vnic_inst; + rwqe->qe_chan = chan; + rwqe->qe_info |= EIB_WQE_FLG_POSTED_TO_HCA; + + ret = ibt_post_recv(chan->ch_chan, &(rwqe->qe_wr.recv), 1, NULL); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_chan_post_recv: " + "ibt_post_recv() failed, ret=%d", ret); + mutex_exit(&chan->ch_rx_lock); + return (EIB_E_FAILURE); + } + chan->ch_rx_posted++; + mutex_exit(&chan->ch_rx_lock); + + return (EIB_E_SUCCESS); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_cmn.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,394 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Definitions private to this file + */ +ib_gid_t eib_reserved_gid; + +uint8_t eib_zero_mac[] = { + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 +}; + +uint8_t eib_broadcast_mac[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +int eib_setbit_mod67[] = { + -1, 0, 1, 39, 2, 15, 40, 23, + 3, 12, 16, 59, 41, 19, 24, 54, + 4, -1, 13, 10, 17, 62, 60, 28, + 42, 30, 20, 51, 25, 44, 55, 47, + 5, 32, -1, 38, 14, 22, 11, 58, + 18, 53, 63, 9, 61, 27, 29, 50, + 43, 46, 31, 37, 21, 57, 52, 8, + 26, 49, 45, 36, 56, 7, 48, 35, + 6, 34, 33 +}; + +char *eib_pvt_props[] = { + EIB_DLPROP_GW_EPORT_STATE, + EIB_DLPROP_HCA_GUID, + EIB_DLPROP_PORT_GUID, + NULL +}; + +#define eib_prop_get_and_test(inst, dp, propname, propval) \ +{ \ + (propval) = ddi_prop_get_int(DDI_DEV_T_ANY, (dp), \ + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, (propname), -1); \ + if ((propval) == -1) { \ + EIB_DPRINTF_WARN((inst), "eib_get_props: " \ + "ddi_prop_get_int() could not find " \ + "property '%s'", (propname)); \ + goto get_props_fail; \ + } \ +} + +#define eib_prop64_get_and_test(inst, dp, propname, propval) \ +{ \ + (propval) = ddi_prop_get_int64(DDI_DEV_T_ANY, (dp), \ + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, (propname), -1); \ + if ((propval) == -1) { \ + EIB_DPRINTF_WARN((inst), "eib_get_props: " \ + "ddi_prop_get_int64() could not find " \ + "property '%s'", (propname)); \ + goto get_props_fail; \ + } \ +} + +#define eib_propstr_get_and_test(inst, dp, propname, propval_p) \ +{ \ + int rv; \ + \ + *(propval_p) = NULL; \ + \ + rv = ddi_prop_lookup_string(DDI_DEV_T_ANY, (dp), \ + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, (propname), \ + (propval_p)); \ + if (rv != DDI_PROP_SUCCESS) { \ + EIB_DPRINTF_WARN((inst), "eib_get_props: " \ + "ddi_prop_lookup_string() could not find " \ + "property '%s'", (propname)); \ + goto 
get_props_fail; \ + } \ +} + +/* + * HW/FW workarounds + */ + +/* + * 1. Verification of descriptor list length in the received packets is + * disabled, since experimentation shows that BX does not set the desc + * list length correctly. True for EoIB nexus as well. + */ +int eib_wa_no_desc_list_len = 1; + +/* + * 2. LSO/Checksum_Offload for EoIB packets does not seem to be supported + * currently, so we'll disable both temporarily. + */ +int eib_wa_no_cksum_offload = 1; +int eib_wa_no_lso = 1; + +/* + * 3. The "multicast entry" types are not clearly defined in the spec + * at the moment. The current BX software/firmware appears to ignore + * the type of the context table entries, so we will treat these + * addresses just like regular vnic addresses. + */ +int eib_wa_no_mcast_entries = 1; + +/* + * 4. VHUB updates from the gateways provide us with destination LIDs, + * and we will hand-create these address vectors. + */ +int eib_wa_no_av_discover = 1; + +/* + * 5. The older BX software does not seem to set the VP flag correctly + * in the login acknowledgements even when it successfully allocates + * a vlan, so we will ignore it for now. + */ +int eib_wa_no_good_vp_flag = 1; + +/* + * 6. Each vhub table is expected to carry a checksum at the end to + * verify the contents of the received vhub table. The current BX + * software/firmware does not seem to fill this field with the + * correct value (and/or the spec description is ambiguous). We + * will ignore the vhub table checksum verification for now. 
+ */ +int eib_wa_no_good_vhub_cksum = 1; + +int +eib_get_props(eib_t *ss) +{ + int val; + int64_t val64; + char *str; + clock_t gw_ka_usecs; + clock_t vnic_ka_usecs; + + ss->ei_gw_props = kmem_zalloc(sizeof (eib_gw_props_t), KM_SLEEP); + ss->ei_props = kmem_zalloc(sizeof (eib_props_t), KM_SLEEP); + + mutex_init(&ss->ei_gw_props->pp_gw_lock, NULL, MUTEX_DRIVER, NULL); + + /* + * The interface speed is currently set to 10Gb/s, since we don't + * have a way yet to figure this virtual-wire specific data from + * the gateway. The rest of the properties are handed over to us + * by the EoIB nexus. + */ + ss->ei_props->ep_ifspeed = 10000000000; + + eib_prop64_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_HCA_GUID, val64); + ss->ei_props->ep_hca_guid = (ib_guid_t)val64; + + eib_prop64_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_GW_SYS_GUID, val64); + ss->ei_gw_props->pp_gw_system_guid = (ib_guid_t)val64; + + eib_prop64_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_GW_GUID, val64); + ss->ei_gw_props->pp_gw_guid = (ib_guid_t)val64; + + eib_prop64_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_GW_SN_PREFIX, val64); + ss->ei_gw_props->pp_gw_sn_prefix = (ib_sn_prefix_t)val64; + + eib_prop_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_GW_ADV_PERIOD, val); + ss->ei_gw_props->pp_gw_adv_period = (uint_t)val; + + eib_prop_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_GW_KA_PERIOD, val); + ss->ei_gw_props->pp_gw_ka_period = (uint_t)val; + + gw_ka_usecs = ss->ei_gw_props->pp_gw_ka_period * 1000; + gw_ka_usecs = ((gw_ka_usecs << 2) + gw_ka_usecs) >> 1; + ss->ei_gw_props->pp_gw_ka_ticks = drv_usectohz(gw_ka_usecs); + + eib_prop_get_and_test(ss->ei_instance, ss->ei_dip, + EIB_PROP_VNIC_KA_PERIOD, val); + ss->ei_gw_props->pp_vnic_ka_period = (uint_t)val; + + vnic_ka_usecs = ss->ei_gw_props->pp_vnic_ka_period * 1000; + ss->ei_gw_props->pp_vnic_ka_ticks = drv_usectohz(vnic_ka_usecs); + + eib_prop_get_and_test(ss->ei_instance, ss->ei_dip, + 
/*
 * NOTE(review): this span opens mid-way through eib_get_props(); the
 * function head and the eib_prop_get_and_test()/eib_propstr_get_and_test()
 * macros are outside this excerpt.  The macros presumably jump to
 * get_props_fail on lookup failure -- TODO confirm against eib_impl.h.
 */
	    EIB_PROP_GW_CTRL_QPN, val);
	ss->ei_gw_props->pp_gw_ctrl_qpn = (ib_qpn_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_LID, val);
	ss->ei_gw_props->pp_gw_lid = (ib_lid_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_PORTID, val);
	ss->ei_gw_props->pp_gw_portid = (uint16_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_NUM_NET_VNICS, val);
	ss->ei_gw_props->pp_gw_num_net_vnics = (uint16_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_AVAILABLE, val);
	ss->ei_gw_props->pp_gw_flag_available = (uint8_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_HOST_VNICS, val);
	ss->ei_gw_props->pp_gw_is_host_adm_vnics = (uint8_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_SL, val);
	ss->ei_gw_props->pp_gw_sl = (uint8_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_N_RSS_QPN, val);
	ss->ei_gw_props->pp_gw_n_rss_qpn = (uint8_t)val;

	eib_prop_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_HCA_PORTNUM, val);
	ss->ei_props->ep_port_num = (uint8_t)val;

	/*
	 * String-valued properties: the looked-up strings are retained in
	 * the gw props and ddi_prop_free()d later by eib_rb_get_props()
	 * (or replaced by eib_update_props()).
	 */
	eib_propstr_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_SYS_NAME, &str);
	ss->ei_gw_props->pp_gw_system_name = (uint8_t *)str;

	eib_propstr_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_PORT_NAME, &str);
	ss->ei_gw_props->pp_gw_port_name = (uint8_t *)str;

	eib_propstr_get_and_test(ss->ei_instance, ss->ei_dip,
	    EIB_PROP_GW_VENDOR_ID, &str);
	ss->ei_gw_props->pp_gw_vendor_id = (uint8_t *)str;

	return (EIB_E_SUCCESS);

get_props_fail:
	/* Undo any partial setup (frees strings, locks and prop structs) */
	eib_rb_get_props(ss);
	return (EIB_E_FAILURE);
}

/*
 * eib_update_props - refresh the cached gateway properties from the
 * newly received gateway info and republish every value as a devinfo
 * node property via ddi_prop_update_*().  For the three string-valued
 * properties, the freshly looked-up copy replaces (and frees) the one
 * cached previously.  The whole update runs under pp_gw_lock.
 */
void
eib_update_props(eib_t *ss, eib_gw_info_t *new_gw_info)
{
	eib_gw_props_t *gwp = ss->ei_gw_props;
	dev_info_t *dip = ss->ei_dip;
	char *str;

	ASSERT(gwp != NULL && dip != NULL);

	mutex_enter(&gwp->pp_gw_lock);

	/* 64-bit properties */
	gwp->pp_gw_system_guid = new_gw_info->gi_system_guid;
	(void) ddi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SYS_GUID,
	    gwp->pp_gw_system_guid);

	gwp->pp_gw_guid = new_gw_info->gi_guid;
	(void) ddi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_GW_GUID,
	    gwp->pp_gw_guid);

	gwp->pp_gw_sn_prefix = new_gw_info->gi_sn_prefix;
	(void) ddi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SN_PREFIX,
	    gwp->pp_gw_sn_prefix);

	/* 32-bit (and narrower) properties, published as ints */
	gwp->pp_gw_adv_period = new_gw_info->gi_adv_period;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_ADV_PERIOD,
	    gwp->pp_gw_adv_period);

	gwp->pp_gw_ka_period = new_gw_info->gi_ka_period;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_KA_PERIOD,
	    gwp->pp_gw_ka_period);

	gwp->pp_vnic_ka_period = new_gw_info->gi_vnic_ka_period;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_VNIC_KA_PERIOD,
	    gwp->pp_vnic_ka_period);

	gwp->pp_gw_ctrl_qpn = new_gw_info->gi_ctrl_qpn;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_CTRL_QPN,
	    gwp->pp_gw_ctrl_qpn);

	gwp->pp_gw_lid = new_gw_info->gi_lid;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_LID,
	    gwp->pp_gw_lid);

	gwp->pp_gw_portid = new_gw_info->gi_portid;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_PORTID,
	    gwp->pp_gw_portid);

	gwp->pp_gw_num_net_vnics = new_gw_info->gi_num_net_vnics;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip,
	    EIB_PROP_GW_NUM_NET_VNICS, gwp->pp_gw_num_net_vnics);

	gwp->pp_gw_flag_available = new_gw_info->gi_flag_available;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_AVAILABLE,
	    gwp->pp_gw_flag_available);

	gwp->pp_gw_is_host_adm_vnics = new_gw_info->gi_is_host_adm_vnics;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_HOST_VNICS,
	    gwp->pp_gw_is_host_adm_vnics);

	gwp->pp_gw_sl = new_gw_info->gi_sl;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SL,
	    gwp->pp_gw_sl);

	gwp->pp_gw_n_rss_qpn = new_gw_info->gi_n_rss_qpn;
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_N_RSS_QPN,
	    gwp->pp_gw_n_rss_qpn);

	/*
	 * String properties: publish the new value, look our own copy back
	 * up (DONTPASS|NOTPROM restricts the lookup to this node's software
	 * property list), free the previously cached string and cache the
	 * new one.
	 */
	(void) ddi_prop_update_string(DDI_DEV_T_NONE, dip,
	    EIB_PROP_GW_SYS_NAME, (char *)(new_gw_info->gi_system_name));
	(void) ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, EIB_PROP_GW_SYS_NAME, &str);
	if (gwp->pp_gw_system_name) {
		ddi_prop_free(gwp->pp_gw_system_name);
	}
	gwp->pp_gw_system_name = (uint8_t *)str;

	(void) ddi_prop_update_string(DDI_DEV_T_NONE, dip,
	    EIB_PROP_GW_PORT_NAME, (char *)(new_gw_info->gi_port_name));
	(void) ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, EIB_PROP_GW_PORT_NAME, &str);
	if (gwp->pp_gw_port_name) {
		ddi_prop_free(gwp->pp_gw_port_name);
	}
	gwp->pp_gw_port_name = (uint8_t *)str;

	(void) ddi_prop_update_string(DDI_DEV_T_NONE, dip,
	    EIB_PROP_GW_VENDOR_ID, (char *)(new_gw_info->gi_vendor_id));
	(void) ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, EIB_PROP_GW_VENDOR_ID, &str);
	if (gwp->pp_gw_vendor_id) {
		ddi_prop_free(gwp->pp_gw_vendor_id);
	}
	gwp->pp_gw_vendor_id = (uint8_t *)str;

	mutex_exit(&gwp->pp_gw_lock);
}

/*
 * eib_rb_get_props - rollback for eib_get_props().  Frees the three
 * looked-up string properties, destroys the gw props lock and releases
 * the eib_props_t/eib_gw_props_t allocations.  Callers must not touch
 * ss->ei_props / ss->ei_gw_props afterwards (both are NULLed here).
 */
void
eib_rb_get_props(eib_t *ss)
{
	/*
	 * Free any allocations
	 */
	if (ss->ei_gw_props->pp_gw_vendor_id) {
		ddi_prop_free(ss->ei_gw_props->pp_gw_vendor_id);
		ss->ei_gw_props->pp_gw_vendor_id = NULL;
	}
	if (ss->ei_gw_props->pp_gw_port_name) {
		ddi_prop_free(ss->ei_gw_props->pp_gw_port_name);
		ss->ei_gw_props->pp_gw_port_name = NULL;
	}
	if (ss->ei_gw_props->pp_gw_system_name) {
		ddi_prop_free(ss->ei_gw_props->pp_gw_system_name);
		ss->ei_gw_props->pp_gw_system_name = NULL;
	}

	mutex_destroy(&ss->ei_gw_props->pp_gw_lock);

	/*
	 * Free space allocated for holding the props
	 */
	kmem_free(ss->ei_props, sizeof (eib_props_t));
	kmem_free(ss->ei_gw_props, sizeof (eib_gw_props_t));

	ss->ei_props = NULL;
	ss->ei_gw_props = NULL;
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_ctl.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,469 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
 */

/*
 * eib_ctl.c - setup, completion handling and teardown of the per-vnic
 * EoIB "control" UD channel (vhub table requests, vhub-update and
 * vnic-alive FIP messages exchanged with the gateway).
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>

#include <sys/ib/clients/eoib/eib_impl.h>

/*
 * Declarations private to this file
 */
static int eib_ctl_setup_cq(eib_t *, eib_vnic_t *);
static int eib_ctl_setup_ud_channel(eib_t *, eib_vnic_t *);
static void eib_ctl_comp_intr(ibt_cq_hdl_t, void *);
static void eib_ctl_rx_comp(eib_vnic_t *, eib_wqe_t *);
static void eib_ctl_tx_comp(eib_vnic_t *, eib_wqe_t *);
static void eib_ctl_err_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_rb_ctl_setup_cq(eib_t *, eib_vnic_t *);
static void eib_rb_ctl_setup_ud_channel(eib_t *, eib_vnic_t *);

/*
 * eib_ctl_create_qp - create the control qp for a vnic: channel state,
 * a combined tx/rx CQ with its softint handler, and the UD channel.
 * On failure, *err is set to ENOMEM and all partial setup is undone.
 * Returns EIB_E_SUCCESS/EIB_E_FAILURE.
 */
int
eib_ctl_create_qp(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	eib_chan_t *chan = NULL;

	/*
	 * Allocate an eib_chan_t to store stuff about this vnic's ctl qp
	 * and initialize it with default admin qp pkey parameters. We'll
	 * re-associate this with the pkey we receive from the gw once we
	 * receive the login ack.
	 */
	vnic->vn_ctl_chan = eib_chan_init();

	chan = vnic->vn_ctl_chan;
	chan->ch_pkey = ss->ei_admin_chan->ch_pkey;
	chan->ch_pkey_ix = ss->ei_admin_chan->ch_pkey_ix;
	chan->ch_vnic_inst = vnic->vn_instance;

	/*
	 * Setup a combined CQ and completion handler
	 */
	if (eib_ctl_setup_cq(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_create_qp: "
		    "eib_ctl_setup_cq() failed");
		*err = ENOMEM;
		goto ctl_create_qp_fail;
	}

	/*
	 * Setup UD channel
	 */
	if (eib_ctl_setup_ud_channel(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_create_qp: "
		    "eib_ctl_setup_ud_channel() failed");
		*err = ENOMEM;
		goto ctl_create_qp_fail;
	}

	return (EIB_E_SUCCESS);

ctl_create_qp_fail:
	eib_rb_ctl_create_qp(ss, vnic);
	return (EIB_E_FAILURE);
}

/*
 * eib_ctl_comp_handler - softint handler for the control channel CQ.
 * Drains the CQ and dispatches each work completion to the rx, tx or
 * error completion routine.  Always returns DDI_INTR_CLAIMED.
 */
/*ARGSUSED*/
uint_t
eib_ctl_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_chan_t *chan = vnic->vn_ctl_chan;
	eib_t *ss = vnic->vn_ss;
	ibt_wc_t *wc;
	eib_wqe_t *wqe;
	ibt_status_t ret;
	uint_t polled;
	int i;

	/*
	 * Re-arm the notification callback before we start polling
	 * the completion queue.  There's nothing much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ctl_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * Handle tx and rx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_cq_hdl, chan->ch_wc, chan->ch_cq_sz,
	    &polled)) == IBT_SUCCESS) {
		for (wc = chan->ch_wc, i = 0; i < polled; i++, wc++) {
			/* wc_id carries the wqe pointer we posted */
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				eib_ctl_err_comp(vnic, wqe, wc);
			} else if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) {
				eib_ctl_rx_comp(vnic, wqe);
			} else {
				eib_ctl_tx_comp(vnic, wqe);
			}
		}
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * eib_rb_ctl_create_qp - rollback for eib_ctl_create_qp(); tears down
 * the UD channel and CQ, then frees the channel state.
 */
void
eib_rb_ctl_create_qp(eib_t *ss, eib_vnic_t *vnic)
{
	eib_rb_ctl_setup_ud_channel(ss, vnic);

	eib_rb_ctl_setup_cq(ss, vnic);

	eib_chan_fini(vnic->vn_ctl_chan);
	vnic->vn_ctl_chan = NULL;
}

/*
 * eib_ctl_setup_cq - allocate the control channel's CQ (size capped at
 * the HCA maximum), the work-completion array, a softint for completion
 * processing, and arm the CQ.  Rolls itself back on failure.
 */
static int
eib_ctl_setup_cq(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	ibt_cq_attr_t cq_attr;
	ibt_status_t ret;
	uint_t sz;
	int rv;

	/*
	 * Allocate a completion queue for sending vhub table request
	 * and vhub-update/vnic-alive messages and responses from the
	 * gateway
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	if (ss->ei_hca_attrs->hca_max_cq_sz < EIB_CTL_CQ_SIZE)
		cq_attr.cq_size = ss->ei_hca_attrs->hca_max_cq_sz;
	else
		cq_attr.cq_size = EIB_CTL_CQ_SIZE;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_cq_hdl, &sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_setup_cq: "
		    "ibt_alloc_cq(cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto ctl_setup_cq_fail;
	}

	/*
	 * Set up other parameters for collecting completion information;
	 * ch_cq_sz is the real (possibly rounded-up) size ibt_alloc_cq
	 * gave us, and sizes the wc array used by the poll loop.
	 */
	chan->ch_cq_sz = sz;
	chan->ch_wc = kmem_zalloc(sizeof (ibt_wc_t) * sz, KM_SLEEP);

	/*
	 * Allocate soft interrupt for this vnic's control channel cq
	 * handler and set up the IBTL cq handler.
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_ctl_si_hdl,
	    EIB_SOFTPRI_CTL, eib_ctl_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_setup_cq: "
		    "ddi_intr_add_softint() failed for vnic %d ctl qp, ret=%d",
		    vnic->vn_instance, rv);
		goto ctl_setup_cq_fail;
	}

	/*
	 * Now, set up this vnic's control channel completion queue handler
	 */
	ibt_set_cq_handler(chan->ch_cq_hdl, eib_ctl_comp_intr, vnic);

	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_setup_cq: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
		goto ctl_setup_cq_fail;
	}

	return (EIB_E_SUCCESS);

ctl_setup_cq_fail:
	eib_rb_ctl_setup_cq(ss, vnic);
	return (EIB_E_FAILURE);
}

/*
 * eib_ctl_setup_ud_channel - allocate the UD channel for the control qp
 * (single-SGL, all-signaled, FIP qkey, send and recv sharing the one CQ)
 * and cache the queried channel attributes in the eib_chan_t.
 */
static int
eib_ctl_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	ibt_ud_chan_alloc_args_t alloc_attr;
	ibt_ud_chan_query_attr_t query_attr;
	ibt_status_t ret;

	bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t));
	bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t));

	alloc_attr.ud_flags = IBT_ALL_SIGNALED;
	alloc_attr.ud_hca_port_num = ss->ei_props->ep_port_num;
	alloc_attr.ud_pkey_ix = chan->ch_pkey_ix;
	alloc_attr.ud_sizes.cs_sq = EIB_CTL_MAX_SWQE;
	alloc_attr.ud_sizes.cs_rq = EIB_CTL_MAX_RWQE;
	alloc_attr.ud_sizes.cs_sq_sgl = 1;
	alloc_attr.ud_sizes.cs_rq_sgl = 1;
	alloc_attr.ud_sizes.cs_inline = 0;

	alloc_attr.ud_qkey = EIB_FIP_QKEY;
	alloc_attr.ud_scq = chan->ch_cq_hdl;
	alloc_attr.ud_rcq = chan->ch_cq_hdl;
	alloc_attr.ud_pd = ss->ei_pd_hdl;

	ret = ibt_alloc_ud_channel(ss->ei_hca_hdl, IBT_ACHAN_NO_FLAGS,
	    &alloc_attr, &chan->ch_chan, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_setup_ud_channel: "
		    "ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x) "
		    "failed, ret=%d", alloc_attr.ud_hca_port_num,
		    chan->ch_pkey_ix, ret);
		goto ctl_setup_ud_channel_fail;
	}

	ret = ibt_query_ud_channel(chan->ch_chan, &query_attr);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_setup_ud_channel: "
		    "ibt_query_ud_channel() failed, ret=%d", ret);
		goto ctl_setup_ud_channel_fail;
	}

	/* Cache the real channel attributes (may exceed what we asked) */
	chan->ch_qpn = query_attr.ud_qpn;
	chan->ch_max_swqes = query_attr.ud_chan_sizes.cs_sq;
	chan->ch_max_rwqes = query_attr.ud_chan_sizes.cs_rq;
	chan->ch_lwm_rwqes = chan->ch_max_rwqes >> 2;	/* low-water: 1/4 */
	chan->ch_rwqe_bktsz = chan->ch_max_rwqes;
	chan->ch_ip_hdr_align = 0;
	chan->ch_alloc_mp = B_FALSE;
	chan->ch_tear_down = B_FALSE;

	return (EIB_E_SUCCESS);

ctl_setup_ud_channel_fail:
	eib_rb_ctl_setup_ud_channel(ss, vnic);
	return (EIB_E_FAILURE);
}

/*
 * eib_ctl_comp_intr - IBTL CQ notification callback; simply triggers
 * the softint, where the real completion processing happens.
 */
static void
eib_ctl_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
{
	eib_vnic_t *vnic = arg;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_ctl_chan;

	if (cq_hdl != chan->ch_cq_hdl) {
		EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_ctl_comp_intr: "
		    "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), "
		    "ignoring completion", cq_hdl, chan->ch_cq_hdl);
		return;
	}

	ASSERT(vnic->vn_ctl_si_hdl != NULL);

	(void) ddi_intr_trigger_softint(vnic->vn_ctl_si_hdl, NULL);
}

/*
 * eib_ctl_rx_comp - process one received control message: parse the
 * FIP payload (past the GRH) and repost the rwqe, unless the channel
 * is being torn down or the repost fails, in which case the rwqe goes
 * back to the pool.
 */
static void
eib_ctl_rx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe)
{
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_ctl_chan;
	uint8_t *pkt = (uint8_t *)(uintptr_t)(wqe->qe_sgl.ds_va);
	ibt_status_t ret;

	/*
	 * Skip the GRH and parse the message in the packet
	 */
	(void) eib_fip_parse_ctl_pkt(pkt + EIB_GRH_SZ, vnic);

	/*
	 * Try to repost the rwqe.  For control channels, we take the
	 * shortcut and don't go through eib_chan_post_recv(), since we
	 * know that the qe_info flag, qe_chan and qe_vinst are all already
	 * set correctly; we just took this out of the rx queue, so the
	 * ch_rx_posted will be ok if we just posted it back.  And there
	 * are no mblk allocation or buffer alignment restrictions for
	 * this channel as well.
	 */
	if (chan->ch_tear_down) {
		eib_rsrc_return_rwqe(ss, wqe, chan);
	} else {
		ret = ibt_post_recv(chan->ch_chan, &(wqe->qe_wr.recv), 1, NULL);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_rx_comp: "
			    "ibt_post_recv() failed, ret=%d", ret);
			eib_rsrc_return_rwqe(ss, wqe, chan);
		}
	}
}

/*
 * eib_ctl_tx_comp - a control-channel send finished; return the swqe
 * to the pool.
 */
static void
eib_ctl_tx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe)
{
	eib_rsrc_return_swqe(vnic->vn_ss, wqe, vnic->vn_ctl_chan);
}

/*
 * eib_ctl_err_comp - handle an errored work completion: log anything
 * other than a flush (which is expected during teardown) and return
 * the wqe to its pool without reposting.
 */
static void
eib_ctl_err_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc)
{
	eib_t *ss = vnic->vn_ss;

	/*
	 * Currently, all we do is report
	 */
	switch (wc->wc_status) {
	case IBT_WC_WR_FLUSHED_ERR:
		break;

	case IBT_WC_LOCAL_CHAN_OP_ERR:
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_err_comp: "
		    "IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ",
		    wqe->qe_info);
		break;

	case IBT_WC_LOCAL_PROTECT_ERR:
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_ctl_err_comp: "
		    "IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ",
		    wqe->qe_info);
		break;
	}

	/*
	 * When a wc indicates error, we do not attempt to repost but
	 * simply return it to the wqe pool.
	 */
	if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX)
		eib_rsrc_return_rwqe(ss, wqe, vnic->vn_ctl_chan);
	else
		eib_rsrc_return_swqe(ss, wqe, vnic->vn_ctl_chan);
}

/*
 * eib_rb_ctl_setup_cq - rollback for eib_ctl_setup_cq(): unhook the
 * CQ handler, remove the softint, free the wc array and the CQ itself.
 * Safe to call with partially initialized state.
 */
/*ARGSUSED*/
static void
eib_rb_ctl_setup_cq(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	/*
	 * Reset any completion handler we may have set up
	 */
	if (chan->ch_cq_hdl)
		ibt_set_cq_handler(chan->ch_cq_hdl, NULL, NULL);

	/*
	 * Remove any softint we may have allocated for this cq
	 */
	if (vnic->vn_ctl_si_hdl) {
		(void) ddi_intr_remove_softint(vnic->vn_ctl_si_hdl);
		vnic->vn_ctl_si_hdl = NULL;
	}

	/*
	 * Release any work completion buffers we may have allocated
	 */
	if (chan->ch_wc && chan->ch_cq_sz)
		kmem_free(chan->ch_wc, sizeof (ibt_wc_t) * chan->ch_cq_sz);

	chan->ch_cq_sz = 0;
	chan->ch_wc = NULL;

	/*
	 * Free any completion queue we may have allocated
	 */
	if (chan->ch_cq_hdl) {
		ret = ibt_free_cq(chan->ch_cq_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_ctl_setup_cq: "
			    "ibt_free_cq() failed, ret=%d", ret);
		}
		chan->ch_cq_hdl = NULL;
	}
}

/*
 * eib_rb_ctl_setup_ud_channel - rollback for eib_ctl_setup_ud_channel().
 * Marks the channel as tearing down (so nothing reposts), flushes it,
 * waits for every posted tx and rx wqe to come back to the pool, then
 * frees the channel and clears the cached attributes.
 */
/*ARGSUSED*/
static void
eib_rb_ctl_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	if (chan->ch_chan) {
		/*
		 * We're trying to tear down this UD channel.  Make sure that
		 * we don't attempt to refill (repost) at any point from now
		 * on.
		 */
		chan->ch_tear_down = B_TRUE;
		if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_ctl_setup_ud_channel: "
			    "ibt_flush_channel() failed, ret=%d", ret);
		}

		/*
		 * Wait until all posted tx wqes on this channel are back with
		 * the wqe pool.
		 */
		mutex_enter(&chan->ch_tx_lock);
		while (chan->ch_tx_posted > 0)
			cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock);
		mutex_exit(&chan->ch_tx_lock);

		/*
		 * Wait until all posted rx wqes on this channel are back with
		 * the wqe pool.
		 */
		mutex_enter(&chan->ch_rx_lock);
		while (chan->ch_rx_posted > 0)
			cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock);
		mutex_exit(&chan->ch_rx_lock);

		/*
		 * Now we're ready to free this channel
		 */
		if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_ctl_setup_ud_channel: "
			    "ibt_free_channel() failed, ret=%d", ret);
		}

		chan->ch_alloc_mp = B_FALSE;
		chan->ch_ip_hdr_align = 0;
		chan->ch_rwqe_bktsz = 0;
		chan->ch_lwm_rwqes = 0;
		chan->ch_max_rwqes = 0;
		chan->ch_max_swqes = 0;
		chan->ch_qpn = 0;
		chan->ch_chan = NULL;
	}
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_data.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1496 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
 */

/*
 * eib_data.c - setup, completion handling and teardown of the per-vnic
 * EoIB "data" UD channel (the Ethernet-frame-carrying path plumbed to
 * the MAC layer).
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/pattr.h>		/* HCK_* */
#include <inet/ip.h>		/* ipha_t */
#include <inet/tcp.h>		/* tcph_t */
#include <sys/mac_provider.h>	/* mac_* */
#include <sys/strsun.h>		/* MBLKL */

#include <sys/ib/clients/eoib/eib_impl.h>

/*
 * Declarations private to this file
 */
static int eib_data_setup_cqs(eib_t *, eib_vnic_t *);
static int eib_data_setup_ud_channel(eib_t *, eib_vnic_t *);
static void eib_data_setup_lso(eib_wqe_t *, mblk_t *, uint32_t,
    eib_ether_hdr_t *);
static int eib_data_prepare_sgl(eib_vnic_t *, eib_wqe_t *, mblk_t *);
static int eib_data_is_mcast_pkt_ok(eib_vnic_t *, uint8_t *, uint64_t *,
    uint64_t *);
static void eib_data_rx_comp_intr(ibt_cq_hdl_t, void *);
static void eib_data_tx_comp_intr(ibt_cq_hdl_t, void *);
static mblk_t *eib_data_rx_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_data_tx_comp(eib_vnic_t *, eib_wqe_t *, eib_chan_t *);
static void eib_data_err_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_rb_data_setup_cqs(eib_t *, eib_vnic_t *);
static void eib_rb_data_setup_ud_channel(eib_t *, eib_vnic_t *);


/*
 * eib_data_create_qp - create the data qp for a vnic: channel state,
 * separate tx and rx CQs with handlers, and the UD channel.  On
 * failure, *err is set to ENOMEM and all partial setup is undone.
 */
int
eib_data_create_qp(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	eib_chan_t *chan = NULL;

	/*
	 * Allocate an eib_chan_t to store stuff about this vnic's data qp
	 * and initialize it with default admin qp pkey parameters. We'll
	 * re-associate this with the pkey we receive from the gw once we
	 * receive the login ack.
	 */
	vnic->vn_data_chan = eib_chan_init();

	chan = vnic->vn_data_chan;
	chan->ch_pkey = ss->ei_admin_chan->ch_pkey;
	chan->ch_pkey_ix = ss->ei_admin_chan->ch_pkey_ix;
	chan->ch_vnic_inst = vnic->vn_instance;

	/*
	 * Setup tx/rx CQs and completion handlers
	 */
	if (eib_data_setup_cqs(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
		    "eib_data_setup_cqs(vn_inst=0x%x) failed",
		    vnic->vn_instance);
		*err = ENOMEM;
		goto data_create_qp_fail;
	}

	/*
	 * Setup UD channel
	 */
	if (eib_data_setup_ud_channel(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
		    "eib_data_setup_ud_channel(vn_inst=0x%x) failed",
		    vnic->vn_instance);
		*err = ENOMEM;
		goto data_create_qp_fail;
	}

	return (EIB_E_SUCCESS);

data_create_qp_fail:
	eib_rb_data_create_qp(ss, vnic);
	return (EIB_E_FAILURE);
}

/*
 * eib_data_rx_comp_handler - softint handler for the data rx CQ.
 * Polls completions in bounded batches, builds an mblk chain from the
 * good ones, hands it to the MAC layer via mac_rx(), and batches the
 * statistics updates once per poll cycle.  Retriggers itself if too
 * many packets were handled in one invocation.
 */
/*ARGSUSED*/
uint_t
eib_data_rx_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_stats_t *stats = ss->ei_stats;
	ibt_wc_t *wc;
	eib_wqe_t *wqe;
	mblk_t *mp;
	mblk_t *head = NULL;
	mblk_t *tail = NULL;
	ibt_status_t ret;
	uint_t pkts_per_call = 0;
	uint_t polled;
	uint_t rbytes;
	uint_t ipkts;
	uint_t num_wc;
	int i;

	/*
	 * Re-arm the rx notification callback before we start polling
	 * the completion queue.  There's nothing much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * We don't want to be stuck in receive processing for too long
	 * without giving others a chance.
	 */
	num_wc = (chan->ch_rcv_cq_sz < EIB_MAX_RX_PKTS_ONINTR) ?
	    chan->ch_rcv_cq_sz : EIB_MAX_RX_PKTS_ONINTR;

	/*
	 * Handle rx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_rcv_cq_hdl, chan->ch_rcv_wc,
	    num_wc, &polled)) == IBT_SUCCESS) {

		rbytes = ipkts = 0;
		head = tail = NULL;

		for (wc = chan->ch_rcv_wc, i = 0; i < polled; i++, wc++) {
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;

			ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX);

			/*
			 * Clear the posted-to-hca flag and reduce the number
			 * of posted-rwqes count
			 */
			wqe->qe_info &= (~EIB_WQE_FLG_POSTED_TO_HCA);
			eib_rsrc_decr_posted_rwqe(ss, chan);

			rbytes += wc->wc_bytes_xfer;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				EIB_INCR_COUNTER(&stats->st_ierrors);
				eib_data_err_comp(vnic, wqe, wc);
			} else {
				ipkts++;
				mp = eib_data_rx_comp(vnic, wqe, wc);
				if (mp == NULL) {
					continue;
				} else {
					/*
					 * Add this mp to the list to
					 * send it to the nw layer. Note
					 * that the wqe could've been
					 * returned to the pool if we're
					 * running low, so don't process
					 * wqe after this point.
					 */
					if (head)
						tail->b_next = mp;
					else
						head = mp;
					tail = mp;
				}
			}
		}

		/*
		 * We reduce the number of atomic updates to key statistics
		 * by pooling them here, once per ibt_poll_cq(). The accuracy
		 * and consistency of the published statistics within a cq
		 * polling cycle will be compromised a little bit, but that
		 * should be ok, given that we probably gain a little bit by
		 * not having to do these atomic operations per packet.
		 */
		EIB_UPDATE_COUNTER(&stats->st_rbytes, rbytes);
		EIB_UPDATE_COUNTER(&stats->st_ipkts, ipkts);

		pkts_per_call += ipkts;

		if (head) {
			mac_rx(ss->ei_mac_hdl, NULL, head);
		}

		/*
		 * If we have processed too many packets in one attempt, we'll
		 * have to come back here later.
		 */
		if (pkts_per_call >= EIB_MAX_RX_PKTS_ONINTR) {
			(void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl,
			    NULL);
			break;
		}

		/*
		 * NOTE(review): the remaining poll budget shrinks by the
		 * amount polled each cycle; since only good packets count
		 * towards pkts_per_call, num_wc could in principle reach 0
		 * before the break above trips -- confirm ibt_poll_cq()'s
		 * behavior for a zero-sized wc array.
		 */
		num_wc -= polled;
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * eib_data_tx_comp_handler - softint handler for the data tx CQ.
 * Drains the CQ, returning each swqe to the pool (or to the error
 * path).  Always returns DDI_INTR_CLAIMED.
 */
/*ARGSUSED*/
uint_t
eib_data_tx_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_stats_t *stats = ss->ei_stats;
	ibt_wc_t *wc;
	eib_wqe_t *wqe;
	ibt_status_t ret;
	uint_t polled;
	int i;

	/*
	 * Re-arm the tx notification callback before we start polling
	 * the completion queue.  There's nothing much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_tx_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * Handle tx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_cq_hdl, chan->ch_wc, chan->ch_cq_sz,
	    &polled)) == IBT_SUCCESS) {
		for (wc = chan->ch_wc, i = 0; i < polled; i++, wc++) {
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;

			ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_TX);

			if (wc->wc_status != IBT_WC_SUCCESS) {
				EIB_INCR_COUNTER(&stats->st_oerrors);
				eib_data_err_comp(vnic, wqe, wc);
			} else {
				eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan);
			}
		}
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * eib_data_rx_recycle - esballoc-style free callback for rx buffers
 * loaned to the network layer.  Reposts the rwqe when it is safe to do
 * so, otherwise returns it to the wqe pool.
 */
void
eib_data_rx_recycle(caddr_t arg)
{
	eib_wqe_t *rwqe = (eib_wqe_t *)(void *)arg;
	eib_t *ss = rwqe->qe_pool->wp_ss;
	eib_chan_t *vn_chan;
	uint_t nic_state;
	int ret;

	/*
	 * We come here from three places - (a) from the nw layer if the
	 * rx mblk we handed to it has been done with and the nw layer is
	 * calling the freemsg() (b) from eib_data_rx_comp() if the rx
	 * completion processing discovers that the received EoIB packet
	 * has a problem and (c) from eib_data_err_comp() if we're tearing
	 * down this channel.  We only need to repost the rwqe if we're
	 * being called back from the nw layer.  For the other two cases,
	 * we'll simply return the rwqe to the pool.  Also, since we would've
	 * already updated the ch_rx_posted counters in the rx completion
	 * handler, we don't pass the chan pointer to eib_rsrc_return_rwqe
	 * from within this routine.
	 */
	rwqe->qe_mp = NULL;
	if ((rwqe->qe_info & EIB_WQE_FLG_WITH_NW) == 0) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	rwqe->qe_info &= (~EIB_WQE_FLG_WITH_NW);

	/*
	 * If the buffers are being returned by nw layer after a long
	 * time, this eoib instance could've even been stopped by now.
	 * If so, simply return the rwqe to the pool.
	 */
	nic_state = eib_mac_get_nic_state(ss);
	if ((nic_state & EIB_NIC_STARTED) != EIB_NIC_STARTED) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	/*
	 * Or it could've taken even longer, and the nic has even been
	 * restarted.  Only thing we can do is to make sure that the
	 * original channel pointer we passed corresponds to what's in
	 * the instance of the vnic currently.
	 */
	vn_chan = eib_vnic_get_data_chan(ss, rwqe->qe_vnic_inst);
	if (vn_chan == NULL || vn_chan != rwqe->qe_chan) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	/*
	 * Try to repost the rwqe if we're not tearing down this channel
	 */
	if (vn_chan->ch_tear_down) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
	} else {
		ret = eib_chan_post_recv(ss, vn_chan, rwqe);
		if (ret != EIB_E_SUCCESS) {
			/*
			 * NOTE(review): on repost failure, a non-NULL qe_mp
			 * is freed (presumably re-entering this callback for
			 * the final return to pool) -- confirm against
			 * eib_chan_post_recv()'s mp-allocation behavior.
			 */
			if (rwqe->qe_mp)
				freemsg(rwqe->qe_mp);
			else
				eib_rsrc_return_rwqe(ss, rwqe, NULL);
		}
	}
}

/*
 * eib_data_post_tx - queue a prepared swqe on the channel's tx list and
 * post the list to the HCA in batches of EIB_MAX_POST_MULTIPLE.  Only
 * one thread drains the list at a time (ch_tx_busy); others just append
 * and leave.  Failed posts are retried singly, then dropped with their
 * resources reclaimed and st_oerrors updated.
 */
void
eib_data_post_tx(eib_vnic_t *vnic, eib_wqe_t *swqe)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;
	eib_stats_t *stats = vnic->vn_ss->ei_stats;
	ibt_send_wr_t wrs[EIB_MAX_POST_MULTIPLE];
	eib_wqe_t *wqes[EIB_MAX_POST_MULTIPLE];
	eib_wqe_t *elem;
	ibt_status_t ret;
	uint_t n_wrs;
	uint_t n_posted;
	uint_t total_failed = 0;
	uint_t n_failed = 0;
	uint_t i;

	/*
	 * See if we have room for this wqe and then add it to the
	 * list of tx wrs to post in this channel.
	 */
	mutex_enter(&chan->ch_tx_lock);

	if ((chan->ch_tx_posted + 1) >= (chan->ch_max_swqes - 1)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
		    "too many swqes posted already, posted=0x%lx, "
		    "max=0x%lx", chan->ch_tx_posted, chan->ch_max_swqes);
		mutex_exit(&chan->ch_tx_lock);
		return;
	}

	swqe->qe_nxt_post = NULL;
	if (chan->ch_tx) {
		chan->ch_tx_tail->qe_nxt_post = swqe;
	} else {
		chan->ch_tx = swqe;
	}
	chan->ch_tx_tail = swqe;
	chan->ch_tx_posted++;		/* pre-increment */

	/*
	 * If someone's already posting tx wqes in this channel, let
	 * them post ours as well.
	 */
	if (chan->ch_tx_busy == B_TRUE) {
		mutex_exit(&chan->ch_tx_lock);
		return;
	}
	chan->ch_tx_busy = B_TRUE;

	while (chan->ch_tx) {
		/*
		 * Post EIB_MAX_POST_MULTIPLE wrs at a time
		 */
		for (n_wrs = 0, elem = chan->ch_tx;
		    (elem) && (n_wrs < EIB_MAX_POST_MULTIPLE);
		    elem = elem->qe_nxt_post, n_wrs++) {
			wqes[n_wrs] = elem;
			wrs[n_wrs] = (elem->qe_wr).send;
		}
		chan->ch_tx = elem;
		if (elem == NULL) {
			chan->ch_tx_tail = NULL;
		}
		mutex_exit(&chan->ch_tx_lock);

		ASSERT(n_wrs != 0);

		/*
		 * If multiple wrs posting fails for some reason, we'll try
		 * posting the unposted ones one by one.  If even that fails,
		 * we'll release any mappings/buffers/mblks associated with
		 * this wqe and return it to the pool.
		 */
		n_posted = n_failed = 0;
		ret = ibt_post_send(chan->ch_chan, wrs, n_wrs, &n_posted);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
			    "ibt_post_send(n_wrs=0x%lx, n_posted=0x%lx) "
			    "failed, ret=%d", n_wrs, n_posted, ret);

			for (i = n_posted; i < n_wrs; i++) {
				ret = ibt_post_send(chan->ch_chan, &wrs[i],
				    1, NULL);
				if (ret != IBT_SUCCESS) {
					n_failed++;
					eib_data_tx_comp(vnic, wqes[i], chan);

					EIB_DPRINTF_WARN(ss->ei_instance,
					    "eib_data_post_tx: "
					    "ibt_post_send(n_wrs=1) failed, "
					    "ret=%d", ret);
				}
			}
		}
		total_failed += n_failed;

		mutex_enter(&chan->ch_tx_lock);
	}

	chan->ch_tx_busy = B_FALSE;
	mutex_exit(&chan->ch_tx_lock);

	/*
	 * If we failed to post something, update error stats
	 */
	if (total_failed) {
		EIB_UPDATE_COUNTER(&stats->st_oerrors, total_failed);
	}
}

/*
 * eib_data_parse_ether_hdr - decompose the (possibly VLAN-tagged)
 * ethernet header at the head of mp into the eib_ether_hdr_t.
 */
void
eib_data_parse_ether_hdr(mblk_t *mp, eib_ether_hdr_t *evh)
{
	struct ether_vlan_header *vl_hdr;
	struct ether_header *hdr;

	/*
	 * Assume that the ether header (with or without vlan tag) is
	 * contained in one fragment
	 */
	hdr = (struct ether_header *)(void *)mp->b_rptr;
	vl_hdr = (struct ether_vlan_header *)(void *)mp->b_rptr;

	evh->eh_ether_type = ntohs(hdr->ether_type);
	if (evh->eh_ether_type != ETHERTYPE_VLAN) {
		evh->eh_tagless = 1;
		evh->eh_vlan = 0;
		ether_copy((void *)hdr->ether_dhost.ether_addr_octet,
		    (void *)evh->eh_dmac);
		ether_copy((void *)hdr->ether_shost.ether_addr_octet,
		    (void *)evh->eh_smac);
	} else {
		/* Tagged frame: real ethertype follows the VLAN tci */
		evh->eh_ether_type = ntohs(vl_hdr->ether_type);
		evh->eh_tagless = 0;
		evh->eh_vlan = VLAN_ID(ntohs(vl_hdr->ether_tci));
		ether_copy((void *)vl_hdr->ether_dhost.ether_addr_octet,
		    (void *)evh->eh_dmac);
		ether_copy((void *)vl_hdr->ether_shost.ether_addr_octet,
		    (void *)evh->eh_smac);
	}
}

/*
 * eib_data_lookup_vnic - find the active vnic matching {mac, vlan}.
 * Optionally reports (via *failed) whether a creation request for this
 * very {mac, vlan} has previously failed.  Returns EIB_E_SUCCESS with
 * *vnicp set on a hit, EIB_E_FAILURE otherwise.
 */
int
eib_data_lookup_vnic(eib_t *ss, uint8_t *mac, uint16_t vlan, eib_vnic_t **vnicp,
    boolean_t *failed)
{
	eib_vnic_t *vnic;
	eib_vnic_req_t *vrq;
	uint8_t *vn_mac;
	uint16_t vn_vlan;
	uint64_t av;
	int inst = 0;

	if (mac == NULL)
		return (EIB_E_FAILURE);

	/*
	 * For now, a simple search (but only what we've allocated). Note that
	 * if we're in the process of creating a vnic, the instance might've
	 * been allocated, but the vnic entry would be NULL.
	 */
	mutex_enter(&ss->ei_vnic_lock);
	av = ss->ei_active_vnics;
	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
		if ((vnic = ss->ei_vnic[inst]) != NULL) {
			vn_mac = vnic->vn_login_data.ld_assigned_mac;
			vn_vlan = vnic->vn_login_data.ld_assigned_vlan;

			if ((vn_vlan == vlan) &&
			    (bcmp(vn_mac, mac, ETHERADDRL) == 0)) {
				if (vnicp) {
					*vnicp = vnic;
				}
				mutex_exit(&ss->ei_vnic_lock);
				return (EIB_E_SUCCESS);
			}
		}

		/* Clear this instance's bit and keep scanning */
		av &= (~((uint64_t)1 << inst));
	}
	mutex_exit(&ss->ei_vnic_lock);

	/*
	 * If we haven't been able to locate a vnic for this {mac,vlan} tuple,
	 * see if we've already failed a creation request for this vnic, and
	 * return that information.
	 */
	if (failed) {
		mutex_enter(&ss->ei_vnic_req_lock);
		*failed = B_FALSE;
		for (vrq = ss->ei_failed_vnic_req; vrq; vrq = vrq->vr_next) {
			if ((vrq->vr_vlan == vlan) &&
			    (bcmp(vrq->vr_mac, mac, ETHERADDRL) == 0)) {
				*failed = B_TRUE;
			}
		}
		mutex_exit(&ss->ei_vnic_req_lock);
	}

	return (EIB_E_FAILURE);
}

/*
 * eib_data_prepare_frame - finish preparing a tx swqe for the frame in
 * mp: set up LSO if requested, propagate the checksum-offload flag into
 * the work request, and build the SGL.  Returns EIB_E_FAILURE if the
 * SGL cannot be prepared.
 */
int
eib_data_prepare_frame(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp,
    eib_ether_hdr_t *evh)
{
	uint32_t mss;
	uint32_t lsoflags;
	uint32_t hckflags;

	/*
	 * The swqe defaults are set to use the regular ud work request
	 * member and the IBT_WRC_SEND opcode, so we don't need to do
	 * anything here if this isn't an LSO packet.
	 */
	mac_lso_get(mp, &mss, &lsoflags);
	if ((lsoflags & HW_LSO) == HW_LSO)
		eib_data_setup_lso(swqe, mp, mss, evh);

	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) {
		swqe->qe_wr.send.wr_flags |= IBT_WR_SEND_CKSUM;
	} else {
		swqe->qe_wr.send.wr_flags &= (~IBT_WR_SEND_CKSUM);
	}

	if (eib_data_prepare_sgl(vnic, swqe, mp) != 0)
		return (EIB_E_FAILURE);

	/* The mblk is freed at tx completion, so hold on to it here */
	swqe->qe_mp = mp;

	return (EIB_E_SUCCESS);
}

/*
 * eib_rb_data_create_qp - rollback for eib_data_create_qp(); tears down
 * the UD channel and CQs, then frees the channel state.
 */
void
eib_rb_data_create_qp(eib_t *ss, eib_vnic_t *vnic)
{
	eib_rb_data_setup_ud_channel(ss, vnic);

	eib_rb_data_setup_cqs(ss, vnic);

	eib_chan_fini(vnic->vn_data_chan);
	vnic->vn_data_chan = NULL;
}

/*
 * eib_data_setup_cqs - allocate the separate tx and rx CQs (with
 * interrupt-moderation hints via ibt_modify_cq, non-fatal on failure),
 * their wc arrays, softints and handlers, and arm both CQs.  Rolls
 * itself back on failure.
 */
static int
eib_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_cq_attr_t cq_attr;
	ibt_status_t ret;
	uint_t snd_sz;
	uint_t rcv_sz;
	int rv;

	/*
	 * Allocate send completion queue.  Note that we've already verified
	 * that cp_max_swqe and cp_max_rwqe meet the max cq size requirements
	 * of the hca.
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	cq_attr.cq_size = ss->ei_caps->cp_max_swqe + 1;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_cq_hdl, &snd_sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_alloc_cq(snd_cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto setup_data_cqs_fail;
	}
	/* Interrupt-moderation hint only; a failure here is non-fatal */
	ret = ibt_modify_cq(chan->ch_cq_hdl, EIB_TX_COMP_COUNT,
	    EIB_TX_COMP_USEC, 0);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_modify_cq(snd_comp_count=0x%lx, snd_comp_usec=0x%lx) "
		    "failed, ret=%d",
		    EIB_TX_COMP_COUNT, EIB_TX_COMP_USEC, ret);
	}

	/*
	 * Allocate receive completion queue
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	cq_attr.cq_size = ss->ei_caps->cp_max_rwqe + 1;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_rcv_cq_hdl,
	    &rcv_sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_alloc_cq(rcv_cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto setup_data_cqs_fail;
	}
	ret = ibt_modify_cq(chan->ch_rcv_cq_hdl, EIB_RX_COMP_COUNT,
	    EIB_RX_COMP_USEC, 0);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_modify_cq(rcv_comp_count=0x%lx, rcv_comp_usec=0x%lx) "
		    "failed, ret=%d",
		    EIB_RX_COMP_COUNT, EIB_RX_COMP_USEC, ret);
	}

	/*
	 * Set up parameters for collecting tx and rx completion information
	 */
	chan->ch_cq_sz = snd_sz;
	chan->ch_wc = kmem_zalloc(sizeof (ibt_wc_t) * snd_sz, KM_SLEEP);
	chan->ch_rcv_cq_sz = rcv_sz;
	chan->ch_rcv_wc = kmem_zalloc(sizeof (ibt_wc_t) * rcv_sz, KM_SLEEP);

	/*
	 * Set up the vnic's data tx completion queue handler and allocate
	 * a softint for it as well.
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_tx_si_hdl,
	    EIB_SOFTPRI_DATA, eib_data_tx_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ddi_intr_add_softint() failed for data tx qp, ret=%d", rv);
		goto setup_data_cqs_fail;
	}
	ibt_set_cq_handler(chan->ch_cq_hdl, eib_data_tx_comp_intr, vnic);
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_enable_cq_notify() failed for tx cq, ret=%d", ret);
		goto setup_data_cqs_fail;
	}

	/*
	 * And then the data rx completion queue handler
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_rx_si_hdl,
	    EIB_SOFTPRI_DATA, eib_data_rx_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ddi_intr_add_softint() failed for data rx qp, ret=%d", rv);
		goto setup_data_cqs_fail;
	}
	ibt_set_cq_handler(chan->ch_rcv_cq_hdl, eib_data_rx_comp_intr, vnic);
	ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_enable_cq_notify() failed for rx cq, ret=%d", ret);
		goto setup_data_cqs_fail;
	}

	return (EIB_E_SUCCESS);

setup_data_cqs_fail:
	eib_rb_data_setup_cqs(ss, vnic);
	return (EIB_E_FAILURE);
}

/*
 * eib_data_setup_ud_channel - allocate the data UD channel, enabling
 * reserved-lkey and LSO usage when the HCA capabilities allow.
 *
 * NOTE(review): this definition continues past the end of this excerpt;
 * only its head is visible (and reproduced) here.
 */
static int
eib_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_ud_chan_alloc_args_t alloc_attr;
	ibt_ud_chan_query_attr_t query_attr;
	ibt_status_t ret;

	bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t));
	bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t));

	alloc_attr.ud_flags = IBT_ALL_SIGNALED;
	if (ss->ei_caps->cp_resv_lkey_capab)
		alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
	if (ss->ei_caps->cp_lso_maxlen)
		alloc_attr.ud_flags |= IBT_USES_LSO;

	alloc_attr.ud_hca_port_num = ss->ei_props->ep_port_num;
+ alloc_attr.ud_pkey_ix = chan->ch_pkey_ix; + alloc_attr.ud_sizes.cs_sq = ss->ei_caps->cp_max_swqe; + alloc_attr.ud_sizes.cs_rq = ss->ei_caps->cp_max_rwqe; + alloc_attr.ud_sizes.cs_sq_sgl = ss->ei_caps->cp_max_sgl; + alloc_attr.ud_sizes.cs_rq_sgl = 1; + alloc_attr.ud_sizes.cs_inline = 0; + + alloc_attr.ud_qkey = EIB_DATA_QKEY; + alloc_attr.ud_scq = chan->ch_cq_hdl; + alloc_attr.ud_rcq = chan->ch_rcv_cq_hdl; + alloc_attr.ud_pd = ss->ei_pd_hdl; + + ret = ibt_alloc_ud_channel(ss->ei_hca_hdl, IBT_ACHAN_NO_FLAGS, + &alloc_attr, &chan->ch_chan, NULL); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: " + "ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x, " + "cs_sq=0x%lx, cs_rq=0x%lx, sq_sgl=0x%lx) failed, ret=%d", + alloc_attr.ud_hca_port_num, chan->ch_pkey_ix, + alloc_attr.ud_sizes.cs_sq, alloc_attr.ud_sizes.cs_rq, + alloc_attr.ud_sizes.cs_sq_sgl, ret); + + goto setup_data_ud_channel_fail; + } + + ret = ibt_query_ud_channel(chan->ch_chan, &query_attr); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: " + "ibt_query_ud_channel() failed, ret=%d", ret); + goto setup_data_ud_channel_fail; + } + + chan->ch_qpn = query_attr.ud_qpn; + chan->ch_max_swqes = query_attr.ud_chan_sizes.cs_sq; + chan->ch_max_rwqes = query_attr.ud_chan_sizes.cs_rq; + chan->ch_lwm_rwqes = chan->ch_max_rwqes >> 2; + chan->ch_rwqe_bktsz = (chan->ch_max_rwqes < EIB_DATA_RWQE_BKT) ? 
+ chan->ch_max_rwqes : EIB_DATA_RWQE_BKT; + chan->ch_ip_hdr_align = EIB_IP_HDR_ALIGN; + chan->ch_alloc_mp = B_TRUE; + chan->ch_tear_down = B_FALSE; + + return (EIB_E_SUCCESS); + +setup_data_ud_channel_fail: + eib_rb_data_setup_ud_channel(ss, vnic); + return (EIB_E_FAILURE); +} + +static void +eib_data_setup_lso(eib_wqe_t *swqe, mblk_t *mp, uint32_t mss, + eib_ether_hdr_t *evh) +{ + ibt_wr_lso_t *lso; + mblk_t *nmp; + uint8_t *dst; + uintptr_t ip_start; + uintptr_t tcp_start; + uint_t pending; + uint_t mblen; + uint_t eth_hdr_len; + uint_t ip_hdr_len; + uint_t tcp_hdr_len; + + /* + * When the swqe was grabbed, it would've had its wr_opcode and + * wr.ud.udwr_dest set to default values. Since we're now going + * to use LSO, we need to change these. + */ + swqe->qe_wr.send.wr_opcode = IBT_WRC_SEND_LSO; + lso = &(swqe->qe_wr.send.wr.ud_lso); + lso->lso_ud_dest = swqe->qe_dest; + lso->lso_mss = mss; + + /* + * Details on the ethernet header in the mp is already known to us + */ + eth_hdr_len = (evh->eh_tagless) ? (sizeof (struct ether_header)) : + (sizeof (struct ether_vlan_header)); + + /* + * Calculate the LSO header size and set it in the UD LSO structure. + * Note that the only assumption we make is that each of the Ethernet, + * IP and TCP headers will be contained in a single mblk fragment; + * together, the headers may span multiple mblk fragments. Note also + * that since the EoIB encapsulation header is not part of the message + * block we receive, we'll need to account space for inserting it later. 
+ */ + nmp = mp; + ip_start = (uintptr_t)(nmp->b_rptr) + eth_hdr_len; + if (ip_start >= (uintptr_t)(nmp->b_wptr)) { + ip_start = (uintptr_t)nmp->b_cont->b_rptr + + (ip_start - (uintptr_t)(nmp->b_wptr)); + nmp = nmp->b_cont; + } + ip_hdr_len = IPH_HDR_LENGTH((ipha_t *)ip_start); + + tcp_start = ip_start + ip_hdr_len; + if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { + tcp_start = (uintptr_t)nmp->b_cont->b_rptr + + (tcp_start - (uintptr_t)(nmp->b_wptr)); + nmp = nmp->b_cont; + } + tcp_hdr_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); + + /* + * Since the passed mp fragment never contains the EoIB encapsulation + * header, we always have to copy the lso header. Sigh. + */ + lso->lso_hdr = swqe->qe_payload_hdr; + lso->lso_hdr_sz = EIB_ENCAP_HDR_SZ + eth_hdr_len + + ip_hdr_len + tcp_hdr_len; + + /* + * We already have the EoIB encapsulation header written at the + * start of wqe->qe_payload_hdr during swqe acquisition. Only + * copy the remaining headers. + */ + dst = lso->lso_hdr + EIB_ENCAP_HDR_SZ; + pending = lso->lso_hdr_sz - EIB_ENCAP_HDR_SZ; + + for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { + mblen = MBLKL(nmp); + if (pending > mblen) { + bcopy(nmp->b_rptr, dst, mblen); + dst += mblen; + pending -= mblen; + } else { + bcopy(nmp->b_rptr, dst, pending); + break; + } + } +} + +static int +eib_data_prepare_sgl(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp) +{ + eib_t *ss = vnic->vn_ss; + eib_stats_t *stats = vnic->vn_ss->ei_stats; + ibt_iov_t iov_arr[EIB_MAX_SGL]; + ibt_iov_attr_t iov_attr; + ibt_wr_ds_t *sgl; + ibt_status_t ret; + mblk_t *nmp; + mblk_t *data_mp; + uchar_t *bufp; + size_t blksize; + size_t skip; + size_t avail; + uint_t lsohdr_sz; + uint_t pktsz; + ptrdiff_t frag_len; + uint_t pending_hdr; + uint_t nblks; + uint_t i; + + /* + * Let's skip ahead to the TCP data if this is LSO. Note that while + * the lso header size in the swqe includes the EoIB encapsulation + * header size, that encapsulation header itself won't be found in + * the mblk. 
+ */ + lsohdr_sz = (swqe->qe_wr.send.wr_opcode == IBT_WRC_SEND) ? 0 : + swqe->qe_wr.send.wr.ud_lso.lso_hdr_sz; + + data_mp = mp; + pending_hdr = 0; + if (lsohdr_sz) { + pending_hdr = lsohdr_sz - EIB_ENCAP_HDR_SZ; + for (nmp = mp; nmp; nmp = nmp->b_cont) { + frag_len = + (uintptr_t)nmp->b_wptr - (uintptr_t)nmp->b_rptr; + if (frag_len > pending_hdr) + break; + pending_hdr -= frag_len; + } + data_mp = nmp; /* start of data past lso header */ + ASSERT(data_mp != NULL); + } + + /* + * If this is an LSO packet, we want pktsz to hold the size of the + * data following the eoib/ethernet/tcp/ip headers. If this is a + * non-LSO packet, we want pktsz to refer to the size of the entire + * packet with all the headers, and nblks to hold the number of + * mappings we'll need to iov map this (for reserved lkey request). + */ + if (lsohdr_sz == 0) { + nblks = 1; + pktsz = EIB_ENCAP_HDR_SZ; + } else { + nblks = 0; + pktsz = 0; + } + for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { + pktsz += MBLKL(nmp); + nblks++; + } + pktsz -= pending_hdr; + + EIB_UPDATE_COUNTER(&stats->st_obytes, pktsz); + EIB_INCR_COUNTER(&stats->st_opkts); + + /* + * We only do ibt_map_mem_iov() if the pktsz is above the tx copy + * threshold and if the number of mp fragments is less than the + * maximum acceptable. 
+ */ + if ((ss->ei_caps->cp_resv_lkey_capab) && (pktsz > EIB_TX_COPY_THRESH) && + (nblks < ss->ei_caps->cp_hiwm_sgl)) { + + iov_attr.iov_as = NULL; + iov_attr.iov = iov_arr; + iov_attr.iov_buf = NULL; + iov_attr.iov_list_len = nblks; + iov_attr.iov_wr_nds = ss->ei_caps->cp_max_sgl; + iov_attr.iov_lso_hdr_sz = lsohdr_sz; + iov_attr.iov_flags = IBT_IOV_SLEEP; + + i = 0; + if (lsohdr_sz == 0) { + iov_arr[i].iov_addr = (caddr_t)swqe->qe_payload_hdr; + iov_arr[i].iov_len = EIB_ENCAP_HDR_SZ; + i++; + } + for (nmp = data_mp; i < nblks; i++, nmp = nmp->b_cont) { + iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; + iov_arr[i].iov_len = MBLKL(nmp); + if (nmp == data_mp) { + iov_arr[i].iov_addr += pending_hdr; + iov_arr[i].iov_len -= pending_hdr; + } + } + swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_MAPPED; + swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl; + + ret = ibt_map_mem_iov(ss->ei_hca_hdl, &iov_attr, + &swqe->qe_wr, &swqe->qe_iov_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_data_prepare_sgl: " + "ibt_map_mem_iov(nblks=0x%lx) failed, ret=%d ", + "attempting to use copy path", nblks, ret); + goto prepare_sgl_copy_path; + } + + return (EIB_E_SUCCESS); + } + +prepare_sgl_copy_path: + if (pktsz <= swqe->qe_bufsz) { + swqe->qe_wr.send.wr_nds = 1; + swqe->qe_wr.send.wr_sgl = &swqe->qe_sgl; + swqe->qe_sgl.ds_len = pktsz; + + /* + * Even though this is the copy path for transfers less than + * qe_bufsz, it could still be an LSO packet. If so, we only + * have to write the data following all the headers into the + * work request buffer, since we'll be sending the lso header + * itself separately. If this is not an LSO send (but pkt size + * greater than mtu, say for a jumbo frame), then we need + * to write all the headers including EoIB encapsulation, + * into the work request buffer. 
+ */ + bufp = (uchar_t *)(uintptr_t)swqe->qe_sgl.ds_va; + if (lsohdr_sz == 0) { + *(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR); + bufp += EIB_ENCAP_HDR_SZ; + } + for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { + blksize = MBLKL(nmp) - pending_hdr; + bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); + bufp += blksize; + pending_hdr = 0; + } + + /* + * If the ethernet frame we're going to send is less than + * ETHERMIN, pad up the buffer to ETHERMIN (with zeros) + */ + if ((pktsz + lsohdr_sz) < (ETHERMIN + EIB_ENCAP_HDR_SZ)) { + bzero(bufp, (ETHERMIN + EIB_ENCAP_HDR_SZ) - + (pktsz + lsohdr_sz)); + swqe->qe_sgl.ds_len = ETHERMIN + EIB_ENCAP_HDR_SZ; + } + return (EIB_E_SUCCESS); + } + + /* + * Copy path for transfers greater than swqe->qe_bufsz + */ + swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl; + if (eib_rsrc_grab_lsobufs(ss, pktsz, swqe->qe_wr.send.wr_sgl, + &(swqe->qe_wr.send.wr_nds)) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_prepare_sgl: " + "eib_rsrc_grab_lsobufs() failed"); + return (EIB_E_FAILURE); + } + swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_LSO; + + /* + * Copy the larger-than-qe_buf_sz packet into a set of fixed-sized, + * pre-mapped LSO buffers. Note that we might need to skip part of + * the LSO header in the first fragment as before. + */ + nmp = data_mp; + skip = pending_hdr; + for (i = 0; i < swqe->qe_wr.send.wr_nds; i++) { + sgl = swqe->qe_wr.send.wr_sgl + i; + bufp = (uchar_t *)(uintptr_t)sgl->ds_va; + avail = EIB_LSO_BUFSZ; + + /* + * If this is a non-LSO packet (perhaps a jumbo frame?) + * we may still need to prefix the EoIB header in the + * wr buffer. 
+ */ + if ((i == 0) && (lsohdr_sz == 0)) { + *(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR); + bufp += EIB_ENCAP_HDR_SZ; + avail -= EIB_ENCAP_HDR_SZ; + } + + while (nmp && avail) { + blksize = MBLKL(nmp) - skip; + if (blksize > avail) { + bcopy(nmp->b_rptr + skip, bufp, avail); + skip += avail; + avail = 0; + } else { + bcopy(nmp->b_rptr + skip, bufp, blksize); + skip = 0; + bufp += blksize; + avail -= blksize; + nmp = nmp->b_cont; + } + } + } + + return (EIB_E_SUCCESS); +} + +/*ARGSUSED*/ +static int +eib_data_is_mcast_pkt_ok(eib_vnic_t *vnic, uint8_t *macaddr, uint64_t *brdcst, + uint64_t *multicst) +{ + /* + * If the dmac is a broadcast packet, let it through. Otherwise, either + * we should be in promiscuous mode or the dmac should be in our list of + * joined multicast addresses. Currently we only update the stat + * counters and always let things through. + */ + if (bcmp(macaddr, eib_broadcast_mac, ETHERADDRL) == 0) + EIB_INCR_COUNTER(brdcst); + else + EIB_INCR_COUNTER(multicst); + + return (1); +} + +static void +eib_data_rx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg) +{ + eib_vnic_t *vnic = arg; + eib_chan_t *chan = vnic->vn_data_chan; + eib_t *ss = vnic->vn_ss; + + if (cq_hdl != chan->ch_rcv_cq_hdl) { + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_rx_comp_intr: " + "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), " + "ignoring completion", cq_hdl, chan->ch_cq_hdl); + return; + } + + ASSERT(vnic->vn_data_rx_si_hdl != NULL); + + (void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl, NULL); +} + +static void +eib_data_tx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg) +{ + eib_vnic_t *vnic = arg; + eib_chan_t *chan = vnic->vn_data_chan; + eib_t *ss = vnic->vn_ss; + + if (cq_hdl != chan->ch_cq_hdl) { + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_tx_comp_intr: " + "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), " + "ignoring completion", cq_hdl, chan->ch_cq_hdl); + return; + } + + ASSERT(vnic->vn_data_tx_si_hdl != NULL); + + (void) 
ddi_intr_trigger_softint(vnic->vn_data_tx_si_hdl, NULL); +} + +static mblk_t * +eib_data_rx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc) +{ + eib_t *ss = vnic->vn_ss; + eib_chan_t *chan = vnic->vn_data_chan; + eib_login_data_t *ld = &vnic->vn_login_data; + eib_stats_t *stats = ss->ei_stats; + eib_ether_hdr_t evh; + mblk_t *mp; + boolean_t allocd_mp = B_FALSE; + uint_t ec_hdr; + uint_t ec_sign; + uint_t ec_ver; + uint_t ec_tu_cs; + uint_t ec_ip_cs; + + /* + * Before we process this mblk and send it up to network layer, see + * if we're running low on rwqes in the wqe pool. If so, allocate a + * new mblk, copy the received data into it and send it up (and return + * the current rwqe back to the pool immediately by calling freemsg() + * on the original mblk). + */ + if (!eib_rsrc_rxpool_low(wqe)) { + mp = wqe->qe_mp; + } else { + if ((mp = allocb(wc->wc_bytes_xfer, BPRI_HI)) != NULL) { + bcopy(wqe->qe_mp->b_rptr, mp->b_rptr, + wc->wc_bytes_xfer); + freemsg(wqe->qe_mp); + allocd_mp = B_TRUE; + } else { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "wqe level below watermark, dropping rx pkt"); + EIB_INCR_COUNTER(&stats->st_norcvbuf); + freemsg(wqe->qe_mp); + return (NULL); + } + } + + /* + * Adjust write pointer depending on how much data came in. Note that + * since the nw layer will expect us to hand over the mp with the + * ethernet header starting at mp->b_rptr, update the b_rptr as well. + */ + mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; + + /* + * We have a problem if this really happens! + */ + if (mp->b_next != NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "received packet's b_next not NULL, possible dup from cq"); + mp->b_next = NULL; + } + + /* + * Drop loopback packets ? 
+ */ + if ((wc->wc_slid == ss->ei_props->ep_blid) && + (wc->wc_qpn == chan->ch_qpn)) { + goto data_rx_comp_fail; + } + + mp->b_rptr += EIB_GRH_SZ; + + /* + * Since the recv buffer has been aligned for IP header to start on + * a word boundary, it is safe to say that the EoIB and ethernet + * headers won't start on a word boundary. + */ + bcopy(mp->b_rptr, &ec_hdr, EIB_ENCAP_HDR_SZ); + + /* + * Check EoIB signature and version + */ + ec_hdr = ntohl(ec_hdr); + + ec_sign = (ec_hdr >> EIB_ENCAP_SIGN_SHIFT) & EIB_ENCAP_SIGN_MASK; + if (ec_sign != EIB_EH_SIGNATURE) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "EoIB encapsulation header signature (0x%lx) unknown", + ec_sign); + goto data_rx_comp_fail; + } + + ec_ver = (ec_hdr >> EIB_ENCAP_VER_SHIFT) & EIB_ENCAP_VER_MASK; + if (ec_ver != EIB_EH_VERSION) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "EoIB encapsulation header version (0x%lx) unknown", + ec_ver); + goto data_rx_comp_fail; + } + + /* + * Check TCP/UDP and IP checksum + */ + ec_tu_cs = (ec_hdr >> EIB_ENCAP_TCPCHK_SHIFT) & EIB_ENCAP_TCPCHK_MASK; + ec_ip_cs = (ec_hdr >> EIB_ENCAP_IPCHK_SHIFT) & EIB_ENCAP_IPCHK_MASK; + + if ((ec_tu_cs == EIB_EH_UDPCSUM_OK || ec_tu_cs == EIB_EH_TCPCSUM_OK) && + (ec_ip_cs == EIB_EH_IPCSUM_OK)) { + mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK); + } else if (ec_tu_cs == EIB_EH_CSUM_BAD || ec_ip_cs == EIB_EH_CSUM_BAD) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "EoIB encapsulation header tcp/udp checksum (0x%lx) or" + "ip checksum (0x%lx) is bad", ec_tu_cs, ec_ip_cs); + } + + /* + * Update the message block's b_rptr to the start of ethernet header + * and parse the header information + */ + mp->b_rptr += EIB_ENCAP_HDR_SZ; + eib_data_parse_ether_hdr(mp, &evh); + + /* + * If the incoming packet is vlan-tagged, but the tag doesn't match + * this vnic's vlan, drop it. 
+ */ + if ((evh.eh_tagless == 0) && (evh.eh_vlan != ld->ld_assigned_vlan)) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "received packet's vlan unknown, expected=0x%x, got=0x%x", + ld->ld_assigned_vlan, evh.eh_vlan); + goto data_rx_comp_fail; + } + + /* + * Final checks to see if the unicast destination is indeed correct + * and to see if the multicast address is ok for us. + */ + if (EIB_UNICAST_MAC(evh.eh_dmac)) { + if (bcmp(evh.eh_dmac, ld->ld_assigned_mac, ETHERADDRL) != 0) { + uint8_t *exp; + uint8_t *got; + + exp = ld->ld_assigned_mac; + got = evh.eh_dmac; + + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "received packet's macaddr mismatch, " + "expected=%x:%x:%x:%x:%x:%x, got=%x:%x:%x:%x:%x:%x", + exp[0], exp[1], exp[2], exp[3], exp[4], exp[5], + got[0], got[1], got[2], got[3], got[4], got[5]); + + goto data_rx_comp_fail; + } + } else { + if (!eib_data_is_mcast_pkt_ok(vnic, evh.eh_dmac, + &stats->st_brdcstrcv, &stats->st_multircv)) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "multicast packet not ok"); + goto data_rx_comp_fail; + } + } + + /* + * Strip ethernet FCS if present in the packet. ConnectX-2 doesn't + * support ethernet FCS, so this shouldn't happen anyway. + */ + if ((ec_hdr >> EIB_ENCAP_FCS_B_SHIFT) & 0x1) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: " + "ethernet FCS present (ec_hdr=0%lx), ignoring", + ec_hdr); + + mp->b_wptr -= ETHERFCSL; + } + + /* + * If this is the same mp as was in the original rwqe (i.e. we didn't + * do any allocb()), then mark the rwqe flag so we know that its mblk + * is with the network layer. 
+ */ + if (!allocd_mp) { + wqe->qe_info |= EIB_WQE_FLG_WITH_NW; + } + + return (mp); + +data_rx_comp_fail: + freemsg(mp); + return (NULL); +} + +static void +eib_data_tx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, eib_chan_t *chan) +{ + eib_t *ss = vnic->vn_ss; + ibt_status_t ret; + + if (wqe->qe_mp) { + if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_MAPPED) { + ret = ibt_unmap_mem_iov(ss->ei_hca_hdl, + wqe->qe_iov_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_data_tx_comp: " + "ibt_unmap_mem_iov() failed, ret=%d", ret); + } + wqe->qe_iov_hdl = NULL; + } else if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_LSO) { + eib_rsrc_return_lsobufs(ss, wqe->qe_big_sgl, + wqe->qe_wr.send.wr_nds); + } + freemsg(wqe->qe_mp); + wqe->qe_mp = NULL; + } + + eib_rsrc_return_swqe(ss, wqe, chan); +} + +static void +eib_data_err_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc) +{ + eib_t *ss = vnic->vn_ss; + + /* + * Currently, all we do is report + */ + switch (wc->wc_status) { + case IBT_WC_WR_FLUSHED_ERR: + break; + + case IBT_WC_LOCAL_CHAN_OP_ERR: + EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: " + "IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ", + wqe->qe_info); + break; + + case IBT_WC_LOCAL_PROTECT_ERR: + EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: " + "IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ", + wqe->qe_info); + break; + } + + /* + * When a wc indicates error, we do not attempt to repost the + * rwqe but simply return it to the wqe pool. Also for rwqes, + * attempting to free the mblk in the wqe invokes the + * eib_data_rx_recycle() callback. For tx wqes, error handling + * is the same as successful completion handling. We still + * have to unmap iov/free lsobufs/free mblk and then return the + * swqe to the pool. 
+ */ + if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) { + ASSERT(wqe->qe_mp != NULL); + freemsg(wqe->qe_mp); + } else { + eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan); + } +} + +/*ARGSUSED*/ +static void +eib_rb_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic) +{ + eib_chan_t *chan = vnic->vn_data_chan; + ibt_status_t ret; + + if (chan == NULL) + return; + + /* + * Reset any completion handlers we may have set up + */ + if (chan->ch_rcv_cq_hdl) { + ibt_set_cq_handler(chan->ch_rcv_cq_hdl, NULL, NULL); + } + if (chan->ch_cq_hdl) { + ibt_set_cq_handler(chan->ch_cq_hdl, NULL, NULL); + } + + /* + * Remove any softints that were added + */ + if (vnic->vn_data_rx_si_hdl) { + (void) ddi_intr_remove_softint(vnic->vn_data_rx_si_hdl); + vnic->vn_data_rx_si_hdl = NULL; + } + if (vnic->vn_data_tx_si_hdl) { + (void) ddi_intr_remove_softint(vnic->vn_data_tx_si_hdl); + vnic->vn_data_tx_si_hdl = NULL; + } + + /* + * Release any work completion buffers we may have allocated + */ + if (chan->ch_rcv_wc && chan->ch_rcv_cq_sz) { + kmem_free(chan->ch_rcv_wc, + sizeof (ibt_wc_t) * chan->ch_rcv_cq_sz); + } + chan->ch_rcv_cq_sz = 0; + chan->ch_rcv_wc = NULL; + + if (chan->ch_wc && chan->ch_cq_sz) { + kmem_free(chan->ch_wc, sizeof (ibt_wc_t) * chan->ch_cq_sz); + } + chan->ch_cq_sz = 0; + chan->ch_wc = NULL; + + /* + * Free any completion queues we may have allocated + */ + if (chan->ch_rcv_cq_hdl) { + ret = ibt_free_cq(chan->ch_rcv_cq_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_data_setup_cqs: " + "ibt_free_cq(rcv_cq) failed, ret=%d", ret); + } + chan->ch_rcv_cq_hdl = NULL; + } + if (chan->ch_cq_hdl) { + ret = ibt_free_cq(chan->ch_cq_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_data_setup_cqs: " + "ibt_free_cq(snd_cq) failed, ret=%d", ret); + } + chan->ch_cq_hdl = NULL; + } +} + +/*ARGSUSED*/ +static void +eib_rb_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic) +{ + eib_chan_t *chan = vnic->vn_data_chan; + ibt_status_t 
ret; + + if (chan == NULL) + return; + + if (chan->ch_chan) { + /* + * We're trying to tear down this UD channel. Make sure that + * we don't attempt to refill (repost) at any point from now on. + */ + chan->ch_tear_down = B_TRUE; + if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_data_setup_ud_channel: " + "ibt_flush_channel() failed, ret=%d", ret); + } + + /* + * Wait until all posted tx wqes on this channel are back with + * the wqe pool. + */ + mutex_enter(&chan->ch_tx_lock); + while (chan->ch_tx_posted > 0) + cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock); + mutex_exit(&chan->ch_tx_lock); + + /* + * Wait until all posted rx wqes on this channel are back with + * the wqe pool. + */ + mutex_enter(&chan->ch_rx_lock); + while (chan->ch_rx_posted > 0) + cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock); + mutex_exit(&chan->ch_rx_lock); + + /* + * Now we're ready to free this channel + */ + if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_data_setup_ud_channel: " + "ibt_free_channel() failed, ret=%d", ret); + } + + chan->ch_alloc_mp = B_FALSE; + chan->ch_ip_hdr_align = 0; + chan->ch_rwqe_bktsz = 0; + chan->ch_lwm_rwqes = 0; + chan->ch_max_rwqes = 0; + chan->ch_max_swqes = 0; + chan->ch_qpn = 0; + chan->ch_chan = NULL; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_fip.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1504 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/byteorder.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static int eib_fip_make_login(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_make_update(eib_t *, eib_vnic_t *, eib_wqe_t *, int, int *); +static int eib_fip_make_table(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_make_ka(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_make_logout(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); + +static int eib_fip_send_login(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_send_update(eib_t *, eib_vnic_t *, eib_wqe_t *, + uint_t, int *); +static int eib_fip_send_table(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_send_ka(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); +static int eib_fip_send_logout(eib_t *, eib_vnic_t *, eib_wqe_t *, int *); + +static int eib_fip_parse_vhub_table(uint8_t *, eib_vnic_t *); +static int eib_fip_parse_vhub_update(uint8_t *, eib_vnic_t *); +static void eib_fip_update_eport_state(eib_t *, eib_vhub_table_t *, + eib_vhub_update_t *, boolean_t, uint8_t); +static void eib_fip_queue_tbl_entry(eib_vhub_table_t *, eib_vhub_map_t *, + uint32_t, uint8_t); +static void eib_fip_queue_upd_entry(eib_vhub_update_t *, eib_vhub_map_t *, + uint32_t, uint8_t); +static void eib_fip_queue_gw_entry(eib_vnic_t *, eib_vhub_table_t *, uint32_t, + uint8_t); +static int eib_fip_apply_updates(eib_t *, eib_vhub_table_t *, + eib_vhub_update_t *); +static void eib_fip_dequeue_tbl_entry(eib_vhub_table_t *, uint8_t *, uint32_t, + uint8_t); +static eib_vhub_map_t *eib_fip_get_vhub_map(void); + +/* + * Definitions private to this file + */ +const char eib_vendor_mellanox[] = { + 0x4d, 0x65, 0x6c, 0x6c, 0x61, 0x6e, 0x6f, 0x78 +}; + +/* + * The three requests to the gateway - request a vHUB table, request a + * vHUB update (aka keepalive) 
and vNIC logout - all need the same + * vnic identity descriptor to be sent with different flag settings. + * + * vHUB table: R=1, U=0, TUSN=last, subcode=KEEPALIVE + * keepalive/vHUB update: R=0, U=1, TUSN=last, subcode=KEEPALIVE + * vNIC logout: R=0, U=0, TUSN=0, subcode=LOGOUT + */ +#define EIB_UPD_REQ_TABLE 1 +#define EIB_UPD_REQ_KA 2 +#define EIB_UPD_REQ_LOGOUT 3 + +int +eib_fip_login(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + eib_wqe_t *swqe; + int ret; + int ntries = 0; + + do { + if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_LO)) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_login: " + "no swqe available, not sending " + "vnic login request"); + *err = ENOMEM; + return (EIB_E_FAILURE); + } + + ret = eib_fip_make_login(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + ret = eib_fip_send_login(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + ret = eib_vnic_wait_for_login_ack(ss, vnic, err); + if (ret == EIB_E_SUCCESS) + break; + + } while ((*err == ETIME) && (ntries++ < EIB_MAX_LOGIN_ATTEMPTS)); + + return (ret); +} + +int +eib_fip_vhub_table(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + eib_wqe_t *swqe; + int ret; + int ntries = 0; + + do { + if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_LO)) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_vhub_table: " + "no swqe available, not sending " + "vhub table request"); + *err = ENOMEM; + return (EIB_E_FAILURE); + } + + ret = eib_fip_make_table(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + ret = eib_fip_send_table(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + ret = eib_vnic_wait_for_table(ss, vnic, err); + if (ret == EIB_E_SUCCESS) { + return (EIB_E_SUCCESS); + } + + /* + * If we'd failed in 
constructing a proper vhub table above, + * the vnic login state would be set to EIB_LOGIN_TBL_FAILED. + * We need to clean up any pending entries from the vhub + * table and vhub update structures and reset the vnic state + * to EIB_LOGIN_ACK_RCVD before we can try again. + */ + eib_vnic_fini_tables(ss, vnic, B_FALSE); + mutex_enter(&vnic->vn_lock); + vnic->vn_state = EIB_LOGIN_ACK_RCVD; + mutex_exit(&vnic->vn_lock); + + } while ((*err == ETIME) && (ntries++ < EIB_MAX_VHUB_TBL_ATTEMPTS)); + + return (EIB_E_FAILURE); +} + +int +eib_fip_heartbeat(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + eib_wqe_t *swqe; + int ntries = 0; + int ret; + + /* + * Even if we're running low on the wqe resource, we want to be + * able to grab a wqe to send the keepalive, to avoid getting + * logged out by the gateway, so we use EIB_WPRI_HI. + */ + if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_HI)) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_heartbeat: " + "no swqe available, not sending heartbeat"); + return (EIB_E_FAILURE); + } + + while (ntries++ < EIB_MAX_KA_ATTEMPTS) { + ret = eib_fip_make_ka(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) + continue; + + ret = eib_fip_send_ka(ss, vnic, swqe, err); + if (ret == EIB_E_SUCCESS) + break; + } + + if (ret != EIB_E_SUCCESS) + eib_rsrc_return_swqe(ss, swqe, NULL); + + return (ret); +} + +int +eib_fip_logout(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + eib_wqe_t *swqe; + int ret; + + /* + * This routine is only called after the vnic has successfully + * logged in to the gateway. If that's really the case, there + * is nothing in terms of resources we need to release: the swqe + * that was acquired during login has already been posted, the + * work has been completed and the swqe has also been reaped back + * into the free pool. The only thing we need to rollback is the + * fact that we're logged in to the gateway at all -- and the way + * to do this is to send a logout request. 
+ */ + if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_LO)) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_logout: " + "no swqe available, not sending logout"); + return (EIB_E_FAILURE); + } + + ret = eib_fip_make_logout(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + ret = eib_fip_send_logout(ss, vnic, swqe, err); + if (ret != EIB_E_SUCCESS) { + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_FAILURE); + } + + return (EIB_E_SUCCESS); +} + +int +eib_fip_parse_login_ack(eib_t *ss, uint8_t *pkt, eib_login_data_t *ld) +{ + fip_login_ack_t *ack; + fip_basic_hdr_t *hdr; + fip_desc_iba_t *iba; + fip_desc_vnic_login_t *login; + fip_desc_partition_t *partition; + ib_guid_t guid; + uint32_t syn_ctl_qpn; + uint16_t sl_portid; + uint16_t flags_vlan; + uint16_t opcode; + uint8_t subcode; + + /* + * Note that 'pkt' is always atleast double-word aligned + * when it is passed to us, so we can cast it without any + * problems. + */ + ack = (fip_login_ack_t *)(void *)pkt; + hdr = &(ack->ak_fip_header); + + /* + * Verify that the opcode is EoIB + */ + if ((opcode = ntohs(hdr->hd_opcode)) != FIP_OPCODE_EOIB) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "unsupported opcode 0x%x in login ack, ignoring", + opcode); + return (EIB_E_FAILURE); + } + + /* + * The admin qp in the EoIB driver should receive only the login + * acknowledgements + */ + subcode = hdr->hd_subcode; + if (subcode != FIP_SUBCODE_G_VNIC_LOGIN_ACK) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "unexpected subcode 0x%x received by adm qp, ignoring", + subcode); + return (EIB_E_FAILURE); + } + + /* + * Verify if the descriptor list length in the received packet is + * valid if the workaround to disable it explicitly is absent. 
+ */ + if (!eib_wa_no_desc_list_len) { + uint_t pkt_data_sz; + + pkt_data_sz = (ntohs(hdr->hd_desc_list_len) + 2) << 2; + if (pkt_data_sz < sizeof (fip_login_ack_t)) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_login_ack: " + "login ack desc list len (0x%lx) too small " + "(min 0x%lx)", + pkt_data_sz, sizeof (fip_login_ack_t)); + return (EIB_E_FAILURE); + } + } + + /* + * Validate all the header and descriptor types and lengths + */ + if (hdr->hd_type != FIP_DESC_TYPE_VENDOR_ID || + hdr->hd_len != FIP_DESC_LEN_VENDOR_ID) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "invalid type/len in basic hdr: expected (0x%x,0x%x), " + "got (0x%x,0x%x)", FIP_DESC_TYPE_VENDOR_ID, + FIP_DESC_LEN_VENDOR_ID, hdr->hd_type, hdr->hd_len); + return (EIB_E_FAILURE); + } + iba = &(ack->ak_iba); + if (iba->ia_type != FIP_DESC_TYPE_IBA || + iba->ia_len != FIP_DESC_LEN_IBA) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "invalid type/len in iba desc: expected (0x%x,0x%x), " + "got (0x%x,0x%x)", FIP_DESC_TYPE_IBA, FIP_DESC_LEN_IBA, + iba->ia_type, iba->ia_len); + return (EIB_E_FAILURE); + } + login = &(ack->ak_vnic_login); + if (login->vl_type != FIP_DESC_TYPE_VNIC_LOGIN || + login->vl_len != FIP_DESC_LEN_VNIC_LOGIN) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "invalid type/len in login desc: expected (0x%x,0x%x), " + "got (0x%x,0x%x)", FIP_DESC_TYPE_VNIC_LOGIN, + FIP_DESC_LEN_VNIC_LOGIN, login->vl_type, login->vl_len); + return (EIB_E_FAILURE); + } + partition = &(ack->ak_vhub_partition); + if (partition->pn_type != FIP_DESC_TYPE_PARTITION || + partition->pn_len != FIP_DESC_LEN_PARTITION) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_login_ack: " + "invalid type/len in partition desc: expected (0x%x,0x%x), " + "got (0x%x,0x%x)", FIP_DESC_TYPE_PARTITION, + FIP_DESC_LEN_PARTITION, partition->pn_type, + partition->pn_len); + return (EIB_E_FAILURE); + } + + /* + * Note that we'll return the vnic id as-is. 
The msb is not actually + * part of the vnic id in our internal records, so we'll mask it out + * later before we do our searches. + */ + ld->ld_vnic_id = ntohs(login->vl_vnic_id); + + syn_ctl_qpn = ntohl(login->vl_syndrome_ctl_qpn); + + /* + * If the syndrome indicates a nack, we're done. No need to collect + * any more information + */ + ld->ld_syndrome = (uint8_t)((syn_ctl_qpn & FIP_VL_SYN_MASK) >> + FIP_VL_SYN_SHIFT); + if (ld->ld_syndrome) { + return (EIB_E_SUCCESS); + } + + /* + * Let's get the rest of the information out of the login ack + */ + sl_portid = ntohs(iba->ia_sl_portid); + ld->ld_gw_port_id = sl_portid & FIP_IBA_PORTID_MASK; + ld->ld_gw_sl = (sl_portid & FIP_IBA_SL_MASK) >> FIP_IBA_SL_SHIFT; + + ld->ld_gw_data_qpn = ntohl(iba->ia_qpn) & FIP_IBA_QPN_MASK; + ld->ld_gw_lid = ntohs(iba->ia_lid); + + bcopy(iba->ia_guid, &guid, sizeof (ib_guid_t)); + ld->ld_gw_guid = ntohll(guid); + ld->ld_vhub_mtu = ntohs(login->vl_mtu); + bcopy(login->vl_mac, ld->ld_assigned_mac, ETHERADDRL); + bcopy(login->vl_gw_mgid_prefix, ld->ld_gw_mgid_prefix, + FIP_MGID_PREFIX_LEN); + ld->ld_n_rss_mcgid = login->vl_flags_rss & FIP_VL_N_RSS_MCGID_MASK; + ld->ld_n_mac_mcgid = login->vl_n_mac_mcgid & FIP_VL_N_MAC_MCGID_MASK; + ld->ld_gw_ctl_qpn = (syn_ctl_qpn & FIP_VL_CTL_QPN_MASK); + + flags_vlan = ntohs(login->vl_flags_vlan); + ld->ld_assigned_vlan = flags_vlan & FIP_VL_VLAN_MASK; + ld->ld_vlan_in_packets = (flags_vlan & FIP_VL_FLAGS_VP) ? 1 : 0; + bcopy(login->vl_vnic_name, ld->ld_vnic_name, FIP_VNIC_NAME_LEN); + + ld->ld_vhub_pkey = ntohs(partition->pn_pkey); + + return (EIB_E_SUCCESS); +} + +int +eib_fip_parse_ctl_pkt(uint8_t *pkt, eib_vnic_t *vnic) +{ + eib_t *ss = vnic->vn_ss; + fip_vhub_pkt_t *vhb; + fip_basic_hdr_t *hdr; + uint16_t opcode; + uint8_t subcode; + uint_t vnic_state; + int ret = EIB_E_FAILURE; + + /* + * Note that 'pkt' is always atleast double-word aligned when it is + * passed to us, so we can cast it without any problems. 
+ */ + vhb = (fip_vhub_pkt_t *)(void *)pkt; + hdr = &(vhb->hb_fip_header); + + /* + * Verify that the opcode is EoIB + */ + if ((opcode = ntohs(hdr->hd_opcode)) != FIP_OPCODE_EOIB) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_ctl_pkt: " + "unsupported opcode 0x%x in ctl pkt, ignoring", + opcode); + return (EIB_E_FAILURE); + } + + mutex_enter(&vnic->vn_lock); + vnic_state = vnic->vn_state; + mutex_exit(&vnic->vn_lock); + + /* + * The ctl qp in the EoIB driver should receive only vHUB messages + */ + subcode = hdr->hd_subcode; + if (subcode == FIP_SUBCODE_G_VHUB_UPDATE) { + if (vnic_state != EIB_LOGIN_TBL_WAIT && + vnic_state != EIB_LOGIN_TBL_INPROG && + vnic_state != EIB_LOGIN_TBL_DONE && + vnic_state != EIB_LOGIN_DONE) { + + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_ctl_pkt: unexpected vnic state " + "(0x%lx) for subcode (VHUB_UPDATE 0x%x)", + vnic_state, subcode); + return (EIB_E_FAILURE); + } + + ret = eib_fip_parse_vhub_update(pkt, vnic); + + } else if (subcode == FIP_SUBCODE_G_VHUB_TABLE) { + if ((vnic_state != EIB_LOGIN_TBL_WAIT) && + (vnic_state != EIB_LOGIN_TBL_INPROG)) { + + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_ctl_pkt: unexpected vnic state " + "(0x%lx) for subcode (VHUB_TABLE 0x%x)", + vnic_state, subcode); + return (EIB_E_FAILURE); + } + + ret = eib_fip_parse_vhub_table(pkt, vnic); + + } else { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_ctl_pkt: " + "unexpected subcode 0x%x for ctl pkt", subcode); + } + + if (ret == EIB_E_SUCCESS) { + /* + * Update last gateway heartbeat received time and + * gateway eport state. The eport state should only + * be updated if the vnic's vhub table has been fully + * constructed. 
+ */ + mutex_enter(&ss->ei_vnic_lock); + ss->ei_gw_last_heartbeat = ddi_get_lbolt64(); + if (vnic_state == EIB_LOGIN_TBL_DONE || + vnic_state == EIB_LOGIN_DONE) { + ss->ei_gw_eport_state = + vnic->vn_vhub_table->tb_eport_state; + } + mutex_exit(&ss->ei_vnic_lock); + } + + return (ret); +} + +static int +eib_fip_make_login(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + fip_login_t *login; + fip_proto_t *proto; + fip_basic_hdr_t *hdr; + fip_desc_iba_t *iba; + fip_desc_vnic_login_t *vlg; + ib_gid_t port_gid; + ib_guid_t port_guid; + uint16_t sl_portid; + uint16_t flags_vlan; + + uint16_t gw_portid = ss->ei_gw_props->pp_gw_portid; + uint16_t sl = ss->ei_gw_props->pp_gw_sl; + uint8_t *pkt = (uint8_t *)(uintptr_t)(swqe->qe_sgl.ds_va); + uint_t pktsz = swqe->qe_sgl.ds_len; + uint_t login_sz = sizeof (fip_login_t); + + if (pktsz < login_sz) { + *err = EINVAL; + + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_make_login: " + "send buffer size (0x%lx) too small to send" + "login request (min 0x%lx)", + pktsz, login_sz); + return (EIB_E_FAILURE); + } + + /* + * Lint complains that there may be an alignment issue here, + * but we know that the "pkt" is atleast double-word aligned, + * so it's ok. 
+ */ + login = (fip_login_t *)(void *)pkt; + bzero(pkt, login_sz); + + /* + * Fill in the FIP protocol version + */ + proto = &login->lg_proto_version; + proto->pr_version = FIP_PROTO_VERSION; + + /* + * Fill in the basic header + */ + hdr = &login->lg_fip_header; + hdr->hd_opcode = htons(FIP_OPCODE_EOIB); + hdr->hd_subcode = FIP_SUBCODE_H_VNIC_LOGIN; + hdr->hd_desc_list_len = htons((login_sz >> 2) - 2); + hdr->hd_flags = 0; + hdr->hd_type = FIP_DESC_TYPE_VENDOR_ID; + hdr->hd_len = FIP_DESC_LEN_VENDOR_ID; + bcopy(eib_vendor_mellanox, hdr->hd_vendor_id, FIP_VENDOR_LEN); + + /* + * Fill in the Infiniband Address descriptor + */ + iba = &login->lg_iba; + iba->ia_type = FIP_DESC_TYPE_IBA; + iba->ia_len = FIP_DESC_LEN_IBA; + bcopy(eib_vendor_mellanox, iba->ia_vendor_id, FIP_VENDOR_LEN); + iba->ia_qpn = htonl(vnic->vn_data_chan->ch_qpn); + + sl_portid = (gw_portid & FIP_IBA_PORTID_MASK) | + ((sl << FIP_IBA_SL_SHIFT) & FIP_IBA_SL_MASK); + iba->ia_sl_portid = htons(sl_portid); + + iba->ia_lid = htons(ss->ei_props->ep_blid); + + port_gid = ss->ei_props->ep_sgid; + port_guid = htonll(port_gid.gid_guid); + bcopy(&port_guid, iba->ia_guid, FIP_GUID_LEN); + + /* + * Now, fill in the vNIC Login descriptor + */ + + vlg = &login->lg_vnic_login; + vlg->vl_type = FIP_DESC_TYPE_VNIC_LOGIN; + vlg->vl_len = FIP_DESC_LEN_VNIC_LOGIN; + bcopy(eib_vendor_mellanox, vlg->vl_vendor_id, FIP_VENDOR_LEN); + + /* + * Only for the physlink instance 0, we ask the gateway to assign + * the mac address and a VLAN (tagless, actually). For this vnic + * only, we do not set the H bit. All other vnics are created by + * Solaris admin and will have the H bit set. Note also that we + * need to clear the vnic id's most significant bit for those that + * are administered by the gateway, so vnic0's vnic_id's msb should + * be 0 as well. 
+ */ + if (vnic->vn_instance == 0) { + vlg->vl_vnic_id = htons(vnic->vn_id); + flags_vlan = vnic->vn_vlan & FIP_VL_VLAN_MASK; + } else { + vlg->vl_vnic_id = htons(vnic->vn_id | FIP_VL_VNIC_ID_MSBIT); + flags_vlan = (vnic->vn_vlan & FIP_VL_VLAN_MASK) | + FIP_VL_FLAGS_H | FIP_VL_FLAGS_M; + + if (vnic->vn_vlan & FIP_VL_VLAN_MASK) + flags_vlan |= (FIP_VL_FLAGS_V | FIP_VL_FLAGS_VP); + } + + vlg->vl_flags_vlan = htons(flags_vlan); + bcopy(vnic->vn_macaddr, vlg->vl_mac, ETHERADDRL); + + /* + * We aren't ready to enable rss, so we set the RSS bit and + * the n_rss_mcgid field to 0. Set the mac mcgid to 0 as well. + */ + vlg->vl_flags_rss = 0; + vlg->vl_n_mac_mcgid = 0; + + /* + * Set the syndrome to 0 and pass the control qpn + */ + vlg->vl_syndrome_ctl_qpn = + htonl(vnic->vn_ctl_chan->ch_qpn & FIP_VL_CTL_QPN_MASK); + + /* + * Try to set as unique a name as possible for this vnic + */ + (void) snprintf((char *)(vlg->vl_vnic_name), FIP_VNIC_NAME_LEN, + "eoib_%02x_%02x", ss->ei_instance, vnic->vn_instance); + + /* + * Adjust the ds_len in the sgl to indicate the size of this + * request before returning + */ + swqe->qe_sgl.ds_len = login_sz; + + return (EIB_E_SUCCESS); +} + +static int +eib_fip_make_update(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int req, + int *err) +{ + fip_keep_alive_t *ka; + fip_proto_t *proto; + fip_basic_hdr_t *hdr; + fip_desc_vnic_identity_t *vid; + ib_gid_t port_gid; + ib_guid_t port_guid; + uint32_t flags_vhub_id; + + uint8_t *pkt = (uint8_t *)(uintptr_t)(swqe->qe_sgl.ds_va); + uint_t pktsz = swqe->qe_sgl.ds_len; + uint_t ka_sz = sizeof (fip_keep_alive_t); + + if (pktsz < ka_sz) { + *err = EINVAL; + + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_make_update: " + "send buffer size (0x%lx) too small to send" + "keepalive/update request (min 0x%lx)", + pktsz, ka_sz); + return (EIB_E_FAILURE); + } + + /* + * Lint complains that there may be an alignment issue here, + * but we know that the "pkt" is atleast double-word aligned, + * so it's ok. 
+ */ + ka = (fip_keep_alive_t *)(void *)pkt; + bzero(pkt, ka_sz); + + /* + * Fill in the FIP protocol version + */ + proto = &ka->ka_proto_version; + proto->pr_version = FIP_PROTO_VERSION; + + /* + * Fill in the basic header + */ + hdr = &ka->ka_fip_header; + hdr->hd_opcode = htons(FIP_OPCODE_EOIB); + hdr->hd_subcode = (req == EIB_UPD_REQ_LOGOUT) ? + FIP_SUBCODE_H_VNIC_LOGOUT : FIP_SUBCODE_H_KEEP_ALIVE; + hdr->hd_desc_list_len = htons((ka_sz >> 2) - 2); + hdr->hd_flags = 0; + hdr->hd_type = FIP_DESC_TYPE_VENDOR_ID; + hdr->hd_len = FIP_DESC_LEN_VENDOR_ID; + bcopy(eib_vendor_mellanox, hdr->hd_vendor_id, FIP_VENDOR_LEN); + + /* + * Fill in the vNIC Identity descriptor + */ + vid = &ka->ka_vnic_identity; + + vid->vi_type = FIP_DESC_TYPE_VNIC_IDENTITY; + vid->vi_len = FIP_DESC_LEN_VNIC_IDENTITY; + bcopy(eib_vendor_mellanox, vid->vi_vendor_id, FIP_VENDOR_LEN); + + flags_vhub_id = vnic->vn_login_data.ld_vhub_id; + if (vnic->vn_login_data.ld_vlan_in_packets) { + flags_vhub_id |= FIP_VI_FLAG_VP; + } + if (req == EIB_UPD_REQ_TABLE) { + flags_vhub_id |= FIP_VI_FLAG_R; + } else if (req == EIB_UPD_REQ_KA) { + flags_vhub_id |= FIP_VI_FLAG_U; + } + vid->vi_flags_vhub_id = htonl(flags_vhub_id); + + vid->vi_tusn = (req != EIB_UPD_REQ_LOGOUT) ? 
+ htonl(vnic->vn_vhub_table->tb_tusn) : 0; + + vid->vi_vnic_id = htons(vnic->vn_login_data.ld_vnic_id); + bcopy(vnic->vn_login_data.ld_assigned_mac, vid->vi_mac, ETHERADDRL); + + port_gid = ss->ei_props->ep_sgid; + port_guid = htonll(port_gid.gid_guid); + bcopy(&port_guid, vid->vi_port_guid, FIP_GUID_LEN); + bcopy(vnic->vn_login_data.ld_vnic_name, vid->vi_vnic_name, + FIP_VNIC_NAME_LEN); + + /* + * Adjust the ds_len in the sgl to indicate the size of this + * request before returning + */ + swqe->qe_sgl.ds_len = ka_sz; + + return (EIB_E_SUCCESS); +} + +static int +eib_fip_make_table(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_make_update(ss, vnic, swqe, EIB_UPD_REQ_TABLE, err)); +} + +static int +eib_fip_make_ka(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_make_update(ss, vnic, swqe, EIB_UPD_REQ_KA, err)); +} + +static int +eib_fip_make_logout(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_make_update(ss, vnic, swqe, EIB_UPD_REQ_LOGOUT, err)); +} + +static int +eib_fip_send_login(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + eib_avect_t *av; + eib_chan_t *chan = ss->ei_admin_chan; + ibt_status_t ret; + + /* + * Get an address vector for this destination + */ + if ((av = eib_ibt_hold_avect(ss, ss->ei_gw_props->pp_gw_lid, + ss->ei_gw_props->pp_gw_sl)) == NULL) { + *err = ENOMEM; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_login: " + "eib_ibt_hold_avect(gw_lid=0x%x, sl=0x%x) failed", + ss->ei_gw_props->pp_gw_lid, ss->ei_gw_props->pp_gw_sl); + return (EIB_E_FAILURE); + } + + /* + * Modify the UD destination handle to the gateway + */ + ret = ibt_modify_ud_dest(swqe->qe_dest, EIB_FIP_QKEY, + ss->ei_gw_props->pp_gw_ctrl_qpn, &av->av_vect); + + eib_ibt_release_avect(ss, av); + if (ret != IBT_SUCCESS) { + *err = EINVAL; + + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_login: " + "ibt_modify_ud_dest(gw_ctl_qpn=0x%lx, qkey=0x%lx) failed, " + "ret=%d", 
ss->ei_gw_props->pp_gw_ctrl_qpn, + EIB_FIP_QKEY, ret); + return (EIB_E_FAILURE); + } + + /* + * Send the login packet to the destination gateway. Posting + * the login and setting the login state to wait-for-ack should + * ideally be atomic to avoid race. + */ + mutex_enter(&vnic->vn_lock); + ret = ibt_post_send(chan->ch_chan, &(swqe->qe_wr.send), 1, NULL); + if (ret != IBT_SUCCESS) { + mutex_exit(&vnic->vn_lock); + *err = EINVAL; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_login: " + "ibt_post_send() failed for vnic id 0x%x, ret=%d", + vnic->vn_id, ret); + return (EIB_E_FAILURE); + } + vnic->vn_state = EIB_LOGIN_ACK_WAIT; + + mutex_enter(&chan->ch_tx_lock); + chan->ch_tx_posted++; + mutex_exit(&chan->ch_tx_lock); + + mutex_exit(&vnic->vn_lock); + + return (EIB_E_SUCCESS); +} + +static int +eib_fip_send_update(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, + uint_t nxt_state, int *err) +{ + eib_login_data_t *ld = &vnic->vn_login_data; + eib_chan_t *chan = vnic->vn_ctl_chan; + eib_avect_t *av; + ibt_status_t ret; + + /* + * Get an address vector for this destination + */ + if ((av = eib_ibt_hold_avect(ss, ld->ld_gw_lid, + ld->ld_gw_sl)) == NULL) { + *err = ENOMEM; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_update: " + "eib_ibt_hold_avect(gw_lid=0x%x, sl=0x%x) failed", + ld->ld_gw_lid, ld->ld_gw_sl); + return (EIB_E_FAILURE); + } + + /* + * Modify the UD destination handle to the destination appropriately + */ + ret = ibt_modify_ud_dest(swqe->qe_dest, EIB_FIP_QKEY, + ld->ld_gw_ctl_qpn, &av->av_vect); + + eib_ibt_release_avect(ss, av); + if (ret != IBT_SUCCESS) { + *err = EINVAL; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_update: " + "ibt_modify_ud_dest(gw_ctl_qpn=0x%lx, qkey=0x%lx) failed, " + "ret=%d", ld->ld_gw_ctl_qpn, EIB_FIP_QKEY, ret); + return (EIB_E_FAILURE); + } + + /* + * Send the update packet to the destination. Posting the update request + * and setting the login state to wait-for-vhub_table needs to be atomic + * to avoid race. 
+ */ + mutex_enter(&vnic->vn_lock); + ret = ibt_post_send(chan->ch_chan, &(swqe->qe_wr.send), 1, NULL); + if (ret != IBT_SUCCESS) { + mutex_exit(&vnic->vn_lock); + *err = EINVAL; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_send_update: " + "ibt_post_send() failed for vnic id 0x%x, ret=%d", + vnic->vn_id, ret); + return (EIB_E_FAILURE); + } + vnic->vn_state = nxt_state; + + mutex_enter(&chan->ch_tx_lock); + chan->ch_tx_posted++; + mutex_exit(&chan->ch_tx_lock); + + mutex_exit(&vnic->vn_lock); + + return (EIB_E_SUCCESS); +} + +static int +eib_fip_send_table(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_send_update(ss, vnic, swqe, EIB_LOGIN_TBL_WAIT, err)); +} + +static int +eib_fip_send_ka(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_send_update(ss, vnic, swqe, EIB_LOGIN_DONE, err)); +} + +static int +eib_fip_send_logout(eib_t *ss, eib_vnic_t *vnic, eib_wqe_t *swqe, int *err) +{ + return (eib_fip_send_update(ss, vnic, swqe, EIB_LOGOUT_DONE, err)); +} + +static int +eib_fip_parse_vhub_table(uint8_t *pkt, eib_vnic_t *vnic) +{ + fip_vhub_table_t *tbl; + fip_desc_vhub_table_t *desc_tbl; + fip_vhub_table_entry_t *entry; + fip_basic_hdr_t *hdr; + eib_t *ss = vnic->vn_ss; + eib_login_data_t *ld = &vnic->vn_login_data; + eib_vhub_table_t *etbl = vnic->vn_vhub_table; + eib_vhub_update_t *eupd = vnic->vn_vhub_update; + eib_vhub_map_t *newmap; + + uint32_t *ipkt; + uint32_t init_checksum = 0; + uint32_t tusn; + uint32_t vhub_id; + uint_t entries_in_pkt; + uint_t ndx; + uint_t i; + + /* + * If we're here receiving vhub table messages, we certainly should + * have the vhub table structure allocated and present at this point. 
+ */ + if (etbl == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "vhub table missing for vnic id 0x%x", vnic->vn_id); + return (EIB_E_FAILURE); + } + + /* + * Note that 'pkt' is always atleast double-word aligned when it is + * passed to us, so we can cast it without any problems. + */ + ipkt = (uint32_t *)(void *)pkt; + tbl = (fip_vhub_table_t *)(void *)pkt; + hdr = &(tbl->vt_fip_header); + + /* + * Validate all the header and descriptor types and lengths + */ + if (hdr->hd_type != FIP_DESC_TYPE_VENDOR_ID || + hdr->hd_len != FIP_DESC_LEN_VENDOR_ID) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "invalid type/len in fip basic header, " + "exp (0x%x,0x%x), got (0x%x,0x%x)", + FIP_DESC_TYPE_VENDOR_ID, FIP_DESC_LEN_VENDOR_ID, + hdr->hd_type, hdr->hd_len); + return (EIB_E_FAILURE); + } + desc_tbl = &(tbl->vt_vhub_table); + if (desc_tbl->tb_type != FIP_DESC_TYPE_VHUB_TABLE) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "invalid type in vhub desc, exp 0x%x, got 0x%x", + FIP_DESC_TYPE_VHUB_TABLE, desc_tbl->tb_type); + return (EIB_E_FAILURE); + } + + /* + * Verify that the vhub id is ok for this vnic + */ + vhub_id = ntohl(desc_tbl->tb_flags_vhub_id) & FIP_TB_VHUB_ID_MASK; + if (vhub_id != ld->ld_vhub_id) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "invalid vhub id in vhub table pkt: exp 0x%x, got 0x%x", + ld->ld_vhub_id, vhub_id); + return (EIB_E_FAILURE); + } + + /* + * Count the number of vhub table entries in this packet + */ + entries_in_pkt = (desc_tbl->tb_len - FIP_DESC_VHUB_TABLE_WORDS) / + FIP_VHUB_TABLE_ENTRY_WORDS; + + /* + * While we're here, also compute the 32-bit 2's complement carry- + * discarded checksum of the vHUB table descriptor in this packet + * till the first vhub table entry. 
+ */ + for (i = 0; i < FIP_DESC_VHUB_TABLE_WORDS; i++) + init_checksum += ipkt[i]; + + /* + * Initialize the vhub's Table Update Sequence Number (tusn), + * checksum and record the total number of entries in the table + * if this is the first pkt of the table. + */ + tusn = ntohl(desc_tbl->tb_tusn); + if (desc_tbl->tb_hdr & FIP_TB_HDR_FIRST) { + etbl->tb_entries_in_table = ntohs(desc_tbl->tb_table_size); + etbl->tb_tusn = tusn; + etbl->tb_checksum = 0; + + mutex_enter(&vnic->vn_lock); + vnic->vn_state = EIB_LOGIN_TBL_INPROG; + mutex_exit(&vnic->vn_lock); + } + + /* + * First, middle or last, the current table TUSN we have must match this + * packet's TUSN. + */ + if (etbl->tb_tusn != tusn) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "unexpected TUSN (0x%lx) during vhub table construction, " + "expected 0x%lx", etbl->tb_tusn, tusn); + goto vhub_table_fail; + } + + /* + * See if we've overrun/underrun our original entries count + */ + if ((etbl->tb_entries_seen + entries_in_pkt) > + etbl->tb_entries_in_table) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "vhub table overrun, total_exp=%d, so_far=%d, this_pkt=%d", + etbl->tb_entries_in_table, etbl->tb_entries_seen, + entries_in_pkt); + goto vhub_table_fail; + } else if (((etbl->tb_entries_seen + entries_in_pkt) < + etbl->tb_entries_in_table) && + (desc_tbl->tb_hdr & FIP_TB_HDR_LAST)) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_table: " + "vhub table underrun, total_exp=%d, so_far=%d, last_pkt=%d", + etbl->tb_entries_in_table, etbl->tb_entries_seen, + entries_in_pkt); + goto vhub_table_fail; + } + + /* + * Process and add the entries we have in this packet + */ + etbl->tb_checksum += init_checksum; + entry = (fip_vhub_table_entry_t *)(void *) + ((uint8_t *)desc_tbl + FIP_DESC_VHUB_TABLE_SZ); + + for (ndx = 0; ndx < entries_in_pkt; ndx++, entry++) { + /* + * Allocate a eib_vhub_map_t, copy the current entry details + * and chain it to the appropriate 
queue. + */ + if ((newmap = eib_fip_get_vhub_map()) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_vhub_table: no memory for vhub " + "table entry, ignoring this vhub table packet"); + goto vhub_table_fail; + } + + ASSERT((entry->te_v_rss_type & FIP_TE_VALID) == FIP_TE_VALID); + newmap->mp_v_rss_type = entry->te_v_rss_type; + bcopy(entry->te_mac, newmap->mp_mac, ETHERADDRL); + newmap->mp_qpn = (ntohl(entry->te_qpn) & FIP_TE_QPN_MASK); + newmap->mp_sl = (entry->te_sl & FIP_TE_SL_MASK); + newmap->mp_lid = ntohs(entry->te_lid); + newmap->mp_tusn = tusn; + newmap->mp_next = NULL; + + /* + * The vhub table messages do not provide status on eport + * state, so we'll simply assume that the eport is up. + */ + eib_fip_queue_tbl_entry(etbl, newmap, tusn, FIP_EPORT_UP); + + /* + * Update table checksum with this entry's computed checksum + */ + ipkt = (uint32_t *)entry; + for (i = 0; i < FIP_VHUB_TABLE_ENTRY_WORDS; i++) + etbl->tb_checksum += ipkt[i]; + } + etbl->tb_entries_seen += entries_in_pkt; + + /* + * If this is the last packet of this vhub table, complete vhub + * table by verifying checksum and applying all the vhub updates + * that may have come in while we were constructing this table. + */ + if (desc_tbl->tb_hdr & FIP_TB_HDR_LAST) { + + ipkt = (uint32_t *)entry; + if (!eib_wa_no_good_vhub_cksum) { + if (*ipkt != etbl->tb_checksum) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_fip_parse_vhub_table: " + "vhub table checksum invalid, " + "computed=0x%lx, found=0x%lx", + etbl->tb_checksum, *ipkt); + } + } + + /* + * Per the EoIB specification, the gateway is supposed to + * include its address information for data messages in the + * vhub table. But we've observed that it doesn't do this + * (with the current version). If this is the case, we'll + * hand-create and add a vhub map for the gateway from the + * information we got in login ack. 
+ */ + if (etbl->tb_gateway == NULL) + eib_fip_queue_gw_entry(vnic, etbl, tusn, FIP_EPORT_UP); + + /* + * Apply pending vhub updates and reset table counters needed + * during table construction. + */ + if (eib_fip_apply_updates(ss, etbl, eupd) != EIB_E_SUCCESS) + goto vhub_table_fail; + + etbl->tb_entries_seen = 0; + etbl->tb_entries_in_table = 0; + + eib_vnic_vhub_table_done(vnic, EIB_LOGIN_TBL_DONE); + } + + return (EIB_E_SUCCESS); + +vhub_table_fail: + eib_vnic_vhub_table_done(vnic, EIB_LOGIN_TBL_FAILED); + return (EIB_E_FAILURE); +} + +static int +eib_fip_parse_vhub_update(uint8_t *pkt, eib_vnic_t *vnic) +{ + fip_vhub_update_t *upd; + fip_desc_vhub_update_t *desc_upd; + fip_vhub_table_entry_t *entry; + fip_basic_hdr_t *hdr; + eib_t *ss = vnic->vn_ss; + eib_login_data_t *ld = &vnic->vn_login_data; + eib_vhub_table_t *etbl = vnic->vn_vhub_table; + eib_vhub_update_t *eupd = vnic->vn_vhub_update; + eib_vhub_map_t *newmap; + boolean_t vhub_tbl_done; + uint32_t eport_vp_vhub_id; + uint32_t vhub_id; + uint32_t tusn; + uint32_t prev_tusn; + uint8_t eport_state; + + /* + * We should have the vhub table allocated as long as we're receiving + * vhub control messages. + */ + if (etbl == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_update: " + "vhub table missing for vnic id 0x%x", vnic->vn_id); + return (EIB_E_FAILURE); + } + + mutex_enter(&vnic->vn_lock); + vhub_tbl_done = ((vnic->vn_state == EIB_LOGIN_TBL_DONE) || + (vnic->vn_state == EIB_LOGIN_DONE)) ? B_TRUE : B_FALSE; + mutex_exit(&vnic->vn_lock); + + /* + * Note that 'pkt' is always atleast double-word aligned when it is + * passed to us, so we can cast it without any problems. 
+ */ + upd = (fip_vhub_update_t *)(void *)pkt; + hdr = &(upd->vu_fip_header); + + /* + * Validate all the header and descriptor types and lengths + */ + if (hdr->hd_type != FIP_DESC_TYPE_VENDOR_ID || + hdr->hd_len != FIP_DESC_LEN_VENDOR_ID) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_update: " + "invalid type/len in fip basic header, " + "exp (0x%x,0x%x), got (0x%x,0x%x)", + FIP_DESC_TYPE_VENDOR_ID, FIP_DESC_LEN_VENDOR_ID, + hdr->hd_type, hdr->hd_len); + return (EIB_E_FAILURE); + } + desc_upd = &(upd->vu_vhub_update); + if (desc_upd->up_type != FIP_DESC_TYPE_VHUB_UPDATE || + desc_upd->up_len != FIP_DESC_LEN_VHUB_UPDATE) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_update: " + "invalid type/len in vhub update desc: " + "exp (0x%x,0x%x), got (0x%x,0x%x)", + FIP_DESC_TYPE_VHUB_UPDATE, FIP_DESC_LEN_VHUB_UPDATE, + desc_upd->up_type, desc_upd->up_len); + return (EIB_E_FAILURE); + } + + /* + * Verify that the vhub id is ok for this vnic and save the eport state + */ + eport_vp_vhub_id = ntohl(desc_upd->up_eport_vp_vhub_id); + + vhub_id = eport_vp_vhub_id & FIP_UP_VHUB_ID_MASK; + if (vhub_id != ld->ld_vhub_id) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_update: " + "invalid vhub id in vhub update pkt: exp 0x%x, got 0x%x", + ld->ld_vhub_id, vhub_id); + return (EIB_E_FAILURE); + } + eport_state = (uint8_t)((eport_vp_vhub_id >> FIP_UP_EPORT_STATE_SHIFT) & + FIP_UP_EPORT_STATE_MASK); + + /* + * If this is the first update we receive, any tusn is ok. Otherwise, + * make sure the tusn we see in the packet is appropriate. + */ + tusn = ntohl(desc_upd->up_tusn); + prev_tusn = vhub_tbl_done ? 
etbl->tb_tusn : eupd->up_tusn; + + if (prev_tusn != 0) { + if (tusn == prev_tusn) { + eib_fip_update_eport_state(ss, etbl, eupd, + vhub_tbl_done, eport_state); + return (EIB_E_SUCCESS); + } + if (tusn != (prev_tusn + 1)) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_vhub_update: " + "out of order TUSN received (exp 0x%lx, " + "got 0x%lx), dropping pkt", prev_tusn + 1, tusn); + return (EIB_E_FAILURE); + } + } + + /* + * EoIB expects only type 0 (vnic address) entries to maintain the + * context table + */ + entry = &(desc_upd->up_tbl_entry); + ASSERT((entry->te_v_rss_type & FIP_TE_TYPE_MASK) == FIP_TE_TYPE_VNIC); + + /* + * If the vHUB table has already been fully constructed and if we've + * now received a notice to remove a vnic entry from it, do it. + */ + if ((vhub_tbl_done) && + ((entry->te_v_rss_type & FIP_TE_VALID) == 0)) { + eib_fip_dequeue_tbl_entry(etbl, entry->te_mac, + tusn, eport_state); + + if (bcmp(entry->te_mac, ld->ld_assigned_mac, ETHERADDRL) == 0) { + uint8_t *mymac; + + mymac = entry->te_mac; + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_fip_parse_vhub_update: " + "vhub update pkt received to kill self " + "(%x:%x:%x:%x:%x:%x)", mymac[0], mymac[1], mymac[2], + mymac[3], mymac[4], mymac[5]); + + return (EIB_E_FAILURE); + } + return (EIB_E_SUCCESS); + } + + /* + * Otherwise, allocate a new eib_vhub_map_t and fill it in with + * the details of the new entry + */ + if ((newmap = eib_fip_get_vhub_map()) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_parse_vhub_update: " + "no memory for vhub update entry, will be ignoring" + "this vhub update packet"); + return (EIB_E_FAILURE); + } + + newmap->mp_v_rss_type = entry->te_v_rss_type; + bcopy(entry->te_mac, newmap->mp_mac, ETHERADDRL); + newmap->mp_qpn = (ntohl(entry->te_qpn) & FIP_TE_QPN_MASK); + newmap->mp_sl = (entry->te_sl & FIP_TE_SL_MASK); + newmap->mp_lid = ntohs(entry->te_lid); + newmap->mp_tusn = tusn; + newmap->mp_next = NULL; + + /* + * Update the full vhub table or chain it to 
the list of pending + * updates depending on if the vhub table construction is over + * or not. + */ + if (vhub_tbl_done) { + eib_fip_queue_tbl_entry(etbl, newmap, tusn, eport_state); + } else { + eib_fip_queue_upd_entry(eupd, newmap, tusn, eport_state); + } + + return (EIB_E_SUCCESS); +} + +static void +eib_fip_update_eport_state(eib_t *ss, eib_vhub_table_t *tbl, + eib_vhub_update_t *upd, boolean_t tbl_done, uint8_t eport_state) +{ + if (tbl_done) { + mutex_enter(&tbl->tb_lock); + if (tbl->tb_eport_state != eport_state) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_fip_update_eport_state: " + "eport state changing from %d to %d", + tbl->tb_eport_state, eport_state); + tbl->tb_eport_state = eport_state; + } + mutex_exit(&tbl->tb_lock); + } else { + mutex_enter(&upd->up_lock); + if (upd->up_eport_state != eport_state) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_fip_update_eport_state: " + "eport state changing from %d to %d", + upd->up_eport_state, eport_state); + upd->up_eport_state = eport_state; + } + mutex_exit(&upd->up_lock); + } +} + +static void +eib_fip_queue_tbl_entry(eib_vhub_table_t *tbl, eib_vhub_map_t *map, + uint32_t tusn, uint8_t eport_state) +{ + uint8_t bkt; + + mutex_enter(&tbl->tb_lock); + + switch (map->mp_v_rss_type & FIP_TE_TYPE_MASK) { + case FIP_TE_TYPE_GATEWAY: + if (tbl->tb_gateway) { + kmem_free(tbl->tb_gateway, + sizeof (eib_vhub_map_t)); + } + tbl->tb_gateway = map; + break; + + case FIP_TE_TYPE_UNICAST_MISS: + if (tbl->tb_unicast_miss) { + kmem_free(tbl->tb_unicast_miss, + sizeof (eib_vhub_map_t)); + } + tbl->tb_unicast_miss = map; + break; + + case FIP_TE_TYPE_VHUB_MULTICAST: + if (tbl->tb_vhub_multicast) { + kmem_free(tbl->tb_vhub_multicast, + sizeof (eib_vhub_map_t)); + } + tbl->tb_vhub_multicast = map; + break; + + case FIP_TE_TYPE_MULTICAST_ENTRY: + /* + * If multicast entry types are not to be specially + * processed, treat them like regular vnic addresses. 
+ */ + if (!eib_wa_no_mcast_entries) { + bkt = (map->mp_mac[ETHERADDRL-1]) % EIB_TB_NBUCKETS; + map->mp_next = tbl->tb_mcast_entry[bkt]; + tbl->tb_mcast_entry[bkt] = map; + break; + } + /*FALLTHROUGH*/ + + case FIP_TE_TYPE_VNIC: + bkt = (map->mp_mac[ETHERADDRL-1]) % EIB_TB_NBUCKETS; + map->mp_next = tbl->tb_vnic_entry[bkt]; + tbl->tb_vnic_entry[bkt] = map; + break; + } + + tbl->tb_tusn = tusn; + tbl->tb_eport_state = eport_state; + + mutex_exit(&tbl->tb_lock); +} + +static void +eib_fip_queue_upd_entry(eib_vhub_update_t *upd, eib_vhub_map_t *map, + uint32_t tusn, uint8_t eport_state) +{ + eib_vhub_map_t *tail; + + /* + * The eib_vhub_update_t list is only touched/traversed when the + * control cq handler is parsing either update or table message, + * or by the table cleanup routine when we aren't attached to any + * control mcgs. Bottom line is that this list traversal is always + * single-threaded and we could probably do away with the lock. + */ + mutex_enter(&upd->up_lock); + for (tail = upd->up_vnic_entry; tail != NULL; tail = tail->mp_next) { + if (tail->mp_next == NULL) + break; + } + if (tail) { + tail->mp_next = map; + } else { + upd->up_vnic_entry = map; + } + + upd->up_tusn = tusn; + upd->up_eport_state = eport_state; + + mutex_exit(&upd->up_lock); +} + +static void +eib_fip_queue_gw_entry(eib_vnic_t *vnic, eib_vhub_table_t *tbl, uint32_t tusn, + uint8_t eport_state) +{ + eib_t *ss = vnic->vn_ss; + eib_vhub_map_t *newmap; + eib_login_data_t *ld = &vnic->vn_login_data; + + if ((newmap = eib_fip_get_vhub_map()) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_queue_gw_entry: " + "no memory to queue gw entry, transactions could fail"); + return; + } + + newmap->mp_v_rss_type = FIP_TE_VALID | FIP_TE_TYPE_GATEWAY; + bcopy(eib_zero_mac, newmap->mp_mac, ETHERADDRL); + newmap->mp_qpn = ld->ld_gw_data_qpn; + newmap->mp_sl = ld->ld_gw_sl; + newmap->mp_lid = ld->ld_gw_lid; + newmap->mp_tusn = tusn; + newmap->mp_next = NULL; + + eib_fip_queue_tbl_entry(tbl, 
newmap, tusn, eport_state);
+}
+
+/*
+ * Apply the queued vnic-entry updates to the vhub table. Updates with a
+ * tusn older than (or equal to) the table's tusn are stale and discarded;
+ * a gap between the table tusn and the first update tusn means we lost
+ * updates and must fail so the caller can rebuild the table.
+ */
+static int
+eib_fip_apply_updates(eib_t *ss, eib_vhub_table_t *tbl, eib_vhub_update_t *upd)
+{
+	eib_vhub_map_t *list;
+	eib_vhub_map_t *map;
+	eib_vhub_map_t *nxt;
+	uint32_t tbl_tusn = tbl->tb_tusn;
+
+	/*
+	 * Take the update list out
+	 */
+	mutex_enter(&upd->up_lock);
+	list = upd->up_vnic_entry;
+	upd->up_vnic_entry = NULL;
+	mutex_exit(&upd->up_lock);
+
+	/*
+	 * Skip any updates with older/same tusn as our vhub table
+	 */
+	nxt = NULL;
+	for (map = list; (map) && (map->mp_tusn <= tbl_tusn); map = nxt) {
+		nxt = map->mp_next;
+		kmem_free(map, sizeof (eib_vhub_map_t));
+	}
+
+	if (map == NULL)
+		return (EIB_E_SUCCESS);
+
+	/*
+	 * If we missed any updates between table tusn and the first
+	 * update tusn we got, we need to fail.
+	 *
+	 * Note: the tusn values are uint32_t, so use %x (not %lx, which
+	 * would consume 64 bits per vararg on LP64 and misprint both).
+	 */
+	if (map->mp_tusn > (tbl_tusn + 1)) {
+		EIB_DPRINTF_WARN(ss->ei_instance, "eib_fip_apply_updates: "
+		    "vhub update missed tusn(s), expected=0x%x, got=0x%x",
+		    (tbl_tusn + 1), map->mp_tusn);
+		for (; map != NULL; map = nxt) {
+			nxt = map->mp_next;
+			kmem_free(map, sizeof (eib_vhub_map_t));
+		}
+		return (EIB_E_FAILURE);
+	}
+
+	/*
+	 * If everything is fine, apply all the updates we received
+	 */
+	for (; map != NULL; map = nxt) {
+		nxt = map->mp_next;
+		map->mp_next = NULL;
+
+		if (map->mp_v_rss_type & FIP_TE_VALID) {
+			eib_fip_queue_tbl_entry(tbl, map, upd->up_tusn,
+			    upd->up_eport_state);
+		} else {
+			eib_fip_dequeue_tbl_entry(tbl, map->mp_mac,
+			    upd->up_tusn, upd->up_eport_state);
+			kmem_free(map, sizeof (eib_vhub_map_t));
+		}
+	}
+
+	return (EIB_E_SUCCESS);
+}
+
+/*
+ * Remove the vnic entry with the given mac from the vhub table and
+ * record the new tusn/eport state.
+ */
+static void
+eib_fip_dequeue_tbl_entry(eib_vhub_table_t *tbl, uint8_t *mac, uint32_t tusn,
+    uint8_t eport_state)
+{
+	uint8_t bkt;
+	eib_vhub_map_t *prev;
+	eib_vhub_map_t *elem;
+
+	bkt = (mac[ETHERADDRL-1]) % EIB_TB_NBUCKETS;
+
+	mutex_enter(&tbl->tb_lock);
+
+	/*
+	 * Note that for EoIB, the vhub table is maintained using only
+	 * vnic entry updates
+	 */
+	prev = NULL;
+	for (elem = tbl->tb_vnic_entry[bkt]; elem; elem = elem->mp_next)
{ + if (bcmp(elem->mp_mac, mac, ETHERADDRL) == 0) + break; + prev = elem; + } + + if (prev && elem) { + prev->mp_next = elem->mp_next; + kmem_free(elem, sizeof (eib_vhub_map_t)); + } + + tbl->tb_tusn = tusn; + tbl->tb_eport_state = eport_state; + + mutex_exit(&tbl->tb_lock); +} + +static eib_vhub_map_t * +eib_fip_get_vhub_map(void) +{ + return (kmem_zalloc(sizeof (eib_vhub_map_t), KM_NOSLEEP)); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_ibt.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1004 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/dlpi.h> /* HCKSUM_INET_FULL_V4 */ +#include <sys/pattr.h> /* HCK_FULLCKSUM */ +#include <sys/ib/mgt/sm_attr.h> /* SM_INIT_TYPE_REPLY_... 
*/ + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static void eib_ibt_reset_partitions(eib_t *); +static void eib_ibt_wakeup_sqd_waiters(eib_t *, ibt_channel_hdl_t); +static int eib_ibt_chan_pkey(eib_t *, eib_chan_t *, ib_pkey_t, boolean_t, + boolean_t *); +static boolean_t eib_ibt_has_chan_pkey_changed(eib_t *, eib_chan_t *); +static boolean_t eib_ibt_has_any_pkey_changed(eib_t *); +static int eib_ibt_fill_avect(eib_t *, eib_avect_t *, ib_lid_t); +static void eib_ibt_record_srate(eib_t *); + +/* + * Definitions private to this file + */ + +/* + * SM's init type reply flags + */ +#define EIB_PORT_ATTR_LOADED(itr) \ + (((itr) & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) +#define EIB_PORT_ATTR_NOT_PRESERVED(itr) \ + (((itr) & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) +#define EIB_PORT_PRES_NOT_PRESERVED(itr) \ + (((itr) & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 0) + +/* + * eib_ibt_hca_init() initialization progress flags + */ +#define EIB_HCAINIT_HCA_OPENED 0x01 +#define EIB_HCAINIT_ATTRS_ALLOCD 0x02 +#define EIB_HCAINIT_HCA_PORTS_QUERIED 0x04 +#define EIB_HCAINIT_PD_ALLOCD 0x08 +#define EIB_HCAINIT_CAPAB_RECORDED 0x10 + +int +eib_ibt_hca_init(eib_t *ss) +{ + ibt_status_t ret; + ibt_hca_portinfo_t *pi; + uint_t num_pi; + uint_t sz_pi; + uint_t progress = 0; + + if (ss->ei_hca_hdl) + return (EIB_E_SUCCESS); + + /* + * Open the HCA + */ + ret = ibt_open_hca(ss->ei_ibt_hdl, ss->ei_props->ep_hca_guid, + &ss->ei_hca_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_open_hca(hca_guid=0x%llx) " + "failed, ret=%d", ss->ei_props->ep_hca_guid, ret); + goto ibt_hca_init_fail; + } + progress |= EIB_HCAINIT_HCA_OPENED; + + /* + * Query and store HCA attributes + */ + ss->ei_hca_attrs = kmem_zalloc(sizeof (ibt_hca_attr_t), KM_SLEEP); + progress |= EIB_HCAINIT_ATTRS_ALLOCD; + + ret = ibt_query_hca(ss->ei_hca_hdl, ss->ei_hca_attrs); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + 
"ibt_query_hca(hca_hdl=0x%llx, " + "hca_guid=0x%llx) failed, ret=%d", + ss->ei_hca_hdl, ss->ei_props->ep_hca_guid, ret); + goto ibt_hca_init_fail; + } + + /* + * At this point, we don't even care about the linkstate, we only want + * to record our invariant base port guid and mtu + */ + ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num, + &pi, &num_pi, &sz_pi); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_query_hca_ports(hca_hdl=0x%llx, " + "port=0x%x) failed, ret=%d", ss->ei_hca_hdl, + ss->ei_props->ep_port_num, ret); + goto ibt_hca_init_fail; + } + if (num_pi != 1) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_query_hca_ports(hca_hdl=0x%llx, " + "port=0x%x) returned num_pi=%d", ss->ei_hca_hdl, + ss->ei_props->ep_port_num, num_pi); + ibt_free_portinfo(pi, sz_pi); + goto ibt_hca_init_fail; + } + + ss->ei_props->ep_sgid = pi->p_sgid_tbl[0]; + ss->ei_props->ep_mtu = (128 << pi->p_mtu); + ibt_free_portinfo(pi, sz_pi); + + progress |= EIB_HCAINIT_HCA_PORTS_QUERIED; + + /* + * Allocate a protection domain for all our transactions + */ + ret = ibt_alloc_pd(ss->ei_hca_hdl, IBT_PD_NO_FLAGS, &ss->ei_pd_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_alloc_pd(hca_hdl=0x%llx, " + "hca_guid=0x%llx) failed, ret=%d", + ss->ei_hca_hdl, ss->ei_props->ep_hca_guid, ret); + goto ibt_hca_init_fail; + } + progress |= EIB_HCAINIT_PD_ALLOCD; + + /* + * Finally, record the capabilities + */ + ss->ei_caps = kmem_zalloc(sizeof (eib_caps_t), KM_SLEEP); + eib_ibt_record_capab(ss, ss->ei_hca_attrs, ss->ei_caps); + eib_ibt_record_srate(ss); + + progress |= EIB_HCAINIT_CAPAB_RECORDED; + + return (EIB_E_SUCCESS); + +ibt_hca_init_fail: + eib_rb_ibt_hca_init(ss, progress); + return (EIB_E_FAILURE); +} + +void +eib_ibt_link_mod(eib_t *ss) +{ + eib_node_state_t *ns = ss->ei_node_state; + ibt_hca_portinfo_t *pi; + ibt_status_t ret; + uint8_t vn0_mac[ETHERADDRL]; + boolean_t all_zombies = B_FALSE; + boolean_t all_need_rejoin = 
B_FALSE; + uint_t num_pi; + uint_t sz_pi; + uint8_t itr; + + if (ns->ns_link_state == LINK_STATE_UNKNOWN) + return; + + /* + * See if we can get the port attributes or we're as good as down. + */ + ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num, + &pi, &num_pi, &sz_pi); + if ((ret != IBT_SUCCESS) || (pi->p_linkstate != IBT_PORT_ACTIVE)) { + ibt_free_portinfo(pi, sz_pi); + eib_mac_link_down(ss, B_FALSE); + return; + } + + /* + * If the SM re-initialized the port attributes, but did not preserve + * the old attributes, we need to check more. + */ + itr = pi->p_init_type_reply; + if (EIB_PORT_ATTR_LOADED(itr) && EIB_PORT_ATTR_NOT_PRESERVED(itr)) { + /* + * We're just coming back up; if we see that our base lid + * or sgid table has changed, we'll update these and try to + * restart all active vnics. If any of the vnic pkeys have + * changed, we'll reset the affected channels to the new pkey. + */ + if (bcmp(pi->p_sgid_tbl, &ss->ei_props->ep_sgid, + sizeof (ib_gid_t)) != 0) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_link_mod: port sgid table changed " + "(old %llx.%llx != new %llx.%llx), " + "all vnics are zombies now.", + ss->ei_props->ep_sgid.gid_prefix, + ss->ei_props->ep_sgid.gid_guid, + pi->p_sgid_tbl[0].gid_prefix, + pi->p_sgid_tbl[0].gid_guid); + + ss->ei_props->ep_sgid = pi->p_sgid_tbl[0]; + all_zombies = B_TRUE; + + } else if (ss->ei_props->ep_blid != pi->p_base_lid) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_link_mod: port base lid changed " + "(old 0x%x != new 0x%x), " + "all vnics are zombies now.", + ss->ei_props->ep_blid, pi->p_base_lid); + + ss->ei_props->ep_blid = pi->p_base_lid; + all_zombies = B_TRUE; + + } else if (eib_ibt_has_any_pkey_changed(ss)) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_link_mod: pkey has changed for vnic(s), " + "resetting all partitions"); + + eib_ibt_reset_partitions(ss); + } + } + + if (pi) { + ibt_free_portinfo(pi, sz_pi); + } + + /* + * If the SM hasn't preserved our presence 
in MCGs, we need to + * rejoin all of them. + */ + if (EIB_PORT_PRES_NOT_PRESERVED(itr)) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: " + "hca_guid=0x%llx, port=0x%x presence not preserved in SM, " + "rejoining all mcgs", ss->ei_props->ep_hca_guid, + ss->ei_props->ep_port_num); + + all_need_rejoin = B_TRUE; + } + + /* + * Before we do the actual work of restarting/rejoining, we need to + * see if the GW is reachable at this point of time. If not, we + * still continue to keep our link "down." Whenever the GW becomes + * reachable again, we'll restart/rejoin all the vnics that we've + * just marked. + */ + mutex_enter(&ss->ei_vnic_lock); + if (all_zombies) { + ss->ei_zombie_vnics = ss->ei_active_vnics; + } + if (all_need_rejoin) { + ss->ei_rejoin_vnics = ss->ei_active_vnics; + } + if (ss->ei_gw_unreachable) { + mutex_exit(&ss->ei_vnic_lock); + + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_link_mod: " + "gateway (gw_port=0x%x) unreachable for " + "hca_guid=0x%llx, port=0x%x, link state down", + ss->ei_gw_props->pp_gw_portid, ss->ei_props->ep_hca_guid, + ss->ei_props->ep_port_num); + + eib_mac_link_down(ss, B_FALSE); + return; + } + mutex_exit(&ss->ei_vnic_lock); + + /* + * Try to awaken the dead if possible + */ + bcopy(eib_zero_mac, vn0_mac, ETHERADDRL); + if (all_zombies) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: " + "hca_guid=0x%llx, hca_port=0x%x, gw_port=0x%x, " + "attempting to resurrect zombies", + ss->ei_props->ep_hca_guid, ss->ei_props->ep_port_num, + ss->ei_gw_props->pp_gw_portid); + + eib_vnic_resurrect_zombies(ss, vn0_mac); + } + + /* + * Re-join the mcgs if we need to + */ + if (all_need_rejoin) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: " + "hca_guid=0x%llx, hca_port=0x%x, gw_port=0x%x, " + "attempting to rejoin mcgs", + ss->ei_props->ep_hca_guid, ss->ei_props->ep_port_num, + ss->ei_gw_props->pp_gw_portid); + + eib_vnic_rejoin_mcgs(ss); + } + + /* + * If we've restarted the zombies because the gateway 
went down and + * came back, it is possible our unicast mac address changed from + * what it was earlier. If so, we need to update our unicast address + * with the mac layer before marking the link up. + */ + if (bcmp(vn0_mac, eib_zero_mac, ETHERADDRL) != 0) + mac_unicst_update(ss->ei_mac_hdl, vn0_mac); + + /* + * Notify the link state up if required + */ + eib_mac_link_up(ss, B_FALSE); +} + +int +eib_ibt_modify_chan_pkey(eib_t *ss, eib_chan_t *chan, ib_pkey_t pkey) +{ + /* + * Make sure the channel pkey and index are set to what we need + */ + return (eib_ibt_chan_pkey(ss, chan, pkey, B_TRUE, NULL)); +} + +eib_avect_t * +eib_ibt_hold_avect(eib_t *ss, ib_lid_t dlid, uint8_t sl) +{ + uint_t ndx = dlid % EIB_AV_NBUCKETS; /* simple hashing */ + eib_avect_t *av; + eib_avect_t *prev; + int ret; + + mutex_enter(&ss->ei_av_lock); + + /* + * See if we have the address vector + */ + prev = NULL; + for (av = ss->ei_av[ndx]; av; av = av->av_next) { + prev = av; + if ((av->av_vect).av_dlid == dlid) + break; + } + + /* + * If we don't have it, create a new one and chain it to + * the same bucket + */ + if (av == NULL) { + av = kmem_zalloc(sizeof (eib_avect_t), KM_NOSLEEP); + if (av == NULL) { + mutex_exit(&ss->ei_av_lock); + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_hold_avect: " + "no memory, could not allocate address vector"); + return (NULL); + } + + ret = EIB_E_FAILURE; + if (!eib_wa_no_av_discover) + ret = eib_ibt_fill_avect(ss, av, dlid); + + if (ret != EIB_E_SUCCESS) { + (av->av_vect).av_srate = IBT_SRATE_10; + (av->av_vect).av_srvl = sl; + (av->av_vect).av_port_num = ss->ei_props->ep_port_num; + (av->av_vect).av_send_grh = B_FALSE; + (av->av_vect).av_dlid = dlid; + (av->av_vect).av_src_path = 0; /* we use base lid */ + } + + if (prev) + prev->av_next = av; + else + ss->ei_av[ndx] = av; + } + + /* + * Increment the address vector reference count before returning + */ + (av->av_ref)++; + + mutex_exit(&ss->ei_av_lock); + + return (av); +} + +static int 
+eib_ibt_fill_avect(eib_t *ss, eib_avect_t *av, ib_lid_t dlid)
+{
+	ibt_node_info_t ni;
+	ibt_path_attr_t attr;
+	ibt_path_info_t path;
+	ibt_status_t ret;
+	ib_gid_t dgid;
+
+	if ((ret = ibt_lid_to_node_info(dlid, &ni)) != IBT_SUCCESS) {
+		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_fill_avect: "
+		    "ibt_lid_to_node_info(dlid=0x%x) failed, ret=%d",
+		    dlid, ret);
+		return (EIB_E_FAILURE);
+	}
+	dgid.gid_prefix = ss->ei_gw_props->pp_gw_sn_prefix;
+	dgid.gid_guid = ni.n_port_guid;
+
+	/*
+	 * Get the reversible path information for this destination.
+	 * Note: zero attr with its own type's size; using the (larger)
+	 * ibt_path_info_t size here would overwrite adjacent stack.
+	 */
+	bzero(&attr, sizeof (ibt_path_attr_t));
+	attr.pa_sgid = ss->ei_props->ep_sgid;
+	attr.pa_dgids = &dgid;
+	attr.pa_num_dgids = 1;
+
+	bzero(&path, sizeof (ibt_path_info_t));
+	ret = ibt_get_paths(ss->ei_ibt_hdl, IBT_PATH_NO_FLAGS,
+	    &attr, 1, &path, NULL);
+	if ((ret != IBT_SUCCESS) || (path.pi_hca_guid == 0)) {
+		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_fill_avect: "
+		    "ibt_get_paths(dgid=%llx.%llx) failed, ret=%d",
+		    dgid.gid_prefix, dgid.gid_guid, ret);
+		return (EIB_E_FAILURE);
+	}
+
+	/*
+	 * Fill in the address vector
+	 */
+	bcopy(&path.pi_prim_cep_path.cep_adds_vect, &av->av_vect,
+	    sizeof (ibt_adds_vect_t));
+
+	return (EIB_E_SUCCESS);
+}
+
+/*
+ * Drop a reference on an address vector obtained via eib_ibt_hold_avect().
+ */
+void
+eib_ibt_release_avect(eib_t *ss, eib_avect_t *av)
+{
+	mutex_enter(&ss->ei_av_lock);
+
+	ASSERT(av->av_ref > 0);
+	(av->av_ref)--;
+
+	mutex_exit(&ss->ei_av_lock);
+}
+
+/*
+ * Free all cached address vectors; callers must have released every
+ * reference (asserted below) before this is invoked.
+ */
+void
+eib_ibt_free_avects(eib_t *ss)
+{
+	eib_avect_t *av;
+	eib_avect_t *av_next;
+	int ndx;
+
+	mutex_enter(&ss->ei_av_lock);
+	for (ndx = 0; ndx < EIB_AV_NBUCKETS; ndx++) {
+		for (av = ss->ei_av[ndx]; av; av = av_next) {
+			av_next = av->av_next;
+
+			ASSERT(av->av_ref == 0);
+			kmem_free(av, sizeof (eib_avect_t));
+		}
+		ss->ei_av[ndx] = NULL;
+	}
+	mutex_exit(&ss->ei_av_lock);
+}
+
+/*ARGSUSED*/
+void
+eib_ibt_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
+    ibt_async_code_t code, ibt_async_event_t *event)
+{
+	eib_t *ss = (eib_t *)clnt_private;
+	eib_event_t *evi;
+	uint_t ev_code;
+
+	ev_code
= EIB_EV_NONE; + + switch (code) { + case IBT_EVENT_SQD: + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: got IBT_EVENT_SQD"); + eib_ibt_wakeup_sqd_waiters(ss, event->ev_chan_hdl); + break; + + case IBT_EVENT_PORT_UP: + if (event->ev_port == ss->ei_props->ep_port_num) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: got IBT_EVENT_PORT_UP"); + ev_code = EIB_EV_PORT_UP; + } + break; + + case IBT_ERROR_PORT_DOWN: + if (event->ev_port == ss->ei_props->ep_port_num) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: got IBT_ERROR_PORT_DOWN"); + ev_code = EIB_EV_PORT_DOWN; + } + break; + + case IBT_CLNT_REREG_EVENT: + if (event->ev_port == ss->ei_props->ep_port_num) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: got IBT_CLNT_REREG_EVENT"); + ev_code = EIB_EV_CLNT_REREG; + } + break; + + case IBT_PORT_CHANGE_EVENT: + if ((event->ev_port == ss->ei_props->ep_port_num) && + (event->ev_port_flags & IBT_PORT_CHANGE_PKEY)) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: " + "got IBT_PORT_CHANGE_EVENT(PKEY_CHANGE)"); + ev_code = EIB_EV_PKEY_CHANGE; + } else if ((event->ev_port == ss->ei_props->ep_port_num) && + (event->ev_port_flags & IBT_PORT_CHANGE_SGID)) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_ibt_async_handler: " + "got IBT_PORT_CHANGE_EVENT(SGID_CHANGE)"); + ev_code = EIB_EV_SGID_CHANGE; + } + break; + + case IBT_HCA_ATTACH_EVENT: + /* + * For HCA attach, after a new HCA is plugged in and + * configured using cfgadm, an explicit plumb will need + * to be run, so we don't need to do anything here. + */ + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_async_handler: " + "got IBT_HCA_ATTACH_EVENT"); + break; + + case IBT_HCA_DETACH_EVENT: + /* + * Before an HCA unplug, cfgadm is expected to trigger + * any rcm scripts to unplumb the EoIB instances on the + * card. If so, we should not be holding any hca resource, + * since we don't do ibt_open_hca() until plumb time. 
However, + * if an earlier unplumb hadn't cleaned up the hca resources + * properly because the network layer hadn't returned the + * buffers at that time, we could be holding hca resources. + * We'll try to release them here, and protect the code from + * racing with some other plumb/unplumb operation. + */ + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_async_handler: " + "got IBT_HCA_DETACH_EVENT"); + + eib_mac_set_nic_state(ss, EIB_NIC_STOPPING); + eib_rb_rsrc_setup_bufs(ss, B_FALSE); + if (ss->ei_tx || ss->ei_rx || ss->ei_lso) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_events_handler: nw layer still holding " + "hca resources, could not detach HCA"); + } else if (ss->ei_hca_hdl) { + eib_rb_ibt_hca_init(ss, ~0); + } + eib_mac_clr_nic_state(ss, EIB_NIC_STOPPING); + + break; + } + + if (ev_code != EIB_EV_NONE) { + evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP); + if (evi == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_ibt_async_handler: " + "no memory, could not handle event 0x%lx", ev_code); + } else { + evi->ev_code = ev_code; + evi->ev_arg = NULL; + eib_svc_enqueue_event(ss, evi); + } + } +} + +/*ARGSUSED*/ +void +eib_ibt_record_capab(eib_t *ss, ibt_hca_attr_t *hca_attrs, eib_caps_t *caps) +{ + uint_t max_swqe = EIB_DATA_MAX_SWQE; + uint_t max_rwqe = EIB_DATA_MAX_RWQE; + + /* + * Checksum + */ + caps->cp_cksum_flags = 0; + if ((!eib_wa_no_cksum_offload) && + (hca_attrs->hca_flags & IBT_HCA_CKSUM_FULL)) { + caps->cp_cksum_flags = + HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; + /* HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM; */ + } + + /* + * Reserved L-Key + */ + if (hca_attrs->hca_flags2 & IBT_HCA2_RES_LKEY) { + caps->cp_resv_lkey_capab = 1; + caps->cp_resv_lkey = hca_attrs->hca_reserved_lkey; + } + + /* + * LSO + */ + caps->cp_lso_maxlen = 0; + if (!eib_wa_no_lso) { + if (hca_attrs->hca_max_lso_size > EIB_LSO_MAXLEN) { + caps->cp_lso_maxlen = EIB_LSO_MAXLEN; + } else { + caps->cp_lso_maxlen = hca_attrs->hca_max_lso_size; + } + } + + /* + * SGL + * + * 
Translating virtual address regions into physical regions + * for using the Reserved LKey feature results in a wr sgl that + * is a little longer. Since failing ibt_map_mem_iov() is costly, + * we'll record a high-water mark (65%) when we should stop + * trying to use Reserved LKey + */ + if (hca_attrs->hca_flags & IBT_HCA_WQE_SIZE_INFO) { + caps->cp_max_sgl = hca_attrs->hca_ud_send_sgl_sz; + } else { + caps->cp_max_sgl = hca_attrs->hca_max_sgl; + } + if (caps->cp_max_sgl > EIB_MAX_SGL) { + caps->cp_max_sgl = EIB_MAX_SGL; + } + caps->cp_hiwm_sgl = (caps->cp_max_sgl * 65) / 100; + + /* + * SWQE/RWQE: meet max chan size and max cq size limits (leave room + * to avoid cq overflow event) + */ + if (max_swqe > hca_attrs->hca_max_chan_sz) + max_swqe = hca_attrs->hca_max_chan_sz; + if (max_swqe > (hca_attrs->hca_max_cq_sz - 1)) + max_swqe = hca_attrs->hca_max_cq_sz - 1; + caps->cp_max_swqe = max_swqe; + + if (max_rwqe > hca_attrs->hca_max_chan_sz) + max_rwqe = hca_attrs->hca_max_chan_sz; + if (max_rwqe > (hca_attrs->hca_max_cq_sz - 1)) + max_rwqe = hca_attrs->hca_max_cq_sz - 1; + caps->cp_max_rwqe = max_rwqe; +} + +void +eib_rb_ibt_hca_init(eib_t *ss, uint_t progress) +{ + ibt_status_t ret; + + if (progress & EIB_HCAINIT_CAPAB_RECORDED) { + if (ss->ei_caps) { + kmem_free(ss->ei_caps, sizeof (eib_caps_t)); + ss->ei_caps = NULL; + } + } + + if (progress & EIB_HCAINIT_PD_ALLOCD) { + if (ss->ei_pd_hdl) { + ret = ibt_free_pd(ss->ei_hca_hdl, ss->ei_pd_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_ibt_hca_init: " + "ibt_free_pd(hca_hdl=0x%lx, pd_hdl=0x%lx) " + "failed, ret=%d", ss->ei_hca_hdl, + ss->ei_pd_hdl, ret); + } + ss->ei_pd_hdl = NULL; + } + } + + if (progress & EIB_HCAINIT_HCA_PORTS_QUERIED) { + ss->ei_props->ep_mtu = 0; + bzero(&ss->ei_props->ep_sgid, sizeof (ib_gid_t)); + } + + if (progress & EIB_HCAINIT_ATTRS_ALLOCD) { + kmem_free(ss->ei_hca_attrs, sizeof (ibt_hca_attr_t)); + ss->ei_hca_attrs = NULL; + } + + if (progress & 
EIB_HCAINIT_HCA_OPENED) { + ret = ibt_close_hca(ss->ei_hca_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "ibt_close_hca(hca_hdl=0x%lx) failed, " + "ret=%d", ss->ei_hca_hdl, ret); + } + ss->ei_hca_hdl = NULL; + } +} + +static void +eib_ibt_reset_partitions(eib_t *ss) +{ + eib_vnic_t *vnic; + eib_chan_t *chan = NULL; + uint64_t av; + int inst = 0; + + /* + * We already have the vhub pkey recorded in our eib_chan_t. + * We only need to make sure our pkey index still matches it. + * If not, modify the channel appropriately and update our + * records. + */ + if ((chan = ss->ei_admin_chan) != NULL) + (void) eib_ibt_modify_chan_pkey(ss, chan, chan->ch_pkey); + + mutex_enter(&ss->ei_vnic_lock); + av = ss->ei_active_vnics; + while ((inst = EIB_FIND_LSB_SET(av)) != -1) { + if ((vnic = ss->ei_vnic[inst]) != NULL) { + if ((chan = vnic->vn_ctl_chan) != NULL) { + (void) eib_ibt_modify_chan_pkey(ss, chan, + chan->ch_pkey); + } + if ((chan = vnic->vn_data_chan) != NULL) { + (void) eib_ibt_modify_chan_pkey(ss, chan, + chan->ch_pkey); + } + } + av &= (~((uint64_t)1 << inst)); + } + mutex_exit(&ss->ei_vnic_lock); +} + +static void +eib_ibt_wakeup_sqd_waiters(eib_t *ss, ibt_channel_hdl_t ev_chan_hdl) +{ + eib_vnic_t *vnic; + eib_chan_t *chan = NULL; + uint64_t av; + int inst = 0; + + /* + * See if this channel has been waiting for its queue to drain. + * + * Note that since this is especially likely to be called during + * logging in to the gateway, we also need to check the vnic + * currently being created. 
+ */ + mutex_enter(&ss->ei_vnic_lock); + + if ((vnic = ss->ei_vnic_pending) != NULL) { + chan = vnic->vn_ctl_chan; + if ((chan) && (chan->ch_chan == ev_chan_hdl)) + goto wakeup_sqd_waiters; + + chan = vnic->vn_data_chan; + if ((chan) && (chan->ch_chan == ev_chan_hdl)) + goto wakeup_sqd_waiters; + } + + av = ss->ei_active_vnics; + while ((inst = EIB_FIND_LSB_SET(av)) != -1) { + if ((vnic = ss->ei_vnic[inst]) != NULL) { + chan = vnic->vn_ctl_chan; + if (chan->ch_chan == ev_chan_hdl) + break; + + chan = vnic->vn_data_chan; + if (chan->ch_chan == ev_chan_hdl) + break; + } + av &= (~((uint64_t)1 << inst)); + } + +wakeup_sqd_waiters: + if (chan) { + mutex_enter(&chan->ch_cep_lock); + chan->ch_cep_state = IBT_STATE_SQD; + cv_broadcast(&chan->ch_cep_cv); + mutex_exit(&chan->ch_cep_lock); + } + + mutex_exit(&ss->ei_vnic_lock); +} + +static int +eib_ibt_chan_pkey(eib_t *ss, eib_chan_t *chan, ib_pkey_t new_pkey, + boolean_t set, boolean_t *pkey_changed) +{ + ibt_qp_info_t qp_attr; + ibt_status_t ret; + uint16_t new_pkey_ix; + + ret = ibt_pkey2index(ss->ei_hca_hdl, ss->ei_props->ep_port_num, + new_pkey, &new_pkey_ix); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: " + "ibt_pkey2index(hca_hdl=0x%llx, port_num=0x%x, " + "pkey=0x%x) failed, ret=%d", + ss->ei_hca_hdl, ss->ei_props->ep_port_num, new_pkey, ret); + return (EIB_E_FAILURE); + } + + /* + * If the pkey and the pkey index we have already matches the + * new one, nothing to do. + */ + mutex_enter(&chan->ch_pkey_lock); + if ((chan->ch_pkey == new_pkey) && (chan->ch_pkey_ix == new_pkey_ix)) { + if (pkey_changed) { + *pkey_changed = B_FALSE; + } + mutex_exit(&chan->ch_pkey_lock); + return (EIB_E_SUCCESS); + } + if (pkey_changed) { + *pkey_changed = B_TRUE; + } + mutex_exit(&chan->ch_pkey_lock); + + /* + * Otherwise, if we're asked only to test if the pkey index + * supplied matches the one recorded in the channel, return + * success, but don't set the pkey. 
+ */ + if (!set) { + return (EIB_E_SUCCESS); + } + + /* + * Otherwise, we need to change channel pkey. Pause the + * channel sendq first. + */ + ret = ibt_pause_sendq(chan->ch_chan, IBT_CEP_SET_SQD_EVENT); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: " + "ibt_pause_sendq(chan_hdl=0x%llx) failed, ret=%d", + chan->ch_chan, ret); + return (EIB_E_FAILURE); + } + + /* + * Wait for the channel to enter the IBT_STATE_SQD state + */ + mutex_enter(&chan->ch_cep_lock); + while (chan->ch_cep_state != IBT_STATE_SQD) + cv_wait(&chan->ch_cep_cv, &chan->ch_cep_lock); + mutex_exit(&chan->ch_cep_lock); + + /* + * Modify the qp with the supplied pkey index and unpause the channel + * If either of these operations fail, we'll leave the channel in + * the paused state and fail. + */ + bzero(&qp_attr, sizeof (ibt_qp_info_t)); + + qp_attr.qp_trans = IBT_UD_SRV; + qp_attr.qp_current_state = IBT_STATE_SQD; + qp_attr.qp_state = IBT_STATE_SQD; + qp_attr.qp_transport.ud.ud_pkey_ix = new_pkey_ix; + + /* + * Modify the qp to set the new pkey index, then unpause the + * channel and put it in RTS state and update the new values + * in our records + */ + mutex_enter(&chan->ch_pkey_lock); + + ret = ibt_modify_qp(chan->ch_chan, + IBT_CEP_SET_STATE | IBT_CEP_SET_PKEY_IX, &qp_attr, NULL); + if (ret != IBT_SUCCESS) { + mutex_exit(&chan->ch_pkey_lock); + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: " + "ibt_modify_qp(chan_hdl=0x%llx, IBT_CEP_SET_PKEY_IX) " + "failed for new_pkey_ix=0x%x, ret=%d", + chan->ch_chan, new_pkey_ix, ret); + return (EIB_E_FAILURE); + } + + if ((ret = ibt_unpause_sendq(chan->ch_chan)) != IBT_SUCCESS) { + mutex_exit(&chan->ch_pkey_lock); + EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: " + "ibt_unpause_sendq(chan_hdl=0x%llx) failed, ret=%d", + chan->ch_chan, ret); + return (EIB_E_FAILURE); + } + + chan->ch_pkey = new_pkey; + chan->ch_pkey_ix = new_pkey_ix; + mutex_exit(&chan->ch_pkey_lock); + + return (EIB_E_SUCCESS); +} + 
+static boolean_t +eib_ibt_has_chan_pkey_changed(eib_t *ss, eib_chan_t *chan) +{ + boolean_t changed; + int ret; + + /* + * Don't modify the pkey, just ask if the pkey index for the channel's + * pkey has changed for any reason. If we fail, assume that the pkey + * has changed. + */ + ret = eib_ibt_chan_pkey(ss, chan, chan->ch_pkey, B_FALSE, &changed); + if (ret != EIB_E_SUCCESS) + changed = B_TRUE; + + return (changed); +} + +static boolean_t +eib_ibt_has_any_pkey_changed(eib_t *ss) +{ + eib_vnic_t *vnic; + eib_chan_t *chan = NULL; + uint64_t av; + int inst = 0; + + /* + * Return true if the pkey index of any our pkeys (of the channels + * of all active vnics) has changed. + */ + + chan = ss->ei_admin_chan; + if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan))) + return (B_TRUE); + + mutex_enter(&ss->ei_vnic_lock); + av = ss->ei_active_vnics; + while ((inst = EIB_FIND_LSB_SET(av)) != -1) { + if ((vnic = ss->ei_vnic[inst]) != NULL) { + chan = vnic->vn_ctl_chan; + if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan))) + return (B_TRUE); + + chan = vnic->vn_data_chan; + if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan))) + return (B_TRUE); + } + av &= (~((uint64_t)1 << inst)); + } + mutex_exit(&ss->ei_vnic_lock); + + return (B_FALSE); +} + +/* + * This routine is currently used simply to derive and record the port + * speed from the loopback path information (for debug purposes). For + * EoIB, currently the srate used in address vectors to IB neighbors + * and the gateway is fixed at IBT_SRATE_10. Eventually though, this + * information (and sl) has to come from the gateway for all destinations + * in the vhub table. 
+ */ +static void +eib_ibt_record_srate(eib_t *ss) +{ + ib_gid_t sgid = ss->ei_props->ep_sgid; + ibt_srate_t srate = IBT_SRATE_10; + ibt_path_info_t path; + ibt_path_attr_t path_attr; + ibt_status_t ret; + uint8_t num_paths; + + bzero(&path_attr, sizeof (path_attr)); + path_attr.pa_dgids = &sgid; + path_attr.pa_num_dgids = 1; + path_attr.pa_sgid = sgid; + + ret = ibt_get_paths(ss->ei_ibt_hdl, IBT_PATH_NO_FLAGS, + &path_attr, 1, &path, &num_paths); + if (ret == IBT_SUCCESS && num_paths >= 1) { + switch (srate = path.pi_prim_cep_path.cep_adds_vect.av_srate) { + case IBT_SRATE_2: + case IBT_SRATE_10: + case IBT_SRATE_30: + case IBT_SRATE_5: + case IBT_SRATE_20: + case IBT_SRATE_40: + case IBT_SRATE_60: + case IBT_SRATE_80: + case IBT_SRATE_120: + break; + default: + srate = IBT_SRATE_10; + } + } + + ss->ei_props->ep_srate = srate; + + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_ibt_record_srate: " + "srate = %d", srate); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_log.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,304 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/varargs.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Defaults + */ +uint_t eib_log_size = EIB_LOGSZ_DEFAULT; +int eib_log_level = EIB_MSGS_DEFAULT | EIB_MSGS_DEBUG; +int eib_log_timestamps = 0; + +/* + * Debug variables, should not be tunables so allocated debug buffer + * and its size remain consistent. 
+ */ +static kmutex_t eib_debug_buf_lock; +static uint8_t *eib_debug_buf; +static uint32_t eib_debug_buf_ndx; +static uint_t eib_debug_buf_sz = 0; + +/* + * Local declarations + */ +static void eib_log(char *); + +void +eib_debug_init(void) +{ + eib_debug_buf_ndx = 0; + eib_debug_buf_sz = eib_log_size; + eib_debug_buf = kmem_zalloc(eib_debug_buf_sz, KM_SLEEP); + + mutex_init(&eib_debug_buf_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +eib_debug_fini(void) +{ + mutex_destroy(&eib_debug_buf_lock); + + if (eib_debug_buf && eib_debug_buf_sz) { + kmem_free(eib_debug_buf, eib_debug_buf_sz); + eib_debug_buf = NULL; + } + eib_debug_buf_sz = 0; + eib_debug_buf_ndx = 0; +} + +void +eib_log(char *msg) +{ + uint32_t off; + int msglen; + char msgbuf[EIB_MAX_LINE]; + + if (eib_debug_buf == NULL) + return; + + if (eib_log_timestamps) { + msglen = snprintf(msgbuf, EIB_MAX_LINE, "%llx: %s", + (unsigned long long)ddi_get_lbolt64(), msg); + } else { + msglen = snprintf(msgbuf, EIB_MAX_LINE, "%s", msg); + } + + if (msglen < 0) + return; + else if (msglen >= EIB_MAX_LINE) + msglen = EIB_MAX_LINE - 1; + + mutex_enter(&eib_debug_buf_lock); + if ((eib_debug_buf_ndx == 0) || + (eib_debug_buf[eib_debug_buf_ndx-1] != '\n')) { + eib_debug_buf[eib_debug_buf_ndx] = '\n'; + eib_debug_buf_ndx++; + } + + off = eib_debug_buf_ndx; /* current msg should go here */ + + eib_debug_buf_ndx += msglen; /* next msg should start here */ + eib_debug_buf[eib_debug_buf_ndx] = 0; /* terminate current msg */ + + if (eib_debug_buf_ndx >= (eib_debug_buf_sz - 2 * EIB_MAX_LINE)) + eib_debug_buf_ndx = 0; + + mutex_exit(&eib_debug_buf_lock); + + bcopy(msgbuf, eib_debug_buf+off, msglen); /* no lock needed */ +} + +#ifdef EIB_DEBUG +void +eib_dprintf_verbose(int inst, const char *fmt, ...) 
+{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_VERBOSE) != EIB_MSGS_VERBOSE) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + } +} + +void +eib_dprintf_pkt(int inst, uint8_t *pkt, uint_t sz) +{ + char msgbuf[EIB_MAX_LINE]; + char *bufp; + uint8_t *p = pkt; + uint_t len; + uint_t i; + + if ((eib_log_level & EIB_MSGS_PKT) != EIB_MSGS_PKT) + return; + + while (sz >= 16) { + (void) snprintf(msgbuf, EIB_MAX_LINE, + "eoib%02d__%02x %02x %02x %02x %02x %02x %02x %02x " + "%02x %02x %02x %02x %02x %02x %02x %02x\n", inst, + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); + + eib_log(msgbuf); + + p += 16; + sz -= 16; + } + + len = EIB_MAX_LINE; + bufp = msgbuf; + for (i = 0; i < sz; i++) { + if (i == 0) { + (void) snprintf(bufp, len, "eoib%02d__%02x ", + inst, p[i]); + len -= 11; + bufp += 11; + } else if (i < (sz - 1)) { + (void) snprintf(bufp, len, "%02x ", p[i]); + len -= 3; + bufp += 3; + } else { + (void) snprintf(bufp, len, "%02x\n", p[i]); + len -= 3; + bufp += 3; + } + } + + eib_log(msgbuf); +} + +void +eib_dprintf_args(int inst, const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_ARGS) != EIB_MSGS_ARGS) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + } +} + +void +eib_dprintf_debug(int inst, const char *fmt, ...) 
+{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_DEBUG) != EIB_MSGS_DEBUG) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + } +} +#endif + +void +eib_dprintf_warn(int inst, const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_WARN) != EIB_MSGS_WARN) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + } +} + +void +eib_dprintf_err(int inst, const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_ERR) != EIB_MSGS_ERR) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + cmn_err(CE_WARN, "!%s\n", msgbuf); + } +} + +void +eib_dprintf_crit(int inst, const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[EIB_MAX_LINE]; + char newfmt[EIB_MAX_LINE]; + + if ((eib_log_level & EIB_MSGS_CRIT) != EIB_MSGS_CRIT) + return; + + (void) snprintf(newfmt, EIB_MAX_LINE, "eoib%d__%s", inst, fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, EIB_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eib_log(msgbuf); + cmn_err(CE_PANIC, "!%s\n", msgbuf); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_mac.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,532 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/mac_provider.h> +#include <sys/mac_ether.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static void eib_rb_mac_start(eib_t *, eib_vnic_t *); + +/* + * This set of routines are used to set/clear the condition that the + * caller is about to do something that affects the state of the nic. + * If there's already someone doing either a start or a stop (possibly + * due to the async handler, a plumb or a dlpi_open happening, or an + * unplumb or dlpi_close coming in), we wait until that's done. 
 */
/*
 * eib_mac_set_nic_state() -- mark the nic as entering a state transition.
 * Blocks (cv_wait on ns_cv) while any other thread has STARTING or
 * STOPPING set, so start/stop sequences never interleave, then ORs the
 * caller's flags into ns_nic_state.
 */
void
eib_mac_set_nic_state(eib_t *ss, uint_t flags)
{
	eib_node_state_t *ns = ss->ei_node_state;

	mutex_enter(&ns->ns_lock);

	while ((ns->ns_nic_state & EIB_NIC_STARTING) ||
	    (ns->ns_nic_state & EIB_NIC_STOPPING)) {
		cv_wait(&ns->ns_cv, &ns->ns_lock);
	}
	ns->ns_nic_state |= flags;

	mutex_exit(&ns->ns_lock);
}

/*
 * eib_mac_clr_nic_state() -- clear the given state flags and wake any
 * threads blocked in eib_mac_set_nic_state() waiting for the transition
 * to finish.
 */
void
eib_mac_clr_nic_state(eib_t *ss, uint_t flags)
{
	eib_node_state_t *ns = ss->ei_node_state;

	mutex_enter(&ns->ns_lock);

	ns->ns_nic_state &= (~flags);

	cv_broadcast(&ns->ns_cv);
	mutex_exit(&ns->ns_lock);
}

/*
 * eib_mac_upd_nic_state() -- atomically clear one set of flags and set
 * another (e.g. STARTING -> STARTED) under ns_lock, then wake waiters.
 * Unlike eib_mac_set_nic_state(), this does not wait for in-progress
 * transitions; callers use it to complete their own transition.
 */
void
eib_mac_upd_nic_state(eib_t *ss, uint_t clr_flags, uint_t set_flags)
{
	eib_node_state_t *ns = ss->ei_node_state;

	mutex_enter(&ns->ns_lock);

	ns->ns_nic_state &= (~clr_flags);
	ns->ns_nic_state |= set_flags;

	cv_broadcast(&ns->ns_cv);
	mutex_exit(&ns->ns_lock);
}

/*
 * eib_mac_get_nic_state() -- return a locked snapshot of ns_nic_state.
 * The value may of course be stale by the time the caller examines it.
 */
uint_t
eib_mac_get_nic_state(eib_t *ss)
{
	eib_node_state_t *ns = ss->ei_node_state;
	uint_t nic_state;

	mutex_enter(&ns->ns_lock);
	nic_state = ns->ns_nic_state;
	mutex_exit(&ns->ns_lock);

	return (nic_state);
}

/*
 * eib_mac_link_state() -- record a link state change and, if the tracked
 * state actually changed, notify the mac layer via mac_link_update().
 */
void
eib_mac_link_state(eib_t *ss, link_state_t new_link_state,
    boolean_t force)
{
	eib_node_state_t *ns = ss->ei_node_state;
	boolean_t state_changed = B_FALSE;

	mutex_enter(&ns->ns_lock);

	/*
	 * We track the link state only if the current link state is
	 * not unknown. Obviously therefore, the first calls to set
	 * the link state from eib_mac_start() have to pass an explicit
	 * 'force' flag to force the state change tracking.
+ */ + if (ns->ns_link_state != LINK_STATE_UNKNOWN) + force = B_TRUE; + + if ((force) && (new_link_state != ns->ns_link_state)) { + ns->ns_link_state = new_link_state; + state_changed = B_TRUE; + } + mutex_exit(&ns->ns_lock); + + if (state_changed) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_mac_link_state: changing link state to %d", + new_link_state); + + mac_link_update(ss->ei_mac_hdl, new_link_state); + } else { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_mac_link_state: link state already %d", + new_link_state); + } +} + +void +eib_mac_link_up(eib_t *ss, boolean_t force) +{ + eib_mac_link_state(ss, LINK_STATE_UP, force); +} + +void +eib_mac_link_down(eib_t *ss, boolean_t force) +{ + eib_mac_link_state(ss, LINK_STATE_DOWN, force); +} + +int +eib_mac_start(eib_t *ss) +{ + eib_vnic_t *vnic0 = NULL; + eib_login_data_t *ld; + int err; + + /* + * Perform HCA related initializations + */ + if (eib_ibt_hca_init(ss) != EIB_E_SUCCESS) + goto start_fail; + + /* + * Make sure port is up. Also record the port base lid if it's up. + */ + if (eib_mac_hca_portstate(ss, &ss->ei_props->ep_blid, + &err) != EIB_E_SUCCESS) { + goto start_fail; + } + + /* + * Set up tx and rx buffer pools + */ + if (eib_rsrc_setup_bufs(ss, &err) != EIB_E_SUCCESS) + goto start_fail; + + /* + * Set up admin qp for logins and logouts + */ + if (eib_adm_setup_qp(ss, &err) != EIB_E_SUCCESS) + goto start_fail; + + /* + * Create the vnic for physlink (instance 0) + */ + if (eib_vnic_create(ss, 0, 0, &vnic0, &err) != EIB_E_SUCCESS) + goto start_fail; + + /* + * Update the mac layer about the correct values for MTU and + * unicast MAC address. Note that we've already verified that the + * vhub mtu (plus the eoib encapsulation header) is not greater + * than our port mtu, so we can go ahead and report the vhub mtu + * (of vnic0) directly. 
+ */ + ld = &(vnic0->vn_login_data); + (void) mac_maxsdu_update(ss->ei_mac_hdl, ld->ld_vhub_mtu); + mac_unicst_update(ss->ei_mac_hdl, ld->ld_assigned_mac); + + /* + * Report that the link is up and ready + */ + eib_mac_link_up(ss, B_TRUE); + return (0); + +start_fail: + eib_rb_mac_start(ss, vnic0); + eib_mac_link_down(ss, B_TRUE); + return (err); +} + +void +eib_mac_stop(eib_t *ss) +{ + eib_vnic_t *vnic; + link_state_t cur_link_state = ss->ei_node_state->ns_link_state; + int ndx; + + /* + * Stopping an EoIB device instance is somewhat different from starting + * it. Between the time the device instance was started and the call to + * eib_m_stop() now, a number of vnics could've been created. All of + * these will need to be destroyed before we can stop the device. + */ + for (ndx = EIB_MAX_VNICS - 1; ndx >= 0; ndx--) { + if ((vnic = ss->ei_vnic[ndx]) != NULL) + eib_vnic_delete(ss, vnic); + } + + /* + * And now, to undo the things we did in start (other than creation + * of vnics itself) + */ + eib_rb_mac_start(ss, NULL); + + /* + * Now that we're completed stopped, there's no mac address assigned + * to us. Update the mac layer with this information. Note that we + * can let the old max mtu information remain as-is, since we're likely + * to get that same mtu on a later plumb. + */ + mac_unicst_update(ss->ei_mac_hdl, eib_zero_mac); + + /* + * If our link state was up when the eib_m_stop() callback was called, + * we'll mark the link state as unknown now. Otherwise, we'll leave + * the link state as-is (down). + */ + if (cur_link_state == LINK_STATE_UP) + eib_mac_link_state(ss, LINK_STATE_UNKNOWN, B_TRUE); +} + +int +eib_mac_multicast(eib_t *ss, boolean_t add, uint8_t *mcast_mac) +{ + int ret = EIB_E_SUCCESS; + int err = 0; + + /* + * If it's a broadcast group join, each vnic needs to and is always + * joined to the broadcast address, so we return success immediately. + * If it's a broadcast group leave, we fail immediately for the same + * reason as above. 
+ */ + if (bcmp(mcast_mac, eib_broadcast_mac, ETHERADDRL) == 0) { + if (add) + return (0); + else + return (EINVAL); + } + + if (ss->ei_vnic[0]) { + if (add) { + ret = eib_vnic_join_data_mcg(ss, ss->ei_vnic[0], + mcast_mac, B_FALSE, &err); + } else { + eib_vnic_leave_data_mcg(ss, ss->ei_vnic[0], mcast_mac); + ret = EIB_E_SUCCESS; + } + } + + if (ret == EIB_E_SUCCESS) + return (0); + else + return (err); +} + +int +eib_mac_promisc(eib_t *ss, boolean_t set) +{ + int ret = EIB_E_SUCCESS; + int err = 0; + + if (ss->ei_vnic[0]) { + if (set) { + ret = eib_vnic_join_data_mcg(ss, ss->ei_vnic[0], + eib_zero_mac, B_FALSE, &err); + } else { + eib_vnic_leave_data_mcg(ss, ss->ei_vnic[0], + eib_zero_mac); + ret = EIB_E_SUCCESS; + } + } + + if (ret == EIB_E_SUCCESS) + return (0); + else + return (err); +} + +int +eib_mac_tx(eib_t *ss, mblk_t *mp) +{ + eib_ether_hdr_t evh; + eib_vnic_t *vnic = NULL; + eib_wqe_t *swqe = NULL; + boolean_t failed_vnic; + int found; + int ret; + + /* + * Grab a send wqe. If we cannot get one, wake up a service + * thread to monitor the swqe status and let the mac layer know + * as soon as we have enough tx wqes to start the traffic again. + */ + if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_LO)) == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_mac_tx: " + "no swqe available, holding tx until resource " + "becomes available"); + eib_rsrc_txwqes_needed(ss); + return (EIB_E_FAILURE); + } + + /* + * Determine dmac, smac and vlan information + */ + eib_data_parse_ether_hdr(mp, &evh); + + /* + * Lookup the {smac, vlan} tuple in our vnic list. If it isn't + * there, this is obviously a new packet on a vnic/vlan that + * we haven't been informed about. So go ahead and file a request + * to create a new vnic. This is obviously not a clean thing to + * do - we should be informed when a vnic/vlan is being created + * and should be given a proper opportunity to login to the gateway + * and do the creation. 
But we don't have that luxury now, and + * this is the next best thing to do. Note that we return failure + * from here, so tx flow control should prevent further packets + * from coming in until the vnic creation has completed. + */ + found = eib_data_lookup_vnic(ss, evh.eh_smac, evh.eh_vlan, &vnic, + &failed_vnic); + if (found != EIB_E_SUCCESS) { + uint8_t *m = evh.eh_smac; + + /* + * Return the swqe back to the pool + */ + eib_rsrc_return_swqe(ss, swqe, NULL); + + /* + * If we had previously tried creating this vnic and had + * failed, we'll simply drop the packets on this vnic. + * Otherwise, we'll queue up a request to create this vnic. + */ + if (failed_vnic) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_mac_tx: " + "vnic creation for mac=%x:%x:%x:%x:%x:%x " + "vlan=0x%x failed previously, dropping pkt", + m[0], m[1], m[2], m[3], m[4], m[5], evh.eh_vlan); + return (EIB_E_SUCCESS); + } else { + eib_vnic_need_new(ss, evh.eh_smac, evh.eh_vlan); + return (EIB_E_FAILURE); + } + } + + /* + * We'll try to setup the destination in the swqe for this dmac + * and vlan. If we don't succeed, there's no need to undo any + * vnic-creation we might've made above (if we didn't find the + * vnic corresponding to the {smac, vlan} originally). Note that + * this is not a resource issue, so we'll issue a warning and + * drop the packet, but won't return failure from here. + */ + ret = eib_vnic_setup_dest(vnic, swqe, evh.eh_dmac, evh.eh_vlan); + if (ret != EIB_E_SUCCESS) { + uint8_t *dmac; + + dmac = evh.eh_dmac; + EIB_DPRINTF_WARN(ss->ei_instance, "eib_mac_tx: " + "eib_vnic_setup_dest() failed for mac=%x:%x:%x:%x:%x:%x, " + "vlan=0x%x, dropping pkt", dmac[0], dmac[1], dmac[2], + dmac[3], dmac[4], dmac[5]); + + eib_rsrc_return_swqe(ss, swqe, NULL); + return (EIB_E_SUCCESS); + } + + /* + * The only reason why this would fail is if we needed LSO buffer(s) + * to prepare this frame and couldn't find enough of those. 
+ */ + ret = eib_data_prepare_frame(vnic, swqe, mp, &evh); + if (ret != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_mac_tx: " + "eib_data_prepare_frame() failed (no LSO bufs?), " + "holding tx until resource becomes available"); + + eib_rsrc_return_swqe(ss, swqe, NULL); + eib_rsrc_lsobufs_needed(ss); + return (EIB_E_FAILURE); + } + + eib_data_post_tx(vnic, swqe); + + return (EIB_E_SUCCESS); +} + +int +eib_mac_hca_portstate(eib_t *ss, ib_lid_t *blid, int *err) +{ + ibt_hca_portinfo_t *pi; + ibt_status_t ret; + uint_t num_pi; + uint_t sz_pi; + + ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num, + &pi, &num_pi, &sz_pi); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_query_hca_ports(hca_hdl=0x%llx, " + "port=0x%x) failed, ret=%d", ss->ei_hca_hdl, + ss->ei_props->ep_port_num, ret); + goto mac_hca_portstate_fail; + } + if (num_pi != 1) { + EIB_DPRINTF_ERR(ss->ei_instance, + "ibt_query_hca_ports(hca_hdl=0x%llx, " + "port=0x%x) returned num_pi=%d", ss->ei_hca_hdl, + ss->ei_props->ep_port_num, num_pi); + goto mac_hca_portstate_fail; + } + + if (pi->p_linkstate != IBT_PORT_ACTIVE) + goto mac_hca_portstate_fail; + + /* + * Return the port's base lid if asked + */ + if (blid) { + *blid = pi->p_base_lid; + } + + ibt_free_portinfo(pi, sz_pi); + return (EIB_E_SUCCESS); + +mac_hca_portstate_fail: + if (pi) { + ibt_free_portinfo(pi, sz_pi); + } + if (err) { + *err = ENETDOWN; + } + return (EIB_E_FAILURE); +} + +static void +eib_rb_mac_start(eib_t *ss, eib_vnic_t *vnic0) +{ + int ntries; + + /* + * If vnic0 is non-null, delete it + */ + if (vnic0) { + eib_rb_vnic_create(ss, vnic0, ~0); + } + + /* + * At this point, we're pretty much done with all communication that + * we need to do for vnic-logout, etc. so we can get rid of any address + * vectors we might've allocated to send control/data packets. 
+ */ + eib_ibt_free_avects(ss); + + /* + * Tear down the rest of it + */ + if (ss->ei_admin_chan) { + eib_rb_adm_setup_qp(ss); + } + + /* + * If (say) the network layer has been holding onto our rx buffers, we + * wait a reasonable time for it to hand them back to us. If we don't + * get it still, we have nothing to do but avoid rolling back hca init + * since we cannot unregister the memory, release the pd or close the + * hca. We'll try to reuse it if there's a plumb again. + */ + for (ntries = 0; ntries < EIB_MAX_ATTEMPTS; ntries++) { + eib_rb_rsrc_setup_bufs(ss, B_FALSE); + if ((ss->ei_tx == NULL) && (ss->ei_rx == NULL) && + (ss->ei_lso == NULL)) { + break; + } + + delay(drv_usectohz(EIB_DELAY_HALF_SECOND)); + } + + if (ntries == EIB_MAX_ATTEMPTS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_rb_mac_start: " + "bufs outstanding, tx=0x%llx, rx=0x%llx, lso=0x%llx", + ss->ei_tx, ss->ei_rx, ss->ei_lso); + } else if (ss->ei_hca_hdl) { + eib_rb_ibt_hca_init(ss, ~0); + } + ss->ei_props->ep_blid = 0; + + /* + * Pending vnic creation requests (and failed-vnic records) will have + * to be cleaned up in any case + */ + eib_flush_vnic_reqs(ss); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_main.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,977 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +/* + * The Ethernet Over Infiniband driver + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#include <sys/mac_provider.h> +#include <sys/mac_ether.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Driver entry point declarations + */ +static int eib_attach(dev_info_t *, ddi_attach_cmd_t); +static int eib_detach(dev_info_t *, ddi_detach_cmd_t); + +/* + * MAC callbacks + */ +static int eib_m_stat(void *, uint_t, uint64_t *); +static int eib_m_start(void *); +static void eib_m_stop(void *); +static int eib_m_promisc(void *, boolean_t); +static int eib_m_multicast(void *, boolean_t, const uint8_t *); +static int eib_m_unicast(void *, const uint8_t *); +static mblk_t *eib_m_tx(void *, mblk_t *); +static boolean_t eib_m_getcapab(void *, mac_capab_t, void *); +static int eib_m_setprop(void *, const char *, mac_prop_id_t, uint_t, + const void *); +static int eib_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); +static void eib_m_propinfo(void *, const char *, mac_prop_id_t, + mac_prop_info_handle_t); + +/* + * Devops definition + */ +DDI_DEFINE_STREAM_OPS(eib_ops, nulldev, nulldev, eib_attach, eib_detach, + nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); + +/* + * Module Driver Info + */ +static struct modldrv eib_modldrv = { + &mod_driverops, /* Driver module */ + "EoIB Driver", /* Driver name and version */ + &eib_ops, /* Driver ops */ +}; + +/* + * Module Linkage + */ +static struct modlinkage eib_modlinkage = { + MODREV_1, (void *)&eib_modldrv, NULL +}; + +/* + * GLDv3 entry points + */ +#define EIB_M_CALLBACK_FLAGS \ + (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) +static mac_callbacks_t eib_m_callbacks = { + EIB_M_CALLBACK_FLAGS, + eib_m_stat, + eib_m_start, + eib_m_stop, + eib_m_promisc, + eib_m_multicast, + eib_m_unicast, + eib_m_tx, + NULL, + NULL, + 
eib_m_getcapab, + NULL, + NULL, + eib_m_setprop, + eib_m_getprop, + eib_m_propinfo +}; + +/* + * Async handler callback for ibt events + */ +static ibt_clnt_modinfo_t eib_clnt_modinfo = { + IBTI_V_CURR, + IBT_NETWORK, + eib_ibt_async_handler, + NULL, + EIB_DRV_NAME +}; + +/* + * Driver State Pointer + */ +void *eib_state; + +/* + * Declarations private to this file + */ +static int eib_state_init(eib_t *); +static int eib_add_event_callbacks(eib_t *); +static int eib_register_with_mac(eib_t *, dev_info_t *); +static void eib_rb_attach(eib_t *, uint_t); +static void eib_rb_state_init(eib_t *); +static void eib_rb_add_event_callbacks(eib_t *); +static void eib_rb_register_with_mac(eib_t *); + +/* + * Definitions private to this file + */ +#define EIB_ATTACH_STATE_ALLOCD 0x01 +#define EIB_ATTACH_PROPS_PARSED 0x02 +#define EIB_ATTACH_STATE_INIT_DONE 0x04 +#define EIB_ATTACH_IBT_ATT_DONE 0x08 +#define EIB_ATTACH_EV_CBS_ADDED 0x10 +#define EIB_ATTACH_REGISTER_MAC_DONE 0x20 + +int +_init() +{ + int ret; + + if (ddi_name_to_major(EIB_DRV_NAME) == (major_t)-1) + return (ENODEV); + + if ((ret = ddi_soft_state_init(&eib_state, sizeof (eib_t), 0)) != 0) + return (ret); + + mac_init_ops(&eib_ops, EIB_DRV_NAME); + if ((ret = mod_install(&eib_modlinkage)) != 0) { + mac_fini_ops(&eib_ops); + ddi_soft_state_fini(&eib_state); + return (ret); + } + + eib_debug_init(); + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&eib_modlinkage, modinfop)); +} + +int +_fini() +{ + int ret; + + if ((ret = mod_remove(&eib_modlinkage)) != 0) + return (ret); + + eib_debug_fini(); + + mac_fini_ops(&eib_ops); + ddi_soft_state_fini(&eib_state); + + return (ret); +} + +static int +eib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + eib_t *ss; + ibt_status_t ret; + int instance; + uint_t progress = 0; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Allocate softstate for this instance + */ + instance = ddi_get_instance(dip); + if 
(ddi_soft_state_zalloc(eib_state, instance) == DDI_FAILURE) + goto attach_fail; + + progress |= EIB_ATTACH_STATE_ALLOCD; + + ss = ddi_get_soft_state(eib_state, instance); + ss->ei_dip = dip; + ss->ei_instance = (uint_t)instance; + + /* + * Parse the node properties and get the gateway parameters + * for this instance + */ + if (eib_get_props(ss) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_attach: eib_get_props() failed"); + goto attach_fail; + } + progress |= EIB_ATTACH_PROPS_PARSED; + + /* + * Do per-state initialization + */ + if (eib_state_init(ss) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_attach: eib_state_init() failed"); + goto attach_fail; + } + progress |= EIB_ATTACH_STATE_INIT_DONE; + + /* + * Attach to IBTL + */ + if ((ret = ibt_attach(&eib_clnt_modinfo, ss->ei_dip, ss, + &ss->ei_ibt_hdl)) != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_attach: ibt_attach() failed, ret=%d", ret); + goto attach_fail; + } + progress |= EIB_ATTACH_IBT_ATT_DONE; + + /* + * Register NDI event callbacks with EoIB nexus + */ + if (eib_add_event_callbacks(ss) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_attach: eib_add_event_callbacks() failed"); + goto attach_fail; + } + progress |= EIB_ATTACH_EV_CBS_ADDED; + + /* + * Register with mac layer + */ + if (eib_register_with_mac(ss, dip) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_attach: eib_register_with_mac() failed"); + goto attach_fail; + } + progress |= EIB_ATTACH_REGISTER_MAC_DONE; + + return (DDI_SUCCESS); + +attach_fail: + eib_rb_attach(ss, progress); + return (DDI_FAILURE); +} + +static int +eib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + eib_t *ss; + int instance; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + ss = ddi_get_soft_state(eib_state, instance); + + /* + * If we had not cleaned up rx buffers (and hca resources) during + * unplumb because they were stuck with the nw layer at the 
time, + * we can try to clean them up now before doing the detach. + */ + eib_mac_set_nic_state(ss, EIB_NIC_STOPPING); + + eib_rb_rsrc_setup_bufs(ss, B_FALSE); + if (ss->ei_tx || ss->ei_rx || ss->ei_lso) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_detach: buffers still not returned " + "(tx=0x%llx, rx=0x%llx, lso=0x%llx), could " + "not detach", ss->ei_tx, ss->ei_rx, ss->ei_lso); + eib_mac_clr_nic_state(ss, EIB_NIC_STOPPING); + return (DDI_FAILURE); + } + if (ss->ei_hca_hdl) { + eib_rb_ibt_hca_init(ss, ~0); + } + eib_mac_clr_nic_state(ss, EIB_NIC_STOPPING); + + eib_rb_attach(ss, ~0); + + return (DDI_SUCCESS); +} + +static int +eib_m_stat(void *arg, uint_t stat, uint64_t *val) +{ + eib_t *ss = arg; + eib_stats_t *stats = ss->ei_stats; + + switch (stat) { + case MAC_STAT_IFSPEED: + *val = ss->ei_props->ep_ifspeed; + break; + + case MAC_STAT_OBYTES: + *val = stats->st_obytes; + break; + + case MAC_STAT_OPACKETS: + *val = stats->st_opkts; + break; + + case MAC_STAT_BRDCSTXMT: + *val = stats->st_brdcstxmit; + break; + + case MAC_STAT_MULTIXMT: + *val = stats->st_multixmit; + break; + + case MAC_STAT_OERRORS: + *val = stats->st_oerrors; + break; + + case MAC_STAT_NOXMTBUF: + *val = stats->st_noxmitbuf; + break; + + case MAC_STAT_RBYTES: + *val = stats->st_rbytes; + break; + + case MAC_STAT_IPACKETS: + *val = stats->st_ipkts; + break; + + case MAC_STAT_BRDCSTRCV: + *val = stats->st_brdcstrcv; + break; + + case MAC_STAT_MULTIRCV: + *val = stats->st_multircv; + break; + + case MAC_STAT_IERRORS: + *val = stats->st_ierrors; + break; + + case MAC_STAT_NORCVBUF: + *val = stats->st_norcvbuf; + break; + + case ETHER_STAT_LINK_DUPLEX: + *val = LINK_DUPLEX_FULL; + break; + + default: + return (ENOTSUP); + } + + return (0); +} + +static int +eib_m_start(void *arg) +{ + eib_t *ss = arg; + int ret = -1; + + eib_mac_set_nic_state(ss, EIB_NIC_STARTING); + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) + ret = eib_mac_start(ss); + + if (ret == 0) + 
eib_mac_upd_nic_state(ss, EIB_NIC_STARTING, EIB_NIC_STARTED); + else + eib_mac_clr_nic_state(ss, EIB_NIC_STARTING); + + return (ret); +} + +static void +eib_m_stop(void *arg) +{ + eib_t *ss = arg; + + eib_mac_set_nic_state(ss, EIB_NIC_STOPPING); + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) != 0) + eib_mac_stop(ss); + + eib_mac_clr_nic_state(ss, EIB_NIC_STARTED|EIB_NIC_STOPPING); +} + +static int +eib_m_promisc(void *arg, boolean_t flag) +{ + eib_t *ss = arg; + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) + return (0); + + return (eib_mac_promisc(ss, flag)); +} + +static int +eib_m_multicast(void *arg, boolean_t add, const uint8_t *mcast_mac) +{ + eib_t *ss = arg; + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) + return (0); + + /* + * We don't have any knowledge which of the vnics built on top of + * the physlink is this multicast group relevant for. We'll join + * it for vnic0 for now. + * + * Since the tx routine in EoIB currently piggy backs all multicast + * traffic over the broadcast channel, and all vnics are joined to + * the broadcast address when they're created, everyone should receive + * all multicast traffic anyway. + * + * On the rx side, we'll check if the incoming multicast address is + * either on the vnic's list of mcgs joined to (which will only be the + * broadcast address) or on vnic0's list of mcgs. If we find a match, + * we let the packet come through. + * + * This isn't perfect, but it's the best we can do given that we don't + * have any vlan information corresponding to this multicast address. + * + * Also, for now we'll use the synchronous multicast joins and + * leaves instead of the asynchronous mechanism provided by + * ibt_join_mcg() since that involves additional complexity for failed + * joins and removals. 
+ */ + return (eib_mac_multicast(ss, add, (uint8_t *)mcast_mac)); +} + +static int +eib_m_unicast(void *arg, const uint8_t *macaddr) +{ + eib_t *ss = arg; + eib_vnic_t *vnic; + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) + return (0); + + mutex_enter(&ss->ei_vnic_lock); + + vnic = ss->ei_vnic[0]; + if (bcmp(macaddr, vnic->vn_login_data.ld_assigned_mac, + ETHERADDRL) == 0) { + mutex_exit(&ss->ei_vnic_lock); + return (0); + } + + mutex_exit(&ss->ei_vnic_lock); + + return (EINVAL); +} + +static mblk_t * +eib_m_tx(void *arg, mblk_t *mp) +{ + eib_t *ss = arg; + mblk_t *next; + + /* + * If the nic hasn't been started, drop the message(s) + */ + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) { + freemsgchain(mp); + return (NULL); + } + + for (; mp != NULL; mp = next) { + /* + * Detach this message from the message chain + */ + next = mp->b_next; + mp->b_next = NULL; + + /* + * Attempt to send the message; if we fail (likely due + * to lack of resources), reattach this message to the + * chain and return the unsent chain back. When we're + * ready to send again, we'll issue a mac_tx_update(). 
+ */ + if (eib_mac_tx(ss, mp) != EIB_E_SUCCESS) { + mp->b_next = next; + break; + } + } + + return (mp); +} + +static boolean_t +eib_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) +{ + eib_t *ss = arg; + eib_caps_t *caps = ss->ei_caps; + eib_caps_t s_caps; + ibt_hca_attr_t hca_attrs; + ibt_status_t ret; + + /* + * If we haven't been plumbed yet, try getting the hca attributes + * and figure out the capabilities now + */ + if (caps == NULL) { + ASSERT(ss->ei_props != NULL); + + ret = ibt_query_hca_byguid(ss->ei_props->ep_hca_guid, + &hca_attrs); + if (ret == IBT_SUCCESS) { + eib_ibt_record_capab(ss, &hca_attrs, &s_caps); + caps = &s_caps; + } + } + + if ((caps != NULL) && (cap == MAC_CAPAB_HCKSUM)) { + uint32_t *tx_flags = cap_data; + + if (caps->cp_cksum_flags == 0) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_m_getcapab: hw cksum disabled, cksum_flags=0"); + return (B_FALSE); + } + + *tx_flags = caps->cp_cksum_flags; + + return (B_TRUE); + + } else if ((caps != NULL) && (cap == MAC_CAPAB_LSO)) { + mac_capab_lso_t *cap_lso = cap_data; + + /* + * If the HCA supports LSO, it will advertise a non-zero + * "max lso size" parameter. Also, LSO relies on hw + * checksum being available. Finally, if the HCA + * doesn't provide the reserved-lkey capability, LSO + * will adversely affect the performance. So, we'll + * enable LSO only if we have a non-zero max lso size, + * support checksum offload and provide reserved lkey. 
+ */ + if (caps->cp_lso_maxlen == 0 || + caps->cp_cksum_flags == 0 || + caps->cp_resv_lkey_capab == 0) { + EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_m_getcapab: " + "LSO disabled, lso_maxlen=0x%lx, " + "cksum_flags=0x%lx, resv_lkey_capab=%d", + caps->cp_lso_maxlen, + caps->cp_cksum_flags, + caps->cp_resv_lkey_capab); + return (B_FALSE); + } + + cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; + cap_lso->lso_basic_tcp_ipv4.lso_max = caps->cp_lso_maxlen - 1; + + return (B_TRUE); + } + + return (B_FALSE); +} + +/*ARGSUSED*/ +static int +eib_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + return (ENOTSUP); +} + +static int +eib_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + eib_t *ss = arg; + link_duplex_t duplex = LINK_DUPLEX_FULL; + uint64_t speed = ss->ei_props->ep_ifspeed; + int err = 0; + + switch (pr_num) { + case MAC_PROP_DUPLEX: + ASSERT(pr_valsize >= sizeof (link_duplex_t)); + bcopy(&duplex, pr_val, sizeof (link_duplex_t)); + break; + + case MAC_PROP_SPEED: + ASSERT(pr_valsize >= sizeof (uint64_t)); + bcopy(&speed, pr_val, sizeof (speed)); + break; + + case MAC_PROP_PRIVATE: + if (strcmp(pr_name, EIB_DLPROP_GW_EPORT_STATE) == 0) { + if (ss->ei_gw_eport_state == FIP_EPORT_UP) { + (void) snprintf(pr_val, pr_valsize, + "%s", "up"); + } else { + (void) snprintf(pr_val, pr_valsize, + "%s", "down"); + } + } else if (strcmp(pr_name, EIB_DLPROP_HCA_GUID) == 0) { + (void) snprintf(pr_val, pr_valsize, "%llX", + (u_longlong_t)ss->ei_props->ep_hca_guid); + + } else if (strcmp(pr_name, EIB_DLPROP_PORT_GUID) == 0) { + (void) snprintf(pr_val, pr_valsize, "%llX", + (u_longlong_t)((ss->ei_props->ep_sgid).gid_guid)); + } + break; + + default: + err = ENOTSUP; + break; + } + + return (err); +} + +/*ARGSUSED*/ +static void +eib_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + switch (pr_num) { + case MAC_PROP_DUPLEX: 
+ case MAC_PROP_SPEED: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + break; + + case MAC_PROP_MTU: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_range_uint32(prh, ETHERMTU, ETHERMTU); + break; + + case MAC_PROP_PRIVATE: + if (strcmp(pr_name, EIB_DLPROP_GW_EPORT_STATE) == 0) { + mac_prop_info_set_default_str(prh, "up "); + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } else if (strcmp(pr_name, EIB_DLPROP_HCA_GUID) == 0) { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } else if (strcmp(pr_name, EIB_DLPROP_PORT_GUID) == 0) { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + } +} + +static int +eib_state_init(eib_t *ss) +{ + kthread_t *kt; + + /* + * Initialize synchronization primitives + */ + mutex_init(&ss->ei_vnic_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->ei_av_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->ei_ev_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->ei_rxpost_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->ei_vnic_req_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->ei_ka_vnics_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ss->ei_vnic_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ss->ei_ev_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ss->ei_rxpost_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ss->ei_vnic_req_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ss->ei_ka_vnics_cv, NULL, CV_DEFAULT, NULL); + + /* + * Create a node state structure and initialize + */ + ss->ei_node_state = kmem_zalloc(sizeof (eib_node_state_t), KM_SLEEP); + ss->ei_node_state->ns_link_state = LINK_STATE_UNKNOWN; + mutex_init(&ss->ei_node_state->ns_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ss->ei_node_state->ns_cv, NULL, CV_DEFAULT, NULL); + + /* + * Allocate for gathering statistics + */ + ss->ei_stats = kmem_zalloc(sizeof (eib_stats_t), KM_SLEEP); + + /* + * Start up service threads + */ + kt = thread_create(NULL, 0, eib_events_handler, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_events_handler = kt->t_did; + + kt = 
thread_create(NULL, 0, eib_refill_rwqes, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_rwqes_refiller = kt->t_did; + + kt = thread_create(NULL, 0, eib_vnic_creator, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_vnic_creator = kt->t_did; + + kt = thread_create(NULL, 0, eib_manage_keepalives, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_keepalives_manager = kt->t_did; + + /* + * Set default state of gw eport + */ + ss->ei_gw_eport_state = FIP_EPORT_UP; + + /* + * Do static initializations of common structures + */ + eib_reserved_gid.gid_prefix = 0; + eib_reserved_gid.gid_guid = 0; + + return (EIB_E_SUCCESS); +} + +static int +eib_add_event_callbacks(eib_t *ss) +{ + int ret; + ddi_eventcookie_t login_ack_evc; + ddi_eventcookie_t gw_alive_evc; + ddi_eventcookie_t gw_info_evc; + + /* + * Add callback for receiving vnic login acks from the gateway + */ + if ((ret = ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_LOGIN_ACK, + &login_ack_evc)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_add_event_callbacks: " + "ddi_get_eventcookie(LOGIN_ACK) failed, ret=%d", ret); + return (EIB_E_FAILURE); + } + if ((ret = ddi_add_event_handler(ss->ei_dip, login_ack_evc, + eib_login_ack_cb, ss, &ss->ei_login_ack_cb)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_add_event_callbacks: " + "ddi_add_event_handler(LOGIN_ACK) failed, ret=%d", ret); + return (EIB_E_FAILURE); + } + + /* + * Add callback for receiving status on gateway transitioning from + * not-available to available + */ + if ((ret = ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_GW_AVAILABLE, + &gw_alive_evc)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_add_event_callbacks: " + "ddi_get_eventcookie(GW_AVAILABLE) failed, ret=%d", ret); + (void) ddi_remove_event_handler(ss->ei_login_ack_cb); + return (EIB_E_FAILURE); + } + if ((ret = ddi_add_event_handler(ss->ei_dip, gw_alive_evc, + eib_gw_alive_cb, ss, &ss->ei_gw_alive_cb)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, 
"eib_add_event_callbacks: " + "ddi_add_event_handler(GW_AVAILABLE) failed, ret=%d", ret); + (void) ddi_remove_event_handler(ss->ei_login_ack_cb); + return (EIB_E_FAILURE); + } + + /* + * Add callback for receiving gateway info update + */ + if ((ret = ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_GW_INFO_UPDATE, + &gw_info_evc)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_add_event_callbacks: " + "ddi_get_eventcookie(GW_INFO_UPDATE) failed, ret=%d", ret); + (void) ddi_remove_event_handler(ss->ei_gw_alive_cb); + (void) ddi_remove_event_handler(ss->ei_login_ack_cb); + return (EIB_E_FAILURE); + } + if ((ret = ddi_add_event_handler(ss->ei_dip, gw_info_evc, + eib_gw_info_cb, ss, &ss->ei_gw_info_cb)) != DDI_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_add_event_callbacks: " + "ddi_add_event_handler(GW_INFO) failed, ret=%d", ret); + (void) ddi_remove_event_handler(ss->ei_gw_alive_cb); + (void) ddi_remove_event_handler(ss->ei_login_ack_cb); + return (EIB_E_FAILURE); + } + + return (EIB_E_SUCCESS); +} + +static int +eib_register_with_mac(eib_t *ss, dev_info_t *dip) +{ + mac_register_t *macp; + int ret; + + if ((macp = mac_alloc(MAC_VERSION)) == NULL) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_register_with_mac: " + "mac_alloc(MAC_VERSION=%d) failed", MAC_VERSION); + return (EIB_E_FAILURE); + } + + /* + * Note that when we register with mac during attach, we don't + * have the mac address yet (we'll get that after we login into + * the gateway) so we'll simply register a zero macaddr that + * we'll overwrite later during plumb, in eib_m_start(). Likewise, + * we'll also update the max-sdu with the correct MTU after we + * figure it out when we login to the gateway during plumb. 
+ */ + macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; + macp->m_driver = ss; + macp->m_dip = dip; + macp->m_src_addr = eib_zero_mac; + macp->m_callbacks = &eib_m_callbacks; + macp->m_min_sdu = 0; + macp->m_max_sdu = ETHERMTU; + macp->m_margin = VLAN_TAGSZ; + macp->m_priv_props = eib_pvt_props; + + ret = mac_register(macp, &ss->ei_mac_hdl); + mac_free(macp); + + if (ret != 0) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_register_with_mac: " + "mac_register() failed, ret=%d", ret); + return (EIB_E_FAILURE); + } + + return (EIB_E_SUCCESS); +} + +static void +eib_rb_attach(eib_t *ss, uint_t progress) +{ + ibt_status_t ret; + int instance; + + if (progress & EIB_ATTACH_REGISTER_MAC_DONE) + eib_rb_register_with_mac(ss); + + if (progress & EIB_ATTACH_EV_CBS_ADDED) + eib_rb_add_event_callbacks(ss); + + if (progress & EIB_ATTACH_IBT_ATT_DONE) { + ret = ibt_detach(ss->ei_ibt_hdl); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_rb_attach: " + "ibt_detach() failed, ret=%d", ret); + } + ss->ei_ibt_hdl = NULL; + } + + if (progress & EIB_ATTACH_STATE_INIT_DONE) + eib_rb_state_init(ss); + + if (progress & EIB_ATTACH_PROPS_PARSED) + eib_rb_get_props(ss); + + if (progress & EIB_ATTACH_STATE_ALLOCD) { + instance = ddi_get_instance(ss->ei_dip); + ddi_soft_state_free(eib_state, instance); + } +} + +static void +eib_rb_state_init(eib_t *ss) +{ + /* + * Terminate service threads + */ + if (ss->ei_keepalives_manager) { + eib_stop_manage_keepalives(ss); + ss->ei_keepalives_manager = 0; + } + if (ss->ei_vnic_creator) { + eib_stop_vnic_creator(ss); + ss->ei_vnic_creator = 0; + } + if (ss->ei_rwqes_refiller) { + eib_stop_refill_rwqes(ss); + ss->ei_rwqes_refiller = 0; + } + if (ss->ei_events_handler) { + eib_stop_events_handler(ss); + ss->ei_events_handler = 0; + } + + /* + * Remove space allocated for gathering statistics + */ + if (ss->ei_stats) { + kmem_free(ss->ei_stats, sizeof (eib_stats_t)); + ss->ei_stats = NULL; + } + + /* + * Remove space allocated for keeping node 
state + */ + if (ss->ei_node_state) { + cv_destroy(&ss->ei_node_state->ns_cv); + mutex_destroy(&ss->ei_node_state->ns_lock); + kmem_free(ss->ei_node_state, sizeof (eib_node_state_t)); + ss->ei_node_state = NULL; + } + + /* + * Finally, destroy all synchronization resources + */ + cv_destroy(&ss->ei_ka_vnics_cv); + cv_destroy(&ss->ei_vnic_req_cv); + cv_destroy(&ss->ei_rxpost_cv); + cv_destroy(&ss->ei_ev_cv); + cv_destroy(&ss->ei_vnic_cv); + mutex_destroy(&ss->ei_ka_vnics_lock); + mutex_destroy(&ss->ei_vnic_req_lock); + mutex_destroy(&ss->ei_rxpost_lock); + mutex_destroy(&ss->ei_ev_lock); + mutex_destroy(&ss->ei_av_lock); + mutex_destroy(&ss->ei_vnic_lock); +} + +static void +eib_rb_add_event_callbacks(eib_t *ss) +{ + ddi_eventcookie_t evc; + + if (ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_GW_INFO_UPDATE, + &evc) == DDI_SUCCESS) { + (void) ddi_remove_event_handler(ss->ei_gw_info_cb); + ss->ei_gw_info_cb = NULL; + } + + if (ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_GW_AVAILABLE, + &evc) == DDI_SUCCESS) { + (void) ddi_remove_event_handler(ss->ei_gw_alive_cb); + ss->ei_gw_alive_cb = NULL; + } + + if (ddi_get_eventcookie(ss->ei_dip, EIB_NDI_EVENT_LOGIN_ACK, + &evc) == DDI_SUCCESS) { + (void) ddi_remove_event_handler(ss->ei_login_ack_cb); + ss->ei_login_ack_cb = NULL; + } +} + +static void +eib_rb_register_with_mac(eib_t *ss) +{ + int ret; + + if ((ret = mac_unregister(ss->ei_mac_hdl)) != 0) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_register_with_mac: " + "mac_unregister() failed, ret=%d", ret); + } + + ss->ei_mac_hdl = NULL; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_rsrc.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1233 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static int eib_rsrc_setup_txbufs(eib_t *, int *); +static int eib_rsrc_setup_rxbufs(eib_t *, int *); +static int eib_rsrc_setup_lsobufs(eib_t *, int *); +static void eib_rsrc_init_wqe_pool(eib_t *, eib_wqe_pool_t **, + ib_memlen_t, int); +static void eib_rsrc_fini_wqe_pool(eib_t *, eib_wqe_pool_t **); +static boolean_t eib_rsrc_ok_to_free_pool(eib_t *, eib_wqe_pool_t *, boolean_t); +static int eib_rsrc_grab_wqes(eib_t *, eib_wqe_pool_t *, eib_wqe_t **, uint_t, + uint_t *, int); +static void eib_rsrc_return_wqes(eib_t *, eib_wqe_pool_t *, eib_wqe_t **, + uint_t); + +static void eib_rb_rsrc_setup_txbufs(eib_t *, boolean_t); +static void eib_rb_rsrc_setup_rxbufs(eib_t *, boolean_t); +static void eib_rb_rsrc_setup_lsobufs(eib_t *, boolean_t); + +/* + * Definitions private to this file + */ +static uint_t eib_lso_num_bufs = EIB_LSO_NUM_BUFS; /* tunable? 
*/ + +int +eib_rsrc_setup_bufs(eib_t *ss, int *err) +{ + if (eib_rsrc_setup_txbufs(ss, err) != EIB_E_SUCCESS) + return (EIB_E_FAILURE); + + if (ss->ei_caps->cp_lso_maxlen && ss->ei_caps->cp_cksum_flags && + ss->ei_caps->cp_resv_lkey_capab) { + if (eib_rsrc_setup_lsobufs(ss, err) != EIB_E_SUCCESS) { + eib_rb_rsrc_setup_txbufs(ss, B_FALSE); + return (EIB_E_FAILURE); + } + } + + if (eib_rsrc_setup_rxbufs(ss, err) != EIB_E_SUCCESS) { + eib_rb_rsrc_setup_lsobufs(ss, B_FALSE); + eib_rb_rsrc_setup_txbufs(ss, B_FALSE); + return (EIB_E_FAILURE); + } + + return (EIB_E_SUCCESS); +} + +int +eib_rsrc_grab_swqes(eib_t *ss, eib_wqe_t **wqes, uint_t n_req, uint_t *actual, + int pri) +{ + eib_wqe_t *wqe; + uint32_t *encap_hdr; + int ret; + int i; + + ASSERT(ss->ei_tx != NULL); + + ret = eib_rsrc_grab_wqes(ss, ss->ei_tx, wqes, n_req, actual, pri); + if (ret != EIB_E_SUCCESS) + return (EIB_E_FAILURE); + + /* + * See note for eib_rsrc_grab_swqe() + */ + for (i = 0; i < (*actual); i++) { + wqe = wqes[i]; + wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; + wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest; + wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; + wqe->qe_wr.send.wr_nds = 1; + wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl; + wqe->qe_nxt_post = NULL; + wqe->qe_iov_hdl = NULL; + + encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr; + *encap_hdr = htonl(EIB_TX_ENCAP_HDR); + } + + return (EIB_E_SUCCESS); +} + +int +eib_rsrc_grab_rwqes(eib_t *ss, eib_wqe_t **wqes, uint_t n_req, uint_t *actual, + int pri) +{ + ASSERT(ss->ei_rx != NULL); + + return (eib_rsrc_grab_wqes(ss, ss->ei_rx, wqes, n_req, actual, pri)); +} + +int +eib_rsrc_grab_lsobufs(eib_t *ss, uint_t req_sz, ibt_wr_ds_t *sgl, uint32_t *nds) +{ + eib_lsobkt_t *bkt = ss->ei_lso; + eib_lsobuf_t *elem; + eib_lsobuf_t *nxt; + uint_t frag_sz; + uint_t num_needed; + int i; + + ASSERT(req_sz != 0); + ASSERT(sgl != NULL); + ASSERT(nds != NULL); + + /* + * Determine how many bufs we'd need for the size requested + */ + num_needed = req_sz / 
EIB_LSO_BUFSZ; + if ((frag_sz = req_sz % EIB_LSO_BUFSZ) != 0) + num_needed++; + + if (bkt == NULL) + return (EIB_E_FAILURE); + + /* + * If we don't have enough lso bufs, return failure + */ + mutex_enter(&bkt->bk_lock); + if (bkt->bk_nfree < num_needed) { + mutex_exit(&bkt->bk_lock); + return (EIB_E_FAILURE); + } + + /* + * Pick the first "num_needed" bufs from the free list + */ + elem = bkt->bk_free_head; + for (i = 0; i < num_needed; i++) { + ASSERT(elem->lb_isfree != 0); + ASSERT(elem->lb_buf != NULL); + + nxt = elem->lb_next; + + sgl[i].ds_va = (ib_vaddr_t)(uintptr_t)elem->lb_buf; + sgl[i].ds_key = bkt->bk_lkey; + sgl[i].ds_len = EIB_LSO_BUFSZ; + + elem->lb_isfree = 0; + elem->lb_next = NULL; + + elem = nxt; + } + bkt->bk_free_head = elem; + + /* + * If the requested size is not a multiple of EIB_LSO_BUFSZ, we need + * to adjust the last sgl entry's length. Since we know we need atleast + * one, the i-1 use below is ok. + */ + if (frag_sz) { + sgl[i-1].ds_len = frag_sz; + } + + /* + * Update nfree count and return + */ + bkt->bk_nfree -= num_needed; + + mutex_exit(&bkt->bk_lock); + + *nds = num_needed; + + return (EIB_E_SUCCESS); +} + +eib_wqe_t * +eib_rsrc_grab_swqe(eib_t *ss, int pri) +{ + eib_wqe_t *wqe = NULL; + uint32_t *encap_hdr; + + ASSERT(ss->ei_tx != NULL); + (void) eib_rsrc_grab_wqes(ss, ss->ei_tx, &wqe, 1, NULL, pri); + + /* + * Let's reset the swqe basic wr parameters to default. We need + * to do this because this swqe could've previously been used + * for a checksum offload (when the flags would've been set) + * or for an LSO send (in which case the opcode would've been set + * to a different value), or been iov mapped (in which case the + * sgl/nds could've been set to different values). We'll make + * it easy and initialize it here, so simple transactions can + * go through without any special effort by the caller. 
+ * + * Note that even though the wqe structure is common for both + * send and recv, they're in two independent pools and the wqe + * type remains the same throughout its lifetime. So we don't + * have to worry about resetting any other field. + */ + if (wqe) { + wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; + wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest; + wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; + wqe->qe_wr.send.wr_nds = 1; + wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl; + wqe->qe_nxt_post = NULL; + wqe->qe_iov_hdl = NULL; + + encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr; + *encap_hdr = htonl(EIB_TX_ENCAP_HDR); + } + + return (wqe); +} + +eib_wqe_t * +eib_rsrc_grab_rwqe(eib_t *ss, int pri) +{ + eib_wqe_t *wqe = NULL; + + ASSERT(ss->ei_rx != NULL); + (void) eib_rsrc_grab_wqes(ss, ss->ei_rx, &wqe, 1, NULL, pri); + + return (wqe); +} + +void +eib_rsrc_return_swqe(eib_t *ss, eib_wqe_t *wqe, eib_chan_t *chan) +{ + ASSERT(ss->ei_tx != NULL); + + eib_rsrc_return_wqes(ss, ss->ei_tx, &wqe, 1); + if (chan) { + eib_rsrc_decr_posted_swqe(ss, chan); + } +} + + +void +eib_rsrc_return_rwqe(eib_t *ss, eib_wqe_t *wqe, eib_chan_t *chan) +{ + ASSERT(ss->ei_rx != NULL); + + eib_rsrc_return_wqes(ss, ss->ei_rx, &wqe, 1); + if (chan) { + eib_rsrc_decr_posted_rwqe(ss, chan); + } +} + +void +eib_rsrc_return_lsobufs(eib_t *ss, ibt_wr_ds_t *sgl_p, uint32_t nds) +{ + eib_lsobkt_t *bkt = ss->ei_lso; + eib_lsobuf_t *elem; + uint8_t *va; + ptrdiff_t ndx; + int i; + + /* + * Nowhere to return the buffers to ?? 
+ */ + if (bkt == NULL) + return; + + mutex_enter(&bkt->bk_lock); + + for (i = 0; i < nds; i++) { + va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; + + ASSERT(va >= bkt->bk_mem); + ASSERT(va < (bkt->bk_mem + bkt->bk_nelem * EIB_LSO_BUFSZ)); + + /* + * Figure out the buflist element this sgl buffer corresponds + * to and put it back at the head + */ + ndx = ((uintptr_t)va - (uintptr_t)bkt->bk_mem) / EIB_LSO_BUFSZ; + elem = bkt->bk_bufl + ndx; + + ASSERT(elem->lb_isfree == 0); + ASSERT(elem->lb_buf == va); + + elem->lb_isfree = 1; + elem->lb_next = bkt->bk_free_head; + bkt->bk_free_head = elem; + } + bkt->bk_nfree += nds; + + /* + * If the number of available lso buffers just crossed the + * threshold, wakeup anyone who may be sleeping on the event. + */ + if (((bkt->bk_nfree - nds) < EIB_LSO_FREE_BUFS_THRESH) && + (bkt->bk_nfree >= EIB_LSO_FREE_BUFS_THRESH)) { + cv_broadcast(&bkt->bk_cv); + } + + mutex_exit(&bkt->bk_lock); +} + +/*ARGSUSED*/ +void +eib_rsrc_decr_posted_swqe(eib_t *ss, eib_chan_t *chan) +{ + ASSERT(chan != NULL); + + mutex_enter(&chan->ch_tx_lock); + + chan->ch_tx_posted--; + if ((chan->ch_tear_down) && (chan->ch_tx_posted == 0)) { + cv_signal(&chan->ch_tx_cv); + } + + mutex_exit(&chan->ch_tx_lock); +} + +void +eib_rsrc_decr_posted_rwqe(eib_t *ss, eib_chan_t *chan) +{ + eib_chan_t *tail; + boolean_t queue_for_refill = B_FALSE; + + ASSERT(chan != NULL); + + /* + * Decrement the ch_rx_posted count. If we are tearing this channel + * down, signal the waiter when the count reaches 0. If we aren't + * tearing the channel down, see if the count has gone below the low + * water mark. If it has, and if this channel isn't already being + * refilled, queue the channel up with the service thread for a + * rwqe refill. 
+ */ + mutex_enter(&chan->ch_rx_lock); + chan->ch_rx_posted--; + if (chan->ch_tear_down) { + if (chan->ch_rx_posted == 0) + cv_signal(&chan->ch_rx_cv); + } else if (chan->ch_rx_posted < chan->ch_lwm_rwqes) { + if (chan->ch_rx_refilling == B_FALSE) { + chan->ch_rx_refilling = B_TRUE; + queue_for_refill = B_TRUE; + } + } + mutex_exit(&chan->ch_rx_lock); + + if (queue_for_refill) { + mutex_enter(&ss->ei_rxpost_lock); + + chan->ch_rxpost_next = NULL; + for (tail = ss->ei_rxpost; tail; tail = tail->ch_rxpost_next) { + if (tail->ch_rxpost_next == NULL) + break; + } + if (tail) { + tail->ch_rxpost_next = chan; + } else { + ss->ei_rxpost = chan; + } + + cv_signal(&ss->ei_rxpost_cv); + mutex_exit(&ss->ei_rxpost_lock); + } +} + +void +eib_rsrc_txwqes_needed(eib_t *ss) +{ + eib_wqe_pool_t *wp = ss->ei_tx; + + EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf); + + mutex_enter(&wp->wp_lock); + if ((wp->wp_status & EIB_TXWQE_SHORT) == 0) { + wp->wp_status |= EIB_TXWQE_SHORT; + cv_broadcast(&wp->wp_cv); + } + mutex_exit(&wp->wp_lock); +} + +void +eib_rsrc_lsobufs_needed(eib_t *ss) +{ + eib_lsobkt_t *bkt = ss->ei_lso; + + EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf); + + if (bkt == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rsrc_lsobufs_needed: " + "lso bufs seem to be needed even though " + "LSO support was not advertised"); + return; + } + + mutex_enter(&bkt->bk_lock); + if ((bkt->bk_status & EIB_LBUF_SHORT) == 0) { + bkt->bk_status |= EIB_LBUF_SHORT; + cv_broadcast(&bkt->bk_cv); + } + mutex_exit(&bkt->bk_lock); +} + +boolean_t +eib_rsrc_rxpool_low(eib_wqe_t *wqe) +{ + eib_wqe_pool_t *wp = wqe->qe_pool; + boolean_t ret = B_FALSE; + + /* + * Set the EIB_RXWQE_SHORT flag when the number of free wqes + * in the rx pool falls below the low threshold for rwqes and + * clear it only when the number of free wqes gets back above + * the high water mark. 
+ */ + mutex_enter(&wp->wp_lock); + + if (wp->wp_nfree <= EIB_NFREE_RWQES_LOW) { + wp->wp_status |= (EIB_RXWQE_SHORT); + } else if (wp->wp_nfree >= EIB_NFREE_RWQES_HWM) { + wp->wp_status &= (~EIB_RXWQE_SHORT); + } + + if ((wp->wp_status & EIB_RXWQE_SHORT) == EIB_RXWQE_SHORT) + ret = B_TRUE; + + mutex_exit(&wp->wp_lock); + + return (ret); +} + +void +eib_rb_rsrc_setup_bufs(eib_t *ss, boolean_t force) +{ + eib_rb_rsrc_setup_rxbufs(ss, force); + eib_rb_rsrc_setup_lsobufs(ss, force); + eib_rb_rsrc_setup_txbufs(ss, force); +} + +static int +eib_rsrc_setup_txbufs(eib_t *ss, int *err) +{ + eib_wqe_pool_t *tx; + eib_wqe_t *wqe; + ibt_ud_dest_hdl_t dest; + ibt_mr_attr_t attr; + ibt_mr_desc_t desc; + ibt_status_t ret; + kthread_t *kt; + uint32_t *encap_hdr; + uint8_t *buf; + uint_t mtu = ss->ei_props->ep_mtu; + uint_t tx_bufsz; + uint_t blk; + uint_t ndx; + uint_t i; + int lso_enabled; + + /* + * Try to allocate and initialize the tx wqe pool + */ + if (ss->ei_tx != NULL) + return (EIB_E_SUCCESS); + + /* + * If we keep the tx buffers as mtu-sized, then potentially every + * LSO request that cannot be satisfactorily mapped, will use up + * the 8K large (default size) lso buffers. This may be inadvisable + * given that lso buffers are a scarce resource. Instead, we'll + * slightly raise the size of the copy buffers in the send wqes + * (say to EIB_TX_COPY_THRESH) so that requests that cannot be + * mapped could still avoid using the 8K LSO buffers if they're + * less than the copy threshold size. + */ + lso_enabled = ss->ei_caps->cp_lso_maxlen && + ss->ei_caps->cp_cksum_flags && ss->ei_caps->cp_resv_lkey_capab; + tx_bufsz = ((lso_enabled) && (EIB_TX_COPY_THRESH > mtu)) ? 
+ EIB_TX_COPY_THRESH : mtu; + + eib_rsrc_init_wqe_pool(ss, &ss->ei_tx, tx_bufsz, EIB_WP_TYPE_TX); + tx = ss->ei_tx; + + /* + * Register the TX memory region with IBTF for use + */ + attr.mr_vaddr = tx->wp_vaddr; + attr.mr_len = tx->wp_memsz; + attr.mr_as = NULL; + attr.mr_flags = IBT_MR_SLEEP; + + ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr, + &tx->wp_mr, &desc); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_txbufs: " + "ibt_register_mr() failed for tx " + "region (0x%llx, 0x%llx) with ret=%d", + attr.mr_vaddr, attr.mr_len, ret); + + *err = EINVAL; + goto rsrc_setup_txbufs_fail; + } + tx->wp_lkey = desc.md_lkey; + + /* + * Now setup the tx wqes + */ + buf = (uint8_t *)(uintptr_t)(tx->wp_vaddr); + for (i = 0, blk = 0; blk < EIB_BLKS_PER_POOL; blk++) { + for (ndx = 0; ndx < EIB_WQES_PER_BLK; ndx++, i++) { + wqe = &tx->wp_wqe[i]; + /* + * Allocate a UD destination handle + */ + ret = ibt_alloc_ud_dest(ss->ei_hca_hdl, + IBT_UD_DEST_NO_FLAGS, ss->ei_pd_hdl, &dest); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_rsrc_setup_txbufs: " + "ibt_alloc_ud_dest(hca_hdl=0x%llx) " + "failed, ret=%d", ss->ei_hca_hdl, ret); + + *err = ENOMEM; + goto rsrc_setup_txbufs_fail; + } + + /* + * These parameters should remain fixed throughout the + * lifetime of this wqe. + */ + wqe->qe_pool = tx; + wqe->qe_cpbuf = buf; + wqe->qe_bufsz = tx_bufsz; + + /* + * The qe_dest and qe_payload_hdr are specific to tx + * only, but remain unchanged throughout the lifetime + * of the wqe. + * + * The payload header is normally used when we have an + * LSO packet to send. Since the EoIB encapsulation + * header won't be part of the message we get from the + * network layer, we'll need to copy the lso header into + * a new buffer every time before we hand over the LSO + * send request to the hca driver. 
+ */ + wqe->qe_dest = dest; + wqe->qe_payload_hdr = + kmem_zalloc(EIB_MAX_PAYLOAD_HDR_SZ, KM_SLEEP); + + /* + * The encapsulation header is at the start of the + * payload header and is initialized to the default + * encapsulation header we use (no multiple segments, + * no FCS). This part of the header is not expected + * to change. + */ + encap_hdr = (uint32_t *)(void *)wqe->qe_payload_hdr; + *encap_hdr = htonl(EIB_TX_ENCAP_HDR); + + /* + * The parameter set below are used in tx and rx paths. + * These parameters (except ds_key) are reset to these + * default values in eib_rsrc_return_wqes(). + */ + wqe->qe_sgl.ds_key = tx->wp_lkey; + wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf; + wqe->qe_sgl.ds_len = wqe->qe_bufsz; + wqe->qe_mp = NULL; + wqe->qe_info = + ((blk & EIB_WQEBLK_MASK) << EIB_WQEBLK_SHIFT) | + ((ndx & EIB_WQENDX_MASK) << EIB_WQENDX_SHIFT) | + ((uint_t)EIB_WQE_TX << EIB_WQETYP_SHIFT); + + /* + * These tx-specific parameters (except wr_id and + * wr_trans) are reset in eib_rsrc_grab_swqes() to make + * sure any freshly acquired swqe from the pool has + * these default settings for the caller. + */ + wqe->qe_wr.send.wr_id = (ibt_wrid_t)(uintptr_t)wqe; + wqe->qe_wr.send.wr_trans = IBT_UD_SRV; + wqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; + wqe->qe_wr.send.wr.ud.udwr_dest = wqe->qe_dest; + wqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; + wqe->qe_wr.send.wr_nds = 1; + wqe->qe_wr.send.wr_sgl = &wqe->qe_sgl; + wqe->qe_nxt_post = NULL; + wqe->qe_iov_hdl = NULL; + + buf += wqe->qe_bufsz; + } + } + + /* + * Before returning, create a kernel thread to monitor the status + * of wqes in the tx wqe pool. Note that this thread cannot be + * created from eib_state_init() during attach(), since the thread + * expects the wqe pool to be allocated and ready when it starts, + * and the tx bufs initialization only happens during eib_m_start(). 
+ */ + kt = thread_create(NULL, 0, eib_monitor_tx_wqes, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_txwqe_monitor = kt->t_did; + + return (EIB_E_SUCCESS); + +rsrc_setup_txbufs_fail: + eib_rb_rsrc_setup_txbufs(ss, B_FALSE); + return (EIB_E_FAILURE); +} + +static int +eib_rsrc_setup_rxbufs(eib_t *ss, int *err) +{ + eib_wqe_pool_t *rx; + eib_wqe_t *wqe; + ibt_mr_attr_t attr; + ibt_mr_desc_t desc; + ibt_status_t ret; + uint8_t *buf; + uint_t mtu = ss->ei_props->ep_mtu; + uint_t blk; + uint_t ndx; + uint_t i; + + /* + * Try to allocate and initialize the wqe pool. When this is called + * during a plumb via the mac m_start callback, we need to make + * sure there is a need to allocate a wqe pool afresh. If during a + * previous unplumb we didn't free the wqe pool because the nw layer + * was holding on to some rx buffers, we don't need to allocate new + * pool and set up the buffers again; we'll just start re-using the + * previous one. + */ + if (ss->ei_rx != NULL) + return (EIB_E_SUCCESS); + + /* + * The receive buffer has to work for all channels, specifically the + * data qp of the vnics. This means that the buffer must be large + * enough to hold MTU sized IB payload (including the EoIB and ethernet + * headers) plus the GRH. In addition, because the ethernet header is + * either 14 or 18 bytes (tagless or vlan tagged), we should have the + * buffer filled in such a way that the IP header starts at atleast a + * 4-byte aligned address. In order to do this, we need to have some + * additional room. 
+ */ + eib_rsrc_init_wqe_pool(ss, &ss->ei_rx, + mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM, EIB_WP_TYPE_RX); + rx = ss->ei_rx; + + /* + * Register the RX memory region with IBTF for use + */ + attr.mr_vaddr = rx->wp_vaddr; + attr.mr_len = rx->wp_memsz; + attr.mr_as = NULL; + attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; + + ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr, + &rx->wp_mr, &desc); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_rxbufs: " + "ibt_register_mr() failed for rx " + "region (0x%llx, 0x%llx) with ret=%d", + attr.mr_vaddr, attr.mr_len, ret); + + *err = EINVAL; + goto rsrc_setup_rxbufs_fail; + } + rx->wp_lkey = desc.md_lkey; + + /* + * Now setup the rx wqes + */ + buf = (uint8_t *)(uintptr_t)(rx->wp_vaddr); + for (i = 0, blk = 0; blk < EIB_BLKS_PER_POOL; blk++) { + for (ndx = 0; ndx < EIB_WQES_PER_BLK; ndx++, i++) { + wqe = &rx->wp_wqe[i]; + + /* + * These parameters should remain fixed throughout the + * lifetime of this recv wqe. The qe_frp will only be + * used by the data channel of vnics and will remain + * unused by other channels. + */ + wqe->qe_pool = rx; + wqe->qe_cpbuf = buf; + wqe->qe_bufsz = mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM; + wqe->qe_wr.recv.wr_id = (ibt_wrid_t)(uintptr_t)wqe; + wqe->qe_wr.recv.wr_nds = 1; + wqe->qe_wr.recv.wr_sgl = &wqe->qe_sgl; + wqe->qe_frp.free_func = eib_data_rx_recycle; + wqe->qe_frp.free_arg = (caddr_t)wqe; + + /* + * The parameter set below are used in tx and rx paths. + * These parameters (except ds_key) are reset to these + * default values in eib_rsrc_return_wqes(). 
+ */ + wqe->qe_sgl.ds_key = rx->wp_lkey; + wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf; + wqe->qe_sgl.ds_len = wqe->qe_bufsz; + wqe->qe_mp = NULL; + wqe->qe_info = + ((blk & EIB_WQEBLK_MASK) << EIB_WQEBLK_SHIFT) | + ((ndx & EIB_WQENDX_MASK) << EIB_WQENDX_SHIFT) | + ((uint_t)EIB_WQE_RX << EIB_WQETYP_SHIFT); + + /* + * These rx-specific parameters are also reset to + * these default values in eib_rsrc_return_wqes(). + */ + wqe->qe_chan = NULL; + wqe->qe_vnic_inst = -1; + + buf += (mtu + EIB_GRH_SZ + EIB_IPHDR_ALIGN_ROOM); + } + } + + return (EIB_E_SUCCESS); + +rsrc_setup_rxbufs_fail: + eib_rb_rsrc_setup_rxbufs(ss, B_FALSE); + return (EIB_E_FAILURE); +} + +static int +eib_rsrc_setup_lsobufs(eib_t *ss, int *err) +{ + eib_lsobkt_t *bkt; + eib_lsobuf_t *elem; + eib_lsobuf_t *tail; + ibt_mr_attr_t attr; + ibt_mr_desc_t desc; + kthread_t *kt; + + uint8_t *lsomem; + uint8_t *memp; + ibt_status_t ret; + int i; + + /* + * Allocate the lso bucket and space for buffers + */ + bkt = kmem_zalloc(sizeof (eib_lsobkt_t), KM_SLEEP); + lsomem = kmem_zalloc(eib_lso_num_bufs * EIB_LSO_BUFSZ, KM_SLEEP); + + /* + * Register lso memory and save the lkey + */ + attr.mr_vaddr = (uint64_t)(uintptr_t)lsomem; + attr.mr_len = eib_lso_num_bufs * EIB_LSO_BUFSZ; + attr.mr_as = NULL; + attr.mr_flags = IBT_MR_SLEEP; + + ret = ibt_register_mr(ss->ei_hca_hdl, ss->ei_pd_hdl, &attr, + &bkt->bk_mr_hdl, &desc); + if (ret != IBT_SUCCESS) { + *err = EINVAL; + EIB_DPRINTF_ERR(ss->ei_instance, "eib_rsrc_setup_lsobufs: " + "ibt_register_mr() failed for LSO " + "region (0x%llx, 0x%llx) with ret=%d", + attr.mr_vaddr, attr.mr_len, ret); + + kmem_free(lsomem, eib_lso_num_bufs * EIB_LSO_BUFSZ); + kmem_free(bkt, sizeof (eib_lsobkt_t)); + + return (EIB_E_FAILURE); + } + bkt->bk_lkey = desc.md_lkey; + + /* + * Now allocate the buflist. 
Note that the elements in the buflist and + * the buffers in the lso memory have a permanent 1-1 relation, so we + * can always derive the address of a buflist entry from the address of + * an lso buffer. + */ + bkt->bk_bufl = kmem_zalloc(eib_lso_num_bufs * sizeof (eib_lsobuf_t), + KM_SLEEP); + + /* + * Set up the lso buf chain + */ + memp = lsomem; + elem = bkt->bk_bufl; + for (i = 0; i < eib_lso_num_bufs; i++) { + elem->lb_isfree = 1; + elem->lb_buf = memp; + elem->lb_next = elem + 1; + + tail = elem; + + memp += EIB_LSO_BUFSZ; + elem++; + } + tail->lb_next = NULL; + + /* + * Set up the LSO buffer information in eib state + */ + bkt->bk_free_head = bkt->bk_bufl; + bkt->bk_mem = lsomem; + bkt->bk_nelem = eib_lso_num_bufs; + bkt->bk_nfree = bkt->bk_nelem; + + mutex_init(&bkt->bk_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&bkt->bk_cv, NULL, CV_DEFAULT, NULL); + + ss->ei_lso = bkt; + + /* + * Before returning, create a kernel thread to monitor the status + * of lso bufs + */ + kt = thread_create(NULL, 0, eib_monitor_lso_bufs, ss, 0, + &p0, TS_RUN, minclsyspri); + ss->ei_lsobufs_monitor = kt->t_did; + + return (EIB_E_SUCCESS); +} + +static void +eib_rsrc_init_wqe_pool(eib_t *ss, eib_wqe_pool_t **wpp, ib_memlen_t bufsz, + int wp_type) +{ + eib_wqe_pool_t *wp; + uint_t wp_wqesz; + int i; + + ASSERT(wpp != NULL); + ASSERT(*wpp == NULL); + + /* + * Allocate the wqe pool, wqes and bufs + */ + wp = kmem_zalloc(sizeof (eib_wqe_pool_t), KM_SLEEP); + wp_wqesz = EIB_WQES_PER_POOL * sizeof (eib_wqe_t); + wp->wp_wqe = (eib_wqe_t *)kmem_zalloc(wp_wqesz, KM_SLEEP); + wp->wp_memsz = EIB_WQES_PER_POOL * bufsz; + wp->wp_vaddr = (ib_vaddr_t)(uintptr_t)kmem_zalloc(wp->wp_memsz, + KM_SLEEP); + wp->wp_ss = ss; + wp->wp_type = wp_type; + wp->wp_nfree_lwm = (wp_type == EIB_WP_TYPE_TX) ? 
+ EIB_NFREE_SWQES_LWM : EIB_NFREE_RWQES_LWM; + + /* + * Initialize the lock and bitmaps: everything is available at first, + * but note that if the number of blocks per pool is less than 64, we + * need to initialize those extra bits as "unavailable" - these will + * remain unavailable throughout. + */ + mutex_init(&wp->wp_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&wp->wp_cv, NULL, CV_DEFAULT, NULL); + + wp->wp_nfree = EIB_WQES_PER_POOL; + wp->wp_free_blks = (EIB_BLKS_PER_POOL >= 64) ? (~0) : + (((uint64_t)1 << EIB_BLKS_PER_POOL) - 1); + for (i = 0; i < EIB_BLKS_PER_POOL; i++) + wp->wp_free_wqes[i] = ~0; + + *wpp = wp; +} + +/*ARGSUSED*/ +static void +eib_rsrc_fini_wqe_pool(eib_t *ss, eib_wqe_pool_t **wpp) +{ + eib_wqe_pool_t *wp; + + ASSERT(wpp != NULL); + + wp = *wpp; + ASSERT(*wpp != NULL); + + cv_destroy(&wp->wp_cv); + mutex_destroy(&wp->wp_lock); + + kmem_free((void *)(uintptr_t)(wp->wp_vaddr), wp->wp_memsz); + kmem_free(wp->wp_wqe, EIB_WQES_PER_POOL * sizeof (eib_wqe_t)); + kmem_free(wp, sizeof (eib_wqe_pool_t)); + + *wpp = NULL; +} + +/*ARGSUSED*/ +static boolean_t +eib_rsrc_ok_to_free_pool(eib_t *ss, eib_wqe_pool_t *wp, boolean_t force) +{ + uint64_t free_blks; + int i; + + /* + * See if we can release all memory allocated for buffers, wqes and + * the pool. Note that in the case of data channel rx buffers, some + * of the buffers may not be free if the nw layer is holding on to + * them still. If this is the case, we cannot free the wqe pool now + * or a subsequent access by the nw layer to the buffers will cause + * a panic. + */ + ASSERT(wp != NULL); + + /* + * If force-free flag is set, we can always release the memory. + * Note that this flag is unused currently, and should be removed. + */ + if (force == B_TRUE) + return (B_TRUE); + + mutex_enter(&wp->wp_lock); + + /* + * If a whole block remains allocated, obviously we cannot free + * the pool + */ + free_blks = (EIB_BLKS_PER_POOL >= 64) ? 
(~0) : + (((uint64_t)1 << EIB_BLKS_PER_POOL) - 1); + if (wp->wp_free_blks != free_blks) { + mutex_exit(&wp->wp_lock); + return (B_FALSE); + } + + /* + * If even a single wqe within any one block remains in-use, we + * cannot free the pool + */ + for (i = 0; i < EIB_BLKS_PER_POOL; i++) { + if (wp->wp_free_wqes[i] != (~0)) { + mutex_exit(&wp->wp_lock); + return (B_FALSE); + } + } + + mutex_exit(&wp->wp_lock); + + return (B_TRUE); +} + +/*ARGSUSED*/ +static int +eib_rsrc_grab_wqes(eib_t *ss, eib_wqe_pool_t *wp, eib_wqe_t **wqes, + uint_t n_req, uint_t *actual, int pri) +{ + uint_t n_allocd = 0; + int blk; + int ndx; + int wqe_ndx; + + ASSERT(wp != NULL); + ASSERT(wqes != NULL); + + mutex_enter(&wp->wp_lock); + + /* + * If this is a low priority request, adjust the number requested + * so we don't allocate beyond the low-water-mark + */ + if (pri == EIB_WPRI_LO) { + if (wp->wp_nfree <= wp->wp_nfree_lwm) + n_req = 0; + else if ((wp->wp_nfree - n_req) < wp->wp_nfree_lwm) + n_req = wp->wp_nfree - wp->wp_nfree_lwm; + } + + for (n_allocd = 0; n_allocd < n_req; n_allocd++) { + /* + * If the entire pool is unavailable, quit + */ + if (wp->wp_free_blks == 0) + break; + + /* + * Find the first wqe that's available + */ + blk = EIB_FIND_LSB_SET(wp->wp_free_blks); + ASSERT(blk != -1); + ndx = EIB_FIND_LSB_SET(wp->wp_free_wqes[blk]); + ASSERT(ndx != -1); + + /* + * Mark the wqe as allocated + */ + wp->wp_free_wqes[blk] &= (~((uint64_t)1 << ndx)); + + /* + * If this was the last free wqe in this block, mark + * the block itself as unavailable + */ + if (wp->wp_free_wqes[blk] == 0) + wp->wp_free_blks &= (~((uint64_t)1 << blk)); + + /* + * Return this wqe to the caller + */ + wqe_ndx = blk * EIB_WQES_PER_BLK + ndx; + wqes[n_allocd] = &(wp->wp_wqe[wqe_ndx]); + } + + wp->wp_nfree -= n_allocd; + + mutex_exit(&wp->wp_lock); + + if (n_allocd == 0) + return (EIB_E_FAILURE); + + if (actual) { + *actual = n_allocd; + } + + return (EIB_E_SUCCESS); +} + +/*ARGSUSED*/ +static void 
+eib_rsrc_return_wqes(eib_t *ss, eib_wqe_pool_t *wp, eib_wqe_t **wqes, + uint_t n_wqes) +{ + eib_wqe_t *wqe; + uint_t n_freed = 0; + uint_t blk; + uint_t ndx; + + ASSERT(wp != NULL); + ASSERT(wqes != NULL); + + mutex_enter(&wp->wp_lock); + for (n_freed = 0; n_freed < n_wqes; n_freed++) { + wqe = wqes[n_freed]; + + /* + * This wqe is being returned back to the pool, so clear + * any wqe flags and reset buffer address and size in the + * single segment sgl back to what they were initially. + * Also erase any mblk pointer and callback function ptrs. + */ + wqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)wqe->qe_cpbuf; + wqe->qe_sgl.ds_len = wqe->qe_bufsz; + wqe->qe_mp = NULL; + wqe->qe_chan = NULL; + wqe->qe_vnic_inst = -1; + wqe->qe_info &= (~EIB_WQEFLGS_MASK); + + /* + * Mark the wqe free in its block + */ + blk = EIB_WQE_BLK(wqe->qe_info); + ndx = EIB_WQE_NDX(wqe->qe_info); + + wp->wp_free_wqes[blk] |= ((uint64_t)1 << ndx); + + /* + * This block now has atleast one wqe free, so mark + * the block itself as available and move on to the + * next wqe to free + */ + wp->wp_free_blks |= ((uint64_t)1 << blk); + } + + wp->wp_nfree += n_freed; + + /* + * If the number of available wqes in the pool has just crossed + * the high-water-mark, wakeup anyone who may be sleeping on it. + */ + if ((wp->wp_type == EIB_WP_TYPE_TX) && + ((wp->wp_nfree - n_freed) < EIB_NFREE_SWQES_HWM) && + (wp->wp_nfree >= EIB_NFREE_SWQES_HWM)) { + cv_broadcast(&wp->wp_cv); + } + + mutex_exit(&wp->wp_lock); +} + +static void +eib_rb_rsrc_setup_txbufs(eib_t *ss, boolean_t force) +{ + eib_wqe_pool_t *wp = ss->ei_tx; + eib_wqe_t *wqe; + ibt_ud_dest_hdl_t dest; + ibt_status_t ret; + uint8_t *plhdr; + int i; + + if (wp == NULL) + return; + + /* + * Check if it's ok to free the tx wqe pool (i.e. all buffers have + * been reclaimed) and if so, stop the txwqe monitor thread (and wait + * for it to die), release the UD destination handles, deregister + * memory and fini the wqe pool. 
+ */ + if (eib_rsrc_ok_to_free_pool(ss, wp, force)) { + eib_stop_monitor_tx_wqes(ss); + + for (i = 0; i < EIB_WQES_PER_POOL; i++) { + wqe = &wp->wp_wqe[i]; + if ((plhdr = wqe->qe_payload_hdr) != NULL) { + kmem_free(plhdr, EIB_MAX_PAYLOAD_HDR_SZ); + } + if ((dest = wqe->qe_dest) != NULL) { + ret = ibt_free_ud_dest(dest); + if (ret != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_rsrc_setup_txbufs: " + "ibt_free_ud_dest() failed, ret=%d", + ret); + } + } + } + if (wp->wp_mr) { + if ((ret = ibt_deregister_mr(ss->ei_hca_hdl, + wp->wp_mr)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_rsrc_setup_txbufs: " + "ibt_deregister_mr() failed, ret=%d", ret); + } + wp->wp_mr = NULL; + } + eib_rsrc_fini_wqe_pool(ss, &ss->ei_tx); + } +} + +void +eib_rb_rsrc_setup_rxbufs(eib_t *ss, boolean_t force) +{ + eib_wqe_pool_t *rx = ss->ei_rx; + ibt_status_t ret; + + if (rx == NULL) + return; + + /* + * Check if it's ok to free the rx wqe pool (i.e. all buffers have + * been reclaimed) and if so, deregister memory and fini the wqe pool. + */ + if (eib_rsrc_ok_to_free_pool(ss, rx, force)) { + if (rx->wp_mr) { + if ((ret = ibt_deregister_mr(ss->ei_hca_hdl, + rx->wp_mr)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_rsrc_setup_rxbufs: " + "ibt_deregister_mr() failed, ret=%d", ret); + } + rx->wp_mr = NULL; + } + + eib_rsrc_fini_wqe_pool(ss, &ss->ei_rx); + } +} + +static void +eib_rb_rsrc_setup_lsobufs(eib_t *ss, boolean_t force) +{ + eib_lsobkt_t *bkt; + ibt_status_t ret; + + /* + * Remove the lso bucket from the state + */ + if ((bkt = ss->ei_lso) == NULL) + return; + + /* + * Try to stop the lso bufs monitor thread. If we fail, we simply + * return. We'll have another shot at it later from detach() with + * the force flag set. 
+ */ + if (eib_stop_monitor_lso_bufs(ss, force) != EIB_E_SUCCESS) + return; + + /* + * Free the buflist + */ + if (bkt->bk_bufl) { + kmem_free(bkt->bk_bufl, bkt->bk_nelem * sizeof (eib_lsobuf_t)); + bkt->bk_bufl = NULL; + } + + /* + * Deregister LSO memory and free it + */ + if (bkt->bk_mr_hdl) { + if ((ret = ibt_deregister_mr(ss->ei_hca_hdl, + bkt->bk_mr_hdl)) != IBT_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_rb_rsrc_setup_lsobufs: " + "ibt_deregister_mr() failed, ret=%d", ret); + } + bkt->bk_mr_hdl = NULL; + } + if (bkt->bk_mem) { + kmem_free(bkt->bk_mem, bkt->bk_nelem * EIB_LSO_BUFSZ); + bkt->bk_mem = NULL; + } + + /* + * Destroy the mutex and condvar + */ + cv_destroy(&bkt->bk_cv); + mutex_destroy(&bkt->bk_lock); + + /* + * Finally, free the lso bucket itself + */ + kmem_free(bkt, sizeof (eib_lsobkt_t)); + ss->ei_lso = NULL; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_svc.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1001 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/callb.h> +#include <sys/mac_provider.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Thread to handle EoIB events asynchronously + */ +void +eib_events_handler(eib_t *ss) +{ + eib_event_t *evi; + eib_event_t *nxt; + kmutex_t ci_lock; + callb_cpr_t ci; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_EVENTS_HDLR); + +wait_for_event: + mutex_enter(&ss->ei_ev_lock); + while ((evi = ss->ei_event) == NULL) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&ss->ei_ev_cv, &ss->ei_ev_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + /* + * Are we being asked to die ? 
+ */ + if (evi->ev_code == EIB_EV_SHUTDOWN) { + while (evi) { + nxt = evi->ev_next; + kmem_free(evi, sizeof (eib_event_t)); + evi = nxt; + } + ss->ei_event = NULL; + mutex_exit(&ss->ei_ev_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + /* + * Otherwise, pull out the first entry from our work queue + */ + ss->ei_event = evi->ev_next; + evi->ev_next = NULL; + + mutex_exit(&ss->ei_ev_lock); + + /* + * Process this event + * + * Note that we don't want to race with plumb/unplumb in this + * handler, since we may have to restart vnics or do stuff that + * may get re-initialized or released if we allowed plumb/unplumb + * to happen in parallel. + */ + eib_mac_set_nic_state(ss, EIB_NIC_RESTARTING); + + switch (evi->ev_code) { + case EIB_EV_PORT_DOWN: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_PORT_DOWN"); + + eib_mac_link_down(ss, B_FALSE); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_PORT_DOWN"); + break; + + case EIB_EV_PORT_UP: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_PORT_UP"); + + eib_ibt_link_mod(ss); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_PORT_UP"); + break; + + case EIB_EV_PKEY_CHANGE: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_PKEY_CHANGE"); + + eib_ibt_link_mod(ss); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_PKEY_CHANGE"); + break; + + case EIB_EV_SGID_CHANGE: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_SGID_CHANGE"); + + eib_ibt_link_mod(ss); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_SGID_CHANGE"); + break; + + case EIB_EV_CLNT_REREG: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_CLNT_REREG"); + + eib_ibt_link_mod(ss); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_CLNT_REREG"); + break; + + case EIB_EV_GW_UP: + 
EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_GW_UP"); + + /* + * EoIB nexus has notified us that our gateway is now + * reachable. Unless we already think it is reachable, + * mark it so in our records and try to resurrect dead + * vnics. + */ + mutex_enter(&ss->ei_vnic_lock); + if (ss->ei_gw_unreachable == B_FALSE) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: gw reachable"); + mutex_exit(&ss->ei_vnic_lock); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_GW_UP"); + break; + } + ss->ei_gw_unreachable = B_FALSE; + mutex_exit(&ss->ei_vnic_lock); + + /* + * If we've not even started yet, we have nothing to do. + */ + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_GW_UP"); + break; + } + + if (eib_mac_hca_portstate(ss, NULL, NULL) != EIB_E_SUCCESS) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: " + "HCA portstate failed, marking link down"); + + eib_mac_link_down(ss, B_FALSE); + } else { + uint8_t vn0_mac[ETHERADDRL]; + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: " + "HCA portstate ok, resurrecting zombies"); + + bcopy(eib_zero_mac, vn0_mac, ETHERADDRL); + eib_vnic_resurrect_zombies(ss, vn0_mac); + + /* + * If we've resurrected the zombies because the gateway + * went down and came back, it is possible our unicast + * mac address changed from what it was earlier. If + * so, we need to update our unicast address with the + * mac layer before marking the link up. 
+ */ + if (bcmp(vn0_mac, eib_zero_mac, ETHERADDRL) != 0) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: updating unicast " + "addr to %x:%x:%x:%x:%x:%x", vn0_mac[0], + vn0_mac[1], vn0_mac[2], vn0_mac[3], + vn0_mac[4], vn0_mac[5]); + + mac_unicst_update(ss->ei_mac_hdl, vn0_mac); + } + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: eib_mac_link_up(B_FALSE)"); + + eib_mac_link_up(ss, B_FALSE); + } + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_GW_UP"); + break; + + case EIB_EV_GW_INFO_UPDATE: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin EIB_EV_GW_INFO_UPDATE"); + + if (evi->ev_arg) { + eib_update_props(ss, (eib_gw_info_t *)(evi->ev_arg)); + kmem_free(evi->ev_arg, sizeof (eib_gw_info_t)); + } + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: End EIB_EV_GW_INFO_UPDATE"); + break; + + case EIB_EV_MCG_DELETED: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin-End EIB_EV_MCG_DELETED"); + break; + + case EIB_EV_MCG_CREATED: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin-End EIB_EV_MCG_CREATED"); + break; + + case EIB_EV_GW_EPORT_DOWN: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin-End EIB_EV_GW_EPORT_DOWN"); + break; + + case EIB_EV_GW_DOWN: + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_events_handler: Begin-End EIB_EV_GW_DOWN"); + break; + } + + eib_mac_clr_nic_state(ss, EIB_NIC_RESTARTING); + + kmem_free(evi, sizeof (eib_event_t)); + goto wait_for_event; + + /*NOTREACHED*/ +} + +void +eib_svc_enqueue_event(eib_t *ss, eib_event_t *evi) +{ + eib_event_t *elem = NULL; + eib_event_t *tail = NULL; + + mutex_enter(&ss->ei_ev_lock); + + /* + * Notice to shutdown has a higher priority than the + * rest and goes to the head of the list. Everything + * else goes at the end. 
+ */ + if (evi->ev_code == EIB_EV_SHUTDOWN) { + evi->ev_next = ss->ei_event; + ss->ei_event = evi; + } else { + for (elem = ss->ei_event; elem; elem = elem->ev_next) + tail = elem; + + if (tail) + tail->ev_next = evi; + else + ss->ei_event = evi; + } + + cv_signal(&ss->ei_ev_cv); + mutex_exit(&ss->ei_ev_lock); +} + +/* + * Thread to refill channels with rwqes whenever they get low. + */ +void +eib_refill_rwqes(eib_t *ss) +{ + eib_chan_t *chan; + kmutex_t ci_lock; + callb_cpr_t ci; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_RWQES_REFILLER); + +wait_for_refill_work: + mutex_enter(&ss->ei_rxpost_lock); + + while ((ss->ei_rxpost == NULL) && (ss->ei_rxpost_die == 0)) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&ss->ei_rxpost_cv, &ss->ei_rxpost_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + /* + * Discard all requests for refill if we're being asked to die + */ + if (ss->ei_rxpost_die) { + ss->ei_rxpost = NULL; + mutex_exit(&ss->ei_rxpost_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + ASSERT(ss->ei_rxpost != NULL); + + /* + * Take the first element out of the queue + */ + chan = ss->ei_rxpost; + ss->ei_rxpost = chan->ch_rxpost_next; + chan->ch_rxpost_next = NULL; + + mutex_exit(&ss->ei_rxpost_lock); + + /* + * Try to post a bunch of recv wqes into this channel. If we + * fail, it means that we haven't even been able to post a + * single recv wqe. This is alarming, but there's nothing + * we can do. We just move on to the next channel needing + * our service. 
+ */ + if (eib_chan_post_rx(ss, chan, NULL) != EIB_E_SUCCESS) { + EIB_DPRINTF_ERR(ss->ei_instance, + "eib_refill_rwqes: eib_chan_post_rx() failed"); + } + + /* + * Mark it to indicate that the refilling is done + */ + mutex_enter(&chan->ch_rx_lock); + chan->ch_rx_refilling = B_FALSE; + mutex_exit(&chan->ch_rx_lock); + + goto wait_for_refill_work; + + /*NOTREACHED*/ +} + +/* + * Thread to create or restart vnics when required + */ +void +eib_vnic_creator(eib_t *ss) +{ + eib_vnic_req_t *vrq; + eib_vnic_req_t *elem; + eib_vnic_req_t *nxt; + kmutex_t ci_lock; + callb_cpr_t ci; + uint_t vr_req; + uint8_t *vr_mac; + int ret; + int err; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_VNIC_CREATOR); + +wait_for_vnic_req: + mutex_enter(&ss->ei_vnic_req_lock); + + while ((vrq = ss->ei_vnic_req) == NULL) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&ss->ei_vnic_req_cv, &ss->ei_vnic_req_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + /* + * Pull out the first request + */ + ss->ei_vnic_req = vrq->vr_next; + vrq->vr_next = NULL; + + vr_req = vrq->vr_req; + vr_mac = vrq->vr_mac; + + switch (vr_req) { + case EIB_CR_REQ_DIE: + case EIB_CR_REQ_FLUSH: + /* + * Cleanup all pending reqs and failed reqs + */ + for (elem = ss->ei_vnic_req; elem; elem = nxt) { + nxt = elem->vr_next; + kmem_free(elem, sizeof (eib_vnic_req_t)); + } + for (elem = ss->ei_failed_vnic_req; elem; elem = nxt) { + nxt = elem->vr_next; + kmem_free(elem, sizeof (eib_vnic_req_t)); + } + ss->ei_vnic_req = NULL; + ss->ei_failed_vnic_req = NULL; + ss->ei_pending_vnic_req = NULL; + mutex_exit(&ss->ei_vnic_req_lock); + + break; + + case EIB_CR_REQ_NEW_VNIC: + ss->ei_pending_vnic_req = vrq; + mutex_exit(&ss->ei_vnic_req_lock); + + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_vnic_creator: " + "new vnic creation request for %x:%x:%x:%x:%x:%x, 0x%x", + vr_mac[0], 
vr_mac[1], vr_mac[2], vr_mac[3], vr_mac[4], + vr_mac[5], vrq->vr_vlan); + + /* + * Make sure we don't race with the plumb/unplumb code. If + * the eoib instance has been unplumbed already, we ignore any + * creation requests that may have been pending. + */ + eib_mac_set_nic_state(ss, EIB_NIC_STARTING); + + if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) != + EIB_NIC_STARTED) { + mutex_enter(&ss->ei_vnic_req_lock); + ss->ei_pending_vnic_req = NULL; + mutex_exit(&ss->ei_vnic_req_lock); + eib_mac_clr_nic_state(ss, EIB_NIC_STARTING); + break; + } + + /* + * Try to create a new vnic with the supplied parameters. + */ + err = 0; + if ((ret = eib_vnic_create(ss, vrq->vr_mac, vrq->vr_vlan, + NULL, &err)) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_creator: " + "eib_vnic_create(mac=%x:%x:%x:%x:%x:%x, vlan=0x%x) " + "failed, ret=%d", vr_mac[0], vr_mac[1], vr_mac[2], + vr_mac[3], vr_mac[4], vr_mac[5], vrq->vr_vlan, err); + } + + /* + * If we failed, add this vnic req to our failed list (unless + * it already exists there), so we won't try to create this + * vnic again. Whether we fail or succeed, we're done with + * processing this req, so clear the pending req. + */ + mutex_enter(&ss->ei_vnic_req_lock); + if ((ret != EIB_E_SUCCESS) && (err != EEXIST)) { + vrq->vr_next = ss->ei_failed_vnic_req; + ss->ei_failed_vnic_req = vrq; + vrq = NULL; + } + ss->ei_pending_vnic_req = NULL; + mutex_exit(&ss->ei_vnic_req_lock); + + /* + * Notify the mac layer that it should retry its tx again. If we + * had created the vnic successfully, we'll be able to send the + * packets; if we had not been successful, we'll drop packets on + * this vnic. 
+ */ + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_vnic_creator: calling mac_tx_update()"); + mac_tx_update(ss->ei_mac_hdl); + + eib_mac_clr_nic_state(ss, EIB_NIC_STARTING); + break; + + default: + EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_vnic_creator: " + "unknown request 0x%lx, ignoring", vrq->vr_req); + break; + } + + /* + * Free the current req and quit if we have to + */ + if (vrq) { + kmem_free(vrq, sizeof (eib_vnic_req_t)); + } + + if (vr_req == EIB_CR_REQ_DIE) { + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + goto wait_for_vnic_req; + /*NOTREACHED*/ +} + +/* + * Thread to monitor tx wqes and update the mac layer when needed. + * Note that this thread can only be started after the tx wqe pool + * has been allocated and initialized. + */ +void +eib_monitor_tx_wqes(eib_t *ss) +{ + eib_wqe_pool_t *wp = ss->ei_tx; + kmutex_t ci_lock; + callb_cpr_t ci; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_TXWQES_MONITOR); + + ASSERT(wp != NULL); + +monitor_wqe_status: + mutex_enter(&wp->wp_lock); + + /* + * Wait till someone falls short of wqes + */ + while (wp->wp_status == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&wp->wp_cv, &wp->wp_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + /* + * Have we been asked to die ? 
+ */ + if (wp->wp_status & EIB_TXWQE_MONITOR_DIE) { + mutex_exit(&wp->wp_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + ASSERT((wp->wp_status & EIB_TXWQE_SHORT) != 0); + + /* + * Start monitoring free wqes till they cross min threshold + */ + while ((wp->wp_nfree < EIB_NFREE_SWQES_HWM) && + ((wp->wp_status & EIB_TXWQE_MONITOR_DIE) == 0)) { + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&wp->wp_cv, &wp->wp_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + /* + * Have we been asked to die ? + */ + if (wp->wp_status & EIB_TXWQE_MONITOR_DIE) { + mutex_exit(&wp->wp_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + ASSERT(wp->wp_nfree >= EIB_NFREE_SWQES_HWM); + wp->wp_status &= (~EIB_TXWQE_SHORT); + + mutex_exit(&wp->wp_lock); + + /* + * Inform the mac layer that tx resources are now available + * and go back to monitoring + */ + if (ss->ei_mac_hdl) { + mac_tx_update(ss->ei_mac_hdl); + } + goto monitor_wqe_status; + + /*NOTREACHED*/ +} + +/* + * Thread to monitor lso bufs and update the mac layer as needed. + * Note that this thread can only be started after the lso buckets + * have been allocated and initialized. 
+ */ +void +eib_monitor_lso_bufs(eib_t *ss) +{ + eib_lsobkt_t *bkt = ss->ei_lso; + kmutex_t ci_lock; + callb_cpr_t ci; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_LSOBUFS_MONITOR); + + ASSERT(bkt != NULL); + +monitor_lso_status: + mutex_enter(&bkt->bk_lock); + + /* + * Wait till someone falls short of LSO buffers or we're asked + * to die + */ + while (bkt->bk_status == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&bkt->bk_cv, &bkt->bk_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + if (bkt->bk_status & EIB_LBUF_MONITOR_DIE) { + mutex_exit(&bkt->bk_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + ASSERT((bkt->bk_status & EIB_LBUF_SHORT) != 0); + + /* + * Start monitoring free LSO buffers till there are enough + * free buffers available + */ + while ((bkt->bk_nfree < EIB_LSO_FREE_BUFS_THRESH) && + ((bkt->bk_status & EIB_LBUF_MONITOR_DIE) == 0)) { + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&bkt->bk_cv, &bkt->bk_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + if (bkt->bk_status & EIB_LBUF_MONITOR_DIE) { + mutex_exit(&bkt->bk_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + /* + * We have enough lso buffers available now + */ + ASSERT(bkt->bk_nfree >= EIB_LSO_FREE_BUFS_THRESH); + bkt->bk_status &= (~EIB_LBUF_SHORT); + + mutex_exit(&bkt->bk_lock); + + /* + * Inform the mac layer that tx lso resources are now available + * and go back to monitoring + */ + if (ss->ei_mac_hdl) { + mac_tx_update(ss->ei_mac_hdl); + } + goto monitor_lso_status; + + /*NOTREACHED*/ +} + +/* + * Thread to manage the keepalive requirements for vnics and the gateway. 
+ */ +void +eib_manage_keepalives(eib_t *ss) +{ + eib_ka_vnics_t *elem; + eib_ka_vnics_t *nxt; + clock_t deadline; + int64_t lbolt64; + int err; + kmutex_t ci_lock; + callb_cpr_t ci; + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_EVENTS_HDLR); + + mutex_enter(&ss->ei_ka_vnics_lock); + +periodic_keepalive: + deadline = ddi_get_lbolt() + ss->ei_gw_props->pp_vnic_ka_ticks; + + while ((ss->ei_ka_vnics_event & + (EIB_KA_VNICS_DIE | EIB_KA_VNICS_TIMED_OUT)) == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + if (cv_timedwait(&ss->ei_ka_vnics_cv, &ss->ei_ka_vnics_lock, + deadline) == -1) { + ss->ei_ka_vnics_event |= EIB_KA_VNICS_TIMED_OUT; + } + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + if (ss->ei_ka_vnics_event & EIB_KA_VNICS_DIE) { + for (elem = ss->ei_ka_vnics; elem; elem = nxt) { + nxt = elem->ka_next; + kmem_free(elem, sizeof (eib_ka_vnics_t)); + } + ss->ei_ka_vnics = NULL; + mutex_exit(&ss->ei_ka_vnics_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + /* + * Are there any vnics that need keepalive management ? + */ + ss->ei_ka_vnics_event &= ~EIB_KA_VNICS_TIMED_OUT; + if (ss->ei_ka_vnics == NULL) + goto periodic_keepalive; + + /* + * Ok, we need to send vnic keepalives to our gateway. But first + * check if the gateway heartbeat is good as of this moment. Note + * that we need do get the lbolt value after acquiring ei_vnic_lock + * to ensure that ei_gw_last_heartbeat does not change before the + * comparison (to avoid a negative value in the comparison result + * causing us to incorrectly assume that the gateway heartbeat has + * stopped). 
+ */ + mutex_enter(&ss->ei_vnic_lock); + + lbolt64 = ddi_get_lbolt64(); + + if (ss->ei_gw_last_heartbeat != 0) { + if ((lbolt64 - ss->ei_gw_last_heartbeat) > + ss->ei_gw_props->pp_gw_ka_ticks) { + + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_manage_keepalives: no keepalives from gateway " + "0x%x for hca_guid=0x%llx, port=0x%x, " + "last_gw_ka=0x%llx", ss->ei_gw_props->pp_gw_portid, + ss->ei_props->ep_hca_guid, + ss->ei_props->ep_port_num, + ss->ei_gw_last_heartbeat); + + for (elem = ss->ei_ka_vnics; elem; elem = nxt) { + nxt = elem->ka_next; + ss->ei_zombie_vnics |= + ((uint64_t)1 << elem->ka_vnic->vn_instance); + kmem_free(elem, sizeof (eib_ka_vnics_t)); + } + ss->ei_ka_vnics = NULL; + ss->ei_gw_unreachable = B_TRUE; + mutex_exit(&ss->ei_vnic_lock); + + eib_mac_link_down(ss, B_FALSE); + + goto periodic_keepalive; + } + } + mutex_exit(&ss->ei_vnic_lock); + + for (elem = ss->ei_ka_vnics; elem; elem = elem->ka_next) + (void) eib_fip_heartbeat(ss, elem->ka_vnic, &err); + + goto periodic_keepalive; + /*NOTREACHED*/ +} + +void +eib_stop_events_handler(eib_t *ss) +{ + eib_event_t *evi; + + evi = kmem_zalloc(sizeof (eib_event_t), KM_SLEEP); + evi->ev_code = EIB_EV_SHUTDOWN; + evi->ev_arg = NULL; + + eib_svc_enqueue_event(ss, evi); + + thread_join(ss->ei_events_handler); +} + +void +eib_stop_refill_rwqes(eib_t *ss) +{ + mutex_enter(&ss->ei_rxpost_lock); + + ss->ei_rxpost_die = 1; + + cv_signal(&ss->ei_rxpost_cv); + mutex_exit(&ss->ei_rxpost_lock); + + thread_join(ss->ei_rwqes_refiller); +} + +void +eib_stop_vnic_creator(eib_t *ss) +{ + eib_vnic_req_t *vrq; + + vrq = kmem_zalloc(sizeof (eib_vnic_req_t), KM_SLEEP); + vrq->vr_req = EIB_CR_REQ_DIE; + vrq->vr_next = NULL; + + eib_vnic_enqueue_req(ss, vrq); + + thread_join(ss->ei_vnic_creator); +} + +void +eib_stop_monitor_tx_wqes(eib_t *ss) +{ + eib_wqe_pool_t *wp = ss->ei_tx; + + mutex_enter(&wp->wp_lock); + + wp->wp_status |= EIB_TXWQE_MONITOR_DIE; + + cv_signal(&wp->wp_cv); + mutex_exit(&wp->wp_lock); + + 
thread_join(ss->ei_txwqe_monitor); +} + +int +eib_stop_monitor_lso_bufs(eib_t *ss, boolean_t force) +{ + eib_lsobkt_t *bkt = ss->ei_lso; + + mutex_enter(&bkt->bk_lock); + + /* + * If there are some buffers still not reaped and the force + * flag is not set, return without doing anything. Otherwise, + * stop the lso bufs monitor and wait for it to die. + */ + if ((bkt->bk_nelem != bkt->bk_nfree) && (force == B_FALSE)) { + mutex_exit(&bkt->bk_lock); + return (EIB_E_FAILURE); + } + + bkt->bk_status |= EIB_LBUF_MONITOR_DIE; + + cv_signal(&bkt->bk_cv); + mutex_exit(&bkt->bk_lock); + + thread_join(ss->ei_lsobufs_monitor); + return (EIB_E_SUCCESS); +} + +void +eib_stop_manage_keepalives(eib_t *ss) +{ + mutex_enter(&ss->ei_ka_vnics_lock); + + ss->ei_ka_vnics_event |= EIB_KA_VNICS_DIE; + + cv_signal(&ss->ei_ka_vnics_cv); + mutex_exit(&ss->ei_ka_vnics_lock); + + thread_join(ss->ei_keepalives_manager); +} + +void +eib_flush_vnic_reqs(eib_t *ss) +{ + eib_vnic_req_t *vrq; + + vrq = kmem_zalloc(sizeof (eib_vnic_req_t), KM_SLEEP); + vrq->vr_req = EIB_CR_REQ_FLUSH; + vrq->vr_next = NULL; + + eib_vnic_enqueue_req(ss, vrq); +} + +/*ARGSUSED*/ +void +eib_gw_alive_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg, + void *impl_data) +{ + eib_t *ss = (eib_t *)arg; + eib_event_t *evi; + + evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP); + if (evi == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_alive_cb: " + "no memory, ignoring this gateway alive event"); + } else { + evi->ev_code = EIB_EV_GW_UP; + evi->ev_arg = NULL; + eib_svc_enqueue_event(ss, evi); + } +} + +/*ARGSUSED*/ +void +eib_login_ack_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg, + void *impl_data) +{ + eib_t *ss = (eib_t *)arg; + uint8_t *pkt = (uint8_t *)impl_data; + eib_login_data_t ld; + + /* + * We have received a login ack message from the gateway via the EoIB + * nexus (solicitation qpn). The packet is passed to us raw (unparsed) + * and we have to figure out if this is a vnic login ack. 
+ */ + if (eib_fip_parse_login_ack(ss, pkt + EIB_GRH_SZ, &ld) == EIB_E_SUCCESS) + eib_vnic_login_ack(ss, &ld); +} + +/*ARGSUSED*/ +void +eib_gw_info_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg, + void *impl_data) +{ + eib_t *ss = (eib_t *)arg; + eib_event_t *evi; + + evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP); + if (evi == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_info_cb: " + "no memory, ignoring this gateway props update event"); + return; + } + evi->ev_arg = kmem_zalloc(sizeof (eib_gw_info_t), KM_NOSLEEP); + if (evi->ev_arg == NULL) { + EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_info_cb: " + "no memory, ignoring this gateway props update event"); + kmem_free(evi, sizeof (eib_event_t)); + return; + } + bcopy(impl_data, evi->ev_arg, sizeof (eib_gw_info_t)); + evi->ev_code = EIB_EV_GW_INFO_UPDATE; + + eib_svc_enqueue_event(ss, evi); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eib_vnic.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,2228 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/eib_impl.h> + +/* + * Declarations private to this file + */ +static int eib_vnic_get_instance(eib_t *, int *); +static void eib_vnic_ret_instance(eib_t *, int); +static void eib_vnic_modify_enter(eib_t *, uint_t); +static void eib_vnic_modify_exit(eib_t *, uint_t); +static int eib_vnic_create_common(eib_t *, eib_vnic_t *, int *); +static int eib_vnic_set_partition(eib_t *, eib_vnic_t *, int *); +static void eib_vnic_make_vhub_mgid(uint8_t *, uint8_t, uint8_t *, uint8_t, + uint8_t, uint32_t, ib_gid_t *); +static int eib_vnic_attach_ctl_mcgs(eib_t *, eib_vnic_t *, int *); +static int eib_vnic_attach_vhub_table(eib_t *, eib_vnic_t *); +static int eib_vnic_attach_vhub_update(eib_t *, eib_vnic_t *); +static void eib_vnic_start_keepalives(eib_t *, eib_vnic_t *); +static int eib_vnic_lookup_dest(eib_vnic_t *, uint8_t *, uint16_t, + eib_vhub_map_t *, ibt_mcg_info_t *, int *); +static void eib_vnic_leave_all_data_mcgs(eib_t *, eib_vnic_t *); +static void eib_vnic_rejoin_data_mcgs(eib_t *, eib_vnic_t *); +static void eib_vnic_reattach_ctl_mcgs(eib_t *, eib_vnic_t *); +static void eib_rb_vnic_create_common(eib_t *, eib_vnic_t *, uint_t); +static void eib_rb_vnic_attach_ctl_mcgs(eib_t *, eib_vnic_t *); +static void eib_rb_vnic_attach_vhub_table(eib_t *, eib_vnic_t *); +static void eib_rb_vnic_attach_vhub_update(eib_t *, eib_vnic_t *); +static void eib_rb_vnic_start_keepalives(eib_t *, eib_vnic_t *); +static void eib_rb_vnic_join_data_mcg(eib_t *, eib_vnic_t *, uint8_t *); + +/* + * Definitions private to this file + */ +#define EIB_VNIC_STRUCT_ALLOCD 0x0001 +#define EIB_VNIC_GOT_INSTANCE 0x0002 +#define EIB_VNIC_CREATE_COMMON_DONE 0x0004 +#define EIB_VNIC_CTLQP_CREATED 0x0008 +#define EIB_VNIC_DATAQP_CREATED 0x0010 +#define EIB_VNIC_LOGIN_DONE 0x0020 +#define EIB_VNIC_PARTITION_SET 0x0040 +#define 
EIB_VNIC_RX_POSTED_TO_CTLQP	0x0080
#define	EIB_VNIC_RX_POSTED_TO_DATAQP	0x0100
#define	EIB_VNIC_ATTACHED_TO_CTL_MCGS	0x0200
#define	EIB_VNIC_GOT_VHUB_TABLE		0x0400
#define	EIB_VNIC_KEEPALIVES_STARTED	0x0800
#define	EIB_VNIC_BROADCAST_JOINED	0x1000

/*
 * Destination type (as classified by eib_vnic_lookup_dest())
 */
#define	EIB_TX_UNICAST		1
#define	EIB_TX_MULTICAST	2
#define	EIB_TX_BROADCAST	3

/*
 * Create (and log into the gateway) a new vnic for the given
 * {macaddr, vlan} tuple. A NULL macaddr is allowed (the gateway then
 * assigns the address). On success the new vnic is optionally returned
 * through vnicp. On failure, *err carries the errno-style reason
 * (EMFILE: no free vnic instance; EEXIST: an earlier creation for this
 * tuple failed; or whatever eib_vnic_create_common() reports) and all
 * partial state is rolled back via eib_rb_vnic_create().
 *
 * Serialized against other vnic modifications through
 * eib_vnic_modify_enter/exit(EIB_VN_BEING_CREATED).
 */
int
eib_vnic_create(eib_t *ss, uint8_t *macaddr, uint16_t vlan, eib_vnic_t **vnicp,
    int *err)
{
	eib_vnic_t *vnic = NULL;
	boolean_t failed_vnic = B_FALSE;
	uint_t progress = 0;

	eib_vnic_modify_enter(ss, EIB_VN_BEING_CREATED);

	/*
	 * When a previously created vnic is being resurrected due to a
	 * gateway reboot, there's a race possible where a creation request
	 * for the existing vnic could get filed with the vnic creator
	 * thread. So, before we go ahead with the creation of this vnic,
	 * make sure we already don't have the vnic.
	 */
	if (macaddr) {
		if (eib_data_lookup_vnic(ss, macaddr, vlan, vnicp,
		    &failed_vnic) == EIB_E_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_create: "
			    "vnic for mac=%x:%x:%x:%x:%x:%x, vlan=0x%x "
			    "already there, no duplicate creation", macaddr[0],
			    macaddr[1], macaddr[2], macaddr[3], macaddr[4],
			    macaddr[5], vlan);

			eib_vnic_modify_exit(ss, EIB_VN_BEING_CREATED);
			return (EIB_E_SUCCESS);
		} else if (failed_vnic) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_create: "
			    "vnic for mac=%x:%x:%x:%x:%x:%x, vlan=0x%x "
			    "failed earlier, shouldn't be here at all",
			    macaddr[0], macaddr[1], macaddr[2], macaddr[3],
			    macaddr[4], macaddr[5], vlan);

			*err = EEXIST;

			eib_vnic_modify_exit(ss, EIB_VN_BEING_CREATED);
			return (EIB_E_FAILURE);
		}
	}

	/*
	 * Allocate a vnic structure for this instance
	 */
	vnic = kmem_zalloc(sizeof (eib_vnic_t), KM_SLEEP);
	vnic->vn_ss = ss;
	vnic->vn_instance = -1;
	mutex_init(&vnic->vn_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vnic->vn_cv, NULL, CV_DEFAULT, NULL);

	progress |= EIB_VNIC_STRUCT_ALLOCD;

	/*
	 * Get a vnic instance
	 */
	if (eib_vnic_get_instance(ss, &vnic->vn_instance) != EIB_E_SUCCESS) {
		*err = EMFILE;
		goto vnic_create_fail;
	}
	progress |= EIB_VNIC_GOT_INSTANCE;

	/*
	 * Initialize vnic's basic parameters. Note that we set the 15-bit
	 * vnic id to send to gw during a login to be a 2-tuple of
	 * {devi_instance#, eoib_vnic_instance#}.
	 */
	vnic->vn_vlan = vlan;
	if (macaddr) {
		bcopy(macaddr, vnic->vn_macaddr, sizeof (vnic->vn_macaddr));
	}
	vnic->vn_id = (uint16_t)EIB_VNIC_ID(ss->ei_instance, vnic->vn_instance);

	/*
	 * Start up this vnic instance
	 */
	if (eib_vnic_create_common(ss, vnic, err) != EIB_E_SUCCESS)
		goto vnic_create_fail;

	progress |= EIB_VNIC_CREATE_COMMON_DONE;

	/*
	 * Return the created vnic
	 */
	if (vnicp) {
		*vnicp = vnic;
	}

	eib_vnic_modify_exit(ss, EIB_VN_BEING_CREATED);
	return (EIB_E_SUCCESS);

vnic_create_fail:
	/* Undo exactly what was done, per the progress bits */
	eib_rb_vnic_create(ss, vnic, progress);
	eib_vnic_modify_exit(ss, EIB_VN_BEING_CREATED);
	return (EIB_E_FAILURE);
}

/*
 * Tear down a vnic completely (rollback with all progress bits set),
 * serialized against other vnic modifications.
 */
void
eib_vnic_delete(eib_t *ss, eib_vnic_t *vnic)
{
	eib_vnic_modify_enter(ss, EIB_VN_BEING_DELETED);
	eib_rb_vnic_create(ss, vnic, ~0);
	eib_vnic_modify_exit(ss, EIB_VN_BEING_DELETED);
}

/*
 * Block until the vnic's login is acked/nacked by the gateway or until
 * EIB_LOGIN_TIMEOUT_USEC elapses. On failure, *err is set to ETIME
 * (timeout) or ECANCELED (nack or other state change).
 */
/*ARGSUSED*/
int
eib_vnic_wait_for_login_ack(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	clock_t deadline;
	int ret = EIB_E_SUCCESS;

	deadline = ddi_get_lbolt() + drv_usectohz(EIB_LOGIN_TIMEOUT_USEC);

	/*
	 * Wait for login ack/nack or wait time to get over. If we wake up
	 * with a login failure, record the reason.
	 */
	mutex_enter(&vnic->vn_lock);
	while (vnic->vn_state == EIB_LOGIN_ACK_WAIT) {
		if (cv_timedwait(&vnic->vn_cv, &vnic->vn_lock,
		    deadline) == -1) {
			/* timed out: mark it so, which ends the loop */
			if (vnic->vn_state == EIB_LOGIN_ACK_WAIT)
				vnic->vn_state = EIB_LOGIN_TIMED_OUT;
		}
	}

	if (vnic->vn_state != EIB_LOGIN_ACK_RCVD) {
		ret = EIB_E_FAILURE;
		*err = (vnic->vn_state == EIB_LOGIN_TIMED_OUT) ?
		    ETIME : ECANCELED;
	}
	mutex_exit(&vnic->vn_lock);

	return (ret);
}

/*
 * Process a login ack received from the gateway and wake up the
 * waiter in eib_vnic_wait_for_login_ack(). The ack is matched against
 * the single pending vnic (ei_vnic_pending); it is silently dropped if
 * there is no pending vnic, if the vnic id doesn't match, or if the
 * waiter is no longer in EIB_LOGIN_ACK_WAIT. The waiter is NACKed
 * (EIB_LOGIN_NACK_RCVD) on a set syndrome or on any mismatch between
 * what we asked for and what the gateway assigned; otherwise the login
 * data is saved into the vnic and the waiter is ACKed.
 */
void
eib_vnic_login_ack(eib_t *ss, eib_login_data_t *ld)
{
	eib_vnic_t *vnic;
	uint_t vnic_instance;
	uint_t hdrs_sz;
	uint16_t vnic_id;
	int nack = 1;

	/*
	 * The msb in the vnic id in login ack message is not
	 * part of our vNIC id.
	 */
	vnic_id = ld->ld_vnic_id & (~FIP_VL_VNIC_ID_MSBIT);

	/*
	 * Now, we deconstruct the vnic id and determine the vnic
	 * instance number. If this vnic_instance number isn't
	 * valid or the vnic_id of the vnic for this instance
	 * number doesn't match in our records, we quit.
	 */
	vnic_instance = EIB_VNIC_INSTANCE(vnic_id);
	if (vnic_instance >= EIB_MAX_VNICS)
		return;

	/*
	 * At this point, we haven't fully created the vnic, so
	 * this vnic should be present as ei_vnic_pending.
	 */
	mutex_enter(&ss->ei_vnic_lock);
	if ((vnic = ss->ei_vnic_pending) == NULL) {
		mutex_exit(&ss->ei_vnic_lock);
		return;
	} else if (vnic->vn_id != vnic_id) {
		mutex_exit(&ss->ei_vnic_lock);
		return;
	}
	mutex_exit(&ss->ei_vnic_lock);

	/*
	 * First check if the vnic is still sleeping, waiting
	 * for login ack. If not, we might as well quit now.
	 */
	mutex_enter(&vnic->vn_lock);
	if (vnic->vn_state != EIB_LOGIN_ACK_WAIT) {
		mutex_exit(&vnic->vn_lock);
		return;
	}

	/*
	 * We NACK the waiter under these conditions:
	 *
	 * . syndrome was set
	 * . vhub mtu is bigger than our max mtu (minus eoib/eth hdrs sz)
	 * . assigned vlan is different from requested vlan (except
	 *   when we didn't request a specific vlan)
	 * . when the assigned mac is different from the requested mac
	 *   (except when we didn't request a specific mac)
	 * . when the VP bit indicates that vlan tag should be used
	 *   but we had not specified a vlan tag in our request
	 * . when the VP bit indicates that vlan tag should not be
	 *   present and we'd specified a vlan tag in our request
	 *
	 * The last case is interesting: if we had not specified any vlan id
	 * in our request, but the gateway has assigned a vlan and asks us
	 * to use/expect that tag on every packet dealt by this vnic, it
	 * means effectively the EoIB driver has to insert/remove vlan
	 * tagging on this vnic traffic, since the nw layer on Solaris
	 * won't be using/expecting any tag on traffic for this vnic. This
	 * feature is not supported currently.
	 */
	hdrs_sz = EIB_ENCAP_HDR_SZ + sizeof (struct ether_header) + VLAN_TAGSZ;
	if (ld->ld_syndrome) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
		    "non-zero syndrome 0x%lx, NACK", ld->ld_syndrome);

	} else if (ld->ld_vhub_mtu > (ss->ei_props->ep_mtu - hdrs_sz)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
		    "vhub mtu (0x%x) bigger than port mtu (0x%x), NACK",
		    ld->ld_vhub_mtu, ss->ei_props->ep_mtu);

	} else if ((vnic->vn_vlan) && (vnic->vn_vlan != ld->ld_assigned_vlan)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
		    "assigned vlan (0x%x) different from asked (0x%x), "
		    "for vnic id 0x%x, NACK", ld->ld_assigned_vlan,
		    vnic->vn_vlan, vnic->vn_id);

	} else if (bcmp(vnic->vn_macaddr, eib_zero_mac, ETHERADDRL) &&
	    bcmp(vnic->vn_macaddr, ld->ld_assigned_mac, ETHERADDRL)) {
		uint8_t *asked, *got;

		asked = vnic->vn_macaddr;
		got = ld->ld_assigned_mac;

		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
		    "assigned mac (%x:%x:%x:%x:%x:%x) different from "
		    "asked (%x:%x:%x:%x:%x:%x) for vnic id 0x%x, NACK",
		    got[0], got[1], got[2], got[3], got[4], got[5], asked[0],
		    asked[1], asked[2], asked[3], asked[4], asked[5]);

	} else if ((vnic->vn_vlan == 0) && (ld->ld_vlan_in_packets)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
		    "asked for tagless vlan, but VP flag is set "
		    "for vnic id 0x%x, NACK", vnic->vn_id);

	} else if ((vnic->vn_vlan) && (!ld->ld_vlan_in_packets)) {
		/*
		 * Gateway assigned our vlan but didn't set the VP flag;
		 * optionally work around such (buggy) gateways.
		 */
		if (eib_wa_no_good_vp_flag) {
			ld->ld_vlan_in_packets = 1;
			ld->ld_vhub_id = EIB_VHUB_ID(ld->ld_gw_port_id,
			    ld->ld_assigned_vlan);
			nack = 0;
		} else {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_login_ack: "
			    "vlan was assigned correctly, but VP flag is not "
			    "set for vnic id 0x%x, NACK", vnic->vn_id);
		}
	} else {
		ld->ld_vhub_id = EIB_VHUB_ID(ld->ld_gw_port_id,
		    ld->ld_assigned_vlan);
		nack = 0;
	}

	/*
	 * ACK/NACK the waiter
	 */
	if (nack) {
		vnic->vn_state = EIB_LOGIN_NACK_RCVD;
	} else {
		bcopy(ld, &vnic->vn_login_data, sizeof (eib_login_data_t));
		vnic->vn_state = EIB_LOGIN_ACK_RCVD;
	}

	cv_signal(&vnic->vn_cv);
	mutex_exit(&vnic->vn_lock);
}

/*
 * Block until the vnic's vhub table has been fully constructed (or the
 * construction fails/times out). On failure, *err is set to ETIME or
 * ECANCELED.
 */
int
eib_vnic_wait_for_table(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	clock_t deadline;
	int ret = EIB_E_SUCCESS;

	/*
	 * The EoIB spec does not detail exactly within what time a vhub table
	 * request is expected to be answered. However, it does mention that
	 * in the worst case, the vhub update messages from the gateway must
	 * be seen atleast once in 2.5 * GW_KA_PERIOD (already saved in
	 * pp_gw_ka_ticks), so we'll settle for that limit.
	 */
	deadline = ddi_get_lbolt() + ss->ei_gw_props->pp_gw_ka_ticks;

	/*
	 * Wait for vhub table to be constructed. If we wake up with a
	 * vhub table construction failure, record the reason.
	 */
	mutex_enter(&vnic->vn_lock);
	while (vnic->vn_state == EIB_LOGIN_TBL_WAIT) {
		if (cv_timedwait(&vnic->vn_cv, &vnic->vn_lock,
		    deadline) == -1) {
			if (vnic->vn_state == EIB_LOGIN_TBL_WAIT)
				vnic->vn_state = EIB_LOGIN_TIMED_OUT;
		}
	}

	if (vnic->vn_state != EIB_LOGIN_TBL_DONE) {
		ret = EIB_E_FAILURE;
		*err = (vnic->vn_state == EIB_LOGIN_TIMED_OUT) ?
		    ETIME : ECANCELED;
	}
	mutex_exit(&vnic->vn_lock);

	return (ret);
}

/*
 * Mark vhub table construction for the vnic as finished (successfully
 * or not) and wake the waiter in eib_vnic_wait_for_table().
 */
void
eib_vnic_vhub_table_done(eib_vnic_t *vnic, uint_t result_state)
{
	ASSERT(result_state == EIB_LOGIN_TBL_DONE ||
	    result_state == EIB_LOGIN_TBL_FAILED);

	/*
	 * Construction of vhub table for the vnic is done one way or
	 * the other. Set the login wait state appropriately and signal
	 * the waiter. If it's a vhub table failure, we shouldn't parse
	 * any more vhub table or vhub update packets until the vnic state
	 * is changed.
	 */
	mutex_enter(&vnic->vn_lock);
	vnic->vn_state = result_state;
	cv_signal(&vnic->vn_cv);
	mutex_exit(&vnic->vn_lock);
}

/*
 * Join (and attach the vnic's data channel to) the vhub data multicast
 * group for the given multicast mac, adding it to the channel's
 * ch_vhub_data list. If the mcg is already on the list, the duplicate
 * join is resolved: with `rejoin' set the old membership is dropped in
 * favor of the new one, otherwise the new join is dropped. On failure,
 * *err is set to ENOMEM or EINVAL.
 */
int
eib_vnic_join_data_mcg(eib_t *ss, eib_vnic_t *vnic, uint8_t *mcast_mac,
    boolean_t rejoin, int *err)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_mcg_t *mcg;
	eib_mcg_t *elem;
	eib_mcg_t *tail;
	ibt_mcg_info_t *mcg_info;
	ibt_mcg_attr_t mcg_attr;
	ibt_status_t ret;

	/*
	 * Compose the multicast MGID to join
	 */
	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));

	eib_vnic_make_vhub_mgid(ld->ld_gw_mgid_prefix,
	    (uint8_t)EIB_MGID_VHUB_DATA, mcast_mac, ld->ld_n_mac_mcgid, 0,
	    ld->ld_vhub_id, &(mcg_attr.mc_mgid));
	mcg_attr.mc_pkey = (ib_pkey_t)ld->ld_vhub_pkey;
	mcg_attr.mc_qkey = (ib_qkey_t)EIB_DATA_QKEY;

	/*
	 * Allocate for and prepare the mcg to add to our list
	 */
	mcg_info = kmem_zalloc(sizeof (ibt_mcg_info_t), KM_NOSLEEP);
	if (mcg_info == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_join_data_mcg: "
		    "no memory, failed to join mcg (mac=%x:%x:%x:%x:%x:%x)",
		    mcast_mac[0], mcast_mac[1], mcast_mac[2],
		    mcast_mac[3], mcast_mac[4], mcast_mac[5]);

		*err = ENOMEM;
		goto vnic_join_data_mcg_fail;
	}
	mcg = kmem_zalloc(sizeof (eib_mcg_t), KM_NOSLEEP);
	if (mcg == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_join_data_mcg: "
		    "no memory, failed to join mcg (mac=%x:%x:%x:%x:%x:%x)",
		    mcast_mac[0], mcast_mac[1], mcast_mac[2],
		    mcast_mac[3], mcast_mac[4], mcast_mac[5]);

		*err = ENOMEM;
		goto vnic_join_data_mcg_fail;
	}
	mcg->mg_next = NULL;
	mcg->mg_rgid = ss->ei_props->ep_sgid;
	mcg->mg_mgid = mcg_attr.mc_mgid;
	mcg->mg_join_state = IB_MC_JSTATE_FULL;
	mcg->mg_mcginfo = mcg_info;
	bcopy(mcast_mac, mcg->mg_mac, ETHERADDRL);

	/*
	 * Join the multicast group
	 *
	 * Should we query for the mcg and join instead of attempting to
	 * join directly ?
	 */
	mcg_attr.mc_join_state = mcg->mg_join_state;
	mcg_attr.mc_flow = 0;
	mcg_attr.mc_tclass = 0;
	mcg_attr.mc_sl = 0;
	mcg_attr.mc_scope = 0;	/* IB_MC_SCOPE_SUBNET_LOCAL perhaps ? */

	ret = ibt_join_mcg(mcg->mg_rgid, &mcg_attr, mcg_info, NULL, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_join_data_mcg: "
		    "ibt_join_mcg(mgid=%llx.%llx, pkey=0x%x, qkey=0x%lx, "
		    "jstate=0x%x) failed, ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey,
		    mcg_attr.mc_qkey, mcg_attr.mc_join_state, ret);

		*err = EINVAL;
		goto vnic_join_data_mcg_fail;
	}

	/*
	 * Attach to the group to receive multicast messages
	 */
	ret = ibt_attach_mcg(chan->ch_chan, mcg_info);
	if (ret != IBT_SUCCESS) {
		*err = EINVAL;

		/*
		 * NOTE(review): ibt_leave_mcg() returns an ibt_status_t,
		 * but it is compared against EIB_E_SUCCESS here — confirm
		 * the two success values coincide, else this warning may
		 * fire (or not) incorrectly.
		 */
		ret = ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		if (ret != EIB_E_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_join_data_mcg: "
			    "ibt_leave_mcg(mgid=%llx.%llx, jstate=0x%x) "
			    "failed, ret=%d", mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, mcg->mg_join_state, ret);
		}

		goto vnic_join_data_mcg_fail;
	}

	mutex_enter(&chan->ch_vhub_lock);

	/* Scan the data mcg list for a duplicate, remembering the tail */
	tail = NULL;
	for (elem = chan->ch_vhub_data; elem != NULL; elem = elem->mg_next) {
		if ((elem->mg_mgid.gid_prefix == mcg_attr.mc_mgid.gid_prefix) &&
		    (elem->mg_mgid.gid_guid == mcg_attr.mc_mgid.gid_guid)) {
			break;
		}
		tail = elem;
	}

	/*
	 * If we hadn't already joined to this mcg, add the newly joined mcg
	 * to the tail and return success
	 */
	if (elem == NULL) {
		if (tail)
			tail->mg_next = mcg;
		else
			chan->ch_vhub_data = mcg;
		mutex_exit(&chan->ch_vhub_lock);
		return (EIB_E_SUCCESS);
	}

	/*
	 * Duplicate. We need to leave one of the two joins. If "rejoin"
	 * was requested, leave the old join, otherwise leave the new join.
	 *
	 * Note that we must not detach the qp from the mcg, since if this
	 * was a dup, a second ibt_attach_mcg() above would've simply been
	 * a nop.
	 *
	 * Note also that the leave may not be successful here if our presence
	 * has been removed by the SM, but we need to do this to prevent leaks
	 * in ibtf.
	 */
	if (rejoin) {
		ASSERT(elem->mg_mcginfo != NULL);
		kmem_free(elem->mg_mcginfo, sizeof (ibt_mcg_info_t));
		(void) ibt_leave_mcg(elem->mg_rgid, elem->mg_mgid,
		    eib_reserved_gid, elem->mg_join_state);
		/*
		 * Copy the new mcg over the old one (including the new
		 * mg_mcginfo), but preserve the link to the next element
		 * on the list
		 */
		mcg->mg_next = elem->mg_next;
		bcopy(mcg, elem, sizeof (eib_mcg_t));
	} else {
		ASSERT(mcg->mg_mcginfo != NULL);
		kmem_free(mcg->mg_mcginfo, sizeof (ibt_mcg_info_t));
		(void) ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
	}
	mutex_exit(&chan->ch_vhub_lock);

	kmem_free(mcg, sizeof (eib_mcg_t));
	return (EIB_E_SUCCESS);

vnic_join_data_mcg_fail:
	if (mcg) {
		kmem_free(mcg, sizeof (eib_mcg_t));
	}
	if (mcg_info) {
		kmem_free(mcg_info, sizeof (ibt_mcg_info_t));
	}
	return (EIB_E_FAILURE);
}

/*
 * Resolve the destination {dmac, vlan} of an outgoing packet and
 * program the swqe's UD destination accordingly: a unicast hit uses a
 * (cached) address vector for the destination's lid/sl, a multicast or
 * broadcast hit uses the address vector from the mcg info. Updates the
 * broadcast/multicast xmit counters for the latter.
 */
int
eib_vnic_setup_dest(eib_vnic_t *vnic, eib_wqe_t *swqe, uint8_t *dmac,
    uint16_t vlan)
{
	eib_t *ss = vnic->vn_ss;
	eib_stats_t *stats = ss->ei_stats;
	eib_avect_t *av;
	eib_vhub_map_t ucast;
	ibt_mcg_info_t mcast;
	ibt_status_t ret;
	int dtype;
	int rv;

	/*
	 * Lookup the destination in the vhub table or in our mcg list
	 */
	rv = eib_vnic_lookup_dest(vnic, dmac, vlan, &ucast, &mcast, &dtype);
	if (rv != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_setup_dest: "
		    "eib_vnic_lookup_dest(dmac=%x:%x:%x:%x:%x:%x, vlan=0x%x) "
		    "failed", dmac[0], dmac[1], dmac[2], dmac[3], dmac[4],
		    dmac[5], vlan);

		return (EIB_E_FAILURE);
	}

	/*
	 * If we found a unicast address, get an address vector for the lid
	 * and sl, modify the ud dest based on the address vector and return.
	 * If we found a multicast address, use the address vector in the
	 * mcg info to modify the ud dest and return.
	 */
	if (dtype == EIB_TX_UNICAST) {
		if ((av = eib_ibt_hold_avect(ss, ucast.mp_lid,
		    ucast.mp_sl)) == NULL) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_setup_dest: "
			    "eib_ibt_hold_avect(lid=0x%x, sl=0x%x) failed",
			    ucast.mp_lid, ucast.mp_sl);

			return (EIB_E_FAILURE);
		}
		ret = ibt_modify_ud_dest(swqe->qe_dest, EIB_DATA_QKEY,
		    ucast.mp_qpn, &av->av_vect);

		eib_ibt_release_avect(ss, av);

		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_setup_dest: "
			    "ibt_modify_ud_dest(qpn=0x%lx, qkey=0x%lx) "
			    "failed, ret=%d", ucast.mp_qpn, EIB_DATA_QKEY, ret);
			return (EIB_E_FAILURE);
		}
	} else {
		ret = ibt_modify_ud_dest(swqe->qe_dest, EIB_DATA_QKEY,
		    IB_MC_QPN, &(mcast.mc_adds_vect));

		if (dtype == EIB_TX_BROADCAST)
			EIB_INCR_COUNTER(&stats->st_brdcstxmit);
		else
			EIB_INCR_COUNTER(&stats->st_multixmit);

		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_setup_dest: "
			    "ibt_modify_ud_dest(mc_qpn=0x%lx, qkey=0x%lx) "
			    "failed, ret=%d", IB_MC_QPN, EIB_DATA_QKEY, ret);
			return (EIB_E_FAILURE);
		}
	}

	return (EIB_E_SUCCESS);
}

/*
 * Leave the vhub data mcg corresponding to mcast_mac (thin wrapper
 * around the rollback of eib_vnic_join_data_mcg()).
 */
void
eib_vnic_leave_data_mcg(eib_t *ss, eib_vnic_t *vnic, uint8_t *mcast_mac)
{
	eib_rb_vnic_join_data_mcg(ss, vnic, mcast_mac);
}

/*
 * Allocate and install (under vn_lock) fresh vhub table and vhub
 * update structures for the vnic. The eport state starts out as UP.
 */
/*ARGSUSED*/
void
eib_vnic_init_tables(eib_t *ss, eib_vnic_t *vnic)
{
	eib_vhub_table_t *tbl;
	eib_vhub_update_t *upd;

	tbl = kmem_zalloc(sizeof (eib_vhub_table_t), KM_SLEEP);
	mutex_init(&tbl->tb_lock, NULL, MUTEX_DRIVER, NULL);
	tbl->tb_eport_state = FIP_EPORT_UP;

	upd = kmem_zalloc(sizeof (eib_vhub_update_t), KM_SLEEP);
	mutex_init(&upd->up_lock, NULL, MUTEX_DRIVER, NULL);

	mutex_enter(&vnic->vn_lock);
	vnic->vn_vhub_table = tbl;
	vnic->vn_vhub_update = upd;
	mutex_exit(&vnic->vn_lock);
}

/*
 * Empty out (and, if `clobber' is set, also destroy and detach from
 * the vnic) the vnic's vhub table and vhub update structures. With
 * clobber clear, the structures themselves survive for reuse.
 */
/*ARGSUSED*/
void
eib_vnic_fini_tables(eib_t *ss, eib_vnic_t *vnic, boolean_t clobber)
{
	eib_vhub_update_t *upd;
	eib_vhub_table_t *tbl;
	eib_vhub_map_t *elem;
	eib_vhub_map_t *nxt;
	int i;

	/*
	 * We come here only when we've either completely detached from
	 * the vhub multicast groups and so cannot receive anymore table
	 * or update control messages, or we've had a recent vhub table
	 * construction failure and the vnic state is currently
	 * EIB_LOGIN_TBL_FAILED and so won't parse any table or update
	 * control messages. Also, since we haven't completed the vnic
	 * creation, no one from the tx path will be accessing the
	 * vn_vhub_table entries either. All said, we're free to play
	 * around with the vnic's vn_vhub_table and vn_vhub_update here.
	 */

	mutex_enter(&vnic->vn_lock);
	upd = vnic->vn_vhub_update;
	tbl = vnic->vn_vhub_table;
	if (clobber) {
		vnic->vn_vhub_update = NULL;
		vnic->vn_vhub_table = NULL;
	}
	mutex_exit(&vnic->vn_lock);

	/*
	 * Destroy the vhub update entries if any
	 */
	if (upd) {
		/*
		 * Wipe clean the list of vnic entries accumulated via
		 * vhub updates so far. Release eib_vhub_update_t only
		 * if explicitly asked to do so
		 */
		mutex_enter(&upd->up_lock);
		for (elem = upd->up_vnic_entry; elem != NULL; elem = nxt) {
			nxt = elem->mp_next;
			kmem_free(elem, sizeof (eib_vhub_map_t));
		}
		upd->up_vnic_entry = NULL;
		upd->up_tusn = 0;
		upd->up_eport_state = 0;
		mutex_exit(&upd->up_lock);

		if (clobber) {
			mutex_destroy(&upd->up_lock);
			kmem_free(upd, sizeof (eib_vhub_update_t));
		}
	}

	/*
	 * Destroy the vhub table entries
	 */
	if (tbl == NULL)
		return;

	/*
	 * Wipe clean the list of entries in the vhub table collected so
	 * far. Release eib_vhub_table_t only if explicitly asked to do so.
	 */
	mutex_enter(&tbl->tb_lock);

	if (tbl->tb_gateway) {
		kmem_free(tbl->tb_gateway, sizeof (eib_vhub_map_t));
		tbl->tb_gateway = NULL;
	}

	if (tbl->tb_unicast_miss) {
		kmem_free(tbl->tb_unicast_miss, sizeof (eib_vhub_map_t));
		tbl->tb_unicast_miss = NULL;
	}

	if (tbl->tb_vhub_multicast) {
		kmem_free(tbl->tb_vhub_multicast, sizeof (eib_vhub_map_t));
		tbl->tb_vhub_multicast = NULL;
	}

	if (!eib_wa_no_mcast_entries) {
		for (i = 0; i < EIB_TB_NBUCKETS; i++) {
			for (elem = tbl->tb_mcast_entry[i]; elem != NULL;
			    elem = nxt) {
				nxt = elem->mp_next;
				kmem_free(elem, sizeof (eib_vhub_map_t));
			}
			tbl->tb_mcast_entry[i] = NULL;
		}
	}

	for (i = 0; i < EIB_TB_NBUCKETS; i++) {
		for (elem = tbl->tb_vnic_entry[i]; elem != NULL; elem = nxt) {
			nxt = elem->mp_next;
			kmem_free(elem, sizeof (eib_vhub_map_t));
		}
		tbl->tb_vnic_entry[i] = NULL;
	}

	tbl->tb_tusn = 0;
	tbl->tb_eport_state = 0;
	tbl->tb_entries_seen = 0;
	tbl->tb_entries_in_table = 0;
	tbl->tb_checksum = 0;

	mutex_exit(&tbl->tb_lock);

	/*
	 * Don't throw away space created for holding vhub table if we haven't
	 * been explicitly asked to do so
	 */
	if (clobber) {
		mutex_destroy(&tbl->tb_lock);
		kmem_free(tbl, sizeof (eib_vhub_table_t));
	}
}

/*
 * Return the data channel of the vnic at instance `vinst', or NULL if
 * the instance is out of range or no vnic exists there.
 */
eib_chan_t *
eib_vnic_get_data_chan(eib_t *ss, int vinst)
{
	eib_vnic_t *vnic;
	eib_chan_t *chan = NULL;

	if (vinst >= 0 && vinst < EIB_MAX_VNICS) {
		mutex_enter(&ss->ei_vnic_lock);
		if ((vnic = ss->ei_vnic[vinst]) != NULL)
			chan = vnic->vn_data_chan;
		mutex_exit(&ss->ei_vnic_lock);
	}

	return (chan);
}

/*
 * Called from the tx path when a packet is addressed to a {mac, vlan}
 * for which no vnic exists yet: count the dropped packet and file a
 * creation request with the vnic creator thread (best-effort; dropped
 * with a warning if the KM_NOSLEEP allocation fails).
 */
void
eib_vnic_need_new(eib_t *ss, uint8_t *mac, uint16_t vlan)
{
	eib_vnic_req_t *vrq;

	EIB_INCR_COUNTER(&ss->ei_stats->st_noxmitbuf);

	/*
	 * Create a new vnic request for this {mac,vlan} tuple
	 */
	vrq = kmem_zalloc(sizeof (eib_vnic_req_t), KM_NOSLEEP);
	if (vrq == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_need_new: "
		    "no memory, failed to queue new vnic creation request");
		return;
	}
	vrq->vr_next = NULL;
	vrq->vr_req = EIB_CR_REQ_NEW_VNIC;
	bcopy(mac, vrq->vr_mac, ETHERADDRL);
	vrq->vr_vlan = vlan;

	eib_vnic_enqueue_req(ss, vrq);
}

/*
 * Enqueue a request with the vnic creator thread and signal it.
 * DIE/FLUSH requests go to the head of the queue (DIE has highest
 * priority: once queued, all further requests are discarded). A
 * NEW_VNIC request is dropped as a duplicate if a creation request
 * for the same {mac, vlan} is either currently being processed
 * (ei_pending_vnic_req) or already waiting in the queue; otherwise it
 * is appended at the tail. Takes ownership of vrq: it is either queued
 * or freed here.
 */
void
eib_vnic_enqueue_req(eib_t *ss, eib_vnic_req_t *vrq)
{
	eib_vnic_req_t *elem = NULL;
	uint8_t *m;

	/*
	 * Enqueue this new vnic request with the vnic creator and
	 * signal it.
	 */
	m = vrq->vr_mac;
	EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_vnic_enqueue_req: "
	    "BEGIN file request for creation of %x:%x:%x:%x:%x:%x, 0x%x",
	    m[0], m[1], m[2], m[3], m[4], m[5], vrq->vr_vlan);


	mutex_enter(&ss->ei_vnic_req_lock);

	/*
	 * Death request has the highest priority. If we've already been asked
	 * to die, we don't entertain any more requests.
	 */
	if (ss->ei_vnic_req) {
		if (ss->ei_vnic_req->vr_req == EIB_CR_REQ_DIE) {
			mutex_exit(&ss->ei_vnic_req_lock);
			kmem_free(vrq, sizeof (eib_vnic_req_t));
			return;
		}
	}

	if (vrq->vr_req == EIB_CR_REQ_DIE || vrq->vr_req == EIB_CR_REQ_FLUSH) {
		/* DIE/FLUSH jump to the head of the queue */
		vrq->vr_next = ss->ei_vnic_req;
		ss->ei_vnic_req = vrq;
	} else {
		/*
		 * If there's already a creation request for this vnic that's
		 * being processed, return immediately without adding a new
		 * request.
		 */
		if ((elem = ss->ei_pending_vnic_req) != NULL) {
			EIB_DPRINTF_DEBUG(ss->ei_instance,
			    "eib_vnic_enqueue_req: "
			    "ei_pending_vnic_req not NULL");

			if ((elem->vr_vlan == vrq->vr_vlan) &&
			    (bcmp(elem->vr_mac, vrq->vr_mac,
			    ETHERADDRL) == 0)) {
				EIB_DPRINTF_DEBUG(ss->ei_instance,
				    "eib_vnic_enqueue_req: "
				    "pending request already present for "
				    "%x:%x:%x:%x:%x:%x, 0x%x", m[0], m[1], m[2],
				    m[3], m[4], m[5], vrq->vr_vlan);

				mutex_exit(&ss->ei_vnic_req_lock);
				kmem_free(vrq, sizeof (eib_vnic_req_t));

				EIB_DPRINTF_DEBUG(ss->ei_instance,
				    "eib_vnic_enqueue_req: "
				    "END file request");
				return;
			}

			EIB_DPRINTF_DEBUG(ss->ei_instance,
			    "eib_vnic_enqueue_req: "
			    "NO pending request for %x:%x:%x:%x:%x:%x, 0x%x",
			    m[0], m[1], m[2], m[3], m[4], m[5], vrq->vr_vlan);
		}

		/*
		 * Or if there's one waiting in the queue for processing, do
		 * the same thing
		 */
		for (elem = ss->ei_vnic_req; elem; elem = elem->vr_next) {
			/*
			 * If there's already a create request for this vnic
			 * waiting in the queue, return immediately
			 */
			if (elem->vr_req == EIB_CR_REQ_NEW_VNIC) {
				if ((elem->vr_vlan == vrq->vr_vlan) &&
				    (bcmp(elem->vr_mac, vrq->vr_mac,
				    ETHERADDRL) == 0)) {

					EIB_DPRINTF_DEBUG(ss->ei_instance,
					    "eib_vnic_enqueue_req: "
					    "request already present for "
					    "%x:%x:%x:%x:%x:%x, 0x%x", m[0],
					    m[1], m[2], m[3], m[4], m[5],
					    vrq->vr_vlan);

					mutex_exit(&ss->ei_vnic_req_lock);
					kmem_free(vrq, sizeof (eib_vnic_req_t));

					EIB_DPRINTF_DEBUG(ss->ei_instance,
					    "eib_vnic_enqueue_req: "
					    "END file request");
					return;
				}
			}

			/* loop exits with elem pointing at the tail */
			if (elem->vr_next == NULL) {
				EIB_DPRINTF_DEBUG(ss->ei_instance,
				    "eib_vnic_enqueue_req: "
				    "request not found, filing afresh");
				break;
			}
		}

		/*
		 * Otherwise queue up this new creation request and signal the
		 * service thread.
		 */
		if (elem) {
			elem->vr_next = vrq;
		} else {
			ss->ei_vnic_req = vrq;
		}
	}

	cv_signal(&ss->ei_vnic_req_cv);
	mutex_exit(&ss->ei_vnic_req_lock);

	EIB_DPRINTF_DEBUG(ss->ei_instance,
	    "eib_vnic_enqueue_req: END file request");
}

/*
 * After a vnic restart changed its assigned mac: remove the new
 * {mac, vlan} from the "failed vnic req" list if present (it is no
 * longer failed), and add the old {mac, vlan} to that list so we don't
 * try to recreate the vnic we just explicitly discarded. Best-effort:
 * if the KM_NOSLEEP allocation fails, the old address simply isn't
 * recorded.
 */
void
eib_vnic_update_failed_macs(eib_t *ss, uint8_t *old_mac, uint16_t old_vlan,
    uint8_t *new_mac, uint16_t new_vlan)
{
	eib_vnic_req_t *vrq;
	eib_vnic_req_t *elem;
	eib_vnic_req_t *prev;

	vrq = kmem_zalloc(sizeof (eib_vnic_req_t), KM_NOSLEEP);
	if (vrq == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_update_failed_macs: "
		    "no memory, failed to drop old mac");
	} else {
		vrq->vr_next = NULL;
		vrq->vr_req = 0;	/* unused */
		bcopy(old_mac, vrq->vr_mac, ETHERADDRL);
		vrq->vr_vlan = old_vlan;
	}

	mutex_enter(&ss->ei_vnic_req_lock);

	/*
	 * We'll search the failed vnics list to see if the new {mac,vlan}
	 * tuple is in there and remove it if present (since the new address
	 * is no longer "failed").
	 */
	prev = NULL;
	for (elem = ss->ei_failed_vnic_req; elem; elem = elem->vr_next) {
		if ((bcmp(elem->vr_mac, new_mac, ETHERADDRL) == 0) &&
		    (elem->vr_vlan == new_vlan)) {
			if (prev) {
				prev->vr_next = elem->vr_next;
			} else {
				ss->ei_failed_vnic_req = elem->vr_next;
			}
			elem->vr_next = NULL;
			break;
		}
	}
	if (elem) {
		kmem_free(elem, sizeof (eib_vnic_req_t));
	}

	/*
	 * We'll also insert the old {mac,vlan} tuple to the "failed vnic req"
	 * list (it shouldn't be there already), to avoid trying to recreate
	 * the vnic we just explicitly discarded.
	 */
	if (vrq) {
		vrq->vr_next = ss->ei_failed_vnic_req;
		ss->ei_failed_vnic_req = vrq;
	}

	mutex_exit(&ss->ei_vnic_req_lock);
}

/*
 * Restart every vnic marked as a zombie (ei_zombie_vnics bitmap),
 * reusing each vnic's original id and instance. If vnic instance 0's
 * mac changes across the restart, the new mac is returned through
 * vn0_mac (see eib_vnic_restart()).
 */
void
eib_vnic_resurrect_zombies(eib_t *ss, uint8_t *vn0_mac)
{
	int inst;

	/*
	 * We want to restart/relogin each vnic instance with the gateway,
	 * but with the same vnic id and instance as before.
+ */ + while ((inst = EIB_FIND_LSB_SET(ss->ei_zombie_vnics)) != -1) { + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_vnic_resurrect_zombies: " + "calling eib_vnic_restart(vn_inst=%d)", inst); + + eib_vnic_restart(ss, inst, vn0_mac); + + EIB_DPRINTF_DEBUG(ss->ei_instance, + "eib_vnic_resurrect_zombies: " + "eib_vnic_restart(vn_inst=%d) done", inst); + } +} + +void +eib_vnic_restart(eib_t *ss, int inst, uint8_t *vn0_mac) +{ + eib_vnic_t *vnic; + eib_login_data_t *ld; + uint8_t old_mac[ETHERADDRL]; + int ret; + int err; + + if (inst < 0 || inst >= EIB_MAX_VNICS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_restart: " + "vnic instance (%d) invalid", inst); + return; + } + + eib_vnic_modify_enter(ss, EIB_VN_BEING_MODIFIED); + if ((vnic = ss->ei_vnic[inst]) != NULL) { + /* + * Remember what mac was allocated for this vnic last time + */ + bcopy(vnic->vn_login_data.ld_assigned_mac, old_mac, ETHERADDRL); + + /* + * Tear down and restart this vnic instance + */ + eib_rb_vnic_create_common(ss, vnic, ~0); + ret = eib_vnic_create_common(ss, vnic, &err); + if (ret != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_restart: " + "eib_vnic_create_common(vnic_inst=%d) failed, " + "ret=%d", inst, err); + } + + /* + * If this is vnic instance 0 and if our current assigned mac is + * different from what was assigned last time, we need to pass + * this information back to the caller, so the mac layer can be + * appropriately informed. We will also queue up the old mac + * and vlan in the "failed vnic req" list, so any future packets + * to this address on this interface will be dropped. 
+ */ + ld = &vnic->vn_login_data; + if ((inst == 0) && + (bcmp(ld->ld_assigned_mac, old_mac, ETHERADDRL) != 0)) { + uint8_t *m = ld->ld_assigned_mac; + + if (vn0_mac != NULL) { + bcopy(ld->ld_assigned_mac, vn0_mac, + ETHERADDRL); + } + + EIB_DPRINTF_VERBOSE(ss->ei_instance, + "eib_vnic_restart: updating failed macs list " + "old=%x:%x:%x:%x:%x:%x, new=%x:%x:%x:%x:%x:%x, " + "vlan=0x%x", old_mac[0], old_mac[1], old_mac[2], + old_mac[3], old_mac[4], old_mac[5], m[0], m[1], + m[2], m[3], m[4], m[5], vnic->vn_vlan); + + eib_vnic_update_failed_macs(ss, old_mac, vnic->vn_vlan, + ld->ld_assigned_mac, vnic->vn_vlan); + } + + /* + * No longer a zombie or need to rejoin mcgs + */ + mutex_enter(&ss->ei_vnic_lock); + ss->ei_zombie_vnics &= (~((uint64_t)1 << inst)); + ss->ei_rejoin_vnics &= (~((uint64_t)1 << inst)); + mutex_exit(&ss->ei_vnic_lock); + } + eib_vnic_modify_exit(ss, EIB_VN_BEING_MODIFIED); +} + +void +eib_vnic_rejoin_mcgs(eib_t *ss) +{ + eib_vnic_t *vnic; + int inst; + + /* + * For each vnic that still requires re-join, go through the + * control channels and data channel and reattach/rejoin mcgs. 
+ */ + mutex_enter(&ss->ei_vnic_lock); + while ((inst = EIB_FIND_LSB_SET(ss->ei_rejoin_vnics)) != -1) { + if ((vnic = ss->ei_vnic[inst]) != NULL) { + eib_vnic_reattach_ctl_mcgs(ss, vnic); + eib_vnic_rejoin_data_mcgs(ss, vnic); + } + ss->ei_rejoin_vnics &= (~((uint64_t)1 << inst)); + } + mutex_exit(&ss->ei_vnic_lock); +} + +void +eib_rb_vnic_create(eib_t *ss, eib_vnic_t *vnic, uint_t progress) +{ + if (progress & EIB_VNIC_CREATE_COMMON_DONE) { + eib_rb_vnic_create_common(ss, vnic, ~0); + } + + if (progress & EIB_VNIC_GOT_INSTANCE) { + eib_vnic_ret_instance(ss, vnic->vn_instance); + vnic->vn_instance = -1; + } + + if (progress & EIB_VNIC_STRUCT_ALLOCD) { + cv_destroy(&vnic->vn_cv); + mutex_destroy(&vnic->vn_lock); + kmem_free(vnic, sizeof (eib_vnic_t)); + } +} + +/* + * Currently, we only allow 64 vnics per eoib device instance, for + * reasons described in eib.h (see EIB_VNIC_ID() definition), so we + * could use a simple bitmap to assign the vnic instance numbers. + * Once we start allowing more vnics per device instance, this + * allocation scheme will need to be changed. + */ +static int +eib_vnic_get_instance(eib_t *ss, int *vinst) +{ + int bitpos; + uint64_t nval; + + mutex_enter(&ss->ei_vnic_lock); + + /* + * What we have is the active vnics list -- the in-use vnics are + * indicated by a 1 in the bit position, and the free ones are + * indicated by 0. We need to find the least significant '0' bit + * to get the first free vnic instance. Or we could bit-reverse + * the active list and locate the least significant '1'. + */ + nval = ~(ss->ei_active_vnics); + if (nval == 0) + return (EIB_E_FAILURE); + + /* + * The single bit-position values in a 64-bit integer are relatively + * prime with 67, so performing a modulus division with 67 guarantees + * a unique number between 0 and 63 for each value (setbit_mod67[]). 
+ */ + bitpos = EIB_FIND_LSB_SET(nval); + if (bitpos == -1) + return (EIB_E_FAILURE); + + ss->ei_active_vnics |= ((uint64_t)1 << bitpos); + *vinst = bitpos; + + mutex_exit(&ss->ei_vnic_lock); + + return (EIB_E_SUCCESS); +} + +static void +eib_vnic_ret_instance(eib_t *ss, int vinst) +{ + mutex_enter(&ss->ei_vnic_lock); + + if (vinst >= EIB_MAX_VNICS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_ret_instance: " + "vnic instance (%d) invalid", vinst); + } else if ((ss->ei_active_vnics & ((uint64_t)1 << vinst)) == 0) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_ret_instance: " + "vnic instance (%d) not active!", vinst); + } else { + ss->ei_active_vnics &= (~((uint64_t)1 << vinst)); + } + + mutex_exit(&ss->ei_vnic_lock); +} + +static void +eib_vnic_modify_enter(eib_t *ss, uint_t op) +{ + mutex_enter(&ss->ei_vnic_lock); + while (ss->ei_vnic_state & EIB_VN_BEING_MODIFIED) + cv_wait(&ss->ei_vnic_cv, &ss->ei_vnic_lock); + + ss->ei_vnic_state |= op; + mutex_exit(&ss->ei_vnic_lock); +} + +static void +eib_vnic_modify_exit(eib_t *ss, uint_t op) +{ + mutex_enter(&ss->ei_vnic_lock); + ss->ei_vnic_state &= (~op); + cv_broadcast(&ss->ei_vnic_cv); + mutex_exit(&ss->ei_vnic_lock); +} + +static int +eib_vnic_create_common(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + uint_t progress = 0; + + /* + * When we receive login acks within this vnic creation + * routine we need a way to retrieve the vnic structure + * from the vnic instance, so store this somewhere. Note + * that there can be only one outstanding vnic creation + * at any point of time, so we only need one vnic struct. 
+ */ + mutex_enter(&ss->ei_vnic_lock); + ASSERT(ss->ei_vnic_pending == NULL); + ss->ei_vnic_pending = vnic; + mutex_exit(&ss->ei_vnic_lock); + + /* + * Create a control qp for this vnic + */ + if (eib_ctl_create_qp(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_ctl_create_qp(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_CTLQP_CREATED; + + /* + * Create a data qp for this vnic + */ + if (eib_data_create_qp(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_data_create_qp(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_DATAQP_CREATED; + + /* + * Login to the gateway with this vnic's parameters + */ + if (eib_fip_login(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_fip_login(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_LOGIN_DONE; + + /* + * Associate the control and data qps for the vnic with the + * vHUB partition + */ + if (eib_vnic_set_partition(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_vnic_set_partition(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_PARTITION_SET; + + /* + * Post initial set of rx buffers on the control qp to the HCA + */ + if (eib_chan_post_rx(ss, vnic->vn_ctl_chan, NULL) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_chan_post_rx(vn_id=0x%x, CTL_QP) failed, ret=%d", + vnic->vn_id, *err); + + *err = ENOMEM; + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_RX_POSTED_TO_CTLQP; + + /* + * Post initial set of rx buffers on the data qp to the HCA + */ + if (eib_chan_post_rx(ss, vnic->vn_data_chan, NULL) != 
EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_chan_post_rx(vn_id=0x%x, DATA_QP) failed, ret=%d", + vnic->vn_id, *err); + + *err = ENOMEM; + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_RX_POSTED_TO_DATAQP; + + /* + * Attach to the vHUB table and vHUB update multicast groups + */ + if (eib_vnic_attach_ctl_mcgs(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_vnic_attach_ctl_mcgs(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_ATTACHED_TO_CTL_MCGS; + + /* + * Send the vHUB table request and construct the vhub table + */ + if (eib_fip_vhub_table(ss, vnic, err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_fip_vhub_table(vn_id=0x%x) failed, ret=%d", + vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_GOT_VHUB_TABLE; + + /* + * Detach from the vHUB table mcg (we no longer need the vHUB + * table messages) and start the keepalives for this vnic. + */ + eib_vnic_start_keepalives(ss, vnic); + eib_rb_vnic_attach_vhub_table(ss, vnic); + + progress |= EIB_VNIC_KEEPALIVES_STARTED; + + /* + * All ethernet vnics are automatically members of the broadcast + * group for the vlan they are participating in, so join the + * ethernet broadcast group. Note that when we restart vnics, + * we rejoin the mcgs, so we pass B_TRUE to eib_vnic_join_data_mcg(). 
+ */ + if (eib_vnic_join_data_mcg(ss, vnic, eib_broadcast_mac, B_TRUE, + err) != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_create_common: " + "eib_vnic_join_data_mcg(vn_id=0x%x, BCAST_GROUP) failed, " + "ret=%d", vnic->vn_id, *err); + goto vnic_create_common_fail; + } + progress |= EIB_VNIC_BROADCAST_JOINED; + + mutex_enter(&ss->ei_vnic_lock); + if (ss->ei_vnic[vnic->vn_instance] == NULL) { + ss->ei_vnic[vnic->vn_instance] = vnic; + } + ss->ei_vnic_pending = NULL; + mutex_exit(&ss->ei_vnic_lock); + + return (EIB_E_SUCCESS); + +vnic_create_common_fail: + eib_rb_vnic_create_common(ss, vnic, progress); + return (EIB_E_FAILURE); +} + +static int +eib_vnic_set_partition(eib_t *ss, eib_vnic_t *vnic, int *err) +{ + int ret; + + /* + * Associate the control channel with the vhub partition + */ + ret = eib_ibt_modify_chan_pkey(ss, vnic->vn_ctl_chan, + vnic->vn_login_data.ld_vhub_pkey); + if (ret != EIB_E_SUCCESS) { + EIB_DPRINTF_WARN(ss->ei_instance, + "eib_vnic_set_partition: " + "eib_ibt_modify_chan_pkey(vn_id=0x%x, CTL_CHAN, " + "vhub_pkey=0x%x) failed", vnic->vn_id, + vnic->vn_login_data.ld_vhub_pkey); + *err = EINVAL; + return (EIB_E_FAILURE); + } + + /* + * Now, do the same thing for the data channel. Note that if a + * failure happens, the channel state(s) are left as-is, since + * it is pointless to try to change them back using the same + * interfaces that have just failed. 
	 */
	ret = eib_ibt_modify_chan_pkey(ss, vnic->vn_data_chan,
	    vnic->vn_login_data.ld_vhub_pkey);
	if (ret != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_set_partition: "
		    "eib_ibt_modify_chan_pkey(vn_id=0x%x, DATA_CHAN, "
		    "vhub_pkey=0x%x) failed", vnic->vn_id,
		    vnic->vn_login_data.ld_vhub_pkey);
		*err = EINVAL;
		return (EIB_E_FAILURE);
	}

	return (EIB_E_SUCCESS);
}

/*
 * Assemble an EoIB vHUB MGID from the gateway's mgid prefix, the mgid
 * type (table/update/data), n_mac bits of the multicast mac, the rss
 * hash and the vhub id, and convert the result into the host-order
 * ib_gid_t form that IBTF expects.
 */
static void
eib_vnic_make_vhub_mgid(uint8_t *mg_prefix, uint8_t mg_type,
    uint8_t *mcast_mac, uint8_t n_mac, uint8_t rss_hash, uint32_t vhub_id,
    ib_gid_t *mgid)
{
	eib_mgid_t em;
	uint64_t dmac_mask;
	uint64_t dmac = 0;	/* mac staged in the low 6 bytes (offset 2) */
	uint8_t *dmac_str = (uint8_t *)&dmac;
	uint_t vhub_id_nw;
	uint8_t *vhub_id_str = (uint8_t *)&vhub_id_nw;

	/*
	 * Copy mgid prefix and type
	 */
	bcopy(mg_prefix, em.gd_spec.sp_mgid_prefix, FIP_MGID_PREFIX_LEN);
	em.gd_spec.sp_type = mg_type;

	/*
	 * Take n_mac bits from mcast_mac and copy dmac.
	 * NOTE(review): the mask is built host-order then htonll()'d and
	 * applied to the raw byte image of the mac -- assumed to select
	 * the low-order n_mac bits per the EoIB spec; TODO confirm the
	 * orientation for n_mac < 48 on both endiannesses.
	 */
	bcopy(mcast_mac, dmac_str + 2, ETHERADDRL);
	dmac_mask = ((uint64_t)1 << n_mac) - 1;
	dmac_mask = htonll(dmac_mask);
	dmac &= dmac_mask;
	bcopy(dmac_str + 2, em.gd_spec.sp_dmac, ETHERADDRL);

	/*
	 * Copy rss hash and prepare vhub id from gw port id and vlan
	 */
	em.gd_spec.sp_rss_hash = rss_hash;

	/* low 3 bytes of the network-order vhub id (FIP_VHUBID_LEN) */
	vhub_id_nw = htonl(vhub_id);
	bcopy(vhub_id_str + 1, em.gd_spec.sp_vhub_id, FIP_VHUBID_LEN);

	/*
	 * Ok, now we've assembled the mgid as per EoIB spec. We now have to
	 * represent it in the way Solaris IBTF wants it and return (sigh).
	 */
	mgid->gid_prefix = ntohll(em.gd_sol.gid_prefix);
	mgid->gid_guid = ntohll(em.gd_sol.gid_guid);
}

/*
 * Attach this vnic's control channel to the vHUB update and vHUB table
 * multicast groups, initializing the local tables first.  On any failure
 * the earlier steps are rolled back before returning EIB_E_FAILURE.
 */
static int
eib_vnic_attach_ctl_mcgs(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	/*
	 * Get tb_vhub_table and tb_vhub_update allocated and ready before
	 * attaching to the vhub table and vhub update mcgs
	 */
	eib_vnic_init_tables(ss, vnic);

	if (eib_vnic_attach_vhub_update(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_ctl_mcgs: "
		    "eib_vnic_attach_vhub_update(vn_id=0x%x) failed",
		    vnic->vn_id);

		*err = EINVAL;
		eib_vnic_fini_tables(ss, vnic, B_TRUE);
		return (EIB_E_FAILURE);
	}

	if (eib_vnic_attach_vhub_table(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_ctl_mcgs: "
		    "eib_vnic_attach_vhub_table(vn_id=0x%x) failed",
		    vnic->vn_id);

		*err = EINVAL;
		eib_rb_vnic_attach_vhub_update(ss, vnic);
		eib_vnic_fini_tables(ss, vnic, B_TRUE);
		return (EIB_E_FAILURE);
	}

	return (EIB_E_SUCCESS);
}

/*
 * Locate, join and attach the control channel to the vHUB table mcg so
 * we can receive the gateway's vhub table messages.
 */
static int
eib_vnic_attach_vhub_table(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_mcg_t *mcg;
	ibt_mcg_info_t *tbl_mcginfo;
	ibt_mcg_attr_t mcg_attr;
	ibt_status_t ret;
	uint_t entries;

	/*
	 * Compose the MGID for receiving VHUB table
	 */
	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));

	eib_vnic_make_vhub_mgid(ld->ld_gw_mgid_prefix,
	    (uint8_t)EIB_MGID_VHUB_TABLE, eib_broadcast_mac, ld->ld_n_mac_mcgid,
	    0, ld->ld_vhub_id, &(mcg_attr.mc_mgid));
	mcg_attr.mc_pkey = (ib_pkey_t)ld->ld_vhub_pkey;
	mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY;

	/*
	 * Locate the multicast group for receiving vhub table
	 */
	ret = ibt_query_mcg(ss->ei_props->ep_sgid, &mcg_attr, 1,
	    &tbl_mcginfo, &entries);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_table: "
		    "ibt_query_mcg(mgid=%llx.%llx, pkey=0x%x) failed, "
		    "ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey, ret);
		return (EIB_E_FAILURE);
	}

	/*
	 * Allocate for and prepare the mcg to add to our list
	 */
	mcg = kmem_zalloc(sizeof (eib_mcg_t), KM_NOSLEEP);
	if (mcg == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_table: "
		    "no memory, failed to attach to vhub table "
		    "(mgid=%llx.%llx, pkey=0x%x)", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey);
		ibt_free_mcg_info(tbl_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	mcg->mg_next = NULL;
	mcg->mg_rgid = ss->ei_props->ep_sgid;
	mcg->mg_mgid = mcg_attr.mc_mgid;
	mcg->mg_join_state = IB_MC_JSTATE_FULL;
	mcg->mg_mcginfo = tbl_mcginfo;
	bcopy(eib_broadcast_mac, mcg->mg_mac, ETHERADDRL);

	/*
	 * Join the multicast group, using the flow/tclass/sl values the
	 * SA returned for the group in tbl_mcginfo.
	 */
	mcg_attr.mc_join_state = mcg->mg_join_state;
	mcg_attr.mc_flow = tbl_mcginfo->mc_adds_vect.av_flow;
	mcg_attr.mc_tclass = tbl_mcginfo->mc_adds_vect.av_tclass;
	mcg_attr.mc_sl = tbl_mcginfo->mc_adds_vect.av_srvl;
	mcg_attr.mc_scope = 0;	/* IB_MC_SCOPE_SUBNET_LOCAL perhaps ? */

	ret = ibt_join_mcg(mcg->mg_rgid, &mcg_attr, tbl_mcginfo, NULL, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_table: "
		    "ibt_join_mcg(mgid=%llx.%llx, pkey=0x%x, jstate=0x%x) "
		    "failed, ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey,
		    mcg_attr.mc_join_state, ret);

		kmem_free(mcg, sizeof (eib_mcg_t));
		ibt_free_mcg_info(tbl_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	/*
	 * Attach to the multicast group to receive tbl multicasts
	 */
	ret = ibt_attach_mcg(chan->ch_chan, tbl_mcginfo);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_table: "
		    "ibt_attach_mcg(mgid=%llx.%llx, pkey=0x%x) "
		    "failed, ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey);

		(void) ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		kmem_free(mcg, sizeof (eib_mcg_t));
		ibt_free_mcg_info(tbl_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	/* Publish the joined mcg on the control channel */
	mutex_enter(&chan->ch_vhub_lock);
	chan->ch_vhub_table = mcg;
	mutex_exit(&chan->ch_vhub_lock);

	return (EIB_E_SUCCESS);
}

/*
 * Locate, join and attach the control channel to the vHUB update mcg so
 * we can receive incremental vhub update messages from the gateway.
 */
static int
eib_vnic_attach_vhub_update(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_mcg_t *mcg;
	ibt_mcg_info_t *upd_mcginfo;
	ibt_mcg_attr_t mcg_attr;
	ibt_status_t ret;
	uint_t entries;

	/*
	 * Compose the MGID for receiving VHUB updates
	 */
	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));

	eib_vnic_make_vhub_mgid(ld->ld_gw_mgid_prefix,
	    (uint8_t)EIB_MGID_VHUB_UPDATE, eib_broadcast_mac,
	    ld->ld_n_mac_mcgid, 0, ld->ld_vhub_id, &(mcg_attr.mc_mgid));
	mcg_attr.mc_pkey = (ib_pkey_t)ld->ld_vhub_pkey;
	mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY;

	/*
	 * Locate the multicast group for receiving vhub updates
	 */
	ret = ibt_query_mcg(ss->ei_props->ep_sgid, &mcg_attr, 1,
	    &upd_mcginfo, &entries);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_update: "
		    "ibt_query_mcg(mgid=%llx.%llx, pkey=0x%x) failed, "
		    "ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey, ret);
		return (EIB_E_FAILURE);
	}

	/*
	 * Allocate for and prepare the mcg to add to our list
	 */
	mcg = kmem_zalloc(sizeof (eib_mcg_t), KM_NOSLEEP);
	if (mcg == NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_update: "
		    "no memory, failed to attach to vhub update "
		    "(mgid=%llx.%llx, pkey=0x%x)", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey);

		ibt_free_mcg_info(upd_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	mcg->mg_next = NULL;
	mcg->mg_rgid = ss->ei_props->ep_sgid;
	mcg->mg_mgid = mcg_attr.mc_mgid;
	mcg->mg_join_state = IB_MC_JSTATE_FULL;
	mcg->mg_mcginfo = upd_mcginfo;
	bcopy(eib_broadcast_mac, mcg->mg_mac, ETHERADDRL);

	/*
	 * Join the multicast group, using the flow/tclass/sl values the
	 * SA returned for the group in upd_mcginfo.
	 */
	mcg_attr.mc_join_state = mcg->mg_join_state;
	mcg_attr.mc_flow = upd_mcginfo->mc_adds_vect.av_flow;
	mcg_attr.mc_tclass = upd_mcginfo->mc_adds_vect.av_tclass;
	mcg_attr.mc_sl = upd_mcginfo->mc_adds_vect.av_srvl;
	mcg_attr.mc_scope = 0;	/* IB_MC_SCOPE_SUBNET_LOCAL perhaps ? */

	ret = ibt_join_mcg(mcg->mg_rgid, &mcg_attr, upd_mcginfo, NULL, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_update: "
		    "ibt_join_mcg(mgid=%llx.%llx, pkey=0x%x, jstate=0x%x) "
		    "failed, ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey,
		    mcg_attr.mc_join_state, ret);

		kmem_free(mcg, sizeof (eib_mcg_t));
		ibt_free_mcg_info(upd_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	/*
	 * Attach to the multicast group to receive upd multicasts
	 */
	ret = ibt_attach_mcg(chan->ch_chan, upd_mcginfo);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_attach_vhub_update: "
		    "ibt_attach_mcg(mgid=%llx.%llx, pkey=0x%x) "
		    "failed, ret=%d", mcg_attr.mc_mgid.gid_prefix,
		    mcg_attr.mc_mgid.gid_guid, mcg_attr.mc_pkey);

		(void) ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		kmem_free(mcg, sizeof (eib_mcg_t));
		ibt_free_mcg_info(upd_mcginfo, 1);
		return (EIB_E_FAILURE);
	}

	/* Publish the joined mcg on the control channel */
	mutex_enter(&chan->ch_vhub_lock);
	chan->ch_vhub_update = mcg;
	mutex_exit(&chan->ch_vhub_lock);

	return (EIB_E_SUCCESS);
}

/*
 * Send the first keepalive heartbeat for this vnic and append it to the
 * tail of the keepalives manager's list.
 */
static void
eib_vnic_start_keepalives(eib_t *ss, eib_vnic_t *vnic)
{
	eib_ka_vnics_t *kav;
	eib_ka_vnics_t *elem;
	int err;

	kav = kmem_zalloc(sizeof (eib_ka_vnics_t), KM_SLEEP);
	kav->ka_vnic = vnic;
	kav->ka_next = NULL;

	/*
	 * Send the first keepalive and then queue this vnic up with
	 * the keepalives manager
	 */
	(void) eib_fip_heartbeat(ss, vnic, &err);

	/* walk to the tail of the list; append (or set the head if empty) */
	mutex_enter(&ss->ei_ka_vnics_lock);
	for (elem = ss->ei_ka_vnics; elem; elem = elem->ka_next) {
		if (elem->ka_next == NULL)
			break;
	}
	if (elem) {
		elem->ka_next = kav;
	} else {
		ss->ei_ka_vnics = kav;
	}
	mutex_exit(&ss->ei_ka_vnics_lock);
}

/*
 * Resolve the destination for an outbound frame: for a unicast dmac,
 * fill *ucast from the vhub table (falling back to the gateway entry);
 * for broadcast/multicast, fill *mcast from the joined data mcg.
 * *dtype is set to EIB_TX_UNICAST/BROADCAST/MULTICAST accordingly.
 */
/*ARGSUSED*/
static int
eib_vnic_lookup_dest(eib_vnic_t *vnic, uint8_t *dmac, uint16_t vlan,
    eib_vhub_map_t *ucast, ibt_mcg_info_t *mcast, int *dtype)
{
	eib_t *ss = vnic->vn_ss;
	eib_vhub_map_t *elem;
	eib_mcg_t *mcg;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_vhub_map_t *gw;
	eib_vhub_table_t *tbl;
	/* hash bucket keyed on the last byte of the destination mac */
	uint8_t bkt = (dmac[ETHERADDRL-1]) % EIB_TB_NBUCKETS;
	ib_gid_t mgid;

	/*
	 * If this was a unicast dmac, locate the vhub entry matching the
	 * unicast dmac in our vhub table. If it's not found, return the
	 * gateway entry
	 */
	if (EIB_UNICAST_MAC(dmac)) {

		mutex_enter(&vnic->vn_lock);
		if ((tbl = vnic->vn_vhub_table) == NULL) {
			mutex_exit(&vnic->vn_lock);
			return (EIB_E_FAILURE);
		}

		mutex_enter(&tbl->tb_lock);
		gw = tbl->tb_gateway;
		for (elem = tbl->tb_vnic_entry[bkt]; elem != NULL;
		    elem = elem->mp_next) {
			if (bcmp(elem->mp_mac, dmac, ETHERADDRL) == 0)
				break;
		}
		mutex_exit(&tbl->tb_lock);

		if ((elem == NULL) && (gw == NULL)) {
			mutex_exit(&vnic->vn_lock);
			return (EIB_E_FAILURE);
		}

		/*
		 * NOTE(review): elem/gw contents are copied after tb_lock
		 * is dropped -- assumed to be protected from teardown by
		 * the still-held vn_lock; TODO confirm against the table
		 * update paths.
		 */
		*dtype = EIB_TX_UNICAST;
		if (elem) {
			bcopy(elem, ucast, sizeof (eib_vhub_map_t));
		} else {
			bcopy(gw, ucast, sizeof (eib_vhub_map_t));
		}
		mutex_exit(&vnic->vn_lock);

		return (EIB_E_SUCCESS);
	}

	/*
	 * Is it a broadcast ?
	 */
	*dtype = (bcmp(dmac, eib_broadcast_mac, ETHERADDRL) == 0) ?
	    EIB_TX_BROADCAST : EIB_TX_MULTICAST;

	/*
	 * If this was a multicast dmac, prepare the mgid and look for it
	 * in the list of mcgs we've joined and use the address vector from
	 * the mcginfo stored there.
	 *
	 * Note that since we don't have a way to associate each vlan with
	 * the mcg (see eib_m_multicast()), we'll prepare the mgid to use
	 * the broadcast channel all the time.
	 */
	eib_vnic_make_vhub_mgid(ld->ld_gw_mgid_prefix,
	    (uint8_t)EIB_MGID_VHUB_DATA, eib_broadcast_mac, ld->ld_n_mac_mcgid,
	    0, ld->ld_vhub_id, &mgid);

	mutex_enter(&chan->ch_vhub_lock);
	for (mcg = chan->ch_vhub_data; mcg; mcg = mcg->mg_next) {
		if ((mcg->mg_mgid.gid_prefix == mgid.gid_prefix) &&
		    (mcg->mg_mgid.gid_guid == mgid.gid_guid)) {
			break;
		}
	}
	if (mcg == NULL) {
		mutex_exit(&chan->ch_vhub_lock);

		EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_lookup_dest: "
		    "could not find mgid %llx.%llx",
		    mgid.gid_prefix, mgid.gid_guid);

		return (EIB_E_FAILURE);
	}

	bcopy(mcg->mg_mcginfo, mcast, sizeof (ibt_mcg_info_t));
	mutex_exit(&chan->ch_vhub_lock);

	return (EIB_E_SUCCESS);
}

/*
 * Detach the data qp from every data mcg on the channel, leave each
 * group and free the associated bookkeeping.  IBTF failures are logged
 * and the teardown continues (best-effort cleanup).
 */
/*ARGSUSED*/
static void
eib_vnic_leave_all_data_mcgs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_mcg_t *mcglist;
	eib_mcg_t *mcg;
	eib_mcg_t *nxt = NULL;
	ibt_status_t ret;

	/*
	 * First, take the ch_vhub_data mcg chain out of chan
	 */
	mutex_enter(&chan->ch_vhub_lock);
	mcglist = chan->ch_vhub_data;
	chan->ch_vhub_data = NULL;
	mutex_exit(&chan->ch_vhub_lock);

	/*
	 * Go through the chain of mcgs we've joined, detach the qp from the
	 * mcg, leave the group and free all associated stuff
	 */
	for (mcg = mcglist; mcg != NULL; mcg = nxt) {
		nxt = mcg->mg_next;

		ret = ibt_detach_mcg(chan->ch_chan, mcg->mg_mcginfo);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_leave_all_data_mcgs: "
			    "ibt_detach_mcg(chan_hdl=0x%llx, mcinfo=0x%llx, "
			    "mgid=%llx.%llx) failed, ret=%d", chan->ch_chan,
			    mcg->mg_mcginfo, mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, ret);
		}

		ret = ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_leave_all_data_mcgs: "
			    "ibt_leave_mcg(mgid=%llx.%llx, jstate=0x%x) "
			    "failed, ret=%d", mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid,
			    mcg->mg_join_state, ret);
		}

		if (mcg->mg_mcginfo)
			kmem_free(mcg->mg_mcginfo, sizeof (ibt_mcg_info_t));

		kmem_free(mcg, sizeof (eib_mcg_t));
	}
}

/*
 * Leave and rejoin every data mcg currently on the channel -- used after
 * events (e.g. port changes) that may have invalidated our SM membership.
 * The old list is unhooked from the channel first; each successful
 * eib_vnic_join_data_mcg() re-adds a fresh entry.
 */
static void
eib_vnic_rejoin_data_mcgs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_mcg_t *mcglist;
	eib_mcg_t *mcg;
	eib_mcg_t *next;
	int err;

	/*
	 * Grab the current list of mcgs
	 */
	mutex_enter(&chan->ch_vhub_lock);
	mcglist = chan->ch_vhub_data;
	chan->ch_vhub_data = NULL;
	mutex_exit(&chan->ch_vhub_lock);

	/*
	 * When rejoin data mcgs is called, we may not even be marked as
	 * joined in SM's records. But we still have to leave the old
	 * one first to prevent leaks in ibtf.
	 */
	for (mcg = mcglist; mcg != NULL; mcg = next) {
		next = mcg->mg_next;
		mcg->mg_next = NULL;

		(void) ibt_detach_mcg(chan->ch_chan, mcg->mg_mcginfo);
		(void) ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);

		if (eib_vnic_join_data_mcg(ss, vnic, mcg->mg_mac, B_TRUE,
		    &err) != EIB_E_SUCCESS) {
			uint8_t *m;

			m = mcg->mg_mac;
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_vnic_rejoin_data_mcgs: "
			    "eib_vnic_join_data_mcg(mcmac=%x:%x:%x:%x:%x:%x) "
			    "failed, ret=%d", m[0], m[1], m[2], m[3],
			    m[4], m[5], err);
		}
		if (mcg->mg_mcginfo) {
			kmem_free(mcg->mg_mcginfo, sizeof (ibt_mcg_info_t));
		}
		kmem_free(mcg, sizeof (eib_mcg_t));
	}
}

/*
 * Detach and re-attach the control channel's vHUB table and vHUB update
 * mcgs without reinitializing the locally constructed tables.
 */
static void
eib_vnic_reattach_ctl_mcgs(eib_t *ss, eib_vnic_t *vnic)
{
	/*
	 * For reattaching to control mcgs, we will not reinitialize the
	 * vhub table/vhub update we've constructed. We'll simply detach
	 * from the table and update mcgs and reattach to them. Hopefully,
	 * we wouldn't have missed any updates and won't have to restart
	 * the vnic.
	 */
	eib_rb_vnic_attach_vhub_table(ss, vnic);
	eib_rb_vnic_attach_vhub_update(ss, vnic);

	if (eib_vnic_attach_vhub_update(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_reattach_ctl_mcgs: "
		    "eib_vnic_attach_vhub_update(vn_id=0x%x) failed",
		    vnic->vn_id);
	}

	if (eib_vnic_attach_vhub_table(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_vnic_reattach_ctl_mcgs: "
		    "eib_vnic_attach_vhub_table(vn_id=0x%x) failed",
		    vnic->vn_id);

		eib_rb_vnic_attach_vhub_update(ss, vnic);
	}
}

/*
 * Undo eib_vnic_create_common() up to the point recorded in "progress"
 * (reverse order of creation).  Also unpublishes the vnic from
 * ss->ei_vnic[] and clears the pending slot.
 */
static void
eib_rb_vnic_create_common(eib_t *ss, eib_vnic_t *vnic, uint_t progress)
{
	int err;

	mutex_enter(&ss->ei_vnic_lock);
	ss->ei_vnic[vnic->vn_instance] = NULL;
	ss->ei_vnic_pending = NULL;
	mutex_exit(&ss->ei_vnic_lock);

	if (progress & EIB_VNIC_BROADCAST_JOINED) {
		eib_vnic_leave_all_data_mcgs(ss, vnic);
	}

	if (progress & EIB_VNIC_KEEPALIVES_STARTED) {
		eib_rb_vnic_start_keepalives(ss, vnic);
	}

	if (progress & EIB_VNIC_ATTACHED_TO_CTL_MCGS) {
		eib_rb_vnic_attach_ctl_mcgs(ss, vnic);
	}

	if (progress & EIB_VNIC_LOGIN_DONE) {
		(void) eib_fip_logout(ss, vnic, &err);
	}

	if (progress & EIB_VNIC_DATAQP_CREATED) {
		eib_rb_data_create_qp(ss, vnic);
	}

	if (progress & EIB_VNIC_CTLQP_CREATED) {
		eib_rb_ctl_create_qp(ss, vnic);
	}
}

static void
eib_rb_vnic_attach_ctl_mcgs(eib_t *ss, eib_vnic_t *vnic)
{
	/*
	 * Detach from the vhub table and vhub update mcgs before blowing
	 * up vn_vhub_table and vn_vhub_update, since these are assumed to
	 * be available by the control cq handler.
	 */
	eib_rb_vnic_attach_vhub_table(ss, vnic);
	eib_rb_vnic_attach_vhub_update(ss, vnic);
	eib_vnic_fini_tables(ss, vnic, B_TRUE);
}

/*
 * Detach the control qp from the vHUB table mcg, leave the group and
 * free the mcg bookkeeping.  IBTF failures are logged and cleanup
 * continues (best-effort).
 */
/*ARGSUSED*/
static void
eib_rb_vnic_attach_vhub_table(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	eib_mcg_t *mcg;
	ibt_channel_hdl_t chan_hdl;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	/* Unhook the mcg from the channel under the lock, free it after */
	mutex_enter(&chan->ch_vhub_lock);
	chan_hdl = chan->ch_chan;
	mcg = chan->ch_vhub_table;
	chan->ch_vhub_table = NULL;
	mutex_exit(&chan->ch_vhub_lock);

	if (chan_hdl && mcg) {
		ret = ibt_detach_mcg(chan_hdl, mcg->mg_mcginfo);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_vnic_attach_vhub_table: "
			    "ibt_detach_mcg(chan_hdl=0x%llx, mcinfo=0x%llx, "
			    "mgid=%llx.%llx) failed, ret=%d", chan_hdl,
			    mcg->mg_mcginfo, mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, ret);
		}

		ret = ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_vnic_attach_vhub_table: "
			    "ibt_leave_mcg(mgid=%llx.%llx, jstate=0x%x) "
			    "failed, ret=%d", mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, mcg->mg_join_state, ret);
		}

		/* mcginfo came from ibt_query_mcg(); free it via IBTF */
		if (mcg->mg_mcginfo) {
			ibt_free_mcg_info(mcg->mg_mcginfo, 1);
		}
		kmem_free(mcg, sizeof (eib_mcg_t));
	}
}

/*
 * Detach the control qp from the vHUB update mcg, leave the group and
 * free the mcg bookkeeping.  Mirrors eib_rb_vnic_attach_vhub_table().
 */
/*ARGSUSED*/
static void
eib_rb_vnic_attach_vhub_update(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_ctl_chan;
	eib_mcg_t *mcg;
	ibt_channel_hdl_t chan_hdl;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	/* Unhook the mcg from the channel under the lock, free it after */
	mutex_enter(&chan->ch_vhub_lock);
	chan_hdl = chan->ch_chan;
	mcg = chan->ch_vhub_update;
	chan->ch_vhub_update = NULL;
	mutex_exit(&chan->ch_vhub_lock);

	if (chan_hdl && mcg) {
		ret = ibt_detach_mcg(chan_hdl, mcg->mg_mcginfo);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_vnic_attach_vhub_update: "
			    "ibt_detach_mcg(chan_hdl=0x%llx, mcinfo=0x%llx, "
			    "mgid=%llx.%llx) failed, ret=%d", chan_hdl,
			    mcg->mg_mcginfo, mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, ret);
		}

		ret = ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid,
		    eib_reserved_gid, mcg->mg_join_state);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_vnic_attach_vhub_update: "
			    "ibt_leave_mcg(mgid=%llx.%llx, jstate=0x%x) "
			    "failed, ret=%d", mcg->mg_mgid.gid_prefix,
			    mcg->mg_mgid.gid_guid, mcg->mg_join_state, ret);
		}

		/* mcginfo came from ibt_query_mcg(); free it via IBTF */
		if (mcg->mg_mcginfo) {
			ibt_free_mcg_info(mcg->mg_mcginfo, 1);
		}
		kmem_free(mcg, sizeof (eib_mcg_t));
	}
}

/*
 * Remove this vnic's entry from the keepalives manager list (the inverse
 * of eib_vnic_start_keepalives()).
 */
/*ARGSUSED*/
static void
eib_rb_vnic_start_keepalives(eib_t *ss, eib_vnic_t *vnic)
{
	eib_ka_vnics_t *prev;
	eib_ka_vnics_t *elem;

	/*
	 * We only need to locate and remove the vnic entry from the
	 * keepalives manager list
	 */

	mutex_enter(&ss->ei_ka_vnics_lock);

	prev = NULL;
	for (elem = ss->ei_ka_vnics; elem; elem = elem->ka_next) {
		if (elem->ka_vnic == vnic)
			break;

		prev = elem;
	}
	if (elem == NULL) {
		EIB_DPRINTF_DEBUG(ss->ei_instance,
		    "eib_rb_vnic_start_keepalives: no keepalive element found "
		    "for vnic 0x%llx (vn_inst=%d) with keepalive manager",
		    vnic, vnic->vn_instance);
	} else {
		if (prev) {
			prev->ka_next = elem->ka_next;
		} else {
			ss->ei_ka_vnics = elem->ka_next;
		}
		kmem_free(elem, sizeof (eib_ka_vnics_t));
	}
	mutex_exit(&ss->ei_ka_vnics_lock);
}

/*
 * Undo a single eib_vnic_join_data_mcg(): find the mcg for "mcast_mac"
 * on the data channel, unlink it, detach the qp, leave the group and
 * free the bookkeeping.  Silently returns if the mac is not found.
 */
/*ARGSUSED*/
static void
eib_rb_vnic_join_data_mcg(eib_t *ss, eib_vnic_t *vnic, uint8_t *mcast_mac)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_mcg_t *prev;
	eib_mcg_t *mcg;
	ibt_status_t ret;

	/*
	 * Search our list and remove the item if found
	 */
	mutex_enter(&chan->ch_vhub_lock);

	prev = NULL;
	for (mcg = chan->ch_vhub_data; mcg != NULL; mcg = mcg->mg_next) {
		if (bcmp(mcg->mg_mac, mcast_mac, ETHERADDRL) == 0)
			break;
		prev = mcg;
	}

	if (mcg == NULL) {
		mutex_exit(&chan->ch_vhub_lock);
		return;
	}

	if (prev != NULL)
		prev->mg_next = mcg->mg_next;
	else
		chan->ch_vhub_data = mcg->mg_next;

	mcg->mg_next = NULL;

	mutex_exit(&chan->ch_vhub_lock);

	/*
	 * Detach data channel qp from the mcg, leave the group and free
	 * all associated stuff
	 */
	ret = ibt_detach_mcg(chan->ch_chan, mcg->mg_mcginfo);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_rb_vnic_join_data_mcg: "
		    "ibt_detach_mcg(chan_hdl=0x%llx, mcinfo=0x%llx, "
		    "mgid=%llx.%llx) failed, ret=%d", chan->ch_chan,
		    mcg->mg_mcginfo, mcg->mg_mgid.gid_prefix,
		    mcg->mg_mgid.gid_guid, ret);
	}

	ret = ibt_leave_mcg(mcg->mg_rgid, mcg->mg_mgid, eib_reserved_gid,
	    mcg->mg_join_state);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance,
		    "eib_rb_vnic_join_data_mcg: "
		    "ibt_leave_mcg(mgid=%llx.%llx, jstate=0x%x) "
		    "failed, ret=%d", mcg->mg_mgid.gid_prefix,
		    mcg->mg_mgid.gid_guid, mcg->mg_join_state, ret);
	}

	if (mcg->mg_mcginfo)
		kmem_free(mcg->mg_mcginfo, sizeof (ibt_mcg_info_t));

	kmem_free(mcg, sizeof (eib_mcg_t));
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/eibnx.conf Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,29 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# Configuration file for the EoIB nexus driver +# +name="eibnx" parent="ib" unit-address="0";
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/io/ib/clients/eoib/enx_ctl.c	Fri Aug 13 07:02:57 2010 -0400
@@ -0,0 +1,59 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/cred.h>
#include <sys/file.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>

#include <sys/ib/clients/eoib/enx_impl.h>

/*
 * Devctl cbops: open, close, ioctl
 *
 * These are intentionally empty stubs: the nexus driver exposes a devctl
 * node but implements no device control operations of its own, so each
 * entry point simply reports success.
 */

/* open(9E) stub for the devctl minor node; always succeeds */
/*ARGSUSED*/
int
eibnx_devctl_open(dev_t *devp, int flags, int otyp, cred_t *credp)
{
	return (0);
}

/* close(9E) stub for the devctl minor node; always succeeds */
/*ARGSUSED*/
int
eibnx_devctl_close(dev_t dev, int flags, int otyp, cred_t *credp)
{
	return (0);
}

/* ioctl(9E) stub; no commands are handled, everything returns success */
/*ARGSUSED*/
int
eibnx_devctl_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
    cred_t *cred_p, int *rval_p)
{
	return (0);
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_fip.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,605 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/byteorder.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +const char fip_vendor_mellanox[] = { + 0x4d, 0x65, 0x6c, 0x6c, 0x61, 0x6e, 0x6f, 0x78 +}; + +/* + * HW/FW workaround + * + * Verification of descriptor list length in the received packets is + * disabled, since experimentation shows that BX does not set the desc + * list length correctly. 
+ */ +int enx_wa_no_desc_list_len = 1; + +/* + * Static function declarations + */ +static int eibnx_fip_make_solicit_pkt(eibnx_thr_info_t *, eibnx_wqe_t *); +static int eibnx_fip_send_solicit_pkt(eibnx_thr_info_t *, eibnx_wqe_t *, + eibnx_gw_addr_t *); +static int eibnx_fip_parse_advt_pkt(uint8_t *, eibnx_gw_msg_t *); +static void eibnx_rb_fip_make_solicit_pkt(eibnx_wqe_t *); + +/* + * Prepare and send a solicit multicast packet to the All-EoIB-GWs-GID + */ +int +eibnx_fip_solicit_mcast(eibnx_thr_info_t *info) +{ + eibnx_wqe_t *swqe; + int ret; + + if ((swqe = eibnx_acquire_swqe(info, KM_SLEEP)) == NULL) + return (ENX_E_FAILURE); + + ret = eibnx_fip_make_solicit_pkt(info, swqe); + if (ret != ENX_E_SUCCESS) { + eibnx_release_swqe(swqe); + return (ENX_E_FAILURE); + } + + ret = eibnx_fip_send_solicit_pkt(info, swqe, NULL); + if (ret != ENX_E_SUCCESS) { + eibnx_rb_fip_make_solicit_pkt(swqe); + eibnx_release_swqe(swqe); + return (ENX_E_FAILURE); + } + + return (ENX_E_SUCCESS); +} + +/* + * Go through the list of already discovered gateways and send + * a unicast solicitation to each gateway. This is required by + * the EoIB specification ostensibly to receive updated + * advertisements. + */ +int +eibnx_fip_solicit_ucast(eibnx_thr_info_t *info, clock_t *solicit_period_ticks) +{ + eibnx_gw_info_t *gw; + eibnx_wqe_t *swqe; + clock_t min_solicit_period_msec; + int ret; + + /* + * We want to read the gwlist and send a unicast to each + * destination. Now, the only places where the gw list pointers + * are updated are when we're adding a new gw item to the list + * and when the list is being torn down and freed. + * + * Since new GWs are always inserted at the head of the list, + * we're guaranteed that any tail subchain of the list will + * not change by the addition of a new gw item coming into + * the list. + * + * Also, since the gw list is torn down only by the port-monitor + * thread (i.e. 
ourselves), we are also protected against the + * list itself going away while we're here. + * + * Given these two constraints, we can safely read the list + * of gateways without the gw list lock in this routine. + */ + min_solicit_period_msec = drv_hztousec(*solicit_period_ticks) / 1000; + for (gw = info->ti_gw; gw; gw = gw->gw_next) { + + if (eibnx_is_gw_dead(gw)) + continue; + + swqe = gw->gw_swqe; + ASSERT(swqe != NULL); + + mutex_enter(&swqe->qe_lock); + if (swqe->qe_type != ENX_QETYP_SWQE) { + ENX_DPRINTF_DEBUG("eibnx_fip_solicit_ucast: " + "gw wqe type (0x%lx) indicates this is not an " + "swqe!, cannot send solicitation to gw", + swqe->qe_type); + mutex_exit(&swqe->qe_lock); + continue; + } else if ((swqe->qe_flags & ENX_QEFL_INUSE) != + ENX_QEFL_INUSE) { + ENX_DPRINTF_DEBUG("eibnx_fip_solicit_ucast: " + "gw swqe flags (0x%lx) indicate swqe is free!, " + "cannot send solicitation to gw", swqe->qe_flags); + mutex_exit(&swqe->qe_lock); + continue; + } else if ((swqe->qe_flags & ENX_QEFL_POSTED) == + ENX_QEFL_POSTED) { + ENX_DPRINTF_DEBUG("eibnx_fip_solicit_ucast: gw swqe " + "flags (0x%lx) indicate swqe is still with HCA!, " + "cannot send solicitation to gw", swqe->qe_flags); + mutex_exit(&swqe->qe_lock); + continue; + } + mutex_exit(&swqe->qe_lock); + + /* + * EoIB spec requires that each host send solicitation + * to discovered gateways atleast every 4 * GW_ADV_PERIOD. + * We make sure we send a solicitation to all gateways + * every 4 * GW_ADV_PERIOD of the smallest value of + * GW_ADV_PERIOD that we have in our gw list. 
+ */ + if ((gw->gw_adv_period * 4) < min_solicit_period_msec) + min_solicit_period_msec = gw->gw_adv_period * 4; + + ret = eibnx_fip_make_solicit_pkt(info, swqe); + if (ret != ENX_E_SUCCESS) + continue; + + ret = eibnx_fip_send_solicit_pkt(info, swqe, &gw->gw_addr); + if (ret != ENX_E_SUCCESS) + eibnx_rb_fip_make_solicit_pkt(swqe); + } + + *solicit_period_ticks = drv_usectohz(min_solicit_period_msec * 1000); + + return (ENX_E_SUCCESS); +} + +/* + * Given a send wqe and an eibnx_thr_info_t pointer, fill in the + * send buffer with a solicit packet in the network byte order. + */ +static int +eibnx_fip_make_solicit_pkt(eibnx_thr_info_t *info, eibnx_wqe_t *swqe) +{ + fip_solicit_t *solicit; + fip_proto_t *proto; + fip_basic_hdr_t *hdr; + fip_desc_iba_t *iba; + ib_gid_t port_gid; + ib_guid_t port_guid; + + uint8_t *pkt = (uint8_t *)(uintptr_t)(swqe->qe_sgl.ds_va); + uint_t pktsz = swqe->qe_sgl.ds_len; + uint_t solicit_sz = sizeof (fip_solicit_t); + + if (pktsz < solicit_sz) { + ENX_DPRINTF_ERR("swqe bufsize too small for pkt, " + "pktsz=%x < expsz=%x", pktsz, solicit_sz); + return (ENX_E_FAILURE); + } + + /* + * Lint complains that there may be an alignment issue here, + * but we know that the "pkt" is atleast double-word aligned, + * so it's ok. 
+ */ + solicit = (fip_solicit_t *)pkt; + + /* + * Fill in the FIP protocol version + */ + proto = &solicit->sl_proto_version; + proto->pr_version = FIP_PROTO_VERSION; + + /* + * Fill in the basic header + */ + hdr = &solicit->sl_fip_hdr; + hdr->hd_opcode = htons(FIP_OPCODE_EOIB); + hdr->hd_subcode = FIP_SUBCODE_H_SOLICIT; + hdr->hd_desc_list_len = htons((solicit_sz >> 2) - 2); + hdr->hd_flags = 0; + hdr->hd_type = FIP_DESC_TYPE_VENDOR_ID; + hdr->hd_len = FIP_DESC_LEN_VENDOR_ID; + bcopy(fip_vendor_mellanox, hdr->hd_vendor_id, FIP_VENDOR_LEN); + + /* + * Fill in the Infiniband Address descriptor + */ + iba = &solicit->sl_iba; + iba->ia_type = FIP_DESC_TYPE_IBA; + iba->ia_len = FIP_DESC_LEN_IBA; + bcopy(fip_vendor_mellanox, iba->ia_vendor_id, FIP_VENDOR_LEN); + iba->ia_qpn = htonl(info->ti_qpn); + iba->ia_sl_portid = 0; + iba->ia_lid = htons(info->ti_pi->p_base_lid); + port_gid = info->ti_pi->p_sgid_tbl[0]; + port_guid = htonll(port_gid.gid_guid); + bcopy(&port_guid, iba->ia_guid, FIP_GUID_LEN); + + /* + * Adjust the ds_len in the sgl to indicate the size of the + * solicit pkt before returning + */ + swqe->qe_sgl.ds_len = solicit_sz; + + return (ENX_E_SUCCESS); +} + +static int +eibnx_setup_ud_dest(eibnx_thr_info_t *info, eibnx_wqe_t *swqe, + eibnx_gw_addr_t *gw_addr) +{ + eibnx_t *ss = enx_global_ss; + ibt_path_attr_t attr; + ibt_path_info_t path; + ibt_status_t ret; + + /* + * If this a multicast send, we'll have the gateway address NULL, + * and we'll need to modify the UD destination to send to the + * solicit mcg. 
+ */ + if (gw_addr == NULL) { + ret = ibt_modify_ud_dest(swqe->qe_wr.send.wr.ud.udwr_dest, + info->ti_solicit_mcg->mc_qkey, IB_MC_QPN, + &info->ti_solicit_mcg->mc_adds_vect); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_modify_ud_dest() failed with " + "ret=%d, qkey=%x, qpn=%x", ret, + info->ti_solicit_mcg->mc_qkey, IB_MC_QPN); + return (ENX_E_FAILURE); + } + + return (ENX_E_SUCCESS); + } + + /* + * If this is a unicast send, but we already have the gw address + * vector, the ud destination handle has already been set up for + * this gateway, so we can return. + */ + if (gw_addr->ga_vect) + return (ENX_E_SUCCESS); + + /* + * Get the reversible path information for this gateway + */ + bzero(&attr, sizeof (ibt_path_info_t)); + attr.pa_dgids = &gw_addr->ga_gid; + attr.pa_num_dgids = 1; + attr.pa_sgid = info->ti_pi->p_sgid_tbl[0]; + attr.pa_pkey = gw_addr->ga_pkey; + + bzero(&path, sizeof (ibt_path_info_t)); + ret = ibt_get_paths(ss->nx_ibt_hdl, IBT_PATH_PKEY, + &attr, 1, &path, NULL); + if ((ret != IBT_SUCCESS) || (path.pi_hca_guid == 0)) { + ENX_DPRINTF_ERR("ibt_get_paths() failed with " + "ret=%d, gid_prefix=%llx, gid_guid=%llx", ret, + gw_addr->ga_gid.gid_prefix, gw_addr->ga_gid.gid_guid); + return (ENX_E_FAILURE); + } + + /* + * And save the address vector + */ + gw_addr->ga_vect = kmem_zalloc(sizeof (ibt_adds_vect_t), KM_SLEEP); + bcopy(&path.pi_prim_cep_path.cep_adds_vect, gw_addr->ga_vect, + sizeof (ibt_adds_vect_t)); + + /* + * Modify the UD destination handle on this swqe entry to address + * this gateway + */ + ret = ibt_modify_ud_dest(swqe->qe_wr.send.wr.ud.udwr_dest, + gw_addr->ga_qkey, gw_addr->ga_qpn, gw_addr->ga_vect); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_modify_ud_dest() failed with " + "ret=%d, qkey=%x, qpn=%x", ret, gw_addr->ga_qkey, + gw_addr->ga_qpn); + kmem_free(gw_addr->ga_vect, sizeof (ibt_adds_vect_t)); + gw_addr->ga_vect = NULL; + return (ENX_E_FAILURE); + } + + return (ENX_E_SUCCESS); +} + +/* + * Send a solicit packet to 
the appropriate destination: if the + * destination gw addr is specified, send a unicast message to it; + * if not, send a multicast using the solicit mcg address. + */ +static int +eibnx_fip_send_solicit_pkt(eibnx_thr_info_t *info, eibnx_wqe_t *swqe, + eibnx_gw_addr_t *gw_addr) +{ + ibt_status_t ret; + + if (eibnx_setup_ud_dest(info, swqe, gw_addr) != ENX_E_SUCCESS) + return (ENX_E_FAILURE); + + mutex_enter(&swqe->qe_lock); + + /* + * Note that if the post send fails, we don't really need to undo + * anything we did in setting up the ud destination; we can always + * use it for the next time. + */ + ret = ibt_post_send(info->ti_chan, &(swqe->qe_wr.send), 1, NULL); + if (ret != IBT_SUCCESS) { + mutex_exit(&swqe->qe_lock); + ENX_DPRINTF_ERR("ibt_post_send() failed for solicit, " + "ret=%d", ret); + return (ENX_E_FAILURE); + } + + /* + * Set the 'posted' flag for the send wqe. If this is an unicast + * send, the wqe is attached to a specific gw entry and we should + * not release the wqe back to the pool on the send completion. + */ + swqe->qe_flags |= ENX_QEFL_POSTED; + if (gw_addr == NULL) { + swqe->qe_flags |= ENX_QEFL_RELONCOMP; + info->ti_mcast_done = 1; + } + + mutex_exit(&swqe->qe_lock); + + return (ENX_E_SUCCESS); +} + +/* + * Parse a received packet from the gateway into the + * eibnx_gw_msg_t argument. Note that at this point, this + * driver only expects to receive advertisements from the + * GW, nothing else. + */ +int +eibnx_fip_parse_pkt(uint8_t *pkt, eibnx_gw_msg_t *msg) +{ + fip_basic_hdr_t *hdr; + uint16_t opcode; + uint8_t subcode; + int ret = ENX_E_FAILURE; + + /* + * Lint complains about potential alignment problem here, + * but the fip_* structures are all packed and each of them + * is aligned on a word boundary, so we're ok. 
+ */ + hdr = (fip_basic_hdr_t *)(pkt + sizeof (fip_proto_t)); + + /* + * Verify that the opcode is EoIB + */ + if ((opcode = ntohs(hdr->hd_opcode)) != FIP_OPCODE_EOIB) { + ENX_DPRINTF_WARN("unsupported opcode (%x) found in " + "gw advertisement, ignoring", opcode); + return (ENX_E_FAILURE); + } + + /* + * We only handle GW advertisements in the eibnx driver code. However, + * the BridgeX gateway software currently sends login acknowledgements + * to the one who did the solicitation instead of the one who actually + * made the login request, so we need to do something about this as + * well. + */ + subcode = hdr->hd_subcode; + switch (subcode) { + case FIP_SUBCODE_G_ADVERTISE: + ret = eibnx_fip_parse_advt_pkt(pkt, msg); + break; + + case FIP_SUBCODE_G_VNIC_LOGIN_ACK: + msg->gm_type = FIP_VNIC_LOGIN_ACK; + ret = ENX_E_SUCCESS; + break; + + default: + ENX_DPRINTF_WARN("unsupported subcode (%x) found in " + "gw advertisement, ignoring", subcode); + ret = ENX_E_FAILURE; + break; + } + + return (ret); +} + +/* + * Parse and validate a packet known to be an advertisement from + * the GW. + */ +static int +eibnx_fip_parse_advt_pkt(uint8_t *pkt, eibnx_gw_msg_t *msg) +{ + fip_advertise_t *advertise; + fip_basic_hdr_t *hdr; + fip_desc_iba_t *desc_iba; + fip_desc_gwinfo_t *desc_gwinfo; + fip_desc_gwid_t *desc_gwid; + fip_desc_keepalive_t *desc_ka; + eibnx_gw_info_t *gwi; + ib_guid_t guid; + uint16_t rss_qpn_num_net_vnics; + uint16_t sl_portid; + uint16_t flags; + + /* + * Lint complains about potential alignment problem here, + * but we know that "pkt" is always atleast double-word + * aligned when it's passed to us, so we're ok. + */ + advertise = (fip_advertise_t *)pkt; + + /* + * Verify if the descriptor list length in the received + * packet is valid. Currently disabled. + * + * Experimentation shows that BX doesn't set the desc list + * length correctly, so we also simply ignore it and move + * on. 
If and when BX fixes this problem, we'll need to + * enable the warning+failure below. + */ + hdr = &(advertise->ad_fip_header); + if (!enx_wa_no_desc_list_len) { + uint_t pkt_data_sz; + + pkt_data_sz = (ntohs(hdr->hd_desc_list_len) + 2) << 2; + if (pkt_data_sz < sizeof (fip_advertise_t)) { + ENX_DPRINTF_WARN("advertisement from gw too small; " + "expected %x, got %x", sizeof (fip_advertise_t), + pkt_data_sz); + return (ENX_E_FAILURE); + } + } + + /* + * Validate all the header and descriptor types and lengths + */ + + if (hdr->hd_type != FIP_DESC_TYPE_VENDOR_ID || + hdr->hd_len != FIP_DESC_LEN_VENDOR_ID) { + ENX_DPRINTF_WARN("invalid type/len in fip basic header; " + "expected (%x,%x), got (%x,%x)", FIP_DESC_TYPE_VENDOR_ID, + FIP_DESC_LEN_VENDOR_ID, hdr->hd_type, hdr->hd_len); + return (ENX_E_FAILURE); + } + + desc_iba = &(advertise->ad_iba); + if (desc_iba->ia_type != FIP_DESC_TYPE_IBA || + desc_iba->ia_len != FIP_DESC_LEN_IBA) { + ENX_DPRINTF_WARN("invalid type/len in fip iba desc; " + "expected (%x,%x), got (%x,%x)", FIP_DESC_TYPE_IBA, + FIP_DESC_LEN_IBA, desc_iba->ia_type, desc_iba->ia_len); + return (ENX_E_FAILURE); + } + + desc_gwinfo = &(advertise->ad_gwinfo); + if (desc_gwinfo->gi_type != FIP_DESC_TYPE_EOIB_GW_INFO || + desc_gwinfo->gi_len != FIP_DESC_LEN_EOIB_GW_INFO) { + ENX_DPRINTF_WARN("invalid type/len in fip gwinfo desc; " + "expected (%x,%x), got (%x,%x)", + FIP_DESC_TYPE_EOIB_GW_INFO, FIP_DESC_LEN_EOIB_GW_INFO, + desc_gwinfo->gi_type, desc_gwinfo->gi_len); + return (ENX_E_FAILURE); + } + + desc_gwid = &(advertise->ad_gwid); + if (desc_gwid->id_type != FIP_DESC_TYPE_GW_ID || + desc_gwid->id_len != FIP_DESC_LEN_GW_ID) { + ENX_DPRINTF_WARN("invalid type/len in fip gwid desc; " + "expected (%x,%x), got (%x,%x)", + FIP_DESC_TYPE_GW_ID, FIP_DESC_LEN_GW_ID, + desc_gwid->id_type, desc_gwid->id_len); + return (ENX_E_FAILURE); + } + + desc_ka = &(advertise->ad_keep_alive); + if (desc_ka->ka_type != FIP_DESC_TYPE_KEEP_ALIVE || + desc_ka->ka_len != 
FIP_DESC_LEN_KEEP_ALIVE) { + ENX_DPRINTF_WARN("invalid type/len in fip ka desc; " + "expected (%x,%x), got (%x,%x)", + FIP_DESC_TYPE_KEEP_ALIVE, FIP_DESC_LEN_KEEP_ALIVE, + desc_ka->ka_type, desc_ka->ka_len); + return (ENX_E_FAILURE); + } + + /* + * Record if the gw is available for login ('A' bit in the header) + */ + flags = ntohs(hdr->hd_flags); + gwi = &(msg->u.gm_info); + gwi->gw_flag_available = (flags & FIP_BHFLAG_GWAVAIL) ? 1 : 0; + + /* + * Record if this was in response to a solicit request (unicast + * advertisement) or not ('S' bit in the header) + */ + gwi->gw_flag_ucast_advt = (flags & FIP_BHFLAG_SLCTMSG) ? 1 : 0; + msg->gm_type = (gwi->gw_flag_ucast_advt) ? + FIP_GW_ADVERTISE_UCAST : FIP_GW_ADVERTISE_MCAST; + + /* + * Record all info from the Infiniband Address descriptor + */ + gwi->gw_ctrl_qpn = (ntohl(desc_iba->ia_qpn) & FIP_IBA_QPN_MASK); + + sl_portid = ntohs(desc_iba->ia_sl_portid); + gwi->gw_portid = (sl_portid & FIP_IBA_PORTID_MASK); + gwi->gw_sl = ((sl_portid & FIP_IBA_SL_MASK) >> FIP_IBA_SL_SHIFT); + + gwi->gw_lid = ntohs(desc_iba->ia_lid); + + bcopy(desc_iba->ia_guid, &guid, sizeof (ib_guid_t)); + gwi->gw_guid = ntohll(guid); + + /* + * Record all info from the EoIB GW Information descriptor + */ + if (desc_gwinfo->gi_flags & FIP_GWI_HOST_ADMIND_VNICS_MASK) + gwi->gw_is_host_adm_vnics = 1; + else + gwi->gw_is_host_adm_vnics = 0; + + rss_qpn_num_net_vnics = ntohs(desc_gwinfo->gi_rss_qpn_num_net_vnics); + gwi->gw_num_net_vnics = (rss_qpn_num_net_vnics & + FIP_GWI_NUM_NET_VNICS_MASK); + gwi->gw_n_rss_qpn = ((rss_qpn_num_net_vnics & + FIP_GWI_RSS_QPN_MASK) >> FIP_GWI_RSS_QPN_SHIFT); + bcopy(desc_gwinfo->gi_vendor_id, gwi->gw_vendor_id, FIP_VENDOR_LEN); + (gwi->gw_vendor_id)[FIP_VENDOR_LEN] = '\0'; + + /* + * Record all info from the Gateway Identifier descriptor + */ + bcopy(desc_gwid->id_guid, &guid, sizeof (ib_guid_t)); + gwi->gw_system_guid = ntohll(guid); + bcopy(desc_gwid->id_sysname, gwi->gw_system_name, FIP_SYSNAME_LEN); + 
(gwi->gw_system_name)[FIP_SYSNAME_LEN] = '\0'; + bcopy(desc_gwid->id_portname, gwi->gw_port_name, FIP_PORTNAME_LEN); + (gwi->gw_port_name)[FIP_PORTNAME_LEN] = '\0'; + + /* + * Record all info from the Keep Alive descriptor + */ + gwi->gw_adv_period = ntohl(desc_ka->ka_gw_adv_period); + gwi->gw_ka_period = ntohl(desc_ka->ka_gw_ka_period); + gwi->gw_vnic_ka_period = ntohl(desc_ka->ka_vnic_ka_period); + + gwi->gw_next = NULL; + + return (ENX_E_SUCCESS); +} + +/* + * Rollback whatever we did for making a solicit packet + */ +static void +eibnx_rb_fip_make_solicit_pkt(eibnx_wqe_t *swqe) +{ + uint8_t *pkt = (uint8_t *)(uintptr_t)(swqe->qe_sgl.ds_va); + + bzero(pkt, sizeof (fip_solicit_t)); + swqe->qe_sgl.ds_len = swqe->qe_bufsz; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_hdlrs.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/ksynch.h> +#include <sys/callb.h> +#include <sys/ib/mgt/sm_attr.h> /* SM_INIT_TYPE_REPLY_... 
*/ + +#include <sys/ib/clients/eoib/enx_impl.h> + +/* + * Static function declarations + */ +static void eibnx_gw_is_alive(eibnx_gw_info_t *); +static void eibnx_gw_is_aware(eibnx_thr_info_t *, eibnx_gw_info_t *, boolean_t); +static void eibnx_process_rx(eibnx_thr_info_t *, ibt_wc_t *, eibnx_wqe_t *); +static void eibnx_handle_wcerr(uint8_t, eibnx_wqe_t *, eibnx_thr_info_t *); +static void eibnx_handle_login_ack(eibnx_thr_info_t *, uint8_t *); +static void eibnx_handle_gw_rebirth(eibnx_thr_info_t *, uint16_t); +static void eibnx_handle_gw_info_update(eibnx_thr_info_t *, uint16_t, void *); +static int eibnx_replace_portinfo(eibnx_thr_info_t *, ibt_hca_portinfo_t *, + uint_t); +static void eibnx_handle_port_events(ibt_hca_hdl_t, uint8_t); +static void eibnx_handle_hca_attach(ib_guid_t); +static void eibnx_handle_hca_detach(ib_guid_t); + +/* + * NDI event handle we need + */ +extern ndi_event_hdl_t enx_ndi_event_hdl; + +/* + * SM's init type reply flags + */ +#define ENX_PORT_ATTR_LOADED(itr) \ + (((itr) & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) +#define ENX_PORT_ATTR_NOT_PRESERVED(itr) \ + (((itr) & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) +#define ENX_PORT_PRES_NOT_PRESERVED(itr) \ + (((itr) & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 0) + +/* + * Port monitor progress flags (all flag values should be non-zero) + */ +#define ENX_MON_LINKSTATE_UP 0x01 +#define ENX_MON_FOUND_MCGS 0x02 +#define ENX_MON_SETUP_CQ 0x04 +#define ENX_MON_SETUP_UD_CHAN 0x08 +#define ENX_MON_SETUP_BUFS 0x10 +#define ENX_MON_SETUP_CQ_HDLR 0x20 +#define ENX_MON_JOINED_MCGS 0x40 +#define ENX_MON_MULTICAST_SLCT 0x80 +#define ENX_MON_MAX 0xFF + +/* + * Per-port thread to solicit, monitor and discover EoIB gateways + * and create the corresponding EoIB driver instances on the host. 
+ */ +void +eibnx_port_monitor(eibnx_thr_info_t *info) +{ + clock_t solicit_period_ticks; + clock_t deadline; + kmutex_t ci_lock; + callb_cpr_t ci; + char thr_name[MAXNAMELEN]; + + (void) snprintf(thr_name, MAXNAMELEN, ENX_PORT_MONITOR, + info->ti_pi->p_port_num); + + mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL); + CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, thr_name); + + info->ti_progress = 0; + + /* + * If the port is not active yet, wait for a port up event. The + * async handler, when it sees a port-up event, is expected to + * update the port_monitor's portinfo structure's p_linkstate + * and wake us up with ENX_EVENT_LINK_UP. + */ + while (info->ti_pi->p_linkstate != IBT_PORT_ACTIVE) { + mutex_enter(&info->ti_event_lock); + while ((info->ti_event & + (ENX_EVENT_LINK_UP | ENX_EVENT_DIE)) == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&info->ti_event_cv, &info->ti_event_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + if (info->ti_event & ENX_EVENT_DIE) { + mutex_exit(&info->ti_event_lock); + goto port_monitor_exit; + } + info->ti_event &= (~ENX_EVENT_LINK_UP); + mutex_exit(&info->ti_event_lock); + } + info->ti_progress |= ENX_MON_LINKSTATE_UP; + + /* + * Locate the multicast groups for sending solicit requests + * to the GW and receiving advertisements from the GW. If + * either of the mcg is not present, wait for them to be + * created by the GW. 
+ */ + while (eibnx_find_mgroups(info) != ENX_E_SUCCESS) { + mutex_enter(&info->ti_event_lock); + while ((info->ti_event & + (ENX_EVENT_MCGS_AVAILABLE | ENX_EVENT_DIE)) == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + cv_wait(&info->ti_event_cv, &info->ti_event_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + if (info->ti_event & ENX_EVENT_DIE) { + mutex_exit(&info->ti_event_lock); + goto port_monitor_exit; + } + info->ti_event &= (~ENX_EVENT_MCGS_AVAILABLE); + mutex_exit(&info->ti_event_lock); + } + info->ti_progress |= ENX_MON_FOUND_MCGS; + + /* + * Setup a shared CQ + */ + if (eibnx_setup_cq(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_setup_cq() failed, terminating " + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_SETUP_CQ; + + /* + * Setup UD channel + */ + if (eibnx_setup_ud_channel(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_setup_ud_channel() failed, terminating " + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_SETUP_UD_CHAN; + + /* + * Allocate/initialize any tx/rx buffers + */ + if (eibnx_setup_bufs(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_setup_bufs() failed, terminating " + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_SETUP_BUFS; + + /* + * Setup completion handler + */ + if (eibnx_setup_cq_handler(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_setup_cq_handler() failed, terminating " + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_SETUP_CQ_HDLR; + + /* + * Join EoIB multicast groups 
+ */ + if (eibnx_join_mcgs(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_join_mcgs() failed, terminating ", + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_JOINED_MCGS; + + /* + * Send SOLICIT pkt to the EoIB multicast group + */ + if (eibnx_fip_solicit_mcast(info) != ENX_E_SUCCESS) { + ENX_DPRINTF_ERR("eibnx_fip_solicit_mcast() failed, terminating " + "port monitor for (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + goto port_monitor_exit; + } + info->ti_progress |= ENX_MON_MULTICAST_SLCT; + + mutex_enter(&info->ti_event_lock); + + solicit_period_ticks = drv_usectohz(ENX_DFL_SOLICIT_PERIOD_USEC); + +periodic_solicit: + deadline = ddi_get_lbolt() + solicit_period_ticks; + while ((info->ti_event & (ENX_EVENT_TIMED_OUT | ENX_EVENT_DIE)) == 0) { + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_BEGIN(&ci); + mutex_exit(&ci_lock); + + if (cv_timedwait(&info->ti_event_cv, &info->ti_event_lock, + deadline) == -1) { + info->ti_event |= ENX_EVENT_TIMED_OUT; + } + + mutex_enter(&ci_lock); + CALLB_CPR_SAFE_END(&ci, &ci_lock); + mutex_exit(&ci_lock); + } + + if (info->ti_event & ENX_EVENT_DIE) { + mutex_exit(&info->ti_event_lock); + goto port_monitor_exit; + } + + if (info->ti_event & ENX_EVENT_TIMED_OUT) { + if (eibnx_fip_solicit_ucast(info, + &solicit_period_ticks) != ENX_E_SUCCESS) { + ENX_DPRINTF_WARN("failed to send solicit ucast to " + "gateways (hca_guid=0x%llx, port_num=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num); + } + info->ti_event &= ~ENX_EVENT_TIMED_OUT; + } + + goto periodic_solicit; + +port_monitor_exit: + if (info->ti_progress & ENX_MON_MULTICAST_SLCT) { + eibnx_cleanup_port_nodes(info); + info->ti_progress &= (~ENX_MON_MULTICAST_SLCT); + } + if (info->ti_progress & ENX_MON_JOINED_MCGS) { + eibnx_rb_join_mcgs(info); + info->ti_progress &= (~ENX_MON_JOINED_MCGS); + } + if (info->ti_progress & 
ENX_MON_SETUP_CQ_HDLR) { + eibnx_rb_setup_cq_handler(info); + info->ti_progress &= (~ENX_MON_SETUP_CQ_HDLR); + } + if (info->ti_progress & ENX_MON_SETUP_BUFS) { + eibnx_rb_setup_bufs(info); + info->ti_progress &= (~ENX_MON_SETUP_BUFS); + } + if (info->ti_progress & ENX_MON_SETUP_UD_CHAN) { + eibnx_rb_setup_ud_channel(info); + info->ti_progress &= (~ENX_MON_SETUP_UD_CHAN); + } + if (info->ti_progress & ENX_MON_SETUP_CQ) { + eibnx_rb_setup_cq(info); + info->ti_progress &= (~ENX_MON_SETUP_CQ); + } + if (info->ti_progress & ENX_MON_FOUND_MCGS) { + eibnx_rb_find_mgroups(info); + info->ti_progress &= (~ENX_MON_FOUND_MCGS); + } + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); +} + +/* + * Async subnet notices handler registered with IBTF + */ +/*ARGSUSED*/ +void +eibnx_subnet_notices_handler(void *arg, ib_gid_t gid, + ibt_subnet_event_code_t sn_evcode, ibt_subnet_event_t *sn_event) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + ib_gid_t notice_gid; + + switch (sn_evcode) { + case IBT_SM_EVENT_MCG_CREATED: + notice_gid = sn_event->sm_notice_gid; + + if ((notice_gid.gid_prefix == enx_solicit_mgid.gid_prefix && + notice_gid.gid_guid == enx_solicit_mgid.gid_guid) || + (notice_gid.gid_prefix == enx_advertise_mgid.gid_prefix && + notice_gid.gid_guid == enx_advertise_mgid.gid_guid)) { + + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + mutex_enter(&ti->ti_event_lock); + ti->ti_event |= ENX_EVENT_MCGS_AVAILABLE; + cv_broadcast(&ti->ti_event_cv); + mutex_exit(&ti->ti_event_lock); + } + mutex_exit(&ss->nx_lock); + } + break; + + case IBT_SM_EVENT_MCG_DELETED: + break; + + default: + break; + } +} + +/* + * Async event handler registered with IBTF + */ +/*ARGSUSED*/ +void +eibnx_async_handler(void *clnt_pvt, ibt_hca_hdl_t hca, + ibt_async_code_t code, ibt_async_event_t *event) +{ + switch (code) { + case IBT_ERROR_CATASTROPHIC_CHAN: + case IBT_ERROR_INVALID_REQUEST_CHAN: + case 
IBT_ERROR_ACCESS_VIOLATION_CHAN: + case IBT_ERROR_CQ: + case IBT_ERROR_CATASTROPHIC_SRQ: + ENX_DPRINTF_ERR("ibt ERROR event 0x%x received " + "(hca_guid=0x%llx)", code, event->ev_hca_guid); + break; + + case IBT_ERROR_PORT_DOWN: + ENX_DPRINTF_WARN("ibt PORT_DOWN event received " + "(hca_guid=0x%llx, port_num=0x%x)", + event->ev_hca_guid, event->ev_port); + break; + + case IBT_EVENT_PORT_UP: + ENX_DPRINTF_WARN("ibt PORT_UP event received " + "(hca_guid=0x%llx, port_num=0x%x)", + event->ev_hca_guid, event->ev_port); + eibnx_handle_port_events(hca, event->ev_port); + break; + + case IBT_PORT_CHANGE_EVENT: + ENX_DPRINTF_WARN("ibt PORT_CHANGE event received " + "(hca_guid=0x%llx, port_num=0x%x)", + event->ev_hca_guid, event->ev_port); + eibnx_handle_port_events(hca, event->ev_port); + break; + + case IBT_CLNT_REREG_EVENT: + ENX_DPRINTF_WARN("ibt CLNT_REREG event received " + "(hca_guid=0x%llx, port_num=0x%x)", + event->ev_hca_guid, event->ev_port); + eibnx_handle_port_events(hca, event->ev_port); + break; + + case IBT_HCA_ATTACH_EVENT: + ENX_DPRINTF_VERBOSE("ibt HCA_ATTACH event received " + "(new hca_guid=0x%llx)", event->ev_hca_guid); + eibnx_handle_hca_attach(event->ev_hca_guid); + break; + + case IBT_HCA_DETACH_EVENT: + ENX_DPRINTF_VERBOSE("ibt HCA_DETACH event received " + "(target hca_guid=0x%llx)", event->ev_hca_guid); + eibnx_handle_hca_detach(event->ev_hca_guid); + break; + + default: + ENX_DPRINTF_VERBOSE("ibt UNSUPPORTED event 0x%x received " + "(hca_guid=0x%llx)", code, event->ev_hca_guid); + break; + } +} + +boolean_t +eibnx_is_gw_dead(eibnx_gw_info_t *gwi) +{ + int64_t cur_lbolt; + + cur_lbolt = ddi_get_lbolt64(); + + mutex_enter(&gwi->gw_adv_lock); + if ((cur_lbolt - gwi->gw_adv_last_lbolt) > gwi->gw_adv_timeout_ticks) { + gwi->gw_adv_flag = ENX_GW_DEAD; + mutex_exit(&gwi->gw_adv_lock); + return (B_TRUE); + } + mutex_exit(&gwi->gw_adv_lock); + + return (B_FALSE); +} + +static void +eibnx_gw_is_alive(eibnx_gw_info_t *gwi) +{ + /* + * We've just received a 
multicast advertisement from this + * gateway. Multicast or unicast, this means that the gateway + * is alive. Record this timestamp (in ticks). + */ + mutex_enter(&gwi->gw_adv_lock); + gwi->gw_adv_last_lbolt = ddi_get_lbolt64(); + if (gwi->gw_adv_flag == ENX_GW_DEAD) { + gwi->gw_adv_flag = ENX_GW_ALIVE; + } + mutex_exit(&gwi->gw_adv_lock); +} + +static void +eibnx_gw_is_aware(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi, + boolean_t gwi_changed) +{ + eib_gw_info_t eib_gwi; + boolean_t post_rebirth_event = B_FALSE; + + /* + * We're here when we receive a unicast advertisement from a + * gateway. If this gateway was discovered earlier but was in + * a dead state, this means it has come back alive and become + * aware of us. We may need to inform any EoIB children + * waiting for notification. Note that if this gateway is + * being discovered for the first time now, we wouldn't have + * created the binding eoib node for it (we will do that when + * we return from this routine), so the "rebirth" and "gw info + * update" event postings will be NOPs. + */ + mutex_enter(&gwi->gw_adv_lock); + gwi->gw_adv_last_lbolt = ddi_get_lbolt64(); + if (gwi->gw_adv_flag != ENX_GW_AWARE) { + post_rebirth_event = B_TRUE; + } + gwi->gw_adv_flag = ENX_GW_AWARE; + mutex_exit(&gwi->gw_adv_lock); + + /* + * If we have a gateway information update event, we post that + * first, so any rebirth event processed later will have the + * correct gateway information. 
+ */
+	if (gwi_changed) {
+		eib_gwi.gi_system_guid = gwi->gw_system_guid;
+		eib_gwi.gi_guid = gwi->gw_guid;
+		eib_gwi.gi_sn_prefix = gwi->gw_addr.ga_gid.gid_prefix;
+		eib_gwi.gi_adv_period = gwi->gw_adv_period;
+		eib_gwi.gi_ka_period = gwi->gw_ka_period;
+		eib_gwi.gi_vnic_ka_period = gwi->gw_vnic_ka_period;
+		eib_gwi.gi_ctrl_qpn = gwi->gw_ctrl_qpn;
+		eib_gwi.gi_lid = gwi->gw_lid;
+		eib_gwi.gi_portid = gwi->gw_portid;
+		eib_gwi.gi_num_net_vnics = gwi->gw_num_net_vnics;
+		eib_gwi.gi_flag_available = gwi->gw_flag_available;
+		eib_gwi.gi_is_host_adm_vnics = gwi->gw_is_host_adm_vnics;
+		eib_gwi.gi_sl = gwi->gw_sl;
+		eib_gwi.gi_n_rss_qpn = gwi->gw_n_rss_qpn;
+		bcopy(gwi->gw_system_name, eib_gwi.gi_system_name,
+		    EIB_GW_SYSNAME_LEN);
+		bcopy(gwi->gw_port_name, eib_gwi.gi_port_name,
+		    EIB_GW_PORTNAME_LEN);
+		bcopy(gwi->gw_vendor_id, eib_gwi.gi_vendor_id,
+		    EIB_GW_VENDOR_LEN);
+
+		eibnx_handle_gw_info_update(info, eib_gwi.gi_portid, &eib_gwi);
+	}
+	if (post_rebirth_event) {
+		eibnx_handle_gw_rebirth(info, gwi->gw_portid);
+	}
+}
+
+/*
+ * Thread to create eoib nodes and online instances
+ */
+void
+eibnx_create_eoib_node(void)
+{
+	eibnx_t *ss = enx_global_ss;
+	eibnx_nodeq_t *node;
+	kmutex_t ci_lock;
+	callb_cpr_t ci;
+
+	mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
+	CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, ENX_NODE_CREATOR);
+
+wait_for_node_to_create:
+	mutex_enter(&ss->nx_nodeq_lock);
+
+	while ((ss->nx_nodeq == NULL) && (ss->nx_nodeq_thr_die == 0)) {
+		mutex_enter(&ci_lock);
+		CALLB_CPR_SAFE_BEGIN(&ci);
+		mutex_exit(&ci_lock);
+
+		cv_wait(&ss->nx_nodeq_cv, &ss->nx_nodeq_lock);
+
+		mutex_enter(&ci_lock);
+		CALLB_CPR_SAFE_END(&ci, &ci_lock);
+		mutex_exit(&ci_lock);
+	}
+
+	/*
+	 * If this is not really a work item, but a request for us to
+	 * die, throw away all pending work requests and just die. 
+ */ + if (ss->nx_nodeq_thr_die) { + while (ss->nx_nodeq) { + node = ss->nx_nodeq; + ss->nx_nodeq = node->nc_next; + node->nc_next = NULL; + + kmem_free(node, sizeof (eibnx_nodeq_t)); + } + mutex_exit(&ss->nx_nodeq_lock); + + mutex_enter(&ci_lock); + CALLB_CPR_EXIT(&ci); + mutex_destroy(&ci_lock); + + return; + } + + /* + * Grab the first node entry from the queue + */ + ASSERT(ss->nx_nodeq != NULL); + node = ss->nx_nodeq; + ss->nx_nodeq = node->nc_next; + node->nc_next = NULL; + + mutex_exit(&ss->nx_nodeq_lock); + + (void) eibnx_configure_node(node->nc_info, node->nc_gwi, NULL); + + kmem_free(node, sizeof (eibnx_nodeq_t)); + goto wait_for_node_to_create; + + /*NOTREACHED*/ +} + +/* + * Tx and Rx completion interrupt handler. Guaranteed to be single + * threaded and nonreentrant for this CQ. + */ +void +eibnx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg) +{ + eibnx_thr_info_t *info = arg; + + if (info->ti_cq_hdl != cq_hdl) { + ENX_DPRINTF_DEBUG("eibnx_comp_intr: " + "cq_hdl(0x%llx) != info->ti_cq_hdl(0x%llx), " + "ignoring completion", cq_hdl, info->ti_cq_hdl); + return; + } + + ASSERT(info->ti_softint_hdl != NULL); + + (void) ddi_intr_trigger_softint(info->ti_softint_hdl, NULL); +} + +/* + * Send and Receive completion handler functions for EoIB nexus + */ + +/*ARGSUSED*/ +uint_t +eibnx_comp_handler(caddr_t arg1, caddr_t arg2) +{ + eibnx_thr_info_t *info = (eibnx_thr_info_t *)arg1; + ibt_wc_t *wc; + eibnx_wqe_t *wqe; + ibt_status_t ret; + uint_t polled; + int i; + + /* + * Make sure the port monitor isn't killed if we're in the completion + * handler. If the port monitor thread is already being killed, we'll + * stop processing completions. 
+ */ + mutex_enter(&info->ti_event_lock); + if (info->ti_event & (ENX_EVENT_DIE | ENX_EVENT_COMPLETION)) { + mutex_exit(&info->ti_event_lock); + return ((uint_t)ENX_E_SUCCESS); + } + info->ti_event |= ENX_EVENT_COMPLETION; + mutex_exit(&info->ti_event_lock); + + /* + * Re-arm the notification callback before we start polling + * the completion queue. There's nothing much we can do if the + * enable_cq_notify fails - we issue a warning and move on. + */ + ret = ibt_enable_cq_notify(info->ti_cq_hdl, IBT_NEXT_COMPLETION); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_enable_cq_notify(cq_hdl=0x%llx) " + "failed, ret=%d", info->ti_cq_hdl, ret); + } + + /* + * Handle tx and rx completions + */ + while ((ret = ibt_poll_cq(info->ti_cq_hdl, info->ti_wc, info->ti_cq_sz, + &polled)) == IBT_SUCCESS) { + for (wc = info->ti_wc, i = 0; i < polled; i++, wc++) { + wqe = (eibnx_wqe_t *)(uintptr_t)wc->wc_id; + if (wc->wc_status != IBT_WC_SUCCESS) { + eibnx_handle_wcerr(wc->wc_status, wqe, info); + } else if (wqe->qe_type == ENX_QETYP_RWQE) { + eibnx_process_rx(info, wc, wqe); + eibnx_return_rwqe(info, wqe); + } else { + eibnx_return_swqe(wqe); + } + } + } + + /* + * On the way out, make sure we wake up any pending death requestor + * for the port-monitor thread. Note that we need to do a cv_broadcast() + * here since there could be multiple threads sleeping on the event cv + * and we want to make sure all waiters get a chance to see if it's + * their turn. 
+ */ + mutex_enter(&info->ti_event_lock); + info->ti_event &= (~ENX_EVENT_COMPLETION); + cv_broadcast(&info->ti_event_cv); + mutex_exit(&info->ti_event_lock); + + return (DDI_INTR_CLAIMED); +} + +/* + * Rx processing code + */ +static void +eibnx_process_rx(eibnx_thr_info_t *info, ibt_wc_t *wc, eibnx_wqe_t *wqe) +{ + eibnx_gw_msg_t msg; + eibnx_gw_info_t *gwi; + eibnx_gw_info_t *orig_gwi; + eibnx_gw_info_t *new_gwi; + uint_t orig_gw_state; + uint8_t *pkt = (uint8_t *)(uintptr_t)(wqe->qe_sgl.ds_va); + boolean_t gwi_changed; + + /* + * We'll simply drop any packet (including broadcast advertisements + * from gws) we receive before we've done our solicitation broadcast. + */ + if (info->ti_mcast_done == 0) { + return; + } + + /* + * Skip the GRH and parse the message in the packet + */ + if (eibnx_fip_parse_pkt(pkt + ENX_GRH_SZ, &msg) != ENX_E_SUCCESS) { + return; + } + + /* + * If it was a login ack for one of our children, we need to pass + * it on to the child + */ + if (msg.gm_type == FIP_VNIC_LOGIN_ACK) { + eibnx_handle_login_ack(info, pkt); + return; + } + + /* + * Other than that, we only handle gateway advertisements + */ + if (msg.gm_type != FIP_GW_ADVERTISE_MCAST && + msg.gm_type != FIP_GW_ADVERTISE_UCAST) { + return; + } + + gwi = &msg.u.gm_info; + + /* + * State machine to create eoib instances. Whether this advertisement + * is from a new gateway or an old gateway that we already know about, + * if this was a unicast response to our earlier solicitation and it's + * the first time we're receiving it from this gateway, we're ready to + * login, so we create the EoIB instance for it. 
+ */ + orig_gwi = eibnx_find_gw_in_gwlist(info, gwi); + if (orig_gwi == NULL) { + if (gwi->gw_flag_available == 0) { + gwi->gw_state = ENX_GW_STATE_UNAVAILABLE; + gwi->gw_adv_flag = ENX_GW_ALIVE; + (void) eibnx_add_gw_to_gwlist(info, gwi, wc, pkt); + } else if (gwi->gw_flag_ucast_advt == 0) { + gwi->gw_state = ENX_GW_STATE_AVAILABLE; + gwi->gw_adv_flag = ENX_GW_ALIVE; + (void) eibnx_add_gw_to_gwlist(info, gwi, wc, pkt); + } else { + gwi->gw_state = ENX_GW_STATE_READY_TO_LOGIN; + gwi->gw_adv_flag = ENX_GW_AWARE; + if ((new_gwi = eibnx_add_gw_to_gwlist(info, gwi, + wc, pkt)) != NULL) { + eibnx_queue_for_creation(info, new_gwi); + } + } + } else { + orig_gw_state = orig_gwi->gw_state; + if (gwi->gw_flag_available == 0) { + gwi->gw_state = ENX_GW_STATE_UNAVAILABLE; + eibnx_replace_gw_in_gwlist(info, orig_gwi, gwi, + wc, pkt, NULL); + eibnx_gw_is_alive(orig_gwi); + + } else if (gwi->gw_flag_ucast_advt == 0) { + if (orig_gw_state == ENX_GW_STATE_UNAVAILABLE) { + gwi->gw_state = ENX_GW_STATE_AVAILABLE; + } else { + gwi->gw_state = orig_gw_state; + } + eibnx_replace_gw_in_gwlist(info, orig_gwi, gwi, + wc, pkt, NULL); + eibnx_gw_is_alive(orig_gwi); + + } else { + gwi->gw_state = ENX_GW_STATE_READY_TO_LOGIN; + eibnx_replace_gw_in_gwlist(info, orig_gwi, gwi, + wc, pkt, &gwi_changed); + eibnx_gw_is_aware(info, orig_gwi, gwi_changed); + + if (orig_gw_state != ENX_GW_STATE_READY_TO_LOGIN) + eibnx_queue_for_creation(info, orig_gwi); + } + } +} + +/*ARGSUSED*/ +static void +eibnx_handle_wcerr(uint8_t wcerr, eibnx_wqe_t *wqe, eibnx_thr_info_t *info) +{ + /* + * Currently, all we do is report + */ + switch (wcerr) { + case IBT_WC_WR_FLUSHED_ERR: + ENX_DPRINTF_VERBOSE("IBT_WC_WR_FLUSHED_ERR seen " + "(hca_guid=0x%llx, port_num=0x%x, wqe_type=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num, wqe->qe_type); + break; + + case IBT_WC_LOCAL_CHAN_OP_ERR: + ENX_DPRINTF_ERR("IBT_WC_LOCAL_CHAN_OP_ERR seen " + "(hca_guid=0x%llx, port_num=0x%x, wqe_type=0x%x)", + info->ti_hca_guid, 
info->ti_pi->p_port_num, wqe->qe_type); + break; + + case IBT_WC_LOCAL_PROTECT_ERR: + ENX_DPRINTF_ERR("IBT_WC_LOCAL_PROTECT_ERR seen " + "(hca_guid=0x%llx, port_num=0x%x, wqe_type=0x%x)", + info->ti_hca_guid, info->ti_pi->p_port_num, wqe->qe_type); + break; + } +} + +static void +eibnx_handle_login_ack(eibnx_thr_info_t *info, uint8_t *pkt) +{ + eibnx_t *ss = enx_global_ss; + fip_login_ack_t *ack; + fip_desc_vnic_login_t *login; + ddi_eventcookie_t cookie; + dev_info_t *rdip; + uint16_t vnic_id; + uint16_t inst; + int ret; + + /* + * When we get login acknowledgements, we simply invoke the + * appropriate EoIB driver callback to process it on behalf + * of the driver instance. We will let the callback do error + * checks. + */ + ack = (fip_login_ack_t *)(pkt + ENX_GRH_SZ); + login = &(ack->ak_vnic_login); + vnic_id = ntohs(login->vl_vnic_id); + inst = EIB_DEVI_INSTANCE(vnic_id); + + if ((rdip = eibnx_find_child_dip_by_inst(info, inst)) == NULL) { + ENX_DPRINTF_DEBUG("no eoib child with instance 0x%x found " + "for (hca_guid=0x%llx, port_num=0x%x)", inst, + info->ti_hca_guid, info->ti_pi->p_port_num); + return; + } + + ret = ndi_event_retrieve_cookie(enx_ndi_event_hdl, rdip, + EIB_NDI_EVENT_LOGIN_ACK, &cookie, NDI_EVENT_NOPASS); + if (ret != NDI_SUCCESS) { + ENX_DPRINTF_WARN("no login-ack cookie for (hca_guid=0x%llx, " + "port_num=0x%x, eoib_inst=0x%x), ret=%d", info->ti_hca_guid, + info->ti_pi->p_port_num, inst, ret); + return; + } + + (void) ndi_post_event(ss->nx_dip, rdip, cookie, (void *)pkt); +} + +static void +eibnx_handle_gw_rebirth(eibnx_thr_info_t *info, uint16_t portid) +{ + eibnx_t *ss = enx_global_ss; + ddi_eventcookie_t cookie; + dev_info_t *rdip; + int ret; + + if ((rdip = eibnx_find_child_dip_by_gw(info, portid)) == NULL) { + ENX_DPRINTF_WARN("no eoib child bound to gw portid 0x%x " + "found for (hca_guid=0x%llx, port_num=0x%x)", + portid, info->ti_hca_guid, info->ti_pi->p_port_num); + return; + } + + ret = ndi_event_retrieve_cookie(enx_ndi_event_hdl, 
rdip, + EIB_NDI_EVENT_GW_AVAILABLE, &cookie, NDI_EVENT_NOPASS); + if (ret != NDI_SUCCESS) { + ENX_DPRINTF_WARN("no gw-available cookie for (hca_guid=0x%llx, " + "port_num=0x%x, gw_portid=0x%x), ret=%d", info->ti_hca_guid, + info->ti_pi->p_port_num, portid, ret); + return; + } + + (void) ndi_post_event(ss->nx_dip, rdip, cookie, NULL); +} + +static void +eibnx_handle_gw_info_update(eibnx_thr_info_t *info, uint16_t portid, + void *new_gw_info) +{ + eibnx_t *ss = enx_global_ss; + ddi_eventcookie_t cookie; + dev_info_t *rdip; + int ret; + + if ((rdip = eibnx_find_child_dip_by_gw(info, portid)) == NULL) { + ENX_DPRINTF_WARN("no eoib child bound to gw portid 0x%x " + "found for (hca_guid=0x%llx, port_num=0x%x)", + portid, info->ti_hca_guid, info->ti_pi->p_port_num); + return; + } + + ret = ndi_event_retrieve_cookie(enx_ndi_event_hdl, rdip, + EIB_NDI_EVENT_GW_INFO_UPDATE, &cookie, NDI_EVENT_NOPASS); + if (ret != NDI_SUCCESS) { + ENX_DPRINTF_WARN("no gw-info-update cookie for " + "(hca_guid=0x%llx, port_num=0x%x, gw_portid=0x%x), " + "ret=%d", info->ti_hca_guid, info->ti_pi->p_port_num, + portid, ret); + return; + } + + (void) ndi_post_event(ss->nx_dip, rdip, cookie, new_gw_info); +} + +static int +eibnx_replace_portinfo(eibnx_thr_info_t *ti, ibt_hca_portinfo_t *new_pi, + uint_t new_size_pi) +{ + eibnx_t *ss = enx_global_ss; + eibnx_hca_t *hca; + eibnx_port_t *port; + + mutex_enter(&ss->nx_lock); + + for (hca = ss->nx_hca; hca; hca = hca->hc_next) { + if (hca->hc_hdl == ti->ti_hca) + break; + } + + if (hca == NULL) { + ENX_DPRINTF_WARN("hca hdl (0x%llx) not found in hca list", + ti->ti_hca); + mutex_exit(&ss->nx_lock); + return (ENX_E_FAILURE); + } + + for (port = hca->hc_port; port; port = port->po_next) { + if (port->po_pi == ti->ti_pi) { + ibt_free_portinfo(port->po_pi, port->po_pi_size); + port->po_pi = new_pi; + port->po_pi_size = new_size_pi; + ti->ti_pi = port->po_pi; + break; + } + } + + if (port == NULL) { + ENX_DPRINTF_WARN("portinfo (0x%llx) not found in hca 
list", + ti->ti_pi); + mutex_exit(&ss->nx_lock); + return (ENX_E_FAILURE); + } + + mutex_exit(&ss->nx_lock); + + return (ENX_E_SUCCESS); +} + +static void +eibnx_handle_port_events(ibt_hca_hdl_t ev_hca, uint8_t ev_portnum) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + ibt_hca_portinfo_t *pi; + ibt_status_t ret; + uint_t num_pi; + uint_t size_pi; + uint8_t itr; + + /* + * Find the port monitor thread that matches the event hca and + * portnum + */ + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + if ((ti->ti_hca == ev_hca) && + (ti->ti_pi->p_port_num == ev_portnum)) { + break; + } + } + mutex_exit(&ss->nx_lock); + + if (ti == NULL) + return; + + /* + * See if we need to rejoin the mcgs for this port and do so if true + */ + ret = ibt_query_hca_ports(ev_hca, ev_portnum, &pi, &num_pi, &size_pi); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_query_hca_ports() failed with %d", ret); + return; + } else if (num_pi != 1 || pi->p_linkstate != IBT_PORT_ACTIVE) { + ENX_DPRINTF_WARN("ibt_query_hca_ports(port_num=%d) failed, " + "num_pi=%d, linkstate=0x%x", ev_portnum, num_pi, + pi->p_linkstate); + ibt_free_portinfo(pi, size_pi); + return; + } + + itr = pi->p_init_type_reply; + if (ENX_PORT_ATTR_LOADED(itr) && ENX_PORT_ATTR_NOT_PRESERVED(itr)) { + /* + * If our port's base lid has changed, we need to replace + * the saved portinfo in our lists with the new one before + * going further. + */ + if (ti->ti_pi->p_base_lid != pi->p_base_lid) { + if (eibnx_replace_portinfo(ti, pi, size_pi) == + ENX_E_SUCCESS) { + pi = NULL; + size_pi = 0; + } + } + } + + /* + * If the port monitor was stuck waiting for the link to come up, + * let it know that it is up now. 
+ */ + mutex_enter(&ti->ti_event_lock); + if ((ti->ti_progress & ENX_MON_LINKSTATE_UP) != ENX_MON_LINKSTATE_UP) { + ti->ti_pi->p_linkstate = IBT_PORT_ACTIVE; + ti->ti_event |= ENX_EVENT_LINK_UP; + cv_broadcast(&ti->ti_event_cv); + } + mutex_exit(&ti->ti_event_lock); + + if (ENX_PORT_PRES_NOT_PRESERVED(itr)) { + if (ti->ti_progress & ENX_MON_JOINED_MCGS) + (void) eibnx_rejoin_mcgs(ti); + } + + if (pi != NULL) + ibt_free_portinfo(pi, size_pi); +} + +static void +eibnx_handle_hca_attach(ib_guid_t new_hca_guid) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + eibnx_hca_t *hca; + eibnx_port_t *port; + + /* + * All we need to do is to start a port monitor for all the ports + * on the new HCA. To do this, go through our current port monitors + * and see if we already have a monitor for this HCA - if so, print + * a warning and return. + */ + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + if (ti->ti_hca_guid == new_hca_guid) { + ENX_DPRINTF_VERBOSE("hca (guid=0x%llx) already " + "attached", new_hca_guid); + mutex_exit(&ss->nx_lock); + return; + } + } + mutex_exit(&ss->nx_lock); + + /* + * If we don't have it in our list, process the HCA and start the + * port monitors + */ + if ((hca = eibnx_prepare_hca(new_hca_guid)) != NULL) { + mutex_enter(&ss->nx_lock); + + hca->hc_next = ss->nx_hca; + ss->nx_hca = hca; + + for (port = hca->hc_port; port; port = port->po_next) { + ti = eibnx_start_port_monitor(hca, port); + + ti->ti_next = ss->nx_thr_info; + ss->nx_thr_info = ti; + } + mutex_exit(&ss->nx_lock); + } +} + +static void +eibnx_handle_hca_detach(ib_guid_t del_hca_guid) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + eibnx_thr_info_t *ti_stop_list = NULL; + eibnx_thr_info_t *ti_prev; + eibnx_thr_info_t *ti_next; + eibnx_hca_t *hca; + eibnx_hca_t *hca_prev; + + /* + * We need to locate all monitor threads for this HCA and stop them + */ + mutex_enter(&ss->nx_lock); + ti_prev = NULL; + for (ti = ss->nx_thr_info; ti; ti 
= ti_next) { + ti_next = ti->ti_next; + + if (ti->ti_hca_guid != del_hca_guid) { + ti_prev = ti; + } else { + /* + * Take it out from the good list + */ + if (ti_prev) + ti_prev->ti_next = ti_next; + else + ss->nx_thr_info = ti_next; + + /* + * And put it in the to-stop list + */ + ti->ti_next = ti_stop_list; + ti_stop_list = ti; + } + } + mutex_exit(&ss->nx_lock); + + /* + * Ask all the port_monitor threads to die. + */ + for (ti = ti_stop_list; ti; ti = ti_next) { + ti_next = ti->ti_next; + eibnx_stop_port_monitor(ti); + } + + /* + * Now, locate the HCA in our list and release all HCA related + * resources. + */ + mutex_enter(&ss->nx_lock); + hca_prev = NULL; + for (hca = ss->nx_hca; hca; hca = hca->hc_next) { + if (hca->hc_guid != del_hca_guid) { + hca_prev = hca; + } else { + if (hca_prev) { + hca_prev->hc_next = hca->hc_next; + } else { + ss->nx_hca = hca->hc_next; + } + hca->hc_next = NULL; + break; + } + } + mutex_exit(&ss->nx_lock); + + if (hca) { + (void) eibnx_cleanup_hca(hca); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_ibt.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,1261 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +/* + * Module (static) info passed to IBTL during ibt_attach + */ +static ibt_clnt_modinfo_t eibnx_clnt_modinfo = { + IBTI_V_CURR, + IBT_GENERIC, + eibnx_async_handler, + NULL, + "EoIB Nexus" +}; + +ib_gid_t enx_advertise_mgid; +ib_gid_t enx_solicit_mgid; + +/* + * Static function declarations + */ +static int eibnx_state_init(void); +static int eibnx_setup_txbufs(eibnx_thr_info_t *); +static int eibnx_setup_rxbufs(eibnx_thr_info_t *); +static int eibnx_join_solicit_mcg(eibnx_thr_info_t *); +static int eibnx_join_advertise_mcg(eibnx_thr_info_t *); +static int eibnx_rb_ibt_init(eibnx_t *); +static void eibnx_rb_state_init(void); +static void eibnx_rb_setup_txbufs(eibnx_thr_info_t *); +static void eibnx_rb_setup_rxbufs(eibnx_thr_info_t *); +static void eibnx_rb_join_solicit_mcg(eibnx_thr_info_t *); +static void eibnx_rb_join_advertise_mcg(eibnx_thr_info_t *); + +/* + * eibnx_ibt_init() is expected to be called during the nexus driver's + * attach time; given that there is only one instance of the nexus + * driver allowed, and no threads are active before the initialization + * is complete, we don't really have to acquire any driver specific mutex + * within this routine. 
+ */ +int +eibnx_ibt_init(eibnx_t *ss) +{ + eibnx_hca_t *hca_list; + eibnx_hca_t *hca_tail; + eibnx_hca_t *hca; + uint_t num_hcas; + ib_guid_t *hca_guids; + ibt_status_t ret; + int i; + + /* + * Do per-state initialization + */ + (void) eibnx_state_init(); + + /* + * Attach to IBTL + */ + if ((ret = ibt_attach(&eibnx_clnt_modinfo, ss->nx_dip, ss, + &ss->nx_ibt_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_attach() failed, ret=%d", ret); + eibnx_rb_state_init(); + return (ENX_E_FAILURE); + } + + /* + * Get the list of HCA guids on the system + */ + if ((num_hcas = ibt_get_hca_list(&hca_guids)) == 0) { + ENX_DPRINTF_VERBOSE("no HCAs found on the system"); + if ((ret = ibt_detach(ss->nx_ibt_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_detach() failed, ret=%d", ret); + } + ss->nx_ibt_hdl = NULL; + return (ENX_E_FAILURE); + } + + /* + * Open the HCAs and store the handles + */ + hca_list = hca_tail = NULL; + for (i = 0; i < num_hcas; i++) { + /* + * If we cannot open a HCA, allocate a protection domain + * on it or get portinfo on it, print an error and move on + * to the next HCA. 
Otherwise, queue it up in our hca list + */ + if ((hca = eibnx_prepare_hca(hca_guids[i])) == NULL) + continue; + + if (hca_tail) { + hca_tail->hc_next = hca; + } else { + hca_list = hca; + } + hca_tail = hca; + } + + /* + * Free the HCA guid list we've allocated via ibt_get_hca_list() + */ + ibt_free_hca_list(hca_guids, num_hcas); + + /* + * Put the hca list in the state structure + */ + mutex_enter(&ss->nx_lock); + ss->nx_hca = hca_list; + mutex_exit(&ss->nx_lock); + + /* + * Register for subnet notices + */ + ibt_register_subnet_notices(ss->nx_ibt_hdl, + eibnx_subnet_notices_handler, ss); + + return (ENX_E_SUCCESS); +} + +static int +eibnx_state_init(void) +{ + eibnx_t *ss = enx_global_ss; + kthread_t *kt; + + /* + * Initialize synchronization primitives + */ + mutex_init(&ss->nx_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ss->nx_nodeq_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ss->nx_nodeq_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&ss->nx_busop_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ss->nx_busop_cv, NULL, CV_DEFAULT, NULL); + + /* + * Initialize well-known mgids: there must be a better way to + * do this instead of having to express every single gid as a + * tuple of two 8-byte integer quantities. + */ + enx_solicit_mgid.gid_prefix = EIB_GUID_SOLICIT_PREFIX; + enx_solicit_mgid.gid_guid = 0; + enx_advertise_mgid.gid_prefix = EIB_GUID_ADVERTISE_PREFIX; + enx_advertise_mgid.gid_guid = 0; + + /* + * Start up the eoib node creation thread + */ + kt = thread_create(NULL, 0, eibnx_create_eoib_node, NULL, 0, + &p0, TS_RUN, minclsyspri); + ss->nx_nodeq_kt_did = kt->t_did; + + return (ENX_E_SUCCESS); +} + +/* + * Locate the two multicast groups: the All-EoIB-GWs-GID and + * All-EoIB-ENodes-GID. Make sure the MTU is something that + * we can work with and Qkey is as expected. 
+ */ +int +eibnx_find_mgroups(eibnx_thr_info_t *info) +{ + ibt_hca_portinfo_t *pi = info->ti_pi; + ibt_mcg_attr_t mcg_attr; + ib_gid_t rgid; + ibt_status_t ret; + uint_t entries; + + mutex_enter(&info->ti_mcg_lock); + + if ((info->ti_mcg_status & ENX_MCGS_FOUND) == ENX_MCGS_FOUND) { + mutex_exit(&info->ti_mcg_lock); + return (ENX_E_SUCCESS); + } + + /* + * Request GID defining this port + */ + rgid = pi->p_sgid_tbl[0]; + + /* + * First, locate the multicast group to use for sending solicit + * requests to the GW + */ + bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); + mcg_attr.mc_mgid = enx_solicit_mgid; + mcg_attr.mc_pkey = (ib_pkey_t)EIB_ADMIN_PKEY; + mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY; + + if ((ret = ibt_query_mcg(rgid, &mcg_attr, 1, &info->ti_solicit_mcg, + &entries)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("solicit mcg (gid=%llx.%llx) not found, " + "ibt_query_mcg() returned %d", enx_solicit_mgid.gid_prefix, + enx_solicit_mgid.gid_guid, ret); + goto find_mgroups_fail; + } + + /* + * Make sure the multicast mtu isn't bigger than the port mtu + * and the multicast group's qkey is the same as EIB_FIP_QKEY. 
+ */ + if (info->ti_solicit_mcg->mc_mtu > pi->p_mtu) { + ENX_DPRINTF_WARN("solicit mcg (gid=%llx.%llx) mtu too big, " + "0x%x > 0x%x", enx_solicit_mgid.gid_prefix, + enx_solicit_mgid.gid_guid, info->ti_solicit_mcg->mc_mtu, + pi->p_mtu); + goto find_mgroups_fail; + } + if (info->ti_solicit_mcg->mc_qkey != EIB_FIP_QKEY) { + ENX_DPRINTF_WARN("solicit mcg (gid=%llx.%llx) qkey bad, " + "actual=0x%x, expected=0x%x", enx_solicit_mgid.gid_prefix, + enx_solicit_mgid.gid_guid, info->ti_solicit_mcg->mc_qkey, + EIB_FIP_QKEY); + goto find_mgroups_fail; + } + + /* + * Now, locate the multicast group for receiving discover + * advertisements from the GW + */ + bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); + mcg_attr.mc_mgid = enx_advertise_mgid; + mcg_attr.mc_pkey = (ib_pkey_t)EIB_ADMIN_PKEY; + mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY; + + if ((ret = ibt_query_mcg(rgid, &mcg_attr, 1, &info->ti_advertise_mcg, + &entries)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("advertise mcg (gid=%llx.%llx) not found, " + "ibt_query_mcg() returned %d", + enx_advertise_mgid.gid_prefix, + enx_advertise_mgid.gid_guid, ret); + goto find_mgroups_fail; + } + + /* + * Verify the multicast group's mtu and qkey as before + */ + if (info->ti_advertise_mcg->mc_mtu > pi->p_mtu) { + ENX_DPRINTF_WARN("advertise mcg (gid=%llx.%llx) mtu too big, " + "0x%x > 0x%x", enx_advertise_mgid.gid_prefix, + enx_advertise_mgid.gid_guid, + info->ti_advertise_mcg->mc_mtu, pi->p_mtu); + goto find_mgroups_fail; + } + if (info->ti_advertise_mcg->mc_qkey != EIB_FIP_QKEY) { + ENX_DPRINTF_WARN("advertise mcg (gid=%llx.%llx) qkey bad, " + "actual=0x%x, expected=0x%x", + enx_advertise_mgid.gid_prefix, enx_advertise_mgid.gid_guid, + info->ti_advertise_mcg->mc_qkey, EIB_FIP_QKEY); + goto find_mgroups_fail; + } + + info->ti_mcg_status |= ENX_MCGS_FOUND; + mutex_exit(&info->ti_mcg_lock); + + return (ENX_E_SUCCESS); + +find_mgroups_fail: + if (info->ti_advertise_mcg) { + ibt_free_mcg_info(info->ti_advertise_mcg, 1); + info->ti_advertise_mcg = 
NULL; + } + if (info->ti_solicit_mcg) { + ibt_free_mcg_info(info->ti_solicit_mcg, 1); + info->ti_solicit_mcg = NULL; + } + mutex_exit(&info->ti_mcg_lock); + + return (ENX_E_FAILURE); +} + +/* + * Allocate and setup a single completion queue for tx and rx + */ +int +eibnx_setup_cq(eibnx_thr_info_t *info) +{ + ibt_hca_attr_t hca_attr; + ibt_cq_attr_t cq_attr; + ibt_status_t ret; + uint_t sz; + + /* + * Get this HCA's attributes + */ + ret = ibt_query_hca(info->ti_hca, &hca_attr); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_query_hca(hca_hdl=0x%llx) failed, ret=%d", + info->ti_hca, ret); + return (ENX_E_FAILURE); + } + + /* + * Allocate a completion queue for our sends and receives + */ + cq_attr.cq_sched = NULL; + cq_attr.cq_flags = IBT_CQ_NO_FLAGS; + cq_attr.cq_size = (hca_attr.hca_max_cq_sz < ENX_CQ_SIZE) ? + hca_attr.hca_max_cq_sz : ENX_CQ_SIZE; + + ret = ibt_alloc_cq(info->ti_hca, &cq_attr, &info->ti_cq_hdl, &sz); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_alloc_cq(hca_hdl=0x%llx, cq_sz=0x%lx) " + "failed, ret=%d", info->ti_hca, cq_attr.cq_size, ret); + return (ENX_E_FAILURE); + } + + /* + * Set up other parameters for collecting completion information + */ + info->ti_cq_sz = sz; + info->ti_wc = kmem_zalloc(sizeof (ibt_wc_t) * sz, KM_SLEEP); + + return (ENX_E_SUCCESS); +} + +/* + * Allocate and setup the UD channel parameters + */ +int +eibnx_setup_ud_channel(eibnx_thr_info_t *info) +{ + ibt_ud_chan_alloc_args_t alloc_attr; + ibt_ud_chan_query_attr_t query_attr; + ibt_status_t ret; + + /* + * Protect against arbitrary additions to the chan_alloc_args + * and chan_query_attr structures (make sure the ones we don't + * use are zero'd). + */ + bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t)); + bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t)); + + /* + * This ud channel is not going to be used by the nexus driver + * to send any LSO packets, so we won't need the IBT_USES_LSO flag. 
+ */ + alloc_attr.ud_flags = IBT_ALL_SIGNALED; + alloc_attr.ud_hca_port_num = info->ti_pi->p_port_num; + + ret = ibt_pkey2index(info->ti_hca, info->ti_pi->p_port_num, + (ib_pkey_t)EIB_ADMIN_PKEY, &(alloc_attr.ud_pkey_ix)); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_pkey2index(hca_hdl=0x%llx, " + "port_num=0x%x, pkey=0x%x) failed, ret=%d", + info->ti_hca, info->ti_pi->p_port_num, + EIB_ADMIN_PKEY, ret); + return (ENX_E_FAILURE); + } + + alloc_attr.ud_sizes.cs_sq = ENX_NUM_SWQE; + alloc_attr.ud_sizes.cs_rq = ENX_NUM_RWQE; + alloc_attr.ud_sizes.cs_sq_sgl = 1; + alloc_attr.ud_sizes.cs_rq_sgl = 1; + alloc_attr.ud_sizes.cs_inline = 0; + + alloc_attr.ud_qkey = EIB_FIP_QKEY; + alloc_attr.ud_scq = info->ti_cq_hdl; + alloc_attr.ud_rcq = info->ti_cq_hdl; + alloc_attr.ud_pd = info->ti_pd; + + ret = ibt_alloc_ud_channel(info->ti_hca, IBT_ACHAN_NO_FLAGS, + &alloc_attr, &info->ti_chan, NULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_alloc_ud_channel(hca_hdl=0x%llx, " + "cs_sq=0x%lx, cs_rq=0x%lx) failed, ret=%d", + info->ti_hca, alloc_attr.ud_sizes.cs_sq, + alloc_attr.ud_sizes.cs_rq, ret); + return (ENX_E_FAILURE); + } + + ret = ibt_query_ud_channel(info->ti_chan, &query_attr); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_query_ud_channel(chan_hdl=0x%llx) " + "failed, ret=%d", info->ti_chan, ret); + if ((ret = ibt_free_channel(info->ti_chan)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_free_channel(chan_hdl=0x%llx) " + "failed, ret=%d", info->ti_chan, ret); + } + info->ti_chan = NULL; + return (ENX_E_FAILURE); + } + info->ti_qpn = query_attr.ud_qpn; + + return (ENX_E_SUCCESS); +} + +/* + * Set up the transmit buffers for communicating with the gateway. Since + * the EoIB Nexus driver only exchanges control messages with the + * gateway, we don't really need too much space. 
+ */ +static int +eibnx_setup_txbufs(eibnx_thr_info_t *info) +{ + eibnx_tx_t *snd_p = &info->ti_snd; + eibnx_wqe_t *swqe; + ibt_mr_attr_t attr; + ibt_mr_desc_t desc; + ib_memlen_t tx_bufsz; + ibt_status_t ret; + ibt_ud_dest_hdl_t dest; + uint8_t *buf; + uint_t mtu = (128 << info->ti_pi->p_mtu); + int i; + + /* + * Allocate for the tx buf + */ + tx_bufsz = ENX_NUM_SWQE * mtu; + snd_p->tx_vaddr = (ib_vaddr_t)(uintptr_t)kmem_zalloc(tx_bufsz, + KM_SLEEP); + + /* + * Register the memory region with IBTF for use + */ + attr.mr_vaddr = snd_p->tx_vaddr; + attr.mr_len = tx_bufsz; + attr.mr_as = NULL; + attr.mr_flags = IBT_MR_SLEEP; + if ((ret = ibt_register_mr(info->ti_hca, info->ti_pd, &attr, + &snd_p->tx_mr, &desc)) != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_register_mr() failed for tx " + "region (0x%llx, 0x%llx) with ret=%d", + attr.mr_vaddr, attr.mr_len, ret); + kmem_free((void *)(uintptr_t)(snd_p->tx_vaddr), tx_bufsz); + return (ENX_E_FAILURE); + } + snd_p->tx_lkey = desc.md_lkey; + + /* + * Now setup the send wqes + */ + buf = (uint8_t *)(uintptr_t)(snd_p->tx_vaddr); + for (i = 0; i < ENX_NUM_SWQE; i++) { + swqe = &snd_p->tx_wqe[i]; + + /* + * Allocate a UD destination handle + */ + ret = ibt_alloc_ud_dest(info->ti_hca, IBT_UD_DEST_NO_FLAGS, + info->ti_pd, &dest); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_alloc_ud_dest(hca_hdl=0x%llx) " + "failed, ret=%d", info->ti_hca, ret); + eibnx_rb_setup_txbufs(info); + return (ENX_E_FAILURE); + } + + /* + * We set up everything in the send wqes except initialize + * the UD destination and the state of the entry. The ds_len + * should also be adjusted correctly. All this should be + * done later in the appropriate routines, before posting. 
+ */ + swqe->qe_type = ENX_QETYP_SWQE; + swqe->qe_bufsz = mtu; + swqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf; + swqe->qe_sgl.ds_key = snd_p->tx_lkey; + swqe->qe_sgl.ds_len = swqe->qe_bufsz; + swqe->qe_wr.send.wr_id = (ibt_wrid_t)(uintptr_t)swqe; + swqe->qe_wr.send.wr_flags = IBT_WR_NO_FLAGS; + swqe->qe_wr.send.wr_trans = IBT_UD_SRV; + swqe->qe_wr.send.wr_opcode = IBT_WRC_SEND; + swqe->qe_wr.send.wr_nds = 1; + swqe->qe_wr.send.wr_sgl = &swqe->qe_sgl; + swqe->qe_wr.send.wr.ud.udwr_dest = dest; + + mutex_init(&swqe->qe_lock, NULL, MUTEX_DRIVER, NULL); + swqe->qe_flags = 0; + + buf += mtu; + } + + return (ENX_E_SUCCESS); +} + +/* + * Set up bufs for receiving gateway advertisements + */ +static int +eibnx_setup_rxbufs(eibnx_thr_info_t *info) +{ + eibnx_rx_t *rcv_p = &info->ti_rcv; + eibnx_wqe_t *rwqe; + ibt_mr_attr_t attr; + ibt_mr_desc_t desc; + ib_memlen_t rx_bufsz; + ibt_status_t ret; + uint8_t *buf; + uint_t mtu = (128 << info->ti_pi->p_mtu); + int i; + + /* + * Allocate for the rx buf + */ + rx_bufsz = ENX_NUM_RWQE * (mtu + ENX_GRH_SZ); + rcv_p->rx_vaddr = (ib_vaddr_t)(uintptr_t)kmem_zalloc(rx_bufsz, + KM_SLEEP); + + attr.mr_vaddr = rcv_p->rx_vaddr; + attr.mr_len = rx_bufsz; + attr.mr_as = NULL; + attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; + if ((ret = ibt_register_mr(info->ti_hca, info->ti_pd, &attr, + &rcv_p->rx_mr, &desc)) != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_register_mr() failed for rx " + "region (0x%llx, 0x%llx) with ret=%d", + attr.mr_vaddr, attr.mr_len, ret); + kmem_free((void *)(uintptr_t)(rcv_p->rx_vaddr), rx_bufsz); + return (ENX_E_FAILURE); + } + rcv_p->rx_lkey = desc.md_lkey; + + buf = (uint8_t *)(uintptr_t)(rcv_p->rx_vaddr); + for (i = 0; i < ENX_NUM_RWQE; i++) { + rwqe = &rcv_p->rx_wqe[i]; + + rwqe->qe_type = ENX_QETYP_RWQE; + rwqe->qe_bufsz = mtu + ENX_GRH_SZ; + rwqe->qe_sgl.ds_va = (ib_vaddr_t)(uintptr_t)buf; + rwqe->qe_sgl.ds_key = rcv_p->rx_lkey; + rwqe->qe_sgl.ds_len = rwqe->qe_bufsz; + rwqe->qe_wr.recv.wr_id = 
(ibt_wrid_t)(uintptr_t)rwqe; + rwqe->qe_wr.recv.wr_nds = 1; + rwqe->qe_wr.recv.wr_sgl = &rwqe->qe_sgl; + + mutex_init(&rwqe->qe_lock, NULL, MUTEX_DRIVER, NULL); + rwqe->qe_flags = 0; + + buf += (mtu + ENX_GRH_SZ); + } + + return (ENX_E_SUCCESS); +} + +/* + * Set up transmit and receive buffers and post the receive buffers + */ +int +eibnx_setup_bufs(eibnx_thr_info_t *info) +{ + eibnx_rx_t *rcv_p = &info->ti_rcv; + eibnx_wqe_t *rwqe; + ibt_status_t ret; + int i; + + if (eibnx_setup_txbufs(info) != ENX_E_SUCCESS) + return (ENX_E_FAILURE); + + if (eibnx_setup_rxbufs(info) != ENX_E_SUCCESS) { + eibnx_rb_setup_txbufs(info); + return (ENX_E_FAILURE); + } + + for (i = 0; i < ENX_NUM_RWQE; i++) { + rwqe = &rcv_p->rx_wqe[i]; + + mutex_enter(&rwqe->qe_lock); + + rwqe->qe_flags |= (ENX_QEFL_INUSE | ENX_QEFL_POSTED); + ret = ibt_post_recv(info->ti_chan, &(rwqe->qe_wr.recv), 1, + NULL); + + mutex_exit(&rwqe->qe_lock); + + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_post_recv(chan_hdl=0x%llx) " + "failed, ret=%d", info->ti_chan, ret); + + ret = ibt_flush_channel(info->ti_chan); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_flush_channel" + "(chan_hdl=0x%llx) failed, ret=%d", + info->ti_chan, ret); + } + + eibnx_rb_setup_rxbufs(info); + eibnx_rb_setup_txbufs(info); + return (ENX_E_FAILURE); + } + } + + return (ENX_E_SUCCESS); +} + +/* + * Set up the completion queue handler. While we don't quit if we cannot + * use soft interrupts, that path is really unreliable and untested. + */ +int +eibnx_setup_cq_handler(eibnx_thr_info_t *info) +{ + eibnx_t *ss = enx_global_ss; + ibt_status_t ret; + int rv; + + /* + * We'll try to use a softintr if possible. If not, it's not + * fatal, we'll try and use the completion handler directly from + * the interrupt handler. 
+ */ + + rv = ddi_intr_add_softint(ss->nx_dip, &info->ti_softint_hdl, + EIB_SOFTPRI_ADM, eibnx_comp_handler, info); + if (rv != DDI_SUCCESS) { + ENX_DPRINTF_WARN("ddi_intr_add_softint(dip=0x%llx) " + "failed, ret=%d", ss->nx_dip, rv); + } + + ibt_set_cq_handler(info->ti_cq_hdl, eibnx_comp_intr, info); + + ret = ibt_enable_cq_notify(info->ti_cq_hdl, IBT_NEXT_COMPLETION); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_enable_cq_notify(cq_hdl=0x%llx) " + "failed, ret=%d", info->ti_cq_hdl, ret); + if (info->ti_softint_hdl) { + (void) ddi_intr_remove_softint(info->ti_softint_hdl); + info->ti_softint_hdl = NULL; + } + return (ENX_E_FAILURE); + } + + return (ENX_E_SUCCESS); +} + +/* + * Join the solicit multicast group (All-EoIB-GWs-GID) as a full member + */ +static int +eibnx_join_solicit_mcg(eibnx_thr_info_t *info) +{ + ib_gid_t rgid = info->ti_pi->p_sgid_tbl[0]; + ibt_mcg_attr_t mcg_attr; + ibt_mcg_info_t mcg_info; + ibt_status_t ret; + + bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); + + mcg_attr.mc_mgid = enx_solicit_mgid; + mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY; + mcg_attr.mc_pkey = (ib_pkey_t)EIB_ADMIN_PKEY; + mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; + mcg_attr.mc_flow = info->ti_solicit_mcg->mc_adds_vect.av_flow; + mcg_attr.mc_tclass = info->ti_solicit_mcg->mc_adds_vect.av_tclass; + mcg_attr.mc_sl = info->ti_solicit_mcg->mc_adds_vect.av_srvl; + mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; + + /* + * We only need to send to solicit mcg, so we only need to join + * the multicast group, no need to attach our qp to it + */ + ret = ibt_join_mcg(rgid, &mcg_attr, &mcg_info, NULL, NULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_join_mcg() failed for solicit " + "mgid=%llx.%llx, ret=%x", enx_solicit_mgid.gid_prefix, + enx_solicit_mgid.gid_guid, ret); + return (ENX_E_FAILURE); + } + + /* + * We can throw away the old mcg info we got when we queried + * for the mcg and use the new one. They both should be the + * same, really. 
+ */ + if (info->ti_solicit_mcg) { + bcopy(&mcg_info, info->ti_solicit_mcg, + sizeof (ibt_mcg_info_t)); + } + + return (ENX_E_SUCCESS); +} + +/* + * Join and attach to the advertise multicast group (All-EoIB-ENodes-GID) + * to receive unsolicited advertisements from the gateways. + */ +static int +eibnx_join_advertise_mcg(eibnx_thr_info_t *info) +{ + ib_gid_t rgid = info->ti_pi->p_sgid_tbl[0]; + ibt_mcg_attr_t mcg_attr; + ibt_mcg_info_t mcg_info; + ibt_status_t ret; + + if (info->ti_chan == NULL) + return (ENX_E_FAILURE); + + bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); + + mcg_attr.mc_mgid = enx_advertise_mgid; + mcg_attr.mc_qkey = (ib_qkey_t)EIB_FIP_QKEY; + mcg_attr.mc_pkey = (ib_pkey_t)EIB_ADMIN_PKEY; + mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; + mcg_attr.mc_flow = info->ti_advertise_mcg->mc_adds_vect.av_flow; + mcg_attr.mc_tclass = info->ti_advertise_mcg->mc_adds_vect.av_tclass; + mcg_attr.mc_sl = info->ti_advertise_mcg->mc_adds_vect.av_srvl; + mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; + + ret = ibt_join_mcg(rgid, &mcg_attr, &mcg_info, NULL, NULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_join_mcg() failed for advertise " + "mgid=%llx.%llx, ret=%x", enx_advertise_mgid.gid_prefix, + enx_advertise_mgid.gid_guid, ret); + return (ENX_E_FAILURE); + } + + /* + * We can throw away the old mcg info we got when we queried + * for the mcg and use the new one. They both should be the + * same, really. 
+ */ + if (info->ti_advertise_mcg) { + bcopy(&mcg_info, info->ti_advertise_mcg, + sizeof (ibt_mcg_info_t)); + } + + /* + * Since we need to receive advertisements, we'll attach our qp + * to the advertise mcg + */ + ret = ibt_attach_mcg(info->ti_chan, info->ti_advertise_mcg); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_attach_mcg(chan_hdl=0x%llx, " + "advt_mcg=0x%llx) failed, ret=%d", info->ti_chan, + info->ti_advertise_mcg, ret); + return (ENX_E_FAILURE); + } + + return (ENX_E_SUCCESS); +} + +/* + * Join the multicast groups we're interested in + */ +int +eibnx_join_mcgs(eibnx_thr_info_t *info) +{ + mutex_enter(&info->ti_mcg_lock); + + /* + * We should've located the mcg first + */ + if ((info->ti_mcg_status & ENX_MCGS_FOUND) == 0) { + mutex_exit(&info->ti_mcg_lock); + return (ENX_E_FAILURE); + } + + /* + * If we're already joined to the mcgs, we must leave first + */ + if ((info->ti_mcg_status & ENX_MCGS_JOINED) == ENX_MCGS_JOINED) { + mutex_exit(&info->ti_mcg_lock); + return (ENX_E_FAILURE); + } + + /* + * Join the two mcgs + */ + if (eibnx_join_advertise_mcg(info) != ENX_E_SUCCESS) { + mutex_exit(&info->ti_mcg_lock); + return (ENX_E_FAILURE); + } + if (eibnx_join_solicit_mcg(info) != ENX_E_SUCCESS) { + eibnx_rb_join_advertise_mcg(info); + mutex_exit(&info->ti_mcg_lock); + return (ENX_E_FAILURE); + } + + info->ti_mcg_status |= ENX_MCGS_JOINED; + mutex_exit(&info->ti_mcg_lock); + + return (ENX_E_SUCCESS); +} + +int +eibnx_rejoin_mcgs(eibnx_thr_info_t *info) +{ + /* + * Lookup the MCGs again and join them + */ + eibnx_rb_join_mcgs(info); + eibnx_rb_find_mgroups(info); + + if (eibnx_find_mgroups(info) != ENX_E_SUCCESS) + return (ENX_E_FAILURE); + + if (eibnx_join_mcgs(info) != ENX_E_SUCCESS) + return (ENX_E_FAILURE); + + return (ENX_E_SUCCESS); +} + +int +eibnx_ibt_fini(eibnx_t *ss) +{ + return (eibnx_rb_ibt_init(ss)); +} + +static int +eibnx_rb_ibt_init(eibnx_t *ss) +{ + eibnx_hca_t *hca; + eibnx_hca_t *hca_next; + eibnx_hca_t *hca_list; + ibt_status_t ret; 
+ + /* + * Disable subnet notices callbacks + */ + ibt_register_subnet_notices(ss->nx_ibt_hdl, NULL, NULL); + + /* + * Remove the hca list from the state structure + */ + mutex_enter(&ss->nx_lock); + hca_list = ss->nx_hca; + ss->nx_hca = NULL; + mutex_exit(&ss->nx_lock); + + /* + * For each HCA in the list, free up the portinfo/port structs, + * free the pd, close the hca handle and release the hca struct. + * If something goes wrong, try to put back whatever good remains + * back on the hca list and return failure. + */ + for (hca = hca_list; hca; hca = hca_next) { + hca_next = hca->hc_next; + if (eibnx_cleanup_hca(hca) != ENX_E_SUCCESS) { + mutex_enter(&ss->nx_lock); + ss->nx_hca = hca_next; + mutex_exit(&ss->nx_lock); + return (ENX_E_FAILURE); + } + } + + if ((ret = ibt_detach(ss->nx_ibt_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_detach(ibt_hdl=0x%llx) " + "failed, ret=%d", ss->nx_ibt_hdl, ret); + return (ENX_E_FAILURE); + } + ss->nx_ibt_hdl = NULL; + + eibnx_rb_state_init(); + + return (ENX_E_SUCCESS); +} + +static void +eibnx_rb_state_init(void) +{ + eibnx_t *ss = enx_global_ss; + kt_did_t thr_id; + + /* + * Ask the eoib node creation thread to die and wait for + * it to happen + */ + mutex_enter(&ss->nx_nodeq_lock); + + thr_id = ss->nx_nodeq_kt_did; + ss->nx_nodeq_thr_die = 1; + ss->nx_nodeq_kt_did = 0; + + cv_signal(&ss->nx_nodeq_cv); + mutex_exit(&ss->nx_nodeq_lock); + + if (thr_id) { + thread_join(thr_id); + } + + cv_destroy(&ss->nx_busop_cv); + mutex_destroy(&ss->nx_busop_lock); + cv_destroy(&ss->nx_nodeq_cv); + mutex_destroy(&ss->nx_nodeq_lock); + mutex_destroy(&ss->nx_lock); +} + +void +eibnx_rb_find_mgroups(eibnx_thr_info_t *info) +{ + mutex_enter(&info->ti_mcg_lock); + if ((info->ti_mcg_status & ENX_MCGS_FOUND) == ENX_MCGS_FOUND) { + if (info->ti_advertise_mcg) { + ibt_free_mcg_info(info->ti_advertise_mcg, 1); + info->ti_advertise_mcg = NULL; + } + if (info->ti_solicit_mcg) { + ibt_free_mcg_info(info->ti_solicit_mcg, 1); + info->ti_solicit_mcg = 
NULL; + } + info->ti_mcg_status &= (~ENX_MCGS_FOUND); + } + mutex_exit(&info->ti_mcg_lock); +} + +void +eibnx_rb_setup_cq(eibnx_thr_info_t *info) +{ + ibt_status_t ret; + + if (info->ti_wc && info->ti_cq_sz) + kmem_free(info->ti_wc, sizeof (ibt_wc_t) * info->ti_cq_sz); + + info->ti_cq_sz = 0; + info->ti_wc = NULL; + + if (info->ti_cq_hdl) { + ret = ibt_free_cq(info->ti_cq_hdl); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_free_cq(cq_hdl=0x%llx) " + "failed, ret=%d", info->ti_cq_hdl, ret); + } + info->ti_cq_hdl = NULL; + } +} + +void +eibnx_rb_setup_ud_channel(eibnx_thr_info_t *info) +{ + ibt_status_t ret; + + if ((ret = ibt_free_channel(info->ti_chan)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_free_channel(chan=0x%llx) " + "failed, ret=%d", info->ti_chan, ret); + } + info->ti_chan = NULL; + info->ti_qpn = 0; +} + +static void +eibnx_rb_setup_txbufs(eibnx_thr_info_t *info) +{ + eibnx_tx_t *snd_p = &info->ti_snd; + eibnx_wqe_t *swqe; + ibt_status_t ret; + int i; + uint_t mtu = (128 << info->ti_pi->p_mtu); + + /* + * Release any UD destination handle we may have allocated. Note that + * the per swqe lock would've been initialized only if we were able to + * allocate the UD dest handle. 
+ */ + for (i = 0; i < ENX_NUM_SWQE; i++) { + swqe = &snd_p->tx_wqe[i]; + + if (swqe->qe_wr.send.wr.ud.udwr_dest) { + mutex_destroy(&swqe->qe_lock); + + ret = + ibt_free_ud_dest(swqe->qe_wr.send.wr.ud.udwr_dest); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_free_ud_dest(dest=0x%llx)" + " failed, ret=%d", + swqe->qe_wr.send.wr.ud.udwr_dest, ret); + } + } + } + + /* + * Clear all the workq entries + */ + bzero(snd_p->tx_wqe, sizeof (eibnx_wqe_t) * ENX_NUM_SWQE); + + /* + * Clear Lkey and deregister any memory region we may have + * registered earlier + */ + snd_p->tx_lkey = 0; + if (snd_p->tx_mr) { + if ((ret = ibt_deregister_mr(info->ti_hca, + snd_p->tx_mr)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_deregister_TXmr(hca_hdl=0x%llx," + "mr=0x%llx) failed, ret=%d", info->ti_hca, + snd_p->tx_mr, ret); + } + snd_p->tx_mr = NULL; + } + + /* + * Release any memory allocated for the tx bufs + */ + if (snd_p->tx_vaddr) { + kmem_free((void *)(uintptr_t)(snd_p->tx_vaddr), + ENX_NUM_SWQE * mtu); + snd_p->tx_vaddr = 0; + } + +} + +static void +eibnx_rb_setup_rxbufs(eibnx_thr_info_t *info) +{ + eibnx_rx_t *rcv_p = &info->ti_rcv; + eibnx_wqe_t *rwqe; + ibt_status_t ret; + uint_t mtu = (128 << info->ti_pi->p_mtu); + int i; + + for (i = 0; i < ENX_NUM_RWQE; i++) { + rwqe = &rcv_p->rx_wqe[i]; + mutex_destroy(&rwqe->qe_lock); + } + bzero(rcv_p->rx_wqe, sizeof (eibnx_wqe_t) * ENX_NUM_RWQE); + + rcv_p->rx_lkey = 0; + + if ((ret = ibt_deregister_mr(info->ti_hca, + rcv_p->rx_mr)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_deregister_RXmr(hca_hdl=0x%llx," + "mr=0x%llx) failed, ret=%d", info->ti_hca, + rcv_p->rx_mr, ret); + } + rcv_p->rx_mr = NULL; + + kmem_free((void *)(uintptr_t)(rcv_p->rx_vaddr), + ENX_NUM_RWQE * (mtu + ENX_GRH_SZ)); + rcv_p->rx_vaddr = 0; +} + +void +eibnx_rb_setup_bufs(eibnx_thr_info_t *info) +{ + ibt_status_t ret; + + if ((ret = ibt_flush_channel(info->ti_chan)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_flush_channel(chan_hdl=0x%llx) " + "failed, ret=%d", 
info->ti_chan, ret); + } + + eibnx_rb_setup_rxbufs(info); + + eibnx_rb_setup_txbufs(info); +} + +void +eibnx_rb_setup_cq_handler(eibnx_thr_info_t *info) +{ + ibt_set_cq_handler(info->ti_cq_hdl, NULL, NULL); + + if (info->ti_softint_hdl) { + (void) ddi_intr_remove_softint(info->ti_softint_hdl); + info->ti_softint_hdl = NULL; + } +} + +static void +eibnx_rb_join_solicit_mcg(eibnx_thr_info_t *info) +{ + ib_gid_t rgid = info->ti_pi->p_sgid_tbl[0]; + ib_gid_t rsvd_gid; + ibt_status_t ret; + + rsvd_gid.gid_prefix = 0; + rsvd_gid.gid_guid = 0; + + ret = ibt_leave_mcg(rgid, enx_solicit_mgid, + rsvd_gid, IB_MC_JSTATE_FULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_leave_mcg(slct_mgid=%llx.%llx) " + "failed, ret=%d", enx_solicit_mgid.gid_prefix, + enx_solicit_mgid.gid_guid, ret); + } +} + +static void +eibnx_rb_join_advertise_mcg(eibnx_thr_info_t *info) +{ + ib_gid_t rgid = info->ti_pi->p_sgid_tbl[0]; + ib_gid_t rsvd_gid; + ibt_status_t ret; + + ret = ibt_detach_mcg(info->ti_chan, info->ti_advertise_mcg); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_detach_mcg(chan_hdl=0x%llx, " + "advt_mcg=0x%llx) failed, ret=%d", + info->ti_chan, info->ti_advertise_mcg, ret); + } + + rsvd_gid.gid_prefix = 0; + rsvd_gid.gid_guid = 0; + + ret = ibt_leave_mcg(rgid, enx_advertise_mgid, + rsvd_gid, IB_MC_JSTATE_FULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_leave_mcg(advt_mgid=%llx.%llx) " + "failed, ret=%d", enx_advertise_mgid.gid_prefix, + enx_advertise_mgid.gid_guid, ret); + } +} + +void +eibnx_rb_join_mcgs(eibnx_thr_info_t *info) +{ + mutex_enter(&info->ti_mcg_lock); + if ((info->ti_mcg_status & ENX_MCGS_JOINED) == ENX_MCGS_JOINED) { + eibnx_rb_join_solicit_mcg(info); + eibnx_rb_join_advertise_mcg(info); + + info->ti_mcg_status &= (~ENX_MCGS_JOINED); + } + mutex_exit(&info->ti_mcg_lock); +} + +eibnx_hca_t * +eibnx_prepare_hca(ib_guid_t hca_guid) +{ + eibnx_t *ss = enx_global_ss; + eibnx_hca_t *hca; + eibnx_port_t *port; + eibnx_port_t *port_tail; + 
ibt_hca_hdl_t hca_hdl; + ibt_pd_hdl_t pd_hdl; + ibt_hca_portinfo_t *pi; + uint_t num_pi; + uint_t size_pi; + ibt_hca_attr_t hca_attr; + ibt_status_t ret; + int i; + + ret = ibt_open_hca(ss->nx_ibt_hdl, hca_guid, &hca_hdl); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_open_hca(hca_guid=0x%llx) " + "failed, ret=%d", hca_guid, ret); + return (NULL); + } + + bzero(&hca_attr, sizeof (ibt_hca_attr_t)); + if ((ret = ibt_query_hca(hca_hdl, &hca_attr)) != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_query_hca(hca_hdl=0x%llx, " + "hca_guid=0x%llx) failed, ret=%d", + hca_hdl, hca_guid, ret); + + if ((ret = ibt_close_hca(hca_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_close_hca(hca_hdl=0x%llx) " + "failed, ret=%d", hca_hdl, ret); + } + return (NULL); + } + + ret = ibt_alloc_pd(hca_hdl, IBT_PD_NO_FLAGS, &pd_hdl); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_ERR("ibt_alloc_pd(hca_hdl=0x%llx, " + "hca_guid=0x%llx) failed, ret=%d", + hca_hdl, hca_guid, ret); + + if ((ret = ibt_close_hca(hca_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_close_hca(hca_hdl=0x%llx) " + "failed, ret=%d", hca_hdl, ret); + } + return (NULL); + } + + /* + * We have all the information we want about this hca, create + * a new struct and return it. 
+ */ + hca = kmem_zalloc(sizeof (eibnx_hca_t), KM_SLEEP); + hca->hc_next = NULL; + hca->hc_guid = hca_guid; + hca->hc_hdl = hca_hdl; + hca->hc_pd = pd_hdl; + hca->hc_port = port_tail = NULL; + + for (i = 0; i < hca_attr.hca_nports; i++) { + ret = ibt_query_hca_ports(hca_hdl, i + 1, &pi, + &num_pi, &size_pi); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_query_hca_ports(hca_hdl=0x%llx, " + "port=0x%x) failed, ret=%d", hca_hdl, i + 1, ret); + } else { + port = kmem_zalloc(sizeof (eibnx_port_t), KM_SLEEP); + port->po_next = NULL; + port->po_pi = pi; + port->po_pi_size = size_pi; + + if (port_tail) { + port_tail->po_next = port; + } else { + hca->hc_port = port; + } + port_tail = port; + } + } + + /* + * If we couldn't query about any ports on the HCA, return failure + */ + if (hca->hc_port == NULL) { + ENX_DPRINTF_ERR("all hca port queries failed for " + "hca_guid=0x%llx", hca_guid); + (void) eibnx_cleanup_hca(hca); + return (NULL); + } + + return (hca); +} + +int +eibnx_cleanup_hca(eibnx_hca_t *hca) +{ + eibnx_port_t *port; + eibnx_port_t *port_next; + ibt_status_t ret; + + for (port = hca->hc_port; port; port = port_next) { + port_next = port->po_next; + + ibt_free_portinfo(port->po_pi, port->po_pi_size); + kmem_free(port, sizeof (eibnx_port_t)); + } + + if ((ret = ibt_free_pd(hca->hc_hdl, hca->hc_pd)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_free_pd(hca_hdl=0x%lx, pd_hd=0x%lx) " + "failed, ret=%d", hca->hc_hdl, hca->hc_pd, ret); + return (ENX_E_FAILURE); + } + + if ((ret = ibt_close_hca(hca->hc_hdl)) != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_close_hca(hca_hdl=0x%lx) failed, " + "ret=%d", hca->hc_hdl, ret); + return (ENX_E_FAILURE); + } + + kmem_free(hca, sizeof (eibnx_hca_t)); + + return (ENX_E_SUCCESS); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_log.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,252 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/varargs.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +/* + * Defaults + */ +uint_t enx_log_size = ENX_LOGSZ_DEFAULT; +int enx_log_level = ENX_MSGS_DEFAULT | ENX_MSGS_DEBUG; +int enx_log_timestamps = 0; + +/* + * Debug variables, should not be tunables so allocated debug buffer + * and its size remain consistent. 
+ */ +static kmutex_t enx_debug_buf_lock; +static uint8_t *enx_debug_buf; +static uint32_t enx_debug_buf_ndx; +static uint_t enx_debug_buf_sz; + +static void eibnx_log(char *); + +void +eibnx_debug_init(void) +{ + enx_debug_buf_ndx = 0; + enx_debug_buf_sz = enx_log_size; + enx_debug_buf = kmem_zalloc(enx_debug_buf_sz, KM_SLEEP); + + mutex_init(&enx_debug_buf_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +eibnx_debug_fini(void) +{ + mutex_destroy(&enx_debug_buf_lock); + + if (enx_debug_buf && enx_debug_buf_sz) { + kmem_free(enx_debug_buf, enx_debug_buf_sz); + enx_debug_buf = NULL; + } + enx_debug_buf_sz = 0; + enx_debug_buf_ndx = 0; +} + +void +eibnx_log(char *msg) +{ + uint32_t off; + int msglen; + char msgbuf[ENX_MAX_LINE]; + + if (enx_debug_buf == NULL) + return; + + if (enx_log_timestamps) { + msglen = snprintf(msgbuf, ENX_MAX_LINE, "%llx: %s", + (unsigned long long)ddi_get_lbolt64(), msg); + } else { + msglen = snprintf(msgbuf, ENX_MAX_LINE, "%s", msg); + } + + if (msglen < 0) + return; + else if (msglen >= ENX_MAX_LINE) + msglen = ENX_MAX_LINE - 1; + + mutex_enter(&enx_debug_buf_lock); + + if ((enx_debug_buf_ndx == 0) || + (enx_debug_buf[enx_debug_buf_ndx-1] != '\n')) { + enx_debug_buf[enx_debug_buf_ndx] = '\n'; + enx_debug_buf_ndx++; + } + + off = enx_debug_buf_ndx; /* current msg should go here */ + + enx_debug_buf_ndx += msglen; /* next msg should start here */ + enx_debug_buf[enx_debug_buf_ndx] = 0; /* terminate current msg */ + + if (enx_debug_buf_ndx >= (enx_debug_buf_sz - 2 * ENX_MAX_LINE)) + enx_debug_buf_ndx = 0; + + mutex_exit(&enx_debug_buf_lock); + + bcopy(msgbuf, enx_debug_buf+off, msglen); /* no lock needed */ +} + +#ifdef ENX_DEBUG +void +eibnx_dprintf_verbose(const char *fmt, ...) 
+{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + char newfmt[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_VERBOSE) != ENX_MSGS_VERBOSE) + return; + + (void) snprintf(newfmt, ENX_MAX_LINE, "..........%s", fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + } +} + +void +eibnx_dprintf_args(const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + char newfmt[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_ARGS) != ENX_MSGS_ARGS) + return; + + (void) snprintf(newfmt, ENX_MAX_LINE, "........%s", fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + } +} + +void +eibnx_dprintf_debug(const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + char newfmt[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_DEBUG) != ENX_MSGS_DEBUG) + return; + + (void) snprintf(newfmt, ENX_MAX_LINE, "......%s", fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + } +} +#endif + +void +eibnx_dprintf_warn(const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + char newfmt[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_WARN) != ENX_MSGS_WARN) + return; + + (void) snprintf(newfmt, ENX_MAX_LINE, "....%s", fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + } +} + +void +eibnx_dprintf_err(const char *fmt, ...) 
+{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + char newfmt[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_ERR) != ENX_MSGS_ERR) + return; + + (void) snprintf(newfmt, ENX_MAX_LINE, "..%s", fmt); + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, newfmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + cmn_err(CE_WARN, "!%s\n", msgbuf); + } +} + +void +eibnx_dprintf_crit(const char *fmt, ...) +{ + va_list ap; + int msglen; + char msgbuf[ENX_MAX_LINE]; + + if ((enx_log_level & ENX_MSGS_CRIT) != ENX_MSGS_CRIT) + return; + + va_start(ap, fmt); + msglen = vsnprintf(msgbuf, ENX_MAX_LINE, fmt, ap); + va_end(ap); + + if (msglen > 0) { + eibnx_log(msgbuf); + cmn_err(CE_PANIC, "!%s\n", msgbuf); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_main.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,638 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * The Ethernet Over Infiniband Nexus driver is a bus nexus driver + * that enumerates all the EoIB nodes. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +/* + * Global per-instance EoIB Nexus data. 
Only one instance + * of EoIB Nexus is supported + */ +eibnx_t *enx_global_ss = NULL; + +/* + * Static function declarations + */ +static int eibnx_attach(dev_info_t *, ddi_attach_cmd_t); +static int eibnx_detach(dev_info_t *, ddi_detach_cmd_t); +static int eibnx_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int eibnx_bus_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, + void *, void *); + +static int eibnx_get_eventcookie(dev_info_t *, dev_info_t *, char *, + ddi_eventcookie_t *); +static int eibnx_add_eventcall(dev_info_t *, dev_info_t *, ddi_eventcookie_t, + void (*)(dev_info_t *, ddi_eventcookie_t, void *, void *), + void *, ddi_callback_id_t *); +static int eibnx_remove_eventcall(dev_info_t *, ddi_callback_id_t); +static int eibnx_post_event(dev_info_t *, dev_info_t *, + ddi_eventcookie_t, void *); + +static int eibnx_bus_config(dev_info_t *, uint_t, ddi_bus_config_op_t, + void *, dev_info_t **); +static int eibnx_bus_unconfig(dev_info_t *, uint_t, ddi_bus_config_op_t, + void *); +static int eibnx_config_all_children(dev_info_t *); +static void eibnx_unconfig_all_children(dev_info_t *); +static int eibnx_config_child(char *, dev_info_t **); +static int eibnx_unconfig_child(char *); + +/* + * Cbops + */ +static struct cb_ops enx_cb_ops = { + eibnx_devctl_open, /* cb_open */ + eibnx_devctl_close, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + eibnx_devctl_ioctl, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_str */ + D_MP, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +/* + * Busops + */ +static struct bus_ops enx_bus_ops = { + BUSO_REV, + nullbusmap, /* bus_map */ + NULL, /* bus_get_intrspec */ + NULL, /* bus_add_intrspec */ + NULL, /* bus_remove_intrspec */ + i_ddi_map_fault, /* bus_map_fault */ 
+ ddi_no_dma_map, /* bus_dma_map */ + NULL, /* bus_dma_allochdl */ + NULL, /* bus_dma_freehdl */ + NULL, /* bus_dma_bindhdl */ + NULL, /* bus_dma_unbindhdl */ + NULL, /* bus_dma_flush */ + NULL, /* bus_dma_win */ + NULL, /* bus_dma_ctl */ + eibnx_bus_ctl, /* bus_ctl */ + ddi_bus_prop_op, /* bus_prop_op */ + eibnx_get_eventcookie, /* bus_get_eventcookie */ + eibnx_add_eventcall, /* bus_add_eventcall */ + eibnx_remove_eventcall, /* bus_remove_eventcall */ + eibnx_post_event, /* bus_post_event */ + NULL, /* bus_intr_ctl */ + eibnx_bus_config, /* bus_config */ + eibnx_bus_unconfig, /* bus_unconfig */ +}; + +/* + * Nexus ops + */ +static struct dev_ops enx_ops = { + DEVO_REV, /* devo_rev, */ + 0, /* devo_refcnt */ + eibnx_getinfo, /* devo_info */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + eibnx_attach, /* devo_attach */ + eibnx_detach, /* devo_detach */ + nodev, /* devo_reset */ + &enx_cb_ops, /* devo_cb_ops */ + &enx_bus_ops, /* devo_bus_ops */ + nulldev, /* devo_power */ + ddi_quiesce_not_needed /* devo_quiesce */ +}; + +/* + * Module linkage information for the kernel + */ +static struct modldrv enx_modldrv = { + &mod_driverops, /* Driver module */ + "EoIB Nexus", /* Driver name and version */ + &enx_ops, /* Driver ops */ +}; + +static struct modlinkage enx_modlinkage = { + MODREV_1, (void *)&enx_modldrv, NULL +}; + +/* + * EoIB NDI events + */ +static ndi_event_definition_t enx_ndi_event_defs[] = { + { ENX_EVENT_TAG_GW_INFO_UPDATE, EIB_NDI_EVENT_GW_INFO_UPDATE, + EPL_KERNEL, NDI_EVENT_POST_TO_TGT }, + { ENX_EVENT_TAG_GW_AVAILABLE, EIB_NDI_EVENT_GW_AVAILABLE, + EPL_KERNEL, NDI_EVENT_POST_TO_TGT }, + { ENX_EVENT_TAG_LOGIN_ACK, EIB_NDI_EVENT_LOGIN_ACK, + EPL_KERNEL, NDI_EVENT_POST_TO_TGT } +}; +#define ENX_NUM_NDI_EVENTS \ + (sizeof (enx_ndi_event_defs) / sizeof (enx_ndi_event_defs[0])) + +static ndi_event_set_t enx_ndi_events = { + NDI_EVENTS_REV1, + ENX_NUM_NDI_EVENTS, + enx_ndi_event_defs +}; +ndi_event_hdl_t enx_ndi_event_hdl; + + +/* + * Common 
loadable module entry points _init, _fini, _info + */ + +int +_init(void) +{ + int ret; + + if ((ret = mod_install(&enx_modlinkage)) == 0) + eibnx_debug_init(); + + return (ret); +} + +int +_fini(void) +{ + int ret; + + if ((ret = mod_remove(&enx_modlinkage)) == 0) + eibnx_debug_fini(); + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&enx_modlinkage, modinfop)); +} + +/* + * Autoconfiguration entry points: attach, detach, getinfo + */ + +static int +eibnx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + eibnx_t *ss; + int instance; + + if (cmd == DDI_RESUME) + return (DDI_SUCCESS); + else if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Don't allow more than one instance to attach + */ + if (enx_global_ss) + return (DDI_FAILURE); + + /* + * Alloc this instance's softstate + */ + ss = kmem_zalloc(sizeof (eibnx_t), KM_SLEEP); + ss->nx_dip = dip; + + enx_global_ss = ss; + + /* + * Allocate our NDI event handle and bind our event set + */ + if (ndi_event_alloc_hdl(dip, 0, &enx_ndi_event_hdl, + NDI_SLEEP) != NDI_SUCCESS) { + ENX_DPRINTF_ERR("ndi_event_alloc_hdl(dip=0x%llx) " + "failed", dip); + + kmem_free(enx_global_ss, sizeof (eibnx_t)); + enx_global_ss = NULL; + return (DDI_FAILURE); + } + if (ndi_event_bind_set(enx_ndi_event_hdl, &enx_ndi_events, + NDI_SLEEP) != NDI_SUCCESS) { + ENX_DPRINTF_ERR("ndi_event_bind_set(ndi_event_hdl=0x%llx) " + "failed", enx_ndi_event_hdl); + + (void) ndi_event_free_hdl(enx_ndi_event_hdl); + enx_ndi_event_hdl = NULL; + kmem_free(enx_global_ss, sizeof (eibnx_t)); + enx_global_ss = NULL; + return (DDI_FAILURE); + } + + /* + * Create "devctl" minor node for general ioctl interface to the + * eoib nexus. If we cannot, it isn't fatal - we'll operate without + * the support for devctl (but issue a warning). 
+ */ + instance = ddi_get_instance(dip); + if (ddi_create_minor_node(dip, "devctl", S_IFCHR, instance, + DDI_NT_NEXUS, 0) != DDI_SUCCESS) { + ENX_DPRINTF_WARN("could not create devctl minor node " + "for instance %d", instance); + } + + /* + * Do IBTF related initializations. If we fail, we cannot operate, + * so fail the attach. + */ + if (eibnx_ibt_init(ss) != ENX_E_SUCCESS) { + (void) ddi_remove_minor_node(dip, NULL); + (void) ndi_event_unbind_set(enx_ndi_event_hdl, + &enx_ndi_events, NDI_SLEEP); + (void) ndi_event_free_hdl(enx_ndi_event_hdl); + enx_ndi_event_hdl = NULL; + kmem_free(enx_global_ss, sizeof (eibnx_t)); + enx_global_ss = NULL; + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +eibnx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + eibnx_t *ss = enx_global_ss; + + if (cmd == DDI_SUSPEND) + return (DDI_SUCCESS); + else if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + /* + * If there's no instance of eibnx attached, fail + */ + if (ss == NULL) + return (DDI_FAILURE); + + /* + * Before we do anything, we need to stop the port monitors + * we may have started earlier. + */ + eibnx_terminate_monitors(); + + /* + * If eibnx_ibt_fini() fails, it could be because one of the + * HCA's pd could not be freed, the hca could not be closed + * or the IBTF detach wasn't successful. If this is the case, + * we have to return failure, but cannot do much about the + * port monitors we've already terminated. + */ + if (eibnx_ibt_fini(ss) == ENX_E_FAILURE) + return (DDI_FAILURE); + + /* + * Cleanup any devctl minor node we may have created, unbind and + * free ndi event handle and free the instance softstate. 
+ */ + (void) ddi_remove_minor_node(dip, NULL); + (void) ndi_event_unbind_set(enx_ndi_event_hdl, + &enx_ndi_events, NDI_SLEEP); + (void) ndi_event_free_hdl(enx_ndi_event_hdl); + enx_ndi_event_hdl = NULL; + kmem_free(enx_global_ss, sizeof (eibnx_t)); + enx_global_ss = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +eibnx_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) +{ + eibnx_t *ss = enx_global_ss; + int ret; + + if (cmd == DDI_INFO_DEVT2DEVINFO) { + *resultp = (ss) ? ss->nx_dip : NULL; + ret = (ss) ? DDI_SUCCESS : DDI_FAILURE; + } else if (cmd == DDI_INFO_DEVT2INSTANCE) { + *resultp = 0; + ret = DDI_SUCCESS; + } else { + ret = DDI_FAILURE; + } + + return (ret); +} + +/* + * Busops: bus_ctl, bus_config, bus_unconfig + */ + +/*ARGSUSED*/ +static int +eibnx_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop, + void *arg, void *result) +{ + dev_info_t *child = arg; + int ret; + char name[MAXNAMELEN]; + + switch (ctlop) { + case DDI_CTLOPS_REPORTDEV: + ENX_DPRINTF_DEBUG("EoIB device: %s@%s, %s%d", + ddi_node_name(rdip), ddi_get_name_addr(rdip), + ddi_driver_name(rdip), ddi_get_instance(rdip)); + /*FALLTHROUGH*/ + + case DDI_CTLOPS_ATTACH: + case DDI_CTLOPS_DETACH: + case DDI_CTLOPS_POWER: + case DDI_CTLOPS_SIDDEV: + case DDI_CTLOPS_IOMIN: + ret = DDI_SUCCESS; + break; + + case DDI_CTLOPS_INITCHILD: + if ((ret = eibnx_name_child(child, name, + sizeof (name))) == DDI_SUCCESS) { + ddi_set_name_addr(child, name); + } + break; + + case DDI_CTLOPS_UNINITCHILD: + ddi_set_name_addr(child, NULL); + ret = DDI_SUCCESS; + break; + + default: + ret = ddi_ctlops(dip, rdip, ctlop, arg, result); + break; + } + + return (ret); +} + +/*ARGSUSED*/ +static int +eibnx_bus_config(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg, dev_info_t **childp) +{ + eibnx_t *ss = enx_global_ss; + int ret = NDI_SUCCESS; + + switch (op) { + case BUS_CONFIG_ONE: + eibnx_busop_inprog_enter(ss); + ret = eibnx_config_child(arg, 
childp); + eibnx_busop_inprog_exit(ss); + break; + + case BUS_CONFIG_ALL: + case BUS_CONFIG_DRIVER: + eibnx_busop_inprog_enter(ss); + if ((ss->nx_busop_flags & NX_FL_BUSCFG_COMPLETE) == 0) { + ret = eibnx_config_all_children(parent); + if (ret == NDI_SUCCESS) + ss->nx_busop_flags |= NX_FL_BUSCFG_COMPLETE; + } + eibnx_busop_inprog_exit(ss); + break; + + default: + ret = NDI_FAILURE; + } + + if (ret == NDI_SUCCESS) + ret = ndi_busop_bus_config(parent, flags, op, arg, childp, 0); + + return (ret); +} + +static int +eibnx_bus_unconfig(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg) +{ + eibnx_t *ss = enx_global_ss; + int ret; + + ret = ndi_busop_bus_unconfig(parent, flags, op, arg); + if (ret != NDI_SUCCESS) + return (ret); + + switch (op) { + case BUS_UNCONFIG_ONE: + if (flags & (NDI_UNCONFIG | NDI_DEVI_REMOVE)) { + eibnx_busop_inprog_enter(ss); + + if ((ret = eibnx_unconfig_child(arg)) == ENX_E_SUCCESS) + ss->nx_busop_flags &= (~NX_FL_BUSCFG_COMPLETE); + else { + ENX_DPRINTF_DEBUG("eibnx_bus_config: " + "unconfig child %s failed", (char *)arg); + } + + eibnx_busop_inprog_exit(ss); + } + break; + + case BUS_UNCONFIG_ALL: + case BUS_UNCONFIG_DRIVER: + if (flags & (NDI_UNCONFIG | NDI_DEVI_REMOVE)) { + eibnx_busop_inprog_enter(ss); + + eibnx_unconfig_all_children(parent); + ss->nx_busop_flags &= (~NX_FL_BUSCFG_COMPLETE); + + eibnx_busop_inprog_exit(ss); + } + break; + + default: + break; + } + + return (ret); +} + +/* + * Event Handling: bus_get_eventcookie, bus_add_eventcall, bus_remove_eventcall + * and bus_post_event + */ + +/*ARGSUSED*/ +static int +eibnx_get_eventcookie(dev_info_t *dip, dev_info_t *rdip, + char *name, ddi_eventcookie_t *cookiep) +{ + return (ndi_event_retrieve_cookie(enx_ndi_event_hdl, rdip, name, + cookiep, NDI_EVENT_NOPASS)); +} + +/*ARGSUSED*/ +static int +eibnx_add_eventcall(dev_info_t *dip, dev_info_t *rdip, ddi_eventcookie_t cookie, + void (*callback)(dev_info_t *cb_dip, ddi_eventcookie_t cb_cookie, + void *cb_arg, void 
*cb_impl_data), + void *arg, ddi_callback_id_t *cb_id) +{ + return (ndi_event_add_callback(enx_ndi_event_hdl, rdip, cookie, + callback, arg, NDI_SLEEP, cb_id)); +} + +/*ARGSUSED*/ +static int +eibnx_remove_eventcall(dev_info_t *dip, ddi_callback_id_t cb_id) +{ + return (ndi_event_remove_callback(enx_ndi_event_hdl, cb_id)); +} + +/*ARGSUSED*/ +static int +eibnx_post_event(dev_info_t *dip, dev_info_t *rdip, + ddi_eventcookie_t cookie, void *impl_data) +{ + return (ndi_event_run_callbacks(enx_ndi_event_hdl, rdip, cookie, + impl_data)); +} + +/* + * Routines to configure/unconfigure EoIB node(s) on a system. + */ + +/*ARGSUSED*/ +static int +eibnx_config_all_children(dev_info_t *parent) +{ + eibnx_t *ss = enx_global_ss; + eibnx_hca_t *hca; + eibnx_port_t *port; + eibnx_thr_info_t *ti; + eibnx_thr_info_t *ti_tail; + eibnx_gw_info_t *gwi; + + /* + * Go through each port of each hca and create a thread to solicit, + * monitor, receive advertisements, create eoib nodes and attach eoib + * driver instances. + */ + mutex_enter(&ss->nx_lock); + if (!ss->nx_monitors_up) { + ss->nx_thr_info = ti_tail = NULL; + for (hca = ss->nx_hca; hca; hca = hca->hc_next) { + for (port = hca->hc_port; port; port = port->po_next) { + ti = eibnx_start_port_monitor(hca, port); + if (ti_tail) { + ti_tail->ti_next = ti; + } else { + ss->nx_thr_info = ti; + } + ti_tail = ti; + } + } + + ss->nx_monitors_up = B_TRUE; + mutex_exit(&ss->nx_lock); + + return (NDI_SUCCESS); + } + mutex_exit(&ss->nx_lock); + + while (eibnx_locate_unconfigured_node(&ti, &gwi) == ENX_E_SUCCESS) + (void) eibnx_configure_node(ti, gwi, NULL); + + return (NDI_SUCCESS); +} + +/* + * Routine to unconfigure all the EoIB nodes on a system. This terminates + * all the per-port monitor threads and releases any resources allocated. 
+ */ + +/*ARGSUSED*/ +static void +eibnx_unconfig_all_children(dev_info_t *parent) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + eibnx_child_t *ch; + + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + mutex_enter(&ti->ti_child_lock); + for (ch = ti->ti_child; ch; ch = ch->ch_next) { + ch->ch_dip = NULL; + } + mutex_exit(&ti->ti_child_lock); + } + mutex_exit(&ss->nx_lock); +} + +/*ARGSUSED*/ +static int +eibnx_config_child(char *devname, dev_info_t **childp) +{ + eibnx_thr_info_t *ti; + eibnx_gw_info_t *gwi; + + if (eibnx_locate_node_name(devname, &ti, &gwi) == ENX_E_FAILURE) { + ENX_DPRINTF_DEBUG("eibnx_config_child: invalid eoib " + "nodename %s, no such address", devname); + return (ENX_E_FAILURE); + } + + return (eibnx_configure_node(ti, gwi, childp)); +} + +/*ARGSUSED*/ +static int +eibnx_unconfig_child(char *devname) +{ + eibnx_thr_info_t *ti; + eibnx_gw_info_t *gwi; + + if (eibnx_locate_node_name(devname, &ti, &gwi) == ENX_E_FAILURE) { + ENX_DPRINTF_DEBUG("eibnx_unconfig_child: invalid eoib " + "nodename %s, no such address", devname); + return (ENX_E_FAILURE); + } + + return (eibnx_unconfigure_node(ti, gwi)); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_misc.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,627 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +static char *eibnx_make_nodename(eibnx_thr_info_t *, uint16_t); + +/* + * This routine is only called when the port-monitor thread is + * about to die. Between the time the first mcast solicitation + * was done by the port-monitor thread and the time it is asked + * to die, a lot of things could've happened and we need to + * cleanup all of it. 
+ */ +void +eibnx_cleanup_port_nodes(eibnx_thr_info_t *info) +{ + eibnx_t *ss = enx_global_ss; + eibnx_nodeq_t *node; + eibnx_nodeq_t *prev; + eibnx_gw_info_t *gwi; + eibnx_gw_info_t *gw_list; + eibnx_gw_info_t *nxt_gwi; + eibnx_child_t *child; + eibnx_child_t *nxt_child; + eibnx_child_t *children; + + /* + * Since we would've already stopped processing completions for + * this thread's work queue, we don't have to worry about requests + * coming in for creation of new eoib nodes. However, there may + * be pending node creation requests for this port (thr_info) + * that we will have to drop. + */ + mutex_enter(&ss->nx_nodeq_lock); + prev = NULL; + for (node = ss->nx_nodeq; node; node = node->nc_next) { + if (node->nc_info != info) { + prev = node; + } else { + if (prev == NULL) { + ss->nx_nodeq = node->nc_next; + } else { + prev->nc_next = node->nc_next; + } + kmem_free(node, sizeof (eibnx_nodeq_t)); + } + } + mutex_exit(&ss->nx_nodeq_lock); + + /* + * Now go through the list of all children and free up any + * resource we might've allocated; note that the child dips + * could've been offlined/removed by now, so we don't do + * anything with them. + */ + mutex_enter(&info->ti_child_lock); + children = info->ti_child; + info->ti_child = NULL; + mutex_exit(&info->ti_child_lock); + + for (child = children; child; child = nxt_child) { + nxt_child = child->ch_next; + + if (child->ch_node_name) { + kmem_free(child->ch_node_name, MAXNAMELEN); + } + kmem_free(child, sizeof (eibnx_child_t)); + } + + /* + * Return all the swqes we've acquired for the gateway unicast + * solicitations, free any address vectors we've allocated and + * finally free the gw entries from the list. 
+ */ + mutex_enter(&info->ti_gw_lock); + gw_list = info->ti_gw; + info->ti_gw = NULL; + mutex_exit(&info->ti_gw_lock); + + for (gwi = gw_list; gwi; gwi = nxt_gwi) { + nxt_gwi = gwi->gw_next; + + eibnx_release_swqe((eibnx_wqe_t *)(gwi->gw_swqe)); + if ((gwi->gw_addr).ga_vect) { + kmem_free((gwi->gw_addr).ga_vect, + sizeof (ibt_adds_vect_t)); + (gwi->gw_addr).ga_vect = NULL; + } + mutex_destroy(&gwi->gw_adv_lock); + + kmem_free(gwi, sizeof (eibnx_gw_info_t)); + } +} + +/* + * Communicate all the details we received about the gateway (via the + * advertisement control message) to the eoib instance we're creating. + */ +void +eibnx_create_node_props(dev_info_t *dip, eibnx_thr_info_t *info, + eibnx_gw_info_t *gwi) +{ + int ret; + + ret = ndi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_HCA_GUID, + info->ti_hca_guid); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int64() failed to set " + "%s property to 0x%llx for child dip 0x%llx, ret=%d", + EIB_PROP_HCA_GUID, info->ti_hca_guid, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_HCA_PORTNUM, + info->ti_pi->p_port_num); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_HCA_PORTNUM, info->ti_pi->p_port_num, dip, ret); + } + + ret = ndi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SYS_GUID, + gwi->gw_system_guid); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int64() failed to set " + "%s property to 0x%llx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_SYS_GUID, gwi->gw_system_guid, dip, ret); + } + + ret = ndi_prop_update_int64(DDI_DEV_T_NONE, dip, EIB_PROP_GW_GUID, + gwi->gw_guid); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int64() failed to set " + "%s property to 0x%llx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_GUID, gwi->gw_guid, dip, ret); + } + + ret = ndi_prop_update_int64(DDI_DEV_T_NONE, dip, 
EIB_PROP_GW_SN_PREFIX, + (gwi->gw_addr).ga_gid.gid_prefix); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int64() failed to set " + "%s property to 0x%llx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_SN_PREFIX, (gwi->gw_addr).ga_gid.gid_prefix, + dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_ADV_PERIOD, + gwi->gw_adv_period); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_ADV_PERIOD, gwi->gw_adv_period, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_KA_PERIOD, + gwi->gw_ka_period); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_KA_PERIOD, gwi->gw_ka_period, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_VNIC_KA_PERIOD, + gwi->gw_vnic_ka_period); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_VNIC_KA_PERIOD, gwi->gw_vnic_ka_period, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_CTRL_QPN, + gwi->gw_ctrl_qpn); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_CTRL_QPN, gwi->gw_ctrl_qpn, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_LID, + gwi->gw_lid); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_LID, gwi->gw_lid, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_PORTID, + gwi->gw_portid); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 
0x%llx, ret=%d", + EIB_PROP_GW_PORTID, gwi->gw_portid, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, + EIB_PROP_GW_NUM_NET_VNICS, gwi->gw_num_net_vnics); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_NUM_NET_VNICS, gwi->gw_num_net_vnics, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_AVAILABLE, + gwi->gw_flag_available); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_AVAILABLE, gwi->gw_flag_available, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_HOST_VNICS, + gwi->gw_is_host_adm_vnics); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_HOST_VNICS, gwi->gw_is_host_adm_vnics, + dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SL, + gwi->gw_sl); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_SL, gwi->gw_sl, dip, ret); + } + + ret = ndi_prop_update_int(DDI_DEV_T_NONE, dip, EIB_PROP_GW_N_RSS_QPN, + gwi->gw_n_rss_qpn); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_int() failed to set " + "%s property to 0x%lx for child dip 0x%llx, ret=%d", + EIB_PROP_GW_N_RSS_QPN, gwi->gw_n_rss_qpn, dip, ret); + } + + ret = ndi_prop_update_string(DDI_DEV_T_NONE, dip, EIB_PROP_GW_SYS_NAME, + (char *)(gwi->gw_system_name)); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_string() failed to set " + "%s property to '%s' for child dip 0x%llx, ret=%d", + EIB_PROP_GW_SYS_NAME, gwi->gw_system_name, dip, ret); + } + + ret = ndi_prop_update_string(DDI_DEV_T_NONE, dip, EIB_PROP_GW_PORT_NAME, + (char 
*)(gwi->gw_port_name)); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_string() failed to set " + "%s property to '%s' for child dip 0x%llx, ret=%d", + EIB_PROP_GW_PORT_NAME, gwi->gw_port_name, dip, ret); + } + + ret = ndi_prop_update_string(DDI_DEV_T_NONE, dip, EIB_PROP_GW_VENDOR_ID, + (char *)(gwi->gw_vendor_id)); + if (ret != DDI_PROP_SUCCESS) { + ENX_DPRINTF_WARN("ndi_prop_update_string() failed to set " + "%s property to '%s' for child dip 0x%llx, ret=%d", + EIB_PROP_GW_VENDOR_ID, gwi->gw_vendor_id, dip, ret); + } +} + +int +eibnx_name_child(dev_info_t *child, char *name, size_t namesz) +{ + char *node_name; + + if ((node_name = ddi_get_parent_data(child)) == NULL) { + ENX_DPRINTF_ERR("ddi_get_parent_data(child=0x%llx) " + "returned NULL", child); + return (DDI_NOT_WELL_FORMED); + } + + /* + * Skip the name and "@" part in the eoib node path and copy the + * address part out to the caller. + */ + (void) strlcpy(name, node_name + strlen(EIB_DRV_NAME) + 1, namesz); + + return (DDI_SUCCESS); +} + +/* + * Synchronization functions to mark/clear the in-progress status of + * bus config/unconfig operations + */ + +void +eibnx_busop_inprog_enter(eibnx_t *ss) +{ + mutex_enter(&ss->nx_busop_lock); + + while (ss->nx_busop_flags & NX_FL_BUSOP_INPROG) + cv_wait(&ss->nx_busop_cv, &ss->nx_busop_lock); + + ss->nx_busop_flags |= NX_FL_BUSOP_INPROG; + + mutex_exit(&ss->nx_busop_lock); +} + +void +eibnx_busop_inprog_exit(eibnx_t *ss) +{ + mutex_enter(&ss->nx_busop_lock); + + ss->nx_busop_flags &= (~NX_FL_BUSOP_INPROG); + + cv_broadcast(&ss->nx_busop_cv); + mutex_exit(&ss->nx_busop_lock); +} + +eibnx_thr_info_t * +eibnx_start_port_monitor(eibnx_hca_t *hca, eibnx_port_t *port) +{ + eibnx_thr_info_t *ti; + kthread_t *kt; + dev_info_t *hca_dip; + const char *hca_drv_name; + int hca_drv_inst; + + ti = kmem_zalloc(sizeof (eibnx_thr_info_t), KM_SLEEP); + + mutex_init(&ti->ti_mcg_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ti->ti_gw_lock, NULL, MUTEX_DRIVER, 
NULL); + mutex_init(&ti->ti_child_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ti->ti_event_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ti->ti_event_cv, NULL, CV_DEFAULT, NULL); + + ti->ti_next = NULL; + ti->ti_hca_guid = hca->hc_guid; + ti->ti_hca = hca->hc_hdl; + ti->ti_pd = hca->hc_pd; + ti->ti_pi = port->po_pi; + ti->ti_ident = kmem_zalloc(MAXNAMELEN, KM_SLEEP); + + /* + * Prepare the "ident" for EoIB nodes from this port monitor. To + * associate eoib instances with the corresponding HCA nodes easily, + * and to make sure eoib instance numbers do not change when + * like-for-like HCA replacements are made, tie up the ident to + * HCA driver name, HCA driver instance and the HCA port number. + * The eoib node address is later composed using this ident and + * the gateway port ids after discovery. + */ + if ((hca_dip = ibtl_ibnex_hcaguid2dip(ti->ti_hca_guid)) == NULL) { + ENX_DPRINTF_WARN("ibtl_ibnex_hcaguid2dip(hca_guid=0x%llx) " + "returned NULL", ti->ti_hca_guid); + } else if ((hca_drv_name = ddi_driver_name(hca_dip)) == NULL) { + ENX_DPRINTF_WARN("hca driver name NULL for " + "hca_guid=0x%llx, hca_dip=0x%llx", + ti->ti_hca_guid, hca_dip); + } else if ((hca_drv_inst = ddi_get_instance(hca_dip)) < 0) { + ENX_DPRINTF_ERR("hca driver instance (%d) invalid for " + "hca_guid=0x%llx, hca_dip=0x%llx", + ti->ti_hca_guid, hca_dip); + } else { + (void) snprintf(ti->ti_ident, MAXNAMELEN, "%s%d,%x", + hca_drv_name, hca_drv_inst, ti->ti_pi->p_port_num); + } + + kt = thread_create(NULL, 0, eibnx_port_monitor, + ti, 0, &p0, TS_RUN, minclsyspri); + + ti->ti_kt_did = kt->t_did; + + return (ti); +} + +void +eibnx_stop_port_monitor(eibnx_thr_info_t *ti) +{ + /* + * Tell the port monitor thread to stop and wait for it to + * happen. Before marking it for death, make sure there + * aren't any completions being processed. 
+ */ + mutex_enter(&ti->ti_event_lock); + while (ti->ti_event & ENX_EVENT_COMPLETION) { + cv_wait(&ti->ti_event_cv, &ti->ti_event_lock); + } + ti->ti_event |= ENX_EVENT_DIE; + cv_broadcast(&ti->ti_event_cv); + mutex_exit(&ti->ti_event_lock); + + thread_join(ti->ti_kt_did); + + /* + * Destroy synchronization primitives initialized for this ti + */ + cv_destroy(&ti->ti_event_cv); + mutex_destroy(&ti->ti_event_lock); + mutex_destroy(&ti->ti_child_lock); + mutex_destroy(&ti->ti_gw_lock); + mutex_destroy(&ti->ti_mcg_lock); + + kmem_free(ti->ti_ident, MAXNAMELEN); + kmem_free(ti, sizeof (eibnx_thr_info_t)); +} + +void +eibnx_terminate_monitors(void) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti_list; + eibnx_thr_info_t *ti; + eibnx_thr_info_t *ti_next; + + mutex_enter(&ss->nx_lock); + ti_list = ss->nx_thr_info; + ss->nx_thr_info = NULL; + mutex_exit(&ss->nx_lock); + + /* + * Ask all the port_monitor threads to die. Before marking them + * for death, make sure there aren't any completions being + * processed by the thread. 
+ */ + for (ti = ti_list; ti; ti = ti_next) { + ti_next = ti->ti_next; + eibnx_stop_port_monitor(ti); + } + + mutex_enter(&ss->nx_lock); + ss->nx_monitors_up = B_FALSE; + mutex_exit(&ss->nx_lock); +} + +int +eibnx_configure_node(eibnx_thr_info_t *ti, eibnx_gw_info_t *gwi, + dev_info_t **childp) +{ + eibnx_t *ss = enx_global_ss; + dev_info_t *child_dip; + char *node_name; + int circular; + int ret; + + /* + * Prepare the new node's name + */ + if ((node_name = eibnx_make_nodename(ti, gwi->gw_portid)) == NULL) + return (ENX_E_FAILURE); + + ndi_devi_enter(ss->nx_dip, &circular); + + if (child_dip = ndi_devi_findchild(ss->nx_dip, node_name)) { + ret = eibnx_update_child(ti, gwi, child_dip); + if (ret == ENX_E_SUCCESS) { + ndi_devi_exit(ss->nx_dip, circular); + kmem_free(node_name, MAXNAMELEN); + + if (childp) { + *childp = child_dip; + } + return (ENX_E_SUCCESS); + } + } + + /* + * If the node does not already exist, we may need to create it + */ + if (child_dip == NULL) { + ndi_devi_alloc_sleep(ss->nx_dip, EIB_DRV_NAME, + (pnode_t)DEVI_SID_NODEID, &child_dip); + + ddi_set_parent_data(child_dip, node_name); + eibnx_create_node_props(child_dip, ti, gwi); + } + + /* + * Whether there was no devinfo node at all for the given node name or + * we had a devinfo node, but it wasn't in our list of eoib children, + * we'll try to online the instance here. 
+ */ + ENX_DPRINTF_DEBUG("onlining %s", node_name); + ret = ndi_devi_online(child_dip, 0); + if (ret != NDI_SUCCESS) { + ENX_DPRINTF_ERR("ndi_devi_online(node_name=%s) failed " + "with ret=0x%x", node_name, ret); + + ddi_set_parent_data(child_dip, NULL); + (void) ndi_devi_free(child_dip); + + ndi_devi_exit(ss->nx_dip, circular); + kmem_free(node_name, MAXNAMELEN); + + return (ENX_E_FAILURE); + } + + eibnx_enqueue_child(ti, gwi, node_name, child_dip); + + ndi_devi_exit(ss->nx_dip, circular); + + if (childp) { + *childp = child_dip; + } + + return (ENX_E_SUCCESS); +} + +int +eibnx_unconfigure_node(eibnx_thr_info_t *ti, eibnx_gw_info_t *gwi) +{ + /* + * To unconfigure an eoib node, we only need to set the child's + * dip to NULL. When the node gets configured again, we either + * find the dip for the pathname and set it in this child, or + * allocate a new dip and set it in this child. + */ + return (eibnx_update_child(ti, gwi, NULL)); +} + +int +eibnx_locate_node_name(char *devname, eibnx_thr_info_t **ti_p, + eibnx_gw_info_t **gwi_p) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + eibnx_gw_info_t *gwi; + char name[MAXNAMELEN]; + + /* + * Locate the port monitor thread info and gateway info + * that corresponds to the supplied devname. 
+ */ + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + if (ti->ti_ident[0] == '\0') + continue; + + mutex_enter(&ti->ti_gw_lock); + for (gwi = ti->ti_gw; gwi; gwi = gwi->gw_next) { + (void) snprintf(name, MAXNAMELEN, + "%s@%s,%x", EIB_DRV_NAME, ti->ti_ident, + gwi->gw_portid); + + if (strcmp(name, devname) == 0) + break; + } + mutex_exit(&ti->ti_gw_lock); + + if (gwi) { + break; + } + } + mutex_exit(&ss->nx_lock); + + if (ti == NULL || gwi == NULL) { + return (ENX_E_FAILURE); + } + + *ti_p = ti; + *gwi_p = gwi; + + return (ENX_E_SUCCESS); +} + +int +eibnx_locate_unconfigured_node(eibnx_thr_info_t **ti_p, eibnx_gw_info_t **gwi_p) +{ + eibnx_t *ss = enx_global_ss; + eibnx_thr_info_t *ti; + eibnx_child_t *ch; + + mutex_enter(&ss->nx_lock); + for (ti = ss->nx_thr_info; ti; ti = ti->ti_next) { + mutex_enter(&ti->ti_child_lock); + for (ch = ti->ti_child; ch; ch = ch->ch_next) { + if (ch->ch_dip == NULL) { + *ti_p = ti; + *gwi_p = ch->ch_gwi; + + mutex_exit(&ti->ti_child_lock); + mutex_exit(&ss->nx_lock); + + return (ENX_E_SUCCESS); + } + } + mutex_exit(&ti->ti_child_lock); + } + mutex_exit(&ss->nx_lock); + + return (ENX_E_FAILURE); +} + +static char * +eibnx_make_nodename(eibnx_thr_info_t *info, uint16_t gw_portid) +{ + char *name; + + if (info->ti_ident[0] == NULL) + return (NULL); + + name = kmem_zalloc(MAXNAMELEN, KM_SLEEP); + (void) snprintf(name, MAXNAMELEN, "%s@%s,%x", EIB_DRV_NAME, + info->ti_ident, gw_portid); + + return (name); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/io/ib/clients/eoib/enx_q.c Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,644 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> + +#include <sys/ib/clients/eoib/enx_impl.h> + +/* + * Acquire an SWQE + */ + +/*ARGSUSED*/ +eibnx_wqe_t * +eibnx_acquire_swqe(eibnx_thr_info_t *info, int flag) +{ + eibnx_wqe_t *wqe = NULL; + eibnx_tx_t *snd_p = &info->ti_snd; + int i; + + for (i = 0; i < ENX_NUM_SWQE; i++) { + wqe = &(snd_p->tx_wqe[i]); + + mutex_enter(&wqe->qe_lock); + if ((wqe->qe_flags & ENX_QEFL_INUSE) == 0) { + wqe->qe_flags |= ENX_QEFL_INUSE; + mutex_exit(&wqe->qe_lock); + break; + } + mutex_exit(&wqe->qe_lock); + } + + /* + * We probably have enough swqe entries for doing our solicitations. + * If we find it not enough in practice, we need to implement some + * sort of dynamic allocation. + */ + if (i == ENX_NUM_SWQE) + wqe = NULL; + + return (wqe); +} + +/* + * Return a SWQE from completion. 
We may have to release + * it or keep it. + */ +void +eibnx_return_swqe(eibnx_wqe_t *wqe) +{ + ASSERT(wqe->qe_type == ENX_QETYP_SWQE); + + mutex_enter(&wqe->qe_lock); + + /* + * This send wqe is from the completion queue. We need to + * clear the 'posted' flag first. + */ + ASSERT((wqe->qe_flags & ENX_QEFL_POSTED) == ENX_QEFL_POSTED); + wqe->qe_flags &= (~ENX_QEFL_POSTED); + + /* + * See if we need to release this send wqe back to the pool + * on completion. We may not need to do so if, for example, + * this were a swqe acquired specifically for a particular gw. + */ + if (wqe->qe_flags & ENX_QEFL_RELONCOMP) { + wqe->qe_sgl.ds_len = wqe->qe_bufsz; + wqe->qe_flags &= (~ENX_QEFL_INUSE); + + wqe->qe_flags &= (~ENX_QEFL_RELONCOMP); + } + + mutex_exit(&wqe->qe_lock); +} + +/* + * Return a RWQE from completion. We probably have to repost it. + */ +void +eibnx_return_rwqe(eibnx_thr_info_t *info, eibnx_wqe_t *wqe) +{ + ibt_status_t ret; + + ASSERT(wqe->qe_type == ENX_QETYP_RWQE); + + mutex_enter(&wqe->qe_lock); + + /* + * We should never need to free an rwqe on completion. + */ + ASSERT((wqe->qe_flags & ENX_QEFL_RELONCOMP) == 0); + + /* + * An rwqe is always in-use and posted, so we only need to make + * sure the ds_len is adjusted back to the value it's supposed + * to have. + */ + wqe->qe_sgl.ds_len = wqe->qe_bufsz; + + /* + * Repost the recv wqe + */ + ret = ibt_post_recv(info->ti_chan, &(wqe->qe_wr.recv), 1, NULL); + if (ret != IBT_SUCCESS) { + ENX_DPRINTF_WARN("ibt_post_recv(chan_hdl=0x%llx) failed, " + "ret=%d", info->ti_chan, ret); + } + + mutex_exit(&wqe->qe_lock); +} + +/* + * Release an SWQE that was acquired earlier. + */ +void +eibnx_release_swqe(eibnx_wqe_t *wqe) +{ + ASSERT(wqe->qe_type == ENX_QETYP_SWQE); + + mutex_enter(&wqe->qe_lock); + + /* + * Make sure this swqe is in use. Since this routine may also be + * called when we're trying to cleanup the eoib nodes, we + * should clear all flag bits. 
+ */ + ASSERT((wqe->qe_flags & ENX_QEFL_INUSE) == ENX_QEFL_INUSE); + wqe->qe_flags = 0; + + mutex_exit(&wqe->qe_lock); +} + +/* + * Insert the passed child to the head of the queue + */ +void +eibnx_enqueue_child(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi, + char *node_name, dev_info_t *dip) +{ + eibnx_child_t *ch; + eibnx_child_t *new_ch; + + new_ch = kmem_zalloc(sizeof (eibnx_child_t), KM_SLEEP); + new_ch->ch_dip = dip; + new_ch->ch_node_name = node_name; + new_ch->ch_gwi = gwi; + + mutex_enter(&info->ti_child_lock); + + /* + * Search existing children to see if we already have this + * child. If so, simply update its dip and node_name + */ + for (ch = info->ti_child; ch; ch = ch->ch_next) { + if (ch->ch_gwi->gw_portid == gwi->gw_portid) { + ch->ch_dip = dip; + if (ch->ch_node_name) { + kmem_free(ch->ch_node_name, MAXNAMELEN); + } + ch->ch_node_name = node_name; + kmem_free(new_ch, sizeof (eibnx_child_t)); + return; + } + } + + /* + * If not, add the new child to the list of children + */ + new_ch->ch_next = info->ti_child; + info->ti_child = new_ch; + + mutex_exit(&info->ti_child_lock); +} + +int +eibnx_update_child(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi, + dev_info_t *dip) +{ + eibnx_child_t *ch; + + mutex_enter(&info->ti_child_lock); + for (ch = info->ti_child; ch; ch = ch->ch_next) { + if (ch->ch_gwi->gw_portid == gwi->gw_portid) { + if (ch->ch_dip != dip) { + ENX_DPRINTF_DEBUG("updating child dip for " + "gw portid 0x%x to 0x%llx", + gwi->gw_portid, dip); + ch->ch_dip = dip; + } + mutex_exit(&info->ti_child_lock); + + return (ENX_E_SUCCESS); + } + } + mutex_exit(&info->ti_child_lock); + + return (ENX_E_FAILURE); +} + +dev_info_t * +eibnx_find_child_dip_by_inst(eibnx_thr_info_t *info, int inst) +{ + eibnx_child_t *ch; + dev_info_t *dip = NULL; + + mutex_enter(&info->ti_child_lock); + for (ch = info->ti_child; ch != NULL; ch = ch->ch_next) { + dip = ch->ch_dip; + if (ddi_get_instance(dip) == inst) + break; + } + mutex_exit(&info->ti_child_lock); + + 
return (dip); +} + +dev_info_t * +eibnx_find_child_dip_by_gw(eibnx_thr_info_t *info, uint16_t gw_portid) +{ + eibnx_child_t *ch; + dev_info_t *dip = NULL; + + mutex_enter(&info->ti_child_lock); + for (ch = info->ti_child; ch != NULL; ch = ch->ch_next) { + dip = ch->ch_dip; + if (ch->ch_gwi->gw_portid == gw_portid) + break; + } + mutex_exit(&info->ti_child_lock); + + return (dip); +} + +/* + * See if the passed gateway is already found in our list. Note + * that we assume that the gateway port id uniquely identifies each + * gateway. + */ +eibnx_gw_info_t * +eibnx_find_gw_in_gwlist(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi) +{ + eibnx_gw_info_t *lgw = NULL; + + mutex_enter(&info->ti_gw_lock); + for (lgw = info->ti_gw; lgw; lgw = lgw->gw_next) { + if (lgw->gw_portid == gwi->gw_portid) + break; + } + mutex_exit(&info->ti_gw_lock); + + return (lgw); +} + +/* + * Add a newly discovered gateway to the gateway list. Since we'll + * need to send unicast solicitations to this gateway soon, we'll + * also grab a swqe entry, and initialize basic gw adress parameters + * such as the gid, qpn, qkey and pkey of the GW. When we eventually + * get to sending the unicast to this gateway for the first time, + * we'll discover the path to this gateway using these parameters + * and modify the ud destination handle appropriately. 
+ */ +eibnx_gw_info_t * +eibnx_add_gw_to_gwlist(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi, + ibt_wc_t *wc, uint8_t *recv_buf) +{ + eibnx_gw_info_t *new_gwi; + eibnx_wqe_t *wqe; + ib_grh_t *grh; + ib_gid_t sgid; + clock_t timeout_usecs; + + /* + * For now, we'll simply do KM_NOSLEEP allocation, since this code + * is called from within rx processing + */ + new_gwi = kmem_zalloc(sizeof (eibnx_gw_info_t), KM_NOSLEEP); + if (new_gwi == NULL) { + ENX_DPRINTF_WARN("no memory, gw port_id 0x%x " + "will be ignored by hca_guid=0x%llx, port=0x%x", + gwi->gw_portid, info->ti_hca_guid, + info->ti_pi->p_port_num); + return (NULL); + } + + /* + * We also need to acquire a send wqe to do unicast solicitations + * to this gateway later on. We should've enough pre-allocated swqes + * to do this without sleeping. + */ + if ((wqe = eibnx_acquire_swqe(info, KM_NOSLEEP)) == NULL) { + ENX_DPRINTF_WARN("no swqe available, gw port_id 0x%x " + "will be ignored by hca_guid=0x%llx, port=0x%x", + gwi->gw_portid, info->ti_hca_guid, + info->ti_pi->p_port_num); + kmem_free(new_gwi, sizeof (eibnx_gw_info_t)); + return (NULL); + } + + /* + * Initialize gw state and wqe information. + */ + new_gwi->gw_next = NULL; + new_gwi->gw_swqe = wqe; + new_gwi->gw_state = gwi->gw_state; + + /* + * Set up gateway advertisement monitoring parameters. Since we + * always need to check against a timeout value of 2.5 * gw_adv_period, + * we'll keep this pre-calculated value as well. + */ + mutex_init(&new_gwi->gw_adv_lock, NULL, MUTEX_DRIVER, NULL); + new_gwi->gw_adv_flag = gwi->gw_adv_flag; + new_gwi->gw_adv_last_lbolt = ddi_get_lbolt64(); + timeout_usecs = gwi->gw_adv_period * 1000; + timeout_usecs = ((timeout_usecs << 2) + timeout_usecs) >> 1; + new_gwi->gw_adv_timeout_ticks = drv_usectohz(timeout_usecs); + + /* + * Initialize gateway address information. Note that if the message has + * a GRH, we'll use the subnet prefix, otherwise we'll assume that the + * gateway is in the same subnet as ourselves. 
+ */ + new_gwi->gw_addr.ga_vect = NULL; + if (wc->wc_flags & IBT_WC_GRH_PRESENT) { + grh = (ib_grh_t *)(uintptr_t)recv_buf; + new_gwi->gw_addr.ga_gid.gid_prefix = + ntohll(grh->SGID.gid_prefix); + } else { + sgid = info->ti_pi->p_sgid_tbl[0]; + new_gwi->gw_addr.ga_gid.gid_prefix = + sgid.gid_prefix; + } + new_gwi->gw_addr.ga_gid.gid_guid = gwi->gw_guid; + new_gwi->gw_addr.ga_qpn = gwi->gw_ctrl_qpn; + new_gwi->gw_addr.ga_qkey = EIB_FIP_QKEY; + new_gwi->gw_addr.ga_pkey = EIB_ADMIN_PKEY; + + /* + * Initialize gateway parameters received via the advertisement + */ + new_gwi->gw_system_guid = gwi->gw_system_guid; + new_gwi->gw_guid = gwi->gw_guid; + new_gwi->gw_adv_period = gwi->gw_adv_period; + new_gwi->gw_ka_period = gwi->gw_ka_period; + new_gwi->gw_vnic_ka_period = gwi->gw_vnic_ka_period; + new_gwi->gw_ctrl_qpn = gwi->gw_ctrl_qpn; + new_gwi->gw_lid = gwi->gw_lid; + new_gwi->gw_portid = gwi->gw_portid; + new_gwi->gw_num_net_vnics = gwi->gw_num_net_vnics; + new_gwi->gw_is_host_adm_vnics = gwi->gw_is_host_adm_vnics; + new_gwi->gw_sl = gwi->gw_sl; + new_gwi->gw_n_rss_qpn = gwi->gw_n_rss_qpn; + new_gwi->gw_flag_ucast_advt = gwi->gw_flag_ucast_advt; + new_gwi->gw_flag_available = gwi->gw_flag_available; + bcopy(gwi->gw_system_name, new_gwi->gw_system_name, + sizeof (new_gwi->gw_system_name)); + bcopy(gwi->gw_port_name, new_gwi->gw_port_name, + sizeof (new_gwi->gw_port_name)); + bcopy(gwi->gw_vendor_id, new_gwi->gw_vendor_id, + sizeof (new_gwi->gw_vendor_id)); + + /* + * Queue up the new gwi and return it + */ + mutex_enter(&info->ti_gw_lock); + new_gwi->gw_next = info->ti_gw; + info->ti_gw = new_gwi; + mutex_exit(&info->ti_gw_lock); + + return (new_gwi); +} + +/* + * Update old data for the gateway in our list with the new data. 
+ */ +void +eibnx_replace_gw_in_gwlist(eibnx_thr_info_t *info, eibnx_gw_info_t *orig_gwi, + eibnx_gw_info_t *new_gwi, ibt_wc_t *wc, uint8_t *recv_buf, + boolean_t *gwi_changed) +{ + ib_sn_prefix_t new_gw_sn_prefix; + ib_grh_t *grh; + ib_gid_t sgid; + boolean_t changed = B_FALSE; + boolean_t gw_addr_changed = B_TRUE; + + /* + * We'll update all info received in the new advertisement in + * the original gwi and also move the gw_state to that of the state + * in the new gwi. + */ + mutex_enter(&info->ti_gw_lock); + + orig_gwi->gw_state = new_gwi->gw_state; + + /* + * The guids shouldn't really change for the "same" gateway + */ + if (new_gwi->gw_system_guid != orig_gwi->gw_system_guid) { + ENX_DPRINTF_WARN("gateway system guid changed for the " + "*same* gateway from 0x%llx to 0x%llx", + orig_gwi->gw_system_guid, new_gwi->gw_system_guid); + + orig_gwi->gw_system_guid = new_gwi->gw_system_guid; + changed = B_TRUE; + } + if (new_gwi->gw_guid != orig_gwi->gw_guid) { + ENX_DPRINTF_WARN("gateway guid changed for the " + "*same* gateway from 0x%llx to 0x%llx", + orig_gwi->gw_guid, new_gwi->gw_guid); + + orig_gwi->gw_guid = new_gwi->gw_guid; + changed = B_TRUE; + gw_addr_changed = B_TRUE; + } + + if (new_gwi->gw_adv_period != orig_gwi->gw_adv_period) { + ENX_DPRINTF_DEBUG("gateway adv period changed " + "from 0x%lx to 0x%lx", orig_gwi->gw_adv_period, + new_gwi->gw_adv_period); + + orig_gwi->gw_adv_period = new_gwi->gw_adv_period; + changed = B_TRUE; + } + if (new_gwi->gw_ka_period != orig_gwi->gw_ka_period) { + ENX_DPRINTF_DEBUG("gateway ka period changed " + "from 0x%lx to 0x%lx", orig_gwi->gw_ka_period, + new_gwi->gw_ka_period); + + orig_gwi->gw_ka_period = new_gwi->gw_ka_period; + changed = B_TRUE; + } + if (new_gwi->gw_vnic_ka_period != orig_gwi->gw_vnic_ka_period) { + ENX_DPRINTF_DEBUG("vnic ka period changed " + "from 0x%lx to 0x%lx", orig_gwi->gw_vnic_ka_period, + new_gwi->gw_vnic_ka_period); + + orig_gwi->gw_vnic_ka_period = new_gwi->gw_vnic_ka_period; + changed = 
B_TRUE; + } + if (new_gwi->gw_ctrl_qpn != orig_gwi->gw_ctrl_qpn) { + ENX_DPRINTF_DEBUG("gateway control qpn changed " + "from 0x%lx to 0x%lx", orig_gwi->gw_ctrl_qpn, + new_gwi->gw_ctrl_qpn); + + orig_gwi->gw_ctrl_qpn = new_gwi->gw_ctrl_qpn; + changed = B_TRUE; + } + if (new_gwi->gw_lid != orig_gwi->gw_lid) { + ENX_DPRINTF_DEBUG("gateway lid changed from 0x%x to 0x%x", + orig_gwi->gw_lid, new_gwi->gw_lid); + + orig_gwi->gw_lid = new_gwi->gw_lid; + changed = B_TRUE; + gw_addr_changed = B_TRUE; + } + + /* + * The identity of the gateway is currently defined by its portid, + * so this cannot be different or eibnx_find_gw_in_gwlist() wouldn't + * have thought it's the same. For now though, we'll treat it + * like any other parameter, and flag it if we find this different. + */ + if (new_gwi->gw_portid != orig_gwi->gw_portid) { + ENX_DPRINTF_WARN("gateway portid changed for the *same* " + "gateway from 0x%x to 0x%x", orig_gwi->gw_portid, + new_gwi->gw_portid); + + orig_gwi->gw_portid = new_gwi->gw_portid; + changed = B_TRUE; + } + + if (new_gwi->gw_is_host_adm_vnics != orig_gwi->gw_is_host_adm_vnics) { + ENX_DPRINTF_DEBUG("host adm vnics changed from 0x%x to 0x%x", + orig_gwi->gw_is_host_adm_vnics, + new_gwi->gw_is_host_adm_vnics); + + orig_gwi->gw_is_host_adm_vnics = new_gwi->gw_is_host_adm_vnics; + changed = B_TRUE; + } + if (new_gwi->gw_sl != orig_gwi->gw_sl) { + ENX_DPRINTF_DEBUG("gateway sl changed from 0x%x to 0x%x", + orig_gwi->gw_sl, new_gwi->gw_sl); + + orig_gwi->gw_sl = new_gwi->gw_sl; + changed = B_TRUE; + } + if (new_gwi->gw_n_rss_qpn != orig_gwi->gw_n_rss_qpn) { + ENX_DPRINTF_DEBUG("gateway n_rss_qpn changed from 0x%x to 0x%x", + orig_gwi->gw_n_rss_qpn, new_gwi->gw_n_rss_qpn); + + orig_gwi->gw_n_rss_qpn = new_gwi->gw_n_rss_qpn; + changed = B_TRUE; + } + + /* + * The gw_flag_ucast_advt and gw_flag_available are expected to + * change over time (and even gw_num_net_vnics could change, but + * it's of no use to us presently), and we shouldn't trigger any + * 
flag for these + */ + orig_gwi->gw_flag_ucast_advt = new_gwi->gw_flag_ucast_advt; + orig_gwi->gw_flag_available = new_gwi->gw_flag_available; + orig_gwi->gw_num_net_vnics = new_gwi->gw_num_net_vnics; + + if (strncmp((const char *)new_gwi->gw_system_name, + (const char *)orig_gwi->gw_system_name, EIB_GW_SYSNAME_LEN) != 0) { + ENX_DPRINTF_DEBUG("gateway system name changed from %s to %s", + orig_gwi->gw_system_name, new_gwi->gw_system_name); + + bcopy(new_gwi->gw_system_name, orig_gwi->gw_system_name, + EIB_GW_SYSNAME_LEN); + changed = B_TRUE; + } + if (strncmp((const char *)new_gwi->gw_port_name, + (const char *)orig_gwi->gw_port_name, EIB_GW_PORTNAME_LEN) != 0) { + ENX_DPRINTF_DEBUG("gateway port name changed from %s to %s", + orig_gwi->gw_port_name, new_gwi->gw_port_name); + + bcopy(new_gwi->gw_port_name, orig_gwi->gw_port_name, + EIB_GW_PORTNAME_LEN); + changed = B_TRUE; + } + if (strncmp((const char *)new_gwi->gw_vendor_id, + (const char *)orig_gwi->gw_vendor_id, EIB_GW_VENDOR_LEN) != 0) { + ENX_DPRINTF_DEBUG("vendor id changed from %s to %s", + orig_gwi->gw_vendor_id, new_gwi->gw_vendor_id); + + bcopy(new_gwi->gw_vendor_id, orig_gwi->gw_vendor_id, + EIB_GW_VENDOR_LEN); + changed = B_TRUE; + } + + /* + * See if the subnet prefix for the gateway has changed + */ + if (wc->wc_flags & IBT_WC_GRH_PRESENT) { + grh = (ib_grh_t *)(uintptr_t)recv_buf; + new_gw_sn_prefix = ntohll(grh->SGID.gid_prefix); + } else { + sgid = info->ti_pi->p_sgid_tbl[0]; + new_gw_sn_prefix = sgid.gid_prefix; + } + if (new_gw_sn_prefix != orig_gwi->gw_addr.ga_gid.gid_prefix) { + ENX_DPRINTF_WARN("subnet prefix changed from 0x%llx to 0x%llx", + orig_gwi->gw_addr.ga_gid.gid_prefix, new_gw_sn_prefix); + + changed = B_TRUE; + gw_addr_changed = B_TRUE; + } + + /* + * If the gateway address has changed in any way, clear the current + * address vector and update the gateway guid and gateway qpn. The + * address vector will be created the next time a unicast solicit + * is attempted for this gateway. 
+ */ + if (gw_addr_changed) { + if (orig_gwi->gw_addr.ga_vect != NULL) { + kmem_free(orig_gwi->gw_addr.ga_vect, + sizeof (ibt_adds_vect_t)); + orig_gwi->gw_addr.ga_vect = NULL; + } + orig_gwi->gw_addr.ga_gid.gid_prefix = new_gw_sn_prefix; + orig_gwi->gw_addr.ga_gid.gid_guid = new_gwi->gw_guid; + orig_gwi->gw_addr.ga_qpn = new_gwi->gw_ctrl_qpn; + orig_gwi->gw_addr.ga_qkey = EIB_FIP_QKEY; + orig_gwi->gw_addr.ga_pkey = EIB_ADMIN_PKEY; + } + + mutex_exit(&info->ti_gw_lock); + + if (gwi_changed) { + *gwi_changed = changed; + } +} + +/* + * Queue up a node for EoIB instantiation and wake up the thread + * that creates eoib nodes. + */ +void +eibnx_queue_for_creation(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi) +{ + eibnx_t *ss = enx_global_ss; + eibnx_nodeq_t *new_node; + + /* + * For now, we'll simply do KM_NOSLEEP allocation, since this + * code is called from within rx processing + */ + new_node = kmem_zalloc(sizeof (eibnx_nodeq_t), KM_NOSLEEP); + if (new_node == NULL) { + ENX_DPRINTF_WARN("no memory, eoib node will not be " + "created for hca_guid=0x%llx, hca_port=0x%x, " + "gw_port_id=0x%x", info->ti_hca_guid, + info->ti_pi->p_port_num, gwi->gw_portid); + return; + } + new_node->nc_info = info; + new_node->nc_gwi = gwi; + + /* + * If the eoib node creation thread is dying (or dead), don't + * queue up any more requests for creation + */ + mutex_enter(&ss->nx_nodeq_lock); + if (ss->nx_nodeq_thr_die) { + kmem_free(new_node, sizeof (eibnx_nodeq_t)); + } else { + new_node->nc_next = ss->nx_nodeq; + ss->nx_nodeq = new_node; + cv_signal(&ss->nx_nodeq_cv); + } + mutex_exit(&ss->nx_nodeq_lock); +}
--- a/usr/src/uts/common/io/ib/ibtl/ibtl_ibnex.c Fri Aug 13 14:44:26 2010 +0800 +++ b/usr/src/uts/common/io/ib/ibtl/ibtl_ibnex.c Fri Aug 13 07:02:57 2010 -0400 @@ -18,9 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/systm.h> @@ -517,9 +517,10 @@ * For a given pdip, of Port/VPPA devices, match it against all the * registered HCAs's dip. If match found return IBT_SUCCESS, * else IBT_NO_HCAS_AVAILABLE. + * * For IOC/Pseudo devices check if the given pdip is that of - * the ib(7d) nexus. If yes return IBT_SUCCESS, - * else IBT_NO_HCAS_AVAILABLE. + * the ib(7d) nexus or that of the eoib(7d) nexus. If yes + * return IBT_SUCCESS, else IBT_NO_HCAS_AVAILABLE. */ ibt_status_t ibtl_ibnex_valid_hca_parent(dev_info_t *pdip) @@ -530,9 +531,10 @@ pdip); /* For Pseudo devices and IOCs */ - if (strncmp(ddi_node_name(pdip), "ib", 2) == 0) + if (strncmp(ddi_node_name(pdip), "ib", 2) == 0 || + strncmp(ddi_node_name(pdip), "eibnx", 5) == 0) { return (IBT_SUCCESS); - else { + } else { /* For Port devices and VPPAs */ mutex_enter(&ibtl_clnt_list_mutex); hca_devp = ibtl_hca_list;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/ib/clients/eoib/eib.h Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,189 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_IB_EOIB_EIB_H +#define _SYS_IB_EOIB_EIB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * + * EoIB Encapsulation Header Layout + * + * 31 30 29 28 27 26 25 24 22 21 20 ... 
16 15 0 + +-----+-----+-----+-----+--+---+--+---------+-------------------------+ + | sig | ver | TCP | IP | |fcs|ms| segment | segment id | + | | | chk | chk | | | | offset | | + +-----+-----+-----+-----+--+---+--+---------+-------------------------+ + * + */ +#define EIB_ENCAP_HDR_SZ 4 + +#define EIB_ENCAP_SIGN_MASK 0x3 +#define EIB_ENCAP_SIGN_SHIFT 30 +#define EIB_ENCAP_VER_MASK 0x3 +#define EIB_ENCAP_VER_SHIFT 28 +#define EIB_ENCAP_TCPCHK_MASK 0x3 +#define EIB_ENCAP_TCPCHK_SHIFT 26 +#define EIB_ENCAP_IPCHK_MASK 0x3 +#define EIB_ENCAP_IPCHK_SHIFT 24 +#define EIB_ENCAP_FCS_B_SHIFT 22 +#define EIB_ENCAP_MS_B_SHIFT 21 +#define EIB_ENCAP_SEGOFF_MASK 0x1F +#define EIB_ENCAP_SEGOFF_SHIFT 16 +#define EIB_ENCAP_SEGID_MASK 0xFFFF + +/* + * Bit fields values definitions + */ +#define EIB_EH_SIGNATURE 3 +#define EIB_EH_VERSION 0 +#define EIB_EH_CSUM_UNKNOWN 0 +#define EIB_EH_TCPCSUM_OK 1 +#define EIB_EH_UDPCSUM_OK 2 +#define EIB_EH_CSUM_BAD 3 +#define EIB_EH_IPCSUM_OK 1 + +/* + * Some shortcuts + */ +#define EIB_TX_ENCAP_HDR 0xC0000000 +#define EIB_RX_ENCAP_TCPIP_OK 0xC5000000 +#define EIB_RX_ENCAP_UDPIP_OK 0xC9000000 + +/* + * Driver name + */ +#define EIB_DRV_NAME "eoib" + +/* + * Currently, the gateway responds to login requests on the qpn that carried + * the solicitation request, rather than on the qpn that carried the login + * request. This means that EoIB nexus receives the acknowledgements from + * gateways to login requests made by the individual EoIB instances, and must + * pass this login ack information back to the appropriate EoIB instance. + * + * Now, the only field in the login ack packet that could identify the + * individual EoIB instance is the vNIC id field, but this is a 16-bit field, + * with the MSB reserved to indicate whether the mac/vlan is host-managed + * or gateway-managed. This leaves us with just 15-bits to encode the EoIB + * device instance and its Solaris vnic instance. 
For now, we divide this + * field as a 6-bit vnic instance number (max Solaris vnics is 64) and a + * 9-bit device instance number (max EoIB pseudo-NICs in a system is 512). + * + * The long-term solution is to get the gateway to respond directly to the + * login requestor, so the requestor can use all 15-bits to identify its + * Solaris vnic instance (max 32K) and leave the device instance limit to + * the system limit. + */ +#define EIB_DVI_SHIFT 6 +#define EIB_DVI_MASK 0x1FF +#define EIB_VNI_MASK 0x03F + +#define EIB_VNIC_INSTANCE(id) ((id) & EIB_VNI_MASK) +#define EIB_DEVI_INSTANCE(id) (((id) >> EIB_DVI_SHIFT) & EIB_DVI_MASK) +#define EIB_VNIC_ID(dvi, vni) \ + ((((dvi) & EIB_DVI_MASK) << EIB_DVI_SHIFT) | ((vni) & EIB_VNI_MASK)) + +/* + * Making VHUB_ID from vlan and portid + */ +#define EIB_VHUB_ID(portid, vlan) \ + ((((uint_t)(portid) & 0xfff) << 12) | ((uint_t)(vlan) & 0xfff)) + +/* + * NDI Events that individual EoIB instance will be interested in + */ +#define EIB_NDI_EVENT_GW_AVAILABLE "SUNW,eoib:gateway-available" +#define EIB_NDI_EVENT_LOGIN_ACK "SUNW,eoib:vnic-login-ack" +#define EIB_NDI_EVENT_GW_INFO_UPDATE "SUNW,eoib:gateway-info-update" + +/* + * Properties for each eoib node created + */ +#define EIB_PROP_HCA_GUID "hca-guid" +#define EIB_PROP_HCA_PORTNUM "hca-port#" +#define EIB_PROP_GW_SYS_GUID "gw-system-guid" +#define EIB_PROP_GW_GUID "gw-guid" +#define EIB_PROP_GW_SN_PREFIX "gw-sn-prefix" +#define EIB_PROP_GW_ADV_PERIOD "gw-adv-period" +#define EIB_PROP_GW_KA_PERIOD "gw-ka-period" +#define EIB_PROP_VNIC_KA_PERIOD "vnic-ka-period" +#define EIB_PROP_GW_CTRL_QPN "gw-ctrl-qpn" +#define EIB_PROP_GW_LID "gw-lid" +#define EIB_PROP_GW_PORTID "gw-portid" +#define EIB_PROP_GW_NUM_NET_VNICS "gw-num-net-vnics" +#define EIB_PROP_GW_AVAILABLE "gw-available?" +#define EIB_PROP_GW_HOST_VNICS "gw-host-vnics?" 
+#define EIB_PROP_GW_SL "gw-sl" +#define EIB_PROP_GW_N_RSS_QPN "gw-n-rss-qpn" +#define EIB_PROP_GW_SYS_NAME "gw-system-name" +#define EIB_PROP_GW_PORT_NAME "gw-port-name" +#define EIB_PROP_GW_VENDOR_ID "gw-vendor-id" + +/* + * Gateway information passed by eibnx to eoib. The lengths of character + * strings should be longer than what is defined for these objects in fip.h, + * to accommodate the terminating null. + */ +#define EIB_GW_SYSNAME_LEN 40 +#define EIB_GW_PORTNAME_LEN 12 +#define EIB_GW_VENDOR_LEN 12 + +typedef struct eib_gw_info_s { + ib_guid_t gi_system_guid; + ib_guid_t gi_guid; + ib_sn_prefix_t gi_sn_prefix; + uint32_t gi_adv_period; + uint32_t gi_ka_period; + uint32_t gi_vnic_ka_period; + ib_qpn_t gi_ctrl_qpn; + ib_lid_t gi_lid; + uint16_t gi_portid; + uint16_t gi_num_net_vnics; + uint8_t gi_flag_available; + uint8_t gi_is_host_adm_vnics; + uint8_t gi_sl; + uint8_t gi_n_rss_qpn; + uint8_t gi_system_name[EIB_GW_SYSNAME_LEN]; + uint8_t gi_port_name[EIB_GW_PORTNAME_LEN]; + uint8_t gi_vendor_id[EIB_GW_VENDOR_LEN]; +} eib_gw_info_t; + +/* + * Softint priority levels to use for data and control/admin cq handling + * in EoIB leaf and nexus drivers + */ +#define EIB_SOFTPRI_DATA (DDI_INTR_SOFTPRI_MIN) +#define EIB_SOFTPRI_CTL (DDI_INTR_SOFTPRI_MIN + 1) +#define EIB_SOFTPRI_ADM (DDI_INTR_SOFTPRI_MIN + 1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_IB_EOIB_EIB_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/ib/clients/eoib/eib_impl.h Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,991 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_IB_EOIB_EIB_IMPL_H +#define _SYS_IB_EOIB_EIB_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/ddi.h> +#include <sys/mac.h> +#include <sys/sunddi.h> +#include <sys/varargs.h> +#include <sys/vlan.h> +#include <sys/ib/ibtl/ibti.h> +#include <sys/ib/ibtl/ibvti.h> +#include <sys/ib/ib_pkt_hdrs.h> + +#include <sys/ib/clients/eoib/fip.h> +#include <sys/ib/clients/eoib/eib.h> + +/* + * Driver specific constants + */ +#define EIB_E_SUCCESS 0 +#define EIB_E_FAILURE -1 +#define EIB_MAX_LINE 128 +#define EIB_MAX_SGL 59 +#define EIB_MAX_POST_MULTIPLE 4 +#define EIB_MAX_PAYLOAD_HDR_SZ 160 +#define EIB_TX_COPY_THRESH 4096 /* greater than mtu */ +#define EIB_MAX_VNICS 64 /* do not change this */ +#define EIB_LOGIN_TIMEOUT_USEC 8000000 +#define EIB_RWR_CHUNK_SZ 8 +#define EIB_IPHDR_ALIGN_ROOM 32 +#define EIB_IP_HDR_ALIGN 2 +#define EIB_MAX_RX_PKTS_ONINTR 0x800 +#define EIB_MAX_LOGIN_ATTEMPTS 3 +#define EIB_MAX_VHUB_TBL_ATTEMPTS 3 +#define EIB_MAX_KA_ATTEMPTS 3 +#define EIB_MAX_ATTEMPTS 10 +#define EIB_DELAY_HALF_SECOND 500000 +#define EIB_GRH_SZ (sizeof (ib_grh_t)) + +/* + * Debug messages + */ +#define EIB_MSGS_CRIT 0x01 +#define EIB_MSGS_ERR 0x02 +#define EIB_MSGS_WARN 0x04 +#define EIB_MSGS_DEBUG 0x08 +#define EIB_MSGS_ARGS 0x10 +#define EIB_MSGS_PKT 0x20 +#define EIB_MSGS_VERBOSE 0x40 +#define EIB_MSGS_DEFAULT (EIB_MSGS_CRIT | EIB_MSGS_ERR | EIB_MSGS_WARN) + +#define EIB_LOGSZ_DEFAULT 0x20000 + +#define EIB_DPRINTF_CRIT eib_dprintf_crit +#define EIB_DPRINTF_ERR eib_dprintf_err +#define EIB_DPRINTF_WARN eib_dprintf_warn +#ifdef EIB_DEBUG +#define EIB_DPRINTF_DEBUG eib_dprintf_debug +#define EIB_DPRINTF_ARGS eib_dprintf_args +#define EIB_DPRINTF_PKT eib_dprintf_pkt +#define EIB_DPRINTF_VERBOSE eib_dprintf_verbose +#else +#define EIB_DPRINTF_DEBUG 0 && +#define EIB_DPRINTF_ARGS 0 && +#define EIB_DPRINTF_PKT 0 && +#define EIB_DPRINTF_VERBOSE 0 && +#endif + +/* + * EoIB threads to provide various services + */ +#define EIB_EVENTS_HDLR 
"eib_events_handler" +#define EIB_RWQES_REFILLER "eib_rwqes_refiller" +#define EIB_VNIC_CREATOR "eib_vnic_creator" +#define EIB_TXWQES_MONITOR "eib_txwqe_monitor" +#define EIB_LSOBUFS_MONITOR "eib_lsobufs_monitor" + +/* + * Macro for finding the least significant bit set in a 64-bit unsigned int + */ +#define EIB_FIND_LSB_SET(val64) eib_setbit_mod67[((-(val64) & (val64)) % 67)] + +/* + * LSO buffers + * + * Under normal circumstances we should never need to use any buffer + * that's larger than MTU. Unfortunately, IB HCA has limitations + * on the length of SGL that are much smaller than those for regular + * ethernet NICs. Since the network layer doesn't care to limit the + * number of mblk fragments in any send mp chain, we end up having to + * use these larger buffers occasionally. + */ +#define EIB_LSO_MAXLEN 65536 +#define EIB_LSO_BUFSZ 8192 +#define EIB_LSO_NUM_BUFS 1024 +#define EIB_LSO_FREE_BUFS_THRESH (EIB_LSO_NUM_BUFS >> 5) + +typedef struct eib_lsobuf_s { + struct eib_lsobuf_s *lb_next; + uint8_t *lb_buf; + int lb_isfree; +} eib_lsobuf_t; + +typedef struct eib_lsobkt_s { + kmutex_t bk_lock; + kcondvar_t bk_cv; + uint_t bk_status; + uint8_t *bk_mem; + eib_lsobuf_t *bk_bufl; + eib_lsobuf_t *bk_free_head; + ibt_mr_hdl_t bk_mr_hdl; + ibt_lkey_t bk_lkey; + uint_t bk_nelem; + uint_t bk_nfree; +} eib_lsobkt_t; + +#define EIB_LBUF_SHORT 0x1 +#define EIB_LBUF_MONITOR_DIE 0x2 + +/* + * The admin partition is only used for sending login and logout messages + * and receiving login acknowledgements from the gateway. While packets + * going out on several vlans at the same time could result in multiple + * vnic creations happening at the same time (and therefore multiple login + * packets), we serialize the vnic creation via the vnic creator thread, so + * we shouldn't need a lot of send wqes or receive wqes. 
Note also that we + * keep the cq size request to slightly less than a 2^n boundary to allow + * the alloc cq routine to return the closest 2^n boundary as the real cq + * size without wasting too much memory. + */ +#define EIB_ADMIN_MAX_SWQE 30 +#define EIB_ADMIN_MAX_RWQE 30 +#define EIB_ADMIN_CQ_SIZE (EIB_ADMIN_MAX_SWQE + EIB_ADMIN_MAX_RWQE + 1) + +/* + * The control qp is per vhub partition, and is used to send and receive + * vhub control messages such as vhub table request/response, vhub + * update response and vnic alive messages. While the vhub table response + * and vhub update messages might take a few rwqes, the vhub table request + * is made only once per vnic, and the vnic alive message is periodic + * and uses a single swqe as well. Per vnic, we should certainly not need + * too many swqes/rwqes. + */ +#define EIB_CTL_MAX_SWQE 30 +#define EIB_CTL_MAX_RWQE 30 +#define EIB_CTL_CQ_SIZE (EIB_CTL_MAX_SWQE + EIB_CTL_MAX_RWQE + 1) + +/* + * For the vNIC's data channel, there are three items that are of importance: + * the constraints defined below, the hca_max_chan_sz attribute and the value of + * (hca_max_cq_sz - 1). The maximum limit on swqe/rwqe is set to the minimum + * of these three values. + * + * While the total number of RWQEs posted to the data channel of any vNIC will + * not exceed EIB_DATA_MAX_RWQE, we also do not want to acquire and post all of + * it during the data channel initialization, since that is a lot of wqes for + * one vnic to consume when we don't even know if the vnic will need it at all. + * We post an initial set of EIB_DATA_RWQE_BKT rwqes, and slowly post more and + * more sets as we see them being consumed, until we hit the hard limit of + * EIB_DATA_MAX_RWQE. 
+ */ +#define EIB_DATA_MAX_SWQE 4000 +#define EIB_DATA_MAX_RWQE 4000 +#define EIB_DATA_RWQE_BKT 512 + +/* + * vNIC data channel CQ moderation parameters + */ +#define EIB_TX_COMP_COUNT 10 +#define EIB_TX_COMP_USEC 300 +#define EIB_RX_COMP_COUNT 4 +#define EIB_RX_COMP_USEC 10 + +/* + * qe_info masks (blk:ndx:type:flags) + */ +#define EIB_WQEBLK_SHIFT 24 +#define EIB_WQEBLK_MASK 0xFF +#define EIB_WQENDX_SHIFT 16 +#define EIB_WQENDX_MASK 0xFF +#define EIB_WQETYP_SHIFT 8 +#define EIB_WQETYP_MASK 0xFF +#define EIB_WQEFLGS_SHIFT 0 +#define EIB_WQEFLGS_MASK 0xFF + +/* + * Macros to get the bit fields from qe_info + */ +#define EIB_WQE_BLK(info) (((info) >> EIB_WQEBLK_SHIFT) & EIB_WQEBLK_MASK) +#define EIB_WQE_NDX(info) (((info) >> EIB_WQENDX_SHIFT) & EIB_WQENDX_MASK) +#define EIB_WQE_TYPE(info) (((info) >> EIB_WQETYP_SHIFT) & EIB_WQETYP_MASK) +#define EIB_WQE_FLAGS(info) ((info) & EIB_WQEFLGS_MASK) + +/* + * Values for type and flags in qe_info + */ +#define EIB_WQE_TX 0x1 +#define EIB_WQE_RX 0x2 + +/* + * Flags for rx wqes/buffers + */ +#define EIB_WQE_FLG_POSTED_TO_HCA 0x1 +#define EIB_WQE_FLG_WITH_NW 0x2 + +/* + * Flags for tx wqes/buffers + */ +#define EIB_WQE_FLG_BUFTYPE_LSO 0x4 +#define EIB_WQE_FLG_BUFTYPE_MAPPED 0x8 + +/* + * Send/Recv workq entries + */ +typedef struct eib_wqe_s { + struct eib_wqe_pool_s *qe_pool; + uint8_t *qe_cpbuf; + uint8_t *qe_payload_hdr; + uint_t qe_bufsz; + uint_t qe_info; + int qe_vnic_inst; + ibt_ud_dest_hdl_t qe_dest; + frtn_t qe_frp; + + mblk_t *qe_mp; + ibt_mi_hdl_t qe_iov_hdl; + ibt_all_wr_t qe_wr; + ibt_wr_ds_t qe_sgl; + ibt_wr_ds_t qe_big_sgl[EIB_MAX_SGL]; + struct eib_wqe_s *qe_nxt_post; + struct eib_chan_s *qe_chan; +} eib_wqe_t; + +/* + * The wqe in-use/free status in EoIB is managed via a 2-level bitmap + * logic. + * + * Each set of 64 wqes (a "wqe block") is managed by a single 64-bit + * integer bitmap. 
The free status of a set of 64 such wqe blocks (a + * "wqe pool") is managed by one 64-bit integer bitmap (if any wqe in + * the wqe block is free, the bit in the map is 1, otherwise it is 0). + * + * The maximum pool size is 4096 wqes, but this can easily be extended + * to support more wqes using additional pools of wqes. + * + * Note that an entire pool of wqes is allocated via a single allocation, + * the wqe addresses in a pool are all contiguous. The tx/rx copy buffers + * for a wqe pool are also allocated via a single allocation. + */ +#define EIB_BLKS_PER_POOL 64 +#define EIB_WQES_PER_BLK 64 /* do not change this */ +#define EIB_WQES_PER_POOL (EIB_BLKS_PER_POOL * EIB_WQES_PER_BLK) + +#define EIB_WQE_SZ (sizeof (eib_wqe_t)) +#define EIB_WQEBLK_SZ (EIB_WQES_PER_BLK * EIB_WQE_SZ) + +typedef struct eib_wqe_pool_s { + struct eib_wqe_pool_s *wp_next; + struct eib_s *wp_ss; + ib_vaddr_t wp_vaddr; + ib_memlen_t wp_memsz; + ibt_mr_hdl_t wp_mr; + ibt_lkey_t wp_lkey; + uint_t wp_nfree_lwm; + int wp_type; + + kmutex_t wp_lock; + kcondvar_t wp_cv; + uint_t wp_status; + uint_t wp_nfree; + uint64_t wp_free_blks; + uint64_t wp_free_wqes[EIB_BLKS_PER_POOL]; + struct eib_wqe_s *wp_wqe; +} eib_wqe_pool_t; + +/* + * Values for wp_type + */ +#define EIB_WP_TYPE_TX 0x1 +#define EIB_WP_TYPE_RX 0x2 + +/* + * Values for wp_status (bit fields) + */ +#define EIB_TXWQE_SHORT 0x1 /* only for tx wqe pool */ +#define EIB_TXWQE_MONITOR_DIE 0x2 /* only for tx wqe pool */ + +#define EIB_RXWQE_SHORT 0x1 /* only for rx wqe pool */ + +/* + * The low-water-mark is an indication of when wqe grabs for low-priority + * qps should start to get refused (swqe grabs for control messages such + * as keepalives and rwqe grabs for posting back to control qps will still + * be allowed). The high-water-mark is an indication of when normal + * behavior should resume. 
+ */ +#define EIB_NFREE_SWQES_LWM (EIB_WQES_PER_POOL / 64) /* 1/64 */ +#define EIB_NFREE_SWQES_HWM (EIB_WQES_PER_POOL / 32) /* 1/32 */ +#define EIB_NFREE_RWQES_LWM (EIB_WQES_PER_POOL / 10) /* 10% */ +#define EIB_NFREE_RWQES_HWM (EIB_WQES_PER_POOL / 5) /* 20% */ + +/* + * The "rwqes low" is used to determine when we should start using allocb() + * to copy and send received mblks in the rx path. It should be a little + * above the rwqes low-water-mark, but less than the high-water-mark. + */ +#define EIB_NFREE_RWQES_LOW \ + ((EIB_NFREE_RWQES_LWM + EIB_NFREE_RWQES_HWM) / 2) + +#define EIB_WPRI_HI 1 /* for keepalive posts */ +#define EIB_WPRI_LO 2 /* for all other posts */ + +/* + * Multicast GID Layout: the multicast gid is specified in big-endian + * representation, as a collection of different-sized fields in the + * EoIB specification. On Solaris, the multicast gid is represented + * as a collection of two 8-byte fields (in ib_gid_t). + */ +typedef struct eib_mgid_spec_s { + uint8_t sp_mgid_prefix[FIP_MGID_PREFIX_LEN]; + uint8_t sp_type; + uint8_t sp_dmac[ETHERADDRL]; + uint8_t sp_rss_hash; + uint8_t sp_vhub_id[FIP_VHUBID_LEN]; +} eib_mgid_spec_t; + +/* + * Values for sp_type in mgid as per EoIB specification + */ +#define EIB_MGID_VHUB_DATA 0x0 +#define EIB_MGID_VHUB_UPDATE 0x2 +#define EIB_MGID_VHUB_TABLE 0x3 + +typedef union eib_mgid_s { + eib_mgid_spec_t gd_spec; + ib_gid_t gd_sol; +} eib_mgid_t; + +/* + * Gateway properties handed over to us by the EoIB nexus + */ +typedef struct eib_gw_props_s { + kmutex_t pp_gw_lock; + + ib_guid_t pp_gw_system_guid; + ib_guid_t pp_gw_guid; + ib_sn_prefix_t pp_gw_sn_prefix; + + uint_t pp_gw_adv_period; + uint_t pp_gw_ka_period; + uint_t pp_vnic_ka_period; + + ib_qpn_t pp_gw_ctrl_qpn; + ib_lid_t pp_gw_lid; + uint16_t pp_gw_portid; + + uint16_t pp_gw_num_net_vnics; + uint8_t pp_gw_flag_available; + uint8_t pp_gw_is_host_adm_vnics; + uint8_t pp_gw_sl; + uint8_t pp_gw_n_rss_qpn; + + uint8_t *pp_gw_system_name; + uint8_t 
*pp_gw_port_name; + uint8_t *pp_gw_vendor_id; + + clock_t pp_gw_ka_ticks; /* 2.5 x gw_ka_period */ + clock_t pp_vnic_ka_ticks; /* vnic_ka_period */ +} eib_gw_props_t; + +/* + * Port-specific properties + */ +typedef struct eib_props_s { + uint64_t ep_ifspeed; + ib_guid_t ep_hca_guid; + uint8_t ep_port_num; + ib_gid_t ep_sgid; + ib_lid_t ep_blid; + uint16_t ep_mtu; + ibt_srate_t ep_srate; +} eib_props_t; + +/* + * Capabilities derived from HCA attributes + */ +typedef struct eib_caps_s { + uint_t cp_lso_maxlen; + uint32_t cp_cksum_flags; + int cp_resv_lkey_capab; + ibt_lkey_t cp_resv_lkey; + + uint_t cp_max_swqe; + uint_t cp_max_rwqe; + uint_t cp_max_sgl; + uint_t cp_hiwm_sgl; +} eib_caps_t; + +/* + * List of multicast groups the vnic joined + */ +typedef struct eib_mcg_s { + struct eib_mcg_s *mg_next; + ib_gid_t mg_rgid; + ib_gid_t mg_mgid; + uint8_t mg_join_state; + uint8_t mg_mac[ETHERADDRL]; + ibt_mcg_info_t *mg_mcginfo; +} eib_mcg_t; + +/* + * Admin/control/data channel information + */ +typedef struct eib_chan_s { + ibt_channel_hdl_t ch_chan; + ib_qpn_t ch_qpn; + + ibt_wc_t *ch_wc; + ibt_cq_hdl_t ch_cq_hdl; + uint_t ch_cq_sz; + + ibt_wc_t *ch_rcv_wc; + ibt_cq_hdl_t ch_rcv_cq_hdl; + uint_t ch_rcv_cq_sz; + + int ch_vnic_inst; + uint_t ch_max_swqes; + uint_t ch_max_rwqes; + uint_t ch_lwm_rwqes; + uint_t ch_rwqe_bktsz; + uint_t ch_ip_hdr_align; + boolean_t ch_alloc_mp; + boolean_t ch_tear_down; + + kmutex_t ch_pkey_lock; + ib_pkey_t ch_pkey; + uint16_t ch_pkey_ix; + + kmutex_t ch_cep_lock; + kcondvar_t ch_cep_cv; + ibt_cep_state_t ch_cep_state; + + kmutex_t ch_tx_lock; + kcondvar_t ch_tx_cv; + uint_t ch_tx_posted; + boolean_t ch_tx_busy; + struct eib_wqe_s *ch_tx; + struct eib_wqe_s *ch_tx_tail; + + kmutex_t ch_rx_lock; + kcondvar_t ch_rx_cv; + uint_t ch_rx_posted; + boolean_t ch_rx_refilling; + + kmutex_t ch_vhub_lock; + struct eib_mcg_s *ch_vhub_table; + struct eib_mcg_s *ch_vhub_update; + struct eib_mcg_s *ch_vhub_data; + + struct eib_chan_s *ch_rxpost_next; +} 
eib_chan_t; + +/* + * States for vNIC state machine during login + */ +#define EIB_LOGIN_INIT 0 +#define EIB_LOGIN_ACK_WAIT 1 +#define EIB_LOGIN_ACK_RCVD 2 +#define EIB_LOGIN_NACK_RCVD 3 +#define EIB_LOGIN_TBL_WAIT 4 +#define EIB_LOGIN_TBL_INPROG 5 +#define EIB_LOGIN_TBL_DONE 6 +#define EIB_LOGIN_TBL_FAILED 7 +#define EIB_LOGIN_DONE 8 +#define EIB_LOGIN_TIMED_OUT 9 +#define EIB_LOGOUT_DONE 10 + +typedef struct eib_login_data_s { + ib_guid_t ld_gw_guid; + ib_lid_t ld_gw_lid; + uint_t ld_syndrome; + uint16_t ld_gw_port_id; + ib_qpn_t ld_gw_data_qpn; + ib_qpn_t ld_gw_ctl_qpn; + uint16_t ld_vnic_id; /* includes set msbit */ + uint16_t ld_vhub_mtu; + uint16_t ld_vhub_pkey; + uint16_t ld_assigned_vlan; + uint8_t ld_gw_sl; + uint8_t ld_n_rss_mcgid; + uint8_t ld_n_mac_mcgid; + uint8_t ld_vnic_name[FIP_VNIC_NAME_LEN]; + uint8_t ld_assigned_mac[ETHERADDRL]; + uint8_t ld_gw_mgid_prefix[FIP_MGID_PREFIX_LEN]; + uint8_t ld_vlan_in_packets; + uint32_t ld_vhub_id; +} eib_login_data_t; + +#define EIB_UNICAST_MAC(mac) (((mac)[0] & 0x01) == 0) + +/* + * Map to translate between DMAC and {qpn, lid, sl} + */ +typedef struct eib_vhub_map_s { + struct eib_vhub_map_s *mp_next; + uint32_t mp_tusn; + ib_qpn_t mp_qpn; + ib_lid_t mp_lid; + uint8_t mp_mac[ETHERADDRL]; + uint8_t mp_sl; + uint8_t mp_v_rss_type; +} eib_vhub_map_t; + +/* + * Per-vNIC vHUB Table + */ +#define EIB_TB_NBUCKETS 13 +typedef struct eib_vhub_table_s { + kmutex_t tb_lock; + struct eib_vhub_map_s *tb_gateway; + struct eib_vhub_map_s *tb_unicast_miss; + struct eib_vhub_map_s *tb_vhub_multicast; + struct eib_vhub_map_s *tb_vnic_entry[EIB_TB_NBUCKETS]; + struct eib_vhub_map_s *tb_mcast_entry[EIB_TB_NBUCKETS]; + + uint32_t tb_tusn; + uint8_t tb_eport_state; + + uint16_t tb_entries_seen; + uint16_t tb_entries_in_table; + uint32_t tb_checksum; +} eib_vhub_table_t; + +typedef struct eib_vhub_update_s { + kmutex_t up_lock; + eib_vhub_map_t *up_vnic_entry; + uint32_t up_tusn; + uint8_t up_eport_state; +} eib_vhub_update_t; + 
+typedef struct eib_ether_hdr_s { + int eh_tagless; + uint16_t eh_ether_type; + uint16_t eh_vlan; + uint8_t eh_dmac[ETHERADDRL]; + uint8_t eh_smac[ETHERADDRL]; +} eib_ether_hdr_t; + +/* + * vNIC Information + */ +typedef struct eib_vnic_s { + struct eib_s *vn_ss; + eib_chan_t *vn_ctl_chan; + eib_chan_t *vn_data_chan; + int vn_instance; + uint16_t vn_vlan; + uint16_t vn_id; + uint8_t vn_macaddr[ETHERADDRL]; + struct eib_login_data_s vn_login_data; + + kmutex_t vn_lock; + kcondvar_t vn_cv; + uint_t vn_state; + struct eib_vhub_table_s *vn_vhub_table; + struct eib_vhub_update_s *vn_vhub_update; + + ddi_softint_handle_t vn_ctl_si_hdl; + ddi_softint_handle_t vn_data_tx_si_hdl; + ddi_softint_handle_t vn_data_rx_si_hdl; +} eib_vnic_t; + + +/* + * Base NIC's mac state flags. The lock protects the starting/stopping + * bits. Access to the rest of the mac state is protected by these + * two bits. + */ +#define EIB_NIC_STARTING 0x01 +#define EIB_NIC_STOPPING 0x02 +#define EIB_NIC_STARTED 0x80 +#define EIB_NIC_RESTARTING (EIB_NIC_STARTING | EIB_NIC_STOPPING) + +typedef struct eib_node_state_s { + kmutex_t ns_lock; + kcondvar_t ns_cv; + uint_t ns_nic_state; + link_state_t ns_link_state; +} eib_node_state_t; + +/* + * MIB-II statistics to report to the mac layer + */ +typedef struct eib_stats_s { + uint64_t st_obytes; /* bytes sent out */ + uint64_t st_opkts; /* pkts sent out */ + uint64_t st_brdcstxmit; /* broadcast pkts transmitted */ + uint64_t st_multixmit; /* multicast pkts transmitted */ + uint64_t st_oerrors; /* transmit errors */ + uint64_t st_noxmitbuf; /* transmit pkts discarded */ + + uint64_t st_rbytes; /* bytes received */ + uint64_t st_ipkts; /* pkts received */ + uint64_t st_brdcstrcv; /* broadcast pkts received */ + uint64_t st_multircv; /* multicast pkts received */ + uint64_t st_ierrors; /* receive errors */ + uint64_t st_norcvbuf; /* receive pkts discarded */ +} eib_stats_t; + +#define EIB_UPDATE_COUNTER(addr, val) (atomic_add_64((addr), (val))) +#define 
EIB_INCR_COUNTER(addr) (atomic_inc_64((addr))) +#define EIB_DECR_COUNTER(addr) (atomic_dec_64((addr))) + +/* + * Cache of address vectors with dlid as the key. Currently we use + * eib state structure's ei_lock to protect the individual address + * vector's fields. This is a lock granularity that's slightly + * bigger than ideal, but it should do for now. + */ +#define EIB_AV_NBUCKETS 17 +typedef struct eib_avect_s { + struct eib_avect_s *av_next; + ibt_adds_vect_t av_vect; + uint_t av_ref; +} eib_avect_t; + +/* + * vNIC creation and deletion are serialized by a non-zero value + * to the ei_vnic_state member (i.e. only one vnic may be created + * or deleted at a time). The code makes sure to access/update + * the ei_active_vnics member only after a successful setting of + * ei_vnic_state. + */ +#define EIB_VN_BEING_CREATED 0x01 +#define EIB_VN_BEING_DELETED 0x02 +#define EIB_VN_BEING_MODIFIED (EIB_VN_BEING_CREATED | EIB_VN_BEING_DELETED) + +/* + * All possible EoIB event work items that need to be handled + */ +#define EIB_EV_NONE 0 +#define EIB_EV_PORT_DOWN 1 +#define EIB_EV_PORT_UP 2 +#define EIB_EV_PKEY_CHANGE 3 +#define EIB_EV_SGID_CHANGE 4 +#define EIB_EV_CLNT_REREG 5 +#define EIB_EV_GW_EPORT_DOWN 6 +#define EIB_EV_GW_DOWN 7 +#define EIB_EV_GW_UP 8 +#define EIB_EV_GW_INFO_UPDATE 9 +#define EIB_EV_MCG_DELETED 10 +#define EIB_EV_MCG_CREATED 11 +#define EIB_EV_SHUTDOWN 12 + +typedef struct eib_event_s { + struct eib_event_s *ev_next; + uint_t ev_code; + void *ev_arg; +} eib_event_t; + +/* + * Work element for new vnic creation + */ +typedef struct eib_vnic_req_s { + struct eib_vnic_req_s *vr_next; + uint_t vr_req; + uint8_t vr_mac[ETHERADDRL]; + uint16_t vr_vlan; +} eib_vnic_req_t; + +/* + * Values for vr_req + */ +#define EIB_CR_REQ_NEW_VNIC 1 +#define EIB_CR_REQ_FLUSH 2 +#define EIB_CR_REQ_DIE 3 + +/* + * Work element for vnics kept alive by the keepalive manager thread + * and bitfield values for ei_ka_vnics_event. 
+ */ +typedef struct eib_ka_vnics_s { + struct eib_ka_vnics_s *ka_next; + struct eib_vnic_s *ka_vnic; +} eib_ka_vnics_t; + +#define EIB_KA_VNICS_DIE 0x1 +#define EIB_KA_VNICS_TIMED_OUT 0x2 + +/* + * EoIB per-instance state + */ +typedef struct eib_s { + ibt_clnt_hdl_t ei_ibt_hdl; + ibt_hca_hdl_t ei_hca_hdl; + ibt_pd_hdl_t ei_pd_hdl; + mac_handle_t ei_mac_hdl; + + ddi_softint_handle_t ei_admin_si_hdl; + ddi_callback_id_t ei_login_ack_cb; + ddi_callback_id_t ei_gw_alive_cb; + ddi_callback_id_t ei_gw_info_cb; + + ibt_hca_attr_t *ei_hca_attrs; + dev_info_t *ei_dip; + uint_t ei_instance; + + struct eib_gw_props_s *ei_gw_props; + struct eib_props_s *ei_props; + struct eib_caps_s *ei_caps; + struct eib_stats_s *ei_stats; + + struct eib_node_state_s *ei_node_state; + struct eib_chan_s *ei_admin_chan; + + struct eib_wqe_pool_s *ei_tx; + struct eib_wqe_pool_s *ei_rx; + struct eib_lsobkt_s *ei_lso; + + kmutex_t ei_vnic_lock; + kcondvar_t ei_vnic_cv; + uint_t ei_vnic_state; + uint64_t ei_active_vnics; + uint64_t ei_zombie_vnics; + uint64_t ei_rejoin_vnics; + struct eib_vnic_s *ei_vnic[EIB_MAX_VNICS]; + struct eib_vnic_s *ei_vnic_pending; + int64_t ei_gw_last_heartbeat; + boolean_t ei_gw_unreachable; + uint8_t ei_gw_eport_state; + + kmutex_t ei_av_lock; + struct eib_avect_s *ei_av[EIB_AV_NBUCKETS]; + + kmutex_t ei_ev_lock; + kcondvar_t ei_ev_cv; + struct eib_event_s *ei_event; + + kmutex_t ei_rxpost_lock; + kcondvar_t ei_rxpost_cv; + uint_t ei_rxpost_die; + struct eib_chan_s *ei_rxpost; + + kmutex_t ei_vnic_req_lock; + kcondvar_t ei_vnic_req_cv; + struct eib_vnic_req_s *ei_vnic_req; + struct eib_vnic_req_s *ei_failed_vnic_req; + struct eib_vnic_req_s *ei_pending_vnic_req; + + kmutex_t ei_ka_vnics_lock; + kcondvar_t ei_ka_vnics_cv; + uint_t ei_ka_vnics_event; + struct eib_ka_vnics_s *ei_ka_vnics; + + kt_did_t ei_txwqe_monitor; + kt_did_t ei_lsobufs_monitor; + kt_did_t ei_rwqes_refiller; + kt_did_t ei_vnic_creator; + kt_did_t ei_events_handler; + kt_did_t ei_keepalives_manager; 
+} eib_t; + +/* + * Private read-only datalink properties + */ +#define EIB_DLPROP_GW_EPORT_STATE "_eib_eport_state" +#define EIB_DLPROP_HCA_GUID "_eib_hca_guid" +#define EIB_DLPROP_PORT_GUID "_eib_port_guid" + +/* + * FUNCTION PROTOTYPES FOR CROSS-FILE LINKAGE + */ + +/* + * FIP protocol related + */ +extern int eib_fip_login(eib_t *, eib_vnic_t *, int *); +extern int eib_fip_heartbeat(eib_t *, eib_vnic_t *, int *); +extern int eib_fip_vhub_table(eib_t *, eib_vnic_t *, int *); +extern int eib_fip_logout(eib_t *, eib_vnic_t *, int *); +extern int eib_fip_parse_login_ack(eib_t *, uint8_t *, eib_login_data_t *); +extern int eib_fip_parse_ctl_pkt(uint8_t *, eib_vnic_t *); + +/* + * Service threads and other handlers + */ +extern void eib_events_handler(eib_t *); +extern void eib_svc_enqueue_event(eib_t *, eib_event_t *); +extern void eib_refill_rwqes(eib_t *); +extern void eib_vnic_creator(eib_t *); +extern void eib_monitor_tx_wqes(eib_t *); +extern void eib_monitor_lso_bufs(eib_t *); +extern void eib_manage_keepalives(eib_t *); +extern void eib_stop_events_handler(eib_t *); +extern void eib_stop_refill_rwqes(eib_t *); +extern void eib_stop_vnic_creator(eib_t *); +extern void eib_stop_monitor_tx_wqes(eib_t *); +extern int eib_stop_monitor_lso_bufs(eib_t *, boolean_t); +extern void eib_stop_manage_keepalives(eib_t *); +extern void eib_flush_vnic_reqs(eib_t *); +extern void eib_gw_info_cb(dev_info_t *, ddi_eventcookie_t, void *, void *); +extern void eib_gw_alive_cb(dev_info_t *, ddi_eventcookie_t, void *, void *); +extern void eib_login_ack_cb(dev_info_t *, ddi_eventcookie_t, void *, void *); + +/* + * Admin QP related + */ +extern int eib_adm_setup_qp(eib_t *, int *); +extern uint_t eib_adm_comp_handler(caddr_t, caddr_t); +extern void eib_rb_adm_setup_qp(eib_t *); + +/* + * Control QP related + */ +extern int eib_ctl_create_qp(eib_t *, eib_vnic_t *, int *); +extern uint_t eib_ctl_comp_handler(caddr_t, caddr_t); +extern void eib_rb_ctl_create_qp(eib_t *, eib_vnic_t *); 
+ +/* + * Data QP related + */ +extern int eib_data_create_qp(eib_t *, eib_vnic_t *, int *); +extern uint_t eib_data_rx_comp_handler(caddr_t, caddr_t); +extern uint_t eib_data_tx_comp_handler(caddr_t, caddr_t); +extern void eib_data_rx_recycle(caddr_t); +extern void eib_data_post_tx(eib_vnic_t *, eib_wqe_t *); +extern void eib_data_parse_ether_hdr(mblk_t *, eib_ether_hdr_t *); +extern int eib_data_lookup_vnic(eib_t *, uint8_t *, uint16_t, eib_vnic_t **, + boolean_t *); +extern int eib_data_prepare_frame(eib_vnic_t *, eib_wqe_t *, mblk_t *, + eib_ether_hdr_t *); +extern void eib_rb_data_create_qp(eib_t *, eib_vnic_t *); + +/* + * Resource related + */ +extern int eib_rsrc_setup_bufs(eib_t *, int *); +extern int eib_rsrc_grab_swqes(eib_t *, eib_wqe_t **, uint_t, uint_t *, int); +extern int eib_rsrc_grab_rwqes(eib_t *, eib_wqe_t **, uint_t, uint_t *, int); +extern int eib_rsrc_grab_lsobufs(eib_t *, uint_t, ibt_wr_ds_t *, uint32_t *); +extern eib_wqe_t *eib_rsrc_grab_swqe(eib_t *, int); +extern eib_wqe_t *eib_rsrc_grab_rwqe(eib_t *, int); +extern void eib_rsrc_return_swqe(eib_t *, eib_wqe_t *, eib_chan_t *); +extern void eib_rsrc_return_rwqe(eib_t *, eib_wqe_t *, eib_chan_t *); +extern void eib_rsrc_return_lsobufs(eib_t *, ibt_wr_ds_t *, uint32_t); +extern void eib_rsrc_decr_posted_swqe(eib_t *, eib_chan_t *); +extern void eib_rsrc_decr_posted_rwqe(eib_t *, eib_chan_t *); +extern void eib_rsrc_txwqes_needed(eib_t *); +extern void eib_rsrc_lsobufs_needed(eib_t *); +extern boolean_t eib_rsrc_rxpool_low(eib_wqe_t *); +extern void eib_rb_rsrc_setup_bufs(eib_t *, boolean_t); + +/* + * IBT related + */ +extern int eib_ibt_hca_init(eib_t *); +extern void eib_ibt_link_mod(eib_t *); +extern int eib_ibt_modify_chan_pkey(eib_t *, eib_chan_t *, ib_pkey_t); +extern eib_avect_t *eib_ibt_hold_avect(eib_t *, ib_lid_t, uint8_t); +extern void eib_ibt_release_avect(eib_t *, eib_avect_t *); +extern void eib_ibt_free_avects(eib_t *); +extern void eib_ibt_async_handler(void *, 
ibt_hca_hdl_t, ibt_async_code_t, + ibt_async_event_t *); +extern void eib_ibt_record_capab(eib_t *, ibt_hca_attr_t *, eib_caps_t *); +extern void eib_rb_ibt_hca_init(eib_t *, uint_t); + +/* + * Chan related + */ +extern eib_chan_t *eib_chan_init(void); +extern void eib_chan_fini(eib_chan_t *); +extern int eib_chan_post_rx(eib_t *, eib_chan_t *, uint_t *); +extern int eib_chan_post_recv(eib_t *, eib_chan_t *, eib_wqe_t *); + +/* + * Mac layer related + */ +extern void eib_mac_set_nic_state(eib_t *, uint_t); +extern void eib_mac_clr_nic_state(eib_t *, uint_t); +extern void eib_mac_upd_nic_state(eib_t *, uint_t, uint_t); +extern uint_t eib_mac_get_nic_state(eib_t *); +extern void eib_mac_link_state(eib_t *, link_state_t, boolean_t); +extern void eib_mac_link_down(eib_t *, boolean_t); +extern void eib_mac_link_up(eib_t *, boolean_t); +extern int eib_mac_start(eib_t *); +extern void eib_mac_stop(eib_t *); +extern int eib_mac_multicast(eib_t *, boolean_t, uint8_t *); +extern int eib_mac_promisc(eib_t *, boolean_t); +extern int eib_mac_tx(eib_t *, mblk_t *); +extern int eib_mac_hca_portstate(eib_t *, ib_lid_t *, int *); + +/* + * VNIC related + */ +extern int eib_vnic_create(eib_t *, uint8_t *, uint16_t, eib_vnic_t **, int *); +extern void eib_vnic_delete(eib_t *, eib_vnic_t *); +extern int eib_vnic_wait_for_login_ack(eib_t *, eib_vnic_t *, int *); +extern void eib_vnic_login_ack(eib_t *, eib_login_data_t *); +extern int eib_vnic_wait_for_table(eib_t *, eib_vnic_t *, int *); +extern void eib_vnic_vhub_table_done(eib_vnic_t *, uint_t); +extern int eib_vnic_join_data_mcg(eib_t *, eib_vnic_t *, uint8_t *, + boolean_t, int *); +extern int eib_vnic_setup_dest(eib_vnic_t *, eib_wqe_t *, uint8_t *, uint16_t); +extern void eib_vnic_leave_data_mcg(eib_t *, eib_vnic_t *, uint8_t *); +extern void eib_vnic_init_tables(eib_t *, eib_vnic_t *); +extern void eib_vnic_fini_tables(eib_t *, eib_vnic_t *, boolean_t); +extern eib_chan_t *eib_vnic_get_data_chan(eib_t *, int); +extern void 
eib_vnic_need_new(eib_t *, uint8_t *, uint16_t); +extern void eib_vnic_enqueue_req(eib_t *, eib_vnic_req_t *); +extern void eib_vnic_resurrect_zombies(eib_t *, uint8_t *); +extern void eib_vnic_restart(eib_t *, int, uint8_t *); +extern void eib_vnic_rejoin_mcgs(eib_t *); +extern void eib_rb_vnic_create(eib_t *, eib_vnic_t *, uint_t); + +/* + * Logging and other stuff + */ +extern void eib_debug_init(void); +extern void eib_debug_fini(void); +extern void eib_dprintf_crit(int, const char *fmt, ...); +extern void eib_dprintf_err(int, const char *fmt, ...); +extern void eib_dprintf_warn(int, const char *fmt, ...); +#ifdef EIB_DEBUG +extern void eib_dprintf_debug(int, const char *fmt, ...); +extern void eib_dprintf_args(int, const char *fmt, ...); +extern void eib_dprintf_pkt(int, uint8_t *, uint_t); +extern void eib_dprintf_verbose(int, const char *fmt, ...); +#endif +extern int eib_get_props(eib_t *); +extern void eib_update_props(eib_t *, eib_gw_info_t *); +extern void eib_rb_get_props(eib_t *); + +/* + * EoIB specific global variables + */ +extern ib_gid_t eib_reserved_gid; +extern uint8_t eib_zero_mac[]; +extern uint8_t eib_broadcast_mac[]; +extern int eib_setbit_mod67[]; +extern char *eib_pvt_props[]; + +/* + * HW/FW workarounds + */ +extern int eib_wa_no_desc_list_len; +extern int eib_wa_no_cksum_offload; +extern int eib_wa_no_lso; +extern int eib_wa_no_mcast_entries; +extern int eib_wa_no_av_discover; +extern int eib_wa_no_good_vp_flag; +extern int eib_wa_no_good_vhub_cksum; + +/* + * Miscellaneous externs + */ +extern void freemsgchain(mblk_t *); +extern pri_t minclsyspri; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_IB_EOIB_EIB_IMPL_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/ib/clients/eoib/enx_impl.h Fri Aug 13 07:02:57 2010 -0400 @@ -0,0 +1,532 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_IB_EOIB_ENX_IMPL_H +#define _SYS_IB_EOIB_ENX_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/varargs.h> +#include <sys/ib/ibtl/ibti.h> +#include <sys/ib/ibtl/ibvti.h> +#include <sys/ib/ib_pkt_hdrs.h> +#include <sys/ib/ibtl/impl/ibtl_ibnex.h> +#include <sys/ib/mgt/sm_attr.h> + +#include <sys/ib/clients/eoib/fip.h> +#include <sys/ib/clients/eoib/eib.h> + +/* + * Driver specific constants + */ +#define ENX_E_SUCCESS 0 +#define ENX_E_FAILURE -1 +#define ENX_MAX_LINE 128 +#define ENX_GRH_SZ (sizeof (ib_grh_t)) + +/* + * Debug messages + */ +#define ENX_MSGS_CRIT 0x01 +#define ENX_MSGS_ERR 0x02 +#define ENX_MSGS_WARN 0x04 +#define ENX_MSGS_DEBUG 0x08 +#define ENX_MSGS_ARGS 0x10 +#define ENX_MSGS_VERBOSE 0x20 +#define ENX_MSGS_DEFAULT (ENX_MSGS_CRIT | ENX_MSGS_ERR | ENX_MSGS_WARN) + +#define ENX_LOGSZ_DEFAULT 0x20000 + +#define ENX_DPRINTF_CRIT eibnx_dprintf_crit +#define ENX_DPRINTF_ERR eibnx_dprintf_err +#define ENX_DPRINTF_WARN eibnx_dprintf_warn +#ifdef ENX_DEBUG +#define ENX_DPRINTF_DEBUG eibnx_dprintf_debug +#define ENX_DPRINTF_ARGS eibnx_dprintf_args +#define ENX_DPRINTF_VERBOSE eibnx_dprintf_verbose +#else +#define ENX_DPRINTF_DEBUG 0 && +#define ENX_DPRINTF_ARGS 0 && +#define ENX_DPRINTF_VERBOSE 0 && +#endif + +/* + * EoIB Nexus service threads + */ +#define ENX_PORT_MONITOR "eibnx_port_%d_monitor" +#define ENX_NODE_CREATOR "eibnx_node_creator" + +/* + * Default period (us) for unicast solicitations to discovered gateways. + * EoIB specification requires that hosts send solicitation atleast every + * 4 * GW_ADV_PERIOD. 
+ */ +#define ENX_DFL_SOLICIT_PERIOD_USEC 32000000 + +/* + * Portinfo list per HCA + */ +typedef struct eibnx_port_s { + struct eibnx_port_s *po_next; + ibt_hca_portinfo_t *po_pi; + uint_t po_pi_size; +} eibnx_port_t; + +/* + * HCA details + */ +typedef struct eibnx_hca_s { + struct eibnx_hca_s *hc_next; + ib_guid_t hc_guid; + ibt_hca_hdl_t hc_hdl; + ibt_pd_hdl_t hc_pd; + eibnx_port_t *hc_port; +} eibnx_hca_t; + +/* + * The port_monitor thread in EoIB nexus driver only sends two types of + * packets: multicast solicitation the first time around, and periodic + * unicast solicitations later to gateways that have been discovered. So + * we need a couple of send wqes for the multicast solicitation and + * probably as many send wqes as the number of gateways that may be + * discovered from each port, for sending the unicast solicitations. + * For unicast solicitations though, the UD destination needs to be set + * up at the time we receive the advertisement from the gateway, using + * ibt_modify_reply_ud_dest(), so we'll assign one send wqe for each + * gateway that we discover. This means that we need to acquire these + * send wqe entries during rx processing in the completion handler, which + * means we must avoid sleeping in trying to acquire the swqe. Therefore, + * we'll pre-allocate these unicast solication send wqes to be atleast + * twice the number of recv wqes. + * + * The receive packets expected by the EoIB nexus driver are the multicast + * and unicast messages on the SOLICIT and ADVERTISE groups. These + * shouldn't be too many, and should be tuned as we gain experience on + * the traffic pattern. We'll start with 16. + */ +#define ENX_NUM_SWQE 46 +#define ENX_NUM_RWQE 16 +#define ENX_CQ_SIZE (ENX_NUM_SWQE + ENX_NUM_RWQE + 2) + +/* + * qe_type values + */ +#define ENX_QETYP_RWQE 0x1 +#define ENX_QETYP_SWQE 0x2 + +/* + * qe_flags bitmasks (protected by qe_lock). None of the + * flag values may be zero. 
+ */ +#define ENX_QEFL_INUSE 0x01 +#define ENX_QEFL_POSTED 0x02 +#define ENX_QEFL_RELONCOMP 0x04 + +/* + * Recv and send workq entries + */ +typedef struct eibnx_wqe_s { + uint_t qe_type; + uint_t qe_bufsz; + ibt_wr_ds_t qe_sgl; + ibt_all_wr_t qe_wr; + kmutex_t qe_lock; + uint_t qe_flags; +} eibnx_wqe_t; + +/* + * Tx descriptor + */ +typedef struct eibnx_tx_s { + ib_vaddr_t tx_vaddr; + ibt_mr_hdl_t tx_mr; + ibt_lkey_t tx_lkey; + eibnx_wqe_t tx_wqe[ENX_NUM_SWQE]; +} eibnx_tx_t; + +/* + * Rx descriptor + */ +typedef struct eibnx_rx_s { + ib_vaddr_t rx_vaddr; + ibt_mr_hdl_t rx_mr; + ibt_lkey_t rx_lkey; + eibnx_wqe_t rx_wqe[ENX_NUM_RWQE]; +} eibnx_rx_t; + +/* + * Details about the address of each gateway we discover. + */ +typedef struct eibnx_gw_addr_s { + ibt_adds_vect_t *ga_vect; + ib_gid_t ga_gid; + ib_qpn_t ga_qpn; + ib_qkey_t ga_qkey; + ib_pkey_t ga_pkey; +} eibnx_gw_addr_t; + +/* + * States for each GW + */ +#define ENX_GW_STATE_UNAVAILABLE 1 /* GW nackd availability */ +#define ENX_GW_STATE_AVAILABLE 2 /* GW mcasted availability */ +#define ENX_GW_STATE_READY_TO_LOGIN 3 /* GW ucasted availability */ + +typedef struct eibnx_gw_info_s { + struct eibnx_gw_info_s *gw_next; + eibnx_wqe_t *gw_swqe; + uint_t gw_state; + + kmutex_t gw_adv_lock; + uint_t gw_adv_flag; + int64_t gw_adv_last_lbolt; + int64_t gw_adv_timeout_ticks; + + eibnx_gw_addr_t gw_addr; + + ib_guid_t gw_system_guid; + ib_guid_t gw_guid; + + uint32_t gw_adv_period; + uint32_t gw_ka_period; + uint32_t gw_vnic_ka_period; + ib_qpn_t gw_ctrl_qpn; + + ib_lid_t gw_lid; + uint16_t gw_portid; + uint16_t gw_num_net_vnics; + + uint8_t gw_is_host_adm_vnics; + uint8_t gw_sl; + uint8_t gw_n_rss_qpn; + uint8_t gw_flag_ucast_advt; + uint8_t gw_flag_available; + + uint8_t gw_system_name[EIB_GW_SYSNAME_LEN]; + uint8_t gw_port_name[EIB_GW_PORTNAME_LEN]; + uint8_t gw_vendor_id[EIB_GW_VENDOR_LEN]; +} eibnx_gw_info_t; + +/* + * Values for gw_adv_flag (non-zero only) + */ +#define ENX_GW_DEAD 1 +#define ENX_GW_ALIVE 2 
+#define ENX_GW_AWARE 3 + +/* + * Currently, we only expect the advertisement type of packets + * from the gw. But we do get login acks from the gateway also + * here in the nexus, so we'll need an identifier for that. + */ +typedef enum { + FIP_GW_ADVERTISE_MCAST = 0, + FIP_GW_ADVERTISE_UCAST, + FIP_VNIC_LOGIN_ACK +} eibnx_gw_pkt_type_t; + +/* + * Currently, the only gw response handled by the eibnx driver + * are the ucast/mcast advertisements. Information collected from + * both these responses may be packed into a eibnx_gw_info_t. + * In the future, if we decide to handle other types of responses + * from the gw, we could simply add the new types to the union. + */ +typedef struct eibnx_gw_msg_s { + eibnx_gw_pkt_type_t gm_type; + union { + eibnx_gw_info_t gm_info; + } u; +} eibnx_gw_msg_t; + +/* + * List to hold the devinfo nodes of eoib instances + */ +typedef struct eibnx_child_s { + struct eibnx_child_s *ch_next; + dev_info_t *ch_dip; + eibnx_gw_info_t *ch_gwi; + char *ch_node_name; +} eibnx_child_t; + +/* + * Event bitmasks for the port-monitor to wait on. None of these flags + * may be zero. + */ +#define ENX_EVENT_LINK_UP 0x01 +#define ENX_EVENT_MCGS_AVAILABLE 0x02 +#define ENX_EVENT_TIMED_OUT 0x04 +#define ENX_EVENT_DIE 0x08 +#define ENX_EVENT_COMPLETION 0x10 + +/* + * MCG Query/Join status + */ +#define ENX_MCGS_FOUND 0x1 +#define ENX_MCGS_JOINED 0x2 + +/* + * Information that each port-monitor thread cares about + */ +typedef struct eibnx_thr_info_s { + struct eibnx_thr_info_s *ti_next; + uint_t ti_progress; + + /* + * Our kernel thread id + */ + kt_did_t ti_kt_did; + + /* + * HCA, port and protection domain information + */ + ib_guid_t ti_hca_guid; + ibt_hca_hdl_t ti_hca; + ibt_pd_hdl_t ti_pd; + ibt_hca_portinfo_t *ti_pi; + char *ti_ident; + + /* + * Well-known multicast groups for solicitations + * and advertisements. 
+ */ + kmutex_t ti_mcg_lock; + uint_t ti_mcg_status; + ibt_mcg_info_t *ti_advertise_mcg; + ibt_mcg_info_t *ti_solicit_mcg; + uint_t ti_mcast_done; + + /* + * Completion queue stuff + */ + ibt_cq_hdl_t ti_cq_hdl; + uint_t ti_cq_sz; + ibt_wc_t *ti_wc; + ddi_softint_handle_t ti_softint_hdl; + + /* + * Channel related + */ + ibt_channel_hdl_t ti_chan; + ib_qpn_t ti_qpn; + + /* + * Transmit/Receive stuff + */ + eibnx_tx_t ti_snd; + eibnx_rx_t ti_rcv; + + /* + * GW related stuff + */ + kmutex_t ti_gw_lock; + eibnx_gw_info_t *ti_gw; + + /* + * Devinfo nodes for the eoib children + */ + kmutex_t ti_child_lock; + eibnx_child_t *ti_child; + + /* + * Events that we wait on and/or handle + */ + kmutex_t ti_event_lock; + kcondvar_t ti_event_cv; + uint_t ti_event; +} eibnx_thr_info_t; + +/* + * Workq entry for creation of eoib nodes + */ +typedef struct eibnx_nodeq_s { + struct eibnx_nodeq_s *nc_next; + eibnx_thr_info_t *nc_info; + eibnx_gw_info_t *nc_gwi; +} eibnx_nodeq_t; + +/* + * Bus config status flags. The in-prog is protected by + * nx_lock, and the rest of the flags (currently only + * buscfg-complete) is protected by the in-prog bit itself. 
+ */ +#define NX_FL_BUSOP_INPROG 0x1 +#define NX_FL_BUSCFG_COMPLETE 0x2 +#define NX_FL_BUSOP_MASK 0x3 + +/* + * EoIB nexus per-instance state + */ +typedef struct eibnx_s { + dev_info_t *nx_dip; + ibt_clnt_hdl_t nx_ibt_hdl; + + kmutex_t nx_lock; + eibnx_hca_t *nx_hca; + eibnx_thr_info_t *nx_thr_info; + boolean_t nx_monitors_up; + + kmutex_t nx_nodeq_lock; + kcondvar_t nx_nodeq_cv; + eibnx_nodeq_t *nx_nodeq; + kt_did_t nx_nodeq_kt_did; + uint_t nx_nodeq_thr_die; + + kmutex_t nx_busop_lock; + kcondvar_t nx_busop_cv; + uint_t nx_busop_flags; +} eibnx_t; + + +/* + * Event tags for EoIB Nexus events delivered to EoIB instances + */ +#define ENX_EVENT_TAG_GW_INFO_UPDATE 0 +#define ENX_EVENT_TAG_GW_AVAILABLE 1 +#define ENX_EVENT_TAG_LOGIN_ACK 2 + +/* + * FUNCTION PROTOTYPES FOR CROSS-FILE LINKAGE + */ + +/* + * Threads and Event Handlers + */ +void eibnx_port_monitor(eibnx_thr_info_t *); +void eibnx_subnet_notices_handler(void *, ib_gid_t, ibt_subnet_event_code_t, + ibt_subnet_event_t *); +void eibnx_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, + ibt_async_event_t *); +boolean_t eibnx_is_gw_dead(eibnx_gw_info_t *); +void eibnx_create_eoib_node(void); +void eibnx_comp_intr(ibt_cq_hdl_t, void *); +uint_t eibnx_comp_handler(caddr_t, caddr_t); + +/* + * IBT related functions + */ +int eibnx_ibt_init(eibnx_t *); +int eibnx_find_mgroups(eibnx_thr_info_t *); +int eibnx_setup_cq(eibnx_thr_info_t *); +int eibnx_setup_ud_channel(eibnx_thr_info_t *); +int eibnx_setup_bufs(eibnx_thr_info_t *); +int eibnx_setup_cq_handler(eibnx_thr_info_t *); +int eibnx_join_mcgs(eibnx_thr_info_t *); +int eibnx_rejoin_mcgs(eibnx_thr_info_t *); +int eibnx_ibt_fini(eibnx_t *); + +void eibnx_rb_find_mgroups(eibnx_thr_info_t *); +void eibnx_rb_setup_cq(eibnx_thr_info_t *); +void eibnx_rb_setup_ud_channel(eibnx_thr_info_t *); +void eibnx_rb_setup_bufs(eibnx_thr_info_t *); +void eibnx_rb_setup_cq_handler(eibnx_thr_info_t *); +void eibnx_rb_join_mcgs(eibnx_thr_info_t *); + +eibnx_hca_t 
*eibnx_prepare_hca(ib_guid_t); +int eibnx_cleanup_hca(eibnx_hca_t *); + +/* + * FIP packetizing related functions + */ +int eibnx_fip_solicit_mcast(eibnx_thr_info_t *); +int eibnx_fip_solicit_ucast(eibnx_thr_info_t *, clock_t *); +int eibnx_fip_parse_pkt(uint8_t *, eibnx_gw_msg_t *); + +/* + * Queue and List related routines + */ +eibnx_wqe_t *eibnx_acquire_swqe(eibnx_thr_info_t *, int); +void eibnx_return_swqe(eibnx_wqe_t *); +void eibnx_return_rwqe(eibnx_thr_info_t *, eibnx_wqe_t *); +void eibnx_release_swqe(eibnx_wqe_t *); + +void eibnx_enqueue_child(eibnx_thr_info_t *, eibnx_gw_info_t *, char *, + dev_info_t *); +int eibnx_update_child(eibnx_thr_info_t *, eibnx_gw_info_t *, dev_info_t *); +dev_info_t *eibnx_find_child_dip_by_inst(eibnx_thr_info_t *, int); +dev_info_t *eibnx_find_child_dip_by_gw(eibnx_thr_info_t *, uint16_t); + +eibnx_gw_info_t *eibnx_find_gw_in_gwlist(eibnx_thr_info_t *, eibnx_gw_info_t *); +eibnx_gw_info_t *eibnx_add_gw_to_gwlist(eibnx_thr_info_t *, eibnx_gw_info_t *, + ibt_wc_t *, uint8_t *); +void eibnx_replace_gw_in_gwlist(eibnx_thr_info_t *, eibnx_gw_info_t *, + eibnx_gw_info_t *, ibt_wc_t *, uint8_t *, boolean_t *); +void eibnx_queue_for_creation(eibnx_thr_info_t *, eibnx_gw_info_t *); + +/* + * Logging and Error reporting routines + */ +void eibnx_debug_init(void); +void eibnx_debug_fini(void); +void eibnx_dprintf_crit(const char *fmt, ...); +void eibnx_dprintf_err(const char *fmt, ...); +void eibnx_dprintf_warn(const char *fmt, ...); +#ifdef ENX_DEBUG +void eibnx_dprintf_debug(const char *fmt, ...); +void eibnx_dprintf_args(const char *fmt, ...); +void eibnx_dprintf_verbose(const char *fmt, ...); +#endif + +/* + * Miscellaneous + */ +void eibnx_cleanup_port_nodes(eibnx_thr_info_t *); +void eibnx_create_node_props(dev_info_t *, eibnx_thr_info_t *, + eibnx_gw_info_t *); +int eibnx_name_child(dev_info_t *, char *, size_t); +void eibnx_busop_inprog_enter(eibnx_t *); +void eibnx_busop_inprog_exit(eibnx_t *); +eibnx_thr_info_t 
*eibnx_start_port_monitor(eibnx_hca_t *, eibnx_port_t *); +void eibnx_stop_port_monitor(eibnx_thr_info_t *); +void eibnx_terminate_monitors(void); +int eibnx_configure_node(eibnx_thr_info_t *, eibnx_gw_info_t *, dev_info_t **); +int eibnx_unconfigure_node(eibnx_thr_info_t *, eibnx_gw_info_t *); +int eibnx_locate_node_name(char *, eibnx_thr_info_t **, eibnx_gw_info_t **); +int eibnx_locate_unconfigured_node(eibnx_thr_info_t **, eibnx_gw_info_t **); + +/* + * Devctl cbops (currently dummy) + */ +int eibnx_devctl_open(dev_t *, int, int, cred_t *); +int eibnx_devctl_close(dev_t, int, int, cred_t *); +int eibnx_devctl_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +/* + * External variable references + */ +extern pri_t minclsyspri; +extern eibnx_t *enx_global_ss; +extern ib_gid_t enx_solicit_mgid; +extern ib_gid_t enx_advertise_mgid; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_IB_EOIB_ENX_IMPL_H */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#ifndef	_SYS_IB_EOIB_FIP_H
#define	_SYS_IB_EOIB_FIP_H

/*
 * On-the-wire layouts for the EoIB FIP control protocol: the messages a
 * host exchanges with an EoIB gateway to discover it (solicit/advertise),
 * bring vNICs up and down (login/login ack/logout), learn and track the
 * vHUB address table (vhub table/update) and keep sessions alive
 * (keep alive).  Subcodes named FIP_SUBCODE_H_* are sent by the host,
 * FIP_SUBCODE_G_* by the gateway.
 *
 * NOTE(review): these structures mirror wire formats, so multi-byte
 * fields are presumably carried in network byte order and the structs
 * must not be reordered or padded -- confirm against the message
 * builders/parsers in the EoIB drivers before changing anything here.
 */

#ifdef __cplusplus
extern "C" {
#endif

#include <sys/ethernet.h>
#include <sys/ib/ib_types.h>

/*
 * Sizes of various objects in FIP headers
 */
#define	FIP_VENDOR_LEN			8
#define	FIP_GUID_LEN			8
#define	FIP_SYSNAME_LEN			32
#define	FIP_PORTNAME_LEN		8
#define	FIP_MGID_PREFIX_LEN		5
#define	FIP_VNIC_NAME_LEN		16
#define	FIP_VHUBID_LEN			3

/*
 * EoIB Pkeys and Qkeys
 */
#define	EIB_ADMIN_PKEY			0xFFFF
#define	EIB_FIP_QKEY			0x80020002
#define	EIB_DATA_QKEY			0x80020003

/*
 * EoIB Advertise and Solicit MCG GUIDs
 */
#define	EIB_GUID_ADVERTISE_PREFIX	0xFF12E01B00060000
#define	EIB_GUID_SOLICIT_PREFIX		0xFF12E01B00070000

/*
 * FIP_Protocol_Version
 */
#define	FIP_PROTO_VERSION		0
typedef struct fip_proto_s {
	uint8_t		pr_version;
	uint8_t		pr_reserved[3];
} fip_proto_t;

/*
 * Basic FIP Header: Opcodes and subcodes for EoIB
 */
#define	FIP_OPCODE_EOIB			0xFFF9

#define	FIP_SUBCODE_H_SOLICIT		0x1
#define	FIP_SUBCODE_G_ADVERTISE		0x2
#define	FIP_SUBCODE_H_VNIC_LOGIN	0x3
#define	FIP_SUBCODE_G_VNIC_LOGIN_ACK	0x4
#define	FIP_SUBCODE_H_VNIC_LOGOUT	0x5
#define	FIP_SUBCODE_G_VHUB_UPDATE	0x6
#define	FIP_SUBCODE_G_VHUB_TABLE	0x7
#define	FIP_SUBCODE_H_KEEP_ALIVE	0x8

/*
 * Basic FIP Header: Flags relevant to EoIB
 */
#define	FIP_BHFLAG_GWAVAIL		0x4
#define	FIP_BHFLAG_SLCTMSG		0x2

/*
 * FIP_Basic_Header
 *
 * Note that the FIP_DESC_LEN_* values used throughout this file count
 * 4-byte words, not bytes: each fip_desc_*_t below is exactly
 * 4 * FIP_DESC_LEN_* bytes long.
 */
#define	FIP_DESC_TYPE_VENDOR_ID		13
#define	FIP_DESC_LEN_VENDOR_ID		3
typedef struct fip_basic_hdr_s {
	uint16_t	hd_opcode;
	uint8_t		hd_reserved1;
	uint8_t		hd_subcode;
	uint16_t	hd_desc_list_len;
	uint16_t	hd_flags;
	uint8_t		hd_type;
	uint8_t		hd_len;
	uint8_t		hd_reserved2[2];
	uint8_t		hd_vendor_id[FIP_VENDOR_LEN];
} fip_basic_hdr_t;

#define	FIP_IBA_QPN_MASK		0x00FFFFFF
#define	FIP_IBA_PORTID_MASK		0x0FFF
#define	FIP_IBA_SL_MASK			0xF000
#define	FIP_IBA_SL_SHIFT		12

/*
 * FIP_Descriptor_Infiniband_Address
 */
#define	FIP_DESC_TYPE_IBA		240
#define	FIP_DESC_LEN_IBA		7
typedef struct fip_desc_iba_s {
	uint8_t		ia_type;
	uint8_t		ia_len;
	uint8_t		ia_reserved[2];
	uint8_t		ia_vendor_id[FIP_VENDOR_LEN];
	uint32_t	ia_qpn;
	/* SL in bits 15-12, port id in bits 11-0 (FIP_IBA_SL_*, PORTID) */
	uint16_t	ia_sl_portid;
	uint16_t	ia_lid;
	uint8_t		ia_guid[FIP_GUID_LEN];
} fip_desc_iba_t;

/*
 * FIP Solicitation Control Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_Infiniband_Address
 */
typedef struct fip_solicit_s {
	fip_proto_t	sl_proto_version;
	fip_basic_hdr_t	sl_fip_hdr;
	fip_desc_iba_t	sl_iba;
} fip_solicit_t;

/*
 * FIP_Descriptor_EoIB_Gateway_Information
 */
#define	FIP_DESC_TYPE_EOIB_GW_INFO	241
#define	FIP_DESC_LEN_EOIB_GW_INFO	4
typedef struct fip_desc_gwinfo_s {
	uint8_t		gi_type;
	uint8_t		gi_len;
	uint8_t		gi_reserved1[2];
	uint8_t		gi_vendor_id[FIP_VENDOR_LEN];
	uint8_t		gi_flags;
	uint8_t		gi_reserved2;
	/* RSS QPN in bits 15-12, number of net vNICs in bits 11-0 */
	uint16_t	gi_rss_qpn_num_net_vnics;
} fip_desc_gwinfo_t;

#define	FIP_GWI_HOST_ADMIND_VNICS_MASK	0x80
#define	FIP_GWI_NUM_NET_VNICS_MASK	0x0FFF
#define	FIP_GWI_RSS_QPN_MASK		0xF000
#define	FIP_GWI_RSS_QPN_SHIFT		12

/*
 * FIP_Descriptor_Gateway_Identifier
 */
#define	FIP_DESC_TYPE_GW_ID		248
#define	FIP_DESC_LEN_GW_ID		15
typedef struct fip_desc_gwid_s {
	uint8_t		id_type;
	uint8_t		id_len;
	uint8_t		id_reserved[2];
	uint8_t		id_vendor_id[FIP_VENDOR_LEN];
	uint8_t		id_guid[FIP_GUID_LEN];
	uint8_t		id_sysname[FIP_SYSNAME_LEN];
	uint8_t		id_portname[FIP_PORTNAME_LEN];
} fip_desc_gwid_t;

/*
 * FIP_Descriptor_Keep_Alive_Parameters
 */
#define	FIP_DESC_TYPE_KEEP_ALIVE	249
#define	FIP_DESC_LEN_KEEP_ALIVE		6
typedef struct fip_desc_keepalive_s {
	uint8_t		ka_type;
	uint8_t		ka_len;
	uint8_t		ka_reserved[2];
	uint8_t		ka_vendor_id[FIP_VENDOR_LEN];
	uint32_t	ka_gw_adv_period;
	uint32_t	ka_gw_ka_period;
	uint32_t	ka_vnic_ka_period;
} fip_desc_keepalive_t;

/*
 * FIP Advertise Control Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_Infiniband_Address
 *	FIP_Descriptor_EoIB_Gateway_Information
 *	FIP_Descriptor_Gateway_Identifier
 *	FIP_Descriptor_Keep_Alive_Parameters
 */
typedef struct fip_advertise_s {
	fip_proto_t		ad_proto_version;
	fip_basic_hdr_t		ad_fip_header;
	fip_desc_iba_t		ad_iba;
	fip_desc_gwinfo_t	ad_gwinfo;
	fip_desc_gwid_t		ad_gwid;
	fip_desc_keepalive_t	ad_keep_alive;
} fip_advertise_t;

/*
 * FIP_Descriptor_vNIC_Login
 */
#define	FIP_DESC_TYPE_VNIC_LOGIN	242
#define	FIP_DESC_LEN_VNIC_LOGIN		13
typedef struct fip_desc_vnic_login_s {
	uint8_t		vl_type;
	uint8_t		vl_len;
	uint8_t		vl_reserved1[2];
	uint8_t		vl_vendor_id[FIP_VENDOR_LEN];
	uint16_t	vl_mtu;
	uint16_t	vl_vnic_id;
	/* V/M/VP/H flags in the top nibble, VLAN in bits 11-0 */
	uint16_t	vl_flags_vlan;
	uint8_t		vl_mac[ETHERADDRL];
	uint8_t		vl_gw_mgid_prefix[FIP_MGID_PREFIX_LEN];
	uint8_t		vl_reserved2;
	uint8_t		vl_flags_rss;
	uint8_t		vl_n_mac_mcgid;
	/* syndrome in bits 31-24 (FIP_VL_SYN_*), ctl QPN in bits 23-0 */
	uint32_t	vl_syndrome_ctl_qpn;
	uint8_t		vl_vnic_name[FIP_VNIC_NAME_LEN];
} fip_desc_vnic_login_t;

/*
 * Flags, masks and error codes for FIP_Descriptor_vNIC_Login
 */
#define	FIP_VL_VNIC_ID_MSBIT		0x8000
#define	FIP_VL_FLAGS_V			0x8000
#define	FIP_VL_FLAGS_M			0x4000
#define	FIP_VL_FLAGS_VP			0x2000
#define	FIP_VL_FLAGS_H			0x1000
#define	FIP_VL_VLAN_MASK		0x0FFF
#define	FIP_VL_RSS_MASK			0x10
#define	FIP_VL_N_RSS_MCGID_MASK		0x0F
#define	FIP_VL_N_MAC_MCGID_MASK		0x3F
#define	FIP_VL_CTL_QPN_MASK		0x00FFFFFF

#define	FIP_VL_SYN_MASK			0xFF000000
#define	FIP_VL_SYN_SHIFT		24

#define	FIP_VL_SYN_SUCCESS		0
#define	FIP_VL_SYN_REJECTED		1
#define	FIP_VL_SYN_GW_NO_RESOURCE	2
#define	FIP_VL_SYN_NO_MORE_NWK_ADDRS	3
#define	FIP_VL_SYN_UNKNOWN_HOST		4
#define	FIP_VL_SYN_UNSUPP_PARAM		5

/*
 * FIP_Descriptor_Partition
 */
#define	FIP_DESC_TYPE_PARTITION		246
#define	FIP_DESC_LEN_PARTITION		4
typedef struct fip_desc_partition_s {
	uint8_t		pn_type;
	uint8_t		pn_len;
	uint8_t		pn_reserved1[2];
	uint8_t		pn_vendor_id[FIP_VENDOR_LEN];
	uint8_t		pn_reserved2[2];
	uint16_t	pn_pkey;
} fip_desc_partition_t;

/*
 * FIP Login Control Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_Infiniband_Address
 *	FIP_Descriptor_vNIC_Login
 */
typedef struct fip_login_s {
	fip_proto_t		lg_proto_version;
	fip_basic_hdr_t		lg_fip_header;
	fip_desc_iba_t		lg_iba;
	fip_desc_vnic_login_t	lg_vnic_login;
} fip_login_t;

/*
 * FIP Login ACK Control Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_Infiniband_Address
 *	FIP_Descriptor_vNIC_Login
 *	FIP_Descriptor_Partition
 */
typedef struct fip_login_ack_s {
	fip_proto_t		ak_proto_version;
	fip_basic_hdr_t		ak_fip_header;
	fip_desc_iba_t		ak_iba;
	fip_desc_vnic_login_t	ak_vnic_login;
	fip_desc_partition_t	ak_vhub_partition;
} fip_login_ack_t;

/*
 * FIP_Descriptor_vNIC_Identity
 */
#define	FIP_DESC_TYPE_VNIC_IDENTITY	245
#define	FIP_DESC_LEN_VNIC_IDENTITY	13
typedef struct fip_desc_vnic_identity_s {
	uint8_t		vi_type;
	uint8_t		vi_len;
	uint8_t		vi_reserved1[2];
	uint8_t		vi_vendor_id[FIP_VENDOR_LEN];
	/* U/R/VP flags (FIP_VI_FLAG_*) plus the vHUB id */
	uint32_t	vi_flags_vhub_id;
	uint32_t	vi_tusn;
	uint16_t	vi_vnic_id;
	uint8_t		vi_mac[ETHERADDRL];
	uint8_t		vi_port_guid[FIP_GUID_LEN];
	uint8_t		vi_vnic_name[FIP_VNIC_NAME_LEN];
} fip_desc_vnic_identity_t;

#define	FIP_VI_FLAG_U			0x80000000
#define	FIP_VI_FLAG_R			0x40000000
#define	FIP_VI_FLAG_VP			0x01000000

/*
 * FIP Keep Alive Control Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_vNIC_Identity
 */
typedef struct fip_keep_alive_s {
	fip_proto_t			ka_proto_version;
	fip_basic_hdr_t			ka_fip_header;
	fip_desc_vnic_identity_t	ka_vnic_identity;
} fip_keep_alive_t;

/*
 * FIP_vHUB_Table_Entry
 */
typedef struct fip_vhub_table_entry_s {
	/* valid bit, RSS bit and entry type (see FIP_TE_* below) */
	uint8_t		te_v_rss_type;
	uint8_t		te_reserved1;
	uint8_t		te_mac[ETHERADDRL];
	uint32_t	te_qpn;
	uint8_t		te_reserved2;
	uint8_t		te_sl;
	uint16_t	te_lid;
} fip_vhub_table_entry_t;

#define	FIP_TE_VALID			0x80
#define	FIP_TE_RSS			0x40

#define	FIP_TE_TYPE_MASK		0x0F
#define	FIP_TE_TYPE_VNIC		0x00
#define	FIP_TE_TYPE_GATEWAY		0x01
#define	FIP_TE_TYPE_UNICAST_MISS	0x02
#define	FIP_TE_TYPE_MULTICAST_ENTRY	0x03
#define	FIP_TE_TYPE_VHUB_MULTICAST	0x04

#define	FIP_TE_SL_MASK			0x0F
#define	FIP_TE_QPN_MASK			0x00FFFFFF

/* Entry size in bytes and in 32-bit words */
#define	FIP_VHUB_TABLE_ENTRY_SZ		(sizeof (fip_vhub_table_entry_t))
#define	FIP_VHUB_TABLE_ENTRY_WORDS	(FIP_VHUB_TABLE_ENTRY_SZ >> 2)

/*
 * FIP_Descriptor_vHUB_Update
 */
#define	FIP_DESC_TYPE_VHUB_UPDATE	243
#define	FIP_DESC_LEN_VHUB_UPDATE	9
typedef struct fip_desc_vhub_update_s {
	uint8_t		up_type;
	uint8_t		up_len;
	uint8_t		up_reserved1[2];
	uint8_t		up_vendor_id[FIP_VENDOR_LEN];
	/* eport state in bits 29-28, VP bit 24, vHUB id in bits 23-0 */
	uint32_t	up_eport_vp_vhub_id;
	uint32_t	up_tusn;
	fip_vhub_table_entry_t	up_tbl_entry;
} fip_desc_vhub_update_t;

#define	FIP_UP_VP_SHIFT			24
#define	FIP_UP_VP_MASK			0x1
#define	FIP_UP_EPORT_STATE_SHIFT	28
#define	FIP_UP_EPORT_STATE_MASK		0x3
#define	FIP_UP_VHUB_ID_MASK		0x00FFFFFF

#define	FIP_EPORT_DOWN			0x0
#define	FIP_EPORT_UP			0x1

/*
 * FIP_Descriptor_vHUB_Table
 *
 * Variable length descriptor (no FIP_DESC_LEN_* constant): the fixed
 * part below is followed on the wire by tb_table_size worth of
 * FIP_vHUB_Table_Entry records and a trailing 32-bit checksum.
 */
#define	FIP_DESC_TYPE_VHUB_TABLE	244
typedef struct fip_desc_vhub_table_s {
	uint8_t		tb_type;
	uint8_t		tb_len;
	uint8_t		tb_reserved1[2];
	uint8_t		tb_vendor_id[FIP_VENDOR_LEN];
	uint32_t	tb_flags_vhub_id;
	uint32_t	tb_tusn;
	/* table segmentation: FIRST/MIDDLE/LAST/ONLY (FIP_TB_HDR_*) */
	uint8_t		tb_hdr;
	uint8_t		tb_reserved2;
	uint16_t	tb_table_size;
	/*
	 * FIP_vHUB_Table_Entry
	 * FIP_vHUB_Table_Entry
	 * .
	 * .
	 * .
	 * uint32_t	Checksum
	 */
} fip_desc_vhub_table_t;

#define	FIP_TB_FLAGS_VP_SHIFT		24
#define	FIP_TB_FLAGS_VP_MASK		0x1

#define	FIP_TB_VHUB_ID_MASK		0x00FFFFFF

#define	FIP_TB_HDR_MIDDLE		0x00
#define	FIP_TB_HDR_FIRST		0x40
#define	FIP_TB_HDR_LAST			0x80
#define	FIP_TB_HDR_ONLY			0xC0

/* Fixed-part size in bytes and in 32-bit words */
#define	FIP_DESC_VHUB_TABLE_SZ		(sizeof (fip_desc_vhub_table_t))
#define	FIP_DESC_VHUB_TABLE_WORDS	(FIP_DESC_VHUB_TABLE_SZ >> 2)

/*
 * FIP vHUB Table Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_vHUB_Table
 */
typedef struct fip_vhub_table_s {
	fip_proto_t		vt_proto_version;
	fip_basic_hdr_t		vt_fip_header;
	fip_desc_vhub_table_t	vt_vhub_table;
} fip_vhub_table_t;

/*
 * FIP vHUB Update Message:
 *
 *	FIP_Protocol_Version
 *	FIP_Basic_Header
 *	FIP_Descriptor_vHUB_Update
 */
typedef struct fip_vhub_update_s {
	fip_proto_t		vu_proto_version;
	fip_basic_hdr_t		vu_fip_header;
	fip_desc_vhub_update_t	vu_vhub_update;
} fip_vhub_update_t;

/*
 * Just a generic container to handle either type of VHUB
 * messages
 */
typedef struct fip_vhub_pkt_s {
	fip_proto_t	hb_proto_version;
	fip_basic_hdr_t	hb_fip_header;
} fip_vhub_pkt_t;

#ifdef __cplusplus
}
#endif

#endif	/* _SYS_IB_EOIB_FIP_H */
#
# InfiniBand pseudo drivers
# (eibnx is the EoIB nexus driver, eoib the EoIB driver; PSARC/2010/259)
#
DRV_KMODS	+= ib ibp eibnx eoib rdsib sdp iser daplt hermon tavor sol_ucma sol_uverbs
DRV_KMODS	+= sol_umad

#
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# This makefile drives the production of the EoIB Nexus driver (eibnx)
#
# intel architecture dependent
#

#
# Path to the base of the uts directory tree (usually /usr/src/uts)
#
UTSBASE		= ../..

#
# Define the module and object file sets
#
MODULE		= eibnx
OBJECTS		= $(EIBNX_OBJS:%=$(OBJS_DIR)/%)
LINTS		= $(EIBNX_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
CONF_SRCDIR	= $(UTSBASE)/common/io/ib/clients/eoib
WARLOCK_OUT	= $(EIBNX_OBJS:%.o=%.ll)
WARLOCK_OK	= $(MODULE).ok
WLCMD_DIR	= $(UTSBASE)/common/io/warlock

#
# Include common rules
#
include $(UTSBASE)/intel/Makefile.intel

#
# Define targets
#
ALL_TARGET	= $(BINARY) $(SRC_CONFILE)
LINT_TARGET	= $(MODULE).lint
INSTALL_TARGET	= $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)

# Module specific debug flag
#
CPPFLAGS	+= -DENX_DEBUG

#
# Lint pass one enforcement
#
CFLAGS		+= $(CCVERBOSE)

#
# Depends on misc/ibcm and misc/ibtl
#
LDFLAGS		+= -dy -Nmisc/ibcm -Nmisc/ibtl

#
# The only lint flag we should need
#
LINTTAGS	+= -erroff=E_BAD_PTR_CAST_ALIGN

#
# Default build targets
#
.KEEP_STATE:

def:		$(DEF_DEPS)

all:		$(ALL_DEPS)

clean:		$(CLEAN_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

clobber:	$(CLOBBER_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

lint:		$(LINT_DEPS)

modlintlib:	$(MODLINTLIB_DEPS)

clean.lint:	$(CLEAN_LINT_DEPS)

install:	$(INSTALL_DEPS)

#
# Include common targets
#
include $(UTSBASE)/intel/Makefile.targ

#
# Defines for local commands used by the warlock (static lock-analysis)
# targets below; these are not part of the normal build
#
WARLOCK		= warlock
WLCC		= wlcc
TOUCH		= touch
TEST		= test

warlock: $(WARLOCK_OK)

$(WARLOCK_OK): $(WARLOCK_OUT) $(WLCMD_DIR)/eibnx.wlcmd warlock_ddi.files
	$(WARLOCK) -c $(WLCMD_DIR)/eibnx.wlcmd $(WARLOCK_OUT) \
	    -l ../warlock/ddi_dki_impl.ll
	$(TOUCH) $@

%.ll: $(UTSBASE)/common/io/ib/clients/eoib/enx_main.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_hdlrs.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_ibt.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_log.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_fip.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_misc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_q.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_ctl.c \
	$(UTSBASE)/common/sys/ib/clients/eoib/fip.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/enx_impl.h
	$(WLCC) $(CPPFLAGS) -DDEBUG -o $@ $<

warlock_ddi.files:
	@cd ../warlock; pwd; $(MAKE) warlock
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# This makefile drives the production of the EoIB driver (eoib).
# (The nexus driver, eibnx, is built by a separate makefile.)
#
# intel architecture dependent
#

#
# Path to the base of the uts directory tree (usually /usr/src/uts)
#
UTSBASE		= ../..

#
# Define the module and object file sets
#
MODULE		= eoib
OBJECTS		= $(EOIB_OBJS:%=$(OBJS_DIR)/%)
LINTS		= $(EOIB_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
WARLOCK_OUT	= $(EOIB_OBJS:%.o=%.ll)
WARLOCK_OK	= $(MODULE).ok
WLCMD_DIR	= $(UTSBASE)/common/io/warlock

#
# Include common rules
#
include $(UTSBASE)/intel/Makefile.intel

#
# Define targets
#
ALL_TARGET	= $(BINARY)
LINT_TARGET	= $(MODULE).lint
INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)

# Module specific debug flag
#
CPPFLAGS	+= -DEIB_DEBUG

#
# Lint pass one enforcement
#
CFLAGS		+= $(CCVERBOSE)

#
# Depends on misc/mac, misc/ibtl, misc/ibcm and misc/ibmf
#
LDFLAGS		+= -dy -Nmisc/mac -Nmisc/ibtl -Nmisc/ibcm -Nmisc/ibmf

#
# Default build targets
#
.KEEP_STATE:

def:		$(DEF_DEPS)

all:		$(ALL_DEPS)

clean:		$(CLEAN_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

clobber:	$(CLOBBER_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

lint:		$(LINT_DEPS)

modlintlib:	$(MODLINTLIB_DEPS)

clean.lint:	$(CLEAN_LINT_DEPS)

install:	$(INSTALL_DEPS)

#
# Include common targets
#
include $(UTSBASE)/intel/Makefile.targ

#
# Defines for local commands used by the warlock (static lock-analysis)
# targets below; these are not part of the normal build
#
WARLOCK		= warlock
WLCC		= wlcc
TOUCH		= touch
TEST		= test

warlock: $(WARLOCK_OK)

$(WARLOCK_OK): $(WARLOCK_OUT) $(WLCMD_DIR)/eoib.wlcmd warlock_ddi.files
	$(WARLOCK) -c $(WLCMD_DIR)/eoib.wlcmd $(WARLOCK_OUT) \
	    -l ../warlock/ddi_dki_impl.ll
	$(TOUCH) $@

%.ll: $(UTSBASE)/common/io/ib/clients/eoib/eib_adm.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_chan.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_cmn.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_ctl.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_data.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_fip.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_ibt.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_log.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_mac.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_main.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_rsrc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_svc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_vnic.c \
	$(UTSBASE)/common/sys/ib/clients/eoib/fip.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib_impl.h
	$(WLCC) $(CPPFLAGS) -DDEBUG -o $@ $<

warlock_ddi.files:
	@cd ../warlock; pwd; $(MAKE) warlock
DRV_KMODS	+= usbecm
DRV_KMODS	+= hci1394 av1394 scsa1394 dcam1394
DRV_KMODS	+= sbp2
# eibnx is the EoIB nexus driver, eoib the EoIB driver (PSARC/2010/259)
DRV_KMODS	+= ib ibp eibnx eoib rdsib sdp iser daplt hermon tavor sol_ucma sol_uverbs
DRV_KMODS	+= sol_umad
DRV_KMODS	+= pci_pci pcieb pcieb_bcm
DRV_KMODS	+= i8042 kb8042 mouse8042
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# This makefile drives the production of the EoIB Nexus driver (eibnx)
#
# sparc architecture dependent
#

#
# Path to the base of the uts directory tree (usually /usr/src/uts)
#
UTSBASE		= ../..

#
# Define the module and object file sets
#
MODULE		= eibnx
OBJECTS		= $(EIBNX_OBJS:%=$(OBJS_DIR)/%)
LINTS		= $(EIBNX_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
CONF_SRCDIR	= $(UTSBASE)/common/io/ib/clients/eoib
WARLOCK_OUT	= $(EIBNX_OBJS:%.o=%.ll)
WARLOCK_OK	= $(MODULE).ok
WLCMD_DIR	= $(UTSBASE)/common/io/warlock

#
# Include common rules
#
include $(UTSBASE)/sparc/Makefile.sparc

#
# Define targets
#
ALL_TARGET	= $(BINARY) $(SRC_CONFILE)
LINT_TARGET	= $(MODULE).lint
INSTALL_TARGET	= $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)

#
# Overrides: this module is built 64-bit only on sparc.
#
ALL_BUILDS	= $(ALL_BUILDSONLY64)
DEF_BUILDS	= $(DEF_BUILDSONLY64)

# Module specific debug flag
#
CPPFLAGS	+= -DENX_DEBUG

#
# Lint pass one enforcement
#
CFLAGS		+= $(CCVERBOSE)

#
# Depends on misc/ibcm and misc/ibtl
#
LDFLAGS		+= -dy -Nmisc/ibcm -Nmisc/ibtl

#
# The only lint flag we should need
#
LINTTAGS	+= -erroff=E_BAD_PTR_CAST_ALIGN

#
# Default build targets
#
.KEEP_STATE:

def:		$(DEF_DEPS)

all:		$(ALL_DEPS)

clean:		$(CLEAN_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

clobber:	$(CLOBBER_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

lint:		$(LINT_DEPS)

modlintlib:	$(MODLINTLIB_DEPS) lint32

clean.lint:	$(CLEAN_LINT_DEPS)

install:	$(INSTALL_DEPS)

#
# Include common targets
#
include $(UTSBASE)/sparc/Makefile.targ

#
# Defines for local commands used by the warlock (static lock-analysis)
# targets below; these are not part of the normal build
#
WARLOCK		= warlock
WLCC		= wlcc
TOUCH		= touch
TEST		= test

warlock: $(WARLOCK_OK)

$(WARLOCK_OK): $(WARLOCK_OUT) $(WLCMD_DIR)/eibnx.wlcmd warlock_ddi.files
	$(WARLOCK) -c $(WLCMD_DIR)/eibnx.wlcmd $(WARLOCK_OUT) \
	    -l ../warlock/ddi_dki_impl.ll
	$(TOUCH) $@

%.ll: $(UTSBASE)/common/io/ib/clients/eoib/enx_main.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_hdlrs.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_ibt.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_log.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_fip.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_misc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_q.c \
	$(UTSBASE)/common/io/ib/clients/eoib/enx_ctl.c \
	$(UTSBASE)/common/sys/ib/clients/eoib/fip.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/enx_impl.h
	$(WLCC) $(CPPFLAGS) -DDEBUG -o $@ $<

warlock_ddi.files:
	@cd ../warlock; pwd; $(MAKE) warlock
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# This makefile drives the production of the EoIB driver (eoib).
# (The nexus driver, eibnx, is built by a separate makefile.)
#
# sparc architecture dependent
#

#
# Path to the base of the uts directory tree (usually /usr/src/uts)
#
UTSBASE		= ../..

#
# Define the module and object file sets
#
MODULE		= eoib
OBJECTS		= $(EOIB_OBJS:%=$(OBJS_DIR)/%)
LINTS		= $(EOIB_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
WARLOCK_OUT	= $(EOIB_OBJS:%.o=%.ll)
WARLOCK_OK	= $(MODULE).ok
WLCMD_DIR	= $(UTSBASE)/common/io/warlock

#
# Include common rules
#
include $(UTSBASE)/sparc/Makefile.sparc

#
# Define targets
#
ALL_TARGET	= $(BINARY)
LINT_TARGET	= $(MODULE).lint
INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)

#
# Overrides: this module is built 64-bit only on sparc.
#
ALL_BUILDS	= $(ALL_BUILDSONLY64)
DEF_BUILDS	= $(DEF_BUILDSONLY64)

# Module specific debug flag
#
CPPFLAGS	+= -DEIB_DEBUG

#
# Lint pass one enforcement
#
CFLAGS		+= $(CCVERBOSE)

#
# Depends on misc/mac, misc/ibtl, misc/ibcm and misc/ibmf
#
LDFLAGS		+= -dy -Nmisc/mac -Nmisc/ibtl -Nmisc/ibcm -Nmisc/ibmf

#
# Default build targets
#
.KEEP_STATE:

def:		$(DEF_DEPS)

all:		$(ALL_DEPS)

clean:		$(CLEAN_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

clobber:	$(CLOBBER_DEPS)
	$(RM) $(WARLOCK_OUT) $(WARLOCK_OK)

lint:		$(LINT_DEPS)

modlintlib:	$(MODLINTLIB_DEPS) lint32

clean.lint:	$(CLEAN_LINT_DEPS)

install:	$(INSTALL_DEPS)

#
# Include common targets
#
include $(UTSBASE)/sparc/Makefile.targ

#
# Defines for local commands used by the warlock (static lock-analysis)
# targets below; these are not part of the normal build
#
WARLOCK		= warlock
WLCC		= wlcc
TOUCH		= touch
TEST		= test

warlock: $(WARLOCK_OK)

$(WARLOCK_OK): $(WARLOCK_OUT) $(WLCMD_DIR)/eoib.wlcmd warlock_ddi.files
	$(WARLOCK) -c $(WLCMD_DIR)/eoib.wlcmd $(WARLOCK_OUT) \
	    -l ../warlock/ddi_dki_impl.ll
	$(TOUCH) $@

%.ll: $(UTSBASE)/common/io/ib/clients/eoib/eib_adm.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_chan.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_cmn.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_ctl.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_data.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_fip.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_ibt.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_log.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_mac.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_main.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_rsrc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_svc.c \
	$(UTSBASE)/common/io/ib/clients/eoib/eib_vnic.c \
	$(UTSBASE)/common/sys/ib/clients/eoib/fip.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib.h \
	$(UTSBASE)/common/sys/ib/clients/eoib/eib_impl.h
	$(WLCC) $(CPPFLAGS) -DDEBUG -o $@ $<

warlock_ddi.files:
	@cd ../warlock; pwd; $(MAKE) warlock