Mercurial > illumos > illumos-gate
changeset 9985:f0a94a205b15
6845344 IBD driver should attach if PKEY is Valid with fullmembership bit set
author | Venkatakrishnan Rajagopalan <Venkatakrishnan.Rajagopalan@Sun.COM> |
---|---|
date | Mon, 29 Jun 2009 02:56:22 -0700 |
parents | cc88cd2a9d42 |
children | 4d51e0eb2206 |
files | usr/src/uts/common/io/ib/clients/ibd/ibd.c usr/src/uts/common/sys/ib/clients/ibd/ibd.h usr/src/uts/intel/ibd/Makefile usr/src/uts/sparc/ibd/Makefile |
diffstat | 4 files changed, 1009 insertions(+), 822 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/common/io/ib/clients/ibd/ibd.c Mon Jun 29 02:30:05 2009 -0700 +++ b/usr/src/uts/common/io/ib/clients/ibd/ibd.c Mon Jun 29 02:56:22 2009 -0700 @@ -157,6 +157,12 @@ static uint_t ibd_rxcomp_usec = 10; /* + * Send CQ moderation parameters: NOT tunables + */ +#define IBD_TXCOMP_COUNT 10 +#define IBD_TXCOMP_USEC 300 + +/* * Thresholds * * When waiting for resources (swqes or lso buffers) to become available, @@ -225,11 +231,36 @@ #define IBD_OP_ROUTERED 4 /* + * State of IBD driver initialization during attach/m_start + */ +#define IBD_DRV_STATE_INITIALIZED 0x00001 +#define IBD_DRV_RXINTR_ADDED 0x00002 +#define IBD_DRV_TXINTR_ADDED 0x00004 +#define IBD_DRV_IBTL_ATTACH_DONE 0x00008 +#define IBD_DRV_HCA_OPENED 0x00010 +#define IBD_DRV_PD_ALLOCD 0x00020 +#define IBD_DRV_MAC_REGISTERED 0x00040 +#define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080 +#define IBD_DRV_BCAST_GROUP_FOUND 0x00100 +#define IBD_DRV_ACACHE_INITIALIZED 0x00200 +#define IBD_DRV_CQS_ALLOCD 0x00400 +#define IBD_DRV_UD_CHANNEL_SETUP 0x00800 +#define IBD_DRV_TXLIST_ALLOCD 0x01000 +#define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000 +#define IBD_DRV_RXLIST_ALLOCD 0x04000 +#define IBD_DRV_BCAST_GROUP_JOINED 0x08000 +#define IBD_DRV_ASYNC_THR_CREATED 0x10000 +#define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000 +#define IBD_DRV_SM_NOTICES_REGISTERED 0x40000 +#define IBD_DRV_STARTED 0x80000 + +/* * Miscellaneous constants */ #define IBD_SEND 0 #define IBD_RECV 1 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF +#define IBD_DEF_MAX_SDU 2044 #ifdef IBD_LOGGING #define IBD_DMAX_LINE 100 #endif @@ -283,7 +314,6 @@ * Initialization */ static int ibd_state_init(ibd_state_t *, dev_info_t *); -static int ibd_drv_init(ibd_state_t *); static int ibd_init_txlist(ibd_state_t *); static int ibd_init_rxlist(ibd_state_t *); static int ibd_acache_init(ibd_state_t *); @@ -295,7 +325,6 @@ * Termination/cleanup */ static void ibd_state_fini(ibd_state_t *); -static void ibd_drv_fini(ibd_state_t *); static void ibd_fini_txlist(ibd_state_t *); static void ibd_fini_rxlist(ibd_state_t *); static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); @@ -348,7 +377,7 @@ */ static boolean_t ibd_send(ibd_state_t *, mblk_t *); static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); -static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t); +static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t); static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); static void ibd_flush_rx(ibd_state_t *, mblk_t *); @@ -384,7 +413,6 @@ static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); static uint64_t ibd_get_portspeed(ibd_state_t *); -static int ibd_get_portpkey(ibd_state_t *, ib_guid_t *); static boolean_t ibd_async_safe(ibd_state_t *); static void ibd_async_done(ibd_state_t *); static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); @@ -394,6 +422,18 @@ static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); /* + * Helpers for attach/start routines + */ +static int ibd_register_mac(ibd_state_t *, dev_info_t *); +static int ibd_record_capab(ibd_state_t *, dev_info_t *); +static int ibd_unattach(ibd_state_t *, dev_info_t *); +static int ibd_get_port_details(ibd_state_t *); +static int ibd_alloc_cqs(ibd_state_t *); +static int ibd_setup_ud_channel(ibd_state_t *); +static int ibd_undo_m_start(ibd_state_t *); + + +/* * Miscellaneous helpers */ static int ibd_sched_poll(ibd_state_t *, int, int); @@ -439,7 +479,7 @@ * GLDv3 entry points */ #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) -static mac_callbacks_t ib_m_callbacks = { +static mac_callbacks_t ibd_m_callbacks = { IBD_M_CALLBACK_FLAGS, ibd_m_stat, ibd_m_start, @@ -625,7 +665,7 @@ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, ibd_state_t::id_link_state)) _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) -_NOTE(SCHEME_PROTECTS_DATA("only async thr and drv init", +_NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", ibd_state_t::id_link_speed)) /* @@ -1236,7 +1276,7 @@ } /* - * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet + * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet * trap or event handling to complete to kill the async thread and deconstruct * the mcg/ace list. */ @@ -1660,7 +1700,7 @@ ibt_path_attr_t path_attr; ibt_path_info_t path_info; ib_gid_t destgid; - int ret = IBD_OP_NOTSTARTED; + char ret = IBD_OP_NOTSTARTED; DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), @@ -1954,7 +1994,7 @@ mutex_enter(&state->id_link_mutex); /* - * If the init code in ibd_drv_init hasn't yet set up the + * If the init code in ibd_m_start hasn't yet set up the * pkey/gid, nothing to do; that code will set the link state. */ if (state->id_link_state == LINK_STATE_UNKNOWN) { @@ -2071,8 +2111,8 @@ * done ibt_open_hca() but not yet done ibt_close_hca(). * Only need to do work for our port; IBTF will deliver * events for other ports on the hca we have ibt_open_hca'ed - * too. Note that ibd_drv_init() initializes id_port before - * doing ibt_open_hca(). + * too. Note that id_port is initialized in ibd_attach() + * before we do an ibt_open_hca() in ibd_attach(). */ ASSERT(state->id_hca_hdl == hca_hdl); if (state->id_port != event->ev_port) @@ -2091,8 +2131,8 @@ * done ibt_open_hca() but not yet done ibt_close_hca(). * Only need to do work for our port; IBTF will deliver * events for other ports on the hca we have ibt_open_hca'ed - * too. Note that ibd_drv_init() initializes id_port before - * doing ibt_open_hca(). + * too. Note that id_port is initialized in ibd_attach() + * before we do an ibt_open_hca() in ibd_attach(). */ ASSERT(state->id_hca_hdl == hca_hdl); if (state->id_port != event->ev_port) @@ -2121,160 +2161,330 @@ } } +static int +ibd_register_mac(ibd_state_t *state, dev_info_t *dip) +{ + mac_register_t *macp; + int ret; + + if ((macp = mac_alloc(MAC_VERSION)) == NULL) { + DPRINT(10, "ibd_register_mac: mac_alloc() failed"); + return (DDI_FAILURE); + } + + /* + * Note that when we register with mac during attach, we don't + * have the id_macaddr yet, so we'll simply be registering a + * zero macaddr that we'll overwrite later during plumb (in + * ibd_m_start()). Similar is the case with id_mtu - we'll + * update the mac layer with the correct mtu during plumb. + */ + macp->m_type_ident = MAC_PLUGIN_IDENT_IB; + macp->m_driver = state; + macp->m_dip = dip; + macp->m_src_addr = (uint8_t *)&state->id_macaddr; + macp->m_callbacks = &ibd_m_callbacks; + macp->m_min_sdu = 0; + macp->m_max_sdu = IBD_DEF_MAX_SDU; + + /* + * Register ourselves with the GLDv3 interface + */ + if ((ret = mac_register(macp, &state->id_mh)) != 0) { + mac_free(macp); + DPRINT(10, + "ibd_register_mac: mac_register() failed, ret=%d", ret); + return (DDI_FAILURE); + } + + mac_free(macp); + return (DDI_SUCCESS); +} + +static int +ibd_record_capab(ibd_state_t *state, dev_info_t *dip) +{ + ibt_hca_attr_t hca_attrs; + ibt_status_t ibt_status; + + /* + * Query the HCA and fetch its attributes + */ + ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); + ASSERT(ibt_status == IBT_SUCCESS); + + /* + * 1. Set the Hardware Checksum capability. Currently we only consider + * full checksum offload. + */ + if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { + state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; + } + + /* + * 2. Set LSO policy, capability and maximum length + */ + if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { + state->id_lso_policy = B_TRUE; + } else { + state->id_lso_policy = B_FALSE; + } + if (hca_attrs.hca_max_lso_size > 0) { + state->id_lso_capable = B_TRUE; + if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) + state->id_lso_maxlen = IBD_LSO_MAXLEN; + else + state->id_lso_maxlen = hca_attrs.hca_max_lso_size; + } else { + state->id_lso_capable = B_FALSE; + state->id_lso_maxlen = 0; + } + + /* + * 3. Set Reserved L_Key capability + */ + if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { + state->id_hca_res_lkey_capab = 1; + state->id_res_lkey = hca_attrs.hca_reserved_lkey; + } + + /* + * 4. Set maximum sqseg value after checking to see if extended sgl + * size information is provided by the hca + */ + if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { + state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; + } else { + state->id_max_sqseg = hca_attrs.hca_max_sgl; + } + if (state->id_max_sqseg > IBD_MAX_SQSEG) { + state->id_max_sqseg = IBD_MAX_SQSEG; + } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { + ibd_print_warn(state, "Set #sgl = %d instead of default %d", + state->id_max_sqseg, IBD_MAX_SQSEG); + } + + /* + * 5. Set number of recv and send wqes after checking hca maximum + * channel size + */ + if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { + state->id_num_rwqe = hca_attrs.hca_max_chan_sz; + } else { + state->id_num_rwqe = IBD_NUM_RWQE; + } + if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { + state->id_num_swqe = hca_attrs.hca_max_chan_sz; + } else { + state->id_num_swqe = IBD_NUM_SWQE; + } + + return (DDI_SUCCESS); +} + +static int +ibd_unattach(ibd_state_t *state, dev_info_t *dip) +{ + int instance; + uint32_t progress = state->id_mac_state; + ibt_status_t ret; + + if (progress & IBD_DRV_MAC_REGISTERED) { + (void) mac_unregister(state->id_mh); + state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); + } + + if (progress & IBD_DRV_PD_ALLOCD) { + if ((ret = ibt_free_pd(state->id_hca_hdl, + state->id_pd_hdl)) != IBT_SUCCESS) { + ibd_print_warn(state, "failed to free " + "protection domain, ret=%d", ret); + } + state->id_pd_hdl = NULL; + state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); + } + + if (progress & IBD_DRV_HCA_OPENED) { + if ((ret = ibt_close_hca(state->id_hca_hdl)) != + IBT_SUCCESS) { + ibd_print_warn(state, "failed to close " + "HCA device, ret=%d", ret); + } + state->id_hca_hdl = NULL; + state->id_mac_state &= (~IBD_DRV_HCA_OPENED); + } + + if (progress & IBD_DRV_IBTL_ATTACH_DONE) { + if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { + ibd_print_warn(state, + "ibt_detach() failed, ret=%d", ret); + } + state->id_ibt_hdl = NULL; + state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); + } + + if (progress & IBD_DRV_TXINTR_ADDED) { + ddi_remove_softintr(state->id_tx); + state->id_tx = NULL; + state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); + } + + if (progress & IBD_DRV_RXINTR_ADDED) { + ddi_remove_softintr(state->id_rx); + state->id_rx = NULL; + state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); + } + + if (progress & IBD_DRV_STATE_INITIALIZED) { + ibd_state_fini(state); + state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); + } + + instance = ddi_get_instance(dip); + ddi_soft_state_free(ibd_list, instance); + + return (DDI_SUCCESS); +} + /* * Attach device to the IO framework. */ static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { - mac_register_t *macp; - ibd_state_t *state; + ibd_state_t *state = NULL; + ib_guid_t hca_guid; int instance; - int err; - - switch (cmd) { - case DDI_ATTACH: - break; - case DDI_RESUME: - /* This driver does not support resume */ - default: - return (DDI_FAILURE); - } + ibt_status_t ret; + int rv; /* - * Allocate soft device data structure + * IBD doesn't support suspend/resume + */ + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Allocate softstate structure */ instance = ddi_get_instance(dip); if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) return (DDI_FAILURE); state = ddi_get_soft_state(ibd_list, instance); - /* pre ibt_attach() soft state initialization */ + /* + * Initialize mutexes and condition variables + */ if (ibd_state_init(state, dip) != DDI_SUCCESS) { - DPRINT(10, "ibd_attach : failed in ibd_state_init()"); - goto attach_fail_state_init; - } - - /* alloc rx soft intr */ - if ((ibd_rx_softintr == 1) && - ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, - NULL, NULL, ibd_intr, (caddr_t)state) != DDI_SUCCESS) { - DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); - goto attach_fail_ddi_add_rx_softintr; - } - - /* alloc tx soft intr */ - if ((ibd_tx_softintr == 1) && - ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, - NULL, NULL, ibd_tx_recycle, (caddr_t)state) != DDI_SUCCESS) { - DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); - goto attach_fail_ddi_add_tx_softintr; - } - - /* "attach" to IBTL */ - if (ibt_attach(&ibd_clnt_modinfo, dip, state, - &state->id_ibt_hdl) != IBT_SUCCESS) { - DPRINT(10, "ibd_attach : failed in ibt_attach()"); - goto attach_fail_ibt_attach; - } - - /* Finish initializing this driver */ - if (ibd_drv_init(state) != DDI_SUCCESS) { - DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n"); - goto attach_fail_drv_init; - } + DPRINT(10, "ibd_attach: failed in ibd_state_init()"); + goto attach_fail; + } + state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; /* - * Initialize pointers to device specific functions which will be - * used by the generic layer. + * Allocate rx,tx softintr */ - if ((macp = mac_alloc(MAC_VERSION)) == NULL) { - DPRINT(10, "ibd_attach : failed in mac_alloc()"); - goto attach_fail_drv_init; - } - - macp->m_type_ident = MAC_PLUGIN_IDENT_IB; - macp->m_driver = state; - macp->m_dip = state->id_dip; - macp->m_src_addr = (uint8_t *)&state->id_macaddr; - macp->m_callbacks = &ib_m_callbacks; - macp->m_min_sdu = 0; - macp->m_max_sdu = state->id_mtu - IPOIB_HDRSIZE; - - /* - * Register ourselves with the GLDv3 interface - */ - err = mac_register(macp, &state->id_mh); - mac_free(macp); - if (err != 0) { - DPRINT(10, "ibd_attach : failed in mac_register()"); - goto attach_fail_mac_register; + if (ibd_rx_softintr == 1) { + if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, + NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { + DPRINT(10, "ibd_attach: failed in " + "ddi_add_softintr(id_rx), ret=%d", rv); + goto attach_fail; + } + state->id_mac_state |= IBD_DRV_RXINTR_ADDED; + } + if (ibd_tx_softintr == 1) { + if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, + NULL, NULL, ibd_tx_recycle, + (caddr_t)state)) != DDI_SUCCESS) { + DPRINT(10, "ibd_attach: failed in " + "ddi_add_softintr(id_tx), ret=%d", rv); + goto attach_fail; + } + state->id_mac_state |= IBD_DRV_TXINTR_ADDED; } /* - * Setup the handler we will use for regular DLPI stuff. Its important - * to setup the recv handler after registering with gldv3. + * Obtain IBA P_Key, port number and HCA guid and validate + * them (for P_Key, only full members are allowed as per + * IPoIB specification; neither port number nor HCA guid + * can be zero) */ - ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); - if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != - IBT_SUCCESS) { - DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n"); - goto attach_fail_setup_handler; + if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, + "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { + DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", + state->id_pkey); + goto attach_fail; + } + if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, + "port-number", 0)) == 0) { + DPRINT(10, "ibd_attach: invalid port number (%d)", + state->id_port); + goto attach_fail; + } + if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, + "hca-guid", 0)) == 0) { + DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", + hca_guid); + goto attach_fail; } /* - * Setup the subnet notices handler after we initialize the a/mcaches - * and start the async thread, both of which are required for the - * trap handler to function properly. Enable the trap handler to - * queue requests to the async thread after the mac_register, because - * the async daemon invokes mac_tx_update(), which must be done after - * mac_register(). + * Attach to IBTL + */ + if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, + &state->id_ibt_hdl)) != IBT_SUCCESS) { + DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); + goto attach_fail; + } + state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; + + /* + * Open the HCA */ - ibt_register_subnet_notices(state->id_ibt_hdl, - ibd_snet_notices_handler, state); - mutex_enter(&state->id_trap_lock); - state->id_trap_stop = B_FALSE; - mutex_exit(&state->id_trap_lock); + if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, + &state->id_hca_hdl)) != IBT_SUCCESS) { + DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); + goto attach_fail; + } + state->id_mac_state |= IBD_DRV_HCA_OPENED; + + /* + * Record capabilities + */ + (void) ibd_record_capab(state, dip); /* - * Indicate link status to GLDv3 and higher layers. By default, - * we assume we are in up state (which must have been true at - * least at the time the broadcast mcg's were probed); if there - * were any up/down transitions till the time we come here, the - * async handler will have updated last known state, which we - * use to tell GLDv3. The async handler will not send any - * notifications to GLDv3 till we reach here in the initialization - * sequence. + * Allocate a protection domain on the HCA + */ + if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, + &state->id_pd_hdl)) != IBT_SUCCESS) { + DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); + goto attach_fail; + } + state->id_mac_state |= IBD_DRV_PD_ALLOCD; + + + /* + * Register ibd interfaces with the Nemo framework */ - mac_link_update(state->id_mh, state->id_link_state); - + if (ibd_register_mac(state, dip) != IBT_SUCCESS) { + DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); + goto attach_fail; + } + state->id_mac_state |= IBD_DRV_MAC_REGISTERED; + + /* + * We're done with everything we could to make the attach + * succeed. All the buffer allocations and IPoIB broadcast + * group joins are deferred to when the interface instance + * is actually plumbed to avoid wasting memory. + */ return (DDI_SUCCESS); - /* Attach failure points, cleanup */ -attach_fail_setup_handler: - (void) mac_unregister(state->id_mh); - -attach_fail_mac_register: - ibd_drv_fini(state); - -attach_fail_drv_init: - if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) - ibd_print_warn(state, "failed to free IB resources"); - -attach_fail_ibt_attach: - if (ibd_tx_softintr == 1) - ddi_remove_softintr(state->id_tx); - -attach_fail_ddi_add_tx_softintr: - if (ibd_rx_softintr == 1) - ddi_remove_softintr(state->id_rx); - -attach_fail_ddi_add_rx_softintr: - ibd_state_fini(state); - -attach_fail_state_init: - ddi_soft_state_free(ibd_list, instance); - +attach_fail: + ibd_unattach(state, dip); return (DDI_FAILURE); } @@ -2285,69 +2495,28 @@ ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { ibd_state_t *state; - int status; int instance; - switch (cmd) { - case DDI_DETACH: - break; - case DDI_SUSPEND: - default: - return (DDI_FAILURE); - } - + /* + * IBD doesn't support suspend/resume + */ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + /* + * Get the instance softstate + */ instance = ddi_get_instance(dip); state = ddi_get_soft_state(ibd_list, instance); /* - * First, stop receive interrupts; this stops the - * driver from handing up buffers to higher layers. - * Wait for receive buffers to be returned; give up - * after 5 seconds. + * Release all resources we're holding still. Note that if we'd + * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly + * so far, we should find all the flags we need in id_mac_state. */ - ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); - status = 50; - while (state->id_rx_list.dl_bufs_outstanding > 0) { - delay(drv_usectohz(100000)); - if (--status == 0) { - DPRINT(2, "ibd_detach : reclaiming failed"); - goto failed; - } - } - - if (mac_unregister(state->id_mh) != DDI_SUCCESS) { - DPRINT(10, "ibd_detach : failed in mac_unregister()"); - goto failed; - } - - if (ibd_rx_softintr == 1) - ddi_remove_softintr(state->id_rx); - - if (ibd_tx_softintr == 1) - ddi_remove_softintr(state->id_tx); - - ibd_drv_fini(state); - - if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) - ibd_print_warn(state, "failed to free all IB resources at " - "driver detach time"); - - ibd_state_fini(state); - ddi_soft_state_free(ibd_list, instance); + (void) ibd_unattach(state, dip); + return (DDI_SUCCESS); - -failed: - /* - * Reap all the Tx/Rx completions that were posted since we - * turned off the notification. Turn on notifications. There - * is a race in that we do not reap completions that come in - * after the poll and before notifications get turned on. That - * is okay, the next rx/tx packet will trigger a completion - * that will reap any missed completions. - */ - ibd_poll_compq(state, state->id_rcq_hdl); - ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); - return (DDI_FAILURE); } /* @@ -2424,48 +2593,6 @@ } /* - * Fetch IBA parameters for the network device from IB nexus. - */ -static int -ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid) -{ - /* - * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec. - * Note that the default partition is also allowed. - */ - state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, - 0, "port-pkey", IB_PKEY_INVALID_LIMITED); - if (state->id_pkey <= IB_PKEY_INVALID_FULL) { - DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong" - "partition\n"); - return (DDI_FAILURE); - } - - /* - * ... the IBA port ... - */ - state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, - 0, "port-number", 0); - if (state->id_port == 0) { - DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n"); - return (DDI_FAILURE); - } - - /* - * ... and HCA GUID. - */ - *hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, - 0, "hca-guid", 0); - if (*hca_guid == 0) { - DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong " - "guid\n"); - return (DDI_FAILURE); - } - - return (DDI_SUCCESS); -} - -/* * Fetch link speed from SA for snmp ifspeed reporting. */ static uint64_t @@ -2951,427 +3078,6 @@ return (IBT_SUCCESS); } -/* - * Post ibt_attach() initialization. - */ -static int -ibd_drv_init(ibd_state_t *state) -{ - kthread_t *kht; - ibt_ud_chan_alloc_args_t ud_alloc_attr; - ibt_ud_chan_query_attr_t ud_chan_attr; - ibt_hca_portinfo_t *port_infop; - ibt_hca_attr_t hca_attrs; - ibt_status_t ibt_status; - ibt_cq_attr_t cq_attr; - ib_guid_t hca_guid; - uint32_t real_size; - uint32_t *ptr; - char pathname[OBP_MAXPATHLEN]; - uint_t psize, port_infosz; - - /* - * Initialize id_port before ibt_open_hca because of - * ordering requirements in port up/down handling. - */ - if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS) - return (DDI_FAILURE); - - if (ibt_open_hca(state->id_ibt_hdl, hca_guid, - &state->id_hca_hdl) != IBT_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n"); - return (DDI_FAILURE); - } - - mutex_enter(&state->id_link_mutex); - ibt_status = ibt_query_hca_ports(state->id_hca_hdl, - state->id_port, &port_infop, &psize, - &port_infosz); - if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { - mutex_exit(&state->id_link_mutex); - DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n"); - (void) ibt_close_hca(state->id_hca_hdl); - return (DDI_FAILURE); - } - - /* - * If the link already went down by the time we get here, give up; - * we can not even get the gid since that is not valid. We would - * fail in ibd_find_bgroup() anyway. - */ - if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { - mutex_exit(&state->id_link_mutex); - ibt_free_portinfo(port_infop, port_infosz); - (void) ibt_close_hca(state->id_hca_hdl); - ibd_print_warn(state, "Port is not active"); - return (DDI_FAILURE); - } - - /* - * This verifies the Pkey ibnexus handed us is still valid. - * This is also the point from which the pkey table for the - * port must hold the exact pkey value at the exact index - * across port up/downs. - */ - if (ibt_pkey2index(state->id_hca_hdl, state->id_port, - state->id_pkey, &state->id_pkix) != IBT_SUCCESS) { - mutex_exit(&state->id_link_mutex); - ibt_free_portinfo(port_infop, port_infosz); - DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n"); - (void) ibt_close_hca(state->id_hca_hdl); - return (DDI_FAILURE); - } - - state->id_mtu = (128 << port_infop->p_mtu); - state->id_sgid = *port_infop->p_sgid_tbl; - state->id_link_state = LINK_STATE_UP; - mutex_exit(&state->id_link_mutex); - - ibt_free_portinfo(port_infop, port_infosz); - - state->id_link_speed = ibd_get_portspeed(state); - - /* - * Read drv conf and record what the policy is on enabling LSO - */ - if (ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, - DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { - state->id_lso_policy = B_TRUE; - } else { - state->id_lso_policy = B_FALSE; - } - - ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); - ASSERT(ibt_status == IBT_SUCCESS); - - if (ibd_find_bgroup(state) != IBT_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n"); - goto drv_init_fail_find_bgroup; - } - - if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, - &state->id_pd_hdl) != IBT_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n"); - goto drv_init_fail_alloc_pd; - } - - /* Initialize the parallel ARP cache and AHs */ - if (ibd_acache_init(state) != DDI_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n"); - goto drv_init_fail_acache; - } - - if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { - state->id_hca_res_lkey_capab = 1; - state->id_res_lkey = hca_attrs.hca_reserved_lkey; - } - - /* - * Check various tunable limits. - */ - - /* - * See if extended sgl size information is provided by the hca; if yes, - * use the correct one and set the maximum sqseg value. - */ - if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) - state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; - else - state->id_max_sqseg = hca_attrs.hca_max_sgl; - - /* - * Set LSO capability and maximum length - */ - if (hca_attrs.hca_max_lso_size > 0) { - state->id_lso_capable = B_TRUE; - if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) - state->id_lso_maxlen = IBD_LSO_MAXLEN; - else - state->id_lso_maxlen = hca_attrs.hca_max_lso_size; - } else { - state->id_lso_capable = B_FALSE; - state->id_lso_maxlen = 0; - } - - - /* - * Check #r/s wqes against max channel size. - */ - if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) - state->id_num_rwqe = hca_attrs.hca_max_chan_sz; - else - state->id_num_rwqe = IBD_NUM_RWQE; - - if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) - state->id_num_swqe = hca_attrs.hca_max_chan_sz; - else - state->id_num_swqe = IBD_NUM_SWQE; - - /* - * Check the hardware checksum capability. Currently we only consider - * full checksum offload. - */ - if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { - state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; - } - - /* - * Allocate Rx/combined CQ: - * Theoretically, there is no point in having more than #rwqe - * plus #swqe cqe's, except that the CQ will be signalled for - * overflow when the last wqe completes, if none of the previous - * cqe's have been polled. Thus, we allocate just a few less wqe's - * to make sure such overflow does not occur. - */ - cq_attr.cq_sched = NULL; - cq_attr.cq_flags = IBT_CQ_NO_FLAGS; - - if (ibd_separate_cqs == 1) { - /* - * Allocate Receive CQ. - */ - if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { - cq_attr.cq_size = state->id_num_rwqe + 1; - } else { - cq_attr.cq_size = hca_attrs.hca_max_cq_sz; - state->id_num_rwqe = cq_attr.cq_size - 1; - } - - if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, - &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); - goto drv_init_fail_alloc_rcq; - } - - if (ibt_modify_cq(state->id_rcq_hdl, - ibd_rxcomp_count, ibd_rxcomp_usec, 0) != IBT_SUCCESS) { - DPRINT(10, "ibd_drv_init: Receive CQ interrupt " - "moderation failed\n"); - } - - state->id_rxwcs_size = state->id_num_rwqe + 1; - state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * - state->id_rxwcs_size, KM_SLEEP); - - /* - * Allocate Send CQ. - */ - if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { - cq_attr.cq_size = state->id_num_swqe + 1; - } else { - cq_attr.cq_size = hca_attrs.hca_max_cq_sz; - state->id_num_swqe = cq_attr.cq_size - 1; - } - - if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, - &state->id_scq_hdl, &real_size) != IBT_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); - goto drv_init_fail_alloc_scq; - } - if (ibt_modify_cq(state->id_scq_hdl, - 10, 300, 0) != IBT_SUCCESS) { - DPRINT(10, "ibd_drv_init: Send CQ interrupt " - "moderation failed\n"); - } - - state->id_txwcs_size = state->id_num_swqe + 1; - state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * - state->id_txwcs_size, KM_SLEEP); - } else { - /* - * Allocate combined Send/Receive CQ. - */ - if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + - state->id_num_swqe + 1)) { - cq_attr.cq_size = state->id_num_rwqe + - state->id_num_swqe + 1; - } else { - cq_attr.cq_size = hca_attrs.hca_max_cq_sz; - state->id_num_rwqe = ((cq_attr.cq_size - 1) * - state->id_num_rwqe) / (state->id_num_rwqe + - state->id_num_swqe); - state->id_num_swqe = cq_attr.cq_size - 1 - - state->id_num_rwqe; - } - - state->id_rxwcs_size = cq_attr.cq_size; - state->id_txwcs_size = state->id_rxwcs_size; - - if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, - &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); - goto drv_init_fail_alloc_rcq; - } - state->id_scq_hdl = state->id_rcq_hdl; - state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * - state->id_rxwcs_size, KM_SLEEP); - state->id_txwcs = state->id_rxwcs; - } - - /* - * Print message in case we could not allocate as many wqe's - * as was requested. Note that in the combined CQ case, we will - * get the following message. - */ - if (state->id_num_rwqe != IBD_NUM_RWQE) - ibd_print_warn(state, "Setting #rwqe = %d instead of default " - "%d", state->id_num_rwqe, IBD_NUM_RWQE); - if (state->id_num_swqe != IBD_NUM_SWQE) - ibd_print_warn(state, "Setting #swqe = %d instead of default " - "%d", state->id_num_swqe, IBD_NUM_SWQE); - - ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; - if (state->id_hca_res_lkey_capab) - ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; - if (state->id_lso_policy && state->id_lso_capable) - ud_alloc_attr.ud_flags |= IBT_USES_LSO; - - ud_alloc_attr.ud_hca_port_num = state->id_port; - ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; - ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; - ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; - ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; - ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; - ud_alloc_attr.ud_scq = state->id_scq_hdl; - ud_alloc_attr.ud_rcq = state->id_rcq_hdl; - ud_alloc_attr.ud_pd = state->id_pd_hdl; - ud_alloc_attr.ud_pkey_ix = state->id_pkix; - ud_alloc_attr.ud_clone_chan = NULL; - - if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, - &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()" - "\n"); - goto drv_init_fail_alloc_chan; - } - - if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) != - DDI_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()"); - goto drv_init_fail_query_chan; - } - - state->id_qpnum = ud_chan_attr.ud_qpn; - /* state->id_max_sqseg = ud_chan_attr.ud_chan_sizes.cs_sq_sgl; */ - - if (state->id_max_sqseg > IBD_MAX_SQSEG) { - state->id_max_sqseg = IBD_MAX_SQSEG; - } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { - ibd_print_warn(state, "Set #sgl = %d instead of default %d", - state->id_max_sqseg, IBD_MAX_SQSEG); - } - - /* Initialize the Transmit buffer list */ - if (ibd_init_txlist(state) != DDI_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n"); - goto drv_init_fail_txlist_init; - } - - if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { - /* - * Setup the handler we will use for regular DLPI stuff - */ - ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); - if (ibt_enable_cq_notify(state->id_scq_hdl, - IBT_NEXT_COMPLETION) != IBT_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in" - " ibt_enable_cq_notify()\n"); - goto drv_init_fail_cq_notify; - } - } - - /* Initialize the Receive buffer list */ - if (ibd_init_rxlist(state) != DDI_SUCCESS) { - DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n"); - goto drv_init_fail_rxlist_init; - } - - /* Join to IPoIB broadcast group as required by IPoIB */ - if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { - DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n"); - goto drv_init_fail_join_group; - } - - /* - * Create the async thread; thread_create never fails. - */ - kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, - TS_RUN, minclsyspri); - - state->id_async_thrid = kht->t_did; - - /* - * The local mac address is now known. Create the IPoIB - * address. - */ - ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, - state->id_sgid.gid_prefix, state->id_sgid.gid_guid); - /* - * Similarly, program in the broadcast mac address. - */ - ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, - state->id_mgid.gid_guid); - - ptr = (uint32_t *)&state->id_macaddr; - DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n", - *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); - ptr = (uint32_t *)&state->id_bcaddr; - DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n", - *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); - DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n", - state->id_pkey, state->id_mgid.gid_prefix, - state->id_mgid.gid_guid); - DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n", - state->id_sgid.gid_prefix, state->id_sgid.gid_guid); - DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey); - DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu); - (void) ddi_pathname(state->id_dip, pathname); - DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname); - - return (DDI_SUCCESS); - -drv_init_fail_join_group: - ibd_fini_rxlist(state); - -drv_init_fail_rxlist_init: -drv_init_fail_cq_notify: - ibd_fini_txlist(state); - -drv_init_fail_txlist_init: -drv_init_fail_query_chan: - if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS) - DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()"); - -drv_init_fail_alloc_chan: - if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) != - IBT_SUCCESS)) - DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()"); - - if (ibd_separate_cqs == 1) - kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * - state->id_txwcs_size); - -drv_init_fail_alloc_scq: - if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS) - DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()"); - kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); - -drv_init_fail_alloc_rcq: - ibd_acache_fini(state); -drv_init_fail_acache: - if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) - DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()"); - -drv_init_fail_alloc_pd: - ibt_free_mcg_info(state->id_mcinfo, 1); -drv_init_fail_find_bgroup: - if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) - DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()"); - - return (DDI_FAILURE); -} - - static int ibd_alloc_tx_copybufs(ibd_state_t *state) { @@ -3722,8 +3428,8 @@ while (state->id_tx_list.dl_head != NULL) { node = WQE_TO_SWQE(state->id_tx_list.dl_head); state->id_tx_list.dl_head = node->swqe_next; + ASSERT(state->id_tx_list.dl_cnt > 0); state->id_tx_list.dl_cnt--; - ASSERT(state->id_tx_list.dl_cnt >= 0); ibd_free_swqe(state, node); } mutex_exit(&state->id_tx_list.dl_mutex); @@ -3782,7 +3488,7 @@ * recycled, or this is a new one. */ static int -ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) +ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) { ibt_status_t ibt_status; @@ -3827,7 +3533,7 @@ if (ibt_status != IBT_SUCCESS) { (void) atomic_add_32_nv( &state->id_rx_list.dl_cnt, -1); - ibd_print_warn(state, "ibd_post_rwqe: " + ibd_print_warn(state, "ibd_post_recv: " "posting failed, ret=%d", ibt_status); return (DDI_FAILURE); } @@ -3861,7 +3567,7 @@ return (DDI_FAILURE); } - if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) { + if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) { ibd_free_rwqe(state, rwqe); ibd_fini_rxlist(state); return (DDI_FAILURE); @@ -3884,8 +3590,8 @@ while (state->id_rx_list.dl_head != NULL) { node = WQE_TO_RWQE(state->id_rx_list.dl_head); state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; + ASSERT(state->id_rx_list.dl_cnt > 0); state->id_rx_list.dl_cnt--; - ASSERT(state->id_rx_list.dl_cnt >= 0); ibd_free_rwqe(state, node); } @@ -4009,150 +3715,6 @@ } /* - * Pre ibt_detach() deconstruction. - */ -static void -ibd_drv_fini(ibd_state_t *state) -{ - ib_gid_t mgid; - ibd_mce_t *mce; - ibt_status_t status; - uint8_t jstate; - - /* - * Desubscribe from trap notices; we will be tearing down - * the mcg lists soon. Make sure the trap handler does nothing - * even if it is invoked (ie till we invoke ibt_detach()). - */ - ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); - mutex_enter(&state->id_trap_lock); - state->id_trap_stop = B_TRUE; - while (state->id_trap_inprog > 0) - cv_wait(&state->id_trap_cv, &state->id_trap_lock); - mutex_exit(&state->id_trap_lock); - - /* - * Flushing the channel ensures that all pending WQE's - * are marked with flush_error and handed to the CQ. It - * does not guarantee the invocation of the CQ handler. - * This call is guaranteed to return successfully for UD QPNs. - */ - status = ibt_flush_channel(state->id_chnl_hdl); - ASSERT(status == IBT_SUCCESS); - - /* - * We possibly need a loop here to wait for all the Tx - * callbacks to happen. The Tx handlers will retrieve - * held resources like AH ac_ref count, registered memory - * and possibly IBD_ASYNC_REAP requests. Rx interrupts were already - * turned off (in ibd_detach()); turn off Tx interrupts and - * poll. By the time the polling returns an empty indicator, - * we are sure we have seen all pending Tx callbacks. Note - * that after the ibt_set_cq_handler() returns, the old handler - * is guaranteed not to be invoked anymore. - */ - if (ibd_separate_cqs == 1) - ibt_set_cq_handler(state->id_scq_hdl, 0, 0); - ibd_poll_compq(state, state->id_scq_hdl); - - /* - * No more async requests will be posted since the device has been - * unregistered; completion handlers have been turned off, so Tx - * handler will not cause any more IBD_ASYNC_REAP requests. Queue a - * request for the async thread to exit, which will be serviced - * after any pending ones. This can take a while, specially if the - * SM is unreachable, since IBMF will slowly timeout each SM request - * issued by the async thread. Reap the thread before continuing on, - * we do not want it to be lingering in modunloaded code. - */ - ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); - thread_join(state->id_async_thrid); - - /* - * We can not be in promiscuous mode anymore, upper layers - * would have made a request to disable it (if ever set previously) - * before the detach is allowed to progress to this point; and the - * aysnc thread would have processed that request by now. Thus the - * nonmember list is guaranteed empty at this point. - */ - ASSERT(state->id_prom_op != IBD_OP_COMPLETED); - - /* - * Drop all residual full/non membership. This includes full - * membership to the broadcast group, and any nonmembership - * acquired during transmits. We do this after the Tx completion - * handlers are done, since those might result in some late - * leaves; this also eliminates a potential race with that - * path wrt the mc full list insert/delete. Trap handling - * has also been suppressed at this point. Thus, no locks - * are required while traversing the mc full list. - */ - DPRINT(2, "ibd_drv_fini : clear full cache entries"); - mce = list_head(&state->id_mc_full); - while (mce != NULL) { - mgid = mce->mc_info.mc_adds_vect.av_dgid; - jstate = mce->mc_jstate; - mce = list_next(&state->id_mc_full, mce); - ibd_leave_group(state, mgid, jstate); - } - - ibt_free_mcg_info(state->id_mcinfo, 1); - - /* - * Kill the channel now; guaranteed to return successfully - * for UD QPNs. - */ - status = ibt_free_channel(state->id_chnl_hdl); - ASSERT(status == IBT_SUCCESS); - - /* - * Kill the CQ; all completion handlers are guaranteed to - * have terminated by the time this returns. Since we killed - * the QPN above, we can not receive the IBT_CQ_BUSY error. - */ - status = ibt_free_cq(state->id_rcq_hdl); - ASSERT(status == IBT_SUCCESS); - kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); - - if (ibd_separate_cqs == 1) { - status = ibt_free_cq(state->id_scq_hdl); - ASSERT(status == IBT_SUCCESS); - kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * - state->id_txwcs_size); - } - - /* - * Since these following will act on the Rx/Tx list, which - * is also looked at by the Rx/Tx handlers, keep them around - * till all handlers are guaranteed to have completed. - */ - ibd_fini_rxlist(state); - ibd_fini_txlist(state); - - /* - * Clean up the active AH hash list. - */ - mod_hash_destroy_hash(state->id_ah_active_hash); - - /* - * Free parallel ARP cache and AHs; we are sure all of these - * resources have been released by the Tx completion handler. - */ - ibd_acache_fini(state); - - /* - * We freed the QPN, all the MRs and AHs. This step should not - * fail; print a warning message if it does fail, due to a bug - * in the driver. - */ - if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) - ibd_print_warn(state, "failed to free protection domain"); - - if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) - ibd_print_warn(state, "failed to close HCA device"); -} - -/* * IBA Rx/Tx completion queue handler. Guaranteed to be single * threaded and nonreentrant for this CQ. When using combined CQ, * this handles Tx and Rx completions. With separate CQs, this handles @@ -4249,7 +3811,7 @@ * [de]initialized; back off then, without doing * anything more, since we are not sure if the * async thread is around, or whether we might - * be racing with the detach code in ibd_drv_fini() + * be racing with the detach code in ibd_m_stop() * that scans the mcg list. */ if (!ibd_async_safe(state)) @@ -4361,6 +3923,422 @@ return (B_TRUE); } +static int +ibd_get_port_details(ibd_state_t *state) +{ + ibt_hca_portinfo_t *port_infop; + ibt_status_t ret; + uint_t psize, port_infosz; + + mutex_enter(&state->id_link_mutex); + + /* + * Query for port information + */ + ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, + &port_infop, &psize, &port_infosz); + if ((ret != IBT_SUCCESS) || (psize != 1)) { + mutex_exit(&state->id_link_mutex); + DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " + "failed, ret=%d", ret); + return (DDI_FAILURE); + } + + /* + * If the link already went down by the time we get here, + * give up + */ + if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { + mutex_exit(&state->id_link_mutex); + ibt_free_portinfo(port_infop, port_infosz); + DPRINT(10, "ibd_get_port_details: port is not active"); + return (DDI_FAILURE); + } + + /* + * If the link is active, verify the pkey + */ + if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, + state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { + mutex_exit(&state->id_link_mutex); + ibt_free_portinfo(port_infop, port_infosz); + DPRINT(10, "ibd_get_port_details: ibt_pkey2index " + "failed, ret=%d", ret); + return (DDI_FAILURE); + } + + state->id_mtu = (128 << port_infop->p_mtu); + state->id_sgid = *port_infop->p_sgid_tbl; + state->id_link_state = LINK_STATE_UP; + + mutex_exit(&state->id_link_mutex); + ibt_free_portinfo(port_infop, port_infosz); + + /* + * Now that the port is active, record the port speed + */ + state->id_link_speed = ibd_get_portspeed(state); + + return (DDI_SUCCESS); +} + +static int +ibd_alloc_cqs(ibd_state_t *state) +{ + ibt_hca_attr_t hca_attrs; + ibt_cq_attr_t cq_attr; + ibt_status_t ret; + uint32_t real_size; + + ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); + ASSERT(ret == IBT_SUCCESS); + + /* + * Allocate Rx/combined CQ: + * Theoretically, there is no point in having more than #rwqe + * plus #swqe cqe's, except that the CQ will be signalled for + * overflow when the last wqe completes, if none of the previous + * cqe's have been polled. Thus, we allocate just a few less wqe's + * to make sure such overflow does not occur. + */ + cq_attr.cq_sched = NULL; + cq_attr.cq_flags = IBT_CQ_NO_FLAGS; + + if (ibd_separate_cqs == 1) { + /* + * Allocate Receive CQ. + */ + if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { + cq_attr.cq_size = state->id_num_rwqe + 1; + } else { + cq_attr.cq_size = hca_attrs.hca_max_cq_sz; + state->id_num_rwqe = cq_attr.cq_size - 1; + } + + if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, + &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { + DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " + "failed, ret=%d\n", ret); + return (DDI_FAILURE); + } + + if ((ret = ibt_modify_cq(state->id_rcq_hdl, + ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { + DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " + "moderation failed, ret=%d\n", ret); + } + + state->id_rxwcs_size = state->id_num_rwqe + 1; + state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * + state->id_rxwcs_size, KM_SLEEP); + + /* + * Allocate Send CQ. + */ + if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { + cq_attr.cq_size = state->id_num_swqe + 1; + } else { + cq_attr.cq_size = hca_attrs.hca_max_cq_sz; + state->id_num_swqe = cq_attr.cq_size - 1; + } + + if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, + &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { + DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " + "failed, ret=%d\n", ret); + kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * + state->id_rxwcs_size); + (void) ibt_free_cq(state->id_rcq_hdl); + return (DDI_FAILURE); + } + if ((ret = ibt_modify_cq(state->id_scq_hdl, + IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) { + DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " + "moderation failed, ret=%d\n", ret); + } + + state->id_txwcs_size = state->id_num_swqe + 1; + state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * + state->id_txwcs_size, KM_SLEEP); + } else { + /* + * Allocate combined Send/Receive CQ. + */ + if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + + state->id_num_swqe + 1)) { + cq_attr.cq_size = state->id_num_rwqe + + state->id_num_swqe + 1; + } else { + cq_attr.cq_size = hca_attrs.hca_max_cq_sz; + state->id_num_rwqe = ((cq_attr.cq_size - 1) * + state->id_num_rwqe) / (state->id_num_rwqe + + state->id_num_swqe); + state->id_num_swqe = cq_attr.cq_size - 1 - + state->id_num_rwqe; + } + + state->id_rxwcs_size = cq_attr.cq_size; + state->id_txwcs_size = state->id_rxwcs_size; + + if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, + &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { + DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) " + "failed, ret=%d\n", ret); + return (DDI_FAILURE); + } + state->id_scq_hdl = state->id_rcq_hdl; + state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * + state->id_rxwcs_size, KM_SLEEP); + state->id_txwcs = state->id_rxwcs; + } + + /* + * Print message in case we could not allocate as many wqe's + * as was requested. + */ + if (state->id_num_rwqe != IBD_NUM_RWQE) { + ibd_print_warn(state, "Setting #rwqe = %d instead of default " + "%d", state->id_num_rwqe, IBD_NUM_RWQE); + } + if (state->id_num_swqe != IBD_NUM_SWQE) { + ibd_print_warn(state, "Setting #swqe = %d instead of default " + "%d", state->id_num_swqe, IBD_NUM_SWQE); + } + + return (DDI_SUCCESS); +} + +static int +ibd_setup_ud_channel(ibd_state_t *state) +{ + ibt_ud_chan_alloc_args_t ud_alloc_attr; + ibt_ud_chan_query_attr_t ud_chan_attr; + ibt_status_t ret; + + ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; + if (state->id_hca_res_lkey_capab) + ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; + if (state->id_lso_policy && state->id_lso_capable) + ud_alloc_attr.ud_flags |= IBT_USES_LSO; + + ud_alloc_attr.ud_hca_port_num = state->id_port; + ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; + ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; + ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; + ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; + ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; + ud_alloc_attr.ud_scq = state->id_scq_hdl; + ud_alloc_attr.ud_rcq = state->id_rcq_hdl; + ud_alloc_attr.ud_pd = state->id_pd_hdl; + ud_alloc_attr.ud_pkey_ix = state->id_pkix; + ud_alloc_attr.ud_clone_chan = NULL; + + if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, + &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { + DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " + "failed, ret=%d\n", ret); + return (DDI_FAILURE); + } + + if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, + &ud_chan_attr)) != IBT_SUCCESS) { + DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " + "failed, ret=%d\n", ret); + (void) ibt_free_channel(state->id_chnl_hdl); + return (DDI_FAILURE); + } + + state->id_qpnum = ud_chan_attr.ud_qpn; + + return (DDI_SUCCESS); +} + +static int +ibd_undo_m_start(ibd_state_t *state) +{ + uint32_t progress = state->id_mac_state; + uint_t attempts; + ibt_status_t ret; + ib_gid_t mgid; + ibd_mce_t *mce; + uint8_t jstate; + + /* + * Before we try to stop/undo whatever we did in ibd_m_start(), + * we need to mark the link state as unknown to prevent nw + * layer from using this instance for any new transfers. + */ + if (progress & IBD_DRV_PORT_DETAILS_OBTAINED) { + state->id_link_state = LINK_STATE_UNKNOWN; + mac_link_update(state->id_mh, state->id_link_state); + + state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); + } + + if (progress & IBD_DRV_STARTED) { + state->id_mac_state &= (~IBD_DRV_STARTED); + } + + /* + * First, stop receive interrupts; this stops the driver from + * handing up buffers to higher layers. Wait for receive buffers + * to be returned and give up after 5 seconds. + */ + if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { + ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); + attempts = 50; + while (state->id_rx_list.dl_bufs_outstanding > 0) { + delay(drv_usectohz(100000)); + if (--attempts == 0) { + /* + * There are pending bufs with the network + * layer and we have no choice but to wait + * for them to be done with. Reap all the + * Tx/Rx completions that were posted since + * we turned off the notification and + * return failure. + */ + DPRINT(2, "ibd_undo_m_start: " + "reclaiming failed"); + ibd_poll_compq(state, state->id_rcq_hdl); + ibt_set_cq_handler(state->id_rcq_hdl, + ibd_rcq_handler, state); + return (DDI_FAILURE); + } + } + state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); + } + + if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { + ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); + + mutex_enter(&state->id_trap_lock); + state->id_trap_stop = B_TRUE; + while (state->id_trap_inprog > 0) + cv_wait(&state->id_trap_cv, &state->id_trap_lock); + mutex_exit(&state->id_trap_lock); + + state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); + } + + if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { + /* + * Flushing the channel ensures that all pending WQE's + * are marked with flush_error and handed to the CQ. It + * does not guarantee the invocation of the CQ handler. + * This call is guaranteed to return successfully for + * UD QPNs. + */ + ret = ibt_flush_channel(state->id_chnl_hdl); + ASSERT(ret == IBT_SUCCESS); + + /* + * Turn off Tx interrupts and poll. By the time the polling + * returns an empty indicator, we are sure we have seen all + * pending Tx callbacks. Note that after the call to + * ibt_set_cq_handler() returns, the old handler is + * guaranteed not to be invoked anymore. + */ + if (ibd_separate_cqs == 1) + ibt_set_cq_handler(state->id_scq_hdl, 0, 0); + ibd_poll_compq(state, state->id_scq_hdl); + + state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); + } + + if (progress & IBD_DRV_ASYNC_THR_CREATED) { + /* + * No new async requests will be posted since the device + * link state has been marked as unknown; completion handlers + * have been turned off, so Tx handler will not cause any + * more IBD_ASYNC_REAP requests. + * + * Queue a request for the async thread to exit, which will + * be serviced after any pending ones. This can take a while, + * specially if the SM is unreachable, since IBMF will slowly + * timeout each SM request issued by the async thread. Reap + * the thread before continuing on, we do not want it to be + * lingering in modunloaded code (or we could move the reap + * to ibd_detach(), provided we keep track of the current + * id_async_thrid somewhere safe). + */ + ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); + thread_join(state->id_async_thrid); + + state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); + } + + if (progress & IBD_DRV_BCAST_GROUP_JOINED) { + /* + * Drop all residual full/non membership. This includes full + * membership to the broadcast group, and any nonmembership + * acquired during transmits. We do this after the Tx completion + * handlers are done, since those might result in some late + * leaves; this also eliminates a potential race with that + * path wrt the mc full list insert/delete. Trap handling + * has also been suppressed at this point. Thus, no locks + * are required while traversing the mc full list. + */ + DPRINT(2, "ibd_undo_m_start: clear full cache entries"); + mce = list_head(&state->id_mc_full); + while (mce != NULL) { + mgid = mce->mc_info.mc_adds_vect.av_dgid; + jstate = mce->mc_jstate; + mce = list_next(&state->id_mc_full, mce); + ibd_leave_group(state, mgid, jstate); + } + state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); + } + + if (progress & IBD_DRV_RXLIST_ALLOCD) { + ibd_fini_rxlist(state); + state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); + } + + if (progress & IBD_DRV_TXLIST_ALLOCD) { + ibd_fini_txlist(state); + state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); + } + + if (progress & IBD_DRV_UD_CHANNEL_SETUP) { + (void) ibt_free_channel(state->id_chnl_hdl); + state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); + } + + if (progress & IBD_DRV_CQS_ALLOCD) { + if (ibd_separate_cqs == 1) { + kmem_free(state->id_txwcs, + sizeof (ibt_wc_t) * state->id_txwcs_size); + (void) ibt_free_cq(state->id_scq_hdl); + } + + kmem_free(state->id_rxwcs, + sizeof (ibt_wc_t) * state->id_rxwcs_size); + (void) ibt_free_cq(state->id_rcq_hdl); + + state->id_txwcs = NULL; + state->id_rxwcs = NULL; + state->id_scq_hdl = NULL; + state->id_rcq_hdl = NULL; + + state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); + } + + if (progress & IBD_DRV_ACACHE_INITIALIZED) { + mod_hash_destroy_hash(state->id_ah_active_hash); + ibd_acache_fini(state); + + state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); + } + + if (progress & IBD_DRV_BCAST_GROUP_FOUND) { + ibt_free_mcg_info(state->id_mcinfo, 1); + state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); + } + + return (DDI_SUCCESS); +} + /* * GLDv3 entry point to start hardware. */ @@ -4368,7 +4346,185 @@ static int ibd_m_start(void *arg) { - return (0); + ibd_state_t *state = arg; + kthread_t *kht; + int err; + + if (state->id_mac_state & IBD_DRV_STARTED) + return (DDI_SUCCESS); + + /* + * Get port details; if we fail here, very likely the port + * state is inactive or the pkey can't be verified + */ + if (ibd_get_port_details(state) != DDI_SUCCESS) { + DPRINT(10, "ibd_m_start: ibd_get_port_details() failed"); + return (EAGAIN); + } + state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; + + /* + * Find the IPoIB broadcast group + */ + if (ibd_find_bgroup(state) != IBT_SUCCESS) { + DPRINT(10, "ibd_m_start: ibd_find_bgroup() failed"); + err = ENOENT; + goto m_start_fail; + } + state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; + + /* + * Initialize per-interface caches and lists; if we fail here, + * it is most likely due to a lack of resources + */ + if (ibd_acache_init(state) != DDI_SUCCESS) { + DPRINT(10, "ibd_m_start: ibd_acache_init() failed"); + err = ENOMEM; + goto m_start_fail; + } + state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; + + /* + * Allocate send and receive completion queues + */ + if (ibd_alloc_cqs(state) != DDI_SUCCESS) { + DPRINT(10, "ibd_m_start: ibd_alloc_cqs() failed"); + err = ENOMEM; + goto m_start_fail; + } + state->id_mac_state |= IBD_DRV_CQS_ALLOCD; + + /* + * Setup a UD channel + */ + if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { + err = ENOMEM; + DPRINT(10, "ibd_m_start: ibd_setup_ud_channel() failed"); + goto m_start_fail; + } + state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; + + /* + * Allocate and initialize the tx buffer list + */ + if (ibd_init_txlist(state) != DDI_SUCCESS) { + DPRINT(10, "ibd_m_start: ibd_init_txlist() failed"); + err = ENOMEM; + goto m_start_fail; + } + state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; + + /* + * If we have separate cqs, create the send cq handler here + */ + if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { + ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); + if (ibt_enable_cq_notify(state->id_scq_hdl, + IBT_NEXT_COMPLETION) != IBT_SUCCESS) { + DPRINT(10, + "ibd_m_start: ibt_enable_cq_notify(scq) failed"); + err = EINVAL; + goto m_start_fail; + } + state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; + } + + /* + * Allocate and initialize the rx buffer list + */ + if (ibd_init_rxlist(state) != DDI_SUCCESS) { + DPRINT(10, "ibd_m_start: ibd_init_rxlist() failed"); + err = ENOMEM; + goto m_start_fail; + } + state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; + + /* + * Join IPoIB broadcast group + */ + if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { + DPRINT(10, "ibd_m_start: ibd_join_group() failed"); + err = EINVAL; + goto m_start_fail; + } + state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; + + /* + * Create the async thread; thread_create never fails. + */ + kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, + TS_RUN, minclsyspri); + state->id_async_thrid = kht->t_did; + state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; + + /* + * When we did mac_register() in ibd_attach(), we didn't register + * the real macaddr and we didn't have the true port mtu. Now that + * we're almost ready, set the local mac address and broadcast + * addresses and update gldv3 about the real values of these + * parameters. + */ + ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, + state->id_sgid.gid_prefix, state->id_sgid.gid_guid); + ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, + state->id_mgid.gid_prefix, state->id_mgid.gid_guid); + + mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); + mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); + + /* + * Setup the receive cq handler + */ + ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); + if (ibt_enable_cq_notify(state->id_rcq_hdl, + IBT_NEXT_COMPLETION) != IBT_SUCCESS) { + DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(rcq) failed"); + err = EINVAL; + goto m_start_fail; + } + state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; + + /* + * Setup the subnet notices handler after we've initialized the acache/ + * mcache and started the async thread, both of which are required for + * the trap handler to function properly. + * + * Now that the async thread has been started (and we've already done + * a mac_register() during attach so mac_tx_update() can be called + * if necessary without any problem), we can enable the trap handler + * to queue requests to the async thread. + */ + ibt_register_subnet_notices(state->id_ibt_hdl, + ibd_snet_notices_handler, state); + mutex_enter(&state->id_trap_lock); + state->id_trap_stop = B_FALSE; + mutex_exit(&state->id_trap_lock); + state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; + + /* + * Indicate link status to GLDv3 and higher layers. By default, + * we assume we are in up state (which must have been true at + * least at the time the broadcast mcg's were probed); if there + * were any up/down transitions till the time we come here, the + * async handler will have updated last known state, which we + * use to tell GLDv3. The async handler will not send any + * notifications to GLDv3 till we reach here in the initialization + * sequence. + */ + state->id_mac_state |= IBD_DRV_STARTED; + mac_link_update(state->id_mh, state->id_link_state); + + return (DDI_SUCCESS); + +m_start_fail: + /* + * If we ran into a problem during ibd_m_start() and ran into + * some other problem during undoing our partial work, we can't + * do anything about it. Ignore any errors we might get from + * ibd_undo_m_start() and just return the original error we got. + */ + (void) ibd_undo_m_start(state); + return (err); } /* @@ -4378,6 +4534,15 @@ static void ibd_m_stop(void *arg) { + ibd_state_t *state = arg; + + /* + * Since ibd_m_stop() doesn't expect any return, we cannot + * fail even if we run into some problem with ibd_undo_m_start(). + * The best we can do is to leave it in a good state, so + * perhaps a future unplumb will succeed. + */ + (void) ibd_undo_m_start(state); } /* @@ -4387,9 +4552,15 @@ static int ibd_m_unicst(void *arg, const uint8_t *macaddr) { - ibd_state_t *state; - - state = (ibd_state_t *)arg; + ibd_state_t *state = arg; + + /* + * Don't bother even comparing the macaddr if we haven't + * completed ibd_m_start(). + */ + if ((state->id_mac_state & IBD_DRV_STARTED) == 0) + return (0); + if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) return (0); else @@ -4407,7 +4578,6 @@ "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); if (op == IBD_ASYNC_JOIN) { - if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { ibd_print_warn(state, "Joint multicast group failed :" "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); @@ -4435,6 +4605,14 @@ ibd_req_t *req; /* + * If we haven't completed ibd_m_start(), async thread wouldn't + * have been started and id_bcaddr wouldn't be set, so there's + * no point in continuing. + */ + if ((state->id_mac_state & IBD_DRV_STARTED) == 0) + return (0); + + /* * The incoming multicast address might not be aligned properly * on a 4 byte boundary to be considered an ipoib_mac_t. We force * it to look like one though, to get the offsets of the mc gid, @@ -4461,8 +4639,8 @@ /* * If someone is trying to JOIN/LEAVE the broadcast group, we do - * nothing (ie we stay JOINed to the broadcast group done in - * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically + * nothing (i.e. we stay JOINed to the broadcast group done in + * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically * requires to be joined to broadcast groups at all times. * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also * depends on this. @@ -4524,7 +4702,8 @@ ibt_mcg_info_t *mcg_info; ib_gid_t mgid; uint_t numg; - int i, ret = IBD_OP_COMPLETED; + int i; + char ret = IBD_OP_COMPLETED; DPRINT(2, "ibd_async_setprom : async_set_promisc"); @@ -4580,6 +4759,13 @@ ibd_state_t *state = (ibd_state_t *)arg; ibd_req_t *req; + /* + * Async thread wouldn't have been started if we haven't + * passed ibd_m_start() + */ + if ((state->id_mac_state & IBD_DRV_STARTED) == 0) + return (0); + req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req == NULL) return (ENOMEM); @@ -5188,6 +5374,13 @@ boolean_t dofree = B_FALSE; boolean_t rc; + /* + * If we aren't done with the device initialization and start, + * we shouldn't be here. + */ + if ((state->id_mac_state & IBD_DRV_STARTED) == 0) + return (B_FALSE); + node = NULL; if (ibd_acquire_swqe(state, &node) != 0) { /* @@ -5935,7 +6128,7 @@ return; } - if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) { + if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) { ibd_delete_rwqe(state, rwqe); ibd_free_rwqe(state, rwqe); return;
--- a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h Mon Jun 29 02:30:05 2009 -0700 +++ b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h Mon Jun 29 02:56:22 2009 -0700 @@ -361,6 +361,8 @@ uint_t id_lso_maxlen; int id_hca_res_lkey_capab; ibt_lkey_t id_res_lkey; + + uint32_t id_mac_state; } ibd_state_t; #endif /* _KERNEL && !_BOOT */
--- a/usr/src/uts/intel/ibd/Makefile Mon Jun 29 02:30:05 2009 -0700 +++ b/usr/src/uts/intel/ibd/Makefile Mon Jun 29 02:56:22 2009 -0700 @@ -19,11 +19,9 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # # @@ -59,10 +57,8 @@ # to investigate and remove these for maximum lint coverage. # Please do not carry these forward to new Makefiles. # -LINTTAGS += -erroff=E_SUSPICIOUS_COMPARISON LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW -LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV # # Default build targets.
--- a/usr/src/uts/sparc/ibd/Makefile Mon Jun 29 02:30:05 2009 -0700 +++ b/usr/src/uts/sparc/ibd/Makefile Mon Jun 29 02:56:22 2009 -0700 @@ -19,11 +19,9 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # # @@ -73,8 +71,6 @@ # LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW -LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV -LINTTAGS += -erroff=E_SUSPICIOUS_COMPARISON # # Default build targets.