Mercurial > illumos > illumos-gate
changeset 3715:ffe268e01dfa
6514591 vsw: fix for 6496374 causes softhang
6523891 vsw needs to update lane state correctly for RDX pkts
6523926 handshake restart can fail following reboot under certain conditions
author | sg70180 |
---|---|
date | Mon, 26 Feb 2007 09:52:03 -0800 |
parents | dce229b9418d |
children | 1429cb51c952 |
files | usr/src/uts/sun4v/io/vsw.c usr/src/uts/sun4v/sys/vsw.h |
diffstat | 2 files changed, 355 insertions(+), 247 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/sun4v/io/vsw.c Mon Feb 26 09:37:10 2007 -0800 +++ b/usr/src/uts/sun4v/io/vsw.c Mon Feb 26 09:52:03 2007 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -148,9 +148,9 @@ static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg); /* Handshake routines */ -static void vsw_restart_ldc(vsw_ldc_t *); -static void vsw_restart_handshake(vsw_ldc_t *); -static void vsw_handle_reset(vsw_ldc_t *); +static void vsw_ldc_reinit(vsw_ldc_t *); +static void vsw_process_conn_evt(vsw_ldc_t *, uint16_t); +static void vsw_conn_task(void *); static int vsw_check_flag(vsw_ldc_t *, int, uint64_t); static void vsw_next_milestone(vsw_ldc_t *); static int vsw_supported_version(vio_ver_msg_t *); @@ -191,7 +191,7 @@ static void vsw_send_dring_info(vsw_ldc_t *); static void vsw_send_rdx(vsw_ldc_t *); -static void vsw_send_msg(vsw_ldc_t *, void *, int); +static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t); /* Forwarding database (FDB) routines */ static int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port); @@ -290,7 +290,7 @@ extern struct mod_ops mod_driverops; static struct modldrv vswmodldrv = { &mod_driverops, - "sun4v Virtual Switch Driver %I%", + "sun4v Virtual Switch %I%", &vsw_ops, }; @@ -3540,7 +3540,6 @@ * is UP. 
*/ mutex_enter(&ldcp->status_lock); - istatus = ldcp->ldc_status; if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) { DERR(vswp, "%s: unable to get status", __func__); mutex_exit(&ldcp->status_lock); @@ -3548,15 +3547,20 @@ return (1); } + + if (ldcp->ldc_status == LDC_UP) { + D2(vswp, "%s: channel %ld now UP (%ld)", __func__, + ldcp->ldc_id, istatus); + mutex_exit(&ldcp->status_lock); + LDC_EXIT_LOCK(ldcp); + + vsw_process_conn_evt(ldcp, VSW_CONN_UP); + return (0); + } + mutex_exit(&ldcp->status_lock); LDC_EXIT_LOCK(ldcp); - if ((istatus != LDC_UP) && (ldcp->ldc_status == LDC_UP)) { - D2(vswp, "%s: channel %ld now UP (%ld)", __func__, - ldcp->ldc_id, istatus); - vsw_restart_handshake(ldcp); - } - D1(vswp, "%s: exit", __func__); return (0); } @@ -3843,42 +3847,27 @@ { vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; vsw_t *vswp = ldcp->ldc_vswp; - ldc_status_t lstatus; - int rv; D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); mutex_enter(&ldcp->ldc_cblock); + mutex_enter(&ldcp->status_lock); if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) { + mutex_exit(&ldcp->status_lock); mutex_exit(&ldcp->ldc_cblock); return (LDC_SUCCESS); } - - mutex_enter(&ldcp->status_lock); - lstatus = ldcp->ldc_status; - rv = ldc_status(ldcp->ldc_handle, &ldcp->ldc_status); mutex_exit(&ldcp->status_lock); - if (rv != 0) { - cmn_err(CE_WARN, "!vsw%d: Unable to read channel state", - vswp->instance); - goto vsw_cb_exit; - } if (event & LDC_EVT_UP) { /* - * Channel has come up, get the state and then start - * the handshake. + * Channel has come up. 
*/ D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)", - __func__, ldcp->ldc_id, event, lstatus); - D2(vswp, "%s: UP: old status %ld : cur status %ld", - __func__, lstatus, ldcp->ldc_status); - if ((ldcp->ldc_status != lstatus) && - (ldcp->ldc_status == LDC_UP)) { - ldcp->reset_active = 0; - vsw_restart_handshake(ldcp); - } + __func__, ldcp->ldc_id, event, ldcp->ldc_status); + + vsw_process_conn_evt(ldcp, VSW_CONN_UP); ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); } @@ -3898,40 +3887,10 @@ } if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) { - D2(vswp, "%s: id(%ld) event(%llx) DOWN/RESET", - __func__, ldcp->ldc_id, event); - - /* attempt to restart the connection */ - vsw_restart_ldc(ldcp); - - /* - * vsw_restart_ldc() will attempt to bring the channel - * back up. Check here to see if that succeeded. - */ - mutex_enter(&ldcp->status_lock); - lstatus = ldcp->ldc_status; - rv = ldc_status(ldcp->ldc_handle, &ldcp->ldc_status); - mutex_exit(&ldcp->status_lock); - if (rv != 0) { - DERR(vswp, "%s: unable to read status for channel %ld", - __func__, ldcp->ldc_id); - goto vsw_cb_exit; - } - - D2(vswp, "%s: id(%ld) event(%llx) DOWN/RESET event:" - " old status %ld : cur status %ld", __func__, - ldcp->ldc_id, event, lstatus, ldcp->ldc_status); - - /* - * If channel was not previously UP then (re)start the - * handshake. - */ - if ((ldcp->ldc_status == LDC_UP) && (lstatus != LDC_UP)) { - D2(vswp, "%s: channel %ld now UP, restarting " - "handshake", __func__, ldcp->ldc_id); - ldcp->reset_active = 0; - vsw_restart_handshake(ldcp); - } + D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)", + __func__, ldcp->ldc_id, event, ldcp->ldc_status); + + vsw_process_conn_evt(ldcp, VSW_CONN_RESET); } /* @@ -3961,26 +3920,17 @@ } /* - * Restart the connection with our peer. Free any existing - * data structures and then attempt to bring channel back - * up. + * Reinitialise data structures associated with the channel. 
*/ static void -vsw_restart_ldc(vsw_ldc_t *ldcp) -{ - int rv; +vsw_ldc_reinit(vsw_ldc_t *ldcp) +{ vsw_t *vswp = ldcp->ldc_vswp; vsw_port_t *port; vsw_ldc_list_t *ldcl; D1(vswp, "%s: enter", __func__); - /* - * Check if reset already in progress for this channel. - */ - if (ldstub((uint8_t *)&ldcp->reset_active)) - return; - port = ldcp->ldc_port; ldcl = &port->p_ldclist; @@ -4010,117 +3960,208 @@ ldcp->hcnt = 0; ldcp->hphase = VSW_MILESTONE0; - rv = ldc_up(ldcp->ldc_handle); - if (rv != 0) { - /* - * Not a fatal error for ldc_up() to fail, as peer - * end point may simply not be ready yet. - */ - D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, - ldcp->ldc_id, rv); - } - D1(vswp, "%s: exit", __func__); } /* - * (Re)start a handshake with our peer by sending them - * our version info. + * Process a connection event. + * + * Note - care must be taken to ensure that this function is + * not called with the dlistrw lock held. */ static void -vsw_restart_handshake(vsw_ldc_t *ldcp) +vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt) { vsw_t *vswp = ldcp->ldc_vswp; - - D1(vswp, "vsw_restart_handshake: enter"); - - if (ldcp->hphase != VSW_MILESTONE0) { - vsw_restart_ldc(ldcp); - } + vsw_conn_evt_t *conn = NULL; + + D1(vswp, "%s: enter", __func__); + + /* + * Check if either a reset or restart event is pending + * or in progress. If so just return. + * + * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT + * being received by the callback handler, or a ECONNRESET error + * code being returned from a ldc_read() or ldc_write() call. + * + * A VSW_CONN_RESTART event occurs when some error checking code + * decides that there is a problem with data from the channel, + * and that the handshake should be restarted. + */ + if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) && + (ldstub((uint8_t *)&ldcp->reset_active))) + return; /* - * We now increment the transaction group id. 
This allows - * us to identify and disard any tasks which are still pending - * on the taskq and refer to the handshake session we are about - * to restart. These stale messages no longer have any real - * meaning. + * If it is an LDC_UP event we first check the recorded + * state of the channel. If this is UP then we know that + * the channel moving to the UP state has already been dealt + * with and don't need to dispatch a new task. + * + * The reason for this check is that when we do a ldc_up(), + * depending on the state of the peer, we may or may not get + * a LDC_UP event. As we can't depend on getting a LDC_UP evt + * every time we do ldc_up() we explicitly check the channel + * status to see whether it has come up (ldc_up() is asynchronous and will + * complete at some undefined time), and take the appropriate + * action. + * + * The flip side of this is that we may get a LDC_UP event + * when we have already seen that the channel is up and have + * dealt with that. + */ + mutex_enter(&ldcp->status_lock); + if (evt == VSW_CONN_UP) { + if ((ldcp->ldc_status == LDC_UP) || + (ldcp->reset_active != 0)) { + mutex_exit(&ldcp->status_lock); + return; + } + } + mutex_exit(&ldcp->status_lock); + + /* + * The transaction group id allows us to identify and discard + * any tasks which are still pending on the taskq and refer + * to the handshake session we are about to restart or reset. + * These stale messages no longer have any real meaning. 
*/ mutex_enter(&ldcp->hss_lock); ldcp->hss_id++; mutex_exit(&ldcp->hss_lock); - if (ldcp->hcnt++ > vsw_num_handshakes) { - cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted " - "handshake attempts (%d) on channel %ld", - vswp->instance, ldcp->hcnt, ldcp->ldc_id); - return; - } - - if ((vswp->taskq_p == NULL) || - (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp, - DDI_NOSLEEP) != DDI_SUCCESS)) { - cmn_err(CE_WARN, "!vsw%d: Can't dispatch version handshake " - "task", vswp->instance); - } - - D1(vswp, "vsw_restart_handshake: exit"); + ASSERT(vswp->taskq_p != NULL); + + if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) { + cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for" + " connection event", vswp->instance); + goto err_exit; + } + + conn->evt = evt; + conn->ldcp = ldcp; + + if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn, + DDI_NOSLEEP) != DDI_SUCCESS) { + cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task", + vswp->instance); + + kmem_free(conn, sizeof (vsw_conn_evt_t)); + goto err_exit; + } + + D1(vswp, "%s: exit", __func__); + return; + +err_exit: + /* + * Have most likely failed due to memory shortage. Clear the flag so + * that future requests will at least be attempted and will hopefully + * succeed. + */ + if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) + ldcp->reset_active = 0; } /* - * Deal appropriately with a ECONNRESET event encountered in a ldc_* - * call. + * Deal with events relating to a connection. Invoked from a taskq. 
*/ static void -vsw_handle_reset(vsw_ldc_t *ldcp) -{ - vsw_t *vswp = ldcp->ldc_vswp; - ldc_status_t lstatus; +vsw_conn_task(void *arg) +{ + vsw_conn_evt_t *conn = (vsw_conn_evt_t *)arg; + vsw_ldc_t *ldcp = NULL; + vsw_t *vswp = NULL; + uint16_t evt; + ldc_status_t curr_status; + + ldcp = conn->ldcp; + evt = conn->evt; + vswp = ldcp->ldc_vswp; D1(vswp, "%s: enter", __func__); + /* can safely free now have copied out data */ + kmem_free(conn, sizeof (vsw_conn_evt_t)); + mutex_enter(&ldcp->status_lock); - lstatus = ldcp->ldc_status; - if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) { - DERR(vswp, "%s: unable to read status for channel %ld", - __func__, ldcp->ldc_id); + if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) { + cmn_err(CE_WARN, "!vsw%d: Unable to read status of " + "channel %ld", vswp->instance, ldcp->ldc_id); mutex_exit(&ldcp->status_lock); return; } - mutex_exit(&ldcp->status_lock); - - /* - * Check the channel's previous recorded state to - * determine if this is the first ECONNRESET event - * we've gotten for this particular channel (i.e. was - * previously up but is no longer). If so, terminate - * the channel. - */ - if ((ldcp->ldc_status != LDC_UP) && (lstatus == LDC_UP)) { - vsw_restart_ldc(ldcp); - } /* - * vsw_restart_ldc() will also attempt to bring channel - * back up. Check here if that succeeds. + * If we wish to restart the handshake on this channel, then if + * the channel is UP we bring it DOWN to flush the underlying + * ldc queue. + */ + if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP)) + (void) ldc_down(ldcp->ldc_handle); + + /* + * re-init all the associated data structures. 
*/ - mutex_enter(&ldcp->status_lock); - lstatus = ldcp->ldc_status; - if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) { - DERR(vswp, "%s: unable to read status for channel %ld", - __func__, ldcp->ldc_id); + vsw_ldc_reinit(ldcp); + + /* + * Bring the channel back up (note it does no harm to + * do this even if the channel is already UP; it just + * becomes effectively a no-op). + */ + (void) ldc_up(ldcp->ldc_handle); + + /* + * Check if channel is now UP. This will only happen if + * peer has also done a ldc_up(). + */ + if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) { + cmn_err(CE_WARN, "!vsw%d: Unable to read status of " + "channel %ld", vswp->instance, ldcp->ldc_id); mutex_exit(&ldcp->status_lock); return; } - mutex_exit(&ldcp->status_lock); + + ldcp->ldc_status = curr_status; + + /* channel UP so restart handshake by sending version info */ + if (curr_status == LDC_UP) { + if (ldcp->hcnt++ > vsw_num_handshakes) { + cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted" + " handshake attempts (%d) on channel %ld", + vswp->instance, ldcp->hcnt, ldcp->ldc_id); + mutex_exit(&ldcp->status_lock); + return; + } + + if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp, + DDI_NOSLEEP) != DDI_SUCCESS) { + cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task", + vswp->instance); + + /* + * Don't count as valid restart attempt if couldn't + * send version msg. + */ + if (ldcp->hcnt > 0) + ldcp->hcnt--; + } + } /* - * If channel is now up and no one else (i.e. the callback routine) - * has dealt with it then we restart the handshake here. + * Mark that the process is complete by clearing the flag. + * + * Note it is possible that the taskq dispatch above may have failed, + * most likely due to memory shortage. We still clear the flag so + * future attempts will at least be attempted and will hopefully + * succeed. 
*/ - if ((lstatus != LDC_UP) && (ldcp->ldc_status == LDC_UP)) { + if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) ldcp->reset_active = 0; - vsw_restart_handshake(ldcp); - } + + mutex_exit(&ldcp->status_lock); D1(vswp, "%s: exit", __func__); } @@ -4148,7 +4189,7 @@ if (phase > VSW_MILESTONE0) { DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV" " when in state %d\n", ldcp->ldc_id, phase); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } break; @@ -4159,7 +4200,7 @@ DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK" " or VER_NACK when in state %d\n", ldcp->ldc_id, phase); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } else state &= ~VSW_VER_INFO_SENT; @@ -4169,7 +4210,7 @@ if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) { DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV" " when in state %d\n", ldcp->ldc_id, phase); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } break; @@ -4180,7 +4221,7 @@ DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK" " or ATTR_NACK when in state %d\n", ldcp->ldc_id, phase); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } else state &= ~VSW_ATTR_INFO_SENT; @@ -4190,7 +4231,7 @@ if (phase < VSW_MILESTONE1) { DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV" " when in state %d\n", ldcp->ldc_id, phase); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } break; @@ -4201,7 +4242,7 @@ DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK" " or DRING_NACK when in state %d\n", ldcp->ldc_id, phase); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } else state &= ~VSW_DRING_INFO_SENT; @@ -4211,7 +4252,7 @@ if (phase < VSW_MILESTONE3) { DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV" " when in state %d\n", ldcp->ldc_id, phase); - vsw_restart_handshake(ldcp); + 
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } break; @@ -4222,7 +4263,7 @@ DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK" " or RDX_NACK when in state %d\n", ldcp->ldc_id, phase); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } else state &= ~VSW_RDX_INFO_SENT; @@ -4232,7 +4273,7 @@ if (phase < VSW_MILESTONE3) { DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV" " when in state %d\n", ldcp->ldc_id, phase); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return (1); } break; @@ -4274,7 +4315,7 @@ if (ldcp->lane_out.lstate == 0) { D2(vswp, "%s: (chan %lld) starting handshake " "with peer", __func__, ldcp->ldc_id); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_UP); } /* @@ -4345,8 +4386,8 @@ * * Mark outbound lane as available to transmit data. */ - if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) && - (ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) { + if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) && + (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) { D2(vswp, "%s: (chan %lld) leaving milestone 3", __func__, ldcp->ldc_id); @@ -4463,7 +4504,7 @@ /* channel has been reset */ if (rv == ECONNRESET) { - vsw_handle_reset(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESET); break; } @@ -4522,13 +4563,13 @@ if ((tag.vio_subtype_env == VIO_RDX) && (tag.vio_subtype == VIO_SUBTYPE_ACK)) { - if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV)) + if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV)) return; - ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV; + ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV; D2(vswp, "%s (%ld) handling RDX_ACK in place " "(ostate 0x%llx : hphase %d)", __func__, - ldcp->ldc_id, ldcp->lane_out.lstate, ldcp->hphase); + ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase); vsw_next_milestone(ldcp); return; } @@ -4538,7 +4579,7 @@ if (ctaskp == NULL) { DERR(vswp, "%s: unable to alloc space for ctrl" " msg", __func__); - vsw_restart_handshake(ldcp); + 
vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } @@ -4562,7 +4603,7 @@ __func__); kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); mutex_exit(&port->state_lock); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } } else { @@ -4611,7 +4652,7 @@ DERR(vswp, "%s (chan %d): invalid session id (%llx)", __func__, ldcp->ldc_id, tag.vio_sid); kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } } @@ -4714,8 +4755,8 @@ DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); - vsw_send_msg(ldcp, (void *)ver_pkt, - sizeof (vio_ver_msg_t)); + (void) vsw_send_msg(ldcp, (void *)ver_pkt, + sizeof (vio_ver_msg_t), B_TRUE); ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; vsw_next_milestone(ldcp); @@ -4764,7 +4805,8 @@ DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); ver_pkt->tag.vio_sid = ldcp->local_session; - vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t)); + (void) vsw_send_msg(ldcp, (void *)ver_pkt, + sizeof (vio_ver_msg_t), B_TRUE); vsw_next_milestone(ldcp); break; @@ -4832,8 +4874,8 @@ DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); - vsw_send_msg(ldcp, (void *)ver_pkt, - sizeof (vio_ver_msg_t)); + (void) vsw_send_msg(ldcp, (void *)ver_pkt, + sizeof (vio_ver_msg_t), B_TRUE); vsw_next_milestone(ldcp); @@ -4904,8 +4946,8 @@ DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT; - vsw_send_msg(ldcp, (void *)attr_pkt, - sizeof (vnet_attr_msg_t)); + (void) vsw_send_msg(ldcp, (void *)attr_pkt, + sizeof (vnet_attr_msg_t), B_TRUE); vsw_next_milestone(ldcp); return; @@ -4949,8 +4991,8 @@ ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT; - vsw_send_msg(ldcp, (void *)attr_pkt, - sizeof (vnet_attr_msg_t)); + (void) vsw_send_msg(ldcp, (void *)attr_pkt, + sizeof (vnet_attr_msg_t), B_TRUE); vsw_next_milestone(ldcp); break; @@ -5036,8 +5078,8 @@ ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; - vsw_send_msg(ldcp, (void *)dring_pkt, - sizeof (vio_dring_reg_msg_t)); + (void) 
vsw_send_msg(ldcp, (void *)dring_pkt, + sizeof (vio_dring_reg_msg_t), B_TRUE); vsw_next_milestone(ldcp); return; @@ -5084,8 +5126,8 @@ DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; - vsw_send_msg(ldcp, (void *)dring_pkt, - sizeof (vio_dring_reg_msg_t)); + (void) vsw_send_msg(ldcp, (void *)dring_pkt, + sizeof (vio_dring_reg_msg_t), B_TRUE); vsw_next_milestone(ldcp); return; @@ -5104,8 +5146,8 @@ DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; - vsw_send_msg(ldcp, (void *)dring_pkt, - sizeof (vio_dring_reg_msg_t)); + (void) vsw_send_msg(ldcp, (void *)dring_pkt, + sizeof (vio_dring_reg_msg_t), B_TRUE); vsw_next_milestone(ldcp); return; @@ -5148,8 +5190,8 @@ dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; dring_pkt->dring_ident = dp->ident; - vsw_send_msg(ldcp, (void *)dring_pkt, - sizeof (vio_dring_reg_msg_t)); + (void) vsw_send_msg(ldcp, (void *)dring_pkt, + sizeof (vio_dring_reg_msg_t), B_TRUE); ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT; vsw_next_milestone(ldcp); @@ -5186,14 +5228,14 @@ if (dring_found == 0) { DERR(NULL, "%s: unrecognised ring cookie", __func__); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } } else { DERR(vswp, "%s: DRING ACK received but no drings " "allocated", __func__); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } @@ -5246,28 +5288,26 @@ D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); DWARN(vswp, "%s: restarting handshake..", __func__); - vsw_restart_handshake(ldcp); break; case VIO_SUBTYPE_ACK: D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); DWARN(vswp, "%s: restarting handshake..", __func__); - vsw_restart_handshake(ldcp); break; case VIO_SUBTYPE_NACK: D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); DWARN(vswp, "%s: restarting handshake..", __func__); - vsw_restart_handshake(ldcp); break; default: DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, dring_pkt->tag.vio_subtype); - 
vsw_restart_handshake(ldcp); - } + } + + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); } @@ -5275,7 +5315,8 @@ #define SND_MCST_NACK(ldcp, pkt) \ pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ pkt->tag.vio_sid = ldcp->local_session; \ - vsw_send_msg(ldcp, (void *)pkt, sizeof (vnet_mcast_msg_t)); + (void) vsw_send_msg(ldcp, (void *)pkt, \ + sizeof (vnet_mcast_msg_t), B_TRUE); /* * Process a multicast request from a vnet. @@ -5359,8 +5400,8 @@ DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt); - vsw_send_msg(ldcp, (void *)mcst_pkt, - sizeof (vnet_mcast_msg_t)); + (void) vsw_send_msg(ldcp, (void *)mcst_pkt, + sizeof (vnet_mcast_msg_t), B_TRUE); break; case VIO_SUBTYPE_ACK: @@ -5417,7 +5458,7 @@ case VIO_SUBTYPE_INFO: D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); - if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV)) + if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV)) return; rdx_pkt->tag.vio_sid = ldcp->local_session; @@ -5425,10 +5466,10 @@ DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt); - ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT; - - vsw_send_msg(ldcp, (void *)rdx_pkt, - sizeof (vio_rdx_msg_t)); + ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT; + + (void) vsw_send_msg(ldcp, (void *)rdx_pkt, + sizeof (vio_rdx_msg_t), B_TRUE); vsw_next_milestone(ldcp); break; @@ -5438,16 +5479,16 @@ * Should be handled in-band by callback handler. 
*/ DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); break; case VIO_SUBTYPE_NACK: D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); - if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV)) + if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV)) return; - ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV; + ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV; vsw_next_milestone(ldcp); break; @@ -5472,7 +5513,7 @@ if (ldcp->peer_session != tag.vio_sid) { DERR(vswp, "%s (chan %d): invalid session id (%llx)", __func__, ldcp->ldc_id, tag.vio_sid); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } } @@ -5487,7 +5528,7 @@ ldcp->lane_in.lstate, ldcp->lane_out.lstate); DUMP_FLAGS(ldcp->lane_in.lstate); DUMP_FLAGS(ldcp->lane_out.lstate); - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); return; } @@ -5512,7 +5553,8 @@ #define SND_DRING_NACK(ldcp, pkt) \ pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ pkt->tag.vio_sid = ldcp->local_session; \ - vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_dring_msg_t)); + (void) vsw_send_msg(ldcp, (void *)pkt, \ + sizeof (vio_dring_msg_t), B_TRUE); static void vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) @@ -5533,7 +5575,7 @@ uint32_t pos, start, datalen; uint32_t range_start, range_end; int32_t end, num, cnt = 0; - int i, rv; + int i, rv, msg_rv = 0; boolean_t ack_needed = B_FALSE; boolean_t prev_desc_ack = B_FALSE; int read_attempts = 0; @@ -5775,8 +5817,16 @@ dring_pkt->dring_process_state = VIO_DP_ACTIVE; dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; dring_pkt->tag.vio_sid = ldcp->local_session; - vsw_send_msg(ldcp, (void *)dring_pkt, - sizeof (vio_dring_msg_t)); + msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt, + sizeof (vio_dring_msg_t), + B_FALSE); + + /* + * Check if ACK was successfully sent. If not + * we break and deal with that below. 
+ */ + if (msg_rv != 0) + break; prev_desc_ack = B_TRUE; range_start = pos; @@ -5791,18 +5841,27 @@ * allow some other network device (or disk) to * get access to the cpu. */ - /* send the chain of packets to be switched */ if (chain > vsw_chain_len) { D3(vswp, "%s(%lld): switching chain of %d " "msgs", __func__, ldcp->ldc_id, chain); - vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT, - ldcp->ldc_port, NULL); - bp = NULL; break; } } RW_EXIT(&ldcp->lane_in.dlistrw); + /* + * If when we attempted to send the ACK we found that the + * channel had been reset then now handle this. We deal with + * it here as we cannot reset the channel while holding the + * dlistrw lock, and we don't want to acquire/release it + * continuously in the above loop, as a channel reset should + * be a rare event. + */ + if (msg_rv == ECONNRESET) { + vsw_process_conn_evt(ldcp, VSW_CONN_RESET); + break; + } + /* send the chain of packets to be switched */ if (bp != NULL) { D3(vswp, "%s(%lld): switching chain of %d msgs", @@ -5838,8 +5897,8 @@ __func__, ldcp->ldc_id, dring_pkt->start_idx, dring_pkt->end_idx); - vsw_send_msg(ldcp, (void *)dring_pkt, - sizeof (vio_dring_msg_t)); + (void) vsw_send_msg(ldcp, (void *)dring_pkt, + sizeof (vio_dring_msg_t), B_TRUE); break; case VIO_SUBTYPE_ACK: @@ -5970,8 +6029,9 @@ dring_pkt->start_idx, dring_pkt->end_idx); - vsw_send_msg(ldcp, (void *)dring_pkt, - sizeof (vio_dring_msg_t)); + msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt, + sizeof (vio_dring_msg_t), B_FALSE); + } else { mutex_exit(&priv_addr->dstate_lock); dp->restart_reqd = B_TRUE; @@ -5979,6 +6039,11 @@ mutex_exit(&dp->restart_lock); } RW_EXIT(&ldcp->lane_out.dlistrw); + + /* only do channel reset after dropping dlistrw lock */ + if (msg_rv == ECONNRESET) + vsw_process_conn_evt(ldcp, VSW_CONN_RESET); + break; case VIO_SUBTYPE_NACK: @@ -5988,7 +6053,7 @@ * Something is badly wrong if we are getting NACK's * for our data pkts. So reset the channel. 
*/ - vsw_restart_handshake(ldcp); + vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); break; @@ -6103,8 +6168,8 @@ */ ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK; ibnd_desc->hdr.tag.vio_sid = ldcp->local_session; - vsw_send_msg(ldcp, (void *)ibnd_desc, - sizeof (vnet_ibnd_desc_t)); + (void) vsw_send_msg(ldcp, (void *)ibnd_desc, + sizeof (vnet_ibnd_desc_t), B_TRUE); /* send the packet to be switched */ vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, @@ -6931,8 +6996,15 @@ __func__, ldcp->ldc_id, dring_pkt.start_idx, dring_pkt.end_idx, dring_pkt.seq_num); - vsw_send_msg(ldcp, (void *)&dring_pkt, - sizeof (vio_dring_msg_t)); + RW_EXIT(&ldcp->lane_out.dlistrw); + + (void) vsw_send_msg(ldcp, (void *)&dring_pkt, + sizeof (vio_dring_msg_t), B_TRUE); + + /* free the message block */ + freemsg(mp); + return (status); + } else { mutex_exit(&dp->restart_lock); D2(vswp, "%s(%lld): updating descp %d", __func__, @@ -6998,6 +7070,7 @@ size = msgsize(mp); if (size > (size_t)ETHERMAX) { + RW_EXIT(&ldcp->lane_out.dlistrw); DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, ldcp->ldc_id, size); freemsg(mp); @@ -7008,6 +7081,7 @@ * Find a free descriptor in our buffer ring */ if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { + RW_EXIT(&ldcp->lane_out.dlistrw); if (warn_msg) { DERR(vswp, "%s(%lld): no descriptor available for ring " "at 0x%llx", __func__, ldcp->ldc_id, dp); @@ -7058,12 +7132,13 @@ ibnd_msg.ncookies = priv_desc->ncookies; ibnd_msg.nbytes = size; - vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vnet_ibnd_desc_t)); + RW_EXIT(&ldcp->lane_out.dlistrw); + + (void) vsw_send_msg(ldcp, (void *)&ibnd_msg, + sizeof (vnet_ibnd_desc_t), B_TRUE); vsw_descrsend_free_exit: - RW_EXIT(&ldcp->lane_out.dlistrw); - /* free the allocated message blocks */ freemsg(mp); @@ -7096,7 +7171,7 @@ DUMP_TAG(ver_msg.tag); - vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t)); + (void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE); D1(vswp, "%s (%d): exit", __func__, 
ldcp->ldc_id); } @@ -7132,9 +7207,9 @@ DUMP_TAG(attr_msg.tag); - vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t)); - - D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); + (void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE); + + D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); } /* @@ -7197,8 +7272,8 @@ DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg); - vsw_send_msg(ldcp, dring_msg, - sizeof (vio_dring_reg_msg_t)); + (void) vsw_send_msg(ldcp, dring_msg, + sizeof (vio_dring_reg_msg_t), B_TRUE); kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t)); @@ -7218,20 +7293,25 @@ rdx_msg.tag.vio_subtype_env = VIO_RDX; rdx_msg.tag.vio_sid = ldcp->local_session; - ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT; + ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT; DUMP_TAG(rdx_msg.tag); - vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t)); + (void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE); D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); } /* * Generic routine to send message out over ldc channel. + * + * It is possible that when we attempt to write over the ldc channel + * that we get notified that it has been reset. Depending on the value + * of the handle_reset flag we either handle that event here or simply + * notify the caller that the channel was reset. */ -static void -vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size) +static int +vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset) { int rv; size_t msglen = size; @@ -7258,13 +7338,25 @@ } mutex_exit(&ldcp->ldc_txlock); - /* channel has been reset */ + /* + * If channel has been reset we either handle it here or + * simply report back that it has been reset and let caller + * decide what to do. 
+ */ if (rv == ECONNRESET) { - vsw_handle_reset(ldcp); - } - - D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes", - ldcp->ldc_id, msglen); + DWARN(vswp, "%s (%lld) channel reset", + __func__, ldcp->ldc_id); + + /* + * N.B - must never be holding the dlistrw lock when + * we do a reset of the channel. + */ + if (handle_reset) { + vsw_process_conn_evt(ldcp, VSW_CONN_RESET); + } + } + + return (rv); } /*
--- a/usr/src/uts/sun4v/sys/vsw.h Mon Feb 26 09:37:10 2007 -0800 +++ b/usr/src/uts/sun4v/sys/vsw.h Mon Feb 26 09:52:03 2007 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -388,6 +388,22 @@ } vsw_ctrl_task_t; /* + * State of connection to peer. Some of these states + * can be mapped to LDC events as follows: + * + * VSW_CONN_RESET -> LDC_RESET_EVT + * VSW_CONN_UP -> LDC_UP_EVT + */ +#define VSW_CONN_UP 0x1 /* Connection come up */ +#define VSW_CONN_RESET 0x2 /* Connection reset */ +#define VSW_CONN_RESTART 0x4 /* Restarting handshake on connection */ + +typedef struct vsw_conn_evt { + uint16_t evt; /* Connection event */ + vsw_ldc_t *ldcp; +} vsw_conn_evt_t; + +/* * Vsw queue -- largely modeled after squeue * * VSW_QUEUE_RUNNING, vqueue thread for queue is running.