Mercurial > illumos > illumos-gate
changeset 6480:d50f51fd3502
6622004 vdc should support multi-pathing using multiple vdisk servers
author | narayan |
---|---|
date | Wed, 23 Apr 2008 22:53:05 -0700 |
parents | 2fc187a28649 |
children | 11f45b511199 |
files | usr/src/uts/sun4v/io/vdc.c usr/src/uts/sun4v/sys/vdc.h |
diffstat | 2 files changed, 420 insertions(+), 209 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/sun4v/io/vdc.c Wed Apr 23 17:35:16 2008 -0700 +++ b/usr/src/uts/sun4v/io/vdc.c Wed Apr 23 22:53:05 2008 -0700 @@ -124,7 +124,7 @@ /* setup */ static void vdc_min(struct buf *bufp); static int vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen); -static int vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node); +static int vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr); static int vdc_start_ldc_connection(vdc_t *vdc); static int vdc_create_device_nodes(vdc_t *vdc); static int vdc_create_device_nodes_efi(vdc_t *vdc); @@ -134,10 +134,12 @@ static void vdc_create_err_kstats(vdc_t *vdc); static void vdc_set_err_kstats(vdc_t *vdc); static int vdc_get_md_node(dev_info_t *dip, md_t **mdpp, - mde_cookie_t *vd_nodep, mde_cookie_t *vd_portp); -static int vdc_get_ldc_id(md_t *, mde_cookie_t, uint64_t *); + mde_cookie_t *vd_nodep); +static int vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep); +static void vdc_fini_ports(vdc_t *vdc); +static void vdc_switch_server(vdc_t *vdcp); static int vdc_do_ldc_up(vdc_t *vdc); -static void vdc_terminate_ldc(vdc_t *vdc); +static void vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr); static int vdc_init_descriptor_ring(vdc_t *vdc); static void vdc_destroy_descriptor_ring(vdc_t *vdc); static int vdc_setup_devid(vdc_t *vdc); @@ -226,6 +228,7 @@ static int vdc_hshake_retries = 3; static int vdc_timeout = 0; /* units: seconds */ +static int vdc_ldcup_timeout = 1; /* units: seconds */ static uint64_t vdc_hz_min_ldc_delay; static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC; @@ -448,8 +451,11 @@ /* * try and disable callbacks to prevent another handshake */ - rv = ldc_set_cb_mode(vdc->ldc_handle, LDC_CB_DISABLE); - DMSG(vdc, 0, "callback disabled (rv=%d)\n", rv); + if (vdc->curr_server != NULL) { + rv = ldc_set_cb_mode(vdc->curr_server->ldc_handle, + LDC_CB_DISABLE); + DMSG(vdc, 0, "callback disabled (rv=%d)\n", rv); + } if (vdc->initialized & VDC_THREAD) { mutex_enter(&vdc->read_lock); @@ -484,8 +490,7 @@ if (vdc->initialized & VDC_DRING) vdc_destroy_descriptor_ring(vdc); - if (vdc->initialized & VDC_LDC) - vdc_terminate_ldc(vdc); + vdc_fini_ports(vdc); if (vdc->failfast_thread) { failfast_tid = vdc->failfast_thread->t_did; @@ -575,7 +580,7 @@ vdc_t *vdc = NULL; int status; md_t *mdp; - mde_cookie_t vd_node, vd_port; + mde_cookie_t vd_node; ASSERT(dip != NULL); @@ -606,7 +611,6 @@ vdc->vdisk_label = VD_DISK_LABEL_UNK; vdc->state = VDC_STATE_INIT; vdc->lifecycle = VDC_LC_ATTACHING; - vdc->ldc_state = 0; vdc->session_id = 0; vdc->block_size = DEV_BSIZE; vdc->max_xfer_sz = maxphys / DEV_BSIZE; @@ -651,28 +655,19 @@ vdc->initialized |= VDC_LOCKS; /* get device and port MD node for this disk instance */ - if (vdc_get_md_node(dip, &mdp, &vd_node, &vd_port) != 0) { + if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) { cmn_err(CE_NOTE, "[%d] Could not get machine description node", instance); return (DDI_FAILURE); } - /* set the connection timeout */ - if (vd_port == NULL || (md_get_prop_val(mdp, vd_port, - VDC_MD_TIMEOUT, &vdc->ctimeout) != 0)) { - vdc->ctimeout = 0; - } - - /* initialise LDC channel which will be used to communicate with vds */ - status = vdc_do_ldc_init(vdc, mdp, vd_node); + if (vdc_init_ports(vdc, mdp, vd_node) != 0) { + cmn_err(CE_NOTE, "[%d] Error initialising ports", instance); + return (DDI_FAILURE); + } (void) md_fini_handle(mdp); - if (status != 0) { - cmn_err(CE_NOTE, "[%d] Couldn't initialize LDC", instance); - goto return_status; - } - /* initialize the thread responsible for managing state with server */ vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread, vdc, 0, &p0, TS_RUN, minclsyspri); @@ -763,74 +758,66 @@ } static int -vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node) +vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr) { int status = 0; ldc_status_t ldc_state; ldc_attr_t ldc_attr; - uint64_t ldc_id = 0; ASSERT(vdc != NULL); - - vdc->initialized |= VDC_LDC; - - if ((status = vdc_get_ldc_id(mdp, vd_node, &ldc_id)) != 0) { - DMSG(vdc, 0, "[%d] Failed to get LDC channel ID property", - vdc->instance); - return (EIO); - } - - DMSGX(0, "[%d] LDC id is 0x%lx\n", vdc->instance, ldc_id); - - vdc->ldc_id = ldc_id; + ASSERT(srvr != NULL); ldc_attr.devclass = LDC_DEV_BLK; ldc_attr.instance = vdc->instance; ldc_attr.mode = LDC_MODE_UNRELIABLE; /* unreliable transport */ ldc_attr.mtu = VD_LDC_MTU; - if ((vdc->initialized & VDC_LDC_INIT) == 0) { - status = ldc_init(ldc_id, &ldc_attr, &vdc->ldc_handle); + if ((srvr->state & VDC_LDC_INIT) == 0) { + status = ldc_init(srvr->ldc_id, &ldc_attr, + &srvr->ldc_handle); if (status != 0) { DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d", - vdc->instance, ldc_id, status); + vdc->instance, srvr->ldc_id, status); return (status); } - vdc->initialized |= VDC_LDC_INIT; - } - status = ldc_status(vdc->ldc_handle, &ldc_state); + srvr->state |= VDC_LDC_INIT; + } + status = ldc_status(srvr->ldc_handle, &ldc_state); if (status != 0) { DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]", vdc->instance, status); - return (status); - } - vdc->ldc_state = ldc_state; - - if ((vdc->initialized & VDC_LDC_CB) == 0) { - status = ldc_reg_callback(vdc->ldc_handle, vdc_handle_cb, - (caddr_t)vdc); + goto init_exit; + } + srvr->ldc_state = ldc_state; + + if ((srvr->state & VDC_LDC_CB) == 0) { + status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb, + (caddr_t)srvr); if (status != 0) { DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)", vdc->instance, status); - return (status); + goto init_exit; } - vdc->initialized |= VDC_LDC_CB; - } - - vdc->initialized |= VDC_LDC; + srvr->state |= VDC_LDC_CB; + } /* * At this stage we have initialised LDC, we will now try and open * the connection. */ - if (vdc->ldc_state == LDC_INIT) { - status = ldc_open(vdc->ldc_handle); + if (srvr->ldc_state == LDC_INIT) { + status = ldc_open(srvr->ldc_handle); if (status != 0) { DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d", - vdc->instance, vdc->ldc_id, status); - return (status); + vdc->instance, srvr->ldc_id, status); + goto init_exit; } - vdc->initialized |= VDC_LDC_OPEN; + srvr->state |= VDC_LDC_OPEN; + } + +init_exit: + if (status) { + vdc_terminate_ldc(vdc, srvr); } return (status); @@ -857,10 +844,14 @@ { int status; + ASSERT(vdcp != NULL); + + ASSERT(MUTEX_HELD(&vdcp->lock)); + DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n", vdcp->state); - status = ldc_down(vdcp->ldc_handle); + status = ldc_down(vdcp->curr_server->ldc_handle); DMSG(vdcp, 0, "ldc_down() = %d\n", status); vdcp->initialized &= ~VDC_HANDSHAKE; @@ -1636,8 +1627,8 @@ vdc->instance, status); if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) { DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: " - "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle, - status, msglen); + "id(%lx) rv(%d) size(%ld)", vdc->instance, + vdc->curr_server->ldc_handle, status, msglen); if (msglen != sizeof (vio_ver_msg_t)) status = ENOMSG; } @@ -1731,8 +1722,8 @@ if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) { DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: " - "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle, - status, msglen); + "id(%lx) rv(%d) size(%ld)", vdc->instance, + vdc->curr_server->ldc_handle, status, msglen); if (msglen != sizeof (vio_ver_msg_t)) status = ENOMSG; } @@ -2037,7 +2028,7 @@ delay_time = vdc_ldc_read_init_delay; loop: len = *nbytesp; - status = ldc_read(vdc->ldc_handle, (caddr_t)msgp, &len); + status = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)msgp, &len); switch (status) { case EAGAIN: delay_time *= 2; @@ -2060,7 +2051,7 @@ * read state as pending. Otherwise, set the state * back to idle. */ - status = ldc_chkq(vdc->ldc_handle, &q_has_pkts); + status = ldc_chkq(vdc->curr_server->ldc_handle, &q_has_pkts); if (status == 0 && !q_has_pkts) vdc->read_state = VDC_READ_IDLE; @@ -2166,7 +2157,7 @@ delay_ticks = vdc_hz_min_ldc_delay; do { size = *msglen; - status = ldc_write(vdc->ldc_handle, pkt, &size); + status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size); if (status == EWOULDBLOCK) { delay(delay_ticks); /* geometric backoff */ @@ -2208,16 +2199,14 @@ * vdc_get_md_node * * Description: - * Get the MD, the device node and the port node for the given - * disk instance. The caller is responsible for cleaning up the - * reference to the returned MD (mdpp) by calling md_fini_handle(). + * Get the MD, the device node for the given disk instance. The + * caller is responsible for cleaning up the reference to the + * returned MD (mdpp) by calling md_fini_handle(). * * Arguments: * dip - dev info pointer for this instance of the device driver. * mdpp - the returned MD. * vd_nodep - the returned device node. - * vd_portp - the returned port node. The returned port node is NULL - * if no port node is found. * * Return Code: * 0 - Success. @@ -2225,15 +2214,13 @@ * ENXIO - Unexpected error communicating with MD framework */ static int -vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep, - mde_cookie_t *vd_portp) +vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep) { int status = ENOENT; char *node_name = NULL; md_t *mdp = NULL; int num_nodes; int num_vdevs; - int num_vports; mde_cookie_t rootnode; mde_cookie_t *listp = NULL; boolean_t found_inst = B_FALSE; @@ -2327,18 +2314,6 @@ *vd_nodep = listp[idx]; *mdpp = mdp; - - num_vports = md_scan_dag(mdp, *vd_nodep, - md_find_name(mdp, VDC_MD_PORT_NAME), - md_find_name(mdp, "fwd"), listp); - - if (num_vports != 1) { - DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n", - VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME, num_vports); - } - - *vd_portp = (num_vports == 0)? NULL: listp[0]; - done: kmem_free(listp, listsz); return (status); @@ -2346,99 +2321,200 @@ /* * Function: - * vdc_get_ldc_id() + * vdc_init_ports * * Description: - * This function gets the 'ldc-id' for this particular instance of vdc. - * The id returned is the guest domain channel endpoint LDC uses for - * communication with vds. + * Initialize all the ports for this vdisk instance. * * Arguments: - * mdp - pointer to the machine description. - * vd_node - the vdisk element from the MD. - * ldc_id - pointer to variable used to return the 'ldc-id' found. + * vdc - soft state pointer for this instance of the device driver. + * mdp - md pointer + * vd_nodep - device md node. * * Return Code: * 0 - Success. * ENOENT - Expected node or property did not exist. */ static int -vdc_get_ldc_id(md_t *mdp, mde_cookie_t vd_node, uint64_t *ldc_id) +vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) { - mde_cookie_t *chanp = NULL; + int status = 0; + int idx; + int num_nodes; + int num_vports; + int num_chans; int listsz; - int num_chans; - int num_nodes; - int status = 0; - + mde_cookie_t vd_port; + mde_cookie_t *chanp = NULL; + mde_cookie_t *portp = NULL; + vdc_server_t *srvr; + vdc_server_t *prev_srvr = NULL; + + /* + * We now walk the MD nodes to find the port nodes for this vdisk. + */ num_nodes = md_node_count(mdp); ASSERT(num_nodes > 0); listsz = num_nodes * sizeof (mde_cookie_t); /* allocate memory for nodes */ + portp = kmem_zalloc(listsz, KM_SLEEP); chanp = kmem_zalloc(listsz, KM_SLEEP); - /* get the channels for this node */ - num_chans = md_scan_dag(mdp, vd_node, - md_find_name(mdp, VDC_MD_CHAN_NAME), - md_find_name(mdp, "fwd"), chanp); - - /* expecting at least one channel */ - if (num_chans <= 0) { - cmn_err(CE_NOTE, "No '%s' node for '%s' port", - VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); + num_vports = md_scan_dag(mdp, vd_nodep, + md_find_name(mdp, VDC_MD_PORT_NAME), + md_find_name(mdp, "fwd"), portp); + if (num_vports == 0) { + DMSGX(0, "Found no '%s' node for '%s' port\n", + VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); status = ENOENT; goto done; - - } else if (num_chans != 1) { - DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n", - VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, num_chans); + } + + DMSGX(1, "Found %d '%s' node(s) for '%s' port\n", + num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); + + vdc->num_servers = 0; + for (idx = 0; idx < num_vports; idx++) { + + /* initialize this port */ + vd_port = portp[idx]; + srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); + srvr->vdcp = vdc; + + /* get port id */ + if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { + cmn_err(CE_NOTE, "vDisk port '%s' property not found", + VDC_MD_ID); + kmem_free(srvr, sizeof (vdc_server_t)); + continue; + } + + /* set the connection timeout */ + if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT, + &srvr->ctimeout) != 0) { + srvr->ctimeout = 0; + } + + /* get the ldc id */ + num_chans = md_scan_dag(mdp, vd_port, + md_find_name(mdp, VDC_MD_CHAN_NAME), + md_find_name(mdp, "fwd"), chanp); + + /* expecting at least one channel */ + if (num_chans <= 0) { + cmn_err(CE_NOTE, "No '%s' node for '%s' port", + VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); + kmem_free(srvr, sizeof (vdc_server_t)); + continue; + } else if (num_chans != 1) { + DMSGX(0, "Expected 1 '%s' node for '%s' port, " + "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, + num_chans); + } + + /* + * We use the first channel found (index 0), irrespective of how + * many are there in total. + */ + if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, + &srvr->ldc_id) != 0) { + cmn_err(CE_NOTE, "Channel '%s' property not found", + VDC_MD_ID); + kmem_free(srvr, sizeof (vdc_server_t)); + continue; + } + + /* + * now initialise LDC channel which will be used to + * communicate with this server + */ + if (vdc_do_ldc_init(vdc, srvr) != 0) { + kmem_free(srvr, sizeof (vdc_server_t)); + continue; + } + + /* add server to list */ + if (prev_srvr) { + prev_srvr->next = srvr; + } else { + vdc->server_list = srvr; + prev_srvr = srvr; + } + + /* inc numbers of servers */ + vdc->num_servers++; } /* - * We use the first channel found (index 0), irrespective of how - * many are there in total. + * Adjust the max number of handshake retries to match + * the number of vdisk servers. */ - if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, ldc_id) != 0) { - cmn_err(CE_NOTE, "Channel '%s' property not found", VDC_MD_ID); + if (vdc_hshake_retries < vdc->num_servers) + vdc_hshake_retries = vdc->num_servers; + + /* pick first server as current server */ + if (vdc->server_list != NULL) { + vdc->curr_server = vdc->server_list; + status = 0; + } else { status = ENOENT; } done: kmem_free(chanp, listsz); + kmem_free(portp, listsz); return (status); } + +/* + * Function: + * vdc_do_ldc_up + * + * Description: + * Bring the channel for the current server up. + * + * Arguments: + * vdc - soft state pointer for this instance of the device driver. + * + * Return Code: + * 0 - Success. + * EINVAL - Driver is detaching / LDC error + * ECONNREFUSED - Other end is not listening + */ static int vdc_do_ldc_up(vdc_t *vdc) { int status; ldc_status_t ldc_state; + ASSERT(MUTEX_HELD(&vdc->lock)); + DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", - vdc->instance, vdc->ldc_id); + vdc->instance, vdc->curr_server->ldc_id); if (vdc->lifecycle == VDC_LC_DETACHING) return (EINVAL); - if ((status = ldc_up(vdc->ldc_handle)) != 0) { + if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) { switch (status) { case ECONNREFUSED: /* listener not ready at other end */ DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", - vdc->instance, vdc->ldc_id, status); + vdc->instance, vdc->curr_server->ldc_id, status); status = 0; break; default: DMSG(vdc, 0, "[%d] Failed to bring up LDC: " - "channel=%ld, err=%d", vdc->instance, vdc->ldc_id, - status); + "channel=%ld, err=%d", vdc->instance, + vdc->curr_server->ldc_id, status); break; } } - if (ldc_status(vdc->ldc_handle, &ldc_state) == 0) { - vdc->ldc_state = ldc_state; + if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) { + vdc->curr_server->ldc_state = ldc_state; if (ldc_state == LDC_UP) { DMSG(vdc, 0, "[%d] LDC channel already up\n", vdc->instance); @@ -2458,35 +2534,73 @@ * * Arguments: * vdc - soft state pointer for this instance of the device driver. + * srvr - vdc per-server info structure * * Return Code: * None */ static void -vdc_terminate_ldc(vdc_t *vdc) +vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr) { int instance = ddi_get_instance(vdc->dip); + if (srvr->state & VDC_LDC_OPEN) { + DMSG(vdc, 0, "[%d] ldc_close()\n", instance); + (void) ldc_close(srvr->ldc_handle); + } + if (srvr->state & VDC_LDC_CB) { + DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); + (void) ldc_unreg_callback(srvr->ldc_handle); + } + if (srvr->state & VDC_LDC_INIT) { + DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); + (void) ldc_fini(srvr->ldc_handle); + srvr->ldc_handle = NULL; + } + + srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN); +} + +/* + * Function: + * vdc_fini_ports() + * + * Description: + * Finalize all ports by closing the channel associated with each + * port and also freeing the server structure. + * + * Arguments: + * vdc - soft state pointer for this instance of the device driver. + * + * Return Code: + * None + */ +static void +vdc_fini_ports(vdc_t *vdc) +{ + int instance = ddi_get_instance(vdc->dip); + vdc_server_t *srvr, *prev_srvr; + ASSERT(vdc != NULL); ASSERT(mutex_owned(&vdc->lock)); DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); - if (vdc->initialized & VDC_LDC_OPEN) { - DMSG(vdc, 0, "[%d] ldc_close()\n", instance); - (void) ldc_close(vdc->ldc_handle); - } - if (vdc->initialized & VDC_LDC_CB) { - DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); - (void) ldc_unreg_callback(vdc->ldc_handle); - } - if (vdc->initialized & VDC_LDC) { - DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); - (void) ldc_fini(vdc->ldc_handle); - vdc->ldc_handle = NULL; - } - - vdc->initialized &= ~(VDC_LDC | VDC_LDC_CB | VDC_LDC_OPEN); + srvr = vdc->server_list; + + while (srvr) { + + vdc_terminate_ldc(vdc, srvr); + + /* next server */ + prev_srvr = srvr; + srvr = srvr->next; + + /* free server */ + kmem_free(prev_srvr, sizeof (vdc_server_t)); + } + + vdc->server_list = NULL; } /* -------------------------------------------------------------------------- */ @@ -2518,7 +2632,6 @@ ASSERT(vdc != NULL); ASSERT(mutex_owned(&vdc->lock)); - ASSERT(vdc->ldc_handle != NULL); /* ensure we have enough room to store max sized block */ ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); @@ -2546,8 +2659,8 @@ vdc->dring_len = VD_DRING_LEN; status = ldc_mem_dring_create(vdc->dring_len, - vdc->dring_entry_size, &vdc->ldc_dring_hdl); - if ((vdc->ldc_dring_hdl == NULL) || (status != 0)) { + vdc->dring_entry_size, &vdc->dring_hdl); + if ((vdc->dring_hdl == NULL) || (status != 0)) { DMSG(vdc, 0, "[%d] Descriptor ring creation failed", vdc->instance); return (status); @@ -2560,26 +2673,27 @@ vdc->dring_cookie = kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); - status = ldc_mem_dring_bind(vdc->ldc_handle, vdc->ldc_dring_hdl, + status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, + vdc->dring_hdl, LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, &vdc->dring_cookie[0], &vdc->dring_cookie_count); if (status != 0) { DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " "(%lx) to channel (%lx) status=%d\n", - vdc->instance, vdc->ldc_dring_hdl, - vdc->ldc_handle, status); + vdc->instance, vdc->dring_hdl, + vdc->curr_server->ldc_handle, status); return (status); } ASSERT(vdc->dring_cookie_count == 1); vdc->initialized |= VDC_DRING_BOUND; } - status = ldc_mem_dring_info(vdc->ldc_dring_hdl, &vdc->dring_mem_info); + status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); if (status != 0) { DMSG(vdc, 0, "[%d] Failed to get info for descriptor ring (%lx)\n", - vdc->instance, vdc->ldc_dring_hdl); + vdc->instance, vdc->dring_hdl); return (status); } @@ -2604,7 +2718,7 @@ dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); dep->hdr.dstate = VIO_DESC_FREE; - status = ldc_mem_alloc_handle(vdc->ldc_handle, + status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle, &vdc->local_dring[i].desc_mhdl); if (status != 0) { DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" @@ -2691,26 +2805,26 @@ if (vdc->initialized & VDC_DRING_BOUND) { DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); - status = ldc_mem_dring_unbind(vdc->ldc_dring_hdl); + status = ldc_mem_dring_unbind(vdc->dring_hdl); if (status == 0) { vdc->initialized &= ~VDC_DRING_BOUND; } else { DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", - vdc->instance, status, vdc->ldc_dring_hdl); + vdc->instance, status, vdc->dring_hdl); } kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); } if (vdc->initialized & VDC_DRING_INIT) { DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); - status = ldc_mem_dring_destroy(vdc->ldc_dring_hdl); + status = ldc_mem_dring_destroy(vdc->dring_hdl); if (status == 0) { - vdc->ldc_dring_hdl = NULL; + vdc->dring_hdl = NULL; bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); vdc->initialized &= ~VDC_DRING_INIT; } else { DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", - vdc->instance, status, vdc->ldc_dring_hdl); + vdc->instance, status, vdc->dring_hdl); } } } @@ -3177,7 +3291,8 @@ retries = 0; for (;;) { msglen = sizeof (dmsg); - rv = ldc_read(vdc->ldc_handle, (caddr_t)&dmsg, &msglen); + rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg, + &msglen); if (rv) { rv = EINVAL; break; @@ -3477,13 +3592,23 @@ { ldc_status_t ldc_state; int rv = 0; - - vdc_t *vdc = (vdc_t *)(void *)arg; + vdc_server_t *srvr = (vdc_server_t *)(void *)arg; + vdc_t *vdc = srvr->vdcp; ASSERT(vdc != NULL); DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num); + /* If callback is not for the current server, ignore it */ + mutex_enter(&vdc->lock); + + if (vdc->curr_server != srvr) { + DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n", + vdc->instance, event, srvr->id); + mutex_exit(&vdc->lock); + return (LDC_SUCCESS); + } + /* * Depending on the type of event that triggered this callback, * we modify the handshake state or read the data. @@ -3495,16 +3620,16 @@ if (event & LDC_EVT_UP) { DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance); - mutex_enter(&vdc->lock); - /* get LDC state */ - rv = ldc_status(vdc->ldc_handle, &ldc_state); + rv = ldc_status(srvr->ldc_handle, &ldc_state); if (rv != 0) { DMSG(vdc, 0, "[%d] Couldn't get LDC status %d", vdc->instance, rv); + mutex_exit(&vdc->lock); return (LDC_SUCCESS); } - if (vdc->ldc_state != LDC_UP && ldc_state == LDC_UP) { + if (srvr->ldc_state != LDC_UP && + ldc_state == LDC_UP) { /* * Reset the transaction sequence numbers when * LDC comes up. We then kick off the handshake @@ -3512,11 +3637,9 @@ */ vdc->seq_num = 1; vdc->seq_num_reply = 0; - vdc->ldc_state = ldc_state; + srvr->ldc_state = ldc_state; cv_signal(&vdc->initwait_cv); } - - mutex_exit(&vdc->lock); } if (event & LDC_EVT_READ) { @@ -3525,6 +3648,7 @@ cv_signal(&vdc->read_cv); vdc->read_state = VDC_READ_PENDING; mutex_exit(&vdc->read_lock); + mutex_exit(&vdc->lock); /* that's all we have to do - no need to handle DOWN/RESET */ return (LDC_SUCCESS); @@ -3534,7 +3658,6 @@ DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance); - mutex_enter(&vdc->lock); /* * Need to wake up any readers so they will * detect that a reset has occurred. @@ -3552,8 +3675,9 @@ cv_signal(&vdc->initwait_cv); } - mutex_exit(&vdc->lock); - } + } + + mutex_exit(&vdc->lock); if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received", @@ -3991,6 +4115,42 @@ vdcp->local_dring_backup_len = vdcp->dring_len; } +static void +vdc_switch_server(vdc_t *vdcp) +{ + int rv; + vdc_server_t *curr_server, *new_server; + + ASSERT(MUTEX_HELD(&vdcp->lock)); + + /* if there is only one server return back */ + if (vdcp->num_servers == 1) { + return; + } + + /* Get current and next server */ + curr_server = vdcp->curr_server; + new_server = + (curr_server->next) ? curr_server->next : vdcp->server_list; + ASSERT(curr_server != new_server); + + /* bring current server's channel down */ + rv = ldc_down(curr_server->ldc_handle); + if (rv) { + DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n", + vdcp->instance, curr_server->id); + return; + } + + /* switch the server */ + vdcp->curr_server = new_server; + + cmn_err(CE_NOTE, "Successfully failed over from VDS on port@%ld to " + "VDS on port@%ld.\n", curr_server->id, new_server->id); + DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n", + vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id); +} + /* -------------------------------------------------------------------------- */ /* @@ -4017,9 +4177,10 @@ static void vdc_process_msg_thread(vdc_t *vdcp) { - int status; - int ctimeout; - timeout_id_t tmid = 0; + int status; + int ctimeout; + timeout_id_t tmid = 0; + clock_t ldcup_timeout = 0; mutex_enter(&vdcp->lock); @@ -4048,54 +4209,87 @@ * If some reset have occurred while establishing * the connection, we already have a timeout armed * and in that case we don't need to arm a new one. + * + * The same rule applies when there are multiple vds'. + * If either a connection cannot be established or + * the handshake times out, the connection thread will + * try another server. The 'ctimeout' will report + * back an error after it expires irrespective of + * whether the vdisk is trying to connect to just + * one or multiple servers. */ ctimeout = (vdc_timeout != 0)? - vdc_timeout : vdcp->ctimeout; + vdc_timeout : vdcp->curr_server->ctimeout; if (ctimeout != 0 && tmid == 0) { tmid = timeout(vdc_connection_timeout, vdcp, - ctimeout * drv_usectohz(1000000)); + ctimeout * drv_usectohz(MICROSEC)); } - /* Check if have re-initializing repeatedly */ - if (vdcp->hshake_cnt++ > vdc_hshake_retries && + /* Check if we are re-initializing repeatedly */ + if (vdcp->hshake_cnt > vdc_hshake_retries && vdcp->lifecycle != VDC_LC_ONLINE) { + + DMSG(vdcp, 0, "[%d] too many handshakes,cnt=%d", + vdcp->instance, vdcp->hshake_cnt); cmn_err(CE_NOTE, "[%d] disk access failed.\n", vdcp->instance); vdcp->state = VDC_STATE_DETACH; break; } + /* Switch to STATE_DETACH if drv is detaching */ + if (vdcp->lifecycle == VDC_LC_DETACHING) { + vdcp->state = VDC_STATE_DETACH; + break; + } + + /* Switch server */ + if (vdcp->hshake_cnt > 0) + vdc_switch_server(vdcp); + vdcp->hshake_cnt++; + /* Bring up connection with vds via LDC */ status = vdc_start_ldc_connection(vdcp); - if (status == EINVAL) { - DMSG(vdcp, 0, "[%d] Could not start LDC", - vdcp->instance); - vdcp->state = VDC_STATE_DETACH; - } else { + if (status != EINVAL) { vdcp->state = VDC_STATE_INIT_WAITING; } break; case VDC_STATE_INIT_WAITING: - /* - * Let the callback event move us on - * when channel is open to server - */ - while (vdcp->ldc_state != LDC_UP) { + /* if channel is UP, start negotiation */ + if (vdcp->curr_server->ldc_state == LDC_UP) { + vdcp->state = VDC_STATE_NEGOTIATE; + break; + } + + /* check if only one server exists */ + if (vdcp->num_servers == 1) { cv_wait(&vdcp->initwait_cv, &vdcp->lock); - if (vdcp->state != VDC_STATE_INIT_WAITING) { - DMSG(vdcp, 0, - "state moved to %d out from under us...\n", - vdcp->state); - + } else { + /* + * wait for LDC_UP, if it times out, switch + * to another server. + */ + ldcup_timeout = ddi_get_lbolt() + + (vdc_ldcup_timeout * + drv_usectohz(MICROSEC)); + status = cv_timedwait(&vdcp->initwait_cv, + &vdcp->lock, ldcup_timeout); + if (status == -1 && + vdcp->state == VDC_STATE_INIT_WAITING && + vdcp->curr_server->ldc_state != LDC_UP) { + /* timed out & still waiting */ + vdcp->state = VDC_STATE_INIT; break; } } - if (vdcp->state == VDC_STATE_INIT_WAITING && - vdcp->ldc_state == LDC_UP) { - vdcp->state = VDC_STATE_NEGOTIATE; + + if (vdcp->state != VDC_STATE_INIT_WAITING) { + DMSG(vdcp, 0, + "state moved to %d out from under us...\n", + vdcp->state); } break;
--- a/usr/src/uts/sun4v/sys/vdc.h Wed Apr 23 17:35:16 2008 -0700 +++ b/usr/src/uts/sun4v/sys/vdc.h Wed Apr 23 22:53:05 2008 -0700 @@ -54,18 +54,14 @@ #define VDC_LOCKS 0x0002 #define VDC_MINOR 0x0004 #define VDC_THREAD 0x0008 -#define VDC_LDC 0x0010 -#define VDC_LDC_INIT 0x0020 -#define VDC_LDC_CB 0x0040 -#define VDC_LDC_OPEN 0x0080 -#define VDC_DRING_INIT 0x0100 /* The DRing was created */ -#define VDC_DRING_BOUND 0x0200 /* The DRing was bound to an LDC channel */ -#define VDC_DRING_LOCAL 0x0400 /* The local private DRing was allocated */ -#define VDC_DRING_ENTRY 0x0800 /* At least one DRing entry was initialised */ +#define VDC_DRING_INIT 0x0010 /* The DRing was created */ +#define VDC_DRING_BOUND 0x0020 /* The DRing was bound to an LDC channel */ +#define VDC_DRING_LOCAL 0x0040 /* The local private DRing was allocated */ +#define VDC_DRING_ENTRY 0x0080 /* At least one DRing entry was initialised */ #define VDC_DRING (VDC_DRING_INIT | VDC_DRING_BOUND | \ VDC_DRING_LOCAL | VDC_DRING_ENTRY) -#define VDC_HANDSHAKE 0x1000 /* Indicates if a handshake is in progress */ -#define VDC_HANDSHAKE_STOP 0x2000 /* stop further handshakes */ +#define VDC_HANDSHAKE 0x0100 /* Indicates if a handshake is in progress */ +#define VDC_HANDSHAKE_STOP 0x0200 /* stop further handshakes */ /* * Definitions of strings to be used to create device node properties. @@ -240,6 +236,28 @@ } vdc_io_t; /* + * Per vDisk server channel states + */ +#define VDC_LDC_INIT 0x0001 +#define VDC_LDC_CB 0x0002 +#define VDC_LDC_OPEN 0x0004 +#define VDC_LDC (VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN) + +/* + * vDisk server information + */ +typedef struct vdc_server { + struct vdc_server *next; /* Next server */ + struct vdc *vdcp; /* Ptr to vdc struct */ + uint64_t id; /* Server port id */ + uint64_t state; /* Server state */ + uint64_t ldc_id; /* Server LDC id */ + ldc_handle_t ldc_handle; /* Server LDC handle */ + ldc_status_t ldc_state; /* Server LDC state */ + uint64_t ctimeout; /* conn tmout (secs) */ +} vdc_server_t; + +/* * vdc soft state structure */ typedef struct vdc { @@ -298,7 +316,6 @@ struct dk_cinfo *cinfo; /* structure to store DKIOCINFO data */ struct dk_minfo *minfo; /* structure for DKIOCGMEDIAINFO data */ ddi_devid_t devid; /* device id */ - uint64_t ctimeout; /* connection timeout in seconds */ boolean_t ctimeout_reached; /* connection timeout has expired */ /* @@ -327,6 +344,7 @@ kstat_t *io_stats; kstat_t *err_stats; + ldc_dring_handle_t dring_hdl; /* dring handle */ ldc_mem_info_t dring_mem_info; /* dring information */ uint_t dring_curr_idx; /* current index */ uint32_t dring_len; /* dring length */ @@ -343,10 +361,9 @@ int local_dring_backup_tail; /* backup dring tail */ int local_dring_backup_len; /* backup dring len */ - uint64_t ldc_id; /* LDC channel id */ - ldc_status_t ldc_state; /* LDC channel state */ - ldc_handle_t ldc_handle; /* LDC handle */ - ldc_dring_handle_t ldc_dring_hdl; /* LDC dring handle */ + int num_servers; /* no. of servers */ + vdc_server_t *server_list; /* vdisk server list */ + vdc_server_t *curr_server; /* curr vdisk server */ } vdc_t; /*