Mercurial > illumos > illumos-gate
changeset 3166:235cc158a526
6418780 vswitch needs to be able to process updates to its MD node
6492078 vds does not handle ldc_close failures
6492423 vSwitch multi-ring code hangs when queue thread not started
6492705 vsw warning messages should identify device instance number
6492706 changing switching mode in one vsw instance should not affect other instance
6493179 Guests fail disk identification on net install if only backup partition exists.
author | sg70180 |
---|---|
date | Fri, 24 Nov 2006 06:52:47 -0800 |
parents | 63d5bf0b6167 |
children | e74c60596d27 |
files | usr/src/uts/sun4v/io/vds.c usr/src/uts/sun4v/io/vsw.c usr/src/uts/sun4v/sys/vsw.h |
diffstat | 3 files changed, 894 insertions(+), 463 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/sun4v/io/vds.c Fri Nov 24 05:40:55 2006 -0800 +++ b/usr/src/uts/sun4v/io/vds.c Fri Nov 24 06:52:47 2006 -0800 @@ -55,7 +55,7 @@ #define VDS_MDEG 0x02 /* Virtual disk server tunable parameters */ -#define VDS_LDC_RETRIES 3 +#define VDS_LDC_RETRIES 5 #define VDS_LDC_DELAY 1000 /* usec */ #define VDS_NCHAINS 32 @@ -871,79 +871,11 @@ return (status); } -/* - * Open any slices which have become non-empty as a result of performing a - * set-VTOC operation for the client. - * - * When serving a full disk, vds attempts to exclusively open all of the - * disk's slices to prevent another thread or process in the service domain - * from "stealing" a slice or from performing I/O to a slice while a vds - * client is accessing it. Unfortunately, underlying drivers, such as sd(7d) - * and cmdk(7d), return an error when attempting to open the device file for a - * slice which is currently empty according to the VTOC. This driver behavior - * means that vds must skip opening empty slices when initializing a vdisk for - * full-disk service and try to open slices that become non-empty (via a - * set-VTOC operation) during use of the full disk in order to begin serving - * such slices to the client. This approach has an inherent (and therefore - * unavoidable) race condition; it also means that failure to open a - * newly-non-empty slice has different semantics than failure to open an - * initially-non-empty slice: Due to driver bahavior, opening a - * newly-non-empty slice is a necessary side effect of vds performing a - * (successful) set-VTOC operation for a client on an in-service (and in-use) - * disk in order to begin serving the slice; failure of this side-effect - * operation does not mean that the client's set-VTOC operation failed or that - * operations on other slices must fail. Therefore, this function prints an - * error message on failure to open a slice, but does not return an error to - * its caller--unlike failure to open a slice initially, which results in an - * error that prevents serving the vdisk (and thereby requires an - * administrator to resolve the problem). Note that, apart from another - * thread or process opening a new slice during the race-condition window, - * failure to open a slice in this function will likely indicate an underlying - * drive problem, which will also likely become evident in errors returned by - * operations on other slices, and which will require administrative - * intervention and possibly servicing the drive. - */ -static void -vd_open_new_slices(vd_t *vd) -{ - int status; - struct vtoc vtoc; - - /* Get the (new) partitions for updated slice sizes */ - if ((status = vd_read_vtoc(vd->ldi_handle[0], &vtoc, - &vd->vdisk_label)) != 0) { - PR0("vd_read_vtoc returned error %d", status); - return; - } - - /* Open any newly-non-empty slices */ - for (int slice = 0; slice < vd->nslices; slice++) { - /* Skip zero-length slices */ - if (vtoc.v_part[slice].p_size == 0) { - if (vd->ldi_handle[slice] != NULL) - PR0("Open slice %u now has zero length", slice); - continue; - } - - /* Skip already-open slices */ - if (vd->ldi_handle[slice] != NULL) - continue; - - PR0("Opening newly-non-empty slice %u", slice); - if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, - vd_open_flags, kcred, &vd->ldi_handle[slice], - vd->vds->ldi_ident)) != 0) { - PR0("ldi_open_by_dev() returned errno %d " - "for slice %u", status, slice); - } - } -} - #define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) static int vd_ioctl(vd_task_t *task) { - int i, status; + int i, status, rc; void *buf = NULL; struct dk_geom dk_geom = {0}; struct vtoc vtoc = {0}; @@ -1031,8 +963,13 @@ kmem_free(buf, request->nbytes); if (vd->vdisk_type == VD_DISK_TYPE_DISK && (request->operation == VD_OP_SET_VTOC || - request->operation == VD_OP_SET_EFI)) - vd_open_new_slices(vd); + request->operation == VD_OP_SET_EFI)) { + /* update disk information */ + rc = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, + &vd->vdisk_label); + if (rc != 0) + PR0("vd_read_vtoc return error %d", rc); + } PR0("Returning %d", status); return (status); } @@ -2125,14 +2062,13 @@ vd_t *vd = (vd_t *)(void *)arg; int status; - ASSERT(vd != NULL); if (!vd_enabled(vd)) return (LDC_SUCCESS); if (event & LDC_EVT_DOWN) { - PRN("LDC_EVT_DOWN: LDC channel went down"); + PR0("LDC_EVT_DOWN: LDC channel went down"); vd_need_reset(vd, B_TRUE); status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, @@ -2281,7 +2217,7 @@ if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, (intptr_t)&dk_minfo, (vd_open_flags | FKIOCTL), kcred, &rval)) != 0) { - PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", + PR0("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", status); return (status); } @@ -2314,32 +2250,25 @@ vd->dev[slice] = makedevice(major, (minor + slice)); /* - * At least some underlying drivers refuse to open - * devices for (currently) zero-length slices, so skip - * them for now - */ - if (vd->vtoc.v_part[slice].p_size == 0) { - PR0("Skipping zero-length slice %u", slice); - continue; - } - - /* - * Open all non-empty slices of the disk to serve them to the - * client. Slices are opened exclusively to prevent other - * threads or processes in the service domain from performing - * I/O to slices being accessed by a client. Failure to open - * a slice results in vds not serving this disk, as the client - * could attempt (and should be able) to access any non-empty - * slice immediately. Any slices successfully opened before a - * failure will get closed by vds_destroy_vd() as a result of - * the error returned by this function. + * Open all slices of the disk to serve them to the client. + * Slices are opened exclusively to prevent other threads or + * processes in the service domain from performing I/O to + * slices being accessed by a client. Failure to open a slice + * results in vds not serving this disk, as the client could + * attempt (and should be able) to access any slice immediately. + * Any slices successfully opened before a failure will get + * closed by vds_destroy_vd() as a result of the error returned + * by this function. + * + * We need to do the open with FNDELAY so that opening an empty + * slice does not fail. */ PR0("Opening device major %u, minor %u = slice %u", major, minor, slice); if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, - vd_open_flags, kcred, &vd->ldi_handle[slice], - vd->vds->ldi_ident)) != 0) { - PRN("ldi_open_by_dev() returned errno %d " + vd_open_flags | FNDELAY, kcred, &vd->ldi_handle[slice], + vd->vds->ldi_ident)) != 0) { + PR0("ldi_open_by_dev() returned errno %d " "for slice %u", status, slice); /* vds_destroy_vd() will close any open slices */ return (status); @@ -2603,13 +2532,13 @@ } if ((status = ldc_up(vd->ldc_handle)) != 0) { - PRN("ldc_up() returned errno %d", status); + PR0("ldc_up() returned errno %d", status); } /* Allocate the inband task memory handle */ status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl)); if (status) { - PRN("ldc_mem_alloc_handle() returned err %d ", status); + PR0("ldc_mem_alloc_handle() returned err %d ", status); return (ENXIO); } @@ -2653,7 +2582,7 @@ vds_destroy_vd(void *arg) { vd_t *vd = (vd_t *)arg; - + int retry = 0, rv; if (vd == NULL) return; @@ -2680,6 +2609,40 @@ vd_free_dring_task(vd); + /* Free the inband task memory handle */ + (void) ldc_mem_free_handle(vd->inband_task.mhdl); + + /* Shut down LDC */ + if (vd->initialized & VD_LDC) { + /* unmap the dring */ + if (vd->initialized & VD_DRING) + (void) ldc_mem_dring_unmap(vd->dring_handle); + + /* close LDC channel - retry on EAGAIN */ + while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) { + if (++retry > vds_ldc_retries) { + PR0("Timed out closing channel"); + break; + } + drv_usecwait(vds_ldc_delay); + } + if (rv == 0) { + (void) ldc_unreg_callback(vd->ldc_handle); + (void) ldc_fini(vd->ldc_handle); + } else { + /* + * Closing the LDC channel has failed. Ideally we should + * fail here but there is no Zeus level infrastructure + * to handle this. The MD has already been changed and + * we have to do the close. So we try to do as much + * clean up as we can. + */ + (void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE); + while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN) + drv_usecwait(vds_ldc_delay); + } + } + /* Free the staging buffer for msgs */ if (vd->vio_msgp != NULL) { kmem_free(vd->vio_msgp, vd->max_msglen); @@ -2692,18 +2655,6 @@ vd->inband_task.msg = NULL; } - /* Free the inband task memory handle */ - (void) ldc_mem_free_handle(vd->inband_task.mhdl); - - /* Shut down LDC */ - if (vd->initialized & VD_LDC) { - if (vd->initialized & VD_DRING) - (void) ldc_mem_dring_unmap(vd->dring_handle); - (void) ldc_unreg_callback(vd->ldc_handle); - (void) ldc_close(vd->ldc_handle); - (void) ldc_fini(vd->ldc_handle); - } - /* Close any open backing-device slices */ for (uint_t slice = 0; slice < vd->nslices; slice++) { if (vd->ldi_handle[slice] != NULL) {
--- a/usr/src/uts/sun4v/io/vsw.c Fri Nov 24 05:40:55 2006 -0800 +++ b/usr/src/uts/sun4v/io/vsw.c Fri Nov 24 06:52:47 2006 -0800 @@ -77,8 +77,10 @@ static int vsw_attach(dev_info_t *, ddi_attach_cmd_t); static int vsw_detach(dev_info_t *, ddi_detach_cmd_t); static int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); -static void vsw_get_md_properties(vsw_t *vswp); +static int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *); +static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *); static int vsw_get_physaddr(vsw_t *); +static int vsw_setup_switching(vsw_t *); static int vsw_setup_layer2(vsw_t *); static int vsw_setup_layer3(vsw_t *); @@ -116,9 +118,12 @@ static mblk_t *vsw_m_tx(void *arg, mblk_t *); /* MDEG routines */ -static void vsw_mdeg_register(vsw_t *vswp); +static int vsw_mdeg_register(vsw_t *vswp); static void vsw_mdeg_unregister(vsw_t *vswp); static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *); +static int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *); +static void vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t); +static void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t); /* Port add/deletion routines */ static int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node); @@ -218,7 +223,6 @@ static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t); static int vsw_free_ring(dring_info_t *); - /* Debugging routines */ static void dump_flags(uint64_t); static void display_state(void); @@ -234,13 +238,6 @@ uint32_t vsw_mblk_size = VSW_MBLK_SIZE; uint32_t vsw_num_mblks = VSW_NUM_MBLKS; - -/* - * mode specific frame switching function - */ -void (*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *, - mac_resource_handle_t); - static mac_callbacks_t vsw_m_callbacks = { 0, vsw_m_stat, @@ -344,6 +341,21 @@ vport_prop_match }; /* + * Matching criteria passed to the MDEG to register interest + * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified + * by their 'name' and 'cfg-handle' properties. + */ +static md_prop_match_t vdev_prop_match[] = { + { MDET_PROP_STR, "name" }, + { MDET_PROP_VAL, "cfg-handle" }, + { MDET_LIST_END, NULL } +}; + +static mdeg_node_match_t vdev_match = { "virtual-device", + vdev_prop_match }; + + +/* * Specification of an MD node passed to the MDEG to filter any * 'vport' nodes that do not belong to the specified node. This * template is copied for each vsw instance and filled in with @@ -513,10 +525,9 @@ vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { vsw_t *vswp; - int instance, i; + int instance; char hashname[MAXNAMELEN]; char qname[TASKQ_NAMELEN]; - int rv = 1; enum { PROG_init = 0x00, PROG_if_lock = 0x01, PROG_fdb = 0x02, @@ -555,15 +566,10 @@ vswp->instance = instance; ddi_set_driver_private(dip, (caddr_t)vswp); + mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL); rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL); progress |= PROG_if_lock; - /* - * Get the various properties such as physical device name - * (vsw-phys-dev), switch mode etc from the MD. - */ - vsw_get_md_properties(vswp); - /* setup the unicast forwarding database */ (void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d", vswp->instance); @@ -612,68 +618,39 @@ (void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance); if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1, TASKQ_DEFAULTPRI, 0)) == NULL) { - cmn_err(CE_WARN, "Unable to create task queue"); + cmn_err(CE_WARN, "!vsw%d: Unable to create task queue", + vswp->instance); goto vsw_attach_fail; } progress |= PROG_taskq; - /* select best switching mode */ - for (i = 0; i < vswp->smode_num; i++) { - vswp->smode_idx = i; - switch (vswp->smode[i]) { - case VSW_LAYER2: - case VSW_LAYER2_PROMISC: - rv = vsw_setup_layer2(vswp); - break; - - case VSW_LAYER3: - rv = vsw_setup_layer3(vswp); - break; - - default: - DERR(vswp, "unknown switch mode"); - rv = 1; - break; - } - - if (rv == 0) - break; - } - - if (rv == 1) { - cmn_err(CE_WARN, "Unable to setup switching mode"); - goto vsw_attach_fail; - } - - D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]); - - /* - * Register with the MAC layer as a network device so - * we can be plumbed if desired. - * - * Do this in both layer 2 and layer 3 mode. - */ - vswp->if_state &= ~VSW_IF_UP; - if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) { - if (vsw_mac_register(vswp) != 0) { - cmn_err(CE_WARN, "Unable to register as provider " - " with MAC layer, continuing with attach"); - } - } - /* prevent auto-detaching */ if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip, DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) { - cmn_err(CE_NOTE, "Unable to set \"%s\" property for " + cmn_err(CE_NOTE, "!Unable to set \"%s\" property for " "instance %u", DDI_NO_AUTODETACH, instance); } /* - * Now we have everything setup, register for MD change - * events. + * Now we have everything setup, register an interest in + * specific MD nodes. + * + * The callback is invoked in 2 cases, firstly if upon mdeg + * registration there are existing nodes which match our specified + * criteria, and secondly if the MD is changed (and again, there + * are nodes which we are interested in present within it. Note + * that our callback will be invoked even if our specified nodes + * have not actually changed). + * + * Until the callback is invoked we cannot switch any pkts as + * we don't know basic information such as what mode we are + * operating in. However we expect the callback to be invoked + * immediately upon registration as this driver should only + * be attaching if there are vsw nodes in the MD. */ - vsw_mdeg_register(vswp); + if (vsw_mdeg_register(vswp)) + goto vsw_attach_fail; return (DDI_SUCCESS); @@ -702,8 +679,10 @@ vswp->fdb = NULL; } - if (progress & PROG_if_lock) + if (progress & PROG_if_lock) { rw_destroy(&vswp->if_lockrw); + mutex_destroy(&vswp->mac_lock); + } ddi_soft_state_free(vsw_state, instance); return (DDI_FAILURE); @@ -734,31 +713,37 @@ D2(vswp, "detaching instance %d", instance); - if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) { + if (vswp->if_state & VSW_IF_REG) { if (vsw_mac_unregister(vswp) != 0) { - cmn_err(CE_WARN, "Unable to detach from MAC layer"); + cmn_err(CE_WARN, "!vsw%d: Unable to detach from " + "MAC layer", vswp->instance); return (DDI_FAILURE); } - rw_destroy(&vswp->if_lockrw); } vsw_mdeg_unregister(vswp); /* remove mac layer callback */ + mutex_enter(&vswp->mac_lock); if ((vswp->mh != NULL) && (vswp->mrh != NULL)) { mac_rx_remove(vswp->mh, vswp->mrh); vswp->mrh = NULL; } + mutex_exit(&vswp->mac_lock); if (vsw_detach_ports(vswp) != 0) { - cmn_err(CE_WARN, "Unable to detach ports"); + cmn_err(CE_WARN, "!vsw%d: Unable to detach ports", + vswp->instance); return (DDI_FAILURE); } + rw_destroy(&vswp->if_lockrw); + /* * Now that the ports have been deleted, stop and close * the physical device. */ + mutex_enter(&vswp->mac_lock); if (vswp->mh != NULL) { if (vswp->mstarted) mac_stop(vswp->mh); @@ -769,6 +754,8 @@ vswp->mh = NULL; vswp->txinfo = NULL; } + mutex_exit(&vswp->mac_lock); + mutex_destroy(&vswp->mac_lock); /* * Destroy any free pools that may still exist. @@ -865,127 +852,37 @@ } /* - * Get the properties from our MD node. + * Get the value of the "vsw-phys-dev" property in the specified + * node. This property is the name of the physical device that + * the virtual switch will use to talk to the outside world. + * + * Note it is valid for this property to be NULL (but the property + * itself must exist). Callers of this routine should verify that + * the value returned is what they expected (i.e. either NULL or non NULL). + * + * On success returns value of the property in region pointed to by + * the 'name' argument, and with return value of 0. Otherwise returns 1. */ -static void -vsw_get_md_properties(vsw_t *vswp) -{ - md_t *mdp = NULL; - int num_nodes = 0; - int len = 0, listsz = 0; - int num_vdev = 0; - int i, idx; - boolean_t found_node = B_FALSE; - char *smode = NULL; - char *curr_mode = NULL; - char *physname = NULL; - char *node_name = NULL; - char *dev; - uint64_t macaddr = 0; - uint64_t md_inst, obp_inst; - mde_cookie_t *listp = NULL; - mde_cookie_t rootnode; - - D1(vswp, "%s: enter", __func__); - - /* - * Further down we compare the obp 'reg' property to the - * 'cfg-handle' property in the vsw MD node to determine - * if the node refers to this particular instance. So if - * we can't read the obp value then there is no point - * in proceeding further. - */ - if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip, - DDI_PROP_DONTPASS, reg_propname) != 1) { - cmn_err(CE_WARN, "Unable to read %s property " - "from OBP device node", reg_propname); - return; - } - - obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, - DDI_PROP_DONTPASS, reg_propname, 0); - - D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst); - - if ((mdp = md_get_handle()) == NULL) { - DERR(vswp, "%s: unable to init MD", __func__); - return; - } - - if ((num_nodes = md_node_count(mdp)) <= 0) { - DERR(vswp, "%s: invalid number of nodes found %d", - __func__, num_nodes); - (void) md_fini_handle(mdp); - return; - } - - D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes); - - /* allocate enough space for node list */ - listsz = num_nodes * sizeof (mde_cookie_t); - listp = kmem_zalloc(listsz, KM_SLEEP); - - rootnode = md_root_node(mdp); - - /* Get the list of virtual devices */ - num_vdev = md_scan_dag(mdp, rootnode, - md_find_name(mdp, vdev_propname), - md_find_name(mdp, "fwd"), listp); - - if (num_vdev <= 0) { - DERR(vswp, "%s: didn't find any virtual-device nodes in MD", - __func__); - goto md_prop_exit; - } - - D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev); - - /* Look for the virtual switch nodes in the list */ - for (idx = 0; idx < num_vdev; idx++) { - if (md_get_prop_str(mdp, listp[idx], - "name", &node_name) != 0) { - DERR(vswp, "%s: unable to get node name", __func__); - continue; - - } - - if (strcmp(node_name, vsw_propname) == 0) { - /* Virtual switch node */ - if (md_get_prop_val(mdp, listp[idx], - "cfg-handle", &md_inst) != 0) { - DERR(vswp, "%s: unable to get cfg-handle from" - " node %d", __func__, idx); - goto md_prop_exit; - } else if (md_inst == obp_inst) { - D2(vswp, "%s: found matching node (%d)" - " 0x%llx == 0x%llx", __func__, idx, - md_inst, obp_inst); - found_node = B_TRUE; - break; - } - } - } - - if (!found_node) { - DWARN(vswp, "%s: couldn't find correct vsw node", __func__); - goto md_prop_exit; - } - - /* - * Now, having found the correct node, get the various properties. - */ - - if (md_get_prop_data(mdp, listp[idx], physdev_propname, +static int +vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name) +{ + int len = 0; + char *physname = NULL; + char *dev; + + if (md_get_prop_data(mdp, node, physdev_propname, (uint8_t **)(&physname), &len) != 0) { - cmn_err(CE_WARN, "%s: unable to get name(s) of physical " - "device(s) from MD", __func__); + cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical " + "device(s) from MD", vswp->instance); + return (1); } else if ((strlen(physname) + 1) > LIFNAMSIZ) { - cmn_err(CE_WARN, "%s is too long a device name", physname); + cmn_err(CE_WARN, "!vsw%d: %s is too long a device name", + vswp->instance, physname); + return (1); } else { - (void) strncpy(vswp->physname, physname, strlen(physname) + 1); - vswp->mdprops |= VSW_MD_PHYSNAME; + (void) strncpy(name, physname, strlen(physname) + 1); D2(vswp, "%s: using first device specified (%s)", - __func__, vswp->physname); + __func__, physname); } #ifdef DEBUG @@ -1001,42 +898,40 @@ if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0, "vsw_physname", &dev) == DDI_PROP_SUCCESS) { if ((strlen(dev) + 1) > LIFNAMSIZ) { - cmn_err(CE_WARN, "%s is too long a device name", dev); + cmn_err(CE_WARN, "vsw%d: %s is too long a device name", + vswp->instance, dev); + ddi_prop_free(dev); + return (1); } else { - cmn_err(CE_NOTE, "%s: using device name (%s) from " - "config file", __func__, dev); - - (void) strncpy(vswp->physname, dev, strlen(dev) + 1); - vswp->mdprops |= VSW_MD_PHYSNAME; + cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from " + "config file", vswp->instance, dev); + + (void) strncpy(name, dev, strlen(dev) + 1); } ddi_prop_free(dev); - } #endif - /* mac address for vswitch device itself */ - if (md_get_prop_val(mdp, listp[idx], - macaddr_propname, &macaddr) != 0) { - cmn_err(CE_WARN, "!Unable to get MAC address from MD"); - - /* - * Fallback to using the mac address of the physical - * device. - */ - if (vsw_get_physaddr(vswp) == 0) { - cmn_err(CE_NOTE, "!Using MAC address from physical " - "device (%s)", vswp->physname); - } - } else { - READ_ENTER(&vswp->if_lockrw); - for (i = ETHERADDRL - 1; i >= 0; i--) { - vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF; - macaddr >>= 8; - } - RW_EXIT(&vswp->if_lockrw); - vswp->mdprops |= VSW_MD_MACADDR; - } + return (0); +} + +/* + * Read the 'vsw-switch-mode' property from the specified MD node. + * + * Returns 0 on success and the number of modes found in 'found', + * otherwise returns 1. + */ +static int +vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, + uint8_t *modes, int *found) +{ + int len = 0; + int smode_num = 0; + char *smode = NULL; + char *curr_mode = NULL; + + D1(vswp, "%s: enter", __func__); /* * Get the switch-mode property. The modes are listed in @@ -1044,15 +939,17 @@ * first item in list. */ len = 0; - vswp->smode_num = 0; - if (md_get_prop_data(mdp, listp[idx], smode_propname, + smode_num = 0; + if (md_get_prop_data(mdp, node, smode_propname, (uint8_t **)(&smode), &len) != 0) { /* * Unable to get switch-mode property from MD, nothing * more we can do. */ - cmn_err(CE_WARN, "!unable to get switch mode property"); - goto md_prop_exit; + cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property" + " from the MD", vswp->instance); + *found = 0; + return (1); } curr_mode = smode; @@ -1065,33 +962,29 @@ * 'routed' - layer 3 (i.e. IP) routing, underlying HW * in non-promiscuous mode. */ - while ((curr_mode < (smode + len)) && (vswp->smode_num < NUM_SMODES)) { + while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) { D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode); if (strcmp(curr_mode, "switched") == 0) { - vswp->smode[vswp->smode_num++] = VSW_LAYER2; + modes[smode_num++] = VSW_LAYER2; } else if (strcmp(curr_mode, "promiscuous") == 0) { - vswp->smode[vswp->smode_num++] = VSW_LAYER2_PROMISC; + modes[smode_num++] = VSW_LAYER2_PROMISC; } else if (strcmp(curr_mode, "routed") == 0) { - vswp->smode[vswp->smode_num++] = VSW_LAYER3; + modes[smode_num++] = VSW_LAYER3; } else { - cmn_err(CE_WARN, "Unknown switch mode %s, setting to" - " default switched mode", curr_mode); - vswp->smode[vswp->smode_num++] = VSW_LAYER2; + cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, " + "setting to default switched mode", + vswp->instance, curr_mode); + modes[smode_num++] = VSW_LAYER2; } curr_mode += strlen(curr_mode) + 1; } - - D2(vswp, "%d switching modes specified", vswp->smode_num); - - if (vswp->smode_num > 0) - vswp->mdprops |= VSW_MD_SMODE; - -md_prop_exit: - (void) md_fini_handle(mdp); - - kmem_free(listp, listsz); + *found = smode_num; + + D2(vswp, "%s: %d modes found", __func__, smode_num); D1(vswp, "%s: exit", __func__); + + return (0); } /* @@ -1112,7 +1005,8 @@ return (1); if (mac_open(vswp->physname, ddi_instance, &mh) != 0) { - cmn_err(CE_WARN, "!mac_open %s failed", vswp->physname); + cmn_err(CE_WARN, "!vsw%d: mac_open %s failed", + vswp->instance, vswp->physname); return (1); } @@ -1141,20 +1035,24 @@ { D1(vswp, "%s: enter", __func__); + mutex_enter(&vswp->mac_lock); if (vswp->mh == NULL) { + mutex_exit(&vswp->mac_lock); return (1); } if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) { DWARN(vswp, "Unable to get capabilities of" " underlying device (%s)", vswp->physname); + mutex_exit(&vswp->mac_lock); return (1); } + mutex_exit(&vswp->mac_lock); if (vswp->maddr.maddr_naddrfree == 0) { cmn_err(CE_WARN, - "!device %s has no free unicast address slots", - vswp->physname); + "!vsw%d: device %s has no free unicast address slots", + vswp->instance, vswp->physname); return (1); } @@ -1167,6 +1065,55 @@ } /* + * Setup the required switching mode. + * + * Returns 0 on success, 1 on failure. + */ +static int +vsw_setup_switching(vsw_t *vswp) +{ + int i, rv = 1; + + D1(vswp, "%s: enter", __func__); + + /* select best switching mode */ + for (i = 0; i < vswp->smode_num; i++) { + vswp->smode_idx = i; + switch (vswp->smode[i]) { + case VSW_LAYER2: + case VSW_LAYER2_PROMISC: + rv = vsw_setup_layer2(vswp); + break; + + case VSW_LAYER3: + rv = vsw_setup_layer3(vswp); + break; + + default: + DERR(vswp, "unknown switch mode"); + rv = 1; + break; + } + + if (rv == 0) + break; + } + + if (rv == 1) { + cmn_err(CE_WARN, "!vsw%d: Unable to setup specified " + "switching mode", vswp->instance); + return (rv); + } + + D2(vswp, "%s: Operating in mode %d", __func__, + vswp->smode[vswp->smode_idx]); + + D1(vswp, "%s: exit", __func__); + + return (0); +} + +/* * Setup for layer 2 switching. * * Returns 0 on success, 1 on failure. @@ -1176,7 +1123,7 @@ { D1(vswp, "%s: enter", __func__); - vsw_switch_frame = vsw_switch_l2_frame; + vswp->vsw_switch_frame = vsw_switch_l2_frame; /* * Attempt to link into the MAC layer so we can get @@ -1189,8 +1136,8 @@ * so return 1 so that can fall back to next * prefered switching method. */ - cmn_err(CE_WARN, "!Unable to join as MAC layer " - "client"); + cmn_err(CE_WARN, "!vsw%d: Unable to join as MAC layer " + "client", vswp->instance); return (1); } @@ -1200,7 +1147,8 @@ * unicast mac addresses, and has free capacity. */ if (vsw_get_hw_maddr(vswp) != 0) { - cmn_err(CE_WARN, "!unable to setup switching"); + cmn_err(CE_WARN, "!vsw%d: Unable to setup " + "switching", vswp->instance); vsw_mac_detach(vswp); return (1); } @@ -1211,7 +1159,8 @@ * No physical device name found in MD which is * required for layer 2. */ - cmn_err(CE_WARN, "!no physical device name specified"); + cmn_err(CE_WARN, "!vsw%d: no physical device name specified", + vswp->instance); return (1); } @@ -1226,7 +1175,7 @@ D1(vswp, "%s: enter", __func__); D2(vswp, "%s: operating in layer 3 mode", __func__); - vsw_switch_frame = vsw_switch_l3_frame; + vswp->vsw_switch_frame = vsw_switch_l3_frame; D1(vswp, "%s: exit", __func__); @@ -1248,19 +1197,23 @@ D1(vswp, "%s: enter", __func__); - vswp->mh = NULL; - vswp->mrh = NULL; - vswp->mstarted = B_FALSE; - vswp->mresources = B_FALSE; + ASSERT(vswp->mh == NULL); + ASSERT(vswp->mrh == NULL); + ASSERT(vswp->mstarted == B_FALSE); + ASSERT(vswp->mresources == B_FALSE); ASSERT(vswp->mdprops & VSW_MD_PHYSNAME); + mutex_enter(&vswp->mac_lock); if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) { - cmn_err(CE_WARN, "invalid device name: %s", vswp->physname); + cmn_err(CE_WARN, "!vsw%d: invalid device name: %s", + vswp->instance, vswp->physname); goto mac_fail_exit; } + if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) { - cmn_err(CE_WARN, "mac_open %s failed", vswp->physname); + cmn_err(CE_WARN, "!vsw%d: mac_open %s failed", + vswp->instance, vswp->physname); goto mac_fail_exit; } @@ -1269,13 +1222,17 @@ D2(vswp, "vsw_mac_attach: using device %s", vswp->physname); if (vsw_multi_ring_enable) { + /* + * Initialize the ring table. + */ vsw_mac_ring_tbl_init(vswp); /* - * Register our receive callback. + * Register our rx callback function. */ vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_queue_cb, (void *)vswp); + ASSERT(vswp->mrh != NULL); /* * Register our mac resource callback. @@ -1293,25 +1250,28 @@ * Just register our rx callback function */ vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp); - } - - ASSERT(vswp->mrh != NULL); + ASSERT(vswp->mrh != NULL); + } /* Get the MAC tx fn */ vswp->txinfo = mac_tx_get(vswp->mh); /* start the interface */ if (mac_start(vswp->mh) != 0) { - cmn_err(CE_WARN, "could not start mac interface"); + cmn_err(CE_WARN, "!vsw%d: Could not start mac interface", + vswp->instance); goto mac_fail_exit; } + mutex_exit(&vswp->mac_lock); + vswp->mstarted = B_TRUE; D1(vswp, "%s: exit", __func__); return (0); mac_fail_exit: + mutex_exit(&vswp->mac_lock); vsw_mac_detach(vswp); D1(vswp, "%s: exit", __func__); @@ -1329,6 +1289,8 @@ vsw_mac_ring_tbl_destroy(vswp); } + mutex_enter(&vswp->mac_lock); + if (vswp->mh != NULL) { if (vswp->mstarted) mac_stop(vswp->mh); @@ -1344,6 +1306,8 @@ vswp->txinfo = NULL; vswp->mstarted = B_FALSE; + mutex_exit(&vswp->mac_lock); + D1(vswp, "vsw_mac_detach: exit"); } @@ -1392,9 +1356,10 @@ err = vswp->maddr.maddr_add(mah, &mac_addr); if (err != 0) { - cmn_err(CE_WARN, "!failed to program addr " + cmn_err(CE_WARN, "!vsw%d: failed to program addr " "%x:%x:%x:%x:%x:%x for port %d into device %s " - ": err %d", port->p_macaddr.ether_addr_octet[0], + ": err %d", vswp->instance, + port->p_macaddr.ether_addr_octet[0], port->p_macaddr.ether_addr_octet[1], port->p_macaddr.ether_addr_octet[2], port->p_macaddr.ether_addr_octet[3], @@ -1471,9 +1436,6 @@ } if (port->addr_set == VSW_ADDR_HW) { - if (vswp->mh == NULL) - return (1); - if (vswp->maddr.maddr_handle == NULL) return (1); @@ -1481,9 +1443,9 @@ err = vswp->maddr.maddr_remove(mah, port->addr_slot); if (err != 0) { - cmn_err(CE_WARN, "!Unable to remove addr " + cmn_err(CE_WARN, "!vsw%d: Unable to remove addr " "%x:%x:%x:%x:%x:%x for port %d from device %s" - " : (err %d)", + " : (err %d)", vswp->instance, port->p_macaddr.ether_addr_octet[0], port->p_macaddr.ether_addr_octet[1], port->p_macaddr.ether_addr_octet[2], @@ -1521,17 +1483,22 @@ { D1(vswp, "%s: enter", __func__); - if (vswp->mh == NULL) + mutex_enter(&vswp->mac_lock); + if (vswp->mh == NULL) { + mutex_exit(&vswp->mac_lock); return (1); + } if (vswp->promisc_cnt++ == 0) { if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) { vswp->promisc_cnt--; + mutex_exit(&vswp->mac_lock); return (1); } - cmn_err(CE_NOTE, "!switching device %s into promiscuous mode", - vswp->physname); - } + cmn_err(CE_NOTE, "!vsw%d: switching device %s into " + "promiscuous mode", vswp->instance, vswp->physname); + } + mutex_exit(&vswp->mac_lock); port->addr_set = VSW_ADDR_PROMISC; D1(vswp, "%s: exit", __func__); @@ -1549,16 +1516,20 @@ { vsw_port_list_t *plist = &vswp->plist; - D1(vswp, "%s: enter", __func__); - - if (vswp->mh == NULL) + D2(vswp, "%s: enter", __func__); + + mutex_enter(&vswp->mac_lock); + if (vswp->mh == NULL) { + mutex_exit(&vswp->mac_lock); return (1); + } ASSERT(port->addr_set == VSW_ADDR_PROMISC); if (--vswp->promisc_cnt == 0) { if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) { vswp->promisc_cnt++; + mutex_exit(&vswp->mac_lock); return (1); } @@ -1571,13 +1542,16 @@ * accordingly. */ if (plist->num_ports != 0) { - cmn_err(CE_NOTE, "!switching device %s back to " - "programmed mode", vswp->physname); + cmn_err(CE_NOTE, "!vsw%d: switching device %s back to " + "programmed mode", vswp->instance, + vswp->physname); } else { - cmn_err(CE_NOTE, "!switching device %s out of " - "promiscuous mode", vswp->physname); - } - } + cmn_err(CE_NOTE, "!vsw%d: switching device %s out of " + "promiscuous mode", vswp->instance, + vswp->physname); + } + } + mutex_exit(&vswp->mac_lock); port->addr_set = VSW_ADDR_UNSET; D1(vswp, "%s: exit", __func__); @@ -1718,22 +1692,24 @@ static void vsw_mac_ring_tbl_destroy(vsw_t *vswp) { - int i; + int i; + vsw_mac_ring_t *ringp; mutex_enter(&vswp->mac_ring_lock); for (i = 0; i < vswp->mac_ring_tbl_sz; i++) { - if (vswp->mac_ring_tbl[i].ring_state != VSW_MAC_RING_FREE) { + ringp = &vswp->mac_ring_tbl[i]; + + if (ringp->ring_state != VSW_MAC_RING_FREE) { /* * Destroy the queue. */ - vsw_queue_stop(vswp->mac_ring_tbl[i].ring_vqp); - vsw_queue_destroy(vswp->mac_ring_tbl[i].ring_vqp); + vsw_queue_stop(ringp->ring_vqp); + vsw_queue_destroy(ringp->ring_vqp); /* * Re-initialize the structure. */ - vsw_mac_ring_tbl_entry_init(vswp, - &vswp->mac_ring_tbl[i]); + vsw_mac_ring_tbl_entry_init(vswp, ringp); } } mutex_exit(&vswp->mac_ring_lock); @@ -1805,6 +1781,29 @@ ringp = NULL; } + if (ringp != NULL) { + /* + * Make sure thread get's running state for + * this ring. + */ + mutex_enter(&vqp->vq_lock); + while ((vqp->vq_state != VSW_QUEUE_RUNNING) && + (vqp->vq_state != VSW_QUEUE_DRAINED)) { + cv_wait(&vqp->vq_cv, &vqp->vq_lock); + } + + /* + * If the thread is not running, cleanup. + */ + if (vqp->vq_state == VSW_QUEUE_DRAINED) { + vsw_queue_destroy(vqp); + vsw_mac_ring_tbl_entry_init(vswp, + ringp); + ringp = NULL; + } + mutex_exit(&vqp->vq_lock); + } + mutex_exit(&vswp->mac_ring_lock); D1(vswp, "%s: exit", __func__); return ((mac_resource_handle_t)ringp); @@ -1832,6 +1831,8 @@ cv_wait(&vqp->vq_cv, &vqp->vq_lock); } + vqp->vq_state = VSW_QUEUE_STOPPED; + mutex_exit(&vqp->vq_lock); } @@ -1846,7 +1847,7 @@ cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL); vqp->vq_first = NULL; vqp->vq_last = NULL; - vqp->vq_state = VSW_QUEUE_STOP; + vqp->vq_state = VSW_QUEUE_STOPPED; return (vqp); } @@ -1868,12 +1869,13 @@ mutex_enter(&vqp->vq_lock); - ASSERT(vqp->vq_state == VSW_QUEUE_STOP); + ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED); /* * Set the state to running, since the thread is now active. */ vqp->vq_state = VSW_QUEUE_RUNNING; + cv_signal(&vqp->vq_cv); while (vqp->vq_state == VSW_QUEUE_RUNNING) { /* @@ -1897,7 +1899,8 @@ mutex_exit(&vqp->vq_lock); /* switch the chain of packets received */ - vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); + vswp->vsw_switch_frame(vswp, mp, + VSW_PHYSDEV, NULL, NULL); mutex_enter(&vqp->vq_lock); } @@ -1959,6 +1962,7 @@ if (vqp->vq_state != VSW_QUEUE_RUNNING) { freemsg(mp); + mutex_exit(&vqp->vq_lock); goto vsw_rx_queue_cb_exit; } @@ -1983,8 +1987,9 @@ /* * Let go of the lock and exit. */ + mutex_exit(&vqp->vq_lock); + vsw_rx_queue_cb_exit: - mutex_exit(&vqp->vq_lock); D1(vswp, "%s: exit", __func__); } @@ -2008,7 +2013,7 @@ D1(vswp, "vsw_rx_cb: enter"); /* switch the chain of packets received */ - vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); + vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); D1(vswp, "vsw_rx_cb: exit"); } @@ -2024,8 +2029,10 @@ const mac_txinfo_t *mtp; mblk_t *nextp; + mutex_enter(&vswp->mac_lock); if (vswp->mh == NULL) { DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail"); + mutex_exit(&vswp->mac_lock); return (mp); } else { for (;;) { @@ -2033,6 +2040,7 @@ mp->b_next = NULL; mtp = vswp->txinfo; + if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) { mp->b_next = nextp; break; @@ -2040,10 +2048,9 @@ if ((mp = nextp) == NULL) break; - - } - - } + } + } + mutex_exit(&vswp->mac_lock); return (mp); } @@ -2104,8 +2111,6 @@ } RW_EXIT(&vswp->if_lockrw); - vswp->mdprops &= ~(VSW_MD_MACADDR | VSW_DEV_MACADDR); - D1(vswp, "%s: exit", __func__); return (rv); @@ -2118,11 +2123,17 @@ D1(vswp, "%s: enter", __func__); - if (vswp->mh == NULL) + mutex_enter(&vswp->mac_lock); + if (vswp->mh == NULL) { + mutex_exit(&vswp->mac_lock); return (EINVAL); + } /* return stats from underlying device */ *val = mac_stat_get(vswp->mh, stat); + + mutex_exit(&vswp->mac_lock); + return (0); } @@ -2218,16 +2229,21 @@ * Call into the underlying driver to program the * address into HW. */ + mutex_enter(&vswp->mac_lock); if (vswp->mh != NULL) { ret = mac_multicst_add(vswp->mh, mca); if (ret != 0) { - cmn_err(CE_WARN, "!unable to add " - "multicast address"); + cmn_err(CE_WARN, "!vsw%d: unable to " + "add multicast address", + vswp->instance); + mutex_exit(&vswp->mac_lock); goto vsw_remove_addr; } } + mutex_exit(&vswp->mac_lock); } else { - cmn_err(CE_WARN, "!unable to add multicast address"); + cmn_err(CE_WARN, "!vsw%d: unable to add multicast " + "address", vswp->instance); } return (ret); } @@ -2246,8 +2262,10 @@ */ vsw_del_addr(VSW_LOCALDEV, vswp, addr); + mutex_enter(&vswp->mac_lock); if (vswp->mh != NULL) (void) mac_multicst_remove(vswp->mh, mca); + mutex_exit(&vswp->mac_lock); } D1(vswp, "%s: exit", __func__); @@ -2281,7 +2299,7 @@ D1(vswp, "%s: enter", __func__); - vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL); + vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL); D1(vswp, "%s: exit", __func__); @@ -2290,24 +2308,36 @@ /* * Register for machine description (MD) updates. + * + * Returns 0 on success, 1 on failure. */ -static void +static int vsw_mdeg_register(vsw_t *vswp) { mdeg_prop_spec_t *pspecp; mdeg_node_spec_t *inst_specp; - mdeg_handle_t mdeg_hdl; + mdeg_handle_t mdeg_hdl, mdeg_port_hdl; size_t templatesz; int inst, rv; D1(vswp, "%s: enter", __func__); + /* + * In each 'virtual-device' node in the MD there is a + * 'cfg-handle' property which is the MD's concept of + * an instance number (this may be completely different from + * the device drivers instance #). OBP reads that value and + * stores it in the 'reg' property of the appropriate node in + * the device tree. So we use the 'reg' value when registering + * with the mdeg framework, to ensure we get events for the + * correct nodes. + */ inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, DDI_PROP_DONTPASS, reg_propname, -1); if (inst == -1) { - DERR(vswp, "%s: unable to get %s property", - __func__, reg_propname); - return; + cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from " + "OBP device tree", vswp->instance, reg_propname); + return (1); } D2(vswp, "%s: instance %d registering with mdeg", __func__, inst); @@ -2329,22 +2359,47 @@ inst_specp->namep = "virtual-device"; inst_specp->specp = pspecp; - /* perform the registration */ - rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb, + /* + * Register an interest in 'virtual-device' nodes with a + * 'name' property of 'virtual-network-switch' + */ + rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb, (void *)vswp, &mdeg_hdl); - + if (rv != MDEG_SUCCESS) { + DERR(vswp, "%s: mdeg_register failed (%d) for vsw node", + __func__, rv); + goto mdeg_reg_fail; + } + + /* + * Register an interest in 'vsw-port' nodes. + */ + rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb, + (void *)vswp, &mdeg_port_hdl); if (rv != MDEG_SUCCESS) { DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv); - kmem_free(inst_specp, sizeof (mdeg_node_spec_t)); - kmem_free(pspecp, templatesz); - return; + (void) mdeg_unregister(mdeg_hdl); + goto mdeg_reg_fail; } /* save off data that will be needed later */ vswp->inst_spec = inst_specp; vswp->mdeg_hdl = mdeg_hdl; + vswp->mdeg_port_hdl = mdeg_port_hdl; D1(vswp, "%s: exit", __func__); + return (0); + +mdeg_reg_fail: + cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks", + vswp->instance); + kmem_free(pspecp, templatesz); + kmem_free(inst_specp, sizeof (mdeg_node_spec_t)); + + vswp->mdeg_hdl = NULL; + vswp->mdeg_port_hdl = NULL; + + return (1); } static void @@ -2352,15 +2407,19 @@ { D1(vswp, "vsw_mdeg_unregister: enter"); - (void) mdeg_unregister(vswp->mdeg_hdl); - - if (vswp->inst_spec->specp != NULL) { - (void) kmem_free(vswp->inst_spec->specp, - sizeof (vsw_prop_template)); - vswp->inst_spec->specp = NULL; - } + if (vswp->mdeg_hdl != NULL) + (void) mdeg_unregister(vswp->mdeg_hdl); + + if (vswp->mdeg_port_hdl != NULL) + (void) mdeg_unregister(vswp->mdeg_port_hdl); if (vswp->inst_spec != NULL) { + if (vswp->inst_spec->specp != NULL) { + (void) kmem_free(vswp->inst_spec->specp, + sizeof (vsw_prop_template)); + vswp->inst_spec->specp = NULL; + } + (void) kmem_free(vswp->inst_spec, sizeof (mdeg_node_spec_t)); vswp->inst_spec = NULL; @@ -2369,6 +2428,9 @@ D1(vswp, "vsw_mdeg_unregister: exit"); } +/* + * Mdeg callback invoked for the vsw node itself. + */ static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp) { @@ -2377,14 +2439,96 @@ md_t *mdp; mde_cookie_t node; uint64_t inst; + char *node_name = NULL; if (resp == NULL) return (MDEG_FAILURE); vswp = (vsw_t *)cb_argp; - D1(vswp, "%s: added %d : removed %d : matched %d", - __func__, resp->added.nelem, resp->removed.nelem, + D1(vswp, "%s: added %d : removed %d : curr matched %d" + " : prev matched %d", __func__, resp->added.nelem, + resp->removed.nelem, resp->match_curr.nelem, + resp->match_prev.nelem); + + /* + * Expect 'added' to be non-zero if virtual-network-switch + * nodes exist in the MD when the driver attaches. + */ + for (idx = 0; idx < resp->added.nelem; idx++) { + mdp = resp->added.mdp; + node = resp->added.mdep[idx]; + + if (md_get_prop_str(mdp, node, "name", &node_name) != 0) { + DERR(vswp, "%s: unable to get node name for " + "node(%d) 0x%lx", __func__, idx, node); + continue; + } + + if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) { + DERR(vswp, "%s: prop(cfg-handle) not found port(%d)", + __func__, idx); + continue; + } + + D2(vswp, "%s: added node(%d) 0x%lx with name %s " + "and inst %d", __func__, idx, node, node_name, inst); + + vsw_get_initial_md_properties(vswp, mdp, node); + } + + /* + * A non-zero 'match' value indicates that the MD has been + * updated and that a virtual-network-switch node is present + * which may or may not have been updated. It is up to the clients + * to examine their own nodes and determine if they have changed. + */ + for (idx = 0; idx < resp->match_curr.nelem; idx++) { + mdp = resp->match_curr.mdp; + node = resp->match_curr.mdep[idx]; + + if (md_get_prop_str(mdp, node, "name", &node_name) != 0) { + DERR(vswp, "%s: unable to get node name for " + "node(%d) 0x%lx", __func__, idx, node); + continue; + } + + if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) { + DERR(vswp, "%s: prop(cfg-handle) not found port(%d)", + __func__, idx); + continue; + } + + D2(vswp, "%s: changed node(%d) 0x%lx with name %s " + "and inst %d", __func__, idx, node, node_name, inst); + + vsw_update_md_prop(vswp, mdp, node); + } + + return (MDEG_SUCCESS); +} + +/* + * Mdeg callback invoked for changes to the vsw-port nodes + * under the vsw node. + */ +static int +vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp) +{ + vsw_t *vswp; + int idx; + md_t *mdp; + mde_cookie_t node; + uint64_t inst; + + if ((resp == NULL) || (cb_argp == NULL)) + return (MDEG_FAILURE); + + vswp = (vsw_t *)cb_argp; + + D2(vswp, "%s: added %d : removed %d : curr matched %d" + " : prev matched %d", __func__, resp->added.nelem, + resp->removed.nelem, resp->match_curr.nelem, resp->match_prev.nelem); /* process added ports */ @@ -2395,8 +2539,8 @@ D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node); if (vsw_port_add(vswp, mdp, &node) != 0) { - cmn_err(CE_WARN, "Unable to add new port (0x%lx)", - node); + cmn_err(CE_WARN, "!vsw%d: Unable to add new port " + "(0x%lx)", vswp->instance, node); } } @@ -2406,7 +2550,7 @@ node = resp->removed.mdep[idx]; if (md_get_prop_val(mdp, node, id_propname, &inst)) { - DERR(vswp, "%s: prop(%s) not found port(%d)", + DERR(vswp, "%s: prop(%s) not found in port(%d)", __func__, id_propname, idx); continue; } @@ -2414,7 +2558,8 @@ D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node); if (vsw_port_detach(vswp, inst) != 0) { - cmn_err(CE_WARN, "Unable to remove port %ld", inst); + cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld", + vswp->instance, inst); } } @@ -2429,6 +2574,307 @@ } /* + * Read the initial start-of-day values from the specified MD node. + */ +static void +vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node) +{ + int i; + uint64_t macaddr = 0; + + D1(vswp, "%s: enter", __func__); + + if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) == 0) { + /* + * Note it is valid for the physname property to + * be NULL so check actual name length to determine + * if we have a actual device name. + */ + if (strlen(vswp->physname) > 0) + vswp->mdprops |= VSW_MD_PHYSNAME; + } else { + cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical " + "device from MD", vswp->instance); + return; + } + + /* mac address for vswitch device itself */ + if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) { + cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD", + vswp->instance); + + /* + * Fallback to using the mac address of the physical + * device. + */ + if (vsw_get_physaddr(vswp) == 0) { + cmn_err(CE_NOTE, "!vsw%d: Using MAC address from " + "physical device (%s)", vswp->instance, + vswp->physname); + } else { + cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address" + "from device %s", vswp->instance, + vswp->physname); + } + } else { + WRITE_ENTER(&vswp->if_lockrw); + for (i = ETHERADDRL - 1; i >= 0; i--) { + vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF; + macaddr >>= 8; + } + RW_EXIT(&vswp->if_lockrw); + vswp->mdprops |= VSW_MD_MACADDR; + } + + if (vsw_get_md_smodes(vswp, mdp, node, + vswp->smode, &vswp->smode_num)) { + cmn_err(CE_WARN, "vsw%d: Unable to read %s property from " + "MD, defaulting to programmed mode", vswp->instance, + smode_propname); + + for (i = 0; i < NUM_SMODES; i++) + vswp->smode[i] = VSW_LAYER2; + + vswp->smode_num = NUM_SMODES; + } else { + ASSERT(vswp->smode_num != 0); + vswp->mdprops |= VSW_MD_SMODE; + } + + /* + * Unable to setup any switching mode, nothing more + * we can do. + */ + if (vsw_setup_switching(vswp)) + return; + + WRITE_ENTER(&vswp->if_lockrw); + vswp->if_state &= ~VSW_IF_UP; + RW_EXIT(&vswp->if_lockrw); + if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) { + if (vsw_mac_register(vswp) != 0) { + /* + * Treat this as a non-fatal error as we may be + * able to operate in some other mode. + */ + cmn_err(CE_WARN, "vsw%d: Unable to register as " + "provider with MAC layer", vswp->instance); + } + } + + D1(vswp, "%s: exit", __func__); +} + +/* + * Check to see if the relevant properties in the specified node have + * changed, and if so take the appropriate action. + * + * If any of the properties are missing or invalid we don't take + * any action, as this function should only be invoked when modifications + * have been made to what we assume is a working configuration, which + * we leave active. + * + * Note it is legal for this routine to be invoked even if none of the + * properties in the port node within the MD have actually changed. + */ +static void +vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) +{ + char physname[LIFNAMSIZ]; + char drv[LIFNAMSIZ]; + uint_t ddi_instance; + uint8_t new_smode[NUM_SMODES]; + int i, smode_num = 0; + uint64_t macaddr = 0; + vsw_port_list_t *plist = &vswp->plist; + vsw_port_t *port = NULL; + enum {MD_init = 0x1, + MD_physname = 0x2, + MD_macaddr = 0x4, + MD_smode = 0x8} updated; + + updated = MD_init; + + D1(vswp, "%s: enter", __func__); + + /* + * Check if name of physical device in MD has changed. + */ + if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) { + /* + * Do basic sanity check on new device name/instance, + * if its non NULL. It is valid for the device name to + * have changed from a non NULL to a NULL value, i.e. + * the vsw is being changed to 'routed' mode. + */ + if ((strlen(physname) != 0) && + (ddi_parse(physname, drv, + &ddi_instance) != DDI_SUCCESS)) { + cmn_err(CE_WARN, "!vsw%d: new device name %s is not" + " a valid device name/instance", + vswp->instance, physname); + goto fail_reconf; + } + + if (strcmp(physname, vswp->physname)) { + D2(vswp, "%s: device name changed from %s to %s", + __func__, vswp->physname, physname); + + updated |= MD_physname; + } else { + D2(vswp, "%s: device name unchanged at %s", + __func__, vswp->physname); + } + } else { + cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical " + "device from updated MD.", vswp->instance); + goto fail_reconf; + } + + /* + * Check if MAC address has changed. + */ + if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) { + cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD", + vswp->instance); + goto fail_reconf; + } else { + READ_ENTER(&vswp->if_lockrw); + for (i = ETHERADDRL - 1; i >= 0; i--) { + if (vswp->if_addr.ether_addr_octet[i] + != (macaddr & 0xFF)) { + D2(vswp, "%s: octet[%d] 0x%x != 0x%x", + __func__, i, + vswp->if_addr.ether_addr_octet[i], + (macaddr & 0xFF)); + updated |= MD_macaddr; + break; + } + macaddr >>= 8; + } + RW_EXIT(&vswp->if_lockrw); + } + + /* + * Check if switching modes have changed. + */ + if (vsw_get_md_smodes(vswp, mdp, node, + new_smode, &smode_num)) { + cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD", + vswp->instance, smode_propname); + goto fail_reconf; + } else { + ASSERT(smode_num != 0); + if (smode_num != vswp->smode_num) { + D2(vswp, "%s: number of modes changed from %d to %d", + __func__, vswp->smode_num, smode_num); + } + + for (i = 0; i < smode_num; i++) { + if (new_smode[i] != vswp->smode[i]) { + D2(vswp, "%s: mode changed from %d to %d", + __func__, vswp->smode[i], new_smode[i]); + updated |= MD_smode; + break; + } + } + } + + /* + * Now make any changes which are needed... + */ + + if (updated & (MD_physname | MD_smode)) { + /* + * Disconnect all ports from the current card + */ + WRITE_ENTER(&plist->lockrw); + for (port = plist->head; port != NULL; port = port->p_next) { + /* Remove address if was programmed into HW. */ + if (vsw_unset_hw(vswp, port)) { + RW_EXIT(&plist->lockrw); + goto fail_update; + } + } + RW_EXIT(&plist->lockrw); + + /* + * Stop, detach the old device.. + */ + vsw_mac_detach(vswp); + + /* + * Update phys name. + */ + if (updated & MD_physname) { + cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s", + vswp->instance, vswp->physname, physname); + (void) strncpy(vswp->physname, + physname, strlen(physname) + 1); + + if (strlen(vswp->physname) > 0) + vswp->mdprops |= VSW_MD_PHYSNAME; + } + + /* + * Update array with the new switch mode values. + */ + if (updated & MD_smode) { + for (i = 0; i < smode_num; i++) + vswp->smode[i] = new_smode[i]; + + vswp->smode_num = smode_num; + vswp->smode_idx = 0; + } + + /* + * ..and attach, start the new device. + */ + if (vsw_setup_switching(vswp)) + goto fail_update; + + /* + * Connect ports to new card. + */ + WRITE_ENTER(&plist->lockrw); + for (port = plist->head; port != NULL; port = port->p_next) { + if (vsw_set_hw(vswp, port)) { + RW_EXIT(&plist->lockrw); + goto fail_update; + } + } + RW_EXIT(&plist->lockrw); + } + + if (updated & MD_macaddr) { + cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx", + vswp->instance, macaddr); + + WRITE_ENTER(&vswp->if_lockrw); + for (i = ETHERADDRL - 1; i >= 0; i--) { + vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF; + macaddr >>= 8; + } + RW_EXIT(&vswp->if_lockrw); + + /* + * Notify the MAC layer of the changed address. + */ + mac_unicst_update(vswp->if_mh, (uint8_t *)&vswp->if_addr); + } + + return; + +fail_reconf: + cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance); + return; + +fail_update: + cmn_err(CE_WARN, "!vsw%d: update of configuration failed", + vswp->instance); +} + +/* * Add a new port to the system. * * Returns 0 on success, 1 on failure. @@ -2463,6 +2909,8 @@ return (1); } + D2(vswp, "%s: %d nodes found", __func__, num_nodes); + /* allocate enough space for node list */ listsz = num_nodes * sizeof (mde_cookie_t); listp = kmem_zalloc(listsz, KM_SLEEP); @@ -2772,8 +3220,8 @@ WRITE_ENTER(&ldcl->lockrw); while (ldcl->num_ldcs > 0) { if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {; - cmn_err(CE_WARN, "unable to detach ldc %ld", - ldcl->head->ldc_id); + cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld", + vswp->instance, ldcl->head->ldc_id); RW_EXIT(&ldcl->lockrw); return (1); } @@ -2921,9 +3369,9 @@ * the device instance. Another attempt will be made * to free the pool when the device itself detaches. */ - cmn_err(CE_WARN, "Creation of ldc channel %ld failed" - " and cannot destroy associated mblk pool", - ldc_id); + cmn_err(CE_WARN, "!vsw%d: Creation of ldc channel %ld " + "failed and cannot destroy associated mblk " + "pool", vswp->instance, ldc_id); ldcp->rxh->nextp = vswp->rxh; vswp->rxh = ldcp->rxh; } @@ -3405,7 +3853,8 @@ rv = ldc_status(ldcp->ldc_handle, &ldcp->ldc_status); mutex_exit(&ldcp->status_lock); if (rv != 0) { - cmn_err(CE_WARN, "Unable to read channel state"); + cmn_err(CE_WARN, "!vsw%d: Unable to read channel state", + vswp->instance); goto vsw_cb_exit; } @@ -3586,16 +4035,17 @@ mutex_exit(&ldcp->hss_lock); if (ldcp->hcnt++ > vsw_num_handshakes) { - cmn_err(CE_WARN, "exceeded number of permitted " + cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted " "handshake attempts (%d) on channel %ld", - ldcp->hcnt, ldcp->ldc_id); + vswp->instance, ldcp->hcnt, ldcp->ldc_id); return; } if ((vswp->taskq_p == NULL) || (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp, DDI_NOSLEEP) != DDI_SUCCESS)) { - cmn_err(CE_WARN, "Can't dispatch version handshake task"); + cmn_err(CE_WARN, "!vsw%d: Can't dispatch version handshake " + "task", vswp->instance); } D1(vswp, "vsw_restart_handshake: exit"); @@ -5324,7 +5774,7 @@ if (chain > vsw_chain_len) { D3(vswp, "%s(%lld): switching chain of %d " "msgs", __func__, ldcp->ldc_id, chain); - vsw_switch_frame(vswp, bp, VSW_VNETPORT, + vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT, ldcp->ldc_port, NULL); bp = NULL; break; @@ -5335,7 +5785,7 @@ if (bp != NULL) { D3(vswp, "%s(%lld): switching chain of %d msgs", __func__, ldcp->ldc_id, chain); - vsw_switch_frame(vswp, bp, VSW_VNETPORT, + vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT, ldcp->ldc_port, NULL); } @@ -5636,7 +6086,7 @@ sizeof (vio_ibnd_desc_t)); /* send the packet to be switched */ - vsw_switch_frame(vswp, mp, VSW_VNETPORT, + vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL); break; @@ -5648,8 +6098,8 @@ idx = ibnd_desc->hdr.desc_handle; if (idx >= VSW_RING_NUM_EL) { - cmn_err(CE_WARN, "%s: corrupted ACK received " - "(idx %ld)", __func__, idx); + cmn_err(CE_WARN, "!vsw%d: corrupted ACK received " + "(idx %ld)", vswp->instance, idx); return; } @@ -6708,7 +7158,8 @@ dring_msg = vsw_create_dring_info_pkt(ldcp); if (dring_msg == NULL) { - cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg"); + cmn_err(CE_WARN, "!vsw%d: %s: error creating msg", + vswp->instance, __func__); return; } @@ -6856,6 +7307,7 @@ if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key, (mod_hash_val_t *)&port) != 0) { + D2(vswp, "%s: no port found", __func__); return (NULL); } @@ -6875,14 +7327,18 @@ mcst_addr_t *mcst_p = NULL; vsw_t *vswp = port->p_vswp; uint64_t addr = 0x0; - int i, ret; + int i; D1(vswp, "%s: enter", __func__); D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count); - if (vswp->mh == NULL) + mutex_enter(&vswp->mac_lock); + if (vswp->mh == NULL) { + mutex_exit(&vswp->mac_lock); return (1); + } + mutex_exit(&vswp->mac_lock); for (i = 0; i < mcst_pkt->count; i++) { /* @@ -6926,16 +7382,20 @@ * just increments a ref counter (which is * used when the address is being deleted) */ - ret = mac_multicst_add(vswp->mh, - (uchar_t *)&mcst_pkt->mca[i]); - if (ret) { - cmn_err(CE_WARN, "!unable to add " - "multicast address"); + mutex_enter(&vswp->mac_lock); + if ((vswp->mh == NULL) || + mac_multicst_add(vswp->mh, + (uchar_t *)&mcst_pkt->mca[i])) { + mutex_exit(&vswp->mac_lock); + cmn_err(CE_WARN, "!vsw%d: unable to " + "add multicast address", + vswp->instance); (void) vsw_del_mcst(vswp, VSW_VNETPORT, addr, port); vsw_del_addr(VSW_VNETPORT, port, addr); - return (ret); + return (1); } + mutex_exit(&vswp->mac_lock); } else { DERR(vswp, "%s: error adding multicast " @@ -6964,8 +7424,17 @@ * if other ports are interested in this * address. */ - (void) mac_multicst_remove(vswp->mh, - (uchar_t *)&mcst_pkt->mca[i]); + mutex_enter(&vswp->mac_lock); + if ((vswp->mh == NULL) || + mac_multicst_remove(vswp->mh, + (uchar_t *)&mcst_pkt->mca[i])) { + mutex_exit(&vswp->mac_lock); + cmn_err(CE_WARN, "!vsw%d: unable to " + "remove multicast address", + vswp->instance); + return (1); + } + mutex_exit(&vswp->mac_lock); } else { DERR(vswp, "%s: error deleting multicast "
--- a/usr/src/uts/sun4v/sys/vsw.h Fri Nov 24 05:40:55 2006 -0800 +++ b/usr/src/uts/sun4v/sys/vsw.h Fri Nov 24 06:52:47 2006 -0800 @@ -387,10 +387,16 @@ /* * Vsw queue -- largely modeled after squeue + * + * VSW_QUEUE_RUNNING, vqueue thread for queue is running. + * VSW_QUEUE_DRAINED, vqueue thread has drained current work and is exiting. + * VSW_QUEUE_STOP, request for the vqueue thread to stop. + * VSW_QUEUE_STOPPED, vqueue thread is not running. */ #define VSW_QUEUE_RUNNING 0x01 -#define VSW_QUEUE_STOP 0x02 -#define VSW_QUEUE_DRAINED 0x04 +#define VSW_QUEUE_DRAINED 0x02 +#define VSW_QUEUE_STOP 0x04 +#define VSW_QUEUE_STOPPED 0x08 typedef struct vsw_queue_s { kmutex_t vq_lock; /* Lock, before using any member. */ @@ -482,8 +488,12 @@ krwlock_t mfdbrw; /* rwlock for mFDB */ vio_mblk_pool_t *rxh; /* Receive pool handle */ + void (*vsw_switch_frame) + (struct vsw *, mblk_t *, int, + vsw_port_t *, mac_resource_handle_t); /* mac layer */ + kmutex_t mac_lock; /* protect fields below */ mac_handle_t mh; mac_rx_handle_t mrh; multiaddress_capab_t maddr; /* Multiple uni addr capable */ @@ -504,6 +514,7 @@ /* Machine Description updates */ mdeg_node_spec_t *inst_spec; mdeg_handle_t mdeg_hdl; + mdeg_handle_t mdeg_port_hdl; /* if configured as an ethernet interface */ mac_handle_t if_mh; /* MAC handle */