changeset 10958:2d0d7434a4fb
6538700 xnf->xnb protocol should support multicast add/remove
6648615 xnb/xnf should support feature-no-csum-offload correctly
6758615 WARNING: The xnf driver requires a dom0 that supports 'feature-rx-copy'
6746372 A hot lock in xnb limits scalability between guest domains
6729609 High CPU utilization @domU while doing Tx at one guest domain due to a hot lock in the xnf driver
author | David Edmondson <dme@sun.com>
---|---
date | Thu, 05 Nov 2009 01:05:36 -0800
parents | 7681ab1c3e80
children | 03b72d60ca65
files | usr/src/uts/common/xen/io/xnb.c usr/src/uts/common/xen/io/xnb.h usr/src/uts/common/xen/io/xnbo.c usr/src/uts/common/xen/io/xnbu.c usr/src/uts/common/xen/io/xnf.c usr/src/uts/common/xen/io/xnf.h
diffstat | 6 files changed, 2387 insertions(+), 2060 deletions(-)
--- a/usr/src/uts/common/xen/io/xnb.c Wed Nov 04 21:40:43 2009 -0800 +++ b/usr/src/uts/common/xen/io/xnb.c Thu Nov 05 01:05:36 2009 -0800 @@ -35,7 +35,7 @@ #include <sys/modctl.h> #include <sys/conf.h> #include <sys/mac.h> -#include <sys/mac_impl.h> /* XXXXBOW - remove, included for mac_fix_cksum() */ +#include <sys/mac_impl.h> /* For mac_fix_cksum(). */ #include <sys/dlpi.h> #include <sys/strsubr.h> #include <sys/strsun.h> @@ -49,11 +49,10 @@ #include <sys/evtchn_impl.h> #include <sys/gnttab.h> #include <vm/vm_dep.h> - +#include <sys/note.h> #include <sys/gld.h> #include <inet/ip.h> #include <inet/ip_impl.h> -#include <sys/vnic_impl.h> /* blech. */ /* * The terms "transmit" and "receive" are used in alignment with domU, @@ -62,23 +61,9 @@ */ /* - * XXPV dme: things to do, as well as various things indicated - * throughout the source: - * - copy avoidance outbound. - * - copy avoidance inbound. - * - transfer credit limiting. - * - MAC address based filtering. + * Should we allow guests to manipulate multicast group membership? */ - -/* - * Should we attempt to defer checksum calculation? - */ -static boolean_t xnb_cksum_offload = B_TRUE; -/* - * When receiving packets from a guest, should they be copied - * or used as-is (esballoc)? - */ -static boolean_t xnb_tx_always_copy = B_TRUE; +static boolean_t xnb_multicast_control = B_TRUE; static boolean_t xnb_connect_rings(dev_info_t *); static void xnb_disconnect_rings(dev_info_t *); @@ -89,31 +74,55 @@ static int xnb_txbuf_constructor(void *, void *, int); static void xnb_txbuf_destructor(void *, void *); -static xnb_txbuf_t *xnb_txbuf_get(xnb_t *, int); -static void xnb_txbuf_put(xnb_t *, xnb_txbuf_t *); -static void xnb_tx_notify_peer(xnb_t *); -static void xnb_tx_complete(xnb_txbuf_t *); +static void xnb_tx_notify_peer(xnb_t *, boolean_t); static void xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t); -static void xnb_tx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *, - xnb_txbuf_t *); -static void xnb_tx_perform_pending_unmop(xnb_t *); + +mblk_t *xnb_to_peer(xnb_t *, mblk_t *); mblk_t *xnb_copy_to_peer(xnb_t *, mblk_t *); -int xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2; -int xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2); - +static void setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *, + size_t, size_t, size_t, grant_ref_t); +#pragma inline(setup_gop) +static boolean_t is_foreign(void *); +#pragma inline(is_foreign) -boolean_t xnb_hv_copy = B_TRUE; -boolean_t xnb_explicit_pageflip_set = B_FALSE; - -/* XXPV dme: are these really invalid? */ #define INVALID_GRANT_HANDLE ((grant_handle_t)-1) #define INVALID_GRANT_REF ((grant_ref_t)-1) -static kmem_cache_t *xnb_txbuf_cachep; static kmutex_t xnb_alloc_page_lock; /* + * On a 32 bit PAE system physical and machine addresses are larger + * than 32 bits. ddi_btop() on such systems take an unsigned long + * argument, and so addresses above 4G are truncated before ddi_btop() + * gets to see them. To avoid this, code the shift operation here. 
+ */ +#define xnb_btop(addr) ((addr) >> PAGESHIFT) + +/* DMA attributes for transmit and receive data */ +static ddi_dma_attr_t buf_dma_attr = { + DMA_ATTR_V0, /* version of this structure */ + 0, /* lowest usable address */ + 0xffffffffffffffffULL, /* highest usable address */ + 0x7fffffff, /* maximum DMAable byte count */ + MMU_PAGESIZE, /* alignment in bytes */ + 0x7ff, /* bitmap of burst sizes */ + 1, /* minimum transfer */ + 0xffffffffU, /* maximum transfer */ + 0xffffffffffffffffULL, /* maximum segment length */ + 1, /* maximum number of segments */ + 1, /* granularity */ + 0, /* flags (reserved) */ +}; + +/* DMA access attributes for data: NOT to be byte swapped. */ +static ddi_device_acc_attr_t data_accattr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC +}; + +/* * Statistics. */ static char *aux_statistics[] = { @@ -226,14 +235,15 @@ } /* - * Software checksum calculation and insertion for an arbitrary packet. + * Calculate and insert the transport checksum for an arbitrary packet. */ -/*ARGSUSED*/ static mblk_t * xnb_software_csum(xnb_t *xnbp, mblk_t *mp) { + _NOTE(ARGUNUSED(xnbp)); + /* - * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least + * XXPV dme: shouldn't rely on mac_fix_cksum(), not least * because it doesn't cover all of the interesting cases :-( */ (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, @@ -254,7 +264,7 @@ /* * Check that the packet is contained in a single mblk. In - * the "from peer" path this is true today, but will change + * the "from peer" path this is true today, but may change * when scatter gather support is added. In the "to peer" * path we cannot be sure, but in most cases it will be true * (in the xnbo case the packet has come from a MAC device @@ -393,7 +403,8 @@ xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data) { xnb_t *xnbp; - char *xsname, mac[ETHERADDRL * 3]; + char *xsname; + char cachename[32]; xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP); @@ -404,18 +415,17 @@ xnbp->xnb_irq = B_FALSE; xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE; xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE; - xnbp->xnb_cksum_offload = xnb_cksum_offload; xnbp->xnb_connected = B_FALSE; xnbp->xnb_hotplugged = B_FALSE; xnbp->xnb_detachable = B_FALSE; xnbp->xnb_peer = xvdi_get_oeid(dip); - xnbp->xnb_tx_pages_writable = B_FALSE; - xnbp->xnb_tx_always_copy = xnb_tx_always_copy; + xnbp->xnb_be_status = XNB_STATE_INIT; + xnbp->xnb_fe_status = XNB_STATE_INIT; xnbp->xnb_tx_buf_count = 0; - xnbp->xnb_tx_unmop_count = 0; - xnbp->xnb_hv_copy = B_FALSE; + xnbp->xnb_rx_hv_copy = B_FALSE; + xnbp->xnb_multicast_control = B_FALSE; xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); ASSERT(xnbp->xnb_rx_va != NULL); @@ -424,18 +434,28 @@ != DDI_SUCCESS) goto failure; - /* allocated on demand, when/if we enter xnb_copy_to_peer() */ + /* Allocated on demand, when/if we enter xnb_copy_to_peer(). */ xnbp->xnb_rx_cpop = NULL; - xnbp->xnb_cpop_sz = 0; + xnbp->xnb_rx_cpop_count = 0; mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER, xnbp->xnb_icookie); mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER, xnbp->xnb_icookie); + mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER, + xnbp->xnb_icookie); - /* set driver private pointer now */ + /* Set driver private pointer now. 
*/ ddi_set_driver_private(dip, xnbp); + (void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip)); + xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename, + sizeof (xnb_txbuf_t), 0, + xnb_txbuf_constructor, xnb_txbuf_destructor, + NULL, xnbp, NULL, 0); + if (xnbp->xnb_tx_buf_cache == NULL) + goto failure_0; + if (!xnb_ks_init(xnbp)) goto failure_1; @@ -457,16 +477,12 @@ xsname = xvdi_get_xsname(dip); if (xenbus_printf(XBT_NULL, xsname, - "feature-no-csum-offload", "%d", - xnbp->xnb_cksum_offload ? 0 : 1) != 0) + "feature-multicast-control", "%d", + xnb_multicast_control ? 1 : 0) != 0) goto failure_3; - /* - * Use global xnb_hv_copy to export this feature. This means that - * we have to decide what to do before starting up a guest domain - */ if (xenbus_printf(XBT_NULL, xsname, - "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0) + "feature-rx-copy", "%d", 1) != 0) goto failure_3; /* * Linux domUs seem to depend on "feature-rx-flip" being 0 @@ -475,23 +491,8 @@ * but we might as well play nice. */ if (xenbus_printf(XBT_NULL, xsname, - "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0) - goto failure_3; - - if (xenbus_scanf(XBT_NULL, xsname, - "mac", "%s", mac) != 0) { - cmn_err(CE_WARN, "xnb_attach: " - "cannot read mac address from %s", - xsname); + "feature-rx-flip", "%d", 0) != 0) goto failure_3; - } - - if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) { - cmn_err(CE_WARN, - "xnb_attach: cannot parse mac address %s", - mac); - goto failure_3; - } (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait); (void) xvdi_post_event(dip, XEN_HP_ADD); @@ -505,6 +506,10 @@ xnb_ks_free(xnbp); failure_1: + kmem_cache_destroy(xnbp->xnb_tx_buf_cache); + +failure_0: + mutex_destroy(&xnbp->xnb_state_lock); mutex_destroy(&xnbp->xnb_rx_lock); mutex_destroy(&xnbp->xnb_tx_lock); @@ -514,7 +519,6 @@ return (DDI_FAILURE); } -/*ARGSUSED*/ void xnb_detach(dev_info_t *dip) { @@ -530,14 +534,17 @@ xnb_ks_free(xnbp); + kmem_cache_destroy(xnbp->xnb_tx_buf_cache); + ddi_set_driver_private(dip, NULL); + mutex_destroy(&xnbp->xnb_state_lock); + mutex_destroy(&xnbp->xnb_rx_lock); mutex_destroy(&xnbp->xnb_tx_lock); - mutex_destroy(&xnbp->xnb_rx_lock); - if (xnbp->xnb_cpop_sz > 0) - kmem_free(xnbp->xnb_rx_cpop, sizeof (*xnbp->xnb_rx_cpop) - * xnbp->xnb_cpop_sz); + if (xnbp->xnb_rx_cpop_count > 0) + kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0]) + * xnbp->xnb_rx_cpop_count); ASSERT(xnbp->xnb_rx_va != NULL); vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE); @@ -545,7 +552,12 @@ kmem_free(xnbp, sizeof (*xnbp)); } - +/* + * Allocate a page from the hypervisor to be flipped to the peer. + * + * Try to get pages in batches to reduce the overhead of calls into + * the balloon driver. + */ static mfn_t xnb_alloc_page(xnb_t *xnbp) { @@ -591,10 +603,16 @@ #undef WARNING_RATE_LIMIT } -/*ARGSUSED*/ +/* + * Free a page back to the hypervisor. + * + * This happens only in the error path, so batching is not worth the + * complication. + */ static void xnb_free_page(xnb_t *xnbp, mfn_t mfn) { + _NOTE(ARGUNUSED(xnbp)); int r; pfn_t pfn; @@ -602,10 +620,6 @@ pfnzero(pfn, 0, PAGESIZE); xen_release_pfn(pfn); - /* - * This happens only in the error path, so batching is - * not worth the complication. - */ if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) { cmn_err(CE_WARN, "free_page: cannot decrease memory " "reservation (%d): page kept but unusable (mfn = 0x%lx).", @@ -614,8 +628,8 @@ } /* - * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but - * using local variables. 
+ * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using + * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer(). */ #define XNB_RING_HAS_UNCONSUMED_REQUESTS(_r) \ ((((_r)->sring->req_prod - loop) < \ @@ -623,6 +637,9 @@ ((_r)->sring->req_prod - loop) : \ (RING_SIZE(_r) - (loop - prod))) +/* + * Pass packets to the peer using page flipping. + */ mblk_t * xnb_to_peer(xnb_t *xnbp, mblk_t *mp) { @@ -835,66 +852,38 @@ return (mp); } -/* helper functions for xnb_copy_to_peer */ +/* Helper functions for xnb_copy_to_peer(). */ /* * Grow the array of copy operation descriptors. - * Returns a pointer to the next available entry. */ -gnttab_copy_t * -grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop) +static boolean_t +grow_cpop_area(xnb_t *xnbp) { - /* - * o_cpop (arg.1) is a ptr to the area we would like to copy - * something into but cannot, because we haven't alloc'ed it - * yet, or NULL. - * old_cpop and new_cpop (local) are pointers to old/new - * versions of xnbp->xnb_rx_cpop. - */ - gnttab_copy_t *new_cpop, *old_cpop, *ret_cpop; - size_t newcount; + size_t count; + gnttab_copy_t *new; ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock)); - old_cpop = xnbp->xnb_rx_cpop; - /* - * o_cpop is a pointer into the array pointed to by old_cpop; - * it would be an error for exactly one of these pointers to be NULL. - * We shouldn't call this function if xnb_rx_cpop has already - * been allocated, but we're starting to fill it from the beginning - * again. - */ - ASSERT((o_cpop == NULL && old_cpop == NULL) || - (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop)); + count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT; - newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT; - - new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP); - if (new_cpop == NULL) { + if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) { xnbp->xnb_stat_other_allocation_failure++; - return (NULL); + return (B_FALSE); } - if (o_cpop != NULL) { - size_t offset = (o_cpop - old_cpop); - - /* we only need to move the parts in use ... */ - (void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz * - (sizeof (*old_cpop))); + bcopy(xnbp->xnb_rx_cpop, new, + sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count); - kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop)); + kmem_free(xnbp->xnb_rx_cpop, + sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count); - ret_cpop = new_cpop + offset; - } else { - ret_cpop = new_cpop; - } - - xnbp->xnb_rx_cpop = new_cpop; - xnbp->xnb_cpop_sz = newcount; + xnbp->xnb_rx_cpop = new; + xnbp->xnb_rx_cpop_count = count; xnbp->xnb_stat_rx_cpoparea_grown++; - return (ret_cpop); + return (B_TRUE); } /* @@ -903,9 +892,9 @@ static boolean_t is_foreign(void *addr) { - pfn_t pfn = hat_getpfnum(kas.a_hat, addr); + pfn_t pfn = hat_getpfnum(kas.a_hat, addr); - return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE); + return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN); } /* @@ -965,17 +954,23 @@ gp->dest.domid = xnbp->xnb_peer; } +/* + * Pass packets to the peer using hypervisor copy operations. + */ mblk_t * xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp) { mblk_t *free = mp, *mp_prev = NULL, *saved_mp = mp; mblk_t *ml, *ml_prev; - gnttab_copy_t *gop_cp; boolean_t notify; RING_IDX loop, prod; int i; - if (!xnbp->xnb_hv_copy) + /* + * If the peer does not pre-post buffers for received packets, + * use page flipping to pass packets to it. + */ + if (!xnbp->xnb_rx_hv_copy) return (xnb_to_peer(xnbp, mp)); /* @@ -989,13 +984,12 @@ * * NOTE ad 2. 
* In order to reduce the number of hypercalls, we prepare - * several packets (mp->b_cont != NULL) for the peer and - * perform a single hypercall to transfer them. - * We also have to set up a seperate copy operation for - * every page. + * several mblks (mp->b_cont != NULL) for the peer and + * perform a single hypercall to transfer them. We also have + * to set up a seperate copy operation for every page. * - * If we have more than one message (mp->b_next != NULL), - * we do this whole dance repeatedly. + * If we have more than one packet (mp->b_next != NULL), we do + * this whole dance repeatedly. */ mutex_enter(&xnbp->xnb_rx_lock); @@ -1013,12 +1007,12 @@ while ((mp != NULL) && XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) { netif_rx_request_t *rxreq; + size_t d_offset, len; + int item_count; + gnttab_copy_t *gop_cp; netif_rx_response_t *rxresp; - size_t d_offset; - size_t len; uint16_t cksum_flags; int16_t status = NETIF_RSP_OKAY; - int item_count; /* 1 */ rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop); @@ -1038,8 +1032,9 @@ gop_cp = xnbp->xnb_rx_cpop; /* - * We walk the b_cont pointers and set up a gop_cp - * structure for every page in every data block we have. + * We walk the b_cont pointers and set up a + * gnttab_copy_t for each sub-page chunk in each data + * block. */ /* 2a */ for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) { @@ -1048,8 +1043,13 @@ size_t r_offset; /* - * If we get an mblk on a page that doesn't belong to - * this domain, get a new mblk to replace the old one. + * The hypervisor will not allow us to + * reference a foreign page (e.g. one + * belonging to another domain) by mfn in the + * copy operation. If the data in this mblk is + * on such a page we must copy the data into a + * local page before initiating the hypervisor + * copy operation. */ if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) { mblk_t *ml_new = replace_msg(ml, chunk, @@ -1080,15 +1080,14 @@ while (chunk > 0) { size_t part_len; - item_count++; - if (item_count > xnbp->xnb_cpop_sz) { - gop_cp = grow_cpop_area(xnbp, gop_cp); - if (gop_cp == NULL) + if (item_count == xnbp->xnb_rx_cpop_count) { + if (!grow_cpop_area(xnbp)) goto failure; + gop_cp = &xnbp->xnb_rx_cpop[item_count]; } /* * If our mblk crosses a page boundary, we need - * to do a seperate copy for every page. + * to do a seperate copy for each page. */ if (r_offset + chunk > PAGESIZE) { part_len = PAGESIZE - r_offset; @@ -1116,8 +1115,10 @@ */ r_offset = 0; gop_cp++; + item_count++; } ml_prev = ml; + DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int, chunk, int, len, int, item_count); } @@ -1152,7 +1153,7 @@ for (i = 0; i < item_count; i++) { if (xnbp->xnb_rx_cpop[i].status != 0) { - DTRACE_PROBE2(cpop__status__nonnull, int, + DTRACE_PROBE2(cpop_status_nonnull, int, (int)xnbp->xnb_rx_cpop[i].status, int, i); status = NETIF_RSP_ERROR; @@ -1213,54 +1214,9 @@ return (mp); } -/*ARGSUSED*/ -static int -xnb_txbuf_constructor(void *buf, void *arg, int kmflag) -{ - xnb_txbuf_t *txp = buf; - - bzero(txp, sizeof (*txp)); - - txp->xt_free_rtn.free_func = xnb_tx_complete; - txp->xt_free_rtn.free_arg = (caddr_t)txp; - - txp->xt_mop.host_addr = - (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE, - ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ? - VM_NOSLEEP : VM_SLEEP); - - if (txp->xt_mop.host_addr == NULL) { - cmn_err(CE_WARN, "xnb_txbuf_constructor: " - "cannot get address space"); - return (-1); - } - - /* - * Have the hat ensure that page table exists for the VA. 
- */ - hat_prepare_mapping(kas.a_hat, - (caddr_t)(uintptr_t)txp->xt_mop.host_addr, NULL); - - return (0); -} - -/*ARGSUSED*/ -static void -xnb_txbuf_destructor(void *buf, void *arg) -{ - xnb_txbuf_t *txp = buf; - - ASSERT(txp->xt_mop.host_addr != NULL); - ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == 0); - - hat_release_mapping(kas.a_hat, - (caddr_t)(uintptr_t)txp->xt_mop.host_addr); - vmem_free(heap_arena, - (caddr_t)(uintptr_t)txp->xt_mop.host_addr, PAGESIZE); -} static void -xnb_tx_notify_peer(xnb_t *xnbp) +xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force) { boolean_t notify; @@ -1268,7 +1224,7 @@ /* LINTED: constant in conditional context */ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify); - if (notify) { + if (notify || force) { ec_notify_via_evtchn(xnbp->xnb_evtchn); xnbp->xnb_stat_tx_notify_sent++; } else { @@ -1277,18 +1233,6 @@ } static void -xnb_tx_complete(xnb_txbuf_t *txp) -{ - xnb_t *xnbp = txp->xt_xnbp; - - ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == XNB_TXBUF_INUSE); - - mutex_enter(&xnbp->xnb_tx_lock); - xnb_tx_schedule_unmop(xnbp, &txp->xt_mop, txp); - mutex_exit(&xnbp->xnb_tx_lock); -} - -static void xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status) { RING_IDX i; @@ -1311,185 +1255,105 @@ } static void -xnb_tx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop, - xnb_txbuf_t *txp) +xnb_txbuf_recycle(xnb_txbuf_t *txp) { - gnttab_unmap_grant_ref_t *unmop; - int u_count; - int reqs_on_ring; + xnb_t *xnbp = txp->xt_xnbp; - ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock)); - ASSERT(xnbp->xnb_tx_unmop_count < NET_TX_RING_SIZE); + kmem_cache_free(xnbp->xnb_tx_buf_cache, txp); + + xnbp->xnb_tx_buf_outstanding--; +} - u_count = xnbp->xnb_tx_unmop_count++; - - /* Cache data for the time when we actually unmap grant refs */ - xnbp->xnb_tx_unmop_txp[u_count] = txp; +static int +xnb_txbuf_constructor(void *buf, void *arg, int kmflag) +{ + _NOTE(ARGUNUSED(kmflag)); + xnb_txbuf_t *txp = buf; + xnb_t *xnbp = arg; + size_t len; + ddi_dma_cookie_t dma_cookie; + uint_t ncookies; - unmop = &xnbp->xnb_tx_unmop[u_count]; - unmop->host_addr = mop->host_addr; - unmop->dev_bus_addr = mop->dev_bus_addr; - unmop->handle = mop->handle; + txp->xt_free_rtn.free_func = xnb_txbuf_recycle; + txp->xt_free_rtn.free_arg = (caddr_t)txp; + txp->xt_xnbp = xnbp; + txp->xt_next = NULL; + + if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr, + 0, 0, &txp->xt_dma_handle) != DDI_SUCCESS) + goto failure; + + if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr, + DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len, + &txp->xt_acc_handle) != DDI_SUCCESS) + goto failure_1; - /* - * We cannot check the ring once we're disconnected from it. Batching - * doesn't seem to be a useful optimisation in this case either, - * so we directly call into the actual unmap function. - */ - if (xnbp->xnb_connected) { - reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_tx_ring); + if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf, + len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0, + &dma_cookie, &ncookies) + != DDI_DMA_MAPPED) + goto failure_2; + ASSERT(ncookies == 1); + + txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress); + txp->xt_buflen = dma_cookie.dmac_size; + + DTRACE_PROBE(txbuf_allocated); + + atomic_add_32(&xnbp->xnb_tx_buf_count, 1); + xnbp->xnb_tx_buf_outstanding++; + + return (0); + +failure_2: + ddi_dma_mem_free(&txp->xt_acc_handle); - /* - * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch" - * or (with N == 1) "immediate unmop" behaviour. 
- * The "> xnb_unmop_lowwat" is a guard against ring exhaustion. - */ - if (xnbp->xnb_tx_unmop_count < xnb_unmop_hiwat && - reqs_on_ring > xnb_unmop_lowwat) - return; - } +failure_1: + ddi_dma_free_handle(&txp->xt_dma_handle); + +failure: + + return (-1); +} - xnb_tx_perform_pending_unmop(xnbp); +static void +xnb_txbuf_destructor(void *buf, void *arg) +{ + xnb_txbuf_t *txp = buf; + xnb_t *xnbp = arg; + + (void) ddi_dma_unbind_handle(txp->xt_dma_handle); + ddi_dma_mem_free(&txp->xt_acc_handle); + ddi_dma_free_handle(&txp->xt_dma_handle); + + atomic_add_32(&xnbp->xnb_tx_buf_count, -1); } /* - * Here we perform the actual unmapping of the data that was - * accumulated in xnb_tx_schedule_unmop(). - * Note that it is the caller's responsibility to make sure that - * there's actually something there to unmop. + * Take packets from the peer and deliver them onward. */ -static void -xnb_tx_perform_pending_unmop(xnb_t *xnbp) -{ - RING_IDX loop; -#ifdef XNB_DEBUG - gnttab_unmap_grant_ref_t *unmop; -#endif /* XNB_DEBUG */ - - ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock)); - ASSERT(xnbp->xnb_tx_unmop_count > 0); - - if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, - xnbp->xnb_tx_unmop, xnbp->xnb_tx_unmop_count) < 0) { - cmn_err(CE_WARN, "xnb_tx_perform_pending_unmop: " - "unmap grant operation failed, " - "%d pages lost", xnbp->xnb_tx_unmop_count); - } - -#ifdef XNB_DEBUG - for (loop = 0, unmop = xnbp->xnb_tx_unmop; - loop < xnbp->xnb_tx_unmop_count; - loop++, unmop++) { - if (unmop->status != 0) { - cmn_err(CE_WARN, "xnb_tx_perform_pending_unmop: " - "unmap grant reference failed (%d)", - unmop->status); - } - } -#endif /* XNB_DEBUG */ - - for (loop = 0; loop < xnbp->xnb_tx_unmop_count; loop++) { - xnb_txbuf_t *txp = xnbp->xnb_tx_unmop_txp[loop]; - - if (txp == NULL) - cmn_err(CE_PANIC, - "xnb_tx_perform_pending_unmop: " - "unexpected NULL txp (loop %d; count %d)!", - loop, xnbp->xnb_tx_unmop_count); - - if (xnbp->xnb_connected) - xnb_tx_mark_complete(xnbp, txp->xt_id, txp->xt_status); - xnb_txbuf_put(xnbp, txp); - } - if (xnbp->xnb_connected) - xnb_tx_notify_peer(xnbp); - - xnbp->xnb_tx_unmop_count = 0; - -#ifdef XNB_DEBUG - bzero(xnbp->xnb_tx_unmop, sizeof (xnbp->xnb_tx_unmop)); - bzero(xnbp->xnb_tx_unmop_txp, sizeof (xnbp->xnb_tx_unmop_txp)); -#endif /* XNB_DEBUG */ -} - -static xnb_txbuf_t * -xnb_txbuf_get(xnb_t *xnbp, int flags) -{ - xnb_txbuf_t *txp; - - ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock)); - - txp = kmem_cache_alloc(xnb_txbuf_cachep, flags); - if (txp != NULL) { - ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == 0); - txp->xt_flags |= XNB_TXBUF_INUSE; - - txp->xt_xnbp = xnbp; - txp->xt_mop.dom = xnbp->xnb_peer; - - txp->xt_mop.flags = GNTMAP_host_map; - if (!xnbp->xnb_tx_pages_writable) - txp->xt_mop.flags |= GNTMAP_readonly; - - xnbp->xnb_tx_buf_count++; - } - - return (txp); -} - -static void -xnb_txbuf_put(xnb_t *xnbp, xnb_txbuf_t *txp) -{ - ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock)); - ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == XNB_TXBUF_INUSE); - - txp->xt_flags &= ~XNB_TXBUF_INUSE; - xnbp->xnb_tx_buf_count--; - - kmem_cache_free(xnb_txbuf_cachep, txp); -} - static mblk_t * xnb_from_peer(xnb_t *xnbp) { RING_IDX start, end, loop; - gnttab_map_grant_ref_t *mop; + gnttab_copy_t *cop; xnb_txbuf_t **txpp; netif_tx_request_t *txreq; - boolean_t work_to_do; + boolean_t work_to_do, need_notify = B_FALSE; mblk_t *head, *tail; - /* - * If the peer granted a read-only mapping to the page then we - * must copy the data, as the local protocol stack (should the - * packet be destined for this host) will 
modify the packet - * 'in place'. - */ - boolean_t copy = xnbp->xnb_tx_always_copy || - !xnbp->xnb_tx_pages_writable; + int n_data_req, i; - /* - * For each individual request, the sequence of actions is: - * - * 1. get the request. - * 2. map the page based on the grant ref. - * 3. allocate an mblk, copy the data to it. - * 4. release the grant. - * 5. update the ring. - * 6. pass the packet upward. - * 7. kick the peer. - * - * In fact, we try to perform the grant operations in batches, - * so there are two loops. - */ + ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock)); head = tail = NULL; around: - ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock)); /* LINTED: constant in conditional context */ RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do); if (!work_to_do) { finished: + xnb_tx_notify_peer(xnbp, need_notify); + return (head); } @@ -1517,118 +1381,147 @@ goto around; } - for (loop = start, mop = xnbp->xnb_tx_mop, txpp = xnbp->xnb_tx_bufp; - loop != end; - loop++, mop++, txpp++) { - xnb_txbuf_t *txp; + loop = start; + cop = xnbp->xnb_tx_cop; + txpp = xnbp->xnb_tx_bufp; + n_data_req = 0; + + while (loop < end) { + txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop); + + if (txreq->flags & NETTXF_extra_info) { + struct netif_extra_info *erp; + boolean_t status; + + loop++; /* Consume another slot in the ring. */ + ASSERT(loop <= end); + + erp = (struct netif_extra_info *) + RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop); - txp = xnb_txbuf_get(xnbp, KM_NOSLEEP); - if (txp == NULL) - break; + switch (erp->type) { + case XEN_NETIF_EXTRA_TYPE_MCAST_ADD: + ASSERT(xnbp->xnb_multicast_control); + status = xnbp->xnb_flavour->xf_mcast_add(xnbp, + &erp->u.mcast.addr); + break; + case XEN_NETIF_EXTRA_TYPE_MCAST_DEL: + ASSERT(xnbp->xnb_multicast_control); + status = xnbp->xnb_flavour->xf_mcast_del(xnbp, + &erp->u.mcast.addr); + break; + default: + status = B_FALSE; + cmn_err(CE_WARN, "xnb_from_peer: " + "unknown extra type %d", erp->type); + break; + } - ASSERT(xnbp->xnb_tx_pages_writable || - ((txp->xt_mop.flags & GNTMAP_readonly) - == GNTMAP_readonly)); + xnb_tx_mark_complete(xnbp, txreq->id, + status ? 
NETIF_RSP_OKAY : NETIF_RSP_ERROR); + need_notify = B_TRUE; + } else { + xnb_txbuf_t *txp; + + txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache, + KM_NOSLEEP); + if (txp == NULL) + break; + + txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf, + txp->xt_buflen, 0, &txp->xt_free_rtn); + if (txp->xt_mblk == NULL) { + kmem_cache_free(xnbp->xnb_tx_buf_cache, txp); + break; + } + + txp->xt_idx = loop; + txp->xt_id = txreq->id; - txp->xt_mop.ref = - RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref; + cop->source.u.ref = txreq->gref; + cop->source.domid = xnbp->xnb_peer; + cop->source.offset = txreq->offset; + + cop->dest.u.gmfn = txp->xt_mfn; + cop->dest.domid = DOMID_SELF; + cop->dest.offset = 0; - *mop = txp->xt_mop; - *txpp = txp; + cop->len = txreq->size; + cop->flags = GNTCOPY_source_gref; + cop->status = 0; + + *txpp = txp; + + txpp++; + cop++; + n_data_req++; + + ASSERT(n_data_req <= NET_TX_RING_SIZE); + } + + loop++; } - if ((loop - start) == 0) - goto finished; + xnbp->xnb_tx_ring.req_cons = loop; - end = loop; + if (n_data_req == 0) + goto around; - if (xen_map_gref(GNTTABOP_map_grant_ref, xnbp->xnb_tx_mop, - end - start, B_FALSE) != 0) { + if (HYPERVISOR_grant_table_op(GNTTABOP_copy, + xnbp->xnb_tx_cop, n_data_req) != 0) { - cmn_err(CE_WARN, "xnb_from_peer: map grant operation failed"); - - loop = start; - txpp = xnbp->xnb_tx_bufp; + cmn_err(CE_WARN, "xnb_from_peer: copy operation failed"); - while (loop != end) { - xnb_txbuf_put(xnbp, *txpp); - - loop++; + txpp = xnbp->xnb_tx_bufp; + i = n_data_req; + while (i > 0) { + kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp); txpp++; + i--; } goto finished; } - for (loop = start, mop = xnbp->xnb_tx_mop, txpp = xnbp->xnb_tx_bufp; - loop != end; - loop++, mop++, txpp++) { - mblk_t *mp = NULL; - int16_t status = NETIF_RSP_OKAY; + txpp = xnbp->xnb_tx_bufp; + cop = xnbp->xnb_tx_cop; + i = n_data_req; + + while (i > 0) { xnb_txbuf_t *txp = *txpp; - if (mop->status != 0) { - cmn_err(CE_WARN, "xnb_from_peer: " - "failed to map buffer: %d", - mop->status); - status = NETIF_RSP_ERROR; - } - - txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop); + txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx); - if (status == NETIF_RSP_OKAY) { - if (copy) { - mp = allocb(txreq->size, BPRI_MED); - if (mp == NULL) { - status = NETIF_RSP_ERROR; - xnbp->xnb_stat_tx_allocb_failed++; - } else { - bcopy((caddr_t)(uintptr_t) - mop->host_addr + txreq->offset, - mp->b_wptr, txreq->size); - mp->b_wptr += txreq->size; - } - } else { - mp = desballoc((uchar_t *)(uintptr_t) - mop->host_addr + txreq->offset, - txreq->size, 0, &txp->xt_free_rtn); - if (mp == NULL) { - status = NETIF_RSP_ERROR; - xnbp->xnb_stat_tx_allocb_failed++; - } else { - txp->xt_id = txreq->id; - txp->xt_status = status; - txp->xt_mop = *mop; + if (cop->status != 0) { +#ifdef XNB_DEBUG + cmn_err(CE_WARN, "xnb_from_peer: " + "txpp 0x%p failed (%d)", + (void *)*txpp, cop->status); +#endif /* XNB_DEBUG */ + xnb_tx_mark_complete(xnbp, txp->xt_id, cop->status); + freemsg(txp->xt_mblk); + } else { + mblk_t *mp; - mp->b_wptr += txreq->size; - } - } + mp = txp->xt_mblk; + mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf; + mp->b_wptr += txreq->size; + mp->b_next = NULL; /* - * If we have a buffer and there are checksum - * flags, process them appropriately. + * If there are checksum flags, process them + * appropriately. 
*/ - if ((mp != NULL) && - ((txreq->flags & + if ((txreq->flags & (NETTXF_csum_blank | NETTXF_data_validated)) - != 0)) { + != 0) { mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp, mp, txreq->flags); xnbp->xnb_stat_tx_cksum_no_need++; - } - } - if (copy || (mp == NULL)) { - txp->xt_status = status; - txp->xt_id = txreq->id; - xnb_tx_schedule_unmop(xnbp, mop, txp); - } + txp->xt_mblk = mp; + } - if (mp != NULL) { - xnbp->xnb_stat_opackets++; - xnbp->xnb_stat_obytes += txreq->size; - - mp->b_next = NULL; if (head == NULL) { ASSERT(tail == NULL); head = mp; @@ -1637,18 +1530,22 @@ tail->b_next = mp; } tail = mp; + + xnbp->xnb_stat_opackets++; + xnbp->xnb_stat_obytes += txreq->size; + + xnb_tx_mark_complete(xnbp, txp->xt_id, cop->status); } + + txpp++; + cop++; + i--; } - xnbp->xnb_tx_ring.req_cons = loop; - goto around; /* NOTREACHED */ } -/* - * intr() -- ring interrupt service routine - */ static uint_t xnb_intr(caddr_t arg) { @@ -1683,52 +1580,142 @@ return (DDI_INTR_CLAIMED); } +/* + * Read our configuration from xenstore. + */ +boolean_t +xnb_read_xs_config(xnb_t *xnbp) +{ + char *xsname; + char mac[ETHERADDRL * 3]; + + xsname = xvdi_get_xsname(xnbp->xnb_devinfo); + + if (xenbus_scanf(XBT_NULL, xsname, + "mac", "%s", mac) != 0) { + cmn_err(CE_WARN, "xnb_attach: " + "cannot read mac address from %s", + xsname); + return (B_FALSE); + } + + if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) { + cmn_err(CE_WARN, + "xnb_attach: cannot parse mac address %s", + mac); + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * Read the configuration of the peer from xenstore. + */ +boolean_t +xnb_read_oe_config(xnb_t *xnbp) +{ + char *oename; + int i; + + oename = xvdi_get_oename(xnbp->xnb_devinfo); + + if (xenbus_gather(XBT_NULL, oename, + "event-channel", "%u", &xnbp->xnb_fe_evtchn, + "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref, + "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref, + NULL) != 0) { + cmn_err(CE_WARN, "xnb_read_oe_config: " + "cannot read other-end details from %s", + oename); + return (B_FALSE); + } + + /* + * Check whether our peer requests receive side hypervisor + * copy. + */ + if (xenbus_scanf(XBT_NULL, oename, + "request-rx-copy", "%d", &i) != 0) + i = 0; + if (i != 0) + xnbp->xnb_rx_hv_copy = B_TRUE; + + /* + * Check whether our peer requests multicast_control. + */ + if (xenbus_scanf(XBT_NULL, oename, + "request-multicast-control", "%d", &i) != 0) + i = 0; + if (i != 0) + xnbp->xnb_multicast_control = B_TRUE; + + /* + * The Linux backend driver here checks to see if the peer has + * set 'feature-no-csum-offload'. This is used to indicate + * that the guest cannot handle receiving packets without a + * valid checksum. We don't check here, because packets passed + * to the peer _always_ have a valid checksum. + * + * There are three cases: + * + * - the NIC is dedicated: packets from the wire should always + * have a valid checksum. If the hardware validates the + * checksum then the relevant bit will be set in the packet + * attributes and we will inform the peer. It can choose to + * ignore the hardware verification. + * + * - the NIC is shared (VNIC) and a packet originates from the + * wire: this is the same as the case above - the packets + * will have a valid checksum. + * + * - the NIC is shared (VNIC) and a packet originates from the + * host: the MAC layer ensures that all such packets have a + * valid checksum by calculating one if the stack did not. 
+ */ + + return (B_TRUE); +} + +void +xnb_start_connect(xnb_t *xnbp) +{ + dev_info_t *dip = xnbp->xnb_devinfo; + + if (!xnb_connect_rings(dip)) { + cmn_err(CE_WARN, "xnb_start_connect: " + "cannot connect rings"); + goto failed; + } + + if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) { + cmn_err(CE_WARN, "xnb_start_connect: " + "flavour failed to connect"); + goto failed; + } + + (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected); + return; + +failed: + xnbp->xnb_flavour->xf_peer_disconnected(xnbp); + xnb_disconnect_rings(dip); + (void) xvdi_switch_state(dip, XBT_NULL, + XenbusStateClosed); + (void) xvdi_post_event(dip, XEN_HP_REMOVE); +} + static boolean_t xnb_connect_rings(dev_info_t *dip) { xnb_t *xnbp = ddi_get_driver_private(dip); - char *oename; struct gnttab_map_grant_ref map_op; - evtchn_port_t evtchn; - int i; /* * Cannot attempt to connect the rings if already connected. */ ASSERT(!xnbp->xnb_connected); - oename = xvdi_get_oename(dip); - - if (xenbus_gather(XBT_NULL, oename, - "event-channel", "%u", &evtchn, - "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref, - "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref, - NULL) != 0) { - cmn_err(CE_WARN, "xnb_connect_rings: " - "cannot read other-end details from %s", - oename); - goto fail; - } - - if (xenbus_scanf(XBT_NULL, oename, - "feature-tx-writable", "%d", &i) != 0) - i = 0; - if (i != 0) - xnbp->xnb_tx_pages_writable = B_TRUE; - - if (xenbus_scanf(XBT_NULL, oename, - "feature-no-csum-offload", "%d", &i) != 0) - i = 0; - if ((i == 1) || !xnbp->xnb_cksum_offload) - xnbp->xnb_cksum_offload = B_FALSE; - - /* Check whether our peer knows and requests hypervisor copy */ - if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i) - != 0) - i = 0; - if (i != 0) - xnbp->xnb_hv_copy = B_TRUE; - /* * 1. allocate a vaddr for the tx page, one for the rx page. * 2. call GNTTABOP_map_grant_ref to map the relevant pages @@ -1736,8 +1723,7 @@ * 3. call EVTCHNOP_bind_interdomain to have the event channel * bound to this domain. * 4. associate the event channel with an interrupt. - * 5. declare ourselves connected. - * 6. enable the interrupt. + * 5. enable the interrupt. 
*/ /* 1.tx */ @@ -1785,7 +1771,7 @@ (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE); /* 3 */ - if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) { + if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) { cmn_err(CE_WARN, "xnb_connect_rings: " "cannot bind event channel %d", xnbp->xnb_evtchn); xnbp->xnb_evtchn = INVALID_EVTCHN; @@ -1802,13 +1788,12 @@ mutex_enter(&xnbp->xnb_tx_lock); mutex_enter(&xnbp->xnb_rx_lock); - /* 5.1 */ xnbp->xnb_connected = B_TRUE; mutex_exit(&xnbp->xnb_rx_lock); mutex_exit(&xnbp->xnb_tx_lock); - /* 4, 6 */ + /* 4, 5 */ if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp) != DDI_SUCCESS) { cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt"); @@ -1816,9 +1801,6 @@ } xnbp->xnb_irq = B_TRUE; - /* 5.2 */ - (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected); - return (B_TRUE); fail: @@ -1826,6 +1808,7 @@ mutex_enter(&xnbp->xnb_rx_lock); xnbp->xnb_connected = B_FALSE; + mutex_exit(&xnbp->xnb_rx_lock); mutex_exit(&xnbp->xnb_tx_lock); @@ -1842,9 +1825,6 @@ xnbp->xnb_irq = B_FALSE; } - if (xnbp->xnb_tx_unmop_count > 0) - xnb_tx_perform_pending_unmop(xnbp); - if (xnbp->xnb_evtchn != INVALID_EVTCHN) { xvdi_free_evtchn(dip); xnbp->xnb_evtchn = INVALID_EVTCHN; @@ -1895,11 +1875,11 @@ } } -/*ARGSUSED*/ static void xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data) { + _NOTE(ARGUNUSED(id, arg)); xnb_t *xnbp = ddi_get_driver_private(dip); XenbusState new_state = *(XenbusState *)impl_data; @@ -1911,16 +1891,24 @@ if (xnbp->xnb_connected) return; - if (xnb_connect_rings(dip)) { - xnbp->xnb_flavour->xf_peer_connected(xnbp); - } else { - xnbp->xnb_flavour->xf_peer_disconnected(xnbp); - xnb_disconnect_rings(dip); + if (!xnb_read_oe_config(xnbp) || + !xnbp->xnb_flavour->xf_peer_connected(xnbp)) { + cmn_err(CE_WARN, "xnb_oe_state_change: " + "read otherend config error"); (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed); (void) xvdi_post_event(dip, XEN_HP_REMOVE); + + break; } + + mutex_enter(&xnbp->xnb_state_lock); + xnbp->xnb_fe_status = XNB_STATE_READY; + if (xnbp->xnb_be_status == XNB_STATE_READY) + xnb_start_connect(xnbp); + mutex_exit(&xnbp->xnb_state_lock); + /* * Now that we've attempted to connect it's reasonable * to allow an attempt to detach. 
@@ -1964,33 +1952,42 @@ } } -/*ARGSUSED*/ static void xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data) { + _NOTE(ARGUNUSED(id, arg)); xnb_t *xnbp = ddi_get_driver_private(dip); xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data; - boolean_t success; ASSERT(xnbp != NULL); switch (state) { case Connected: - /* spurious hotplug event */ if (xnbp->xnb_hotplugged) - return; + break; - success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp); + if (!xnb_read_xs_config(xnbp)) + break; + + if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp)) + break; mutex_enter(&xnbp->xnb_tx_lock); mutex_enter(&xnbp->xnb_rx_lock); - xnbp->xnb_hotplugged = success; + xnbp->xnb_hotplugged = B_TRUE; mutex_exit(&xnbp->xnb_rx_lock); mutex_exit(&xnbp->xnb_tx_lock); + + mutex_enter(&xnbp->xnb_state_lock); + xnbp->xnb_be_status = XNB_STATE_READY; + if (xnbp->xnb_fe_status == XNB_STATE_READY) + xnb_start_connect(xnbp); + mutex_exit(&xnbp->xnb_state_lock); + break; default: @@ -2013,16 +2010,10 @@ mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL); - xnb_txbuf_cachep = kmem_cache_create("xnb_txbuf_cachep", - sizeof (xnb_txbuf_t), 0, xnb_txbuf_constructor, - xnb_txbuf_destructor, NULL, NULL, NULL, 0); - ASSERT(xnb_txbuf_cachep != NULL); + i = mod_install(&modlinkage); + if (i != DDI_SUCCESS) + mutex_destroy(&xnb_alloc_page_lock); - i = mod_install(&modlinkage); - if (i != DDI_SUCCESS) { - kmem_cache_destroy(xnb_txbuf_cachep); - mutex_destroy(&xnb_alloc_page_lock); - } return (i); } @@ -2038,9 +2029,8 @@ int i; i = mod_remove(&modlinkage); - if (i == DDI_SUCCESS) { - kmem_cache_destroy(xnb_txbuf_cachep); + if (i == DDI_SUCCESS) mutex_destroy(&xnb_alloc_page_lock); - } + return (i); }
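The core of the xnb.c rework is visible in the new xnb_from_peer() above: instead of mapping the peer's granted page (and later unmapping it through the shared unmop batching machinery), the backend now asks the hypervisor to copy the data straight into a pre-allocated local page. A minimal sketch of how one such copy descriptor is filled, using the field names from the diff — the helper name `fill_tx_copy_op` is invented for illustration, and ring bookkeeping and error handling are elided:

```c
/*
 * Illustrative sketch (not a drop-in driver function): fill one
 * hypervisor copy descriptor that pulls `size' bytes from the peer's
 * granted page into a local page named by machine frame number.
 */
static void
fill_tx_copy_op(xnb_t *xnbp, gnttab_copy_t *cop, grant_ref_t gref,
    uint16_t offset, uint16_t size, mfn_t local_mfn)
{
	cop->source.u.ref = gref;		/* peer's page, by grant ref */
	cop->source.domid = xnbp->xnb_peer;
	cop->source.offset = offset;

	cop->dest.u.gmfn = local_mfn;		/* our page, by frame number */
	cop->dest.domid = DOMID_SELF;
	cop->dest.offset = 0;

	cop->len = size;
	cop->flags = GNTCOPY_source_gref;	/* source named by grant ref */
	cop->status = 0;
}
```

A batch of these descriptors is then submitted in a single HYPERVISOR_grant_table_op(GNTTABOP_copy, ...) hypercall, which removes the per-packet map/unmap traffic (and the unmop high/low-water batching) that made the old transmit lock hot.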
--- a/usr/src/uts/common/xen/io/xnb.h Wed Nov 04 21:40:43 2009 -0800 +++ b/usr/src/uts/common/xen/io/xnb.h Thu Nov 05 01:05:36 2009 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * xnb.h - definitions for Xen dom0 network driver @@ -54,6 +54,9 @@ #define XNBRING 0x20 #define XNBCKSUM 0x40 +#define XNB_STATE_INIT 0x01 +#define XNB_STATE_READY 0x02 + typedef struct xnb xnb_t; /* @@ -70,19 +73,32 @@ */ typedef struct xnb_flavour { void (*xf_from_peer)(xnb_t *, mblk_t *); - void (*xf_peer_connected)(xnb_t *); + boolean_t (*xf_peer_connected)(xnb_t *); void (*xf_peer_disconnected)(xnb_t *); boolean_t (*xf_hotplug_connected)(xnb_t *); + boolean_t (*xf_start_connect)(xnb_t *); mblk_t *(*xf_cksum_from_peer)(xnb_t *, mblk_t *, uint16_t); uint16_t (*xf_cksum_to_peer)(xnb_t *, mblk_t *); + boolean_t (*xf_mcast_add)(xnb_t *, ether_addr_t *); + boolean_t (*xf_mcast_del)(xnb_t *, ether_addr_t *); } xnb_flavour_t; typedef struct xnb_txbuf { frtn_t xt_free_rtn; xnb_t *xt_xnbp; - gnttab_map_grant_ref_t xt_mop; + struct xnb_txbuf *xt_next; RING_IDX xt_id; + RING_IDX xt_idx; uint16_t xt_status; + + ddi_dma_handle_t xt_dma_handle; + ddi_acc_handle_t xt_acc_handle; + caddr_t xt_buf; + size_t xt_buflen; + mfn_t xt_mfn; + + mblk_t *xt_mblk; + unsigned int xt_flags; #define XNB_TXBUF_INUSE 0x01 @@ -140,17 +156,18 @@ kstat_t *xnb_kstat_aux; - boolean_t xnb_cksum_offload; - ddi_iblock_cookie_t xnb_icookie; kmutex_t xnb_rx_lock; kmutex_t xnb_tx_lock; + kmutex_t xnb_state_lock; - int xnb_tx_unmop_count; - int xnb_tx_buf_count; - boolean_t xnb_tx_pages_writable; - boolean_t xnb_tx_always_copy; + int xnb_be_status; + int xnb_fe_status; + + kmem_cache_t *xnb_tx_buf_cache; + uint32_t xnb_tx_buf_count; + int xnb_tx_buf_outstanding; netif_rx_back_ring_t xnb_rx_ring; /* rx interface struct ptr */ void *xnb_rx_ring_addr; @@ -166,22 +183,22 @@ boolean_t xnb_hotplugged; boolean_t xnb_detachable; int xnb_evtchn; /* channel to front end */ + evtchn_port_t xnb_fe_evtchn; domid_t xnb_peer; - xnb_txbuf_t *xnb_tx_bufp[NET_TX_RING_SIZE]; - gnttab_map_grant_ref_t xnb_tx_mop[NET_TX_RING_SIZE]; - gnttab_unmap_grant_ref_t xnb_tx_unmop[NET_TX_RING_SIZE]; - - /* store information for unmop */ - xnb_txbuf_t *xnb_tx_unmop_txp[NET_TX_RING_SIZE]; + xnb_txbuf_t *xnb_tx_bufp[NET_TX_RING_SIZE]; + gnttab_copy_t xnb_tx_cop[NET_TX_RING_SIZE]; caddr_t xnb_rx_va; gnttab_transfer_t xnb_rx_top[NET_RX_RING_SIZE]; - boolean_t xnb_hv_copy; /* do we do hypervisor copy? */ + boolean_t xnb_rx_hv_copy; + boolean_t xnb_multicast_control; + boolean_t xnb_no_csum_offload; + gnttab_copy_t *xnb_rx_cpop; #define CPOP_DEFCNT 8 - size_t xnb_cpop_sz; /* in elements, not bytes */ + size_t xnb_rx_cpop_count; /* in elements */ }; extern int xnb_attach(dev_info_t *, xnb_flavour_t *, void *);
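The new XNB_STATE_* values and the xnb_be_status/xnb_fe_status pair in xnb.h support a two-sided rendezvous: the hotplug script (backend) and the peer's xenstore transition (frontend) each mark their half ready, and whichever side completes second starts the connection. A condensed sketch of the pattern that xnb.c applies in both event handlers — the helper name `mark_ready_and_maybe_connect` is invented for illustration:

```c
/*
 * Sketch of the connect rendezvous implied by the new state fields.
 * Both the hotplug path and the otherend-change path run this shape
 * of code; the second side to become ready initiates the connection.
 */
static void
mark_ready_and_maybe_connect(xnb_t *xnbp, boolean_t is_backend)
{
	mutex_enter(&xnbp->xnb_state_lock);

	if (is_backend)
		xnbp->xnb_be_status = XNB_STATE_READY;
	else
		xnbp->xnb_fe_status = XNB_STATE_READY;

	/* Connect only once both halves have reached READY. */
	if (xnbp->xnb_be_status == XNB_STATE_READY &&
	    xnbp->xnb_fe_status == XNB_STATE_READY)
		xnb_start_connect(xnbp);

	mutex_exit(&xnbp->xnb_state_lock);
}
```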
--- a/usr/src/uts/common/xen/io/xnbo.c Wed Nov 04 21:40:43 2009 -0800 +++ b/usr/src/uts/common/xen/io/xnbo.c Thu Nov 05 01:05:36 2009 -0800 @@ -46,7 +46,16 @@ #include <sys/pattr.h> #include <xen/sys/xenbus_impl.h> #include <xen/sys/xendev.h> +#include <sys/sdt.h> +#include <sys/note.h> +/* Track multicast addresses. */ +typedef struct xmca { + struct xmca *next; + ether_addr_t addr; +} xmca_t; + +/* State about this device instance. */ typedef struct xnbo { mac_handle_t o_mh; mac_client_handle_t o_mch; @@ -55,9 +64,14 @@ boolean_t o_running; boolean_t o_promiscuous; uint32_t o_hcksum_capab; + xmca_t *o_mca; + char o_link_name[LIFNAMSIZ]; + boolean_t o_need_rx_filter; + boolean_t o_need_setphysaddr; + boolean_t o_multicast_control; } xnbo_t; -static void xnbo_close_mac(xnbo_t *); +static void xnbo_close_mac(xnb_t *); /* * Packets from the peer come here. We pass them to the mac device. @@ -85,6 +99,10 @@ freemsgchain(mp); } +/* + * Process the checksum flags `flags' provided by the peer for the + * packet `mp'. + */ static mblk_t * xnbo_cksum_from_peer(xnb_t *xnbp, mblk_t *mp, uint16_t flags) { @@ -94,11 +112,6 @@ if ((flags & NETTXF_csum_blank) != 0) { /* - * It would be nice to ASSERT that xnbp->xnb_cksum_offload - * is TRUE here, but some peers insist on assuming - * that it is available even when they have been told - * otherwise. - * * The checksum in the packet is blank. Determine * whether we can do hardware offload and, if so, * update the flags on the mblk according. If not, @@ -111,10 +124,16 @@ return (mp); } +/* + * Calculate the checksum flags to be relayed to the peer for the + * packet `mp'. + */ static uint16_t xnbo_cksum_to_peer(xnb_t *xnbp, mblk_t *mp) { + _NOTE(ARGUNUSED(xnbp)); uint16_t r = 0; + uint32_t pflags, csum; /* * We might also check for HCK_PARTIALCKSUM here and, @@ -126,29 +145,24 @@ * capabilities tend to use HCK_FULLCKSUM on the receive side * - they are actually saying that in the output path the * caller must use HCK_PARTIALCKSUM. + * + * Then again, if a NIC supports HCK_PARTIALCKSUM in its' + * output path, the host IP stack will use it. If such packets + * are destined for the peer (i.e. looped around) we would + * gain some advantage. */ - if (xnbp->xnb_cksum_offload) { - uint32_t pflags, csum; + hcksum_retrieve(mp, NULL, NULL, NULL, NULL, + NULL, &csum, &pflags); - /* - * XXPV dme: Pull in improved hcksum_retrieve() from - * Crossbow, which gives back the csum in the seventh - * argument for HCK_FULLCKSUM. - */ - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, - NULL, NULL, &pflags); - csum = DB_CKSUM16(mp); - - /* - * If the MAC driver has asserted that the checksum is - * good, let the peer know. - */ - if (((pflags & HCK_FULLCKSUM) != 0) && - (((pflags & HCK_FULLCKSUM_OK) != 0) || - (csum == 0xffff))) - r |= NETRXF_data_validated; - } + /* + * If the MAC driver has asserted that the checksum is + * good, let the peer know. + */ + if (((pflags & HCK_FULLCKSUM) != 0) && + (((pflags & HCK_FULLCKSUM_OK) != 0) || + (csum == 0xffff))) + r |= NETRXF_data_validated; return (r); } @@ -174,11 +188,11 @@ * the destination mac address matches or it's a multicast/broadcast * address. 
*/ -/*ARGSUSED*/ static void xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t loopback) { + _NOTE(ARGUNUSED(loopback)); xnb_t *xnbp = arg; xnbo_t *xnbop = xnbp->xnb_flavour_data; mblk_t *next, *keep, *keep_head, *free, *free_head; @@ -230,16 +244,13 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) { xnbo_t *xnbop = xnbp->xnb_flavour_data; - int err, need_rx_filter, need_setphysaddr, need_promiscuous; + int err; const mac_info_t *mi; - char *xsname; void (*rx_fn)(void *, mac_resource_handle_t, mblk_t *, boolean_t); struct ether_addr ea; uint_t max_sdu; mac_diag_t diag; - xsname = xvdi_get_xsname(xnbp->xnb_devinfo); - if ((err = mac_open_by_linkname(mac, &xnbop->o_mh)) != 0) { cmn_err(CE_WARN, "xnbo_open_mac: " "cannot open mac for link %s (%d)", mac, err); @@ -253,14 +264,14 @@ if (mi->mi_media != DL_ETHER) { cmn_err(CE_WARN, "xnbo_open_mac: " "device is not DL_ETHER (%d)", mi->mi_media); - xnbo_close_mac(xnbop); + xnbo_close_mac(xnbp); return (B_FALSE); } if (mi->mi_media != mi->mi_nativemedia) { cmn_err(CE_WARN, "xnbo_open_mac: " "device media and native media mismatch (%d != %d)", mi->mi_media, mi->mi_nativemedia); - xnbo_close_mac(xnbop); + xnbo_close_mac(xnbp); return (B_FALSE); } @@ -268,7 +279,7 @@ if (max_sdu > XNBMAXPKT) { cmn_err(CE_WARN, "xnbo_open_mac: mac device SDU too big (%d)", max_sdu); - xnbo_close_mac(xnbop); + xnbo_close_mac(xnbp); return (B_FALSE); } @@ -286,40 +297,25 @@ MAC_OPEN_FLAGS_MULTI_PRIMARY) != 0) { cmn_err(CE_WARN, "xnbo_open_mac: " "error (%d) opening mac client", err); - xnbo_close_mac(xnbop); + xnbo_close_mac(xnbp); return (B_FALSE); } - /* - * Should the receive path filter packets from the downstream - * NIC before passing them to the peer? The default is "no". - */ - if (xenbus_scanf(XBT_NULL, xsname, - "SUNW-need-rx-filter", "%d", &need_rx_filter) != 0) - need_rx_filter = 0; - if (need_rx_filter > 0) + if (xnbop->o_need_rx_filter) rx_fn = xnbo_from_mac_filter; else rx_fn = xnbo_from_mac; - /* - * Should we set the underlying NIC into promiscuous mode? The - * default is "no". - */ - if (xenbus_scanf(XBT_NULL, xsname, - "SUNW-need-promiscuous", "%d", &need_promiscuous) != 0) { - need_promiscuous = 0; - } err = mac_unicast_add_set_rx(xnbop->o_mch, NULL, MAC_UNICAST_PRIMARY, - &xnbop->o_mah, 0, &diag, need_promiscuous == 0 ? rx_fn : - NULL, xnbp); + &xnbop->o_mah, 0, &diag, xnbop->o_multicast_control ? rx_fn : NULL, + xnbp); if (err != 0) { cmn_err(CE_WARN, "xnbo_open_mac: failed to get the primary " "MAC address of %s: %d", mac, err); - xnbo_close_mac(xnbop); + xnbo_close_mac(xnbp); return (B_FALSE); } - if (need_promiscuous != 0) { + if (!xnbop->o_multicast_control) { err = mac_promisc_add(xnbop->o_mch, MAC_CLIENT_PROMISC_ALL, rx_fn, xnbp, &xnbop->o_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP); @@ -327,24 +323,13 @@ cmn_err(CE_WARN, "xnbo_open_mac: " "cannot enable promiscuous mode of %s: %d", mac, err); - xnbo_close_mac(xnbop); + xnbo_close_mac(xnbp); return (B_FALSE); } xnbop->o_promiscuous = B_TRUE; } - if (!mac_capab_get(xnbop->o_mh, MAC_CAPAB_HCKSUM, - &xnbop->o_hcksum_capab)) - xnbop->o_hcksum_capab = 0; - - /* - * Should we set the physical address of the underlying NIC - * to match that assigned to the peer? The default is "no". 
- */ - if (xenbus_scanf(XBT_NULL, xsname, - "SUNW-need-set-physaddr", "%d", &need_setphysaddr) != 0) - need_setphysaddr = 0; - if (need_setphysaddr > 0) { + if (xnbop->o_need_setphysaddr) { err = mac_unicast_primary_set(xnbop->o_mh, xnbp->xnb_mac_addr); /* Warn, but continue on. */ if (err != 0) { @@ -356,41 +341,42 @@ } } + if (!mac_capab_get(xnbop->o_mh, MAC_CAPAB_HCKSUM, + &xnbop->o_hcksum_capab)) + xnbop->o_hcksum_capab = 0; + xnbop->o_running = B_TRUE; return (B_TRUE); } -/* - * xnb calls back here when the user-level hotplug code reports that - * the hotplug has successfully completed. For this flavour that means - * that the underlying MAC device that we will use is ready to be - * opened. - */ -static boolean_t -xnbo_hotplug(xnb_t *xnbp) +static void +xnbo_close_mac(xnb_t *xnbp) { - char *xsname; - char mac[LIFNAMSIZ]; + xnbo_t *xnbop = xnbp->xnb_flavour_data; + xmca_t *loop; - xsname = xvdi_get_xsname(xnbp->xnb_devinfo); - if (xenbus_scanf(XBT_NULL, xsname, "nic", "%s", mac) != 0) { - cmn_err(CE_WARN, "xnbo_hotplug: " - "cannot read nic name from %s", xsname); - return (B_FALSE); - } - - return (xnbo_open_mac(xnbp, mac)); -} - -static void -xnbo_close_mac(xnbo_t *xnbop) -{ if (xnbop->o_mh == NULL) return; - if (xnbop->o_running) { + if (xnbop->o_running) xnbop->o_running = B_FALSE; + + mutex_enter(&xnbp->xnb_state_lock); + loop = xnbop->o_mca; + xnbop->o_mca = NULL; + mutex_exit(&xnbp->xnb_state_lock); + + while (loop != NULL) { + xmca_t *next = loop->next; + + DTRACE_PROBE3(mcast_remove, + (char *), "close", + (void *), xnbp, + (etheraddr_t *), loop->addr); + (void) mac_multicast_remove(xnbop->o_mch, loop->addr); + kmem_free(loop, sizeof (*loop)); + loop = next; } if (xnbop->o_promiscuous) { @@ -419,32 +405,194 @@ } /* - * xnb calls back here when we successfully synchronize with the - * driver in the guest domain. In this flavour there is nothing to do as - * we open the underlying MAC device on successful hotplug completion. + * Hotplug has completed and we are connected to the peer. We have all + * the information we need to exchange traffic, so open the MAC device + * and configure it appropriately. + */ +static boolean_t +xnbo_start_connect(xnb_t *xnbp) +{ + xnbo_t *xnbop = xnbp->xnb_flavour_data; + + return (xnbo_open_mac(xnbp, xnbop->o_link_name)); +} + +/* + * The guest has successfully synchronize with this instance. We read + * the configuration of the guest from xenstore to check whether the + * guest requests multicast control. If not (the default) we make a + * note that the MAC device needs to be used in promiscious mode. + */ +static boolean_t +xnbo_peer_connected(xnb_t *xnbp) +{ + char *oename; + int request; + xnbo_t *xnbop = xnbp->xnb_flavour_data; + + oename = xvdi_get_oename(xnbp->xnb_devinfo); + + if (xenbus_scanf(XBT_NULL, oename, + "request-multicast-control", "%d", &request) != 0) + request = 0; + xnbop->o_multicast_control = (request > 0); + + return (B_TRUE); +} + +/* + * The guest domain has closed down the inter-domain connection. We + * close the underlying MAC device. */ -/*ARGSUSED*/ static void -xnbo_connected(xnb_t *xnbp) +xnbo_peer_disconnected(xnb_t *xnbp) +{ + xnbo_close_mac(xnbp); +} + +/* + * The hotplug script has completed. We read information from xenstore + * about our configuration, most notably the name of the MAC device we + * should use. 
+ */ +static boolean_t +xnbo_hotplug_connected(xnb_t *xnbp) { + char *xsname; + xnbo_t *xnbop = xnbp->xnb_flavour_data; + int need; + + xsname = xvdi_get_xsname(xnbp->xnb_devinfo); + + if (xenbus_scanf(XBT_NULL, xsname, + "nic", "%s", xnbop->o_link_name) != 0) { + cmn_err(CE_WARN, "xnbo_connect: " + "cannot read nic name from %s", xsname); + return (B_FALSE); + } + + if (xenbus_scanf(XBT_NULL, xsname, + "SUNW-need-rx-filter", "%d", &need) != 0) + need = 0; + xnbop->o_need_rx_filter = (need > 0); + + if (xenbus_scanf(XBT_NULL, xsname, + "SUNW-need-set-physaddr", "%d", &need) != 0) + need = 0; + xnbop->o_need_setphysaddr = (need > 0); + + return (B_TRUE); } /* - * xnb calls back here when the driver in the guest domain has closed - * down the inter-domain connection. We close the underlying MAC device. + * Find the multicast address `addr', return B_TRUE if it is one that + * we receive. If `remove', remove it from the set received. */ -static void -xnbo_disconnected(xnb_t *xnbp) +static boolean_t +xnbo_mcast_find(xnb_t *xnbp, ether_addr_t *addr, boolean_t remove) { - xnbo_close_mac(xnbp->xnb_flavour_data); + xnbo_t *xnbop = xnbp->xnb_flavour_data; + xmca_t *prev, *del, *this; + + ASSERT(MUTEX_HELD(&xnbp->xnb_state_lock)); + ASSERT(xnbop->o_promiscuous == B_FALSE); + + prev = del = NULL; + + this = xnbop->o_mca; + + while (this != NULL) { + if (bcmp(&this->addr, addr, sizeof (this->addr)) == 0) { + del = this; + if (remove) { + if (prev == NULL) + xnbop->o_mca = this->next; + else + prev->next = this->next; + } + break; + } + + prev = this; + this = this->next; + } + + if (del == NULL) + return (B_FALSE); + + if (remove) { + DTRACE_PROBE3(mcast_remove, + (char *), "remove", + (void *), xnbp, + (etheraddr_t *), del->addr); + mac_multicast_remove(xnbop->o_mch, del->addr); + kmem_free(del, sizeof (*del)); + } + + return (B_TRUE); +} + +/* + * Add the multicast address `addr' to the set received. + */ +static boolean_t +xnbo_mcast_add(xnb_t *xnbp, ether_addr_t *addr) +{ + xnbo_t *xnbop = xnbp->xnb_flavour_data; + boolean_t r = B_FALSE; + + ASSERT(xnbop->o_promiscuous == B_FALSE); + + mutex_enter(&xnbp->xnb_state_lock); + + if (xnbo_mcast_find(xnbp, addr, B_FALSE)) { + r = B_TRUE; + } else if (mac_multicast_add(xnbop->o_mch, + (const uint8_t *)addr) == 0) { + xmca_t *mca; + + DTRACE_PROBE3(mcast_add, + (char *), "add", + (void *), xnbp, + (etheraddr_t *), addr); + + mca = kmem_alloc(sizeof (*mca), KM_SLEEP); + bcopy(addr, &mca->addr, sizeof (mca->addr)); + + mca->next = xnbop->o_mca; + xnbop->o_mca = mca; + + r = B_TRUE; + } + + mutex_exit(&xnbp->xnb_state_lock); + + return (r); +} + +/* + * Remove the multicast address `addr' from the set received. 
+ */ +static boolean_t +xnbo_mcast_del(xnb_t *xnbp, ether_addr_t *addr) +{ + boolean_t r; + + mutex_enter(&xnbp->xnb_state_lock); + r = xnbo_mcast_find(xnbp, addr, B_TRUE); + mutex_exit(&xnbp->xnb_state_lock); + + return (r); } static int xnbo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { static xnb_flavour_t flavour = { - xnbo_to_mac, xnbo_connected, xnbo_disconnected, xnbo_hotplug, + xnbo_to_mac, xnbo_peer_connected, xnbo_peer_disconnected, + xnbo_hotplug_connected, xnbo_start_connect, xnbo_cksum_from_peer, xnbo_cksum_to_peer, + xnbo_mcast_add, xnbo_mcast_del, }; xnbo_t *xnbop; @@ -459,13 +607,6 @@ xnbop = kmem_zalloc(sizeof (*xnbop), KM_SLEEP); - xnbop->o_mh = NULL; - xnbop->o_mch = NULL; - xnbop->o_mah = NULL; - xnbop->o_mphp = NULL; - xnbop->o_running = B_FALSE; - xnbop->o_hcksum_capab = 0; - if (xnb_attach(dip, &flavour, xnbop) != DDI_SUCCESS) { kmem_free(xnbop, sizeof (*xnbop)); return (DDI_FAILURE); @@ -503,7 +644,7 @@ mutex_exit(&xnbp->xnb_rx_lock); mutex_exit(&xnbp->xnb_tx_lock); - xnbo_close_mac(xnbop); + xnbo_close_mac(xnbp); kmem_free(xnbop, sizeof (*xnbop)); xnb_detach(dip);
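xnbo only avoids promiscuous mode when the guest explicitly asks for multicast control, negotiated over xenstore: the backend advertises feature-multicast-control (written in xnb_attach() above) and the peer opts in with request-multicast-control. A sketch of the read side, following xnbo_peer_connected() above and treating a missing key as "not requested" — the helper name `peer_wants_mcast_control` is invented for illustration:

```c
/*
 * Sketch: did the peer opt in to multicast control? If not, xnbo
 * falls back to running the MAC client in promiscuous mode.
 */
static boolean_t
peer_wants_mcast_control(xnb_t *xnbp)
{
	int request;

	if (xenbus_scanf(XBT_NULL, xvdi_get_oename(xnbp->xnb_devinfo),
	    "request-multicast-control", "%d", &request) != 0)
		request = 0;	/* Absent key means "not requested". */

	return (request > 0);
}
```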
--- a/usr/src/uts/common/xen/io/xnbu.c Wed Nov 04 21:40:43 2009 -0800 +++ b/usr/src/uts/common/xen/io/xnbu.c Thu Nov 05 01:05:36 2009 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -43,6 +43,7 @@ #include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <xen/sys/xendev.h> +#include <sys/note.h> /* Required driver entry points for GLDv3 */ static int xnbu_m_start(void *); @@ -59,7 +60,7 @@ boolean_t u_need_sched; } xnbu_t; -static mac_callbacks_t xnb_callbacks = { +static mac_callbacks_t xnbu_callbacks = { MC_GETCAPAB, xnbu_m_stat, xnbu_m_start, @@ -147,28 +148,26 @@ static uint16_t xnbu_cksum_to_peer(xnb_t *xnbp, mblk_t *mp) { + _NOTE(ARGUNUSED(xnbp)); uint16_t r = 0; + uint32_t pflags; - if (xnbp->xnb_cksum_offload) { - uint32_t pflags; - - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, - NULL, NULL, &pflags); + hcksum_retrieve(mp, NULL, NULL, NULL, NULL, + NULL, NULL, &pflags); - /* - * If the protocol stack has requested checksum - * offload, inform the peer that we have not - * calculated the checksum. - */ - if ((pflags & HCK_FULLCKSUM) != 0) - r |= NETRXF_csum_blank; - } + /* + * If the protocol stack has requested checksum + * offload, inform the peer that we have not + * calculated the checksum. + */ + if ((pflags & HCK_FULLCKSUM) != 0) + r |= NETRXF_csum_blank; return (r); } -static void -xnbu_connected(xnb_t *xnbp) +static boolean_t +xnbu_start_connect(xnb_t *xnbp) { xnbu_t *xnbup = xnbp->xnb_flavour_data; @@ -177,10 +176,20 @@ * We are able to send packets now - bring them on. */ mac_tx_update(xnbup->u_mh); + + return (B_TRUE); +} + +static boolean_t +xnbu_peer_connected(xnb_t *xnbp) +{ + _NOTE(ARGUNUSED(xnbp)); + + return (B_TRUE); } static void -xnbu_disconnected(xnb_t *xnbp) +xnbu_peer_disconnected(xnb_t *xnbp) { xnbu_t *xnbup = xnbp->xnb_flavour_data; @@ -189,7 +198,7 @@ /*ARGSUSED*/ static boolean_t -xnbu_hotplug(xnb_t *xnbp) +xnbu_hotplug_connected(xnb_t *xnbp) { return (B_TRUE); } @@ -199,28 +208,30 @@ { xnb_t *xnbp = arg; xnbu_t *xnbup = xnbp->xnb_flavour_data; + boolean_t sched = B_FALSE; mp = xnb_copy_to_peer(arg, mp); - /* XXPV dme: playing with need_sched without txlock? */ - + mutex_enter(&xnbp->xnb_rx_lock); /* * If we consumed all of the mblk_t's offered, perhaps we need * to indicate that we can accept more. Otherwise we are full * and need to wait for space. */ if (mp == NULL) { - /* - * If a previous transmit attempt failed because the ring - * was full, try again now. - */ - if (xnbup->u_need_sched) { - xnbup->u_need_sched = B_FALSE; - mac_tx_update(xnbup->u_mh); - } + sched = xnbup->u_need_sched; + xnbup->u_need_sched = B_FALSE; } else { xnbup->u_need_sched = B_TRUE; } + mutex_exit(&xnbp->xnb_rx_lock); + + /* + * If a previous transmit attempt failed because the ring + * was full, try again now. + */ + if (sched) + mac_tx_update(xnbup->u_mh); return (mp); } @@ -327,16 +338,13 @@ static boolean_t xnbu_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { - xnb_t *xnbp = arg; + _NOTE(ARGUNUSED(arg)); switch (cap) { case MAC_CAPAB_HCKSUM: { uint32_t *capab = cap_data; - if (xnbp->xnb_cksum_offload) - *capab = HCKSUM_INET_PARTIAL; - else - *capab = 0; + *capab = HCKSUM_INET_PARTIAL; break; } default: @@ -346,12 +354,34 @@ return (B_TRUE); } +/* + * All packets are passed to the peer, so adding and removing + * multicast addresses is meaningless. 
+ */ +static boolean_t +xnbu_mcast_add(xnb_t *xnbp, ether_addr_t *addr) +{ + _NOTE(ARGUNUSED(xnbp, addr)); + + return (B_TRUE); +} + +static boolean_t +xnbu_mcast_del(xnb_t *xnbp, ether_addr_t *addr) +{ + _NOTE(ARGUNUSED(xnbp, addr)); + + return (B_TRUE); +} + static int xnbu_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { static xnb_flavour_t flavour = { - xnbu_to_host, xnbu_connected, xnbu_disconnected, xnbu_hotplug, + xnbu_to_host, xnbu_peer_connected, xnbu_peer_disconnected, + xnbu_hotplug_connected, xnbu_start_connect, xnbu_cksum_from_peer, xnbu_cksum_to_peer, + xnbu_mcast_add, xnbu_mcast_del, }; xnbu_t *xnbup; xnb_t *xnbp; @@ -392,7 +422,7 @@ */ mr->m_type_ident = MAC_PLUGIN_IDENT_ETHER; mr->m_src_addr = xnbp->xnb_mac_addr; - mr->m_callbacks = &xnb_callbacks; + mr->m_callbacks = &xnbu_callbacks; mr->m_min_sdu = 0; mr->m_max_sdu = XNBMAXPKT; /*
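Before moving on to the frontend, note the locking idiom introduced by the reworked xnbu transmit path above: the need-a-reschedule decision is recorded while holding xnb_rx_lock, but the mac_tx_update() upcall is made only after the lock has been dropped. A minimal sketch of the pattern, using illustrative names (my_state_t, flow_resume()) rather than the driver's own:

/*
 * Sketch: snapshot and clear the "need reschedule" flag under the
 * lock, then make the MAC upcall lock-free, so that other threads
 * never contend on the mutex for the duration of the upcall.
 */
typedef struct my_state {
	kmutex_t	s_lock;
	boolean_t	s_need_sched;	/* an earlier tx failed for want of space */
	mac_handle_t	s_mh;
} my_state_t;

static void
flow_resume(my_state_t *sp)
{
	boolean_t sched;

	mutex_enter(&sp->s_lock);
	sched = sp->s_need_sched;	/* snapshot under the lock */
	sp->s_need_sched = B_FALSE;
	mutex_exit(&sp->s_lock);

	if (sched)
		mac_tx_update(sp->s_mh);	/* upcall made without the lock held */
}

The xnf changes below apply the same flag-based scheme on the frontend: xnf_send() sets xnf_need_sched under xnf_schedlock when the ring fills, and xnf_intr() later clears it and calls mac_tx_update() once transmit slots are free again.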
--- a/usr/src/uts/common/xen/io/xnf.c Wed Nov 04 21:40:43 2009 -0800 +++ b/usr/src/uts/common/xen/io/xnf.c Thu Nov 05 01:05:36 2009 -0800 @@ -57,7 +57,50 @@ */ /* - * xnf.c - Nemo-based network driver for domU + * xnf.c - GLDv3 network driver for domU. + */ + +/* + * This driver uses four per-instance locks: + * + * xnf_gref_lock: + * + * Protects access to the grant reference list stored in + * xnf_gref_head. Grant references should be acquired and released + * using gref_get() and gref_put() respectively. + * + * xnf_schedlock: + * + * Protects: + * xnf_need_sched - used to record that a previous transmit attempt + * failed (and consequently it will be necessary to call + * mac_tx_update() when transmit resources are available). + * xnf_pending_multicast - the number of multicast requests that + * have been submitted to the backend for which we have not + * processed responses. + * + * xnf_txlock: + * + * Protects the transmit ring (xnf_tx_ring) and associated + * structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head). + * + * xnf_rxlock: + * + * Protects the receive ring (xnf_rx_ring) and associated + * structures (notably xnf_rx_pkt_info). + * + * If driver-global state that affects both the transmit and receive + * rings is manipulated, both xnf_txlock and xnf_rxlock should be + * held, in that order. + * + * xnf_schedlock is acquired both whilst holding xnf_txlock and + * without. It should always be acquired after xnf_txlock if both are + * held. + * + * Notes: + * - atomic_add_64() is used to manipulate counters where we require + * accuracy. For counters intended only for observation by humans, + * post increment/decrement are used instead. */ #include <sys/types.h> @@ -67,6 +110,7 @@ #include <sys/systm.h> #include <sys/stream.h> #include <sys/strsubr.h> +#include <sys/strsun.h> #include <sys/conf.h> #include <sys/ddi.h> #include <sys/devops.h> @@ -96,17 +140,18 @@ #include <sys/gnttab.h> #include <xen/sys/xendev.h> #include <sys/sdt.h> +#include <sys/note.h> +#include <sys/debug.h> #include <io/xnf.h> - -/* - * Declarations and Module Linkage - */ - #if defined(DEBUG) || defined(__lint) #define XNF_DEBUG -int xnfdebug = 0; +#endif + +#ifdef XNF_DEBUG +int xnf_debug = 0; +xnf_t *xnf_debug_instance = NULL; #endif /* @@ -117,23 +162,39 @@ */ #define xnf_btop(addr) ((addr) >> PAGESHIFT) -boolean_t xnf_cksum_offload = B_TRUE; - -/* Default value for hypervisor-based copy operations */ -boolean_t xnf_rx_hvcopy = B_TRUE; +unsigned int xnf_max_tx_frags = 1; + +/* + * Should we use the multicast control feature if the backend provides + * it? + */ +boolean_t xnf_multicast_control = B_TRUE; /* - * Should pages used for transmit be readonly for the peer? + * Received packets below this size are copied to a new streams buffer + * rather than being desballoc'ed. + * + * This value is chosen to accommodate traffic where there are a large + * number of small packets. For data showing a typical distribution, + * see: + * + * Sinha07a: + * Rishi Sinha, Christos Papadopoulos, and John + * Heidemann. Internet Packet Size Distributions: Some + * Observations. Technical Report ISI-TR-2007-643, + * USC/Information Sciences Institute, May, 2007. Originally + * released October 2005 as web page + * http://netweb.usc.edu/~sinha/pkt-sizes/. + * <http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>. */ -boolean_t xnf_tx_pages_readonly = B_FALSE; -/* - * Packets under this size are bcopied instead of using desballoc. - * Choose a value > XNF_FRAMESIZE (1514) to force the receive path to - * always copy. 
- */ -unsigned int xnf_rx_bcopy_thresh = 64; - -unsigned int xnf_max_tx_frags = 1; +size_t xnf_rx_copy_limit = 64; + +#define INVALID_GRANT_HANDLE ((grant_handle_t)-1) +#define INVALID_GRANT_REF ((grant_ref_t)-1) +#define INVALID_TX_ID ((uint16_t)-1) + +#define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)])) +#define TX_ID_VALID(i) (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE)) /* Required system entry points */ static int xnf_attach(dev_info_t *, ddi_attach_cmd_t); @@ -148,35 +209,46 @@ static mblk_t *xnf_send(void *, mblk_t *); static uint_t xnf_intr(caddr_t); static int xnf_stat(void *, uint_t, uint64_t *); -static void xnf_ioctl(void *, queue_t *, mblk_t *); static boolean_t xnf_getcapab(void *, mac_capab_t, void *); /* Driver private functions */ static int xnf_alloc_dma_resources(xnf_t *); static void xnf_release_dma_resources(xnf_t *); -static mblk_t *xnf_process_recv(xnf_t *); -static void xnf_rcv_complete(struct xnf_buffer_desc *); static void xnf_release_mblks(xnf_t *); -static struct xnf_buffer_desc *xnf_alloc_tx_buffer(xnf_t *); -static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *); -static struct xnf_buffer_desc *xnf_get_tx_buffer(xnf_t *); -static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *); -static void xnf_free_buffer(struct xnf_buffer_desc *); -static void xnf_free_tx_buffer(struct xnf_buffer_desc *); + +static int xnf_buf_constructor(void *, void *, int); +static void xnf_buf_destructor(void *, void *); +static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t); +#pragma inline(xnf_buf_get) +static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t); +#pragma inline(xnf_buf_put) +static void xnf_buf_refresh(xnf_buf_t *); +#pragma inline(xnf_buf_refresh) +static void xnf_buf_recycle(xnf_buf_t *); + +static int xnf_tx_buf_constructor(void *, void *, int); +static void xnf_tx_buf_destructor(void *, void *); + +static grant_ref_t gref_get(xnf_t *); +#pragma inline(gref_get) +static void gref_put(xnf_t *, grant_ref_t); +#pragma inline(gref_put) + +static xnf_txid_t *txid_get(xnf_t *); +#pragma inline(txid_get) +static void txid_put(xnf_t *, xnf_txid_t *); +#pragma inline(txid_put) + void xnf_send_driver_status(int, int); -static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *); -static int xnf_clean_tx_ring(xnf_t *); +static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *); +static int xnf_tx_clean_ring(xnf_t *); static void oe_state_change(dev_info_t *, ddi_eventcookie_t, void *, void *); -static mblk_t *xnf_process_hvcopy_recv(xnf_t *xnfp); -static boolean_t xnf_hvcopy_peer_status(dev_info_t *devinfo); -static boolean_t xnf_kstat_init(xnf_t *xnfp); - -/* - * XXPV dme: remove MC_IOCTL? 
- */ +static boolean_t xnf_kstat_init(xnf_t *); +static void xnf_rx_collect(xnf_t *); + static mac_callbacks_t xnf_callbacks = { - MC_IOCTL | MC_GETCAPAB, + MC_GETCAPAB, xnf_stat, xnf_start, xnf_stop, @@ -184,14 +256,10 @@ xnf_set_multicast, xnf_set_mac_addr, xnf_send, - xnf_ioctl, + NULL, xnf_getcapab }; -#define GRANT_INVALID_REF 0 -const int xnf_rx_bufs_lowat = 4 * NET_RX_RING_SIZE; -const int xnf_rx_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */ - /* DMA attributes for network ring buffer */ static ddi_dma_attr_t ringbuf_dma_attr = { DMA_ATTR_V0, /* version of this structure */ @@ -208,24 +276,8 @@ 0, /* flags (reserved) */ }; -/* DMA attributes for transmit data */ -static ddi_dma_attr_t tx_buffer_dma_attr = { - DMA_ATTR_V0, /* version of this structure */ - 0, /* lowest usable address */ - 0xffffffffffffffffULL, /* highest usable address */ - 0x7fffffff, /* maximum DMAable byte count */ - MMU_PAGESIZE, /* alignment in bytes */ - 0x7ff, /* bitmap of burst sizes */ - 1, /* minimum transfer */ - 0xffffffffU, /* maximum transfer */ - 0xffffffffffffffffULL, /* maximum segment length */ - 1, /* maximum number of segments */ - 1, /* granularity */ - 0, /* flags (reserved) */ -}; - -/* DMA attributes for a receive buffer */ -static ddi_dma_attr_t rx_buffer_dma_attr = { +/* DMA attributes for transmit and receive data */ +static ddi_dma_attr_t buf_dma_attr = { DMA_ATTR_V0, /* version of this structure */ 0, /* lowest usable address */ 0xffffffffffffffffULL, /* highest usable address */ @@ -254,9 +306,6 @@ DDI_STRICTORDER_ACC }; -unsigned char xnf_broadcastaddr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; -int xnf_diagnose = 0; /* Patchable global for diagnostic purposes */ - DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach, nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported); @@ -286,7 +335,7 @@ int _fini(void) { - return (EBUSY); /* XXPV dme: should be removable */ + return (EBUSY); /* XXPV should be removable */ } int @@ -295,19 +344,148 @@ return (mod_info(&modlinkage, modinfop)); } +/* + * Acquire a grant reference. + */ +static grant_ref_t +gref_get(xnf_t *xnfp) +{ + grant_ref_t gref; + + mutex_enter(&xnfp->xnf_gref_lock); + + do { + gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head); + + } while ((gref == INVALID_GRANT_REF) && + (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0)); + + mutex_exit(&xnfp->xnf_gref_lock); + + if (gref == INVALID_GRANT_REF) { + xnfp->xnf_stat_gref_failure++; + } else { + atomic_add_64(&xnfp->xnf_stat_gref_outstanding, 1); + if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak) + xnfp->xnf_stat_gref_peak = + xnfp->xnf_stat_gref_outstanding; + } + + return (gref); +} + +/* + * Release a grant reference. + */ +static void +gref_put(xnf_t *xnfp, grant_ref_t gref) +{ + ASSERT(gref != INVALID_GRANT_REF); + + mutex_enter(&xnfp->xnf_gref_lock); + gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref); + mutex_exit(&xnfp->xnf_gref_lock); + + atomic_add_64(&xnfp->xnf_stat_gref_outstanding, -1); +} + +/* + * Acquire a transmit id. + */ +static xnf_txid_t * +txid_get(xnf_t *xnfp) +{ + xnf_txid_t *tidp; + + ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); + + if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID) + return (NULL); + + ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head)); + + tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head); + xnfp->xnf_tx_pkt_id_head = tidp->next; + tidp->next = INVALID_TX_ID; + + ASSERT(tidp->txbuf == NULL); + + return (tidp); +} + +/* + * Release a transmit id. 
+ */ +static void +txid_put(xnf_t *xnfp, xnf_txid_t *tidp) +{ + ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); + ASSERT(TX_ID_VALID(tidp->id)); + ASSERT(tidp->next == INVALID_TX_ID); + + tidp->txbuf = NULL; + tidp->next = xnfp->xnf_tx_pkt_id_head; + xnfp->xnf_tx_pkt_id_head = tidp->id; +} + +/* + * Get `wanted' slots in the transmit ring, waiting for at least that + * number if `wait' is B_TRUE. Force the ring to be cleaned by setting + * `wanted' to zero. + * + * Return the number of slots available. + */ +static int +tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait) +{ + int slotsfree; + boolean_t forced_clean = (wanted == 0); + + ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); + + /* LINTED: constant in conditional context */ + while (B_TRUE) { + slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring); + + if ((slotsfree < wanted) || forced_clean) + slotsfree = xnf_tx_clean_ring(xnfp); + + /* + * If there are more than we need free, tell other + * people to come looking again. We hold txlock, so we + * are able to take our slots before anyone else runs. + */ + if (slotsfree > wanted) + cv_broadcast(&xnfp->xnf_cv_tx_slots); + + if (slotsfree >= wanted) + break; + + if (!wait) + break; + + cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock); + } + + ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring))); + + return (slotsfree); +} + static int xnf_setup_rings(xnf_t *xnfp) { - int ix, err; + domid_t oeid; + struct xenbus_device *xsd; RING_IDX i; - struct xnf_buffer_desc *bdesc, *rbp; - struct xenbus_device *xsd; - domid_t oeid; + int err; + xnf_txid_t *tidp; + xnf_buf_t **bdescp; oeid = xvdi_get_oeid(xnfp->xnf_devinfo); xsd = xvdi_get_xsd(xnfp->xnf_devinfo); - if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF) + if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); err = gnttab_grant_foreign_access(oeid, @@ -319,7 +497,7 @@ } xnfp->xnf_tx_ring_ref = (grant_ref_t)err; - if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF) + if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); err = gnttab_grant_foreign_access(oeid, @@ -331,139 +509,130 @@ } xnfp->xnf_rx_ring_ref = (grant_ref_t)err; - - mutex_enter(&xnfp->xnf_intrlock); + mutex_enter(&xnfp->xnf_txlock); /* - * Cleanup the TX ring. We just clean up any valid tx_pktinfo structs - * and reset the ring. Note that this can lose packets after a resume, - * but we expect to stagger on. + * Setup/cleanup the TX ring. Note that this can lose packets + * after a resume, but we expect to stagger on. */ - mutex_enter(&xnfp->xnf_txlock); - - for (i = 0; i < xnfp->xnf_n_tx; i++) { - struct tx_pktinfo *txp = &xnfp->xnf_tx_pkt_info[i]; - - txp->id = i + 1; - - if (txp->grant_ref == GRANT_INVALID_REF) { - ASSERT(txp->mp == NULL); - ASSERT(txp->bdesc == NULL); + xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */ + for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; + i < NET_TX_RING_SIZE; + i++, tidp++) { + xnf_txbuf_t *txp; + + tidp->id = i; + + txp = tidp->txbuf; + if (txp == NULL) { + tidp->next = INVALID_TX_ID; /* Appease txid_put(). 
*/ + txid_put(xnfp, tidp); continue; } - if (gnttab_query_foreign_access(txp->grant_ref) != 0) - panic("tx grant still in use by backend domain"); - - freemsg(txp->mp); - txp->mp = NULL; - - (void) ddi_dma_unbind_handle(txp->dma_handle); - - if (txp->bdesc != NULL) { - xnf_free_tx_buffer(txp->bdesc); - txp->bdesc = NULL; + ASSERT(txp->tx_txreq.gref != INVALID_GRANT_REF); + ASSERT(txp->tx_mp != NULL); + + switch (txp->tx_type) { + case TX_DATA: + VERIFY(gnttab_query_foreign_access(txp->tx_txreq.gref) + == 0); + + if (txp->tx_bdesc == NULL) { + (void) gnttab_end_foreign_access_ref( + txp->tx_txreq.gref, 1); + gref_put(xnfp, txp->tx_txreq.gref); + (void) ddi_dma_unbind_handle( + txp->tx_dma_handle); + } else { + xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE); + } + + freemsg(txp->tx_mp); + txid_put(xnfp, tidp); + kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); + + break; + + case TX_MCAST_REQ: + txp->tx_type = TX_MCAST_RSP; + txp->tx_status = NETIF_RSP_DROPPED; + cv_broadcast(&xnfp->xnf_cv_multicast); + + /* + * The request consumed two slots in the ring, + * yet only a single xnf_txid_t is used. Step + * over the empty slot. + */ + i++; + ASSERT(i < NET_TX_RING_SIZE); + + break; + + case TX_MCAST_RSP: + break; } - - (void) gnttab_end_foreign_access_ref(txp->grant_ref, - xnfp->xnf_tx_pages_readonly); - gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head, - txp->grant_ref); - txp->grant_ref = GRANT_INVALID_REF; } - xnfp->xnf_tx_pkt_id_list = 0; - xnfp->xnf_tx_ring.rsp_cons = 0; - xnfp->xnf_tx_ring.req_prod_pvt = 0; - /* LINTED: constant in conditional context */ SHARED_RING_INIT(xnfp->xnf_tx_ring.sring); + /* LINTED: constant in conditional context */ + FRONT_RING_INIT(&xnfp->xnf_tx_ring, + xnfp->xnf_tx_ring.sring, PAGESIZE); mutex_exit(&xnfp->xnf_txlock); + mutex_enter(&xnfp->xnf_rxlock); + /* - * Rebuild the RX ring. We have to rebuild the RX ring because some of - * our pages are currently flipped out/granted so we can't just free - * the RX buffers. Reclaim any unprocessed recv buffers, they won't be - * useable anyway since the mfn's they refer to are no longer valid. - * Grant the backend domain access to each hung rx buffer. + * Clean out any buffers currently posted to the receive ring + * before we reset it. */ - i = xnfp->xnf_rx_ring.rsp_cons; - while (i++ != xnfp->xnf_rx_ring.sring->req_prod) { - volatile netif_rx_request_t *rxrp; - - rxrp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, i); - ix = rxrp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0); - rbp = xnfp->xnf_rxpkt_bufptr[ix]; - if (rbp != NULL) { - grant_ref_t ref = rbp->grant_ref; - - ASSERT(ref != GRANT_INVALID_REF); - if (xnfp->xnf_rx_hvcopy) { - pfn_t pfn = xnf_btop(rbp->buf_phys); - mfn_t mfn = pfn_to_mfn(pfn); - - gnttab_grant_foreign_access_ref(ref, oeid, - mfn, 0); - } else { - gnttab_grant_foreign_transfer_ref(ref, - oeid, 0); - } - rxrp->id = ix; - rxrp->gref = ref; + for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0]; + i < NET_RX_RING_SIZE; + i++, bdescp++) { + if (*bdescp != NULL) { + xnf_buf_put(xnfp, *bdescp, B_FALSE); + *bdescp = NULL; } } - /* - * Reset the ring pointers to initial state. - * Hang buffers for any empty ring slots. - */ - xnfp->xnf_rx_ring.rsp_cons = 0; - xnfp->xnf_rx_ring.req_prod_pvt = 0; - /* LINTED: constant in conditional context */ SHARED_RING_INIT(xnfp->xnf_rx_ring.sring); - + /* LINTED: constant in conditional context */ + FRONT_RING_INIT(&xnfp->xnf_rx_ring, + xnfp->xnf_rx_ring.sring, PAGESIZE); + + /* + * Fill the ring with buffers. 
+ */ for (i = 0; i < NET_RX_RING_SIZE; i++) { - xnfp->xnf_rx_ring.req_prod_pvt = i; - if (xnfp->xnf_rxpkt_bufptr[i] != NULL) - continue; - if ((bdesc = xnf_get_buffer(xnfp)) == NULL) - break; - rx_buffer_hang(xnfp, bdesc); + xnf_buf_t *bdesc; + + bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE); + VERIFY(bdesc != NULL); + xnf_rxbuf_hang(xnfp, bdesc); } - xnfp->xnf_rx_ring.req_prod_pvt = i; + /* LINTED: constant in conditional context */ RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring); - mutex_exit(&xnfp->xnf_intrlock); + mutex_exit(&xnfp->xnf_rxlock); return (0); out: - if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF) + if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); - xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF; - - if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF) + xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; + + if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); - xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF; + xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; return (err); } - -/* Called when the upper layers free a message we passed upstream */ -static void -xnf_copy_rcv_complete(struct xnf_buffer_desc *bdesc) -{ - (void) ddi_dma_unbind_handle(bdesc->dma_handle); - ddi_dma_mem_free(&bdesc->acc_handle); - ddi_dma_free_handle(&bdesc->dma_handle); - kmem_free(bdesc, sizeof (*bdesc)); -} - - /* * Connect driver to back end, called to set up communication with * back end driver both initially and on resume after restore/migrate. @@ -523,31 +692,24 @@ goto abort_transaction; } - if (!xnfp->xnf_tx_pages_readonly) { - err = xenbus_printf(xbt, xsname, "feature-tx-writable", - "%d", 1); - if (err != 0) { - message = "writing feature-tx-writable"; - goto abort_transaction; - } - } - - err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d", - xnfp->xnf_cksum_offload ? 0 : 1); - if (err != 0) { - message = "writing feature-no-csum-offload"; - goto abort_transaction; - } - err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", - xnfp->xnf_rx_hvcopy ? 1 : 0); + err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1); if (err != 0) { message = "writing request-rx-copy"; goto abort_transaction; } - err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected); + if (xnfp->xnf_be_mcast_control) { + err = xenbus_printf(xbt, xsname, "request-multicast-control", + "%d", 1); + if (err != 0) { + message = "writing request-multicast-control"; + goto abort_transaction; + } + } + + err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected); if (err != 0) { - message = "writing frontend XenbusStateConnected"; + message = "switching state to XenbusStateConnected"; goto abort_transaction; } @@ -566,15 +728,16 @@ } /* - * Read config info from xenstore + * Read configuration information from xenstore. */ void xnf_read_config(xnf_t *xnfp) { - char mac[ETHERADDRL * 3]; - int err, be_no_cksum_offload; - - err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), "mac", + int err, be_cap; + char mac[ETHERADDRL * 3]; + char *oename = xvdi_get_oename(xnfp->xnf_devinfo); + + err = xenbus_scanf(XBT_NULL, oename, "mac", "%s", (char *)&mac[0]); if (err != 0) { /* @@ -593,27 +756,31 @@ return; } - err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), - "feature-no-csum-offload", "%d", &be_no_cksum_offload); + err = xenbus_scanf(XBT_NULL, oename, + "feature-rx-copy", "%d", &be_cap); /* * If we fail to read the store we assume that the key is * absent, implying an older domain at the far end. 
Older - * domains always support checksum offload. + * domains cannot do HV copy. */ if (err != 0) - be_no_cksum_offload = 0; + be_cap = 0; + xnfp->xnf_be_rx_copy = (be_cap != 0); + + err = xenbus_scanf(XBT_NULL, oename, + "feature-multicast-control", "%d", &be_cap); /* - * If the far end cannot do checksum offload or we do not wish - * to do it, disable it. + * If we fail to read the store we assume that the key is + * absent, implying an older domain at the far end. Older + * domains do not support multicast control. */ - if ((be_no_cksum_offload == 1) || !xnfp->xnf_cksum_offload) - xnfp->xnf_cksum_offload = B_FALSE; + if (err != 0) + be_cap = 0; + xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control; } /* * attach(9E) -- Attach a device to the system - * - * Called once for each board successfully probed. */ static int xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) @@ -621,9 +788,10 @@ mac_register_t *macp; xnf_t *xnfp; int err; + char cachename[32]; #ifdef XNF_DEBUG - if (xnfdebug & XNF_DEBUG_DDI) + if (xnf_debug & XNF_DEBUG_DDI) printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo), (void *)devinfo); #endif @@ -631,6 +799,7 @@ switch (cmd) { case DDI_RESUME: xnfp = ddi_get_driver_private(devinfo); + xnfp->xnf_gen++; (void) xvdi_resume(devinfo); (void) xvdi_alloc_evtchn(devinfo); @@ -642,16 +811,6 @@ (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp); #endif - xnf_be_connect(xnfp); - /* - * Our MAC address may have changed if we're resuming: - * - on a different host - * - on the same one and got a different MAC address - * because we didn't specify one of our own. - * so it's useful to claim that it changed in order that - * IP send out a gratuitous ARP. - */ - mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr); return (DDI_SUCCESS); case DDI_ATTACH: @@ -681,11 +840,14 @@ xnfp->xnf_running = B_FALSE; xnfp->xnf_connected = B_FALSE; - xnfp->xnf_cksum_offload = xnf_cksum_offload; - xnfp->xnf_tx_pages_readonly = xnf_tx_pages_readonly; + xnfp->xnf_be_rx_copy = B_FALSE; + xnfp->xnf_be_mcast_control = B_FALSE; xnfp->xnf_need_sched = B_FALSE; - xnfp->xnf_rx_hvcopy = xnf_hvcopy_peer_status(devinfo) && xnf_rx_hvcopy; + xnfp->xnf_rx_head = NULL; + xnfp->xnf_rx_tail = NULL; + xnfp->xnf_rx_new_buffers_posted = B_FALSE; + #ifdef XPV_HVM_DRIVER /* * Report our version to dom0. @@ -693,12 +855,6 @@ if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d", HVMPV_XNF_VERS)) cmn_err(CE_WARN, "xnf: couldn't write version\n"); - - if (!xnfp->xnf_rx_hvcopy) { - cmn_err(CE_WARN, "The xnf driver requires a dom0 that " - "supports 'feature-rx-copy'"); - goto failure; - } #endif /* @@ -707,59 +863,58 @@ if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie) != DDI_SUCCESS) goto failure; - /* - * Driver locking strategy: the txlock protects all paths - * through the driver, except the interrupt thread. - * If the interrupt thread needs to do something which could - * affect the operation of any other part of the driver, - * it needs to acquire the txlock mutex. 
- */ - mutex_init(&xnfp->xnf_tx_buf_mutex, - NULL, MUTEX_DRIVER, xnfp->xnf_icookie); - mutex_init(&xnfp->xnf_rx_buf_mutex, - NULL, MUTEX_DRIVER, xnfp->xnf_icookie); + mutex_init(&xnfp->xnf_txlock, NULL, MUTEX_DRIVER, xnfp->xnf_icookie); - mutex_init(&xnfp->xnf_intrlock, + mutex_init(&xnfp->xnf_rxlock, + NULL, MUTEX_DRIVER, xnfp->xnf_icookie); + mutex_init(&xnfp->xnf_schedlock, + NULL, MUTEX_DRIVER, xnfp->xnf_icookie); + mutex_init(&xnfp->xnf_gref_lock, NULL, MUTEX_DRIVER, xnfp->xnf_icookie); - cv_init(&xnfp->xnf_cv, NULL, CV_DEFAULT, NULL); - - xnfp->xnf_gref_tx_head = (grant_ref_t)-1; - xnfp->xnf_gref_rx_head = (grant_ref_t)-1; - if (gnttab_alloc_grant_references(NET_TX_RING_SIZE, - &xnfp->xnf_gref_tx_head) < 0) { - cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs", - ddi_get_instance(xnfp->xnf_devinfo)); + + cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL); + cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL); + cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL); + + (void) sprintf(cachename, "xnf_buf_cache_%d", + ddi_get_instance(devinfo)); + xnfp->xnf_buf_cache = kmem_cache_create(cachename, + sizeof (xnf_buf_t), 0, + xnf_buf_constructor, xnf_buf_destructor, + NULL, xnfp, NULL, 0); + if (xnfp->xnf_buf_cache == NULL) + goto failure_0; + + (void) sprintf(cachename, "xnf_tx_buf_cache_%d", + ddi_get_instance(devinfo)); + xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename, + sizeof (xnf_txbuf_t), 0, + xnf_tx_buf_constructor, xnf_tx_buf_destructor, + NULL, xnfp, NULL, 0); + if (xnfp->xnf_tx_buf_cache == NULL) goto failure_1; - } - if (gnttab_alloc_grant_references(NET_RX_RING_SIZE, - &xnfp->xnf_gref_rx_head) < 0) { - cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs", - ddi_get_instance(xnfp->xnf_devinfo)); - goto failure_1; - } + + xnfp->xnf_gref_head = INVALID_GRANT_REF; + if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) { cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize " "driver data structures", ddi_get_instance(xnfp->xnf_devinfo)); - goto failure_1; + goto failure_2; } xnfp->xnf_rx_ring.sring->rsp_event = xnfp->xnf_tx_ring.sring->rsp_event = 1; - xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF; - xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF; + xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; + xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; /* set driver private pointer now */ ddi_set_driver_private(devinfo, xnfp); - if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL) - != DDI_SUCCESS) - goto failure_1; - if (!xnf_kstat_init(xnfp)) - goto failure_2; + goto failure_3; /* * Allocate an event channel, add the interrupt handler and @@ -773,12 +928,15 @@ (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp); #endif - xnf_read_config(xnfp); err = mac_register(macp, &xnfp->xnf_mh); mac_free(macp); macp = NULL; if (err != 0) - goto failure_3; + goto failure_4; + + if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL) + != DDI_SUCCESS) + goto failure_5; #ifdef XPV_HVM_DRIVER /* @@ -792,15 +950,17 @@ "Ethernet controller"); #endif - /* - * connect to the backend - */ - xnf_be_connect(xnfp); +#ifdef XNF_DEBUG + if (xnf_debug_instance == NULL) + xnf_debug_instance = xnfp; +#endif return (DDI_SUCCESS); -failure_3: - kstat_delete(xnfp->xnf_kstat_aux); +failure_5: + mac_unregister(xnfp->xnf_mh); + +failure_4: #ifdef XPV_HVM_DRIVER ec_unbind_evtchn(xnfp->xnf_evtchn); xvdi_free_evtchn(devinfo); @@ -808,20 +968,26 @@ ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); #endif xnfp->xnf_evtchn = INVALID_EVTCHN; + 
kstat_delete(xnfp->xnf_kstat_aux); + +failure_3: + xnf_release_dma_resources(xnfp); failure_2: - xvdi_remove_event_handler(devinfo, XS_OE_STATE); + kmem_cache_destroy(xnfp->xnf_tx_buf_cache); failure_1: - if (xnfp->xnf_gref_tx_head != (grant_ref_t)-1) - gnttab_free_grant_references(xnfp->xnf_gref_tx_head); - if (xnfp->xnf_gref_rx_head != (grant_ref_t)-1) - gnttab_free_grant_references(xnfp->xnf_gref_rx_head); - xnf_release_dma_resources(xnfp); - cv_destroy(&xnfp->xnf_cv); - mutex_destroy(&xnfp->xnf_rx_buf_mutex); + kmem_cache_destroy(xnfp->xnf_buf_cache); + +failure_0: + cv_destroy(&xnfp->xnf_cv_tx_slots); + cv_destroy(&xnfp->xnf_cv_multicast); + cv_destroy(&xnfp->xnf_cv_state); + + mutex_destroy(&xnfp->xnf_gref_lock); + mutex_destroy(&xnfp->xnf_schedlock); + mutex_destroy(&xnfp->xnf_rxlock); mutex_destroy(&xnfp->xnf_txlock); - mutex_destroy(&xnfp->xnf_intrlock); failure: kmem_free(xnfp, sizeof (*xnfp)); @@ -836,10 +1002,9 @@ xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) { xnf_t *xnfp; /* Our private device info */ - int i; #ifdef XNF_DEBUG - if (xnfdebug & XNF_DEBUG_DDI) + if (xnf_debug & XNF_DEBUG_DDI) printf("xnf_detach(0x%p)\n", (void *)devinfo); #endif @@ -856,13 +1021,13 @@ xvdi_suspend(devinfo); - mutex_enter(&xnfp->xnf_intrlock); + mutex_enter(&xnfp->xnf_rxlock); mutex_enter(&xnfp->xnf_txlock); xnfp->xnf_evtchn = INVALID_EVTCHN; xnfp->xnf_connected = B_FALSE; mutex_exit(&xnfp->xnf_txlock); - mutex_exit(&xnfp->xnf_intrlock); + mutex_exit(&xnfp->xnf_rxlock); /* claim link to be down after disconnect */ mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN); @@ -878,25 +1043,11 @@ if (xnfp->xnf_connected) return (DDI_FAILURE); - /* Wait for receive buffers to be returned; give up after 5 seconds */ - i = 50; - - mutex_enter(&xnfp->xnf_rx_buf_mutex); - while (xnfp->xnf_rx_bufs_outstanding > 0) { - mutex_exit(&xnfp->xnf_rx_buf_mutex); - delay(drv_usectohz(100000)); - if (--i == 0) { - cmn_err(CE_WARN, - "xnf%d: never reclaimed all the " - "receive buffers. Still have %d " - "buffers outstanding.", - ddi_get_instance(xnfp->xnf_devinfo), - xnfp->xnf_rx_bufs_outstanding); - return (DDI_FAILURE); - } - mutex_enter(&xnfp->xnf_rx_buf_mutex); - } - mutex_exit(&xnfp->xnf_rx_buf_mutex); + /* + * Cannot detach if we have xnf_buf_t outstanding. + */ + if (xnfp->xnf_stat_buf_allocated > 0) + return (DDI_FAILURE); if (mac_unregister(xnfp->xnf_mh) != 0) return (DDI_FAILURE); @@ -922,10 +1073,17 @@ /* Release all DMA resources */ xnf_release_dma_resources(xnfp); - cv_destroy(&xnfp->xnf_cv); - mutex_destroy(&xnfp->xnf_rx_buf_mutex); + cv_destroy(&xnfp->xnf_cv_tx_slots); + cv_destroy(&xnfp->xnf_cv_multicast); + cv_destroy(&xnfp->xnf_cv_state); + + kmem_cache_destroy(xnfp->xnf_tx_buf_cache); + kmem_cache_destroy(xnfp->xnf_buf_cache); + + mutex_destroy(&xnfp->xnf_gref_lock); + mutex_destroy(&xnfp->xnf_schedlock); + mutex_destroy(&xnfp->xnf_rxlock); mutex_destroy(&xnfp->xnf_txlock); - mutex_destroy(&xnfp->xnf_intrlock); kmem_free(xnfp, sizeof (*xnfp)); @@ -935,24 +1093,13 @@ /* * xnf_set_mac_addr() -- set the physical network address on the board. */ -/*ARGSUSED*/ static int xnf_set_mac_addr(void *arg, const uint8_t *macaddr) { - xnf_t *xnfp = arg; - -#ifdef XNF_DEBUG - if (xnfdebug & XNF_DEBUG_TRACE) - printf("xnf%d: set_mac_addr(0x%p): " - "%02x:%02x:%02x:%02x:%02x:%02x\n", - ddi_get_instance(xnfp->xnf_devinfo), - (void *)xnfp, macaddr[0], macaddr[1], macaddr[2], - macaddr[3], macaddr[4], macaddr[5]); -#endif + _NOTE(ARGUNUSED(arg, macaddr)); + /* * We can't set our macaddr. - * - * XXPV dme: Why not? 
*/ return (ENOTSUP); } @@ -961,33 +1108,113 @@ * xnf_set_multicast() -- set (enable) or disable a multicast address. * * Program the hardware to enable/disable the multicast address - * in "mcast". Enable if "add" is true, disable if false. + * in "mca". Enable if "add" is true, disable if false. */ -/*ARGSUSED*/ static int xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca) { xnf_t *xnfp = arg; - -#ifdef XNF_DEBUG - if (xnfdebug & XNF_DEBUG_TRACE) - printf("xnf%d set_multicast(0x%p): " - "%02x:%02x:%02x:%02x:%02x:%02x\n", - ddi_get_instance(xnfp->xnf_devinfo), - (void *)xnfp, mca[0], mca[1], mca[2], - mca[3], mca[4], mca[5]); -#endif + xnf_txbuf_t *txp; + int n_slots; + RING_IDX slot; + xnf_txid_t *tidp; + netif_tx_request_t *txrp; + struct netif_extra_info *erp; + boolean_t notify, result; + + /* + * If the backend does not support multicast control then we + * must assume that the right packets will just arrive. + */ + if (!xnfp->xnf_be_mcast_control) + return (0); + + txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP); + if (txp == NULL) + return (1); + + mutex_enter(&xnfp->xnf_txlock); + + /* + * If we're not yet connected then claim success. This is + * acceptable because we refresh the entire set of multicast + * addresses when we get connected. + * + * We can't wait around here because the MAC layer expects + * this to be a non-blocking operation - waiting ends up + * causing a deadlock during resume. + */ + if (!xnfp->xnf_connected) { + mutex_exit(&xnfp->xnf_txlock); + return (0); + } /* - * XXPV dme: Ideally we'd relay the address to the backend for - * enabling. The protocol doesn't support that (interesting - * extension), so we simply succeed and hope that the relevant - * packets are going to arrive. - * - * If protocol support is added for enable/disable then we'll - * need to keep a list of those in use and re-add on resume. + * 1. Acquire two slots in the ring. + * 2. Fill in the slots. + * 3. Request notification when the operation is done. + * 4. Kick the peer. + * 5. Wait for the response via xnf_tx_clean_ring(). */ - return (0); + + n_slots = tx_slots_get(xnfp, 2, B_TRUE); + ASSERT(n_slots >= 2); + + slot = xnfp->xnf_tx_ring.req_prod_pvt; + tidp = txid_get(xnfp); + VERIFY(tidp != NULL); + + txp->tx_type = TX_MCAST_REQ; + txp->tx_slot = slot; + + txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); + erp = (struct netif_extra_info *) + RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1); + + txrp->gref = 0; + txrp->size = 0; + txrp->offset = 0; + /* Set tx_txreq.id to appease xnf_tx_clean_ring(). */ + txrp->id = txp->tx_txreq.id = tidp->id; + txrp->flags = NETTXF_extra_info; + + erp->type = add ? 
XEN_NETIF_EXTRA_TYPE_MCAST_ADD : + XEN_NETIF_EXTRA_TYPE_MCAST_DEL; + bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL); + + tidp->txbuf = txp; + + xnfp->xnf_tx_ring.req_prod_pvt = slot + 2; + + mutex_enter(&xnfp->xnf_schedlock); + xnfp->xnf_pending_multicast++; + mutex_exit(&xnfp->xnf_schedlock); + + /* LINTED: constant in conditional context */ + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, + notify); + if (notify) + ec_notify_via_evtchn(xnfp->xnf_evtchn); + + while (txp->tx_type == TX_MCAST_REQ) + cv_wait(&xnfp->xnf_cv_multicast, + &xnfp->xnf_txlock); + + ASSERT(txp->tx_type == TX_MCAST_RSP); + + mutex_enter(&xnfp->xnf_schedlock); + xnfp->xnf_pending_multicast--; + mutex_exit(&xnfp->xnf_schedlock); + + result = (txp->tx_status == NETIF_RSP_OKAY); + + txid_put(xnfp, tidp); + + mutex_exit(&xnfp->xnf_txlock); + + kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); + + return (result ? 0 : 1); } /* @@ -995,18 +1222,11 @@ * * Program the hardware to enable/disable promiscuous mode. */ -/*ARGSUSED*/ static int xnf_set_promiscuous(void *arg, boolean_t on) { - xnf_t *xnfp = arg; - -#ifdef XNF_DEBUG - if (xnfdebug & XNF_DEBUG_TRACE) - printf("xnf%d set_promiscuous(0x%p, %x)\n", - ddi_get_instance(xnfp->xnf_devinfo), - (void *)xnfp, on); -#endif + _NOTE(ARGUNUSED(arg, on)); + /* * We can't really do this, but we pretend that we can in * order that snoop will work. @@ -1018,51 +1238,88 @@ * Clean buffers that we have responses for from the transmit ring. */ static int -xnf_clean_tx_ring(xnf_t *xnfp) +xnf_tx_clean_ring(xnf_t *xnfp) { - RING_IDX next_resp, i; - struct tx_pktinfo *reap; - int id; - grant_ref_t ref; - boolean_t work_to_do; + boolean_t work_to_do; ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); loop: while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) { - /* - * index of next transmission ack - */ - next_resp = xnfp->xnf_tx_ring.sring->rsp_prod; + RING_IDX cons, prod, i; + + cons = xnfp->xnf_tx_ring.rsp_cons; + prod = xnfp->xnf_tx_ring.sring->rsp_prod; membar_consumer(); /* - * Clean tx packets from ring that we have responses for + * Clean tx requests from ring that we have responses + * for. 
*/ - for (i = xnfp->xnf_tx_ring.rsp_cons; i != next_resp; i++) { - id = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i)->id; - reap = &xnfp->xnf_tx_pkt_info[id]; - ref = reap->grant_ref; - /* - * Return id to free list - */ - reap->id = xnfp->xnf_tx_pkt_id_list; - xnfp->xnf_tx_pkt_id_list = id; - if (gnttab_query_foreign_access(ref) != 0) - panic("tx grant still in use " - "by backend domain"); - (void) ddi_dma_unbind_handle(reap->dma_handle); - (void) gnttab_end_foreign_access_ref(ref, - xnfp->xnf_tx_pages_readonly); - gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head, - ref); - freemsg(reap->mp); - reap->mp = NULL; - reap->grant_ref = GRANT_INVALID_REF; - if (reap->bdesc != NULL) - xnf_free_tx_buffer(reap->bdesc); - reap->bdesc = NULL; + DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod); + for (i = cons; i != prod; i++) { + netif_tx_response_t *trp; + xnf_txid_t *tidp; + xnf_txbuf_t *txp; + + trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i); + ASSERT(TX_ID_VALID(trp->id)); + + tidp = TX_ID_TO_TXID(xnfp, trp->id); + ASSERT(tidp->id == trp->id); + ASSERT(tidp->next == INVALID_TX_ID); + + txp = tidp->txbuf; + ASSERT(txp != NULL); + ASSERT(txp->tx_txreq.id == trp->id); + + switch (txp->tx_type) { + case TX_DATA: + if (gnttab_query_foreign_access( + txp->tx_txreq.gref) != 0) + cmn_err(CE_PANIC, + "tx grant %d still in use by " + "backend domain", + txp->tx_txreq.gref); + + if (txp->tx_bdesc == NULL) { + (void) gnttab_end_foreign_access_ref( + txp->tx_txreq.gref, 1); + gref_put(xnfp, txp->tx_txreq.gref); + (void) ddi_dma_unbind_handle( + txp->tx_dma_handle); + } else { + xnf_buf_put(xnfp, txp->tx_bdesc, + B_TRUE); + } + + freemsg(txp->tx_mp); + txid_put(xnfp, tidp); + kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); + + break; + + case TX_MCAST_REQ: + txp->tx_type = TX_MCAST_RSP; + txp->tx_status = trp->status; + cv_broadcast(&xnfp->xnf_cv_multicast); + + break; + + case TX_MCAST_RSP: + break; + + default: + cmn_err(CE_PANIC, "xnf_tx_clean_ring: " + "invalid xnf_txbuf_t type: %d", + txp->tx_type); + break; + } } - xnfp->xnf_tx_ring.rsp_cons = next_resp; + /* + * Record the last response we dealt with so that we + * know where to start next time around. + */ + xnfp->xnf_tx_ring.rsp_cons = prod; membar_enter(); } @@ -1075,40 +1332,40 @@ } /* - * If we need to pull up data from either a packet that crosses a page - * boundary or consisting of multiple mblks, do it here. We allocate - * a page aligned buffer and copy the data into it. The header for the - * allocated buffer is returned. (which is also allocated here) + * Allocate and fill in a look-aside buffer for the packet `mp'. Used + * to ensure that the packet is physically contiguous and contained + * within a single page. 
*/ -static struct xnf_buffer_desc * -xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp) +static xnf_buf_t * +xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp) { - struct xnf_buffer_desc *bdesc; - mblk_t *mptr; - caddr_t bp; - int len; - - /* - * get a xmit buffer from the xmit buffer pool - */ - mutex_enter(&xnfp->xnf_rx_buf_mutex); - bdesc = xnf_get_tx_buffer(xnfp); - mutex_exit(&xnfp->xnf_rx_buf_mutex); - if (bdesc == NULL) - return (bdesc); - /* - * Copy the data into the buffer - */ + xnf_buf_t *bd; + caddr_t bp; + + bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE); + if (bd == NULL) + return (NULL); + + bp = bd->buf; + while (mp != NULL) { + size_t len = MBLKL(mp); + + bcopy(mp->b_rptr, bp, len); + bp += len; + + mp = mp->b_cont; + } + + ASSERT((bp - bd->buf) <= PAGESIZE); + xnfp->xnf_stat_tx_pullup++; - bp = bdesc->buf; - for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) { - len = mptr->b_wptr - mptr->b_rptr; - bcopy(mptr->b_rptr, bp, len); - bp += len; - } - return (bdesc); + + return (bd); } +/* + * Insert the pseudo-header checksum into the packet `buf'. + */ void xnf_pseudo_cksum(caddr_t buf, int length) { @@ -1179,280 +1436,419 @@ } /* - * xnf_send_one() -- send a packet - * - * Called when a packet is ready to be transmitted. A pointer to an - * M_DATA message that contains the packet is passed to this routine. - * At least the complete LLC header is contained in the message's - * first message block, and the remainder of the packet is contained - * within additional M_DATA message blocks linked to the first - * message block. - * + * Push a list of prepared packets (`txp') into the transmit ring. */ -static boolean_t -xnf_send_one(xnf_t *xnfp, mblk_t *mp) +static xnf_txbuf_t * +tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp) +{ + int slots_free; + RING_IDX slot; + boolean_t notify; + + mutex_enter(&xnfp->xnf_txlock); + + ASSERT(xnfp->xnf_running); + + /* + * Wait until we are connected to the backend. + */ + while (!xnfp->xnf_connected) + cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock); + + slots_free = tx_slots_get(xnfp, 1, B_FALSE); + DTRACE_PROBE1(xnf_send_slotsfree, int, slots_free); + + slot = xnfp->xnf_tx_ring.req_prod_pvt; + + while ((txp != NULL) && (slots_free > 0)) { + xnf_txid_t *tidp; + netif_tx_request_t *txrp; + + tidp = txid_get(xnfp); + VERIFY(tidp != NULL); + + txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); + + txp->tx_slot = slot; + txp->tx_txreq.id = tidp->id; + *txrp = txp->tx_txreq; + + tidp->txbuf = txp; + + xnfp->xnf_stat_opackets++; + xnfp->xnf_stat_obytes += txp->tx_txreq.size; + + txp = txp->tx_next; + slots_free--; + slot++; + + } + + xnfp->xnf_tx_ring.req_prod_pvt = slot; + + /* + * Tell the peer that we sent something, if it cares. + */ + /* LINTED: constant in conditional context */ + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, + notify); + if (notify) + ec_notify_via_evtchn(xnfp->xnf_evtchn); + + mutex_exit(&xnfp->xnf_txlock); + + return (txp); +} + +/* + * Send the chain of packets `mp'. Called by the MAC framework. 
+ */ +static mblk_t * +xnf_send(void *arg, mblk_t *mp) { - struct xnf_buffer_desc *xmitbuf; - struct tx_pktinfo *txp_info; - mblk_t *mptr; - ddi_dma_cookie_t dma_cookie; - RING_IDX slot; - int length = 0, i, pktlen = 0, rc, tx_id; - int tx_ring_freespace, page_oops; - uint_t ncookies; - volatile netif_tx_request_t *txrp; - caddr_t bufaddr; - grant_ref_t ref; - unsigned long mfn; - uint32_t pflags; - domid_t oeid; + xnf_t *xnfp = arg; + domid_t oeid; + xnf_txbuf_t *head, *tail; + mblk_t *ml; + int prepared; + + oeid = xvdi_get_oeid(xnfp->xnf_devinfo); + + /* + * Prepare packets for transmission. + */ + head = tail = NULL; + prepared = 0; + while (mp != NULL) { + xnf_txbuf_t *txp; + int n_chunks, length; + boolean_t page_oops; + uint32_t pflags; + + for (ml = mp, n_chunks = length = 0, page_oops = B_FALSE; + ml != NULL; + ml = ml->b_cont, n_chunks++) { + + /* + * Test if this buffer includes a page + * boundary. The test assumes that the range + * b_rptr...b_wptr can include only a single + * boundary. + */ + if (xnf_btop((size_t)ml->b_rptr) != + xnf_btop((size_t)ml->b_wptr)) { + xnfp->xnf_stat_tx_pagebndry++; + page_oops = B_TRUE; + } + + length += MBLKL(ml); + } + DTRACE_PROBE1(xnf_send_b_cont, int, n_chunks); + + /* + * Make sure packet isn't too large. + */ + if (length > XNF_FRAMESIZE) { + cmn_err(CE_WARN, + "xnf%d: oversized packet (%d bytes) dropped", + ddi_get_instance(xnfp->xnf_devinfo), length); + freemsg(mp); + continue; + } + + txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP); + if (txp == NULL) + break; + + txp->tx_type = TX_DATA; + + if ((n_chunks > xnf_max_tx_frags) || page_oops) { + /* + * Loan a side buffer rather than the mblk + * itself. + */ + txp->tx_bdesc = xnf_tx_pullup(xnfp, mp); + if (txp->tx_bdesc == NULL) { + kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); + break; + } + + txp->tx_bufp = txp->tx_bdesc->buf; + txp->tx_mfn = txp->tx_bdesc->buf_mfn; + txp->tx_txreq.gref = txp->tx_bdesc->grant_ref; + + } else { + int rc; + ddi_dma_cookie_t dma_cookie; + uint_t ncookies; + + rc = ddi_dma_addr_bind_handle(txp->tx_dma_handle, + NULL, (char *)mp->b_rptr, length, + DDI_DMA_WRITE | DDI_DMA_STREAMING, + DDI_DMA_DONTWAIT, 0, &dma_cookie, + &ncookies); + if (rc != DDI_DMA_MAPPED) { + ASSERT(rc != DDI_DMA_INUSE); + ASSERT(rc != DDI_DMA_PARTIAL_MAP); #ifdef XNF_DEBUG - if (xnfdebug & XNF_DEBUG_SEND) - printf("xnf%d send(0x%p, 0x%p)\n", - ddi_get_instance(xnfp->xnf_devinfo), - (void *)xnfp, (void *)mp); + if (rc != DDI_DMA_NORESOURCES) + cmn_err(CE_WARN, + "xnf%d: bind_handle failed (%x)", + ddi_get_instance(xnfp->xnf_devinfo), + rc); #endif - - ASSERT(mp != NULL); - ASSERT(mp->b_next == NULL); - ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); - - tx_ring_freespace = xnf_clean_tx_ring(xnfp); - ASSERT(tx_ring_freespace >= 0); - - oeid = xvdi_get_oeid(xnfp->xnf_devinfo); - xnfp->xnf_stat_tx_attempt++; - /* - * If there are no xmit ring slots available, return. 
- */ - if (tx_ring_freespace == 0) { - xnfp->xnf_stat_tx_defer++; - return (B_FALSE); /* Send should be retried */ + kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); + break; + } + ASSERT(ncookies == 1); + + txp->tx_bdesc = NULL; + txp->tx_bufp = (caddr_t)mp->b_rptr; + txp->tx_mfn = + xnf_btop(pa_to_ma(dma_cookie.dmac_laddress)); + txp->tx_txreq.gref = gref_get(xnfp); + if (txp->tx_txreq.gref == INVALID_GRANT_REF) { + (void) ddi_dma_unbind_handle( + txp->tx_dma_handle); + kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); + break; + } + gnttab_grant_foreign_access_ref(txp->tx_txreq.gref, + oeid, txp->tx_mfn, 1); + } + + txp->tx_next = NULL; + txp->tx_mp = mp; + txp->tx_txreq.size = length; + txp->tx_txreq.offset = (uintptr_t)txp->tx_bufp & PAGEOFFSET; + txp->tx_txreq.flags = 0; + hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, + &pflags); + if (pflags != 0) { + /* + * If the local protocol stack requests checksum + * offload we set the 'checksum blank' flag, + * indicating to the peer that we need the checksum + * calculated for us. + * + * We _don't_ set the validated flag, because we haven't + * validated that the data and the checksum match. + */ + xnf_pseudo_cksum(txp->tx_bufp, length); + txp->tx_txreq.flags |= NETTXF_csum_blank; + + xnfp->xnf_stat_tx_cksum_deferred++; + } + + if (head == NULL) { + ASSERT(tail == NULL); + + head = txp; + } else { + ASSERT(tail != NULL); + + tail->tx_next = txp; + } + tail = txp; + + mp = mp->b_next; + prepared++; + + /* + * There is no point in preparing more than + * NET_TX_RING_SIZE, as we won't be able to push them + * into the ring in one go and would hence have to + * un-prepare the extra. + */ + if (prepared == NET_TX_RING_SIZE) + break; } - slot = xnfp->xnf_tx_ring.req_prod_pvt; - /* Count the number of mblks in message and compute packet size */ - for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++) - pktlen += (mptr->b_wptr - mptr->b_rptr); - - /* Make sure packet isn't too large */ - if (pktlen > XNF_FRAMESIZE) { - cmn_err(CE_WARN, "xnf%d: oversized packet (%d bytes) dropped", - ddi_get_instance(xnfp->xnf_devinfo), pktlen); - freemsg(mp); - return (B_TRUE); + DTRACE_PROBE1(xnf_send_prepared, int, prepared); + + if (mp != NULL) { +#ifdef XNF_DEBUG + int notprepared = 0; + mblk_t *l = mp; + + while (l != NULL) { + notprepared++; + l = l->b_next; + } + + DTRACE_PROBE1(xnf_send_notprepared, int, notprepared); +#else /* !XNF_DEBUG */ + DTRACE_PROBE1(xnf_send_notprepared, int, -1); +#endif /* XNF_DEBUG */ } /* - * Test if we cross a page boundary with our buffer - */ - page_oops = (i == 1) && - (xnf_btop((size_t)mp->b_rptr) != - xnf_btop((size_t)(mp->b_rptr + pktlen))); - /* - * XXPV - unfortunately, the Xen virtual net device currently - * doesn't support multiple packet frags, so this will always - * end up doing the pullup if we got more than one packet. + * Push the packets we have prepared into the ring. They may + * not all go. */ - if (i > xnf_max_tx_frags || page_oops) { - if (page_oops) - xnfp->xnf_stat_tx_pagebndry++; - if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) { - /* could not allocate resources? 
*/ -#ifdef XNF_DEBUG - cmn_err(CE_WARN, "xnf%d: pullupmsg failed", - ddi_get_instance(xnfp->xnf_devinfo)); -#endif - xnfp->xnf_stat_tx_defer++; - return (B_FALSE); /* Retry send */ - } - bufaddr = xmitbuf->buf; - } else { - xmitbuf = NULL; - bufaddr = (caddr_t)mp->b_rptr; - } - - /* set up data descriptor */ - length = pktlen; + if (head != NULL) + head = tx_push_packets(xnfp, head); /* - * Get packet id from free list + * If some packets that we prepared were not sent, unprepare + * them and add them back to the head of those we didn't + * prepare. */ - tx_id = xnfp->xnf_tx_pkt_id_list; - ASSERT(tx_id < NET_TX_RING_SIZE); - txp_info = &xnfp->xnf_tx_pkt_info[tx_id]; - xnfp->xnf_tx_pkt_id_list = txp_info->id; - txp_info->id = tx_id; - - /* Prepare for DMA mapping of tx buffer(s) */ - rc = ddi_dma_addr_bind_handle(txp_info->dma_handle, - NULL, bufaddr, length, DDI_DMA_WRITE | DDI_DMA_STREAMING, - DDI_DMA_DONTWAIT, 0, &dma_cookie, &ncookies); - if (rc != DDI_DMA_MAPPED) { - ASSERT(rc != DDI_DMA_INUSE); - ASSERT(rc != DDI_DMA_PARTIAL_MAP); - /* - * Return id to free list - */ - txp_info->id = xnfp->xnf_tx_pkt_id_list; - xnfp->xnf_tx_pkt_id_list = tx_id; - if (rc == DDI_DMA_NORESOURCES) { - xnfp->xnf_stat_tx_defer++; - return (B_FALSE); /* Retry later */ + { + xnf_txbuf_t *loop; + mblk_t *mp_head, *mp_tail; + int unprepared = 0; + + mp_head = mp_tail = NULL; + loop = head; + + while (loop != NULL) { + xnf_txbuf_t *next = loop->tx_next; + + if (loop->tx_bdesc == NULL) { + (void) gnttab_end_foreign_access_ref( + loop->tx_txreq.gref, 1); + gref_put(xnfp, loop->tx_txreq.gref); + (void) ddi_dma_unbind_handle( + loop->tx_dma_handle); + } else { + xnf_buf_put(xnfp, loop->tx_bdesc, B_TRUE); + } + + ASSERT(loop->tx_mp != NULL); + if (mp_head == NULL) + mp_head = loop->tx_mp; + mp_tail = loop->tx_mp; + + kmem_cache_free(xnfp->xnf_tx_buf_cache, loop); + loop = next; + unprepared++; } -#ifdef XNF_DEBUG - cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)", - ddi_get_instance(xnfp->xnf_devinfo), rc); -#endif - return (B_FALSE); + + if (mp_tail == NULL) { + ASSERT(mp_head == NULL); + } else { + ASSERT(mp_head != NULL); + + mp_tail->b_next = mp; + mp = mp_head; + } + + DTRACE_PROBE1(xnf_send_unprepared, int, unprepared); } - ASSERT(ncookies == 1); - ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_tx_head); - ASSERT((signed short)ref >= 0); - mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress)); - gnttab_grant_foreign_access_ref(ref, oeid, mfn, - xnfp->xnf_tx_pages_readonly); - txp_info->grant_ref = ref; - txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); - txrp->gref = ref; - txrp->size = dma_cookie.dmac_size; - txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET; - txrp->id = tx_id; - txrp->flags = 0; - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags); - if (pflags != 0) { - ASSERT(xnfp->xnf_cksum_offload); - /* - * If the local protocol stack requests checksum - * offload we set the 'checksum blank' flag, - * indicating to the peer that we need the checksum - * calculated for us. - * - * We _don't_ set the validated flag, because we haven't - * validated that the data and the checksum match. 
- */ - xnf_pseudo_cksum(bufaddr, length); - txrp->flags |= NETTXF_csum_blank; - xnfp->xnf_stat_tx_cksum_deferred++; - } - membar_producer(); - xnfp->xnf_tx_ring.req_prod_pvt = slot + 1; - - txp_info->mp = mp; - txp_info->bdesc = xmitbuf; - - xnfp->xnf_stat_opackets++; - xnfp->xnf_stat_obytes += pktlen; - - return (B_TRUE); /* successful transmit attempt */ -} - -mblk_t * -xnf_send(void *arg, mblk_t *mp) -{ - xnf_t *xnfp = arg; - mblk_t *next; - boolean_t sent_something = B_FALSE; - - mutex_enter(&xnfp->xnf_txlock); - /* - * Transmission attempts should be impossible without having - * previously called xnf_start(). + * If any mblks are left then we have deferred for some reason + * and need to ask for a re-schedule later. This is typically + * due to the ring filling. */ - ASSERT(xnfp->xnf_running); - - /* - * Wait for getting connected to the backend - */ - while (!xnfp->xnf_connected) { - cv_wait(&xnfp->xnf_cv, &xnfp->xnf_txlock); + if (mp != NULL) { + mutex_enter(&xnfp->xnf_schedlock); + xnfp->xnf_need_sched = B_TRUE; + mutex_exit(&xnfp->xnf_schedlock); + + xnfp->xnf_stat_tx_defer++; } - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - if (!xnf_send_one(xnfp, mp)) { - mp->b_next = next; - break; - } - - mp = next; - sent_something = B_TRUE; - } - - if (sent_something) { - boolean_t notify; - - /* LINTED: constant in conditional context */ - RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, - notify); - if (notify) - ec_notify_via_evtchn(xnfp->xnf_evtchn); - } - - if (mp != NULL) - xnfp->xnf_need_sched = B_TRUE; - - mutex_exit(&xnfp->xnf_txlock); - return (mp); } /* - * xnf_intr() -- ring interrupt service routine + * Notification of RX packets. Currently no TX-complete interrupt is + * used, as we clean the TX ring lazily. */ static uint_t xnf_intr(caddr_t arg) { xnf_t *xnfp = (xnf_t *)arg; - boolean_t sched = B_FALSE; - - mutex_enter(&xnfp->xnf_intrlock); - - /* spurious intr */ + mblk_t *mp; + boolean_t need_sched, clean_ring; + + mutex_enter(&xnfp->xnf_rxlock); + + /* + * Interrupts before we are connected are spurious. + */ if (!xnfp->xnf_connected) { - mutex_exit(&xnfp->xnf_intrlock); + mutex_exit(&xnfp->xnf_rxlock); xnfp->xnf_stat_unclaimed_interrupts++; return (DDI_INTR_UNCLAIMED); } -#ifdef XNF_DEBUG - if (xnfdebug & XNF_DEBUG_INT) - printf("xnf%d intr(0x%p)\n", - ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); -#endif - if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) { - mblk_t *mp; - - if (xnfp->xnf_rx_hvcopy) - mp = xnf_process_hvcopy_recv(xnfp); - else - mp = xnf_process_recv(xnfp); - - if (mp != NULL) - mac_rx(xnfp->xnf_mh, NULL, mp); + /* + * Receive side processing. + */ + do { + /* + * Collect buffers from the ring. + */ + xnf_rx_collect(xnfp); + + /* + * Interrupt me when the next receive buffer is consumed. + */ + xnfp->xnf_rx_ring.sring->rsp_event = + xnfp->xnf_rx_ring.rsp_cons + 1; + xen_mb(); + + } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)); + + if (xnfp->xnf_rx_new_buffers_posted) { + boolean_t notify; + + /* + * Indicate to the peer that we have re-filled the + * receive ring, if it cares. 
+ */ + /* LINTED: constant in conditional context */ + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify); + if (notify) + ec_notify_via_evtchn(xnfp->xnf_evtchn); + xnfp->xnf_rx_new_buffers_posted = B_FALSE; } + mp = xnfp->xnf_rx_head; + xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL; + xnfp->xnf_stat_interrupts++; - mutex_exit(&xnfp->xnf_intrlock); + mutex_exit(&xnfp->xnf_rxlock); + + if (mp != NULL) + mac_rx(xnfp->xnf_mh, NULL, mp); /* - * Clean tx ring and try to start any blocked xmit streams if - * there is now some space. + * Transmit side processing. + * + * If a previous transmit attempt failed or we have pending + * multicast requests, clean the ring. + * + * If we previously stalled transmission and cleaning produces + * some free slots, tell upstream to attempt sending again. + * + * The odd style is to avoid acquiring xnf_txlock unless we + * will actually look inside the tx machinery. */ - mutex_enter(&xnfp->xnf_txlock); - if (xnf_clean_tx_ring(xnfp) > 0) { - sched = xnfp->xnf_need_sched; - xnfp->xnf_need_sched = B_FALSE; + mutex_enter(&xnfp->xnf_schedlock); + need_sched = xnfp->xnf_need_sched; + clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0); + mutex_exit(&xnfp->xnf_schedlock); + + if (clean_ring) { + int free_slots; + + mutex_enter(&xnfp->xnf_txlock); + free_slots = tx_slots_get(xnfp, 0, B_FALSE); + + if (need_sched && (free_slots > 0)) { + mutex_enter(&xnfp->xnf_schedlock); + xnfp->xnf_need_sched = B_FALSE; + mutex_exit(&xnfp->xnf_schedlock); + + mac_tx_update(xnfp->xnf_mh); + } + mutex_exit(&xnfp->xnf_txlock); } - mutex_exit(&xnfp->xnf_txlock); - - if (sched) - mac_tx_update(xnfp->xnf_mh); return (DDI_INTR_CLAIMED); } @@ -1466,19 +1862,19 @@ xnf_t *xnfp = arg; #ifdef XNF_DEBUG - if (xnfdebug & XNF_DEBUG_TRACE) + if (xnf_debug & XNF_DEBUG_TRACE) printf("xnf%d start(0x%p)\n", ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); #endif - mutex_enter(&xnfp->xnf_intrlock); + mutex_enter(&xnfp->xnf_rxlock); mutex_enter(&xnfp->xnf_txlock); /* Accept packets from above. */ xnfp->xnf_running = B_TRUE; mutex_exit(&xnfp->xnf_txlock); - mutex_exit(&xnfp->xnf_intrlock); + mutex_exit(&xnfp->xnf_rxlock); return (0); } @@ -1490,389 +1886,217 @@ xnf_t *xnfp = arg; #ifdef XNF_DEBUG - if (xnfdebug & XNF_DEBUG_TRACE) + if (xnf_debug & XNF_DEBUG_TRACE) printf("xnf%d stop(0x%p)\n", ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); #endif - mutex_enter(&xnfp->xnf_intrlock); + mutex_enter(&xnfp->xnf_rxlock); mutex_enter(&xnfp->xnf_txlock); xnfp->xnf_running = B_FALSE; mutex_exit(&xnfp->xnf_txlock); - mutex_exit(&xnfp->xnf_intrlock); + mutex_exit(&xnfp->xnf_rxlock); } /* - * Driver private functions follow - */ - -/* - * Hang buffer on rx ring + * Hang buffer `bdesc' on the RX ring. 
*/ static void -rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc) +xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc) { - volatile netif_rx_request_t *reqp; - RING_IDX hang_ix; - grant_ref_t ref; - domid_t oeid; - - oeid = xvdi_get_oeid(xnfp->xnf_devinfo); - - ASSERT(MUTEX_HELD(&xnfp->xnf_intrlock)); + netif_rx_request_t *reqp; + RING_IDX hang_ix; + + ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); + reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, xnfp->xnf_rx_ring.req_prod_pvt); hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0)); - ASSERT(xnfp->xnf_rxpkt_bufptr[hang_ix] == NULL); - if (bdesc->grant_ref == GRANT_INVALID_REF) { - ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_rx_head); - ASSERT((signed short)ref >= 0); - bdesc->grant_ref = ref; - if (xnfp->xnf_rx_hvcopy) { - pfn_t pfn = xnf_btop(bdesc->buf_phys); - mfn_t mfn = pfn_to_mfn(pfn); - - gnttab_grant_foreign_access_ref(ref, oeid, mfn, 0); - } else { - gnttab_grant_foreign_transfer_ref(ref, oeid, 0); - } - } - reqp->id = hang_ix; + ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL); + + reqp->id = bdesc->id = hang_ix; reqp->gref = bdesc->grant_ref; - bdesc->id = hang_ix; - xnfp->xnf_rxpkt_bufptr[hang_ix] = bdesc; - membar_producer(); + + xnfp->xnf_rx_pkt_info[hang_ix] = bdesc; xnfp->xnf_rx_ring.req_prod_pvt++; + + xnfp->xnf_rx_new_buffers_posted = B_TRUE; } -static mblk_t * -xnf_process_hvcopy_recv(xnf_t *xnfp) +/* + * Collect packets from the RX ring, storing them in `xnfp' for later + * use. + */ +static void +xnf_rx_collect(xnf_t *xnfp) { - netif_rx_response_t *rxpkt; - mblk_t *mp, *head, *tail; - struct xnf_buffer_desc *bdesc; - boolean_t hwcsum = B_FALSE, notify, work_to_do; - size_t len; + mblk_t *head, *tail; + + ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); /* - * in loop over unconsumed responses, we do: + * Loop over unconsumed responses: * 1. get a response * 2. take corresponding buffer off recv. ring * 3. indicate this by setting slot to NULL * 4. create a new message and * 5. copy data in, adjust ptr - * - * outside loop: - * 7. make sure no more data has arrived; kick HV */ head = tail = NULL; -loop: while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) { + netif_rx_response_t *rxpkt; + xnf_buf_t *bdesc; + ssize_t len; + size_t off; + mblk_t *mp = NULL; + boolean_t hwcsum = B_FALSE; + grant_ref_t ref; /* 1. */ rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring, xnfp->xnf_rx_ring.rsp_cons); - DTRACE_PROBE4(got_PKT, int, (int)rxpkt->id, int, - (int)rxpkt->offset, - int, (int)rxpkt->flags, int, (int)rxpkt->status); + DTRACE_PROBE4(xnf_rx_got_rsp, int, (int)rxpkt->id, + int, (int)rxpkt->offset, + int, (int)rxpkt->flags, + int, (int)rxpkt->status); /* * 2. - * Take buffer off of receive ring + */ + bdesc = xnfp->xnf_rx_pkt_info[rxpkt->id]; + + /* + * 3. 
*/ - hwcsum = B_FALSE; - bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id]; - /* 3 */ - xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL; + xnfp->xnf_rx_pkt_info[rxpkt->id] = NULL; ASSERT(bdesc->id == rxpkt->id); - mp = NULL; + + ref = bdesc->grant_ref; + off = rxpkt->offset; + len = rxpkt->status; + if (!xnfp->xnf_running) { - DTRACE_PROBE4(pkt_dropped, int, rxpkt->status, + DTRACE_PROBE4(xnf_rx_not_running, + int, rxpkt->status, char *, bdesc->buf, int, rxpkt->offset, char *, ((char *)bdesc->buf) + rxpkt->offset); + xnfp->xnf_stat_drop++; - /* - * re-hang the buffer - */ - rx_buffer_hang(xnfp, bdesc); - } else if (rxpkt->status <= 0) { - DTRACE_PROBE4(pkt_status_negative, int, rxpkt->status, + + } else if (len <= 0) { + DTRACE_PROBE4(xnf_rx_pkt_status_negative, + int, rxpkt->status, char *, bdesc->buf, int, rxpkt->offset, char *, ((char *)bdesc->buf) + rxpkt->offset); + xnfp->xnf_stat_errrx++; - if (rxpkt->status == 0) + + switch (len) { + case 0: xnfp->xnf_stat_runt++; - if (rxpkt->status == NETIF_RSP_ERROR) + break; + case NETIF_RSP_ERROR: xnfp->xnf_stat_mac_rcv_error++; - if (rxpkt->status == NETIF_RSP_DROPPED) + break; + case NETIF_RSP_DROPPED: xnfp->xnf_stat_norxbuf++; - /* - * re-hang the buffer - */ - rx_buffer_hang(xnfp, bdesc); + break; + } + + } else if (bdesc->grant_ref == INVALID_GRANT_REF) { + cmn_err(CE_WARN, "Bad rx grant reference %d " + "from domain %d", ref, + xvdi_get_oeid(xnfp->xnf_devinfo)); + + } else if ((off + len) > PAGESIZE) { + cmn_err(CE_WARN, "Rx packet overflows page " + "(offset %ld, length %ld) from domain %d", + off, len, xvdi_get_oeid(xnfp->xnf_devinfo)); } else { - grant_ref_t ref = bdesc->grant_ref; - struct xnf_buffer_desc *new_bdesc; - unsigned long off = rxpkt->offset; - - DTRACE_PROBE4(pkt_status_ok, int, rxpkt->status, - char *, bdesc->buf, int, rxpkt->offset, - char *, ((char *)bdesc->buf) + rxpkt->offset); - len = rxpkt->status; + xnf_buf_t *nbuf = NULL; + + DTRACE_PROBE4(xnf_rx_packet, int, len, + char *, bdesc->buf, int, off, + char *, ((char *)bdesc->buf) + off); + ASSERT(off + len <= PAGEOFFSET); - if (ref == GRANT_INVALID_REF) { - mp = NULL; - new_bdesc = bdesc; - cmn_err(CE_WARN, "Bad rx grant reference %d " - "from dom %d", ref, - xvdi_get_oeid(xnfp->xnf_devinfo)); - goto luckless; - } - /* - * Release ref which we'll be re-claiming in - * rx_buffer_hang(). - */ - bdesc->grant_ref = GRANT_INVALID_REF; - (void) gnttab_end_foreign_access_ref(ref, 0); - gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head, - ref); + if (rxpkt->flags & NETRXF_data_validated) hwcsum = B_TRUE; /* - * XXPV for the initial implementation of HVcopy, - * create a new msg and copy in the data + * If the packet is below a pre-determined + * size we will copy data out rather than + * replace it. + */ + if (len > xnf_rx_copy_limit) + nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE); + + /* + * If we have a replacement buffer, attempt to + * wrap the existing one with an mblk_t in + * order that the upper layers of the stack + * might use it directly. */ - /* 4. */ - if ((mp = allocb(len, BPRI_MED)) == NULL) { - /* - * Couldn't get buffer to copy to, - * drop this data, and re-hang - * the buffer on the ring. - */ - xnfp->xnf_stat_norxbuf++; - DTRACE_PROBE(alloc_nix); - } else { - /* 5. */ - DTRACE_PROBE(alloc_ok); - bcopy(bdesc->buf + off, mp->b_wptr, - len); - mp->b_wptr += len; - } - new_bdesc = bdesc; -luckless: - - /* Re-hang old or hang new buffer. */ - rx_buffer_hang(xnfp, new_bdesc); - } - if (mp) { - if (hwcsum) { - /* - * See comments in xnf_process_recv(). 
- */ - - (void) hcksum_assoc(mp, NULL, - NULL, 0, 0, 0, 0, - HCK_FULLCKSUM | - HCK_FULLCKSUM_OK, - 0); - xnfp->xnf_stat_rx_cksum_no_need++; - } - if (head == NULL) { - head = tail = mp; - } else { - tail->b_next = mp; - tail = mp; + if (nbuf != NULL) { + mp = desballoc((unsigned char *)bdesc->buf, + bdesc->len, 0, &bdesc->free_rtn); + if (mp == NULL) { + xnfp->xnf_stat_rx_desballoc_fail++; + xnfp->xnf_stat_norxbuf++; + + xnf_buf_put(xnfp, nbuf, B_FALSE); + nbuf = NULL; + } else { + mp->b_rptr = mp->b_rptr + off; + mp->b_wptr = mp->b_rptr + len; + + /* + * Release the grant reference + * associated with this buffer + * - they are scarce and the + * upper layers of the stack + * don't need it. + */ + (void) gnttab_end_foreign_access_ref( + bdesc->grant_ref, 0); + gref_put(xnfp, bdesc->grant_ref); + bdesc->grant_ref = INVALID_GRANT_REF; + + bdesc = nbuf; + } } - ASSERT(mp->b_next == NULL); - - xnfp->xnf_stat_ipackets++; - xnfp->xnf_stat_rbytes += len; - } - - xnfp->xnf_rx_ring.rsp_cons++; - - xnfp->xnf_stat_hvcopy_packet_processed++; - } - - /* 7. */ - /* - * Has more data come in since we started? - */ - /* LINTED: constant in conditional context */ - RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do); - if (work_to_do) - goto loop; - - /* - * Indicate to the backend that we have re-filled the receive - * ring. - */ - /* LINTED: constant in conditional context */ - RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify); - if (notify) - ec_notify_via_evtchn(xnfp->xnf_evtchn); - - return (head); -} - -/* Process all queued received packets */ -static mblk_t * -xnf_process_recv(xnf_t *xnfp) -{ - volatile netif_rx_response_t *rxpkt; - mblk_t *mp, *head, *tail; - struct xnf_buffer_desc *bdesc; - extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *); - boolean_t hwcsum = B_FALSE, notify, work_to_do; - size_t len; - pfn_t pfn; - long cnt; - - head = tail = NULL; -loop: - while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) { - - rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring, - xnfp->xnf_rx_ring.rsp_cons); - - /* - * Take buffer off of receive ring - */ - hwcsum = B_FALSE; - bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id]; - xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL; - ASSERT(bdesc->id == rxpkt->id); - mp = NULL; - if (!xnfp->xnf_running) { - xnfp->xnf_stat_drop++; - /* - * re-hang the buffer - */ - rx_buffer_hang(xnfp, bdesc); - } else if (rxpkt->status <= 0) { - xnfp->xnf_stat_errrx++; - if (rxpkt->status == 0) - xnfp->xnf_stat_runt++; - if (rxpkt->status == NETIF_RSP_ERROR) - xnfp->xnf_stat_mac_rcv_error++; - if (rxpkt->status == NETIF_RSP_DROPPED) - xnfp->xnf_stat_norxbuf++; - /* - * re-hang the buffer - */ - rx_buffer_hang(xnfp, bdesc); - } else { - grant_ref_t ref = bdesc->grant_ref; - struct xnf_buffer_desc *new_bdesc; - unsigned long off = rxpkt->offset; - unsigned long mfn; - - len = rxpkt->status; - ASSERT(off + len <= PAGEOFFSET); - if (ref == GRANT_INVALID_REF) { - mp = NULL; - new_bdesc = bdesc; - cmn_err(CE_WARN, "Bad rx grant reference %d " - "from dom %d", ref, - xvdi_get_oeid(xnfp->xnf_devinfo)); - goto luckless; - } - bdesc->grant_ref = GRANT_INVALID_REF; - mfn = gnttab_end_foreign_transfer_ref(ref); - ASSERT(mfn != MFN_INVALID); - ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) == - PFN_INVALID); - - gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head, - ref); - reassign_pfn(xnf_btop(bdesc->buf_phys), mfn); - hat_devload(kas.a_hat, bdesc->buf, PAGESIZE, - xnf_btop(bdesc->buf_phys), - PROT_READ | PROT_WRITE, HAT_LOAD); - balloon_drv_added(1); - - if 
(rxpkt->flags & NETRXF_data_validated) - hwcsum = B_TRUE; - if (len <= xnf_rx_bcopy_thresh) { + if (nbuf == NULL) { /* - * For small buffers, just copy the data - * and send the copy upstream. - */ - new_bdesc = NULL; - } else { - /* - * We send a pointer to this data upstream; - * we need a new buffer to replace this one. + * No replacement buffer allocated - + * attempt to copy the data out and + * re-hang the existing buffer. */ - mutex_enter(&xnfp->xnf_rx_buf_mutex); - new_bdesc = xnf_get_buffer(xnfp); - if (new_bdesc != NULL) { - xnfp->xnf_rx_bufs_outstanding++; - } else { - xnfp->xnf_stat_rx_no_ringbuf++; - } - mutex_exit(&xnfp->xnf_rx_buf_mutex); - } - - if (new_bdesc == NULL) { - /* - * Don't have a new ring buffer; bcopy the data - * from the buffer, and preserve the - * original buffer - */ - if ((mp = allocb(len, BPRI_MED)) == NULL) { - /* - * Could't get buffer to copy to, - * drop this data, and re-hang - * the buffer on the ring. - */ + + /* 4. */ + mp = allocb(len, BPRI_MED); + if (mp == NULL) { + xnfp->xnf_stat_rx_allocb_fail++; xnfp->xnf_stat_norxbuf++; } else { + /* 5. */ bcopy(bdesc->buf + off, mp->b_wptr, len); - } - /* - * Give the buffer page back to xen - */ - pfn = xnf_btop(bdesc->buf_phys); - cnt = balloon_free_pages(1, &mfn, bdesc->buf, - &pfn); - if (cnt != 1) { - cmn_err(CE_WARN, "unable to give a " - "page back to the hypervisor\n"); + mp->b_wptr += len; } - new_bdesc = bdesc; - } else { - if ((mp = desballoc((unsigned char *)bdesc->buf, - off + len, 0, (frtn_t *)bdesc)) == NULL) { - /* - * Couldn't get mblk to pass recv data - * up with, free the old ring buffer - */ - xnfp->xnf_stat_norxbuf++; - xnf_rcv_complete(bdesc); - goto luckless; - } - (void) ddi_dma_sync(bdesc->dma_handle, - 0, 0, DDI_DMA_SYNC_FORCPU); - - mp->b_wptr += off; - mp->b_rptr += off; } -luckless: - if (mp) - mp->b_wptr += len; - /* re-hang old or hang new buffer */ - rx_buffer_hang(xnfp, new_bdesc); } - if (mp) { + + /* Re-hang the buffer. */ + xnf_rxbuf_hang(xnfp, bdesc); + + if (mp != NULL) { if (hwcsum) { /* * If the peer says that the data has @@ -1895,20 +2119,22 @@ * If it was necessary we could grovel * in the packet to find it. */ - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, HCK_FULLCKSUM | - HCK_FULLCKSUM_OK, - 0); + HCK_FULLCKSUM_OK, 0); xnfp->xnf_stat_rx_cksum_no_need++; } if (head == NULL) { - head = tail = mp; + ASSERT(tail == NULL); + + head = mp; } else { + ASSERT(tail != NULL); + tail->b_next = mp; - tail = mp; } + tail = mp; ASSERT(mp->b_next == NULL); @@ -1920,67 +2146,21 @@ } /* - * Has more data come in since we started? - */ - /* LINTED: constant in conditional context */ - RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do); - if (work_to_do) - goto loop; - - /* - * Indicate to the backend that we have re-filled the receive - * ring. + * Store the mblks we have collected. */ - /* LINTED: constant in conditional context */ - RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify); - if (notify) - ec_notify_via_evtchn(xnfp->xnf_evtchn); - - return (head); -} - -/* Called when the upper layers free a message we passed upstream */ -static void -xnf_rcv_complete(struct xnf_buffer_desc *bdesc) -{ - xnf_t *xnfp = bdesc->xnfp; - pfn_t pfn; - long cnt; - - /* One less outstanding receive buffer */ - mutex_enter(&xnfp->xnf_rx_buf_mutex); - --xnfp->xnf_rx_bufs_outstanding; - /* - * Return buffer to the free list, unless the free list is getting - * too large. XXPV - this threshold may need tuning. 
- */ - if (xnfp->xnf_rx_descs_free < xnf_rx_bufs_lowat) { - /* - * Unmap the page, and hand the machine page back - * to xen so it can be re-used as a backend net buffer. - */ - pfn = xnf_btop(bdesc->buf_phys); - cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn); - if (cnt != 1) { - cmn_err(CE_WARN, "unable to give a page back to the " - "hypervisor\n"); + if (head != NULL) { + ASSERT(tail != NULL); + + if (xnfp->xnf_rx_head == NULL) { + ASSERT(xnfp->xnf_rx_tail == NULL); + + xnfp->xnf_rx_head = head; + } else { + ASSERT(xnfp->xnf_rx_tail != NULL); + + xnfp->xnf_rx_tail->b_next = head; } - - bdesc->next = xnfp->xnf_free_list; - xnfp->xnf_free_list = bdesc; - xnfp->xnf_rx_descs_free++; - mutex_exit(&xnfp->xnf_rx_buf_mutex); - } else { - /* - * We can return everything here since we have a free buffer - * that we have not given the backing page for back to xen. - */ - --xnfp->xnf_rx_buffer_count; - mutex_exit(&xnfp->xnf_rx_buf_mutex); - (void) ddi_dma_unbind_handle(bdesc->dma_handle); - ddi_dma_mem_free(&bdesc->acc_handle); - ddi_dma_free_handle(&bdesc->dma_handle); - kmem_free(bdesc, sizeof (*bdesc)); + xnfp->xnf_rx_tail = tail; } } @@ -1991,34 +2171,16 @@ xnf_alloc_dma_resources(xnf_t *xnfp) { dev_info_t *devinfo = xnfp->xnf_devinfo; - int i; size_t len; ddi_dma_cookie_t dma_cookie; uint_t ncookies; - struct xnf_buffer_desc *bdesc; int rc; caddr_t rptr; - xnfp->xnf_n_rx = NET_RX_RING_SIZE; - xnfp->xnf_max_rx_bufs = xnf_rx_bufs_hiwat; - - xnfp->xnf_n_tx = NET_TX_RING_SIZE; - /* * The code below allocates all the DMA data structures that * need to be released when the driver is detached. * - * First allocate handles for mapping (virtual address) pointers to - * transmit data buffers to physical addresses - */ - for (i = 0; i < xnfp->xnf_n_tx; i++) { - if ((rc = ddi_dma_alloc_handle(devinfo, - &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0, - &xnfp->xnf_tx_pkt_info[i].dma_handle)) != DDI_SUCCESS) - return (DDI_FAILURE); - } - - /* * Allocate page for the transmit descriptor ring. */ if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, @@ -2092,18 +2254,6 @@ FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE); xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress; - /* - * Preallocate receive buffers for each receive descriptor. - */ - - /* Set up the "free list" of receive buffer descriptors */ - for (i = 0; i < xnfp->xnf_n_rx; i++) { - if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL) - goto alloc_error; - bdesc->next = xnfp->xnf_free_list; - xnfp->xnf_free_list = bdesc; - } - return (DDI_SUCCESS); alloc_error: @@ -2116,8 +2266,6 @@ /* * Release all DMA resources in the opposite order from acquisition - * Should not be called until all outstanding esballoc buffers - * have been returned. */ static void xnf_release_dma_resources(xnf_t *xnfp) @@ -2126,25 +2274,27 @@ /* * Free receive buffers which are currently associated with - * descriptors + * descriptors. */ - for (i = 0; i < xnfp->xnf_n_rx; i++) { - struct xnf_buffer_desc *bp; - - if ((bp = xnfp->xnf_rxpkt_bufptr[i]) == NULL) + mutex_enter(&xnfp->xnf_rxlock); + for (i = 0; i < NET_RX_RING_SIZE; i++) { + xnf_buf_t *bp; + + if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL) continue; - xnf_free_buffer(bp); - xnfp->xnf_rxpkt_bufptr[i] = NULL; + xnfp->xnf_rx_pkt_info[i] = NULL; + xnf_buf_put(xnfp, bp, B_FALSE); } - - /* Free the receive ring buffer */ + mutex_exit(&xnfp->xnf_rxlock); + + /* Free the receive ring buffer. 
*/ if (xnfp->xnf_rx_ring_dma_acchandle != NULL) { (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle); ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); xnfp->xnf_rx_ring_dma_acchandle = NULL; } - /* Free the transmit ring buffer */ + /* Free the transmit ring buffer. */ if (xnfp->xnf_tx_ring_dma_acchandle != NULL) { (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle); ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); @@ -2152,219 +2302,75 @@ xnfp->xnf_tx_ring_dma_acchandle = NULL; } - /* - * Free handles for mapping (virtual address) pointers to - * transmit data buffers to physical addresses - */ - for (i = 0; i < xnfp->xnf_n_tx; i++) { - if (xnfp->xnf_tx_pkt_info[i].dma_handle != NULL) { - ddi_dma_free_handle( - &xnfp->xnf_tx_pkt_info[i].dma_handle); - } - } - -} - -static void -xnf_release_mblks(xnf_t *xnfp) -{ - int i; - - for (i = 0; i < xnfp->xnf_n_tx; i++) { - if (xnfp->xnf_tx_pkt_info[i].mp == NULL) - continue; - freemsg(xnfp->xnf_tx_pkt_info[i].mp); - xnfp->xnf_tx_pkt_info[i].mp = NULL; - (void) ddi_dma_unbind_handle( - xnfp->xnf_tx_pkt_info[i].dma_handle); - } -} - -/* - * Remove a xmit buffer descriptor from the head of the free list and return - * a pointer to it. If no buffers on list, attempt to allocate a new one. - * Called with the tx_buf_mutex held. - */ -static struct xnf_buffer_desc * -xnf_get_tx_buffer(xnf_t *xnfp) -{ - struct xnf_buffer_desc *bdesc; - - bdesc = xnfp->xnf_tx_free_list; - if (bdesc != NULL) { - xnfp->xnf_tx_free_list = bdesc->next; - } else { - bdesc = xnf_alloc_tx_buffer(xnfp); - } - return (bdesc); -} - -/* - * Remove a buffer descriptor from the head of the free list and return - * a pointer to it. If no buffers on list, attempt to allocate a new one. - * Called with the rx_buf_mutex held. - */ -static struct xnf_buffer_desc * -xnf_get_buffer(xnf_t *xnfp) -{ - struct xnf_buffer_desc *bdesc; - - bdesc = xnfp->xnf_free_list; - if (bdesc != NULL) { - xnfp->xnf_free_list = bdesc->next; - xnfp->xnf_rx_descs_free--; - } else { - bdesc = xnf_alloc_buffer(xnfp); - } - return (bdesc); -} - -/* - * Free a xmit buffer back to the xmit free list - */ -static void -xnf_free_tx_buffer(struct xnf_buffer_desc *bp) -{ - xnf_t *xnfp = bp->xnfp; - - mutex_enter(&xnfp->xnf_tx_buf_mutex); - bp->next = xnfp->xnf_tx_free_list; - xnfp->xnf_tx_free_list = bp; - mutex_exit(&xnfp->xnf_tx_buf_mutex); } /* - * Put a buffer descriptor onto the head of the free list. - * for page-flip: - * We can't really free these buffers back to the kernel - * since we have given away their backing page to be used - * by the back end net driver. - * for hvcopy: - * release all the memory + * Release any packets and associated structures used by the TX ring. 
*/ static void -xnf_free_buffer(struct xnf_buffer_desc *bdesc) +xnf_release_mblks(xnf_t *xnfp) { - xnf_t *xnfp = bdesc->xnfp; - - mutex_enter(&xnfp->xnf_rx_buf_mutex); - if (xnfp->xnf_rx_hvcopy) { - if (ddi_dma_unbind_handle(bdesc->dma_handle) != DDI_SUCCESS) - goto out; - ddi_dma_mem_free(&bdesc->acc_handle); - ddi_dma_free_handle(&bdesc->dma_handle); - kmem_free(bdesc, sizeof (*bdesc)); - xnfp->xnf_rx_buffer_count--; - } else { - bdesc->next = xnfp->xnf_free_list; - xnfp->xnf_free_list = bdesc; - xnfp->xnf_rx_descs_free++; + RING_IDX i; + xnf_txid_t *tidp; + + for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; + i < NET_TX_RING_SIZE; + i++, tidp++) { + xnf_txbuf_t *txp = tidp->txbuf; + + if (txp != NULL) { + ASSERT(txp->tx_mp != NULL); + freemsg(txp->tx_mp); + + txid_put(xnfp, tidp); + kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); + } } -out: - mutex_exit(&xnfp->xnf_rx_buf_mutex); } -/* - * Allocate a DMA-able xmit buffer, including a structure to - * keep track of the buffer. Called with tx_buf_mutex held. - */ -static struct xnf_buffer_desc * -xnf_alloc_tx_buffer(xnf_t *xnfp) +static int +xnf_buf_constructor(void *buf, void *arg, int kmflag) { - struct xnf_buffer_desc *bdesc; + int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; + xnf_buf_t *bdesc = buf; + xnf_t *xnfp = arg; + ddi_dma_cookie_t dma_cookie; + uint_t ncookies; size_t len; - if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL) - return (NULL); - - /* allocate a DMA access handle for receive buffer */ - if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buffer_dma_attr, - 0, 0, &bdesc->dma_handle) != DDI_SUCCESS) + if (kmflag & KM_NOSLEEP) + ddiflags = DDI_DMA_DONTWAIT; + + /* Allocate a DMA access handle for the buffer. */ + if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr, + ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS) goto failure; - /* Allocate DMA-able memory for transmit buffer */ + /* Allocate DMA-able memory for buffer. */ if (ddi_dma_mem_alloc(bdesc->dma_handle, - PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0, + PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0, &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS) goto failure_1; + /* Bind to virtual address of buffer to get physical address. */ + if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL, + bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING, + ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) + goto failure_2; + ASSERT(ncookies == 1); + + bdesc->free_rtn.free_func = xnf_buf_recycle; + bdesc->free_rtn.free_arg = (caddr_t)bdesc; bdesc->xnfp = xnfp; - xnfp->xnf_tx_buffer_count++; - - return (bdesc); - -failure_1: - ddi_dma_free_handle(&bdesc->dma_handle); - -failure: - kmem_free(bdesc, sizeof (*bdesc)); - return (NULL); -} - -/* - * Allocate a DMA-able receive buffer, including a structure to - * keep track of the buffer. Called with rx_buf_mutex held. 
- */ -static struct xnf_buffer_desc * -xnf_alloc_buffer(xnf_t *xnfp) -{ - struct xnf_buffer_desc *bdesc; - size_t len; - uint_t ncookies; - ddi_dma_cookie_t dma_cookie; - long cnt; - pfn_t pfn; - - if (xnfp->xnf_rx_buffer_count >= xnfp->xnf_max_rx_bufs) - return (NULL); - - if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL) - return (NULL); - - /* allocate a DMA access handle for receive buffer */ - if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buffer_dma_attr, - 0, 0, &bdesc->dma_handle) != DDI_SUCCESS) - goto failure; - - /* Allocate DMA-able memory for receive buffer */ - if (ddi_dma_mem_alloc(bdesc->dma_handle, - PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0, - &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS) - goto failure_1; - - /* bind to virtual address of buffer to get physical address */ - if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL, - bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING, - DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) - goto failure_2; - bdesc->buf_phys = dma_cookie.dmac_laddress; - bdesc->xnfp = xnfp; - if (xnfp->xnf_rx_hvcopy) { - bdesc->free_rtn.free_func = xnf_copy_rcv_complete; - } else { - bdesc->free_rtn.free_func = xnf_rcv_complete; - } - bdesc->free_rtn.free_arg = (char *)bdesc; - bdesc->grant_ref = GRANT_INVALID_REF; - ASSERT(ncookies == 1); - - xnfp->xnf_rx_buffer_count++; - - if (!xnfp->xnf_rx_hvcopy) { - /* - * Unmap the page, and hand the machine page back - * to xen so it can be used as a backend net buffer. - */ - pfn = xnf_btop(bdesc->buf_phys); - cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn); - if (cnt != 1) { - cmn_err(CE_WARN, "unable to give a page back to the " - "hypervisor\n"); - } - } - - return (bdesc); + bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); + bdesc->len = dma_cookie.dmac_size; + bdesc->grant_ref = INVALID_GRANT_REF; + bdesc->gen = xnfp->xnf_gen; + + atomic_add_64(&xnfp->xnf_stat_buf_allocated, 1); + + return (0); failure_2: ddi_dma_mem_free(&bdesc->acc_handle); @@ -2373,8 +2379,117 @@ ddi_dma_free_handle(&bdesc->dma_handle); failure: - kmem_free(bdesc, sizeof (*bdesc)); - return (NULL); + + return (-1); +} + +static void +xnf_buf_destructor(void *buf, void *arg) +{ + xnf_buf_t *bdesc = buf; + xnf_t *xnfp = arg; + + (void) ddi_dma_unbind_handle(bdesc->dma_handle); + ddi_dma_mem_free(&bdesc->acc_handle); + ddi_dma_free_handle(&bdesc->dma_handle); + + atomic_add_64(&xnfp->xnf_stat_buf_allocated, -1); +} + +static xnf_buf_t * +xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly) +{ + grant_ref_t gref; + xnf_buf_t *bufp; + + /* + * Usually grant references are more scarce than memory, so we + * attempt to acquire a grant reference first. + */ + gref = gref_get(xnfp); + if (gref == INVALID_GRANT_REF) + return (NULL); + + bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags); + if (bufp == NULL) { + gref_put(xnfp, gref); + return (NULL); + } + + ASSERT(bufp->grant_ref == INVALID_GRANT_REF); + + bufp->grant_ref = gref; + + if (bufp->gen != xnfp->xnf_gen) + xnf_buf_refresh(bufp); + + gnttab_grant_foreign_access_ref(bufp->grant_ref, + xvdi_get_oeid(bufp->xnfp->xnf_devinfo), + bufp->buf_mfn, readonly ? 1 : 0); + + atomic_add_64(&xnfp->xnf_stat_buf_outstanding, 1); + + return (bufp); +} + +static void +xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly) +{ + if (bufp->grant_ref != INVALID_GRANT_REF) { + (void) gnttab_end_foreign_access_ref( + bufp->grant_ref, readonly ? 
1 : 0); + gref_put(xnfp, bufp->grant_ref); + bufp->grant_ref = INVALID_GRANT_REF; + } + + kmem_cache_free(xnfp->xnf_buf_cache, bufp); + + atomic_add_64(&xnfp->xnf_stat_buf_outstanding, -1); +} + +/* + * Refresh any cached data about a buffer after resume. + */ +static void +xnf_buf_refresh(xnf_buf_t *bdesc) +{ + bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); + bdesc->gen = bdesc->xnfp->xnf_gen; +} + +/* + * Streams `freeb' routine for `xnf_buf_t' when used as transmit + * look-aside buffers. + */ +static void +xnf_buf_recycle(xnf_buf_t *bdesc) +{ + xnf_t *xnfp = bdesc->xnfp; + + xnf_buf_put(xnfp, bdesc, B_TRUE); +} + +static int +xnf_tx_buf_constructor(void *buf, void *arg, int kmflag) +{ + _NOTE(ARGUNUSED(kmflag)); + xnf_txbuf_t *txp = buf; + xnf_t *xnfp = arg; + + if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr, + 0, 0, &txp->tx_dma_handle) != DDI_SUCCESS) + return (-1); + + return (0); +} + +static void +xnf_tx_buf_destructor(void *buf, void *arg) +{ + _NOTE(ARGUNUSED(arg)); + xnf_txbuf_t *txp = buf; + + ddi_dma_free_handle(&txp->tx_dma_handle); } /* @@ -2388,8 +2503,13 @@ "tx_pullup", "tx_pagebndry", "tx_attempt", - "rx_no_ringbuf", - "hvcopy_packet_processed", + "buf_allocated", + "buf_outstanding", + "gref_outstanding", + "gref_failure", + "gref_peak", + "rx_allocb_fail", + "rx_desballoc_fail", }; static int @@ -2416,9 +2536,14 @@ (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup; (knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry; (knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt; - (knp++)->value.ui64 = xnfp->xnf_stat_rx_no_ringbuf; - - (knp++)->value.ui64 = xnfp->xnf_stat_hvcopy_packet_processed; + + (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated; + (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding; + (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding; + (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure; + (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak; + (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail; + (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail; return (0); } @@ -2462,7 +2587,7 @@ { xnf_t *xnfp = arg; - mutex_enter(&xnfp->xnf_intrlock); + mutex_enter(&xnfp->xnf_rxlock); mutex_enter(&xnfp->xnf_txlock); #define mac_stat(q, r) \ @@ -2500,7 +2625,7 @@ default: mutex_exit(&xnfp->xnf_txlock); - mutex_exit(&xnfp->xnf_intrlock); + mutex_exit(&xnfp->xnf_rxlock); return (ENOTSUP); } @@ -2509,22 +2634,15 @@ #undef ether_stat mutex_exit(&xnfp->xnf_txlock); - mutex_exit(&xnfp->xnf_intrlock); + mutex_exit(&xnfp->xnf_rxlock); return (0); } -/*ARGSUSED*/ -static void -xnf_ioctl(void *arg, queue_t *q, mblk_t *mp) -{ - miocnak(q, mp, 0, EINVAL); -} - static boolean_t xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data) { - xnf_t *xnfp = arg; + _NOTE(ARGUNUSED(arg)); switch (cap) { case MAC_CAPAB_HCKSUM: { @@ -2547,10 +2665,7 @@ * field and must insert the pseudo-header checksum * before passing the packet to the IO domain. */ - if (xnfp->xnf_cksum_offload) - *capab = HCKSUM_INET_FULL_V4; - else - *capab = 0; + *capab = HCKSUM_INET_FULL_V4; break; } default: @@ -2560,74 +2675,95 @@ return (B_TRUE); } -/*ARGSUSED*/ +/* + * The state of the peer has changed - react accordingly. 
+ */
static void
oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
void *arg, void *impl_data)
{
+ _NOTE(ARGUNUSED(id, arg));
xnf_t *xnfp = ddi_get_driver_private(dip);
XenbusState new_state = *(XenbusState *)impl_data;
ASSERT(xnfp != NULL);
switch (new_state) {
+ case XenbusStateUnknown:
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
+ case XenbusStateClosing:
+ case XenbusStateClosed:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
+ break;
+
+ case XenbusStateInitWait:
+ xnf_read_config(xnfp);
+
+ if (!xnfp->xnf_be_rx_copy) {
+ cmn_err(CE_WARN,
+ "The xnf driver requires a dom0 that "
+ "supports 'feature-rx-copy'.");
+ (void) xvdi_switch_state(xnfp->xnf_devinfo,
+ XBT_NULL, XenbusStateClosed);
+ break;
+ }
+
+ /*
+ * Connect to the backend.
+ */
+ xnf_be_connect(xnfp);
+
+ /*
+ * Our MAC address as discovered by xnf_read_config().
+ */
+ mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
+
+ break;
+
case XenbusStateConnected:
- mutex_enter(&xnfp->xnf_intrlock);
+ mutex_enter(&xnfp->xnf_rxlock);
mutex_enter(&xnfp->xnf_txlock);
xnfp->xnf_connected = B_TRUE;
/*
- * wake up threads wanting to send data to backend,
- * but got blocked due to backend is not ready
+ * Wake up any threads waiting to send data to
+ * the backend.
*/
- cv_broadcast(&xnfp->xnf_cv);
+ cv_broadcast(&xnfp->xnf_cv_state);
mutex_exit(&xnfp->xnf_txlock);
- mutex_exit(&xnfp->xnf_intrlock);
+ mutex_exit(&xnfp->xnf_rxlock);
/*
- * kick backend in case it missed any tx request
- * in the TX ring buffer
+ * Kick the peer in case it missed any transmit
+ * requests in the TX ring.
*/
ec_notify_via_evtchn(xnfp->xnf_evtchn);
/*
- * there maybe already queued rx data in the RX ring
- * sent by backend after it gets connected but before
- * we see its state change here, so we call our intr
- * handling routine to handle them, if any
+ * There may already be completed receive requests in
+ * the ring sent by the backend after it gets connected
+ * but before we see its state change here, so we call
+ * xnf_intr() to handle them, if any.
*/
(void) xnf_intr((caddr_t)xnfp);
- /* mark as link up after get connected */
+ /*
+ * Mark the link up now that we are connected.
+ */
mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
+ /*
+ * Tell the backend about the multicast addresses in
+ * which we are interested.
+ */
+ mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
+
break;
default:
break;
}
}
-
-/*
- * Check whether backend is capable of and willing to talk
- * to us via hypervisor copy, as opposed to page flip.
- */
-static boolean_t
-xnf_hvcopy_peer_status(dev_info_t *devinfo)
-{
- int be_rx_copy;
- int err;
-
- err = xenbus_scanf(XBT_NULL, xvdi_get_oename(devinfo),
- "feature-rx-copy", "%d", &be_rx_copy);
- /*
- * If we fail to read the store we assume that the key is
- * absent, implying an older domain at the far end. Older
- * domains cannot do HV copy (we assume ..).
- */
- if (err != 0)
- be_rx_copy = 0;
-
- return (be_rx_copy?B_TRUE:B_FALSE);
-}
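
[Editor's note: the reworked xnf_rx_collect() above chooses between two delivery strategies. Packets no larger than xnf_rx_copy_limit are copied into a fresh mblk so that the ring buffer and its scarce grant reference can be re-hung immediately; larger packets are wrapped with desballoc() and loaned upstream, with a replacement buffer taking their ring slot. The fragment below is a minimal user-space sketch of that threshold decision, not driver code; the pkt_t type, the RX_COPY_LIMIT value and the rx_deliver() name are all illustrative.]

#include <stdlib.h>
#include <string.h>

/* Illustrative stand-ins for the driver's buffer and mblk types. */
typedef struct pkt {
	unsigned char	*data;
	size_t		len;
	int		loaned;	/* non-zero if `data' still belongs to the ring */
} pkt_t;

#define	RX_COPY_LIMIT	256	/* stands in for xnf_rx_copy_limit */

/*
 * Deliver one received buffer: copy small packets so that the ring
 * slot (and, in the driver, its grant reference) can be reused at
 * once; loan large packets out and let a replacement buffer take
 * the slot (replacement allocation not shown).
 */
static pkt_t *
rx_deliver(unsigned char *ringbuf, size_t off, size_t len)
{
	pkt_t *p = malloc(sizeof (*p));

	if (p == NULL)
		return (NULL);

	if (len <= RX_COPY_LIMIT) {
		p->data = malloc(len);
		if (p->data == NULL) {
			free(p);
			return (NULL);	/* the driver counts this as norxbuf */
		}
		(void) memcpy(p->data, ringbuf + off, len);
		p->loaned = 0;
	} else {
		p->data = ringbuf + off;	/* zero-copy hand-off */
		p->loaned = 1;
	}
	p->len = len;

	return (p);
}
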
--- a/usr/src/uts/common/xen/io/xnf.h Wed Nov 04 21:40:43 2009 -0800 +++ b/usr/src/uts/common/xen/io/xnf.h Thu Nov 05 01:05:36 2009 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,70 +37,74 @@ #define XNF_MAXPKT 1500 /* MTU size */ #define XNF_FRAMESIZE 1514 /* frame size including MAC header */ -#define XNF_MAX_RXDESCS 256 - -#define MCAST_HASHBITS 256 - -extern int xnf_diagnose; /* Available for use any time. */ - -/* Flags to set in the global xnf_diagnose */ -#define XNF_DIAG_RX 0x01 -#define XNF_DIAG_TX 0x02 -#define XNF_DIAG_STATS 0x04 -#define XNF_DIAG_RX_BUFS 0x08 - /* DEBUG flags */ #define XNF_DEBUG_DDI 0x01 #define XNF_DEBUG_TRACE 0x02 -#define XNF_DEBUG_SEND 0x04 -#define XNF_DEBUG_INT 0x08 -#define XNF_DESC_ALIGN 8 - - -/* Info pertaining to each xmit/receive buffer */ -struct xnf_buffer_desc { - frtn_t free_rtn; /* desballoc() structure */ +/* + * Information about each receive buffer and any transmit look-aside + * buffers. + */ +typedef struct xnf_buf { + frtn_t free_rtn; struct xnf *xnfp; ddi_dma_handle_t dma_handle; caddr_t buf; /* DMA-able data buffer */ paddr_t buf_phys; - struct xnf_buffer_desc *next; /* For linking into free list */ + mfn_t buf_mfn; + size_t len; + struct xnf_buf *next; /* For linking into free list */ ddi_acc_handle_t acc_handle; grant_ref_t grant_ref; /* grant table reference */ uint16_t id; /* buffer id */ -}; + unsigned int gen; +} xnf_buf_t; + +/* + * Information about each transmit buffer. + */ +typedef struct xnf_txbuf { + struct xnf_txbuf *tx_next; + mblk_t *tx_mp; /* mblk associated with packet */ + netif_tx_request_t tx_txreq; + caddr_t tx_bufp; + ddi_dma_handle_t tx_dma_handle; + mfn_t tx_mfn; + xnf_buf_t *tx_bdesc; /* Look-aside buffer, if used. */ + unsigned char tx_type; + int16_t tx_status; + RING_IDX tx_slot; -/* Various information about each transmit packet */ -struct tx_pktinfo { - mblk_t *mp; /* mblk associated with packet */ - ddi_dma_handle_t dma_handle; - struct xnf_buffer_desc *bdesc; /* pointer to buffer descriptor */ - grant_ref_t grant_ref; /* grant table reference */ - uint16_t id; /* tx pkt id/free list next pointer */ -}; +#define TX_DATA 1 +#define TX_MCAST_REQ 2 +#define TX_MCAST_RSP 3 +} xnf_txbuf_t; -/* Per network-interface-controller driver private structure */ +/* + * Information about each outstanding transmit operation. + */ +typedef struct xnf_txid { + uint16_t id; /* Id of this transmit buffer. */ + uint16_t next; /* Freelist of ids. */ + xnf_txbuf_t *txbuf; /* Buffer details. */ +} xnf_txid_t; + +/* + * Per-instance data. + */ typedef struct xnf { /* most interesting stuff first to assist debugging */ - dev_info_t *xnf_devinfo; /* System per-device info. */ - mac_handle_t xnf_mh; /* Nemo per-device info. */ - int xnf_rx_bufs_outstanding; - int xnf_tx_descs_free; - int xnf_rx_descs_free; /* count of free rx bufs */ - int xnf_n_tx; /* No. xmit descriptors */ - int xnf_n_rx; /* No. recv descriptors */ - int xnf_n_rx_bufs; /* No. recv DMA buffers */ - int xnf_tx_start_thresh_regval; + dev_info_t *xnf_devinfo; + mac_handle_t xnf_mh; unsigned char xnf_mac_addr[ETHERADDRL]; - int xnf_max_rx_bufs; - int xnf_rx_buffer_count; - int xnf_tx_buffer_count; + + unsigned int xnf_gen; /* Increments on resume. 
*/ boolean_t xnf_connected; boolean_t xnf_running; - boolean_t xnf_cksum_offload; + boolean_t xnf_be_rx_copy; + boolean_t xnf_be_mcast_control; uint64_t xnf_stat_interrupts; uint64_t xnf_stat_unclaimed_interrupts; @@ -112,7 +116,6 @@ uint64_t xnf_stat_tx_pullup; uint64_t xnf_stat_tx_pagebndry; uint64_t xnf_stat_tx_defer; - uint64_t xnf_stat_rx_no_ringbuf; uint64_t xnf_stat_mac_rcv_error; uint64_t xnf_stat_runt; @@ -123,44 +126,54 @@ uint64_t xnf_stat_tx_cksum_deferred; uint64_t xnf_stat_rx_cksum_no_need; - uint64_t xnf_stat_hvcopy_enabled; /* on/off */ - uint64_t xnf_stat_hvcopy_packet_processed; + + uint64_t xnf_stat_buf_allocated; + uint64_t xnf_stat_buf_outstanding; + uint64_t xnf_stat_gref_outstanding; + uint64_t xnf_stat_gref_failure; + uint64_t xnf_stat_gref_peak; + uint64_t xnf_stat_rx_allocb_fail; + uint64_t xnf_stat_rx_desballoc_fail; kstat_t *xnf_kstat_aux; - struct xnf_buffer_desc *xnf_free_list; - struct xnf_buffer_desc *xnf_tx_free_list; - int xnf_tx_pkt_id_list; - /* free list of avail pkt ids */ - struct tx_pktinfo xnf_tx_pkt_info[NET_TX_RING_SIZE]; - struct xnf_buffer_desc *xnf_rxpkt_bufptr[XNF_MAX_RXDESCS]; + ddi_iblock_cookie_t xnf_icookie; - ddi_iblock_cookie_t xnf_icookie; - kmutex_t xnf_tx_buf_mutex; - kmutex_t xnf_rx_buf_mutex; - kmutex_t xnf_txlock; - kmutex_t xnf_intrlock; - boolean_t xnf_tx_pages_readonly; - boolean_t xnf_need_sched; - - netif_tx_front_ring_t xnf_tx_ring; /* tx interface struct ptr */ + netif_tx_front_ring_t xnf_tx_ring; ddi_dma_handle_t xnf_tx_ring_dma_handle; ddi_acc_handle_t xnf_tx_ring_dma_acchandle; paddr_t xnf_tx_ring_phys_addr; grant_ref_t xnf_tx_ring_ref; - netif_rx_front_ring_t xnf_rx_ring; /* rx interface struct ptr */ + xnf_txid_t xnf_tx_pkt_id[NET_TX_RING_SIZE]; + uint16_t xnf_tx_pkt_id_head; + kmutex_t xnf_txlock; + kmutex_t xnf_schedlock; + boolean_t xnf_need_sched; + kcondvar_t xnf_cv_tx_slots; + kmem_cache_t *xnf_tx_buf_cache; + + netif_rx_front_ring_t xnf_rx_ring; ddi_dma_handle_t xnf_rx_ring_dma_handle; ddi_acc_handle_t xnf_rx_ring_dma_acchandle; paddr_t xnf_rx_ring_phys_addr; grant_ref_t xnf_rx_ring_ref; - uint16_t xnf_evtchn; /* channel to back end ctlr */ - grant_ref_t xnf_gref_tx_head; /* tx grant free list */ - grant_ref_t xnf_gref_rx_head; /* rx grant free list */ - kcondvar_t xnf_cv; + xnf_buf_t *xnf_rx_pkt_info[NET_RX_RING_SIZE]; + kmutex_t xnf_rxlock; + mblk_t *xnf_rx_head; + mblk_t *xnf_rx_tail; + boolean_t xnf_rx_new_buffers_posted; + kmem_cache_t *xnf_buf_cache; - boolean_t xnf_rx_hvcopy; /* do we do HV copy? */ + uint16_t xnf_evtchn; + + kmutex_t xnf_gref_lock; + grant_ref_t xnf_gref_head; + + kcondvar_t xnf_cv_state; + kcondvar_t xnf_cv_multicast; + uint_t xnf_pending_multicast; } xnf_t; #ifdef __cplusplus
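
[Editor's note: the xnf_txid_t entries defined above thread a freelist through their `next' fields, with xnf_tx_pkt_id_head naming the first free slot, so a free transmit id can be found without searching the array. Below is a minimal sketch of how such an index-linked freelist is commonly managed. It is illustrative only: the INVALID_ID sentinel, the RING_SIZE value and the txid_init/txid_get/txid_put names are assumptions, since the driver's own helpers are not part of this hunk.]

#include <stdint.h>
#include <stddef.h>

#define	RING_SIZE	256			/* stands in for NET_TX_RING_SIZE */
#define	INVALID_ID	((uint16_t)0xffff)	/* illustrative end-of-list sentinel */

typedef struct txid {
	uint16_t	id;	/* this entry's own index */
	uint16_t	next;	/* index of the next free entry */
	void		*txbuf;	/* NULL while the id is free */
} txid_t;

static txid_t	tx_ids[RING_SIZE];
static uint16_t	tx_id_head;

/* Chain every entry onto the freelist at start-up. */
static void
txid_init(void)
{
	for (uint16_t i = 0; i < RING_SIZE; i++) {
		tx_ids[i].id = i;
		tx_ids[i].next = (uint16_t)(i + 1);
		tx_ids[i].txbuf = NULL;
	}
	tx_ids[RING_SIZE - 1].next = INVALID_ID;
	tx_id_head = 0;
}

/* Pop a free id in O(1), or return NULL if all are in use. */
static txid_t *
txid_get(void)
{
	if (tx_id_head == INVALID_ID)
		return (NULL);

	txid_t *tidp = &tx_ids[tx_id_head];
	tx_id_head = tidp->next;
	tidp->next = INVALID_ID;

	return (tidp);
}

/* Push an id back onto the freelist in O(1). */
static void
txid_put(txid_t *tidp)
{
	tidp->txbuf = NULL;
	tidp->next = tx_id_head;
	tx_id_head = tidp->id;
}
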