changeset 10958:2d0d7434a4fb

6538700 xnf->xnb protocol should support multicast add/remove
6648615 xnb/xnf should support feature-no-csum-offload correctly
6758615 WARNING: The xnf driver requires a dom0 that supports 'feature-rx-copy'
6746372 A hot lock in xnb limits scalability between guest domains
6729609 High CPU utilization @domU while doing Tx at one guest domain due to a hot lock in the xnf driver
author David Edmondson <dme@sun.com>
date Thu, 05 Nov 2009 01:05:36 -0800
parents 7681ab1c3e80
children 03b72d60ca65
files usr/src/uts/common/xen/io/xnb.c usr/src/uts/common/xen/io/xnb.h usr/src/uts/common/xen/io/xnbo.c usr/src/uts/common/xen/io/xnbu.c usr/src/uts/common/xen/io/xnf.c usr/src/uts/common/xen/io/xnf.h
diffstat 6 files changed, 2387 insertions(+), 2060 deletions(-)
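
The multicast support (6538700) extends the xnf<->xnb ring protocol: the
backend advertises "feature-multicast-control" in xenstore, the guest opts in
by writing "request-multicast-control", and membership changes then travel as
netif_extra_info slots on the transmit ring. A minimal sketch of the frontend
side (the helper name, ring variable and request-id handling are hypothetical;
the flags, types and extra-info layout are those consumed by xnb_from_peer()
below):

/*
 * Hypothetical frontend helper: ask the backend to add or remove a
 * multicast group.  Only valid once the guest has written
 * "request-multicast-control" = 1 in response to the backend's
 * "feature-multicast-control".  The request occupies two ring slots: a
 * zero-length tx request flagged NETTXF_extra_info, followed by the
 * extra-info slot carrying the address.
 */
static void
xnf_mcast_request(netif_tx_front_ring_t *ring, const ether_addr_t *addr,
    boolean_t add)
{
	netif_tx_request_t *txreq;
	struct netif_extra_info *erp;
	RING_IDX i = ring->req_prod_pvt;

	txreq = RING_GET_REQUEST(ring, i);
	txreq->id = 0;		/* a real frontend tracks ids per slot */
	txreq->size = 0;
	txreq->flags = NETTXF_extra_info;

	erp = (struct netif_extra_info *)RING_GET_REQUEST(ring, i + 1);
	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
	erp->flags = 0;
	bcopy(addr, &erp->u.mcast.addr, sizeof (erp->u.mcast.addr));

	ring->req_prod_pvt = i + 2;
	/* Pushing the requests and notifying the backend is elided. */
}
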
--- a/usr/src/uts/common/xen/io/xnb.c	Wed Nov 04 21:40:43 2009 -0800
+++ b/usr/src/uts/common/xen/io/xnb.c	Thu Nov 05 01:05:36 2009 -0800
@@ -35,7 +35,7 @@
 #include <sys/modctl.h>
 #include <sys/conf.h>
 #include <sys/mac.h>
-#include <sys/mac_impl.h> /* XXXXBOW - remove, included for mac_fix_cksum() */
+#include <sys/mac_impl.h> /* For mac_fix_cksum(). */
 #include <sys/dlpi.h>
 #include <sys/strsubr.h>
 #include <sys/strsun.h>
@@ -49,11 +49,10 @@
 #include <sys/evtchn_impl.h>
 #include <sys/gnttab.h>
 #include <vm/vm_dep.h>
-
+#include <sys/note.h>
 #include <sys/gld.h>
 #include <inet/ip.h>
 #include <inet/ip_impl.h>
-#include <sys/vnic_impl.h> /* blech. */
 
 /*
  * The terms "transmit" and "receive" are used in alignment with domU,
@@ -62,23 +61,9 @@
  */
 
 /*
- * XXPV dme: things to do, as well as various things indicated
- * throughout the source:
- * - copy avoidance outbound.
- * - copy avoidance inbound.
- * - transfer credit limiting.
- * - MAC address based filtering.
+ * Should we allow guests to manipulate multicast group membership?
  */
-
-/*
- * Should we attempt to defer checksum calculation?
- */
-static boolean_t	xnb_cksum_offload = B_TRUE;
-/*
- * When receiving packets from a guest, should they be copied
- * or used as-is (esballoc)?
- */
-static boolean_t	xnb_tx_always_copy = B_TRUE;
+static boolean_t	xnb_multicast_control = B_TRUE;
 
 static boolean_t	xnb_connect_rings(dev_info_t *);
 static void		xnb_disconnect_rings(dev_info_t *);
@@ -89,31 +74,55 @@
 
 static int	xnb_txbuf_constructor(void *, void *, int);
 static void	xnb_txbuf_destructor(void *, void *);
-static xnb_txbuf_t *xnb_txbuf_get(xnb_t *, int);
-static void	xnb_txbuf_put(xnb_t *, xnb_txbuf_t *);
-static void	xnb_tx_notify_peer(xnb_t *);
-static void	xnb_tx_complete(xnb_txbuf_t *);
+static void	xnb_tx_notify_peer(xnb_t *, boolean_t);
 static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
-static void 	xnb_tx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *,
-    xnb_txbuf_t *);
-static void	xnb_tx_perform_pending_unmop(xnb_t *);
+
+mblk_t		*xnb_to_peer(xnb_t *, mblk_t *);
 mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
 
-int		xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2;
-int		xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2);
-
+static void		setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
+    size_t, size_t, size_t, grant_ref_t);
+#pragma inline(setup_gop)
+static boolean_t	is_foreign(void *);
+#pragma inline(is_foreign)
 
-boolean_t	xnb_hv_copy = B_TRUE;
-boolean_t	xnb_explicit_pageflip_set = B_FALSE;
-
-/* XXPV dme: are these really invalid? */
 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
 
-static kmem_cache_t *xnb_txbuf_cachep;
 static kmutex_t	xnb_alloc_page_lock;
 
 /*
+ * On a 32 bit PAE system physical and machine addresses are larger
+ * than 32 bits.  ddi_btop() on such systems takes an unsigned long
+ * argument, and so addresses above 4G are truncated before ddi_btop()
+ * gets to see them.  To avoid this, code the shift operation here.
+ */
+#define	xnb_btop(addr)	((addr) >> PAGESHIFT)
+
+/* DMA attributes for transmit and receive data */
+static ddi_dma_attr_t buf_dma_attr = {
+	DMA_ATTR_V0,		/* version of this structure */
+	0,			/* lowest usable address */
+	0xffffffffffffffffULL,	/* highest usable address */
+	0x7fffffff,		/* maximum DMAable byte count */
+	MMU_PAGESIZE,		/* alignment in bytes */
+	0x7ff,			/* bitmap of burst sizes */
+	1,			/* minimum transfer */
+	0xffffffffU,		/* maximum transfer */
+	0xffffffffffffffffULL,	/* maximum segment length */
+	1,			/* maximum number of segments */
+	1,			/* granularity */
+	0,			/* flags (reserved) */
+};
+
+/* DMA access attributes for data: NOT to be byte swapped. */
+static ddi_device_acc_attr_t data_accattr = {
+	DDI_DEVICE_ATTR_V0,
+	DDI_NEVERSWAP_ACC,
+	DDI_STRICTORDER_ACC
+};
+
+/*
  * Statistics.
  */
 static char *aux_statistics[] = {
@@ -226,14 +235,15 @@
 }
 
 /*
- * Software checksum calculation and insertion for an arbitrary packet.
+ * Calculate and insert the transport checksum for an arbitrary packet.
  */
-/*ARGSUSED*/
 static mblk_t *
 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
 {
+	_NOTE(ARGUNUSED(xnbp));
+
 	/*
-	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
+	 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
 	 * because it doesn't cover all of the interesting cases :-(
 	 */
 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
@@ -254,7 +264,7 @@
 
 	/*
 	 * Check that the packet is contained in a single mblk.  In
-	 * the "from peer" path this is true today, but will change
+	 * the "from peer" path this is true today, but may change
 	 * when scatter gather support is added.  In the "to peer"
 	 * path we cannot be sure, but in most cases it will be true
 	 * (in the xnbo case the packet has come from a MAC device
@@ -393,7 +403,8 @@
 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
 {
 	xnb_t *xnbp;
-	char *xsname, mac[ETHERADDRL * 3];
+	char *xsname;
+	char cachename[32];
 
 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
 
@@ -404,18 +415,17 @@
 	xnbp->xnb_irq = B_FALSE;
 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
-	xnbp->xnb_cksum_offload = xnb_cksum_offload;
 	xnbp->xnb_connected = B_FALSE;
 	xnbp->xnb_hotplugged = B_FALSE;
 	xnbp->xnb_detachable = B_FALSE;
 	xnbp->xnb_peer = xvdi_get_oeid(dip);
-	xnbp->xnb_tx_pages_writable = B_FALSE;
-	xnbp->xnb_tx_always_copy = xnb_tx_always_copy;
+	xnbp->xnb_be_status = XNB_STATE_INIT;
+	xnbp->xnb_fe_status = XNB_STATE_INIT;
 
 	xnbp->xnb_tx_buf_count = 0;
-	xnbp->xnb_tx_unmop_count = 0;
 
-	xnbp->xnb_hv_copy = B_FALSE;
+	xnbp->xnb_rx_hv_copy = B_FALSE;
+	xnbp->xnb_multicast_control = B_FALSE;
 
 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
 	ASSERT(xnbp->xnb_rx_va != NULL);
@@ -424,18 +434,28 @@
 	    != DDI_SUCCESS)
 		goto failure;
 
-	/* allocated on demand, when/if we enter xnb_copy_to_peer() */
+	/* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
 	xnbp->xnb_rx_cpop = NULL;
-	xnbp->xnb_cpop_sz = 0;
+	xnbp->xnb_rx_cpop_count = 0;
 
 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
 	    xnbp->xnb_icookie);
 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
 	    xnbp->xnb_icookie);
+	mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
+	    xnbp->xnb_icookie);
 
-	/* set driver private pointer now */
+	/* Set driver private pointer now. */
 	ddi_set_driver_private(dip, xnbp);
 
+	(void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
+	xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
+	    sizeof (xnb_txbuf_t), 0,
+	    xnb_txbuf_constructor, xnb_txbuf_destructor,
+	    NULL, xnbp, NULL, 0);
+	if (xnbp->xnb_tx_buf_cache == NULL)
+		goto failure_0;
+
 	if (!xnb_ks_init(xnbp))
 		goto failure_1;
 
@@ -457,16 +477,12 @@
 	xsname = xvdi_get_xsname(dip);
 
 	if (xenbus_printf(XBT_NULL, xsname,
-	    "feature-no-csum-offload", "%d",
-	    xnbp->xnb_cksum_offload ? 0 : 1) != 0)
+	    "feature-multicast-control", "%d",
+	    xnb_multicast_control ? 1 : 0) != 0)
 		goto failure_3;
 
-	/*
-	 * Use global xnb_hv_copy to export this feature. This means that
-	 * we have to decide what to do before starting up a guest domain
-	 */
 	if (xenbus_printf(XBT_NULL, xsname,
-	    "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0)
+	    "feature-rx-copy", "%d",  1) != 0)
 		goto failure_3;
 	/*
 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
@@ -475,23 +491,8 @@
 	 * but we might as well play nice.
 	 */
 	if (xenbus_printf(XBT_NULL, xsname,
-	    "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0)
-		goto failure_3;
-
-	if (xenbus_scanf(XBT_NULL, xsname,
-	    "mac", "%s", mac) != 0) {
-		cmn_err(CE_WARN, "xnb_attach: "
-		    "cannot read mac address from %s",
-		    xsname);
+	    "feature-rx-flip", "%d", 0) != 0)
 		goto failure_3;
-	}
-
-	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
-		cmn_err(CE_WARN,
-		    "xnb_attach: cannot parse mac address %s",
-		    mac);
-		goto failure_3;
-	}
 
 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
 	(void) xvdi_post_event(dip, XEN_HP_ADD);
@@ -505,6 +506,10 @@
 	xnb_ks_free(xnbp);
 
 failure_1:
+	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
+
+failure_0:
+	mutex_destroy(&xnbp->xnb_state_lock);
 	mutex_destroy(&xnbp->xnb_rx_lock);
 	mutex_destroy(&xnbp->xnb_tx_lock);
 
@@ -514,7 +519,6 @@
 	return (DDI_FAILURE);
 }
 
-/*ARGSUSED*/
 void
 xnb_detach(dev_info_t *dip)
 {
@@ -530,14 +534,17 @@
 
 	xnb_ks_free(xnbp);
 
+	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
+
 	ddi_set_driver_private(dip, NULL);
 
+	mutex_destroy(&xnbp->xnb_state_lock);
+	mutex_destroy(&xnbp->xnb_rx_lock);
 	mutex_destroy(&xnbp->xnb_tx_lock);
-	mutex_destroy(&xnbp->xnb_rx_lock);
 
-	if (xnbp->xnb_cpop_sz > 0)
-		kmem_free(xnbp->xnb_rx_cpop, sizeof (*xnbp->xnb_rx_cpop)
-		    * xnbp->xnb_cpop_sz);
+	if (xnbp->xnb_rx_cpop_count > 0)
+		kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
+		    * xnbp->xnb_rx_cpop_count);
 
 	ASSERT(xnbp->xnb_rx_va != NULL);
 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
@@ -545,7 +552,12 @@
 	kmem_free(xnbp, sizeof (*xnbp));
 }
 
-
+/*
+ * Allocate a page from the hypervisor to be flipped to the peer.
+ *
+ * Try to get pages in batches to reduce the overhead of calls into
+ * the balloon driver.
+ */
 static mfn_t
 xnb_alloc_page(xnb_t *xnbp)
 {
@@ -591,10 +603,16 @@
 #undef WARNING_RATE_LIMIT
 }
 
-/*ARGSUSED*/
+/*
+ * Free a page back to the hypervisor.
+ *
+ * This happens only in the error path, so batching is not worth the
+ * complication.
+ */
 static void
 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
 {
+	_NOTE(ARGUNUSED(xnbp));
 	int r;
 	pfn_t pfn;
 
@@ -602,10 +620,6 @@
 	pfnzero(pfn, 0, PAGESIZE);
 	xen_release_pfn(pfn);
 
-	/*
-	 * This happens only in the error path, so batching is
-	 * not worth the complication.
-	 */
 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
@@ -614,8 +628,8 @@
 }
 
 /*
- * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but
- * using local variables.
+ * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using
+ * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
  */
 #define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
 	((((_r)->sring->req_prod - loop) <		\
@@ -623,6 +637,9 @@
 	    ((_r)->sring->req_prod - loop) :		\
 	    (RING_SIZE(_r) - (loop - prod)))
 
+/*
+ * Pass packets to the peer using page flipping.
+ */
 mblk_t *
 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
 {
@@ -835,66 +852,38 @@
 	return (mp);
 }
 
-/* helper functions for xnb_copy_to_peer */
+/* Helper functions for xnb_copy_to_peer(). */
 
 /*
  * Grow the array of copy operation descriptors.
- * Returns a pointer to the next available entry.
  */
-gnttab_copy_t *
-grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop)
+static boolean_t
+grow_cpop_area(xnb_t *xnbp)
 {
-	/*
-	 * o_cpop (arg.1) is a ptr to the area we would like to copy
-	 * something into but cannot, because we haven't alloc'ed it
-	 * yet, or NULL.
-	 * old_cpop and new_cpop (local) are pointers to old/new
-	 * versions of xnbp->xnb_rx_cpop.
-	 */
-	gnttab_copy_t	*new_cpop, *old_cpop, *ret_cpop;
-	size_t		newcount;
+	size_t count;
+	gnttab_copy_t *new;
 
 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
 
-	old_cpop = xnbp->xnb_rx_cpop;
-	/*
-	 * o_cpop is a pointer into the array pointed to by old_cpop;
-	 * it would be an error for exactly one of these pointers to be NULL.
-	 * We shouldn't call this function if xnb_rx_cpop has already
-	 * been allocated, but we're starting to fill it from the beginning
-	 * again.
-	 */
-	ASSERT((o_cpop == NULL && old_cpop == NULL) ||
-	    (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop));
+	count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;
 
-	newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT;
-
-	new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP);
-	if (new_cpop == NULL) {
+	if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
 		xnbp->xnb_stat_other_allocation_failure++;
-		return (NULL);
+		return (B_FALSE);
 	}
 
-	if (o_cpop != NULL) {
-		size_t	 offset = (o_cpop - old_cpop);
-
-		/* we only need to move the parts in use ... */
-		(void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz *
-		    (sizeof (*old_cpop)));
+	bcopy(xnbp->xnb_rx_cpop, new,
+	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
 
-		kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop));
+	kmem_free(xnbp->xnb_rx_cpop,
+	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
 
-		ret_cpop = new_cpop + offset;
-	} else {
-		ret_cpop = new_cpop;
-	}
-
-	xnbp->xnb_rx_cpop = new_cpop;
-	xnbp->xnb_cpop_sz = newcount;
+	xnbp->xnb_rx_cpop = new;
+	xnbp->xnb_rx_cpop_count = count;
 
 	xnbp->xnb_stat_rx_cpoparea_grown++;
 
-	return (ret_cpop);
+	return (B_TRUE);
 }
 
 /*
@@ -903,9 +892,9 @@
 static boolean_t
 is_foreign(void *addr)
 {
-	pfn_t	pfn = hat_getpfnum(kas.a_hat, addr);
+	pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
 
-	return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE);
+	return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
 }
 
 /*
@@ -965,17 +954,23 @@
 	gp->dest.domid = xnbp->xnb_peer;
 }
 
+/*
+ * Pass packets to the peer using hypervisor copy operations.
+ */
 mblk_t *
 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
 {
 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
 	mblk_t		*ml, *ml_prev;
-	gnttab_copy_t	*gop_cp;
 	boolean_t	notify;
 	RING_IDX	loop, prod;
 	int		i;
 
-	if (!xnbp->xnb_hv_copy)
+	/*
+	 * If the peer does not pre-post buffers for received packets,
+	 * use page flipping to pass packets to it.
+	 */
+	if (!xnbp->xnb_rx_hv_copy)
 		return (xnb_to_peer(xnbp, mp));
 
 	/*
@@ -989,13 +984,12 @@
 	 *
 	 * NOTE ad 2.
 	 *  In order to reduce the number of hypercalls, we prepare
-	 *  several packets (mp->b_cont != NULL) for the peer and
-	 *  perform a single hypercall to transfer them.
-	 *  We also have to set up a seperate copy operation for
-	 *  every page.
+	 *  several mblks (mp->b_cont != NULL) for the peer and
+	 *  perform a single hypercall to transfer them.  We also have
+	 *  to set up a separate copy operation for every page.
 	 *
-	 * If we have more than one message (mp->b_next != NULL),
-	 * we do this whole dance repeatedly.
+	 * If we have more than one packet (mp->b_next != NULL), we do
+	 * this whole dance repeatedly.
 	 */
 
 	mutex_enter(&xnbp->xnb_rx_lock);
@@ -1013,12 +1007,12 @@
 	while ((mp != NULL) &&
 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
 		netif_rx_request_t	*rxreq;
+		size_t			d_offset, len;
+		int			item_count;
+		gnttab_copy_t		*gop_cp;
 		netif_rx_response_t	*rxresp;
-		size_t			d_offset;
-		size_t			len;
 		uint16_t		cksum_flags;
 		int16_t			status = NETIF_RSP_OKAY;
-		int			item_count;
 
 		/* 1 */
 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
@@ -1038,8 +1032,9 @@
 		gop_cp = xnbp->xnb_rx_cpop;
 
 		/*
-		 * We walk the b_cont pointers and set up a gop_cp
-		 * structure for every page in every data block we have.
+		 * We walk the b_cont pointers and set up a
+		 * gnttab_copy_t for each sub-page chunk in each data
+		 * block.
 		 */
 		/* 2a */
 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
@@ -1048,8 +1043,13 @@
 			size_t	r_offset;
 
 			/*
-			 * If we get an mblk on a page that doesn't belong to
-			 * this domain, get a new mblk to replace the old one.
+			 * The hypervisor will not allow us to
+			 * reference a foreign page (e.g. one
+			 * belonging to another domain) by mfn in the
+			 * copy operation. If the data in this mblk is
+			 * on such a page we must copy the data into a
+			 * local page before initiating the hypervisor
+			 * copy operation.
 			 */
 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
 				mblk_t *ml_new = replace_msg(ml, chunk,
@@ -1080,15 +1080,14 @@
 			while (chunk > 0) {
 				size_t part_len;
 
-				item_count++;
-				if (item_count > xnbp->xnb_cpop_sz) {
-					gop_cp = grow_cpop_area(xnbp, gop_cp);
-					if (gop_cp == NULL)
+				if (item_count == xnbp->xnb_rx_cpop_count) {
+					if (!grow_cpop_area(xnbp))
 						goto failure;
+					gop_cp = &xnbp->xnb_rx_cpop[item_count];
 				}
 				/*
 				 * If our mblk crosses a page boundary, we need
-				 * to do a seperate copy for every page.
+				 * to do a separate copy for each page.
 				 */
 				if (r_offset + chunk > PAGESIZE) {
 					part_len = PAGESIZE - r_offset;
@@ -1116,8 +1115,10 @@
 				 */
 				r_offset = 0;
 				gop_cp++;
+				item_count++;
 			}
 			ml_prev = ml;
+
 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
 			    chunk, int, len, int, item_count);
 		}
@@ -1152,7 +1153,7 @@
 
 		for (i = 0; i < item_count; i++) {
 			if (xnbp->xnb_rx_cpop[i].status != 0) {
-				DTRACE_PROBE2(cpop__status__nonnull, int,
+				DTRACE_PROBE2(cpop_status_nonnull, int,
 				    (int)xnbp->xnb_rx_cpop[i].status,
 				    int, i);
 				status = NETIF_RSP_ERROR;
@@ -1213,54 +1214,9 @@
 	return (mp);
 }
 
-/*ARGSUSED*/
-static int
-xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
-{
-	xnb_txbuf_t *txp = buf;
-
-	bzero(txp, sizeof (*txp));
-
-	txp->xt_free_rtn.free_func = xnb_tx_complete;
-	txp->xt_free_rtn.free_arg = (caddr_t)txp;
-
-	txp->xt_mop.host_addr =
-	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
-	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
-	    VM_NOSLEEP : VM_SLEEP);
-
-	if (txp->xt_mop.host_addr == NULL) {
-		cmn_err(CE_WARN, "xnb_txbuf_constructor: "
-		    "cannot get address space");
-		return (-1);
-	}
-
-	/*
-	 * Have the hat ensure that page table exists for the VA.
-	 */
-	hat_prepare_mapping(kas.a_hat,
-	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr, NULL);
-
-	return (0);
-}
-
-/*ARGSUSED*/
-static void
-xnb_txbuf_destructor(void *buf, void *arg)
-{
-	xnb_txbuf_t *txp = buf;
-
-	ASSERT(txp->xt_mop.host_addr != NULL);
-	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == 0);
-
-	hat_release_mapping(kas.a_hat,
-	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr);
-	vmem_free(heap_arena,
-	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr, PAGESIZE);
-}
 
 static void
-xnb_tx_notify_peer(xnb_t *xnbp)
+xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
 {
 	boolean_t notify;
 
@@ -1268,7 +1224,7 @@
 
 	/* LINTED: constant in conditional context */
 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
-	if (notify) {
+	if (notify || force) {
 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
 		xnbp->xnb_stat_tx_notify_sent++;
 	} else {
@@ -1277,18 +1233,6 @@
 }
 
 static void
-xnb_tx_complete(xnb_txbuf_t *txp)
-{
-	xnb_t *xnbp = txp->xt_xnbp;
-
-	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == XNB_TXBUF_INUSE);
-
-	mutex_enter(&xnbp->xnb_tx_lock);
-	xnb_tx_schedule_unmop(xnbp, &txp->xt_mop, txp);
-	mutex_exit(&xnbp->xnb_tx_lock);
-}
-
-static void
 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
 {
 	RING_IDX i;
@@ -1311,185 +1255,105 @@
 }
 
 static void
-xnb_tx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop,
-    xnb_txbuf_t *txp)
+xnb_txbuf_recycle(xnb_txbuf_t *txp)
 {
-	gnttab_unmap_grant_ref_t	*unmop;
-	int				u_count;
-	int				reqs_on_ring;
+	xnb_t *xnbp = txp->xt_xnbp;
 
-	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
-	ASSERT(xnbp->xnb_tx_unmop_count < NET_TX_RING_SIZE);
+	kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
+
+	xnbp->xnb_tx_buf_outstanding--;
+}
 
-	u_count = xnbp->xnb_tx_unmop_count++;
-
-	/* Cache data for the time when we actually unmap grant refs */
-	xnbp->xnb_tx_unmop_txp[u_count] = txp;
+static int
+xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
+{
+	_NOTE(ARGUNUSED(kmflag));
+	xnb_txbuf_t *txp = buf;
+	xnb_t *xnbp = arg;
+	size_t len;
+	ddi_dma_cookie_t dma_cookie;
+	uint_t ncookies;
 
-	unmop = &xnbp->xnb_tx_unmop[u_count];
-	unmop->host_addr = mop->host_addr;
-	unmop->dev_bus_addr = mop->dev_bus_addr;
-	unmop->handle = mop->handle;
+	txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
+	txp->xt_free_rtn.free_arg = (caddr_t)txp;
+	txp->xt_xnbp = xnbp;
+	txp->xt_next = NULL;
+
+	if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
+	    0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
+		goto failure;
+
+	if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
+	    DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
+	    &txp->xt_acc_handle) != DDI_SUCCESS)
+		goto failure_1;
 
-	/*
-	 * We cannot check the ring once we're disconnected from it. Batching
-	 * doesn't seem to be a useful optimisation in this case either,
-	 * so we directly call into the actual unmap function.
-	 */
-	if (xnbp->xnb_connected) {
-		reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_tx_ring);
+	if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
+	    len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
+	    &dma_cookie, &ncookies)
+	    != DDI_DMA_MAPPED)
+		goto failure_2;
+	ASSERT(ncookies == 1);
+
+	txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
+	txp->xt_buflen = dma_cookie.dmac_size;
+
+	DTRACE_PROBE(txbuf_allocated);
+
+	atomic_add_32(&xnbp->xnb_tx_buf_count, 1);
+	xnbp->xnb_tx_buf_outstanding++;
+
+	return (0);
+
+failure_2:
+	ddi_dma_mem_free(&txp->xt_acc_handle);
 
-		/*
-		 * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch"
-		 * or (with N == 1) "immediate unmop" behaviour.
-		 * The "> xnb_unmop_lowwat" is a guard against ring exhaustion.
-		 */
-		if (xnbp->xnb_tx_unmop_count < xnb_unmop_hiwat &&
-		    reqs_on_ring > xnb_unmop_lowwat)
-			return;
-	}
+failure_1:
+	ddi_dma_free_handle(&txp->xt_dma_handle);
+
+failure:
+
+	return (-1);
+}
 
-	xnb_tx_perform_pending_unmop(xnbp);
+static void
+xnb_txbuf_destructor(void *buf, void *arg)
+{
+	xnb_txbuf_t *txp = buf;
+	xnb_t *xnbp = arg;
+
+	(void) ddi_dma_unbind_handle(txp->xt_dma_handle);
+	ddi_dma_mem_free(&txp->xt_acc_handle);
+	ddi_dma_free_handle(&txp->xt_dma_handle);
+
+	atomic_add_32(&xnbp->xnb_tx_buf_count, -1);
 }
 
 /*
- * Here we perform the actual unmapping of the data that was
- * accumulated in xnb_tx_schedule_unmop().
- * Note that it is the caller's responsibility to make sure that
- * there's actually something there to unmop.
+ * Take packets from the peer and deliver them onward.
  */
-static void
-xnb_tx_perform_pending_unmop(xnb_t *xnbp)
-{
-	RING_IDX loop;
-#ifdef XNB_DEBUG
-	gnttab_unmap_grant_ref_t *unmop;
-#endif /* XNB_DEBUG */
-
-	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
-	ASSERT(xnbp->xnb_tx_unmop_count > 0);
-
-	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
-	    xnbp->xnb_tx_unmop, xnbp->xnb_tx_unmop_count) < 0) {
-		cmn_err(CE_WARN, "xnb_tx_perform_pending_unmop: "
-		    "unmap grant operation failed, "
-		    "%d pages lost", xnbp->xnb_tx_unmop_count);
-	}
-
-#ifdef XNB_DEBUG
-	for (loop = 0, unmop = xnbp->xnb_tx_unmop;
-	    loop < xnbp->xnb_tx_unmop_count;
-	    loop++, unmop++) {
-		if (unmop->status != 0) {
-			cmn_err(CE_WARN, "xnb_tx_perform_pending_unmop: "
-			    "unmap grant reference failed (%d)",
-			    unmop->status);
-		}
-	}
-#endif /* XNB_DEBUG */
-
-	for (loop = 0; loop < xnbp->xnb_tx_unmop_count; loop++) {
-		xnb_txbuf_t	*txp = xnbp->xnb_tx_unmop_txp[loop];
-
-		if (txp == NULL)
-			cmn_err(CE_PANIC,
-			    "xnb_tx_perform_pending_unmop: "
-			    "unexpected NULL txp (loop %d; count %d)!",
-			    loop, xnbp->xnb_tx_unmop_count);
-
-		if (xnbp->xnb_connected)
-			xnb_tx_mark_complete(xnbp, txp->xt_id, txp->xt_status);
-		xnb_txbuf_put(xnbp, txp);
-	}
-	if (xnbp->xnb_connected)
-		xnb_tx_notify_peer(xnbp);
-
-	xnbp->xnb_tx_unmop_count = 0;
-
-#ifdef XNB_DEBUG
-	bzero(xnbp->xnb_tx_unmop, sizeof (xnbp->xnb_tx_unmop));
-	bzero(xnbp->xnb_tx_unmop_txp, sizeof (xnbp->xnb_tx_unmop_txp));
-#endif /* XNB_DEBUG */
-}
-
-static xnb_txbuf_t *
-xnb_txbuf_get(xnb_t *xnbp, int flags)
-{
-	xnb_txbuf_t *txp;
-
-	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
-
-	txp = kmem_cache_alloc(xnb_txbuf_cachep, flags);
-	if (txp != NULL) {
-		ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == 0);
-		txp->xt_flags |= XNB_TXBUF_INUSE;
-
-		txp->xt_xnbp = xnbp;
-		txp->xt_mop.dom = xnbp->xnb_peer;
-
-		txp->xt_mop.flags = GNTMAP_host_map;
-		if (!xnbp->xnb_tx_pages_writable)
-			txp->xt_mop.flags |= GNTMAP_readonly;
-
-		xnbp->xnb_tx_buf_count++;
-	}
-
-	return (txp);
-}
-
-static void
-xnb_txbuf_put(xnb_t *xnbp, xnb_txbuf_t *txp)
-{
-	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
-	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == XNB_TXBUF_INUSE);
-
-	txp->xt_flags &= ~XNB_TXBUF_INUSE;
-	xnbp->xnb_tx_buf_count--;
-
-	kmem_cache_free(xnb_txbuf_cachep, txp);
-}
-
 static mblk_t *
 xnb_from_peer(xnb_t *xnbp)
 {
 	RING_IDX start, end, loop;
-	gnttab_map_grant_ref_t *mop;
+	gnttab_copy_t *cop;
 	xnb_txbuf_t **txpp;
 	netif_tx_request_t *txreq;
-	boolean_t work_to_do;
+	boolean_t work_to_do, need_notify = B_FALSE;
 	mblk_t *head, *tail;
-	/*
-	 * If the peer granted a read-only mapping to the page then we
-	 * must copy the data, as the local protocol stack (should the
-	 * packet be destined for this host) will modify the packet
-	 * 'in place'.
-	 */
-	boolean_t copy = xnbp->xnb_tx_always_copy ||
-	    !xnbp->xnb_tx_pages_writable;
+	int n_data_req, i;
 
-	/*
-	 * For each individual request, the sequence of actions is:
-	 *
-	 * 1. get the request.
-	 * 2. map the page based on the grant ref.
-	 * 3. allocate an mblk, copy the data to it.
-	 * 4. release the grant.
-	 * 5. update the ring.
-	 * 6. pass the packet upward.
-	 * 7. kick the peer.
-	 *
-	 * In fact, we try to perform the grant operations in batches,
-	 * so there are two loops.
-	 */
+	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
 
 	head = tail = NULL;
 around:
-	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
 
 	/* LINTED: constant in conditional context */
 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
 	if (!work_to_do) {
 finished:
+		xnb_tx_notify_peer(xnbp, need_notify);
+
 		return (head);
 	}
 
@@ -1517,118 +1381,147 @@
 		goto around;
 	}
 
-	for (loop = start, mop = xnbp->xnb_tx_mop, txpp = xnbp->xnb_tx_bufp;
-	    loop != end;
-	    loop++, mop++, txpp++) {
-		xnb_txbuf_t *txp;
+	loop = start;
+	cop = xnbp->xnb_tx_cop;
+	txpp = xnbp->xnb_tx_bufp;
+	n_data_req = 0;
+
+	while (loop < end) {
+		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
+
+		if (txreq->flags & NETTXF_extra_info) {
+			struct netif_extra_info *erp;
+			boolean_t status;
+
+			loop++; /* Consume another slot in the ring. */
+			ASSERT(loop <= end);
+
+			erp = (struct netif_extra_info *)
+			    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
 
-		txp = xnb_txbuf_get(xnbp, KM_NOSLEEP);
-		if (txp == NULL)
-			break;
+			switch (erp->type) {
+			case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
+				ASSERT(xnbp->xnb_multicast_control);
+				status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
+				    &erp->u.mcast.addr);
+				break;
+			case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
+				ASSERT(xnbp->xnb_multicast_control);
+				status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
+				    &erp->u.mcast.addr);
+				break;
+			default:
+				status = B_FALSE;
+				cmn_err(CE_WARN, "xnb_from_peer: "
+				    "unknown extra type %d", erp->type);
+				break;
+			}
 
-		ASSERT(xnbp->xnb_tx_pages_writable ||
-		    ((txp->xt_mop.flags & GNTMAP_readonly)
-		    == GNTMAP_readonly));
+			xnb_tx_mark_complete(xnbp, txreq->id,
+			    status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
+			need_notify = B_TRUE;
+		} else {
+			xnb_txbuf_t *txp;
+
+			txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
+			    KM_NOSLEEP);
+			if (txp == NULL)
+				break;
+
+			txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
+			    txp->xt_buflen, 0, &txp->xt_free_rtn);
+			if (txp->xt_mblk == NULL) {
+				kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
+				break;
+			}
+
+			txp->xt_idx = loop;
+			txp->xt_id = txreq->id;
 
-		txp->xt_mop.ref =
-		    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref;
+			cop->source.u.ref = txreq->gref;
+			cop->source.domid = xnbp->xnb_peer;
+			cop->source.offset = txreq->offset;
+
+			cop->dest.u.gmfn = txp->xt_mfn;
+			cop->dest.domid = DOMID_SELF;
+			cop->dest.offset = 0;
 
-		*mop = txp->xt_mop;
-		*txpp = txp;
+			cop->len = txreq->size;
+			cop->flags = GNTCOPY_source_gref;
+			cop->status = 0;
+
+			*txpp = txp;
+
+			txpp++;
+			cop++;
+			n_data_req++;
+
+			ASSERT(n_data_req <= NET_TX_RING_SIZE);
+		}
+
+		loop++;
 	}
 
-	if ((loop - start) == 0)
-		goto finished;
+	xnbp->xnb_tx_ring.req_cons = loop;
 
-	end = loop;
+	if (n_data_req == 0)
+		goto around;
 
-	if (xen_map_gref(GNTTABOP_map_grant_ref, xnbp->xnb_tx_mop,
-	    end - start, B_FALSE) != 0) {
+	if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
+	    xnbp->xnb_tx_cop, n_data_req) != 0) {
 
-		cmn_err(CE_WARN, "xnb_from_peer: map grant operation failed");
-
-		loop = start;
-		txpp = xnbp->xnb_tx_bufp;
+		cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");
 
-		while (loop != end) {
-			xnb_txbuf_put(xnbp, *txpp);
-
-			loop++;
+		txpp = xnbp->xnb_tx_bufp;
+		i = n_data_req;
+		while (i > 0) {
+			kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
 			txpp++;
+			i--;
 		}
 
 		goto finished;
 	}
 
-	for (loop = start, mop = xnbp->xnb_tx_mop, txpp = xnbp->xnb_tx_bufp;
-	    loop != end;
-	    loop++, mop++, txpp++) {
-		mblk_t *mp = NULL;
-		int16_t status = NETIF_RSP_OKAY;
+	txpp = xnbp->xnb_tx_bufp;
+	cop = xnbp->xnb_tx_cop;
+	i = n_data_req;
+
+	while (i > 0) {
 		xnb_txbuf_t *txp = *txpp;
 
-		if (mop->status != 0) {
-			cmn_err(CE_WARN, "xnb_from_peer: "
-			    "failed to map buffer: %d",
-			    mop->status);
-			status = NETIF_RSP_ERROR;
-		}
-
-		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
+		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);
 
-		if (status == NETIF_RSP_OKAY) {
-			if (copy) {
-				mp = allocb(txreq->size, BPRI_MED);
-				if (mp == NULL) {
-					status = NETIF_RSP_ERROR;
-					xnbp->xnb_stat_tx_allocb_failed++;
-				} else {
-					bcopy((caddr_t)(uintptr_t)
-					    mop->host_addr + txreq->offset,
-					    mp->b_wptr, txreq->size);
-					mp->b_wptr += txreq->size;
-				}
-			} else {
-				mp = desballoc((uchar_t *)(uintptr_t)
-				    mop->host_addr + txreq->offset,
-				    txreq->size, 0, &txp->xt_free_rtn);
-				if (mp == NULL) {
-					status = NETIF_RSP_ERROR;
-					xnbp->xnb_stat_tx_allocb_failed++;
-				} else {
-					txp->xt_id = txreq->id;
-					txp->xt_status = status;
-					txp->xt_mop = *mop;
+		if (cop->status != 0) {
+#ifdef XNB_DEBUG
+			cmn_err(CE_WARN, "xnb_from_peer: "
+			    "txpp 0x%p failed (%d)",
+			    (void *)*txpp, cop->status);
+#endif /* XNB_DEBUG */
+			xnb_tx_mark_complete(xnbp, txp->xt_id, cop->status);
+			freemsg(txp->xt_mblk);
+		} else {
+			mblk_t *mp;
 
-					mp->b_wptr += txreq->size;
-				}
-			}
+			mp = txp->xt_mblk;
+			mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
+			mp->b_wptr += txreq->size;
+			mp->b_next = NULL;
 
 			/*
-			 * If we have a buffer and there are checksum
-			 * flags, process them appropriately.
+			 * If there are checksum flags, process them
+			 * appropriately.
 			 */
-			if ((mp != NULL) &&
-			    ((txreq->flags &
+			if ((txreq->flags &
 			    (NETTXF_csum_blank | NETTXF_data_validated))
-			    != 0)) {
+			    != 0) {
 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
 				    mp, txreq->flags);
 				xnbp->xnb_stat_tx_cksum_no_need++;
-			}
-		}
 
-		if (copy || (mp == NULL)) {
-			txp->xt_status = status;
-			txp->xt_id = txreq->id;
-			xnb_tx_schedule_unmop(xnbp, mop, txp);
-		}
+				txp->xt_mblk = mp;
+			}
 
-		if (mp != NULL) {
-			xnbp->xnb_stat_opackets++;
-			xnbp->xnb_stat_obytes += txreq->size;
-
-			mp->b_next = NULL;
 			if (head == NULL) {
 				ASSERT(tail == NULL);
 				head = mp;
@@ -1637,18 +1530,22 @@
 				tail->b_next = mp;
 			}
 			tail = mp;
+
+			xnbp->xnb_stat_opackets++;
+			xnbp->xnb_stat_obytes += txreq->size;
+
+			xnb_tx_mark_complete(xnbp, txp->xt_id, cop->status);
 		}
+
+		txpp++;
+		cop++;
+		i--;
 	}
 
-	xnbp->xnb_tx_ring.req_cons = loop;
-
 	goto around;
 	/* NOTREACHED */
 }
 
-/*
- *  intr() -- ring interrupt service routine
- */
 static uint_t
 xnb_intr(caddr_t arg)
 {
@@ -1683,52 +1580,142 @@
 	return (DDI_INTR_CLAIMED);
 }
 
+/*
+ * Read our configuration from xenstore.
+ */
+boolean_t
+xnb_read_xs_config(xnb_t *xnbp)
+{
+	char *xsname;
+	char mac[ETHERADDRL * 3];
+
+	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
+
+	if (xenbus_scanf(XBT_NULL, xsname,
+	    "mac", "%s", mac) != 0) {
+		cmn_err(CE_WARN, "xnb_attach: "
+		    "cannot read mac address from %s",
+		    xsname);
+		return (B_FALSE);
+	}
+
+	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
+		cmn_err(CE_WARN,
+		    "xnb_attach: cannot parse mac address %s",
+		    mac);
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Read the configuration of the peer from xenstore.
+ */
+boolean_t
+xnb_read_oe_config(xnb_t *xnbp)
+{
+	char *oename;
+	int i;
+
+	oename = xvdi_get_oename(xnbp->xnb_devinfo);
+
+	if (xenbus_gather(XBT_NULL, oename,
+	    "event-channel", "%u", &xnbp->xnb_fe_evtchn,
+	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
+	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
+	    NULL) != 0) {
+		cmn_err(CE_WARN, "xnb_read_oe_config: "
+		    "cannot read other-end details from %s",
+		    oename);
+		return (B_FALSE);
+	}
+
+	/*
+	 * Check whether our peer requests receive side hypervisor
+	 * copy.
+	 */
+	if (xenbus_scanf(XBT_NULL, oename,
+	    "request-rx-copy", "%d", &i) != 0)
+		i = 0;
+	if (i != 0)
+		xnbp->xnb_rx_hv_copy = B_TRUE;
+
+	/*
+	 * Check whether our peer requests multicast_control.
+	 */
+	if (xenbus_scanf(XBT_NULL, oename,
+	    "request-multicast-control", "%d", &i) != 0)
+		i = 0;
+	if (i != 0)
+		xnbp->xnb_multicast_control = B_TRUE;
+
+	/*
+	 * The Linux backend driver here checks to see if the peer has
+	 * set 'feature-no-csum-offload'. This is used to indicate
+	 * that the guest cannot handle receiving packets without a
+	 * valid checksum. We don't check here, because packets passed
+	 * to the peer _always_ have a valid checksum.
+	 *
+	 * There are three cases:
+	 *
+	 * - the NIC is dedicated: packets from the wire should always
+	 *   have a valid checksum. If the hardware validates the
+	 *   checksum then the relevant bit will be set in the packet
+	 *   attributes and we will inform the peer. It can choose to
+	 *   ignore the hardware verification.
+	 *
+	 * - the NIC is shared (VNIC) and a packet originates from the
+	 *   wire: this is the same as the case above - the packets
+	 *   will have a valid checksum.
+	 *
+	 * - the NIC is shared (VNIC) and a packet originates from the
+	 *   host: the MAC layer ensures that all such packets have a
+	 *   valid checksum by calculating one if the stack did not.
+	 */
+
+	return (B_TRUE);
+}
+
+void
+xnb_start_connect(xnb_t *xnbp)
+{
+	dev_info_t  *dip = xnbp->xnb_devinfo;
+
+	if (!xnb_connect_rings(dip)) {
+		cmn_err(CE_WARN, "xnb_start_connect: "
+		    "cannot connect rings");
+		goto failed;
+	}
+
+	if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
+		cmn_err(CE_WARN, "xnb_start_connect: "
+		    "flavour failed to connect");
+		goto failed;
+	}
+
+	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
+	return;
+
+failed:
+	xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
+	xnb_disconnect_rings(dip);
+	(void) xvdi_switch_state(dip, XBT_NULL,
+	    XenbusStateClosed);
+	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
+}
+
 static boolean_t
 xnb_connect_rings(dev_info_t *dip)
 {
 	xnb_t *xnbp = ddi_get_driver_private(dip);
-	char *oename;
 	struct gnttab_map_grant_ref map_op;
-	evtchn_port_t evtchn;
-	int i;
 
 	/*
 	 * Cannot attempt to connect the rings if already connected.
 	 */
 	ASSERT(!xnbp->xnb_connected);
 
-	oename = xvdi_get_oename(dip);
-
-	if (xenbus_gather(XBT_NULL, oename,
-	    "event-channel", "%u", &evtchn,
-	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
-	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
-	    NULL) != 0) {
-		cmn_err(CE_WARN, "xnb_connect_rings: "
-		    "cannot read other-end details from %s",
-		    oename);
-		goto fail;
-	}
-
-	if (xenbus_scanf(XBT_NULL, oename,
-	    "feature-tx-writable", "%d", &i) != 0)
-		i = 0;
-	if (i != 0)
-		xnbp->xnb_tx_pages_writable = B_TRUE;
-
-	if (xenbus_scanf(XBT_NULL, oename,
-	    "feature-no-csum-offload", "%d", &i) != 0)
-		i = 0;
-	if ((i == 1) || !xnbp->xnb_cksum_offload)
-		xnbp->xnb_cksum_offload = B_FALSE;
-
-	/* Check whether our peer knows and requests hypervisor copy */
-	if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i)
-	    != 0)
-		i = 0;
-	if (i != 0)
-		xnbp->xnb_hv_copy = B_TRUE;
-
 	/*
 	 * 1. allocate a vaddr for the tx page, one for the rx page.
 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
@@ -1736,8 +1723,7 @@
 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
 	 *    bound to this domain.
 	 * 4. associate the event channel with an interrupt.
-	 * 5. declare ourselves connected.
-	 * 6. enable the interrupt.
+	 * 5. enable the interrupt.
 	 */
 
 	/* 1.tx */
@@ -1785,7 +1771,7 @@
 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
 
 	/* 3 */
-	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
+	if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
 		cmn_err(CE_WARN, "xnb_connect_rings: "
 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
 		xnbp->xnb_evtchn = INVALID_EVTCHN;
@@ -1802,13 +1788,12 @@
 	mutex_enter(&xnbp->xnb_tx_lock);
 	mutex_enter(&xnbp->xnb_rx_lock);
 
-	/* 5.1 */
 	xnbp->xnb_connected = B_TRUE;
 
 	mutex_exit(&xnbp->xnb_rx_lock);
 	mutex_exit(&xnbp->xnb_tx_lock);
 
-	/* 4, 6 */
+	/* 4, 5 */
 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
 	    != DDI_SUCCESS) {
 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
@@ -1816,9 +1801,6 @@
 	}
 	xnbp->xnb_irq = B_TRUE;
 
-	/* 5.2 */
-	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
-
 	return (B_TRUE);
 
 fail:
@@ -1826,6 +1808,7 @@
 	mutex_enter(&xnbp->xnb_rx_lock);
 
 	xnbp->xnb_connected = B_FALSE;
+
 	mutex_exit(&xnbp->xnb_rx_lock);
 	mutex_exit(&xnbp->xnb_tx_lock);
 
@@ -1842,9 +1825,6 @@
 		xnbp->xnb_irq = B_FALSE;
 	}
 
-	if (xnbp->xnb_tx_unmop_count > 0)
-		xnb_tx_perform_pending_unmop(xnbp);
-
 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
 		xvdi_free_evtchn(dip);
 		xnbp->xnb_evtchn = INVALID_EVTCHN;
@@ -1895,11 +1875,11 @@
 	}
 }
 
-/*ARGSUSED*/
 static void
 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
     void *arg, void *impl_data)
 {
+	_NOTE(ARGUNUSED(id, arg));
 	xnb_t *xnbp = ddi_get_driver_private(dip);
 	XenbusState new_state = *(XenbusState *)impl_data;
 
@@ -1911,16 +1891,24 @@
 		if (xnbp->xnb_connected)
 			return;
 
-		if (xnb_connect_rings(dip)) {
-			xnbp->xnb_flavour->xf_peer_connected(xnbp);
-		} else {
-			xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
-			xnb_disconnect_rings(dip);
+		if (!xnb_read_oe_config(xnbp) ||
+		    !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
+			cmn_err(CE_WARN, "xnb_oe_state_change: "
+			    "read otherend config error");
 			(void) xvdi_switch_state(dip, XBT_NULL,
 			    XenbusStateClosed);
 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
+
+			break;
 		}
 
+
+		mutex_enter(&xnbp->xnb_state_lock);
+		xnbp->xnb_fe_status = XNB_STATE_READY;
+		if (xnbp->xnb_be_status == XNB_STATE_READY)
+			xnb_start_connect(xnbp);
+		mutex_exit(&xnbp->xnb_state_lock);
+
 		/*
 		 * Now that we've attempted to connect it's reasonable
 		 * to allow an attempt to detach.
@@ -1964,33 +1952,42 @@
 	}
 }
 
-/*ARGSUSED*/
 static void
 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
     void *arg, void *impl_data)
 {
+	_NOTE(ARGUNUSED(id, arg));
 	xnb_t *xnbp = ddi_get_driver_private(dip);
 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
-	boolean_t success;
 
 	ASSERT(xnbp != NULL);
 
 	switch (state) {
 	case Connected:
-
 		/* spurious hotplug event */
 		if (xnbp->xnb_hotplugged)
-			return;
+			break;
 
-		success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp);
+		if (!xnb_read_xs_config(xnbp))
+			break;
+
+		if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
+			break;
 
 		mutex_enter(&xnbp->xnb_tx_lock);
 		mutex_enter(&xnbp->xnb_rx_lock);
 
-		xnbp->xnb_hotplugged = success;
+		xnbp->xnb_hotplugged = B_TRUE;
 
 		mutex_exit(&xnbp->xnb_rx_lock);
 		mutex_exit(&xnbp->xnb_tx_lock);
+
+		mutex_enter(&xnbp->xnb_state_lock);
+		xnbp->xnb_be_status = XNB_STATE_READY;
+		if (xnbp->xnb_fe_status == XNB_STATE_READY)
+			xnb_start_connect(xnbp);
+		mutex_exit(&xnbp->xnb_state_lock);
+
 		break;
 
 	default:
@@ -2013,16 +2010,10 @@
 
 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
 
-	xnb_txbuf_cachep = kmem_cache_create("xnb_txbuf_cachep",
-	    sizeof (xnb_txbuf_t), 0, xnb_txbuf_constructor,
-	    xnb_txbuf_destructor, NULL, NULL, NULL, 0);
-	ASSERT(xnb_txbuf_cachep != NULL);
+	i = mod_install(&modlinkage);
+	if (i != DDI_SUCCESS)
+		mutex_destroy(&xnb_alloc_page_lock);
 
-	i = mod_install(&modlinkage);
-	if (i != DDI_SUCCESS) {
-		kmem_cache_destroy(xnb_txbuf_cachep);
-		mutex_destroy(&xnb_alloc_page_lock);
-	}
 	return (i);
 }
 
@@ -2038,9 +2029,8 @@
 	int i;
 
 	i = mod_remove(&modlinkage);
-	if (i == DDI_SUCCESS) {
-		kmem_cache_destroy(xnb_txbuf_cachep);
+	if (i == DDI_SUCCESS)
 		mutex_destroy(&xnb_alloc_page_lock);
-	}
+
 	return (i);
 }
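
Connection establishment also changes above: the frontend's xenbus state
transition (xnb_oe_state_change()) and completion of the dom0 hotplug script
(xnb_hp_state_change()) may now arrive in either order, and the rings are
connected only when both have been seen. A condensed sketch of that handshake
(xnb_mark_ready() is a hypothetical name, not a function in this changeset):

/*
 * Each event path marks its own side ready under xnb_state_lock; the
 * side that becomes ready second performs the connect.
 */
static void
xnb_mark_ready(xnb_t *xnbp, boolean_t frontend)
{
	mutex_enter(&xnbp->xnb_state_lock);
	if (frontend)
		xnbp->xnb_fe_status = XNB_STATE_READY;
	else
		xnbp->xnb_be_status = XNB_STATE_READY;
	if (xnbp->xnb_fe_status == XNB_STATE_READY &&
	    xnbp->xnb_be_status == XNB_STATE_READY)
		xnb_start_connect(xnbp);
	mutex_exit(&xnbp->xnb_state_lock);
}
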
--- a/usr/src/uts/common/xen/io/xnb.h	Wed Nov 04 21:40:43 2009 -0800
+++ b/usr/src/uts/common/xen/io/xnb.h	Thu Nov 05 01:05:36 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  *
  * xnb.h - definitions for Xen dom0 network driver
@@ -54,6 +54,9 @@
 #define	XNBRING		0x20
 #define	XNBCKSUM	0x40
 
+#define	XNB_STATE_INIT	0x01
+#define	XNB_STATE_READY	0x02
+
 typedef struct xnb xnb_t;
 
 /*
@@ -70,19 +73,32 @@
  */
 typedef struct xnb_flavour {
 	void		(*xf_from_peer)(xnb_t *, mblk_t *);
-	void		(*xf_peer_connected)(xnb_t *);
+	boolean_t	(*xf_peer_connected)(xnb_t *);
 	void		(*xf_peer_disconnected)(xnb_t *);
 	boolean_t	(*xf_hotplug_connected)(xnb_t *);
+	boolean_t	(*xf_start_connect)(xnb_t *);
 	mblk_t		*(*xf_cksum_from_peer)(xnb_t *, mblk_t *, uint16_t);
 	uint16_t	(*xf_cksum_to_peer)(xnb_t *, mblk_t *);
+	boolean_t	(*xf_mcast_add)(xnb_t *, ether_addr_t *);
+	boolean_t	(*xf_mcast_del)(xnb_t *, ether_addr_t *);
 } xnb_flavour_t;
 
 typedef struct xnb_txbuf {
 	frtn_t			xt_free_rtn;
 	xnb_t			*xt_xnbp;
-	gnttab_map_grant_ref_t	xt_mop;
+	struct xnb_txbuf	*xt_next;
 	RING_IDX		xt_id;
+	RING_IDX		xt_idx;
 	uint16_t		xt_status;
+
+	ddi_dma_handle_t	xt_dma_handle;
+	ddi_acc_handle_t	xt_acc_handle;
+	caddr_t			xt_buf;
+	size_t			xt_buflen;
+	mfn_t			xt_mfn;
+
+	mblk_t			*xt_mblk;
+
 	unsigned int		xt_flags;
 
 #define	XNB_TXBUF_INUSE	0x01
@@ -140,17 +156,18 @@
 
 	kstat_t			*xnb_kstat_aux;
 
-	boolean_t		xnb_cksum_offload;
-
 	ddi_iblock_cookie_t	xnb_icookie;
 
 	kmutex_t		xnb_rx_lock;
 	kmutex_t		xnb_tx_lock;
+	kmutex_t		xnb_state_lock;
 
-	int			xnb_tx_unmop_count;
-	int			xnb_tx_buf_count;
-	boolean_t		xnb_tx_pages_writable;
-	boolean_t		xnb_tx_always_copy;
+	int			xnb_be_status;
+	int			xnb_fe_status;
+
+	kmem_cache_t		*xnb_tx_buf_cache;
+	uint32_t		xnb_tx_buf_count;
+	int			xnb_tx_buf_outstanding;
 
 	netif_rx_back_ring_t	xnb_rx_ring;	/* rx interface struct ptr */
 	void			*xnb_rx_ring_addr;
@@ -166,22 +183,22 @@
 	boolean_t		xnb_hotplugged;
 	boolean_t		xnb_detachable;
 	int			xnb_evtchn;	/* channel to front end */
+	evtchn_port_t		xnb_fe_evtchn;
 	domid_t			xnb_peer;
 
-	xnb_txbuf_t			*xnb_tx_bufp[NET_TX_RING_SIZE];
-	gnttab_map_grant_ref_t		xnb_tx_mop[NET_TX_RING_SIZE];
-	gnttab_unmap_grant_ref_t	xnb_tx_unmop[NET_TX_RING_SIZE];
-
-	/* store information for unmop */
-	xnb_txbuf_t		*xnb_tx_unmop_txp[NET_TX_RING_SIZE];
+	xnb_txbuf_t		*xnb_tx_bufp[NET_TX_RING_SIZE];
+	gnttab_copy_t		xnb_tx_cop[NET_TX_RING_SIZE];
 
 	caddr_t			xnb_rx_va;
 	gnttab_transfer_t	xnb_rx_top[NET_RX_RING_SIZE];
 
-	boolean_t		xnb_hv_copy;	/* do we do hypervisor copy? */
+	boolean_t		xnb_rx_hv_copy;
+	boolean_t		xnb_multicast_control;
+	boolean_t		xnb_no_csum_offload;
+
 	gnttab_copy_t		*xnb_rx_cpop;
 #define	CPOP_DEFCNT 	8
-	size_t			xnb_cpop_sz; 	/* in elements, not bytes */
+	size_t			xnb_rx_cpop_count; 	/* in elements */
 };
 
 extern int xnb_attach(dev_info_t *, xnb_flavour_t *, void *);
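
Every flavour must now populate the widened xnb_flavour_t, including the
multicast callbacks invoked from xnb_from_peer(). A flavour with no interest
in multicast filtering could satisfy them with stubs along these lines
(hypothetical code, not part of this changeset); returning B_FALSE causes
xnb_from_peer() to answer the request with NETIF_RSP_ERROR:

/* Hypothetical stubs for a flavour that does no multicast filtering. */
static boolean_t
xnbX_mcast_add(xnb_t *xnbp, ether_addr_t *addr)
{
	_NOTE(ARGUNUSED(xnbp, addr));
	return (B_FALSE);
}

static boolean_t
xnbX_mcast_del(xnb_t *xnbp, ether_addr_t *addr)
{
	_NOTE(ARGUNUSED(xnbp, addr));
	return (B_FALSE);
}
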
--- a/usr/src/uts/common/xen/io/xnbo.c	Wed Nov 04 21:40:43 2009 -0800
+++ b/usr/src/uts/common/xen/io/xnbo.c	Thu Nov 05 01:05:36 2009 -0800
@@ -46,7 +46,16 @@
 #include <sys/pattr.h>
 #include <xen/sys/xenbus_impl.h>
 #include <xen/sys/xendev.h>
+#include <sys/sdt.h>
+#include <sys/note.h>
 
+/* Track multicast addresses. */
+typedef struct xmca {
+	struct xmca *next;
+	ether_addr_t addr;
+} xmca_t;
+
+/* State about this device instance. */
 typedef struct xnbo {
 	mac_handle_t		o_mh;
 	mac_client_handle_t	o_mch;
@@ -55,9 +64,14 @@
 	boolean_t		o_running;
 	boolean_t		o_promiscuous;
 	uint32_t		o_hcksum_capab;
+	xmca_t			*o_mca;
+	char			o_link_name[LIFNAMSIZ];
+	boolean_t		o_need_rx_filter;
+	boolean_t		o_need_setphysaddr;
+	boolean_t		o_multicast_control;
 } xnbo_t;
 
-static void xnbo_close_mac(xnbo_t *);
+static void xnbo_close_mac(xnb_t *);
 
 /*
  * Packets from the peer come here.  We pass them to the mac device.
@@ -85,6 +99,10 @@
 	freemsgchain(mp);
 }
 
+/*
+ * Process the checksum flags `flags' provided by the peer for the
+ * packet `mp'.
+ */
 static mblk_t *
 xnbo_cksum_from_peer(xnb_t *xnbp, mblk_t *mp, uint16_t flags)
 {
@@ -94,11 +112,6 @@
 
 	if ((flags & NETTXF_csum_blank) != 0) {
 		/*
-		 * It would be nice to ASSERT that xnbp->xnb_cksum_offload
-		 * is TRUE here, but some peers insist on assuming
-		 * that it is available even when they have been told
-		 * otherwise.
-		 *
 		 * The checksum in the packet is blank.  Determine
 		 * whether we can do hardware offload and, if so,
 		 * update the flags on the mblk according.  If not,
@@ -111,10 +124,16 @@
 	return (mp);
 }
 
+/*
+ * Calculate the checksum flags to be relayed to the peer for the
+ * packet `mp'.
+ */
 static uint16_t
 xnbo_cksum_to_peer(xnb_t *xnbp, mblk_t *mp)
 {
+	_NOTE(ARGUNUSED(xnbp));
 	uint16_t r = 0;
+	uint32_t pflags, csum;
 
 	/*
 	 * We might also check for HCK_PARTIALCKSUM here and,
@@ -126,29 +145,24 @@
 	 * capabilities tend to use HCK_FULLCKSUM on the receive side
 	 * - they are actually saying that in the output path the
 	 * caller must use HCK_PARTIALCKSUM.
+	 *
+	 * Then again, if a NIC supports HCK_PARTIALCKSUM in its
+	 * output path, the host IP stack will use it. If such packets
+	 * are destined for the peer (i.e. looped around) we would
+	 * gain some advantage.
 	 */
 
-	if (xnbp->xnb_cksum_offload) {
-		uint32_t pflags, csum;
+	hcksum_retrieve(mp, NULL, NULL, NULL, NULL,
+	    NULL, &csum, &pflags);
 
-		/*
-		 * XXPV dme: Pull in improved hcksum_retrieve() from
-		 * Crossbow, which gives back the csum in the seventh
-		 * argument for HCK_FULLCKSUM.
-		 */
-		hcksum_retrieve(mp, NULL, NULL, NULL, NULL,
-		    NULL, NULL, &pflags);
-		csum = DB_CKSUM16(mp);
-
-		/*
-		 * If the MAC driver has asserted that the checksum is
-		 * good, let the peer know.
-		 */
-		if (((pflags & HCK_FULLCKSUM) != 0) &&
-		    (((pflags & HCK_FULLCKSUM_OK) != 0) ||
-		    (csum == 0xffff)))
-			r |= NETRXF_data_validated;
-	}
+	/*
+	 * If the MAC driver has asserted that the checksum is
+	 * good, let the peer know.
+	 */
+	if (((pflags & HCK_FULLCKSUM) != 0) &&
+	    (((pflags & HCK_FULLCKSUM_OK) != 0) ||
+	    (csum == 0xffff)))
+		r |= NETRXF_data_validated;
 
 	return (r);
 }
@@ -174,11 +188,11 @@
  * the destination mac address matches or it's a multicast/broadcast
  * address.
  */
-/*ARGSUSED*/
 static void
 xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
     boolean_t loopback)
 {
+	_NOTE(ARGUNUSED(loopback));
 	xnb_t *xnbp = arg;
 	xnbo_t *xnbop = xnbp->xnb_flavour_data;
 	mblk_t *next, *keep, *keep_head, *free, *free_head;
@@ -230,16 +244,13 @@
 xnbo_open_mac(xnb_t *xnbp, char *mac)
 {
 	xnbo_t *xnbop = xnbp->xnb_flavour_data;
-	int err, need_rx_filter, need_setphysaddr, need_promiscuous;
+	int err;
 	const mac_info_t *mi;
-	char *xsname;
 	void (*rx_fn)(void *, mac_resource_handle_t, mblk_t *, boolean_t);
 	struct ether_addr ea;
 	uint_t max_sdu;
 	mac_diag_t diag;
 
-	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
-
 	if ((err = mac_open_by_linkname(mac, &xnbop->o_mh)) != 0) {
 		cmn_err(CE_WARN, "xnbo_open_mac: "
 		    "cannot open mac for link %s (%d)", mac, err);
@@ -253,14 +264,14 @@
 	if (mi->mi_media != DL_ETHER) {
 		cmn_err(CE_WARN, "xnbo_open_mac: "
 		    "device is not DL_ETHER (%d)", mi->mi_media);
-		xnbo_close_mac(xnbop);
+		xnbo_close_mac(xnbp);
 		return (B_FALSE);
 	}
 	if (mi->mi_media != mi->mi_nativemedia) {
 		cmn_err(CE_WARN, "xnbo_open_mac: "
 		    "device media and native media mismatch (%d != %d)",
 		    mi->mi_media, mi->mi_nativemedia);
-		xnbo_close_mac(xnbop);
+		xnbo_close_mac(xnbp);
 		return (B_FALSE);
 	}
 
@@ -268,7 +279,7 @@
 	if (max_sdu > XNBMAXPKT) {
 		cmn_err(CE_WARN, "xnbo_open_mac: mac device SDU too big (%d)",
 		    max_sdu);
-		xnbo_close_mac(xnbop);
+		xnbo_close_mac(xnbp);
 		return (B_FALSE);
 	}
 
@@ -286,40 +297,25 @@
 	    MAC_OPEN_FLAGS_MULTI_PRIMARY) != 0) {
 		cmn_err(CE_WARN, "xnbo_open_mac: "
 		    "error (%d) opening mac client", err);
-		xnbo_close_mac(xnbop);
+		xnbo_close_mac(xnbp);
 		return (B_FALSE);
 	}
 
-	/*
-	 * Should the receive path filter packets from the downstream
-	 * NIC before passing them to the peer? The default is "no".
-	 */
-	if (xenbus_scanf(XBT_NULL, xsname,
-	    "SUNW-need-rx-filter", "%d", &need_rx_filter) != 0)
-		need_rx_filter = 0;
-	if (need_rx_filter > 0)
+	if (xnbop->o_need_rx_filter)
 		rx_fn = xnbo_from_mac_filter;
 	else
 		rx_fn = xnbo_from_mac;
 
-	/*
-	 * Should we set the underlying NIC into promiscuous mode? The
-	 * default is "no".
-	 */
-	if (xenbus_scanf(XBT_NULL, xsname,
-	    "SUNW-need-promiscuous", "%d", &need_promiscuous) != 0) {
-		need_promiscuous = 0;
-	}
 	err = mac_unicast_add_set_rx(xnbop->o_mch, NULL, MAC_UNICAST_PRIMARY,
-	    &xnbop->o_mah, 0, &diag, need_promiscuous == 0 ? rx_fn :
-	    NULL, xnbp);
+	    &xnbop->o_mah, 0, &diag, xnbop->o_multicast_control ? rx_fn : NULL,
+	    xnbp);
 	if (err != 0) {
 		cmn_err(CE_WARN, "xnbo_open_mac: failed to get the primary "
 		    "MAC address of %s: %d", mac, err);
-		xnbo_close_mac(xnbop);
+		xnbo_close_mac(xnbp);
 		return (B_FALSE);
 	}
-	if (need_promiscuous != 0) {
+	if (!xnbop->o_multicast_control) {
 		err = mac_promisc_add(xnbop->o_mch, MAC_CLIENT_PROMISC_ALL,
 		    rx_fn, xnbp, &xnbop->o_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP |
 		    MAC_PROMISC_FLAGS_VLAN_TAG_STRIP);
@@ -327,24 +323,13 @@
 			cmn_err(CE_WARN, "xnbo_open_mac: "
 			    "cannot enable promiscuous mode of %s: %d",
 			    mac, err);
-			xnbo_close_mac(xnbop);
+			xnbo_close_mac(xnbp);
 			return (B_FALSE);
 		}
 		xnbop->o_promiscuous = B_TRUE;
 	}
 
-	if (!mac_capab_get(xnbop->o_mh, MAC_CAPAB_HCKSUM,
-	    &xnbop->o_hcksum_capab))
-		xnbop->o_hcksum_capab = 0;
-
-	/*
-	 * Should we set the physical address of the underlying NIC
-	 * to match that assigned to the peer? The default is "no".
-	 */
-	if (xenbus_scanf(XBT_NULL, xsname,
-	    "SUNW-need-set-physaddr", "%d", &need_setphysaddr) != 0)
-		need_setphysaddr = 0;
-	if (need_setphysaddr > 0) {
+	if (xnbop->o_need_setphysaddr) {
 		err = mac_unicast_primary_set(xnbop->o_mh, xnbp->xnb_mac_addr);
 		/* Warn, but continue on. */
 		if (err != 0) {
@@ -356,41 +341,42 @@
 		}
 	}
 
+	if (!mac_capab_get(xnbop->o_mh, MAC_CAPAB_HCKSUM,
+	    &xnbop->o_hcksum_capab))
+		xnbop->o_hcksum_capab = 0;
+
 	xnbop->o_running = B_TRUE;
 
 	return (B_TRUE);
 }
 
-/*
- * xnb calls back here when the user-level hotplug code reports that
- * the hotplug has successfully completed. For this flavour that means
- * that the underlying MAC device that we will use is ready to be
- * opened.
- */
-static boolean_t
-xnbo_hotplug(xnb_t *xnbp)
+static void
+xnbo_close_mac(xnb_t *xnbp)
 {
-	char *xsname;
-	char mac[LIFNAMSIZ];
+	xnbo_t *xnbop = xnbp->xnb_flavour_data;
+	xmca_t *loop;
 
-	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
-	if (xenbus_scanf(XBT_NULL, xsname, "nic", "%s", mac) != 0) {
-		cmn_err(CE_WARN, "xnbo_hotplug: "
-		    "cannot read nic name from %s", xsname);
-		return (B_FALSE);
-	}
-
-	return (xnbo_open_mac(xnbp, mac));
-}
-
-static void
-xnbo_close_mac(xnbo_t *xnbop)
-{
 	if (xnbop->o_mh == NULL)
 		return;
 
-	if (xnbop->o_running) {
+	if (xnbop->o_running)
 		xnbop->o_running = B_FALSE;
+
+	mutex_enter(&xnbp->xnb_state_lock);
+	loop = xnbop->o_mca;
+	xnbop->o_mca = NULL;
+	mutex_exit(&xnbp->xnb_state_lock);
+
+	while (loop != NULL) {
+		xmca_t *next = loop->next;
+
+		DTRACE_PROBE3(mcast_remove,
+		    (char *), "close",
+		    (void *), xnbp,
+		    (etheraddr_t *), loop->addr);
+		(void) mac_multicast_remove(xnbop->o_mch, loop->addr);
+		kmem_free(loop, sizeof (*loop));
+		loop = next;
 	}
 
 	if (xnbop->o_promiscuous) {
@@ -419,32 +405,194 @@
 }
 
 /*
- * xnb calls back here when we successfully synchronize with the
- * driver in the guest domain. In this flavour there is nothing to do as
- * we open the underlying MAC device on successful hotplug completion.
+ * Hotplug has completed and we are connected to the peer. We have all
+ * the information we need to exchange traffic, so open the MAC device
+ * and configure it appropriately.
+ */
+static boolean_t
+xnbo_start_connect(xnb_t *xnbp)
+{
+	xnbo_t *xnbop = xnbp->xnb_flavour_data;
+
+	return (xnbo_open_mac(xnbp, xnbop->o_link_name));
+}
+
+/*
+ * The guest has successfully synchronized with this instance. We read
+ * the configuration of the guest from xenstore to check whether the
+ * guest requests multicast control. If not (the default), we make a
+ * note that the MAC device needs to be used in promiscuous mode.
+ */
+static boolean_t
+xnbo_peer_connected(xnb_t *xnbp)
+{
+	char *oename;
+	int request;
+	xnbo_t *xnbop = xnbp->xnb_flavour_data;
+
+	oename = xvdi_get_oename(xnbp->xnb_devinfo);
+
+	if (xenbus_scanf(XBT_NULL, oename,
+	    "request-multicast-control", "%d", &request) != 0)
+		request = 0;
+	xnbop->o_multicast_control = (request > 0);
+
+	return (B_TRUE);
+}
+
+/*
+ * The guest domain has closed down the inter-domain connection. We
+ * close the underlying MAC device.
  */
-/*ARGSUSED*/
 static void
-xnbo_connected(xnb_t *xnbp)
+xnbo_peer_disconnected(xnb_t *xnbp)
+{
+	xnbo_close_mac(xnbp);
+}
+
+/*
+ * The hotplug script has completed. We read information from xenstore
+ * about our configuration, most notably the name of the MAC device we
+ * should use.
+ */
+static boolean_t
+xnbo_hotplug_connected(xnb_t *xnbp)
 {
+	char *xsname;
+	xnbo_t *xnbop = xnbp->xnb_flavour_data;
+	int need;
+
+	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
+
+	if (xenbus_scanf(XBT_NULL, xsname,
+	    "nic", "%s", xnbop->o_link_name) != 0) {
+		cmn_err(CE_WARN, "xnbo_connect: "
+		    "cannot read nic name from %s", xsname);
+		return (B_FALSE);
+	}
+
+	if (xenbus_scanf(XBT_NULL, xsname,
+	    "SUNW-need-rx-filter", "%d", &need) != 0)
+		need = 0;
+	xnbop->o_need_rx_filter = (need > 0);
+
+	if (xenbus_scanf(XBT_NULL, xsname,
+	    "SUNW-need-set-physaddr", "%d", &need) != 0)
+		need = 0;
+	xnbop->o_need_setphysaddr = (need > 0);
+
+	return (B_TRUE);
 }
 
 /*
- * xnb calls back here when the driver in the guest domain has closed
- * down the inter-domain connection. We close the underlying MAC device.
+ * Find the multicast address `addr', return B_TRUE if it is one that
+ * we receive. If `remove', remove it from the set received.
  */
-static void
-xnbo_disconnected(xnb_t *xnbp)
+static boolean_t
+xnbo_mcast_find(xnb_t *xnbp, ether_addr_t *addr, boolean_t remove)
 {
-	xnbo_close_mac(xnbp->xnb_flavour_data);
+	xnbo_t *xnbop = xnbp->xnb_flavour_data;
+	xmca_t *prev, *del, *this;
+
+	ASSERT(MUTEX_HELD(&xnbp->xnb_state_lock));
+	ASSERT(xnbop->o_promiscuous == B_FALSE);
+
+	prev = del = NULL;
+
+	this = xnbop->o_mca;
+
+	while (this != NULL) {
+		if (bcmp(&this->addr, addr, sizeof (this->addr)) == 0) {
+			del = this;
+			if (remove) {
+				if (prev == NULL)
+					xnbop->o_mca = this->next;
+				else
+					prev->next = this->next;
+			}
+			break;
+		}
+
+		prev = this;
+		this = this->next;
+	}
+
+	if (del == NULL)
+		return (B_FALSE);
+
+	if (remove) {
+		DTRACE_PROBE3(mcast_remove,
+		    (char *), "remove",
+		    (void *), xnbp,
+		    (etheraddr_t *), del->addr);
+		mac_multicast_remove(xnbop->o_mch, del->addr);
+		kmem_free(del, sizeof (*del));
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Add the multicast address `addr' to the set received.
+ */
+static boolean_t
+xnbo_mcast_add(xnb_t *xnbp, ether_addr_t *addr)
+{
+	xnbo_t *xnbop = xnbp->xnb_flavour_data;
+	boolean_t r = B_FALSE;
+
+	ASSERT(xnbop->o_promiscuous == B_FALSE);
+
+	mutex_enter(&xnbp->xnb_state_lock);
+
+	if (xnbo_mcast_find(xnbp, addr, B_FALSE)) {
+		r = B_TRUE;
+	} else if (mac_multicast_add(xnbop->o_mch,
+	    (const uint8_t *)addr) == 0) {
+		xmca_t *mca;
+
+		DTRACE_PROBE3(mcast_add,
+		    (char *), "add",
+		    (void *), xnbp,
+		    (etheraddr_t *), addr);
+
+		mca = kmem_alloc(sizeof (*mca), KM_SLEEP);
+		bcopy(addr, &mca->addr, sizeof (mca->addr));
+
+		mca->next = xnbop->o_mca;
+		xnbop->o_mca = mca;
+
+		r = B_TRUE;
+	}
+
+	mutex_exit(&xnbp->xnb_state_lock);
+
+	return (r);
+}
+
+/*
+ * Remove the multicast address `addr' from the set received.
+ */
+static boolean_t
+xnbo_mcast_del(xnb_t *xnbp, ether_addr_t *addr)
+{
+	boolean_t r;
+
+	mutex_enter(&xnbp->xnb_state_lock);
+	r = xnbo_mcast_find(xnbp, addr, B_TRUE);
+	mutex_exit(&xnbp->xnb_state_lock);
+
+	return (r);
 }
 
 static int
 xnbo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 {
 	static xnb_flavour_t flavour = {
-		xnbo_to_mac, xnbo_connected, xnbo_disconnected, xnbo_hotplug,
+		xnbo_to_mac, xnbo_peer_connected, xnbo_peer_disconnected,
+		xnbo_hotplug_connected, xnbo_start_connect,
 		xnbo_cksum_from_peer, xnbo_cksum_to_peer,
+		xnbo_mcast_add, xnbo_mcast_del,
 	};
 	xnbo_t *xnbop;
 
@@ -459,13 +607,6 @@
 
 	xnbop = kmem_zalloc(sizeof (*xnbop), KM_SLEEP);
 
-	xnbop->o_mh = NULL;
-	xnbop->o_mch = NULL;
-	xnbop->o_mah = NULL;
-	xnbop->o_mphp = NULL;
-	xnbop->o_running = B_FALSE;
-	xnbop->o_hcksum_capab = 0;
-
 	if (xnb_attach(dip, &flavour, xnbop) != DDI_SUCCESS) {
 		kmem_free(xnbop, sizeof (*xnbop));
 		return (DDI_FAILURE);
@@ -503,7 +644,7 @@
 	mutex_exit(&xnbp->xnb_rx_lock);
 	mutex_exit(&xnbp->xnb_tx_lock);
 
-	xnbo_close_mac(xnbop);
+	xnbo_close_mac(xnbp);
 	kmem_free(xnbop, sizeof (*xnbop));
 
 	xnb_detach(dip);
--- a/usr/src/uts/common/xen/io/xnbu.c	Wed Nov 04 21:40:43 2009 -0800
+++ b/usr/src/uts/common/xen/io/xnbu.c	Thu Nov 05 01:05:36 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -43,6 +43,7 @@
 #include <sys/mac_provider.h>
 #include <sys/mac_ether.h>
 #include <xen/sys/xendev.h>
+#include <sys/note.h>
 
 /* Required driver entry points for GLDv3 */
 static int	xnbu_m_start(void *);
@@ -59,7 +60,7 @@
 	boolean_t		u_need_sched;
 } xnbu_t;
 
-static mac_callbacks_t xnb_callbacks = {
+static mac_callbacks_t xnbu_callbacks = {
 	MC_GETCAPAB,
 	xnbu_m_stat,
 	xnbu_m_start,
@@ -147,28 +148,26 @@
 static uint16_t
 xnbu_cksum_to_peer(xnb_t *xnbp, mblk_t *mp)
 {
+	_NOTE(ARGUNUSED(xnbp));
 	uint16_t r = 0;
+	uint32_t pflags;
 
-	if (xnbp->xnb_cksum_offload) {
-		uint32_t pflags;
-
-		hcksum_retrieve(mp, NULL, NULL, NULL, NULL,
-		    NULL, NULL, &pflags);
+	hcksum_retrieve(mp, NULL, NULL, NULL, NULL,
+	    NULL, NULL, &pflags);
 
-		/*
-		 * If the protocol stack has requested checksum
-		 * offload, inform the peer that we have not
-		 * calculated the checksum.
-		 */
-		if ((pflags & HCK_FULLCKSUM) != 0)
-			r |= NETRXF_csum_blank;
-	}
+	/*
+	 * If the protocol stack has requested checksum
+	 * offload, inform the peer that we have not
+	 * calculated the checksum.
+	 */
+	if ((pflags & HCK_FULLCKSUM) != 0)
+		r |= NETRXF_csum_blank;
 
 	return (r);
 }
 
-static void
-xnbu_connected(xnb_t *xnbp)
+static boolean_t
+xnbu_start_connect(xnb_t *xnbp)
 {
 	xnbu_t *xnbup = xnbp->xnb_flavour_data;
 
@@ -177,10 +176,20 @@
 	 * We are able to send packets now - bring them on.
 	 */
 	mac_tx_update(xnbup->u_mh);
+
+	return (B_TRUE);
+}
+
+static boolean_t
+xnbu_peer_connected(xnb_t *xnbp)
+{
+	_NOTE(ARGUNUSED(xnbp));
+
+	return (B_TRUE);
 }
 
 static void
-xnbu_disconnected(xnb_t *xnbp)
+xnbu_peer_disconnected(xnb_t *xnbp)
 {
 	xnbu_t *xnbup = xnbp->xnb_flavour_data;
 
@@ -189,7 +198,7 @@
 
 /*ARGSUSED*/
 static boolean_t
-xnbu_hotplug(xnb_t *xnbp)
+xnbu_hotplug_connected(xnb_t *xnbp)
 {
 	return (B_TRUE);
 }
@@ -199,28 +208,30 @@
 {
 	xnb_t *xnbp = arg;
 	xnbu_t *xnbup = xnbp->xnb_flavour_data;
+	boolean_t sched = B_FALSE;
 
 	mp = xnb_copy_to_peer(arg, mp);
 
-	/* XXPV dme: playing with need_sched without txlock? */
-
+	mutex_enter(&xnbp->xnb_rx_lock);
 	/*
 	 * If we consumed all of the mblk_t's offered, perhaps we need
 	 * to indicate that we can accept more.  Otherwise we are full
 	 * and need to wait for space.
 	 */
 	if (mp == NULL) {
-		/*
-		 * If a previous transmit attempt failed because the ring
-		 * was full, try again now.
-		 */
-		if (xnbup->u_need_sched) {
-			xnbup->u_need_sched = B_FALSE;
-			mac_tx_update(xnbup->u_mh);
-		}
+		sched = xnbup->u_need_sched;
+		xnbup->u_need_sched = B_FALSE;
 	} else {
 		xnbup->u_need_sched = B_TRUE;
 	}
+	mutex_exit(&xnbp->xnb_rx_lock);
+
+	/*
+	 * If a previous transmit attempt failed because the ring
+	 * was full, try again now.
+	 */
+	if (sched)
+		mac_tx_update(xnbup->u_mh);
 
 	return (mp);
 }
@@ -327,16 +338,13 @@
 static boolean_t
 xnbu_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
 {
-	xnb_t *xnbp = arg;
+	_NOTE(ARGUNUSED(arg));
 
 	switch (cap) {
 	case MAC_CAPAB_HCKSUM: {
 		uint32_t *capab = cap_data;
 
-		if (xnbp->xnb_cksum_offload)
-			*capab = HCKSUM_INET_PARTIAL;
-		else
-			*capab = 0;
+		*capab = HCKSUM_INET_PARTIAL;
 		break;
 	}
 	default:
@@ -346,12 +354,34 @@
 	return (B_TRUE);
 }
 
+/*
+ * All packets are passed to the peer, so adding and removing
+ * multicast addresses is meaningless.
+ */
+static boolean_t
+xnbu_mcast_add(xnb_t *xnbp, ether_addr_t *addr)
+{
+	_NOTE(ARGUNUSED(xnbp, addr));
+
+	return (B_TRUE);
+}
+
+static boolean_t
+xnbu_mcast_del(xnb_t *xnbp, ether_addr_t *addr)
+{
+	_NOTE(ARGUNUSED(xnbp, addr));
+
+	return (B_TRUE);
+}
+
 static int
 xnbu_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 {
 	static xnb_flavour_t flavour = {
-		xnbu_to_host, xnbu_connected, xnbu_disconnected, xnbu_hotplug,
+		xnbu_to_host, xnbu_peer_connected, xnbu_peer_disconnected,
+		xnbu_hotplug_connected, xnbu_start_connect,
 		xnbu_cksum_from_peer, xnbu_cksum_to_peer,
+		xnbu_mcast_add, xnbu_mcast_del,
 	};
 	xnbu_t *xnbup;
 	xnb_t *xnbp;
@@ -392,7 +422,7 @@
 	 */
 	mr->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
 	mr->m_src_addr = xnbp->xnb_mac_addr;
-	mr->m_callbacks = &xnb_callbacks;
+	mr->m_callbacks = &xnbu_callbacks;
 	mr->m_min_sdu = 0;
 	mr->m_max_sdu = XNBMAXPKT;
 	/*
--- a/usr/src/uts/common/xen/io/xnf.c	Wed Nov 04 21:40:43 2009 -0800
+++ b/usr/src/uts/common/xen/io/xnf.c	Thu Nov 05 01:05:36 2009 -0800
@@ -57,7 +57,50 @@
  */
 
 /*
- * xnf.c - Nemo-based network driver for domU
+ * xnf.c - GLDv3 network driver for domU.
+ */
+
+/*
+ * This driver uses four per-instance locks:
+ *
+ * xnf_gref_lock:
+ *
+ *    Protects access to the grant reference list stored in
+ *    xnf_gref_head. Grant references should be acquired and released
+ *    using gref_get() and gref_put() respectively.
+ *
+ * xnf_schedlock:
+ *
+ *    Protects:
+ *    xnf_need_sched - used to record that a previous transmit attempt
+ *       failed (and consequently it will be necessary to call
+ *       mac_tx_update() when transmit resources are available).
+ *    xnf_pending_multicast - the number of multicast requests that
+ *       have been submitted to the backend for which we have not
+ *       processed responses.
+ *
+ * xnf_txlock:
+ *
+ *    Protects the transmit ring (xnf_tx_ring) and associated
+ *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
+ *
+ * xnf_rxlock:
+ *
+ *    Protects the receive ring (xnf_rx_ring) and associated
+ *    structures (notably xnf_rx_pkt_info).
+ *
+ * If driver-global state that affects both the transmit and receive
+ * rings is manipulated, both xnf_txlock and xnf_rxlock should be
+ * held, in that order.
+ *
+ * xnf_schedlock is acquired both whilst holding xnf_txlock and
+ * without. It should always be acquired after xnf_txlock if both are
+ * held.
+ *
+ * Notes:
+ * - atomic_add_64() is used to manipulate counters where we require
+ *   accuracy. For counters intended only for observation by humans,
+ *   post increment/decrement are used instead.
  */
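
A fragment illustrating the xnf_schedlock rule above, following the
pattern used by the transmit paths later in this file (illustrative
only, not part of the changeset; it assumes an xnf_t pointer `xnfp'):

	mutex_enter(&xnfp->xnf_txlock);		/* ring manipulation */
	/* ... fill transmit ring slots ... */
	mutex_enter(&xnfp->xnf_schedlock);	/* always after xnf_txlock */
	xnfp->xnf_pending_multicast++;
	mutex_exit(&xnfp->xnf_schedlock);
	/* ... */
	mutex_exit(&xnfp->xnf_txlock);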
 
 #include <sys/types.h>
@@ -67,6 +110,7 @@
 #include <sys/systm.h>
 #include <sys/stream.h>
 #include <sys/strsubr.h>
+#include <sys/strsun.h>
 #include <sys/conf.h>
 #include <sys/ddi.h>
 #include <sys/devops.h>
@@ -96,17 +140,18 @@
 #include <sys/gnttab.h>
 #include <xen/sys/xendev.h>
 #include <sys/sdt.h>
+#include <sys/note.h>
+#include <sys/debug.h>
 
 #include <io/xnf.h>
 
-
-/*
- *  Declarations and Module Linkage
- */
-
 #if defined(DEBUG) || defined(__lint)
 #define	XNF_DEBUG
-int	xnfdebug = 0;
+#endif
+
+#ifdef XNF_DEBUG
+int xnf_debug = 0;
+xnf_t *xnf_debug_instance = NULL;
 #endif
 
 /*
@@ -117,23 +162,39 @@
  */
 #define	xnf_btop(addr)	((addr) >> PAGESHIFT)
 
-boolean_t	xnf_cksum_offload = B_TRUE;
-
-/* Default value for hypervisor-based copy operations */
-boolean_t	xnf_rx_hvcopy = B_TRUE;
+unsigned int	xnf_max_tx_frags = 1;
+
+/*
+ * Should we use the multicast control feature if the backend provides
+ * it?
+ */
+boolean_t xnf_multicast_control = B_TRUE;
 
 /*
- * Should pages used for transmit be readonly for the peer?
+ * Received packets below this size are copied to a new streams buffer
+ * rather than being desballoc'ed.
+ *
+ * This value is chosen to accommodate traffic where there are a large
+ * number of small packets. For data showing a typical distribution,
+ * see:
+ *
+ * Sinha07a:
+ *	Rishi Sinha, Christos Papadopoulos, and John
+ *	Heidemann. Internet Packet Size Distributions: Some
+ *	Observations. Technical Report ISI-TR-2007-643,
+ *	USC/Information Sciences Institute, May, 2007. Originally
+ *	released October 2005 as web page
+ *	http://netweb.usc.edu/~sinha/pkt-sizes/.
+ *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
  */
-boolean_t	xnf_tx_pages_readonly = B_FALSE;
-/*
- * Packets under this size are bcopied instead of using desballoc.
- * Choose a value > XNF_FRAMESIZE (1514) to force the receive path to
- * always copy.
- */
-unsigned int	xnf_rx_bcopy_thresh = 64;
-
-unsigned int	xnf_max_tx_frags = 1;
+size_t xnf_rx_copy_limit = 64;
+
+#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
+#define	INVALID_GRANT_REF	((grant_ref_t)-1)
+#define	INVALID_TX_ID		((uint16_t)-1)
+
+#define	TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
+#define	TX_ID_VALID(i) (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
 
 /* Required system entry points */
 static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
@@ -148,35 +209,46 @@
 static mblk_t	*xnf_send(void *, mblk_t *);
 static uint_t	xnf_intr(caddr_t);
 static int	xnf_stat(void *, uint_t, uint64_t *);
-static void	xnf_ioctl(void *, queue_t *, mblk_t *);
 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
 
 /* Driver private functions */
 static int xnf_alloc_dma_resources(xnf_t *);
 static void xnf_release_dma_resources(xnf_t *);
-static mblk_t *xnf_process_recv(xnf_t *);
-static void xnf_rcv_complete(struct xnf_buffer_desc *);
 static void xnf_release_mblks(xnf_t *);
-static struct xnf_buffer_desc *xnf_alloc_tx_buffer(xnf_t *);
-static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *);
-static struct xnf_buffer_desc *xnf_get_tx_buffer(xnf_t *);
-static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *);
-static void xnf_free_buffer(struct xnf_buffer_desc *);
-static void xnf_free_tx_buffer(struct xnf_buffer_desc *);
+
+static int xnf_buf_constructor(void *, void *, int);
+static void xnf_buf_destructor(void *, void *);
+static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
+#pragma inline(xnf_buf_get)
+static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
+#pragma inline(xnf_buf_put)
+static void xnf_buf_refresh(xnf_buf_t *);
+#pragma inline(xnf_buf_refresh)
+static void xnf_buf_recycle(xnf_buf_t *);
+
+static int xnf_tx_buf_constructor(void *, void *, int);
+static void xnf_tx_buf_destructor(void *, void *);
+
+static grant_ref_t gref_get(xnf_t *);
+#pragma inline(gref_get)
+static void gref_put(xnf_t *, grant_ref_t);
+#pragma inline(gref_put)
+
+static xnf_txid_t *txid_get(xnf_t *);
+#pragma inline(txid_get)
+static void txid_put(xnf_t *, xnf_txid_t *);
+#pragma inline(txid_put)
+
 void xnf_send_driver_status(int, int);
-static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *);
-static int xnf_clean_tx_ring(xnf_t  *);
+static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
+static int xnf_tx_clean_ring(xnf_t  *);
 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
     void *, void *);
-static mblk_t *xnf_process_hvcopy_recv(xnf_t *xnfp);
-static boolean_t xnf_hvcopy_peer_status(dev_info_t *devinfo);
-static boolean_t xnf_kstat_init(xnf_t *xnfp);
-
-/*
- * XXPV dme: remove MC_IOCTL?
- */
+static boolean_t xnf_kstat_init(xnf_t *);
+static void xnf_rx_collect(xnf_t *);
+
 static mac_callbacks_t xnf_callbacks = {
-	MC_IOCTL | MC_GETCAPAB,
+	MC_GETCAPAB,
 	xnf_stat,
 	xnf_start,
 	xnf_stop,
@@ -184,14 +256,10 @@
 	xnf_set_multicast,
 	xnf_set_mac_addr,
 	xnf_send,
-	xnf_ioctl,
+	NULL,
 	xnf_getcapab
 };
 
-#define	GRANT_INVALID_REF	0
-const int xnf_rx_bufs_lowat = 4 * NET_RX_RING_SIZE;
-const int xnf_rx_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */
-
 /* DMA attributes for network ring buffer */
 static ddi_dma_attr_t ringbuf_dma_attr = {
 	DMA_ATTR_V0,		/* version of this structure */
@@ -208,24 +276,8 @@
 	0,			/* flags (reserved) */
 };
 
-/* DMA attributes for transmit data */
-static ddi_dma_attr_t tx_buffer_dma_attr = {
-	DMA_ATTR_V0,		/* version of this structure */
-	0,			/* lowest usable address */
-	0xffffffffffffffffULL,	/* highest usable address */
-	0x7fffffff,		/* maximum DMAable byte count */
-	MMU_PAGESIZE,		/* alignment in bytes */
-	0x7ff,			/* bitmap of burst sizes */
-	1,			/* minimum transfer */
-	0xffffffffU,		/* maximum transfer */
-	0xffffffffffffffffULL,	/* maximum segment length */
-	1,			/* maximum number of segments */
-	1,			/* granularity */
-	0,			/* flags (reserved) */
-};
-
-/* DMA attributes for a receive buffer */
-static ddi_dma_attr_t rx_buffer_dma_attr = {
+/* DMA attributes for transmit and receive data */
+static ddi_dma_attr_t buf_dma_attr = {
 	DMA_ATTR_V0,		/* version of this structure */
 	0,			/* lowest usable address */
 	0xffffffffffffffffULL,	/* highest usable address */
@@ -254,9 +306,6 @@
 	DDI_STRICTORDER_ACC
 };
 
-unsigned char xnf_broadcastaddr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
-int xnf_diagnose = 0; /* Patchable global for diagnostic purposes */
-
 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
     nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
 
@@ -286,7 +335,7 @@
 int
 _fini(void)
 {
-	return (EBUSY); /* XXPV dme: should be removable */
+	return (EBUSY); /* XXPV should be removable */
 }
 
 int
@@ -295,19 +344,148 @@
 	return (mod_info(&modlinkage, modinfop));
 }
 
+/*
+ * Acquire a grant reference.
+ */
+static grant_ref_t
+gref_get(xnf_t *xnfp)
+{
+	grant_ref_t gref;
+
+	mutex_enter(&xnfp->xnf_gref_lock);
+
+	do {
+		gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
+
+	} while ((gref == INVALID_GRANT_REF) &&
+	    (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
+
+	mutex_exit(&xnfp->xnf_gref_lock);
+
+	if (gref == INVALID_GRANT_REF) {
+		xnfp->xnf_stat_gref_failure++;
+	} else {
+		atomic_add_64(&xnfp->xnf_stat_gref_outstanding, 1);
+		if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
+			xnfp->xnf_stat_gref_peak =
+			    xnfp->xnf_stat_gref_outstanding;
+	}
+
+	return (gref);
+}
+
+/*
+ * Release a grant reference.
+ */
+static void
+gref_put(xnf_t *xnfp, grant_ref_t gref)
+{
+	ASSERT(gref != INVALID_GRANT_REF);
+
+	mutex_enter(&xnfp->xnf_gref_lock);
+	gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
+	mutex_exit(&xnfp->xnf_gref_lock);
+
+	atomic_add_64(&xnfp->xnf_stat_gref_outstanding, -1);
+}
+
+/*
+ * Acquire a transmit id.
+ */
+static xnf_txid_t *
+txid_get(xnf_t *xnfp)
+{
+	xnf_txid_t *tidp;
+
+	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
+
+	if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
+		return (NULL);
+
+	ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
+
+	tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
+	xnfp->xnf_tx_pkt_id_head = tidp->next;
+	tidp->next = INVALID_TX_ID;
+
+	ASSERT(tidp->txbuf == NULL);
+
+	return (tidp);
+}
+
+/*
+ * Release a transmit id.
+ */
+static void
+txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
+{
+	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
+	ASSERT(TX_ID_VALID(tidp->id));
+	ASSERT(tidp->next == INVALID_TX_ID);
+
+	tidp->txbuf = NULL;
+	tidp->next = xnfp->xnf_tx_pkt_id_head;
+	xnfp->xnf_tx_pkt_id_head = tidp->id;
+}
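
The transmit ids managed by txid_get() and txid_put() form a free list
threaded through the xnf_tx_pkt_id[] array itself: slot indices serve
as links and INVALID_TX_ID terminates the list. A stripped-down,
user-space model of the same idea, purely illustrative (the model_*
names are invented and all locking is omitted):

#include <assert.h>
#include <stdint.h>

#define	MODEL_RING_SIZE		4
#define	MODEL_INVALID_ID	((uint16_t)-1)

typedef struct model_txid {
	uint16_t id;		/* this slot's own index */
	uint16_t next;		/* next free slot, or MODEL_INVALID_ID */
} model_txid_t;

static model_txid_t	model_ids[MODEL_RING_SIZE];
static uint16_t		model_head = MODEL_INVALID_ID;

static void
model_txid_put(uint16_t id)	/* cf. txid_put() */
{
	model_ids[id].next = model_head;
	model_head = id;
}

static uint16_t
model_txid_get(void)		/* cf. txid_get() */
{
	uint16_t id = model_head;

	if (id == MODEL_INVALID_ID)
		return (MODEL_INVALID_ID);

	model_head = model_ids[id].next;
	model_ids[id].next = MODEL_INVALID_ID;

	return (id);
}

int
main(void)
{
	uint16_t i;

	for (i = 0; i < MODEL_RING_SIZE; i++) {
		model_ids[i].id = i;
		model_txid_put(i);
	}

	/* The list is LIFO: the last id released is the first acquired. */
	assert(model_txid_get() == MODEL_RING_SIZE - 1);

	return (0);
}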
+
+/*
+ * Get `wanted' slots in the transmit ring, waiting for at least that
+ * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
+ * `wanted' to zero.
+ *
+ * Return the number of slots available.
+ */
+static int
+tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
+{
+	int slotsfree;
+	boolean_t forced_clean = (wanted == 0);
+
+	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
+
+	/* LINTED: constant in conditional context */
+	while (B_TRUE) {
+		slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
+
+		if ((slotsfree < wanted) || forced_clean)
+			slotsfree = xnf_tx_clean_ring(xnfp);
+
+		/*
+		 * If there are more than we need free, tell other
+		 * people to come looking again. We hold txlock, so we
+		 * are able to take our slots before anyone else runs.
+		 */
+		if (slotsfree > wanted)
+			cv_broadcast(&xnfp->xnf_cv_tx_slots);
+
+		if (slotsfree >= wanted)
+			break;
+
+		if (!wait)
+			break;
+
+		cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
+	}
+
+	ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
+
+	return (slotsfree);
+}
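
For reference, the call patterns that appear later in this file
(fragment only, not part of the changeset; `n_slots' and `slots_free'
are assumed locals):

	/* A multicast request needs two adjacent slots and may block: */
	n_slots = tx_slots_get(xnfp, 2, B_TRUE);
	ASSERT(n_slots >= 2);

	/* The data path takes whatever is free without blocking: */
	slots_free = tx_slots_get(xnfp, 1, B_FALSE);

	/* Passing zero simply forces a pass of xnf_tx_clean_ring(): */
	(void) tx_slots_get(xnfp, 0, B_FALSE);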
+
 static int
 xnf_setup_rings(xnf_t *xnfp)
 {
-	int			ix, err;
+	domid_t			oeid;
+	struct xenbus_device	*xsd;
 	RING_IDX		i;
-	struct xnf_buffer_desc	*bdesc, *rbp;
-	struct xenbus_device	*xsd;
-	domid_t			oeid;
+	int			err;
+	xnf_txid_t		*tidp;
+	xnf_buf_t **bdescp;
 
 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
 
-	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
+	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
 
 	err = gnttab_grant_foreign_access(oeid,
@@ -319,7 +497,7 @@
 	}
 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
 
-	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
+	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
 
 	err = gnttab_grant_foreign_access(oeid,
@@ -331,139 +509,130 @@
 	}
 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
 
-
-	mutex_enter(&xnfp->xnf_intrlock);
+	mutex_enter(&xnfp->xnf_txlock);
 
 	/*
-	 * Cleanup the TX ring.  We just clean up any valid tx_pktinfo structs
-	 * and reset the ring.  Note that this can lose packets after a resume,
-	 * but we expect to stagger on.
+	 * Setup/cleanup the TX ring.  Note that this can lose packets
+	 * after a resume, but we expect to stagger on.
 	 */
-	mutex_enter(&xnfp->xnf_txlock);
-
-	for (i = 0; i < xnfp->xnf_n_tx; i++) {
-		struct tx_pktinfo *txp = &xnfp->xnf_tx_pkt_info[i];
-
-		txp->id = i + 1;
-
-		if (txp->grant_ref == GRANT_INVALID_REF) {
-			ASSERT(txp->mp == NULL);
-			ASSERT(txp->bdesc == NULL);
+	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
+	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
+	    i < NET_TX_RING_SIZE;
+	    i++, tidp++) {
+		xnf_txbuf_t *txp;
+
+		tidp->id = i;
+
+		txp = tidp->txbuf;
+		if (txp == NULL) {
+			tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
+			txid_put(xnfp, tidp);
 			continue;
 		}
 
-		if (gnttab_query_foreign_access(txp->grant_ref) != 0)
-			panic("tx grant still in use by backend domain");
-
-		freemsg(txp->mp);
-		txp->mp = NULL;
-
-		(void) ddi_dma_unbind_handle(txp->dma_handle);
-
-		if (txp->bdesc != NULL) {
-			xnf_free_tx_buffer(txp->bdesc);
-			txp->bdesc = NULL;
+		ASSERT(txp->tx_txreq.gref != INVALID_GRANT_REF);
+		ASSERT(txp->tx_mp != NULL);
+
+		switch (txp->tx_type) {
+		case TX_DATA:
+			VERIFY(gnttab_query_foreign_access(txp->tx_txreq.gref)
+			    == 0);
+
+			if (txp->tx_bdesc == NULL) {
+				(void) gnttab_end_foreign_access_ref(
+				    txp->tx_txreq.gref, 1);
+				gref_put(xnfp, txp->tx_txreq.gref);
+				(void) ddi_dma_unbind_handle(
+				    txp->tx_dma_handle);
+			} else {
+				xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
+			}
+
+			freemsg(txp->tx_mp);
+			txid_put(xnfp, tidp);
+			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+
+			break;
+
+		case TX_MCAST_REQ:
+			txp->tx_type = TX_MCAST_RSP;
+			txp->tx_status = NETIF_RSP_DROPPED;
+			cv_broadcast(&xnfp->xnf_cv_multicast);
+
+			/*
+			 * The request consumed two slots in the ring,
+			 * yet only a single xnf_txid_t is used. Step
+			 * over the empty slot.
+			 */
+			i++;
+			ASSERT(i < NET_TX_RING_SIZE);
+
+			break;
+
+		case TX_MCAST_RSP:
+			break;
 		}
-
-		(void) gnttab_end_foreign_access_ref(txp->grant_ref,
-		    xnfp->xnf_tx_pages_readonly);
-		gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
-		    txp->grant_ref);
-		txp->grant_ref = GRANT_INVALID_REF;
 	}
 
-	xnfp->xnf_tx_pkt_id_list = 0;
-	xnfp->xnf_tx_ring.rsp_cons = 0;
-	xnfp->xnf_tx_ring.req_prod_pvt = 0;
-
 	/* LINTED: constant in conditional context */
 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
+	/* LINTED: constant in conditional context */
+	FRONT_RING_INIT(&xnfp->xnf_tx_ring,
+	    xnfp->xnf_tx_ring.sring, PAGESIZE);
 
 	mutex_exit(&xnfp->xnf_txlock);
 
+	mutex_enter(&xnfp->xnf_rxlock);
+
 	/*
-	 * Rebuild the RX ring.  We have to rebuild the RX ring because some of
-	 * our pages are currently flipped out/granted so we can't just free
-	 * the RX buffers.  Reclaim any unprocessed recv buffers, they won't be
-	 * useable anyway since the mfn's they refer to are no longer valid.
-	 * Grant the backend domain access to each hung rx buffer.
+	 * Clean out any buffers currently posted to the receive ring
+	 * before we reset it.
 	 */
-	i = xnfp->xnf_rx_ring.rsp_cons;
-	while (i++ != xnfp->xnf_rx_ring.sring->req_prod) {
-		volatile netif_rx_request_t	*rxrp;
-
-		rxrp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, i);
-		ix = rxrp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0);
-		rbp = xnfp->xnf_rxpkt_bufptr[ix];
-		if (rbp != NULL) {
-			grant_ref_t	ref = rbp->grant_ref;
-
-			ASSERT(ref != GRANT_INVALID_REF);
-			if (xnfp->xnf_rx_hvcopy) {
-				pfn_t pfn = xnf_btop(rbp->buf_phys);
-				mfn_t mfn = pfn_to_mfn(pfn);
-
-				gnttab_grant_foreign_access_ref(ref, oeid,
-				    mfn, 0);
-			} else {
-				gnttab_grant_foreign_transfer_ref(ref,
-				    oeid, 0);
-			}
-			rxrp->id = ix;
-			rxrp->gref = ref;
+	for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
+	    i < NET_RX_RING_SIZE;
+	    i++, bdescp++) {
+		if (*bdescp != NULL) {
+			xnf_buf_put(xnfp, *bdescp, B_FALSE);
+			*bdescp = NULL;
 		}
 	}
 
-	/*
-	 * Reset the ring pointers to initial state.
-	 * Hang buffers for any empty ring slots.
-	 */
-	xnfp->xnf_rx_ring.rsp_cons = 0;
-	xnfp->xnf_rx_ring.req_prod_pvt = 0;
-
 	/* LINTED: constant in conditional context */
 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
-
+	/* LINTED: constant in conditional context */
+	FRONT_RING_INIT(&xnfp->xnf_rx_ring,
+	    xnfp->xnf_rx_ring.sring, PAGESIZE);
+
+	/*
+	 * Fill the ring with buffers.
+	 */
 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
-		xnfp->xnf_rx_ring.req_prod_pvt = i;
-		if (xnfp->xnf_rxpkt_bufptr[i] != NULL)
-			continue;
-		if ((bdesc = xnf_get_buffer(xnfp)) == NULL)
-			break;
-		rx_buffer_hang(xnfp, bdesc);
+		xnf_buf_t *bdesc;
+
+		bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
+		VERIFY(bdesc != NULL);
+		xnf_rxbuf_hang(xnfp, bdesc);
 	}
-	xnfp->xnf_rx_ring.req_prod_pvt = i;
+
 	/* LINTED: constant in conditional context */
 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
 
-	mutex_exit(&xnfp->xnf_intrlock);
+	mutex_exit(&xnfp->xnf_rxlock);
 
 	return (0);
 
 out:
-	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
+	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
-	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
-
-	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
+	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
+
+	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
-	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
+	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
 
 	return (err);
 }
 
-
-/* Called when the upper layers free a message we passed upstream */
-static void
-xnf_copy_rcv_complete(struct xnf_buffer_desc *bdesc)
-{
-	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
-	ddi_dma_mem_free(&bdesc->acc_handle);
-	ddi_dma_free_handle(&bdesc->dma_handle);
-	kmem_free(bdesc, sizeof (*bdesc));
-}
-
-
 /*
  * Connect driver to back end, called to set up communication with
  * back end driver both initially and on resume after restore/migrate.
@@ -523,31 +692,24 @@
 		goto abort_transaction;
 	}
 
-	if (!xnfp->xnf_tx_pages_readonly) {
-		err = xenbus_printf(xbt, xsname, "feature-tx-writable",
-		    "%d", 1);
-		if (err != 0) {
-			message = "writing feature-tx-writable";
-			goto abort_transaction;
-		}
-	}
-
-	err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d",
-	    xnfp->xnf_cksum_offload ? 0 : 1);
-	if (err != 0) {
-		message = "writing feature-no-csum-offload";
-		goto abort_transaction;
-	}
-	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d",
-	    xnfp->xnf_rx_hvcopy ? 1 : 0);
+	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
 	if (err != 0) {
 		message = "writing request-rx-copy";
 		goto abort_transaction;
 	}
 
-	err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected);
+	if (xnfp->xnf_be_mcast_control) {
+		err = xenbus_printf(xbt, xsname, "request-multicast-control",
+		    "%d", 1);
+		if (err != 0) {
+			message = "writing request-multicast-control";
+			goto abort_transaction;
+		}
+	}
+
+	err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
 	if (err != 0) {
-		message = "writing frontend XenbusStateConnected";
+		message = "switching state to XenbusStateConnected";
 		goto abort_transaction;
 	}
 
@@ -566,15 +728,16 @@
 }
 
 /*
- * Read config info from xenstore
+ * Read configuration information from xenstore.
  */
 void
 xnf_read_config(xnf_t *xnfp)
 {
-	char		mac[ETHERADDRL * 3];
-	int		err, be_no_cksum_offload;
-
-	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), "mac",
+	int err, be_cap;
+	char mac[ETHERADDRL * 3];
+	char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
+
+	err = xenbus_scanf(XBT_NULL, oename, "mac",
 	    "%s", (char *)&mac[0]);
 	if (err != 0) {
 		/*
@@ -593,27 +756,31 @@
 		return;
 	}
 
-	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo),
-	    "feature-no-csum-offload", "%d", &be_no_cksum_offload);
+	err = xenbus_scanf(XBT_NULL, oename,
+	    "feature-rx-copy", "%d", &be_cap);
 	/*
 	 * If we fail to read the store we assume that the key is
 	 * absent, implying an older domain at the far end.  Older
-	 * domains always support checksum offload.
+	 * domains cannot do HV copy.
 	 */
 	if (err != 0)
-		be_no_cksum_offload = 0;
+		be_cap = 0;
+	xnfp->xnf_be_rx_copy = (be_cap != 0);
+
+	err = xenbus_scanf(XBT_NULL, oename,
+	    "feature-multicast-control", "%d", &be_cap);
 	/*
-	 * If the far end cannot do checksum offload or we do not wish
-	 * to do it, disable it.
+	 * If we fail to read the store we assume that the key is
+	 * absent, implying an older domain at the far end.  Older
+	 * domains do not support multicast control.
 	 */
-	if ((be_no_cksum_offload == 1) || !xnfp->xnf_cksum_offload)
-		xnfp->xnf_cksum_offload = B_FALSE;
+	if (err != 0)
+		be_cap = 0;
+	xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
 }
 
 /*
  *  attach(9E) -- Attach a device to the system
- *
- *  Called once for each board successfully probed.
  */
 static int
 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
@@ -621,9 +788,10 @@
 	mac_register_t *macp;
 	xnf_t *xnfp;
 	int err;
+	char cachename[32];
 
 #ifdef XNF_DEBUG
-	if (xnfdebug & XNF_DEBUG_DDI)
+	if (xnf_debug & XNF_DEBUG_DDI)
 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
 		    (void *)devinfo);
 #endif
@@ -631,6 +799,7 @@
 	switch (cmd) {
 	case DDI_RESUME:
 		xnfp = ddi_get_driver_private(devinfo);
+		xnfp->xnf_gen++;
 
 		(void) xvdi_resume(devinfo);
 		(void) xvdi_alloc_evtchn(devinfo);
@@ -642,16 +811,6 @@
 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
 		    (caddr_t)xnfp);
 #endif
-		xnf_be_connect(xnfp);
-		/*
-		 * Our MAC address may have changed if we're resuming:
-		 * - on a different host
-		 * - on the same one and got a different MAC address
-		 *   because we didn't specify one of our own.
-		 * so it's useful to claim that it changed in order that
-		 * IP send out a gratuitous ARP.
-		 */
-		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
 		return (DDI_SUCCESS);
 
 	case DDI_ATTACH:
@@ -681,11 +840,14 @@
 
 	xnfp->xnf_running = B_FALSE;
 	xnfp->xnf_connected = B_FALSE;
-	xnfp->xnf_cksum_offload = xnf_cksum_offload;
-	xnfp->xnf_tx_pages_readonly = xnf_tx_pages_readonly;
+	xnfp->xnf_be_rx_copy = B_FALSE;
+	xnfp->xnf_be_mcast_control = B_FALSE;
 	xnfp->xnf_need_sched = B_FALSE;
 
-	xnfp->xnf_rx_hvcopy = xnf_hvcopy_peer_status(devinfo) && xnf_rx_hvcopy;
+	xnfp->xnf_rx_head = NULL;
+	xnfp->xnf_rx_tail = NULL;
+	xnfp->xnf_rx_new_buffers_posted = B_FALSE;
+
 #ifdef XPV_HVM_DRIVER
 	/*
 	 * Report our version to dom0.
@@ -693,12 +855,6 @@
 	if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
 	    HVMPV_XNF_VERS))
 		cmn_err(CE_WARN, "xnf: couldn't write version\n");
-
-	if (!xnfp->xnf_rx_hvcopy) {
-		cmn_err(CE_WARN, "The xnf driver requires a dom0 that "
-		    "supports 'feature-rx-copy'");
-		goto failure;
-	}
 #endif
 
 	/*
@@ -707,59 +863,58 @@
 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
 	    != DDI_SUCCESS)
 		goto failure;
-	/*
-	 * Driver locking strategy: the txlock protects all paths
-	 * through the driver, except the interrupt thread.
-	 * If the interrupt thread needs to do something which could
-	 * affect the operation of any other part of the driver,
-	 * it needs to acquire the txlock mutex.
-	 */
-	mutex_init(&xnfp->xnf_tx_buf_mutex,
-	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
-	mutex_init(&xnfp->xnf_rx_buf_mutex,
-	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
+
 	mutex_init(&xnfp->xnf_txlock,
 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
-	mutex_init(&xnfp->xnf_intrlock,
+	mutex_init(&xnfp->xnf_rxlock,
+	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
+	mutex_init(&xnfp->xnf_schedlock,
+	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
+	mutex_init(&xnfp->xnf_gref_lock,
 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
-	cv_init(&xnfp->xnf_cv, NULL, CV_DEFAULT, NULL);
-
-	xnfp->xnf_gref_tx_head = (grant_ref_t)-1;
-	xnfp->xnf_gref_rx_head = (grant_ref_t)-1;
-	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
-	    &xnfp->xnf_gref_tx_head) < 0) {
-		cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs",
-		    ddi_get_instance(xnfp->xnf_devinfo));
+
+	cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
+	cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
+	cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
+
+	(void) sprintf(cachename, "xnf_buf_cache_%d",
+	    ddi_get_instance(devinfo));
+	xnfp->xnf_buf_cache = kmem_cache_create(cachename,
+	    sizeof (xnf_buf_t), 0,
+	    xnf_buf_constructor, xnf_buf_destructor,
+	    NULL, xnfp, NULL, 0);
+	if (xnfp->xnf_buf_cache == NULL)
+		goto failure_0;
+
+	(void) sprintf(cachename, "xnf_tx_buf_cache_%d",
+	    ddi_get_instance(devinfo));
+	xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
+	    sizeof (xnf_txbuf_t), 0,
+	    xnf_tx_buf_constructor, xnf_tx_buf_destructor,
+	    NULL, xnfp, NULL, 0);
+	if (xnfp->xnf_tx_buf_cache == NULL)
 		goto failure_1;
-	}
-	if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
-	    &xnfp->xnf_gref_rx_head) < 0) {
-		cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs",
-		    ddi_get_instance(xnfp->xnf_devinfo));
-		goto failure_1;
-	}
+
+	xnfp->xnf_gref_head = INVALID_GRANT_REF;
+
 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
 		    "driver data structures",
 		    ddi_get_instance(xnfp->xnf_devinfo));
-		goto failure_1;
+		goto failure_2;
 	}
 
 	xnfp->xnf_rx_ring.sring->rsp_event =
 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
 
-	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
-	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
+	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
+	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
 
 	/* set driver private pointer now */
 	ddi_set_driver_private(devinfo, xnfp);
 
-	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
-	    != DDI_SUCCESS)
-		goto failure_1;
-
 	if (!xnf_kstat_init(xnfp))
-		goto failure_2;
+		goto failure_3;
 
 	/*
 	 * Allocate an event channel, add the interrupt handler and
@@ -773,12 +928,15 @@
 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
 #endif
 
-	xnf_read_config(xnfp);
 	err = mac_register(macp, &xnfp->xnf_mh);
 	mac_free(macp);
 	macp = NULL;
 	if (err != 0)
-		goto failure_3;
+		goto failure_4;
+
+	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
+	    != DDI_SUCCESS)
+		goto failure_5;
 
 #ifdef XPV_HVM_DRIVER
 	/*
@@ -792,15 +950,17 @@
 	    "Ethernet controller");
 #endif
 
-	/*
-	 * connect to the backend
-	 */
-	xnf_be_connect(xnfp);
+#ifdef XNF_DEBUG
+	if (xnf_debug_instance == NULL)
+		xnf_debug_instance = xnfp;
+#endif
 
 	return (DDI_SUCCESS);
 
-failure_3:
-	kstat_delete(xnfp->xnf_kstat_aux);
+failure_5:
+	mac_unregister(xnfp->xnf_mh);
+
+failure_4:
 #ifdef XPV_HVM_DRIVER
 	ec_unbind_evtchn(xnfp->xnf_evtchn);
 	xvdi_free_evtchn(devinfo);
@@ -808,20 +968,26 @@
 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
 #endif
 	xnfp->xnf_evtchn = INVALID_EVTCHN;
+	kstat_delete(xnfp->xnf_kstat_aux);
+
+failure_3:
+	xnf_release_dma_resources(xnfp);
 
 failure_2:
-	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
+	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
 
 failure_1:
-	if (xnfp->xnf_gref_tx_head != (grant_ref_t)-1)
-		gnttab_free_grant_references(xnfp->xnf_gref_tx_head);
-	if (xnfp->xnf_gref_rx_head != (grant_ref_t)-1)
-		gnttab_free_grant_references(xnfp->xnf_gref_rx_head);
-	xnf_release_dma_resources(xnfp);
-	cv_destroy(&xnfp->xnf_cv);
-	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
+	kmem_cache_destroy(xnfp->xnf_buf_cache);
+
+failure_0:
+	cv_destroy(&xnfp->xnf_cv_tx_slots);
+	cv_destroy(&xnfp->xnf_cv_multicast);
+	cv_destroy(&xnfp->xnf_cv_state);
+
+	mutex_destroy(&xnfp->xnf_gref_lock);
+	mutex_destroy(&xnfp->xnf_schedlock);
+	mutex_destroy(&xnfp->xnf_rxlock);
 	mutex_destroy(&xnfp->xnf_txlock);
-	mutex_destroy(&xnfp->xnf_intrlock);
 
 failure:
 	kmem_free(xnfp, sizeof (*xnfp));
@@ -836,10 +1002,9 @@
 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
 {
 	xnf_t *xnfp;		/* Our private device info */
-	int i;
 
 #ifdef XNF_DEBUG
-	if (xnfdebug & XNF_DEBUG_DDI)
+	if (xnf_debug & XNF_DEBUG_DDI)
 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
 #endif
 
@@ -856,13 +1021,13 @@
 
 		xvdi_suspend(devinfo);
 
-		mutex_enter(&xnfp->xnf_intrlock);
+		mutex_enter(&xnfp->xnf_rxlock);
 		mutex_enter(&xnfp->xnf_txlock);
 
 		xnfp->xnf_evtchn = INVALID_EVTCHN;
 		xnfp->xnf_connected = B_FALSE;
 		mutex_exit(&xnfp->xnf_txlock);
-		mutex_exit(&xnfp->xnf_intrlock);
+		mutex_exit(&xnfp->xnf_rxlock);
 
 		/* claim link to be down after disconnect */
 		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
@@ -878,25 +1043,11 @@
 	if (xnfp->xnf_connected)
 		return (DDI_FAILURE);
 
-	/* Wait for receive buffers to be returned; give up after 5 seconds */
-	i = 50;
-
-	mutex_enter(&xnfp->xnf_rx_buf_mutex);
-	while (xnfp->xnf_rx_bufs_outstanding > 0) {
-		mutex_exit(&xnfp->xnf_rx_buf_mutex);
-		delay(drv_usectohz(100000));
-		if (--i == 0) {
-			cmn_err(CE_WARN,
-			    "xnf%d: never reclaimed all the "
-			    "receive buffers.  Still have %d "
-			    "buffers outstanding.",
-			    ddi_get_instance(xnfp->xnf_devinfo),
-			    xnfp->xnf_rx_bufs_outstanding);
-			return (DDI_FAILURE);
-		}
-		mutex_enter(&xnfp->xnf_rx_buf_mutex);
-	}
-	mutex_exit(&xnfp->xnf_rx_buf_mutex);
+	/*
+	 * Cannot detach if we have xnf_buf_t outstanding.
+	 */
+	if (xnfp->xnf_stat_buf_allocated > 0)
+		return (DDI_FAILURE);
 
 	if (mac_unregister(xnfp->xnf_mh) != 0)
 		return (DDI_FAILURE);
@@ -922,10 +1073,17 @@
 	/* Release all DMA resources */
 	xnf_release_dma_resources(xnfp);
 
-	cv_destroy(&xnfp->xnf_cv);
-	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
+	cv_destroy(&xnfp->xnf_cv_tx_slots);
+	cv_destroy(&xnfp->xnf_cv_multicast);
+	cv_destroy(&xnfp->xnf_cv_state);
+
+	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
+	kmem_cache_destroy(xnfp->xnf_buf_cache);
+
+	mutex_destroy(&xnfp->xnf_gref_lock);
+	mutex_destroy(&xnfp->xnf_schedlock);
+	mutex_destroy(&xnfp->xnf_rxlock);
 	mutex_destroy(&xnfp->xnf_txlock);
-	mutex_destroy(&xnfp->xnf_intrlock);
 
 	kmem_free(xnfp, sizeof (*xnfp));
 
@@ -935,24 +1093,13 @@
 /*
  *  xnf_set_mac_addr() -- set the physical network address on the board.
  */
-/*ARGSUSED*/
 static int
 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
 {
-	xnf_t *xnfp = arg;
-
-#ifdef XNF_DEBUG
-	if (xnfdebug & XNF_DEBUG_TRACE)
-		printf("xnf%d: set_mac_addr(0x%p): "
-		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
-		    ddi_get_instance(xnfp->xnf_devinfo),
-		    (void *)xnfp, macaddr[0], macaddr[1], macaddr[2],
-		    macaddr[3], macaddr[4], macaddr[5]);
-#endif
+	_NOTE(ARGUNUSED(arg, macaddr));
+
 	/*
 	 * We can't set our macaddr.
-	 *
-	 * XXPV dme: Why not?
 	 */
 	return (ENOTSUP);
 }
@@ -961,33 +1108,113 @@
  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
  *
  *  Program the hardware to enable/disable the multicast address
- *  in "mcast".  Enable if "add" is true, disable if false.
+ *  in "mca".  Enable if "add" is true, disable if false.
  */
-/*ARGSUSED*/
 static int
 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
 {
 	xnf_t *xnfp = arg;
-
-#ifdef XNF_DEBUG
-	if (xnfdebug & XNF_DEBUG_TRACE)
-		printf("xnf%d set_multicast(0x%p): "
-		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
-		    ddi_get_instance(xnfp->xnf_devinfo),
-		    (void *)xnfp, mca[0], mca[1], mca[2],
-		    mca[3], mca[4], mca[5]);
-#endif
+	xnf_txbuf_t *txp;
+	int n_slots;
+	RING_IDX slot;
+	xnf_txid_t *tidp;
+	netif_tx_request_t *txrp;
+	struct netif_extra_info *erp;
+	boolean_t notify, result;
+
+	/*
+	 * If the backend does not support multicast control then we
+	 * must assume that the right packets will just arrive.
+	 */
+	if (!xnfp->xnf_be_mcast_control)
+		return (0);
+
+	txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
+	if (txp == NULL)
+		return (1);
+
+	mutex_enter(&xnfp->xnf_txlock);
+
+	/*
+	 * If we're not yet connected then claim success. This is
+	 * acceptable because we refresh the entire set of multicast
+	 * addresses when we get connected.
+	 *
+	 * We can't wait around here because the MAC layer expects
+	 * this to be a non-blocking operation - waiting ends up
+	 * causing a deadlock during resume.
+	 */
+	if (!xnfp->xnf_connected) {
+		mutex_exit(&xnfp->xnf_txlock);
+		return (0);
+	}
 
 	/*
-	 * XXPV dme: Ideally we'd relay the address to the backend for
-	 * enabling.  The protocol doesn't support that (interesting
-	 * extension), so we simply succeed and hope that the relevant
-	 * packets are going to arrive.
-	 *
-	 * If protocol support is added for enable/disable then we'll
-	 * need to keep a list of those in use and re-add on resume.
+	 * 1. Acquire two slots in the ring.
+	 * 2. Fill in the slots.
+	 * 3. Request notification when the operation is done.
+	 * 4. Kick the peer.
+	 * 5. Wait for the response via xnf_tx_clean_ring().
 	 */
-	return (0);
+
+	n_slots = tx_slots_get(xnfp, 2, B_TRUE);
+	ASSERT(n_slots >= 2);
+
+	slot = xnfp->xnf_tx_ring.req_prod_pvt;
+	tidp = txid_get(xnfp);
+	VERIFY(tidp != NULL);
+
+	txp->tx_type = TX_MCAST_REQ;
+	txp->tx_slot = slot;
+
+	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
+	erp = (struct netif_extra_info *)
+	    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
+
+	txrp->gref = 0;
+	txrp->size = 0;
+	txrp->offset = 0;
+	/* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
+	txrp->id = txp->tx_txreq.id = tidp->id;
+	txrp->flags = NETTXF_extra_info;
+
+	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
+	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
+	bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
+
+	tidp->txbuf = txp;
+
+	xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
+
+	mutex_enter(&xnfp->xnf_schedlock);
+	xnfp->xnf_pending_multicast++;
+	mutex_exit(&xnfp->xnf_schedlock);
+
+	/* LINTED: constant in conditional context */
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
+	    notify);
+	if (notify)
+		ec_notify_via_evtchn(xnfp->xnf_evtchn);
+
+	while (txp->tx_type == TX_MCAST_REQ)
+		cv_wait(&xnfp->xnf_cv_multicast,
+		    &xnfp->xnf_txlock);
+
+	ASSERT(txp->tx_type == TX_MCAST_RSP);
+
+	mutex_enter(&xnfp->xnf_schedlock);
+	xnfp->xnf_pending_multicast--;
+	mutex_exit(&xnfp->xnf_schedlock);
+
+	result = (txp->tx_status == NETIF_RSP_OKAY);
+
+	txid_put(xnfp, tidp);
+
+	mutex_exit(&xnfp->xnf_txlock);
+
+	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+
+	return (result ? 0 : 1);
 }
 
 /*
@@ -995,18 +1222,11 @@
  *
  *  Program the hardware to enable/disable promiscuous mode.
  */
-/*ARGSUSED*/
 static int
 xnf_set_promiscuous(void *arg, boolean_t on)
 {
-	xnf_t *xnfp = arg;
-
-#ifdef XNF_DEBUG
-	if (xnfdebug & XNF_DEBUG_TRACE)
-		printf("xnf%d set_promiscuous(0x%p, %x)\n",
-		    ddi_get_instance(xnfp->xnf_devinfo),
-		    (void *)xnfp, on);
-#endif
+	_NOTE(ARGUNUSED(arg, on));
+
 	/*
 	 * We can't really do this, but we pretend that we can in
 	 * order that snoop will work.
@@ -1018,51 +1238,88 @@
  * Clean buffers that we have responses for from the transmit ring.
  */
 static int
-xnf_clean_tx_ring(xnf_t *xnfp)
+xnf_tx_clean_ring(xnf_t *xnfp)
 {
-	RING_IDX		next_resp, i;
-	struct tx_pktinfo	*reap;
-	int			id;
-	grant_ref_t		ref;
-	boolean_t		work_to_do;
+	boolean_t work_to_do;
 
 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 
 loop:
 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
-		/*
-		 * index of next transmission ack
-		 */
-		next_resp = xnfp->xnf_tx_ring.sring->rsp_prod;
+		RING_IDX cons, prod, i;
+
+		cons = xnfp->xnf_tx_ring.rsp_cons;
+		prod = xnfp->xnf_tx_ring.sring->rsp_prod;
 		membar_consumer();
 		/*
-		 * Clean tx packets from ring that we have responses for
+		 * Clean tx requests from ring that we have responses
+		 * for.
 		 */
-		for (i = xnfp->xnf_tx_ring.rsp_cons; i != next_resp; i++) {
-			id = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i)->id;
-			reap = &xnfp->xnf_tx_pkt_info[id];
-			ref = reap->grant_ref;
-			/*
-			 * Return id to free list
-			 */
-			reap->id = xnfp->xnf_tx_pkt_id_list;
-			xnfp->xnf_tx_pkt_id_list = id;
-			if (gnttab_query_foreign_access(ref) != 0)
-				panic("tx grant still in use "
-				    "by backend domain");
-			(void) ddi_dma_unbind_handle(reap->dma_handle);
-			(void) gnttab_end_foreign_access_ref(ref,
-			    xnfp->xnf_tx_pages_readonly);
-			gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
-			    ref);
-			freemsg(reap->mp);
-			reap->mp = NULL;
-			reap->grant_ref = GRANT_INVALID_REF;
-			if (reap->bdesc != NULL)
-				xnf_free_tx_buffer(reap->bdesc);
-			reap->bdesc = NULL;
+		DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
+		for (i = cons; i != prod; i++) {
+			netif_tx_response_t *trp;
+			xnf_txid_t *tidp;
+			xnf_txbuf_t *txp;
+
+			trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
+			ASSERT(TX_ID_VALID(trp->id));
+
+			tidp = TX_ID_TO_TXID(xnfp, trp->id);
+			ASSERT(tidp->id == trp->id);
+			ASSERT(tidp->next == INVALID_TX_ID);
+
+			txp = tidp->txbuf;
+			ASSERT(txp != NULL);
+			ASSERT(txp->tx_txreq.id == trp->id);
+
+			switch (txp->tx_type) {
+			case TX_DATA:
+				if (gnttab_query_foreign_access(
+				    txp->tx_txreq.gref) != 0)
+					cmn_err(CE_PANIC,
+					    "tx grant %d still in use by "
+					    "backend domain",
+					    txp->tx_txreq.gref);
+
+				if (txp->tx_bdesc == NULL) {
+					(void) gnttab_end_foreign_access_ref(
+					    txp->tx_txreq.gref, 1);
+					gref_put(xnfp, txp->tx_txreq.gref);
+					(void) ddi_dma_unbind_handle(
+					    txp->tx_dma_handle);
+				} else {
+					xnf_buf_put(xnfp, txp->tx_bdesc,
+					    B_TRUE);
+				}
+
+				freemsg(txp->tx_mp);
+				txid_put(xnfp, tidp);
+				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+
+				break;
+
+			case TX_MCAST_REQ:
+				txp->tx_type = TX_MCAST_RSP;
+				txp->tx_status = trp->status;
+				cv_broadcast(&xnfp->xnf_cv_multicast);
+
+				break;
+
+			case TX_MCAST_RSP:
+				break;
+
+			default:
+				cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
+				    "invalid xnf_txbuf_t type: %d",
+				    txp->tx_type);
+				break;
+			}
 		}
-		xnfp->xnf_tx_ring.rsp_cons = next_resp;
+		/*
+		 * Record the last response we dealt with so that we
+		 * know where to start next time around.
+		 */
+		xnfp->xnf_tx_ring.rsp_cons = prod;
 		membar_enter();
 	}
 
@@ -1075,40 +1332,40 @@
 }
 
 /*
- * If we need to pull up data from either a packet that crosses a page
- * boundary or consisting of multiple mblks, do it here.  We allocate
- * a page aligned buffer and copy the data into it.  The header for the
- * allocated buffer is returned. (which is also allocated here)
+ * Allocate and fill in a look-aside buffer for the packet `mp'. Used
+ * to ensure that the packet is physically contiguous and contained
+ * within a single page.
  */
-static struct xnf_buffer_desc *
-xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp)
+static xnf_buf_t *
+xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
 {
-	struct xnf_buffer_desc	*bdesc;
-	mblk_t			*mptr;
-	caddr_t			bp;
-	int			len;
-
-	/*
-	 * get a xmit buffer from the xmit buffer pool
-	 */
-	mutex_enter(&xnfp->xnf_rx_buf_mutex);
-	bdesc = xnf_get_tx_buffer(xnfp);
-	mutex_exit(&xnfp->xnf_rx_buf_mutex);
-	if (bdesc == NULL)
-		return (bdesc);
-	/*
-	 * Copy the data into the buffer
-	 */
+	xnf_buf_t *bd;
+	caddr_t bp;
+
+	bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
+	if (bd == NULL)
+		return (NULL);
+
+	bp = bd->buf;
+	while (mp != NULL) {
+		size_t len = MBLKL(mp);
+
+		bcopy(mp->b_rptr, bp, len);
+		bp += len;
+
+		mp = mp->b_cont;
+	}
+
+	ASSERT((bp - bd->buf) <= PAGESIZE);
+
 	xnfp->xnf_stat_tx_pullup++;
-	bp = bdesc->buf;
-	for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) {
-		len = mptr->b_wptr - mptr->b_rptr;
-		bcopy(mptr->b_rptr, bp, len);
-		bp += len;
-	}
-	return (bdesc);
+
+	return (bd);
 }
 
+/*
+ * Insert the pseudo-header checksum into the packet `buf'.
+ */
 void
 xnf_pseudo_cksum(caddr_t buf, int length)
 {
@@ -1179,280 +1436,419 @@
 }
 
 /*
- *  xnf_send_one() -- send a packet
- *
- *  Called when a packet is ready to be transmitted. A pointer to an
- *  M_DATA message that contains the packet is passed to this routine.
- *  At least the complete LLC header is contained in the message's
- *  first message block, and the remainder of the packet is contained
- *  within additional M_DATA message blocks linked to the first
- *  message block.
- *
+ * Push a list of prepared packets (`txp') into the transmit ring.
  */
-static boolean_t
-xnf_send_one(xnf_t *xnfp, mblk_t *mp)
+static xnf_txbuf_t *
+tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
+{
+	int slots_free;
+	RING_IDX slot;
+	boolean_t notify;
+
+	mutex_enter(&xnfp->xnf_txlock);
+
+	ASSERT(xnfp->xnf_running);
+
+	/*
+	 * Wait until we are connected to the backend.
+	 */
+	while (!xnfp->xnf_connected)
+		cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
+
+	slots_free = tx_slots_get(xnfp, 1, B_FALSE);
+	DTRACE_PROBE1(xnf_send_slotsfree, int, slots_free);
+
+	slot = xnfp->xnf_tx_ring.req_prod_pvt;
+
+	while ((txp != NULL) && (slots_free > 0)) {
+		xnf_txid_t *tidp;
+		netif_tx_request_t *txrp;
+
+		tidp = txid_get(xnfp);
+		VERIFY(tidp != NULL);
+
+		txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
+
+		txp->tx_slot = slot;
+		txp->tx_txreq.id = tidp->id;
+		*txrp = txp->tx_txreq;
+
+		tidp->txbuf = txp;
+
+		xnfp->xnf_stat_opackets++;
+		xnfp->xnf_stat_obytes += txp->tx_txreq.size;
+
+		txp = txp->tx_next;
+		slots_free--;
+		slot++;
+
+	}
+
+	xnfp->xnf_tx_ring.req_prod_pvt = slot;
+
+	/*
+	 * Tell the peer that we sent something, if it cares.
+	 */
+	/* LINTED: constant in conditional context */
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
+	    notify);
+	if (notify)
+		ec_notify_via_evtchn(xnfp->xnf_evtchn);
+
+	mutex_exit(&xnfp->xnf_txlock);
+
+	return (txp);
+}
+
+/*
+ * Send the chain of packets `mp'. Called by the MAC framework.
+ */
+static mblk_t *
+xnf_send(void *arg, mblk_t *mp)
 {
-	struct xnf_buffer_desc	*xmitbuf;
-	struct tx_pktinfo	*txp_info;
-	mblk_t			*mptr;
-	ddi_dma_cookie_t	dma_cookie;
-	RING_IDX		slot;
-	int			length = 0, i, pktlen = 0, rc, tx_id;
-	int			tx_ring_freespace, page_oops;
-	uint_t			ncookies;
-	volatile netif_tx_request_t	*txrp;
-	caddr_t			bufaddr;
-	grant_ref_t		ref;
-	unsigned long		mfn;
-	uint32_t		pflags;
-	domid_t			oeid;
+	xnf_t *xnfp = arg;
+	domid_t oeid;
+	xnf_txbuf_t *head, *tail;
+	mblk_t *ml;
+	int prepared;
+
+	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
+
+	/*
+	 * Prepare packets for transmission.
+	 */
+	head = tail = NULL;
+	prepared = 0;
+	while (mp != NULL) {
+		xnf_txbuf_t *txp;
+		int n_chunks, length;
+		boolean_t page_oops;
+		uint32_t pflags;
+
+		for (ml = mp, n_chunks = length = 0, page_oops = B_FALSE;
+		    ml != NULL;
+		    ml = ml->b_cont, n_chunks++) {
+
+			/*
+			 * Test if this buffer includes a page
+			 * boundary. The test assumes that the range
+			 * b_rptr...b_wptr can include only a single
+			 * boundary.
+			 */
+			if (xnf_btop((size_t)ml->b_rptr) !=
+			    xnf_btop((size_t)ml->b_wptr)) {
+				xnfp->xnf_stat_tx_pagebndry++;
+				page_oops = B_TRUE;
+			}
+
+			length += MBLKL(ml);
+		}
+		DTRACE_PROBE1(xnf_send_b_cont, int, n_chunks);
+
+		/*
+		 * Make sure packet isn't too large.
+		 */
+		if (length > XNF_FRAMESIZE) {
+			cmn_err(CE_WARN,
+			    "xnf%d: oversized packet (%d bytes) dropped",
+			    ddi_get_instance(xnfp->xnf_devinfo), length);
+			freemsg(mp);
+			continue;
+		}
+
+		txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
+		if (txp == NULL)
+			break;
+
+		txp->tx_type = TX_DATA;
+
+		if ((n_chunks > xnf_max_tx_frags) || page_oops) {
+			/*
+			 * Loan a side buffer rather than the mblk
+			 * itself.
+			 */
+			txp->tx_bdesc = xnf_tx_pullup(xnfp, mp);
+			if (txp->tx_bdesc == NULL) {
+				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+				break;
+			}
+
+			txp->tx_bufp = txp->tx_bdesc->buf;
+			txp->tx_mfn = txp->tx_bdesc->buf_mfn;
+			txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
+
+		} else {
+			int rc;
+			ddi_dma_cookie_t dma_cookie;
+			uint_t ncookies;
+
+			rc = ddi_dma_addr_bind_handle(txp->tx_dma_handle,
+			    NULL, (char *)mp->b_rptr, length,
+			    DDI_DMA_WRITE | DDI_DMA_STREAMING,
+			    DDI_DMA_DONTWAIT, 0, &dma_cookie,
+			    &ncookies);
+			if (rc != DDI_DMA_MAPPED) {
+				ASSERT(rc != DDI_DMA_INUSE);
+				ASSERT(rc != DDI_DMA_PARTIAL_MAP);
 
 #ifdef XNF_DEBUG
-	if (xnfdebug & XNF_DEBUG_SEND)
-		printf("xnf%d send(0x%p, 0x%p)\n",
-		    ddi_get_instance(xnfp->xnf_devinfo),
-		    (void *)xnfp, (void *)mp);
+				if (rc != DDI_DMA_NORESOURCES)
+					cmn_err(CE_WARN,
+					    "xnf%d: bind_handle failed (%x)",
+					    ddi_get_instance(xnfp->xnf_devinfo),
+					    rc);
 #endif
-
-	ASSERT(mp != NULL);
-	ASSERT(mp->b_next == NULL);
-	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
-
-	tx_ring_freespace = xnf_clean_tx_ring(xnfp);
-	ASSERT(tx_ring_freespace >= 0);
-
-	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
-	xnfp->xnf_stat_tx_attempt++;
-	/*
-	 * If there are no xmit ring slots available, return.
-	 */
-	if (tx_ring_freespace == 0) {
-		xnfp->xnf_stat_tx_defer++;
-		return (B_FALSE);	/* Send should be retried */
+				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+				break;
+			}
+			ASSERT(ncookies == 1);
+
+			txp->tx_bdesc = NULL;
+			txp->tx_bufp = (caddr_t)mp->b_rptr;
+			txp->tx_mfn =
+			    xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
+			txp->tx_txreq.gref = gref_get(xnfp);
+			if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
+				(void) ddi_dma_unbind_handle(
+				    txp->tx_dma_handle);
+				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+				break;
+			}
+			gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
+			    oeid, txp->tx_mfn, 1);
+		}
+
+		txp->tx_next = NULL;
+		txp->tx_mp = mp;
+		txp->tx_txreq.size = length;
+		txp->tx_txreq.offset = (uintptr_t)txp->tx_bufp & PAGEOFFSET;
+		txp->tx_txreq.flags = 0;
+		hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL,
+		    &pflags);
+		if (pflags != 0) {
+			/*
+			 * If the local protocol stack requests checksum
+			 * offload we set the 'checksum blank' flag,
+			 * indicating to the peer that we need the checksum
+			 * calculated for us.
+			 *
+			 * We _don't_ set the validated flag, because we haven't
+			 * validated that the data and the checksum match.
+			 */
+			xnf_pseudo_cksum(txp->tx_bufp, length);
+			txp->tx_txreq.flags |= NETTXF_csum_blank;
+
+			xnfp->xnf_stat_tx_cksum_deferred++;
+		}
+
+		if (head == NULL) {
+			ASSERT(tail == NULL);
+
+			head = txp;
+		} else {
+			ASSERT(tail != NULL);
+
+			tail->tx_next = txp;
+		}
+		tail = txp;
+
+		mp = mp->b_next;
+		prepared++;
+
+		/*
+		 * There is no point in preparing more than
+		 * NET_TX_RING_SIZE, as we won't be able to push them
+		 * into the ring in one go and would hence have to
+		 * un-prepare the extra.
+		 */
+		if (prepared == NET_TX_RING_SIZE)
+			break;
 	}
 
-	slot = xnfp->xnf_tx_ring.req_prod_pvt;
-	/* Count the number of mblks in message and compute packet size */
-	for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++)
-		pktlen += (mptr->b_wptr - mptr->b_rptr);
-
-	/* Make sure packet isn't too large */
-	if (pktlen > XNF_FRAMESIZE) {
-		cmn_err(CE_WARN, "xnf%d: oversized packet (%d bytes) dropped",
-		    ddi_get_instance(xnfp->xnf_devinfo), pktlen);
-		freemsg(mp);
-		return (B_TRUE);
+	DTRACE_PROBE1(xnf_send_prepared, int, prepared);
+
+	if (mp != NULL) {
+#ifdef XNF_DEBUG
+		int notprepared = 0;
+		mblk_t *l = mp;
+
+		while (l != NULL) {
+			notprepared++;
+			l = l->b_next;
+		}
+
+		DTRACE_PROBE1(xnf_send_notprepared, int, notprepared);
+#else /* !XNF_DEBUG */
+		DTRACE_PROBE1(xnf_send_notprepared, int, -1);
+#endif /* XNF_DEBUG */
 	}
 
 	/*
-	 * Test if we cross a page boundary with our buffer
-	 */
-	page_oops = (i == 1) &&
-	    (xnf_btop((size_t)mp->b_rptr) !=
-	    xnf_btop((size_t)(mp->b_rptr + pktlen)));
-	/*
-	 * XXPV - unfortunately, the Xen virtual net device currently
-	 * doesn't support multiple packet frags, so this will always
-	 * end up doing the pullup if we got more than one packet.
+	 * Push the packets we have prepared into the ring. They may
+	 * not all go.
 	 */
-	if (i > xnf_max_tx_frags || page_oops) {
-		if (page_oops)
-			xnfp->xnf_stat_tx_pagebndry++;
-		if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) {
-			/* could not allocate resources? */
-#ifdef XNF_DEBUG
-			cmn_err(CE_WARN, "xnf%d: pullupmsg failed",
-			    ddi_get_instance(xnfp->xnf_devinfo));
-#endif
-			xnfp->xnf_stat_tx_defer++;
-			return (B_FALSE);	/* Retry send */
-		}
-		bufaddr = xmitbuf->buf;
-	} else {
-		xmitbuf = NULL;
-		bufaddr = (caddr_t)mp->b_rptr;
-	}
-
-	/* set up data descriptor */
-	length = pktlen;
+	if (head != NULL)
+		head = tx_push_packets(xnfp, head);
 
 	/*
-	 * Get packet id from free list
+	 * If some packets that we prepared were not sent, unprepare
+	 * them and add them back to the head of those we didn't
+	 * prepare.
 	 */
-	tx_id = xnfp->xnf_tx_pkt_id_list;
-	ASSERT(tx_id < NET_TX_RING_SIZE);
-	txp_info = &xnfp->xnf_tx_pkt_info[tx_id];
-	xnfp->xnf_tx_pkt_id_list = txp_info->id;
-	txp_info->id = tx_id;
-
-	/* Prepare for DMA mapping of tx buffer(s) */
-	rc = ddi_dma_addr_bind_handle(txp_info->dma_handle,
-	    NULL, bufaddr, length, DDI_DMA_WRITE | DDI_DMA_STREAMING,
-	    DDI_DMA_DONTWAIT, 0, &dma_cookie, &ncookies);
-	if (rc != DDI_DMA_MAPPED) {
-		ASSERT(rc != DDI_DMA_INUSE);
-		ASSERT(rc != DDI_DMA_PARTIAL_MAP);
-		/*
-		 *  Return id to free list
-		 */
-		txp_info->id = xnfp->xnf_tx_pkt_id_list;
-		xnfp->xnf_tx_pkt_id_list = tx_id;
-		if (rc == DDI_DMA_NORESOURCES) {
-			xnfp->xnf_stat_tx_defer++;
-			return (B_FALSE); /* Retry later */
+	{
+		xnf_txbuf_t *loop;
+		mblk_t *mp_head, *mp_tail;
+		int unprepared = 0;
+
+		mp_head = mp_tail = NULL;
+		loop = head;
+
+		while (loop != NULL) {
+			xnf_txbuf_t *next = loop->tx_next;
+
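+			/*
+			 * A NULL tx_bdesc indicates that the packet
+			 * data was granted to the peer directly;
+			 * otherwise a look-aside buffer was used.
+			 */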
+			if (loop->tx_bdesc == NULL) {
+				(void) gnttab_end_foreign_access_ref(
+				    loop->tx_txreq.gref, 1);
+				gref_put(xnfp, loop->tx_txreq.gref);
+				(void) ddi_dma_unbind_handle(
+				    loop->tx_dma_handle);
+			} else {
+				xnf_buf_put(xnfp, loop->tx_bdesc, B_TRUE);
+			}
+
+			ASSERT(loop->tx_mp != NULL);
+			if (mp_head == NULL)
+				mp_head = loop->tx_mp;
+			mp_tail = loop->tx_mp;
+
+			kmem_cache_free(xnfp->xnf_tx_buf_cache, loop);
+			loop = next;
+			unprepared++;
 		}
-#ifdef XNF_DEBUG
-		cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)",
-		    ddi_get_instance(xnfp->xnf_devinfo), rc);
-#endif
-		return (B_FALSE);
+
+		if (mp_tail == NULL) {
+			ASSERT(mp_head == NULL);
+		} else {
+			ASSERT(mp_head != NULL);
+
+			mp_tail->b_next = mp;
+			mp = mp_head;
+		}
+
+		DTRACE_PROBE1(xnf_send_unprepared, int, unprepared);
 	}
 
-	ASSERT(ncookies == 1);
-	ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_tx_head);
-	ASSERT((signed short)ref >= 0);
-	mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress));
-	gnttab_grant_foreign_access_ref(ref, oeid, mfn,
-	    xnfp->xnf_tx_pages_readonly);
-	txp_info->grant_ref = ref;
-	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
-	txrp->gref = ref;
-	txrp->size = dma_cookie.dmac_size;
-	txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET;
-	txrp->id = tx_id;
-	txrp->flags = 0;
-	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags);
-	if (pflags != 0) {
-		ASSERT(xnfp->xnf_cksum_offload);
-		/*
-		 * If the local protocol stack requests checksum
-		 * offload we set the 'checksum blank' flag,
-		 * indicating to the peer that we need the checksum
-		 * calculated for us.
-		 *
-		 * We _don't_ set the validated flag, because we haven't
-		 * validated that the data and the checksum match.
-		 */
-		xnf_pseudo_cksum(bufaddr, length);
-		txrp->flags |= NETTXF_csum_blank;
-		xnfp->xnf_stat_tx_cksum_deferred++;
-	}
-	membar_producer();
-	xnfp->xnf_tx_ring.req_prod_pvt = slot + 1;
-
-	txp_info->mp = mp;
-	txp_info->bdesc = xmitbuf;
-
-	xnfp->xnf_stat_opackets++;
-	xnfp->xnf_stat_obytes += pktlen;
-
-	return (B_TRUE);	/* successful transmit attempt */
-}
-
-mblk_t *
-xnf_send(void *arg, mblk_t *mp)
-{
-	xnf_t *xnfp = arg;
-	mblk_t *next;
-	boolean_t sent_something = B_FALSE;
-
-	mutex_enter(&xnfp->xnf_txlock);
-
 	/*
-	 * Transmission attempts should be impossible without having
-	 * previously called xnf_start().
+	 * If any mblks are left then we have deferred for some reason
+	 * and need to ask for a re-schedule later. This is typically
+	 * due to the ring filling.
 	 */
-	ASSERT(xnfp->xnf_running);
-
-	/*
-	 * Wait for getting connected to the backend
-	 */
-	while (!xnfp->xnf_connected) {
-		cv_wait(&xnfp->xnf_cv, &xnfp->xnf_txlock);
+	if (mp != NULL) {
+		mutex_enter(&xnfp->xnf_schedlock);
+		xnfp->xnf_need_sched = B_TRUE;
+		mutex_exit(&xnfp->xnf_schedlock);
+
+		xnfp->xnf_stat_tx_defer++;
 	}
 
-	while (mp != NULL) {
-		next = mp->b_next;
-		mp->b_next = NULL;
-
-		if (!xnf_send_one(xnfp, mp)) {
-			mp->b_next = next;
-			break;
-		}
-
-		mp = next;
-		sent_something = B_TRUE;
-	}
-
-	if (sent_something) {
-		boolean_t notify;
-
-		/* LINTED: constant in conditional context */
-		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
-		    notify);
-		if (notify)
-			ec_notify_via_evtchn(xnfp->xnf_evtchn);
-	}
-
-	if (mp != NULL)
-		xnfp->xnf_need_sched = B_TRUE;
-
-	mutex_exit(&xnfp->xnf_txlock);
-
 	return (mp);
 }
 
 /*
- *  xnf_intr() -- ring interrupt service routine
+ * Notification of RX packets. Currently no TX-complete interrupt is
+ * used, as we clean the TX ring lazily.
  */
 static uint_t
 xnf_intr(caddr_t arg)
 {
 	xnf_t *xnfp = (xnf_t *)arg;
-	boolean_t sched = B_FALSE;
-
-	mutex_enter(&xnfp->xnf_intrlock);
-
-	/* spurious intr */
+	mblk_t *mp;
+	boolean_t need_sched, clean_ring;
+
+	mutex_enter(&xnfp->xnf_rxlock);
+
+	/*
+	 * Interrupts before we are connected are spurious.
+	 */
 	if (!xnfp->xnf_connected) {
-		mutex_exit(&xnfp->xnf_intrlock);
+		mutex_exit(&xnfp->xnf_rxlock);
 		xnfp->xnf_stat_unclaimed_interrupts++;
 		return (DDI_INTR_UNCLAIMED);
 	}
 
-#ifdef XNF_DEBUG
-	if (xnfdebug & XNF_DEBUG_INT)
-		printf("xnf%d intr(0x%p)\n",
-		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
-#endif
-	if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
-		mblk_t *mp;
-
-		if (xnfp->xnf_rx_hvcopy)
-			mp = xnf_process_hvcopy_recv(xnfp);
-		else
-			mp = xnf_process_recv(xnfp);
-
-		if (mp != NULL)
-			mac_rx(xnfp->xnf_mh, NULL, mp);
+	/*
+	 * Receive side processing.
+	 */
+	do {
+		/*
+		 * Collect buffers from the ring.
+		 */
+		xnf_rx_collect(xnfp);
+
+		/*
+		 * Interrupt me when the next receive buffer is consumed.
+		 */
+		xnfp->xnf_rx_ring.sring->rsp_event =
+		    xnfp->xnf_rx_ring.rsp_cons + 1;
+		xen_mb();
+
+	} while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
+
+	if (xnfp->xnf_rx_new_buffers_posted) {
+		boolean_t notify;
+
+		/*
+		 * Indicate to the peer that we have re-filled the
+		 * receive ring, if it cares.
+		 */
+		/* LINTED: constant in conditional context */
+		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
+		if (notify)
+			ec_notify_via_evtchn(xnfp->xnf_evtchn);
+		xnfp->xnf_rx_new_buffers_posted = B_FALSE;
 	}
 
+	mp = xnfp->xnf_rx_head;
+	xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
+
 	xnfp->xnf_stat_interrupts++;
-	mutex_exit(&xnfp->xnf_intrlock);
+	mutex_exit(&xnfp->xnf_rxlock);
+
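+	/*
+	 * Pass the packets collected by xnf_rx_collect() up the
+	 * stack, now that xnf_rxlock has been dropped.
+	 */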
+	if (mp != NULL)
+		mac_rx(xnfp->xnf_mh, NULL, mp);
 
 	/*
-	 * Clean tx ring and try to start any blocked xmit streams if
-	 * there is now some space.
+	 * Transmit side processing.
+	 *
+	 * If a previous transmit attempt failed or we have pending
+	 * multicast requests, clean the ring.
+	 *
+	 * If we previously stalled transmission and cleaning produces
+	 * some free slots, tell upstream to attempt sending again.
+	 *
+	 * The odd style is to avoid acquiring xnf_txlock unless we
+	 * will actually look inside the tx machinery.
 	 */
-	mutex_enter(&xnfp->xnf_txlock);
-	if (xnf_clean_tx_ring(xnfp) > 0) {
-		sched = xnfp->xnf_need_sched;
-		xnfp->xnf_need_sched = B_FALSE;
+	mutex_enter(&xnfp->xnf_schedlock);
+	need_sched = xnfp->xnf_need_sched;
+	clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
+	mutex_exit(&xnfp->xnf_schedlock);
+
+	if (clean_ring) {
+		int free_slots;
+
+		mutex_enter(&xnfp->xnf_txlock);
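+		/*
+		 * Cleaning the ring is a side effect of asking
+		 * tx_slots_get() for the count of free slots.
+		 */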
+		free_slots = tx_slots_get(xnfp, 0, B_FALSE);
+
+		if (need_sched && (free_slots > 0)) {
+			mutex_enter(&xnfp->xnf_schedlock);
+			xnfp->xnf_need_sched = B_FALSE;
+			mutex_exit(&xnfp->xnf_schedlock);
+
+			mac_tx_update(xnfp->xnf_mh);
+		}
+		mutex_exit(&xnfp->xnf_txlock);
 	}
-	mutex_exit(&xnfp->xnf_txlock);
-
-	if (sched)
-		mac_tx_update(xnfp->xnf_mh);
 
 	return (DDI_INTR_CLAIMED);
 }
@@ -1466,19 +1862,19 @@
 	xnf_t *xnfp = arg;
 
 #ifdef XNF_DEBUG
-	if (xnfdebug & XNF_DEBUG_TRACE)
+	if (xnf_debug & XNF_DEBUG_TRACE)
 		printf("xnf%d start(0x%p)\n",
 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
 #endif
 
-	mutex_enter(&xnfp->xnf_intrlock);
+	mutex_enter(&xnfp->xnf_rxlock);
 	mutex_enter(&xnfp->xnf_txlock);
 
 	/* Accept packets from above. */
 	xnfp->xnf_running = B_TRUE;
 
 	mutex_exit(&xnfp->xnf_txlock);
-	mutex_exit(&xnfp->xnf_intrlock);
+	mutex_exit(&xnfp->xnf_rxlock);
 
 	return (0);
 }
@@ -1490,389 +1886,217 @@
 	xnf_t *xnfp = arg;
 
 #ifdef XNF_DEBUG
-	if (xnfdebug & XNF_DEBUG_TRACE)
+	if (xnf_debug & XNF_DEBUG_TRACE)
 		printf("xnf%d stop(0x%p)\n",
 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
 #endif
 
-	mutex_enter(&xnfp->xnf_intrlock);
+	mutex_enter(&xnfp->xnf_rxlock);
 	mutex_enter(&xnfp->xnf_txlock);
 
 	xnfp->xnf_running = B_FALSE;
 
 	mutex_exit(&xnfp->xnf_txlock);
-	mutex_exit(&xnfp->xnf_intrlock);
+	mutex_exit(&xnfp->xnf_rxlock);
 }
 
 /*
- * Driver private functions follow
- */
-
-/*
- * Hang buffer on rx ring
+ * Hang buffer `bdesc' on the RX ring.
  */
 static void
-rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc)
+xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
 {
-	volatile netif_rx_request_t	*reqp;
-	RING_IDX			hang_ix;
-	grant_ref_t			ref;
-	domid_t				oeid;
-
-	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
-
-	ASSERT(MUTEX_HELD(&xnfp->xnf_intrlock));
+	netif_rx_request_t *reqp;
+	RING_IDX hang_ix;
+
+	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
+
 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
 	    xnfp->xnf_rx_ring.req_prod_pvt);
 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
-	ASSERT(xnfp->xnf_rxpkt_bufptr[hang_ix] == NULL);
-	if (bdesc->grant_ref == GRANT_INVALID_REF) {
-		ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_rx_head);
-		ASSERT((signed short)ref >= 0);
-		bdesc->grant_ref = ref;
-		if (xnfp->xnf_rx_hvcopy) {
-			pfn_t pfn = xnf_btop(bdesc->buf_phys);
-			mfn_t mfn = pfn_to_mfn(pfn);
-
-			gnttab_grant_foreign_access_ref(ref, oeid, mfn, 0);
-		} else {
-			gnttab_grant_foreign_transfer_ref(ref, oeid, 0);
-		}
-	}
-	reqp->id = hang_ix;
+	ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
+
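+	/*
+	 * Use the slot index as the buffer id, allowing the
+	 * response to be matched back to the buffer in
+	 * xnf_rx_collect().
+	 */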
+	reqp->id = bdesc->id = hang_ix;
 	reqp->gref = bdesc->grant_ref;
-	bdesc->id = hang_ix;
-	xnfp->xnf_rxpkt_bufptr[hang_ix] = bdesc;
-	membar_producer();
+
+	xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
 	xnfp->xnf_rx_ring.req_prod_pvt++;
+
+	xnfp->xnf_rx_new_buffers_posted = B_TRUE;
 }
 
-static mblk_t *
-xnf_process_hvcopy_recv(xnf_t *xnfp)
+/*
+ * Collect packets from the RX ring, storing them in `xnfp' for later
+ * use.
+ */
+static void
+xnf_rx_collect(xnf_t *xnfp)
 {
-	netif_rx_response_t *rxpkt;
-	mblk_t		*mp, *head, *tail;
-	struct		xnf_buffer_desc *bdesc;
-	boolean_t	hwcsum = B_FALSE, notify, work_to_do;
-	size_t 		len;
+	mblk_t *head, *tail;
+
+	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
 
 	/*
-	 * in loop over unconsumed responses, we do:
+	 * Loop over unconsumed responses:
 	 * 1. get a response
 	 * 2. take corresponding buffer off recv. ring
 	 * 3. indicate this by setting slot to NULL
 	 * 4. create a new message and
 	 * 5. copy data in, adjust ptr
-	 *
-	 * outside loop:
-	 * 7. make sure no more data has arrived; kick HV
 	 */
 
 	head = tail = NULL;
 
-loop:
 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
+		netif_rx_response_t *rxpkt;
+		xnf_buf_t *bdesc;
+		ssize_t len;
+		size_t off;
+		mblk_t *mp = NULL;
+		boolean_t hwcsum = B_FALSE;
+		grant_ref_t ref;
 
 		/* 1. */
 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
 		    xnfp->xnf_rx_ring.rsp_cons);
 
-		DTRACE_PROBE4(got_PKT, int, (int)rxpkt->id, int,
-		    (int)rxpkt->offset,
-		    int, (int)rxpkt->flags, int, (int)rxpkt->status);
+		DTRACE_PROBE4(xnf_rx_got_rsp, int, (int)rxpkt->id,
+		    int, (int)rxpkt->offset,
+		    int, (int)rxpkt->flags,
+		    int, (int)rxpkt->status);
 
 		/*
 		 * 2.
-		 * Take buffer off of receive ring
+		 */
+		bdesc = xnfp->xnf_rx_pkt_info[rxpkt->id];
+
+		/*
+		 * 3.
 		 */
-		hwcsum = B_FALSE;
-		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
-		/* 3 */
-		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
+		xnfp->xnf_rx_pkt_info[rxpkt->id] = NULL;
 		ASSERT(bdesc->id == rxpkt->id);
-		mp = NULL;
+
+		ref = bdesc->grant_ref;
+		off = rxpkt->offset;
+		len = rxpkt->status;
+
 		if (!xnfp->xnf_running) {
-			DTRACE_PROBE4(pkt_dropped, int, rxpkt->status,
+			DTRACE_PROBE4(xnf_rx_not_running,
+			    int, rxpkt->status,
 			    char *, bdesc->buf, int, rxpkt->offset,
 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
+
 			xnfp->xnf_stat_drop++;
-			/*
-			 * re-hang the buffer
-			 */
-			rx_buffer_hang(xnfp, bdesc);
-		} else if (rxpkt->status <= 0) {
-			DTRACE_PROBE4(pkt_status_negative, int, rxpkt->status,
+
+		} else if (len <= 0) {
+			DTRACE_PROBE4(xnf_rx_pkt_status_negative,
+			    int, rxpkt->status,
 			    char *, bdesc->buf, int, rxpkt->offset,
 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
+
 			xnfp->xnf_stat_errrx++;
-			if (rxpkt->status == 0)
+
+			switch (len) {
+			case 0:
 				xnfp->xnf_stat_runt++;
-			if (rxpkt->status == NETIF_RSP_ERROR)
+				break;
+			case NETIF_RSP_ERROR:
 				xnfp->xnf_stat_mac_rcv_error++;
-			if (rxpkt->status == NETIF_RSP_DROPPED)
+				break;
+			case NETIF_RSP_DROPPED:
 				xnfp->xnf_stat_norxbuf++;
-			/*
-			 * re-hang the buffer
-			 */
-			rx_buffer_hang(xnfp, bdesc);
+				break;
+			}
+
+		} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
+			cmn_err(CE_WARN, "Bad rx grant reference %d "
+			    "from domain %d", ref,
+			    xvdi_get_oeid(xnfp->xnf_devinfo));
+
+		} else if ((off + len) > PAGESIZE) {
+			cmn_err(CE_WARN, "Rx packet overflows page "
+			    "(offset %ld, length %ld) from domain %d",
+			    off, len, xvdi_get_oeid(xnfp->xnf_devinfo));
 		} else {
-			grant_ref_t		ref =  bdesc->grant_ref;
-			struct xnf_buffer_desc	*new_bdesc;
-			unsigned long		off = rxpkt->offset;
-
-			DTRACE_PROBE4(pkt_status_ok, int, rxpkt->status,
-			    char *, bdesc->buf, int, rxpkt->offset,
-			    char *, ((char *)bdesc->buf) + rxpkt->offset);
-			len = rxpkt->status;
+			xnf_buf_t *nbuf = NULL;
+
+			DTRACE_PROBE4(xnf_rx_packet, int, len,
+			    char *, bdesc->buf, int, off,
+			    char *, ((char *)bdesc->buf) + off);
+
 			ASSERT(off + len <= PAGEOFFSET);
-			if (ref == GRANT_INVALID_REF) {
-				mp = NULL;
-				new_bdesc = bdesc;
-				cmn_err(CE_WARN, "Bad rx grant reference %d "
-				    "from dom %d", ref,
-				    xvdi_get_oeid(xnfp->xnf_devinfo));
-				goto luckless;
-			}
-			/*
-			 * Release ref which we'll be re-claiming in
-			 * rx_buffer_hang().
-			 */
-			bdesc->grant_ref = GRANT_INVALID_REF;
-			(void) gnttab_end_foreign_access_ref(ref, 0);
-			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
-			    ref);
+
 			if (rxpkt->flags & NETRXF_data_validated)
 				hwcsum = B_TRUE;
 
 			/*
-			 * XXPV for the initial implementation of HVcopy,
-			 * create a new msg and copy in the data
+			 * If the packet is below a pre-determined
+			 * size we will copy data out rather than
+			 * replace it.
+			 */
+			if (len > xnf_rx_copy_limit)
+				nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
+
+			/*
+			 * If we have a replacement buffer, attempt to
+			 * wrap the existing one with an mblk_t in
+			 * order that the upper layers of the stack
+			 * might use it directly.
 			 */
-			/* 4. */
-			if ((mp = allocb(len, BPRI_MED)) == NULL) {
-				/*
-				 * Couldn't get buffer to copy to,
-				 * drop this data, and re-hang
-				 * the buffer on the ring.
-				 */
-				xnfp->xnf_stat_norxbuf++;
-				DTRACE_PROBE(alloc_nix);
-			} else {
-				/* 5. */
-				DTRACE_PROBE(alloc_ok);
-				bcopy(bdesc->buf + off, mp->b_wptr,
-				    len);
-				mp->b_wptr += len;
-			}
-			new_bdesc = bdesc;
-luckless:
-
-			/* Re-hang old or hang new buffer. */
-			rx_buffer_hang(xnfp, new_bdesc);
-		}
-		if (mp) {
-			if (hwcsum) {
-				/*
-				 * See comments in xnf_process_recv().
-				 */
-
-				(void) hcksum_assoc(mp, NULL,
-				    NULL, 0, 0, 0, 0,
-				    HCK_FULLCKSUM |
-				    HCK_FULLCKSUM_OK,
-				    0);
-				xnfp->xnf_stat_rx_cksum_no_need++;
-			}
-			if (head == NULL) {
-				head = tail = mp;
-			} else {
-				tail->b_next = mp;
-				tail = mp;
+			if (nbuf != NULL) {
+				mp = desballoc((unsigned char *)bdesc->buf,
+				    bdesc->len, 0, &bdesc->free_rtn);
+				if (mp == NULL) {
+					xnfp->xnf_stat_rx_desballoc_fail++;
+					xnfp->xnf_stat_norxbuf++;
+
+					xnf_buf_put(xnfp, nbuf, B_FALSE);
+					nbuf = NULL;
+				} else {
+					mp->b_rptr = mp->b_rptr + off;
+					mp->b_wptr = mp->b_rptr + len;
+
+					/*
+					 * Release the grant reference
+					 * associated with this buffer
+					 * - grant references are
+					 * scarce and the upper layers
+					 * of the stack don't need it.
+					 */
+					(void) gnttab_end_foreign_access_ref(
+					    bdesc->grant_ref, 0);
+					gref_put(xnfp, bdesc->grant_ref);
+					bdesc->grant_ref = INVALID_GRANT_REF;
+
+					bdesc = nbuf;
+				}
 			}
 
-			ASSERT(mp->b_next == NULL);
-
-			xnfp->xnf_stat_ipackets++;
-			xnfp->xnf_stat_rbytes += len;
-		}
-
-		xnfp->xnf_rx_ring.rsp_cons++;
-
-		xnfp->xnf_stat_hvcopy_packet_processed++;
-	}
-
-	/* 7. */
-	/*
-	 * Has more data come in since we started?
-	 */
-	/* LINTED: constant in conditional context */
-	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
-	if (work_to_do)
-		goto loop;
-
-	/*
-	 * Indicate to the backend that we have re-filled the receive
-	 * ring.
-	 */
-	/* LINTED: constant in conditional context */
-	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
-	if (notify)
-		ec_notify_via_evtchn(xnfp->xnf_evtchn);
-
-	return (head);
-}
-
-/* Process all queued received packets */
-static mblk_t *
-xnf_process_recv(xnf_t *xnfp)
-{
-	volatile netif_rx_response_t *rxpkt;
-	mblk_t *mp, *head, *tail;
-	struct xnf_buffer_desc *bdesc;
-	extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *);
-	boolean_t hwcsum = B_FALSE, notify, work_to_do;
-	size_t len;
-	pfn_t pfn;
-	long cnt;
-
-	head = tail = NULL;
-loop:
-	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
-
-		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
-		    xnfp->xnf_rx_ring.rsp_cons);
-
-		/*
-		 * Take buffer off of receive ring
-		 */
-		hwcsum = B_FALSE;
-		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
-		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
-		ASSERT(bdesc->id == rxpkt->id);
-		mp = NULL;
-		if (!xnfp->xnf_running) {
-			xnfp->xnf_stat_drop++;
-			/*
-			 * re-hang the buffer
-			 */
-			rx_buffer_hang(xnfp, bdesc);
-		} else if (rxpkt->status <= 0) {
-			xnfp->xnf_stat_errrx++;
-			if (rxpkt->status == 0)
-				xnfp->xnf_stat_runt++;
-			if (rxpkt->status == NETIF_RSP_ERROR)
-				xnfp->xnf_stat_mac_rcv_error++;
-			if (rxpkt->status == NETIF_RSP_DROPPED)
-				xnfp->xnf_stat_norxbuf++;
-			/*
-			 * re-hang the buffer
-			 */
-			rx_buffer_hang(xnfp, bdesc);
-		} else {
-			grant_ref_t ref =  bdesc->grant_ref;
-			struct xnf_buffer_desc *new_bdesc;
-			unsigned long off = rxpkt->offset;
-			unsigned long mfn;
-
-			len = rxpkt->status;
-			ASSERT(off + len <= PAGEOFFSET);
-			if (ref == GRANT_INVALID_REF) {
-				mp = NULL;
-				new_bdesc = bdesc;
-				cmn_err(CE_WARN, "Bad rx grant reference %d "
-				    "from dom %d", ref,
-				    xvdi_get_oeid(xnfp->xnf_devinfo));
-				goto luckless;
-			}
-			bdesc->grant_ref = GRANT_INVALID_REF;
-			mfn = gnttab_end_foreign_transfer_ref(ref);
-			ASSERT(mfn != MFN_INVALID);
-			ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) ==
-			    PFN_INVALID);
-
-			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
-			    ref);
-			reassign_pfn(xnf_btop(bdesc->buf_phys), mfn);
-			hat_devload(kas.a_hat, bdesc->buf, PAGESIZE,
-			    xnf_btop(bdesc->buf_phys),
-			    PROT_READ | PROT_WRITE, HAT_LOAD);
-			balloon_drv_added(1);
-
-			if (rxpkt->flags & NETRXF_data_validated)
-				hwcsum = B_TRUE;
-			if (len <= xnf_rx_bcopy_thresh) {
+			if (nbuf == NULL) {
 				/*
-				 * For small buffers, just copy the data
-				 * and send the copy upstream.
-				 */
-				new_bdesc = NULL;
-			} else {
-				/*
-				 * We send a pointer to this data upstream;
-				 * we need a new buffer to replace this one.
+				 * No replacement buffer allocated -
+				 * attempt to copy the data out and
+				 * re-hang the existing buffer.
 				 */
-				mutex_enter(&xnfp->xnf_rx_buf_mutex);
-				new_bdesc = xnf_get_buffer(xnfp);
-				if (new_bdesc != NULL) {
-					xnfp->xnf_rx_bufs_outstanding++;
-				} else {
-					xnfp->xnf_stat_rx_no_ringbuf++;
-				}
-				mutex_exit(&xnfp->xnf_rx_buf_mutex);
-			}
-
-			if (new_bdesc == NULL) {
-				/*
-				 * Don't have a new ring buffer; bcopy the data
-				 * from the buffer, and preserve the
-				 * original buffer
-				 */
-				if ((mp = allocb(len, BPRI_MED)) == NULL) {
-					/*
-					 * Could't get buffer to copy to,
-					 * drop this data, and re-hang
-					 * the buffer on the ring.
-					 */
+
+				/* 4. */
+				mp = allocb(len, BPRI_MED);
+				if (mp == NULL) {
+					xnfp->xnf_stat_rx_allocb_fail++;
 					xnfp->xnf_stat_norxbuf++;
 				} else {
+					/* 5. */
 					bcopy(bdesc->buf + off, mp->b_wptr,
 					    len);
-				}
-				/*
-				 * Give the buffer page back to xen
-				 */
-				pfn = xnf_btop(bdesc->buf_phys);
-				cnt = balloon_free_pages(1, &mfn, bdesc->buf,
-				    &pfn);
-				if (cnt != 1) {
-					cmn_err(CE_WARN, "unable to give a "
-					    "page back to the hypervisor\n");
+					mp->b_wptr += len;
 				}
-				new_bdesc = bdesc;
-			} else {
-				if ((mp = desballoc((unsigned char *)bdesc->buf,
-				    off + len, 0, (frtn_t *)bdesc)) == NULL) {
-					/*
-					 * Couldn't get mblk to pass recv data
-					 * up with, free the old ring buffer
-					 */
-					xnfp->xnf_stat_norxbuf++;
-					xnf_rcv_complete(bdesc);
-					goto luckless;
-				}
-				(void) ddi_dma_sync(bdesc->dma_handle,
-				    0, 0, DDI_DMA_SYNC_FORCPU);
-
-				mp->b_wptr += off;
-				mp->b_rptr += off;
 			}
-luckless:
-			if (mp)
-				mp->b_wptr += len;
-			/* re-hang old or hang new buffer */
-			rx_buffer_hang(xnfp, new_bdesc);
 		}
-		if (mp) {
+
+		/* Re-hang the buffer. */
+		xnf_rxbuf_hang(xnfp, bdesc);
+
+		if (mp != NULL) {
 			if (hwcsum) {
 				/*
 				 * If the peer says that the data has
@@ -1895,20 +2119,22 @@
 				 * If it was necessary we could grovel
 				 * in the packet to find it.
 				 */
-
 				(void) hcksum_assoc(mp, NULL,
 				    NULL, 0, 0, 0, 0,
 				    HCK_FULLCKSUM |
-				    HCK_FULLCKSUM_OK,
-				    0);
+				    HCK_FULLCKSUM_OK, 0);
 				xnfp->xnf_stat_rx_cksum_no_need++;
 			}
 			if (head == NULL) {
-				head = tail = mp;
+				ASSERT(tail == NULL);
+
+				head = mp;
 			} else {
+				ASSERT(tail != NULL);
+
 				tail->b_next = mp;
-				tail = mp;
 			}
+			tail = mp;
 
 			ASSERT(mp->b_next == NULL);
 
@@ -1920,67 +2146,21 @@
 	}
 
 	/*
-	 * Has more data come in since we started?
-	 */
-	/* LINTED: constant in conditional context */
-	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
-	if (work_to_do)
-		goto loop;
-
-	/*
-	 * Indicate to the backend that we have re-filled the receive
-	 * ring.
+	 * Store the mblks we have collected.
 	 */
-	/* LINTED: constant in conditional context */
-	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
-	if (notify)
-		ec_notify_via_evtchn(xnfp->xnf_evtchn);
-
-	return (head);
-}
-
-/* Called when the upper layers free a message we passed upstream */
-static void
-xnf_rcv_complete(struct xnf_buffer_desc *bdesc)
-{
-	xnf_t *xnfp = bdesc->xnfp;
-	pfn_t pfn;
-	long cnt;
-
-	/* One less outstanding receive buffer */
-	mutex_enter(&xnfp->xnf_rx_buf_mutex);
-	--xnfp->xnf_rx_bufs_outstanding;
-	/*
-	 * Return buffer to the free list, unless the free list is getting
-	 * too large.  XXPV - this threshold may need tuning.
-	 */
-	if (xnfp->xnf_rx_descs_free < xnf_rx_bufs_lowat) {
-		/*
-		 * Unmap the page, and hand the machine page back
-		 * to xen so it can be re-used as a backend net buffer.
-		 */
-		pfn = xnf_btop(bdesc->buf_phys);
-		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
-		if (cnt != 1) {
-			cmn_err(CE_WARN, "unable to give a page back to the "
-			    "hypervisor\n");
+	if (head != NULL) {
+		ASSERT(tail != NULL);
+
+		if (xnfp->xnf_rx_head == NULL) {
+			ASSERT(xnfp->xnf_rx_tail == NULL);
+
+			xnfp->xnf_rx_head = head;
+		} else {
+			ASSERT(xnfp->xnf_rx_tail != NULL);
+
+			xnfp->xnf_rx_tail->b_next = head;
 		}
-
-		bdesc->next = xnfp->xnf_free_list;
-		xnfp->xnf_free_list = bdesc;
-		xnfp->xnf_rx_descs_free++;
-		mutex_exit(&xnfp->xnf_rx_buf_mutex);
-	} else {
-		/*
-		 * We can return everything here since we have a free buffer
-		 * that we have not given the backing page for back to xen.
-		 */
-		--xnfp->xnf_rx_buffer_count;
-		mutex_exit(&xnfp->xnf_rx_buf_mutex);
-		(void) ddi_dma_unbind_handle(bdesc->dma_handle);
-		ddi_dma_mem_free(&bdesc->acc_handle);
-		ddi_dma_free_handle(&bdesc->dma_handle);
-		kmem_free(bdesc, sizeof (*bdesc));
+		xnfp->xnf_rx_tail = tail;
 	}
 }
 
@@ -1991,34 +2171,16 @@
 xnf_alloc_dma_resources(xnf_t *xnfp)
 {
 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
-	int			i;
 	size_t			len;
 	ddi_dma_cookie_t	dma_cookie;
 	uint_t			ncookies;
-	struct xnf_buffer_desc	*bdesc;
 	int			rc;
 	caddr_t			rptr;
 
-	xnfp->xnf_n_rx = NET_RX_RING_SIZE;
-	xnfp->xnf_max_rx_bufs = xnf_rx_bufs_hiwat;
-
-	xnfp->xnf_n_tx = NET_TX_RING_SIZE;
-
 	/*
 	 * The code below allocates all the DMA data structures that
 	 * need to be released when the driver is detached.
 	 *
-	 * First allocate handles for mapping (virtual address) pointers to
-	 * transmit data buffers to physical addresses
-	 */
-	for (i = 0; i < xnfp->xnf_n_tx; i++) {
-		if ((rc = ddi_dma_alloc_handle(devinfo,
-		    &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0,
-		    &xnfp->xnf_tx_pkt_info[i].dma_handle)) != DDI_SUCCESS)
-			return (DDI_FAILURE);
-	}
-
-	/*
 	 * Allocate page for the transmit descriptor ring.
 	 */
 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
@@ -2092,18 +2254,6 @@
 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
 
-	/*
-	 * Preallocate receive buffers for each receive descriptor.
-	 */
-
-	/* Set up the "free list" of receive buffer descriptors */
-	for (i = 0; i < xnfp->xnf_n_rx; i++) {
-		if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL)
-			goto alloc_error;
-		bdesc->next = xnfp->xnf_free_list;
-		xnfp->xnf_free_list = bdesc;
-	}
-
 	return (DDI_SUCCESS);
 
 alloc_error:
@@ -2116,8 +2266,6 @@
 
 /*
  * Release all DMA resources in the opposite order from acquisition
- * Should not be called until all outstanding esballoc buffers
- * have been returned.
  */
 static void
 xnf_release_dma_resources(xnf_t *xnfp)
@@ -2126,25 +2274,27 @@
 
 	/*
 	 * Free receive buffers which are currently associated with
-	 * descriptors
+	 * descriptors.
 	 */
-	for (i = 0; i < xnfp->xnf_n_rx; i++) {
-		struct xnf_buffer_desc *bp;
-
-		if ((bp = xnfp->xnf_rxpkt_bufptr[i]) == NULL)
+	mutex_enter(&xnfp->xnf_rxlock);
+	for (i = 0; i < NET_RX_RING_SIZE; i++) {
+		xnf_buf_t *bp;
+
+		if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
 			continue;
-		xnf_free_buffer(bp);
-		xnfp->xnf_rxpkt_bufptr[i] = NULL;
+		xnfp->xnf_rx_pkt_info[i] = NULL;
+		xnf_buf_put(xnfp, bp, B_FALSE);
 	}
-
-	/* Free the receive ring buffer */
+	mutex_exit(&xnfp->xnf_rxlock);
+
+	/* Free the receive ring buffer. */
 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
 	}
-	/* Free the transmit ring buffer */
+	/* Free the transmit ring buffer. */
 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
@@ -2152,219 +2302,75 @@
 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
 	}
 
-	/*
-	 * Free handles for mapping (virtual address) pointers to
-	 * transmit data buffers to physical addresses
-	 */
-	for (i = 0; i < xnfp->xnf_n_tx; i++) {
-		if (xnfp->xnf_tx_pkt_info[i].dma_handle != NULL) {
-			ddi_dma_free_handle(
-			    &xnfp->xnf_tx_pkt_info[i].dma_handle);
-		}
-	}
-
-}
-
-static void
-xnf_release_mblks(xnf_t *xnfp)
-{
-	int	i;
-
-	for (i = 0; i < xnfp->xnf_n_tx; i++) {
-		if (xnfp->xnf_tx_pkt_info[i].mp == NULL)
-			continue;
-		freemsg(xnfp->xnf_tx_pkt_info[i].mp);
-		xnfp->xnf_tx_pkt_info[i].mp = NULL;
-		(void) ddi_dma_unbind_handle(
-		    xnfp->xnf_tx_pkt_info[i].dma_handle);
-	}
-}
-
-/*
- * Remove a xmit buffer descriptor from the head of the free list and return
- * a pointer to it.  If no buffers on list, attempt to allocate a new one.
- * Called with the tx_buf_mutex held.
- */
-static struct xnf_buffer_desc *
-xnf_get_tx_buffer(xnf_t *xnfp)
-{
-	struct xnf_buffer_desc *bdesc;
-
-	bdesc = xnfp->xnf_tx_free_list;
-	if (bdesc != NULL) {
-		xnfp->xnf_tx_free_list = bdesc->next;
-	} else {
-		bdesc = xnf_alloc_tx_buffer(xnfp);
-	}
-	return (bdesc);
-}
-
-/*
- * Remove a buffer descriptor from the head of the free list and return
- * a pointer to it.  If no buffers on list, attempt to allocate a new one.
- * Called with the rx_buf_mutex held.
- */
-static struct xnf_buffer_desc *
-xnf_get_buffer(xnf_t *xnfp)
-{
-	struct xnf_buffer_desc *bdesc;
-
-	bdesc = xnfp->xnf_free_list;
-	if (bdesc != NULL) {
-		xnfp->xnf_free_list = bdesc->next;
-		xnfp->xnf_rx_descs_free--;
-	} else {
-		bdesc = xnf_alloc_buffer(xnfp);
-	}
-	return (bdesc);
-}
-
-/*
- * Free a xmit buffer back to the xmit free list
- */
-static void
-xnf_free_tx_buffer(struct xnf_buffer_desc *bp)
-{
-	xnf_t *xnfp = bp->xnfp;
-
-	mutex_enter(&xnfp->xnf_tx_buf_mutex);
-	bp->next = xnfp->xnf_tx_free_list;
-	xnfp->xnf_tx_free_list = bp;
-	mutex_exit(&xnfp->xnf_tx_buf_mutex);
 }
 
 /*
- * Put a buffer descriptor onto the head of the free list.
- * for page-flip:
- * We can't really free these buffers back to the kernel
- * since we have given away their backing page to be used
- * by the back end net driver.
- * for hvcopy:
- * release all the memory
+ * Release any packets and associated structures used by the TX ring.
  */
 static void
-xnf_free_buffer(struct xnf_buffer_desc *bdesc)
+xnf_release_mblks(xnf_t *xnfp)
 {
-	xnf_t *xnfp = bdesc->xnfp;
-
-	mutex_enter(&xnfp->xnf_rx_buf_mutex);
-	if (xnfp->xnf_rx_hvcopy) {
-		if (ddi_dma_unbind_handle(bdesc->dma_handle) != DDI_SUCCESS)
-			goto out;
-		ddi_dma_mem_free(&bdesc->acc_handle);
-		ddi_dma_free_handle(&bdesc->dma_handle);
-		kmem_free(bdesc, sizeof (*bdesc));
-		xnfp->xnf_rx_buffer_count--;
-	} else {
-		bdesc->next = xnfp->xnf_free_list;
-		xnfp->xnf_free_list = bdesc;
-		xnfp->xnf_rx_descs_free++;
+	RING_IDX i;
+	xnf_txid_t *tidp;
+
+	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
+	    i < NET_TX_RING_SIZE;
+	    i++, tidp++) {
+		xnf_txbuf_t *txp = tidp->txbuf;
+
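+		/*
+		 * A non-NULL txbuf indicates a packet still held by
+		 * the ring; release it.
+		 */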
+		if (txp != NULL) {
+			ASSERT(txp->tx_mp != NULL);
+			freemsg(txp->tx_mp);
+
+			txid_put(xnfp, tidp);
+			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+		}
 	}
-out:
-	mutex_exit(&xnfp->xnf_rx_buf_mutex);
 }
 
-/*
- * Allocate a DMA-able xmit buffer, including a structure to
- * keep track of the buffer.  Called with tx_buf_mutex held.
- */
-static struct xnf_buffer_desc *
-xnf_alloc_tx_buffer(xnf_t *xnfp)
+static int
+xnf_buf_constructor(void *buf, void *arg, int kmflag)
 {
-	struct xnf_buffer_desc *bdesc;
+	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
+	xnf_buf_t *bdesc = buf;
+	xnf_t *xnfp = arg;
+	ddi_dma_cookie_t dma_cookie;
+	uint_t ncookies;
 	size_t len;
 
-	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
-		return (NULL);
-
-	/* allocate a DMA access handle for receive buffer */
-	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buffer_dma_attr,
-	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
+	if (kmflag & KM_NOSLEEP)
+		ddiflags = DDI_DMA_DONTWAIT;
+
+	/* Allocate a DMA access handle for the buffer. */
+	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
+	    ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
 		goto failure;
 
-	/* Allocate DMA-able memory for transmit buffer */
+	/* Allocate DMA-able memory for buffer. */
 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
-	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
+	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
 		goto failure_1;
 
+	/* Bind to virtual address of buffer to get physical address. */
+	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
+	    bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
+	    ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
+		goto failure_2;
+	ASSERT(ncookies == 1);
+
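+	/*
+	 * Set up the free routine used when this buffer is loaned
+	 * upstream via desballoc().
+	 */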
+	bdesc->free_rtn.free_func = xnf_buf_recycle;
+	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
 	bdesc->xnfp = xnfp;
-	xnfp->xnf_tx_buffer_count++;
-
-	return (bdesc);
-
-failure_1:
-	ddi_dma_free_handle(&bdesc->dma_handle);
-
-failure:
-	kmem_free(bdesc, sizeof (*bdesc));
-	return (NULL);
-}
-
-/*
- * Allocate a DMA-able receive buffer, including a structure to
- * keep track of the buffer.  Called with rx_buf_mutex held.
- */
-static struct xnf_buffer_desc *
-xnf_alloc_buffer(xnf_t *xnfp)
-{
-	struct			xnf_buffer_desc *bdesc;
-	size_t			len;
-	uint_t			ncookies;
-	ddi_dma_cookie_t	dma_cookie;
-	long			cnt;
-	pfn_t			pfn;
-
-	if (xnfp->xnf_rx_buffer_count >= xnfp->xnf_max_rx_bufs)
-		return (NULL);
-
-	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
-		return (NULL);
-
-	/* allocate a DMA access handle for receive buffer */
-	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buffer_dma_attr,
-	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
-		goto failure;
-
-	/* Allocate DMA-able memory for receive buffer */
-	if (ddi_dma_mem_alloc(bdesc->dma_handle,
-	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
-	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
-		goto failure_1;
-
-	/* bind to virtual address of buffer to get physical address */
-	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
-	    bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING,
-	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
-		goto failure_2;
-
 	bdesc->buf_phys = dma_cookie.dmac_laddress;
-	bdesc->xnfp = xnfp;
-	if (xnfp->xnf_rx_hvcopy) {
-		bdesc->free_rtn.free_func = xnf_copy_rcv_complete;
-	} else {
-		bdesc->free_rtn.free_func = xnf_rcv_complete;
-	}
-	bdesc->free_rtn.free_arg = (char *)bdesc;
-	bdesc->grant_ref = GRANT_INVALID_REF;
-	ASSERT(ncookies == 1);
-
-	xnfp->xnf_rx_buffer_count++;
-
-	if (!xnfp->xnf_rx_hvcopy) {
-		/*
-		 * Unmap the page, and hand the machine page back
-		 * to xen so it can be used as a backend net buffer.
-		 */
-		pfn = xnf_btop(bdesc->buf_phys);
-		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
-		if (cnt != 1) {
-			cmn_err(CE_WARN, "unable to give a page back to the "
-			    "hypervisor\n");
-		}
-	}
-
-	return (bdesc);
+	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
+	bdesc->len = dma_cookie.dmac_size;
+	bdesc->grant_ref = INVALID_GRANT_REF;
+	bdesc->gen = xnfp->xnf_gen;
+
+	atomic_add_64(&xnfp->xnf_stat_buf_allocated, 1);
+
+	return (0);
 
 failure_2:
 	ddi_dma_mem_free(&bdesc->acc_handle);
@@ -2373,8 +2379,117 @@
 	ddi_dma_free_handle(&bdesc->dma_handle);
 
 failure:
-	kmem_free(bdesc, sizeof (*bdesc));
-	return (NULL);
+
+	return (-1);
+}
+
+static void
+xnf_buf_destructor(void *buf, void *arg)
+{
+	xnf_buf_t *bdesc = buf;
+	xnf_t *xnfp = arg;
+
+	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
+	ddi_dma_mem_free(&bdesc->acc_handle);
+	ddi_dma_free_handle(&bdesc->dma_handle);
+
+	atomic_add_64(&xnfp->xnf_stat_buf_allocated, -1);
+}
+
+static xnf_buf_t *
+xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
+{
+	grant_ref_t gref;
+	xnf_buf_t *bufp;
+
+	/*
+	 * Usually grant references are more scarce than memory, so we
+	 * attempt to acquire a grant reference first.
+	 */
+	gref = gref_get(xnfp);
+	if (gref == INVALID_GRANT_REF)
+		return (NULL);
+
+	bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
+	if (bufp == NULL) {
+		gref_put(xnfp, gref);
+		return (NULL);
+	}
+
+	ASSERT(bufp->grant_ref == INVALID_GRANT_REF);
+
+	bufp->grant_ref = gref;
+
+	if (bufp->gen != xnfp->xnf_gen)
+		xnf_buf_refresh(bufp);
+
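+	/*
+	 * Grant the peer access to the buffer's machine frame,
+	 * read-only if requested.
+	 */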
+	gnttab_grant_foreign_access_ref(bufp->grant_ref,
+	    xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
+	    bufp->buf_mfn, readonly ? 1 : 0);
+
+	atomic_add_64(&xnfp->xnf_stat_buf_outstanding, 1);
+
+	return (bufp);
+}
+
+static void
+xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
+{
+	if (bufp->grant_ref != INVALID_GRANT_REF) {
+		(void) gnttab_end_foreign_access_ref(
+		    bufp->grant_ref, readonly ? 1 : 0);
+		gref_put(xnfp, bufp->grant_ref);
+		bufp->grant_ref = INVALID_GRANT_REF;
+	}
+
+	kmem_cache_free(xnfp->xnf_buf_cache, bufp);
+
+	atomic_add_64(&xnfp->xnf_stat_buf_outstanding, -1);
+}
+
+/*
+ * Refresh any cached data about a buffer after resume.
+ */
+static void
+xnf_buf_refresh(xnf_buf_t *bdesc)
+{
+	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
+	bdesc->gen = bdesc->xnfp->xnf_gen;
+}
+
+/*
+ * Streams `freeb' routine for `xnf_buf_t' when used as transmit
+ * look-aside buffers.
+ */
+static void
+xnf_buf_recycle(xnf_buf_t *bdesc)
+{
+	xnf_t *xnfp = bdesc->xnfp;
+
+	xnf_buf_put(xnfp, bdesc, B_TRUE);
+}
+
+static int
+xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
+{
+	_NOTE(ARGUNUSED(kmflag));
+	xnf_txbuf_t *txp = buf;
+	xnf_t *xnfp = arg;
+
+	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
+	    0, 0, &txp->tx_dma_handle) != DDI_SUCCESS)
+		return (-1);
+
+	return (0);
+}
+
+static void
+xnf_tx_buf_destructor(void *buf, void *arg)
+{
+	_NOTE(ARGUNUSED(arg));
+	xnf_txbuf_t *txp = buf;
+
+	ddi_dma_free_handle(&txp->tx_dma_handle);
 }
 
 /*
@@ -2388,8 +2503,13 @@
 	"tx_pullup",
 	"tx_pagebndry",
 	"tx_attempt",
-	"rx_no_ringbuf",
-	"hvcopy_packet_processed",
+	"buf_allocated",
+	"buf_outstanding",
+	"gref_outstanding",
+	"gref_failure",
+	"gref_peak",
+	"rx_allocb_fail",
+	"rx_desballoc_fail",
 };
 
 static int
@@ -2416,9 +2536,14 @@
 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
-	(knp++)->value.ui64 = xnfp->xnf_stat_rx_no_ringbuf;
-
-	(knp++)->value.ui64 = xnfp->xnf_stat_hvcopy_packet_processed;
+
+	(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
+	(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
+	(knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
+	(knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
+	(knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
+	(knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
+	(knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
 
 	return (0);
 }
@@ -2462,7 +2587,7 @@
 {
 	xnf_t *xnfp = arg;
 
-	mutex_enter(&xnfp->xnf_intrlock);
+	mutex_enter(&xnfp->xnf_rxlock);
 	mutex_enter(&xnfp->xnf_txlock);
 
 #define	mac_stat(q, r)				\
@@ -2500,7 +2625,7 @@
 
 	default:
 		mutex_exit(&xnfp->xnf_txlock);
-		mutex_exit(&xnfp->xnf_intrlock);
+		mutex_exit(&xnfp->xnf_rxlock);
 
 		return (ENOTSUP);
 	}
@@ -2509,22 +2634,15 @@
 #undef ether_stat
 
 	mutex_exit(&xnfp->xnf_txlock);
-	mutex_exit(&xnfp->xnf_intrlock);
+	mutex_exit(&xnfp->xnf_rxlock);
 
 	return (0);
 }
 
-/*ARGSUSED*/
-static void
-xnf_ioctl(void *arg, queue_t *q, mblk_t *mp)
-{
-	miocnak(q, mp, 0, EINVAL);
-}
-
 static boolean_t
 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
 {
-	xnf_t *xnfp = arg;
+	_NOTE(ARGUNUSED(arg));
 
 	switch (cap) {
 	case MAC_CAPAB_HCKSUM: {
@@ -2547,10 +2665,7 @@
 		 * field and must insert the pseudo-header checksum
 		 * before passing the packet to the IO domain.
 		 */
-		if (xnfp->xnf_cksum_offload)
-			*capab = HCKSUM_INET_FULL_V4;
-		else
-			*capab = 0;
+		*capab = HCKSUM_INET_FULL_V4;
 		break;
 	}
 	default:
@@ -2560,74 +2675,95 @@
 	return (B_TRUE);
 }
 
-/*ARGSUSED*/
+/*
+ * The state of the peer has changed - react accordingly.
+ */
 static void
 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
     void *arg, void *impl_data)
 {
+	_NOTE(ARGUNUSED(id, arg));
 	xnf_t *xnfp = ddi_get_driver_private(dip);
 	XenbusState new_state = *(XenbusState *)impl_data;
 
 	ASSERT(xnfp != NULL);
 
 	switch (new_state) {
+	case XenbusStateUnknown:
+	case XenbusStateInitialising:
+	case XenbusStateInitialised:
+	case XenbusStateClosing:
+	case XenbusStateClosed:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
+		break;
+
+	case XenbusStateInitWait:
+		xnf_read_config(xnfp);
+
+		if (!xnfp->xnf_be_rx_copy) {
+			cmn_err(CE_WARN,
+			    "The xnf driver requires a dom0 that "
+			    "supports 'feature-rx-copy'.");
+			(void) xvdi_switch_state(xnfp->xnf_devinfo,
+			    XBT_NULL, XenbusStateClosed);
+			break;
+		}
+
+		/*
+		 * Connect to the backend.
+		 */
+		xnf_be_connect(xnfp);
+
+		/*
+		 * Our MAC address as discovered by xnf_read_config().
+		 */
+		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
+
+		break;
+
 	case XenbusStateConnected:
-		mutex_enter(&xnfp->xnf_intrlock);
+		mutex_enter(&xnfp->xnf_rxlock);
 		mutex_enter(&xnfp->xnf_txlock);
 
 		xnfp->xnf_connected = B_TRUE;
 		/*
-		 * wake up threads wanting to send data to backend,
-		 * but got blocked due to backend is not ready
+		 * Wake up any threads waiting to send data to
+		 * the backend.
 		 */
-		cv_broadcast(&xnfp->xnf_cv);
+		cv_broadcast(&xnfp->xnf_cv_state);
 
 		mutex_exit(&xnfp->xnf_txlock);
-		mutex_exit(&xnfp->xnf_intrlock);
+		mutex_exit(&xnfp->xnf_rxlock);
 
 		/*
-		 * kick backend in case it missed any tx request
-		 * in the TX ring buffer
+		 * Kick the peer in case it missed any transmit
+		 * requests in the TX ring.
 		 */
 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
 
 		/*
-		 * there maybe already queued rx data in the RX ring
-		 * sent by backend after it gets connected but before
-		 * we see its state change here, so we call our intr
-		 * handling routine to handle them, if any
+		 * There may already be completed receive requests in
+		 * the ring sent by the backend after it gets connected
+		 * but before we see its state change here, so we call
+		 * xnf_intr() to handle them, if any.
 		 */
 		(void) xnf_intr((caddr_t)xnfp);
 
-		/* mark as link up after get connected */
+		/*
+		 * Mark the link up now that we are connected.
+		 */
 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
 
+		/*
+		 * Tell the backend about the multicast addresses in
+		 * which we are interested.
+		 */
+		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
+
 		break;
 
 	default:
 		break;
 	}
 }
-
-/*
- * Check whether backend is capable of and willing to talk
- * to us via hypervisor copy, as opposed to page flip.
- */
-static boolean_t
-xnf_hvcopy_peer_status(dev_info_t *devinfo)
-{
-	int	be_rx_copy;
-	int	err;
-
-	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(devinfo),
-	    "feature-rx-copy", "%d", &be_rx_copy);
-	/*
-	 * If we fail to read the store we assume that the key is
-	 * absent, implying an older domain at the far end.  Older
-	 * domains cannot do HV copy (we assume ..).
-	 */
-	if (err != 0)
-		be_rx_copy = 0;
-
-	return (be_rx_copy?B_TRUE:B_FALSE);
-}
--- a/usr/src/uts/common/xen/io/xnf.h	Wed Nov 04 21:40:43 2009 -0800
+++ b/usr/src/uts/common/xen/io/xnf.h	Thu Nov 05 01:05:36 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -37,70 +37,74 @@
 #define	XNF_MAXPKT	1500		/* MTU size */
 #define	XNF_FRAMESIZE	1514		/* frame size including MAC header */
 
-#define	XNF_MAX_RXDESCS	256
-
-#define	MCAST_HASHBITS		256
-
-extern int	xnf_diagnose;	/* Available for use any time. */
-
-/* Flags to set in the global xnf_diagnose */
-#define	XNF_DIAG_RX		0x01
-#define	XNF_DIAG_TX		0x02
-#define	XNF_DIAG_STATS		0x04
-#define	XNF_DIAG_RX_BUFS	0x08
-
 /* DEBUG flags */
 #define	XNF_DEBUG_DDI		0x01
 #define	XNF_DEBUG_TRACE		0x02
-#define	XNF_DEBUG_SEND		0x04
-#define	XNF_DEBUG_INT		0x08
 
-#define	XNF_DESC_ALIGN		8
-
-
-/* Info pertaining to each xmit/receive buffer */
-struct xnf_buffer_desc {
-	frtn_t			free_rtn;	/* desballoc() structure */
+/*
+ * Information about each receive buffer and any transmit look-aside
+ * buffers.
+ */
+typedef struct xnf_buf {
+	frtn_t			free_rtn;
 	struct xnf		*xnfp;
 	ddi_dma_handle_t	dma_handle;
 	caddr_t			buf;		/* DMA-able data buffer */
 	paddr_t			buf_phys;
-	struct xnf_buffer_desc	*next;	/* For linking into free list */
+	mfn_t			buf_mfn;
+	size_t			len;
+	struct xnf_buf		*next;	/* For linking into free list */
 	ddi_acc_handle_t	acc_handle;
 	grant_ref_t		grant_ref;	/* grant table reference */
 	uint16_t		id;		/* buffer id */
-};
+	unsigned int		gen;
+} xnf_buf_t;
+
+/*
+ * Information about each transmit buffer.
+ */
+typedef struct xnf_txbuf {
+	struct xnf_txbuf	*tx_next;
+	mblk_t			*tx_mp;	/* mblk associated with packet */
+	netif_tx_request_t	tx_txreq;
+	caddr_t			tx_bufp;
+	ddi_dma_handle_t	tx_dma_handle;
+	mfn_t			tx_mfn;
+	xnf_buf_t		*tx_bdesc; /* Look-aside buffer, if used. */
+	unsigned char		tx_type;
+	int16_t			tx_status;
+	RING_IDX		tx_slot;
 
-/* Various information about each transmit packet */
-struct tx_pktinfo {
-	mblk_t			*mp;	/* mblk associated with packet */
-	ddi_dma_handle_t	dma_handle;
-	struct xnf_buffer_desc	*bdesc; /* pointer to buffer descriptor */
-	grant_ref_t		grant_ref;	/* grant table reference */
-	uint16_t		id;	/* tx pkt id/free list next pointer */
-};
+#define	TX_DATA		1
+#define	TX_MCAST_REQ	2
+#define	TX_MCAST_RSP	3
+} xnf_txbuf_t;
 
-/* Per network-interface-controller driver private structure */
+/*
+ * Information about each outstanding transmit operation.
+ */
+typedef struct xnf_txid {
+	uint16_t	id;	/* Id of this transmit buffer. */
+	uint16_t	next;	/* Freelist of ids. */
+	xnf_txbuf_t	*txbuf;	/* Buffer details. */
+} xnf_txid_t;
+
+/*
+ * Per-instance data.
+ */
 typedef struct xnf {
 	/* most interesting stuff first to assist debugging */
-	dev_info_t		*xnf_devinfo;	/* System per-device info. */
-	mac_handle_t		xnf_mh;		/* Nemo per-device info. */
-	int			xnf_rx_bufs_outstanding;
-	int			xnf_tx_descs_free;
-	int			xnf_rx_descs_free; /* count of free rx bufs */
-	int			xnf_n_tx;	/* No. xmit descriptors */
-	int			xnf_n_rx;	/* No. recv descriptors */
-	int			xnf_n_rx_bufs;	/* No. recv DMA buffers */
-	int			xnf_tx_start_thresh_regval;
+	dev_info_t		*xnf_devinfo;
+	mac_handle_t		xnf_mh;
 	unsigned char		xnf_mac_addr[ETHERADDRL];
-	int			xnf_max_rx_bufs;
-	int			xnf_rx_buffer_count;
-	int			xnf_tx_buffer_count;
+
+	unsigned int		xnf_gen;	/* Increments on resume. */
 
 	boolean_t		xnf_connected;
 	boolean_t		xnf_running;
 
-	boolean_t		xnf_cksum_offload;
+	boolean_t		xnf_be_rx_copy;
+	boolean_t		xnf_be_mcast_control;
 
 	uint64_t		xnf_stat_interrupts;
 	uint64_t		xnf_stat_unclaimed_interrupts;
@@ -112,7 +116,6 @@
 	uint64_t		xnf_stat_tx_pullup;
 	uint64_t		xnf_stat_tx_pagebndry;
 	uint64_t		xnf_stat_tx_defer;
-	uint64_t		xnf_stat_rx_no_ringbuf;
 	uint64_t		xnf_stat_mac_rcv_error;
 	uint64_t		xnf_stat_runt;
 
@@ -123,44 +126,54 @@
 
 	uint64_t		xnf_stat_tx_cksum_deferred;
 	uint64_t		xnf_stat_rx_cksum_no_need;
-	uint64_t		xnf_stat_hvcopy_enabled; /* on/off */
-	uint64_t		xnf_stat_hvcopy_packet_processed;
+
+	uint64_t		xnf_stat_buf_allocated;
+	uint64_t		xnf_stat_buf_outstanding;
+	uint64_t		xnf_stat_gref_outstanding;
+	uint64_t		xnf_stat_gref_failure;
+	uint64_t		xnf_stat_gref_peak;
+	uint64_t		xnf_stat_rx_allocb_fail;
+	uint64_t		xnf_stat_rx_desballoc_fail;
 
 	kstat_t			*xnf_kstat_aux;
 
-	struct xnf_buffer_desc	*xnf_free_list;
-	struct xnf_buffer_desc	*xnf_tx_free_list;
-	int			xnf_tx_pkt_id_list;
-				/* free list of avail pkt ids */
-	struct tx_pktinfo	xnf_tx_pkt_info[NET_TX_RING_SIZE];
-	struct xnf_buffer_desc	*xnf_rxpkt_bufptr[XNF_MAX_RXDESCS];
+	ddi_iblock_cookie_t	xnf_icookie;
 
-	ddi_iblock_cookie_t	xnf_icookie;
-	kmutex_t		xnf_tx_buf_mutex;
-	kmutex_t		xnf_rx_buf_mutex;
-	kmutex_t		xnf_txlock;
-	kmutex_t		xnf_intrlock;
-	boolean_t		xnf_tx_pages_readonly;
-	boolean_t		xnf_need_sched;
-
-	netif_tx_front_ring_t	xnf_tx_ring;	/* tx interface struct ptr */
+	netif_tx_front_ring_t	xnf_tx_ring;
 	ddi_dma_handle_t	xnf_tx_ring_dma_handle;
 	ddi_acc_handle_t	xnf_tx_ring_dma_acchandle;
 	paddr_t			xnf_tx_ring_phys_addr;
 	grant_ref_t		xnf_tx_ring_ref;
 
-	netif_rx_front_ring_t	xnf_rx_ring;	/* rx interface struct ptr */
+	xnf_txid_t		xnf_tx_pkt_id[NET_TX_RING_SIZE];
+	uint16_t		xnf_tx_pkt_id_head;
+	kmutex_t		xnf_txlock;
+	kmutex_t		xnf_schedlock;
+	boolean_t		xnf_need_sched;
+	kcondvar_t		xnf_cv_tx_slots;
+	kmem_cache_t		*xnf_tx_buf_cache;
+
+	netif_rx_front_ring_t	xnf_rx_ring;
 	ddi_dma_handle_t	xnf_rx_ring_dma_handle;
 	ddi_acc_handle_t	xnf_rx_ring_dma_acchandle;
 	paddr_t			xnf_rx_ring_phys_addr;
 	grant_ref_t		xnf_rx_ring_ref;
 
-	uint16_t		xnf_evtchn;	/* channel to back end ctlr */
-	grant_ref_t		xnf_gref_tx_head;	/* tx grant free list */
-	grant_ref_t		xnf_gref_rx_head;	/* rx grant free list */
-	kcondvar_t		xnf_cv;
+	xnf_buf_t		*xnf_rx_pkt_info[NET_RX_RING_SIZE];
+	kmutex_t		xnf_rxlock;
+	mblk_t			*xnf_rx_head;
+	mblk_t			*xnf_rx_tail;
+	boolean_t		xnf_rx_new_buffers_posted;
+	kmem_cache_t		*xnf_buf_cache;
 
-	boolean_t		xnf_rx_hvcopy;	/* do we do HV copy? */
+	uint16_t		xnf_evtchn;
+
+	kmutex_t		xnf_gref_lock;
+	grant_ref_t		xnf_gref_head;
+
+	kcondvar_t		xnf_cv_state;
+	kcondvar_t		xnf_cv_multicast;
+	uint_t			xnf_pending_multicast;
 } xnf_t;
 
 #ifdef __cplusplus