changeset 10309:1b8c848f3840

6793813 NIU Hybrid I/O performance regression with Crossbow
author Sriharsha Basavapatna <Sriharsha.Basavapatna@Sun.COM>
date Fri, 14 Aug 2009 09:48:09 -0700
parents a0c54044a2bd
children ba87b3315737
files usr/src/uts/common/io/aggr/aggr_grp.c usr/src/uts/common/io/mac/mac.c usr/src/uts/common/io/mac/mac_datapath_setup.c usr/src/uts/common/io/nxge/nxge_hio.c usr/src/uts/common/io/nxge/nxge_hio_guest.c usr/src/uts/common/io/nxge/nxge_main.c usr/src/uts/common/io/nxge/nxge_rxdma.c usr/src/uts/common/io/nxge/nxge_send.c usr/src/uts/common/io/nxge/nxge_virtual.c usr/src/uts/common/sys/mac_client_priv.h usr/src/uts/common/sys/mac_impl.h usr/src/uts/common/sys/mac_soft_ring.h usr/src/uts/common/sys/nxge/nxge_hio.h usr/src/uts/sun4v/io/vnet.c usr/src/uts/sun4v/io/vnet_dds.c usr/src/uts/sun4v/io/vnet_gen.c usr/src/uts/sun4v/sys/vnet.h usr/src/uts/sun4v/sys/vnet_gen.h
diffstat 18 files changed, 2002 insertions(+), 486 deletions(-)
--- a/usr/src/uts/common/io/aggr/aggr_grp.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/io/aggr/aggr_grp.c	Fri Aug 14 09:48:09 2009 -0700
@@ -623,7 +623,8 @@
 	/*
 	 * Get the list of the underlying HW rings.
 	 */
-	hw_rh_cnt = mac_hwrings_get(port->lp_mch, &port->lp_hwgh, hw_rh);
+	hw_rh_cnt = mac_hwrings_get(port->lp_mch, &port->lp_hwgh, hw_rh,
+	    MAC_RING_TYPE_RX);
 
 	if (port->lp_hwgh != NULL) {
 		/*
@@ -689,7 +690,8 @@
 		goto done;
 
 	ASSERT(rx_grp->arg_gh != NULL);
-	hw_rh_cnt = mac_hwrings_get(port->lp_mch, &hwgh, hw_rh);
+	hw_rh_cnt = mac_hwrings_get(port->lp_mch, &hwgh, hw_rh,
+	    MAC_RING_TYPE_RX);
 
 	/*
 	 * If hw_rh_cnt is 0, it means that the underlying port does not
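
The aggr hunks above update existing callers for the new mac_hwrings_get() signature, which gains a mac_ring_type_t argument (implemented in mac.c below) so the same entry point can return either the client's reserved RX group and rings or the TX rings cached in its TX SRS. A minimal caller sketch, assuming a client handle my_mch; the local names here are illustrative only and not part of this changeset:

	mac_ring_handle_t	rx_rings[MAX_RINGS_PER_GROUP];
	mac_ring_handle_t	tx_rings[MAX_RINGS_PER_GROUP];
	mac_group_handle_t	rx_grp = NULL, tx_grp = NULL;
	int			rx_cnt, tx_cnt;

	/* RX: returns the reserved RX group (if any) and its rings. */
	rx_cnt = mac_hwrings_get(my_mch, &rx_grp, rx_rings, MAC_RING_TYPE_RX);

	/* TX: copies out the rings cached in the TX SRS; tx_grp is not set. */
	tx_cnt = mac_hwrings_get(my_mch, &tx_grp, tx_rings, MAC_RING_TYPE_TX);
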
--- a/usr/src/uts/common/io/mac/mac.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/io/mac/mac.c	Fri Aug 14 09:48:09 2009 -0700
@@ -1426,35 +1426,54 @@
  */
 int
 mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
-    mac_ring_handle_t *hwrh)
+    mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
 {
 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
-	flow_entry_t		*flent = mcip->mci_flent;
-	mac_group_t		*grp = flent->fe_rx_ring_group;
-	mac_ring_t		*ring;
 	int			cnt = 0;
 
-	/*
-	 * The mac client did not reserve any RX group, return directly.
-	 * This is probably because the underlying MAC does not support
-	 * any RX groups.
-	 */
-	*hwgh = NULL;
-	if (grp == NULL)
-		return (0);
-
-	/*
-	 * This RX group must be reserved by this mac client.
-	 */
-	ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
-	    (mch == (mac_client_handle_t)(MAC_RX_GROUP_ONLY_CLIENT(grp))));
-
-	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next) {
-		ASSERT(cnt < MAX_RINGS_PER_GROUP);
-		hwrh[cnt++] = (mac_ring_handle_t)ring;
+	switch (rtype) {
+	case MAC_RING_TYPE_RX: {
+		flow_entry_t	*flent = mcip->mci_flent;
+		mac_group_t	*grp;
+		mac_ring_t	*ring;
+
+		grp = flent->fe_rx_ring_group;
+		/*
+		 * The mac client did not reserve any RX group, return directly.
+		 * This is probably because the underlying MAC does not support
+		 * any groups.
+		 */
+		*hwgh = NULL;
+		if (grp == NULL)
+			return (0);
+		/*
+		 * This group must be reserved by this mac client.
+		 */
+		ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
+		    (mch == (mac_client_handle_t)
+		    (MAC_RX_GROUP_ONLY_CLIENT(grp))));
+		for (ring = grp->mrg_rings;
+		    ring != NULL; ring = ring->mr_next, cnt++) {
+			ASSERT(cnt < MAX_RINGS_PER_GROUP);
+			hwrh[cnt] = (mac_ring_handle_t)ring;
+		}
+		*hwgh = (mac_group_handle_t)grp;
+		return (cnt);
 	}
-	*hwgh = (mac_group_handle_t)grp;
-	return (cnt);
+	case MAC_RING_TYPE_TX: {
+		mac_soft_ring_set_t	*tx_srs;
+		mac_srs_tx_t		*tx;
+
+		tx_srs = MCIP_TX_SRS(mcip);
+		tx = &tx_srs->srs_tx;
+		for (; cnt < tx->st_ring_count; cnt++)
+			hwrh[cnt] = tx->st_rings[cnt];
+		return (cnt);
+	}
+	default:
+		ASSERT(B_FALSE);
+		return (-1);
+	}
 }
 
 /*
@@ -1524,6 +1543,22 @@
 	return (info->mri_poll(info->mri_driver, bytes_to_pickup));
 }
 
+/*
+ * Send packets through the selected tx ring.
+ */
+mblk_t *
+mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp)
+{
+	mac_ring_t *ring = (mac_ring_t *)rh;
+	mac_ring_info_t *info = &ring->mr_info;
+
+	ASSERT(ring->mr_type == MAC_RING_TYPE_TX);
+	ASSERT(ring->mr_state >= MR_INUSE);
+	ASSERT(info->mri_tx != NULL);
+
+	return (info->mri_tx(info->mri_driver, mp));
+}
+
 int
 mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
 {
@@ -3429,22 +3464,6 @@
 }
 
 /*
- * Send packets through a selected tx ring.
- */
-mblk_t *
-mac_ring_tx(mac_ring_handle_t rh, mblk_t *mp)
-{
-	mac_ring_t *ring = (mac_ring_t *)rh;
-	mac_ring_info_t *info = &ring->mr_info;
-
-	ASSERT(ring->mr_type == MAC_RING_TYPE_TX);
-	ASSERT(ring->mr_state >= MR_INUSE);
-	ASSERT(info->mri_tx != NULL);
-
-	return (info->mri_tx(info->mri_driver, mp));
-}
-
-/*
  * Find a ring from its index.
  */
 mac_ring_t *
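
In the hunks above, mac_ring_tx() is renamed mac_hwring_tx() and moved next to the other mac_hwring_*() client entry points, so a client driver can transmit directly on a hardware TX ring handle. A hedged usage sketch, assuming tx_rings[] was filled in by mac_hwrings_get(..., MAC_RING_TYPE_TX) and mp is the chain to send:

	mblk_t	*resid;

	resid = mac_hwring_tx(tx_rings[0], mp);
	if (resid != NULL) {
		/*
		 * The ring is flow controlled; keep the remainder and
		 * retry once the ring reports a transmit update.
		 */
	}
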
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c	Fri Aug 14 09:48:09 2009 -0700
@@ -2235,6 +2235,10 @@
 			    tx->st_group);
 			tx->st_group = NULL;
 		}
+		if (tx->st_ring_count != 0) {
+			kmem_free(tx->st_rings,
+			    sizeof (mac_ring_handle_t) * tx->st_ring_count);
+		}
 		if (tx->st_arg2 != NULL) {
 			ASSERT(tx_srs->srs_type & SRST_TX);
 			mac_release_tx_ring(tx->st_arg2);
@@ -3203,7 +3207,7 @@
 	mac_impl_t *mip = mcip->mci_mip;
 	mac_soft_ring_set_t *tx_srs;
 	int i, tx_ring_count = 0, tx_rings_reserved = 0;
-	mac_ring_handle_t *tx_ring = NULL;
+	mac_ring_handle_t *tx_rings = NULL;
 	uint32_t soft_ring_type;
 	mac_group_t *grp = NULL;
 	mac_ring_t *ring;
@@ -3221,7 +3225,7 @@
 	}
 
 	if (tx_ring_count != 0) {
-		tx_ring = kmem_zalloc(sizeof (mac_ring_handle_t) *
+		tx_rings = kmem_zalloc(sizeof (mac_ring_handle_t) *
 		    tx_ring_count, KM_SLEEP);
 	}
 
@@ -3231,8 +3235,12 @@
 	 * NIC's.
 	 */
 	if (srs_type == SRST_FLOW ||
-	    (mcip->mci_state_flags & MCIS_NO_HWRINGS) != 0)
-		goto use_default_ring;
+	    (mcip->mci_state_flags & MCIS_NO_HWRINGS) != 0) {
+		/* use default ring */
+		tx_rings[0] = (void *)mip->mi_default_tx_ring;
+		tx_rings_reserved++;
+		goto rings_assigned;
+	}
 
 	if (mcip->mci_share != NULL)
 		ring = grp->mrg_rings;
@@ -3245,8 +3253,7 @@
 	 * then each Tx ring will have a Tx-side soft ring. All
 	 * these soft rings will hang off the Tx SRS.
 	 */
-	for (i = 0, tx_rings_reserved = 0;
-	    i < tx_ring_count; i++, tx_rings_reserved++) {
+	for (i = 0; i < tx_ring_count; i++) {
 		if (mcip->mci_share != NULL) {
 			/*
 			 * The ring was already chosen and associated
@@ -3255,42 +3262,39 @@
 			 * between the share and non-share cases.
 			 */
 			ASSERT(ring != NULL);
-			tx_ring[i] = (mac_ring_handle_t)ring;
+			tx_rings[i] = (mac_ring_handle_t)ring;
 			ring = ring->mr_next;
 		} else {
-			tx_ring[i] =
+			tx_rings[i] =
 			    (mac_ring_handle_t)mac_reserve_tx_ring(mip, NULL);
-			if (tx_ring[i] == NULL)
+			if (tx_rings[i] == NULL) {
+				/*
+				 * We have run out of Tx rings. So
+				 * give the default ring too.
+				 */
+				tx_rings[i] = (void *)mip->mi_default_tx_ring;
+				tx_rings_reserved++;
 				break;
+			}
 		}
+		tx_rings_reserved++;
 	}
+
+rings_assigned:
 	if (mac_tx_serialize || (mip->mi_v12n_level & MAC_VIRT_SERIALIZE))
 		serialize = B_TRUE;
 	/*
 	 * Did we get the requested number of tx rings?
-	 * There are 3 actions we can take depending upon the number
+	 * There are 2 actions we can take depending upon the number
 	 * of tx_rings we got.
-	 * 1) If we got none, then hook up the tx_srs with the
-	 * default ring.
-	 * 2) If we got one, then get the tx_ring from the soft ring,
+	 * 1) If we got one, then get the tx_ring from the soft ring,
 	 * save it in SRS and free up the soft ring.
-	 * 3) If we got more than 1, then do the tx fanout among the
+	 * 2) If we got more than 1, then do the tx fanout among the
 	 * rings we obtained.
 	 */
-	switch (tx_rings_reserved) {
-	case 1:
-		/*
-		 * No need to allocate Tx soft rings. Tx-side soft
-		 * rings are for Tx fanout case. Just use Tx SRS.
-		 */
-		/* FALLTHRU */
-
-	case 0:
-use_default_ring:
-		if (tx_rings_reserved == 0)
-			tx->st_arg2 = (void *)mip->mi_default_tx_ring;
-		else
-			tx->st_arg2 = (void *)tx_ring[0];
+	ASSERT(tx_rings_reserved != 0);
+	if (tx_rings_reserved == 1) {
+		tx->st_arg2 = (void *)tx_rings[0];
 		/* For ring_count of 0 or 1, set the tx_mode and return */
 		if (tx_srs->srs_type & SRST_BW_CONTROL)
 			tx->st_mode = SRS_TX_BW;
@@ -3298,18 +3302,9 @@
 			tx->st_mode = SRS_TX_SERIALIZE;
 		else
 			tx->st_mode = SRS_TX_DEFAULT;
-		break;
-
-	default:
+	} else {
 		/*
 		 * We got multiple Tx rings for Tx fanout.
-		 *
-		 * cpuid of -1 is passed. This creates an unbound
-		 * worker thread. Instead the code should get CPU
-		 * binding information and pass that to
-		 * mac_soft_ring_create(). This needs to be done
-		 * in conjunction with Rx-side soft ring
-		 * bindings.
 		 */
 		soft_ring_type = ST_RING_OTH | ST_RING_TX;
 		if (tx_srs->srs_type & SRST_BW_CONTROL) {
@@ -3322,7 +3317,7 @@
 		for (i = 0; i < tx_rings_reserved; i++) {
 			(void) mac_soft_ring_create(i, 0, NULL, soft_ring_type,
 			    maxclsyspri, mcip, tx_srs, -1, NULL, mcip,
-			    (mac_resource_handle_t)tx_ring[i]);
+			    (mac_resource_handle_t)tx_rings[i]);
 		}
 		mac_srs_update_fanout_list(tx_srs);
 	}
@@ -3332,8 +3327,12 @@
 	    int, tx->st_mode, int, tx_srs->srs_oth_ring_count);
 
 	if (tx_ring_count != 0) {
-		kmem_free(tx_ring,
-		    sizeof (mac_ring_handle_t) * tx_ring_count);
+		tx->st_ring_count = tx_rings_reserved;
+		tx->st_rings = kmem_zalloc(sizeof (mac_ring_handle_t) *
+		    tx_rings_reserved, KM_SLEEP);
+		for (i = 0; i < tx->st_ring_count; i++)
+			tx->st_rings[i] = tx_rings[i];
+		kmem_free(tx_rings, sizeof (mac_ring_handle_t) * tx_ring_count);
 	}
 }
 
--- a/usr/src/uts/common/io/nxge/nxge_hio.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/io/nxge/nxge_hio.c	Fri Aug 14 09:48:09 2009 -0700
@@ -41,9 +41,6 @@
 #include <sys/nxge/nxge_txdma.h>
 #include <sys/nxge/nxge_hio.h>
 
-#define	NXGE_HIO_SHARE_MIN_CHANNELS 2
-#define	NXGE_HIO_SHARE_MAX_CHANNELS 2
-
 /*
  * External prototypes
  */
@@ -1057,23 +1054,6 @@
 		NXGE_DEBUG_MSG((nxge, HIO_CTL,
 		    "Hybrid IO-capable service domain"));
 		return (NXGE_OK);
-	} else {
-		/*
-		 * isLDOMguest(nxge) == B_TRUE
-		 */
-		nx_vio_fp_t *vio;
-		nhd->type = NXGE_HIO_TYPE_GUEST;
-
-		vio = &nhd->hio.vio;
-		vio->__register = (vio_net_resource_reg_t)
-		    modgetsymvalue("vio_net_resource_reg", 0);
-		vio->unregister = (vio_net_resource_unreg_t)
-		    modgetsymvalue("vio_net_resource_unreg", 0);
-
-		if (vio->__register == 0 || vio->unregister == 0) {
-			NXGE_ERROR_MSG((nxge, VIR_CTL, "vio_net is absent!"));
-			return (NXGE_ERROR);
-		}
 	}
 
 	return (0);
@@ -1144,12 +1124,16 @@
 static int
 nxge_hio_add_mac(void *arg, const uint8_t *mac_addr)
 {
-	nxge_ring_group_t *group = (nxge_ring_group_t *)arg;
-	p_nxge_t nxge = group->nxgep;
-	int rv;
-	nxge_hio_vr_t *vr;	/* The Virtualization Region */
+	nxge_ring_group_t	*group = (nxge_ring_group_t *)arg;
+	p_nxge_t		nxge = group->nxgep;
+	int			rv;
+	nxge_hio_vr_t		*vr;	/* The Virtualization Region */
 
 	ASSERT(group->type == MAC_RING_TYPE_RX);
+	ASSERT(group->nxgep != NULL);
+
+	if (isLDOMguest(group->nxgep))
+		return (0);
 
 	mutex_enter(nxge->genlock);
 
@@ -1174,8 +1158,7 @@
 	/*
 	 * Program the mac address for the group.
 	 */
-	if ((rv = nxge_hio_group_mac_add(nxge, group,
-	    mac_addr)) != 0) {
+	if ((rv = nxge_hio_group_mac_add(nxge, group, mac_addr)) != 0) {
 		return (rv);
 	}
 
@@ -1206,6 +1189,10 @@
 	int rv, slot;
 
 	ASSERT(group->type == MAC_RING_TYPE_RX);
+	ASSERT(group->nxgep != NULL);
+
+	if (isLDOMguest(group->nxgep))
+		return (0);
 
 	mutex_enter(nxge->genlock);
 
@@ -1253,14 +1240,16 @@
 	int			dev_gindex;
 
 	ASSERT(group->type == MAC_RING_TYPE_RX);
+	ASSERT(group->nxgep != NULL);
 
-#ifdef later
 	ASSERT(group->nxgep->nxge_mac_state == NXGE_MAC_STARTED);
-#endif
 	if (group->nxgep->nxge_mac_state != NXGE_MAC_STARTED)
 		return (ENXIO);
 
 	mutex_enter(group->nxgep->genlock);
+	if (isLDOMguest(group->nxgep))
+		goto nxge_hio_group_start_exit;
+
 	dev_gindex = group->nxgep->pt_config.hw_config.def_mac_rxdma_grpid +
 	    group->gindex;
 	rdc_grp_p = &group->nxgep->pt_config.rdc_grps[dev_gindex];
@@ -1289,9 +1278,9 @@
 
 	(void) nxge_init_fzc_rdc_tbl(group->nxgep, rdc_grp_p, rdctbl);
 
+nxge_hio_group_start_exit:
 	group->started = B_TRUE;
 	mutex_exit(group->nxgep->genlock);
-
 	return (0);
 }
 
@@ -1305,6 +1294,9 @@
 	mutex_enter(group->nxgep->genlock);
 	group->started = B_FALSE;
 
+	if (isLDOMguest(group->nxgep))
+		goto nxge_hio_group_stop_exit;
+
 	/*
 	 * Unbind the RDC table previously bound for this group.
 	 *
@@ -1314,6 +1306,7 @@
 	if (group->gindex != 0)
 		(void) nxge_fzc_rdc_tbl_unbind(group->nxgep, group->rdctbl);
 
+nxge_hio_group_stop_exit:
 	mutex_exit(group->nxgep->genlock);
 }
 
@@ -1334,20 +1327,26 @@
 		group->gindex = groupid;
 		group->sindex = 0;	/* not yet bound to a share */
 
-		dev_gindex = nxgep->pt_config.hw_config.def_mac_rxdma_grpid +
-		    groupid;
+		if (!isLDOMguest(nxgep)) {
+			dev_gindex =
+			    nxgep->pt_config.hw_config.def_mac_rxdma_grpid +
+			    groupid;
 
-		if (nxgep->pt_config.hw_config.def_mac_rxdma_grpid ==
-		    dev_gindex)
-			group->port_default_grp = B_TRUE;
+			if (nxgep->pt_config.hw_config.def_mac_rxdma_grpid ==
+			    dev_gindex)
+				group->port_default_grp = B_TRUE;
+
+			infop->mgi_count =
+			    nxgep->pt_config.rdc_grps[dev_gindex].max_rdcs;
+		} else {
+			infop->mgi_count = NXGE_HIO_SHARE_MAX_CHANNELS;
+		}
 
 		infop->mgi_driver = (mac_group_driver_t)group;
 		infop->mgi_start = nxge_hio_group_start;
 		infop->mgi_stop = nxge_hio_group_stop;
 		infop->mgi_addmac = nxge_hio_add_mac;
 		infop->mgi_remmac = nxge_hio_rem_mac;
-		infop->mgi_count =
-		    nxgep->pt_config.rdc_grps[dev_gindex].max_rdcs;
 		break;
 
 	case MAC_RING_TYPE_TX:
--- a/usr/src/uts/common/io/nxge/nxge_hio_guest.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/io/nxge/nxge_hio_guest.c	Fri Aug 14 09:48:09 2009 -0700
@@ -35,46 +35,9 @@
 #include <sys/nxge/nxge_fzc.h>
 #include <sys/nxge/nxge_rxdma.h>
 #include <sys/nxge/nxge_txdma.h>
-
 #include <sys/nxge/nxge_hio.h>
 
 /*
- * nxge_hio_unregister
- *
- *	Unregister with the VNET module.
- *
- * Arguments:
- * 	nxge
- *
- * Notes:
- *	We must uninitialize all DMA channels associated with the VR, too.
- *
- *	We're assuming that the channels will be disabled & unassigned
- *	in the service domain, after we're done here.
- *
- * Context:
- *	Guest domain
- */
-void
-nxge_hio_unregister(
-	nxge_t *nxge)
-{
-	nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio;
-
-	if (nhd == 0) {
-		return;
-	}
-
-#if defined(sun4v)
-	/* Unregister with vNet. */
-	if (nhd->hio.vio.unregister) {
-		if (nxge->hio_vr)
-			(*nhd->hio.vio.unregister)(nxge->hio_vr->vhp);
-	}
-#endif
-}
-
-/*
  * nxge_guest_regs_map
  *
  *	Map in a guest domain's register set(s).
@@ -95,8 +58,7 @@
 };
 
 int
-nxge_guest_regs_map(
-	nxge_t *nxge)
+nxge_guest_regs_map(nxge_t *nxge)
 {
 	dev_regs_t 	*regs;
 	off_t		regsize;
@@ -211,31 +173,22 @@
 int
 nxge_hio_vr_add(nxge_t *nxge)
 {
-	extern mac_callbacks_t nxge_m_callbacks;
-
-	nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio;
-	nxge_hio_vr_t *vr;
-	nxge_hio_dc_t *dc;
-
-	int *reg_val;
-	uint_t reg_len;
-	uint8_t vr_index;
+	extern nxge_status_t	nxge_mac_register(p_nxge_t);
 
-	nxhv_vr_fp_t *fp;
-	uint64_t vr_address, vr_size;
-	uint32_t cookie;
-
-	nxhv_dc_fp_t *tx, *rx;
-	uint64_t tx_map, rx_map;
-
-	uint64_t hv_rv;
-
-	/* Variables needed to register with vnet. */
-	mac_register_t *mac_info;
-	ether_addr_t mac_addr;
-	nx_vio_fp_t *vio;
-
-	int i;
+	nxge_hio_data_t		*nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio;
+	nxge_hio_vr_t		*vr;
+	nxge_hio_dc_t		*dc;
+	int			*reg_val;
+	uint_t			reg_len;
+	uint8_t			vr_index;
+	nxhv_vr_fp_t		*fp;
+	uint64_t		vr_address, vr_size;
+	uint32_t		cookie;
+	nxhv_dc_fp_t		*tx, *rx;
+	uint64_t		tx_map, rx_map;
+	uint64_t		hv_rv;
+	int			i;
+	nxge_status_t		status;
 
 	NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_vr_add"));
 
@@ -384,40 +337,13 @@
 		}
 	}
 
-	/*
-	 * Register with vnet.
-	 */
-	if ((mac_info = mac_alloc(MAC_VERSION)) == NULL)
-		return (NXGE_ERROR);
-
-	mac_info->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
-	mac_info->m_driver = nxge;
-	mac_info->m_dip = nxge->dip;
-	mac_info->m_src_addr = KMEM_ZALLOC(MAXMACADDRLEN, KM_SLEEP);
-	mac_info->m_dst_addr = KMEM_ZALLOC(MAXMACADDRLEN, KM_SLEEP);
-	(void) memset(mac_info->m_src_addr, 0xff, sizeof (MAXMACADDRLEN));
-	mac_info->m_callbacks = &nxge_m_callbacks;
-	mac_info->m_min_sdu = 0;
-	mac_info->m_max_sdu = NXGE_MTU_DEFAULT_MAX -
-	    sizeof (struct ether_header) - ETHERFCSL - 4;
-
-	(void) memset(&mac_addr, 0xff, sizeof (mac_addr));
-
-	/* Register with vio_net. */
-	vio = &nhd->hio.vio;
-	if ((*vio->__register)(mac_info, VIO_NET_RES_HYBRID,
-	    nxge->hio_mac_addr, mac_addr, &vr->vhp, &vio->cb)) {
-		NXGE_DEBUG_MSG((nxge, HIO_CTL, "HIO registration() failed"));
-		KMEM_FREE(mac_info->m_src_addr, MAXMACADDRLEN);
-		KMEM_FREE(mac_info->m_dst_addr, MAXMACADDRLEN);
-		mac_free(mac_info);
-		return (NXGE_ERROR);
+	status = nxge_mac_register(nxge);
+	if (status != NXGE_OK) {
+		cmn_err(CE_WARN, "nxge(%d): nxge_mac_register failed\n",
+		    nxge->instance);
+		return (status);
 	}
 
-	KMEM_FREE(mac_info->m_src_addr, MAXMACADDRLEN);
-	KMEM_FREE(mac_info->m_dst_addr, MAXMACADDRLEN);
-	mac_free(mac_info);
-
 	nxge->hio_vr = vr;	/* For faster lookups. */
 
 	NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_vr_add"));
--- a/usr/src/uts/common/io/nxge/nxge_main.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/io/nxge/nxge_main.c	Fri Aug 14 09:48:09 2009 -0700
@@ -272,14 +272,11 @@
 static int nxge_m_multicst(void *, boolean_t, const uint8_t *);
 static int nxge_m_promisc(void *, boolean_t);
 static void nxge_m_ioctl(void *, queue_t *, mblk_t *);
-static nxge_status_t nxge_mac_register(p_nxge_t);
+nxge_status_t nxge_mac_register(p_nxge_t);
 static int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr,
 	int slot, int rdctbl, boolean_t usetbl);
 void nxge_mmac_kstat_update(p_nxge_t nxgep, int slot,
 	boolean_t factory);
-#if defined(sun4v)
-extern mblk_t *nxge_m_tx(void *arg, mblk_t *mp);
-#endif
 
 static void nxge_m_getfactaddr(void *, uint_t, uint8_t *);
 static	boolean_t nxge_m_getcapab(void *, mac_capab_t, void *);
@@ -630,11 +627,6 @@
 	if (nxgep->niu_type != N2_NIU) {
 		nxge_set_pci_replay_timeout(nxgep);
 	}
-#if defined(sun4v)
-	if (isLDOMguest(nxgep)) {
-		nxge_m_callbacks.mc_tx = nxge_m_tx;
-	}
-#endif
 
 #if defined(sun4v)
 	/* This is required by nxge_hio_init(), which follows. */
@@ -961,11 +953,7 @@
 
 	(void) nxge_link_monitor(nxgep, LINK_MONITOR_STOP);
 
-	if (isLDOMguest(nxgep)) {
-		if (nxgep->nxge_mac_state == NXGE_MAC_STARTED)
-			nxge_m_stop((void *)nxgep);
-		nxge_hio_unregister(nxgep);
-	} else if (nxgep->mach && (status = mac_unregister(nxgep->mach)) != 0) {
+	if (nxgep->mach && (status = mac_unregister(nxgep->mach)) != 0) {
 		NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL,
 		    "<== nxge_detach status = 0x%08X", status));
 		return (DDI_FAILURE);
@@ -4294,10 +4282,13 @@
 	case MAC_CAPAB_MULTIFACTADDR: {
 		mac_capab_multifactaddr_t	*mfacp = cap_data;
 
-		mutex_enter(nxgep->genlock);
-		mfacp->mcm_naddr = nxgep->nxge_mmac_info.num_factory_mmac;
-		mfacp->mcm_getaddr = nxge_m_getfactaddr;
-		mutex_exit(nxgep->genlock);
+		if (!isLDOMguest(nxgep)) {
+			mutex_enter(nxgep->genlock);
+			mfacp->mcm_naddr =
+			    nxgep->nxge_mmac_info.num_factory_mmac;
+			mfacp->mcm_getaddr = nxge_m_getfactaddr;
+			mutex_exit(nxgep->genlock);
+		}
 		break;
 	}
 
@@ -4325,34 +4316,68 @@
 
 		mutex_enter(nxgep->genlock);
 		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
-			cap_rings->mr_group_type = MAC_GROUP_TYPE_DYNAMIC;
-			cap_rings->mr_rnum = p_cfgp->max_rdcs;
-			cap_rings->mr_rget = nxge_fill_ring;
-			cap_rings->mr_gnum = p_cfgp->max_rdc_grpids;
-			cap_rings->mr_gget = nxge_hio_group_get;
-			cap_rings->mr_gaddring = nxge_group_add_ring;
-			cap_rings->mr_gremring = nxge_group_rem_ring;
+			if (isLDOMguest(nxgep))  {
+				cap_rings->mr_group_type =
+				    MAC_GROUP_TYPE_STATIC;
+				cap_rings->mr_rnum =
+				    NXGE_HIO_SHARE_MAX_CHANNELS;
+				cap_rings->mr_rget = nxge_fill_ring;
+				cap_rings->mr_gnum = 1;
+				cap_rings->mr_gget = nxge_hio_group_get;
+				cap_rings->mr_gaddring = NULL;
+				cap_rings->mr_gremring = NULL;
+			} else {
+				/*
+				 * Service Domain.
+				 */
+				cap_rings->mr_group_type =
+				    MAC_GROUP_TYPE_DYNAMIC;
+				cap_rings->mr_rnum = p_cfgp->max_rdcs;
+				cap_rings->mr_rget = nxge_fill_ring;
+				cap_rings->mr_gnum = p_cfgp->max_rdc_grpids;
+				cap_rings->mr_gget = nxge_hio_group_get;
+				cap_rings->mr_gaddring = nxge_group_add_ring;
+				cap_rings->mr_gremring = nxge_group_rem_ring;
+			}
 
 			NXGE_DEBUG_MSG((nxgep, RX_CTL,
 			    "==> nxge_m_getcapab: rx nrings[%d] ngroups[%d]",
 			    p_cfgp->max_rdcs, p_cfgp->max_rdc_grpids));
 		} else {
-			cap_rings->mr_group_type = MAC_GROUP_TYPE_DYNAMIC;
-			cap_rings->mr_rnum = p_cfgp->tdc.count;
-			cap_rings->mr_rget = nxge_fill_ring;
-			if (isLDOMservice(nxgep)) {
-				/* share capable */
-				/* Do not report the default ring: hence -1 */
+			/*
+			 * TX Rings.
+			 */
+			if (isLDOMguest(nxgep)) {
+				cap_rings->mr_group_type =
+				    MAC_GROUP_TYPE_STATIC;
+				cap_rings->mr_rnum =
+				    NXGE_HIO_SHARE_MAX_CHANNELS;
+				cap_rings->mr_rget = nxge_fill_ring;
+				cap_rings->mr_gnum = 0;
+				cap_rings->mr_gget = NULL;
+				cap_rings->mr_gaddring = NULL;
+				cap_rings->mr_gremring = NULL;
+			} else {
+				/*
+				 * Service Domain.
+				 */
+				cap_rings->mr_group_type =
+				    MAC_GROUP_TYPE_DYNAMIC;
+				cap_rings->mr_rnum = p_cfgp->tdc.count;
+				cap_rings->mr_rget = nxge_fill_ring;
+
+				/*
+				 * Share capable.
+				 *
+				 * Do not report the default group: hence -1
+				 */
 				cap_rings->mr_gnum =
 				    NXGE_MAX_TDC_GROUPS / nxgep->nports - 1;
-			} else {
-				cap_rings->mr_gnum = 0;
+				cap_rings->mr_gget = nxge_hio_group_get;
+				cap_rings->mr_gaddring = nxge_group_add_ring;
+				cap_rings->mr_gremring = nxge_group_rem_ring;
 			}
 
-			cap_rings->mr_gget = nxge_hio_group_get;
-			cap_rings->mr_gaddring = nxge_group_add_ring;
-			cap_rings->mr_gremring = nxge_group_rem_ring;
-
 			NXGE_DEBUG_MSG((nxgep, TX_CTL,
 			    "==> nxge_m_getcapab: tx rings # of rings %d",
 			    p_cfgp->tdc.count));
@@ -6372,7 +6397,7 @@
 	NXGE_DEBUG_MSG((nxgep, INT_CTL, "<== nxge_intrs_disable"));
 }
 
-static nxge_status_t
+nxge_status_t
 nxge_mac_register(p_nxge_t nxgep)
 {
 	mac_register_t *macp;
@@ -6386,7 +6411,13 @@
 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
 	macp->m_driver = nxgep;
 	macp->m_dip = nxgep->dip;
-	macp->m_src_addr = nxgep->ouraddr.ether_addr_octet;
+	if (!isLDOMguest(nxgep)) {
+		macp->m_src_addr = nxgep->ouraddr.ether_addr_octet;
+	} else {
+		macp->m_src_addr = KMEM_ZALLOC(MAXMACADDRLEN, KM_SLEEP);
+		macp->m_dst_addr = KMEM_ZALLOC(MAXMACADDRLEN, KM_SLEEP);
+		(void) memset(macp->m_src_addr, 0xff, MAXMACADDRLEN);
+	}
 	macp->m_callbacks = &nxge_m_callbacks;
 	macp->m_min_sdu = 0;
 	nxgep->mac.default_mtu = nxgep->mac.maxframesize -
@@ -6395,7 +6426,12 @@
 	macp->m_margin = VLAN_TAGSZ;
 	macp->m_priv_props = nxge_priv_props;
 	macp->m_priv_prop_count = NXGE_MAX_PRIV_PROPS;
-	macp->m_v12n = MAC_VIRT_HIO | MAC_VIRT_LEVEL1 | MAC_VIRT_SERIALIZE;
+	if (isLDOMguest(nxgep)) {
+		macp->m_v12n = MAC_VIRT_LEVEL1 | MAC_VIRT_SERIALIZE;
+	} else {
+		macp->m_v12n = MAC_VIRT_HIO | MAC_VIRT_LEVEL1 | \
+		    MAC_VIRT_SERIALIZE;
+	}
 
 	NXGE_DEBUG_MSG((nxgep, MAC_CTL,
 	    "==> nxge_mac_register: instance %d "
@@ -6406,6 +6442,10 @@
 	    NXGE_EHEADER_VLAN_CRC));
 
 	status = mac_register(macp, &nxgep->mach);
+	if (isLDOMguest(nxgep)) {
+		KMEM_FREE(macp->m_src_addr, MAXMACADDRLEN);
+		KMEM_FREE(macp->m_dst_addr, MAXMACADDRLEN);
+	}
 	mac_free(macp);
 
 	if (status != 0) {
--- a/usr/src/uts/common/io/nxge/nxge_rxdma.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/io/nxge/nxge_rxdma.c	Fri Aug 14 09:48:09 2009 -0700
@@ -1756,7 +1756,7 @@
 	uint8_t			channel;
 	npi_handle_t		handle;
 	rx_dma_ctl_stat_t	cs;
-	p_rx_rcr_ring_t		rcr_ring;
+	p_rx_rcr_ring_t		rcrp;
 	mblk_t			*mp = NULL;
 
 	if (ldvp == NULL) {
@@ -1789,7 +1789,7 @@
 	/*
 	 * Get the ring to enable us to process packets.
 	 */
-	rcr_ring = nxgep->rx_rcr_rings->rcr_rings[ldvp->vdma_index];
+	rcrp = nxgep->rx_rcr_rings->rcr_rings[ldvp->vdma_index];
 
 	/*
 	 * The RCR ring lock must be held when packets
@@ -1799,7 +1799,7 @@
 	 * (will cause fatal errors such as rcrincon bit set)
 	 * and the setting of the poll_flag.
 	 */
-	MUTEX_ENTER(&rcr_ring->lock);
+	MUTEX_ENTER(&rcrp->lock);
 
 	/*
 	 * Get the control and status for this channel.
@@ -1840,12 +1840,12 @@
 				    mgm.value);
 			}
 		}
-		MUTEX_EXIT(&rcr_ring->lock);
+		MUTEX_EXIT(&rcrp->lock);
 		return (DDI_INTR_CLAIMED);
 	}
 
-	ASSERT(rcr_ring->ldgp == ldgp);
-	ASSERT(rcr_ring->ldvp == ldvp);
+	ASSERT(rcrp->ldgp == ldgp);
+	ASSERT(rcrp->ldvp == ldvp);
 
 	RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG, channel, &cs.value);
 
@@ -1856,8 +1856,8 @@
 	    cs.bits.hdw.rcrto,
 	    cs.bits.hdw.rcrthres));
 
-	if (rcr_ring->poll_flag == 0) {
-		mp = nxge_rx_pkts(nxgep, rcr_ring, cs, -1);
+	if (!rcrp->poll_flag) {
+		mp = nxge_rx_pkts(nxgep, rcrp, cs, -1);
 	}
 
 	/* error events. */
@@ -1873,27 +1873,34 @@
 	 * these two edge triggered bits.
 	 */
 	cs.value &= RX_DMA_CTL_STAT_WR1C;
-	cs.bits.hdw.mex = rcr_ring->poll_flag ? 0 : 1;
+	cs.bits.hdw.mex = rcrp->poll_flag ? 0 : 1;
 	RXDMA_REG_WRITE64(handle, RX_DMA_CTL_STAT_REG, channel,
 	    cs.value);
 
 	/*
 	 * If the polling mode is enabled, disable the interrupt.
 	 */
-	if (rcr_ring->poll_flag) {
+	if (rcrp->poll_flag) {
 		NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL,
 		    "==> nxge_rx_intr: rdc %d ldgp $%p ldvp $%p "
 		    "(disabling interrupts)", channel, ldgp, ldvp));
+
 		/*
 		 * Disarm this logical group if this is a single device
 		 * group.
 		 */
 		if (ldgp->nldvs == 1) {
-			ldgimgm_t mgm;
-			mgm.value = 0;
-			mgm.bits.ldw.arm = 0;
-			NXGE_REG_WR64(handle,
-			    LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value);
+			if (isLDOMguest(nxgep)) {
+				ldgp->arm = B_FALSE;
+				nxge_hio_ldgimgn(nxgep, ldgp);
+			} else {
+				ldgimgm_t mgm;
+				mgm.value = 0;
+				mgm.bits.ldw.arm = 0;
+				NXGE_REG_WR64(handle,
+				    LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg),
+				    mgm.value);
+			}
 		}
 	} else {
 		/*
@@ -1920,24 +1927,11 @@
 		    "==> nxge_rx_intr: rdc %d ldgp $%p "
 		    "exiting ISR (and call mac_rx_ring)", channel, ldgp));
 	}
-	MUTEX_EXIT(&rcr_ring->lock);
+	MUTEX_EXIT(&rcrp->lock);
 
 	if (mp != NULL) {
-		if (!isLDOMguest(nxgep))
-			mac_rx_ring(nxgep->mach, rcr_ring->rcr_mac_handle, mp,
-			    rcr_ring->rcr_gen_num);
-#if defined(sun4v)
-		else {			/* isLDOMguest(nxgep) */
-			nxge_hio_data_t *nhd = (nxge_hio_data_t *)
-			    nxgep->nxge_hw_p->hio;
-			nx_vio_fp_t *vio = &nhd->hio.vio;
-
-			if (vio->cb.vio_net_rx_cb) {
-				(*vio->cb.vio_net_rx_cb)
-				    (nxgep->hio_vr->vhp, mp);
-			}
-		}
-#endif
+		mac_rx_ring(nxgep->mach, rcrp->rcr_mac_handle, mp,
+		    rcrp->rcr_gen_num);
 	}
 	NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_intr: DDI_INTR_CLAIMED"));
 	return (DDI_INTR_CLAIMED);
@@ -2720,6 +2714,7 @@
 	uint32_t		channel;
 
 	if (ring_handle == NULL) {
+		ASSERT(ring_handle != NULL);
 		return (0);
 	}
 
@@ -2760,6 +2755,7 @@
 	uint32_t		channel;
 
 	if (ring_handle == NULL) {
+		ASSERT(ring_handle != NULL);
 		return (0);
 	}
 
@@ -2816,12 +2812,18 @@
 		    "==> nxge_disable_poll: rdc %d ldgp $%p (enable intr)",
 		    ringp->rdc, ldgp));
 		if (ldgp->nldvs == 1) {
-			ldgimgm_t	mgm;
-			mgm.value = 0;
-			mgm.bits.ldw.arm = 1;
-			mgm.bits.ldw.timer = ldgp->ldg_timer;
-			NXGE_REG_WR64(handle,
-			    LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value);
+			if (isLDOMguest(nxgep)) {
+				ldgp->arm = B_TRUE;
+				nxge_hio_ldgimgn(nxgep, ldgp);
+			} else {
+				ldgimgm_t	mgm;
+				mgm.value = 0;
+				mgm.bits.ldw.arm = 1;
+				mgm.bits.ldw.timer = ldgp->ldg_timer;
+				NXGE_REG_WR64(handle,
+				    LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg),
+				    mgm.value);
+			}
 		}
 		ringp->poll_flag = 0;
 	}
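
The rxdma hunks above arm and disarm the logical device group in two places, using a hypervisor-assisted path (nxge_hio_ldgimgn()) for guest domains and a direct LDGIMGN register write otherwise. The pattern could be captured by a helper along these lines; this is purely illustrative, the helper name is invented, and p_nxge_ldg_t is assumed to be the type of the ldgp locals used above:

	static void
	nxge_ldg_set_arm(p_nxge_t nxgep, npi_handle_t handle,
	    p_nxge_ldg_t ldgp, boolean_t arm)
	{
		if (isLDOMguest(nxgep)) {
			/* Guest domains go through the HIO/hypervisor path. */
			ldgp->arm = arm;
			nxge_hio_ldgimgn(nxgep, ldgp);
		} else {
			ldgimgm_t	mgm;

			mgm.value = 0;
			mgm.bits.ldw.arm = arm ? 1 : 0;
			if (arm)
				mgm.bits.ldw.timer = ldgp->ldg_timer;
			NXGE_REG_WR64(handle,
			    LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value);
		}
	}
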
--- a/usr/src/uts/common/io/nxge/nxge_send.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/io/nxge/nxge_send.c	Fri Aug 14 09:48:09 2009 -0700
@@ -66,20 +66,9 @@
 	(void) nxge_txdma_reclaim(ring->nxgep, ring, 0);
 	MUTEX_EXIT(&ring->lock);
 
-	if (!isLDOMguest(ring->nxgep) && !ring->tx_ring_offline)
+	if (!ring->tx_ring_offline) {
 		mac_tx_ring_update(ring->nxgep->mach, ring->tx_ring_handle);
-#if defined(sun4v)
-	else {
-		nxge_hio_data_t *nhd =
-		    (nxge_hio_data_t *)ring->nxgep->nxge_hw_p->hio;
-		nx_vio_fp_t *vio = &nhd->hio.vio;
-
-		/* Call back vnet. */
-		if (vio->cb.vio_net_tx_update) {
-			(*vio->cb.vio_net_tx_update)(ring->nxgep->hio_vr->vhp);
-		}
 	}
-#endif
 }
 
 static void
@@ -141,65 +130,6 @@
 	return ((mblk_t *)NULL);
 }
 
-#if defined(sun4v)
-
-/*
- * Hashing policy for load balancing over the set of TX rings
- * available to the driver.
- */
-static uint8_t nxge_tx_hash_policy = MAC_PKT_HASH_L4;
-
-/*
- * nxge_m_tx() is needed for Hybrid I/O operation of the vnet in
- *	the guest domain.  See CR 6778758 for long term solution.
- *
- *	The guest domain driver will for now hash the packet
- *	to pick a DMA channel from the only group it has group 0.
- */
-
-mblk_t *
-nxge_m_tx(void *arg, mblk_t *mp)
-{
-	p_nxge_t		nxgep = (p_nxge_t)arg;
-	mblk_t			*next;
-	uint64_t		rindex;
-	p_tx_ring_t		tx_ring_p;
-	int			status;
-
-	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_m_tx"));
-
-	/*
-	 * Hash to pick a ring from Group 0, the only TX group
-	 * for a guest domain driver.
-	 */
-	rindex = mac_pkt_hash(DL_ETHER, mp, nxge_tx_hash_policy, B_TRUE);
-	rindex = rindex % nxgep->pt_config.tdc_grps[0].max_tdcs;
-
-	/*
-	 * Get the ring handle.
-	 */
-	tx_ring_p = nxgep->tx_rings->rings[rindex];
-
-	while (mp != NULL) {
-		next = mp->b_next;
-		mp->b_next = NULL;
-
-		status = nxge_start(nxgep, tx_ring_p, mp);
-		if (status != 0) {
-			mp->b_next = next;
-			nxge_tx_ring_dispatch(tx_ring_p);
-			return (mp);
-		}
-
-		mp = next;
-	}
-
-	NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_m_tx"));
-	return ((mblk_t *)NULL);
-}
-
-#endif
-
 int
 nxge_start(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, p_mblk_t mp)
 {
--- a/usr/src/uts/common/io/nxge/nxge_virtual.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/io/nxge/nxge_virtual.c	Fri Aug 14 09:48:09 2009 -0700
@@ -3994,6 +3994,9 @@
 	p_dma_cfgp = &nxgep->pt_config;
 	p_cfgp = &p_dma_cfgp->hw_config;
 
+	if (isLDOMguest(nxgep))
+		return (ringidx);
+
 	for (i = 0; i < groupid; i++) {
 		rdc_grp_p =
 		    &p_dma_cfgp->rdc_grps[p_cfgp->def_mac_rxdma_grpid + i];
--- a/usr/src/uts/common/sys/mac_client_priv.h	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/sys/mac_client_priv.h	Fri Aug 14 09:48:09 2009 -0700
@@ -120,7 +120,7 @@
 extern void	mac_rx_client_restart(mac_client_handle_t);
 extern void	mac_srs_perm_quiesce(mac_client_handle_t, boolean_t);
 extern int	mac_hwrings_get(mac_client_handle_t, mac_group_handle_t *,
-		    mac_ring_handle_t *);
+		    mac_ring_handle_t *, mac_ring_type_t);
 extern void	mac_hwring_setup(mac_ring_handle_t, mac_resource_handle_t);
 extern void	mac_hwring_teardown(mac_ring_handle_t);
 extern int	mac_hwring_disable_intr(mac_ring_handle_t);
--- a/usr/src/uts/common/sys/mac_impl.h	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/sys/mac_impl.h	Fri Aug 14 09:48:09 2009 -0700
@@ -262,7 +262,7 @@
 #define	MAC_RING_TX_DEFAULT(mip, mp)			\
 	((mip->mi_default_tx_ring == NULL) ?		\
 	mip->mi_tx(mip->mi_driver, mp) :		\
-	mac_ring_tx(mip->mi_default_tx_ring, mp))
+	mac_hwring_tx(mip->mi_default_tx_ring, mp))
 
 #define	MAC_TX(mip, ring, mp, mcip) {					\
 	/*								\
@@ -275,7 +275,7 @@
 	    (ring == NULL))						\
 		mp = MAC_RING_TX_DEFAULT(mip, mp);			\
 	else								\
-		mp = mac_ring_tx(ring, mp);				\
+		mp = mac_hwring_tx(ring, mp);				\
 }
 
 /* mci_tx_flag */
@@ -585,7 +585,7 @@
 extern int mac_group_remmac(mac_group_t *, const uint8_t *);
 extern int mac_rx_group_add_flow(mac_client_impl_t *, flow_entry_t *,
     mac_group_t *);
-extern mblk_t *mac_ring_tx(mac_ring_handle_t, mblk_t *);
+extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *);
 extern mac_ring_t *mac_reserve_tx_ring(mac_impl_t *, mac_ring_t *);
 extern void mac_release_tx_ring(mac_ring_handle_t);
 extern mac_group_t *mac_reserve_tx_group(mac_impl_t *, mac_share_handle_t);
--- a/usr/src/uts/common/sys/mac_soft_ring.h	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/sys/mac_soft_ring.h	Fri Aug 14 09:48:09 2009 -0700
@@ -131,6 +131,9 @@
 	void		*st_arg1;
 	void		*st_arg2;
 	mac_group_t	*st_group;	/* TX group for share */
+	uint32_t	st_ring_count;	/* no. of tx rings */
+	mac_ring_handle_t	*st_rings;
+
 	boolean_t	st_woken_up;
 
 	/*
--- a/usr/src/uts/common/sys/nxge/nxge_hio.h	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/common/sys/nxge/nxge_hio.h	Fri Aug 14 09:48:09 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -35,9 +35,6 @@
 #include <nxge_ipp.h>
 #include <nxge_fflp.h>
 #include <sys/mac_provider.h>
-#if defined(sun4v)
-#include <sys/vnet_res.h>
-#endif
 
 #define	isLDOMservice(nxge) \
 	(nxge->environs == SOLARIS_SERVICE_DOMAIN)
@@ -46,6 +43,9 @@
 #define	isLDOMs(nxge) \
 	(isLDOMservice(nxge) || isLDOMguest(nxge))
 
+#define	NXGE_HIO_SHARE_MIN_CHANNELS	2
+#define	NXGE_HIO_SHARE_MAX_CHANNELS	2
+
 /* ------------------------------------------------------------------ */
 typedef uint8_t nx_rdc_t;
 typedef uint8_t nx_tdc_t;
@@ -88,37 +88,19 @@
 	dc_getinfo	getinfo;
 } nxhv_dc_fp_t;
 
-#if defined(sun4v)
-typedef struct {
-	vio_net_resource_reg_t	__register;
-	vio_net_resource_unreg_t unregister;
-
-	vio_net_callbacks_t	cb;
-
-} nx_vio_fp_t;
-#endif
-
 typedef struct {
 	boolean_t	ldoms;
-
 	nxhv_vr_fp_t	vr;
 	nxhv_dc_fp_t	tx;
 	nxhv_dc_fp_t	rx;
-
-#if defined(sun4v)
-	nx_vio_fp_t	vio;
-#endif
-
 } nxhv_fp_t;
 
 /* ------------------------------------------------------------------ */
 #define	NXGE_VR_SR_MAX		8 /* There are 8 subregions (SR). */
 
 typedef enum {
-
 	NXGE_HIO_TYPE_SERVICE,	/* We are a service domain driver. */
 	NXGE_HIO_TYPE_GUEST	/* We are a guest domain driver. */
-
 } nxge_hio_type_t;
 
 typedef enum {
@@ -130,7 +112,6 @@
 	FUNC2_VIR = 0x5000000,
 	FUNC3_MNT = 0x6000000,
 	FUNC3_VIR = 0x7000000
-
 } vr_base_address_t;
 
 #define	VR_STEP		0x2000000
@@ -146,7 +127,6 @@
 	FUNC3_VIR0,
 	FUNC3_VIR1,
 	FUNC_VIR_MAX
-
 } vr_region_t;
 
 typedef enum {
@@ -159,13 +139,11 @@
 	VP_CHANNEL_6,
 	VP_CHANNEL_7,
 	VP_CHANNEL_MAX
-
 } vp_channel_t;
 
 typedef enum {
 	VP_BOUND_TX = 1,
 	VP_BOUND_RX
-
 } vpc_type_t;
 
 #define	VP_VC_OFFSET(channel)	(channel << 10)
@@ -254,9 +232,6 @@
 	ether_addr_t	altmac;	/* The alternate MAC address. */
 	int		slot;	/* According to nxge_m_mmac_add(). */
 
-#if defined(sun4v)
-	vio_net_handle_t vhp;	/* The handle given to us by the vnet. */
-#endif
 	nxge_grp_t	rx_group;
 	nxge_grp_t	tx_group;
 
@@ -273,7 +248,6 @@
 	uint64_t	map;	/* Currently unused */
 
 	int		vector;	/* The DDI vector number (index) */
-
 } hio_ldg_t;
 
 /*
--- a/usr/src/uts/sun4v/io/vnet.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/sun4v/io/vnet.c	Fri Aug 14 09:48:09 2009 -0700
@@ -40,6 +40,8 @@
 #include <sys/dlpi.h>
 #include <net/if.h>
 #include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
 #include <sys/mac_ether.h>
 #include <sys/ddi.h>
 #include <sys/sunddi.h>
@@ -75,11 +77,38 @@
 #ifdef	VNET_IOC_DEBUG
 static void vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp);
 #endif
+static boolean_t vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data);
+static void vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
+	const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle);
+static void vnet_get_group(void *arg, mac_ring_type_t type, const int index,
+	mac_group_info_t *infop, mac_group_handle_t handle);
+static int vnet_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
+static void vnet_rx_ring_stop(mac_ring_driver_t rdriver);
+static int vnet_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
+static void vnet_tx_ring_stop(mac_ring_driver_t rdriver);
+static int vnet_ring_enable_intr(void *arg);
+static int vnet_ring_disable_intr(void *arg);
+static mblk_t *vnet_rx_poll(void *arg, int bytes_to_pickup);
+static int vnet_addmac(void *arg, const uint8_t *mac_addr);
+static int vnet_remmac(void *arg, const uint8_t *mac_addr);
 
 /* vnet internal functions */
 static int vnet_unattach(vnet_t *vnetp);
+static void vnet_ring_grp_init(vnet_t *vnetp);
+static void vnet_ring_grp_uninit(vnet_t *vnetp);
 static int vnet_mac_register(vnet_t *);
 static int vnet_read_mac_address(vnet_t *vnetp);
+static int vnet_bind_vgenring(vnet_res_t *vresp);
+static void vnet_unbind_vgenring(vnet_res_t *vresp);
+static int vnet_bind_hwrings(vnet_t *vnetp);
+static void vnet_unbind_hwrings(vnet_t *vnetp);
+static int vnet_bind_rings(vnet_res_t *vresp);
+static void vnet_unbind_rings(vnet_res_t *vresp);
+static int vnet_hio_stat(void *, uint_t, uint64_t *);
+static int vnet_hio_start(void *);
+static void vnet_hio_stop(void *);
+static void vnet_hio_notify_cb(void *arg, mac_notify_type_t type);
+mblk_t *vnet_hio_tx(void *, mblk_t *);
 
 /* Forwarding database (FDB) routines */
 static void vnet_fdb_create(vnet_t *vnetp);
@@ -98,6 +127,8 @@
 static void vnet_dispatch_res_task(vnet_t *vnetp);
 static void vnet_res_start_task(void *arg);
 static void vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err);
+static void vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp);
+static vnet_res_t *vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp);
 
 /* Exported to vnet_gen */
 int vnet_mtu_update(vnet_t *vnetp, uint32_t mtu);
@@ -112,15 +143,21 @@
 
 /* Exported to vnet_dds */
 int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg);
+int vnet_hio_mac_init(vnet_t *vnetp, char *ifname);
+void vnet_hio_mac_cleanup(vnet_t *vnetp);
 
 /* Externs that are imported from vnet_gen */
 extern int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip,
     const uint8_t *macaddr, void **vgenhdl);
+extern int vgen_init_mdeg(void *arg);
 extern void vgen_uninit(void *arg);
 extern int vgen_dds_tx(void *arg, void *dmsg);
 extern void vgen_mod_init(void);
 extern int vgen_mod_cleanup(void);
 extern void vgen_mod_fini(void);
+extern int vgen_enable_intr(void *arg);
+extern int vgen_disable_intr(void *arg);
+extern mblk_t *vgen_poll(void *arg, int bytes_to_pickup);
 
 /* Externs that are imported from vnet_dds */
 extern void vdds_mod_init(void);
@@ -131,6 +168,9 @@
 extern void vdds_cleanup_hybrid_res(void *arg);
 extern void vdds_cleanup_hio(vnet_t *vnetp);
 
+/* Externs imported from mac_impl */
+extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *);
+
 #define	DRV_NAME	"vnet"
 #define	VNET_FDBE_REFHOLD(p)						\
 {									\
@@ -145,9 +185,9 @@
 }
 
 #ifdef	VNET_IOC_DEBUG
-#define	VNET_M_CALLBACK_FLAGS	(MC_IOCTL)
+#define	VNET_M_CALLBACK_FLAGS	(MC_IOCTL | MC_GETCAPAB)
 #else
-#define	VNET_M_CALLBACK_FLAGS	(0)
+#define	VNET_M_CALLBACK_FLAGS	(MC_GETCAPAB)
 #endif
 
 static mac_callbacks_t vnet_m_callbacks = {
@@ -157,9 +197,23 @@
 	vnet_m_stop,
 	vnet_m_promisc,
 	vnet_m_multicst,
-	vnet_m_unicst,
-	vnet_m_tx,
+	NULL,	/* m_unicst entry must be NULL while rx rings are exposed */
+	NULL,	/* m_tx entry must be NULL while tx rings are exposed */
 	vnet_m_ioctl,
+	vnet_m_capab,
+	NULL
+};
+
+static mac_callbacks_t vnet_hio_res_callbacks = {
+	0,
+	vnet_hio_stat,
+	vnet_hio_start,
+	vnet_hio_stop,
+	NULL,
+	NULL,
+	NULL,
+	vnet_hio_tx,
+	NULL,
 	NULL,
 	NULL
 };
@@ -176,6 +230,9 @@
 uint32_t vnet_ldcwd_txtimeout = VNET_LDCWD_TXTIMEOUT;  /* tx timeout in msec */
 uint32_t vnet_ldc_mtu = VNET_LDC_MTU;		/* ldc mtu */
 
+/* Configure tx serialization in mac layer for the vnet device */
+boolean_t vnet_mac_tx_serialize = B_TRUE;
+
 /*
  * Set this to non-zero to enable additional internal receive buffer pools
  * based on the MTU of the device for better performance at the cost of more
@@ -206,6 +263,11 @@
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 };
 
+/* mac_open() retry delay in usec */
+uint32_t vnet_mac_open_delay = 100;	/* 0.1 ms */
+
+/* max # of mac_open() retries */
+uint32_t vnet_mac_open_retries = 100;
 
 /*
  * Property names
@@ -375,6 +437,9 @@
 	rw_init(&vnetp->vsw_fp_rw, NULL, RW_DRIVER, NULL);
 	attach_progress |= AST_vnet_alloc;
 
+	vnet_ring_grp_init(vnetp);
+	attach_progress |= AST_ring_init;
+
 	status = vdds_init(vnetp);
 	if (status != 0) {
 		goto vnet_attach_fail;
@@ -419,10 +484,19 @@
 	attach_progress |= AST_vnet_list;
 
 	/*
-	 * Initialize the generic vnet plugin which provides
-	 * communication via sun4v LDC (logical domain channel) based
-	 * resources. It will register the LDC resources as and when
-	 * they become available.
+	 * Initialize the generic vnet plugin which provides communication via
+	 * sun4v LDC (logical domain channel) based resources. This involves 2
+	 * steps; first, vgen_init() is invoked to read the various properties
+	 * of the vnet device from its MD node (including its mtu, which is
+	 * needed for mac_register()) and obtain a handle to the vgen layer.
+	 * After mac_register() is done and we have a mac handle, we then
+	 * invoke vgen_init_mdeg(), which registers with the MD event
+	 * generator (mdeg) framework to allow LDC resource notifications.
+	 * Note: this sequence also allows us to report the correct default #
+	 * of pseudo rings (2 TX and 3 RX) in vnet_m_capab(), which gets invoked
+	 * in the context of mac_register(); and avoids conflicting with
+	 * dynamic pseudo rx rings which get added/removed as a result of mdeg
+	 * events in vgen.
 	 */
 	status = vgen_init(vnetp, reg, vnetp->dip,
 	    (uint8_t *)vnetp->curr_macaddr, &vnetp->vgenhdl);
@@ -432,15 +506,19 @@
 	}
 	attach_progress |= AST_vgen_init;
 
-	/* register with MAC layer */
 	status = vnet_mac_register(vnetp);
 	if (status != DDI_SUCCESS) {
 		goto vnet_attach_fail;
 	}
 	vnetp->link_state = LINK_STATE_UNKNOWN;
-
 	attach_progress |= AST_macreg;
 
+	status = vgen_init_mdeg(vnetp->vgenhdl);
+	if (status != DDI_SUCCESS) {
+		goto vnet_attach_fail;
+	}
+	attach_progress |= AST_init_mdeg;
+
 	vnetp->attach_progress = attach_progress;
 
 	DBG1(NULL, "instance(%d) exit\n", instance);
@@ -503,21 +581,25 @@
 	attach_progress = vnetp->attach_progress;
 
 	/*
-	 * Unregister from the gldv3 subsystem. This can fail, in particular
-	 * if there are still any open references to this mac device; in which
-	 * case we just return failure without continuing to detach further.
+	 * Disable the mac device in the gldv3 subsystem. This can fail, in
+	 * particular if there are still any open references to this mac
+	 * device; in which case we just return failure without continuing to
+	 * detach further.
+	 * If it succeeds, we then invoke vgen_uninit() which should unregister
+	 * any pseudo rings registered with the mac layer. Note we keep the
+	 * AST_macreg flag on, so we can unregister with the mac layer at
+	 * the end of this routine.
 	 */
 	if (attach_progress & AST_macreg) {
-		if (mac_unregister(vnetp->mh) != 0) {
+		if (mac_disable(vnetp->mh) != 0) {
 			return (1);
 		}
-		attach_progress &= ~AST_macreg;
 	}
 
 	/*
-	 * Now that we have unregistered from gldv3, we must finish all other
-	 * steps and successfully return from this function; otherwise we will
-	 * end up leaving the device in a broken/unusable state.
+	 * Now that we have disabled the device, we must finish all other steps
+	 * and successfully return from this function; otherwise we will end up
+	 * leaving the device in a broken/unusable state.
 	 *
 	 * First, release any hybrid resources assigned to this vnet device.
 	 */
@@ -530,9 +612,10 @@
 	 * Uninit vgen. This stops further mdeg callbacks to this vnet
 	 * device and/or its ports; and detaches any existing ports.
 	 */
-	if (attach_progress & AST_vgen_init) {
+	if (attach_progress & (AST_vgen_init|AST_init_mdeg)) {
 		vgen_uninit(vnetp->vgenhdl);
 		attach_progress &= ~AST_vgen_init;
+		attach_progress &= ~AST_init_mdeg;
 	}
 
 	/* Destroy the taskq. */
@@ -563,6 +646,17 @@
 		attach_progress &= ~AST_vnet_list;
 	}
 
+	if (attach_progress & AST_ring_init) {
+		vnet_ring_grp_uninit(vnetp);
+		attach_progress &= ~AST_ring_init;
+	}
+
+	if (attach_progress & AST_macreg) {
+		VERIFY(mac_unregister(vnetp->mh) == 0);
+		vnetp->mh = NULL;
+		attach_progress &= ~AST_macreg;
+	}
+
 	if (attach_progress & AST_vnet_alloc) {
 		rw_destroy(&vnetp->vrwlock);
 		rw_destroy(&vnetp->vsw_fp_rw);
@@ -683,8 +777,9 @@
  * external hosts.
  */
 mblk_t *
-vnet_m_tx(void *arg, mblk_t *mp)
+vnet_tx_ring_send(void *arg, mblk_t *mp)
 {
+	vnet_pseudo_tx_ring_t	*tx_ringp;
 	vnet_t			*vnetp;
 	vnet_res_t		*vresp;
 	mblk_t			*next;
@@ -694,8 +789,10 @@
 	boolean_t		is_unicast;
 	boolean_t		is_pvid;	/* non-default pvid ? */
 	boolean_t		hres;		/* Hybrid resource ? */
-
-	vnetp = (vnet_t *)arg;
+	void			*tx_arg;
+
+	tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
+	vnetp = (vnet_t *)tx_ringp->vnetp;
 	DBG1(vnetp, "enter\n");
 	ASSERT(mp != NULL);
 
@@ -790,10 +887,14 @@
 					}
 
 				}
+
+				macp = &vresp->macreg;
+				tx_arg = tx_ringp;
+			} else {
+				macp = &vresp->macreg;
+				tx_arg = macp->m_driver;
 			}
-
-			macp = &vresp->macreg;
-			resid_mp = macp->m_callbacks->mc_tx(macp->m_driver, mp);
+			resid_mp = macp->m_callbacks->mc_tx(tx_arg, mp);
 
 			/* tx done; now release ref on fdb entry */
 			VNET_FDBE_REFRELE(vresp);
@@ -848,6 +949,124 @@
 	return (0);
 }
 
+static void
+vnet_ring_grp_init(vnet_t *vnetp)
+{
+	vnet_pseudo_rx_group_t	*rx_grp;
+	vnet_pseudo_rx_ring_t	*rx_ringp;
+	vnet_pseudo_tx_group_t	*tx_grp;
+	vnet_pseudo_tx_ring_t	*tx_ringp;
+	int			i;
+
+	tx_grp = &vnetp->tx_grp[0];
+	tx_ringp = kmem_zalloc(sizeof (vnet_pseudo_tx_ring_t) *
+	    VNET_NUM_PSEUDO_TXRINGS, KM_SLEEP);
+	for (i = 0; i < VNET_NUM_PSEUDO_TXRINGS; i++) {
+		tx_ringp[i].state |= VNET_TXRING_SHARED;
+	}
+	tx_grp->rings = tx_ringp;
+	tx_grp->ring_cnt = VNET_NUM_PSEUDO_TXRINGS;
+
+	rx_grp = &vnetp->rx_grp[0];
+	rx_grp->max_ring_cnt = MAX_RINGS_PER_GROUP;
+	rw_init(&rx_grp->lock, NULL, RW_DRIVER, NULL);
+	rx_ringp = kmem_zalloc(sizeof (vnet_pseudo_rx_ring_t) *
+	    rx_grp->max_ring_cnt, KM_SLEEP);
+
+	/*
+	 * Set up the first 3 pseudo RX rings that are reserved:
+	 * 1 for the LDC resource to the vswitch + 2 for the Hybrid RX rings.
+	 */
+	rx_ringp[0].state |= VNET_RXRING_INUSE|VNET_RXRING_LDC_SERVICE;
+	rx_ringp[0].index = 0;
+	rx_ringp[1].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
+	rx_ringp[1].index = 1;
+	rx_ringp[2].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
+	rx_ringp[2].index = 2;
+
+	rx_grp->ring_cnt = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
+	rx_grp->rings = rx_ringp;
+
+	for (i = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
+	    i < rx_grp->max_ring_cnt; i++) {
+		rx_ringp = &rx_grp->rings[i];
+		rx_ringp->state = VNET_RXRING_FREE;
+		rx_ringp->index = i;
+	}
+}
+
+static void
+vnet_ring_grp_uninit(vnet_t *vnetp)
+{
+	vnet_pseudo_rx_group_t	*rx_grp;
+	vnet_pseudo_tx_group_t	*tx_grp;
+
+	tx_grp = &vnetp->tx_grp[0];
+	if (tx_grp->rings != NULL) {
+		ASSERT(tx_grp->ring_cnt == VNET_NUM_PSEUDO_TXRINGS);
+		kmem_free(tx_grp->rings, sizeof (vnet_pseudo_tx_ring_t) *
+		    tx_grp->ring_cnt);
+		tx_grp->rings = NULL;
+	}
+
+	rx_grp = &vnetp->rx_grp[0];
+	if (rx_grp->rings != NULL) {
+		ASSERT(rx_grp->max_ring_cnt == MAX_RINGS_PER_GROUP);
+		ASSERT(rx_grp->ring_cnt == VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
+		kmem_free(rx_grp->rings, sizeof (vnet_pseudo_rx_ring_t) *
+		    rx_grp->max_ring_cnt);
+		rx_grp->rings = NULL;
+	}
+}
+
+static vnet_pseudo_rx_ring_t *
+vnet_alloc_pseudo_rx_ring(vnet_t *vnetp)
+{
+	vnet_pseudo_rx_group_t  *rx_grp;
+	vnet_pseudo_rx_ring_t	*rx_ringp;
+	int			index;
+
+	rx_grp = &vnetp->rx_grp[0];
+	WRITE_ENTER(&rx_grp->lock);
+
+	if (rx_grp->ring_cnt == rx_grp->max_ring_cnt) {
+		/* no rings available */
+		RW_EXIT(&rx_grp->lock);
+		return (NULL);
+	}
+
+	for (index = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
+	    index < rx_grp->max_ring_cnt; index++) {
+		rx_ringp = &rx_grp->rings[index];
+		if (rx_ringp->state == VNET_RXRING_FREE) {
+			rx_ringp->state |= VNET_RXRING_INUSE;
+			rx_grp->ring_cnt++;
+			break;
+		}
+	}
+
+	RW_EXIT(&rx_grp->lock);
+	return (rx_ringp);
+}
+
+static void
+vnet_free_pseudo_rx_ring(vnet_t *vnetp, vnet_pseudo_rx_ring_t *ringp)
+{
+	vnet_pseudo_rx_group_t  *rx_grp;
+
+	ASSERT(ringp->index >= VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
+	rx_grp = &vnetp->rx_grp[0];
+	WRITE_ENTER(&rx_grp->lock);
+
+	if (ringp->state != VNET_RXRING_FREE) {
+		ringp->state = VNET_RXRING_FREE;
+		ringp->handle = NULL;
+		rx_grp->ring_cnt--;
+	}
+
+	RW_EXIT(&rx_grp->lock);
+}
+
 /* wrapper function for mac_register() */
 static int
 vnet_mac_register(vnet_t *vnetp)
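
The allocator above backs the dynamic pseudo RX rings described later in vnet_m_capab(): each LDC resource that connects to a peer vnet gets its own pseudo RX ring, which vnet_rx() then passes to mac_rx_ring() together with its generation number. A hedged sketch of that binding step; the helper name is hypothetical and the registration of the new ring with the mac layer is not shown:

	static int
	vnet_bind_ldc_rxring(vnet_t *vnetp, vnet_res_t *vresp)
	{
		vnet_pseudo_rx_ring_t	*rx_ringp;

		rx_ringp = vnet_alloc_pseudo_rx_ring(vnetp);
		if (rx_ringp == NULL) {
			/* all MAX_RINGS_PER_GROUP slots are in use */
			return (ENOSPC);
		}

		/* vnet_rx() delivers this resource's packets on this ring */
		vresp->rx_ringp = rx_ringp;
		return (0);
	}
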
@@ -867,6 +1086,15 @@
 	macp->m_margin = VLAN_TAGSZ;
 
 	/*
+	 * MAC_VIRT_SERIALIZE flag is needed while hybridIO is enabled to
+	 * workaround tx lock contention issues in nxge.
+	 */
+	macp->m_v12n = MAC_VIRT_LEVEL1;
+	if (vnet_mac_tx_serialize == B_TRUE) {
+		macp->m_v12n |= MAC_VIRT_SERIALIZE;
+	}
+
+	/*
 	 * Finally, we're ready to register ourselves with the MAC layer
 	 * interface; if this succeeds, we're all ready to start()
 	 */
@@ -1116,42 +1344,57 @@
 static void
 vnet_rx(vio_net_handle_t vrh, mblk_t *mp)
 {
-	vnet_res_t	*vresp = (vnet_res_t *)vrh;
-	vnet_t		*vnetp = vresp->vnetp;
+	vnet_res_t		*vresp = (vnet_res_t *)vrh;
+	vnet_t			*vnetp = vresp->vnetp;
+	vnet_pseudo_rx_ring_t	*ringp;
 
 	if ((vnetp == NULL) || (vnetp->mh == 0)) {
 		freemsgchain(mp);
 		return;
 	}
 
-	/*
-	 * Packets received over a hybrid resource need additional processing
-	 * to remove the tag, for the pvid case. The underlying resource is
-	 * not aware of the vnet's pvid and thus packets are received with the
-	 * vlan tag in the header; unlike packets that are received over a ldc
-	 * channel in which case the peer vnet/vsw would have already removed
-	 * the tag.
-	 */
-	if (vresp->type == VIO_NET_RES_HYBRID &&
-	    vnetp->pvid != vnetp->default_vlan_id) {
-
-		vnet_rx_frames_untag(vnetp->pvid, &mp);
-		if (mp == NULL) {
-			return;
-		}
-	}
-
-	mac_rx(vnetp->mh, NULL, mp);
+	ringp = vresp->rx_ringp;
+	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
 }
 
 void
 vnet_tx_update(vio_net_handle_t vrh)
 {
-	vnet_res_t *vresp = (vnet_res_t *)vrh;
-	vnet_t *vnetp = vresp->vnetp;
-
-	if ((vnetp != NULL) && (vnetp->mh != NULL)) {
-		mac_tx_update(vnetp->mh);
+	vnet_res_t		*vresp = (vnet_res_t *)vrh;
+	vnet_t			*vnetp = vresp->vnetp;
+	vnet_pseudo_tx_ring_t	*tx_ringp;
+	vnet_pseudo_tx_group_t	*tx_grp;
+	int			i;
+
+	if (vnetp == NULL || vnetp->mh == NULL) {
+		return;
+	}
+
+	/*
+	 * Currently, the tx hwring API (used to access rings that belong to
+	 * a Hybrid IO resource) does not provide a per-ring flow control
+	 * update; also the pseudo rings are shared by the ports/ldcs in the
+	 * vgen layer. Thus we can't figure out which pseudo ring is being
+	 * re-enabled for transmits. To work around this, when we get a tx
+	 * restart notification from below, we simply propagate that to all
+	 * the tx pseudo rings registered with the mac layer above.
+	 *
+	 * There are a couple of side effects with this approach, but they are
+	 * not harmful, as outlined below:
+	 *
+	 * A) We might send an invalid ring_update() for a ring that is not
+	 * really flow controlled. This will not have any effect in the mac
+	 * layer and packets will continue to be transmitted on that ring.
+	 *
+	 * B) We might end up clearing the flow control in the mac layer for
+	 * a ring that is still flow controlled in the underlying resource.
+	 * This will result in the mac layer restarting transmit, only to be
+	 * flow controlled again on that ring.
+	 */
+	tx_grp = &vnetp->tx_grp[0];
+	for (i = 0; i < tx_grp->ring_cnt; i++) {
+		tx_ringp = &tx_grp->rings[i];
+		mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
 	}
 }
 
@@ -1233,8 +1476,8 @@
     ether_addr_t local_macaddr, ether_addr_t rem_macaddr, vio_net_handle_t *vhp,
     vio_net_callbacks_t *vcb)
 {
-	vnet_t	*vnetp;
-	vnet_res_t *vresp;
+	vnet_t		*vnetp;
+	vnet_res_t	*vresp;
 
 	vresp = kmem_zalloc(sizeof (vnet_res_t), KM_SLEEP);
 	ether_copy(local_macaddr, vresp->local_macaddr);
@@ -1260,11 +1503,7 @@
 					    vnetp->instance);
 				}
 			}
-
-			WRITE_ENTER(&vnetp->vrwlock);
-			vresp->nextp = vnetp->vres_list;
-			vnetp->vres_list = vresp;
-			RW_EXIT(&vnetp->vrwlock);
+			vnet_add_resource(vnetp, vresp);
 			break;
 		}
 		vnetp = vnetp->nextp;
@@ -1281,6 +1520,14 @@
 	vcb->vio_net_tx_update = vnet_tx_update;
 	vcb->vio_net_report_err = vnet_handle_res_err;
 
+	/* Bind the resource to pseudo ring(s) */
+	if (vnet_bind_rings(vresp) != 0) {
+		(void) vnet_rem_resource(vnetp, vresp);
+		vnet_hio_destroy_kstats(vresp->ksp);
+		KMEM_FREE(vresp);
+		return (1);
+	}
+
 	/* Dispatch a task to start resources */
 	vnet_dispatch_res_task(vnetp);
 	return (0);
@@ -1294,8 +1541,6 @@
 {
 	vnet_res_t	*vresp = (vnet_res_t *)vhp;
 	vnet_t		*vnetp = vresp->vnetp;
-	vnet_res_t	*vrp;
-	kstat_t		*ksp = NULL;
 
 	DBG1(NULL, "Resource Registerig hdl=0x%p", vhp);
 
@@ -1306,7 +1551,29 @@
 	 */
 	vnet_fdbe_del(vnetp, vresp);
 
+	vnet_unbind_rings(vresp);
+
 	/* Now remove the resource from the list */
+	(void) vnet_rem_resource(vnetp, vresp);
+
+	vnet_hio_destroy_kstats(vresp->ksp);
+	KMEM_FREE(vresp);
+}
+
+static void
+vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp)
+{
+	WRITE_ENTER(&vnetp->vrwlock);
+	vresp->nextp = vnetp->vres_list;
+	vnetp->vres_list = vresp;
+	RW_EXIT(&vnetp->vrwlock);
+}
+
+static vnet_res_t *
+vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp)
+{
+	vnet_res_t	*vrp;
+
 	WRITE_ENTER(&vnetp->vrwlock);
 	if (vresp == vnetp->vres_list) {
 		vnetp->vres_list = vresp->nextp;
@@ -1320,15 +1587,12 @@
 			vrp = vrp->nextp;
 		}
 	}
-
-	ksp = vresp->ksp;
-	vresp->ksp = NULL;
-
 	vresp->vnetp = NULL;
 	vresp->nextp = NULL;
+
 	RW_EXIT(&vnetp->vrwlock);
-	vnet_hio_destroy_kstats(ksp);
-	KMEM_FREE(vresp);
+
+	return (vresp);
 }
 
 /*
@@ -1710,6 +1974,1024 @@
 	}
 }
 
+static boolean_t
+vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data)
+{
+	vnet_t	*vnetp = (vnet_t *)arg;
+
+	if (vnetp == NULL) {
+		return (0);
+	}
+
+	switch (cap) {
+
+	case MAC_CAPAB_RINGS: {
+
+		mac_capab_rings_t *cap_rings = cap_data;
+		/*
+		 * Rings Capability Notes:
+		 * We advertise rings to make use of the rings framework in
+		 * gldv3 mac layer, to improve the performance. This is
+		 * specifically needed when a Hybrid resource (with multiple
+		 * tx/rx hardware rings) is assigned to a vnet device. We also
+		 * leverage this for the normal case when no Hybrid resource is
+		 * assigned.
+		 *
+		 * Ring Allocation:
+		 * - TX path:
+		 * We expose a pseudo ring group with 2 pseudo tx rings (as
+		 * currently HybridIO exports only 2 rings). In the normal case,
+		 * transmit traffic that comes down to the driver through the
+		 * mri_tx (vnet_tx_ring_send()) entry point goes through the
+		 * distributed switching algorithm in vnet and gets transmitted
+		 * over a port/LDC in the vgen layer to either the vswitch or a
+		 * peer vnet. If and when a Hybrid resource is assigned to the
+		 * vnet, we obtain the tx ring information of the Hybrid device
+		 * (nxge) and map the pseudo rings 1:1 to the 2 hw tx rings.
+		 * Traffic being sent over the Hybrid resource by the mac layer
+		 * gets spread across both hw rings, as they are mapped to the
+		 * 2 pseudo tx rings in vnet.
+		 *
+		 * - RX path:
+		 * We expose a pseudo ring group with 3 pseudo rx rings (static
+		 * rings) initially. The first (default) pseudo rx ring is
+		 * reserved for the resource that connects to the vswitch
+		 * service. The next 2 rings are reserved for a Hybrid resource
+		 * that may be assigned to the vnet device. If and when a
+		 * Hybrid resource is assigned to the vnet, we obtain the rx
+		 * ring information of the Hybrid device (nxge) and map these
+		 * pseudo rings 1:1 to the 2 hw rx rings. For each additional
+		 * resource that connects to a peer vnet, we dynamically
+		 * allocate a pseudo rx ring and map it to that resource, when
+		 * the resource gets added; and the pseudo rx ring is
+		 * dynamically registered with the upper mac layer. We do the
+		 * reverse and unregister the ring with the mac layer when
+		 * the resource gets removed.
+		 *
+		 * Synchronization notes:
+		 * We don't need any lock to protect members of ring structure,
+		 * specifically ringp->hw_rh, in either the TX or the RX ring,
+		 * as explained below.
+		 * - TX ring:
+		 * ring->hw_rh is initialized only when a Hybrid resource is
+		 * associated; and gets referenced only in vnet_hio_tx(). The
+		 * Hybrid resource itself is available in fdb only after tx
+		 * hwrings are found and mapped; i.e., in vio_net_resource_reg()
+		 * we call vnet_bind_rings() first and then call
+		 * vnet_start_resources() which adds an entry to fdb. For
+		 * traffic going over LDC resources, we don't reference
+		 * ring->hw_rh at all.
+		 * - RX ring:
+		 * For rings mapped to Hybrid resource ring->hw_rh is
+		 * initialized and only then do we add the rx callback for
+		 * the underlying Hybrid resource; we disable callbacks before
+		 * we unmap ring->hw_rh. For rings mapped to LDC resources, we
+		 * stop the rx callbacks (in vgen) before we remove ring->hw_rh
+		 * (vio_net_resource_unreg()).
+		 */
+
+		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
+			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+
+			/*
+			 * The ring_cnt for rx grp is initialized in
+			 * vnet_ring_grp_init(). Later, the ring_cnt gets
+			 * updated dynamically whenever LDC resources are added
+			 * or removed.
+			 */
+			cap_rings->mr_rnum = vnetp->rx_grp[0].ring_cnt;
+			cap_rings->mr_rget = vnet_get_ring;
+
+			cap_rings->mr_gnum = VNET_NUM_PSEUDO_GROUPS;
+			cap_rings->mr_gget = vnet_get_group;
+			cap_rings->mr_gaddring = NULL;
+			cap_rings->mr_gremring = NULL;
+		} else {
+			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+
+			/*
+			 * The ring_cnt for tx grp is initialized in
+			 * vnet_ring_grp_init() and remains constant, as we
+			 * do not support dynamic tx rings for now.
+			 */
+			cap_rings->mr_rnum = vnetp->tx_grp[0].ring_cnt;
+			cap_rings->mr_rget = vnet_get_ring;
+
+			/*
+			 * Transmit rings are not grouped; i.e., the number of
+			 * transmit ring groups advertised should be set to 0.
+			 */
+			cap_rings->mr_gnum = 0;
+
+			cap_rings->mr_gget = vnet_get_group;
+			cap_rings->mr_gaddring = NULL;
+			cap_rings->mr_gremring = NULL;
+		}
+		return (B_TRUE);
+
+	}
+
+	default:
+		break;
+
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Callback function for the MAC layer to get ring information.
+ */
+static void
+vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
+    const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle)
+{
+	vnet_t	*vnetp = arg;
+
+	switch (rtype) {
+
+	case MAC_RING_TYPE_RX: {
+
+		vnet_pseudo_rx_group_t	*rx_grp;
+		vnet_pseudo_rx_ring_t	*rx_ringp;
+		mac_intr_t		*mintr;
+
+		/* We advertised only one RX group */
+		ASSERT(g_index == 0);
+		rx_grp = &vnetp->rx_grp[g_index];
+
+		/* Check the current # of rings in the rx group */
+		ASSERT((r_index >= 0) && (r_index < rx_grp->max_ring_cnt));
+
+		/* Get the ring based on the index */
+		rx_ringp = &rx_grp->rings[r_index];
+
+		rx_ringp->handle = r_handle;
+		/*
+		 * Note: we don't need to save the incoming r_index in rx_ring,
+		 * as vnet_ring_grp_init() would have initialized the index for
+		 * each ring in the array.
+		 */
+		rx_ringp->grp = rx_grp;
+		rx_ringp->vnetp = vnetp;
+
+		mintr = &infop->mri_intr;
+		mintr->mi_handle = (mac_intr_handle_t)rx_ringp;
+		mintr->mi_enable = (mac_intr_enable_t)vnet_ring_enable_intr;
+		mintr->mi_disable = (mac_intr_disable_t)vnet_ring_disable_intr;
+
+		infop->mri_driver = (mac_ring_driver_t)rx_ringp;
+		infop->mri_start = vnet_rx_ring_start;
+		infop->mri_stop = vnet_rx_ring_stop;
+
+		/* Set the poll function, as this is an rx ring */
+		infop->mri_poll = vnet_rx_poll;
+
+		break;
+	}
+
+	case MAC_RING_TYPE_TX: {
+		vnet_pseudo_tx_group_t	*tx_grp;
+		vnet_pseudo_tx_ring_t	*tx_ringp;
+
+		/*
+		 * No need to check grp index; mac layer passes -1 for it.
+		 */
+		tx_grp = &vnetp->tx_grp[0];
+
+		/* Check the # of rings in the tx group */
+		ASSERT((r_index >= 0) && (r_index < tx_grp->ring_cnt));
+
+		/* Get the ring based on the index */
+		tx_ringp = &tx_grp->rings[r_index];
+
+		tx_ringp->handle = r_handle;
+		tx_ringp->index = r_index;
+		tx_ringp->grp = tx_grp;
+		tx_ringp->vnetp = vnetp;
+
+		infop->mri_driver = (mac_ring_driver_t)tx_ringp;
+		infop->mri_start = vnet_tx_ring_start;
+		infop->mri_stop = vnet_tx_ring_stop;
+
+		/* Set the transmit function, as this is a tx ring */
+		infop->mri_tx = vnet_tx_ring_send;
+
+		break;
+	}
+
+	default:
+		break;
+	}
+}
+
+/*
+ * Callback function for the MAC layer to get group information.
+ */
+static void
+vnet_get_group(void *arg, mac_ring_type_t type, const int index,
+	mac_group_info_t *infop, mac_group_handle_t handle)
+{
+	vnet_t	*vnetp = (vnet_t *)arg;
+
+	switch (type) {
+
+	case MAC_RING_TYPE_RX:
+	{
+		vnet_pseudo_rx_group_t	*rx_grp;
+
+		/* We advertised only one RX group */
+		ASSERT(index == 0);
+
+		rx_grp = &vnetp->rx_grp[index];
+		rx_grp->handle = handle;
+		rx_grp->index = index;
+		rx_grp->vnetp = vnetp;
+
+		infop->mgi_driver = (mac_group_driver_t)rx_grp;
+		infop->mgi_start = NULL;
+		infop->mgi_stop = NULL;
+		infop->mgi_addmac = vnet_addmac;
+		infop->mgi_remmac = vnet_remmac;
+		infop->mgi_count = rx_grp->ring_cnt;
+
+		break;
+	}
+
+	case MAC_RING_TYPE_TX:
+	{
+		vnet_pseudo_tx_group_t	*tx_grp;
+
+		/* We advertised only one TX group */
+		ASSERT(index == 0);
+
+		tx_grp = &vnetp->tx_grp[index];
+		tx_grp->handle = handle;
+		tx_grp->index = index;
+		tx_grp->vnetp = vnetp;
+
+		infop->mgi_driver = (mac_group_driver_t)tx_grp;
+		infop->mgi_start = NULL;
+		infop->mgi_stop = NULL;
+		infop->mgi_addmac = NULL;
+		infop->mgi_remmac = NULL;
+		infop->mgi_count = VNET_NUM_PSEUDO_TXRINGS;
+
+		break;
+	}
+
+	default:
+		break;
+
+	}
+}
+
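+/*
+ * mri_start entry point of a pseudo rx ring. For a ring backed by an LDC
+ * resource (or a Hybrid ring that is not bound yet), simply mark the ring
+ * started; if a Hybrid hwring is already bound, also start the hwring.
+ */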
+static int
+vnet_rx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
+{
+	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
+	int			err;
+
+	/*
+	 * If this ring is mapped to a LDC resource, simply mark the state to
+	 * indicate the ring is started and return.
+	 */
+	if ((rx_ringp->state &
+	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
+		rx_ringp->gen_num = mr_gen_num;
+		rx_ringp->state |= VNET_RXRING_STARTED;
+		return (0);
+	}
+
+	ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
+
+	/*
+	 * This must be a ring reserved for a hwring. If the hwring is not
+	 * bound yet, simply mark the state to indicate the ring is started and
+	 * return. If and when a hybrid resource is activated for this vnet
+	 * device, we will bind the hwring and start it then. If a hwring is
+	 * already bound, start it now.
+	 */
+	if (rx_ringp->hw_rh == NULL) {
+		rx_ringp->gen_num = mr_gen_num;
+		rx_ringp->state |= VNET_RXRING_STARTED;
+		return (0);
+	}
+
+	err = mac_hwring_start(rx_ringp->hw_rh);
+	if (err == 0) {
+		rx_ringp->gen_num = mr_gen_num;
+		rx_ringp->state |= VNET_RXRING_STARTED;
+	} else {
+		err = ENXIO;
+	}
+
+	return (err);
+}
+
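+/*
+ * mri_stop entry point of a pseudo rx ring. Stop the underlying Hybrid
+ * hwring if one is bound, then mark the pseudo ring stopped.
+ */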
+static void
+vnet_rx_ring_stop(mac_ring_driver_t arg)
+{
+	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
+
+	/*
+	 * If this ring is mapped to a LDC resource, simply mark the state to
+	 * indicate the ring is now stopped and return.
+	 */
+	if ((rx_ringp->state &
+	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
+		rx_ringp->state &= ~VNET_RXRING_STARTED;
+		return;
+	}
+
+	ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
+
+	/*
+	 * This must be a ring reserved for a hwring. If the hwring is not
+	 * bound yet, simply mark the state to indicate the ring is stopped and
+	 * return. If a hwring is already bound, stop it now.
+	 */
+	if (rx_ringp->hw_rh == NULL) {
+		rx_ringp->state &= ~VNET_RXRING_STARTED;
+		return;
+	}
+
+	mac_hwring_stop(rx_ringp->hw_rh);
+	rx_ringp->state &= ~VNET_RXRING_STARTED;
+}
+
+/* ARGSUSED */
+static int
+vnet_tx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
+{
+	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
+
+	tx_ringp->state |= VNET_TXRING_STARTED;
+	return (0);
+}
+
+static void
+vnet_tx_ring_stop(mac_ring_driver_t arg)
+{
+	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
+
+	tx_ringp->state &= ~VNET_TXRING_STARTED;
+}
+
+/*
+ * Disable polling for a ring and enable its interrupt.
+ */
+static int
+vnet_ring_enable_intr(void *arg)
+{
+	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
+	vnet_res_t		*vresp;
+
+	if (rx_ringp->hw_rh == NULL) {
+		/*
+		 * Ring enable intr func is being invoked, but the ring is
+		 * not bound to any underlying resource? This must be a ring
+		 * reserved for a Hybrid resource and no such resource has been
+		 * assigned to this vnet device yet. We simply return success.
+		 */
+		ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
+		return (0);
+	}
+
+	/*
+	 * The rx ring has been bound to either a LDC or a Hybrid resource.
+	 * Call the appropriate function to enable interrupts for the ring.
+	 */
+	if (rx_ringp->state & VNET_RXRING_HYBRID) {
+		return (mac_hwring_enable_intr(rx_ringp->hw_rh));
+	} else {
+		vresp = (vnet_res_t *)rx_ringp->hw_rh;
+		return (vgen_enable_intr(vresp->macreg.m_driver));
+	}
+}
+
+/*
+ * Enable polling for a ring and disable its interrupt.
+ */
+static int
+vnet_ring_disable_intr(void *arg)
+{
+	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
+	vnet_res_t		*vresp;
+
+	if (rx_ringp->hw_rh == NULL) {
+		/*
+		 * Ring disable intr func is being invoked, but the ring is
+		 * not bound to any underlying resource? This must be a ring
+		 * reserved for a Hybrid resource and no such resource has been
+		 * assigned to this vnet device yet. We simply return success.
+		 */
+		ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
+		return (0);
+	}
+
+	/*
+	 * The rx ring has been bound to either a LDC or a Hybrid resource.
+	 * Call the appropriate function to disable interrupts for the ring.
+	 */
+	if (rx_ringp->state & VNET_RXRING_HYBRID) {
+		return (mac_hwring_disable_intr(rx_ringp->hw_rh));
+	} else {
+		vresp = (vnet_res_t *)rx_ringp->hw_rh;
+		return (vgen_disable_intr(vresp->macreg.m_driver));
+	}
+}
+
+/*
+ * Poll 'bytes_to_pickup' bytes of message from the rx ring.
+ */
+static mblk_t *
+vnet_rx_poll(void *arg, int bytes_to_pickup)
+{
+	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
+	mblk_t			*mp = NULL;
+	vnet_res_t		*vresp;
+	vnet_t			*vnetp = rx_ringp->vnetp;
+
+	if (rx_ringp->hw_rh == NULL) {
+		return (NULL);
+	}
+
+	if (rx_ringp->state & VNET_RXRING_HYBRID) {
+		mp = mac_hwring_poll(rx_ringp->hw_rh, bytes_to_pickup);
+		/*
+		 * Packets received over a hybrid resource need additional
+		 * processing to remove the tag, for the pvid case. The
+		 * underlying resource is not aware of the vnet's pvid and thus
+		 * packets are received with the vlan tag in the header; unlike
+		 * packets that are received over a ldc channel in which case
+		 * the peer vnet/vsw would have already removed the tag.
+		 */
+		if (vnetp->pvid != vnetp->default_vlan_id) {
+			vnet_rx_frames_untag(vnetp->pvid, &mp);
+		}
+	} else {
+		vresp = (vnet_res_t *)rx_ringp->hw_rh;
+		mp = vgen_poll(vresp->macreg.m_driver, bytes_to_pickup);
+	}
+	return (mp);
+}
+
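+/*
+ * Interrupt mode receive callback for the Hybrid resource. Strip the pvid
+ * tag if needed and pass the packets up to the mac layer on the pseudo rx
+ * ring that the underlying hwring was bound to in vnet_bind_hwrings().
+ */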
+/* ARGSUSED */
+void
+vnet_hio_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+	boolean_t loopback)
+{
+	vnet_t			*vnetp = (vnet_t *)arg;
+	vnet_pseudo_rx_ring_t	*ringp = (vnet_pseudo_rx_ring_t *)mrh;
+
+	/*
+	 * Packets received over a hybrid resource need additional processing
+	 * to remove the tag, for the pvid case. The underlying resource is
+	 * not aware of the vnet's pvid and thus packets are received with the
+	 * vlan tag in the header; unlike packets that are received over a ldc
+	 * channel in which case the peer vnet/vsw would have already removed
+	 * the tag.
+	 */
+	if (vnetp->pvid != vnetp->default_vlan_id) {
+		vnet_rx_frames_untag(vnetp->pvid, &mp);
+		if (mp == NULL) {
+			return;
+		}
+	}
+	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
+}
+
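+/*
+ * mgi_addmac entry point of the pseudo rx group. Only the vnet's current
+ * primary unicast address is supported; any other address is rejected.
+ */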
+static int
+vnet_addmac(void *arg, const uint8_t *mac_addr)
+{
+	vnet_pseudo_rx_group_t  *rx_grp = (vnet_pseudo_rx_group_t *)arg;
+	vnet_t			*vnetp;
+
+	vnetp = rx_grp->vnetp;
+
+	if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
+		return (0);
+	}
+
+	cmn_err(CE_CONT, "!vnet%d: %s: Multiple macaddr unsupported\n",
+	    vnetp->instance, __func__);
+	return (EINVAL);
+}
+
+static int
+vnet_remmac(void *arg, const uint8_t *mac_addr)
+{
+	vnet_pseudo_rx_group_t  *rx_grp = (vnet_pseudo_rx_group_t *)arg;
+	vnet_t			*vnetp;
+
+	vnetp = rx_grp->vnetp;
+
+	if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
+		return (0);
+	}
+
+	cmn_err(CE_CONT, "!vnet%d: %s: Invalid macaddr: %s\n",
+	    vnetp->instance, __func__, ether_sprintf((void *)mac_addr));
+	return (EINVAL);
+}
+
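+/*
+ * Open the Hybrid device by its linkname, create a mac client over it with
+ * the vnet's unicast address, register it as a VIO_NET_RES_HYBRID resource
+ * and set up its receive and notify callbacks.
+ */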
+int
+vnet_hio_mac_init(vnet_t *vnetp, char *ifname)
+{
+	mac_handle_t		mh;
+	mac_client_handle_t	mch = NULL;
+	mac_unicast_handle_t	muh = NULL;
+	mac_diag_t		diag;
+	mac_register_t		*macp;
+	char			client_name[MAXNAMELEN];
+	int			rv;
+	uint16_t		mac_flags = MAC_UNICAST_TAG_DISABLE |
+	    MAC_UNICAST_STRIP_DISABLE | MAC_UNICAST_PRIMARY;
+	vio_net_callbacks_t	vcb;
+	ether_addr_t		rem_addr =
+		{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+	uint32_t		retries = 0;
+
+	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
+		return (EAGAIN);
+	}
+
+	do {
+		rv = mac_open_by_linkname(ifname, &mh);
+		if (rv == 0) {
+			break;
+		}
+		if (rv != ENOENT || (retries++ >= vnet_mac_open_retries)) {
+			mac_free(macp);
+			return (rv);
+		}
+		drv_usecwait(vnet_mac_open_delay);
+	} while (rv == ENOENT);
+
+	vnetp->hio_mh = mh;
+
+	(void) snprintf(client_name, MAXNAMELEN, "vnet%d-%s", vnetp->instance,
+	    ifname);
+	rv = mac_client_open(mh, &mch, client_name, MAC_OPEN_FLAGS_EXCLUSIVE);
+	if (rv != 0) {
+		goto fail;
+	}
+	vnetp->hio_mch = mch;
+
+	rv = mac_unicast_add(mch, vnetp->curr_macaddr, mac_flags, &muh, 0,
+	    &diag);
+	if (rv != 0) {
+		goto fail;
+	}
+	vnetp->hio_muh = muh;
+
+	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+	macp->m_driver = vnetp;
+	macp->m_dip = NULL;
+	macp->m_src_addr = NULL;
+	macp->m_callbacks = &vnet_hio_res_callbacks;
+	macp->m_min_sdu = 0;
+	macp->m_max_sdu = ETHERMTU;
+
+	rv = vio_net_resource_reg(macp, VIO_NET_RES_HYBRID,
+	    vnetp->curr_macaddr, rem_addr, &vnetp->hio_vhp, &vcb);
+	if (rv != 0) {
+		goto fail;
+	}
+	mac_free(macp);
+
+	/* add the recv callback */
+	mac_rx_set(vnetp->hio_mch, vnet_hio_rx_cb, vnetp);
+
+	/* add the notify callback - only tx updates for now */
+	vnetp->hio_mnh = mac_notify_add(vnetp->hio_mh, vnet_hio_notify_cb,
+	    vnetp);
+
+	return (0);
+
+fail:
+	mac_free(macp);
+	vnet_hio_mac_cleanup(vnetp);
+	return (1);
+}
+
+void
+vnet_hio_mac_cleanup(vnet_t *vnetp)
+{
+	if (vnetp->hio_mnh != NULL) {
+		(void) mac_notify_remove(vnetp->hio_mnh, B_TRUE);
+		vnetp->hio_mnh = NULL;
+	}
+
+	if (vnetp->hio_vhp != NULL) {
+		vio_net_resource_unreg(vnetp->hio_vhp);
+		vnetp->hio_vhp = NULL;
+	}
+
+	if (vnetp->hio_muh != NULL) {
+		mac_unicast_remove(vnetp->hio_mch, vnetp->hio_muh);
+		vnetp->hio_muh = NULL;
+	}
+
+	if (vnetp->hio_mch != NULL) {
+		mac_client_close(vnetp->hio_mch, 0);
+		vnetp->hio_mch = NULL;
+	}
+
+	if (vnetp->hio_mh != NULL) {
+		mac_close(vnetp->hio_mh);
+		vnetp->hio_mh = NULL;
+	}
+}
+
+/* Bind pseudo rings to hwrings */
+static int
+vnet_bind_hwrings(vnet_t *vnetp)
+{
+	mac_ring_handle_t	hw_rh[VNET_NUM_HYBRID_RINGS];
+	mac_perim_handle_t	mph1;
+	vnet_pseudo_rx_group_t	*rx_grp;
+	vnet_pseudo_rx_ring_t	*rx_ringp;
+	vnet_pseudo_tx_group_t	*tx_grp;
+	vnet_pseudo_tx_ring_t	*tx_ringp;
+	int			hw_ring_cnt;
+	int			i;
+	int			rv;
+
+	mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
+
+	/* Get the list of the underlying RX rings. */
+	hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->rx_hwgh, hw_rh,
+	    MAC_RING_TYPE_RX);
+
+	/* We expect the # of hw rx rings to match VNET_NUM_HYBRID_RINGS */
+	if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
+		cmn_err(CE_WARN,
+		    "!vnet%d: vnet_bind_hwrings: bad rx hw_ring_cnt(%d)\n",
+		    vnetp->instance, hw_ring_cnt);
+		goto fail;
+	}
+
+	if (vnetp->rx_hwgh != NULL) {
+		/*
+		 * Quiesce the HW ring and the mac srs on the ring. Note
+		 * that the HW ring will be restarted when the pseudo ring
+		 * is started. At that time all the packets will be
+		 * directly passed up to the pseudo RX ring and handled
+		 * by mac srs created over the pseudo RX ring.
+		 */
+		mac_rx_client_quiesce(vnetp->hio_mch);
+		mac_srs_perm_quiesce(vnetp->hio_mch, B_TRUE);
+	}
+
+	/*
+	 * Bind the pseudo rings to the hwrings and start the hwrings.
+	 * Note we don't need to register these with the upper mac, as we have
+	 * statically exported these pseudo rxrings which are reserved for
+	 * rxrings of Hybrid resource.
+	 */
+	rx_grp = &vnetp->rx_grp[0];
+	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
+		/* Pick the rxrings reserved for Hybrid resource */
+		rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
+
+		/* Store the hw ring handle */
+		rx_ringp->hw_rh = hw_rh[i];
+
+		/* Bind the pseudo ring to the underlying hwring */
+		mac_hwring_setup(rx_ringp->hw_rh,
+		    (mac_resource_handle_t)rx_ringp);
+
+		/* Start the hwring if needed */
+		if (rx_ringp->state & VNET_RXRING_STARTED) {
+			rv = mac_hwring_start(rx_ringp->hw_rh);
+			if (rv != 0) {
+				mac_hwring_teardown(rx_ringp->hw_rh);
+				rx_ringp->hw_rh = NULL;
+				goto fail;
+			}
+		}
+	}
+
+	/* Get the list of the underlying TX rings. */
+	hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->tx_hwgh, hw_rh,
+	    MAC_RING_TYPE_TX);
+
+	/* We expect the # of hw tx rings to match VNET_NUM_HYBRID_RINGS */
+	if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
+		cmn_err(CE_WARN,
+		    "!vnet%d: vnet_bind_hwrings: bad tx hw_ring_cnt(%d)\n",
+		    vnetp->instance, hw_ring_cnt);
+		goto fail;
+	}
+
+	/*
+	 * Now map the pseudo txrings to the hw txrings. Note we don't need
+	 * to register these with the upper mac, as we have statically exported
+	 * these rings. Note that these rings will continue to be used for LDC
+	 * resources to peer vnets and vswitch (shared ring).
+	 */
+	tx_grp = &vnetp->tx_grp[0];
+	for (i = 0; i < tx_grp->ring_cnt; i++) {
+		tx_ringp = &tx_grp->rings[i];
+		tx_ringp->hw_rh = hw_rh[i];
+		tx_ringp->state |= VNET_TXRING_HYBRID;
+	}
+
+	mac_perim_exit(mph1);
+	return (0);
+
+fail:
+	mac_perim_exit(mph1);
+	vnet_unbind_hwrings(vnetp);
+	return (1);
+}
+
+/* Unbind pseudo rings from hwrings */
+static void
+vnet_unbind_hwrings(vnet_t *vnetp)
+{
+	mac_perim_handle_t	mph1;
+	vnet_pseudo_rx_ring_t	*rx_ringp;
+	vnet_pseudo_rx_group_t	*rx_grp;
+	vnet_pseudo_tx_group_t	*tx_grp;
+	vnet_pseudo_tx_ring_t	*tx_ringp;
+	int			i;
+
+	mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
+
+	tx_grp = &vnetp->tx_grp[0];
+	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
+		tx_ringp = &tx_grp->rings[i];
+		if (tx_ringp->state & VNET_TXRING_HYBRID) {
+			tx_ringp->state &= ~VNET_TXRING_HYBRID;
+			tx_ringp->hw_rh = NULL;
+		}
+	}
+
+	rx_grp = &vnetp->rx_grp[0];
+	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
+		rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
+		if (rx_ringp->hw_rh != NULL) {
+			/* Stop the hwring */
+			mac_hwring_stop(rx_ringp->hw_rh);
+
+			/* Teardown the hwring */
+			mac_hwring_teardown(rx_ringp->hw_rh);
+			rx_ringp->hw_rh = NULL;
+		}
+	}
+
+	if (vnetp->rx_hwgh != NULL) {
+		vnetp->rx_hwgh = NULL;
+		/*
+		 * First clear the permanent-quiesced flag of the RX srs then
+		 * restart the HW ring and the mac srs on the ring.
+		 */
+		mac_srs_perm_quiesce(vnetp->hio_mch, B_FALSE);
+		mac_rx_client_restart(vnetp->hio_mch);
+	}
+
+	mac_perim_exit(mph1);
+}
+
+/* Bind pseudo ring to a LDC resource */
+static int
+vnet_bind_vgenring(vnet_res_t *vresp)
+{
+	vnet_t			*vnetp;
+	vnet_pseudo_rx_group_t	*rx_grp;
+	vnet_pseudo_rx_ring_t	*rx_ringp;
+	mac_perim_handle_t	mph1;
+	int			rv;
+	int			type;
+
+	vnetp = vresp->vnetp;
+	type = vresp->type;
+	rx_grp = &vnetp->rx_grp[0];
+
+	if (type == VIO_NET_RES_LDC_SERVICE) {
+		/*
+		 * Ring Index 0 is the default ring in the group and is
+		 * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
+		 * is allocated statically and is reported to the mac layer
+		 * in vnet_m_capab(). So, all we need to do here, is save a
+		 * reference to the associated vresp.
+		 */
+		rx_ringp = &rx_grp->rings[0];
+		rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
+		vresp->rx_ringp = (void *)rx_ringp;
+		return (0);
+	}
+	ASSERT(type == VIO_NET_RES_LDC_GUEST);
+
+	mac_perim_enter_by_mh(vnetp->mh, &mph1);
+
+	rx_ringp = vnet_alloc_pseudo_rx_ring(vnetp);
+	if (rx_ringp == NULL) {
+		cmn_err(CE_WARN, "!vnet%d: Failed to allocate pseudo rx ring",
+		    vnetp->instance);
+		goto fail;
+	}
+
+	/* Store the LDC resource itself as the ring handle */
+	rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
+
+	/*
+	 * Save a reference to the ring in the resource for lookup during
+	 * unbind. Note this is only done for LDC resources. We don't need this
+	 * in the case of a Hybrid resource (see vnet_bind_hwrings()), as its
+	 * rx rings are mapped to reserved pseudo rx rings (index 1 and 2).
+	 */
+	vresp->rx_ringp = (void *)rx_ringp;
+	rx_ringp->state |= VNET_RXRING_LDC_GUEST;
+
+	/* Register the pseudo ring with upper-mac */
+	rv = mac_group_add_ring(rx_grp->handle, rx_ringp->index);
+	if (rv != 0) {
+		rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
+		rx_ringp->hw_rh = NULL;
+		vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
+		goto fail;
+	}
+
+	mac_perim_exit(mph1);
+	return (0);
+fail:
+	mac_perim_exit(mph1);
+	return (1);
+}
+
+/* Unbind pseudo ring from a LDC resource */
+static void
+vnet_unbind_vgenring(vnet_res_t *vresp)
+{
+	vnet_t			*vnetp;
+	vnet_pseudo_rx_group_t	*rx_grp;
+	vnet_pseudo_rx_ring_t	*rx_ringp;
+	mac_perim_handle_t	mph1;
+	int			type;
+
+	vnetp = vresp->vnetp;
+	type = vresp->type;
+	rx_grp = &vnetp->rx_grp[0];
+
+	if (vresp->rx_ringp == NULL) {
+		return;
+	}
+
+	if (type == VIO_NET_RES_LDC_SERVICE) {
+		/*
+		 * Ring Index 0 is the default ring in the group and is
+		 * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
+		 * is allocated statically and is reported to the mac layer
+		 * in vnet_m_capab(). So, all we need to do here, is remove its
+		 * reference to the associated vresp.
+		 */
+		rx_ringp = &rx_grp->rings[0];
+		rx_ringp->hw_rh = NULL;
+		vresp->rx_ringp = NULL;
+		return;
+	}
+	ASSERT(type == VIO_NET_RES_LDC_GUEST);
+
+	mac_perim_enter_by_mh(vnetp->mh, &mph1);
+
+	rx_ringp = (vnet_pseudo_rx_ring_t *)vresp->rx_ringp;
+	vresp->rx_ringp = NULL;
+
+	if (rx_ringp != NULL && (rx_ringp->state & VNET_RXRING_LDC_GUEST)) {
+		/* Unregister the pseudo ring with upper-mac */
+		mac_group_rem_ring(rx_grp->handle, rx_ringp->handle);
+
+		rx_ringp->hw_rh = NULL;
+		rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
+
+		/* Free the pseudo rx ring */
+		vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
+	}
+
+	mac_perim_exit(mph1);
+}
+
+static void
+vnet_unbind_rings(vnet_res_t *vresp)
+{
+	switch (vresp->type) {
+
+	case VIO_NET_RES_LDC_SERVICE:
+	case VIO_NET_RES_LDC_GUEST:
+		vnet_unbind_vgenring(vresp);
+		break;
+
+	case VIO_NET_RES_HYBRID:
+		vnet_unbind_hwrings(vresp->vnetp);
+		break;
+
+	default:
+		break;
+
+	}
+}
+
+static int
+vnet_bind_rings(vnet_res_t *vresp)
+{
+	int	rv;
+
+	switch (vresp->type) {
+
+	case VIO_NET_RES_LDC_SERVICE:
+	case VIO_NET_RES_LDC_GUEST:
+		rv = vnet_bind_vgenring(vresp);
+		break;
+
+	case VIO_NET_RES_HYBRID:
+		rv = vnet_bind_hwrings(vresp->vnetp);
+		break;
+
+	default:
+		rv = 1;
+		break;
+
+	}
+
+	return (rv);
+}
+
+/* ARGSUSED */
+int
+vnet_hio_stat(void *arg, uint_t stat, uint64_t *val)
+{
+	vnet_t	*vnetp = (vnet_t *)arg;
+
+	*val = mac_stat_get(vnetp->hio_mh, stat);
+	return (0);
+}
+
+/*
+ * The start() and stop() routines for the Hybrid resource below are just
+ * dummy functions. This is provided to avoid resource type specific code in
+ * vnet_start_resources() and vnet_stop_resources(). The starting and stopping
+ * of the Hybrid resource happens in the context of the mac_client interfaces
+ * that are invoked in vnet_hio_mac_init() and vnet_hio_mac_cleanup().
+ */
+/* ARGSUSED */
+static int
+vnet_hio_start(void *arg)
+{
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+vnet_hio_stop(void *arg)
+{
+}
+
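+/*
+ * Transmit a chain of packets over the Hybrid hwring bound to this pseudo
+ * tx ring. Any packets that could not be transmitted are returned, chained,
+ * to the caller.
+ */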
+mblk_t *
+vnet_hio_tx(void *arg, mblk_t *mp)
+{
+	vnet_pseudo_tx_ring_t	*tx_ringp;
+	mblk_t			*nextp;
+	mblk_t			*ret_mp;
+
+	tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
+	for (;;) {
+		nextp = mp->b_next;
+		mp->b_next = NULL;
+
+		ret_mp = mac_hwring_tx(tx_ringp->hw_rh, mp);
+		if (ret_mp != NULL) {
+			ret_mp->b_next = nextp;
+			mp = ret_mp;
+			break;
+		}
+
+		if ((mp = nextp) == NULL)
+			break;
+	}
+	return (mp);
+}
+
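+/*
+ * mac notify callback for the Hybrid device; only MAC_NOTE_TX (transmit
+ * update) notifications are handled for now.
+ */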
+static void
+vnet_hio_notify_cb(void *arg, mac_notify_type_t type)
+{
+	vnet_t			*vnetp = (vnet_t *)arg;
+	mac_perim_handle_t	mph;
+
+	mac_perim_enter_by_mh(vnetp->hio_mh, &mph);
+	switch (type) {
+	case MAC_NOTE_TX:
+		vnet_tx_update(vnetp->hio_vhp);
+		break;
+
+	default:
+		break;
+	}
+	mac_perim_exit(mph);
+}
+
 #ifdef	VNET_IOC_DEBUG
 
 /*
--- a/usr/src/uts/sun4v/io/vnet_dds.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/sun4v/io/vnet_dds.c	Fri Aug 14 09:48:09 2009 -0700
@@ -113,6 +113,8 @@
 
 /* Functions imported from other files */
 extern int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg);
+extern int vnet_hio_mac_init(vnet_t *vnetp, char *ifname);
+extern void vnet_hio_mac_cleanup(vnet_t *vnetp);
 
 /* HV functions that are used in this file */
 extern uint64_t vdds_hv_niu_vr_getinfo(uint32_t hvcookie,
@@ -412,7 +414,31 @@
 		} else {
 			vdds->hio_dip = dip;
 			vdds->hio_cookie = hio_cookie;
-			(void) vdds_send_dds_resp_msg(vnetp, dmsg, B_TRUE);
+			(void) snprintf(vdds->hio_ifname,
+			    sizeof (vdds->hio_ifname), "%s%d",
+			    ddi_driver_name(dip), ddi_get_instance(dip));
+
+			rv = vnet_hio_mac_init(vnetp, vdds->hio_ifname);
+			if (rv != 0) {
+				/* failed - cleanup, send failed DDS message */
+				DERR(vdds, "HIO mac init failed, cleaning up");
+				rv = vdds_destroy_niu_node(dip, hio_cookie);
+				if (rv == 0) {
+					/* use DERR to print by default */
+					DERR(vdds, "Successfully destroyed"
+					    " Hybrid node");
+				} else {
+					cmn_err(CE_WARN, "vnet%d: Failed to "
+					    "destroy Hybrid node",
+					    vnetp->instance);
+				}
+				vdds->hio_dip = NULL;
+				vdds->hio_cookie = 0;
+				(void) vdds_send_dds_resp_msg(vnetp,
+				    dmsg, B_FALSE);
+			} else {
+				(void) vdds_send_dds_resp_msg(vnetp,
+				    dmsg, B_TRUE);
+			}
 			/* DERR used only print by default */
 			DERR(vdds, "Successfully created HIO node");
 		}
@@ -424,6 +450,7 @@
 			DBG2(vdds, "NACK: No HIO device destroy");
 			(void) vdds_send_dds_resp_msg(vnetp, dmsg, B_FALSE);
 		} else {
+			vnet_hio_mac_cleanup(vnetp);
 			rv = vdds_destroy_niu_node(vnetp->vdds_info.hio_dip,
 			    vdds->hio_cookie);
 			if (rv == 0) {
@@ -444,6 +471,7 @@
 	case VNET_DDS_TASK_REL_SHARE:
 		DBG2(vdds, "REL_SHARE task...");
 		if (vnetp->vdds_info.hio_dip != NULL) {
+			vnet_hio_mac_cleanup(vnetp);
 			rv = vdds_destroy_niu_node(vnetp->vdds_info.hio_dip,
 			    vdds->hio_cookie);
 			if (rv == 0) {
--- a/usr/src/uts/sun4v/io/vnet_gen.c	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/sun4v/io/vnet_gen.c	Fri Aug 14 09:48:09 2009 -0700
@@ -73,11 +73,15 @@
 /* vgen proxy entry points */
 int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip,
     const uint8_t *macaddr, void **vgenhdl);
+int vgen_init_mdeg(void *arg);
 void vgen_uninit(void *arg);
 int vgen_dds_tx(void *arg, void *dmsg);
 void vgen_mod_init(void);
 int vgen_mod_cleanup(void);
 void vgen_mod_fini(void);
+int vgen_enable_intr(void *arg);
+int vgen_disable_intr(void *arg);
+mblk_t *vgen_poll(void *arg, int bytes_to_pickup);
 static int vgen_start(void *arg);
 static void vgen_stop(void *arg);
 static mblk_t *vgen_tx(void *arg, mblk_t *mp);
@@ -151,6 +155,7 @@
 static int vgen_tx_dring_full(vgen_ldc_t *ldcp);
 static int vgen_ldc_txtimeout(vgen_ldc_t *ldcp);
 static void vgen_ldc_watchdog(void *arg);
+static mblk_t *vgen_ldc_poll(vgen_ldc_t *ldcp, int bytes_to_pickup);
 
 /* vgen handshake functions */
 static vgen_ldc_t *vh_nextphase(vgen_ldc_t *ldcp);
@@ -200,7 +205,7 @@
 static void vgen_drain_rcv_thread(vgen_ldc_t *ldcp);
 static void vgen_ldc_rcv_worker(void *arg);
 static void vgen_handle_evt_read(vgen_ldc_t *ldcp);
-static void vgen_rx(vgen_ldc_t *ldcp, mblk_t *bp);
+static void vgen_rx(vgen_ldc_t *ldcp, mblk_t *bp, mblk_t *bpt);
 static void vgen_set_vnet_proto_ops(vgen_ldc_t *ldcp);
 static void vgen_reset_vnet_proto_ops(vgen_ldc_t *ldcp);
 static void vgen_link_update(vgen_t *vgenp, link_state_t link_state);
@@ -536,13 +541,6 @@
 	if (rv != 0) {
 		goto vgen_init_fail;
 	}
-
-	/* register with MD event generator */
-	rv = vgen_mdeg_reg(vgenp);
-	if (rv != DDI_SUCCESS) {
-		goto vgen_init_fail;
-	}
-
 	*vgenhdl = (void *)vgenp;
 
 	DBG1(NULL, NULL, "vnet(%d): exit\n", instance);
@@ -562,6 +560,15 @@
 	return (DDI_FAILURE);
 }
 
+int
+vgen_init_mdeg(void *arg)
+{
+	vgen_t	*vgenp = (vgen_t *)arg;
+
+	/* register with MD event generator */
+	return (vgen_mdeg_reg(vgenp));
+}
+
 /*
  * Called by vnet to undo the initializations done by vgen_init().
  * The handle provided by generic transport during vgen_init() is the argument.
@@ -2094,13 +2101,21 @@
 static void
 vgen_mdeg_unreg(vgen_t *vgenp)
 {
-	(void) mdeg_unregister(vgenp->mdeg_dev_hdl);
-	(void) mdeg_unregister(vgenp->mdeg_port_hdl);
-	kmem_free(vgenp->mdeg_parentp->specp, sizeof (vgen_prop_template));
-	KMEM_FREE(vgenp->mdeg_parentp);
-	vgenp->mdeg_parentp = NULL;
-	vgenp->mdeg_dev_hdl = NULL;
-	vgenp->mdeg_port_hdl = NULL;
+	if (vgenp->mdeg_dev_hdl != NULL) {
+		(void) mdeg_unregister(vgenp->mdeg_dev_hdl);
+		vgenp->mdeg_dev_hdl = NULL;
+	}
+	if (vgenp->mdeg_port_hdl != NULL) {
+		(void) mdeg_unregister(vgenp->mdeg_port_hdl);
+		vgenp->mdeg_port_hdl = NULL;
+	}
+
+	if (vgenp->mdeg_parentp != NULL) {
+		kmem_free(vgenp->mdeg_parentp->specp,
+		    sizeof (vgen_prop_template));
+		KMEM_FREE(vgenp->mdeg_parentp);
+		vgenp->mdeg_parentp = NULL;
+	}
 }
 
 /* mdeg callback function for the port node */
@@ -2907,6 +2922,7 @@
 	mutex_init(&ldcp->tclock, NULL, MUTEX_DRIVER, NULL);
 	mutex_init(&ldcp->wrlock, NULL, MUTEX_DRIVER, NULL);
 	mutex_init(&ldcp->rxlock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&ldcp->pollq_lock, NULL, MUTEX_DRIVER, NULL);
 
 	attach_state |= AST_mutex_init;
 
@@ -3032,6 +3048,7 @@
 		mutex_destroy(&ldcp->cblock);
 		mutex_destroy(&ldcp->wrlock);
 		mutex_destroy(&ldcp->rxlock);
+		mutex_destroy(&ldcp->pollq_lock);
 	}
 	if (attach_state & AST_ldc_alloc) {
 		KMEM_FREE(ldcp);
@@ -3100,6 +3117,7 @@
 		mutex_destroy(&ldcp->cblock);
 		mutex_destroy(&ldcp->wrlock);
 		mutex_destroy(&ldcp->rxlock);
+		mutex_destroy(&ldcp->pollq_lock);
 
 		/* unlink it from the list */
 		*prev_ldcp = ldcp->nextp;
@@ -6278,7 +6296,7 @@
 			 */
 			if (bp != NULL) {
 				DTRACE_PROBE1(vgen_rcv_msgs, int, count);
-				vgen_rx(ldcp, bp);
+				vgen_rx(ldcp, bp, bpt);
 				count = 0;
 				bp = bpt = NULL;
 			}
@@ -6459,7 +6477,7 @@
 
 		if (count++ > vgen_chain_len) {
 			DTRACE_PROBE1(vgen_rcv_msgs, int, count);
-			vgen_rx(ldcp, bp);
+			vgen_rx(ldcp, bp, bpt);
 			count = 0;
 			bp = bpt = NULL;
 		}
@@ -6512,7 +6530,7 @@
 	/* send up packets received so far */
 	if (bp != NULL) {
 		DTRACE_PROBE1(vgen_rcv_msgs, int, count);
-		vgen_rx(ldcp, bp);
+		vgen_rx(ldcp, bp, bpt);
 		bp = bpt = NULL;
 	}
 	DBG1(vgenp, ldcp, "exit rv(%d)\n", rv);
@@ -6996,18 +7014,57 @@
  * Send received packets up the stack.
  */
 static void
-vgen_rx(vgen_ldc_t *ldcp, mblk_t *bp)
+vgen_rx(vgen_ldc_t *ldcp, mblk_t *bp, mblk_t *bpt)
 {
 	vio_net_rx_cb_t vrx_cb = ldcp->portp->vcb.vio_net_rx_cb;
+	vgen_t		*vgenp = LDC_TO_VGEN(ldcp);
 
 	if (ldcp->rcv_thread != NULL) {
 		ASSERT(MUTEX_HELD(&ldcp->rxlock));
-		mutex_exit(&ldcp->rxlock);
 	} else {
 		ASSERT(MUTEX_HELD(&ldcp->cblock));
+	}
+
+	mutex_enter(&ldcp->pollq_lock);
+
+	if (ldcp->polling_on == B_TRUE) {
+		/*
+		 * If we are in polling mode, simply queue
+		 * the packets onto the poll queue and return.
+		 */
+		if (ldcp->pollq_headp == NULL) {
+			ldcp->pollq_headp = bp;
+			ldcp->pollq_tailp = bpt;
+		} else {
+			ldcp->pollq_tailp->b_next = bp;
+			ldcp->pollq_tailp = bpt;
+		}
+
+		mutex_exit(&ldcp->pollq_lock);
+		return;
+	}
+
+	/*
+	 * Prepend any pending mblks in the poll queue, now that we
+	 * are in interrupt mode, before sending up the chain of pkts.
+	 */
+	if (ldcp->pollq_headp != NULL) {
+		DBG2(vgenp, ldcp, "vgen_rx(%lx), pending pollq_headp\n",
+		    (uintptr_t)ldcp);
+		ldcp->pollq_tailp->b_next = bp;
+		bp = ldcp->pollq_headp;
+		ldcp->pollq_headp = ldcp->pollq_tailp = NULL;
+	}
+
+	mutex_exit(&ldcp->pollq_lock);
+
+	if (ldcp->rcv_thread != NULL) {
+		mutex_exit(&ldcp->rxlock);
+	} else {
 		mutex_exit(&ldcp->cblock);
 	}
 
+	/* Send up the packets */
 	vrx_cb(ldcp->portp->vhp, bp);
 
 	if (ldcp->rcv_thread != NULL) {
@@ -7233,6 +7290,145 @@
 	vgen_handshake_retry(ldcp);
 }
 
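+/*
+ * Enable interrupt-mode delivery for the channel: clear the polling flag so
+ * that vgen_rx() resumes sending received packets directly up the stack.
+ */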
+int
+vgen_enable_intr(void *arg)
+{
+	vgen_port_t		*portp = (vgen_port_t *)arg;
+	vgen_ldclist_t		*ldclp;
+	vgen_ldc_t		*ldcp;
+
+	ldclp = &portp->ldclist;
+	READ_ENTER(&ldclp->rwlock);
+	/*
+	 * NOTE: for now, we will assume we have a single channel.
+	 */
+	if (ldclp->headp == NULL) {
+		RW_EXIT(&ldclp->rwlock);
+		return (1);
+	}
+	ldcp = ldclp->headp;
+
+	mutex_enter(&ldcp->pollq_lock);
+	ldcp->polling_on = B_FALSE;
+	mutex_exit(&ldcp->pollq_lock);
+
+	RW_EXIT(&ldclp->rwlock);
+
+	return (0);
+}
+
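+/*
+ * Disable interrupt-mode delivery for the channel: set the polling flag so
+ * that packets received from now on are queued on the poll queue until
+ * picked up by vgen_poll().
+ */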
+int
+vgen_disable_intr(void *arg)
+{
+	vgen_port_t		*portp = (vgen_port_t *)arg;
+	vgen_ldclist_t		*ldclp;
+	vgen_ldc_t		*ldcp;
+
+	ldclp = &portp->ldclist;
+	READ_ENTER(&ldclp->rwlock);
+	/*
+	 * NOTE: for now, we will assume we have a single channel.
+	 */
+	if (ldclp->headp == NULL) {
+		RW_EXIT(&ldclp->rwlock);
+		return (1);
+	}
+	ldcp = ldclp->headp;
+
+	mutex_enter(&ldcp->pollq_lock);
+	ldcp->polling_on = B_TRUE;
+	mutex_exit(&ldcp->pollq_lock);
+
+	RW_EXIT(&ldclp->rwlock);
+
+	return (0);
+}
+
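+/*
+ * Poll entry point for the port (see vnet_rx_poll()); picks up packets
+ * queued on the port's channel, up to bytes_to_pickup bytes.
+ */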
+mblk_t *
+vgen_poll(void *arg, int bytes_to_pickup)
+{
+	vgen_port_t		*portp = (vgen_port_t *)arg;
+	vgen_ldclist_t		*ldclp;
+	vgen_ldc_t		*ldcp;
+	mblk_t			*mp = NULL;
+
+	ldclp = &portp->ldclist;
+	READ_ENTER(&ldclp->rwlock);
+	/*
+	 * NOTE: for now, we will assume we have a single channel.
+	 */
+	if (ldclp->headp == NULL) {
+		RW_EXIT(&ldclp->rwlock);
+		return (NULL);
+	}
+	ldcp = ldclp->headp;
+
+	mp = vgen_ldc_poll(ldcp, bytes_to_pickup);
+
+	RW_EXIT(&ldclp->rwlock);
+	return (mp);
+}
+
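+/*
+ * Dequeue up to 'bytes_to_pickup' bytes worth of packets from the channel's
+ * poll queue and return the chain to the caller.
+ */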
+static mblk_t *
+vgen_ldc_poll(vgen_ldc_t *ldcp, int bytes_to_pickup)
+{
+	mblk_t	*bp = NULL;
+	mblk_t	*bpt = NULL;
+	mblk_t	*mp = NULL;
+	size_t	mblk_sz = 0;
+	size_t	sz = 0;
+	uint_t	count = 0;
+
+	mutex_enter(&ldcp->pollq_lock);
+
+	bp = ldcp->pollq_headp;
+	while (bp != NULL) {
+		/* get the size of this packet */
+		mblk_sz = msgdsize(bp);
+
+		/* if adding this pkt exceeds the size limit, we are done. */
+		if (sz + mblk_sz > bytes_to_pickup) {
+			break;
+		}
+
+		/* we have room for this packet */
+		sz += mblk_sz;
+
+		/* increment the # of packets being sent up */
+		count++;
+
+		/* track the last processed pkt */
+		bpt = bp;
+
+		/* get the next pkt */
+		bp = bp->b_next;
+	}
+
+	if (count != 0) {
+		/*
+		 * picked up some packets; save the head of pkts to be sent up.
+		 */
+		mp = ldcp->pollq_headp;
+
+		/* move the pollq_headp to skip over the pkts being sent up */
+		ldcp->pollq_headp = bp;
+
+		/* picked up all pending pkts in the queue; reset tail also */
+		if (ldcp->pollq_headp == NULL) {
+			ldcp->pollq_tailp = NULL;
+		}
+
+		/* terminate the tail of pkts to be sent up */
+		bpt->b_next = NULL;
+	}
+
+	mutex_exit(&ldcp->pollq_lock);
+
+	DTRACE_PROBE1(vgen_poll_pkts, uint_t, count);
+	return (mp);
+}
+
 #if DEBUG
 
 /*
--- a/usr/src/uts/sun4v/sys/vnet.h	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/sun4v/sys/vnet.h	Fri Aug 14 09:48:09 2009 -0700
@@ -34,6 +34,8 @@
 #include <sys/vnet_res.h>
 #include <sys/vnet_mailbox.h>
 #include <sys/modhash.h>
+#include <net/if.h>
+#include <sys/mac_client.h>
 
 #define	VNET_SUCCESS		(0)	/* successful return */
 #define	VNET_FAILURE		(-1)	/* unsuccessful return */
@@ -117,6 +119,7 @@
 	uint32_t		refcnt;		/* reference count */
 	struct	vnet		*vnetp;		/* back pointer to vnet */
 	kstat_t			*ksp;		/* hio kstats */
+	void			*rx_ringp;	/* assoc pseudo rx ring */
 } vnet_res_t;
 
 #define	VNET_DDS_TASK_ADD_SHARE		0x01
@@ -131,6 +134,7 @@
 	vio_dds_msg_t	dmsg;		/* Pending DDS message */
 	dev_info_t	*hio_dip;	/* Hybrid device's dip */
 	uint64_t	hio_cookie;	/* Hybrid device's cookie */
+	char		hio_ifname[LIFNAMSIZ];  /* Hybrid interface name */
 	ddi_taskq_t	*dds_taskqp;	/* Taskq's used for DDS */
 	struct vnet	*vnetp;		/* Back pointer to vnetp */
 } vnet_dds_info_t;
@@ -155,12 +159,103 @@
 
 typedef enum {
 		AST_init = 0x0, AST_vnet_alloc = 0x1,
-		AST_mac_alloc = 0x2, AST_read_macaddr = 0x4,
-		AST_vgen_init = 0x8, AST_fdbh_alloc = 0x10,
-		AST_vdds_init = 0x20, AST_taskq_create = 0x40,
-		AST_vnet_list = 0x80, AST_macreg = 0x100
+		AST_ring_init = 0x2, AST_vdds_init = 0x4,
+		AST_read_macaddr = 0x8, AST_fdbh_alloc = 0x10,
+		AST_taskq_create = 0x20, AST_vnet_list = 0x40,
+		AST_vgen_init = 0x80, AST_macreg = 0x100,
+		AST_init_mdeg = 0x200
 } vnet_attach_progress_t;
 
+#define	VNET_NUM_PSEUDO_GROUPS		1	/* # of pseudo ring grps */
+#define	VNET_NUM_HYBRID_RINGS		2	/* # of Hybrid tx/rx rings */
+#define	VNET_HYBRID_RXRING_INDEX	1	/* Hybrid rx ring start index */
+
+/*
+ * # of Pseudo TX Rings is defined based on the possible
+ * # of TX Hardware Rings from a Hybrid resource.
+ */
+#define	VNET_NUM_PSEUDO_TXRINGS		VNET_NUM_HYBRID_RINGS
+
+/*
+ * # of Pseudo RX Rings that are reserved and exposed by default.
+ * 1 for LDC resource to vsw + 2 for RX rings of Hybrid resource.
+ */
+#define	VNET_NUM_PSEUDO_RXRINGS_DEFAULT	(VNET_NUM_HYBRID_RINGS + 1)
+
+/* Pseudo RX Ring States */
+typedef enum {
+	VNET_RXRING_FREE = 0x0,		/* Free */
+	VNET_RXRING_INUSE = 0x1,	/* In use */
+	VNET_RXRING_LDC_SERVICE = 0x2,	/* Mapped to vswitch */
+	VNET_RXRING_LDC_GUEST = 0x4,	/* Mapped to a peer vnet */
+	VNET_RXRING_HYBRID = 0x8,	/* Mapped to Hybrid resource */
+	VNET_RXRING_STARTED = 0x10	/* Started */
+} vnet_rxring_state_t;
+
+/* Pseudo TX Ring States */
+typedef enum {
+	VNET_TXRING_FREE = 0x0,		/* Free */
+	VNET_TXRING_INUSE = 0x1,	/* In use */
+	VNET_TXRING_SHARED = 0x2,	/* Shared among LDCs */
+	VNET_TXRING_HYBRID = 0x4,	/* Shared among LDCs, Hybrid resource */
+	VNET_TXRING_STARTED = 0x8	/* Started */
+} vnet_txring_state_t;
+
+/*
+ * Pseudo TX Ring
+ */
+typedef struct vnet_pseudo_tx_ring {
+	uint_t			index;		/* ring index */
+	vnet_txring_state_t	state;		/* ring state */
+	void			*grp;		/* grp associated */
+	void			*vnetp;		/* vnet associated */
+	mac_ring_handle_t	handle;		/* ring handle in mac layer */
+	mac_ring_handle_t	hw_rh;	/* Resource type dependent, internal */
+					/* ring handle. Hybrid res: ring hdl */
+					/* of hardware tx ring; otherwise    */
+					/* NULL */
+} vnet_pseudo_tx_ring_t;
+
+/*
+ * Pseudo RX Ring
+ */
+typedef struct vnet_pseudo_rx_ring {
+	uint_t			index;		/* ring index */
+	vnet_rxring_state_t	state;		/* ring state */
+	void			*grp;		/* grp associated */
+	void			*vnetp;		/* vnet associated */
+	mac_ring_handle_t	handle;		/* ring handle in mac layer */
+	mac_ring_handle_t	hw_rh;	/* Resource type dependent, internal */
+					/* ring handle. Hybrid res: ring hdl */
+					/* of hardware rx ring; LDC res: hdl */
+					/* to the res itself (vnet_res_t)    */
+	uint64_t		gen_num;	/* Mac layer gen_num */
+} vnet_pseudo_rx_ring_t;
+
+/*
+ * Pseudo TX Ring Group
+ */
+typedef struct vnet_pseudo_tx_group {
+	uint_t			index;		/* group index */
+	void			*vnetp;		/* vnet associated */
+	mac_group_handle_t	handle;		/* grp handle in mac layer */
+	uint_t			ring_cnt;	/* total # of rings in grp */
+	vnet_pseudo_tx_ring_t	*rings;		/* array of rings */
+} vnet_pseudo_tx_group_t;
+
+/*
+ * Pseudo RX Ring Group
+ */
+typedef struct vnet_pseudo_rx_group {
+	krwlock_t		lock;		/* sync rings access in grp */
+	int			index;		/* group index */
+	void			*vnetp;		/* vnet this grp belongs to */
+	mac_group_handle_t	handle;		/* grp handle in mac layer */
+	uint_t			max_ring_cnt;	/* total # of rings in grp */
+	uint_t			ring_cnt;	/* # of rings in use */
+	vnet_pseudo_rx_ring_t	*rings;		/* array of rings */
+} vnet_pseudo_rx_group_t;
+
 /*
  * vnet instance state information
  */
@@ -194,6 +289,18 @@
 	vnet_dds_info_t		vdds_info;	/* DDS related info */
 	krwlock_t		vrwlock;	/* Resource list lock */
 	ddi_taskq_t		*taskqp;	/* Resource taskq */
+
+	/* pseudo ring groups */
+	vnet_pseudo_rx_group_t	rx_grp[VNET_NUM_PSEUDO_GROUPS];
+	vnet_pseudo_tx_group_t	tx_grp[VNET_NUM_PSEUDO_GROUPS];
+
+	vio_net_handle_t	hio_vhp;	/* HIO resource hdl */
+	mac_handle_t		hio_mh;		/* HIO mac hdl */
+	mac_client_handle_t	hio_mch;	/* HIO mac client hdl */
+	mac_unicast_handle_t	hio_muh;	/* HIO mac unicst hdl */
+	mac_notify_handle_t	hio_mnh;	/* HIO notify cb hdl */
+	mac_group_handle_t	rx_hwgh;	/* HIO rx ring-group hdl */
+	mac_group_handle_t	tx_hwgh;	/* HIO tx ring-group hdl */
 } vnet_t;
 
 #ifdef DEBUG
--- a/usr/src/uts/sun4v/sys/vnet_gen.h	Fri Aug 14 12:16:18 2009 -0400
+++ b/usr/src/uts/sun4v/sys/vnet_gen.h	Fri Aug 14 09:48:09 2009 -0700
@@ -180,6 +180,7 @@
 	kmutex_t		tclock;		/* tx reclaim lock */
 	kmutex_t		wrlock;		/* sync transmits */
 	kmutex_t		rxlock;		/* sync reception */
+	kmutex_t		pollq_lock;	/* sync polling and rxworker */
 
 	/* channel info from ldc layer */
 	uint64_t		ldc_id;		/* channel number */
@@ -248,6 +249,11 @@
 	kmutex_t		rcv_thr_lock;	/* lock for receive thread */
 	kcondvar_t		rcv_thr_cv;	/* cond.var for recv thread */
 
+	/* receive polling fields */
+	boolean_t		polling_on;	/* polling enabled ? */
+	mblk_t			*pollq_headp;	/* head of pkts in pollq */
+	mblk_t			*pollq_tailp;	/* tail of pkts in pollq */
+
 	/* channel statistics */
 	vgen_stats_t		stats;		/* channel statistics */
 	kstat_t			*ksp;		/* channel kstats */