changeset 13135:9efd3d43accd

PSARC/2010/325 Different MTU for unicast and multicast
6836162 Interface to report different multicast and unicast MTUs from network stack
author Erik Nordmark <Erik.Nordmark@Sun.COM>
date Mon, 16 Aug 2010 15:30:54 -0700
parents 61fe7fb74c94
children 8f28cf08bb11
files usr/src/uts/common/inet/ip.h usr/src/uts/common/inet/ip/igmp.c usr/src/uts/common/inet/ip/ip.c usr/src/uts/common/inet/ip/ip6.c usr/src/uts/common/inet/ip/ip6_if.c usr/src/uts/common/inet/ip/ip_dce.c usr/src/uts/common/inet/ip/ip_if.c usr/src/uts/common/inet/ip/ip_input.c usr/src/uts/common/inet/ip/ip_mroute.c usr/src/uts/common/inet/ip/ip_output.c usr/src/uts/common/inet/ip/ipmp.c usr/src/uts/common/io/dld/dld_proto.c usr/src/uts/common/io/dld/dld_str.c usr/src/uts/common/io/ib/clients/ibd/ibd.c usr/src/uts/common/io/mac/mac.c usr/src/uts/common/io/mac/mac_client.c usr/src/uts/common/io/mac/mac_provider.c usr/src/uts/common/sys/dlpi.h usr/src/uts/common/sys/mac.h usr/src/uts/common/sys/mac_impl.h usr/src/uts/common/sys/mac_provider.h
diffstat 21 files changed, 231 insertions(+), 69 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/inet/ip.h	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip.h	Mon Aug 16 15:30:54 2010 -0700
@@ -1506,6 +1506,7 @@
  *	ig_cast_ill		ipsq or ipmp_lock	ipsq and ipmp_lock
  *	ig_arpent		ipsq			ipsq
  *	ig_mtu			ipsq			ipsq
+ *	ig_mc_mtu		ipsq			ipsq
  */
 typedef struct ipmp_illgrp_s {
 	list_t		ig_if; 		/* list of all interfaces */
@@ -1515,7 +1516,8 @@
 	struct ill_s	*ig_ipmp_ill;	/* backpointer to IPMP meta-interface */
 	struct ill_s	*ig_cast_ill;	/* nominated ill for multi/broadcast */
 	list_t		ig_arpent;	/* list of ARP entries */
-	uint_t		ig_mtu;		/* ig_ipmp_ill->ill_max_mtu */
+	uint_t		ig_mtu;		/* ig_ipmp_ill->ill_mtu */
+	uint_t		ig_mc_mtu;	/* ig_ipmp_ill->ill_mc_mtu */
 } ipmp_illgrp_t;
 
 /*
@@ -1611,6 +1613,7 @@
 	uint_t	ill_max_frag;		/* Max IDU from DLPI. */
 	uint_t	ill_current_frag;	/* Current IDU from DLPI. */
 	uint_t	ill_mtu;		/* User-specified MTU; SIOCSLIFMTU */
+	uint_t	ill_mc_mtu;		/* MTU for multi/broadcast */
 	uint_t	ill_metric;		/* BSD if metric, for compatibility. */
 	char	*ill_name;		/* Our name. */
 	uint_t	ill_ipif_dup_count;	/* Number of duplicate addresses. */
@@ -1905,6 +1908,7 @@
  * ill_max_hops			ipsq			Not atomic
  *
  * ill_mtu			ill_lock		None
+ * ill_mc_mtu			ill_lock		None
  *
  * ill_user_mtu			ipsq + ill_lock		ill_lock
  * ill_reachable_time		ipsq + ill_lock		ill_lock
@@ -3189,6 +3193,7 @@
 extern mblk_t	*ip_carve_mp(mblk_t **, ssize_t);
 extern mblk_t	*ip_dlpi_alloc(size_t, t_uscalar_t);
 extern mblk_t	*ip_dlnotify_alloc(uint_t, uint_t);
+extern mblk_t	*ip_dlnotify_alloc2(uint_t, uint_t, uint_t);
 extern char	*ip_dot_addr(ipaddr_t, char *);
 extern const char *mac_colon_addr(const uint8_t *, size_t, char *, size_t);
 extern void	ip_lwput(queue_t *, mblk_t *);
--- a/usr/src/uts/common/inet/ip/igmp.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip/igmp.c	Mon Aug 16 15:30:54 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -1861,7 +1860,7 @@
  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill.
  * The report will contain one group record
  * for each element of reclist.  If this causes packet length to
- * exceed ill->ill_mtu, multiple reports are sent.
+ * exceed ill->ill_mc_mtu, multiple reports are sent.
  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
  * and those buffers are freed here.
  */
@@ -1897,7 +1896,7 @@
 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
 		rsize = sizeof (grphdra_t) +
 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
-		if (size + rsize > ill->ill_mtu) {
+		if (size + rsize > ill->ill_mc_mtu) {
 			if (rp == cur_reclist) {
 				/*
 				 * If the first mrec we looked at is too big
@@ -1908,7 +1907,7 @@
 				 * other types).
 				 */
 				int srcspace, srcsperpkt;
-				srcspace = ill->ill_mtu - (size +
+				srcspace = ill->ill_mc_mtu - (size +
 				    sizeof (grphdra_t));
 
 				/*
@@ -2498,7 +2497,7 @@
 /*
  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
  * report will contain one multicast address record for each element of
- * reclist.  If this causes packet length to exceed ill->ill_mtu,
+ * reclist.  If this causes packet length to exceed ill->ill_mc_mtu,
  * multiple reports are sent.  reclist is assumed to be made up of
  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
  */
@@ -2542,7 +2541,7 @@
 	    rp = rp->mrec_next, numrec++) {
 		rsize = sizeof (mld2mar_t) +
 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
-		if (size + rsize > ill->ill_mtu) {
+		if (size + rsize > ill->ill_mc_mtu) {
 			if (rp == cur_reclist) {
 				/*
 				 * If the first mrec we looked at is too big
@@ -2553,7 +2552,7 @@
 				 * other types).
 				 */
 				int srcspace, srcsperpkt;
-				srcspace = ill->ill_mtu -
+				srcspace = ill->ill_mc_mtu -
 				    (size + sizeof (mld2mar_t));
 
 				/*
--- a/usr/src/uts/common/inet/ip/ip.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip/ip.c	Mon Aug 16 15:30:54 2010 -0700
@@ -3352,7 +3352,7 @@
  * If uinfo is set, then we fill in the best available information
  * we have for the destination. This is based on (in priority order) any
  * metrics and path MTU stored in a dce_t, route metrics, and finally the
- * ill_mtu.
+ * ill_mtu/ill_mc_mtu.
  *
  * Tsol note: If we have a source route then dst_addr != firsthop. But we
  * always do the label check on dst_addr.
@@ -3681,9 +3681,14 @@
 uint_t
 ip_get_base_mtu(ill_t *ill, ire_t *ire)
 {
-	uint_t mtu = ill->ill_mtu;
+	uint_t mtu;
 	uint_t iremtu = ire->ire_metrics.iulp_mtu;
 
+	if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST))
+		mtu = ill->ill_mc_mtu;
+	else
+		mtu = ill->ill_mtu;
+
 	if (iremtu != 0 && iremtu < mtu)
 		mtu = iremtu;
 
@@ -3796,17 +3801,32 @@
 		 * an ill. We'd use the above IP_MAXPACKET in that case just
 		 * to tell the transport something larger than zero.
 		 */
-		if (nce->nce_common->ncec_ill->ill_mtu < pmtu)
-			pmtu = nce->nce_common->ncec_ill->ill_mtu;
-		if (nce->nce_common->ncec_ill != nce->nce_ill &&
-		    nce->nce_ill->ill_mtu < pmtu) {
-			/*
-			 * for interfaces in an IPMP group, the mtu of
-			 * the nce_ill (under_ill) could be different
-			 * from the mtu of the ncec_ill, so we take the
-			 * min of the two.
-			 */
-			pmtu = nce->nce_ill->ill_mtu;
+		if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)) {
+			if (nce->nce_common->ncec_ill->ill_mc_mtu < pmtu)
+				pmtu = nce->nce_common->ncec_ill->ill_mc_mtu;
+			if (nce->nce_common->ncec_ill != nce->nce_ill &&
+			    nce->nce_ill->ill_mc_mtu < pmtu) {
+				/*
+				 * for interfaces in an IPMP group, the mtu of
+				 * the nce_ill (under_ill) could be different
+				 * from the mtu of the ncec_ill, so we take the
+				 * min of the two.
+				 */
+				pmtu = nce->nce_ill->ill_mc_mtu;
+			}
+		} else {
+			if (nce->nce_common->ncec_ill->ill_mtu < pmtu)
+				pmtu = nce->nce_common->ncec_ill->ill_mtu;
+			if (nce->nce_common->ncec_ill != nce->nce_ill &&
+			    nce->nce_ill->ill_mtu < pmtu) {
+				/*
+				 * for interfaces in an IPMP group, the mtu of
+				 * the nce_ill (under_ill) could be different
+				 * from the mtu of the ncec_ill, so we take the
+				 * min of the two.
+				 */
+				pmtu = nce->nce_ill->ill_mtu;
+			}
 		}
 	}
 
@@ -4684,6 +4704,22 @@
 	return (mp);
 }
 
+mblk_t *
+ip_dlnotify_alloc2(uint_t notification, uint_t data1, uint_t data2)
+{
+	dl_notify_ind_t	*notifyp;
+	mblk_t		*mp;
+
+	if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
+		return (NULL);
+
+	notifyp = (dl_notify_ind_t *)mp->b_rptr;
+	notifyp->dl_notification = notification;
+	notifyp->dl_data1 = data1;
+	notifyp->dl_data2 = data2;
+	return (mp);
+}
+
 /*
  * Debug formatting routine.  Returns a character string representation of the
  * addr in buf, of the form xxx.xxx.xxx.xxx.  This routine takes the address
@@ -8449,7 +8485,7 @@
 
 	case DL_NOTIFY_IND: {
 		dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr;
-		uint_t orig_mtu;
+		uint_t orig_mtu, orig_mc_mtu;
 
 		switch (notify->dl_notification) {
 		case DL_NOTE_PHYS_ADDR:
@@ -8470,6 +8506,7 @@
 			break;
 
 		case DL_NOTE_SDU_SIZE:
+		case DL_NOTE_SDU_SIZE2:
 			/*
 			 * The dce and fragmentation code can cope with
 			 * this changing while packets are being sent.
@@ -8479,11 +8516,23 @@
 			 * Change the MTU size of the interface.
 			 */
 			mutex_enter(&ill->ill_lock);
-			ill->ill_current_frag = (uint_t)notify->dl_data;
+			orig_mtu = ill->ill_mtu;
+			orig_mc_mtu = ill->ill_mc_mtu;
+			switch (notify->dl_notification) {
+			case DL_NOTE_SDU_SIZE:
+				ill->ill_current_frag =
+				    (uint_t)notify->dl_data;
+				ill->ill_mc_mtu = (uint_t)notify->dl_data;
+				break;
+			case DL_NOTE_SDU_SIZE2:
+				ill->ill_current_frag =
+				    (uint_t)notify->dl_data1;
+				ill->ill_mc_mtu = (uint_t)notify->dl_data2;
+				break;
+			}
 			if (ill->ill_current_frag > ill->ill_max_frag)
 				ill->ill_max_frag = ill->ill_current_frag;
 
-			orig_mtu = ill->ill_mtu;
 			if (!(ill->ill_flags & ILLF_FIXEDMTU)) {
 				ill->ill_mtu = ill->ill_current_frag;
 
@@ -8495,20 +8544,32 @@
 				    ill->ill_user_mtu < ill->ill_mtu)
 					ill->ill_mtu = ill->ill_user_mtu;
 
+				if (ill->ill_user_mtu != 0 &&
+				    ill->ill_user_mtu < ill->ill_mc_mtu)
+					ill->ill_mc_mtu = ill->ill_user_mtu;
+
 				if (ill->ill_isv6) {
 					if (ill->ill_mtu < IPV6_MIN_MTU)
 						ill->ill_mtu = IPV6_MIN_MTU;
+					if (ill->ill_mc_mtu < IPV6_MIN_MTU)
+						ill->ill_mc_mtu = IPV6_MIN_MTU;
 				} else {
 					if (ill->ill_mtu < IP_MIN_MTU)
 						ill->ill_mtu = IP_MIN_MTU;
+					if (ill->ill_mc_mtu < IP_MIN_MTU)
+						ill->ill_mc_mtu = IP_MIN_MTU;
 				}
-			}
+			} else if (ill->ill_mc_mtu > ill->ill_mtu) {
+				ill->ill_mc_mtu = ill->ill_mtu;
+			}
+
 			mutex_exit(&ill->ill_lock);
 			/*
 			 * Make sure all dce_generation checks find out
-			 * that ill_mtu has changed.
-			 */
-			if (orig_mtu != ill->ill_mtu) {
+			 * that ill_mtu/ill_mc_mtu has changed.
+			 */
+			if (orig_mtu != ill->ill_mtu ||
+			    orig_mc_mtu != ill->ill_mc_mtu) {
 				dce_increment_all_generations(ill->ill_isv6,
 				    ill->ill_ipst);
 			}
--- a/usr/src/uts/common/inet/ip/ip6.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip/ip6.c	Mon Aug 16 15:30:54 2010 -0700
@@ -709,6 +709,8 @@
 	mutex_enter(&dce->dce_lock);
 	if (dce->dce_flags & DCEF_PMTU)
 		old_max_frag = dce->dce_pmtu;
+	else if (IN6_IS_ADDR_MULTICAST(&final_dst))
+		old_max_frag = ill->ill_mc_mtu;
 	else
 		old_max_frag = ill->ill_mtu;
 
@@ -1954,7 +1956,7 @@
  * If uinfo is set, then we fill in the best available information
  * we have for the destination. This is based on (in priority order) any
  * metrics and path MTU stored in a dce_t, route metrics, and finally the
- * ill_mtu.
+ * ill_mtu/ill_mc_mtu.
  *
  * Tsol note: If we have a source route then dst_addr != firsthop. But we
  * always do the label check on dst_addr.
--- a/usr/src/uts/common/inet/ip/ip6_if.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip/ip6_if.c	Mon Aug 16 15:30:54 2010 -0700
@@ -2299,7 +2299,7 @@
 	    (DL_NOTE_PHYS_ADDR | DL_NOTE_SDU_SIZE | DL_NOTE_FASTPATH_FLUSH |
 	    DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN | DL_NOTE_CAPAB_RENEG |
 	    DL_NOTE_PROMISC_ON_PHYS | DL_NOTE_PROMISC_OFF_PHYS |
-	    DL_NOTE_REPLUMB | DL_NOTE_ALLOWED_IPS);
+	    DL_NOTE_REPLUMB | DL_NOTE_ALLOWED_IPS | DL_NOTE_SDU_SIZE2);
 
 	phys_mp = ip_dlpi_alloc(sizeof (dl_phys_addr_req_t) +
 	    sizeof (t_scalar_t), DL_PHYS_ADDR_REQ);
--- a/usr/src/uts/common/inet/ip/ip_dce.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip/ip_dce.c	Mon Aug 16 15:30:54 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -664,7 +663,7 @@
 
 /*
  * Increment the generation number on all dces that have a path MTU and
- * the default DCE. Used when ill_mtu changes.
+ * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
  */
 void
 dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
--- a/usr/src/uts/common/inet/ip/ip_if.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip/ip_if.c	Mon Aug 16 15:30:54 2010 -0700
@@ -3717,6 +3717,7 @@
 		goto done;
 	ill->ill_current_frag = ill->ill_max_frag;
 	ill->ill_mtu = ill->ill_max_frag;	/* Initial value */
+	ill->ill_mc_mtu = ill->ill_mtu;
 	/*
 	 * ipif_loopback_name can't be pointed at directly because its used
 	 * by both the ipv4 and ipv6 interfaces.  When the ill is removed
@@ -4189,6 +4190,7 @@
 	ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
 	ill->ill_current_frag = ill->ill_max_frag;
 	ill->ill_mtu = ill->ill_max_frag;
+	ill->ill_mc_mtu = ill->ill_mtu;	/* Overridden by DL_NOTE_SDU_SIZE2 */
 
 	ill->ill_type = ipm->ip_m_type;
 
@@ -10816,6 +10818,10 @@
 		mutex_exit(&ill->ill_lock);
 		return (EINVAL);
 	}
+	/* Avoid increasing ill_mc_mtu */
+	if (ill->ill_mc_mtu > mtu)
+		ill->ill_mc_mtu = mtu;
+
 	/*
 	 * The dce and fragmentation code can handle changes to ill_mtu
 	 * concurrent with sending/fragmenting packets.
@@ -10826,7 +10832,7 @@
 
 	/*
 	 * Make sure all dce_generation checks find out
-	 * that ill_mtu has changed.
+	 * that ill_mtu/ill_mc_mtu has changed.
 	 */
 	dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
 
@@ -11584,12 +11590,13 @@
 		 * here.
 		 */
 		ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu);
+		ill->ill_mc_mtu = MIN(ill->ill_mc_mtu, ill->ill_user_mtu);
 	}
 	mutex_exit(&ill->ill_lock);
 
 	/*
 	 * Make sure all dce_generation checks find out
-	 * that ill_mtu has changed.
+	 * that ill_mtu/ill_mc_mtu has changed.
 	 */
 	if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0))
 		dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
--- a/usr/src/uts/common/inet/ip/ip_input.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip/ip_input.c	Mon Aug 16 15:30:54 2010 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -1475,7 +1475,7 @@
 		goto done;
 	}
 
-	ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mtu, 0);
+	ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mc_mtu, 0);
 	nce_refrele(nce);
 done:
 	/* Restore */
--- a/usr/src/uts/common/inet/ip/ip_mroute.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip/ip_mroute.c	Mon Aug 16 15:30:54 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -3185,7 +3184,8 @@
 		 * statistics for input errors will be increased on the wrong
 		 * ill but that isn't a big deal.
 		 */
-		ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mtu, 0);
+		ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu,
+		    0);
 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
 
 		nce_refrele(nce);
--- a/usr/src/uts/common/inet/ip/ip_output.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip/ip_output.c	Mon Aug 16 15:30:54 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -328,7 +327,8 @@
 	 * An initial ixa_fragsize was set in ip_set_destination
 	 * and we update it if any routing changes above.
 	 * A change to ill_mtu with ifconfig will increase all dce_generation
-	 * so that we will detect that with the generation check.
+	 * so that we will detect that with the generation check. Ditto for
+	 * ill_mc_mtu.
 	 */
 
 	/*
--- a/usr/src/uts/common/inet/ip/ipmp.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/inet/ip/ipmp.c	Mon Aug 16 15:30:54 2010 -0700
@@ -79,7 +79,7 @@
 static ill_t	*ipmp_illgrp_min_ill(ipmp_illgrp_t *);
 static ill_t	*ipmp_illgrp_max_ill(ipmp_illgrp_t *);
 static void	ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
-static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
+static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t, uint_t);
 static boolean_t ipmp_ill_activate(ill_t *);
 static void	ipmp_ill_deactivate(ill_t *);
 static void	ipmp_ill_ire_mark_testhidden(ire_t *, char *);
@@ -556,7 +556,7 @@
 
 	illg->ig_ipmp_ill = ill;
 	ill->ill_grp = illg;
-	ipmp_illgrp_set_mtu(illg, mtu);
+	ipmp_illgrp_set_mtu(illg, mtu, mtu);
 
 	return (illg);
 }
@@ -995,7 +995,7 @@
  * Caller must be inside the IPSQ unless this is initialization.
  */
 static void
-ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
+ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu, uint_t mc_mtu)
 {
 	ill_t *ill = illg->ig_ipmp_ill;
 	mblk_t *mp;
@@ -1005,8 +1005,9 @@
 	/*
 	 * If allocation fails, we have bigger problems than MTU.
 	 */
-	if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) {
+	if ((mp = ip_dlnotify_alloc2(DL_NOTE_SDU_SIZE2, mtu, mc_mtu)) != NULL) {
 		illg->ig_mtu = mtu;
+		illg->ig_mc_mtu = mc_mtu;
 		put(ill->ill_rq, mp);
 	}
 }
@@ -1021,6 +1022,7 @@
 	ill_t *ill;
 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
 	uint_t mtu = 0;
+	uint_t mc_mtu = 0;
 
 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
 
@@ -1035,6 +1037,8 @@
 		mutex_enter(&ill->ill_lock);
 		if (mtu == 0 || ill->ill_mtu < mtu)
 			mtu = ill->ill_mtu;
+		if (mc_mtu == 0 || ill->ill_mc_mtu < mc_mtu)
+			mc_mtu = ill->ill_mc_mtu;
 		mutex_exit(&ill->ill_lock);
 	}
 
@@ -1042,9 +1046,9 @@
 	 * MTU must be at least the minimum MTU.
 	 */
 	mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
-
-	if (illg->ig_mtu != mtu)
-		ipmp_illgrp_set_mtu(illg, mtu);
+	mc_mtu = MAX(mc_mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
+	if (illg->ig_mtu != mtu || illg->ig_mc_mtu != mc_mtu)
+		ipmp_illgrp_set_mtu(illg, mtu, mc_mtu);
 }
 
 /*
@@ -1174,7 +1178,7 @@
 			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
 			mutex_exit(&ipmp_ill->ill_lock);
 		}
-		ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
+		ipmp_illgrp_set_mtu(illg, ill->ill_mtu, ill->ill_mc_mtu);
 	} else {
 		ASSERT(ipmp_ill->ill_phys_addr_length ==
 		    ill->ill_phys_addr_length);
@@ -1185,8 +1189,11 @@
 			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
 			mutex_exit(&ipmp_ill->ill_lock);
 		}
-		if (illg->ig_mtu > ill->ill_mtu)
-			ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
+		if (illg->ig_mtu > ill->ill_mtu ||
+		    illg->ig_mc_mtu > ill->ill_mc_mtu) {
+			ipmp_illgrp_set_mtu(illg, ill->ill_mtu,
+			    ill->ill_mc_mtu);
+		}
 	}
 
 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
--- a/usr/src/uts/common/io/dld/dld_proto.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/io/dld/dld_proto.c	Mon Aug 16 15:30:54 2010 -0700
@@ -1128,6 +1128,7 @@
 	    DL_NOTE_FASTPATH_FLUSH |
 	    DL_NOTE_SPEED |
 	    DL_NOTE_SDU_SIZE|
+	    DL_NOTE_SDU_SIZE2|
 	    DL_NOTE_ALLOWED_IPS;
 
 	if (MBLKL(mp) < sizeof (dl_notify_req_t)) {
--- a/usr/src/uts/common/io/dld/dld_str.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/io/dld/dld_str.c	Mon Aug 16 15:30:54 2010 -0700
@@ -1394,12 +1394,12 @@
  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
  */
 static void
-str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
+str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu, uint_t multicast_sdu)
 {
 	mblk_t		*mp;
 	dl_notify_ind_t *dlip;
 
-	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
+	if (!(dsp->ds_notifications & (DL_NOTE_SDU_SIZE|DL_NOTE_SDU_SIZE2)))
 		return;
 
 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
@@ -1409,8 +1409,14 @@
 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
 	dlip = (dl_notify_ind_t *)mp->b_rptr;
 	dlip->dl_primitive = DL_NOTIFY_IND;
-	dlip->dl_notification = DL_NOTE_SDU_SIZE;
-	dlip->dl_data = max_sdu;
+	if (dsp->ds_notifications & DL_NOTE_SDU_SIZE2) {
+		dlip->dl_notification = DL_NOTE_SDU_SIZE2;
+		dlip->dl_data1 = max_sdu;
+		dlip->dl_data2 = multicast_sdu;
+	} else {
+		dlip->dl_notification = DL_NOTE_SDU_SIZE;
+		dlip->dl_data = max_sdu;
+	}
 
 	qreply(dsp->ds_wq, mp);
 }
@@ -1865,8 +1871,9 @@
 
 	case MAC_NOTE_SDU_SIZE: {
 		uint_t  max_sdu;
-		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
-		str_notify_sdu_size(dsp, max_sdu);
+		uint_t	multicast_sdu;
+		mac_sdu_get2(dsp->ds_mh, NULL, &max_sdu, &multicast_sdu);
+		str_notify_sdu_size(dsp, max_sdu, multicast_sdu);
 		break;
 	}
 
--- a/usr/src/uts/common/io/ib/clients/ibd/ibd.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/io/ib/clients/ibd/ibd.c	Mon Aug 16 15:30:54 2010 -0700
@@ -155,7 +155,7 @@
  * Changing the linkmode requires some bookkeeping in the driver. The
  * capabilities need to be re-reported to the mac layer. This is done by
  * calling mac_capab_update().  The maxsdu is updated by calling
- * mac_maxsdu_update().
+ * mac_maxsdu_update2().
  * The private properties retain their values across the change of linkmode.
  * NOTE:
  * - The port driver does not support any property apart from mtu.
@@ -2392,6 +2392,7 @@
 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
 	macp->m_callbacks = &ibd_m_callbacks;
 	macp->m_min_sdu = 0;
+	macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
 	if (state->id_type == IBD_PORT_DRIVER) {
 		macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
 	} else if (state->id_enable_rc) {
@@ -4592,14 +4593,16 @@
 					}
 					state->id_enable_rc = 1;
 					/* inform MAC framework of new MTU */
-					err = mac_maxsdu_update(state->id_mh,
-					    state->rc_mtu - IPOIB_HDRSIZE);
+					err = mac_maxsdu_update2(state->id_mh,
+					    state->rc_mtu - IPOIB_HDRSIZE,
+					    state->id_mtu - IPOIB_HDRSIZE);
 				} else {
 					if (!state->id_enable_rc) {
 						return (0);
 					}
 					state->id_enable_rc = 0;
-					err = mac_maxsdu_update(state->id_mh,
+					err = mac_maxsdu_update2(state->id_mh,
+					    state->id_mtu - IPOIB_HDRSIZE,
 					    state->id_mtu - IPOIB_HDRSIZE);
 				}
 				(void) ibd_record_capab(state);
@@ -6007,8 +6010,9 @@
 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
 
 	if (!state->id_enable_rc) {
-		(void) mac_maxsdu_update(state->id_mh, state->id_mtu
-		    - IPOIB_HDRSIZE);
+		(void) mac_maxsdu_update2(state->id_mh,
+		    state->id_mtu - IPOIB_HDRSIZE,
+		    state->id_mtu - IPOIB_HDRSIZE);
 	}
 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
 
@@ -8272,6 +8276,7 @@
 	macp->m_src_addr	= (uint8_t *)&state->id_macaddr;
 	macp->m_callbacks	= &ibd_m_callbacks;
 	macp->m_min_sdu		= 0;
+	macp->m_multicast_sdu	= IBD_DEF_MAX_SDU;
 	if (state->id_enable_rc) {
 		macp->m_max_sdu		= IBD_DEF_RC_MAX_SDU;
 	} else {
--- a/usr/src/uts/common/io/mac/mac.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/io/mac/mac.c	Mon Aug 16 15:30:54 2010 -0700
@@ -3140,7 +3140,7 @@
 		uint32_t sdu;
 
 		ASSERT(valsize >= sizeof (uint32_t));
-		mac_sdu_get(mh, NULL, &sdu);
+		mac_sdu_get2(mh, NULL, &sdu, NULL);
 		bcopy(&sdu, val, sizeof (sdu));
 
 		return (0);
@@ -3398,7 +3398,7 @@
 	case MAC_PROP_MTU: {
 		uint32_t sdu;
 
-		mac_sdu_get(mh, NULL, &sdu);
+		mac_sdu_get2(mh, NULL, &sdu, NULL);
 
 		if (range != NULL && !(state.pr_flags &
 		    MAC_PROP_INFO_RANGE)) {
--- a/usr/src/uts/common/io/mac/mac_client.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/io/mac/mac_client.c	Mon Aug 16 15:30:54 2010 -0700
@@ -783,6 +783,20 @@
 		*max_sdu = mip->mi_sdu_max;
 }
 
+void
+mac_sdu_get2(mac_handle_t mh, uint_t *min_sdu, uint_t *max_sdu,
+    uint_t *multicast_sdu)
+{
+	mac_impl_t	*mip = (mac_impl_t *)mh;
+
+	if (min_sdu != NULL)
+		*min_sdu = mip->mi_sdu_min;
+	if (max_sdu != NULL)
+		*max_sdu = mip->mi_sdu_max;
+	if (multicast_sdu != NULL)
+		*multicast_sdu = mip->mi_sdu_multicast;
+}
+
 /*
  * Update the MAC unicast address of the specified client's flows. Currently
  * only one unicast MAC unicast address is allowed per client.
--- a/usr/src/uts/common/io/mac/mac_provider.c	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/io/mac/mac_provider.c	Mon Aug 16 15:30:54 2010 -0700
@@ -212,8 +212,14 @@
 	mip->mi_info.mi_nativemedia = mtype->mt_nativetype;
 	if (mregp->m_max_sdu <= mregp->m_min_sdu)
 		goto fail;
+	if (mregp->m_multicast_sdu == 0)
+		mregp->m_multicast_sdu = mregp->m_max_sdu;
+	if (mregp->m_multicast_sdu < mregp->m_min_sdu ||
+	    mregp->m_multicast_sdu > mregp->m_max_sdu)
+		goto fail;
 	mip->mi_sdu_min = mregp->m_min_sdu;
 	mip->mi_sdu_max = mregp->m_max_sdu;
+	mip->mi_sdu_multicast = mregp->m_multicast_sdu;
 	mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length;
 	/*
 	 * If the media supports a broadcast address, cache a pointer to it
@@ -934,6 +940,13 @@
 	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
 }
 
+/*
+ * Used by normal drivers to update the max sdu size.
+ * We need to handle the case of a smaller mi_sdu_multicast
+ * since this is called by mac_set_mtu() even for drivers that
+ * have differing unicast and multicast mtu and we don't want to
+ * increase the multicast mtu by accident in that case.
+ */
 int
 mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
 {
@@ -942,6 +955,31 @@
 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
 		return (EINVAL);
 	mip->mi_sdu_max = sdu_max;
+	if (mip->mi_sdu_multicast > mip->mi_sdu_max)
+		mip->mi_sdu_multicast = mip->mi_sdu_max;
+
+	/* Send a MAC_NOTE_SDU_SIZE notification. */
+	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
+	return (0);
+}
+
+/*
+ * Version of the above function that is used by drivers that have a different
+ * max sdu size for multicast/broadcast vs. unicast.
+ */
+int
+mac_maxsdu_update2(mac_handle_t mh, uint_t sdu_max, uint_t sdu_multicast)
+{
+	mac_impl_t	*mip = (mac_impl_t *)mh;
+
+	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
+		return (EINVAL);
+	if (sdu_multicast == 0)
+		sdu_multicast = sdu_max;
+	if (sdu_multicast > sdu_max || sdu_multicast < mip->mi_sdu_min)
+		return (EINVAL);
+	mip->mi_sdu_max = sdu_max;
+	mip->mi_sdu_multicast = sdu_multicast;
 
 	/* Send a MAC_NOTE_SDU_SIZE notification. */
 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
--- a/usr/src/uts/common/sys/dlpi.h	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/sys/dlpi.h	Mon Aug 16 15:30:54 2010 -0700
@@ -407,6 +407,7 @@
 #define	DL_NOTE_CAPAB_RENEG	0x0400	/* Initiate capability renegotiation */
 #define	DL_NOTE_REPLUMB		0x0800	/* Inform the link to replumb */
 #define	DL_NOTE_ALLOWED_IPS	0x1000	/* "allowed-ips"  notification */
+#define	DL_NOTE_SDU_SIZE2	0x2000	/* New unicast and multicast size */
 
 /*
  * DLPI notification codes for DL_NOTIFY_CONF primitives.
@@ -991,7 +992,13 @@
 typedef struct {
 	t_uscalar_t	dl_primitive;	/* set to DL_NOTIFY_IND */
 	uint32_t	dl_notification; /* Which notification? */
-	uint32_t	dl_data;	/* notification specific */
+	union {
+		uint32_t	dlu_data32;	/* notification specific */
+		uint16_t	dlu_data16[2];	/* For DL_NOTE_SDU_SIZE2 */
+	} dl_dlu;
+#define	dl_data		dl_dlu.dlu_data32
+#define	dl_data1	dl_dlu.dlu_data16[0]	/* Unicast MTU */
+#define	dl_data2	dl_dlu.dlu_data16[1]	/* Multicast MTU */
 	t_uscalar_t	dl_addr_length;	/* length of complete DLSAP addr */
 	t_uscalar_t	dl_addr_offset;	/* offset from start of M_PROTO */
 } dl_notify_ind_t;
--- a/usr/src/uts/common/sys/mac.h	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/sys/mac.h	Mon Aug 16 15:30:54 2010 -0700
@@ -584,7 +584,11 @@
 extern minor_t			mac_minor_hold(boolean_t);
 extern void			mac_minor_rele(minor_t);
 extern void			mac_sdu_get(mac_handle_t, uint_t *, uint_t *);
+extern void			mac_sdu_get2(mac_handle_t, uint_t *, uint_t *,
+				    uint_t *);
 extern int			mac_maxsdu_update(mac_handle_t, uint_t);
+extern int			mac_maxsdu_update2(mac_handle_t, uint_t,
+				    uint_t);
 extern uint_t			mac_addr_len(mac_handle_t);
 extern int			mac_type(mac_handle_t);
 extern int			mac_nativetype(mac_handle_t);
--- a/usr/src/uts/common/sys/mac_impl.h	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/sys/mac_impl.h	Mon Aug 16 15:30:54 2010 -0700
@@ -494,6 +494,7 @@
 	uint32_t		mi_margin;		/* mi_rw_lock */
 	uint_t			mi_sdu_min;		/* mi_rw_lock */
 	uint_t			mi_sdu_max;		/* mi_rw_lock */
+	uint_t			mi_sdu_multicast;	/* mi_rw_lock */
 
 	/*
 	 * Cache of factory MAC addresses provided by the driver. If
--- a/usr/src/uts/common/sys/mac_provider.h	Mon Aug 16 15:11:00 2010 -0700
+++ b/usr/src/uts/common/sys/mac_provider.h	Mon Aug 16 15:30:54 2010 -0700
@@ -450,6 +450,7 @@
 	char			**m_priv_props;
 	uint32_t		m_margin;
 	uint32_t		m_v12n;		/* Virtualization level */
+	uint_t			m_multicast_sdu;
 } mac_register_t;
 
 /*
@@ -457,7 +458,11 @@
  */
 extern mac_protect_t		*mac_protect_get(mac_handle_t);
 extern void			mac_sdu_get(mac_handle_t, uint_t *, uint_t *);
+extern void			mac_sdu_get2(mac_handle_t, uint_t *, uint_t *,
+				    uint_t *);
 extern int			mac_maxsdu_update(mac_handle_t, uint_t);
+extern int			mac_maxsdu_update2(mac_handle_t, uint_t,
+				    uint_t);
 
 extern mac_register_t		*mac_alloc(uint_t);
 extern void			mac_free(mac_register_t *);