changeset 13997:c5f371428359

918 Need better IP fanout (esp. with VLANs present) Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> Reviewed by: Sebastien Roy <sebastien.roy@delphix.com> Reviewed by: Garrett D'Amore <garrett@damore.org> Approved by: Gordon Ross <gwr@nexenta.com>
author Dan McDonald <danmcd@nexenta.com>
date Wed, 03 Apr 2013 14:02:24 -0400
parents df716869fcba
children 4ba0940c01f8
files usr/src/uts/common/io/mac/mac_sched.c
diffstat 1 files changed, 130 insertions(+), 108 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/io/mac/mac_sched.c	Fri Mar 22 15:27:11 2013 -0400
+++ b/usr/src/uts/common/io/mac/mac_sched.c	Wed Apr 03 14:02:24 2013 -0400
@@ -21,9 +21,8 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- */
-/*
  * Copyright 2011 Joyent, Inc.  All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -530,12 +529,13 @@
 
 /*
  * In general we do port based hashing to spread traffic over different
- * softrings. The below tunable allows to override that behavior. Setting it
- * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior
- * is also the applicable to ipv6 packets carrying multiple optional headers
- * and other uncommon packet types.
+ * softrings. The tunables below allow overriding that behavior. Setting one
+ * (depending on IPv6 or IPv4) to B_TRUE forces a fanout based on the src
+ * IPv6 or IPv4 address. This behavior also applies to IPv6 packets
+ * carrying multiple optional headers and other uncommon packet types.
  */
 boolean_t mac_src_ipv6_fanout = B_FALSE;
+boolean_t mac_src_ipv4_fanout = B_FALSE;
 
 /*
  * Pair of local and remote ports in the transport header
@@ -760,146 +760,168 @@
 	}
 }
 
-int	fanout_unalligned = 0;
+int	fanout_unaligned = 0;
 
 /*
  * mac_rx_srs_long_fanout
  *
- * The fanout routine for IPv6
+ * The fanout routine for VLANs, and for anything else that isn't performing
+ * explicit dls bypass.  Returns -1 on an error (drop the packet due to a
+ * malformed packet), 0 on success, with values written in *indx and *type.
  */
 static int
 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
     uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
 {
 	ip6_t		*ip6h;
+	ipha_t		*ipha;
 	uint8_t		*whereptr;
 	uint_t		hash;
 	uint16_t	remlen;
 	uint8_t		nexthdr;
 	uint16_t	hdr_len;
+	uint32_t	src_val;
+	boolean_t	modifiable = B_TRUE;
+	boolean_t	v6;
+
+	ASSERT(MBLKL(mp) >= hdrsize);
 
 	if (sap == ETHERTYPE_IPV6) {
-		boolean_t	modifiable = B_TRUE;
-
-		ASSERT(MBLKL(mp) >= hdrsize);
-
-		ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
-		if ((unsigned char *)ip6h == mp->b_wptr) {
-			/*
-			 * The first mblk_t only includes the mac header.
-			 * Note that it is safe to change the mp pointer here,
-			 * as the subsequent operation does not assume mp
-			 * points to the start of the mac header.
-			 */
-			mp = mp->b_cont;
-
-			/*
-			 * Make sure ip6h holds the full ip6_t structure.
-			 */
-			if (mp == NULL)
+		v6 = B_TRUE;
+		hdr_len = IPV6_HDR_LEN;
+	} else if (sap == ETHERTYPE_IP) {
+		v6 = B_FALSE;
+		hdr_len = IP_SIMPLE_HDR_LENGTH;
+	} else {
+		*indx = 0;
+		*type = OTH;
+		return (0);
+	}
+
+	ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
+	ipha = (ipha_t *)ip6h;
+
+	if ((uint8_t *)ip6h == mp->b_wptr) {
+		/*
+		 * The first mblk_t only includes the mac header.
+		 * Note that it is safe to change the mp pointer here,
+		 * as the subsequent operation does not assume mp
+		 * points to the start of the mac header.
+		 */
+		mp = mp->b_cont;
+
+		/*
+		 * Make sure this mblk holds an entire (simple) IP header.
+		 */
+		if (mp == NULL)
+			return (-1);
+
+		if (MBLKL(mp) < hdr_len) {
+			modifiable = (DB_REF(mp) == 1);
+
+			if (modifiable && !pullupmsg(mp, hdr_len))
 				return (-1);
-
-			if (MBLKL(mp) < IPV6_HDR_LEN) {
-				modifiable = (DB_REF(mp) == 1);
-
-				if (modifiable &&
-				    !pullupmsg(mp, IPV6_HDR_LEN)) {
-					return (-1);
-				}
-			}
-
-			ip6h = (ip6_t *)mp->b_rptr;
 		}
 
-		if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
-		    ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
-			/*
-			 * If either ip6h is not alligned, or ip6h does not
-			 * hold the complete ip6_t structure (a pullupmsg()
-			 * is not an option since it would result in an
-			 * unalligned ip6h), fanout to the default ring. Note
-			 * that this may cause packets reordering.
-			 */
-			*indx = 0;
-			*type = OTH;
-			fanout_unalligned++;
-			return (0);
-		}
-
+		ip6h = (ip6_t *)mp->b_rptr;
+		ipha = (ipha_t *)ip6h;
+	}
+
+	if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
+	    ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
+		/*
+		 * If either the IP header is not aligned, or it does not hold
+		 * the complete simple structure (a pullupmsg() is not an
+		 * option since it would result in an unaligned IP header),
+		 * fanout to the default ring.
+		 *
+		 * Note that this may cause packet reordering.
+		 */
+		*indx = 0;
+		*type = OTH;
+		fanout_unaligned++;
+		return (0);
+	}
+
+	/*
+	 * Extract next-header, full header length, and source-hash value
+	 * using v4/v6 specific fields.
+	 */
+	if (v6) {
 		remlen = ntohs(ip6h->ip6_plen);
 		nexthdr = ip6h->ip6_nxt;
-
-		if (remlen < MIN_EHDR_LEN)
-			return (-1);
+		src_val = V4_PART_OF_V6(ip6h->ip6_src);
 		/*
 		 * Do src based fanout if below tunable is set to B_TRUE or
 		 * when mac_ip_hdr_length_v6() fails because of malformed
-		 * packets or because mblk's need to be concatenated using
+		 * packets or because mblks need to be concatenated using
 		 * pullupmsg().
 		 */
 		if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
 		    mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
 			goto src_based_fanout;
 		}
-		whereptr = (uint8_t *)ip6h + hdr_len;
-
-		/* If the transport is one of below, we do port based fanout */
-		switch (nexthdr) {
-		case IPPROTO_TCP:
-		case IPPROTO_UDP:
-		case IPPROTO_SCTP:
-		case IPPROTO_ESP:
-			/*
-			 * If the ports in the transport header is not part of
-			 * the mblk, do src_based_fanout, instead of calling
-			 * pullupmsg().
-			 */
-			if (mp->b_cont != NULL &&
-			    whereptr + PORTS_SIZE > mp->b_wptr) {
-				goto src_based_fanout;
-			}
-			break;
-		default:
-			break;
-		}
-
-		switch (nexthdr) {
-		case IPPROTO_TCP:
-			hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
-			    *(uint32_t *)whereptr);
-			*indx = COMPUTE_INDEX(hash,
-			    mac_srs->srs_tcp_ring_count);
-			*type = OTH;
-			break;
-
-		case IPPROTO_UDP:
-		case IPPROTO_SCTP:
-		case IPPROTO_ESP:
-			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
-				hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
-				    *(uint32_t *)whereptr);
-				*indx = COMPUTE_INDEX(hash,
-				    mac_srs->srs_udp_ring_count);
-			} else {
-				*indx = mac_srs->srs_ind %
-				    mac_srs->srs_udp_ring_count;
-				mac_srs->srs_ind++;
-			}
-			*type = OTH;
-			break;
-
-			/* For all other protocol, do source based fanout */
-		default:
+	} else {
+		hdr_len = IPH_HDR_LENGTH(ipha);
+		remlen = ntohs(ipha->ipha_length) - hdr_len;
+		nexthdr = ipha->ipha_protocol;
+		src_val = (uint32_t)ipha->ipha_src;
+		/*
+		 * Catch IPv4 fragment case here.  IPv6 has nexthdr == FRAG
+		 * for its equivalent case.
+		 */
+		if (mac_src_ipv4_fanout ||
+		    (ntohs(ipha->ipha_fragment_offset_and_flags) &
+		    (IPH_MF | IPH_OFFSET)) != 0) {
 			goto src_based_fanout;
 		}
-	} else {
-		*indx = 0;
+	}
+	if (remlen < MIN_EHDR_LEN)
+		return (-1);
+	whereptr = (uint8_t *)ip6h + hdr_len;
+
+	/* If the transport is one of below, we do port/SPI based fanout */
+	switch (nexthdr) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_SCTP:
+	case IPPROTO_ESP:
+		/*
+		 * If the ports or SPI in the transport header are not part of
+		 * the mblk, do src_based_fanout, instead of calling
+		 * pullupmsg().
+		 */
+		if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
+			break;	/* out of switch... */
+		/* FALLTHRU */
+	default:
+		goto src_based_fanout;
+	}
+
+	switch (nexthdr) {
+	case IPPROTO_TCP:
+		hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
+		*indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
 		*type = OTH;
+		break;
+	case IPPROTO_UDP:
+	case IPPROTO_SCTP:
+	case IPPROTO_ESP:
+		if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
+			hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
+			*indx = COMPUTE_INDEX(hash,
+			    mac_srs->srs_udp_ring_count);
+		} else {
+			*indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
+			mac_srs->srs_ind++;
+		}
+		*type = OTH;
+		break;
 	}
 	return (0);
 
 src_based_fanout:
-	hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
+	hash = HASH_ADDR(src_val, (uint32_t)0);
 	*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
 	*type = OTH;
 	return (0);