# HG changeset patch # User Dan McDonald # Date 1365012144 14400 # Node ID c5f37142835930ab7517f23a5302cd61ff2f572e # Parent df716869fcba1aac2b4933178d5cb3a9b716c968 918 Need better IP fanout (esp. with VLANs present) Reviewed by: Hans Rosenfeld Reviewed by: Sebastien Roy Reviewed by: Garrett D'Amore Approved by: Gordon Ross diff -r df716869fcba -r c5f371428359 usr/src/uts/common/io/mac/mac_sched.c --- a/usr/src/uts/common/io/mac/mac_sched.c Fri Mar 22 15:27:11 2013 -0400 +++ b/usr/src/uts/common/io/mac/mac_sched.c Wed Apr 03 14:02:24 2013 -0400 @@ -21,9 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - */ -/* * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. */ #include @@ -530,12 +529,13 @@ /* * In general we do port based hashing to spread traffic over different - * softrings. The below tunable allows to override that behavior. Setting it - * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior - * is also the applicable to ipv6 packets carrying multiple optional headers - * and other uncommon packet types. + * softrings. The tunables below override that behavior. Setting one + * (depending on IPv6 or IPv4) to B_TRUE allows a fanout based on src + * IPv6 or IPv4 address. This behavior is also applicable to IPv6 packets + * carrying multiple optional headers and other uncommon packet types. */ boolean_t mac_src_ipv6_fanout = B_FALSE; +boolean_t mac_src_ipv4_fanout = B_FALSE; /* * Pair of local and remote ports in the transport header @@ -760,146 +760,168 @@ } } -int fanout_unalligned = 0; +int fanout_unaligned = 0; /* * mac_rx_srs_long_fanout * - * The fanout routine for IPv6 + * The fanout routine for VLANs, and for anything else that isn't performing + * explicit dls bypass. Returns -1 on an error (drop the packet due to a + * malformed packet), 0 on success, with values written in *indx and *type.
*/ static int mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) { ip6_t *ip6h; + ipha_t *ipha; uint8_t *whereptr; uint_t hash; uint16_t remlen; uint8_t nexthdr; uint16_t hdr_len; + uint32_t src_val; + boolean_t modifiable = B_TRUE; + boolean_t v6; + + ASSERT(MBLKL(mp) >= hdrsize); if (sap == ETHERTYPE_IPV6) { - boolean_t modifiable = B_TRUE; - - ASSERT(MBLKL(mp) >= hdrsize); - - ip6h = (ip6_t *)(mp->b_rptr + hdrsize); - if ((unsigned char *)ip6h == mp->b_wptr) { - /* - * The first mblk_t only includes the mac header. - * Note that it is safe to change the mp pointer here, - * as the subsequent operation does not assume mp - * points to the start of the mac header. - */ - mp = mp->b_cont; - - /* - * Make sure ip6h holds the full ip6_t structure. - */ - if (mp == NULL) + v6 = B_TRUE; + hdr_len = IPV6_HDR_LEN; + } else if (sap == ETHERTYPE_IP) { + v6 = B_FALSE; + hdr_len = IP_SIMPLE_HDR_LENGTH; + } else { + *indx = 0; + *type = OTH; + return (0); + } + + ip6h = (ip6_t *)(mp->b_rptr + hdrsize); + ipha = (ipha_t *)ip6h; + + if ((uint8_t *)ip6h == mp->b_wptr) { + /* + * The first mblk_t only includes the mac header. + * Note that it is safe to change the mp pointer here, + * as the subsequent operation does not assume mp + * points to the start of the mac header. + */ + mp = mp->b_cont; + + /* + * Make sure this mblk holds an entire IP header.
+ */ + if (mp == NULL) + return (-1); + + if (MBLKL(mp) < hdr_len) { + modifiable = (DB_REF(mp) == 1); + + if (modifiable && !pullupmsg(mp, hdr_len)) return (-1); - - if (MBLKL(mp) < IPV6_HDR_LEN) { - modifiable = (DB_REF(mp) == 1); - - if (modifiable && - !pullupmsg(mp, IPV6_HDR_LEN)) { - return (-1); - } - } - - ip6h = (ip6_t *)mp->b_rptr; } - if (!modifiable || !(OK_32PTR((char *)ip6h)) || - ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { - /* - * If either ip6h is not alligned, or ip6h does not - * hold the complete ip6_t structure (a pullupmsg() - * is not an option since it would result in an - * unalligned ip6h), fanout to the default ring. Note - * that this may cause packets reordering. - */ - *indx = 0; - *type = OTH; - fanout_unalligned++; - return (0); - } - + ip6h = (ip6_t *)mp->b_rptr; + ipha = (ipha_t *)ip6h; + } + + if (!modifiable || !(OK_32PTR((char *)ip6h)) || + ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) { + /* + * If either the IP header is not aligned, or it does not hold + * the complete simple structure (a pullupmsg() is not an + * option since it would result in an unaligned IP header), + * fanout to the default ring. + * + * Note that this may cause packet reordering. + */ + *indx = 0; + *type = OTH; + fanout_unaligned++; + return (0); + } + + /* + * Extract next-header, full header length, and source-hash value + * using v4/v6 specific fields. + */ + if (v6) { remlen = ntohs(ip6h->ip6_plen); nexthdr = ip6h->ip6_nxt; - - if (remlen < MIN_EHDR_LEN) - return (-1); + src_val = V4_PART_OF_V6(ip6h->ip6_src); /* * Do src based fanout if below tunable is set to B_TRUE or * when mac_ip_hdr_length_v6() fails because of malformed - * packets or because mblk's need to be concatenated using + * packets or because mblks need to be concatenated using * pullupmsg(). 
*/ if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h, mp->b_wptr, &hdr_len, &nexthdr, NULL)) { goto src_based_fanout; } - whereptr = (uint8_t *)ip6h + hdr_len; - - /* If the transport is one of below, we do port based fanout */ - switch (nexthdr) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_SCTP: - case IPPROTO_ESP: - /* - * If the ports in the transport header is not part of - * the mblk, do src_based_fanout, instead of calling - * pullupmsg(). - */ - if (mp->b_cont != NULL && - whereptr + PORTS_SIZE > mp->b_wptr) { - goto src_based_fanout; - } - break; - default: - break; - } - - switch (nexthdr) { - case IPPROTO_TCP: - hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), - *(uint32_t *)whereptr); - *indx = COMPUTE_INDEX(hash, - mac_srs->srs_tcp_ring_count); - *type = OTH; - break; - - case IPPROTO_UDP: - case IPPROTO_SCTP: - case IPPROTO_ESP: - if (mac_fanout_type == MAC_FANOUT_DEFAULT) { - hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), - *(uint32_t *)whereptr); - *indx = COMPUTE_INDEX(hash, - mac_srs->srs_udp_ring_count); - } else { - *indx = mac_srs->srs_ind % - mac_srs->srs_udp_ring_count; - mac_srs->srs_ind++; - } - *type = OTH; - break; - - /* For all other protocol, do source based fanout */ - default: + } else { + hdr_len = IPH_HDR_LENGTH(ipha); + remlen = ntohs(ipha->ipha_length) - hdr_len; + nexthdr = ipha->ipha_protocol; + src_val = (uint32_t)ipha->ipha_src; + /* + * Catch IPv4 fragment case here. IPv6 has nexthdr == FRAG + * for its equivalent case. 
+ */ + if (mac_src_ipv4_fanout || + (ntohs(ipha->ipha_fragment_offset_and_flags) & + (IPH_MF | IPH_OFFSET)) != 0) { goto src_based_fanout; } - } else { - *indx = 0; + } + if (remlen < MIN_EHDR_LEN) + return (-1); + whereptr = (uint8_t *)ip6h + hdr_len; + + /* If the transport is one of below, we do port/SPI based fanout */ + switch (nexthdr) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_ESP: + /* + * If the ports or SPI in the transport header is not part of + * the mblk, do src_based_fanout, instead of calling + * pullupmsg(). + */ + if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr) + break; /* out of switch... */ + /* FALLTHRU */ + default: + goto src_based_fanout; + } + + switch (nexthdr) { + case IPPROTO_TCP: + hash = HASH_ADDR(src_val, *(uint32_t *)whereptr); + *indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); *type = OTH; + break; + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_ESP: + if (mac_fanout_type == MAC_FANOUT_DEFAULT) { + hash = HASH_ADDR(src_val, *(uint32_t *)whereptr); + *indx = COMPUTE_INDEX(hash, + mac_srs->srs_udp_ring_count); + } else { + *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count; + mac_srs->srs_ind++; + } + *type = OTH; + break; } return (0); src_based_fanout: - hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); + hash = HASH_ADDR(src_val, (uint32_t)0); *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); *type = OTH; return (0);