Mercurial > illumos > illumos-gate
changeset 11681:fe992d6ccc26
4173841 Packet goes out with source IP address of another interface
6921533 ioctls could be executed when the thread is not a WRITER on the ipif
6921451 ira_pktlen not computed correctly for ipsec packets
6921615 IPMP need ~5 seconds for traffic passing through when its state transfers from failed
author | Sowmini Varadhan <Sowmini.Varadhan@Sun.COM> |
---|---|
date | Wed, 17 Feb 2010 22:59:58 -0500 |
parents | f7d6d87905e0 |
children | 6b625d44458f |
files | usr/src/uts/common/inet/ip.h usr/src/uts/common/inet/ip/icmp.c usr/src/uts/common/inet/ip/ip.c usr/src/uts/common/inet/ip/ip6.c usr/src/uts/common/inet/ip/ip6_if.c usr/src/uts/common/inet/ip/ip6_input.c usr/src/uts/common/inet/ip/ip6_ire.c usr/src/uts/common/inet/ip/ip6_output.c usr/src/uts/common/inet/ip/ip_ftable.c usr/src/uts/common/inet/ip/ip_if.c usr/src/uts/common/inet/ip/ip_input.c usr/src/uts/common/inet/ip/ip_ire.c usr/src/uts/common/inet/ip/ip_mroute.c usr/src/uts/common/inet/ip/ip_ndp.c usr/src/uts/common/inet/ip/ip_output.c usr/src/uts/common/inet/ip/ipsecesp.c usr/src/uts/common/inet/ip_if.h usr/src/uts/common/inet/ip_ire.h |
diffstat | 18 files changed, 629 insertions(+), 95 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/common/inet/ip.h Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip.h Wed Feb 17 22:59:58 2010 -0500 @@ -2242,8 +2242,9 @@ /* * The normal flags for sending packets e.g., icmp errors */ -#define IXAF_BASIC_SIMPLE_V4 (IXAF_SET_ULP_CKSUM | IXAF_IS_IPV4) -#define IXAF_BASIC_SIMPLE_V6 (IXAF_SET_ULP_CKSUM) +#define IXAF_BASIC_SIMPLE_V4 \ + (IXAF_SET_ULP_CKSUM | IXAF_IS_IPV4 | IXAF_VERIFY_SOURCE) +#define IXAF_BASIC_SIMPLE_V6 (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE) /* * Normally these fields do not have a hold. But in some cases they do, for @@ -2677,6 +2678,12 @@ boolean_t ire_trace_disable; /* True when alloc fails */ ip_stack_t *ire_ipst; /* Does not have a netstack_hold */ iulp_t ire_metrics; + /* + * default and prefix routes that are added without explicitly + * specifying the interface are termed "unbound" routes, and will + * have ire_unbound set to true. + */ + boolean_t ire_unbound; }; /* IPv4 compatibility macros */ @@ -3005,6 +3012,8 @@ #define ips_ipv6_icmp_return_pmtu ips_param_arr[73].ip_param_value #define ips_ip_arp_publish_count ips_param_arr[74].ip_param_value #define ips_ip_arp_publish_interval ips_param_arr[75].ip_param_value +#define ips_ip_strict_src_multihoming ips_param_arr[76].ip_param_value +#define ips_ipv6_strict_src_multihoming ips_param_arr[77].ip_param_value extern int dohwcksum; /* use h/w cksum if supported by the h/w */ #ifdef ZC_TEST
--- a/usr/src/uts/common/inet/ip/icmp.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/icmp.c Wed Feb 17 22:59:58 2010 -0500 @@ -3103,6 +3103,16 @@ /* Even for multicast and broadcast we honor the apps ttl */ ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + /* + * No source verification for non-local addresses + */ + if (ipha->ipha_src != INADDR_ANY && + ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, + is->is_netstack->netstack_ip, B_FALSE) + != IPVL_UNICAST_UP) { + ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; + } + if (ipha->ipha_dst == INADDR_ANY) ipha->ipha_dst = htonl(INADDR_LOOPBACK); @@ -3468,6 +3478,26 @@ v6src = ipp->ipp_addr; } } + /* + * Allow source not assigned to the system + * only if it is not a local address + */ + if (!V6_OR_V4_INADDR_ANY(v6src)) { + ip_laddr_t laddr_type; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipaddr_t v4src; + + IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); + laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, + is->is_netstack->netstack_ip, B_FALSE); + } else { + laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, + is->is_netstack->netstack_ip, B_FALSE, B_FALSE); + } + if (laddr_type != IPVL_UNICAST_UP) + ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; + } ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, @@ -3562,8 +3592,6 @@ /* We're done. Pass the packet to ip. */ BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - /* Allow source not assigned to the system? */ - ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; error = conn_ip_output(mp, ixa); if (!connp->conn_unspec_src) ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
--- a/usr/src/uts/common/inet/ip/ip.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip.c Wed Feb 17 22:59:58 2010 -0500 @@ -826,6 +826,10 @@ { 0, 99999, 100, "ip_icmp_err_interval" }, { 1, 99999, 10, "ip_icmp_err_burst" }, { 0, 999999999, 1000000, "ip_reass_queue_bytes" }, + /* + * See comments for ip_strict_src_multihoming for an explanation + * of the semantics of ip_strict_dst_multihoming + */ { 0, 1, 0, "ip_strict_dst_multihoming" }, { 1, MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"}, { 0, 1, 0, "ipsec_override_persocket_policy" }, @@ -841,6 +845,10 @@ { 0, 1, 1, "ip6_respond_to_echo_multicast"}, { 0, 1, 1, "ip6_send_redirects"}, { 0, 1, 0, "ip6_ignore_redirect" }, + /* + * See comments for ip6_strict_src_multihoming for an explanation + * of the semantics of ip6_strict_dst_multihoming + */ { 0, 1, 0, "ip6_strict_dst_multihoming" }, { 0, 2, 2, "ip_src_check" }, @@ -907,7 +915,48 @@ * for IPv4, IPv6. */ { 1, 20, 5, "ip_arp_publish_count" }, - { 1000, 20000, 2000, "ip_arp_publish_interval" }, + { 1000, 20000, 2000, "ip_arp_publish_interval" }, + /* + * The ip*strict_src_multihoming and ip*strict_dst_multihoming provide + * a range of choices for setting strong/weak/preferred end-system + * behavior. The semantics for setting these are: + * + * ip*_strict_dst_multihoming = 0 + * weak end system model for managing ip destination addresses. + * A packet with IP dst D1 that's received on interface I1 will be + * accepted as long as D1 is one of the local addresses on + * the machine, even if D1 is not configured on I1. + * ip*_strict_dst_multihoming = 1 + * strong end system model for managing ip destination addresses. + * A packet with IP dst D1 that's received on interface I1 will be + * accepted if, and only if, D1 is configured on I1. 
+ * + * ip*strict_src_multihoming = 0 + * Source agnostic route selection for outgoing packets: the + * outgoing interface for a packet will be computed using + * default algorithms for route selection, where the route + * with the longest matching prefix is chosen for the output + * unless other route selection constraints are explicitly + * specified during routing table lookup. This may result + * in packet being sent out on interface I2 with source + * address S1, even though S1 is not a configured address on I2. + * ip*strict_src_multihoming = 1 + * Preferred source aware route selection for outgoing packets: for + * a packet with source S2, destination D2, the route selection + * algorithm will first attempt to find a route for the destination + * that goes out through an interface where S2 is + * configured. If such a route cannot be found, then the + * best-matching route for D2 will be selected. + * ip*strict_src_multihoming = 2 + * Source aware route selection for outgoing packets: a packet will + * be sent out on an interface I2 only if the src address S2 of the + * packet is a configured address on I2. In conjunction with + * the setting 'ip_strict_dst_multihoming == 1', this will result in + * the implementation of Strong ES as defined in Section 3.3.4.2 of + * RFC 1122 + */ + { 0, 2, 0, "ip_strict_src_multihoming" }, + { 0, 2, 0, "ip6_strict_src_multihoming" } }; /* @@ -3562,8 +3611,8 @@ * a "hidden" route (i.e., going through a specific under_ill) * if ixa_ifindex has been specified. 
*/ - ire = ip_select_route_v4(firsthop, ixa, &generation, &setsrc, &error, - &multirt); + ire = ip_select_route_v4(firsthop, *src_addrp, ixa, + &generation, &setsrc, &error, &multirt); ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ if (error != 0) goto bad_addr; @@ -6773,6 +6822,85 @@ return (B_TRUE); } +/* + * When the src multihoming is changed from weak to [strong, preferred] + * ip_ire_rebind_walker is called to walk the list of all ire_t entries + * and identify routes that were created by user-applications in the + * unbound state (i.e., without RTA_IFP), and for which an ire_ill is not + * currently defined. These routes are then 'rebound', i.e., their ire_ill + * is selected by finding an interface route for the gateway. + */ +/* ARGSUSED */ +static void +ip_ire_rebind_walker(ire_t *ire, void *notused) +{ + if (!ire->ire_unbound || ire->ire_ill != NULL) + return; + ire_rebind(ire); + ire_delete(ire); +} + +/* + * When the src multihoming is changed from [strong, preferred] to weak, + * ip_ire_unbind_walker is called to walk the list of all ire_t entries, and + * set any entries that were created by user-applications in the unbound state + * (i.e., without RTA_IFP) back to having a NULL ire_ill. 
+ */ +/* ARGSUSED */ +static void +ip_ire_unbind_walker(ire_t *ire, void *notused) +{ + ire_t *new_ire; + + if (!ire->ire_unbound || ire->ire_ill == NULL) + return; + if (ire->ire_ipversion == IPV6_VERSION) { + new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6, + &ire->ire_gateway_addr_v6, ire->ire_type, NULL, + ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst); + } else { + new_ire = ire_create((uchar_t *)&ire->ire_addr, + (uchar_t *)&ire->ire_mask, + (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, NULL, + ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst); + } + if (new_ire == NULL) + return; + new_ire->ire_unbound = B_TRUE; + /* + * The bound ire must first be deleted so that we don't return + * the existing one on the attempt to add the unbound new_ire. + */ + ire_delete(ire); + new_ire = ire_add(new_ire); + if (new_ire != NULL) + ire_refrele(new_ire); +} + +/* + * When the settings of ip*_strict_src_multihoming tunables are changed, + * all cached routes need to be recomputed. This recomputation needs to be + * done when going from weaker to stronger modes so that the cached ire + * for the connection does not violate the current ip*_strict_src_multihoming + * setting. It also needs to be done when going from stronger to weaker modes, + * so that we fall back to matching on the longest-matching-route (as opposed + * to a shorter match that may have been selected in the strong mode + * to satisfy src_multihoming settings). + * + * The cached ixa_ire entries for all conn_t entries are marked as + * "verify" so that they will be recomputed for the next packet. + */ +static void +conn_ire_revalidate(conn_t *connp, void *arg) +{ + boolean_t isv6 = (boolean_t)arg; + + if ((isv6 && connp->conn_ipversion != IPV6_VERSION) || + (!isv6 && connp->conn_ipversion != IPV4_VERSION)) + return; + connp->conn_ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; +} + /* Named Dispatch routine to negotiate a new value for one of our parameters. 
*/ /* ARGSUSED */ static int @@ -6780,12 +6908,35 @@ { long new_value; ipparam_t *ippa = (ipparam_t *)cp; - + ip_stack_t *ipst = CONNQ_TO_IPST(q); + int strict_src4, strict_src6; + + strict_src4 = ipst->ips_ip_strict_src_multihoming; + strict_src6 = ipst->ips_ipv6_strict_src_multihoming; if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) { return (EINVAL); } ippa->ip_param_value = new_value; + if (ipst->ips_ip_strict_src_multihoming != strict_src4) { + if (strict_src4 == 0) { + ire_walk_v4(ip_ire_rebind_walker, NULL, ALL_ZONES, + ipst); + } else if (new_value == 0) { + ire_walk_v4(ip_ire_unbind_walker, NULL, ALL_ZONES, + ipst); + } + ipcl_walk(conn_ire_revalidate, (void *)B_FALSE, ipst); + } else if (ipst->ips_ipv6_strict_src_multihoming != strict_src6) { + if (strict_src6 == 0) { + ire_walk_v6(ip_ire_rebind_walker, NULL, ALL_ZONES, + ipst); + } else if (new_value == 0) { + ire_walk_v6(ip_ire_unbind_walker, NULL, ALL_ZONES, + ipst); + } + ipcl_walk(conn_ire_revalidate, (void *)B_TRUE, ipst); + } return (0); }
--- a/usr/src/uts/common/inet/ip/ip6.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip6.c Wed Feb 17 22:59:58 2010 -0500 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* @@ -2004,8 +2004,8 @@ * a "hidden" route (i.e., going through a specific under_ill) * if ixa_ifindex has been specified. */ - ire = ip_select_route_v6(firsthop, ixa, &generation, &setsrc, &error, - &multirt); + ire = ip_select_route_v6(firsthop, *src_addrp, ixa, &generation, + &setsrc, &error, &multirt); ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ if (error != 0) goto bad_addr;
--- a/usr/src/uts/common/inet/ip/ip6_if.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip6_if.c Wed Feb 17 22:59:58 2010 -0500 @@ -405,6 +405,7 @@ tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; boolean_t gcgrp_xtraref = B_FALSE; + boolean_t unbound = B_FALSE; if (ire_arg != NULL) *ire_arg = NULL; @@ -724,6 +725,11 @@ ipif_refrele(ipif); return (ENETUNREACH); } + if (ill == NULL && !(flags & RTF_INDIRECT)) { + unbound = B_TRUE; + if (ipst->ips_ipv6_strict_src_multihoming > 0) + ill = gw_ire->ire_ill; + } /* * We create one of three types of IREs as a result of this request @@ -819,6 +825,8 @@ if ((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr)) ire->ire_setsrc_addr_v6 = *src_addr; + ire->ire_unbound = unbound; + /* * POLICY: should we allow an RTF_HOST with address INADDR_ANY? * SUN/OS socket stuff does but do we really want to allow ::0 ?
--- a/usr/src/uts/common/inet/ip/ip6_input.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip6_input.c Wed Feb 17 22:59:58 2010 -0500 @@ -1539,6 +1539,8 @@ zoneid_t zoneid; mblk_t *mp1; ip6_t *ip6h1; + uint_t ira_pktlen = ira->ira_pktlen; + uint16_t ira_ip_hdr_length = ira->ira_ip_hdr_length; /* ire_recv_multicast has switched to the upper ill for IPMP */ ASSERT(!IS_UNDER_IPMP(ill)); @@ -1598,6 +1600,12 @@ } ip6h1 = (ip6_t *)mp1->b_rptr; ip_fanout_v6(mp1, ip6h1, ira); + /* + * IPsec might have modified ira_pktlen and ira_ip_hdr_length + * so we restore them for a potential next iteration + */ + ira->ira_pktlen = ira_pktlen; + ira->ira_ip_hdr_length = ira_ip_hdr_length; } /* Do the main ire */
--- a/usr/src/uts/common/inet/ip/ip6_ire.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip6_ire.c Wed Feb 17 22:59:58 2010 -0500 @@ -690,7 +690,7 @@ ASSERT(addr != NULL); ASSERT(mask != NULL); ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); - ASSERT((!(match_flags & MATCH_IRE_ILL)) || + ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) || (ill != NULL && ill->ill_isv6)); /* @@ -771,7 +771,7 @@ } } /* - * For exampe, with + * For example, with * route add 11.0.0.0 gw1 -ifp bge0 * route add 11.0.0.0 gw2 -ifp bge1 * this code would differentiate based on @@ -799,13 +799,13 @@ } matchit: + ire_ill = ire->ire_ill; if (match_flags & MATCH_IRE_GW) { mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } if (match_flags & MATCH_IRE_ILL) { - ire_ill = ire->ire_ill; /* * If asked to match an ill, we *must* match @@ -830,6 +830,17 @@ return (B_FALSE); } } + if (match_flags & MATCH_IRE_SRC_ILL) { + if (ire_ill == NULL) + return (B_FALSE); + if (!IS_ON_SAME_LAN(ill, ire_ill)) { + if (ire_ill->ill_usesrc_ifindex == 0 || + (ire_ill->ill_usesrc_ifindex != + ill->ill_phyint->phyint_ifindex)) + return (B_FALSE); + } + } + /* No ire_addr_v6 bits set past the mask */ ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6)); @@ -910,9 +921,9 @@ /* * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL - * is set. + * or MATCH_IRE_SRC_ILL is set. 
*/ - if ((flags & (MATCH_IRE_ILL)) && (ill == NULL)) + if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL)) return (NULL); rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); @@ -1113,12 +1124,13 @@ } ire_t * -ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa, - uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) +ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src, + ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, + int *errorp, boolean_t *multirtp) { ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); - return (ip_select_route(dst, ixa, generationp, setsrcp, errorp, + return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp, multirtp)); } @@ -1127,8 +1139,6 @@ * the zoneid, ill, and label. Used for the data paths. See also * ire_route_recursive_dstonly. * - * If ill is set this means we will match it by adding MATCH_IRE_ILL. - * * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never * create an IRE_IF_CLONE. This is used on the receive side when we are not * forwarding. @@ -1164,9 +1174,6 @@ if (gwattrp != NULL) ASSERT(*gwattrp == NULL); - if (ill_arg != NULL) - match_args |= MATCH_IRE_ILL; - /* * We iterate up to three times to resolve a route, even though * we have four slots in the array. The extra slot is for an @@ -1177,7 +1184,7 @@ /* ire_ftable_lookup handles round-robin/ECMP */ if (ire == NULL) { ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, - (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, + (ill != NULL ? ill : ill_arg), zoneid, tsl, match_args, xmit_hint, ipst, &generation); } else { /* Caller passed it; extra hold since we will rele */ @@ -1322,6 +1329,10 @@ * recursing. The type match is used by some callers * to exclude certain types (such as IRE_IF_CLONE or * IRE_LOCAL|IRE_LOOPBACK). 
+ * + * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof' + * ire->ire_ill, and we want to find the IRE_INTERFACE for + * ire_ill, so we set ill to the ire_ill */ match_args &= MATCH_IRE_TYPE; v6nexthop = ire->ire_gateway_addr_v6;
--- a/usr/src/uts/common/inet/ip/ip6_output.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip6_output.c Wed Feb 17 22:59:58 2010 -0500 @@ -150,8 +150,8 @@ repeat_ire: error = 0; setsrc = ipv6_all_zeros; - ire = ip_select_route_v6(&firsthop, ixa, NULL, &setsrc, &error, - &multirt); + ire = ip_select_route_v6(&firsthop, ip6h->ip6_src, ixa, NULL, &setsrc, + &error, &multirt); ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ if (error != 0) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); @@ -1228,10 +1228,13 @@ * starting at ire1. */ ire_t *ire2; + uint_t match_flags = MATCH_IRE_DSTONLY; + if (ire1->ire_ill != NULL) + match_flags |= MATCH_IRE_ILL; ire2 = ire_route_recursive_impl_v6(ire1, &ire1->ire_addr_v6, ire1->ire_type, ire1->ire_ill, - ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY, + ire1->ire_zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); if (ire2 != NULL) ire_refrele(ire2);
--- a/usr/src/uts/common/inet/ip/ip_ftable.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip_ftable.c Wed Feb 17 22:59:58 2010 -0500 @@ -77,6 +77,10 @@ (((ire)->ire_type & IRE_DEFAULT) || \ (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0))) +#define IP_SRC_MULTIHOMING(isv6, ipst) \ + (isv6 ? ipst->ips_ipv6_strict_src_multihoming : \ + ipst->ips_ip_strict_src_multihoming) + static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); static void ire_del_host_redir(ire_t *, char *); static boolean_t ire_find_best_route(struct radix_node *, void *); @@ -104,7 +108,7 @@ * ire_match_args() will dereference ill if MATCH_IRE_ILL * is set. */ - if ((flags & MATCH_IRE_ILL) && (ill == NULL)) + if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL)) return (NULL); bzero(&rdst, sizeof (rdst)); @@ -673,7 +677,8 @@ for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { if (IRE_IS_CONDEMNED(ire)) continue; - if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK)) + ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0); + if (margs->ift_flags & MATCH_IRE_MASK) match_mask = margs->ift_mask; else match_mask = ire->ire_mask; @@ -968,24 +973,112 @@ irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); } + +/* + * ip_select_src_ill() is used by ip_select_route() to find the src_ill + * to be used for source-aware routing table lookup. This function will + * ignore IPIF_UNNUMBERED interface addresses, and will only return a + * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED + * interfaces). 
+ */ +static ill_t * +ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst) +{ + ipif_t *ipif; + ill_t *ill; + boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src); + ipaddr_t v4src; + + if (isv6) { + ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst); + } else { + IN6_V4MAPPED_TO_IPADDR(v6src, v4src); + ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst); + } + if (ipif == NULL) + return (NULL); + ill = ipif->ipif_ill; + ill_refhold(ill); + ipif_refrele(ipif); + return (ill); +} + +/* + * verify that v6src is configured on ill + */ +static boolean_t +ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid) +{ + ipif_t *ipif; + ip_stack_t *ipst; + ipaddr_t v4src; + + if (ill == NULL) + return (B_FALSE); + ipst = ill->ill_ipst; + + if (ill->ill_isv6) { + ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst); + } else { + IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); + ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst); + } + + if (ipif != NULL) { + ipif_refrele(ipif); + return (B_TRUE); + } else { + return (B_FALSE); + } +} + /* * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject * routes this routine sets up a ire_nce_cache as well. The caller needs to * lookup an nce for the multicast case. + * + * When src_multihoming is set to 2 (strict src multihoming) we use the source + * address to select the interface and route. If IP_BOUND_IF etc are + * specified, we require that they specify an interface on which the + * source address is assigned. + * + * When src_multihoming is set to 1 (preferred src aware route + * selection) the unicast lookup prefers a matching source + * (i.e., that the route points out an ill on which the source is assigned), but + * if no such route is found we fallback to not considering the source in the + * route lookup. + * + * We skip the src_multihoming check when the source isn't (yet) set, and + * when IXAF_VERIFY_SOURCE is not set. 
The latter allows RAW sockets to send + * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO + * when secpolicy_net_rawaccess(). */ ire_t * -ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa, - uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) +ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src, + ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, + int *errorp, boolean_t *multirtp) { uint_t match_args; uint_t ire_type; - ill_t *ill; + ill_t *ill = NULL; ire_t *ire; ip_stack_t *ipst = ixa->ixa_ipst; ipaddr_t v4dst; in6_addr_t v6nexthop; iaflags_t ixaflags = ixa->ixa_flags; nce_t *nce; + boolean_t preferred_src_aware = B_FALSE; + boolean_t verify_src; + boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4); + int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst); + + /* + * We only verify that the src has been configured on a selected + * interface if the src is not :: or INADDR_ANY, and if the + * IXAF_VERIFY_SOURCE flag is set. + */ + verify_src = (!V6_OR_V4_INADDR_ANY(v6src) && + (ixa->ixa_flags & IXAF_VERIFY_SOURCE)); match_args = MATCH_IRE_SECATTR; IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); @@ -999,17 +1092,16 @@ * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set */ - if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) : - IN6_IS_ADDR_MULTICAST(v6dst)) { + if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) { /* Pick up the IRE_MULTICAST for the ill */ if (ixa->ixa_multicast_ifindex != 0) { ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, - !(ixaflags & IXAF_IS_IPV4), ipst); + isv6, ipst); } else if (ixaflags & IXAF_SCOPEID_SET) { /* sin6_scope_id takes precedence over ixa_ifindex */ ASSERT(ixa->ixa_scopeid != 0); ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, - !(ixaflags & IXAF_IS_IPV4), ipst); + isv6, ipst); } else if (ixa->ixa_ifindex != 0) { /* * In the ipmp case, the ixa_ifindex is set to @@ -1017,17 +1109,32 @@ * ire_multicast() corresponding to that under_ill. 
*/ ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, - !(ixaflags & IXAF_IS_IPV4), ipst); - } else if (ixaflags & IXAF_IS_IPV4) { + isv6, ipst); + } else if (src_multihoming != 0 && verify_src) { + /* Look up the ill based on the source address */ + ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); + /* + * Since we looked up the ill from the source there + * is no need to verify that the source is on the ill + * below. + */ + verify_src = B_FALSE; + if (ill != NULL && IS_VNI(ill)) { + ill_t *usesrc = ill; + + ill = ill_lookup_usesrc(usesrc); + ill_refrele(usesrc); + } + } else if (!isv6) { ipaddr_t v4setsrc = INADDR_ANY; - ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst, - multirtp, &v4setsrc); + ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, + ipst, multirtp, &v4setsrc); if (setsrcp != NULL) IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); } else { - ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst, - multirtp, setsrcp); + ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, + ipst, multirtp, setsrcp); } if (ill != NULL && IS_VNI(ill)) { ill_refrele(ill); @@ -1037,7 +1144,7 @@ if (errorp != NULL) *errorp = ENXIO; /* Get a hold on the IRE_NOROUTE */ - ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); + ire = ire_reject(ipst, isv6); return (ire); } if (!(ill->ill_flags & ILLF_MULTICAST)) { @@ -1045,7 +1152,21 @@ if (errorp != NULL) *errorp = EHOSTUNREACH; /* Get a hold on the IRE_NOROUTE */ - ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); + ire = ire_reject(ipst, isv6); + return (ire); + } + /* + * If we are doing the strictest src_multihoming, then + * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify + * an interface that is consistent with the source address. 
+ */ + if (verify_src && src_multihoming == 2 && + !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { + if (errorp != NULL) + *errorp = EADDRNOTAVAIL; + ill_refrele(ill); + /* Get a hold on the IRE_NOROUTE */ + ire = ire_reject(ipst, isv6); return (ire); } /* Get a refcnt on the single IRE_MULTICAST per ill */ @@ -1060,16 +1181,17 @@ return (ire); } + /* Now for unicast */ if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { if (ixaflags & IXAF_SCOPEID_SET) { /* sin6_scope_id takes precedence over ixa_ifindex */ ASSERT(ixa->ixa_scopeid != 0); ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, - !(ixaflags & IXAF_IS_IPV4), ipst); + isv6, ipst); } else { ASSERT(ixa->ixa_ifindex != 0); ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, - !(ixaflags & IXAF_IS_IPV4), ipst); + isv6, ipst); } if (ill != NULL && IS_VNI(ill)) { ill_refrele(ill); @@ -1079,9 +1201,12 @@ if (errorp != NULL) *errorp = ENXIO; /* Get a hold on the IRE_NOROUTE */ - ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); + ire = ire_reject(ipst, isv6); return (ire); } + + match_args |= MATCH_IRE_ILL; + /* * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF * so for both of them we need to be able look for an under @@ -1089,8 +1214,38 @@ */ if (IS_UNDER_IPMP(ill)) match_args |= MATCH_IRE_TESTHIDDEN; - } else { - ill = NULL; + + /* + * If we are doing the strictest src_multihoming, then + * we check that IP_BOUND_IF, IP_PKTINFO, etc specify + * an interface that is consistent with the source address. 
+ */ + if (src_multihoming == 2 && + !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { + if (errorp != NULL) + *errorp = EADDRNOTAVAIL; + ill_refrele(ill); + /* Get a hold on the IRE_NOROUTE */ + ire = ire_reject(ipst, isv6); + return (ire); + } + } else if (src_multihoming != 0 && verify_src) { + /* Look up the ill based on the source address */ + ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); + if (ill == NULL) { + char addrbuf[INET6_ADDRSTRLEN]; + + ip3dbg(("%s not a valid src for unicast", + inet_ntop(AF_INET6, &v6src, addrbuf, + sizeof (addrbuf)))); + if (errorp != NULL) + *errorp = EADDRNOTAVAIL; + /* Get a hold on the IRE_NOROUTE */ + ire = ire_reject(ipst, isv6); + return (ire); + } + match_args |= MATCH_IRE_SRC_ILL; + preferred_src_aware = (src_multihoming == 1); } if (ixaflags & IXAF_NEXTHOP_SET) { @@ -1101,7 +1256,6 @@ } ire_type = 0; - /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */ /* * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then @@ -1112,7 +1266,8 @@ ire_type = IRE_ONLINK; } - if (ixaflags & IXAF_IS_IPV4) { +retry: + if (!isv6) { ipaddr_t v4nexthop; ipaddr_t v4setsrc = INADDR_ANY; @@ -1134,12 +1289,24 @@ v4dst, (void *)ire)); } #endif - - if (ill != NULL) + if (ill != NULL) { ill_refrele(ill); - + ill = NULL; + } if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || (ire->ire_type & IRE_MULTICAST)) { + if (preferred_src_aware) { + /* + * "Preferred Source Aware" send mode. If we cannot + * find an ire whose ire_ill had the desired source + * address retry after relaxing the ill matching + * constraint. 
+ */ + ire_refrele(ire); + preferred_src_aware = B_FALSE; + match_args &= ~MATCH_IRE_SRC_ILL; + goto retry; + } /* No ire_nce_cache */ return (ire); } @@ -1169,34 +1336,36 @@ { if (ixa->ixa_flags & IXAF_IS_IPV4) { ipha_t *ipha = (ipha_t *)mp->b_rptr; - in6_addr_t v6dst; + in6_addr_t v6dst, v6src; IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); - return (ip_select_route(&v6dst, ixa, generationp, + return (ip_select_route(&v6dst, v6src, ixa, generationp, NULL, errorp, multirtp)); } else { ip6_t *ip6h = (ip6_t *)mp->b_rptr; - return (ip_select_route(&ip6h->ip6_dst, ixa, generationp, - NULL, errorp, multirtp)); + return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src, + ixa, generationp, NULL, errorp, multirtp)); } } ire_t * -ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp, - ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) +ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa, + uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) { - in6_addr_t v6dst; + in6_addr_t v6dst, v6src; ire_t *ire; in6_addr_t setsrc; ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); + IN6_IPADDR_TO_V4MAPPED(src, &v6src); setsrc = ipv6_all_zeros; - ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp, + ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp, multirtp); if (v4setsrcp != NULL) IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); @@ -1208,8 +1377,6 @@ * the zoneid, ill, and label. Used for the data paths. See also * ire_route_recursive. * - * If ill is set this means we will match it by adding MATCH_IRE_ILL. - * * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never * create an IRE_IF_CLONE. This is used on the receive side when we are not * forwarding. 
@@ -1244,9 +1411,6 @@ if (gwattrp != NULL) ASSERT(*gwattrp == NULL); - if (ill_arg != NULL) - match_args |= MATCH_IRE_ILL; - /* * We iterate up to three times to resolve a route, even though * we have four slots in the array. The extra slot is for an @@ -1257,7 +1421,7 @@ /* ire_ftable_lookup handles round-robin/ECMP */ if (ire == NULL) { ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type, - (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, + (ill != NULL? ill : ill_arg), zoneid, tsl, match_args, xmit_hint, ipst, &generation); } else { /* Caller passed it; extra hold since we will rele */ @@ -1403,6 +1567,10 @@ * recursing. The type match is used by some callers * to exclude certain types (such as IRE_IF_CLONE or * IRE_LOCAL|IRE_LOOPBACK). + * + * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof' + * ire->ire_ill, and we want to find the IRE_INTERFACE for + * ire_ill, so we set ill to the ire_ill; */ match_args &= MATCH_IRE_TYPE; nexthop = ire->ire_gateway_addr;
--- a/usr/src/uts/common/inet/ip/ip_if.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip_if.c Wed Feb 17 22:59:58 2010 -0500 @@ -934,17 +934,15 @@ /* * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. * In the case of ioctl from a conn, there can be only 1 mp - * queued on the ipsq. If an ill is being unplumbed, only messages - * related to this ill are flushed, like M_ERROR or M_HANGUP message. - * ioctls meant for this ill form conn's are not flushed. They will - * be processed during ipsq_exit and will not find the ill and will - * return error. + * queued on the ipsq. If an ill is being unplumbed flush all + * the messages. */ mutex_enter(&ipsq->ipsq_lock); for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; curr = next) { next = curr->b_next; - if (curr->b_queue == wq || curr->b_queue == rq) { + if (connp == NULL || + (curr->b_queue == wq || curr->b_queue == rq)) { /* Unlink the mblk from the pending mp list */ if (prev != NULL) { prev->b_next = curr->b_next; @@ -1201,7 +1199,7 @@ /* * ire_walk routine used to delete every IRE that depends on - * 'ill'. (Always called as writer.) + * 'ill'. (Always called as writer, and may only be called from ire_walk.) * * Note: since the routes added by the kernel are deleted separately, * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE. @@ -1223,8 +1221,23 @@ mutex_exit(&ire->ire_lock); if (nce != NULL) nce_refrele(nce); - if (ire->ire_ill == ill) + if (ire->ire_ill == ill) { + /* + * The existing interface binding for ire must be + * deleted before trying to bind the route to another + * interface. However, since we are using the contents of the + * ire after ire_delete, the caller has to ensure that + * CONDEMNED (deleted) ire's are not removed from the list + * when ire_delete() returns. 
Currently ill_downi() is + * only called as part of ire_walk*() routines, so that + * the irb_refhold() done by ire_walk*() will ensure that + * ire_delete() does not lead to ire_inactive(). + */ + ASSERT(ire->ire_bucket->irb_refcnt > 0); ire_delete(ire); + if (ire->ire_unbound) + ire_rebind(ire); + } } /* Remove IRE_IF_CLONE on this ill */ @@ -5441,6 +5454,7 @@ tsol_gcgrp_t *gcgrp = NULL; boolean_t gcgrp_xtraref = B_FALSE; boolean_t cgtp_broadcast; + boolean_t unbound = B_FALSE; ip1dbg(("ip_rt_add:")); @@ -5765,6 +5779,12 @@ return (ENETUNREACH); } + if (ill == NULL && !(flags & RTF_INDIRECT)) { + unbound = B_TRUE; + if (ipst->ips_ip_strict_src_multihoming > 0) + ill = gw_ire->ire_ill; + } + /* * We create one of three types of IREs as a result of this request * based on the netmask. A netmask of all ones (which is automatically @@ -5863,6 +5883,8 @@ if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) ire->ire_setsrc_addr = src_addr; + ire->ire_unbound = unbound; + /* * POLICY: should we allow an RTF_HOST with address INADDR_ANY? * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? @@ -7601,8 +7623,8 @@ } lifr++; } + rw_exit(&ipst->ips_ill_g_lock); rw_exit(&ipst->ips_ill_g_usesrc_lock); - rw_exit(&ipst->ips_ill_g_lock); ipif_refrele(orig_ipif); mp1->b_wptr = (uchar_t *)lifr; STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); @@ -18478,3 +18500,30 @@ nce_flush(ill, B_TRUE); } } + +/* + * find the first interface that uses usill for its source address. 
+ */ +ill_t * +ill_lookup_usesrc(ill_t *usill) +{ + ip_stack_t *ipst = usill->ill_ipst; + ill_t *ill; + + ASSERT(usill != NULL); + + /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ + rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill; + ill = ill->ill_usesrc_grp_next) { + if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) && + !ILL_IS_CONDEMNED(ill)) { + ill_refhold(ill); + break; + } + } + rw_exit(&ipst->ips_ill_g_lock); + rw_exit(&ipst->ips_ill_g_usesrc_lock); + return (ill); +}
--- a/usr/src/uts/common/inet/ip/ip_input.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip_input.c Wed Feb 17 22:59:58 2010 -0500 @@ -1841,6 +1841,8 @@ ire_t *ire1; mblk_t *mp1; ipha_t *ipha1; + uint_t ira_pktlen = ira->ira_pktlen; + uint16_t ira_ip_hdr_length = ira->ira_ip_hdr_length; irb = ire->ire_bucket; @@ -1883,6 +1885,12 @@ ira->ira_zoneid = ire1->ire_zoneid; ipha1 = (ipha_t *)mp1->b_rptr; ip_fanout_v4(mp1, ipha1, ira); + /* + * IPsec might have modified ira_pktlen and ira_ip_hdr_length + * so we restore them for a potential next iteration + */ + ira->ira_pktlen = ira_pktlen; + ira->ira_ip_hdr_length = ira_ip_hdr_length; } irb_refrele(irb); /* Do the main ire */ @@ -1913,6 +1921,8 @@ zoneid_t zoneid; mblk_t *mp1; ipha_t *ipha1; + uint_t ira_pktlen = ira->ira_pktlen; + uint16_t ira_ip_hdr_length = ira->ira_ip_hdr_length; /* ire_recv_multicast has switched to the upper ill for IPMP */ ASSERT(!IS_UNDER_IPMP(ill)); @@ -1972,6 +1982,12 @@ } ipha1 = (ipha_t *)mp1->b_rptr; ip_fanout_v4(mp1, ipha1, ira); + /* + * IPsec might have modified ira_pktlen and ira_ip_hdr_length + * so we restore them for a potential next iteration + */ + ira->ira_pktlen = ira_pktlen; + ira->ira_ip_hdr_length = ira_ip_hdr_length; } /* Do the main ire */
--- a/usr/src/uts/common/inet/ip/ip_ire.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip_ire.c Wed Feb 17 22:59:58 2010 -0500 @@ -1856,7 +1856,7 @@ ASSERT(ire->ire_ipversion == IPV4_VERSION); ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); - ASSERT((!(match_flags & MATCH_IRE_ILL)) || + ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) || (ill != NULL && !ill->ill_isv6)); /* @@ -1936,7 +1936,7 @@ } } /* - * For exampe, with + * For example, with * route add 11.0.0.0 gw1 -ifp bge0 * route add 11.0.0.0 gw2 -ifp bge1 * this code would differentiate based on @@ -1965,8 +1965,8 @@ } matchit: + ire_ill = ire->ire_ill; if (match_flags & MATCH_IRE_ILL) { - ire_ill = ire->ire_ill; /* * If asked to match an ill, we *must* match @@ -1991,6 +1991,16 @@ return (B_FALSE); } } + if (match_flags & MATCH_IRE_SRC_ILL) { + if (ire_ill == NULL) + return (B_FALSE); + if (!IS_ON_SAME_LAN(ill, ire_ill)) { + if (ire_ill->ill_usesrc_ifindex == 0 || + (ire_ill->ill_usesrc_ifindex != + ill->ill_phyint->phyint_ifindex)) + return (B_FALSE); + } + } if ((ire->ire_addr == (addr & mask)) && ((!(match_flags & MATCH_IRE_GW)) || @@ -3563,3 +3573,60 @@ return (5); return (-1); /* unknown ire_type */ } + +/* + * In the preferred/strict src multihoming modes, unbound routes (i.e., + * ire_t entries with ire_unbound set to B_TRUE) are bound to an interface + * by selecting the first available interface that has an interface route for + * the ire_gateway. If that interface is subsequently brought down, ill_downi() + * will call ire_rebind() so that the unbound route can be bound to some other + * matching interface thereby preserving the intended reachability information + * from the original unbound route. 
+ */ +void +ire_rebind(ire_t *ire) +{ + ire_t *gw_ire, *new_ire; + int match_flags = MATCH_IRE_TYPE; + ill_t *gw_ill; + boolean_t isv6 = (ire->ire_ipversion == IPV6_VERSION); + ip_stack_t *ipst = ire->ire_ipst; + + ASSERT(ire->ire_unbound); +again: + if (isv6) { + gw_ire = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6, 0, 0, + IRE_INTERFACE, NULL, ALL_ZONES, NULL, match_flags, 0, + ipst, NULL); + } else { + gw_ire = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, + IRE_INTERFACE, NULL, ALL_ZONES, NULL, match_flags, 0, + ipst, NULL); + } + if (gw_ire == NULL) { + /* see comments in ip_rt_add[_v6]() for IPMP */ + if (match_flags & MATCH_IRE_TESTHIDDEN) + return; + + match_flags |= MATCH_IRE_TESTHIDDEN; + goto again; + } + gw_ill = gw_ire->ire_ill; + if (isv6) { + new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6, + &ire->ire_gateway_addr_v6, ire->ire_type, gw_ill, + ire->ire_zoneid, ire->ire_flags, NULL, ipst); + } else { + new_ire = ire_create((uchar_t *)&ire->ire_addr, + (uchar_t *)&ire->ire_mask, + (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, gw_ill, + ire->ire_zoneid, ire->ire_flags, NULL, ipst); + } + ire_refrele(gw_ire); + if (new_ire == NULL) + return; + new_ire->ire_unbound = B_TRUE; + new_ire = ire_add(new_ire); + if (new_ire != NULL) + ire_refrele(new_ire); +}
--- a/usr/src/uts/common/inet/ip/ip_mroute.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip_mroute.c Wed Feb 17 22:59:58 2010 -0500 @@ -3113,7 +3113,8 @@ (ptrdiff_t)(vifp - ipst->ips_vifs)); } bzero(&ixas, sizeof (ixas)); - ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE; + ixas.ixa_flags = + IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE; ixas.ixa_ipst = ipst; ixas.ixa_ifindex = 0; ixas.ixa_cred = kcred;
--- a/usr/src/uts/common/inet/ip/ip_ndp.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip_ndp.c Wed Feb 17 22:59:58 2010 -0500 @@ -165,8 +165,8 @@ * the probe is sent on the ncec_ill (in the non-IPMP case) or the * IPMP cast_ill (in the IPMP case). * - * Note that the probe interval is based on ncec->ncec_ill which - * may be the ipmp_ill. + * Note that the probe interval is based on the src_ill for IPv6, and + * the ncec_xmit_interval for IPv4. */ static void nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe) @@ -180,7 +180,7 @@ dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, ncec->ncec_lladdr, ncec->ncec_lladdr_length, &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE); - probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill); + probe_interval = ILL_PROBE_INTERVAL(src_ill); } else { /* IPv4 DAD delay the initial probe. */ if (send_probe) @@ -4464,8 +4464,17 @@ */ ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); IN6_V4MAPPED_TO_IPADDR(addr, addr4); - if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) + if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) { fastprobe = B_TRUE; + } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) && + !IS_IPV4_LL_SPACE(&addr4)) { + ill_t *hwaddr_ill; + + hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr, + hw_addr_len); + if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link) + fastprobe = B_TRUE; + } if (fastprobe) { ncec->ncec_xmit_interval = ipst->ips_arp_fastprobe_interval;
--- a/usr/src/uts/common/inet/ip/ip_output.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ip_output.c Wed Feb 17 22:59:58 2010 -0500 @@ -847,8 +847,8 @@ repeat_ire: error = 0; setsrc = INADDR_ANY; - ire = ip_select_route_v4(firsthop, ixa, NULL, &setsrc, &error, - &multirt); + ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL, + &setsrc, &error, &multirt); ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ if (error != 0) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); @@ -2295,10 +2295,13 @@ * starting at ire1. */ ire_t *ire2; + uint_t match_flags = MATCH_IRE_DSTONLY; + if (ire1->ire_ill != NULL) + match_flags |= MATCH_IRE_ILL; ire2 = ire_route_recursive_impl_v4(ire1, ire1->ire_addr, ire1->ire_type, ire1->ire_ill, - ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY, + ire1->ire_zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); if (ire2 != NULL) ire_refrele(ire2);
--- a/usr/src/uts/common/inet/ip/ipsecesp.c Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip/ipsecesp.c Wed Feb 17 22:59:58 2010 -0500 @@ -2320,7 +2320,7 @@ ixas.ixa_tsl = NULL; ixas.ixa_ipst = ns->netstack_ip; /* No ULP checksum; done by esp_prepare_udp */ - ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_IPSEC; + ixas.ixa_flags = (IXAF_IS_IPV4 | IXAF_NO_IPSEC | IXAF_VERIFY_SOURCE); (void) ip_output_simple(mp, &ixas); ixa_cleanup(&ixas);
--- a/usr/src/uts/common/inet/ip_if.h Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip_if.h Wed Feb 17 22:59:58 2010 -0500 @@ -291,6 +291,7 @@ extern int ipif_arp_down(ipif_t *ipif); extern void ipif_mask_reply(ipif_t *); extern int ipif_up(ipif_t *, queue_t *, mblk_t *); +extern ill_t *ill_lookup_usesrc(ill_t *); extern void ipsq_current_start(ipsq_t *, ipif_t *, int); extern void ipsq_current_finish(ipsq_t *);
--- a/usr/src/uts/common/inet/ip_ire.h Wed Feb 17 19:52:19 2010 -0800 +++ b/usr/src/uts/common/inet/ip_ire.h Wed Feb 17 22:59:58 2010 -0500 @@ -93,6 +93,7 @@ /* zones or shared IREs */ #define MATCH_IRE_SECATTR 0x0040 /* Match gateway security attributes */ #define MATCH_IRE_TESTHIDDEN 0x0080 /* Match ire_testhidden IREs */ +#define MATCH_IRE_SRC_ILL 0x0100 /* ire_ill uses a src address on ill */ #define MAX_IRE_RECURSION 4 /* Max IREs in ire_route_recursive */ @@ -321,12 +322,12 @@ extern ire_t *ip_select_route_pkt(mblk_t *, ip_xmit_attr_t *, uint_t *, int *, boolean_t *); -extern ire_t *ip_select_route(const in6_addr_t *, ip_xmit_attr_t *, - uint_t *, in6_addr_t *, int *, boolean_t *); -extern ire_t *ip_select_route_v4(ipaddr_t, ip_xmit_attr_t *, +extern ire_t *ip_select_route(const in6_addr_t *, const in6_addr_t, + ip_xmit_attr_t *, uint_t *, in6_addr_t *, int *, boolean_t *); +extern ire_t *ip_select_route_v4(ipaddr_t, ipaddr_t, ip_xmit_attr_t *, uint_t *, ipaddr_t *, int *, boolean_t *); -extern ire_t *ip_select_route_v6(const in6_addr_t *, ip_xmit_attr_t *, - uint_t *, in6_addr_t *, int *, boolean_t *); +extern ire_t *ip_select_route_v6(const in6_addr_t *, const in6_addr_t, + ip_xmit_attr_t *, uint_t *, in6_addr_t *, int *, boolean_t *); extern void ire_walk(pfv_t, void *, ip_stack_t *); extern void ire_walk_ill(uint_t, uint_t, pfv_t, void *, ill_t *); @@ -348,6 +349,7 @@ zoneid_t, ip_stack_t *); extern void ire_increment_generation(ire_t *); extern void ire_increment_multicast_generation(ip_stack_t *, boolean_t); +extern void ire_rebind(ire_t *); #endif /* _KERNEL */