changeset 11681:fe992d6ccc26

4173841 Packet goes out with source IP address of another interface 6921533 ioctls could be executed when the thread is not a WRITER on the ipif 6921451 ira_pktlen not computed correctly for ipsec packets 6921615 IPMP need ~5 seconds for traffic passing through when its state transfers from failed
author Sowmini Varadhan <Sowmini.Varadhan@Sun.COM>
date Wed, 17 Feb 2010 22:59:58 -0500
parents f7d6d87905e0
children 6b625d44458f
files usr/src/uts/common/inet/ip.h usr/src/uts/common/inet/ip/icmp.c usr/src/uts/common/inet/ip/ip.c usr/src/uts/common/inet/ip/ip6.c usr/src/uts/common/inet/ip/ip6_if.c usr/src/uts/common/inet/ip/ip6_input.c usr/src/uts/common/inet/ip/ip6_ire.c usr/src/uts/common/inet/ip/ip6_output.c usr/src/uts/common/inet/ip/ip_ftable.c usr/src/uts/common/inet/ip/ip_if.c usr/src/uts/common/inet/ip/ip_input.c usr/src/uts/common/inet/ip/ip_ire.c usr/src/uts/common/inet/ip/ip_mroute.c usr/src/uts/common/inet/ip/ip_ndp.c usr/src/uts/common/inet/ip/ip_output.c usr/src/uts/common/inet/ip/ipsecesp.c usr/src/uts/common/inet/ip_if.h usr/src/uts/common/inet/ip_ire.h
diffstat 18 files changed, 629 insertions(+), 95 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/inet/ip.h	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip.h	Wed Feb 17 22:59:58 2010 -0500
@@ -2242,8 +2242,9 @@
 /*
  * The normal flags for sending packets e.g., icmp errors
  */
-#define	IXAF_BASIC_SIMPLE_V4	(IXAF_SET_ULP_CKSUM | IXAF_IS_IPV4)
-#define	IXAF_BASIC_SIMPLE_V6	(IXAF_SET_ULP_CKSUM)
+#define	IXAF_BASIC_SIMPLE_V4	\
+	(IXAF_SET_ULP_CKSUM | IXAF_IS_IPV4 | IXAF_VERIFY_SOURCE)
+#define	IXAF_BASIC_SIMPLE_V6	(IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE)
 
 /*
  * Normally these fields do not have a hold. But in some cases they do, for
@@ -2677,6 +2678,12 @@
 	boolean_t	ire_trace_disable;	/* True when alloc fails */
 	ip_stack_t	*ire_ipst;	/* Does not have a netstack_hold */
 	iulp_t		ire_metrics;
+	/*
+	 * default and prefix routes that are added without explicitly
+	 * specifying the interface are termed "unbound" routes, and will
+	 * have ire_unbound set to true.
+	 */
+	boolean_t	ire_unbound;
 };
 
 /* IPv4 compatibility macros */
@@ -3005,6 +3012,8 @@
 #define	ips_ipv6_icmp_return_pmtu	ips_param_arr[73].ip_param_value
 #define	ips_ip_arp_publish_count	ips_param_arr[74].ip_param_value
 #define	ips_ip_arp_publish_interval	ips_param_arr[75].ip_param_value
+#define	ips_ip_strict_src_multihoming	ips_param_arr[76].ip_param_value
+#define	ips_ipv6_strict_src_multihoming	ips_param_arr[77].ip_param_value
 
 extern int	dohwcksum;	/* use h/w cksum if supported by the h/w */
 #ifdef ZC_TEST
--- a/usr/src/uts/common/inet/ip/icmp.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/icmp.c	Wed Feb 17 22:59:58 2010 -0500
@@ -3103,6 +3103,16 @@
 	/* Even for multicast and broadcast we honor the apps ttl */
 	ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
 
+	/*
+	 * No source verification for non-local addresses
+	 */
+	if (ipha->ipha_src != INADDR_ANY &&
+	    ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
+	    is->is_netstack->netstack_ip, B_FALSE)
+	    != IPVL_UNICAST_UP) {
+		ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
+	}
+
 	if (ipha->ipha_dst == INADDR_ANY)
 		ipha->ipha_dst = htonl(INADDR_LOOPBACK);
 
@@ -3468,6 +3478,26 @@
 				v6src = ipp->ipp_addr;
 		}
 	}
+	/*
+	 * Allow source not assigned to the system
+	 * only if it is not a local address
+	 */
+	if (!V6_OR_V4_INADDR_ANY(v6src)) {
+		ip_laddr_t laddr_type;
+
+		if (ixa->ixa_flags & IXAF_IS_IPV4) {
+			ipaddr_t v4src;
+
+			IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
+			laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid,
+			    is->is_netstack->netstack_ip, B_FALSE);
+		} else {
+			laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid,
+			    is->is_netstack->netstack_ip, B_FALSE, B_FALSE);
+		}
+		if (laddr_type != IPVL_UNICAST_UP)
+			ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
+	}
 
 	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
@@ -3562,8 +3592,6 @@
 	/* We're done.  Pass the packet to ip. */
 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
 
-	/* Allow source not assigned to the system? */
-	ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
 	error = conn_ip_output(mp, ixa);
 	if (!connp->conn_unspec_src)
 		ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
--- a/usr/src/uts/common/inet/ip/ip.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip.c	Wed Feb 17 22:59:58 2010 -0500
@@ -826,6 +826,10 @@
 	{  0,	99999,	100,	"ip_icmp_err_interval" },
 	{  1,	99999,	10,	"ip_icmp_err_burst" },
 	{  0,	999999999,	1000000, "ip_reass_queue_bytes" },
+	/*
+	 * See comments for ip_strict_src_multihoming for an explanation
+	 * of the semantics of ip_strict_dst_multihoming
+	 */
 	{  0,	1,	0,	"ip_strict_dst_multihoming" },
 	{  1,	MAX_ADDRS_PER_IF,	256,	"ip_addrs_per_if"},
 	{  0,	1,	0,	"ipsec_override_persocket_policy" },
@@ -841,6 +845,10 @@
 	{  0,	1,	1,	"ip6_respond_to_echo_multicast"},
 	{  0,	1,	1,	"ip6_send_redirects"},
 	{  0,	1,	0,	"ip6_ignore_redirect" },
+	/*
+	 * See comments for ip6_strict_src_multihoming for an explanation
+	 * of the semantics of ip6_strict_dst_multihoming
+	 */
 	{  0,	1,	0,	"ip6_strict_dst_multihoming" },
 
 	{  0,	2,	2,	"ip_src_check" },
@@ -907,7 +915,48 @@
 	 * for IPv4, IPv6.
 	 */
 	{  1,	20,	5,	"ip_arp_publish_count" },
-	{  1000, 20000,	2000,	"ip_arp_publish_interval" },
+	{  1000, 20000, 2000,   "ip_arp_publish_interval" },
+	/*
+	 * The ip*strict_src_multihoming and ip*strict_dst_multihoming provide
+	 * a range of choices for setting strong/weak/preferred end-system
+	 * behavior. The semantics for setting these are:
+	 *
+	 * ip*_strict_dst_multihoming = 0
+	 *    weak end system model for managing ip destination addresses.
+	 *    A packet with IP dst D1 that's received on interface I1 will be
+	 *    accepted as long as D1 is one of the local addresses on
+	 *    the machine, even if D1 is not configured on I1.
+	 * ip*_strict_dst_multihoming = 1
+	 *    strong end system model for managing ip destination addresses.
+	 *    A packet with IP dst D1 that's received on interface I1 will be
+	 *    accepted if, and only if, D1 is configured on I1.
+	 *
+	 * ip*strict_src_multihoming = 0
+	 *    Source agnostic route selection for outgoing packets: the
+	 *    outgoing interface for a packet will be computed using
+	 *    default algorithms for route selection, where the route
+	 *    with the longest matching prefix is chosen for the output
+	 *    unless other route selection constraints are explicitly
+	 *    specified during routing table lookup.  This may result
+	 *    in packet being sent out on interface I2 with source
+	 *    address S1, even though S1 is not a configured address on I2.
+	 * ip*strict_src_multihoming = 1
+	 *    Preferred source aware route selection for outgoing packets: for
+	 *    a packet with source S2, destination D2, the route selection
+	 *    algorithm will first attempt to find a route for the destination
+	 *    that goes out through an interface where S2 is
+	 *    configured. If such a route cannot be found, then the
+	 *    best-matching route for D2 will be selected.
+	 * ip*strict_src_multihoming = 2
+	 *    Source aware route selection for outgoing packets: a packet will
+	 *    be sent out on an interface I2 only if the src address S2 of the
+	 *    packet is a configured address on I2. In conjunction with
+	 *    the setting 'ip_strict_dst_multihoming == 1', this will result in
+	 *    the implementation of Strong ES as defined in Section 3.3.4.2 of
+	 *    RFC 1122
+	 */
+	{  0,	2,	0,	"ip_strict_src_multihoming" },
+	{  0,	2,	0,	"ip6_strict_src_multihoming" }
 };
 
 /*
@@ -3562,8 +3611,8 @@
 	 * a "hidden" route (i.e., going through a specific under_ill)
 	 * if ixa_ifindex has been specified.
 	 */
-	ire = ip_select_route_v4(firsthop, ixa, &generation, &setsrc, &error,
-	    &multirt);
+	ire = ip_select_route_v4(firsthop, *src_addrp, ixa,
+	    &generation, &setsrc, &error, &multirt);
 	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
 	if (error != 0)
 		goto bad_addr;
@@ -6773,6 +6822,85 @@
 	return (B_TRUE);
 }
 
+/*
+ * When the src multihoming is changed from weak to [strong, preferred],
+ * ip_ire_rebind_walker is called to walk the list of all ire_t entries
+ * and identify routes that were created by user-applications in the
+ * unbound state (i.e., without RTA_IFP), and for which an ire_ill is not
+ * currently defined. These routes are then 'rebound', i.e., their ire_ill
+ * is selected by finding an interface route for the gateway.
+ */
+/* ARGSUSED */
+static void
+ip_ire_rebind_walker(ire_t *ire, void *notused)
+{
+	if (!ire->ire_unbound || ire->ire_ill != NULL)
+		return;		/* not an unbound route, or already bound */
+	ire_rebind(ire);
+	ire_delete(ire);	/* remove the entry we were handed */
+}
+
+/*
+ * When the src multihoming is changed from [strong, preferred] to weak,
+ * ip_ire_unbind_walker is called to walk the list of all ire_t entries, and
+ * set any entries that were created by user-applications in the unbound state
+ * (i.e., without RTA_IFP) back to having a NULL ire_ill.
+ */
+/* ARGSUSED */
+static void
+ip_ire_unbind_walker(ire_t *ire, void *notused)
+{
+	ire_t *new_ire;
+
+	if (!ire->ire_unbound || ire->ire_ill == NULL)
+		return;		/* not an unbound route, or has no ill */
+	if (ire->ire_ipversion == IPV6_VERSION) {
+		new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
+		    &ire->ire_gateway_addr_v6, ire->ire_type, NULL,
+		    ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
+	} else {
+		new_ire = ire_create((uchar_t *)&ire->ire_addr,
+		    (uchar_t *)&ire->ire_mask,
+		    (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, NULL,
+		    ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
+	}
+	if (new_ire == NULL)
+		return;		/* no replacement; leave ire untouched */
+	new_ire->ire_unbound = B_TRUE;
+	/*
+	 * The bound ire must first be deleted so that we don't return
+	 * the existing one on the attempt to add the unbound new_ire.
+	 */
+	ire_delete(ire);
+	new_ire = ire_add(new_ire);
+	if (new_ire != NULL)
+		ire_refrele(new_ire);
+}
+
+/*
+ * When the settings of ip*_strict_src_multihoming tunables are changed,
+ * all cached routes need to be recomputed. This recomputation needs to be
+ * done when going from weaker to stronger modes so that the cached ire
+ * for the connection does not violate the current ip*_strict_src_multihoming
+ * setting. It also needs to be done when going from stronger to weaker modes,
+ * so that we fall back to matching on the longest-matching-route (as opposed
+ * to a shorter match that may have been selected in the strong mode
+ * to satisfy src_multihoming settings).
+ *
+ * The cached ixa_ire entries for all conn_t entries are marked as
+ * "verify" so that they will be recomputed for the next packet.
+ */
+static void
+conn_ire_revalidate(conn_t *connp, void *arg)
+{
+	boolean_t isv6 = (boolean_t)arg;	/* B_TRUE: touch IPv6 conns */
+
+	if ((isv6 && connp->conn_ipversion != IPV6_VERSION) ||
+	    (!isv6 && connp->conn_ipversion != IPV4_VERSION))
+		return;		/* conn is of the other IP version */
+	connp->conn_ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
+}
+
 /* Named Dispatch routine to negotiate a new value for one of our parameters. */
 /* ARGSUSED */
 static int
@@ -6780,12 +6908,35 @@
 {
 	long		new_value;
 	ipparam_t	*ippa = (ipparam_t *)cp;
-
+	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
+	int		strict_src4, strict_src6;
+
+	strict_src4 = ipst->ips_ip_strict_src_multihoming;
+	strict_src6 = ipst->ips_ipv6_strict_src_multihoming;
 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
 	    new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) {
 		return (EINVAL);
 	}
 	ippa->ip_param_value = new_value;
+	if (ipst->ips_ip_strict_src_multihoming != strict_src4) {
+		if (strict_src4 == 0) {		/* was weak: bind routes */
+			ire_walk_v4(ip_ire_rebind_walker, NULL, ALL_ZONES,
+			    ipst);
+		} else if (ipst->ips_ip_strict_src_multihoming == 0) {
+			ire_walk_v4(ip_ire_unbind_walker, NULL, ALL_ZONES,
+			    ipst);	/* back to weak: unbind */
+		}
+		ipcl_walk(conn_ire_revalidate, (void *)B_FALSE, ipst);
+	} else if (ipst->ips_ipv6_strict_src_multihoming != strict_src6) {
+		if (strict_src6 == 0) {		/* was weak: bind routes */
+			ire_walk_v6(ip_ire_rebind_walker, NULL, ALL_ZONES,
+			    ipst);
+		} else if (ipst->ips_ipv6_strict_src_multihoming == 0) {
+			ire_walk_v6(ip_ire_unbind_walker, NULL, ALL_ZONES,
+			    ipst);	/* back to weak: unbind */
+		}
+		ipcl_walk(conn_ire_revalidate, (void *)B_TRUE, ipst);
+	}
 	return (0);
 }
 
--- a/usr/src/uts/common/inet/ip/ip6.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip6.c	Wed Feb 17 22:59:58 2010 -0500
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
@@ -2004,8 +2004,8 @@
 	 * a "hidden" route (i.e., going through a specific under_ill)
 	 * if ixa_ifindex has been specified.
 	 */
-	ire = ip_select_route_v6(firsthop, ixa, &generation, &setsrc, &error,
-	    &multirt);
+	ire = ip_select_route_v6(firsthop, *src_addrp, ixa, &generation,
+	    &setsrc, &error, &multirt);
 	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
 	if (error != 0)
 		goto bad_addr;
--- a/usr/src/uts/common/inet/ip/ip6_if.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip6_if.c	Wed Feb 17 22:59:58 2010 -0500
@@ -405,6 +405,7 @@
 	tsol_gc_t *gc = NULL;
 	tsol_gcgrp_t *gcgrp = NULL;
 	boolean_t gcgrp_xtraref = B_FALSE;
+	boolean_t unbound = B_FALSE;
 
 	if (ire_arg != NULL)
 		*ire_arg = NULL;
@@ -724,6 +725,11 @@
 			ipif_refrele(ipif);
 		return (ENETUNREACH);
 	}
+	if (ill == NULL && !(flags & RTF_INDIRECT)) {
+		unbound = B_TRUE;
+		if (ipst->ips_ipv6_strict_src_multihoming > 0)
+			ill = gw_ire->ire_ill;
+	}
 
 	/*
 	 * We create one of three types of IREs as a result of this request
@@ -819,6 +825,8 @@
 	if ((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr))
 		ire->ire_setsrc_addr_v6 = *src_addr;
 
+	ire->ire_unbound = unbound;
+
 	/*
 	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
 	 * SUN/OS socket stuff does but do we really want to allow ::0 ?
--- a/usr/src/uts/common/inet/ip/ip6_input.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip6_input.c	Wed Feb 17 22:59:58 2010 -0500
@@ -1539,6 +1539,8 @@
 	zoneid_t	zoneid;
 	mblk_t		*mp1;
 	ip6_t		*ip6h1;
+	uint_t		ira_pktlen = ira->ira_pktlen;
+	uint16_t	ira_ip_hdr_length = ira->ira_ip_hdr_length;
 
 	/* ire_recv_multicast has switched to the upper ill for IPMP */
 	ASSERT(!IS_UNDER_IPMP(ill));
@@ -1598,6 +1600,12 @@
 		}
 		ip6h1 = (ip6_t *)mp1->b_rptr;
 		ip_fanout_v6(mp1, ip6h1, ira);
+		/*
+		 * IPsec might have modified ira_pktlen and ira_ip_hdr_length
+		 * so we restore them for a potential next iteration
+		 */
+		ira->ira_pktlen = ira_pktlen;
+		ira->ira_ip_hdr_length = ira_ip_hdr_length;
 	}
 
 	/* Do the main ire */
--- a/usr/src/uts/common/inet/ip/ip6_ire.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip6_ire.c	Wed Feb 17 22:59:58 2010 -0500
@@ -690,7 +690,7 @@
 	ASSERT(addr != NULL);
 	ASSERT(mask != NULL);
 	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
-	ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
+	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
 	    (ill != NULL && ill->ill_isv6));
 
 	/*
@@ -771,7 +771,7 @@
 			}
 		}
 		/*
-		 * For exampe, with
+		 * For example, with
 		 * route add 11.0.0.0 gw1 -ifp bge0
 		 * route add 11.0.0.0 gw2 -ifp bge1
 		 * this code would differentiate based on
@@ -799,13 +799,13 @@
 	}
 
 matchit:
+	ire_ill = ire->ire_ill;
 	if (match_flags & MATCH_IRE_GW) {
 		mutex_enter(&ire->ire_lock);
 		gw_addr_v6 = ire->ire_gateway_addr_v6;
 		mutex_exit(&ire->ire_lock);
 	}
 	if (match_flags & MATCH_IRE_ILL) {
-		ire_ill = ire->ire_ill;
 
 		/*
 		 * If asked to match an ill, we *must* match
@@ -830,6 +830,17 @@
 				return (B_FALSE);
 		}
 	}
+	if (match_flags & MATCH_IRE_SRC_ILL) {
+		if (ire_ill == NULL)
+			return (B_FALSE);
+		if (!IS_ON_SAME_LAN(ill, ire_ill)) {
+			if (ire_ill->ill_usesrc_ifindex == 0 ||
+			    (ire_ill->ill_usesrc_ifindex !=
+			    ill->ill_phyint->phyint_ifindex))
+				return (B_FALSE);
+		}
+	}
+
 	/* No ire_addr_v6 bits set past the mask */
 	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
 	    ire->ire_addr_v6));
@@ -910,9 +921,9 @@
 
 	/*
 	 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
-	 * is set.
+	 * or MATCH_IRE_SRC_ILL is set.
 	 */
-	if ((flags & (MATCH_IRE_ILL)) && (ill == NULL))
+	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
 		return (NULL);
 
 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
@@ -1113,12 +1124,13 @@
 }
 
 ire_t *
-ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa,
-    uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
+ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src,
+    ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
+    int *errorp, boolean_t *multirtp)
 {
 	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
 
-	return (ip_select_route(dst, ixa, generationp, setsrcp, errorp,
+	return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp,
 	    multirtp));
 }
 
@@ -1127,8 +1139,6 @@
  * the zoneid, ill, and label. Used for the data paths. See also
  * ire_route_recursive_dstonly.
  *
- * If ill is set this means we will match it by adding MATCH_IRE_ILL.
- *
  * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
  * create an IRE_IF_CLONE. This is used on the receive side when we are not
  * forwarding.
@@ -1164,9 +1174,6 @@
 	if (gwattrp != NULL)
 		ASSERT(*gwattrp == NULL);
 
-	if (ill_arg != NULL)
-		match_args |= MATCH_IRE_ILL;
-
 	/*
 	 * We iterate up to three times to resolve a route, even though
 	 * we have four slots in the array. The extra slot is for an
@@ -1177,7 +1184,7 @@
 		/* ire_ftable_lookup handles round-robin/ECMP */
 		if (ire == NULL) {
 			ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
-			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
+			    (ill != NULL ? ill : ill_arg), zoneid, tsl,
 			    match_args, xmit_hint, ipst, &generation);
 		} else {
 			/* Caller passed it; extra hold since we will rele */
@@ -1322,6 +1329,10 @@
 		 * recursing. The type match is used by some callers
 		 * to exclude certain types (such as IRE_IF_CLONE or
 		 * IRE_LOCAL|IRE_LOOPBACK).
+		 *
+		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
+		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
+		 * ire_ill, so we set ill to the ire_ill
 		 */
 		match_args &= MATCH_IRE_TYPE;
 		v6nexthop = ire->ire_gateway_addr_v6;
--- a/usr/src/uts/common/inet/ip/ip6_output.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip6_output.c	Wed Feb 17 22:59:58 2010 -0500
@@ -150,8 +150,8 @@
 repeat_ire:
 	error = 0;
 	setsrc = ipv6_all_zeros;
-	ire = ip_select_route_v6(&firsthop, ixa, NULL, &setsrc, &error,
-	    &multirt);
+	ire = ip_select_route_v6(&firsthop, ip6h->ip6_src, ixa, NULL, &setsrc,
+	    &error, &multirt);
 	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
 	if (error != 0) {
 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
@@ -1228,10 +1228,13 @@
 			 * starting at ire1.
 			 */
 			ire_t *ire2;
+			uint_t match_flags = MATCH_IRE_DSTONLY;
 
+			if (ire1->ire_ill != NULL)
+				match_flags |= MATCH_IRE_ILL;
 			ire2 = ire_route_recursive_impl_v6(ire1,
 			    &ire1->ire_addr_v6, ire1->ire_type, ire1->ire_ill,
-			    ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY,
+			    ire1->ire_zoneid, NULL, match_flags,
 			    IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
 			if (ire2 != NULL)
 				ire_refrele(ire2);
--- a/usr/src/uts/common/inet/ip/ip_ftable.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip_ftable.c	Wed Feb 17 22:59:58 2010 -0500
@@ -77,6 +77,10 @@
 	(((ire)->ire_type & IRE_DEFAULT) || \
 	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))
 
+#define	IP_SRC_MULTIHOMING(isv6, ipst) 			\
+	(isv6 ? ipst->ips_ipv6_strict_src_multihoming :	\
+	ipst->ips_ip_strict_src_multihoming)
+
 static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
 static void	ire_del_host_redir(ire_t *, char *);
 static boolean_t ire_find_best_route(struct radix_node *, void *);
@@ -104,7 +108,7 @@
 	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
 	 * is set.
 	 */
-	if ((flags & MATCH_IRE_ILL) && (ill == NULL))
+	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
 		return (NULL);
 
 	bzero(&rdst, sizeof (rdst));
@@ -673,7 +677,8 @@
 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
 		if (IRE_IS_CONDEMNED(ire))
 			continue;
-		if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK))
+		ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
+		if (margs->ift_flags & MATCH_IRE_MASK)
 			match_mask = margs->ift_mask;
 		else
 			match_mask = ire->ire_mask;
@@ -968,24 +973,112 @@
 		irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
 }
 
+
+/*
+ * ip_select_src_ill() is used by ip_select_route() to find the src_ill
+ * to be used for source-aware routing table lookup. This function will
+ * ignore IPIF_UNNUMBERED interface addresses, and will only return a
+ * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
+ * interfaces).
+ */
+static ill_t *
+ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
+{
+	ipif_t *ipif;
+	ill_t *ill;
+	boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src);
+	ipaddr_t v4src;
+
+	if (isv6) {
+		ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
+	} else {
+		IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
+		ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
+	}
+	if (ipif == NULL)
+		return (NULL);	/* lookup found no matching ipif */
+	ill = ipif->ipif_ill;
+	ill_refhold(ill);	/* returned held; caller must ill_refrele() */
+	ipif_refrele(ipif);
+	return (ill);
+}
+
+/*
+ * Verify that v6src is configured on ill; B_FALSE if ill is NULL.
+ */
+static boolean_t
+ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
+{
+	ipif_t *ipif;
+	ip_stack_t *ipst;
+	ipaddr_t v4src;
+
+	if (ill == NULL)
+		return (B_FALSE);
+	ipst = ill->ill_ipst;
+
+	if (ill->ill_isv6) {
+		ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
+	} else {
+		IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
+		ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
+	}
+
+	if (ipif != NULL) {
+		ipif_refrele(ipif);	/* only existence matters here */
+		return (B_TRUE);
+	} else {
+		return (B_FALSE);
+	}
+}
+
 /*
  * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
  * routes this routine sets up a ire_nce_cache as well. The caller needs to
  * lookup an nce for the multicast case.
+ *
+ * When src_multihoming is set to 2 (strict src multihoming) we use the source
+ * address to select the interface and route. If IP_BOUND_IF etc are
+ * specified, we require that they specify an interface on which the
+ * source address is assigned.
+ *
+ * When src_multihoming is set to 1 (preferred src aware route
+ * selection)  the unicast lookup prefers a matching source
+ * (i.e., that the route points out an ill on which the source is assigned), but
+ * if no such route is found we fallback to not considering the source in the
+ * route lookup.
+ *
+ * We skip the src_multihoming check when the source isn't (yet) set, and
+ * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
+ * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
+ * when secpolicy_net_rawaccess().
  */
 ire_t *
-ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa,
-    uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
+ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
+    ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
+    int *errorp, boolean_t *multirtp)
 {
 	uint_t		match_args;
 	uint_t		ire_type;
-	ill_t		*ill;
+	ill_t		*ill = NULL;
 	ire_t		*ire;
 	ip_stack_t	*ipst = ixa->ixa_ipst;
 	ipaddr_t	v4dst;
 	in6_addr_t	v6nexthop;
 	iaflags_t	ixaflags = ixa->ixa_flags;
 	nce_t		*nce;
+	boolean_t	preferred_src_aware = B_FALSE;
+	boolean_t	verify_src;
+	boolean_t	isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
+	int		src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);
+
+	/*
+	 * We only verify that the src has been configured on a selected
+	 * interface if the src is not :: or INADDR_ANY, and if the
+	 * IXAF_VERIFY_SOURCE flag is set.
+	 */
+	verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
+	    (ixa->ixa_flags & IXAF_VERIFY_SOURCE));
 
 	match_args = MATCH_IRE_SECATTR;
 	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
@@ -999,17 +1092,16 @@
 	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
 	 */
 
-	if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) :
-	    IN6_IS_ADDR_MULTICAST(v6dst)) {
+	if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
 		/* Pick up the IRE_MULTICAST for the ill */
 		if (ixa->ixa_multicast_ifindex != 0) {
 			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
-			    !(ixaflags & IXAF_IS_IPV4), ipst);
+			    isv6, ipst);
 		} else if (ixaflags & IXAF_SCOPEID_SET) {
 			/* sin6_scope_id takes precedence over ixa_ifindex */
 			ASSERT(ixa->ixa_scopeid != 0);
 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
-			    !(ixaflags & IXAF_IS_IPV4), ipst);
+			    isv6, ipst);
 		} else if (ixa->ixa_ifindex != 0) {
 			/*
 			 * In the ipmp case, the ixa_ifindex is set to
@@ -1017,17 +1109,32 @@
 			 * ire_multicast() corresponding to that under_ill.
 			 */
 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
-			    !(ixaflags & IXAF_IS_IPV4), ipst);
-		} else if (ixaflags & IXAF_IS_IPV4) {
+			    isv6, ipst);
+		} else if (src_multihoming != 0 && verify_src) {
+			/* Look up the ill based on the source address */
+			ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
+			/*
+			 * Since we looked up the ill from the source there
+			 * is no need to verify that the source is on the ill
+			 * below.
+			 */
+			verify_src = B_FALSE;
+			if (ill != NULL && IS_VNI(ill)) {
+				ill_t *usesrc = ill;
+
+				ill = ill_lookup_usesrc(usesrc);
+				ill_refrele(usesrc);
+			}
+		} else if (!isv6) {
 			ipaddr_t	v4setsrc = INADDR_ANY;
 
-			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst,
-			    multirtp, &v4setsrc);
+			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
+			    ipst, multirtp, &v4setsrc);
 			if (setsrcp != NULL)
 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
 		} else {
-			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst,
-			    multirtp, setsrcp);
+			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
+			    ipst, multirtp, setsrcp);
 		}
 		if (ill != NULL && IS_VNI(ill)) {
 			ill_refrele(ill);
@@ -1037,7 +1144,7 @@
 			if (errorp != NULL)
 				*errorp = ENXIO;
 			/* Get a hold on the IRE_NOROUTE */
-			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
+			ire = ire_reject(ipst, isv6);
 			return (ire);
 		}
 		if (!(ill->ill_flags & ILLF_MULTICAST)) {
@@ -1045,7 +1152,21 @@
 			if (errorp != NULL)
 				*errorp = EHOSTUNREACH;
 			/* Get a hold on the IRE_NOROUTE */
-			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
+			ire = ire_reject(ipst, isv6);
+			return (ire);
+		}
+		/*
+		 * If we are doing the strictest src_multihoming, then
+		 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
+		 * an interface that is consistent with the source address.
+		 */
+		if (verify_src && src_multihoming == 2 &&
+		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
+			if (errorp != NULL)
+				*errorp = EADDRNOTAVAIL;
+			ill_refrele(ill);
+			/* Get a hold on the IRE_NOROUTE */
+			ire = ire_reject(ipst, isv6);
 			return (ire);
 		}
 		/* Get a refcnt on the single IRE_MULTICAST per ill */
@@ -1060,16 +1181,17 @@
 		return (ire);
 	}
 
+	/* Now for unicast */
 	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
 		if (ixaflags & IXAF_SCOPEID_SET) {
 			/* sin6_scope_id takes precedence over ixa_ifindex */
 			ASSERT(ixa->ixa_scopeid != 0);
 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
-			    !(ixaflags & IXAF_IS_IPV4), ipst);
+			    isv6, ipst);
 		} else {
 			ASSERT(ixa->ixa_ifindex != 0);
 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
-			    !(ixaflags & IXAF_IS_IPV4), ipst);
+			    isv6, ipst);
 		}
 		if (ill != NULL && IS_VNI(ill)) {
 			ill_refrele(ill);
@@ -1079,9 +1201,12 @@
 			if (errorp != NULL)
 				*errorp = ENXIO;
 			/* Get a hold on the IRE_NOROUTE */
-			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
+			ire = ire_reject(ipst, isv6);
 			return (ire);
 		}
+
+		match_args |= MATCH_IRE_ILL;
+
 		/*
 		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
 		 * so for both of them we need to be able look for an under
@@ -1089,8 +1214,38 @@
 		 */
 		if (IS_UNDER_IPMP(ill))
 			match_args |= MATCH_IRE_TESTHIDDEN;
-	} else {
-		ill = NULL;
+
+		/*
+		 * If we are doing the strictest src_multihoming, then check
+		 * that IP_BOUND_IF, IP_PKTINFO, etc specify an interface that
+		 * is consistent with the source address (when verify_src).
+		 */
+		if (verify_src && src_multihoming == 2 &&
+		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
+			if (errorp != NULL)
+				*errorp = EADDRNOTAVAIL;
+			ill_refrele(ill);
+			/* Get a hold on the IRE_NOROUTE */
+			ire = ire_reject(ipst, isv6);
+			return (ire);
+		}
+	} else if (src_multihoming != 0 && verify_src) {
+		/* Look up the ill based on the source address */
+		ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
+		if (ill == NULL) {
+			char addrbuf[INET6_ADDRSTRLEN];
+
+			ip3dbg(("%s not a valid src for unicast",
+			    inet_ntop(AF_INET6, &v6src, addrbuf,
+			    sizeof (addrbuf))));
+			if (errorp != NULL)
+				*errorp = EADDRNOTAVAIL;
+			/* Get a hold on the IRE_NOROUTE */
+			ire = ire_reject(ipst, isv6);
+			return (ire);
+		}
+		match_args |= MATCH_IRE_SRC_ILL;
+		preferred_src_aware = (src_multihoming == 1);
 	}
 
 	if (ixaflags & IXAF_NEXTHOP_SET) {
@@ -1101,7 +1256,6 @@
 	}
 
 	ire_type = 0;
-	/* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */
 
 	/*
 	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
@@ -1112,7 +1266,8 @@
 		ire_type = IRE_ONLINK;
 	}
 
-	if (ixaflags & IXAF_IS_IPV4) {
+retry:
+	if (!isv6) {
 		ipaddr_t	v4nexthop;
 		ipaddr_t	v4setsrc = INADDR_ANY;
 
@@ -1134,12 +1289,24 @@
 		    v4dst, (void *)ire));
 	}
 #endif
-
-	if (ill != NULL)
+	if (ill != NULL) {
 		ill_refrele(ill);
-
+		ill = NULL;
+	}
 	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
 	    (ire->ire_type & IRE_MULTICAST)) {
+		if (preferred_src_aware) {
+			/*
+			 * "Preferred Source Aware" send mode. If we cannot
+			 * find an ire whose ire_ill had the desired source
+			 * address retry after relaxing the ill matching
+			 * constraint.
+			 */
+			ire_refrele(ire);
+			preferred_src_aware = B_FALSE;
+			match_args &= ~MATCH_IRE_SRC_ILL;
+			goto retry;
+		}
 		/* No ire_nce_cache */
 		return (ire);
 	}
@@ -1169,34 +1336,36 @@
 {
 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
-		in6_addr_t	v6dst;
+		in6_addr_t	v6dst, v6src;
 
 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
+		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
 
-		return (ip_select_route(&v6dst, ixa, generationp,
+		return (ip_select_route(&v6dst, v6src, ixa, generationp,
 		    NULL, errorp, multirtp));
 	} else {
 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
 
-		return (ip_select_route(&ip6h->ip6_dst, ixa, generationp,
-		    NULL, errorp, multirtp));
+		return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
+		    ixa, generationp, NULL, errorp, multirtp));
 	}
 }
 
 ire_t *
-ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp,
-    ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
+ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
+    uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
 {
-	in6_addr_t	v6dst;
+	in6_addr_t	v6dst, v6src;
 	ire_t		*ire;
 	in6_addr_t	setsrc;
 
 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
 
 	IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
+	IN6_IPADDR_TO_V4MAPPED(src, &v6src);
 
 	setsrc = ipv6_all_zeros;
-	ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp,
+	ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
 	    multirtp);
 	if (v4setsrcp != NULL)
 		IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
@@ -1208,8 +1377,6 @@
  * the zoneid, ill, and label. Used for the data paths. See also
  * ire_route_recursive.
  *
- * If ill is set this means we will match it by adding MATCH_IRE_ILL.
- *
  * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
  * create an IRE_IF_CLONE. This is used on the receive side when we are not
  * forwarding.
@@ -1244,9 +1411,6 @@
 	if (gwattrp != NULL)
 		ASSERT(*gwattrp == NULL);
 
-	if (ill_arg != NULL)
-		match_args |= MATCH_IRE_ILL;
-
 	/*
 	 * We iterate up to three times to resolve a route, even though
 	 * we have four slots in the array. The extra slot is for an
@@ -1257,7 +1421,7 @@
 		/* ire_ftable_lookup handles round-robin/ECMP */
 		if (ire == NULL) {
 			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
-			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
+			    (ill != NULL? ill : ill_arg), zoneid, tsl,
 			    match_args, xmit_hint, ipst, &generation);
 		} else {
 			/* Caller passed it; extra hold since we will rele */
@@ -1403,6 +1567,10 @@
 		 * recursing. The type match is used by some callers
 		 * to exclude certain types (such as IRE_IF_CLONE or
 		 * IRE_LOCAL|IRE_LOOPBACK).
+		 *
+		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
+		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
+		 * ire_ill, so we set ill to the ire_ill.
 		 */
 		match_args &= MATCH_IRE_TYPE;
 		nexthop = ire->ire_gateway_addr;
--- a/usr/src/uts/common/inet/ip/ip_if.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip_if.c	Wed Feb 17 22:59:58 2010 -0500
@@ -934,17 +934,15 @@
 	/*
 	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
 	 * In the case of ioctl from a conn, there can be only 1 mp
-	 * queued on the ipsq. If an ill is being unplumbed, only messages
-	 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
-	 * ioctls meant for this ill form conn's are not flushed. They will
-	 * be processed during ipsq_exit and will not find the ill and will
-	 * return error.
+	 * queued on the ipsq. If an ill is being unplumbed flush all
+	 * the messages.
 	 */
 	mutex_enter(&ipsq->ipsq_lock);
 	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
 	    curr = next) {
 		next = curr->b_next;
-		if (curr->b_queue == wq || curr->b_queue == rq) {
+		if (connp == NULL ||
+		    (curr->b_queue == wq || curr->b_queue == rq)) {
 			/* Unlink the mblk from the pending mp list */
 			if (prev != NULL) {
 				prev->b_next = curr->b_next;
@@ -1201,7 +1199,7 @@
 
 /*
  * ire_walk routine used to delete every IRE that depends on
- * 'ill'.  (Always called as writer.)
+ * 'ill'.  (Always called as writer, and may only be called from ire_walk.)
  *
  * Note: since the routes added by the kernel are deleted separately,
  * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
@@ -1223,8 +1221,23 @@
 	mutex_exit(&ire->ire_lock);
 	if (nce != NULL)
 		nce_refrele(nce);
-	if (ire->ire_ill == ill)
+	if (ire->ire_ill == ill) {
+		/*
+		 * The existing interface binding for ire must be
+		 * deleted before trying to bind the route to another
+		 * interface. However, since we are using the contents of the
+		 * ire after ire_delete, the caller has to ensure that
+		 * CONDEMNED (deleted) ire's are not removed from the list
+		 * when ire_delete() returns. Currently ill_downi() is
+		 * only called as part of ire_walk*() routines, so that
+		 * the irb_refhold() done by ire_walk*() will ensure that
+		 * ire_delete() does not lead to ire_inactive().
+		 */
+		ASSERT(ire->ire_bucket->irb_refcnt > 0);
 		ire_delete(ire);
+		if (ire->ire_unbound)
+			ire_rebind(ire);
+	}
 }
 
 /* Remove IRE_IF_CLONE on this ill */
@@ -5441,6 +5454,7 @@
 	tsol_gcgrp_t *gcgrp = NULL;
 	boolean_t gcgrp_xtraref = B_FALSE;
 	boolean_t cgtp_broadcast;
+	boolean_t unbound = B_FALSE;
 
 	ip1dbg(("ip_rt_add:"));
 
@@ -5765,6 +5779,12 @@
 		return (ENETUNREACH);
 	}
 
+	if (ill == NULL && !(flags & RTF_INDIRECT)) {
+		unbound = B_TRUE;
+		if (ipst->ips_ip_strict_src_multihoming > 0)
+			ill = gw_ire->ire_ill;
+	}
+
 	/*
 	 * We create one of three types of IREs as a result of this request
 	 * based on the netmask.  A netmask of all ones (which is automatically
@@ -5863,6 +5883,8 @@
 	if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
 		ire->ire_setsrc_addr = src_addr;
 
+	ire->ire_unbound = unbound;
+
 	/*
 	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
 	 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
@@ -7601,8 +7623,8 @@
 		}
 		lifr++;
 	}
+	rw_exit(&ipst->ips_ill_g_lock);
 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
-	rw_exit(&ipst->ips_ill_g_lock);
 	ipif_refrele(orig_ipif);
 	mp1->b_wptr = (uchar_t *)lifr;
 	STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));
@@ -18478,3 +18500,30 @@
 			nce_flush(ill, B_TRUE);
 	}
 }
+
+/*
+ * Find the first ill using usill as its source address: held, or NULL if none.
+ */
+ill_t *
+ill_lookup_usesrc(ill_t *usill)
+{
+	ip_stack_t *ipst;
+	ill_t *ill;
+
+	ASSERT(usill != NULL);
+	ipst = usill->ill_ipst;		/* only dereference after the ASSERT */
+	/* ill_g_usesrc_lock protects ill_usesrc_grp_next */
+	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+	for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
+	    ill = ill->ill_usesrc_grp_next) {
+		if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
+		    !ILL_IS_CONDEMNED(ill)) {
+			ill_refhold(ill);
+			break;
+		}
+	}
+	rw_exit(&ipst->ips_ill_g_lock);
+	rw_exit(&ipst->ips_ill_g_usesrc_lock);
+	return (ill == usill ? NULL : ill);	/* wrapped to usill: no hold */
+}
--- a/usr/src/uts/common/inet/ip/ip_input.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip_input.c	Wed Feb 17 22:59:58 2010 -0500
@@ -1841,6 +1841,8 @@
 	ire_t		*ire1;
 	mblk_t		*mp1;
 	ipha_t		*ipha1;
+	uint_t		ira_pktlen = ira->ira_pktlen;
+	uint16_t	ira_ip_hdr_length = ira->ira_ip_hdr_length;
 
 	irb = ire->ire_bucket;
 
@@ -1883,6 +1885,12 @@
 		ira->ira_zoneid = ire1->ire_zoneid;
 		ipha1 = (ipha_t *)mp1->b_rptr;
 		ip_fanout_v4(mp1, ipha1, ira);
+		/*
+		 * IPsec might have modified ira_pktlen and ira_ip_hdr_length
+		 * so we restore them for a potential next iteration
+		 */
+		ira->ira_pktlen = ira_pktlen;
+		ira->ira_ip_hdr_length = ira_ip_hdr_length;
 	}
 	irb_refrele(irb);
 	/* Do the main ire */
@@ -1913,6 +1921,8 @@
 	zoneid_t	zoneid;
 	mblk_t		*mp1;
 	ipha_t		*ipha1;
+	uint_t		ira_pktlen = ira->ira_pktlen;
+	uint16_t	ira_ip_hdr_length = ira->ira_ip_hdr_length;
 
 	/* ire_recv_multicast has switched to the upper ill for IPMP */
 	ASSERT(!IS_UNDER_IPMP(ill));
@@ -1972,6 +1982,12 @@
 		}
 		ipha1 = (ipha_t *)mp1->b_rptr;
 		ip_fanout_v4(mp1, ipha1, ira);
+		/*
+		 * IPsec might have modified ira_pktlen and ira_ip_hdr_length
+		 * so we restore them for a potential next iteration
+		 */
+		ira->ira_pktlen = ira_pktlen;
+		ira->ira_ip_hdr_length = ira_ip_hdr_length;
 	}
 
 	/* Do the main ire */
--- a/usr/src/uts/common/inet/ip/ip_ire.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip_ire.c	Wed Feb 17 22:59:58 2010 -0500
@@ -1856,7 +1856,7 @@
 
 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
 	ASSERT((ire->ire_addr & ~ire->ire_mask) == 0);
-	ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
+	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
 	    (ill != NULL && !ill->ill_isv6));
 
 	/*
@@ -1936,7 +1936,7 @@
 			}
 		}
 		/*
-		 * For exampe, with
+		 * For example, with
 		 * route add 11.0.0.0 gw1 -ifp bge0
 		 * route add 11.0.0.0 gw2 -ifp bge1
 		 * this code would differentiate based on
@@ -1965,8 +1965,8 @@
 	}
 
 matchit:
+	ire_ill = ire->ire_ill;
 	if (match_flags & MATCH_IRE_ILL) {
-		ire_ill = ire->ire_ill;
 
 		/*
 		 * If asked to match an ill, we *must* match
@@ -1991,6 +1991,16 @@
 				return (B_FALSE);
 		}
 	}
+	if (match_flags & MATCH_IRE_SRC_ILL) {
+		if (ire_ill == NULL)
+			return (B_FALSE);
+		if (!IS_ON_SAME_LAN(ill, ire_ill)) {
+			if (ire_ill->ill_usesrc_ifindex == 0 ||
+			    (ire_ill->ill_usesrc_ifindex !=
+			    ill->ill_phyint->phyint_ifindex))
+				return (B_FALSE);
+		}
+	}
 
 	if ((ire->ire_addr == (addr & mask)) &&
 	    ((!(match_flags & MATCH_IRE_GW)) ||
@@ -3563,3 +3573,60 @@
 		return (5);
 	return (-1); /* unknown ire_type */
 }
+
+/*
+ * Re-home an unbound route whose interface is going away.
+ *
+ * In the preferred/strict src multihoming modes, a route with ire_unbound
+ * set is bound to the first available interface that has an interface route
+ * for the ire_gateway. When that interface is brought down, ill_downi()
+ * calls ire_rebind() to add a copy of the route bound to another matching
+ * interface, preserving the original unbound route's reachability information.
+ */
+void
+ire_rebind(ire_t *ire)
+{
+	ip_stack_t	*ipst = ire->ire_ipst;
+	boolean_t	is_v6 = (ire->ire_ipversion == IPV6_VERSION);
+	int		mflags = MATCH_IRE_TYPE;
+	ire_t		*if_ire, *bound_ire;
+	ill_t		*if_ill;
+
+	ASSERT(ire->ire_unbound);
+	/* Find an interface route to the gateway, widening the match once. */
+	for (;;) {
+		if (is_v6) {
+			if_ire = ire_ftable_lookup_v6(
+			    &ire->ire_gateway_addr_v6, 0, 0, IRE_INTERFACE,
+			    NULL, ALL_ZONES, NULL, mflags, 0, ipst, NULL);
+		} else {
+			if_ire = ire_ftable_lookup_v4(ire->ire_gateway_addr,
+			    0, 0, IRE_INTERFACE, NULL, ALL_ZONES, NULL, mflags,
+			    0, ipst, NULL);
+		}
+		if (if_ire != NULL)
+			break;
+		/* see comments in ip_rt_add[_v6]() for IPMP */
+		if (mflags & MATCH_IRE_TESTHIDDEN)
+			return;
+		mflags |= MATCH_IRE_TESTHIDDEN;
+	}
+	if_ill = if_ire->ire_ill;
+	if (is_v6) {
+		bound_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
+		    &ire->ire_gateway_addr_v6, ire->ire_type, if_ill,
+		    ire->ire_zoneid, ire->ire_flags, NULL, ipst);
+	} else {
+		bound_ire = ire_create((uchar_t *)&ire->ire_addr,
+		    (uchar_t *)&ire->ire_mask,
+		    (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, if_ill,
+		    ire->ire_zoneid, ire->ire_flags, NULL, ipst);
+	}
+	ire_refrele(if_ire);
+	if (bound_ire == NULL)
+		return;
+	bound_ire->ire_unbound = B_TRUE;
+	bound_ire = ire_add(bound_ire);
+	if (bound_ire != NULL)
+		ire_refrele(bound_ire);
+}
--- a/usr/src/uts/common/inet/ip/ip_mroute.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip_mroute.c	Wed Feb 17 22:59:58 2010 -0500
@@ -3113,7 +3113,8 @@
 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
 		}
 		bzero(&ixas, sizeof (ixas));
-		ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE;
+		ixas.ixa_flags =
+		    IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
 		ixas.ixa_ipst = ipst;
 		ixas.ixa_ifindex = 0;
 		ixas.ixa_cred = kcred;
--- a/usr/src/uts/common/inet/ip/ip_ndp.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip_ndp.c	Wed Feb 17 22:59:58 2010 -0500
@@ -165,8 +165,8 @@
  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
  * IPMP cast_ill (in the IPMP case).
  *
- * Note that the probe interval is based on ncec->ncec_ill which
- * may be the ipmp_ill.
+ * Note that the probe interval is based on the src_ill for IPv6, and
+ * the ncec_xmit_interval for IPv4.
  */
 static void
 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
@@ -180,7 +180,7 @@
 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
-		probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill);
+		probe_interval = ILL_PROBE_INTERVAL(src_ill);
 	} else {
 		/* IPv4 DAD delay the initial probe. */
 		if (send_probe)
@@ -4464,8 +4464,17 @@
 		 */
 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
-		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4))
+		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
 			fastprobe = B_TRUE;
+		} else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
+		    !IS_IPV4_LL_SPACE(&addr4)) {
+			ill_t *hwaddr_ill;
+
+			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
+			    hw_addr_len);
+			if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
+				fastprobe = B_TRUE;
+		}
 		if (fastprobe) {
 			ncec->ncec_xmit_interval =
 			    ipst->ips_arp_fastprobe_interval;
--- a/usr/src/uts/common/inet/ip/ip_output.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ip_output.c	Wed Feb 17 22:59:58 2010 -0500
@@ -847,8 +847,8 @@
 repeat_ire:
 	error = 0;
 	setsrc = INADDR_ANY;
-	ire = ip_select_route_v4(firsthop, ixa, NULL, &setsrc, &error,
-	    &multirt);
+	ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL,
+	    &setsrc, &error, &multirt);
 	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
 	if (error != 0) {
 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
@@ -2295,10 +2295,13 @@
 			 * starting at ire1.
 			 */
 			ire_t *ire2;
+			uint_t	match_flags = MATCH_IRE_DSTONLY;
 
+			if (ire1->ire_ill != NULL)
+				match_flags |= MATCH_IRE_ILL;
 			ire2 = ire_route_recursive_impl_v4(ire1,
 			    ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
-			    ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY,
+			    ire1->ire_zoneid, NULL, match_flags,
 			    IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
 			if (ire2 != NULL)
 				ire_refrele(ire2);
--- a/usr/src/uts/common/inet/ip/ipsecesp.c	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip/ipsecesp.c	Wed Feb 17 22:59:58 2010 -0500
@@ -2320,7 +2320,7 @@
 	ixas.ixa_tsl = NULL;
 	ixas.ixa_ipst = ns->netstack_ip;
 	/* No ULP checksum; done by esp_prepare_udp */
-	ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_IPSEC;
+	ixas.ixa_flags = (IXAF_IS_IPV4 | IXAF_NO_IPSEC | IXAF_VERIFY_SOURCE);
 
 	(void) ip_output_simple(mp, &ixas);
 	ixa_cleanup(&ixas);
--- a/usr/src/uts/common/inet/ip_if.h	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip_if.h	Wed Feb 17 22:59:58 2010 -0500
@@ -291,6 +291,7 @@
 extern	int	ipif_arp_down(ipif_t *ipif);
 extern	void	ipif_mask_reply(ipif_t *);
 extern	int 	ipif_up(ipif_t *, queue_t *, mblk_t *);
+extern	ill_t	*ill_lookup_usesrc(ill_t *);
 
 extern	void	ipsq_current_start(ipsq_t *, ipif_t *, int);
 extern	void	ipsq_current_finish(ipsq_t *);
--- a/usr/src/uts/common/inet/ip_ire.h	Wed Feb 17 19:52:19 2010 -0800
+++ b/usr/src/uts/common/inet/ip_ire.h	Wed Feb 17 22:59:58 2010 -0500
@@ -93,6 +93,7 @@
 					/* zones or shared IREs */
 #define	MATCH_IRE_SECATTR	0x0040	/* Match gateway security attributes */
 #define	MATCH_IRE_TESTHIDDEN 	0x0080	/* Match ire_testhidden IREs */
+#define	MATCH_IRE_SRC_ILL	0x0100	/* ire_ill uses a src address on ill */
 
 #define	MAX_IRE_RECURSION	4	/* Max IREs in ire_route_recursive */
 
@@ -321,12 +322,12 @@
 
 extern ire_t	*ip_select_route_pkt(mblk_t *, ip_xmit_attr_t *,
     uint_t *, int *, boolean_t *);
-extern ire_t	*ip_select_route(const in6_addr_t *, ip_xmit_attr_t *,
-    uint_t *, in6_addr_t *, int *, boolean_t *);
-extern ire_t	*ip_select_route_v4(ipaddr_t, ip_xmit_attr_t *,
+extern ire_t	*ip_select_route(const in6_addr_t *, const in6_addr_t,
+    ip_xmit_attr_t *, uint_t *, in6_addr_t *, int *, boolean_t *);
+extern ire_t	*ip_select_route_v4(ipaddr_t, ipaddr_t, ip_xmit_attr_t *,
     uint_t *, ipaddr_t *, int *, boolean_t *);
-extern ire_t	*ip_select_route_v6(const in6_addr_t *, ip_xmit_attr_t *,
-    uint_t *, in6_addr_t *, int *, boolean_t *);
+extern ire_t	*ip_select_route_v6(const in6_addr_t *, const in6_addr_t,
+    ip_xmit_attr_t *, uint_t *, in6_addr_t *, int *, boolean_t *);
 
 extern	void	ire_walk(pfv_t, void *, ip_stack_t *);
 extern	void	ire_walk_ill(uint_t, uint_t, pfv_t, void *, ill_t *);
@@ -348,6 +349,7 @@
     zoneid_t, ip_stack_t *);
 extern  void ire_increment_generation(ire_t *);
 extern  void ire_increment_multicast_generation(ip_stack_t *, boolean_t);
+extern	void ire_rebind(ire_t *);
 
 #endif /* _KERNEL */