changeset 4084:1838f1bb1fda

6521408 ip_rput_process_forward() declared twice in ip.c 6545032 race in NCE creation can render off-link hosts unreachable
author sowmini
date Mon, 23 Apr 2007 09:35:29 -0700
parents 08c82bb85ef2
children f5fd2bc7debd
files usr/src/uts/common/inet/ip.h usr/src/uts/common/inet/ip/ip.c usr/src/uts/common/inet/ip/ip_ire.c usr/src/uts/common/inet/ip/ip_ndp.c usr/src/uts/common/inet/ip_ndp.h
diffstat 5 files changed, 73 insertions(+), 78 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/inet/ip.h	Mon Apr 23 09:19:26 2007 -0700
+++ b/usr/src/uts/common/inet/ip.h	Mon Apr 23 09:35:29 2007 -0700
@@ -433,19 +433,22 @@
 /*
  * NCE_EXPIRED is TRUE when we have a non-permanent nce that was
  * found to be REACHABLE more than ip_ire_arp_interval ms ago.
- * This macro is used to trigger re-arp of existing nce_t entries
- * so that they can be updated with the latest information.
- * nce's will get cleaned up (or updated with new info) in the following
- * circumstances:
+ * This macro is used to age existing nce_t entries. The
+ * nce's will get cleaned up in the following circumstances:
  * - ip_ire_trash_reclaim will free nce's using ndp_cache_reclaim
  *    when memory is low,
  * - ip_arp_news, when updates are received.
- * - if the nce is NCE_EXPIRED(), it will be re-initialized to ND_INITIAL,
- *   so that a new arp request will be triggered.
+ * - if the nce is NCE_EXPIRED(), it will deleted, so that a new
+ *   arp request will need to be triggered from an ND_INITIAL nce.
+ *
+ * Note that the nce state transition follows the pattern:
+ *	ND_INITIAL -> ND_INCOMPLETE -> ND_REACHABLE
+ * after which the nce is deleted when it has expired.
+ *
  * nce_last is the timestamp that indicates when the nce_res_mp in the
  * nce_t was last updated to a valid link-layer address.  nce_last gets
  * modified/updated :
- *  - when the nce is created  or reinit-ed
+ *  - when the nce is created
  *  - every time we get a sane arp response for the nce.
  */
 #define	NCE_EXPIRED(nce, ipst)	(nce->nce_last > 0 &&	\
--- a/usr/src/uts/common/inet/ip/ip.c	Mon Apr 23 09:19:26 2007 -0700
+++ b/usr/src/uts/common/inet/ip/ip.c	Mon Apr 23 09:35:29 2007 -0700
@@ -771,9 +771,6 @@
 
 static void	ip_rput_process_forward(queue_t *, mblk_t *, ire_t *,
     ipha_t *, ill_t *, boolean_t);
-
-static void	ip_rput_process_forward(queue_t *, mblk_t *, ire_t *,
-    ipha_t *, ill_t *, boolean_t);
 ipaddr_t	ip_g_all_ones = IP_HOST_MASK;
 
 /* How long, in seconds, we allow frags to hang around. */
@@ -8556,14 +8553,6 @@
 			 */
 			res_mp = save_ire->ire_nce->nce_res_mp;
 			ire_fp_mp = NULL;
-			/*
-			 * save_ire's nce_fp_mp can't change since it is
-			 * not an IRE_MIPRTUN or IRE_BROADCAST
-			 * LOCK_IRE_FP_MP does not do any useful work in
-			 * the case of IRE_CACHE. So we don't use it below.
-			 */
-			if (save_ire->ire_stq == dst_ill->ill_wq)
-				ire_fp_mp = save_ire->ire_nce->nce_fp_mp;
 
 			/*
 			 * Check cached gateway IRE for any security
@@ -28119,17 +28108,15 @@
 			    &fake_ire->ire_gateway_addr : &fake_ire->ire_addr),
 			    B_FALSE)) != NULL) {
 				/*
-				 * cleanup: just reset nce.
+				 * cleanup:
 				 * We check for refcnt 2 (one for the nce
 				 * hash list + 1 for the ref taken by
-				 * ndp_lookup_v4) to ensure that there are
+				 * ndp_lookup_v4) to check that there are
 				 * no ire's pointing at the nce.
 				 */
-				if (nce->nce_refcnt == 2) {
-					nce = nce_reinit(nce);
-				}
-				if (nce != NULL)
-					NCE_REFRELE(nce);
+				if (nce->nce_refcnt == 2)
+					ndp_delete(nce);
+				NCE_REFRELE(nce);
 			}
 			freeb(mp1);  /* dl_unitdata response */
 			freemsg(mp); /* fake ire */
--- a/usr/src/uts/common/inet/ip/ip_ire.c	Mon Apr 23 09:19:26 2007 -0700
+++ b/usr/src/uts/common/inet/ip/ip_ire.c	Mon Apr 23 09:35:29 2007 -0700
@@ -77,7 +77,6 @@
 
 struct kmem_cache *rt_entry_cache;
 
-
 /*
  * Synchronization notes:
  *
@@ -2893,8 +2892,8 @@
  * that has prohibited the addition of incomplete ire's. If this
  * parameter is set, and we find an nce that is in a state other
  * than ND_REACHABLE, we fail the add. Note that nce_state could be
- * something other than ND_REACHABLE if nce_reinit has just
- * kicked in and reset the nce.
+ * something other than ND_REACHABLE if the nce had just expired and
+ * the ire_create preceding the ire_add added a new ND_INITIAL nce.
  */
 int
 ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
@@ -3396,15 +3395,18 @@
 		 * if the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE
 		 * and the caller has prohibited the addition of incomplete
 		 * ire's, we fail the add. Note that nce_state could be
-		 * something other than ND_REACHABLE if nce_reinit has just
-		 * kicked in and reset the nce.
+		 * something other than ND_REACHABLE if the nce had
+		 * just expired and the ire_create preceding the
+		 * ire_add added a new ND_INITIAL nce.
 		 */
 		if ((nce == NULL) ||
 		    (nce->nce_flags & NCE_F_CONDEMNED) ||
 		    (!allow_unresolved &&
 		    (nce->nce_state != ND_REACHABLE))) {
-			if (nce != NULL)
+			if (nce != NULL) {
+				DTRACE_PROBE1(ire__bad__nce, nce_t *, nce);
 				mutex_exit(&nce->nce_lock);
+			}
 			ire_atomic_end(irb_ptr, ire);
 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
 			if (nce != NULL)
@@ -6808,17 +6810,30 @@
 			nce_state = ND_REACHABLE;
 		} else {
 			nce_state = ND_INITIAL;
-			if (fp_mp)
-				freemsg(fp_mp);
-			fp_mp = NULL;
+			ASSERT(fp_mp == NULL);
 		}
 		nce_flags = 0;
 	}
 
+retry_nce:
 	err = ndp_lookup_then_add(ire_ill, NULL,
 	    &addr4, &mask4, NULL, 0, nce_flags, nce_state, &arpce,
 	    fp_mp, res_mp);
 
+	if (err == EEXIST && NCE_EXPIRED(arpce, ipst)) {
+		/*
+		 * We looked up an expired nce.
+		 * Go back and try to create one again.
+		 * The res_mp should be intact for the EEXIST case,
+		 * i.e., there are no new references to it, and there
+		 * is no need to copyb it.
+		 */
+		ndp_delete(arpce);
+		NCE_REFRELE(arpce);
+		arpce = NULL;
+		goto retry_nce;
+	}
+
 	ip1dbg(("ire 0x%p addr 0x%lx mask 0x%lx type 0x%x; "
 	    "found nce 0x%p err %d\n", (void *)ire, (ulong_t)addr4,
 	    (ulong_t)mask4, ire->ire_type, (void *)arpce, err));
@@ -6885,18 +6900,11 @@
 		 */
 		NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
 	} else {
-		if (NCE_EXPIRED(arpce, ipst))
-			arpce = nce_reinit(arpce);
-		if (arpce != NULL) {
-			/*
-			 * We are not using this nce_t just yet so release
-			 * the ref taken in ndp_lookup_then_add_v4()
-			 */
-			NCE_REFRELE(arpce);
-		} else {
-			ip0dbg(("can't reinit arpce for ill 0x%p;\n",
-			    (void *)ire_ill));
-		}
+		/*
+		 * We are not using this nce_t just yet so release
+		 * the ref taken in ndp_lookup_then_add_v4()
+		 */
+		NCE_REFRELE(arpce);
 	}
 	return (0);
 }
--- a/usr/src/uts/common/inet/ip/ip_ndp.c	Mon Apr 23 09:19:26 2007 -0700
+++ b/usr/src/uts/common/inet/ip/ip_ndp.c	Mon Apr 23 09:35:29 2007 -0700
@@ -2511,6 +2511,7 @@
 	uchar_t		*h;
 	uchar_t		*m = flags_buf;
 	in6_addr_t	v6addr;
+	uint64_t	now;
 
 	/*
 	 * Lock the nce to protect nce_res_mp from being changed
@@ -2521,9 +2522,21 @@
 	 * In addition, make sure that the mblk has enough space
 	 * before writing to it. If is doesn't, allocate a new one.
 	 */
-	if (nce->nce_ipversion == IPV4_VERSION)
-		/* Don't include v4 nce_ts in NDP cache entry report */
+	if (nce->nce_ipversion == IPV4_VERSION) {
+		/*
+		 * Don't include v4 NCEs in NDP cache entry report.
+		 * But sanity check for lingering ND_INITIAL entries
+		 * when we do 'ndd -get /dev/ip ip_ndp_cache_report'
+		 */
+		if (nce->nce_state == ND_INITIAL) {
+
+			now = TICK_TO_MSEC(lbolt64);
+			if (now - nce->nce_init_time > 120000) {
+				DTRACE_PROBE1(nce__stuck, nce_t *, nce);
+			}
+		}
 		return;
+	}
 
 	ASSERT(ill != NULL);
 	v6addr = nce->nce_mask;
@@ -3695,8 +3708,16 @@
 nce_thread_exit(nce_t *nce, caddr_t arg)
 {
 	th_trace_t	*th_trace;
+	uint64_t	now;
 
 	mutex_enter(&nce->nce_lock);
+	if (nce->nce_state == ND_INITIAL) {
+
+		now = TICK_TO_MSEC(lbolt64);
+		if (now - nce->nce_init_time > 120000) {
+			DTRACE_PROBE1(nce__stuck, nce_t *, nce);
+		}
+	}
 	th_trace = th_trace_nce_lookup(nce);
 
 	if (th_trace == NULL) {
@@ -3865,10 +3886,13 @@
 	nce->nce_ll_extract_start = hw_extract_start;
 	nce->nce_fp_mp = (fp_mp? fp_mp : NULL);
 	nce->nce_res_mp = template;
-	if (state == ND_REACHABLE)
+	if (state == ND_REACHABLE) {
 		nce->nce_last = TICK_TO_MSEC(lbolt64);
-	else
+	} else {
 		nce->nce_last = 0;
+		if (state == ND_INITIAL)
+			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
+	}
 	nce->nce_qd_mp = NULL;
 	nce->nce_mp = mp;
 	if (hw_addr != NULL)
@@ -3928,33 +3952,6 @@
 	}
 }
 
-nce_t *
-nce_reinit(nce_t *nce)
-{
-	nce_t *newnce = NULL;
-	in_addr_t nce_addr, nce_mask;
-	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
-
-	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
-	IN6_V4MAPPED_TO_IPADDR(&nce->nce_mask, nce_mask);
-	/*
-	 * delete the old one. this will get rid of any ire's pointing
-	 * at this nce.
-	 */
-	ndp_delete(nce);
-	/*
-	 * create a new nce with the same addr and mask.
-	 */
-	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
-	(void) ndp_add_v4(nce->nce_ill, NULL, &nce_addr, &nce_mask, NULL, 0, 0,
-	    ND_INITIAL, &newnce, NULL, NULL);
-	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
-	/*
-	 * refrele the old nce.
-	 */
-	NCE_REFRELE(nce);
-	return (newnce);
-}
 
 /*
  * ndp_walk routine to delete all entries that have a given destination or
--- a/usr/src/uts/common/inet/ip_ndp.h	Mon Apr 23 09:19:26 2007 -0700
+++ b/usr/src/uts/common/inet/ip_ndp.h	Mon Apr 23 09:35:29 2007 -0700
@@ -77,6 +77,7 @@
 	uchar_t		nce_ipversion;	/* IPv4(ARP)/IPv6(NDP) version */
 	uint_t		nce_defense_count;	/* number of NDP conflicts */
 	uint_t		nce_defense_time;	/* last time defended (secs) */
+	uint64_t	nce_init_time;  /* time when it was set to ND_INITIAL */
 #ifdef NCE_DEBUG
 	th_trace_t	*nce_trace[IP_TR_HASH_MAX];
 	boolean_t	nce_trace_disable;	/* True when alloc fails */
@@ -331,7 +332,6 @@
     boolean_t (*)(nce_t *, void  *), void *);
 extern	void	nce_queue_mp_common(nce_t *, mblk_t *, boolean_t);
 extern	void	ndp_flush_qd_mp(nce_t *);
-extern	nce_t	*nce_reinit(nce_t *);
 extern	void	nce_delete_hw_changed(nce_t *, void *);
 extern	void	nce_fastpath(nce_t *);