changeset 5055:7a15930aae3c

6588495 IP can use the wrong interface for filtering/qos 6599516 locking in fr_natderef causes lock contention and performance drop
author dr146992
date Fri, 14 Sep 2007 18:14:13 -0700
parents bd0f88647368
children 61acf2c76fd6
files usr/src/uts/common/inet/ip/ip.c usr/src/uts/common/inet/ipf/ip_nat.c usr/src/uts/common/inet/ipf/ip_state.c
diffstat 3 files changed, 58 insertions(+), 35 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/inet/ip/ip.c	Fri Sep 14 16:10:56 2007 -0700
+++ b/usr/src/uts/common/inet/ip/ip.c	Fri Sep 14 18:14:13 2007 -0700
@@ -13035,7 +13035,7 @@
 	/* IP options present */
 	if (u1) {
 		goto ipoptions;
-	} else {
+	} else if (!mctl_present) {
 		/* Check the IP header checksum.  */
 		if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
 			/* Clear the IP header h/w cksum flag */
@@ -16435,9 +16435,8 @@
 	}
 
 	/* Get the ill_index of the outgoing ILL */
-	ill_index = ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex;
-
-	out_ill = ire->ire_ipif->ipif_ill;
+	out_ill = ire_to_ill(ire);
+	ill_index = out_ill->ill_phyint->phyint_ifindex;
 
 	DTRACE_PROBE4(ip4__forwarding__start,
 	    ill_t *, in_ill, ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp);
@@ -23141,7 +23140,7 @@
 					}
 				}
 
-				out_ill = ire->ire_ipif->ipif_ill;
+				out_ill = ire_to_ill(ire);
 				DTRACE_PROBE4(ip4__physical__out__start,
 				    ill_t *, NULL,
 				    ill_t *, out_ill,
@@ -23341,7 +23340,7 @@
 			 * is considered to be the "output" side and
 			 * ip_wput_local to be the "input" side.
 			 */
-			out_ill = ire->ire_ipif->ipif_ill;
+			out_ill = ire_to_ill(ire);
 
 			DTRACE_PROBE4(ip4__loopback__out__start,
 			    ill_t *, NULL, ill_t *, out_ill,
@@ -23367,7 +23366,7 @@
 			return;
 		}
 
-		out_ill = ire->ire_ipif->ipif_ill;
+		out_ill = ire_to_ill(ire);
 
 		DTRACE_PROBE4(ip4__loopback__out__start,
 		    ill_t *, NULL, ill_t *, out_ill,
@@ -25680,7 +25679,7 @@
 		ASSERT(q != NULL);
 
 		/* PFHooks: LOOPBACK_OUT */
-		out_ill = ire->ire_ipif->ipif_ill;
+		out_ill = ire_to_ill(ire);
 
 		DTRACE_PROBE4(ip6__loopback__out__start,
 		    ill_t *, NULL, ill_t *, out_ill,
@@ -26016,7 +26015,7 @@
 			ipha->ipha_src = ire->ire_src_addr;
 
 		/* PFHooks: LOOPBACK_OUT */
-		out_ill = ire->ire_ipif->ipif_ill;
+		out_ill = ire_to_ill(ire);
 
 		DTRACE_PROBE4(ip4__loopback__out__start,
 		    ill_t *, NULL, ill_t *, out_ill,
@@ -29793,7 +29792,7 @@
 			mp->b_prev = NULL;
 
 			/* set up ill index for outbound qos processing */
-			out_ill = ire->ire_ipif->ipif_ill;
+			out_ill = ire_to_ill(ire);
 			ill_index = out_ill->ill_phyint->phyint_ifindex;
 			first_mp = ip_wput_attach_llhdr(mp, ire, proc,
 			    ill_index);
--- a/usr/src/uts/common/inet/ipf/ip_nat.c	Fri Sep 14 16:10:56 2007 -0700
+++ b/usr/src/uts/common/inet/ipf/ip_nat.c	Fri Sep 14 18:14:13 2007 -0700
@@ -1587,8 +1587,6 @@
 	if (logtype != 0 && ifs->ifs_nat_logging != 0)
 		nat_log(nat, logtype, ifs);
 
-	MUTEX_ENTER(&ifs->ifs_ipf_nat_new);
-
 	/*
 	 * Take it as a general indication that all the pointers are set if
 	 * nat_pnext is set.
@@ -1629,11 +1627,18 @@
 
 	fr_deletequeueentry(&nat->nat_tqe);
 
-	nat->nat_ref--;
-	if (nat->nat_ref > 0) {
-		MUTEX_EXIT(&ifs->ifs_ipf_nat_new);
+	MUTEX_ENTER(&nat->nat_lock);
+	if (nat->nat_ref > 1) {
+		nat->nat_ref--;
+		MUTEX_EXIT(&nat->nat_lock);
 		return;
 	}
+	MUTEX_EXIT(&nat->nat_lock);
+
+	/*
+	 * At this point, nat_ref is 1, doing "--" would make it 0..
+	 */
+	nat->nat_ref = 0;
 
 #ifdef	IPFILTER_SYNC
 	if (nat->nat_sync)
@@ -1667,7 +1672,6 @@
 
 	aps_free(nat->nat_aps, ifs);
 	ifs->ifs_nat_stats.ns_inuse--;
-	MUTEX_EXIT(&ifs->ifs_ipf_nat_new);
 
 	/*
 	 * If there's a fragment table entry too for this nat entry, then
@@ -4746,6 +4750,14 @@
 /*                                                                          */
 /* Decrement the reference counter for this NAT table entry and free it if  */
 /* there are no more things using it.                                       */
+/*                                                                          */
+/* IF nat_ref == 1 when this function is called, then we have an orphan nat */
+/* structure *because* it only gets called on paths _after_ nat_ref has been*/
+/* incremented.  If nat_ref == 1 then we shouldn't decrement it here        */
+/* because nat_delete() will do that and send nat_ref to -1.                */
+/*                                                                          */
+/* Holding the lock on nat_lock is required to serialise nat_delete() being */
+/* called from a NAT flush ioctl with a deref happening because of a packet.*/
 /* ------------------------------------------------------------------------ */
 void fr_natderef(natp, ifs)
 nat_t **natp;
@@ -4755,10 +4767,17 @@
 
 	nat = *natp;
 	*natp = NULL;
+
+	MUTEX_ENTER(&nat->nat_lock);
+	if (nat->nat_ref > 1) {
+		nat->nat_ref--;
+		MUTEX_EXIT(&nat->nat_lock);
+		return;
+	}
+	MUTEX_EXIT(&nat->nat_lock);
+
 	WRITE_ENTER(&ifs->ifs_ipf_nat);
-	nat->nat_ref--;
-	if (nat->nat_ref == 0)
-	    nat_delete(nat, NL_EXPIRE, ifs);
+	nat_delete(nat, NL_EXPIRE, ifs);
 	RWLOCK_EXIT(&ifs->ifs_ipf_nat);
 }
 
--- a/usr/src/uts/common/inet/ipf/ip_state.c	Fri Sep 14 16:10:56 2007 -0700
+++ b/usr/src/uts/common/inet/ipf/ip_state.c	Fri Sep 14 18:14:13 2007 -0700
@@ -2868,9 +2868,15 @@
 	 * If it is still in use by something else, do not go any further,
 	 * but note that at this point it is now an orphan.
 	 */
-	is->is_ref--;
-	if (is->is_ref > 0)
+	MUTEX_ENTER(&is->is_lock);
+	if (is->is_ref > 1) {
+		is->is_ref--;
+		MUTEX_EXIT(&is->is_lock);
 		return;
+	}
+	MUTEX_EXIT(&is->is_lock);
+
+	is->is_ref = 0;
 
 	if (is->is_tqehead[0] != NULL) {
 		if (fr_deletetimeoutqueue(is->is_tqehead[0]) == 0)
@@ -3823,24 +3829,23 @@
 	fin = fin;	/* LINT */
 	is = *isp;
 	*isp = NULL;
-	WRITE_ENTER(&ifs->ifs_ipf_state);
-	is->is_ref--;
-	if (is->is_ref == 0) {
-		is->is_ref++;		/* To counter ref-- in fr_delstate() */
-		fr_delstate(is, ISL_EXPIRE, ifs);
+
+	MUTEX_ENTER(&is->is_lock);
+	if (is->is_ref > 1) {
+		is->is_ref--;
+		MUTEX_EXIT(&is->is_lock);
 #ifndef	_KERNEL
-#if 0
-	} else if (((fin->fin_out == 1) || (eol == 1)) &&
-		   ((ostate == IPF_TCPS_LAST_ACK) &&
-		   (nstate == IPF_TCPS_TIME_WAIT))) {
-		;
-#else
-	} else if ((is->is_sti.tqe_state[0] > IPF_TCPS_ESTABLISHED) ||
+		if ((is->is_sti.tqe_state[0] > IPF_TCPS_ESTABLISHED) ||
 		   (is->is_sti.tqe_state[1] > IPF_TCPS_ESTABLISHED)) {
+			fr_delstate(is, ISL_ORPHAN, ifs);
+		}
 #endif
-		fr_delstate(is, ISL_ORPHAN, ifs);
-#endif
+		return;
 	}
+	MUTEX_EXIT(&is->is_lock);
+
+	WRITE_ENTER(&ifs->ifs_ipf_state);
+	fr_delstate(is, ISL_EXPIRE, ifs);
 	RWLOCK_EXIT(&ifs->ifs_ipf_state);
 }