changeset 8014:ea9ef48ce5db

6759608 TCP should pre-allocate the T_ordrel_ind mblk
author Kacheong Poon <Kacheong.Poon@Sun.COM>
date Wed, 05 Nov 2008 08:38:41 -0800
parents 61f8e784e28e
children a10a050cef9d
files usr/src/uts/common/inet/ip/ipclassifier.c usr/src/uts/common/inet/tcp.h usr/src/uts/common/inet/tcp/tcp.c
diffstat 3 files changed, 100 insertions(+), 183 deletions(-)
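
This changeset replaces the old recovery path for a failed T_ORDREL_IND allocation at close time (a qenable() of the read queue plus a 4-second qbufcall/qtimeout fallback via tcp_ordrel_kick()) with a pre-allocation scheme: the indication mblk is allocated up front when the connection is set up, and a second pre-allocated mblk, tcp_rsrv_mp, is used by the service routine to enter the squeue. Below is a minimal sketch of the T_ordrel_ind lifecycle, condensed from the hunks that follow; mi_tpi_ordrel_ind(), freeb(), putnext() and the tcp_t fields are the real interfaces shown in the diff, while the example_* wrappers and the simplified error handling are illustrative only.

/*
 * Minimal sketch of the pre-allocation pattern (condensed from the
 * hunks below).  The example_* wrappers do not exist in the tree;
 * includes are representative, the real tcp.c pulls in many more.
 */
#include <sys/stream.h>		/* mblk_t, freeb(), putnext() */
#include <inet/mi.h>		/* mi_tpi_ordrel_ind() */
#include <inet/tcp.h>		/* tcp_t */

/* Setup time (eager creation / bind processing): allocate while failure is cheap. */
static int
example_prealloc_ordrel(tcp_t *tcp)
{
	ASSERT(tcp->tcp_ordrel_mp == NULL);
	if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL)
		return (ENOMEM);	/* fail the bind/accept, not the close */
	return (0);
}

/* Orderly-release time: the mblk is guaranteed to exist, just send it up. */
static void
example_send_ordrel(tcp_t *tcp)
{
	mblk_t	*mp = tcp->tcp_ordrel_mp;

	tcp->tcp_ordrel_mp = NULL;
	tcp->tcp_ordrel_done = B_TRUE;
	putnext(tcp->tcp_rq, mp);
}

/* Teardown (e.g. conn destruction in ipclassifier.c): drop it if never sent. */
static void
example_discard_ordrel(tcp_t *tcp)
{
	if (tcp->tcp_ordrel_mp != NULL) {
		freeb(tcp->tcp_ordrel_mp);
		tcp->tcp_ordrel_mp = NULL;
	}
}
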
--- a/usr/src/uts/common/inet/ip/ipclassifier.c	Wed Nov 05 08:30:52 2008 -0700
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c	Wed Nov 05 08:38:41 2008 -0800
@@ -698,6 +698,21 @@
 		}
 		ASSERT(tcp->tcp_iphc_len == 0);
 
+		if (tcp->tcp_ordrel_mp != NULL) {
+			freeb(tcp->tcp_ordrel_mp);
+			tcp->tcp_ordrel_mp = NULL;
+		}
+
+		/*
+		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
+		 * the mblk.
+		 */
+		if (tcp->tcp_rsrv_mp != NULL) {
+			freeb(tcp->tcp_rsrv_mp);
+			tcp->tcp_rsrv_mp = NULL;
+			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
+		}
+
 		ASSERT(connp->conn_latch == NULL);
 		ASSERT(connp->conn_policy == NULL);
 
--- a/usr/src/uts/common/inet/tcp.h	Wed Nov 05 08:30:52 2008 -0700
+++ b/usr/src/uts/common/inet/tcp.h	Wed Nov 05 08:38:41 2008 -0800
@@ -247,34 +247,31 @@
 
 		tcp_bind_pending : 1,	/* Client is waiting for bind ack */
 		tcp_unbind_pending : 1, /* Client sent T_UNBIND_REQ */
-		tcp_deferred_clean_death : 1,
-					/* defer tcp endpoint cleanup etc. */
 		tcp_ka_enabled: 1,	/* Connection KeepAlive Timer needed */
+		tcp_zero_win_probe: 1,	/* Zero win probing is in progress */
 
-		tcp_zero_win_probe: 1,	/* Zero win probing is in progress */
 		tcp_loopback: 1,	/* src and dst are the same machine */
 		tcp_localnet: 1,	/* src and dst are on the same subnet */
 		tcp_syn_defense: 1,	/* For defense against SYN attack */
 #define	tcp_dontdrop	tcp_syn_defense
+		tcp_set_timer : 1,
 
-		tcp_set_timer : 1,
 		tcp_active_open: 1,	/* This is a active open */
-		tcp_timeout : 1,	/* qbufcall failed, qtimeout pending */
 		tcp_rexmit : 1,		/* TCP is retransmitting */
-
 		tcp_snd_sack_ok : 1,	/* Can use SACK for this connection */
 		tcp_empty_flag : 1,	/* Empty flag for future use */
+
 		tcp_recvdstaddr : 1,	/* return T_EXTCONN_IND with dst addr */
 		tcp_hwcksum : 1,	/* The NIC is capable of hwcksum */
-
 		tcp_ip_forward_progress : 1,
 		tcp_anon_priv_bind : 1,
+
 		tcp_ecn_ok : 1,		/* Can use ECN for this connection */
 		tcp_ecn_echo_on : 1,	/* Need to do ECN echo */
-
 		tcp_ecn_cwr_sent : 1,	/* ECN_CWR has been sent */
 		tcp_cwr : 1,		/* Cwnd has reduced recently */
-		tcp_pad_to_bit31 : 2;
+
+		tcp_pad_to_bit31 : 4;
 	/* Following manipulated by TCP under squeue protection */
 	uint32_t
 		tcp_mdt : 1,		/* Lower layer is capable of MDT */
@@ -404,7 +401,7 @@
 
 	kmutex_t	*tcp_acceptor_lockp;	/* Ptr to tf_lock */
 
-	timeout_id_t	tcp_ordrelid;		/* qbufcall/qtimeout id */
+	mblk_t		*tcp_ordrel_mp;		/* T_ordrel_ind mblk */
 	t_uscalar_t	tcp_acceptor_id;	/* ACCEPTOR_id */
 
 	int		tcp_ipsec_overhead;
@@ -603,6 +600,11 @@
 	 */
 	sodirect_t	*tcp_sodirect;
 
+	/* mblk_t used to enter TCP's squeue from the service routine. */
+	mblk_t		*tcp_rsrv_mp;
+	/* Mutex for accessing tcp_rsrv_mp */
+	kmutex_t	tcp_rsrv_mp_lock;
+
 #ifdef DEBUG
 	pc_t			tcmp_stk[15];
 #endif
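
The two new tcp_t members above are consumed in the tcp.c hunks below: tcp_rsrv() claims the pre-allocated tcp_rsrv_mp under tcp_rsrv_mp_lock (and simply returns if an earlier run still holds it), uses it to enter the squeue, and tcp_rsrv_input() hands it back instead of freeing it, so the service routine never has to allocate under memory pressure. A condensed sketch follows; flow-control, statistics and detach checks are elided, and the example_* names are illustrative only.

/* Runs under squeue protection: return the mblk instead of freeing it. */
/* ARGSUSED */
static void
example_rsrv_input(void *arg, mblk_t *mp, void *arg2)
{
	tcp_t	*tcp = ((conn_t *)arg)->conn_tcp;

	mutex_enter(&tcp->tcp_rsrv_mp_lock);
	tcp->tcp_rsrv_mp = mp;
	mutex_exit(&tcp->tcp_rsrv_mp_lock);

	/* ... drain the receive list and send a window update ... */
}

static void
example_rsrv(queue_t *q)
{
	conn_t	*connp = Q_TO_CONN(q);
	tcp_t	*tcp = connp->conn_tcp;
	mblk_t	*mp;

	/* Claim the pre-allocated mblk; if it is gone, a run is already in flight. */
	mutex_enter(&tcp->tcp_rsrv_mp_lock);
	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
		mutex_exit(&tcp->tcp_rsrv_mp_lock);
		return;
	}
	tcp->tcp_rsrv_mp = NULL;
	mutex_exit(&tcp->tcp_rsrv_mp_lock);

	CONN_INC_REF(connp);
	/* The real code passes tcp_rsrv_input() here. */
	squeue_enter(connp->conn_sqp, mp, example_rsrv_input, connp,
	    SQTAG_TCP_RSRV);
}
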
--- a/usr/src/uts/common/inet/tcp/tcp.c	Wed Nov 05 08:30:52 2008 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp.c	Wed Nov 05 08:38:41 2008 -0800
@@ -1630,6 +1630,7 @@
 	conn_t		*connp = tcp->tcp_connp;
 	tcp_stack_t	*tcps = tcp->tcp_tcps;
 	netstack_t	*ns = tcps->tcps_netstack;
+	mblk_t		*tcp_rsrv_mp;
 
 	tcp_bind_hash_remove(tcp);
 
@@ -1682,6 +1683,7 @@
 	tcp_iphc = tcp->tcp_iphc;
 	tcp_iphc_len = tcp->tcp_iphc_len;
 	tcp_hdr_grown = tcp->tcp_hdr_grown;
+	tcp_rsrv_mp = tcp->tcp_rsrv_mp;
 
 	if (connp->conn_cred != NULL) {
 		crfree(connp->conn_cred);
@@ -1702,6 +1704,7 @@
 	tcp->tcp_iphc = tcp_iphc;
 	tcp->tcp_iphc_len = tcp_iphc_len;
 	tcp->tcp_hdr_grown = tcp_hdr_grown;
+	tcp->tcp_rsrv_mp = tcp_rsrv_mp;
 
 	tcp->tcp_connp = connp;
 
@@ -3863,20 +3866,6 @@
 
 	TCP_STAT(tcps, tcp_clean_death_nondetached);
 
-	/*
-	 * If T_ORDREL_IND has not been sent yet (done when service routine
-	 * is run) postpone cleaning up the endpoint until service routine
-	 * has sent up the T_ORDREL_IND. Avoid clearing out an existing
-	 * client_errno since tcp_close uses the client_errno field.
-	 */
-	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
-		if (err != 0)
-			tcp->tcp_client_errno = err;
-
-		tcp->tcp_deferred_clean_death = B_TRUE;
-		return (-1);
-	}
-
 	/* If sodirect, not anymore */
 	SOD_PTR_ENTER(tcp, sodp);
 	if (sodp != NULL) {
@@ -4206,13 +4195,10 @@
 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
 
-	/* Cancel any pending timeout */
-	if (tcp->tcp_ordrelid != 0) {
-		if (tcp->tcp_timeout) {
-			(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ordrelid);
-		}
-		tcp->tcp_ordrelid = 0;
-		tcp->tcp_timeout = B_FALSE;
+	/* End point has closed this TCP, no need to send up T_ordrel_ind. */
+	if (tcp->tcp_ordrel_mp != NULL) {
+		freeb(tcp->tcp_ordrel_mp);
+		tcp->tcp_ordrel_mp = NULL;
 	}
 
 	mutex_enter(&tcp->tcp_eager_lock);
@@ -5404,6 +5390,7 @@
 
 		ASSERT(tcp->tcp_tcps == NULL);
 		ASSERT(connp->conn_netstack == NULL);
+		ASSERT(tcp->tcp_rsrv_mp != NULL);
 		ns = tcps->tcps_netstack;
 		netstack_hold(ns);
 		connp->conn_netstack = ns;
@@ -5417,8 +5404,18 @@
 	    tcps->tcps_netstack)) == NULL)
 		return (NULL);
 	tcp = connp->conn_tcp;
+	/*
+	 * Pre-allocate the tcp_rsrv_mp.  This mblk will not be freed
+	 * until this conn_t/tcp_t is freed at ipcl_conn_destroy().
+	 */
+	if ((tcp->tcp_rsrv_mp = allocb(0, BPRI_HI)) == NULL) {
+		ipcl_conn_destroy(connp);
+		return (NULL);
+	}
+	mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
 	tcp->tcp_tcps = tcps;
 	TCPS_REFHOLD(tcps);
+
 	return ((void *)connp);
 }
 
@@ -5724,6 +5721,15 @@
 
 	eager = econnp->conn_tcp;
 
+	/*
+	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
+	 * will always have that to send up.  Otherwise, we need to do
+	 * special handling in case the allocation fails at that time.
+	 */
+	ASSERT(eager->tcp_ordrel_mp == NULL);
+	if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL)
+		goto error3;
+
 	/* Inherit various TCP parameters from the listener */
 	eager->tcp_naglim = tcp->tcp_naglim;
 	eager->tcp_first_timer_threshold =
@@ -6187,6 +6193,17 @@
 	}
 
 	/*
+	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
+	 * will always have that to send up.  Otherwise, we need to do
+	 * special handling in case the allocation fails at that time.
+	 */
+	ASSERT(tcp->tcp_ordrel_mp == NULL);
+	if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
+		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
+		return;
+	}
+
+	/*
 	 * Determine packet type based on type of address passed in
 	 * the request should contain an IPv4 or IPv6 address.
 	 * Make sure that address family matches the type of
@@ -7794,6 +7811,10 @@
 		freeb(tcp->tcp_fused_sigurg_mp);
 		tcp->tcp_fused_sigurg_mp = NULL;
 	}
+	if (tcp->tcp_ordrel_mp != NULL) {
+		freeb(tcp->tcp_ordrel_mp);
+		tcp->tcp_ordrel_mp = NULL;
+	}
 
 	/*
 	 * Following is a union with two members which are
@@ -7990,7 +8011,6 @@
 	tcp->tcp_detached = 0;
 	tcp->tcp_bind_pending = 0;
 	tcp->tcp_unbind_pending = 0;
-	tcp->tcp_deferred_clean_death = 0;
 
 	tcp->tcp_snd_ws_ok = B_FALSE;
 	tcp->tcp_snd_ts_ok = B_FALSE;
@@ -8004,7 +8024,6 @@
 	tcp->tcp_set_timer = 0;
 
 	tcp->tcp_active_open = 0;
-	ASSERT(tcp->tcp_timeout == B_FALSE);
 	tcp->tcp_rexmit = B_FALSE;
 	tcp->tcp_xmit_zc_clean = B_FALSE;
 
@@ -8124,7 +8143,7 @@
 
 	PRESERVE(tcp->tcp_acceptor_lockp);
 
-	ASSERT(tcp->tcp_ordrelid == 0);
+	ASSERT(tcp->tcp_ordrel_mp == NULL);
 	PRESERVE(tcp->tcp_acceptor_id);
 	DONTCARE(tcp->tcp_ipsec_overhead);
 
@@ -8198,6 +8217,9 @@
 
 	tcp->tcp_closemp_used = B_FALSE;
 
+	PRESERVE(tcp->tcp_rsrv_mp);
+	PRESERVE(tcp->tcp_rsrv_mp_lock);
+
 #ifdef DEBUG
 	DONTCARE(tcp->tcmp_stk[0]);
 #endif
@@ -15159,8 +15181,7 @@
 	 */
 	if (tcp->tcp_ipv6_recvancillary != 0) {
 		mp = tcp_rput_add_ancillary(tcp, mp, &ipp);
-		if (mp == NULL)
-			return;
+		ASSERT(mp != NULL);
 	}
 
 	if (tcp->tcp_listener || tcp->tcp_hard_binding) {
@@ -15491,36 +15512,10 @@
 			    tcp->tcp_fused_sigurg);
 		}
 
-		if ((mp1 = mi_tpi_ordrel_ind()) != NULL) {
-			tcp->tcp_ordrel_done = B_TRUE;
-			putnext(tcp->tcp_rq, mp1);
-			if (tcp->tcp_deferred_clean_death) {
-				/*
-				 * tcp_clean_death was deferred
-				 * for T_ORDREL_IND - do it now
-				 */
-				(void) tcp_clean_death(tcp,
-				    tcp->tcp_client_errno, 20);
-				tcp->tcp_deferred_clean_death =	B_FALSE;
-			}
-		} else {
-			/*
-			 * Run the orderly release in the
-			 * service routine.
-			 */
-			qenable(tcp->tcp_rq);
-			/*
-			 * Caveat(XXX): The machine may be so
-			 * overloaded that tcp_rsrv() is not scheduled
-			 * until after the endpoint has transitioned
-			 * to TCPS_TIME_WAIT
-			 * and tcp_time_wait_interval expires. Then
-			 * tcp_timer() will blow away state in tcp_t
-			 * and T_ORDREL_IND will never be delivered
-			 * upstream. Unlikely but potentially
-			 * a problem.
-			 */
-		}
+		mp1 = tcp->tcp_ordrel_mp;
+		tcp->tcp_ordrel_mp = NULL;
+		tcp->tcp_ordrel_done = B_TRUE;
+		putnext(tcp->tcp_rq, mp1);
 	}
 done:
 	ASSERT(!(flags & TH_MARKNEXT_NEEDED));
@@ -16229,25 +16224,6 @@
 	putnext(q, mp);
 }
 
-/*
- * Called as the result of a qbufcall or a qtimeout to remedy a failure
- * to allocate a T_ordrel_ind in tcp_rsrv().  qenable(q) will make
- * tcp_rsrv() try again.
- */
-static void
-tcp_ordrel_kick(void *arg)
-{
-	conn_t 	*connp = (conn_t *)arg;
-	tcp_t	*tcp = connp->conn_tcp;
-
-	tcp->tcp_ordrelid = 0;
-	tcp->tcp_timeout = B_FALSE;
-	if (!TCP_IS_DETACHED(tcp) && tcp->tcp_rq != NULL &&
-	    tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
-		qenable(tcp->tcp_rq);
-	}
-}
-
 /* ARGSUSED */
 static void
 tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2)
@@ -16260,7 +16236,9 @@
 	sodirect_t	*sodp;
 	boolean_t	fc;
 
-	freeb(mp);
+	mutex_enter(&tcp->tcp_rsrv_mp_lock);
+	tcp->tcp_rsrv_mp = mp;
+	mutex_exit(&tcp->tcp_rsrv_mp_lock);
 
 	TCP_STAT(tcps, tcp_rsrv_calls);
 
@@ -16348,60 +16326,6 @@
 			BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate);
 		}
 	}
-
-	/* Handle a failure to allocate a T_ORDREL_IND here */
-	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
-		ASSERT(tcp->tcp_listener == NULL);
-
-		SOD_PTR_ENTER(tcp, sodp);
-		if (sodp != NULL) {
-			if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
-				sodp->sod_uioa.uioa_state &= UIOA_CLR;
-				sodp->sod_uioa.uioa_state |= UIOA_FINI;
-			}
-			/* No more sodirect */
-			tcp->tcp_sodirect = NULL;
-			if (!SOD_QEMPTY(sodp)) {
-				/* Notify mblk(s) to process */
-				(void) tcp_rcv_sod_wakeup(tcp, sodp);
-				/* sod_wakeup() does the mutex_exit() */
-			} else {
-				/* Nothing to process */
-				mutex_exit(sodp->sod_lockp);
-			}
-		} else if (tcp->tcp_rcv_list != NULL) {
-			/*
-			 * Push any mblk(s) enqueued from co processing.
-			 */
-			(void) tcp_rcv_drain(tcp->tcp_rq, tcp);
-			ASSERT(tcp->tcp_rcv_list == NULL ||
-			    tcp->tcp_fused_sigurg);
-		}
-
-		mp = mi_tpi_ordrel_ind();
-		if (mp) {
-			tcp->tcp_ordrel_done = B_TRUE;
-			putnext(q, mp);
-			if (tcp->tcp_deferred_clean_death) {
-				/*
-				 * tcp_clean_death was deferred for
-				 * T_ORDREL_IND - do it now
-				 */
-				tcp->tcp_deferred_clean_death = B_FALSE;
-				(void) tcp_clean_death(tcp,
-				    tcp->tcp_client_errno, 22);
-			}
-		} else if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) {
-			/*
-			 * If there isn't already a timer running
-			 * start one.  Use a 4 second
-			 * timer as a fallback since it can't fail.
-			 */
-			tcp->tcp_timeout = B_TRUE;
-			tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick,
-			    MSEC_TO_TICK(4000));
-		}
-	}
 }
 
 /*
@@ -16409,15 +16333,13 @@
  * result of flow control relief.  Since we don't actually queue anything in
  * TCP, we have no data to send out of here.  What we do is clear the receive
  * window, and send out a window update.
- * This routine is also called to drive an orderly release message upstream
- * if the attempt in tcp_rput failed.
  */
 static void
 tcp_rsrv(queue_t *q)
 {
-	conn_t *connp = Q_TO_CONN(q);
-	tcp_t	*tcp = connp->conn_tcp;
-	mblk_t	*mp;
+	conn_t		*connp = Q_TO_CONN(q);
+	tcp_t		*tcp = connp->conn_tcp;
+	mblk_t		*mp;
 	tcp_stack_t	*tcps = tcp->tcp_tcps;
 
 	/* No code does a putq on the read side */
@@ -16428,24 +16350,18 @@
 		return;
 	}
 
-	mp = allocb(0, BPRI_HI);
-	if (mp == NULL) {
-		/*
-		 * We are under memory pressure. Return for now and we
-		 * we will be called again later.
-		 */
-		if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) {
-			/*
-			 * If there isn't already a timer running
-			 * start one.  Use a 4 second
-			 * timer as a fallback since it can't fail.
-			 */
-			tcp->tcp_timeout = B_TRUE;
-			tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick,
-			    MSEC_TO_TICK(4000));
-		}
-		return;
-	}
+	/*
+	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already
+	 * been run.  So just return.
+	 */
+	mutex_enter(&tcp->tcp_rsrv_mp_lock);
+	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
+		mutex_exit(&tcp->tcp_rsrv_mp_lock);
+		return;
+	}
+	tcp->tcp_rsrv_mp = NULL;
+	mutex_exit(&tcp->tcp_rsrv_mp_lock);
+
 	CONN_INC_REF(connp);
 	squeue_enter(connp->conn_sqp, mp, tcp_rsrv_input, connp,
 	    SQTAG_TCP_RSRV);
@@ -18494,26 +18410,10 @@
 	}
 	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
 	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
-		mp = mi_tpi_ordrel_ind();
-		if (mp) {
-			tcp->tcp_ordrel_done = B_TRUE;
-			putnext(q, mp);
-			if (tcp->tcp_deferred_clean_death) {
-				/*
-				 * tcp_clean_death was deferred
-				 * for T_ORDREL_IND - do it now
-				 */
-				(void) tcp_clean_death(tcp,
-				    tcp->tcp_client_errno, 21);
-				tcp->tcp_deferred_clean_death = B_FALSE;
-			}
-		} else {
-			/*
-			 * Run the orderly release in the
-			 * service routine.
-			 */
-			qenable(q);
-		}
+		mp = tcp->tcp_ordrel_mp;
+		tcp->tcp_ordrel_mp = NULL;
+		tcp->tcp_ordrel_done = B_TRUE;
+		putnext(q, mp);
 	}
 	if (tcp->tcp_hard_binding) {
 		tcp->tcp_hard_binding = B_FALSE;