changeset 10312:5076f8129626

6848633 ftp put hangs over ib
author Rao Shoaib <Rao.Shoaib@Sun.COM>
date Fri, 14 Aug 2009 13:15:43 -0700
parents 539b18426dae
children ca42e2f0424a
files usr/src/uts/common/inet/tcp.h usr/src/uts/common/inet/tcp/tcp.c usr/src/uts/common/inet/tcp/tcp_fusion.c usr/src/uts/common/inet/tcp_impl.h
diffstat 4 files changed, 74 insertions(+), 114 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/inet/tcp.h	Fri Aug 14 11:48:37 2009 -0700
+++ b/usr/src/uts/common/inet/tcp.h	Fri Aug 14 13:15:43 2009 -0700
@@ -536,7 +536,6 @@
 	 */
 	struct tcp_s *tcp_loopback_peer;	/* peer tcp for loopback */
 	mblk_t	*tcp_fused_sigurg_mp;		/* M_PCSIG mblk for SIGURG */
-	size_t	tcp_fuse_rcv_hiwater;		/* fusion receive queue size */
 
 	uint32_t
 		tcp_fused : 1,		/* loopback tcp in fusion mode */
--- a/usr/src/uts/common/inet/tcp/tcp.c	Fri Aug 14 11:48:37 2009 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp.c	Fri Aug 14 13:15:43 2009 -0700
@@ -789,7 +789,6 @@
 static void	tcp_process_options(tcp_t *, tcph_t *);
 static void	tcp_rput_common(tcp_t *tcp, mblk_t *mp);
 static void	tcp_rsrv(queue_t *q);
-static int	tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
 static int	tcp_snmp_state(tcp_t *tcp);
 static void	tcp_timer(void *arg);
 static void	tcp_timer_callback(void *);
@@ -1373,6 +1372,28 @@
 	    iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha,		\
 	    ip6_t *, ip6h, int, 0);
 
+static void
+tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh)
+{
+	uint32_t default_threshold = SOCKET_RECVHIWATER >> 3;
+
+	if (IPCL_IS_NONSTR(tcp->tcp_connp)) {
+		conn_t *connp = tcp->tcp_connp;
+		struct sock_proto_props sopp;
+
+		/*
+		 * Cap the threshold at default_threshold before the upcall.
+		 */
+		if (new_rcvthresh > default_threshold)
+			new_rcvthresh = default_threshold;
+
+		sopp.sopp_flags = SOCKOPT_RCVTHRESH;
+		sopp.sopp_rcvthresh = new_rcvthresh;
+
+		(*connp->conn_upcalls->su_set_proto_props)
+		    (connp->conn_upper_handle, &sopp);
+	}
+}
 /*
  * Figure out the value of window scale opton.  Note that the rwnd is
  * ASSUMED to be rounded up to the nearest MSS before the calculation.
@@ -5536,10 +5557,10 @@
 	}
 
 	/*
-	 * listener->tcp_rq->q_hiwat should be the default window size or a
-	 * window size changed via SO_RCVBUF option.  First round up the
-	 * eager's tcp_rwnd to the nearest MSS.  Then find out the window
-	 * scale option value if needed.  Call tcp_rwnd_set() to finish the
+	 * listener's tcp_recv_hiwater should be the default window size or a
+	 * window size changed via SO_RCVBUF option. First round up the
+	 * eager's tcp_rwnd to the nearest MSS. Then find out the window
+	 * scale option value if needed. Call tcp_rwnd_set() to finish the
 	 * setting.
 	 *
 	 * Note if there is a rpipe metric associated with the remote host,
@@ -7568,8 +7589,6 @@
 	tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6;
 
 	ASSERT(tcp->tcp_ptpbhn != NULL);
-	if (!IPCL_IS_NONSTR(tcp->tcp_connp))
-		tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat;
 	tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat;
 	tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat;
 	tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
@@ -7857,7 +7876,7 @@
 	tcp->tcp_unfusable = B_FALSE;
 	tcp->tcp_fused_sigurg = B_FALSE;
 	tcp->tcp_loopback_peer = NULL;
-	tcp->tcp_fuse_rcv_hiwater = 0;
+	tcp->tcp_recv_hiwater = 0;
 
 	tcp->tcp_lso = B_FALSE;
 
@@ -7961,7 +7980,7 @@
 	tcp->tcp_unfusable = B_FALSE;
 	tcp->tcp_fused_sigurg = B_FALSE;
 	tcp->tcp_loopback_peer = NULL;
-	tcp->tcp_fuse_rcv_hiwater = 0;
+	tcp->tcp_recv_hiwater = 0;
 
 	/* Initialize the header template */
 	if (tcp->tcp_ipversion == IPV4_VERSION) {
@@ -8902,7 +8921,7 @@
 	if (TCP_IS_DETACHED(tcp))
 		return (mss);
 	if (tcp->tcp_fused) {
-		maxpsz = tcp_fuse_maxpsz_set(tcp);
+		maxpsz = tcp_fuse_maxpsz(tcp);
 		mss = INFPSZ;
 	} else if (tcp->tcp_mdt || tcp->tcp_lso || tcp->tcp_maxpsz == 0) {
 		/*
@@ -9321,8 +9340,6 @@
 			return (NULL);
 		}
 		q = connp->conn_rq;
-	} else {
-		RD(q)->q_hiwat = tcps->tcps_recv_hiwat;
 	}
 
 	SOCK_CONNID_INIT(tcp->tcp_connid);
@@ -15650,7 +15667,7 @@
 
 	if (canputnext(q)) {
 		/* Not flow-controlled, open rwnd */
-		tcp->tcp_rwnd = q->q_hiwat;
+		tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
 
 		/*
 		 * Send back a window update immediately if TCP is above
@@ -15725,7 +15742,7 @@
  * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the
  * user requests so.
  */
-static int
+int
 tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
 {
 	uint32_t	mss = tcp->tcp_mss;
@@ -15739,25 +15756,11 @@
 		tcp_t *peer_tcp = tcp->tcp_loopback_peer;
 
 		ASSERT(peer_tcp != NULL);
-		/*
-		 * Record the stream head's high water mark for
-		 * this endpoint; this is used for flow-control
-		 * purposes in tcp_fuse_output().
-		 */
 		sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd);
 		if (!tcp_detached) {
 			(void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp,
 			    sth_hiwat);
-			if (IPCL_IS_NONSTR(tcp->tcp_connp)) {
-				conn_t *connp = tcp->tcp_connp;
-				struct sock_proto_props sopp;
-
-				sopp.sopp_flags = SOCKOPT_RCVTHRESH;
-				sopp.sopp_rcvthresh = sth_hiwat >> 3;
-
-				(*connp->conn_upcalls->su_set_proto_props)
-				    (connp->conn_upper_handle, &sopp);
-			}
+			tcp_set_recv_threshold(tcp, sth_hiwat >> 3);
 		}
 
 		/*
@@ -15767,7 +15770,7 @@
 		 * have changed we need to update the peer's maxpsz.
 		 */
 		(void) tcp_maxpsz_set(peer_tcp, B_TRUE);
-		return (rwnd);
+		return (sth_hiwat);
 	}
 
 	if (tcp_detached) {
@@ -15840,14 +15843,11 @@
 
 	if (tcp_detached)
 		return (rwnd);
-	/*
-	 * We set the maximum receive window into rq->q_hiwat if it is
-	 * a STREAMS socket.
-	 * This is not actually used for flow control.
-	 */
-	if (!IPCL_IS_NONSTR(tcp->tcp_connp))
-		tcp->tcp_rq->q_hiwat = rwnd;
+
+	tcp_set_recv_threshold(tcp, rwnd >> 3);
+
 	tcp->tcp_recv_hiwater = rwnd;
+
 	/*
 	 * Set the STREAM head high water mark. This doesn't have to be
 	 * here, since we are simply using default values, but we would
@@ -17258,19 +17258,13 @@
 	}
 
 	/*
-	 * Set the max window size (tcp_rq->q_hiwat) of the acceptor
-	 * properly.  This is the first time we know of the acceptor'
-	 * queue.  So we do it here.
-	 *
-	 * XXX
+	 * Set max window size (tcp_recv_hiwater) of the acceptor.
 	 */
 	if (tcp->tcp_rcv_list == NULL) {
 		/*
 		 * Recv queue is empty, tcp_rwnd should not have changed.
 		 * That means it should be equal to the listener's tcp_rwnd.
 		 */
-		if (!IPCL_IS_NONSTR(connp))
-			tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd;
 		tcp->tcp_recv_hiwater = tcp->tcp_rwnd;
 	} else {
 #ifdef DEBUG
@@ -17286,8 +17280,6 @@
 		ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt);
 #endif
 		/* There is some data, add them back to get the max. */
-		if (!IPCL_IS_NONSTR(connp))
-			tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt;
 		tcp->tcp_recv_hiwater = tcp->tcp_rwnd + tcp->tcp_rcv_cnt;
 	}
 	/*
@@ -17298,10 +17290,6 @@
 	sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF;
 	sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
 
-	/*
-	 * Record the stream head's high water mark for this endpoint;
-	 * this is used for flow-control purposes.
-	 */
 	sopp_rxhiwat = tcp->tcp_fused ?
 	    tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) :
 	    MAX(tcp->tcp_recv_hiwater, tcps->tcps_sth_rcv_hiwat);
@@ -17455,7 +17443,7 @@
 			/* We drain directly in case of fused tcp loopback */
 
 			if (!tcp->tcp_fused && canputnext(q)) {
-				tcp->tcp_rwnd = q->q_hiwat;
+				tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
 				if (tcp->tcp_state >= TCPS_ESTABLISHED &&
 				    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
 					tcp_xmit_ctl(NULL,
@@ -25639,7 +25627,6 @@
 	cred_t	*ecr;
 	ts_label_t	*tsl;
 	uint32_t	mss;
-	queue_t	*q = tcp->tcp_rq;
 	conn_t	*connp = tcp->tcp_connp;
 	tcp_stack_t	*tcps = tcp->tcp_tcps;
 
@@ -25748,8 +25735,6 @@
 		 */
 		tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss),
 		    tcps->tcps_recv_hiwat_minmss * mss);
-		if (!IPCL_IS_NONSTR(connp))
-			q->q_hiwat = tcp->tcp_rwnd;
 		tcp->tcp_recv_hiwater = tcp->tcp_rwnd;
 		tcp_set_ws_value(tcp);
 		U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws),
@@ -26799,9 +26784,7 @@
 	    tcp->tcp_tcps->tcps_wroff_xtra);
 	if (tcp->tcp_snd_sack_ok)
 		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
-	stropt->so_hiwat = tcp->tcp_fused ?
-	    tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) :
-	    MAX(tcp->tcp_recv_hiwater, tcp->tcp_tcps->tcps_sth_rcv_hiwat);
+	stropt->so_hiwat = tcp->tcp_recv_hiwater;
 	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
 
 	putnext(RD(q), stropt_mp);
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c	Fri Aug 14 11:48:37 2009 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c	Fri Aug 14 13:15:43 2009 -0700
@@ -136,14 +136,14 @@
 	ASSERT(tcp->tcp_loopback);
 	ASSERT(tcp->tcp_loopback_peer == NULL);
 	/*
-	 * We need to inherit q_hiwat of the listener tcp, but we can't
-	 * really use tcp_listener since we get here after sending up
-	 * T_CONN_IND and tcp_wput_accept() may be called independently,
-	 * at which point tcp_listener is cleared; this is why we use
-	 * tcp_saved_listener.  The listener itself is guaranteed to be
-	 * around until tcp_accept_finish() is called on this eager --
-	 * this won't happen until we're done since we're inside the
-	 * eager's perimeter now.
+	 * We need to inherit tcp_recv_hiwater of the listener tcp,
+	 * but we can't really use tcp_listener since we get here after
+	 * sending up T_CONN_IND and tcp_wput_accept() may be called
+	 * independently, at which point tcp_listener is cleared;
+	 * this is why we use tcp_saved_listener. The listener itself
+	 * is guaranteed to be around until tcp_accept_finish() is called
+	 * on this eager -- this won't happen until we're done since we're
+	 * inside the eager's perimeter now.
 	 *
 	 * We can also get called in the case were a connection needs
 	 * to be re-fused. In this case tcp_saved_listener will be
@@ -272,29 +272,19 @@
 		tcp_timers_stop(tcp);
 		tcp_timers_stop(peer_tcp);
 
-		/*
-		 * At this point we are a detached eager tcp and therefore
-		 * don't have a queue assigned to us until accept happens.
-		 * In the mean time the peer endpoint may immediately send
-		 * us data as soon as fusion is finished, and we need to be
-		 * able to flow control it in case it sends down huge amount
-		 * of data while we're still detached.  To prevent that we
-		 * inherit the listener's recv_hiwater value; this is temporary
-		 * since we'll repeat the process in tcp_accept_finish().
-		 */
 		if (!tcp->tcp_refuse) {
-			(void) tcp_fuse_set_rcv_hiwat(tcp,
-			    tcp->tcp_saved_listener->tcp_recv_hiwater);
+			/*
+			 * Set receive buffer and max packet size for the
+			 * active open tcp.
+			 * The eager's values are set in tcp_accept_finish().
+			 */
+
+			(void) tcp_rwnd_set(peer_tcp,
+			    peer_tcp->tcp_recv_hiwater);
 
 			/*
-			 * Set the stream head's write offset value to zero
-			 * since we won't be needing any room for TCP/IP
-			 * headers; tell it to not break up the writes (this
-			 * would reduce the amount of work done by kmem); and
-			 * configure our receive buffer. Note that we can only
-			 * do this for the active connect tcp since our eager is
-			 * still detached; it will be dealt with later in
-			 * tcp_accept_finish().
+			 * Set the write offset value to zero since we won't
+			 * be needing any room for TCP/IP headers.
 			 */
 			if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) {
 				struct stroptions *stropt;
@@ -303,19 +293,9 @@
 				mp->b_wptr += sizeof (*stropt);
 
 				stropt = (struct stroptions *)mp->b_rptr;
-				stropt->so_flags = SO_MAXBLK|SO_WROFF|SO_HIWAT;
-				stropt->so_maxblk = tcp_maxpsz_set(peer_tcp,
-				    B_FALSE);
+				stropt->so_flags = SO_WROFF;
 				stropt->so_wroff = 0;
 
-				/*
-				 * Record the stream head's high water mark for
-				 * peer endpoint; this is used for flow-control
-				 * purposes in tcp_fuse_output().
-				 */
-				stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(
-				    peer_tcp, peer_rq->q_hiwat);
-
 				/* Send the options up */
 				putnext(peer_rq, mp);
 			} else {
@@ -324,16 +304,8 @@
 				/* The peer is a non-STREAMS end point */
 				ASSERT(IPCL_IS_TCP(peer_connp));
 
-				(void) tcp_fuse_set_rcv_hiwat(tcp,
-				    tcp->tcp_saved_listener->tcp_recv_hiwater);
-
-				sopp.sopp_flags = SOCKOPT_MAXBLK |
-				    SOCKOPT_WROFF | SOCKOPT_RCVHIWAT;
-				sopp.sopp_maxblk = tcp_maxpsz_set(peer_tcp,
-				    B_FALSE);
+				sopp.sopp_flags = SOCKOPT_WROFF;
 				sopp.sopp_wroff = 0;
-				sopp.sopp_rxhiwat = tcp_fuse_set_rcv_hiwat(
-				    peer_tcp, peer_tcp->tcp_recv_hiwater);
 				(*peer_connp->conn_upcalls->su_set_proto_props)
 				    (peer_connp->conn_upper_handle, &sopp);
 			}
@@ -789,7 +761,7 @@
 	mutex_enter(&tcp->tcp_non_sq_lock);
 	flow_stopped = tcp->tcp_flow_stopped;
 	if ((TCP_IS_DETACHED(peer_tcp) &&
-	    (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_fuse_rcv_hiwater)) ||
+	    (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_recv_hiwater)) ||
 	    (!TCP_IS_DETACHED(peer_tcp) &&
 	    !IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
 	    !canputnext(peer_tcp->tcp_rq))) {
@@ -989,7 +961,12 @@
 	 * after SO_SNDBUF; the latter is also similarly rounded up.
 	 */
 	rwnd = P2ROUNDUP_TYPED(rwnd, PAGESIZE, size_t);
-	tcp->tcp_fuse_rcv_hiwater = rwnd;
+
+	/*
+	 * Record the high water mark; this is used for flow-control
+	 * purposes in tcp_fuse_output().
+	 */
+	tcp->tcp_recv_hiwater = rwnd;
 	return (rwnd);
 }
 
@@ -997,7 +974,7 @@
  * Calculate the maximum outstanding unread data block for a fused tcp endpoint.
  */
 int
-tcp_fuse_maxpsz_set(tcp_t *tcp)
+tcp_fuse_maxpsz(tcp_t *tcp)
 {
 	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
 	uint_t sndbuf = tcp->tcp_xmit_hiwater;
@@ -1005,7 +982,7 @@
 
 	ASSERT(tcp->tcp_fused);
 	ASSERT(peer_tcp != NULL);
-	ASSERT(peer_tcp->tcp_fuse_rcv_hiwater != 0);
+	ASSERT(peer_tcp->tcp_recv_hiwater != 0);
 	/*
 	 * In the fused loopback case, we want the stream head to split
 	 * up larger writes into smaller chunks for a more accurate flow-
@@ -1014,8 +991,8 @@
 	 * We round up the buffer to system page size due to the lack of
 	 * TCP MSS concept in Fusion.
 	 */
-	if (maxpsz > peer_tcp->tcp_fuse_rcv_hiwater)
-		maxpsz = peer_tcp->tcp_fuse_rcv_hiwater;
+	if (maxpsz > peer_tcp->tcp_recv_hiwater)
+		maxpsz = peer_tcp->tcp_recv_hiwater;
 	maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1;
 
 	return (maxpsz);
--- a/usr/src/uts/common/inet/tcp_impl.h	Fri Aug 14 11:48:37 2009 -0700
+++ b/usr/src/uts/common/inet/tcp_impl.h	Fri Aug 14 13:15:43 2009 -0700
@@ -222,8 +222,9 @@
 extern void	tcp_fuse_output_urg(tcp_t *, mblk_t *);
 extern boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **);
 extern size_t	tcp_fuse_set_rcv_hiwat(tcp_t *, size_t);
-extern int	tcp_fuse_maxpsz_set(tcp_t *);
-extern void	tcp_fuse_backenable(tcp_t *tcp);
+extern int	tcp_fuse_maxpsz(tcp_t *);
+extern void	tcp_fuse_backenable(tcp_t *);
+extern int	tcp_rwnd_set(tcp_t *, uint32_t);
 
 /*
  * Object to represent database of options to search passed to