Mercurial > illumos > illumos-gate
changeset 10312:5076f8129626
6848633 ftp put hangs over ib
author | Rao Shoaib <Rao.Shoaib@Sun.COM> |
---|---|
date | Fri, 14 Aug 2009 13:15:43 -0700 |
parents | 539b18426dae |
children | ca42e2f0424a |
files | usr/src/uts/common/inet/tcp.h usr/src/uts/common/inet/tcp/tcp.c usr/src/uts/common/inet/tcp/tcp_fusion.c usr/src/uts/common/inet/tcp_impl.h |
diffstat | 4 files changed, 74 insertions(+), 114 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/common/inet/tcp.h Fri Aug 14 11:48:37 2009 -0700 +++ b/usr/src/uts/common/inet/tcp.h Fri Aug 14 13:15:43 2009 -0700 @@ -536,7 +536,6 @@ */ struct tcp_s *tcp_loopback_peer; /* peer tcp for loopback */ mblk_t *tcp_fused_sigurg_mp; /* M_PCSIG mblk for SIGURG */ - size_t tcp_fuse_rcv_hiwater; /* fusion receive queue size */ uint32_t tcp_fused : 1, /* loopback tcp in fusion mode */
--- a/usr/src/uts/common/inet/tcp/tcp.c Fri Aug 14 11:48:37 2009 -0700 +++ b/usr/src/uts/common/inet/tcp/tcp.c Fri Aug 14 13:15:43 2009 -0700 @@ -789,7 +789,6 @@ static void tcp_process_options(tcp_t *, tcph_t *); static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); static void tcp_rsrv(queue_t *q); -static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd); static int tcp_snmp_state(tcp_t *tcp); static void tcp_timer(void *arg); static void tcp_timer_callback(void *); @@ -1373,6 +1372,28 @@ iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, \ ip6_t *, ip6h, int, 0); +static void +tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh) +{ + uint32_t default_threshold = SOCKET_RECVHIWATER >> 3; + + if (IPCL_IS_NONSTR(tcp->tcp_connp)) { + conn_t *connp = tcp->tcp_connp; + struct sock_proto_props sopp; + + /* + * only increase rcvthresh upto default_threshold + */ + if (new_rcvthresh > default_threshold) + new_rcvthresh = default_threshold; + + sopp.sopp_flags = SOCKOPT_RCVTHRESH; + sopp.sopp_rcvthresh = new_rcvthresh; + + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); + } +} /* * Figure out the value of window scale opton. Note that the rwnd is * ASSUMED to be rounded up to the nearest MSS before the calculation. @@ -5536,10 +5557,10 @@ } /* - * listener->tcp_rq->q_hiwat should be the default window size or a - * window size changed via SO_RCVBUF option. First round up the - * eager's tcp_rwnd to the nearest MSS. Then find out the window - * scale option value if needed. Call tcp_rwnd_set() to finish the + * listeners tcp_recv_hiwater should be the default window size or a + * window size changed via SO_RCVBUF option. First round up the + * eager's tcp_rwnd to the nearest MSS. Then find out the window + * scale option value if needed. Call tcp_rwnd_set() to finish the * setting. * * Note if there is a rpipe metric associated with the remote host, @@ -7568,8 +7589,6 @@ tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6; ASSERT(tcp->tcp_ptpbhn != NULL); - if (!IPCL_IS_NONSTR(tcp->tcp_connp)) - tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat; tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat; tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat; tcp->tcp_rwnd = tcps->tcps_recv_hiwat; @@ -7857,7 +7876,7 @@ tcp->tcp_unfusable = B_FALSE; tcp->tcp_fused_sigurg = B_FALSE; tcp->tcp_loopback_peer = NULL; - tcp->tcp_fuse_rcv_hiwater = 0; + tcp->tcp_recv_hiwater = 0; tcp->tcp_lso = B_FALSE; @@ -7961,7 +7980,7 @@ tcp->tcp_unfusable = B_FALSE; tcp->tcp_fused_sigurg = B_FALSE; tcp->tcp_loopback_peer = NULL; - tcp->tcp_fuse_rcv_hiwater = 0; + tcp->tcp_recv_hiwater = 0; /* Initialize the header template */ if (tcp->tcp_ipversion == IPV4_VERSION) { @@ -8902,7 +8921,7 @@ if (TCP_IS_DETACHED(tcp)) return (mss); if (tcp->tcp_fused) { - maxpsz = tcp_fuse_maxpsz_set(tcp); + maxpsz = tcp_fuse_maxpsz(tcp); mss = INFPSZ; } else if (tcp->tcp_mdt || tcp->tcp_lso || tcp->tcp_maxpsz == 0) { /* @@ -9321,8 +9340,6 @@ return (NULL); } q = connp->conn_rq; - } else { - RD(q)->q_hiwat = tcps->tcps_recv_hiwat; } SOCK_CONNID_INIT(tcp->tcp_connid); @@ -15650,7 +15667,7 @@ if (canputnext(q)) { /* Not flow-controlled, open rwnd */ - tcp->tcp_rwnd = q->q_hiwat; + tcp->tcp_rwnd = tcp->tcp_recv_hiwater; /* * Send back a window update immediately if TCP is above @@ -15725,7 +15742,7 @@ * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the * user requests so. */ -static int +int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) { uint32_t mss = tcp->tcp_mss; @@ -15739,25 +15756,11 @@ tcp_t *peer_tcp = tcp->tcp_loopback_peer; ASSERT(peer_tcp != NULL); - /* - * Record the stream head's high water mark for - * this endpoint; this is used for flow-control - * purposes in tcp_fuse_output(). - */ sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); if (!tcp_detached) { (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp, sth_hiwat); - if (IPCL_IS_NONSTR(tcp->tcp_connp)) { - conn_t *connp = tcp->tcp_connp; - struct sock_proto_props sopp; - - sopp.sopp_flags = SOCKOPT_RCVTHRESH; - sopp.sopp_rcvthresh = sth_hiwat >> 3; - - (*connp->conn_upcalls->su_set_proto_props) - (connp->conn_upper_handle, &sopp); - } + tcp_set_recv_threshold(tcp, sth_hiwat >> 3); } /* @@ -15767,7 +15770,7 @@ * have changed we need to update the peer's maxpsz. */ (void) tcp_maxpsz_set(peer_tcp, B_TRUE); - return (rwnd); + return (sth_hiwat); } if (tcp_detached) { @@ -15840,14 +15843,11 @@ if (tcp_detached) return (rwnd); - /* - * We set the maximum receive window into rq->q_hiwat if it is - * a STREAMS socket. - * This is not actually used for flow control. - */ - if (!IPCL_IS_NONSTR(tcp->tcp_connp)) - tcp->tcp_rq->q_hiwat = rwnd; + + tcp_set_recv_threshold(tcp, rwnd >> 3); + tcp->tcp_recv_hiwater = rwnd; + /* * Set the STREAM head high water mark. This doesn't have to be * here, since we are simply using default values, but we would @@ -17258,19 +17258,13 @@ } /* - * Set the max window size (tcp_rq->q_hiwat) of the acceptor - * properly. This is the first time we know of the acceptor' - * queue. So we do it here. - * - * XXX + * Set max window size (tcp_recv_hiwater) of the acceptor. */ if (tcp->tcp_rcv_list == NULL) { /* * Recv queue is empty, tcp_rwnd should not have changed. * That means it should be equal to the listener's tcp_rwnd. */ - if (!IPCL_IS_NONSTR(connp)) - tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd; tcp->tcp_recv_hiwater = tcp->tcp_rwnd; } else { #ifdef DEBUG @@ -17286,8 +17280,6 @@ ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt); #endif /* There is some data, add them back to get the max. */ - if (!IPCL_IS_NONSTR(connp)) - tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; tcp->tcp_recv_hiwater = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; } /* @@ -17298,10 +17290,6 @@ sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); - /* - * Record the stream head's high water mark for this endpoint; - * this is used for flow-control purposes. - */ sopp_rxhiwat = tcp->tcp_fused ? tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) : MAX(tcp->tcp_recv_hiwater, tcps->tcps_sth_rcv_hiwat); @@ -17455,7 +17443,7 @@ /* We drain directly in case of fused tcp loopback */ if (!tcp->tcp_fused && canputnext(q)) { - tcp->tcp_rwnd = q->q_hiwat; + tcp->tcp_rwnd = tcp->tcp_recv_hiwater; if (tcp->tcp_state >= TCPS_ESTABLISHED && tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { tcp_xmit_ctl(NULL, @@ -25639,7 +25627,6 @@ cred_t *ecr; ts_label_t *tsl; uint32_t mss; - queue_t *q = tcp->tcp_rq; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; @@ -25748,8 +25735,6 @@ */ tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), tcps->tcps_recv_hiwat_minmss * mss); - if (!IPCL_IS_NONSTR(connp)) - q->q_hiwat = tcp->tcp_rwnd; tcp->tcp_recv_hiwater = tcp->tcp_rwnd; tcp_set_ws_value(tcp); U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), @@ -26799,9 +26784,7 @@ tcp->tcp_tcps->tcps_wroff_xtra); if (tcp->tcp_snd_sack_ok) stropt->so_wroff += TCPOPT_MAX_SACK_LEN; - stropt->so_hiwat = tcp->tcp_fused ? - tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) : - MAX(tcp->tcp_recv_hiwater, tcp->tcp_tcps->tcps_sth_rcv_hiwat); + stropt->so_hiwat = tcp->tcp_recv_hiwater; stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); putnext(RD(q), stropt_mp);
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c Fri Aug 14 11:48:37 2009 -0700 +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c Fri Aug 14 13:15:43 2009 -0700 @@ -136,14 +136,14 @@ ASSERT(tcp->tcp_loopback); ASSERT(tcp->tcp_loopback_peer == NULL); /* - * We need to inherit q_hiwat of the listener tcp, but we can't - * really use tcp_listener since we get here after sending up - * T_CONN_IND and tcp_wput_accept() may be called independently, - * at which point tcp_listener is cleared; this is why we use - * tcp_saved_listener. The listener itself is guaranteed to be - * around until tcp_accept_finish() is called on this eager -- - * this won't happen until we're done since we're inside the - * eager's perimeter now. + * We need to inherit tcp_recv_hiwater of the listener tcp, + * but we can't really use tcp_listener since we get here after + * sending up T_CONN_IND and tcp_wput_accept() may be called + * independently, at which point tcp_listener is cleared; + * this is why we use tcp_saved_listener. The listener itself + * is guaranteed to be around until tcp_accept_finish() is called + * on this eager -- this won't happen until we're done since we're + * inside the eager's perimeter now. * * We can also get called in the case were a connection needs * to be re-fused. In this case tcp_saved_listener will be @@ -272,29 +272,19 @@ tcp_timers_stop(tcp); tcp_timers_stop(peer_tcp); - /* - * At this point we are a detached eager tcp and therefore - * don't have a queue assigned to us until accept happens. - * In the mean time the peer endpoint may immediately send - * us data as soon as fusion is finished, and we need to be - * able to flow control it in case it sends down huge amount - * of data while we're still detached. To prevent that we - * inherit the listener's recv_hiwater value; this is temporary - * since we'll repeat the process in tcp_accept_finish(). - */ if (!tcp->tcp_refuse) { - (void) tcp_fuse_set_rcv_hiwat(tcp, - tcp->tcp_saved_listener->tcp_recv_hiwater); + /* + * Set receive buffer and max packet size for the + * active open tcp. + * eager's values will be set in tcp_accept_finish. + */ + + (void) tcp_rwnd_set(peer_tcp, + peer_tcp->tcp_recv_hiwater); /* - * Set the stream head's write offset value to zero - * since we won't be needing any room for TCP/IP - * headers; tell it to not break up the writes (this - * would reduce the amount of work done by kmem); and - * configure our receive buffer. Note that we can only - * do this for the active connect tcp since our eager is - * still detached; it will be dealt with later in - * tcp_accept_finish(). + * Set the write offset value to zero since we won't + * be needing any room for TCP/IP headers. */ if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) { struct stroptions *stropt; @@ -303,19 +293,9 @@ mp->b_wptr += sizeof (*stropt); stropt = (struct stroptions *)mp->b_rptr; - stropt->so_flags = SO_MAXBLK|SO_WROFF|SO_HIWAT; - stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, - B_FALSE); + stropt->so_flags = SO_WROFF; stropt->so_wroff = 0; - /* - * Record the stream head's high water mark for - * peer endpoint; this is used for flow-control - * purposes in tcp_fuse_output(). - */ - stropt->so_hiwat = tcp_fuse_set_rcv_hiwat( - peer_tcp, peer_rq->q_hiwat); - /* Send the options up */ putnext(peer_rq, mp); } else { @@ -324,16 +304,8 @@ /* The peer is a non-STREAMS end point */ ASSERT(IPCL_IS_TCP(peer_connp)); - (void) tcp_fuse_set_rcv_hiwat(tcp, - tcp->tcp_saved_listener->tcp_recv_hiwater); - - sopp.sopp_flags = SOCKOPT_MAXBLK | - SOCKOPT_WROFF | SOCKOPT_RCVHIWAT; - sopp.sopp_maxblk = tcp_maxpsz_set(peer_tcp, - B_FALSE); + sopp.sopp_flags = SOCKOPT_WROFF; sopp.sopp_wroff = 0; - sopp.sopp_rxhiwat = tcp_fuse_set_rcv_hiwat( - peer_tcp, peer_tcp->tcp_recv_hiwater); (*peer_connp->conn_upcalls->su_set_proto_props) (peer_connp->conn_upper_handle, &sopp); } @@ -789,7 +761,7 @@ mutex_enter(&tcp->tcp_non_sq_lock); flow_stopped = tcp->tcp_flow_stopped; if ((TCP_IS_DETACHED(peer_tcp) && - (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_fuse_rcv_hiwater)) || + (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_recv_hiwater)) || (!TCP_IS_DETACHED(peer_tcp) && !IPCL_IS_NONSTR(peer_tcp->tcp_connp) && !canputnext(peer_tcp->tcp_rq))) { @@ -989,7 +961,12 @@ * after SO_SNDBUF; the latter is also similarly rounded up. */ rwnd = P2ROUNDUP_TYPED(rwnd, PAGESIZE, size_t); - tcp->tcp_fuse_rcv_hiwater = rwnd; + + /* + * Record high water mark, this is used for flow-control + * purposes in tcp_fuse_output(). + */ + tcp->tcp_recv_hiwater = rwnd; return (rwnd); } @@ -997,7 +974,7 @@ * Calculate the maximum outstanding unread data block for a fused tcp endpoint. */ int -tcp_fuse_maxpsz_set(tcp_t *tcp) +tcp_fuse_maxpsz(tcp_t *tcp) { tcp_t *peer_tcp = tcp->tcp_loopback_peer; uint_t sndbuf = tcp->tcp_xmit_hiwater; @@ -1005,7 +982,7 @@ ASSERT(tcp->tcp_fused); ASSERT(peer_tcp != NULL); - ASSERT(peer_tcp->tcp_fuse_rcv_hiwater != 0); + ASSERT(peer_tcp->tcp_recv_hiwater != 0); /* * In the fused loopback case, we want the stream head to split * up larger writes into smaller chunks for a more accurate flow- @@ -1014,8 +991,8 @@ * We round up the buffer to system page size due to the lack of * TCP MSS concept in Fusion. */ - if (maxpsz > peer_tcp->tcp_fuse_rcv_hiwater) - maxpsz = peer_tcp->tcp_fuse_rcv_hiwater; + if (maxpsz > peer_tcp->tcp_recv_hiwater) + maxpsz = peer_tcp->tcp_recv_hiwater; maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1; return (maxpsz);
--- a/usr/src/uts/common/inet/tcp_impl.h Fri Aug 14 11:48:37 2009 -0700 +++ b/usr/src/uts/common/inet/tcp_impl.h Fri Aug 14 13:15:43 2009 -0700 @@ -222,8 +222,9 @@ extern void tcp_fuse_output_urg(tcp_t *, mblk_t *); extern boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **); extern size_t tcp_fuse_set_rcv_hiwat(tcp_t *, size_t); -extern int tcp_fuse_maxpsz_set(tcp_t *); -extern void tcp_fuse_backenable(tcp_t *tcp); +extern int tcp_fuse_maxpsz(tcp_t *); +extern void tcp_fuse_backenable(tcp_t *); +extern int tcp_rwnd_set(tcp_t *, uint32_t); /* * Object to represent database of options to search passed to