changeset 3795:4204ffa31717

6446494 Single homed host has problem in SCTP INIT address parameter
6530148 Zero window handling still has problems
6531684 Staleness in "Stale Cookie Error" chunk reports the wrong time
author kcpoon
date Fri, 09 Mar 2007 17:41:22 -0800
parents 59c013fd65b1
children 3dc60af20942
files usr/src/cmd/mdb/common/modules/sctp/sctp.c usr/src/uts/common/inet/sctp/sctp_common.c usr/src/uts/common/inet/sctp/sctp_cookie.c usr/src/uts/common/inet/sctp/sctp_impl.h usr/src/uts/common/inet/sctp/sctp_init.c usr/src/uts/common/inet/sctp/sctp_input.c usr/src/uts/common/inet/sctp/sctp_output.c
diffstat 7 files changed, 113 insertions(+), 69 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/mdb/common/modules/sctp/sctp.c	Fri Mar 09 16:52:18 2007 -0800
+++ b/usr/src/cmd/mdb/common/modules/sctp/sctp.c	Fri Mar 09 17:41:22 2007 -0800
@@ -163,6 +163,7 @@
 	mdb_printf("pba\t\t%?u\tacked\t\t%?u\n", fa->pba, fa->acked);
 	mdb_printf("lastactive\t%?ld\thb_secret\t%?#lx\n", fa->lastactive,
 	    fa->hb_secret);
+	mdb_printf("rxt_unacked\t\t%?u\n", fa->rxt_unacked);
 	mdb_printf("timer_mp\t%?p\tire\t\t%?p\n", fa->timer_mp, fa->ire);
 	mdb_printf("hb_pending\t%?d\ttimer_running\t%?d\n"
 	    "df\t\t%?d\tpmtu_discovered\t%?d\n"
--- a/usr/src/uts/common/inet/sctp/sctp_common.c	Fri Mar 09 16:52:18 2007 -0800
+++ b/usr/src/uts/common/inet/sctp/sctp_common.c	Fri Mar 09 17:41:22 2007 -0800
@@ -232,8 +232,8 @@
 		/* Make sure that sfa_pmss is a multiple of SCTP_ALIGN. */
 		fp->sfa_pmss = (ire->ire_max_frag - hdrlen) & ~(SCTP_ALIGN - 1);
 		if (fp->cwnd < (fp->sfa_pmss * 2)) {
-			fp->cwnd = fp->sfa_pmss *
-			    sctps->sctps_slow_start_initial;
+			SET_CWND(fp, fp->sfa_pmss,
+			    sctps->sctps_slow_start_initial);
 		}
 	}
 
@@ -1788,7 +1788,7 @@
 
 	for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) {
 		fp->ssthresh = sctps->sctps_initial_mtu;
-		fp->cwnd = fp->sfa_pmss * sctps->sctps_slow_start_initial;
+		SET_CWND(fp, fp->sfa_pmss, sctps->sctps_slow_start_initial);
 		fp->suna = 0;
 		fp->pba = 0;
 	}
@@ -1871,6 +1871,7 @@
 	(void) random_get_pseudo_bytes((uint8_t *)&fp->hb_secret,
 	    sizeof (fp->hb_secret));
 	fp->hb_expiry = lbolt64;
+	fp->rxt_unacked = 0;
 
 	sctp_get_ire(sctp, fp);
 }
--- a/usr/src/uts/common/inet/sctp/sctp_cookie.c	Fri Mar 09 16:52:18 2007 -0800
+++ b/usr/src/uts/common/inet/sctp/sctp_cookie.c	Fri Mar 09 17:41:22 2007 -0800
@@ -527,7 +527,7 @@
 	if (initcollision)
 		iacklen += sctp_supaddr_param_len(sctp);
 	if (!linklocal)
-		iacklen += sctp_addr_params_len(sctp, supp_af, B_FALSE);
+		iacklen += sctp_addr_params(sctp, supp_af, NULL, B_FALSE);
 	ipsctplen += sizeof (*iacksh) + iacklen;
 	iacklen += errlen;
 	if ((pad = ipsctplen % 4) != 0) {
@@ -627,7 +627,7 @@
 	if (initcollision)
 		p += sctp_supaddr_param(sctp, (uchar_t *)p);
 	if (!linklocal)
-		p += sctp_addr_params(sctp, supp_af, (uchar_t *)p);
+		p += sctp_addr_params(sctp, supp_af, (uchar_t *)p, B_FALSE);
 	if (((sctp_options & SCTP_PRSCTP_OPTION) || initcollision) &&
 	    sctp->sctp_prsctp_aware && sctps->sctps_prsctp_enabled) {
 		p += sctp_options_param(sctp, p, SCTP_PRSCTP_OPTION);
@@ -1148,7 +1148,7 @@
 
 	/* Timestamp is int64_t, and we only guarantee 32-bit alignment */
 	bcopy(p, &ts, sizeof (ts));
-	/* Cookie life time, int32_t */
+	/* Cookie life time, uint32_t */
 	lt = (uint32_t *)(p + sizeof (ts));
 
 	/*
@@ -1171,11 +1171,18 @@
 	*iackpp = iack;
 	*recv_adaption = 0;
 
-	/* Check the timestamp */
-	diff = lbolt64 - ts;
-	if (diff > *lt && (init->sic_inittag != sctp->sctp_fvtag ||
+	/*
+	 * Check the staleness of the Cookie, specified in 3.3.10.3 of
+	 * RFC 2960.
+	 *
+	 * The measure of staleness is the difference, in microseconds,
+	 * between the current time and the time the State Cookie expires.
+	 * So it is lbolt64 - (ts + *lt).  If it is positive, it means
+	 * that the Cookie has expired.
+	 */
+	diff = lbolt64 - (ts + *lt);
+	if (diff > 0 && (init->sic_inittag != sctp->sctp_fvtag ||
 	    iack->sic_inittag != sctp->sctp_lvtag)) {
-
 		uint32_t staleness;
 
 		staleness = TICK_TO_USEC(diff);
--- a/usr/src/uts/common/inet/sctp/sctp_impl.h	Fri Mar 09 16:52:18 2007 -0800
+++ b/usr/src/uts/common/inet/sctp/sctp_impl.h	Fri Mar 09 17:41:22 2007 -0800
@@ -512,6 +512,7 @@
 	uint32_t	T3expire;	/* # of times T3 timer expired */
 
 	uint64_t	hb_secret;	/* per addr "secret" in heartbeat */
+	uint32_t	rxt_unacked;	/* # unack'ed retransmitted bytes */
 } sctp_faddr_t;
 
 /* Flags to indicate supported address type in the PARM_SUP_ADDRS. */
@@ -534,6 +535,19 @@
 	int	ipif_count;
 } sctp_ipif_hash_t;
 
+
+/*
+ * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
+ * either sctp_slow_start_initial or sctp_slow_start_after_idle
+ * depending on the caller.
+ */
+#define	SET_CWND(fp, mss, def_max_init_cwnd)				\
+{									\
+	(fp)->cwnd = MIN(def_max_init_cwnd * (mss),			\
+	    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss))));	\
+}
+
+
 struct sctp_s;
 
 /*
@@ -811,7 +825,7 @@
 	int64_t		sctp_last_secret_update;
 	uint8_t		sctp_secret[SCTP_SECRET_LEN]; /* for cookie auth */
 	uint8_t		sctp_old_secret[SCTP_SECRET_LEN];
-	uint32_t	sctp_cookie_lifetime;	/* cookie lifetime in ms */
+	uint32_t	sctp_cookie_lifetime;	/* cookie lifetime in tick */
 
 	/*
 	 * Address family that app wishes returned addrsses to be in.
@@ -919,8 +933,7 @@
 extern boolean_t sctp_add_recvq(sctp_t *, mblk_t *, boolean_t);
 extern void	sctp_add_sendq(sctp_t *, mblk_t *);
 extern void	sctp_add_unrec_parm(sctp_parm_hdr_t *, mblk_t **);
-extern size_t	sctp_addr_params(sctp_t *, int, uchar_t *);
-extern size_t	sctp_addr_params_len(sctp_t *, int, boolean_t);
+extern size_t	sctp_addr_params(sctp_t *, int, uchar_t *, boolean_t);
 extern mblk_t	*sctp_add_proto_hdr(sctp_t *, sctp_faddr_t *, mblk_t *, int,
 		    int *);
 extern void	sctp_addr_req(sctp_t *, mblk_t *);
@@ -1040,7 +1053,7 @@
 extern void	sctp_ootb_shutdown_ack(sctp_t *, mblk_t *, uint_t);
 extern size_t	sctp_options_param(const sctp_t *, void *, int);
 extern size_t	sctp_options_param_len(const sctp_t *, int);
-extern void	sctp_output(sctp_t *sctp);
+extern void	sctp_output(sctp_t *, uint_t);
 
 extern boolean_t sctp_param_register(IDP *, sctpparam_t *, int, sctp_stack_t *);
 extern void	sctp_partial_delivery_event(sctp_t *);
--- a/usr/src/uts/common/inet/sctp/sctp_init.c	Fri Mar 09 16:52:18 2007 -0800
+++ b/usr/src/uts/common/inet/sctp/sctp_init.c	Fri Mar 09 17:41:22 2007 -0800
@@ -192,7 +192,7 @@
 		initlen += (sizeof (sctp_parm_hdr_t) + sizeof (uint32_t));
 	}
 	initlen += sctp_supaddr_param_len(sctp);
-	initlen += sctp_addr_params_len(sctp, supp_af, B_TRUE);
+	initlen += sctp_addr_params(sctp, supp_af, NULL, B_TRUE);
 	if (sctp->sctp_prsctp_aware && sctps->sctps_prsctp_enabled)
 		initlen += sctp_options_param_len(sctp, SCTP_PRSCTP_OPTION);
 
@@ -234,7 +234,7 @@
 	p += sctp_supaddr_param(sctp, p);
 
 	/* Add address parameters */
-	p += sctp_addr_params(sctp, supp_af, p);
+	p += sctp_addr_params(sctp, supp_af, p, B_FALSE);
 
 	/* Add Forward-TSN-Supported param */
 	if (sctp->sctp_prsctp_aware && sctps->sctps_prsctp_enabled)
@@ -261,32 +261,21 @@
 }
 
 size_t
-sctp_addr_params_len(sctp_t *sctp, int af, boolean_t modify)
+sctp_addr_params(sctp_t *sctp, int af, uchar_t *p, boolean_t modify)
 {
+	size_t	param_len;
+
 	ASSERT(sctp->sctp_nsaddrs > 0);
 
 	/*
 	 * If we have only one local address or it is a loopback or linklocal
 	 * association, we let the peer pull the address from the IP header.
 	 */
-	if (sctp->sctp_nsaddrs == 1 || sctp->sctp_loopback ||
+	if ((!modify && sctp->sctp_nsaddrs == 1) || sctp->sctp_loopback ||
 	    sctp->sctp_linklocal) {
 		return (0);
 	}
 
-	return (sctp_saddr_info(sctp, af, NULL, modify));
+	param_len = sctp_saddr_info(sctp, af, p, modify);
+	return ((sctp->sctp_nsaddrs == 1) ? 0 : param_len);
 }
-
-size_t
-sctp_addr_params(sctp_t *sctp, int af, uchar_t *p)
-{
-	/*
-	 * If we have only one local address or it is a loopback or linklocal
-	 * association, we let the peer pull the address from the IP header.
-	 */
-	if (sctp->sctp_nsaddrs == 1 || sctp->sctp_loopback ||
-	    sctp->sctp_linklocal) {
-		return (0);
-	}
-	return (sctp_saddr_info(sctp, af, p, B_FALSE));
-}
--- a/usr/src/uts/common/inet/sctp/sctp_input.c	Fri Mar 09 16:52:18 2007 -0800
+++ b/usr/src/uts/common/inet/sctp/sctp_input.c	Fri Mar 09 17:41:22 2007 -0800
@@ -1248,8 +1248,9 @@
 		/* Drop and SACK, but don't advance the cumulative TSN. */
 		sctp->sctp_force_sack = 1;
 		dprint(0, ("sctp_data_chunk: exceed rwnd %d rxqueued %d "
-			"ssn %d tsn %x\n", sctp->sctp_rwnd,
-			sctp->sctp_rxqueued, dc->sdh_ssn, ntohl(dc->sdh_tsn)));
+		    "dlen %d ssn %d tsn %x\n", sctp->sctp_rwnd,
+		    sctp->sctp_rxqueued, dlen, ntohs(dc->sdh_ssn),
+		    ntohl(dc->sdh_tsn)));
 		return;
 	}
 
@@ -2893,12 +2894,15 @@
 			 * this signals that some chunks are still
 			 * missing.
 			 */
-			if (cumack_forward)
+			if (cumack_forward) {
+				fp->rxt_unacked -= acked;
 				sctp_ss_rexmit(sctp);
+			}
 		} else {
 			sctp->sctp_rexmitting = B_FALSE;
 			sctp->sctp_rxt_nxttsn = sctp->sctp_ltsn;
 			sctp->sctp_rxt_maxtsn = sctp->sctp_ltsn;
+			fp->rxt_unacked = 0;
 		}
 	}
 	return (trysend);
@@ -4143,7 +4147,7 @@
 	}
 
 	if (trysend) {
-		sctp_output(sctp);
+		sctp_output(sctp, UINT_MAX);
 		if (sctp->sctp_cxmit_list != NULL)
 			sctp_wput_asconf(sctp, NULL);
 	}
--- a/usr/src/uts/common/inet/sctp/sctp_output.c	Fri Mar 09 16:52:18 2007 -0800
+++ b/usr/src/uts/common/inet/sctp/sctp_output.c	Fri Mar 09 17:41:22 2007 -0800
@@ -297,7 +297,7 @@
 	sctp->sctp_unsent += msg_len;
 	BUMP_LOCAL(sctp->sctp_msgcount);
 	if (sctp->sctp_state == SCTPS_ESTABLISHED)
-		sctp_output(sctp);
+		sctp_output(sctp, UINT_MAX);
 process_sendq:
 	WAKE_SCTP(sctp);
 	sctp_process_sendq(sctp);
@@ -968,7 +968,7 @@
 }
 
 void
-sctp_output(sctp_t *sctp)
+sctp_output(sctp_t *sctp, uint_t num_pkt)
 {
 	mblk_t			*mp = NULL;
 	mblk_t			*nmp;
@@ -989,7 +989,7 @@
 	sctp_data_hdr_t		*sdc;
 	int			error;
 	boolean_t		notsent = B_TRUE;
-	sctp_stack_t	*sctps = sctp->sctp_sctps;
+	sctp_stack_t		*sctps = sctp->sctp_sctps;
 
 	if (sctp->sctp_ftsn == sctp->sctp_lastacked + 1) {
 		sacklen = 0;
@@ -1017,7 +1017,7 @@
 	}
 	if (meta != NULL)
 		mp = meta->b_cont;
-	while (cansend > 0) {
+	while (cansend > 0 && num_pkt-- != 0) {
 		pad = 0;
 
 		/*
@@ -1108,8 +1108,8 @@
 			 * a while, do slow start again.
 			 */
 			if (now - fp->lastactive > fp->rto) {
-				fp->cwnd = sctps->sctps_slow_start_after_idle *
-				    fp->sfa_pmss;
+				SET_CWND(fp, fp->sfa_pmss,
+				    sctps->sctps_slow_start_after_idle);
 			}
 
 			pathmax = fp->cwnd - fp->suna;
@@ -1643,7 +1643,6 @@
 	boolean_t	ftsn_check = B_TRUE;
 	uint32_t	first_ua_tsn;
 	sctp_msg_hdr_t	*mhdr;
-	uint32_t	tot_wnd;
 	sctp_stack_t	*sctps = sctp->sctp_sctps;
 
 	while (meta != NULL) {
@@ -1722,9 +1721,17 @@
 		 */
 		if (sctp->sctp_frwnd < (oldfp->sfa_pmss - sizeof (*sdc)))
 			sctp->sctp_frwnd = oldfp->sfa_pmss - sizeof (*sdc);
+
 		/* next TSN to send */
 		sctp->sctp_rxt_nxttsn = sctp->sctp_ltsn;
-		sctp_output(sctp);
+
+		/*
+		 * The above sctp_frwnd adjustment is coarse.  The "changed"
+		 * sctp_frwnd may allow us to send more than 1 packet.  So
+		 * tell sctp_output() to send only 1 packet.
+		 */
+		sctp_output(sctp, 1);
+
 		/* Last sent TSN */
 		sctp->sctp_rxt_maxtsn = sctp->sctp_ltsn - 1;
 		ASSERT(sctp->sctp_rxt_maxtsn >= sctp->sctp_rxt_nxttsn);
@@ -1734,7 +1741,13 @@
 	return;
 out:
 	/*
-	 * If were are probing for zero window, don't adjust retransmission
+	 * After a time out, assume that everything has left the network.  So
+	 * we can clear rxt_unacked for the original peer address.
+	 */
+	oldfp->rxt_unacked = 0;
+
+	/*
+	 * If we were probing for zero window, don't adjust retransmission
 	 * variables, but the timer is still backed off.
 	 */
 	if (sctp->sctp_zero_win_probe) {
@@ -1756,8 +1769,14 @@
 		} else {
 			SCTP_KSTAT(sctps, sctp_ss_rexmit_failed);
 		}
+
+		/*
+		 * The strikes will be cleared by sctp_faddr_alive() when the
+		 * other side sends us an ack.
+		 */
 		oldfp->strikes++;
 		sctp->sctp_strikes++;
+
 		SCTP_CALC_RXT(oldfp, sctp->sctp_rto_max);
 		if (oldfp != fp && oldfp->suna != 0)
 			SCTP_FADDR_TIMER_RESTART(sctp, oldfp, fp->rto);
@@ -1873,18 +1892,8 @@
 
 	mp = mp->b_next;
 
-	/* Check how much more we can send. */
-	tot_wnd = MIN(fp->cwnd, sctp->sctp_frwnd);
-	/*
-	 * If the number of outstanding bytes is more than what we are
-	 * allowed to send, stop.
-	 */
-	if (tot_wnd <= chunklen || tot_wnd < fp->suna + chunklen)
-		goto done_bundle;
-	else
-		tot_wnd -= chunklen;
-
 try_bundle:
+	/* We send exactly 1 packet at timeout: no more, no less. */
 	while (seglen < fp->sfa_pmss) {
 		int32_t new_len;
 
@@ -1917,8 +1926,6 @@
 		sdc = (sctp_data_hdr_t *)mp->b_rptr;
 		new_len = ntohs(sdc->sdh_len);
 		chunklen = new_len - sizeof (*sdc);
-		if (chunklen > tot_wnd)
-			break;
 
 		if ((extra = new_len & (SCTP_ALIGN - 1)) != 0)
 			extra = SCTP_ALIGN - extra;
@@ -1942,7 +1949,6 @@
 		SCTP_CHUNK_SENT(sctp, mp, sdc, fp, chunklen, meta);
 
 		seglen = new_len;
-		tot_wnd -= chunklen;
 		mp = mp->b_next;
 	}
 done_bundle:
@@ -1956,6 +1962,8 @@
 		 */
 		iph->ipha_fragment_offset_and_flags = 0;
 	}
+	fp->rxt_unacked += seglen;
+
 	dprint(2, ("sctp_rexmit: Sending packet %d bytes, tsn %x "
 	    "ssn %d to %p (rwnd %d, lastack_rxd %x)\n",
 	    seglen, ntohl(sdc->sdh_tsn), ntohs(sdc->sdh_ssn),
@@ -2049,7 +2057,7 @@
  * This function is called by sctp_ss_rexmit() to create a packet
  * to be retransmitted to the given fp.  The given meta and mp
  * parameters are respectively the sctp_msg_hdr_t and the mblk of the
- * first chunk to be retransmitted. This is also called when we want
+ * first chunk to be retransmitted.  This is also called when we want
  * to retransmit a zero window probe from sctp_rexmit() or when we
  * want to retransmit the zero window probe after the window has
  * opened from sctp_got_sack().
@@ -2173,6 +2181,7 @@
 		*mp = (*mp)->b_next;
 	}
 	*packet_len = seglen;
+	fp->rxt_unacked += seglen;
 	return (head);
 }
 
@@ -2219,16 +2228,36 @@
 	fp = sctp->sctp_current;
 
 	/*
-	 * Since we are retransmitting, we can only use cwnd to determine
-	 * how much we can send as we were allowed to send those chunks
-	 * previously.
+	 * Since we are retransmitting, we only need to use cwnd to determine
+	 * how much we can send as we were allowed (by peer's receive window)
+	 * to send those retransmitted chunks previously when they were first
+	 * sent.  If we record how much we have retransmitted but
+	 * unacknowledged using rxt_unacked, then the amount we can now send
+	 * is equal to cwnd minus rxt_unacked.
+	 *
+	 * The field rxt_unacked is incremented when we retransmit a packet
+	 * and decremented when we get a SACK acknowledging something.  And
+	 * it is reset when the retransmission timer fires as we assume that
+	 * all packets have left the network after a timeout.  If this
+	 * assumption is not true, it means that after a timeout, we can
+	 * get a SACK acknowledging more than rxt_unacked (its value only
+	 * contains what is retransmitted when the timer fires).  So
+	 * rxt_unacked will become very big (it is an unsigned int so going
+	 * negative means that the value is huge).  This is the reason we
+	 * always send at least 1 MSS bytes.
+	 *
+	 * The reason why we do not have an accurate count is that we
+	 * only know how many packets are outstanding (using the TSN numbers).
+	 * But we do not know how many bytes those packets contain.  To
+	 * have an accurate count, we need to walk through the send list.
+	 * As it is not really important to have an accurate count during
+	 * retransmission, we skip this walk to save some time.  This should
+	 * not make the retransmission too aggressive to cause congestion.
 	 */
-	tot_wnd = fp->cwnd;
-	/* So we have sent more than we can, just return. */
-	if (tot_wnd < fp->suna || tot_wnd - fp->suna < fp->sfa_pmss)
-		return;
+	if (fp->cwnd <= fp->rxt_unacked)
+		tot_wnd = fp->sfa_pmss;
 	else
-		tot_wnd -= fp->suna;
+		tot_wnd = fp->cwnd - fp->rxt_unacked;
 
 	/* Find the first unack'ed chunk */
 	for (meta = sctp->sctp_xmit_head; meta != NULL; meta = meta->b_next) {