changeset 12604:3b6b88cabb4b

6867667 sctp reassembly code is difficult to maintain 6882135 sctp partial delivery can hang when handling out of order received fragments. 6908292 Assertion failure in sctp_free_reass() when closing after partial delivery has occurred. 6906356 With x64 SRT to Sparc SUT fix for 6598652 triggers ASSERT in sctp_recvd().
author George Shepherd <George.Shepherd@Sun.COM>
date Thu, 10 Jun 2010 14:07:42 -0700
parents 4f17200860f5
children 6790e683d5a5
files usr/src/uts/common/inet/sctp/sctp_asconf.c usr/src/uts/common/inet/sctp/sctp_impl.h usr/src/uts/common/inet/sctp/sctp_input.c
diffstat 3 files changed, 329 insertions(+), 150 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/inet/sctp/sctp_asconf.c	Thu Jun 10 14:44:38 2010 -0600
+++ b/usr/src/uts/common/inet/sctp/sctp_asconf.c	Thu Jun 10 14:07:42 2010 -0700
@@ -905,7 +905,7 @@
 	 * be assigned or not, so once set we leave it there.
 	 */
 	if (!SCTP_CHUNK_WANT_REXMIT(sctp->sctp_cxmit_list))
-		SCTP_CHUNK_REXMIT(sctp->sctp_cxmit_list);
+		SCTP_CHUNK_REXMIT(sctp, sctp->sctp_cxmit_list);
 	sctp_wput_asconf(sctp, nfp);
 #undef	SCTP_CLR_SENT_FLAG
 }
--- a/usr/src/uts/common/inet/sctp/sctp_impl.h	Thu Jun 10 14:44:38 2010 -0600
+++ b/usr/src/uts/common/inet/sctp/sctp_impl.h	Thu Jun 10 14:07:42 2010 -0700
@@ -260,6 +260,8 @@
 			}						\
 			(sdc)->sdh_ssn = mhdr->smh_ssn;			\
 		}							\
+		DTRACE_PROBE3(sctp__chunk__sent1, sctp_t *, sctp,	\
+		    mblk_t *, mp, mblk_t *, meta);			\
 		(sctp)->sctp_unacked += (chunkdata);			\
 		(sctp)->sctp_unsent -= (chunkdata);			\
 		(sctp)->sctp_frwnd -= (chunkdata);			\
@@ -272,6 +274,8 @@
 			SCTP_CHUNK_DEST(mp)->suna -= ((chunkdata) + 	\
 					sizeof (*sdc));			\
 		}							\
+		DTRACE_PROBE3(sctp__chunk__sent2, sctp_t *, sctp,	\
+		    mblk_t *, mp, mblk_t *, meta);			\
 		(mp)->b_flag &= ~(SCTP_CHUNK_FLAG_REXMIT |		\
 			SCTP_CHUNK_FLAG_ACKED);				\
 		SCTP_CHUNK_SET_SACKCNT(mp, 0);				\
@@ -292,14 +296,22 @@
 #define	SCTP_CHUNK_DEST(mp)		((sctp_faddr_t *)(mp)->b_queue)
 #define	SCTP_SET_CHUNK_DEST(mp, fp)	((mp)->b_queue = (queue_t *)fp)
 
-#define	SCTP_CHUNK_REXMIT(mp)	((mp)->b_flag |= SCTP_CHUNK_FLAG_REXMIT)
+#define	SCTP_CHUNK_REXMIT(sctp, mp) {					\
+	DTRACE_PROBE2(sctp__chunk__rexmit, sctp_t *, sctp, mblk_t *,	\
+	    mp);							\
+	(mp)->b_flag |= SCTP_CHUNK_FLAG_REXMIT; 			\
+}
 #define	SCTP_CHUNK_CLEAR_REXMIT(mp) ((mp)->b_flag &= ~SCTP_CHUNK_FLAG_REXMIT)
 #define	SCTP_CHUNK_WANT_REXMIT(mp) ((mp)->b_flag & SCTP_CHUNK_FLAG_REXMIT)
 
 #define	SCTP_CHUNK_ACKED(mp) \
 	((mp)->b_flag = (SCTP_CHUNK_FLAG_SENT|SCTP_CHUNK_FLAG_ACKED))
 #define	SCTP_CHUNK_ISACKED(mp)	((mp)->b_flag & SCTP_CHUNK_FLAG_ACKED)
-#define	SCTP_CHUNK_CLEAR_ACKED(mp) ((mp)->b_flag &= ~SCTP_CHUNK_FLAG_ACKED)
+#define	SCTP_CHUNK_CLEAR_ACKED(sctp, mp) {				\
+	DTRACE_PROBE2(sctp__chunk__clracked, sctp_t *, sctp, mblk_t *,	\
+	    mp);							\
+	(mp)->b_flag &= ~SCTP_CHUNK_FLAG_ACKED;				\
+}
 
 #define	SCTP_CHUNK_SACKCNT(mp)	((intptr_t)((mp)->b_prev))
 #define	SCTP_CHUNK_SET_SACKCNT(mp, val) ((mp)->b_prev = \
--- a/usr/src/uts/common/inet/sctp/sctp_input.c	Thu Jun 10 14:44:38 2010 -0600
+++ b/usr/src/uts/common/inet/sctp/sctp_input.c	Thu Jun 10 14:07:42 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -629,6 +628,9 @@
 sctp_free_reass(sctp_instr_t *sip)
 {
 	mblk_t *mp, *mpnext, *mctl;
+#ifdef	DEBUG
+	sctp_reass_t	*srp;
+#endif
 
 	for (mp = sip->istr_reass; mp != NULL; mp = mpnext) {
 		mpnext = mp->b_next;
@@ -636,7 +638,11 @@
 		mp->b_prev = NULL;
 		if (DB_TYPE(mp) == M_CTL) {
 			mctl = mp;
-			ASSERT(mp->b_cont != NULL);
+#ifdef	DEBUG
+			srp = (sctp_reass_t *)DB_BASE(mctl);
+			/* Partial delivery can leave empty srp */
+			ASSERT(mp->b_cont != NULL || srp->got == 0);
+#endif
 			mp = mp->b_cont;
 			mctl->b_cont = NULL;
 			freeb(mctl);
@@ -849,9 +855,14 @@
 		srp->msglen = 0;
 		srp->needed = 0;
 		srp->got = 0;
-		srp->partial_delivered = B_TRUE;
 		srp->tail = NULL;
 	} else {
+		/*
+		 * There is a gap then some ordered frags which are not
+		 * the next deliverable tsn. When the next deliverable
+		 * frag arrives it will be set as the new list head in
+		 * sctp_data_frag() by setting the B bit.
+		 */
 		dmp = hmp->b_cont;
 		hmp->b_cont = mp;
 	}
@@ -859,10 +870,9 @@
 	/*
 	 * mp now points at the last chunk in the sequence,
 	 * and prev points to mp's previous in the list.
-	 * We chop the list at prev, and convert mp into the
-	 * new list head by setting the B bit. Subsequence
-	 * fragment deliveries will follow the normal reassembly
-	 * path.
+	 * We chop the list at prev. Subsequent fragment
+	 * deliveries will follow the normal reassembly
+	 * path unless they too exceed the sctp_pd_point.
 	 */
 	prev->b_cont = NULL;
 	srp->partial_delivered = B_TRUE;
@@ -910,20 +920,20 @@
 }
 
 /*
- * Fragment list for ordered messages.
- * If no error occures, error is set to 0. If we run out of memory, error
- * is set to 1. If the peer commits a fatal error (like using different
- * sequence numbers for the same data fragment series), the association is
- * aborted and error is set to 2. tpfinished indicates whether we have
- * assembled a complete message, this is used in sctp_data_chunk() to
- * see if we can try to send any queued message for this stream.
+ * Handle received fragments for ordered delivery to upper layer protocol.
+ * Manage the per message reassembly queue and if this fragment completes
+ * reassembly of the message, or qualifies the already reassembled data
+ * for partial delivery, prepare the message for delivery upstream.
+ *
+ * tpfinished in the caller remains set only when the incoming fragment
+ * has completed the reassembly of the message associated with its ssn.
  */
 static mblk_t *
 sctp_data_frag(sctp_t *sctp, mblk_t *dmp, sctp_data_hdr_t **dc, int *error,
     sctp_instr_t *sip, boolean_t *tpfinished)
 {
-	mblk_t		*hmp;
-	mblk_t		*pmp;
+	mblk_t		*reassq_curr, *reassq_next, *reassq_prev;
+	mblk_t		*new_reassq;
 	mblk_t		*qmp;
 	mblk_t		*first_mp;
 	sctp_reass_t	*srp;
@@ -935,102 +945,152 @@
 
 	*error = 0;
 
-	/* find the reassembly queue for this data chunk */
-	hmp = qmp = sip->istr_reass;
-	for (; hmp != NULL; hmp = hmp->b_next) {
-		srp = (sctp_reass_t *)DB_BASE(hmp);
-		if (ntohs((*dc)->sdh_ssn) == srp->ssn)
+	/*
+	 * Find the reassembly queue for this data chunk, if none
+	 * yet exists, a new per message queue will be created and
+	 * appended to the end of the list of per message queues.
+	 *
+	 * sip points on sctp_instr_t representing instream messages
+	 * as yet undelivered for this stream (sid) of the association.
+	 */
+	reassq_next = reassq_prev = sip->istr_reass;
+	for (; reassq_next != NULL; reassq_next = reassq_next->b_next) {
+		srp = (sctp_reass_t *)DB_BASE(reassq_next);
+		if (ntohs((*dc)->sdh_ssn) == srp->ssn) {
+			reassq_curr = reassq_next;
 			goto foundit;
-		else if (SSN_GT(srp->ssn, ntohs((*dc)->sdh_ssn)))
+		} else if (SSN_GT(srp->ssn, ntohs((*dc)->sdh_ssn)))
 			break;
-		qmp = hmp;
+		reassq_prev = reassq_next;
 	}
 
 	/*
-	 * Allocate a M_CTL that will contain information about this
-	 * fragmented message.
+	 * First fragment of this message received, allocate a M_CTL that
+	 * will head the reassembly queue for this message. The message
+	 * and all its fragments are identified by having the same ssn.
+	 *
+	 * Arriving fragments will be inserted in tsn order on the
+	 * reassembly queue for this message (ssn), linked by b_cont.
 	 */
-	if ((pmp = allocb(sizeof (*srp), BPRI_MED)) == NULL) {
-		*error = 1;
+	if ((new_reassq = allocb(sizeof (*srp), BPRI_MED)) == NULL) {
+		*error = ENOMEM;
 		return (NULL);
 	}
-	DB_TYPE(pmp) = M_CTL;
-	srp = (sctp_reass_t *)DB_BASE(pmp);
-	pmp->b_cont = dmp;
-
-	if (hmp != NULL) {
-		if (sip->istr_reass == hmp) {
-			sip->istr_reass = pmp;
-			pmp->b_next = hmp;
-			pmp->b_prev = NULL;
-			hmp->b_prev = pmp;
+	DB_TYPE(new_reassq) = M_CTL;
+	srp = (sctp_reass_t *)DB_BASE(new_reassq);
+	new_reassq->b_cont = dmp;
+
+	/*
+	 * All per ssn reassembly queues, (one for each message) on
+	 * this stream are doubly linked by b_next/b_prev back to the
+	 * instr_reass of the instream structure associated with this
+	 * stream id, (sip is initialized as sctp->sctp_instr[sid]).
+	 * Insert the new reassembly queue in the correct (ssn) order.
+	 */
+	if (reassq_next != NULL) {
+		if (sip->istr_reass == reassq_next) {
+			/* head insertion */
+			sip->istr_reass = new_reassq;
+			new_reassq->b_next = reassq_next;
+			new_reassq->b_prev = NULL;
+			reassq_next->b_prev = new_reassq;
 		} else {
-			qmp->b_next = pmp;
-			pmp->b_prev = qmp;
-			pmp->b_next = hmp;
-			hmp->b_prev = pmp;
+			/* mid queue insertion */
+			reassq_prev->b_next = new_reassq;
+			new_reassq->b_prev = reassq_prev;
+			new_reassq->b_next = reassq_next;
+			reassq_next->b_prev = new_reassq;
 		}
 	} else {
-		/* make a new reass head and stick it on the end */
+		/* place new reassembly queue at the end */
 		if (sip->istr_reass == NULL) {
-			sip->istr_reass = pmp;
-			pmp->b_prev = NULL;
+			sip->istr_reass = new_reassq;
+			new_reassq->b_prev = NULL;
 		} else {
-			qmp->b_next = pmp;
-			pmp->b_prev = qmp;
+			reassq_prev->b_next = new_reassq;
+			new_reassq->b_prev = reassq_prev;
 		}
-		pmp->b_next = NULL;
+		new_reassq->b_next = NULL;
 	}
 	srp->partial_delivered = B_FALSE;
 	srp->ssn = ntohs((*dc)->sdh_ssn);
+	srp->hasBchunk = B_FALSE;
 empty_srp:
 	srp->needed = 0;
 	srp->got = 1;
+	/* tail always the highest tsn on the reassembly queue for this ssn */
 	srp->tail = dmp;
 	if (SCTP_DATA_GET_BBIT(*dc)) {
+		/* Incoming frag is flagged as the beginning of message */
 		srp->msglen = ntohs((*dc)->sdh_len);
 		srp->nexttsn = ntohl((*dc)->sdh_tsn) + 1;
 		srp->hasBchunk = B_TRUE;
 	} else if (srp->partial_delivered &&
 	    srp->nexttsn == ntohl((*dc)->sdh_tsn)) {
+		/*
+		 * The real beginning fragment of the message was already
+		 * delivered upward, so this is the earliest frag expected.
+		 * Fake the B-bit then see if this frag also completes the
+		 * message.
+		 */
 		SCTP_DATA_SET_BBIT(*dc);
-		/* Last fragment */
+		srp->hasBchunk = B_TRUE;
+		srp->msglen = ntohs((*dc)->sdh_len);
 		if (SCTP_DATA_GET_EBIT(*dc)) {
+			/* This frag is marked as the end of message */
 			srp->needed = 1;
+			/* Got all fragments of this message now */
 			goto frag_done;
 		}
-		srp->hasBchunk = B_TRUE;
-		srp->msglen = ntohs((*dc)->sdh_len);
 		srp->nexttsn++;
 	}
+
+	/* The only fragment of this message currently queued */
+	*tpfinished = B_FALSE;
 	return (NULL);
 foundit:
 	/*
-	 * else already have a reassembly queue. Insert the new data chunk
-	 * in the reassemble queue. Try the tail first, on the assumption
-	 * that the fragments are coming in in order.
+	 * This message already has a reassembly queue. Insert the new frag
+	 * in the reassembly queue. Try the tail first, on the assumption
+	 * that the fragments are arriving in order.
 	 */
 	qmp = srp->tail;
 
 	/*
-	 * This means the message was partially delivered.
+	 * A NULL tail means all existing fragments of the message have
+	 * been entirely consumed during a partially delivery.
 	 */
 	if (qmp == NULL) {
 		ASSERT(srp->got == 0 && srp->needed == 0 &&
 		    srp->partial_delivered);
-		ASSERT(hmp->b_cont == NULL);
-		hmp->b_cont = dmp;
+		ASSERT(reassq_curr->b_cont == NULL);
+		reassq_curr->b_cont = dmp;
 		goto empty_srp;
+	} else {
+		/*
+		 * If partial delivery did take place but the next arriving
+		 * fragment was not the next to be delivered, or partial
+		 * delivery broke off due to a gap, fragments remain on the
+		 * tail. The next fragment due to be delivered still has to
+		 * be set as the new head of list upon arrival. Fake B-bit
+		 * on that frag then see if it also completes the message.
+		 */
+		if (srp->partial_delivered &&
+		    srp->nexttsn == ntohl((*dc)->sdh_tsn)) {
+			SCTP_DATA_SET_BBIT(*dc);
+			srp->hasBchunk = B_TRUE;
+			if (SCTP_DATA_GET_EBIT(*dc)) {
+				/* Got all fragments of this message now */
+				goto frag_done;
+			}
+		}
 	}
+
+	/* grab the frag header of already queued tail frag for comparison */
 	qdc = (sctp_data_hdr_t *)qmp->b_rptr;
 	ASSERT(qmp->b_cont == NULL);
 
-	/* XXXIs it fine to do this just here? */
-	if ((*dc)->sdh_sid != qdc->sdh_sid) {
-		/* our peer is fatally confused; XXX abort the assc */
-		*error = 2;
-		return (NULL);
-	}
+	/* check if the frag goes on the tail in order */
 	if (SEQ_GT(ntohl((*dc)->sdh_tsn), ntohl(qdc->sdh_tsn))) {
 		qmp->b_cont = dmp;
 		srp->tail = dmp;
@@ -1042,12 +1102,12 @@
 		goto inserted;
 	}
 
-	/* Next check for insertion at the beginning */
-	qmp = hmp->b_cont;
+	/* Next check if we should insert this frag at the beginning */
+	qmp = reassq_curr->b_cont;
 	qdc = (sctp_data_hdr_t *)qmp->b_rptr;
 	if (SEQ_LT(ntohl((*dc)->sdh_tsn), ntohl(qdc->sdh_tsn))) {
 		dmp->b_cont = qmp;
-		hmp->b_cont = dmp;
+		reassq_curr->b_cont = dmp;
 		if (SCTP_DATA_GET_BBIT(*dc)) {
 			srp->hasBchunk = B_TRUE;
 			srp->nexttsn = ntohl((*dc)->sdh_tsn);
@@ -1055,7 +1115,7 @@
 		goto preinserted;
 	}
 
-	/* Insert somewhere in the middle */
+	/* Insert this frag in it's correct order in the middle */
 	for (;;) {
 		/* Tail check above should have caught this */
 		ASSERT(qmp->b_cont != NULL);
@@ -1070,11 +1130,15 @@
 		qmp = qmp->b_cont;
 	}
 preinserted:
+	/*
+	 * Need head of message and to be due to deliver, otherwise skip
+	 * the recalculation of the message length below.
+	 */
 	if (!srp->hasBchunk || ntohl((*dc)->sdh_tsn) != srp->nexttsn)
 		goto inserted;
 	/*
 	 * fraglen contains the length of consecutive chunks of fragments.
-	 * starting from the chunk inserted recently.
+	 * starting from the chunk we just inserted.
 	 */
 	tsn = srp->nexttsn;
 	for (qmp = dmp; qmp != NULL; qmp = qmp->b_cont) {
@@ -1088,13 +1152,17 @@
 	srp->msglen += fraglen;
 inserted:
 	srp->got++;
-	first_mp = hmp->b_cont;
+	first_mp = reassq_curr->b_cont;
+	/* Prior to this frag either the beginning or end frag was missing */
 	if (srp->needed == 0) {
-		/* check if we have the first and last fragments */
+		/* used to check if we have the first and last fragments */
 		bdc = (sctp_data_hdr_t *)first_mp->b_rptr;
 		edc = (sctp_data_hdr_t *)srp->tail->b_rptr;
 
-		/* calculate how many fragments are needed, if possible  */
+		/*
+		 * If we now have both the beginning and the end of the message,
+		 * calculate how many fragments in the complete message.
+		 */
 		if (SCTP_DATA_GET_BBIT(bdc) && SCTP_DATA_GET_EBIT(edc)) {
 			srp->needed = ntohl(edc->sdh_tsn) -
 			    ntohl(bdc->sdh_tsn) + 1;
@@ -1106,53 +1174,64 @@
 	 * partial delivery point. Only do this if we can immediately
 	 * deliver the partially assembled message, and only partially
 	 * deliver one message at a time (i.e. messages cannot be
-	 * intermixed arriving at the upper layer). A simple way to
-	 * enforce this is to only try partial delivery if this TSN is
-	 * the next expected TSN. Partial Delivery not supported
-	 * for un-ordered message.
+	 * intermixed arriving at the upper layer).
+	 * sctp_try_partial_delivery() will return a message consisting
+	 * of only consecutive fragments.
 	 */
 	if (srp->needed != srp->got) {
+		/* we don't have the full message yet */
 		dmp = NULL;
-		if (ntohl((*dc)->sdh_tsn) == sctp->sctp_ftsn &&
-		    srp->msglen >= sctp->sctp_pd_point) {
-			dmp = sctp_try_partial_delivery(sctp, hmp, srp, dc);
-			*tpfinished = B_FALSE;
+		if (ntohl((*dc)->sdh_tsn) <= sctp->sctp_ftsn &&
+		    srp->msglen >= sctp->sctp_pd_point &&
+		    srp->ssn == sip->nextseq) {
+			dmp = sctp_try_partial_delivery(sctp, reassq_curr,
+			    srp, dc);
 		}
+		*tpfinished = B_FALSE;
+		/*
+		 * NULL unless a segment of the message now qualified for
+		 * partial_delivery and has been prepared for delivery by
+		 * sctp_try_partial_delivery().
+		 */
 		return (dmp);
 	}
 frag_done:
 	/*
-	 * else reassembly done; prepare the data for delivery.
-	 * First unlink hmp from the ssn list.
+	 * Reassembly complete for this message, prepare the data for delivery.
+	 * First unlink the reassembly queue for this ssn from the list of
+	 * messages in reassembly.
 	 */
-	if (sip->istr_reass == hmp) {
-		sip->istr_reass = hmp->b_next;
-		if (hmp->b_next)
-			hmp->b_next->b_prev = NULL;
+	if (sip->istr_reass == reassq_curr) {
+		sip->istr_reass = reassq_curr->b_next;
+		if (reassq_curr->b_next)
+			reassq_curr->b_next->b_prev = NULL;
 	} else {
-		ASSERT(hmp->b_prev != NULL);
-		hmp->b_prev->b_next = hmp->b_next;
-		if (hmp->b_next)
-			hmp->b_next->b_prev = hmp->b_prev;
+		ASSERT(reassq_curr->b_prev != NULL);
+		reassq_curr->b_prev->b_next = reassq_curr->b_next;
+		if (reassq_curr->b_next)
+			reassq_curr->b_next->b_prev = reassq_curr->b_prev;
 	}
 
 	/*
-	 * Using b_prev and b_next was a little sinful, but OK since
-	 * this mblk is never put*'d. However, freeb() will still
-	 * ASSERT that they are unused, so we need to NULL them out now.
+	 * Need to clean up b_prev and b_next as freeb() will
+	 * ASSERT that they are unused.
 	 */
-	hmp->b_next = NULL;
-	hmp->b_prev = NULL;
-	dmp = hmp;
+	reassq_curr->b_next = NULL;
+	reassq_curr->b_prev = NULL;
+
+	dmp = reassq_curr;
+	/* point to the head of the reassembled data message */
 	dmp = dmp->b_cont;
-	hmp->b_cont = NULL;
-	freeb(hmp);
+	reassq_curr->b_cont = NULL;
+	freeb(reassq_curr);
+	/* Tell our caller that we are returning a complete message. */
 	*tpfinished = B_TRUE;
 
 	/*
 	 * Adjust all mblk's except the lead so their rptr's point to the
-	 * payload. sctp_data_chunk() will need to process the lead's
-	 * data chunk section, so leave it's rptr pointing at the data chunk.
+	 * payload. sctp_data_chunk() will need to process the lead's data
+	 * data chunk section, so leave its rptr pointing at the data chunk
+	 * header.
 	 */
 	*dc = (sctp_data_hdr_t *)dmp->b_rptr;
 	for (qmp = dmp->b_cont; qmp != NULL; qmp = qmp->b_cont) {
@@ -1163,6 +1242,7 @@
 
 	return (dmp);
 }
+
 static void
 sctp_add_dup(uint32_t tsn, mblk_t **dups)
 {
@@ -1193,6 +1273,11 @@
 	ASSERT((mp->b_wptr - mp->b_rptr) <= bsize);
 }
 
+/*
+ * All incoming sctp data, complete messages and fragments are handled by
+ * this function. Unless the U-bit is set in the data chunk it will be
+ * delivered in order or queued until an in-order delivery can be made.
+ */
 static void
 sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups,
     sctp_faddr_t *fp, ip_pkt_t *ipp, ip_recv_attr_t *ira)
@@ -1201,6 +1286,7 @@
 	mblk_t *dmp, *pmp;
 	sctp_instr_t *instr;
 	int ubit;
+	int sid;
 	int isfrag;
 	uint16_t ssn;
 	uint32_t oftsn;
@@ -1244,6 +1330,7 @@
 		return;
 	}
 
+	/* Check for dups of sack'ed data */
 	if (sctp->sctp_sack_info != NULL) {
 		sctp_set_t *sp;
 
@@ -1260,7 +1347,7 @@
 		}
 	}
 
-	/* We cannot deliver anything up now but we still need to handle it. */
+	/* We can no longer deliver anything up, but still need to handle it. */
 	if (SCTP_IS_DETACHED(sctp)) {
 		BUMP_MIB(&sctps->sctps_mib, sctpInClosed);
 		can_deliver = B_FALSE;
@@ -1285,7 +1372,10 @@
 		return;
 	}
 
-	if (ntohs(dc->sdh_sid) >= sctp->sctp_num_istr) {
+	sid = ntohs(dc->sdh_sid);
+
+	/* Data received for a stream not negotiated for this association */
+	if (sid >= sctp->sctp_num_istr) {
 		sctp_bsc_t	inval_parm;
 
 		/* Will populate the CAUSE block in the ERROR chunk. */
@@ -1300,21 +1390,32 @@
 		return;
 	}
 
+	/* unordered delivery OK for this data if ubit set */
 	ubit = SCTP_DATA_GET_UBIT(dc);
 	ASSERT(sctp->sctp_instr != NULL);
-	instr = &sctp->sctp_instr[ntohs(dc->sdh_sid)];
+
+	/* select per stream structure for this stream from the array */
+	instr = &sctp->sctp_instr[sid];
 	/* Initialize the stream, if not yet used */
 	if (instr->sctp == NULL)
 		instr->sctp = sctp;
 
+	/* Begin and End bit set would mean a complete message */
 	isfrag = !(SCTP_DATA_GET_BBIT(dc) && SCTP_DATA_GET_EBIT(dc));
+
+	/* The ssn of this sctp message and of any fragments in it */
 	ssn = ntohs(dc->sdh_ssn);
 
 	dmp = dupb(mp);
 	if (dmp == NULL) {
-		/* drop it and don't ack it, causing the peer to retransmit */
+		/* drop it and don't ack, let the peer retransmit */
 		return;
 	}
+	/*
+	 * Past header and payload, note: the underlying buffer may
+	 * contain further chunks from the same incoming IP packet,
+	 * if so db_ref will be greater than one.
+	 */
 	dmp->b_wptr = (uchar_t *)ch + ntohs(ch->sch_len);
 
 	sctp->sctp_rxqueued += dlen;
@@ -1327,51 +1428,52 @@
 		/* fragmented data chunk */
 		dmp->b_rptr = (uchar_t *)dc;
 		if (ubit) {
+			/* prepare data for unordered delivery */
 			dmp = sctp_uodata_frag(sctp, dmp, &dc);
 #if	DEBUG
 			if (dmp != NULL) {
 				ASSERT(instr ==
-				    &sctp->sctp_instr[ntohs(dc->sdh_sid)]);
+				    &sctp->sctp_instr[sid]);
 			}
 #endif
 		} else {
+			/*
+			 * Assemble fragments and queue for ordered delivery,
+			 * dmp returned is NULL or the head of a complete or
+			 * "partial delivery" message. Any returned message
+			 * and all its fragments will have the same ssn as the
+			 * input fragment currently being handled.
+			 */
 			dmp = sctp_data_frag(sctp, dmp, &dc, &error, instr,
 			    &tpfinished);
 		}
-		if (error != 0) {
+		if (error == ENOMEM) {
+			/* back out the adjustment made earlier */
 			sctp->sctp_rxqueued -= dlen;
-			if (error == 1) {
-				/*
-				 * out of memory; don't ack it so
-				 * the peer retransmits
-				 */
-				return;
-			} else if (error == 2) {
-				/*
-				 * fatal error (i.e. peer used different
-				 * ssn's for same fragmented data) --
-				 * the association has been aborted.
-				 * XXX need to return errval so state
-				 * machine can also abort processing.
-				 */
-				dprint(0, ("error 2: must not happen!\n"));
-				return;
-			}
+			/*
+			 * Don't ack the segment,
+			 * the peer will retransmit.
+			 */
+			return;
 		}
 
 		if (dmp == NULL) {
 			/*
-			 * Can't process this data now, but the cumulative
-			 * TSN may be advanced, so do the checks at done.
+			 * The frag has been queued for later in-order delivery,
+			 * but the cumulative TSN may need to advance, so also
+			 * need to perform the gap ack checks at the done label.
 			 */
 			SCTP_ACK_IT(sctp, tsn);
+			DTRACE_PROBE4(sctp_data_frag_queued, sctp_t *, sctp,
+			    int, sid, int, tsn, uint16_t, ssn);
 			goto done;
 		}
 	}
 
 	/*
-	 * Insert complete messages in correct order for ordered delivery.
-	 * tpfinished is true when the incoming chunk contains a complete
+	 * Unless message is the next for delivery to the ulp, queue complete
+	 * message in the correct order for ordered delivery.
+	 * Note: tpfinished is true when the incoming chunk contains a complete
 	 * message or is the final missing fragment which completed a message.
 	 */
 	if (!ubit && tpfinished && ssn != instr->nextseq) {
@@ -1421,12 +1523,14 @@
 		(instr->istr_nmsgs)++;
 		(sctp->sctp_istr_nmsgs)++;
 		SCTP_ACK_IT(sctp, tsn);
+		DTRACE_PROBE4(sctp_pqueue_completemsg, sctp_t *, sctp,
+		    int, sid, int, tsn, uint16_t, ssn);
 		return;
 	}
 
 	/*
-	 * Else we can deliver the data directly. Recalculate
-	 * dlen now since we may have reassembled data.
+	 * Deliver the data directly. Recalculate dlen now since
+	 * we may have just reassembled this data.
 	 */
 	dlen = dmp->b_wptr - (uchar_t *)dc - sizeof (*dc);
 	for (pmp = dmp->b_cont; pmp != NULL; pmp = pmp->b_cont)
@@ -1438,6 +1542,7 @@
 
 	if (can_deliver) {
 
+		/* step past header to the payload */
 		dmp->b_rptr = (uchar_t *)(dc + 1);
 		if (sctp_input_add_ancillary(sctp, &dmp, dc, fp,
 		    ipp, ira) == 0) {
@@ -1445,7 +1550,9 @@
 			    msgdsize(dmp)));
 			sctp->sctp_rwnd -= dlen;
 			/*
-			 * Override b_flag for SCTP sockfs internal use
+			 * We overload the meaning of b_flag for SCTP sockfs
+			 * internal use, to advise sockfs of partial delivery
+			 * semantics.
 			 */
 			dmp->b_flag = tpfinished ? 0 : SCTP_PARTIAL_DATA;
 			new_rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd, dmp,
@@ -1461,21 +1568,21 @@
 				sctp->sctp_rwnd = new_rwnd;
 			SCTP_ACK_IT(sctp, tsn);
 		} else {
-			/* Just free the message if we don't have memory. */
+			/* No memory don't ack, the peer will retransmit. */
 			freemsg(dmp);
 			return;
 		}
 	} else {
-		/* About to free the data */
+		/* Closed above, ack to peer and free the data */
 		freemsg(dmp);
 		SCTP_ACK_IT(sctp, tsn);
 	}
 
 	/*
-	 * data, now enqueued, may already have been processed and free'd
+	 * Data now enqueued, may already have been processed and free'd
 	 * by the ULP (or we may have just freed it above, if we could not
-	 * deliver it), so we must not reference it (this is why we kept
-	 * the ssn and ubit above).
+	 * deliver), so we must not reference it (this is why we saved the
+	 * ssn and ubit earlier).
 	 */
 	if (ubit != 0) {
 		BUMP_LOCAL(sctp->sctp_iudchunks);
@@ -1484,24 +1591,72 @@
 	BUMP_LOCAL(sctp->sctp_idchunks);
 
 	/*
-	 * If there was a partial delivery and it has not finished,
-	 * don't pull anything from the pqueues.
+	 * There was a partial delivery and it has not finished,
+	 * don't pull anything from the pqueues or increment the
+	 * nextseq. This msg must complete before starting on
+	 * the next ssn and the partial message must have the
+	 * same ssn as the next expected message..
 	 */
 	if (!tpfinished) {
+		DTRACE_PROBE4(sctp_partial_delivery, sctp_t *, sctp,
+		    int, sid, int, tsn, uint16_t, ssn);
+		/*
+		 * Verify the partial delivery is part of the
+		 * message expected for ordered delivery.
+		 */
+		if (ssn != instr->nextseq) {
+			DTRACE_PROBE4(sctp_partial_delivery_error,
+			    sctp_t *, sctp, int, sid, int, tsn,
+			    uint16_t, ssn);
+			cmn_err(CE_WARN, "sctp partial"
+			    " delivery error, sctp 0x%p"
+			    " sid = 0x%x ssn != nextseq"
+			    " tsn 0x%x ftsn 0x%x"
+			    " ssn 0x%x nextseq 0x%x",
+			    (void *)sctp, sid,
+			    tsn, sctp->sctp_ftsn, ssn,
+			    instr->nextseq);
+		}
+
+		ASSERT(ssn == instr->nextseq);
 		goto done;
 	}
 
+	if (ssn != instr->nextseq) {
+		DTRACE_PROBE4(sctp_inorder_delivery_error,
+		    sctp_t *, sctp, int, sid, int, tsn,
+		    uint16_t, ssn);
+		cmn_err(CE_WARN, "sctp in-order delivery error, sctp 0x%p "
+		    "sid = 0x%x ssn != nextseq ssn 0x%x nextseq 0x%x",
+		    (void *)sctp, sid, ssn, instr->nextseq);
+	}
+
+	ASSERT(ssn == instr->nextseq);
+
+	DTRACE_PROBE4(sctp_deliver_completemsg, sctp_t *, sctp, int, sid,
+	    int, tsn, uint16_t, ssn);
+
 	instr->nextseq = ssn + 1;
-	/* Deliver any successive data chunks in the instr queue */
+
+	/*
+	 * Deliver any successive data chunks waiting in the instr pqueue
+	 * for the data just sent up.
+	 */
 	while (instr->istr_nmsgs > 0) {
 		dmp = (mblk_t *)instr->istr_msgs;
 		dc = (sctp_data_hdr_t *)dmp->b_rptr;
 		ssn = ntohs(dc->sdh_ssn);
-		/* Gap in the sequence */
+		tsn = ntohl(dc->sdh_tsn);
+		/* Stop at the first gap in the sequence */
 		if (ssn != instr->nextseq)
 			break;
 
-		/* Else deliver the data */
+		DTRACE_PROBE4(sctp_deliver_pqueuedmsg, sctp_t *, sctp,
+		    int, sid, int, tsn, uint16_t, ssn);
+		/*
+		 * Ready to deliver all data before the gap
+		 * to the upper layer.
+		 */
 		(instr->istr_nmsgs)--;
 		(instr->nextseq)++;
 		(sctp->sctp_istr_nmsgs)--;
@@ -1515,8 +1670,10 @@
 		    ntohl(dc->sdh_tsn), (int)ssn));
 
 		/*
-		 * If this chunk was reassembled, each b_cont represents
-		 * another TSN; advance ftsn now.
+		 * Composite messages indicate this chunk was reassembled,
+		 * each b_cont represents another TSN; Follow the chain to
+		 * reach the frag with the last tsn in order to advance ftsn
+		 * shortly by calling SCTP_ACK_IT().
 		 */
 		dlen = dmp->b_wptr - dmp->b_rptr - sizeof (*dc);
 		for (pmp = dmp->b_cont; pmp; pmp = pmp->b_cont)
@@ -1533,7 +1690,9 @@
 				    "bytes\n", msgdsize(dmp)));
 				sctp->sctp_rwnd -= dlen;
 				/*
-				 * Override b_flag for SCTP sockfs internal use
+				 * Meaning of b_flag overloaded for SCTP sockfs
+				 * internal use, advise sockfs of partial
+				 * delivery semantics.
 				 */
 				dmp->b_flag = tpfinished ?
 				    0 : SCTP_PARTIAL_DATA;
@@ -1545,11 +1704,12 @@
 					sctp->sctp_rwnd = new_rwnd;
 				SCTP_ACK_IT(sctp, tsn);
 			} else {
+				/* don't ack, the peer will retransmit */
 				freemsg(dmp);
 				return;
 			}
 		} else {
-			/* About to free the data */
+			/* Closed above, ack and free the data */
 			freemsg(dmp);
 			SCTP_ACK_IT(sctp, tsn);
 		}
@@ -2073,6 +2233,13 @@
 		 * trypartial.
 		 */
 		if (srp->partial_delivered) {
+			if (srp->ssn != sip->nextseq)
+				cmn_err(CE_WARN, "sctp partial"
+				    " delivery notify, sctp 0x%p"
+				    " sip = 0x%p ssn != nextseq"
+				    " ssn 0x%x nextseq 0x%x",
+				    (void *)sctp, (void *)sip,
+				    srp->ssn, sip->nextseq);
 			ASSERT(sip->nextseq == srp->ssn);
 			sctp_partial_delivery_event(sctp);
 		}
@@ -2498,7 +2665,7 @@
 			SCTP_CHUNK_SET_SACKCNT(mp, SCTP_CHUNK_SACKCNT(mp) + 1);
 			if (SCTP_CHUNK_SACKCNT(mp) ==
 			    sctps->sctps_fast_rxt_thresh) {
-				SCTP_CHUNK_REXMIT(mp);
+				SCTP_CHUNK_REXMIT(sctp, mp);
 				sctp->sctp_chk_fast_rexmit = B_TRUE;
 				*trysend = 1;
 				if (!*fast_recovery) {
@@ -2756,7 +2923,7 @@
 			SCTP_CHUNK_SET_SACKCNT(mp, SCTP_CHUNK_SACKCNT(mp) + 1);
 			if (SCTP_CHUNK_SACKCNT(mp) ==
 			    sctps->sctps_fast_rxt_thresh) {
-				SCTP_CHUNK_REXMIT(mp);
+				SCTP_CHUNK_REXMIT(sctp, mp);
 				sctp->sctp_chk_fast_rexmit = B_TRUE;
 				trysend = 1;
 				if (!fast_recovery) {
@@ -2787,7 +2954,7 @@
 				fp = SCTP_CHUNK_DEST(mp);
 				fp->suna += chunklen;
 				sctp->sctp_unacked += chunklen - sizeof (*sdc);
-				SCTP_CHUNK_CLEAR_ACKED(mp);
+				SCTP_CHUNK_CLEAR_ACKED(sctp, mp);
 				if (!fp->timer_running) {
 					SCTP_FADDR_TIMER_RESTART(sctp, fp,
 					    fp->rto);
@@ -4229,7 +4396,7 @@
 		WAKE_SCTP(sctp);
 		return;
 	}
-	ASSERT(sctp->sctp_rwnd >= sctp->sctp_rxqueued);
+
 	old = sctp->sctp_rwnd - sctp->sctp_rxqueued;
 	new = len - sctp->sctp_rxqueued;
 	sctp->sctp_rwnd = len;