changeset 13066:feaeaa778d1c

6962670 MSG_EOR is set when a message is not completely received
6973505 (sctp) com/sun/nio/sctp/SctpMultiChannel/SocketOptionTests.java crashes a system
author Kacheong Poon <Kacheong.Poon@Sun.COM>
date Mon, 09 Aug 2010 20:07:20 -0700
parents 9377d65d657e
children 5b1463e9bb94
files usr/src/cmd/mdb/common/modules/sctp/sctp.c usr/src/uts/common/fs/sockfs/sockcommon.h usr/src/uts/common/fs/sockfs/sockcommon_subr.c usr/src/uts/common/fs/sockfs/sockfilter.c usr/src/uts/common/inet/sctp/sctp.c usr/src/uts/common/inet/sctp/sctp_conn.c usr/src/uts/common/inet/sctp/sctp_impl.h usr/src/uts/common/inet/sctp/sctp_input.c usr/src/uts/common/inet/sctp/sctp_opt_data.c usr/src/uts/common/inet/sctp/sctp_output.c usr/src/uts/common/inet/sockmods/socksctp.c usr/src/uts/common/inet/sockmods/socksctp.h usr/src/uts/common/inet/sockmods/socksctpsubr.c
diffstat 13 files changed, 203 insertions(+), 183 deletions(-)
--- a/usr/src/cmd/mdb/common/modules/sctp/sctp.c	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/cmd/mdb/common/modules/sctp/sctp.c	Mon Aug 09 20:07:20 2010 -0700
@@ -830,10 +830,10 @@
 		mdb_printf("%<b>Flow Control%</b>\n");
 		mdb_printf("tconn_sndbuf\t%?d\n"
 		    "conn_sndlowat\t%?d\tfrwnd\t\t%?u\n"
-		    "rwnd\t\t%?u\tinitial rwnd\t%?u\n"
+		    "rwnd\t\t%?u\tlast advertised rwnd\t%?u\n"
 		    "rxqueued\t%?u\tcwnd_max\t%?u\n", connp->conn_sndbuf,
 		    connp->conn_sndlowat, sctp->sctp_frwnd,
-		    sctp->sctp_rwnd, sctp->sctp_irwnd, sctp->sctp_rxqueued,
+		    sctp->sctp_rwnd, sctp->sctp_arwnd, sctp->sctp_rxqueued,
 		    sctp->sctp_cwnd_max);
 	}
 
--- a/usr/src/uts/common/fs/sockfs/sockcommon.h	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.h	Mon Aug 09 20:07:20 2010 -0700
@@ -186,7 +186,7 @@
     rval_t *, int);
 extern void	so_enqueue_msg(struct sonode *, mblk_t *, size_t);
 extern void	so_process_new_message(struct sonode *, mblk_t *, mblk_t *);
-extern void	so_check_flow_control(struct sonode *);
+extern boolean_t	so_check_flow_control(struct sonode *);
 
 extern mblk_t	*socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *);
 extern mblk_t 	*socopyoutuio(mblk_t *, struct uio *, ssize_t, int *);
--- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c	Mon Aug 09 20:07:20 2010 -0700
@@ -613,9 +613,10 @@
 
 /*
  * Check flow control on a given sonode.  Must have so_lock held, and
- * this function will release the hold.
+ * this function will release the hold.  Return true if flow control
+ * is cleared.
  */
-void
+boolean_t
 so_check_flow_control(struct sonode *so)
 {
 	ASSERT(MUTEX_HELD(&so->so_lock));
@@ -635,8 +636,10 @@
 		}
 		/* filters can start injecting data */
 		sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0);
+		return (B_TRUE);
 	} else {
 		mutex_exit(&so->so_lock);
+		return (B_FALSE);
 	}
 }
 
@@ -709,7 +712,7 @@
 		so_process_new_message(so, new_msg_head, new_msg_last_head);
 	}
 	savemp = savemptail = NULL;
-	rvalp->r_val1 = 0;
+	rvalp->r_vals = 0;
 	error = 0;
 	mp = so->so_rcv_q_head;
 
@@ -822,7 +825,7 @@
 				 * so_check_flow_control() will drop
 				 * so->so_lock.
 				 */
-				so_check_flow_control(so);
+				rvalp->r_val2 = so_check_flow_control(so);
 			}
 		}
 		if (mp != NULL) { /* more data blocks in msg */
@@ -840,7 +843,8 @@
 					 * so_check_flow_control() will drop
 					 * so->so_lock.
 					 */
-					so_check_flow_control(so);
+					rvalp->r_val2 =
+					    so_check_flow_control(so);
 				}
 			} else if (partial_read && !somsghasdata(mp)) {
 				/*
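The sockcommon change above turns so_check_flow_control() from void into boolean_t: it still drops so_lock, but now reports whether flow control was actually lifted, and so_dequeue_msg() forwards that result to its caller in rvalp->r_val2.  The sketch below is a minimal user-level model of that contract only, assuming a pthread mutex in place of kmutex; struct fake_sonode and the fake_* functions are made-up stand-ins, not the sockfs code.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

/* Simplified stand-in for the relevant sonode state. */
struct fake_sonode {
	pthread_mutex_t	so_lock;
	size_t		so_rcv_queued;	/* bytes waiting in the receive queue */
	size_t		so_rcvbuf;	/* receive buffer high-water mark */
	bool		so_flowctrld;	/* receive side is flow controlled */
};

/*
 * Model of the new contract: called with so_lock held, always drops the
 * lock, and returns true only when flow control was actually lifted.
 */
static bool
fake_check_flow_control(struct fake_sonode *so)
{
	if (so->so_flowctrld && so->so_rcv_queued < so->so_rcvbuf) {
		so->so_flowctrld = false;
		pthread_mutex_unlock(&so->so_lock);
		return (true);		/* caller should tell the protocol */
	}
	pthread_mutex_unlock(&so->so_lock);
	return (false);
}

/* Caller pattern: record the result where the old code threw it away. */
static bool
fake_dequeue_tail(struct fake_sonode *so, size_t consumed)
{
	pthread_mutex_lock(&so->so_lock);
	so->so_rcv_queued -= consumed;
	return (fake_check_flow_control(so));	/* lock is dropped inside */
}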
--- a/usr/src/uts/common/fs/sockfs/sockfilter.c	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockfilter.c	Mon Aug 09 20:07:20 2010 -0700
@@ -1344,7 +1344,7 @@
 		mutex_enter(&so->so_lock);
 		so->so_rcv_queued += diff;
 		/* so_check_flow_control drops so_lock */
-		so_check_flow_control(so);
+		(void) so_check_flow_control(so);
 	}
 
 	return (retmp);
@@ -1612,7 +1612,7 @@
 		}
 		so->so_state &= ~SS_FIL_RCV_FLOWCTRL;
 		/* so_check_flow_control drops so_lock */
-		so_check_flow_control(so);
+		(void) so_check_flow_control(so);
 	}
 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 }
--- a/usr/src/uts/common/inet/sctp/sctp.c	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/inet/sctp/sctp.c	Mon Aug 09 20:07:20 2010 -0700
@@ -336,11 +336,10 @@
 		}
 
 		/*
-		 * In there is unread data, send an ABORT and terminate the
+		 * If there is unread data, send an ABORT and terminate the
 		 * association.
 		 */
-		if (sctp->sctp_rxqueued > 0 || sctp->sctp_irwnd >
-		    sctp->sctp_rwnd) {
+		if (sctp->sctp_rxqueued > 0 || sctp->sctp_ulp_rxqueued > 0) {
 			sctp_user_abort(sctp, NULL);
 			WAKE_SCTP(sctp);
 			return (error);
@@ -807,7 +806,8 @@
 	sctp->sctp_mtu_probe_intvl = sctps->sctps_mtu_probe_interval;
 
 	sctp->sctp_sack_gaps = 0;
-	sctp->sctp_sack_toggle = 2;
+	/* So we will not delay sending the first SACK. */
+	sctp->sctp_sack_toggle = sctps->sctps_deferred_acks_max;
 
 	/* Only need to do the allocation if there is no "cached" one. */
 	if (sctp->sctp_pad_mp == NULL) {
@@ -833,11 +833,13 @@
 		if (err != 0)
 			goto failure;
 
+		sctp->sctp_upcalls = psctp->sctp_upcalls;
+
 		sctp->sctp_cookie_lifetime = psctp->sctp_cookie_lifetime;
 
 		sctp->sctp_cwnd_max = psctp->sctp_cwnd_max;
 		sctp->sctp_rwnd = psctp->sctp_rwnd;
-		sctp->sctp_irwnd = psctp->sctp_rwnd;
+		sctp->sctp_arwnd = psctp->sctp_arwnd;
 		sctp->sctp_pd_point = psctp->sctp_pd_point;
 		sctp->sctp_rto_max = psctp->sctp_rto_max;
 		sctp->sctp_rto_max_init = psctp->sctp_rto_max_init;
@@ -878,7 +880,7 @@
 
 		sctp->sctp_cwnd_max = sctps->sctps_cwnd_max_;
 		sctp->sctp_rwnd = connp->conn_rcvbuf;
-		sctp->sctp_irwnd = sctp->sctp_rwnd;
+		sctp->sctp_arwnd = connp->conn_rcvbuf;
 		sctp->sctp_pd_point = sctp->sctp_rwnd;
 		sctp->sctp_rto_max = MSEC_TO_TICK(sctps->sctps_rto_maxg);
 		sctp->sctp_rto_max_init = sctp->sctp_rto_max;
@@ -1661,6 +1663,13 @@
 	int thrs;
 	int max_tasks;
 
+	mutex_enter(&sctps->sctps_g_lock);
+	/* Someone may have beaten us in creating the taskqs. */
+	if (sctps->sctps_recvq_tq_list_cur_sz > 0) {
+		mutex_exit(&sctps->sctps_g_lock);
+		return;
+	}
+
 	thrs = MIN(sctp_recvq_tq_thr_max, MAX(sctp_recvq_tq_thr_min,
 	    MAX(ncpus, boot_ncpus)));
 	/*
@@ -1688,6 +1697,8 @@
 	sctps->sctps_recvq_tq_list[0] = taskq_create(tq_name, thrs,
 	    minclsyspri, sctp_recvq_tq_task_min, max_tasks, TASKQ_PREPOPULATE);
 	mutex_init(&sctps->sctps_rq_tq_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	mutex_exit(&sctps->sctps_g_lock);
 }
 
 static void
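The sctps_g_lock bracket added above closes a race in which two threads could both find the receive taskq list empty and both create it.  A minimal sketch of the same check-then-create-under-the-lock pattern, assuming a simplified struct fake_stack in place of sctp_stack_t (all fake_* names are hypothetical):

#include <pthread.h>
#include <stdlib.h>

/* Simplified stand-in for the per-stack taskq bookkeeping. */
struct fake_stack {
	pthread_mutex_t	g_lock;		/* models sctps_g_lock */
	int		tq_list_cur_sz;	/* models sctps_recvq_tq_list_cur_sz */
	void		**tq_list;
};

/*
 * The whole check-and-create sequence runs under the global lock, so a
 * second caller that loses the race sees tq_list_cur_sz > 0 and returns
 * without creating a duplicate list.
 */
static void
fake_create_recvq_taskqs(struct fake_stack *stk)
{
	pthread_mutex_lock(&stk->g_lock);
	if (stk->tq_list_cur_sz > 0) {		/* someone beat us to it */
		pthread_mutex_unlock(&stk->g_lock);
		return;
	}
	stk->tq_list = calloc(8, sizeof (void *));
	if (stk->tq_list != NULL)
		stk->tq_list_cur_sz = 1;
	pthread_mutex_unlock(&stk->g_lock);
}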
--- a/usr/src/uts/common/inet/sctp/sctp_conn.c	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/inet/sctp/sctp_conn.c	Mon Aug 09 20:07:20 2010 -0700
@@ -128,16 +128,6 @@
 	    SCTP_BIND_HASH(ntohs(aconnp->conn_lport))], acceptor, 0);
 
 	SCTP_ASSOC_EST(sctps, acceptor);
-
-	/*
-	 * listener->sctp_rwnd should be the default window size or a
-	 * window size changed via SO_RCVBUF option.
-	 */
-	acceptor->sctp_rwnd = listener->sctp_rwnd;
-	acceptor->sctp_irwnd = acceptor->sctp_rwnd;
-	acceptor->sctp_pd_point = acceptor->sctp_rwnd;
-	acceptor->sctp_upcalls = listener->sctp_upcalls;
-
 	return (0);
 }
 
@@ -151,7 +141,6 @@
 	int	err;
 	conn_t	*connp, *econnp;
 	sctp_stack_t	*sctps;
-	struct sock_proto_props sopp;
 	cred_t		*cr;
 	pid_t		cpid;
 	in6_addr_t	faddr, laddr;
@@ -348,17 +337,6 @@
 	}
 	ASSERT(SCTP_IS_DETACHED(eager));
 	eager->sctp_detached = B_FALSE;
-	bzero(&sopp, sizeof (sopp));
-	sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF;
-	sopp.sopp_maxblk = strmsgsz;
-	if (econnp->conn_family == AF_INET) {
-		sopp.sopp_wroff = sctps->sctps_wroff_xtra +
-		    sizeof (sctp_data_hdr_t) + sctp->sctp_hdr_len;
-	} else {
-		sopp.sopp_wroff = sctps->sctps_wroff_xtra +
-		    sizeof (sctp_data_hdr_t) + sctp->sctp_hdr6_len;
-	}
-	eager->sctp_ulp_prop(eager->sctp_ulpd, &sopp);
 	return (eager);
 }
 
--- a/usr/src/uts/common/inet/sctp/sctp_impl.h	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/inet/sctp/sctp_impl.h	Mon Aug 09 20:07:20 2010 -0700
@@ -660,7 +660,7 @@
 #define	sctp_ulp_disconnected	sctp_upcalls->su_disconnected
 #define	sctp_ulp_opctl		sctp_upcalls->su_opctl
 #define	sctp_ulp_recv		sctp_upcalls->su_recv
-#define	sctp_ulp_xmitted	sctp_upcalls->su_txq_full
+#define	sctp_ulp_txq_full	sctp_upcalls->su_txq_full
 #define	sctp_ulp_prop		sctp_upcalls->su_set_proto_props
 
 	int32_t		sctp_state;
@@ -739,8 +739,9 @@
 
 	/* Inbound flow control */
 	int32_t		sctp_rwnd;		/* Current receive window */
-	int32_t		sctp_irwnd;		/* Initial receive window */
+	int32_t		sctp_arwnd;		/* Last advertised window */
 	int32_t		sctp_rxqueued;		/* No. of bytes in RX q's */
+	int32_t		sctp_ulp_rxqueued;	/* Data in ULP */
 
 	/* Pre-initialized composite headers */
 	uchar_t		*sctp_iphc;	/* v4 sctp/ip hdr template buffer */
@@ -800,7 +801,8 @@
 
 		sctp_txq_full : 1,	/* the tx queue is full */
 		sctp_ulp_discon_done : 1,	/* ulp_disconnecting done */
-		sctp_dummy : 6;
+		sctp_flowctrld : 1,	/* upper layer flow controlled */
+		sctp_dummy : 5;
 	} sctp_bits;
 	struct {
 		uint32_t
@@ -838,6 +840,7 @@
 #define	sctp_zero_win_probe sctp_bits.sctp_zero_win_probe
 #define	sctp_txq_full sctp_bits.sctp_txq_full
 #define	sctp_ulp_discon_done sctp_bits.sctp_ulp_discon_done
+#define	sctp_flowctrld sctp_bits.sctp_flowctrld
 
 #define	sctp_recvsndrcvinfo sctp_events.sctp_recvsndrcvinfo
 #define	sctp_recvassocevnt sctp_events.sctp_recvassocevnt
@@ -960,7 +963,7 @@
 	if ((sctp)->sctp_txq_full && SCTP_TXQ_LEN(sctp) <=	\
 	    (sctp)->sctp_connp->conn_sndlowat) {		\
 		(sctp)->sctp_txq_full = 0;			\
-		(sctp)->sctp_ulp_xmitted((sctp)->sctp_ulpd,	\
+		(sctp)->sctp_ulp_txq_full((sctp)->sctp_ulpd,	\
 		    B_FALSE);					\
 	}
 
--- a/usr/src/uts/common/inet/sctp/sctp_input.c	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/inet/sctp/sctp_input.c	Mon Aug 09 20:07:20 2010 -0700
@@ -1295,7 +1295,6 @@
 	uint32_t tsn;
 	int dlen;
 	boolean_t tpfinished = B_TRUE;
-	int32_t new_rwnd;
 	sctp_stack_t	*sctps = sctp->sctp_sctps;
 	int	error;
 
@@ -1542,31 +1541,27 @@
 	sctp->sctp_rxqueued -= dlen;
 
 	if (can_deliver) {
-
 		/* step past header to the payload */
 		dmp->b_rptr = (uchar_t *)(dc + 1);
 		if (sctp_input_add_ancillary(sctp, &dmp, dc, fp,
 		    ipp, ira) == 0) {
 			dprint(1, ("sctp_data_chunk: delivering %lu bytes\n",
 			    msgdsize(dmp)));
-			sctp->sctp_rwnd -= dlen;
 			/*
 			 * We overload the meaning of b_flag for SCTP sockfs
 			 * internal use, to advise sockfs of partial delivery
 			 * semantics.
 			 */
 			dmp->b_flag = tpfinished ? 0 : SCTP_PARTIAL_DATA;
-			new_rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd, dmp,
-			    msgdsize(dmp), 0, &error, NULL);
-			/*
-			 * Since we always deliver the next TSN data chunk,
-			 * we may buffer a little more than allowed. In
-			 * that case, just mark the window as 0.
-			 */
-			if (new_rwnd < 0)
-				sctp->sctp_rwnd = 0;
-			else if (new_rwnd > sctp->sctp_rwnd)
-				sctp->sctp_rwnd = new_rwnd;
+			if (sctp->sctp_flowctrld) {
+				sctp->sctp_rwnd -= dlen;
+				if (sctp->sctp_rwnd < 0)
+					sctp->sctp_rwnd = 0;
+			}
+			if (sctp->sctp_ulp_recv(sctp->sctp_ulpd, dmp,
+			    msgdsize(dmp), 0, &error, NULL) <= 0) {
+				sctp->sctp_flowctrld = B_TRUE;
+			}
 			SCTP_ACK_IT(sctp, tsn);
 		} else {
 			/* No memory don't ack, the peer will retransmit. */
@@ -1689,7 +1684,6 @@
 			    ipp, ira) == 0) {
 				dprint(1, ("sctp_data_chunk: delivering %lu "
 				    "bytes\n", msgdsize(dmp)));
-				sctp->sctp_rwnd -= dlen;
 				/*
 				 * Meaning of b_flag overloaded for SCTP sockfs
 				 * internal use, advise sockfs of partial
@@ -1697,12 +1691,15 @@
 				 */
 				dmp->b_flag = tpfinished ?
 				    0 : SCTP_PARTIAL_DATA;
-				new_rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd,
-				    dmp, msgdsize(dmp), 0, &error, NULL);
-				if (new_rwnd < 0)
-					sctp->sctp_rwnd = 0;
-				else if (new_rwnd > sctp->sctp_rwnd)
-					sctp->sctp_rwnd = new_rwnd;
+				if (sctp->sctp_flowctrld) {
+					sctp->sctp_rwnd -= dlen;
+					if (sctp->sctp_rwnd < 0)
+						sctp->sctp_rwnd = 0;
+				}
+				if (sctp->sctp_ulp_recv(sctp->sctp_ulpd, dmp,
+				    msgdsize(dmp), 0, &error, NULL) <= 0) {
+					sctp->sctp_flowctrld = B_TRUE;
+				}
 				SCTP_ACK_IT(sctp, tsn);
 			} else {
 				/* don't ack, the peer will retransmit */
@@ -1772,6 +1769,8 @@
 	} else {
 		sc->ssc_a_rwnd = 0;
 	}
+	/* Remember the last window sent to peer. */
+	sctp->sctp_arwnd = sc->ssc_a_rwnd;
 	sc->ssc_numfrags = htons(num_gaps);
 	sc->ssc_numdups = 0;
 
@@ -2359,7 +2358,6 @@
 				dlen += MBLKL(pmp);
 			}
 			if (can_deliver) {
-				int32_t	nrwnd;
 				int error;
 
 				dmp->b_rptr = (uchar_t *)(dc + 1);
@@ -2368,20 +2366,22 @@
 				if (sctp_input_add_ancillary(sctp,
 				    &dmp, dc, fp, ipp, ira) == 0) {
 					sctp->sctp_rxqueued -= dlen;
-					sctp->sctp_rwnd -= dlen;
 					/*
 					 * Override b_flag for SCTP sockfs
 					 * internal use
 					 */
 
 					dmp->b_flag = 0;
-					nrwnd = sctp->sctp_ulp_recv(
+					if (sctp->sctp_flowctrld) {
+						sctp->sctp_rwnd -= dlen;
+						if (sctp->sctp_rwnd < 0)
+							sctp->sctp_rwnd = 0;
+					}
+					if (sctp->sctp_ulp_recv(
 					    sctp->sctp_ulpd, dmp, msgdsize(dmp),
-					    0, &error, NULL);
-					if (nrwnd < 0)
-						sctp->sctp_rwnd = 0;
-					else if (nrwnd > sctp->sctp_rwnd)
-						sctp->sctp_rwnd = nrwnd;
+					    0, &error, NULL) <= 0) {
+						sctp->sctp_flowctrld = B_TRUE;
+					}
 				} else {
 					/*
 					 * We will resume processing when
@@ -4409,33 +4409,30 @@
 }
 
 /*
- * Some amount of data got removed from rx q.
- * Check if we should send a window update.
- *
- * Due to way sctp_rwnd updates are made, ULP can give reports out-of-order.
- * To keep from dropping incoming data due to this, we only update
- * sctp_rwnd when if it's larger than what we've reported to peer earlier.
+ * Some amount of data got removed from ULP's receive queue and we can
+ * push messages up if we were flow controlled before.  Reset the receive
+ * window to full capacity (conn_rcvbuf) and check if we should send a
+ * window update.
  */
 void
 sctp_recvd(sctp_t *sctp, int len)
 {
-	int32_t old, new;
 	sctp_stack_t	*sctps = sctp->sctp_sctps;
+	conn_t		*connp = sctp->sctp_connp;
+	boolean_t	send_sack = B_FALSE;
 
 	ASSERT(sctp != NULL);
 	RUN_SCTP(sctp);
 
-	if (len < sctp->sctp_rwnd) {
-		WAKE_SCTP(sctp);
-		return;
-	}
-
-	old = sctp->sctp_rwnd - sctp->sctp_rxqueued;
-	new = len - sctp->sctp_rxqueued;
-	sctp->sctp_rwnd = len;
-
-	if (sctp->sctp_state >= SCTPS_ESTABLISHED &&
-	    ((old <= new >> 1) || (old < sctp->sctp_mss))) {
+	sctp->sctp_flowctrld = B_FALSE;
+	/* This is the amount of data queued in ULP. */
+	sctp->sctp_ulp_rxqueued = connp->conn_rcvbuf - len;
+
+	if (connp->conn_rcvbuf - sctp->sctp_arwnd >= sctp->sctp_mss)
+		send_sack = B_TRUE;
+	sctp->sctp_rwnd = connp->conn_rcvbuf;
+
+	if (sctp->sctp_state >= SCTPS_ESTABLISHED && send_sack) {
 		sctp->sctp_force_sack = 1;
 		SCTPS_BUMP_MIB(sctps, sctpOutWinUpdate);
 		(void) sctp_sack(sctp, NULL);
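Taken together, the sctp_input.c hunks above replace the old scheme, where sctp_rwnd was driven by the value su_recv() returned, with a flag-driven one: sctp_rwnd only shrinks while sctp_flowctrld is set, a su_recv() return of zero or less raises the flag, and sctp_recvd() clears it, restores the window to conn_rcvbuf, and forces a SACK window update once the last advertised window (sctp_arwnd) lags the full buffer by at least one MSS.  The sketch below models that state machine under those assumptions; struct fake_sctp and the fake_* functions are simplified stand-ins, not the kernel code.

#include <stdbool.h>
#include <stdint.h>

/* Field names mirror the patch, but the struct is a stand-in for sctp_t. */
struct fake_sctp {
	int32_t	rwnd;		/* current receive window (sctp_rwnd) */
	int32_t	arwnd;		/* last window advertised in a SACK (sctp_arwnd) */
	int32_t	rcvbuf;		/* conn_rcvbuf */
	int32_t	mss;		/* sctp_mss */
	bool	flowctrld;	/* sctp_flowctrld */
};

/*
 * Data delivery: shrink rwnd only while flow controlled, and enter flow
 * control when the upper layer reports it cannot take more (<= 0).
 */
static void
fake_deliver(struct fake_sctp *s, int32_t dlen, int ulp_space_left)
{
	if (s->flowctrld) {
		s->rwnd -= dlen;
		if (s->rwnd < 0)
			s->rwnd = 0;
	}
	if (ulp_space_left <= 0)
		s->flowctrld = true;
}

/*
 * sctp_recvd() model: the reader drained data, so reopen the window to
 * the full buffer and advertise it if the peer's view lags by >= 1 MSS.
 */
static bool
fake_recvd(struct fake_sctp *s)
{
	bool send_window_update = (s->rcvbuf - s->arwnd >= s->mss);

	s->flowctrld = false;
	s->rwnd = s->rcvbuf;
	return (send_window_update);	/* true => force a SACK now */
}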
--- a/usr/src/uts/common/inet/sctp/sctp_opt_data.c	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/inet/sctp/sctp_opt_data.c	Mon Aug 09 20:07:20 2010 -0700
@@ -1119,7 +1119,7 @@
 				 * protocol and here we just whack it.
 				 */
 				connp->conn_rcvbuf = sctp->sctp_rwnd = *i1;
-				sctp->sctp_irwnd = sctp->sctp_rwnd;
+				sctp->sctp_arwnd = sctp->sctp_rwnd;
 				sctp->sctp_pd_point = sctp->sctp_rwnd;
 
 				sopp.sopp_flags = SOCKOPT_RCVHIWAT;
--- a/usr/src/uts/common/inet/sctp/sctp_output.c	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/inet/sctp/sctp_output.c	Mon Aug 09 20:07:20 2010 -0700
@@ -295,7 +295,7 @@
 	 */
 	if (SCTP_TXQ_LEN(sctp) >= connp->conn_sndbuf) {
 		sctp->sctp_txq_full = 1;
-		sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, B_TRUE);
+		sctp->sctp_ulp_txq_full(sctp->sctp_ulpd, B_TRUE);
 	}
 	if (sctp->sctp_state == SCTPS_ESTABLISHED)
 		sctp_output(sctp, UINT_MAX);
--- a/usr/src/uts/common/inet/sockmods/socksctp.c	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/inet/sockmods/socksctp.c	Mon Aug 09 20:07:20 2010 -0700
@@ -145,6 +145,7 @@
 	sosctp_close,			/* sop_close 	*/
 };
 
+/* All the upcalls expect the upper handle to be sonode. */
 sock_upcalls_t sosctp_sock_upcalls = {
 	so_newconn,
 	so_connected,
@@ -156,6 +157,7 @@
 	NULL,			/* su_signal_oob */
 };
 
+/* All the upcalls expect the upper handle to be sctp_sonode/sctp_soassoc. */
 sock_upcalls_t sosctp_assoc_upcalls = {
 	sctp_assoc_newconn,
 	sctp_assoc_connected,
@@ -175,7 +177,6 @@
 	struct sctp_sonode *ss;
 	struct sctp_sonode *pss;
 	sctp_sockbuf_limits_t sbl;
-	sock_upcalls_t *upcalls;
 	int err;
 
 	ss = SOTOSSO(so);
@@ -200,19 +201,21 @@
 		return (0);
 	}
 
+	if ((err = secpolicy_basic_net_access(cr)) != 0)
+		return (err);
+
 	if (so->so_type == SOCK_STREAM) {
-		upcalls = &sosctp_sock_upcalls;
+		so->so_proto_handle = (sock_lower_handle_t)sctp_create(so,
+		    NULL, so->so_family, so->so_type, SCTP_CAN_BLOCK,
+		    &sosctp_sock_upcalls, &sbl, cr);
 		so->so_mode = SM_CONNREQUIRED;
 	} else {
 		ASSERT(so->so_type == SOCK_SEQPACKET);
-		upcalls = &sosctp_assoc_upcalls;
+		so->so_proto_handle = (sock_lower_handle_t)sctp_create(ss,
+		    NULL, so->so_family, so->so_type, SCTP_CAN_BLOCK,
+		    &sosctp_assoc_upcalls, &sbl, cr);
 	}
 
-	if ((err = secpolicy_basic_net_access(cr)) != 0)
-		return (err);
-
-	so->so_proto_handle = (sock_lower_handle_t)sctp_create(so, NULL,
-	    so->so_family, so->so_type, SCTP_CAN_BLOCK, upcalls, &sbl, cr);
 	if (so->so_proto_handle == NULL)
 		return (ENOMEM);
 
@@ -482,7 +485,7 @@
 	int flags, error = 0;
 	struct T_unitdata_ind *tind;
 	ssize_t orig_resid = uiop->uio_resid;
-	int len, count, readcnt = 0, rxqueued;
+	int len, count, readcnt = 0;
 	socklen_t controllen, namelen;
 	void *opt;
 	mblk_t *mp;
@@ -591,8 +594,10 @@
 			msg->msg_flags |= MSG_NOTIFICATION;
 		}
 
-		if (!(mp->b_flag & SCTP_PARTIAL_DATA))
+		if (!(mp->b_flag & SCTP_PARTIAL_DATA) &&
+		    !(rval.r_val1 & MOREDATA)) {
 			msg->msg_flags |= MSG_EOR;
+		}
 		freemsg(mp);
 	}
 done:
@@ -606,7 +611,6 @@
 	 */
 	if (ssa == NULL) {
 		mutex_enter(&so->so_lock);
-		rxqueued = so->so_rcv_queued;
 		count = so->so_rcvbuf - so->so_rcv_queued;
 
 		ASSERT(so->so_rcv_q_head != NULL ||
@@ -614,16 +618,17 @@
 		    so->so_rcv_queued == 0);
 
 		so_unlock_read(so);
-		mutex_exit(&so->so_lock);
 
-		if (readcnt > 0 && (((count > 0) &&
-		    ((rxqueued + readcnt) >= so->so_rcvlowat)) ||
-		    (rxqueued == 0))) {
-			/*
-			 * If amount of queued data is higher than watermark,
-			 * updata SCTP's idea of available buffer space.
-			 */
+		/*
+		 * so_dequeue_msg() sets r_val2 to true if flow control was
+		 * cleared and we need to update SCTP.  so_flowctrld was
+		 * cleared in so_dequeue_msg() via so_check_flow_control().
+		 */
+		if (rval.r_val2) {
+			mutex_exit(&so->so_lock);
 			sctp_recvd((struct sctp_s *)so->so_proto_handle, count);
+		} else {
+			mutex_exit(&so->so_lock);
 		}
 	} else {
 		/*
@@ -634,26 +639,23 @@
 		 * done in so_dequeue_msg().
 		 */
 		mutex_enter(&so->so_lock);
-		rxqueued = ssa->ssa_rcv_queued;
-
-		ssa->ssa_rcv_queued = rxqueued - readcnt;
+		ssa->ssa_rcv_queued -= readcnt;
 		count = so->so_rcvbuf - ssa->ssa_rcv_queued;
 
 		so_unlock_read(so);
 
-		if (readcnt > 0 &&
-		    (((count > 0) && (rxqueued >= so->so_rcvlowat)) ||
-		    (ssa->ssa_rcv_queued == 0))) {
+		if (readcnt > 0 && ssa->ssa_flowctrld &&
+		    ssa->ssa_rcv_queued < so->so_rcvlowat) {
 			/*
-			 * If amount of queued data is higher than watermark,
-			 * updata SCTP's idea of available buffer space.
+			 * Need to clear ssa_flowctrld, different from 1-1
+			 * style.
 			 */
+			ssa->ssa_flowctrld = B_FALSE;
 			mutex_exit(&so->so_lock);
-
-			sctp_recvd((struct sctp_s *)ssa->ssa_conn, count);
-
+			sctp_recvd(ssa->ssa_conn, count);
 			mutex_enter(&so->so_lock);
 		}
+
 		/*
 		 * MOREDATA flag is set if all data could not be copied
 		 */
@@ -723,7 +725,6 @@
 sosctp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
     struct cred *cr)
 {
-	struct sctp_sonode *ss = SOTOSSO(so);
 	mblk_t *mctl;
 	struct cmsghdr *cmsg;
 	struct sctp_sndrcvinfo *sinfo;
@@ -891,8 +892,8 @@
 	}
 
 	/* Copy in the message. */
-	if ((error = sosctp_uiomove(mctl, count, ss->ss_wrsize, ss->ss_wroff,
-	    uiop, flags)) != 0) {
+	if ((error = sosctp_uiomove(mctl, count, so->so_proto_props.sopp_maxblk,
+	    so->so_proto_props.sopp_wroff, uiop, flags)) != 0) {
 		goto error_ret;
 	}
 	error = sctp_sendmsg((struct sctp_s *)so->so_proto_handle, mctl, 0);
@@ -1031,9 +1032,8 @@
 		} else {
 			mutex_exit(&so->so_lock);
 			ssa->ssa_state |= SS_ISDISCONNECTING;
-			sctp_recvd((struct sctp_s *)ssa->ssa_conn,
-			    so->so_rcvbuf);
-			error = sctp_disconnect((struct sctp_s *)ssa->ssa_conn);
+			sctp_recvd(ssa->ssa_conn, so->so_rcvbuf);
+			error = sctp_disconnect(ssa->ssa_conn);
 			mutex_enter(&so->so_lock);
 		}
 		goto refrele;
@@ -1825,8 +1825,8 @@
 	ss = SOTOSSO(so);
 
 	/*
-	 * Initiate connection shutdown.  Update SCTP's receive
-	 * window.
+	 * Initiate connection shutdown.  Tell SCTP if there is any data
+	 * left unread.
 	 */
 	sctp_recvd((struct sctp_s *)so->so_proto_handle,
 	    so->so_rcvbuf - so->so_rcv_queued);
@@ -1845,9 +1845,9 @@
 			sosctp_assoc_isdisconnected(ssa, 0);
 			mutex_exit(&so->so_lock);
 
-			sctp_recvd((struct sctp_s *)ssa->ssa_conn,
-			    so->so_rcvbuf - ssa->ssa_rcv_queued);
-			(void) sctp_disconnect((struct sctp_s *)ssa->ssa_conn);
+			sctp_recvd(ssa->ssa_conn, so->so_rcvbuf -
+			    ssa->ssa_rcv_queued);
+			(void) sctp_disconnect(ssa->ssa_conn);
 
 			mutex_enter(&so->so_lock);
 			SSA_REFRELE(ss, ssa);
@@ -1879,8 +1879,6 @@
 	/* We are the sole owner of so now */
 	mutex_enter(&so->so_lock);
 
-	so_rcv_flush(so);
-
 	/* Free all pending connections */
 	so_acceptq_flush(so, B_TRUE);
 
@@ -1908,6 +1906,15 @@
 		sctp_close((struct sctp_s *)so->so_proto_handle);
 	so->so_proto_handle = NULL;
 
+	/*
+	 * Note until sctp_close() is called, SCTP can still send up
+	 * messages, such as event notifications.  So we should flush
+	 * the receive buffer after calling sctp_close().
+	 */
+	mutex_enter(&so->so_lock);
+	so_rcv_flush(so);
+	mutex_exit(&so->so_lock);
+
 	sonode_fini(so);
 }
 
@@ -1929,8 +1936,8 @@
     sock_lower_handle_t connind, sock_downcalls_t *dc,
     struct cred *peer_cred, pid_t peer_cpid, sock_upcalls_t **ucp)
 {
-	struct sonode *lso = (struct sonode *)parenthandle;
-	struct sctp_sonode *lss = SOTOSSO(lso);
+	struct sctp_sonode *lss = (struct sctp_sonode *)parenthandle;
+	struct sonode *lso = &lss->ss_so;
 	struct sctp_soassoc *ssa;
 	sctp_assoc_t id;
 
@@ -2144,6 +2151,9 @@
 
 	ssa->ssa_rcv_queued += len;
 	space_available = so->so_rcvbuf - ssa->ssa_rcv_queued;
+	if (space_available <= 0)
+		ssa->ssa_flowctrld = B_TRUE;
+
 	so_enqueue_msg(so, mp, len);
 
 	/* so_notify_data drops so_lock */
@@ -2179,32 +2189,44 @@
     struct sock_proto_props *soppp)
 {
 	struct sctp_soassoc *ssa = (struct sctp_soassoc *)handle;
-	struct sctp_sonode *ss;
+	struct sonode *so;
 
 	if (ssa->ssa_type == SOSCTP_ASSOC) {
-		ss = ssa->ssa_sonode;
-		mutex_enter(&ss->ss_so.so_lock);
+		so = &ssa->ssa_sonode->ss_so;
+
+		mutex_enter(&so->so_lock);
 
-		/*
-		 * Only change them if they're set.
-		 */
-		if (soppp->sopp_wroff != 0) {
+		/* Per assoc_id properties. */
+		if (soppp->sopp_flags & SOCKOPT_WROFF)
 			ssa->ssa_wroff = soppp->sopp_wroff;
-		}
-		if (soppp->sopp_maxblk != 0) {
+		if (soppp->sopp_flags & SOCKOPT_MAXBLK)
 			ssa->ssa_wrsize = soppp->sopp_maxblk;
-		}
 	} else {
-		ss = (struct sctp_sonode *)handle;
-		mutex_enter(&ss->ss_so.so_lock);
+		so = &((struct sctp_sonode *)handle)->ss_so;
+		mutex_enter(&so->so_lock);
+
+		if (soppp->sopp_flags & SOCKOPT_WROFF)
+			so->so_proto_props.sopp_wroff = soppp->sopp_wroff;
+		if (soppp->sopp_flags & SOCKOPT_MAXBLK)
+			so->so_proto_props.sopp_maxblk = soppp->sopp_maxblk;
+		if (soppp->sopp_flags & SOCKOPT_RCVHIWAT) {
+			ssize_t lowat;
 
-		if (soppp->sopp_wroff != 0) {
-			ss->ss_wroff = soppp->sopp_wroff;
-		}
-		if (soppp->sopp_maxblk != 0) {
-			ss->ss_wrsize = soppp->sopp_maxblk;
+			so->so_rcvbuf = soppp->sopp_rxhiwat;
+			/*
+			 * The low water mark should be adjusted properly
+			 * if the high water mark is changed.  It should
+			 * not be bigger than 1/4 of high water mark.
+			 */
+			lowat = soppp->sopp_rxhiwat >> 2;
+			if (so->so_rcvlowat > lowat) {
+				/* Sanity check... */
+				if (lowat == 0)
+					so->so_rcvlowat = soppp->sopp_rxhiwat;
+				else
+					so->so_rcvlowat = lowat;
+			}
 		}
 	}
-
-	mutex_exit(&ss->ss_so.so_lock);
+	mutex_exit(&so->so_lock);
 }
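The recvmsg() hunk above (around the MSG_EOR test) is the core of the fix for 6962670: MSG_EOR is now set only when SCTP did not mark the message as a partial delivery and sockfs did not leave part of it queued (MOREDATA).  A minimal model of the corrected test; the FAKE_* flag values are made up for illustration and do not match the real MSG_EOR, MOREDATA or SCTP_PARTIAL_DATA definitions.

#define	FAKE_MSG_EOR		0x01	/* stands in for MSG_EOR */
#define	FAKE_MOREDATA		0x02	/* stands in for MOREDATA */
#define	FAKE_PARTIAL_DATA	0x04	/* stands in for SCTP_PARTIAL_DATA */

/*
 * A message ends this recvmsg() only when SCTP did not flag it as a
 * partial delivery AND sockfs copied all of it out (no MOREDATA).  The
 * old code ignored MOREDATA, which is how MSG_EOR could be set on a
 * message that was not completely received.
 */
static int
fake_recv_flags(int b_flag, int rval_flags)
{
	int msg_flags = 0;

	if (!(b_flag & FAKE_PARTIAL_DATA) && !(rval_flags & FAKE_MOREDATA))
		msg_flags |= FAKE_MSG_EOR;
	return (msg_flags);
}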
--- a/usr/src/uts/common/inet/sockmods/socksctp.h	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/inet/sockmods/socksctp.h	Mon Aug 09 20:07:20 2010 -0700
@@ -18,9 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SOCKSCTP_H_
@@ -59,9 +59,10 @@
 	uint_t			ssa_state;	/* same as so_state */
 	int			ssa_error;	/* same as so_error */
 	boolean_t		ssa_snd_qfull;
-	int			ssa_wroff;
-	size_t			ssa_wrsize;
+	ushort_t		ssa_wroff;
+	ssize_t			ssa_wrsize;
 	int			ssa_rcv_queued;	/* queued rx bytes/# of conn */
+	boolean_t		ssa_flowctrld;	/* receive flow controlled */
 };
 
 /* 1-N socket association cache defined in socksctp.c */
--- a/usr/src/uts/common/inet/sockmods/socksctpsubr.c	Mon Aug 09 19:07:25 2010 -0700
+++ b/usr/src/uts/common/inet/sockmods/socksctpsubr.c	Mon Aug 09 20:07:20 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -177,6 +176,7 @@
 		ssa->ssa_error = 0;
 		ssa->ssa_snd_qfull = 0;
 		ssa->ssa_rcv_queued = 0;
+		ssa->ssa_flowctrld = B_FALSE;
 	}
 	dprint(2, ("sosctp_assoc_create %p %p\n", (void *)ss, (void *)ssa));
 	return (ssa);
@@ -515,32 +515,37 @@
 {
 	mblk_t *mp, **nmp, *last_mp;
 	struct sctp_soassoc *tmp;
+	struct sonode *nso, *sso;
 
 	sosctp_so_inherit(ss, nss);
 
-	nss->ss_so.so_state |= (ss->ss_so.so_state & (SS_NDELAY|SS_NONBLOCK));
-	nss->ss_so.so_state |=
+	sso = &ss->ss_so;
+	nso = &nss->ss_so;
+
+	nso->so_state |= (sso->so_state & (SS_NDELAY|SS_NONBLOCK));
+	nso->so_state |=
 	    (ssa->ssa_state & (SS_ISCONNECTED|SS_ISCONNECTING|
 	    SS_ISDISCONNECTING|SS_CANTSENDMORE|SS_CANTRCVMORE|SS_ISBOUND));
-	nss->ss_so.so_error = ssa->ssa_error;
-	nss->ss_so.so_snd_qfull = ssa->ssa_snd_qfull;
-	nss->ss_wroff = ssa->ssa_wroff;
-	nss->ss_wrsize = ssa->ssa_wrsize;
-	nss->ss_so.so_rcv_queued = ssa->ssa_rcv_queued;
-	nss->ss_so.so_proto_handle = (sock_lower_handle_t)ssa->ssa_conn;
+	nso->so_error = ssa->ssa_error;
+	nso->so_snd_qfull = ssa->ssa_snd_qfull;
+	nso->so_proto_props.sopp_wroff = ssa->ssa_wroff;
+	nso->so_proto_props.sopp_maxblk = ssa->ssa_wrsize;
+	nso->so_rcv_queued = ssa->ssa_rcv_queued;
+	nso->so_flowctrld = ssa->ssa_flowctrld;
+	nso->so_proto_handle = (sock_lower_handle_t)ssa->ssa_conn;
 	/* The peeled off socket is connection oriented */
-	nss->ss_so.so_mode |= SM_CONNREQUIRED;
+	nso->so_mode |= SM_CONNREQUIRED;
 
 	/* Consolidate all data on a single rcv list */
-	if (ss->ss_so.so_rcv_head != NULL) {
-		so_process_new_message(&ss->ss_so, ss->ss_so.so_rcv_head,
-		    ss->ss_so.so_rcv_last_head);
-		ss->ss_so.so_rcv_head = NULL;
-		ss->ss_so.so_rcv_last_head = NULL;
+	if (sso->so_rcv_head != NULL) {
+		so_process_new_message(&ss->ss_so, sso->so_rcv_head,
+		    sso->so_rcv_last_head);
+		sso->so_rcv_head = NULL;
+		sso->so_rcv_last_head = NULL;
 	}
 
-	if (nss->ss_so.so_rcv_queued > 0) {
-		nmp = &ss->ss_so.so_rcv_q_head;
+	if (nso->so_rcv_queued > 0) {
+		nmp = &sso->so_rcv_q_head;
 		last_mp = NULL;
 		while ((mp = *nmp) != NULL) {
 			tmp = *(struct sctp_soassoc **)DB_BASE(mp);
@@ -560,13 +565,12 @@
 			if (tmp == ssa) {
 				*nmp = mp->b_next;
 				ASSERT(DB_TYPE(mp) != M_DATA);
-				if (nss->ss_so.so_rcv_q_last_head == NULL) {
-					nss->ss_so.so_rcv_q_head = mp;
+				if (nso->so_rcv_q_last_head == NULL) {
+					nso->so_rcv_q_head = mp;
 				} else {
-					nss->ss_so.so_rcv_q_last_head->b_next =
-					    mp;
+					nso->so_rcv_q_last_head->b_next = mp;
 				}
-				nss->ss_so.so_rcv_q_last_head = mp;
+				nso->so_rcv_q_last_head = mp;
 				mp->b_next = NULL;
 			} else {
 				nmp = &mp->b_next;
@@ -574,7 +578,7 @@
 			}
 		}
 
-		ss->ss_so.so_rcv_q_last_head = last_mp;
+		sso->so_rcv_q_last_head = last_mp;
 	}
 }