changeset 10051:2da8dfa3cb83

6836478 panic - ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE, file: ../../common/inet/ip/ip_if.c, line: 3301 6855949 ill_perim_enter/exit are unreferenced and may be removed
author Thirumalai Srinivasan <Thirumalai.Srinivasan@Sun.COM>
date Tue, 07 Jul 2009 10:46:23 -0700
parents 09746c0f5a44
children 86e6866e392d
files usr/src/uts/common/inet/ip.h usr/src/uts/common/inet/ip/ip_if.c
diffstat 2 files changed, 147 insertions(+), 64 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/inet/ip.h	Tue Jul 07 09:34:55 2009 -0700
+++ b/usr/src/uts/common/inet/ip.h	Tue Jul 07 10:46:23 2009 -0700
@@ -2179,7 +2179,8 @@
  * ill_resolver_mp		ipsq			only when ill is up
  * ill_down_mp			ipsq			ipsq
  * ill_dlpi_deferred		ill_lock		ill_lock
- * ill_dlpi_pending		ill_lock		ill_lock
+ * ill_dlpi_pending		ipsq + ill_lock		ipsq or ill_lock or
+ *							absence of ipsq writer.
  * ill_phys_addr_mp		ipsq + down ill		only when ill is up
  * ill_phys_addr		ipsq + down ill		only when ill is up
  *
--- a/usr/src/uts/common/inet/ip/ip_if.c	Tue Jul 07 09:34:55 2009 -0700
+++ b/usr/src/uts/common/inet/ip/ip_if.c	Tue Jul 07 10:46:23 2009 -0700
@@ -7609,6 +7609,51 @@
 }
 
 /*
+ * Return completion status of previously initiated DLPI operations on
+ * ills in the purview of an ipsq.
+ */
+static boolean_t
+ipsq_dlpi_done(ipsq_t *ipsq)
+{
+	ipsq_t		*ipsq_start;
+	phyint_t	*phyi;
+	ill_t		*ill;
+
+	ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
+	ipsq_start = ipsq;
+
+	do {
+		/*
+		 * The only current users of this function are ipsq_try_enter
+		 * and ipsq_enter, which have made sure that ipx_writer is
+		 * NULL before we reach here; ill_dlpi_pending is modified
+		 * only by an ipsq writer.
+		 */
+		ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
+		phyi = ipsq->ipsq_phyint;
+		/*
+		 * phyi could be NULL if a phyint that is part of an
+		 * IPMP group is being unplumbed. A more detailed
+		 * comment is in ipmp_grp_update_kstats()
+		 */
+		if (phyi != NULL) {
+			ill = phyi->phyint_illv4;
+			if (ill != NULL &&
+			    ill->ill_dlpi_pending != DL_PRIM_INVAL)
+				return (B_FALSE);
+
+			ill = phyi->phyint_illv6;
+			if (ill != NULL &&
+			    ill->ill_dlpi_pending != DL_PRIM_INVAL)
+				return (B_FALSE);
+		}
+
+	} while ((ipsq = ipsq->ipsq_next) != ipsq_start);
+
+	return (B_TRUE);
+}
+
+/*
  * Enter the ipsq corresponding to ill, by waiting synchronously till
  * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
  * will have to drain completely before ipsq_enter returns success.
@@ -7626,6 +7671,7 @@
 	ipsq_t	*ipsq;
 	ipxop_t *ipx;
 	boolean_t waited_enough = B_FALSE;
+	ip_stack_t *ipst = ill->ill_ipst;
 
 	/*
 	 * Note that the relationship between ill and ipsq is fixed as long as
@@ -7635,10 +7681,12 @@
 	 * while we're waiting.  We wait on ill_cv and rely on ipsq_exit()
 	 * waking up all ills in the xop when it becomes available.
 	 */
-	mutex_enter(&ill->ill_lock);
 	for (;;) {
+		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+		mutex_enter(&ill->ill_lock);
 		if (ill->ill_state_flags & ILL_CONDEMNED) {
 			mutex_exit(&ill->ill_lock);
+			rw_exit(&ipst->ips_ill_g_lock);
 			return (B_FALSE);
 		}
 
@@ -7648,9 +7696,12 @@
 		mutex_enter(&ipx->ipx_lock);
 
 		if (ipx->ipx_writer == NULL && (type == CUR_OP ||
-		    ipx->ipx_current_ipif == NULL || waited_enough))
+		    (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) ||
+		    waited_enough))
 			break;
 
+		rw_exit(&ipst->ips_ill_g_lock);
+
 		if (!force || ipx->ipx_writer != NULL) {
 			mutex_exit(&ipx->ipx_lock);
 			mutex_exit(&ipsq->ipsq_lock);
@@ -7662,6 +7713,7 @@
 			    &ill->ill_lock, lbolt + ENTER_SQ_WAIT_TICKS);
 			waited_enough = B_TRUE;
 		}
+		mutex_exit(&ill->ill_lock);
 	}
 
 	ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
@@ -7675,19 +7727,78 @@
 	mutex_exit(&ipx->ipx_lock);
 	mutex_exit(&ipsq->ipsq_lock);
 	mutex_exit(&ill->ill_lock);
+	rw_exit(&ipst->ips_ill_g_lock);
+
 	return (B_TRUE);
 }
 
-boolean_t
-ill_perim_enter(ill_t *ill)
-{
-	return (ipsq_enter(ill, B_FALSE, CUR_OP));
-}
-
-void
-ill_perim_exit(ill_t *ill)
-{
-	ipsq_exit(ill->ill_phyint->phyint_ipsq);
+/*
+ * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock
+ * across the call to the core interface ipsq_try_enter() and hence calls this
+ * function directly. This is explained more fully in ipif_set_values().
+ * In order to support the above constraint, ipsq_try_enter is implemented as
+ * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently.
+ */
+static ipsq_t *
+ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
+    int type, boolean_t reentry_ok)
+{
+	ipsq_t	*ipsq;
+	ipxop_t	*ipx;
+	ip_stack_t *ipst = ill->ill_ipst;
+
+	/*
+	 * lock ordering:
+	 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
+	 *
+	 * ipx of an ipsq can't change when ipsq_lock is held.
+	 */
+	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
+	GRAB_CONN_LOCK(q);
+	mutex_enter(&ill->ill_lock);
+	ipsq = ill->ill_phyint->phyint_ipsq;
+	mutex_enter(&ipsq->ipsq_lock);
+	ipx = ipsq->ipsq_xop;
+	mutex_enter(&ipx->ipx_lock);
+
+	/*
+	 * 1. Enter the ipsq if we are already writer and reentry is ok.
+	 *    (Note: If the caller does not specify reentry_ok then neither
+	 *    'func' nor any of its callees must ever attempt to enter the ipsq
+ *    again. Otherwise it can lead to an infinite loop.)
+	 * 2. Enter the ipsq if there is no current writer and this attempted
+	 *    entry is part of the current operation
+	 * 3. Enter the ipsq if there is no current writer and this is a new
+	 *    operation and the operation queue is empty and there is no
+	 *    operation currently in progress and if all previously initiated
+	 *    DLPI operations have completed.
+	 */
+	if ((ipx->ipx_writer == curthread && reentry_ok) ||
+	    (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
+	    !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL &&
+	    ipsq_dlpi_done(ipsq))))) {
+		/* Success. */
+		ipx->ipx_reentry_cnt++;
+		ipx->ipx_writer = curthread;
+		ipx->ipx_forced = B_FALSE;
+		mutex_exit(&ipx->ipx_lock);
+		mutex_exit(&ipsq->ipsq_lock);
+		mutex_exit(&ill->ill_lock);
+		RELEASE_CONN_LOCK(q);
+#ifdef DEBUG
+		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
+#endif
+		return (ipsq);
+	}
+
+	if (func != NULL)
+		ipsq_enq(ipsq, q, mp, func, type, ill);
+
+	mutex_exit(&ipx->ipx_lock);
+	mutex_exit(&ipsq->ipsq_lock);
+	mutex_exit(&ill->ill_lock);
+	RELEASE_CONN_LOCK(q);
+	return (NULL);
 }
 
 /*
@@ -7731,61 +7842,21 @@
 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
     ipsq_func_t func, int type, boolean_t reentry_ok)
 {
-	ipsq_t	*ipsq;
-	ipxop_t	*ipx;
+	ip_stack_t	*ipst;
+	ipsq_t		*ipsq;
 
 	/* Only 1 of ipif or ill can be specified */
 	ASSERT((ipif != NULL) ^ (ill != NULL));
+
 	if (ipif != NULL)
 		ill = ipif->ipif_ill;
-
-	/*
-	 * lock ordering: conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
-	 * ipx of an ipsq can't change when ipsq_lock is held.
-	 */
-	GRAB_CONN_LOCK(q);
-	mutex_enter(&ill->ill_lock);
-	ipsq = ill->ill_phyint->phyint_ipsq;
-	mutex_enter(&ipsq->ipsq_lock);
-	ipx = ipsq->ipsq_xop;
-	mutex_enter(&ipx->ipx_lock);
-
-	/*
-	 * 1. Enter the ipsq if we are already writer and reentry is ok.
-	 *    (Note: If the caller does not specify reentry_ok then neither
-	 *    'func' nor any of its callees must ever attempt to enter the ipsq
-	 *    again. Otherwise it can lead to an infinite loop
-	 * 2. Enter the ipsq if there is no current writer and this attempted
-	 *    entry is part of the current operation
-	 * 3. Enter the ipsq if there is no current writer and this is a new
-	 *    operation and the operation queue is empty and there is no
-	 *    operation currently in progress
-	 */
-	if ((ipx->ipx_writer == curthread && reentry_ok) ||
-	    (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
-	    !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL)))) {
-		/* Success. */
-		ipx->ipx_reentry_cnt++;
-		ipx->ipx_writer = curthread;
-		ipx->ipx_forced = B_FALSE;
-		mutex_exit(&ipx->ipx_lock);
-		mutex_exit(&ipsq->ipsq_lock);
-		mutex_exit(&ill->ill_lock);
-		RELEASE_CONN_LOCK(q);
-#ifdef DEBUG
-		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
-#endif
-		return (ipsq);
-	}
-
-	if (func != NULL)
-		ipsq_enq(ipsq, q, mp, func, type, ill);
-
-	mutex_exit(&ipx->ipx_lock);
-	mutex_exit(&ipsq->ipsq_lock);
-	mutex_exit(&ill->ill_lock);
-	RELEASE_CONN_LOCK(q);
-	return (NULL);
+	ipst = ill->ill_ipst;
+
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+	ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok);
+	rw_exit(&ipst->ips_ill_g_lock);
+
+	return (ipsq);
 }
 
 /*
@@ -18793,7 +18864,18 @@
 	/* Let SCTP know about this ILL */
 	sctp_update_ill(ill, SCTP_ILL_INSERT);
 
-	ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP,
+	/*
+	 * ill_glist_insert has made the ill visible globally, and
+	 * ill_phyint_reinit could have changed the ipsq. At this point,
+	 * we need to hold the ips_ill_g_lock across the call to enter the
+	 * ipsq to enforce atomicity and prevent reordering. In the event
+	 * the ipsq has changed, and if the new ipsq is currently busy,
+	 * we need to make sure that this half-completed ioctl is ahead of
+	 * any subsequent ioctl. We achieve this by not dropping the
+	 * ips_ill_g_lock which prevents any ill lookup itself thereby
+	 * ensuring that new ioctls can't start.
+	 */
+	ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP,
 	    B_TRUE);
 
 	rw_exit(&ipst->ips_ill_g_lock);