changeset 4154:bd1265f2f9de

6546475 rds_if_lookup_by_name fails for clrprivnet interface 6546482 ioctl call fails with EINTR 6546498 Panic at rds_handle_portup_event+0x98 due to hcap being NULL 6546511 crash dump showed multiple instances of rdsib driver loaded 6546528 RDS fails to failover sessions across HCAs (card failover) 6546543 Multiple path up/down calls corrupt rds_path_map causing system to panic
author agiri
date Thu, 03 May 2007 08:24:50 -0700
parents 20265a755883
children 3e38fab0382c
files usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c usr/src/uts/common/io/ib/clients/rds/rdsib.c usr/src/uts/common/io/ib/clients/rds/rdsib_buf.c usr/src/uts/common/io/ib/clients/rds/rdsib_cm.c usr/src/uts/common/io/ib/clients/rds/rdsib_ep.c usr/src/uts/common/io/ib/clients/rds/rdsib_ib.c usr/src/uts/common/io/ib/clients/rds/rdsib_sc.c usr/src/uts/common/sys/ib/clients/rds/rdsib_buf.h usr/src/uts/common/sys/ib/clients/rds/rdsib_ep.h
diffstat 9 files changed, 427 insertions(+), 108 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c	Thu May 03 03:28:00 2007 -0700
+++ b/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c	Thu May 03 08:24:50 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -60,6 +60,7 @@
 	vnode_t	*kvp, *vp;
 	TIUSER	*tiptr;
 	struct	strioctl iocb;
+	k_sigset_t smask;
 	int	err = 0;
 
 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
@@ -79,7 +80,9 @@
 	iocb.ic_timout = 0;
 	iocb.ic_len = len;
 	iocb.ic_dp = arg;
+	sigintr(&smask, 0);
 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
+	sigunintr(&smask);
 	(void) t_kclose(tiptr, 0);
 	VN_RELE(kvp);
 	return (err);
@@ -91,6 +94,7 @@
 	dl_info_req_t *info_req;
 	union DL_primitives *dl_prim;
 	mblk_t *mp;
+	k_sigset_t smask;
 	int error;
 
 	if ((mp = allocb(sizeof (dl_info_req_t), BPRI_MED)) == NULL) {
@@ -103,12 +107,16 @@
 	mp->b_wptr += sizeof (dl_info_req_t);
 	info_req->dl_primitive = DL_INFO_REQ;
 
+	sigintr(&smask, 0);
 	if ((error = ldi_putmsg(lh, mp)) != 0) {
+		sigunintr(&smask);
 		return (error);
 	}
 	if ((error = ldi_getmsg(lh, &mp, (timestruc_t *)NULL)) != 0) {
+		sigunintr(&smask);
 		return (error);
 	}
+	sigunintr(&smask);
 
 	dl_prim = (union DL_primitives *)(uintptr_t)mp->b_rptr;
 	switch (dl_prim->dl_primitive) {
@@ -131,7 +139,12 @@
 }
 
 
-static boolean_t
+/*
+ * Return 0 if the interface is IB.
+ * Return error (>0) if any error is encountered during processing.
+ * Return -1 if the interface is not IB and no error.
+ */
+static int
 rds_is_ib_interface(char *name)
 {
 
@@ -156,33 +169,35 @@
 		/*
 		 * null name.
 		 */
-		return (B_FALSE);
+		return (-1);
 	}
 
 	if (strncmp("lo", name, i) == 0) {
 		/*
 		 * loopback interface is considered RDS capable
 		 */
-		return (B_TRUE);
+		return (0);
 	}
 
 	(void) strncat((dev_path + sizeof ("/dev/") -1), name, i);
 
 	ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rds_li);
 	if (ret != 0) {
-		return (B_FALSE);
+		return (ret);
 	}
 
 	ret = rds_dl_info(lh, &info);
-
 	(void) ldi_close(lh, FREAD|FWRITE, kcred);
-
-	if (ret != 0 || (info.dl_mac_type != DL_IB &&
-	    !rds_transport_ops->rds_transport_if_lookup_by_name(name))) {
-		return (B_FALSE);
+	if (ret != 0) {
+		return (ret);
 	}
 
-	return (B_TRUE);
+	if (info.dl_mac_type != DL_IB &&
+	    !rds_transport_ops->rds_transport_if_lookup_by_name(name)) {
+		return (-1);
+	}
+
+	return (0);
 }
 
 void
@@ -226,8 +241,14 @@
 		ifr = kifc.ifc_req;
 		n = num_ifs;
 		for (num_ifs = 0; n > 0; ifr++) {
-			if (rds_is_ib_interface(ifr->ifr_name)) {
+			err = rds_is_ib_interface(ifr->ifr_name);
+			if (err == 0) {
 				num_ifs++;
+			} else if (err > 0) {
+				num_ifs = 0;
+				break;
+			} else {
+				err = 0;
 			}
 			n--;
 		}
@@ -277,17 +298,21 @@
 		for (; num_ifs > 0 &&
 		    (int)((uintptr_t)mp1->b_wptr - (uintptr_t)mp1->b_rptr) <
 		    ubuf_size; num_ifs--, ifr++) {
-			if (rds_is_ib_interface(ifr->ifr_name)) {
+			err = rds_is_ib_interface(ifr->ifr_name);
+			if (err == 0) {
 				ifr->ifr_addr.sa_family = AF_INET_OFFLOAD;
 				bcopy((caddr_t)ifr, ptr, sizeof (struct ifreq));
 				ptr++;
 				mp1->b_wptr = (uchar_t *)ptr;
+			} else if (err > 0) {
+				break;
+			} else {
+				err = 0;
 			}
 		}
 
 		STRUCT_FSET(ifc, ifc_len, (int)((uintptr_t)mp1->b_wptr -
 		    (uintptr_t)mp1->b_rptr));
-
 		kmem_free(kifc.ifc_buf, kifc.ifc_len);
 	}
 		break;
@@ -431,7 +456,7 @@
 
 		sin = (struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr;
 		if ((sin->sin_addr.s_addr == addr) &&
-		    rds_is_ib_interface(ifr->ifr_name)) {
+		    (rds_is_ib_interface(ifr->ifr_name) == 0)) {
 				ret = B_TRUE;
 				break;
 		}
--- a/usr/src/uts/common/io/ib/clients/rds/rdsib.c	Thu May 03 03:28:00 2007 -0700
+++ b/usr/src/uts/common/io/ib/clients/rds/rdsib.c	Thu May 03 08:24:50 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -270,6 +270,7 @@
 	    TASKQ_DEFAULTPRI, 0);
 	if (rds_taskq == NULL) {
 		RDS_DPRINTF1(LABEL, "ddi_taskq_create failed for rds_taskq");
+		rdsib_dev_info = NULL;
 		return (DDI_FAILURE);
 	}
 
@@ -278,6 +279,7 @@
 		cmn_err(CE_CONT, "ddi_create_minor_node failed: %d", ret);
 		ddi_taskq_destroy(rds_taskq);
 		rds_taskq = NULL;
+		rdsib_dev_info = NULL;
 		return (DDI_FAILURE);
 	}
 
@@ -313,6 +315,8 @@
 		rds_taskq = NULL;
 	}
 
+	rdsib_dev_info = NULL;
+
 	RDS_DPRINTF4("rdsib_detach", "return");
 
 	return (DDI_SUCCESS);
--- a/usr/src/uts/common/io/ib/clients/rds/rdsib_buf.c	Thu May 03 03:28:00 2007 -0700
+++ b/usr/src/uts/common/io/ib/clients/rds/rdsib_buf.c	Thu May 03 08:24:50 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
@@ -523,6 +523,107 @@
 	return (0);
 }
 
+/*
+ * Re-register an endpoint's send pool memory (and, for a data endpoint,
+ * its acknowledgement space) with the HCA identified by hca_guid.
+ * Registrations held on the HCA recorded in ep->ep_hca_guid are released
+ * first when that HCA can still be found. On success the new LKEY is
+ * stored in every buffer on the pool list. Returns 0, or -1 on failure.
+ */
+int
+rds_reinit_send_pool(rds_ep_t *ep, ib_guid_t hca_guid)
+{
+	rds_buf_t	*bp;
+	rds_hca_t	*hcap;
+	ibt_mr_attr_t   mem_attr;
+	ibt_mr_desc_t   mem_desc;
+	rds_bufpool_t   *spool;
+	int		ret;
+
+	RDS_DPRINTF2("rds_reinit_send_pool", "Enter: EP(%p)", ep);
+	spool = &ep->ep_sndpool;
+	ASSERT(spool->pool_memp != NULL);
+
+	/* deregister the send pool memory from the previous HCA */
+	hcap = rds_get_hcap(rdsib_statep, ep->ep_hca_guid);
+	if (hcap == NULL) {
+		RDS_DPRINTF2("rds_reinit_send_pool", "HCA (0x%llx) not found",
+		    ep->ep_hca_guid);
+	} else {
+		if (ep->ep_snd_mrhdl != NULL) {
+			(void) ibt_deregister_mr(hcap->hca_hdl,
+			    ep->ep_snd_mrhdl);
+			ep->ep_snd_mrhdl = NULL;
+			ep->ep_snd_lkey = 0;
+		}
+		if ((ep->ep_type == RDS_EP_TYPE_DATA) &&
+		    (ep->ep_ackhdl != NULL)) {
+			(void) ibt_deregister_mr(hcap->hca_hdl, ep->ep_ackhdl);
+			ep->ep_ackhdl = NULL;
+			ep->ep_ack_rkey = 0;
+		}
+		/* registrations are gone; ib_guid_t is an integer, so use 0 */
+		ep->ep_hca_guid = 0;
+	}
+
+	/* get the hcap for the new HCA */
+	hcap = rds_get_hcap(rdsib_statep, hca_guid);
+	if (hcap == NULL) {
+		RDS_DPRINTF2("rds_reinit_send_pool", "HCA (0x%llx) not found",
+		    hca_guid);
+		return (-1);
+	}
+
+	/* register the send memory */
+	mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)spool->pool_memp;
+	mem_attr.mr_len = spool->pool_memsize;
+	mem_attr.mr_as = NULL;
+	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
+
+	ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl,
+	    &mem_attr, &ep->ep_snd_mrhdl, &mem_desc);
+	if (ret != IBT_SUCCESS) {
+		RDS_DPRINTF2("rds_reinit_send_pool",
+		    "EP(%p): ibt_register_mr failed: %d", ep, ret);
+		return (-1);
+	}
+	ep->ep_snd_lkey = mem_desc.md_lkey;
+
+	/* register the acknowledgement space */
+	if (ep->ep_type == RDS_EP_TYPE_DATA) {
+		mem_attr.mr_vaddr = (ib_vaddr_t)ep->ep_ack_addr;
+		mem_attr.mr_len = sizeof (uintptr_t);
+		mem_attr.mr_as = NULL;
+		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
+		    IBT_MR_ENABLE_REMOTE_WRITE;
+
+		ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl,
+		    &mem_attr, &ep->ep_ackhdl, &mem_desc);
+		if (ret != IBT_SUCCESS) {
+			RDS_DPRINTF2("rds_reinit_send_pool",
+			    "EP(%p): ibt_register_mr for ack failed: %d",
+			    ep, ret);
+			(void) ibt_deregister_mr(hcap->hca_hdl,
+			    ep->ep_snd_mrhdl);
+			ep->ep_snd_mrhdl = NULL;
+			ep->ep_snd_lkey = 0;
+			return (-1);
+		}
+		ep->ep_ack_rkey = mem_desc.md_rkey;
+		/* update the LKEY in the acknowledgement WR */
+		ep->ep_ackds.ds_key = ep->ep_snd_lkey;
+	}
+
+	/* update the LKEY in each buffer */
+	for (bp = spool->pool_headp; bp != NULL; bp = bp->buf_nextp)
+		bp->buf_ds.ds_key = ep->ep_snd_lkey;
+
+	ep->ep_hca_guid = hca_guid;
+	RDS_DPRINTF2("rds_reinit_send_pool", "Return: EP(%p)", ep);
+
+	return (0);
+}
+
 void
 rds_free_recv_pool(rds_ep_t *ep)
 {
--- a/usr/src/uts/common/io/ib/clients/rds/rdsib_cm.c	Thu May 03 03:28:00 2007 -0700
+++ b/usr/src/uts/common/io/ib/clients/rds/rdsib_cm.c	Thu May 03 08:24:50 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
@@ -103,7 +103,6 @@
 	rds_session_t		*sp;
 	rds_ep_t		*ep;
 	ibt_channel_hdl_t	chanhdl;
-	rds_hca_t		*hcap;
 	int			ret;
 
 	RDS_DPRINTF2("rds_handle_cm_req", "Enter");
@@ -152,6 +151,16 @@
 		return (IBT_CM_REJECT);
 	}
 
+	/*
+	 * RDS needs more time to process a failover REQ so send an MRA.
+	 * Otherwise, the remote may retry the REQ and fail the connection.
+	 */
+	if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
+		RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
+		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
+		    10000000 /* 10 sec */, NULL, 0);
+	}
+
 	/* Is there a session to the destination node? */
 	rw_enter(&statep->rds_sessionlock, RW_READER);
 	sp = rds_session_lkup(statep, cmp.cmp_localip, rgid.gid_guid);
@@ -199,21 +208,6 @@
 				sp->session_myip = cmp.cmp_remip;
 				sp->session_lgid = lgid;
 				sp->session_rgid = rgid;
-				hcap = rds_gid_to_hcap(statep, lgid);
-
-				/* change the data channel */
-				mutex_enter(&sp->session_dataep.ep_lock);
-				sp->session_dataep.ep_myip = cmp.cmp_remip;
-				sp->session_dataep.ep_hca_guid =
-				    hcap->hca_guid;
-				mutex_exit(&sp->session_dataep.ep_lock);
-
-				/* change the control channel */
-				mutex_enter(&sp->session_ctrlep.ep_lock);
-				sp->session_ctrlep.ep_myip = cmp.cmp_remip;
-				sp->session_ctrlep.ep_hca_guid =
-				    hcap->hca_guid;
-				mutex_exit(&sp->session_ctrlep.ep_lock);
 			}
 		}
 	}
@@ -237,23 +231,22 @@
 
 		/* move the session to init state */
 		rw_enter(&sp->session_lock, RW_WRITER);
-		sp->session_state = RDS_SESSION_STATE_INIT;
+		ret = rds_session_reinit(sp, lgid);
 		sp->session_myip = cmp.cmp_remip;
 		sp->session_lgid = lgid;
 		sp->session_rgid = rgid;
-		hcap = rds_gid_to_hcap(statep, lgid);
-
-		/* change the data channel */
-		mutex_enter(&sp->session_dataep.ep_lock);
-		sp->session_dataep.ep_myip = cmp.cmp_remip;
-		sp->session_dataep.ep_hca_guid = hcap->hca_guid;
-		mutex_exit(&sp->session_dataep.ep_lock);
-
-		/* change the control channel */
-		mutex_enter(&sp->session_ctrlep.ep_lock);
-		sp->session_ctrlep.ep_myip = cmp.cmp_remip;
-		sp->session_ctrlep.ep_hca_guid = hcap->hca_guid;
-		mutex_exit(&sp->session_ctrlep.ep_lock);
+		if (ret != 0) {
+			rds_session_fini(sp);
+			sp->session_state = RDS_SESSION_STATE_FAILED;
+			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
+			    "RDS_SESSION_STATE_FAILED", sp);
+			rw_exit(&sp->session_lock);
+			return (IBT_CM_REJECT);
+		} else {
+			sp->session_state = RDS_SESSION_STATE_INIT;
+			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
+			    "RDS_SESSION_STATE_INIT", sp);
+		}
 
 		if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
 			ep = &sp->session_ctrlep;
@@ -333,15 +326,6 @@
 			 */
 			ASSERT(sp->session_type == RDS_SESSION_ACTIVE);
 			ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
-			ep->ep_myip = cmp.cmp_remip;
-			hcap = rds_gid_to_hcap(statep, lgid);
-			ep->ep_hca_guid = hcap->hca_guid;
-
-			/* change the control channel too */
-			mutex_enter(&sp->session_ctrlep.ep_lock);
-			sp->session_ctrlep.ep_myip = cmp.cmp_remip;
-			sp->session_ctrlep.ep_hca_guid = hcap->hca_guid;
-			mutex_exit(&sp->session_dataep.ep_lock);
 
 			rw_enter(&sp->session_lock, RW_WRITER);
 			sp->session_type = RDS_SESSION_PASSIVE;
@@ -565,6 +549,15 @@
 			sp->session_state = RDS_SESSION_STATE_ERROR;
 			RDS_DPRINTF3("rds_handle_cm_event_failure",
 			    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
+
+			/*
+			 * Store the cm_channel for freeing later
+			 * Active side frees it on ibt_open_rc_channel
+			 * failure
+			 */
+			if (ep->ep_chanhdl == NULL) {
+				ep->ep_chanhdl = evp->cm_channel;
+			}
 			rw_exit(&sp->session_lock);
 
 			/*
@@ -788,6 +781,7 @@
 		ep->ep_recvcq = NULL;
 		(void) ibt_free_cq(ep->ep_sendcq);
 		ep->ep_sendcq = NULL;
+		return (-1);
 	}
 
 	*chanhdl = hdl;
@@ -795,7 +789,7 @@
 	RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
 	    *chanhdl);
 
-	return (ret);
+	return (0);
 }
 
 int
--- a/usr/src/uts/common/io/ib/clients/rds/rdsib_ep.c	Thu May 03 03:28:00 2007 -0700
+++ b/usr/src/uts/common/io/ib/clients/rds/rdsib_ep.c	Thu May 03 08:24:50 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
@@ -304,6 +304,31 @@
 	return (0);
 }
 
+/*
+ * Re-initialize an endpoint for the HCA identified by hca_guid.
+ * The send pool is re-initialized first; the receive pool is freed only
+ * when that succeeds. Returns 0 on success and -1 on failure.
+ */
+static int
+rds_ep_reinit(rds_ep_t *ep, ib_guid_t hca_guid)
+{
+	int	status;
+
+	RDS_DPRINTF3("rds_ep_reinit", "Enter: EP(%p) Type: %d",
+	    ep, ep->ep_type);
+	status = rds_reinit_send_pool(ep, hca_guid);
+	if (status == 0) {
+		/* free all the receive buffers in the pool */
+		rds_free_recv_pool(ep);
+		RDS_DPRINTF3("rds_ep_reinit", "Return: EP(%p) Type: %d",
+		    ep, ep->ep_type);
+		return (0);
+	}
+	RDS_DPRINTF2("rds_ep_reinit",
+	    "EP(%p): rds_reinit_send_pool failed: %d", ep, status);
+	return (-1);
+}
+
 void
 rds_session_fini(rds_session_t *sp)
 {
@@ -354,6 +379,74 @@
 	return (0);
 }
 
+/*
+ * Prepare a session for the move from ERROR state back to INIT state.
+ * If lgid is on a different HCA than the session's current local GID,
+ * or the current HCA can no longer be found, the control and data
+ * endpoints are re-initialized against the new HCA so that the HCA keys
+ * are updated in case the session has moved from one HCA to another.
+ * The caller must hold the session lock as writer.
+ * Returns 0 on success (including when no work was needed), else -1.
+ */
+int
+rds_session_reinit(rds_session_t *sp, ib_gid_t lgid)
+{
+	rds_hca_t	*newhca, *oldhca;
+
+	RDS_DPRINTF2("rds_session_reinit", "Enter: SP(0x%p)", sp);
+
+	/* the new local GID must belong to an HCA we know about */
+	newhca = rds_gid_to_hcap(rdsib_statep, lgid);
+	if (newhca == NULL) {
+		RDS_DPRINTF1("rds_session_reinit", "SGID is on an "
+		    "uninitialized HCA: %llx", lgid.gid_guid);
+		return (-1);
+	}
+
+	/* look up the HCA the session has been using so far */
+	oldhca = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
+	if ((oldhca != NULL) && (oldhca->hca_guid == newhca->hca_guid)) {
+		/* the session stayed on one HCA, so nothing needs redoing */
+		RDS_DPRINTF2("rds_session_reinit", "Failover on the same HCA");
+		return (0);
+	}
+
+	if (oldhca == NULL) {
+		RDS_DPRINTF1("rds_session_reinit", "Seems like HCA %llx "
+		    "is unplugged", sp->session_lgid.gid_guid);
+	}
+
+	RDS_DPRINTF2("rds_session_reinit", "Failover across HCAs");
+
+	/* control channel first */
+	if (rds_ep_reinit(&sp->session_ctrlep, newhca->hca_guid) != 0) {
+		RDS_DPRINTF2("rds_session_reinit",
+		    "SP(%p): Ctrl EP(%p) re-initialization failed",
+		    sp, &sp->session_ctrlep);
+		return (-1);
+	}
+
+	RDS_DPRINTF2("rds_session_reinit", "SP(%p) Control EP(%p)",
+	    sp, &sp->session_ctrlep);
+
+	/* then the data channel */
+	if (rds_ep_reinit(&sp->session_dataep, newhca->hca_guid) != 0) {
+		RDS_DPRINTF2("rds_session_reinit",
+		    "SP(%p): Data EP(%p) re-initialization failed",
+		    sp, &sp->session_dataep);
+		return (-1);
+	}
+
+	RDS_DPRINTF2("rds_session_reinit", "SP(%p) Data EP(%p)",
+	    sp, &sp->session_dataep);
+
+	sp->session_lgid = lgid;
+
+	RDS_DPRINTF2("rds_session_reinit", "Return: SP(0x%p)", sp);
+
+	return (0);
+}
+
 static int
 rds_session_connect(rds_session_t *sp)
 {
@@ -409,7 +502,7 @@
 		ret = rds_open_rc_channel(ep, &pinfo, IBT_BLOCKING, &datachan);
 		if (ret != IBT_SUCCESS) {
 			RDS_DPRINTF2(LABEL, "EP(%p): rds_open_rc_channel "
-			    "failed: %d", ret);
+			    "failed: %d", ep, ret);
 			return (-1);
 		}
 		sp->session_dataep.ep_chanhdl = datachan;
@@ -442,6 +535,9 @@
 		return (-1);
 	}
 
+	RDS_DPRINTF2(LABEL, "Session (%p) 0x%x <--> 0x%x is CONNECTED",
+	    sp, sp->session_myip, sp->session_remip);
+
 	RDS_DPRINTF2("rds_session_connect", "Return SP(%p)", sp);
 
 	return (0);
@@ -637,6 +733,8 @@
 		if (sp->session_type == RDS_SESSION_ACTIVE) {
 			rds_session_fini(sp);
 			sp->session_state = RDS_SESSION_STATE_FAILED;
+			RDS_DPRINTF3("rds_failover_session",
+			    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
 		} else {
 			RDS_DPRINTF2("rds_failover_session",
 			    "SP(%p) has become passive", sp);
@@ -662,9 +760,21 @@
 	}
 
 	/* move the session to init state */
-	sp->session_state = RDS_SESSION_STATE_INIT;
+	ret = rds_session_reinit(sp, lgid);
 	sp->session_lgid = lgid;
 	sp->session_rgid = rgid;
+	if (ret != 0) {
+		rds_session_fini(sp);
+		sp->session_state = RDS_SESSION_STATE_FAILED;
+		RDS_DPRINTF3("rds_failover_session",
+		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
+		rw_exit(&sp->session_lock);
+		return;
+	} else {
+		sp->session_state = RDS_SESSION_STATE_INIT;
+		RDS_DPRINTF3("rds_failover_session",
+		    "SP(%p) State RDS_SESSION_STATE_INIT", sp);
+	}
 	rw_exit(&sp->session_lock);
 
 	rds_session_open(sp);
@@ -887,9 +997,6 @@
 		return;
 	}
 
-	RDS_DPRINTF2(LABEL, "Session (%p) 0x%x <--> 0x%x is CONNECTED",
-	    sp, sp->session_myip, sp->session_remip);
-
 	RDS_DPRINTF2("rds_session_open", "Return: SP(%p)", sp);
 }
 
--- a/usr/src/uts/common/io/ib/clients/rds/rdsib_ib.c	Thu May 03 03:28:00 2007 -0700
+++ b/usr/src/uts/common/io/ib/clients/rds/rdsib_ib.c	Thu May 03 08:24:50 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
@@ -184,7 +184,7 @@
 	uint_t		ix, hcaix, nhcas;
 	int		ret;
 
-	RDS_DPRINTF4("rdsib_open_ib", "enter");
+	RDS_DPRINTF4("rdsib_open_ib", "enter: statep %p", rdsib_statep);
 
 	ASSERT(rdsib_statep != NULL);
 	if (rdsib_statep == NULL) {
@@ -309,7 +309,7 @@
 		}
 	}
 
-	RDS_DPRINTF4("rdsib_open_ib", "return");
+	RDS_DPRINTF4("rdsib_open_ib", "return: statep %p", rdsib_statep);
 
 	return (0);
 }
@@ -320,10 +320,10 @@
 void
 rdsib_close_ib()
 {
-	rds_hca_t	*hcap;
+	rds_hca_t	*hcap, *nextp;
 	int		ret;
 
-	RDS_DPRINTF4("rds_close_ib", "enter");
+	RDS_DPRINTF2("rds_close_ib", "enter: statep %p", rdsib_statep);
 
 	if (rdsib_statep->rds_srvhdl != NULL) {
 		(void) ibt_unbind_all_services(rdsib_statep->rds_srvhdl);
@@ -334,10 +334,15 @@
 	/* close and destroy all the sessions */
 	rds_close_sessions(NULL);
 
-	/* Release all IB resources */
+	/* Release all HCA resources */
+	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
 	hcap = rdsib_statep->rds_hcalistp;
+	rdsib_statep->rds_hcalistp = NULL;
+	rdsib_statep->rds_nhcas = 0;
+	rw_exit(&rdsib_statep->rds_hca_lock);
+
 	while (hcap != NULL) {
-		rdsib_statep->rds_hcalistp = hcap->hca_nextp;
+		nextp = hcap->hca_nextp;
 
 		ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
 		ASSERT(ret == IBT_SUCCESS);
@@ -348,7 +353,7 @@
 		ASSERT(ret == IBT_SUCCESS);
 
 		kmem_free(hcap, sizeof (rds_hca_t));
-		hcap = rdsib_statep->rds_hcalistp;
+		hcap = nextp;
 	}
 
 	/* Deregister with IBTF */
@@ -357,7 +362,7 @@
 		rdsib_statep->rds_ibhdl = NULL;
 	}
 
-	RDS_DPRINTF4("rds_close_ib", "return");
+	RDS_DPRINTF2("rds_close_ib", "return: statep %p", rdsib_statep);
 }
 
 /* Return hcap, given the hca guid */
@@ -387,20 +392,33 @@
 rds_hca_t *
 rds_gid_to_hcap(rds_state_t *statep, ib_gid_t gid)
 {
-	ibt_node_info_t	nodeinfo;
-	int		ret;
+	rds_hca_t	*hcap;
+	uint_t		ix;
 
 	RDS_DPRINTF4("rds_gid_to_hcap", "Enter: statep: 0x%p gid: %llx:%llx",
 	    statep, gid.gid_prefix, gid.gid_guid);
 
-	ret = ibt_gid_to_node_info(gid, &nodeinfo);
-	if (ret != IBT_SUCCESS) {
-		RDS_DPRINTF2(LABEL, "ibt_gid_node_info for gid: %llx:%llx "
-		    "failed", gid.gid_prefix, gid.gid_guid);
-		return (NULL);
+	rw_enter(&statep->rds_hca_lock, RW_READER);
+
+	hcap = statep->rds_hcalistp;
+	while (hcap != NULL) {
+		for (ix = 0; ix < hcap->hca_nports; ix++) {
+			if ((hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_prefix ==
+			    gid.gid_prefix) &&
+			    (hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_guid ==
+			    gid.gid_guid)) {
+				RDS_DPRINTF4("rds_gid_to_hcap",
+				    "gid found in hcap: 0x%p", hcap);
+				rw_exit(&statep->rds_hca_lock);
+				return (hcap);
+			}
+		}
+		hcap = hcap->hca_nextp;
 	}
 
-	return (rds_get_hcap(statep, nodeinfo.n_node_guid));
+	rw_exit(&statep->rds_hca_lock);
+
+	return (NULL);
 }
 
 /* This is called from the send CQ handler */
@@ -1053,18 +1071,23 @@
 	ibt_cq_attr_t			scqattr, rcqattr;
 	ibt_rc_chan_alloc_args_t	chanargs;
 	ibt_channel_hdl_t		chanhdl;
+	rds_session_t			*sp;
 	rds_hca_t			*hcap;
 
 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Enter: 0x%p port: %d",
 	    ep, hca_port);
 
-	/* get the hcap for the HCA hosting this channel */
-	hcap = rds_get_hcap(rdsib_statep, ep->ep_hca_guid);
-	if (hcap == NULL) {
-		RDS_DPRINTF2("rds_ep_alloc_rc_channel",
-		    "HCA (0x%llx) not found", ep->ep_hca_guid);
-		return (NULL);
-	}
+	/* Update the EP with the right IP address and HCA guid */
+	sp = ep->ep_sp;
+	ASSERT(sp != NULL);
+	rw_enter(&sp->session_lock, RW_READER);
+	mutex_enter(&ep->ep_lock);
+	ep->ep_myip = sp->session_myip;
+	ep->ep_remip = sp->session_remip;
+	hcap = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
+	ep->ep_hca_guid = hcap->hca_guid;
+	mutex_exit(&ep->ep_lock);
+	rw_exit(&sp->session_lock);
 
 	/* reset taskqpending flag here */
 	ep->ep_recvqp.qp_taskqpending = B_FALSE;
@@ -1217,11 +1240,15 @@
 	ib_gid_t		gid;
 	int			ret;
 
-	RDS_DPRINTF2("rds_handle_portup_event", "Enter: GUID: 0x%llx",
-	    event->ev_hca_guid);
+	RDS_DPRINTF2("rds_handle_portup_event",
+	    "Enter: GUID: 0x%llx Statep: %p", event->ev_hca_guid, statep);
 
 	hcap = rds_get_hcap(statep, event->ev_hca_guid);
-	ASSERT(hcap != NULL);
+	if (hcap == NULL) {
+		RDS_DPRINTF2("rds_handle_portup_event", "HCA: 0x%llx is "
+		    "not in our list", event->ev_hca_guid);
+		return;
+	}
 
 	ret = ibt_query_hca_ports(hdl, 0, &newpinfop, &nport, &newsize);
 	if (ret != IBT_SUCCESS) {
--- a/usr/src/uts/common/io/ib/clients/rds/rdsib_sc.c	Thu May 03 03:28:00 2007 -0700
+++ b/usr/src/uts/common/io/ib/clients/rds/rdsib_sc.c	Thu May 03 08:24:50 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -64,8 +64,8 @@
 	ipaddr_t			ribd_ip;
 	struct rds_path_record_s	*up;
 	struct rds_path_record_s	*downp;
-	char				lifname[LIFNAMSIZ];
-	char				rifname[LIFNAMSIZ];
+	char				lifname[MAXNAMELEN];
+	char				rifname[MAXNAMELEN];
 } rds_path_record_t;
 
 typedef struct rds_node_record_s {
@@ -79,6 +79,43 @@
 kmutex_t		rds_pathmap_lock;
 rds_node_record_t	*rds_pathmap = NULL;
 
+/*
+ * Decide whether both ends of a path are IPoIB interfaces.
+ * Each interface name is split into a device name and an instance
+ * number, and the device name must be exactly "ibd". The local end is
+ * examined before the remote end. Returns B_TRUE only when both ends
+ * qualify; every rejection is logged.
+ */
+static boolean_t
+rds_validate_interface(rds_path_t *path)
+{
+	char		drv[MAXNAMELEN];
+	uint_t		inst;
+	boolean_t	is_ib = B_FALSE;
+
+	/* separate devname and instance number, local end first */
+	if (ddi_parse(path->local.ifname, drv, &inst) != DDI_SUCCESS) {
+		RDS_DPRINTF2("rds_validate_interface",
+		    "local: %s is not right", path->local.ifname);
+	} else if (strcmp(drv, "ibd") != 0) {
+		/* don't care if it is not IPoIB interface */
+		RDS_DPRINTF2("rds_validate_interface",
+		    "local: %s is not IB interface", drv);
+	} else if (ddi_parse(path->remote.ifname, drv, &inst) != DDI_SUCCESS) {
+		RDS_DPRINTF2("rds_validate_interface",
+		    "remote: %s is not right", path->remote.ifname);
+	} else if (strcmp(drv, "ibd") != 0) {
+		/* don't care if it is not IPoIB interface */
+		RDS_DPRINTF2("rds_validate_interface",
+		    "remote: %s is not IB interface", drv);
+	} else {
+		/* both names parsed and both device names are "ibd" */
+		is_ib = B_TRUE;
+	}
+
+	return (is_ib);
+}
+
 /*
  * Called by SC on discovering a new path
  */
@@ -91,11 +128,8 @@
 	ASSERT(path != NULL);
 
 	/* don't care if it is not IPoIB interface */
-	if ((bcmp(path->local.ifname, "ibd", 3) != 0) ||
-	    (bcmp(path->remote.ifname, "ibd", 3) != 0)) {
-		RDS_DPRINTF3("rds_path_up",
-		    "(%s | %s) Not IPoIB interface, ignore",
-		    path->local.ifname, path->remote.ifname);
+	if (rds_validate_interface(path) == B_FALSE) {
+		RDS_DPRINTF2("rds_path_up", "NOT IB interface");
 		return;
 	}
 
@@ -164,11 +198,8 @@
 	ASSERT(path != NULL);
 
 	/* don't care if it is not IPoIB interface */
-	if ((bcmp(path->local.ifname, "ibd", 3) != 0) ||
-	    (bcmp(path->remote.ifname, "ibd", 3) != 0)) {
-		RDS_DPRINTF3("rds_path_down",
-		    "(%s | %s) Not IPoIB interface, ignore",
-		    path->local.ifname, path->remote.ifname);
+	if (rds_validate_interface(path) == B_FALSE) {
+		RDS_DPRINTF2("rds_path_down", "NOT IB interface");
 		return;
 	}
 
@@ -227,7 +258,7 @@
 		} else {
 			/* this is the first node record */
 			ASSERT(p == rds_pathmap);
-			rds_pathmap = p;
+			rds_pathmap = p->nextp;
 		}
 
 		if (p->nextp) {
@@ -276,10 +307,37 @@
 {
 	rds_node_record_t	*p;
 	rds_path_record_t	*p1;
+	char			devname[MAXNAMELEN];
+	uint_t			instance;
+
+	if (ddi_parse(if_name, devname, &instance) != DDI_SUCCESS) {
+		RDS_DPRINTF2("rds_if_lookup_by_name",
+		    "if_name: %s is not right", if_name);
+		return (B_FALSE);
+	}
 
 	mutex_enter(&rds_pathmap_lock);
 
+	if (rds_pathmap == NULL) {
+		/* SC is not configured */
+		RDS_DPRINTF2("rds_if_lookup_by_name", "Pathmap is NULL");
+		mutex_exit(&rds_pathmap_lock);
+		return (B_FALSE);
+	}
+
+	/*
+	 * Sun Cluster always names its interconnect virtual network interface
+	 * as clprivnetx, so return TRUE if there is at least one node record
+	 * and the interface name is clprivnet something.
+	 */
+	if (strcmp(devname, "clprivnet") == 0) {
+		/* clprivnet address */
+		mutex_exit(&rds_pathmap_lock);
+		return (B_TRUE);
+	}
+
 	p = rds_pathmap;
+
 	while (p != NULL) {
 		p1 = p->downp;
 		while ((p1 != NULL) && strcmp(if_name, p1->lifname)) {
--- a/usr/src/uts/common/sys/ib/clients/rds/rdsib_buf.h	Thu May 03 03:28:00 2007 -0700
+++ b/usr/src/uts/common/sys/ib/clients/rds/rdsib_buf.h	Thu May 03 08:24:50 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
@@ -156,6 +156,7 @@
 int rds_init_recv_caches(rds_state_t *statep);
 void rds_free_recv_caches(rds_state_t *statep);
 int rds_init_send_pool(struct rds_ep_s *ep);
+int rds_reinit_send_pool(struct rds_ep_s *ep, ib_guid_t hca_guid);
 void rds_free_send_pool(struct rds_ep_s *ep);
 int rds_init_recv_pool(struct rds_ep_s *ep);
 void rds_free_recv_pool(struct rds_ep_s *ep);
--- a/usr/src/uts/common/sys/ib/clients/rds/rdsib_ep.h	Thu May 03 03:28:00 2007 -0700
+++ b/usr/src/uts/common/sys/ib/clients/rds/rdsib_ep.h	Thu May 03 08:24:50 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
@@ -149,7 +149,7 @@
  *	(5) Failure in rds_session_init()
  *	(6) rds_sendmsg(3SOCKET)/Incoming CM REQ
  *	(7) Failure in rds_session_open()
- *	(8) rds_session_close() and rds_get_ibaddr()
+ *	(8) rds_session_close(), rds_get_ibaddr() and rds_session_reinit()
  *	(9) rds_session_close() and rds_session_fini()
  *	(9) rds_cleanup_passive_session() and rds_passive_session_fini()
  *	(10) Connection Error/Incoming REQ
@@ -309,6 +309,7 @@
 rds_session_t *rds_session_create(rds_state_t *statep, ipaddr_t destip,
     ipaddr_t srcip, ibt_cm_req_rcv_t *reqp, uint8_t type);
 int rds_session_init(rds_session_t *sp);
+int rds_session_reinit(rds_session_t *sp, ib_gid_t lgid);
 void rds_session_open(rds_session_t *sp);
 void rds_session_close(rds_session_t *sp, ibt_execution_mode_t mode,
     uint_t wait);
@@ -320,6 +321,7 @@
 void rds_received_msg(rds_ep_t *ep, rds_buf_t *bp);
 void rds_handle_control_message(rds_session_t *sp, rds_ctrl_pkt_t *cp);
 void rds_handle_send_error(rds_ep_t *ep);
+void rds_session_fini(rds_session_t *sp);
 void rds_passive_session_fini(rds_session_t *sp);
 void rds_cleanup_passive_session(void *arg);