changeset 8452:89d32dfdae6e

6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
6692015 SVM global reader-writer mutex priority inversion causes deadlock under load
6725904 callers of meta_getdnp_bydevid() should do so correctly
6726615 Bruichladdich - SVM support for sQFS on mirrors in SunCluster
6756133 MD_MN_MSG_MDDB_PARSE message passes incorrect message size when used
6758399 mdmn_ksend_message() retries door_ki_upcall() without resetting data_ptr/data_size
6766848 mdcommd assumes SVCXPRT will survive thr_create()
6769738 md_mps_t not completely cleared before re-use
author John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
date Wed, 24 Dec 2008 08:23:40 -0700
parents 576ca2bde8d6
children 15fa4bb77d8c
files usr/src/cmd/lvm/rpc.mdcommd/mdcomm.xml usr/src/cmd/lvm/rpc.mdcommd/mddoors.c usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_service.c usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c usr/src/cmd/lvm/util/metaclust.c usr/src/cmd/mdb/common/modules/md/dumpmirror.c usr/src/cmd/mdb/common/modules/md/md.c usr/src/cmd/mdb/common/modules/md/metastat.c usr/src/cmd/mdb/intel/amd64/md/Makefile usr/src/cmd/mdb/intel/ia32/md/Makefile usr/src/cmd/mdb/sparc/v9/md/Makefile usr/src/head/meta.h usr/src/lib/lvm/libmeta/common/mapfile-vers usr/src/lib/lvm/libmeta/common/meta_db.c usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c usr/src/lib/lvm/libmeta/common/meta_mn_comm.c usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c usr/src/lib/lvm/libmeta/common/meta_mn_subr.c usr/src/lib/lvm/libmeta/common/meta_nameinfo.c usr/src/lib/lvm/libmeta/common/meta_runtime.c usr/src/lib/lvm/libmeta/common/meta_set.c usr/src/lib/lvm/libmeta/common/meta_set_hst.c usr/src/lib/lvm/libmeta/common/meta_sp.c usr/src/uts/common/io/lvm/md/md.c usr/src/uts/common/io/lvm/md/md_ioctl.c usr/src/uts/common/io/lvm/md/md_mddb.c usr/src/uts/common/io/lvm/md/md_subr.c usr/src/uts/common/io/lvm/mirror/mirror.c usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c usr/src/uts/common/io/lvm/mirror/mirror_resync.c usr/src/uts/common/io/lvm/softpart/sp.c usr/src/uts/common/io/lvm/softpart/sp_ioctl.c usr/src/uts/common/sys/lvm/md_mirror.h usr/src/uts/common/sys/lvm/md_sp.h usr/src/uts/common/sys/lvm/mdio.h usr/src/uts/common/sys/lvm/mdmn_commd.x usr/src/uts/common/sys/lvm/mdvar.h
diffstat 39 files changed, 3275 insertions(+), 1362 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdcomm.xml	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/lvm/rpc.mdcommd/mdcomm.xml	Wed Dec 24 08:23:40 2008 -0700
@@ -1,7 +1,7 @@
 <?xml version='1.0'?>
 <!DOCTYPE service_bundle SYSTEM '/usr/share/lib/xml/dtd/service_bundle.dtd.1'>
 <!--
- Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  Use is subject to license terms.
 
  CDDL HEADER START
@@ -23,8 +23,6 @@
 
  CDDL HEADER END
 
-	pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 	NOTE:  This service manifest is not editable; its contents will
 	be overwritten by package or patch operations, including
 	operating system upgrade.  Make customizations in a different
@@ -82,8 +80,8 @@
 		<propval name='endpoint_type' type='astring' value='tli' />
 		<propval name='wait' type='boolean' value='true' />
 		<propval name='isrpc' type='boolean' value='true' />
-		<propval name='rpc_low_version' type='integer' value='1' />
-		<propval name='rpc_high_version' type='integer' value='1' />
+		<propval name='rpc_low_version' type='integer' value='2' />
+		<propval name='rpc_high_version' type='integer' value='2' />
 		<propval name='proto' type='astring' value='tcp' />
 	</property_group>
 
--- a/usr/src/cmd/lvm/rpc.mdcommd/mddoors.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/lvm/rpc.mdcommd/mddoors.c	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <door.h>
 #include <locale.h>
 #include <meta.h>
@@ -106,7 +104,7 @@
 	if (close(daemon_lock_fd) == -1) {
 		syslog(LOG_DAEMON | LOG_DEBUG,
 		    gettext("close(%s) failed - %s\n"),
-			    daemon_lock_file, strerror(errno));
+		    daemon_lock_file, strerror(errno));
 		return;
 	}
 	unlink(daemon_lock_file);
@@ -133,37 +131,32 @@
 	md_mn_kresult_t	kresult;
 
 	md_mn_kmsg_t *kmsg = (md_mn_kmsg_t *)(void *)argp;
-	err = mdmn_send_message(kmsg->kmsg_setno,
-				kmsg->kmsg_type,
-				kmsg->kmsg_flags,
-				(char *)&(kmsg->kmsg_data),
-				kmsg->kmsg_size,
-				&result,
-				&ep);
+	err = mdmn_send_message(kmsg->kmsg_setno, kmsg->kmsg_type,
+	    kmsg->kmsg_flags, kmsg->kmsg_recipient, (char *)&(kmsg->kmsg_data),
+	    kmsg->kmsg_size, &result, &ep);
+
 	if (result == NULL) {
 		kresult.kmmr_comm_state = MDMNE_RPC_FAIL;
 	} else {
 		kresult.kmmr_comm_state = result->mmr_comm_state;
-	}
-	if (err == 0) {
-		kresult.kmmr_msgtype = result->mmr_msgtype;
-		kresult.kmmr_flags = result->mmr_flags;
-		kresult.kmmr_exitval = result->mmr_exitval;
-		kresult.kmmr_failing_node = result->mmr_failing_node;
-		size = result->mmr_out_size;
-		if (size > 0) {
-			/* This is the maximum of data we can transfer, here */
-			if (size > MDMN_MAX_KRES_DATA) {
-				size = MDMN_MAX_KRES_DATA;
+		if (err == 0) {
+			kresult.kmmr_msgtype = result->mmr_msgtype;
+			kresult.kmmr_flags = result->mmr_flags;
+			kresult.kmmr_exitval = result->mmr_exitval;
+			kresult.kmmr_failing_node = result->mmr_failing_node;
+			size = result->mmr_out_size;
+			if (size > 0) {
+				/* This is the max data we can transfer, here */
+				if (size > MDMN_MAX_KRES_DATA) {
+					size = MDMN_MAX_KRES_DATA;
+				}
+				bcopy(result->mmr_out, &(kresult.kmmr_res_data),
+				    size);
+				kresult.kmmr_res_size = size;
+			} else {
+				kresult.kmmr_res_size = 0;
 			}
-			bcopy(result->mmr_out, &(kresult.kmmr_res_data), size);
-			kresult.kmmr_res_size = size;
-		} else {
-			kresult.kmmr_res_size = 0;
 		}
-	}
-
-	if (result != NULL) {
 		free_result(result);
 	}
 
@@ -252,7 +245,7 @@
 	 * At this point we are single threaded.
 	 * We give mdmn_send_message() a chance to initialize safely.
 	 */
-	(void) mdmn_send_message(0, 0, 0, 0, 0, 0, 0);
+	(void) mdmn_send_message(0, 0, 0, 0, 0, 0, 0, 0);
 
 	/* setup the door handle */
 	mdmn_door_handle = door_create(door2rpc, NULL,
@@ -266,12 +259,12 @@
 	if (metaioctl(MD_MN_SET_DOORH, &mdmn_door_handle, &ep,
 	    "mddoors") != 0) {
 		syslog(LOG_DAEMON | LOG_DEBUG, gettext(
-			"Couldn't set door handle"));
+		    "Couldn't set door handle"));
 		exit(1);
 	}
 
 	(void) pause();
 	syslog(LOG_DAEMON | LOG_ERR, gettext(
-			"Unexpected exit from pause()"));
+	    "Unexpected exit from pause()"));
 	return (1);
 }
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c	Wed Dec 24 08:23:40 2008 -0700
@@ -18,13 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -42,38 +41,40 @@
 /*
  * This is the communication daemon for SVM Multi Node Disksets.
  * It runs on every node and provides the following rpc services:
- *  - mdmn_send_svc_1
- *  - mdmn_work_svc_1
- *  - mdmn_wakeup_initiator_svc_1
- *  - mdmn_wakeup_master_svc_1
- *  - mdmn_comm_lock_svc_1
- *  - mdmn_comm_unlock_svc_1
- *  - mdmn_comm_suspend_svc_1
- *  - mdmn_comm_resume_svc_1
- *  - mdmn_comm_reinit_set_svc_1
+ *  - mdmn_send_svc_2
+ *  - mdmn_work_svc_2
+ *  - mdmn_wakeup_initiator_svc_2
+ *  - mdmn_wakeup_master_svc_2
+ *  - mdmn_comm_lock_svc_2
+ *  - mdmn_comm_unlock_svc_2
+ *  - mdmn_comm_suspend_svc_2
+ *  - mdmn_comm_resume_svc_2
+ *  - mdmn_comm_reinit_set_svc_2
  * where send, lock, unlock and reinit are meant for external use,
  * work and the two wakeups are for internal use only.
  *
  * NOTE:
- * On every node only one of those xxx_1 functions can be active at the
+ * On every node only one of those xxx_2 functions can be active at the
  * same time because the daemon is single threaded.
  *
+ * (not quite true, as mdmn_send_svc_2 and mdmn_work_svc_2 do thr_create()s
+ * as part of their handlers, so those aspects are multi-threaded)
  *
  * In case an event occurs that has to be propagated to all the nodes...
  *
  * One node (the initiator)
  *	calls the libmeta function mdmn_send_message()
- *	This function calls the local daemon thru mdmn_send_svc_1.
+ *	This function calls the local daemon thru mdmn_send_svc_2.
  *
  * On the initiator:
- *	mdmn_send_svc_1()
+ *	mdmn_send_svc_2()
  *	    - starts a thread -> mdmn_send_to_work() and returns.
  *	mdmn_send_to_work()
  *	    - sends this message over to the master of the diskset.
- *	      This is done by calling mdmn_work_svc_1 on the master.
+ *	      This is done by calling mdmn_work_svc_2 on the master.
  *	    - registers to the initiator_table
  *	    - exits without doing a svc_sendreply() for the call to
- *	      mdmn_send_svc_1. This means that call is blocked until somebody
+ *	      mdmn_send_svc_2. This means that call is blocked until somebody
  *	      (see end of this comment) does a svc_sendreply().
  *	      This means mdmn_send_message() does not yet return.
  *	    - A timeout surveillance is started at this point.
@@ -82,42 +83,42 @@
  *	      to the caller.
  *
  * On the master:
- *	mdmn_work_svc_1()
+ *	mdmn_work_svc_2()
  *	    - starts a thread -> mdmn_master_process_msg() and returns
  *	mdmn_master_process_msg()
  *	    - logs the message to the change log
  *	    - executes the message locally
  *	    - flags the message in the change log
- *	    - sends the message to mdmn_work_svc_1() on all the
+ *	    - sends the message to mdmn_work_svc_2() on all the
  *	      other nodes (slaves)
- *	      after each call to mdmn_work_svc_1 the thread goes to sleep and
- *	      will be woken up by mdmn_wakeup_master_svc_1() as soon as the
+ *	      after each call to mdmn_work_svc_2 the thread goes to sleep and
+ *	      will be woken up by mdmn_wakeup_master_svc_2() as soon as the
  *	      slave node is done with this message.
  *	    - In case the slave doesn't respond in a apropriate time, an error
  *	      is assumed to ensure the master doesn't wait forever.
  *
  * On a slave:
- *	mdmn_work_svc_1()
+ *	mdmn_work_svc_2()
  *	    - starts a thread -> mdmn_slave_process_msg() and returns
  *	mdmn_slave_process_msg()
  *	    - processes this message locally by calling the appropriate message
  *	      handler, that creates some result.
- *	    - sends that result thru a call to mdmn_wakeup_master_svc_1() to
+ *	    - sends that result thru a call to mdmn_wakeup_master_svc_2() to
  *	      the master.
  *
  * Back on the master:
- *	mdmn_wakeup_master_svc_1()
+ *	mdmn_wakeup_master_svc_2()
  *	    - stores the result into the master_table.
  *	    - signals the mdmn_master_process_msg-thread.
  *	    - returns
  *	mdmn_master_process_msg()
  *	    - after getting the results from all nodes
  *	    - sends them back to the initiating node thru a call to
- *	      mdmn_wakeup_initiator_svc_1.
+ *	      mdmn_wakeup_initiator_svc_2.
  *
  * Back on the initiator:
- *	mdmn_wakeup_initiator_svc_1()
- *	    - calls svc_sendreply() which makes the call to mdmn_send_svc_1()
+ *	mdmn_wakeup_initiator_svc_2()
+ *	    - calls svc_sendreply() which makes the call to mdmn_send_svc_2()
  *	      return.
  *	      which allows the initial mdmn_send_message() call to return.
  */
@@ -195,8 +196,8 @@
 {
 	md_mnnode_desc	*node = (md_mnnode_desc *)data;
 
-	return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, ONE, "tcp",
-		time_out));
+	return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, TWO, "tcp",
+	    time_out));
 }
 
 #define	FLUSH_DEBUGFILE() \
@@ -219,15 +220,15 @@
 
 	if (master_err != MDMNE_ACK) {
 		snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on master "
-			"when processing message type %d\n", type);
+		    "when processing message type %d\n", type);
 	} else if (slave_result == NULL) {
 		snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on node "
-			"%d when processing message type %d\n", nid, type);
+		    "%d when processing message type %d\n", nid, type);
 	} else {
 		snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: Inconsistent "
-			"return value from node %d when processing message "
-			"type %d. Master exitval = %d, Slave exitval = %d\n",
-			nid, type, master_exitval, slave_result->mmr_exitval);
+		    "return value from node %d when processing message "
+		    "type %d. Master exitval = %d, Slave exitval = %d\n",
+		    nid, type, master_exitval, slave_result->mmr_exitval);
 	}
 	commd_err.size = strlen(msg_buf);
 	commd_err.md_message = (uint64_t)(uintptr_t)&msg_buf[0];
@@ -335,12 +336,17 @@
 
 	commd_debug(MD_MMV_MISC, "timeout_ini: (%d, 0x%llx-%d)\n",
 	    MSGID_ELEMS(mid));
+	/*
+	 * Give the result the corresponding msgid from the failed message.
+	 */
+	MSGID_COPY(&mid, &(resultp->mmr_msgid));
 
 	/* return to mdmn_send_message() and let it deal with the situation */
 	mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
 
 	free(resultp);
 	commd_debug(MD_MMV_MISC, "timeout_ini: sendreplied\n");
+	svc_done(transp);
 	mdmn_unregister_initiator_table(setno, class);
 }
 
@@ -499,13 +505,13 @@
  * Perform some global initializations.
  *
  * the following routines have to call this before operation can start:
- *  - mdmn_send_svc_1
- *  - mdmn_work_svc_1
- *  - mdmn_comm_lock_svc_1
- *  - mdmn_comm_unlock_svc_1
- *  - mdmn_comm_suspend_svc_1
- *  - mdmn_comm_resume_svc_1
- *  - mdmn_comm_reinit_set_svc_1
+ *  - mdmn_send_svc_2
+ *  - mdmn_work_svc_2
+ *  - mdmn_comm_lock_svc_2
+ *  - mdmn_comm_unlock_svc_2
+ *  - mdmn_comm_suspend_svc_2
+ *  - mdmn_comm_resume_svc_2
+ *  - mdmn_comm_reinit_set_svc_2
  *
  * This is a single threaded daemon, so it can only be in one of the above
  * routines at the same time.
@@ -547,8 +553,7 @@
 
 	__savetime = gethrtime();
 	(void) time(&clock_val);
-	commd_debug(MD_MMV_MISC, "global init called %s\n",
-			ctime(&clock_val));
+	commd_debug(MD_MMV_MISC, "global init called %s\n", ctime(&clock_val));
 
 	/* start a thread that flushes out the debug on a regular basis */
 	thr_create(NULL, 0, (void *(*)(void *))flush_fcout,
@@ -663,9 +668,9 @@
 		 */
 		while ((client[setno][nid] == (CLIENT *) NULL) &&
 		    (tout < MD_CLNT_CREATE_TOUT)) {
-			client[setno][nid] = meta_client_create_retry
-				(node->nd_nodename, mdmn_clnt_create,
-				(void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
+			client[setno][nid] = meta_client_create_retry(
+			    node->nd_nodename, mdmn_clnt_create,
+			    (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
 			/* Is the node dead? */
 			if (mdmn_is_node_dead(node) == 1) {
 				commd_debug(MD_MMV_SYSLOG,
@@ -889,9 +894,9 @@
 		 */
 		while ((client[setno][nid] == (CLIENT *) NULL) &&
 		    (tout < MD_CLNT_CREATE_TOUT)) {
-			client[setno][nid] = meta_client_create_retry
-				(node->nd_nodename, mdmn_clnt_create,
-				(void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
+			client[setno][nid] = meta_client_create_retry(
+			    node->nd_nodename, mdmn_clnt_create,
+			    (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
 			/* Is the node dead? */
 			if (mdmn_is_node_dead(node) == 1) {
 				commd_debug(MD_MMV_SYSLOG,
@@ -942,7 +947,7 @@
 void *
 mdmn_send_to_work(void *arg)
 {
-	int			*rpc_err;
+	int			*rpc_err = NULL;
 	int			success;
 	int			try_master;
 	set_t			setno;
@@ -956,9 +961,6 @@
 	msg			= matp->mat_msg;
 	transp			= matp->mat_transp;
 
-	/* the alloc was done in mdmn_send_svc_1 */
-	free(matp);
-
 	class = mdmn_get_message_class(msg->msg_type);
 	setno = msg->msg_setno;
 
@@ -980,8 +982,7 @@
 	if (success == MDMNE_CLASS_BUSY) {
 		md_mn_msgid_t		active_mid;
 
-		mdmn_get_initiator_table_id(setno, class,
-		&active_mid);
+		mdmn_get_initiator_table_id(setno, class, &active_mid);
 
 		commd_debug(MD_MMV_SEND,
 		    "send_to_work: received but locally busy "
@@ -1011,7 +1012,8 @@
 		 * Send the request to the work function on the master
 		 * this call will return immediately
 		 */
-		rpc_err = mdmn_work_1(msg, client[setno][set_master]);
+		rpc_err = mdmn_work_2(msg, client[setno][set_master],
+		    set_master);
 
 		/* Everything's Ok? */
 		if (rpc_err == NULL) {
@@ -1043,7 +1045,7 @@
 		/*
 		 * If we are here, we sucessfully delivered the message.
 		 * We register the initiator_table, so that
-		 * wakeup_initiator_1  can do the sendreply with the
+		 * wakeup_initiator_2 can do the sendreply with the
 		 * results for us.
 		 */
 		success = MDMNE_ACK;
@@ -1068,15 +1070,27 @@
 		md_mn_result_t *resultp;
 		resultp = Zalloc(sizeof (md_mn_result_t));
 		resultp->mmr_comm_state = success;
+		/*
+		 * copy the MSGID so that we know _which_ message
+		 * failed (if the transp has got mangled)
+		 */
+		MSGID_COPY(&(msg->msg_msgid), &(resultp->mmr_msgid));
 		mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
 		commd_debug(MD_MMV_SEND,
 		    "send_to_work: not registered (%d, 0x%llx-%d) cs=%d\n",
 		    MSGID_ELEMS(msg->msg_msgid), success);
 		free_result(resultp);
+		/*
+		 * We don't have a timeout registered to wake us up, so we're
+		 * now done with this handle. Release it back to the pool.
+		 */
+		svc_done(transp);
 
 	}
 
 	free_msg(msg);
+	/* the alloc was done in mdmn_send_svc_2 */
+	Free(matp);
 	mutex_unlock(mx);
 	return (NULL);
 
@@ -1186,7 +1200,7 @@
 	int			timeout_retries = 0;
 	int			*ret = NULL;
 	set_t			setno;
-	cond_t			*cv;	/* see mdmn_wakeup_master_svc_1 */
+	cond_t			*cv;	/* see mdmn_wakeup_master_svc_2 */
 	mutex_t			*mx;	/* protection for class_busy */
 	timestruc_t		timeout; /* surveillance for remote daemon */
 	md_mn_nodeid_t		nid;
@@ -1251,7 +1265,7 @@
 		}
 
 		/* send it over, it will return immediately */
-		ret = mdmn_work_1(msg, client[setno][nid]);
+		ret = mdmn_work_2(msg, client[setno][nid], nid);
 
 		rw_unlock(&client_rwlock[setno]);
 
@@ -1462,7 +1476,7 @@
 			result->mmr_comm_state = MDMNE_LOG_FAIL;
 			/*
 			 * Note that the mark_busy was already done by
-			 * mdmn_work_svc_1()
+			 * mdmn_work_svc_2()
 			 */
 			mutex_lock(&mdmn_busy_mutex[setno]);
 			mdmn_mark_class_unbusy(setno, orig_class);
@@ -1487,8 +1501,8 @@
 			commd_debug(MD_MMV_SYSLOG,
 			    "proc_mas: No client for initiator \n");
 		} else {
-			ret = mdmn_wakeup_initiator_1(result,
-			    client[setno][sender]);
+			ret = mdmn_wakeup_initiator_2(result,
+			    client[setno][sender], sender);
 		}
 		rw_unlock(&client_rwlock[setno]);
 
@@ -1677,6 +1691,12 @@
 				continue;
 			}
 
+			/* If a DIRECTED message, skip non-recipient nodes */
+			if ((cmsg->msg_flags & MD_MSGF_DIRECTED) &&
+			    nid != cmsg->msg_recipient) {
+				continue;
+			}
+
 			mutex_lock(mx);
 			/*
 			 * Register the node that is addressed,
@@ -1865,7 +1885,8 @@
 		commd_debug(MD_MMV_SYSLOG,
 		    "proc_mas: unable to create client for initiator\n");
 	} else {
-		ret = mdmn_wakeup_initiator_1(result, client[setno][sender]);
+		ret = mdmn_wakeup_initiator_2(result, client[setno][sender],
+		    sender);
 	}
 	rw_unlock(&client_rwlock[setno]);
 
@@ -2046,14 +2067,14 @@
 			rw_unlock(&client_rwlock[setno]);
 			break;
 		} else {
-			ret = mdmn_wakeup_master_1(result,
-			    client[setno][sender]);
+			ret = mdmn_wakeup_master_2(result,
+			    client[setno][sender], sender);
 			/*
-			 * if mdmn_wakeup_master_1 returns NULL, it can be that
+			 * if mdmn_wakeup_master_2 returns NULL, it can be that
 			 * the master (or the commd on the master) had died.
 			 * In that case, we destroy the client to the master
 			 * and retry.
-			 * If mdmn_wakeup_master_1 doesn't return MDMNE_ACK,
+			 * If mdmn_wakeup_master_2 doesn't return MDMNE_ACK,
 			 * the commd on the master is alive but
 			 * something else is wrong,
 			 * in that case a retry doesn't make sense => break out
@@ -2097,8 +2118,19 @@
 }
 
 
-md_mn_result_t *
-mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
+/*
+ * mdmn_send_svc_2:
+ * ---------------
+ * Check that the issuing node is a legitimate one (i.e. is licensed to send
+ * messages to us) and that the RPC request can be staged.
+ *
+ * Returns:
+ *	0	=> no RPC request is in-flight, no deferred svc_sendreply()
+ *	1	=> queued RPC request in-flight. Completion will be made (later)
+ *		   by a wakeup_initiator_2() [hopefully]
+ */
+int
+mdmn_send_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp)
 {
 	int			err;
 	set_t			setno;
@@ -2121,7 +2153,7 @@
 		mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
 		free_result(resultp);
 		svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
-		return (NULL);
+		return (0);
 	}
 
 	/* check if the global initialization is done */
@@ -2152,7 +2184,7 @@
 			    (char *)resultp);
 			free_result(resultp);
 			svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
-			return (NULL);
+			return (0);
 		}
 	}
 
@@ -2169,7 +2201,7 @@
 		mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
 		free_result(resultp);
 		svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
-		return (NULL);
+		return (0);
 	}
 
 
@@ -2184,10 +2216,10 @@
 		free_result(resultp);
 		svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
 		commd_debug(MD_MMV_SEND,
-			"send: type locked (%d, 0x%llx-%d), set=%d, class=%d, "
-			"type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class,
-			msg->msg_type);
-		return (NULL);
+		    "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, "
+		    "type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class,
+		    msg->msg_type);
+		return (0);
 	}
 
 
@@ -2213,7 +2245,7 @@
 			free_result(resultp);
 			commd_debug(MD_MMV_SEND,
 			    "send: init err = %d\n", err);
-			return (NULL);
+			return (0);
 		}
 	}
 
@@ -2227,10 +2259,10 @@
 		svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
 		free_result(resultp);
 		commd_debug(MD_MMV_SEND,
-			"send: class suspended (%d, 0x%llx-%d), set=%d, "
-			"class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
-			setno, class, msg->msg_type);
-		return (NULL);
+		    "send: class suspended (%d, 0x%llx-%d), set=%d, "
+		    "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
+		    setno, class, msg->msg_type);
+		return (0);
 	}
 	mutex_unlock(&mdmn_busy_mutex[setno]);
 
@@ -2238,10 +2270,10 @@
 	if (check_license(rqstp, 0) == FALSE) {
 		svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
 		commd_debug(MD_MMV_SEND,
-			"send: check licence fail(%d, 0x%llx-%d), set=%d, "
-			"class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
-			setno, class, msg->msg_type);
-		return (NULL);
+		    "send: check licence fail(%d, 0x%llx-%d), set=%d, "
+		    "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
+		    setno, class, msg->msg_type);
+		return (0);
 	}
 
 
@@ -2268,17 +2300,17 @@
 	    MSGID_ELEMS(msg->msg_msgid));
 	/*
 	 * We return here without sending results. This will be done by
-	 * mdmn_wakeup_initiator_svc_1() as soon as the results are available.
+	 * mdmn_wakeup_initiator_svc_2() as soon as the results are available.
 	 * Until then the calling send_message will be blocked, while we
 	 * are able to take calls.
 	 */
 
-	return (NULL);
+	return (1);
 }
 
 /* ARGSUSED */
 int *
-mdmn_work_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
+mdmn_work_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp)
 {
 	int		err;
 	set_t		setno;
@@ -2362,7 +2394,7 @@
 
 	mutex_lock(&mdmn_busy_mutex[setno]);
 
-	/* check if class is locked via a call to mdmn_comm_lock_svc_1 */
+	/* check if class is locked via a call to mdmn_comm_lock_svc_2 */
 	if (mdmn_is_class_locked(setno, class) == TRUE) {
 		mutex_unlock(&mdmn_busy_mutex[setno]);
 		*retval = MDMNE_CLASS_LOCKED;
@@ -2430,14 +2462,14 @@
 
 /* ARGSUSED */
 int *
-mdmn_wakeup_initiator_svc_1(md_mn_result_t *res, struct svc_req *rqstp)
+mdmn_wakeup_initiator_svc_2(md_mn_result_t *res, struct svc_req *rqstp)
 {
 
 	int		*retval;
 	int		err;
 	set_t		setno;
 	mutex_t		*mx;   /* protection of initiator_table */
-	SVCXPRT		*transp;
+	SVCXPRT		*transp = NULL;
 	md_mn_msgid_t	initiator_table_id;
 	md_mn_msgclass_t class;
 
@@ -2491,13 +2523,14 @@
 	 * Search the initiator wakeup table.
 	 * If we find an entry here (which should always be true)
 	 * we are on the initiating node and we wakeup the original
-	 * local rpc call
+	 * local rpc call.
 	 */
 	mdmn_get_initiator_table_id(setno, class, &initiator_table_id);
 
 	if (MSGID_CMP(&(initiator_table_id), &(res->mmr_msgid))) {
 		transp = mdmn_get_initiator_table_transp(setno, class);
 		mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)res);
+		svc_done(transp);
 		mdmn_unregister_initiator_table(setno, class);
 		*retval = MDMNE_ACK;
 
@@ -2532,7 +2565,7 @@
  */
 /* ARGSUSED */
 int *
-mdmn_wakeup_master_svc_1(md_mn_result_t *ores, struct svc_req *rqstp)
+mdmn_wakeup_master_svc_2(md_mn_result_t *ores, struct svc_req *rqstp)
 {
 
 	int		*retval;
@@ -2645,7 +2678,7 @@
  * This is mainly done for debug purpose.
  * This set/class combination immediately is blocked,
  * even in the middle of sending messages to multiple slaves.
- * This remains until the user issues a mdmn_comm_unlock_svc_1 for the same
+ * This remains until the user issues a mdmn_comm_unlock_svc_2 for the same
  * set/class combination.
  *
  * Special messages of class MD_MSG_CLASS0 can never be locked.
@@ -2666,7 +2699,7 @@
 
 /* ARGSUSED */
 int *
-mdmn_comm_lock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
+mdmn_comm_lock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
 {
 	int			*retval;
 	set_t			setno = msc->msc_set;
@@ -2722,7 +2755,7 @@
  */
 /* ARGSUSED */
 int *
-mdmn_comm_unlock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
+mdmn_comm_unlock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
 {
 	int			*retval;
 	set_t			setno  = msc->msc_set;
@@ -2766,7 +2799,7 @@
 }
 
 /*
- * mdmn_comm_suspend_svc_1(setno, class)
+ * mdmn_comm_suspend_svc_2(setno, class)
  *
  * Drain all outstanding messages for a given set/class combination
  * and don't allow new messages to be processed.
@@ -2812,7 +2845,7 @@
 
 /* ARGSUSED */
 int *
-mdmn_comm_suspend_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
+mdmn_comm_suspend_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
 {
 	int			*retval;
 	int			failure = 0;
@@ -2902,7 +2935,7 @@
 }
 
 /*
- * mdmn_comm_resume_svc_1(setno, class)
+ * mdmn_comm_resume_svc_2(setno, class)
  *
  * Resume processing messages for a given set.
  * This incorporates the repeal of a previous suspend operation.
@@ -2927,7 +2960,7 @@
  */
 /* ARGSUSED */
 int *
-mdmn_comm_resume_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
+mdmn_comm_resume_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
 {
 	int			*retval;
 	set_t			startset, endset;
@@ -3029,7 +3062,7 @@
 }
 /* ARGSUSED */
 int *
-mdmn_comm_reinit_set_svc_1(set_t *setnop, struct svc_req *rqstp)
+mdmn_comm_reinit_set_svc_2(set_t *setnop, struct svc_req *rqstp)
 {
 	int		*retval;
 	md_mnnode_desc	*node;
@@ -3093,7 +3126,7 @@
 
 /* ARGSUSED */
 int *
-mdmn_comm_msglock_svc_1(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp)
+mdmn_comm_msglock_svc_2(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp)
 {
 	int			*retval;
 	md_mn_msgtype_t		type = mmtl->mmtl_type;
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_service.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_service.c	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/lvm/mdmn_commd.h>
 #include <stdio.h>
 #include <stdlib.h> /* getenv, exit */
@@ -60,16 +58,16 @@
 static int _rpcsvcstate = _IDLE;	/* Set when a request is serviced */
 static int _rpcsvccount = 0;		/* Number of requests being serviced */
 
-extern  md_mn_result_t	*mdmn_send_svc_1();
-extern  int		*mdmn_work_svc_1();
-extern  int		*mdmn_wakeup_initiator_svc_1();
-extern  int		*mdmn_wakeup_master_svc_1();
-extern  int		*mdmn_comm_lock_svc_1();
-extern  int		*mdmn_comm_unlock_svc_1();
-extern  int		*mdmn_comm_suspend_svc_1();
-extern  int		*mdmn_comm_resume_svc_1();
-extern  int		*mdmn_comm_reinit_set_svc_1();
-extern  int		*mdmn_comm_msglock_svc_1();
+extern  int		mdmn_send_svc_2();
+extern  int		*mdmn_work_svc_2();
+extern  int		*mdmn_wakeup_initiator_svc_2();
+extern  int		*mdmn_wakeup_master_svc_2();
+extern  int		*mdmn_comm_lock_svc_2();
+extern  int		*mdmn_comm_unlock_svc_2();
+extern  int		*mdmn_comm_suspend_svc_2();
+extern  int		*mdmn_comm_resume_svc_2();
+extern  int		*mdmn_comm_reinit_set_svc_2();
+extern  int		*mdmn_comm_msglock_svc_2();
 
 
 static void
@@ -107,7 +105,7 @@
 }
 
 static void
-mdmn_commd_1(rqstp, transp)
+mdmn_commd_2(rqstp, transp)
 	struct svc_req *rqstp;
 	register SVCXPRT *transp;
 {
@@ -124,7 +122,6 @@
 	char *(*local)();
 	int free_result = 0;
 
-
 	_rpcsvccount++;
 	switch (rqstp->rq_proc) {
 	case NULLPROC:
@@ -132,6 +129,7 @@
 			(char *)NULL);
 		_rpcsvccount--;
 		_rpcsvcstate = _SERVED;
+		svc_done(transp);
 		return;
 
 	case mdmn_send:
@@ -140,81 +138,94 @@
 		(void) memset((char *)&argument, 0, sizeof (argument));
 		if (!svc_getargs(transp, _xdr_argument, (caddr_t)&argument)) {
 			svcerr_decode(transp);
+			svc_done(transp);
 			_rpcsvccount--;
 			_rpcsvcstate = _SERVED;
 			return;
 		}
 		/*
-		 * mdmn_send_1 will not always do a sendreply.
+		 * mdmn_send_2 will not always do a sendreply.
 		 * it will register in a table and let the mdmn_wakeup1
 		 * do the sendreply for that call.
 		 * in order to register properly we need the transp handle
+		 * If we get a 0 back from mdmn_send_svc_2() we have no pending
+		 * RPC in-flight, so we drop the service count.
 		 */
-		(void) mdmn_send_svc_1((md_mn_msg_t *)&argument, rqstp);
+		if (mdmn_send_svc_2((md_mn_msg_t *)&argument, rqstp) == 0) {
+			_rpcsvccount--;
+			_rpcsvcstate = _SERVED;
+			svc_done(rqstp->rq_xprt);
+		}
 
-		return; /* xdr_free is called by mdmn_wakeup_initiator_svc_1 */
+		return; /* xdr_free is called by mdmn_wakeup_initiator_svc_2 */
 
 	case mdmn_work:
 		_xdr_argument = xdr_md_mn_msg_t;
 		_xdr_result = xdr_int;
-		local = (char *(*)()) mdmn_work_svc_1;
+		local = (char *(*)()) mdmn_work_svc_2;
 		free_result = 1;
 		break;
 
 	case mdmn_wakeup_master:
 		_xdr_argument = xdr_md_mn_result_t;
 		_xdr_result = xdr_int;
-		local = (char *(*)()) mdmn_wakeup_master_svc_1;
+		local = (char *(*)()) mdmn_wakeup_master_svc_2;
 		free_result = 1;
 		break;
 
 	case mdmn_wakeup_initiator:
+		/*
+		 * We must have had an in-flight RPC request to get here,
+		 * so drop the in-flight count.
+		 */
 		_xdr_argument = xdr_md_mn_result_t;
 		_xdr_result = xdr_int;
-		local = (char *(*)()) mdmn_wakeup_initiator_svc_1;
+		local = (char *(*)()) mdmn_wakeup_initiator_svc_2;
 		free_result = 1;
+		_rpcsvccount--;
 		break;
 
 	case mdmn_comm_lock:
 		_xdr_argument = xdr_md_mn_set_and_class_t;
 		_xdr_result = xdr_int;
-		local = (char *(*)()) mdmn_comm_lock_svc_1;
+		local = (char *(*)()) mdmn_comm_lock_svc_2;
 		break;
 
 	case mdmn_comm_unlock:
 		_xdr_argument = xdr_md_mn_set_and_class_t;
 		_xdr_result = xdr_int;
-		local = (char *(*)()) mdmn_comm_unlock_svc_1;
+		local = (char *(*)()) mdmn_comm_unlock_svc_2;
 		break;
 
 	case mdmn_comm_suspend:
 		_xdr_argument = xdr_md_mn_set_and_class_t;
 		_xdr_result = xdr_int;
-		local = (char *(*)()) mdmn_comm_suspend_svc_1;
+		local = (char *(*)()) mdmn_comm_suspend_svc_2;
 		break;
 
 	case mdmn_comm_resume:
 		_xdr_argument = xdr_md_mn_set_and_class_t;
 		_xdr_result = xdr_int;
-		local = (char *(*)()) mdmn_comm_resume_svc_1;
+		local = (char *(*)()) mdmn_comm_resume_svc_2;
 		break;
 
 	case mdmn_comm_reinit_set:
 		_xdr_argument = xdr_u_int;
 		_xdr_result = xdr_int;
-		local = (char *(*)()) mdmn_comm_reinit_set_svc_1;
+		local = (char *(*)()) mdmn_comm_reinit_set_svc_2;
 		break;
 
 	case mdmn_comm_msglock:
 		_xdr_argument = xdr_md_mn_type_and_lock_t;
 		_xdr_result = xdr_int;
-		local = (char *(*)()) mdmn_comm_msglock_svc_1;
+		local = (char *(*)()) mdmn_comm_msglock_svc_2;
 		break;
 
 	default:
 		svcerr_noproc(transp);
 		_rpcsvccount--;
 		_rpcsvcstate = _SERVED;
+		svc_done(transp);
 		return;
 	}
 	(void) memset((char *)&argument, 0, sizeof (argument));
@@ -222,6 +233,7 @@
 		svcerr_decode(transp);
 		_rpcsvccount--;
 		_rpcsvcstate = _SERVED;
+		svc_done(transp);
 		return;
 	}
 	result = (*local)(&argument, rqstp);
@@ -231,12 +243,15 @@
 	}
 	if (!svc_freeargs(transp, _xdr_argument, (caddr_t)&argument)) {
 		_msgout(gettext("unable to free arguments"));
+		svc_done(transp);
 		exit(1);
 	}
 
 	if (free_result == 1) {
 		free(result);
 	}
+
+	svc_done(transp);
 	_rpcsvccount--;
 	_rpcsvcstate = _SERVED;
 }
@@ -249,6 +264,7 @@
 exit_commd()
 {
 	md_error_t	ep = mdnullerror;
+	syslog(LOG_DAEMON | LOG_DEBUG, gettext("mdcommd exiting"));
 	(void) metaioctl(MD_MN_SET_COMMD_RUNNING, 0, &ep, "rpc.mdcommd");
 }
 
@@ -259,10 +275,23 @@
 	pid_t pid;
 	int i;
 	md_error_t	ep = mdnullerror;
+	int		mode = RPC_SVC_MT_USER;
 
 	(void) sigset(SIGPIPE, SIG_IGN);
 
 	/*
+	 * Attempt to set MT_USER behaviour for mdcommd service routines.
+	 * If this isn't done, there is a possibility that the transport
+	 * handle might be freed before the thread created by mdmn_send_svc_2
+	 * can use it.  A consequence of this is that svc_done() must be
+	 * called on the handle when it's no longer needed.
+	 */
+	if (rpc_control(RPC_SVC_MTMODE_SET, &mode) == FALSE) {
+		_msgout(gettext("cannot set MT_USER mode for RPC service"));
+		exit(1);
+	}
+
+	/*
 	 * If stdin looks like a TLI endpoint, we assume
 	 * that we were started by a port monitor. If
 	 * t_getstate fails with TBADF, this is not a
@@ -294,9 +323,9 @@
 		}
 		if (nconf)
 			freenetconfigent(nconf);
-		if (!svc_reg(transp, MDMN_COMMD, ONE, mdmn_commd_1, 0)) {
+		if (!svc_reg(transp, MDMN_COMMD, TWO, mdmn_commd_2, 0)) {
 			_msgout(gettext(
-			    "unable to register (MDMN_COMMD, ONE)."));
+			    "unable to register (MDMN_COMMD, TWO)."));
 			exit(1);
 		}
 
@@ -307,7 +336,8 @@
 			(void) alarm(_RPCSVC_CLOSEDOWN/2);
 		}
 
-		(void) metaioctl(MD_MN_SET_COMMD_RUNNING, (void *)1, &ep,
+		pid = getpid();
+		(void) metaioctl(MD_MN_SET_COMMD_RUNNING, (void *)pid, &ep,
 		    "rpc.mdcommd");
 		svc_run();
 		exit(1);
@@ -343,8 +373,8 @@
 		openlog("mdmn_commd", LOG_PID, LOG_DAEMON);
 #endif
 	}
-	if (!svc_create(mdmn_commd_1, MDMN_COMMD, ONE, "tcp")) {
-		_msgout(gettext("unable to create (MDMN_COMMD, ONE) for tcp."));
+	if (!svc_create(mdmn_commd_2, MDMN_COMMD, TWO, "tcp")) {
+		_msgout(gettext("unable to create (MDMN_COMMD, TWO) for tcp."));
 		exit(1);
 	}
 
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/socket.h>
@@ -446,6 +444,7 @@
 	commd_debug(dbc, "%s sender	= %d\n", prefix, msg->msg_sender);
 	commd_debug(dbc, "%s flags	= 0x%x\n", prefix, msg->msg_flags);
 	commd_debug(dbc, "%s setno	= %d\n", prefix, msg->msg_setno);
+	commd_debug(dbc, "%s recipient  = %d\n", prefix, msg->msg_recipient);
 	commd_debug(dbc, "%s type	= %d\n", prefix, msg->msg_type);
 	commd_debug(dbc, "%s size	= %d\n", prefix, msg->msg_event_size);
 	if (msg->msg_event_size) {
@@ -513,9 +512,8 @@
 		class = msg->msg_msgid.mid_oclass;
 	}
 
-	mct_index = submsg +
-		    class * MAX_SUBMESSAGES +
-		    nodeid * MAX_SUBMESSAGES * MD_MN_NCLASSES;
+	mct_index = submsg + class * MAX_SUBMESSAGES +
+	    nodeid * MAX_SUBMESSAGES * MD_MN_NCLASSES;
 
 	mct_offset = mct_index * sizeof (md_mn_mce_t);
 
@@ -694,12 +692,12 @@
 			}
 		}
 		commd_debug(MD_MMV_MISC,
-			    "mdmn_check_completion: msg already processed \n");
+		    "mdmn_check_completion: msg already processed \n");
 		dump_result(MD_MMV_MISC, "mdmn_check_completion", result);
 		return (MDMN_MCT_DONE);
 	}
 	commd_debug(MD_MMV_MISC,
-		    "mdmn_check_completion: msg not yet processed\n");
+	    "mdmn_check_completion: msg not yet processed\n");
 	return (MDMN_MCT_NOT_DONE);
 }
 
--- a/usr/src/cmd/lvm/util/metaclust.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/lvm/util/metaclust.c	Wed Dec 24 08:23:40 2008 -0700
@@ -24,8 +24,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <meta.h>
 #include <sdssc.h>
 #include <signal.h>
@@ -117,6 +115,8 @@
 sigalarmhandler(int sig)
 {
 	int	i, n, ret, stat_loc = 0;
+	FILE	*pgcore;
+	char	corecmd[256];
 
 	n = sizeof (step_table) / sizeof (step_table[0]);
 	for (i = 0; i < n; i++) {
@@ -130,6 +130,25 @@
 	    step_table[i].step_nam,
 	    meta_print_hrtime(gethrtime() - start_time));
 
+	/*
+	 * See what the child was actually doing when the timeout expired.
+	 * A core-dump of this would be _really_ good, so let's just
+	 * try a 'gcore -g c_pid' and hope.  c_pid is an integer pid, so it
+	 * must be logged with %d; passing it to %s would dereference the pid.
+	 */
+	(void) memset(corecmd, 0, sizeof (corecmd));
+	(void) snprintf(corecmd, sizeof (corecmd),
+	    "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid);
+
+	pgcore = popen(corecmd, "r");
+
+	if (pgcore == NULL) {
+		meta_mc_log(MC_LOG1, gettext("Could not grab core for pid %d"),
+		    (int)c_pid);
+	} else {
+		(void) pclose(pgcore);
+	}
+
 	if ((ret = kill(c_pid, SIGKILL)) == 0) {
 		/*
 		 * The child will wait forever until the status is retrieved
@@ -1762,7 +1781,6 @@
 				    "rpc.mdcommd for set %s\n"), sp->setname);
 				md_exit(local_sp, 1);
 			}
-			meta_ping_mnset(setno);
 
 			/* Unblock mddb parse messages */
 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/mdb/common/modules/md/dumpmirror.c	Wed Dec 24 08:23:40 2008 -0700
@@ -0,0 +1,230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include "mdinclude.h"
+
+/*
+ * Display an arbitrary bitmap by showing the set bits in the array.
+ * Output will be <start>-<end> for ranges or <position> for singleton bits,
+ * separated by commas.  <size> is the length of the bitmap in bits.
+ */
+static void
+print_mm_bm(unsigned char *bm, uint_t size, char *bm_name)
+{
+	uint_t	i, first_set = 0;
+	int	in_run = 0;
+	int	need_comma = 0;
+
+	mdb_printf("%s set bits: ", bm_name);
+	/*
+	 * Iterating one position past the end closes a run that reaches the
+	 * last bit through the same code as any other run, so a lone final
+	 * bit is shown as <position> and not as <position>-<position>.
+	 */
+	for (i = 0; i <= size; i++) {
+		if (i < size && isset(bm, i)) {
+			if (!in_run) {
+				first_set = i;
+				in_run = 1;
+			}
+			continue;
+		}
+		if (!in_run) {
+			continue;
+		}
+		mdb_printf("%s%u", (need_comma ? "," : ""), first_set);
+		if (first_set != (i - 1)) {
+			mdb_printf("-%u", (i - 1));
+		}
+		need_comma = 1;
+		in_run = 0;
+	}
+	mdb_printf("\n");
+}
+
+/*
+ * Print the non-zero entries of a uchar_t count array as (index,count) pairs,
+ * comma separated on one line (used here for un_pernode_dirty_sum).
+ */
+static void
+print_mm_cnt_c(unsigned char *bm, uint_t size, char *bm_name)
+{
+	char	*sep = "";
+	int	idx;
+
+	mdb_printf("%s set counts: ", bm_name);
+	for (idx = 0; idx < size; idx++) {
+		if (bm[idx] == 0) {
+			continue;
+		}
+		mdb_printf("%s(%d,%3d)", sep, idx, (uint_t)bm[idx]);
+		sep = ",";
+	}
+	mdb_printf("\n");
+}
+
+/* Print the non-zero ushort_t counts in bm[] as (index,count) pairs */
+static void
+print_mm_cnt_w(unsigned short *bm, uint_t size, char *bm_name)
+{
+	char	*sep = "";
+	int	idx;
+
+	mdb_printf("%s set counts: ", bm_name);
+	for (idx = 0; idx < size; idx++) {
+		if (bm[idx] == 0)
+			continue;
+		mdb_printf("%s(%d,%5d)", sep, idx, (uint_t)bm[idx]);
+		sep = ",";
+	}
+	mdb_printf("\n");
+}
+
+/*
+ * printmmbm dcmd: print the bitmaps and counts of the mm_unit_t at 'addr'.
+ * Bitmaps, each un_rrd_num bits long:
+ *	un_dirty_bm
+ *	un_goingclean_bm
+ *	un_goingdirty_bm
+ *	un_resync_bm
+ *	un_pernode_dirty_bm[]	(non-NULL entries only)
+ * Counts, each un_rrd_num entries long:
+ *	un_pernode_dirty_sum[] 	(uchar_t)
+ *	un_outstanding_writes[]	(ushort_t)
+ * Returns DCMD_USAGE if arguments are given, DCMD_ERR if no address was
+ * supplied or a read from the target fails, and DCMD_OK otherwise.
+ */
+
+/* ARGSUSED */
+int
+printmmbm(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	mm_unit_t	mm, *mmp;
+	unsigned char	*rr_dirty_bm, *rr_goingclean_bm, *rr_goingdirty_bm;
+	unsigned char	*rr_resync_bm;
+	uintptr_t	un_dbm, un_gcbm, un_gdbm, un_rrbm, un_pnds, un_ow;
+	uint_t		num_rr, rr_bitmap_size;
+	int		i;
+	uintptr_t	un_pernode_bm;
+	unsigned char	*rr_pernode_dirty, *rr_pnds;
+	unsigned short	*rr_ow;
+	/* room for "un_pernode_dirty_bm[NNN]" and the terminating NUL */
+	char		pernode_str[25];
+
+	if (argc != 0)
+		return (DCMD_USAGE);
+
+	if (!(flags & DCMD_ADDRSPEC)) {
+		mdb_warn("No mm_unit_t address specified");
+		return (DCMD_ERR);
+	}
+
+	if (mdb_vread(&mm, sizeof (mm_unit_t), addr) == -1) {
+		mdb_warn("failed to read mm_unit_t at %p\n", addr);
+		return (DCMD_ERR);
+	}
+
+	mmp = &mm;
+
+	num_rr = mm.un_rrd_num;
+
+	un_dbm = (uintptr_t)mmp->un_dirty_bm;
+	un_gcbm = (uintptr_t)mmp->un_goingclean_bm;
+	un_gdbm = (uintptr_t)mmp->un_goingdirty_bm;
+	un_rrbm = (uintptr_t)mmp->un_resync_bm;
+	un_pnds = (uintptr_t)mmp->un_pernode_dirty_sum;
+	un_ow = (uintptr_t)mmp->un_outstanding_writes;
+
+	rr_bitmap_size = howmany(num_rr, NBBY);
+	rr_dirty_bm = (unsigned char *)mdb_alloc(rr_bitmap_size,
+	    UM_SLEEP|UM_GC);
+	rr_goingclean_bm = (unsigned char *)mdb_alloc(rr_bitmap_size,
+	    UM_SLEEP|UM_GC);
+	rr_goingdirty_bm = (unsigned char *)mdb_alloc(rr_bitmap_size,
+	    UM_SLEEP|UM_GC);
+	rr_resync_bm = (unsigned char *)mdb_alloc(rr_bitmap_size,
+	    UM_SLEEP|UM_GC);
+	rr_pnds = (unsigned char *)mdb_alloc(num_rr, UM_SLEEP|UM_GC);
+	rr_ow = (unsigned short *)mdb_alloc(num_rr * sizeof (unsigned short),
+	    UM_SLEEP|UM_GC);
+
+	if (mdb_vread(rr_dirty_bm, rr_bitmap_size, un_dbm) == -1) {
+		mdb_warn("failed to read un_dirty_bm at %p\n", un_dbm);
+		return (DCMD_ERR);
+	}
+	if (mdb_vread(rr_goingclean_bm, rr_bitmap_size, un_gcbm) == -1) {
+		mdb_warn("failed to read un_goingclean_bm at %p\n", un_gcbm);
+		return (DCMD_ERR);
+	}
+	if (mdb_vread(rr_goingdirty_bm, rr_bitmap_size, un_gdbm) == -1) {
+		mdb_warn("failed to read un_goingdirty_bm at %p\n", un_gdbm);
+		return (DCMD_ERR);
+	}
+	if (mdb_vread(rr_resync_bm, rr_bitmap_size, un_rrbm) == -1) {
+		mdb_warn("failed to read un_resync_bm at %p\n", un_rrbm);
+		return (DCMD_ERR);
+	}
+	if (mdb_vread(rr_pnds, num_rr, un_pnds) == -1) {
+		mdb_warn("failed to read un_pernode_dirty_sum at %p\n",
+		    un_pnds);
+		return (DCMD_ERR);
+	}
+	if (mdb_vread(rr_ow, num_rr * sizeof (unsigned short), un_ow) == -1) {
+		mdb_warn("failed to read un_outstanding_writes at %p\n", un_ow);
+		return (DCMD_ERR);
+	}
+
+	print_mm_bm(rr_dirty_bm, num_rr, "un_dirty_bm");
+	print_mm_bm(rr_goingclean_bm, num_rr, "un_goingclean_bm");
+	print_mm_bm(rr_goingdirty_bm, num_rr, "un_goingdirty_bm");
+	print_mm_bm(rr_resync_bm, num_rr, "un_resync_bm");
+
+	/*
+	 * Walk un_pernode_dirty_bm[]: each non-NULL entry is read into the
+	 * single scratch buffer and printed before the next one is fetched
+	 */
+	rr_pernode_dirty = (unsigned char *)mdb_alloc(rr_bitmap_size,
+	    UM_SLEEP|UM_GC);
+
+	for (i = 0; i < 128; i++) {
+		un_pernode_bm = (uintptr_t)mmp->un_pernode_dirty_bm[i];
+		if (un_pernode_bm) {
+			mdb_snprintf(pernode_str, sizeof (pernode_str),
+			    "un_pernode_dirty_bm[%d]", i);
+			if (mdb_vread(rr_pernode_dirty, rr_bitmap_size,
+			    un_pernode_bm) == -1) {
+				mdb_warn("failed to read %s at %p\n",
+				    pernode_str, un_pernode_bm);
+				return (DCMD_ERR);
+			}
+			print_mm_bm(rr_pernode_dirty, num_rr, pernode_str);
+		}
+	}
+	print_mm_cnt_c(rr_pnds, num_rr, "un_pernode_dirty_sum");
+
+	print_mm_cnt_w(rr_ow, num_rr, "un_outstanding_writes");
+
+	return (DCMD_OK);
+}
--- a/usr/src/cmd/mdb/common/modules/md/md.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/mdb/common/modules/md/md.c	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/mdb_modapi.h>
 
 
@@ -37,6 +34,7 @@
 extern int dumpnamespace(uintptr_t, uint_t, int, const mdb_arg_t *);
 extern int dumpsetaddr(uintptr_t, uint_t, int, const mdb_arg_t *);
 extern int dumphotspare(uintptr_t, uint_t, int, const mdb_arg_t *);
+extern int printmmbm(uintptr_t, uint_t, int, const mdb_arg_t *);
 extern void set_io_help();
 
 /* from mdbgen */
@@ -79,6 +77,8 @@
 	    dumpsetaddr },
 	{ "simple_de_ic", NULL, "simple mddb_de_ic_t",
 	    simple_de_ic },
+	{ "printmmbm", NULL, "print bitmaps for given mm_unit_t",
+	    printmmbm },
 	{ NULL }
 };
 
--- a/usr/src/cmd/mdb/common/modules/md/metastat.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/mdb/common/modules/md/metastat.c	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include "mdinclude.h"
 
 typedef struct	submirror_cb {
@@ -117,16 +115,84 @@
 	return (WALK_NEXT);
 }
 
+/*
+ * Run Length Encode the bitmap 'bm' and print the run lengths, followed by
+ * summary totals.  RLE is Run Length Encoding, a method for compactly
+ * describing a bitmap as a series of numbers indicating the count of
+ * consecutive set or cleared bits.
+ * Output form: <cleared>.<set>.<cleared>.<set>...  The first number always
+ * counts cleared bits, so it is 0 when bit 0 of the bitmap is set.
+ *
+ * Input:
+ *	<bm>	bitmap to scan
+ *	<size>	length of bitmap (in bits)
+ *	<comp_bm>	RLE count array to be updated, <size> entries long
+ *	<opstr>	Descriptive text for bitmap RLE count display
+ */
+static void
+print_comp_bm(unsigned char *bm, uint_t size, ushort_t *comp_bm, char *opstr)
+{
+	uint_t	i, run_len = 0, num_runs = 0, tot_dirty = 0;
+	uint_t	max_set_cnt = 0, max_reset_cnt = 0;
+	int	cnt_clean = 1, bit_clean;
+
+	/*
+	 * i == size is a sentinel position: it is treated as a bit of the
+	 * opposite kind to the current run so that the final run is flushed
+	 * (printed and counted) like every other run.
+	 */
+	for (i = 0; i <= size; i++) {
+		if (i < size) {
+			bit_clean = !isset(bm, i);
+		} else {
+			bit_clean = !cnt_clean;
+		}
+
+		/* A change of bit kind ends the current run; flush it out */
+		if (bit_clean != cnt_clean && (i < size || run_len != 0)) {
+			if (cnt_clean && run_len > max_reset_cnt) {
+				max_reset_cnt = run_len;
+			} else if (!cnt_clean && run_len > max_set_cnt) {
+				max_set_cnt = run_len;
+			}
+			/* comp_bm holds <size> entries; never write past it */
+			if (num_runs < size) {
+				comp_bm[num_runs] = (ushort_t)run_len;
+			}
+			num_runs++;
+			mdb_printf("%u.", run_len);
+			cnt_clean = bit_clean;
+			run_len = 0;
+		}
+		if (i < size) {
+			run_len++;
+			if (!bit_clean) {
+				tot_dirty++;
+			}
+		}
+	}
+
+	/* All of the counters are uint_t, so they are printed with %u */
+	mdb_printf("\nTotal %s bits = %u\n", opstr, tot_dirty);
+	mdb_printf("Total %s transactions = %u\n", opstr, num_runs);
+	mdb_printf("Maximum %s set count = %u, reset count = %u\n", opstr,
+	    max_set_cnt, max_reset_cnt);
+}
+
 void
 print_mirror(void *un_addr, void *mdcptr, uint_t verbose)
 {
-	mm_unit_t	mm;
+	mm_unit_t	mm, *mmp;
 	void		**ptr;
 	int		setno = 0;
 	minor_t		un_self_id;
 	diskaddr_t	un_total_blocks;
 	ushort_t	mm_un_nsm;
 	submirror_cb_t	data;
+	uint_t		num_rr, rr_blksize;
+	ushort_t	*comp_rr;
+	unsigned char	*rr_dirty_bm, *rr_goingclean_bm;
+	uintptr_t	un_dbm, un_gcbm;
 
 	/* read in the device */
 	if (mdb_vread(&mm, sizeof (mm_unit_t),
@@ -134,6 +200,9 @@
 		mdb_warn("failed to read mm_unit_t at %p\n", un_addr);
 		return;
 	}
+
+	mmp = &mm;
+
 	un_self_id = ((mdc_unit_t *)mdcptr)->un_self_id;
 	un_total_blocks = ((mdc_unit_t *)mdcptr)->un_total_blocks;
 	mm_un_nsm = mm.un_nsm;
@@ -148,6 +217,39 @@
 	}
 	mdb_inc_indent(2);
 	mdb_printf("Size: %llu blocks\n", un_total_blocks);
+
+	/*
+	 * Dump out the run length encoding of un_dirty_bm and
+	 * un_goingclean_bm together with their size.  Each bitmap holds
+	 * num_rr bits, i.e. howmany(num_rr, NBBY) bytes in the target.
+	 */
+	num_rr = mm.un_rrd_num;
+	rr_blksize = mm.un_rrd_blksize;
+	un_dbm = (uintptr_t)mmp->un_dirty_bm;
+	un_gcbm = (uintptr_t)mmp->un_goingclean_bm;
+
+	mdb_printf("RR size: %u bits\n", num_rr);
+	mdb_printf("RR block size: %u blocks\n", rr_blksize);
+
+	rr_dirty_bm = (unsigned char *)mdb_alloc(howmany(num_rr, NBBY),
+	    UM_SLEEP|UM_GC);
+	rr_goingclean_bm = (unsigned char *)mdb_alloc(howmany(num_rr, NBBY),
+	    UM_SLEEP|UM_GC);
+	comp_rr = (ushort_t *)mdb_alloc(num_rr * sizeof (ushort_t),
+	    UM_SLEEP|UM_GC);
+
+	if (mdb_vread(rr_dirty_bm, howmany(num_rr, NBBY), un_dbm) == -1) {
+		mdb_warn("failed to read un_dirty_bm at %p\n", un_dbm);
+		return;
+	}
+	if (mdb_vread(rr_goingclean_bm, howmany(num_rr, NBBY), un_gcbm) == -1) {
+		mdb_warn("failed to read un_goingclean_bm at %p\n", un_gcbm);
+		return;
+	}
+
+	print_comp_bm(rr_dirty_bm, num_rr, comp_rr, "dirty");
+	print_comp_bm(rr_goingclean_bm, num_rr, comp_rr, "clean");
+
 	/*
 	 * find the sub mirrors, search through each metadevice looking
 	 * at the un_parent.
--- a/usr/src/cmd/mdb/intel/amd64/md/Makefile	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/mdb/intel/amd64/md/Makefile	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,15 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-#ident	"%Z%%M%	%I%	%E% SMI"
 
 MODULE = md.so
 MDBTGT = kvm
 
 MODSRCS = dumphotspare.c \
+	dumpmirror.c \
 	dumpnamespace.c \
 	findset.c \
 	md.c  \
--- a/usr/src/cmd/mdb/intel/ia32/md/Makefile	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/mdb/intel/ia32/md/Makefile	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,15 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-#ident	"%Z%%M%	%I%	%E% SMI"
 
 MODULE = md.so
 MDBTGT = kvm
 
 MODSRCS = dumphotspare.c \
+	dumpmirror.c \
 	dumpnamespace.c \
 	findset.c \
 	md.c  \
--- a/usr/src/cmd/mdb/sparc/v9/md/Makefile	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/cmd/mdb/sparc/v9/md/Makefile	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,15 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2003 Sun Microsystems, Inc.   All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 MODULE = md.so 
 MDBTGT = kvm
 
 MODSRCS = dumphotspare.c \
+	dumpmirror.c \
 	dumpnamespace.c \
 	findset.c \
 	md.c  \
--- a/usr/src/head/meta.h	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/head/meta.h	Wed Dec 24 08:23:40 2008 -0700
@@ -1844,10 +1844,12 @@
 
 /* meta_mn_comm.c */
 extern int		mdmn_send_message(set_t setno, md_mn_msgtype_t type,
-			    uint_t flags, char *data, int size,
-			    md_mn_result_t **resp, md_error_t *ep);
+			    uint_t flags, md_mn_nodeid_t recipient,
+			    char *data, int size, md_mn_result_t **resp,
+			    md_error_t *ep);
 extern int		mdmn_send_message_with_msgid(set_t setno,
-			    md_mn_msgtype_t type, uint_t flags, char *data,
+			    md_mn_msgtype_t type, uint_t flags,
+			    md_mn_nodeid_t recipient, char *data,
 			    int size, md_mn_result_t **resp,
 			    md_mn_msgid_t *msgid, md_error_t *ep);
 extern int		mdmn_create_msgid(md_mn_msgid_t *id);
@@ -1931,11 +1933,11 @@
 				    md_timeval32_t timestamp,
 				    ulong_t genid, md_error_t *ep);
 
-/* Flags for direction in copy_msg_1 */
+/* Flags for direction in copy_msg_2 */
 #define	MD_MN_COPY_TO_ONDISK 0x0001
 #define	MD_MN_COPY_TO_INCORE 0x0002
 
-extern void		copy_msg_1(md_mn_msg_t *incorep,
+extern void		copy_msg_2(md_mn_msg_t *incorep,
 			    md_mn_msg_od_t *ondiskp, int direction);
 extern void		free_msg(md_mn_msg_t *msg);
 
--- a/usr/src/lib/lvm/libmeta/common/mapfile-vers	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/mapfile-vers	Wed Dec 24 08:23:40 2008 -0700
@@ -22,8 +22,6 @@
 # Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-# ident	"%Z%%M%	%I%	%E% SMI"
-#
 
 SUNWprivate_1.1 {
     global:
@@ -92,7 +90,7 @@
 	commitset;
 	comp_state_to_name;
 	copy_msg;
-	copy_msg_1;
+	copy_msg_2;
 	copy_result;
 	crcfreetab;
 	crcfunc;
@@ -160,12 +158,12 @@
 	md_med_pmap_timeout;
 	mdmn_abort;
 	mdmn_allocate_changelog;
-	mdmn_comm_lock_1;
-	mdmn_comm_msglock_1;
-	mdmn_comm_reinit_set_1;
-	mdmn_comm_resume_1;
-	mdmn_comm_suspend_1;
-	mdmn_comm_unlock_1;
+	mdmn_comm_lock_2;
+	mdmn_comm_msglock_2;
+	mdmn_comm_reinit_set_2;
+	mdmn_comm_resume_2;
+	mdmn_comm_suspend_2;
+	mdmn_comm_unlock_2;
 	mdmn_create_msgid;
 	mdmn_get_changelogrec;
 	mdmn_get_handler;
@@ -177,14 +175,14 @@
 	mdmn_reinit_set;
 	mdmn_reset_changelog;
 	mdmn_resume;
-	mdmn_send_1;
+	mdmn_send_2;
 	mdmn_send_message;
 	mdmn_snarf_changelog;
 	mdmn_suspend;
 	mdmn_unlog_msg;
-	mdmn_wakeup_initiator_1;
-	mdmn_wakeup_master_1;
-	mdmn_work_1;
+	mdmn_wakeup_initiator_2;
+	mdmn_wakeup_master_2;
+	mdmn_work_2;
 	mdnullerror;
 	md_perror;
 	md_post_sig;
--- a/usr/src/lib/lvm/libmeta/common/meta_db.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_db.c	Wed Dec 24 08:23:40 2008 -0700
@@ -18,13 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Just in case we're not in a build environment, make sure that
  * TEXT_DOMAIN gets set to something.
@@ -928,7 +927,7 @@
 			 */
 			send_rval = mdmn_send_message(sp->setno,
 			    MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND |
-			    MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns,
+			    MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ns,
 			    sizeof (md_mn_msg_meta_db_newside_t),
 			    &resultp, ep);
 			if (send_rval != 0) {
@@ -1048,7 +1047,7 @@
 		 */
 		send_rval = mdmn_send_message(sp->setno,
 		    MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND |
-		    MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds,
+		    MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ds,
 		    sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep);
 		if (send_rval != 0) {
 			if (resultp == NULL)
@@ -1542,7 +1541,7 @@
 				flags |= MD_MSGF_NO_LOG;
 			send_rval = mdmn_send_message(sp->setno,
 			    MD_MN_MSG_META_DB_ATTACH,
-			    flags, (char *)&attach,
+			    flags, 0, (char *)&attach,
 			    sizeof (md_mn_msg_meta_db_attach_t),
 			    &resultp, ep);
 			if (send_rval != 0) {
@@ -2007,7 +2006,7 @@
 				flags |= MD_MSGF_NO_LOG;
 			send_rval = mdmn_send_message(sp->setno,
 			    MD_MN_MSG_META_DB_DETACH,
-			    flags, (char *)&detach,
+			    flags, 0, (char *)&detach,
 			    sizeof (md_mn_msg_meta_db_detach_t),
 			    &resultp, ep);
 			if (send_rval != 0) {
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,16 +18,14 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <stdlib.h>
 #include <unistd.h>
-
 #include <wait.h>
 #include <sys/time.h>
 #include <meta.h>
@@ -131,7 +128,7 @@
 		odp->lr_class = incp->lr_class;
 		odp->lr_msglen = incp->lr_msglen;
 		if (incp->lr_msglen)
-			copy_msg_1(&incp->lr_msg, &odp->lr_od_msg, direction);
+			copy_msg_2(&incp->lr_msg, &odp->lr_od_msg, direction);
 	} else {
 		incp->lr_revision = odp->lr_revision;
 		incp->lr_flags = odp->lr_flags;
@@ -139,7 +136,7 @@
 		incp->lr_class = odp->lr_class;
 		incp->lr_msglen = odp->lr_msglen;
 		if (odp->lr_msglen)
-			copy_msg_1(&incp->lr_msg, &odp->lr_od_msg, direction);
+			copy_msg_2(&incp->lr_msg, &odp->lr_od_msg, direction);
 	}
 }
 
@@ -196,7 +193,7 @@
 			(void) mdstealerror(ep, &req.ur_mde);
 #ifdef DEBUG
 			syslog(LOG_DEBUG, "allocate_log: %s\n",
-						mde_sperror(ep, ""));
+			    mde_sperror(ep, ""));
 #endif
 			Free(mdmn_changelog[setno]);
 			return (-1);
@@ -389,13 +386,14 @@
 	assert(lr != NULL);
 	if (!MSGID_CMP(&(msg->msg_msgid), &(lr->lr_msg.msg_msgid))) {
 		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
-		"unlog_msg: msgid mismatch\n"
-		"\t\tstored: ID = (%d, 0x%llx-%d) setno %d class %d type %d\n"
-		"\t\tattempting to unlog:\n"
-		"\t\tID = (%d, 0x%llx-%d) setno %d class %d type %d.\n"),
-		MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno,
-		lr->lr_class, lr->lr_msgtype, MSGID_ELEMS(msg->msg_msgid),
-		msg->msg_setno, class, msg->msg_type);
+		    "unlog_msg: msgid mismatch\n"
+		    "\t\tstored: ID = (%d, 0x%llx-%d) setno %d "
+		    "class %d type %d\n"
+		    "\t\tattempting to unlog:\n"
+		    "\t\tID = (%d, 0x%llx-%d) setno %d class %d type %d.\n"),
+		    MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno,
+		    lr->lr_class, lr->lr_msgtype, MSGID_ELEMS(msg->msg_msgid),
+		    msg->msg_setno, class, msg->msg_type);
 		return (-1);
 	}
 	lr->lr_msglen = 0;
@@ -462,10 +460,10 @@
 	if (!(MD_MNSET_DESC(sd)) || !sd->sd_mn_am_i_master) {
 		if (!(MD_MNSET_DESC(sd))) {
 			syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN,
-					"mdmn_commitlog - Not MN Set\n"));
+			    "mdmn_commitlog - Not MN Set\n"));
 		} else {
 			syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN,
-				"mdmn_commit_log - Not Master\n"));
+			    "mdmn_commit_log - Not Master\n"));
 		}
 		return (-1);
 	}
@@ -485,7 +483,7 @@
 		req.ur_size  = MDMN_LOGRECSIZE_OD;
 		req.ur_data = (uintptr_t)&clodrec;
 		if ((retval = metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde,
-							    NULL)) != 0) {
+		    NULL)) != 0) {
 			(void) mdstealerror(ep, &req.ur_mde);
 #ifdef DEBUG
 			syslog(LOG_DAEMON|LOG_DEBUG,
@@ -501,16 +499,16 @@
 		recs[lrc] = 0;
 		/* Commit to mddb  on disk */
 		METAD_SETUP_LR(MD_DB_COMMIT_MANY, setno,
-					mdmn_changelog[setno][0].lr_selfid);
+		    mdmn_changelog[setno][0].lr_selfid);
 		req.ur_size = size;
 		req.ur_data = (uintptr_t)recs;
 		if ((retval = metaioctl(MD_MN_DB_USERREQ, &req,
-						&req.ur_mde, NULL)) != 0) {
+		    &req.ur_mde, NULL)) != 0) {
 			(void) mdstealerror(ep, &req.ur_mde);
 #ifdef DEBUG
 			syslog(LOG_DAEMON|LOG_DEBUG,
-					"mdmn_commitlog - metaioctl COMMIT_MANY"
-					"Failure\n%s",  mde_sperror(ep, ""));
+			    "mdmn_commitlog - metaioctl COMMIT_MANY"
+			    "Failure\n%s",  mde_sperror(ep, ""));
 #endif
 		}
 	}
@@ -609,7 +607,7 @@
 	}
 
 	lr = (mdmn_changelog_record_od_t *)get_ur_rec(set, MD_UR_GET_NEXT,
-						    MDDB_UR_LR, &id, ep);
+	    MDDB_UR_LR, &id, ep);
 	if (lr == NULL)
 		return (0);
 
@@ -618,7 +616,7 @@
 	if (mdmn_changelog[set] == NULL) {
 		/* Allocate incore state for the log */
 		mdmn_changelog[set] = Zalloc(MDMN_LOGHDR_SIZE *
-			mdmn_logrecs);
+		    mdmn_logrecs);
 	}
 
 	do {
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c	Wed Dec 24 08:23:40 2008 -0700
@@ -20,12 +20,10 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <stdlib.h>
 #include <unistd.h>
 #include <wait.h>
@@ -72,181 +70,264 @@
 void
 ldump_msg(char *prefix, md_mn_msg_t *msg)
 {
-	(void) fprintf(stderr, "%s &msg   = 0x%x\n", prefix, (uint_t)msg);
-	(void) fprintf(stderr, "%s ID     = (%d, 0x%llx-%d)\n", prefix,
+	(void) fprintf(stderr, "%s &msg       = 0x%x\n", prefix, (uint_t)msg);
+	(void) fprintf(stderr, "%s ID         = (%d, 0x%llx-%d)\n", prefix,
 	    MSGID_ELEMS(msg->msg_msgid));
-	(void) fprintf(stderr, "%s sender = %d\n", prefix, msg->msg_sender);
-	(void) fprintf(stderr, "%s flags  = 0x%x\n", prefix, msg->msg_flags);
-	(void) fprintf(stderr, "%s setno  = %d\n", prefix, msg->msg_setno);
-	(void) fprintf(stderr, "%s type   = %d\n", prefix, msg->msg_type);
-	(void) fprintf(stderr, "%s size   = %d\n", prefix, msg->msg_event_size);
+	(void) fprintf(stderr, "%s sender     = %d\n", prefix, msg->msg_sender);
+	(void) fprintf(stderr, "%s flags      = 0x%x\n",
+	    prefix, msg->msg_flags);
+	(void) fprintf(stderr, "%s setno      = %d\n", prefix, msg->msg_setno);
+	(void) fprintf(stderr, "%s recipient  = %d\n",
+	    prefix, msg->msg_recipient);
+	(void) fprintf(stderr, "%s type       = %d\n", prefix, msg->msg_type);
+	(void) fprintf(stderr, "%s size       = %d\n",
+	    prefix, msg->msg_event_size);
 }
 
+#define	COMMD_PROGNAME	"rpc.mdcommd"
+
+extern uint_t meta_rpc_err_mask(void);
+
+/*
+ * If a clnt_call gets an RPC error, force the message out here with details.
+ * This would be nice to send to commd_debug(), but we can't call rpc.mdcommd
+ * code from libmeta.
+ */
+static void
+mdmn_handle_RPC_error(CLIENT *clnt, char *ident, md_mn_nodeid_t nid)
+{
+	/*
+	 * Sized for the longest message, with nid printed unsigned (%u):
+	 * "mdmn_wakeup_initiator: rpc.mdcommd node 4294967295"
+	 */
+	char errstr[51];
+	struct rpc_err e;
+
+	CLNT_GETERR(clnt, &e);
+	if (meta_rpc_err_mask() & (1U << e.re_status)) {
+		if (nid == 0) {
+			(void) snprintf(errstr, sizeof (errstr),
+			    "%s: %s node (local)", ident, COMMD_PROGNAME);
+		} else {
+			(void) snprintf(errstr, sizeof (errstr),
+			    "%s: %s node %u", ident, COMMD_PROGNAME, nid);
+		}
+		syslog(LOG_WARNING, "mdmn_handle_RPC_error: %s",
+		    clnt_sperror(clnt, errstr));
+	}
+}
 
 /* Default timeout can be changed using clnt_control() */
 static struct timeval TIMEOUT = { 25, 0 };
 
 md_mn_result_t *
-mdmn_send_1(argp, clnt)
+mdmn_send_2(argp, clnt, nid)
 	md_mn_msg_t *argp;
 	CLIENT *clnt;
+	md_mn_nodeid_t nid;
 {
+	enum clnt_stat	res;
 	md_mn_result_t *clnt_res = Zalloc(sizeof (md_mn_result_t));
 
-	if (clnt_call(clnt, mdmn_send,
+	res = clnt_call(clnt, mdmn_send,
 		(xdrproc_t)xdr_md_mn_msg_t, (caddr_t)argp,
-		(xdrproc_t)xdr_md_mn_result_t, (caddr_t)clnt_res,
-		TIMEOUT) != RPC_SUCCESS) {
-		return (NULL);
+		(xdrproc_t)xdr_md_mn_result_t, (caddr_t)clnt_res, TIMEOUT);
+
+	if (res == RPC_SUCCESS) {
+		return (clnt_res);
 	}
-	return (clnt_res);
+	mdmn_handle_RPC_error(clnt, "mdmn_send", nid);
+	Free(clnt_res);
+	return (NULL);
 }
 
 int *
-mdmn_work_1(argp, clnt)
+mdmn_work_2(argp, clnt, nid)
 	md_mn_msg_t *argp;
 	CLIENT *clnt;
+	md_mn_nodeid_t nid;
 {
+	enum clnt_stat	res;
 	int *clnt_res = Zalloc(sizeof (int));
 
-	if (clnt_call(clnt, mdmn_work,
+	res = clnt_call(clnt, mdmn_work,
 		(xdrproc_t)xdr_md_mn_msg_t, (caddr_t)argp,
-		(xdrproc_t)xdr_int, (caddr_t)clnt_res,
-		TIMEOUT) != RPC_SUCCESS) {
-		Free(clnt_res);
-		return (NULL);
+		(xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+	if (res == RPC_SUCCESS) {
+		return (clnt_res);
 	}
-	return (clnt_res);
+	mdmn_handle_RPC_error(clnt, "mdmn_work", nid);
+	Free(clnt_res);
+	return (NULL);
 }
 
 int *
-mdmn_wakeup_initiator_1(argp, clnt)
+mdmn_wakeup_initiator_2(argp, clnt, nid)
 	md_mn_result_t *argp;
 	CLIENT *clnt;
+	md_mn_nodeid_t nid;
 {
+	enum clnt_stat	res;
 	int *clnt_res = Zalloc(sizeof (int));
 
-	if (clnt_call(clnt, mdmn_wakeup_initiator,
+	res = clnt_call(clnt, mdmn_wakeup_initiator,
 		(xdrproc_t)xdr_md_mn_result_t, (caddr_t)argp,
-		(xdrproc_t)xdr_int, (caddr_t)clnt_res,
-		TIMEOUT) != RPC_SUCCESS) {
-		Free(clnt_res);
-		return (NULL);
+		(xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+	if (res == RPC_SUCCESS) {
+		return (clnt_res);
 	}
-	return (clnt_res);
+	mdmn_handle_RPC_error(clnt, "mdmn_wakeup_initiator", nid);
+	Free(clnt_res);
+	return (NULL);
 }
 
 int *
-mdmn_wakeup_master_1(argp, clnt)
+mdmn_wakeup_master_2(argp, clnt, nid)
 	md_mn_result_t *argp;
 	CLIENT *clnt;
+	md_mn_nodeid_t nid;
 {
+	enum clnt_stat	res;
 	int *clnt_res = Zalloc(sizeof (int));
 
-	if (clnt_call(clnt, mdmn_wakeup_master,
+	res = clnt_call(clnt, mdmn_wakeup_master,
 		(xdrproc_t)xdr_md_mn_result_t, (caddr_t)argp,
-		(xdrproc_t)xdr_int, (caddr_t)clnt_res,
-		TIMEOUT) != RPC_SUCCESS) {
-		Free(clnt_res);
-		return (NULL);
+		(xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+	if (res == RPC_SUCCESS) {
+		return (clnt_res);
 	}
-	return (clnt_res);
+	mdmn_handle_RPC_error(clnt, "mdmn_wakeup_master", nid);
+	Free(clnt_res);
+	return (NULL);
 }
 
 int *
-mdmn_comm_lock_1(argp, clnt)
+mdmn_comm_lock_2(argp, clnt, nid)
 	md_mn_set_and_class_t *argp;
 	CLIENT *clnt;
+	md_mn_nodeid_t nid;
 {
+	enum clnt_stat	res;
 	int *clnt_res = Zalloc(sizeof (int));
 
-	if (clnt_call(clnt, mdmn_comm_lock,
+	res = clnt_call(clnt, mdmn_comm_lock,
 		(xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp,
-		(xdrproc_t)xdr_int, (caddr_t)clnt_res,
-		TIMEOUT) != RPC_SUCCESS) {
-		return (NULL);
+		(xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+	if (res == RPC_SUCCESS) {
+		return (clnt_res);
 	}
-	return (clnt_res);
+	mdmn_handle_RPC_error(clnt, "mdmn_comm_lock", nid);
+	Free(clnt_res);
+	return (NULL);
 }
 
 int *
-mdmn_comm_unlock_1(argp, clnt)
+mdmn_comm_unlock_2(argp, clnt, nid)
 	md_mn_set_and_class_t *argp;
 	CLIENT *clnt;
+	md_mn_nodeid_t nid;
 {
+	enum clnt_stat	res;
 	int *clnt_res = Zalloc(sizeof (int));
 
-	if (clnt_call(clnt, mdmn_comm_unlock,
+	res = clnt_call(clnt, mdmn_comm_unlock,
 		(xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp,
-		(xdrproc_t)xdr_int, (caddr_t)clnt_res,
-		TIMEOUT) != RPC_SUCCESS) {
-		return (NULL);
+		(xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+	if (res == RPC_SUCCESS) {
+		return (clnt_res);
 	}
-	return (clnt_res);
+	mdmn_handle_RPC_error(clnt, "mdmn_comm_unlock", nid);
+	Free(clnt_res);
+	return (NULL);
 }
 
 int *
-mdmn_comm_suspend_1(argp, clnt)
+mdmn_comm_suspend_2(argp, clnt, nid)
 	md_mn_set_and_class_t *argp;
 	CLIENT *clnt;
+	md_mn_nodeid_t nid;
 {
+	enum clnt_stat	res;
 	int *clnt_res = Zalloc(sizeof (int));
 
-	if (clnt_call(clnt, mdmn_comm_suspend,
+	res = clnt_call(clnt, mdmn_comm_suspend,
 		(xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp,
-		(xdrproc_t)xdr_int, (caddr_t)clnt_res,
-		TIMEOUT) != RPC_SUCCESS) {
-		return (NULL);
+		(xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+	if (res == RPC_SUCCESS) {
+		return (clnt_res);
 	}
-	return (clnt_res);
+	mdmn_handle_RPC_error(clnt, "mdmn_comm_suspend", nid);
+	Free(clnt_res);
+	return (NULL);
 }
 
 int *
-mdmn_comm_resume_1(argp, clnt)
+mdmn_comm_resume_2(argp, clnt, nid)
 	md_mn_set_and_class_t *argp;
 	CLIENT *clnt;
+	md_mn_nodeid_t nid;
 {
+	enum clnt_stat	res;
 	int *clnt_res = Zalloc(sizeof (int));
 
-	if (clnt_call(clnt, mdmn_comm_resume,
+	res = clnt_call(clnt, mdmn_comm_resume,
 		(xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp,
-		(xdrproc_t)xdr_int, (caddr_t)clnt_res,
-		TIMEOUT) != RPC_SUCCESS) {
-		return (NULL);
+		(xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+	if (res == RPC_SUCCESS) {
+		return (clnt_res);
 	}
-	return (clnt_res);
+	mdmn_handle_RPC_error(clnt, "mdmn_comm_resume", nid);
+	Free(clnt_res);
+	return (NULL);
 }
 
 int *
-mdmn_comm_reinit_set_1(argp, clnt)
+mdmn_comm_reinit_set_2(argp, clnt, nid)
 	set_t *argp;
 	CLIENT *clnt;
+	md_mn_nodeid_t nid;
 {
+	enum clnt_stat	res;
 	int *clnt_res = Zalloc(sizeof (int));
 
-	if (clnt_call(clnt, mdmn_comm_reinit_set,
+	res = clnt_call(clnt, mdmn_comm_reinit_set,
 		(xdrproc_t)xdr_set_t, (caddr_t)argp,
-		(xdrproc_t)xdr_int, (caddr_t)clnt_res,
-		TIMEOUT) != RPC_SUCCESS) {
-		return (NULL);
+		(xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+	if (res == RPC_SUCCESS) {
+		return (clnt_res);
 	}
-	return (clnt_res);
+	mdmn_handle_RPC_error(clnt, "mdmn_comm_reinit_set", nid);
+	Free(clnt_res);
+	return (NULL);
 }
 
 int *
-mdmn_comm_msglock_1(argp, clnt)
+mdmn_comm_msglock_2(argp, clnt, nid)
 	md_mn_type_and_lock_t *argp;
 	CLIENT *clnt;
+	md_mn_nodeid_t nid;
 {
+	enum clnt_stat	res;
 	int *clnt_res = Zalloc(sizeof (int));
 
-	if (clnt_call(clnt, mdmn_comm_msglock,
+	res = clnt_call(clnt, mdmn_comm_msglock,
 		(xdrproc_t)xdr_md_mn_type_and_lock_t, (caddr_t)argp,
-		(xdrproc_t)xdr_int, (caddr_t)clnt_res,
-		TIMEOUT) != RPC_SUCCESS) {
-		return (NULL);
+		(xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+	if (res == RPC_SUCCESS) {
+		return (clnt_res);
 	}
-	return (clnt_res);
+	mdmn_handle_RPC_error(clnt, "mdmn_comm_msglock", nid);
+	Free(clnt_res);
+	return (NULL);
 }
 
 
@@ -370,6 +451,7 @@
 	nmsg->msg_flags		= msg->msg_flags;
 	nmsg->msg_setno		= msg->msg_setno;
 	nmsg->msg_type		= msg->msg_type;
+	nmsg->msg_recipient	= msg->msg_recipient;
 	nmsg->msg_event_size	= msg->msg_event_size;
 	if (msg->msg_event_size > 0) {
 		bcopy(msg->msg_event_data, nmsg->msg_event_data,
@@ -379,7 +461,7 @@
 }
 
 void
-copy_msg_1(md_mn_msg_t *msg, md_mn_msg_od_t *msgod, int direction)
+copy_msg_2(md_mn_msg_t *msg, md_mn_msg_od_t *msgod, int direction)
 {
 	assert((direction == MD_MN_COPY_TO_ONDISK) ||
 	    (direction == MD_MN_COPY_TO_INCORE));
@@ -390,6 +472,7 @@
 		msgod->msg_flags	= msg->msg_flags;
 		msgod->msg_setno	= msg->msg_setno;
 		msgod->msg_type		= msg->msg_type;
+		msgod->msg_recipient	= msg->msg_recipient;
 		msgod->msg_od_event_size = msg->msg_event_size;
 		/* paranoid checks */
 		if (msg->msg_event_size != 0 && msg->msg_event_data != NULL)
@@ -401,6 +484,7 @@
 		msg->msg_flags		= msgod->msg_flags;
 		msg->msg_setno		= msgod->msg_setno;
 		msg->msg_type		= msgod->msg_type;
+		msg->msg_recipient	= msgod->msg_recipient;
 		msg->msg_event_size	= msgod->msg_od_event_size;
 		if (msg->msg_event_data == NULL)
 			msg->msg_event_data = Zalloc(msg->msg_event_size);
@@ -462,7 +546,7 @@
 	if (mdmn_clients == (md_mn_client_list_t *)NULL) {
 		/* if there is no entry, create a client and return a it */
 		local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD,
-			ONE, "tcp");
+		    TWO, "tcp");
 	} else {
 		/*
 		 * If there is an entry from a previous put operation,
@@ -517,6 +601,13 @@
  * a msgid is already attached to it.
  * In that case mdmn_send_message_with_msgid() has to be called directly.
  *
+ * The recipient argument is almost always unused, and is therefore typically
+ * set to zero, as zero is an invalid cluster nodeid.  The exceptions are the
+ * marking and clearing of the DRL from a node that is not currently the
+ * owner.  In these cases, the recipient argument will be the nodeid of the
+ * mirror owner, and MD_MSGF_DIRECTED will be set in the flags.  Non-owner
+ * nodes will not receive these messages.
+ *
  * Return values / CAVEAT EMPTOR: see mdmn_send_message_with_msgid()
  */
 
@@ -525,13 +616,14 @@
 		set_t setno,
 		md_mn_msgtype_t type,
 		uint_t flags,
+		md_mn_nodeid_t recipient,
 		char *data,
 		int size,
 		md_mn_result_t **result,
 		md_error_t *ep)
 {
-	return (mdmn_send_message_with_msgid(
-		setno, type, flags, data, size, result, MD_NULL_MSGID, ep));
+	return (mdmn_send_message_with_msgid(setno, type, flags,
+	    recipient, data, size, result, MD_NULL_MSGID, ep));
 }
 /*
  * mdmn_send_message_with_msgid()
@@ -561,6 +653,7 @@
 		set_t setno,
 		md_mn_msgtype_t type,
 		uint_t flags,
+		md_mn_nodeid_t recipient,
 		char *data,
 		int size,
 		md_mn_result_t **result,
@@ -619,6 +712,7 @@
 	 */
 	msg.msg_flags		= flags;
 	msg.msg_setno		= setno;
+	msg.msg_recipient	= recipient;
 	msg.msg_type		= type;
 	msg.msg_event_size	= size;
 	msg.msg_event_data	= data;
@@ -655,7 +749,7 @@
 	 * - retries1 or retries2 exceeded
 	 */
 	for (; ; ) {
-		*result = mdmn_send_1(&msg, local_daemon);
+		*result = mdmn_send_2(&msg, local_daemon, 0);
 		resp = *result;
 		if (resp != (md_mn_result_t *)NULL) {
 			/* Bingo! */
@@ -800,8 +894,8 @@
 	if ((setno >= MD_MAXSETS) || (class >= MD_MN_NCLASSES)) {
 		return (MDE_DS_COMMDCTL_SUSPEND_FAIL);
 	}
-	local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE,
-		"tcp");
+	local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO,
+	    "tcp");
 	if (local_daemon == (CLIENT *)NULL) {
 		clnt_pcreateerror("local_daemon");
 		return (MDE_DS_COMMDCTL_SUSPEND_FAIL);
@@ -818,7 +912,7 @@
 	msc.msc_class = class;
 	msc.msc_flags = 0;
 
-	resp = mdmn_comm_suspend_1(&msc, local_daemon);
+	resp = mdmn_comm_suspend_2(&msc, local_daemon, 0);
 	clnt_destroy(local_daemon);
 
 	if (resp == NULL) {
@@ -861,8 +955,8 @@
 	if ((setno >= MD_MAXSETS) || (class >= MD_MN_NCLASSES)) {
 		return (MDE_DS_COMMDCTL_RESUME_FAIL);
 	}
-	local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE,
-		"tcp");
+	local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO,
+	    "tcp");
 	if (local_daemon == (CLIENT *)NULL) {
 		clnt_pcreateerror("local_daemon");
 		return (MDE_DS_COMMDCTL_RESUME_FAIL);
@@ -879,7 +973,7 @@
 	msc.msc_class = class;
 	msc.msc_flags = flags;
 
-	resp = mdmn_comm_resume_1(&msc, local_daemon);
+	resp = mdmn_comm_resume_2(&msc, local_daemon, 0);
 
 	if (resp != NULL) {
 		if (*resp == MDMNE_ACK) {
@@ -905,10 +999,8 @@
 	md_error_t	mdne = mdnullerror;
 
 	(void) mdmn_send_message(0, /* No set is needed for this message */
-			MD_MN_MSG_ABORT,
-			MD_MSGF_LOCAL_ONLY,
-			dummy, sizeof (dummy),
-			&resultp, &mdne);
+	    MD_MN_MSG_ABORT, MD_MSGF_LOCAL_ONLY, 0,
+	    dummy, sizeof (dummy), &resultp, &mdne);
 
 	if (resultp != NULL) {
 		Free(resultp);
@@ -935,8 +1027,8 @@
 	if ((setno == 0) || (setno >= MD_MAXSETS)) {
 		return (1);
 	}
-	local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE,
-		"tcp");
+	local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO,
+	    "tcp");
 	if (local_daemon == (CLIENT *)NULL) {
 		clnt_pcreateerror("local_daemon");
 		return (1);
@@ -949,7 +1041,7 @@
 		}
 	}
 
-	resp = mdmn_comm_reinit_set_1(&setno, local_daemon);
+	resp = mdmn_comm_reinit_set_2(&setno, local_daemon, 0);
 
 	if (resp != NULL) {
 		if (*resp == MDMNE_ACK) {
@@ -984,8 +1076,8 @@
 	if ((msgtype == 0) || (msgtype >= MD_MN_NMESSAGES)) {
 		return (1);
 	}
-	local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE,
-		"tcp");
+	local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO,
+	    "tcp");
 	if (local_daemon == (CLIENT *)NULL) {
 		clnt_pcreateerror("local_daemon");
 		return (1);
@@ -993,7 +1085,7 @@
 	mmtl.mmtl_type = msgtype;
 	mmtl.mmtl_lock = locktype;
 
-	resp = mdmn_comm_msglock_1(&mmtl, local_daemon);
+	resp = mdmn_comm_msglock_2(&mmtl, local_daemon, 0);
 
 	if (resp != NULL) {
 		if (*resp == MDMNE_ACK) {
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c	Wed Dec 24 08:23:40 2008 -0700
@@ -18,13 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <stdlib.h>
 #include <unistd.h>
 #include <wait.h>
@@ -448,7 +447,7 @@
 	myflags |= msg->msg_flags & MD_MSGF_INHERIT_BITS;
 
 	ret = mdmn_send_message(MD_MIN2SET(d->msg_chooseid_mnum),
-	    MD_MN_MSG_CHANGE_OWNER, myflags, (char *)&chownermsg,
+	    MD_MN_MSG_CHANGE_OWNER, myflags, 0, (char *)&chownermsg,
 	    sizeof (chownermsg), &resp1, &mde);
 	if (resp1 != NULL)
 		free_result(resp1);
@@ -2120,3 +2119,67 @@
 
 	resp->mmr_exitval = 0;
 }
+
+/*
+ * This is used to issue a MD_MN_RR_DIRTY ioctl to the mirror.
+ */
+/*ARGSUSED*/
+void
+mdmn_do_mark_dirty(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
+{
+	md_mn_msg_rr_dirty_t	*dirty_msg;
+	md_mn_rr_dirty_params_t	rrp;
+
+	/* No output or error text is returned; only the ioctl status. */
+	resp->mmr_comm_state = MDMNE_ACK;
+	resp->mmr_out = NULL;
+	resp->mmr_out_size = 0;
+	resp->mmr_err = NULL;
+	resp->mmr_err_size = 0;
+
+	dirty_msg = (md_mn_msg_rr_dirty_t *)((void *)(msg->msg_event_data));
+
+	/* rr_range packs the start in its high 16 bits, the end in the low. */
+	(void) memset(&rrp, 0, sizeof (rrp));
+	MD_SETDRIVERNAME(&rrp, MD_MIRROR, MD_MIN2SET(dirty_msg->rr_mnum))
+	rrp.rr_mnum = dirty_msg->rr_mnum;
+	rrp.rr_nodeid = dirty_msg->rr_nodeid;
+	rrp.rr_start = (ushort_t)((dirty_msg->rr_range >> 16) & 0xffff);
+	rrp.rr_end = (ushort_t)(dirty_msg->rr_range & 0xffff);
+
+	resp->mmr_exitval = metaioctl(MD_MN_RR_DIRTY, &rrp, &rrp.mde, NULL);
+}
+
+/*
+ * This is used to issue a MD_MN_RR_CLEAN ioctl to the mirror.
+ */
+/*ARGSUSED*/
+void
+mdmn_do_mark_clean(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
+{
+	md_mn_msg_rr_clean_t	*clean_msg;
+	md_mn_rr_clean_params_t	*rrp;
+	size_t			nbytes;
+
+	/* No output or error text is returned; only the ioctl status. */
+	resp->mmr_comm_state = MDMNE_ACK;
+	resp->mmr_out = NULL;
+	resp->mmr_out_size = 0;
+	resp->mmr_err = NULL;
+	resp->mmr_err_size = 0;
+
+	clean_msg = (md_mn_msg_rr_clean_t *)((void *)(msg->msg_event_data));
+	nbytes = MDMN_MSG_RR_CLEAN_DATA_BYTES(clean_msg);
+
+	/* Allocate the params structure plus room for the message's data. */
+	rrp = Zalloc(sizeof (struct md_mn_rr_clean_params) + nbytes);
+	MD_SETDRIVERNAME(rrp, MD_MIRROR, MD_MIN2SET(clean_msg->rr_mnum))
+	rrp->rr_mnum = clean_msg->rr_mnum;
+	rrp->rr_nodeid = clean_msg->rr_nodeid;
+	rrp->rr_start_size = clean_msg->rr_start_size;
+	(void) memcpy(MDMN_RR_CLEAN_PARAMS_DATA(rrp),
+	    MDMN_MSG_RR_CLEAN_DATA(clean_msg), nbytes);
+
+	resp->mmr_exitval = metaioctl(MD_MN_RR_CLEAN, rrp, &rrp->mde, NULL);
+	Free(rrp);
+}
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c	Wed Dec 24 08:23:40 2008 -0700
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <meta.h>
 
 extern void mdmn_do_cmd(HANDLER_PARMS);
@@ -56,6 +54,8 @@
 extern void mdmn_do_get_tstate(HANDLER_PARMS);
 extern void mdmn_do_get_mirstate(HANDLER_PARMS);
 extern void mdmn_do_addmdname(HANDLER_PARMS);
+extern void mdmn_do_mark_dirty(HANDLER_PARMS);
+extern void mdmn_do_mark_clean(HANDLER_PARMS);
 
 extern int mdmn_smgen_test6(SMGEN_PARMS);
 extern int mdmn_smgen_state_upd(SMGEN_PARMS);
@@ -693,10 +693,36 @@
 	 * Add metadevice name into replica
 	 */
 		MD_MSG_CLASS1,		/* message class */
-		mdmn_do_addmdname,	/* add ,etadevice name */
+		mdmn_do_addmdname,	/* add metadevice name */
 		NULL,			/* submessage generator */
 		90,			/* times out in 90 secs */
 		10000, 2,		/* class busy retry / time delta */
 		10, 1000		/* comm fail retry / time delta */
 	},
+
+	{
+	/*
+	 * MD_MN_MSG_RR_DIRTY
+	 * Mark given range of un_dirty_bm as dirty
+	 */
+		MD_MSG_CLASS2,		/* message class */
+		mdmn_do_mark_dirty,	/* message handler */
+		NULL,			/* submessage generator */
+		8,			/* timeout in seconds */
+		UINT_MAX, 10,		/* class busy retry / time delta */
+		UINT_MAX, 100		/* comm fail retry / time delta */
+	},
+
+	{
+	/*
+	 * MD_MN_MSG_RR_CLEAN
+	 * Mark given range of un_dirty_bm as clean
+	 */
+		MD_MSG_CLASS2,		/* message class */
+		mdmn_do_mark_clean,	/* message handler */
+		NULL,			/* submessage generator */
+		8,			/* timeout in seconds */
+		UINT_MAX, 10,		/* class busy retry / time delta */
+		UINT_MAX, 100		/* comm fail retry / time delta */
+	},
 };
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c	Wed Dec 24 08:23:40 2008 -0700
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Just in case we're not in a build environment, make sure that
  * TEXT_DOMAIN gets set to something.
@@ -62,7 +60,7 @@
 
 	/* Local set cannot be MultiNode */
 	if ((sp == NULL) || (sp->setname == NULL) ||
-				(strcmp(sp->setname, MD_LOCAL_NAME) == 0))
+	    (strcmp(sp->setname, MD_LOCAL_NAME) == 0))
 		return (0);
 	sd = metaget_setdesc(sp, ep);
 	ASSERT(sd != NULL);
@@ -128,7 +126,7 @@
 	md_mn_result_t	*resp = NULL;
 
 	(void) mdmn_send_message(setno, MD_MN_MSG_TEST2,
-	    MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, data,
+	    MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, 0, data,
 	    sizeof (data), &resp, &mde);
 
 	if (resp != (md_mn_result_t *)NULL) {
@@ -234,9 +232,8 @@
 	} else {
 		send_message_type = MD_MN_MSG_BC_CMD;
 	}
-	err = mdmn_send_message(
-		sp->setno, send_message_type, send_message_flags,
-		cmd, 1024, &resp, ep);
+	err = mdmn_send_message(sp->setno, send_message_type,
+	    send_message_flags, 0, cmd, 1024, &resp, ep);
 
 	free(cmd);
 
@@ -285,9 +282,9 @@
 			    "Command not attempted: Unable to log message "
 			    "in set %s\n"), sp->setname);
 			if (c.c_flags & MDDB_C_STALE) {
-			    (void) mdmddberror(ep, MDE_DB_STALE,
-			    (minor_t)NODEV64, sp->setno, 0, NULL);
-			    mde_perror(ep, "");
+				(void) mdmddberror(ep, MDE_DB_STALE,
+				    (minor_t)NODEV64, sp->setno, 0, NULL);
+				mde_perror(ep, "");
 			}
 		} else {
 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
@@ -333,7 +330,7 @@
 	 */
 	result = mdmn_send_message(MD_MIN2SET(mnum),
 	    MD_MN_MSG_SUSPEND_WRITES,
-	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
+	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
 	    (char *)&suspwrmsg, sizeof (suspwrmsg), &resp, ep);
 	if (resp != NULL) {
 		free_result(resp);
@@ -608,7 +605,7 @@
 	 * time required.
 	 */
 	ret = mdmn_send_message(sp->setno, MD_MN_MSG_SETSYNC,
-	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
+	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
 	    (char *)&setsyncmsg, sizeof (setsyncmsg), &resp, ep);
 	if (resp != NULL) {
 		free_result(resp);
@@ -720,7 +717,7 @@
 	resyncmsg.msg_resync_mnum =  mnum;
 	result = mdmn_send_message(MD_MIN2SET(mnum),
 	    MD_MN_MSG_RESYNC_STARTING,
-	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
+	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
 	    (char *)&resyncmsg, sizeof (resyncmsg), &resp, ep);
 
 	if (resp != NULL) {
@@ -905,7 +902,7 @@
 	tstatemsg.gettstate_dev = dev;
 	result = mdmn_send_message(MD_MIN2SET(mnum),
 	    MD_MN_MSG_GET_TSTATE,
-	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST,
+	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0,
 	    (char *)&tstatemsg, sizeof (tstatemsg), &resp, ep);
 
 	if (result == 0)
--- a/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c	Wed Dec 24 08:23:40 2008 -0700
@@ -1244,11 +1244,11 @@
 			 * and the message doesn't need being logged either.
 			 * Hence NO_LOG and NO_MCT
 			 */
-			err = mdmn_send_message(
-			    sp->setno, MD_MN_MSG_CLU_CHECK,
-			    MD_MSGF_NO_MCT | MD_MSGF_STOP_ON_ERROR |
-			    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
-			    (char *)&d, sizeof (md_isopen_t), &resp, ep);
+			err = mdmn_send_message(sp->setno,
+			    MD_MN_MSG_CLU_CHECK, MD_MSGF_NO_MCT |
+			    MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG |
+			    MD_MSGF_OVERRIDE_SUSPEND, 0, (char *)&d,
+			    sizeof (md_isopen_t), &resp, ep);
 			if (err == 0) {
 				d.isopen = resp->mmr_exitval;
 			} else {
--- a/usr/src/lib/lvm/libmeta/common/meta_runtime.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_runtime.c	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Just in case we're not in a build environment, make sure that
  * TEXT_DOMAIN gets set to something.
@@ -171,9 +169,8 @@
 			    ownerioctls_onp) != 0) {
 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
 				    "%s: illegal value for %s: %s.\n"),
-					function_namep,
-					ownerioctls_namep,
-					param_valuep);
+				    function_namep, ownerioctls_namep,
+				    param_valuep);
 				syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
 				    "%s: illegal value for %s: %s.\n"),
 				    function_namep,
@@ -216,6 +213,32 @@
 }
 
 /*
+ * This controls what type of RPC errors are sent to syslog().
+ * It is used as a bitmask against the clnt_stat list, which defines
+ * 0 as RPC_SUCCESS, so bit 0 likely shouldn't be set.
+ *
+ * DEFAULT_ERRMASK selects every error in the list.  A "commd_RPC_errors"
+ * parameter replaces it; the value is parsed as unsigned hexadecimal so
+ * that masks with bit 31 set are not clamped to LONG_MAX on ILP32.
+ */
+
+#define	DEFAULT_ERRMASK	(UINT_MAX & ~(1 << RPC_SUCCESS))
+
+uint_t
+meta_rpc_err_mask(void)
+{
+	char		*param_valuep;
+	uint_t		retval = DEFAULT_ERRMASK;
+
+	param_valuep = meta_get_rt_param("commd_RPC_errors", B_FALSE);
+	if (param_valuep != NULL) {
+		retval = (uint_t)strtoul(param_valuep, NULL, 16);
+		free(param_valuep);
+	}
+	return (retval);
+}
+
+/*
  * The following lines define private functions
  */
 
@@ -232,27 +255,23 @@
 
 	line_bufferp = (char *)malloc(line_buffer_size);
 	if (line_bufferp == NULL) {
-		(void) fprintf(stderr,
-			dgettext(TEXT_DOMAIN, "%s: malloc failed\n"),
-			function_namep);
-		syslog(LOG_ERR,
-			dgettext(TEXT_DOMAIN, "%s: malloc failed\n"),
-			function_namep);
+		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+		    "%s: malloc failed\n"), function_namep);
+		syslog(LOG_ERR, dgettext(TEXT_DOMAIN, "%s: malloc failed\n"),
+		    function_namep);
 		return (param_valuep);
 	}
 	param_filep = fopen(param_file_namep, "r");
 	if (param_filep == NULL) {
-		(void) fprintf(stderr,
-			dgettext(TEXT_DOMAIN, "%s: can't open %s\n"),
-			function_namep, param_file_namep);
-		syslog(LOG_ERR,
-			dgettext(TEXT_DOMAIN, "%s: can't open %s\n"),
-			function_namep, param_file_namep);
+		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+		    "%s: can't open %s\n"), function_namep, param_file_namep);
+		syslog(LOG_ERR, dgettext(TEXT_DOMAIN, "%s: can't open %s\n"),
+		    function_namep, param_file_namep);
 		free(line_bufferp);
 		return (param_valuep);
 	}
 	while ((fgets(line_bufferp, line_buffer_size, param_filep) != NULL) &&
-		(param_valuep == NULL)) {
+	    (param_valuep == NULL)) {
 
 		newlinep = strchr(line_bufferp, '\n');
 		if (newlinep != NULL) {
@@ -261,10 +280,10 @@
 		}
 		param_name_tokenp = strtok(line_bufferp, token_separator_listp);
 		if ((param_name_tokenp != NULL) &&
-			(strcmp(param_namep, param_name_tokenp) == 0)) {
+		    (strcmp(param_namep, param_name_tokenp) == 0)) {
 
 			param_value_tokenp = strtok(NULL,
-						token_separator_listp);
+			    token_separator_listp);
 		}
 		if (param_value_tokenp != NULL) {
 			param_valuep = strdup(param_value_tokenp);
@@ -282,18 +301,12 @@
 		}
 	}
 	if ((param_valuep == NULL) && (warn_if_not_found == B_TRUE)) {
-		(void) fprintf(stderr,
-			dgettext(TEXT_DOMAIN,
-			    "%s: value of %s not set or error in %s\n"),
-			function_namep,
-			param_namep,
-			param_file_namep);
-		syslog(LOG_ERR,
-			dgettext(TEXT_DOMAIN,
-			    "%s: value of %s not set or error in %s\n"),
-			function_namep,
-			param_namep,
-			param_file_namep);
+		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+		    "%s: value of %s not set or error in %s\n"),
+		    function_namep, param_namep, param_file_namep);
+		syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
+		    "%s: value of %s not set or error in %s\n"),
+		    function_namep, param_namep, param_file_namep);
 	}
 	free(line_bufferp);
 	(void) fclose(param_filep);
--- a/usr/src/lib/lvm/libmeta/common/meta_set.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_set.c	Wed Dec 24 08:23:40 2008 -0700
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Just in case we're not in a build environment, make sure that
  * TEXT_DOMAIN gets set to something.
@@ -1877,7 +1875,6 @@
 		return (NULL);
 	}
 
-
 	/*
 	 * Get the devid associated with the key.
 	 *
@@ -1893,6 +1890,11 @@
 		 */
 		dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
 		free(devidp);
+
+		/* dnp could be NULL if the devid could not be decoded. */
+		if (dnp == NULL) {
+			return (NULL);
+		}
 		dnp->side_names_key = key;
 	} else {
 		/*
@@ -1981,6 +1983,9 @@
 			 */
 			dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
 			free(devidp);
+			if (dnp == NULL) {
+				return (NULL);
+			}
 			dnp->side_names_key = key;
 		}
 	}
@@ -5733,6 +5738,7 @@
 		    lr->lr_msg.msg_type,
 		    lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG |
 		    MD_MSGF_OVERRIDE_SUSPEND,
+		    lr->lr_msg.msg_recipient,
 		    lr->lr_msg.msg_event_data,
 		    lr->lr_msg.msg_event_size,
 		    &resultp,
--- a/usr/src/lib/lvm/libmeta/common/meta_set_hst.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_set_hst.c	Wed Dec 24 08:23:40 2008 -0700
@@ -18,13 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Just in case we're not in a build environment, make sure that
  * TEXT_DOMAIN gets set to something.
@@ -148,7 +147,7 @@
 		send_rval = mdmn_send_message(sp->setno,
 		    MD_MN_MSG_META_MD_ADDSIDE,
 		    MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
-		    (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t),
+		    0, (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t),
 		    &resultp, ep);
 		if (send_rval != 0) {
 			(void) mdstealerror(ep, &(resultp->mmr_ep));
@@ -178,7 +177,7 @@
 			 * Let's see if it is hsp or not
 			 */
 			nm.devname = (uintptr_t)meta_getnmentbykey(sp->setno,
-				otherside, nm.key, &drvnm, NULL, NULL, ep);
+			    otherside, nm.key, &drvnm, NULL, NULL, ep);
 			if (nm.devname == NULL || drvnm == NULL) {
 				if (nm.devname)
 					Free((void *)(uintptr_t)nm.devname);
@@ -229,9 +228,9 @@
 			 * increment the count to sync up with the other sides.
 			 */
 			for (i = 0; i < nm.ref_count; i++) {
-			    if (add_name(sp, sideno, nm.key, dname, mnum,
-				cname, NULL, NULL, ep) == -1)
-				rval = -1;
+				if (add_name(sp, sideno, nm.key, dname, mnum,
+				    cname, NULL, NULL, ep) == -1)
+					rval = -1;
 			}
 
 			Free(cname);
@@ -323,17 +322,17 @@
 			(void) strcpy(nd->nd_nodename, node_v[i]);
 			nd->nd_ctime = now;
 			nd->nd_flags = (MD_MN_NODE_ALIVE |
-				MD_MN_NODE_ADD);
+			    MD_MN_NODE_ADD);
 			nl2 = nl;
 			while (nl2) {
-			    if (strcmp(nl2->msl_node_name,
-				node_v[i]) == 0) {
-				    nd->nd_nodeid = nl2->msl_node_id;
-				    (void) strcpy(nd->nd_priv_ic,
-					nl2->msl_node_addr);
-				    break;
-			    }
-			    nl2 = nl2->next;
+				if (strcmp(nl2->msl_node_name,
+				    node_v[i]) == 0) {
+					nd->nd_nodeid = nl2->msl_node_id;
+					(void) strcpy(nd->nd_priv_ic,
+					    nl2->msl_node_addr);
+					break;
+				}
+				nl2 = nl2->next;
 			}
 
 			/*
@@ -1123,7 +1122,7 @@
 		send_rval = mdmn_send_message(sp->setno,
 		    MD_MN_MSG_META_MD_DELSIDE,
 		    MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
-		    (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t),
+		    0, (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t),
 		    &resultp, ep);
 		if (send_rval != 0) {
 			(void) mdstealerror(ep, &(resultp->mmr_ep));
@@ -1156,8 +1155,8 @@
 			 * actually removed.
 			 */
 			for (i = 0; i < nm.ref_count; i++) {
-			    if (del_name(sp, sideno, nm.key, ep) == -1)
-				return (-1);
+				if (del_name(sp, sideno, nm.key, ep) == -1)
+					return (-1);
 			}
 		}
 	}
@@ -1183,7 +1182,7 @@
 				continue;
 			}
 			has_set = nodehasset(sp, nd->nd_nodename,
-				NHS_NST_EQ, &xep);
+			    NHS_NST_EQ, &xep);
 
 			if (has_set >= 0) {
 				nd = nd->nd_next;
@@ -1207,7 +1206,7 @@
 				continue;
 
 			has_set = nodehasset(sp, sd->sd_nodes[i],
-				NHS_NST_EQ, &xep);
+			    NHS_NST_EQ, &xep);
 
 			if (has_set >= 0)
 				continue;
@@ -1967,7 +1966,8 @@
 		return (-1);
 
 	/* find the end of the link list */
-	for (sn = dnp->side_names; sn->next != NULL; sn = sn->next);
+	for (sn = dnp->side_names; sn->next != NULL; sn = sn->next)
+		;
 	sn_next = &sn->next;
 
 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
@@ -1986,13 +1986,13 @@
 		 * used instead of meta_getnextside_devinfo.
 		 */
 		if (meta_getside_devinfo(sp, np->bname, sideno, &sn->cname,
-			&sn->dname, &sn->mnum, ep) == -1)
+		    &sn->dname, &sn->mnum, ep) == -1)
 			err = -1;
 	} else {
 		/* decrement sideno, to look like the previous sideno */
 		sideno--;
-		if (meta_getnextside_devinfo(sp, np->bname, &sideno, &sn->cname,
-			&sn->dname, &sn->mnum, ep) == -1)
+		if (meta_getnextside_devinfo(sp, np->bname, &sideno,
+		    &sn->cname, &sn->dname, &sn->mnum, ep) == -1)
 			err = -1;
 	}
 
@@ -2377,14 +2377,14 @@
 		nd->nd_ctime = now;
 		nl2 = nl;
 		while (nl2) {
-		    if (strcmp(nl2->msl_node_name,
-			node_v[nodeindex]) == 0) {
-			    nd->nd_nodeid = nl2->msl_node_id;
-			    (void) strcpy(nd->nd_priv_ic,
-				nl2->msl_node_addr);
-			    break;
-		    }
-		    nl2 = nl2->next;
+			if (strcmp(nl2->msl_node_name,
+			    node_v[nodeindex]) == 0) {
+				nd->nd_nodeid = nl2->msl_node_id;
+				(void) strcpy(nd->nd_priv_ic,
+				    nl2->msl_node_addr);
+				break;
+			}
+			nl2 = nl2->next;
 		}
 
 		/*
@@ -2773,16 +2773,16 @@
 		 * rpc.mdcommd is running on the nodes with a set.
 		 */
 		if (remote_sets_created == 1) {
-		    for (i = 0; i < node_c; i++) {
-			if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
-			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
-				if (rval == 0)
-					(void) mdstealerror(ep, &xep);
-				rval = -1;
-				mde_perror(ep, dgettext(TEXT_DOMAIN,
-				    "Unable to reinit rpc.mdcommd.\n"));
-			}
-		    }
+			for (i = 0; i < node_c; i++) {
+				if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
+				    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
+					if (rval == 0)
+						(void) mdstealerror(ep, &xep);
+					rval = -1;
+					mde_perror(ep, dgettext(TEXT_DOMAIN,
+					    "Unable to reinit rpc.mdcommd.\n"));
+				}
+			}
 		}
 	}
 	if ((suspend1_flag) || (suspendall_flag)) {
@@ -2819,17 +2819,18 @@
 		 * rpc.mdcommd is be running on the nodes with a set.
 		 */
 		if (remote_sets_created == 1) {
-		    for (i = 0; i < node_c; i++) {
-			/* Already verified to be alive */
-			if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
-			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
-				if (rval == 0)
-					(void) mdstealerror(ep, &xep);
-				rval = -1;
-				mde_perror(ep, dgettext(TEXT_DOMAIN,
-				    "Unable to resume rpc.mdcommd.\n"));
-			}
-		    }
+			for (i = 0; i < node_c; i++) {
+				/* Already verified to be alive */
+				if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
+				    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS,
+				    &xep)) {
+					if (rval == 0)
+						(void) mdstealerror(ep, &xep);
+					rval = -1;
+					mde_perror(ep, dgettext(TEXT_DOMAIN,
+					    "Unable to resume rpc.mdcommd.\n"));
+				}
+			}
 		}
 		meta_ping_mnset(sp->setno);
 		/*
@@ -4031,7 +4032,8 @@
 		rb_medr.med_rec_sn  = sp->setno;
 		(void) strcpy(rb_medr.med_rec_snm, sp->setname);
 		for (i = 0; i < MD_MAXSIDES; i++)
-		    (void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]);
+			(void) strcpy(rb_medr.med_rec_nodes[i],
+			    sd->sd_nodes[i]);
 		rb_medr.med_rec_meds = sd->sd_med;  /* structure assigment */
 		(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
 		rb_medr.med_rec_foff = 0;
@@ -4432,45 +4434,52 @@
 				 * alive nodes are updated correctly.
 				 */
 				if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
-				    if ((oha == TRUE) &&
-					(!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
+					if ((oha == TRUE) && (!(nd->nd_flags &
+					    MD_MN_NODE_ALIVE))) {
 						nd->nd_flags |= MD_MN_NODE_DEL;
 						nd->nd_flags &= ~MD_MN_NODE_OK;
 						nd = nd->nd_next;
 						continue;
-				    }
-				    if (nd->nd_flags & MD_MN_NODE_OWN) {
-					/*
-					 * Going to set locally cached node
-					 * flags to rollback join so in case
-					 * of error, the rollback code knows
-					 * which nodes to re-join.
-					 * rpc.metad ignores the RB_JOIN flag.
-					 */
-					nd->nd_flags |= MD_MN_NODE_RB_JOIN;
-					nd->nd_flags &= ~MD_MN_NODE_OWN;
-
-					/*
-					 * Be careful in ordering of following
-					 * steps so that recovery from a panic
-					 * between the steps is viable.
-					 * Only reset master info in rpc.metad
-					 * - don't reset local cached info
-					 * which will be used to set master
-					 * info back if failure (rollback).
-					 */
-					if (clnt_withdrawset(nd->nd_nodename,
-					    sp, ep))
-						goto rollback;
-
-					/* Reset master on deleted node */
-					if (clnt_mnsetmaster(node_v[i], sp, "",
-					    MD_MN_INVALID_NID, ep))
-						goto rollback;
-				    }
-
-				    nd->nd_flags |= MD_MN_NODE_DEL;
-				    nd->nd_flags &= ~MD_MN_NODE_OK;
+					}
+					if (nd->nd_flags & MD_MN_NODE_OWN) {
+						/*
+						 * Going to set locally cached
+						 * node flags to rollback join
+						 * so in case of error, the
+						 * rollback code knows which
+						 * nodes to re-join.  rpc.metad
+						 * ignores the RB_JOIN flag.
+						 */
+						nd->nd_flags |=
+						    MD_MN_NODE_RB_JOIN;
+						nd->nd_flags &= ~MD_MN_NODE_OWN;
+
+						/*
+						 * Be careful in ordering of
+						 * following steps so that
+						 * recovery from a panic
+						 * between the steps is viable.
+						 * Only reset master info in
+						 * rpc.metad - don't reset
+						 * local cached info which will
+						 * be used to set master info
+						 * back if failure (rollback).
+						 */
+						if (clnt_withdrawset(
+						    nd->nd_nodename, sp, ep))
+							goto rollback;
+
+						/*
+						 * Reset master on deleted node
+						 */
+						if (clnt_mnsetmaster(node_v[i],
+						    sp, "", MD_MN_INVALID_NID,
+						    ep))
+							goto rollback;
+					}
+
+					nd->nd_flags |= MD_MN_NODE_DEL;
+					nd->nd_flags &= ~MD_MN_NODE_OK;
 				}
 				nd = nd->nd_next;
 			}
@@ -4503,37 +4512,37 @@
 			/* Send reinit */
 			nd = sd->sd_nodelist;
 			while (nd) {
-			    if ((oha == TRUE) &&
-				(!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
-				    nd = nd->nd_next;
-				    continue;
-			    }
-			    /* Class is ignored for REINIT */
-			    if (clnt_mdcommdctl(nd->nd_nodename,
-				COMMDCTL_REINIT,
-				sp, NULL, MD_MSCF_NO_FLAGS, ep)) {
-				    mde_perror(ep, dgettext(TEXT_DOMAIN,
-					"Unable to reinit rpc.mdcommd.\n"));
-				    goto rollback;
-			    }
-			    nd = nd->nd_next;
+				if ((oha == TRUE) &&
+				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
+					nd = nd->nd_next;
+					continue;
+				}
+				/* Class is ignored for REINIT */
+				if (clnt_mdcommdctl(nd->nd_nodename,
+				    COMMDCTL_REINIT, sp, NULL,
+				    MD_MSCF_NO_FLAGS, ep)) {
+					mde_perror(ep, dgettext(TEXT_DOMAIN,
+					    "Unable to reinit rpc.mdcommd.\n"));
+					goto rollback;
+				}
+				nd = nd->nd_next;
 			}
 			/* Send resume */
 			nd = sd->sd_nodelist;
 			while (nd) {
-			    if ((oha == TRUE) &&
-				(!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
-				    nd = nd->nd_next;
-				    continue;
-			    }
-			    if (clnt_mdcommdctl(nd->nd_nodename,
-				COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
-				MD_MSCF_DONT_RESUME_CLASS1, ep)) {
-				    mde_perror(ep, dgettext(TEXT_DOMAIN,
-					"Unable to resume rpc.mdcommd.\n"));
-				    goto rollback;
-			    }
-			    nd = nd->nd_next;
+				if ((oha == TRUE) &&
+				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
+					nd = nd->nd_next;
+					continue;
+				}
+				if (clnt_mdcommdctl(nd->nd_nodename,
+				    COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
+				    MD_MSCF_DONT_RESUME_CLASS1, ep)) {
+					mde_perror(ep, dgettext(TEXT_DOMAIN,
+					    "Unable to resume rpc.mdcommd.\n"));
+					goto rollback;
+				}
+				nd = nd->nd_next;
 			}
 			meta_ping_mnset(sp->setno);
 		}
@@ -4727,50 +4736,52 @@
 				RB_TEST(24, "deletehosts", ep)
 			}
 		} else {
-		    nd = sd->sd_nodelist;
-		    /* All nodes guaranteed to be ALIVE unless in oha mode */
-		    while (nd) {
-			/*
-			 * If mirror owner was set to a deleted node, then
-			 * each existing node resets mirror owner to NULL.
-			 *
-			 * During OHA mode, don't issue RPCs to
-			 * non-alive nodes since there is no reason to
-			 * wait for RPC timeouts.
-			 */
-			if ((oha == TRUE) &&
-			    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
-				nd = nd->nd_next;
-				continue;
-			}
-
-			/* Skip nodes being deleted */
-			if (strinlst(nd->nd_nodename, node_c, node_v)) {
+			nd = sd->sd_nodelist;
+			/* All nodes guaranteed ALIVE unless in oha mode */
+			while (nd) {
+				/*
+				 * If mirror owner was set to a deleted node,
+				 * then each existing node resets mirror owner
+				 * to NULL.
+				 *
+				 * During OHA mode, don't issue RPCs to
+				 * non-alive nodes since there is no reason to
+				 * wait for RPC timeouts.
+				 */
+				if ((oha == TRUE) &&
+				    (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
+					nd = nd->nd_next;
+					continue;
+				}
+
+				/* Skip nodes being deleted */
+				if (strinlst(nd->nd_nodename, node_c, node_v)) {
+					nd = nd->nd_next;
+					continue;
+				}
+
+				/*
+				 * If mirror owner is a deleted node, reset
+				 * mirror owners to NULL.  If an error occurs,
+				 * print a warning and continue.  Don't fail
+				 * metaset because of mirror owner reset
+				 * problem since next node to grab mirror
+				 * will resolve this issue.  Before next node
+				 * grabs mirrors, metaset will show the deleted
+				 * node as owner which is why an attempt to
+				 * reset the mirror owner is made.
+				 */
+				if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
+				    node_c, &node_id_list[0], &xep) == -1) {
+					mde_perror(&xep, dgettext(TEXT_DOMAIN,
+					    "Unable to reset mirror owner on"
+					    " node %s\n"), nd->nd_nodename);
+					mdclrerror(&xep);
+				}
+
+				RB_TEST(21, "deletehosts", ep)
 				nd = nd->nd_next;
-				continue;
-			}
-
-			/*
-			 * If mirror owner is a deleted node, reset mirror
-			 * owners to NULL.  If an error occurs, print a
-			 * warning and continue.  Don't fail metaset
-			 * because of mirror owner reset problem since next
-			 * node to grab mirror will resolve this issue.
-			 * Before next node grabs mirrors, metaset will show
-			 * the deleted node as owner which is why an attempt
-			 * to reset the mirror owner is made.
-			 */
-			if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
-			    node_c, &node_id_list[0], &xep) == -1) {
-				mde_perror(&xep, dgettext(TEXT_DOMAIN,
-				    "Unable to reset mirror owner on"
-				    " node %s\n"), nd->nd_nodename);
-				mdclrerror(&xep);
-			}
-
-			RB_TEST(21, "deletehosts", ep)
-			nd = nd->nd_next;
-		    }
+			}
 		}
 	}
 
@@ -4790,10 +4801,10 @@
 		for (i = 0; i < MD_MAXSIDES; i++) {
 			if (strinlst(sd->sd_nodes[i], node_c, node_v))
 				(void) memset(&medr.med_rec_nodes[i],
-					'\0', sizeof (md_node_nm_t));
+				    '\0', sizeof (md_node_nm_t));
 			else
 				(void) strcpy(medr.med_rec_nodes[i],
-					sd->sd_nodes[i]);
+				    sd->sd_nodes[i]);
 		}
 		crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
 
@@ -5636,79 +5647,85 @@
 
 	/* Lock the set on our side */
 	if (clnt_lock_set(hostname, sp, ep)) {
-	    rval = -1;
-	    goto out;
+		rval = -1;
+		goto out;
 	}
 
 	if (take_val) {
-	    /* enable auto_take but only if it is not already set */
-	    if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) {
-		/* verify that we're the only host in the set */
-		for (i = 0; i < MD_MAXSIDES; i++) {
-		    if (sd->sd_nodes[i] == NULL || sd->sd_nodes[i][0] == '\0')
-			continue;
-
-		    if (strcmp(sd->sd_nodes[i], hostname) != 0) {
-			(void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL,
-			    NULL, sp->setname);
-			rval = -1;
-			goto out;
-		    }
+		/* enable auto_take but only if it is not already set */
+		if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) {
+			/* verify that we're the only host in the set */
+			for (i = 0; i < MD_MAXSIDES; i++) {
+				if (sd->sd_nodes[i] == NULL ||
+				    sd->sd_nodes[i][0] == '\0')
+					continue;
+
+				if (strcmp(sd->sd_nodes[i], hostname) != 0) {
+					(void) mddserror(ep, MDE_DS_SINGLEHOST,
+					    sp->setno, NULL, NULL, sp->setname);
+					rval = -1;
+					goto out;
+				}
+			}
+
+			if (clnt_enable_sr_flags(hostname, sp,
+			    MD_SR_AUTO_TAKE, ep))
+				rval = -1;
+
+			/* Disable SCSI reservations */
+			if (sd->sd_flags & MD_SR_MB_DEVID)
+				dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
+				    PRINT_FAST, &xep);
+			else
+				dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
+				    &xep);
+
+			if (! mdisok(&xep))
+				mdclrerror(&xep);
+
+			if (dd != NULL) {
+				if (rel_own_bydd(sp, dd, TRUE, &xep))
+					mdclrerror(&xep);
+			}
 		}
 
-		if (clnt_enable_sr_flags(hostname, sp, MD_SR_AUTO_TAKE, ep))
-		    rval = -1;
-
-		/* Disable SCSI reservations */
-		if (sd->sd_flags & MD_SR_MB_DEVID)
-		    dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST,
-			&xep);
-		else
-		    dd = metaget_drivedesc(sp, MD_BASICNAME_OK, &xep);
-		if (! mdisok(&xep))
-		    mdclrerror(&xep);
-
-		if (dd != NULL) {
-		    if (rel_own_bydd(sp, dd, TRUE, &xep))
-			mdclrerror(&xep);
-		}
-	    }
-
 	} else {
-	    /* disable auto_take, if set, or error */
-	    if (sd->sd_flags & MD_SR_AUTO_TAKE) {
-		if (clnt_disable_sr_flags(hostname, sp, MD_SR_AUTO_TAKE, ep))
-		    rval = -1;
-
-		/* Enable SCSI reservations */
-		if (sd->sd_flags & MD_SR_MB_DEVID)
-		    dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST,
-			&xep);
-		else
-		    dd = metaget_drivedesc(sp, MD_BASICNAME_OK, &xep);
-		if (! mdisok(&xep))
-		    mdclrerror(&xep);
-
-		if (dd != NULL) {
-		    mhd_mhiargs_t	mhiargs = defmhiargs;
-
-		    if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
-			mdclrerror(&xep);
+		/* disable auto_take, if set, or error */
+		if (sd->sd_flags & MD_SR_AUTO_TAKE) {
+			if (clnt_disable_sr_flags(hostname, sp,
+			    MD_SR_AUTO_TAKE, ep))
+				rval = -1;
+
+			/* Enable SCSI reservations */
+			if (sd->sd_flags & MD_SR_MB_DEVID)
+				dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
+				    PRINT_FAST, &xep);
+			else
+				dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
+				    &xep);
+
+			if (! mdisok(&xep))
+				mdclrerror(&xep);
+
+			if (dd != NULL) {
+				mhd_mhiargs_t	mhiargs = defmhiargs;
+
+				if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
+					mdclrerror(&xep);
+			}
+		} else {
+			(void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno,
+			    NULL, NULL, sp->setname);
+			rval = -1;
 		}
-
-	    } else {
-		(void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno, NULL, NULL,
-		    sp->setname);
-		rval = -1;
-	    }
 	}
 
 out:
 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
 	if (clnt_unlock_set(hostname, cl_sk, &xep)) {
-	    if (rval == 0)
-		(void) mdstealerror(ep, &xep);
-	    rval = -1;
+		if (rval == 0)
+			(void) mdstealerror(ep, &xep);
+		rval = -1;
 	}
 	cl_set_setkey(NULL);
 
--- a/usr/src/lib/lvm/libmeta/common/meta_sp.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/lib/lvm/libmeta/common/meta_sp.c	Wed Dec 24 08:23:40 2008 -0700
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -1895,7 +1896,7 @@
 				    wm.wm_mdname);
 				result = mdmn_send_message(sp->setno,
 				    MD_MN_MSG_ADDMDNAME,
-				    MD_MSGF_PANIC_WHEN_INCONSISTENT,
+				    MD_MSGF_PANIC_WHEN_INCONSISTENT, 0,
 				    (char *)send_params, message_size, &resp,
 				    ep);
 				Free(send_params);
@@ -2384,10 +2385,11 @@
 }
 
 /*
- * FUNCTION:	meta_sp_update_wm()
+ * FUNCTION:	meta_sp_update_wm_common()
  * INPUT:	sp	- the operating set
  *		msp	- a pointer to the XDR unit structure
  *		extlist	- the extent list specifying watermarks to update
+ *		iocval	- either MD_IOC_SPUPDATEWM or MD_MN_IOC_SPUPDATEWM
  * OUTPUT:	ep	- return error pointer
  * RETURNS:	int	- -1 if error, 0 on success
  * PURPOSE:	steps backwards through the extent list updating
@@ -2401,10 +2403,11 @@
  *		are realized.
  */
 static int
-meta_sp_update_wm(
+meta_sp_update_wm_common(
 	mdsetname_t	*sp,
 	md_sp_t		*msp,
 	sp_ext_node_t	*extlist,
+	int		iocval,
 	md_error_t	*ep
 )
 {
@@ -2493,8 +2496,8 @@
 	MD_SETDRIVERNAME(&update_params, MD_SP,
 	    MD_MIN2SET(update_params.mnum));
 
-	if (metaioctl(MD_IOC_SPUPDATEWM, &update_params,
-	    &update_params.mde, msp->common.namep->cname) != 0) {
+	if (metaioctl(iocval, &update_params, &update_params.mde,
+	    msp->common.namep->cname) != 0) {
 		(void) mdstealerror(ep, &update_params.mde);
 		rval = -1;
 		goto out;
@@ -2507,6 +2510,30 @@
 	return (rval);
 }
 
+static int
+meta_sp_update_wm(
+	mdsetname_t	*sp,
+	md_sp_t		*msp,
+	sp_ext_node_t	*extlist,
+	md_error_t	*ep
+)
+{
+	return (meta_sp_update_wm_common(sp, msp, extlist, MD_IOC_SPUPDATEWM,
+	    ep));
+}
+
+static int
+meta_mn_sp_update_wm(
+	mdsetname_t	*sp,
+	md_sp_t		*msp,
+	sp_ext_node_t	*extlist,
+	md_error_t	*ep
+)
+{
+	return (meta_sp_update_wm_common(sp, msp, extlist, MD_MN_IOC_SPUPDATEWM,
+	    ep));
+}
+
 /*
  * FUNCTION:	meta_sp_clear_wm()
  * INPUT:	sp	- the operating set
@@ -4227,9 +4254,9 @@
 	int		committed = 0;
 	int		repart_options = MD_REPART_FORCE;
 	int		create_flag = MD_CRO_32BIT;
+	int		mn_set_master = 0;
 
 	md_set_desc	*sd;
-	mm_unit_t	*mm;
 	md_set_mmown_params_t	*ownpar = NULL;
 	int		comp_is_mirror = 0;
 
@@ -4417,19 +4444,7 @@
 			goto out;
 		}
 		if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
-			mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
-			if (mm == NULL) {
-				rval = -1;
-				goto out;
-			} else {
-				rval = meta_mn_change_owner(&ownpar, sp->setno,
-				    meta_getminor(compnp->dev),
-				    sd->sd_mn_mynode->nd_nodeid,
-				    MD_MN_MM_PREVENT_CHANGE |
-				    MD_MN_MM_SPAWN_THREAD);
-				if (rval == -1)
-					goto out;
-			}
+			mn_set_master = 1;
 		}
 	}
 
@@ -4450,22 +4465,22 @@
 	committed = 1;
 
 	/* write watermarks */
-	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
-		rval = -1;
-		goto out;
-	}
-
 	/*
-	 * Allow mirror ownership to change. If we don't succeed in this
-	 * ioctl it isn't fatal, but the cluster will probably hang fairly
-	 * soon as the mirror owner won't change. However, we have
-	 * successfully written the watermarks out to the device so the
-	 * softpart creation has succeeded
+	 * Special-case for Multi-node sets. As we now have a distributed DRL
+	 * update mechanism, we _will_ hit the ioctl-within-ioctl deadlock case
+	 * unless we use a 'special' MN-capable ioctl to stage the watermark
+	 * update. This only affects the master-node in an MN set.
 	 */
-	if (ownpar) {
-		(void) meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum,
-		    ownpar->d.owner,
-		    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
+	if (mn_set_master) {
+		if (meta_mn_sp_update_wm(sp, msp, extlist, ep) < 0) {
+			rval = -1;
+			goto out;
+		}
+	} else {
+		if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
+			rval = -1;
+			goto out;
+		}
 	}
 
 	/* second phase of commit, set status to MD_SP_OK */
@@ -5838,7 +5853,7 @@
 			sp_setstat_params.sp_setstat_status = status;
 
 			result = mdmn_send_message(sp->setno,
-			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS,
+			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS, 0,
 			    (char *)&sp_setstat_params,
 			    sizeof (sp_setstat_params),
 			    &resp, ep);
@@ -6022,7 +6037,7 @@
 				    compnp->cname);
 				result = mdmn_send_message(sp->setno,
 				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
-				    (char *)send_params, message_size, &resp,
+				    0, (char *)send_params, message_size, &resp,
 				    ep);
 				Free(send_params);
 				if (resp != NULL) {
@@ -6154,7 +6169,7 @@
 			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
 			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
 			result = mdmn_send_message(sp->setno,
-			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS,
+			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS, 0,
 			    (char *)&send_params, mess_size, &resp,
 			    ep);
 			if (resp != NULL) {
@@ -6303,7 +6318,8 @@
 				send_params.delkeyname_key = np->key;
 				(void) mdmn_send_message(sp->setno,
 				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
-				    (char *)&send_params, sizeof (send_params),
+				    0, (char *)&send_params,
+				    sizeof (send_params),
 				    &resp, ep);
 				if (resp != NULL) {
 					free_result(resp);
--- a/usr/src/uts/common/io/lvm/md/md.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/io/lvm/md/md.c	Wed Dec 24 08:23:40 2008 -0700
@@ -1858,6 +1858,9 @@
 	case MD_MN_RESYNC:
 	case MD_MN_SETSYNC:
 	case MD_MN_POKE_HOTSPARES:
+	case MD_MN_RR_DIRTY:
+	case MD_MN_RR_CLEAN:
+	case MD_MN_IOC_SPUPDATEWM:
 		return (1);
 	default:
 		return (0);
--- a/usr/src/uts/common/io/lvm/md/md_ioctl.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/io/lvm/md/md_ioctl.c	Wed Dec 24 08:23:40 2008 -0700
@@ -80,40 +80,80 @@
 extern int		med_set_t_ioctl(mddb_med_t_parm_t *tpp, int mode);
 extern unit_t		md_get_nextunit(set_t setno);
 
-static int		md_mn_commd_present;
-
 /* md_mddb.c */
 extern mddb_set_t	*mddb_setenter(set_t setno, int flag, int *errorcodep);
 extern void		mddb_setexit(mddb_set_t *s);
 extern md_krwlock_t	nm_lock;
 
+#define	MD_MN_COMMD_CMD "rpc.mdcommd"
+static pid_t		md_mn_commd_pid;
+
 /*
  * md_mn_is_commd_present:
  * ----------------------
  * Determine if commd is running on this node.
  *
- * Returns:
- *	1	if commd has been started
- *	0	if commd has not been started or has exited
+ * If md_mn_commd_pid is 0, trust it.  Otherwise, do some in-depth checking
+ * to make sure it's still the one we originally set up, by checking that
+ * the u_comm of the process with that PID holds the right program name.
+ *
+ * This one's intended for the "something went awry" cases, and not for
+ * general use, due to its higher cost for the good/normal case.
  */
 int
 md_mn_is_commd_present(void)
 {
-	return (md_mn_commd_present ? 1 : 0);
+	proc_t  *commd_procp;
+
+	if (md_mn_commd_pid == (pid_t)0) {
+		return (0);
+	}
+
+	/* some in-depth checking */
+	mutex_enter(&pidlock);
+	if ((commd_procp = prfind(md_mn_commd_pid)) != NULL &&
+	    strncmp(commd_procp->p_user.u_comm,
+	    MD_MN_COMMD_CMD, strlen(MD_MN_COMMD_CMD)) == 0) {
+		mutex_exit(&pidlock);
+		/*
+		 * returns a little more info than asked for, but it will
+		 * never be PID 0 when valid.
+		 */
+		return ((int)md_mn_commd_pid);
+	}
+	/* if it's not there, make sure we only do these contortions once */
+	md_mn_commd_pid = (pid_t)0;
+	mutex_exit(&pidlock);
+
+	cmn_err(CE_WARN, "!rpc.mdcommd exited abnormally");
+	return (0);
+}
+
+/*
+ * This version merely checks the PID value that was set via an ioctl.
+ * It's intended to be used in the main code flow, where performance is
+ * critical, and accuracy can be sacrificed a little.  If something is
+ * already known to be wrong, don't use this, but use
+ * md_mn_is_commd_present() instead.
+ */
+int
+md_mn_is_commd_present_lite(void)
+{
+	return ((int)md_mn_commd_pid);
 }
 
 /*
  * md_mn_clear_commd_present:
  * -------------------------
- * Clear the commd_present flag. Called only from a CPR request to suspend /
- * terminate a resync thread. We clear the md_mn_commd_present flag so that
+ * Clear the md_mn_commd_pid. Called only from a CPR request to suspend /
+ * terminate a resync thread. We clear the md_mn_commd_pid so that
  * any RPC request that was in transit can complete with a failure and _not_
  * result in an unexpected system panic.
  */
 void
 md_mn_clear_commd_present()
 {
-	md_mn_commd_present = 0;
+	md_mn_commd_pid = (pid_t)0;
 }
 
 /*
@@ -855,7 +895,6 @@
 		return (mderror(mdep, MDE_UNIT_NOT_FOUND));
 	}
 
-	rw_enter(&md_ops[modindex]->md_link_rw.lock, RW_READER);
 	/* if array length is not 0 then allocate the output buffers */
 	if (minor_array_length != 0) {
 		sz = minor_array_length * ((int)sizeof (minor_t));
@@ -863,6 +902,7 @@
 		m_ptr = minors;
 	}
 
+	rw_enter(&md_ops[modindex]->md_link_rw.lock, RW_READER);
 	next = md_ops[modindex]->md_head;
 	count = 0;
 	while (next) {
@@ -2976,6 +3016,7 @@
 			    setno,
 			    MD_MN_MSG_TEST1,
 			    flags,
+			    0,
 			    (char *)&msg_test,
 			    sizeof (msg_test),
 			    result);
@@ -3019,6 +3060,7 @@
 			    setno,
 			    MD_MN_MSG_TEST2,
 			    flags,
+			    0,
 			    (char *)&msg_test,
 			    sizeof (msg_test),
 			    result);
@@ -3408,7 +3450,7 @@
 	}
 
 	/*
-	 * Update md_mn_commd_present global to reflect presence or absence of
+	 * Update md_mn_commd_pid global to reflect presence or absence of
 	 * /usr/sbin/rpc.mdcommd. This allows us to determine if an RPC failure
 	 * is expected during a mdmn_ksend_message() handshake. If the commd is
 	 * not present then an RPC failure is acceptable. If the commd _is_
@@ -3420,7 +3462,7 @@
 		if (! (mode & FWRITE))
 			return (EACCES);
 
-		md_mn_commd_present = (int)(intptr_t)data;
+		md_mn_commd_pid = (pid_t)(intptr_t)data;
 		err = 0;
 		break;
 	}
--- a/usr/src/uts/common/io/lvm/md/md_mddb.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/io/lvm/md/md_mddb.c	Wed Dec 24 08:23:40 2008 -0700
@@ -18,13 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/conf.h>
 #include <sys/time.h>
@@ -643,7 +642,7 @@
 
 		freeblks = 0;
 		for (mbip = s->s_mbiarray[i]; mbip != NULL;
-					mbip = mbip->mbi_next) {
+		    mbip = mbip->mbi_next) {
 			freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
 		}
 		if (freeblks == 0)	/* this happen when there is no */
@@ -798,7 +797,7 @@
 		if ((did_freep1->free_blk == firstblk) &&
 		    (did_freep1->free_offset <= offset) &&
 		    ((did_freep1->free_length + did_freep1->free_offset) >=
-			(length + offset))) {
+		    (length + offset))) {
 			/* Have found our entry - remove from list */
 			block_found = 1;
 			did_freep_before = did_freep1;
@@ -816,17 +815,17 @@
 			 * offset, length.
 			 */
 			did_freep_before->free_length = offset -
-				did_freep_before->free_offset;
+			    did_freep_before->free_offset;
 			/*
 			 * did_freep_after points to area in block after
 			 * offset, length.
 			 */
 			did_freep_after = (mddb_did_free_t *)kmem_zalloc
-					(sizeof (mddb_did_free_t), KM_SLEEP);
+			    (sizeof (mddb_did_free_t), KM_SLEEP);
 			did_freep_after->free_blk = did_freep_before->free_blk;
 			did_freep_after->free_offset = offset + length;
 			did_freep_after->free_length = old_length - length -
-				did_freep_before->free_length;
+			    did_freep_before->free_length;
 			/*
 			 * Add before and after areas to free list
 			 * If area before or after offset, length has length
@@ -835,28 +834,30 @@
 			if (did_freep_after->free_length) {
 				did_freep_after->free_next = did_freep1;
 				if (did_freep2) {
-				    did_freep2->free_next = did_freep_after;
+					did_freep2->free_next =
+					    did_freep_after;
 				} else {
-				    s->s_did_icp->did_ic_freep =
-					did_freep_after;
+					s->s_did_icp->did_ic_freep =
+					    did_freep_after;
 				}
 				did_freep1 = did_freep_after;
 			} else {
 				kmem_free(did_freep_after,
-					sizeof (mddb_did_free_t));
+				    sizeof (mddb_did_free_t));
 			}
 
 			if (did_freep_before->free_length) {
 				did_freep_before->free_next = did_freep1;
 				if (did_freep2) {
-				    did_freep2->free_next = did_freep_before;
+					did_freep2->free_next =
+					    did_freep_before;
 				} else {
-				    s->s_did_icp->did_ic_freep =
-					did_freep_before;
+					s->s_did_icp->did_ic_freep =
+					    did_freep_before;
 				}
 			} else {
 				kmem_free(did_freep_before,
-					sizeof (mddb_did_free_t));
+				    sizeof (mddb_did_free_t));
 			}
 			break;
 		} else {
@@ -934,10 +935,10 @@
 			if (freep->free_length == 0) {
 				if (freep2) {
 					freep2->free_next =
-					freep->free_next;
+					    freep->free_next;
 				} else {
 					s->s_did_icp->did_ic_freep =
-					freep->free_next;
+					    freep->free_next;
 				}
 				kmem_free(freep, sizeof (mddb_did_free_t));
 			}
@@ -971,7 +972,7 @@
 
 		/* Add unused part of block to free list */
 		(void) mddb_devid_free_add(s, blk_num,
-			len, (dbtob(blk_cnt) - len));
+		    len, (dbtob(blk_cnt) - len));
 	}
 
 	return ((caddr_t)devid_ptr);
@@ -1015,9 +1016,9 @@
 		return (0);
 
 	devid_len = ddi_devid_sizeof(devid);
-	devid_ptr = (ddi_devid_t)
-			mddb_devid_free_get(s, devid_len, &blk, &blkcnt,
-				&offset);
+	devid_ptr = (ddi_devid_t)mddb_devid_free_get(s,
+	    devid_len, &blk, &blkcnt, &offset);
+
 	if (devid_ptr == NULL) {
 		return (1);
 	}
@@ -1090,7 +1091,7 @@
 
 	/* Add new free space in disk block to free list */
 	(void) mddb_devid_free_add(s, did_info->info_firstblk,
-		did_info->info_offset, did_info->info_length);
+	    did_info->info_offset, did_info->info_length);
 
 	return (0);
 }
@@ -1439,7 +1440,7 @@
 		for (i = 0; i < cnt; i++)
 			blkarray[i] = blk + i;
 		ret = wrtblklst(s, buffer, blkarray, cnt,
-			li, 0, MDDB_WR_ONLY_MASTER);
+		    li, 0, MDDB_WR_ONLY_MASTER);
 		kmem_free(blkarray, size);
 		return (ret);
 	}
@@ -1505,7 +1506,7 @@
 		did_blk = s->s_did_icp->did_ic_blkp;
 		did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
 		crcgen(did_blk, &did_blk->blk_checksum,
-			dbtob(lbp->lb_didblkcnt), NULL);
+		    dbtob(lbp->lb_didblkcnt), NULL);
 	}
 	crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
 
@@ -1521,20 +1522,20 @@
 			did_dbp = s->s_did_icp->did_ic_dbp;
 			while (did_dbp) {
 				err |= writeblks(s, (caddr_t)did_dbp->db_ptr,
-					did_dbp->db_firstblk,
-					did_dbp->db_blkcnt, li,
-					MDDB_WR_ONLY_MASTER);
+				    did_dbp->db_firstblk,
+				    did_dbp->db_blkcnt, li,
+				    MDDB_WR_ONLY_MASTER);
 				did_dbp = did_dbp->db_next;
 			}
 
 			/* write out device id area block */
 			err |= writeblks(s, (caddr_t)did_blk,
-				lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
-				MDDB_WR_ONLY_MASTER);
+			    lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
+			    MDDB_WR_ONLY_MASTER);
 		}
 		/* write out locator block */
 		err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
-			MDDB_WR_ONLY_MASTER);
+		    MDDB_WR_ONLY_MASTER);
 	}
 
 	/*
@@ -1715,7 +1716,7 @@
 	size_t		size;
 
 	size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) +
-		    sizeof (mddb_block_t) * dep->de_blkcount;
+	    sizeof (mddb_block_t) * dep->de_blkcount;
 	return (size);
 }
 
@@ -1727,7 +1728,7 @@
 	size_t		size;
 
 	size = sizeof (*dep) - sizeof (dep->de32_blks) +
-		    sizeof (mddb_block_t) * dep->de32_blkcount;
+	    sizeof (mddb_block_t) * dep->de32_blkcount;
 	return (size);
 }
 
@@ -1760,7 +1761,7 @@
 	if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
 		db32p->db32_firstentry = 0x4;
 	de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
-		+ sizeof (db32p->db32_firstentry)));
+	    + sizeof (db32p->db32_firstentry)));
 	for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
 		detode32(dep, de32p);
 		if ((dep->de_next != NULL) && (de32p->de32_next == 0))
@@ -2067,9 +2068,9 @@
 
 			dep = (mddb_de_ic_t *)
 			    kmem_zalloc(sizeof (mddb_de_ic_t) -
-				sizeof (mddb_block_t) +
-				sizeof (mddb_block_t) * de32p->de32_blkcount,
-				KM_SLEEP);
+			    sizeof (mddb_block_t) +
+			    sizeof (mddb_block_t) * de32p->de32_blkcount,
+			    KM_SLEEP);
 			de32tode(de32p, dep);
 
 			dbp->db_firstentry = dep;
@@ -2078,10 +2079,10 @@
 				de32p2 = nextentry(de32p);
 
 				dep2 = (mddb_de_ic_t *)kmem_zalloc(
-					sizeof (mddb_de_ic_t) -
-					sizeof (mddb_block_t) +
-					sizeof (mddb_block_t) *
-					de32p2->de32_blkcount, KM_SLEEP);
+				    sizeof (mddb_de_ic_t) -
+				    sizeof (mddb_block_t) +
+				    sizeof (mddb_block_t) *
+				    de32p2->de32_blkcount, KM_SLEEP);
 
 				de32tode(de32p2, dep2);
 
@@ -2277,10 +2278,9 @@
 			if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops)
 			    != NULL) {
 				error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
-					prop_op,
-					DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
-					"removable-media",
-					(caddr_t)&propvalue, &proplength);
+				    prop_op, DDI_PROP_NOTPROM |
+				    DDI_PROP_DONTPASS, "removable-media",
+				    (caddr_t)&propvalue, &proplength);
 
 				if (error == DDI_PROP_SUCCESS)
 					removable = 1;
@@ -2348,7 +2348,7 @@
 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
 	    (type >= MDDB_FIRST_MODID) &&
 	    ((rbp->rb_revision == MDDB_REV_RB) ||
-		(rbp->rb_revision == MDDB_REV_RBFN))) {
+	    (rbp->rb_revision == MDDB_REV_RBFN))) {
 
 		switch (dep->de_flags) {
 
@@ -2512,7 +2512,7 @@
 		 * In a MN diskset, any node can write optimized record(s).
 		 */
 		wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
-			dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
+		    dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
 		/*
 		 * For MN diskset, set error in optinfo structure so
 		 * that mddb_commitrec knows which replica failed.
@@ -2556,10 +2556,10 @@
 				lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
 				if (lp == bfp->bf_locator) {
 					dep->de_optinfo[0].o_flags |=
-						MDDB_F_EWRITE;
+					    MDDB_F_EWRITE;
 				} else {
 					dep->de_optinfo[1].o_flags |=
-						MDDB_F_EWRITE;
+					    MDDB_F_EWRITE;
 				}
 			}
 			err |= MDDB_F_EWRITE;
@@ -2689,7 +2689,7 @@
 	create_db32rec(db32p, dbp);
 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
 	err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
-		1, MDDB_WR_ONLY_MASTER);
+	    1, MDDB_WR_ONLY_MASTER);
 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
 	return (err);
 }
@@ -2932,13 +2932,13 @@
 
 	if (MD_UPGRADE) {
 		ldev = md_makedevice(md_targ_name_to_major(clp->l_driver),
-			clp->l_mnum);
+		    clp->l_mnum);
 	} else {
 		if (ddi_name_to_major(clp->l_driver) == (major_t)-1)
 			return (EINVAL);
 
 		ldev = md_makedevice(ddi_name_to_major(clp->l_driver),
-			clp->l_mnum);
+		    clp->l_mnum);
 	}
 
 	if (clp->l_devid != 0) {
@@ -3099,7 +3099,7 @@
 		create_db32rec(db32p, dbp);
 		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
 		err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li,
-			MDDB_WR_ONLY_MASTER);
+		    MDDB_WR_ONLY_MASTER);
 		kmem_free((caddr_t)db32p, MDDB_BSIZE);
 		if (err)
 			return (err);
@@ -3804,7 +3804,7 @@
 			lnp->ln_revision = MDDB_REV_LN;
 		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
 		err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
-			lbp->lb_lnblkcnt, 0);
+		    lbp->lb_lnblkcnt, 0);
 		/*
 		 * If a MN diskset and this is the master, set the PARSE_LOCNM
 		 * flag in the mddb_set structure to show that the locator
@@ -4413,28 +4413,34 @@
 	}
 
 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
-	    did_info = &(did_icp->did_ic_blkp->blk_info[li]);
-	    if (did_info->info_flags & MDDB_DID_EXISTS) {
-		sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
-		if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
-			/* copy device id from mddb to cfg_loc structure */
-			szalloc = clp->l_devid_sz;
-			if (sz <= szalloc) {
-				for (i = 0; i < sz; i++) {
-					((char *)(uintptr_t)clp->l_devid)[i] =
-					((char *)did_icp->did_ic_devid[li])[i];
+		did_info = &(did_icp->did_ic_blkp->blk_info[li]);
+		if (did_info->info_flags & MDDB_DID_EXISTS) {
+			sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
+			if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
+				/*
+				 * copy device id from mddb to
+				 * cfg_loc structure
+				 */
+				szalloc = clp->l_devid_sz;
+				if (sz <= szalloc) {
+					for (i = 0; i < sz; i++) {
+						((char *)(uintptr_t)
+						    clp->l_devid)[i] =
+						    ((char *)did_icp->
+						    did_ic_devid[li])[i];
+					}
+					clp->l_devid_flags |= MDDB_DEVID_VALID;
+					(void) strcpy(clp->l_minor_name,
+					    did_info->info_minor_name);
+				} else {
+					clp->l_devid_flags |=
+					    MDDB_DEVID_NOSPACE;
 				}
-				clp->l_devid_flags |= MDDB_DEVID_VALID;
-				(void) strcpy(clp->l_minor_name,
-					did_info->info_minor_name);
-			} else {
-				clp->l_devid_flags |= MDDB_DEVID_NOSPACE;
-			}
-		} else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
-			clp->l_devid_flags = MDDB_DEVID_SZ;
-			clp->l_devid_sz = sz;
-		}
-	    }
+			} else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
+				clp->l_devid_flags = MDDB_DEVID_SZ;
+				clp->l_devid_sz = sz;
+			}
+		}
 	}
 
 	/*
@@ -4770,8 +4776,7 @@
 	 * lb_blkcnt will be set correctly for MN set later once getmasters
 	 * has determined that the set is a MN set.
 	 */
-	lb_blkcnt = ((setno == MD_LOCAL_SET) ?
-			MDDB_LOCAL_LBCNT : MDDB_LBCNT);
+	lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT);
 
 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
 		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
@@ -4919,8 +4924,8 @@
 			/* Read in device ID block */
 			if (did_icp == NULL) {
 				did_icp = (mddb_did_ic_t *)
-					kmem_zalloc(sizeof (mddb_did_ic_t),
-					    KM_SLEEP);
+				    kmem_zalloc(sizeof (mddb_did_ic_t),
+				    KM_SLEEP);
 			} else {
 				/* Reuse did_icp, but clear out data */
 				if (did_icp->did_ic_blkp !=
@@ -4932,22 +4937,23 @@
 					    (mddb_did_blk_t *)NULL;
 				}
 				if (did_icp->did_ic_dbp !=
-					(mddb_did_db_t *)NULL) {
+				    (mddb_did_db_t *)NULL) {
 					did_dbp1 = did_icp->did_ic_dbp;
 					while (did_dbp1) {
-					    did_dbp2 = did_dbp1->db_next;
-					    kmem_free((caddr_t)did_dbp1->db_ptr,
-						dbtob(did_dbp1->db_blkcnt));
-					    kmem_free((caddr_t)did_dbp1,
-						sizeof (mddb_did_db_t));
-					    did_dbp1 = did_dbp2;
+						did_dbp2 = did_dbp1->db_next;
+						kmem_free((caddr_t)
+						    did_dbp1->db_ptr,
+						    dbtob(did_dbp1->db_blkcnt));
+						kmem_free((caddr_t)did_dbp1,
+						    sizeof (mddb_did_db_t));
+						did_dbp1 = did_dbp2;
 					}
 					did_icp->did_ic_dbp =
-						(mddb_did_db_t *)NULL;
+					    (mddb_did_db_t *)NULL;
 				}
 				for (i = 0; i < MDDB_NLB; i++) {
 					did_icp->did_ic_devid[i] =
-						(ddi_devid_t)NULL;
+					    (ddi_devid_t)NULL;
 				}
 			}
 
@@ -4985,7 +4991,7 @@
 			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
 				continue;
 			if (crcchk(did_blkp, &did_blkp->blk_checksum,
-				dbtob(lbp->lb_didblkcnt), NULL))
+			    dbtob(lbp->lb_didblkcnt), NULL))
 				continue;
 
 			/*
@@ -5037,82 +5043,106 @@
 			 * have been updated to match this valid device
 			 * id information.
 			 */
-		    for (li = 0; li < lbp->lb_loccnt; li++) {
-			did_info = &did_blkp->blk_info[li];
-			if (did_info->info_flags & MDDB_DID_EXISTS)
-				did_info->info_flags &=
-					~(MDDB_DID_VALID | MDDB_DID_UPDATED);
-		    }
-
-		    cont_flag = 0;
-		    for (li = 0; li < lbp->lb_loccnt; li++) {
-			did_info = &did_blkp->blk_info[li];
-			did_block = (caddr_t)NULL;
-			if (did_info->info_flags & MDDB_DID_EXISTS) {
-			    /* Check if block has already been read in */
-			    did_dbp = did_icp->did_ic_dbp;
-			    while (did_dbp != 0) {
-				if (did_dbp->db_firstblk ==
-				    did_info->info_firstblk)
-					break;
-				else
-					did_dbp = did_dbp->db_next;
-			    }
-			    /* if block not found, read it in */
-			    if (did_dbp == NULL) {
-				did_block = (caddr_t)(kmem_zalloc(dbtob
-					    (did_info->info_blkcnt), KM_SLEEP));
-				buffer = (caddr_t)did_block;
-				for (blk = did_info->info_firstblk;
-				    blk < (did_info->info_firstblk +
-				    did_info->info_blkcnt); blk++) {
-					physblk = getphysblk(blk, rip->ri_mbip);
-					err = getblks(s, buffer, dev, physblk,
-					    btodb(MDDB_BSIZE), 0);
-					if (err) {
-						rip->ri_flags |= err;
+			for (li = 0; li < lbp->lb_loccnt; li++) {
+				did_info = &did_blkp->blk_info[li];
+				if (did_info->info_flags & MDDB_DID_EXISTS)
+					did_info->info_flags &=
+					    ~(MDDB_DID_VALID |
+					    MDDB_DID_UPDATED);
+			}
+
+			cont_flag = 0;
+			for (li = 0; li < lbp->lb_loccnt; li++) {
+				did_info = &did_blkp->blk_info[li];
+				did_block = (caddr_t)NULL;
+				if (did_info->info_flags & MDDB_DID_EXISTS) {
+					/*
+					 * Check if block has
+					 * already been read in
+					 */
+					did_dbp = did_icp->did_ic_dbp;
+					while (did_dbp != 0) {
+						if (did_dbp->db_firstblk ==
+						    did_info->info_firstblk)
+							break;
+						else
+							did_dbp =
+							    did_dbp->db_next;
+					}
+					/* if block not found, read it in */
+					if (did_dbp == NULL) {
+						did_block = (caddr_t)
+						    (kmem_zalloc(dbtob(
+						    did_info->info_blkcnt),
+						    KM_SLEEP));
+						buffer = (caddr_t)did_block;
+						for (blk =
+						    did_info->info_firstblk;
+						    blk < (did_info->
+						    info_firstblk +
+						    did_info->info_blkcnt);
+						    blk++) {
+							physblk =
+							    getphysblk(blk,
+							    rip->ri_mbip);
+							err = getblks(s,
+							    buffer, dev,
+							    physblk, btodb(
+							    MDDB_BSIZE), 0);
+							if (err) {
+								rip->ri_flags |=
+								    err;
+								break;
+							}
+							buffer += MDDB_BSIZE;
+						}
+						if (err) {
+							kmem_free(did_block,
+							    dbtob(did_info->
+							    info_blkcnt));
+							did_block =
+							    (caddr_t)NULL;
+							cont_flag = 1;
+							break;
+						}
+
+						/*
+						 * Block read in -
+						 * alloc Disk Block area
+						 */
+						did_dbp = (mddb_did_db_t *)
+						    kmem_zalloc(
+						    sizeof (mddb_did_db_t),
+						    KM_SLEEP);
+						did_dbp->db_ptr = did_block;
+						did_dbp->db_firstblk =
+						    did_info->info_firstblk;
+						did_dbp->db_blkcnt =
+						    did_info->info_blkcnt;
+
+						/* Add to front of dbp list */
+						did_dbp->db_next =
+						    did_icp->did_ic_dbp;
+						did_icp->did_ic_dbp = did_dbp;
+					}
+					/* Check validity of devid in block */
+					if (crcchk(((char *)did_dbp->db_ptr +
+					    did_info->info_offset),
+					    &did_info->info_checksum,
+					    did_info->info_length, NULL)) {
+						cont_flag = 1;
 						break;
 					}
-					buffer += MDDB_BSIZE;
-				}
-				if (err) {
-				    kmem_free(did_block,
-					dbtob(did_info->info_blkcnt));
-					did_block = (caddr_t)NULL;
-				    cont_flag = 1;
-				    break;
+
+					/* Block now pointed to by did_dbp */
+					did_icp->did_ic_devid[li] =
+					    (ddi_devid_t)((char *)
+					    did_dbp->db_ptr +
+					    did_info->info_offset);
 				}
-
-				/*
-				 * Block read in - alloc Disk Block area
-				 */
-				did_dbp = (mddb_did_db_t *)kmem_zalloc(
-				    sizeof (mddb_did_db_t), KM_SLEEP);
-				did_dbp->db_ptr = did_block;
-				did_dbp->db_firstblk = did_info->info_firstblk;
-				did_dbp->db_blkcnt = did_info->info_blkcnt;
-
-				/* Add to front of dbp list */
-				did_dbp->db_next = did_icp->did_ic_dbp;
-				did_icp->did_ic_dbp = did_dbp;
-			    }
-			    /* Check validity of devid in block */
-			    if (crcchk(((char *)did_dbp->db_ptr +
-				did_info->info_offset),
-				&did_info->info_checksum,
-				did_info->info_length, NULL)) {
-				    cont_flag = 1;
-				    break;
-			    }
-
-			    /* Block now pointed to by did_dbp */
-			    did_icp->did_ic_devid[li] = (ddi_devid_t)
-				((char *)did_dbp->db_ptr +
-				did_info->info_offset);
-			}
-		    }
-		    if (cont_flag)
-			continue;
+			}
+			if (cont_flag)
+				continue;
 		}
 
 		/*
@@ -5194,11 +5224,11 @@
 				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
 					if (ddi_devid_compare(rip->ri_old_devid,
 					    did_icp->did_ic_devid[li]) != 0)
-					    continue;
+						continue;
 				} else {
 					if (ddi_devid_compare(rip->ri_devid,
 					    did_icp->did_ic_devid[li]) != 0)
-					    continue;
+						continue;
 				}
 
 				if (strcmp(rip->ri_minor_name,
@@ -5214,64 +5244,74 @@
 			 * information about itself.
 			 */
 			if (!mn_set) {
-			    for (li = 0; li < lbp->lb_loccnt; li++) {
-				mddb_drvnm_t		*dn;
-				mddb_sidelocator_t	*slp;
-
-				lp = &lbp->lb_locators[li];
-				slp = &lbp->lb_sidelocators[s->s_sideno][li];
-				if (lp->l_flags & MDDB_F_DELETED)
-					continue;
-				if (slp->l_mnum != md_getminor(rip->ri_dev))
-					continue;
-				if (lp->l_blkno != rip->ri_blkno)
-					continue;
-				dn = &lbp->lb_drvnm[slp->l_drvnm_index];
-				if (strncmp(dn->dn_data, rip->ri_driver,
-				    MD_MAXDRVNM) == 0)
-				break;
-			    }
+				for (li = 0; li < lbp->lb_loccnt; li++) {
+					mddb_drvnm_t		*dn;
+					mddb_sidelocator_t	*slp;
+
+					lp = &lbp->lb_locators[li];
+					slp = &lbp->
+					    lb_sidelocators[s->s_sideno][li];
+					if (lp->l_flags & MDDB_F_DELETED)
+						continue;
+					if (slp->l_mnum != md_getminor(
+					    rip->ri_dev))
+						continue;
+					if (lp->l_blkno != rip->ri_blkno)
+						continue;
+					dn = &lbp->lb_drvnm[slp->l_drvnm_index];
+					if (strncmp(dn->dn_data,
+					    rip->ri_driver, MD_MAXDRVNM) == 0)
+						break;
+				}
 			} else {
-			    for (li = 0; li < lbp->lb_loccnt; li++) {
-				mddb_drvnm_t		*dn;
-				mddb_mnsidelocator_t	*mnslp;
-				mddb_mnlb_t		*mnlbp;
-				int			i;
-
-				/*
-				 * Check all possible locators locking for
-				 * match to the currently read-in locator,
-				 * must match on:
-				 *	- blkno
-				 *	- side locator for this node's side
-				 *	- side locator minor number
-				 *	- side locator driver name
-				 */
-
-				/* Looking at sidelocs - cast lbp -> mnlbp */
-				mnlbp = (mddb_mnlb_t *)lbp;
-				lp = &mnlbp->lb_locators[li];
-				if (lp->l_flags & MDDB_F_DELETED)
-					continue;
-				if (lp->l_blkno != rip->ri_blkno)
-					continue;
-
-				for (i = 0; i < MD_MNMAXSIDES; i++) {
-				    mnslp = &mnlbp->lb_mnsidelocators[i][li];
-				    if (mnslp->mnl_sideno == s->s_sideno) {
-					break;
-				    }
+				for (li = 0; li < lbp->lb_loccnt; li++) {
+					mddb_drvnm_t		*dn;
+					mddb_mnsidelocator_t	*mnslp;
+					mddb_mnlb_t		*mnlbp;
+					int			i;
+
+					/*
+					 * Check all possible locators locking
+					 * for match to the currently read-in
+					 * locator, must match on:
+					 *	- blkno
+					 *	- side locator for this
+					 *	  node's side
+					 *	- side locator minor number
+					 *	- side locator driver name
+					 */
+
+					/*
+					 * Looking at sidelocs:
+					 * cast lbp -> mnlbp
+					 */
+					mnlbp = (mddb_mnlb_t *)lbp;
+					lp = &mnlbp->lb_locators[li];
+					if (lp->l_flags & MDDB_F_DELETED)
+						continue;
+					if (lp->l_blkno != rip->ri_blkno)
+						continue;
+
+					for (i = 0; i < MD_MNMAXSIDES; i++) {
+						mnslp = &mnlbp->
+						    lb_mnsidelocators[i][li];
+						if (mnslp->mnl_sideno ==
+						    s->s_sideno) {
+							break;
+						}
+					}
+					/* No matching side found */
+					if (i == MD_MNMAXSIDES)
+						continue;
+					if (mnslp->mnl_mnum !=
+					    md_getminor(rip->ri_dev))
+						continue;
+					dn = &lbp->
+					    lb_drvnm[mnslp->mnl_drvnm_index];
+					if (strncmp(dn->dn_data,
+					    rip->ri_driver, MD_MAXDRVNM) == 0)
+						break;
 				}
-				/* No matching side found */
-				if (i == MD_MNMAXSIDES)
-					continue;
-				if (mnslp->mnl_mnum != md_getminor(rip->ri_dev))
-					continue;
-				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
-				if (strncmp(dn->dn_data, rip->ri_driver,
-				    MD_MAXDRVNM) == 0)
-					break;
-			    }
 			}
 		}
 
@@ -5549,7 +5589,7 @@
 		did_dbp1 = did_icp->did_ic_dbp;
 		while (did_dbp1) {
 			if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
-				0, dbtob(did_dbp1->db_blkcnt))) {
+			    0, dbtob(did_dbp1->db_blkcnt))) {
 				retval = MDDB_E_NOSPACE;
 				goto errout;
 			}
@@ -5904,9 +5944,9 @@
 				/* Validate device id on current system */
 				newdev[li] = dev;
 				if (mddb_devid_validate(
-					did_icp->did_ic_devid[li],
-					&(newdev[li]),
-					did_info->info_minor_name) == 0) {
+				    did_icp->did_ic_devid[li],
+				    &(newdev[li]),
+				    did_info->info_minor_name) == 0) {
 					/* Set valid flag */
 					did_info->info_flags |= MDDB_DID_VALID;
 				} else {
@@ -5931,20 +5971,21 @@
 						if (mddb_devid_add(s, li,
 						    ret_devid, minor_name)) {
 							cmn_err(CE_WARN,
-							"Not enough space in"
-							" metadevice state"
-							" database\n");
+							    "Not enough space"
+							    " in metadevice"
+							    " state"
+							    " database\n");
 							cmn_err(CE_WARN,
-							"to add relocation"
-							" information for"
-							" device:\n");
+							    "to add relocation"
+							    " information for"
+							    " device:\n");
 							cmn_err(CE_WARN,
-							" major = %d, "
-							" minor = %d\n",
-							getmajor(ddi_dev),
-							getminor(ddi_dev));
+							    " major = %d, "
+							    " minor = %d\n",
+							    getmajor(ddi_dev),
+							    getminor(ddi_dev));
 						} else {
-						    write_lb = 1;
+							write_lb = 1;
 						}
 						kmem_free(minor_name,
 						    strlen(minor_name) + 1);
@@ -6509,7 +6550,7 @@
 				if (! s->s_mbiarray[i])
 					continue;
 				dev = md_expldev(
-					s->s_lbp->lb_locators[i].l_dev);
+				    s->s_lbp->lb_locators[i].l_dev);
 				dev = md_xlate_targ_2_mini(dev);
 				if (dev != NODEV64)
 					mddb_devclose(dev);
@@ -6518,7 +6559,7 @@
 			}
 
 			kmem_free((caddr_t)s->s_mbiarray,
-				sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
+			    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
 			s->s_mbiarray = NULL;
 		}
 
@@ -6560,7 +6601,7 @@
 	 */
 
 	lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
-				MDDB_LOCAL_LBCNT : MDDB_LBCNT);
+	    MDDB_LOCAL_LBCNT : MDDB_LBCNT);
 	if (flag & MDDB_MULTINODE) {
 		lb_blkcnt = MDDB_MNLBCNT;
 	}
@@ -6623,7 +6664,7 @@
 	/* the btodb that follows is converting the directory block size */
 	/* Data tag part of mddb located after first block of mddb data */
 	lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
-						btodb(MDDB_BSIZE));
+	    btodb(MDDB_BSIZE));
 	/* Data tags are not used in MN diskset - so set count to 0 */
 	if (flag & MDDB_MULTINODE)
 		lbp->lb_dtblkcnt = (mddb_block_t)0;
@@ -6675,14 +6716,14 @@
 		devid_flag = 0;
 	if (devid_flag) {
 		lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
-			lbp->lb_dtblkcnt;
+		    lbp->lb_dtblkcnt;
 		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
 		lbp->lb_flags |= MDDB_DEVID_STYLE;
 
 		did_icp = (mddb_did_ic_t *)kmem_zalloc
-			(sizeof (mddb_did_ic_t), KM_SLEEP);
+		    (sizeof (mddb_did_ic_t), KM_SLEEP);
 		did_blkp = (mddb_did_blk_t *)
-			kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
+		    kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
 		did_blkp->blk_magic = MDDB_MAGIC_DI;
 		did_blkp->blk_revision = MDDB_REV_DI;
 		did_icp->did_ic_blkp = did_blkp;
@@ -6846,8 +6887,7 @@
 	 *	re-grab mutex
 	 *	set s_mn_parseflags_sending to zero
 	 */
-	mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
-		KM_SLEEP);
+	mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP);
 	while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
 	    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
 	    (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
@@ -6867,18 +6907,18 @@
 		mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
 		for (i = 0; i < MDDB_NLB; i++) {
 			mddb_parse_msg->msg_lb_flags[i] =
-				lbp->lb_locators[i].l_flags;
+			    lbp->lb_locators[i].l_flags;
 		}
 		kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 		while (rval != 0) {
 			rval = mdmn_ksend_message(s->s_setno,
-				MD_MN_MSG_MDDB_PARSE, 0,
-				(char *)mddb_parse_msg,
-				sizeof (mddb_parse_msg), kresult);
+			    MD_MN_MSG_MDDB_PARSE, 0, 0,
+			    (char *)mddb_parse_msg,
+			    sizeof (md_mn_msg_mddb_parse_t), kresult);
 			if (rval != 0)
 				cmn_err(CE_WARN, "mddb_setexit: Unable to send "
-					"mddb update message to other nodes in "
-					"diskset %s\n", s->s_setname);
+				    "mddb update message to other nodes in "
+				    "diskset %s\n", s->s_setname);
 		}
 		kmem_free(kresult, sizeof (md_mn_kresult_t));
 
@@ -6987,12 +7027,12 @@
 					if (mddb_devid_add(s, li, ret_devid,
 					    minor_name)) {
 						cmn_err(CE_WARN,
-						"Not enough space in metadb"
-						" to add device id for"
-						"  dev: major = %d, "
-						"minor = %d\n",
-						getmajor(ddi_dev),
-						getminor(ddi_dev));
+						    "Not enough space in metadb"
+						    " to add device id for"
+						    "  dev: major = %d, "
+						    "minor = %d\n",
+						    getmajor(ddi_dev),
+						    getminor(ddi_dev));
 					}
 					sz = strlen(minor_name) + 1;
 					kmem_free(minor_name, sz);
@@ -7179,13 +7219,10 @@
 	}
 
 	md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
-				MD_SET_TAGDATA | MD_SET_USETAG |
-				MD_SET_TOOFEW | MD_SET_STALE |
-				MD_SET_OWNERSHIP | MD_SET_BADTAG |
-				MD_SET_CLRTAG | MD_SET_MNSET |
-				MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK |
-				MD_SET_MN_MIR_STATE_RC | MD_SET_IMPORT |
-				MD_SET_REPLICATED_IMPORT);
+	    MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE |
+	    MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET |
+	    MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC |
+	    MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
 
 	mutex_exit(SETMUTEX(setno));
 }
@@ -7286,13 +7323,13 @@
 
 		SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len;
 		bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
-			SPN_SUFFIX(spn).suf_len);
+		    SPN_SUFFIX(spn).suf_len);
 		iprefix = mnsn->mn_ln_suffix.suf_prefix;
 	} else {
 		sn = &lnp->ln_suffixes[sideno][li];
 		SPN_SUFFIX(spn).suf_len = sn->suf_len;
 		bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data,
-			SPN_SUFFIX(spn).suf_len);
+		    SPN_SUFFIX(spn).suf_len);
 		iprefix = sn->suf_prefix;
 	}
 	SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len;
@@ -7328,7 +7365,7 @@
 	 * Data checking
 	 */
 	if (setno >= md_nsets || cp->c_id < 0 ||
-		cp->c_id > cp->c_dbmax) {
+	    cp->c_id > cp->c_dbmax) {
 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
 	}
 
@@ -7377,14 +7414,14 @@
 		if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) {
 			mddb_setexit(s);
 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
-						setno));
+			    setno));
 		}
 		li = cp->c_id;
 	} else {
 		if (cp->c_id >= cp->c_dbcnt) {
 			mddb_setexit(s);
 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
-						setno));
+			    setno));
 		}
 
 		/* CSTYLED */
@@ -7446,7 +7483,7 @@
 		 * commitcnt to 0.
 		 */
 		(void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
-			MDDB_WR_ONLY_MASTER);
+		    MDDB_WR_ONLY_MASTER);
 		lbp->lb_commitcnt = commitcnt;
 	}
 
@@ -7689,7 +7726,7 @@
 		lnp->ln_revision = MDDB_REV_LN;
 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
 	err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
-		lbp->lb_lnblkcnt, 0);
+	    lbp->lb_lnblkcnt, 0);
 	/*
 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
 	 * flag in the mddb_set structure to show that the locator
@@ -7851,7 +7888,7 @@
 		 */
 		if (devidptr != (ddi_devid_t)NULL) {
 			mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE,
-				KM_SLEEP);
+			    KM_SLEEP);
 			mb->mb_magic = MDDB_MAGIC_DU;
 			mb->mb_revision = MDDB_REV_MB;
 			mb2free = 1;
@@ -8077,7 +8114,7 @@
 				single_thread_end(s);
 				mddb_setexit(s);
 				return (mdmddberror(ep, MDE_DB_TOOSMALL,
-					NODEV32, setno));
+				    NODEV32, setno));
 			}
 		}
 
@@ -8095,7 +8132,7 @@
 			single_thread_end(s);
 			mddb_setexit(s);
 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
-						setno));
+			    setno));
 		}
 
 		if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
@@ -8105,7 +8142,7 @@
 			single_thread_end(s);
 			mddb_setexit(s);
 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
-						setno));
+			    setno));
 		}
 	}
 
@@ -8119,9 +8156,9 @@
 				int	j;
 				mnlbp = (mddb_mnlb_t *)lbp;
 				for (j = 0; j < MD_MNMAXSIDES; j++) {
-				    mnslp = &mnlbp->lb_mnsidelocators[j][i];
-				    if (mnslp->mnl_sideno == cp->c_sideno)
-					break;
+					mnslp = &mnlbp->lb_mnsidelocators[j][i];
+					if (mnslp->mnl_sideno == cp->c_sideno)
+						break;
 				}
 				if (j < MD_MNMAXSIDES) {
 					mnslp->mnl_mnum = NODEV32;
@@ -8129,7 +8166,7 @@
 					mnlnp = (mddb_mnln_t *)lnp;
 					mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
 					bzero((caddr_t)mnsn,
-						sizeof (md_mnname_suffix_t));
+					    sizeof (md_mnname_suffix_t));
 				}
 			} else {
 				slp = &lbp->lb_sidelocators[cp->c_sideno][i];
@@ -8148,7 +8185,7 @@
 		lnp->ln_revision = MDDB_REV_LN;
 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
-		lbp->lb_lnblkcnt, 0);
+	    lbp->lb_lnblkcnt, 0);
 	/*
 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
 	 * flag in the mddb_set structure to show that the locator
@@ -8288,11 +8325,11 @@
 			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
 				if (command == MDDB_NEWDEV) {
 					ddi_devid_free((ddi_devid_t)(uintptr_t)
-						clp->l_devid);
+					    clp->l_devid);
 					single_thread_end(s);
 					mddb_setexit(s);
 					return (mdmddberror(ep,
-						MDE_DB_EXISTS, NODEV32, setno));
+					    MDE_DB_EXISTS, NODEV32, setno));
 				}
 			}
 		} else {
@@ -8302,7 +8339,7 @@
 					single_thread_end(s);
 					mddb_setexit(s);
 					return (mdmddberror(ep,
-						MDE_DB_EXISTS, NODEV32, setno));
+					    MDE_DB_EXISTS, NODEV32, setno));
 				}
 			}
 		}
@@ -8345,7 +8382,7 @@
 			single_thread_end(s);
 			mddb_setexit(s);
 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
-						setno));
+			    setno));
 		}
 	}
 
@@ -8402,7 +8439,7 @@
 			single_thread_end(s);
 			mddb_setexit(s);
 			return (mdmddberror(ep, MDE_DB_TOOSMALL,
-				NODEV32, setno));
+			    NODEV32, setno));
 		}
 	}
 	/*
@@ -8462,7 +8499,7 @@
 		lnp->ln_revision = MDDB_REV_LN;
 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
-		lbp->lb_lnblkcnt, 0);
+	    lbp->lb_lnblkcnt, 0);
 	/*
 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
 	 * flag in the mddb_set structure to show that the locator
@@ -8579,67 +8616,74 @@
 	mdclrerror(ep);
 
 	switch (command) {
-	    case MDDB_NEWDEV:
-		err = newdev(cp, command, ep);
-		break;
-
-	    case MDDB_NEWSIDE:
-	    case MDDB_DELSIDE:
-		err = delnewside(cp, command, ep);
-		break;
-
-	    case MDDB_GETDEV:
-	    case MDDB_DELDEV:
-	    case MDDB_ENDDEV:
-		err = getdeldev(cp, command, ep);
-		break;
-
-	    case MDDB_GETDRVRNAME:
-		err = getdriver(&cp->c_locator);
-		break;
-
-	    case MDDB_USEDEV:
-		/*
-		 * Note: must allow USEDEV ioctl during upgrade to support
-		 * auto-take disksets.
-		 *
-		 * Also during the set import if the md_devid_destroy
-		 * flag is set then error out
-		 */
-
-		if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
-			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
-
-		if (setno >= md_nsets)
-			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
-
-		if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
-			if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
-				err = mddbstatus2error(ep, err, NODEV32, setno);
-				break;
-			}
-		}
-		if (setno == MD_LOCAL_SET)
-			flag = MDDB_F_IOCTL;
-		if (cp->c_locator.l_old_devid) {
-			md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT);
-		}
-		err = ridev(&s->s_rip, &cp->c_locator, NULL, flag);
-		mddb_setexit(s);
-		break;
-
-	    case MDDB_RELEASESET:
-		mutex_enter(&mddb_lock);
-		mddb_unload_set(cp->c_setno);
-		mutex_exit(&mddb_lock);
-		break;
-
-	    case MDDB_SETDID:
-		err = setdid(cp);
-		break;
-
-	    default:
-		err = mdmddberror(ep, MDE_DB_INVALID, NODEV32, cp->c_setno);
+		case MDDB_NEWDEV:
+			err = newdev(cp, command, ep);
+			break;
+
+		case MDDB_NEWSIDE:
+		case MDDB_DELSIDE:
+			err = delnewside(cp, command, ep);
+			break;
+
+		case MDDB_GETDEV:
+		case MDDB_DELDEV:
+		case MDDB_ENDDEV:
+			err = getdeldev(cp, command, ep);
+			break;
+
+		case MDDB_GETDRVRNAME:
+			err = getdriver(&cp->c_locator);
+			break;
+
+		case MDDB_USEDEV:
+			/*
+			 * Note: must allow USEDEV ioctl during upgrade to
+			 * support auto-take disksets.
+			 *
+			 * Also during the set import if the md_devid_destroy
+			 * flag is set then error out
+			 */
+
+			if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
+				return (mdmderror(ep, MDE_INVAL_UNIT,
+				    MD_ADM_MINOR));
+
+			if (setno >= md_nsets)
+				return (mdmderror(ep, MDE_INVAL_UNIT,
+				    MD_ADM_MINOR));
+
+			if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) ==
+			    NULL) {
+				if ((s = init_set(cp, MDDB_NOINIT, &err)) ==
+				    NULL) {
+					err = mddbstatus2error(ep, err,
+					    NODEV32, setno);
+					break;
+				}
+			}
+			if (setno == MD_LOCAL_SET)
+				flag = MDDB_F_IOCTL;
+			if (cp->c_locator.l_old_devid) {
+				md_set_setstatus(setno,
+				    MD_SET_REPLICATED_IMPORT);
+			}
+			err = ridev(&s->s_rip, &cp->c_locator, NULL, flag);
+			mddb_setexit(s);
+			break;
+
+		case MDDB_RELEASESET:
+			mutex_enter(&mddb_lock);
+			mddb_unload_set(cp->c_setno);
+			mutex_exit(&mddb_lock);
+			break;
+
+		case MDDB_SETDID:
+			err = setdid(cp);
+			break;
+
+		default:
+			err = mdmddberror(ep, MDE_DB_INVALID, NODEV32,
+			    cp->c_setno);
 	}
 
 	return (err);
@@ -8761,15 +8805,14 @@
 	}
 
 	recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) +
-				usersize, MDDB_BSIZE);
+	    usersize, MDDB_BSIZE);
 	blkcnt = btodb(recsize);
 
 	if (mddb_maxblocks)
 		maxblocks = mddb_maxblocks;
 	else
-		maxblocks = (MDDB_BSIZE -
-			(sizeof (*db32p) + sizeof (*de32p) -
-			sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);
+		maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) -
+		    sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);
 
 	if (blkcnt > maxblocks) {
 		mddb_setexit(s);
@@ -8833,7 +8876,7 @@
 	} while (dbp);
 
 	desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) +
-			(sizeof (mddb_block_t) * blkcnt);
+	    (sizeof (mddb_block_t) * blkcnt);
 
 	/*
 	 * see if a directory block exists which will hold this entry
@@ -8872,7 +8915,8 @@
 			mddb_setexit(s);
 			return (MDDB_E_NOSPACE);
 		}
-		for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next);
+		for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next)
+			;
 		dbp->db_next = newdbp;
 		bzero((caddr_t)dbp->db_next, sizeof (*newdbp));
 		dbp->db_nextblk = getfreeblks(s, 1);
@@ -8888,10 +8932,10 @@
 	 * ready to add record
 	 */
 	desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) +
-			(sizeof (mddb_block_t) * blkcnt);
+	    (sizeof (mddb_block_t) * blkcnt);
 	if (dbp->db_firstentry) {
-		for (dep = dbp->db_firstentry; dep->de_next;
-		    dep = dep->de_next);
+		for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next)
+			;
 		dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
 		dep = dep->de_next;
 	} else {
@@ -8919,8 +8963,8 @@
 	dep->de_blkcount = blkcnt;
 	flag_type = options &
 	    (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID |
-		MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
-		MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
+	    MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
+	    MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
 	switch (flag_type) {
 	case MD_CRO_OPTIMIZE:
 		dep->de_flags = MDDB_F_OPT;
@@ -9003,7 +9047,7 @@
 	if ((options & MD_CRO_OPTIMIZE) == 0) {
 		for (i = 0; i < blkcnt;	 i++) {
 			err |= writeall(s, (caddr_t)tmppnt,
-				dep->de_blks[i], 1, 0);
+			    dep->de_blks[i], 1, 0);
 			tmppnt += MDDB_BSIZE;
 		}
 	} else {
@@ -9310,10 +9354,10 @@
 		mddb_rb32_t *nrbp;
 
 		recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) +
-				icsize, MDDB_BSIZE);
+		    icsize, MDDB_BSIZE);
 		if (dep->de_recsize < recsize)
 			cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only "
-				"nonoptimized records can be resized\n");
+			    "nonoptimized records can be resized\n");
 	}
 
 	mddb_setexit(s);
@@ -9673,26 +9717,29 @@
 				lbp = s->s_lbp;
 				mnlbp = (mddb_mnlb_t *)lbp;
 				for (i = 0; i < 2; i++) {
-				    li = dep->de_optinfo[i].o_li;
-				    lp = &lbp->lb_locators[li];
-				    for (j = 0; j < MD_MNMAXSIDES; j++) {
-					mnslp =
-					    &mnlbp->lb_mnsidelocators[j][li];
-					if (mnslp->mnl_sideno == s->s_sideno)
-					    break;
-				    }
-				    if (j == MD_MNMAXSIDES)
-					continue;
-
-				    dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
-				    recerr = &msg_recerr->msg_recerr[i];
-				    recerr->r_li = li;
-				    recerr->r_flags =
-					dep->de_optinfo[i].o_flags;
-				    recerr->r_blkno = lp->l_blkno;
-				    recerr->r_mnum = md_getminor(lp->l_dev);
-				    (void) strncpy(recerr->r_driver_name,
-					dn->dn_data, MD_MAXDRVNM);
+					li = dep->de_optinfo[i].o_li;
+					lp = &lbp->lb_locators[li];
+					for (j = 0; j < MD_MNMAXSIDES; j++) {
+						mnslp =
+						    &mnlbp->
+						    lb_mnsidelocators[j][li];
+						if (mnslp->mnl_sideno ==
+						    s->s_sideno)
+							break;
+					}
+					if (j == MD_MNMAXSIDES)
+						continue;
+
+					dn = &lbp->
+					    lb_drvnm[mnslp->mnl_drvnm_index];
+					recerr = &msg_recerr->msg_recerr[i];
+					recerr->r_li = li;
+					recerr->r_flags =
+					    dep->de_optinfo[i].o_flags;
+					recerr->r_blkno = lp->l_blkno;
+					recerr->r_mnum = md_getminor(lp->l_dev);
+					(void) strncpy(recerr->r_driver_name,
+					    dn->dn_data, MD_MAXDRVNM);
 				}
 
 				/* Release locks */
@@ -9711,17 +9758,17 @@
 				 * the optimized resync records it owns.
 				 */
 				rval = mdmn_ksend_message(s->s_setno,
-					MD_MN_MSG_MDDB_OPTRECERR,
-					MD_MSGF_NO_BCAST,
-					(char *)msg_recerr,
-					sizeof (md_mn_msg_mddb_optrecerr_t),
-					kres);
+				    MD_MN_MSG_MDDB_OPTRECERR,
+				    MD_MSGF_NO_BCAST, 0,
+				    (char *)msg_recerr,
+				    sizeof (md_mn_msg_mddb_optrecerr_t),
+				    kres);
 				if (!MDMN_KSEND_MSG_OK(rval, kres)) {
 					cmn_err(CE_WARN, "mddb_commitrec: "
-						"Unable to send optimized "
-						"resync record failure "
-						"message to other nodes in "
-						"diskset %s\n", s->s_setname);
+					    "Unable to send optimized "
+					    "resync record failure "
+					    "message to other nodes in "
+					    "diskset %s\n", s->s_setname);
 					mdmn_ksend_show_error(rval, kres,
 					    "MD_MN_MSG_MDDB_OPTRECERR");
 				}
@@ -9758,7 +9805,7 @@
 			}
 			kmem_free(kres, sizeof (md_mn_kresult_t));
 			kmem_free(msg_recerr,
-				sizeof (md_mn_msg_mddb_optrecerr_t));
+			    sizeof (md_mn_msg_mddb_optrecerr_t));
 
 			/* Resync record should be fixed - if possible */
 			s->s_optwaiterr--;
@@ -10723,8 +10770,7 @@
 		if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) &&
 		    (ddi_devid_compare(rtn_devid, devid) == 0)) {
 			did_info->info_flags = MDDB_DID_VALID |
-						MDDB_DID_EXISTS |
-						MDDB_DID_UPDATED;
+			    MDDB_DID_EXISTS | MDDB_DID_UPDATED;
 		} else {
 			cnt++;
 			/*
@@ -11051,7 +11097,7 @@
 			/* Assumes master blocks are already setup */
 			if (lbp == (mddb_lb_t *)NULL) {
 				lbp = (mddb_lb_t *)kmem_zalloc(
-					dbtob(MDDB_MNLBCNT), KM_SLEEP);
+				    dbtob(MDDB_MNLBCNT), KM_SLEEP);
 			}
 			err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i);
 
@@ -11135,7 +11181,7 @@
 
 			/* Free this node's old view of mddb locator blocks */
 			kmem_free((caddr_t)s->s_lbp,
-				dbtob(s->s_lbp->lb_blkcnt));
+			    dbtob(s->s_lbp->lb_blkcnt));
 			s->s_lbp = lbp;
 		} else {
 			if (lbp)
@@ -11206,7 +11252,7 @@
 			 * master could have rewritten in during fixoptrecord.
 			 */
 			db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE,
-				KM_SLEEP);
+			    KM_SLEEP);
 			create_db32rec(db32p, dbp);
 			for (li = 0; li < lbp->lb_loccnt; li++) {
 				lp = &lbp->lb_locators[li];
@@ -11216,16 +11262,16 @@
 					continue;
 
 				err = readblks(s, (caddr_t)db32p,
-					db32p->db32_blknum, 1, li);
+				    db32p->db32_blknum, 1, li);
 				if (err)
 					continue;
 
 				/* Reverify db; go to next mddb if bad */
 				if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
 				    (revchk(MDDB_REV_DB,
-					db32p->db32_revision)) ||
+				    db32p->db32_revision)) ||
 				    (crcchk(db32p, &db32p->db32_checksum,
-					MDDB_BSIZE, NULL))) {
+				    MDDB_BSIZE, NULL))) {
 					continue;
 				} else {
 					break;
@@ -11254,9 +11300,8 @@
 			if (li == lbp->lb_loccnt) {
 				kmem_free((caddr_t)db32p, MDDB_BSIZE);
 				cmn_err(CE_PANIC, "md: mddb: Node unable to "
-					"access any SVM state database "
-					"replicas for diskset %s\n",
-					s->s_setname);
+				    "access any SVM state database "
+				    "replicas for diskset %s\n", s->s_setname);
 			}
 			/*
 			 * Setup temp copy of linked list of de's.
@@ -11505,45 +11550,53 @@
 					lp->l_flags &= ~MDDB_F_ACTIVE;
 				}
 			} else {
-			/*
-			 * Passed in li from slave does not match
-			 * the replica in the master's structures.
-			 * This could have occurred if a delete
-			 * mddb command was running when the
-			 * optimized resync record had a failure.
-			 * Search all replicas for this entry.
-			 * If no match, just ignore.
-			 * If a match, set replica in error.
-			 */
-			    for (li = 0; li < lbp->lb_loccnt; li++) {
-				lp = &lbp->lb_locators[li];
-				if (lp->l_flags & MDDB_F_DELETED)
-					continue;
-
-				for (j = 0; j < MD_MNMAXSIDES; j++) {
-					mnslp =
-					    &mnlbp->lb_mnsidelocators[j][li];
-					if (mnslp->mnl_sideno == s->s_sideno)
+				/*
+				 * Passed in li from slave does not match
+				 * the replica in the master's structures.
+				 * This could have occurred if a delete
+				 * mddb command was running when the
+				 * optimized resync record had a failure.
+				 * Search all replicas for this entry.
+				 * If no match, just ignore.
+				 * If a match, set replica in error.
+				 */
+				for (li = 0; li < lbp->lb_loccnt; li++) {
+					lp = &lbp->lb_locators[li];
+					if (lp->l_flags & MDDB_F_DELETED)
+						continue;
+
+					for (j = 0; j < MD_MNMAXSIDES; j++) {
+						mnslp =
+						    &mnlbp->
+						    lb_mnsidelocators[j][li];
+						if (mnslp->mnl_sideno ==
+						    s->s_sideno)
+							break;
+					}
+					if (j == MD_MNMAXSIDES)
+						continue;
+
+					dn = &lbp->
+					    lb_drvnm[mnslp->mnl_drvnm_index];
+					if ((strncmp(dn->dn_data,
+					    recerr->r_driver_name,
+					    MD_MAXDRVNM) == 0) &&
+					    (recerr->r_blkno == lp->l_blkno) &&
+					    (recerr->r_mnum ==
+					    mnslp->mnl_mnum)) {
+						if ((lp->l_flags &
+						    MDDB_F_ACTIVE) ||
+						    ((lp->l_flags &
+						    MDDB_F_EWRITE) == 0)) {
+							something_changed = 1;
+							lp->l_flags |=
+							    MDDB_F_EWRITE;
+							lp->l_flags &=
+							    ~MDDB_F_ACTIVE;
+						}
 						break;
+					}
 				}
-				if (j == MD_MNMAXSIDES)
-					continue;
-
-				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
-				if ((strncmp(dn->dn_data, recerr->r_driver_name,
-				    MD_MAXDRVNM) == 0) &&
-				    (recerr->r_blkno == lp->l_blkno) &&
-				    (recerr->r_mnum == mnslp->mnl_mnum)) {
-					if ((lp->l_flags & MDDB_F_ACTIVE) ||
-					    ((lp->l_flags & MDDB_F_EWRITE)
-					    == 0)) {
-						something_changed = 1;
-						lp->l_flags |= MDDB_F_EWRITE;
-						lp->l_flags &= ~MDDB_F_ACTIVE;
-					}
-					break;
-				}
-			    }
 			}
 		}
 	}
@@ -11693,8 +11746,7 @@
 	/* Re-verify that set is not stale */
 	if (md_get_setstatus(setno) & MD_SET_STALE) {
 		mddb_setexit(s);
-		return (mdmddberror(ep, MDE_DB_STALE,
-			NODEV32, setno));
+		return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno));
 	}
 
 	lbp = s->s_lbp;
@@ -11735,34 +11787,39 @@
 		 * They may have been altered by the previous master
 		 */
 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
-		    for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
-			if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
-				continue;
-			}
-			/* This has been alloc'ed while joining the set */
-			if (dep->de_rb) {
-				kmem_free(dep->de_rb, dep->de_recsize);
-				dep->de_rb = (mddb_rb32_t *)NULL;
-			}
-			if (dep->de_rb_userdata) {
-				kmem_free(dep->de_rb_userdata, dep->de_reqsize);
-				dep->de_rb_userdata = (caddr_t)NULL;
-			}
-
-			err = getrecord(s, dep, li);
-			if (err) {
+			for (dep = dbp->db_firstentry; dep; dep =
+			    dep->de_next) {
+				if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
+					continue;
+				}
 				/*
-				 * When we see on error while reading the
-				 * changelog entries, we move on to the next
-				 * mddb
+				 * This has been alloc'ed while
+				 * joining the set
 				 */
-				err = 1;
-				break; /* out of inner for-loop */
-			}
-			allocuserdata(dep);
-		    }
-		    if (err)
-			    break; /* out of outer for-loop */
+				if (dep->de_rb) {
+					kmem_free(dep->de_rb, dep->de_recsize);
+					dep->de_rb = (mddb_rb32_t *)NULL;
+				}
+				if (dep->de_rb_userdata) {
+					kmem_free(dep->de_rb_userdata,
+					    dep->de_reqsize);
+					dep->de_rb_userdata = (caddr_t)NULL;
+				}
+
+				err = getrecord(s, dep, li);
+				if (err) {
+					/*
+					 * When we see an error while reading
+					 * the changelog entries, we move on
+					 * to the next mddb
+					 */
+					err = 1;
+					break; /* out of inner for-loop */
+				}
+				allocuserdata(dep);
+			}
+			if (err)
+				break; /* out of outer for-loop */
 		}
 
 		/* If err, try next mddb */
@@ -11773,7 +11830,7 @@
 
 		/* Is incore locator block same as ondisk? */
 		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
-									== 1) {
+		    == 1) {
 			write_out_mddb = 1;
 			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
 			break;
@@ -11786,7 +11843,7 @@
 		    KM_SLEEP);
 		/* read in on-disk locator names */
 		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
-			lbp->lb_lnblkcnt, li);
+		    lbp->lb_lnblkcnt, li);
 
 		/* If err, try next mddb */
 		if (err) {
@@ -11796,7 +11853,7 @@
 
 		/* Are incore locator names same as ondisk? */
 		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
-									== 1) {
+		    == 1) {
 			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
 			write_out_mddb = 1;
 			break;
@@ -11885,7 +11942,7 @@
 
 		/* Is incore locator block same as ondisk? */
 		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
-									== 1) {
+		    == 1) {
 			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
 			write_out_mddb = 1;
 			break;
@@ -11909,7 +11966,7 @@
 
 		/* Are incore locator names same as ondisk? */
 		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
-									== 1) {
+		    == 1) {
 			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
 			write_out_mddb = 1;
 			break;
@@ -12322,8 +12379,7 @@
 			/* disk is powered off or not there */
 			continue;
 
-		if (md_get_setstatus(s->s_setno) &
-			MD_SET_REPLICATED_IMPORT) {
+		if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
 			/*
 			 * It is a replicated set
 			 */
--- a/usr/src/uts/common/io/lvm/md/md_subr.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/io/lvm/md/md_subr.c	Wed Dec 24 08:23:40 2008 -0700
@@ -86,6 +86,7 @@
 extern md_ops_t			**md_ops;
 extern md_ops_t			*md_opslist;
 extern ddi_modhandle_t		*md_mods;
+extern dev_info_t		*md_devinfo;
 
 extern md_krwlock_t		md_unit_array_rw;
 extern kmutex_t			md_mx;
@@ -113,7 +114,7 @@
 extern struct nm_next_hdr	*get_first_record(set_t, int, int);
 
 struct mdq_anchor	md_done_daemon; /* done request queue */
-struct mdq_anchor	md_mstr_daemon; /* mirror timeout requests */
+struct mdq_anchor	md_mstr_daemon; /* mirror error, WOW requests */
 struct mdq_anchor	md_mhs_daemon;	/* mirror hotspare requests queue */
 struct mdq_anchor	md_hs_daemon;	/* raid hotspare requests queue */
 struct mdq_anchor	md_ff_daemonq;	/* failfast request queue */
@@ -121,6 +122,7 @@
 struct mdq_anchor	md_mirror_io_daemon; /* mirror owner i/o queue */
 struct mdq_anchor	md_mirror_rs_daemon; /* mirror resync done queue */
 struct mdq_anchor	md_sp_daemon;	/* soft-part error daemon queue */
+struct mdq_anchor	md_mto_daemon;	/* mirror timeout daemon queue */
 
 int md_done_daemon_threads = 1;	/* threads for md_done_daemon requestq */
 int md_mstr_daemon_threads = 1;	/* threads for md_mstr_daemon requestq */
@@ -129,6 +131,7 @@
 int md_ff_daemon_threads = 3;	/* threads for md_ff_daemon requestq */
 int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
 int md_sp_daemon_threads = 1;	/* threads for md_sp_daemon requestq */
+int md_mto_daemon_threads = 1;	/* threads for md_mto_daemon requestq */
 
 #ifdef DEBUG
 /* Flag to switch on debug messages */
@@ -146,7 +149,7 @@
  *
  */
 
-#define	MD_DAEMON_QUEUES 10
+#define	MD_DAEMON_QUEUES 11
 
 md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
 	{&md_done_daemon, &md_done_daemon_threads},
@@ -158,6 +161,7 @@
 	{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
 	{&md_sp_daemon, &md_sp_daemon_threads},
 	{&md_mhs_daemon, &md_mhs_daemon_threads},
+	{&md_mto_daemon, &md_mto_daemon_threads},
 	{0, 0}
 };
 
@@ -176,6 +180,12 @@
 uint_t			md_retry_cnt = 1; /* global so it can be patched */
 
 /*
+ * How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
+ * Again, made patchable here should it prove useful.
+ */
+uint_t			md_send_retry_limit = 30;
+
+/*
  * Bug # 1212146
  * Before this change the user had to pass in a short aligned buffer because of
  * problems in some underlying device drivers.  This problem seems to have been
@@ -712,9 +722,9 @@
 				if (status & MD_SET_STALE)
 					flag |= MD_MSGF_NO_LOG;
 				rval = mdmn_ksend_message(s->s_setno,
-				    MD_MN_MSG_MDDB_PARSE, flag,
+				    MD_MN_MSG_MDDB_PARSE, flag, 0,
 				    (char *)mddb_parse_msg,
-				    sizeof (mddb_parse_msg), kresult);
+				    sizeof (md_mn_msg_mddb_parse_t), kresult);
 				/* if the node hasn't yet joined, it's Ok. */
 				if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
 				    (kresult->kmmr_comm_state !=
@@ -2817,6 +2827,15 @@
 	mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);
 
+	if (alloc_lock) {
+		ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
+		mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
+		cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
+		mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
+		    MUTEX_DEFAULT, NULL);
+		ui->ui_io_lock->io_list_front = NULL;
+		ui->ui_io_lock->io_list_back = NULL;
+	}
 	if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
 		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
 		MDI_VOIDUNIT(mnum) = (void *) ui;
@@ -2829,15 +2848,6 @@
 	ui->ui_link.ln_setno = setno;
 	ui->ui_link.ln_id = mnum;
 	ops->md_head = &ui->ui_link;
-	if (alloc_lock) {
-		ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
-		mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
-		cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
-		mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
-		    MUTEX_DEFAULT, NULL);
-		ui->ui_io_lock->io_list_front = NULL;
-		ui->ui_io_lock->io_list_back = NULL;
-	}
 	/* setup the unavailable field */
 #if defined(_ILP32)
 	if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
@@ -3865,82 +3875,68 @@
 /*
  * Send a kernel message.
  * user has to provide for an allocated result structure
- * If the door handler disappears we retry forever emitting warnings every so
- * often.
- * TODO: make this a flaggable attribute so that the caller can decide if the
- *	 message is to be a 'one-shot' message or not.
+ * If the door handler disappears we retry, emitting warnings every so often.
+ *
+ * The recipient argument is almost always unused, and is therefore typically
+ * set to zero, as zero is an invalid cluster nodeid.  The exceptions are the
+ * marking and clearing of the DRL from a node that is not currently the
+ * owner.  In these cases, the recipient argument will be the nodeid of the
+ * mirror owner, and MD_MSGF_DIRECTED will be set in the flags.  Non-owner
+ * nodes will not receive these messages.
+ *
+ * For the case where md_mn_is_commd_present() is false, we rely on the
+ * "result" having been kmem_zalloc()ed which, in effect, sets MDMNE_NULL for
+ * kmmr_comm_state making MDMN_KSEND_MSG_OK() result in 0.
  */
 int
 mdmn_ksend_message(
 	set_t		setno,
 	md_mn_msgtype_t	type,
 	uint_t		flags,
+	md_mn_nodeid_t	recipient,
 	char		*data,
 	int		size,
 	md_mn_kresult_t	*result)
 {
 	door_arg_t	da;
 	md_mn_kmsg_t	*kmsg;
-	uint_t		retry_cnt = 0;
+	uint_t		send_try_cnt = 0;
+	uint_t		retry_noise_cnt = 0;
 	int		rval;
+	k_sigset_t	oldmask, newmask;
 
 	if (size > MDMN_MAX_KMSG_DATA)
 		return (ENOMEM);
 	kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
 	kmsg->kmsg_flags = flags;
 	kmsg->kmsg_setno = setno;
+	kmsg->kmsg_recipient = recipient;
 	kmsg->kmsg_type	= type;
 	kmsg->kmsg_size	= size;
 	bcopy(data, &(kmsg->kmsg_data), size);
 
-#ifdef DEBUG_COMM
-	printf("send msg: set=%d, flags=%d, type=%d, txid = 0x%llx,"
-	    " size=%d, data=%d, data2=%d\n",
-	    kmsg->kmsg_setno, kmsg->kmsg_flags, kmsg->kmsg_type,
-	    kmsg->kmsg_size, *(int *)data, *(int *)(char *)(&kmsg->kmsg_data));
-
-
-#endif /* DEBUG_COMM */
-
-	da.data_ptr	= (char *)(kmsg);
-	da.data_size	= sizeof (md_mn_kmsg_t);
-	da.desc_ptr	= NULL;
-	da.desc_num	= 0;
-	da.rbuf		= (char *)result;
-	da.rsize	= sizeof (*result);
-
 	/*
 	 * Wait for the door handle to be established.
 	 */
-
 	while (mdmn_door_did == -1) {
-		if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) {
+		if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
 			cmn_err(CE_WARN, "door handle not yet ready. "
 			    "Check if /usr/lib/lvm/mddoors is running");
 		}
 		delay(md_hz);
 	}
-	retry_cnt = 0;
-
-	while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da, NULL,
-	    SIZE_MAX, 0)) != 0) {
-		if (rval == EAGAIN)  {
-			if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) {
-				cmn_err(CE_WARN, "door call failed. "
-				"Check if /usr/lib/lvm/mddoors is running");
-			}
-		} else {
-			cmn_err(CE_WARN,
-			    "md door call failed. Returned %d", rval);
-		}
-		delay(md_hz);
-	}
-	kmem_free(kmsg, sizeof (md_mn_kmsg_t));
 
 	/*
-	 * Attempt to determine if the message failed (with an RPC_FAILURE)
-	 * because we are in the middle of shutting the system down.
-	 *
+	 * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we
+	 * do not fail if the user process receives a signal while we're
+	 * active in the door interface.
+	 */
+	if (flags & MD_MSGF_BLK_SIGNAL) {
+		sigfillset(&newmask);
+		sigreplace(&newmask, &oldmask);
+	}
+
+	/*
 	 * If message failed with an RPC_FAILURE when rpc.mdcommd had
 	 * been gracefully shutdown (md_mn_is_commd_present returns FALSE)
 	 * then don't retry the message anymore.  If message
@@ -3956,16 +3952,81 @@
 	 *
 	 */
 
-	retry_cnt = 0;
-
-	if (result->kmmr_comm_state == MDMNE_RPC_FAIL) {
-		while (md_mn_is_commd_present() == 1) {
-			if ((++retry_cnt % MD_MN_WARN_INTVL) == 0)
+	retry_noise_cnt = send_try_cnt = 0;
+	while (md_mn_is_commd_present_lite()) {
+		/*
+		 * data_ptr and data_size are initialized here because on
+		 * return from the upcall, they contain data duplicated from
+		 * rbuf and rsize.  This causes subsequent upcalls to fail.
+		 */
+		da.data_ptr = (char *)(kmsg);
+		da.data_size = sizeof (md_mn_kmsg_t);
+		da.desc_ptr = NULL;
+		da.desc_num = 0;
+		da.rbuf = (char *)result;
+		da.rsize = sizeof (*result);
+
+		while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da,
+		    NULL, SIZE_MAX, 0)) != 0) {
+			if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
+				if (rval == EAGAIN)  {
+					cmn_err(CE_WARN,
+					    "md: door_upcall failed. "
+					    "Check if mddoors is running.");
+				} else if (rval == EINTR) {
+					cmn_err(CE_WARN,
+					    "md: door_upcall failed. "
+					    "Check if rpc.mdcommd is running.");
+				} else {
+					cmn_err(CE_WARN,
+					    "md: door_upcall failed. "
+					    "Returned %d",
+					    rval);
+				}
+			}
+			if (++send_try_cnt >= md_send_retry_limit)
 				break;
+
 			delay(md_hz);
+
+			/*
+			 * data_ptr and data_size are re-initialized here
+			 * because on return from the upcall, they contain
+			 * data duplicated from rbuf and rsize.  This causes
+			 * subsequent upcalls to fail.
+			 */
+			da.data_ptr = (char *)(kmsg);
+			da.data_size = sizeof (md_mn_kmsg_t);
+			da.desc_ptr = NULL;
+			da.desc_num = 0;
+			da.rbuf = (char *)result;
+			da.rsize = sizeof (*result);
 		}
+
+
+		/*
+		 * If:
+		 * - the send succeeded (MDMNE_ACK)
+		 * - we had an MDMNE_RPC_FAIL and commd is now gone
+		 *   (note: since the outer loop is commd-dependent,
+		 *   checking MDMNE_RPC_FAIL here is meaningless)
+		 * - we were told not to retry
+		 * - we exceeded the RPC failure send limit
+		 * punch out of the outer loop prior to the delay()
+		 */
+		if (result->kmmr_comm_state == MDMNE_ACK ||
+		    (flags & MD_MSGF_KSEND_NORETRY) ||
+		    (++send_try_cnt % md_send_retry_limit) == 0 ||
+		    !md_mn_is_commd_present())
+			break;
+		delay(md_hz);
 	}
 
+	if (flags & MD_MSGF_BLK_SIGNAL) {
+		sigreplace(&oldmask, (k_sigset_t *)NULL);
+	}
+	kmem_free(kmsg, sizeof (md_mn_kmsg_t));
+
 	return (0);
 }
 
@@ -4008,7 +4069,7 @@
 	sigfillset(&newmask);
 	sigreplace(&newmask, &oldmask);
 	ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
-	    MD_MSGF_NO_LOG, (char *)&msg, sizeof (md_mn_msg_setcap_t),
+	    MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t),
 	    kres));
 	sigreplace(&oldmask, (k_sigset_t *)NULL);
 
@@ -4056,7 +4117,7 @@
 	sigreplace(&newmask, &oldmask);
 	ret = mdmn_ksend_message(MD_MIN2SET(mnum),
 	    MD_MN_MSG_CLU_CHECK,
-	    MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT,
+	    MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0,
 	    (char *)&clumsg, sizeof (clumsg), kresult);
 	sigreplace(&oldmask, (k_sigset_t *)NULL);
 
@@ -4212,3 +4273,23 @@
 
 	return ((hot_spare_pool_t *)0);
 }
+
+/*
+ * md_create_taskq:
+ *
+ * Create a kernel taskq for the given set/unit combination and hand it back
+ * as an opaque pointer. This is typically used to complete a RR_CLEAN request
+ * when the callee is unable to obtain the mutex / condvar access required to
+ * update the DRL safely.
+ */
+void *
+md_create_taskq(set_t setno, minor_t mnum)
+{
+	char	name[20];	/* taskq name: "<setno>/d<unit>" */
+
+	(void) snprintf(name, sizeof (name), "%d/d%d", setno,
+	    MD_MIN2UNIT(mnum));
+
+	return ((void *)ddi_taskq_create(md_devinfo, name, 1,
+	    TASKQ_DEFAULTPRI, 0));
+}
--- a/usr/src/uts/common/io/lvm/mirror/mirror.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/io/lvm/mirror/mirror.c	Wed Dec 24 08:23:40 2008 -0700
@@ -173,6 +173,7 @@
 mirror_parent_init(md_mps_t *ps)
 {
 	bzero(ps, offsetof(md_mps_t, ps_mx));
+	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
 }
 
 /*ARGSUSED1*/
@@ -223,11 +224,17 @@
 
 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
-	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp,
+	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
 	    sizeof (pokehsp), kresult);
 
 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
 		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
+		/* If we're shutting down already, pause things here. */
+		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
+			while (!md_mn_is_commd_present()) {
+				delay(md_hz);
+			}
+		}
 		cmn_err(CE_PANIC,
 		    "ksend_message failure: POKE_HOTSPARES");
 	}
@@ -468,7 +475,7 @@
 		}
 
 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
-		rval = mdmn_ksend_message(setno, msgtype, msgflags,
+		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
 		    (char *)&allochspmsg, sizeof (allochspmsg),
 		    kresult);
 
@@ -491,6 +498,12 @@
 				kmem_free(kresult, sizeof (md_mn_kresult_t));
 				return (1);
 			}
+			/* If we're shutting down already, pause things here. */
+			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
+				while (!md_mn_is_commd_present()) {
+					delay(md_hz);
+				}
+			}
 			cmn_err(CE_PANIC,
 			    "ksend_message failure: ALLOCATE_HOTSPARE");
 		}
@@ -1636,9 +1649,14 @@
 	/*
 	 * For directed mirror read (DMR) we only use the specified side and
 	 * do not compute the source of the read.
+	 * If we're running with MD_MPS_DIRTY_RD set we always return the
+	 * first mirror side (this prevents unnecessary ownership switching).
+	 * Otherwise we return the submirror according to the mirror read option
 	 */
 	if (ps->ps_flags & MD_MPS_DMR) {
 		sm_index = un->un_dmr_last_read;
+	} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
+		sm_index = md_find_nth_unit(running_bm, 0);
 	} else {
 		/* Normal (non-DMR) operation */
 		switch (un->un_read_option) {
@@ -1883,6 +1901,13 @@
 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
 
+	/*
+	 * Allocate rwlocks for un_pernode_dirty_bm accessing.
+	 */
+	for (i = 0; i < MD_MNMAXSIDES; i++) {
+		rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
+	}
+
 	/* place various information in the in-core data structures */
 	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
 	MD_UNIT(MD_SID(un)) = un;
@@ -1903,6 +1928,7 @@
 	uint_t		bits = 0;
 	minor_t		selfid;
 	md_unit_t	*su;
+	int		i;
 
 	md_destroy_unit_incore(mnum, &mirror_md_ops);
 
@@ -1917,6 +1943,15 @@
 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
 	if (un->un_resync_bm)
 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
+	if (un->un_pernode_dirty_sum)
+		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
+
+	/*
+	 * Destroy the taskq for deferred processing of DRL clean requests.
+	 * This taskq will only be present for Multi Owner mirrors.
+	 */
+	if (un->un_drl_task != NULL)
+		ddi_taskq_destroy(un->un_drl_task);
 
 	md_nblocks_set(mnum, -1ULL);
 	MD_UNIT(mnum) = NULL;
@@ -1965,6 +2000,12 @@
 	mutex_destroy(&un->un_dmr_mx);
 	cv_destroy(&un->un_dmr_cv);
 
+	for (i = 0; i < MD_MNMAXSIDES; i++) {
+		rw_destroy(&un->un_pernode_dirty_mx[i]);
+		if (un->un_pernode_dirty_bm[i])
+			kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
+	}
+
 	/*
 	 * Remove self from the namespace
 	 */
@@ -1972,7 +2013,9 @@
 		(void) md_rem_selfname(un->c.un_self_id);
 	}
 
+	/* This frees the unit structure. */
 	mddb_deleterec_wrapper(un->c.un_record_id);
+
 	if (recid != 0)
 		mddb_deleterec_wrapper(recid);
 
@@ -2430,11 +2473,17 @@
 		}
 
 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
-		rval = mdmn_ksend_message(setno, msgtype, msgflags,
+		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
 		    (char *)&stchmsg, sizeof (stchmsg), kresult);
 
 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
+			/* If we're shutting down already, pause things here. */
+			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
+				while (!md_mn_is_commd_present()) {
+					delay(md_hz);
+				}
+			}
 			cmn_err(CE_PANIC,
 			    "ksend_message failure: STATE_UPDATE");
 		}
@@ -3435,11 +3484,12 @@
 	md_mps_t	*ps = (md_mps_t *)dq;
 	buf_t		*pb = ps->ps_bp;
 	mdi_unit_t	*ui = ps->ps_ui;
-	mm_unit_t	*un;
+	mm_unit_t	*un = MD_UNIT(ui->ui_link.ln_id);
 	set_t		setno;
 	int		restart_resync;
 
-	un = md_unit_writerlock(ui);
+	mutex_enter(&un->un_rrp_inflight_mx);
+	(void) md_unit_writerlock(ui);
 	ps->ps_un = un;
 	setno = MD_MIN2SET(getminor(pb->b_edev));
 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
@@ -3447,15 +3497,14 @@
 		 * Synchronize our in-core view of what regions need to be
 		 * resync'd with the on-disk version.
 		 */
-		mutex_enter(&un->un_rrp_inflight_mx);
 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
 		    un->un_dirty_bm);
-		mutex_exit(&un->un_rrp_inflight_mx);
 
 		/* Region dirty map is now up to date */
 	}
 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
 	md_unit_writerexit(ui);
+	mutex_exit(&un->un_rrp_inflight_mx);
 
 	/* Restart the resync thread if it was previously blocked */
 	if (restart_resync) {
@@ -3581,9 +3630,8 @@
 
 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 			rval = mdmn_ksend_message(setno,
-			    MD_MN_MSG_REQUIRE_OWNER, msg_flags,
-			    /* flags */ (char *)msg,
-			    sizeof (md_mn_req_owner_t), kres);
+			    MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
+			    (char *)msg, sizeof (md_mn_req_owner_t), kres);
 
 			kmem_free(msg, sizeof (md_mn_req_owner_t));
 
@@ -3890,19 +3938,19 @@
 	}
 
 	/*
-	 * For Multinode mirrors with a Resync Region (not ABR) we need to
-	 * become the mirror owner before continuing with the write(). For ABR
-	 * mirrors we check that we 'own' the resync if we're in
-	 * write-after-read mode. We do this _after_ ensuring that there are no
-	 * overlaps to ensure that the once we know that we are the owner, the
-	 * readerlock will not released until the write is complete. As a
-	 * change of ownership in a MN set requires the writerlock, this
-	 * ensures that ownership cannot be changed until the write is
-	 * complete
+	 * For Multinode mirrors with no owner and a Resync Region (not ABR)
+	 * we need to become the mirror owner before continuing with the
+	 * write(). For ABR mirrors we check that we 'own' the resync if
+	 * we're in write-after-read mode. We do this _after_ ensuring that
+	 * there are no overlaps to ensure that once we know that we are
+	 * the owner, the readerlock will not be released until the write is
+	 * complete. As a change of ownership in a MN set requires the
+	 * writerlock, this ensures that ownership cannot be changed until
+	 * the write is complete.
 	 */
 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
-		if (!MD_MN_MIRROR_OWNER(un))  {
+		if (MD_MN_NO_MIRROR_OWNER(un))  {
 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
 				mirror_overlap_tree_remove(ps);
 			md_kstat_waitq_exit(ui);
@@ -3922,10 +3970,11 @@
 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
 	    !(flag & MD_STR_WAR)) {
 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
-		    ps->ps_lastblk)) {
+		    ps->ps_lastblk, md_mn_mynode_id)) {
 			pb->b_flags |= B_ERROR;
 			pb->b_resid = pb->b_bcount;
-			ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
+			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
+				mirror_overlap_tree_remove(ps);
 			kmem_cache_free(mirror_parent_cache, ps);
 			md_kstat_waitq_exit(ui);
 			md_unit_readerexit(ui);
@@ -4169,9 +4218,9 @@
 
 				/*
 				 * Before reading the buffer, see if
-				 * we are the owner
+				 * there is an owner.
 				 */
-				if (!MD_MN_MIRROR_OWNER(un))  {
+				if (MD_MN_NO_MIRROR_OWNER(un))  {
 					ps->ps_call = NULL;
 					mirror_overlap_tree_remove(ps);
 					md_kstat_waitq_exit(ui);
@@ -4506,6 +4555,7 @@
 	md_error_t		mde = mdnullerror;
 	md_mps_t		*ps;
 	int			rs_active;
+	int			rr, rr_start, rr_end;
 
 	/* Check that the given device is part of a multi-node set */
 	setno = MD_MIN2SET(p->mnum);
@@ -4580,6 +4630,25 @@
 
 		if (p->rs_originator != md_mn_mynode_id) {
 			/*
+			 * Clear our un_resync_bm for the regions completed.
+			 * The owner (originator) will take care of itself.
+			 */
+			BLK_TO_RR(rr_end, ps->ps_lastblk, un);
+			BLK_TO_RR(rr_start, p->rs_start, un);
+			if (ps->ps_lastblk && rr_end < rr_start) {
+				BLK_TO_RR(rr_start, ps->ps_firstblk, un);
+				mutex_enter(&un->un_resync_mx);
+				/*
+				 * Update our resync bitmap to reflect that
+				 * another node has synchronized this range.
+				 */
+				for (rr = rr_start; rr <= rr_end; rr++) {
+					CLR_KEEPDIRTY(rr, un);
+				}
+				mutex_exit(&un->un_resync_mx);
+			}
+
+			/*
 			 * On all but the originating node, first update
 			 * the resync state, then unblock the previous
 			 * region and block the next one. No need
@@ -4654,6 +4723,7 @@
 				    &p->mde, lockp);
 			}
 		}
+
 		break;
 	case MD_MN_MSG_RESYNC_FINISH:
 		/*
@@ -4792,6 +4862,24 @@
 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
 				if (!broke_out)
 					un->c.un_status &= ~MD_UN_WAR;
+
+				/*
+				 * Clear our un_resync_bm for the regions
+				 * completed.  The owner (originator) will
+				 * take care of itself.
+				 */
+				if (p->rs_originator != md_mn_mynode_id &&
+				    (ps = un->un_rs_prev_overlap) != NULL) {
+					BLK_TO_RR(rr_start, ps->ps_firstblk,
+					    un);
+					BLK_TO_RR(rr_end, ps->ps_lastblk, un);
+					mutex_enter(&un->un_resync_mx);
+					for (rr = rr_start; rr <= rr_end;
+					    rr++) {
+						CLR_KEEPDIRTY(rr, un);
+					}
+					mutex_exit(&un->un_resync_mx);
+				}
 			}
 
 			/*
--- a/usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c	Wed Dec 24 08:23:40 2008 -0700
@@ -1624,7 +1624,7 @@
 
 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 	rval = mdmn_ksend_message(setno, MD_MN_MSG_CHOOSE_OWNER,
-	    MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, (char *)msg,
+	    MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)msg,
 	    sizeof (md_mn_msg_chooseid_t), kres);
 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
 		mdmn_ksend_show_error(rval, kres, "CHOOSE OWNER");
@@ -1664,7 +1664,8 @@
 
 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 	rval = mdmn_ksend_message(setno, MD_MN_MSG_REQUIRE_OWNER,
-	    MD_MSGF_NO_LOG, (char *)ownp, sizeof (md_mn_req_owner_t), kresult);
+	    MD_MSGF_NO_LOG, 0, (char *)ownp, sizeof (md_mn_req_owner_t),
+	    kresult);
 
 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
 		/*
@@ -2358,7 +2359,7 @@
 
 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 	rval = mdmn_ksend_message(setno, MD_MN_MSG_GET_MIRROR_STATE,
-	    MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, (char *)&msg,
+	    MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)&msg,
 	    sizeof (msg), kres);
 
 	/* if the node hasn't yet joined, it's Ok. */
@@ -2949,6 +2950,42 @@
 		break;
 	}
 
+	case MD_MN_RR_DIRTY:
+	{
+		sz = sizeof (md_mn_rr_dirty_params_t);
+		d = kmem_zalloc(sz, KM_SLEEP);
+
+		if (ddi_copyin(data, d, sz, mode)) {
+			err = EFAULT;
+			break;
+		}
+
+		err = mirror_set_dirty_rr((md_mn_rr_dirty_params_t *)d);
+		break;
+	}
+
+	case MD_MN_RR_CLEAN:
+	{
+		md_mn_rr_clean_params_t tmp;
+
+		/* get the first part of the structure to find the size */
+		if (ddi_copyin(data, &tmp, sizeof (tmp), mode)) {
+			err = EFAULT;
+			break;
+		}
+
+		sz = MDMN_RR_CLEAN_PARAMS_SIZE(&tmp);
+		d = kmem_zalloc(sz, KM_SLEEP);
+
+		if (ddi_copyin(data, d, sz, mode)) {
+			err = EFAULT;
+			break;
+		}
+
+		err = mirror_set_clean_rr((md_mn_rr_clean_params_t *)d);
+		break;
+	}
+
 	default:
 		return (ENOTTY);
 	}
--- a/usr/src/uts/common/io/lvm/mirror/mirror_resync.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/io/lvm/mirror/mirror_resync.c	Wed Dec 24 08:23:40 2008 -0700
@@ -24,8 +24,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
@@ -67,7 +65,7 @@
 
 extern md_ops_t		mirror_md_ops;
 extern kmem_cache_t	*mirror_child_cache; /* mirror child memory pool */
-extern mdq_anchor_t	md_mstr_daemon;
+extern mdq_anchor_t	md_mto_daemon;
 extern daemon_request_t	mirror_timeout;
 extern md_resync_t	md_cpr_resync;
 extern clock_t		md_hz;
@@ -141,81 +139,365 @@
  */
 int md_max_xfer_bufsz = 2048;
 
+/*
+ * mirror_generate_rr_bitmap:
+ * -------------------
+ * Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean
+ * bitmap associated with mirror 'un'
+ *
+ * Input:
+ *      un      - mirror unit to get bitmap data from
+ *      *msgp   - location to return newly allocated md_mn_msg_rr_clean_t
+ *      *activep- location to return # of active i/os
+ *
+ * Returns:
+ *      1 => dirty bits cleared from un_dirty_bm and DRL flush required
+ *          *msgp contains bitmap of to-be-cleared bits
+ *      0 => no bits cleared
+ *          *msgp == NULL
+ */
 static int
-process_resync_regions(mm_unit_t *un)
+mirror_generate_rr_bitmap(mm_unit_t *un, md_mn_msg_rr_clean_t **msgp,
+    int *activep)
 {
-	int			i;
+	unsigned int	i, next_bit, data_bytes, start_bit;
+	int		cleared_dirty = 0;
+
+	/* Skip any initial 0s. */
+retry_dirty_scan:
+	if ((start_bit = un->un_rr_clean_start_bit) >= un->un_rrd_num)
+		un->un_rr_clean_start_bit = start_bit = 0;
+
+	/*
+	 * Handle case where NO bits are set in PERNODE_DIRTY but the
+	 * un_dirty_bm[] map does have entries set (after a 1st resync)
+	 */
+	for (; start_bit < un->un_rrd_num &&
+	    !IS_PERNODE_DIRTY(md_mn_mynode_id, start_bit, un) &&
+	    (un->un_pernode_dirty_sum[start_bit] != (uchar_t)0); start_bit++)
+		;
+
+	if (start_bit >= un->un_rrd_num) {
+		if (un->un_rr_clean_start_bit == 0) {
+			return (0);
+		} else {
+			un->un_rr_clean_start_bit = 0;
+			goto retry_dirty_scan;
+		}
+	}
+
+	/* how much to fit into this message */
+	data_bytes = MIN(howmany(un->un_rrd_num - start_bit, NBBY),
+	    MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES);
+
+	(*msgp) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes),
+	    KM_SLEEP);
+
+	(*msgp)->rr_nodeid = md_mn_mynode_id;
+	(*msgp)->rr_mnum = MD_SID(un);
+	MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp, start_bit, data_bytes);
+
+	next_bit = MIN(start_bit + data_bytes * NBBY, un->un_rrd_num);
+
+	for (i = start_bit; i < next_bit; i++) {
+		if (un->c.un_status & MD_UN_KEEP_DIRTY && IS_KEEPDIRTY(i, un)) {
+			continue;
+		}
+		if (!IS_REGION_DIRTY(i, un)) {
+			continue;
+		}
+		if (un->un_outstanding_writes[i] != 0) {
+			(*activep)++;
+			continue;
+		}
+
+		/*
+		 * Handle the case where a resync has completed and we still
+		 * have the un_dirty_bm[] entries marked as dirty (these are
+		 * the most recent DRL re-read from the replica). They need
+		 * to be cleared from our un_dirty_bm[] but they will not have
+		 * corresponding un_pernode_dirty[] entries set unless (and
+		 * until) further write()s have been issued to the area.
+		 * This handles the case where only the un_dirty_bm[] entry is
+		 * set. Without this we'd not clear this region until a local
+		 * write is issued to the affected area.
+		 */
+		if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un) ||
+		    (un->un_pernode_dirty_sum[i] == (uchar_t)0)) {
+			if (!IS_GOING_CLEAN(i, un)) {
+				SET_GOING_CLEAN(i, un);
+				(*activep)++;
+				continue;
+			}
+			/*
+			 * Now we've got a flagged pernode_dirty, _or_ a clean
+			 * bitmap entry to process. Update the bitmap to flush
+			 * the REGION_DIRTY / GOING_CLEAN bits when we send the
+			 * cross-cluster message.
+			 */
+			cleared_dirty++;
+			setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp), i - start_bit);
+		} else {
+			/*
+			 * Not marked as active in the pernode bitmap, so skip
+			 * any update to this. We just increment the 0 count
+			 * and adjust the active count by any outstanding
+			 * un_pernode_dirty_sum[] entries. This means we don't
+			 * leave the mirror permanently dirty.
+			 */
+			(*activep) += (int)un->un_pernode_dirty_sum[i];
+		}
+	}
+	if (!cleared_dirty) {
+		kmem_free(*msgp, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes));
+		*msgp = NULL;
+	}
+	un->un_rr_clean_start_bit = next_bit;
+	return (cleared_dirty);
+}
+
+/*
+ * There are three paths into here:
+ *
+ * md_daemon -> check_resync_regions -> prr
+ * mirror_internal_close -> mirror_process_unit_resync -> prr
+ * mirror_set_capability -> mirror_process_unit_resync -> prr
+ *
+ * The first one is a kernel daemon, the other two result from system calls.
+ * Thus, only the first case needs to deal with kernel CPR activity.  This
+ * is indicated by the cprinfop being non-NULL for kernel daemon calls, and
+ * NULL for system call paths.
+ */
+static int
+process_resync_regions_non_owner(mm_unit_t *un, callb_cpr_t *cprinfop)
+{
+	int			i, start, end;
 	int			cleared_dirty = 0;
-	/*
-	 * Number of reasons why we can not
-	 * proceed shutting down the mirror.
-	 */
+	/* Number of reasons why we can not proceed shutting down the mirror. */
 	int			active = 0;
 	set_t			setno = MD_UN2SET(un);
+	md_mn_msg_rr_clean_t	*rmsg;
+	md_mn_kresult_t		*kres;
+	int			rval;
+	minor_t			mnum = MD_SID(un);
+	mdi_unit_t		*ui = MDI_UNIT(mnum);
+	md_mn_nodeid_t		owner_node;
 
 	/*
-	 * Resync region processing must be
-	 * single threaded. We can't use
-	 * un_resync_mx for this purpose
-	 * since this mutex gets released
+	 * We drop the readerlock here to assist lock ordering with
+	 * update_resync.  Once we have the un_rrp_inflight_mx, we
+	 * can re-acquire it.
+	 */
+	md_unit_readerexit(ui);
+
+	/*
+	 * Resync region processing must be single threaded. We can't use
+	 * un_resync_mx for this purpose since this mutex gets released
 	 * when blocking on un_resync_cv.
 	 */
 	mutex_enter(&un->un_rrp_inflight_mx);
 
+	(void) md_unit_readerlock(ui);
+
 	mutex_enter(&un->un_resync_mx);
-	while (un->un_resync_flg & MM_RF_STALL_CLEAN)
-		cv_wait(&un->un_resync_cv, &un->un_resync_mx);
-
-	/*
-	 * For a mirror we can only update the resync-record if we currently
-	 * own the mirror. If we are called and we don't have ownership we bail
-	 * out before scanning the outstanding_writes[] array. This cannot be
-	 * set as we'd have become the owner before initiating the i/o to the
-	 * mirror.
-	 * NOTE: we only need to check here (before scanning the array) as we
-	 *	 are called with the readerlock held. This means that a change
-	 *	 of ownership away from us will block until this resync check
-	 *	 has completed.
-	 */
-	if (MD_MNSET_SETNO(setno)) {
-		if (!MD_MN_MIRROR_OWNER(un)) {
-			mutex_exit(&un->un_resync_mx);
+
+	rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1], RW_READER);
+	cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
+	rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
+
+	if (cleared_dirty) {
+		owner_node = un->un_mirror_owner;
+		mutex_exit(&un->un_resync_mx);
+
+		/*
+		 * Transmit the 'to-be-cleared' bitmap to all cluster nodes.
+		 * Receipt of the message will cause the mirror owner to
+		 * update the on-disk DRL.
+		 */
+
+		kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
+
+		/* release readerlock before sending message */
+		md_unit_readerexit(ui);
+
+		if (cprinfop) {
+			mutex_enter(&un->un_prr_cpr_mx);
+			CALLB_CPR_SAFE_BEGIN(cprinfop);
+		}
+
+		rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_CLEAN,
+		    MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_KSEND_NORETRY|
+		    MD_MSGF_DIRECTED, un->un_mirror_owner,
+		    (char *)rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg), kres);
+
+		if (cprinfop) {
+			CALLB_CPR_SAFE_END(cprinfop, &un->un_prr_cpr_mx);
+			mutex_exit(&un->un_prr_cpr_mx);
+		}
+
+		/* reacquire readerlock after message */
+		(void) md_unit_readerlock(ui);
+
+		if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
+		    (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
+			/* if commd is gone, no point in printing a message */
+			if (md_mn_is_commd_present())
+				mdmn_ksend_show_error(rval, kres, "RR_CLEAN");
+			kmem_free(kres, sizeof (md_mn_kresult_t));
+			kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
+			mutex_exit(&un->un_rrp_inflight_mx);
+			return (active);
+		}
+		kmem_free(kres, sizeof (md_mn_kresult_t));
+
+		/*
+		 * If ownership changed while we were sending, the message
+		 * probably went to the wrong node.  Free it; retry next cycle.
+		 */
+		if (un->un_mirror_owner != owner_node) {
+			kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
 			mutex_exit(&un->un_rrp_inflight_mx);
 			return (active);
 		}
+
+		/*
+		 * Now that we've sent the message, clear them from the
+		 * pernode_dirty arrays.  These are ONLY cleared on a
+		 * successful send, and failure has no impact.
+		 */
+		cleared_dirty = 0;
+		start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
+		end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
+		mutex_enter(&un->un_resync_mx);
+		rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
+		    RW_READER);
+		for (i = start; i < end; i++) {
+			if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
+			    i - start)) {
+				if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un)) {
+					un->un_pernode_dirty_sum[i]--;
+					CLR_PERNODE_DIRTY(md_mn_mynode_id, i,
+					    un);
+				}
+				if (IS_REGION_DIRTY(i, un)) {
+					cleared_dirty++;
+					CLR_REGION_DIRTY(i, un);
+					CLR_GOING_CLEAN(i, un);
+				}
+			}
+		}
+		rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
+
+		kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
 	}
-
-	for (i = 0; i < un->un_rrd_num; i++) {
-
-		if (un->c.un_status & MD_UN_KEEP_DIRTY)
-			if (IS_KEEPDIRTY(i, un))
+	mutex_exit(&un->un_resync_mx);
+
+	mutex_exit(&un->un_rrp_inflight_mx);
+
+	return (active);
+}
+
+static int
+process_resync_regions_owner(mm_unit_t *un)
+{
+	int			i, start, end;
+	int			cleared_dirty = 0;
+	/* Number of reasons why we can not proceed shutting down the mirror. */
+	int			active = 0;
+	set_t			setno = MD_UN2SET(un);
+	int			mnset = MD_MNSET_SETNO(setno);
+	md_mn_msg_rr_clean_t	*rmsg;
+	minor_t			mnum = MD_SID(un);
+	mdi_unit_t		*ui = MDI_UNIT(mnum);
+
+	/*
+	 * We drop the readerlock here to assist lock ordering with
+	 * update_resync.  Once we have the un_rrp_inflight_mx, we
+	 * can re-acquire it.
+	 */
+	md_unit_readerexit(ui);
+
+	/*
+	 * Resync region processing must be single threaded. We can't use
+	 * un_resync_mx for this purpose since this mutex gets released
+	 * when blocking on un_resync_cv.
+	 */
+	mutex_enter(&un->un_rrp_inflight_mx);
+
+	(void) md_unit_readerlock(ui);
+
+	mutex_enter(&un->un_resync_mx);
+	un->un_waiting_to_clear++;
+	while (un->un_resync_flg & MM_RF_STALL_CLEAN)
+		cv_wait(&un->un_resync_cv, &un->un_resync_mx);
+	un->un_waiting_to_clear--;
+
+	if (mnset) {
+		rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
+		    RW_READER);
+		cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
+
+		if (cleared_dirty) {
+			/*
+			 * Clear the bits from the pernode_dirty arrays.
+			 * If that results in any being cleared from the
+			 * un_dirty_bm, commit it.
+			 */
+			cleared_dirty = 0;
+			start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
+			end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
+			for (i = start; i < end; i++) {
+				if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
+				    i - start)) {
+					if (IS_PERNODE_DIRTY(md_mn_mynode_id, i,
+					    un)) {
+						un->un_pernode_dirty_sum[i]--;
+						CLR_PERNODE_DIRTY(
+						    md_mn_mynode_id, i, un);
+					}
+					if (un->un_pernode_dirty_sum[i] == 0) {
+						cleared_dirty++;
+						CLR_REGION_DIRTY(i, un);
+						CLR_GOING_CLEAN(i, un);
+					}
+				}
+			}
+			kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
+		}
+		rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
+	} else {
+		for (i = 0; i < un->un_rrd_num; i++) {
+			if (un->c.un_status & MD_UN_KEEP_DIRTY)
+				if (IS_KEEPDIRTY(i, un))
+					continue;
+
+			if (!IS_REGION_DIRTY(i, un))
 				continue;
-
-		if (!IS_REGION_DIRTY(i, un))
-			continue;
-		if (un->un_outstanding_writes[i] != 0) {
-			active++;
-			continue;
+			if (un->un_outstanding_writes[i] != 0) {
+				active++;
+				continue;
+			}
+
+			if (!IS_GOING_CLEAN(i, un)) {
+				SET_GOING_CLEAN(i, un);
+				active++;
+				continue;
+			}
+			CLR_REGION_DIRTY(i, un);
+			CLR_GOING_CLEAN(i, un);
+			cleared_dirty++;
 		}
-
-		if (!IS_GOING_CLEAN(i, un)) {
-			SET_GOING_CLEAN(i, un);
-			active++;
-			continue;
-		}
-		CLR_REGION_DIRTY(i, un);
-		CLR_GOING_CLEAN(i, un);
-		cleared_dirty = 1;
 	}
+
 	if (cleared_dirty) {
 		un->un_resync_flg |= MM_RF_GATECLOSED;
 		mutex_exit(&un->un_resync_mx);
-
 		mddb_commitrec_wrapper(un->un_rr_dirty_recid);
-
 		mutex_enter(&un->un_resync_mx);
 		un->un_resync_flg &= ~MM_RF_GATECLOSED;
-		if (un->un_waiting_to_mark != 0) {
+
+		if (un->un_waiting_to_mark != 0 ||
+		    un->un_waiting_to_clear != 0) {
 			active++;
 			cv_broadcast(&un->un_resync_cv);
 		}
@@ -227,6 +509,29 @@
 	return (active);
 }
 
+static int
+process_resync_regions(mm_unit_t *un, callb_cpr_t *cprinfop)
+{
+	int	mnset = MD_MNSET_SETNO(MD_UN2SET(un));
+	/*
+	 * For a mirror we can only update the on-disk resync-record if we
+	 * currently own the mirror. If we are called and there is no owner we
+	 * bail out before scanning the outstanding_writes[] array.
+	 * NOTE: we only need to check here (before scanning the array) as we
+	 * 	are called with the readerlock held. This means that a change
+	 * 	of ownership away from us will block until this resync check
+	 * 	has completed.
+	 */
+	if (mnset && (MD_MN_NO_MIRROR_OWNER(un) ||
+	    (!MD_MN_MIRROR_OWNER(un) && !md_mn_is_commd_present_lite()))) {
+		return (0);
+	} else if (mnset && !MD_MN_MIRROR_OWNER(un)) {
+		return (process_resync_regions_non_owner(un, cprinfop));
+	} else {
+		return (process_resync_regions_owner(un));
+	}
+}
+
 /*
  * Function that is callable from other modules to provide
  * ability to cleanup dirty region bitmap on demand. Used
@@ -240,7 +545,7 @@
 {
 	int	cleans = 0;
 
-	while (process_resync_regions(un)) {
+	while (process_resync_regions(un, NULL)) {
 
 		cleans++;
 		if (cleans >= md_mirror_rr_cleans) {
@@ -265,6 +570,7 @@
 	mdi_unit_t	*ui;
 	mm_unit_t	*un;
 	md_link_t	*next;
+	callb_cpr_t	cprinfo;
 
 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
@@ -272,8 +578,18 @@
 		if (md_get_setstatus(next->ln_setno) & MD_SET_STALE)
 			continue;
 
+		un = MD_UNIT(next->ln_id);
+
+		/*
+		 * Register this resync thread with the CPR mechanism. This
+		 * allows us to detect when the system is suspended and so
+		 * keep track of the RPC failure condition.
+		 */
+		CALLB_CPR_INIT(&cprinfo, &un->un_prr_cpr_mx, callb_md_mrs_cpr,
+		    "check_resync_regions");
+
 		ui = MDI_UNIT(next->ln_id);
-		un = (mm_unit_t *)md_unit_readerlock(ui);
+		(void) md_unit_readerlock(ui);
 
 		/*
 		 * Do not clean up resync regions if it is an ABR
@@ -287,8 +603,13 @@
 			continue;
 		}
 
-		(void) process_resync_regions(un);
+		(void) process_resync_regions(un, &cprinfo);
+
 		md_unit_readerexit(ui);
+
+		/* Remove this thread from the CPR callback table. */
+		mutex_enter(&un->un_prr_cpr_mx);
+		CALLB_CPR_EXIT(&cprinfo);
 	}
 
 	rw_exit(&mirror_md_ops.md_link_rw.lock);
@@ -306,7 +627,7 @@
 	mutex_enter(&mirror_timeout.dr_mx);
 	if (!mirror_timeout.dr_pending) {
 		mirror_timeout.dr_pending = 1;
-		daemon_request(&md_mstr_daemon, check_resync_regions,
+		daemon_request(&md_mto_daemon, check_resync_regions,
 		    (daemon_queue_t *)&mirror_timeout, REQ_OLD);
 	}
 
@@ -466,6 +787,7 @@
 	un->un_resync_flg = 0;
 	un->un_waiting_to_mark = 0;
 	un->un_waiting_to_commit = 0;
+	un->un_waiting_to_clear = 0;
 
 	un->un_goingclean_bm = NULL;
 	un->un_goingdirty_bm = NULL;
@@ -505,6 +827,27 @@
 	un->un_resync_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
 	    un->un_rrd_num, NBBY)), KM_SLEEP);
 
+	/*
+	 * Allocate pernode bitmap for this node. All other nodes' maps will
+	 * be created 'on-the-fly' in the ioctl message handler
+	 */
+	if (MD_MNSET_SETNO(MD_UN2SET(un))) {
+		un->un_pernode_dirty_sum =
+		    (uchar_t *)kmem_zalloc(un->un_rrd_num, KM_SLEEP);
+		if (md_mn_mynode_id > 0) {
+			un->un_pernode_dirty_bm[md_mn_mynode_id-1] = (uchar_t *)
+			    kmem_zalloc((uint_t)(howmany(un->un_rrd_num, NBBY)),
+			    KM_SLEEP);
+		}
+
+		/*
+		 * Allocate taskq to process deferred (due to locking) RR_CLEAN
+		 * requests.
+		 */
+		un->un_drl_task = (ddi_taskq_t *)md_create_taskq(MD_UN2SET(un),
+		    MD_SID(un));
+	}
+
 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
 		return (0);
 
@@ -734,7 +1077,7 @@
 	CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
 
 	rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_PHASE_DONE,
-	    MD_MSGF_NO_LOG, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
+	    MD_MSGF_NO_LOG, 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
 
 	CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
 	mutex_exit(&un->un_rs_cpr_mx);
@@ -743,6 +1086,12 @@
 	if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
 	    (kres->kmmr_comm_state !=  MDMNE_NOT_JOINED)) {
 		mdmn_ksend_show_error(rval, kres, "RESYNC_PHASE_DONE");
+		/* If we're shutting down already, pause things here. */
+		if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
+			while (!md_mn_is_commd_present()) {
+				delay(md_hz);
+			}
+		}
 		cmn_err(CE_PANIC, "ksend_message failure: RESYNC_PHASE_DONE");
 	}
 	kmem_free(kres, sizeof (md_mn_kresult_t));
@@ -814,13 +1163,19 @@
 	CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
 
 	rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_NEXT, MD_MSGF_NO_LOG,
-	    (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
+	    0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
 
 	CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
 	mutex_exit(&un->un_rs_cpr_mx);
 
 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
 		mdmn_ksend_show_error(rval, kres, "RESYNC_NEXT");
+		/* If we're shutting down already, pause things here. */
+		if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
+			while (!md_mn_is_commd_present()) {
+				delay(md_hz);
+			}
+		}
 		cmn_err(CE_PANIC, "ksend_message failure: RESYNC_NEXT");
 	}
 	kmem_free(kres, sizeof (md_mn_kresult_t));
@@ -2301,7 +2656,7 @@
 			CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
 
 			rval = mdmn_ksend_message(setno,
-			    MD_MN_MSG_RESYNC_FINISH, MD_MSGF_NO_LOG,
+			    MD_MN_MSG_RESYNC_FINISH, MD_MSGF_NO_LOG, 0,
 			    (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
 
 			CALLB_CPR_SAFE_END(&un->un_rs_cprinfo,
@@ -2311,6 +2666,12 @@
 			if (!MDMN_KSEND_MSG_OK(rval, kres)) {
 				mdmn_ksend_show_error(rval, kres,
 				    "RESYNC_FINISH");
+				/* If we're shutting down, pause things here. */
+				if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
+					while (!md_mn_is_commd_present()) {
+						delay(md_hz);
+					}
+				}
 				cmn_err(CE_PANIC,
 				    "ksend_message failure: RESYNC_FINISH");
 			}
@@ -2693,30 +3054,209 @@
 }
 
 int
-mirror_mark_resync_region(struct mm_unit *un,
-	diskaddr_t startblk, diskaddr_t endblk)
+mirror_mark_resync_region_non_owner(struct mm_unit *un,
+	diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
 {
-	int		no_change;
-	size_t		start_rr;
-	size_t		current_rr;
-	size_t		end_rr;
+	int			no_change;
+	size_t			start_rr;
+	size_t			current_rr;
+	size_t			end_rr;
+	md_mn_msg_rr_dirty_t	*rr;
+	md_mn_kresult_t		*kres;
+	set_t			setno = MD_UN2SET(un);
+	int			rval;
+	md_mn_nodeid_t		node_idx = source_node - 1;
+	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
+	md_mn_nodeid_t		owner_node;
+	minor_t			mnum = MD_SID(un);
 
 	if (un->un_nsm < 2)
 		return (0);
 
+	/*
+	 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
+	 * not, allocate it and then fill the [start..end] entries.
+	 * Update un_pernode_dirty_sum if we've gone 0->1.
+	 * Update un_dirty_bm if the corresponding entries are clear.
+	 */
+	rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
+	if (un->un_pernode_dirty_bm[node_idx] == NULL) {
+		un->un_pernode_dirty_bm[node_idx] =
+		    (uchar_t *)kmem_zalloc(
+		    (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
+	}
+	rw_exit(&un->un_pernode_dirty_mx[node_idx]);
+
 	BLK_TO_RR(end_rr, endblk, un);
 	BLK_TO_RR(start_rr, startblk, un);
-	mutex_enter(&un->un_resync_mx);
 
 	no_change = 1;
+
+	mutex_enter(&un->un_resync_mx);
+	rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
 	for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
 		un->un_outstanding_writes[current_rr]++;
+		if (!IS_PERNODE_DIRTY(source_node, current_rr, un)) {
+			un->un_pernode_dirty_sum[current_rr]++;
+			SET_PERNODE_DIRTY(source_node, current_rr, un);
+		}
+		CLR_GOING_CLEAN(current_rr, un);
+		if (!IS_REGION_DIRTY(current_rr, un)) {
+			no_change = 0;
+			SET_REGION_DIRTY(current_rr, un);
+			SET_GOING_DIRTY(current_rr, un);
+		} else if (IS_GOING_DIRTY(current_rr, un))
+			no_change = 0;
+	}
+	rw_exit(&un->un_pernode_dirty_mx[node_idx]);
+	mutex_exit(&un->un_resync_mx);
+
+	if (no_change) {
+		return (0);
+	}
+
+	/*
+	 * If we have dirty regions to commit, send a
+	 * message to the owning node so that the
+	 * in-core bitmap gets updated appropriately.
+	 * TODO: make this a kmem_cache pool to improve
+	 * alloc/free performance ???
+	 */
+	kres = (md_mn_kresult_t *)kmem_zalloc(sizeof (md_mn_kresult_t),
+	    KM_SLEEP);
+	rr = (md_mn_msg_rr_dirty_t *)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t),
+	    KM_SLEEP);
+
+resend_mmrr:
+	owner_node = un->un_mirror_owner;
+
+	rr->rr_mnum = mnum;
+	rr->rr_nodeid = md_mn_mynode_id;
+	rr->rr_range = (ushort_t)start_rr << 16;
+	rr->rr_range |= (ushort_t)end_rr & 0xFFFF;
+
+	/* release readerlock before sending message */
+	md_unit_readerexit(ui);
+
+	rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_DIRTY,
+	    MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_DIRECTED,
+	    un->un_mirror_owner, (char *)rr,
+	    sizeof (md_mn_msg_rr_dirty_t), kres);
+
+	/* reacquire readerlock on message completion */
+	(void) md_unit_readerlock(ui);
+
+	/* if the message send failed, note it, and pass an error back up */
+	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
+		/* if commd is gone, no point in printing a message */
+		if (md_mn_is_commd_present())
+			mdmn_ksend_show_error(rval, kres, "RR_DIRTY");
+		kmem_free(kres, sizeof (md_mn_kresult_t));
+		kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
+		return (1);
+	}
+
+	/*
+	 * if the owner changed while we were sending the message, and it's
+	 * not us, the new mirror owner won't yet have done the right thing
+	 * with our data.  Let him know.  If we became the owner, we'll
+	 * deal with that differently below.  Note that receiving a message
+	 * about another node twice won't hurt anything.
+	 */
+	if (un->un_mirror_owner != owner_node && !MD_MN_MIRROR_OWNER(un))
+		goto resend_mmrr;
+
+	kmem_free(kres, sizeof (md_mn_kresult_t));
+	kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
+
+	mutex_enter(&un->un_resync_mx);
+
+	/*
+	 * If we became the owner while we were sending the message,
+	 * we have dirty bits in the un_pernode_bm that aren't yet reflected
+	 * in the un_dirty_bm, as it was re-read from disk, and our bits
+	 * are also not reflected in the on-disk DRL.  Fix that now.
+	 */
+	if (MD_MN_MIRROR_OWNER(un)) {
+		rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
+		mirror_copy_rr(howmany(un->un_rrd_num, NBBY),
+		    un->un_pernode_dirty_bm[node_idx], un->un_dirty_bm);
+		rw_exit(&un->un_pernode_dirty_mx[node_idx]);
+
+		un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
+
+		mutex_exit(&un->un_resync_mx);
+		mddb_commitrec_wrapper(un->un_rr_dirty_recid);
+		mutex_enter(&un->un_resync_mx);
+
+		un->un_resync_flg &= ~(MM_RF_COMMITING | MM_RF_GATECLOSED);
+		cv_broadcast(&un->un_resync_cv);
+	}
+
+	for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
+		CLR_GOING_DIRTY(current_rr, un);
+
+	mutex_exit(&un->un_resync_mx);
+
+	return (0);
+}
+
+int
+mirror_mark_resync_region_owner(struct mm_unit *un,
+	diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
+{
+	int			no_change;
+	size_t			start_rr;
+	size_t			current_rr;
+	size_t			end_rr;
+	int			mnset = MD_MNSET_SETNO(MD_UN2SET(un));
+	md_mn_nodeid_t		node_idx = source_node - 1;
+
+	if (un->un_nsm < 2)
+		return (0);
+
+	/*
+	 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
+	 * not, allocate it and then fill the [start..end] entries.
+	 * Update un_pernode_dirty_sum if we've gone 0->1.
+	 * Update un_dirty_bm if the corresponding entries are clear.
+	 */
+	if (mnset) {
+		rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
+		if (un->un_pernode_dirty_bm[node_idx] == NULL) {
+			un->un_pernode_dirty_bm[node_idx] =
+			    (uchar_t *)kmem_zalloc(
+			    (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
+		}
+		rw_exit(&un->un_pernode_dirty_mx[node_idx]);
+	}
+
+	mutex_enter(&un->un_resync_mx);
+
+	if (mnset)
+		rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
+
+	no_change = 1;
+	BLK_TO_RR(end_rr, endblk, un);
+	BLK_TO_RR(start_rr, startblk, un);
+	for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
+		if (!mnset || source_node == md_mn_mynode_id)
+			un->un_outstanding_writes[current_rr]++;
+		if (mnset) {
+			if (!IS_PERNODE_DIRTY(source_node, current_rr, un))
+				un->un_pernode_dirty_sum[current_rr]++;
+			SET_PERNODE_DIRTY(source_node, current_rr, un);
+		}
 		CLR_GOING_CLEAN(current_rr, un);
 		if (!IS_REGION_DIRTY(current_rr, un))
 			no_change = 0;
 		if (IS_GOING_DIRTY(current_rr, un))
 			no_change = 0;
 	}
+
+	if (mnset)
+		rw_exit(&un->un_pernode_dirty_mx[node_idx]);
+
 	if (no_change) {
 		mutex_exit(&un->un_resync_mx);
 		return (0);
@@ -2741,7 +3281,7 @@
 		}
 	}
 	if (no_change) {
-		if (un->un_waiting_to_mark == 0)
+		if (un->un_waiting_to_mark == 0 || un->un_waiting_to_clear != 0)
 			cv_broadcast(&un->un_resync_cv);
 		mutex_exit(&un->un_resync_mx);
 		return (0);
@@ -2749,19 +3289,21 @@
 
 	un->un_resync_flg |= MM_RF_COMMIT_NEEDED;
 	un->un_waiting_to_commit++;
-	while ((un->un_waiting_to_mark != 0) &&
-	    (!(un->un_resync_flg & MM_RF_GATECLOSED))) {
+	while (un->un_waiting_to_mark != 0 &&
+	    !(un->un_resync_flg & MM_RF_GATECLOSED)) {
 		if (panicstr)
 			return (1);
 		cv_wait(&un->un_resync_cv, &un->un_resync_mx);
 	}
 
-	if ((un->un_resync_flg & MM_RF_COMMIT_NEEDED)) {
+	if (un->un_resync_flg & MM_RF_COMMIT_NEEDED) {
 		un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
 		un->un_resync_flg &= ~MM_RF_COMMIT_NEEDED;
+
 		mutex_exit(&un->un_resync_mx);
 		mddb_commitrec_wrapper(un->un_rr_dirty_recid);
 		mutex_enter(&un->un_resync_mx);
+
 		un->un_resync_flg &= ~MM_RF_COMMITING;
 		cv_broadcast(&un->un_resync_cv);
 	}
@@ -2779,10 +3321,26 @@
 		cv_broadcast(&un->un_resync_cv);
 	}
 	mutex_exit(&un->un_resync_mx);
+
 	return (0);
 }
 
 int
+mirror_mark_resync_region(struct mm_unit *un,
+	diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
+{
+	int	mnset = MD_MNSET_SETNO(MD_UN2SET(un));
+
+	if (mnset && !MD_MN_MIRROR_OWNER(un)) {
+		return (mirror_mark_resync_region_non_owner(un, startblk,
+		    endblk, source_node));
+	} else {
+		return (mirror_mark_resync_region_owner(un, startblk, endblk,
+		    source_node));
+	}
+}
+
+int
 mirror_resize_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
 {
 	short		*owp;
@@ -2793,9 +3351,10 @@
 	size_t		size;
 	mddb_recid_t	recid, old_recid;
 	uchar_t		*old_dirty_bm;
-	int		i;
+	int		i, j;
 	mddb_type_t	typ1;
 	set_t		setno = MD_UN2SET(un);
+	uchar_t		*old_pns;
 
 	old_nregions = un->un_rrd_num;
 	new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
@@ -2840,6 +3399,11 @@
 	un->un_outstanding_writes = (short *)kmem_zalloc(
 	    new_nregions * sizeof (short), KM_SLEEP);
 
+	old_pns = un->un_pernode_dirty_sum;
+	if (old_pns)
+		un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(new_nregions,
+		    KM_SLEEP);
+
 	/*
 	 * Now translate the old records into the new
 	 * records
@@ -2847,15 +3411,41 @@
 	for (i = 0; i < old_nregions; i++) {
 		/*
 		 * only bring forward the
-		 * outstanding write counters and the dirty bits
+		 * outstanding write counters and the dirty bits and also
+		 * the pernode_summary counts
 		 */
 		if (!isset(old_dirty_bm, i))
 			continue;
 
 		setbit(un->un_dirty_bm, (i / rr_mult));
 		un->un_outstanding_writes[(i / rr_mult)] += owp[i];
+		if (old_pns)
+			un->un_pernode_dirty_sum[(i / rr_mult)] += old_pns[i];
 	}
 	kmem_free((caddr_t)owp, old_nregions * sizeof (short));
+	if (old_pns)
+		kmem_free((caddr_t)old_pns, old_nregions);
+
+	/*
+	 * Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
+	 */
+	for (j = 0; j < MD_MNMAXSIDES; j++) {
+		rw_enter(&un->un_pernode_dirty_mx[j], RW_WRITER);
+		old_dirty_bm = un->un_pernode_dirty_bm[j];
+		if (old_dirty_bm) {
+			un->un_pernode_dirty_bm[j] = (uchar_t *)kmem_zalloc(
+			    new_bm_size, KM_SLEEP);
+			for (i = 0; i < old_nregions; i++) {
+				if (!isset(old_dirty_bm, i))
+					continue;
+
+				setbit(un->un_pernode_dirty_bm[j],
+				    (i / rr_mult));
+			}
+			kmem_free((caddr_t)old_dirty_bm, old_bm_size);
+		}
+		rw_exit(&un->un_pernode_dirty_mx[j]);
+	}
 
 	/* Save the old record id */
 	old_recid = un->un_rr_dirty_recid;
@@ -2891,6 +3481,7 @@
 	mddb_recid_t	recid, old_recid;
 	mddb_type_t	typ1;
 	set_t		setno = MD_UN2SET(un);
+	int		i;
 
 	old_nregions = un->un_rrd_num;
 	new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
@@ -2924,6 +3515,8 @@
 	 *		un_goingclean_bm
 	 *		un_resync_bm
 	 *		un_outstanding_writes
+	 *		un_pernode_dirty_sum
+	 *		un_pernode_dirty_bm[]
 	 */
 	old = un->un_goingdirty_bm;
 	un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
@@ -2947,6 +3540,28 @@
 	    old_nregions * sizeof (short));
 	kmem_free((caddr_t)owp, (old_nregions * sizeof (short)));
 
+	old = un->un_pernode_dirty_sum;
+	if (old) {
+		un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(
+		    new_nregions, KM_SLEEP);
+		bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_sum,
+		    old_nregions);
+		kmem_free((caddr_t)old, old_nregions);
+	}
+
+	for (i = 0; i < MD_MNMAXSIDES; i++) {
+		rw_enter(&un->un_pernode_dirty_mx[i], RW_WRITER);
+		old = un->un_pernode_dirty_bm[i];
+		if (old) {
+			un->un_pernode_dirty_bm[i] = (uchar_t *)kmem_zalloc(
+			    new_bm_size, KM_SLEEP);
+			bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_bm[i],
+			    old_bm_size);
+			kmem_free((caddr_t)old, old_bm_size);
+		}
+		rw_exit(&un->un_pernode_dirty_mx[i]);
+	}
+
 	/* Save the old record id */
 	old_recid = un->un_rr_dirty_recid;
 
@@ -2980,3 +3595,263 @@
 	for (i = 0; i < sz; i++)
 		*dest++ |= *src++;
 }
+
+/*
+ * mirror_set_dirty_rr:
+ * -------------------
+ * Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate.
+ * For the owning node (DRL/mirror owner) update the on-disk RR if needed.
+ * Called on every clean->dirty transition for the originating writer node.
+ * Only non-owners send this message; only the owner acts on it. Returns 0
+ * if ignored, else the mdmderror()/mirror_mark_resync_region_owner() result.
+ */
+int
+mirror_set_dirty_rr(md_mn_rr_dirty_params_t *iocp)
+{
+	/* rr_start and rr_end are the first and last RR region to mark. */
+	minor_t			mnum = iocp->rr_mnum;
+	mm_unit_t		*un;
+	int			start = (int)iocp->rr_start;
+	int			end = (int)iocp->rr_end;
+	set_t			setno = MD_MIN2SET(mnum);
+	md_mn_nodeid_t		orignode = iocp->rr_nodeid;	/* 1-based */
+	diskaddr_t		startblk, endblk;
+
+	mdclrerror(&iocp->mde);
+
+	if ((setno >= md_nsets) ||
+	    (MD_MIN2UNIT(mnum) >= md_nunits)) {
+		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
+	}
+
+	/* Must have _NO_ ioctl lock set if we update the RR on-disk */
+	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
+
+	if (un == NULL) {
+		return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
+	}
+	if (un->c.un_type != MD_METAMIRROR) {
+		return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
+	}
+	if (orignode < 1 || orignode >= MD_MNMAXSIDES) {
+		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
+	}
+	if (un->un_nsm < 2) {
+		return (0);
+	}
+
+	/*
+	 * Only process this message if we're the owner of the mirror.
+	 */
+	if (!MD_MN_MIRROR_OWNER(un)) {
+		return (0);
+	}
+	/* Convert the region range to block addresses for the owner path. */
+	RR_TO_BLK(startblk, start, un);
+	RR_TO_BLK(endblk, end, un);
+	return (mirror_mark_resync_region_owner(un, startblk, endblk,
+	    orignode));
+}
+
+/*
+ * mirror_clean_rr_bits:
+ * --------------------
+ * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
+ * Once _all_ references are removed (un_pernode_dirty_sum[x] == 0) this region
+ * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
+ * nodes. Callable from ioctl / interrupt / whatever context.
+ * un_resync_mx is held on entry; it is dropped around the DRL commit.
+ */
+static void
+mirror_clean_rr_bits(
+	md_mn_rr_clean_params_t *iocp)
+{
+	minor_t			mnum = iocp->rr_mnum;
+	mm_unit_t		*un;
+	uint_t			cleared_bits;
+	md_mn_nodeid_t		node = iocp->rr_nodeid - 1;
+	md_mn_nodeid_t		orignode = iocp->rr_nodeid;
+	int			i, start, end;
+
+	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
+	/* Bit (i - start) of the message bitmap corresponds to region i. */
+	cleared_bits = 0;
+	start = MDMN_RR_CLEAN_PARAMS_START_BIT(iocp);
+	end = start + MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp) * NBBY;
+	rw_enter(&un->un_pernode_dirty_mx[node], RW_READER);
+	for (i = start; i < end; i++) {
+		if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp), i - start)) {
+			if (IS_PERNODE_DIRTY(orignode, i, un)) {
+				un->un_pernode_dirty_sum[i]--;
+				CLR_PERNODE_DIRTY(orignode, i, un);
+			}
+			if (un->un_pernode_dirty_sum[i] == 0) {
+				cleared_bits++;
+				CLR_REGION_DIRTY(i, un);
+				CLR_GOING_CLEAN(i, un);
+			}
+		}
+	}
+	rw_exit(&un->un_pernode_dirty_mx[node]);
+	if (cleared_bits) {
+		/*
+		 * We can only be called iff we are the mirror owner, however
+		 * as this is a (potentially) decoupled routine the ownership
+		 * may have moved from us by the time we get to execute the
+		 * bit clearing. Hence we still need to check for being the
+		 * owner before flushing the DRL to the replica.
+		 */
+		if (MD_MN_MIRROR_OWNER(un)) {
+			mutex_exit(&un->un_resync_mx);
+			mddb_commitrec_wrapper(un->un_rr_dirty_recid);
+			mutex_enter(&un->un_resync_mx);
+		}
+	}
+}
+
+/*
+ * mirror_drl_task:
+ * ---------------
+ * Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call
+ * We take un_resync_mx, wait until MM_RF_STALL_CLEAN is no longer set and
+ * then clear the necessary bits with the gate closed.
+ * On completion, we must also free the passed in argument as it is allocated
+ * at the end of the ioctl handler and won't be freed on completion.
+ */
+static void
+mirror_drl_task(void *arg)
+{
+	md_mn_rr_clean_params_t	*iocp = (md_mn_rr_clean_params_t *)arg;
+	minor_t			mnum = iocp->rr_mnum;
+	mm_unit_t		*un;
+
+	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
+	/* NOTE(review): un is not NULL-checked; confirm unit can't go away */
+	mutex_enter(&un->un_rrp_inflight_mx);
+	mutex_enter(&un->un_resync_mx);
+	un->un_waiting_to_clear++;
+	while (un->un_resync_flg & MM_RF_STALL_CLEAN)
+		cv_wait(&un->un_resync_cv, &un->un_resync_mx);
+	un->un_waiting_to_clear--;
+
+	un->un_resync_flg |= MM_RF_GATECLOSED;
+	mirror_clean_rr_bits(iocp);
+	un->un_resync_flg &= ~MM_RF_GATECLOSED;
+	if (un->un_waiting_to_mark != 0 || un->un_waiting_to_clear != 0) {
+		cv_broadcast(&un->un_resync_cv);
+	}
+	mutex_exit(&un->un_resync_mx);
+	mutex_exit(&un->un_rrp_inflight_mx);
+
+	kmem_free((caddr_t)iocp, MDMN_RR_CLEAN_PARAMS_SIZE(iocp));
+}
+
+/*
+ * mirror_set_clean_rr:
+ * -------------------
+ * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
+ * Once _all_ references are removed (un_pernode_dirty_sum[x] == 0) this region
+ * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
+ * nodes. rr_nodeid is 1-based and is rejected when out of range.
+ * Returns 0, an mdmderror() result, or ENOMEM if deferring the work fails.
+ * Only the mirror-owner need process this message as it is the only RR updater.
+ * Non-owner nodes issue this request, but as we have no point-to-point message
+ * support we will receive the message on all nodes.
+ */
+int
+mirror_set_clean_rr(md_mn_rr_clean_params_t *iocp)
+{
+
+	minor_t			mnum = iocp->rr_mnum;
+	mm_unit_t		*un;
+	set_t			setno = MD_MIN2SET(mnum);
+	md_mn_nodeid_t		node = iocp->rr_nodeid - 1;
+	int			can_clear = 0;
+	md_mn_rr_clean_params_t	*newiocp;
+	int			rval = 0;
+
+	mdclrerror(&iocp->mde);
+	/* node indexes MD_MNMAXSIDES-sized per-node arrays; range check it. */
+	if (setno >= md_nsets || MD_MIN2UNIT(mnum) >= md_nunits ||
+	    iocp->rr_nodeid < 1 || iocp->rr_nodeid > MD_MNMAXSIDES) {
+		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
+	}
+
+	/* Must have _NO_ ioctl lock set if we update the RR on-disk */
+	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
+
+	if (un == NULL) {
+		return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
+	}
+	if (un->c.un_type != MD_METAMIRROR) {
+		return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
+	}
+	if (un->un_nsm < 2) {
+		return (0);
+	}
+
+	/*
+	 * Check to see if we're the mirror owner. If not, there's nothing
+	 * for us to do.
+	 */
+	if (!MD_MN_MIRROR_OWNER(un)) {
+		return (0);
+	}
+
+	/*
+	 * Process the to-be-cleaned bitmap. We need to update the pernode_dirty
+	 * bits and pernode_dirty_sum[n], and if, and only if, the sum goes 0
+	 * we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
+	 * we can just defer this cleaning until the next process_resync_regions
+	 * timeout. The per-node bitmap holds one bit per resync region.
+	 */
+	rw_enter(&un->un_pernode_dirty_mx[node], RW_WRITER);
+	if (un->un_pernode_dirty_bm[node] == NULL) {
+		un->un_pernode_dirty_bm[node] = (uchar_t *)kmem_zalloc(
+		    (un->un_rrd_num + NBBY - 1) / NBBY, KM_SLEEP);
+	}
+	rw_exit(&un->un_pernode_dirty_mx[node]);
+
+	/*
+	 * See if we can simply clear the un_dirty_bm[] entries. If we're not
+	 * the issuing node _and_ we aren't in the process of marking/clearing
+	 * the RR bitmaps, we can simply update the bits as needed.
+	 * If we're the owning node and _not_ the issuing node, we should also
+	 * sync the RR if we clear any bits in it.
+	 */
+	mutex_enter(&un->un_resync_mx);
+	can_clear = (un->un_resync_flg & MM_RF_STALL_CLEAN) ? 0 : 1;
+	if (can_clear) {
+		un->un_resync_flg |= MM_RF_GATECLOSED;
+		mirror_clean_rr_bits(iocp);
+		un->un_resync_flg &= ~MM_RF_GATECLOSED;
+		if (un->un_waiting_to_mark != 0 ||
+		    un->un_waiting_to_clear != 0) {
+			cv_broadcast(&un->un_resync_cv);
+		}
+	}
+	mutex_exit(&un->un_resync_mx);
+
+	/*
+	 * If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r
+	 * we must schedule a blocking call to update the DRL on this node.
+	 * As we're invoked from an ioctl we are going to have the original data
+	 * disappear (kmem_free) once we return. So, copy the data into a new
+	 * structure and let the taskq routine release it on completion.
+	 */
+	if (!can_clear) {
+		size_t	sz = MDMN_RR_CLEAN_PARAMS_SIZE(iocp);
+
+		newiocp = (md_mn_rr_clean_params_t *)kmem_alloc(sz, KM_SLEEP);
+
+		bcopy(iocp, newiocp, sz);
+
+		if (ddi_taskq_dispatch(un->un_drl_task, mirror_drl_task,
+		    newiocp, DDI_NOSLEEP) != DDI_SUCCESS) {
+			kmem_free(newiocp, sz);
+			rval = ENOMEM;	/* probably starvation */
+		}
+	}
+
+	return (rval);
+}
--- a/usr/src/uts/common/io/lvm/softpart/sp.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/io/lvm/softpart/sp.c	Wed Dec 24 08:23:40 2008 -0700
@@ -118,6 +118,7 @@
 extern kmutex_t		md_mx;
 extern kcondvar_t	md_cv;
 extern md_krwlock_t	md_unit_array_rw;
+extern clock_t		md_hz;
 
 static kmem_cache_t	*sp_parent_cache = NULL;
 static kmem_cache_t	*sp_child_cache = NULL;
@@ -341,15 +342,19 @@
 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 
 	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
-	    (char *)&sp_msg, sizeof (sp_msg), kres);
+	    0, (char *)&sp_msg, sizeof (sp_msg), kres);
 
 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
 		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
-
+		/* If we're shutting down already, pause things here. */
+		if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
+			while (!md_mn_is_commd_present()) {
+				delay(md_hz);
+			}
+		}
 		/*
 		 * Panic as we are now in an inconsistent state.
 		 */
-
 		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
 		    md_shortname(MD_SID(un)), str);
 	}
--- a/usr/src/uts/common/io/lvm/softpart/sp_ioctl.c	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/io/lvm/softpart/sp_ioctl.c	Wed Dec 24 08:23:40 2008 -0700
@@ -1150,6 +1150,7 @@
 	}
 
 	case MD_IOC_SPUPDATEWM:
+	case MD_MN_IOC_SPUPDATEWM:
 	{
 		if (! (mode & FWRITE))
 			return (EACCES);
--- a/usr/src/uts/common/sys/lvm/md_mirror.h	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/sys/lvm/md_mirror.h	Wed Dec 24 08:23:40 2008 -0700
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -30,6 +31,9 @@
 #include <sys/lvm/mdvar.h>
 #include <sys/lvm/md_mirror_shared.h>
 #include <sys/lvm/md_rename.h>
+#ifdef	_KERNEL
+#include <sys/sunddi.h>
+#endif
 
 #ifdef	__cplusplus
 extern "C" {
@@ -331,9 +335,24 @@
 	kcondvar_t	un_dmr_cv;		/* condvar for DMR requests */
 	int		un_dmr_last_read;	/* last DMR submirror read */
 	callb_cpr_t	un_rs_cprinfo;		/* CPR info for resync thread */
-	kmutex_t	un_rs_cpr_mx;		/* Mutex for CPR info */
+	kmutex_t	un_rs_cpr_mx;		/* mutex for resync CPR info */
+	kmutex_t	un_prr_cpr_mx;		/* mutex for prr CPR info */
 	uint_t		un_resync_completed;	/* type of last resync */
 	int		un_abr_count;		/* count of sp's with abr set */
+
+	uchar_t		*un_pernode_dirty_bm[MD_MNMAXSIDES];
+	uchar_t		*un_pernode_dirty_sum;
+
+	krwlock_t	un_pernode_dirty_mx[MD_MNMAXSIDES];
+	ushort_t	un_rr_clean_start_bit;  /* where to start next clean */
+
+#ifdef	_KERNEL
+	ddi_taskq_t	*un_drl_task;		/* deferred RR_CLEAN taskq */
+#else
+	void		*un_drl_task;		/* deferred RR_CLEAN taskq */
+#endif	/* _KERNEL */
+	uint_t		un_waiting_to_clear;	/* Blocked waiting to clear */
+
 }mm_mirror_ic_t;
 
 #define	MM_MN_OWNER_SENT	0x0001		/* RPC in progress */
@@ -416,9 +435,15 @@
 #define	un_dmr_last_read	un_mmic.un_dmr_last_read
 #define	un_rs_cprinfo		un_mmic.un_rs_cprinfo
 #define	un_rs_cpr_mx		un_mmic.un_rs_cpr_mx
+#define	un_prr_cpr_mx		un_mmic.un_prr_cpr_mx
 #define	un_resync_completed	un_mmic.un_resync_completed
 #define	un_abr_count		un_mmic.un_abr_count
-
+#define	un_pernode_dirty_bm	un_mmic.un_pernode_dirty_bm
+#define	un_pernode_dirty_sum	un_mmic.un_pernode_dirty_sum
+#define	un_pernode_dirty_mx	un_mmic.un_pernode_dirty_mx
+#define	un_rr_clean_start_bit	un_mmic.un_rr_clean_start_bit
+#define	un_drl_task		un_mmic.un_drl_task
+#define	un_waiting_to_clear	un_mmic.un_waiting_to_clear
 
 #define	MM_RF_GATECLOSED	0x0001
 #define	MM_RF_COMMIT_NEEDED	0x0002
@@ -497,6 +522,12 @@
 #define	IS_KEEPDIRTY(i, un)	(isset((un)->un_resync_bm, (i)))
 #define	CLR_KEEPDIRTY(i, un)	(clrbit((un)->un_resync_bm, (i)))
 
+#define	IS_PERNODE_DIRTY(n, i, un) \
+	(isset((un)->un_pernode_dirty_bm[(n)-1], (i))) /* n: 1-based node */
+#define	CLR_PERNODE_DIRTY(n, i, un) \
+	(clrbit((un)->un_pernode_dirty_bm[(n)-1], (i)))
+#define	SET_PERNODE_DIRTY(n, i, un) \
+	(setbit((un)->un_pernode_dirty_bm[(n)-1], (i)))
 
 /*
  * Write-On-Write handling.
@@ -579,13 +610,15 @@
 			    md_error_t *ep, IOLOCK *);
 extern int		mirror_ioctl_resync(md_resync_ioctl_t *p, IOLOCK *);
 extern int		mirror_mark_resync_region(mm_unit_t *, diskaddr_t,
-				diskaddr_t);
+				diskaddr_t, md_mn_nodeid_t);
 extern void		resync_start_timeout(set_t setno);
 extern int		mirror_resize_resync_regions(mm_unit_t *, diskaddr_t);
 extern int		mirror_add_resync_regions(mm_unit_t *, diskaddr_t);
 extern int		mirror_probedevs(md_probedev_t *, IOLOCK *);
 extern void		mirror_copy_rr(int, uchar_t *, uchar_t *);
 extern void		mirror_process_unit_resync(mm_unit_t *);
+extern int		mirror_set_dirty_rr(md_mn_rr_dirty_params_t *);
+extern int		mirror_set_clean_rr(md_mn_rr_clean_params_t *);
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/sys/lvm/md_sp.h	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/sys/lvm/md_sp.h	Wed Dec 24 08:23:40 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,16 +18,15 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS__MD_SP_H
 #define	_SYS__MD_SP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/lvm/mdvar.h>
 
 #ifdef	__cplusplus
@@ -99,6 +97,7 @@
 #define	MD_IOC_SPSTATUS		(MDIOC_MISC|0)
 #define	MD_IOC_SPUPDATEWM	(MDIOC_MISC|1)
 #define	MD_IOC_SPREADWM		(MDIOC_MISC|2)
+#define	MD_MN_IOC_SPUPDATEWM	(MDIOC_MISC|3)
 
 #ifdef _KERNEL
 
--- a/usr/src/uts/common/sys/lvm/mdio.h	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/sys/lvm/mdio.h	Wed Dec 24 08:23:40 2008 -0700
@@ -18,16 +18,15 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS__MDIO_H
 #define	_SYS__MDIO_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/debug.h>
 #include <sys/ioctl.h>
 #include <sys/types.h>
@@ -433,6 +432,31 @@
 	unit_t		un;
 } md_mkdev_params_t;
 
+#define	MDMN_RR_CLEAN_PARAMS_DATA(x)	((unsigned char *)(x) + \
+	    sizeof (md_mn_rr_clean_params_t))
+#define	MDMN_RR_CLEAN_PARAMS_SIZE(x)	(sizeof (md_mn_rr_clean_params_t) + \
+	    MDMN_RR_CLEAN_PARAMS_DATA_BYTES(x))
+#define	MDMN_RR_CLEAN_PARAMS_START_BIT(x)	((x)->rr_start_size >> 16)
+#define	MDMN_RR_CLEAN_PARAMS_DATA_BYTES(x)	((x)->rr_start_size & 0xffff)
+/* Argument to mirror_set_clean_rr(); bitmap bytes follow the structure. */
+typedef struct md_mn_rr_clean_params {
+	MD_DRIVER
+	md_error_t	mde;
+	md_mn_nodeid_t	rr_nodeid;
+	minor_t		rr_mnum;
+	unsigned int	rr_start_size;	/* start_bit (16b) | data_bytes (16b) */
+	/* bitmap data starts here, see MDMN_RR_CLEAN_PARAMS_DATA() */
+} md_mn_rr_clean_params_t;
+/* Argument to mirror_set_dirty_rr(): RR regions rr_start..rr_end. */
+typedef struct md_mn_rr_dirty_params {
+	MD_DRIVER
+	md_error_t	mde;
+	minor_t		rr_mnum;
+	md_mn_nodeid_t	rr_nodeid;
+	ushort_t	rr_start;	/* First RR region to mark */
+	ushort_t	rr_end;		/* Last RR region to mark */
+} md_mn_rr_dirty_params_t;
+
 /*
  * Flags to coordinate sending device id between kernel and user space.
  * To get devid from kernel:
@@ -756,7 +780,8 @@
 #define	MD_IOCGET_HSP_NM	(MDIOC|105) /* get hsp entry from namespace */
 #define	MD_IOCREM_DEV		(MDIOC|106) /* remove device node for unit */
 #define	MD_IOCUPDATE_NM_RR_DID	(MDIOC|107) /* update remotely repl did in NM */
-
+#define	MD_MN_RR_DIRTY		(MDIOC|108) /* Mark RR range as dirty */
+#define	MD_MN_RR_CLEAN		(MDIOC|109) /* Clean RR bits from bitmap */
 
 #define	MDIOC_MISC	(MDIOC|128)	/* misc module base */
 /* Used in DEBUG_TEST code */
--- a/usr/src/uts/common/sys/lvm/mdmn_commd.x	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/sys/lvm/mdmn_commd.x	Wed Dec 24 08:23:40 2008 -0700
@@ -20,11 +20,10 @@
 % */
 %
 %/*
-% * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+% * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 % * Use is subject to license terms.
 % */
 %
-%#pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 %#include <sys/types.h>
 %#include <sys/types32.h>
@@ -103,6 +102,8 @@
 	MD_MN_MSG_SETSYNC,		/* Set resync status */
 	MD_MN_MSG_POKE_HOTSPARES,	/* Call poke_hotspares */
 	MD_MN_MSG_ADDMDNAME,		/* Add metadevice name */
+	MD_MN_MSG_RR_DIRTY,		/* Mark RR range as dirty */
+	MD_MN_MSG_RR_CLEAN,		/* Mark RR range as clean */
 	MD_MN_NMESSAGES /* insert elements before */
 };
 
@@ -361,6 +362,39 @@
 	minor_t		pokehsp_setno;
 };
 
+/* Message format for MD_MN_MSG_RR_DIRTY message */
+struct md_mn_msg_rr_dirty_t {
+	minor_t		rr_mnum;
+	int		rr_nodeid;
+	u_int		rr_range;	/* Start(16bits) | End(16bits) */
+};
+
+/* Message format for MD_MN_MSG_RR_CLEAN message */
+%#define	MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES	\
+%		    ((MDMN_MAX_KMSG_DATA) - \
+%		    sizeof (struct md_mn_msg_rr_clean_t))
+%#define	MDMN_MSG_RR_CLEAN_SIZE_DATA(x)		\
+%		    (sizeof (struct md_mn_msg_rr_clean_t) + (x))
+%#define	MDMN_MSG_RR_CLEAN_MSG_SIZE(x)		\
+%		    (sizeof (struct md_mn_msg_rr_clean_t) \
+%		    + MDMN_MSG_RR_CLEAN_DATA_BYTES(x))
+%#define	MDMN_MSG_RR_CLEAN_DATA(x)		\
+%		    ((unsigned char *)(x) + \
+%		    sizeof (struct md_mn_msg_rr_clean_t))
+
+/* since we cannot use ushorts, some macros to extract the parts from an int */
+%#define	MDMN_MSG_RR_CLEAN_START_BIT(x)	((x)->rr_start_size >> 16)
+%#define	MDMN_MSG_RR_CLEAN_DATA_BYTES(x)	((x)->rr_start_size & 0xffff)
+%#define	MDMN_MSG_RR_CLEAN_START_SIZE_SET(x, start, size) \
+%			((x)->rr_start_size = ((start) << 16) | (size))
+
+struct md_mn_msg_rr_clean_t {
+	md_mn_nodeid_t	rr_nodeid;
+	unsigned int	rr_mnum;
+	unsigned int	rr_start_size;	/* start_bit (16b) | data_bytes (16b) */
+	/* bitmap data follows, see MDMN_MSG_RR_CLEAN_DATA() */
+};
+
 %#define	MD_MSGF_NO_LOG			0x00000001
 %#define	MD_MSGF_NO_BCAST		0x00000002
 %#define	MD_MSGF_STOP_ON_ERROR		0x00000004
@@ -373,6 +407,9 @@
 %#define	MD_MSGF_FAIL_ON_SUSPEND		0x00000200
 %#define	MD_MSGF_NO_MCT			0x00000400
 %#define	MD_MSGF_PANIC_WHEN_INCONSISTENT	0x00000800
+%#define	MD_MSGF_BLK_SIGNAL		0x00001000
+%#define	MD_MSGF_KSEND_NORETRY		0x00002000
+%#define	MD_MSGF_DIRECTED		0x00004000
 %#define	MD_MSGF_VERBOSE			0x10000000
 %#define	MD_MSGF_VERBOSE_2		0x20000000
 
@@ -418,7 +455,8 @@
 	u_int		msg_flags;	/* See MD_MSGF_* above */
 	set_t		msg_setno;	/* which set is involved */
         md_mn_msgtype_t msg_type;       /* what type of message */
-	char		msg_spare[32];	/* Always good to hav'em */
+	md_mn_nodeid_t	msg_recipient;	/* who to send DIRECTED message to */
+	char		msg_spare[28];	/* Always good to hav'em */
 	opaque		msg_event<>;	/* the actual event wrapped up */
 };
 %#define	msg_event_data	msg_event.msg_event_val
@@ -435,7 +473,8 @@
 	uint32_t	msg_flags;	/* See MD_MSGF_* above */
 	set_t		msg_setno;	/* which set is involved */
         md_mn_msgtype_t msg_type;       /* what type of message */
-	char		msg_spare[32];	/* Always good to hav'em */
+	md_mn_nodeid_t	msg_recipient;	/* who to send DIRECTED message to */
+	char		msg_spare[28];	/* Always good to hav'em */
 	uint32_t	msg_ev_len;	
 	char		msg_ev_val[MD_MN_MSG_MAXDATALEN];
 };
@@ -450,6 +489,7 @@
 	u_int		kmsg_flags;
 	set_t		kmsg_setno;
 	md_mn_msgtype_t	kmsg_type;
+	md_mn_nodeid_t	kmsg_recipient;	/* who to send DIRECTED message to */
 	int		kmsg_size;
 	char		kmsg_data[MDMN_MAX_KMSG_DATA];
 };
@@ -549,7 +589,7 @@
 
 
 program MDMN_COMMD {
-	version ONE {
+	version TWO {
 		md_mn_result_t 
 		mdmn_send(md_mn_msg_t) = 1;
 
@@ -579,5 +619,5 @@
 		
 		int
 		mdmn_comm_msglock(md_mn_type_and_lock_t) = 10;
-	} = 1;
+	} = 2;
 } = 100422;
--- a/usr/src/uts/common/sys/lvm/mdvar.h	Wed Dec 24 05:48:11 2008 -0800
+++ b/usr/src/uts/common/sys/lvm/mdvar.h	Wed Dec 24 08:23:40 2008 -0700
@@ -744,8 +744,8 @@
 extern int	md_check_ioctl_against_unit(int, mdc_unit_t);
 extern mddb_recid_t md_vtoc_to_efi_record(mddb_recid_t, set_t);
 
-extern int	mdmn_ksend_message(set_t, md_mn_msgtype_t, uint_t, char *, int,
-		    md_mn_kresult_t *);
+extern int	mdmn_ksend_message(set_t, md_mn_msgtype_t, uint_t,
+		    md_mn_nodeid_t, char *, int, md_mn_kresult_t *);
 extern void	mdmn_ksend_show_error(int, md_mn_kresult_t *, const char *);
 extern int	mdmn_send_capability_message(minor_t, volcap_t, IOLOCK *);
 extern void	mdmn_clear_all_capabilities(minor_t);
@@ -755,9 +755,11 @@
 extern void	md_upd_set_unnext(set_t, unit_t);
 extern int	md_rem_selfname(minor_t);
 extern void	md_rem_hspname(set_t, mdkey_t);
+extern void	*md_create_taskq(set_t, minor_t);
 
 /* Externals from md_ioctl.c */
 extern int	md_mn_is_commd_present(void);
+extern int	md_mn_is_commd_present_lite(void);
 extern void	md_mn_clear_commd_present(void);
 extern int	md_admin_ioctl(md_dev64_t, int, caddr_t, int, IOLOCK *lockp);
 extern void	md_get_geom(md_unit_t *, struct dk_geom *);