Mercurial > illumos > illumos-gate

--- a/usr/src/uts/sun4v/io/vdc.c	Mon Nov 09 08:33:48 2009 -0800
+++ b/usr/src/uts/sun4v/io/vdc.c	Mon Nov 09 09:03:03 2009 -0800
@@ -69,6 +69,7 @@
 #include <sys/mdeg.h>
 #include <sys/note.h>
 #include <sys/open.h>
+#include <sys/random.h>
 #include <sys/sdt.h>
 #include <sys/stat.h>
 #include <sys/sunddi.h>
@@ -82,6 +83,7 @@
 #include <sys/cdio.h>
 #include <sys/dktp/fdisk.h>
 #include <sys/dktp/dadkio.h>
+#include <sys/fs/dv_node.h>
 #include <sys/mhd.h>
 #include <sys/scsi/generic/sense.h>
 #include <sys/scsi/impl/uscsi.h>
@@ -174,18 +176,20 @@
 static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
 static int 	vdc_send_request(vdc_t *vdcp, int operation,
 		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
-		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
+		    buf_t *bufp, vio_desc_direction_t dir, int flags);
 static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
 static int 	vdc_populate_descriptor(vdc_t *vdcp, int operation,
 		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
-		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
+		    buf_t *bufp, vio_desc_direction_t dir, int flags);
 static int 	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
-		    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
-		    void *cb_arg, vio_desc_direction_t dir, boolean_t);
+		    size_t nbytes, int slice, diskaddr_t offset,
+		    vio_desc_direction_t dir, boolean_t);
+static int	vdc_do_op(vdc_t *vdc, int op, caddr_t addr, size_t nbytes,
+		    int slice, diskaddr_t offset, struct buf *bufp,
+		    vio_desc_direction_t dir, int flags);

 static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
-static int	vdc_drain_response(vdc_t *vdcp, vio_cb_type_t cb_type,
-		    struct buf *buf);
+static int	vdc_drain_response(vdc_t *vdcp, struct buf *buf);
 static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
 static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
 static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);
@@ -222,9 +226,12 @@
 		    int mode, int dir);

 static void 	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
-static int	vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
-static vdc_io_t	*vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
-static int	vdc_failfast_check_resv(vdc_t *vdc);
+static int	vdc_access_set(vdc_t *vdc, uint64_t flags);
+static vdc_io_t	*vdc_eio_queue(vdc_t *vdc, int index);
+static void	vdc_eio_unqueue(vdc_t *vdc, clock_t deadline,
+		    boolean_t complete_io);
+static int	vdc_eio_check(vdc_t *vdc, int flags);
+static void	vdc_eio_thread(void *arg);

 /*
  * Module variables
@@ -392,7 +399,7 @@
 static int
 vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 {
-	kt_did_t failfast_tid, ownership_tid;
+	kt_did_t eio_tid, ownership_tid;
 	int	instance;
 	int	rv;
 	vdc_server_t *srvr;
@@ -418,14 +425,7 @@
 		return (DDI_FAILURE);
 	}

-	/*
-	 * This function is called when vdc is detached or if it has failed to
-	 * attach. In that case, the attach may have fail before the vdisk type
-	 * has been set so we can't call vdc_is_opened(). However as the attach
-	 * has failed, we know that the vdisk is not opened and we can safely
-	 * detach.
-	 */
-	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
+	if (vdc_is_opened(vdc)) {
 		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
 		return (DDI_FAILURE);
 	}
@@ -449,7 +449,7 @@
 	/* If we took ownership, release ownership */
 	mutex_enter(&vdc->ownership_lock);
 	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
-		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
+		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR);
 		if (rv == 0) {
 			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
 		}
@@ -487,6 +487,9 @@
 			    instance);
 			vdc->state = VDC_STATE_RESETTING;
 			cv_signal(&vdc->initwait_cv);
+		} else if (vdc->state == VDC_STATE_FAILED) {
+			vdc->io_pending = B_TRUE;
+			cv_signal(&vdc->io_pending_cv);
 		}
 		mutex_exit(&vdc->lock);

@@ -504,12 +507,13 @@

 	vdc_fini_ports(vdc);

-	if (vdc->failfast_thread) {
-		failfast_tid = vdc->failfast_thread->t_did;
+	if (vdc->eio_thread) {
+		eio_tid = vdc->eio_thread->t_did;
 		vdc->failfast_interval = 0;
-		cv_signal(&vdc->failfast_cv);
+		ASSERT(vdc->num_servers == 0);
+		cv_signal(&vdc->eio_cv);
 	} else {
-		failfast_tid = 0;
+		eio_tid = 0;
 	}

 	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
@@ -522,8 +526,8 @@

 	mutex_exit(&vdc->lock);

-	if (failfast_tid != 0)
-		thread_join(failfast_tid);
+	if (eio_tid != 0)
+		thread_join(eio_tid);

 	if (ownership_tid != 0)
 		thread_join(ownership_tid);
@@ -548,13 +552,12 @@
 		cv_destroy(&vdc->initwait_cv);
 		cv_destroy(&vdc->dring_free_cv);
 		cv_destroy(&vdc->membind_cv);
-		cv_destroy(&vdc->sync_pending_cv);
 		cv_destroy(&vdc->sync_blocked_cv);
 		cv_destroy(&vdc->read_cv);
 		cv_destroy(&vdc->running_cv);
+		cv_destroy(&vdc->io_pending_cv);
 		cv_destroy(&vdc->ownership_cv);
-		cv_destroy(&vdc->failfast_cv);
-		cv_destroy(&vdc->failfast_io_cv);
+		cv_destroy(&vdc->eio_cv);
 	}

 	if (vdc->minfo)
@@ -647,17 +650,16 @@
 	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
 	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
 	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);
-
+	cv_init(&vdc->io_pending_cv, NULL, CV_DRIVER, NULL);
+
+	vdc->io_pending = B_FALSE;
 	vdc->threads_pending = 0;
-	vdc->sync_op_pending = B_FALSE;
 	vdc->sync_op_blocked = B_FALSE;
-	cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
 	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

 	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
 	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
-	cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
-	cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);
+	cv_init(&vdc->eio_cv, NULL, CV_DRIVER, NULL);

 	/* init blocking msg read functionality */
 	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
@@ -699,6 +701,19 @@
 		return (DDI_FAILURE);
 	}

+	/*
+	 * If there are multiple servers then start the eio thread.
+	 */
+	if (vdc->num_servers > 1) {
+		vdc->eio_thread = thread_create(NULL, 0, vdc_eio_thread, vdc, 0,
+		    &p0, TS_RUN, v.v_maxsyspri - 2);
+		if (vdc->eio_thread == NULL) {
+			cmn_err(CE_NOTE, "[%d] Failed to create error "
+			    "I/O thread", instance);
+			return (DDI_FAILURE);
+		}
+	}
+
 	vdc->initialized |= VDC_THREAD;

 	atomic_inc_32(&vdc_instance_count);
@@ -725,13 +740,6 @@
 	}

 	/*
-	 * Setup devid
-	 */
-	if (vdc_setup_devid(vdc)) {
-		DMSG(vdc, 0, "[%d] No device id available\n", instance);
-	}
-
-	/*
 	 * Fill in the fields of the error statistics kstat that were not
 	 * available when creating the kstat
 	 */
@@ -1029,7 +1037,6 @@
  * Return Values
  *	0		- Success
  *	EIO		- Failed to create node
- *	EINVAL		- Unknown type of disk exported
  */
 static int
 vdc_create_device_nodes(vdc_t *vdc)
@@ -1047,14 +1054,14 @@

 	switch (vdc->vdisk_type) {
 	case VD_DISK_TYPE_DISK:
+	case VD_DISK_TYPE_UNK:
 		num_slices = V_NUMPAR;
 		break;
 	case VD_DISK_TYPE_SLICE:
 		num_slices = 1;
 		break;
-	case VD_DISK_TYPE_UNK:
 	default:
-		return (EINVAL);
+		ASSERT(0);
 	}

 	/*
@@ -1152,22 +1159,10 @@
 static boolean_t
 vdc_is_opened(vdc_t *vdc)
 {
-	int i, nslices;
-
-	switch (vdc->vdisk_type) {
-	case VD_DISK_TYPE_DISK:
-		nslices = V_NUMPAR;
-		break;
-	case VD_DISK_TYPE_SLICE:
-		nslices = 1;
-		break;
-	case VD_DISK_TYPE_UNK:
-	default:
-		ASSERT(0);
-	}
+	int i;

 	/* check if there's any layered open */
-	for (i = 0; i < nslices; i++) {
+	for (i = 0; i < V_NUMPAR; i++) {
 		if (vdc->open_lyr[i] > 0)
 			return (B_TRUE);
 	}
@@ -1193,6 +1188,15 @@

 	slicemask = 1 << slice;

+	/*
+	 * If we have a single-slice disk which was unavailable during the
+	 * attach then a device was created for each 8 slices. Now that
+	 * the type is known, we prevent opening any slice other than 0
+	 * even if a device still exists.
+	 */
+	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE && slice != 0)
+		return (EIO);
+
 	/* check if slice is already exclusively opened */
 	if (vdc->open_excl & slicemask)
 		return (EBUSY);
@@ -1281,7 +1285,12 @@
 		return (status);
 	}

-	if (nodelay) {
+	/*
+	 * If the disk type is unknown then we have to wait for the
+	 * handshake to complete because we don't know if the slice
+	 * device we are opening effectively exists.
+	 */
+	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && nodelay) {

 		/* don't resubmit a validate request if there's already one */
 		if (vdc->validate_pending > 0) {
@@ -1308,8 +1317,10 @@

 	mutex_enter(&vdc->lock);

-	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
-	    vdc->slice[slice].nblocks == 0) {
+	if (vdc->vdisk_type == VD_DISK_TYPE_UNK ||
+	    (vdc->vdisk_type == VD_DISK_TYPE_SLICE && slice != 0) ||
+	    (!nodelay && (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
+	    vdc->slice[slice].nblocks == 0))) {
 		vdc_mark_closed(vdc, slice, flag, otyp);
 		status = EIO;
 	}
@@ -1381,7 +1392,7 @@
 static int
 vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
 {
-	int	rv;
+	int	rv, flags;
 	size_t	nbytes = nblk * DEV_BSIZE;
 	int	instance = VDCUNIT(dev);
 	vdc_t	*vdc = NULL;
@@ -1402,16 +1413,20 @@
 	}
 	vio_blkno = blkno >> vdc->vio_bshift;

-	rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
-	    VDCPART(dev), vio_blkno, CB_STRATEGY, 0, VIO_write_dir);
+	/*
+	 * If we are panicking, we need the state to be "running" so that we
+	 * can submit I/Os, but we don't want to check for any backend error.
+	 */
+	flags = (ddi_in_panic())? VDC_OP_STATE_RUNNING : VDC_OP_NORMAL;
+
+	rv = vdc_do_op(vdc, VD_OP_BWRITE, addr, nbytes, VDCPART(dev),
+	    vio_blkno, NULL, VIO_write_dir, flags);
+
 	if (rv) {
 		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
 		return (rv);
 	}

-	if (ddi_in_panic())
-		(void) vdc_drain_response(vdc, CB_STRATEGY, NULL);
-
 	DMSG(vdc, 0, "[%d] End\n", instance);

 	return (0);
@@ -1435,7 +1450,6 @@
 vdc_strategy(struct buf *buf)
 {
 	diskaddr_t vio_blkno;
-	int	rv = -1;
 	vdc_t	*vdc = NULL;
 	int	instance = VDCUNIT(buf->b_edev);
 	int	op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
@@ -1474,27 +1488,11 @@
 	}
 	vio_blkno = buf->b_lblkno >> vdc->vio_bshift;

-	rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
+	/* submit the I/O, any error will be reported in the buf structure */
+	(void) vdc_do_op(vdc, op, (caddr_t)buf->b_un.b_addr,
 	    buf->b_bcount, slice, vio_blkno,
-	    CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
-	    VIO_write_dir);
-
-	/*
-	 * If the request was successfully sent, the strategy call returns and
-	 * the ACK handler calls the bioxxx functions when the vDisk server is
-	 * done otherwise we handle the error here.
-	 */
-	if (rv) {
-		DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
-		bioerror(buf, rv);
-		biodone(buf);
-	} else if (ddi_in_panic()) {
-		rv = vdc_drain_response(vdc, CB_STRATEGY, buf);
-		if (rv != 0) {
-			bioerror(buf, EIO);
-			biodone(buf);
-		}
-	}
+	    buf, (op == VD_OP_BREAD) ? VIO_read_dir : VIO_write_dir,
+	    VDC_OP_NORMAL);

 	return (0);
 }
@@ -2368,6 +2366,8 @@
 		vd_port = portp[idx];
 		srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP);
 		srvr->vdcp = vdc;
+		srvr->svc_state = VDC_SERVICE_OFFLINE;
+		srvr->log_state = VDC_SERVICE_NONE;

 		/* get port id */
 		if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) {
@@ -2587,6 +2587,7 @@
 	}

 	vdc->server_list = NULL;
+	vdc->num_servers = 0;
 }

 /* -------------------------------------------------------------------------- */
@@ -2883,10 +2884,7 @@
  *	nbytes	  - number of bytes to read/write
  *	slice	  - the disk slice this request is for
  *	offset	  - relative disk offset
- *	cb_type   - type of call - STRATEGY or SYNC
- *	cb_arg	  - parameter to be sent to server (depends on VD_OP_XXX type)
- *			. mode for ioctl(9e)
- *			. LP64 diskaddr_t (block I/O)
+ *	bufp	  - buf of operation
  *	dir	  - direction of operation (READ/WRITE/BOTH)
  *
  * Return Codes:
@@ -2895,8 +2893,8 @@
  */
 static int
 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
-    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
-    void *cb_arg, vio_desc_direction_t dir)
+    size_t nbytes, int slice, diskaddr_t offset, buf_t *bufp,
+    vio_desc_direction_t dir, int flags)
 {
 	int	rv = 0;

@@ -2917,10 +2915,20 @@
 	 * higher up the stack in vdc_strategy() et. al.
 	 */
 	if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
-		DTRACE_IO1(start, buf_t *, cb_arg);
+		DTRACE_IO1(start, buf_t *, bufp);
 		VD_KSTAT_WAITQ_ENTER(vdcp);
 	}

+	/*
+	 * If the request does not expect the state to be VDC_STATE_RUNNING
+	 * then we just try to populate the descriptor ring once.
+	 */
+	if (!(flags & VDC_OP_STATE_RUNNING)) {
+		rv = vdc_populate_descriptor(vdcp, operation, addr,
+		    nbytes, slice, offset, bufp, dir, flags);
+		goto done;
+	}
+
 	do {
 		while (vdcp->state != VDC_STATE_RUNNING) {

@@ -2930,12 +2938,6 @@
 				goto done;
 			}

-			/* fail request if connection timeout is reached */
-			if (vdcp->ctimeout_reached) {
-				rv = EIO;
-				goto done;
-			}
-
 			/*
 			 * If we are panicking and the disk is not ready then
 			 * we can't send any request because we can't complete
@@ -2946,11 +2948,27 @@
 				goto done;
 			}

+			/*
+			 * If the state is faulted, notify that a new I/O is
+			 * being submitted to force the system to check if any
+			 * server has recovered.
+			 */
+			if (vdcp->state == VDC_STATE_FAILED) {
+				vdcp->io_pending = B_TRUE;
+				cv_signal(&vdcp->io_pending_cv);
+			}
+
 			cv_wait(&vdcp->running_cv, &vdcp->lock);
+
+			/* if service is still faulted then fail the request */
+			if (vdcp->state == VDC_STATE_FAILED) {
+				rv = EIO;
+				goto done;
+			}
 		}

 	} while (vdc_populate_descriptor(vdcp, operation, addr,
-	    nbytes, slice, offset, cb_type, cb_arg, dir));
+	    nbytes, slice, offset, bufp, dir, flags));

 done:
 	/*
@@ -2963,11 +2981,11 @@
 	if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
 		if (rv == 0) {
 			VD_KSTAT_WAITQ_TO_RUNQ(vdcp);
-			DTRACE_PROBE1(send, buf_t *, cb_arg);
+			DTRACE_PROBE1(send, buf_t *, bufp);
 		} else {
 			VD_UPDATE_ERR_STATS(vdcp, vd_transerrs);
 			VD_KSTAT_WAITQ_EXIT(vdcp);
-			DTRACE_IO1(done, buf_t *, cb_arg);
+			DTRACE_IO1(done, buf_t *, bufp);
 		}
 	}

@@ -2993,10 +3011,7 @@
  *	nbytes	  - number of bytes to read/write
  *	slice	  - the disk slice this request is for
  *	offset	  - relative disk offset
- *	cb_type   - type of call - STRATEGY or SYNC
- *	cb_arg	  - parameter to be sent to server (depends on VD_OP_XXX type)
- *			. mode for ioctl(9e)
- *			. LP64 diskaddr_t (block I/O)
+ *	bufp	  - buf of operation
  *	dir	  - direction of operation (READ/WRITE/BOTH)
  *
  * Return Codes:
@@ -3007,8 +3022,8 @@
  */
 static int
 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr,
-    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
-    void *cb_arg, vio_desc_direction_t dir)
+    size_t nbytes, int slice, diskaddr_t offset,
+    buf_t *bufp, vio_desc_direction_t dir, int flags)
 {
 	vdc_local_desc_t	*local_dep = NULL; /* Local Dring Pointer */
 	int			idx;		/* Index of DRing entry used */
@@ -3050,9 +3065,9 @@
 	local_dep->nbytes = nbytes;
 	local_dep->slice = slice;
 	local_dep->offset = offset;
-	local_dep->cb_type = cb_type;
-	local_dep->cb_arg = cb_arg;
+	local_dep->buf = bufp;
 	local_dep->dir = dir;
+	local_dep->flags = flags;

 	local_dep->is_free = B_FALSE;

@@ -3124,11 +3139,127 @@

 /*
  * Function:
+ *	vdc_do_op
+ *
+ * Description:
+ * 	Wrapper around vdc_submit_request(). Each request is associated with a
+ *	buf structure. If a buf structure is provided (bufp != NULL) then the
+ *	request will be submitted with that buf, and the caller can wait for
+ *	completion of the request with biowait(). If a buf structure is not
+ *	provided (bufp == NULL) then a buf structure is created and the function
+ *	waits for the completion of the request.
+ *
+ *	If the flag VD_OP_STATE_RUNNING is set then vdc_submit_request() will
+ *	submit the request only when the vdisk is in state VD_STATE_RUNNING.
+ *	If the vdisk is not in that state then the vdc_submit_request() will
+ *	wait for that state to be reached. After the request is submitted, the
+ *	reply will be processed asynchronously by the vdc_process_msg_thread()
+ *	thread.
+ *
+ *	If the flag VD_OP_STATE_RUNNING is not set then vdc_submit_request()
+ *	submit the request whatever the state of the vdisk is. Then vdc_do_op()
+ *	will wait for a reply message, process the reply and complete the
+ *	request.
+ *
+ * Arguments:
+ *	vdc	- the soft state pointer
+ *	op	- operation we want vds to perform (VD_OP_XXX)
+ *	addr	- address of data buf to be read/written.
+ *	nbytes	- number of bytes to read/write
+ *	slice	- the disk slice this request is for
+ *	offset	- relative disk offset
+ *	bufp	- buf structure associated with the request (can be NULL).
+ *	dir	- direction of operation (READ/WRITE/BOTH)
+ *	flags	- flags for the request.
+ *
+ * Return Codes:
+ *	0	- the request has been succesfully submitted and completed.
+ *	!= 0	- the request has failed. In that case, if a buf structure
+ *		  was provided (bufp != NULL) then the B_ERROR flag is set
+ *		  and the b_error field of the buf structure is set to EIO.
+ */
+static int
+vdc_do_op(vdc_t *vdc, int op, caddr_t addr, size_t nbytes, int slice,
+    diskaddr_t offset, struct buf *bufp, vio_desc_direction_t dir, int flags)
+{
+	vio_msg_t vio_msg;
+	struct buf buf;
+	int rv;
+
+	if (bufp == NULL) {
+		/*
+		 * We use buf just as a convenient way to get a notification
+		 * that the request is completed, so we initialize buf to the
+		 * minimum we need.
+		 */
+		bioinit(&buf);
+		buf.b_bcount = nbytes;
+		buf.b_flags = B_BUSY;
+		bufp = &buf;
+	}
+
+	rv = vdc_send_request(vdc, op, addr, nbytes, slice, offset, bufp,
+	    dir, flags);
+
+	if (rv != 0)
+		goto done;
+
+	/*
+	 * If the request should be done in VDC_STATE_RUNNING state then the
+	 * reply will be received and processed by vdc_process_msg_thread()
+	 * and we just have to handle the panic case. Otherwise we have to
+	 * wait for the reply message and process it.
+	 */
+	if (flags & VDC_OP_STATE_RUNNING) {
+
+		if (ddi_in_panic()) {
+			rv = vdc_drain_response(vdc, bufp);
+			goto done;
+		}
+
+	} else {
+		/* wait for the response message */
+		rv = vdc_wait_for_response(vdc, &vio_msg);
+		if (rv) {
+			/*
+			 * If this is a block read/write we update the I/O
+			 * statistics kstat to take it off the run queue.
+			 */
+			mutex_enter(&vdc->lock);
+			if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
+				VD_UPDATE_ERR_STATS(vdc, vd_transerrs);
+				VD_KSTAT_RUNQ_EXIT(vdc);
+				DTRACE_IO1(done, buf_t *, bufp);
+			}
+			mutex_exit(&vdc->lock);
+			goto done;
+		}
+
+		rv = vdc_process_data_msg(vdc, &vio_msg);
+		if (rv)
+			goto done;
+	}
+
+	if (bufp == &buf)
+		rv = biowait(bufp);
+
+done:
+	if (bufp == &buf) {
+		biofini(bufp);
+	} else if (rv != 0) {
+		bioerror(bufp, EIO);
+		biodone(bufp);
+	}
+
+	return (rv);
+}
+
+/*
+ * Function:
  *	vdc_do_sync_op
  *
  * Description:
- * 	Wrapper around vdc_populate_descriptor that blocks until the
- * 	response to the message is available.
+ * 	Wrapper around vdc_do_op that serializes requests.
  *
  * Arguments:
  *	vdcp	  - the soft state pointer
@@ -3137,16 +3268,12 @@
  *	nbytes	  - number of bytes to read/write
  *	slice	  - the disk slice this request is for
  *	offset	  - relative disk offset
- *	cb_type   - type of call - STRATEGY or SYNC
- *	cb_arg	  - parameter to be sent to server (depends on VD_OP_XXX type)
- *			. mode for ioctl(9e)
- *			. LP64 diskaddr_t (block I/O)
  *	dir	  - direction of operation (READ/WRITE/BOTH)
  *	rconflict - check for reservation conflict in case of failure
  *
  * rconflict should be set to B_TRUE by most callers. Callers invoking the
  * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the
- * result of a successful operation with vd_scsi_status().
+ * result of a successful operation with vdc_scsi_status().
  *
  * Return Codes:
  *	0
@@ -3157,14 +3284,10 @@
  */
 static int
 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes,
-    int slice, diskaddr_t offset, int cb_type, void *cb_arg,
-    vio_desc_direction_t dir, boolean_t rconflict)
+    int slice, diskaddr_t offset, vio_desc_direction_t dir, boolean_t rconflict)
 {
 	int status;
-	vdc_io_t *vio;
-	boolean_t check_resv_conflict = B_FALSE;
-
-	ASSERT(cb_type == CB_SYNC);
+	int flags = VDC_OP_NORMAL;

 	/*
 	 * Grab the lock, if blocked wait until the server
@@ -3192,69 +3315,29 @@

 	/* now block anyone other thread entering after us */
 	vdcp->sync_op_blocked = B_TRUE;
-	vdcp->sync_op_pending = B_TRUE;
+
 	mutex_exit(&vdcp->lock);

-	status = vdc_send_request(vdcp, operation, addr,
-	    nbytes, slice, offset, cb_type, cb_arg, dir);
+	if (!rconflict)
+		flags &= ~VDC_OP_ERRCHK_CONFLICT;
+
+	status = vdc_do_op(vdcp, operation, addr, nbytes, slice, offset,
+	    NULL, dir, flags);

 	mutex_enter(&vdcp->lock);

-	if (status != 0) {
-		vdcp->sync_op_pending = B_FALSE;
-	} else if (ddi_in_panic()) {
-		if (vdc_drain_response(vdcp, CB_SYNC, NULL) == 0) {
-			status = vdcp->sync_op_status;
-		} else {
-			vdcp->sync_op_pending = B_FALSE;
-			status = EIO;
-		}
-	} else {
-		/*
-		 * block until our transaction completes.
-		 * Also anyone else waiting also gets to go next.
-		 */
-		while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH)
-			cv_wait(&vdcp->sync_pending_cv, &vdcp->lock);
-
-		DMSG(vdcp, 2, ": operation returned %d\n",
-		    vdcp->sync_op_status);
-		if (vdcp->state == VDC_STATE_DETACH) {
-			vdcp->sync_op_pending = B_FALSE;
-			status = ENXIO;
-		} else {
-			status = vdcp->sync_op_status;
-			if (status != 0 && vdcp->failfast_interval != 0) {
-				/*
-				 * Operation has failed and failfast is enabled.
-				 * We need to check if the failure is due to a
-				 * reservation conflict if this was requested.
-				 */
-				check_resv_conflict = rconflict;
-			}
-
-		}
-	}
-
-	vdcp->sync_op_status = 0;
+	DMSG(vdcp, 2, ": operation returned %d\n", status);
+
+	if (vdcp->state == VDC_STATE_DETACH) {
+		status = ENXIO;
+	}
+
 	vdcp->sync_op_blocked = B_FALSE;
 	vdcp->sync_op_cnt--;

 	/* signal the next waiting thread */
 	cv_signal(&vdcp->sync_blocked_cv);

-	/*
-	 * We have to check for reservation conflict after unblocking sync
-	 * operations because some sync operations will be used to do this
-	 * check.
-	 */
-	if (check_resv_conflict) {
-		vio = vdc_failfast_io_queue(vdcp, NULL);
-		while (vio->vio_qtime != 0)
-			cv_wait(&vdcp->failfast_io_cv, &vdcp->lock);
-		kmem_free(vio, sizeof (vdc_io_t));
-	}
-
 	mutex_exit(&vdcp->lock);

 	return (status);
@@ -3275,23 +3358,16 @@
  *
  * Arguments:
  *	vdc	- soft state pointer for this instance of the device driver.
- *	cb_type	- the type of request we want to drain. If type is CB_SYNC
- *		  then we drain all responses until we find a CB_SYNC request.
- *		  If the type is CB_STRATEGY then the behavior depends on the
- *		  value of the buf argument.
- *	buf	- if the cb_type argument is CB_SYNC then the buf argument
- *		  must be NULL. If the cb_type argument is CB_STRATEGY and
- *		  if buf is NULL then we drain all responses, otherwise we
+ *	buf	- if buf is NULL then we drain all responses, otherwise we
  *		  poll until we receive a ACK/NACK for the specific I/O
  *		  described by buf.
  *
  * Return Code:
  *	0	- Success. If we were expecting a response to a particular
- *		  CB_SYNC or CB_STRATEGY request then this means that a
- *		  response has been received.
+ *		  request then this means that a response has been received.
  */
 static int
-vdc_drain_response(vdc_t *vdc, vio_cb_type_t cb_type, struct buf *buf)
+vdc_drain_response(vdc_t *vdc, struct buf *buf)
 {
 	int 			rv, idx, retries;
 	size_t			msglen;
@@ -3300,8 +3376,6 @@
 	struct buf		*mbuf;
 	boolean_t		ack;

-	ASSERT(cb_type == CB_STRATEGY || cb_type == CB_SYNC);
-
 	mutex_enter(&vdc->lock);

 	retries = 0;
@@ -3369,34 +3443,16 @@
 			continue;
 		}

-		switch (ldep->cb_type) {
-
-		case CB_STRATEGY:
-			mbuf = ldep->cb_arg;
-			if (mbuf != NULL) {
-				mbuf->b_resid = mbuf->b_bcount -
-				    ldep->dep->payload.nbytes;
-				bioerror(mbuf,
-				    ack ? ldep->dep->payload.status : EIO);
-				biodone(mbuf);
-			}
-			rv = vdc_depopulate_descriptor(vdc, idx);
-			if (buf != NULL && buf == mbuf) {
-				rv = 0;
-				goto done;
-			}
-			break;
-
-		case CB_SYNC:
-			rv = vdc_depopulate_descriptor(vdc, idx);
-			vdc->sync_op_status = ack ? rv : EIO;
-			vdc->sync_op_pending = B_FALSE;
-			cv_signal(&vdc->sync_pending_cv);
-			if (cb_type == CB_SYNC) {
-				rv = 0;
-				goto done;
-			}
-			break;
+		mbuf = ldep->buf;
+		ASSERT(mbuf != NULL);
+		mbuf->b_resid = mbuf->b_bcount - ldep->dep->payload.nbytes;
+		bioerror(mbuf, ack ? ldep->dep->payload.status : EIO);
+		biodone(mbuf);
+
+		rv = vdc_depopulate_descriptor(vdc, idx);
+		if (buf != NULL && buf == mbuf) {
+			rv = 0;
+			goto done;
 		}

 		/* if this is the last descriptor - break out of loop */
@@ -3406,7 +3462,7 @@
 			 * request then we return with an error otherwise we
 			 * have successfully completed the drain.
 			 */
-			rv = (buf != NULL || cb_type == CB_SYNC)? ESRCH: 0;
+			rv = (buf != NULL)? ESRCH: 0;
 			break;
 		}
 	}
@@ -3683,8 +3739,10 @@
 			 */
 			vdc->seq_num = 1;
 			vdc->seq_num_reply = 0;
+			vdc->io_pending = B_TRUE;
 			srvr->ldc_state = ldc_state;
 			cv_signal(&vdc->initwait_cv);
+			cv_signal(&vdc->io_pending_cv);
 		}
 	}

@@ -3719,6 +3777,9 @@
 		if (vdc->state == VDC_STATE_INIT_WAITING) {
 			vdc->state = VDC_STATE_RESETTING;
 			cv_signal(&vdc->initwait_cv);
+		} else if (vdc->state == VDC_STATE_FAILED) {
+			vdc->io_pending = B_TRUE;
+			cv_signal(&vdc->io_pending_cv);
 		}

 	}
@@ -3820,8 +3881,6 @@
 	int		b_idx;
 	int		rv = 0;
 	int		dring_size;
-	int		op;
-	vio_msg_t	vio_msg;
 	vdc_local_desc_t	*curr_ldep;

 	ASSERT(MUTEX_NOT_HELD(&vdcp->lock));
@@ -3846,85 +3905,22 @@

 		/* only resubmit outstanding transactions */
 		if (!curr_ldep->is_free) {
-			/*
-			 * If we are retrying a block read/write operation we
-			 * need to update the I/O statistics to indicate that
-			 * the request is being put back on the waitq to be
-			 * serviced (it will have been taken off after the
-			 * error was reported).
-			 */
-			mutex_enter(&vdcp->lock);
-			op = curr_ldep->operation;
-			if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) {
-				DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg);
-				VD_KSTAT_WAITQ_ENTER(vdcp);
-			}

 			DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx);
-			rv = vdc_populate_descriptor(vdcp, op,
+
+			rv = vdc_do_op(vdcp, curr_ldep->operation,
 			    curr_ldep->addr, curr_ldep->nbytes,
 			    curr_ldep->slice, curr_ldep->offset,
-			    curr_ldep->cb_type, curr_ldep->cb_arg,
-			    curr_ldep->dir);
+			    curr_ldep->buf, curr_ldep->dir,
+			    curr_ldep->flags & ~VDC_OP_STATE_RUNNING);

 			if (rv) {
-				if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
-					VD_UPDATE_ERR_STATS(vdcp, vd_transerrs);
-					VD_KSTAT_WAITQ_EXIT(vdcp);
-					DTRACE_IO1(done, buf_t *,
-					    curr_ldep->cb_arg);
-				}
-				DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n",
+				DMSG(vdcp, 1, "[%d] resubmit entry %d failed\n",
 				    vdcp->instance, b_idx);
-				mutex_exit(&vdcp->lock);
 				goto done;
 			}

 			/*
-			 * If this is a block read/write we update the I/O
-			 * statistics kstat to indicate that the request
-			 * has been sent back to the vDisk server and should
-			 * now be put on the run queue.
-			 */
-			if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) {
-				DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg);
-				VD_KSTAT_WAITQ_TO_RUNQ(vdcp);
-			}
-			mutex_exit(&vdcp->lock);
-
-			/* Wait for the response message. */
-			DMSG(vdcp, 1, "waiting for response to idx=%x\n",
-			    b_idx);
-			rv = vdc_wait_for_response(vdcp, &vio_msg);
-			if (rv) {
-				/*
-				 * If this is a block read/write we update
-				 * the I/O statistics kstat to take it
-				 * off the run queue.
-				 */
-				mutex_enter(&vdcp->lock);
-				if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
-					VD_UPDATE_ERR_STATS(vdcp, vd_transerrs);
-					VD_KSTAT_RUNQ_EXIT(vdcp);
-					DTRACE_IO1(done, buf_t *,
-					    curr_ldep->cb_arg);
-				}
-				DMSG(vdcp, 1, "[%d] wait_for_response "
-				    "returned err=%d\n", vdcp->instance,
-				    rv);
-				mutex_exit(&vdcp->lock);
-				goto done;
-			}
-
-			DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx);
-			rv = vdc_process_data_msg(vdcp, &vio_msg);
-			if (rv) {
-				DMSG(vdcp, 1, "[%d] process_data_msg "
-				    "returned err=%d\n", vdcp->instance,
-				    rv);
-				goto done;
-			}
-			/*
 			 * Mark this entry as free so that we will not resubmit
 			 * this "done" request again, if we were to use the same
 			 * backup_dring again in future. This could happen when
@@ -3978,10 +3974,7 @@
 	int		cancelled = 0;

 	ASSERT(MUTEX_HELD(&vdcp->lock));
-	ASSERT(vdcp->state == VDC_STATE_INIT ||
-	    vdcp->state == VDC_STATE_INIT_WAITING ||
-	    vdcp->state == VDC_STATE_NEGOTIATE ||
-	    vdcp->state == VDC_STATE_RESETTING);
+	ASSERT(vdcp->state == VDC_STATE_FAILED);

 	if (vdcp->local_dring_backup == NULL) {
 		/* the pending requests have already been processed */
@@ -4013,29 +4006,17 @@
 			 * requests. Now we just have to notify threads waiting
 			 * for replies that the request has failed.
 			 */
-			switch (ldep->cb_type) {
-			case CB_SYNC:
-				ASSERT(vdcp->sync_op_pending);
-				vdcp->sync_op_status = EIO;
-				vdcp->sync_op_pending = B_FALSE;
-				cv_signal(&vdcp->sync_pending_cv);
-				break;
-
-			case CB_STRATEGY:
-				bufp = ldep->cb_arg;
-				ASSERT(bufp != NULL);
-				bufp->b_resid = bufp->b_bcount;
+			bufp = ldep->buf;
+			ASSERT(bufp != NULL);
+			bufp->b_resid = bufp->b_bcount;
+			if (ldep->operation == VD_OP_BREAD ||
+			    ldep->operation == VD_OP_BWRITE) {
 				VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
 				VD_KSTAT_RUNQ_EXIT(vdcp);
 				DTRACE_IO1(done, buf_t *, bufp);
-				bioerror(bufp, EIO);
-				biodone(bufp);
-				break;
-
-			default:
-				ASSERT(0);
 			}
-
+			bioerror(bufp, EIO);
+			biodone(bufp);
 		}

 		/* get the next element to cancel */
@@ -4061,14 +4042,12 @@
  * Description:
  *	This function is invoked if the timeout set to establish the connection
  *	with vds expires. This will happen if we spend too much time in the
- *	VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we will
- *	cancel any pending request and mark them as failed.
+ *	VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states.
  *
  *	If the timeout does not expire, it will be cancelled when we reach the
- *	VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can
- *	be invoked while we are in the VDC_STATE_HANDLE_PENDING or
- *	VDC_STATE_RESETTING state in which case we do nothing because the
- *	timeout is being cancelled.
+ *	VDC_STATE_HANDLE_PENDING, VDC_STATE_FAILED or VDC_STATE_DETACH state.
+ *	This function can also be invoked while we are in those states, in
+ *	which case we do nothing because the timeout is being cancelled.
  *
  * Arguments:
  *	arg	- argument of the timeout function actually a soft state
@@ -4085,28 +4064,18 @@
 	mutex_enter(&vdcp->lock);

 	if (vdcp->state == VDC_STATE_HANDLE_PENDING ||
-	    vdcp->state == VDC_STATE_DETACH) {
+	    vdcp->state == VDC_STATE_DETACH ||
+	    vdcp->state == VDC_STATE_FAILED) {
 		/*
-		 * The connection has just been re-established or
+		 * The connection has just been re-established, has failed or
 		 * we are detaching.
 		 */
 		vdcp->ctimeout_reached = B_FALSE;
-		mutex_exit(&vdcp->lock);
-		return;
-	}
-
-	vdcp->ctimeout_reached = B_TRUE;
-
-	/* notify requests waiting for sending */
-	cv_broadcast(&vdcp->running_cv);
-
-	/* cancel requests waiting for a result */
-	vdc_cancel_backup_dring(vdcp);
+	} else {
+		vdcp->ctimeout_reached = B_TRUE;
+	}

 	mutex_exit(&vdcp->lock);
-
-	cmn_err(CE_NOTE, "[%d] connection to service domain timeout",
-	    vdcp->instance);
 }

 /*
@@ -4202,6 +4171,58 @@
 	    vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id);
 }

+static void
+vdc_print_svc_status(vdc_t *vdcp)
+{
+	int instance;
+	uint64_t ldc_id, port_id;
+	vdc_service_state_t svc_state;
+
+	ASSERT(mutex_owned(&vdcp->lock));
+
+	svc_state = vdcp->curr_server->svc_state;
+
+	if (vdcp->curr_server->log_state == svc_state)
+		return;
+
+	instance = vdcp->instance;
+	ldc_id = vdcp->curr_server->ldc_id;
+	port_id = vdcp->curr_server->id;
+
+	switch (svc_state) {
+
+	case VDC_SERVICE_OFFLINE:
+		cmn_err(CE_CONT, "?vdisk@%d is offline\n", instance);
+		break;
+
+	case VDC_SERVICE_CONNECTED:
+		cmn_err(CE_CONT, "?vdisk@%d is connected using ldc@%ld,%ld\n",
+		    instance, ldc_id, port_id);
+		break;
+
+	case VDC_SERVICE_ONLINE:
+		cmn_err(CE_CONT, "?vdisk@%d is online using ldc@%ld,%ld\n",
+		    instance, ldc_id, port_id);
+		break;
+
+	case VDC_SERVICE_FAILED:
+		cmn_err(CE_CONT, "?vdisk@%d access to service failed "
+		    "using ldc@%ld,%ld\n", instance, ldc_id, port_id);
+		break;
+
+	case VDC_SERVICE_FAULTED:
+		cmn_err(CE_CONT, "?vdisk@%d access to backend failed "
+		    "using ldc@%ld,%ld\n", instance, ldc_id, port_id);
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	vdcp->curr_server->log_state = svc_state;
+}
+
 /* -------------------------------------------------------------------------- */

 /*
@@ -4232,6 +4253,8 @@
 	int		ctimeout;
 	timeout_id_t	tmid = 0;
 	clock_t		ldcup_timeout = 0;
+	vdc_server_t	*srvr;
+	vdc_service_state_t svc_state;

 	mutex_enter(&vdcp->lock);

@@ -4243,6 +4266,8 @@
 		    Q(VDC_STATE_INIT_WAITING)
 		    Q(VDC_STATE_NEGOTIATE)
 		    Q(VDC_STATE_HANDLE_PENDING)
+		    Q(VDC_STATE_FAULTED)
+		    Q(VDC_STATE_FAILED)
 		    Q(VDC_STATE_RUNNING)
 		    Q(VDC_STATE_RESETTING)
 		    Q(VDC_STATE_DETACH)
@@ -4277,21 +4302,27 @@
 				    ctimeout * drv_usectohz(MICROSEC));
 			}

+			/* Switch to STATE_DETACH if drv is detaching */
+			if (vdcp->lifecycle == VDC_LC_DETACHING) {
+				vdcp->state = VDC_STATE_DETACH;
+				break;
+			}
+
+			/* Check if the timeout has been reached */
+			if (vdcp->ctimeout_reached) {
+				ASSERT(tmid != 0);
+				tmid = 0;
+				vdcp->state = VDC_STATE_FAILED;
+				break;
+			}
+
 			/* Check if we are re-initializing repeatedly */
 			if (vdcp->hshake_cnt > vdc_hshake_retries &&
 			    vdcp->lifecycle != VDC_LC_ONLINE) {

 				DMSG(vdcp, 0, "[%d] too many handshakes,cnt=%d",
 				    vdcp->instance, vdcp->hshake_cnt);
-				cmn_err(CE_NOTE, "[%d] disk access failed.\n",
-				    vdcp->instance);
-				vdcp->state = VDC_STATE_DETACH;
-				break;
-			}
-
-			/* Switch to STATE_DETACH if drv is detaching */
-			if (vdcp->lifecycle == VDC_LC_DETACHING) {
-				vdcp->state = VDC_STATE_DETACH;
+				vdcp->state = VDC_STATE_FAILED;
 				break;
 			}

@@ -4304,6 +4335,10 @@
 			status = vdc_start_ldc_connection(vdcp);
 			if (status != EINVAL) {
 				vdcp->state = VDC_STATE_INIT_WAITING;
+			} else {
+				vdcp->curr_server->svc_state =
+				    VDC_SERVICE_FAILED;
+				vdc_print_svc_status(vdcp);
 			}
 			break;

@@ -4315,26 +4350,23 @@
 				break;
 			}

-			/* check if only one server exists */
-			if (vdcp->num_servers == 1) {
-				cv_wait(&vdcp->initwait_cv, &vdcp->lock);
-			} else {
-				/*
-				 * wait for LDC_UP, if it times out, switch
-				 * to another server.
-				 */
-				ldcup_timeout = ddi_get_lbolt() +
-				    (vdc_ldcup_timeout *
-				    drv_usectohz(MICROSEC));
-				status = cv_timedwait(&vdcp->initwait_cv,
-				    &vdcp->lock, ldcup_timeout);
-				if (status == -1 &&
-				    vdcp->state == VDC_STATE_INIT_WAITING &&
-				    vdcp->curr_server->ldc_state != LDC_UP) {
-					/* timed out & still waiting */
-					vdcp->state = VDC_STATE_INIT;
-					break;
-				}
+			/*
+			 * Wait for LDC_UP. If it times out and we have multiple
+			 * servers then we will retry using a different server.
+			 */
+			ldcup_timeout = ddi_get_lbolt() + (vdc_ldcup_timeout *
+			    drv_usectohz(MICROSEC));
+			status = cv_timedwait(&vdcp->initwait_cv, &vdcp->lock,
+			    ldcup_timeout);
+			if (status == -1 &&
+			    vdcp->state == VDC_STATE_INIT_WAITING &&
+			    vdcp->curr_server->ldc_state != LDC_UP) {
+				/* timed out & still waiting */
+				vdcp->curr_server->svc_state =
+				    VDC_SERVICE_FAILED;
+				vdc_print_svc_status(vdcp);
+				vdcp->state = VDC_STATE_INIT;
+				break;
 			}

 			if (vdcp->state != VDC_STATE_INIT_WAITING) {
@@ -4386,6 +4418,8 @@
 			    status);
 			vdcp->state = VDC_STATE_RESETTING;
 			vdcp->self_reset = B_TRUE;
+			vdcp->curr_server->svc_state = VDC_SERVICE_FAILED;
+			vdc_print_svc_status(vdcp);
 done:
 			DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n",
 			    vdcp->state);
@@ -4393,36 +4427,121 @@

 		case VDC_STATE_HANDLE_PENDING:

-			if (vdcp->ctimeout_reached) {
-				/*
-				 * The connection timeout had been reached so
-				 * pending requests have been cancelled. Now
-				 * that the connection is back we can reset
-				 * the timeout.
-				 */
-				ASSERT(vdcp->local_dring_backup == NULL);
-				ASSERT(tmid != 0);
+			DMSG(vdcp, 0, "[%d] connection to service domain is up",
+			    vdcp->instance);
+			vdcp->curr_server->svc_state = VDC_SERVICE_CONNECTED;
+
+			mutex_exit(&vdcp->lock);
+
+			/*
+			 * If we have multiple servers, check that the backend
+			 * is effectively available before resubmitting any IO.
+			 */
+			if (vdcp->num_servers > 1 &&
+			    vdc_eio_check(vdcp, 0) != 0) {
+				mutex_enter(&vdcp->lock);
+				vdcp->curr_server->svc_state =
+				    VDC_SERVICE_FAULTED;
+				vdcp->state = VDC_STATE_FAULTED;
+				break;
+			}
+
+			if (tmid != 0) {
+				(void) untimeout(tmid);
 				tmid = 0;
 				vdcp->ctimeout_reached = B_FALSE;
+			}
+
+			/*
+			 * Setup devid
+			 */
+			(void) vdc_setup_devid(vdcp);
+
+			status = vdc_resubmit_backup_dring(vdcp);
+
+			mutex_enter(&vdcp->lock);
+
+			if (status) {
+				vdcp->state = VDC_STATE_RESETTING;
+				vdcp->self_reset = B_TRUE;
+				vdcp->curr_server->svc_state =
+				    VDC_SERVICE_FAILED;
+				vdc_print_svc_status(vdcp);
+			} else {
 				vdcp->state = VDC_STATE_RUNNING;
-				DMSG(vdcp, 0, "[%d] connection to service "
-				    "domain is up", vdcp->instance);
-				break;
+			}
+			break;
+
+		case VDC_STATE_FAULTED:
+			/*
+			 * Server is faulted because the backend is unavailable.
+			 * If all servers are faulted then we mark the service
+			 * as failed, otherwise we reset to switch to another
+			 * server.
+			 */
+			vdc_print_svc_status(vdcp);
+
+			/* check if all servers are faulted */
+			for (srvr = vdcp->server_list; srvr != NULL;
+			    srvr = srvr->next) {
+				svc_state = srvr->svc_state;
+				if (svc_state != VDC_SERVICE_FAULTED)
+					break;
 			}

-			mutex_exit(&vdcp->lock);
+			if (srvr != NULL) {
+				vdcp->state = VDC_STATE_RESETTING;
+				vdcp->self_reset = B_TRUE;
+			} else {
+				vdcp->state = VDC_STATE_FAILED;
+			}
+			break;
+
+		case VDC_STATE_FAILED:
+			/*
+			 * We reach this state when we are unable to access the
+			 * backend from any server, either because of a maximum
+			 * connection retries or timeout, or because the backend
+			 * is unavailable.
+			 *
+			 * Then we cancel the backup DRing so that errors get
+			 * reported and we wait for a new I/O before attempting
+			 * another connection.
+			 */
+			cmn_err(CE_NOTE, "vdisk@%d disk access failed",
+			    vdcp->instance);
+
+			/* cancel any timeout */
 			if (tmid != 0) {
 				(void) untimeout(tmid);
 				tmid = 0;
 			}
-			status = vdc_resubmit_backup_dring(vdcp);
-			mutex_enter(&vdcp->lock);
-
-			if (status)
-				vdcp->state = VDC_STATE_RESETTING;
-			else
-				vdcp->state = VDC_STATE_RUNNING;
-
+
+			/* cancel pending I/Os */
+			cv_broadcast(&vdcp->running_cv);
+			vdc_cancel_backup_dring(vdcp);
+
+			/* wait for new I/O */
+			while (!vdcp->io_pending)
+				cv_wait(&vdcp->io_pending_cv, &vdcp->lock);
+
+			/*
+			 * There's a new IO pending. Try to re-establish a
+			 * connection. Mark all services as offline, so that
+			 * we don't stop again before having retried all
+			 * servers.
+			 */
+			for (srvr = vdcp->server_list; srvr != NULL;
+			    srvr = srvr->next) {
+				srvr->svc_state = VDC_SERVICE_OFFLINE;
+			}
+
+			/* reset variables */
+			vdcp->hshake_cnt = 0;
+			vdcp->ctimeout_reached = B_FALSE;
+
+			vdcp->state = VDC_STATE_RESETTING;
+			vdcp->self_reset = B_TRUE;
 			break;

 		/* enter running state */
@@ -4434,17 +4553,18 @@
 			vdcp->hshake_cnt = 0;
 			cv_broadcast(&vdcp->running_cv);

-			/* failfast has to been checked after reset */
-			cv_signal(&vdcp->failfast_cv);
+			/* backend has to be checked after reset */
+			if (vdcp->failfast_interval != 0 ||
+			    vdcp->num_servers > 1)
+				cv_signal(&vdcp->eio_cv);

 			/* ownership is lost during reset */
 			if (vdcp->ownership & VDC_OWNERSHIP_WANTED)
 				vdcp->ownership |= VDC_OWNERSHIP_RESET;
 			cv_signal(&vdcp->ownership_cv);

-			cmn_err(CE_CONT, "?vdisk@%d is online using "
-			    "ldc@%ld,%ld\n", vdcp->instance,
-			    vdcp->curr_server->ldc_id, vdcp->curr_server->id);
+			vdcp->curr_server->svc_state = VDC_SERVICE_ONLINE;
+			vdc_print_svc_status(vdcp);

 			mutex_exit(&vdcp->lock);

@@ -4467,8 +4587,14 @@

 			mutex_enter(&vdcp->lock);

-			cmn_err(CE_CONT, "?vdisk@%d is offline\n",
-			    vdcp->instance);
+			/* all servers are now offline */
+			for (srvr = vdcp->server_list; srvr != NULL;
+			    srvr = srvr->next) {
+				srvr->svc_state = VDC_SERVICE_OFFLINE;
+				srvr->log_state = VDC_SERVICE_NONE;
+			}
+
+			vdc_print_svc_status(vdcp);

 			vdcp->state = VDC_STATE_RESETTING;
 			vdcp->self_reset = B_TRUE;
@@ -4516,6 +4642,13 @@
 			ASSERT(vdcp->read_state != VDC_READ_WAITING);

 			vdcp->read_state = VDC_READ_IDLE;
+			vdcp->io_pending = B_FALSE;
+
+			/*
+			 * Cleanup any pending eio. These I/Os are going to
+			 * be resubmitted.
+			 */
+			vdc_eio_unqueue(vdcp, 0, B_FALSE);

 			vdc_backup_local_dring(vdcp);

@@ -4545,9 +4678,8 @@
 			 */
 			cv_broadcast(&vdcp->running_cv);

-			while (vdcp->sync_op_pending) {
-				cv_signal(&vdcp->sync_pending_cv);
-				cv_signal(&vdcp->sync_blocked_cv);
+			while (vdcp->sync_op_cnt > 0) {
+				cv_broadcast(&vdcp->sync_blocked_cv);
 				mutex_exit(&vdcp->lock);
 				/* give the waiters enough time to wake up */
 				delay(vdc_hz_min_ldc_delay);
@@ -4659,7 +4791,7 @@
 		ldep = &vdcp->local_dring[idx];
 		op = ldep->operation;
 		if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) {
-			DTRACE_IO1(done, buf_t *, ldep->cb_arg);
+			DTRACE_IO1(done, buf_t *, ldep->buf);
 			VD_KSTAT_RUNQ_EXIT(vdcp);
 		}
 		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
@@ -4684,62 +4816,57 @@

 	ldep = &vdcp->local_dring[idx];

-	DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n",
-	    ldep->dep->hdr.dstate, ldep->cb_type);
+	DMSG(vdcp, 1, ": state 0x%x\n", ldep->dep->hdr.dstate);

 	if (ldep->dep->hdr.dstate == VIO_DESC_DONE) {
 		struct buf *bufp;

-		switch (ldep->cb_type) {
-		case CB_SYNC:
-			ASSERT(vdcp->sync_op_pending);
-
-			status = vdc_depopulate_descriptor(vdcp, idx);
-			vdcp->sync_op_status = status;
-			vdcp->sync_op_pending = B_FALSE;
-			cv_signal(&vdcp->sync_pending_cv);
-			break;
-
-		case CB_STRATEGY:
-			bufp = ldep->cb_arg;
-			ASSERT(bufp != NULL);
-			bufp->b_resid =
-			    bufp->b_bcount - ldep->dep->payload.nbytes;
-			status = ldep->dep->payload.status; /* Future:ntoh */
-			if (status != 0) {
-				DMSG(vdcp, 1, "strategy status=%d\n", status);
-				VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
-				bioerror(bufp, status);
-			}
-
-			(void) vdc_depopulate_descriptor(vdcp, idx);
-
-			DMSG(vdcp, 1,
-			    "strategy complete req=%ld bytes resp=%ld bytes\n",
-			    bufp->b_bcount, ldep->dep->payload.nbytes);
-
-			if (status != 0 && vdcp->failfast_interval != 0) {
-				/*
-				 * The I/O has failed and failfast is enabled.
-				 * We need the failfast thread to check if the
-				 * failure is due to a reservation conflict.
-				 */
-				(void) vdc_failfast_io_queue(vdcp, bufp);
-			} else {
+		status = ldep->dep->payload.status;
+
+		bufp = ldep->buf;
+		ASSERT(bufp != NULL);
+
+		bufp->b_resid = bufp->b_bcount - ldep->dep->payload.nbytes;
+		bioerror(bufp, status);
+
+		if (status != 0) {
+			DMSG(vdcp, 1, "I/O status=%d\n", status);
+		}
+
+		DMSG(vdcp, 1,
+		    "I/O complete req=%ld bytes resp=%ld bytes\n",
+		    bufp->b_bcount, ldep->dep->payload.nbytes);
+
+		/*
+		 * If the request has failed and we have multiple servers or
+		 * failfast is enabled then we will have to defer the completion
+		 * of the request until we have checked that the vdisk backend
+		 * is effectively available (if multiple server) or that there
+		 * is no reservation conflict (if failfast).
+		 */
+		if ((status != 0 &&
+		    (vdcp->num_servers > 1 &&
+		    (ldep->flags & VDC_OP_ERRCHK_BACKEND)) ||
+		    (vdcp->failfast_interval != 0 &&
+		    (ldep->flags & VDC_OP_ERRCHK_CONFLICT)))) {
+			/*
+			 * The I/O has failed and we need to check the error.
+			 */
+			(void) vdc_eio_queue(vdcp, idx);
+		} else {
+			op = ldep->operation;
+			if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
 				if (status == 0) {
-					op = (bufp->b_flags & B_READ) ?
-					    VD_OP_BREAD : VD_OP_BWRITE;
 					VD_UPDATE_IO_STATS(vdcp, op,
 					    ldep->dep->payload.nbytes);
+				} else {
+					VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
 				}
 				VD_KSTAT_RUNQ_EXIT(vdcp);
 				DTRACE_IO1(done, buf_t *, bufp);
-				biodone(bufp);
 			}
-			break;
-
-		default:
-			ASSERT(0);
+			(void) vdc_depopulate_descriptor(vdcp, idx);
+			biodone(bufp);
 		}
 	}

@@ -4858,6 +4985,7 @@
 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
 {
 	int status = 0;
+	vd_disk_type_t old_type;

 	ASSERT(vdc != NULL);
 	ASSERT(mutex_owned(&vdc->lock));
@@ -4902,6 +5030,7 @@
 		}

 		/* update disk, block and transfer sizes */
+		old_type = vdc->vdisk_type;
 		vdc_update_size(vdc, attr_msg->vdisk_size,
 		    attr_msg->vdisk_block_size, attr_msg->max_xfer_sz);
 		vdc->vdisk_type = attr_msg->vdisk_type;
@@ -4932,6 +5061,25 @@
 		 * fake geometry for the disk.
 		 */
 		vdc_create_fake_geometry(vdc);
+
+		/*
+		 * If the disk type was previously unknown and device nodes
+		 * were created then the driver would have created 8 device
+		 * nodes. If we now find out that this is a single-slice disk
+		 * then we need to re-create the appropriate device nodes.
+		 */
+		if (old_type == VD_DISK_TYPE_UNK &&
+		    (vdc->initialized & VDC_MINOR) &&
+		    vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
+			ddi_remove_minor_node(vdc->dip, NULL);
+			(void) devfs_clean(ddi_get_parent(vdc->dip),
+			    NULL, DV_CLEAN_FORCE);
+			if (vdc_create_device_nodes(vdc) != 0) {
+				DMSG(vdc, 0, "![%d] Failed to update "
+				    "device nodes", vdc->instance);
+			}
+		}
+
 		break;

 	case VIO_SUBTYPE_NACK:
@@ -5183,7 +5331,7 @@
 	ASSERT(vdc != NULL);

 	rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0,
-	    VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
+	    VDCPART(dk_arg->dev), 0, VIO_both_dir, B_TRUE);
 	if (rv != 0) {
 		DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n",
 		    vdc->instance, rv,
@@ -5599,8 +5747,8 @@
 	/* a uscsi reset is converted to a VD_OP_RESET operation */
 	if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN |
 	    USCSI_RESET_ALL)) {
-		rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC,
-		    (void *)(uint64_t)mode, VIO_both_dir, B_TRUE);
+		rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0,
+		    VIO_both_dir, B_TRUE);
 		return (rv);
 	}

@@ -5677,7 +5825,7 @@

 	/* submit the request */
 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
-	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
+	    0, 0, VIO_both_dir, B_FALSE);

 	if (rv != 0)
 		goto done;
@@ -5871,7 +6019,7 @@

 	/* submit the request */
 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
-	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
+	    0, 0, VIO_both_dir, B_FALSE);

 	if (rv != 0)
 		goto done;
@@ -5985,7 +6133,7 @@

 	/* submit the request */
 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
-	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
+	    0, 0, VIO_both_dir, B_FALSE);

 	if (rv != 0)
 		goto done;
@@ -6090,7 +6238,7 @@

 	/* submit the request */
 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
-	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
+	    0, 0, VIO_both_dir, B_FALSE);

 	if (rv == 0)
 		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
@@ -6131,7 +6279,7 @@

 	/* submit the request */
 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
-	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
+	    0, 0, VIO_both_dir, B_FALSE);

 	if (rv == 0)
 		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
@@ -6176,7 +6324,7 @@

 	/* submit the request */
 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
-	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
+	    0, 0, VIO_both_dir, B_FALSE);

 	if (rv == 0)
 		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
@@ -6215,7 +6363,7 @@

 	/* submit the request */
 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
-	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
+	    0, 0, VIO_both_dir, B_FALSE);

 	if (rv == 0)
 		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
@@ -6225,11 +6373,10 @@
 }

 /*
- * This function is used by the failfast mechanism to send a SCSI command
- * to check for reservation conflict.
+ * This function is used to send a (simple) SCSI command and check errors.
  */
 static int
-vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd)
+vdc_eio_scsi_cmd(vdc_t *vdc, uchar_t scmd, int flags)
 {
 	int cdb_len, sense_len, vd_scsi_len;
 	vd_scsi_t *vd_scsi;
@@ -6254,103 +6401,177 @@
 	vd_scsi->timeout = vdc_scsi_timeout;

 	/*
-	 * Submit the request. The last argument has to be B_FALSE so that
-	 * vdc_do_sync_op does not loop checking for reservation conflict if
-	 * the operation returns an error.
+	 * Submit the request. Note the operation should not request that any
+	 * error is checked because this function is precisely called when
+	 * checking errors.
 	 */
-	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
-	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE);
+	ASSERT((flags & VDC_OP_ERRCHK) == 0);
+
+	rv = vdc_do_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
+	    0, 0, NULL, VIO_both_dir, flags);

 	if (rv == 0)
-		(void) vdc_scsi_status(vdc, vd_scsi, B_FALSE);
+		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);

 	kmem_free(vd_scsi, vd_scsi_len);
 	return (rv);
 }

 /*
- * This function is used by the failfast mechanism to check for reservation
- * conflict. It sends some SCSI commands which will fail with a reservation
- * conflict error if the system does not have access to the disk and this
- * will panic the system.
+ * This function is used to check if a SCSI backend is accessible. It will
+ * also detect reservation conflict if failfast is enabled, and panic the
+ * system in that case.
  *
  * Returned Code:
- *	0	- disk is accessible without reservation conflict error
- *	!= 0	- unable to check if disk is accessible
+ *	0	- disk is accessible
+ *	!= 0	- disk is inaccessible or unable to check if disk is accessible
  */
-int
-vdc_failfast_check_resv(vdc_t *vdc)
+static int
+vdc_eio_scsi_check(vdc_t *vdc, int flags)
 {
 	int failure = 0;
+	int rv;

 	/*
 	 * Send a TEST UNIT READY command. The command will panic
-	 * the system if it fails with a reservation conflict.
+	 * the system if it fails with a reservation conflict and
+	 * failfast is enabled. If there is a reservation conflict
+	 * and failfast is not enabled then the function will return
+	 * EACCES. In that case, there's no problem with accessing
+	 * the backend, it is just reserved.
 	 */
-	if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0)
+	rv = vdc_eio_scsi_cmd(vdc, SCMD_TEST_UNIT_READY, flags);
+	if (rv != 0 && rv != EACCES)
 		failure++;

+	/* we don't need to do more checking if failfast is not enabled */
+	if (vdc->failfast_interval == 0)
+		return (failure);
+
 	/*
 	 * With SPC-3 compliant devices TEST UNIT READY will succeed on
 	 * a reserved device, so we also do a WRITE(10) of zero byte in
 	 * order to provoke a Reservation Conflict status on those newer
 	 * devices.
 	 */
-	if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0)
+	if (vdc_eio_scsi_cmd(vdc, SCMD_WRITE_G1, flags) != 0)
 		failure++;

 	return (failure);
 }

 /*
- * Add a pending I/O to the failfast I/O queue. An I/O is added to this
- * queue when it has failed and failfast is enabled. Then we have to check
- * if it has failed because of a reservation conflict in which case we have
- * to panic the system.
- *
- * Async I/O should be queued with their block I/O data transfer structure
- * (buf). Sync I/O should be queued with buf = NULL.
+ * This function is used to check if a backend is effectively accessible.
+ *
+ * Returned Code:
+ *	0	- disk is accessible
+ *	!= 0	- disk is inaccessible or unable to check if disk is accessible
+ */
+static int
+vdc_eio_check(vdc_t *vdc, int flags)
+{
+	char *buffer;
+	diskaddr_t blkno;
+	int rv;
+
+	ASSERT((flags & VDC_OP_ERRCHK) == 0);
+
+	if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD))
+		return (vdc_eio_scsi_check(vdc, flags));
+
+	ASSERT(vdc->failfast_interval == 0);
+
+	/*
+	 * If the backend does not support SCSI operations then we simply
+	 * check if the backend is accessible by reading some data blocks.
+	 * We first try to read a random block, to try to avoid getting
+	 * a block that might have been cached on the service domain. Then
+	 * we try the last block, and finally the first block.
+	 *
+	 * We return success as soon as we are able to read any block.
+	 */
+	buffer = kmem_alloc(vdc->vdisk_bsize, KM_SLEEP);
+
+	if (vdc->vdisk_size > 0) {
+
+		/* try a random block */
+		(void) random_get_pseudo_bytes((uint8_t *)&blkno,
+		    sizeof (diskaddr_t));
+		blkno = blkno % vdc->vdisk_size;
+		rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer,
+		    vdc->vdisk_bsize, VD_SLICE_NONE, blkno, NULL,
+		    VIO_read_dir, flags);
+
+		if (rv == 0)
+			goto done;
+
+		/* try the last block */
+		blkno = vdc->vdisk_size - 1;
+		rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer,
+		    vdc->vdisk_bsize, VD_SLICE_NONE, blkno, NULL,
+		    VIO_read_dir, flags);
+
+		if (rv == 0)
+			goto done;
+	}
+
+	/* try block 0 */
+	blkno = 0;
+	rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer, vdc->vdisk_bsize,
+	    VD_SLICE_NONE, blkno, NULL, VIO_read_dir, flags);
+
+done:
+	kmem_free(buffer, vdc->vdisk_bsize);
+	return (rv);
+}
+
+/*
+ * Add a pending I/O to the eio queue. An I/O is added to this queue
+ * when it has failed and failfast is enabled or the vdisk has multiple
+ * servers. It will then be handled by the eio thread (vdc_eio_thread).
+ * The eio queue is ordered starting with the most recent I/O added.
  */
 static vdc_io_t *
-vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf)
+vdc_eio_queue(vdc_t *vdc, int index)
 {
 	vdc_io_t *vio;

 	ASSERT(MUTEX_HELD(&vdc->lock));

 	vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP);
-	vio->vio_next = vdc->failfast_io_queue;
-	vio->vio_buf = buf;
+	vio->vio_next = vdc->eio_queue;
+	vio->vio_index = index;
 	vio->vio_qtime = ddi_get_lbolt();

-	vdc->failfast_io_queue = vio;
-
-	/* notify the failfast thread that a new I/O is queued */
-	cv_signal(&vdc->failfast_cv);
+	vdc->eio_queue = vio;
+
+	/* notify the eio thread that a new I/O is queued */
+	cv_signal(&vdc->eio_cv);

 	return (vio);
 }

 /*
- * Remove and complete I/O in the failfast I/O queue which have been
- * added after the indicated deadline. A deadline of 0 means that all
- * I/O have to be unqueued and marked as completed.
+ * Remove I/Os added before the indicated deadline from the eio queue. A
+ * deadline of 0 means that all I/Os have to be unqueued. The complete_io
+ * boolean specifies if unqueued I/Os should be marked as completed or not.
  */
 static void
-vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline)
-{
+vdc_eio_unqueue(vdc_t *vdc, clock_t deadline, boolean_t complete_io)
+{
+	struct buf *buf;
 	vdc_io_t *vio, *vio_tmp;
+	int index, op;

 	ASSERT(MUTEX_HELD(&vdc->lock));

 	vio_tmp = NULL;
-	vio = vdc->failfast_io_queue;
+	vio = vdc->eio_queue;

 	if (deadline != 0) {
 		/*
-		 * Skip any io queued after the deadline. The failfast
-		 * I/O queue is ordered starting with the last I/O added
-		 * to the queue.
+		 * Skip any io queued after the deadline. The eio queue is
+		 * ordered starting with the last I/O added to the queue.
 		 */
 		while (vio != NULL && vio->vio_qtime > deadline) {
 			vio_tmp = vio;
@@ -6364,53 +6585,54 @@

 	/* update the queue */
 	if (vio_tmp == NULL)
-		vdc->failfast_io_queue = NULL;
+		vdc->eio_queue = NULL;
 	else
 		vio_tmp->vio_next = NULL;

 	/*
-	 * Complete unqueued I/O. Async I/O have a block I/O data transfer
-	 * structure (buf) and they are completed by calling biodone(). Sync
-	 * I/O do not have a buf and they are completed by setting the
-	 * vio_qtime to zero and signaling failfast_io_cv. In that case, the
-	 * thread waiting for the I/O to complete is responsible for freeing
-	 * the vio structure.
+	 * Free and complete unqueued I/Os if this was requested. All I/Os
+	 * have a block I/O data transfer structure (buf) and they are
+	 * completed by calling biodone().
 	 */
 	while (vio != NULL) {
 		vio_tmp = vio->vio_next;
-		if (vio->vio_buf != NULL) {
-			VD_KSTAT_RUNQ_EXIT(vdc);
-			DTRACE_IO1(done, buf_t *, vio->vio_buf);
-			biodone(vio->vio_buf);
-			kmem_free(vio, sizeof (vdc_io_t));
-		} else {
-			vio->vio_qtime = 0;
+
+		if (complete_io) {
+			index = vio->vio_index;
+			op = vdc->local_dring[index].operation;
+			buf = vdc->local_dring[index].buf;
+			(void) vdc_depopulate_descriptor(vdc, index);
+			ASSERT(buf->b_flags & B_ERROR);
+			if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
+				VD_UPDATE_ERR_STATS(vdc, vd_softerrs);
+				VD_KSTAT_RUNQ_EXIT(vdc);
+				DTRACE_IO1(done, buf_t *, buf);
+			}
+			biodone(buf);
 		}
+
+		kmem_free(vio, sizeof (vdc_io_t));
 		vio = vio_tmp;
 	}
-
-	cv_broadcast(&vdc->failfast_io_cv);
 }

 /*
- * Failfast Thread.
- *
- * While failfast is enabled, the failfast thread sends a TEST UNIT READY
+ * Error I/O Thread.  There is one eio thread for each virtual disk that
+ * has multiple servers or for which failfast is enabled. Failfast can only
+ * be enabled for vdisk supporting SCSI commands.
+ *
+ * While failfast is enabled, the eio thread sends a TEST UNIT READY
  * and a zero size WRITE(10) SCSI commands on a regular basis to check that
  * we still have access to the disk. If a command fails with a RESERVATION
  * CONFLICT error then the system will immediatly panic.
  *
- * The failfast thread is also woken up when an I/O has failed. It then check
+ * The eio thread is also woken up when an I/O has failed. It then checks
  * the access to the disk to ensure that the I/O failure was not due to a
- * reservation conflict.
- *
- * There is one failfast thread for each virtual disk for which failfast is
- * enabled. We could have only one thread sending requests for all disks but
- * this would need vdc to send asynchronous requests and to have callbacks to
- * process replies.
+ * reservation conflict or to the backend been inaccessible.
+ *
  */
 static void
-vdc_failfast_thread(void *arg)
+vdc_eio_thread(void *arg)
 {
 	int status;
 	vdc_t *vdc = (vdc_t *)arg;
@@ -6418,45 +6640,74 @@

 	mutex_enter(&vdc->lock);

-	while (vdc->failfast_interval != 0) {
+	while (vdc->failfast_interval != 0 || vdc->num_servers > 1) {
+		/*
+		 * Wait if there is nothing in the eio queue or if the state
+		 * is not VDC_STATE_RUNNING.
+		 */
+		if (vdc->eio_queue == NULL || vdc->state != VDC_STATE_RUNNING) {
+			if (vdc->failfast_interval != 0) {
+				timeout = ddi_get_lbolt() +
+				    drv_usectohz(vdc->failfast_interval);
+				(void) cv_timedwait(&vdc->eio_cv, &vdc->lock,
+				    timeout);
+			} else {
+				ASSERT(vdc->num_servers > 1);
+				(void) cv_wait(&vdc->eio_cv, &vdc->lock);
+			}
+
+			if (vdc->state != VDC_STATE_RUNNING)
+				continue;
+		}
+
+		mutex_exit(&vdc->lock);

 		starttime = ddi_get_lbolt();

-		mutex_exit(&vdc->lock);
-
-		/* check for reservation conflict */
-		status = vdc_failfast_check_resv(vdc);
+		/* check error */
+		status = vdc_eio_check(vdc, VDC_OP_STATE_RUNNING);

 		mutex_enter(&vdc->lock);
 		/*
-		 * We have dropped the lock to send the SCSI command so we have
-		 * to check that failfast is still enabled.
+		 * We have dropped the lock to check the backend so we have
+		 * to check that the eio thread is still enabled.
 		 */
-		if (vdc->failfast_interval == 0)
+		if (vdc->failfast_interval == 0 && vdc->num_servers <= 1)
 			break;

 		/*
-		 * If we have successfully check the disk access and there was
-		 * no reservation conflict then we can complete any I/O queued
-		 * before the last check.
+		 * If the eio queue is empty or we are not in running state
+		 * anymore then there is nothing to do.
 		 */
-		if (status == 0)
-			vdc_failfast_io_unqueue(vdc, starttime);
-
-		/* proceed again if some I/O are still in the queue */
-		if (vdc->failfast_io_queue != NULL)
+		if (vdc->state != VDC_STATE_RUNNING || vdc->eio_queue == NULL)
 			continue;

-		timeout = ddi_get_lbolt() +
-		    drv_usectohz(vdc->failfast_interval);
-		(void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout);
+		if (status == 0) {
+			/*
+			 * The backend access has been successfully checked,
+			 * we can complete any I/O queued before the last check.
+			 */
+			vdc_eio_unqueue(vdc, starttime, B_TRUE);
+
+		} else if (vdc->num_servers > 1) {
+			/*
+			 * The backend is inaccessible for a disk with multiple
+			 * servers. So we force a reset to switch to another
+			 * server. The reset will also clear the eio queue and
+			 * resubmit all pending I/Os.
+			 */
+			mutex_enter(&vdc->read_lock);
+			vdc->read_state = VDC_READ_RESET;
+			cv_signal(&vdc->read_cv);
+			mutex_exit(&vdc->read_lock);
+		}
 	}

 	/*
-	 * Failfast is being stop so we can complete any queued I/O.
+	 * The thread is being stopped so we can complete any queued I/O.
 	 */
-	vdc_failfast_io_unqueue(vdc, 0);
-	vdc->failfast_thread = NULL;
+	vdc_eio_unqueue(vdc, 0, B_TRUE);
+	vdc->eio_thread = NULL;
 	mutex_exit(&vdc->lock);
 	thread_exit();
 }
@@ -6473,14 +6724,14 @@
 		return (EFAULT);

 	mutex_enter(&vdc->lock);
-	if (mh_time != 0 && vdc->failfast_thread == NULL) {
-		vdc->failfast_thread = thread_create(NULL, 0,
-		    vdc_failfast_thread, vdc, 0, &p0, TS_RUN,
+	if (mh_time != 0 && vdc->eio_thread == NULL) {
+		vdc->eio_thread = thread_create(NULL, 0,
+		    vdc_eio_thread, vdc, 0, &p0, TS_RUN,
 		    v.v_maxsyspri - 2);
 	}

-	vdc->failfast_interval = mh_time * 1000;
-	cv_signal(&vdc->failfast_cv);
+	vdc->failfast_interval = ((long)mh_time) * MILLISEC;
+	cv_signal(&vdc->eio_cv);
 	mutex_exit(&vdc->lock);

 	return (0);
@@ -6491,14 +6742,13 @@
  * converted to VD_OP_SET_ACCESS operations.
  */
 static int
-vdc_access_set(vdc_t *vdc, uint64_t flags, int mode)
+vdc_access_set(vdc_t *vdc, uint64_t flags)
 {
 	int rv;

 	/* submit owership command request */
 	rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags,
-	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
-	    VIO_both_dir, B_TRUE);
+	    sizeof (uint64_t), 0, 0, VIO_both_dir, B_TRUE);

 	return (rv);
 }
@@ -6508,14 +6758,13 @@
  * VD_OP_GET_ACCESS operation.
  */
 static int
-vdc_access_get(vdc_t *vdc, uint64_t *status, int mode)
+vdc_access_get(vdc_t *vdc, uint64_t *status)
 {
 	int rv;

 	/* submit owership command request */
 	rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status,
-	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
-	    VIO_both_dir, B_TRUE);
+	    sizeof (uint64_t), 0, 0, VIO_both_dir, B_TRUE);

 	return (rv);
 }
@@ -6560,7 +6809,7 @@
 			mutex_exit(&vdc->lock);

 			status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
-			    VD_ACCESS_SET_PRESERVE, FKIOCTL);
+			    VD_ACCESS_SET_PRESERVE);

 			mutex_enter(&vdc->lock);

@@ -6645,7 +6894,7 @@
 	vd_cap = kmem_zalloc(alloc_len, KM_SLEEP);

 	rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len,
-	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE);
+	    0, 0, VIO_both_dir, B_TRUE);

 	*dsk_size = vd_cap->vdisk_size;
 	*blk_size = vd_cap->vdisk_block_size;
@@ -6940,7 +7189,7 @@
 		vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED);

 		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
-		    VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode);
+		    VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE);
 		if (rv == 0) {
 			vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED |
 			    VDC_OWNERSHIP_GRANTED);
@@ -6954,7 +7203,7 @@
 	case MHIOCRELEASE:
 	{
 		mutex_enter(&vdc->ownership_lock);
-		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode);
+		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR);
 		if (rv == 0) {
 			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
 		}
@@ -6966,7 +7215,7 @@
 	{
 		uint64_t status;

-		rv = vdc_access_get(vdc, &status, mode);
+		rv = vdc_access_get(vdc, &status);
 		if (rv == 0 && rvalp != NULL)
 			*rvalp = (status & VD_ACCESS_ALLOWED)? 0 : 1;
 		return (rv);
@@ -6974,7 +7223,7 @@

 	case MHIOCQRESERVE:
 	{
-		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode);
+		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE);
 		return (rv);
 	}

@@ -7152,8 +7401,7 @@
 	 * send request to vds to service the ioctl.
 	 */
 	rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len,
-	    VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode,
-	    VIO_both_dir, B_TRUE);
+	    VDCPART(dev), 0, VIO_both_dir, B_TRUE);

 	if (rv != 0) {
 		/*
@@ -7833,7 +8081,6 @@
 static int
 vdc_validate_geometry(vdc_t *vdc)
 {
-	buf_t	*buf;	/* BREAD requests need to be in a buf_t structure */
 	dev_t	dev;
 	int	rv, rval;
 	struct dk_label *label;
@@ -7960,27 +8207,9 @@
 	 * Read disk label from start of disk
 	 */
 	label = kmem_alloc(vdc->vdisk_bsize, KM_SLEEP);
-	buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
-	bioinit(buf);
-	buf->b_un.b_addr = (caddr_t)label;
-	buf->b_bcount = vdc->vdisk_bsize;
-	buf->b_flags = B_BUSY | B_READ;
-	buf->b_dev = cmpdev(dev);
-	rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)label,
-	    vdc->vdisk_bsize, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir);
-	if (rv) {
-		DMSG(vdc, 1, "[%d] Failed to read disk block 0\n",
-		    vdc->instance);
-	} else if (ddi_in_panic()) {
-		rv = vdc_drain_response(vdc, CB_STRATEGY, buf);
-		if (rv == 0) {
-			rv = geterror(buf);
-		}
-	} else {
-		rv = biowait(buf);
-	}
-	biofini(buf);
-	kmem_free(buf, sizeof (buf_t));
+
+	rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)label, vdc->vdisk_bsize,
+	    VD_SLICE_NONE, 0, NULL, VIO_read_dir, VDC_OP_NORMAL);

 	if (rv != 0 || label->dkl_magic != DKL_MAGIC ||
 	    label->dkl_cksum != vdc_lbl2cksum(label)) {
@@ -8031,7 +8260,8 @@
 	(void) vdc_validate_geometry(vdc);

 	/* if the disk label has changed, update device nodes */
-	if (vdc->vdisk_label != old_label) {
+	if (vdc->vdisk_type == VD_DISK_TYPE_DISK &&
+	    vdc->vdisk_label != old_label) {

 		if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
 			rv = vdc_create_device_nodes_efi(vdc);
@@ -8082,6 +8312,8 @@
 	int rv;
 	vd_devid_t *vd_devid;
 	size_t bufsize, bufid_len;
+	ddi_devid_t vdisk_devid;
+	char *devid_str;

 	/*
 	 * At first sight, we don't know the size of the devid that the
@@ -8096,10 +8328,10 @@
 	vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
 	bufid_len = bufsize - sizeof (vd_efi_t) - 1;

-	rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
-	    bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
-
-	DMSG(vdc, 2, "sync_op returned %d\n", rv);
+	rv = vdc_do_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
+	    bufsize, 0, 0, NULL, VIO_both_dir, 0);
+
+	DMSG(vdc, 2, "do_op returned %d\n", rv);

 	if (rv) {
 		kmem_free(vd_devid, bufsize);
@@ -8117,9 +8349,8 @@
 		vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
 		bufid_len = bufsize - sizeof (vd_efi_t) - 1;

-		rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID,
-		    (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0,
-		    VIO_both_dir, B_TRUE);
+		rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
+		    bufsize, 0, 0, VIO_both_dir, B_TRUE);

 		if (rv) {
 			kmem_free(vd_devid, bufsize);
@@ -8142,23 +8373,58 @@

 	/* build an encapsulated devid based on the returned devid */
 	if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length,
-	    vd_devid->id, &vdc->devid) != DDI_SUCCESS) {
+	    vd_devid->id, &vdisk_devid) != DDI_SUCCESS) {
 		DMSG(vdc, 1, "[%d] Fail to created devid\n", vdc->instance);
 		kmem_free(vd_devid, bufsize);
 		return (1);
 	}

-	DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type);
-
-	ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS);
+	DEVID_FORMTYPE((impl_devid_t *)vdisk_devid, vd_devid->type);
+
+	ASSERT(ddi_devid_valid(vdisk_devid) == DDI_SUCCESS);

 	kmem_free(vd_devid, bufsize);

-	if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) {
+	if (vdc->devid != NULL) {
+		/* check that the devid hasn't changed */
+		if (ddi_devid_compare(vdisk_devid, vdc->devid) == 0) {
+			ddi_devid_free(vdisk_devid);
+			return (0);
+		}
+
+		cmn_err(CE_WARN, "vdisk@%d backend devid has changed",
+		    vdc->instance);
+
+		devid_str = ddi_devid_str_encode(vdc->devid, NULL);
+
+		cmn_err(CE_CONT, "vdisk@%d backend initial devid: %s",
+		    vdc->instance,
+		    (devid_str)? devid_str : "<encoding error>");
+
+		if (devid_str)
+			ddi_devid_str_free(devid_str);
+
+		devid_str = ddi_devid_str_encode(vdisk_devid, NULL);
+
+		cmn_err(CE_CONT, "vdisk@%d backend current devid: %s",
+		    vdc->instance,
+		    (devid_str)? devid_str : "<encoding error>");
+
+		if (devid_str)
+			ddi_devid_str_free(devid_str);
+
+		ddi_devid_free(vdisk_devid);
+		return (1);
+	}
+
+	if (ddi_devid_register(vdc->dip, vdisk_devid) != DDI_SUCCESS) {
 		DMSG(vdc, 1, "[%d] Fail to register devid\n", vdc->instance);
+		ddi_devid_free(vdisk_devid);
 		return (1);
 	}

+	vdc->devid = vdisk_devid;
+
 	return (0);
 }
--- a/usr/src/uts/sun4v/sys/vdc.h	Mon Nov 09 08:33:48 2009 -0800
+++ b/usr/src/uts/sun4v/sys/vdc.h	Mon Nov 09 09:03:03 2009 -0800
@@ -84,6 +84,16 @@
 #define	VDC_SEQ_NUM_TODO		1	/* Request needs processing */

 /*
+ * Flags for virtual disk operations.
+ */
+#define	VDC_OP_STATE_RUNNING	0x01	/* do operation in running state */
+#define	VDC_OP_ERRCHK_BACKEND	0x02	/* check backend on error */
+#define	VDC_OP_ERRCHK_CONFLICT	0x04	/* check resv conflict on error */
+
+#define	VDC_OP_ERRCHK	(VDC_OP_ERRCHK_BACKEND | VDC_OP_ERRCHK_CONFLICT)
+#define	VDC_OP_NORMAL	(VDC_OP_STATE_RUNNING | VDC_OP_ERRCHK)
+
+/*
  * Macros to get UNIT and PART number
  */
 #define	VDCUNIT_SHIFT	3
@@ -171,12 +181,26 @@
 	VDC_STATE_INIT_WAITING,		/* waiting for ldc connection */
 	VDC_STATE_NEGOTIATE,		/* doing handshake negotiation */
 	VDC_STATE_HANDLE_PENDING,	/* handle requests in backup dring */
+	VDC_STATE_FAULTED,		/* multipath backend is inaccessible */
+	VDC_STATE_FAILED,		/* device is not usable */
 	VDC_STATE_RUNNING,		/* running and accepting requests */
 	VDC_STATE_DETACH,		/* detaching */
 	VDC_STATE_RESETTING		/* resetting connection with vds */
 } vdc_state_t;

 /*
+ * States of the service provided by a vds server
+ */
+typedef enum vdc_service_state {
+	VDC_SERVICE_NONE = -1, 		/* no state define */
+	VDC_SERVICE_OFFLINE,		/* no connection with the service */
+	VDC_SERVICE_CONNECTED,		/* connection established */
+	VDC_SERVICE_ONLINE,		/* connection and backend available */
+	VDC_SERVICE_FAILED,		/* connection failed */
+	VDC_SERVICE_FAULTED		/* connection but backend unavailable */
+} vdc_service_state_t;
+
+/*
  * The states that the vdc instance can be in.
  */
 typedef enum vdc_lc_state {
@@ -198,11 +222,6 @@
 	VIO_both_dir		/* transfer both in and out in same buffer */
 } vio_desc_direction_t;

-typedef enum {
-	CB_STRATEGY,		/* non-blocking strategy call */
-	CB_SYNC			/* synchronous operation */
-} vio_cb_type_t;
-
 typedef struct vdc_local_desc {
 	boolean_t		is_free;	/* local state - inuse or not */

@@ -211,9 +230,9 @@
 	int			slice;
 	diskaddr_t		offset;		/* disk offset */
 	size_t			nbytes;
-	vio_cb_type_t		cb_type;	/* operation type blk/nonblk */
-	void			*cb_arg;	/* buf passed to strategy() */
+	struct buf		*buf;		/* buf of operation */
 	vio_desc_direction_t	dir;		/* direction of transfer */
+	int			flags;		/* flags of operation */

 	caddr_t			align_addr;	/* used if addr non-aligned */
 	ldc_mem_handle_t	desc_mhdl;	/* Mem handle of buf */
@@ -222,11 +241,11 @@
 } vdc_local_desc_t;

 /*
- * I/O queue used by failfast
+ * I/O queue used for checking backend or failfast
  */
 typedef struct vdc_io {
 	struct vdc_io	*vio_next;	/* next pending I/O in the queue */
-	struct buf	*vio_buf;	/* buf for CB_STRATEGY I/O */
+	int		vio_index;	/* descriptor index */
 	clock_t		vio_qtime;	/* time the I/O was queued */
 } vdc_io_t;

@@ -246,6 +265,8 @@
 	struct vdc		*vdcp;			/* Ptr to vdc struct */
 	uint64_t		id;			/* Server port id */
 	uint64_t		state;			/* Server state */
+	vdc_service_state_t	svc_state;		/* Service state */
+	vdc_service_state_t	log_state;		/* Last state logged */
 	uint64_t		ldc_id;			/* Server LDC id */
 	ldc_handle_t		ldc_handle;		/* Server LDC handle */
 	ldc_status_t		ldc_state;		/* Server LDC state */
@@ -262,7 +283,9 @@
 	kcondvar_t	initwait_cv;	/* signal when ldc conn is up */
 	kcondvar_t	dring_free_cv;	/* signal when desc is avail */
 	kcondvar_t	membind_cv;	/* signal when mem can be bound */
-	boolean_t	self_reset;
+	boolean_t	self_reset;	/* self initiated reset */
+	kcondvar_t	io_pending_cv;	/* signal on pending I/O */
+	boolean_t	io_pending;	/* pending I/O */

 	int		initialized;	/* keeps track of what's init'ed */
 	vdc_lc_state_t	lifecycle;	/* Current state of the vdc instance */
@@ -285,10 +308,7 @@
 	vdc_rd_state_t	read_state;	/* current read state */

 	uint32_t	sync_op_cnt;	/* num of active sync operations */
-	boolean_t	sync_op_pending; /* sync operation is pending */
 	boolean_t	sync_op_blocked; /* blocked waiting to do sync op */
-	uint32_t	sync_op_status;	/* status of sync operation */
-	kcondvar_t	sync_pending_cv; /* cv wait for sync op to finish */
 	kcondvar_t	sync_blocked_cv; /* cv wait for other syncs to finish */

 	uint64_t	session_id;	/* common ID sent with all messages */
@@ -326,13 +346,12 @@
 	kcondvar_t	ownership_cv;		/* cv for ownership update */

 	/*
-	 * The failfast fields are protected by the lock mutex.
+	 * The eio and failfast fields are protected by the lock mutex.
 	 */
-	kthread_t	*failfast_thread;	/* failfast thread */
+	kthread_t	*eio_thread;		/* error io thread */
+	kcondvar_t	eio_cv;			/* cv for eio thread update */
+	vdc_io_t	*eio_queue;		/* error io queue */
 	clock_t		failfast_interval;	/* interval in microsecs */
-	kcondvar_t	failfast_cv;		/* cv for failfast update */
-	kcondvar_t	failfast_io_cv;		/* cv wait for I/O to finish */
-	vdc_io_t	*failfast_io_queue;	/* failfast io queue */

 	/*
 	 * kstats used to store I/O statistics consumed by iostat(1M).