Mercurial > illumos > illumos-gate
changeset 10971:2663784ac9bb
backout 6726533: causes 6898072
author | jmcp <James.McPherson@Sun.COM> |
---|---|
date | Thu, 05 Nov 2009 15:54:24 -0800 |
parents | 53b2abb98d0f |
children | 807794d41b3a |
files | usr/src/uts/sun4v/io/vdc.c usr/src/uts/sun4v/sys/vdc.h |
diffstat | 2 files changed, 614 insertions(+), 899 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/sun4v/io/vdc.c Thu Nov 05 15:51:00 2009 -0800 +++ b/usr/src/uts/sun4v/io/vdc.c Thu Nov 05 15:54:24 2009 -0800 @@ -69,7 +69,6 @@ #include <sys/mdeg.h> #include <sys/note.h> #include <sys/open.h> -#include <sys/random.h> #include <sys/sdt.h> #include <sys/stat.h> #include <sys/sunddi.h> @@ -83,7 +82,6 @@ #include <sys/cdio.h> #include <sys/dktp/fdisk.h> #include <sys/dktp/dadkio.h> -#include <sys/fs/dv_node.h> #include <sys/mhd.h> #include <sys/scsi/generic/sense.h> #include <sys/scsi/impl/uscsi.h> @@ -176,20 +174,18 @@ static int vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg); static int vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, - buf_t *bufp, vio_desc_direction_t dir, int flags); + int cb_type, void *cb_arg, vio_desc_direction_t dir); static int vdc_map_to_shared_dring(vdc_t *vdcp, int idx); static int vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, - buf_t *bufp, vio_desc_direction_t dir, int flags); + int cb_type, void *cb_arg, vio_desc_direction_t dir); static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, - size_t nbytes, int slice, diskaddr_t offset, - vio_desc_direction_t dir, boolean_t); -static int vdc_do_op(vdc_t *vdc, int op, caddr_t addr, size_t nbytes, - int slice, diskaddr_t offset, struct buf *bufp, - vio_desc_direction_t dir, int flags); + size_t nbytes, int slice, diskaddr_t offset, int cb_type, + void *cb_arg, vio_desc_direction_t dir, boolean_t); static int vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp); -static int vdc_drain_response(vdc_t *vdcp, struct buf *buf); +static int vdc_drain_response(vdc_t *vdcp, vio_cb_type_t cb_type, + struct buf *buf); static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx); static int vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep); static int vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg); @@ -226,12 +222,9 @@ int mode, int dir); static void vdc_ownership_update(vdc_t *vdc, int ownership_flags); -static int vdc_access_set(vdc_t *vdc, uint64_t flags); -static vdc_io_t *vdc_eio_queue(vdc_t *vdc, int index); -static void vdc_eio_unqueue(vdc_t *vdc, clock_t deadline, - boolean_t complete_io); -static int vdc_eio_check(vdc_t *vdc, int flags); -static void vdc_eio_thread(void *arg); +static int vdc_access_set(vdc_t *vdc, uint64_t flags, int mode); +static vdc_io_t *vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf); +static int vdc_failfast_check_resv(vdc_t *vdc); /* * Module variables @@ -399,7 +392,7 @@ static int vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - kt_did_t eio_tid, ownership_tid; + kt_did_t failfast_tid, ownership_tid; int instance; int rv; vdc_server_t *srvr; @@ -425,7 +418,14 @@ return (DDI_FAILURE); } - if (vdc_is_opened(vdc)) { + /* + * This function is called when vdc is detached or if it has failed to + * attach. In that case, the attach may have fail before the vdisk type + * has been set so we can't call vdc_is_opened(). However as the attach + * has failed, we know that the vdisk is not opened and we can safely + * detach. + */ + if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) { DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance); return (DDI_FAILURE); } @@ -449,7 +449,7 @@ /* If we took ownership, release ownership */ mutex_enter(&vdc->ownership_lock); if (vdc->ownership & VDC_OWNERSHIP_GRANTED) { - rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR); + rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL); if (rv == 0) { vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); } @@ -487,9 +487,6 @@ instance); vdc->state = VDC_STATE_RESETTING; cv_signal(&vdc->initwait_cv); - } else if (vdc->state == VDC_STATE_FAILED) { - vdc->io_pending = B_TRUE; - cv_signal(&vdc->io_pending_cv); } mutex_exit(&vdc->lock); @@ -507,13 +504,12 @@ vdc_fini_ports(vdc); - if (vdc->eio_thread) { - eio_tid = vdc->eio_thread->t_did; + if (vdc->failfast_thread) { + failfast_tid = vdc->failfast_thread->t_did; vdc->failfast_interval = 0; - ASSERT(vdc->num_servers == 0); - cv_signal(&vdc->eio_cv); + cv_signal(&vdc->failfast_cv); } else { - eio_tid = 0; + failfast_tid = 0; } if (vdc->ownership & VDC_OWNERSHIP_WANTED) { @@ -526,8 +522,8 @@ mutex_exit(&vdc->lock); - if (eio_tid != 0) - thread_join(eio_tid); + if (failfast_tid != 0) + thread_join(failfast_tid); if (ownership_tid != 0) thread_join(ownership_tid); @@ -552,12 +548,13 @@ cv_destroy(&vdc->initwait_cv); cv_destroy(&vdc->dring_free_cv); cv_destroy(&vdc->membind_cv); + cv_destroy(&vdc->sync_pending_cv); cv_destroy(&vdc->sync_blocked_cv); cv_destroy(&vdc->read_cv); cv_destroy(&vdc->running_cv); - cv_destroy(&vdc->io_pending_cv); cv_destroy(&vdc->ownership_cv); - cv_destroy(&vdc->eio_cv); + cv_destroy(&vdc->failfast_cv); + cv_destroy(&vdc->failfast_io_cv); } if (vdc->minfo) @@ -650,16 +647,17 @@ cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL); cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL); cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL); - cv_init(&vdc->io_pending_cv, NULL, CV_DRIVER, NULL); - - vdc->io_pending = B_FALSE; + vdc->threads_pending = 0; + vdc->sync_op_pending = B_FALSE; vdc->sync_op_blocked = B_FALSE; + cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL); cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL); mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL); - cv_init(&vdc->eio_cv, NULL, CV_DRIVER, NULL); + cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL); + cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL); /* init blocking msg read functionality */ mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL); @@ -701,19 +699,6 @@ return (DDI_FAILURE); } - /* - * If there are multiple servers then start the eio thread. - */ - if (vdc->num_servers > 1) { - vdc->eio_thread = thread_create(NULL, 0, vdc_eio_thread, vdc, 0, - &p0, TS_RUN, v.v_maxsyspri - 2); - if (vdc->eio_thread == NULL) { - cmn_err(CE_NOTE, "[%d] Failed to create error " - "I/O thread", instance); - return (DDI_FAILURE); - } - } - vdc->initialized |= VDC_THREAD; atomic_inc_32(&vdc_instance_count); @@ -740,6 +725,13 @@ } /* + * Setup devid + */ + if (vdc_setup_devid(vdc)) { + DMSG(vdc, 0, "[%d] No device id available\n", instance); + } + + /* * Fill in the fields of the error statistics kstat that were not * available when creating the kstat */ @@ -1037,6 +1029,7 @@ * Return Values * 0 - Success * EIO - Failed to create node + * EINVAL - Unknown type of disk exported */ static int vdc_create_device_nodes(vdc_t *vdc) @@ -1054,14 +1047,14 @@ switch (vdc->vdisk_type) { case VD_DISK_TYPE_DISK: - case VD_DISK_TYPE_UNK: num_slices = V_NUMPAR; break; case VD_DISK_TYPE_SLICE: num_slices = 1; break; + case VD_DISK_TYPE_UNK: default: - ASSERT(0); + return (EINVAL); } /* @@ -1159,10 +1152,22 @@ static boolean_t vdc_is_opened(vdc_t *vdc) { - int i; + int i, nslices; + + switch (vdc->vdisk_type) { + case VD_DISK_TYPE_DISK: + nslices = V_NUMPAR; + break; + case VD_DISK_TYPE_SLICE: + nslices = 1; + break; + case VD_DISK_TYPE_UNK: + default: + ASSERT(0); + } /* check if there's any layered open */ - for (i = 0; i < V_NUMPAR; i++) { + for (i = 0; i < nslices; i++) { if (vdc->open_lyr[i] > 0) return (B_TRUE); } @@ -1188,15 +1193,6 @@ slicemask = 1 << slice; - /* - * If we have a single-slice disk which was unavailable during the - * attach then a device was created for each 8 slices. Now that - * the type is known, we prevent opening any slice other than 0 - * even if a device still exists. - */ - if (vdc->vdisk_type == VD_DISK_TYPE_SLICE && slice != 0) - return (EIO); - /* check if slice is already exclusively opened */ if (vdc->open_excl & slicemask) return (EBUSY); @@ -1285,12 +1281,7 @@ return (status); } - /* - * If the disk type is unknown then we have to wait for the - * handshake to complete because we don't know if the slice - * device we are opening effectively exists. - */ - if (vdc->vdisk_type != VD_DISK_TYPE_UNK && nodelay) { + if (nodelay) { /* don't resubmit a validate request if there's already one */ if (vdc->validate_pending > 0) { @@ -1317,10 +1308,8 @@ mutex_enter(&vdc->lock); - if (vdc->vdisk_type == VD_DISK_TYPE_UNK || - (vdc->vdisk_type == VD_DISK_TYPE_SLICE && slice != 0) || - (!nodelay && (vdc->vdisk_label == VD_DISK_LABEL_UNK || - vdc->slice[slice].nblocks == 0))) { + if (vdc->vdisk_label == VD_DISK_LABEL_UNK || + vdc->slice[slice].nblocks == 0) { vdc_mark_closed(vdc, slice, flag, otyp); status = EIO; } @@ -1392,7 +1381,7 @@ static int vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) { - int rv, flags; + int rv; size_t nbytes = nblk * DEV_BSIZE; int instance = VDCUNIT(dev); vdc_t *vdc = NULL; @@ -1413,20 +1402,16 @@ } vio_blkno = blkno >> vdc->vio_bshift; - /* - * If we are panicking, we need the state to be "running" so that we - * can submit I/Os, but we don't want to check for any backend error. - */ - flags = (ddi_in_panic())? VDC_OP_STATE_RUNNING : VDC_OP_NORMAL; - - rv = vdc_do_op(vdc, VD_OP_BWRITE, addr, nbytes, VDCPART(dev), - vio_blkno, NULL, VIO_write_dir, flags); - + rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes, + VDCPART(dev), vio_blkno, CB_STRATEGY, 0, VIO_write_dir); if (rv) { DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv); return (rv); } + if (ddi_in_panic()) + (void) vdc_drain_response(vdc, CB_STRATEGY, NULL); + DMSG(vdc, 0, "[%d] End\n", instance); return (0); @@ -1450,6 +1435,7 @@ vdc_strategy(struct buf *buf) { diskaddr_t vio_blkno; + int rv = -1; vdc_t *vdc = NULL; int instance = VDCUNIT(buf->b_edev); int op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE; @@ -1488,11 +1474,27 @@ } vio_blkno = buf->b_lblkno >> vdc->vio_bshift; - /* submit the I/O, any error will be reported in the buf structure */ - (void) vdc_do_op(vdc, op, (caddr_t)buf->b_un.b_addr, + rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr, buf->b_bcount, slice, vio_blkno, - buf, (op == VD_OP_BREAD) ? VIO_read_dir : VIO_write_dir, - VDC_OP_NORMAL); + CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir : + VIO_write_dir); + + /* + * If the request was successfully sent, the strategy call returns and + * the ACK handler calls the bioxxx functions when the vDisk server is + * done otherwise we handle the error here. + */ + if (rv) { + DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv); + bioerror(buf, rv); + biodone(buf); + } else if (ddi_in_panic()) { + rv = vdc_drain_response(vdc, CB_STRATEGY, buf); + if (rv != 0) { + bioerror(buf, EIO); + biodone(buf); + } + } return (0); } @@ -2366,8 +2368,6 @@ vd_port = portp[idx]; srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); srvr->vdcp = vdc; - srvr->svc_state = VDC_SERVICE_OFFLINE; - srvr->log_state = VDC_SERVICE_NONE; /* get port id */ if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { @@ -2587,7 +2587,6 @@ } vdc->server_list = NULL; - vdc->num_servers = 0; } /* -------------------------------------------------------------------------- */ @@ -2884,7 +2883,10 @@ * nbytes - number of bytes to read/write * slice - the disk slice this request is for * offset - relative disk offset - * bufp - buf of operation + * cb_type - type of call - STRATEGY or SYNC + * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) + * . mode for ioctl(9e) + * . LP64 diskaddr_t (block I/O) * dir - direction of operation (READ/WRITE/BOTH) * * Return Codes: @@ -2893,8 +2895,8 @@ */ static int vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, - size_t nbytes, int slice, diskaddr_t offset, buf_t *bufp, - vio_desc_direction_t dir, int flags) + size_t nbytes, int slice, diskaddr_t offset, int cb_type, + void *cb_arg, vio_desc_direction_t dir) { int rv = 0; @@ -2915,20 +2917,10 @@ * higher up the stack in vdc_strategy() et. al. */ if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { - DTRACE_IO1(start, buf_t *, bufp); + DTRACE_IO1(start, buf_t *, cb_arg); VD_KSTAT_WAITQ_ENTER(vdcp); } - /* - * If the request does not expect the state to be VDC_STATE_RUNNING - * then we just try to populate the descriptor ring once. - */ - if (!(flags & VDC_OP_STATE_RUNNING)) { - rv = vdc_populate_descriptor(vdcp, operation, addr, - nbytes, slice, offset, bufp, dir, flags); - goto done; - } - do { while (vdcp->state != VDC_STATE_RUNNING) { @@ -2938,6 +2930,12 @@ goto done; } + /* fail request if connection timeout is reached */ + if (vdcp->ctimeout_reached) { + rv = EIO; + goto done; + } + /* * If we are panicking and the disk is not ready then * we can't send any request because we can't complete @@ -2948,27 +2946,11 @@ goto done; } - /* - * If the state is faulted, notify that a new I/O is - * being submitted to force the system to check if any - * server has recovered. - */ - if (vdcp->state == VDC_STATE_FAILED) { - vdcp->io_pending = B_TRUE; - cv_signal(&vdcp->io_pending_cv); - } - cv_wait(&vdcp->running_cv, &vdcp->lock); - - /* if service is still faulted then fail the request */ - if (vdcp->state == VDC_STATE_FAILED) { - rv = EIO; - goto done; - } } } while (vdc_populate_descriptor(vdcp, operation, addr, - nbytes, slice, offset, bufp, dir, flags)); + nbytes, slice, offset, cb_type, cb_arg, dir)); done: /* @@ -2981,11 +2963,11 @@ if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { if (rv == 0) { VD_KSTAT_WAITQ_TO_RUNQ(vdcp); - DTRACE_PROBE1(send, buf_t *, bufp); + DTRACE_PROBE1(send, buf_t *, cb_arg); } else { VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); VD_KSTAT_WAITQ_EXIT(vdcp); - DTRACE_IO1(done, buf_t *, bufp); + DTRACE_IO1(done, buf_t *, cb_arg); } } @@ -3011,7 +2993,10 @@ * nbytes - number of bytes to read/write * slice - the disk slice this request is for * offset - relative disk offset - * bufp - buf of operation + * cb_type - type of call - STRATEGY or SYNC + * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) + * . mode for ioctl(9e) + * . LP64 diskaddr_t (block I/O) * dir - direction of operation (READ/WRITE/BOTH) * * Return Codes: @@ -3022,8 +3007,8 @@ */ static int vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, - size_t nbytes, int slice, diskaddr_t offset, - buf_t *bufp, vio_desc_direction_t dir, int flags) + size_t nbytes, int slice, diskaddr_t offset, int cb_type, + void *cb_arg, vio_desc_direction_t dir) { vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ int idx; /* Index of DRing entry used */ @@ -3065,9 +3050,9 @@ local_dep->nbytes = nbytes; local_dep->slice = slice; local_dep->offset = offset; - local_dep->buf = bufp; + local_dep->cb_type = cb_type; + local_dep->cb_arg = cb_arg; local_dep->dir = dir; - local_dep->flags = flags; local_dep->is_free = B_FALSE; @@ -3139,127 +3124,11 @@ /* * Function: - * vdc_do_op - * - * Description: - * Wrapper around vdc_submit_request(). Each request is associated with a - * buf structure. If a buf structure is provided (bufp != NULL) then the - * request will be submitted with that buf, and the caller can wait for - * completion of the request with biowait(). If a buf structure is not - * provided (bufp == NULL) then a buf structure is created and the function - * waits for the completion of the request. - * - * If the flag VD_OP_STATE_RUNNING is set then vdc_submit_request() will - * submit the request only when the vdisk is in state VD_STATE_RUNNING. - * If the vdisk is not in that state then the vdc_submit_request() will - * wait for that state to be reached. After the request is submitted, the - * reply will be processed asynchronously by the vdc_process_msg_thread() - * thread. - * - * If the flag VD_OP_STATE_RUNNING is not set then vdc_submit_request() - * submit the request whatever the state of the vdisk is. Then vdc_do_op() - * will wait for a reply message, process the reply and complete the - * request. - * - * Arguments: - * vdc - the soft state pointer - * op - operation we want vds to perform (VD_OP_XXX) - * addr - address of data buf to be read/written. - * nbytes - number of bytes to read/write - * slice - the disk slice this request is for - * offset - relative disk offset - * bufp - buf structure associated with the request (can be NULL). - * dir - direction of operation (READ/WRITE/BOTH) - * flags - flags for the request. - * - * Return Codes: - * 0 - the request has been succesfully submitted and completed. - * != 0 - the request has failed. In that case, if a buf structure - * was provided (bufp != NULL) then the B_ERROR flag is set - * and the b_error field of the buf structure is set to EIO. - */ -static int -vdc_do_op(vdc_t *vdc, int op, caddr_t addr, size_t nbytes, int slice, - diskaddr_t offset, struct buf *bufp, vio_desc_direction_t dir, int flags) -{ - vio_msg_t vio_msg; - struct buf buf; - int rv; - - if (bufp == NULL) { - /* - * We use buf just as a convenient way to get a notification - * that the request is completed, so we initialize buf to the - * minimum we need. - */ - bioinit(&buf); - buf.b_bcount = nbytes; - buf.b_flags = B_BUSY; - bufp = &buf; - } - - rv = vdc_send_request(vdc, op, addr, nbytes, slice, offset, bufp, - dir, flags); - - if (rv != 0) - goto done; - - /* - * If the request should be done in VDC_STATE_RUNNING state then the - * reply will be received and processed by vdc_process_msg_thread() - * and we just have to handle the panic case. Otherwise we have to - * wait for the reply message and process it. - */ - if (flags & VDC_OP_STATE_RUNNING) { - - if (ddi_in_panic()) { - rv = vdc_drain_response(vdc, bufp); - goto done; - } - - } else { - /* wait for the response message */ - rv = vdc_wait_for_response(vdc, &vio_msg); - if (rv) { - /* - * If this is a block read/write we update the I/O - * statistics kstat to take it off the run queue. - */ - mutex_enter(&vdc->lock); - if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { - VD_UPDATE_ERR_STATS(vdc, vd_transerrs); - VD_KSTAT_RUNQ_EXIT(vdc); - DTRACE_IO1(done, buf_t *, bufp); - } - mutex_exit(&vdc->lock); - goto done; - } - - rv = vdc_process_data_msg(vdc, &vio_msg); - if (rv) - goto done; - } - - if (bufp == &buf) - rv = biowait(bufp); - -done: - if (bufp == &buf) { - biofini(bufp); - } else if (rv != 0) { - bioerror(bufp, EIO); - biodone(bufp); - } - - return (rv); -} - -/* - * Function: * vdc_do_sync_op * * Description: - * Wrapper around vdc_do_op that serializes requests. + * Wrapper around vdc_populate_descriptor that blocks until the + * response to the message is available. * * Arguments: * vdcp - the soft state pointer @@ -3268,12 +3137,16 @@ * nbytes - number of bytes to read/write * slice - the disk slice this request is for * offset - relative disk offset + * cb_type - type of call - STRATEGY or SYNC + * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) + * . mode for ioctl(9e) + * . LP64 diskaddr_t (block I/O) * dir - direction of operation (READ/WRITE/BOTH) * rconflict - check for reservation conflict in case of failure * * rconflict should be set to B_TRUE by most callers. Callers invoking the * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the - * result of a successful operation with vdc_scsi_status(). + * result of a successful operation with vd_scsi_status(). * * Return Codes: * 0 @@ -3284,10 +3157,14 @@ */ static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, - int slice, diskaddr_t offset, vio_desc_direction_t dir, boolean_t rconflict) + int slice, diskaddr_t offset, int cb_type, void *cb_arg, + vio_desc_direction_t dir, boolean_t rconflict) { int status; - int flags = VDC_OP_NORMAL; + vdc_io_t *vio; + boolean_t check_resv_conflict = B_FALSE; + + ASSERT(cb_type == CB_SYNC); /* * Grab the lock, if blocked wait until the server @@ -3315,29 +3192,69 @@ /* now block anyone other thread entering after us */ vdcp->sync_op_blocked = B_TRUE; - + vdcp->sync_op_pending = B_TRUE; mutex_exit(&vdcp->lock); - if (!rconflict) - flags &= ~VDC_OP_ERRCHK_CONFLICT; - - status = vdc_do_op(vdcp, operation, addr, nbytes, slice, offset, - NULL, dir, flags); + status = vdc_send_request(vdcp, operation, addr, + nbytes, slice, offset, cb_type, cb_arg, dir); mutex_enter(&vdcp->lock); - DMSG(vdcp, 2, ": operation returned %d\n", status); - - if (vdcp->state == VDC_STATE_DETACH) { - status = ENXIO; - } - + if (status != 0) { + vdcp->sync_op_pending = B_FALSE; + } else if (ddi_in_panic()) { + if (vdc_drain_response(vdcp, CB_SYNC, NULL) == 0) { + status = vdcp->sync_op_status; + } else { + vdcp->sync_op_pending = B_FALSE; + status = EIO; + } + } else { + /* + * block until our transaction completes. + * Also anyone else waiting also gets to go next. + */ + while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH) + cv_wait(&vdcp->sync_pending_cv, &vdcp->lock); + + DMSG(vdcp, 2, ": operation returned %d\n", + vdcp->sync_op_status); + if (vdcp->state == VDC_STATE_DETACH) { + vdcp->sync_op_pending = B_FALSE; + status = ENXIO; + } else { + status = vdcp->sync_op_status; + if (status != 0 && vdcp->failfast_interval != 0) { + /* + * Operation has failed and failfast is enabled. + * We need to check if the failure is due to a + * reservation conflict if this was requested. + */ + check_resv_conflict = rconflict; + } + + } + } + + vdcp->sync_op_status = 0; vdcp->sync_op_blocked = B_FALSE; vdcp->sync_op_cnt--; /* signal the next waiting thread */ cv_signal(&vdcp->sync_blocked_cv); + /* + * We have to check for reservation conflict after unblocking sync + * operations because some sync operations will be used to do this + * check. + */ + if (check_resv_conflict) { + vio = vdc_failfast_io_queue(vdcp, NULL); + while (vio->vio_qtime != 0) + cv_wait(&vdcp->failfast_io_cv, &vdcp->lock); + kmem_free(vio, sizeof (vdc_io_t)); + } + mutex_exit(&vdcp->lock); return (status); @@ -3358,16 +3275,23 @@ * * Arguments: * vdc - soft state pointer for this instance of the device driver. - * buf - if buf is NULL then we drain all responses, otherwise we + * cb_type - the type of request we want to drain. If type is CB_SYNC + * then we drain all responses until we find a CB_SYNC request. + * If the type is CB_STRATEGY then the behavior depends on the + * value of the buf argument. + * buf - if the cb_type argument is CB_SYNC then the buf argument + * must be NULL. If the cb_type argument is CB_STRATEGY and + * if buf is NULL then we drain all responses, otherwise we * poll until we receive a ACK/NACK for the specific I/O * described by buf. * * Return Code: * 0 - Success. If we were expecting a response to a particular - * request then this means that a response has been received. + * CB_SYNC or CB_STRATEGY request then this means that a + * response has been received. */ static int -vdc_drain_response(vdc_t *vdc, struct buf *buf) +vdc_drain_response(vdc_t *vdc, vio_cb_type_t cb_type, struct buf *buf) { int rv, idx, retries; size_t msglen; @@ -3376,6 +3300,8 @@ struct buf *mbuf; boolean_t ack; + ASSERT(cb_type == CB_STRATEGY || cb_type == CB_SYNC); + mutex_enter(&vdc->lock); retries = 0; @@ -3443,16 +3369,34 @@ continue; } - mbuf = ldep->buf; - ASSERT(mbuf != NULL); - mbuf->b_resid = mbuf->b_bcount - ldep->dep->payload.nbytes; - bioerror(mbuf, ack ? ldep->dep->payload.status : EIO); - biodone(mbuf); - - rv = vdc_depopulate_descriptor(vdc, idx); - if (buf != NULL && buf == mbuf) { - rv = 0; - goto done; + switch (ldep->cb_type) { + + case CB_STRATEGY: + mbuf = ldep->cb_arg; + if (mbuf != NULL) { + mbuf->b_resid = mbuf->b_bcount - + ldep->dep->payload.nbytes; + bioerror(mbuf, + ack ? ldep->dep->payload.status : EIO); + biodone(mbuf); + } + rv = vdc_depopulate_descriptor(vdc, idx); + if (buf != NULL && buf == mbuf) { + rv = 0; + goto done; + } + break; + + case CB_SYNC: + rv = vdc_depopulate_descriptor(vdc, idx); + vdc->sync_op_status = ack ? rv : EIO; + vdc->sync_op_pending = B_FALSE; + cv_signal(&vdc->sync_pending_cv); + if (cb_type == CB_SYNC) { + rv = 0; + goto done; + } + break; } /* if this is the last descriptor - break out of loop */ @@ -3462,7 +3406,7 @@ * request then we return with an error otherwise we * have successfully completed the drain. */ - rv = (buf != NULL)? ESRCH: 0; + rv = (buf != NULL || cb_type == CB_SYNC)? ESRCH: 0; break; } } @@ -3739,10 +3683,8 @@ */ vdc->seq_num = 1; vdc->seq_num_reply = 0; - vdc->io_pending = B_TRUE; srvr->ldc_state = ldc_state; cv_signal(&vdc->initwait_cv); - cv_signal(&vdc->io_pending_cv); } } @@ -3777,9 +3719,6 @@ if (vdc->state == VDC_STATE_INIT_WAITING) { vdc->state = VDC_STATE_RESETTING; cv_signal(&vdc->initwait_cv); - } else if (vdc->state == VDC_STATE_FAILED) { - vdc->io_pending = B_TRUE; - cv_signal(&vdc->io_pending_cv); } } @@ -3881,6 +3820,8 @@ int b_idx; int rv = 0; int dring_size; + int op; + vio_msg_t vio_msg; vdc_local_desc_t *curr_ldep; ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); @@ -3905,22 +3846,85 @@ /* only resubmit outstanding transactions */ if (!curr_ldep->is_free) { + /* + * If we are retrying a block read/write operation we + * need to update the I/O statistics to indicate that + * the request is being put back on the waitq to be + * serviced (it will have been taken off after the + * error was reported). + */ + mutex_enter(&vdcp->lock); + op = curr_ldep->operation; + if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { + DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); + VD_KSTAT_WAITQ_ENTER(vdcp); + } DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); - - rv = vdc_do_op(vdcp, curr_ldep->operation, + rv = vdc_populate_descriptor(vdcp, op, curr_ldep->addr, curr_ldep->nbytes, curr_ldep->slice, curr_ldep->offset, - curr_ldep->buf, curr_ldep->dir, - curr_ldep->flags & ~VDC_OP_STATE_RUNNING); + curr_ldep->cb_type, curr_ldep->cb_arg, + curr_ldep->dir); if (rv) { - DMSG(vdcp, 1, "[%d] resubmit entry %d failed\n", + if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { + VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); + VD_KSTAT_WAITQ_EXIT(vdcp); + DTRACE_IO1(done, buf_t *, + curr_ldep->cb_arg); + } + DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", vdcp->instance, b_idx); + mutex_exit(&vdcp->lock); goto done; } /* + * If this is a block read/write we update the I/O + * statistics kstat to indicate that the request + * has been sent back to the vDisk server and should + * now be put on the run queue. + */ + if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { + DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); + VD_KSTAT_WAITQ_TO_RUNQ(vdcp); + } + mutex_exit(&vdcp->lock); + + /* Wait for the response message. */ + DMSG(vdcp, 1, "waiting for response to idx=%x\n", + b_idx); + rv = vdc_wait_for_response(vdcp, &vio_msg); + if (rv) { + /* + * If this is a block read/write we update + * the I/O statistics kstat to take it + * off the run queue. + */ + mutex_enter(&vdcp->lock); + if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { + VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); + VD_KSTAT_RUNQ_EXIT(vdcp); + DTRACE_IO1(done, buf_t *, + curr_ldep->cb_arg); + } + DMSG(vdcp, 1, "[%d] wait_for_response " + "returned err=%d\n", vdcp->instance, + rv); + mutex_exit(&vdcp->lock); + goto done; + } + + DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); + rv = vdc_process_data_msg(vdcp, &vio_msg); + if (rv) { + DMSG(vdcp, 1, "[%d] process_data_msg " + "returned err=%d\n", vdcp->instance, + rv); + goto done; + } + /* * Mark this entry as free so that we will not resubmit * this "done" request again, if we were to use the same * backup_dring again in future. This could happen when @@ -3974,7 +3978,10 @@ int cancelled = 0; ASSERT(MUTEX_HELD(&vdcp->lock)); - ASSERT(vdcp->state == VDC_STATE_FAILED); + ASSERT(vdcp->state == VDC_STATE_INIT || + vdcp->state == VDC_STATE_INIT_WAITING || + vdcp->state == VDC_STATE_NEGOTIATE || + vdcp->state == VDC_STATE_RESETTING); if (vdcp->local_dring_backup == NULL) { /* the pending requests have already been processed */ @@ -4006,17 +4013,29 @@ * requests. Now we just have to notify threads waiting * for replies that the request has failed. */ - bufp = ldep->buf; - ASSERT(bufp != NULL); - bufp->b_resid = bufp->b_bcount; - if (ldep->operation == VD_OP_BREAD || - ldep->operation == VD_OP_BWRITE) { + switch (ldep->cb_type) { + case CB_SYNC: + ASSERT(vdcp->sync_op_pending); + vdcp->sync_op_status = EIO; + vdcp->sync_op_pending = B_FALSE; + cv_signal(&vdcp->sync_pending_cv); + break; + + case CB_STRATEGY: + bufp = ldep->cb_arg; + ASSERT(bufp != NULL); + bufp->b_resid = bufp->b_bcount; VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); VD_KSTAT_RUNQ_EXIT(vdcp); DTRACE_IO1(done, buf_t *, bufp); + bioerror(bufp, EIO); + biodone(bufp); + break; + + default: + ASSERT(0); } - bioerror(bufp, EIO); - biodone(bufp); + } /* get the next element to cancel */ @@ -4042,12 +4061,14 @@ * Description: * This function is invoked if the timeout set to establish the connection * with vds expires. This will happen if we spend too much time in the - * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. + * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we will + * cancel any pending request and mark them as failed. * * If the timeout does not expire, it will be cancelled when we reach the - * VDC_STATE_HANDLE_PENDING, VDC_STATE_FAILED or VDC_STATE_DETACH state. - * This function can also be invoked while we are in those states, in - * which case we do nothing because the timeout is being cancelled. + * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can + * be invoked while we are in the VDC_STATE_HANDLE_PENDING or + * VDC_STATE_RESETTING state in which case we do nothing because the + * timeout is being cancelled. * * Arguments: * arg - argument of the timeout function actually a soft state @@ -4064,18 +4085,28 @@ mutex_enter(&vdcp->lock); if (vdcp->state == VDC_STATE_HANDLE_PENDING || - vdcp->state == VDC_STATE_DETACH || - vdcp->state == VDC_STATE_FAILED) { + vdcp->state == VDC_STATE_DETACH) { /* - * The connection has just been re-established, has failed or + * The connection has just been re-established or * we are detaching. */ vdcp->ctimeout_reached = B_FALSE; - } else { - vdcp->ctimeout_reached = B_TRUE; - } + mutex_exit(&vdcp->lock); + return; + } + + vdcp->ctimeout_reached = B_TRUE; + + /* notify requests waiting for sending */ + cv_broadcast(&vdcp->running_cv); + + /* cancel requests waiting for a result */ + vdc_cancel_backup_dring(vdcp); mutex_exit(&vdcp->lock); + + cmn_err(CE_NOTE, "[%d] connection to service domain timeout", + vdcp->instance); } /* @@ -4171,58 +4202,6 @@ vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id); } -static void -vdc_print_svc_status(vdc_t *vdcp) -{ - int instance; - uint64_t ldc_id, port_id; - vdc_service_state_t svc_state; - - ASSERT(mutex_owned(&vdcp->lock)); - - svc_state = vdcp->curr_server->svc_state; - - if (vdcp->curr_server->log_state == svc_state) - return; - - instance = vdcp->instance; - ldc_id = vdcp->curr_server->ldc_id; - port_id = vdcp->curr_server->id; - - switch (svc_state) { - - case VDC_SERVICE_OFFLINE: - cmn_err(CE_CONT, "?vdisk@%d is offline\n", instance); - break; - - case VDC_SERVICE_CONNECTED: - cmn_err(CE_CONT, "?vdisk@%d is connected using ldc@%ld,%ld\n", - instance, ldc_id, port_id); - break; - - case VDC_SERVICE_ONLINE: - cmn_err(CE_CONT, "?vdisk@%d is online using ldc@%ld,%ld\n", - instance, ldc_id, port_id); - break; - - case VDC_SERVICE_FAILED: - cmn_err(CE_CONT, "?vdisk@%d access to service failed " - "using ldc@%ld,%ld\n", instance, ldc_id, port_id); - break; - - case VDC_SERVICE_FAULTED: - cmn_err(CE_CONT, "?vdisk@%d access to backend failed " - "using ldc@%ld,%ld\n", instance, ldc_id, port_id); - break; - - default: - ASSERT(0); - break; - } - - vdcp->curr_server->log_state = svc_state; -} - /* -------------------------------------------------------------------------- */ /* @@ -4253,8 +4232,6 @@ int ctimeout; timeout_id_t tmid = 0; clock_t ldcup_timeout = 0; - vdc_server_t *srvr; - vdc_service_state_t svc_state; mutex_enter(&vdcp->lock); @@ -4266,8 +4243,6 @@ Q(VDC_STATE_INIT_WAITING) Q(VDC_STATE_NEGOTIATE) Q(VDC_STATE_HANDLE_PENDING) - Q(VDC_STATE_FAULTED) - Q(VDC_STATE_FAILED) Q(VDC_STATE_RUNNING) Q(VDC_STATE_RESETTING) Q(VDC_STATE_DETACH) @@ -4302,27 +4277,21 @@ ctimeout * drv_usectohz(MICROSEC)); } - /* Switch to STATE_DETACH if drv is detaching */ - if (vdcp->lifecycle == VDC_LC_DETACHING) { - vdcp->state = VDC_STATE_DETACH; - break; - } - - /* Check if the timeout has been reached */ - if (vdcp->ctimeout_reached) { - ASSERT(tmid != 0); - tmid = 0; - vdcp->state = VDC_STATE_FAILED; - break; - } - /* Check if we are re-initializing repeatedly */ if (vdcp->hshake_cnt > vdc_hshake_retries && vdcp->lifecycle != VDC_LC_ONLINE) { DMSG(vdcp, 0, "[%d] too many handshakes,cnt=%d", vdcp->instance, vdcp->hshake_cnt); - vdcp->state = VDC_STATE_FAILED; + cmn_err(CE_NOTE, "[%d] disk access failed.\n", + vdcp->instance); + vdcp->state = VDC_STATE_DETACH; + break; + } + + /* Switch to STATE_DETACH if drv is detaching */ + if (vdcp->lifecycle == VDC_LC_DETACHING) { + vdcp->state = VDC_STATE_DETACH; break; } @@ -4335,10 +4304,6 @@ status = vdc_start_ldc_connection(vdcp); if (status != EINVAL) { vdcp->state = VDC_STATE_INIT_WAITING; - } else { - vdcp->curr_server->svc_state = - VDC_SERVICE_FAILED; - vdc_print_svc_status(vdcp); } break; @@ -4350,23 +4315,26 @@ break; } - /* - * Wait for LDC_UP. If it times out and we have multiple - * servers then we will retry using a different server. - */ - ldcup_timeout = ddi_get_lbolt() + (vdc_ldcup_timeout * - drv_usectohz(MICROSEC)); - status = cv_timedwait(&vdcp->initwait_cv, &vdcp->lock, - ldcup_timeout); - if (status == -1 && - vdcp->state == VDC_STATE_INIT_WAITING && - vdcp->curr_server->ldc_state != LDC_UP) { - /* timed out & still waiting */ - vdcp->curr_server->svc_state = - VDC_SERVICE_FAILED; - vdc_print_svc_status(vdcp); - vdcp->state = VDC_STATE_INIT; - break; + /* check if only one server exists */ + if (vdcp->num_servers == 1) { + cv_wait(&vdcp->initwait_cv, &vdcp->lock); + } else { + /* + * wait for LDC_UP, if it times out, switch + * to another server. + */ + ldcup_timeout = ddi_get_lbolt() + + (vdc_ldcup_timeout * + drv_usectohz(MICROSEC)); + status = cv_timedwait(&vdcp->initwait_cv, + &vdcp->lock, ldcup_timeout); + if (status == -1 && + vdcp->state == VDC_STATE_INIT_WAITING && + vdcp->curr_server->ldc_state != LDC_UP) { + /* timed out & still waiting */ + vdcp->state = VDC_STATE_INIT; + break; + } } if (vdcp->state != VDC_STATE_INIT_WAITING) { @@ -4418,8 +4386,6 @@ status); vdcp->state = VDC_STATE_RESETTING; vdcp->self_reset = B_TRUE; - vdcp->curr_server->svc_state = VDC_SERVICE_FAILED; - vdc_print_svc_status(vdcp); done: DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", vdcp->state); @@ -4427,121 +4393,36 @@ case VDC_STATE_HANDLE_PENDING: - DMSG(vdcp, 0, "[%d] connection to service domain is up", - vdcp->instance); - vdcp->curr_server->svc_state = VDC_SERVICE_CONNECTED; - - mutex_exit(&vdcp->lock); - - /* - * If we have multiple servers, check that the backend - * is effectively available before resubmitting any IO. - */ - if (vdcp->num_servers > 1 && - vdc_eio_check(vdcp, 0) != 0) { - mutex_enter(&vdcp->lock); - vdcp->curr_server->svc_state = - VDC_SERVICE_FAULTED; - vdcp->state = VDC_STATE_FAULTED; - break; - } - - if (tmid != 0) { - (void) untimeout(tmid); + if (vdcp->ctimeout_reached) { + /* + * The connection timeout had been reached so + * pending requests have been cancelled. Now + * that the connection is back we can reset + * the timeout. + */ + ASSERT(vdcp->local_dring_backup == NULL); + ASSERT(tmid != 0); tmid = 0; vdcp->ctimeout_reached = B_FALSE; - } - - /* - * Setup devid - */ - (void) vdc_setup_devid(vdcp); - - status = vdc_resubmit_backup_dring(vdcp); - - mutex_enter(&vdcp->lock); - - if (status) { - vdcp->state = VDC_STATE_RESETTING; - vdcp->self_reset = B_TRUE; - vdcp->curr_server->svc_state = - VDC_SERVICE_FAILED; - vdc_print_svc_status(vdcp); - } else { vdcp->state = VDC_STATE_RUNNING; + DMSG(vdcp, 0, "[%d] connection to service " + "domain is up", vdcp->instance); + break; } - break; - - case VDC_STATE_FAULTED: - /* - * Server is faulted because the backend is unavailable. - * If all servers are faulted then we mark the service - * as failed, otherwise we reset to switch to another - * server. - */ - vdc_print_svc_status(vdcp); - - /* check if all servers are faulted */ - for (srvr = vdcp->server_list; srvr != NULL; - srvr = srvr->next) { - svc_state = srvr->svc_state; - if (svc_state != VDC_SERVICE_FAULTED) - break; - } - - if (srvr != NULL) { - vdcp->state = VDC_STATE_RESETTING; - vdcp->self_reset = B_TRUE; - } else { - vdcp->state = VDC_STATE_FAILED; - } - break; - - case VDC_STATE_FAILED: - /* - * We reach this state when we are unable to access the - * backend from any server, either because of a maximum - * connection retries or timeout, or because the backend - * is unavailable. - * - * Then we cancel the backup DRing so that errors get - * reported and we wait for a new I/O before attempting - * another connection. - */ - cmn_err(CE_NOTE, "vdisk@%d disk access failed", - vdcp->instance); - - /* cancel any timeout */ + + mutex_exit(&vdcp->lock); if (tmid != 0) { (void) untimeout(tmid); tmid = 0; } - - /* cancel pending I/Os */ - cv_broadcast(&vdcp->running_cv); - vdc_cancel_backup_dring(vdcp); - - /* wait for new I/O */ - while (!vdcp->io_pending) - cv_wait(&vdcp->io_pending_cv, &vdcp->lock); - - /* - * There's a new IO pending. Try to re-establish a - * connection. Mark all services as offline, so that - * we don't stop again before having retried all - * servers. - */ - for (srvr = vdcp->server_list; srvr != NULL; - srvr = srvr->next) { - srvr->svc_state = VDC_SERVICE_OFFLINE; - } - - /* reset variables */ - vdcp->hshake_cnt = 0; - vdcp->ctimeout_reached = B_FALSE; - - vdcp->state = VDC_STATE_RESETTING; - vdcp->self_reset = B_TRUE; + status = vdc_resubmit_backup_dring(vdcp); + mutex_enter(&vdcp->lock); + + if (status) + vdcp->state = VDC_STATE_RESETTING; + else + vdcp->state = VDC_STATE_RUNNING; + break; /* enter running state */ @@ -4553,18 +4434,17 @@ vdcp->hshake_cnt = 0; cv_broadcast(&vdcp->running_cv); - /* backend has to be checked after reset */ - if (vdcp->failfast_interval != 0 || - vdcp->num_servers > 1) - cv_signal(&vdcp->eio_cv); + /* failfast has to been checked after reset */ + cv_signal(&vdcp->failfast_cv); /* ownership is lost during reset */ if (vdcp->ownership & VDC_OWNERSHIP_WANTED) vdcp->ownership |= VDC_OWNERSHIP_RESET; cv_signal(&vdcp->ownership_cv); - vdcp->curr_server->svc_state = VDC_SERVICE_ONLINE; - vdc_print_svc_status(vdcp); + cmn_err(CE_CONT, "?vdisk@%d is online using " + "ldc@%ld,%ld\n", vdcp->instance, + vdcp->curr_server->ldc_id, vdcp->curr_server->id); mutex_exit(&vdcp->lock); @@ -4587,14 +4467,8 @@ mutex_enter(&vdcp->lock); - /* all servers are now offline */ - for (srvr = vdcp->server_list; srvr != NULL; - srvr = srvr->next) { - srvr->svc_state = VDC_SERVICE_OFFLINE; - srvr->log_state = VDC_SERVICE_NONE; - } - - vdc_print_svc_status(vdcp); + cmn_err(CE_CONT, "?vdisk@%d is offline\n", + vdcp->instance); vdcp->state = VDC_STATE_RESETTING; vdcp->self_reset = B_TRUE; @@ -4642,13 +4516,6 @@ ASSERT(vdcp->read_state != VDC_READ_WAITING); vdcp->read_state = VDC_READ_IDLE; - vdcp->io_pending = B_FALSE; - - /* - * Cleanup any pending eio. These I/Os are going to - * be resubmitted. - */ - vdc_eio_unqueue(vdcp, 0, B_FALSE); vdc_backup_local_dring(vdcp); @@ -4678,8 +4545,9 @@ */ cv_broadcast(&vdcp->running_cv); - while (vdcp->sync_op_cnt > 0) { - cv_broadcast(&vdcp->sync_blocked_cv); + while (vdcp->sync_op_pending) { + cv_signal(&vdcp->sync_pending_cv); + cv_signal(&vdcp->sync_blocked_cv); mutex_exit(&vdcp->lock); /* give the waiters enough time to wake up */ delay(vdc_hz_min_ldc_delay); @@ -4791,7 +4659,7 @@ ldep = &vdcp->local_dring[idx]; op = ldep->operation; if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { - DTRACE_IO1(done, buf_t *, ldep->buf); + DTRACE_IO1(done, buf_t *, ldep->cb_arg); VD_KSTAT_RUNQ_EXIT(vdcp); } VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); @@ -4816,57 +4684,62 @@ ldep = &vdcp->local_dring[idx]; - DMSG(vdcp, 1, ": state 0x%x\n", ldep->dep->hdr.dstate); + DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n", + ldep->dep->hdr.dstate, ldep->cb_type); if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { struct buf *bufp; - status = ldep->dep->payload.status; - - bufp = ldep->buf; - ASSERT(bufp != NULL); - - bufp->b_resid = bufp->b_bcount - ldep->dep->payload.nbytes; - bioerror(bufp, status); - - if (status != 0) { - DMSG(vdcp, 1, "I/O status=%d\n", status); - } - - DMSG(vdcp, 1, - "I/O complete req=%ld bytes resp=%ld bytes\n", - bufp->b_bcount, ldep->dep->payload.nbytes); - - /* - * If the request has failed and we have multiple servers or - * failfast is enabled then we will have to defer the completion - * of the request until we have checked that the vdisk backend - * is effectively available (if multiple server) or that there - * is no reservation conflict (if failfast). - */ - if ((status != 0 && - (vdcp->num_servers > 1 && - (ldep->flags & VDC_OP_ERRCHK_BACKEND)) || - (vdcp->failfast_interval != 0 && - (ldep->flags & VDC_OP_ERRCHK_CONFLICT)))) { - /* - * The I/O has failed and we need to check the error. - */ - (void) vdc_eio_queue(vdcp, idx); - } else { - op = ldep->operation; - if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { + switch (ldep->cb_type) { + case CB_SYNC: + ASSERT(vdcp->sync_op_pending); + + status = vdc_depopulate_descriptor(vdcp, idx); + vdcp->sync_op_status = status; + vdcp->sync_op_pending = B_FALSE; + cv_signal(&vdcp->sync_pending_cv); + break; + + case CB_STRATEGY: + bufp = ldep->cb_arg; + ASSERT(bufp != NULL); + bufp->b_resid = + bufp->b_bcount - ldep->dep->payload.nbytes; + status = ldep->dep->payload.status; /* Future:ntoh */ + if (status != 0) { + DMSG(vdcp, 1, "strategy status=%d\n", status); + VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); + bioerror(bufp, status); + } + + (void) vdc_depopulate_descriptor(vdcp, idx); + + DMSG(vdcp, 1, + "strategy complete req=%ld bytes resp=%ld bytes\n", + bufp->b_bcount, ldep->dep->payload.nbytes); + + if (status != 0 && vdcp->failfast_interval != 0) { + /* + * The I/O has failed and failfast is enabled. + * We need the failfast thread to check if the + * failure is due to a reservation conflict. + */ + (void) vdc_failfast_io_queue(vdcp, bufp); + } else { if (status == 0) { + op = (bufp->b_flags & B_READ) ? + VD_OP_BREAD : VD_OP_BWRITE; VD_UPDATE_IO_STATS(vdcp, op, ldep->dep->payload.nbytes); - } else { - VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); } VD_KSTAT_RUNQ_EXIT(vdcp); DTRACE_IO1(done, buf_t *, bufp); + biodone(bufp); } - (void) vdc_depopulate_descriptor(vdcp, idx); - biodone(bufp); + break; + + default: + ASSERT(0); } } @@ -4985,7 +4858,6 @@ vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) { int status = 0; - vd_disk_type_t old_type; ASSERT(vdc != NULL); ASSERT(mutex_owned(&vdc->lock)); @@ -5030,7 +4902,6 @@ } /* update disk, block and transfer sizes */ - old_type = vdc->vdisk_type; vdc_update_size(vdc, attr_msg->vdisk_size, attr_msg->vdisk_block_size, attr_msg->max_xfer_sz); vdc->vdisk_type = attr_msg->vdisk_type; @@ -5061,25 +4932,6 @@ * fake geometry for the disk. */ vdc_create_fake_geometry(vdc); - - /* - * If the disk type was previously unknown and device nodes - * were created then the driver would have created 8 device - * nodes. If we now find out that this is a single-slice disk - * then we need to re-create the appropriate device nodes. - */ - if (old_type == VD_DISK_TYPE_UNK && - (vdc->initialized & VDC_MINOR) && - vdc->vdisk_type == VD_DISK_TYPE_SLICE) { - ddi_remove_minor_node(vdc->dip, NULL); - (void) devfs_clean(ddi_get_parent(vdc->dip), - NULL, DV_CLEAN_FORCE); - if (vdc_create_device_nodes(vdc) != 0) { - DMSG(vdc, 0, "![%d] Failed to update " - "device nodes", vdc->instance); - } - } - break; case VIO_SUBTYPE_NACK: @@ -5331,7 +5183,7 @@ ASSERT(vdc != NULL); rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, - VDCPART(dk_arg->dev), 0, VIO_both_dir, B_TRUE); + VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); if (rv != 0) { DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", vdc->instance, rv, @@ -5747,8 +5599,8 @@ /* a uscsi reset is converted to a VD_OP_RESET operation */ if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | USCSI_RESET_ALL)) { - rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, - VIO_both_dir, B_TRUE); + rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, + (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); return (rv); } @@ -5825,7 +5677,7 @@ /* submit the request */ rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, - 0, 0, VIO_both_dir, B_FALSE); + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); if (rv != 0) goto done; @@ -6019,7 +5871,7 @@ /* submit the request */ rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, - 0, 0, VIO_both_dir, B_FALSE); + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); if (rv != 0) goto done; @@ -6133,7 +5985,7 @@ /* submit the request */ rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, - 0, 0, VIO_both_dir, B_FALSE); + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); if (rv != 0) goto done; @@ -6238,7 +6090,7 @@ /* submit the request */ rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, - 0, 0, VIO_both_dir, B_FALSE); + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); if (rv == 0) rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); @@ -6279,7 +6131,7 @@ /* submit the request */ rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, - 0, 0, VIO_both_dir, B_FALSE); + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); if (rv == 0) rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); @@ -6324,7 +6176,7 @@ /* submit the request */ rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, - 0, 0, VIO_both_dir, B_FALSE); + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); if (rv == 0) rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); @@ -6363,7 +6215,7 @@ /* submit the request */ rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, - 0, 0, VIO_both_dir, B_FALSE); + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); if (rv == 0) rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); @@ -6373,10 +6225,11 @@ } /* - * This function is used to send a (simple) SCSI command and check errors. + * This function is used by the failfast mechanism to send a SCSI command + * to check for reservation conflict. */ static int -vdc_eio_scsi_cmd(vdc_t *vdc, uchar_t scmd, int flags) +vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) { int cdb_len, sense_len, vd_scsi_len; vd_scsi_t *vd_scsi; @@ -6401,177 +6254,103 @@ vd_scsi->timeout = vdc_scsi_timeout; /* - * Submit the request. Note the operation should not request that any - * error is checked because this function is precisely called when - * checking errors. + * Submit the request. The last argument has to be B_FALSE so that + * vdc_do_sync_op does not loop checking for reservation conflict if + * the operation returns an error. */ - ASSERT((flags & VDC_OP_ERRCHK) == 0); - - rv = vdc_do_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, - 0, 0, NULL, VIO_both_dir, flags); + rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, + 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE); if (rv == 0) - rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); + (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE); kmem_free(vd_scsi, vd_scsi_len); return (rv); } /* - * This function is used to check if a SCSI backend is accessible. It will - * also detect reservation conflict if failfast is enabled, and panic the - * system in that case. + * This function is used by the failfast mechanism to check for reservation + * conflict. It sends some SCSI commands which will fail with a reservation + * conflict error if the system does not have access to the disk and this + * will panic the system. * * Returned Code: - * 0 - disk is accessible - * != 0 - disk is inaccessible or unable to check if disk is accessible + * 0 - disk is accessible without reservation conflict error + * != 0 - unable to check if disk is accessible */ -static int -vdc_eio_scsi_check(vdc_t *vdc, int flags) +int +vdc_failfast_check_resv(vdc_t *vdc) { int failure = 0; - int rv; /* * Send a TEST UNIT READY command. The command will panic - * the system if it fails with a reservation conflict and - * failfast is enabled. If there is a reservation conflict - * and failfast is not enabled then the function will return - * EACCES. In that case, there's no problem with accessing - * the backend, it is just reserved. + * the system if it fails with a reservation conflict. */ - rv = vdc_eio_scsi_cmd(vdc, SCMD_TEST_UNIT_READY, flags); - if (rv != 0 && rv != EACCES) + if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0) failure++; - /* we don't need to do more checking if failfast is not enabled */ - if (vdc->failfast_interval == 0) - return (failure); - /* * With SPC-3 compliant devices TEST UNIT READY will succeed on * a reserved device, so we also do a WRITE(10) of zero byte in * order to provoke a Reservation Conflict status on those newer * devices. */ - if (vdc_eio_scsi_cmd(vdc, SCMD_WRITE_G1, flags) != 0) + if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0) failure++; return (failure); } /* - * This function is used to check if a backend is effectively accessible. - * - * Returned Code: - * 0 - disk is accessible - * != 0 - disk is inaccessible or unable to check if disk is accessible - */ -static int -vdc_eio_check(vdc_t *vdc, int flags) -{ - char *buffer; - diskaddr_t blkno; - int rv; - - ASSERT((flags & VDC_OP_ERRCHK) == 0); - - if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) - return (vdc_eio_scsi_check(vdc, flags)); - - ASSERT(vdc->failfast_interval == 0); - - /* - * If the backend does not support SCSI operations then we simply - * check if the backend is accessible by reading some data blocks. - * We first try to read a random block, to try to avoid getting - * a block that might have been cached on the service domain. Then - * we try the last block, and finally the first block. - * - * We return success as soon as we are able to read any block. - */ - buffer = kmem_alloc(vdc->vdisk_bsize, KM_SLEEP); - - if (vdc->vdisk_size > 0) { - - /* try a random block */ - (void) random_get_pseudo_bytes((uint8_t *)&blkno, - sizeof (diskaddr_t)); - blkno = blkno % vdc->vdisk_size; - rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer, - vdc->vdisk_bsize, VD_SLICE_NONE, blkno, NULL, - VIO_read_dir, flags); - - if (rv == 0) - goto done; - - /* try the last block */ - blkno = vdc->vdisk_size - 1; - rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer, - vdc->vdisk_bsize, VD_SLICE_NONE, blkno, NULL, - VIO_read_dir, flags); - - if (rv == 0) - goto done; - } - - /* try block 0 */ - blkno = 0; - rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer, vdc->vdisk_bsize, - VD_SLICE_NONE, blkno, NULL, VIO_read_dir, flags); - -done: - kmem_free(buffer, vdc->vdisk_bsize); - return (rv); -} - -/* - * Add a pending I/O to the eio queue. An I/O is added to this queue - * when it has failed and failfast is enabled or the vdisk has multiple - * servers. It will then be handled by the eio thread (vdc_eio_thread). - * The eio queue is ordered starting with the most recent I/O added. + * Add a pending I/O to the failfast I/O queue. An I/O is added to this + * queue when it has failed and failfast is enabled. Then we have to check + * if it has failed because of a reservation conflict in which case we have + * to panic the system. + * + * Async I/O should be queued with their block I/O data transfer structure + * (buf). Sync I/O should be queued with buf = NULL. */ static vdc_io_t * -vdc_eio_queue(vdc_t *vdc, int index) +vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf) { vdc_io_t *vio; ASSERT(MUTEX_HELD(&vdc->lock)); vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP); - vio->vio_next = vdc->eio_queue; - vio->vio_index = index; + vio->vio_next = vdc->failfast_io_queue; + vio->vio_buf = buf; vio->vio_qtime = ddi_get_lbolt(); - vdc->eio_queue = vio; - - /* notify the eio thread that a new I/O is queued */ - cv_signal(&vdc->eio_cv); + vdc->failfast_io_queue = vio; + + /* notify the failfast thread that a new I/O is queued */ + cv_signal(&vdc->failfast_cv); return (vio); } /* - * Remove I/Os added before the indicated deadline from the eio queue. A - * deadline of 0 means that all I/Os have to be unqueued. The complete_io - * boolean specifies if unqueued I/Os should be marked as completed or not. + * Remove and complete I/O in the failfast I/O queue which have been + * added after the indicated deadline. A deadline of 0 means that all + * I/O have to be unqueued and marked as completed. */ static void -vdc_eio_unqueue(vdc_t *vdc, clock_t deadline, boolean_t complete_io) -{ - struct buf *buf; +vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline) +{ vdc_io_t *vio, *vio_tmp; - int index, op; ASSERT(MUTEX_HELD(&vdc->lock)); vio_tmp = NULL; - vio = vdc->eio_queue; + vio = vdc->failfast_io_queue; if (deadline != 0) { /* - * Skip any io queued after the deadline. The eio queue is - * ordered starting with the last I/O added to the queue. + * Skip any io queued after the deadline. The failfast + * I/O queue is ordered starting with the last I/O added + * to the queue. */ while (vio != NULL && vio->vio_qtime > deadline) { vio_tmp = vio; @@ -6585,54 +6364,53 @@ /* update the queue */ if (vio_tmp == NULL) - vdc->eio_queue = NULL; + vdc->failfast_io_queue = NULL; else vio_tmp->vio_next = NULL; /* - * Free and complete unqueued I/Os if this was requested. All I/Os - * have a block I/O data transfer structure (buf) and they are - * completed by calling biodone(). + * Complete unqueued I/O. Async I/O have a block I/O data transfer + * structure (buf) and they are completed by calling biodone(). Sync + * I/O do not have a buf and they are completed by setting the + * vio_qtime to zero and signaling failfast_io_cv. In that case, the + * thread waiting for the I/O to complete is responsible for freeing + * the vio structure. */ while (vio != NULL) { vio_tmp = vio->vio_next; - - if (complete_io) { - index = vio->vio_index; - op = vdc->local_dring[index].operation; - buf = vdc->local_dring[index].buf; - (void) vdc_depopulate_descriptor(vdc, index); - ASSERT(buf->b_flags & B_ERROR); - if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { - VD_UPDATE_ERR_STATS(vdc, vd_softerrs); - VD_KSTAT_RUNQ_EXIT(vdc); - DTRACE_IO1(done, buf_t *, buf); - } - biodone(buf); + if (vio->vio_buf != NULL) { + VD_KSTAT_RUNQ_EXIT(vdc); + DTRACE_IO1(done, buf_t *, vio->vio_buf); + biodone(vio->vio_buf); + kmem_free(vio, sizeof (vdc_io_t)); + } else { + vio->vio_qtime = 0; } - - kmem_free(vio, sizeof (vdc_io_t)); vio = vio_tmp; } + + cv_broadcast(&vdc->failfast_io_cv); } /* - * Error I/O Thread. There is one eio thread for each virtual disk that - * has multiple servers or for which failfast is enabled. Failfast can only - * be enabled for vdisk supporting SCSI commands. - * - * While failfast is enabled, the eio thread sends a TEST UNIT READY + * Failfast Thread. + * + * While failfast is enabled, the failfast thread sends a TEST UNIT READY * and a zero size WRITE(10) SCSI commands on a regular basis to check that * we still have access to the disk. If a command fails with a RESERVATION * CONFLICT error then the system will immediatly panic. * - * The eio thread is also woken up when an I/O has failed. It then checks + * The failfast thread is also woken up when an I/O has failed. It then check * the access to the disk to ensure that the I/O failure was not due to a - * reservation conflict or to the backend been inaccessible. - * + * reservation conflict. + * + * There is one failfast thread for each virtual disk for which failfast is + * enabled. We could have only one thread sending requests for all disks but + * this would need vdc to send asynchronous requests and to have callbacks to + * process replies. */ static void -vdc_eio_thread(void *arg) +vdc_failfast_thread(void *arg) { int status; vdc_t *vdc = (vdc_t *)arg; @@ -6640,74 +6418,45 @@ mutex_enter(&vdc->lock); - while (vdc->failfast_interval != 0 || vdc->num_servers > 1) { - /* - * Wait if there is nothing in the eio queue or if the state - * is not VDC_STATE_RUNNING. - */ - if (vdc->eio_queue == NULL || vdc->state != VDC_STATE_RUNNING) { - if (vdc->failfast_interval != 0) { - timeout = ddi_get_lbolt() + - drv_usectohz(vdc->failfast_interval); - (void) cv_timedwait(&vdc->eio_cv, &vdc->lock, - timeout); - } else { - ASSERT(vdc->num_servers > 1); - (void) cv_wait(&vdc->eio_cv, &vdc->lock); - } - - if (vdc->state != VDC_STATE_RUNNING) - continue; - } + while (vdc->failfast_interval != 0) { + + starttime = ddi_get_lbolt(); mutex_exit(&vdc->lock); - starttime = ddi_get_lbolt(); - - /* check error */ - status = vdc_eio_check(vdc, VDC_OP_STATE_RUNNING); + /* check for reservation conflict */ + status = vdc_failfast_check_resv(vdc); mutex_enter(&vdc->lock); /* - * We have dropped the lock to check the backend so we have - * to check that the eio thread is still enabled. + * We have dropped the lock to send the SCSI command so we have + * to check that failfast is still enabled. */ - if (vdc->failfast_interval == 0 && vdc->num_servers <= 1) + if (vdc->failfast_interval == 0) break; /* - * If the eio queue is empty or we are not in running state - * anymore then there is nothing to do. + * If we have successfully check the disk access and there was + * no reservation conflict then we can complete any I/O queued + * before the last check. */ - if (vdc->state != VDC_STATE_RUNNING || vdc->eio_queue == NULL) + if (status == 0) + vdc_failfast_io_unqueue(vdc, starttime); + + /* proceed again if some I/O are still in the queue */ + if (vdc->failfast_io_queue != NULL) continue; - if (status == 0) { - /* - * The backend access has been successfully checked, - * we can complete any I/O queued before the last check. - */ - vdc_eio_unqueue(vdc, starttime, B_TRUE); - - } else if (vdc->num_servers > 1) { - /* - * The backend is inaccessible for a disk with multiple - * servers. So we force a reset to switch to another - * server. The reset will also clear the eio queue and - * resubmit all pending I/Os. - */ - mutex_enter(&vdc->read_lock); - vdc->read_state = VDC_READ_RESET; - cv_signal(&vdc->read_cv); - mutex_exit(&vdc->read_lock); - } + timeout = ddi_get_lbolt() + + drv_usectohz(vdc->failfast_interval); + (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout); } /* - * The thread is being stopped so we can complete any queued I/O. + * Failfast is being stop so we can complete any queued I/O. */ - vdc_eio_unqueue(vdc, 0, B_TRUE); - vdc->eio_thread = NULL; + vdc_failfast_io_unqueue(vdc, 0); + vdc->failfast_thread = NULL; mutex_exit(&vdc->lock); thread_exit(); } @@ -6724,14 +6473,14 @@ return (EFAULT); mutex_enter(&vdc->lock); - if (mh_time != 0 && vdc->eio_thread == NULL) { - vdc->eio_thread = thread_create(NULL, 0, - vdc_eio_thread, vdc, 0, &p0, TS_RUN, + if (mh_time != 0 && vdc->failfast_thread == NULL) { + vdc->failfast_thread = thread_create(NULL, 0, + vdc_failfast_thread, vdc, 0, &p0, TS_RUN, v.v_maxsyspri - 2); } - vdc->failfast_interval = ((long)mh_time) * MILLISEC; - cv_signal(&vdc->eio_cv); + vdc->failfast_interval = mh_time * 1000; + cv_signal(&vdc->failfast_cv); mutex_exit(&vdc->lock); return (0); @@ -6742,13 +6491,14 @@ * converted to VD_OP_SET_ACCESS operations. */ static int -vdc_access_set(vdc_t *vdc, uint64_t flags) +vdc_access_set(vdc_t *vdc, uint64_t flags, int mode) { int rv; /* submit owership command request */ rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags, - sizeof (uint64_t), 0, 0, VIO_both_dir, B_TRUE); + sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, + VIO_both_dir, B_TRUE); return (rv); } @@ -6758,13 +6508,14 @@ * VD_OP_GET_ACCESS operation. */ static int -vdc_access_get(vdc_t *vdc, uint64_t *status) +vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) { int rv; /* submit owership command request */ rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, - sizeof (uint64_t), 0, 0, VIO_both_dir, B_TRUE); + sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, + VIO_both_dir, B_TRUE); return (rv); } @@ -6809,7 +6560,7 @@ mutex_exit(&vdc->lock); status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | - VD_ACCESS_SET_PRESERVE); + VD_ACCESS_SET_PRESERVE, FKIOCTL); mutex_enter(&vdc->lock); @@ -6894,7 +6645,7 @@ vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, - 0, 0, VIO_both_dir, B_TRUE); + 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); *dsk_size = vd_cap->vdisk_size; *blk_size = vd_cap->vdisk_block_size; @@ -7189,7 +6940,7 @@ vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | - VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE); + VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); if (rv == 0) { vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | VDC_OWNERSHIP_GRANTED); @@ -7203,7 +6954,7 @@ case MHIOCRELEASE: { mutex_enter(&vdc->ownership_lock); - rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR); + rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); if (rv == 0) { vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); } @@ -7215,7 +6966,7 @@ { uint64_t status; - rv = vdc_access_get(vdc, &status); + rv = vdc_access_get(vdc, &status, mode); if (rv == 0 && rvalp != NULL) *rvalp = (status & VD_ACCESS_ALLOWED)? 0 : 1; return (rv); @@ -7223,7 +6974,7 @@ case MHIOCQRESERVE: { - rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE); + rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); return (rv); } @@ -7401,7 +7152,8 @@ * send request to vds to service the ioctl. */ rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, - VDCPART(dev), 0, VIO_both_dir, B_TRUE); + VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, + VIO_both_dir, B_TRUE); if (rv != 0) { /* @@ -8081,6 +7833,7 @@ static int vdc_validate_geometry(vdc_t *vdc) { + buf_t *buf; /* BREAD requests need to be in a buf_t structure */ dev_t dev; int rv, rval; struct dk_label *label; @@ -8207,9 +7960,27 @@ * Read disk label from start of disk */ label = kmem_alloc(vdc->vdisk_bsize, KM_SLEEP); - - rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)label, vdc->vdisk_bsize, - VD_SLICE_NONE, 0, NULL, VIO_read_dir, VDC_OP_NORMAL); + buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); + bioinit(buf); + buf->b_un.b_addr = (caddr_t)label; + buf->b_bcount = vdc->vdisk_bsize; + buf->b_flags = B_BUSY | B_READ; + buf->b_dev = cmpdev(dev); + rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)label, + vdc->vdisk_bsize, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); + if (rv) { + DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", + vdc->instance); + } else if (ddi_in_panic()) { + rv = vdc_drain_response(vdc, CB_STRATEGY, buf); + if (rv == 0) { + rv = geterror(buf); + } + } else { + rv = biowait(buf); + } + biofini(buf); + kmem_free(buf, sizeof (buf_t)); if (rv != 0 || label->dkl_magic != DKL_MAGIC || label->dkl_cksum != vdc_lbl2cksum(label)) { @@ -8260,8 +8031,7 @@ (void) vdc_validate_geometry(vdc); /* if the disk label has changed, update device nodes */ - if (vdc->vdisk_type == VD_DISK_TYPE_DISK && - vdc->vdisk_label != old_label) { + if (vdc->vdisk_label != old_label) { if (vdc->vdisk_label == VD_DISK_LABEL_EFI) rv = vdc_create_device_nodes_efi(vdc); @@ -8312,8 +8082,6 @@ int rv; vd_devid_t *vd_devid; size_t bufsize, bufid_len; - ddi_devid_t vdisk_devid; - char *devid_str; /* * At first sight, we don't know the size of the devid that the @@ -8328,10 +8096,10 @@ vd_devid = kmem_zalloc(bufsize, KM_SLEEP); bufid_len = bufsize - sizeof (vd_efi_t) - 1; - rv = vdc_do_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, - bufsize, 0, 0, NULL, VIO_both_dir, 0); - - DMSG(vdc, 2, "do_op returned %d\n", rv); + rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, + bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); + + DMSG(vdc, 2, "sync_op returned %d\n", rv); if (rv) { kmem_free(vd_devid, bufsize); @@ -8349,8 +8117,9 @@ vd_devid = kmem_zalloc(bufsize, KM_SLEEP); bufid_len = bufsize - sizeof (vd_efi_t) - 1; - rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, - bufsize, 0, 0, VIO_both_dir, B_TRUE); + rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, + (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, + VIO_both_dir, B_TRUE); if (rv) { kmem_free(vd_devid, bufsize); @@ -8373,58 +8142,23 @@ /* build an encapsulated devid based on the returned devid */ if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length, - vd_devid->id, &vdisk_devid) != DDI_SUCCESS) { + vd_devid->id, &vdc->devid) != DDI_SUCCESS) { DMSG(vdc, 1, "[%d] Fail to created devid\n", vdc->instance); kmem_free(vd_devid, bufsize); return (1); } - DEVID_FORMTYPE((impl_devid_t *)vdisk_devid, vd_devid->type); - - ASSERT(ddi_devid_valid(vdisk_devid) == DDI_SUCCESS); + DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type); + + ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS); kmem_free(vd_devid, bufsize); - if (vdc->devid != NULL) { - /* check that the devid hasn't changed */ - if (ddi_devid_compare(vdisk_devid, vdc->devid) == 0) { - ddi_devid_free(vdisk_devid); - return (0); - } - - cmn_err(CE_WARN, "vdisk@%d backend devid has changed", - vdc->instance); - - devid_str = ddi_devid_str_encode(vdc->devid, NULL); - - cmn_err(CE_CONT, "vdisk@%d backend initial devid: %s", - vdc->instance, - (devid_str)? devid_str : "<encoding error>"); - - if (devid_str) - ddi_devid_str_free(devid_str); - - devid_str = ddi_devid_str_encode(vdisk_devid, NULL); - - cmn_err(CE_CONT, "vdisk@%d backend current devid: %s", - vdc->instance, - (devid_str)? devid_str : "<encoding error>"); - - if (devid_str) - ddi_devid_str_free(devid_str); - - ddi_devid_free(vdisk_devid); + if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) { + DMSG(vdc, 1, "[%d] Fail to register devid\n", vdc->instance); return (1); } - if (ddi_devid_register(vdc->dip, vdisk_devid) != DDI_SUCCESS) { - DMSG(vdc, 1, "[%d] Fail to register devid\n", vdc->instance); - ddi_devid_free(vdisk_devid); - return (1); - } - - vdc->devid = vdisk_devid; - return (0); }
--- a/usr/src/uts/sun4v/sys/vdc.h Thu Nov 05 15:51:00 2009 -0800 +++ b/usr/src/uts/sun4v/sys/vdc.h Thu Nov 05 15:54:24 2009 -0800 @@ -84,16 +84,6 @@ #define VDC_SEQ_NUM_TODO 1 /* Request needs processing */ /* - * Flags for virtual disk operations. - */ -#define VDC_OP_STATE_RUNNING 0x01 /* do operation in running state */ -#define VDC_OP_ERRCHK_BACKEND 0x02 /* check backend on error */ -#define VDC_OP_ERRCHK_CONFLICT 0x04 /* check resv conflict on error */ - -#define VDC_OP_ERRCHK (VDC_OP_ERRCHK_BACKEND | VDC_OP_ERRCHK_CONFLICT) -#define VDC_OP_NORMAL (VDC_OP_STATE_RUNNING | VDC_OP_ERRCHK) - -/* * Macros to get UNIT and PART number */ #define VDCUNIT_SHIFT 3 @@ -181,26 +171,12 @@ VDC_STATE_INIT_WAITING, /* waiting for ldc connection */ VDC_STATE_NEGOTIATE, /* doing handshake negotiation */ VDC_STATE_HANDLE_PENDING, /* handle requests in backup dring */ - VDC_STATE_FAULTED, /* multipath backend is inaccessible */ - VDC_STATE_FAILED, /* device is not usable */ VDC_STATE_RUNNING, /* running and accepting requests */ VDC_STATE_DETACH, /* detaching */ VDC_STATE_RESETTING /* resetting connection with vds */ } vdc_state_t; /* - * States of the service provided by a vds server - */ -typedef enum vdc_service_state { - VDC_SERVICE_NONE = -1, /* no state define */ - VDC_SERVICE_OFFLINE, /* no connection with the service */ - VDC_SERVICE_CONNECTED, /* connection established */ - VDC_SERVICE_ONLINE, /* connection and backend available */ - VDC_SERVICE_FAILED, /* connection failed */ - VDC_SERVICE_FAULTED /* connection but backend unavailable */ -} vdc_service_state_t; - -/* * The states that the vdc instance can be in. */ typedef enum vdc_lc_state { @@ -222,6 +198,11 @@ VIO_both_dir /* transfer both in and out in same buffer */ } vio_desc_direction_t; +typedef enum { + CB_STRATEGY, /* non-blocking strategy call */ + CB_SYNC /* synchronous operation */ +} vio_cb_type_t; + typedef struct vdc_local_desc { boolean_t is_free; /* local state - inuse or not */ @@ -230,9 +211,9 @@ int slice; diskaddr_t offset; /* disk offset */ size_t nbytes; - struct buf *buf; /* buf of operation */ + vio_cb_type_t cb_type; /* operation type blk/nonblk */ + void *cb_arg; /* buf passed to strategy() */ vio_desc_direction_t dir; /* direction of transfer */ - int flags; /* flags of operation */ caddr_t align_addr; /* used if addr non-aligned */ ldc_mem_handle_t desc_mhdl; /* Mem handle of buf */ @@ -241,11 +222,11 @@ } vdc_local_desc_t; /* - * I/O queue used for checking backend or failfast + * I/O queue used by failfast */ typedef struct vdc_io { struct vdc_io *vio_next; /* next pending I/O in the queue */ - int vio_index; /* descriptor index */ + struct buf *vio_buf; /* buf for CB_STRATEGY I/O */ clock_t vio_qtime; /* time the I/O was queued */ } vdc_io_t; @@ -265,8 +246,6 @@ struct vdc *vdcp; /* Ptr to vdc struct */ uint64_t id; /* Server port id */ uint64_t state; /* Server state */ - vdc_service_state_t svc_state; /* Service state */ - vdc_service_state_t log_state; /* Last state logged */ uint64_t ldc_id; /* Server LDC id */ ldc_handle_t ldc_handle; /* Server LDC handle */ ldc_status_t ldc_state; /* Server LDC state */ @@ -283,9 +262,7 @@ kcondvar_t initwait_cv; /* signal when ldc conn is up */ kcondvar_t dring_free_cv; /* signal when desc is avail */ kcondvar_t membind_cv; /* signal when mem can be bound */ - boolean_t self_reset; /* self initiated reset */ - kcondvar_t io_pending_cv; /* signal on pending I/O */ - boolean_t io_pending; /* pending I/O */ + boolean_t self_reset; int initialized; /* keeps track of what's init'ed */ vdc_lc_state_t lifecycle; /* Current state of the vdc instance */ @@ -308,7 +285,10 @@ vdc_rd_state_t read_state; /* current read state */ uint32_t sync_op_cnt; /* num of active sync operations */ + boolean_t sync_op_pending; /* sync operation is pending */ boolean_t sync_op_blocked; /* blocked waiting to do sync op */ + uint32_t sync_op_status; /* status of sync operation */ + kcondvar_t sync_pending_cv; /* cv wait for sync op to finish */ kcondvar_t sync_blocked_cv; /* cv wait for other syncs to finish */ uint64_t session_id; /* common ID sent with all messages */ @@ -346,12 +326,13 @@ kcondvar_t ownership_cv; /* cv for ownership update */ /* - * The eio and failfast fields are protected by the lock mutex. + * The failfast fields are protected by the lock mutex. */ - kthread_t *eio_thread; /* error io thread */ - kcondvar_t eio_cv; /* cv for eio thread update */ - vdc_io_t *eio_queue; /* error io queue */ + kthread_t *failfast_thread; /* failfast thread */ clock_t failfast_interval; /* interval in microsecs */ + kcondvar_t failfast_cv; /* cv for failfast update */ + kcondvar_t failfast_io_cv; /* cv wait for I/O to finish */ + vdc_io_t *failfast_io_queue; /* failfast io queue */ /* * kstats used to store I/O statistics consumed by iostat(1M).