Mercurial > illumos > illumos-gate
changeset 3401:b2b8301bd444
6422488 vdc should read/write VTOC & GEOM to a disk on file
6483975 vDisk server should bypass lofi to access disk image files
6492690 vds unable to open zfs volumes on boot
6505765 vntsd crashes while in a tight loop of bind/unbind a guest domain
6506356 VD_OP_BREAD/BWRITE operations do not report nblks read/written by vds
6510356 Vdc and Vds should handle bad or non existent devices better.
author | narayan |
---|---|
date | Wed, 10 Jan 2007 13:46:29 -0800 |
parents | 2427f35263a7 |
children | 6c433d8a7001 |
files | usr/src/cmd/vntsd/console.c usr/src/cmd/vntsd/listen.c usr/src/cmd/vntsd/vntsdvcc.c usr/src/uts/sun4v/io/vdc.c usr/src/uts/sun4v/io/vds.c |
diffstat | 5 files changed, 578 insertions(+), 73 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/cmd/vntsd/console.c Wed Jan 10 11:14:23 2007 -0800 +++ b/usr/src/cmd/vntsd/console.c Wed Jan 10 13:46:29 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" @@ -704,11 +704,29 @@ case VNTSD_SUCCESS: case VNTSD_STATUS_CONTINUE: - case VNTSD_STATUS_NO_CONS: (void) mutex_unlock(&groupp->lock); client_init(clientp); return; + + case VNTSD_STATUS_NO_CONS: + /* + * there are two cases when the status is VNTSD_SATATUS_NO_CONS. + * case 1. the console was removed but there is at least one + * another console in the group that client can connect to. + * case 2. there is no console in the group. Client needs to + * be disconnected from vntsd. + */ + if (groupp->num_cons == 0) { + (void) mutex_unlock(&groupp->lock); + client_fini(groupp, clientp); + } else { + (void) mutex_unlock(&groupp->lock); + client_init(clientp); + } + return; + + case VNTSD_ERR_INVALID_INPUT: (void) mutex_unlock(&groupp->lock); return; @@ -743,6 +761,9 @@ assert(groupp); assert(clientp); + /* free argp, which was allocated in listen thread */ + free(argp); + /* check if group is removed */ D1(stderr, "t@%d get_client_sel@%lld:client@%d\n", thr_self(), @@ -801,9 +822,11 @@ case ' ': - if (num_cons == 0) + if (num_cons == 0) { /* no console in the group */ + rv = VNTSD_STATUS_NO_CONS; break; + } if (clientp->cons == NULL) { if (num_cons == 1) { @@ -832,6 +855,15 @@ rv = display_help(clientp); break; } + + /* + * all consoles in the group + * may be gone before this client + * could select one. + */ + if (rv != VNTSD_SUCCESS) + break; + } else { consp = clientp->cons; }
--- a/usr/src/cmd/vntsd/listen.c Wed Jan 10 11:14:23 2007 -0800 +++ b/usr/src/cmd/vntsd/listen.c Wed Jan 10 13:46:29 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" @@ -170,6 +170,12 @@ if (errno == EADDRINUSE && retries < MAX_BIND_RETRIES) { /* port may be in TIME_WAIT state, retry */ (void) sleep(5); + + /* woke up by signal? */ + if (errno == EINTR) { + return (VNTSD_STATUS_INTR); + } + retries++; continue; } @@ -198,7 +204,7 @@ create_console_thread(vntsd_group_t *groupp, int sockfd) { vntsd_client_t *clientp; - vntsd_thr_arg_t arg; + vntsd_thr_arg_t *thr_arg; int rv; @@ -223,6 +229,14 @@ /* append client to group */ (void) mutex_lock(&groupp->lock); + /* check if the group is [being] removed */ + if (groupp->status & VNTSD_GROUP_IN_CLEANUP) { + (void) mutex_unlock(&groupp->lock); + vntsd_free_client(clientp); + return (VNTSD_STATUS_NO_CONS); + } + + if ((rv = vntsd_que_append(&groupp->no_cons_clientpq, clientp)) != VNTSD_SUCCESS) { (void) mutex_unlock(&groupp->lock); @@ -232,18 +246,28 @@ (void) mutex_unlock(&groupp->lock); + /* + * allocate thr_arg from heap for console thread so + * that thr_arg is still valid after this function exits. + * console thread will free thr_arg. + */ + + thr_arg = (vntsd_thr_arg_t *)malloc(sizeof (vntsd_thr_arg_t)); + if (thr_arg == NULL) { + vntsd_free_client(clientp); + return (VNTSD_ERR_NO_MEM); + } + thr_arg->handle = groupp; + thr_arg->arg = clientp; + (void) mutex_lock(&clientp->lock); - /* parameters for console thread */ - bzero(&arg, sizeof (arg)); - - arg.handle = groupp; - arg.arg = clientp; /* create console selection thread */ if (thr_create(NULL, 0, (thr_func_t)vntsd_console_thread, - &arg, THR_DETACHED, &clientp->cons_tid)) { + thr_arg, THR_DETACHED, &clientp->cons_tid)) { + free(thr_arg); (void) mutex_unlock(&clientp->lock); (void) mutex_lock(&groupp->lock); (void) vntsd_que_rm(&groupp->no_cons_clientpq, clientp);
--- a/usr/src/cmd/vntsd/vntsdvcc.c Wed Jan 10 11:14:23 2007 -0800 +++ b/usr/src/cmd/vntsd/vntsdvcc.c Wed Jan 10 13:46:29 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" @@ -521,12 +521,12 @@ if ((consp->status & VNTSD_CONS_DELETED) == 0) return (B_TRUE); - /* notify clients of console ? */ - clientp = (vntsd_client_t *)consp->clientpq->handle; + if (consp->clientpq == NULL) + /* there is no client for this console */ + return (B_TRUE); - if (clientp == NULL) - /* therre is no client for this console */ - return (B_TRUE); + /* need to notify clients of console ? */ + clientp = (vntsd_client_t *)consp->clientpq->handle; if (clientp->status & VNTSD_CLIENT_CONS_DELETED) /* clients of console have notified */ @@ -576,18 +576,23 @@ (void) mutex_unlock(&groupp->lock); return; } - /* console exists - delete console */ + + /* console exists - mark console for main thread to delete it */ + (void) mutex_lock(&consp->lock); - (void) mutex_lock(&consp->lock); + if (consp->status & VNTSD_CONS_DELETED) { + /* already marked */ + (void) mutex_unlock(&consp->lock); + (void) mutex_unlock(&groupp->lock); + return; + } consp->status |= VNTSD_CONS_DELETED; groupp->status |= VNTSD_GROUP_CLEAN_CONS; (void) mutex_unlock(&consp->lock); - (void) mutex_unlock(&groupp->lock); - vntsd_delete_cons(vntsdp); } /* add a console */
--- a/usr/src/uts/sun4v/io/vdc.c Wed Jan 10 11:14:23 2007 -0800 +++ b/usr/src/uts/sun4v/io/vdc.c Wed Jan 10 13:46:29 2007 -0800 @@ -203,6 +203,7 @@ * various operations */ static int vdc_retries = 10; +static int vdc_hshake_retries = 3; /* calculated from 'vdc_usec_timeout' during attach */ static uint64_t vdc_hz_timeout; /* units: Hz */ @@ -2373,9 +2374,16 @@ mutex_enter(&vdcp->lock); do { - while (vdcp->state != VDC_STATE_RUNNING) + while (vdcp->state != VDC_STATE_RUNNING) { cv_wait(&vdcp->running_cv, &vdcp->lock); + /* return error if detaching */ + if (vdcp->state == VDC_STATE_DETACH) { + mutex_exit(&vdcp->lock); + return (ENXIO); + } + } + } while (vdc_populate_descriptor(vdcp, operation, addr, nbytes, slice, offset, cb_type, cb_arg, dir)); @@ -2604,10 +2612,13 @@ cv_wait(&vdcp->sync_pending_cv, &vdcp->lock); DMSG(vdcp, 2, ": operation returned %d\n", vdcp->sync_op_status); - if (vdcp->state == VDC_STATE_DETACH) + if (vdcp->state == VDC_STATE_DETACH) { + vdcp->sync_op_pending = B_FALSE; status = ENXIO; - else + } else { status = vdcp->sync_op_status; + } + vdcp->sync_op_status = 0; vdcp->sync_op_blocked = B_FALSE; vdcp->sync_op_cnt--; @@ -2780,11 +2791,12 @@ */ if (ldep->align_addr) { ASSERT(ldep->addr != NULL); - ASSERT(dep->payload.nbytes > 0); - - bcopy(ldep->align_addr, ldep->addr, dep->payload.nbytes); + + if (dep->payload.nbytes > 0) + bcopy(ldep->align_addr, ldep->addr, + dep->payload.nbytes); kmem_free(ldep->align_addr, - sizeof (caddr_t) * P2ROUNDUP(dep->payload.nbytes, 8)); + sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); ldep->align_addr = NULL; } @@ -2909,7 +2921,7 @@ vdcp->instance, mhdl, i, rv); if (ldep->align_addr) { kmem_free(ldep->align_addr, - sizeof (caddr_t) * dep->payload.nbytes); + sizeof (caddr_t) * ldep->nbytes); ldep->align_addr = NULL; } return (EAGAIN); @@ -3280,7 +3292,9 @@ case VDC_STATE_INIT: /* Check if have re-initializing repeatedly */ - if (vdcp->hshake_cnt++ > VDC_RETRIES) { + if (vdcp->hshake_cnt++ > vdc_hshake_retries) { + cmn_err(CE_NOTE, "[%d] disk access failed.\n", + vdcp->instance); vdcp->state = VDC_STATE_DETACH; break; } @@ -3461,6 +3475,12 @@ DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", vdcp->instance); + /* + * Signal anyone waiting for connection + * to come online + */ + cv_broadcast(&vdcp->running_cv); + while (vdcp->sync_op_pending) { cv_signal(&vdcp->sync_pending_cv); cv_signal(&vdcp->sync_blocked_cv); @@ -3469,7 +3489,6 @@ mutex_enter(&vdcp->lock); } - cv_signal(&vdcp->running_cv); mutex_exit(&vdcp->lock); DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n", @@ -3584,6 +3603,8 @@ case CB_STRATEGY: bufp = ldep->cb_arg; ASSERT(bufp != NULL); + bufp->b_resid = + bufp->b_bcount - ldep->dep->payload.nbytes; status = ldep->dep->payload.status; /* Future:ntoh */ if (status != 0) { DMSG(vdcp, 1, "strategy status=%d\n", status); @@ -3591,6 +3612,10 @@ } status = vdc_depopulate_descriptor(vdcp, idx); biodone(bufp); + + DMSG(vdcp, 1, + "strategy complete req=%ld bytes resp=%ld bytes\n", + bufp->b_bcount, ldep->dep->payload.nbytes); break; default:
--- a/usr/src/uts/sun4v/io/vds.c Wed Jan 10 11:14:23 2007 -0800 +++ b/usr/src/uts/sun4v/io/vds.c Wed Jan 10 13:46:29 2007 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -48,15 +48,17 @@ #include <sys/vdsk_mailbox.h> #include <sys/vdsk_common.h> #include <sys/vtoc.h> - +#include <sys/vfs.h> +#include <sys/stat.h> /* Virtual disk server initialization flags */ #define VDS_LDI 0x01 #define VDS_MDEG 0x02 /* Virtual disk server tunable parameters */ -#define VDS_LDC_RETRIES 5 -#define VDS_LDC_DELAY 1000 /* usec */ +#define VDS_RETRIES 5 +#define VDS_LDC_DELAY 1000 /* 1 msecs */ +#define VDS_DEV_DELAY 10000000 /* 10 secs */ #define VDS_NCHAINS 32 /* Identification parameters for MD, synthetic dkio(7i) structures, etc. */ @@ -72,11 +74,12 @@ #define VD_REG_PROP "reg" /* Virtual disk initialization flags */ -#define VD_LOCKING 0x01 -#define VD_LDC 0x02 -#define VD_DRING 0x04 -#define VD_SID 0x08 -#define VD_SEQ_NUM 0x10 +#define VD_DISK_READY 0x01 +#define VD_LOCKING 0x02 +#define VD_LDC 0x04 +#define VD_DRING 0x08 +#define VD_SID 0x10 +#define VD_SEQ_NUM 0x20 /* Flags for opening/closing backing devices via LDI */ #define VD_OPEN_FLAGS (FEXCL | FREAD | FWRITE) @@ -143,13 +146,12 @@ static int vd_msglevel = 0; - #define PR0 if (vd_msglevel > 0) PRN #define PR1 if (vd_msglevel > 1) PRN #define PR2 if (vd_msglevel > 2) PRN #define VD_DUMP_DRING_ELEM(elem) \ - PRN("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n", \ + PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n", \ elem->hdr.dstate, \ elem->payload.operation, \ elem->payload.status, \ @@ -284,6 +286,7 @@ ddi_taskq_t *startq; /* queue for I/O start tasks */ ddi_taskq_t *completionq; /* queue for completion tasks */ ldi_handle_t ldi_handle[V_NUMPAR]; /* LDI slice handles */ + char device_path[MAXPATHLEN + 1]; /* vdisk device */ dev_t dev[V_NUMPAR]; /* dev numbers for slices */ uint_t nslices; /* number of slices */ size_t vdisk_size; /* number of blocks in vdisk */ @@ -291,6 +294,10 @@ vd_disk_label_t vdisk_label; /* EFI or VTOC label */ ushort_t max_xfer_sz; /* max xfer size in DEV_BSIZE */ boolean_t pseudo; /* underlying pseudo dev */ + boolean_t file; /* underlying file */ + char *file_maddr; /* file mapping address */ + vnode_t *file_vnode; /* file vnode */ + size_t file_size; /* file size */ struct dk_efi dk_efi; /* synthetic for slice type */ struct dk_geom dk_geom; /* synthetic for slice type */ struct vtoc vtoc; /* synthetic for slice type */ @@ -340,8 +347,10 @@ #define VD_IDENTITY ((void (*)(void *, void *))-1) -static int vds_ldc_retries = VDS_LDC_RETRIES; +static int vds_ldc_retries = VDS_RETRIES; static int vds_ldc_delay = VDS_LDC_DELAY; +static int vds_dev_retries = VDS_RETRIES; +static int vds_dev_delay = VDS_DEV_DELAY; static void *vds_state; static uint64_t vds_operations; /* see vds_operation[] definition below */ @@ -359,6 +368,8 @@ sizeof (vds_version)/sizeof (vds_version[0]); static void vd_free_dring_task(vd_t *vdp); +static int vd_setup_vd(vd_t *vd); +static boolean_t vd_enabled(vd_t *vd); static int vd_start_bio(vd_task_t *task) @@ -368,11 +379,16 @@ vd_dring_payload_t *request = task->request; struct buf *buf = &task->buf; uint8_t mtype; - + caddr_t addr; + size_t offset, maxlen; + int slice; ASSERT(vd != NULL); ASSERT(request != NULL); - ASSERT(request->slice < vd->nslices); + + slice = request->slice; + + ASSERT(slice < vd->nslices); ASSERT((request->operation == VD_OP_BREAD) || (request->operation == VD_OP_BWRITE)); @@ -387,7 +403,7 @@ buf->b_flags = B_BUSY; buf->b_bcount = request->nbytes; buf->b_lblkno = request->addr; - buf->b_edev = vd->dev[request->slice]; + buf->b_edev = vd->dev[slice]; mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP; @@ -412,9 +428,62 @@ buf->b_flags |= (request->operation == VD_OP_BREAD) ? B_READ : B_WRITE; /* Start the block I/O */ - if ((status = ldi_strategy(vd->ldi_handle[request->slice], buf)) == 0) - return (EINPROGRESS); /* will complete on completionq */ - + if (vd->file) { + + if (request->addr >= vd->vtoc.v_part[slice].p_size) { + /* address past the end of the slice */ + PR0("req_addr (0x%lx) > psize (0x%lx)", + request->addr, vd->vtoc.v_part[slice].p_size); + request->nbytes = 0; + status = 0; + goto cleanup; + } + + offset = (vd->vtoc.v_part[slice].p_start + + request->addr) * DEV_BSIZE; + + /* + * If the requested size is greater than the size + * of the partition, truncate the read/write. + */ + maxlen = (vd->vtoc.v_part[slice].p_size - + request->addr) * DEV_BSIZE; + + if (request->nbytes > maxlen) { + PR0("I/O size truncated to %lu bytes from %lu bytes", + maxlen, request->nbytes); + request->nbytes = maxlen; + } + + /* + * We have to ensure that we are reading/writing into the mmap + * range. If we have a partial disk image (e.g. an image of + * s0 instead s2) the system can try to access slices that + * are not included into the disk image. + */ + if ((offset + request->nbytes) >= vd->file_size) { + PR0("offset + nbytes (0x%lx + 0x%lx) >= " + "file_size (0x%lx)", offset, request->nbytes, + vd->file_size); + request->nbytes = 0; + status = EIO; + goto cleanup; + } + + addr = vd->file_maddr + offset; + + if (request->operation == VD_OP_BREAD) + bcopy(addr, buf->b_un.b_addr, request->nbytes); + else + bcopy(buf->b_un.b_addr, addr, request->nbytes); + + } else { + status = ldi_strategy(vd->ldi_handle[slice], buf); + if (status == 0) + return (EINPROGRESS); /* will complete on completionq */ + } + +cleanup: /* Clean up after error */ rv = ldc_mem_release(task->mhdl, 0, buf->b_bcount); if (rv) { @@ -492,6 +561,13 @@ */ ddi_taskq_wait(vd->completionq); + if (vd->file) { + status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred); + if (status) { + PR0("VOP_FSYNC returned errno %d", status); + } + } + if ((vd->initialized & VD_DRING) && ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)) PR0("ldc_mem_dring_unmap() returned errno %d", status); @@ -552,7 +628,7 @@ } static int -vd_mark_elem_done(vd_t *vd, int idx, int elem_status) +vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes) { boolean_t accepted; int status; @@ -577,6 +653,7 @@ /* Set the element's status and mark it done */ accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED); if (accepted) { + elem->payload.nbytes = elem_nbytes; elem->payload.status = elem_status; elem->hdr.dstate = VIO_DESC_DONE; } else { @@ -614,10 +691,14 @@ ASSERT(request != NULL); ASSERT(task->msg != NULL); ASSERT(task->msglen >= sizeof (*task->msg)); + ASSERT(!vd->file); /* Wait for the I/O to complete */ request->status = biowait(buf); + /* return back the number of bytes read/written */ + request->nbytes = buf->b_bcount - buf->b_resid; + /* Release the buffer */ if (!vd->reset_state) status = ldc_mem_release(task->mhdl, 0, buf->b_bcount); @@ -644,7 +725,8 @@ /* Update the dring element for a dring client */ if (!vd->reset_state && (status == 0) && (vd->xfer_mode == VIO_DRING_MODE)) { - status = vd_mark_elem_done(vd, task->index, request->status); + status = vd_mark_elem_done(vd, task->index, + request->status, request->nbytes); if (status == ECONNRESET) vd_mark_in_reset(vd); } @@ -787,10 +869,28 @@ return (0); } +static short +vd_lbl2cksum(struct dk_label *label) +{ + int count; + short sum, *sp; + + count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; + sp = (short *)label; + sum = 0; + while (count--) { + sum ^= *sp++; + } + + return (sum); +} + static int vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg) { dk_efi_t *dk_ioc; + struct dk_label *label; + int i; switch (vd->vdisk_label) { @@ -805,6 +905,36 @@ ASSERT(ioctl_arg != NULL); bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc)); return (0); + case DKIOCSVTOC: + if (!vd->file) + return (ENOTSUP); + ASSERT(ioctl_arg != NULL); + bcopy(ioctl_arg, &vd->vtoc, sizeof (vd->vtoc)); + /* write new VTOC to file */ + label = (struct dk_label *)vd->file_maddr; + label->dkl_vtoc.v_nparts = vd->vtoc.v_nparts; + label->dkl_vtoc.v_sanity = vd->vtoc.v_sanity; + label->dkl_vtoc.v_version = vd->vtoc.v_version; + bcopy(vd->vtoc.v_volume, label->dkl_vtoc.v_volume, + LEN_DKL_VVOL); + for (i = 0; i < vd->vtoc.v_nparts; i++) { + label->dkl_vtoc.v_timestamp[i] = + vd->vtoc.timestamp[i]; + label->dkl_vtoc.v_part[i].p_tag = + vd->vtoc.v_part[i].p_tag; + label->dkl_vtoc.v_part[i].p_flag = + vd->vtoc.v_part[i].p_flag; + label->dkl_map[i].dkl_cylno = + vd->vtoc.v_part[i].p_start / + (label->dkl_nhead * label->dkl_nsect); + label->dkl_map[i].dkl_nblk = + vd->vtoc.v_part[i].p_size; + } + + /* re-compute checksum */ + label->dkl_cksum = vd_lbl2cksum(label); + + return (0); default: return (ENOTSUP); } @@ -862,7 +992,7 @@ * Handle single-slice block devices internally; otherwise, have the * real driver perform the ioctl() */ - if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) { + if (vd->file || (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo)) { if ((status = vd_do_slice_ioctl(vd, ioctl->cmd, (void *)ioctl->arg)) != 0) return (status); @@ -990,7 +1120,7 @@ status = vd_do_ioctl(vd, request, buf, &ioctl[i]); if (request->nbytes) kmem_free(buf, request->nbytes); - if (vd->vdisk_type == VD_DISK_TYPE_DISK && + if (!vd->file && vd->vdisk_type == VD_DISK_TYPE_DISK && (request->operation == VD_OP_SET_VTOC || request->operation == VD_OP_SET_EFI)) { /* update disk information */ @@ -1015,6 +1145,11 @@ PR1("Get Device ID, nbytes=%ld", request->nbytes); + if (vd->file) { + /* no devid for disk on file */ + return (ENOENT); + } + if (ddi_lyr_get_devid(vd->dev[request->slice], (ddi_devid_t *)&devid) != DDI_SUCCESS) { /* the most common failure is that no devid is available */ @@ -1294,6 +1429,7 @@ vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) { vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; + int status, retry = 0; ASSERT(msglen >= sizeof (msg->tag)); @@ -1321,6 +1457,39 @@ return (EBADMSG); } + /* + * check if the underlying disk is ready, if not try accessing + * the device again. Open the vdisk device and extract info + * about it, as this is needed to respond to the attr info msg + */ + if ((vd->initialized & VD_DISK_READY) == 0) { + PR0("Retry setting up disk (%s)", vd->device_path); + do { + status = vd_setup_vd(vd); + if (status != EAGAIN || ++retry > vds_dev_retries) + break; + + /* incremental delay */ + delay(drv_usectohz(vds_dev_delay)); + + /* if vdisk is no longer enabled - return error */ + if (!vd_enabled(vd)) + return (ENXIO); + + } while (status == EAGAIN); + + if (status) + return (ENXIO); + + vd->initialized |= VD_DISK_READY; + ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); + PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", + ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), + (vd->pseudo ? "yes" : "no"), + (vd->file ? "yes" : "no"), + vd->nslices); + } + /* Success: valid message and transfer mode */ vd->xfer_mode = attr_msg->xfer_mode; @@ -1670,7 +1839,9 @@ vd->dring_task[idx].msglen = msglen; if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS) - status = vd_mark_elem_done(vd, idx, elem->payload.status); + status = vd_mark_elem_done(vd, idx, + vd->dring_task[idx].request->status, + vd->dring_task[idx].request->nbytes); return (status); } @@ -2353,11 +2524,232 @@ } static int -vd_setup_vd(char *device_path, vd_t *vd) +vd_setup_file(vd_t *vd) +{ + int i, rval, status; + short sum; + vattr_t vattr; + dev_t dev; + char *file_path = vd->device_path; + char dev_path[MAXPATHLEN + 1]; + ldi_handle_t lhandle; + struct dk_cinfo dk_cinfo; + struct dk_label *label; + + /* make sure the file is valid */ + if ((status = lookupname(file_path, UIO_SYSSPACE, FOLLOW, + NULLVPP, &vd->file_vnode)) != 0) { + PR0("Cannot lookup file(%s) errno %d", file_path, status); + return (status); + } + + if (vd->file_vnode->v_type != VREG) { + PR0("Invalid file type (%s)\n", file_path); + VN_RELE(vd->file_vnode); + return (EBADF); + } + VN_RELE(vd->file_vnode); + + if ((status = vn_open(file_path, UIO_SYSSPACE, vd_open_flags | FOFFMAX, + 0, &vd->file_vnode, 0, 0)) != 0) { + PR0("vn_open(%s) = errno %d", file_path, status); + return (status); + } + + vattr.va_mask = AT_SIZE; + if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred)) != 0) { + PR0("VOP_GETATTR(%s) = errno %d", file_path, status); + (void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 0, kcred); + VN_RELE(vd->file_vnode); + return (EIO); + } + + vd->file_size = vattr.va_size; + /* size should be at least sizeof(dk_label) */ + if (vd->file_size < sizeof (struct dk_label)) { + PRN("Size of file has to be at least %ld bytes", + sizeof (struct dk_label)); + (void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 0, kcred); + VN_RELE(vd->file_vnode); + return (EIO); + } + + if ((status = VOP_MAP(vd->file_vnode, 0, &kas, &vd->file_maddr, + vd->file_size, PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE, + MAP_SHARED, kcred)) != 0) { + PR0("VOP_MAP(%s) = errno %d", file_path, status); + (void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 0, kcred); + VN_RELE(vd->file_vnode); + return (EIO); + } + + label = (struct dk_label *)vd->file_maddr; + + /* label checksum */ + sum = vd_lbl2cksum(label); + + if (label->dkl_magic != DKL_MAGIC || label->dkl_cksum != sum) { + PR0("%s has an invalid disk label " + "(magic=%x cksum=%x (expect %x))", + file_path, label->dkl_magic, label->dkl_cksum, sum); + + /* default label */ + bzero(label, sizeof (struct dk_label)); + + /* + * We must have a resonable number of cylinders and sectors so + * that newfs can run using default values. + * + * if (disk_size < 2MB) + * phys_cylinders = disk_size / 100K + * else + * phys_cylinders = disk_size / 300K + * + * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders + * alt_cylinders = (phys_cylinders > 2) ? 2 : 0; + * data_cylinders = phys_cylinders - alt_cylinders + * + * sectors = disk_size / (phys_cylinders * blk_size) + */ + if (vd->file_size < (2 * 1024 * 1024)) + label->dkl_pcyl = vd->file_size / (100 * 1024); + else + label->dkl_pcyl = vd->file_size / (300 * 1024); + + if (label->dkl_pcyl == 0) + label->dkl_pcyl = 1; + + if (label->dkl_pcyl > 2) + label->dkl_acyl = 2; + else + label->dkl_acyl = 0; + + label->dkl_nsect = vd->file_size / + (DEV_BSIZE * label->dkl_pcyl); + label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl; + label->dkl_nhead = 1; + label->dkl_write_reinstruct = 0; + label->dkl_read_reinstruct = 0; + label->dkl_rpm = 7200; + label->dkl_apc = 0; + label->dkl_intrlv = 0; + label->dkl_magic = DKL_MAGIC; + + PR0("requested disk size: %ld bytes\n", vd->file_size); + PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl, + label->dkl_nhead, label->dkl_nsect); + PR0("provided disk size: %ld bytes\n", (uint64_t) + (label->dkl_pcyl * + label->dkl_nhead * label->dkl_nsect * DEV_BSIZE)); + + /* + * We must have a correct label name otherwise format(1m) will + * not recognized the disk as labeled. + */ + (void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII, + "SUNVDSK cyl %d alt %d hd %d sec %d", + label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead, + label->dkl_nsect); + + /* default VTOC */ + label->dkl_vtoc.v_version = V_VERSION; + label->dkl_vtoc.v_nparts = 8; + label->dkl_vtoc.v_sanity = VTOC_SANE; + label->dkl_vtoc.v_part[2].p_tag = V_BACKUP; + label->dkl_map[2].dkl_cylno = 0; + label->dkl_map[2].dkl_nblk = label->dkl_ncyl * + label->dkl_nhead * label->dkl_nsect; + label->dkl_map[0] = label->dkl_map[2]; + label->dkl_map[0] = label->dkl_map[2]; + label->dkl_cksum = vd_lbl2cksum(label); + } + + vd->nslices = label->dkl_vtoc.v_nparts; + + /* sector size = block size = DEV_BSIZE */ + vd->vdisk_size = (label->dkl_pcyl * + label->dkl_nhead * label->dkl_nsect) / DEV_BSIZE; + vd->vdisk_type = VD_DISK_TYPE_DISK; + vd->vdisk_label = VD_DISK_LABEL_VTOC; + vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */ + + /* Get max_xfer_sz from the device where the file is */ + dev = vd->file_vnode->v_vfsp->vfs_dev; + dev_path[0] = NULL; + if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) { + PR0("underlying device = %s\n", dev_path); + } + + if ((status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, + kcred, &lhandle, vd->vds->ldi_ident)) != 0) { + PR0("ldi_open_by_dev() returned errno %d for device %s", + status, dev_path); + } else { + if ((status = ldi_ioctl(lhandle, DKIOCINFO, + (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred, + &rval)) != 0) { + PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s", + status, dev_path); + } else { + /* + * Store the device's max transfer size for + * return to the client + */ + vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; + } + + PR0("close the device %s", dev_path); + (void) ldi_close(lhandle, FREAD, kcred); + } + + PR0("using for file %s, dev %s, max_xfer = %u blks", + file_path, dev_path, vd->max_xfer_sz); + + vd->pseudo = B_FALSE; + vd->file = B_TRUE; + + vd->dk_geom.dkg_ncyl = label->dkl_ncyl; + vd->dk_geom.dkg_acyl = label->dkl_acyl; + vd->dk_geom.dkg_pcyl = label->dkl_pcyl; + vd->dk_geom.dkg_nhead = label->dkl_nhead; + vd->dk_geom.dkg_nsect = label->dkl_nsect; + vd->dk_geom.dkg_intrlv = label->dkl_intrlv; + vd->dk_geom.dkg_apc = label->dkl_apc; + vd->dk_geom.dkg_rpm = label->dkl_rpm; + vd->dk_geom.dkg_write_reinstruct = label->dkl_write_reinstruct; + vd->dk_geom.dkg_read_reinstruct = label->dkl_read_reinstruct; + + vd->vtoc.v_sanity = label->dkl_vtoc.v_sanity; + vd->vtoc.v_version = label->dkl_vtoc.v_version; + vd->vtoc.v_sectorsz = DEV_BSIZE; + vd->vtoc.v_nparts = label->dkl_vtoc.v_nparts; + + bcopy(label->dkl_vtoc.v_volume, vd->vtoc.v_volume, + LEN_DKL_VVOL); + bcopy(label->dkl_asciilabel, vd->vtoc.v_asciilabel, + LEN_DKL_ASCII); + + for (i = 0; i < vd->nslices; i++) { + vd->vtoc.timestamp[i] = label->dkl_vtoc.v_timestamp[i]; + vd->vtoc.v_part[i].p_tag = label->dkl_vtoc.v_part[i].p_tag; + vd->vtoc.v_part[i].p_flag = label->dkl_vtoc.v_part[i].p_flag; + vd->vtoc.v_part[i].p_start = label->dkl_map[i].dkl_cylno * + label->dkl_nhead * label->dkl_nsect; + vd->vtoc.v_part[i].p_size = label->dkl_map[i].dkl_nblk; + vd->ldi_handle[i] = NULL; + vd->dev[i] = NULL; + } + + return (0); +} + +static int +vd_setup_vd(vd_t *vd) { int rval, status; dev_info_t *dip; struct dk_cinfo dk_cinfo; + char *device_path = vd->device_path; /* * We need to open with FNDELAY so that opening an empty partition @@ -2365,7 +2757,19 @@ */ if ((status = ldi_open_by_name(device_path, vd_open_flags | FNDELAY, kcred, &vd->ldi_handle[0], vd->vds->ldi_ident)) != 0) { - PRN("ldi_open_by_name(%s) = errno %d", device_path, status); + PR0("ldi_open_by_name(%s) = errno %d", device_path, status); + + /* this may not be a device try opening as a file */ + if (status == ENXIO || status == ENODEV) + status = vd_setup_file(vd); + if (status) { + PR0("Cannot use device/file (%s), errno=%d\n", + device_path, status); + if (status == ENXIO || status == ENODEV || + status == ENOENT) { + return (EAGAIN); + } + } return (status); } @@ -2374,6 +2778,7 @@ * the slice we have just opened in case of an error. */ vd->nslices = 1; + vd->file = B_FALSE; /* Get device number and size of backing device */ if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) { @@ -2421,7 +2826,6 @@ /* Store the device's max transfer size for return to the client */ vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; - /* Determine if backing device is a pseudo device */ if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]), dev_to_instance(vd->dev[0]), 0)) == NULL) { @@ -2436,7 +2840,6 @@ return (0); /* ...and we're done */ } - /* If slice is entire-disk slice, initialize for full disk */ if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE) return (vd_setup_full_disk(vd)); @@ -2504,16 +2907,21 @@ } *vdp = vd; /* assign here so vds_destroy_vd() can cleanup later */ vd->vds = vds; - + (void) strncpy(vd->device_path, device_path, MAXPATHLEN); /* Open vdisk and initialize parameters */ - if ((status = vd_setup_vd(device_path, vd)) != 0) - return (status); - ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); - PR0("vdisk_type = %s, pseudo = %s, nslices = %u", - ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), - (vd->pseudo ? "yes" : "no"), vd->nslices); - + if ((status = vd_setup_vd(vd)) == 0) { + vd->initialized |= VD_DISK_READY; + + ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); + PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", + ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), + (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"), + vd->nslices); + } else { + if (status != EAGAIN) + return (status); + } /* Initialize locking */ if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, @@ -2689,13 +3097,22 @@ kmem_free(vd->inband_task.msg, vd->max_msglen); vd->inband_task.msg = NULL; } - - /* Close any open backing-device slices */ - for (uint_t slice = 0; slice < vd->nslices; slice++) { - if (vd->ldi_handle[slice] != NULL) { - PR0("Closing slice %u", slice); - (void) ldi_close(vd->ldi_handle[slice], - vd_open_flags | FNDELAY, kcred); + if (vd->initialized & VD_DISK_READY) { + if (vd->file) { + /* Unmap and close file */ + (void) as_unmap(&kas, vd->file_maddr, vd->file_size); + (void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, + 0, kcred); + VN_RELE(vd->file_vnode); + } else { + /* Close any open backing-device slices */ + for (uint_t slice = 0; slice < vd->nslices; slice++) { + if (vd->ldi_handle[slice] != NULL) { + PR0("Closing slice %u", slice); + (void) ldi_close(vd->ldi_handle[slice], + vd_open_flags | FNDELAY, kcred); + } + } } } @@ -2908,6 +3325,7 @@ return (MDEG_SUCCESS); } + static int vds_do_attach(dev_info_t *dip) { @@ -2950,6 +3368,7 @@ return (DDI_FAILURE); } + vds->dip = dip; vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS, vds_destroy_vd,