changeset 4963:ce6338ba4a73

6531913 vds can lose access to vdisks built from files located on the root fs 6575050 vds should support unformatted disks
author achartre
date Thu, 30 Aug 2007 07:43:53 -0700
parents 44219572abba
children a9481fc76e88
files usr/src/uts/sun4v/io/vdc.c usr/src/uts/sun4v/io/vds.c usr/src/uts/sun4v/sys/vdc.h
diffstat 3 files changed, 839 insertions(+), 489 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/sun4v/io/vdc.c	Thu Aug 30 01:17:19 2007 -0700
+++ b/usr/src/uts/sun4v/io/vdc.c	Thu Aug 30 07:43:53 2007 -0700
@@ -132,7 +132,10 @@
 static int	vdc_init_descriptor_ring(vdc_t *vdc);
 static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
 static int	vdc_setup_devid(vdc_t *vdc);
-static void	vdc_store_efi(vdc_t *vdc, struct dk_gpt *efi);
+static void	vdc_store_label_efi(vdc_t *vdc, struct dk_gpt *efi);
+static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *);
+static void	vdc_store_label_unk(vdc_t *vdc);
+static boolean_t vdc_is_opened(vdc_t *vdc);
 
 /* handshake with vds */
 static int		vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
@@ -174,8 +177,10 @@
 
 /* dkio */
 static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode);
-static int	vdc_create_fake_geometry(vdc_t *vdc);
-static int	vdc_setup_disk_layout(vdc_t *vdc);
+static void	vdc_create_fake_geometry(vdc_t *vdc);
+static int	vdc_validate_geometry(vdc_t *vdc);
+static void	vdc_validate(vdc_t *vdc);
+static void	vdc_validate_task(void *arg);
 static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
 		    int mode, int dir);
 static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
@@ -385,11 +390,25 @@
 		return (DDI_FAILURE);
 	}
 
-	if (vdc->open_count) {
+	if (vdc_is_opened(vdc)) {
 		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
 		return (DDI_FAILURE);
 	}
 
+	if (vdc->dkio_flush_pending) {
+		DMSG(vdc, 0,
+		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
+		    instance, vdc->dkio_flush_pending);
+		return (DDI_FAILURE);
+	}
+
+	if (vdc->validate_pending) {
+		DMSG(vdc, 0,
+		    "[%d] Cannot detach: %d outstanding validate request\n",
+		    instance, vdc->validate_pending);
+		return (DDI_FAILURE);
+	}
+
 	DMSG(vdc, 0, "[%d] proceeding...\n", instance);
 
 	/* mark instance as detaching */
@@ -465,8 +484,8 @@
 	if (vdc->vtoc)
 		kmem_free(vdc->vtoc, sizeof (struct vtoc));
 
-	if (vdc->label)
-		kmem_free(vdc->label, DK_LABEL_SIZE);
+	if (vdc->geom)
+		kmem_free(vdc->geom, sizeof (struct dk_geom));
 
 	if (vdc->devid) {
 		ddi_devid_unregister(dip);
@@ -518,7 +537,6 @@
 
 	vdc->dip	= dip;
 	vdc->instance	= instance;
-	vdc->open_count	= 0;
 	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
 	vdc->vdisk_label = VD_DISK_LABEL_UNK;
 	vdc->state	= VDC_STATE_INIT;
@@ -529,6 +547,7 @@
 	vdc->max_xfer_sz = maxphys / DEV_BSIZE;
 
 	vdc->vtoc = NULL;
+	vdc->geom = NULL;
 	vdc->cinfo = NULL;
 	vdc->minfo = NULL;
 
@@ -588,16 +607,18 @@
 	atomic_inc_32(&vdc_instance_count);
 
 	/*
-	 * Once the handshake is complete, we can use the DRing to send
-	 * requests to the vDisk server to calculate the geometry and
-	 * VTOC of the "disk"
+	 * Check the disk label. This will send requests and do the handshake.
+	 * We don't really care about the disk label now. What we really need is
+	 * the handshake to be done so that we know the type of the disk (slice
+	 * or full disk) and the appropriate device nodes can be created.
 	 */
-	status = vdc_setup_disk_layout(vdc);
-	if (status != 0) {
-		DMSG(vdc, 0, "[%d] Failed to discover disk layout (err%d)",
-		    vdc->instance, status);
-		goto return_status;
-	}
+	vdc->vdisk_label = VD_DISK_LABEL_UNK;
+	vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
+	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
+
+	mutex_enter(&vdc->lock);
+	(void) vdc_validate_geometry(vdc);
+	mutex_exit(&vdc->lock);
 
 	/*
 	 * Now that we have the device info we can create the
@@ -933,16 +954,11 @@
 	int		i;
 
 	ASSERT(vdc != NULL);
+	ASSERT(vdc->vtoc != NULL);
 
 	instance = vdc->instance;
 	dip = vdc->dip;
 
-	if ((vdc->vtoc == NULL) || (vdc->vtoc->v_sanity != VTOC_SANE)) {
-		DMSG(vdc, 0, "![%d] Could not create device node property."
-		    " No VTOC available", instance);
-		return (ENXIO);
-	}
-
 	switch (vdc->vdisk_type) {
 	case VD_DISK_TYPE_DISK:
 		num_slices = V_NUMPAR;
@@ -955,6 +971,17 @@
 		return (EINVAL);
 	}
 
+	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
+		/* remove all properties */
+		for (i = 0; i < num_slices; i++) {
+			dev = makedevice(ddi_driver_major(dip),
+			    VD_MAKE_DEV(instance, i));
+			(void) ddi_prop_remove(dev, dip, VDC_SIZE_PROP_NAME);
+			(void) ddi_prop_remove(dev, dip, VDC_NBLOCKS_PROP_NAME);
+		}
+		return (0);
+	}
+
 	for (i = 0; i < num_slices; i++) {
 		dev = makedevice(ddi_driver_major(dip),
 		    VD_MAKE_DEV(instance, i));
@@ -983,18 +1010,125 @@
 	return (0);
 }
 
+/*
+ * Function:
+ *	vdc_is_opened
+ *
+ * Description:
+ *	This function checks if any slice of a given virtual disk is
+ *	currently opened.
+ *
+ * Parameters:
+ *	vdc 		- soft state pointer
+ *
+ * Return Values
+ *	B_TRUE		- at least one slice is opened.
+ *	B_FALSE		- no slice is opened.
+ */
+static boolean_t
+vdc_is_opened(vdc_t *vdc)
+{
+	int i, nslices;
+
+	/*
+	 * A single-slice disk only has slice 0. Any other type, including
+	 * VD_DISK_TYPE_UNK (disk type not known yet), is scanned as a
+	 * full disk so that nslices is always defined, whatever the build
+	 * flavor, and no layered open recorded by vdc_mark_opened() (which
+	 * accepts any slice < V_NUMPAR) is missed.
+	 */
+	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE)
+		nslices = 1;
+	else
+		nslices = V_NUMPAR;
+
+	/* check if there's any layered open */
+	for (i = 0; i < nslices; i++) {
+		if (vdc->open_lyr[i] > 0)
+			return (B_TRUE);
+	}
+
+	/* check if there is any other kind of open */
+	for (i = 0; i < OTYPCNT; i++) {
+		if (vdc->open[i] != 0)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+static int
+vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
+{
+	uint8_t bit;	/* bit standing for this slice in the open masks */
+	int type;
+
+	ASSERT(otyp < OTYPCNT);
+	ASSERT(slice < V_NUMPAR);
+	ASSERT(MUTEX_HELD(&vdc->lock));
+
+	bit = 1 << slice;
+
+	/* nobody else gets in while the slice is exclusively opened */
+	if ((vdc->open_excl & bit) != 0)
+		return (EBUSY);
+
+	if ((flag & FEXCL) != 0) {
+		/* an exclusive open requires that the slice has no opener */
+		if (vdc->open_lyr[slice] > 0)
+			return (EBUSY);
+		for (type = 0; type < OTYPCNT; type++) {
+			if ((vdc->open[type] & bit) != 0)
+				return (EBUSY);
+		}
+		vdc->open_excl |= bit;
+	}
+
+	/* layered opens are counted, other types use one bit per slice */
+	if (otyp != OTYP_LYR) {
+		vdc->open[otyp] |= bit;
+	} else {
+		vdc->open_lyr[slice]++;
+	}
+
+	return (0);
+}
+
+static void
+vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
+{
+	uint8_t bit;	/* bit standing for this slice in the open masks */
+
+	ASSERT(otyp < OTYPCNT);
+	ASSERT(slice < V_NUMPAR);
+	ASSERT(MUTEX_HELD(&vdc->lock));
+
+	bit = 1 << slice;
+
+	if (otyp != OTYP_LYR) {
+		vdc->open[otyp] &= ~bit;
+	} else {
+		ASSERT(vdc->open_lyr[slice] > 0);
+		vdc->open_lyr[slice]--;	/* layered opens are counted */
+	}
+
+	if ((flag & FEXCL) != 0)	/* exclusive open released */
+		vdc->open_excl &= ~bit;
+}
+
 static int
 vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
 {
 	_NOTE(ARGUNUSED(cred))
 
-	int		instance;
-	vdc_t		*vdc;
+	int	instance;
+	int	slice, status = 0;
+	vdc_t	*vdc;
 
 	ASSERT(dev != NULL);
 	instance = VDCUNIT(*dev);
 
-	if ((otyp != OTYP_CHR) && (otyp != OTYP_BLK))
+	if (otyp >= OTYPCNT)
 		return (EINVAL);
 
 	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
@@ -1005,11 +1139,53 @@
 	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
 	    getminor(*dev), flag, otyp);
 
+	slice = VDCPART(*dev);
+
 	mutex_enter(&vdc->lock);
-	vdc->open_count++;
+
+	status = vdc_mark_opened(vdc, slice, flag, otyp);
+
+	if (status != 0) {
+		mutex_exit(&vdc->lock);
+		return (status);
+	}
+
+	if (flag & (FNDELAY | FNONBLOCK)) {
+
+		/* don't resubmit a validate request if there's already one */
+		if (vdc->validate_pending > 0) {
+			mutex_exit(&vdc->lock);
+			return (0);
+		}
+
+		/* call vdc_validate() asynchronously to avoid blocking */
+		if (taskq_dispatch(system_taskq, vdc_validate_task,
+		    (void *)vdc, TQ_NOSLEEP) == NULL) {
+			vdc_mark_closed(vdc, slice, flag, otyp);
+			mutex_exit(&vdc->lock);
+			return (ENXIO);
+		}
+
+		vdc->validate_pending++;
+		mutex_exit(&vdc->lock);
+		return (0);
+	}
+
 	mutex_exit(&vdc->lock);
 
-	return (0);
+	vdc_validate(vdc);
+
+	mutex_enter(&vdc->lock);
+
+	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
+	    vdc->vtoc->v_part[slice].p_size == 0) {
+		vdc_mark_closed(vdc, slice, flag, otyp);
+		status = EIO;
+	}
+
+	mutex_exit(&vdc->lock);
+
+	return (status);
 }
 
 static int
@@ -1018,11 +1194,12 @@
 	_NOTE(ARGUNUSED(cred))
 
 	int	instance;
+	int	slice;
 	vdc_t	*vdc;
 
 	instance = VDCUNIT(dev);
 
-	if ((otyp != OTYP_CHR) && (otyp != OTYP_BLK))
+	if (otyp >= OTYPCNT)
 		return (EINVAL);
 
 	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
@@ -1031,19 +1208,11 @@
 	}
 
 	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);
-	if (vdc->dkio_flush_pending) {
-		DMSG(vdc, 0,
-		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
-		    instance, vdc->dkio_flush_pending);
-		return (EBUSY);
-	}
-
-	/*
-	 * Should not need the mutex here, since the framework should protect
-	 * against more opens on this device, but just in case.
-	 */
+
+	slice = VDCPART(dev);
+
 	mutex_enter(&vdc->lock);
-	vdc->open_count--;
+	vdc_mark_closed(vdc, slice, flag, otyp);
 	mutex_exit(&vdc->lock);
 
 	return (0);
@@ -4072,6 +4241,32 @@
 		/*
 		 * We now verify the attributes sent by vds.
 		 */
+		if (attr_msg->vdisk_size == 0) {
+			DMSG(vdc, 0, "[%d] Invalid disk size from vds",
+			    vdc->instance);
+			status = EINVAL;
+			break;
+		}
+
+		if (attr_msg->max_xfer_sz == 0) {
+			DMSG(vdc, 0, "[%d] Invalid transfer size from vds",
+			    vdc->instance);
+			status = EINVAL;
+			break;
+		}
+
+		/*
+		 * If the disk size is already set check that it hasn't changed.
+		 */
+		if ((vdc->vdisk_size != 0) &&
+		    (vdc->vdisk_size != attr_msg->vdisk_size)) {
+			DMSG(vdc, 0, "[%d] Different disk size from vds "
+			    "(old=0x%lx - new=0x%lx)", vdc->instance,
+			    vdc->vdisk_size, attr_msg->vdisk_size);
+			status = EINVAL;
+			break;
+		}
+
 		vdc->vdisk_size = attr_msg->vdisk_size;
 		vdc->vdisk_type = attr_msg->vdisk_type;
 
@@ -4107,6 +4302,11 @@
 			break;
 		}
 
+		/*
+		 * Now that we have received all attributes we can create a
+		 * fake geometry for the disk.
+		 */
+		vdc_create_fake_geometry(vdc);
 		break;
 
 	case VIO_SUBTYPE_NACK:
@@ -4394,53 +4594,53 @@
  *	This function implements the DKIOCGAPART ioctl.
  *
  * Arguments:
- *	dev	- device
+ *	vdc	- soft state pointer
  *	arg	- a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure
  *	flag	- ioctl flags
  */
 static int
-vdc_dkio_get_partition(dev_t dev, caddr_t arg, int flag)
+vdc_dkio_get_partition(vdc_t *vdc, caddr_t arg, int flag)
 {
-	struct dk_geom geom;
-	struct vtoc vtoc;
+	struct dk_geom *geom;
+	struct vtoc *vtoc;
 	union {
 		struct dk_map map[NDKMAP];
 		struct dk_map32 map32[NDKMAP];
 	} data;
 	int i, rv, size;
 
-	rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL);
-	if (rv != 0)
+	mutex_enter(&vdc->lock);
+
+	if ((rv = vdc_validate_geometry(vdc)) != 0) {
+		mutex_exit(&vdc->lock);
 		return (rv);
-
-	rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc, FKIOCTL);
-	if (rv != 0)
-		return (rv);
-
-	if (vtoc.v_nparts != NDKMAP ||
-	    geom.dkg_nhead == 0 || geom.dkg_nsect == 0)
-		return (EINVAL);
+	}
+
+	vtoc = vdc->vtoc;
+	geom = vdc->geom;
 
 	if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) {
 
-		for (i = 0; i < NDKMAP; i++) {
-			data.map32[i].dkl_cylno = vtoc.v_part[i].p_start /
-			    (geom.dkg_nhead * geom.dkg_nsect);
-			data.map32[i].dkl_nblk = vtoc.v_part[i].p_size;
+		for (i = 0; i < vtoc->v_nparts; i++) {
+			data.map32[i].dkl_cylno = vtoc->v_part[i].p_start /
+			    (geom->dkg_nhead * geom->dkg_nsect);
+			data.map32[i].dkl_nblk = vtoc->v_part[i].p_size;
 		}
 		size = NDKMAP * sizeof (struct dk_map32);
 
 	} else {
 
-		for (i = 0; i < NDKMAP; i++) {
-			data.map[i].dkl_cylno = vtoc.v_part[i].p_start /
-			    (geom.dkg_nhead * geom.dkg_nsect);
-			data.map[i].dkl_nblk = vtoc.v_part[i].p_size;
+		for (i = 0; i < vtoc->v_nparts; i++) {
+			data.map[i].dkl_cylno = vtoc->v_part[i].p_start /
+			    (geom->dkg_nhead * geom->dkg_nsect);
+			data.map[i].dkl_nblk = vtoc->v_part[i].p_size;
 		}
 		size = NDKMAP * sizeof (struct dk_map);
 
 	}
 
+	mutex_exit(&vdc->lock);
+
 	if (ddi_copyout(&data, arg, size, flag) != 0)
 		return (EFAULT);
 
@@ -4612,7 +4812,6 @@
 	size_t		alloc_len = 0;		/* #bytes to allocate mem for */
 	caddr_t		mem_p = NULL;
 	size_t		nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0]));
-	struct vtoc	vtoc_saved;
 	vdc_dk_ioctl_t	*iop;
 
 	vdc = ddi_get_soft_state(vdc_state, instance);
@@ -4669,6 +4868,9 @@
 
 	case DIOCTL_RWCMD:
 		{
+			if (vdc->cinfo == NULL)
+				return (ENXIO);
+
 			if (vdc->cinfo->dki_ctype != DKC_DIRECT)
 				return (ENOTTY);
 
@@ -4677,10 +4879,7 @@
 
 	case DKIOCGAPART:
 		{
-			if (vdc->vdisk_label != VD_DISK_LABEL_VTOC)
-				return (ENOTSUP);
-
-			return (vdc_dkio_get_partition(dev, arg, mode));
+			return (vdc_dkio_get_partition(vdc, arg, mode));
 		}
 
 	case DKIOCINFO:
@@ -4771,6 +4970,7 @@
 				/* clean up if dispatch fails */
 				mutex_enter(&vdc->lock);
 				vdc->dkio_flush_pending--;
+				mutex_exit(&vdc->lock);
 				kmem_free(dkarg, sizeof (vdc_dk_arg_t));
 			}
 
@@ -4790,14 +4990,6 @@
 	if (alloc_len > 0)
 		mem_p = kmem_zalloc(alloc_len, KM_SLEEP);
 
-	if (cmd == DKIOCSVTOC) {
-		/*
-		 * Save a copy of the current VTOC so that we can roll back
-		 * if the setting of the new VTOC fails.
-		 */
-		bcopy(vdc->vtoc, &vtoc_saved, sizeof (struct vtoc));
-	}
-
 	/*
 	 * Call the conversion function for this ioctl which, if necessary,
 	 * converts from the Solaris format to the format ARC'ed
@@ -4820,6 +5012,15 @@
 	    VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode,
 	    VIO_both_dir);
 
+	if (cmd == DKIOCSVTOC || cmd == DKIOCSETEFI) {
+		/*
+		 * The disk label may have changed. Revalidate the disk
+		 * geometry. This will also update the device nodes and
+		 * properties.
+		 */
+		vdc_validate(vdc);
+	}
+
 	if (rv != 0) {
 		/*
 		 * This is not necessarily an error. The ioctl could
@@ -4831,58 +5032,9 @@
 		if (mem_p != NULL)
 			kmem_free(mem_p, alloc_len);
 
-		if (cmd == DKIOCSVTOC) {
-			/* update of the VTOC has failed, roll back */
-			bcopy(&vtoc_saved, vdc->vtoc, sizeof (struct vtoc));
-		}
-
 		return (rv);
 	}
 
-	if (cmd == DKIOCSVTOC) {
-		/*
-		 * The VTOC has been changed. We need to update the device
-		 * nodes to handle the case where an EFI label has been
-		 * changed to a VTOC label. We also try and update the device
-		 * node properties. Failing to set the properties should
-		 * not cause an error to be return the caller though.
-		 */
-		vdc->vdisk_label = VD_DISK_LABEL_VTOC;
-		(void) vdc_create_device_nodes_vtoc(vdc);
-
-		if (vdc_create_device_nodes_props(vdc)) {
-			DMSG(vdc, 0, "![%d] Failed to update device nodes"
-			    " properties", vdc->instance);
-		}
-
-	} else if (cmd == DKIOCSETEFI) {
-		/*
-		 * The EFI has been changed. We need to update the device
-		 * nodes to handle the case where a VTOC label has been
-		 * changed to an EFI label. We also try and update the device
-		 * node properties. Failing to set the properties should
-		 * not cause an error to be return the caller though.
-		 */
-		struct dk_gpt *efi;
-		size_t efi_len;
-
-		vdc->vdisk_label = VD_DISK_LABEL_EFI;
-		(void) vdc_create_device_nodes_efi(vdc);
-
-		rv = vdc_efi_alloc_and_read(dev, &efi, &efi_len);
-
-		if (rv == 0) {
-			vdc_store_efi(vdc, efi);
-			rv = vdc_create_device_nodes_props(vdc);
-			vd_efi_free(efi, efi_len);
-		}
-
-		if (rv) {
-			DMSG(vdc, 0, "![%d] Failed to update device nodes"
-			    " properties", vdc->instance);
-		}
-	}
-
 	/*
 	 * Call the conversion function (if it exists) for this ioctl
 	 * which converts from the format ARC'ed as part of the vDisk
@@ -5046,6 +5198,8 @@
 static int
 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
 {
+	_NOTE(ARGUNUSED(vdc))
+
 	void		*tmp_mem = NULL;
 	struct vtoc	vt;
 	struct vtoc	*vtp = &vt;
@@ -5078,12 +5232,6 @@
 		vtp = tmp_mem;
 	}
 
-	/*
-	 * The VTOC is being changed, then vdc needs to update the copy
-	 * it saved in the soft state structure.
-	 */
-	bcopy(vtp, vdc->vtoc, sizeof (struct vtoc));
-
 	VTOC2VD_VTOC(vtp, &vtvd);
 	bcopy(&vtvd, to, sizeof (vd_vtoc_t));
 	kmem_free(tmp_mem, copy_len);
@@ -5279,23 +5427,20 @@
  *	vdc	- soft state pointer for this instance of the device driver.
  *
  * Return Code:
- *	0	- Success
+ *	none.
  */
-static int
+static void
 vdc_create_fake_geometry(vdc_t *vdc)
 {
 	ASSERT(vdc != NULL);
-
-	/*
-	 * Check if max_xfer_sz and vdisk_size are valid
-	 */
-	if (vdc->vdisk_size == 0 || vdc->max_xfer_sz == 0)
-		return (EIO);
+	ASSERT(vdc->vdisk_size != 0);
+	ASSERT(vdc->max_xfer_sz != 0);
 
 	/*
 	 * DKIOCINFO support
 	 */
-	vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP);
+	if (vdc->cinfo == NULL)
+		vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP);
 
 	(void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME);
 	(void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME);
@@ -5329,59 +5474,65 @@
 	vdc->minfo->dki_media_type = DK_FIXED_DISK;
 	vdc->minfo->dki_capacity = vdc->vdisk_size;
 	vdc->minfo->dki_lbsize = DEV_BSIZE;
-
-	return (0);
+}
+
+static ushort_t
+vdc_lbl2cksum(struct dk_label *label)
+{
+	/* XOR together all the short words of the label but the last one */
+	ushort_t *word = (ushort_t *)label;
+	ushort_t cksum = 0;
+	int nwords, i;
+
+	nwords = (sizeof (struct dk_label)) / (sizeof (short)) - 1;
+	for (i = 0; i < nwords; i++) {
+		cksum ^= word[i];
+	}
+
+	return (cksum);
+}
 
 /*
  * Function:
- *	vdc_setup_disk_layout()
+ *	vdc_validate_geometry
  *
  * Description:
- *	This routine discovers all the necessary details about the "disk"
- *	by requesting the data that is available from the vDisk server and by
- *	faking up the rest of the data.
+ *	This routine discovers the label and geometry of the disk. It stores
+ *	the disk label and related information in the vdc structure. If it
+ *	fails to validate the geometry or to discover the disk label then
+ *	the label is marked as unknown (VD_DISK_LABEL_UNK).
  *
  * Arguments:
  *	vdc	- soft state pointer for this instance of the device driver.
  *
  * Return Code:
- *	0	- Success
+ *	0	- success.
+ *	EINVAL	- unknown disk label.
+ *	ENOTSUP	- geometry not applicable (EFI label).
+ *	EIO	- error accessing the disk.
  */
 static int
-vdc_setup_disk_layout(vdc_t *vdc)
+vdc_validate_geometry(vdc_t *vdc)
 {
 	buf_t	*buf;	/* BREAD requests need to be in a buf_t structure */
 	dev_t	dev;
-	int	slice = 0;
-	int	rv, error;
+	int	rv;
+	struct dk_label label;
+	struct dk_geom geom;
+	struct vtoc vtoc;
 
 	ASSERT(vdc != NULL);
-
-	if (vdc->vtoc == NULL)
-		vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
+	ASSERT(vdc->vtoc != NULL && vdc->geom != NULL);
+	ASSERT(MUTEX_HELD(&vdc->lock));
+
+	mutex_exit(&vdc->lock);
 
 	dev = makedevice(ddi_driver_major(vdc->dip),
 	    VD_MAKE_DEV(vdc->instance, 0));
-	rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)vdc->vtoc, FKIOCTL);
-
-	if (rv && rv != ENOTSUP) {
-		DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)",
-		    vdc->instance, rv);
-		return (rv);
-	}
-
-	/*
-	 * The process of attempting to read VTOC will initiate
-	 * the handshake and establish a connection. Following
-	 * handshake, go ahead and create geometry.
-	 */
-	error = vdc_create_fake_geometry(vdc);
-	if (error != 0) {
-		DMSG(vdc, 0, "[%d] Failed to create disk geometry (err%d)",
-		    vdc->instance, error);
-		return (error);
-	}
+
+	rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL);
+	if (rv == 0)
+		rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc, FKIOCTL);
 
 	if (rv == ENOTSUP) {
 		/*
@@ -5396,58 +5547,171 @@
 		if (rv) {
 			DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)",
 			    vdc->instance, rv);
-			return (rv);
+			mutex_enter(&vdc->lock);
+			vdc_store_label_unk(vdc);
+			return (EIO);
 		}
 
-		vdc->vdisk_label = VD_DISK_LABEL_EFI;
-		vdc_store_efi(vdc, efi);
+		mutex_enter(&vdc->lock);
+		vdc_store_label_efi(vdc, efi);
 		vd_efi_free(efi, efi_len);
-
-		return (0);
+		return (ENOTSUP);
 	}
 
-	vdc->vdisk_label = VD_DISK_LABEL_VTOC;
+	if (rv != 0) {
+		DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)",
+		    vdc->instance, rv);
+		mutex_enter(&vdc->lock);
+		vdc_store_label_unk(vdc);
+		if (rv != EINVAL)
+			rv = EIO;
+		return (rv);
+	}
+
+	/* check that geometry and vtoc are valid */
+	if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 ||
+	    vtoc.v_sanity != VTOC_SANE) {
+		mutex_enter(&vdc->lock);
+		vdc_store_label_unk(vdc);
+		return (EINVAL);
+	}
 
 	/*
-	 * FUTURE: This could be default way for reading the VTOC
-	 * from the disk as supposed to sending the VD_OP_GET_VTOC
-	 * to the server. Currently this is a sanity check.
+	 * We have a disk and a valid VTOC. However this does not mean
+	 * that the disk currently has a VTOC label. The returned VTOC may
+	 * be a default VTOC to be used for configuring the disk (this is
+	 * what is done for disk image). So we read the label from the
+	 * beginning of the disk to ensure we really have a VTOC label.
 	 *
-	 * find the slice that represents the entire "disk" and use that to
-	 * read the disk label. The convention in Solaris is that slice 2
-	 * represents the whole disk so we check that it is, otherwise we
-	 * default to slice 0
+	 * FUTURE: This could be the default way for reading the VTOC
+	 * from the disk as opposed to sending the VD_OP_GET_VTOC
+	 * to the server. This will be the default if vdc is implemented
+	 * on top of cmlb.
+	 */
+
+	/*
+	 * Single slice disk does not support read using an absolute disk
+	 * offset so we just rely on the DKIOCGVTOC ioctl in that case.
 	 */
-	if ((vdc->vdisk_type == VD_DISK_TYPE_DISK) &&
-	    (vdc->vtoc->v_part[2].p_tag == V_BACKUP)) {
-		slice = 2;
-	} else {
-		slice = 0;
+	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
+		mutex_enter(&vdc->lock);
+		if (vtoc.v_nparts != 1) {
+			vdc_store_label_unk(vdc);
+			return (EINVAL);
+		}
+		vdc_store_label_vtoc(vdc, &geom, &vtoc);
+		return (0);
+	}
+
+	if (vtoc.v_nparts != V_NUMPAR) {
+		mutex_enter(&vdc->lock);
+		vdc_store_label_unk(vdc);
+		return (EINVAL);
 	}
 
 	/*
 	 * Read disk label from start of disk
 	 */
-	vdc->label = kmem_zalloc(DK_LABEL_SIZE, KM_SLEEP);
 	buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
 	bioinit(buf);
-	buf->b_un.b_addr = (caddr_t)vdc->label;
+	buf->b_un.b_addr = (caddr_t)&label;
 	buf->b_bcount = DK_LABEL_SIZE;
 	buf->b_flags = B_BUSY | B_READ;
 	buf->b_dev = dev;
-	rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)vdc->label,
-	    DK_LABEL_SIZE, slice, 0, CB_STRATEGY, buf, VIO_read_dir);
+	rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label,
+	    DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir);
 	if (rv) {
 		DMSG(vdc, 1, "[%d] Failed to read disk block 0\n",
 		    vdc->instance);
-		kmem_free(buf, sizeof (buf_t));
-		return (rv);
+	} else {
+		rv = biowait(buf);
+		biofini(buf);
+	}
+	kmem_free(buf, sizeof (buf_t));
+
+	if (rv != 0 || label.dkl_magic != DKL_MAGIC ||
+	    label.dkl_cksum != vdc_lbl2cksum(&label)) {
+		DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n",
+		    vdc->instance);
+		mutex_enter(&vdc->lock);
+		vdc_store_label_unk(vdc);
+		return (EINVAL);
 	}
-	rv = biowait(buf);
-	biofini(buf);
-	kmem_free(buf, sizeof (buf_t));
-
-	return (rv);
+
+	mutex_enter(&vdc->lock);
+	vdc_store_label_vtoc(vdc, &geom, &vtoc);
+	return (0);
+}
+
+/*
+ * Function:
+ *	vdc_validate
+ *
+ * Description:
+ *	This routine discovers the label of the disk and creates the
+ *	appropriate device nodes if the label has changed.
+ *
+ * Arguments:
+ *	vdc	- soft state pointer for this instance of the device driver.
+ *
+ * Return Code:
+ *	none.
+ */
+static void
+vdc_validate(vdc_t *vdc)
+{
+	vd_disk_label_t old_label;	/* label type before revalidation */
+	struct vtoc old_vtoc;		/* vtoc contents before revalidation */
+	int rv;
+
+	ASSERT(!MUTEX_HELD(&vdc->lock));
+
+	mutex_enter(&vdc->lock);
+
+	/* snapshot the label and vtoc so that changes can be detected */
+	old_label = vdc->vdisk_label;
+	bcopy(vdc->vtoc, &old_vtoc, sizeof (struct vtoc));
+
+	/* return code ignored: the outcome is read back from the soft state */
+	(void) vdc_validate_geometry(vdc);	/* temporarily drops vdc->lock */
+
+	/* if the disk label has changed, update device nodes */
+	if (vdc->vdisk_label != old_label) {
+		/* any label other than EFI (VTOC or unknown) gets VTOC nodes */
+		if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
+			rv = vdc_create_device_nodes_efi(vdc);
+		else
+			rv = vdc_create_device_nodes_vtoc(vdc);
+
+		if (rv != 0) {
+			DMSG(vdc, 0, "![%d] Failed to update device nodes",
+			    vdc->instance);
+		}
+	}
+
+	/* if the vtoc has changed, update device nodes properties */
+	if (bcmp(vdc->vtoc, &old_vtoc, sizeof (struct vtoc)) != 0) {
+		/* a failure here is only logged, it is not reported */
+		if (vdc_create_device_nodes_props(vdc) != 0) {
+			DMSG(vdc, 0, "![%d] Failed to update device nodes"
+			    " properties", vdc->instance);
+		}
+	}
+
+	mutex_exit(&vdc->lock);
+}
+
+static void
+vdc_validate_task(void *arg)
+{
+	vdc_t *softc = arg;
+
+	vdc_validate(softc);
+	/* this request is done, vdc_open() may dispatch a new one */
+	mutex_enter(&softc->lock);
+	ASSERT(softc->validate_pending > 0);
+	softc->validate_pending -= 1;
+	mutex_exit(&softc->lock);
+}
 
 /*
@@ -5553,10 +5817,14 @@
 }
 
 static void
-vdc_store_efi(vdc_t *vdc, struct dk_gpt *efi)
+vdc_store_label_efi(vdc_t *vdc, struct dk_gpt *efi)
 {
 	struct vtoc *vtoc = vdc->vtoc;
 
+	ASSERT(MUTEX_HELD(&vdc->lock));
+
+	vdc->vdisk_label = VD_DISK_LABEL_EFI;
+	bzero(vdc->geom, sizeof (struct dk_geom));
 	vd_efi_to_vtoc(efi, vtoc);
 	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
 		/*
@@ -5573,3 +5841,23 @@
 		vtoc->v_part[0].p_size =  vtoc->v_part[VD_EFI_WD_SLICE].p_size;
 	}
 }
+
+static void
+vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc)
+{
+	ASSERT(MUTEX_HELD(&vdc->lock));
+	/* cache the geometry and the vtoc, then flag the label as VTOC */
+	bcopy(geom, vdc->geom, sizeof (*geom));
+	bcopy(vtoc, vdc->vtoc, sizeof (*vtoc));
+	vdc->vdisk_label = VD_DISK_LABEL_VTOC;
+}
+
+static void
+vdc_store_label_unk(vdc_t *vdc)
+{
+	ASSERT(MUTEX_HELD(&vdc->lock));
+	/* no usable label: forget the cached geometry and vtoc */
+	bzero(vdc->geom, sizeof (*vdc->geom));
+	bzero(vdc->vtoc, sizeof (*vdc->vtoc));
+	vdc->vdisk_label = VD_DISK_LABEL_UNK;
+}
--- a/usr/src/uts/sun4v/io/vds.c	Thu Aug 30 01:17:19 2007 -0700
+++ b/usr/src/uts/sun4v/io/vds.c	Thu Aug 30 07:43:53 2007 -0700
@@ -398,7 +398,8 @@
 static void vd_free_dring_task(vd_t *vdp);
 static int vd_setup_vd(vd_t *vd);
 static boolean_t vd_enabled(vd_t *vd);
-
+static ushort_t vd_lbl2cksum(struct dk_label *label);
+static int vd_file_validate_geometry(vd_t *vd);
 /*
  * Function:
  *	vd_file_rw
@@ -439,6 +440,14 @@
 		offset = blk * DEV_BSIZE;
 	} else {
 		ASSERT(slice >= 0 && slice < V_NUMPAR);
+
+		if (vd->vdisk_label == VD_DISK_LABEL_UNK &&
+		    vd_file_validate_geometry(vd) != 0) {
+			PR0("Unknown disk label, can't do I/O from slice %d",
+			    slice);
+			return (-1);
+		}
+
 		if (blk >= vd->vtoc.v_part[slice].p_size) {
 			/* address past the end of the slice */
 			PR0("req_addr (0x%lx) > psize (0x%lx)",
@@ -520,6 +529,116 @@
 
 /*
  * Function:
+ *	vd_file_build_default_label
+ *
+ * Description:
+ *	Return a default label for the given disk. This is used when the disk
+ *	does not have a valid VTOC so that the user can get a valid default
+ *	configuration. The default label has all slice sizes set to 0 (except
+ *	slice 2 which is the entire disk) to force the user to write a valid
+ *	label onto the disk image.
+ *
+ * Parameters:
+ *	vd		- disk on which the operation is performed.
+ *	label		- the returned default label.
+ *
+ * Return Code:
+ *	none.
+ */
+static void
+vd_file_build_default_label(vd_t *vd, struct dk_label *label)
+{
+	size_t size;
+	char prefix;
+
+	ASSERT(vd->file);
+
+	/*
+	 * Start clean: the caller's buffer may hold an invalid on-disk label.
+	 */
+	bzero(label, sizeof (struct dk_label));
+
+	/*
+	 * We must have a reasonable number of cylinders and sectors so
+	 * that newfs can run using default values.
+	 *
+	 * if (disk_size < 2MB)
+	 * 	phys_cylinders = disk_size / 100K
+	 * else
+	 * 	phys_cylinders = disk_size / 300K
+	 *
+	 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders
+	 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0;
+	 * data_cylinders = phys_cylinders - alt_cylinders
+	 *
+	 * sectors = disk_size / (phys_cylinders * blk_size)
+	 *
+	 * The file size test is an attempt to not have too few cylinders
+	 * for a small file, or so many on a big file that you waste space
+	 * for backup superblocks or cylinder group structures.
+	 */
+	if (vd->file_size < (2 * 1024 * 1024))
+		label->dkl_pcyl = vd->file_size / (100 * 1024);
+	else
+		label->dkl_pcyl = vd->file_size / (300 * 1024);
+
+	if (label->dkl_pcyl == 0)
+		label->dkl_pcyl = 1;
+
+	label->dkl_acyl = (label->dkl_pcyl > 2) ? 2 : 0;
+
+	label->dkl_nsect = vd->file_size / (DEV_BSIZE * label->dkl_pcyl);
+	label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl;
+	label->dkl_nhead = 1;
+	label->dkl_write_reinstruct = 0;
+	label->dkl_read_reinstruct = 0;
+	label->dkl_rpm = 7200;
+	label->dkl_apc = 0;
+	label->dkl_intrlv = 0;
+
+	PR0("requested disk size: %ld bytes\n", vd->file_size);
+	PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl,
+	    label->dkl_nhead, label->dkl_nsect);
+	PR0("provided disk size: %ld bytes\n", (uint64_t)label->dkl_pcyl *
+	    label->dkl_nhead * label->dkl_nsect * DEV_BSIZE);
+
+	if (vd->file_size < (1ULL << 20)) {
+		size = vd->file_size >> 10;
+		prefix = 'K'; /* Kilobyte */
+	} else if (vd->file_size < (1ULL << 30)) {
+		size = vd->file_size >> 20;
+		prefix = 'M'; /* Megabyte */
+	} else if (vd->file_size < (1ULL << 40)) {
+		size = vd->file_size >> 30;
+		prefix = 'G'; /* Gigabyte */
+	} else {
+		size = vd->file_size >> 40;
+		prefix = 'T'; /* Terabyte */
+	}
+
+	/*
+	 * We must have a correct label name otherwise format(1m) will
+	 * not recognize the disk as labeled.
+	 */
+	(void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII,
+	    "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d",
+	    size, prefix,
+	    label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead,
+	    label->dkl_nsect);
+
+	/* default VTOC */
+	label->dkl_vtoc.v_version = V_VERSION;
+	label->dkl_vtoc.v_nparts = V_NUMPAR;
+	label->dkl_vtoc.v_sanity = VTOC_SANE;
+	label->dkl_vtoc.v_part[2].p_tag = V_BACKUP;
+	label->dkl_map[2].dkl_cylno = 0;
+	label->dkl_map[2].dkl_nblk = label->dkl_ncyl *
+	    label->dkl_nhead * label->dkl_nsect;
+	label->dkl_cksum = vd_lbl2cksum(label);
+}
+
+/*
+ * Function:
  *	vd_file_set_vtoc
  *
  * Description:
@@ -1451,38 +1570,34 @@
 	kmem_free(dk_efi->dki_data, vd_efi->length);
 }
 
-static int
-vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc, vd_disk_label_t *label)
+static vd_disk_label_t
+vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc)
 {
 	int status, rval;
 	struct dk_gpt *efi;
 	size_t efi_len;
 
-	*label = VD_DISK_LABEL_UNK;
-
 	status = ldi_ioctl(handle, DKIOCGVTOC, (intptr_t)vtoc,
 	    (vd_open_flags | FKIOCTL), kcred, &rval);
 
 	if (status == 0) {
-		*label = VD_DISK_LABEL_VTOC;
-		return (0);
+		return (VD_DISK_LABEL_VTOC);
 	} else if (status != ENOTSUP) {
 		PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status);
-		return (status);
+		return (VD_DISK_LABEL_UNK);
 	}
 
 	status = vds_efi_alloc_and_read(handle, &efi, &efi_len);
 
 	if (status) {
 		PR0("vds_efi_alloc_and_read returned error %d", status);
-		return (status);
+		return (VD_DISK_LABEL_UNK);
 	}
 
-	*label = VD_DISK_LABEL_EFI;
 	vd_efi_to_vtoc(efi, vtoc);
 	vd_efi_free(efi, efi_len);
 
-	return (0);
+	return (VD_DISK_LABEL_EFI);
 }
 
 static ushort_t
@@ -1556,6 +1671,102 @@
 }
 
 /*
+ * Function:
+ *	vd_file_validate_geometry
+ *
+ * Description:
+ *	Re-read the label stored in a disk image and refresh the label type,
+ *	geometry and vtoc cached in the vd structure from it.
+ *
+ *	When the image does not carry a valid label, the cached label type
+ *	is set to unknown and EINVAL is returned; the cached geometry and
+ *	vtoc are still filled in, from a default label built for the image.
+ *
+ * Parameters:
+ *	vd	- disk on which the operation is performed.
+ *
+ * Return Code:
+ *	0	- success.
+ *	EIO	- error reading the label from the disk image.
+ *	EINVAL	- unknown disk label.
+ */
+static int
+vd_file_validate_geometry(vd_t *vd)
+{
+	struct dk_geom *dkg = &vd->dk_geom;
+	struct vtoc *vt = &vd->vtoc;
+	struct dk_label lbl;
+	int idx, rv;
+
+	ASSERT(vd->file);
+
+	if (VD_FILE_LABEL_READ(vd, &lbl) < 0)
+		return (EIO);
+
+	/*
+	 * A label is accepted only if its magic number, checksum, vtoc
+	 * sanity value and partition count all match what we expect.
+	 */
+	if (lbl.dkl_magic == DKL_MAGIC &&
+	    lbl.dkl_cksum == vd_lbl2cksum(&lbl) &&
+	    lbl.dkl_vtoc.v_sanity == VTOC_SANE &&
+	    lbl.dkl_vtoc.v_nparts == V_NUMPAR) {
+		vd->vdisk_label = VD_DISK_LABEL_VTOC;
+		rv = 0;
+	} else {
+		/* No usable label: fall back to a default one */
+		vd->vdisk_label = VD_DISK_LABEL_UNK;
+		vd_file_build_default_label(vd, &lbl);
+		rv = EINVAL;
+	}
+
+	/* Refresh the cached geometry from the label */
+	bzero(dkg, sizeof (*dkg));
+
+	dkg->dkg_pcyl = lbl.dkl_pcyl;
+	dkg->dkg_ncyl = lbl.dkl_ncyl;
+	dkg->dkg_acyl = lbl.dkl_acyl;
+	dkg->dkg_nhead = lbl.dkl_nhead;
+	dkg->dkg_nsect = lbl.dkl_nsect;
+	dkg->dkg_rpm = lbl.dkl_rpm;
+	dkg->dkg_apc = lbl.dkl_apc;
+	dkg->dkg_intrlv = lbl.dkl_intrlv;
+	dkg->dkg_read_reinstruct = lbl.dkl_read_reinstruct;
+	dkg->dkg_write_reinstruct = lbl.dkl_write_reinstruct;
+
+	/* Refresh the cached vtoc from the label */
+	bzero(vt, sizeof (*vt));
+
+	vt->v_sectorsz = DEV_BSIZE;
+	vt->v_version = lbl.dkl_vtoc.v_version;
+	vt->v_sanity = lbl.dkl_vtoc.v_sanity;
+	vt->v_nparts = lbl.dkl_vtoc.v_nparts;
+
+	for (idx = 0; idx < vt->v_nparts; idx++) {
+		vt->timestamp[idx] = lbl.dkl_vtoc.v_timestamp[idx];
+		vt->v_part[idx].p_tag = lbl.dkl_vtoc.v_part[idx].p_tag;
+		vt->v_part[idx].p_flag = lbl.dkl_vtoc.v_part[idx].p_flag;
+		vt->v_part[idx].p_size = lbl.dkl_map[idx].dkl_nblk;
+		/* p_start is derived from the label's cylinder number */
+		vt->v_part[idx].p_start = lbl.dkl_map[idx].dkl_cylno *
+		    (lbl.dkl_nhead * lbl.dkl_nsect);
+	}
+
+	/*
+	 * bcopy() can not be used for the bootinfo array: its elements
+	 * are of type long in vtoc (so 64-bit) but of type int in
+	 * dk_vtoc (so 32-bit), so they are converted one by one.
+	 */
+	for (idx = 0; idx < 3; idx++)
+		vt->v_bootinfo[idx] = lbl.dkl_vtoc.v_bootinfo[idx];
+
+	bcopy(lbl.dkl_vtoc.v_volume, vt->v_volume, LEN_DKL_VVOL);
+	bcopy(lbl.dkl_asciilabel, vt->v_asciilabel, LEN_DKL_ASCII);
+
+	return (rv);
+}
+
+/*
  * Handle ioctls to a disk image (file-based).
  *
  * Return Values
@@ -1571,7 +1782,6 @@
 	int i, rc;
 
 	ASSERT(vd->file);
-	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
 
 	switch (cmd) {
 
@@ -1579,70 +1789,22 @@
 		ASSERT(ioctl_arg != NULL);
 		geom = (struct dk_geom *)ioctl_arg;
 
-		if (VD_FILE_LABEL_READ(vd, &label) < 0)
-			return (EIO);
-
-		if (label.dkl_magic != DKL_MAGIC ||
-		    label.dkl_cksum != vd_lbl2cksum(&label))
-			return (EINVAL);
-
-		bzero(geom, sizeof (struct dk_geom));
-		geom->dkg_ncyl = label.dkl_ncyl;
-		geom->dkg_acyl = label.dkl_acyl;
-		geom->dkg_nhead = label.dkl_nhead;
-		geom->dkg_nsect = label.dkl_nsect;
-		geom->dkg_intrlv = label.dkl_intrlv;
-		geom->dkg_apc = label.dkl_apc;
-		geom->dkg_rpm = label.dkl_rpm;
-		geom->dkg_pcyl = label.dkl_pcyl;
-		geom->dkg_write_reinstruct = label.dkl_write_reinstruct;
-		geom->dkg_read_reinstruct = label.dkl_read_reinstruct;
-
+		rc = vd_file_validate_geometry(vd);
+		if (rc != 0 && rc != EINVAL)
+			return (rc);
+
+		bcopy(&vd->dk_geom, geom, sizeof (struct dk_geom));
 		return (0);
 
 	case DKIOCGVTOC:
 		ASSERT(ioctl_arg != NULL);
 		vtoc = (struct vtoc *)ioctl_arg;
 
-		if (VD_FILE_LABEL_READ(vd, &label) < 0)
-			return (EIO);
-
-		if (label.dkl_magic != DKL_MAGIC ||
-		    label.dkl_cksum != vd_lbl2cksum(&label))
-			return (EINVAL);
-
-		bzero(vtoc, sizeof (struct vtoc));
-
-		vtoc->v_sanity = label.dkl_vtoc.v_sanity;
-		vtoc->v_version = label.dkl_vtoc.v_version;
-		vtoc->v_sectorsz = DEV_BSIZE;
-		vtoc->v_nparts = label.dkl_vtoc.v_nparts;
-
-		for (i = 0; i < vtoc->v_nparts; i++) {
-			vtoc->v_part[i].p_tag =
-			    label.dkl_vtoc.v_part[i].p_tag;
-			vtoc->v_part[i].p_flag =
-			    label.dkl_vtoc.v_part[i].p_flag;
-			vtoc->v_part[i].p_start =
-			    label.dkl_map[i].dkl_cylno *
-			    (label.dkl_nhead * label.dkl_nsect);
-			vtoc->v_part[i].p_size = label.dkl_map[i].dkl_nblk;
-			vtoc->timestamp[i] =
-			    label.dkl_vtoc.v_timestamp[i];
-		}
-		/*
-		 * The bootinfo array can not be copied with bcopy() because
-		 * elements are of type long in vtoc (so 64-bit) and of type
-		 * int in dk_vtoc (so 32-bit).
-		 */
-		vtoc->v_bootinfo[0] = label.dkl_vtoc.v_bootinfo[0];
-		vtoc->v_bootinfo[1] = label.dkl_vtoc.v_bootinfo[1];
-		vtoc->v_bootinfo[2] = label.dkl_vtoc.v_bootinfo[2];
-		bcopy(label.dkl_asciilabel, vtoc->v_asciilabel,
-		    LEN_DKL_ASCII);
-		bcopy(label.dkl_vtoc.v_volume, vtoc->v_volume,
-		    LEN_DKL_VVOL);
-
+		rc = vd_file_validate_geometry(vd);
+		if (rc != 0 && rc != EINVAL)
+			return (rc);
+
+		bcopy(&vd->vtoc, vtoc, sizeof (struct vtoc));
 		return (0);
 
 	case DKIOCSGEOM:
@@ -1721,8 +1883,9 @@
 		if ((rc = vd_file_set_vtoc(vd, &label)) != 0)
 			return (rc);
 
-		/* update the cached vdisk VTOC */
-		bcopy(vtoc, &vd->vtoc, sizeof (vd->vtoc));
+		/* check the geometry and update the driver info */
+		if ((rc = vd_file_validate_geometry(vd)) != 0)
+			return (rc);
 
 		/*
 		 * The disk geometry may have changed, so we need to write
@@ -2007,6 +2170,8 @@
 
 	bcopy(devid->did_id, vd_devid->id, len);
 
+	request->status = 0;
+
 	/* LDC memory operations require 8-byte multiples */
 	ASSERT(request->nbytes % sizeof (uint64_t) == 0);
 
@@ -3376,6 +3541,51 @@
 }
 
 static int
+vd_setup_partition_vtoc(vd_t *vd)
+{
+	int rval, status;
+	char *device_path = vd->device_path;
+	struct dk_geom *geom = &vd->dk_geom;
+
+	/*
+	 * Set up the geometry and vtoc of a single slice from a VTOC disk.
+	 * Returns 0, the DKIOCGGEOM errno, or EIO if nsect or nhead is 0.
+	 */
+	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM, (intptr_t)geom,
+	    (vd_open_flags | FKIOCTL), kcred, &rval)) != 0) {
+		PRN("ldi_ioctl(DKIOCGGEOM) returned errno %d for %s",
+		    status, device_path);
+		return (status);
+	}
+
+	/* Initialize dk_geom structure for single-slice device */
+	if (geom->dkg_nsect == 0) {
+		PRN("%s geometry claims 0 sectors per track", device_path);
+		return (EIO);
+	}
+	if (geom->dkg_nhead == 0) {
+		PRN("%s geometry claims 0 heads", device_path);
+		return (EIO);
+	}
+	geom->dkg_ncyl = vd->vdisk_size / geom->dkg_nsect / geom->dkg_nhead;
+	geom->dkg_acyl = 0;
+	geom->dkg_pcyl = geom->dkg_ncyl + geom->dkg_acyl;
+
+	/* Initialize vtoc structure for single-slice device */
+	bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
+	    MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
+	bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
+	vd->vtoc.v_nparts = 1;
+	vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
+	vd->vtoc.v_part[0].p_flag = 0;
+	vd->vtoc.v_part[0].p_start = 0;
+	vd->vtoc.v_part[0].p_size = vd->vdisk_size;
+	bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
+	    MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel)));
+	return (0);
+}
+
+static int
 vd_setup_partition_efi(vd_t *vd)
 {
 	efi_gpt_t *gpt;
@@ -3417,17 +3627,13 @@
 static int
 vd_setup_file(vd_t *vd)
 {
-	int 		i, rval, status;
-	ushort_t	sum;
+	int 		rval, status;
 	vattr_t		vattr;
 	dev_t		dev;
-	size_t		size;
 	char		*file_path = vd->device_path;
 	char		dev_path[MAXPATHLEN + 1];
-	char		prefix;
 	ldi_handle_t	lhandle;
 	struct dk_cinfo	dk_cinfo;
-	struct dk_label label;
 
 	/* make sure the file is valid */
 	if ((status = lookupname(file_path, UIO_SYSSPACE, FOLLOW,
@@ -3475,118 +3681,17 @@
 		return (EIO);
 	}
 
-	/* read label from file */
-	if (VD_FILE_LABEL_READ(vd, &label) < 0) {
-		PRN("Can't read label from %s", file_path);
+	/* find and validate the geometry of the disk image */
+	status = vd_file_validate_geometry(vd);
+	if (status != 0 && status != EINVAL) {
+		PRN("Fail to read label from %s", file_path);
 		return (EIO);
 	}
 
-	/* label checksum */
-	sum = vd_lbl2cksum(&label);
-
-	if (label.dkl_magic != DKL_MAGIC || label.dkl_cksum != sum) {
-		PR0("%s has an invalid disk label "
-		    "(magic=%x cksum=%x (expect %x))",
-		    file_path, label.dkl_magic, label.dkl_cksum, sum);
-
-		/* default label */
-		bzero(&label, sizeof (struct dk_label));
-
-		/*
-		 * We must have a resonable number of cylinders and sectors so
-		 * that newfs can run using default values.
-		 *
-		 * if (disk_size < 2MB)
-		 * 	phys_cylinders = disk_size / 100K
-		 * else
-		 * 	phys_cylinders = disk_size / 300K
-		 *
-		 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders
-		 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0;
-		 * data_cylinders = phys_cylinders - alt_cylinders
-		 *
-		 * sectors = disk_size / (phys_cylinders * blk_size)
-		 */
-		if (vd->file_size < (2 * 1024 * 1024))
-			label.dkl_pcyl = vd->file_size / (100 * 1024);
-		else
-			label.dkl_pcyl = vd->file_size / (300 * 1024);
-
-		if (label.dkl_pcyl == 0)
-			label.dkl_pcyl = 1;
-
-		if (label.dkl_pcyl > 2)
-			label.dkl_acyl = 2;
-		else
-			label.dkl_acyl = 0;
-
-		label.dkl_nsect = vd->file_size /
-		    (DEV_BSIZE * label.dkl_pcyl);
-		label.dkl_ncyl = label.dkl_pcyl - label.dkl_acyl;
-		label.dkl_nhead = 1;
-		label.dkl_write_reinstruct = 0;
-		label.dkl_read_reinstruct = 0;
-		label.dkl_rpm = 7200;
-		label.dkl_apc = 0;
-		label.dkl_intrlv = 0;
-		label.dkl_magic = DKL_MAGIC;
-
-		PR0("requested disk size: %ld bytes\n", vd->file_size);
-		PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label.dkl_pcyl,
-		    label.dkl_nhead, label.dkl_nsect);
-		PR0("provided disk size: %ld bytes\n", (uint64_t)
-		    (label.dkl_pcyl *
-		    label.dkl_nhead * label.dkl_nsect * DEV_BSIZE));
-
-		if (vd->file_size < (1ULL << 20)) {
-			size = vd->file_size >> 10;
-			prefix = 'K'; /* Kilobyte */
-		} else if (vd->file_size < (1ULL << 30)) {
-			size = vd->file_size >> 20;
-			prefix = 'M'; /* Megabyte */
-		} else if (vd->file_size < (1ULL << 40)) {
-			size = vd->file_size >> 30;
-			prefix = 'G'; /* Gigabyte */
-		} else {
-			size = vd->file_size >> 40;
-			prefix = 'T'; /* Terabyte */
-		}
-
-		/*
-		 * We must have a correct label name otherwise format(1m) will
-		 * not recognized the disk as labeled.
-		 */
-		(void) snprintf(label.dkl_asciilabel, LEN_DKL_ASCII,
-		    "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d",
-		    size, prefix,
-		    label.dkl_ncyl, label.dkl_acyl, label.dkl_nhead,
-		    label.dkl_nsect);
-
-		/* default VTOC */
-		label.dkl_vtoc.v_version = V_VERSION;
-		label.dkl_vtoc.v_nparts = V_NUMPAR;
-		label.dkl_vtoc.v_sanity = VTOC_SANE;
-		label.dkl_vtoc.v_part[2].p_tag = V_BACKUP;
-		label.dkl_map[2].dkl_cylno = 0;
-		label.dkl_map[2].dkl_nblk = label.dkl_ncyl *
-		    label.dkl_nhead * label.dkl_nsect;
-		label.dkl_map[0] = label.dkl_map[2];
-		label.dkl_map[0] = label.dkl_map[2];
-		label.dkl_cksum = vd_lbl2cksum(&label);
-
-		/* write default label to file */
-		if ((rval = vd_file_set_vtoc(vd, &label)) != 0) {
-			PRN("Can't write label to %s", file_path);
-			return (rval);
-		}
-	}
-
-	vd->nslices = label.dkl_vtoc.v_nparts;
-
+	vd->nslices = V_NUMPAR;
 	/* sector size = block size = DEV_BSIZE */
 	vd->vdisk_size = vd->file_size / DEV_BSIZE;
 	vd->vdisk_type = VD_DISK_TYPE_DISK;
-	vd->vdisk_label = VD_DISK_LABEL_VTOC;
 	vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */
 
 	/* Get max_xfer_sz from the device where the file is */
@@ -3621,55 +3726,27 @@
 	PR0("using file %s, dev %s, max_xfer = %u blks",
 	    file_path, dev_path, vd->max_xfer_sz);
 
-	vd->dk_geom.dkg_ncyl = label.dkl_ncyl;
-	vd->dk_geom.dkg_acyl = label.dkl_acyl;
-	vd->dk_geom.dkg_pcyl = label.dkl_pcyl;
-	vd->dk_geom.dkg_nhead = label.dkl_nhead;
-	vd->dk_geom.dkg_nsect = label.dkl_nsect;
-	vd->dk_geom.dkg_intrlv = label.dkl_intrlv;
-	vd->dk_geom.dkg_apc = label.dkl_apc;
-	vd->dk_geom.dkg_rpm = label.dkl_rpm;
-	vd->dk_geom.dkg_write_reinstruct = label.dkl_write_reinstruct;
-	vd->dk_geom.dkg_read_reinstruct = label.dkl_read_reinstruct;
-
-	vd->vtoc.v_sanity = label.dkl_vtoc.v_sanity;
-	vd->vtoc.v_version = label.dkl_vtoc.v_version;
-	vd->vtoc.v_sectorsz = DEV_BSIZE;
-	vd->vtoc.v_nparts = label.dkl_vtoc.v_nparts;
-
-	bcopy(label.dkl_vtoc.v_volume, vd->vtoc.v_volume,
-	    LEN_DKL_VVOL);
-	bcopy(label.dkl_asciilabel, vd->vtoc.v_asciilabel,
-	    LEN_DKL_ASCII);
-
-	for (i = 0; i < vd->nslices; i++) {
-		vd->vtoc.timestamp[i] = label.dkl_vtoc.v_timestamp[i];
-		vd->vtoc.v_part[i].p_tag = label.dkl_vtoc.v_part[i].p_tag;
-		vd->vtoc.v_part[i].p_flag = label.dkl_vtoc.v_part[i].p_flag;
-		vd->vtoc.v_part[i].p_start = label.dkl_map[i].dkl_cylno *
-		    label.dkl_nhead * label.dkl_nsect;
-		vd->vtoc.v_part[i].p_size = label.dkl_map[i].dkl_nblk;
-		vd->ldi_handle[i] = NULL;
-		vd->dev[i] = NULL;
-	}
-
 	/* Setup devid for the disk image */
 
-	status = vd_file_read_devid(vd, &vd->file_devid);
-
-	if (status == 0) {
-		/* a valid devid was found */
-		return (0);
-	}
-
-	if (status != EINVAL) {
-		/*
-		 * There was an error while trying to read the devid. So this
-		 * disk image may have a devid but we are unable to read it.
-		 */
-		PR0("can not read devid for %s", file_path);
-		vd->file_devid = NULL;
-		return (0);
+	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
+
+		status = vd_file_read_devid(vd, &vd->file_devid);
+
+		if (status == 0) {
+			/* a valid devid was found */
+			return (0);
+		}
+
+		if (status != EINVAL) {
+			/*
+			 * There was an error while trying to read the devid.
+			 * So this disk image may have a devid but we are
+			 * unable to read it.
+			 */
+			PR0("can not read devid for %s", file_path);
+			vd->file_devid = NULL;
+			return (0);
+		}
 	}
 
 	/*
@@ -3686,11 +3763,17 @@
 		return (0);
 	}
 
-	/* write devid to the disk image */
-	if (vd_file_write_devid(vd, vd->file_devid) != 0) {
-		PR0("fail to write devid for %s", file_path);
-		ddi_devid_free(vd->file_devid);
-		vd->file_devid = NULL;
+	/*
+	 * Write devid to the disk image. The devid is stored into the disk
+	 * image if we have a valid label; otherwise the devid will be stored
+	 * when the user writes a valid label.
+	 */
+	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
+		if (vd_file_write_devid(vd, vd->file_devid) != 0) {
+			PR0("fail to write devid for %s", file_path);
+			ddi_devid_free(vd->file_devid);
+			vd->file_devid = NULL;
+		}
 	}
 
 	return (0);
@@ -3720,7 +3803,7 @@
 			PRN("Cannot use device/file (%s), errno=%d\n",
 			    device_path, status);
 			if (status == ENXIO || status == ENODEV ||
-			    status == ENOENT) {
+			    status == ENOENT || status == EROFS) {
 				return (EAGAIN);
 			}
 		}
@@ -3746,7 +3829,7 @@
 	}
 	vd->vdisk_size = lbtodb(vd->vdisk_size);	/* convert to blocks */
 
-	/* Verify backing device supports dk_cinfo, dk_geom, and vtoc */
+	/* Verify backing device supports dk_cinfo */
 	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
 	    (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred,
 	    &rval)) != 0) {
@@ -3760,22 +3843,7 @@
 		return (EIO);
 	}
 
-	status = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, &vd->vdisk_label);
-
-	if (status != 0) {
-		PRN("vd_read_vtoc returned errno %d for %s",
-		    status, device_path);
-		return (status);
-	}
-
-	if (vd->vdisk_label == VD_DISK_LABEL_VTOC &&
-	    (status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM,
-	    (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL),
-	    kcred, &rval)) != 0) {
-		PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s",
-		    status, device_path);
-		return (status);
-	}
+	vd->vdisk_label = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc);
 
 	/* Store the device's max transfer size for return to the client */
 	vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
@@ -3789,6 +3857,15 @@
 	vd->pseudo = is_pseudo_device(dip);
 	ddi_release_devi(dip);
 	if (vd->pseudo) {
+		/*
+		 * Currently we only support exporting pseudo devices which
+		 * provide a valid disk label.
+		 */
+		if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
+			PRN("%s is a pseudo device with an invalid disk "
+			    "label\n", device_path);
+			return (EINVAL);
+		}
 		vd->vdisk_type	= VD_DISK_TYPE_SLICE;
 		vd->nslices	= 1;
 		return (0);	/* ...and we're done */
@@ -3798,45 +3875,27 @@
 	if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE)
 		return (vd_setup_full_disk(vd));
 
+	/* We can only export a slice if the disk has a valid label */
+	if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
+		PRN("%s is a slice from a disk with an unknown disk label\n",
+		    device_path);
+		return (EINVAL);
+	}
 
 	/* Otherwise, we have a non-entire slice of a device */
 	vd->vdisk_type	= VD_DISK_TYPE_SLICE;
 	vd->nslices	= 1;
 
 	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
+		/* Slice from a disk with an EFI label */
 		status = vd_setup_partition_efi(vd);
-		return (status);
-	}
-
-	/* Initialize dk_geom structure for single-slice device */
-	if (vd->dk_geom.dkg_nsect == 0) {
-		PRN("%s geometry claims 0 sectors per track", device_path);
-		return (EIO);
-	}
-	if (vd->dk_geom.dkg_nhead == 0) {
-		PRN("%s geometry claims 0 heads", device_path);
-		return (EIO);
+	} else {
+		/* Slice from a disk with a VTOC label */
+		ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
+		status = vd_setup_partition_vtoc(vd);
 	}
-	vd->dk_geom.dkg_ncyl =
-	    vd->vdisk_size/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead;
-	vd->dk_geom.dkg_acyl = 0;
-	vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl;
-
-
-	/* Initialize vtoc structure for single-slice device */
-	bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
-	    MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
-	bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
-	vd->vtoc.v_nparts = 1;
-	vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
-	vd->vtoc.v_part[0].p_flag = 0;
-	vd->vtoc.v_part[0].p_start = 0;
-	vd->vtoc.v_part[0].p_size = vd->vdisk_size;
-	bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
-	    MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel)));
-
-
-	return (0);
+
+	return (status);
 }
 
 static int
--- a/usr/src/uts/sun4v/sys/vdc.h	Thu Aug 30 01:17:19 2007 -0700
+++ b/usr/src/uts/sun4v/sys/vdc.h	Thu Aug 30 07:43:53 2007 -0700
@@ -245,8 +245,14 @@
 	vdc_lc_state_t	lifecycle;	/* Current state of the vdc instance */
 
 	int		hshake_cnt;	/* number of failed handshakes */
-	int		open_count;	/* count of outstanding opens */
+	uint8_t		open[OTYPCNT];	/* mask of opened slices */
+	uint8_t		open_excl;	/* mask of exclusively opened slices */
+	ulong_t		open_lyr[V_NUMPAR]; /* number of layered opens */
 	int		dkio_flush_pending; /* # outstanding DKIO flushes */
+	int		validate_pending; /* # outstanding validate request */
+	vd_disk_label_t vdisk_label; 	/* label type of device/disk imported */
+	struct vtoc	*vtoc;		/* structure to store VTOC data */
+	struct dk_geom	*geom;		/* structure to store geometry data */
 
 	kthread_t	*msg_proc_thr;	/* main msg processing thread */
 
@@ -273,14 +279,11 @@
 
 	vio_ver_t	ver;		/* version number agreed with server */
 	vd_disk_type_t	vdisk_type;	/* type of device/disk being imported */
-	vd_disk_label_t vdisk_label; 	/* label type of device/disk imported */
 	uint64_t	vdisk_size;	/* device size in blocks */
 	uint64_t	max_xfer_sz;	/* maximum block size of a descriptor */
 	uint64_t	block_size;	/* device block size used */
-	struct dk_label	*label;		/* structure to store disk label */
 	struct dk_cinfo	*cinfo;		/* structure to store DKIOCINFO data */
 	struct dk_minfo	*minfo;		/* structure for DKIOCGMEDIAINFO data */
-	struct vtoc	*vtoc;		/* structure to store VTOC data */
 	ddi_devid_t	devid;		/* device id */
 	uint64_t	ctimeout;	/* connection timeout in seconds */
 	boolean_t	ctimeout_reached; /* connection timeout has expired */