Mercurial > illumos > illumos-gate
changeset 5874:4c514db6bfb0
6528974 Usage of DKIOCGETEFI has changed and impacts vdisks using zfs volumes
6558966 Virtual disks created from files do not support EFI labels
author | achartre |
---|---|
date | Tue, 22 Jan 2008 09:22:05 -0800 |
parents | 84603a5221d4 |
children | 599b7007a9cd |
files | usr/src/uts/sun4v/io/vdc.c usr/src/uts/sun4v/io/vds.c usr/src/uts/sun4v/io/vdsk_common.c usr/src/uts/sun4v/sys/vdc.h usr/src/uts/sun4v/sys/vdsk_common.h usr/src/uts/sun4v/vds/Makefile |
diffstat | 6 files changed, 713 insertions(+), 664 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/sun4v/io/vdc.c Tue Jan 22 07:06:39 2008 -0800 +++ b/usr/src/uts/sun4v/io/vdc.c Tue Jan 22 09:22:05 2008 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -137,7 +137,7 @@ static int vdc_init_descriptor_ring(vdc_t *vdc); static void vdc_destroy_descriptor_ring(vdc_t *vdc); static int vdc_setup_devid(vdc_t *vdc); -static void vdc_store_label_efi(vdc_t *vdc, struct dk_gpt *efi); +static void vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *); static void vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *); static void vdc_store_label_unk(vdc_t *vdc); static boolean_t vdc_is_opened(vdc_t *vdc); @@ -182,7 +182,7 @@ /* dkio */ static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp); -static int vd_process_efi_ioctl(dev_t dev, int cmd, caddr_t arg, int mode); +static int vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg); static void vdc_create_fake_geometry(vdc_t *vdc); static int vdc_validate_geometry(vdc_t *vdc); static void vdc_validate(vdc_t *vdc); @@ -327,7 +327,6 @@ return (status); if ((status = mod_install(&modlinkage)) != 0) ddi_soft_state_fini(&vdc_state); - vdc_efi_init(vd_process_efi_ioctl); return (status); } @@ -344,7 +343,6 @@ if ((status = mod_remove(&modlinkage)) != 0) return (status); - vdc_efi_fini(); ddi_soft_state_fini(&vdc_state); return (0); } @@ -1023,7 +1021,6 @@ int i; ASSERT(vdc != NULL); - ASSERT(vdc->vtoc != NULL); instance = vdc->instance; dip = vdc->dip; @@ -1055,10 +1052,10 @@ dev = makedevice(ddi_driver_major(dip), VD_MAKE_DEV(instance, i)); - size = vdc->vtoc->v_part[i].p_size * vdc->vtoc->v_sectorsz; + size = vdc->slice[i].nblocks * vdc->block_size; DMSG(vdc, 0, "[%d] sz %ld (%ld Mb) p_size %lx\n", instance, size, size / (1024 * 1024), - vdc->vtoc->v_part[i].p_size); + vdc->slice[i].nblocks); rv = ddi_prop_update_int64(dev, dip, VDC_SIZE_PROP_NAME, size); if (rv != DDI_PROP_SUCCESS) { @@ -1247,7 +1244,7 @@ mutex_enter(&vdc->lock); if (vdc->vdisk_label == VD_DISK_LABEL_UNK || - vdc->vtoc->v_part[slice].p_size == 0) { + vdc->slice[slice].nblocks == 0) { vdc_mark_closed(vdc, slice, flag, otyp); status = EIO; } @@ -6185,18 +6182,20 @@ }; /* - * The signature of vd_process_ioctl() has changed to include the return value - * pointer. However we don't want to change vd_efi_* functions now so we add - * this wrapper function so that we can use it with vdc_efi_init(). - * - * vd_efi_* functions need some changes to fix 6528974 and so we will eventually - * remove this function when fixing that bug. + * This function handles ioctl requests from the vd_efi_alloc_and_read() + * function and forward them to the vdisk. */ static int -vd_process_efi_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) +vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) { + vdc_t *vdc = (vdc_t *)vdisk; + dev_t dev; int rval; - return (vd_process_ioctl(dev, cmd, arg, mode, &rval)); + + dev = makedevice(ddi_driver_major(vdc->dip), + VD_MAKE_DEV(vdc->instance, 0)); + + return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); } /* @@ -7116,6 +7115,9 @@ struct dk_label label; struct dk_geom geom; struct vtoc vtoc; + efi_gpt_t *gpt; + efi_gpe_t *gpe; + vd_efi_dev_t edev; ASSERT(vdc != NULL); ASSERT(vdc->vtoc != NULL && vdc->geom != NULL); @@ -7135,11 +7137,21 @@ /* * If the device does not support VTOC then we try * to read an EFI label. + * + * We need to know the block size and the disk size to + * be able to read an EFI label. */ - struct dk_gpt *efi; - size_t efi_len; - - rv = vdc_efi_alloc_and_read(dev, &efi, &efi_len); + if (vdc->vdisk_size == 0) { + if ((rv = vdc_check_capacity(vdc)) != 0) { + mutex_enter(&vdc->lock); + vdc_store_label_unk(vdc); + return (rv); + } + } + + VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); + + rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe); if (rv) { DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)", @@ -7150,8 +7162,8 @@ } mutex_enter(&vdc->lock); - vdc_store_label_efi(vdc, efi); - vd_efi_free(efi, efi_len); + vdc_store_label_efi(vdc, gpt, gpe); + vd_efi_free(&edev, gpt, gpe); return (ENOTSUP); } @@ -7258,7 +7270,7 @@ vdc_validate(vdc_t *vdc) { vd_disk_label_t old_label; - struct vtoc old_vtoc; + vd_slice_t old_slice[V_NUMPAR]; int rv; ASSERT(!MUTEX_HELD(&vdc->lock)); @@ -7267,7 +7279,7 @@ /* save the current label and vtoc */ old_label = vdc->vdisk_label; - bcopy(vdc->vtoc, &old_vtoc, sizeof (struct vtoc)); + bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); /* check the geometry */ (void) vdc_validate_geometry(vdc); @@ -7287,7 +7299,7 @@ } /* if the vtoc has changed, update device nodes properties */ - if (bcmp(vdc->vtoc, &old_vtoc, sizeof (struct vtoc)) != 0) { + if (bcmp(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR) != 0) { if (vdc_create_device_nodes_props(vdc) != 0) { DMSG(vdc, 0, "![%d] Failed to update device nodes" @@ -7414,39 +7426,54 @@ } static void -vdc_store_label_efi(vdc_t *vdc, struct dk_gpt *efi) +vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe) { - struct vtoc *vtoc = vdc->vtoc; + int i, nparts; ASSERT(MUTEX_HELD(&vdc->lock)); vdc->vdisk_label = VD_DISK_LABEL_EFI; + bzero(vdc->vtoc, sizeof (struct vtoc)); bzero(vdc->geom, sizeof (struct dk_geom)); - vd_efi_to_vtoc(efi, vtoc); - if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { - /* - * vd_efi_to_vtoc() will store information about the EFI Sun - * reserved partition (representing the entire disk) into - * partition 7. However single-slice device will only have - * that single partition and the vdc driver expects to find - * information about that partition in slice 0. So we need - * to copy information from slice 7 to slice 0. - */ - vtoc->v_part[0].p_tag = vtoc->v_part[VD_EFI_WD_SLICE].p_tag; - vtoc->v_part[0].p_flag = vtoc->v_part[VD_EFI_WD_SLICE].p_flag; - vtoc->v_part[0].p_start = vtoc->v_part[VD_EFI_WD_SLICE].p_start; - vtoc->v_part[0].p_size = vtoc->v_part[VD_EFI_WD_SLICE].p_size; - } + bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); + + nparts = gpt->efi_gpt_NumberOfPartitionEntries; + + for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) { + + if (gpe[i].efi_gpe_StartingLBA == 0 || + gpe[i].efi_gpe_EndingLBA == 0) { + continue; + } + + vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA; + vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA - + gpe[i].efi_gpe_StartingLBA + 1; + } + + ASSERT(vdc->vdisk_size != 0); + vdc->slice[VD_EFI_WD_SLICE].start = 0; + vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size; + } static void vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc) { + int i; + ASSERT(MUTEX_HELD(&vdc->lock)); + ASSERT(vdc->block_size == vtoc->v_sectorsz); vdc->vdisk_label = VD_DISK_LABEL_VTOC; bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc)); bcopy(geom, vdc->geom, sizeof (struct dk_geom)); + bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); + + for (i = 0; i < vtoc->v_nparts; i++) { + vdc->slice[i].start = vtoc->v_part[i].p_start; + vdc->slice[i].nblocks = vtoc->v_part[i].p_size; + } } static void @@ -7457,4 +7484,5 @@ vdc->vdisk_label = VD_DISK_LABEL_UNK; bzero(vdc->vtoc, sizeof (struct vtoc)); bzero(vdc->geom, sizeof (struct dk_geom)); + bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); }
--- a/usr/src/uts/sun4v/io/vds.c Tue Jan 22 07:06:39 2008 -0800 +++ b/usr/src/uts/sun4v/io/vds.c Tue Jan 22 09:22:05 2008 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -98,6 +98,9 @@ /* Timeout for SCSI I/O */ #define VD_SCSI_RDWR_TIMEOUT 30 /* 30 secs */ +/* Maximum number of logical partitions */ +#define VD_MAXPART (NDKMAP + 1) + /* * By Solaris convention, slice/partition 2 represents the entire disk; * unfortunately, this convention does not appear to be codified. @@ -356,10 +359,12 @@ vnode_t *file_vnode; /* file vnode */ size_t file_size; /* file size */ ddi_devid_t file_devid; /* devid for disk image */ - struct dk_efi dk_efi; /* synthetic for slice type */ + efi_gpt_t efi_gpt; /* EFI GPT for slice type */ + efi_gpe_t efi_gpe; /* EFI GPE for slice type */ + int efi_reserved; /* EFI reserved slice */ struct dk_geom dk_geom; /* synthetic for slice type */ - struct dk_minfo dk_minfo; /* synthetic for slice type */ struct vtoc vtoc; /* synthetic for slice type */ + vd_slice_t slices[VD_MAXPART]; /* logical partitions */ boolean_t ownership; /* disk ownership status */ ldc_status_t ldc_state; /* LDC connection state */ ldc_handle_t ldc_handle; /* handle for LDC comm */ @@ -466,6 +471,9 @@ static boolean_t vd_file_is_iso_image(vd_t *vd); static void vd_set_exported_operations(vd_t *vd); static void vd_reset_access(vd_t *vd); +static int vd_backend_ioctl(vd_t *vd, int cmd, caddr_t arg); +static int vds_efi_alloc_and_read(vd_t *, efi_gpt_t **, efi_gpe_t **); +static void vds_efi_free(vd_t *, efi_gpt_t *, efi_gpe_t *); /* * Function: @@ -521,27 +529,36 @@ * from v1.1 onwards must do the right thing. */ if (vd->vdisk_label == VD_DISK_LABEL_UNK && - vio_ver_is_supported(vd->version, 1, 1) && - vd_file_validate_geometry(vd) != 0) { - PR0("Unknown disk label, can't do I/O from slice %d", - slice); - return (-1); + vio_ver_is_supported(vd->version, 1, 1)) { + (void) vd_file_validate_geometry(vd); + if (vd->vdisk_label == VD_DISK_LABEL_UNK) { + PR0("Unknown disk label, can't do I/O " + "from slice %d", slice); + return (-1); + } } - if (blk >= vd->vtoc.v_part[slice].p_size) { + if (vd->vdisk_label == VD_DISK_LABEL_VTOC) { + ASSERT(vd->vtoc.v_sectorsz == DEV_BSIZE); + } else { + ASSERT(vd->vdisk_label == VD_DISK_LABEL_EFI); + ASSERT(vd->vdisk_block_size == DEV_BSIZE); + } + + if (blk >= vd->slices[slice].nblocks) { /* address past the end of the slice */ PR0("req_addr (0x%lx) > psize (0x%lx)", - blk, vd->vtoc.v_part[slice].p_size); + blk, vd->slices[slice].nblocks); return (0); } - offset = (vd->vtoc.v_part[slice].p_start + blk) * DEV_BSIZE; + offset = (vd->slices[slice].start + blk) * DEV_BSIZE; /* * If the requested size is greater than the size * of the partition, truncate the read/write. */ - maxlen = (vd->vtoc.v_part[slice].p_size - blk) * DEV_BSIZE; + maxlen = (vd->slices[slice].nblocks - blk) * DEV_BSIZE; if (len > maxlen) { PR0("I/O size truncated to %lu bytes from %lu bytes", @@ -556,8 +573,8 @@ * s0 instead s2) the system can try to access slices that * are not included into the disk image. */ - if ((offset + len) >= vd->file_size) { - PR0("offset + nbytes (0x%lx + 0x%lx) >= " + if ((offset + len) > vd->file_size) { + PR0("offset + nbytes (0x%lx + 0x%lx) > " "file_size (0x%lx)", offset, len, vd->file_size); return (-1); } @@ -630,10 +647,11 @@ { size_t size; char prefix; - int slice, nparts; - uint16_t tag; ASSERT(vd->file); + ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK); + + bzero(label, sizeof (struct dk_label)); /* * We must have a resonable number of cylinders and sectors so @@ -664,17 +682,8 @@ label->dkl_acyl = 0; - if (vd->vdisk_type == VD_DISK_TYPE_SLICE) { - nparts = 1; - slice = 0; - tag = V_UNASSIGNED; - } else { - if (label->dkl_pcyl > 2) - label->dkl_acyl = 2; - nparts = V_NUMPAR; - slice = VD_ENTIRE_DISK_SLICE; - tag = V_BACKUP; - } + if (label->dkl_pcyl > 2) + label->dkl_acyl = 2; label->dkl_nsect = vd->file_size / (DEV_BSIZE * label->dkl_pcyl); @@ -719,12 +728,13 @@ /* default VTOC */ label->dkl_vtoc.v_version = V_VERSION; - label->dkl_vtoc.v_nparts = nparts; + label->dkl_vtoc.v_nparts = V_NUMPAR; label->dkl_vtoc.v_sanity = VTOC_SANE; - label->dkl_vtoc.v_part[slice].p_tag = tag; - label->dkl_map[slice].dkl_cylno = 0; - label->dkl_map[slice].dkl_nblk = label->dkl_ncyl * + label->dkl_vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_tag = V_BACKUP; + label->dkl_map[VD_ENTIRE_DISK_SLICE].dkl_cylno = 0; + label->dkl_map[VD_ENTIRE_DISK_SLICE].dkl_nblk = label->dkl_ncyl * label->dkl_nhead * label->dkl_nsect; + label->dkl_magic = DKL_MAGIC; label->dkl_cksum = vd_lbl2cksum(label); } @@ -820,6 +830,29 @@ diskaddr_t spc, head, cyl; ASSERT(vd->file); + + if (vd->vdisk_label == VD_DISK_LABEL_UNK) { + /* + * If no label is defined we don't know where to find + * a device id. + */ + return (ENOSPC); + } + + if (vd->vdisk_label == VD_DISK_LABEL_EFI) { + /* + * For an EFI disk, the devid is at the beginning of + * the reserved slice + */ + if (vd->efi_reserved == -1) { + PR0("EFI disk has no reserved slice"); + return (ENOSPC); + } + + *blkp = vd->slices[vd->efi_reserved].start; + return (0); + } + ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC); /* this geometry doesn't allow us to have a devid */ @@ -956,6 +989,11 @@ size_t blk; int status; + if (devid == NULL) { + /* nothing to write */ + return (0); + } + if ((status = vd_file_get_devid_block(vd, &blk)) != 0) return (status); @@ -1921,38 +1959,6 @@ } } -static vd_disk_label_t -vd_read_vtoc(vd_t *vd, struct vtoc *vtoc) -{ - int status, rval; - struct dk_gpt *efi; - size_t efi_len; - - ASSERT(vd->ldi_handle[0] != NULL); - - status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)vtoc, - (vd->open_flags | FKIOCTL), kcred, &rval); - - if (status == 0) { - return (VD_DISK_LABEL_VTOC); - } else if (status != ENOTSUP) { - PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status); - return (VD_DISK_LABEL_UNK); - } - - status = vds_efi_alloc_and_read(vd->ldi_handle[0], &efi, &efi_len); - - if (status) { - PR0("vds_efi_alloc_and_read returned error %d", status); - return (VD_DISK_LABEL_UNK); - } - - vd_efi_to_vtoc(efi, vtoc); - vd_efi_free(efi, efi_len); - - return (VD_DISK_LABEL_EFI); -} - static ushort_t vd_lbl2cksum(struct dk_label *label) { @@ -1982,10 +1988,23 @@ vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg) { dk_efi_t *dk_ioc; + int rval; + + ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE); + + if (cmd == DKIOCFLUSHWRITECACHE) { + if (vd->file) { + return (VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL)); + } else { + return (ldi_ioctl(vd->ldi_handle[0], cmd, + (intptr_t)ioctl_arg, vd->open_flags | FKIOCTL, + kcred, &rval)); + } + } switch (vd->vdisk_label) { - /* ioctls for a slice from a disk with a VTOC label */ + /* ioctls for a single slice disk with a VTOC label */ case VD_DISK_LABEL_VTOC: switch (cmd) { @@ -2001,17 +2020,49 @@ return (ENOTSUP); } - /* ioctls for a slice from a disk with an EFI label */ + /* ioctls for a single slice disk with an EFI label */ case VD_DISK_LABEL_EFI: switch (cmd) { case DKIOCGETEFI: ASSERT(ioctl_arg != NULL); dk_ioc = (dk_efi_t *)ioctl_arg; - if (dk_ioc->dki_length < vd->dk_efi.dki_length) + + /* + * For a single slice disk with an EFI label, we define + * a fake EFI label with the GPT at LBA 1 and one GPE + * at LBA 2. So we return the GPT or the GPE depending + * on which LBA is requested. + */ + if (dk_ioc->dki_lba == 1) { + + /* return the EFI GPT */ + if (dk_ioc->dki_length < sizeof (efi_gpt_t)) + return (EINVAL); + + bcopy(&vd->efi_gpt, dk_ioc->dki_data, + sizeof (efi_gpt_t)); + + /* also return the GPE if possible */ + if (dk_ioc->dki_length >= sizeof (efi_gpt_t) + + sizeof (efi_gpe_t)) { + bcopy(&vd->efi_gpe, dk_ioc->dki_data + + 1, sizeof (efi_gpe_t)); + } + + } else if (dk_ioc->dki_lba == 2) { + + /* return the EFI GPE */ + if (dk_ioc->dki_length < sizeof (efi_gpe_t)) + return (EINVAL); + + bcopy(&vd->efi_gpe, dk_ioc->dki_data, + sizeof (efi_gpe_t)); + + } else { return (EINVAL); - bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data, - vd->dk_efi.dki_length); + } + return (0); default: return (ENOTSUP); @@ -2023,6 +2074,74 @@ } } +static int +vds_efi_alloc_and_read(vd_t *vd, efi_gpt_t **gpt, efi_gpe_t **gpe) +{ + vd_efi_dev_t edev; + int status; + + VD_EFI_DEV_SET(edev, vd, (vd_efi_ioctl_func)vd_backend_ioctl); + + status = vd_efi_alloc_and_read(&edev, gpt, gpe); + + return (status); +} + +static void +vds_efi_free(vd_t *vd, efi_gpt_t *gpt, efi_gpe_t *gpe) +{ + vd_efi_dev_t edev; + + VD_EFI_DEV_SET(edev, vd, (vd_efi_ioctl_func)vd_backend_ioctl); + + vd_efi_free(&edev, gpt, gpe); +} + +static int +vd_file_validate_efi(vd_t *vd) +{ + efi_gpt_t *gpt; + efi_gpe_t *gpe; + int i, nparts, status; + struct uuid efi_reserved = EFI_RESERVED; + + if ((status = vds_efi_alloc_and_read(vd, &gpt, &gpe)) != 0) + return (status); + + bzero(&vd->vtoc, sizeof (struct vtoc)); + bzero(&vd->dk_geom, sizeof (struct dk_geom)); + bzero(vd->slices, sizeof (vd_slice_t) * VD_MAXPART); + + vd->efi_reserved = -1; + + nparts = gpt->efi_gpt_NumberOfPartitionEntries; + + for (i = 0; i < nparts && i < VD_MAXPART; i++) { + + if (gpe[i].efi_gpe_StartingLBA == 0 || + gpe[i].efi_gpe_EndingLBA == 0) { + continue; + } + + vd->slices[i].start = gpe[i].efi_gpe_StartingLBA; + vd->slices[i].nblocks = gpe[i].efi_gpe_EndingLBA - + gpe[i].efi_gpe_StartingLBA + 1; + + if (bcmp(&gpe[i].efi_gpe_PartitionTypeGUID, &efi_reserved, + sizeof (struct uuid)) == 0) + vd->efi_reserved = i; + + } + + ASSERT(vd->vdisk_size != 0); + vd->slices[VD_EFI_WD_SLICE].start = 0; + vd->slices[VD_EFI_WD_SLICE].nblocks = vd->vdisk_size; + + vds_efi_free(vd, gpt, gpe); + + return (status); +} + /* * Function: * vd_file_validate_geometry @@ -2034,7 +2153,7 @@ * * If no valid label is found, the label is set to unknown and the * function returns EINVAL, but a default vtoc and geometry are provided - * to the driver. + * to the driver. If an EFI label is found, ENOTSUP is returned. * * Parameters: * vd - disk on which the operation is performed. @@ -2043,6 +2162,7 @@ * 0 - success. * EIO - error reading the label from the disk image. * EINVAL - unknown disk label. + * ENOTSUP - geometry not applicable (EFI label). */ static int vd_file_validate_geometry(vd_t *vd) @@ -2054,34 +2174,26 @@ int status = 0; ASSERT(vd->file); - - if (vd->vdisk_type == VD_DISK_TYPE_SLICE) { - /* - * For single slice disk we always fake the geometry, and we - * only need to do it once because the geometry will never - * change. - */ - if (vd->vdisk_label == VD_DISK_LABEL_VTOC) - /* geometry was already validated */ - return (0); - - ASSERT(vd->vdisk_label == VD_DISK_LABEL_UNK); + ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK); + + if (VD_FILE_LABEL_READ(vd, &label) < 0) + return (EIO); + + if (label.dkl_magic != DKL_MAGIC || + label.dkl_cksum != vd_lbl2cksum(&label) || + label.dkl_vtoc.v_sanity != VTOC_SANE || + label.dkl_vtoc.v_nparts != V_NUMPAR) { + + if (vd_file_validate_efi(vd) == 0) { + vd->vdisk_label = VD_DISK_LABEL_EFI; + return (ENOTSUP); + } + + vd->vdisk_label = VD_DISK_LABEL_UNK; vd_file_build_default_label(vd, &label); + status = EINVAL; + } else { vd->vdisk_label = VD_DISK_LABEL_VTOC; - } else { - if (VD_FILE_LABEL_READ(vd, &label) < 0) - return (EIO); - - if (label.dkl_magic != DKL_MAGIC || - label.dkl_cksum != vd_lbl2cksum(&label) || - label.dkl_vtoc.v_sanity != VTOC_SANE || - label.dkl_vtoc.v_nparts != V_NUMPAR) { - vd->vdisk_label = VD_DISK_LABEL_UNK; - vd_file_build_default_label(vd, &label); - status = EINVAL; - } else { - vd->vdisk_label = VD_DISK_LABEL_VTOC; - } } /* Update the driver geometry */ @@ -2131,6 +2243,15 @@ bcopy(label.dkl_vtoc.v_volume, vtoc->v_volume, LEN_DKL_VVOL); + /* Update logical partitions */ + bzero(vd->slices, sizeof (vd_slice_t) * VD_MAXPART); + if (vd->vdisk_label != VD_DISK_LABEL_UNK) { + for (i = 0; i < vtoc->v_nparts; i++) { + vd->slices[i].start = vtoc->v_part[i].p_start; + vd->slices[i].nblocks = vtoc->v_part[i].p_size; + } + } + return (status); } @@ -2147,9 +2268,11 @@ struct dk_label label; struct dk_geom *geom; struct vtoc *vtoc; + dk_efi_t *efi; int i, rc; ASSERT(vd->file); + ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK); switch (cmd) { @@ -2158,11 +2281,8 @@ geom = (struct dk_geom *)ioctl_arg; rc = vd_file_validate_geometry(vd); - if (rc != 0 && rc != EINVAL) { - ASSERT(vd->vdisk_type != VD_DISK_TYPE_SLICE); + if (rc != 0 && rc != EINVAL) return (rc); - } - bcopy(&vd->dk_geom, geom, sizeof (struct dk_geom)); return (0); @@ -2171,11 +2291,8 @@ vtoc = (struct vtoc *)ioctl_arg; rc = vd_file_validate_geometry(vd); - if (rc != 0 && rc != EINVAL) { - ASSERT(vd->vdisk_type != VD_DISK_TYPE_SLICE); + if (rc != 0 && rc != EINVAL) return (rc); - } - bcopy(&vd->vtoc, vtoc, sizeof (struct vtoc)); return (0); @@ -2183,10 +2300,6 @@ ASSERT(ioctl_arg != NULL); geom = (struct dk_geom *)ioctl_arg; - /* geometry can only be changed for full disk */ - if (vd->vdisk_type != VD_DISK_TYPE_DISK) - return (ENOTSUP); - if (geom->dkg_nhead == 0 || geom->dkg_nsect == 0) return (EINVAL); @@ -2205,10 +2318,6 @@ vd->dk_geom.dkg_nsect != 0); vtoc = (struct vtoc *)ioctl_arg; - /* vtoc can only be changed for full disk */ - if (vd->vdisk_type != VD_DISK_TYPE_DISK) - return (ENOTSUP); - if (vtoc->v_sanity != VTOC_SANE || vtoc->v_sectorsz != DEV_BSIZE || vtoc->v_nparts != V_NUMPAR) @@ -2263,28 +2372,87 @@ if ((rc = vd_file_set_vtoc(vd, &label)) != 0) return (rc); - /* check the geometry and update the driver info */ - if ((rc = vd_file_validate_geometry(vd)) != 0) - return (rc); - - /* - * The disk geometry may have changed, so we need to write - * the devid (if there is one) so that it is stored at the - * right location. - */ - if (vd->file_devid != NULL && - vd_file_write_devid(vd, vd->file_devid) != 0) { - PR0("Fail to write devid"); - } - - return (0); + break; case DKIOCFLUSHWRITECACHE: return (VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL)); + case DKIOCGETEFI: + ASSERT(ioctl_arg != NULL); + efi = (dk_efi_t *)ioctl_arg; + + if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, + (caddr_t)efi->dki_data, efi->dki_lba, efi->dki_length) < 0) + return (EIO); + + return (0); + + case DKIOCSETEFI: + ASSERT(ioctl_arg != NULL); + efi = (dk_efi_t *)ioctl_arg; + + if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, + (caddr_t)efi->dki_data, efi->dki_lba, efi->dki_length) < 0) + return (EIO); + + break; + + default: return (ENOTSUP); } + + ASSERT(cmd == DKIOCSVTOC || cmd == DKIOCSETEFI); + + /* label has changed, revalidate the geometry */ + (void) vd_file_validate_geometry(vd); + + /* + * The disk geometry may have changed, so we need to write + * the devid (if there is one) so that it is stored at the + * right location. + */ + if (vd_file_write_devid(vd, vd->file_devid) != 0) { + PR0("Fail to write devid"); + } + + return (0); +} + +static int +vd_backend_ioctl(vd_t *vd, int cmd, caddr_t arg) +{ + int rval = 0, status; + + /* + * Call the appropriate function to execute the ioctl depending + * on the type of vdisk. + */ + if (vd->vdisk_type == VD_DISK_TYPE_SLICE) { + + /* slice, file or volume exported as a single slice disk */ + status = vd_do_slice_ioctl(vd, cmd, arg); + + } else if (vd->file) { + + /* file or volume exported as a full disk */ + status = vd_do_file_ioctl(vd, cmd, arg); + + } else { + + /* disk device exported as a full disk */ + status = ldi_ioctl(vd->ldi_handle[0], cmd, (intptr_t)arg, + vd->open_flags | FKIOCTL, kcred, &rval); + } + +#ifdef DEBUG + if (rval != 0) { + PR0("ioctl %x set rval = %d, which is not being returned" + " to caller", cmd, rval); + } +#endif /* DEBUG */ + + return (status); } /* @@ -2303,7 +2471,7 @@ static int vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) { - int rval = 0, status = 0; + int status = 0; size_t nbytes = request->nbytes; /* modifiable copy */ @@ -2350,29 +2518,9 @@ } /* - * Handle single-slice block devices internally; otherwise, have the - * real driver perform the ioctl() + * Send the ioctl to the disk backend. */ - if (vd->file) { - request->status = - vd_do_file_ioctl(vd, ioctl->cmd, (void *)ioctl->arg); - - } else if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) { - request->status = - vd_do_slice_ioctl(vd, ioctl->cmd, (void *)ioctl->arg); - - } else { - request->status = ldi_ioctl(vd->ldi_handle[request->slice], - ioctl->cmd, (intptr_t)ioctl->arg, vd->open_flags | FKIOCTL, - kcred, &rval); - -#ifdef DEBUG - if (rval != 0) { - PR0("%s set rval = %d, which is not being returned to" - " client", ioctl->cmd_name, rval); - } -#endif /* DEBUG */ - } + request->status = vd_backend_ioctl(vd, ioctl->cmd, ioctl->arg); if (request->status != 0) { PR0("ioctl(%s) = errno %d", ioctl->cmd_name, request->status); @@ -4459,6 +4607,36 @@ return (0); } +/* + * When a slice or a volume is exported as a single-slice disk, we want + * the disk backend (i.e. the slice or volume) to be entirely mapped as + * a slice without the addition of any metadata. + * + * So when exporting the disk as a VTOC disk, we fake a disk with the following + * layout: + * + * 0 1 N+1 + * +-+--------------------------+ + * virtual disk: |L| slice 0 | + * +-+--------------------------+ + * ^: : + * |: : + * VTOC LABEL--+: : + * +--------------------------+ + * disk backend: | slice/volume | + * +--------------------------+ + * 0 N + * + * N is the number of blocks in the slice/volume. + * + * We simulate a disk with N+1 blocks. The first block (block 0) is faked and + * can not be changed. The remaining blocks (1 to N+1) defines slice 0 and are + * mapped to the exported slice or volume: + * + * - block 0 (L) can return a fake VTOC label if raw read was implemented. + * - block 1 to N+1 is mapped to the exported slice or volume. + * + */ static int vd_setup_partition_vtoc(vd_t *vd) { @@ -4483,7 +4661,7 @@ PRN("%s geometry claims 0 heads", device_path); return (EIO); } - vd->dk_geom.dkg_ncyl = vd->vdisk_size / vd->dk_geom.dkg_nsect / + vd->dk_geom.dkg_ncyl = (vd->vdisk_size + 1) / vd->dk_geom.dkg_nsect / vd->dk_geom.dkg_nhead; vd->dk_geom.dkg_acyl = 0; vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl; @@ -4496,34 +4674,74 @@ vd->vtoc.v_nparts = 1; vd->vtoc.v_part[0].p_tag = V_UNASSIGNED; vd->vtoc.v_part[0].p_flag = 0; - vd->vtoc.v_part[0].p_start = 0; + vd->vtoc.v_part[0].p_start = 1; vd->vtoc.v_part[0].p_size = vd->vdisk_size; bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel, MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel))); + /* adjust the vdisk_size, we emulate the first block */ + vd->vdisk_size += 1; + return (0); } +/* + * When a slice, volume or file is exported as a single-slice disk, we want + * the disk backend (i.e. the slice, volume or file) to be entirely mapped + * as a slice without the addition of any metadata. + * + * So when exporting the disk as an EFI disk, we fake a disk with the following + * layout: + * + * 0 1 2 3 34 34+N + * +-+-+-+-------+--------------------------+ + * virtual disk: |X|T|E|XXXXXXX| slice 0 | + * +-+-+-+-------+--------------------------+ + * ^ ^ : : + * | | : : + * GPT-+ +-GPE : : + * +--------------------------+ + * disk backend: | slice/volume/file | + * +--------------------------+ + * 0 N + * + * N is the number of blocks in the slice/volume/file. + * + * We simulate a disk with 34+N blocks. The first 34 blocks (0 to 33) are + * emulated and can not be changed. The remaining blocks (34 to 34+N) defines + * slice 0 and are mapped to the exported slice, volume or file: + * + * - block 0 (X) is unused and can return 0 if raw read was implemented. + * - block 1 (T) returns a fake EFI GPT (via DKIOCGETEFI) + * - block 2 (E) returns a fake EFI GPE (via DKIOCGETEFI) + * - block 3 to 33 (X) are unused and return 0 if raw read is implemented. + * - block 34 to 34+N is mapped to the exported slice, volume or file. + * + */ static int vd_setup_partition_efi(vd_t *vd) { efi_gpt_t *gpt; efi_gpe_t *gpe; - struct uuid uuid = EFI_RESERVED; + struct uuid uuid = EFI_USR; uint32_t crc; - int length; - - length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t); - - gpt = kmem_zalloc(length, KM_SLEEP); - gpe = (efi_gpe_t *)(gpt + 1); + + gpt = &vd->efi_gpt; + gpe = &vd->efi_gpe; + + bzero(gpt, sizeof (efi_gpt_t)); + bzero(gpe, sizeof (efi_gpe_t)); + + /* adjust the vdisk_size, we emulate the first 34 blocks */ + vd->vdisk_size += 34; gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE); gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t)); - gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL); + gpt->efi_gpt_FirstUsableLBA = LE_64(34ULL); gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1); gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1); + gpt->efi_gpt_PartitionEntryLBA = LE_64(2ULL); gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid); @@ -4536,10 +4754,6 @@ CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table); gpt->efi_gpt_HeaderCRC32 = LE_32(~crc); - vd->dk_efi.dki_lba = 0; - vd->dk_efi.dki_length = length; - vd->dk_efi.dki_data = gpt; - return (0); } @@ -4592,37 +4806,12 @@ return (EIO); } - /* - * Find and validate the geometry of a disk image. For a single slice - * disk image, this will build a fake geometry and vtoc. - */ - status = vd_file_validate_geometry(vd); - if (status != 0 && status != EINVAL) { - PRN("Failed to read label from %s", file_path); - return (EIO); - } - /* sector size = block size = DEV_BSIZE */ vd->block_size = DEV_BSIZE; vd->vdisk_block_size = DEV_BSIZE; vd->vdisk_size = vd->file_size / DEV_BSIZE; vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */ - if (vd_file_is_iso_image(vd)) { - /* - * Indicate whether to call this a CD or DVD from the size - * of the ISO image (images for both drive types are stored - * in the ISO-9600 format). CDs can store up to just under 1Gb - */ - if ((vd->vdisk_size * vd->vdisk_block_size) > - (1024 * 1024 * 1024)) - vd->vdisk_media = VD_MEDIA_DVD; - else - vd->vdisk_media = VD_MEDIA_CD; - } else { - vd->vdisk_media = VD_MEDIA_FIXED; - } - /* * Get max_xfer_sz from the device where the file is or from the device * itself if we have a pseudo device. @@ -4666,11 +4855,39 @@ PR0("using file %s, dev %s, max_xfer = %u blks", file_path, dev_path, vd->max_xfer_sz); + if (vd->vdisk_type == VD_DISK_TYPE_SLICE) { + ASSERT(!vd->pseudo); + vd->vdisk_label = VD_DISK_LABEL_EFI; + status = vd_setup_partition_efi(vd); + return (0); + } + + /* + * Find and validate the geometry of a disk image. + */ + status = vd_file_validate_geometry(vd); + if (status != 0 && status != EINVAL && status != ENOTSUP) { + PRN("Failed to read label from %s", file_path); + return (EIO); + } + + if (vd_file_is_iso_image(vd)) { + /* + * Indicate whether to call this a CD or DVD from the size + * of the ISO image (images for both drive types are stored + * in the ISO-9600 format). CDs can store up to just under 1Gb + */ + if ((vd->vdisk_size * vd->vdisk_block_size) > + (1024 * 1024 * 1024)) + vd->vdisk_media = VD_MEDIA_DVD; + else + vd->vdisk_media = VD_MEDIA_CD; + } else { + vd->vdisk_media = VD_MEDIA_FIXED; + } + /* Setup devid for the disk image */ - if (vd->vdisk_type == VD_DISK_TYPE_SLICE) - return (0); - if (vd->vdisk_label != VD_DISK_LABEL_UNK) { status = vd_file_read_devid(vd, &vd->file_devid); @@ -4870,8 +5087,6 @@ return (EIO); } - vd->vdisk_label = vd_read_vtoc(vd, &vd->vtoc); - /* Store the device's max transfer size for return to the client */ vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; @@ -4917,7 +5132,7 @@ static int vd_setup_single_slice_disk(vd_t *vd) { - int status; + int status, rval; char *device_path = vd->device_path; /* Get size of backing device */ @@ -4931,42 +5146,41 @@ vd->vdisk_media = VD_MEDIA_FIXED; if (vd->pseudo) { - ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE); - - /* - * Currently we only support exporting pseudo devices which - * provide a valid disk label. - */ - if (vd->vdisk_label == VD_DISK_LABEL_UNK) { - PRN("%s is a pseudo device with an invalid disk " - "label\n", device_path); - return (EINVAL); - } - return (0); /* ...and we're done */ - } - - /* We can only export a slice if the disk has a valid label */ - if (vd->vdisk_label == VD_DISK_LABEL_UNK) { - PRN("%s is a slice from a disk with an unknown disk label\n", - device_path); - return (EINVAL); } /* * We export the slice as a single slice disk even if the "slice" * option was not specified. */ - vd->vdisk_type = VD_DISK_TYPE_SLICE; + vd->vdisk_type = VD_DISK_TYPE_SLICE; vd->nslices = 1; - if (vd->vdisk_label == VD_DISK_LABEL_EFI) { - /* Slice from a disk with an EFI label */ - status = vd_setup_partition_efi(vd); + /* + * When exporting a slice or a device as a single slice disk, we don't + * care about any partitioning exposed by the backend. The goal is just + * to export the backend as a flat storage. We provide a fake partition + * table (either a VTOC or EFI), which presents only one slice, to + * accommodate tools expecting a disk label. + * + * We check the label of the backend to export the device as a slice + * using the same type of label (VTOC or EFI). If there is no label + * then we create a fake EFI label. + * + * Note that the partition table we are creating could also be faked + * by the client based on the size of the backend device. + */ + status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)&vd->vtoc, + (vd->open_flags | FKIOCTL), kcred, &rval); + + if (status == 0) { + /* export with a fake VTOC label */ + vd->vdisk_label = VD_DISK_LABEL_VTOC; + status = vd_setup_partition_vtoc(vd); } else { - /* Slice from a disk with a VTOC label */ - ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC); - status = vd_setup_partition_vtoc(vd); + /* export with a fake EFI label */ + vd->vdisk_label = VD_DISK_LABEL_EFI; + status = vd_setup_partition_efi(vd); } return (status); @@ -5263,9 +5477,6 @@ PR0("Destroying vdisk state"); - if (vd->dk_efi.dki_data != NULL) - kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length); - /* Disable queuing requests for the vdisk */ if (vd->initialized & VD_LOCKING) { mutex_enter(&vd->lock);
--- a/usr/src/uts/sun4v/io/vdsk_common.c Tue Jan 22 07:06:39 2008 -0800 +++ b/usr/src/uts/sun4v/io/vdsk_common.c Tue Jan 22 09:22:05 2008 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,77 +45,23 @@ */ /* - * This code is a port of the functions efi_alloc_read() and efi_free() from - * the libefi userland library to the kernel so that the vDisk drivers (vdc - * and vds) can read EFI data. We will certaintly be able to remove that code - * once RFE 6213117 is implemented. + * This code provides generic functions to the vds and vdc drivers to read + * EFI labels from the disk backend and to get the EFI GPT and GPE. This is + * inspired from the libefi userland library and the cmlb driver. We will + * certainly be able to remove that code if RFE 6213117 is ever implemented. */ -#define VD_IOCTL_FLAGS (FEXCL | FREAD | FWRITE | FKIOCTL) - #define VD_EFI_DEBUG if (vd_efi_debug) vd_efi_print -/* - * The number of blocks the EFI label takes up (round up to nearest - * block) - */ -#define NBLOCKS(p, l) (1 + ((((p) * (int)sizeof (efi_gpe_t)) + \ - ((l) - 1)) / (l))) -/* number of partitions -- limited by what we can malloc */ -#define MAX_PARTS ((4294967295UL - sizeof (struct dk_gpt)) / \ - sizeof (struct dk_part)) - -/* - * The vd_efi_alloc_and_read() function will use some ioctls to get EFI data - * but the way we issue ioctl is different depending if we are on the vDisk - * server side (vds) or on the vDisk client side. - * - * On the server side (vds), we reference a layered device (ldi_handle_t) so we - * will use the LDI interface to execute ioctls (ldi_ioctl()). On the client - * side (vdc), we reference a vdc device (with a dev_t) so we directly invoke - * the function of the vdc driver implementing ioctls (vd_process_ioctl()). - */ -#define VD_EFI_CALLER_VDS 0 -#define VD_EFI_CALLER_VDC 1 - -typedef struct vd_efi_dev { - int caller; - union { - ldi_handle_t vds; - dev_t vdc; - } ioctl_dev; -} vd_efi_dev_t; +#ifdef DEBUG +static int vd_efi_debug = 1; +#else +static int vd_efi_debug = 0; +#endif -static int (*vdc_ioctl_func)(dev_t dev, int cmd, caddr_t arg, int mode) = NULL; - -static int vd_efi_debug = 1; - -static struct uuid_to_ptag { - struct uuid uuid; -} conversion_array[] = { - { EFI_UNUSED }, - { EFI_BOOT }, - { EFI_ROOT }, - { EFI_SWAP }, - { EFI_USR }, - { EFI_BACKUP }, - { 0 }, /* STAND is never used */ - { EFI_VAR }, - { EFI_HOME }, - { EFI_ALTSCTR }, - { 0 }, /* CACHE (cachefs) is never used */ - { EFI_RESERVED }, - { EFI_SYSTEM }, - { EFI_LEGACY_MBR }, - { EFI_RESV3 }, - { EFI_RESV4 }, - { EFI_MSFT_RESV }, - { EFI_DELL_BASIC }, - { EFI_DELL_RAID }, - { EFI_DELL_SWAP }, - { EFI_DELL_LVM }, - { EFI_DELL_RESV } -}; +#define VD_EFI_GPE_LEN(vdisk, nparts) \ + ((((sizeof (efi_gpe_t) * (nparts) - 1) / (vdisk)->block_size) + 1) * \ + (vdisk)->block_size) static void vd_efi_print(const char *format, ...) @@ -144,51 +90,79 @@ } static int -vd_ioctl(vd_efi_dev_t *dev, int cmd, void *arg, int flag, - cred_t *cred, int *rvalp) +vd_efi_ioctl(vd_efi_dev_t *dev, int cmd, void *arg) { - int error; + int status; + + ASSERT(dev->vdisk_ioctl != NULL); + ASSERT(dev->vdisk != NULL); + status = (*dev->vdisk_ioctl)(dev->vdisk, cmd, (uintptr_t)arg); + + return (status); +} - if (dev->caller == VD_EFI_CALLER_VDS) { - error = ldi_ioctl(dev->ioctl_dev.vds, cmd, - (intptr_t)arg, flag, cred, rvalp); - } else { - ASSERT(vdc_ioctl_func != NULL); - error = (*vdc_ioctl_func)(dev->ioctl_dev.vdc, cmd, - arg, flag); - } - - return (error); +/* + * Swap GPT data to match with the system endianness. + */ +static void +vd_efi_swap_gpt(efi_gpt_t *gpt) +{ + gpt->efi_gpt_Signature = LE_64(gpt->efi_gpt_Signature); + gpt->efi_gpt_Revision = LE_32(gpt->efi_gpt_Revision); + gpt->efi_gpt_HeaderSize = LE_32(gpt->efi_gpt_HeaderSize); + gpt->efi_gpt_HeaderCRC32 = LE_32(gpt->efi_gpt_HeaderCRC32); + gpt->efi_gpt_MyLBA = LE_64(gpt->efi_gpt_MyLBA); + gpt->efi_gpt_AlternateLBA = LE_64(gpt->efi_gpt_AlternateLBA); + gpt->efi_gpt_FirstUsableLBA = LE_64(gpt->efi_gpt_FirstUsableLBA); + gpt->efi_gpt_LastUsableLBA = LE_64(gpt->efi_gpt_LastUsableLBA); + UUID_LE_CONVERT(gpt->efi_gpt_DiskGUID, gpt->efi_gpt_DiskGUID); + gpt->efi_gpt_PartitionEntryLBA = LE_64(gpt->efi_gpt_PartitionEntryLBA); + gpt->efi_gpt_NumberOfPartitionEntries = + LE_32(gpt->efi_gpt_NumberOfPartitionEntries); + gpt->efi_gpt_SizeOfPartitionEntry = + LE_32(gpt->efi_gpt_SizeOfPartitionEntry); + gpt->efi_gpt_PartitionEntryArrayCRC32 = + LE_32(gpt->efi_gpt_PartitionEntryArrayCRC32); } -static int -vd_efi_ioctl(vd_efi_dev_t *dev, int cmd, dk_efi_t *dk_ioc) +/* + * Swap GPE data to match with the system endianness. + */ +static void +vd_efi_swap_gpe(efi_gpe_t *gpe, int nparts) { - void *data = dk_ioc->dki_data; - int error; + int i, j; - dk_ioc->dki_data_64 = (uint64_t)(uintptr_t)data; - error = vd_ioctl(dev, cmd, (caddr_t)dk_ioc, VD_IOCTL_FLAGS, - kcred, NULL); - dk_ioc->dki_data = data; - - return (error); + for (i = 0; i < nparts; i++) { + UUID_LE_CONVERT(gpe[i].efi_gpe_PartitionTypeGUID, + gpe[i].efi_gpe_PartitionTypeGUID); + UUID_LE_CONVERT(gpe[i].efi_gpe_UniquePartitionGUID, + gpe[i].efi_gpe_UniquePartitionGUID); + gpe[i].efi_gpe_StartingLBA = LE_64(gpe[i].efi_gpe_StartingLBA); + gpe[i].efi_gpe_EndingLBA = LE_64(gpe[i].efi_gpe_EndingLBA); + gpe[i].efi_gpe_Attributes.PartitionAttrs = + LE_16(gpe[i].efi_gpe_Attributes.PartitionAttrs); + for (j = 0; j < EFI_PART_NAME_LEN; j++) { + gpe[i].efi_gpe_PartitionName[j] = + LE_16(gpe[i].efi_gpe_PartitionName[j]); + } + } } +/* + * Check that an EFI GPT is valid. This function should be called with a raw + * EFI GPT i.e. GPT data should be in little endian format as indicated in the + * EFI specification and they should not have been swapped to match with the + * system endianness. + */ static int -vd_efi_check_label(vd_efi_dev_t *dev, dk_efi_t *dk_ioc) +vd_efi_check_gpt(vd_efi_dev_t *dev, efi_gpt_t *gpt) { - efi_gpt_t *efi; - uint_t crc; - int status; + uint_t crc_stored, crc_computed; - if ((status = vd_efi_ioctl(dev, DKIOCGETEFI, dk_ioc)) != 0) - return (status); - - efi = dk_ioc->dki_data; - if (efi->efi_gpt_Signature != LE_64(EFI_SIGNATURE)) { + if (gpt->efi_gpt_Signature != LE_64(EFI_SIGNATURE)) { VD_EFI_DEBUG("Bad EFI signature: 0x%llx != 0x%llx\n", - (long long)efi->efi_gpt_Signature, + (long long)gpt->efi_gpt_Signature, (long long)LE_64(EFI_SIGNATURE)); return (EINVAL); } @@ -197,318 +171,140 @@ * check CRC of the header; the size of the header should * never be larger than one block */ - crc = efi->efi_gpt_HeaderCRC32; - efi->efi_gpt_HeaderCRC32 = 0; + if (LE_32(gpt->efi_gpt_HeaderSize) > dev->block_size) { + VD_EFI_DEBUG("Header size (%u bytes) larger than one block" + "(%u bytes)\n", LE_32(gpt->efi_gpt_HeaderSize), + dev->block_size); + return (EINVAL); + } - if (((len_t)LE_32(efi->efi_gpt_HeaderSize) > dk_ioc->dki_length) || - crc != LE_32(vd_efi_crc32((unsigned char *)efi, - LE_32(efi->efi_gpt_HeaderSize)))) { + crc_stored = LE_32(gpt->efi_gpt_HeaderCRC32); + gpt->efi_gpt_HeaderCRC32 = LE_32(0); + crc_computed = vd_efi_crc32((unsigned char *)gpt, + LE_32(gpt->efi_gpt_HeaderSize)); + gpt->efi_gpt_HeaderCRC32 = LE_32(crc_stored); + + if (crc_stored != crc_computed) { VD_EFI_DEBUG("Bad EFI CRC: 0x%x != 0x%x\n", - crc, LE_32(vd_efi_crc32((unsigned char *)efi, - sizeof (struct efi_gpt)))); + crc_stored, crc_computed); return (EINVAL); } return (0); } -static int -vd_efi_read(vd_efi_dev_t *dev, struct dk_gpt *vtoc) +/* + * Allocate and read the EFI GPT and GPE from the disk backend. Note that the + * on-disk GPT and GPE are stored in little endian format but this function + * returns them using the endianness of the system so that any field in the + * GPT/GPE structures can be directly accessible without any further conversion. + * The caller is responsible for freeing the allocated structures by calling + * vd_efi_free(). + */ +int +vd_efi_alloc_and_read(vd_efi_dev_t *dev, efi_gpt_t **efi_gpt, + efi_gpe_t **efi_gpe) { - int i, j, status; - int label_len; - int md_flag = 0; - struct dk_minfo disk_info; - dk_efi_t dk_ioc; - efi_gpt_t *efi; - efi_gpe_t *efi_parts; - struct dk_cinfo dki_info; - uint32_t user_length; + dk_efi_t dk_efi; + efi_gpt_t *gpt = NULL; + efi_gpe_t *gpe = NULL; + size_t gpt_len, gpe_len; + int nparts, status; + + ASSERT(dev->block_size >= sizeof (efi_gpt_t)); + gpt_len = dev->block_size; + gpt = kmem_zalloc(gpt_len, KM_SLEEP); /* - * get the partition number for this file descriptor. + * Read the EFI GPT. */ - if ((status = vd_ioctl(dev, DKIOCINFO, &dki_info, VD_IOCTL_FLAGS, - kcred, NULL)) != 0) { - VD_EFI_DEBUG("DKIOCINFO error 0x%x\n", status); - return (status); - } - if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) && - (strncmp(dki_info.dki_dname, "md", 3) == 0)) { - md_flag++; - } - /* get the LBA size */ - if ((status = vd_ioctl(dev, DKIOCGMEDIAINFO, &disk_info, VD_IOCTL_FLAGS, - kcred, NULL)) != 0) { - VD_EFI_DEBUG("assuming LBA 512 bytes %d\n", status); - disk_info.dki_lbsize = DEV_BSIZE; - } - if (disk_info.dki_lbsize == 0) { - VD_EFI_DEBUG("efi_read: assuming LBA 512 bytes\n"); - disk_info.dki_lbsize = DEV_BSIZE; - } - /* - * Read the EFI GPT to figure out how many partitions we need - * to deal with. - */ - dk_ioc.dki_lba = 1; - if (NBLOCKS(vtoc->efi_nparts, disk_info.dki_lbsize) < 34) { - label_len = EFI_MIN_ARRAY_SIZE + disk_info.dki_lbsize; - } else { - label_len = vtoc->efi_nparts * (int) sizeof (efi_gpe_t) + - disk_info.dki_lbsize; - if (label_len % disk_info.dki_lbsize) { - /* pad to physical sector size */ - label_len += disk_info.dki_lbsize; - label_len &= ~(disk_info.dki_lbsize - 1); - } + dk_efi.dki_lba = 1; + dk_efi.dki_data = gpt; + dk_efi.dki_length = gpt_len; + + if ((status = vd_efi_ioctl(dev, DKIOCGETEFI, &dk_efi)) != 0) { + VD_EFI_DEBUG("DKIOCGETEFI (GPT, LBA=1) error %d\n", status); + goto errdone; } - dk_ioc.dki_data = kmem_alloc(label_len, KM_SLEEP); - dk_ioc.dki_length = label_len; - user_length = vtoc->efi_nparts; - efi = dk_ioc.dki_data; - if (md_flag) { - if ((status = vd_efi_ioctl(dev, DKIOCGETEFI, &dk_ioc)) != 0) - return (status); - } else if ((status = vd_efi_check_label(dev, &dk_ioc)) == EINVAL) { - /* no valid label here; try the alternate */ - dk_ioc.dki_lba = disk_info.dki_capacity - 1; - dk_ioc.dki_length = disk_info.dki_lbsize; - if (vd_efi_check_label(dev, &dk_ioc) == 0) { - VD_EFI_DEBUG("efi_read: primary label corrupt; " - "using backup\n"); - dk_ioc.dki_lba = LE_64(efi->efi_gpt_PartitionEntryLBA); - vtoc->efi_flags |= EFI_GPT_PRIMARY_CORRUPT; - vtoc->efi_nparts = - LE_32(efi->efi_gpt_NumberOfPartitionEntries); - /* - * partitions are between last usable LBA and - * backup partition header - */ - dk_ioc.dki_data++; - dk_ioc.dki_length = disk_info.dki_capacity - - dk_ioc.dki_lba - 1; - dk_ioc.dki_length *= disk_info.dki_lbsize; - if (dk_ioc.dki_length > (len_t)label_len) { - status = EINVAL; - } else { - status = vd_efi_ioctl(dev, DKIOCGETEFI, - &dk_ioc); - } + if ((status = vd_efi_check_gpt(dev, gpt)) != 0) { + /* + * No valid label here; try the alternate. The alternate GPT is + * located in the last block of the disk. + */ + dk_efi.dki_lba = dev->disk_size - 1; + dk_efi.dki_data = gpt; + dk_efi.dki_length = gpt_len; + + if ((status = vd_efi_ioctl(dev, DKIOCGETEFI, &dk_efi)) != 0) { + VD_EFI_DEBUG("DKIOCGETEFI (LBA=%lu) error %d\n", + dev->disk_size - 1, status); + goto errdone; } - } - if (status != 0) { - kmem_free(efi, label_len); - return (status); + + if ((status = vd_efi_check_gpt(dev, gpt)) != 0) + goto errdone; + + VD_EFI_DEBUG("efi_read: primary label corrupt; using backup\n"); } - /* partitions start in the next block */ - /* LINTED -- always longlong aligned */ - efi_parts = (efi_gpe_t *)(((char *)efi) + disk_info.dki_lbsize); + /* swap GPT data after checking the GPT is valid */ + vd_efi_swap_gpt(gpt); /* - * Assemble this into a "dk_gpt" struct for easier - * digestibility by applications. + * Read the EFI GPE. */ - vtoc->efi_version = LE_32(efi->efi_gpt_Revision); - vtoc->efi_nparts = LE_32(efi->efi_gpt_NumberOfPartitionEntries); - vtoc->efi_part_size = LE_32(efi->efi_gpt_SizeOfPartitionEntry); - vtoc->efi_lbasize = disk_info.dki_lbsize; - vtoc->efi_last_lba = disk_info.dki_capacity - 1; - vtoc->efi_first_u_lba = LE_64(efi->efi_gpt_FirstUsableLBA); - vtoc->efi_last_u_lba = LE_64(efi->efi_gpt_LastUsableLBA); - UUID_LE_CONVERT(vtoc->efi_disk_uguid, efi->efi_gpt_DiskGUID); + nparts = gpt->efi_gpt_NumberOfPartitionEntries; - /* - * If the array the user passed in is too small, set the length - * to what it needs to be and return - */ - if (user_length < vtoc->efi_nparts) { - kmem_free(efi, label_len); - return (EINVAL); + if (nparts > NDKMAP + 1) { + VD_EFI_DEBUG("Too many EFI partitions (%u)", nparts); + status = EINVAL; + goto errdone; + } + + if (nparts == 0) { + VD_EFI_DEBUG("No partition defined"); + status = EINVAL; + goto errdone; } - for (i = 0; i < vtoc->efi_nparts; i++) { + gpe_len = VD_EFI_GPE_LEN(dev, nparts); + gpe = kmem_zalloc(gpe_len, KM_SLEEP); - UUID_LE_CONVERT(vtoc->efi_parts[i].p_guid, - efi_parts[i].efi_gpe_PartitionTypeGUID); - - for (j = 0; - j < sizeof (conversion_array) / sizeof (struct uuid_to_ptag); - j++) { + dk_efi.dki_lba = gpt->efi_gpt_PartitionEntryLBA; + dk_efi.dki_data = (efi_gpt_t *)gpe; + dk_efi.dki_length = gpe_len; - if (bcmp(&vtoc->efi_parts[i].p_guid, - &conversion_array[j].uuid, - sizeof (struct uuid)) == 0) { - vtoc->efi_parts[i].p_tag = j; - break; - } - } - if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) - continue; - vtoc->efi_parts[i].p_flag = - LE_16(efi_parts[i].efi_gpe_Attributes.PartitionAttrs); - vtoc->efi_parts[i].p_start = - LE_64(efi_parts[i].efi_gpe_StartingLBA); - vtoc->efi_parts[i].p_size = - LE_64(efi_parts[i].efi_gpe_EndingLBA) - - vtoc->efi_parts[i].p_start + 1; - for (j = 0; j < EFI_PART_NAME_LEN; j++) { - vtoc->efi_parts[i].p_name[j] = - (uchar_t)LE_16(efi_parts[i].efi_gpe_PartitionName[j]); - } + if ((status = vd_efi_ioctl(dev, DKIOCGETEFI, &dk_efi)) != 0) { + VD_EFI_DEBUG("DKIOCGETEFI (GPE, LBA=%lu) error %d\n", + gpt->efi_gpt_PartitionEntryLBA, status); + goto errdone; + } - UUID_LE_CONVERT(vtoc->efi_parts[i].p_uguid, - efi_parts[i].efi_gpe_UniquePartitionGUID); - } - kmem_free(efi, label_len); + vd_efi_swap_gpe(gpe, nparts); + + *efi_gpt = gpt; + *efi_gpe = gpe; return (0); + +errdone: + + if (gpe != NULL) + kmem_free(gpe, gpe_len); + if (gpt != NULL) + kmem_free(gpt, gpt_len); + + return (status); } /* - * Read EFI - return 0 upon success. - */ -static int -vd_efi_alloc_and_read(vd_efi_dev_t *dev, struct dk_gpt **vtoc, size_t *vtoc_len) -{ - int status; - uint32_t nparts; - int length; - - /* figure out the number of entries that would fit into 16K */ - nparts = EFI_MIN_ARRAY_SIZE / sizeof (efi_gpe_t); - length = (int) sizeof (struct dk_gpt) + - (int) sizeof (struct dk_part) * (nparts - 1); - - *vtoc = kmem_zalloc(length, KM_SLEEP); - (*vtoc)->efi_nparts = nparts; - status = vd_efi_read(dev, *vtoc); - - if ((status == EINVAL) && (*vtoc)->efi_nparts > nparts) { - kmem_free(*vtoc, length); - length = (int) sizeof (struct dk_gpt) + - (int) sizeof (struct dk_part) * - ((*vtoc)->efi_nparts - 1); - nparts = (*vtoc)->efi_nparts; - *vtoc = kmem_alloc(length, KM_SLEEP); - status = vd_efi_read(dev, *vtoc); - } - - if (status != 0) { - VD_EFI_DEBUG("read of EFI table failed with error=%d\n", - status); - kmem_free(*vtoc, length); - *vtoc = NULL; - *vtoc_len = 0; - return (status); - } - - *vtoc_len = length; - return (0); -} - -int -vdc_efi_alloc_and_read(dev_t dev, struct dk_gpt **vtoc, size_t *vtoc_len) -{ - vd_efi_dev_t efi_dev; - - ASSERT(vdc_ioctl_func != NULL); - - efi_dev.caller = VD_EFI_CALLER_VDC; - efi_dev.ioctl_dev.vdc = dev; - - return (vd_efi_alloc_and_read(&efi_dev, vtoc, vtoc_len)); -} - -int -vds_efi_alloc_and_read(ldi_handle_t dev, struct dk_gpt **vtoc, size_t *vtoc_len) -{ - vd_efi_dev_t efi_dev; - - efi_dev.caller = VD_EFI_CALLER_VDS; - efi_dev.ioctl_dev.vds = dev; - - return (vd_efi_alloc_and_read(&efi_dev, vtoc, vtoc_len)); -} - -void -vd_efi_free(struct dk_gpt *ptr, size_t length) -{ - kmem_free(ptr, length); -} - -void -vdc_efi_init(int (*func)(dev_t, int, caddr_t, int)) -{ - vdc_ioctl_func = func; -} - -void -vdc_efi_fini(void) -{ - vdc_ioctl_func = NULL; -} - -/* - * This function stores EFI data (as returned by efi_alloc_and_read()) into - * a vtoc structure. The vDisk driver uses a vtoc structure to store generic - * information about disk partitions. + * Free the EFI GPE and GPT structures returned by vd_efi_alloc_and_read(). */ void -vd_efi_to_vtoc(struct dk_gpt *efi, struct vtoc *vtoc) +vd_efi_free(vd_efi_dev_t *dev, efi_gpt_t *gpt, efi_gpe_t *gpe) { - int i, nparts; - - bzero(vtoc, sizeof (struct vtoc)); - - vtoc->v_sanity = VTOC_SANE; - - nparts = efi->efi_nparts; - for (i = 0; i < nparts; i++) { - if (efi->efi_parts[i].p_tag != V_RESERVED) - continue; - bcopy(efi->efi_parts[i].p_name, vtoc->v_volume, - LEN_DKL_VVOL); - bcopy(efi->efi_parts[i].p_name, vtoc->v_asciilabel, - EFI_PART_NAME_LEN); - break; - } - - vtoc->v_sectorsz = efi->efi_lbasize; - vtoc->v_nparts = nparts; - for (i = 0; i < nparts; i++) { - /* - * EFI can have more than 8 partitions. However the current - * implementation of EFI on Solaris only support 7 partitions - * (s0 to s6). There is no partition s7 but the minor number - * corresponding to slice 7 is used to represent the whole - * disk which data are stored in the "Sun Reserved" partition. - * So we use the entry 7 of the vtoc structure to store - * information about the whole disk. - */ - if (efi->efi_parts[i].p_tag == V_RESERVED) { - vtoc->v_part[VD_EFI_WD_SLICE].p_tag = - efi->efi_parts[i].p_tag; - vtoc->v_part[VD_EFI_WD_SLICE].p_flag = - efi->efi_parts[i].p_flag; - vtoc->v_part[VD_EFI_WD_SLICE].p_start = - efi->efi_parts[i].p_start; - vtoc->v_part[VD_EFI_WD_SLICE].p_size = - efi->efi_parts[i].p_size; - continue; - } - - if (i >= VD_EFI_WD_SLICE) { - continue; - } - - vtoc->v_part[i].p_tag = efi->efi_parts[i].p_tag; - if (efi->efi_parts[i].p_tag != V_UNASSIGNED) { - vtoc->v_part[i].p_flag = efi->efi_parts[i].p_flag; - vtoc->v_part[i].p_start = efi->efi_parts[i].p_start; - vtoc->v_part[i].p_size = efi->efi_parts[i].p_size; - } - } + kmem_free(gpe, VD_EFI_GPE_LEN(dev, + gpt->efi_gpt_NumberOfPartitionEntries)); + kmem_free(gpt, dev->block_size); }
--- a/usr/src/uts/sun4v/sys/vdc.h Tue Jan 22 07:06:39 2008 -0800 +++ b/usr/src/uts/sun4v/sys/vdc.h Tue Jan 22 09:22:05 2008 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -263,6 +263,7 @@ vd_disk_label_t vdisk_label; /* label type of device/disk imported */ struct vtoc *vtoc; /* structure to store VTOC data */ struct dk_geom *geom; /* structure to store geometry data */ + vd_slice_t slice[V_NUMPAR]; /* logical partitions */ kthread_t *msg_proc_thr; /* main msg processing thread */
--- a/usr/src/uts/sun4v/sys/vdsk_common.h Tue Jan 22 07:06:39 2008 -0800 +++ b/usr/src/uts/sun4v/sys/vdsk_common.h Tue Jan 22 09:22:05 2008 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -216,6 +216,14 @@ vd_dring_payload_t payload; /* disk specific data */ } vd_dring_entry_t; +/* + * vDisk logical partition + */ +typedef struct vd_slice { + daddr_t start; /* block number of slice start */ + daddr_t nblocks; /* number of blocks in the slice */ +} vd_slice_t; + /* * vDisk control operation structures @@ -493,22 +501,28 @@ * The EFI alloc_and_read() function will use some ioctls to get EFI data * but the device reference we will use is different depending if the command * is issued from the vDisk server side (vds) or from the vDisk client side - * (vdc). From the server side (vds), we will have a layered device reference - * (ldi_handle_t) while on the client side (vdc) we will have a regular device - * reference (dev_t). + * (vdc). The vd_efi_dev structure is filled by vdc/vds to indicate the ioctl + * function to call back and to provide information about the virtual disk. */ -#ifdef _SUN4V_VDS -int vds_efi_alloc_and_read(ldi_handle_t dev, struct dk_gpt **vtoc, - size_t *vtoc_len); -#else -void vdc_efi_init(int (*func)(dev_t, int, caddr_t, int)); -void vdc_efi_fini(void); -int vdc_efi_alloc_and_read(dev_t dev, struct dk_gpt **vtoc, - size_t *vtoc_len); -#endif +typedef int (*vd_efi_ioctl_func)(void *, int, uintptr_t); + +typedef struct vd_efi_dev { + void *vdisk; /* opaque pointer to the vdisk */ + size_t block_size; /* vdisk block size */ + size_t disk_size; /* vdisk size in blocks */ + vd_efi_ioctl_func vdisk_ioctl; /* vdisk ioctl function */ +} vd_efi_dev_t; -void vd_efi_free(struct dk_gpt *ptr, size_t length); -void vd_efi_to_vtoc(struct dk_gpt *efi, struct vtoc *vtoc); +#define VD_EFI_DEV_SET(efi_dev, vdsk, ioctl) \ + (efi_dev).vdisk = vdsk; \ + (efi_dev).vdisk_ioctl = ioctl; \ + (efi_dev).block_size = (vdsk)->block_size; \ + (efi_dev).disk_size = (vdsk)->vdisk_size; + + +int vd_efi_alloc_and_read(vd_efi_dev_t *dev, efi_gpt_t **gpt, efi_gpe_t **gpe); +void vd_efi_free(vd_efi_dev_t *dev, efi_gpt_t *gpt, efi_gpe_t *gpe); + #ifdef __cplusplus }
--- a/usr/src/uts/sun4v/vds/Makefile Tue Jan 22 07:06:39 2008 -0800 +++ b/usr/src/uts/sun4v/vds/Makefile Tue Jan 22 09:22:05 2008 -0800 @@ -20,7 +20,7 @@ # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # uts/sun4v/vds/Makefile @@ -68,7 +68,6 @@ # lint pass one enforcement # CFLAGS += $(CCVERBOSE) -CPPFLAGS += -D_SUN4V_VDS # # module dependencies