Mercurial > illumos > illumos-gate
view usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c @ 13592:d4e1700ca091
2039 several declarations in uts declare functions with variable number of args even if they are not
Reviewed by: Garrett D'Amore <garrett@damore.org>
Approved by: Richard Lowe <richlowe@richlowe.net>
author | Milan Jurik <milan.jurik@xylab.cz> |
---|---|
date | Fri, 03 Feb 2012 20:27:13 +0100 |
parents | c686aa11575c |
children |
line wrap: on
line source
/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2012 Milan Jurik. All rights reserved. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/conf.h> #include <sys/file.h> #include <sys/user.h> #include <sys/uio.h> #include <sys/t_lock.h> #include <sys/buf.h> #include <sys/dkio.h> #include <sys/vtoc.h> #include <sys/kmem.h> #include <vm/page.h> #include <sys/sysmacros.h> #include <sys/types.h> #include <sys/mkdev.h> #include <sys/stat.h> #include <sys/open.h> #include <sys/modctl.h> #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/lvm/mdvar.h> #include <sys/lvm/md_names.h> #include <sys/lvm/md_mddb.h> #include <sys/lvm/md_stripe.h> #include <sys/lvm/md_mirror.h> #include <sys/model.h> #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/svm.h> #include <sys/lvm/mdmn_commd.h> extern int md_status; extern kmutex_t md_mx; extern kcondvar_t md_cv; extern unit_t md_nunits; extern set_t md_nsets; extern md_set_t md_set[]; extern md_ops_t mirror_md_ops; extern int md_ioctl_cnt; extern md_krwlock_t md_unit_array_rw; extern major_t md_major; extern mdq_anchor_t md_ff_daemonq; extern void 
md_probe_one(probe_req_t *); extern void mirror_openfail_console_info(mm_unit_t *, int, int); #ifdef DEBUG extern int mirror_debug_flag; #endif static void mirror_resume_writes(mm_unit_t *un) { /* * Release the block on writes to the mirror and resume any blocked * resync thread. * This is only required for MN sets */ if (MD_MNSET_SETNO(MD_UN2SET(un))) { #ifdef DEBUG if (mirror_debug_flag) printf("mirror_resume_writes: mnum %x\n", MD_SID(un)); #endif mutex_enter(&un->un_suspend_wr_mx); un->un_suspend_wr_flag = 0; cv_broadcast(&un->un_suspend_wr_cv); mutex_exit(&un->un_suspend_wr_mx); mutex_enter(&un->un_rs_thread_mx); un->un_rs_thread_flags &= ~MD_RI_BLOCK; cv_signal(&un->un_rs_thread_cv); mutex_exit(&un->un_rs_thread_mx); } } mm_unit_t * mirror_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock) { mm_unit_t *un; mdi_unit_t *ui; set_t setno = MD_MIN2SET(mnum); if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) { (void) mdmderror(mde, MDE_INVAL_UNIT, mnum); return (NULL); } if (!(flags & STALE_OK)) { if (md_get_setstatus(setno) & MD_SET_STALE) { (void) mdmddberror(mde, MDE_DB_STALE, mnum, setno); return (NULL); } } ui = MDI_UNIT(mnum); if (flags & NO_OLD) { if (ui != NULL) { (void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum); return (NULL); } return ((mm_unit_t *)1); } if (ui == NULL) { (void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum); return (NULL); } if (flags & ARRAY_WRITER) md_array_writer(lock); else if (flags & ARRAY_READER) md_array_reader(lock); if (!(flags & NO_LOCK)) { if (flags & WR_LOCK) (void) md_ioctl_writerlock(lock, ui); else /* RD_LOCK */ (void) md_ioctl_readerlock(lock, ui); } un = (mm_unit_t *)MD_UNIT(mnum); if (un->c.un_type != MD_METAMIRROR) { (void) mdmderror(mde, MDE_NOT_MM, mnum); return (NULL); } return (un); } static int mirror_set( void *d, int mode ) { minor_t mnum; mm_unit_t *un; mddb_recid_t recid; mddb_type_t typ1; int err; int i; set_t setno; md_set_params_t *msp = d; mnum = msp->mnum; mdclrerror(&msp->mde); if 
(mirror_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL) return (0); setno = MD_MIN2SET(mnum); typ1 = (mddb_type_t)md_getshared_key(setno, mirror_md_ops.md_driver.md_drivername); /* * Create the db record for this mdstruct * We don't store incore elements ondisk */ if (msp->options & MD_CRO_64BIT) { #if defined(_ILP32) return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum)); #else recid = mddb_createrec((size_t)msp->size, typ1, MIRROR_REC, MD_CRO_64BIT | MD_CRO_MIRROR | MD_CRO_FN, setno); #endif } else { /* * It's important to use the correct size here */ msp->size = sizeof (mm_unit32_od_t); recid = mddb_createrec((size_t)msp->size, typ1, MIRROR_REC, MD_CRO_32BIT | MD_CRO_MIRROR | MD_CRO_FN, setno); } if (recid < 0) return (mddbstatus2error(&msp->mde, (int)recid, mnum, setno)); /* Resize to include incore fields */ un = (mm_unit_t *)mddb_getrecaddr_resize(recid, sizeof (*un), 0); /* * It is okay that we muck with the mdstruct here, * since no one else will know about the mdstruct * until we commit it. If we crash, the record will * be automatically purged, since we haven't * committed it yet. */ /* copy in the user's mdstruct */ if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un, (uint_t)msp->size, mode)) { mddb_deleterec_wrapper(recid); return (EFAULT); } /* All 64 bit metadevices only support EFI labels. 
*/ if (msp->options & MD_CRO_64BIT) { un->c.un_flag |= MD_EFILABEL; } un->c.un_revision |= MD_FN_META_DEV; MD_RECID(un) = recid; MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_META_CHILD | MD_CAN_SP; MD_PARENT(un) = MD_NO_PARENT; for (i = 0; i < NMIRROR; i++) { struct mm_submirror *sm; sm = &un->un_sm[i]; if (!SMS_IS(sm, SMS_INUSE)) continue; /* ensure that the submirror is a metadevice */ if (md_getmajor(sm->sm_dev) != md_major) return (mdmderror(&msp->mde, MDE_INVAL_UNIT, md_getminor(sm->sm_dev))); if (md_get_parent(sm->sm_dev) == MD_NO_PARENT) continue; /* mirror creation should fail here */ md_nblocks_set(mnum, -1ULL); MD_UNIT(mnum) = NULL; mddb_deleterec_wrapper(recid); return (mdmderror(&msp->mde, MDE_IN_USE, md_getminor(sm->sm_dev))); } if (err = mirror_build_incore(un, 0)) { md_nblocks_set(mnum, -1ULL); MD_UNIT(mnum) = NULL; mddb_deleterec_wrapper(recid); return (err); } /* * Update unit availability */ md_set[setno].s_un_avail--; mirror_commit(un, ALL_SUBMIRRORS, 0); md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0); mirror_check_failfast(mnum); SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno, MD_SID(un)); resync_start_timeout(setno); return (0); } static int mirror_get( void *migp, int mode, IOLOCK *lock ) { mm_unit_t *un; md_i_get_t *migph = migp; mdclrerror(&migph->mde); if ((un = mirror_getun(migph->id, &migph->mde, RD_LOCK, lock)) == NULL) return (0); if (migph->size == 0) { migph->size = un->c.un_size; return (0); } if (migph->size < un->c.un_size) { return (EFAULT); } if (ddi_copyout(un, (caddr_t)(uintptr_t)migph->mdp, un->c.un_size, mode)) return (EFAULT); return (0); } static int mirror_getdevs( void *mgdp, int mode, IOLOCK *lock ) { mm_unit_t *un; md_dev64_t *udevs; int cnt; int i; md_dev64_t unit_dev; md_getdevs_params_t *mgdph = mgdp; mdclrerror(&mgdph->mde); if ((un = mirror_getun(mgdph->mnum, &mgdph->mde, RD_LOCK, lock)) == NULL) return (0); udevs = (md_dev64_t *)(uintptr_t)mgdph->devs; for (cnt = 0, i = 0; i < NMIRROR; i++) { if 
(!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) continue; if (cnt < mgdph->cnt) { unit_dev = un->un_sm[i].sm_dev; if (md_getmajor(unit_dev) != md_major) { unit_dev = md_xlate_mini_2_targ(unit_dev); if (unit_dev == NODEV64) return (ENODEV); } if (ddi_copyout((caddr_t)&unit_dev, (caddr_t)udevs, sizeof (*udevs), mode) != 0) return (EFAULT); ++udevs; } ++cnt; } mgdph->cnt = cnt; return (0); } static int mirror_reset( md_i_reset_t *mirp ) { minor_t mnum = mirp->mnum; mm_unit_t *un; mdi_unit_t *ui; set_t setno = MD_MIN2SET(mnum); mdclrerror(&mirp->mde); if ((un = mirror_getun(mnum, &mirp->mde, NO_LOCK, NULL)) == NULL) return (0); if (MD_HAS_PARENT(un->c.un_parent)) { return (mdmderror(&mirp->mde, MDE_IN_USE, mnum)); } rw_enter(&md_unit_array_rw.lock, RW_WRITER); /* single thread */ ui = MDI_UNIT(mnum); (void) md_unit_openclose_enter(ui); if (md_unit_isopen(ui)) { md_unit_openclose_exit(ui); rw_exit(&md_unit_array_rw.lock); return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum)); } md_unit_openclose_exit(ui); if (!mirp->force) { int smi; for (smi = 0; smi < NMIRROR; smi++) { if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) continue; if (!SMS_BY_INDEX_IS(un, smi, SMS_RUNNING)) { rw_exit(&md_unit_array_rw.lock); return (mdmderror(&mirp->mde, MDE_C_WITH_INVAL_SM, mnum)); } } } reset_mirror(un, mnum, 1); /* * Update unit availability */ md_set[setno].s_un_avail++; /* * If MN set, reset s_un_next so all nodes can have * the same view of the next available slot when * nodes are -w and -j */ if (MD_MNSET_SETNO(setno)) { (void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum)); } rw_exit(&md_unit_array_rw.lock); return (0); } static int mirror_get_geom( mm_unit_t *un, struct dk_geom *geomp ) { md_get_geom((md_unit_t *)un, geomp); return (0); } static int mirror_get_vtoc( mm_unit_t *un, struct vtoc *vtocp ) { md_get_vtoc((md_unit_t *)un, vtocp); return (0); } static int mirror_set_vtoc( mm_unit_t *un, struct vtoc *vtocp ) { return (md_set_vtoc((md_unit_t *)un, vtocp)); } static int mirror_get_extvtoc( 
mm_unit_t *un, struct extvtoc *vtocp ) { md_get_extvtoc((md_unit_t *)un, vtocp); return (0); } static int mirror_set_extvtoc( mm_unit_t *un, struct extvtoc *vtocp ) { return (md_set_extvtoc((md_unit_t *)un, vtocp)); } static int mirror_get_cgapart( mm_unit_t *un, struct dk_map *dkmapp ) { md_get_cgapart((md_unit_t *)un, dkmapp); return (0); } static int mirror_getcomp_by_dev(mm_unit_t *un, replace_params_t *params, int *smi, int *cip) { mm_submirror_t *sm; mm_submirror_ic_t *smic; ms_comp_t *comp; ms_unit_t *mous; int ci; int i; int compcnt; ms_cd_info_t cd; void (*get_dev)(); md_dev64_t dev = md_expldev(params->old_dev); md_error_t *ep = ¶ms->mde; minor_t mnum = params->mnum; mdkey_t devkey; int nkeys; set_t setno; side_t side; setno = MD_MIN2SET(MD_SID(un)); side = mddb_getsidenum(setno); if (md_getkeyfromdev(setno, side, dev, &devkey, &nkeys) != 0) return (mddeverror(ep, MDE_NAME_SPACE, dev)); for (i = 0; i < NMIRROR; i++) { sm = &un->un_sm[i]; smic = &un->un_smic[i]; if (!SMS_IS(sm, SMS_INUSE)) continue; get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, "get device", 0); compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); /* * For each of the underlying stripe components get * the info. */ for (ci = 0; ci < compcnt; ci++) { (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); if ((cd.cd_dev == dev) || (cd.cd_orig_dev == dev)) { *cip = ci; *smi = i; return (1); } } /* * now we rescan looking only for NODEV. If we find * NODEV then we will check the keys to see if its a match. * * If no key was found to match dev, then there is * no way to compare keys - so continue. 
*/ if (nkeys == 0) { continue; } mous = MD_UNIT(md_getminor(sm->sm_dev)); for (ci = 0; ci < compcnt; ci++) { comp = (struct ms_comp *) ((void *)&((char *)mous)[mous->un_ocomp]); (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); if (cd.cd_dev == NODEV64 || cd.cd_orig_dev == NODEV64) { comp += ci; if (comp->un_key == devkey) { if (nkeys > 1) { return (mddeverror( ep, MDE_MULTNM, dev)); } *cip = ci; *smi = i; return (1); } } } } return (mdcomperror(ep, MDE_CANT_FIND_COMP, mnum, dev)); } /* * comp_replace: * ---------------- * Called to implement the component replace function * * Owner is returned in the parameter block passed in by the caller. * * Returns: * 0 success * error code if the functions fails * * For a MN set, on entry all writes to the mirror are suspended, on exit * from this function, writes must be resumed when not a dryrun. */ static int comp_replace( replace_params_t *params, IOLOCK *lock ) { minor_t mnum = params->mnum; set_t setno; side_t side; mm_unit_t *un; mdi_unit_t *ui; ms_unit_t *ms_un; mdi_unit_t *ms_ui; ms_comp_t *comp; mm_submirror_t *sm; md_dev64_t smdev; mddb_recid_t recids[6]; /* recids for stripe on SP */ int smi, ci; ms_new_dev_t nd; int (*repl_dev)(); void (*repl_done)(); void *repl_data; int err = 0; ms_cd_info_t cd; void (*get_dev)(); mdclrerror(¶ms->mde); if ((un = mirror_getun(mnum, ¶ms->mde, WRITERS, lock)) == NULL) { return (0); } ui = MDI_UNIT(mnum); if (ui->ui_tstate & MD_INACCESSIBLE) { (void) mdmderror(¶ms->mde, MDE_IN_UNAVAIL_STATE, mnum); goto errexit; } /* * replace cannot be done while a resync is active or we are * still waiting for an optimized resync to be started */ if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) { (void) mdmderror(¶ms->mde, MDE_RESYNC_ACTIVE, mnum); goto errexit; } if (mirror_getcomp_by_dev(un, params, &smi, &ci) == 0) { goto errexit; } if (un->un_nsm == 1) { (void) mdmderror(¶ms->mde, MDE_LAST_SM_RE, mnum); goto errexit; } if (mirror_other_sources(un, smi, ci, 0) != 0) { (void) 
mdcomperror(¶ms->mde, MDE_REPL_INVAL_STATE, mnum, md_expldev(params->old_dev)); goto errexit; } sm = &un->un_sm[smi]; if (sm->sm_state & (SMS_OFFLINE | SMS_OFFLINE_RESYNC)) { (void) mdmderror(¶ms->mde, MDE_ILLEGAL_SM_STATE, mnum); goto errexit; } get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, "get device", 0); (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); repl_dev = (int (*)())md_get_named_service(sm->sm_dev, 0, "replace device", 0); smdev = sm->sm_dev; ms_un = MD_UNIT(md_getminor(smdev)); if (params->cmd == ENABLE_COMP) { md_dev64_t this_dev; int numkeys; mdkey_t this_key; this_dev = ((cd.cd_orig_dev == 0) ? cd.cd_dev : cd.cd_orig_dev); setno = MD_MIN2SET(md_getminor(smdev)); side = mddb_getsidenum(setno); comp = (struct ms_comp *) ((void *)&((char *)ms_un)[ms_un->un_ocomp]); comp += ci; /* * We trust the dev_t because we cannot determine the * dev_t from the device id since a new disk is in the * same location. Since this is a call from metareplace -e dx * AND it is SCSI a new dev_t is not generated. So the * dev_t from the mddb is used. Before enabling the device * we check to make sure that multiple entries for the same * device does not exist in the namespace. If they do we * fail the ioctl. * One of the many ways multiple entries in the name space * can occur is if one removed the failed component in the * stripe of a mirror and put another disk that was part of * another metadevice. After reboot metadevadm would correctly * update the device name for the metadevice whose component * has moved. However now in the metadb there are two entries * for the same name (ctds) that belong to different * metadevices. One is valid, the other is a ghost or "last * know as" ctds. 
*/ this_dev = md_getdevnum(setno, side, comp->un_key, MD_TRUST_DEVT); /* * Verify that multiple keys for the same * dev_t don't exist */ if (md_getkeyfromdev(setno, side, this_dev, &this_key, &numkeys) != 0) { (void) mddeverror(¶ms->mde, MDE_NAME_SPACE, md_expldev(params->old_dev)); goto errexit; } /* * Namespace has multiple entries * for the same devt */ if (numkeys > 1) { (void) mddeverror(¶ms->mde, MDE_MULTNM, md_expldev(params->old_dev)); goto errexit; } if ((numkeys == 0) || (comp->un_key != this_key)) { (void) mdcomperror(¶ms->mde, MDE_CANT_FIND_COMP, mnum, this_dev); goto errexit; } if ((md_getmajor(this_dev) != md_major) && (md_devid_found(setno, side, this_key) == 1)) { if (md_update_namespace_did(setno, side, this_key, ¶ms->mde) != 0) { (void) mddeverror(¶ms->mde, MDE_NAME_SPACE, this_dev); goto errexit; } } if (md_expldev(params->new_dev) != this_dev) { (void) mddeverror(¶ms->mde, MDE_FIX_INVAL_STATE, md_expldev(params->new_dev)); goto errexit; } /* in case of dryrun, don't actually do anything */ if ((params->options & MDIOCTL_DRYRUN) == 0) { err = (*repl_dev)(sm->sm_dev, 0, ci, NULL, recids, 6, &repl_done, &repl_data); } } else if ((params->options & MDIOCTL_DRYRUN) == 0) { nd.nd_dev = md_expldev(params->new_dev); nd.nd_key = params->new_key; nd.nd_start_blk = params->start_blk; nd.nd_nblks = params->number_blks; nd.nd_labeled = params->has_label; nd.nd_hs_id = 0; err = (*repl_dev)(sm->sm_dev, 0, ci, &nd, recids, 6, &repl_done, &repl_data); } if (err != 0) { (void) mdcomperror(¶ms->mde, err, mnum, md_expldev(params->new_dev)); goto errexit; } /* In case of a dryun we're done. */ if (params->options & MDIOCTL_DRYRUN) { mdclrerror(¶ms->mde); return (0); } /* set_sm_comp_state() commits the modified records */ set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, MD_STATE_NO_XMIT, lock); (*repl_done)(sm->sm_dev, repl_data); /* * If the mirror is open then need to make sure that the submirror, * on which the replace ran, is also open and if not then open it. 
* This is only a concern for a single component sub-mirror stripe * as it may not be open due to the failure of the single component. * * This check has to be done after the call to (*repl_done) * as that function releases the writer lock on the submirror. */ if (md_unit_isopen(ui)) { minor_t ms_mnum = md_getminor(sm->sm_dev); ms_ui = MDI_UNIT(ms_mnum); if (!md_unit_isopen(ms_ui)) { /* * Underlying submirror is not open so open it. */ if (md_layered_open(ms_mnum, &smdev, MD_OFLG_NULL)) { mirror_openfail_console_info(un, smi, ci); goto errexit; } } } mirror_check_failfast(mnum); if (params->cmd == ENABLE_COMP) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } else { SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } md_ioctl_writerexit(lock); /* * Reset any saved resync location flags as we've now replaced the * component. This means we have to resync the _whole_ component. */ un->un_rs_resync_done = un->un_rs_resync_2_do = 0; un->un_rs_type = MD_RS_NONE; mirror_resume_writes(un); if (!MD_MNSET_SETNO(MD_UN2SET(un))) (void) mirror_resync_unit(mnum, NULL, ¶ms->mde, lock); mdclrerror(¶ms->mde); return (0); errexit: /* We need to resume writes unless this is a dryrun */ if (!(params->options & MDIOCTL_DRYRUN)) mirror_resume_writes(un); return (0); } /* * mirror_attach: * ---------------- * Called to implement the submirror attach function * * Owner is returned in the parameter block passed in by the caller. * * Returns: * 0 success * error code if the functions fails * * For a MN set, on entry all writes to the mirror are suspended, on exit * from this function, writes must be resumed when not a dryrun. 
*/
/*
 *	att	attach request: mirror minor (mnum), submirror device
 *		(submirror), its namespace key (key) and options
 *		(MDIOCTL_DRYRUN performs the checks only)
 *	lock	ioctl lock handle
 *
 * As written the function always returns 0; failures are reported to the
 * caller through att->mde.
 */
static int
mirror_attach(
	md_att_struct_t	*att,
	IOLOCK		*lock
)
{
	minor_t			mnum = att->mnum;
	mm_unit_t		*un;
	md_unit_t		*su;
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	int			smi;
	md_dev64_t		sm_dev;
	minor_t			sm_mnum;
	mdkey_t			indx;
	set_t			setno;
	uint_t			options;

	/*
	 * This routine should not be called during upgrade.
	 */
	if (MD_UPGRADE)  {
		return (0);
	}

	mdclrerror(&att->mde);
	options = att->options;

	if ((un = mirror_getun(mnum, &att->mde, WRITERS, lock)) == NULL) {
		return (0);
	}

	setno = MD_UN2SET(un);

	/* find the first free submirror slot */
	for (smi = 0; smi < NMIRROR; smi++)
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
			break;

	if (smi == NMIRROR) {
		(void) mdmderror(&att->mde, MDE_MIRROR_FULL, mnum);
		goto errexit;
	}

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];
	sm_dev = att->submirror;
	sm_mnum = md_getminor(sm_dev);

	/*
	 * The submirror comes straight from the caller and everything below
	 * dereferences its MDI_UNIT()/MD_UNIT() entries, so make sure it is
	 * a metadevice with an in-range minor that has actually been set up
	 * before touching them.
	 */
	if ((md_getmajor(sm_dev) != md_major) ||
	    (MD_MIN2SET(sm_mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(sm_mnum) >= md_nunits)) {
		(void) mdmderror(&att->mde, MDE_INVAL_UNIT, sm_mnum);
		goto errexit;
	}

	if ((MDI_UNIT(sm_mnum) == NULL) || (MD_UNIT(sm_mnum) == NULL)) {
		(void) mdmderror(&att->mde, MDE_UNIT_NOT_SETUP, sm_mnum);
		goto errexit;
	}

	if (md_get_parent(sm_dev) != MD_NO_PARENT) {
		(void) mdmderror(&att->mde, MDE_IN_USE, sm_mnum);
		goto errexit;
	}

	if (md_unit_isopen(MDI_UNIT(sm_mnum))) {
		(void) mdmderror(&att->mde, MDE_IS_OPEN, sm_mnum);
		goto errexit;
	}

	/* Check the size */
	su = (md_unit_t *)MD_UNIT(sm_mnum);
	if (un->c.un_total_blocks > su->c.un_total_blocks) {
		(void) mdmderror(&att->mde, MDE_SM_TOO_SMALL, sm_mnum);
		goto errexit;
	}

	/* Don't attach labeled sm to unlabeled mirrors */
	if ((su->c.un_flag & MD_LABELED) && !(un->c.un_flag & MD_LABELED)) {
		(void) mdmderror(&att->mde, MDE_NO_LABELED_SM, sm_mnum);
		goto errexit;
	}

	indx = md_setshared_name(setno,
	    ddi_major_to_name(md_getmajor(sm_dev)), 0L);

	/* Open the sm, only if the mirror is open */
	if (md_unit_isopen(MDI_UNIT(mnum))) {
		if (md_layered_open(mnum, &sm_dev, MD_OFLG_NULL)) {
			(void) md_remshared_name(setno, indx);
			(void) mdmderror(&att->mde, MDE_SM_OPEN_ERR,
			    md_getminor(att->submirror));
			goto errexit;
		}
		/* in dryrun mode, don't leave the device open */
		if (options & MDIOCTL_DRYRUN) {
			md_layered_close(sm_dev, MD_OFLG_NULL);
		}
	}

	/*
	 * After this point the checks are done and action is taken.
	 * So, clean up and return in case of dryrun.
	 */

	if (options & MDIOCTL_DRYRUN) {
		md_ioctl_writerexit(lock);
		mdclrerror(&att->mde);
		return (0);
	}

	sm->sm_key = att->key;
	sm->sm_dev = sm_dev;
	md_set_parent(sm_dev, MD_SID(un));
	mirror_set_sm_state(sm, smic, SMS_ATTACHED_RESYNC, 1);
	build_submirror(un, smi, 0);
	un->un_nsm++;
	mirror_commit(un, SMI2BIT(smi), 0);
	mirror_check_failfast(mnum);
	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_ATTACH, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));

	mirror_resume_writes(un);
	md_ioctl_writerexit(lock);
	/* outside a MN set the resync is started directly from here */
	if (!MD_MNSET_SETNO(setno))
		(void) mirror_resync_unit(mnum, NULL, &att->mde, lock);
	mdclrerror(&att->mde);
	return (0);
errexit:
	/* We need to resume writes unless this is a dryrun */
	if (!(options & MDIOCTL_DRYRUN))
		mirror_resume_writes(un);
	return (0);
}

/*
 * reset_comp_states:
 * ----------------
 * For every component of the submirror, set the shared state to CS_OKAY,
 * clear MDM_S_NOWRITE and zero the last-error count.  The component count
 * and the per-component shared structures are obtained through the
 * submirror's sm_get_component_count and sm_shared_by_indx entry points.
 */
void
reset_comp_states(mm_submirror_t *sm, mm_submirror_ic_t *smic)
{
	int		compcnt;
	int		i;
	md_m_shared_t	*shared;

	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
	for (i = 0; i < compcnt; i++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, i);

		shared->ms_state = CS_OKAY;
		shared->ms_flags &= ~MDM_S_NOWRITE;
		shared->ms_lasterrcnt = 0;
	}
}

/*
 * mirror_detach:
 * ----------------
 * Called to implement the submirror detach function
 *
 * Owner is returned in the parameter block passed in by the caller.
 *
 * Returns:
 *	0	success
 *	error code if the functions fails
 *
 * For a MN set, on entry all writes to the mirror are suspended, on exit
 * from this function, writes must be resumed.
*/
static int
mirror_detach(
	md_detach_params_t	*det,
	IOLOCK			*lock
)
{
	minor_t			mnum = det->mnum;
	mm_unit_t		*un;
	mdi_unit_t		*ui;
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_unit_t		*su;
	md_dev64_t		sm_dev;
	sv_dev_t		sv;
	mddb_recid_t		recids[2];
	int			nsv = 0;
	int			target;		/* slot being detached */
	int			slot;

	mdclrerror(&det->mde);

	un = mirror_getun(mnum, &det->mde, WRITERS, lock);
	if (un == NULL)
		return (0);

	ui = MDI_UNIT(mnum);
	if (ui->ui_tstate & MD_INACCESSIBLE) {
		mirror_resume_writes(un);
		return (mdmderror(&det->mde, MDE_IN_UNAVAIL_STATE, mnum));
	}

	/*
	 * A detach is refused while a resync is running, or while an
	 * optimized resync is still waiting to be started.
	 */
	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
		mirror_resume_writes(un);
		return (mdmderror(&det->mde, MDE_RESYNC_ACTIVE, mnum));
	}

	/* Find the in-use slot that holds the requested submirror */
	for (target = 0; target < NMIRROR; target++) {
		if (SMS_BY_INDEX_IS(un, target, SMS_INUSE) &&
		    un->un_sm[target].sm_dev == det->submirror)
			break;
	}

	if (target == NMIRROR) {
		mirror_resume_writes(un);
		return (mdmderror(&det->mde, MDE_CANT_FIND_SM, mnum));
	}

	if (un->un_nsm == 1) {
		mirror_resume_writes(un);
		return (mdmderror(&det->mde, MDE_LAST_SM, mnum));
	}

	if (mirror_other_sources(un, target, WHOLE_SM, 0) != 0) {
		mirror_resume_writes(un);
		return (mdmderror(&det->mde, MDE_NO_READABLE_SM, mnum));
	}

	sm = &un->un_sm[target];
	smic = &un->un_smic[target];
	sm_dev = sm->sm_dev;
	su = (md_unit_t *)MD_UNIT(md_getminor(sm_dev));

	/*
	 * The submirror's own record id has to be handed to
	 * mirror_commit() explicitly: once the slot is marked unused the
	 * submirror mask would no longer cause it to be committed.
	 */
	recids[0] = su->c.un_record_id;
	recids[1] = 0;

	mirror_set_sm_state(sm, smic, SMS_UNUSED, det->force_detach);

	/*
	 * If the state did not become SMS_UNUSED there are errored
	 * components: fail the detach and leave the submirror parented.
	 */
	if (sm->sm_state != SMS_UNUSED) {
		(void) mdmderror(&det->mde, MDE_SM_FAILED_COMPS, mnum);
	} else {
		/* reallow soft partitioning of submirror */
		MD_CAPAB(su) |= MD_CAN_SP;
		md_reset_parent(sm_dev);
		reset_comp_states(sm, smic);
		un->un_nsm--;
		/* Close the sm, only if the mirror is open */
		if (md_unit_isopen(MDI_UNIT(mnum)))
			md_layered_close(sm_dev, MD_OFLG_NULL);
		sv.setno = MD_UN2SET(un);
		sv.key = sm->sm_key;
		nsv = 1;
	}

	/*
	 * Perhaps the mirror changed its size due to this detach.
	 * (void) mirror_grow_unit(un, &mde);
	 */

	/*
	 * NOTE: We are passing the detached sm recid
	 * and not the smmask field. This is correct.
	 */
	mirror_commit(un, 0, recids);
	md_rem_names(&sv, nsv);
	if (sm->sm_state == SMS_UNUSED) {
		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DETACH, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
	}

	/*
	 * On a successful detach, close the gap in the submirror arrays by
	 * moving every slot above the detached one down by one position.
	 */
	if (nsv != 0) {
		for (slot = target + 1; slot < NMIRROR; slot++) {
			mm_submirror_t		*from = &un->un_sm[slot];
			mm_submirror_t		*to = &un->un_sm[slot - 1];
			mm_submirror_ic_t	*from_ic = &un->un_smic[slot];
			mm_submirror_ic_t	*to_ic = &un->un_smic[slot - 1];

			to->sm_key = from->sm_key;
			to->sm_dev = from->sm_dev;
			to->sm_state = from->sm_state;
			to->sm_flags = from->sm_flags;
			to->sm_shared = from->sm_shared;
			to->sm_hsp_id = from->sm_hsp_id;
			to->sm_timestamp = from->sm_timestamp;
			bzero(from, sizeof (mm_submirror_t));

			bcopy(from_ic, to_ic, sizeof (mm_submirror_ic_t));
			bzero(from_ic, sizeof (mm_submirror_ic_t));
		}
	}

	mirror_commit(un, 0, NULL);
	mirror_resume_writes(un);
	return (0);
}

/*
 * mirror_offline:
 * ----------------
 * Called to implement the submirror offline function
 *
 * Owner is returned in the parameter block passed in by the caller.
 *
 * Returns:
 *	0	success
 *	error code if the functions fails
 *
 * For a MN set, on entry all writes to the mirror are suspended, on exit
 * from this function, writes must be resumed.
*/
/*
 *	miop	offline request: mirror minor (mnum), submirror device
 *		(submirror) and force_offline, which allows a submirror
 *		that is not SMS_RUNNING to be taken offline
 *	lock	ioctl lock handle
 *
 * Every failure is reported through miop->mde.
 */
static int
mirror_offline(
	md_i_off_on_t	*miop,
	IOLOCK		*lock
)
{
	minor_t			mnum = miop->mnum;
	mm_unit_t		*un;
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	int			smi;
	mdi_unit_t		*ui;

	mdclrerror(&miop->mde);

	if ((un = mirror_getun(mnum, &miop->mde, WR_LOCK, lock)) == NULL) {
		return (0);
	}

	/*
	 * Look the in-core unit up only now: mnum comes from the caller
	 * and mirror_getun() is what range-checks it before it is used to
	 * index the unit arrays.
	 */
	ui = MDI_UNIT(mnum);

	/*
	 * offline cannot be done while a resync is active or we are
	 * still waiting for an optimized resync to be started
	 */
	if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) {
		mirror_resume_writes(un);
		return (mdmderror(&miop->mde, MDE_RESYNC_ACTIVE, mnum));
	}

	/*
	 * Reject mirror_offline if ABR is set
	 */
	if ((ui->ui_tstate & MD_ABR_CAP) || un->un_abr_count) {
		mirror_resume_writes(un);
		return (mderror(&miop->mde, MDE_ABR_SET));
	}

	/* find the in-use slot holding the requested submirror */
	for (smi = 0; smi < NMIRROR; smi++) {
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
			continue;
		if (un->un_sm[smi].sm_dev == miop->submirror)
			break;
	}

	if (smi == NMIRROR) {
		mirror_resume_writes(un);
		return (mdmderror(&miop->mde, MDE_CANT_FIND_SM, mnum));
	}

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];
	if (!SMS_IS(sm, SMS_RUNNING) && !miop->force_offline) {
		mirror_resume_writes(un);
		return (mdmderror(&miop->mde, MDE_ILLEGAL_SM_STATE, mnum));
	}

	if (mirror_other_sources(un, smi, WHOLE_SM, 0) != 0) {
		mirror_resume_writes(un);
		return (mdmderror(&miop->mde, MDE_NO_READABLE_SM, mnum));
	}
	mirror_set_sm_state(sm, smic, SMS_OFFLINE, 1);
	mirror_resume_writes(un);

	MD_STATUS(un) |= MD_UN_OFFLINE_SM;
	mirror_commit(un, NO_SUBMIRRORS, 0);
	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OFFLINE, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));
	return (0);
}

/*
 * mirror_online:
 * ----------------
 * Called to implement the submirror online function
 *
 * Owner is returned in the parameter block passed in by the caller.
 *
 * Returns:
 *	0	success
 *	error code if the functions fails
 *
 * For a MN set, on entry all writes to the mirror are suspended, on exit
 * from this function, writes must be resumed.
*/ static int mirror_online( md_i_off_on_t *miop, IOLOCK *lock ) { minor_t mnum = miop->mnum; mm_unit_t *un; mm_submirror_t *sm; mm_submirror_ic_t *smic; int smi; set_t setno = MD_MIN2SET(mnum); mdclrerror(&miop->mde); if ((un = mirror_getun(mnum, &miop->mde, WR_LOCK, lock)) == NULL) { return (0); } for (smi = 0; smi < NMIRROR; smi++) { if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) continue; if (un->un_sm[smi].sm_dev == miop->submirror) break; } if (smi == NMIRROR) { mirror_resume_writes(un); return (mdmderror(&miop->mde, MDE_CANT_FIND_SM, mnum)); } sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; if (!SMS_IS(sm, SMS_OFFLINE)) { mirror_resume_writes(un); return (mdmderror(&miop->mde, MDE_ILLEGAL_SM_STATE, mnum)); } /* * online cannot be done while a resync is active or we are * still waiting for an optimized resync to be started */ if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) { mirror_resume_writes(un); return (mdmderror(&miop->mde, MDE_RESYNC_ACTIVE, mnum)); } mirror_set_sm_state(sm, smic, SMS_OFFLINE_RESYNC, 1); mirror_commit(un, NO_SUBMIRRORS, 0); mirror_check_failfast(mnum); SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ONLINE, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); /* for MN sets, re-read the resync record from disk */ if (MD_MNSET_SETNO(MD_UN2SET(un))) (void) mddb_reread_rr(setno, un->un_rr_dirty_recid); bcopy((caddr_t)un->un_dirty_bm, (caddr_t)un->un_resync_bm, howmany(un->un_rrd_num, NBBY)); MD_STATUS(un) |= MD_UN_OPT_NOT_DONE; sm->sm_flags |= MD_SM_RESYNC_TARGET; mirror_resume_writes(un); md_ioctl_writerexit(lock); if (!MD_MNSET_SETNO(setno)) return (mirror_resync_unit(mnum, NULL, &miop->mde, lock)); else return (0); } int mirror_grow_unit( mm_unit_t *un, md_error_t *ep ) { md_unit_t *su; mm_submirror_t *sm; int smi; diskaddr_t total_blocks; diskaddr_t current_tb; int spc; /* sectors per head */ minor_t mnum = MD_SID(un); /* * grow_unit cannot be done while a resync is active or we are * still waiting for an optimized resync to be started. 
Set * flag to indicate GROW_PENDING and once the resync is complete * the grow_unit function will be executed. */ if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE | MD_UN_OPT_NOT_DONE)) { MD_STATUS(un) |= MD_UN_GROW_PENDING; mirror_commit(un, NO_SUBMIRRORS, 0); return (mdmderror(ep, MDE_GROW_DELAYED, MD_SID(un))); } /* * Find the smallest submirror */ total_blocks = 0; for (smi = 0; smi < NMIRROR; smi++) { if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) continue; sm = &un->un_sm[smi]; /* * Growth is not possible if there is one or more * submirrors made up of non-Metadevices. */ if (md_getmajor(sm->sm_dev) != md_major) return (0); su = MD_UNIT(md_getminor(sm->sm_dev)); if ((total_blocks == 0) || (su->c.un_total_blocks < total_blocks)) total_blocks = su->c.un_total_blocks; } /* * If the smallest submirror is not larger * than the mirror, we are all done. */ if (total_blocks <= un->c.un_total_blocks) return (0); /* * Growing the mirror now. * First: Round down the actual_tb to be a multiple * of nheads * nsects. */ spc = un->c.un_nhead * un->c.un_nsect; current_tb = (total_blocks/spc) * spc; un->c.un_total_blocks = current_tb; md_nblocks_set(mnum, un->c.un_total_blocks); un->c.un_actual_tb = total_blocks; /* Is the mirror growing from 32 bit device to 64 bit device? */ if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) && (un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS)) { #if defined(_ILP32) return (mdmderror(ep, MDE_UNIT_TOO_LARGE, mnum)); #else mddb_type_t typ1; mddb_recid_t recid; set_t setno; mddb_recid_t old_recid = un->c.un_record_id; mddb_recid_t old_vtoc; mddb_de_ic_t *dep, *old_dep; md_create_rec_option_t options; /* yup, new device size. 
So we need to replace the record */ typ1 = (mddb_type_t)md_getshared_key(MD_UN2SET(un), mirror_md_ops.md_driver.md_drivername); setno = MD_MIN2SET(mnum); /* Preserve the friendly name properties of growing unit */ options = MD_CRO_64BIT | MD_CRO_MIRROR; if (un->c.un_revision & MD_FN_META_DEV) options |= MD_CRO_FN; recid = mddb_createrec(offsetof(mm_unit_t, un_smic), typ1, MIRROR_REC, options, setno); /* Resize to include incore fields */ un->c.un_revision |= MD_64BIT_META_DEV; /* All 64 bit metadevices only support EFI labels. */ un->c.un_flag |= MD_EFILABEL; /* * If the device had a vtoc record attached to it, we remove * the vtoc record, because the layout has changed completely. */ old_vtoc = un->c.un_vtoc_id; if (old_vtoc != 0) { un->c.un_vtoc_id = md_vtoc_to_efi_record(old_vtoc, setno); } MD_RECID(un) = recid; dep = mddb_getrecdep(recid); old_dep = mddb_getrecdep(old_recid); kmem_free(dep->de_rb_userdata, dep->de_reqsize); dep->de_rb_userdata = old_dep->de_rb_userdata; dep->de_reqsize = old_dep->de_reqsize; dep->de_rb_userdata_ic = old_dep->de_rb_userdata_ic; dep->de_icreqsize = old_dep->de_icreqsize; mirror_commit(un, NO_SUBMIRRORS, 0); old_dep->de_rb_userdata = NULL; old_dep->de_rb_userdata_ic = NULL; mddb_deleterec_wrapper(old_recid); /* * If there was a vtoc record, it is no longer needed, because * a new efi record has been created for this un. 
*/ if (old_vtoc != 0) { mddb_deleterec_wrapper(old_vtoc); } #endif } if ((current_tb/un->un_rrd_blksize) > MD_MAX_NUM_RR) { if (mirror_resize_resync_regions(un, current_tb)) { return (mdmderror(ep, MDE_RR_ALLOC_ERROR, MD_SID(un))); } mirror_check_failfast(mnum); SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); return (0); } if (mirror_add_resync_regions(un, current_tb)) { return (mdmderror(ep, MDE_RR_ALLOC_ERROR, MD_SID(un))); } mirror_check_failfast(mnum); SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); return (0); } static int mirror_grow( void *mgp, IOLOCK *lock ) { mm_unit_t *un; md_grow_params_t *mgph = mgp; mdclrerror(&mgph->mde); if ((un = mirror_getun(mgph->mnum, &mgph->mde, WR_LOCK, lock)) == NULL) return (0); if (MD_STATUS(un) & MD_UN_GROW_PENDING) return (0); return (mirror_grow_unit(un, &mgph->mde)); } static int mirror_change( md_mirror_params_t *mmp, IOLOCK *lock ) { mm_params_t *pp = &mmp->params; mm_unit_t *un; mdclrerror(&mmp->mde); if ((un = mirror_getun(mmp->mnum, &mmp->mde, WR_LOCK, lock)) == NULL) return (0); if (pp->change_read_option) un->un_read_option = pp->read_option; if (pp->change_write_option) un->un_write_option = pp->write_option; if (pp->change_pass_num) un->un_pass_num = pp->pass_num; mirror_commit(un, NO_SUBMIRRORS, 0); SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); return (0); } static int mirror_get_resync( md_resync_ioctl_t *ri ) { minor_t mnum = ri->ri_mnum; mm_unit_t *un; u_longlong_t percent; uint_t cnt; uint_t rr; diskaddr_t d; mdclrerror(&ri->mde); if ((un = mirror_getun(mnum, &ri->mde, STALE_OK|NO_LOCK, NULL)) == NULL) return (0); ri->ri_flags = 0; if (md_get_setstatus(MD_MIN2SET(mnum)) & MD_SET_STALE) { ri->ri_percent_done = 0; ri->ri_percent_dirty = 0; return (0); } if (MD_STATUS(un) & (MD_UN_RESYNC_ACTIVE|MD_UN_RESYNC_CANCEL)) { if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ri->ri_flags |= MD_RI_INPROGRESS; 
/* Return state of resync thread */ ri->ri_flags |= (un->un_rs_thread_flags & MD_RI_BLOCK); d = un->un_rs_resync_2_do; if (d) { percent = un->un_rs_resync_done; if (un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) { percent *= 1000; percent /= d; if (percent > 1000) percent = 1000; } else { percent *= 100; percent /= d; } ri->ri_percent_done = (int)percent; } else { ri->ri_percent_done = 0; } } if (un->un_nsm < 2) { ri->ri_percent_dirty = 0; return (0); } cnt = 0; for (rr = 0; rr < un->un_rrd_num; rr++) if (IS_REGION_DIRTY(rr, un)) cnt++; d = un->un_rrd_num; if (d) { percent = cnt; percent *= 100; percent += d - 1; /* round up */ percent /= d; } else percent = 0; ri->ri_percent_dirty = (int)percent; return (0); } /* * mirror_get_owner: * ---------------- * Called to obtain the current owner of a mirror. * * Owner is returned in the parameter block passed in by the caller. * * Returns: * 0 success * EINVAL metadevice does not exist or is not a member of a multi-owned * set. */ static int mirror_get_owner(md_set_mmown_params_t *p, IOLOCK *lock) { mm_unit_t *un; set_t setno; if ((un = mirror_getun(p->d.mnum, &p->mde, RD_LOCK, lock)) == NULL) return (EINVAL); setno = MD_UN2SET(un); if (!MD_MNSET_SETNO(setno)) { return (EINVAL); } p->d.owner = un->un_mirror_owner; return (0); } /* * mirror_choose_owner_thread: * -------------------------- * Called to send a CHOOSE_OWNER message to the commd running on the master * node. This needs to run in a separate context so that mutex livelock is * avoided. This can occur because the original request is issued from a call * to metaioctl() which acquires the global ioctl lock, calls down into the * mirror_ioctl code and then attempts to mdmn_ksend_message() to the master * node. 
As the handler for the choose_owner message needs to send another * ioctl through the metaioctl() entry point, any other use (by rpc.metad or * mdcommd checking on set ownership) will deadlock the system leading to * cluster reconfiguration timeouts and eventually a node or (at worst) a * cluster-wide panic */
static void
mirror_choose_owner_thread(md_mn_msg_chooseid_t *msg)
{
	set_t		set = MD_MIN2SET(msg->msg_chooseid_mnum);
	md_mn_kresult_t	*reply;
	int		rc;

	reply = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

	rc = mdmn_ksend_message(set, MD_MN_MSG_CHOOSE_OWNER,
	    MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)msg,
	    sizeof (md_mn_msg_chooseid_t), reply);

	/* A failed send is only reported; there is nobody to return it to. */
	if (!MDMN_KSEND_MSG_OK(rc, reply)) {
		mdmn_ksend_show_error(rc, reply, "CHOOSE OWNER");
		cmn_err(CE_WARN, "ksend_message failure: CHOOSE_OWNER");
	}

	/* This thread owns both the reply buffer and the message. */
	kmem_free(reply, sizeof (md_mn_kresult_t));
	kmem_free(msg, sizeof (md_mn_msg_chooseid_t));
	thread_exit();
}

/*
 * mirror_owner_thread:
 * -------------------
 * Thread body that carries out an ownership-change request.  It sends a
 * MD_MN_MSG_REQUIRE_OWNER message and, depending on the outcome, updates
 * the in-core ownership state of the unit.
 *
 * The result is published in un_mirror_owner_status, which is zeroed on
 * entry and finally set to MD_MN_MM_RESULT combined with either
 * MD_MN_MM_RES_OK or MD_MN_MM_RES_FAIL.  The originating application polls
 * for it through the MD_MN_MM_OWNER_STATUS ioctl.
 *
 * The request block passed in is freed by this thread.
 */
static void
mirror_owner_thread(md_mn_req_owner_t *ownp)
{
	set_t		set = MD_MIN2SET(ownp->mnum);
	mm_unit_t	*unit = MD_UNIT(ownp->mnum);
	md_mn_kresult_t	*reply;
	md_mps_t	*prev;
	int		rc;
	int		send_failed;

	unit->un_mirror_owner_status = 0;

	/* Flag that a request is in flight. */
	mutex_enter(&unit->un_owner_mx);
	unit->un_owner_state |= MM_MN_OWNER_SENT;
	mutex_exit(&unit->un_owner_mx);

	reply = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
	rc = mdmn_ksend_message(set, MD_MN_MSG_REQUIRE_OWNER,
	    MD_MSGF_NO_LOG, 0, (char *)ownp, sizeof (md_mn_req_owner_t),
	    reply);

	send_failed = !MDMN_KSEND_MSG_OK(rc, reply);

	if (send_failed) {
		/*
		 * The message transport layer failed.  Drop the pending
		 * state and hand the failure code back to the application.
		 */
		mdmn_ksend_show_error(rc, reply, "CHANGE OWNER");

		mutex_enter(&unit->un_owner_mx);
		unit->un_owner_state &=
		    ~(MM_MN_BECOME_OWNER|MM_MN_OWNER_SENT);
		mutex_exit(&unit->un_owner_mx);

		unit->un_mirror_owner_status =
		    MD_MN_MM_RESULT | MD_MN_MM_RES_FAIL;
	} else {
		/*
		 * The ownership change went through; bring the in-core
		 * owner up to date.
		 */
		mutex_enter(&unit->un_owner_mx);
		if (unit->un_owner_state & MM_MN_BECOME_OWNER) {
			unit->un_mirror_owner = md_mn_mynode_id;

			/* Sets node owner of un_rr_dirty record */
			if (unit->un_rr_dirty_recid) {
				(void) mddb_setowner(unit->un_rr_dirty_recid,
				    md_mn_mynode_id);
			}

			/*
			 * If the current resync region is still on the
			 * overlap tree, take it off so it no longer blocks.
			 */
			prev = unit->un_rs_prev_overlap;
			if ((prev != NULL) &&
			    (prev->ps_flags & MD_MPS_ON_OVERLAP)) {
				mirror_overlap_tree_remove(prev);
			}
		}
		unit->un_owner_state &=
		    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
		mutex_exit(&unit->un_owner_mx);

		unit->un_mirror_owner_status =
		    MD_MN_MM_RESULT | MD_MN_MM_RES_OK;

		/* Restart the resync thread if it was previously blocked */
		if (unit->un_rs_thread_flags & MD_RI_BLOCK_OWNER) {
			mutex_enter(&unit->un_rs_thread_mx);
			unit->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
			cv_signal(&unit->un_rs_thread_cv);
			mutex_exit(&unit->un_rs_thread_mx);
		}
	}

	kmem_free(reply, sizeof (md_mn_kresult_t));
	kmem_free(ownp, sizeof (md_mn_req_owner_t));
	thread_exit();
}

/* * mirror_set_owner: * ---------------- * Called to change the owner of a mirror to the specified node. If we * are not the owner of the mirror, we do nothing apart from update the in-core * ownership. It can also be used to choose a new owner for the resync of a * mirror, this case is specified by the flag MD_MN_MM_CHOOSE_OWNER, see below. * * The p->d.flags bitfield controls how subsequent ownership changes will be * handled: * MD_MN_MM_SPAWN_THREAD * a separate thread is created which emulates the behaviour of * become_owner() [mirror.c].
This is needed when changing the * ownership from user context as there needs to be a controlling * kernel thread which updates the owner info on the originating * node. Successful completion of the mdmn_ksend_message() means * that the owner field can be changed. * * MD_MN_MM_PREVENT_CHANGE * Disallow any change of ownership once this ownership change has * been processed. The only way of changing the owner away from * the p->d.owner node specified in the call is to issue a request * with MD_MN_MM_ALLOW_CHANGE set in the flags. Any request to * become owner from a different node while the PREVENT_CHANGE * is in operation will result in an EAGAIN return value. * un->un_owner_state has MM_MN_PREVENT_CHANGE set. * * MD_MN_MM_ALLOW_CHANGE * Allow the owner to be changed by a subsequent request. * un->un_owner_state has MM_MN_PREVENT_CHANGE cleared. * * MD_MN_MM_CHOOSE_OWNER * Choose a new owner for a mirror resync. In this case, the new * owner argument is not used. The selection of a new owner * is a round robin allocation using a resync owner count. This * ioctl passes this value in a message to the master node * which uses it to select a node from the node list and then * sends it a message to become the owner. * * If we are the current owner, we must stop further i/o from being scheduled * and wait for any pending i/o to drain. We wait for any in-progress resync * bitmap updates to complete and we can then set the owner. If an update to * the resync bitmap is attempted after this we simply don't write this out to * disk until the ownership is restored. * * If we are the node that wants to become the owner we update the in-core * owner and return. The i/o that initiated the ownership change will complete * on successful return from this ioctl. * * Return Value: * 0 Success * EINVAL Invalid unit referenced * EAGAIN Ownership couldn't be transferred away or change of * ownership is prevented. Caller should retry later on. 
*/ static int mirror_set_owner(md_set_mmown_params_t *p, IOLOCK *lock) { mdi_unit_t *ui; mm_unit_t *un; set_t setno; if ((un = mirror_getun(p->d.mnum, &p->mde, RD_LOCK, lock)) == NULL) return (EINVAL); ui = MDI_UNIT(p->d.mnum); setno = MD_MIN2SET(p->d.mnum); if (!MD_MNSET_SETNO(setno)) { return (EINVAL); } /* * If we are choosing a new resync owner, send a message to the master * to make the choice. */ if (p->d.flags & MD_MN_MM_CHOOSE_OWNER) { /* Release ioctl lock before we call ksend_message() */ md_ioctl_readerexit(lock); /* If we're resetting the owner pass the node id in */ if (p->d.owner != MD_MN_MIRROR_UNOWNED) { return (mirror_choose_owner(un, &p->d)); } else { return (mirror_choose_owner(un, NULL)); } } /* * Check for whether we have to spawn a thread to issue this request. * If set we issue a mdmn_ksend_message() to cause the appropriate * ownership change. On completion of this request the calling * application _must_ poll the structure 'flags' field to determine the * result of the request. All this is necessary until we have true * multi-entrant ioctl support. * If we are just clearing the owner, then MD_MN_MM_SPAWN_THREAD can * be ignored. */ if ((p->d.flags & MD_MN_MM_SPAWN_THREAD) && (p->d.owner != 0)) { md_mn_req_owner_t *ownp; ownp = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP); p->d.flags &= ~MD_MN_MM_SPAWN_THREAD; bcopy(&p->d, ownp, sizeof (md_mn_req_owner_t)); if (thread_create(NULL, 0, mirror_owner_thread, (caddr_t)ownp, 0, &p0, TS_RUN, 60) == NULL) { kmem_free(ownp, sizeof (md_mn_req_owner_t)); return (EFAULT); } else { return (0); } } /* * If setting owner to NULL, this is being done because the owner has * died and therefore we set OPT_NOT_DONE to ensure that the * mirror is marked as "Needs Maintenance" and that an optimized * resync will be done when we resync the mirror, Also clear the * PREVENT_CHANGE flag and remove the last resync region from the * overlap tree. 
*/ if (p->d.owner == 0) { md_mps_t *ps; int i; md_ioctl_readerexit(lock); un = md_ioctl_writerlock(lock, ui); /* * If the ABR capability is not set and the pass_num is non-zero * there is need to perform an optimized resync * Therefore set OPT_NOT_DONE, setup the resync_bm and set * the submirrors as resync targets. */ if (!(ui->ui_tstate & MD_ABR_CAP) && un->un_pass_num) { MD_STATUS(un) |= MD_UN_OPT_NOT_DONE; (void) mddb_reread_rr(setno, un->un_rr_dirty_recid); bcopy((caddr_t)un->un_dirty_bm, (caddr_t)un->un_resync_bm, howmany(un->un_rrd_num, NBBY)); for (i = 0; i < NMIRROR; i++) { if ((SUBMIRROR_IS_READABLE(un, i)) || SMS_BY_INDEX_IS(un, i, SMS_OFFLINE_RESYNC)) un->un_sm[i].sm_flags |= MD_SM_RESYNC_TARGET; } } mutex_enter(&un->un_owner_mx); un->un_owner_state &= ~MD_MN_MM_PREVENT_CHANGE; mutex_exit(&un->un_owner_mx); ps = un->un_rs_prev_overlap; if ((ps != NULL) && (ps->ps_flags & MD_MPS_ON_OVERLAP)) { mirror_overlap_tree_remove(ps); ps->ps_firstblk = 0; ps->ps_lastblk = 0; } md_ioctl_writerexit(lock); un = md_ioctl_readerlock(lock, ui); } mutex_enter(&un->un_owner_mx); if (!(un->un_owner_state & MM_MN_BECOME_OWNER)) { /* * If we are not trying to become owner ourselves check * to see if we have to change the owner */ if (un->un_mirror_owner == p->d.owner) { /* * No need to change owner, * Clear/set PREVENT_CHANGE bit */ if (p->d.flags & MD_MN_MM_PREVENT_CHANGE) { un->un_owner_state |= MM_MN_PREVENT_CHANGE; } else if (p->d.flags & MD_MN_MM_ALLOW_CHANGE) { un->un_owner_state &= ~MM_MN_PREVENT_CHANGE; } mutex_exit(&un->un_owner_mx); return (0); } } /* * Disallow ownership change if previously requested to. This can only * be reset by issuing a request with MD_MN_MM_ALLOW_CHANGE set in the * flags field. 
*/ if ((un->un_owner_state & MM_MN_PREVENT_CHANGE) && !(p->d.flags & MD_MN_MM_ALLOW_CHANGE)) { mutex_exit(&un->un_owner_mx); #ifdef DEBUG cmn_err(CE_WARN, "mirror_ioctl: Node %x attempted to become " "owner while node %x has exclusive access to %s", p->d.owner, un->un_mirror_owner, md_shortname(MD_SID(un))); #endif return (EAGAIN); } if (p->d.owner == md_mn_mynode_id) { /* * I'm becoming the mirror owner. Flag this so that the * message sender can change the in-core owner when all * nodes have processed this message */ un->un_owner_state &= ~MM_MN_OWNER_SENT; un->un_owner_state |= MM_MN_BECOME_OWNER; un->un_owner_state |= (p->d.flags & MD_MN_MM_PREVENT_CHANGE) ? MM_MN_PREVENT_CHANGE : 0; un->un_owner_state &= (p->d.flags & MD_MN_MM_ALLOW_CHANGE) ? ~MM_MN_PREVENT_CHANGE : ~0; mutex_exit(&un->un_owner_mx); } else if ((un->un_mirror_owner == md_mn_mynode_id) || un->un_owner_state & MM_MN_BECOME_OWNER) { mutex_exit(&un->un_owner_mx); /* * I'm releasing ownership. Block and drain i/o. This also * blocks until any in-progress resync record update completes. */ md_ioctl_readerexit(lock); un = md_ioctl_writerlock(lock, ui); /* Block the resync thread */ mutex_enter(&un->un_rs_thread_mx); un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER; mutex_exit(&un->un_rs_thread_mx); mutex_enter(&un->un_owner_mx); un->un_mirror_owner = p->d.owner; /* Sets node owner of un_rr_dirty record */ if (un->un_rr_dirty_recid) (void) mddb_setowner(un->un_rr_dirty_recid, p->d.owner); un->un_owner_state &= ~MM_MN_BECOME_OWNER; un->un_owner_state |= (p->d.flags & MD_MN_MM_PREVENT_CHANGE) ? MM_MN_PREVENT_CHANGE : 0; un->un_owner_state &= (p->d.flags & MD_MN_MM_ALLOW_CHANGE) ? ~MM_MN_PREVENT_CHANGE : ~0; mutex_exit(&un->un_owner_mx); /* * Allow further i/o to occur. Any write() from another node * will now cause another ownership change to occur. 
*/ md_ioctl_writerexit(lock); } else { /* Update the in-core mirror owner */ un->un_mirror_owner = p->d.owner; /* Sets node owner of un_rr_dirty record */ if (un->un_rr_dirty_recid) (void) mddb_setowner(un->un_rr_dirty_recid, p->d.owner); un->un_owner_state |= (p->d.flags & MD_MN_MM_PREVENT_CHANGE) ? MM_MN_PREVENT_CHANGE : 0; un->un_owner_state &= (p->d.flags & MD_MN_MM_ALLOW_CHANGE) ? ~MM_MN_PREVENT_CHANGE : ~0; mutex_exit(&un->un_owner_mx); } return (0); } /* * mirror_allocate_hotspare: * ------------------------ * Called to allocate a hotspare for a failed component. This function is * called by the MD_MN_ALLOCATE_HOTSPARE ioctl. */ static int mirror_allocate_hotspare(md_alloc_hotsp_params_t *p, IOLOCK *lockp) { set_t setno; mm_unit_t *un; #ifdef DEBUG if (mirror_debug_flag) printf("mirror_allocate_hotspare: mnum,sm,comp = %x, %x, %x\n", p->mnum, p->sm, p->comp); #endif if ((un = mirror_getun(p->mnum, &p->mde, WR_LOCK, lockp)) == NULL) return (EINVAL); /* This function is only valid for a multi-node set */ setno = MD_MIN2SET(p->mnum); if (!MD_MNSET_SETNO(setno)) { return (EINVAL); } (void) check_comp_4_hotspares(un, p->sm, p->comp, MD_HOTSPARE_NO_XMIT, p->hs_id, lockp); md_ioctl_writerexit(lockp); return (0); } /* * mirror_get_owner_status: * ----------------------- * Return the status of a previously issued ioctl to change ownership. This is * required for soft-partition support as the request to change mirror owner * needs to be run from a separate daemon thread. 
* * Returns: * 0 Success (contents of un_mirror_owner_status placed in 'flags') * EINVAL Invalid unit */ static int mirror_get_owner_status(md_mn_own_status_t *p, IOLOCK *lock) { mm_unit_t *un; set_t setno; if ((un = mirror_getun(p->mnum, &p->mde, RD_LOCK, lock)) == NULL) return (EINVAL); setno = MD_MIN2SET(p->mnum); if (!MD_MNSET_SETNO(setno)) { return (EINVAL); } p->flags = un->un_mirror_owner_status; return (0); } /* * mirror_set_state: * --------------- * Called to set the state of the component of a submirror to the specified * value. This function is called by the MD_MN_SET_STATE ioctl. */ static int mirror_set_state(md_set_state_params_t *p, IOLOCK *lockp) { mm_unit_t *un; mm_submirror_t *sm; mm_submirror_ic_t *smic; md_m_shared_t *shared; set_t setno; #ifdef DEBUG if (mirror_debug_flag) printf("mirror_set_state: mnum,sm,comp,state, hs_id = %x, " "%x, %x, %x %x\n", p->mnum, p->sm, p->comp, p->state, p->hs_id); #endif if ((un = mirror_getun(p->mnum, &p->mde, WR_LOCK, lockp)) == NULL) return (EINVAL); /* This function is only valid for a multi-node set */ setno = MD_MIN2SET(p->mnum); if (!MD_MNSET_SETNO(setno)) { return (EINVAL); } sm = &un->un_sm[p->sm]; smic = &un->un_smic[p->sm]; /* Set state in component and update ms_flags */ shared = (md_m_shared_t *) (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, p->comp); /* * If a CS_ERRED state is being sent, verify that the sender * has the same view of the component that this node currently has. * * There is a case where the sender was sending a CS_ERRED when a * component was in error, but before the sender returns from * ksend_message the component has been hotspared and resync'd. * * In this case, the hs_id will be different from the shared ms_hs_id, * so the component has already been hotspared. Just return in this * case. 
*/ if (p->state == CS_ERRED) { if (shared->ms_hs_id != p->hs_id) { #ifdef DEBUG if (mirror_debug_flag) { printf("mirror_set_state: short circuit " "hs_id=0x%x, ms_hs_id=0x%x\n", p->hs_id, shared->ms_hs_id); } #endif /* release the block on writes to the mirror */ mirror_resume_writes(un); md_ioctl_writerexit(lockp); return (0); } } /* * If the device is newly errored then make sure that it is * closed. Closing the device allows for the RCM framework * to unconfigure the device if required. */ if (!(shared->ms_state & CS_ERRED) && (p->state & CS_ERRED) && (shared->ms_flags & MDM_S_ISOPEN)) { void (*get_dev)(); ms_cd_info_t cd; get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, "get device", 0); (void) (*get_dev)(sm->sm_dev, sm, p->comp, &cd); md_layered_close(cd.cd_dev, MD_OFLG_NULL); shared->ms_flags &= ~MDM_S_ISOPEN; } shared->ms_state = p->state; uniqtime32(&shared->ms_timestamp); if (p->state == CS_ERRED) { shared->ms_flags |= MDM_S_NOWRITE; } else shared->ms_flags &= ~MDM_S_NOWRITE; shared->ms_flags &= ~MDM_S_IOERR; un->un_changecnt++; shared->ms_lasterrcnt = un->un_changecnt; /* Update state in submirror */ mirror_set_sm_state(sm, smic, SMS_RUNNING, 0); /* * Commit the state change to the metadb, only the master will write * to disk */ mirror_commit(un, SMI2BIT(p->sm), 0); /* release the block on writes to the mirror */ mirror_resume_writes(un); /* generate NOTIFY events for error state changes */ if (p->state == CS_ERRED) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } else if (p->state == CS_LAST_ERRED) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } md_ioctl_writerexit(lockp); return (0); } /* * mirror_suspend_writes: * --------------------- * Called to suspend writes to a mirror region. The flag un_suspend_wr_flag is * tested in mirror_write_strategy, and if set all writes are blocked. * This function is called by the MD_MN_SUSPEND_WRITES ioctl. 
*/ static int mirror_suspend_writes(md_suspend_wr_params_t *p) { set_t setno; mm_unit_t *un; #ifdef DEBUG if (mirror_debug_flag) printf("mirror_suspend_writes: mnum = %x\n", p->mnum); #endif if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL) return (EINVAL); /* No unit */ /* This function is only valid for a multi-node set */ setno = MD_MIN2SET(p->mnum); if (!MD_MNSET_SETNO(setno)) { return (EINVAL); } /* * Mark the resync as blocked. This will stop any currently running * thread and will prevent a new resync from attempting to perform * i/o */ mutex_enter(&un->un_rs_thread_mx); un->un_rs_thread_flags |= MD_RI_BLOCK; mutex_exit(&un->un_rs_thread_mx); mutex_enter(&un->un_suspend_wr_mx); un->un_suspend_wr_flag = 1; mutex_exit(&un->un_suspend_wr_mx); return (0); } /* * mirror_set_capability: * ------------------------ * Called to set or clear a capability for a mirror * called by the MD_MN_SET_CAP ioctl. */ static int mirror_set_capability(md_mn_setcap_params_t *p, IOLOCK *lockp) { set_t setno; mm_unit_t *un; mdi_unit_t *ui; #ifdef DEBUG if (mirror_debug_flag) printf("mirror_set_capability: mnum = %x\n", p->mnum); #endif if ((un = mirror_getun(p->mnum, &p->mde, RD_LOCK, lockp)) == NULL) return (EINVAL); /* This function is only valid for a multi-node set */ setno = MD_MIN2SET(p->mnum); if (!MD_MNSET_SETNO(setno)) { return (EINVAL); } ui = MDI_UNIT(p->mnum); if (p->sc_set & DKV_ABR_CAP) { ui->ui_tstate |= MD_ABR_CAP; /* Set ABR capability */ /* Clear DRL and set owner to 0 if no resync active */ mirror_process_unit_resync(un); if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) { mutex_enter(&un->un_owner_mx); un->un_mirror_owner = 0; mutex_exit(&un->un_owner_mx); } } else { ui->ui_tstate &= ~MD_ABR_CAP; /* Clear ABR capability */ } if (p->sc_set & DKV_DMR_CAP) { ui->ui_tstate |= MD_DMR_CAP; /* Set DMR capability */ } else { ui->ui_tstate &= ~MD_DMR_CAP; /* Clear DMR capability */ } return (0); } /* * mirror_choose_owner: * ------------------------ * Called to 
choose an owner for a mirror resync. Can be called when starting * resync or by the MD_MN_SET_MM_OWNER ioctl with the MD_MN_MM_CHOOSE_OWNER flag * set. The ioctl is called with this flag set when we are in the cluster * reconfig and we wish to set a new owner for a resync whose owner has left * the cluster. We use a resync owner count to implement a round robin * allocation of resync owners. We send a message to the master including * this count and the message handler uses it to select an owner from the * nodelist and then sends a SET_MM_OWNER message to the chosen node to * become the owner. * * Input: * un - unit reference * ownp - owner information (if non-NULL) */ int mirror_choose_owner(mm_unit_t *un, md_mn_req_owner_t *ownp) { set_t setno; md_mn_msg_chooseid_t *msg; /* This function is only valid for a multi-node set */ setno = MD_UN2SET(un); if (!MD_MNSET_SETNO(setno)) { return (EINVAL); } #ifdef DEBUG if (mirror_debug_flag) printf("send choose owner message, mnum = %x," "rcnt = %d\n", MD_SID(un), md_set[setno].s_rcnt); #endif /* * setup message with current resync count * and then increment the count. If we're called with a non-NULL * owner then we are reestablishing the owner of the mirror. In this * case we have to flag this to the message handler and set rcnt to * the new owner node. */ msg = kmem_zalloc(sizeof (md_mn_msg_chooseid_t), KM_SLEEP); msg->msg_chooseid_mnum = MD_SID(un); if (ownp == NULL) { mutex_enter(&md_mx); msg->msg_chooseid_rcnt = md_set[setno].s_rcnt; md_set[setno].s_rcnt++; mutex_exit(&md_mx); msg->msg_chooseid_set_node = B_FALSE; } else { msg->msg_chooseid_rcnt = ownp->owner; msg->msg_chooseid_set_node = B_TRUE; } /* * Spawn a thread to issue the ksend_message() call so that we can * drop the ioctl lock hierarchy that is blocking further rpc.metad and * commd set ownership checking. 
*/ if (thread_create(NULL, 0, mirror_choose_owner_thread, (caddr_t)msg, 0, &p0, TS_RUN, 60) == NULL) { kmem_free(msg, sizeof (md_mn_msg_chooseid_t)); return (EFAULT); } else { return (0); } } /* * mirror_get_status: * ---------------------------------- * Called by nodes which are not the master node of the cluster. Obtains the * master abr state and the submirror status for each valid submirror of the * unit so that the status returned by metastat is consistent across the * cluster. * We update tstate for the mirror and both the sm_flag and the sm_state for * each submirror. * * Input: * un mirror to obtain status from * * Calling Convention: * writerlock (either ioctl or unit) must be held */ void mirror_get_status(mm_unit_t *un, IOLOCK *lockp) { mm_submirror_t *sm; int smi; int rval; md_mn_kresult_t *kres; md_mn_msg_mir_state_t msg; md_mn_msg_mir_state_res_t *res; set_t setno = MD_UN2SET(un); mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); ASSERT(ui->ui_lock & MD_UL_WRITER); /* * Get all of the information for the mirror. */ bzero(&msg, sizeof (msg)); msg.mir_state_mnum = MD_SID(un); /* * Must drop the writerlock over ksend_message since another * thread on this node could be running a higher class message * and be trying grab the readerlock. * * If we are in the context of an ioctl, drop the ioctl lock. * lockp holds the list of locks held. */ if (lockp) { IOLOCK_RETURN_RELEASE(0, lockp); } else { md_unit_writerexit(ui); } kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); rval = mdmn_ksend_message(setno, MD_MN_MSG_GET_MIRROR_STATE, MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (msg), kres); /* if the node hasn't yet joined, it's Ok. 
*/ if ((!MDMN_KSEND_MSG_OK(rval, kres)) && (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) { mdmn_ksend_show_error(rval, kres, "GET_MIRROR_STATE"); cmn_err(CE_WARN, "ksend_message failure: GET_MIRROR_STATE"); } /* if dropped the lock previously, regain it */ if (lockp) { IOLOCK_RETURN_REACQUIRE(lockp); } else { /* * Reacquire dropped locks and update acquirecnts * appropriately. */ (void) md_unit_writerlock(ui); } /* * Check to see if we've got a believable amount of returned data. * If not, we simply return as there is no usable information. */ if (kres->kmmr_res_size < sizeof (*res)) { cmn_err(CE_WARN, "GET_MIRROR_STATE: returned %d bytes, expected" " %d\n", kres->kmmr_res_size, (int)sizeof (*res)); kmem_free(kres, sizeof (md_mn_kresult_t)); return; } /* * Copy the results from the call back into our sm_state/sm_flags */ res = (md_mn_msg_mir_state_res_t *)kres->kmmr_res_data; #ifdef DEBUG if (mirror_debug_flag) printf("mirror_get_status: %s\n", md_shortname(MD_SID(un))); #endif for (smi = 0; smi < NMIRROR; smi++) { sm = &un->un_sm[smi]; #ifdef DEBUG if (mirror_debug_flag) { printf("curr state %4x, new state %4x\n", sm->sm_state, res->sm_state[smi]); printf("curr_flags %4x, new flags %4x\n", sm->sm_flags, res->sm_flags[smi]); } #endif sm->sm_state = res->sm_state[smi]; sm->sm_flags = res->sm_flags[smi]; } /* Set ABR if set on the Master node */ ui->ui_tstate |= (res->mir_tstate & MD_ABR_CAP); kmem_free(kres, sizeof (md_mn_kresult_t)); } /* * mirror_get_mir_state: * ------------------- * Obtain the ABR state of a mirror and the state of all submirrors from the * master node for the unit specified in sm_state->mnum. * Called by MD_MN_GET_MIRROR_STATE ioctl. 
*/ static int mirror_get_mir_state(md_mn_get_mir_state_t *p, IOLOCK *lockp) { mm_unit_t *un; set_t setno; md_error_t mde; mdclrerror(&mde); if ((un = mirror_getun(p->mnum, &mde, WR_LOCK, lockp)) == NULL) { return (EINVAL); } setno = MD_MIN2SET(p->mnum); if (!MD_MNSET_SETNO(setno)) { return (EINVAL); } /* * We've now got a writerlock on the unit structure (so no-one can * modify the incore values) and we'll now send the message to the * master node. Since we're only called as part of a reconfig cycle * we don't need to release the unit locks across the ksend_message as * only the master node will process it, and we never send this to * ourselves if we're the master. */ mirror_get_status(un, lockp); return (0); } static int mirror_admin_ioctl(int cmd, void *data, int mode, IOLOCK *lockp) { size_t sz = 0; void *d = NULL; int err = 0; /* We can only handle 32-bit clients for internal commands */ if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) { return (EINVAL); } /* dispatch ioctl */ switch (cmd) { case MD_IOCSET: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_set_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_set(d, mode); break; } case MD_IOCGET: { if (! (mode & FREAD)) return (EACCES); sz = sizeof (md_i_get_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_get(d, mode, lockp); break; } case MD_IOCRESET: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_i_reset_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_reset((md_i_reset_t *)d); break; } case MD_IOCSETSYNC: case MD_MN_SETSYNC: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_resync_ioctl_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_ioctl_resync((md_resync_ioctl_t *)d, lockp); break; } case MD_IOCGETSYNC: { if (! 
(mode & FREAD)) return (EACCES); sz = sizeof (md_resync_ioctl_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_get_resync((md_resync_ioctl_t *)d); break; } case MD_IOCREPLACE: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (replace_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = comp_replace((replace_params_t *)d, lockp); break; } case MD_IOCOFFLINE: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_i_off_on_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_offline((md_i_off_on_t *)d, lockp); break; } case MD_IOCONLINE: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_i_off_on_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_online((md_i_off_on_t *)d, lockp); break; } case MD_IOCDETACH: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_detach_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_detach((md_detach_params_t *)d, lockp); break; } case MD_IOCATTACH: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_att_struct_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_attach((md_att_struct_t *)d, lockp); break; } case MD_IOCGET_DEVS: { if (! (mode & FREAD)) return (EACCES); sz = sizeof (md_getdevs_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_getdevs(d, mode, lockp); break; } case MD_IOCGROW: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_grow_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_grow(d, lockp); break; } case MD_IOCCHANGE: { if (! 
(mode & FWRITE)) return (EACCES); sz = sizeof (md_mirror_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_change((md_mirror_params_t *)d, lockp); break; } case MD_IOCPROBE_DEV: { md_probedev_impl_t *p = NULL; md_probedev_t *ph = NULL; daemon_queue_t *hdr = NULL; int i; size_t sz2 = 0; if (! (mode & FREAD)) return (EACCES); sz = sizeof (md_probedev_t); d = kmem_alloc(sz, KM_SLEEP); /* now copy in the data */ if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; goto free_mem; } /* * Sanity test the args. Test name should have the keyword * probe. */ p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP); p->probe_sema = NULL; p->probe_mx = NULL; p->probe.mnum_list = (uint64_t)NULL; ph = (struct md_probedev *)d; p->probe.nmdevs = ph->nmdevs; (void) strcpy(p->probe.test_name, ph->test_name); bcopy(&ph->md_driver, &(p->probe.md_driver), sizeof (md_driver_t)); if ((p->probe.nmdevs < 1) || (strstr(p->probe.test_name, "probe") == NULL)) { err = EINVAL; goto free_mem; } sz2 = sizeof (minor_t) * p->probe.nmdevs; p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz2, KM_SLEEP); if (ddi_copyin((void *)(uintptr_t)ph->mnum_list, (void *)(uintptr_t)p->probe.mnum_list, sz2, mode)) { err = EFAULT; goto free_mem; } if (err = md_init_probereq(p, &hdr)) goto free_mem; /* * put the request on the queue and wait. */ daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW); (void) IOLOCK_RETURN(0, lockp); /* wait for the events to occur */ for (i = 0; i < p->probe.nmdevs; i++) { sema_p(PROBE_SEMA(p)); } while (md_ioctl_lock_enter() == EINTR) ; /* * clean up. The hdr list is freed in the probe routines * since the list is NULL by the time we get here. 
*/ free_mem: if (p) { if (p->probe_sema != NULL) { sema_destroy(PROBE_SEMA(p)); kmem_free(p->probe_sema, sizeof (ksema_t)); } if (p->probe_mx != NULL) { mutex_destroy(PROBE_MX(p)); kmem_free(p->probe_mx, sizeof (kmutex_t)); } if ((uintptr_t)p->probe.mnum_list) kmem_free((void *)(uintptr_t) p->probe.mnum_list, sz2); kmem_free(p, sizeof (md_probedev_impl_t)); } break; } case MD_MN_SET_MM_OWNER: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_set_mmown_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode) != 0) { err = EFAULT; break; } err = mirror_set_owner((md_set_mmown_params_t *)d, lockp); break; } case MD_MN_GET_MM_OWNER: { if (! (mode & FREAD)) return (EACCES); sz = sizeof (md_set_mmown_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode) != 0) { err = EFAULT; break; } err = mirror_get_owner((md_set_mmown_params_t *)d, lockp); break; } case MD_MN_MM_OWNER_STATUS: { if (! (mode & FREAD)) return (EACCES); sz = sizeof (md_mn_own_status_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode) != 0) { err = EFAULT; break; } err = mirror_get_owner_status((md_mn_own_status_t *)d, lockp); break; } case MD_MN_SET_STATE: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_set_state_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_set_state((md_set_state_params_t *)d, lockp); break; } case MD_MN_SUSPEND_WRITES: { if (! (mode & FREAD)) return (EACCES); sz = sizeof (md_suspend_wr_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode) != 0) { err = EFAULT; break; } err = mirror_suspend_writes((md_suspend_wr_params_t *)d); break; } case MD_MN_RESYNC: { sz = sizeof (md_mn_rs_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode) != 0) { err = EFAULT; break; } err = mirror_resync_message((md_mn_rs_params_t *)d, lockp); break; } case MD_MN_ALLOCATE_HOTSPARE: { if (! 
(mode & FWRITE)) return (EACCES); sz = sizeof (md_alloc_hotsp_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_allocate_hotspare((md_alloc_hotsp_params_t *)d, lockp); break; } case MD_MN_POKE_HOTSPARES: { (void) poke_hotspares(); break; } case MD_MN_SET_CAP: { if (! (mode & FWRITE)) return (EACCES); sz = sizeof (md_mn_setcap_params_t); d = kmem_alloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_set_capability((md_mn_setcap_params_t *)d, lockp); break; } case MD_MN_GET_MIRROR_STATE: { sz = sizeof (md_mn_get_mir_state_t); d = kmem_zalloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_get_mir_state((md_mn_get_mir_state_t *)d, lockp); break; } case MD_MN_RR_DIRTY: { sz = sizeof (md_mn_rr_dirty_params_t); d = kmem_zalloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_set_dirty_rr((md_mn_rr_dirty_params_t *)d); break; } case MD_MN_RR_CLEAN: { md_mn_rr_clean_params_t tmp; /* get the first part of the structure to find the size */ if (ddi_copyin(data, &tmp, sizeof (tmp), mode)) { err = EFAULT; break; } sz = MDMN_RR_CLEAN_PARAMS_SIZE(&tmp); d = kmem_zalloc(sz, KM_SLEEP); if (ddi_copyin(data, d, sz, mode)) { err = EFAULT; break; } err = mirror_set_clean_rr((md_mn_rr_clean_params_t *)d); break; } default: return (ENOTTY); } /* * copyout and free any args */ if (sz != 0) { if (err == 0) { if (ddi_copyout(d, data, sz, mode) != 0) { err = EFAULT; } } kmem_free(d, sz); } return (err); } int md_mirror_ioctl( dev_t ddi_dev, int cmd, void *data, int mode, IOLOCK *lockp ) { minor_t mnum = getminor(ddi_dev); mm_unit_t *un; int err = 0; /* handle admin ioctls */ if (mnum == MD_ADM_MINOR) return (mirror_admin_ioctl(cmd, data, mode, lockp)); /* check unit */ if ((MD_MIN2SET(mnum) >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) || ((un = MD_UNIT(mnum)) == NULL)) return (ENXIO); /* is this a supported 
ioctl? */ err = md_check_ioctl_against_unit(cmd, un->c); if (err != 0) { return (err); } /* dispatch ioctl */ switch (cmd) { case DKIOCINFO: { struct dk_cinfo *p; if (! (mode & FREAD)) return (EACCES); p = kmem_alloc(sizeof (*p), KM_SLEEP); get_info(p, mnum); if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0) err = EFAULT; kmem_free(p, sizeof (*p)); return (err); } case DKIOCGMEDIAINFO: { struct dk_minfo p; if (! (mode & FREAD)) return (EACCES); get_minfo(&p, mnum); if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0) err = EFAULT; return (err); } case DKIOCGGEOM: { struct dk_geom *p; if (! (mode & FREAD)) return (EACCES); p = kmem_alloc(sizeof (*p), KM_SLEEP); if ((err = mirror_get_geom(un, p)) == 0) { if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0) err = EFAULT; } kmem_free(p, sizeof (*p)); return (err); } case DKIOCGVTOC: { struct vtoc *vtoc; if (! (mode & FREAD)) return (EACCES); vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP); if ((err = mirror_get_vtoc(un, vtoc)) != 0) { kmem_free(vtoc, sizeof (*vtoc)); return (err); } if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) { if (ddi_copyout(vtoc, data, sizeof (*vtoc), mode)) err = EFAULT; } #ifdef _SYSCALL32 else { struct vtoc32 *vtoc32; vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP); vtoctovtoc32((*vtoc), (*vtoc32)); if (ddi_copyout(vtoc32, data, sizeof (*vtoc32), mode)) err = EFAULT; kmem_free(vtoc32, sizeof (*vtoc32)); } #endif /* _SYSCALL32 */ kmem_free(vtoc, sizeof (*vtoc)); return (err); } case DKIOCSVTOC: { struct vtoc *vtoc; if (! 
(mode & FWRITE)) return (EACCES); vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP); if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) { if (ddi_copyin(data, vtoc, sizeof (*vtoc), mode)) { err = EFAULT; } } #ifdef _SYSCALL32 else { struct vtoc32 *vtoc32; vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP); if (ddi_copyin(data, vtoc32, sizeof (*vtoc32), mode)) { err = EFAULT; } else { vtoc32tovtoc((*vtoc32), (*vtoc)); } kmem_free(vtoc32, sizeof (*vtoc32)); } #endif /* _SYSCALL32 */ if (err == 0) err = mirror_set_vtoc(un, vtoc); kmem_free(vtoc, sizeof (*vtoc)); return (err); } case DKIOCGEXTVTOC: { struct extvtoc *extvtoc; if (! (mode & FREAD)) return (EACCES); extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP); if ((err = mirror_get_extvtoc(un, extvtoc)) != 0) { kmem_free(extvtoc, sizeof (*extvtoc)); return (err); } if (ddi_copyout(extvtoc, data, sizeof (*extvtoc), mode)) err = EFAULT; kmem_free(extvtoc, sizeof (*extvtoc)); return (err); } case DKIOCSEXTVTOC: { struct extvtoc *extvtoc; if (! (mode & FWRITE)) return (EACCES); extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP); if (ddi_copyin(data, extvtoc, sizeof (*extvtoc), mode)) { err = EFAULT; } if (err == 0) err = mirror_set_extvtoc(un, extvtoc); kmem_free(extvtoc, sizeof (*extvtoc)); return (err); } case DKIOCGAPART: { struct dk_map dmp; if ((err = mirror_get_cgapart(un, &dmp)) != 0) { return (err); } if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) { if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp), mode) != 0) err = EFAULT; } #ifdef _SYSCALL32 else { struct dk_map32 dmp32; dmp32.dkl_cylno = dmp.dkl_cylno; dmp32.dkl_nblk = dmp.dkl_nblk; if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32), mode) != 0) err = EFAULT; } #endif /* _SYSCALL32 */ return (err); } case DKIOCGETEFI: { /* * This one can be done centralized, * no need to put in the same code for all types of metadevices */ return (md_dkiocgetefi(mnum, data, mode)); } case DKIOCSETEFI: { /* * This one can be done centralized, * no need to put in the same code 
for all types of metadevices */ return (md_dkiocsetefi(mnum, data, mode)); } case DKIOCPARTITION: { return (md_dkiocpartition(mnum, data, mode)); } case DKIOCGETVOLCAP: { volcap_t vc; mdi_unit_t *ui; /* Only valid for MN sets */ if (!MD_MNSET_SETNO(MD_MIN2SET(mnum))) return (EINVAL); ui = MDI_UNIT(mnum); if (! (mode & FREAD)) return (EACCES); vc.vc_info = DKV_ABR_CAP | DKV_DMR_CAP; vc.vc_set = 0; if (ui->ui_tstate & MD_ABR_CAP) { vc.vc_set |= DKV_ABR_CAP; } if (ddi_copyout(&vc, data, sizeof (volcap_t), mode)) err = EFAULT; return (err); } case DKIOCSETVOLCAP: { volcap_t vc; volcapset_t volcap = 0; mdi_unit_t *ui; /* Only valid for MN sets */ if (!MD_MNSET_SETNO(MD_MIN2SET(mnum))) return (EINVAL); ui = MDI_UNIT(mnum); if (! (mode & FWRITE)) return (EACCES); if (ddi_copyin(data, &vc, sizeof (volcap_t), mode)) return (EFAULT); /* Not valid if a submirror is offline */ if (un->c.un_status & MD_UN_OFFLINE_SM) { return (EINVAL); } if (ui->ui_tstate & MD_ABR_CAP) volcap |= DKV_ABR_CAP; /* Only send capability message if there is a change */ if ((vc.vc_set & (DKV_ABR_CAP)) != volcap) err = mdmn_send_capability_message(mnum, vc, lockp); return (err); } case DKIOCDMR: { vol_directed_rd_t *vdr; #ifdef _MULTI_DATAMODEL vol_directed_rd32_t *vdr32; #endif /* _MULTI_DATAMODEL */ /* Only valid for MN sets */ if (!MD_MNSET_SETNO(MD_MIN2SET(mnum))) return (EINVAL); vdr = kmem_zalloc(sizeof (vol_directed_rd_t), KM_NOSLEEP); if (vdr == NULL) return (ENOMEM); #ifdef _MULTI_DATAMODEL vdr32 = kmem_zalloc(sizeof (vol_directed_rd32_t), KM_NOSLEEP); if (vdr32 == NULL) { kmem_free(vdr, sizeof (vol_directed_rd_t)); return (ENOMEM); } switch (ddi_model_convert_from(mode & FMODELS)) { case DDI_MODEL_ILP32: /* * If we're called from a higher-level driver we don't * need to manipulate the data. Its already been done by * the caller. 
*/ if (!(mode & FKIOCTL)) { if (ddi_copyin(data, vdr32, sizeof (*vdr32), mode)) { kmem_free(vdr, sizeof (*vdr)); return (EFAULT); } vdr->vdr_flags = vdr32->vdr_flags; vdr->vdr_offset = vdr32->vdr_offset; vdr->vdr_nbytes = vdr32->vdr_nbytes; vdr->vdr_data = (void *)(uintptr_t)vdr32->vdr_data; vdr->vdr_side = vdr32->vdr_side; break; } /* FALLTHROUGH */ case DDI_MODEL_NONE: if (ddi_copyin(data, vdr, sizeof (*vdr), mode)) { kmem_free(vdr32, sizeof (*vdr32)); kmem_free(vdr, sizeof (*vdr)); return (EFAULT); } break; default: kmem_free(vdr32, sizeof (*vdr32)); kmem_free(vdr, sizeof (*vdr)); return (EFAULT); } #else /* ! _MULTI_DATAMODEL */ if (ddi_copyin(data, vdr, sizeof (*vdr), mode)) { kmem_free(vdr, sizeof (*vdr)); return (EFAULT); } #endif /* _MULTI_DATAMODEL */ err = mirror_directed_read(ddi_dev, vdr, mode); if (err == 0) { #ifdef _MULTI_DATAMODEL switch (ddi_model_convert_from(mode & FMODELS)) { case DDI_MODEL_ILP32: if (!(mode & FKIOCTL)) { vdr32->vdr_flags = vdr->vdr_flags; vdr32->vdr_offset = vdr->vdr_offset; vdr32->vdr_side = vdr->vdr_side; vdr32->vdr_bytesread = vdr->vdr_bytesread; bcopy(vdr->vdr_side_name, vdr32->vdr_side_name, sizeof (vdr32->vdr_side_name)); if (ddi_copyout(vdr32, data, sizeof (*vdr32), mode)) { err = EFAULT; } break; } /* FALLTHROUGH */ case DDI_MODEL_NONE: if (ddi_copyout(vdr, data, sizeof (*vdr), mode)) err = EFAULT; break; } #else /* ! 
_MULTI_DATAMODEL */ if (ddi_copyout(vdr, data, sizeof (*vdr), mode)) err = EFAULT; #endif /* _MULTI_DATAMODEL */ if (vdr->vdr_flags & DKV_DMR_ERROR) err = EIO; } #ifdef _MULTI_DATAMODEL kmem_free(vdr32, sizeof (*vdr32)); #endif /* _MULTI_DATAMODEL */ kmem_free(vdr, sizeof (*vdr)); return (err); } default: return (ENOTTY); } } /* * rename named service entry points and support functions */ /* * rename/exchange role swap functions * * most of these are handled by generic role swap functions */ /* * MDRNM_UPDATE_KIDS * rename/exchange of our child or grandchild */ void mirror_renexch_update_kids(md_rendelta_t *delta, md_rentxn_t *rtxnp) { mm_submirror_t *sm; int smi; ASSERT(rtxnp); ASSERT((MDRNOP_RENAME == rtxnp->op) || (rtxnp->op == MDRNOP_EXCHANGE)); ASSERT(rtxnp->recids); ASSERT(delta); ASSERT(delta->unp); ASSERT(delta->old_role == MDRR_PARENT); ASSERT(delta->new_role == MDRR_PARENT); /* * since our role isn't changing (parent->parent) * one of our children must be changing * find the child being modified, and update * our notion of it */ for (smi = 0; smi < NMIRROR; smi++) { mm_unit_t *un = (mm_unit_t *)delta->unp; if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) { continue; } sm = &un->un_sm[smi]; if (md_getminor(sm->sm_dev) == rtxnp->from.mnum) { sm->sm_dev = md_makedevice(md_major, rtxnp->to.mnum); sm->sm_key = rtxnp->to.key; break; } } md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp); } /* * exchange down (self->child) */ void mirror_exchange_self_update_from_down( md_rendelta_t *delta, md_rentxn_t *rtxnp ) { int smi; mm_submirror_t *found; minor_t from_min, to_min; sv_dev_t sv; ASSERT(rtxnp); ASSERT(MDRNOP_EXCHANGE == rtxnp->op); ASSERT(rtxnp->recids); ASSERT(rtxnp->rec_idx >= 0); ASSERT(delta); ASSERT(delta->unp); ASSERT(delta->uip); ASSERT(delta->old_role == MDRR_SELF); ASSERT(delta->new_role == MDRR_CHILD); ASSERT(md_getminor(delta->dev) == rtxnp->from.mnum); from_min = rtxnp->from.mnum; to_min = rtxnp->to.mnum; /* * self id changes in our own unit 
struct */ MD_SID(delta->unp) = to_min; /* * parent identifier need not change */ /* * point the set array pointers at the "new" unit and unit in-cores * Note: the other half of this transfer is done in the "update_to" * exchange named service. */ MDI_VOIDUNIT(to_min) = delta->uip; MD_VOIDUNIT(to_min) = delta->unp; /* * transfer kstats */ delta->uip->ui_kstat = rtxnp->to.kstatp; /* * the unit in-core reference to the get next link's id changes */ delta->uip->ui_link.ln_id = to_min; /* * find the child whose identity we're assuming */ for (found = NULL, smi = 0; !found && smi < NMIRROR; smi++) { mm_submirror_t *sm; mm_unit_t *un = (mm_unit_t *)delta->unp; if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) { continue; } sm = &un->un_sm[smi]; if (md_getminor(sm->sm_dev) == to_min) { found = sm; } } ASSERT(found); /* * Update the sub-mirror's identity */ found->sm_dev = md_makedevice(md_major, rtxnp->from.mnum); sv.key = found->sm_key; ASSERT(rtxnp->from.key != MD_KEYWILD); ASSERT(rtxnp->from.key != MD_KEYBAD); found->sm_key = rtxnp->from.key; /* * delete the key for the old sub-mirror from the name space */ sv.setno = MD_MIN2SET(from_min); md_rem_names(&sv, 1); /* * and store the record id (from the unit struct) into recids */ md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp); } /* * exchange down (parent->self) */ void mirror_exchange_parent_update_to( md_rendelta_t *delta, md_rentxn_t *rtxnp ) { int smi; mm_submirror_t *found; minor_t from_min, to_min; sv_dev_t sv; ASSERT(rtxnp); ASSERT(MDRNOP_EXCHANGE == rtxnp->op); ASSERT(rtxnp->recids); ASSERT(rtxnp->rec_idx >= 0); ASSERT(delta); ASSERT(delta->unp); ASSERT(delta->uip); ASSERT(delta->old_role == MDRR_PARENT); ASSERT(delta->new_role == MDRR_SELF); ASSERT(md_getminor(delta->dev) == rtxnp->to.mnum); from_min = rtxnp->from.mnum; to_min = rtxnp->to.mnum; /* * self id changes in our own unit struct */ MD_SID(delta->unp) = from_min; /* * parent identifier need not change */ /* * point the set array pointers at the "new" 
unit and unit in-cores * Note: the other half of this transfer is done in the "update_to" * exchange named service. */ MDI_VOIDUNIT(from_min) = delta->uip; MD_VOIDUNIT(from_min) = delta->unp; /* * transfer kstats */ delta->uip->ui_kstat = rtxnp->from.kstatp; /* * the unit in-core reference to the get next link's id changes */ delta->uip->ui_link.ln_id = from_min; /* * find the child whose identity we're assuming */ for (found = NULL, smi = 0; !found && smi < NMIRROR; smi++) { mm_submirror_t *sm; mm_unit_t *un = (mm_unit_t *)delta->unp; if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) { continue; } sm = &un->un_sm[smi]; if (md_getminor(sm->sm_dev) == from_min) { found = sm; } } ASSERT(found); /* * Update the sub-mirror's identity */ found->sm_dev = md_makedevice(md_major, rtxnp->to.mnum); sv.key = found->sm_key; ASSERT(rtxnp->to.key != MD_KEYWILD); ASSERT(rtxnp->to.key != MD_KEYBAD); found->sm_key = rtxnp->to.key; /* * delete the key for the old sub-mirror from the name space */ sv.setno = MD_MIN2SET(to_min); md_rem_names(&sv, 1); /* * and store the record id (from the unit struct) into recids */ md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp); } /* * MDRNM_LIST_URKIDS: named svc entry point * all all delta entries appropriate for our children onto the * deltalist pointd to by dlpp */ int mirror_rename_listkids(md_rendelta_t **dlpp, md_rentxn_t *rtxnp) { minor_t from_min, to_min; mm_unit_t *from_un; md_rendelta_t *new, *p; int smi; int n_children; mm_submirror_t *sm; ASSERT(rtxnp); ASSERT(dlpp); ASSERT((rtxnp->op == MDRNOP_EXCHANGE) || (rtxnp->op == MDRNOP_RENAME)); from_min = rtxnp->from.mnum; to_min = rtxnp->to.mnum; n_children = 0; if (!MDI_UNIT(from_min) || !(from_un = MD_UNIT(from_min))) { (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, from_min); return (-1); } for (p = *dlpp; p && p->next != NULL; p = p->next) { /* NULL */ } for (smi = 0; smi < NMIRROR; smi++) { minor_t child_min; if (!SMS_BY_INDEX_IS(from_un, smi, SMS_INUSE)) { continue; } sm = 
&from_un->un_sm[smi]; child_min = md_getminor(sm->sm_dev); p = new = md_build_rendelta(MDRR_CHILD, to_min == child_min? MDRR_SELF: MDRR_CHILD, sm->sm_dev, p, MD_UNIT(child_min), MDI_UNIT(child_min), &rtxnp->mde); if (!new) { if (mdisok(&rtxnp->mde)) { (void) mdsyserror(&rtxnp->mde, ENOMEM); } return (-1); } ++n_children; } return (n_children); } /* * support routine for MDRNM_CHECK */ static int mirror_may_renexch_self( mm_unit_t *un, mdi_unit_t *ui, md_rentxn_t *rtxnp) { minor_t from_min; minor_t to_min; bool_t toplevel; bool_t related; int smi; mm_submirror_t *sm; from_min = rtxnp->from.mnum; to_min = rtxnp->to.mnum; if (!un || !ui) { (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR, from_min); return (EINVAL); } ASSERT(MD_CAPAB(un) & MD_CAN_META_CHILD); if (!(MD_CAPAB(un) & MD_CAN_META_CHILD)) { (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min); return (EINVAL); } if (MD_PARENT(un) == MD_MULTI_PARENT) { (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min); return (EINVAL); } toplevel = !MD_HAS_PARENT(MD_PARENT(un)); /* we're related if trying to swap with our parent */ related = (!toplevel) && (MD_PARENT(un) == to_min); switch (rtxnp->op) { case MDRNOP_EXCHANGE: /* * check for a swap with our child */ for (smi = 0; smi < NMIRROR; smi++) { if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) { continue; } sm = &un->un_sm[smi]; if (md_getminor(sm->sm_dev) == to_min) { related |= TRUE; } } if (!related) { (void) mdmderror(&rtxnp->mde, MDE_RENAME_TARGET_UNRELATED, to_min); return (EINVAL); } break; case MDRNOP_RENAME: /* * if from is top-level and is open, then the kernel is using * the md_dev64_t. 
*/ if (toplevel && md_unit_isopen(ui)) { (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY, from_min); return (EBUSY); } break; default: (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR, from_min); return (EINVAL); } return (0); /* ok */ } /* * Named service entry point: MDRNM_CHECK */ intptr_t mirror_rename_check( md_rendelta_t *delta, md_rentxn_t *rtxnp) { mm_submirror_t *sm; mm_submirror_ic_t *smic; md_m_shared_t *shared; int ci; int i; int compcnt; mm_unit_t *un; int err = 0; ASSERT(delta); ASSERT(rtxnp); ASSERT(delta->unp); ASSERT(delta->uip); ASSERT((rtxnp->op == MDRNOP_RENAME) || (rtxnp->op == MDRNOP_EXCHANGE)); if (!delta || !rtxnp || !delta->unp || !delta->uip) { (void) mdsyserror(&rtxnp->mde, EINVAL); return (EINVAL); } un = (mm_unit_t *)delta->unp; for (i = 0; i < NMIRROR; i++) { sm = &un->un_sm[i]; smic = &un->un_smic[i]; if (!SMS_IS(sm, SMS_INUSE)) continue; ASSERT(smic->sm_get_component_count); if (!smic->sm_get_component_count) { (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR, md_getminor(delta->dev)); return (ENXIO); } compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); for (ci = 0; ci < compcnt; ci++) { ASSERT(smic->sm_shared_by_indx); if (!smic->sm_shared_by_indx) { (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR, md_getminor(delta->dev)); return (ENXIO); } shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) (sm->sm_dev, sm, ci); ASSERT(shared); if (!shared) { (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR, md_getminor(delta->dev)); return (ENXIO); } if (shared->ms_hs_id != 0) { (void) mdmderror(&rtxnp->mde, MDE_SM_FAILED_COMPS, md_getminor(delta->dev)); return (EIO); } switch (shared->ms_state) { case CS_OKAY: break; case CS_RESYNC: (void) mdmderror(&rtxnp->mde, MDE_RESYNC_ACTIVE, md_getminor(delta->dev)); return (EBUSY); default: (void) mdmderror(&rtxnp->mde, MDE_SM_FAILED_COMPS, md_getminor(delta->dev)); return (EINVAL); } } } /* self does additional checks */ if (delta->old_role == MDRR_SELF) { err = 
mirror_may_renexch_self(un, delta->uip, rtxnp); } return (err); } /* end of rename/exchange */