view usr/src/uts/common/io/lvm/mirror/mirror.c @ 14082:6db1b9319cfc
3893 lvm: incorrect flag handling
Reviewed by: Theo Schlossnagle <jesus@omniti.com>
Reviewed by: Dan McDonald <danmcd@nexenta.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
author    Prasad Joshi <pjoshi@stec-inc.com>
date      Wed, 17 Jul 2013 15:47:52 -0400
parents   6bec9720e054
children
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/dklabel.h>
#include <vm/hat.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_mirror.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_mddb.h>
#include <sys/esunddi.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/avl.h>

md_ops_t	mirror_md_ops;
#ifndef lint
md_ops_t	*md_interface_ops = &mirror_md_ops;
#endif

extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_mstr_daemon;
extern mdq_anchor_t	md_mirror_daemon;
extern mdq_anchor_t	md_mirror_io_daemon;
extern mdq_anchor_t	md_mirror_rs_daemon;
extern mdq_anchor_t	md_mhs_daemon;

extern unit_t	md_nunits;
extern set_t	md_nsets;
extern md_set_t	md_set[];

extern int	md_status;
extern clock_t	md_hz;

extern md_krwlock_t	md_unit_array_rw;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern int		md_mtioctl_cnt;

daemon_request_t	mirror_timeout;
static daemon_request_t	hotspare_request;
static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */

int	md_mirror_mcs_buf_off;

/* Flags for mdmn_ksend_message to allow debugging */
int	md_mirror_msg_flags;

#ifdef DEBUG
/* Flag to switch on debug messages */
int	mirror_debug_flag = 0;
#endif

/*
 * Struct used to hold count of DMR reads and the timestamp of last DMR read
 * It is used to verify, using a debugger, that the DMR read ioctl has been
 * executed.
 */
dmr_stats_t	mirror_dmr_stats = {0, 0};

/*
 * Mutex protecting list of non-failfast drivers.
 */
static kmutex_t	non_ff_drv_mutex;
extern char	**non_ff_drivers;

extern major_t	md_major;

/*
 * Write-On-Write memory pool.
 */
static void		copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t	*mirror_wowblk_cache = NULL;
static int		md_wowbuf_size = 16384;
static size_t		md_wowblk_size;

/*
 * This is a flag that allows:
 * - disabling the write-on-write mechanism.
 * - logging occurrences of write-on-write
 * - switching wow handling procedure processing
 * Counter for occurrences of WOW.
 */
static uint_t	md_mirror_wow_flg = 0;
static int	md_mirror_wow_cnt = 0;

/*
 * Tunable to enable/disable dirty region
 * processing when closing down a mirror.
 */
static int	new_resync = 1;

kmem_cache_t	*mirror_parent_cache = NULL;
kmem_cache_t	*mirror_child_cache = NULL;

extern int	md_ff_disable;		/* disable failfast */

static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void	mirror_read_strategy(buf_t *, int, void *);
static void	mirror_write_strategy(buf_t *, int, void *);
static void	become_owner(daemon_queue_t *);
static int	mirror_done(struct buf *cb);
static int	mirror_done_common(struct buf *cb);
static void	clear_retry_error(struct buf *cb);

/*
 * patchables
 */
int	md_min_rr_size = 200;	/* 2000 blocks, or 100k */
int	md_def_num_rr = 1000;	/* Default number of dirty regions */

/*
 * patchable to change delay before rescheduling mirror ownership request.
 * Value is clock ticks, default 0.5 seconds
 */
clock_t	md_mirror_owner_to = 500000;

/*ARGSUSED1*/
static int
mirror_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
mirror_parent_init(md_mps_t *ps)
{
	bzero(ps, offsetof(md_mps_t, ps_mx));
	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
}

/*ARGSUSED1*/
static void
mirror_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_mps_t *)p)->ps_mx);
}

/*ARGSUSED1*/
static int
mirror_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_mcs_t *)p)->cs_buf);
	return (0);
}

void
mirror_child_init(md_mcs_t *cs)
{
	cs->cs_ps = NULL;
	cs->cs_mdunit = 0;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
mirror_child_destructor(void *p, void *d)
{
	biofini(&((md_mcs_t *)p)->cs_buf);
}

static void
mirror_wowblk_init(wowhdr_t *p)
{
	bzero(p, md_wowblk_size);
}

static void
send_poke_hotspares_msg(daemon_request_t *drq)
{
	int			rval;
	int			nretries = 0;
	md_mn_msg_pokehsp_t	pokehsp;
	md_mn_kresult_t		*kresult;
	set_t			setno = (set_t)drq->dq.qlen;

	pokehsp.pokehsp_setno = setno;

	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

retry_sphmsg:
	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
	    sizeof (pokehsp), kresult);

	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
		/* If we're shutting down already, pause things here. */
		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
			while (!md_mn_is_commd_present()) {
				delay(md_hz);
			}
			/*
			 * commd has become reachable again, so retry once.
			 * If this fails we'll panic as the system is in an
			 * unexpected state.
			 */
			if (nretries++ == 0)
				goto retry_sphmsg;
		}

		cmn_err(CE_PANIC,
		    "ksend_message failure: POKE_HOTSPARES");
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));

	/* Allow further requests to use this set's queue structure */
	mutex_enter(&drq->dr_mx);
	drq->dr_pending = 0;
	mutex_exit(&drq->dr_mx);
}

/*
 * Send a poke_hotspares message to the master node. To avoid swamping the
 * commd handler with requests we only send a message if there is not one
 * already outstanding. We punt the request to a separate thread context as
 * we cannot afford to block waiting on the request to be serviced. This is
 * essential when a reconfig cycle is in progress as any open() of a multinode
 * metadevice may result in a livelock.
 */
static void
send_poke_hotspares(set_t setno)
{
	daemon_request_t	*drq = &mn_hs_request[setno];

	mutex_enter(&drq->dr_mx);
	if (drq->dr_pending == 0) {
		drq->dr_pending = 1;
		drq->dq.qlen = (int)setno;
		daemon_request(&md_mhs_daemon, send_poke_hotspares_msg,
		    (daemon_queue_t *)drq, REQ_OLD);
	}
	mutex_exit(&drq->dr_mx);
}

void
mirror_set_sm_state(
	mm_submirror_t		*sm,
	mm_submirror_ic_t	*smic,
	sm_state_t		newstate,
	int			force)
{
	int			compcnt;
	int			i;
	int			errcnt;
	sm_state_t		origstate;
	md_m_shared_t		*shared;

	if (force) {
		sm->sm_state = newstate;
		uniqtime32(&sm->sm_timestamp);
		return;
	}

	origstate = newstate;

	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
	for (i = 0, errcnt = 0; i < compcnt; i++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, i);
		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
			newstate |= SMS_COMP_ERRED;
		if (shared->ms_state & (CS_RESYNC))
			newstate |= SMS_COMP_RESYNC;
		if (shared->ms_state & CS_ERRED)
			errcnt++;
	}

	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
		newstate &= ~origstate;

	if (errcnt == compcnt)
		newstate |= SMS_ALL_ERRED;
	else
		newstate &= ~SMS_ALL_ERRED;

	sm->sm_state = newstate;
	uniqtime32(&sm->sm_timestamp);
}

static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
    int frm_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			i;
	int			compcnt;
	int			open_comp;	/* flag for open component */

	for (i = *smi; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = *cip; ci < compcnt; ci++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			/*
			 * if called from any routine but probe, we check for
			 * MDM_S_ISOPEN flag. Since probe does a pseudo open,
			 * it sets MDM_S_PROBEOPEN flag and we test for this
			 * flag. They are both exclusive tests.
			 */
			open_comp = (frm_probe) ?
			    (shared->ms_flags & MDM_S_PROBEOPEN):
			    (shared->ms_flags & MDM_S_ISOPEN);
			if (((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
			    ((shared->ms_state == CS_OKAY) ||
			    (shared->ms_state == CS_RESYNC))) ||
			    (!open_comp &&
			    (shared->ms_state == CS_LAST_ERRED))) {
				if (clr_error) {
					shared->ms_flags &= ~MDM_S_IOERR;
				}
				*cip = ci;
				*smi = i;
				return (1);
			}

			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
				shared->ms_flags &= ~MDM_S_IOERR;
			}
		}

		*cip = 0;
	}
	return (0);
}

/*ARGSUSED*/
static void
mirror_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}

/*
 * check_comp_4_hotspares
 *
 * This function attempts to allocate a hotspare for this component if the
 * component is in error. In a MN set, the function can be called in 2 modes.
 * It can be called either when a component error has been detected or when a
 * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
 * in flags and the request is sent to all nodes.
 * The handler on each of the nodes then calls this function with
 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
 *
 * For non-MN sets the function simply attempts to allocate a hotspare.
* * On entry, the following locks are held * mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set) * md_unit_writerlock * * Returns 0 if ok * 1 if the unit containing the component has been cleared while * the mdmn_ksend_message() was being executed */ extern int check_comp_4_hotspares( mm_unit_t *un, int smi, int ci, uint_t flags, mddb_recid_t hs_id, /* Only used by MN disksets */ IOLOCK *lockp /* can be NULL */ ) { mm_submirror_t *sm; mm_submirror_ic_t *smic; md_m_shared_t *shared; mddb_recid_t recids[6]; minor_t mnum; intptr_t (*hs_dev)(); void (*hs_done)(); void *hs_data; md_error_t mde = mdnullerror; set_t setno; md_mn_msg_allochsp_t allochspmsg; md_mn_kresult_t *kresult; mm_unit_t *new_un; int rval; int nretries = 0; mnum = MD_SID(un); setno = MD_UN2SET(un); sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) (sm->sm_dev, sm, ci); if (shared->ms_state != CS_ERRED) return (0); /* Don't start a new component resync if a resync is already running. */ if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) return (0); if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) { uint_t msgflags; md_mn_msgtype_t msgtype; /* Send allocate hotspare message to all nodes */ allochspmsg.msg_allochsp_mnum = un->c.un_self_id; allochspmsg.msg_allochsp_sm = smi; allochspmsg.msg_allochsp_comp = ci; allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id; /* * Before calling mdmn_ksend_message(), release locks * Can never be in the context of an ioctl. */ md_unit_writerexit(MDI_UNIT(mnum)); if (flags & MD_HOTSPARE_LINKHELD) rw_exit(&mirror_md_ops.md_link_rw.lock); #ifdef DEBUG if (mirror_debug_flag) printf("send alloc hotspare, flags=" "0x%x %x, %x, %x, %x\n", flags, allochspmsg.msg_allochsp_mnum, allochspmsg.msg_allochsp_sm, allochspmsg.msg_allochsp_comp, allochspmsg.msg_allochsp_hs_id); #endif if (flags & MD_HOTSPARE_WMUPDATE) { msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2; /* * When coming from an update of watermarks, there * must already be a message logged that triggered * this action. So, no need to log this message, too. */ msgflags = MD_MSGF_NO_LOG; } else { msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE; msgflags = MD_MSGF_DEFAULT_FLAGS; } kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); cc4hs_msg: rval = mdmn_ksend_message(setno, msgtype, msgflags, 0, (char *)&allochspmsg, sizeof (allochspmsg), kresult); if (!MDMN_KSEND_MSG_OK(rval, kresult)) { #ifdef DEBUG if (mirror_debug_flag) mdmn_ksend_show_error(rval, kresult, "ALLOCATE HOTSPARE"); #endif /* * If message is sent ok but exitval indicates an error * it must be because the mirror has been cleared. In * this case re-obtain lock and return an error */ if ((rval == 0) && (kresult->kmmr_exitval != 0)) { if (flags & MD_HOTSPARE_LINKHELD) { rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); } kmem_free(kresult, sizeof (md_mn_kresult_t)); return (1); } /* If we're shutting down already, pause things here. */ if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { while (!md_mn_is_commd_present()) { delay(md_hz); } /* * commd has become reachable again, so retry * once. If this fails we'll panic as the * system is in an unexpected state. 
*/ if (nretries++ == 0) goto cc4hs_msg; } cmn_err(CE_PANIC, "ksend_message failure: ALLOCATE_HOTSPARE"); } kmem_free(kresult, sizeof (md_mn_kresult_t)); /* * re-obtain the locks */ if (flags & MD_HOTSPARE_LINKHELD) rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); new_un = md_unit_writerlock(MDI_UNIT(mnum)); /* * As we had to release the locks in order to send the * message to all nodes, we need to check to see if the * unit has changed. If it has we release the writerlock * and return fail. */ if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) { md_unit_writerexit(MDI_UNIT(mnum)); return (1); } } else { if (MD_MNSET_SETNO(setno)) { /* * If 2 or more nodes simultaneously see a * component failure, these nodes will each * send an ALLOCATE_HOTSPARE[2] message. * The first message will allocate the hotspare * and the subsequent messages should do nothing. * * If a slave node doesn't have a hotspare allocated * at the time the message is initiated, then the * passed in hs_id will be 0. If the node * executing this routine has a component shared * ms_hs_id of non-zero, but the message shows a * hs_id of 0, then just return since a hotspare * has already been allocated for this failing * component. When the slave node returns from * the ksend_message the hotspare will have * already been allocated. * * If the slave node does send an hs_id of non-zero, * and the slave node's hs_id matches this node's * ms_hs_id, then the hotspare has error'd and * should be replaced. * * If the slave node sends an hs_id of non-zero and * this node has a different shared ms_hs_id, then * just return since this hotspare has already * been hotspared. */ if (shared->ms_hs_id != 0) { if (hs_id == 0) { #ifdef DEBUG if (mirror_debug_flag) { printf("check_comp_4_hotspares" "(NOXMIT), short circuit " "hs_id=0x%x, " "ms_hs_id=0x%x\n", hs_id, shared->ms_hs_id); } #endif return (0); } if (hs_id != shared->ms_hs_id) { #ifdef DEBUG if (mirror_debug_flag) { printf("check_comp_4_hotspares" "(NOXMIT), short circuit2 " "hs_id=0x%x, " "ms_hs_id=0x%x\n", hs_id, shared->ms_hs_id); } #endif return (0); } } } sm = &un->un_sm[smi]; hs_dev = md_get_named_service(sm->sm_dev, 0, "hotspare device", 0); if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done, &hs_data) != 0) return (0); /* * set_sm_comp_state() commits the modified records. * As we don't transmit the changes, no need to drop the lock. */ set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, MD_STATE_NO_XMIT, (IOLOCK *)NULL); (*hs_done)(sm->sm_dev, hs_data); mirror_check_failfast(mnum); SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE, setno, MD_SID(un)); /* * For a multi-node set we need to reset the un_rs_type, * un_rs_resync_done and un_rs_resync_2_do fields as the * hot-spare resync must copy all applicable data. */ if (MD_MNSET_SETNO(setno)) { un->un_rs_type = MD_RS_NONE; un->un_rs_resync_done = 0; un->un_rs_resync_2_do = 0; } /* * Must drop writer lock since mirror_resync_unit will * open devices and must be able to grab readerlock. * Don't need to drop IOLOCK since any descendent routines * calling ksend_messages will drop the IOLOCK as needed. 
* */ if (lockp) { md_ioctl_writerexit(lockp); } else { md_unit_writerexit(MDI_UNIT(mnum)); } /* start resync */ (void) mirror_resync_unit(mnum, NULL, &mde, lockp); if (lockp) { new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum)); } else { new_un = md_unit_writerlock(MDI_UNIT(mnum)); } } return (0); } /* * check_unit_4_hotspares * * For a given mirror, allocate hotspares, if available for any components * that are in error * * Returns 0 if ok * 1 if check_comp_4_hotspares returns non-zero. This will only * happen for a MN unit where the unit has been cleared while * the allocate hotspare message is sent to all nodes. */ static int check_unit_4_hotspares(mm_unit_t *un, int flags) { mm_submirror_t *sm; mm_submirror_ic_t *smic; int ci; int i; int compcnt; if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) return (0); for (i = 0; i < NMIRROR; i++) { sm = &un->un_sm[i]; smic = &un->un_smic[i]; if (!SMS_IS(sm, SMS_INUSE)) continue; compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm); for (ci = 0; ci < compcnt; ci++) { md_m_shared_t *shared; shared = (md_m_shared_t *) (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); /* * Never called from ioctl context, so pass in * (IOLOCK *)NULL. Pass through flags from calling * routine, also setting XMIT flag. */ if (check_comp_4_hotspares(un, i, ci, (MD_HOTSPARE_XMIT | flags), shared->ms_hs_id, (IOLOCK *)NULL) != 0) return (1); } } return (0); } static void check_4_hotspares(daemon_request_t *drq) { mdi_unit_t *ui; mm_unit_t *un; md_link_t *next; int x; mutex_enter(&drq->dr_mx); /* clear up front so can poke */ drq->dr_pending = 0; /* again in low level routine if */ mutex_exit(&drq->dr_mx); /* something found to do */ /* * Used to have a problem here. The disksets weren't marked as being * MNHOLD. This opened a window where we could be searching for * hotspares and have the disk set unloaded (released) from under * us causing a panic in stripe_component_count(). * The way to prevent that is to mark the set MNHOLD which prevents * any diskset from being released while we are scanning the mirrors, * submirrors and components. */ for (x = 0; x < md_nsets; x++) md_holdset_enter(x); rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) { ui = MDI_UNIT(next->ln_id); un = (mm_unit_t *)md_unit_readerlock(ui); /* * Only check the unit if we are the master for this set * For an MN set, poke_hotspares() is only effective on the * master */ if (MD_MNSET_SETNO(MD_UN2SET(un)) && md_set[MD_UN2SET(un)].s_am_i_master == 0) { md_unit_readerexit(ui); continue; } if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) { md_unit_readerexit(ui); continue; } md_unit_readerexit(ui); un = (mm_unit_t *)md_unit_writerlock(ui); /* * check_unit_4_hotspares will exit 1 if the unit has been * removed during the process of allocating the hotspare. * This can only happen for a MN metadevice. If unit no longer * exists, no need to release writerlock */ if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0) md_unit_writerexit(ui); else { /* * If check_unit_4_hotspares failed, queue another * request and break out of this one */ (void) poke_hotspares(); break; } } rw_exit(&mirror_md_ops.md_link_rw.lock); for (x = 0; x < md_nsets; x++) md_holdset_exit(x); } /* * poke_hotspares * * If there is not a pending poke_hotspares request pending, queue a requent * to call check_4_hotspares(). This will scan all mirrors and attempt to * allocate hotspares for all components in error. 
*/ int poke_hotspares() { mutex_enter(&hotspare_request.dr_mx); if (hotspare_request.dr_pending == 0) { hotspare_request.dr_pending = 1; daemon_request(&md_mhs_daemon, check_4_hotspares, (daemon_queue_t *)&hotspare_request, REQ_OLD); } mutex_exit(&hotspare_request.dr_mx); return (0); } static void free_all_ecomps(err_comp_t *ecomp) { err_comp_t *d; while (ecomp != NULL) { d = ecomp; ecomp = ecomp->ec_next; kmem_free(d, sizeof (err_comp_t)); } } /* * NAME: mirror_openfail_console_info * * DESCRIPTION: Prints a informative message to the console when mirror * cannot be opened. * * PARAMETERS: mm_unit_t un - pointer to mirror unit structure * int smi - submirror index * int ci - component index */ void mirror_openfail_console_info(mm_unit_t *un, int smi, int ci) { void (*get_dev)(); ms_cd_info_t cd; md_dev64_t tmpdev; tmpdev = un->un_sm[smi].sm_dev; get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0); if (get_dev != NULL) { (void) (*get_dev)(tmpdev, smi, ci, &cd); cmn_err(CE_WARN, "md %s: open error on %s", md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un), cd.cd_dev, NULL, 0)); } else { cmn_err(CE_WARN, "md %s: open error", md_shortname(MD_SID(un))); } } static int mirror_close_all_devs(mm_unit_t *un, int md_cflags) { int i; md_dev64_t dev; for (i = 0; i < NMIRROR; i++) { if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) continue; dev = un->un_sm[i].sm_dev; md_layered_close(dev, md_cflags); } return (0); } /* * Keep track of drivers that don't support failfast. We use this so that * we only log one diagnostic message for each of these drivers, no matter * how many times we run the mirror_check_failfast function. * Return 1 if this is a new driver that does not support failfast, * return 0 if we have already seen this non-failfast driver. */ static int new_non_ff_driver(const char *s) { mutex_enter(&non_ff_drv_mutex); if (non_ff_drivers == NULL) { non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *), KM_NOSLEEP); if (non_ff_drivers == NULL) { mutex_exit(&non_ff_drv_mutex); return (1); } non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); if (non_ff_drivers[0] == NULL) { kmem_free(non_ff_drivers, 2 * sizeof (char *)); non_ff_drivers = NULL; mutex_exit(&non_ff_drv_mutex); return (1); } (void) strcpy(non_ff_drivers[0], s); non_ff_drivers[1] = NULL; } else { int i; char **tnames; char **tmp; for (i = 0; non_ff_drivers[i] != NULL; i++) { if (strcmp(s, non_ff_drivers[i]) == 0) { mutex_exit(&non_ff_drv_mutex); return (0); } } /* allow for new element and null */ i += 2; tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP); if (tnames == NULL) { mutex_exit(&non_ff_drv_mutex); return (1); } for (i = 0; non_ff_drivers[i] != NULL; i++) tnames[i] = non_ff_drivers[i]; tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); if (tnames[i] == NULL) { /* adjust i so that it is the right count to free */ kmem_free(tnames, (i + 2) * sizeof (char *)); mutex_exit(&non_ff_drv_mutex); return (1); } (void) strcpy(tnames[i++], s); tnames[i] = NULL; tmp = non_ff_drivers; non_ff_drivers = tnames; /* i now represents the count we previously alloced */ kmem_free(tmp, i * sizeof (char *)); } mutex_exit(&non_ff_drv_mutex); return (1); } /* * Check for the "ddi-failfast-supported" devtree property on each submirror * component to indicate if we should do I/O to that submirror with the * B_FAILFAST flag set or not. This check is made at various state transitions * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we * only need to check one drive (e.g. 
hotspare) but since the check is * fast and infrequent and sometimes needs to be done on all components we * just check all components on each call. */ void mirror_check_failfast(minor_t mnum) { int i; mm_unit_t *un; if (md_ff_disable) return; un = MD_UNIT(mnum); for (i = 0; i < NMIRROR; i++) { int ci; int cnt; int ff = 1; mm_submirror_t *sm; mm_submirror_ic_t *smic; void (*get_dev)(); if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) continue; sm = &un->un_sm[i]; smic = &un->un_smic[i]; get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, "get device", 0); cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); for (ci = 0; ci < cnt; ci++) { int found = 0; dev_t ci_dev; major_t major; dev_info_t *devi; ms_cd_info_t cd; /* * this already returns the hs * dev if the device is spared */ (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); ci_dev = md_dev64_to_dev(cd.cd_dev); major = getmajor(ci_dev); if (major == md_major) { /* * this component must be a soft * partition; get the real dev */ minor_t dev_mnum; mdi_unit_t *ui; mp_unit_t *un; set_t setno; side_t side; md_dev64_t tmpdev; ui = MDI_UNIT(getminor(ci_dev)); /* grab necessary lock */ un = (mp_unit_t *)md_unit_readerlock(ui); dev_mnum = MD_SID(un); setno = MD_MIN2SET(dev_mnum); side = mddb_getsidenum(setno); tmpdev = un->un_dev; /* Get dev by device id */ if (md_devid_found(setno, side, un->un_key) == 1) { tmpdev = md_resolve_bydevid(dev_mnum, tmpdev, un->un_key); } md_unit_readerexit(ui); ci_dev = md_dev64_to_dev(tmpdev); major = getmajor(ci_dev); } if (ci_dev != NODEV32 && (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) != NULL) { ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; int propvalue = 0; int proplength = sizeof (int); int error; struct cb_ops *cb; if ((cb = devopsp[major]->devo_cb_ops) != NULL) { error = (*cb->cb_prop_op) (DDI_DEV_T_ANY, devi, prop_op, DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, "ddi-failfast-supported", (caddr_t)&propvalue, &proplength); if (error == DDI_PROP_SUCCESS) found = 1; } if (!found && new_non_ff_driver( ddi_driver_name(devi))) { cmn_err(CE_NOTE, "!md: B_FAILFAST I/O" "disabled on %s", ddi_driver_name(devi)); } ddi_release_devi(devi); } /* * All components must support * failfast in the submirror. */ if (!found) { ff = 0; break; } } if (ff) { sm->sm_flags |= MD_SM_FAILFAST; } else { sm->sm_flags &= ~MD_SM_FAILFAST; } } } /* * Return true if the submirror is unavailable. * If any of the submirror components are opened then the submirror cannot * be unavailable (MD_INACCESSIBLE). * If any of the components are already in the errored state, then the submirror * cannot be unavailable (MD_INACCESSIBLE). 
*/ static bool_t submirror_unavailable(mm_unit_t *un, int smi, int from_probe) { mm_submirror_t *sm; mm_submirror_ic_t *smic; md_m_shared_t *shared; int ci; int compcnt; sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); for (ci = 0; ci < compcnt; ci++) { shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) (sm->sm_dev, sm, ci); if (from_probe) { if (shared->ms_flags & MDM_S_PROBEOPEN) return (B_FALSE); } else { if (shared->ms_flags & MDM_S_ISOPEN) return (B_FALSE); } if (shared->ms_state == CS_ERRED || shared->ms_state == CS_LAST_ERRED) return (B_FALSE); } return (B_TRUE); } static int mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp) { int i; mm_unit_t *un; mdi_unit_t *ui; int err; int smi; int ci; err_comp_t *c; err_comp_t *ecomps = NULL; int smmask = 0; set_t setno; int sm_cnt; int sm_unavail_cnt; mirror_check_failfast(mnum); un = MD_UNIT(mnum); ui = MDI_UNIT(mnum); setno = MD_UN2SET(un); for (i = 0; i < NMIRROR; i++) { md_dev64_t tmpdev = un->un_sm[i].sm_dev; if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) continue; if (md_layered_open(mnum, &tmpdev, md_oflags)) smmask |= SMI2BIT(i); un->un_sm[i].sm_dev = tmpdev; } /* * If smmask is clear, all submirrors are accessible. Clear the * MD_INACCESSIBLE bit in this case. This bit is also cleared for the * mirror device. If smmask is set, we have to determine which of the * submirrors are in error. If no submirror is accessible we mark the * whole mirror as MD_INACCESSIBLE. */ if (smmask == 0) { if (lockp) { md_ioctl_readerexit(lockp); (void) md_ioctl_writerlock(lockp, ui); } else { md_unit_readerexit(ui); (void) md_unit_writerlock(ui); } ui->ui_tstate &= ~MD_INACCESSIBLE; if (lockp) { md_ioctl_writerexit(lockp); (void) md_ioctl_readerlock(lockp, ui); } else { md_unit_writerexit(ui); (void) md_unit_readerlock(ui); } for (i = 0; i < NMIRROR; i++) { md_dev64_t tmpdev; mdi_unit_t *sm_ui; if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) continue; tmpdev = un->un_sm[i].sm_dev; sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); (void) md_unit_writerlock(sm_ui); sm_ui->ui_tstate &= ~MD_INACCESSIBLE; md_unit_writerexit(sm_ui); } return (0); } for (i = 0; i < NMIRROR; i++) { md_dev64_t tmpdev; if (!(smmask & SMI2BIT(i))) continue; tmpdev = un->un_sm[i].sm_dev; err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS); un->un_sm[i].sm_dev = tmpdev; ASSERT(err == 0); } if (lockp) { md_ioctl_readerexit(lockp); un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui); } else { md_unit_readerexit(ui); un = (mm_unit_t *)md_unit_writerlock(ui); } /* * We want to make sure the unavailable flag is not masking a real * error on the submirror. * For each submirror, * if all of the submirror components couldn't be opened and there * are no errors on the submirror, then set the unavailable flag * otherwise, clear unavailable. */ sm_cnt = 0; sm_unavail_cnt = 0; for (i = 0; i < NMIRROR; i++) { md_dev64_t tmpdev; mdi_unit_t *sm_ui; if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) continue; sm_cnt++; tmpdev = un->un_sm[i].sm_dev; sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); (void) md_unit_writerlock(sm_ui); if (submirror_unavailable(un, i, 0)) { sm_ui->ui_tstate |= MD_INACCESSIBLE; sm_unavail_cnt++; } else { sm_ui->ui_tstate &= ~MD_INACCESSIBLE; } md_unit_writerexit(sm_ui); } /* * If all of the submirrors are unavailable, the mirror is also * unavailable. 
*/ if (sm_cnt == sm_unavail_cnt) { ui->ui_tstate |= MD_INACCESSIBLE; } else { ui->ui_tstate &= ~MD_INACCESSIBLE; } smi = 0; ci = 0; while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { if (mirror_other_sources(un, smi, ci, 1) == 1) { free_all_ecomps(ecomps); (void) mirror_close_all_devs(un, md_oflags); SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, setno, MD_SID(un)); mirror_openfail_console_info(un, smi, ci); if (lockp) { md_ioctl_writerexit(lockp); (void) md_ioctl_readerlock(lockp, ui); } else { md_unit_writerexit(ui); (void) md_unit_readerlock(ui); } return (ENXIO); } /* track all component states that need changing */ c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP); c->ec_next = ecomps; c->ec_smi = smi; c->ec_ci = ci; ecomps = c; ci++; } /* Make all state changes and commit them */ for (c = ecomps; c != NULL; c = c->ec_next) { /* * If lockp is set, then entering kernel through ioctl. * For a MN set, the only ioctl path is via a commd message * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already * being sent to each node. * In this case, set NO_XMIT so that set_sm_comp_state * won't attempt to send a message on a message. * * In !MN sets, the xmit flag is ignored, so it doesn't matter * which flag is passed. */ if (lockp) { set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, MD_STATE_NO_XMIT, lockp); } else { set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, (MD_STATE_XMIT | MD_STATE_OCHELD), lockp); } /* * For a MN set, the NOTIFY is done when the state change is * processed on each node */ if (!MD_MNSET_SETNO(setno)) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE, setno, MD_SID(un)); } } if (lockp) { md_ioctl_writerexit(lockp); (void) md_ioctl_readerlock(lockp, ui); } else { md_unit_writerexit(ui); (void) md_unit_readerlock(ui); } free_all_ecomps(ecomps); /* allocate hotspares for all errored components */ if (MD_MNSET_SETNO(setno)) { /* * If we're called from an ioctl (lockp set) then we cannot * directly call send_poke_hotspares as this will block until * the message gets despatched to all nodes. If the cluster is * going through a reconfig cycle then the message will block * until the cycle is complete, and as we originate from a * service call from commd we will livelock. */ if (lockp == NULL) { md_unit_readerexit(ui); send_poke_hotspares(setno); (void) md_unit_readerlock(ui); } } else { (void) poke_hotspares(); } return (0); } void mirror_overlap_tree_remove(md_mps_t *ps) { mm_unit_t *un; if (panicstr) return; VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP); un = ps->ps_un; mutex_enter(&un->un_overlap_tree_mx); avl_remove(&un->un_overlap_root, ps); ps->ps_flags &= ~MD_MPS_ON_OVERLAP; if (un->un_overlap_tree_flag != 0) { un->un_overlap_tree_flag = 0; cv_broadcast(&un->un_overlap_tree_cv); } mutex_exit(&un->un_overlap_tree_mx); } /* * wait_for_overlaps: * ----------------- * Check that given i/o request does not cause an overlap with already pending * i/o. If it does, block until the overlapped i/o completes. * * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if * it must not already be in the tree. 
*/ static void wait_for_overlaps(md_mps_t *ps, int flags) { mm_unit_t *un; avl_index_t where; md_mps_t *ps1; if (panicstr) return; un = ps->ps_un; mutex_enter(&un->un_overlap_tree_mx); if ((flags & MD_OVERLAP_ALLOW_REPEAT) && (ps->ps_flags & MD_MPS_ON_OVERLAP)) { mutex_exit(&un->un_overlap_tree_mx); return; } VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); do { ps1 = avl_find(&un->un_overlap_root, ps, &where); if (ps1 == NULL) { /* * The candidate range does not overlap with any * range in the tree. Insert it and be done. */ avl_insert(&un->un_overlap_root, ps, where); ps->ps_flags |= MD_MPS_ON_OVERLAP; } else { /* * The candidate range would overlap. Set the flag * indicating we need to be woken up, and sleep * until another thread removes a range. If upon * waking up we find this mps was put on the tree * by another thread, the loop terminates. */ un->un_overlap_tree_flag = 1; cv_wait(&un->un_overlap_tree_cv, &un->un_overlap_tree_mx); } } while (!(ps->ps_flags & MD_MPS_ON_OVERLAP)); mutex_exit(&un->un_overlap_tree_mx); } /* * This function is called from mirror_done to check whether any pages have * been modified while a mirrored write was in progress. Returns 0 if * all pages associated with bp are clean, 1 otherwise. */ static int any_pages_dirty(struct buf *bp) { int rval; rval = biomodified(bp); if (rval == -1) rval = 0; return (rval); } #define MAX_EXTRAS 10 void mirror_commit( mm_unit_t *un, int smmask, mddb_recid_t *extras ) { mm_submirror_t *sm; md_unit_t *su; int i; /* 2=mirror,null id */ mddb_recid_t recids[NMIRROR+2+MAX_EXTRAS]; int ri = 0; if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE) return; /* Add two, this includes the mirror unit and the null recid */ if (extras != NULL) { int nrecids = 0; while (extras[nrecids] != 0) { nrecids++; } ASSERT(nrecids <= MAX_EXTRAS); } if (un != NULL) recids[ri++] = un->c.un_record_id; for (i = 0; i < NMIRROR; i++) { if (!(smmask & SMI2BIT(i))) continue; sm = &un->un_sm[i]; if (!SMS_IS(sm, SMS_INUSE)) continue; if (md_getmajor(sm->sm_dev) != md_major) continue; su = MD_UNIT(md_getminor(sm->sm_dev)); recids[ri++] = su->c.un_record_id; } if (extras != NULL) while (*extras != 0) { recids[ri++] = *extras; extras++; } if (ri == 0) return; recids[ri] = 0; /* * Ok to hold ioctl lock across record commit to mddb as * long as the record(s) being committed aren't resync records. */ mddb_commitrecs_wrapper(recids); } /* * This routine is used to set a bit in the writable_bm bitmap * which represents each submirror in a metamirror which * is writable. The first writable submirror index is assigned * to the sm_index. The number of writable submirrors are returned in nunits. * * This routine returns the submirror's unit number. 
*/ static void select_write_units(struct mm_unit *un, md_mps_t *ps) { int i; unsigned writable_bm = 0; unsigned nunits = 0; for (i = 0; i < NMIRROR; i++) { if (SUBMIRROR_IS_WRITEABLE(un, i)) { /* set bit of all writable units */ writable_bm |= SMI2BIT(i); nunits++; } } ps->ps_writable_sm = writable_bm; ps->ps_active_cnt = nunits; ps->ps_current_sm = 0; } static unsigned select_write_after_read_units(struct mm_unit *un, md_mps_t *ps) { int i; unsigned writable_bm = 0; unsigned nunits = 0; for (i = 0; i < NMIRROR; i++) { if (SUBMIRROR_IS_WRITEABLE(un, i) && un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) { writable_bm |= SMI2BIT(i); nunits++; } } if ((writable_bm & ps->ps_allfrom_sm) != 0) { writable_bm &= ~ps->ps_allfrom_sm; nunits--; } ps->ps_writable_sm = writable_bm; ps->ps_active_cnt = nunits; ps->ps_current_sm = 0; return (nunits); } static md_dev64_t select_read_unit( mm_unit_t *un, diskaddr_t blkno, u_longlong_t reqcount, u_longlong_t *cando, int must_be_opened, md_m_shared_t **shared, md_mcs_t *cs) { int i; md_m_shared_t *s; uint_t lasterrcnt = 0; md_dev64_t dev = 0; u_longlong_t cnt; u_longlong_t mincnt; mm_submirror_t *sm; mm_submirror_ic_t *smic; mdi_unit_t *ui; mincnt = reqcount; for (i = 0; i < NMIRROR; i++) { if (!SUBMIRROR_IS_READABLE(un, i)) continue; sm = &un->un_sm[i]; smic = &un->un_smic[i]; cnt = reqcount; /* * If the current submirror is marked as inaccessible, do not * try to access it. */ ui = MDI_UNIT(getminor(expldev(sm->sm_dev))); (void) md_unit_readerlock(ui); if (ui->ui_tstate & MD_INACCESSIBLE) { md_unit_readerexit(ui); continue; } md_unit_readerexit(ui); s = (md_m_shared_t *)(*(smic->sm_shared_by_blk)) (sm->sm_dev, sm, blkno, &cnt); if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN)) continue; if (s->ms_state == CS_OKAY) { *cando = cnt; if (shared != NULL) *shared = s; if (un->un_sm[i].sm_flags & MD_SM_FAILFAST && cs != NULL) { cs->cs_buf.b_flags |= B_FAILFAST; } return (un->un_sm[i].sm_dev); } if (s->ms_state != CS_LAST_ERRED) continue; /* don't use B_FAILFAST since we're Last Erred */ if (mincnt > cnt) mincnt = cnt; if (s->ms_lasterrcnt > lasterrcnt) { lasterrcnt = s->ms_lasterrcnt; if (shared != NULL) *shared = s; dev = un->un_sm[i].sm_dev; } } *cando = mincnt; return (dev); } /* * Given a 32-bit bitmap, this routine will return the bit number * of the nth bit set. The nth bit set is passed via the index integer. * * This routine is used to run through the writable submirror bitmap * and starting all of the writes. See the value returned is the * index to appropriate submirror structure, in the md_sm * array for metamirrors. */ static int md_find_nth_unit(uint_t mask, int index) { int bit, nfound; for (bit = -1, nfound = -1; nfound != index; bit++) { ASSERT(mask != 0); nfound += (mask & 1); mask >>= 1; } return (bit); } static int fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs) { mm_unit_t *un; buf_t *bp; int i; unsigned nunits = 0; int iunit; uint_t running_bm = 0; uint_t sm_index; bp = &cs->cs_buf; un = ps->ps_un; for (i = 0; i < NMIRROR; i++) { if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING)) continue; running_bm |= SMI2BIT(i); nunits++; } if (nunits == 0) return (1); /* * For directed mirror read (DMR) we only use the specified side and * do not compute the source of the read. * If we're running with MD_MPS_DIRTY_RD set we always return the * first mirror side (this prevents unnecessary ownership switching). 
* Otherwise we return the submirror according to the mirror read option */ if (ps->ps_flags & MD_MPS_DMR) { sm_index = un->un_dmr_last_read; } else if (ps->ps_flags & MD_MPS_DIRTY_RD) { sm_index = md_find_nth_unit(running_bm, 0); } else { /* Normal (non-DMR) operation */ switch (un->un_read_option) { case RD_GEOMETRY: iunit = (int)(bp->b_lblkno / howmany(un->c.un_total_blocks, nunits)); sm_index = md_find_nth_unit(running_bm, iunit); break; case RD_FIRST: sm_index = md_find_nth_unit(running_bm, 0); break; case RD_LOAD_BAL: /* this is intentional to fall into the default */ default: un->un_last_read = (un->un_last_read + 1) % nunits; sm_index = md_find_nth_unit(running_bm, un->un_last_read); break; } } bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev); ps->ps_allfrom_sm = SMI2BIT(sm_index); if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) { bp->b_flags |= B_FAILFAST; } return (0); } static int mirror_are_submirrors_available(mm_unit_t *un) { int i; for (i = 0; i < NMIRROR; i++) { md_dev64_t tmpdev = un->un_sm[i].sm_dev; if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) || md_getmajor(tmpdev) != md_major) continue; if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) || (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits)) return (0); if (MDI_UNIT(md_getminor(tmpdev)) == NULL) return (0); } return (1); } void build_submirror(mm_unit_t *un, int i, int snarfing) { struct mm_submirror *sm; struct mm_submirror_ic *smic; md_unit_t *su; set_t setno; sm = &un->un_sm[i]; smic = &un->un_smic[i]; sm->sm_flags = 0; /* sometime we may need to do more here */ setno = MD_UN2SET(un); if (!SMS_IS(sm, SMS_INUSE)) return; if (snarfing) { sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno), sm->sm_key, MD_NOTRUST_DEVT); } else { if (md_getmajor(sm->sm_dev) == md_major) { su = MD_UNIT(md_getminor(sm->sm_dev)); un->c.un_flag |= (su->c.un_flag & MD_LABELED); /* submirror can no longer be soft partitioned */ MD_CAPAB(su) &= (~MD_CAN_SP); } } smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev, 0, "shared by blk", 0); smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev, 0, "shared by indx", 0); smic->sm_get_component_count = (int (*)())md_get_named_service( sm->sm_dev, 0, "get component count", 0); smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0, "get block count skip size", 0); sm->sm_state &= ~SMS_IGNORE; if (SMS_IS(sm, SMS_OFFLINE)) MD_STATUS(un) |= MD_UN_OFFLINE_SM; md_set_parent(sm->sm_dev, MD_SID(un)); } static void mirror_cleanup(mm_unit_t *un) { mddb_recid_t recid; int smi; sv_dev_t sv[NMIRROR]; int nsv = 0; /* * If a MN diskset and this node is not the master, do * not delete any records on snarf of the mirror records. */ if (MD_MNSET_SETNO(MD_UN2SET(un)) && md_set[MD_UN2SET(un)].s_am_i_master == 0) { return; } for (smi = 0; smi < NMIRROR; smi++) { if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) continue; sv[nsv].setno = MD_UN2SET(un); sv[nsv++].key = un->un_sm[smi].sm_key; } recid = un->un_rr_dirty_recid; mddb_deleterec_wrapper(un->c.un_record_id); if (recid > 0) mddb_deleterec_wrapper(recid); md_rem_names(sv, nsv); } /* * Comparison function for the avl tree which tracks * outstanding writes on submirrors. 
* * Returns: * -1: ps1 < ps2 * 0: ps1 and ps2 overlap * 1: ps1 > ps2 */ static int mirror_overlap_compare(const void *p1, const void *p2) { const md_mps_t *ps1 = (md_mps_t *)p1; const md_mps_t *ps2 = (md_mps_t *)p2; if (ps1->ps_firstblk < ps2->ps_firstblk) { if (ps1->ps_lastblk >= ps2->ps_firstblk) return (0); return (-1); } if (ps1->ps_firstblk > ps2->ps_firstblk) { if (ps1->ps_firstblk <= ps2->ps_lastblk) return (0); return (1); } return (0); } /* * Collapse any sparse submirror entries snarfed from the on-disk replica. * Only the in-core entries are updated. The replica will be updated on-disk * when the in-core replica is committed on shutdown of the SVM subsystem. */ static void collapse_submirrors(mm_unit_t *un) { int smi, nremovals, smiremove; mm_submirror_t *sm, *new_sm, *old_sm; mm_submirror_ic_t *smic; int nsmidx = un->un_nsm - 1; rescan: nremovals = 0; smiremove = -1; for (smi = 0; smi <= nsmidx; smi++) { sm = &un->un_sm[smi]; /* * Check to see if this submirror is marked as in-use. * If it isn't then it is a potential sparse entry and * may need to be cleared from the configuration. * The records should _already_ have been cleared by the * original mirror_detach() code, but we need to shuffle * any NULL entries in un_sm[] to the end of the array. * Any NULL un_smic[] entries need to be reset to the underlying * submirror/slice accessor functions. */ if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) { nremovals++; smiremove = smi; break; } } if (nremovals == 0) { /* * Ensure that we have a matching contiguous set of un_smic[] * entries for the corresponding un_sm[] entries */ for (smi = 0; smi <= nsmidx; smi++) { smic = &un->un_smic[smi]; sm = &un->un_sm[smi]; smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev, 0, "shared by_blk", 0); smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev, 0, "shared by indx", 0); smic->sm_get_component_count = (int (*)())md_get_named_service(sm->sm_dev, 0, "get component count", 0); smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0, "get block count skip size", 0); } return; } /* * Reshuffle the submirror devices so that we do not have a dead record * in the middle of the array. Once we've done this we need to rescan * the mirror to check for any other holes. */ for (smi = 0; smi < NMIRROR; smi++) { if (smi < smiremove) continue; if (smi > smiremove) { old_sm = &un->un_sm[smi]; new_sm = &un->un_sm[smi - 1]; bcopy(old_sm, new_sm, sizeof (mm_submirror_t)); bzero(old_sm, sizeof (mm_submirror_t)); } } /* * Now we need to rescan the array to find the next potential dead * entry. */ goto rescan; } /* Return a -1 if optimized record unavailable and set should be released */ int mirror_build_incore(mm_unit_t *un, int snarfing) { int i; if (MD_STATUS(un) & MD_UN_BEING_RESET) { mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); return (1); } if (mirror_are_submirrors_available(un) == 0) return (1); if (MD_UNIT(MD_SID(un)) != NULL) return (0); MD_STATUS(un) = 0; /* pre-4.1 didn't define CAN_META_CHILD capability */ MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP; un->un_overlap_tree_flag = 0; avl_create(&un->un_overlap_root, mirror_overlap_compare, sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node)); /* * We need to collapse any sparse submirror entries into a non-sparse * array. This is to cover the case where we have an old replica image * which has not been updated (i.e. snarfed) since being modified. * The new code expects all submirror access to be sequential (i.e. 
* both the un_sm[] and un_smic[] entries correspond to non-empty * submirrors. */ collapse_submirrors(un); for (i = 0; i < NMIRROR; i++) build_submirror(un, i, snarfing); if (unit_setup_resync(un, snarfing) != 0) { if (snarfing) { mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT); /* * If a MN set and set is not stale, then return -1 * which will force the caller to unload the set. * The MN diskset nodes will return failure if * unit_setup_resync fails so that nodes won't * get out of sync. * * If set is STALE, the master node can't allocate * a resync record (if needed), but node needs to * join the set so that user can delete broken mddbs. * So, if set is STALE, just continue on. */ if (MD_MNSET_SETNO(MD_UN2SET(un)) && !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { return (-1); } } else return (1); } mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL); cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL); un->un_suspend_wr_flag = 0; mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL); cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL); /* * Allocate mutexes for mirror-owner and resync-owner changes. * All references to the owner message state field must be guarded * by this mutex. */ mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL); /* * Allocate mutex and condvar for resync thread manipulation. These * will be used by mirror_resync_unit/mirror_ioctl_resync */ mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL); cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL); /* * Allocate mutex and condvar for resync progress thread manipulation. * This allows resyncs to be continued across an intervening reboot. */ mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL); cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL); /* * Allocate mutex and condvar for Directed Mirror Reads (DMR). This * provides synchronization between a user-ioctl and the resulting * strategy() call that performs the read(). */ mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL); cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL); /* * Allocate rwlocks for un_pernode_dirty_bm accessing. */ for (i = 0; i < MD_MNMAXSIDES; i++) { rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL); } /* place various information in the in-core data structures */ md_nblocks_set(MD_SID(un), un->c.un_total_blocks); MD_UNIT(MD_SID(un)) = un; return (0); } void reset_mirror(struct mm_unit *un, minor_t mnum, int removing) { mddb_recid_t recid, vtoc_id; size_t bitcnt; size_t shortcnt; int smi; sv_dev_t sv[NMIRROR]; int nsv = 0; uint_t bits = 0; minor_t selfid; md_unit_t *su; int i; md_destroy_unit_incore(mnum, &mirror_md_ops); shortcnt = un->un_rrd_num * sizeof (short); bitcnt = howmany(un->un_rrd_num, NBBY); if (un->un_outstanding_writes) kmem_free((caddr_t)un->un_outstanding_writes, shortcnt); if (un->un_goingclean_bm) kmem_free((caddr_t)un->un_goingclean_bm, bitcnt); if (un->un_goingdirty_bm) kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt); if (un->un_resync_bm) kmem_free((caddr_t)un->un_resync_bm, bitcnt); if (un->un_pernode_dirty_sum) kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num); /* * Destroy the taskq for deferred processing of DRL clean requests. * This taskq will only be present for Multi Owner mirrors. 
*/ if (un->un_drl_task != NULL) ddi_taskq_destroy(un->un_drl_task); md_nblocks_set(mnum, -1ULL); MD_UNIT(mnum) = NULL; /* * Attempt release of its minor node */ md_remove_minor_node(mnum); if (!removing) return; for (smi = 0; smi < NMIRROR; smi++) { if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) continue; /* reallow soft partitioning of submirror and reset parent */ su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev)); MD_CAPAB(su) |= MD_CAN_SP; md_reset_parent(un->un_sm[smi].sm_dev); reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]); sv[nsv].setno = MD_MIN2SET(mnum); sv[nsv++].key = un->un_sm[smi].sm_key; bits |= SMI2BIT(smi); } MD_STATUS(un) |= MD_UN_BEING_RESET; recid = un->un_rr_dirty_recid; vtoc_id = un->c.un_vtoc_id; selfid = MD_SID(un); mirror_commit(un, bits, 0); avl_destroy(&un->un_overlap_root); /* Destroy all mutexes and condvars before returning. */ mutex_destroy(&un->un_suspend_wr_mx); cv_destroy(&un->un_suspend_wr_cv); mutex_destroy(&un->un_overlap_tree_mx); cv_destroy(&un->un_overlap_tree_cv); mutex_destroy(&un->un_owner_mx); mutex_destroy(&un->un_rs_thread_mx); cv_destroy(&un->un_rs_thread_cv); mutex_destroy(&un->un_rs_progress_mx); cv_destroy(&un->un_rs_progress_cv); mutex_destroy(&un->un_dmr_mx); cv_destroy(&un->un_dmr_cv); for (i = 0; i < MD_MNMAXSIDES; i++) { rw_destroy(&un->un_pernode_dirty_mx[i]); if (un->un_pernode_dirty_bm[i]) kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt); } /* * Remove self from the namespace */ if (un->c.un_revision & MD_FN_META_DEV) { (void) md_rem_selfname(un->c.un_self_id); } /* This frees the unit structure. */ mddb_deleterec_wrapper(un->c.un_record_id); if (recid != 0) mddb_deleterec_wrapper(recid); /* Remove the vtoc, if present */ if (vtoc_id) mddb_deleterec_wrapper(vtoc_id); md_rem_names(sv, nsv); SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, MD_MIN2SET(selfid), selfid); } int mirror_internal_open( minor_t mnum, int flag, int otyp, int md_oflags, IOLOCK *lockp /* can be NULL */ ) { mdi_unit_t *ui = MDI_UNIT(mnum); int err = 0; tryagain: /* single thread */ if (lockp) { /* * If ioctl lock is held, use openclose_enter * routine that will set the ioctl flag when * grabbing the readerlock. */ (void) md_ioctl_openclose_enter(lockp, ui); } else { (void) md_unit_openclose_enter(ui); } /* * The mirror_open_all_devs routine may end up sending a STATE_UPDATE * message in a MN diskset and this requires that the openclose * lock is dropped in order to send this message. So, another * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from * attempting an open while this thread has an open in progress. * Call the *_lh version of the lock exit routines since the ui_mx * mutex must be held from checking for OPENINPROGRESS until * after the cv_wait call. */ mutex_enter(&ui->ui_mx); if (ui->ui_lock & MD_UL_OPENINPROGRESS) { if (lockp) { (void) md_ioctl_openclose_exit_lh(lockp); } else { md_unit_openclose_exit_lh(ui); } cv_wait(&ui->ui_cv, &ui->ui_mx); mutex_exit(&ui->ui_mx); goto tryagain; } ui->ui_lock |= MD_UL_OPENINPROGRESS; mutex_exit(&ui->ui_mx); /* open devices, if necessary */ if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) { if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0) goto out; } /* count open */ if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) goto out; /* unlock, return success */ out: mutex_enter(&ui->ui_mx); ui->ui_lock &= ~MD_UL_OPENINPROGRESS; mutex_exit(&ui->ui_mx); if (lockp) { /* * If ioctl lock is held, use openclose_exit * routine that will clear the lockp reader flag. 
*/ (void) md_ioctl_openclose_exit(lockp); } else { md_unit_openclose_exit(ui); } return (err); } int mirror_internal_close( minor_t mnum, int otyp, int md_cflags, IOLOCK *lockp /* can be NULL */ ) { mdi_unit_t *ui = MDI_UNIT(mnum); mm_unit_t *un; int err = 0; /* single thread */ if (lockp) { /* * If ioctl lock is held, use openclose_enter * routine that will set the ioctl flag when * grabbing the readerlock. */ un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui); } else { un = (mm_unit_t *)md_unit_openclose_enter(ui); } /* count closed */ if ((err = md_unit_decopen(mnum, otyp)) != 0) goto out; /* close devices, if necessary */ if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { /* * Clean up dirty bitmap for this unit. Do this * before closing the underlying devices to avoid * race conditions with reset_mirror() as a * result of a 'metaset -r' command running in * parallel. This might cause deallocation of * dirty region bitmaps; with underlying metadevices * in place this can't happen. * Don't do this if a MN set and ABR not set */ if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) { if (!MD_MNSET_SETNO(MD_UN2SET(un)) || !(ui->ui_tstate & MD_ABR_CAP)) mirror_process_unit_resync(un); } (void) mirror_close_all_devs(un, md_cflags); /* * For a MN set with transient capabilities (eg ABR/DMR) set, * clear these capabilities on the last open in the cluster. * To do this we send a message to all nodes to see of the * device is open. */ if (MD_MNSET_SETNO(MD_UN2SET(un)) && (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) { if (lockp) { (void) md_ioctl_openclose_exit(lockp); } else { md_unit_openclose_exit(ui); } /* * if we are in the context of an ioctl, drop the * ioctl lock. * Otherwise, no other locks should be held. */ if (lockp) { IOLOCK_RETURN_RELEASE(0, lockp); } mdmn_clear_all_capabilities(mnum); /* if dropped the lock previously, regain it */ if (lockp) { IOLOCK_RETURN_REACQUIRE(lockp); } return (0); } /* unlock and return success */ } out: /* Call whether lockp is NULL or not. */ if (lockp) { md_ioctl_openclose_exit(lockp); } else { md_unit_openclose_exit(ui); } return (err); } /* * When a component has completed resyncing and is now ok, check if the * corresponding component in the other submirrors is in the Last Erred * state. If it is, we want to change that to the Erred state so we stop * using that component and start using this good component instead. * * This is called from set_sm_comp_state and recursively calls * set_sm_comp_state if it needs to change the Last Erred state. 
*/ static void reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags, IOLOCK *lockp) { mm_submirror_t *sm; mm_submirror_ic_t *smic; int ci; int i; int compcnt; int changed = 0; for (i = 0; i < NMIRROR; i++) { sm = &un->un_sm[i]; smic = &un->un_smic[i]; if (!SMS_IS(sm, SMS_INUSE)) continue; /* ignore the submirror that we just made ok */ if (i == smi) continue; compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); for (ci = 0; ci < compcnt; ci++) { md_m_shared_t *shared; shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) (sm->sm_dev, sm, ci); if ((shared->ms_state & CS_LAST_ERRED) && !mirror_other_sources(un, i, ci, 1)) { set_sm_comp_state(un, i, ci, CS_ERRED, extras, flags, lockp); changed = 1; } } } /* maybe there is a hotspare for this newly erred component */ if (changed) { set_t setno; setno = MD_UN2SET(un); if (MD_MNSET_SETNO(setno)) { send_poke_hotspares(setno); } else { (void) poke_hotspares(); } } } /* * set_sm_comp_state * * Set the state of a submirror component to the specified new state. * If the mirror is in a multi-node set, send messages to all nodes to * block all writes to the mirror and then update the state and release the * writes. These messages are only sent if MD_STATE_XMIT is set in flags. * MD_STATE_XMIT will be unset in 2 cases: * 1. When the state is changed to CS_RESYNC as this state change * will already have been updated on each node by the processing of the * distributed metasync command, hence no need to xmit. * 2. When the state is change to CS_OKAY after a resync has completed. Again * the resync completion will already have been processed on each node by * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component * resync, hence no need to xmit. * * In case we are called from the updates of a watermark, * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to * a metainit or similar. In this case the message that we sent to propagate * the state change must not be a class1 message as that would deadlock with * the metainit command that is still being processed. * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2 * instead. This also makes the submessage generator to create a class2 * submessage rather than a class1 (which would also block) * * On entry, unit_writerlock is held * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is * also held. */ void set_sm_comp_state( mm_unit_t *un, int smi, int ci, int newstate, mddb_recid_t *extras, uint_t flags, IOLOCK *lockp ) { mm_submirror_t *sm; mm_submirror_ic_t *smic; md_m_shared_t *shared; int origstate; void (*get_dev)(); ms_cd_info_t cd; char devname[MD_MAX_CTDLEN]; int err; set_t setno = MD_UN2SET(un); md_mn_msg_stch_t stchmsg; mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); md_mn_kresult_t *kresult; int rval; uint_t msgflags; md_mn_msgtype_t msgtype; int save_lock = 0; mdi_unit_t *ui_sm; int nretries = 0; sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; /* If we have a real error status then turn off MD_INACCESSIBLE. */ ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev))); if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) && ui_sm->ui_tstate & MD_INACCESSIBLE) { ui_sm->ui_tstate &= ~MD_INACCESSIBLE; } shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) (sm->sm_dev, sm, ci); origstate = shared->ms_state; /* * If the new state is an error and the old one wasn't, generate * a console message. 
We do this before we send the state to other * nodes in a MN set because the state change may change the component * name if a hotspare is allocated. */ if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) && (newstate & (CS_ERRED|CS_LAST_ERRED))) { get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, "get device", 0); (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); err = md_getdevname(setno, mddb_getsidenum(setno), 0, cd.cd_dev, devname, sizeof (devname)); if (err == ENOENT) { (void) md_devname(setno, cd.cd_dev, devname, sizeof (devname)); } cmn_err(CE_WARN, "md: %s: %s needs maintenance", md_shortname(md_getminor(sm->sm_dev)), devname); if (newstate & CS_LAST_ERRED) { cmn_err(CE_WARN, "md: %s: %s last erred", md_shortname(md_getminor(sm->sm_dev)), devname); } else if (shared->ms_flags & MDM_S_ISOPEN) { /* * Close the broken device and clear the open flag on * it. Closing the device means the RCM framework will * be able to unconfigure the device if required. * * We have to check that the device is open, otherwise * the first open on it has resulted in the error that * is being processed and the actual cd.cd_dev will be * NODEV64. * * If this is a multi-node mirror, then the multinode * state checks following this code will cause the * slave nodes to close the mirror in the function * mirror_set_state(). */ md_layered_close(cd.cd_dev, MD_OFLG_NULL); shared->ms_flags &= ~MDM_S_ISOPEN; } } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) && (shared->ms_flags & MDM_S_ISOPEN)) { /* * Similar to logic above except no log messages since we * are just transitioning from Last Erred to Erred. */ get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, "get device", 0); (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); md_layered_close(cd.cd_dev, MD_OFLG_NULL); shared->ms_flags &= ~MDM_S_ISOPEN; } if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) && (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) { /* * For a multi-node mirror, send the state change to the * master, which broadcasts to all nodes, including this * one. Once the message is received, the state is set * in-core and the master commits the change to disk. * There is a case, comp_replace, where this function * can be called from within an ioctl and therefore in this * case, as the ioctl will already be called on each node, * there is no need to xmit the state change to the master for * distribution to the other nodes. MD_STATE_XMIT flag is used * to indicate whether a xmit is required. The mirror's * transient state is set to MD_ERR_PENDING to avoid sending * multiple messages. */ if (newstate & (CS_ERRED|CS_LAST_ERRED)) ui->ui_tstate |= MD_ERR_PENDING; /* * Send a state update message to all nodes. This message * will generate 2 submessages, the first one to suspend * all writes to the mirror and the second to update the * state and resume writes. */ stchmsg.msg_stch_mnum = un->c.un_self_id; stchmsg.msg_stch_sm = smi; stchmsg.msg_stch_comp = ci; stchmsg.msg_stch_new_state = newstate; stchmsg.msg_stch_hs_id = shared->ms_hs_id; #ifdef DEBUG if (mirror_debug_flag) printf("send set state, %x, %x, %x, %x, %x\n", stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm, stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state, stchmsg.msg_stch_hs_id); #endif if (flags & MD_STATE_WMUPDATE) { msgtype = MD_MN_MSG_STATE_UPDATE2; /* * When coming from an update of watermarks, there * must already be a message logged that triggered * this action. So, no need to log this message, too. 
*/ msgflags = MD_MSGF_NO_LOG; } else { msgtype = MD_MN_MSG_STATE_UPDATE; msgflags = MD_MSGF_DEFAULT_FLAGS; } /* * If we are in the context of an ioctl, drop the ioctl lock. * lockp holds the list of locks held. * * Otherwise, increment the appropriate reacquire counters. * If openclose lock is *held, then must reacquire reader * lock before releasing the openclose lock. * Do not drop the ARRAY_WRITER lock as we may not be able * to reacquire it. */ if (lockp) { if (lockp->l_flags & MD_ARRAY_WRITER) { save_lock = MD_ARRAY_WRITER; lockp->l_flags &= ~MD_ARRAY_WRITER; } else if (lockp->l_flags & MD_ARRAY_READER) { save_lock = MD_ARRAY_READER; lockp->l_flags &= ~MD_ARRAY_READER; } IOLOCK_RETURN_RELEASE(0, lockp); } else { if (flags & MD_STATE_OCHELD) { md_unit_writerexit(ui); (void) md_unit_readerlock(ui); md_unit_openclose_exit(ui); } else { md_unit_writerexit(ui); } } kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); sscs_msg: rval = mdmn_ksend_message(setno, msgtype, msgflags, 0, (char *)&stchmsg, sizeof (stchmsg), kresult); if (!MDMN_KSEND_MSG_OK(rval, kresult)) { mdmn_ksend_show_error(rval, kresult, "STATE UPDATE"); /* If we're shutting down already, pause things here. */ if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { while (!md_mn_is_commd_present()) { delay(md_hz); } /* * commd is now available; retry the message * one time. If that fails we fall through and * panic as the system is in an unexpected state */ if (nretries++ == 0) goto sscs_msg; } cmn_err(CE_PANIC, "ksend_message failure: STATE_UPDATE"); } kmem_free(kresult, sizeof (md_mn_kresult_t)); /* if dropped the lock previously, regain it */ if (lockp) { IOLOCK_RETURN_REACQUIRE(lockp); lockp->l_flags |= save_lock; } else { /* * Reacquire dropped locks and update acquirecnts * appropriately. */ if (flags & MD_STATE_OCHELD) { /* * openclose also grabs readerlock. */ (void) md_unit_openclose_enter(ui); md_unit_readerexit(ui); (void) md_unit_writerlock(ui); } else { (void) md_unit_writerlock(ui); } } ui->ui_tstate &= ~MD_ERR_PENDING; } else { shared->ms_state = newstate; uniqtime32(&shared->ms_timestamp); if (newstate == CS_ERRED) shared->ms_flags |= MDM_S_NOWRITE; else shared->ms_flags &= ~MDM_S_NOWRITE; shared->ms_flags &= ~MDM_S_IOERR; un->un_changecnt++; shared->ms_lasterrcnt = un->un_changecnt; mirror_set_sm_state(sm, smic, SMS_RUNNING, 0); mirror_commit(un, SMI2BIT(smi), extras); } if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) { /* * Resetting the Last Erred state will recursively call back * into this function (set_sm_comp_state) to update the state. 
*/ reset_lasterred(un, smi, extras, flags, lockp); } } static int find_another_logical( mm_unit_t *un, mm_submirror_t *esm, diskaddr_t blk, u_longlong_t cnt, int must_be_open, int state, int err_cnt) { u_longlong_t cando; md_dev64_t dev; md_m_shared_t *s; esm->sm_state |= SMS_IGNORE; while (cnt != 0) { u_longlong_t mcnt; mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */ dev = select_read_unit(un, blk, mcnt, &cando, must_be_open, &s, NULL); if (dev == (md_dev64_t)0) break; if ((state == CS_LAST_ERRED) && (s->ms_state == CS_LAST_ERRED) && (err_cnt > s->ms_lasterrcnt)) break; cnt -= cando; blk += cando; } esm->sm_state &= ~SMS_IGNORE; return (cnt != 0); } int mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open) { mm_submirror_t *sm; mm_submirror_ic_t *smic; size_t count; diskaddr_t block; u_longlong_t skip; u_longlong_t size; md_dev64_t dev; int cnt; md_m_shared_t *s; int not_found; sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; dev = sm->sm_dev; /* * Make sure every component of the submirror * has other sources. */ if (ci < 0) { /* Find the highest lasterrcnt */ cnt = (*(smic->sm_get_component_count))(dev, sm); for (ci = 0; ci < cnt; ci++) { not_found = mirror_other_sources(un, smi, ci, must_be_open); if (not_found) return (1); } return (0); } /* * Make sure this component has other sources */ (void) (*(smic->sm_get_bcss)) (dev, sm, ci, &block, &count, &skip, &size); if (count == 0) return (1); s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci); while (count--) { if (block >= un->c.un_total_blocks) return (0); if ((block + size) > un->c.un_total_blocks) size = un->c.un_total_blocks - block; not_found = find_another_logical(un, sm, block, size, must_be_open, s->ms_state, s->ms_lasterrcnt); if (not_found) return (1); block += size + skip; } return (0); } static void finish_error(md_mps_t *ps) { struct buf *pb; mm_unit_t *un; mdi_unit_t *ui; uint_t new_str_flags; pb = ps->ps_bp; un = ps->ps_un; ui = ps->ps_ui; /* * Must flag any error to the resync originator if we're performing * a Write-after-Read. This corresponds to an i/o error on a resync * target device and in this case we ought to abort the resync as there * is nothing that can be done to recover from this without operator * intervention. If we don't set the B_ERROR flag we will continue * reading from the mirror but won't write to the target (as it will * have been placed into an errored state). * To handle the case of multiple components within a submirror we only * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR. * The originator of the resync read will cause this bit to be set if * the underlying component count is one for a submirror resync. All * other resync types will have the flag set as there is no underlying * resync which can be performed on a contained metadevice for these * resync types (optimized or component). */ if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) { if (ps->ps_flags & MD_MPS_FLAG_ERROR) pb->b_flags |= B_ERROR; md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); MPS_FREE(mirror_parent_cache, ps); md_unit_readerexit(ui); md_biodone(pb); return; } /* * Set the MD_IO_COUNTED flag as we are retrying the same I/O * operation therefore this I/O request has already been counted, * the I/O count variable will be decremented by mirror_done()'s * call to md_biodone(). 
 */
    if (ps->ps_changecnt != un->un_changecnt) {
        new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
        if (ps->ps_flags & MD_MPS_WOW)
            new_str_flags |= MD_STR_WOW;
        if (ps->ps_flags & MD_MPS_MAPPED)
            new_str_flags |= MD_STR_MAPPED;
        /*
         * If this I/O request was a read that was part of a resync,
         * set MD_STR_WAR for the retried read to ensure that the
         * resync write (i.e. write-after-read) will be performed
         */
        if (ps->ps_flags & MD_MPS_RESYNC_READ)
            new_str_flags |= MD_STR_WAR;
        md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
        MPS_FREE(mirror_parent_cache, ps);
        md_unit_readerexit(ui);
        (void) md_mirror_strategy(pb, new_str_flags, NULL);
        return;
    }

    pb->b_flags |= B_ERROR;
    md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
    MPS_FREE(mirror_parent_cache, ps);
    md_unit_readerexit(ui);
    md_biodone(pb);
}

static void
error_update_unit(md_mps_t *ps)
{
    mm_unit_t   *un;
    mdi_unit_t  *ui;
    int         smi;        /* sub mirror index */
    int         ci;         /* errored component */
    set_t       setno;
    uint_t      flags;      /* for set_sm_comp_state() */
    uint_t      hspflags;   /* for check_comp_4_hotspares() */

    ui = ps->ps_ui;
    un = (mm_unit_t *)md_unit_writerlock(ui);
    setno = MD_UN2SET(un);

    /* All of these updates have to be propagated in case of a MN set */
    flags = MD_STATE_XMIT;
    hspflags = MD_HOTSPARE_XMIT;

    /* special treatment if we are called during updating watermarks */
    if (ps->ps_flags & MD_MPS_WMUPDATE) {
        flags |= MD_STATE_WMUPDATE;
        hspflags |= MD_HOTSPARE_WMUPDATE;
    }

    smi = 0;
    ci = 0;
    while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
        if (mirror_other_sources(un, smi, ci, 0) == 1) {
            /* Never called from ioctl context, so (IOLOCK *)NULL */
            set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
                (IOLOCK *)NULL);
            /*
             * For a MN set, the NOTIFY is done when the state
             * change is processed on each node
             */
            if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
                SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
                    SVM_TAG_METADEVICE, setno, MD_SID(un));
            }
            continue;
        }
        /* Never called from ioctl context, so (IOLOCK *)NULL */
        set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
            (IOLOCK *)NULL);
        /*
         * For a MN set, the NOTIFY is done when the state
         * change is processed on each node
         */
        if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
            SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
                SVM_TAG_METADEVICE, setno, MD_SID(un));
        }
        smi = 0;
        ci = 0;
    }

    md_unit_writerexit(ui);

    if (MD_MNSET_SETNO(setno)) {
        send_poke_hotspares(setno);
    } else {
        (void) poke_hotspares();
    }
    (void) md_unit_readerlock(ui);
    finish_error(ps);
}

/*
 * When we have a B_FAILFAST IO error on a Last Erred component we need to
 * retry the IO without B_FAILFAST set so that we try to ensure that the
 * component "sees" each IO.
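 * (Clearing B_FAILFAST asks the underlying driver to apply its normal, full
 * error recovery before failing the request, instead of failing it quickly.)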
 */
static void
last_err_retry(md_mcs_t *cs)
{
    struct buf  *cb;
    md_mps_t    *ps;
    uint_t      flags;

    cb = &cs->cs_buf;
    cb->b_flags &= ~B_FAILFAST;

    /* if we're panicking just let this I/O error out */
    if (panicstr) {
        (void) mirror_done(cb);
        return;
    }

    /* reissue the I/O */

    ps = cs->cs_ps;

    bioerror(cb, 0);

    mutex_enter(&ps->ps_mx);

    flags = MD_STR_NOTTOP;
    if (ps->ps_flags & MD_MPS_MAPPED)
        flags |= MD_STR_MAPPED;
    if (ps->ps_flags & MD_MPS_NOBLOCK)
        flags |= MD_NOBLOCK;

    mutex_exit(&ps->ps_mx);

    clear_retry_error(cb);

    cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
        md_shortname(getminor(cb->b_edev)));

    md_call_strategy(cb, flags, NULL);
}

static void
mirror_error(md_mps_t *ps)
{
    int     smi;    /* sub mirror index */
    int     ci;     /* errored component */

    if (panicstr) {
        finish_error(ps);
        return;
    }

    if (ps->ps_flags & MD_MPS_ON_OVERLAP)
        mirror_overlap_tree_remove(ps);

    smi = 0;
    ci = 0;
    if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
        md_unit_readerexit(ps->ps_ui);
        daemon_request(&md_mstr_daemon, error_update_unit,
            (daemon_queue_t *)ps, REQ_OLD);
        return;
    }

    finish_error(ps);
}

static int
copy_write_done(struct buf *cb)
{
    md_mps_t    *ps;
    buf_t       *pb;
    char        *wowbuf;
    wowhdr_t    *wowhdr;
    ssize_t     wow_resid;

    /* get wowbuf and save structure */
    wowbuf = cb->b_un.b_addr;
    wowhdr = WOWBUF_HDR(wowbuf);
    ps = wowhdr->wow_ps;
    pb = ps->ps_bp;

    /* Save error information, then free cb */
    if (cb->b_flags & B_ERROR)
        pb->b_flags |= B_ERROR;

    if (cb->b_flags & B_REMAPPED)
        bp_mapout(cb);

    freerbuf(cb);

    /* update residual and continue if needed */
    if ((pb->b_flags & B_ERROR) == 0) {
        wow_resid = pb->b_bcount - wowhdr->wow_offset;
        pb->b_resid = wow_resid;
        if (wow_resid > 0) {
            daemon_request(&md_mstr_daemon, copy_write_cont,
                (daemon_queue_t *)wowhdr, REQ_OLD);
            return (1);
        }
    }

    /* Write is complete, release resources. */
    kmem_cache_free(mirror_wowblk_cache, wowhdr);
    ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
    md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
    MPS_FREE(mirror_parent_cache, ps);
    md_biodone(pb);
    return (0);
}

static void
copy_write_cont(wowhdr_t *wowhdr)
{
    buf_t       *pb;
    buf_t       *cb;
    char        *wowbuf;
    int         wow_offset;
    size_t      wow_resid;
    diskaddr_t  wow_blkno;

    wowbuf = WOWHDR_BUF(wowhdr);
    pb = wowhdr->wow_ps->ps_bp;

    /* get data on current location */
    wow_offset = wowhdr->wow_offset;
    wow_resid = pb->b_bcount - wow_offset;
    wow_blkno = pb->b_lblkno + lbtodb(wow_offset);

    /* setup child buffer */
    cb = getrbuf(KM_SLEEP);
    cb->b_flags = B_WRITE;
    cb->b_edev = pb->b_edev;
    cb->b_un.b_addr = wowbuf;       /* change to point at WOWBUF */
    cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
    cb->b_iodone = copy_write_done;
    cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
    cb->b_lblkno = wow_blkno;

    /* move offset to next section */
    wowhdr->wow_offset += cb->b_bcount;

    /* copy and setup write for current section */
    bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);

    /* do it */
    /*
     * Do not set the MD_IO_COUNTED flag as this is a new I/O request
     * that handles the WOW condition. The resultant increment on the
     * I/O count variable is cleared by copy_write_done()'s call to
     * md_biodone().
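     * Each pass writes at most md_wowbuf_size bytes from the private wowbuf
     * copy; copy_write_done() re-queues copy_write_cont() until the whole
     * original request has been written.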
*/ (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW | MD_STR_MAPPED, NULL); } static void md_mirror_copy_write(md_mps_t *ps) { wowhdr_t *wowhdr; wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS); mirror_wowblk_init(wowhdr); wowhdr->wow_ps = ps; wowhdr->wow_offset = 0; copy_write_cont(wowhdr); } static void handle_wow(md_mps_t *ps) { buf_t *pb; pb = ps->ps_bp; bp_mapin(pb); md_mirror_wow_cnt++; if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) { cmn_err(CE_NOTE, "md: %s, blk %lld, cnt %ld: Write on write %d occurred", md_shortname(getminor(pb->b_edev)), (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt); } /* * Set the MD_IO_COUNTED flag as we are retrying the same I/O * operation therefore this I/O request has already been counted, * the I/O count variable will be decremented by mirror_done()'s * call to md_biodone(). */ if (md_mirror_wow_flg & WOW_NOCOPY) (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW | MD_STR_MAPPED | MD_IO_COUNTED, ps); else md_mirror_copy_write(ps); } /* * Return true if the specified submirror is either in the Last Erred * state or is transitioning into the Last Erred state. */ static bool_t submirror_is_lasterred(mm_unit_t *un, int smi) { mm_submirror_t *sm; mm_submirror_ic_t *smic; md_m_shared_t *shared; int ci; int compcnt; sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); for (ci = 0; ci < compcnt; ci++) { shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) (sm->sm_dev, sm, ci); if (shared->ms_state == CS_LAST_ERRED) return (B_TRUE); /* * It is not currently Last Erred, check if entering Last Erred. */ if ((shared->ms_flags & MDM_S_IOERR) && ((shared->ms_state == CS_OKAY) || (shared->ms_state == CS_RESYNC))) { if (mirror_other_sources(un, smi, ci, 0) == 1) return (B_TRUE); } } return (B_FALSE); } static int mirror_done(struct buf *cb) { md_mps_t *ps; md_mcs_t *cs; /*LINTED*/ cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); ps = cs->cs_ps; mutex_enter(&ps->ps_mx); /* check if we need to retry an errored failfast I/O */ if (cb->b_flags & B_ERROR) { struct buf *pb = ps->ps_bp; if (cb->b_flags & B_FAILFAST) { int i; mm_unit_t *un = ps->ps_un; for (i = 0; i < NMIRROR; i++) { if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) continue; if (cb->b_edev == md_dev64_to_dev(un->un_sm[i].sm_dev)) { /* * This is the submirror that had the * error. Check if it is Last Erred. */ if (submirror_is_lasterred(un, i)) { daemon_queue_t *dqp; mutex_exit(&ps->ps_mx); dqp = (daemon_queue_t *)cs; dqp->dq_prev = NULL; dqp->dq_next = NULL; daemon_request(&md_done_daemon, last_err_retry, dqp, REQ_OLD); return (1); } break; } } } /* continue to process the buf without doing a retry */ ps->ps_flags |= MD_MPS_ERROR; pb->b_error = cb->b_error; } return (mirror_done_common(cb)); } /* * Split from the original mirror_done function so we can handle bufs after a * retry. * ps->ps_mx is already held in the caller of this function and the cb error * has already been checked and handled in the caller. 
*/ static int mirror_done_common(struct buf *cb) { struct buf *pb; mm_unit_t *un; mdi_unit_t *ui; md_mps_t *ps; md_mcs_t *cs; size_t end_rr, start_rr, current_rr; /*LINTED*/ cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); ps = cs->cs_ps; pb = ps->ps_bp; if (cb->b_flags & B_REMAPPED) bp_mapout(cb); ps->ps_frags--; if (ps->ps_frags != 0) { mutex_exit(&ps->ps_mx); kmem_cache_free(mirror_child_cache, cs); return (1); } un = ps->ps_un; ui = ps->ps_ui; /* * Do not update outstanding_writes if we're running with ABR * set for this mirror or the write() was issued with MD_STR_ABR set. * Also a resync initiated write() has no outstanding_writes update * either. */ if (((cb->b_flags & B_READ) == 0) && (un->un_nsm >= 2) && (ps->ps_call == NULL) && !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) && !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) { BLK_TO_RR(end_rr, ps->ps_lastblk, un); BLK_TO_RR(start_rr, ps->ps_firstblk, un); mutex_enter(&un->un_resync_mx); for (current_rr = start_rr; current_rr <= end_rr; current_rr++) un->un_outstanding_writes[current_rr]--; mutex_exit(&un->un_resync_mx); } kmem_cache_free(mirror_child_cache, cs); mutex_exit(&ps->ps_mx); if (ps->ps_call != NULL) { daemon_request(&md_done_daemon, ps->ps_call, (daemon_queue_t *)ps, REQ_OLD); return (1); } if ((ps->ps_flags & MD_MPS_ERROR)) { daemon_request(&md_done_daemon, mirror_error, (daemon_queue_t *)ps, REQ_OLD); return (1); } if (ps->ps_flags & MD_MPS_ON_OVERLAP) mirror_overlap_tree_remove(ps); /* * Handle Write-on-Write problem. * Skip In case of Raw and Direct I/O as they are * handled earlier. * */ if (!(md_mirror_wow_flg & WOW_DISABLE) && !(pb->b_flags & B_READ) && !(ps->ps_flags & MD_MPS_WOW) && !(pb->b_flags & B_PHYS) && any_pages_dirty(pb)) { md_unit_readerexit(ps->ps_ui); daemon_request(&md_mstr_daemon, handle_wow, (daemon_queue_t *)ps, REQ_OLD); return (1); } md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); MPS_FREE(mirror_parent_cache, ps); md_unit_readerexit(ui); md_biodone(pb); return (0); } /* * Clear error state in submirror component if the retry worked after * a failfast error. 
*/ static void clear_retry_error(struct buf *cb) { int smi; md_mcs_t *cs; mm_unit_t *un; mdi_unit_t *ui_sm; mm_submirror_t *sm; mm_submirror_ic_t *smic; u_longlong_t cnt; md_m_shared_t *shared; /*LINTED*/ cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); un = cs->cs_ps->ps_un; for (smi = 0; smi < NMIRROR; smi++) { if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) continue; if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) break; } if (smi >= NMIRROR) return; sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; cnt = cb->b_bcount; ui_sm = MDI_UNIT(getminor(cb->b_edev)); (void) md_unit_writerlock(ui_sm); shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm, cb->b_blkno, &cnt); if (shared->ms_flags & MDM_S_IOERR) { shared->ms_flags &= ~MDM_S_IOERR; } else { /* the buf spans components and the first one is not erred */ int cnt; int i; cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); for (i = 0; i < cnt; i++) { shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) (sm->sm_dev, sm, i); if (shared->ms_flags & MDM_S_IOERR && shared->ms_state == CS_OKAY) { shared->ms_flags &= ~MDM_S_IOERR; break; } } } md_unit_writerexit(ui_sm); } static size_t mirror_map_read( md_mps_t *ps, md_mcs_t *cs, diskaddr_t blkno, u_longlong_t count ) { mm_unit_t *un; buf_t *bp; u_longlong_t cando; bp = &cs->cs_buf; un = ps->ps_un; bp->b_lblkno = blkno; if (fast_select_read_unit(ps, cs) == 0) { bp->b_bcount = ldbtob(count); return (0); } bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, count, &cando, 0, NULL, cs)); bp->b_bcount = ldbtob(cando); if (count != cando) return (cando); return (0); } static void write_after_read(md_mps_t *ps) { struct buf *pb; int flags; if (ps->ps_flags & MD_MPS_ERROR) { mirror_error(ps); return; } pb = ps->ps_bp; md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); ps->ps_call = NULL; ps->ps_flags |= MD_MPS_WRITE_AFTER_READ; flags = MD_STR_NOTTOP | MD_STR_WAR; if (ps->ps_flags & MD_MPS_MAPPED) flags |= MD_STR_MAPPED; if (ps->ps_flags & MD_MPS_NOBLOCK) flags |= MD_NOBLOCK; if (ps->ps_flags & MD_MPS_DIRTY_RD) flags |= MD_STR_DIRTY_RD; (void) mirror_write_strategy(pb, flags, ps); } static void continue_serial(md_mps_t *ps) { md_mcs_t *cs; buf_t *cb; mm_unit_t *un; int flags; un = ps->ps_un; cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); mirror_child_init(cs); cb = &cs->cs_buf; ps->ps_call = NULL; ps->ps_frags = 1; (void) mirror_map_write(un, cs, ps, 0); flags = MD_STR_NOTTOP; if (ps->ps_flags & MD_MPS_MAPPED) flags |= MD_STR_MAPPED; md_call_strategy(cb, flags, NULL); } static int mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war) { int i; dev_t dev; /* needed for bioclone, so not md_dev64_t */ buf_t *cb; buf_t *pb; diskaddr_t blkno; size_t bcount; off_t offset; pb = ps->ps_bp; cb = &cs->cs_buf; cs->cs_ps = ps; i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm); dev = md_dev64_to_dev(un->un_sm[i].sm_dev); blkno = pb->b_lblkno; bcount = pb->b_bcount; offset = 0; if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) { blkno = DK_LABEL_LOC + 1; /* * This handles the case where we're requesting * a write to block 0 on a label partition * and the request size was smaller than the * size of the label. If this is the case * then we'll return -1. Failure to do so will * either cause the calling thread to hang due to * an ssd bug, or worse if the bcount were allowed * to go negative (ie large). 
*/ if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1)) return (-1); bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1)); offset = (DEV_BSIZE*(DK_LABEL_LOC + 1)); } cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done, cb, KM_NOSLEEP); if (war) cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE; /* * If the submirror is in the erred stated, check if any component is * in the Last Erred state. If so, we don't want to use the B_FAILFAST * flag on the IO. * * Provide a fast path for the non-erred case (which should be the * normal case). */ if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) { if (un->un_sm[i].sm_state & SMS_COMP_ERRED) { mm_submirror_t *sm; mm_submirror_ic_t *smic; int ci; int compcnt; sm = &un->un_sm[i]; smic = &un->un_smic[i]; compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); for (ci = 0; ci < compcnt; ci++) { md_m_shared_t *shared; shared = (md_m_shared_t *) (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); if (shared->ms_state == CS_LAST_ERRED) break; } if (ci >= compcnt) cb->b_flags |= B_FAILFAST; } else { cb->b_flags |= B_FAILFAST; } } ps->ps_current_sm++; if (ps->ps_current_sm != ps->ps_active_cnt) { if (un->un_write_option == WR_SERIAL) { ps->ps_call = continue_serial; return (0); } return (1); } return (0); } /* * directed_read_done: * ------------------ * Completion routine called when a DMR request has been returned from the * underlying driver. Wake-up the original ioctl() and return the data to * the user. */ static void directed_read_done(md_mps_t *ps) { mm_unit_t *un; mdi_unit_t *ui; un = ps->ps_un; ui = ps->ps_ui; md_unit_readerexit(ui); md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); ps->ps_call = NULL; mutex_enter(&un->un_dmr_mx); cv_signal(&un->un_dmr_cv); mutex_exit(&un->un_dmr_mx); /* release the parent structure */ kmem_cache_free(mirror_parent_cache, ps); } /* * daemon_io: * ------------ * Called to issue a mirror_write_strategy() or mirror_read_strategy * call from a blockable context. NOTE: no mutex can be held on entry to this * routine */ static void daemon_io(daemon_queue_t *dq) { md_mps_t *ps = (md_mps_t *)dq; int flag = MD_STR_NOTTOP; buf_t *pb = ps->ps_bp; if (ps->ps_flags & MD_MPS_MAPPED) flag |= MD_STR_MAPPED; if (ps->ps_flags & MD_MPS_WOW) flag |= MD_STR_WOW; if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) flag |= MD_STR_WAR; if (ps->ps_flags & MD_MPS_ABR) flag |= MD_STR_ABR; if (ps->ps_flags & MD_MPS_BLOCKABLE_IO) flag |= MD_STR_BLOCK_OK; /* * If this is a resync read, ie MD_STR_DIRTY_RD not set, set * MD_STR_WAR before calling mirror_read_strategy */ if (pb->b_flags & B_READ) { if (!(ps->ps_flags & MD_MPS_DIRTY_RD)) flag |= MD_STR_WAR; mirror_read_strategy(pb, flag, ps); } else mirror_write_strategy(pb, flag, ps); } /* * update_resync: * ------------- * Called to update the in-core version of the resync record with the latest * version that was committed to disk when the previous mirror owner * relinquished ownership. This call is likely to block as we must hold-off * any current resync processing that may be occurring. * On completion of the resync record update we issue the mirror_write_strategy * call to complete the i/o that first started this sequence. To remove a race * condition between a new write() request which is submitted and the resync * record update we acquire the writerlock. This will hold off all i/o to the * mirror until the resync update has completed. 
* NOTE: no mutex can be held on entry to this routine */ static void update_resync(daemon_queue_t *dq) { md_mps_t *ps = (md_mps_t *)dq; buf_t *pb = ps->ps_bp; mdi_unit_t *ui = ps->ps_ui; mm_unit_t *un = MD_UNIT(ui->ui_link.ln_id); set_t setno; int restart_resync; mutex_enter(&un->un_rrp_inflight_mx); (void) md_unit_writerlock(ui); ps->ps_un = un; setno = MD_MIN2SET(getminor(pb->b_edev)); if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) { /* * Synchronize our in-core view of what regions need to be * resync'd with the on-disk version. */ mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm, un->un_dirty_bm); /* Region dirty map is now up to date */ } restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0; md_unit_writerexit(ui); mutex_exit(&un->un_rrp_inflight_mx); /* Restart the resync thread if it was previously blocked */ if (restart_resync) { mutex_enter(&un->un_rs_thread_mx); un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER; cv_signal(&un->un_rs_thread_cv); mutex_exit(&un->un_rs_thread_mx); } /* Continue with original deferred i/o */ daemon_io(dq); } /* * owner_timeout: * ------------- * Called if the original mdmn_ksend_message() failed and the request is to be * retried. Reattempt the original ownership change. * * NOTE: called at interrupt context (see timeout(9f)). */ static void owner_timeout(void *arg) { daemon_queue_t *dq = (daemon_queue_t *)arg; daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD); } /* * become_owner: * ------------ * Called to issue RPC request to become the owner of the mirror * associated with this i/o request. We assume that the ownership request * is synchronous, so if it succeeds we will issue the request via * mirror_write_strategy(). * If multiple i/o's are outstanding we will be called from the mirror_daemon * service thread. * NOTE: no mutex should be held on entry to this routine. */ static void become_owner(daemon_queue_t *dq) { md_mps_t *ps = (md_mps_t *)dq; mm_unit_t *un = ps->ps_un; buf_t *pb = ps->ps_bp; set_t setno; md_mn_kresult_t *kres; int msg_flags = md_mirror_msg_flags; md_mps_t *ps1; ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL); /* * If we're already the mirror owner we do not need to send a message * but can simply process the i/o request immediately. * If we've already sent the request to become owner we requeue the * request as we're waiting for the synchronous ownership message to * be processed. */ if (MD_MN_MIRROR_OWNER(un)) { /* * As the strategy() call will potentially block we need to * punt this to a separate thread and complete this request * as quickly as possible. Note: if we're a read request * this must be a resync, we cannot afford to be queued * behind any intervening i/o requests. In this case we put the * request on the md_mirror_rs_daemon queue. */ if (pb->b_flags & B_READ) { daemon_request(&md_mirror_rs_daemon, daemon_io, dq, REQ_OLD); } else { daemon_request(&md_mirror_io_daemon, daemon_io, dq, REQ_OLD); } } else { mutex_enter(&un->un_owner_mx); if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) { md_mn_req_owner_t *msg; int rval = 0; /* * Check to see that we haven't exceeded the maximum * retry count. If we have we fail the i/o as the * comms mechanism has become wedged beyond recovery. 
*/ if (dq->qlen++ >= MD_OWNER_RETRIES) { mutex_exit(&un->un_owner_mx); cmn_err(CE_WARN, "md_mirror: Request exhausted ownership " "retry limit of %d attempts", dq->qlen); pb->b_error = EIO; pb->b_flags |= B_ERROR; pb->b_resid = pb->b_bcount; kmem_cache_free(mirror_parent_cache, ps); md_biodone(pb); return; } /* * Issue request to change ownership. The call is * synchronous so when it returns we can complete the * i/o (if successful), or enqueue it again so that * the operation will be retried. */ un->un_owner_state |= MM_MN_OWNER_SENT; mutex_exit(&un->un_owner_mx); msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP); setno = MD_MIN2SET(getminor(pb->b_edev)); msg->mnum = MD_SID(un); msg->owner = md_mn_mynode_id; msg_flags |= MD_MSGF_NO_LOG; /* * If this IO is triggered by updating a watermark, * it might be issued by the creation of a softpartition * while the commd subsystem is suspended. * We don't want this message to block. */ if (ps->ps_flags & MD_MPS_WMUPDATE) { msg_flags |= MD_MSGF_OVERRIDE_SUSPEND; } kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); rval = mdmn_ksend_message(setno, MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0, (char *)msg, sizeof (md_mn_req_owner_t), kres); kmem_free(msg, sizeof (md_mn_req_owner_t)); if (MDMN_KSEND_MSG_OK(rval, kres)) { dq->qlen = 0; /* * Successfully changed owner, reread the * resync record so that we have a valid idea of * any previously committed incomplete write()s. * NOTE: As we need to acquire the resync mutex * this may block, so we defer it to a separate * thread handler. This makes us (effectively) * non-blocking once the ownership message * handling has completed. */ mutex_enter(&un->un_owner_mx); if (un->un_owner_state & MM_MN_BECOME_OWNER) { un->un_mirror_owner = md_mn_mynode_id; /* Sets owner of un_rr_dirty record */ if (un->un_rr_dirty_recid) (void) mddb_setowner( un->un_rr_dirty_recid, md_mn_mynode_id); un->un_owner_state &= ~MM_MN_BECOME_OWNER; /* * Release the block on the current * resync region if it is blocked */ ps1 = un->un_rs_prev_overlap; if ((ps1 != NULL) && (ps1->ps_flags & MD_MPS_ON_OVERLAP)) mirror_overlap_tree_remove(ps1); mutex_exit(&un->un_owner_mx); /* * If we're a read, this must be a * resync request, issue * the i/o request on the * md_mirror_rs_daemon queue. This is * to avoid a deadlock between the * resync_unit thread and * subsequent i/o requests that may * block on the resync region. */ if (pb->b_flags & B_READ) { daemon_request( &md_mirror_rs_daemon, update_resync, dq, REQ_OLD); } else { daemon_request( &md_mirror_io_daemon, update_resync, dq, REQ_OLD); } kmem_free(kres, sizeof (md_mn_kresult_t)); return; } else { /* * Some other node has beaten us to * obtain ownership. We need to * reschedule our ownership request */ mutex_exit(&un->un_owner_mx); } } else { mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_REQUIRE_OWNER"); /* * Message transport failure is handled by the * comms layer. If the ownership change request * does not succeed we need to flag the error to * the initiator of the i/o. This is handled by * the retry logic above. As the request failed * we do not know _who_ the owner of the mirror * currently is. We reset our idea of the owner * to None so that any further write()s will * attempt to become the owner again. This stops * multiple nodes writing to the same mirror * simultaneously. 
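 * The retry itself is driven by the timeout(9F) call at the end of this
 * routine, which re-queues become_owner() via owner_timeout() after
 * md_mirror_owner_to microseconds so that repeated ownership requests do
 * not thrash the message subsystem.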
*/ mutex_enter(&un->un_owner_mx); un->un_owner_state &= ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER); un->un_mirror_owner = MD_MN_MIRROR_UNOWNED; mutex_exit(&un->un_owner_mx); } kmem_free(kres, sizeof (md_mn_kresult_t)); } else mutex_exit(&un->un_owner_mx); /* * Re-enqueue this request on the deferred i/o list. Delay the * request for md_mirror_owner_to usecs to stop thrashing. */ (void) timeout(owner_timeout, dq, drv_usectohz(md_mirror_owner_to)); } } static void mirror_write_strategy(buf_t *pb, int flag, void *private) { md_mps_t *ps; md_mcs_t *cs; int more; mm_unit_t *un; mdi_unit_t *ui; buf_t *cb; /* child buf pointer */ set_t setno; int rs_on_overlap = 0; ui = MDI_UNIT(getminor(pb->b_edev)); un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev)); md_kstat_waitq_enter(ui); /* * If a state change is in progress for this mirror in a MN set, * suspend all non-resync writes until the state change is complete. * The objective of this suspend is to ensure that it is not * possible for one node to read data from a submirror that another node * has not written to because of the state change. Therefore we * suspend all writes until the state change has been made. As it is * not possible to read from the target of a resync, there is no need * to suspend resync writes. * Note that we only block here if the caller can handle a busy-wait. * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only. */ if (!(flag & MD_STR_WAR)) { if (flag & MD_STR_BLOCK_OK) { mutex_enter(&un->un_suspend_wr_mx); while (un->un_suspend_wr_flag) { cv_wait(&un->un_suspend_wr_cv, &un->un_suspend_wr_mx); } mutex_exit(&un->un_suspend_wr_mx); } (void) md_unit_readerlock(ui); } if (!(flag & MD_STR_NOTTOP)) { if (md_checkbuf(ui, (md_unit_t *)un, pb)) { md_kstat_waitq_exit(ui); return; } } setno = MD_MIN2SET(getminor(pb->b_edev)); /* If an ABR write has been requested, set MD_STR_ABR flag */ if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE)) flag |= MD_STR_ABR; if (private == NULL) { ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); mirror_parent_init(ps); } else { ps = private; private = NULL; } if (flag & MD_STR_MAPPED) ps->ps_flags |= MD_MPS_MAPPED; if (flag & MD_STR_WOW) ps->ps_flags |= MD_MPS_WOW; if (flag & MD_STR_ABR) ps->ps_flags |= MD_MPS_ABR; if (flag & MD_STR_WMUPDATE) ps->ps_flags |= MD_MPS_WMUPDATE; /* * Save essential information from the original buffhdr * in the md_save structure. */ ps->ps_un = un; ps->ps_ui = ui; ps->ps_bp = pb; ps->ps_addr = pb->b_un.b_addr; ps->ps_firstblk = pb->b_lblkno; ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; ps->ps_changecnt = un->un_changecnt; /* * Check for suspended writes here. This is where we can defer the * write request to the daemon_io queue which will then call us with * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at * the top of this routine. 
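 * The deferred request is flagged MD_MPS_BLOCKABLE_IO so that daemon_io()
 * re-issues it with MD_STR_BLOCK_OK set, allowing the retried request to
 * cv_wait() on un_suspend_wr_cv without blocking the original caller.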
*/ if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) { mutex_enter(&un->un_suspend_wr_mx); if (un->un_suspend_wr_flag) { ps->ps_flags |= MD_MPS_BLOCKABLE_IO; mutex_exit(&un->un_suspend_wr_mx); md_unit_readerexit(ui); daemon_request(&md_mirror_daemon, daemon_io, (daemon_queue_t *)ps, REQ_OLD); return; } mutex_exit(&un->un_suspend_wr_mx); } /* * If not MN owner and this is an ABR write, make sure the current * resync region is in the overlaps tree */ mutex_enter(&un->un_owner_mx); if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) && ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { md_mps_t *ps1; /* Block the current resync region, if not already blocked */ ps1 = un->un_rs_prev_overlap; if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) || (ps1->ps_lastblk != 0))) { /* Drop locks to avoid deadlock */ mutex_exit(&un->un_owner_mx); md_unit_readerexit(ui); wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT); rs_on_overlap = 1; (void) md_unit_readerlock(ui); mutex_enter(&un->un_owner_mx); /* * Check to see if we have obtained ownership * while waiting for overlaps. If we have, remove * the resync_region entry from the overlap tree */ if (MD_MN_MIRROR_OWNER(un) && (ps1->ps_flags & MD_MPS_ON_OVERLAP)) { mirror_overlap_tree_remove(ps1); rs_on_overlap = 0; } } } mutex_exit(&un->un_owner_mx); /* * following keep write after read from writing to the * source in the case where it all came from one place */ if (flag & MD_STR_WAR) { int abort_write = 0; /* * We are perfoming a write-after-read. This is either as a * result of a resync read or as a result of a read in a * dirty resync region when the optimized resync is not * complete. If in a MN set and a resync generated i/o, * if the current block is not in the current * resync region terminate the write as another node must have * completed this resync region */ if ((MD_MNSET_SETNO(MD_UN2SET(un))) && (!(flag & MD_STR_DIRTY_RD))) { if (!IN_RESYNC_REGION(un, ps)) abort_write = 1; } if ((select_write_after_read_units(un, ps) == 0) || (abort_write)) { #ifdef DEBUG if (mirror_debug_flag) printf("Abort resync write on %x, block %lld\n", MD_SID(un), ps->ps_firstblk); #endif if (ps->ps_flags & MD_MPS_ON_OVERLAP) mirror_overlap_tree_remove(ps); kmem_cache_free(mirror_parent_cache, ps); md_kstat_waitq_exit(ui); md_unit_readerexit(ui); md_biodone(pb); return; } } else { select_write_units(un, ps); /* Drop readerlock to avoid deadlock */ md_unit_readerexit(ui); wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); un = md_unit_readerlock(ui); /* * For a MN set with an ABR write, if we are now the * owner and we have a resync region in the overlap * tree, remove the entry from overlaps and retry the write. */ if (MD_MNSET_SETNO(setno) && ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { mutex_enter(&un->un_owner_mx); if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) { mirror_overlap_tree_remove(ps); md_kstat_waitq_exit(ui); mutex_exit(&un->un_owner_mx); md_unit_readerexit(ui); daemon_request(&md_mirror_daemon, daemon_io, (daemon_queue_t *)ps, REQ_OLD); return; } mutex_exit(&un->un_owner_mx); } } /* * For Multinode mirrors with no owner and a Resync Region (not ABR) * we need to become the mirror owner before continuing with the * write(). For ABR mirrors we check that we 'own' the resync if * we're in write-after-read mode. We do this _after_ ensuring that * there are no overlaps to ensure that once we know that we are * the owner, the readerlock will not be released until the write is * complete. 
As a change of ownership in a MN set requires the * writerlock, this ensures that ownership cannot be changed until * the write is complete. */ if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) { if (MD_MN_NO_MIRROR_OWNER(un)) { if (ps->ps_flags & MD_MPS_ON_OVERLAP) mirror_overlap_tree_remove(ps); md_kstat_waitq_exit(ui); ASSERT(!(flag & MD_STR_WAR)); md_unit_readerexit(ui); daemon_request(&md_mirror_daemon, become_owner, (daemon_queue_t *)ps, REQ_OLD); return; } } /* * Mark resync region if mirror has a Resync Region _and_ we are not * a resync initiated write(). Don't mark region if we're flagged as * an ABR write. */ if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) && !(flag & MD_STR_WAR)) { if (mirror_mark_resync_region(un, ps->ps_firstblk, ps->ps_lastblk, md_mn_mynode_id)) { pb->b_flags |= B_ERROR; pb->b_resid = pb->b_bcount; if (ps->ps_flags & MD_MPS_ON_OVERLAP) mirror_overlap_tree_remove(ps); kmem_cache_free(mirror_parent_cache, ps); md_kstat_waitq_exit(ui); md_unit_readerexit(ui); md_biodone(pb); return; } } ps->ps_childbflags = pb->b_flags | B_WRITE; ps->ps_childbflags &= ~B_READ; if (flag & MD_STR_MAPPED) ps->ps_childbflags &= ~B_PAGEIO; if (!(flag & MD_STR_NOTTOP) && panicstr) /* Disable WOW and don't free ps */ ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE); md_kstat_waitq_to_runq(ui); /* * Treat Raw and Direct I/O as Write-on-Write always */ if (!(md_mirror_wow_flg & WOW_DISABLE) && (md_mirror_wow_flg & WOW_PHYS_ENABLE) && (pb->b_flags & B_PHYS) && !(ps->ps_flags & MD_MPS_WOW)) { if (ps->ps_flags & MD_MPS_ON_OVERLAP) mirror_overlap_tree_remove(ps); md_unit_readerexit(ui); daemon_request(&md_mstr_daemon, handle_wow, (daemon_queue_t *)ps, REQ_OLD); return; } ps->ps_frags = 1; do { cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); mirror_child_init(cs); cb = &cs->cs_buf; more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR)); /* * This handles the case where we're requesting * a write to block 0 on a label partition. (more < 0) * means that the request size was smaller than the * size of the label. If so this request is done. */ if (more < 0) { if (ps->ps_flags & MD_MPS_ON_OVERLAP) mirror_overlap_tree_remove(ps); md_kstat_runq_exit(ui); kmem_cache_free(mirror_child_cache, cs); kmem_cache_free(mirror_parent_cache, ps); md_unit_readerexit(ui); md_biodone(pb); return; } if (more) { mutex_enter(&ps->ps_mx); ps->ps_frags++; mutex_exit(&ps->ps_mx); } md_call_strategy(cb, flag, private); } while (more); if (!(flag & MD_STR_NOTTOP) && panicstr) { while (!(ps->ps_flags & MD_MPS_DONE)) { md_daemon(1, &md_done_daemon); drv_usecwait(10); } kmem_cache_free(mirror_parent_cache, ps); } } static void mirror_read_strategy(buf_t *pb, int flag, void *private) { md_mps_t *ps; md_mcs_t *cs; size_t more; mm_unit_t *un; mdi_unit_t *ui; size_t current_count; diskaddr_t current_blkno; off_t current_offset; buf_t *cb; /* child buf pointer */ set_t setno; ui = MDI_UNIT(getminor(pb->b_edev)); md_kstat_waitq_enter(ui); un = (mm_unit_t *)md_unit_readerlock(ui); if (!(flag & MD_STR_NOTTOP)) { if (md_checkbuf(ui, (md_unit_t *)un, pb)) { md_kstat_waitq_exit(ui); return; } } if (private == NULL) { ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); mirror_parent_init(ps); } else { ps = private; private = NULL; } if (flag & MD_STR_MAPPED) ps->ps_flags |= MD_MPS_MAPPED; if (flag & MD_NOBLOCK) ps->ps_flags |= MD_MPS_NOBLOCK; if (flag & MD_STR_WMUPDATE) ps->ps_flags |= MD_MPS_WMUPDATE; /* * Check to see if this is a DMR driven read. 
If so we need to use the * specified side (in un->un_dmr_last_read) for the source of the data. */ if (flag & MD_STR_DMR) ps->ps_flags |= MD_MPS_DMR; /* * Save essential information from the original buffhdr * in the md_save structure. */ ps->ps_un = un; ps->ps_ui = ui; ps->ps_bp = pb; ps->ps_addr = pb->b_un.b_addr; ps->ps_firstblk = pb->b_lblkno; ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; ps->ps_changecnt = un->un_changecnt; current_count = btodb(pb->b_bcount); current_blkno = pb->b_lblkno; current_offset = 0; /* * If flag has MD_STR_WAR set this means that the read is issued by a * resync thread which may or may not be an optimised resync. * * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync * code has not completed; either a resync has not started since snarf, * or there is an optimized resync in progress. * * We need to generate a write after this read in the following two * cases, * * 1. Any Resync-Generated read * * 2. Any read to a DIRTY REGION if there is an optimized resync * pending or in progress. * * The write after read is done in these cases to ensure that all sides * of the mirror are in sync with the read data and that it is not * possible for an application to read the same block multiple times * and get different data. * * This would be possible if the block was in a dirty region. * * If we're performing a directed read we don't write the data out as * the application is responsible for restoring the mirror to a known * state. */ if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) && !(flag & MD_STR_DMR)) { size_t start_rr, i, end_rr; int region_dirty = 1; /* * We enter here under three circumstances, * * MD_UN_OPT_NOT_DONE MD_STR_WAR * 0 1 * 1 0 * 1 1 * * To be optimal we only care to explicitly check for dirty * regions in the second case since if MD_STR_WAR is set we * always do the write after read. */ if (!(flag & MD_STR_WAR)) { BLK_TO_RR(end_rr, ps->ps_lastblk, un); BLK_TO_RR(start_rr, ps->ps_firstblk, un); for (i = start_rr; i <= end_rr; i++) if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0) break; } if ((region_dirty) && !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { ps->ps_call = write_after_read; /* * Mark this as a RESYNC_READ in ps_flags. * This is used if the read fails during a * resync of a 3-way mirror to ensure that * the retried read to the remaining * good submirror has MD_STR_WAR set. This * is needed to ensure that the resync write * (write-after-read) takes place. */ ps->ps_flags |= MD_MPS_RESYNC_READ; /* * If MD_STR_FLAG_ERR is set in the flags we * set MD_MPS_FLAG_ERROR so that an error on the resync * write (issued by write_after_read) will be flagged * to the biowait'ing resync thread. This allows us to * avoid issuing further resync requests to a device * that has had a write failure. */ if (flag & MD_STR_FLAG_ERR) ps->ps_flags |= MD_MPS_FLAG_ERROR; setno = MD_UN2SET(un); /* * Drop the readerlock to avoid * deadlock */ md_unit_readerexit(ui); wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); un = md_unit_readerlock(ui); /* * Ensure that we are owner */ if (MD_MNSET_SETNO(setno)) { /* * For a non-resync read that requires a * write-after-read to be done, set a flag * in the parent structure, so that the * write_strategy routine can omit the * test that the write is still within the * resync region */ if (!(flag & MD_STR_WAR)) ps->ps_flags |= MD_MPS_DIRTY_RD; /* * Before reading the buffer, see if * there is an owner. 
*/ if (MD_MN_NO_MIRROR_OWNER(un)) { ps->ps_call = NULL; mirror_overlap_tree_remove(ps); md_kstat_waitq_exit(ui); md_unit_readerexit(ui); daemon_request( &md_mirror_daemon, become_owner, (daemon_queue_t *)ps, REQ_OLD); return; } /* * For a resync read, check to see if I/O is * outside of the current resync region, or * the resync has finished. If so * just terminate the I/O */ if ((flag & MD_STR_WAR) && (!(un->c.un_status & MD_UN_WAR) || (!IN_RESYNC_REGION(un, ps)))) { #ifdef DEBUG if (mirror_debug_flag) printf("Abort resync read " "%x: %lld\n", MD_SID(un), ps->ps_firstblk); #endif mirror_overlap_tree_remove(ps); kmem_cache_free(mirror_parent_cache, ps); md_kstat_waitq_exit(ui); md_unit_readerexit(ui); md_biodone(pb); return; } } } } if (flag & MD_STR_DMR) { ps->ps_call = directed_read_done; } if (!(flag & MD_STR_NOTTOP) && panicstr) ps->ps_flags |= MD_MPS_DONTFREE; md_kstat_waitq_to_runq(ui); ps->ps_frags++; do { cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); mirror_child_init(cs); cb = &cs->cs_buf; cs->cs_ps = ps; cb = md_bioclone(pb, current_offset, current_count, NODEV, current_blkno, mirror_done, cb, KM_NOSLEEP); more = mirror_map_read(ps, cs, current_blkno, (u_longlong_t)current_count); if (more) { mutex_enter(&ps->ps_mx); ps->ps_frags++; mutex_exit(&ps->ps_mx); } /* * Do these calculations now, * so that we pickup a valid b_bcount from the chld_bp. */ current_count -= more; current_offset += cb->b_bcount; current_blkno += more; md_call_strategy(cb, flag, private); } while (more); if (!(flag & MD_STR_NOTTOP) && panicstr) { while (!(ps->ps_flags & MD_MPS_DONE)) { md_daemon(1, &md_done_daemon); drv_usecwait(10); } kmem_cache_free(mirror_parent_cache, ps); } } void md_mirror_strategy(buf_t *bp, int flag, void *private) { set_t setno = MD_MIN2SET(getminor(bp->b_edev)); /* * When doing IO to a multi owner meta device, check if set is halted. * We do this check without the needed lock held, for performance * reasons. * If an IO just slips through while the set is locked via an * MD_MN_SUSPEND_SET, we don't care about it. * Only check for suspension if we are a top-level i/o request * (MD_STR_NOTTOP is cleared in 'flag'). */ if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == (MD_SET_HALTED | MD_SET_MNSET)) { if ((flag & MD_STR_NOTTOP) == 0) { mutex_enter(&md_mx); /* Here we loop until the set is no longer halted */ while (md_set[setno].s_status & MD_SET_HALTED) { cv_wait(&md_cv, &md_mx); } mutex_exit(&md_mx); } } if ((flag & MD_IO_COUNTED) == 0) { if ((flag & MD_NOBLOCK) == 0) { if (md_inc_iocount(setno) != 0) { bp->b_flags |= B_ERROR; bp->b_error = ENXIO; bp->b_resid = bp->b_bcount; biodone(bp); return; } } else { md_inc_iocount_noblock(setno); } } if (bp->b_flags & B_READ) mirror_read_strategy(bp, flag, private); else mirror_write_strategy(bp, flag, private); } /* * mirror_directed_read: * -------------------- * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror * so that the application can determine what (if any) resync needs to be * performed. The data is copied out to the user-supplied buffer. 
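 *
 * A caller typically starts with vdr_side set to DKV_SIDE_INIT and re-issues
 * the ioctl with the vdr_side value returned by the previous call, comparing
 * the data returned from each submirror, until DKV_DMR_DONE is set in
 * vdr_flags.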
* * Parameters: * mdev - dev_t for the mirror device * vdr - directed read parameters specifying location and submirror * to perform the read from * mode - used to ddi_copyout() any resulting data from the read * * Returns: * 0 success * !0 error code * EINVAL - invalid request format */ int mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode) { buf_t *bp; minor_t mnum = getminor(mdev); mdi_unit_t *ui = MDI_UNIT(mnum); mm_unit_t *un; mm_submirror_t *sm; char *sm_nm; uint_t next_side; void *kbuffer; if (ui == NULL) return (ENXIO); if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) { return (EINVAL); } /* Check for aligned block access. We disallow non-aligned requests. */ if (vdr->vdr_offset % DEV_BSIZE) { return (EINVAL); } /* * Allocate kernel buffer for target of read(). If we had a reliable * (sorry functional) DDI this wouldn't be needed. */ kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); if (kbuffer == NULL) { cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx" " bytes\n", vdr->vdr_nbytes); return (ENOMEM); } bp = getrbuf(KM_SLEEP); bp->b_un.b_addr = kbuffer; bp->b_flags = B_READ; bp->b_bcount = vdr->vdr_nbytes; bp->b_lblkno = lbtodb(vdr->vdr_offset); bp->b_edev = mdev; un = md_unit_readerlock(ui); /* * If DKV_SIDE_INIT is set we need to determine the first available * side to start reading from. If it isn't set we increment to the * next readable submirror. * If there are no readable submirrors we error out with DKV_DMR_ERROR. * Note: we check for a readable submirror on completion of the i/o so * we should _always_ have one available. If this becomes unavailable * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if * a metadetach is made between the completion of one DKIOCDMR ioctl * and the start of the next (i.e. a sys-admin 'accident' occurred). * The chance of this is small, but not non-existent. */ if (vdr->vdr_side == DKV_SIDE_INIT) { next_side = 0; } else { next_side = vdr->vdr_side + 1; } while ((next_side < NMIRROR) && !SUBMIRROR_IS_READABLE(un, next_side)) next_side++; if (next_side >= NMIRROR) { vdr->vdr_flags |= DKV_DMR_ERROR; freerbuf(bp); vdr->vdr_bytesread = 0; md_unit_readerexit(ui); return (0); } /* Set the side to read from */ un->un_dmr_last_read = next_side; md_unit_readerexit(ui); /* * Save timestamp for verification purposes. Can be read by debugger * to verify that this ioctl has been executed and to find the number * of DMR reads and the time of the last DMR read. */ uniqtime(&mirror_dmr_stats.dmr_timestamp); mirror_dmr_stats.dmr_count++; /* Issue READ request and wait for completion */ mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL); mutex_enter(&un->un_dmr_mx); cv_wait(&un->un_dmr_cv, &un->un_dmr_mx); mutex_exit(&un->un_dmr_mx); /* * Check to see if we encountered an error during the read. If so we * can make no guarantee about any possibly returned data. 
*/ if ((bp->b_flags & B_ERROR) == 0) { vdr->vdr_flags &= ~DKV_DMR_ERROR; if (bp->b_resid) { vdr->vdr_flags |= DKV_DMR_SHORT; vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid; } else { vdr->vdr_flags |= DKV_DMR_SUCCESS; vdr->vdr_bytesread = vdr->vdr_nbytes; } /* Copy the data read back out to the user supplied buffer */ if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread, mode)) { kmem_free(kbuffer, vdr->vdr_nbytes); return (EFAULT); } } else { /* Error out with DKV_DMR_ERROR */ vdr->vdr_flags |= DKV_DMR_ERROR; vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE); } /* * Update the DMR parameters with the side and name of submirror that * we have just read from (un->un_dmr_last_read) */ un = md_unit_readerlock(ui); vdr->vdr_side = un->un_dmr_last_read; sm = &un->un_sm[un->un_dmr_last_read]; sm_nm = md_shortname(md_getminor(sm->sm_dev)); (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name)); /* * Determine if we've completed the read cycle. This is true iff the * next computed submirror (side) equals or exceeds NMIRROR. We cannot * use un_nsm as we need to handle a sparse array of submirrors (which * can occur if a submirror is metadetached). */ next_side = un->un_dmr_last_read + 1; while ((next_side < NMIRROR) && !SUBMIRROR_IS_READABLE(un, next_side)) next_side++; if (next_side >= NMIRROR) { /* We've finished */ vdr->vdr_flags |= DKV_DMR_DONE; } md_unit_readerexit(ui); freerbuf(bp); kmem_free(kbuffer, vdr->vdr_nbytes); return (0); } /* * mirror_resync_message: * --------------------- * Handle the multi-node resync messages that keep all nodes within a given * disk-set in sync with their view of a mirror's resync status. * * The message types dealt with are: * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit * MD_MN_MSG_RESYNC_NEXT - specified next region to be resynced * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp * * Returns: * 0 Success * >0 Failure error number */ int mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp) { mdi_unit_t *ui; mm_unit_t *un; set_t setno; int is_ABR; int smi; int ci; sm_state_t state; int broke_out; mm_submirror_t *sm; mm_submirror_ic_t *smic; md_m_shared_t *shared; md_error_t mde = mdnullerror; md_mps_t *ps; int rs_active; int rr, rr_start, rr_end; /* Check that the given device is part of a multi-node set */ setno = MD_MIN2SET(p->mnum); if (setno >= md_nsets) { return (ENXIO); } if (!MD_MNSET_SETNO(setno)) { return (EINVAL); } if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL) return (EINVAL); if ((ui = MDI_UNIT(p->mnum)) == NULL) return (EINVAL); is_ABR = (ui->ui_tstate & MD_ABR_CAP); /* Obtain the current resync status */ (void) md_ioctl_readerlock(lockp, ui); rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0; md_ioctl_readerexit(lockp); switch ((md_mn_msgtype_t)p->msg_type) { case MD_MN_MSG_RESYNC_STARTING: /* Start the resync thread for the mirror */ (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp); break; case MD_MN_MSG_RESYNC_NEXT: /* * We have to release any previously marked overlap regions * so that i/o can resume. Then we need to block the region * from [rs_start..rs_start+rs_size) * so that no i/o is issued. * Update un_rs_resync_done and un_rs_resync_2_do. */ (void) md_ioctl_readerlock(lockp, ui); /* * Ignore the message if there is no active resync thread or * if it is for a resync type that we have already completed. 
* un_resync_completed is set to the last resync completed * when processing a PHASE_DONE message. */ if (!rs_active || (p->rs_type == un->un_resync_completed)) break; /* * If this message is for the same resync and is for an earlier * resync region, just ignore it. This can only occur if this * node has progressed on to the next resync region before * we receive this message. This can occur if the class for * this message is busy and the originator has to retry thus * allowing this node to move onto the next resync_region. */ if ((p->rs_type == un->un_rs_type) && (p->rs_start < un->un_resync_startbl)) break; ps = un->un_rs_prev_overlap; /* Allocate previous overlap reference if needed */ if (ps == NULL) { ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); ps->ps_un = un; ps->ps_ui = ui; ps->ps_firstblk = 0; ps->ps_lastblk = 0; ps->ps_flags = 0; md_ioctl_readerexit(lockp); (void) md_ioctl_writerlock(lockp, ui); un->un_rs_prev_overlap = ps; md_ioctl_writerexit(lockp); } else md_ioctl_readerexit(lockp); if (p->rs_originator != md_mn_mynode_id) { /* * Clear our un_resync_bm for the regions completed. * The owner (originator) will take care of itself. */ BLK_TO_RR(rr_end, ps->ps_lastblk, un); BLK_TO_RR(rr_start, p->rs_start, un); if (ps->ps_lastblk && rr_end < rr_start) { BLK_TO_RR(rr_start, ps->ps_firstblk, un); mutex_enter(&un->un_resync_mx); /* * Update our resync bitmap to reflect that * another node has synchronized this range. */ for (rr = rr_start; rr <= rr_end; rr++) { CLR_KEEPDIRTY(rr, un); } mutex_exit(&un->un_resync_mx); } /* * On all but the originating node, first update * the resync state, then unblock the previous * region and block the next one. No need * to do this if the region is already blocked. * Update the submirror state and flags from the * originator. This keeps the cluster in sync with * regards to the resync status. */ (void) md_ioctl_writerlock(lockp, ui); un->un_rs_resync_done = p->rs_done; un->un_rs_resync_2_do = p->rs_2_do; un->un_rs_type = p->rs_type; un->un_resync_startbl = p->rs_start; md_ioctl_writerexit(lockp); /* * Use un_owner_mx to ensure that an ownership change * cannot happen at the same time as this message */ mutex_enter(&un->un_owner_mx); if (MD_MN_MIRROR_OWNER(un)) { ps->ps_firstblk = p->rs_start; ps->ps_lastblk = ps->ps_firstblk + p->rs_size - 1; } else { if ((ps->ps_firstblk != p->rs_start) || (ps->ps_lastblk != p->rs_start + p->rs_size - 1)) { /* Remove previous overlap range */ if (ps->ps_flags & MD_MPS_ON_OVERLAP) mirror_overlap_tree_remove(ps); ps->ps_firstblk = p->rs_start; ps->ps_lastblk = ps->ps_firstblk + p->rs_size - 1; mutex_exit(&un->un_owner_mx); /* Block this range from all i/o. */ if (ps->ps_firstblk != 0 || ps->ps_lastblk != 0) wait_for_overlaps(ps, MD_OVERLAP_ALLOW_REPEAT); mutex_enter(&un->un_owner_mx); /* * Check to see if we have obtained * ownership while waiting for * overlaps. If we have, remove * the resync_region entry from the * overlap tree */ if (MD_MN_MIRROR_OWNER(un) && (ps->ps_flags & MD_MPS_ON_OVERLAP)) mirror_overlap_tree_remove(ps); } } mutex_exit(&un->un_owner_mx); /* * If this is the first RESYNC_NEXT message (i.e. 
			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
			 * issue RESYNC_START NOTIFY event
			 */
			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
				    SVM_TAG_METADEVICE, MD_UN2SET(un),
				    MD_SID(un));
			}

			/* Ensure that our local resync thread is running */
			if (un->un_rs_thread == NULL) {
				(void) mirror_resync_unit(p->mnum, NULL,
				    &p->mde, lockp);
			}
		}
		break;

	case MD_MN_MSG_RESYNC_FINISH:
		/*
		 * Complete the resync by stopping the resync thread.
		 * Also release the previous overlap region field.
		 * Update the resync_progress_thread by cv_signal'ing it so
		 * that we mark the end of the resync as soon as possible.
		 * This stops an unnecessary delay should a panic occur
		 * after resync completion.
		 */
#ifdef DEBUG
		if (!rs_active) {
			if (mirror_debug_flag)
				printf("RESYNC_FINISH (mnum = %x), "
				    "Resync *NOT* active", p->mnum);
		}
#endif

		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
		    (p->rs_originator != md_mn_mynode_id)) {
			mutex_enter(&un->un_rs_thread_mx);
			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
			un->un_rs_thread_flags &=
			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
			cv_signal(&un->un_rs_thread_cv);
			mutex_exit(&un->un_rs_thread_mx);
		}
		if (is_ABR) {
			/* Resync finished, if ABR set owner to NULL */
			mutex_enter(&un->un_owner_mx);
			un->un_mirror_owner = 0;
			mutex_exit(&un->un_owner_mx);
		}
		(void) md_ioctl_writerlock(lockp, ui);
		ps = un->un_rs_prev_overlap;
		if (ps != NULL) {
			/* Remove previous overlap range */
			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
				mirror_overlap_tree_remove(ps);
			/*
			 * Release the overlap range reference
			 */
			un->un_rs_prev_overlap = NULL;
			kmem_cache_free(mirror_parent_cache, ps);
		}
		md_ioctl_writerexit(lockp);

		/* Mark the resync as complete in the metadb */
		un->un_rs_resync_done = p->rs_done;
		un->un_rs_resync_2_do = p->rs_2_do;
		un->un_rs_type = p->rs_type;
		mutex_enter(&un->un_rs_progress_mx);
		cv_signal(&un->un_rs_progress_cv);
		mutex_exit(&un->un_rs_progress_mx);

		un = md_ioctl_writerlock(lockp, ui);
		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
		/* Deal with any pending grow_unit */
		if (un->c.un_status & MD_UN_GROW_PENDING) {
			if ((mirror_grow_unit(un, &mde) != 0) ||
			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
				un->c.un_status &= ~MD_UN_GROW_PENDING;
			}
		}
		md_ioctl_writerexit(lockp);
		break;

	case MD_MN_MSG_RESYNC_PHASE_DONE:
		/*
		 * A phase of the resync (optimized, component or
		 * submirror) is complete. Update mirror status.
		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
		 * mirror owner is performing a resync. If we have just
		 * snarfed this set, then we must clear any of the flags set
		 * at snarf time by unit_setup_resync().
		 * Note that unit_setup_resync() sets up these flags to
		 * indicate that an optimized resync is required. These flags
		 * need to be reset because if we get here, the mirror owner
		 * will have handled the optimized resync.
		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
		 * MD_UN_WAR. In addition, for each submirror,
		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
		 * set to SMS_OFFLINE.
		 */
#ifdef DEBUG
		if (mirror_debug_flag)
			printf("phase done mess received from %d, mnum=%x,"
			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
			    p->rs_type, p->rs_flags);
#endif
		/*
		 * Ignore the message if there is no active resync thread.
*/ if (!rs_active) break; broke_out = p->rs_flags & MD_MN_RS_ERR; switch (RS_TYPE(p->rs_type)) { case MD_RS_OPTIMIZED: un = md_ioctl_writerlock(lockp, ui); if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) { /* If we are originator, just clear rs_type */ if (p->rs_originator == md_mn_mynode_id) { SET_RS_TYPE_NONE(un->un_rs_type); md_ioctl_writerexit(lockp); break; } /* * If CLEAR_OPT_NOT_DONE is set, only clear the * flags if OPT_NOT_DONE is set *and* rs_type * is MD_RS_NONE. */ if ((un->c.un_status & MD_UN_OPT_NOT_DONE) && (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) { /* No resync in progress */ un->c.un_status &= ~MD_UN_OPT_NOT_DONE; un->c.un_status &= ~MD_UN_WAR; } else { /* * We are in the middle of an * optimized resync and this message * should be ignored. */ md_ioctl_writerexit(lockp); break; } } else { /* * This is the end of an optimized resync, * clear the OPT_NOT_DONE and OFFLINE_SM flags */ un->c.un_status &= ~MD_UN_KEEP_DIRTY; if (!broke_out) un->c.un_status &= ~MD_UN_WAR; /* * Clear our un_resync_bm for the regions * completed. The owner (originator) will * take care of itself. */ if (p->rs_originator != md_mn_mynode_id && (ps = un->un_rs_prev_overlap) != NULL) { BLK_TO_RR(rr_start, ps->ps_firstblk, un); BLK_TO_RR(rr_end, ps->ps_lastblk, un); mutex_enter(&un->un_resync_mx); for (rr = rr_start; rr <= rr_end; rr++) { CLR_KEEPDIRTY(rr, un); } mutex_exit(&un->un_resync_mx); } } /* * Set resync_completed to last resync type and then * clear resync_type to indicate no resync in progress */ un->un_resync_completed = un->un_rs_type; SET_RS_TYPE_NONE(un->un_rs_type); /* * If resync is as a result of a submirror ONLINE, * reset the submirror state to SMS_RUNNING if the * resync was ok else set back to SMS_OFFLINE. */ for (smi = 0; smi < NMIRROR; smi++) { un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; if (SMS_BY_INDEX_IS(un, smi, SMS_OFFLINE_RESYNC)) { if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) { state = SMS_OFFLINE; } else { state = (broke_out ? SMS_OFFLINE : SMS_RUNNING); } mirror_set_sm_state( &un->un_sm[smi], &un->un_smic[smi], state, broke_out); mirror_commit(un, NO_SUBMIRRORS, 0); } /* * If we still have an offline submirror, reset * the OFFLINE_SM flag in the mirror status */ if (SMS_BY_INDEX_IS(un, smi, SMS_OFFLINE)) un->c.un_status |= MD_UN_OFFLINE_SM; } md_ioctl_writerexit(lockp); break; case MD_RS_SUBMIRROR: un = md_ioctl_writerlock(lockp, ui); smi = RS_SMI(p->rs_type); sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; /* Clear RESYNC target */ un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; /* * Set resync_completed to last resync type and then * clear resync_type to indicate no resync in progress */ un->un_resync_completed = un->un_rs_type; SET_RS_TYPE_NONE(un->un_rs_type); /* * If the resync completed ok reset the submirror * state to SMS_RUNNING else reset it to SMS_ATTACHED */ state = (broke_out ? 
SMS_ATTACHED : SMS_RUNNING); mirror_set_sm_state(sm, smic, state, broke_out); un->c.un_status &= ~MD_UN_WAR; mirror_commit(un, SMI2BIT(smi), 0); md_ioctl_writerexit(lockp); break; case MD_RS_COMPONENT: un = md_ioctl_writerlock(lockp, ui); smi = RS_SMI(p->rs_type); ci = RS_CI(p->rs_type); sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; shared = (md_m_shared_t *) (*(smic->sm_shared_by_indx)) (sm->sm_dev, sm, ci); un->c.un_status &= ~MD_UN_WAR; /* Clear RESYNC target */ un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; /* * Set resync_completed to last resync type and then * clear resync_type to indicate no resync in progress */ un->un_resync_completed = un->un_rs_type; SET_RS_TYPE_NONE(un->un_rs_type); /* * If the resync completed ok, set the component state * to CS_OKAY. */ if (broke_out) shared->ms_flags |= MDM_S_RS_TRIED; else { /* * As we don't transmit the changes, * no need to drop the lock. */ set_sm_comp_state(un, smi, ci, CS_OKAY, 0, MD_STATE_NO_XMIT, (IOLOCK *)NULL); } md_ioctl_writerexit(lockp); default: break; } /* * If the purpose of this PHASE_DONE message is just to * indicate to all other nodes that the optimized resync * required (OPT_NOT_DONE) flag is to be cleared, there is * no need to generate a notify event as there has not * actually been a resync. */ if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) { if (broke_out) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } else { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE, SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); } } break; default: #ifdef DEBUG cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type" " %x\n", p->msg_type); #endif return (EINVAL); } return (0); } /* Return a -1 if snarf of optimized record failed and set should be released */ static int mirror_snarf(md_snarfcmd_t cmd, set_t setno) { mddb_recid_t recid; int gotsomething; int all_mirrors_gotten; mm_unit_t *un; mddb_type_t typ1; mddb_de_ic_t *dep; mddb_rb32_t *rbp; size_t newreqsize; mm_unit_t *big_un; mm_unit32_od_t *small_un; int retval; mdi_unit_t *ui; if (cmd == MD_SNARF_CLEANUP) { if (md_get_setstatus(setno) & MD_SET_STALE) return (0); recid = mddb_makerecid(setno, 0); typ1 = (mddb_type_t)md_getshared_key(setno, mirror_md_ops.md_driver.md_drivername); while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) { un = (mm_unit_t *)mddb_getrecaddr(recid); mirror_cleanup(un); recid = mddb_makerecid(setno, 0); } } return (0); } all_mirrors_gotten = 1; gotsomething = 0; recid = mddb_makerecid(setno, 0); typ1 = (mddb_type_t)md_getshared_key(setno, mirror_md_ops.md_driver.md_drivername); while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) continue; dep = mddb_getrecdep(recid); dep->de_flags = MDDB_F_MIRROR; rbp = dep->de_rb; switch (rbp->rb_revision) { case MDDB_REV_RB: case MDDB_REV_RBFN: if ((rbp->rb_private & MD_PRV_CONVD) == 0) { /* * This means, we have an old and small * record and this record hasn't already * been converted. Before we create an * incore metadevice from this we have to * convert it to a big record. 
*/ small_un = (mm_unit32_od_t *)mddb_getrecaddr(recid); newreqsize = sizeof (mm_unit_t); big_un = (mm_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP); mirror_convert((caddr_t)small_un, (caddr_t)big_un, SMALL_2_BIG); kmem_free(small_un, dep->de_reqsize); /* * Update userdata and incore userdata * incores are at the end of un */ dep->de_rb_userdata_ic = big_un; dep->de_rb_userdata = big_un; dep->de_icreqsize = newreqsize; un = big_un; rbp->rb_private |= MD_PRV_CONVD; } else { /* * Unit already converted, just get the * record address. */ un = (mm_unit_t *)mddb_getrecaddr_resize(recid, sizeof (*un), 0); } un->c.un_revision &= ~MD_64BIT_META_DEV; break; case MDDB_REV_RB64: case MDDB_REV_RB64FN: /* Big device */ un = (mm_unit_t *)mddb_getrecaddr_resize(recid, sizeof (*un), 0); un->c.un_revision |= MD_64BIT_META_DEV; un->c.un_flag |= MD_EFILABEL; break; } MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); /* * Create minor device node for snarfed entry. */ (void) md_create_minor_node(setno, MD_SID(un)); if (MD_UNIT(MD_SID(un)) != NULL) { mddb_setrecprivate(recid, MD_PRV_PENDDEL); continue; } all_mirrors_gotten = 0; retval = mirror_build_incore(un, 1); if (retval == 0) { mddb_setrecprivate(recid, MD_PRV_GOTIT); md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0); resync_start_timeout(setno); gotsomething = 1; } else { return (retval); } /* * Set flag to indicate that the mirror has not yet * been through a reconfig. This flag is used for MN sets * when determining whether to update the mirror state from * the Master node. */ if (MD_MNSET_SETNO(setno)) { ui = MDI_UNIT(MD_SID(un)); ui->ui_tstate |= MD_RESYNC_NOT_DONE; } } if (!all_mirrors_gotten) return (gotsomething); recid = mddb_makerecid(setno, 0); while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0) if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) mddb_setrecprivate(recid, MD_PRV_PENDDEL); return (0); } static int mirror_halt(md_haltcmd_t cmd, set_t setno) { unit_t i; mdi_unit_t *ui; minor_t mnum; int reset_mirror_flag = 0; if (cmd == MD_HALT_CLOSE) return (0); if (cmd == MD_HALT_OPEN) return (0); if (cmd == MD_HALT_UNLOAD) return (0); if (cmd == MD_HALT_CHECK) { for (i = 0; i < md_nunits; i++) { mnum = MD_MKMIN(setno, i); if ((ui = MDI_UNIT(mnum)) == NULL) continue; if (ui->ui_opsindex != mirror_md_ops.md_selfindex) continue; if (md_unit_isopen(ui)) return (1); } return (0); } if (cmd != MD_HALT_DOIT) return (1); for (i = 0; i < md_nunits; i++) { mnum = MD_MKMIN(setno, i); if ((ui = MDI_UNIT(mnum)) == NULL) continue; if (ui->ui_opsindex != mirror_md_ops.md_selfindex) continue; reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0); /* Set a flag if there is at least one mirror metadevice. */ reset_mirror_flag = 1; } /* * Only wait for the global dr_timeout to finish * - if there are mirror metadevices in this diskset or * - if this is the local set since an unload of the md_mirror * driver could follow a successful mirror halt in the local set. */ if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) { while ((mirror_md_ops.md_head == NULL) && (mirror_timeout.dr_timeout_id != 0)) delay(md_hz); } return (0); } /*ARGSUSED3*/ static int mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) { IOLOCK lock; minor_t mnum = getminor(*dev); set_t setno; /* * When doing an open of a multi owner metadevice, check to see if this * node is a starting node and if a reconfig cycle is underway. * If so, the system isn't sufficiently set up enough to handle the * open (which involves I/O during sp_validate), so fail with ENXIO. 
	 */
	setno = MD_MIN2SET(mnum);
	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
		return (ENXIO);
	}

	if (md_oflags & MD_OFLG_FROMIOCTL) {
		/*
		 * This indicates that the caller is an ioctl service routine.
		 * In this case we initialise our stack-based IOLOCK and pass
		 * this into the internal open routine. This allows multi-owner
		 * metadevices to avoid deadlocking if an error is encountered
		 * during the open() attempt. The failure case is:
		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
		 * this configuration would deadlock as the mirror code has to
		 * send a state-update to the other nodes when it detects the
		 * failure of the underlying submirror with an errored
		 * soft-part on it. As there is a class1 message in progress
		 * (metaclear) set_sm_comp_state() cannot send another class1
		 * message; instead we do not send a state_update message as
		 * the metaclear is distributed and the failed submirror will
		 * be cleared from the configuration by the metaclear.
		 */
		IOLOCK_INIT(&lock);
		return (mirror_internal_open(getminor(*dev), flag, otyp,
		    md_oflags, &lock));
	} else {
		return (mirror_internal_open(getminor(*dev), flag, otyp,
		    md_oflags, (IOLOCK *)NULL));
	}
}

/*ARGSUSED1*/
static int
mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
{
	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
	    (IOLOCK *)NULL));
}

/*
 * This routine dumps memory to the disk. It assumes that the memory has
 * already been mapped into mainbus space. It is called at disk interrupt
 * priority when the system is in trouble.
 *
 */
static int
mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	mm_unit_t	*un;
	dev_t		mapdev;
	int		result;
	int		smi;
	int		any_succeed = 0;
	int		save_result = 0;

	/*
	 * Don't need to grab the unit lock: nothing else is supposed to be
	 * happening, and dump is not supposed to sleep.
	 */
	un = (mm_unit_t *)MD_UNIT(getminor(dev));

	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
		return (EINVAL);

	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
		return (EINVAL);

	for (smi = 0; smi < NMIRROR; smi++) {
		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
			continue;
		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
		result = bdev_dump(mapdev, addr, blkno, nblk);
		if (result)
			save_result = result;

		if (result == 0)
			any_succeed++;
	}

	if (any_succeed)
		return (0);

	return (save_result);
}

/*
 * NAME: mirror_probe_dev
 *
 * DESCRIPTION: force opens every component of a mirror.
 *
 * On entry the unit writerlock is held
 */
static int
mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
{
	int		i;
	int		smi;
	int		ci;
	mm_unit_t	*un;
	int		md_devopen = 0;
	set_t		setno;
	int		sm_cnt;
	int		sm_unavail_cnt;

	if (md_unit_isopen(ui))
		md_devopen++;

	un = MD_UNIT(mnum);
	setno = MD_UN2SET(un);

	sm_cnt = 0;
	sm_unavail_cnt = 0;
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t	tmpdev;
		mdi_unit_t	*sm_ui;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
			continue;
		}

		sm_cnt++;
		tmpdev = un->un_sm[i].sm_dev;
		(void) md_layered_open(mnum, &tmpdev,
		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
		un->un_sm[i].sm_dev = tmpdev;

		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));

		/*
		 * Logic similar to that in mirror_open_all_devs.  We set or
		 * clear the submirror Unavailable bit.
		 */
		(void) md_unit_writerlock(sm_ui);
		if (submirror_unavailable(un, i, 1)) {
			sm_ui->ui_tstate |= MD_INACCESSIBLE;
			sm_unavail_cnt++;
		} else {
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
		}
		md_unit_writerexit(sm_ui);
	}

	/*
	 * If all of the submirrors are unavailable, the mirror is also
	 * unavailable.
*/ if (sm_cnt == sm_unavail_cnt) { ui->ui_tstate |= MD_INACCESSIBLE; } else { ui->ui_tstate &= ~MD_INACCESSIBLE; } /* * Start checking from probe failures. If failures occur we * set the appropriate erred state only if the metadevice is in * use. This is specifically to prevent unnecessary resyncs. * For instance if the disks were accidentally disconnected when * the system booted up then until the metadevice is accessed * (like file system mount) the user can shutdown, recable and * reboot w/o incurring a potentially huge resync. */ smi = 0; ci = 0; while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) { if (mirror_other_sources(un, smi, ci, 0) == 1) { /* * Note that for a MN set, there is no need to call * SE_NOTIFY as that is done when processing the * state change */ if (md_devopen) { /* * Never called from ioctl context, * so (IOLOCK *)NULL */ set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, MD_STATE_XMIT, (IOLOCK *)NULL); if (!MD_MNSET_SETNO(setno)) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE, setno, MD_SID(un)); } continue; } else { (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV); if (!MD_MNSET_SETNO(setno)) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, setno, MD_SID(un)); } mirror_openfail_console_info(un, smi, ci); return (ENXIO); } } /* * Note that for a MN set, there is no need to call * SE_NOTIFY as that is done when processing the * state change */ if (md_devopen) { /* Never called from ioctl context, so (IOLOCK *)NULL */ set_sm_comp_state(un, smi, ci, CS_ERRED, 0, MD_STATE_XMIT, (IOLOCK *)NULL); if (!MD_MNSET_SETNO(setno)) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE, setno, MD_SID(un)); } } mirror_openfail_console_info(un, smi, ci); ci++; } if (MD_MNSET_SETNO(setno)) { send_poke_hotspares(setno); } else { (void) poke_hotspares(); } (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV); return (0); } static int mirror_imp_set( set_t setno ) { mddb_recid_t recid; int gotsomething, i; mddb_type_t typ1; mddb_de_ic_t *dep; mddb_rb32_t *rbp; mm_unit32_od_t *un32; mm_unit_t *un64; md_dev64_t self_devt; minor_t *self_id; /* minor needs to be updated */ md_parent_t *parent_id; /* parent needs to be updated */ mddb_recid_t *record_id; /* record id needs to be updated */ mddb_recid_t *optrec_id; md_dev64_t tmpdev; gotsomething = 0; typ1 = (mddb_type_t)md_getshared_key(setno, mirror_md_ops.md_driver.md_drivername); recid = mddb_makerecid(setno, 0); while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) continue; dep = mddb_getrecdep(recid); rbp = dep->de_rb; switch (rbp->rb_revision) { case MDDB_REV_RB: case MDDB_REV_RBFN: /* * Small device */ un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid); self_id = &(un32->c.un_self_id); parent_id = &(un32->c.un_parent); record_id = &(un32->c.un_record_id); optrec_id = &(un32->un_rr_dirty_recid); for (i = 0; i < un32->un_nsm; i++) { tmpdev = md_expldev(un32->un_sm[i].sm_dev); un32->un_sm[i].sm_dev = md_cmpldev (md_makedevice(md_major, MD_MKMIN(setno, MD_MIN2UNIT(md_getminor(tmpdev))))); if (!md_update_minor(setno, mddb_getsidenum (setno), un32->un_sm[i].sm_key)) goto out; } break; case MDDB_REV_RB64: case MDDB_REV_RB64FN: un64 = (mm_unit_t *)mddb_getrecaddr(recid); self_id = &(un64->c.un_self_id); parent_id = &(un64->c.un_parent); record_id = &(un64->c.un_record_id); optrec_id = &(un64->un_rr_dirty_recid); for (i = 0; i < un64->un_nsm; i++) { tmpdev = un64->un_sm[i].sm_dev; un64->un_sm[i].sm_dev = md_makedevice (md_major, MD_MKMIN(setno, 
MD_MIN2UNIT (md_getminor(tmpdev)))); if (!md_update_minor(setno, mddb_getsidenum (setno), un64->un_sm[i].sm_key)) goto out; } break; } /* * If this is a top level and a friendly name metadevice, * update its minor in the namespace. */ if ((*parent_id == MD_NO_PARENT) && ((rbp->rb_revision == MDDB_REV_RBFN) || (rbp->rb_revision == MDDB_REV_RB64FN))) { self_devt = md_makedevice(md_major, *self_id); if (!md_update_top_device_minor(setno, mddb_getsidenum(setno), self_devt)) goto out; } /* * Update unit with the imported setno * */ mddb_setrecprivate(recid, MD_PRV_GOTIT); *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); if (*parent_id != MD_NO_PARENT) *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); *record_id = MAKERECID(setno, DBID(*record_id)); *optrec_id = MAKERECID(setno, DBID(*optrec_id)); gotsomething = 1; } out: return (gotsomething); } /* * NAME: mirror_check_offline * * DESCRIPTION: return offline_status = 1 if any submirrors are offline * * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE * ioctl. */ int mirror_check_offline(md_dev64_t dev, int *offline_status) { mm_unit_t *un; md_error_t mde = mdnullerror; if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) return (EINVAL); *offline_status = 0; if (un->c.un_status & MD_UN_OFFLINE_SM) *offline_status = 1; return (0); } /* * NAME: mirror_inc_abr_count * * DESCRIPTION: increment the count of layered soft parts with ABR set * * Called from ioctl, so access to un_abr_count is protected by the global * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. */ int mirror_inc_abr_count(md_dev64_t dev) { mm_unit_t *un; md_error_t mde = mdnullerror; if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) return (EINVAL); un->un_abr_count++; return (0); } /* * NAME: mirror_dec_abr_count * * DESCRIPTION: decrement the count of layered soft parts with ABR set * * Called from ioctl, so access to un_abr_count is protected by the global * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 
 */
int
mirror_dec_abr_count(md_dev64_t dev)
{
	mm_unit_t	*un;
	md_error_t	mde = mdnullerror;

	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
		return (EINVAL);

	un->un_abr_count--;
	return (0);
}

static md_named_services_t mirror_named_services[] = {
	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"	},
	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS	},
	{mirror_rename_check,				MDRNM_CHECK		},
	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS	},
	{(intptr_t (*)()) mirror_exchange_parent_update_to,
			MDRNM_PARENT_UPDATE_TO },
	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
			MDRNM_SELF_UPDATE_FROM_DOWN },
	{(intptr_t (*)())mirror_probe_dev,		"probe open test"	},
	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE	},
	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT	},
	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT	},
	{ NULL,						0			}
};

md_ops_t mirror_md_ops = {
	mirror_open,		/* open */
	mirror_close,		/* close */
	md_mirror_strategy,	/* strategy */
	NULL,			/* print */
	mirror_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_mirror_ioctl,	/* mirror_ioctl, */
	mirror_snarf,		/* mirror_snarf */
	mirror_halt,		/* mirror_halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	mirror_imp_set,		/* import set */
	mirror_named_services
};

/* module specific initialization */
static void
init_init()
{
	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);

	/* Initialize the parent and child save memory pools */
	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
	    sizeof (md_mps_t), 0, mirror_parent_constructor,
	    mirror_parent_destructor, mirror_run_queue, NULL, NULL, 0);

	mirror_child_cache = kmem_cache_create("md_mirror_child",
	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
	    mirror_child_constructor, mirror_child_destructor,
	    mirror_run_queue, NULL, NULL, 0);

	/*
	 * Ensure wowbuf_size is a multiple of DEV_BSIZE,
	 * then initialize wowbuf memory pool.
	 */
	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
	if (md_wowbuf_size <= 0)
		md_wowbuf_size = 2 * DEV_BSIZE;
	if (md_wowbuf_size > (32 * DEV_BSIZE))
		md_wowbuf_size = (32 * DEV_BSIZE);

	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
}

/* module specific uninitialization (undo init_init()) */
static void
fini_uninit()
{
	kmem_cache_destroy(mirror_parent_cache);
	kmem_cache_destroy(mirror_child_cache);
	kmem_cache_destroy(mirror_wowblk_cache);
	mirror_parent_cache = mirror_child_cache =
	    mirror_wowblk_cache = NULL;

	mutex_destroy(&mirror_timeout.dr_mx);
	mutex_destroy(&hotspare_request.dr_mx);
	mutex_destroy(&non_ff_drv_mutex);
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())
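
Editor's note: the directed mirror read (DMR) handling near the top of this section reports each pass back to the caller through vdr_flags (DKV_DMR_SUCCESS, DKV_DMR_SHORT or DKV_DMR_ERROR) and raises DKV_DMR_DONE once every readable submirror side has been visited. The user-level sketch below shows how that cycle might be driven. It is illustrative only and is not part of mirror.c; it assumes the DKIOCDMR ioctl, struct vol_directed_rd and the DKV_SIDE_INIT sentinel from <sys/dkio.h>, plus a vdr_offset field, none of which are defined in this file.

	#include <sys/types.h>
	#include <sys/dkio.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	/*
	 * Illustrative sketch: read the same range from every readable
	 * submirror side of a mirror metadevice, one side per DKIOCDMR
	 * call, until the driver reports DKV_DMR_DONE.
	 */
	static int
	read_all_sides(const char *mirror_rdev, offset_t off, size_t len)
	{
		struct vol_directed_rd	vdr;
		char			*buf;
		int			fd;

		if ((buf = malloc(len)) == NULL)
			return (-1);
		if ((fd = open(mirror_rdev, O_RDONLY)) < 0) {
			free(buf);
			return (-1);
		}

		(void) memset(&vdr, 0, sizeof (vdr));
		vdr.vdr_offset = off;		/* assumed field name */
		vdr.vdr_nbytes = len;
		vdr.vdr_data = buf;
		vdr.vdr_side = DKV_SIDE_INIT;	/* assumed start-of-cycle value */

		do {
			if (ioctl(fd, DKIOCDMR, &vdr) < 0) {
				perror("DKIOCDMR");
				break;
			}
			if (vdr.vdr_flags & DKV_DMR_ERROR) {
				(void) fprintf(stderr,
				    "side %d (%s): read error\n",
				    vdr.vdr_side, vdr.vdr_side_name);
				break;
			}
			/* DKV_DMR_SUCCESS or DKV_DMR_SHORT set */
			(void) printf("side %d (%s): %lu bytes read\n",
			    vdr.vdr_side, vdr.vdr_side_name,
			    (unsigned long)vdr.vdr_bytesread);
		} while (!(vdr.vdr_flags & DKV_DMR_DONE));

		free(buf);
		(void) close(fd);
		return (0);
	}

A real consumer would typically compare the buffers returned for each side rather than just reporting the byte counts.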