Mercurial > illumos > illumos-gate
changeset 6901:307e592cef33
6510471 svm overlap chain book keeping does not scale well
author | jkennedy |
---|---|
date | Wed, 18 Jun 2008 08:22:31 -0700 |
parents | 50f0e694522d |
children | 5b004da8de91 |
files | usr/src/common/lvm/md_convert.c usr/src/uts/common/io/lvm/mirror/mirror.c usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c usr/src/uts/common/io/lvm/mirror/mirror_resync.c usr/src/uts/common/sys/lvm/md_mirror.h |
diffstat | 5 files changed, 444 insertions(+), 515 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/common/lvm/md_convert.c Wed Jun 18 00:57:00 2008 -0700 +++ b/usr/src/common/lvm/md_convert.c Wed Jun 18 08:22:31 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -136,7 +135,7 @@ /* Compute the offset of the first component */ first_comp = sizeof (ms_unit_t) + - sizeof (struct ms_row) * (un->un_nrows - 1); + sizeof (struct ms_row) * (un->un_nrows - 1); first_comp = roundup(first_comp, sizeof (long long)); if (first_comp_only == FIRST_COMP_OFFSET) return (first_comp); @@ -169,7 +168,7 @@ /* Compute the size of the new small ms_unit */ first_comp = sizeof (ms_unit32_od_t) + - sizeof (struct ms_row32_od) * (un->un_nrows - 1); + sizeof (struct ms_row32_od) * (un->un_nrows - 1); first_comp = roundup(first_comp, sizeof (long long)); if (first_comp_only == FIRST_COMP_OFFSET) return (first_comp); @@ -222,9 +221,9 @@ small_un->un_hsp_id = big_un->un_hsp_id; small_un->un_nrows = big_un->un_nrows; small_un->c.un_size = - get_small_stripe_req_size(big_un, COMPLETE_STRUCTURE); + get_small_stripe_req_size(big_un, COMPLETE_STRUCTURE); small_un->un_ocomp = - get_small_stripe_req_size(big_un, FIRST_COMP_OFFSET); + get_small_stripe_req_size(big_un, FIRST_COMP_OFFSET); /* walk through all rows */ big_mdr = &big_un->un_row[0]; @@ -236,10 +235,10 @@ } /* Now copy the components */ - big_mdcomp = (ms_comp_t *)(void *)&((char *)big_un) - [big_un->un_ocomp]; + big_mdcomp = (ms_comp_t *)(void *)&((char *)big_un) + [big_un->un_ocomp]; small_mdcomp = (ms_comp32_od_t *)(void *)&((char *)small_un) - [small_un->un_ocomp]; + [small_un->un_ocomp]; for (comp = 0; (comp < ncomps); ++comp) { ms_comp_t *big_mdcp = &big_mdcomp[comp]; ms_comp32_od_t *small_mdcp = &small_mdcomp[comp]; @@ -255,9 +254,9 @@ big_un->un_hsp_id = small_un->un_hsp_id; big_un->un_nrows = small_un->un_nrows; big_un->c.un_size = - get_big_stripe_req_size(small_un, COMPLETE_STRUCTURE); + get_big_stripe_req_size(small_un, COMPLETE_STRUCTURE); big_un->un_ocomp = - get_big_stripe_req_size(small_un, FIRST_COMP_OFFSET); + get_big_stripe_req_size(small_un, FIRST_COMP_OFFSET); /* walk through all rows */ @@ -270,9 +269,9 @@ } /* Now copy the components */ big_mdcomp = (ms_comp_t *)(void *)&((char *)big_un) - [big_un->un_ocomp]; + [big_un->un_ocomp]; small_mdcomp = (ms_comp32_od_t *)(void *)&((char *)small_un) - [small_un->un_ocomp]; + [small_un->un_ocomp]; for (comp = 0; (comp < ncomps); ++comp) { ms_comp_t *big_mdcp = &big_mdcomp[comp]; ms_comp32_od_t *small_mdcp = &small_mdcomp[comp]; @@ -320,7 +319,7 @@ MMSM_BIG2SMALL((&(big_un->un_sm[i])), (&(small_un->un_sm[i]))); } - small_un->un_ovrlap_chn_flg = big_un->un_ovrlap_chn_flg; + small_un->un_overlap_tree_flag = big_un->un_overlap_tree_flag; small_un->un_read_option = big_un->un_read_option; small_un->un_write_option = big_un->un_write_option; small_un->un_pass_num = big_un->un_pass_num; @@ -353,7 +352,7 @@ /* Now back to the simple things again */ - big_un->un_ovrlap_chn_flg = small_un->un_ovrlap_chn_flg; + big_un->un_overlap_tree_flag = small_un->un_overlap_tree_flag; big_un->un_read_option = small_un->un_read_option; big_un->un_write_option = small_un->un_write_option; big_un->un_pass_num = small_un->un_pass_num;
--- a/usr/src/uts/common/io/lvm/mirror/mirror.c Wed Jun 18 00:57:00 2008 -0700 +++ b/usr/src/uts/common/io/lvm/mirror/mirror.c Wed Jun 18 08:22:31 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -58,6 +58,7 @@ #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/svm.h> #include <sys/lvm/mdmn_commd.h> +#include <sys/avl.h> md_ops_t mirror_md_ops; #ifndef lint @@ -337,11 +338,11 @@ * flag. They are both exclusive tests. */ open_comp = (frm_probe) ? - (shared->ms_flags & MDM_S_PROBEOPEN): - (shared->ms_flags & MDM_S_ISOPEN); + (shared->ms_flags & MDM_S_PROBEOPEN): + (shared->ms_flags & MDM_S_ISOPEN); if ((shared->ms_flags & MDM_S_IOERR || !open_comp) && - ((shared->ms_state == CS_OKAY) || - (shared->ms_state == CS_RESYNC))) { + ((shared->ms_state == CS_OKAY) || + (shared->ms_state == CS_RESYNC))) { if (clr_error) { shared->ms_flags &= ~MDM_S_IOERR; } @@ -418,7 +419,7 @@ sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) - (sm->sm_dev, sm, ci); + (sm->sm_dev, sm, ci); if (shared->ms_state != CS_ERRED) return (0); @@ -447,12 +448,12 @@ rw_exit(&mirror_md_ops.md_link_rw.lock); #ifdef DEBUG if (mirror_debug_flag) - printf("send alloc hotspare, flags=0x%x %x, %x, %x, %x\n", - flags, - allochspmsg.msg_allochsp_mnum, - allochspmsg.msg_allochsp_sm, - allochspmsg.msg_allochsp_comp, - allochspmsg.msg_allochsp_hs_id); + printf("send alloc hotspare, flags=" + "0x%x %x, %x, %x, %x\n", flags, + allochspmsg.msg_allochsp_mnum, + allochspmsg.msg_allochsp_sm, + allochspmsg.msg_allochsp_comp, + allochspmsg.msg_allochsp_hs_id); #endif if (flags & MD_HOTSPARE_WMUPDATE) { msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2; @@ -661,16 +662,15 @@ md_m_shared_t *shared; shared = (md_m_shared_t *) - (*(smic->sm_shared_by_indx))(sm->sm_dev, - sm, ci); + (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); /* * Never called from ioctl context, so pass in * (IOLOCK *)NULL. Pass through flags from calling * routine, also setting XMIT flag. */ if (check_comp_4_hotspares(un, i, ci, - (MD_HOTSPARE_XMIT | flags), - shared->ms_hs_id, (IOLOCK *)NULL) != 0) + (MD_HOTSPARE_XMIT | flags), + shared->ms_hs_id, (IOLOCK *)NULL) != 0) return (1); } } @@ -762,8 +762,8 @@ if (hotspare_request.dr_pending == 0) { hotspare_request.dr_pending = 1; daemon_request(&md_mhs_daemon, - check_4_hotspares, - (daemon_queue_t *)&hotspare_request, REQ_OLD); + check_4_hotspares, (daemon_queue_t *)&hotspare_request, + REQ_OLD); } mutex_exit(&hotspare_request.dr_mx); return (0); @@ -804,12 +804,11 @@ if (get_dev != NULL) { (void) (*get_dev)(tmpdev, smi, ci, &cd); cmn_err(CE_WARN, "md %s: open error on %s", - md_shortname(MD_SID(un)), - md_devname(MD_UN2SET(un), cd.cd_dev, - NULL, 0)); + md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un), + cd.cd_dev, NULL, 0)); } else { cmn_err(CE_WARN, "md %s: open error", - md_shortname(MD_SID(un))); + md_shortname(MD_SID(un))); } } @@ -840,62 +839,63 @@ { mutex_enter(&non_ff_drv_mutex); if (non_ff_drivers == NULL) { - non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *), - KM_NOSLEEP); - if (non_ff_drivers == NULL) { - mutex_exit(&non_ff_drv_mutex); - return (1); - } - - non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); - if (non_ff_drivers[0] == NULL) { - kmem_free(non_ff_drivers, 2 * sizeof (char *)); - non_ff_drivers = NULL; - mutex_exit(&non_ff_drv_mutex); - return (1); - } - - (void) strcpy(non_ff_drivers[0], s); - non_ff_drivers[1] = NULL; + non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *), + KM_NOSLEEP); + if (non_ff_drivers == NULL) { + mutex_exit(&non_ff_drv_mutex); + return (1); + } + + non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, + KM_NOSLEEP); + if (non_ff_drivers[0] == NULL) { + kmem_free(non_ff_drivers, 2 * sizeof (char *)); + non_ff_drivers = NULL; + mutex_exit(&non_ff_drv_mutex); + return (1); + } + + (void) strcpy(non_ff_drivers[0], s); + non_ff_drivers[1] = NULL; } else { - int i; - char **tnames; - char **tmp; - - for (i = 0; non_ff_drivers[i] != NULL; i++) { - if (strcmp(s, non_ff_drivers[i]) == 0) { - mutex_exit(&non_ff_drv_mutex); - return (0); + int i; + char **tnames; + char **tmp; + + for (i = 0; non_ff_drivers[i] != NULL; i++) { + if (strcmp(s, non_ff_drivers[i]) == 0) { + mutex_exit(&non_ff_drv_mutex); + return (0); + } + } + + /* allow for new element and null */ + i += 2; + tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP); + if (tnames == NULL) { + mutex_exit(&non_ff_drv_mutex); + return (1); } - } - - /* allow for new element and null */ - i += 2; - tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP); - if (tnames == NULL) { - mutex_exit(&non_ff_drv_mutex); - return (1); - } - - for (i = 0; non_ff_drivers[i] != NULL; i++) - tnames[i] = non_ff_drivers[i]; - - tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); - if (tnames[i] == NULL) { - /* adjust i so that it is the right count to free */ - kmem_free(tnames, (i + 2) * sizeof (char *)); - mutex_exit(&non_ff_drv_mutex); - return (1); - } - - (void) strcpy(tnames[i++], s); - tnames[i] = NULL; - - tmp = non_ff_drivers; - non_ff_drivers = tnames; - /* i now represents the count we previously alloced */ - kmem_free(tmp, i * sizeof (char *)); + + for (i = 0; non_ff_drivers[i] != NULL; i++) + tnames[i] = non_ff_drivers[i]; + + tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); + if (tnames[i] == NULL) { + /* adjust i so that it is the right count to free */ + kmem_free(tnames, (i + 2) * sizeof (char *)); + mutex_exit(&non_ff_drv_mutex); + return (1); + } + + (void) strcpy(tnames[i++], s); + tnames[i] = NULL; + + tmp = non_ff_drivers; + non_ff_drivers = tnames; + /* i now represents the count we previously alloced */ + kmem_free(tmp, i * sizeof (char *)); } mutex_exit(&non_ff_drv_mutex); @@ -918,110 +918,126 @@ mm_unit_t *un; if (md_ff_disable) - return; + return; un = MD_UNIT(mnum); for (i = 0; i < NMIRROR; i++) { - int ci; - int cnt; - int ff = 1; - mm_submirror_t *sm; - mm_submirror_ic_t *smic; - void (*get_dev)(); - - if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) - continue; - - sm = &un->un_sm[i]; - smic = &un->un_smic[i]; - - get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, - "get device", 0); - - cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); - for (ci = 0; ci < cnt; ci++) { - int found = 0; - dev_t ci_dev; - major_t major; - dev_info_t *devi; - ms_cd_info_t cd; - - /* this already returns the hs dev if the device is spared */ - (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); - - ci_dev = md_dev64_to_dev(cd.cd_dev); - major = getmajor(ci_dev); - - if (major == md_major) { - /* this component must be a soft partition; get real dev */ - minor_t dev_mnum; - mdi_unit_t *ui; - mp_unit_t *un; - set_t setno; - side_t side; - md_dev64_t tmpdev; - - ui = MDI_UNIT(getminor(ci_dev)); - - /* grab necessary lock */ - un = (mp_unit_t *)md_unit_readerlock(ui); - - dev_mnum = MD_SID(un); - setno = MD_MIN2SET(dev_mnum); - side = mddb_getsidenum(setno); - - tmpdev = un->un_dev; - - /* Get dev by device id */ - if (md_devid_found(setno, side, un->un_key) == 1) { - tmpdev = md_resolve_bydevid(dev_mnum, tmpdev, - un->un_key); - } - - md_unit_readerexit(ui); - - ci_dev = md_dev64_to_dev(tmpdev); - major = getmajor(ci_dev); + int ci; + int cnt; + int ff = 1; + mm_submirror_t *sm; + mm_submirror_ic_t *smic; + void (*get_dev)(); + + if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) + continue; + + sm = &un->un_sm[i]; + smic = &un->un_smic[i]; + + get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, + "get device", 0); + + cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); + for (ci = 0; ci < cnt; ci++) { + int found = 0; + dev_t ci_dev; + major_t major; + dev_info_t *devi; + ms_cd_info_t cd; + + /* + * this already returns the hs + * dev if the device is spared + */ + (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); + + ci_dev = md_dev64_to_dev(cd.cd_dev); + major = getmajor(ci_dev); + + if (major == md_major) { + /* + * this component must be a soft + * partition; get the real dev + */ + minor_t dev_mnum; + mdi_unit_t *ui; + mp_unit_t *un; + set_t setno; + side_t side; + md_dev64_t tmpdev; + + ui = MDI_UNIT(getminor(ci_dev)); + + /* grab necessary lock */ + un = (mp_unit_t *)md_unit_readerlock(ui); + + dev_mnum = MD_SID(un); + setno = MD_MIN2SET(dev_mnum); + side = mddb_getsidenum(setno); + + tmpdev = un->un_dev; + + /* Get dev by device id */ + if (md_devid_found(setno, side, + un->un_key) == 1) { + tmpdev = md_resolve_bydevid(dev_mnum, + tmpdev, un->un_key); + } + + md_unit_readerexit(ui); + + ci_dev = md_dev64_to_dev(tmpdev); + major = getmajor(ci_dev); + } + + if (ci_dev != NODEV32 && + (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) + != NULL) { + ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; + int propvalue = 0; + int proplength = sizeof (int); + int error; + struct cb_ops *cb; + + if ((cb = devopsp[major]->devo_cb_ops) != + NULL) { + error = (*cb->cb_prop_op) + (DDI_DEV_T_ANY, devi, prop_op, + DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, + "ddi-failfast-supported", + (caddr_t)&propvalue, &proplength); + + if (error == DDI_PROP_SUCCESS) + found = 1; + } + + if (!found && new_non_ff_driver( + ddi_driver_name(devi))) { + cmn_err(CE_NOTE, "!md: B_FAILFAST I/O" + "disabled on %s", + ddi_driver_name(devi)); + } + + ddi_release_devi(devi); + } + + /* + * All components must support + * failfast in the submirror. + */ + if (!found) { + ff = 0; + break; + } } - if (ci_dev != NODEV32 && - (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) != NULL) { - ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; - int propvalue = 0; - int proplength = sizeof (int); - int error; - struct cb_ops *cb; - - if ((cb = devopsp[major]->devo_cb_ops) != NULL) { - error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, prop_op, - DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, - "ddi-failfast-supported", - (caddr_t)&propvalue, &proplength); - - if (error == DDI_PROP_SUCCESS) - found = 1; - } - - if (!found && new_non_ff_driver(ddi_driver_name(devi))) - cmn_err(CE_NOTE, "!md: B_FAILFAST I/O disabled on %s", - ddi_driver_name(devi)); - - ddi_release_devi(devi); + if (ff) { + sm->sm_flags |= MD_SM_FAILFAST; + } else { + sm->sm_flags &= ~MD_SM_FAILFAST; } - - /* All components must support failfast in the submirror. */ - if (!found) { - ff = 0; - break; - } - } - - if (ff) { - sm->sm_flags |= MD_SM_FAILFAST; - } else { - sm->sm_flags &= ~MD_SM_FAILFAST; - } } } @@ -1288,37 +1304,24 @@ } void -mirror_overlap_chain_remove(md_mps_t *ps) +mirror_overlap_tree_remove(md_mps_t *ps) { mm_unit_t *un; if (panicstr) return; - ASSERT(ps->ps_flags & MD_MPS_ON_OVERLAP); - + VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP); un = ps->ps_un; - mutex_enter(&un->un_ovrlap_chn_mx); - if (ps->ps_ovrlap_prev != &un->un_ovrlap_chn) - ps->ps_ovrlap_prev->ps_ovrlap_next = ps->ps_ovrlap_next; - else - un->un_ovrlap_chn.ps_ovrlap_next = ps->ps_ovrlap_next; - if (ps->ps_ovrlap_next != &un->un_ovrlap_chn) - ps->ps_ovrlap_next->ps_ovrlap_prev = ps->ps_ovrlap_prev; - else - un->un_ovrlap_chn.ps_ovrlap_prev = ps->ps_ovrlap_prev; - /* Handle empty overlap chain */ - if (un->un_ovrlap_chn.ps_ovrlap_prev == &un->un_ovrlap_chn) { - un->un_ovrlap_chn.ps_ovrlap_prev = - un->un_ovrlap_chn.ps_ovrlap_next = NULL; + mutex_enter(&un->un_overlap_tree_mx); + avl_remove(&un->un_overlap_root, ps); + ps->ps_flags &= ~MD_MPS_ON_OVERLAP; + if (un->un_overlap_tree_flag != 0) { + un->un_overlap_tree_flag = 0; + cv_broadcast(&un->un_overlap_tree_cv); } - if (un->un_ovrlap_chn_flg) { - un->un_ovrlap_chn_flg = 0; - cv_broadcast(&un->un_ovrlap_chn_cv); - } - ps->ps_flags &= ~MD_MPS_ON_OVERLAP; - mutex_exit(&un->un_ovrlap_chn_mx); + mutex_exit(&un->un_overlap_tree_mx); } @@ -1328,139 +1331,53 @@ * Check that given i/o request does not cause an overlap with already pending * i/o. If it does, block until the overlapped i/o completes. * - * Note: the overlap chain is held as a monotonically increasing doubly-linked - * list with the sentinel contained in un->un_ovrlap_chn. We avoid a linear - * search of the list by the following logic: - * ps->ps_lastblk < un_ovrlap_chn.ps_ovrlap_next->ps_firstblk => No overlap - * ps->ps_firstblk > un_ovrlap_chn.ps_ovrlap_prev->ps_lastblk => No overlap - * otherwise - * scan un_ovrlap_chn.ps_ovrlap_next for location where ps->ps_firstblk - * > chain->ps_lastblk. This is the insertion point. As the list is - * guaranteed to be ordered there is no need to continue scanning. - * * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent - * structure to be already on the overlap chain and MD_OVERLAP_NO_REPEAT - * if it must not already be on the chain + * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if + * it must not already be in the tree. */ static void wait_for_overlaps(md_mps_t *ps, int flags) { mm_unit_t *un; - md_mps_t *ps1, **head, **tail; + avl_index_t where; + md_mps_t *ps1; if (panicstr) return; - un = ps->ps_un; - - mutex_enter(&un->un_ovrlap_chn_mx); + mutex_enter(&un->un_overlap_tree_mx); if ((flags & MD_OVERLAP_ALLOW_REPEAT) && (ps->ps_flags & MD_MPS_ON_OVERLAP)) { - mutex_exit(&un->un_ovrlap_chn_mx); - return; - } - - ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); - head = &(un->un_ovrlap_chn.ps_ovrlap_next); - tail = &(un->un_ovrlap_chn.ps_ovrlap_prev); - ps1 = *head; - /* - * Check for simple limit cases: - * *head == NULL - * insert ps at head of list - * lastblk < head->firstblk - * insert at head of list - * firstblk > tail->lastblk - * insert at tail of list - */ - if (ps1 == NULL) { - /* Insert at head */ - ps->ps_ovrlap_next = &un->un_ovrlap_chn; - ps->ps_ovrlap_prev = &un->un_ovrlap_chn; - *head = ps; - *tail = ps; - ps->ps_flags |= MD_MPS_ON_OVERLAP; - mutex_exit(&un->un_ovrlap_chn_mx); - return; - } else if (ps->ps_lastblk < (*head)->ps_firstblk) { - /* Insert at head */ - ps->ps_ovrlap_next = (*head); - ps->ps_ovrlap_prev = &un->un_ovrlap_chn; - (*head)->ps_ovrlap_prev = ps; - *head = ps; - ps->ps_flags |= MD_MPS_ON_OVERLAP; - mutex_exit(&un->un_ovrlap_chn_mx); - return; - } else if (ps->ps_firstblk > (*tail)->ps_lastblk) { - /* Insert at tail */ - ps->ps_ovrlap_prev = (*tail); - ps->ps_ovrlap_next = &un->un_ovrlap_chn; - (*tail)->ps_ovrlap_next = ps; - *tail = ps; - ps->ps_flags |= MD_MPS_ON_OVERLAP; - mutex_exit(&un->un_ovrlap_chn_mx); + mutex_exit(&un->un_overlap_tree_mx); return; } - /* Now we have to scan the list for possible overlaps */ - while (ps1 != NULL) { - /* - * If this region has been put on the chain by another thread - * just exit - */ - if ((flags & MD_OVERLAP_ALLOW_REPEAT) && - (ps->ps_flags & MD_MPS_ON_OVERLAP)) { - mutex_exit(&un->un_ovrlap_chn_mx); - return; - + + VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); + + do { + ps1 = avl_find(&un->un_overlap_root, ps, &where); + if (ps1 == NULL) { + /* + * The candidate range does not overlap with any + * range in the tree. Insert it and be done. + */ + avl_insert(&un->un_overlap_root, ps, where); + ps->ps_flags |= MD_MPS_ON_OVERLAP; + } else { + /* + * The candidate range would overlap. Set the flag + * indicating we need to be woken up, and sleep + * until another thread removes a range. If upon + * waking up we find this mps was put on the tree + * by another thread, the loop terminates. + */ + un->un_overlap_tree_flag = 1; + cv_wait(&un->un_overlap_tree_cv, + &un->un_overlap_tree_mx); } - for (ps1 = *head; ps1 && (ps1 != &un->un_ovrlap_chn); - ps1 = ps1->ps_ovrlap_next) { - if (ps->ps_firstblk > (*tail)->ps_lastblk) { - /* Insert at tail */ - ps->ps_ovrlap_prev = (*tail); - ps->ps_ovrlap_next = &un->un_ovrlap_chn; - (*tail)->ps_ovrlap_next = ps; - *tail = ps; - ps->ps_flags |= MD_MPS_ON_OVERLAP; - mutex_exit(&un->un_ovrlap_chn_mx); - return; - } - if (ps->ps_firstblk > ps1->ps_lastblk) - continue; - if (ps->ps_lastblk < ps1->ps_firstblk) { - /* Insert into list at current 'ps1' position */ - ps->ps_ovrlap_next = ps1; - ps->ps_ovrlap_prev = ps1->ps_ovrlap_prev; - ps1->ps_ovrlap_prev->ps_ovrlap_next = ps; - ps1->ps_ovrlap_prev = ps; - ps->ps_flags |= MD_MPS_ON_OVERLAP; - mutex_exit(&un->un_ovrlap_chn_mx); - return; - } - break; - } - if (ps1 != NULL) { - un->un_ovrlap_chn_flg = 1; - cv_wait(&un->un_ovrlap_chn_cv, &un->un_ovrlap_chn_mx); - /* - * Now ps1 refers to the old insertion point and we - * have to check the whole chain to see if we're still - * overlapping any other i/o. - */ - } - } - - /* - * Only get here if we had one overlapping i/o on the list and that - * has now completed. In this case the list is empty so we insert <ps> - * at the head of the chain. - */ - ASSERT(*head == NULL); - *tail = *head = ps; - ps->ps_ovrlap_next = ps->ps_ovrlap_prev = &un->un_ovrlap_chn; - ps->ps_flags |= MD_MPS_ON_OVERLAP; - mutex_exit(&un->un_ovrlap_chn_mx); + } while (!(ps->ps_flags & MD_MPS_ON_OVERLAP)); + mutex_exit(&un->un_overlap_tree_mx); } /* @@ -1747,7 +1664,7 @@ ps->ps_allfrom_sm = SMI2BIT(sm_index); if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) { - bp->b_flags |= B_FAILFAST; + bp->b_flags |= B_FAILFAST; } return (0); @@ -1794,7 +1711,7 @@ return; if (snarfing) { sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno), - sm->sm_key, MD_NOTRUST_DEVT); + sm->sm_key, MD_NOTRUST_DEVT); } else { if (md_getmajor(sm->sm_dev) == md_major) { su = MD_UNIT(md_getminor(sm->sm_dev)); @@ -1807,12 +1724,10 @@ 0, "shared by blk", 0); smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev, 0, "shared by indx", 0); - smic->sm_get_component_count = - (int (*)())md_get_named_service(sm->sm_dev, 0, - "get component count", 0); - smic->sm_get_bcss = - (int (*)())md_get_named_service(sm->sm_dev, 0, - "get block count skip size", 0); + smic->sm_get_component_count = (int (*)())md_get_named_service( + sm->sm_dev, 0, "get component count", 0); + smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0, + "get block count skip size", 0); sm->sm_state &= ~SMS_IGNORE; if (SMS_IS(sm, SMS_OFFLINE)) MD_STATUS(un) |= MD_UN_OFFLINE_SM; @@ -1851,6 +1766,36 @@ md_rem_names(sv, nsv); } +/* + * Comparison function for the avl tree which tracks + * outstanding writes on submirrors. + * + * Returns: + * -1: ps1 < ps2 + * 0: ps1 and ps2 overlap + * 1: ps1 > ps2 + */ +static int +mirror_overlap_compare(const void *p1, const void *p2) +{ + const md_mps_t *ps1 = (md_mps_t *)p1; + const md_mps_t *ps2 = (md_mps_t *)p2; + + if (ps1->ps_firstblk < ps2->ps_firstblk) { + if (ps1->ps_lastblk >= ps2->ps_firstblk) + return (0); + return (-1); + } + + if (ps1->ps_firstblk > ps2->ps_firstblk) { + if (ps1->ps_firstblk <= ps2->ps_lastblk) + return (0); + return (1); + } + + return (0); +} + /* Return a -1 if optimized record unavailable and set should be released */ int mirror_build_incore(mm_unit_t *un, int snarfing) @@ -1873,8 +1818,9 @@ /* pre-4.1 didn't define CAN_META_CHILD capability */ MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP; - un->un_ovrlap_chn_flg = 0; - bzero(&un->un_ovrlap_chn, sizeof (un->un_ovrlap_chn)); + un->un_overlap_tree_flag = 0; + avl_create(&un->un_overlap_root, mirror_overlap_compare, + sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node)); for (i = 0; i < NMIRROR; i++) build_submirror(un, i, snarfing); @@ -1902,8 +1848,8 @@ return (1); } - mutex_init(&un->un_ovrlap_chn_mx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&un->un_ovrlap_chn_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL); un->un_suspend_wr_flag = 0; mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL); @@ -2001,11 +1947,13 @@ mirror_commit(un, bits, 0); + avl_destroy(&un->un_overlap_root); + /* Destroy all mutexes and condvars before returning. */ mutex_destroy(&un->un_suspend_wr_mx); cv_destroy(&un->un_suspend_wr_cv); - mutex_destroy(&un->un_ovrlap_chn_mx); - cv_destroy(&un->un_ovrlap_chn_cv); + mutex_destroy(&un->un_overlap_tree_mx); + cv_destroy(&un->un_overlap_tree_cv); mutex_destroy(&un->un_owner_mx); mutex_destroy(&un->un_rs_thread_mx); cv_destroy(&un->un_rs_thread_cv); @@ -2329,11 +2277,11 @@ ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev))); if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) && ui_sm->ui_tstate & MD_INACCESSIBLE) { - ui_sm->ui_tstate &= ~MD_INACCESSIBLE; + ui_sm->ui_tstate &= ~MD_INACCESSIBLE; } - shared = (md_m_shared_t *) - (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); + shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) + (sm->sm_dev, sm, ci); origstate = shared->ms_state; /* @@ -2345,9 +2293,8 @@ if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) && (newstate & (CS_ERRED|CS_LAST_ERRED))) { - get_dev = - (void (*)())md_get_named_service(sm->sm_dev, 0, - "get device", 0); + get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, + "get device", 0); (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); err = md_getdevname(setno, mddb_getsidenum(setno), 0, @@ -2355,7 +2302,7 @@ if (err == ENOENT) { (void) md_devname(setno, cd.cd_dev, devname, - sizeof (devname)); + sizeof (devname)); } cmn_err(CE_WARN, "md: %s: %s needs maintenance", @@ -2480,12 +2427,8 @@ } kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); - rval = mdmn_ksend_message(setno, - msgtype, - msgflags, - (char *)&stchmsg, - sizeof (stchmsg), - kresult); + rval = mdmn_ksend_message(setno, msgtype, msgflags, + (char *)&stchmsg, sizeof (stchmsg), kresult); if (!MDMN_KSEND_MSG_OK(rval, kresult)) { mdmn_ksend_show_error(rval, kresult, "STATE UPDATE"); @@ -2562,8 +2505,8 @@ mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */ - dev = select_read_unit(un, blk, mcnt, &cando, must_be_open, &s, - NULL); + dev = select_read_unit(un, blk, mcnt, &cando, + must_be_open, &s, NULL); if (dev == (md_dev64_t)0) break; @@ -2617,7 +2560,7 @@ * Make sure this component has other sources */ (void) (*(smic->sm_get_bcss)) - (dev, sm, ci, &block, &count, &skip, &size); + (dev, sm, ci, &block, &count, &skip, &size); if (count == 0) return (1); @@ -2743,7 +2686,7 @@ /* Never called from ioctl context, so (IOLOCK *)NULL */ set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags, - (IOLOCK *)NULL); + (IOLOCK *)NULL); /* * For a MN set, the NOTIFY is done when the state * change is processed on each node @@ -2756,7 +2699,7 @@ } /* Never called from ioctl context, so (IOLOCK *)NULL */ set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags, - (IOLOCK *)NULL); + (IOLOCK *)NULL); /* * For a MN set, the NOTIFY is done when the state * change is processed on each node @@ -2797,8 +2740,8 @@ /* if we're panicing just let this I/O error out */ if (panicstr) { - (void) mirror_done(cb); - return; + (void) mirror_done(cb); + return; } /* reissue the I/O */ @@ -2820,7 +2763,7 @@ clear_retry_error(cb); cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST", - md_shortname(getminor(cb->b_edev))); + md_shortname(getminor(cb->b_edev))); md_call_strategy(cb, flags, NULL); } @@ -2837,7 +2780,7 @@ } if (ps->ps_flags & MD_MPS_ON_OVERLAP) - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); smi = 0; ci = 0; @@ -2937,7 +2880,7 @@ * md_biodone(). */ (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW - | MD_STR_MAPPED, NULL); + | MD_STR_MAPPED, NULL); } static void @@ -2977,7 +2920,7 @@ */ if (md_mirror_wow_flg & WOW_NOCOPY) (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW | - MD_STR_MAPPED | MD_IO_COUNTED, ps); + MD_STR_MAPPED | MD_IO_COUNTED, ps); else md_mirror_copy_write(ps); } @@ -3144,7 +3087,7 @@ } if (ps->ps_flags & MD_MPS_ON_OVERLAP) - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); /* * Handle Write-on-Write problem. @@ -3191,16 +3134,15 @@ un = cs->cs_ps->ps_un; for (smi = 0; smi < NMIRROR; smi++) { - if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) - continue; - - if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) { - break; - } + if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) + continue; + + if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) + break; } if (smi >= NMIRROR) - return; + return; sm = &un->un_sm[smi]; smic = &un->un_smic[smi]; @@ -3213,25 +3155,25 @@ cb->b_blkno, &cnt); if (shared->ms_flags & MDM_S_IOERR) { - shared->ms_flags &= ~MDM_S_IOERR; + shared->ms_flags &= ~MDM_S_IOERR; } else { - /* the I/O buf spans components and the first one is not erred */ - int cnt; - int i; - - cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); - for (i = 0; i < cnt; i++) { - shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) - (sm->sm_dev, sm, i); - - if (shared->ms_flags & MDM_S_IOERR && - shared->ms_state == CS_OKAY) { - - shared->ms_flags &= ~MDM_S_IOERR; - break; + /* the buf spans components and the first one is not erred */ + int cnt; + int i; + + cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); + for (i = 0; i < cnt; i++) { + shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) + (sm->sm_dev, sm, i); + + if (shared->ms_flags & MDM_S_IOERR && + shared->ms_state == CS_OKAY) { + + shared->ms_flags &= ~MDM_S_IOERR; + break; + } } - } } md_unit_writerexit(ui_sm); @@ -3257,8 +3199,8 @@ bp->b_bcount = ldbtob(count); return (0); } - bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, count, &cando, - 0, NULL, cs)); + bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, + count, &cando, 0, NULL, cs)); bp->b_bcount = ldbtob(cando); if (count != cando) return (cando); @@ -3634,11 +3576,9 @@ kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); rval = mdmn_ksend_message(setno, - MD_MN_MSG_REQUIRE_OWNER, - msg_flags, /* flags */ - (char *)msg, - sizeof (md_mn_req_owner_t), - kres); + MD_MN_MSG_REQUIRE_OWNER, msg_flags, + /* flags */ (char *)msg, + sizeof (md_mn_req_owner_t), kres); kmem_free(msg, sizeof (md_mn_req_owner_t)); @@ -3668,11 +3608,10 @@ * Release the block on the current * resync region if it is blocked */ - ps1 = un->un_rs_prev_ovrlap; + ps1 = un->un_rs_prev_overlap; if ((ps1 != NULL) && (ps1->ps_flags & MD_MPS_ON_OVERLAP)) - mirror_overlap_chain_remove( - ps1); + mirror_overlap_tree_remove(ps1); mutex_exit(&un->un_owner_mx); /* @@ -3824,14 +3763,14 @@ /* * If not MN owner and this is an ABR write, make sure the current - * resync region is on the overlaps chain + * resync region is in the overlaps tree */ mutex_enter(&un->un_owner_mx); if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) && ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { md_mps_t *ps1; /* Block the current resync region, if not already blocked */ - ps1 = un->un_rs_prev_ovrlap; + ps1 = un->un_rs_prev_overlap; if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) || (ps1->ps_lastblk != 0))) { @@ -3845,11 +3784,11 @@ /* * Check to see if we have obtained ownership * while waiting for overlaps. If we have, remove - * the resync_region entry from the overlap chain + * the resync_region entry from the overlap tree */ if (MD_MN_MIRROR_OWNER(un) && (ps1->ps_flags & MD_MPS_ON_OVERLAP)) { - mirror_overlap_chain_remove(ps1); + mirror_overlap_tree_remove(ps1); rs_on_overlap = 0; } } @@ -3885,7 +3824,7 @@ MD_SID(un), ps->ps_firstblk); #endif if (ps->ps_flags & MD_MPS_ON_OVERLAP) - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); kmem_cache_free(mirror_parent_cache, ps); md_kstat_waitq_exit(ui); md_unit_readerexit(ui); @@ -3901,15 +3840,15 @@ un = md_unit_readerlock(ui); /* * For a MN set with an ABR write, if we are now the - * owner and we have a resync region on the overlap - * chain, remove the entry from overlaps and retry the write. + * owner and we have a resync region in the overlap + * tree, remove the entry from overlaps and retry the write. */ if (MD_MNSET_SETNO(setno) && ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { mutex_enter(&un->un_owner_mx); if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) { - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); md_kstat_waitq_exit(ui); mutex_exit(&un->un_owner_mx); md_unit_readerexit(ui); @@ -3936,7 +3875,7 @@ (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) { if (!MD_MN_MIRROR_OWNER(un)) { if (ps->ps_flags & MD_MPS_ON_OVERLAP) - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); md_kstat_waitq_exit(ui); ASSERT(!(flag & MD_STR_WAR)); md_unit_readerexit(ui); @@ -3986,10 +3925,10 @@ (pb->b_flags & B_PHYS) && !(ps->ps_flags & MD_MPS_WOW)) { if (ps->ps_flags & MD_MPS_ON_OVERLAP) - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); md_unit_readerexit(ui); daemon_request(&md_mstr_daemon, handle_wow, - (daemon_queue_t *)ps, REQ_OLD); + (daemon_queue_t *)ps, REQ_OLD); return; } @@ -4008,7 +3947,7 @@ */ if (more < 0) { if (ps->ps_flags & MD_MPS_ON_OVERLAP) - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); md_kstat_runq_exit(ui); kmem_cache_free(mirror_child_cache, cs); kmem_cache_free(mirror_parent_cache, ps); @@ -4205,7 +4144,7 @@ */ if (!MD_MN_MIRROR_OWNER(un)) { ps->ps_call = NULL; - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); md_kstat_waitq_exit(ui); md_unit_readerexit(ui); daemon_request( @@ -4231,7 +4170,7 @@ MD_SID(un), ps->ps_firstblk); #endif - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); kmem_cache_free(mirror_parent_cache, ps); md_kstat_waitq_exit(ui); @@ -4263,7 +4202,7 @@ current_blkno, mirror_done, cb, KM_NOSLEEP); more = mirror_map_read(ps, cs, current_blkno, - (u_longlong_t)current_count); + (u_longlong_t)current_count); if (more) { mutex_enter(&ps->ps_mx); ps->ps_frags++; @@ -4592,12 +4531,12 @@ if ((p->rs_type == un->un_rs_type) && (p->rs_start < un->un_resync_startbl)) break; - ps = un->un_rs_prev_ovrlap; + ps = un->un_rs_prev_overlap; /* Allocate previous overlap reference if needed */ if (ps == NULL) { ps = kmem_cache_alloc(mirror_parent_cache, - MD_ALLOCFLAGS); + MD_ALLOCFLAGS); ps->ps_un = un; ps->ps_ui = ui; ps->ps_firstblk = 0; @@ -4605,7 +4544,7 @@ ps->ps_flags = 0; md_ioctl_readerexit(lockp); (void) md_ioctl_writerlock(lockp, ui); - un->un_rs_prev_ovrlap = ps; + un->un_rs_prev_overlap = ps; md_ioctl_writerexit(lockp); } else md_ioctl_readerexit(lockp); @@ -4642,7 +4581,7 @@ p->rs_size - 1)) { /* Remove previous overlap range */ if (ps->ps_flags & MD_MPS_ON_OVERLAP) - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); ps->ps_firstblk = p->rs_start; ps->ps_lastblk = ps->ps_firstblk + @@ -4660,11 +4599,11 @@ * ownership while waiting for * overlaps. If we have, remove * the resync_region entry from the - * overlap chain + * overlap tree */ if (MD_MN_MIRROR_OWNER(un) && (ps->ps_flags & MD_MPS_ON_OVERLAP)) - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); } } mutex_exit(&un->un_owner_mx); @@ -4722,15 +4661,15 @@ mutex_exit(&un->un_owner_mx); } (void) md_ioctl_writerlock(lockp, ui); - ps = un->un_rs_prev_ovrlap; + ps = un->un_rs_prev_overlap; if (ps != NULL) { /* Remove previous overlap range */ if (ps->ps_flags & MD_MPS_ON_OVERLAP) - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); /* * Release the overlap range reference */ - un->un_rs_prev_ovrlap = NULL; + un->un_rs_prev_overlap = NULL; kmem_cache_free(mirror_parent_cache, ps); } @@ -5023,9 +4962,9 @@ (mm_unit32_od_t *)mddb_getrecaddr(recid); newreqsize = sizeof (mm_unit_t); big_un = (mm_unit_t *)kmem_zalloc(newreqsize, - KM_SLEEP); + KM_SLEEP); mirror_convert((caddr_t)small_un, - (caddr_t)big_un, SMALL_2_BIG); + (caddr_t)big_un, SMALL_2_BIG); kmem_free(small_un, dep->de_reqsize); /* @@ -5043,7 +4982,7 @@ * record address. */ un = (mm_unit_t *)mddb_getrecaddr_resize(recid, - sizeof (*un), 0); + sizeof (*un), 0); } un->c.un_revision &= ~MD_64BIT_META_DEV; break; @@ -5051,7 +4990,7 @@ case MDDB_REV_RB64FN: /* Big device */ un = (mm_unit_t *)mddb_getrecaddr_resize(recid, - sizeof (*un), 0); + sizeof (*un), 0); un->c.un_revision |= MD_64BIT_META_DEV; un->c.un_flag |= MD_EFILABEL; break; @@ -5212,7 +5151,7 @@ mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) { return (mirror_internal_close(getminor(dev), otyp, md_cflags, - (IOLOCK *)NULL)); + (IOLOCK *)NULL)); } @@ -5301,7 +5240,7 @@ sm_cnt++; tmpdev = un->un_sm[i].sm_dev; (void) md_layered_open(mnum, &tmpdev, - MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV); + MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV); un->un_sm[i].sm_dev = tmpdev; sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); @@ -5455,13 +5394,13 @@ optrec_id = &(un32->un_rr_dirty_recid); for (i = 0; i < un32->un_nsm; i++) { - tmpdev = md_expldev(un32->un_sm[i].sm_dev); - un32->un_sm[i].sm_dev = md_cmpldev - (md_makedevice(md_major, MD_MKMIN(setno, - MD_MIN2UNIT(md_getminor(tmpdev))))); - - if (!md_update_minor(setno, mddb_getsidenum - (setno), un32->un_sm[i].sm_key)) + tmpdev = md_expldev(un32->un_sm[i].sm_dev); + un32->un_sm[i].sm_dev = md_cmpldev + (md_makedevice(md_major, MD_MKMIN(setno, + MD_MIN2UNIT(md_getminor(tmpdev))))); + + if (!md_update_minor(setno, mddb_getsidenum + (setno), un32->un_sm[i].sm_key)) goto out; } break; @@ -5474,13 +5413,13 @@ optrec_id = &(un64->un_rr_dirty_recid); for (i = 0; i < un64->un_nsm; i++) { - tmpdev = un64->un_sm[i].sm_dev; - un64->un_sm[i].sm_dev = md_makedevice - (md_major, MD_MKMIN(setno, MD_MIN2UNIT - (md_getminor(tmpdev)))); - - if (!md_update_minor(setno, mddb_getsidenum - (setno), un64->un_sm[i].sm_key)) + tmpdev = un64->un_sm[i].sm_dev; + un64->un_sm[i].sm_dev = md_makedevice + (md_major, MD_MKMIN(setno, MD_MIN2UNIT + (md_getminor(tmpdev)))); + + if (!md_update_minor(setno, mddb_getsidenum + (setno), un64->un_sm[i].sm_key)) goto out; } break;
--- a/usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c Wed Jun 18 00:57:00 2008 -0700 +++ b/usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c Wed Jun 18 08:22:31 2008 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1668,10 +1668,10 @@ * Release the block on the current resync region if it * is blocked */ - ps1 = un->un_rs_prev_ovrlap; + ps1 = un->un_rs_prev_overlap; if ((ps1 != NULL) && (ps1->ps_flags & MD_MPS_ON_OVERLAP)) - mirror_overlap_chain_remove(ps1); + mirror_overlap_tree_remove(ps1); } un->un_owner_state &= ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER); @@ -1807,7 +1807,7 @@ * mirror is marked as "Needs Maintenance" and that an optimized * resync will be done when we resync the mirror, Also clear the * PREVENT_CHANGE flag and remove the last resync region from the - * overlap chain. + * overlap tree. */ if (p->d.owner == 0) { md_mps_t *ps; @@ -1839,9 +1839,9 @@ mutex_enter(&un->un_owner_mx); un->un_owner_state &= ~MD_MN_MM_PREVENT_CHANGE; mutex_exit(&un->un_owner_mx); - ps = un->un_rs_prev_ovrlap; + ps = un->un_rs_prev_overlap; if ((ps != NULL) && (ps->ps_flags & MD_MPS_ON_OVERLAP)) { - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); ps->ps_firstblk = 0; ps->ps_lastblk = 0; }
--- a/usr/src/uts/common/io/lvm/mirror/mirror_resync.c Wed Jun 18 00:57:00 2008 -0700 +++ b/usr/src/uts/common/io/lvm/mirror/mirror_resync.c Wed Jun 18 08:22:31 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,7 +20,7 @@ */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -190,7 +189,7 @@ if (un->c.un_status & MD_UN_KEEP_DIRTY) if (IS_KEEPDIRTY(i, un)) - continue; + continue; if (!IS_REGION_DIRTY(i, un)) continue; @@ -308,7 +307,7 @@ if (!mirror_timeout.dr_pending) { mirror_timeout.dr_pending = 1; daemon_request(&md_mstr_daemon, check_resync_regions, - (daemon_queue_t *)&mirror_timeout, REQ_OLD); + (daemon_queue_t *)&mirror_timeout, REQ_OLD); } if (mirror_md_ops.md_head != NULL) @@ -345,7 +344,7 @@ for (i = 0; i < NMIRROR; i++) { if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE)) { mirror_set_sm_state(&un->un_sm[i], - &un->un_smic[i], SMS_ATTACHED, 1); + &un->un_smic[i], SMS_ATTACHED, 1); changed++; } if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE_RESYNC)) { @@ -415,7 +414,7 @@ mirror_md_ops.md_driver.md_drivername); recid = mddb_createrec(size, typ1, RESYNC_REC, - MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno); + MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno); if (recid < 0) { if (snarfing && !(md_get_setstatus(setno) & MD_SET_STALE)) { md_set_setstatus(setno, MD_SET_STALE); @@ -497,18 +496,14 @@ return (err); } - un->un_goingclean_bm = - (uchar_t *)kmem_zalloc((uint_t)(howmany(un->un_rrd_num, NBBY)), - KM_SLEEP); - un->un_goingdirty_bm = - (uchar_t *)kmem_zalloc((uint_t)(howmany(un->un_rrd_num, NBBY)), - KM_SLEEP); - un->un_outstanding_writes = - (short *)kmem_zalloc((uint_t)un->un_rrd_num * sizeof (short), - KM_SLEEP); - un->un_resync_bm = - (uchar_t *)kmem_zalloc((uint_t)(howmany(un->un_rrd_num, NBBY)), - KM_SLEEP); + un->un_goingclean_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany( + un->un_rrd_num, NBBY)), KM_SLEEP); + un->un_goingdirty_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany( + un->un_rrd_num, NBBY)), KM_SLEEP); + un->un_outstanding_writes = (short *)kmem_zalloc( + (uint_t)un->un_rrd_num * sizeof (short), KM_SLEEP); + un->un_resync_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany( + un->un_rrd_num, NBBY)), KM_SLEEP); if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE) return (0); @@ -819,7 +814,7 @@ CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo); rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_NEXT, MD_MSGF_NO_LOG, - (char *)rmsg, sizeof (md_mn_msg_resync_t), kres); + (char *)rmsg, sizeof (md_mn_msg_resync_t), kres); CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx); mutex_exit(&un->un_rs_cpr_mx); @@ -830,12 +825,11 @@ } kmem_free(kres, sizeof (md_mn_kresult_t)); (void) md_unit_readerlock(ui); - ps = un->un_rs_prev_ovrlap; + ps = un->un_rs_prev_overlap; /* Allocate previous overlap reference if needed */ if (ps == NULL) { - ps = kmem_cache_alloc(mirror_parent_cache, - MD_ALLOCFLAGS); + ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); ps->ps_un = un; ps->ps_ui = ui; ps->ps_firstblk = 0; @@ -843,7 +837,7 @@ ps->ps_flags = 0; md_unit_readerexit(ui); (void) md_unit_writerlock(ui); - un->un_rs_prev_ovrlap = ps; + un->un_rs_prev_overlap = ps; md_unit_writerexit(ui); (void) md_unit_readerlock(ui); } @@ -910,10 +904,10 @@ */ if (un->un_rs_type != rs_type) return (0); - if (un->un_rs_prev_ovrlap->ps_firstblk > + if (un->un_rs_prev_overlap->ps_firstblk > rs_startblk) { currentblk = - un->un_rs_prev_ovrlap->ps_firstblk; + un->un_rs_prev_overlap->ps_firstblk; continue; } } @@ -940,10 +934,10 @@ */ if (un->un_rs_type != rs_type) return (0); - if (un->un_rs_prev_ovrlap->ps_firstblk > + if (un->un_rs_prev_overlap->ps_firstblk > rs_startblk) currentblk = - un->un_rs_prev_ovrlap->ps_firstblk; + un->un_rs_prev_overlap->ps_firstblk; } } } @@ -1623,7 +1617,7 @@ */ if (MD_MNSET_SETNO(setno)) { chunk = ((chunk + MD_DEF_RESYNC_BLK_SZ)/MD_DEF_RESYNC_BLK_SZ) - * MD_DEF_RESYNC_BLK_SZ; + * MD_DEF_RESYNC_BLK_SZ; if (chunk > un->c.un_total_blocks) chunk = un->c.un_total_blocks; } @@ -1762,8 +1756,7 @@ smic = &un->un_smic[i]; if (!SMS_IS(sm, SMS_RUNNING | SMS_LIMPING)) continue; - compcnt = (*(smic->sm_get_component_count)) - (sm->sm_dev, sm); + compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); for (ci = 0; ci < compcnt; ci++) { SET_RS_SMI(un->un_rs_type, i); SET_RS_CI(un->un_rs_type, ci); @@ -1809,8 +1802,7 @@ smic = &un->un_smic[i]; if (!SMS_IS(sm, SMS_INUSE)) continue; - compcnt = (*(smic->sm_get_component_count)) - (sm->sm_dev, sm); + compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); for (ci = 0; ci < compcnt; ci++) { shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) (sm->sm_dev, sm, ci); @@ -2113,8 +2105,7 @@ */ ASSERT(un->un_rs_resync_to_id == 0); un->un_rs_resync_to_id = timeout(resync_progress, un, - (clock_t)(drv_usectohz(60000000) * - md_mirror_resync_update_intvl)); + (clock_t)(drv_usectohz(60000000) * md_mirror_resync_update_intvl)); /* * Handle resync restart from the last logged position. The contents @@ -2343,15 +2334,15 @@ un->c.un_status &= ~(MD_UN_RESYNC_CANCEL | MD_UN_RESYNC_ACTIVE); - ps = un->un_rs_prev_ovrlap; + ps = un->un_rs_prev_overlap; if (ps != NULL) { /* Remove previous overlap resync region */ if (ps->ps_flags & MD_MPS_ON_OVERLAP) - mirror_overlap_chain_remove(ps); + mirror_overlap_tree_remove(ps); /* * Release the overlap range reference */ - un->un_rs_prev_ovrlap = NULL; + un->un_rs_prev_overlap = NULL; kmem_cache_free(mirror_parent_cache, ps); } @@ -2822,7 +2813,7 @@ typ1 = (mddb_type_t)md_getshared_key(setno, mirror_md_ops.md_driver.md_drivername); recid = mddb_createrec(size, typ1, RESYNC_REC, - MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno); + MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno); if (recid < 0) return (-1); @@ -2913,7 +2904,7 @@ mirror_md_ops.md_driver.md_drivername); recid = mddb_createrec(size, typ1, RESYNC_REC, - MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno); + MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno); if (recid < 0) return (-1);
--- a/usr/src/uts/common/sys/lvm/md_mirror.h Wed Jun 18 00:57:00 2008 -0700 +++ b/usr/src/uts/common/sys/lvm/md_mirror.h Wed Jun 18 08:22:31 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -68,9 +67,9 @@ * macro to test if the current block is within the current resync region */ #define IN_RESYNC_REGION(un, ps) \ - ((un->un_rs_prev_ovrlap != NULL) && (ps->ps_firstblk >= \ - un->un_rs_prev_ovrlap->ps_firstblk) && \ - (ps->ps_lastblk <= un->un_rs_prev_ovrlap->ps_lastblk)) + ((un->un_rs_prev_overlap != NULL) && (ps->ps_firstblk >= \ + un->un_rs_prev_overlap->ps_firstblk) && \ + (ps->ps_lastblk <= un->un_rs_prev_overlap->ps_lastblk)) /* * Default resync update interval (in minutes). */ @@ -108,8 +107,8 @@ /* * Define for argument in function wait_for_overlaps() */ -#define MD_OVERLAP_ALLOW_REPEAT 0x1 /* Allow if ps already on chain */ -#define MD_OVERLAP_NO_REPEAT 0 /* ps must not already be on chain */ +#define MD_OVERLAP_ALLOW_REPEAT 0x1 /* Allow if ps already in tree */ +#define MD_OVERLAP_NO_REPEAT 0 /* ps must not already be in tree */ /* * Define for max retries of mirror_owner @@ -153,10 +152,10 @@ uint_t un_changecnt; ushort_t un_nsm; /* number of submirrors */ mm_submirror32_od_t un_sm[NMIRROR]; - int un_ovrlap_chn_flg; - int xx_un_ovrlap_chn_mx[2]; /* replaces mutex */ - ushort_t xx_un_ovrlap_chn_cv; - caddr32_t xx_un_ovrlap_chn; + int un_overlap_tree_flag; + int xx_un_overlap_tree_mx[2]; /* replaces mutex */ + ushort_t xx_un_overlap_tree_cv; + caddr32_t xx_un_overlap_root; mm_rd_opt_t un_read_option; /* mirror read option */ mm_wr_opt_t un_write_option; /* mirror write option */ mm_pass_num_t un_pass_num; /* resync pass number */ @@ -270,10 +269,11 @@ uint_t ps_active_cnt; int ps_frags; uint_t ps_changecnt; - struct md_mps *ps_ovrlap_next; - struct md_mps *ps_ovrlap_prev; + struct md_mps *ps_unused1; + struct md_mps *ps_unused2; void (*ps_call)(); kmutex_t ps_mx; + avl_node_t ps_overlap_node; } md_mps_t; #define MD_MPS_ON_OVERLAP 0x0001 @@ -309,9 +309,9 @@ } md_mcs_t; typedef struct mm_mirror_ic { - kmutex_t un_ovrlap_chn_mx; - kcondvar_t un_ovrlap_chn_cv; - md_mps_t un_ovrlap_chn; /* Sentinel for overlaps */ + kmutex_t un_overlap_tree_mx; + kcondvar_t un_overlap_tree_cv; + avl_tree_t un_overlap_root; kmutex_t un_resync_mx; kcondvar_t un_resync_cv; short *un_outstanding_writes; /* outstanding write array */ @@ -348,7 +348,7 @@ uint_t un_changecnt; ushort_t un_nsm; /* number of submirrors */ mm_submirror_t un_sm[NMIRROR]; - int un_ovrlap_chn_flg; + int un_overlap_tree_flag; mm_rd_opt_t un_read_option; /* mirror read option */ mm_wr_opt_t un_write_option; /* mirror write option */ mm_pass_num_t un_pass_num; /* resync pass number */ @@ -383,7 +383,7 @@ kmutex_t un_rs_thread_mx; /* Thread cv mutex */ kcondvar_t un_rs_thread_cv; /* Cond. Var. for thread */ uint_t un_rs_thread_flags; /* Thread control flags */ - md_mps_t *un_rs_prev_ovrlap; /* existing overlap request */ + md_mps_t *un_rs_prev_overlap; /* existing overlap request */ timeout_id_t un_rs_resync_to_id; /* resync progress timeout */ kmutex_t un_rs_progress_mx; /* Resync progress mutex */ kcondvar_t un_rs_progress_cv; /* Cond. Var. for progress */ @@ -391,9 +391,9 @@ void *un_rs_msg; /* Intra-node resync message */ } mm_unit_t; -#define un_ovrlap_chn_mx un_mmic.un_ovrlap_chn_mx -#define un_ovrlap_chn_cv un_mmic.un_ovrlap_chn_cv -#define un_ovrlap_chn un_mmic.un_ovrlap_chn +#define un_overlap_tree_mx un_mmic.un_overlap_tree_mx +#define un_overlap_tree_cv un_mmic.un_overlap_tree_cv +#define un_overlap_root un_mmic.un_overlap_root #define un_resync_mx un_mmic.un_resync_mx #define un_resync_cv un_mmic.un_resync_cv #define un_outstanding_writes un_mmic.un_outstanding_writes @@ -554,7 +554,7 @@ extern void mirror_check_failfast(minor_t mnum); extern int check_comp_4_hotspares(mm_unit_t *, int, int, uint_t, mddb_recid_t, IOLOCK *); -extern void mirror_overlap_chain_remove(md_mps_t *ps); +extern void mirror_overlap_tree_remove(md_mps_t *ps); extern void mirror_child_init(md_mcs_t *cs); /* Externals from mirror_ioctl.c */