Mercurial > illumos > illumos-gate
annotate usr/src/uts/common/io/lvm/mirror/mirror.c @ 14183:68927c785889 default tip
4099 SMF methods without absolute paths no longer work
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Approved by: Dan McDonald <danmcd@nexenta.com>
author | Jerry Jelinek <jerry.jelinek@joyent.com> |
---|---|
date | Fri, 06 Sep 2013 09:20:56 -0700 |
parents | 6db1b9319cfc |
children |
rev | line source |
---|---|
0 | 1 /* |
2 * CDDL HEADER START | |
3 * | |
4 * The contents of this file are subject to the terms of the | |
1366
18ae7db30fe7
6376469 Drivers are declaring _depends_on incorrectly leading to venus optimizing it out
petede
parents:
46
diff
changeset
|
5 * Common Development and Distribution License (the "License"). |
18ae7db30fe7
6376469 Drivers are declaring _depends_on incorrectly leading to venus optimizing it out
petede
parents:
46
diff
changeset
|
6 * You may not use this file except in compliance with the License. |
0 | 7 * |
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 * or http://www.opensolaris.org/os/licensing. | |
10 * See the License for the specific language governing permissions | |
11 * and limitations under the License. | |
12 * | |
13 * When distributing Covered Code, include this CDDL HEADER in each | |
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 * If applicable, add the following below this CDDL HEADER, with the | |
16 * fields enclosed by brackets "[]" replaced with your own identifying | |
17 * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 * | |
19 * CDDL HEADER END | |
20 */ | |
7627
8599a7568728
6743774 TSlvm tests cause kernel deadlocks on md_unit_array_rw and md_devinfo->devi_cv
Chris Horne <Chris.Horne@Sun.COM>
parents:
6901
diff
changeset
|
21 |
0 | 22 /* |
12629
8a89ca2bbe3a
6914620 SVM can panic system when attempting to access a damaged one-sided mirror
Ray Hassan <Ray.Hassan@oracle.COM>
parents:
11130
diff
changeset
|
23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. |
13452
6bec9720e054
1073 migrate kernel modules from ancient _depends_on to true ELF dependencies
Bayard Bell <buffer.g.overflow@gmail.com>
parents:
12629
diff
changeset
|
24 * Copyright (c) 2011 Bayard G. Bell. All rights reserved. |
0 | 25 */ |
26 | |
27 #include <sys/param.h> | |
28 #include <sys/systm.h> | |
29 #include <sys/conf.h> | |
30 #include <sys/file.h> | |
31 #include <sys/user.h> | |
32 #include <sys/uio.h> | |
33 #include <sys/t_lock.h> | |
34 #include <sys/buf.h> | |
35 #include <sys/dkio.h> | |
36 #include <sys/vtoc.h> | |
37 #include <sys/kmem.h> | |
38 #include <vm/page.h> | |
39 #include <sys/cmn_err.h> | |
40 #include <sys/sysmacros.h> | |
41 #include <sys/types.h> | |
42 #include <sys/mkdev.h> | |
43 #include <sys/stat.h> | |
44 #include <sys/open.h> | |
45 #include <sys/modctl.h> | |
46 #include <sys/ddi.h> | |
47 #include <sys/sunddi.h> | |
48 #include <sys/debug.h> | |
49 #include <sys/dklabel.h> | |
50 #include <vm/hat.h> | |
1623
7bac4a816ebe
PSARC/2005/153 Bunnahabhain: Descriptive Name Support in SVM
tw21770
parents:
1366
diff
changeset
|
51 #include <sys/lvm/mdvar.h> |
0 | 52 #include <sys/lvm/md_mirror.h> |
53 #include <sys/lvm/md_convert.h> | |
54 #include <sys/lvm/md_mddb.h> | |
55 #include <sys/esunddi.h> | |
56 | |
57 #include <sys/sysevent/eventdefs.h> | |
58 #include <sys/sysevent/svm.h> | |
59 #include <sys/lvm/mdmn_commd.h> | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
60 #include <sys/avl.h> |
0 | 61 |
md_ops_t		mirror_md_ops;
#ifndef	lint
md_ops_t		*md_interface_ops = &mirror_md_ops;
#endif

/* Daemon work queues serviced elsewhere in md; this module only enqueues. */
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_mstr_daemon;
extern mdq_anchor_t	md_mirror_daemon;
extern mdq_anchor_t	md_mirror_io_daemon;
extern mdq_anchor_t	md_mirror_rs_daemon;
extern mdq_anchor_t	md_mhs_daemon;

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern clock_t		md_hz;

extern md_krwlock_t	md_unit_array_rw;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern int		md_mtioctl_cnt;

daemon_request_t	mirror_timeout;
static daemon_request_t	hotspare_request;
static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */

int	md_mirror_mcs_buf_off;

/* Flags for mdmn_ksend_message to allow debugging */
int	md_mirror_msg_flags;

#ifdef DEBUG
/* Flag to switch on debug messages */
int	mirror_debug_flag = 0;
#endif

/*
 * Struct used to hold count of DMR reads and the timestamp of last DMR read
 * It is used to verify, using a debugger, that the DMR read ioctl has been
 * executed.
 */
dmr_stats_t	mirror_dmr_stats = {0, 0};

/*
 * Mutex protecting list of non-failfast drivers.
 */
static kmutex_t	non_ff_drv_mutex;
extern char	**non_ff_drivers;

extern major_t	md_major;

/*
 * Write-On-Write memory pool.
 */
static void		copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t	*mirror_wowblk_cache = NULL;
static int		md_wowbuf_size = 16384;
static size_t		md_wowblk_size;

/*
 * This is a flag that allows:
 *	- disabling the write-on-write mechanism.
 *	- logging occurrences of write-on-write
 *	- switching wow handling procedure processing
 * Counter for occurrences of WOW.
 */
static uint_t	md_mirror_wow_flg = 0;
static int	md_mirror_wow_cnt = 0;

/*
 * Tunable to enable/disable dirty region
 * processing when closing down a mirror.
 */
static int	new_resync = 1;
kmem_cache_t	*mirror_parent_cache = NULL;
kmem_cache_t	*mirror_child_cache = NULL;

extern int	md_ff_disable;		/* disable failfast */

/* Forward declarations of this module's private strategy/completion paths. */
static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void	mirror_read_strategy(buf_t *, int, void *);
static void	mirror_write_strategy(buf_t *, int, void *);
static void	become_owner(daemon_queue_t *);
static int	mirror_done(struct buf *cb);
static int	mirror_done_common(struct buf *cb);
static void	clear_retry_error(struct buf *cb);

/*
 * patchables
 */
int	md_min_rr_size = 200;	/* 2000 blocks, or 100k */
int	md_def_num_rr = 1000;	/* Default number of dirty regions */

/*
 * patchable to change delay before rescheduling mirror ownership request.
 * Value is clock ticks, default 0.5 seconds
 */
clock_t	md_mirror_owner_to = 500000;
163 /*ARGSUSED1*/ | |
164 static int | |
165 mirror_parent_constructor(void *p, void *d1, int d2) | |
166 { | |
167 mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL); | |
168 return (0); | |
169 } | |
170 | |
171 static void | |
172 mirror_parent_init(md_mps_t *ps) | |
173 { | |
174 bzero(ps, offsetof(md_mps_t, ps_mx)); | |
8452
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
175 bzero(&ps->ps_overlap_node, sizeof (avl_node_t)); |
0 | 176 } |
177 | |
178 /*ARGSUSED1*/ | |
179 static void | |
180 mirror_parent_destructor(void *p, void *d) | |
181 { | |
182 mutex_destroy(&((md_mps_t *)p)->ps_mx); | |
183 } | |
184 | |
185 /*ARGSUSED1*/ | |
186 static int | |
187 mirror_child_constructor(void *p, void *d1, int d2) | |
188 { | |
189 bioinit(&((md_mcs_t *)p)->cs_buf); | |
190 return (0); | |
191 } | |
192 | |
193 void | |
194 mirror_child_init(md_mcs_t *cs) | |
195 { | |
196 cs->cs_ps = NULL; | |
197 cs->cs_mdunit = 0; | |
198 md_bioreset(&cs->cs_buf); | |
199 } | |
200 | |
201 /*ARGSUSED1*/ | |
202 static void | |
203 mirror_child_destructor(void *p, void *d) | |
204 { | |
205 biofini(&((md_mcs_t *)p)->cs_buf); | |
206 } | |
207 | |
208 static void | |
209 mirror_wowblk_init(wowhdr_t *p) | |
210 { | |
211 bzero(p, md_wowblk_size); | |
212 } | |
213 | |
/*
 * send_poke_hotspares_msg
 *
 * Daemon-context worker that sends a MD_MN_MSG_POKE_HOTSPARES message for
 * the set whose number was stashed in drq->dq.qlen by send_poke_hotspares().
 * If the send fails because commd is unreachable (MDMNE_RPC_FAIL) we wait
 * for commd to reappear and retry exactly once; any other failure, or a
 * failed retry, panics since the multi-node state is then inconsistent.
 * On the way out the request slot is marked idle so another poke for this
 * set can be queued.
 */
static void
send_poke_hotspares_msg(daemon_request_t *drq)
{
	int			rval;
	int			nretries = 0;
	md_mn_msg_pokehsp_t	pokehsp;
	md_mn_kresult_t		*kresult;
	set_t			setno = (set_t)drq->dq.qlen;

	pokehsp.pokehsp_setno = setno;

	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

retry_sphmsg:
	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
	    sizeof (pokehsp), kresult);

	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
		/* If we're shutting down already, pause things here. */
		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
			/* Block until commd is reachable again. */
			while (!md_mn_is_commd_present()) {
				delay(md_hz);
			}
			/*
			 * commd has become reachable again, so retry once.
			 * If this fails we'll panic as the system is in an
			 * unexpected state.
			 */
			if (nretries++ == 0)
				goto retry_sphmsg;
		}
		cmn_err(CE_PANIC,
		    "ksend_message failure: POKE_HOTSPARES");
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));

	/* Allow further requests to use this set's queue structure */
	mutex_enter(&drq->dr_mx);
	drq->dr_pending = 0;
	mutex_exit(&drq->dr_mx);
}
257 | |
258 /* | |
259 * Send a poke_hotspares message to the master node. To avoid swamping the | |
260 * commd handler with requests we only send a message if there is not one | |
261 * already outstanding. We punt the request to a separate thread context as | |
262 * cannot afford to block waiting on the request to be serviced. This is | |
263 * essential when a reconfig cycle is in progress as any open() of a multinode | |
264 * metadevice may result in a livelock. | |
265 */ | |
266 static void | |
267 send_poke_hotspares(set_t setno) | |
268 { | |
269 daemon_request_t *drq = &mn_hs_request[setno]; | |
270 | |
271 mutex_enter(&drq->dr_mx); | |
272 if (drq->dr_pending == 0) { | |
273 drq->dr_pending = 1; | |
274 drq->dq.qlen = (int)setno; | |
275 daemon_request(&md_mhs_daemon, | |
276 send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD); | |
277 } | |
278 mutex_exit(&drq->dr_mx); | |
279 } | |
280 | |
281 void | |
282 mirror_set_sm_state( | |
283 mm_submirror_t *sm, | |
284 mm_submirror_ic_t *smic, | |
285 sm_state_t newstate, | |
286 int force) | |
287 { | |
288 int compcnt; | |
289 int i; | |
290 int errcnt; | |
291 sm_state_t origstate; | |
292 md_m_shared_t *shared; | |
293 | |
294 if (force) { | |
295 sm->sm_state = newstate; | |
296 uniqtime32(&sm->sm_timestamp); | |
297 return; | |
298 } | |
299 | |
300 origstate = newstate; | |
301 | |
302 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); | |
303 for (i = 0, errcnt = 0; i < compcnt; i++) { | |
304 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) | |
305 (sm->sm_dev, sm, i); | |
306 if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED)) | |
307 newstate |= SMS_COMP_ERRED; | |
308 if (shared->ms_state & (CS_RESYNC)) | |
309 newstate |= SMS_COMP_RESYNC; | |
310 if (shared->ms_state & CS_ERRED) | |
311 errcnt++; | |
312 } | |
313 | |
314 if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0) | |
315 newstate &= ~origstate; | |
316 | |
317 if (errcnt == compcnt) | |
318 newstate |= SMS_ALL_ERRED; | |
319 else | |
320 newstate &= ~SMS_ALL_ERRED; | |
321 | |
322 sm->sm_state = newstate; | |
323 uniqtime32(&sm->sm_timestamp); | |
324 } | |
325 | |
/*
 * mirror_geterror
 *
 * Scan the mirror's submirrors/components starting at (*smi, *cip) looking
 * for the first component whose state warrants attention (an I/O error or a
 * closed component still showing OKAY/RESYNC, or a closed component in
 * LAST_ERRED).  On a hit, *smi/*cip are updated to the found position and 1
 * is returned; otherwise 0.  When clr_error is set, MDM_S_IOERR is cleared
 * on every component visited (including the one found).  frm_probe selects
 * which "open" flag to test, since probe uses a pseudo open.
 */
static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
	int frm_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			i;
	int			compcnt;
	int			open_comp;	/* flag for open component */

	for (i = *smi; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = *cip; ci < compcnt; ci++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			/*
			 * if called from any routine but probe, we check for
			 * MDM_S_ISOPEN flag. Since probe does a pseudo open,
			 * it sets MDM_S_PROBEOPEN flag and we test for this
			 * flag. They are both exclusive tests.
			 */
			open_comp = (frm_probe) ?
			    (shared->ms_flags & MDM_S_PROBEOPEN):
			    (shared->ms_flags & MDM_S_ISOPEN);
			if (((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
			    ((shared->ms_state == CS_OKAY) ||
			    (shared->ms_state == CS_RESYNC))) ||
			    (!open_comp &&
			    (shared->ms_state == CS_LAST_ERRED))) {
				if (clr_error) {
					shared->ms_flags &= ~MDM_S_IOERR;
				}
				/* Report the position of the hit back. */
				*cip = ci;
				*smi = i;
				return (1);
			}

			/* Not a hit, but still clear IOERR if requested. */
			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
				shared->ms_flags &= ~MDM_S_IOERR;
			}
		}

		/* Restart component scan at 0 for the next submirror. */
		*cip = 0;
	}
	return (0);
}
380 | |
381 /*ARGSUSED*/ | |
382 static void | |
383 mirror_run_queue(void *d) | |
384 { | |
385 if (!(md_status & MD_GBL_DAEMONS_LIVE)) | |
386 md_daemon(1, &md_done_daemon); | |
387 } | |
/*
 * check_comp_4_hotspares
 *
 * This function attempts to allocate a hotspare for this component if the
 * component is in error. In a MN set, the function can be called in 2 modes.
 * It can be called either when a component error has been detected or when a
 * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
 * in flags and the request is sent to all nodes.
 * The handler on each of the nodes then calls this function with
 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
 *
 * For non-MN sets the function simply attempts to allocate a hotspare.
 *
 * On entry, the following locks are held
 *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
 *	md_unit_writerlock
 *
 * Returns	0 if ok
 *		1 if the unit containing the component has been cleared while
 *		  the mdmn_ksend_message() was being executed
 */
extern int
check_comp_4_hotspares(
	mm_unit_t	*un,
	int		smi,
	int		ci,
	uint_t		flags,
	mddb_recid_t	hs_id,		/* Only used by MN disksets */
	IOLOCK		*lockp		/* can be NULL */
)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	mddb_recid_t		recids[6];
	minor_t			mnum;
	intptr_t		(*hs_dev)();
	void			(*hs_done)();
	void			*hs_data;
	md_error_t		mde = mdnullerror;
	set_t			setno;
	md_mn_msg_allochsp_t	allochspmsg;
	md_mn_kresult_t		*kresult;
	mm_unit_t		*new_un;
	int			rval;
	int			nretries = 0;

	mnum = MD_SID(un);
	setno = MD_UN2SET(un);
	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];
	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
	    (sm->sm_dev, sm, ci);

	/* Only an erred component needs a hotspare. */
	if (shared->ms_state != CS_ERRED)
		return (0);

	/* Don't start a new component resync if a resync is already running. */
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (0);

	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
		uint_t		msgflags;
		md_mn_msgtype_t	msgtype;

		/* Send allocate hotspare message to all nodes */

		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
		allochspmsg.msg_allochsp_sm = smi;
		allochspmsg.msg_allochsp_comp = ci;
		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;

		/*
		 * Before calling mdmn_ksend_message(), release locks
		 * Can never be in the context of an ioctl.
		 */
		md_unit_writerexit(MDI_UNIT(mnum));
		if (flags & MD_HOTSPARE_LINKHELD)
			rw_exit(&mirror_md_ops.md_link_rw.lock);
#ifdef DEBUG
		if (mirror_debug_flag)
			printf("send alloc hotspare, flags="
			    "0x%x %x, %x, %x, %x\n", flags,
			    allochspmsg.msg_allochsp_mnum,
			    allochspmsg.msg_allochsp_sm,
			    allochspmsg.msg_allochsp_comp,
			    allochspmsg.msg_allochsp_hs_id);
#endif
		if (flags & MD_HOTSPARE_WMUPDATE) {
			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
			/*
			 * When coming from an update of watermarks, there
			 * must already be a message logged that triggered
			 * this action. So, no need to log this message, too.
			 */
			msgflags = MD_MSGF_NO_LOG;
		} else {
			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
			msgflags = MD_MSGF_DEFAULT_FLAGS;
		}

		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

cc4hs_msg:
		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
		    (char *)&allochspmsg, sizeof (allochspmsg),
		    kresult);

		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
#ifdef DEBUG
			if (mirror_debug_flag)
				mdmn_ksend_show_error(rval, kresult,
				    "ALLOCATE HOTSPARE");
#endif
			/*
			 * If message is sent ok but exitval indicates an error
			 * it must be because the mirror has been cleared. In
			 * this case re-obtain lock and return an error
			 */
			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
				if (flags & MD_HOTSPARE_LINKHELD) {
					rw_enter(&mirror_md_ops.md_link_rw.lock,
					    RW_READER);
				}
				kmem_free(kresult, sizeof (md_mn_kresult_t));
				return (1);
			}
			/* If we're shutting down already, pause things here. */
			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
				while (!md_mn_is_commd_present()) {
					delay(md_hz);
				}
				/*
				 * commd has become reachable again, so retry
				 * once. If this fails we'll panic as the
				 * system is in an unexpected state.
				 */
				if (nretries++ == 0)
					goto cc4hs_msg;
			}
			cmn_err(CE_PANIC,
			    "ksend_message failure: ALLOCATE_HOTSPARE");
		}
		kmem_free(kresult, sizeof (md_mn_kresult_t));

		/*
		 * re-obtain the locks
		 */
		if (flags & MD_HOTSPARE_LINKHELD)
			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
		new_un = md_unit_writerlock(MDI_UNIT(mnum));

		/*
		 * As we had to release the locks in order to send the
		 * message to all nodes, we need to check to see if the
		 * unit has changed. If it has we release the writerlock
		 * and return fail.
		 */
		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
			md_unit_writerexit(MDI_UNIT(mnum));
			return (1);
		}
	} else {
		if (MD_MNSET_SETNO(setno)) {
			/*
			 * If 2 or more nodes simultaneously see a
			 * component failure, these nodes will each
			 * send an ALLOCATE_HOTSPARE[2] message.
			 * The first message will allocate the hotspare
			 * and the subsequent messages should do nothing.
			 *
			 * If a slave node doesn't have a hotspare allocated
			 * at the time the message is initiated, then the
			 * passed in hs_id will be 0.  If the node
			 * executing this routine has a component shared
			 * ms_hs_id of non-zero, but the message shows a
			 * hs_id of 0, then just return since a hotspare
			 * has already been allocated for this failing
			 * component.  When the slave node returns from
			 * the ksend_message the hotspare will have
			 * already been allocated.
			 *
			 * If the slave node does send an hs_id of non-zero,
			 * and the slave node's hs_id matches this node's
			 * ms_hs_id, then the hotspare has error'd and
			 * should be replaced.
			 *
			 * If the slave node sends an hs_id of non-zero and
			 * this node has a different shared ms_hs_id, then
			 * just return since this hotspare has already
			 * been hotspared.
			 */
			if (shared->ms_hs_id != 0) {
				if (hs_id == 0) {
#ifdef DEBUG
					if (mirror_debug_flag) {
						printf("check_comp_4_hotspares"
						    "(NOXMIT), short circuit "
						    "hs_id=0x%x, "
						    "ms_hs_id=0x%x\n",
						    hs_id, shared->ms_hs_id);
					}
#endif
					return (0);
				}
				if (hs_id != shared->ms_hs_id) {
#ifdef DEBUG
					if (mirror_debug_flag) {
						printf("check_comp_4_hotspares"
						    "(NOXMIT), short circuit2 "
						    "hs_id=0x%x, "
						    "ms_hs_id=0x%x\n",
						    hs_id, shared->ms_hs_id);
					}
#endif
					return (0);
				}
			}
		}

		sm = &un->un_sm[smi];
		/* Look up this submirror's hotspare-allocation service. */
		hs_dev = md_get_named_service(sm->sm_dev, 0,
		    "hotspare device", 0);
		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
		    &hs_data) != 0)
			return (0);

		/*
		 * set_sm_comp_state() commits the modified records.
		 * As we don't transmit the changes, no need to drop the lock.
		 */
		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);

		(*hs_done)(sm->sm_dev, hs_data);

		mirror_check_failfast(mnum);

		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));

		/*
		 * For a multi-node set we need to reset the un_rs_type,
		 * un_rs_resync_done and un_rs_resync_2_do fields as the
		 * hot-spare resync must copy all applicable data.
		 */
		if (MD_MNSET_SETNO(setno)) {
			un->un_rs_type = MD_RS_NONE;
			un->un_rs_resync_done = 0;
			un->un_rs_resync_2_do = 0;
		}

		/*
		 * Must drop writer lock since mirror_resync_unit will
		 * open devices and must be able to grab readerlock.
		 * Don't need to drop IOLOCK since any descendent routines
		 * calling ksend_messages will drop the IOLOCK as needed.
		 *
		 */
		if (lockp) {
			md_ioctl_writerexit(lockp);
		} else {
			md_unit_writerexit(MDI_UNIT(mnum));
		}

		/* start resync */
		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);

		/* Re-acquire the writer lock the caller expects to hold. */
		if (lockp) {
			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
		} else {
			new_un = md_unit_writerlock(MDI_UNIT(mnum));
		}
	}
	return (0);
}
664 | |
665 /* | |
666 * check_unit_4_hotspares | |
667 * | |
668 * For a given mirror, allocate hotspares, if available for any components | |
669 * that are in error | |
670 * | |
671 * Returns 0 if ok | |
672 * 1 if check_comp_4_hotspares returns non-zero. This will only | |
673 * happen for a MN unit where the unit has been cleared while | |
674 * the allocate hotspare message is sent to all nodes. | |
675 */ | |
static int
check_unit_4_hotspares(mm_unit_t *un, int flags)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	int			ci;
	int			i;
	int			compcnt;

	/* Hotspare allocation is deferred while a resync is in progress. */
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (0);

	/* Walk every in-use submirror, then every component within it. */
	for (i = 0; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];
		if (!SMS_IS(sm, SMS_INUSE))
			continue;
		/* Component count comes from the submirror's service vector */
		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
		for (ci = 0; ci < compcnt; ci++) {
			md_m_shared_t *shared;

			/* Per-component shared state (flags, state, hs id) */
			shared = (md_m_shared_t *)
			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
			/*
			 * Never called from ioctl context, so pass in
			 * (IOLOCK *)NULL. Pass through flags from calling
			 * routine, also setting XMIT flag.
			 */
			if (check_comp_4_hotspares(un, i, ci,
			    (MD_HOTSPARE_XMIT | flags),
			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
				return (1);
		}
	}
	return (0);
}
712 | |
/*
 * check_4_hotspares
 *
 * Daemon-queue callback: scan every mirror unit on the driver's link list
 * and attempt hotspare allocation for components in error.  Runs with all
 * disksets held (MNHOLD) so no set can be released mid-scan.
 */
static void
check_4_hotspares(daemon_request_t *drq)
{
	mdi_unit_t	*ui;
	mm_unit_t	*un;
	md_link_t	*next;
	int		x;

	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
	drq->dr_pending = 0;		/* again in low level routine if */
	mutex_exit(&drq->dr_mx);	/* something found to do */

	/*
	 * Used to have a problem here. The disksets weren't marked as being
	 * MNHOLD. This opened a window where we could be searching for
	 * hotspares and have the disk set unloaded (released) from under
	 * us causing a panic in stripe_component_count().
	 * The way to prevent that is to mark the set MNHOLD which prevents
	 * any diskset from being released while we are scanning the mirrors,
	 * submirrors and components.
	 */

	for (x = 0; x < md_nsets; x++)
		md_holdset_enter(x);

	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
		ui = MDI_UNIT(next->ln_id);

		/* Cheap screening pass under the reader lock first. */
		un = (mm_unit_t *)md_unit_readerlock(ui);

		/*
		 * Only check the unit if we are the master for this set
		 * For an MN set, poke_hotspares() is only effective on the
		 * master
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
			md_unit_readerexit(ui);
			continue;
		}
		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		/* Upgrade to the writer lock for the actual allocation. */
		un = (mm_unit_t *)md_unit_writerlock(ui);
		/*
		 * check_unit_4_hotspares will exit 1 if the unit has been
		 * removed during the process of allocating the hotspare.
		 * This can only happen for a MN metadevice. If unit no longer
		 * exists, no need to release writerlock
		 */
		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
			md_unit_writerexit(ui);
		else {
			/*
			 * If check_unit_4_hotspares failed, queue another
			 * request and break out of this one
			 */
			(void) poke_hotspares();
			break;
		}
	}
	rw_exit(&mirror_md_ops.md_link_rw.lock);

	for (x = 0; x < md_nsets; x++)
		md_holdset_exit(x);
}
783 | |
 784 /* | |
 785 * poke_hotspares | |
 786 * | |
 787 * If a poke_hotspares request is not already pending, queue a request | |
 788 * to call check_4_hotspares(). This will scan all mirrors and attempt to | |
 789 * allocate hotspares for all components in error. | |
 790 */ | |
791 int | |
792 poke_hotspares() | |
793 { | |
794 mutex_enter(&hotspare_request.dr_mx); | |
795 if (hotspare_request.dr_pending == 0) { | |
796 hotspare_request.dr_pending = 1; | |
797 daemon_request(&md_mhs_daemon, | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
798 check_4_hotspares, (daemon_queue_t *)&hotspare_request, |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
799 REQ_OLD); |
0 | 800 } |
801 mutex_exit(&hotspare_request.dr_mx); | |
802 return (0); | |
803 } | |
804 | |
805 static void | |
806 free_all_ecomps(err_comp_t *ecomp) | |
807 { | |
808 err_comp_t *d; | |
809 | |
810 while (ecomp != NULL) { | |
811 d = ecomp; | |
812 ecomp = ecomp->ec_next; | |
813 kmem_free(d, sizeof (err_comp_t)); | |
814 } | |
815 } | |
816 | |
817 /* | |
818 * NAME: mirror_openfail_console_info | |
819 * | |
820 * DESCRIPTION: Prints a informative message to the console when mirror | |
821 * cannot be opened. | |
822 * | |
823 * PARAMETERS: mm_unit_t un - pointer to mirror unit structure | |
824 * int smi - submirror index | |
825 * int ci - component index | |
826 */ | |
827 | |
828 void | |
829 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci) | |
830 { | |
831 void (*get_dev)(); | |
832 ms_cd_info_t cd; | |
833 md_dev64_t tmpdev; | |
834 | |
835 tmpdev = un->un_sm[smi].sm_dev; | |
836 get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0); | |
837 if (get_dev != NULL) { | |
838 (void) (*get_dev)(tmpdev, smi, ci, &cd); | |
839 cmn_err(CE_WARN, "md %s: open error on %s", | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
840 md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un), |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
841 cd.cd_dev, NULL, 0)); |
0 | 842 } else { |
843 cmn_err(CE_WARN, "md %s: open error", | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
844 md_shortname(MD_SID(un))); |
0 | 845 } |
846 } | |
847 | |
848 static int | |
849 mirror_close_all_devs(mm_unit_t *un, int md_cflags) | |
850 { | |
851 int i; | |
852 md_dev64_t dev; | |
853 | |
854 for (i = 0; i < NMIRROR; i++) { | |
855 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) | |
856 continue; | |
857 dev = un->un_sm[i].sm_dev; | |
858 md_layered_close(dev, md_cflags); | |
859 } | |
860 return (0); | |
861 } | |
862 | |
863 /* | |
864 * Keep track of drivers that don't support failfast. We use this so that | |
865 * we only log one diagnostic message for each of these drivers, no matter | |
866 * how many times we run the mirror_check_failfast function. | |
867 * Return 1 if this is a new driver that does not support failfast, | |
868 * return 0 if we have already seen this non-failfast driver. | |
869 */ | |
static int
new_non_ff_driver(const char *s)
{
	mutex_enter(&non_ff_drv_mutex);
	if (non_ff_drivers == NULL) {
		/* First entry: build a 2-slot array (name + NULL sentinel). */
		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
		    KM_NOSLEEP);
		if (non_ff_drivers == NULL) {
			/*
			 * Allocation failed; treat the driver as new so the
			 * caller still logs its one-time diagnostic.
			 */
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
		    KM_NOSLEEP);
		if (non_ff_drivers[0] == NULL) {
			/* Roll back the array allocation on failure. */
			kmem_free(non_ff_drivers, 2 * sizeof (char *));
			non_ff_drivers = NULL;
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		(void) strcpy(non_ff_drivers[0], s);
		non_ff_drivers[1] = NULL;

	} else {
		int i;
		char **tnames;
		char **tmp;

		/* Already recorded?  Then it is not a new driver. */
		for (i = 0; non_ff_drivers[i] != NULL; i++) {
			if (strcmp(s, non_ff_drivers[i]) == 0) {
				mutex_exit(&non_ff_drv_mutex);
				return (0);
			}
		}

		/*
		 * i is now the count of existing names.  Grow the array by
		 * copying into a fresh allocation, then swap it in.
		 */
		/* allow for new element and null */
		i += 2;
		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
		if (tnames == NULL) {
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		/* Copy existing pointers; i ends at the old name count. */
		for (i = 0; non_ff_drivers[i] != NULL; i++)
			tnames[i] = non_ff_drivers[i];

		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
		if (tnames[i] == NULL) {
			/* adjust i so that it is the right count to free */
			kmem_free(tnames, (i + 2) * sizeof (char *));
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		(void) strcpy(tnames[i++], s);
		tnames[i] = NULL;

		tmp = non_ff_drivers;
		non_ff_drivers = tnames;
		/* i now represents the count we previously alloced */
		kmem_free(tmp, i * sizeof (char *));
	}
	mutex_exit(&non_ff_drv_mutex);

	return (1);
}
937 | |
938 /* | |
939 * Check for the "ddi-failfast-supported" devtree property on each submirror | |
940 * component to indicate if we should do I/O to that submirror with the | |
941 * B_FAILFAST flag set or not. This check is made at various state transitions | |
942 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we | |
943 * only need to check one drive (e.g. hotspare) but since the check is | |
944 * fast and infrequent and sometimes needs to be done on all components we | |
945 * just check all components on each call. | |
946 */ | |
947 void | |
948 mirror_check_failfast(minor_t mnum) | |
949 { | |
950 int i; | |
951 mm_unit_t *un; | |
952 | |
953 if (md_ff_disable) | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
954 return; |
0 | 955 |
956 un = MD_UNIT(mnum); | |
957 | |
958 for (i = 0; i < NMIRROR; i++) { | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
959 int ci; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
960 int cnt; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
961 int ff = 1; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
962 mm_submirror_t *sm; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
963 mm_submirror_ic_t *smic; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
964 void (*get_dev)(); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
965 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
966 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
967 continue; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
968 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
969 sm = &un->un_sm[i]; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
970 smic = &un->un_smic[i]; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
971 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
972 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
973 "get device", 0); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
974 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
975 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
976 for (ci = 0; ci < cnt; ci++) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
977 int found = 0; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
978 dev_t ci_dev; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
979 major_t major; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
980 dev_info_t *devi; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
981 ms_cd_info_t cd; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
982 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
983 /* |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
984 * this already returns the hs |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
985 * dev if the device is spared |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
986 */ |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
987 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
988 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
989 ci_dev = md_dev64_to_dev(cd.cd_dev); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
990 major = getmajor(ci_dev); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
991 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
992 if (major == md_major) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
993 /* |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
994 * this component must be a soft |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
995 * partition; get the real dev |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
996 */ |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
997 minor_t dev_mnum; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
998 mdi_unit_t *ui; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
999 mp_unit_t *un; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1000 set_t setno; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1001 side_t side; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1002 md_dev64_t tmpdev; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1003 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1004 ui = MDI_UNIT(getminor(ci_dev)); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1005 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1006 /* grab necessary lock */ |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1007 un = (mp_unit_t *)md_unit_readerlock(ui); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1008 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1009 dev_mnum = MD_SID(un); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1010 setno = MD_MIN2SET(dev_mnum); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1011 side = mddb_getsidenum(setno); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1012 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1013 tmpdev = un->un_dev; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1014 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1015 /* Get dev by device id */ |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1016 if (md_devid_found(setno, side, |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1017 un->un_key) == 1) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1018 tmpdev = md_resolve_bydevid(dev_mnum, |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1019 tmpdev, un->un_key); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1020 } |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1021 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1022 md_unit_readerexit(ui); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1023 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1024 ci_dev = md_dev64_to_dev(tmpdev); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1025 major = getmajor(ci_dev); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1026 } |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1027 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1028 if (ci_dev != NODEV32 && |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1029 (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1030 != NULL) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1031 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1032 int propvalue = 0; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1033 int proplength = sizeof (int); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1034 int error; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1035 struct cb_ops *cb; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1036 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1037 if ((cb = devopsp[major]->devo_cb_ops) != |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1038 NULL) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1039 error = (*cb->cb_prop_op) |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1040 (DDI_DEV_T_ANY, devi, prop_op, |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1041 DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1042 "ddi-failfast-supported", |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1043 (caddr_t)&propvalue, &proplength); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1044 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1045 if (error == DDI_PROP_SUCCESS) |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1046 found = 1; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1047 } |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1048 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1049 if (!found && new_non_ff_driver( |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1050 ddi_driver_name(devi))) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1051 cmn_err(CE_NOTE, "!md: B_FAILFAST I/O" |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1052 "disabled on %s", |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1053 ddi_driver_name(devi)); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1054 } |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1055 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1056 ddi_release_devi(devi); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1057 } |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1058 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1059 /* |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1060 * All components must support |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1061 * failfast in the submirror. |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1062 */ |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1063 if (!found) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1064 ff = 0; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1065 break; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1066 } |
0 | 1067 } |
1068 | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1069 if (ff) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1070 sm->sm_flags |= MD_SM_FAILFAST; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1071 } else { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1072 sm->sm_flags &= ~MD_SM_FAILFAST; |
0 | 1073 } |
1074 } | |
1075 } | |
1076 | |
1077 /* | |
1078 * Return true if the submirror is unavailable. | |
1079 * If any of the submirror components are opened then the submirror cannot | |
1080 * be unavailable (MD_INACCESSIBLE). | |
1081 * If any of the components are already in the errored state, then the submirror | |
1082 * cannot be unavailable (MD_INACCESSIBLE). | |
1083 */ | |
1084 static bool_t | |
1085 submirror_unavailable(mm_unit_t *un, int smi, int from_probe) | |
1086 { | |
1087 mm_submirror_t *sm; | |
1088 mm_submirror_ic_t *smic; | |
1089 md_m_shared_t *shared; | |
1090 int ci; | |
1091 int compcnt; | |
1092 | |
1093 sm = &un->un_sm[smi]; | |
1094 smic = &un->un_smic[smi]; | |
1095 | |
1096 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); | |
1097 for (ci = 0; ci < compcnt; ci++) { | |
1098 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) | |
1099 (sm->sm_dev, sm, ci); | |
1100 if (from_probe) { | |
1101 if (shared->ms_flags & MDM_S_PROBEOPEN) | |
1102 return (B_FALSE); | |
1103 } else { | |
1104 if (shared->ms_flags & MDM_S_ISOPEN) | |
1105 return (B_FALSE); | |
1106 } | |
1107 if (shared->ms_state == CS_ERRED || | |
1108 shared->ms_state == CS_LAST_ERRED) | |
1109 return (B_FALSE); | |
1110 } | |
1111 | |
1112 return (B_TRUE); | |
1113 } | |
1114 | |
1115 static int | |
1116 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp) | |
1117 { | |
1118 int i; | |
1119 mm_unit_t *un; | |
1120 mdi_unit_t *ui; | |
1121 int err; | |
1122 int smi; | |
1123 int ci; | |
1124 err_comp_t *c; | |
1125 err_comp_t *ecomps = NULL; | |
1126 int smmask = 0; | |
1127 set_t setno; | |
1128 int sm_cnt; | |
1129 int sm_unavail_cnt; | |
1130 | |
1131 mirror_check_failfast(mnum); | |
1132 | |
1133 un = MD_UNIT(mnum); | |
1134 ui = MDI_UNIT(mnum); | |
1135 setno = MD_UN2SET(un); | |
1136 | |
1137 for (i = 0; i < NMIRROR; i++) { | |
1138 md_dev64_t tmpdev = un->un_sm[i].sm_dev; | |
1139 | |
1140 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) | |
1141 continue; | |
1142 if (md_layered_open(mnum, &tmpdev, md_oflags)) | |
1143 smmask |= SMI2BIT(i); | |
1144 un->un_sm[i].sm_dev = tmpdev; | |
1145 } | |
1146 | |
1147 /* | |
1148 * If smmask is clear, all submirrors are accessible. Clear the | |
1149 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the | |
1150 * mirror device. If smmask is set, we have to determine which of the | |
1151 * submirrors are in error. If no submirror is accessible we mark the | |
1152 * whole mirror as MD_INACCESSIBLE. | |
1153 */ | |
1154 if (smmask == 0) { | |
1155 if (lockp) { | |
1156 md_ioctl_readerexit(lockp); | |
1157 (void) md_ioctl_writerlock(lockp, ui); | |
1158 } else { | |
1159 md_unit_readerexit(ui); | |
1160 (void) md_unit_writerlock(ui); | |
1161 } | |
1162 ui->ui_tstate &= ~MD_INACCESSIBLE; | |
1163 if (lockp) { | |
1164 md_ioctl_writerexit(lockp); | |
1165 (void) md_ioctl_readerlock(lockp, ui); | |
1166 } else { | |
1167 md_unit_writerexit(ui); | |
1168 (void) md_unit_readerlock(ui); | |
1169 } | |
1170 | |
1171 for (i = 0; i < NMIRROR; i++) { | |
1172 md_dev64_t tmpdev; | |
1173 mdi_unit_t *sm_ui; | |
1174 | |
1175 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) | |
1176 continue; | |
1177 | |
1178 tmpdev = un->un_sm[i].sm_dev; | |
1179 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); | |
1180 (void) md_unit_writerlock(sm_ui); | |
1181 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; | |
1182 md_unit_writerexit(sm_ui); | |
1183 } | |
1184 | |
1185 return (0); | |
1186 } | |
1187 | |
1188 for (i = 0; i < NMIRROR; i++) { | |
1189 md_dev64_t tmpdev; | |
1190 | |
1191 if (!(smmask & SMI2BIT(i))) | |
1192 continue; | |
1193 | |
1194 tmpdev = un->un_sm[i].sm_dev; | |
1195 err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS); | |
1196 un->un_sm[i].sm_dev = tmpdev; | |
1197 ASSERT(err == 0); | |
1198 } | |
1199 | |
1200 if (lockp) { | |
1201 md_ioctl_readerexit(lockp); | |
1202 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui); | |
1203 } else { | |
1204 md_unit_readerexit(ui); | |
1205 un = (mm_unit_t *)md_unit_writerlock(ui); | |
1206 } | |
1207 | |
1208 /* | |
1209 * We want to make sure the unavailable flag is not masking a real | |
1210 * error on the submirror. | |
1211 * For each submirror, | |
1212 * if all of the submirror components couldn't be opened and there | |
1213 * are no errors on the submirror, then set the unavailable flag | |
1214 * otherwise, clear unavailable. | |
1215 */ | |
1216 sm_cnt = 0; | |
1217 sm_unavail_cnt = 0; | |
1218 for (i = 0; i < NMIRROR; i++) { | |
1219 md_dev64_t tmpdev; | |
1220 mdi_unit_t *sm_ui; | |
1221 | |
1222 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) | |
1223 continue; | |
1224 | |
1225 sm_cnt++; | |
1226 tmpdev = un->un_sm[i].sm_dev; | |
1227 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); | |
1228 | |
1229 (void) md_unit_writerlock(sm_ui); | |
1230 if (submirror_unavailable(un, i, 0)) { | |
1231 sm_ui->ui_tstate |= MD_INACCESSIBLE; | |
1232 sm_unavail_cnt++; | |
1233 } else { | |
1234 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; | |
1235 } | |
1236 md_unit_writerexit(sm_ui); | |
1237 } | |
1238 | |
1239 /* | |
1240 * If all of the submirrors are unavailable, the mirror is also | |
1241 * unavailable. | |
1242 */ | |
1243 if (sm_cnt == sm_unavail_cnt) { | |
1244 ui->ui_tstate |= MD_INACCESSIBLE; | |
1245 } else { | |
1246 ui->ui_tstate &= ~MD_INACCESSIBLE; | |
1247 } | |
1248 | |
1249 smi = 0; | |
1250 ci = 0; | |
1251 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { | |
1252 if (mirror_other_sources(un, smi, ci, 1) == 1) { | |
1253 | |
1254 free_all_ecomps(ecomps); | |
1255 (void) mirror_close_all_devs(un, md_oflags); | |
1256 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, | |
1257 SVM_TAG_METADEVICE, setno, MD_SID(un)); | |
1258 mirror_openfail_console_info(un, smi, ci); | |
1259 if (lockp) { | |
1260 md_ioctl_writerexit(lockp); | |
1261 (void) md_ioctl_readerlock(lockp, ui); | |
1262 } else { | |
1263 md_unit_writerexit(ui); | |
1264 (void) md_unit_readerlock(ui); | |
1265 } | |
1266 return (ENXIO); | |
1267 } | |
1268 | |
1269 /* track all component states that need changing */ | |
1270 c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP); | |
1271 c->ec_next = ecomps; | |
1272 c->ec_smi = smi; | |
1273 c->ec_ci = ci; | |
1274 ecomps = c; | |
1275 ci++; | |
1276 } | |
1277 | |
1278 /* Make all state changes and commit them */ | |
1279 for (c = ecomps; c != NULL; c = c->ec_next) { | |
1280 /* | |
1281 * If lockp is set, then entering kernel through ioctl. | |
1282 * For a MN set, the only ioctl path is via a commd message | |
1283 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already | |
1284 * being sent to each node. | |
1285 * In this case, set NO_XMIT so that set_sm_comp_state | |
1286 * won't attempt to send a message on a message. | |
1287 * | |
1288 * In !MN sets, the xmit flag is ignored, so it doesn't matter | |
1289 * which flag is passed. | |
1290 */ | |
1291 if (lockp) { | |
1292 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, | |
1293 MD_STATE_NO_XMIT, lockp); | |
1294 } else { | |
1295 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, | |
1296 (MD_STATE_XMIT | MD_STATE_OCHELD), lockp); | |
1297 } | |
1298 /* | |
1299 * For a MN set, the NOTIFY is done when the state change is | |
1300 * processed on each node | |
1301 */ | |
1302 if (!MD_MNSET_SETNO(setno)) { | |
1303 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, | |
1304 SVM_TAG_METADEVICE, setno, MD_SID(un)); | |
1305 } | |
1306 } | |
1307 | |
1308 if (lockp) { | |
1309 md_ioctl_writerexit(lockp); | |
1310 (void) md_ioctl_readerlock(lockp, ui); | |
1311 } else { | |
1312 md_unit_writerexit(ui); | |
1313 (void) md_unit_readerlock(ui); | |
1314 } | |
1315 | |
1316 free_all_ecomps(ecomps); | |
1317 | |
1318 /* allocate hotspares for all errored components */ | |
1319 if (MD_MNSET_SETNO(setno)) { | |
1320 /* | |
1321 * If we're called from an ioctl (lockp set) then we cannot | |
1322 * directly call send_poke_hotspares as this will block until | |
1323 * the message gets despatched to all nodes. If the cluster is | |
1324 * going through a reconfig cycle then the message will block | |
1325 * until the cycle is complete, and as we originate from a | |
1326 * service call from commd we will livelock. | |
1327 */ | |
1328 if (lockp == NULL) { | |
1329 md_unit_readerexit(ui); | |
1330 send_poke_hotspares(setno); | |
1331 (void) md_unit_readerlock(ui); | |
1332 } | |
1333 } else { | |
1334 (void) poke_hotspares(); | |
1335 } | |
1336 return (0); | |
1337 } | |
1338 | |
1339 void | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1340 mirror_overlap_tree_remove(md_mps_t *ps) |
0 | 1341 { |
1342 mm_unit_t *un; | |
1343 | |
1344 if (panicstr) | |
1345 return; | |
1346 | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1347 VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP); |
0 | 1348 un = ps->ps_un; |
1349 | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1350 mutex_enter(&un->un_overlap_tree_mx); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1351 avl_remove(&un->un_overlap_root, ps); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1352 ps->ps_flags &= ~MD_MPS_ON_OVERLAP; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1353 if (un->un_overlap_tree_flag != 0) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1354 un->un_overlap_tree_flag = 0; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1355 cv_broadcast(&un->un_overlap_tree_cv); |
0 | 1356 } |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1357 mutex_exit(&un->un_overlap_tree_mx); |
0 | 1358 } |
1359 | |
1360 | |
1361 /* | |
1362 * wait_for_overlaps: | |
1363 * ----------------- | |
1364 * Check that given i/o request does not cause an overlap with already pending | |
1365 * i/o. If it does, block until the overlapped i/o completes. | |
1366 * | |
1367 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1368 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1369 * it must not already be in the tree. |
0 | 1370 */ |
1371 static void | |
1372 wait_for_overlaps(md_mps_t *ps, int flags) | |
1373 { | |
1374 mm_unit_t *un; | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1375 avl_index_t where; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1376 md_mps_t *ps1; |
0 | 1377 |
1378 if (panicstr) | |
1379 return; | |
1380 | |
1381 un = ps->ps_un; | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1382 mutex_enter(&un->un_overlap_tree_mx); |
0 | 1383 if ((flags & MD_OVERLAP_ALLOW_REPEAT) && |
1384 (ps->ps_flags & MD_MPS_ON_OVERLAP)) { | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1385 mutex_exit(&un->un_overlap_tree_mx); |
0 | 1386 return; |
1387 } | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1388 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1389 VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1390 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1391 do { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1392 ps1 = avl_find(&un->un_overlap_root, ps, &where); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1393 if (ps1 == NULL) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1394 /* |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1395 * The candidate range does not overlap with any |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1396 * range in the tree. Insert it and be done. |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1397 */ |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1398 avl_insert(&un->un_overlap_root, ps, where); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1399 ps->ps_flags |= MD_MPS_ON_OVERLAP; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1400 } else { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1401 /* |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1402 * The candidate range would overlap. Set the flag |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1403 * indicating we need to be woken up, and sleep |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1404 * until another thread removes a range. If upon |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1405 * waking up we find this mps was put on the tree |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1406 * by another thread, the loop terminates. |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1407 */ |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1408 un->un_overlap_tree_flag = 1; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1409 cv_wait(&un->un_overlap_tree_cv, |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1410 &un->un_overlap_tree_mx); |
0 | 1411 } |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1412 } while (!(ps->ps_flags & MD_MPS_ON_OVERLAP)); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1413 mutex_exit(&un->un_overlap_tree_mx); |
0 | 1414 } |
1415 | |
/*
 * Called from mirror_done to check whether any pages were modified
 * while a mirrored write was in progress.  Returns 0 if all pages
 * associated with bp are clean, 1 otherwise.
 */
static int
any_pages_dirty(struct buf *bp)
{
	int modified = biomodified(bp);

	/* biomodified() returns -1 when it cannot say; treat as clean */
	return (modified == -1 ? 0 : modified);
}
1432 | |
1433 #define MAX_EXTRAS 10 | |
1434 | |
1435 void | |
1436 mirror_commit( | |
1437 mm_unit_t *un, | |
1438 int smmask, | |
1439 mddb_recid_t *extras | |
1440 ) | |
1441 { | |
1442 mm_submirror_t *sm; | |
1443 md_unit_t *su; | |
1444 int i; | |
1445 | |
1446 /* 2=mirror,null id */ | |
1447 mddb_recid_t recids[NMIRROR+2+MAX_EXTRAS]; | |
1448 | |
1449 int ri = 0; | |
1450 | |
1451 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE) | |
1452 return; | |
1453 | |
1454 /* Add two, this includes the mirror unit and the null recid */ | |
1455 if (extras != NULL) { | |
1456 int nrecids = 0; | |
1457 while (extras[nrecids] != 0) { | |
1458 nrecids++; | |
1459 } | |
1460 ASSERT(nrecids <= MAX_EXTRAS); | |
1461 } | |
1462 | |
1463 if (un != NULL) | |
1464 recids[ri++] = un->c.un_record_id; | |
1465 for (i = 0; i < NMIRROR; i++) { | |
1466 if (!(smmask & SMI2BIT(i))) | |
1467 continue; | |
1468 sm = &un->un_sm[i]; | |
1469 if (!SMS_IS(sm, SMS_INUSE)) | |
1470 continue; | |
1471 if (md_getmajor(sm->sm_dev) != md_major) | |
1472 continue; | |
1473 su = MD_UNIT(md_getminor(sm->sm_dev)); | |
1474 recids[ri++] = su->c.un_record_id; | |
1475 } | |
1476 | |
1477 if (extras != NULL) | |
1478 while (*extras != 0) { | |
1479 recids[ri++] = *extras; | |
1480 extras++; | |
1481 } | |
1482 | |
1483 if (ri == 0) | |
1484 return; | |
1485 recids[ri] = 0; | |
1486 | |
1487 /* | |
1488 * Ok to hold ioctl lock across record commit to mddb as | |
1489 * long as the record(s) being committed aren't resync records. | |
1490 */ | |
1491 mddb_commitrecs_wrapper(recids); | |
1492 } | |
1493 | |
1494 | |
1495 /* | |
1496 * This routine is used to set a bit in the writable_bm bitmap | |
1497 * which represents each submirror in a metamirror which | |
1498 * is writable. The first writable submirror index is assigned | |
1499 * to the sm_index. The number of writable submirrors are returned in nunits. | |
1500 * | |
1501 * This routine returns the submirror's unit number. | |
1502 */ | |
1503 | |
1504 static void | |
1505 select_write_units(struct mm_unit *un, md_mps_t *ps) | |
1506 { | |
1507 | |
1508 int i; | |
1509 unsigned writable_bm = 0; | |
1510 unsigned nunits = 0; | |
1511 | |
1512 for (i = 0; i < NMIRROR; i++) { | |
1513 if (SUBMIRROR_IS_WRITEABLE(un, i)) { | |
1514 /* set bit of all writable units */ | |
1515 writable_bm |= SMI2BIT(i); | |
1516 nunits++; | |
1517 } | |
1518 } | |
1519 ps->ps_writable_sm = writable_bm; | |
1520 ps->ps_active_cnt = nunits; | |
1521 ps->ps_current_sm = 0; | |
1522 } | |
1523 | |
1524 static | |
1525 unsigned | |
1526 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps) | |
1527 { | |
1528 | |
1529 int i; | |
1530 unsigned writable_bm = 0; | |
1531 unsigned nunits = 0; | |
1532 | |
1533 for (i = 0; i < NMIRROR; i++) { | |
1534 if (SUBMIRROR_IS_WRITEABLE(un, i) && | |
1535 un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) { | |
1536 writable_bm |= SMI2BIT(i); | |
1537 nunits++; | |
1538 } | |
1539 } | |
1540 if ((writable_bm & ps->ps_allfrom_sm) != 0) { | |
1541 writable_bm &= ~ps->ps_allfrom_sm; | |
1542 nunits--; | |
1543 } | |
1544 ps->ps_writable_sm = writable_bm; | |
1545 ps->ps_active_cnt = nunits; | |
1546 ps->ps_current_sm = 0; | |
1547 return (nunits); | |
1548 } | |
1549 | |
1550 static md_dev64_t | |
1551 select_read_unit( | |
1552 mm_unit_t *un, | |
1553 diskaddr_t blkno, | |
1554 u_longlong_t reqcount, | |
1555 u_longlong_t *cando, | |
1556 int must_be_opened, | |
1557 md_m_shared_t **shared, | |
1558 md_mcs_t *cs) | |
1559 { | |
1560 int i; | |
1561 md_m_shared_t *s; | |
1562 uint_t lasterrcnt = 0; | |
1563 md_dev64_t dev = 0; | |
1564 u_longlong_t cnt; | |
1565 u_longlong_t mincnt; | |
1566 mm_submirror_t *sm; | |
1567 mm_submirror_ic_t *smic; | |
1568 mdi_unit_t *ui; | |
1569 | |
1570 mincnt = reqcount; | |
1571 for (i = 0; i < NMIRROR; i++) { | |
1572 if (!SUBMIRROR_IS_READABLE(un, i)) | |
1573 continue; | |
1574 sm = &un->un_sm[i]; | |
1575 smic = &un->un_smic[i]; | |
1576 cnt = reqcount; | |
1577 | |
1578 /* | |
1579 * If the current submirror is marked as inaccessible, do not | |
1580 * try to access it. | |
1581 */ | |
1582 ui = MDI_UNIT(getminor(expldev(sm->sm_dev))); | |
1583 (void) md_unit_readerlock(ui); | |
1584 if (ui->ui_tstate & MD_INACCESSIBLE) { | |
1585 md_unit_readerexit(ui); | |
1586 continue; | |
1587 } | |
1588 md_unit_readerexit(ui); | |
1589 | |
1590 s = (md_m_shared_t *)(*(smic->sm_shared_by_blk)) | |
1591 (sm->sm_dev, sm, blkno, &cnt); | |
1592 | |
1593 if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN)) | |
1594 continue; | |
1595 if (s->ms_state == CS_OKAY) { | |
1596 *cando = cnt; | |
1597 if (shared != NULL) | |
1598 *shared = s; | |
1599 | |
1600 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST && | |
1601 cs != NULL) { | |
1602 cs->cs_buf.b_flags |= B_FAILFAST; | |
1603 } | |
1604 | |
1605 return (un->un_sm[i].sm_dev); | |
1606 } | |
1607 if (s->ms_state != CS_LAST_ERRED) | |
1608 continue; | |
1609 | |
1610 /* don't use B_FAILFAST since we're Last Erred */ | |
1611 | |
1612 if (mincnt > cnt) | |
1613 mincnt = cnt; | |
1614 if (s->ms_lasterrcnt > lasterrcnt) { | |
1615 lasterrcnt = s->ms_lasterrcnt; | |
1616 if (shared != NULL) | |
1617 *shared = s; | |
1618 dev = un->un_sm[i].sm_dev; | |
1619 } | |
1620 } | |
1621 *cando = mincnt; | |
1622 return (dev); | |
1623 } | |
1624 | |
1625 /* | |
1626 * Given a 32-bit bitmap, this routine will return the bit number | |
1627 * of the nth bit set. The nth bit set is passed via the index integer. | |
1628 * | |
1629 * This routine is used to run through the writable submirror bitmap | |
1630 * and starting all of the writes. See the value returned is the | |
1631 * index to appropriate submirror structure, in the md_sm | |
1632 * array for metamirrors. | |
1633 */ | |
1634 static int | |
1635 md_find_nth_unit(uint_t mask, int index) | |
1636 { | |
1637 int bit, nfound; | |
1638 | |
1639 for (bit = -1, nfound = -1; nfound != index; bit++) { | |
1640 ASSERT(mask != 0); | |
1641 nfound += (mask & 1); | |
1642 mask >>= 1; | |
1643 } | |
1644 return (bit); | |
1645 } | |
1646 | |
1647 static int | |
1648 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs) | |
1649 { | |
1650 mm_unit_t *un; | |
1651 buf_t *bp; | |
1652 int i; | |
1653 unsigned nunits = 0; | |
1654 int iunit; | |
1655 uint_t running_bm = 0; | |
1656 uint_t sm_index; | |
1657 | |
1658 bp = &cs->cs_buf; | |
1659 un = ps->ps_un; | |
1660 | |
1661 for (i = 0; i < NMIRROR; i++) { | |
1662 if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING)) | |
1663 continue; | |
1664 running_bm |= SMI2BIT(i); | |
1665 nunits++; | |
1666 } | |
1667 if (nunits == 0) | |
1668 return (1); | |
1669 | |
1670 /* | |
1671 * For directed mirror read (DMR) we only use the specified side and | |
1672 * do not compute the source of the read. | |
8452
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
1673 * If we're running with MD_MPS_DIRTY_RD set we always return the |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
1674 * first mirror side (this prevents unnecessary ownership switching). |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
1675 * Otherwise we return the submirror according to the mirror read option |
0 | 1676 */ |
1677 if (ps->ps_flags & MD_MPS_DMR) { | |
1678 sm_index = un->un_dmr_last_read; | |
8452
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
1679 } else if (ps->ps_flags & MD_MPS_DIRTY_RD) { |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
1680 sm_index = md_find_nth_unit(running_bm, 0); |
0 | 1681 } else { |
1682 /* Normal (non-DMR) operation */ | |
1683 switch (un->un_read_option) { | |
1684 case RD_GEOMETRY: | |
1685 iunit = (int)(bp->b_lblkno / | |
1686 howmany(un->c.un_total_blocks, nunits)); | |
1687 sm_index = md_find_nth_unit(running_bm, iunit); | |
1688 break; | |
1689 case RD_FIRST: | |
1690 sm_index = md_find_nth_unit(running_bm, 0); | |
1691 break; | |
1692 case RD_LOAD_BAL: | |
1693 /* this is intentional to fall into the default */ | |
1694 default: | |
1695 un->un_last_read = (un->un_last_read + 1) % nunits; | |
1696 sm_index = md_find_nth_unit(running_bm, | |
1697 un->un_last_read); | |
1698 break; | |
1699 } | |
1700 } | |
1701 bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev); | |
1702 ps->ps_allfrom_sm = SMI2BIT(sm_index); | |
1703 | |
1704 if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) { | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1705 bp->b_flags |= B_FAILFAST; |
0 | 1706 } |
1707 | |
1708 return (0); | |
1709 } | |
1710 | |
1711 static | |
1712 int | |
1713 mirror_are_submirrors_available(mm_unit_t *un) | |
1714 { | |
1715 int i; | |
1716 for (i = 0; i < NMIRROR; i++) { | |
1717 md_dev64_t tmpdev = un->un_sm[i].sm_dev; | |
1718 | |
1719 if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) || | |
1720 md_getmajor(tmpdev) != md_major) | |
1721 continue; | |
1722 | |
1723 if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) || | |
1724 (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits)) | |
1725 return (0); | |
1726 | |
1727 if (MDI_UNIT(md_getminor(tmpdev)) == NULL) | |
1728 return (0); | |
1729 } | |
1730 return (1); | |
1731 } | |
1732 | |
1733 void | |
1734 build_submirror(mm_unit_t *un, int i, int snarfing) | |
1735 { | |
1736 struct mm_submirror *sm; | |
1737 struct mm_submirror_ic *smic; | |
1738 md_unit_t *su; | |
1739 set_t setno; | |
1740 | |
1741 sm = &un->un_sm[i]; | |
1742 smic = &un->un_smic[i]; | |
1743 | |
1744 sm->sm_flags = 0; /* sometime we may need to do more here */ | |
1745 | |
1746 setno = MD_UN2SET(un); | |
1747 | |
1748 if (!SMS_IS(sm, SMS_INUSE)) | |
1749 return; | |
1750 if (snarfing) { | |
1751 sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno), | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1752 sm->sm_key, MD_NOTRUST_DEVT); |
0 | 1753 } else { |
1754 if (md_getmajor(sm->sm_dev) == md_major) { | |
1755 su = MD_UNIT(md_getminor(sm->sm_dev)); | |
1756 un->c.un_flag |= (su->c.un_flag & MD_LABELED); | |
1757 /* submirror can no longer be soft partitioned */ | |
1758 MD_CAPAB(su) &= (~MD_CAN_SP); | |
1759 } | |
1760 } | |
1761 smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev, | |
1762 0, "shared by blk", 0); | |
1763 smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev, | |
1764 0, "shared by indx", 0); | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1765 smic->sm_get_component_count = (int (*)())md_get_named_service( |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1766 sm->sm_dev, 0, "get component count", 0); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1767 smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0, |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1768 "get block count skip size", 0); |
0 | 1769 sm->sm_state &= ~SMS_IGNORE; |
1770 if (SMS_IS(sm, SMS_OFFLINE)) | |
1771 MD_STATUS(un) |= MD_UN_OFFLINE_SM; | |
1772 md_set_parent(sm->sm_dev, MD_SID(un)); | |
1773 } | |
1774 | |
1775 static void | |
1776 mirror_cleanup(mm_unit_t *un) | |
1777 { | |
1778 mddb_recid_t recid; | |
1779 int smi; | |
1780 sv_dev_t sv[NMIRROR]; | |
1781 int nsv = 0; | |
1782 | |
1783 /* | |
1784 * If a MN diskset and this node is not the master, do | |
1785 * not delete any records on snarf of the mirror records. | |
1786 */ | |
1787 if (MD_MNSET_SETNO(MD_UN2SET(un)) && | |
1788 md_set[MD_UN2SET(un)].s_am_i_master == 0) { | |
1789 return; | |
1790 } | |
1791 | |
1792 for (smi = 0; smi < NMIRROR; smi++) { | |
1793 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) | |
1794 continue; | |
1795 sv[nsv].setno = MD_UN2SET(un); | |
1796 sv[nsv++].key = un->un_sm[smi].sm_key; | |
1797 } | |
1798 | |
1799 recid = un->un_rr_dirty_recid; | |
1800 mddb_deleterec_wrapper(un->c.un_record_id); | |
1801 if (recid > 0) | |
1802 mddb_deleterec_wrapper(recid); | |
1803 | |
1804 md_rem_names(sv, nsv); | |
1805 } | |
1806 | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1807 /* |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1808 * Comparison function for the avl tree which tracks |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1809 * outstanding writes on submirrors. |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1810 * |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1811 * Returns: |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1812 * -1: ps1 < ps2 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1813 * 0: ps1 and ps2 overlap |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1814 * 1: ps1 > ps2 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1815 */ |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1816 static int |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1817 mirror_overlap_compare(const void *p1, const void *p2) |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1818 { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1819 const md_mps_t *ps1 = (md_mps_t *)p1; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1820 const md_mps_t *ps2 = (md_mps_t *)p2; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1821 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1822 if (ps1->ps_firstblk < ps2->ps_firstblk) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1823 if (ps1->ps_lastblk >= ps2->ps_firstblk) |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1824 return (0); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1825 return (-1); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1826 } |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1827 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1828 if (ps1->ps_firstblk > ps2->ps_firstblk) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1829 if (ps1->ps_firstblk <= ps2->ps_lastblk) |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1830 return (0); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1831 return (1); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1832 } |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1833 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1834 return (0); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1835 } |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1836 |
10948
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
/*
 * Collapse any sparse submirror entries snarfed from the on-disk replica.
 * Only the in-core entries are updated. The replica will be updated on-disk
 * when the in-core replica is committed on shutdown of the SVM subsystem.
 *
 * Repeatedly scans un_sm[] for the first entry not marked SMS_INUSE,
 * shifts all later entries down one slot, and rescans until no sparse
 * entry remains.  On the final (clean) pass the un_smic[] accessor
 * functions are re-derived for every remaining entry, so the shuffle
 * does not need to move un_smic[] itself.
 *
 * NOTE(review): un->un_nsm is read once into nsmidx and is not
 * decremented here; presumably the caller context (snarf time,
 * single-threaded) guarantees this is consistent — confirm.
 */
static void
collapse_submirrors(mm_unit_t *un)
{
	int smi, nremovals, smiremove;
	mm_submirror_t *sm, *new_sm, *old_sm;
	mm_submirror_ic_t *smic;
	int nsmidx = un->un_nsm - 1;

rescan:
	nremovals = 0;
	smiremove = -1;

	/* Find the first submirror slot that is not in use (if any). */
	for (smi = 0; smi <= nsmidx; smi++) {
		sm = &un->un_sm[smi];

		/*
		 * Check to see if this submirror is marked as in-use.
		 * If it isn't then it is a potential sparse entry and
		 * may need to be cleared from the configuration.
		 * The records should _already_ have been cleared by the
		 * original mirror_detach() code, but we need to shuffle
		 * any NULL entries in un_sm[] to the end of the array.
		 * Any NULL un_smic[] entries need to be reset to the underlying
		 * submirror/slice accessor functions.
		 */
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
			nremovals++;
			smiremove = smi;
			break;
		}
	}

	if (nremovals == 0) {
		/*
		 * Termination path: no sparse entries remain.
		 * Ensure that we have a matching contiguous set of un_smic[]
		 * entries for the corresponding un_sm[] entries
		 */
		for (smi = 0; smi <= nsmidx; smi++) {
			smic = &un->un_smic[smi];
			sm = &un->un_sm[smi];

			/*
			 * Re-resolve the named-service accessors from the
			 * (possibly shifted) submirror device so un_smic[smi]
			 * always matches un_sm[smi].
			 */
			smic->sm_shared_by_blk =
			    md_get_named_service(sm->sm_dev, 0,
			    "shared by_blk", 0);
			smic->sm_shared_by_indx =
			    md_get_named_service(sm->sm_dev, 0,
			    "shared by indx", 0);
			smic->sm_get_component_count =
			    (int (*)())md_get_named_service(sm->sm_dev, 0,
			    "get component count", 0);
			smic->sm_get_bcss =
			    (int (*)())md_get_named_service(sm->sm_dev, 0,
			    "get block count skip size", 0);
		}
		return;
	}

	/*
	 * Reshuffle the submirror devices so that we do not have a dead record
	 * in the middle of the array. Once we've done this we need to rescan
	 * the mirror to check for any other holes.
	 *
	 * Each entry above smiremove is copied one slot down; the vacated
	 * source slot is zeroed so the last slot ends up empty.  The slot
	 * at smiremove itself is simply overwritten by its successor.
	 */
	for (smi = 0; smi < NMIRROR; smi++) {
		if (smi < smiremove)
			continue;
		if (smi > smiremove) {
			old_sm = &un->un_sm[smi];
			new_sm = &un->un_sm[smi - 1];
			bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
			bzero(old_sm, sizeof (mm_submirror_t));
		}
	}

	/*
	 * Now we need to rescan the array to find the next potential dead
	 * entry.
	 */
	goto rescan;
}
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1921 |
0 | 1922 /* Return a -1 if optimized record unavailable and set should be released */ |
1923 int | |
1924 mirror_build_incore(mm_unit_t *un, int snarfing) | |
1925 { | |
1926 int i; | |
1927 | |
1928 if (MD_STATUS(un) & MD_UN_BEING_RESET) { | |
1929 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); | |
1930 return (1); | |
1931 } | |
1932 | |
1933 if (mirror_are_submirrors_available(un) == 0) | |
1934 return (1); | |
1935 | |
1936 if (MD_UNIT(MD_SID(un)) != NULL) | |
1937 return (0); | |
1938 | |
1939 MD_STATUS(un) = 0; | |
1940 | |
1941 /* pre-4.1 didn't define CAN_META_CHILD capability */ | |
1942 MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP; | |
1943 | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1944 un->un_overlap_tree_flag = 0; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1945 avl_create(&un->un_overlap_root, mirror_overlap_compare, |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1946 sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node)); |
0 | 1947 |
10948
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1948 /* |
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1949 * We need to collapse any sparse submirror entries into a non-sparse |
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1950 * array. This is to cover the case where we have an old replica image |
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1951 * which has not been updated (i.e. snarfed) since being modified. |
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1952 * The new code expects all submirror access to be sequential (i.e. |
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1953 * both the un_sm[] and un_smic[] entries correspond to non-empty |
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1954 * submirrors. |
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1955 */ |
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1956 |
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1957 collapse_submirrors(un); |
c686aa11575c
6812139 SVM dereferences NULL from sm_get_component_count and panics
James Hall <James.Hall@Sun.COM>
parents:
8452
diff
changeset
|
1958 |
0 | 1959 for (i = 0; i < NMIRROR; i++) |
1960 build_submirror(un, i, snarfing); | |
1961 | |
1962 if (unit_setup_resync(un, snarfing) != 0) { | |
1963 if (snarfing) { | |
1964 mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT); | |
1965 /* | |
1966 * If a MN set and set is not stale, then return -1 | |
1967 * which will force the caller to unload the set. | |
1968 * The MN diskset nodes will return failure if | |
1969 * unit_setup_resync fails so that nodes won't | |
1970 * get out of sync. | |
1971 * | |
1972 * If set is STALE, the master node can't allocate | |
1973 * a resync record (if needed), but node needs to | |
1974 * join the set so that user can delete broken mddbs. | |
1975 * So, if set is STALE, just continue on. | |
1976 */ | |
1977 if (MD_MNSET_SETNO(MD_UN2SET(un)) && | |
1978 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { | |
1979 return (-1); | |
1980 } | |
1981 } else | |
1982 return (1); | |
1983 } | |
1984 | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1985 mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
1986 cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL); |
0 | 1987 |
1988 un->un_suspend_wr_flag = 0; | |
1989 mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL); | |
1990 cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL); | |
1991 | |
1992 /* | |
1993 * Allocate mutexes for mirror-owner and resync-owner changes. | |
1994 * All references to the owner message state field must be guarded | |
1995 * by this mutex. | |
1996 */ | |
1997 mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL); | |
1998 | |
1999 /* | |
2000 * Allocate mutex and condvar for resync thread manipulation. These | |
2001 * will be used by mirror_resync_unit/mirror_ioctl_resync | |
2002 */ | |
2003 mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL); | |
2004 cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL); | |
2005 | |
2006 /* | |
2007 * Allocate mutex and condvar for resync progress thread manipulation. | |
2008 * This allows resyncs to be continued across an intervening reboot. | |
2009 */ | |
2010 mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL); | |
2011 cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL); | |
2012 | |
2013 /* | |
2014 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This | |
2015 * provides synchronization between a user-ioctl and the resulting | |
2016 * strategy() call that performs the read(). | |
2017 */ | |
2018 mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL); | |
2019 cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL); | |
2020 | |
8452
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
2021 /* |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
2022 * Allocate rwlocks for un_pernode_dirty_bm accessing. |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
2023 */ |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
2024 for (i = 0; i < MD_MNMAXSIDES; i++) { |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
2025 rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
2026 } |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
2027 |
7627
8599a7568728
6743774 TSlvm tests cause kernel deadlocks on md_unit_array_rw and md_devinfo->devi_cv
Chris Horne <Chris.Horne@Sun.COM>
parents:
6901
diff
changeset
|
2028 /* place various information in the in-core data structures */ |
8599a7568728
6743774 TSlvm tests cause kernel deadlocks on md_unit_array_rw and md_devinfo->devi_cv
Chris Horne <Chris.Horne@Sun.COM>
parents:
6901
diff
changeset
|
2029 md_nblocks_set(MD_SID(un), un->c.un_total_blocks); |
0 | 2030 MD_UNIT(MD_SID(un)) = un; |
7627
8599a7568728
6743774 TSlvm tests cause kernel deadlocks on md_unit_array_rw and md_devinfo->devi_cv
Chris Horne <Chris.Horne@Sun.COM>
parents:
6901
diff
changeset
|
2031 |
0 | 2032 return (0); |
2033 } | |
2034 | |
2035 | |
/*
 * Tear down the in-core state for a mirror unit and, if 'removing' is set,
 * delete its metadb records and namespace entries as well.
 *
 * mnum     - minor number of the mirror unit
 * removing - non-zero when the unit is being deleted (not just unloaded);
 *            when zero this routine stops after releasing the minor node.
 *
 * Ordering is significant throughout: recid, vtoc_id and selfid are
 * captured from 'un' before mddb_deleterec_wrapper() frees the unit
 * structure, and only those saved copies are used afterwards.
 */
void
reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
{
	mddb_recid_t	recid, vtoc_id;
	size_t		bitcnt;
	size_t		shortcnt;
	int		smi;
	sv_dev_t	sv[NMIRROR];
	int		nsv = 0;
	uint_t		bits = 0;
	minor_t		selfid;
	md_unit_t	*su;
	int		i;

	md_destroy_unit_incore(mnum, &mirror_md_ops);

	/* Sizes of the resync-region arrays/bitmaps freed below. */
	shortcnt = un->un_rrd_num * sizeof (short);
	bitcnt = howmany(un->un_rrd_num, NBBY);

	if (un->un_outstanding_writes)
		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
	if (un->un_goingclean_bm)
		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
	if (un->un_goingdirty_bm)
		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
	if (un->un_resync_bm)
		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
	if (un->un_pernode_dirty_sum)
		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);

	/*
	 * Destroy the taskq for deferred processing of DRL clean requests.
	 * This taskq will only be present for Multi Owner mirrors.
	 */
	if (un->un_drl_task != NULL)
		ddi_taskq_destroy(un->un_drl_task);

	/* Drop the unit from the global size and unit tables. */
	md_nblocks_set(mnum, -1ULL);
	MD_UNIT(mnum) = NULL;

	/*
	 * Attempt release of its minor node
	 */
	md_remove_minor_node(mnum);

	if (!removing)
		return;

	/*
	 * Full removal: detach every in-use submirror, collecting the
	 * namespace keys (sv[]) and a bitmask of detached slots (bits)
	 * for the cleanup that follows.
	 */
	for (smi = 0; smi < NMIRROR; smi++) {
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
			continue;
		/* reallow soft partitioning of submirror and reset parent */
		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
		MD_CAPAB(su) |= MD_CAN_SP;
		md_reset_parent(un->un_sm[smi].sm_dev);
		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);

		sv[nsv].setno = MD_MIN2SET(mnum);
		sv[nsv++].key = un->un_sm[smi].sm_key;
		bits |= SMI2BIT(smi);
	}

	/*
	 * Save everything still needed from 'un' before the unit record
	 * is deleted (which frees the structure).
	 */
	MD_STATUS(un) |= MD_UN_BEING_RESET;
	recid = un->un_rr_dirty_recid;
	vtoc_id = un->c.un_vtoc_id;
	selfid = MD_SID(un);

	mirror_commit(un, bits, 0);

	avl_destroy(&un->un_overlap_root);

	/* Destroy all mutexes and condvars before returning. */
	mutex_destroy(&un->un_suspend_wr_mx);
	cv_destroy(&un->un_suspend_wr_cv);
	mutex_destroy(&un->un_overlap_tree_mx);
	cv_destroy(&un->un_overlap_tree_cv);
	mutex_destroy(&un->un_owner_mx);
	mutex_destroy(&un->un_rs_thread_mx);
	cv_destroy(&un->un_rs_thread_cv);
	mutex_destroy(&un->un_rs_progress_mx);
	cv_destroy(&un->un_rs_progress_cv);
	mutex_destroy(&un->un_dmr_mx);
	cv_destroy(&un->un_dmr_cv);

	/* Per-node dirty bitmaps share the same bitcnt computed above. */
	for (i = 0; i < MD_MNMAXSIDES; i++) {
		rw_destroy(&un->un_pernode_dirty_mx[i]);
		if (un->un_pernode_dirty_bm[i])
			kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
	}

	/*
	 * Remove self from the namespace
	 */
	if (un->c.un_revision & MD_FN_META_DEV) {
		(void) md_rem_selfname(un->c.un_self_id);
	}

	/* This frees the unit structure. */
	mddb_deleterec_wrapper(un->c.un_record_id);

	/* 'un' is gone; only the saved ids may be used from here on. */
	if (recid != 0)
		mddb_deleterec_wrapper(recid);

	/* Remove the vtoc, if present */
	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);

	md_rem_names(sv, nsv);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
	    MD_MIN2SET(selfid), selfid);
}
2148 | |
2149 int | |
2150 mirror_internal_open( | |
2151 minor_t mnum, | |
2152 int flag, | |
2153 int otyp, | |
2154 int md_oflags, | |
2155 IOLOCK *lockp /* can be NULL */ | |
2156 ) | |
2157 { | |
2158 mdi_unit_t *ui = MDI_UNIT(mnum); | |
2159 int err = 0; | |
2160 | |
2161 tryagain: | |
2162 /* single thread */ | |
2163 if (lockp) { | |
2164 /* | |
2165 * If ioctl lock is held, use openclose_enter | |
2166 * routine that will set the ioctl flag when | |
2167 * grabbing the readerlock. | |
2168 */ | |
2169 (void) md_ioctl_openclose_enter(lockp, ui); | |
2170 } else { | |
2171 (void) md_unit_openclose_enter(ui); | |
2172 } | |
2173 | |
2174 /* | |
2175 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE | |
2176 * message in a MN diskset and this requires that the openclose | |
2177 * lock is dropped in order to send this message. So, another | |
2178 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from | |
2179 * attempting an open while this thread has an open in progress. | |
2180 * Call the *_lh version of the lock exit routines since the ui_mx | |
2181 * mutex must be held from checking for OPENINPROGRESS until | |
2182 * after the cv_wait call. | |
2183 */ | |
2184 mutex_enter(&ui->ui_mx); | |
2185 if (ui->ui_lock & MD_UL_OPENINPROGRESS) { | |
2186 if (lockp) { | |
2187 (void) md_ioctl_openclose_exit_lh(lockp); | |
2188 } else { | |
2189 md_unit_openclose_exit_lh(ui); | |
2190 } | |
2191 cv_wait(&ui->ui_cv, &ui->ui_mx); | |
2192 mutex_exit(&ui->ui_mx); | |
2193 goto tryagain; | |
2194 } | |
2195 | |
2196 ui->ui_lock |= MD_UL_OPENINPROGRESS; | |
2197 mutex_exit(&ui->ui_mx); | |
2198 | |
2199 /* open devices, if necessary */ | |
2200 if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) { | |
2201 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0) | |
2202 goto out; | |
2203 } | |
2204 | |
2205 /* count open */ | |
2206 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) | |
2207 goto out; | |
2208 | |
2209 /* unlock, return success */ | |
2210 out: | |
2211 mutex_enter(&ui->ui_mx); | |
2212 ui->ui_lock &= ~MD_UL_OPENINPROGRESS; | |
2213 mutex_exit(&ui->ui_mx); | |
2214 | |
2215 if (lockp) { | |
2216 /* | |
2217 * If ioctl lock is held, use openclose_exit | |
2218 * routine that will clear the lockp reader flag. | |
2219 */ | |
2220 (void) md_ioctl_openclose_exit(lockp); | |
2221 } else { | |
2222 md_unit_openclose_exit(ui); | |
2223 } | |
2224 return (err); | |
2225 } | |
2226 | |
2227 int | |
2228 mirror_internal_close( | |
2229 minor_t mnum, | |
2230 int otyp, | |
2231 int md_cflags, | |
2232 IOLOCK *lockp /* can be NULL */ | |
2233 ) | |
2234 { | |
2235 mdi_unit_t *ui = MDI_UNIT(mnum); | |
2236 mm_unit_t *un; | |
2237 int err = 0; | |
2238 | |
2239 /* single thread */ | |
2240 if (lockp) { | |
2241 /* | |
2242 * If ioctl lock is held, use openclose_enter | |
2243 * routine that will set the ioctl flag when | |
2244 * grabbing the readerlock. | |
2245 */ | |
2246 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui); | |
2247 } else { | |
2248 un = (mm_unit_t *)md_unit_openclose_enter(ui); | |
2249 } | |
2250 | |
2251 /* count closed */ | |
2252 if ((err = md_unit_decopen(mnum, otyp)) != 0) | |
2253 goto out; | |
2254 | |
2255 /* close devices, if necessary */ | |
2256 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { | |
2257 /* | |
2258 * Clean up dirty bitmap for this unit. Do this | |
2259 * before closing the underlying devices to avoid | |
2260 * race conditions with reset_mirror() as a | |
2261 * result of a 'metaset -r' command running in | |
2262 * parallel. This might cause deallocation of | |
2263 * dirty region bitmaps; with underlying metadevices | |
2264 * in place this can't happen. | |
2265 * Don't do this if a MN set and ABR not set | |
2266 */ | |
2267 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) { | |
2268 if (!MD_MNSET_SETNO(MD_UN2SET(un)) || | |
2269 !(ui->ui_tstate & MD_ABR_CAP)) | |
2270 mirror_process_unit_resync(un); | |
2271 } | |
2272 (void) mirror_close_all_devs(un, md_cflags); | |
2273 | |
2274 /* | |
2275 * For a MN set with transient capabilities (eg ABR/DMR) set, | |
2276 * clear these capabilities on the last open in the cluster. | |
2277 * To do this we send a message to all nodes to see of the | |
2278 * device is open. | |
2279 */ | |
2280 if (MD_MNSET_SETNO(MD_UN2SET(un)) && | |
2281 (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) { | |
2282 if (lockp) { | |
2283 (void) md_ioctl_openclose_exit(lockp); | |
2284 } else { | |
2285 md_unit_openclose_exit(ui); | |
2286 } | |
2287 | |
2288 /* | |
2289 * if we are in the context of an ioctl, drop the | |
2290 * ioctl lock. | |
2291 * Otherwise, no other locks should be held. | |
2292 */ | |
2293 if (lockp) { | |
2294 IOLOCK_RETURN_RELEASE(0, lockp); | |
2295 } | |
2296 | |
2297 mdmn_clear_all_capabilities(mnum); | |
2298 | |
2299 /* if dropped the lock previously, regain it */ | |
2300 if (lockp) { | |
2301 IOLOCK_RETURN_REACQUIRE(lockp); | |
2302 } | |
2303 return (0); | |
2304 } | |
2305 /* unlock and return success */ | |
2306 } | |
2307 out: | |
2308 /* Call whether lockp is NULL or not. */ | |
2309 if (lockp) { | |
2310 md_ioctl_openclose_exit(lockp); | |
2311 } else { | |
2312 md_unit_openclose_exit(ui); | |
2313 } | |
2314 return (err); | |
2315 } | |
2316 | |
2317 /* | |
2318 * When a component has completed resyncing and is now ok, check if the | |
2319 * corresponding component in the other submirrors is in the Last Erred | |
2320 * state. If it is, we want to change that to the Erred state so we stop | |
2321 * using that component and start using this good component instead. | |
2322 * | |
2323 * This is called from set_sm_comp_state and recursively calls | |
2324 * set_sm_comp_state if it needs to change the Last Erred state. | |
2325 */ | |
2326 static void | |
2327 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags, | |
2328 IOLOCK *lockp) | |
2329 { | |
2330 mm_submirror_t *sm; | |
2331 mm_submirror_ic_t *smic; | |
2332 int ci; | |
2333 int i; | |
2334 int compcnt; | |
2335 int changed = 0; | |
2336 | |
2337 for (i = 0; i < NMIRROR; i++) { | |
2338 sm = &un->un_sm[i]; | |
2339 smic = &un->un_smic[i]; | |
2340 | |
2341 if (!SMS_IS(sm, SMS_INUSE)) | |
2342 continue; | |
2343 | |
2344 /* ignore the submirror that we just made ok */ | |
2345 if (i == smi) | |
2346 continue; | |
2347 | |
2348 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); | |
2349 for (ci = 0; ci < compcnt; ci++) { | |
2350 md_m_shared_t *shared; | |
2351 | |
2352 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) | |
2353 (sm->sm_dev, sm, ci); | |
2354 | |
2355 if ((shared->ms_state & CS_LAST_ERRED) && | |
2356 !mirror_other_sources(un, i, ci, 1)) { | |
2357 | |
2358 set_sm_comp_state(un, i, ci, CS_ERRED, extras, | |
2359 flags, lockp); | |
2360 changed = 1; | |
2361 } | |
2362 } | |
2363 } | |
2364 | |
2365 /* maybe there is a hotspare for this newly erred component */ | |
2366 if (changed) { | |
2367 set_t setno; | |
2368 | |
2369 setno = MD_UN2SET(un); | |
2370 if (MD_MNSET_SETNO(setno)) { | |
2371 send_poke_hotspares(setno); | |
2372 } else { | |
2373 (void) poke_hotspares(); | |
2374 } | |
2375 } | |
2376 } | |
2377 | |
2378 /* | |
2379 * set_sm_comp_state | |
2380 * | |
2381 * Set the state of a submirror component to the specified new state. | |
2382 * If the mirror is in a multi-node set, send messages to all nodes to | |
2383 * block all writes to the mirror and then update the state and release the | |
2384 * writes. These messages are only sent if MD_STATE_XMIT is set in flags. | |
2385 * MD_STATE_XMIT will be unset in 2 cases: | |
2386 * 1. When the state is changed to CS_RESYNC as this state change | |
2387 * will already have been updated on each node by the processing of the | |
2388 * distributed metasync command, hence no need to xmit. | |
2389 * 2. When the state is change to CS_OKAY after a resync has completed. Again | |
2390 * the resync completion will already have been processed on each node by | |
2391 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component | |
2392 * resync, hence no need to xmit. | |
2393 * | |
2394 * In case we are called from the updates of a watermark, | |
2395 * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to | |
2396 * a metainit or similar. In this case the message that we sent to propagate | |
2397 * the state change must not be a class1 message as that would deadlock with | |
2398 * the metainit command that is still being processed. | |
2399 * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2 | |
2400 * instead. This also makes the submessage generator to create a class2 | |
2401 * submessage rather than a class1 (which would also block) | |
2402 * | |
2403 * On entry, unit_writerlock is held | |
2404 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is | |
2405 * also held. | |
2406 */ | |
void
set_sm_comp_state(
	mm_unit_t	*un,	/* mirror unit */
	int		smi,	/* submirror index */
	int		ci,	/* component index within submirror */
	int		newstate,	/* CS_* state to set */
	mddb_recid_t	*extras,	/* extra recids for mirror_commit() */
	uint_t		flags,	/* MD_STATE_XMIT/WMUPDATE/OCHELD */
	IOLOCK		*lockp	/* non-NULL in ioctl context */
)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			origstate;
	void			(*get_dev)();
	ms_cd_info_t		cd;
	char			devname[MD_MAX_CTDLEN];
	int			err;
	set_t			setno = MD_UN2SET(un);
	md_mn_msg_stch_t	stchmsg;
	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
	md_mn_kresult_t		*kresult;
	int			rval;
	uint_t			msgflags;
	md_mn_msgtype_t		msgtype;
	int			save_lock = 0;
	mdi_unit_t		*ui_sm;
	int			nretries = 0;	/* commd retry count */

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];

	/* If we have a real error status then turn off MD_INACCESSIBLE. */
	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
	}

	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
	    (sm->sm_dev, sm, ci);
	origstate = shared->ms_state;

	/*
	 * If the new state is an error and the old one wasn't, generate
	 * a console message. We do this before we send the state to other
	 * nodes in a MN set because the state change may change the component
	 * name if a hotspare is allocated.
	 */
	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {

		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
		    "get device", 0);
		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
		    cd.cd_dev, devname, sizeof (devname));

		/* Fall back to the raw device name if no name is recorded. */
		if (err == ENOENT) {
			(void) md_devname(setno, cd.cd_dev, devname,
			    sizeof (devname));
		}

		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
		    md_shortname(md_getminor(sm->sm_dev)), devname);

		if (newstate & CS_LAST_ERRED) {
			cmn_err(CE_WARN, "md: %s: %s last erred",
			    md_shortname(md_getminor(sm->sm_dev)),
			    devname);

		} else if (shared->ms_flags & MDM_S_ISOPEN) {
			/*
			 * Close the broken device and clear the open flag on
			 * it. Closing the device means the RCM framework will
			 * be able to unconfigure the device if required.
			 *
			 * We have to check that the device is open, otherwise
			 * the first open on it has resulted in the error that
			 * is being processed and the actual cd.cd_dev will be
			 * NODEV64.
			 *
			 * If this is a multi-node mirror, then the multinode
			 * state checks following this code will cause the
			 * slave nodes to close the mirror in the function
			 * mirror_set_state().
			 */
			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
			shared->ms_flags &= ~MDM_S_ISOPEN;
		}

	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
	    (shared->ms_flags & MDM_S_ISOPEN)) {
		/*
		 * Similar to logic above except no log messages since we
		 * are just transitioning from Last Erred to Erred.
		 */
		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
		    "get device", 0);
		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
		shared->ms_flags &= ~MDM_S_ISOPEN;
	}

	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
		/*
		 * For a multi-node mirror, send the state change to the
		 * master, which broadcasts to all nodes, including this
		 * one. Once the message is received, the state is set
		 * in-core and the master commits the change to disk.
		 * There is a case, comp_replace, where this function
		 * can be called from within an ioctl and therefore in this
		 * case, as the ioctl will already be called on each node,
		 * there is no need to xmit the state change to the master for
		 * distribution to the other nodes. MD_STATE_XMIT flag is used
		 * to indicate whether a xmit is required. The mirror's
		 * transient state is set to MD_ERR_PENDING to avoid sending
		 * multiple messages.
		 */
		if (newstate & (CS_ERRED|CS_LAST_ERRED))
			ui->ui_tstate |= MD_ERR_PENDING;

		/*
		 * Send a state update message to all nodes. This message
		 * will generate 2 submessages, the first one to suspend
		 * all writes to the mirror and the second to update the
		 * state and resume writes.
		 */
		stchmsg.msg_stch_mnum = un->c.un_self_id;
		stchmsg.msg_stch_sm = smi;
		stchmsg.msg_stch_comp = ci;
		stchmsg.msg_stch_new_state = newstate;
		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
#ifdef DEBUG
		if (mirror_debug_flag)
			printf("send set state, %x, %x, %x, %x, %x\n",
			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
			    stchmsg.msg_stch_hs_id);
#endif
		if (flags & MD_STATE_WMUPDATE) {
			msgtype = MD_MN_MSG_STATE_UPDATE2;
			/*
			 * When coming from an update of watermarks, there
			 * must already be a message logged that triggered
			 * this action. So, no need to log this message, too.
			 */
			msgflags = MD_MSGF_NO_LOG;
		} else {
			msgtype = MD_MN_MSG_STATE_UPDATE;
			msgflags = MD_MSGF_DEFAULT_FLAGS;
		}

		/*
		 * If we are in the context of an ioctl, drop the ioctl lock.
		 * lockp holds the list of locks held.
		 *
		 * Otherwise, increment the appropriate reacquire counters.
		 * If openclose lock is *held, then must reacquire reader
		 * lock before releasing the openclose lock.
		 * Do not drop the ARRAY_WRITER lock as we may not be able
		 * to reacquire it.
		 */
		if (lockp) {
			if (lockp->l_flags & MD_ARRAY_WRITER) {
				save_lock = MD_ARRAY_WRITER;
				lockp->l_flags &= ~MD_ARRAY_WRITER;
			} else if (lockp->l_flags & MD_ARRAY_READER) {
				save_lock = MD_ARRAY_READER;
				lockp->l_flags &= ~MD_ARRAY_READER;
			}
			IOLOCK_RETURN_RELEASE(0, lockp);
		} else {
			if (flags & MD_STATE_OCHELD) {
				md_unit_writerexit(ui);
				(void) md_unit_readerlock(ui);
				md_unit_openclose_exit(ui);
			} else {
				md_unit_writerexit(ui);
			}
		}

		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
sscs_msg:
		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
		    (char *)&stchmsg, sizeof (stchmsg), kresult);

		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
			/* If we're shutting down already, pause things here. */
			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
				while (!md_mn_is_commd_present()) {
					delay(md_hz);
				}
				/*
				 * commd is now available; retry the message
				 * one time. If that fails we fall through and
				 * panic as the system is in an unexpected state
				 */
				if (nretries++ == 0)
					goto sscs_msg;
			}
			cmn_err(CE_PANIC,
			    "ksend_message failure: STATE_UPDATE");
		}
		kmem_free(kresult, sizeof (md_mn_kresult_t));

		/* if dropped the lock previously, regain it */
		if (lockp) {
			IOLOCK_RETURN_REACQUIRE(lockp);
			lockp->l_flags |= save_lock;
		} else {
			/*
			 * Reacquire dropped locks and update acquirecnts
			 * appropriately.
			 */
			if (flags & MD_STATE_OCHELD) {
				/*
				 * openclose also grabs readerlock.
				 */
				(void) md_unit_openclose_enter(ui);
				md_unit_readerexit(ui);
				(void) md_unit_writerlock(ui);
			} else {
				(void) md_unit_writerlock(ui);
			}
		}

		ui->ui_tstate &= ~MD_ERR_PENDING;
	} else {
		/* Local (non-xmit) update: set the state in-core and commit. */
		shared->ms_state = newstate;
		uniqtime32(&shared->ms_timestamp);

		if (newstate == CS_ERRED)
			shared->ms_flags |= MDM_S_NOWRITE;
		else
			shared->ms_flags &= ~MDM_S_NOWRITE;

		shared->ms_flags &= ~MDM_S_IOERR;
		un->un_changecnt++;
		shared->ms_lasterrcnt = un->un_changecnt;

		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
		mirror_commit(un, SMI2BIT(smi), extras);
	}

	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
		/*
		 * Resetting the Last Erred state will recursively call back
		 * into this function (set_sm_comp_state) to update the state.
		 */
		reset_lasterred(un, smi, extras, flags, lockp);
	}
}
2665 | |
2666 static int | |
2667 find_another_logical( | |
2668 mm_unit_t *un, | |
2669 mm_submirror_t *esm, | |
2670 diskaddr_t blk, | |
2671 u_longlong_t cnt, | |
2672 int must_be_open, | |
2673 int state, | |
2674 int err_cnt) | |
2675 { | |
2676 u_longlong_t cando; | |
2677 md_dev64_t dev; | |
2678 md_m_shared_t *s; | |
2679 | |
2680 esm->sm_state |= SMS_IGNORE; | |
2681 while (cnt != 0) { | |
2682 u_longlong_t mcnt; | |
2683 | |
2684 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */ | |
2685 | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
2686 dev = select_read_unit(un, blk, mcnt, &cando, |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
2687 must_be_open, &s, NULL); |
0 | 2688 if (dev == (md_dev64_t)0) |
2689 break; | |
2690 | |
2691 if ((state == CS_LAST_ERRED) && | |
2692 (s->ms_state == CS_LAST_ERRED) && | |
2693 (err_cnt > s->ms_lasterrcnt)) | |
2694 break; | |
2695 | |
2696 cnt -= cando; | |
2697 blk += cando; | |
2698 } | |
2699 esm->sm_state &= ~SMS_IGNORE; | |
2700 return (cnt != 0); | |
2701 } | |
2702 | |
2703 int | |
2704 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open) | |
2705 { | |
2706 mm_submirror_t *sm; | |
2707 mm_submirror_ic_t *smic; | |
2708 size_t count; | |
2709 diskaddr_t block; | |
2710 u_longlong_t skip; | |
2711 u_longlong_t size; | |
2712 md_dev64_t dev; | |
2713 int cnt; | |
2714 md_m_shared_t *s; | |
2715 int not_found; | |
2716 | |
2717 sm = &un->un_sm[smi]; | |
2718 smic = &un->un_smic[smi]; | |
2719 dev = sm->sm_dev; | |
2720 | |
2721 /* | |
2722 * Make sure every component of the submirror | |
2723 * has other sources. | |
2724 */ | |
2725 if (ci < 0) { | |
2726 /* Find the highest lasterrcnt */ | |
2727 cnt = (*(smic->sm_get_component_count))(dev, sm); | |
2728 for (ci = 0; ci < cnt; ci++) { | |
2729 not_found = mirror_other_sources(un, smi, ci, | |
2730 must_be_open); | |
2731 if (not_found) | |
2732 return (1); | |
2733 } | |
2734 return (0); | |
2735 } | |
2736 | |
2737 /* | |
2738 * Make sure this component has other sources | |
2739 */ | |
2740 (void) (*(smic->sm_get_bcss)) | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
2741 (dev, sm, ci, &block, &count, &skip, &size); |
0 | 2742 |
2743 if (count == 0) | |
2744 return (1); | |
2745 | |
2746 s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci); | |
2747 | |
2748 while (count--) { | |
2749 if (block >= un->c.un_total_blocks) | |
2750 return (0); | |
2751 | |
2752 if ((block + size) > un->c.un_total_blocks) | |
2753 size = un->c.un_total_blocks - block; | |
2754 | |
2755 not_found = find_another_logical(un, sm, block, size, | |
2756 must_be_open, s->ms_state, s->ms_lasterrcnt); | |
2757 if (not_found) | |
2758 return (1); | |
2759 | |
2760 block += size + skip; | |
2761 } | |
2762 return (0); | |
2763 } | |
2764 | |
2765 static void | |
2766 finish_error(md_mps_t *ps) | |
2767 { | |
2768 struct buf *pb; | |
2769 mm_unit_t *un; | |
2770 mdi_unit_t *ui; | |
2771 uint_t new_str_flags; | |
2772 | |
2773 pb = ps->ps_bp; | |
2774 un = ps->ps_un; | |
2775 ui = ps->ps_ui; | |
2776 | |
2777 /* | |
2778 * Must flag any error to the resync originator if we're performing | |
2779 * a Write-after-Read. This corresponds to an i/o error on a resync | |
2780 * target device and in this case we ought to abort the resync as there | |
2781 * is nothing that can be done to recover from this without operator | |
2782 * intervention. If we don't set the B_ERROR flag we will continue | |
2783 * reading from the mirror but won't write to the target (as it will | |
2784 * have been placed into an errored state). | |
2785 * To handle the case of multiple components within a submirror we only | |
2786 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR. | |
2787 * The originator of the resync read will cause this bit to be set if | |
2788 * the underlying component count is one for a submirror resync. All | |
2789 * other resync types will have the flag set as there is no underlying | |
2790 * resync which can be performed on a contained metadevice for these | |
2791 * resync types (optimized or component). | |
2792 */ | |
2793 | |
2794 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) { | |
2795 if (ps->ps_flags & MD_MPS_FLAG_ERROR) | |
2796 pb->b_flags |= B_ERROR; | |
2797 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); | |
2798 MPS_FREE(mirror_parent_cache, ps); | |
2799 md_unit_readerexit(ui); | |
2800 md_biodone(pb); | |
2801 return; | |
2802 } | |
2803 /* | |
2804 * Set the MD_IO_COUNTED flag as we are retrying the same I/O | |
2805 * operation therefore this I/O request has already been counted, | |
2806 * the I/O count variable will be decremented by mirror_done()'s | |
2807 * call to md_biodone(). | |
2808 */ | |
2809 if (ps->ps_changecnt != un->un_changecnt) { | |
2810 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED; | |
2811 if (ps->ps_flags & MD_MPS_WOW) | |
2812 new_str_flags |= MD_STR_WOW; | |
2813 if (ps->ps_flags & MD_MPS_MAPPED) | |
2814 new_str_flags |= MD_STR_MAPPED; | |
2815 /* | |
2816 * If this I/O request was a read that was part of a resync, | |
2817 * set MD_STR_WAR for the retried read to ensure that the | |
2818 * resync write (i.e. write-after-read) will be performed | |
2819 */ | |
2820 if (ps->ps_flags & MD_MPS_RESYNC_READ) | |
2821 new_str_flags |= MD_STR_WAR; | |
2822 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); | |
2823 MPS_FREE(mirror_parent_cache, ps); | |
2824 md_unit_readerexit(ui); | |
2825 (void) md_mirror_strategy(pb, new_str_flags, NULL); | |
2826 return; | |
2827 } | |
2828 | |
2829 pb->b_flags |= B_ERROR; | |
2830 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); | |
2831 MPS_FREE(mirror_parent_cache, ps); | |
2832 md_unit_readerexit(ui); | |
2833 md_biodone(pb); | |
2834 } | |
2835 | |
2836 static void | |
2837 error_update_unit(md_mps_t *ps) | |
2838 { | |
2839 mm_unit_t *un; | |
2840 mdi_unit_t *ui; | |
2841 int smi; /* sub mirror index */ | |
2842 int ci; /* errored component */ | |
2843 set_t setno; | |
2844 uint_t flags; /* for set_sm_comp_state() */ | |
2845 uint_t hspflags; /* for check_comp_4_hotspares() */ | |
2846 | |
2847 ui = ps->ps_ui; | |
2848 un = (mm_unit_t *)md_unit_writerlock(ui); | |
2849 setno = MD_UN2SET(un); | |
2850 | |
2851 /* All of these updates have to propagated in case of MN set */ | |
2852 flags = MD_STATE_XMIT; | |
2853 hspflags = MD_HOTSPARE_XMIT; | |
2854 | |
2855 /* special treatment if we are called during updating watermarks */ | |
2856 if (ps->ps_flags & MD_MPS_WMUPDATE) { | |
2857 flags |= MD_STATE_WMUPDATE; | |
2858 hspflags |= MD_HOTSPARE_WMUPDATE; | |
2859 } | |
2860 smi = 0; | |
2861 ci = 0; | |
2862 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { | |
2863 if (mirror_other_sources(un, smi, ci, 0) == 1) { | |
2864 | |
2865 /* Never called from ioctl context, so (IOLOCK *)NULL */ | |
2866 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags, | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
2867 (IOLOCK *)NULL); |
0 | 2868 /* |
2869 * For a MN set, the NOTIFY is done when the state | |
2870 * change is processed on each node | |
2871 */ | |
2872 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { | |
2873 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, | |
2874 SVM_TAG_METADEVICE, setno, MD_SID(un)); | |
2875 } | |
2876 continue; | |
2877 } | |
2878 /* Never called from ioctl context, so (IOLOCK *)NULL */ | |
2879 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags, | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
2880 (IOLOCK *)NULL); |
0 | 2881 /* |
2882 * For a MN set, the NOTIFY is done when the state | |
2883 * change is processed on each node | |
2884 */ | |
2885 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { | |
2886 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, | |
2887 SVM_TAG_METADEVICE, setno, MD_SID(un)); | |
2888 } | |
2889 smi = 0; | |
2890 ci = 0; | |
2891 } | |
2892 | |
2893 md_unit_writerexit(ui); | |
2894 if (MD_MNSET_SETNO(setno)) { | |
2895 send_poke_hotspares(setno); | |
2896 } else { | |
2897 (void) poke_hotspares(); | |
2898 } | |
2899 (void) md_unit_readerlock(ui); | |
2900 | |
2901 finish_error(ps); | |
2902 } | |
2903 | |
2904 /* | |
2905 * When we have a B_FAILFAST IO error on a Last Erred component we need to | |
2906 * retry the IO without B_FAILFAST set so that we try to ensure that the | |
2907 * component "sees" each IO. | |
2908 */ | |
2909 static void | |
2910 last_err_retry(md_mcs_t *cs) | |
2911 { | |
2912 struct buf *cb; | |
2913 md_mps_t *ps; | |
2914 uint_t flags; | |
2915 | |
2916 cb = &cs->cs_buf; | |
2917 cb->b_flags &= ~B_FAILFAST; | |
2918 | |
2919 /* if we're panicing just let this I/O error out */ | |
2920 if (panicstr) { | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
2921 (void) mirror_done(cb); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
2922 return; |
0 | 2923 } |
2924 | |
2925 /* reissue the I/O */ | |
2926 | |
2927 ps = cs->cs_ps; | |
2928 | |
2929 bioerror(cb, 0); | |
2930 | |
2931 mutex_enter(&ps->ps_mx); | |
2932 | |
2933 flags = MD_STR_NOTTOP; | |
2934 if (ps->ps_flags & MD_MPS_MAPPED) | |
2935 flags |= MD_STR_MAPPED; | |
2936 if (ps->ps_flags & MD_MPS_NOBLOCK) | |
2937 flags |= MD_NOBLOCK; | |
2938 | |
2939 mutex_exit(&ps->ps_mx); | |
2940 | |
2941 clear_retry_error(cb); | |
2942 | |
2943 cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST", | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
2944 md_shortname(getminor(cb->b_edev))); |
0 | 2945 |
2946 md_call_strategy(cb, flags, NULL); | |
2947 } | |
2948 | |
/*
 * Error-handling continuation for a failed mirror parent I/O.  Determines
 * whether the error requires a unit state update (component transition),
 * and if so hands the work to the master daemon; otherwise completes the
 * error processing inline via finish_error().
 */
static void
mirror_error(md_mps_t *ps)
{
	int		smi;	/* sub mirror index */
	int		ci;	/* errored component */

	/* When panicking, skip all state machinery and just finish the I/O */
	if (panicstr) {
		finish_error(ps);
		return;
	}

	/* Drop the request from the overlap tree before erroring it out */
	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
		mirror_overlap_tree_remove(ps);

	smi = 0;
	ci = 0;
	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
		/*
		 * A component needs a state change; release our reader
		 * lock and defer to error_update_unit, which will need
		 * the unit writerlock.
		 */
		md_unit_readerexit(ps->ps_ui);
		daemon_request(&md_mstr_daemon, error_update_unit,
		    (daemon_queue_t *)ps, REQ_OLD);
		return;
	}

	finish_error(ps);
}
2974 | |
/*
 * Completion routine for one section of a write-on-write (WOW) copy write.
 * Propagates any child error to the parent buf, frees the child, and either
 * schedules the next section via copy_write_cont() or finishes the parent
 * request and releases all WOW resources.
 *
 * Returns 1 if more work was queued, 0 when the parent I/O is complete.
 */
static int
copy_write_done(struct buf *cb)
{
	md_mps_t	*ps;
	buf_t		*pb;
	char		*wowbuf;
	wowhdr_t	*wowhdr;
	ssize_t		wow_resid;

	/* get wowbuf and save structure */
	wowbuf = cb->b_un.b_addr;
	wowhdr = WOWBUF_HDR(wowbuf);
	ps = wowhdr->wow_ps;
	pb = ps->ps_bp;

	/* Save error information, then free cb */
	if (cb->b_flags & B_ERROR)
		pb->b_flags |= B_ERROR;

	if (cb->b_flags & B_REMAPPED)
		bp_mapout(cb);

	freerbuf(cb);

	/* update residual and continue if needed */
	if ((pb->b_flags & B_ERROR) == 0) {
		wow_resid = pb->b_bcount - wowhdr->wow_offset;
		pb->b_resid = wow_resid;
		if (wow_resid > 0) {
			/* More sections to copy: continue on the daemon */
			daemon_request(&md_mstr_daemon, copy_write_cont,
			    (daemon_queue_t *)wowhdr, REQ_OLD);
			return (1);
		}
	}

	/* Write is complete, release resources. */
	kmem_cache_free(mirror_wowblk_cache, wowhdr);
	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
	MPS_FREE(mirror_parent_cache, ps);
	md_biodone(pb);
	return (0);
}
3018 | |
/*
 * Issue the next section of a write-on-write (WOW) copy write.  Copies up to
 * md_wowbuf_size bytes of the parent's data into the private WOW buffer and
 * submits it as a fresh child write; copy_write_done() advances to the next
 * section when this one completes.
 */
static void
copy_write_cont(wowhdr_t *wowhdr)
{
	buf_t		*pb;
	buf_t		*cb;
	char		*wowbuf;
	int		wow_offset;
	size_t		wow_resid;
	diskaddr_t	wow_blkno;

	wowbuf = WOWHDR_BUF(wowhdr);
	pb = wowhdr->wow_ps->ps_bp;

	/* get data on current location */
	wow_offset = wowhdr->wow_offset;
	wow_resid = pb->b_bcount - wow_offset;
	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);

	/* setup child buffer */
	cb = getrbuf(KM_SLEEP);
	cb->b_flags = B_WRITE;
	cb->b_edev = pb->b_edev;
	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
	cb->b_iodone = copy_write_done;
	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
	cb->b_lblkno = wow_blkno;

	/* move offset to next section */
	wowhdr->wow_offset += cb->b_bcount;

	/* copy and setup write for current section */
	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);

	/* do it */
	/*
	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
	 * that handles the WOW condition. The resultant increment on the
	 * I/O count variable is cleared by copy_write_done()'s call to
	 * md_biodone().
	 */
	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
	    | MD_STR_MAPPED, NULL);
}
3063 | |
3064 static void | |
3065 md_mirror_copy_write(md_mps_t *ps) | |
3066 { | |
3067 wowhdr_t *wowhdr; | |
3068 | |
3069 wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS); | |
3070 mirror_wowblk_init(wowhdr); | |
3071 wowhdr->wow_ps = ps; | |
3072 wowhdr->wow_offset = 0; | |
3073 copy_write_cont(wowhdr); | |
3074 } | |
3075 | |
/*
 * Handle a detected write-on-write (WOW) condition: pages backing the write
 * were dirtied while the I/O was in flight.  Depending on md_mirror_wow_flg,
 * either reissue the original buf directly (WOW_NOCOPY) or perform a copy
 * write through a private buffer so all submirrors see identical data.
 */
static void
handle_wow(md_mps_t *ps)
{
	buf_t		*pb;

	pb = ps->ps_bp;

	/* Map the parent data in so it can be examined/copied */
	bp_mapin(pb);

	md_mirror_wow_cnt++;
	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
		cmn_err(CE_NOTE,
		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
		    md_shortname(getminor(pb->b_edev)),
		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
	}

	/*
	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
	 * operation therefore this I/O request has already been counted,
	 * the I/O count variable will be decremented by mirror_done()'s
	 * call to md_biodone().
	 */
	if (md_mirror_wow_flg & WOW_NOCOPY)
		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
		    MD_STR_MAPPED | MD_IO_COUNTED, ps);
	else
		md_mirror_copy_write(ps);
}
3105 | |
3106 /* | |
3107 * Return true if the specified submirror is either in the Last Erred | |
3108 * state or is transitioning into the Last Erred state. | |
3109 */ | |
3110 static bool_t | |
3111 submirror_is_lasterred(mm_unit_t *un, int smi) | |
3112 { | |
3113 mm_submirror_t *sm; | |
3114 mm_submirror_ic_t *smic; | |
3115 md_m_shared_t *shared; | |
3116 int ci; | |
3117 int compcnt; | |
3118 | |
3119 sm = &un->un_sm[smi]; | |
3120 smic = &un->un_smic[smi]; | |
3121 | |
3122 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); | |
3123 for (ci = 0; ci < compcnt; ci++) { | |
3124 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) | |
3125 (sm->sm_dev, sm, ci); | |
3126 | |
3127 if (shared->ms_state == CS_LAST_ERRED) | |
3128 return (B_TRUE); | |
3129 | |
3130 /* | |
3131 * It is not currently Last Erred, check if entering Last Erred. | |
3132 */ | |
3133 if ((shared->ms_flags & MDM_S_IOERR) && | |
3134 ((shared->ms_state == CS_OKAY) || | |
3135 (shared->ms_state == CS_RESYNC))) { | |
3136 if (mirror_other_sources(un, smi, ci, 0) == 1) | |
3137 return (B_TRUE); | |
3138 } | |
3139 } | |
3140 | |
3141 return (B_FALSE); | |
3142 } | |
3143 | |
3144 | |
/*
 * Child-buf completion routine for mirror I/O.  If the child failed with
 * B_FAILFAST and the errored submirror is (or is becoming) Last Erred, the
 * I/O is requeued via last_err_retry to be reissued without B_FAILFAST.
 * Otherwise error state is recorded on the parent and processing continues
 * in mirror_done_common() with ps->ps_mx still held.
 */
static int
mirror_done(struct buf *cb)
{
	md_mps_t	*ps;
	md_mcs_t	*cs;

	/*LINTED*/
	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
	ps = cs->cs_ps;

	mutex_enter(&ps->ps_mx);

	/* check if we need to retry an errored failfast I/O */
	if (cb->b_flags & B_ERROR) {
		struct buf *pb = ps->ps_bp;

		if (cb->b_flags & B_FAILFAST) {
			int	i;
			mm_unit_t	*un = ps->ps_un;

			/* Find the submirror this child was issued to */
			for (i = 0; i < NMIRROR; i++) {
				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
					continue;

				if (cb->b_edev ==
				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {

					/*
					 * This is the submirror that had the
					 * error.  Check if it is Last Erred.
					 */
					if (submirror_is_lasterred(un, i)) {
						daemon_queue_t *dqp;

						/*
						 * Drop ps_mx before handing
						 * the child to the retry
						 * daemon.
						 */
						mutex_exit(&ps->ps_mx);
						dqp = (daemon_queue_t *)cs;
						dqp->dq_prev = NULL;
						dqp->dq_next = NULL;
						daemon_request(&md_done_daemon,
						    last_err_retry, dqp,
						    REQ_OLD);
						return (1);
					}
					break;
				}
			}
		}

		/* continue to process the buf without doing a retry */
		ps->ps_flags |= MD_MPS_ERROR;
		pb->b_error = cb->b_error;
	}

	/* ps_mx is still held; mirror_done_common() releases it */
	return (mirror_done_common(cb));
}
3200 | |
/*
 * Split from the original mirror_done function so we can handle bufs after a
 * retry.
 * ps->ps_mx is already held in the caller of this function and the cb error
 * has already been checked and handled in the caller.
 *
 * Decrements the outstanding-fragment count; when the last fragment finishes,
 * updates resync bookkeeping, dispatches any continuation (ps_call), defers
 * error handling to mirror_error, handles write-on-write detection, and
 * finally completes the parent buf.  Returns 1 if processing was deferred,
 * 0 when the parent I/O was fully completed here.
 */
static int
mirror_done_common(struct buf *cb)
{
	struct buf	*pb;
	mm_unit_t	*un;
	mdi_unit_t	*ui;
	md_mps_t	*ps;
	md_mcs_t	*cs;
	size_t		end_rr, start_rr, current_rr;

	/*LINTED*/
	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
	ps = cs->cs_ps;
	pb = ps->ps_bp;

	if (cb->b_flags & B_REMAPPED)
		bp_mapout(cb);

	/* Not the last child to complete: just free this child and return */
	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		mutex_exit(&ps->ps_mx);
		kmem_cache_free(mirror_child_cache, cs);
		return (1);
	}
	un = ps->ps_un;
	ui = ps->ps_ui;

	/*
	 * Do not update outstanding_writes if we're running with ABR
	 * set for this mirror or the write() was issued with MD_STR_ABR set.
	 * Also a resync initiated write() has no outstanding_writes update
	 * either.
	 */
	if (((cb->b_flags & B_READ) == 0) &&
	    (un->un_nsm >= 2) &&
	    (ps->ps_call == NULL) &&
	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
		mutex_enter(&un->un_resync_mx);
		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
			un->un_outstanding_writes[current_rr]--;
		mutex_exit(&un->un_resync_mx);
	}
	kmem_cache_free(mirror_child_cache, cs);
	mutex_exit(&ps->ps_mx);

	/* Continuation routine (e.g. write_after_read) takes over */
	if (ps->ps_call != NULL) {
		daemon_request(&md_done_daemon, ps->ps_call,
		    (daemon_queue_t *)ps, REQ_OLD);
		return (1);
	}

	/* Defer error processing to a blockable context */
	if ((ps->ps_flags & MD_MPS_ERROR)) {
		daemon_request(&md_done_daemon, mirror_error,
		    (daemon_queue_t *)ps, REQ_OLD);
		return (1);
	}

	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
		mirror_overlap_tree_remove(ps);

	/*
	 * Handle Write-on-Write problem.
	 * Skip In case of Raw and Direct I/O as they are
	 * handled earlier.
	 *
	 */
	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
	    !(pb->b_flags & B_READ) &&
	    !(ps->ps_flags & MD_MPS_WOW) &&
	    !(pb->b_flags & B_PHYS) &&
	    any_pages_dirty(pb)) {
		md_unit_readerexit(ps->ps_ui);
		daemon_request(&md_mstr_daemon, handle_wow,
		    (daemon_queue_t *)ps, REQ_OLD);
		return (1);
	}

	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
	MPS_FREE(mirror_parent_cache, ps);
	md_unit_readerexit(ui);
	md_biodone(pb);
	return (0);
}
3293 | |
3294 /* | |
3295 * Clear error state in submirror component if the retry worked after | |
3296 * a failfast error. | |
3297 */ | |
3298 static void | |
3299 clear_retry_error(struct buf *cb) | |
3300 { | |
3301 int smi; | |
3302 md_mcs_t *cs; | |
3303 mm_unit_t *un; | |
3304 mdi_unit_t *ui_sm; | |
3305 mm_submirror_t *sm; | |
3306 mm_submirror_ic_t *smic; | |
3307 u_longlong_t cnt; | |
3308 md_m_shared_t *shared; | |
3309 | |
3310 /*LINTED*/ | |
3311 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); | |
3312 un = cs->cs_ps->ps_un; | |
3313 | |
3314 for (smi = 0; smi < NMIRROR; smi++) { | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3315 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3316 continue; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3317 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3318 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3319 break; |
0 | 3320 } |
3321 | |
3322 if (smi >= NMIRROR) | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3323 return; |
0 | 3324 |
3325 sm = &un->un_sm[smi]; | |
3326 smic = &un->un_smic[smi]; | |
3327 cnt = cb->b_bcount; | |
3328 | |
3329 ui_sm = MDI_UNIT(getminor(cb->b_edev)); | |
3330 (void) md_unit_writerlock(ui_sm); | |
3331 | |
3332 shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm, | |
3333 cb->b_blkno, &cnt); | |
3334 | |
3335 if (shared->ms_flags & MDM_S_IOERR) { | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3336 shared->ms_flags &= ~MDM_S_IOERR; |
0 | 3337 |
3338 } else { | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3339 /* the buf spans components and the first one is not erred */ |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3340 int cnt; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3341 int i; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3342 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3343 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3344 for (i = 0; i < cnt; i++) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3345 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3346 (sm->sm_dev, sm, i); |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3347 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3348 if (shared->ms_flags & MDM_S_IOERR && |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3349 shared->ms_state == CS_OKAY) { |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3350 |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3351 shared->ms_flags &= ~MDM_S_IOERR; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3352 break; |
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
3353 } |
0 | 3354 } |
3355 } | |
3356 | |
3357 md_unit_writerexit(ui_sm); | |
3358 } | |
3359 | |
/*
 * Map a mirror read onto a submirror.  Selects the submirror to read from
 * (fast path first, then the full selection) and sets the child buf's
 * device, block and byte count accordingly.
 *
 * Returns 0 if the whole "count" blocks can be satisfied by the chosen
 * submirror, otherwise the number of blocks that can be done (the caller
 * must issue further fragments for the remainder).
 */
static size_t
mirror_map_read(
	md_mps_t *ps,
	md_mcs_t *cs,
	diskaddr_t blkno,
	u_longlong_t	count
)
{
	mm_unit_t	*un;
	buf_t		*bp;
	u_longlong_t	cando;

	bp = &cs->cs_buf;
	un = ps->ps_un;

	bp->b_lblkno = blkno;
	/* Fast path: resync-free selection already set bp->b_edev */
	if (fast_select_read_unit(ps, cs) == 0) {
		bp->b_bcount = ldbtob(count);
		return (0);
	}
	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
	    count, &cando, 0, NULL, cs));
	bp->b_bcount = ldbtob(cando);
	if (count != cando)
		return (cando);
	return (0);
}
3387 | |
3388 static void | |
3389 write_after_read(md_mps_t *ps) | |
3390 { | |
3391 struct buf *pb; | |
3392 int flags; | |
3393 | |
3394 if (ps->ps_flags & MD_MPS_ERROR) { | |
3395 mirror_error(ps); | |
3396 return; | |
3397 } | |
3398 | |
3399 pb = ps->ps_bp; | |
3400 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); | |
3401 ps->ps_call = NULL; | |
3402 ps->ps_flags |= MD_MPS_WRITE_AFTER_READ; | |
3403 flags = MD_STR_NOTTOP | MD_STR_WAR; | |
3404 if (ps->ps_flags & MD_MPS_MAPPED) | |
3405 flags |= MD_STR_MAPPED; | |
3406 if (ps->ps_flags & MD_MPS_NOBLOCK) | |
3407 flags |= MD_NOBLOCK; | |
3408 if (ps->ps_flags & MD_MPS_DIRTY_RD) | |
3409 flags |= MD_STR_DIRTY_RD; | |
3410 (void) mirror_write_strategy(pb, flags, ps); | |
3411 } | |
3412 | |
/*
 * Continuation for WR_SERIAL (serial write option) mirrors: issue the write
 * to the next submirror once the previous one has completed.  Allocates a
 * fresh child, maps it to the next writable submirror via mirror_map_write()
 * and issues it.
 */
static void
continue_serial(md_mps_t *ps)
{
	md_mcs_t	*cs;
	buf_t		*cb;
	mm_unit_t	*un;
	int		flags;

	un = ps->ps_un;
	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
	mirror_child_init(cs);
	cb = &cs->cs_buf;
	/* One fragment outstanding; mirror_map_write may re-arm ps_call */
	ps->ps_call = NULL;
	ps->ps_frags = 1;
	(void) mirror_map_write(un, cs, ps, 0);
	flags = MD_STR_NOTTOP;
	if (ps->ps_flags & MD_MPS_MAPPED)
		flags |= MD_STR_MAPPED;
	md_call_strategy(cb, flags, NULL);
}
3433 | |
/*
 * Map a mirror write onto the next writable submirror.  Clones the parent
 * buf onto the child, skipping the disk label area for write-after-read
 * ("war") requests on labeled devices, and applies B_FAILFAST when the
 * submirror allows it and no component is in the Last Erred state.
 *
 * Returns -1 if the request is entirely within the label area (caller must
 * error it), 1 if more submirrors remain to be written in parallel, 0 when
 * this was the last submirror (or the next will be driven serially via
 * continue_serial).
 */
static int
mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
{
	int i;
	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
	buf_t		*cb;
	buf_t		*pb;
	diskaddr_t	blkno;
	size_t		bcount;
	off_t		offset;

	pb = ps->ps_bp;
	cb = &cs->cs_buf;
	cs->cs_ps = ps;

	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);

	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);

	blkno = pb->b_lblkno;
	bcount = pb->b_bcount;
	offset = 0;
	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
		blkno = DK_LABEL_LOC + 1;
		/*
		 * This handles the case where we're requesting
		 * a write to block 0 on a label partition
		 * and the request size was smaller than the
		 * size of the label.  If this is the case
		 * then we'll return -1. Failure to do so will
		 * either cause the calling thread to hang due to
		 * an ssd bug, or worse if the bcount were allowed
		 * to go negative (ie large).
		 */
		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
			return (-1);
		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
	}

	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
	    cb, KM_NOSLEEP);
	if (war)
		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;

	/*
	 * If the submirror is in the erred stated, check if any component is
	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
	 * flag on the IO.
	 *
	 * Provide a fast path for the non-erred case (which should be the
	 * normal case).
	 */
	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
			mm_submirror_t		*sm;
			mm_submirror_ic_t	*smic;
			int			ci;
			int			compcnt;

			sm = &un->un_sm[i];
			smic = &un->un_smic[i];

			compcnt = (*(smic->sm_get_component_count))
			    (sm->sm_dev, un);
			for (ci = 0; ci < compcnt; ci++) {
				md_m_shared_t	*shared;

				shared = (md_m_shared_t *)
				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
				    sm, ci);

				if (shared->ms_state == CS_LAST_ERRED)
					break;
			}
			/* B_FAILFAST only if no Last Erred component found */
			if (ci >= compcnt)
				cb->b_flags |= B_FAILFAST;

		} else {
			cb->b_flags |= B_FAILFAST;
		}
	}

	ps->ps_current_sm++;
	if (ps->ps_current_sm != ps->ps_active_cnt) {
		if (un->un_write_option == WR_SERIAL) {
			/* Next submirror is written after this completes */
			ps->ps_call = continue_serial;
			return (0);
		}
		return (1);
	}
	return (0);
}
3527 | |
/*
 * directed_read_done:
 * ------------------
 * Completion routine called when a DMR request has been returned from the
 * underlying driver. Wake-up the original ioctl() and return the data to
 * the user.
 */
static void
directed_read_done(md_mps_t *ps)
{
	mm_unit_t	*un;
	mdi_unit_t	*ui;

	un = ps->ps_un;
	ui = ps->ps_ui;

	md_unit_readerexit(ui);
	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
	ps->ps_call = NULL;

	/* Wake the ioctl() thread blocked on un_dmr_cv */
	mutex_enter(&un->un_dmr_mx);
	cv_signal(&un->un_dmr_cv);
	mutex_exit(&un->un_dmr_mx);

	/* release the parent structure */
	kmem_cache_free(mirror_parent_cache, ps);
}
3555 | |
3556 /* | |
3557 * daemon_io: | |
3558 * ------------ | |
3559 * Called to issue a mirror_write_strategy() or mirror_read_strategy | |
3560 * call from a blockable context. NOTE: no mutex can be held on entry to this | |
3561 * routine | |
3562 */ | |
3563 static void | |
3564 daemon_io(daemon_queue_t *dq) | |
3565 { | |
3566 md_mps_t *ps = (md_mps_t *)dq; | |
3567 int flag = MD_STR_NOTTOP; | |
3568 buf_t *pb = ps->ps_bp; | |
3569 | |
3570 if (ps->ps_flags & MD_MPS_MAPPED) | |
3571 flag |= MD_STR_MAPPED; | |
3572 if (ps->ps_flags & MD_MPS_WOW) | |
3573 flag |= MD_STR_WOW; | |
3574 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) | |
3575 flag |= MD_STR_WAR; | |
3576 if (ps->ps_flags & MD_MPS_ABR) | |
3577 flag |= MD_STR_ABR; | |
7975
f7037f0cdac8
6701425 SVM: Multi-owner disksets do not work well with filesystems
Achim Maurer <Achim.Maurer@Sun.COM>
parents:
7627
diff
changeset
|
3578 if (ps->ps_flags & MD_MPS_BLOCKABLE_IO) |
f7037f0cdac8
6701425 SVM: Multi-owner disksets do not work well with filesystems
Achim Maurer <Achim.Maurer@Sun.COM>
parents:
7627
diff
changeset
|
3579 flag |= MD_STR_BLOCK_OK; |
0 | 3580 |
3581 /* | |
3582 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set | |
3583 * MD_STR_WAR before calling mirror_read_strategy | |
3584 */ | |
3585 if (pb->b_flags & B_READ) { | |
3586 if (!(ps->ps_flags & MD_MPS_DIRTY_RD)) | |
3587 flag |= MD_STR_WAR; | |
3588 mirror_read_strategy(pb, flag, ps); | |
3589 } else | |
3590 mirror_write_strategy(pb, flag, ps); | |
3591 } | |
3592 | |
/*
 * update_resync:
 * -------------
 * Called to update the in-core version of the resync record with the latest
 * version that was committed to disk when the previous mirror owner
 * relinquished ownership. This call is likely to block as we must hold-off
 * any current resync processing that may be occurring.
 * On completion of the resync record update we issue the mirror_write_strategy
 * call to complete the i/o that first started this sequence. To remove a race
 * condition between a new write() request which is submitted and the resync
 * record update we acquire the writerlock. This will hold off all i/o to the
 * mirror until the resync update has completed.
 * NOTE: no mutex can be held on entry to this routine
 */
static void
update_resync(daemon_queue_t *dq)
{
	md_mps_t	*ps = (md_mps_t *)dq;
	buf_t		*pb = ps->ps_bp;
	mdi_unit_t	*ui = ps->ps_ui;
	mm_unit_t	*un = MD_UNIT(ui->ui_link.ln_id);
	set_t		setno;
	int		restart_resync;

	/* un_rrp_inflight_mx is taken before the unit writerlock */
	mutex_enter(&un->un_rrp_inflight_mx);
	(void) md_unit_writerlock(ui);
	ps->ps_un = un;
	setno = MD_MIN2SET(getminor(pb->b_edev));
	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
		/*
		 * Synchronize our in-core view of what regions need to be
		 * resync'd with the on-disk version.
		 */
		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
		    un->un_dirty_bm);

		/* Region dirty map is now up to date */
	}
	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
	md_unit_writerexit(ui);
	mutex_exit(&un->un_rrp_inflight_mx);

	/* Restart the resync thread if it was previously blocked */
	if (restart_resync) {
		mutex_enter(&un->un_rs_thread_mx);
		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
		cv_signal(&un->un_rs_thread_cv);
		mutex_exit(&un->un_rs_thread_mx);
	}
	/* Continue with original deferred i/o */
	daemon_io(dq);
}
3645 | |
3646 /* | |
3647 * owner_timeout: | |
3648 * ------------- | |
3649 * Called if the original mdmn_ksend_message() failed and the request is to be | |
3650 * retried. Reattempt the original ownership change. | |
3651 * | |
3652 * NOTE: called at interrupt context (see timeout(9f)). | |
3653 */ | |
3654 static void | |
3655 owner_timeout(void *arg) | |
3656 { | |
3657 daemon_queue_t *dq = (daemon_queue_t *)arg; | |
3658 | |
3659 daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD); | |
3660 } | |
3661 | |
/*
 * become_owner:
 * ------------
 * Called to issue RPC request to become the owner of the mirror
 * associated with this i/o request. We assume that the ownership request
 * is synchronous, so if it succeeds we will issue the request via
 * mirror_write_strategy().
 * If multiple i/o's are outstanding we will be called from the mirror_daemon
 * service thread.
 * NOTE: no mutex should be held on entry to this routine.
 */
static void
become_owner(daemon_queue_t *dq)
{
	md_mps_t	*ps = (md_mps_t *)dq;
	mm_unit_t	*un = ps->ps_un;
	buf_t		*pb = ps->ps_bp;
	set_t		setno;
	md_mn_kresult_t	*kres;
	int		msg_flags = md_mirror_msg_flags;
	md_mps_t	*ps1;

	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);

	/*
	 * If we're already the mirror owner we do not need to send a message
	 * but can simply process the i/o request immediately.
	 * If we've already sent the request to become owner we requeue the
	 * request as we're waiting for the synchronous ownership message to
	 * be processed.
	 */
	if (MD_MN_MIRROR_OWNER(un)) {
		/*
		 * As the strategy() call will potentially block we need to
		 * punt this to a separate thread and complete this request
		 * as quickly as possible. Note: if we're a read request
		 * this must be a resync, we cannot afford to be queued
		 * behind any intervening i/o requests. In this case we put the
		 * request on the md_mirror_rs_daemon queue.
		 */
		if (pb->b_flags & B_READ) {
			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
			    REQ_OLD);
		} else {
			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
			    REQ_OLD);
		}
	} else {
		mutex_enter(&un->un_owner_mx);
		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
			md_mn_req_owner_t	*msg;
			int			rval = 0;

			/*
			 * Check to see that we haven't exceeded the maximum
			 * retry count. If we have we fail the i/o as the
			 * comms mechanism has become wedged beyond recovery.
			 */
			if (dq->qlen++ >= MD_OWNER_RETRIES) {
				mutex_exit(&un->un_owner_mx);
				cmn_err(CE_WARN,
				    "md_mirror: Request exhausted ownership "
				    "retry limit of %d attempts", dq->qlen);
				pb->b_error = EIO;
				pb->b_flags |= B_ERROR;
				pb->b_resid = pb->b_bcount;
				kmem_cache_free(mirror_parent_cache, ps);
				md_biodone(pb);
				return;
			}

			/*
			 * Issue request to change ownership. The call is
			 * synchronous so when it returns we can complete the
			 * i/o (if successful), or enqueue it again so that
			 * the operation will be retried.
			 */
			un->un_owner_state |= MM_MN_OWNER_SENT;
			mutex_exit(&un->un_owner_mx);

			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
			setno = MD_MIN2SET(getminor(pb->b_edev));
			msg->mnum = MD_SID(un);
			msg->owner = md_mn_mynode_id;
			msg_flags |= MD_MSGF_NO_LOG;
			/*
			 * If this IO is triggered by updating a watermark,
			 * it might be issued by the creation of a softpartition
			 * while the commd subsystem is suspended.
			 * We don't want this message to block.
			 */
			if (ps->ps_flags & MD_MPS_WMUPDATE) {
				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
			}

			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
			rval = mdmn_ksend_message(setno,
			    MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
			    (char *)msg, sizeof (md_mn_req_owner_t), kres);

			kmem_free(msg, sizeof (md_mn_req_owner_t));

			if (MDMN_KSEND_MSG_OK(rval, kres)) {
				dq->qlen = 0;
				/*
				 * Successfully changed owner, reread the
				 * resync record so that we have a valid idea of
				 * any previously committed incomplete write()s.
				 * NOTE: As we need to acquire the resync mutex
				 * this may block, so we defer it to a separate
				 * thread handler. This makes us (effectively)
				 * non-blocking once the ownership message
				 * handling has completed.
				 */
				mutex_enter(&un->un_owner_mx);
				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
					un->un_mirror_owner = md_mn_mynode_id;
					/* Sets owner of un_rr_dirty record */
					if (un->un_rr_dirty_recid)
						(void) mddb_setowner(
						    un->un_rr_dirty_recid,
						    md_mn_mynode_id);
					un->un_owner_state &=
					    ~MM_MN_BECOME_OWNER;
					/*
					 * Release the block on the current
					 * resync region if it is blocked
					 */
					ps1 = un->un_rs_prev_overlap;
					if ((ps1 != NULL) &&
					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
						mirror_overlap_tree_remove(ps1);
					mutex_exit(&un->un_owner_mx);

					/*
					 * If we're a read, this must be a
					 * resync request, issue
					 * the i/o request on the
					 * md_mirror_rs_daemon queue. This is
					 * to avoid a deadlock between the
					 * resync_unit thread and
					 * subsequent i/o requests that may
					 * block on the resync region.
					 */
					if (pb->b_flags & B_READ) {
						daemon_request(
						    &md_mirror_rs_daemon,
						    update_resync, dq, REQ_OLD);
					} else {
						daemon_request(
						    &md_mirror_io_daemon,
						    update_resync, dq, REQ_OLD);
					}
					kmem_free(kres,
					    sizeof (md_mn_kresult_t));
					return;
				} else {
					/*
					 * Some other node has beaten us to
					 * obtain ownership. We need to
					 * reschedule our ownership request
					 */
					mutex_exit(&un->un_owner_mx);
				}
			} else {
				mdmn_ksend_show_error(rval, kres,
				    "MD_MN_MSG_REQUIRE_OWNER");
				/*
				 * Message transport failure is handled by the
				 * comms layer. If the ownership change request
				 * does not succeed we need to flag the error to
				 * the initiator of the i/o. This is handled by
				 * the retry logic above. As the request failed
				 * we do not know _who_ the owner of the mirror
				 * currently is. We reset our idea of the owner
				 * to None so that any further write()s will
				 * attempt to become the owner again. This stops
				 * multiple nodes writing to the same mirror
				 * simultaneously.
				 */
				mutex_enter(&un->un_owner_mx);
				un->un_owner_state &=
				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
				mutex_exit(&un->un_owner_mx);
			}
			kmem_free(kres, sizeof (md_mn_kresult_t));
		} else
			mutex_exit(&un->un_owner_mx);

		/*
		 * Re-enqueue this request on the deferred i/o list. Delay the
		 * request for md_mirror_owner_to usecs to stop thrashing.
		 */
		(void) timeout(owner_timeout, dq,
		    drv_usectohz(md_mirror_owner_to));
	}
}
3860 | |
/*
 * mirror_write_strategy:
 * ---------------------
 * Top-level write path for a mirror metadevice.  Clones the parent buf (pb)
 * into per-submirror child bufs and issues them via md_call_strategy().
 * Handles multi-node (MN) set concerns: suspended writes during state
 * changes, mirror ownership acquisition, resync-region marking and overlap
 * blocking.  Several paths defer the request to a daemon queue and return
 * early rather than block in this context.
 *
 * Parameters:
 *	pb	- parent buf describing the write
 *	flag	- MD_STR_* strategy flags (WAR = write-after-read resync i/o,
 *		  ABR = application-based recovery, NOTTOP = internally
 *		  generated, BLOCK_OK = issued from daemon_io so busy-wait
 *		  is permitted, ...)
 *	private	- pre-built parent save structure (md_mps_t) when re-entered
 *		  from a daemon queue; NULL on first entry
 */
static void
mirror_write_strategy(buf_t *pb, int flag, void *private)
{
	md_mps_t	*ps;
	md_mcs_t	*cs;
	int		more;
	mm_unit_t	*un;
	mdi_unit_t	*ui;
	buf_t		*cb;		/* child buf pointer */
	set_t		setno;
	int		rs_on_overlap = 0;

	ui = MDI_UNIT(getminor(pb->b_edev));
	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));


	md_kstat_waitq_enter(ui);

	/*
	 * If a state change is in progress for this mirror in a MN set,
	 * suspend all non-resync writes until the state change is complete.
	 * The objective of this suspend is to ensure that it is not
	 * possible for one node to read data from a submirror that another node
	 * has not written to because of the state change. Therefore we
	 * suspend all writes until the state change has been made. As it is
	 * not possible to read from the target of a resync, there is no need
	 * to suspend resync writes.
	 * Note that we only block here if the caller can handle a busy-wait.
	 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
	 */

	if (!(flag & MD_STR_WAR)) {
		if (flag & MD_STR_BLOCK_OK) {
			mutex_enter(&un->un_suspend_wr_mx);
			while (un->un_suspend_wr_flag) {
				cv_wait(&un->un_suspend_wr_cv,
				    &un->un_suspend_wr_mx);
			}
			mutex_exit(&un->un_suspend_wr_mx);
		}
		(void) md_unit_readerlock(ui);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		/* md_checkbuf errors out the buf itself on failure */
		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	setno = MD_MIN2SET(getminor(pb->b_edev));

	/* If an ABR write has been requested, set MD_STR_ABR flag */
	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
		flag |= MD_STR_ABR;

	if (private == NULL) {
		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
		mirror_parent_init(ps);
	} else {
		/* Re-entry from a daemon queue: reuse the saved parent */
		ps = private;
		private = NULL;
	}
	if (flag & MD_STR_MAPPED)
		ps->ps_flags |= MD_MPS_MAPPED;

	if (flag & MD_STR_WOW)
		ps->ps_flags |= MD_MPS_WOW;

	if (flag & MD_STR_ABR)
		ps->ps_flags |= MD_MPS_ABR;

	if (flag & MD_STR_WMUPDATE)
		ps->ps_flags |= MD_MPS_WMUPDATE;

	/*
	 * Save essential information from the original buffhdr
	 * in the md_save structure.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = pb;
	ps->ps_addr = pb->b_un.b_addr;
	ps->ps_firstblk = pb->b_lblkno;
	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
	ps->ps_changecnt = un->un_changecnt;

	/*
	 * Check for suspended writes here. This is where we can defer the
	 * write request to the daemon_io queue which will then call us with
	 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
	 * the top of this routine.
	 */
	if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
		mutex_enter(&un->un_suspend_wr_mx);
		if (un->un_suspend_wr_flag) {
			ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
			mutex_exit(&un->un_suspend_wr_mx);
			md_unit_readerexit(ui);
			daemon_request(&md_mirror_daemon, daemon_io,
			    (daemon_queue_t *)ps, REQ_OLD);
			return;
		}
		mutex_exit(&un->un_suspend_wr_mx);
	}

	/*
	 * If not MN owner and this is an ABR write, make sure the current
	 * resync region is in the overlaps tree
	 */
	mutex_enter(&un->un_owner_mx);
	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
		md_mps_t	*ps1;
		/* Block the current resync region, if not already blocked */
		ps1 = un->un_rs_prev_overlap;

		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
		    (ps1->ps_lastblk != 0))) {
			/* Drop locks to avoid deadlock */
			mutex_exit(&un->un_owner_mx);
			md_unit_readerexit(ui);
			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
			rs_on_overlap = 1;
			(void) md_unit_readerlock(ui);
			mutex_enter(&un->un_owner_mx);
			/*
			 * Check to see if we have obtained ownership
			 * while waiting for overlaps. If we have, remove
			 * the resync_region entry from the overlap tree
			 */
			if (MD_MN_MIRROR_OWNER(un) &&
			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
				mirror_overlap_tree_remove(ps1);
				rs_on_overlap = 0;
			}
		}
	}
	mutex_exit(&un->un_owner_mx);


	/*
	 * following keep write after read from writing to the
	 * source in the case where it all came from one place
	 */
	if (flag & MD_STR_WAR) {
		int	abort_write = 0;
		/*
		 * We are perfoming a write-after-read. This is either as a
		 * result of a resync read or as a result of a read in a
		 * dirty resync region when the optimized resync is not
		 * complete. If in a MN set and a resync generated i/o,
		 * if the current block is not in the current
		 * resync region terminate the write as another node must have
		 * completed this resync region
		 */
		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
		    (!(flag & MD_STR_DIRTY_RD))) {
			if (!IN_RESYNC_REGION(un, ps))
				abort_write = 1;
		}
		if ((select_write_after_read_units(un, ps) == 0) ||
		    (abort_write)) {
#ifdef DEBUG
			if (mirror_debug_flag)
				printf("Abort resync write on %x, block %lld\n",
				    MD_SID(un), ps->ps_firstblk);
#endif
			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
				mirror_overlap_tree_remove(ps);
			kmem_cache_free(mirror_parent_cache, ps);
			md_kstat_waitq_exit(ui);
			md_unit_readerexit(ui);
			md_biodone(pb);
			return;
		}
	} else {
		select_write_units(un, ps);

		/* Drop readerlock to avoid deadlock */
		md_unit_readerexit(ui);
		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
		un = md_unit_readerlock(ui);
		/*
		 * For a MN set with an ABR write, if we are now the
		 * owner and we have a resync region in the overlap
		 * tree, remove the entry from overlaps and retry the write.
		 */

		if (MD_MNSET_SETNO(setno) &&
		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
			mutex_enter(&un->un_owner_mx);
			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
				mirror_overlap_tree_remove(ps);
				md_kstat_waitq_exit(ui);
				mutex_exit(&un->un_owner_mx);
				md_unit_readerexit(ui);
				daemon_request(&md_mirror_daemon, daemon_io,
				    (daemon_queue_t *)ps, REQ_OLD);
				return;
			}
			mutex_exit(&un->un_owner_mx);
		}
	}

	/*
	 * For Multinode mirrors with no owner and a Resync Region (not ABR)
	 * we need to become the mirror owner before continuing with the
	 * write(). For ABR mirrors we check that we 'own' the resync if
	 * we're in write-after-read mode. We do this _after_ ensuring that
	 * there are no overlaps to ensure that once we know that we are
	 * the owner, the readerlock will not be released until the write is
	 * complete. As a change of ownership in a MN set requires the
	 * writerlock, this ensures that ownership cannot be changed until
	 * the write is complete.
	 */
	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
		if (MD_MN_NO_MIRROR_OWNER(un))  {
			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
				mirror_overlap_tree_remove(ps);
			md_kstat_waitq_exit(ui);
			ASSERT(!(flag & MD_STR_WAR));
			md_unit_readerexit(ui);
			daemon_request(&md_mirror_daemon, become_owner,
			    (daemon_queue_t *)ps, REQ_OLD);
			return;
		}
	}

	/*
	 * Mark resync region if mirror has a Resync Region _and_ we are not
	 * a resync initiated write(). Don't mark region if we're flagged as
	 * an ABR write.
	 */
	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
	    !(flag & MD_STR_WAR)) {
		if (mirror_mark_resync_region(un, ps->ps_firstblk,
		    ps->ps_lastblk, md_mn_mynode_id)) {
			/* Marking failed: error the parent buf and bail */
			pb->b_flags |= B_ERROR;
			pb->b_resid = pb->b_bcount;
			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
				mirror_overlap_tree_remove(ps);
			kmem_cache_free(mirror_parent_cache, ps);
			md_kstat_waitq_exit(ui);
			md_unit_readerexit(ui);
			md_biodone(pb);
			return;
		}
	}

	ps->ps_childbflags = pb->b_flags | B_WRITE;
	ps->ps_childbflags &= ~B_READ;
	if (flag & MD_STR_MAPPED)
		ps->ps_childbflags &= ~B_PAGEIO;

	if (!(flag & MD_STR_NOTTOP) && panicstr)
		/* Disable WOW and don't free ps */
		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);

	md_kstat_waitq_to_runq(ui);

	/*
	 * Treat Raw and Direct I/O as Write-on-Write always
	 */

	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
	    (pb->b_flags & B_PHYS) &&
	    !(ps->ps_flags & MD_MPS_WOW)) {
		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
			mirror_overlap_tree_remove(ps);
		md_unit_readerexit(ui);
		daemon_request(&md_mstr_daemon, handle_wow,
		    (daemon_queue_t *)ps, REQ_OLD);
		return;
	}

	/* Fan the write out to the submirrors, one child buf per fragment */
	ps->ps_frags = 1;
	do {
		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
		mirror_child_init(cs);
		cb = &cs->cs_buf;
		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));

		/*
		 * This handles the case where we're requesting
		 * a write to block 0 on a label partition. (more < 0)
		 * means that the request size was smaller than the
		 * size of the label. If so this request is done.
		 */
		if (more < 0) {
			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
				mirror_overlap_tree_remove(ps);
			md_kstat_runq_exit(ui);
			kmem_cache_free(mirror_child_cache, cs);
			kmem_cache_free(mirror_parent_cache, ps);
			md_unit_readerexit(ui);
			md_biodone(pb);
			return;
		}
		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}
		md_call_strategy(cb, flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && panicstr) {
		/*
		 * Panic path: poll the done daemon until all child i/o has
		 * completed, then free the parent (MD_MPS_DONTFREE was set
		 * above so completion did not free it).
		 */
		while (!(ps->ps_flags & MD_MPS_DONE)) {
			md_daemon(1, &md_done_daemon);
			drv_usecwait(10);
		}
		kmem_cache_free(mirror_parent_cache, ps);
	}
}
4178 | |
/*
 * mirror_read_strategy:
 * --------------------
 * Top-level read path for a mirror metadevice.  Clones the parent buf (pb)
 * into child bufs via md_bioclone()/mirror_map_read() and issues them with
 * md_call_strategy().  A read may be converted into a write-after-read
 * (ps_call = write_after_read) when it is resync-generated (MD_STR_WAR) or
 * touches a dirty resync region while an optimized resync is pending, so
 * that all submirrors end up with identical data.
 *
 * Parameters:
 *	pb	- parent buf describing the read
 *	flag	- MD_STR_* strategy flags (WAR, DMR = directed read via
 *		  DKIOCDMR, NOTTOP, FLAG_ERR, ...)
 *	private	- pre-built parent save structure (md_mps_t) when re-entered
 *		  from a daemon queue; NULL on first entry
 */
static void
mirror_read_strategy(buf_t *pb, int flag, void *private)
{
	md_mps_t	*ps;
	md_mcs_t	*cs;
	size_t		more;
	mm_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	diskaddr_t	current_blkno;
	off_t		current_offset;
	buf_t		*cb;		/* child buf pointer */
	set_t		setno;

	ui = MDI_UNIT(getminor(pb->b_edev));

	md_kstat_waitq_enter(ui);

	un = (mm_unit_t *)md_unit_readerlock(ui);

	if (!(flag & MD_STR_NOTTOP)) {
		/* md_checkbuf errors out the buf itself on failure */
		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	if (private == NULL) {
		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
		mirror_parent_init(ps);
	} else {
		/* Re-entry from a daemon queue: reuse the saved parent */
		ps = private;
		private = NULL;
	}

	if (flag & MD_STR_MAPPED)
		ps->ps_flags |= MD_MPS_MAPPED;
	if (flag & MD_NOBLOCK)
		ps->ps_flags |= MD_MPS_NOBLOCK;
	if (flag & MD_STR_WMUPDATE)
		ps->ps_flags |= MD_MPS_WMUPDATE;

	/*
	 * Check to see if this is a DMR driven read. If so we need to use the
	 * specified side (in un->un_dmr_last_read) for the source of the data.
	 */
	if (flag & MD_STR_DMR)
		ps->ps_flags |= MD_MPS_DMR;

	/*
	 * Save essential information from the original buffhdr
	 * in the md_save structure.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = pb;
	ps->ps_addr = pb->b_un.b_addr;
	ps->ps_firstblk = pb->b_lblkno;
	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
	ps->ps_changecnt = un->un_changecnt;

	current_count = btodb(pb->b_bcount);
	current_blkno = pb->b_lblkno;
	current_offset = 0;

	/*
	 * If flag has MD_STR_WAR set this means that the read is issued by a
	 * resync thread which may or may not be an optimised resync.
	 *
	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
	 * code has not completed; either a resync has not started since snarf,
	 * or there is an optimized resync in progress.
	 *
	 * We need to generate a write after this read in the following two
	 * cases,
	 *
	 * 1. Any Resync-Generated read
	 *
	 * 2. Any read to a DIRTY REGION if there is an optimized resync
	 *    pending or in progress.
	 *
	 * The write after read is done in these cases to ensure that all sides
	 * of the mirror are in sync with the read data and that it is not
	 * possible for an application to read the same block multiple times
	 * and get different data.
	 *
	 * This would be possible if the block was in a dirty region.
	 *
	 * If we're performing a directed read we don't write the data out as
	 * the application is responsible for restoring the mirror to a known
	 * state.
	 */
	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
	    !(flag & MD_STR_DMR)) {
		size_t start_rr, i, end_rr;
		int region_dirty = 1;

		/*
		 * We enter here under three circumstances,
		 *
		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
		 * 0			1
		 * 1			0
		 * 1			1
		 *
		 * To be optimal we only care to explicitly check for dirty
		 * regions in the second case since if MD_STR_WAR is set we
		 * always do the write after read.
		 */
		if (!(flag & MD_STR_WAR)) {
			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
			BLK_TO_RR(start_rr, ps->ps_firstblk, un);

			for (i = start_rr; i <= end_rr; i++)
				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
					break;
		}

		if ((region_dirty) &&
		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
			ps->ps_call = write_after_read;
			/*
			 * Mark this as a RESYNC_READ in ps_flags.
			 * This is used if the read fails during a
			 * resync of a 3-way mirror to ensure that
			 * the retried read to the remaining
			 * good submirror has MD_STR_WAR set. This
			 * is needed to ensure that the resync write
			 * (write-after-read) takes place.
			 */
			ps->ps_flags |= MD_MPS_RESYNC_READ;

			/*
			 * If MD_STR_FLAG_ERR is set in the flags we
			 * set MD_MPS_FLAG_ERROR so that an error on the resync
			 * write (issued by write_after_read) will be flagged
			 * to the biowait'ing resync thread. This allows us to
			 * avoid issuing further resync requests to a device
			 * that has had a write failure.
			 */
			if (flag & MD_STR_FLAG_ERR)
				ps->ps_flags |= MD_MPS_FLAG_ERROR;

			setno = MD_UN2SET(un);
			/*
			 * Drop the readerlock to avoid
			 * deadlock
			 */
			md_unit_readerexit(ui);
			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
			un = md_unit_readerlock(ui);
			/*
			 * Ensure that we are owner
			 */
			if (MD_MNSET_SETNO(setno)) {
				/*
				 * For a non-resync read that requires a
				 * write-after-read to be done, set a flag
				 * in the parent structure, so that the
				 * write_strategy routine can omit the
				 * test that the write is still within the
				 * resync region
				 */
				if (!(flag & MD_STR_WAR))
					ps->ps_flags |= MD_MPS_DIRTY_RD;

				/*
				 * Before reading the buffer, see if
				 * there is an owner.
				 */
				if (MD_MN_NO_MIRROR_OWNER(un))  {
					ps->ps_call = NULL;
					mirror_overlap_tree_remove(ps);
					md_kstat_waitq_exit(ui);
					md_unit_readerexit(ui);
					daemon_request(
					    &md_mirror_daemon,
					    become_owner,
					    (daemon_queue_t *)ps,
					    REQ_OLD);
					return;
				}
				/*
				 * For a resync read, check to see if I/O is
				 * outside of the current resync region, or
				 * the resync has finished. If so
				 * just terminate the I/O
				 */
				if ((flag & MD_STR_WAR) &&
				    (!(un->c.un_status & MD_UN_WAR) ||
				    (!IN_RESYNC_REGION(un, ps)))) {
#ifdef DEBUG
					if (mirror_debug_flag)
						printf("Abort resync read "
						    "%x: %lld\n",
						    MD_SID(un),
						    ps->ps_firstblk);
#endif
					mirror_overlap_tree_remove(ps);
					kmem_cache_free(mirror_parent_cache,
					    ps);
					md_kstat_waitq_exit(ui);
					md_unit_readerexit(ui);
					md_biodone(pb);
					return;
				}
			}
		}
	}

	if (flag & MD_STR_DMR) {
		ps->ps_call = directed_read_done;
	}

	if (!(flag & MD_STR_NOTTOP) && panicstr)
		ps->ps_flags |= MD_MPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

	/* Fan the read out over child bufs, one fragment at a time */
	ps->ps_frags++;
	do {
		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
		mirror_child_init(cs);
		cb = &cs->cs_buf;
		cs->cs_ps = ps;

		cb = md_bioclone(pb, current_offset, current_count, NODEV,
		    current_blkno, mirror_done, cb, KM_NOSLEEP);

		more = mirror_map_read(ps, cs, current_blkno,
		    (u_longlong_t)current_count);
		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		/*
		 * Do these calculations now,
		 * so that we pickup a valid b_bcount from the chld_bp.
		 */
		current_count -= more;
		current_offset += cb->b_bcount;
		current_blkno += more;
		md_call_strategy(cb, flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && panicstr) {
		/*
		 * Panic path: poll the done daemon until all child i/o has
		 * completed, then free the parent (MD_MPS_DONTFREE was set
		 * above so completion did not free it).
		 */
		while (!(ps->ps_flags & MD_MPS_DONE)) {
			md_daemon(1, &md_done_daemon);
			drv_usecwait(10);
		}
		kmem_cache_free(mirror_parent_cache, ps);
	}
}
4434 | |
4435 void | |
4436 md_mirror_strategy(buf_t *bp, int flag, void *private) | |
4437 { | |
4438 set_t setno = MD_MIN2SET(getminor(bp->b_edev)); | |
4439 | |
4440 /* | |
4441 * When doing IO to a multi owner meta device, check if set is halted. | |
4442 * We do this check without the needed lock held, for performance | |
4443 * reasons. | |
4444 * If an IO just slips through while the set is locked via an | |
4445 * MD_MN_SUSPEND_SET, we don't care about it. | |
4446 * Only check for suspension if we are a top-level i/o request | |
4447 * (MD_STR_NOTTOP is cleared in 'flag'). | |
4448 */ | |
4449 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == | |
4450 (MD_SET_HALTED | MD_SET_MNSET)) { | |
4451 if ((flag & MD_STR_NOTTOP) == 0) { | |
4452 mutex_enter(&md_mx); | |
4453 /* Here we loop until the set is no longer halted */ | |
4454 while (md_set[setno].s_status & MD_SET_HALTED) { | |
4455 cv_wait(&md_cv, &md_mx); | |
4456 } | |
4457 mutex_exit(&md_mx); | |
4458 } | |
4459 } | |
4460 | |
4461 if ((flag & MD_IO_COUNTED) == 0) { | |
4462 if ((flag & MD_NOBLOCK) == 0) { | |
4463 if (md_inc_iocount(setno) != 0) { | |
4464 bp->b_flags |= B_ERROR; | |
4465 bp->b_error = ENXIO; | |
4466 bp->b_resid = bp->b_bcount; | |
4467 biodone(bp); | |
4468 return; | |
4469 } | |
4470 } else { | |
4471 md_inc_iocount_noblock(setno); | |
4472 } | |
4473 } | |
4474 | |
4475 if (bp->b_flags & B_READ) | |
4476 mirror_read_strategy(bp, flag, private); | |
4477 else | |
4478 mirror_write_strategy(bp, flag, private); | |
4479 } | |
4480 | |
4481 /* | |
4482 * mirror_directed_read: | |
4483 * -------------------- | |
4484 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror | |
4485 * so that the application can determine what (if any) resync needs to be | |
4486 * performed. The data is copied out to the user-supplied buffer. | |
4487 * | |
4488 * Parameters: | |
4489 * mdev - dev_t for the mirror device | |
4490 * vdr - directed read parameters specifying location and submirror | |
4491 * to perform the read from | |
4492 * mode - used to ddi_copyout() any resulting data from the read | |
4493 * | |
4494 * Returns: | |
4495 * 0 success | |
4496 * !0 error code | |
4497 * EINVAL - invalid request format | |
4498 */ | |
4499 int | |
4500 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode) | |
4501 { | |
4502 buf_t *bp; | |
4503 minor_t mnum = getminor(mdev); | |
4504 mdi_unit_t *ui = MDI_UNIT(mnum); | |
4505 mm_unit_t *un; | |
4506 mm_submirror_t *sm; | |
4507 char *sm_nm; | |
4508 uint_t next_side; | |
4509 void *kbuffer; | |
4510 | |
4511 if (ui == NULL) | |
4512 return (ENXIO); | |
4513 | |
4514 if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) { | |
4515 return (EINVAL); | |
4516 } | |
4517 | |
4518 /* Check for aligned block access. We disallow non-aligned requests. */ | |
4519 if (vdr->vdr_offset % DEV_BSIZE) { | |
4520 return (EINVAL); | |
4521 } | |
4522 | |
4523 /* | |
4524 * Allocate kernel buffer for target of read(). If we had a reliable | |
4525 * (sorry functional) DDI this wouldn't be needed. | |
4526 */ | |
4527 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); | |
4528 if (kbuffer == NULL) { | |
4529 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx" | |
4530 " bytes\n", vdr->vdr_nbytes); | |
4531 return (ENOMEM); | |
4532 } | |
4533 | |
4534 bp = getrbuf(KM_SLEEP); | |
4535 | |
4536 bp->b_un.b_addr = kbuffer; | |
4537 bp->b_flags = B_READ; | |
4538 bp->b_bcount = vdr->vdr_nbytes; | |
4539 bp->b_lblkno = lbtodb(vdr->vdr_offset); | |
4540 bp->b_edev = mdev; | |
4541 | |
4542 un = md_unit_readerlock(ui); | |
4543 | |
4544 /* | |
4545 * If DKV_SIDE_INIT is set we need to determine the first available | |
4546 * side to start reading from. If it isn't set we increment to the | |
4547 * next readable submirror. | |
4548 * If there are no readable submirrors we error out with DKV_DMR_ERROR. | |
4549 * Note: we check for a readable submirror on completion of the i/o so | |
4550 * we should _always_ have one available. If this becomes unavailable | |
4551 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if | |
4552 * a metadetach is made between the completion of one DKIOCDMR ioctl | |
4553 * and the start of the next (i.e. a sys-admin 'accident' occurred). | |
4554 * The chance of this is small, but not non-existent. | |
4555 */ | |
4556 if (vdr->vdr_side == DKV_SIDE_INIT) { | |
4557 next_side = 0; | |
4558 } else { | |
4559 next_side = vdr->vdr_side + 1; | |
4560 } | |
4561 while ((next_side < NMIRROR) && | |
4562 !SUBMIRROR_IS_READABLE(un, next_side)) | |
4563 next_side++; | |
4564 if (next_side >= NMIRROR) { | |
4565 vdr->vdr_flags |= DKV_DMR_ERROR; | |
4566 freerbuf(bp); | |
4567 vdr->vdr_bytesread = 0; | |
4568 md_unit_readerexit(ui); | |
4569 return (0); | |
4570 } | |
4571 | |
4572 /* Set the side to read from */ | |
4573 un->un_dmr_last_read = next_side; | |
4574 | |
4575 md_unit_readerexit(ui); | |
4576 | |
4577 /* | |
4578 * Save timestamp for verification purposes. Can be read by debugger | |
4579 * to verify that this ioctl has been executed and to find the number | |
4580 * of DMR reads and the time of the last DMR read. | |
4581 */ | |
4582 uniqtime(&mirror_dmr_stats.dmr_timestamp); | |
4583 mirror_dmr_stats.dmr_count++; | |
4584 | |
4585 /* Issue READ request and wait for completion */ | |
4586 mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL); | |
4587 | |
4588 mutex_enter(&un->un_dmr_mx); | |
4589 cv_wait(&un->un_dmr_cv, &un->un_dmr_mx); | |
4590 mutex_exit(&un->un_dmr_mx); | |
4591 | |
4592 /* | |
4593 * Check to see if we encountered an error during the read. If so we | |
4594 * can make no guarantee about any possibly returned data. | |
4595 */ | |
4596 if ((bp->b_flags & B_ERROR) == 0) { | |
4597 vdr->vdr_flags &= ~DKV_DMR_ERROR; | |
4598 if (bp->b_resid) { | |
4599 vdr->vdr_flags |= DKV_DMR_SHORT; | |
4600 vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid; | |
4601 } else { | |
4602 vdr->vdr_flags |= DKV_DMR_SUCCESS; | |
4603 vdr->vdr_bytesread = vdr->vdr_nbytes; | |
4604 } | |
4605 /* Copy the data read back out to the user supplied buffer */ | |
4606 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread, | |
4607 mode)) { | |
4608 kmem_free(kbuffer, vdr->vdr_nbytes); | |
4609 return (EFAULT); | |
4610 } | |
4611 | |
4612 } else { | |
4613 /* Error out with DKV_DMR_ERROR */ | |
4614 vdr->vdr_flags |= DKV_DMR_ERROR; | |
4615 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE); | |
4616 } | |
4617 /* | |
4618 * Update the DMR parameters with the side and name of submirror that | |
4619 * we have just read from (un->un_dmr_last_read) | |
4620 */ | |
4621 un = md_unit_readerlock(ui); | |
4622 | |
4623 vdr->vdr_side = un->un_dmr_last_read; | |
4624 sm = &un->un_sm[un->un_dmr_last_read]; | |
4625 sm_nm = md_shortname(md_getminor(sm->sm_dev)); | |
4626 | |
1623
7bac4a816ebe
PSARC/2005/153 Bunnahabhain: Descriptive Name Support in SVM
tw21770
parents:
1366
diff
changeset
|
4627 (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name)); |
0 | 4628 |
4629 /* | |
4630 * Determine if we've completed the read cycle. This is true iff the | |
4631 * next computed submirror (side) equals or exceeds NMIRROR. We cannot | |
4632 * use un_nsm as we need to handle a sparse array of submirrors (which | |
4633 * can occur if a submirror is metadetached). | |
4634 */ | |
4635 next_side = un->un_dmr_last_read + 1; | |
4636 while ((next_side < NMIRROR) && | |
4637 !SUBMIRROR_IS_READABLE(un, next_side)) | |
4638 next_side++; | |
4639 if (next_side >= NMIRROR) { | |
4640 /* We've finished */ | |
4641 vdr->vdr_flags |= DKV_DMR_DONE; | |
4642 } | |
4643 | |
4644 md_unit_readerexit(ui); | |
4645 freerbuf(bp); | |
4646 kmem_free(kbuffer, vdr->vdr_nbytes); | |
4647 | |
4648 return (0); | |
4649 } | |
4650 | |
4651 /* | |
4652 * mirror_resync_message: | |
4653 * --------------------- | |
4654 * Handle the multi-node resync messages that keep all nodes within a given | |
4655 * disk-set in sync with their view of a mirror's resync status. | |
4656 * | |
4657 * The message types dealt with are: | |
4658 * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit | |
4659 * MD_MN_MSG_RESYNC_NEXT - specified next region to be resynced | |
4660 * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit | |
4661 * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp | |
4662 * | |
4663 * Returns: | |
4664 * 0 Success | |
4665 * >0 Failure error number | |
4666 */ | |
4667 int | |
4668 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp) | |
4669 { | |
4670 mdi_unit_t *ui; | |
4671 mm_unit_t *un; | |
4672 set_t setno; | |
4673 int is_ABR; | |
4674 int smi; | |
4675 int ci; | |
4676 sm_state_t state; | |
4677 int broke_out; | |
4678 mm_submirror_t *sm; | |
4679 mm_submirror_ic_t *smic; | |
4680 md_m_shared_t *shared; | |
4681 md_error_t mde = mdnullerror; | |
4682 md_mps_t *ps; | |
4683 int rs_active; | |
8452
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4684 int rr, rr_start, rr_end; |
0 | 4685 |
4686 /* Check that the given device is part of a multi-node set */ | |
4687 setno = MD_MIN2SET(p->mnum); | |
4688 if (setno >= md_nsets) { | |
4689 return (ENXIO); | |
4690 } | |
4691 if (!MD_MNSET_SETNO(setno)) { | |
4692 return (EINVAL); | |
4693 } | |
4694 | |
4695 if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL) | |
4696 return (EINVAL); | |
4697 if ((ui = MDI_UNIT(p->mnum)) == NULL) | |
4698 return (EINVAL); | |
4699 is_ABR = (ui->ui_tstate & MD_ABR_CAP); | |
4700 | |
4701 /* Obtain the current resync status */ | |
4702 (void) md_ioctl_readerlock(lockp, ui); | |
4703 rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0; | |
4704 md_ioctl_readerexit(lockp); | |
4705 | |
4706 switch ((md_mn_msgtype_t)p->msg_type) { | |
4707 case MD_MN_MSG_RESYNC_STARTING: | |
4708 /* Start the resync thread for the mirror */ | |
4709 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp); | |
4710 break; | |
4711 | |
4712 case MD_MN_MSG_RESYNC_NEXT: | |
4713 /* | |
4714 * We have to release any previously marked overlap regions | |
4715 * so that i/o can resume. Then we need to block the region | |
4716 * from [rs_start..rs_start+rs_size) * so that no i/o is issued. | |
4717 * Update un_rs_resync_done and un_rs_resync_2_do. | |
4718 */ | |
4719 (void) md_ioctl_readerlock(lockp, ui); | |
4720 /* | |
4721 * Ignore the message if there is no active resync thread or | |
4722 * if it is for a resync type that we have already completed. | |
4723 * un_resync_completed is set to the last resync completed | |
4724 * when processing a PHASE_DONE message. | |
4725 */ | |
4726 if (!rs_active || (p->rs_type == un->un_resync_completed)) | |
4727 break; | |
4728 /* | |
4729 * If this message is for the same resync and is for an earlier | |
4730 * resync region, just ignore it. This can only occur if this | |
4731 * node has progressed on to the next resync region before | |
4732 * we receive this message. This can occur if the class for | |
4733 * this message is busy and the originator has to retry thus | |
4734 * allowing this node to move onto the next resync_region. | |
4735 */ | |
4736 if ((p->rs_type == un->un_rs_type) && | |
4737 (p->rs_start < un->un_resync_startbl)) | |
4738 break; | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
4739 ps = un->un_rs_prev_overlap; |
0 | 4740 |
4741 /* Allocate previous overlap reference if needed */ | |
4742 if (ps == NULL) { | |
4743 ps = kmem_cache_alloc(mirror_parent_cache, | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
4744 MD_ALLOCFLAGS); |
0 | 4745 ps->ps_un = un; |
4746 ps->ps_ui = ui; | |
4747 ps->ps_firstblk = 0; | |
4748 ps->ps_lastblk = 0; | |
4749 ps->ps_flags = 0; | |
4750 md_ioctl_readerexit(lockp); | |
4751 (void) md_ioctl_writerlock(lockp, ui); | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
4752 un->un_rs_prev_overlap = ps; |
0 | 4753 md_ioctl_writerexit(lockp); |
4754 } else | |
4755 md_ioctl_readerexit(lockp); | |
4756 | |
4757 if (p->rs_originator != md_mn_mynode_id) { | |
4758 /* | |
8452
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4759 * Clear our un_resync_bm for the regions completed. |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4760 * The owner (originator) will take care of itself. |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4761 */ |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4762 BLK_TO_RR(rr_end, ps->ps_lastblk, un); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4763 BLK_TO_RR(rr_start, p->rs_start, un); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4764 if (ps->ps_lastblk && rr_end < rr_start) { |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4765 BLK_TO_RR(rr_start, ps->ps_firstblk, un); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4766 mutex_enter(&un->un_resync_mx); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4767 /* |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4768 * Update our resync bitmap to reflect that |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4769 * another node has synchronized this range. |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4770 */ |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4771 for (rr = rr_start; rr <= rr_end; rr++) { |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4772 CLR_KEEPDIRTY(rr, un); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4773 } |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4774 mutex_exit(&un->un_resync_mx); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4775 } |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4776 |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4777 /* |
0 | 4778 * On all but the originating node, first update |
4779 * the resync state, then unblock the previous | |
4780 * region and block the next one. No need | |
4781 * to do this if the region is already blocked. | |
4782 * Update the submirror state and flags from the | |
4783 * originator. This keeps the cluster in sync with | |
4784 * regards to the resync status. | |
4785 */ | |
4786 | |
4787 (void) md_ioctl_writerlock(lockp, ui); | |
4788 un->un_rs_resync_done = p->rs_done; | |
4789 un->un_rs_resync_2_do = p->rs_2_do; | |
4790 un->un_rs_type = p->rs_type; | |
4791 un->un_resync_startbl = p->rs_start; | |
4792 md_ioctl_writerexit(lockp); | |
4793 /* | |
4794 * Use un_owner_mx to ensure that an ownership change | |
4795 * cannot happen at the same time as this message | |
4796 */ | |
4797 mutex_enter(&un->un_owner_mx); | |
4798 if (MD_MN_MIRROR_OWNER(un)) { | |
4799 ps->ps_firstblk = p->rs_start; | |
4800 ps->ps_lastblk = ps->ps_firstblk + | |
4801 p->rs_size - 1; | |
4802 } else { | |
4803 if ((ps->ps_firstblk != p->rs_start) || | |
4804 (ps->ps_lastblk != p->rs_start + | |
4805 p->rs_size - 1)) { | |
4806 /* Remove previous overlap range */ | |
4807 if (ps->ps_flags & MD_MPS_ON_OVERLAP) | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
4808 mirror_overlap_tree_remove(ps); |
0 | 4809 |
4810 ps->ps_firstblk = p->rs_start; | |
4811 ps->ps_lastblk = ps->ps_firstblk + | |
4812 p->rs_size - 1; | |
4813 | |
4814 mutex_exit(&un->un_owner_mx); | |
4815 /* Block this range from all i/o. */ | |
4816 if (ps->ps_firstblk != 0 || | |
4817 ps->ps_lastblk != 0) | |
4818 wait_for_overlaps(ps, | |
4819 MD_OVERLAP_ALLOW_REPEAT); | |
4820 mutex_enter(&un->un_owner_mx); | |
4821 /* | |
4822 * Check to see if we have obtained | |
4823 * ownership while waiting for | |
4824 * overlaps. If we have, remove | |
4825 * the resync_region entry from the | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
4826 * overlap tree |
0 | 4827 */ |
4828 if (MD_MN_MIRROR_OWNER(un) && | |
4829 (ps->ps_flags & MD_MPS_ON_OVERLAP)) | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
4830 mirror_overlap_tree_remove(ps); |
0 | 4831 } |
4832 } | |
4833 mutex_exit(&un->un_owner_mx); | |
4834 | |
4835 /* | |
4836 * If this is the first RESYNC_NEXT message (i.e. | |
4837 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags), | |
4838 * issue RESYNC_START NOTIFY event | |
4839 */ | |
4840 if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) { | |
4841 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START, | |
4842 SVM_TAG_METADEVICE, MD_UN2SET(un), | |
4843 MD_SID(un)); | |
4844 } | |
4845 | |
4846 /* Ensure that our local resync thread is running */ | |
4847 if (un->un_rs_thread == NULL) { | |
4848 (void) mirror_resync_unit(p->mnum, NULL, | |
4849 &p->mde, lockp); | |
4850 } | |
4851 } | |
8452
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4852 |
0 | 4853 break; |
4854 case MD_MN_MSG_RESYNC_FINISH: | |
4855 /* | |
4856 * Complete the resync by stopping the resync thread. | |
4857 * Also release the previous overlap region field. | |
4858 * Update the resync_progress_thread by cv_signal'ing it so | |
4859 * that we mark the end of the resync as soon as possible. This | |
4860 * stops an unnecessary delay should be panic after resync | |
4861 * completion. | |
4862 */ | |
4863 #ifdef DEBUG | |
4864 if (!rs_active) { | |
4865 if (mirror_debug_flag) | |
4866 printf("RESYNC_FINISH (mnum = %x), " | |
4867 "Resync *NOT* active", | |
4868 p->mnum); | |
4869 } | |
4870 #endif | |
4871 | |
4872 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) && | |
4873 (p->rs_originator != md_mn_mynode_id)) { | |
4874 mutex_enter(&un->un_rs_thread_mx); | |
4875 un->c.un_status &= ~MD_UN_RESYNC_CANCEL; | |
4876 un->un_rs_thread_flags |= MD_RI_SHUTDOWN; | |
4877 un->un_rs_thread_flags &= | |
4878 ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER); | |
4879 cv_signal(&un->un_rs_thread_cv); | |
4880 mutex_exit(&un->un_rs_thread_mx); | |
4881 } | |
4882 if (is_ABR) { | |
4883 /* Resync finished, if ABR set owner to NULL */ | |
4884 mutex_enter(&un->un_owner_mx); | |
4885 un->un_mirror_owner = 0; | |
4886 mutex_exit(&un->un_owner_mx); | |
4887 } | |
4888 (void) md_ioctl_writerlock(lockp, ui); | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
4889 ps = un->un_rs_prev_overlap; |
0 | 4890 if (ps != NULL) { |
4891 /* Remove previous overlap range */ | |
4892 if (ps->ps_flags & MD_MPS_ON_OVERLAP) | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
4893 mirror_overlap_tree_remove(ps); |
0 | 4894 /* |
4895 * Release the overlap range reference | |
4896 */ | |
6901
307e592cef33
6510471 svm overlap chain book keeping does not scale well
jkennedy
parents:
4932
diff
changeset
|
4897 un->un_rs_prev_overlap = NULL; |
0 | 4898 kmem_cache_free(mirror_parent_cache, |
4899 ps); | |
4900 } | |
4901 md_ioctl_writerexit(lockp); | |
4902 | |
4903 /* Mark the resync as complete in the metadb */ | |
4904 un->un_rs_resync_done = p->rs_done; | |
4905 un->un_rs_resync_2_do = p->rs_2_do; | |
4906 un->un_rs_type = p->rs_type; | |
4907 mutex_enter(&un->un_rs_progress_mx); | |
4908 cv_signal(&un->un_rs_progress_cv); | |
4909 mutex_exit(&un->un_rs_progress_mx); | |
4910 | |
4911 un = md_ioctl_writerlock(lockp, ui); | |
4912 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE; | |
4913 /* Deal with any pending grow_unit */ | |
4914 if (un->c.un_status & MD_UN_GROW_PENDING) { | |
4915 if ((mirror_grow_unit(un, &mde) != 0) || | |
4916 (! mdismderror(&mde, MDE_GROW_DELAYED))) { | |
4917 un->c.un_status &= ~MD_UN_GROW_PENDING; | |
4918 } | |
4919 } | |
4920 md_ioctl_writerexit(lockp); | |
4921 break; | |
4922 | |
4923 case MD_MN_MSG_RESYNC_PHASE_DONE: | |
4924 /* | |
4925 * A phase of the resync, optimized. component or | |
4926 * submirror is complete. Update mirror status. | |
4927 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the | |
4928 * mirror owner is peforming a resync. If we have just snarfed | |
4929 * this set, then we must clear any of the flags set at snarf | |
4930 * time by unit_setup_resync(). | |
4931 * Note that unit_setup_resync() sets up these flags to | |
4932 * indicate that an optimized resync is required. These flags | |
4933 * need to be reset because if we get here, the mirror owner | |
4934 * will have handled the optimized resync. | |
4935 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and | |
4936 * MD_UN_WAR. In addition, for each submirror, | |
4937 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC | |
4938 * set to SMS_OFFLINE. | |
4939 */ | |
4940 #ifdef DEBUG | |
4941 if (mirror_debug_flag) | |
4942 printf("phase done mess received from %d, mnum=%x," | |
4943 "type=%x, flags=%x\n", p->rs_originator, p->mnum, | |
4944 p->rs_type, p->rs_flags); | |
4945 #endif | |
4946 /* | |
4947 * Ignore the message if there is no active resync thread. | |
4948 */ | |
4949 if (!rs_active) | |
4950 break; | |
4951 | |
4952 broke_out = p->rs_flags & MD_MN_RS_ERR; | |
4953 switch (RS_TYPE(p->rs_type)) { | |
4954 case MD_RS_OPTIMIZED: | |
4955 un = md_ioctl_writerlock(lockp, ui); | |
4956 if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) { | |
4957 /* If we are originator, just clear rs_type */ | |
4958 if (p->rs_originator == md_mn_mynode_id) { | |
4959 SET_RS_TYPE_NONE(un->un_rs_type); | |
4960 md_ioctl_writerexit(lockp); | |
4961 break; | |
4962 } | |
4963 /* | |
4964 * If CLEAR_OPT_NOT_DONE is set, only clear the | |
4965 * flags if OPT_NOT_DONE is set *and* rs_type | |
4966 * is MD_RS_NONE. | |
4967 */ | |
4968 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) && | |
4969 (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) { | |
4970 /* No resync in progress */ | |
4971 un->c.un_status &= ~MD_UN_OPT_NOT_DONE; | |
4972 un->c.un_status &= ~MD_UN_WAR; | |
4973 } else { | |
4974 /* | |
4975 * We are in the middle of an | |
4976 * optimized resync and this message | |
4977 * should be ignored. | |
4978 */ | |
4979 md_ioctl_writerexit(lockp); | |
4980 break; | |
4981 } | |
4982 } else { | |
4983 /* | |
4984 * This is the end of an optimized resync, | |
4985 * clear the OPT_NOT_DONE and OFFLINE_SM flags | |
4986 */ | |
4987 | |
4988 un->c.un_status &= ~MD_UN_KEEP_DIRTY; | |
4989 if (!broke_out) | |
4990 un->c.un_status &= ~MD_UN_WAR; | |
8452
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4991 |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4992 /* |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4993 * Clear our un_resync_bm for the regions |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4994 * completed. The owner (originator) will |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4995 * take care of itself. |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4996 */ |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4997 if (p->rs_originator != md_mn_mynode_id && |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4998 (ps = un->un_rs_prev_overlap) != NULL) { |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
4999 BLK_TO_RR(rr_start, ps->ps_firstblk, |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
5000 un); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
5001 BLK_TO_RR(rr_end, ps->ps_lastblk, un); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
5002 mutex_enter(&un->un_resync_mx); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
5003 for (rr = rr_start; rr <= rr_end; |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
5004 rr++) { |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
5005 CLR_KEEPDIRTY(rr, un); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
5006 } |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
5007 mutex_exit(&un->un_resync_mx); |
89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
John Wren Kennedy <John.Wren.Kennedy@Sun.COM>
parents:
7975
diff
changeset
|
5008 } |
0 | 5009 } |
5010 | |
5011 /* | |
5012 * Set resync_completed to last resync type and then | |
5013 * clear resync_type to indicate no resync in progress | |
5014 */ | |
5015 un->un_resync_completed = un->un_rs_type; | |
5016 SET_RS_TYPE_NONE(un->un_rs_type); | |
5017 | |
5018 /* | |
5019 * If resync is as a result of a submirror ONLINE, | |
5020 * reset the submirror state to SMS_RUNNING if the | |
5021 * resync was ok else set back to SMS_OFFLINE. | |
5022 */ | |
5023 for (smi = 0; smi < NMIRROR; smi++) { | |
5024 un->un_sm[smi].sm_flags &= | |
5025 ~MD_SM_RESYNC_TARGET; | |
5026 if (SMS_BY_INDEX_IS(un, smi, | |
5027 SMS_OFFLINE_RESYNC)) { | |
5028 if (p->rs_flags & | |
5029 MD_MN_RS_CLEAR_OPT_NOT_DONE) { | |
5030 state = SMS_OFFLINE; | |
5031 } else { | |
5032 state = (broke_out ? | |
5033 SMS_OFFLINE : SMS_RUNNING); | |
5034 } | |
5035 mirror_set_sm_state( | |
5036 &un->un_sm[smi], | |
5037 &un->un_smic[smi], state, | |
5038 broke_out); | |
5039 mirror_commit(un, NO_SUBMIRRORS, | |
5040 0); | |
5041 } | |
5042 /* | |
5043 * If we still have an offline submirror, reset | |
5044 * the OFFLINE_SM flag in the mirror status | |
5045 */ | |
5046 if (SMS_BY_INDEX_IS(un, smi, | |
5047 SMS_OFFLINE)) | |
5048 un->c.un_status |= | |
5049 MD_UN_OFFLINE_SM; | |
5050 } | |
5051 md_ioctl_writerexit(lockp); | |
5052 break; | |
5053 case MD_RS_SUBMIRROR: | |
5054 un = md_ioctl_writerlock(lockp, ui); | |
5055 smi = RS_SMI(p->rs_type); | |
5056 sm = &un->un_sm[smi]; | |
5057 smic = &un->un_smic[smi]; | |
5058 /* Clear RESYNC target */ | |
5059 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; | |
5060 /* | |
5061 * Set resync_completed to last resync type and then | |
5062 * clear resync_type to indicate no resync in progress | |
5063 */ | |
5064 un->un_resync_completed = un->un_rs_type; | |
5065 SET_RS_TYPE_NONE(un->un_rs_type); | |
5066 /* | |
5067 * If the resync completed ok reset the submirror | |
5068 * state to SMS_RUNNING else reset it to SMS_ATTACHED | |
5069 */ | |
5070 state = (broke_out ? | |
5071 SMS_ATTACHED : SMS_RUNNING); | |
5072 mirror_set_sm_state(sm, smic, state, broke_out); | |
5073 un->c.un_status &= ~MD_UN_WAR; | |
5074 mirror_commit(un, SMI2BIT(smi), 0); | |
5075 md_ioctl_writerexit(lockp); | |
5076 break; | |
5077 case MD_RS_COMPONENT: | |
5078 un = md_ioctl_writerlock(lockp, ui); | |
5079 smi = RS_SMI(p->rs_type); | |
5080 ci = RS_CI(p->rs_type); | |
5081 sm = &un->un_sm[smi]; | |
5082 smic = &un->un_smic[smi]; | |
5083 shared = (md_m_shared_t *) | |
5084 (*(smic->sm_shared_by_indx)) | |
5085 (sm->sm_dev, sm, ci); | |
5086 un->c.un_status &= ~MD_UN_WAR; | |
5087 /* Clear RESYNC target */ | |
5088 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; | |
5089 /* | |
5090 * Set resync_completed to last resync type and then | |
5091 * clear resync_type to indicate no resync in progress | |
5092 */ | |
5093 un->un_resync_completed = un->un_rs_type; | |
5094 SET_RS_TYPE_NONE(un->un_rs_type); | |
5095 | |
5096 /* | |
5097 * If the resync completed ok, set the component state | |
5098 * to CS_OKAY. | |
5099 */ | |
5100 if (broke_out) | |
5101 shared->ms_flags |= MDM_S_RS_TRIED; | |
5102 else { | |
5103 /* | |
5104 * As we don't transmit the changes, | |
5105 * no need to drop the lock. | |
5106 */ | |
5107 set_sm_comp_state(un, smi, ci, CS_OKAY, 0, | |
5108 MD_STATE_NO_XMIT, (IOLOCK *)NULL); | |
5109 } | |
5110 md_ioctl_writerexit(lockp); | |
5111 default: | |
5112 break; | |
5113 } | |
5114 /* | |
5115 * If the purpose of this PHASE_DONE message is just to | |
5116 * indicate to all other nodes that the optimized resync | |
5117 * required (OPT_NOT_DONE) flag is to be cleared, there is | |
5118 * no need to generate a notify event as there has not | |
5119 * actually been a resync. | |
5120 */ | |
5121 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) { | |
5122 if (broke_out) { | |
5123 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED, | |
5124 SVM_TAG_METADEVICE, MD_UN2SET(un), | |
5125 MD_SID(un)); | |
5126 } else { | |
5127 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE, | |
5128 SVM_TAG_METADEVICE, MD_UN2SET(un), | |
5129 MD_SID(un)); | |
5130 } | |
5131 } | |
5132 break; | |
5133 | |
5134 default: | |
5135 #ifdef DEBUG | |
5136 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type" | |
5137 " %x\n", p->msg_type); | |
5138 #endif | |
5139 return (EINVAL); | |
5140 } | |
5141 return (0); | |
5142 } | |
5143 | |
/* Return a -1 if snarf of optimized record failed and set should be released */
/*
 * mirror_snarf - load mirror unit records for a set from the metadb.
 *
 * MD_SNARF_CLEANUP: walk the set's MIRROR_RECs and tear down any record
 * marked MD_PRV_CLEANUP via mirror_cleanup(); the scan restarts from the
 * beginning after each cleanup.  Returns 0 (also when the set is stale).
 *
 * Otherwise, every not-yet-gotten MIRROR_REC is converted from the old
 * 32-bit on-disk layout to the in-core mm_unit_t layout if necessary, a
 * minor device node is created, and the unit is built in-core.  Returns
 * the error from mirror_build_incore() on failure; returns gotsomething
 * (1 if any unit was built this pass) while records remain un-gotten;
 * returns 0 once everything has been gotten, after marking orphaned
 * RESYNC_RECs for deletion.
 */
static int
mirror_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mddb_recid_t	recid;
	int		gotsomething;
	int		all_mirrors_gotten;
	mm_unit_t	*un;
	mddb_type_t	typ1;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	size_t		newreqsize;
	mm_unit_t	*big_un;
	mm_unit32_od_t	*small_un;
	int		retval;
	mdi_unit_t	*ui;

	if (cmd == MD_SNARF_CLEANUP) {
		/* Nothing to clean if the set's metadb is stale. */
		if (md_get_setstatus(setno) & MD_SET_STALE)
			return (0);

		recid = mddb_makerecid(setno, 0);
		typ1 = (mddb_type_t)md_getshared_key(setno,
		    mirror_md_ops.md_driver.md_drivername);
		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
				un = (mm_unit_t *)mddb_getrecaddr(recid);
				mirror_cleanup(un);
				/* restart the scan after a cleanup */
				recid = mddb_makerecid(setno, 0);
			}
		}
		return (0);
	}

	all_mirrors_gotten = 1;
	gotsomething = 0;

	recid = mddb_makerecid(setno, 0);
	typ1 = (mddb_type_t)md_getshared_key(setno,
	    mirror_md_ops.md_driver.md_drivername);

	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_MIRROR;
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
				/*
				 * This means, we have an old and small
				 * record and this record hasn't already
				 * been converted.  Before we create an
				 * incore metadevice from this we have to
				 * convert it to a big record.
				 */
				small_un =
				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
				newreqsize = sizeof (mm_unit_t);
				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
				    KM_SLEEP);
				mirror_convert((caddr_t)small_un,
				    (caddr_t)big_un, SMALL_2_BIG);
				kmem_free(small_un, dep->de_reqsize);

				/*
				 * Update userdata and incore userdata
				 * incores are at the end of un
				 */
				dep->de_rb_userdata_ic = big_un;
				dep->de_rb_userdata = big_un;
				dep->de_icreqsize = newreqsize;
				un = big_un;
				/* mark the record converted exactly once */
				rbp->rb_private |= MD_PRV_CONVD;
			} else {
				/*
				 * Unit already converted, just get the
				 * record address.
				 */
				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
				    sizeof (*un), 0);
			}
			un->c.un_revision &= ~MD_64BIT_META_DEV;
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			/* Big device */
			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
			    sizeof (*un), 0);
			un->c.un_revision |= MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
			break;
		}
		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

		/*
		 * Create minor device node for snarfed entry.
		 */
		(void) md_create_minor_node(setno, MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			/* unit already in-core; mark record for deletion */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_mirrors_gotten = 0;
		retval = mirror_build_incore(un, 1);
		if (retval == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
			resync_start_timeout(setno);
			gotsomething = 1;
		} else {
			return (retval);
		}
		/*
		 * Set flag to indicate that the mirror has not yet
		 * been through a reconfig. This flag is used for MN sets
		 * when determining whether to update the mirror state from
		 * the Master node.
		 */
		if (MD_MNSET_SETNO(setno)) {
			ui = MDI_UNIT(MD_SID(un));
			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
		}
	}

	if (!all_mirrors_gotten)
		return (gotsomething);

	/* Everything gotten: retire resync records with no owner unit. */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}
5284 | |
5285 static int | |
5286 mirror_halt(md_haltcmd_t cmd, set_t setno) | |
5287 { | |
5288 unit_t i; | |
5289 mdi_unit_t *ui; | |
5290 minor_t mnum; | |
5291 int reset_mirror_flag = 0; | |
5292 | |
5293 if (cmd == MD_HALT_CLOSE) | |
5294 return (0); | |
5295 | |
5296 if (cmd == MD_HALT_OPEN) | |
5297 return (0); | |
5298 | |
5299 if (cmd == MD_HALT_UNLOAD) | |
5300 return (0); | |
5301 | |
5302 if (cmd == MD_HALT_CHECK) { | |
5303 for (i = 0; i < md_nunits; i++) { | |
5304 mnum = MD_MKMIN(setno, i); | |
5305 if ((ui = MDI_UNIT(mnum)) == NULL) | |
5306 continue; | |
5307 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) | |
5308 continue; | |
5309 if (md_unit_isopen(ui)) | |
5310 return (1); | |
5311 } | |
5312 return (0); | |
5313 } | |
5314 | |
5315 if (cmd != MD_HALT_DOIT) | |
5316 return (1); | |
5317 | |
5318 for (i = 0; i < md_nunits; i++) { | |
5319 mnum = MD_MKMIN(setno, i); | |
5320 if ((ui = MDI_UNIT(mnum)) == NULL) | |
5321 continue; | |
5322 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) | |
5323 continue; | |
5324 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0); | |
5325 | |
5326 /* Set a flag if there is at least one mirror metadevice. */ | |
5327 reset_mirror_flag = 1; | |
5328 } | |
5329 | |
5330 /* | |
5331 * Only wait for the global dr_timeout to finish | |
5332 * - if there are mirror metadevices in this diskset or | |
5333 * - if this is the local set since an unload of the md_mirror | |
5334 * driver could follow a successful mirror halt in the local set. | |
5335 */ | |
5336 if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) { | |
5337 while ((mirror_md_ops.md_head == NULL) && | |
5338 (mirror_timeout.dr_timeout_id != 0)) | |
5339 delay(md_hz); | |
5340 } | |
5341 | |
5342 return (0); | |
5343 } | |
5344 | |
5345 /*ARGSUSED3*/ | |
5346 static int | |
5347 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) | |
5348 { | |
5349 IOLOCK lock; | |
46
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5350 minor_t mnum = getminor(*dev); |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5351 set_t setno; |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5352 |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5353 /* |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5354 * When doing an open of a multi owner metadevice, check to see if this |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5355 * node is a starting node and if a reconfig cycle is underway. |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5356 * If so, the system isn't sufficiently set up enough to handle the |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5357 * open (which involves I/O during sp_validate), so fail with ENXIO. |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5358 */ |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5359 setno = MD_MIN2SET(mnum); |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5360 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5361 (MD_SET_MNSET | MD_SET_MN_START_RC)) { |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5362 return (ENXIO); |
042bf15ebd92
6274840 Cluster node(s) panic when I/O is starting on nodes during reconfig.
skamm
parents:
0
diff
changeset
|
5363 } |
0 | 5364 |
5365 if (md_oflags & MD_OFLG_FROMIOCTL) { | |
5366 /* | |
5367 * This indicates that the caller is an ioctl service routine. | |
5368 * In this case we initialise our stack-based IOLOCK and pass | |
5369 * this into the internal open routine. This allows multi-owner | |
5370 * metadevices to avoid deadlocking if an error is encountered | |
5371 * during the open() attempt. The failure case is: | |
5372 * s-p -> mirror -> s-p (with error). Attempting to metaclear | |
5373 * this configuration would deadlock as the mirror code has to | |
5374 * send a state-update to the other nodes when it detects the | |
5375 * failure of the underlying submirror with an errored soft-part | |
5376 * on it. As there is a class1 message in progress (metaclear) | |
5377 * set_sm_comp_state() cannot send another class1 message; | |
5378 * instead we do not send a state_update message as the | |
5379 * metaclear is distributed and the failed submirror will be | |
5380 * cleared from the configuration by the metaclear. | |
5381 */ | |
5382 IOLOCK_INIT(&lock); | |
5383 return (mirror_internal_open(getminor(*dev), flag, otyp, | |
5384 md_oflags, &lock)); | |
5385 } else { | |
5386 return (mirror_internal_open(getminor(*dev), flag, otyp, | |
5387 md_oflags, (IOLOCK *)NULL)); | |
5388 } | |
5389 } | |
5390 | |
5391 | |
/*
 * Device close entry point.  All real work is delegated to
 * mirror_internal_close(); no ioctl lock is held on this path, hence the
 * NULL IOLOCK argument.
 */
/*ARGSUSED1*/
static int
mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
{
	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
	    (IOLOCK *)NULL));
}
5399 | |
5400 | |
5401 /* | |
5402 * This routine dumps memory to the disk. It assumes that the memory has | |
5403 * already been mapped into mainbus space. It is called at disk interrupt | |
5404 * priority when the system is in trouble. | |
5405 * | |
5406 */ | |
5407 static int | |
5408 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) | |
5409 { | |
5410 mm_unit_t *un; | |
5411 dev_t mapdev; | |
5412 int result; | |
5413 int smi; | |
5414 int any_succeed = 0; | |
5415 int save_result = 0; | |
5416 | |
5417 /* | |
5418 * Don't need to grab the unit lock. | |
5419 * Cause nothing else is suppose to be happenning. | |
5420 * Also dump is not suppose to sleep. | |
5421 */ | |
5422 un = (mm_unit_t *)MD_UNIT(getminor(dev)); | |
5423 | |
5424 if ((diskaddr_t)blkno >= un->c.un_total_blocks) | |
5425 return (EINVAL); | |
5426 | |
5427 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks) | |
5428 return (EINVAL); | |
5429 | |
5430 for (smi = 0; smi < NMIRROR; smi++) { | |
5431 if (!SUBMIRROR_IS_WRITEABLE(un, smi)) | |
5432 continue; | |
5433 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev); | |
5434 result = bdev_dump(mapdev, addr, blkno, nblk); | |
5435 if (result) | |
5436 save_result = result; | |
5437 | |
5438 if (result == 0) | |
5439 any_succeed++; | |
5440 } | |
5441 | |
5442 if (any_succeed) | |
5443 return (0); | |
5444 | |
5445 return (save_result); | |
5446 } | |
5447 | |
/*
 * NAME: mirror_probe_dev
 *
 * DESCRIPTION: force opens every component of a mirror.
 *
 * Sets or clears MD_INACCESSIBLE on each submirror unit (and on the
 * mirror itself when every submirror is unavailable), then walks any
 * probe-detected component errors, transitioning components to
 * CS_ERRED/CS_LAST_ERRED only when the metadevice is actually open -
 * this avoids triggering a resync for devices that were merely
 * disconnected while unused.
 *
 * On entry the unit writerlock is held
 */
static int
mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
{
	int		i;
	int		smi;
	int		ci;
	mm_unit_t	*un;
	int		md_devopen = 0;		/* non-zero if mirror is open */
	set_t		setno;
	int		sm_cnt;			/* submirrors in use */
	int		sm_unavail_cnt;		/* submirrors unavailable */

	if (md_unit_isopen(ui))
		md_devopen++;

	un = MD_UNIT(mnum);
	setno = MD_UN2SET(un);

	sm_cnt = 0;
	sm_unavail_cnt = 0;
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev;
		mdi_unit_t	*sm_ui;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
			continue;
		}

		sm_cnt++;
		tmpdev = un->un_sm[i].sm_dev;
		(void) md_layered_open(mnum, &tmpdev,
		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
		un->un_sm[i].sm_dev = tmpdev;

		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));

		/*
		 * Logic similar to that in mirror_open_all_devs.  We set or
		 * clear the submirror Unavailable bit.
		 */
		(void) md_unit_writerlock(sm_ui);
		if (submirror_unavailable(un, i, 1)) {
			sm_ui->ui_tstate |= MD_INACCESSIBLE;
			sm_unavail_cnt++;
		} else {
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
		}
		md_unit_writerexit(sm_ui);
	}

	/*
	 * If all of the submirrors are unavailable, the mirror is also
	 * unavailable.
	 */
	if (sm_cnt == sm_unavail_cnt) {
		ui->ui_tstate |= MD_INACCESSIBLE;
	} else {
		ui->ui_tstate &= ~MD_INACCESSIBLE;
	}

	/*
	 * Start checking from probe failures. If failures occur we
	 * set the appropriate erred state only if the metadevice is in
	 * use. This is specifically to prevent unnecessary resyncs.
	 * For instance if the disks were accidentally disconnected when
	 * the system booted up then until the metadevice is accessed
	 * (like file system mount) the user can shutdown, recable and
	 * reboot w/o incurring a potentially huge resync.
	 */

	smi = 0;
	ci = 0;
	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {

		if (mirror_other_sources(un, smi, ci, 0) == 1) {
			/*
			 * Note that for a MN set, there is no need to call
			 * SE_NOTIFY as that is done when processing the
			 * state change
			 */
			if (md_devopen) {
				/*
				 * Never called from ioctl context,
				 * so (IOLOCK *)NULL
				 */
				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
				if (!MD_MNSET_SETNO(setno)) {
					SE_NOTIFY(EC_SVM_STATE,
					    ESC_SVM_LASTERRED,
					    SVM_TAG_METADEVICE, setno,
					    MD_SID(un));
				}
				continue;
			} else {
				/* mirror not open: fail the probe instead */
				(void) mirror_close_all_devs(un,
				    MD_OFLG_PROBEDEV);
				if (!MD_MNSET_SETNO(setno)) {
					SE_NOTIFY(EC_SVM_STATE,
					    ESC_SVM_OPEN_FAIL,
					    SVM_TAG_METADEVICE, setno,
					    MD_SID(un));
				}
				mirror_openfail_console_info(un, smi, ci);
				return (ENXIO);
			}
		}

		/*
		 * Note that for a MN set, there is no need to call
		 * SE_NOTIFY as that is done when processing the
		 * state change
		 */
		if (md_devopen) {
			/* Never called from ioctl context, so (IOLOCK *)NULL */
			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
			    MD_STATE_XMIT, (IOLOCK *)NULL);
			if (!MD_MNSET_SETNO(setno)) {
				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
				    SVM_TAG_METADEVICE, setno,
				    MD_SID(un));
			}
		}
		mirror_openfail_console_info(un, smi, ci);
		ci++;
	}

	/* Kick off hotspare replacement for any components just erred. */
	if (MD_MNSET_SETNO(setno)) {
		send_poke_hotspares(setno);
	} else {
		(void) poke_hotspares();
	}
	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);

	return (0);
}
5591 | |
5592 | |
/*
 * NAME: mirror_imp_set
 *
 * DESCRIPTION: rewrite each mirror unit record of an imported diskset so
 * that its self/parent minors, its record ids (including the optimized
 * resync record id) and its submirror devts all refer to the importing
 * set number.  Returns 1 if at least one record was updated, else 0.
 */
static int
mirror_imp_set(
	set_t	setno
)
{

	mddb_recid_t	recid;
	int		gotsomething, i;
	mddb_type_t	typ1;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mm_unit32_od_t	*un32;
	mm_unit_t	*un64;
	md_dev64_t	self_devt;
	minor_t		*self_id;	/* minor needs to be updated */
	md_parent_t	*parent_id;	/* parent needs to be updated */
	mddb_recid_t	*record_id;	/* record id needs to be updated */
	mddb_recid_t	*optrec_id;
	md_dev64_t	tmpdev;


	gotsomething = 0;

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    mirror_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		/*
		 * Point self_id/parent_id/record_id/optrec_id at the fields
		 * inside whichever record layout (32-bit or 64-bit) this
		 * revision uses; the common update code below then works on
		 * the pointers regardless of layout.
		 */
		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			/*
			 * Small device
			 */
			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);
			optrec_id = &(un32->un_rr_dirty_recid);

			for (i = 0; i < un32->un_nsm; i++) {
				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
				un32->un_sm[i].sm_dev = md_cmpldev
				    (md_makedevice(md_major, MD_MKMIN(setno,
				    MD_MIN2UNIT(md_getminor(tmpdev)))));

				if (!md_update_minor(setno, mddb_getsidenum
				    (setno), un32->un_sm[i].sm_key))
					goto out;
			}
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);
			optrec_id = &(un64->un_rr_dirty_recid);

			for (i = 0; i < un64->un_nsm; i++) {
				tmpdev = un64->un_sm[i].sm_dev;
				un64->un_sm[i].sm_dev = md_makedevice
				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
				    (md_getminor(tmpdev))));

				if (!md_update_minor(setno, mddb_getsidenum
				    (setno), un64->un_sm[i].sm_key))
					goto out;
			}
			break;
		}

		/*
		 * If this is a top level and a friendly name metadevice,
		 * update its minor in the namespace.
		 */
		if ((*parent_id == MD_NO_PARENT) &&
		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
		    (rbp->rb_revision == MDDB_REV_RB64FN))) {

			self_devt = md_makedevice(md_major, *self_id);
			if (!md_update_top_device_minor(setno,
			    mddb_getsidenum(setno), self_devt))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 *
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));
		*optrec_id = MAKERECID(setno, DBID(*optrec_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}
5703 | |
5704 /* | |
5705 * NAME: mirror_check_offline | |
5706 * | |
5707 * DESCRIPTION: return offline_status = 1 if any submirrors are offline | |
5708 * | |
5709 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is | |
5710 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE | |
5711 * ioctl. | |
5712 */ | |
5713 int | |
5714 mirror_check_offline(md_dev64_t dev, int *offline_status) | |
5715 { | |
5716 mm_unit_t *un; | |
5717 md_error_t mde = mdnullerror; | |
5718 | |
5719 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) | |
5720 return (EINVAL); | |
5721 *offline_status = 0; | |
5722 if (un->c.un_status & MD_UN_OFFLINE_SM) | |
5723 *offline_status = 1; | |
5724 return (0); | |
5725 } | |
5726 | |
5727 /* | |
5728 * NAME: mirror_inc_abr_count | |
5729 * | |
5730 * DESCRIPTION: increment the count of layered soft parts with ABR set | |
5731 * | |
5732 * Called from ioctl, so access to un_abr_count is protected by the global | |
5733 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. | |
5734 */ | |
5735 int | |
5736 mirror_inc_abr_count(md_dev64_t dev) | |
5737 { | |
5738 mm_unit_t *un; | |
5739 md_error_t mde = mdnullerror; | |
5740 | |
5741 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) | |
5742 return (EINVAL); | |
5743 un->un_abr_count++; | |
5744 return (0); | |
5745 } | |
5746 | |
5747 /* | |
5748 * NAME: mirror_dec_abr_count | |
5749 * | |
5750 * DESCRIPTION: decrement the count of layered soft parts with ABR set | |
5751 * | |
5752 * Called from ioctl, so access to un_abr_count is protected by the global | |
5753 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. | |
5754 */ | |
5755 int | |
5756 mirror_dec_abr_count(md_dev64_t dev) | |
5757 { | |
5758 mm_unit_t *un; | |
5759 md_error_t mde = mdnullerror; | |
5760 | |
5761 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) | |
5762 return (EINVAL); | |
5763 un->un_abr_count--; | |
5764 return (0); | |
5765 } | |
5766 | |
/*
 * Named-service table for the mirror driver: maps service-name strings to
 * the handlers invoked through the md named-services lookup mechanism.
 * The table is terminated by a { NULL, 0 } sentinel entry.
 */
static md_named_services_t mirror_named_services[] = {
	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
	{mirror_rename_check,				MDRNM_CHECK	    },
	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
	{(intptr_t (*)()) mirror_exchange_parent_update_to,
			MDRNM_PARENT_UPDATE_TO},
	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
			MDRNM_SELF_UPDATE_FROM_DOWN },
	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
	{ NULL, 0 }
};
5782 | |
/*
 * Operations vector through which the md framework dispatches into the
 * mirror driver.  Unsupported entry points are NULL.
 */
md_ops_t mirror_md_ops = {
	mirror_open,		/* open */
	mirror_close,		/* close */
	md_mirror_strategy,	/* strategy */
	NULL,			/* print */
	mirror_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_mirror_ioctl,	/* mirror_ioctl, */
	mirror_snarf,		/* mirror_snarf */
	mirror_halt,		/* mirror_halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	mirror_imp_set,		/* import set */
	mirror_named_services
};
5799 | |
/* module specific initialization */
static void
init_init()
{
	/* Byte offset of the embedded buf_t within an md_mcs_t. */
	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);

	/* Initialize the parent and child save memory pools */
	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
	    sizeof (md_mps_t), 0, mirror_parent_constructor,
	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
	    0);

	/*
	 * Child object size is md_mcs_t with its trailing buf_t replaced
	 * by a biosize()-sized buffer.
	 */
	mirror_child_cache = kmem_cache_create("md_mirror_child",
	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
	    mirror_child_constructor, mirror_child_destructor,
	    mirror_run_queue, NULL, NULL, 0);

	/*
	 * Ensure wowbuf_size is a multiple of DEV_BSIZE and clamp it to
	 * the range [2 * DEV_BSIZE, 32 * DEV_BSIZE], then initialize the
	 * wowbuf memory pool.
	 */
	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
	if (md_wowbuf_size <= 0)
		md_wowbuf_size = 2 * DEV_BSIZE;
	if (md_wowbuf_size > (32 * DEV_BSIZE))
		md_wowbuf_size = (32 * DEV_BSIZE);

	/* Each wow block is a wowhdr_t header followed by the buffer. */
	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);

	/* Module-global locks, torn down again in fini_uninit(). */
	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
}
5836 | |
5837 /* module specific uninitilization (undo init_init()) */ | |
5838 static void | |
5839 fini_uninit() | |
5840 { | |
5841 kmem_cache_destroy(mirror_parent_cache); | |
5842 kmem_cache_destroy(mirror_child_cache); | |
5843 kmem_cache_destroy(mirror_wowblk_cache); | |
5844 mirror_parent_cache = mirror_child_cache = | |
5845 mirror_wowblk_cache = NULL; | |
5846 | |
5847 mutex_destroy(&mirror_timeout.dr_mx); | |
5848 mutex_destroy(&hotspare_request.dr_mx); | |
5849 mutex_destroy(&non_ff_drv_mutex); | |
5850 } | |
5851 | |
/*
 * define the module linkage: register this misc module with the md
 * plugin framework, supplying the init/fini hooks above.
 */
MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())