Mercurial > illumos > illumos-gate
changeset 8452:89d32dfdae6e
6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli error
6692015 SVM global reader-writer mutex priority inversion causes deadlock under load
6725904 callers of meta_getdnp_bydevid() should do so correctly
6726615 Bruichladdich - SVM support for sQFS on mirrors in SunCluster
6756133 MD_MN_MSG_MDDB_PARSE message passes incorrect message size when used
6758399 mdmn_ksend_message() retries door_ki_upcall() without resetting data_ptr/data_size
6766848 mdcommd assumes SVCXPRT will survive thr_create()
6769738 md_mps_t not completely cleared before re-use
line wrap: on
line diff
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdcomm.xml Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/lvm/rpc.mdcommd/mdcomm.xml Wed Dec 24 08:23:40 2008 -0700 @@ -1,7 +1,7 @@ <?xml version='1.0'?> <!DOCTYPE service_bundle SYSTEM '/usr/share/lib/xml/dtd/service_bundle.dtd.1'> <!-- - Copyright 2007 Sun Microsystems, Inc. All rights reserved. + Copyright 2008 Sun Microsystems, Inc. All rights reserved. Use is subject to license terms. CDDL HEADER START @@ -23,8 +23,6 @@ CDDL HEADER END - pragma ident "%Z%%M% %I% %E% SMI" - NOTE: This service manifest is not editable; its contents will be overwritten by package or patch operations, including operating system upgrade. Make customizations in a different @@ -82,8 +80,8 @@ <propval name='endpoint_type' type='astring' value='tli' /> <propval name='wait' type='boolean' value='true' /> <propval name='isrpc' type='boolean' value='true' /> - <propval name='rpc_low_version' type='integer' value='1' /> - <propval name='rpc_high_version' type='integer' value='1' /> + <propval name='rpc_low_version' type='integer' value='2' /> + <propval name='rpc_high_version' type='integer' value='2' /> <propval name='proto' type='astring' value='tcp' /> </property_group>
--- a/usr/src/cmd/lvm/rpc.mdcommd/mddoors.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/lvm/rpc.mdcommd/mddoors.c Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <door.h> #include <locale.h> #include <meta.h> @@ -106,7 +104,7 @@ if (close(daemon_lock_fd) == -1) { syslog(LOG_DAEMON | LOG_DEBUG, gettext("close(%s) failed - %s\n"), - daemon_lock_file, strerror(errno)); + daemon_lock_file, strerror(errno)); return; } unlink(daemon_lock_file); @@ -133,37 +131,32 @@ md_mn_kresult_t kresult; md_mn_kmsg_t *kmsg = (md_mn_kmsg_t *)(void *)argp; - err = mdmn_send_message(kmsg->kmsg_setno, - kmsg->kmsg_type, - kmsg->kmsg_flags, - (char *)&(kmsg->kmsg_data), - kmsg->kmsg_size, - &result, - &ep); + err = mdmn_send_message(kmsg->kmsg_setno, kmsg->kmsg_type, + kmsg->kmsg_flags, kmsg->kmsg_recipient, (char *)&(kmsg->kmsg_data), + kmsg->kmsg_size, &result, &ep); + if (result == NULL) { kresult.kmmr_comm_state = MDMNE_RPC_FAIL; } else { kresult.kmmr_comm_state = result->mmr_comm_state; - } - if (err == 0) { - kresult.kmmr_msgtype = result->mmr_msgtype; - kresult.kmmr_flags = result->mmr_flags; - kresult.kmmr_exitval = result->mmr_exitval; - kresult.kmmr_failing_node = result->mmr_failing_node; - size = result->mmr_out_size; - if (size > 0) { - 
/* This is the maximum of data we can transfer, here */ - if (size > MDMN_MAX_KRES_DATA) { - size = MDMN_MAX_KRES_DATA; + if (err == 0) { + kresult.kmmr_msgtype = result->mmr_msgtype; + kresult.kmmr_flags = result->mmr_flags; + kresult.kmmr_exitval = result->mmr_exitval; + kresult.kmmr_failing_node = result->mmr_failing_node; + size = result->mmr_out_size; + if (size > 0) { + /* This is the max data we can transfer, here */ + if (size > MDMN_MAX_KRES_DATA) { + size = MDMN_MAX_KRES_DATA; + } + bcopy(result->mmr_out, &(kresult.kmmr_res_data), + size); + kresult.kmmr_res_size = size; + } else { + kresult.kmmr_res_size = 0; } - bcopy(result->mmr_out, &(kresult.kmmr_res_data), size); - kresult.kmmr_res_size = size; - } else { - kresult.kmmr_res_size = 0; } - } - - if (result != NULL) { free_result(result); } @@ -252,7 +245,7 @@ * At this point we are single threaded. * We give mdmn_send_message() a chance to initialize safely. */ - (void) mdmn_send_message(0, 0, 0, 0, 0, 0, 0); + (void) mdmn_send_message(0, 0, 0, 0, 0, 0, 0, 0); /* setup the door handle */ mdmn_door_handle = door_create(door2rpc, NULL, @@ -266,12 +259,12 @@ if (metaioctl(MD_MN_SET_DOORH, &mdmn_door_handle, &ep, "mddoors") != 0) { syslog(LOG_DAEMON | LOG_DEBUG, gettext( - "Couldn't set door handle")); + "Couldn't set door handle")); exit(1); } (void) pause(); syslog(LOG_DAEMON | LOG_ERR, gettext( - "Unexpected exit from pause()")); + "Unexpected exit from pause()")); return (1); }
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c Wed Dec 24 08:23:40 2008 -0700 @@ -18,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> @@ -42,38 +41,40 @@ /* * This is the communication daemon for SVM Multi Node Disksets. * It runs on every node and provides the following rpc services: - * - mdmn_send_svc_1 - * - mdmn_work_svc_1 - * - mdmn_wakeup_initiator_svc_1 - * - mdmn_wakeup_master_svc_1 - * - mdmn_comm_lock_svc_1 - * - mdmn_comm_unlock_svc_1 - * - mdmn_comm_suspend_svc_1 - * - mdmn_comm_resume_svc_1 - * - mdmn_comm_reinit_set_svc_1 + * - mdmn_send_svc_2 + * - mdmn_work_svc_2 + * - mdmn_wakeup_initiator_svc_2 + * - mdmn_wakeup_master_svc_2 + * - mdmn_comm_lock_svc_2 + * - mdmn_comm_unlock_svc_2 + * - mdmn_comm_suspend_svc_2 + * - mdmn_comm_resume_svc_2 + * - mdmn_comm_reinit_set_svc_2 * where send, lock, unlock and reinit are meant for external use, * work and the two wakeups are for internal use only. * * NOTE: - * On every node only one of those xxx_1 functions can be active at the + * On every node only one of those xxx_2 functions can be active at the * same time because the daemon is single threaded. * + * (not quite true, as mdmn_send_svc_2 and mdmn_work_svc_2 do thr_create()s + * as part of their handlers, so those aspects are multi-threaded) * * In case an event occurs that has to be propagated to all the nodes... * * One node (the initiator) * calls the libmeta function mdmn_send_message() - * This function calls the local daemon thru mdmn_send_svc_1. + * This function calls the local daemon thru mdmn_send_svc_2. 
* * On the initiator: - * mdmn_send_svc_1() + * mdmn_send_svc_2() * - starts a thread -> mdmn_send_to_work() and returns. * mdmn_send_to_work() * - sends this message over to the master of the diskset. - * This is done by calling mdmn_work_svc_1 on the master. + * This is done by calling mdmn_work_svc_2 on the master. * - registers to the initiator_table * - exits without doing a svc_sendreply() for the call to - * mdmn_send_svc_1. This means that call is blocked until somebody + * mdmn_send_svc_2. This means that call is blocked until somebody * (see end of this comment) does a svc_sendreply(). * This means mdmn_send_message() does not yet return. * - A timeout surveillance is started at this point. @@ -82,42 +83,42 @@ * to the caller. * * On the master: - * mdmn_work_svc_1() + * mdmn_work_svc_2() * - starts a thread -> mdmn_master_process_msg() and returns * mdmn_master_process_msg() * - logs the message to the change log * - executes the message locally * - flags the message in the change log - * - sends the message to mdmn_work_svc_1() on all the + * - sends the message to mdmn_work_svc_2() on all the * other nodes (slaves) - * after each call to mdmn_work_svc_1 the thread goes to sleep and - * will be woken up by mdmn_wakeup_master_svc_1() as soon as the + * after each call to mdmn_work_svc_2 the thread goes to sleep and + * will be woken up by mdmn_wakeup_master_svc_2() as soon as the * slave node is done with this message. * - In case the slave doesn't respond in a apropriate time, an error * is assumed to ensure the master doesn't wait forever. * * On a slave: - * mdmn_work_svc_1() + * mdmn_work_svc_2() * - starts a thread -> mdmn_slave_process_msg() and returns * mdmn_slave_process_msg() * - processes this message locally by calling the appropriate message * handler, that creates some result. - * - sends that result thru a call to mdmn_wakeup_master_svc_1() to + * - sends that result thru a call to mdmn_wakeup_master_svc_2() to * the master. 
* * Back on the master: - * mdmn_wakeup_master_svc_1() + * mdmn_wakeup_master_svc_2() * - stores the result into the master_table. * - signals the mdmn_master_process_msg-thread. * - returns * mdmn_master_process_msg() * - after getting the results from all nodes * - sends them back to the initiating node thru a call to - * mdmn_wakeup_initiator_svc_1. + * mdmn_wakeup_initiator_svc_2. * * Back on the initiator: - * mdmn_wakeup_initiator_svc_1() - * - calls svc_sendreply() which makes the call to mdmn_send_svc_1() + * mdmn_wakeup_initiator_svc_2() + * - calls svc_sendreply() which makes the call to mdmn_send_svc_2() * return. * which allows the initial mdmn_send_message() call to return. */ @@ -195,8 +196,8 @@ { md_mnnode_desc *node = (md_mnnode_desc *)data; - return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, ONE, "tcp", - time_out)); + return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, TWO, "tcp", + time_out)); } #define FLUSH_DEBUGFILE() \ @@ -219,15 +220,15 @@ if (master_err != MDMNE_ACK) { snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on master " - "when processing message type %d\n", type); + "when processing message type %d\n", type); } else if (slave_result == NULL) { snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on node " - "%d when processing message type %d\n", nid, type); + "%d when processing message type %d\n", nid, type); } else { snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: Inconsistent " - "return value from node %d when processing message " - "type %d. Master exitval = %d, Slave exitval = %d\n", - nid, type, master_exitval, slave_result->mmr_exitval); + "return value from node %d when processing message " + "type %d. 
Master exitval = %d, Slave exitval = %d\n", + nid, type, master_exitval, slave_result->mmr_exitval); } commd_err.size = strlen(msg_buf); commd_err.md_message = (uint64_t)(uintptr_t)&msg_buf[0]; @@ -335,12 +336,17 @@ commd_debug(MD_MMV_MISC, "timeout_ini: (%d, 0x%llx-%d)\n", MSGID_ELEMS(mid)); + /* + * Give the result the corresponding msgid from the failed message. + */ + MSGID_COPY(&mid, &(resultp->mmr_msgid)); /* return to mdmn_send_message() and let it deal with the situation */ mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); free(resultp); commd_debug(MD_MMV_MISC, "timeout_ini: sendreplied\n"); + svc_done(transp); mdmn_unregister_initiator_table(setno, class); } @@ -499,13 +505,13 @@ * Perform some global initializations. * * the following routines have to call this before operation can start: - * - mdmn_send_svc_1 - * - mdmn_work_svc_1 - * - mdmn_comm_lock_svc_1 - * - mdmn_comm_unlock_svc_1 - * - mdmn_comm_suspend_svc_1 - * - mdmn_comm_resume_svc_1 - * - mdmn_comm_reinit_set_svc_1 + * - mdmn_send_svc_2 + * - mdmn_work_svc_2 + * - mdmn_comm_lock_svc_2 + * - mdmn_comm_unlock_svc_2 + * - mdmn_comm_suspend_svc_2 + * - mdmn_comm_resume_svc_2 + * - mdmn_comm_reinit_set_svc_2 * * This is a single threaded daemon, so it can only be in one of the above * routines at the same time. 
@@ -547,8 +553,7 @@ __savetime = gethrtime(); (void) time(&clock_val); - commd_debug(MD_MMV_MISC, "global init called %s\n", - ctime(&clock_val)); + commd_debug(MD_MMV_MISC, "global init called %s\n", ctime(&clock_val)); /* start a thread that flushes out the debug on a regular basis */ thr_create(NULL, 0, (void *(*)(void *))flush_fcout, @@ -663,9 +668,9 @@ */ while ((client[setno][nid] == (CLIENT *) NULL) && (tout < MD_CLNT_CREATE_TOUT)) { - client[setno][nid] = meta_client_create_retry - (node->nd_nodename, mdmn_clnt_create, - (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep); + client[setno][nid] = meta_client_create_retry( + node->nd_nodename, mdmn_clnt_create, + (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep); /* Is the node dead? */ if (mdmn_is_node_dead(node) == 1) { commd_debug(MD_MMV_SYSLOG, @@ -889,9 +894,9 @@ */ while ((client[setno][nid] == (CLIENT *) NULL) && (tout < MD_CLNT_CREATE_TOUT)) { - client[setno][nid] = meta_client_create_retry - (node->nd_nodename, mdmn_clnt_create, - (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep); + client[setno][nid] = meta_client_create_retry( + node->nd_nodename, mdmn_clnt_create, + (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep); /* Is the node dead? 
*/ if (mdmn_is_node_dead(node) == 1) { commd_debug(MD_MMV_SYSLOG, @@ -942,7 +947,7 @@ void * mdmn_send_to_work(void *arg) { - int *rpc_err; + int *rpc_err = NULL; int success; int try_master; set_t setno; @@ -956,9 +961,6 @@ msg = matp->mat_msg; transp = matp->mat_transp; - /* the alloc was done in mdmn_send_svc_1 */ - free(matp); - class = mdmn_get_message_class(msg->msg_type); setno = msg->msg_setno; @@ -980,8 +982,7 @@ if (success == MDMNE_CLASS_BUSY) { md_mn_msgid_t active_mid; - mdmn_get_initiator_table_id(setno, class, - &active_mid); + mdmn_get_initiator_table_id(setno, class, &active_mid); commd_debug(MD_MMV_SEND, "send_to_work: received but locally busy " @@ -1011,7 +1012,8 @@ * Send the request to the work function on the master * this call will return immediately */ - rpc_err = mdmn_work_1(msg, client[setno][set_master]); + rpc_err = mdmn_work_2(msg, client[setno][set_master], + set_master); /* Everything's Ok? */ if (rpc_err == NULL) { @@ -1043,7 +1045,7 @@ /* * If we are here, we sucessfully delivered the message. * We register the initiator_table, so that - * wakeup_initiator_1 can do the sendreply with the + * wakeup_initiator_2 can do the sendreply with the * results for us. */ success = MDMNE_ACK; @@ -1068,15 +1070,27 @@ md_mn_result_t *resultp; resultp = Zalloc(sizeof (md_mn_result_t)); resultp->mmr_comm_state = success; + /* + * copy the MSGID so that we know _which_ message + * failed (if the transp has got mangled) + */ + MSGID_COPY(&(msg->msg_msgid), &(resultp->mmr_msgid)); mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); commd_debug(MD_MMV_SEND, "send_to_work: not registered (%d, 0x%llx-%d) cs=%d\n", MSGID_ELEMS(msg->msg_msgid), success); free_result(resultp); + /* + * We don't have a timeout registered to wake us up, so we're + * now done with this handle. Release it back to the pool. 
+ */ + svc_done(transp); } free_msg(msg); + /* the alloc was done in mdmn_send_svc_2 */ + Free(matp); mutex_unlock(mx); return (NULL); @@ -1186,7 +1200,7 @@ int timeout_retries = 0; int *ret = NULL; set_t setno; - cond_t *cv; /* see mdmn_wakeup_master_svc_1 */ + cond_t *cv; /* see mdmn_wakeup_master_svc_2 */ mutex_t *mx; /* protection for class_busy */ timestruc_t timeout; /* surveillance for remote daemon */ md_mn_nodeid_t nid; @@ -1251,7 +1265,7 @@ } /* send it over, it will return immediately */ - ret = mdmn_work_1(msg, client[setno][nid]); + ret = mdmn_work_2(msg, client[setno][nid], nid); rw_unlock(&client_rwlock[setno]); @@ -1462,7 +1476,7 @@ result->mmr_comm_state = MDMNE_LOG_FAIL; /* * Note that the mark_busy was already done by - * mdmn_work_svc_1() + * mdmn_work_svc_2() */ mutex_lock(&mdmn_busy_mutex[setno]); mdmn_mark_class_unbusy(setno, orig_class); @@ -1487,8 +1501,8 @@ commd_debug(MD_MMV_SYSLOG, "proc_mas: No client for initiator \n"); } else { - ret = mdmn_wakeup_initiator_1(result, - client[setno][sender]); + ret = mdmn_wakeup_initiator_2(result, + client[setno][sender], sender); } rw_unlock(&client_rwlock[setno]); @@ -1677,6 +1691,12 @@ continue; } + /* If a DIRECTED message, skip non-recipient nodes */ + if ((cmsg->msg_flags & MD_MSGF_DIRECTED) && + nid != cmsg->msg_recipient) { + continue; + } + mutex_lock(mx); /* * Register the node that is addressed, @@ -1865,7 +1885,8 @@ commd_debug(MD_MMV_SYSLOG, "proc_mas: unable to create client for initiator\n"); } else { - ret = mdmn_wakeup_initiator_1(result, client[setno][sender]); + ret = mdmn_wakeup_initiator_2(result, client[setno][sender], + sender); } rw_unlock(&client_rwlock[setno]); @@ -2046,14 +2067,14 @@ rw_unlock(&client_rwlock[setno]); break; } else { - ret = mdmn_wakeup_master_1(result, - client[setno][sender]); + ret = mdmn_wakeup_master_2(result, + client[setno][sender], sender); /* - * if mdmn_wakeup_master_1 returns NULL, it can be that + * if mdmn_wakeup_master_2 returns NULL, it can be 
that * the master (or the commd on the master) had died. * In that case, we destroy the client to the master * and retry. - * If mdmn_wakeup_master_1 doesn't return MDMNE_ACK, + * If mdmn_wakeup_master_2 doesn't return MDMNE_ACK, * the commd on the master is alive but * something else is wrong, * in that case a retry doesn't make sense => break out @@ -2097,8 +2118,19 @@ } -md_mn_result_t * -mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp) +/* + * mdmn_send_svc_2: + * --------------- + * Check that the issuing node is a legitimate one (i.e. is licensed to send + * messages to us), that the RPC request can be staged. + * + * Returns: + * 0 => no RPC request is in-flight, no deferred svc_sendreply() + * 1 => queued RPC request in-flight. Completion will be made (later) + * by a wakeup_initiator_2() [hopefully] + */ +int +mdmn_send_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp) { int err; set_t setno; @@ -2121,7 +2153,7 @@ mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); free_result(resultp); svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); - return (NULL); + return (0); } /* check if the global initialization is done */ @@ -2152,7 +2184,7 @@ (char *)resultp); free_result(resultp); svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); - return (NULL); + return (0); } } @@ -2169,7 +2201,7 @@ mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); free_result(resultp); svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); - return (NULL); + return (0); } @@ -2184,10 +2216,10 @@ free_result(resultp); svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); commd_debug(MD_MMV_SEND, - "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, " - "type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class, - msg->msg_type); - return (NULL); + "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, " + "type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class, + msg->msg_type); + return (0); } @@ -2213,7 +2245,7 @@ free_result(resultp); 
commd_debug(MD_MMV_SEND, "send: init err = %d\n", err); - return (NULL); + return (0); } } @@ -2227,10 +2259,10 @@ svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); free_result(resultp); commd_debug(MD_MMV_SEND, - "send: class suspended (%d, 0x%llx-%d), set=%d, " - "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid), - setno, class, msg->msg_type); - return (NULL); + "send: class suspended (%d, 0x%llx-%d), set=%d, " + "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid), + setno, class, msg->msg_type); + return (0); } mutex_unlock(&mdmn_busy_mutex[setno]); @@ -2238,10 +2270,10 @@ if (check_license(rqstp, 0) == FALSE) { svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); commd_debug(MD_MMV_SEND, - "send: check licence fail(%d, 0x%llx-%d), set=%d, " - "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid), - setno, class, msg->msg_type); - return (NULL); + "send: check licence fail(%d, 0x%llx-%d), set=%d, " + "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid), + setno, class, msg->msg_type); + return (0); } @@ -2268,17 +2300,17 @@ MSGID_ELEMS(msg->msg_msgid)); /* * We return here without sending results. This will be done by - * mdmn_wakeup_initiator_svc_1() as soon as the results are available. + * mdmn_wakeup_initiator_svc_2() as soon as the results are available. * Until then the calling send_message will be blocked, while we * are able to take calls. 
*/ - return (NULL); + return (1); } /* ARGSUSED */ int * -mdmn_work_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp) +mdmn_work_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp) { int err; set_t setno; @@ -2362,7 +2394,7 @@ mutex_lock(&mdmn_busy_mutex[setno]); - /* check if class is locked via a call to mdmn_comm_lock_svc_1 */ + /* check if class is locked via a call to mdmn_comm_lock_svc_2 */ if (mdmn_is_class_locked(setno, class) == TRUE) { mutex_unlock(&mdmn_busy_mutex[setno]); *retval = MDMNE_CLASS_LOCKED; @@ -2430,14 +2462,14 @@ /* ARGSUSED */ int * -mdmn_wakeup_initiator_svc_1(md_mn_result_t *res, struct svc_req *rqstp) +mdmn_wakeup_initiator_svc_2(md_mn_result_t *res, struct svc_req *rqstp) { int *retval; int err; set_t setno; mutex_t *mx; /* protection of initiator_table */ - SVCXPRT *transp; + SVCXPRT *transp = NULL; md_mn_msgid_t initiator_table_id; md_mn_msgclass_t class; @@ -2491,13 +2523,14 @@ * Search the initiator wakeup table. * If we find an entry here (which should always be true) * we are on the initiating node and we wakeup the original - * local rpc call + * local rpc call. */ mdmn_get_initiator_table_id(setno, class, &initiator_table_id); if (MSGID_CMP(&(initiator_table_id), &(res->mmr_msgid))) { transp = mdmn_get_initiator_table_transp(setno, class); mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)res); + svc_done(transp); mdmn_unregister_initiator_table(setno, class); *retval = MDMNE_ACK; @@ -2532,7 +2565,7 @@ */ /* ARGSUSED */ int * -mdmn_wakeup_master_svc_1(md_mn_result_t *ores, struct svc_req *rqstp) +mdmn_wakeup_master_svc_2(md_mn_result_t *ores, struct svc_req *rqstp) { int *retval; @@ -2645,7 +2678,7 @@ * This is mainly done for debug purpose. * This set/class combination immediately is blocked, * even in the middle of sending messages to multiple slaves. - * This remains until the user issues a mdmn_comm_unlock_svc_1 for the same + * This remains until the user issues a mdmn_comm_unlock_svc_2 for the same * set/class combination. 
* * Special messages of class MD_MSG_CLASS0 can never be locked. @@ -2666,7 +2699,7 @@ /* ARGSUSED */ int * -mdmn_comm_lock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) +mdmn_comm_lock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp) { int *retval; set_t setno = msc->msc_set; @@ -2722,7 +2755,7 @@ */ /* ARGSUSED */ int * -mdmn_comm_unlock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) +mdmn_comm_unlock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp) { int *retval; set_t setno = msc->msc_set; @@ -2766,7 +2799,7 @@ } /* - * mdmn_comm_suspend_svc_1(setno, class) + * mdmn_comm_suspend_svc_2(setno, class) * * Drain all outstanding messages for a given set/class combination * and don't allow new messages to be processed. @@ -2812,7 +2845,7 @@ /* ARGSUSED */ int * -mdmn_comm_suspend_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) +mdmn_comm_suspend_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp) { int *retval; int failure = 0; @@ -2902,7 +2935,7 @@ } /* - * mdmn_comm_resume_svc_1(setno, class) + * mdmn_comm_resume_svc_2(setno, class) * * Resume processing messages for a given set. * This incorporates the repeal of a previous suspend operation. @@ -2927,7 +2960,7 @@ */ /* ARGSUSED */ int * -mdmn_comm_resume_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) +mdmn_comm_resume_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp) { int *retval; set_t startset, endset; @@ -3029,7 +3062,7 @@ } /* ARGSUSED */ int * -mdmn_comm_reinit_set_svc_1(set_t *setnop, struct svc_req *rqstp) +mdmn_comm_reinit_set_svc_2(set_t *setnop, struct svc_req *rqstp) { int *retval; md_mnnode_desc *node; @@ -3093,7 +3126,7 @@ /* ARGSUSED */ int * -mdmn_comm_msglock_svc_1(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp) +mdmn_comm_msglock_svc_2(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp) { int *retval; md_mn_msgtype_t type = mmtl->mmtl_type;
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_service.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_service.c Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/lvm/mdmn_commd.h> #include <stdio.h> #include <stdlib.h> /* getenv, exit */ @@ -60,16 +58,16 @@ static int _rpcsvcstate = _IDLE; /* Set when a request is serviced */ static int _rpcsvccount = 0; /* Number of requests being serviced */ -extern md_mn_result_t *mdmn_send_svc_1(); -extern int *mdmn_work_svc_1(); -extern int *mdmn_wakeup_initiator_svc_1(); -extern int *mdmn_wakeup_master_svc_1(); -extern int *mdmn_comm_lock_svc_1(); -extern int *mdmn_comm_unlock_svc_1(); -extern int *mdmn_comm_suspend_svc_1(); -extern int *mdmn_comm_resume_svc_1(); -extern int *mdmn_comm_reinit_set_svc_1(); -extern int *mdmn_comm_msglock_svc_1(); +extern int mdmn_send_svc_2(); +extern int *mdmn_work_svc_2(); +extern int *mdmn_wakeup_initiator_svc_2(); +extern int *mdmn_wakeup_master_svc_2(); +extern int *mdmn_comm_lock_svc_2(); +extern int *mdmn_comm_unlock_svc_2(); +extern int *mdmn_comm_suspend_svc_2(); +extern int *mdmn_comm_resume_svc_2(); +extern int *mdmn_comm_reinit_set_svc_2(); +extern int *mdmn_comm_msglock_svc_2(); static void @@ -107,7 +105,7 @@ } 
static void -mdmn_commd_1(rqstp, transp) +mdmn_commd_2(rqstp, transp) struct svc_req *rqstp; register SVCXPRT *transp; { @@ -124,7 +122,6 @@ char *(*local)(); int free_result = 0; - _rpcsvccount++; switch (rqstp->rq_proc) { case NULLPROC: @@ -132,6 +129,7 @@ (char *)NULL); _rpcsvccount--; _rpcsvcstate = _SERVED; + svc_done(transp); return; case mdmn_send: @@ -140,81 +138,94 @@ (void) memset((char *)&argument, 0, sizeof (argument)); if (!svc_getargs(transp, _xdr_argument, (caddr_t)&argument)) { svcerr_decode(transp); + svc_done(transp); _rpcsvccount--; _rpcsvcstate = _SERVED; return; } /* - * mdmn_send_1 will not always do a sendreply. + * mdmn_send_2 will not always do a sendreply. * it will register in a table and let the mdmn_wakeup1 * do the sendreply for that call. * in order to register properly we need the transp handle + * If we get a 0 back from mdmn_send_svc_2() we have no pending + * RPC in-flight, so we drop the service count. */ - (void) mdmn_send_svc_1((md_mn_msg_t *)&argument, rqstp); + if (mdmn_send_svc_2((md_mn_msg_t *)&argument, rqstp) == 0) { + _rpcsvccount--; + _rpcsvcstate = _SERVED; + svc_done(rqstp->rq_xprt); + } - return; /* xdr_free is called by mdmn_wakeup_initiator_svc_1 */ + return; /* xdr_free is called by mdmn_wakeup_initiator_svc_2 */ case mdmn_work: _xdr_argument = xdr_md_mn_msg_t; _xdr_result = xdr_int; - local = (char *(*)()) mdmn_work_svc_1; + local = (char *(*)()) mdmn_work_svc_2; free_result = 1; break; case mdmn_wakeup_master: _xdr_argument = xdr_md_mn_result_t; _xdr_result = xdr_int; - local = (char *(*)()) mdmn_wakeup_master_svc_1; + local = (char *(*)()) mdmn_wakeup_master_svc_2; free_result = 1; break; case mdmn_wakeup_initiator: + /* + * We must have had an in-flight RPC request to get here, + * so drop the in-flight count. 
+ */ _xdr_argument = xdr_md_mn_result_t; _xdr_result = xdr_int; - local = (char *(*)()) mdmn_wakeup_initiator_svc_1; + local = (char *(*)()) mdmn_wakeup_initiator_svc_2; free_result = 1; + _rpcsvccount--; break; case mdmn_comm_lock: _xdr_argument = xdr_md_mn_set_and_class_t; _xdr_result = xdr_int; - local = (char *(*)()) mdmn_comm_lock_svc_1; + local = (char *(*)()) mdmn_comm_lock_svc_2; break; case mdmn_comm_unlock: _xdr_argument = xdr_md_mn_set_and_class_t; _xdr_result = xdr_int; - local = (char *(*)()) mdmn_comm_unlock_svc_1; + local = (char *(*)()) mdmn_comm_unlock_svc_2; break; case mdmn_comm_suspend: _xdr_argument = xdr_md_mn_set_and_class_t; _xdr_result = xdr_int; - local = (char *(*)()) mdmn_comm_suspend_svc_1; + local = (char *(*)()) mdmn_comm_suspend_svc_2; break; case mdmn_comm_resume: _xdr_argument = xdr_md_mn_set_and_class_t; _xdr_result = xdr_int; - local = (char *(*)()) mdmn_comm_resume_svc_1; + local = (char *(*)()) mdmn_comm_resume_svc_2; break; case mdmn_comm_reinit_set: _xdr_argument = xdr_u_int; _xdr_result = xdr_int; - local = (char *(*)()) mdmn_comm_reinit_set_svc_1; + local = (char *(*)()) mdmn_comm_reinit_set_svc_2; break; case mdmn_comm_msglock: _xdr_argument = xdr_md_mn_type_and_lock_t; _xdr_result = xdr_int; - local = (char *(*)()) mdmn_comm_msglock_svc_1; + local = (char *(*)()) mdmn_comm_msglock_svc_2; break; default: svcerr_noproc(transp); _rpcsvccount--; _rpcsvcstate = _SERVED; + svc_done(transp); return; } (void) memset((char *)&argument, 0, sizeof (argument)); @@ -222,6 +233,7 @@ svcerr_decode(transp); _rpcsvccount--; _rpcsvcstate = _SERVED; + svc_done(transp); return; } result = (*local)(&argument, rqstp); @@ -231,12 +243,15 @@ } if (!svc_freeargs(transp, _xdr_argument, (caddr_t)&argument)) { _msgout(gettext("unable to free arguments")); + svc_done(transp); exit(1); } if (free_result == 1) { free(result); } + + svc_done(transp); _rpcsvccount--; _rpcsvcstate = _SERVED; } @@ -249,6 +264,7 @@ exit_commd() { md_error_t ep = 
mdnullerror; + syslog(LOG_DAEMON | LOG_DEBUG, gettext("mdcommd exiting")); (void) metaioctl(MD_MN_SET_COMMD_RUNNING, 0, &ep, "rpc.mdcommd"); } @@ -259,10 +275,23 @@ pid_t pid; int i; md_error_t ep = mdnullerror; + int mode = RPC_SVC_MT_USER; (void) sigset(SIGPIPE, SIG_IGN); /* + * Attempt to set MT_USER behaviour for mdcommd service routines. + * If this isn't done, there is a possibility that the transport + * handle might be freed before the thread created by mdmn_send_svc_2 + * can use it. A consequence of this is that svc_done() must be + * called on the handle when it's no longer needed. + */ + if (rpc_control(RPC_SVC_MTMODE_SET, &mode) == FALSE) { + _msgout(gettext("cannot set MT_USER mode for RPC service")); + exit(1); + } + + /* * If stdin looks like a TLI endpoint, we assume * that we were started by a port monitor. If * t_getstate fails with TBADF, this is not a @@ -294,9 +323,9 @@ } if (nconf) freenetconfigent(nconf); - if (!svc_reg(transp, MDMN_COMMD, ONE, mdmn_commd_1, 0)) { + if (!svc_reg(transp, MDMN_COMMD, TWO, mdmn_commd_2, 0)) { _msgout(gettext( - "unable to register (MDMN_COMMD, ONE).")); + "unable to register (MDMN_COMMD, TWO).")); exit(1); } @@ -307,7 +336,8 @@ (void) alarm(_RPCSVC_CLOSEDOWN/2); } - (void) metaioctl(MD_MN_SET_COMMD_RUNNING, (void *)1, &ep, + pid = getpid(); + (void) metaioctl(MD_MN_SET_COMMD_RUNNING, (void *)pid, &ep, "rpc.mdcommd"); svc_run(); exit(1); @@ -343,8 +373,8 @@ openlog("mdmn_commd", LOG_PID, LOG_DAEMON); #endif } - if (!svc_create(mdmn_commd_1, MDMN_COMMD, ONE, "tcp")) { - _msgout(gettext("unable to create (MDMN_COMMD, ONE) for tcp.")); + if (!svc_create(mdmn_commd_2, MDMN_COMMD, TWO, "tcp")) { + _msgout(gettext("unable to create (MDMN_COMMD, TWO) for tcp.")); exit(1); }
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <unistd.h> #include <sys/types.h> #include <sys/socket.h> @@ -446,6 +444,7 @@ commd_debug(dbc, "%s sender = %d\n", prefix, msg->msg_sender); commd_debug(dbc, "%s flags = 0x%x\n", prefix, msg->msg_flags); commd_debug(dbc, "%s setno = %d\n", prefix, msg->msg_setno); + commd_debug(dbc, "%s recipient = %d\n", prefix, msg->msg_recipient); commd_debug(dbc, "%s type = %d\n", prefix, msg->msg_type); commd_debug(dbc, "%s size = %d\n", prefix, msg->msg_event_size); if (msg->msg_event_size) { @@ -513,9 +512,8 @@ class = msg->msg_msgid.mid_oclass; } - mct_index = submsg + - class * MAX_SUBMESSAGES + - nodeid * MAX_SUBMESSAGES * MD_MN_NCLASSES; + mct_index = submsg + class * MAX_SUBMESSAGES + + nodeid * MAX_SUBMESSAGES * MD_MN_NCLASSES; mct_offset = mct_index * sizeof (md_mn_mce_t); @@ -694,12 +692,12 @@ } } commd_debug(MD_MMV_MISC, - "mdmn_check_completion: msg already processed \n"); + "mdmn_check_completion: msg already processed \n"); dump_result(MD_MMV_MISC, "mdmn_check_completion", result); return (MDMN_MCT_DONE); } commd_debug(MD_MMV_MISC, - "mdmn_check_completion: msg 
not yet processed\n"); + "mdmn_check_completion: msg not yet processed\n"); return (MDMN_MCT_NOT_DONE); }
--- a/usr/src/cmd/lvm/util/metaclust.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/lvm/util/metaclust.c Wed Dec 24 08:23:40 2008 -0700 @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <meta.h> #include <sdssc.h> #include <signal.h> @@ -117,6 +115,8 @@ sigalarmhandler(int sig) { int i, n, ret, stat_loc = 0; + FILE *pgcore; + char corecmd[256]; n = sizeof (step_table) / sizeof (step_table[0]); for (i = 0; i < n; i++) { @@ -130,6 +130,25 @@ step_table[i].step_nam, meta_print_hrtime(gethrtime() - start_time)); + /* + * See what the child was actually doing when the timeout expired. + * A core-dump of this would be _really_ good, so let's just + * try a 'gcore -g c_pid' and hope + */ + + (void) memset(corecmd, 0, sizeof (corecmd)); + (void) snprintf(corecmd, sizeof (corecmd), + "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid); + + pgcore = popen(corecmd, "r"); + + if (pgcore == NULL) { + meta_mc_log(MC_LOG1, gettext("Could not grab core for pid %s"), + c_pid); + } else { + (void) pclose(pgcore); + } + if ((ret = kill(c_pid, SIGKILL)) == 0) { /* * The child will wait forever until the status is retrieved @@ -1762,7 +1781,6 @@ "rpc.mdcommd for set %s\n"), sp->setname); md_exit(local_sp, 1); } - meta_ping_mnset(setno); /* Unblock mddb parse messages */ if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/cmd/mdb/common/modules/md/dumpmirror.c Wed Dec 24 08:23:40 2008 -0700 @@ -0,0 +1,230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include "mdinclude.h" + +/* + * Display an arbitrary bitmap by showing the set bits in the array. + * Output will be <start>-<end> for ranges or <position> for singleton bits. + */ +static void +print_mm_bm(unsigned char *bm, uint_t size, char *bm_name) +{ + int i; + int first_set = -1; + int need_comma = 0; + + mdb_printf("%s set bits: ", bm_name); + for (i = 0; i < size; i++) { + if (isset(bm, i)) { + if (first_set == -1) { + first_set = i; + } + } else { + if (first_set != -1) { + if (first_set != (i-1)) { + mdb_printf("%s%u-%u", + (need_comma ? "," : ""), + first_set, (i-1)); + } else { + mdb_printf("%s%u", + (need_comma ? "," : ""), first_set); + } + need_comma = 1; + first_set = -1; + } + } + } + if (first_set != -1) { + mdb_printf("%s%u-%u", (need_comma ? 
"," : ""), first_set, + size-1); + } + mdb_printf("\n"); +} + +/* + * Print uchar_t sized count fields (typically un_pernode_dirty_map entries) + */ + +static void +print_mm_cnt_c(unsigned char *bm, uint_t size, char *bm_name) +{ + int i; + int need_comma = 0; + + mdb_printf("%s set counts: ", bm_name); + for (i = 0; i < size; i++) { + if (bm[i]) { + mdb_printf("%s(%d,%3d)", (need_comma ? "," : ""), i, + (uint_t)bm[i]); + need_comma = 1; + } + } + mdb_printf("\n"); +} + +static void +print_mm_cnt_w(unsigned short *bm, uint_t size, char *bm_name) +{ + int i; + int need_comma = 0; + + mdb_printf("%s set counts: ", bm_name); + for (i = 0; i < size; i++) { + if (bm[i]) { + mdb_printf("%s(%d,%5d)", (need_comma ? "," : ""), i, + (uint_t)bm[i]); + need_comma = 1; + } + } + mdb_printf("\n"); +} + +/* + * Print the associated bitmaps for the specified mm_unit_t + * These are: + * un_pernode_dirty_bm + * un_goingclean_bm + * un_dirty_bm + * un_goingdirty_bm + * un_resync_bm + * + * Associated counts for unit: + * un_pernode_dirty_sum[] (uchar_t) + * un_outstanding_writes[] (ushort_t) + * + */ + +/* ARGSUSED */ +int +printmmbm(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + mm_unit_t mm, *mmp; + unsigned char *rr_dirty_bm, *rr_goingclean_bm, *rr_goingdirty_bm; + unsigned char *rr_resync_bm; + uintptr_t un_dbm, un_gcbm, un_gdbm, un_rrbm, un_pnds, un_ow; + uint_t num_rr, rr_bitmap_size; + int i; + uintptr_t un_pernode_bm; + unsigned char *rr_pernode_dirty, *rr_pnds; + unsigned short *rr_ow; + /* just enough for un_pernode_dirty_bm[] plus three digits */ + char pernode_str[25]; + + if (argc != 0) + return (DCMD_USAGE); + + if (!(flags & DCMD_ADDRSPEC)) { + mdb_warn("No mm_unit_t address specified"); + return (DCMD_ERR); + } + + if (mdb_vread(&mm, sizeof (mm_unit_t), addr) == -1) { + mdb_warn("failed to read mm_unit_t at %p\n", addr); + return (DCMD_ERR); + } + + mmp = &mm; + + num_rr = mm.un_rrd_num; + + un_dbm = (uintptr_t)mmp->un_dirty_bm; + un_gcbm = 
(uintptr_t)mmp->un_goingclean_bm; + un_gdbm = (uintptr_t)mmp->un_goingdirty_bm; + un_rrbm = (uintptr_t)mmp->un_resync_bm; + un_pnds = (uintptr_t)mmp->un_pernode_dirty_sum; + un_ow = (uintptr_t)mmp->un_outstanding_writes; + + rr_bitmap_size = howmany(num_rr, NBBY); + rr_dirty_bm = (unsigned char *)mdb_alloc(rr_bitmap_size, + UM_SLEEP|UM_GC); + rr_goingclean_bm = (unsigned char *)mdb_alloc(rr_bitmap_size, + UM_SLEEP|UM_GC); + rr_goingdirty_bm = (unsigned char *)mdb_alloc(rr_bitmap_size, + UM_SLEEP|UM_GC); + rr_resync_bm = (unsigned char *)mdb_alloc(rr_bitmap_size, + UM_SLEEP|UM_GC); + rr_pnds = (unsigned char *)mdb_alloc(num_rr, UM_SLEEP|UM_GC); + rr_ow = (unsigned short *)mdb_alloc(num_rr * sizeof (unsigned short), + UM_SLEEP|UM_GC); + + if (mdb_vread(rr_dirty_bm, rr_bitmap_size, un_dbm) == -1) { + mdb_warn("failed to read un_dirty_bm at %p\n", un_dbm); + return (DCMD_ERR); + } + if (mdb_vread(rr_goingclean_bm, rr_bitmap_size, un_gcbm) == -1) { + mdb_warn("failed to read un_goingclean_bm at %p\n", un_gcbm); + return (DCMD_ERR); + } + if (mdb_vread(rr_goingdirty_bm, rr_bitmap_size, un_gdbm) == -1) { + mdb_warn("failed to read un_goingdirty_bm at %p\n", un_gdbm); + return (DCMD_ERR); + } + if (mdb_vread(rr_resync_bm, rr_bitmap_size, un_rrbm) == -1) { + mdb_warn("failed to read un_resync_bm at %p\n", un_rrbm); + return (DCMD_ERR); + } + if (mdb_vread(rr_pnds, num_rr, un_pnds) == -1) { + mdb_warn("failed to read un_pernode_dirty_sum at %p\n", + un_pnds); + return (DCMD_ERR); + } + if (mdb_vread(rr_ow, num_rr * sizeof (unsigned short), un_ow) == -1) { + mdb_warn("failed to read un_outstanding_writes at %p\n", un_ow); + return (DCMD_ERR); + } + + print_mm_bm(rr_dirty_bm, num_rr, "un_dirty_bm"); + print_mm_bm(rr_goingclean_bm, num_rr, "un_goingclean_bm"); + print_mm_bm(rr_goingdirty_bm, num_rr, "un_goingdirty_bm"); + print_mm_bm(rr_resync_bm, num_rr, "un_resync_bm"); + + /* + * Load all the un_pernode_bm[] entries and iterate through the non- + * NULL entries + */ + 
rr_pernode_dirty = (unsigned char *)mdb_alloc(rr_bitmap_size, + UM_SLEEP|UM_GC); + + for (i = 0; i < 128; i++) { + un_pernode_bm = (uintptr_t)mmp->un_pernode_dirty_bm[i]; + if (un_pernode_bm) { + mdb_snprintf(pernode_str, sizeof (pernode_str), + "un_pernode_dirty_bm[%d]", i); + if (mdb_vread(rr_pernode_dirty, rr_bitmap_size, + un_pernode_bm) == -1) { + mdb_warn("failed to read %s at %p\n", + pernode_str, un_pernode_bm); + return (DCMD_ERR); + } + print_mm_bm(rr_pernode_dirty, num_rr, pernode_str); + } + } + print_mm_cnt_c(rr_pnds, num_rr, "un_pernode_dirty_sum"); + + print_mm_cnt_w(rr_ow, num_rr, "un_outstanding_writes"); + + return (DCMD_OK); +}
--- a/usr/src/cmd/mdb/common/modules/md/md.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/mdb/common/modules/md/md.c Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/mdb_modapi.h> @@ -37,6 +34,7 @@ extern int dumpnamespace(uintptr_t, uint_t, int, const mdb_arg_t *); extern int dumpsetaddr(uintptr_t, uint_t, int, const mdb_arg_t *); extern int dumphotspare(uintptr_t, uint_t, int, const mdb_arg_t *); +extern int printmmbm(uintptr_t, uint_t, int, const mdb_arg_t *); extern void set_io_help(); /* from mdbgen */ @@ -79,6 +77,8 @@ dumpsetaddr }, { "simple_de_ic", NULL, "simple mddb_de_ic_t", simple_de_ic }, + { "printmmbm", NULL, "print bitmaps for given mm_unit_t", + printmmbm }, { NULL } };
--- a/usr/src/cmd/mdb/common/modules/md/metastat.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/mdb/common/modules/md/metastat.c Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "mdinclude.h" typedef struct submirror_cb { @@ -117,16 +115,84 @@ return (WALK_NEXT); } +/* + * Construct an RLE count for the number of 'cleared' bits in the given 'bm' + * Output the RLE count in form: [<set>.<cleared>.<set>.<cleared>...] + * RLE is Run Length Encoding, a method for compactly describing a bitmap + * as a series of numbers indicating the count of consecutive set or cleared + * bits. 
+ * + * Input: + * <bm> bitmap to scan + * <size> length of bitmap (in bits) + * <comp_bm> RLE count array to be updated + * <opstr> Descriptive text for bitmap RLE count display + */ +static void +print_comp_bm(unsigned char *bm, uint_t size, ushort_t *comp_bm, char *opstr) +{ + int cnt_clean, tot_dirty, cur_idx; + int i, cur_clean, cur_dirty, printit, max_set_cnt, max_reset_cnt; + + cnt_clean = 1; + printit = 0; + cur_clean = 0; + cur_dirty = 0; + cur_idx = 0; + tot_dirty = 0; + max_set_cnt = max_reset_cnt = 0; + for (i = 0; i < size; i++) { + if (isset(bm, i)) { + /* If we're counting clean bits, flush the count out */ + if (cnt_clean) { + cnt_clean = 0; + comp_bm[cur_idx] = cur_clean; + printit = 1; + if (cur_clean > max_reset_cnt) { + max_reset_cnt = cur_clean; + } + } + cur_clean = 0; + cur_dirty++; + tot_dirty++; + } else { + if (!cnt_clean) { + cnt_clean = 1; + comp_bm[cur_idx] = cur_dirty; + printit = 1; + if (cur_dirty > max_set_cnt) { + max_set_cnt = cur_dirty; + } + } + cur_dirty = 0; + cur_clean++; + } + if (printit) { + mdb_printf("%u.", comp_bm[cur_idx++]); + printit = 0; + } + } + + mdb_printf("\nTotal %s bits = %lu\n", opstr, tot_dirty); + mdb_printf("Total %s transactions = %lu\n", opstr, cur_idx); + mdb_printf("Maximum %s set count = %lu, reset count = %lu\n", opstr, + max_set_cnt, max_reset_cnt); +} + void print_mirror(void *un_addr, void *mdcptr, uint_t verbose) { - mm_unit_t mm; + mm_unit_t mm, *mmp; void **ptr; int setno = 0; minor_t un_self_id; diskaddr_t un_total_blocks; ushort_t mm_un_nsm; submirror_cb_t data; + uint_t num_rr, rr_blksize; + ushort_t *comp_rr; + unsigned char *rr_dirty_bm, *rr_goingclean_bm; + uintptr_t un_dbm, un_gcbm; /* read in the device */ if (mdb_vread(&mm, sizeof (mm_unit_t), @@ -134,6 +200,9 @@ mdb_warn("failed to read mm_unit_t at %p\n", un_addr); return; } + + mmp = &mm; + un_self_id = ((mdc_unit_t *)mdcptr)->un_self_id; un_total_blocks = ((mdc_unit_t *)mdcptr)->un_total_blocks; mm_un_nsm = mm.un_nsm; @@ -148,6 
+217,39 @@ } mdb_inc_indent(2); mdb_printf("Size: %llu blocks\n", un_total_blocks); + + /* + * Dump out the current un_dirty_bm together with its size + * Also, attempt to Run Length encode the bitmap to see if this + * is a viable option + */ + num_rr = mm.un_rrd_num; + rr_blksize = mm.un_rrd_blksize; + + un_dbm = (uintptr_t)mmp->un_dirty_bm; + un_gcbm = (uintptr_t)mmp->un_goingclean_bm; + + mdb_printf("RR size: %lu bits\n", num_rr); + mdb_printf("RR block size: %lu blocks\n", rr_blksize); + + rr_dirty_bm = (unsigned char *)mdb_alloc(num_rr, UM_SLEEP|UM_GC); + rr_goingclean_bm = (unsigned char *)mdb_alloc(num_rr, UM_SLEEP|UM_GC); + comp_rr = (ushort_t *)mdb_alloc(num_rr * sizeof (ushort_t), + UM_SLEEP|UM_GC); + + if (mdb_vread(rr_dirty_bm, num_rr, un_dbm) == -1) { + mdb_warn("failed to read un_dirty_bm at %p\n", un_dbm); + return; + } + if (mdb_vread(rr_goingclean_bm, num_rr, un_gcbm) == -1) { + mdb_warn("failed to read un_goingclean_bm at %p\n", un_gcbm); + return; + } + + print_comp_bm(rr_dirty_bm, num_rr, comp_rr, "dirty"); + + print_comp_bm(rr_goingclean_bm, num_rr, comp_rr, "clean"); + /* * find the sub mirrors, search through each metadevice looking * at the un_parent.
--- a/usr/src/cmd/mdb/intel/amd64/md/Makefile Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/mdb/intel/amd64/md/Makefile Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,15 @@ # CDDL HEADER END # # -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" MODULE = md.so MDBTGT = kvm MODSRCS = dumphotspare.c \ + dumpmirror.c \ dumpnamespace.c \ findset.c \ md.c \
--- a/usr/src/cmd/mdb/intel/ia32/md/Makefile Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/mdb/intel/ia32/md/Makefile Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,15 @@ # CDDL HEADER END # # -# Copyright 2003 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" MODULE = md.so MDBTGT = kvm MODSRCS = dumphotspare.c \ + dumpmirror.c \ dumpnamespace.c \ findset.c \ md.c \
--- a/usr/src/cmd/mdb/sparc/v9/md/Makefile Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/cmd/mdb/sparc/v9/md/Makefile Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,15 @@ # CDDL HEADER END # # -# Copyright 2003 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#pragma ident "%Z%%M% %I% %E% SMI" MODULE = md.so MDBTGT = kvm MODSRCS = dumphotspare.c \ + dumpmirror.c \ dumpnamespace.c \ findset.c \ md.c \
--- a/usr/src/head/meta.h Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/head/meta.h Wed Dec 24 08:23:40 2008 -0700 @@ -1844,10 +1844,12 @@ /* meta_mn_comm.c */ extern int mdmn_send_message(set_t setno, md_mn_msgtype_t type, - uint_t flags, char *data, int size, - md_mn_result_t **resp, md_error_t *ep); + uint_t flags, md_mn_nodeid_t recipient, + char *data, int size, md_mn_result_t **resp, + md_error_t *ep); extern int mdmn_send_message_with_msgid(set_t setno, - md_mn_msgtype_t type, uint_t flags, char *data, + md_mn_msgtype_t type, uint_t flags, + md_mn_nodeid_t recipient, char *data, int size, md_mn_result_t **resp, md_mn_msgid_t *msgid, md_error_t *ep); extern int mdmn_create_msgid(md_mn_msgid_t *id); @@ -1931,11 +1933,11 @@ md_timeval32_t timestamp, ulong_t genid, md_error_t *ep); -/* Flags for direction in copy_msg_1 */ +/* Flags for direction in copy_msg_2 */ #define MD_MN_COPY_TO_ONDISK 0x0001 #define MD_MN_COPY_TO_INCORE 0x0002 -extern void copy_msg_1(md_mn_msg_t *incorep, +extern void copy_msg_2(md_mn_msg_t *incorep, md_mn_msg_od_t *ondiskp, int direction); extern void free_msg(md_mn_msg_t *msg);
--- a/usr/src/lib/lvm/libmeta/common/mapfile-vers Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/mapfile-vers Wed Dec 24 08:23:40 2008 -0700 @@ -22,8 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# SUNWprivate_1.1 { global: @@ -92,7 +90,7 @@ commitset; comp_state_to_name; copy_msg; - copy_msg_1; + copy_msg_2; copy_result; crcfreetab; crcfunc; @@ -160,12 +158,12 @@ md_med_pmap_timeout; mdmn_abort; mdmn_allocate_changelog; - mdmn_comm_lock_1; - mdmn_comm_msglock_1; - mdmn_comm_reinit_set_1; - mdmn_comm_resume_1; - mdmn_comm_suspend_1; - mdmn_comm_unlock_1; + mdmn_comm_lock_2; + mdmn_comm_msglock_2; + mdmn_comm_reinit_set_2; + mdmn_comm_resume_2; + mdmn_comm_suspend_2; + mdmn_comm_unlock_2; mdmn_create_msgid; mdmn_get_changelogrec; mdmn_get_handler; @@ -177,14 +175,14 @@ mdmn_reinit_set; mdmn_reset_changelog; mdmn_resume; - mdmn_send_1; + mdmn_send_2; mdmn_send_message; mdmn_snarf_changelog; mdmn_suspend; mdmn_unlog_msg; - mdmn_wakeup_initiator_1; - mdmn_wakeup_master_1; - mdmn_work_1; + mdmn_wakeup_initiator_2; + mdmn_wakeup_master_2; + mdmn_work_2; mdnullerror; md_perror; md_post_sig;
--- a/usr/src/lib/lvm/libmeta/common/meta_db.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_db.c Wed Dec 24 08:23:40 2008 -0700 @@ -18,13 +18,12 @@ * * CDDL HEADER END */ + /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Just in case we're not in a build environment, make sure that * TEXT_DOMAIN gets set to something. @@ -928,7 +927,7 @@ */ send_rval = mdmn_send_message(sp->setno, MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND | - MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns, + MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ns, sizeof (md_mn_msg_meta_db_newside_t), &resultp, ep); if (send_rval != 0) { @@ -1048,7 +1047,7 @@ */ send_rval = mdmn_send_message(sp->setno, MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND | - MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds, + MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ds, sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep); if (send_rval != 0) { if (resultp == NULL) @@ -1542,7 +1541,7 @@ flags |= MD_MSGF_NO_LOG; send_rval = mdmn_send_message(sp->setno, MD_MN_MSG_META_DB_ATTACH, - flags, (char *)&attach, + flags, 0, (char *)&attach, sizeof (md_mn_msg_meta_db_attach_t), &resultp, ep); if (send_rval != 0) { @@ -2007,7 +2006,7 @@ flags |= MD_MSGF_NO_LOG; send_rval = mdmn_send_message(sp->setno, MD_MN_MSG_META_DB_DETACH, - flags, (char *)&detach, + flags, 0, (char *)&detach, sizeof (md_mn_msg_meta_db_detach_t), &resultp, ep); if (send_rval != 0) {
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,16 +18,14 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdlib.h> #include <unistd.h> - #include <wait.h> #include <sys/time.h> #include <meta.h> @@ -131,7 +128,7 @@ odp->lr_class = incp->lr_class; odp->lr_msglen = incp->lr_msglen; if (incp->lr_msglen) - copy_msg_1(&incp->lr_msg, &odp->lr_od_msg, direction); + copy_msg_2(&incp->lr_msg, &odp->lr_od_msg, direction); } else { incp->lr_revision = odp->lr_revision; incp->lr_flags = odp->lr_flags; @@ -139,7 +136,7 @@ incp->lr_class = odp->lr_class; incp->lr_msglen = odp->lr_msglen; if (odp->lr_msglen) - copy_msg_1(&incp->lr_msg, &odp->lr_od_msg, direction); + copy_msg_2(&incp->lr_msg, &odp->lr_od_msg, direction); } } @@ -196,7 +193,7 @@ (void) mdstealerror(ep, &req.ur_mde); #ifdef DEBUG syslog(LOG_DEBUG, "allocate_log: %s\n", - mde_sperror(ep, "")); + mde_sperror(ep, "")); #endif Free(mdmn_changelog[setno]); return (-1); @@ -389,13 +386,14 @@ assert(lr != NULL); if (!MSGID_CMP(&(msg->msg_msgid), &(lr->lr_msg.msg_msgid))) { syslog(LOG_ERR, dgettext(TEXT_DOMAIN, - "unlog_msg: msgid mismatch\n" - "\t\tstored: ID = (%d, 0x%llx-%d) setno %d class %d type 
%d\n" - "\t\tattempting to unlog:\n" - "\t\tID = (%d, 0x%llx-%d) setno %d class %d type %d.\n"), - MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno, - lr->lr_class, lr->lr_msgtype, MSGID_ELEMS(msg->msg_msgid), - msg->msg_setno, class, msg->msg_type); + "unlog_msg: msgid mismatch\n" + "\t\tstored: ID = (%d, 0x%llx-%d) setno %d " + "class %d type %d\n" + "\t\tattempting to unlog:\n" + "\t\tID = (%d, 0x%llx-%d) setno %d class %d type %d.\n"), + MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno, + lr->lr_class, lr->lr_msgtype, MSGID_ELEMS(msg->msg_msgid), + msg->msg_setno, class, msg->msg_type); return (-1); } lr->lr_msglen = 0; @@ -462,10 +460,10 @@ if (!(MD_MNSET_DESC(sd)) || !sd->sd_mn_am_i_master) { if (!(MD_MNSET_DESC(sd))) { syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN, - "mdmn_commitlog - Not MN Set\n")); + "mdmn_commitlog - Not MN Set\n")); } else { syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN, - "mdmn_commit_log - Not Master\n")); + "mdmn_commit_log - Not Master\n")); } return (-1); } @@ -485,7 +483,7 @@ req.ur_size = MDMN_LOGRECSIZE_OD; req.ur_data = (uintptr_t)&clodrec; if ((retval = metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde, - NULL)) != 0) { + NULL)) != 0) { (void) mdstealerror(ep, &req.ur_mde); #ifdef DEBUG syslog(LOG_DAEMON|LOG_DEBUG, @@ -501,16 +499,16 @@ recs[lrc] = 0; /* Commit to mddb on disk */ METAD_SETUP_LR(MD_DB_COMMIT_MANY, setno, - mdmn_changelog[setno][0].lr_selfid); + mdmn_changelog[setno][0].lr_selfid); req.ur_size = size; req.ur_data = (uintptr_t)recs; if ((retval = metaioctl(MD_MN_DB_USERREQ, &req, - &req.ur_mde, NULL)) != 0) { + &req.ur_mde, NULL)) != 0) { (void) mdstealerror(ep, &req.ur_mde); #ifdef DEBUG syslog(LOG_DAEMON|LOG_DEBUG, - "mdmn_commitlog - metaioctl COMMIT_MANY" - "Failure\n%s", mde_sperror(ep, "")); + "mdmn_commitlog - metaioctl COMMIT_MANY" + "Failure\n%s", mde_sperror(ep, "")); #endif } } @@ -609,7 +607,7 @@ } lr = (mdmn_changelog_record_od_t *)get_ur_rec(set, MD_UR_GET_NEXT, - MDDB_UR_LR, &id, ep); + 
MDDB_UR_LR, &id, ep); if (lr == NULL) return (0); @@ -618,7 +616,7 @@ if (mdmn_changelog[set] == NULL) { /* Allocate incore state for the log */ mdmn_changelog[set] = Zalloc(MDMN_LOGHDR_SIZE * - mdmn_logrecs); + mdmn_logrecs); } do {
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c Wed Dec 24 08:23:40 2008 -0700 @@ -20,12 +20,10 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdlib.h> #include <unistd.h> #include <wait.h> @@ -72,181 +70,264 @@ void ldump_msg(char *prefix, md_mn_msg_t *msg) { - (void) fprintf(stderr, "%s &msg = 0x%x\n", prefix, (uint_t)msg); - (void) fprintf(stderr, "%s ID = (%d, 0x%llx-%d)\n", prefix, + (void) fprintf(stderr, "%s &msg = 0x%x\n", prefix, (uint_t)msg); + (void) fprintf(stderr, "%s ID = (%d, 0x%llx-%d)\n", prefix, MSGID_ELEMS(msg->msg_msgid)); - (void) fprintf(stderr, "%s sender = %d\n", prefix, msg->msg_sender); - (void) fprintf(stderr, "%s flags = 0x%x\n", prefix, msg->msg_flags); - (void) fprintf(stderr, "%s setno = %d\n", prefix, msg->msg_setno); - (void) fprintf(stderr, "%s type = %d\n", prefix, msg->msg_type); - (void) fprintf(stderr, "%s size = %d\n", prefix, msg->msg_event_size); + (void) fprintf(stderr, "%s sender = %d\n", prefix, msg->msg_sender); + (void) fprintf(stderr, "%s flags = 0x%x\n", + prefix, msg->msg_flags); + (void) fprintf(stderr, "%s setno = %d\n", prefix, msg->msg_setno); + (void) fprintf(stderr, "%s recipient = %d\n", + prefix, msg->msg_recipient); + (void) fprintf(stderr, "%s type = %d\n", prefix, msg->msg_type); + (void) fprintf(stderr, "%s size = %d\n", + prefix, msg->msg_event_size); } +#define COMMD_PROGNAME "rpc.mdcommd" + +extern uint_t meta_rpc_err_mask(void); + +/* + * If a clnt_call gets an RPC error, force the message out here with details. + * This would be nice to send to commd_debug(), but we can't call rpc.mdcommd + * code from libmeta. 
+ */ +static void +mdmn_handle_RPC_error(CLIENT *clnt, char *ident, md_mn_nodeid_t nid) +{ + /* + * This is sized for a max message which would look like this: + * "mdmn_wakeup_initiator: rpc.mdcommd node 4294967295" + */ + char errstr[51]; + struct rpc_err e; + + CLNT_GETERR((CLIENT *) clnt, &e); + if (meta_rpc_err_mask() & (1 << e.re_status)) { + if (nid == 0) { + (void) snprintf(errstr, sizeof (errstr), + "%s: %s node (local)", ident, COMMD_PROGNAME); + } else { + (void) snprintf(errstr, sizeof (errstr), + "%s: %s node %d", ident, COMMD_PROGNAME, nid); + } + syslog(LOG_WARNING, "mdmn_handle_RPC_error: %s", + clnt_sperror(clnt, errstr)); + } +} /* Default timeout can be changed using clnt_control() */ static struct timeval TIMEOUT = { 25, 0 }; md_mn_result_t * -mdmn_send_1(argp, clnt) +mdmn_send_2(argp, clnt, nid) md_mn_msg_t *argp; CLIENT *clnt; + md_mn_nodeid_t nid; { + enum clnt_stat res; md_mn_result_t *clnt_res = Zalloc(sizeof (md_mn_result_t)); - if (clnt_call(clnt, mdmn_send, + res = clnt_call(clnt, mdmn_send, (xdrproc_t)xdr_md_mn_msg_t, (caddr_t)argp, - (xdrproc_t)xdr_md_mn_result_t, (caddr_t)clnt_res, - TIMEOUT) != RPC_SUCCESS) { - return (NULL); + (xdrproc_t)xdr_md_mn_result_t, (caddr_t)clnt_res, TIMEOUT); + + if (res == RPC_SUCCESS) { + return (clnt_res); } - return (clnt_res); + mdmn_handle_RPC_error(clnt, "mdmn_send", nid); + Free(clnt_res); + return (NULL); } int * -mdmn_work_1(argp, clnt) +mdmn_work_2(argp, clnt, nid) md_mn_msg_t *argp; CLIENT *clnt; + md_mn_nodeid_t nid; { + enum clnt_stat res; int *clnt_res = Zalloc(sizeof (int)); - if (clnt_call(clnt, mdmn_work, + res = clnt_call(clnt, mdmn_work, (xdrproc_t)xdr_md_mn_msg_t, (caddr_t)argp, - (xdrproc_t)xdr_int, (caddr_t)clnt_res, - TIMEOUT) != RPC_SUCCESS) { - Free(clnt_res); - return (NULL); + (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT); + + if (res == RPC_SUCCESS) { + return (clnt_res); } - return (clnt_res); + mdmn_handle_RPC_error(clnt, "mdmn_work", nid); + Free(clnt_res); + return 
(NULL); } int * -mdmn_wakeup_initiator_1(argp, clnt) +mdmn_wakeup_initiator_2(argp, clnt, nid) md_mn_result_t *argp; CLIENT *clnt; + md_mn_nodeid_t nid; { + enum clnt_stat res; int *clnt_res = Zalloc(sizeof (int)); - if (clnt_call(clnt, mdmn_wakeup_initiator, + res = clnt_call(clnt, mdmn_wakeup_initiator, (xdrproc_t)xdr_md_mn_result_t, (caddr_t)argp, - (xdrproc_t)xdr_int, (caddr_t)clnt_res, - TIMEOUT) != RPC_SUCCESS) { - Free(clnt_res); - return (NULL); + (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT); + + if (res == RPC_SUCCESS) { + return (clnt_res); } - return (clnt_res); + mdmn_handle_RPC_error(clnt, "mdmn_wakeup_initiator", nid); + Free(clnt_res); + return (NULL); } int * -mdmn_wakeup_master_1(argp, clnt) +mdmn_wakeup_master_2(argp, clnt, nid) md_mn_result_t *argp; CLIENT *clnt; + md_mn_nodeid_t nid; { + enum clnt_stat res; int *clnt_res = Zalloc(sizeof (int)); - if (clnt_call(clnt, mdmn_wakeup_master, + res = clnt_call(clnt, mdmn_wakeup_master, (xdrproc_t)xdr_md_mn_result_t, (caddr_t)argp, - (xdrproc_t)xdr_int, (caddr_t)clnt_res, - TIMEOUT) != RPC_SUCCESS) { - Free(clnt_res); - return (NULL); + (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT); + + if (res == RPC_SUCCESS) { + return (clnt_res); } - return (clnt_res); + mdmn_handle_RPC_error(clnt, "mdmn_wakeup_master", nid); + Free(clnt_res); + return (NULL); } int * -mdmn_comm_lock_1(argp, clnt) +mdmn_comm_lock_2(argp, clnt, nid) md_mn_set_and_class_t *argp; CLIENT *clnt; + md_mn_nodeid_t nid; { + enum clnt_stat res; int *clnt_res = Zalloc(sizeof (int)); - if (clnt_call(clnt, mdmn_comm_lock, + res = clnt_call(clnt, mdmn_comm_lock, (xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp, - (xdrproc_t)xdr_int, (caddr_t)clnt_res, - TIMEOUT) != RPC_SUCCESS) { - return (NULL); + (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT); + + if (res == RPC_SUCCESS) { + return (clnt_res); } - return (clnt_res); + mdmn_handle_RPC_error(clnt, "mdmn_comm_lock", nid); + Free(clnt_res); + return (NULL); } int * 
-mdmn_comm_unlock_1(argp, clnt) +mdmn_comm_unlock_2(argp, clnt, nid) md_mn_set_and_class_t *argp; CLIENT *clnt; + md_mn_nodeid_t nid; { + enum clnt_stat res; int *clnt_res = Zalloc(sizeof (int)); - if (clnt_call(clnt, mdmn_comm_unlock, + res = clnt_call(clnt, mdmn_comm_unlock, (xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp, - (xdrproc_t)xdr_int, (caddr_t)clnt_res, - TIMEOUT) != RPC_SUCCESS) { - return (NULL); + (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT); + + if (res == RPC_SUCCESS) { + return (clnt_res); } - return (clnt_res); + mdmn_handle_RPC_error(clnt, "mdmn_comm_unlock", nid); + Free(clnt_res); + return (NULL); } int * -mdmn_comm_suspend_1(argp, clnt) +mdmn_comm_suspend_2(argp, clnt, nid) md_mn_set_and_class_t *argp; CLIENT *clnt; + md_mn_nodeid_t nid; { + enum clnt_stat res; int *clnt_res = Zalloc(sizeof (int)); - if (clnt_call(clnt, mdmn_comm_suspend, + res = clnt_call(clnt, mdmn_comm_suspend, (xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp, - (xdrproc_t)xdr_int, (caddr_t)clnt_res, - TIMEOUT) != RPC_SUCCESS) { - return (NULL); + (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT); + + if (res == RPC_SUCCESS) { + return (clnt_res); } - return (clnt_res); + mdmn_handle_RPC_error(clnt, "mdmn_comm_suspend", nid); + Free(clnt_res); + return (NULL); } int * -mdmn_comm_resume_1(argp, clnt) +mdmn_comm_resume_2(argp, clnt, nid) md_mn_set_and_class_t *argp; CLIENT *clnt; + md_mn_nodeid_t nid; { + enum clnt_stat res; int *clnt_res = Zalloc(sizeof (int)); - if (clnt_call(clnt, mdmn_comm_resume, + res = clnt_call(clnt, mdmn_comm_resume, (xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp, - (xdrproc_t)xdr_int, (caddr_t)clnt_res, - TIMEOUT) != RPC_SUCCESS) { - return (NULL); + (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT); + + if (res == RPC_SUCCESS) { + return (clnt_res); } - return (clnt_res); + mdmn_handle_RPC_error(clnt, "mdmn_comm_resume", nid); + Free(clnt_res); + return (NULL); } int * -mdmn_comm_reinit_set_1(argp, clnt) +mdmn_comm_reinit_set_2(argp, 
clnt, nid) set_t *argp; CLIENT *clnt; + md_mn_nodeid_t nid; { + enum clnt_stat res; int *clnt_res = Zalloc(sizeof (int)); - if (clnt_call(clnt, mdmn_comm_reinit_set, + res = clnt_call(clnt, mdmn_comm_reinit_set, (xdrproc_t)xdr_set_t, (caddr_t)argp, - (xdrproc_t)xdr_int, (caddr_t)clnt_res, - TIMEOUT) != RPC_SUCCESS) { - return (NULL); + (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT); + + if (res == RPC_SUCCESS) { + return (clnt_res); } - return (clnt_res); + mdmn_handle_RPC_error(clnt, "mdmn_comm_reinit_set", nid); + Free(clnt_res); + return (NULL); } int * -mdmn_comm_msglock_1(argp, clnt) +mdmn_comm_msglock_2(argp, clnt, nid) md_mn_type_and_lock_t *argp; CLIENT *clnt; + md_mn_nodeid_t nid; { + enum clnt_stat res; int *clnt_res = Zalloc(sizeof (int)); - if (clnt_call(clnt, mdmn_comm_msglock, + res = clnt_call(clnt, mdmn_comm_msglock, (xdrproc_t)xdr_md_mn_type_and_lock_t, (caddr_t)argp, - (xdrproc_t)xdr_int, (caddr_t)clnt_res, - TIMEOUT) != RPC_SUCCESS) { - return (NULL); + (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT); + + if (res == RPC_SUCCESS) { + return (clnt_res); } - return (clnt_res); + mdmn_handle_RPC_error(clnt, "mdmn_comm_msglock", nid); + Free(clnt_res); + return (NULL); } @@ -370,6 +451,7 @@ nmsg->msg_flags = msg->msg_flags; nmsg->msg_setno = msg->msg_setno; nmsg->msg_type = msg->msg_type; + nmsg->msg_recipient = msg->msg_recipient; nmsg->msg_event_size = msg->msg_event_size; if (msg->msg_event_size > 0) { bcopy(msg->msg_event_data, nmsg->msg_event_data, @@ -379,7 +461,7 @@ } void -copy_msg_1(md_mn_msg_t *msg, md_mn_msg_od_t *msgod, int direction) +copy_msg_2(md_mn_msg_t *msg, md_mn_msg_od_t *msgod, int direction) { assert((direction == MD_MN_COPY_TO_ONDISK) || (direction == MD_MN_COPY_TO_INCORE)); @@ -390,6 +472,7 @@ msgod->msg_flags = msg->msg_flags; msgod->msg_setno = msg->msg_setno; msgod->msg_type = msg->msg_type; + msgod->msg_recipient = msg->msg_recipient; msgod->msg_od_event_size = msg->msg_event_size; /* paranoid checks */ if 
(msg->msg_event_size != 0 && msg->msg_event_data != NULL) @@ -401,6 +484,7 @@ msg->msg_flags = msgod->msg_flags; msg->msg_setno = msgod->msg_setno; msg->msg_type = msgod->msg_type; + msg->msg_recipient = msgod->msg_recipient; msg->msg_event_size = msgod->msg_od_event_size; if (msg->msg_event_data == NULL) msg->msg_event_data = Zalloc(msg->msg_event_size); @@ -462,7 +546,7 @@ if (mdmn_clients == (md_mn_client_list_t *)NULL) { /* if there is no entry, create a client and return a it */ local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, - ONE, "tcp"); + TWO, "tcp"); } else { /* * If there is an entry from a previous put operation, @@ -517,6 +601,13 @@ * a msgid is already attached to it. * In that case mdmn_send_message_with_msgid() has to be called directly. * + * The recipient argument is almost always unused, and is therefore typically + * set to zero, as zero is an invalid cluster nodeid. The exceptions are the + * marking and clearing of the DRL from a node that is not currently the + * owner. In these cases, the recipient argument will be the nodeid of the + * mirror owner, and MD_MSGF_DIRECTED will be set in the flags. Non-owner + * nodes will not receive these messages. 
+ * * Return values / CAVEAT EMPTOR: see mdmn_send_message_with_msgid() */ @@ -525,13 +616,14 @@ set_t setno, md_mn_msgtype_t type, uint_t flags, + md_mn_nodeid_t recipient, char *data, int size, md_mn_result_t **result, md_error_t *ep) { - return (mdmn_send_message_with_msgid( - setno, type, flags, data, size, result, MD_NULL_MSGID, ep)); + return (mdmn_send_message_with_msgid(setno, type, flags, + recipient, data, size, result, MD_NULL_MSGID, ep)); } /* * mdmn_send_message_with_msgid() @@ -561,6 +653,7 @@ set_t setno, md_mn_msgtype_t type, uint_t flags, + md_mn_nodeid_t recipient, char *data, int size, md_mn_result_t **result, @@ -619,6 +712,7 @@ */ msg.msg_flags = flags; msg.msg_setno = setno; + msg.msg_recipient = recipient; msg.msg_type = type; msg.msg_event_size = size; msg.msg_event_data = data; @@ -655,7 +749,7 @@ * - retries1 or retries2 exceeded */ for (; ; ) { - *result = mdmn_send_1(&msg, local_daemon); + *result = mdmn_send_2(&msg, local_daemon, 0); resp = *result; if (resp != (md_mn_result_t *)NULL) { /* Bingo! 
*/ @@ -800,8 +894,8 @@ if ((setno >= MD_MAXSETS) || (class >= MD_MN_NCLASSES)) { return (MDE_DS_COMMDCTL_SUSPEND_FAIL); } - local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE, - "tcp"); + local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO, + "tcp"); if (local_daemon == (CLIENT *)NULL) { clnt_pcreateerror("local_daemon"); return (MDE_DS_COMMDCTL_SUSPEND_FAIL); @@ -818,7 +912,7 @@ msc.msc_class = class; msc.msc_flags = 0; - resp = mdmn_comm_suspend_1(&msc, local_daemon); + resp = mdmn_comm_suspend_2(&msc, local_daemon, 0); clnt_destroy(local_daemon); if (resp == NULL) { @@ -861,8 +955,8 @@ if ((setno >= MD_MAXSETS) || (class >= MD_MN_NCLASSES)) { return (MDE_DS_COMMDCTL_RESUME_FAIL); } - local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE, - "tcp"); + local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO, + "tcp"); if (local_daemon == (CLIENT *)NULL) { clnt_pcreateerror("local_daemon"); return (MDE_DS_COMMDCTL_RESUME_FAIL); @@ -879,7 +973,7 @@ msc.msc_class = class; msc.msc_flags = flags; - resp = mdmn_comm_resume_1(&msc, local_daemon); + resp = mdmn_comm_resume_2(&msc, local_daemon, 0); if (resp != NULL) { if (*resp == MDMNE_ACK) { @@ -905,10 +999,8 @@ md_error_t mdne = mdnullerror; (void) mdmn_send_message(0, /* No set is needed for this message */ - MD_MN_MSG_ABORT, - MD_MSGF_LOCAL_ONLY, - dummy, sizeof (dummy), - &resultp, &mdne); + MD_MN_MSG_ABORT, MD_MSGF_LOCAL_ONLY, 0, + dummy, sizeof (dummy), &resultp, &mdne); if (resultp != NULL) { Free(resultp); @@ -935,8 +1027,8 @@ if ((setno == 0) || (setno >= MD_MAXSETS)) { return (1); } - local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE, - "tcp"); + local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO, + "tcp"); if (local_daemon == (CLIENT *)NULL) { clnt_pcreateerror("local_daemon"); return (1); @@ -949,7 +1041,7 @@ } } - resp = mdmn_comm_reinit_set_1(&setno, local_daemon); + resp = mdmn_comm_reinit_set_2(&setno, local_daemon, 0); if 
(resp != NULL) { if (*resp == MDMNE_ACK) { @@ -984,8 +1076,8 @@ if ((msgtype == 0) || (msgtype >= MD_MN_NMESSAGES)) { return (1); } - local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE, - "tcp"); + local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO, + "tcp"); if (local_daemon == (CLIENT *)NULL) { clnt_pcreateerror("local_daemon"); return (1); @@ -993,7 +1085,7 @@ mmtl.mmtl_type = msgtype; mmtl.mmtl_lock = locktype; - resp = mdmn_comm_msglock_1(&mmtl, local_daemon); + resp = mdmn_comm_msglock_2(&mmtl, local_daemon, 0); if (resp != NULL) { if (*resp == MDMNE_ACK) {
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c Wed Dec 24 08:23:40 2008 -0700 @@ -18,13 +18,12 @@ * * CDDL HEADER END */ + /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdlib.h> #include <unistd.h> #include <wait.h> @@ -448,7 +447,7 @@ myflags |= msg->msg_flags & MD_MSGF_INHERIT_BITS; ret = mdmn_send_message(MD_MIN2SET(d->msg_chooseid_mnum), - MD_MN_MSG_CHANGE_OWNER, myflags, (char *)&chownermsg, + MD_MN_MSG_CHANGE_OWNER, myflags, 0, (char *)&chownermsg, sizeof (chownermsg), &resp1, &mde); if (resp1 != NULL) free_result(resp1); @@ -2120,3 +2119,67 @@ resp->mmr_exitval = 0; } + +/* + * This is used to issue a MD_MN_RR_DIRTY ioctl to the mirror. + */ +/*ARGSUSED*/ +void +mdmn_do_mark_dirty(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_rr_dirty_t *d; + md_mn_rr_dirty_params_t rp; + int ret; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_rr_dirty_t *)((void *)(msg->msg_event_data)); + + (void) memset(&rp, 0, sizeof (rp)); + MD_SETDRIVERNAME(&rp, MD_MIRROR, MD_MIN2SET(d->rr_mnum)) + rp.rr_mnum = d->rr_mnum; + rp.rr_nodeid = d->rr_nodeid; + rp.rr_start = (ushort_t)((d->rr_range >> 16) & 0xffff); + rp.rr_end = (ushort_t)(d->rr_range & 0xffff); + + ret = metaioctl(MD_MN_RR_DIRTY, &rp, &rp.mde, NULL); + + resp->mmr_exitval = ret; +} + +/* + * This is used to issue a MD_MN_RR_CLEAN ioctl to the mirror. 
+ */ +/*ARGSUSED*/ +void +mdmn_do_mark_clean(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_rr_clean_t *d; + md_mn_rr_clean_params_t *rcp; + int ret; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_rr_clean_t *)((void *)(msg->msg_event_data)); + + rcp = Zalloc(sizeof (struct md_mn_rr_clean_params) + + MDMN_MSG_RR_CLEAN_DATA_BYTES(d)); + MD_SETDRIVERNAME(rcp, MD_MIRROR, MD_MIN2SET(d->rr_mnum)) + rcp->rr_mnum = d->rr_mnum; + rcp->rr_nodeid = d->rr_nodeid; + rcp->rr_start_size = d->rr_start_size; + (void) memcpy(MDMN_RR_CLEAN_PARAMS_DATA(rcp), MDMN_MSG_RR_CLEAN_DATA(d), + MDMN_MSG_RR_CLEAN_DATA_BYTES(d)); + + ret = metaioctl(MD_MN_RR_CLEAN, rcp, &rcp->mde, NULL); + + Free(rcp); + + resp->mmr_exitval = ret; +}
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c Wed Dec 24 08:23:40 2008 -0700 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <meta.h> extern void mdmn_do_cmd(HANDLER_PARMS); @@ -56,6 +54,8 @@ extern void mdmn_do_get_tstate(HANDLER_PARMS); extern void mdmn_do_get_mirstate(HANDLER_PARMS); extern void mdmn_do_addmdname(HANDLER_PARMS); +extern void mdmn_do_mark_dirty(HANDLER_PARMS); +extern void mdmn_do_mark_clean(HANDLER_PARMS); extern int mdmn_smgen_test6(SMGEN_PARMS); extern int mdmn_smgen_state_upd(SMGEN_PARMS); @@ -693,10 +693,36 @@ * Add metadevice name into replica */ MD_MSG_CLASS1, /* message class */ - mdmn_do_addmdname, /* add ,etadevice name */ + mdmn_do_addmdname, /* add metadevice name */ NULL, /* submessage generator */ 90, /* times out in 90 secs */ 10000, 2, /* class busy retry / time delta */ 10, 1000 /* comm fail retry / time delta */ }, + + { + /* + * MD_MN_MSG_RR_DIRTY + * Mark given range of un_dirty_bm as dirty + */ + MD_MSG_CLASS2, /* message class */ + mdmn_do_mark_dirty, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_RR_CLEAN + * Mark given range of un_dirty_bm as clean + */ + MD_MSG_CLASS2, /* message class */ + mdmn_do_mark_clean, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, };
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c Wed Dec 24 08:23:40 2008 -0700 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Just in case we're not in a build environment, make sure that * TEXT_DOMAIN gets set to something. @@ -62,7 +60,7 @@ /* Local set cannot be MultiNode */ if ((sp == NULL) || (sp->setname == NULL) || - (strcmp(sp->setname, MD_LOCAL_NAME) == 0)) + (strcmp(sp->setname, MD_LOCAL_NAME) == 0)) return (0); sd = metaget_setdesc(sp, ep); ASSERT(sd != NULL); @@ -128,7 +126,7 @@ md_mn_result_t *resp = NULL; (void) mdmn_send_message(setno, MD_MN_MSG_TEST2, - MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, data, + MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, 0, data, sizeof (data), &resp, &mde); if (resp != (md_mn_result_t *)NULL) { @@ -234,9 +232,8 @@ } else { send_message_type = MD_MN_MSG_BC_CMD; } - err = mdmn_send_message( - sp->setno, send_message_type, send_message_flags, - cmd, 1024, &resp, ep); + err = mdmn_send_message(sp->setno, send_message_type, + send_message_flags, 0, cmd, 1024, &resp, ep); free(cmd); @@ -285,9 +282,9 @@ "Command not attempted: Unable to log message " "in set %s\n"), sp->setname); if (c.c_flags & MDDB_C_STALE) { - (void) mdmddberror(ep, MDE_DB_STALE, - (minor_t)NODEV64, sp->setno, 0, NULL); - mde_perror(ep, ""); + (void) mdmddberror(ep, MDE_DB_STALE, + (minor_t)NODEV64, sp->setno, 0, NULL); + mde_perror(ep, ""); } } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, @@ -333,7 +330,7 @@ */ result = mdmn_send_message(MD_MIN2SET(mnum), MD_MN_MSG_SUSPEND_WRITES, - MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, + MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0, (char *)&suspwrmsg, sizeof (suspwrmsg), &resp, ep); if (resp != NULL) { free_result(resp); @@ 
-608,7 +605,7 @@ * time required. */ ret = mdmn_send_message(sp->setno, MD_MN_MSG_SETSYNC, - MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, + MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0, (char *)&setsyncmsg, sizeof (setsyncmsg), &resp, ep); if (resp != NULL) { free_result(resp); @@ -720,7 +717,7 @@ resyncmsg.msg_resync_mnum = mnum; result = mdmn_send_message(MD_MIN2SET(mnum), MD_MN_MSG_RESYNC_STARTING, - MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, + MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0, (char *)&resyncmsg, sizeof (resyncmsg), &resp, ep); if (resp != NULL) { @@ -905,7 +902,7 @@ tstatemsg.gettstate_dev = dev; result = mdmn_send_message(MD_MIN2SET(mnum), MD_MN_MSG_GET_TSTATE, - MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, + MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&tstatemsg, sizeof (tstatemsg), &resp, ep); if (result == 0)
--- a/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c Wed Dec 24 08:23:40 2008 -0700 @@ -1244,11 +1244,11 @@ * and the message doesn't need being logged either. * Hence NO_LOG and NO_MCT */ - err = mdmn_send_message( - sp->setno, MD_MN_MSG_CLU_CHECK, - MD_MSGF_NO_MCT | MD_MSGF_STOP_ON_ERROR | - MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, - (char *)&d, sizeof (md_isopen_t), &resp, ep); + err = mdmn_send_message(sp->setno, + MD_MN_MSG_CLU_CHECK, MD_MSGF_NO_MCT | + MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | + MD_MSGF_OVERRIDE_SUSPEND, 0, (char *)&d, + sizeof (md_isopen_t), &resp, ep); if (err == 0) { d.isopen = resp->mmr_exitval; } else {
--- a/usr/src/lib/lvm/libmeta/common/meta_runtime.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_runtime.c Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Just in case we're not in a build environment, make sure that * TEXT_DOMAIN gets set to something. @@ -171,9 +169,8 @@ ownerioctls_onp) != 0) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "%s: illegal value for %s: %s.\n"), - function_namep, - ownerioctls_namep, - param_valuep); + function_namep, ownerioctls_namep, + param_valuep); syslog(LOG_ERR, dgettext(TEXT_DOMAIN, "%s: illegal value for %s: %s.\n"), function_namep, @@ -216,6 +213,32 @@ } /* + * This controls what type of RPC errors are sent to syslog(). + * It is used as a bitmask against the clnt_stat list, which defines + * 0 as RPC_SUCCESS, so likely shouldn't be set. + * + * The #define below provides a default of all errors in the list. + * The default can then be modified to reduce the amount of traffic + * going to syslog in the event of RPC errors. 
+ */ + +#define DEFAULT_ERRMASK (UINT_MAX & ~(1 << RPC_SUCCESS)) + +uint_t +meta_rpc_err_mask(void) +{ + char *param_valuep; + uint_t retval = DEFAULT_ERRMASK; + + param_valuep = meta_get_rt_param("commd_RPC_errors", B_FALSE); + if (param_valuep != NULL) { + retval = (uint_t)strtol(param_valuep, NULL, 16); + free(param_valuep); + } + return (retval); +} + +/* * The following lines define private functions */ @@ -232,27 +255,23 @@ line_bufferp = (char *)malloc(line_buffer_size); if (line_bufferp == NULL) { - (void) fprintf(stderr, - dgettext(TEXT_DOMAIN, "%s: malloc failed\n"), - function_namep); - syslog(LOG_ERR, - dgettext(TEXT_DOMAIN, "%s: malloc failed\n"), - function_namep); + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: malloc failed\n"), function_namep); + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, "%s: malloc failed\n"), + function_namep); return (param_valuep); } param_filep = fopen(param_file_namep, "r"); if (param_filep == NULL) { - (void) fprintf(stderr, - dgettext(TEXT_DOMAIN, "%s: can't open %s\n"), - function_namep, param_file_namep); - syslog(LOG_ERR, - dgettext(TEXT_DOMAIN, "%s: can't open %s\n"), - function_namep, param_file_namep); + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: can't open %s\n"), function_namep, param_file_namep); + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, "%s: can't open %s\n"), + function_namep, param_file_namep); free(line_bufferp); return (param_valuep); } while ((fgets(line_bufferp, line_buffer_size, param_filep) != NULL) && - (param_valuep == NULL)) { + (param_valuep == NULL)) { newlinep = strchr(line_bufferp, '\n'); if (newlinep != NULL) { @@ -261,10 +280,10 @@ } param_name_tokenp = strtok(line_bufferp, token_separator_listp); if ((param_name_tokenp != NULL) && - (strcmp(param_namep, param_name_tokenp) == 0)) { + (strcmp(param_namep, param_name_tokenp) == 0)) { param_value_tokenp = strtok(NULL, - token_separator_listp); + token_separator_listp); } if (param_value_tokenp != NULL) { param_valuep = 
strdup(param_value_tokenp); @@ -282,18 +301,12 @@ } } if ((param_valuep == NULL) && (warn_if_not_found == B_TRUE)) { - (void) fprintf(stderr, - dgettext(TEXT_DOMAIN, - "%s: value of %s not set or error in %s\n"), - function_namep, - param_namep, - param_file_namep); - syslog(LOG_ERR, - dgettext(TEXT_DOMAIN, - "%s: value of %s not set or error in %s\n"), - function_namep, - param_namep, - param_file_namep); + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: value of %s not set or error in %s\n"), + function_namep, param_namep, param_file_namep); + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "%s: value of %s not set or error in %s\n"), + function_namep, param_namep, param_file_namep); } free(line_bufferp); (void) fclose(param_filep);
--- a/usr/src/lib/lvm/libmeta/common/meta_set.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_set.c Wed Dec 24 08:23:40 2008 -0700 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Just in case we're not in a build environment, make sure that * TEXT_DOMAIN gets set to something. @@ -1877,7 +1875,6 @@ return (NULL); } - /* * Get the devid associated with the key. * @@ -1893,6 +1890,11 @@ */ dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); free(devidp); + + /* dnp could be NULL if the devid could not be decoded. */ + if (dnp == NULL) { + return (NULL); + } dnp->side_names_key = key; } else { /* @@ -1981,6 +1983,9 @@ */ dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); free(devidp); + if (dnp == NULL) { + return (NULL); + } dnp->side_names_key = key; } } @@ -5733,6 +5738,7 @@ lr->lr_msg.msg_type, lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG | MD_MSGF_OVERRIDE_SUSPEND, + lr->lr_msg.msg_recipient, lr->lr_msg.msg_event_data, lr->lr_msg.msg_event_size, &resultp,
--- a/usr/src/lib/lvm/libmeta/common/meta_set_hst.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_set_hst.c Wed Dec 24 08:23:40 2008 -0700 @@ -18,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Just in case we're not in a build environment, make sure that * TEXT_DOMAIN gets set to something. @@ -148,7 +147,7 @@ send_rval = mdmn_send_message(sp->setno, MD_MN_MSG_META_MD_ADDSIDE, MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT, - (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t), + 0, (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t), &resultp, ep); if (send_rval != 0) { (void) mdstealerror(ep, &(resultp->mmr_ep)); @@ -178,7 +177,7 @@ * Let's see if it is hsp or not */ nm.devname = (uintptr_t)meta_getnmentbykey(sp->setno, - otherside, nm.key, &drvnm, NULL, NULL, ep); + otherside, nm.key, &drvnm, NULL, NULL, ep); if (nm.devname == NULL || drvnm == NULL) { if (nm.devname) Free((void *)(uintptr_t)nm.devname); @@ -229,9 +228,9 @@ * increment the count to sync up with the other sides. 
*/ for (i = 0; i < nm.ref_count; i++) { - if (add_name(sp, sideno, nm.key, dname, mnum, - cname, NULL, NULL, ep) == -1) - rval = -1; + if (add_name(sp, sideno, nm.key, dname, mnum, + cname, NULL, NULL, ep) == -1) + rval = -1; } Free(cname); @@ -323,17 +322,17 @@ (void) strcpy(nd->nd_nodename, node_v[i]); nd->nd_ctime = now; nd->nd_flags = (MD_MN_NODE_ALIVE | - MD_MN_NODE_ADD); + MD_MN_NODE_ADD); nl2 = nl; while (nl2) { - if (strcmp(nl2->msl_node_name, - node_v[i]) == 0) { - nd->nd_nodeid = nl2->msl_node_id; - (void) strcpy(nd->nd_priv_ic, - nl2->msl_node_addr); - break; - } - nl2 = nl2->next; + if (strcmp(nl2->msl_node_name, + node_v[i]) == 0) { + nd->nd_nodeid = nl2->msl_node_id; + (void) strcpy(nd->nd_priv_ic, + nl2->msl_node_addr); + break; + } + nl2 = nl2->next; } /* @@ -1123,7 +1122,7 @@ send_rval = mdmn_send_message(sp->setno, MD_MN_MSG_META_MD_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT, - (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t), + 0, (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t), &resultp, ep); if (send_rval != 0) { (void) mdstealerror(ep, &(resultp->mmr_ep)); @@ -1156,8 +1155,8 @@ * actually removed. */ for (i = 0; i < nm.ref_count; i++) { - if (del_name(sp, sideno, nm.key, ep) == -1) - return (-1); + if (del_name(sp, sideno, nm.key, ep) == -1) + return (-1); } } } @@ -1183,7 +1182,7 @@ continue; } has_set = nodehasset(sp, nd->nd_nodename, - NHS_NST_EQ, &xep); + NHS_NST_EQ, &xep); if (has_set >= 0) { nd = nd->nd_next; @@ -1207,7 +1206,7 @@ continue; has_set = nodehasset(sp, sd->sd_nodes[i], - NHS_NST_EQ, &xep); + NHS_NST_EQ, &xep); if (has_set >= 0) continue; @@ -1967,7 +1966,8 @@ return (-1); /* find the end of the link list */ - for (sn = dnp->side_names; sn->next != NULL; sn = sn->next); + for (sn = dnp->side_names; sn->next != NULL; sn = sn->next) + ; sn_next = &sn->next; if (meta_replicaslice(dnp, &rep_slice, ep) != 0) @@ -1986,13 +1986,13 @@ * used instead of meta_getnextside_devinfo. 
*/ if (meta_getside_devinfo(sp, np->bname, sideno, &sn->cname, - &sn->dname, &sn->mnum, ep) == -1) + &sn->dname, &sn->mnum, ep) == -1) err = -1; } else { /* decrement sideno, to look like the previous sideno */ sideno--; - if (meta_getnextside_devinfo(sp, np->bname, &sideno, &sn->cname, - &sn->dname, &sn->mnum, ep) == -1) + if (meta_getnextside_devinfo(sp, np->bname, &sideno, + &sn->cname, &sn->dname, &sn->mnum, ep) == -1) err = -1; } @@ -2377,14 +2377,14 @@ nd->nd_ctime = now; nl2 = nl; while (nl2) { - if (strcmp(nl2->msl_node_name, - node_v[nodeindex]) == 0) { - nd->nd_nodeid = nl2->msl_node_id; - (void) strcpy(nd->nd_priv_ic, - nl2->msl_node_addr); - break; - } - nl2 = nl2->next; + if (strcmp(nl2->msl_node_name, + node_v[nodeindex]) == 0) { + nd->nd_nodeid = nl2->msl_node_id; + (void) strcpy(nd->nd_priv_ic, + nl2->msl_node_addr); + break; + } + nl2 = nl2->next; } /* @@ -2773,16 +2773,16 @@ * rpc.mdcommd is running on the nodes with a set. */ if (remote_sets_created == 1) { - for (i = 0; i < node_c; i++) { - if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT, - sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { - if (rval == 0) - (void) mdstealerror(ep, &xep); - rval = -1; - mde_perror(ep, dgettext(TEXT_DOMAIN, - "Unable to reinit rpc.mdcommd.\n")); - } - } + for (i = 0; i < node_c; i++) { + if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + } + } } } if ((suspend1_flag) || (suspendall_flag)) { @@ -2819,17 +2819,18 @@ * rpc.mdcommd is be running on the nodes with a set. 
*/ if (remote_sets_created == 1) { - for (i = 0; i < node_c; i++) { - /* Already verified to be alive */ - if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME, - sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { - if (rval == 0) - (void) mdstealerror(ep, &xep); - rval = -1; - mde_perror(ep, dgettext(TEXT_DOMAIN, - "Unable to resume rpc.mdcommd.\n")); - } - } + for (i = 0; i < node_c; i++) { + /* Already verified to be alive */ + if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, + &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + } } meta_ping_mnset(sp->setno); /* @@ -4031,7 +4032,8 @@ rb_medr.med_rec_sn = sp->setno; (void) strcpy(rb_medr.med_rec_snm, sp->setname); for (i = 0; i < MD_MAXSIDES; i++) - (void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]); + (void) strcpy(rb_medr.med_rec_nodes[i], + sd->sd_nodes[i]); rb_medr.med_rec_meds = sd->sd_med; /* structure assigment */ (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t)); rb_medr.med_rec_foff = 0; @@ -4432,45 +4434,52 @@ * alive nodes are updated correctly. */ if (strcmp(nd->nd_nodename, node_v[i]) == 0) { - if ((oha == TRUE) && - (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + if ((oha == TRUE) && (!(nd->nd_flags & + MD_MN_NODE_ALIVE))) { nd->nd_flags |= MD_MN_NODE_DEL; nd->nd_flags &= ~MD_MN_NODE_OK; nd = nd->nd_next; continue; - } - if (nd->nd_flags & MD_MN_NODE_OWN) { - /* - * Going to set locally cached node - * flags to rollback join so in case - * of error, the rollback code knows - * which nodes to re-join. - * rpc.metad ignores the RB_JOIN flag. - */ - nd->nd_flags |= MD_MN_NODE_RB_JOIN; - nd->nd_flags &= ~MD_MN_NODE_OWN; - - /* - * Be careful in ordering of following - * steps so that recovery from a panic - * between the steps is viable. 
- * Only reset master info in rpc.metad - * - don't reset local cached info - * which will be used to set master - * info back if failure (rollback). - */ - if (clnt_withdrawset(nd->nd_nodename, - sp, ep)) - goto rollback; - - /* Reset master on deleted node */ - if (clnt_mnsetmaster(node_v[i], sp, "", - MD_MN_INVALID_NID, ep)) - goto rollback; - } - - nd->nd_flags |= MD_MN_NODE_DEL; - nd->nd_flags &= ~MD_MN_NODE_OK; + } + if (nd->nd_flags & MD_MN_NODE_OWN) { + /* + * Going to set locally cached + * node flags to rollback join + * so in case of error, the + * rollback code knows which + * nodes to re-join. rpc.metad + * ignores the RB_JOIN flag. + */ + nd->nd_flags |= + MD_MN_NODE_RB_JOIN; + nd->nd_flags &= ~MD_MN_NODE_OWN; + + /* + * Be careful in ordering of + * following steps so that + * recovery from a panic + * between the steps is viable. + * Only reset master info in + * rpc.metad - don't reset + * local cached info which will + * be used to set master info + * back if failure (rollback). 
+ */ + if (clnt_withdrawset( + nd->nd_nodename, sp, ep)) + goto rollback; + + /* + * Reset master on deleted node + */ + if (clnt_mnsetmaster(node_v[i], + sp, "", MD_MN_INVALID_NID, + ep)) + goto rollback; + } + + nd->nd_flags |= MD_MN_NODE_DEL; + nd->nd_flags &= ~MD_MN_NODE_OK; } nd = nd->nd_next; } @@ -4503,37 +4512,37 @@ /* Send reinit */ nd = sd->sd_nodelist; while (nd) { - if ((oha == TRUE) && - (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { - nd = nd->nd_next; - continue; - } - /* Class is ignored for REINIT */ - if (clnt_mdcommdctl(nd->nd_nodename, - COMMDCTL_REINIT, - sp, NULL, MD_MSCF_NO_FLAGS, ep)) { - mde_perror(ep, dgettext(TEXT_DOMAIN, - "Unable to reinit rpc.mdcommd.\n")); - goto rollback; - } - nd = nd->nd_next; + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_REINIT, sp, NULL, + MD_MSCF_NO_FLAGS, ep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + goto rollback; + } + nd = nd->nd_next; } /* Send resume */ nd = sd->sd_nodelist; while (nd) { - if ((oha == TRUE) && - (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { - nd = nd->nd_next; - continue; - } - if (clnt_mdcommdctl(nd->nd_nodename, - COMMDCTL_RESUME, sp, MD_MSG_CLASS0, - MD_MSCF_DONT_RESUME_CLASS1, ep)) { - mde_perror(ep, dgettext(TEXT_DOMAIN, - "Unable to resume rpc.mdcommd.\n")); - goto rollback; - } - nd = nd->nd_next; + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_RESUME, sp, MD_MSG_CLASS0, + MD_MSCF_DONT_RESUME_CLASS1, ep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + goto rollback; + } + nd = nd->nd_next; } meta_ping_mnset(sp->setno); } @@ -4727,50 +4736,52 @@ RB_TEST(24, "deletehosts", ep) } } else { - nd = sd->sd_nodelist; - /* All nodes guaranteed to be ALIVE unless in oha mode */ - while (nd) 
{ - /* - * If mirror owner was set to a deleted node, then - * each existing node resets mirror owner to NULL. - * - * During OHA mode, don't issue RPCs to - * non-alive nodes since there is no reason to - * wait for RPC timeouts. - */ - if ((oha == TRUE) && - (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { - nd = nd->nd_next; - continue; - } - - /* Skip nodes being deleted */ - if (strinlst(nd->nd_nodename, node_c, node_v)) { + nd = sd->sd_nodelist; + /* All nodes guaranteed ALIVE unless in oha mode */ + while (nd) { + /* + * If mirror owner was set to a deleted node, + * then each existing node resets mirror owner + * to NULL. + * + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + + /* Skip nodes being deleted */ + if (strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + + /* + * If mirror owner is a deleted node, reset + * mirror owners to NULL. If an error occurs, + * print a warning and continue. Don't fail + * metaset because of mirror owner reset + * problem since next node to grab mirror + * will resolve this issue. Before next node + * grabs mirrors, metaset will show the deleted + * node as owner which is why an attempt to + * reset the mirror owner is made. + */ + if (clnt_reset_mirror_owner(nd->nd_nodename, sp, + node_c, &node_id_list[0], &xep) == -1) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to reset mirror owner on" + " node %s\n"), nd->nd_nodename); + mdclrerror(&xep); + } + + RB_TEST(21, "deletehosts", ep) nd = nd->nd_next; - continue; - } - - /* - * If mirror owner is a deleted node, reset mirror - * owners to NULL. If an error occurs, print a - * warning and continue. Don't fail metaset - * because of mirror owner reset problem since next - * node to grab mirror will resolve this issue. 
- * Before next node grabs mirrors, metaset will show - * the deleted node as owner which is why an attempt - * to reset the mirror owner is made. - */ - if (clnt_reset_mirror_owner(nd->nd_nodename, sp, - node_c, &node_id_list[0], &xep) == -1) { - mde_perror(&xep, dgettext(TEXT_DOMAIN, - "Unable to reset mirror owner on" - " node %s\n"), nd->nd_nodename); - mdclrerror(&xep); - } - - RB_TEST(21, "deletehosts", ep) - nd = nd->nd_next; - } + } } } @@ -4790,10 +4801,10 @@ for (i = 0; i < MD_MAXSIDES; i++) { if (strinlst(sd->sd_nodes[i], node_c, node_v)) (void) memset(&medr.med_rec_nodes[i], - '\0', sizeof (md_node_nm_t)); + '\0', sizeof (md_node_nm_t)); else (void) strcpy(medr.med_rec_nodes[i], - sd->sd_nodes[i]); + sd->sd_nodes[i]); } crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); @@ -5636,79 +5647,85 @@ /* Lock the set on our side */ if (clnt_lock_set(hostname, sp, ep)) { - rval = -1; - goto out; + rval = -1; + goto out; } if (take_val) { - /* enable auto_take but only if it is not already set */ - if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) { - /* verify that we're the only host in the set */ - for (i = 0; i < MD_MAXSIDES; i++) { - if (sd->sd_nodes[i] == NULL || sd->sd_nodes[i][0] == '\0') - continue; - - if (strcmp(sd->sd_nodes[i], hostname) != 0) { - (void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, - NULL, sp->setname); - rval = -1; - goto out; - } + /* enable auto_take but only if it is not already set */ + if (! 
(sd->sd_flags & MD_SR_AUTO_TAKE)) { + /* verify that we're the only host in the set */ + for (i = 0; i < MD_MAXSIDES; i++) { + if (sd->sd_nodes[i] == NULL || + sd->sd_nodes[i][0] == '\0') + continue; + + if (strcmp(sd->sd_nodes[i], hostname) != 0) { + (void) mddserror(ep, MDE_DS_SINGLEHOST, + sp->setno, NULL, NULL, sp->setname); + rval = -1; + goto out; + } + } + + if (clnt_enable_sr_flags(hostname, sp, + MD_SR_AUTO_TAKE, ep)) + rval = -1; + + /* Disable SCSI reservations */ + if (sd->sd_flags & MD_SR_MB_DEVID) + dd = metaget_drivedesc(sp, MD_BASICNAME_OK | + PRINT_FAST, &xep); + else + dd = metaget_drivedesc(sp, MD_BASICNAME_OK, + &xep); + + if (! mdisok(&xep)) + mdclrerror(&xep); + + if (dd != NULL) { + if (rel_own_bydd(sp, dd, TRUE, &xep)) + mdclrerror(&xep); + } } - if (clnt_enable_sr_flags(hostname, sp, MD_SR_AUTO_TAKE, ep)) - rval = -1; - - /* Disable SCSI reservations */ - if (sd->sd_flags & MD_SR_MB_DEVID) - dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST, - &xep); - else - dd = metaget_drivedesc(sp, MD_BASICNAME_OK, &xep); - if (! mdisok(&xep)) - mdclrerror(&xep); - - if (dd != NULL) { - if (rel_own_bydd(sp, dd, TRUE, &xep)) - mdclrerror(&xep); - } - } - } else { - /* disable auto_take, if set, or error */ - if (sd->sd_flags & MD_SR_AUTO_TAKE) { - if (clnt_disable_sr_flags(hostname, sp, MD_SR_AUTO_TAKE, ep)) - rval = -1; - - /* Enable SCSI reservations */ - if (sd->sd_flags & MD_SR_MB_DEVID) - dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST, - &xep); - else - dd = metaget_drivedesc(sp, MD_BASICNAME_OK, &xep); - if (! 
mdisok(&xep)) - mdclrerror(&xep); - - if (dd != NULL) { - mhd_mhiargs_t mhiargs = defmhiargs; - - if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep)) - mdclrerror(&xep); + /* disable auto_take, if set, or error */ + if (sd->sd_flags & MD_SR_AUTO_TAKE) { + if (clnt_disable_sr_flags(hostname, sp, + MD_SR_AUTO_TAKE, ep)) + rval = -1; + + /* Enable SCSI reservations */ + if (sd->sd_flags & MD_SR_MB_DEVID) + dd = metaget_drivedesc(sp, MD_BASICNAME_OK | + PRINT_FAST, &xep); + else + dd = metaget_drivedesc(sp, MD_BASICNAME_OK, + &xep); + + if (! mdisok(&xep)) + mdclrerror(&xep); + + if (dd != NULL) { + mhd_mhiargs_t mhiargs = defmhiargs; + + if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep)) + mdclrerror(&xep); + } + } else { + (void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno, + NULL, NULL, sp->setname); + rval = -1; } - - } else { - (void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno, NULL, NULL, - sp->setname); - rval = -1; - } } out: cl_sk = cl_get_setkey(sp->setno, sp->setname); if (clnt_unlock_set(hostname, cl_sk, &xep)) { - if (rval == 0) - (void) mdstealerror(ep, &xep); - rval = -1; + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; } cl_set_setkey(NULL);
--- a/usr/src/lib/lvm/libmeta/common/meta_sp.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/lib/lvm/libmeta/common/meta_sp.c Wed Dec 24 08:23:40 2008 -0700 @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -1895,7 +1896,7 @@ wm.wm_mdname); result = mdmn_send_message(sp->setno, MD_MN_MSG_ADDMDNAME, - MD_MSGF_PANIC_WHEN_INCONSISTENT, + MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)send_params, message_size, &resp, ep); Free(send_params); @@ -2384,10 +2385,11 @@ } /* - * FUNCTION: meta_sp_update_wm() + * FUNCTION: meta_sp_update_wm_common() * INPUT: sp - the operating set * msp - a pointer to the XDR unit structure * extlist - the extent list specifying watermarks to update + * iocval - either MD_IOC_SPUPDATEWM or MD_MN_IOC_SPUPDATEWM * OUTPUT: ep - return error pointer * RETURNS: int - -1 if error, 0 on success * PURPOSE: steps backwards through the extent list updating @@ -2401,10 +2403,11 @@ * are realized. 
*/ static int -meta_sp_update_wm( +meta_sp_update_wm_common( mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *extlist, + int iocval, md_error_t *ep ) { @@ -2493,8 +2496,8 @@ MD_SETDRIVERNAME(&update_params, MD_SP, MD_MIN2SET(update_params.mnum)); - if (metaioctl(MD_IOC_SPUPDATEWM, &update_params, - &update_params.mde, msp->common.namep->cname) != 0) { + if (metaioctl(iocval, &update_params, &update_params.mde, + msp->common.namep->cname) != 0) { (void) mdstealerror(ep, &update_params.mde); rval = -1; goto out; @@ -2507,6 +2510,30 @@ return (rval); } +static int +meta_sp_update_wm( + mdsetname_t *sp, + md_sp_t *msp, + sp_ext_node_t *extlist, + md_error_t *ep +) +{ + return (meta_sp_update_wm_common(sp, msp, extlist, MD_IOC_SPUPDATEWM, + ep)); +} + +static int +meta_mn_sp_update_wm( + mdsetname_t *sp, + md_sp_t *msp, + sp_ext_node_t *extlist, + md_error_t *ep +) +{ + return (meta_sp_update_wm_common(sp, msp, extlist, MD_MN_IOC_SPUPDATEWM, + ep)); +} + /* * FUNCTION: meta_sp_clear_wm() * INPUT: sp - the operating set @@ -4227,9 +4254,9 @@ int committed = 0; int repart_options = MD_REPART_FORCE; int create_flag = MD_CRO_32BIT; + int mn_set_master = 0; md_set_desc *sd; - mm_unit_t *mm; md_set_mmown_params_t *ownpar = NULL; int comp_is_mirror = 0; @@ -4417,19 +4444,7 @@ goto out; } if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) { - mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep); - if (mm == NULL) { - rval = -1; - goto out; - } else { - rval = meta_mn_change_owner(&ownpar, sp->setno, - meta_getminor(compnp->dev), - sd->sd_mn_mynode->nd_nodeid, - MD_MN_MM_PREVENT_CHANGE | - MD_MN_MM_SPAWN_THREAD); - if (rval == -1) - goto out; - } + mn_set_master = 1; } } @@ -4450,22 +4465,22 @@ committed = 1; /* write watermarks */ - if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) { - rval = -1; - goto out; - } - /* - * Allow mirror ownership to change. 
If we don't succeed in this - * ioctl it isn't fatal, but the cluster will probably hang fairly - * soon as the mirror owner won't change. However, we have - * successfully written the watermarks out to the device so the - * softpart creation has succeeded + * Special-case for Multi-node sets. As we now have a distributed DRL + * update mechanism, we _will_ hit the ioctl-within-ioctl deadlock case + * unless we use a 'special' MN-capable ioctl to stage the watermark + * update. This only affects the master-node in an MN set. */ - if (ownpar) { - (void) meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum, - ownpar->d.owner, - MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD); + if (mn_set_master) { + if (meta_mn_sp_update_wm(sp, msp, extlist, ep) < 0) { + rval = -1; + goto out; + } + } else { + if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) { + rval = -1; + goto out; + } } /* second phase of commit, set status to MD_SP_OK */ @@ -5838,7 +5853,7 @@ sp_setstat_params.sp_setstat_status = status; result = mdmn_send_message(sp->setno, - MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS, + MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS, 0, (char *)&sp_setstat_params, sizeof (sp_setstat_params), &resp, ep); @@ -6022,7 +6037,7 @@ compnp->cname); result = mdmn_send_message(sp->setno, MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS, - (char *)send_params, message_size, &resp, + 0, (char *)send_params, message_size, &resp, ep); Free(send_params); if (resp != NULL) { @@ -6154,7 +6169,7 @@ sizeof (*un_array[i]) - sizeof (mp_ext_t) + (un_array[i]->un_numexts * sizeof (mp_ext_t))); result = mdmn_send_message(sp->setno, - MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS, + MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS, 0, (char *)&send_params, mess_size, &resp, ep); if (resp != NULL) { @@ -6303,7 +6318,8 @@ send_params.delkeyname_key = np->key; (void) mdmn_send_message(sp->setno, MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS, - (char *)&send_params, sizeof (send_params), + 0, (char *)&send_params, + sizeof 
(send_params), &resp, ep); if (resp != NULL) { free_result(resp);
--- a/usr/src/uts/common/io/lvm/md/md.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/io/lvm/md/md.c Wed Dec 24 08:23:40 2008 -0700 @@ -1858,6 +1858,9 @@ case MD_MN_RESYNC: case MD_MN_SETSYNC: case MD_MN_POKE_HOTSPARES: + case MD_MN_RR_DIRTY: + case MD_MN_RR_CLEAN: + case MD_MN_IOC_SPUPDATEWM: return (1); default: return (0);
--- a/usr/src/uts/common/io/lvm/md/md_ioctl.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/io/lvm/md/md_ioctl.c Wed Dec 24 08:23:40 2008 -0700 @@ -80,40 +80,80 @@ extern int med_set_t_ioctl(mddb_med_t_parm_t *tpp, int mode); extern unit_t md_get_nextunit(set_t setno); -static int md_mn_commd_present; - /* md_mddb.c */ extern mddb_set_t *mddb_setenter(set_t setno, int flag, int *errorcodep); extern void mddb_setexit(mddb_set_t *s); extern md_krwlock_t nm_lock; +#define MD_MN_COMMD_CMD "rpc.mdcommd" +static pid_t md_mn_commd_pid; + /* * md_mn_is_commd_present: * ---------------------- * Determine if commd is running on this node. * - * Returns: - * 1 if commd has been started - * 0 if commd has not been started or has exited + * If md_mn_commd_pid is 0, trust it. Otherwise, do some in-depth checking + * to make sure it's still the one we originally set up by checking the + * provided PID's u_comm for the right program name in u_comm. + * + * This one's intended for the "something went awry" cases, and not for + * general use, due to its higher cost for the good/normal case. */ int md_mn_is_commd_present(void) { - return (md_mn_commd_present ? 1 : 0); + proc_t *commd_procp; + + if (md_mn_commd_pid == (pid_t)0) { + return (0); + } + + /* some in-depth checking */ + mutex_enter(&pidlock); + if ((commd_procp = prfind(md_mn_commd_pid)) != NULL && + strncmp(commd_procp->p_user.u_comm, + MD_MN_COMMD_CMD, strlen(MD_MN_COMMD_CMD)) == 0) { + mutex_exit(&pidlock); + /* + * returns a little more info than asked for, but it will + * never be PID 0 when valid. + */ + return ((int)md_mn_commd_pid); + } + /* if it's not there, make sure we only do these contortions once */ + md_mn_commd_pid = (pid_t)0; + mutex_exit(&pidlock); + + cmn_err(CE_WARN, "!rpc.mdcommd exited abnormally"); + return (0); +} + +/* + * This version merely checks the PID value that was set via an ioctl. 
+ * It's intended to be used in the main code flow, where performance is + * critical, and accuracy can be sacrificed a little. If something is + * already known to be wrong, don't use this, but use + * md_mn_is_commd_present() instead. + */ +int +md_mn_is_commd_present_lite(void) +{ + return ((int)md_mn_commd_pid); } /* * md_mn_clear_commd_present: * ------------------------- - * Clear the commd_present flag. Called only from a CPR request to suspend / - * terminate a resync thread. We clear the md_mn_commd_present flag so that + * Clear the md_mn_commd_pid. Called only from a CPR request to suspend / + * terminate a resync thread. We clear the md_mn_commd_pid so that * any RPC request that was in transit can complete with a failure and _not_ * result in an unexpected system panic. */ void md_mn_clear_commd_present() { - md_mn_commd_present = 0; + md_mn_commd_pid = (pid_t)0; } /* @@ -855,7 +895,6 @@ return (mderror(mdep, MDE_UNIT_NOT_FOUND)); } - rw_enter(&md_ops[modindex]->md_link_rw.lock, RW_READER); /* if array length is not 0 then allocate the output buffers */ if (minor_array_length != 0) { sz = minor_array_length * ((int)sizeof (minor_t)); @@ -863,6 +902,7 @@ m_ptr = minors; } + rw_enter(&md_ops[modindex]->md_link_rw.lock, RW_READER); next = md_ops[modindex]->md_head; count = 0; while (next) { @@ -2976,6 +3016,7 @@ setno, MD_MN_MSG_TEST1, flags, + 0, (char *)&msg_test, sizeof (msg_test), result); @@ -3019,6 +3060,7 @@ setno, MD_MN_MSG_TEST2, flags, + 0, (char *)&msg_test, sizeof (msg_test), result); @@ -3408,7 +3450,7 @@ } /* - * Update md_mn_commd_present global to reflect presence or absence of + * Update md_mn_commd_pid global to reflect presence or absence of * /usr/sbin/rpc.mdcommd. This allows us to determine if an RPC failure * is expected during a mdmn_ksend_message() handshake. If the commd is * not present then an RPC failure is acceptable. If the commd _is_ @@ -3420,7 +3462,7 @@ if (! 
(mode & FWRITE)) return (EACCES); - md_mn_commd_present = (int)(intptr_t)data; + md_mn_commd_pid = (pid_t)(intptr_t)data; err = 0; break; }
--- a/usr/src/uts/common/io/lvm/md/md_mddb.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/io/lvm/md/md_mddb.c Wed Dec 24 08:23:40 2008 -0700 @@ -18,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/conf.h> #include <sys/time.h> @@ -643,7 +642,7 @@ freeblks = 0; for (mbip = s->s_mbiarray[i]; mbip != NULL; - mbip = mbip->mbi_next) { + mbip = mbip->mbi_next) { freeblks += mbip->mbi_mddb_mb.mb_blkcnt; } if (freeblks == 0) /* this happen when there is no */ @@ -798,7 +797,7 @@ if ((did_freep1->free_blk == firstblk) && (did_freep1->free_offset <= offset) && ((did_freep1->free_length + did_freep1->free_offset) >= - (length + offset))) { + (length + offset))) { /* Have found our entry - remove from list */ block_found = 1; did_freep_before = did_freep1; @@ -816,17 +815,17 @@ * offset, length. */ did_freep_before->free_length = offset - - did_freep_before->free_offset; + did_freep_before->free_offset; /* * did_freep_after points to area in block after * offset, length. 
*/ did_freep_after = (mddb_did_free_t *)kmem_zalloc - (sizeof (mddb_did_free_t), KM_SLEEP); + (sizeof (mddb_did_free_t), KM_SLEEP); did_freep_after->free_blk = did_freep_before->free_blk; did_freep_after->free_offset = offset + length; did_freep_after->free_length = old_length - length - - did_freep_before->free_length; + did_freep_before->free_length; /* * Add before and after areas to free list * If area before or after offset, length has length @@ -835,28 +834,30 @@ if (did_freep_after->free_length) { did_freep_after->free_next = did_freep1; if (did_freep2) { - did_freep2->free_next = did_freep_after; + did_freep2->free_next = + did_freep_after; } else { - s->s_did_icp->did_ic_freep = - did_freep_after; + s->s_did_icp->did_ic_freep = + did_freep_after; } did_freep1 = did_freep_after; } else { kmem_free(did_freep_after, - sizeof (mddb_did_free_t)); + sizeof (mddb_did_free_t)); } if (did_freep_before->free_length) { did_freep_before->free_next = did_freep1; if (did_freep2) { - did_freep2->free_next = did_freep_before; + did_freep2->free_next = + did_freep_before; } else { - s->s_did_icp->did_ic_freep = - did_freep_before; + s->s_did_icp->did_ic_freep = + did_freep_before; } } else { kmem_free(did_freep_before, - sizeof (mddb_did_free_t)); + sizeof (mddb_did_free_t)); } break; } else { @@ -934,10 +935,10 @@ if (freep->free_length == 0) { if (freep2) { freep2->free_next = - freep->free_next; + freep->free_next; } else { s->s_did_icp->did_ic_freep = - freep->free_next; + freep->free_next; } kmem_free(freep, sizeof (mddb_did_free_t)); } @@ -971,7 +972,7 @@ /* Add unused part of block to free list */ (void) mddb_devid_free_add(s, blk_num, - len, (dbtob(blk_cnt) - len)); + len, (dbtob(blk_cnt) - len)); } return ((caddr_t)devid_ptr); @@ -1015,9 +1016,9 @@ return (0); devid_len = ddi_devid_sizeof(devid); - devid_ptr = (ddi_devid_t) - mddb_devid_free_get(s, devid_len, &blk, &blkcnt, - &offset); + devid_ptr = (ddi_devid_t)mddb_devid_free_get(s, + devid_len, &blk, &blkcnt, 
&offset); + if (devid_ptr == NULL) { return (1); } @@ -1090,7 +1091,7 @@ /* Add new free space in disk block to free list */ (void) mddb_devid_free_add(s, did_info->info_firstblk, - did_info->info_offset, did_info->info_length); + did_info->info_offset, did_info->info_length); return (0); } @@ -1439,7 +1440,7 @@ for (i = 0; i < cnt; i++) blkarray[i] = blk + i; ret = wrtblklst(s, buffer, blkarray, cnt, - li, 0, MDDB_WR_ONLY_MASTER); + li, 0, MDDB_WR_ONLY_MASTER); kmem_free(blkarray, size); return (ret); } @@ -1505,7 +1506,7 @@ did_blk = s->s_did_icp->did_ic_blkp; did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt; crcgen(did_blk, &did_blk->blk_checksum, - dbtob(lbp->lb_didblkcnt), NULL); + dbtob(lbp->lb_didblkcnt), NULL); } crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); @@ -1521,20 +1522,20 @@ did_dbp = s->s_did_icp->did_ic_dbp; while (did_dbp) { err |= writeblks(s, (caddr_t)did_dbp->db_ptr, - did_dbp->db_firstblk, - did_dbp->db_blkcnt, li, - MDDB_WR_ONLY_MASTER); + did_dbp->db_firstblk, + did_dbp->db_blkcnt, li, + MDDB_WR_ONLY_MASTER); did_dbp = did_dbp->db_next; } /* write out device id area block */ err |= writeblks(s, (caddr_t)did_blk, - lbp->lb_didfirstblk, lbp->lb_didblkcnt, li, - MDDB_WR_ONLY_MASTER); + lbp->lb_didfirstblk, lbp->lb_didblkcnt, li, + MDDB_WR_ONLY_MASTER); } /* write out locator block */ err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, - MDDB_WR_ONLY_MASTER); + MDDB_WR_ONLY_MASTER); } /* @@ -1715,7 +1716,7 @@ size_t size; size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) + - sizeof (mddb_block_t) * dep->de_blkcount; + sizeof (mddb_block_t) * dep->de_blkcount; return (size); } @@ -1727,7 +1728,7 @@ size_t size; size = sizeof (*dep) - sizeof (dep->de32_blks) + - sizeof (mddb_block_t) * dep->de32_blkcount; + sizeof (mddb_block_t) * dep->de32_blkcount; return (size); } @@ -1760,7 +1761,7 @@ if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0)) db32p->db32_firstentry = 0x4; de32p = (mddb_de32_t *)((void *) 
((caddr_t)(&db32p->db32_firstentry) - + sizeof (db32p->db32_firstentry))); + + sizeof (db32p->db32_firstentry))); for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { detode32(dep, de32p); if ((dep->de_next != NULL) && (de32p->de32_next == 0)) @@ -2067,9 +2068,9 @@ dep = (mddb_de_ic_t *) kmem_zalloc(sizeof (mddb_de_ic_t) - - sizeof (mddb_block_t) + - sizeof (mddb_block_t) * de32p->de32_blkcount, - KM_SLEEP); + sizeof (mddb_block_t) + + sizeof (mddb_block_t) * de32p->de32_blkcount, + KM_SLEEP); de32tode(de32p, dep); dbp->db_firstentry = dep; @@ -2078,10 +2079,10 @@ de32p2 = nextentry(de32p); dep2 = (mddb_de_ic_t *)kmem_zalloc( - sizeof (mddb_de_ic_t) - - sizeof (mddb_block_t) + - sizeof (mddb_block_t) * - de32p2->de32_blkcount, KM_SLEEP); + sizeof (mddb_de_ic_t) - + sizeof (mddb_block_t) + + sizeof (mddb_block_t) * + de32p2->de32_blkcount, KM_SLEEP); de32tode(de32p2, dep2); @@ -2277,10 +2278,9 @@ if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops) != NULL) { error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, - prop_op, - DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, - "removable-media", - (caddr_t)&propvalue, &proplength); + prop_op, DDI_PROP_NOTPROM | + DDI_PROP_DONTPASS, "removable-media", + (caddr_t)&propvalue, &proplength); if (error == DDI_PROP_SUCCESS) removable = 1; @@ -2348,7 +2348,7 @@ (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) && (type >= MDDB_FIRST_MODID) && ((rbp->rb_revision == MDDB_REV_RB) || - (rbp->rb_revision == MDDB_REV_RBFN))) { + (rbp->rb_revision == MDDB_REV_RBFN))) { switch (dep->de_flags) { @@ -2512,7 +2512,7 @@ * In a MN diskset, any node can write optimized record(s). */ wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, - dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE); + dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE); /* * For MN diskset, set error in optinfo structure so * that mddb_commitrec knows which replica failed. 
@@ -2556,10 +2556,10 @@ lp = &lbp->lb_locators[dep->de_optinfo[0].o_li]; if (lp == bfp->bf_locator) { dep->de_optinfo[0].o_flags |= - MDDB_F_EWRITE; + MDDB_F_EWRITE; } else { dep->de_optinfo[1].o_flags |= - MDDB_F_EWRITE; + MDDB_F_EWRITE; } } err |= MDDB_F_EWRITE; @@ -2689,7 +2689,7 @@ create_db32rec(db32p, dbp); crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); err = writeall(s, (caddr_t)db32p, db32p->db32_blknum, - 1, MDDB_WR_ONLY_MASTER); + 1, MDDB_WR_ONLY_MASTER); kmem_free((caddr_t)db32p, MDDB_BSIZE); return (err); } @@ -2932,13 +2932,13 @@ if (MD_UPGRADE) { ldev = md_makedevice(md_targ_name_to_major(clp->l_driver), - clp->l_mnum); + clp->l_mnum); } else { if (ddi_name_to_major(clp->l_driver) == (major_t)-1) return (EINVAL); ldev = md_makedevice(ddi_name_to_major(clp->l_driver), - clp->l_mnum); + clp->l_mnum); } if (clp->l_devid != 0) { @@ -3099,7 +3099,7 @@ create_db32rec(db32p, dbp); crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li, - MDDB_WR_ONLY_MASTER); + MDDB_WR_ONLY_MASTER); kmem_free((caddr_t)db32p, MDDB_BSIZE); if (err) return (err); @@ -3804,7 +3804,7 @@ lnp->ln_revision = MDDB_REV_LN; crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, - lbp->lb_lnblkcnt, 0); + lbp->lb_lnblkcnt, 0); /* * If a MN diskset and this is the master, set the PARSE_LOCNM * flag in the mddb_set structure to show that the locator @@ -4413,28 +4413,34 @@ } if (lbp->lb_flags & MDDB_DEVID_STYLE) { - did_info = &(did_icp->did_ic_blkp->blk_info[li]); - if (did_info->info_flags & MDDB_DID_EXISTS) { - sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]); - if (clp->l_devid_flags & MDDB_DEVID_SPACE) { - /* copy device id from mddb to cfg_loc structure */ - szalloc = clp->l_devid_sz; - if (sz <= szalloc) { - for (i = 0; i < sz; i++) { - ((char *)(uintptr_t)clp->l_devid)[i] = - ((char *)did_icp->did_ic_devid[li])[i]; + did_info = 
&(did_icp->did_ic_blkp->blk_info[li]); + if (did_info->info_flags & MDDB_DID_EXISTS) { + sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]); + if (clp->l_devid_flags & MDDB_DEVID_SPACE) { + /* + * copy device id from mddb to + * cfg_loc structure + */ + szalloc = clp->l_devid_sz; + if (sz <= szalloc) { + for (i = 0; i < sz; i++) { + ((char *)(uintptr_t) + clp->l_devid)[i] = + ((char *)did_icp-> + did_ic_devid[li])[i]; + } + clp->l_devid_flags |= MDDB_DEVID_VALID; + (void) strcpy(clp->l_minor_name, + did_info->info_minor_name); + } else { + clp->l_devid_flags |= + MDDB_DEVID_NOSPACE; } - clp->l_devid_flags |= MDDB_DEVID_VALID; - (void) strcpy(clp->l_minor_name, - did_info->info_minor_name); - } else { - clp->l_devid_flags |= MDDB_DEVID_NOSPACE; - } - } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) { - clp->l_devid_flags = MDDB_DEVID_SZ; - clp->l_devid_sz = sz; - } - } + } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) { + clp->l_devid_flags = MDDB_DEVID_SZ; + clp->l_devid_sz = sz; + } + } } /* @@ -4770,8 +4776,7 @@ * lb_blkcnt will be set correctly for MN set later once getmasters * has determined that the set is a MN set. */ - lb_blkcnt = ((setno == MD_LOCAL_SET) ? - MDDB_LOCAL_LBCNT : MDDB_LBCNT); + lb_blkcnt = ((setno == MD_LOCAL_SET) ? 
MDDB_LOCAL_LBCNT : MDDB_LBCNT); for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL | @@ -4919,8 +4924,8 @@ /* Read in device ID block */ if (did_icp == NULL) { did_icp = (mddb_did_ic_t *) - kmem_zalloc(sizeof (mddb_did_ic_t), - KM_SLEEP); + kmem_zalloc(sizeof (mddb_did_ic_t), + KM_SLEEP); } else { /* Reuse did_icp, but clear out data */ if (did_icp->did_ic_blkp != @@ -4932,22 +4937,23 @@ (mddb_did_blk_t *)NULL; } if (did_icp->did_ic_dbp != - (mddb_did_db_t *)NULL) { + (mddb_did_db_t *)NULL) { did_dbp1 = did_icp->did_ic_dbp; while (did_dbp1) { - did_dbp2 = did_dbp1->db_next; - kmem_free((caddr_t)did_dbp1->db_ptr, - dbtob(did_dbp1->db_blkcnt)); - kmem_free((caddr_t)did_dbp1, - sizeof (mddb_did_db_t)); - did_dbp1 = did_dbp2; + did_dbp2 = did_dbp1->db_next; + kmem_free((caddr_t) + did_dbp1->db_ptr, + dbtob(did_dbp1->db_blkcnt)); + kmem_free((caddr_t)did_dbp1, + sizeof (mddb_did_db_t)); + did_dbp1 = did_dbp2; } did_icp->did_ic_dbp = - (mddb_did_db_t *)NULL; + (mddb_did_db_t *)NULL; } for (i = 0; i < MDDB_NLB; i++) { did_icp->did_ic_devid[i] = - (ddi_devid_t)NULL; + (ddi_devid_t)NULL; } } @@ -4985,7 +4991,7 @@ if (revchk(MDDB_REV_DI, did_blkp->blk_revision)) continue; if (crcchk(did_blkp, &did_blkp->blk_checksum, - dbtob(lbp->lb_didblkcnt), NULL)) + dbtob(lbp->lb_didblkcnt), NULL)) continue; /* @@ -5037,82 +5043,106 @@ * have been updated to match this valid device * id information. 
*/ - for (li = 0; li < lbp->lb_loccnt; li++) { - did_info = &did_blkp->blk_info[li]; - if (did_info->info_flags & MDDB_DID_EXISTS) - did_info->info_flags &= - ~(MDDB_DID_VALID | MDDB_DID_UPDATED); - } - - cont_flag = 0; - for (li = 0; li < lbp->lb_loccnt; li++) { - did_info = &did_blkp->blk_info[li]; - did_block = (caddr_t)NULL; - if (did_info->info_flags & MDDB_DID_EXISTS) { - /* Check if block has already been read in */ - did_dbp = did_icp->did_ic_dbp; - while (did_dbp != 0) { - if (did_dbp->db_firstblk == - did_info->info_firstblk) - break; - else - did_dbp = did_dbp->db_next; - } - /* if block not found, read it in */ - if (did_dbp == NULL) { - did_block = (caddr_t)(kmem_zalloc(dbtob - (did_info->info_blkcnt), KM_SLEEP)); - buffer = (caddr_t)did_block; - for (blk = did_info->info_firstblk; - blk < (did_info->info_firstblk + - did_info->info_blkcnt); blk++) { - physblk = getphysblk(blk, rip->ri_mbip); - err = getblks(s, buffer, dev, physblk, - btodb(MDDB_BSIZE), 0); - if (err) { - rip->ri_flags |= err; + for (li = 0; li < lbp->lb_loccnt; li++) { + did_info = &did_blkp->blk_info[li]; + if (did_info->info_flags & MDDB_DID_EXISTS) + did_info->info_flags &= + ~(MDDB_DID_VALID | + MDDB_DID_UPDATED); + } + + cont_flag = 0; + for (li = 0; li < lbp->lb_loccnt; li++) { + did_info = &did_blkp->blk_info[li]; + did_block = (caddr_t)NULL; + if (did_info->info_flags & MDDB_DID_EXISTS) { + /* + * Check if block has + * already been read in + */ + did_dbp = did_icp->did_ic_dbp; + while (did_dbp != 0) { + if (did_dbp->db_firstblk == + did_info->info_firstblk) + break; + else + did_dbp = + did_dbp->db_next; + } + /* if block not found, read it in */ + if (did_dbp == NULL) { + did_block = (caddr_t) + (kmem_zalloc(dbtob( + did_info->info_blkcnt), + KM_SLEEP)); + buffer = (caddr_t)did_block; + for (blk = + did_info->info_firstblk; + blk < (did_info-> + info_firstblk + + did_info->info_blkcnt); + blk++) { + physblk = + getphysblk(blk, + rip->ri_mbip); + err = getblks(s, + buffer, 
dev, + physblk, btodb( + MDDB_BSIZE), 0); + if (err) { + rip->ri_flags |= + err; + break; + } + buffer += MDDB_BSIZE; + } + if (err) { + kmem_free(did_block, + dbtob(did_info-> + info_blkcnt)); + did_block = + (caddr_t)NULL; + cont_flag = 1; + break; + } + + /* + * Block read in - + * alloc Disk Block area + */ + did_dbp = (mddb_did_db_t *) + kmem_zalloc( + sizeof (mddb_did_db_t), + KM_SLEEP); + did_dbp->db_ptr = did_block; + did_dbp->db_firstblk = + did_info->info_firstblk; + did_dbp->db_blkcnt = + did_info->info_blkcnt; + + /* Add to front of dbp list */ + did_dbp->db_next = + did_icp->did_ic_dbp; + did_icp->did_ic_dbp = did_dbp; + } + /* Check validity of devid in block */ + if (crcchk(((char *)did_dbp->db_ptr + + did_info->info_offset), + &did_info->info_checksum, + did_info->info_length, NULL)) { + cont_flag = 1; break; } - buffer += MDDB_BSIZE; - } - if (err) { - kmem_free(did_block, - dbtob(did_info->info_blkcnt)); - did_block = (caddr_t)NULL; - cont_flag = 1; - break; + + /* Block now pointed to by did_dbp */ + did_icp->did_ic_devid[li] = + (ddi_devid_t)((char *) + did_dbp->db_ptr + + did_info->info_offset); } - - /* - * Block read in - alloc Disk Block area - */ - did_dbp = (mddb_did_db_t *)kmem_zalloc( - sizeof (mddb_did_db_t), KM_SLEEP); - did_dbp->db_ptr = did_block; - did_dbp->db_firstblk = did_info->info_firstblk; - did_dbp->db_blkcnt = did_info->info_blkcnt; - - /* Add to front of dbp list */ - did_dbp->db_next = did_icp->did_ic_dbp; - did_icp->did_ic_dbp = did_dbp; - } - /* Check validity of devid in block */ - if (crcchk(((char *)did_dbp->db_ptr + - did_info->info_offset), - &did_info->info_checksum, - did_info->info_length, NULL)) { - cont_flag = 1; - break; - } - - /* Block now pointed to by did_dbp */ - did_icp->did_ic_devid[li] = (ddi_devid_t) - ((char *)did_dbp->db_ptr + - did_info->info_offset); - } - } - if (cont_flag) - continue; + } + if (cont_flag) + continue; } /* @@ -5194,11 +5224,11 @@ (rip->ri_old_devid != (ddi_devid_t)NULL)) { if 
(ddi_devid_compare(rip->ri_old_devid, did_icp->did_ic_devid[li]) != 0) - continue; + continue; } else { if (ddi_devid_compare(rip->ri_devid, did_icp->did_ic_devid[li]) != 0) - continue; + continue; } if (strcmp(rip->ri_minor_name, @@ -5214,64 +5244,74 @@ * information about itself. */ if (!mn_set) { - for (li = 0; li < lbp->lb_loccnt; li++) { - mddb_drvnm_t *dn; - mddb_sidelocator_t *slp; - - lp = &lbp->lb_locators[li]; - slp = &lbp->lb_sidelocators[s->s_sideno][li]; - if (lp->l_flags & MDDB_F_DELETED) - continue; - if (slp->l_mnum != md_getminor(rip->ri_dev)) - continue; - if (lp->l_blkno != rip->ri_blkno) - continue; - dn = &lbp->lb_drvnm[slp->l_drvnm_index]; - if (strncmp(dn->dn_data, rip->ri_driver, - MD_MAXDRVNM) == 0) - break; - } + for (li = 0; li < lbp->lb_loccnt; li++) { + mddb_drvnm_t *dn; + mddb_sidelocator_t *slp; + + lp = &lbp->lb_locators[li]; + slp = &lbp-> + lb_sidelocators[s->s_sideno][li]; + if (lp->l_flags & MDDB_F_DELETED) + continue; + if (slp->l_mnum != md_getminor( + rip->ri_dev)) + continue; + if (lp->l_blkno != rip->ri_blkno) + continue; + dn = &lbp->lb_drvnm[slp->l_drvnm_index]; + if (strncmp(dn->dn_data, + rip->ri_driver, MD_MAXDRVNM) == 0) + break; + } } else { - for (li = 0; li < lbp->lb_loccnt; li++) { - mddb_drvnm_t *dn; - mddb_mnsidelocator_t *mnslp; - mddb_mnlb_t *mnlbp; - int i; - - /* - * Check all possible locators locking for - * match to the currently read-in locator, - * must match on: - * - blkno - * - side locator for this node's side - * - side locator minor number - * - side locator driver name - */ - - /* Looking at sidelocs - cast lbp -> mnlbp */ - mnlbp = (mddb_mnlb_t *)lbp; - lp = &mnlbp->lb_locators[li]; - if (lp->l_flags & MDDB_F_DELETED) - continue; - if (lp->l_blkno != rip->ri_blkno) - continue; - - for (i = 0; i < MD_MNMAXSIDES; i++) { - mnslp = &mnlbp->lb_mnsidelocators[i][li]; - if (mnslp->mnl_sideno == s->s_sideno) { - break; - } + for (li = 0; li < lbp->lb_loccnt; li++) { + mddb_drvnm_t *dn; + 
mddb_mnsidelocator_t *mnslp; + mddb_mnlb_t *mnlbp; + int i; + + /* + * Check all possible locators locking + * for match to the currently read-in + * locator, must match on: + * - blkno + * - side locator for this + * node's side + * - side locator minor number + * - side locator driver name + */ + + /* + * Looking at sidelocs: + * cast lbp -> mnlbp + */ + mnlbp = (mddb_mnlb_t *)lbp; + lp = &mnlbp->lb_locators[li]; + if (lp->l_flags & MDDB_F_DELETED) + continue; + if (lp->l_blkno != rip->ri_blkno) + continue; + + for (i = 0; i < MD_MNMAXSIDES; i++) { + mnslp = &mnlbp-> + lb_mnsidelocators[i][li]; + if (mnslp->mnl_sideno == + s->s_sideno) { + break; + } + } + /* No matching side found */ + if (i == MD_MNMAXSIDES) + continue; + if (mnslp->mnl_mnum != + md_getminor(rip->ri_dev)) + continue; + dn = &lbp-> + lb_drvnm[mnslp->mnl_drvnm_index]; + if (strncmp(dn->dn_data, + rip->ri_driver, MD_MAXDRVNM) == 0) + break; } - /* No matching side found */ - if (i == MD_MNMAXSIDES) - continue; - if (mnslp->mnl_mnum != md_getminor(rip->ri_dev)) - continue; - dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; - if (strncmp(dn->dn_data, rip->ri_driver, - MD_MAXDRVNM) == 0) - break; - } } } @@ -5549,7 +5589,7 @@ did_dbp1 = did_icp->did_ic_dbp; while (did_dbp1) { if (mddb_devid_free_add(s, did_dbp1->db_firstblk, - 0, dbtob(did_dbp1->db_blkcnt))) { + 0, dbtob(did_dbp1->db_blkcnt))) { retval = MDDB_E_NOSPACE; goto errout; } @@ -5904,9 +5944,9 @@ /* Validate device id on current system */ newdev[li] = dev; if (mddb_devid_validate( - did_icp->did_ic_devid[li], - &(newdev[li]), - did_info->info_minor_name) == 0) { + did_icp->did_ic_devid[li], + &(newdev[li]), + did_info->info_minor_name) == 0) { /* Set valid flag */ did_info->info_flags |= MDDB_DID_VALID; } else { @@ -5931,20 +5971,21 @@ if (mddb_devid_add(s, li, ret_devid, minor_name)) { cmn_err(CE_WARN, - "Not enough space in" - " metadevice state" - " database\n"); + "Not enough space" + " in metadevice" + " state" + " database\n"); 
cmn_err(CE_WARN, - "to add relocation" - " information for" - " device:\n"); + "to add relocation" + " information for" + " device:\n"); cmn_err(CE_WARN, - " major = %d, " - " minor = %d\n", - getmajor(ddi_dev), - getminor(ddi_dev)); + " major = %d, " + " minor = %d\n", + getmajor(ddi_dev), + getminor(ddi_dev)); } else { - write_lb = 1; + write_lb = 1; } kmem_free(minor_name, strlen(minor_name) + 1); @@ -6509,7 +6550,7 @@ if (! s->s_mbiarray[i]) continue; dev = md_expldev( - s->s_lbp->lb_locators[i].l_dev); + s->s_lbp->lb_locators[i].l_dev); dev = md_xlate_targ_2_mini(dev); if (dev != NODEV64) mddb_devclose(dev); @@ -6518,7 +6559,7 @@ } kmem_free((caddr_t)s->s_mbiarray, - sizeof (mddb_mb_ic_t *) * mddb_maxcopies); + sizeof (mddb_mb_ic_t *) * mddb_maxcopies); s->s_mbiarray = NULL; } @@ -6560,7 +6601,7 @@ */ lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? - MDDB_LOCAL_LBCNT : MDDB_LBCNT); + MDDB_LOCAL_LBCNT : MDDB_LBCNT); if (flag & MDDB_MULTINODE) { lb_blkcnt = MDDB_MNLBCNT; } @@ -6623,7 +6664,7 @@ /* the btodb that follows is converting the directory block size */ /* Data tag part of mddb located after first block of mddb data */ lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk + - btodb(MDDB_BSIZE)); + btodb(MDDB_BSIZE)); /* Data tags are not used in MN diskset - so set count to 0 */ if (flag & MDDB_MULTINODE) lbp->lb_dtblkcnt = (mddb_block_t)0; @@ -6675,14 +6716,14 @@ devid_flag = 0; if (devid_flag) { lbp->lb_didfirstblk = lbp->lb_dtfirstblk + - lbp->lb_dtblkcnt; + lbp->lb_dtblkcnt; lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; lbp->lb_flags |= MDDB_DEVID_STYLE; did_icp = (mddb_did_ic_t *)kmem_zalloc - (sizeof (mddb_did_ic_t), KM_SLEEP); + (sizeof (mddb_did_ic_t), KM_SLEEP); did_blkp = (mddb_did_blk_t *) - kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP); + kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP); did_blkp->blk_magic = MDDB_MAGIC_DI; did_blkp->blk_revision = MDDB_REV_DI; did_icp->did_ic_blkp = did_blkp; @@ -6846,8 +6887,7 @@ * re-grab mutex 
* set s_mn_parseflags_sending to zero */ - mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), - KM_SLEEP); + mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP); while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) && (s->s_mn_parseflags & MDDB_PARSE_MASK) && (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) { @@ -6867,18 +6907,18 @@ mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending; for (i = 0; i < MDDB_NLB; i++) { mddb_parse_msg->msg_lb_flags[i] = - lbp->lb_locators[i].l_flags; + lbp->lb_locators[i].l_flags; } kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP); while (rval != 0) { rval = mdmn_ksend_message(s->s_setno, - MD_MN_MSG_MDDB_PARSE, 0, - (char *)mddb_parse_msg, - sizeof (mddb_parse_msg), kresult); + MD_MN_MSG_MDDB_PARSE, 0, 0, + (char *)mddb_parse_msg, + sizeof (md_mn_msg_mddb_parse_t), kresult); if (rval != 0) cmn_err(CE_WARN, "mddb_setexit: Unable to send " - "mddb update message to other nodes in " - "diskset %s\n", s->s_setname); + "mddb update message to other nodes in " + "diskset %s\n", s->s_setname); } kmem_free(kresult, sizeof (md_mn_kresult_t)); @@ -6987,12 +7027,12 @@ if (mddb_devid_add(s, li, ret_devid, minor_name)) { cmn_err(CE_WARN, - "Not enough space in metadb" - " to add device id for" - " dev: major = %d, " - "minor = %d\n", - getmajor(ddi_dev), - getminor(ddi_dev)); + "Not enough space in metadb" + " to add device id for" + " dev: major = %d, " + "minor = %d\n", + getmajor(ddi_dev), + getminor(ddi_dev)); } sz = strlen(minor_name) + 1; kmem_free(minor_name, sz); @@ -7179,13 +7219,10 @@ } md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT | - MD_SET_TAGDATA | MD_SET_USETAG | - MD_SET_TOOFEW | MD_SET_STALE | - MD_SET_OWNERSHIP | MD_SET_BADTAG | - MD_SET_CLRTAG | MD_SET_MNSET | - MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | - MD_SET_MN_MIR_STATE_RC | MD_SET_IMPORT | - MD_SET_REPLICATED_IMPORT); + MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE | + MD_SET_OWNERSHIP | 
MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET | + MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC | + MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT); mutex_exit(SETMUTEX(setno)); } @@ -7286,13 +7323,13 @@ SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len; bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data, - SPN_SUFFIX(spn).suf_len); + SPN_SUFFIX(spn).suf_len); iprefix = mnsn->mn_ln_suffix.suf_prefix; } else { sn = &lnp->ln_suffixes[sideno][li]; SPN_SUFFIX(spn).suf_len = sn->suf_len; bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data, - SPN_SUFFIX(spn).suf_len); + SPN_SUFFIX(spn).suf_len); iprefix = sn->suf_prefix; } SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len; @@ -7328,7 +7365,7 @@ * Data checking */ if (setno >= md_nsets || cp->c_id < 0 || - cp->c_id > cp->c_dbmax) { + cp->c_id > cp->c_dbmax) { return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); } @@ -7377,14 +7414,14 @@ if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) { mddb_setexit(s); return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, - setno)); + setno)); } li = cp->c_id; } else { if (cp->c_id >= cp->c_dbcnt) { mddb_setexit(s); return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, - setno)); + setno)); } /* CSTYLED */ @@ -7446,7 +7483,7 @@ * commitcnt to 0. 
*/ (void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, - MDDB_WR_ONLY_MASTER); + MDDB_WR_ONLY_MASTER); lbp->lb_commitcnt = commitcnt; } @@ -7689,7 +7726,7 @@ lnp->ln_revision = MDDB_REV_LN; crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, - lbp->lb_lnblkcnt, 0); + lbp->lb_lnblkcnt, 0); /* * If a MN diskset and this is the master, set the PARSE_LOCNM * flag in the mddb_set structure to show that the locator @@ -7851,7 +7888,7 @@ */ if (devidptr != (ddi_devid_t)NULL) { mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE, - KM_SLEEP); + KM_SLEEP); mb->mb_magic = MDDB_MAGIC_DU; mb->mb_revision = MDDB_REV_MB; mb2free = 1; @@ -8077,7 +8114,7 @@ single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, - NODEV32, setno)); + NODEV32, setno)); } } @@ -8095,7 +8132,7 @@ single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, - setno)); + setno)); } if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) { @@ -8105,7 +8142,7 @@ single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, - setno)); + setno)); } } @@ -8119,9 +8156,9 @@ int j; mnlbp = (mddb_mnlb_t *)lbp; for (j = 0; j < MD_MNMAXSIDES; j++) { - mnslp = &mnlbp->lb_mnsidelocators[j][i]; - if (mnslp->mnl_sideno == cp->c_sideno) - break; + mnslp = &mnlbp->lb_mnsidelocators[j][i]; + if (mnslp->mnl_sideno == cp->c_sideno) + break; } if (j < MD_MNMAXSIDES) { mnslp->mnl_mnum = NODEV32; @@ -8129,7 +8166,7 @@ mnlnp = (mddb_mnln_t *)lnp; mnsn = &(mnlnp->ln_mnsuffixes[j][i]); bzero((caddr_t)mnsn, - sizeof (md_mnname_suffix_t)); + sizeof (md_mnname_suffix_t)); } } else { slp = &lbp->lb_sidelocators[cp->c_sideno][i]; @@ -8148,7 +8185,7 @@ lnp->ln_revision = MDDB_REV_LN; crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, - lbp->lb_lnblkcnt, 0); + lbp->lb_lnblkcnt, 0); /* * If a MN diskset and this is the master, set the 
PARSE_LOCNM * flag in the mddb_set structure to show that the locator @@ -8288,11 +8325,11 @@ ((daddr_t)lp->l_blkno == clp->l_blkno)) { if (command == MDDB_NEWDEV) { ddi_devid_free((ddi_devid_t)(uintptr_t) - clp->l_devid); + clp->l_devid); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, - MDE_DB_EXISTS, NODEV32, setno)); + MDE_DB_EXISTS, NODEV32, setno)); } } } else { @@ -8302,7 +8339,7 @@ single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, - MDE_DB_EXISTS, NODEV32, setno)); + MDE_DB_EXISTS, NODEV32, setno)); } } } @@ -8345,7 +8382,7 @@ single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, - setno)); + setno)); } } @@ -8402,7 +8439,7 @@ single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, - NODEV32, setno)); + NODEV32, setno)); } } /* @@ -8462,7 +8499,7 @@ lnp->ln_revision = MDDB_REV_LN; crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, - lbp->lb_lnblkcnt, 0); + lbp->lb_lnblkcnt, 0); /* * If a MN diskset and this is the master, set the PARSE_LOCNM * flag in the mddb_set structure to show that the locator @@ -8579,67 +8616,74 @@ mdclrerror(ep); switch (command) { - case MDDB_NEWDEV: - err = newdev(cp, command, ep); - break; - - case MDDB_NEWSIDE: - case MDDB_DELSIDE: - err = delnewside(cp, command, ep); - break; - - case MDDB_GETDEV: - case MDDB_DELDEV: - case MDDB_ENDDEV: - err = getdeldev(cp, command, ep); - break; - - case MDDB_GETDRVRNAME: - err = getdriver(&cp->c_locator); - break; - - case MDDB_USEDEV: - /* - * Note: must allow USEDEV ioctl during upgrade to support - * auto-take disksets. 
- * - * Also during the set import if the md_devid_destroy - * flag is set then error out - */ - - if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy) - return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); - - if (setno >= md_nsets) - return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); - - if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) { - if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) { - err = mddbstatus2error(ep, err, NODEV32, setno); - break; - } - } - if (setno == MD_LOCAL_SET) - flag = MDDB_F_IOCTL; - if (cp->c_locator.l_old_devid) { - md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT); - } - err = ridev(&s->s_rip, &cp->c_locator, NULL, flag); - mddb_setexit(s); - break; - - case MDDB_RELEASESET: - mutex_enter(&mddb_lock); - mddb_unload_set(cp->c_setno); - mutex_exit(&mddb_lock); - break; - - case MDDB_SETDID: - err = setdid(cp); - break; - - default: - err = mdmddberror(ep, MDE_DB_INVALID, NODEV32, cp->c_setno); + case MDDB_NEWDEV: + err = newdev(cp, command, ep); + break; + + case MDDB_NEWSIDE: + case MDDB_DELSIDE: + err = delnewside(cp, command, ep); + break; + + case MDDB_GETDEV: + case MDDB_DELDEV: + case MDDB_ENDDEV: + err = getdeldev(cp, command, ep); + break; + + case MDDB_GETDRVRNAME: + err = getdriver(&cp->c_locator); + break; + + case MDDB_USEDEV: + /* + * Note: must allow USEDEV ioctl during upgrade to + * support auto-take disksets. 
+ * + * Also during the set import if the md_devid_destroy + * flag is set then error out + */ + + if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy) + return (mdmderror(ep, MDE_INVAL_UNIT, + MD_ADM_MINOR)); + + if (setno >= md_nsets) + return (mdmderror(ep, MDE_INVAL_UNIT, + MD_ADM_MINOR)); + + if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == + NULL) { + if ((s = init_set(cp, MDDB_NOINIT, &err)) == + NULL) { + err = mddbstatus2error(ep, err, + NODEV32, setno); + break; + } + } + if (setno == MD_LOCAL_SET) + flag = MDDB_F_IOCTL; + if (cp->c_locator.l_old_devid) { + md_set_setstatus(setno, + MD_SET_REPLICATED_IMPORT); + } + err = ridev(&s->s_rip, &cp->c_locator, NULL, flag); + mddb_setexit(s); + break; + + case MDDB_RELEASESET: + mutex_enter(&mddb_lock); + mddb_unload_set(cp->c_setno); + mutex_exit(&mddb_lock); + break; + + case MDDB_SETDID: + err = setdid(cp); + break; + + default: + err = mdmddberror(ep, MDE_DB_INVALID, NODEV32, + cp->c_setno); } return (err); @@ -8761,15 +8805,14 @@ } recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) + - usersize, MDDB_BSIZE); + usersize, MDDB_BSIZE); blkcnt = btodb(recsize); if (mddb_maxblocks) maxblocks = mddb_maxblocks; else - maxblocks = (MDDB_BSIZE - - (sizeof (*db32p) + sizeof (*de32p) - - sizeof (de32p->de32_blks))) / sizeof (mddb_block_t); + maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) - + sizeof (de32p->de32_blks))) / sizeof (mddb_block_t); if (blkcnt > maxblocks) { mddb_setexit(s); @@ -8833,7 +8876,7 @@ } while (dbp); desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) + - (sizeof (mddb_block_t) * blkcnt); + (sizeof (mddb_block_t) * blkcnt); /* * see if a directory block exists which will hold this entry @@ -8872,7 +8915,8 @@ mddb_setexit(s); return (MDDB_E_NOSPACE); } - for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next); + for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next) + ; dbp->db_next = newdbp; bzero((caddr_t)dbp->db_next, sizeof (*newdbp)); dbp->db_nextblk = 
getfreeblks(s, 1); @@ -8888,10 +8932,10 @@ * ready to add record */ desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) + - (sizeof (mddb_block_t) * blkcnt); + (sizeof (mddb_block_t) * blkcnt); if (dbp->db_firstentry) { - for (dep = dbp->db_firstentry; dep->de_next; - dep = dep->de_next); + for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next) + ; dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP); dep = dep->de_next; } else { @@ -8919,8 +8963,8 @@ dep->de_blkcount = blkcnt; flag_type = options & (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID | - MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG | - MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG); + MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG | + MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG); switch (flag_type) { case MD_CRO_OPTIMIZE: dep->de_flags = MDDB_F_OPT; @@ -9003,7 +9047,7 @@ if ((options & MD_CRO_OPTIMIZE) == 0) { for (i = 0; i < blkcnt; i++) { err |= writeall(s, (caddr_t)tmppnt, - dep->de_blks[i], 1, 0); + dep->de_blks[i], 1, 0); tmppnt += MDDB_BSIZE; } } else { @@ -9310,10 +9354,10 @@ mddb_rb32_t *nrbp; recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) + - icsize, MDDB_BSIZE); + icsize, MDDB_BSIZE); if (dep->de_recsize < recsize) cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only " - "nonoptimized records can be resized\n"); + "nonoptimized records can be resized\n"); } mddb_setexit(s); @@ -9673,26 +9717,29 @@ lbp = s->s_lbp; mnlbp = (mddb_mnlb_t *)lbp; for (i = 0; i < 2; i++) { - li = dep->de_optinfo[i].o_li; - lp = &lbp->lb_locators[li]; - for (j = 0; j < MD_MNMAXSIDES; j++) { - mnslp = - &mnlbp->lb_mnsidelocators[j][li]; - if (mnslp->mnl_sideno == s->s_sideno) - break; - } - if (j == MD_MNMAXSIDES) - continue; - - dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; - recerr = &msg_recerr->msg_recerr[i]; - recerr->r_li = li; - recerr->r_flags = - dep->de_optinfo[i].o_flags; - recerr->r_blkno = lp->l_blkno; - 
recerr->r_mnum = md_getminor(lp->l_dev); - (void) strncpy(recerr->r_driver_name, - dn->dn_data, MD_MAXDRVNM); + li = dep->de_optinfo[i].o_li; + lp = &lbp->lb_locators[li]; + for (j = 0; j < MD_MNMAXSIDES; j++) { + mnslp = + &mnlbp-> + lb_mnsidelocators[j][li]; + if (mnslp->mnl_sideno == + s->s_sideno) + break; + } + if (j == MD_MNMAXSIDES) + continue; + + dn = &lbp-> + lb_drvnm[mnslp->mnl_drvnm_index]; + recerr = &msg_recerr->msg_recerr[i]; + recerr->r_li = li; + recerr->r_flags = + dep->de_optinfo[i].o_flags; + recerr->r_blkno = lp->l_blkno; + recerr->r_mnum = md_getminor(lp->l_dev); + (void) strncpy(recerr->r_driver_name, + dn->dn_data, MD_MAXDRVNM); } /* Release locks */ @@ -9711,17 +9758,17 @@ * the optimized resync records it owns. */ rval = mdmn_ksend_message(s->s_setno, - MD_MN_MSG_MDDB_OPTRECERR, - MD_MSGF_NO_BCAST, - (char *)msg_recerr, - sizeof (md_mn_msg_mddb_optrecerr_t), - kres); + MD_MN_MSG_MDDB_OPTRECERR, + MD_MSGF_NO_BCAST, 0, + (char *)msg_recerr, + sizeof (md_mn_msg_mddb_optrecerr_t), + kres); if (!MDMN_KSEND_MSG_OK(rval, kres)) { cmn_err(CE_WARN, "mddb_commitrec: " - "Unable to send optimized " - "resync record failure " - "message to other nodes in " - "diskset %s\n", s->s_setname); + "Unable to send optimized " + "resync record failure " + "message to other nodes in " + "diskset %s\n", s->s_setname); mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_MDDB_OPTRECERR"); } @@ -9758,7 +9805,7 @@ } kmem_free(kres, sizeof (md_mn_kresult_t)); kmem_free(msg_recerr, - sizeof (md_mn_msg_mddb_optrecerr_t)); + sizeof (md_mn_msg_mddb_optrecerr_t)); /* Resync record should be fixed - if possible */ s->s_optwaiterr--; @@ -10723,8 +10770,7 @@ if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) && (ddi_devid_compare(rtn_devid, devid) == 0)) { did_info->info_flags = MDDB_DID_VALID | - MDDB_DID_EXISTS | - MDDB_DID_UPDATED; + MDDB_DID_EXISTS | MDDB_DID_UPDATED; } else { cnt++; /* @@ -11051,7 +11097,7 @@ /* Assumes master blocks are already setup */ if (lbp == 
(mddb_lb_t *)NULL) { lbp = (mddb_lb_t *)kmem_zalloc( - dbtob(MDDB_MNLBCNT), KM_SLEEP); + dbtob(MDDB_MNLBCNT), KM_SLEEP); } err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i); @@ -11135,7 +11181,7 @@ /* Free this node's old view of mddb locator blocks */ kmem_free((caddr_t)s->s_lbp, - dbtob(s->s_lbp->lb_blkcnt)); + dbtob(s->s_lbp->lb_blkcnt)); s->s_lbp = lbp; } else { if (lbp) @@ -11206,7 +11252,7 @@ * master could have rewritten in during fixoptrecord. */ db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, - KM_SLEEP); + KM_SLEEP); create_db32rec(db32p, dbp); for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; @@ -11216,16 +11262,16 @@ continue; err = readblks(s, (caddr_t)db32p, - db32p->db32_blknum, 1, li); + db32p->db32_blknum, 1, li); if (err) continue; /* Reverify db; go to next mddb if bad */ if ((db32p->db32_magic != MDDB_MAGIC_DB) || (revchk(MDDB_REV_DB, - db32p->db32_revision)) || + db32p->db32_revision)) || (crcchk(db32p, &db32p->db32_checksum, - MDDB_BSIZE, NULL))) { + MDDB_BSIZE, NULL))) { continue; } else { break; @@ -11254,9 +11300,8 @@ if (li == lbp->lb_loccnt) { kmem_free((caddr_t)db32p, MDDB_BSIZE); cmn_err(CE_PANIC, "md: mddb: Node unable to " - "access any SVM state database " - "replicas for diskset %s\n", - s->s_setname); + "access any SVM state database " + "replicas for diskset %s\n", s->s_setname); } /* * Setup temp copy of linked list of de's. @@ -11505,45 +11550,53 @@ lp->l_flags &= ~MDDB_F_ACTIVE; } } else { - /* - * Passed in li from slave does not match - * the replica in the master's structures. - * This could have occurred if a delete - * mddb command was running when the - * optimized resync record had a failure. - * Search all replicas for this entry. - * If no match, just ignore. - * If a match, set replica in error. 
- */ - for (li = 0; li < lbp->lb_loccnt; li++) { - lp = &lbp->lb_locators[li]; - if (lp->l_flags & MDDB_F_DELETED) - continue; - - for (j = 0; j < MD_MNMAXSIDES; j++) { - mnslp = - &mnlbp->lb_mnsidelocators[j][li]; - if (mnslp->mnl_sideno == s->s_sideno) + /* + * Passed in li from slave does not match + * the replica in the master's structures. + * This could have occurred if a delete + * mddb command was running when the + * optimized resync record had a failure. + * Search all replicas for this entry. + * If no match, just ignore. + * If a match, set replica in error. + */ + for (li = 0; li < lbp->lb_loccnt; li++) { + lp = &lbp->lb_locators[li]; + if (lp->l_flags & MDDB_F_DELETED) + continue; + + for (j = 0; j < MD_MNMAXSIDES; j++) { + mnslp = + &mnlbp-> + lb_mnsidelocators[j][li]; + if (mnslp->mnl_sideno == + s->s_sideno) + break; + } + if (j == MD_MNMAXSIDES) + continue; + + dn = &lbp-> + lb_drvnm[mnslp->mnl_drvnm_index]; + if ((strncmp(dn->dn_data, + recerr->r_driver_name, + MD_MAXDRVNM) == 0) && + (recerr->r_blkno == lp->l_blkno) && + (recerr->r_mnum == + mnslp->mnl_mnum)) { + if ((lp->l_flags & + MDDB_F_ACTIVE) || + ((lp->l_flags & + MDDB_F_EWRITE) == 0)) { + something_changed = 1; + lp->l_flags |= + MDDB_F_EWRITE; + lp->l_flags &= + ~MDDB_F_ACTIVE; + } break; + } } - if (j == MD_MNMAXSIDES) - continue; - - dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; - if ((strncmp(dn->dn_data, recerr->r_driver_name, - MD_MAXDRVNM) == 0) && - (recerr->r_blkno == lp->l_blkno) && - (recerr->r_mnum == mnslp->mnl_mnum)) { - if ((lp->l_flags & MDDB_F_ACTIVE) || - ((lp->l_flags & MDDB_F_EWRITE) - == 0)) { - something_changed = 1; - lp->l_flags |= MDDB_F_EWRITE; - lp->l_flags &= ~MDDB_F_ACTIVE; - } - break; - } - } } } } @@ -11693,8 +11746,7 @@ /* Re-verify that set is not stale */ if (md_get_setstatus(setno) & MD_SET_STALE) { mddb_setexit(s); - return (mdmddberror(ep, MDE_DB_STALE, - NODEV32, setno)); + return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno)); } lbp = s->s_lbp; @@ 
-11735,34 +11787,39 @@ * They may have been altered by the previous master */ for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { - for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { - if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) { - continue; - } - /* This has been alloc'ed while joining the set */ - if (dep->de_rb) { - kmem_free(dep->de_rb, dep->de_recsize); - dep->de_rb = (mddb_rb32_t *)NULL; - } - if (dep->de_rb_userdata) { - kmem_free(dep->de_rb_userdata, dep->de_reqsize); - dep->de_rb_userdata = (caddr_t)NULL; - } - - err = getrecord(s, dep, li); - if (err) { + for (dep = dbp->db_firstentry; dep; dep = + dep->de_next) { + if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) { + continue; + } /* - * When we see on error while reading the - * changelog entries, we move on to the next - * mddb + * This has been alloc'ed while + * joining the set */ - err = 1; - break; /* out of inner for-loop */ - } - allocuserdata(dep); - } - if (err) - break; /* out of outer for-loop */ + if (dep->de_rb) { + kmem_free(dep->de_rb, dep->de_recsize); + dep->de_rb = (mddb_rb32_t *)NULL; + } + if (dep->de_rb_userdata) { + kmem_free(dep->de_rb_userdata, + dep->de_reqsize); + dep->de_rb_userdata = (caddr_t)NULL; + } + + err = getrecord(s, dep, li); + if (err) { + /* + * When we see on error while reading + * the changelog entries, we move on + * to the next mddb + */ + err = 1; + break; /* out of inner for-loop */ + } + allocuserdata(dep); + } + if (err) + break; /* out of outer for-loop */ } /* If err, try next mddb */ @@ -11773,7 +11830,7 @@ /* Is incore locator block same as ondisk? 
*/ if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT)) - == 1) { + == 1) { write_out_mddb = 1; kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); break; @@ -11786,7 +11843,7 @@ KM_SLEEP); /* read in on-disk locator names */ err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk, - lbp->lb_lnblkcnt, li); + lbp->lb_lnblkcnt, li); /* If err, try next mddb */ if (err) { @@ -11796,7 +11853,7 @@ /* Are incore locator names same as ondisk? */ if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT)) - == 1) { + == 1) { kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); write_out_mddb = 1; break; @@ -11885,7 +11942,7 @@ /* Is incore locator block same as ondisk? */ if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT)) - == 1) { + == 1) { kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); write_out_mddb = 1; break; @@ -11909,7 +11966,7 @@ /* Are incore locator names same as ondisk? */ if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT)) - == 1) { + == 1) { kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); write_out_mddb = 1; break; @@ -12322,8 +12379,7 @@ /* disk is powered off or not there */ continue; - if (md_get_setstatus(s->s_setno) & - MD_SET_REPLICATED_IMPORT) { + if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) { /* * It is a replicated set */
--- a/usr/src/uts/common/io/lvm/md/md_subr.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/io/lvm/md/md_subr.c Wed Dec 24 08:23:40 2008 -0700 @@ -86,6 +86,7 @@ extern md_ops_t **md_ops; extern md_ops_t *md_opslist; extern ddi_modhandle_t *md_mods; +extern dev_info_t *md_devinfo; extern md_krwlock_t md_unit_array_rw; extern kmutex_t md_mx; @@ -113,7 +114,7 @@ extern struct nm_next_hdr *get_first_record(set_t, int, int); struct mdq_anchor md_done_daemon; /* done request queue */ -struct mdq_anchor md_mstr_daemon; /* mirror timeout requests */ +struct mdq_anchor md_mstr_daemon; /* mirror error, WOW requests */ struct mdq_anchor md_mhs_daemon; /* mirror hotspare requests queue */ struct mdq_anchor md_hs_daemon; /* raid hotspare requests queue */ struct mdq_anchor md_ff_daemonq; /* failfast request queue */ @@ -121,6 +122,7 @@ struct mdq_anchor md_mirror_io_daemon; /* mirror owner i/o queue */ struct mdq_anchor md_mirror_rs_daemon; /* mirror resync done queue */ struct mdq_anchor md_sp_daemon; /* soft-part error daemon queue */ +struct mdq_anchor md_mto_daemon; /* mirror timeout daemon queue */ int md_done_daemon_threads = 1; /* threads for md_done_daemon requestq */ int md_mstr_daemon_threads = 1; /* threads for md_mstr_daemon requestq */ @@ -129,6 +131,7 @@ int md_ff_daemon_threads = 3; /* threads for md_ff_daemon requestq */ int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */ int md_sp_daemon_threads = 1; /* threads for md_sp_daemon requestq */ +int md_mto_daemon_threads = 1; /* threads for md_mto_daemon requestq */ #ifdef DEBUG /* Flag to switch on debug messages */ @@ -146,7 +149,7 @@ * */ -#define MD_DAEMON_QUEUES 10 +#define MD_DAEMON_QUEUES 11 md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = { {&md_done_daemon, &md_done_daemon_threads}, @@ -158,6 +161,7 @@ {&md_mirror_rs_daemon, &md_mirror_daemon_threads}, {&md_sp_daemon, &md_sp_daemon_threads}, {&md_mhs_daemon, &md_mhs_daemon_threads}, + {&md_mto_daemon, 
&md_mto_daemon_threads}, {0, 0} }; @@ -176,6 +180,12 @@ uint_t md_retry_cnt = 1; /* global so it can be patched */ /* + * How many times to try to do the door_ki_upcall() in mdmn_ksend_message. + * Again, made patchable here should it prove useful. + */ +uint_t md_send_retry_limit = 30; + +/* * Bug # 1212146 * Before this change the user had to pass in a short aligned buffer because of * problems in some underlying device drivers. This problem seems to have been @@ -712,9 +722,9 @@ if (status & MD_SET_STALE) flag |= MD_MSGF_NO_LOG; rval = mdmn_ksend_message(s->s_setno, - MD_MN_MSG_MDDB_PARSE, flag, + MD_MN_MSG_MDDB_PARSE, flag, 0, (char *)mddb_parse_msg, - sizeof (mddb_parse_msg), kresult); + sizeof (md_mn_msg_mddb_parse_t), kresult); /* if the node hasn't yet joined, it's Ok. */ if ((!MDMN_KSEND_MSG_OK(rval, kresult)) && (kresult->kmmr_comm_state != @@ -2817,6 +2827,15 @@ mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL); cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL); + if (alloc_lock) { + ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP); + mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&ui->ui_io_lock->io_list_mutex, NULL, + MUTEX_DEFAULT, NULL); + ui->ui_io_lock->io_list_front = NULL; + ui->ui_io_lock->io_list_back = NULL; + } if (! 
(md_get_setstatus(setno) & MD_SET_SNARFING)) { rw_enter(&md_unit_array_rw.lock, RW_WRITER); MDI_VOIDUNIT(mnum) = (void *) ui; @@ -2829,15 +2848,6 @@ ui->ui_link.ln_setno = setno; ui->ui_link.ln_id = mnum; ops->md_head = &ui->ui_link; - if (alloc_lock) { - ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP); - mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&ui->ui_io_lock->io_list_mutex, NULL, - MUTEX_DEFAULT, NULL); - ui->ui_io_lock->io_list_front = NULL; - ui->ui_io_lock->io_list_back = NULL; - } /* setup the unavailable field */ #if defined(_ILP32) if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) { @@ -3865,82 +3875,68 @@ /* * Send a kernel message. * user has to provide for an allocated result structure - * If the door handler disappears we retry forever emitting warnings every so - * often. - * TODO: make this a flaggable attribute so that the caller can decide if the - * message is to be a 'one-shot' message or not. + * If the door handler disappears we retry, emitting warnings every so often. + * + * The recipient argument is almost always unused, and is therefore typically + * set to zero, as zero is an invalid cluster nodeid. The exceptions are the + * marking and clearing of the DRL from a node that is not currently the + * owner. In these cases, the recipient argument will be the nodeid of the + * mirror owner, and MD_MSGF_DIRECTED will be set in the flags. Non-owner + * nodes will not receive these messages. + * + * For the case where md_mn_is_commd_present() is false, we rely on the + * "result" having been kmem_zalloc()ed which, in effect, sets MDMNE_NULL for + * kmmr_comm_state making MDMN_KSEND_MSG_OK() result in 0. 
*/ int mdmn_ksend_message( set_t setno, md_mn_msgtype_t type, uint_t flags, + md_mn_nodeid_t recipient, char *data, int size, md_mn_kresult_t *result) { door_arg_t da; md_mn_kmsg_t *kmsg; - uint_t retry_cnt = 0; + uint_t send_try_cnt = 0; + uint_t retry_noise_cnt = 0; int rval; + k_sigset_t oldmask, newmask; if (size > MDMN_MAX_KMSG_DATA) return (ENOMEM); kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP); kmsg->kmsg_flags = flags; kmsg->kmsg_setno = setno; + kmsg->kmsg_recipient = recipient; kmsg->kmsg_type = type; kmsg->kmsg_size = size; bcopy(data, &(kmsg->kmsg_data), size); -#ifdef DEBUG_COMM - printf("send msg: set=%d, flags=%d, type=%d, txid = 0x%llx," - " size=%d, data=%d, data2=%d\n", - kmsg->kmsg_setno, kmsg->kmsg_flags, kmsg->kmsg_type, - kmsg->kmsg_size, *(int *)data, *(int *)(char *)(&kmsg->kmsg_data)); - - -#endif /* DEBUG_COMM */ - - da.data_ptr = (char *)(kmsg); - da.data_size = sizeof (md_mn_kmsg_t); - da.desc_ptr = NULL; - da.desc_num = 0; - da.rbuf = (char *)result; - da.rsize = sizeof (*result); - /* * Wait for the door handle to be established. */ - while (mdmn_door_did == -1) { - if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) { + if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) { cmn_err(CE_WARN, "door handle not yet ready. " "Check if /usr/lib/lvm/mddoors is running"); } delay(md_hz); } - retry_cnt = 0; - - while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da, NULL, - SIZE_MAX, 0)) != 0) { - if (rval == EAGAIN) { - if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) { - cmn_err(CE_WARN, "door call failed. " - "Check if /usr/lib/lvm/mddoors is running"); - } - } else { - cmn_err(CE_WARN, - "md door call failed. Returned %d", rval); - } - delay(md_hz); - } - kmem_free(kmsg, sizeof (md_mn_kmsg_t)); /* - * Attempt to determine if the message failed (with an RPC_FAILURE) - * because we are in the middle of shutting the system down. 
- * + * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we + * do not fail if the user process receives a signal while we're + * active in the door interface. + */ + if (flags & MD_MSGF_BLK_SIGNAL) { + sigfillset(&newmask); + sigreplace(&newmask, &oldmask); + } + + /* * If message failed with an RPC_FAILURE when rpc.mdcommd had * been gracefully shutdown (md_mn_is_commd_present returns FALSE) * then don't retry the message anymore. If message @@ -3956,16 +3952,81 @@ * */ - retry_cnt = 0; - - if (result->kmmr_comm_state == MDMNE_RPC_FAIL) { - while (md_mn_is_commd_present() == 1) { - if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) + retry_noise_cnt = send_try_cnt = 0; + while (md_mn_is_commd_present_lite()) { + /* + * data_ptr and data_size are initialized here because on + * return from the upcall, they contain data duplicated from + * rbuf and rsize. This causes subsequent upcalls to fail. + */ + da.data_ptr = (char *)(kmsg); + da.data_size = sizeof (md_mn_kmsg_t); + da.desc_ptr = NULL; + da.desc_num = 0; + da.rbuf = (char *)result; + da.rsize = sizeof (*result); + + while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da, + NULL, SIZE_MAX, 0)) != 0) { + if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) { + if (rval == EAGAIN) { + cmn_err(CE_WARN, + "md: door_upcall failed. " + "Check if mddoors is running."); + } else if (rval == EINTR) { + cmn_err(CE_WARN, + "md: door_upcall failed. " + "Check if rpc.mdcommd is running."); + } else { + cmn_err(CE_WARN, + "md: door_upcall failed. " + "Returned %d", + rval); + } + } + if (++send_try_cnt >= md_send_retry_limit) break; + delay(md_hz); + + /* + * data_ptr and data_size are re-initialized here + * because on return from the upcall, they contain + * data duplicated from rbuf and rsize. This causes + * subsequent upcalls to fail. 
+ */ + da.data_ptr = (char *)(kmsg); + da.data_size = sizeof (md_mn_kmsg_t); + da.desc_ptr = NULL; + da.desc_num = 0; + da.rbuf = (char *)result; + da.rsize = sizeof (*result); } + + + /* + * If: + * - the send succeeded (MDMNE_ACK) + * - we had an MDMNE_RPC_FAIL and commd is now gone + * (note: since the outer loop is commd-dependent, + * checking MDMN_RPC_FAIL here is meaningless) + * - we were told not to retry + * - we exceeded the RPC failure send limit + * punch out of the outer loop prior to the delay() + */ + if (result->kmmr_comm_state == MDMNE_ACK || + (flags & MD_MSGF_KSEND_NORETRY) || + (++send_try_cnt % md_send_retry_limit) == 0 || + !md_mn_is_commd_present()) + break; + delay(md_hz); } + if (flags & MD_MSGF_BLK_SIGNAL) { + sigreplace(&oldmask, (k_sigset_t *)NULL); + } + kmem_free(kmsg, sizeof (md_mn_kmsg_t)); + return (0); } @@ -4008,7 +4069,7 @@ sigfillset(&newmask); sigreplace(&newmask, &oldmask); ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP, - MD_MSGF_NO_LOG, (char *)&msg, sizeof (md_mn_msg_setcap_t), + MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t), kres)); sigreplace(&oldmask, (k_sigset_t *)NULL); @@ -4056,7 +4117,7 @@ sigreplace(&newmask, &oldmask); ret = mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_CLU_CHECK, - MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, + MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0, (char *)&clumsg, sizeof (clumsg), kresult); sigreplace(&oldmask, (k_sigset_t *)NULL); @@ -4212,3 +4273,23 @@ return ((hot_spare_pool_t *)0); } + +/* + * md_create_taskq: + * + * Create a kernel taskq for the given set/unit combination. This is typically + * used to complete a RR_CLEAN request when the callee is unable to obtain the + * mutex / condvar access required to update the DRL safely. 
+ */ +void * +md_create_taskq(set_t setno, minor_t mnum) +{ + char name[20]; + ddi_taskq_t *tqp; + + (void) snprintf(name, 20, "%d/d%d", setno, MD_MIN2UNIT(mnum)); + + tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0); + + return ((void *)tqp); +}
--- a/usr/src/uts/common/io/lvm/mirror/mirror.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/io/lvm/mirror/mirror.c Wed Dec 24 08:23:40 2008 -0700 @@ -173,6 +173,7 @@ mirror_parent_init(md_mps_t *ps) { bzero(ps, offsetof(md_mps_t, ps_mx)); + bzero(&ps->ps_overlap_node, sizeof (avl_node_t)); } /*ARGSUSED1*/ @@ -223,11 +224,17 @@ kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES, - MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp, + MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp, sizeof (pokehsp), kresult); if (!MDMN_KSEND_MSG_OK(rval, kresult)) { mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES"); + /* If we're shutting down already, pause things here. */ + if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { + while (!md_mn_is_commd_present()) { + delay(md_hz); + } + } cmn_err(CE_PANIC, "ksend_message failure: POKE_HOTSPARES"); } @@ -468,7 +475,7 @@ } kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); - rval = mdmn_ksend_message(setno, msgtype, msgflags, + rval = mdmn_ksend_message(setno, msgtype, msgflags, 0, (char *)&allochspmsg, sizeof (allochspmsg), kresult); @@ -491,6 +498,12 @@ kmem_free(kresult, sizeof (md_mn_kresult_t)); return (1); } + /* If we're shutting down already, pause things here. */ + if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { + while (!md_mn_is_commd_present()) { + delay(md_hz); + } + } cmn_err(CE_PANIC, "ksend_message failure: ALLOCATE_HOTSPARE"); } @@ -1636,9 +1649,14 @@ /* * For directed mirror read (DMR) we only use the specified side and * do not compute the source of the read. + * If we're running with MD_MPS_DIRTY_RD set we always return the + * first mirror side (this prevents unnecessary ownership switching). 
+ * Otherwise we return the submirror according to the mirror read option */ if (ps->ps_flags & MD_MPS_DMR) { sm_index = un->un_dmr_last_read; + } else if (ps->ps_flags & MD_MPS_DIRTY_RD) { + sm_index = md_find_nth_unit(running_bm, 0); } else { /* Normal (non-DMR) operation */ switch (un->un_read_option) { @@ -1883,6 +1901,13 @@ mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL); cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL); + /* + * Allocate rwlocks for un_pernode_dirty_bm accessing. + */ + for (i = 0; i < MD_MNMAXSIDES; i++) { + rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL); + } + /* place various information in the in-core data structures */ md_nblocks_set(MD_SID(un), un->c.un_total_blocks); MD_UNIT(MD_SID(un)) = un; @@ -1903,6 +1928,7 @@ uint_t bits = 0; minor_t selfid; md_unit_t *su; + int i; md_destroy_unit_incore(mnum, &mirror_md_ops); @@ -1917,6 +1943,15 @@ kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt); if (un->un_resync_bm) kmem_free((caddr_t)un->un_resync_bm, bitcnt); + if (un->un_pernode_dirty_sum) + kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num); + + /* + * Destroy the taskq for deferred processing of DRL clean requests. + * This taskq will only be present for Multi Owner mirrors. + */ + if (un->un_drl_task != NULL) + ddi_taskq_destroy(un->un_drl_task); md_nblocks_set(mnum, -1ULL); MD_UNIT(mnum) = NULL; @@ -1965,6 +2000,12 @@ mutex_destroy(&un->un_dmr_mx); cv_destroy(&un->un_dmr_cv); + for (i = 0; i < MD_MNMAXSIDES; i++) { + rw_destroy(&un->un_pernode_dirty_mx[i]); + if (un->un_pernode_dirty_bm[i]) + kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt); + } + /* * Remove self from the namespace */ @@ -1972,7 +2013,9 @@ (void) md_rem_selfname(un->c.un_self_id); } + /* This frees the unit structure. 
*/ mddb_deleterec_wrapper(un->c.un_record_id); + if (recid != 0) mddb_deleterec_wrapper(recid); @@ -2430,11 +2473,17 @@ } kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); - rval = mdmn_ksend_message(setno, msgtype, msgflags, + rval = mdmn_ksend_message(setno, msgtype, msgflags, 0, (char *)&stchmsg, sizeof (stchmsg), kresult); if (!MDMN_KSEND_MSG_OK(rval, kresult)) { mdmn_ksend_show_error(rval, kresult, "STATE UPDATE"); + /* If we're shutting down already, pause things here. */ + if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { + while (!md_mn_is_commd_present()) { + delay(md_hz); + } + } cmn_err(CE_PANIC, "ksend_message failure: STATE_UPDATE"); } @@ -3435,11 +3484,12 @@ md_mps_t *ps = (md_mps_t *)dq; buf_t *pb = ps->ps_bp; mdi_unit_t *ui = ps->ps_ui; - mm_unit_t *un; + mm_unit_t *un = MD_UNIT(ui->ui_link.ln_id); set_t setno; int restart_resync; - un = md_unit_writerlock(ui); + mutex_enter(&un->un_rrp_inflight_mx); + (void) md_unit_writerlock(ui); ps->ps_un = un; setno = MD_MIN2SET(getminor(pb->b_edev)); if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) { @@ -3447,15 +3497,14 @@ * Synchronize our in-core view of what regions need to be * resync'd with the on-disk version. */ - mutex_enter(&un->un_rrp_inflight_mx); mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm, un->un_dirty_bm); - mutex_exit(&un->un_rrp_inflight_mx); /* Region dirty map is now up to date */ } restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 
1 : 0; md_unit_writerexit(ui); + mutex_exit(&un->un_rrp_inflight_mx); /* Restart the resync thread if it was previously blocked */ if (restart_resync) { @@ -3581,9 +3630,8 @@ kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); rval = mdmn_ksend_message(setno, - MD_MN_MSG_REQUIRE_OWNER, msg_flags, - /* flags */ (char *)msg, - sizeof (md_mn_req_owner_t), kres); + MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0, + (char *)msg, sizeof (md_mn_req_owner_t), kres); kmem_free(msg, sizeof (md_mn_req_owner_t)); @@ -3890,19 +3938,19 @@ } /* - * For Multinode mirrors with a Resync Region (not ABR) we need to - * become the mirror owner before continuing with the write(). For ABR - * mirrors we check that we 'own' the resync if we're in - * write-after-read mode. We do this _after_ ensuring that there are no - * overlaps to ensure that the once we know that we are the owner, the - * readerlock will not released until the write is complete. As a - * change of ownership in a MN set requires the writerlock, this - * ensures that ownership cannot be changed until the write is - * complete + * For Multinode mirrors with no owner and a Resync Region (not ABR) + * we need to become the mirror owner before continuing with the + * write(). For ABR mirrors we check that we 'own' the resync if + * we're in write-after-read mode. We do this _after_ ensuring that + * there are no overlaps to ensure that once we know that we are + * the owner, the readerlock will not be released until the write is + * complete. As a change of ownership in a MN set requires the + * writerlock, this ensures that ownership cannot be changed until + * the write is complete. 
*/ if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) { - if (!MD_MN_MIRROR_OWNER(un)) { + if (MD_MN_NO_MIRROR_OWNER(un)) { if (ps->ps_flags & MD_MPS_ON_OVERLAP) mirror_overlap_tree_remove(ps); md_kstat_waitq_exit(ui); @@ -3922,10 +3970,11 @@ if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) && !(flag & MD_STR_WAR)) { if (mirror_mark_resync_region(un, ps->ps_firstblk, - ps->ps_lastblk)) { + ps->ps_lastblk, md_mn_mynode_id)) { pb->b_flags |= B_ERROR; pb->b_resid = pb->b_bcount; - ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); + if (ps->ps_flags & MD_MPS_ON_OVERLAP) + mirror_overlap_tree_remove(ps); kmem_cache_free(mirror_parent_cache, ps); md_kstat_waitq_exit(ui); md_unit_readerexit(ui); @@ -4169,9 +4218,9 @@ /* * Before reading the buffer, see if - * we are the owner + * there is an owner. */ - if (!MD_MN_MIRROR_OWNER(un)) { + if (MD_MN_NO_MIRROR_OWNER(un)) { ps->ps_call = NULL; mirror_overlap_tree_remove(ps); md_kstat_waitq_exit(ui); @@ -4506,6 +4555,7 @@ md_error_t mde = mdnullerror; md_mps_t *ps; int rs_active; + int rr, rr_start, rr_end; /* Check that the given device is part of a multi-node set */ setno = MD_MIN2SET(p->mnum); @@ -4580,6 +4630,25 @@ if (p->rs_originator != md_mn_mynode_id) { /* + * Clear our un_resync_bm for the regions completed. + * The owner (originator) will take care of itself. + */ + BLK_TO_RR(rr_end, ps->ps_lastblk, un); + BLK_TO_RR(rr_start, p->rs_start, un); + if (ps->ps_lastblk && rr_end < rr_start) { + BLK_TO_RR(rr_start, ps->ps_firstblk, un); + mutex_enter(&un->un_resync_mx); + /* + * Update our resync bitmap to reflect that + * another node has synchronized this range. + */ + for (rr = rr_start; rr <= rr_end; rr++) { + CLR_KEEPDIRTY(rr, un); + } + mutex_exit(&un->un_resync_mx); + } + + /* * On all but the originating node, first update * the resync state, then unblock the previous * region and block the next one. 
No need @@ -4654,6 +4723,7 @@ &p->mde, lockp); } } + break; case MD_MN_MSG_RESYNC_FINISH: /* @@ -4792,6 +4862,24 @@ un->c.un_status &= ~MD_UN_KEEP_DIRTY; if (!broke_out) un->c.un_status &= ~MD_UN_WAR; + + /* + * Clear our un_resync_bm for the regions + * completed. The owner (originator) will + * take care of itself. + */ + if (p->rs_originator != md_mn_mynode_id && + (ps = un->un_rs_prev_overlap) != NULL) { + BLK_TO_RR(rr_start, ps->ps_firstblk, + un); + BLK_TO_RR(rr_end, ps->ps_lastblk, un); + mutex_enter(&un->un_resync_mx); + for (rr = rr_start; rr <= rr_end; + rr++) { + CLR_KEEPDIRTY(rr, un); + } + mutex_exit(&un->un_resync_mx); + } } /*
--- a/usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c Wed Dec 24 08:23:40 2008 -0700 @@ -1624,7 +1624,7 @@ kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); rval = mdmn_ksend_message(setno, MD_MN_MSG_CHOOSE_OWNER, - MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, (char *)msg, + MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)msg, sizeof (md_mn_msg_chooseid_t), kres); if (!MDMN_KSEND_MSG_OK(rval, kres)) { mdmn_ksend_show_error(rval, kres, "CHOOSE OWNER"); @@ -1664,7 +1664,8 @@ kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); rval = mdmn_ksend_message(setno, MD_MN_MSG_REQUIRE_OWNER, - MD_MSGF_NO_LOG, (char *)ownp, sizeof (md_mn_req_owner_t), kresult); + MD_MSGF_NO_LOG, 0, (char *)ownp, sizeof (md_mn_req_owner_t), + kresult); if (!MDMN_KSEND_MSG_OK(rval, kresult)) { /* @@ -2358,7 +2359,7 @@ kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); rval = mdmn_ksend_message(setno, MD_MN_MSG_GET_MIRROR_STATE, - MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, (char *)&msg, + MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (msg), kres); /* if the node hasn't yet joined, it's Ok. */ @@ -2949,6 +2950,42 @@ break; } + case MD_MN_RR_DIRTY: + { + sz = sizeof (md_mn_rr_dirty_params_t); + d = kmem_zalloc(sz, KM_SLEEP); + + if (ddi_copyin(data, d, sz, mode)) { + err = EFAULT; + break; + } + + err = mirror_set_dirty_rr((md_mn_rr_dirty_params_t *)d); + break; + } + + case MD_MN_RR_CLEAN: + { + md_mn_rr_clean_params_t tmp; + + /* get the first part of the structure to find the size */ + if (ddi_copyin(data, &tmp, sizeof (tmp), mode)) { + err = EFAULT; + break; + } + + sz = MDMN_RR_CLEAN_PARAMS_SIZE(&tmp); + d = kmem_zalloc(sz, KM_SLEEP); + + if (ddi_copyin(data, d, sz, mode)) { + err = EFAULT; + break; + } + + err = mirror_set_clean_rr((md_mn_rr_clean_params_t *)d); + break; + } + default: return (ENOTTY); }
--- a/usr/src/uts/common/io/lvm/mirror/mirror_resync.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/io/lvm/mirror/mirror_resync.c Wed Dec 24 08:23:40 2008 -0700 @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/param.h> #include <sys/systm.h> #include <sys/conf.h> @@ -67,7 +65,7 @@ extern md_ops_t mirror_md_ops; extern kmem_cache_t *mirror_child_cache; /* mirror child memory pool */ -extern mdq_anchor_t md_mstr_daemon; +extern mdq_anchor_t md_mto_daemon; extern daemon_request_t mirror_timeout; extern md_resync_t md_cpr_resync; extern clock_t md_hz; @@ -141,81 +139,365 @@ */ int md_max_xfer_bufsz = 2048; +/* + * mirror_generate_rr_bitmap: + * ------------------- + * Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean + * bitmap associated with mirror 'un' + * + * Input: + * un - mirror unit to get bitmap data from + * *msgp - location to return newly allocated md_mn_msg_rr_clean_t + * *activep- location to return # of active i/os + * + * Returns: + * 1 => dirty bits cleared from un_dirty_bm and DRL flush required + * *msgp contains bitmap of to-be-cleared bits + * 0 => no bits cleared + * *msgp == NULL + */ static int -process_resync_regions(mm_unit_t *un) +mirror_generate_rr_bitmap(mm_unit_t *un, md_mn_msg_rr_clean_t **msgp, + int *activep) { - int i; + unsigned int i, next_bit, data_bytes, start_bit; + int cleared_dirty = 0; + + /* Skip any initial 0s. 
*/ +retry_dirty_scan: + if ((start_bit = un->un_rr_clean_start_bit) >= un->un_rrd_num) + un->un_rr_clean_start_bit = start_bit = 0; + + /* + * Handle case where NO bits are set in PERNODE_DIRTY but the + * un_dirty_bm[] map does have entries set (after a 1st resync) + */ + for (; start_bit < un->un_rrd_num && + !IS_PERNODE_DIRTY(md_mn_mynode_id, start_bit, un) && + (un->un_pernode_dirty_sum[start_bit] != (uchar_t)0); start_bit++) + ; + + if (start_bit >= un->un_rrd_num) { + if (un->un_rr_clean_start_bit == 0) { + return (0); + } else { + un->un_rr_clean_start_bit = 0; + goto retry_dirty_scan; + } + } + + /* how much to fit into this message */ + data_bytes = MIN(howmany(un->un_rrd_num - start_bit, NBBY), + MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES); + + (*msgp) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes), + KM_SLEEP); + + (*msgp)->rr_nodeid = md_mn_mynode_id; + (*msgp)->rr_mnum = MD_SID(un); + MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp, start_bit, data_bytes); + + next_bit = MIN(start_bit + data_bytes * NBBY, un->un_rrd_num); + + for (i = start_bit; i < next_bit; i++) { + if (un->c.un_status & MD_UN_KEEP_DIRTY && IS_KEEPDIRTY(i, un)) { + continue; + } + if (!IS_REGION_DIRTY(i, un)) { + continue; + } + if (un->un_outstanding_writes[i] != 0) { + (*activep)++; + continue; + } + + /* + * Handle the case where a resync has completed and we still + * have the un_dirty_bm[] entries marked as dirty (these are + * the most recent DRL re-read from the replica). They need + * to be cleared from our un_dirty_bm[] but they will not have + * corresponding un_pernode_dirty[] entries set unless (and + * until) further write()s have been issued to the area. + * This handles the case where only the un_dirty_bm[] entry is + * set. Without this we'd not clear this region until a local + * write is issued to the affected area. 
+ */ + if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un) || + (un->un_pernode_dirty_sum[i] == (uchar_t)0)) { + if (!IS_GOING_CLEAN(i, un)) { + SET_GOING_CLEAN(i, un); + (*activep)++; + continue; + } + /* + * Now we've got a flagged pernode_dirty, _or_ a clean + * bitmap entry to process. Update the bitmap to flush + * the REGION_DIRTY / GOING_CLEAN bits when we send the + * cross-cluster message. + */ + cleared_dirty++; + setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp), i - start_bit); + } else { + /* + * Not marked as active in the pernode bitmap, so skip + * any update to this. We just increment the 0 count + * and adjust the active count by any outstanding + * un_pernode_dirty_sum[] entries. This means we don't + * leave the mirror permanently dirty. + */ + (*activep) += (int)un->un_pernode_dirty_sum[i]; + } + } + if (!cleared_dirty) { + kmem_free(*msgp, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes)); + *msgp = NULL; + } + un->un_rr_clean_start_bit = next_bit; + return (cleared_dirty); +} + +/* + * There are three paths into here: + * + * md_daemon -> check_resync_regions -> prr + * mirror_internal_close -> mirror_process_unit_resync -> prr + * mirror_set_capability -> mirror_process_unit_resync -> prr + * + * The first one is a kernel daemon, the other two result from system calls. + * Thus, only the first case needs to deal with kernel CPR activity. This + * is indicated by the cprinfop being non-NULL for kernel daemon calls, and + * NULL for system call paths. + */ +static int +process_resync_regions_non_owner(mm_unit_t *un, callb_cpr_t *cprinfop) +{ + int i, start, end; int cleared_dirty = 0; - /* - * Number of reasons why we can not - * proceed shutting down the mirror. - */ + /* Number of reasons why we can not proceed shutting down the mirror. 
*/ int active = 0; set_t setno = MD_UN2SET(un); + md_mn_msg_rr_clean_t *rmsg; + md_mn_kresult_t *kres; + int rval; + minor_t mnum = MD_SID(un); + mdi_unit_t *ui = MDI_UNIT(mnum); + md_mn_nodeid_t owner_node; /* - * Resync region processing must be - * single threaded. We can't use - * un_resync_mx for this purpose - * since this mutex gets released + * We drop the readerlock here to assist lock ordering with + * update_resync. Once we have the un_rrp_inflight_mx, we + * can re-acquire it. + */ + md_unit_readerexit(ui); + + /* + * Resync region processing must be single threaded. We can't use + * un_resync_mx for this purpose since this mutex gets released * when blocking on un_resync_cv. */ mutex_enter(&un->un_rrp_inflight_mx); + (void) md_unit_readerlock(ui); + mutex_enter(&un->un_resync_mx); - while (un->un_resync_flg & MM_RF_STALL_CLEAN) - cv_wait(&un->un_resync_cv, &un->un_resync_mx); - - /* - * For a mirror we can only update the resync-record if we currently - * own the mirror. If we are called and we don't have ownership we bail - * out before scanning the outstanding_writes[] array. This cannot be - * set as we'd have become the owner before initiating the i/o to the - * mirror. - * NOTE: we only need to check here (before scanning the array) as we - * are called with the readerlock held. This means that a change - * of ownership away from us will block until this resync check - * has completed. - */ - if (MD_MNSET_SETNO(setno)) { - if (!MD_MN_MIRROR_OWNER(un)) { - mutex_exit(&un->un_resync_mx); + + rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1], RW_READER); + cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active); + rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]); + + if (cleared_dirty) { + owner_node = un->un_mirror_owner; + mutex_exit(&un->un_resync_mx); + + /* + * Transmit the 'to-be-cleared' bitmap to all cluster nodes. + * Receipt of the message will cause the mirror owner to + * update the on-disk DRL. 
+ */ + + kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); + + /* release readerlock before sending message */ + md_unit_readerexit(ui); + + if (cprinfop) { + mutex_enter(&un->un_prr_cpr_mx); + CALLB_CPR_SAFE_BEGIN(cprinfop); + } + + rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_CLEAN, + MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_KSEND_NORETRY| + MD_MSGF_DIRECTED, un->un_mirror_owner, + (char *)rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg), kres); + + if (cprinfop) { + CALLB_CPR_SAFE_END(cprinfop, &un->un_prr_cpr_mx); + mutex_exit(&un->un_prr_cpr_mx); + } + + /* reacquire readerlock after message */ + (void) md_unit_readerlock(ui); + + if ((!MDMN_KSEND_MSG_OK(rval, kres)) && + (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) { + /* if commd is gone, no point in printing a message */ + if (md_mn_is_commd_present()) + mdmn_ksend_show_error(rval, kres, "RR_CLEAN"); + kmem_free(kres, sizeof (md_mn_kresult_t)); + kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg)); + mutex_exit(&un->un_rrp_inflight_mx); + return (active); + } + kmem_free(kres, sizeof (md_mn_kresult_t)); + + /* + * If ownership changed while we were sending, we probably + * sent the message to the wrong node. Leave fixing that for + * the next cycle. + */ + if (un->un_mirror_owner != owner_node) { mutex_exit(&un->un_rrp_inflight_mx); return (active); } + + /* + * Now that we've sent the message, clear them from the + * pernode_dirty arrays. These are ONLY cleared on a + * successful send, and failure has no impact. 
+ */ + cleared_dirty = 0; + start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg); + end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY; + mutex_enter(&un->un_resync_mx); + rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1], + RW_READER); + for (i = start; i < end; i++) { + if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg), + i - start)) { + if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un)) { + un->un_pernode_dirty_sum[i]--; + CLR_PERNODE_DIRTY(md_mn_mynode_id, i, + un); + } + if (IS_REGION_DIRTY(i, un)) { + cleared_dirty++; + CLR_REGION_DIRTY(i, un); + CLR_GOING_CLEAN(i, un); + } + } + } + rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]); + + kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg)); } - - for (i = 0; i < un->un_rrd_num; i++) { - - if (un->c.un_status & MD_UN_KEEP_DIRTY) - if (IS_KEEPDIRTY(i, un)) + mutex_exit(&un->un_resync_mx); + + mutex_exit(&un->un_rrp_inflight_mx); + + return (active); +} + +static int +process_resync_regions_owner(mm_unit_t *un) +{ + int i, start, end; + int cleared_dirty = 0; + /* Number of reasons why we can not proceed shutting down the mirror. */ + int active = 0; + set_t setno = MD_UN2SET(un); + int mnset = MD_MNSET_SETNO(setno); + md_mn_msg_rr_clean_t *rmsg; + minor_t mnum = MD_SID(un); + mdi_unit_t *ui = MDI_UNIT(mnum); + + /* + * We drop the readerlock here to assist lock ordering with + * update_resync. Once we have the un_rrp_inflight_mx, we + * can re-acquire it. + */ + md_unit_readerexit(ui); + + /* + * Resync region processing must be single threaded. We can't use + * un_resync_mx for this purpose since this mutex gets released + * when blocking on un_resync_cv. 
+ */ + mutex_enter(&un->un_rrp_inflight_mx); + + (void) md_unit_readerlock(ui); + + mutex_enter(&un->un_resync_mx); + un->un_waiting_to_clear++; + while (un->un_resync_flg & MM_RF_STALL_CLEAN) + cv_wait(&un->un_resync_cv, &un->un_resync_mx); + un->un_waiting_to_clear--; + + if (mnset) { + rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1], + RW_READER); + cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active); + + if (cleared_dirty) { + /* + * Clear the bits from the pernode_dirty arrays. + * If that results in any being cleared from the + * un_dirty_bm, commit it. + */ + cleared_dirty = 0; + start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg); + end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY; + for (i = start; i < end; i++) { + if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg), + i - start)) { + if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, + un)) { + un->un_pernode_dirty_sum[i]--; + CLR_PERNODE_DIRTY( + md_mn_mynode_id, i, un); + } + if (un->un_pernode_dirty_sum[i] == 0) { + cleared_dirty++; + CLR_REGION_DIRTY(i, un); + CLR_GOING_CLEAN(i, un); + } + } + } + kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg)); + } + rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]); + } else { + for (i = 0; i < un->un_rrd_num; i++) { + if (un->c.un_status & MD_UN_KEEP_DIRTY) + if (IS_KEEPDIRTY(i, un)) + continue; + + if (!IS_REGION_DIRTY(i, un)) continue; - - if (!IS_REGION_DIRTY(i, un)) - continue; - if (un->un_outstanding_writes[i] != 0) { - active++; - continue; + if (un->un_outstanding_writes[i] != 0) { + active++; + continue; + } + + if (!IS_GOING_CLEAN(i, un)) { + SET_GOING_CLEAN(i, un); + active++; + continue; + } + CLR_REGION_DIRTY(i, un); + CLR_GOING_CLEAN(i, un); + cleared_dirty++; } - - if (!IS_GOING_CLEAN(i, un)) { - SET_GOING_CLEAN(i, un); - active++; - continue; - } - CLR_REGION_DIRTY(i, un); - CLR_GOING_CLEAN(i, un); - cleared_dirty = 1; } + if (cleared_dirty) { un->un_resync_flg |= MM_RF_GATECLOSED; mutex_exit(&un->un_resync_mx); - 
mddb_commitrec_wrapper(un->un_rr_dirty_recid); - mutex_enter(&un->un_resync_mx); un->un_resync_flg &= ~MM_RF_GATECLOSED; - if (un->un_waiting_to_mark != 0) { + + if (un->un_waiting_to_mark != 0 || + un->un_waiting_to_clear != 0) { active++; cv_broadcast(&un->un_resync_cv); } @@ -227,6 +509,29 @@ return (active); } +static int +process_resync_regions(mm_unit_t *un, callb_cpr_t *cprinfop) +{ + int mnset = MD_MNSET_SETNO(MD_UN2SET(un)); + /* + * For a mirror we can only update the on-disk resync-record if we + * currently own the mirror. If we are called and there is no owner we + * bail out before scanning the outstanding_writes[] array. + * NOTE: we only need to check here (before scanning the array) as we + * are called with the readerlock held. This means that a change + * of ownership away from us will block until this resync check + * has completed. + */ + if (mnset && (MD_MN_NO_MIRROR_OWNER(un) || + (!MD_MN_MIRROR_OWNER(un) && !md_mn_is_commd_present_lite()))) { + return (0); + } else if (mnset && !MD_MN_MIRROR_OWNER(un)) { + return (process_resync_regions_non_owner(un, cprinfop)); + } else { + return (process_resync_regions_owner(un)); + } +} + /* * Function that is callable from other modules to provide * ability to cleanup dirty region bitmap on demand. Used @@ -240,7 +545,7 @@ { int cleans = 0; - while (process_resync_regions(un)) { + while (process_resync_regions(un, NULL)) { cleans++; if (cleans >= md_mirror_rr_cleans) { @@ -265,6 +570,7 @@ mdi_unit_t *ui; mm_unit_t *un; md_link_t *next; + callb_cpr_t cprinfo; rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) { @@ -272,8 +578,18 @@ if (md_get_setstatus(next->ln_setno) & MD_SET_STALE) continue; + un = MD_UNIT(next->ln_id); + + /* + * Register this resync thread with the CPR mechanism. This + * allows us to detect when the system is suspended and so + * keep track of the RPC failure condition. 
+ */ + CALLB_CPR_INIT(&cprinfo, &un->un_prr_cpr_mx, callb_md_mrs_cpr, + "check_resync_regions"); + ui = MDI_UNIT(next->ln_id); - un = (mm_unit_t *)md_unit_readerlock(ui); + (void) md_unit_readerlock(ui); /* * Do not clean up resync regions if it is an ABR @@ -287,8 +603,13 @@ continue; } - (void) process_resync_regions(un); + (void) process_resync_regions(un, &cprinfo); + md_unit_readerexit(ui); + + /* Remove this thread from the CPR callback table. */ + mutex_enter(&un->un_prr_cpr_mx); + CALLB_CPR_EXIT(&cprinfo); } rw_exit(&mirror_md_ops.md_link_rw.lock); @@ -306,7 +627,7 @@ mutex_enter(&mirror_timeout.dr_mx); if (!mirror_timeout.dr_pending) { mirror_timeout.dr_pending = 1; - daemon_request(&md_mstr_daemon, check_resync_regions, + daemon_request(&md_mto_daemon, check_resync_regions, (daemon_queue_t *)&mirror_timeout, REQ_OLD); } @@ -466,6 +787,7 @@ un->un_resync_flg = 0; un->un_waiting_to_mark = 0; un->un_waiting_to_commit = 0; + un->un_waiting_to_clear = 0; un->un_goingclean_bm = NULL; un->un_goingdirty_bm = NULL; @@ -505,6 +827,27 @@ un->un_resync_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany( un->un_rrd_num, NBBY)), KM_SLEEP); + /* + * Allocate pernode bitmap for this node. All other nodes' maps will + * be created 'on-the-fly' in the ioctl message handler + */ + if (MD_MNSET_SETNO(MD_UN2SET(un))) { + un->un_pernode_dirty_sum = + (uchar_t *)kmem_zalloc(un->un_rrd_num, KM_SLEEP); + if (md_mn_mynode_id > 0) { + un->un_pernode_dirty_bm[md_mn_mynode_id-1] = (uchar_t *) + kmem_zalloc((uint_t)(howmany(un->un_rrd_num, NBBY)), + KM_SLEEP); + } + + /* + * Allocate taskq to process deferred (due to locking) RR_CLEAN + * requests. 
+ */ + un->un_drl_task = (ddi_taskq_t *)md_create_taskq(MD_UN2SET(un), + MD_SID(un)); + } + if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE) return (0); @@ -734,7 +1077,7 @@ CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo); rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_PHASE_DONE, - MD_MSGF_NO_LOG, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres); + MD_MSGF_NO_LOG, 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres); CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx); mutex_exit(&un->un_rs_cpr_mx); @@ -743,6 +1086,12 @@ if ((!MDMN_KSEND_MSG_OK(rval, kres)) && (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) { mdmn_ksend_show_error(rval, kres, "RESYNC_PHASE_DONE"); + /* If we're shutting down already, pause things here. */ + if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) { + while (!md_mn_is_commd_present()) { + delay(md_hz); + } + } cmn_err(CE_PANIC, "ksend_message failure: RESYNC_PHASE_DONE"); } kmem_free(kres, sizeof (md_mn_kresult_t)); @@ -814,13 +1163,19 @@ CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo); rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_NEXT, MD_MSGF_NO_LOG, - (char *)rmsg, sizeof (md_mn_msg_resync_t), kres); + 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres); CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx); mutex_exit(&un->un_rs_cpr_mx); if (!MDMN_KSEND_MSG_OK(rval, kres)) { mdmn_ksend_show_error(rval, kres, "RESYNC_NEXT"); + /* If we're shutting down already, pause things here. 
*/ + if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) { + while (!md_mn_is_commd_present()) { + delay(md_hz); + } + } cmn_err(CE_PANIC, "ksend_message failure: RESYNC_NEXT"); } kmem_free(kres, sizeof (md_mn_kresult_t)); @@ -2301,7 +2656,7 @@ CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo); rval = mdmn_ksend_message(setno, - MD_MN_MSG_RESYNC_FINISH, MD_MSGF_NO_LOG, + MD_MN_MSG_RESYNC_FINISH, MD_MSGF_NO_LOG, 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres); CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, @@ -2311,6 +2666,12 @@ if (!MDMN_KSEND_MSG_OK(rval, kres)) { mdmn_ksend_show_error(rval, kres, "RESYNC_FINISH"); + /* If we're shutting down, pause things here. */ + if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) { + while (!md_mn_is_commd_present()) { + delay(md_hz); + } + } cmn_err(CE_PANIC, "ksend_message failure: RESYNC_FINISH"); } @@ -2693,30 +3054,209 @@ } int -mirror_mark_resync_region(struct mm_unit *un, - diskaddr_t startblk, diskaddr_t endblk) +mirror_mark_resync_region_non_owner(struct mm_unit *un, + diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node) { - int no_change; - size_t start_rr; - size_t current_rr; - size_t end_rr; + int no_change; + size_t start_rr; + size_t current_rr; + size_t end_rr; + md_mn_msg_rr_dirty_t *rr; + md_mn_kresult_t *kres; + set_t setno = MD_UN2SET(un); + int rval; + md_mn_nodeid_t node_idx = source_node - 1; + mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); + md_mn_nodeid_t owner_node; + minor_t mnum = MD_SID(un); if (un->un_nsm < 2) return (0); + /* + * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If + * not, allocate it and then fill the [start..end] entries. + * Update un_pernode_dirty_sum if we've gone 0->1. + * Update un_dirty_bm if the corresponding entries are clear. 
+ */ + rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER); + if (un->un_pernode_dirty_bm[node_idx] == NULL) { + un->un_pernode_dirty_bm[node_idx] = + (uchar_t *)kmem_zalloc( + (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP); + } + rw_exit(&un->un_pernode_dirty_mx[node_idx]); + BLK_TO_RR(end_rr, endblk, un); BLK_TO_RR(start_rr, startblk, un); - mutex_enter(&un->un_resync_mx); no_change = 1; + + mutex_enter(&un->un_resync_mx); + rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER); for (current_rr = start_rr; current_rr <= end_rr; current_rr++) { un->un_outstanding_writes[current_rr]++; + if (!IS_PERNODE_DIRTY(source_node, current_rr, un)) { + un->un_pernode_dirty_sum[current_rr]++; + SET_PERNODE_DIRTY(source_node, current_rr, un); + } + CLR_GOING_CLEAN(current_rr, un); + if (!IS_REGION_DIRTY(current_rr, un)) { + no_change = 0; + SET_REGION_DIRTY(current_rr, un); + SET_GOING_DIRTY(current_rr, un); + } else if (IS_GOING_DIRTY(current_rr, un)) + no_change = 0; + } + rw_exit(&un->un_pernode_dirty_mx[node_idx]); + mutex_exit(&un->un_resync_mx); + + if (no_change) { + return (0); + } + + /* + * If we have dirty regions to commit, send a + * message to the owning node so that the + * in-core bitmap gets updated appropriately. + * TODO: make this a kmem_cache pool to improve + * alloc/free performance ??? 
+ */
+	kres = (md_mn_kresult_t *)kmem_zalloc(sizeof (md_mn_kresult_t),
+	    KM_SLEEP);
+	rr = (md_mn_msg_rr_dirty_t *)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t),
+	    KM_SLEEP);
+
+resend_mmrr:
+	owner_node = un->un_mirror_owner;
+
+	rr->rr_mnum = mnum;
+	rr->rr_nodeid = md_mn_mynode_id;
+	rr->rr_range = (ushort_t)start_rr << 16;
+	rr->rr_range |= (ushort_t)end_rr & 0xFFFF;
+
+	/* release readerlock before sending message */
+	md_unit_readerexit(ui);
+
+	rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_DIRTY,
+	    MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_DIRECTED,
+	    un->un_mirror_owner, (char *)rr,
+	    sizeof (md_mn_msg_rr_dirty_t), kres);
+
+	/* reacquire readerlock on message completion */
+	(void) md_unit_readerlock(ui);
+
+	/* if the message send failed, note it, and pass an error back up */
+	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
+		/* if commd is gone, no point in printing a message */
+		if (md_mn_is_commd_present())
+			mdmn_ksend_show_error(rval, kres, "RR_DIRTY");
+		kmem_free(kres, sizeof (md_mn_kresult_t));
+		kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
+		return (1);
+	}
+
+	/*
+	 * if the owner changed while we were sending the message, and it's
+	 * not us, the new mirror owner won't yet have done the right thing
+	 * with our data. Let him know. If we became the owner, we'll
+	 * deal with that differently below. Note that receiving a message
+	 * about another node twice won't hurt anything.
+	 */
+	if (un->un_mirror_owner != owner_node && !MD_MN_MIRROR_OWNER(un))
+		goto resend_mmrr;
+
+	kmem_free(kres, sizeof (md_mn_kresult_t));
+	kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
+
+	mutex_enter(&un->un_resync_mx);
+
+	/*
+	 * If we became the owner while we were sending the message,
+	 * we have dirty bits in the un_pernode_bm that aren't yet reflected
+	 * in the un_dirty_bm, as it was re-read from disk, and our bits
+	 * are also not reflected in the on-disk DRL. Fix that now.
+ */ + if (MD_MN_MIRROR_OWNER(un)) { + rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER); + mirror_copy_rr(howmany(un->un_rrd_num, NBBY), + un->un_pernode_dirty_bm[node_idx], un->un_dirty_bm); + rw_exit(&un->un_pernode_dirty_mx[node_idx]); + + un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED; + + mutex_exit(&un->un_resync_mx); + mddb_commitrec_wrapper(un->un_rr_dirty_recid); + mutex_enter(&un->un_resync_mx); + + un->un_resync_flg &= ~(MM_RF_COMMITING | MM_RF_GATECLOSED); + cv_broadcast(&un->un_resync_cv); + } + + for (current_rr = start_rr; current_rr <= end_rr; current_rr++) + CLR_GOING_DIRTY(current_rr, un); + + mutex_exit(&un->un_resync_mx); + + return (0); +} + +int +mirror_mark_resync_region_owner(struct mm_unit *un, + diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node) +{ + int no_change; + size_t start_rr; + size_t current_rr; + size_t end_rr; + int mnset = MD_MNSET_SETNO(MD_UN2SET(un)); + md_mn_nodeid_t node_idx = source_node - 1; + + if (un->un_nsm < 2) + return (0); + + /* + * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If + * not, allocate it and then fill the [start..end] entries. + * Update un_pernode_dirty_sum if we've gone 0->1. + * Update un_dirty_bm if the corresponding entries are clear. 
+ */ + if (mnset) { + rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER); + if (un->un_pernode_dirty_bm[node_idx] == NULL) { + un->un_pernode_dirty_bm[node_idx] = + (uchar_t *)kmem_zalloc( + (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP); + } + rw_exit(&un->un_pernode_dirty_mx[node_idx]); + } + + mutex_enter(&un->un_resync_mx); + + if (mnset) + rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER); + + no_change = 1; + BLK_TO_RR(end_rr, endblk, un); + BLK_TO_RR(start_rr, startblk, un); + for (current_rr = start_rr; current_rr <= end_rr; current_rr++) { + if (!mnset || source_node == md_mn_mynode_id) + un->un_outstanding_writes[current_rr]++; + if (mnset) { + if (!IS_PERNODE_DIRTY(source_node, current_rr, un)) + un->un_pernode_dirty_sum[current_rr]++; + SET_PERNODE_DIRTY(source_node, current_rr, un); + } CLR_GOING_CLEAN(current_rr, un); if (!IS_REGION_DIRTY(current_rr, un)) no_change = 0; if (IS_GOING_DIRTY(current_rr, un)) no_change = 0; } + + if (mnset) + rw_exit(&un->un_pernode_dirty_mx[node_idx]); + if (no_change) { mutex_exit(&un->un_resync_mx); return (0); @@ -2741,7 +3281,7 @@ } } if (no_change) { - if (un->un_waiting_to_mark == 0) + if (un->un_waiting_to_mark == 0 || un->un_waiting_to_clear != 0) cv_broadcast(&un->un_resync_cv); mutex_exit(&un->un_resync_mx); return (0); @@ -2749,19 +3289,21 @@ un->un_resync_flg |= MM_RF_COMMIT_NEEDED; un->un_waiting_to_commit++; - while ((un->un_waiting_to_mark != 0) && - (!(un->un_resync_flg & MM_RF_GATECLOSED))) { + while (un->un_waiting_to_mark != 0 && + !(un->un_resync_flg & MM_RF_GATECLOSED)) { if (panicstr) return (1); cv_wait(&un->un_resync_cv, &un->un_resync_mx); } - if ((un->un_resync_flg & MM_RF_COMMIT_NEEDED)) { + if (un->un_resync_flg & MM_RF_COMMIT_NEEDED) { un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED; un->un_resync_flg &= ~MM_RF_COMMIT_NEEDED; + mutex_exit(&un->un_resync_mx); mddb_commitrec_wrapper(un->un_rr_dirty_recid); mutex_enter(&un->un_resync_mx); + un->un_resync_flg &= 
~MM_RF_COMMITING; cv_broadcast(&un->un_resync_cv); } @@ -2779,10 +3321,26 @@ cv_broadcast(&un->un_resync_cv); } mutex_exit(&un->un_resync_mx); + return (0); } int +mirror_mark_resync_region(struct mm_unit *un, + diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node) +{ + int mnset = MD_MNSET_SETNO(MD_UN2SET(un)); + + if (mnset && !MD_MN_MIRROR_OWNER(un)) { + return (mirror_mark_resync_region_non_owner(un, startblk, + endblk, source_node)); + } else { + return (mirror_mark_resync_region_owner(un, startblk, endblk, + source_node)); + } +} + +int mirror_resize_resync_regions(mm_unit_t *un, diskaddr_t new_tb) { short *owp; @@ -2793,9 +3351,10 @@ size_t size; mddb_recid_t recid, old_recid; uchar_t *old_dirty_bm; - int i; + int i, j; mddb_type_t typ1; set_t setno = MD_UN2SET(un); + uchar_t *old_pns; old_nregions = un->un_rrd_num; new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1); @@ -2840,6 +3399,11 @@ un->un_outstanding_writes = (short *)kmem_zalloc( new_nregions * sizeof (short), KM_SLEEP); + old_pns = un->un_pernode_dirty_sum; + if (old_pns) + un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(new_nregions, + KM_SLEEP); + /* * Now translate the old records into the new * records @@ -2847,15 +3411,41 @@ for (i = 0; i < old_nregions; i++) { /* * only bring forward the - * outstanding write counters and the dirty bits + * outstanding write counters and the dirty bits and also + * the pernode_summary counts */ if (!isset(old_dirty_bm, i)) continue; setbit(un->un_dirty_bm, (i / rr_mult)); un->un_outstanding_writes[(i / rr_mult)] += owp[i]; + if (old_pns) + un->un_pernode_dirty_sum[(i / rr_mult)] += old_pns[i]; } kmem_free((caddr_t)owp, old_nregions * sizeof (short)); + if (old_pns) + kmem_free((caddr_t)old_pns, old_nregions); + + /* + * Copy all non-zero un_pernode_dirty_bm[] arrays to new versions + */ + for (j = 0; j < MD_MNMAXSIDES; j++) { + rw_enter(&un->un_pernode_dirty_mx[j], RW_WRITER); + old_dirty_bm = un->un_pernode_dirty_bm[j]; + if 
(old_dirty_bm) { + un->un_pernode_dirty_bm[j] = (uchar_t *)kmem_zalloc( + new_bm_size, KM_SLEEP); + for (i = 0; i < old_nregions; i++) { + if (!isset(old_dirty_bm, i)) + continue; + + setbit(un->un_pernode_dirty_bm[j], + (i / rr_mult)); + } + kmem_free((caddr_t)old_dirty_bm, old_bm_size); + } + rw_exit(&un->un_pernode_dirty_mx[j]); + } /* Save the old record id */ old_recid = un->un_rr_dirty_recid; @@ -2891,6 +3481,7 @@ mddb_recid_t recid, old_recid; mddb_type_t typ1; set_t setno = MD_UN2SET(un); + int i; old_nregions = un->un_rrd_num; new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1); @@ -2924,6 +3515,8 @@ * un_goingclean_bm * un_resync_bm * un_outstanding_writes + * un_pernode_dirty_sum + * un_pernode_dirty_bm[] */ old = un->un_goingdirty_bm; un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP); @@ -2947,6 +3540,28 @@ old_nregions * sizeof (short)); kmem_free((caddr_t)owp, (old_nregions * sizeof (short))); + old = un->un_pernode_dirty_sum; + if (old) { + un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc( + new_nregions, KM_SLEEP); + bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_sum, + old_nregions); + kmem_free((caddr_t)old, old_nregions); + } + + for (i = 0; i < MD_MNMAXSIDES; i++) { + rw_enter(&un->un_pernode_dirty_mx[i], RW_WRITER); + old = un->un_pernode_dirty_bm[i]; + if (old) { + un->un_pernode_dirty_bm[i] = (uchar_t *)kmem_zalloc( + new_bm_size, KM_SLEEP); + bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_bm[i], + old_bm_size); + kmem_free((caddr_t)old, old_bm_size); + } + rw_exit(&un->un_pernode_dirty_mx[i]); + } + /* Save the old record id */ old_recid = un->un_rr_dirty_recid; @@ -2980,3 +3595,263 @@ for (i = 0; i < sz; i++) *dest++ |= *src++; } + +/* + * mirror_set_dirty_rr: + * ------------------- + * Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate. + * For the owning node (DRL/mirror owner) update the on-disk RR if needed. + * Called on every clean->dirty transition for the originating writer node. 
+ * Note: only the non-owning nodes will initiate this message and it is only + * the owning node that has to process it. + */ +int +mirror_set_dirty_rr(md_mn_rr_dirty_params_t *iocp) +{ + + minor_t mnum = iocp->rr_mnum; + mm_unit_t *un; + int start = (int)iocp->rr_start; + int end = (int)iocp->rr_end; + set_t setno = MD_MIN2SET(mnum); + md_mn_nodeid_t orignode = iocp->rr_nodeid; /* 1-based */ + diskaddr_t startblk, endblk; + + mdclrerror(&iocp->mde); + + if ((setno >= md_nsets) || + (MD_MIN2UNIT(mnum) >= md_nunits)) { + return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum)); + } + + /* Must have _NO_ ioctl lock set if we update the RR on-disk */ + un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL); + + if (un == NULL) { + return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum)); + } + if (un->c.un_type != MD_METAMIRROR) { + return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum)); + } + if (orignode < 1 || orignode >= MD_MNMAXSIDES) { + return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum)); + } + if (un->un_nsm < 2) { + return (0); + } + + /* + * Only process this message if we're the owner of the mirror. + */ + if (!MD_MN_MIRROR_OWNER(un)) { + return (0); + } + + RR_TO_BLK(startblk, start, un); + RR_TO_BLK(endblk, end, un); + return (mirror_mark_resync_region_owner(un, startblk, endblk, + orignode)); +} + +/* + * mirror_clean_rr_bits: + * -------------------- + * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap + * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region + * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all + * nodes. Callable from ioctl / interrupt / whatever context. + * un_resync_mx is held on entry. 
+ */ +static void +mirror_clean_rr_bits( + md_mn_rr_clean_params_t *iocp) +{ + minor_t mnum = iocp->rr_mnum; + mm_unit_t *un; + uint_t cleared_bits; + md_mn_nodeid_t node = iocp->rr_nodeid - 1; + md_mn_nodeid_t orignode = iocp->rr_nodeid; + int i, start, end; + + un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL); + + cleared_bits = 0; + start = MDMN_RR_CLEAN_PARAMS_START_BIT(iocp); + end = start + MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp) * NBBY; + rw_enter(&un->un_pernode_dirty_mx[node], RW_READER); + for (i = start; i < end; i++) { + if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp), i - start)) { + if (IS_PERNODE_DIRTY(orignode, i, un)) { + un->un_pernode_dirty_sum[i]--; + CLR_PERNODE_DIRTY(orignode, i, un); + } + if (un->un_pernode_dirty_sum[i] == 0) { + cleared_bits++; + CLR_REGION_DIRTY(i, un); + CLR_GOING_CLEAN(i, un); + } + } + } + rw_exit(&un->un_pernode_dirty_mx[node]); + if (cleared_bits) { + /* + * We can only be called iff we are the mirror owner, however + * as this is a (potentially) decoupled routine the ownership + * may have moved from us by the time we get to execute the + * bit clearing. Hence we still need to check for being the + * owner before flushing the DRL to the replica. + */ + if (MD_MN_MIRROR_OWNER(un)) { + mutex_exit(&un->un_resync_mx); + mddb_commitrec_wrapper(un->un_rr_dirty_recid); + mutex_enter(&un->un_resync_mx); + } + } +} + +/* + * mirror_drl_task: + * --------------- + * Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call + * We need to obtain exclusive access to the un_resync_cv and then clear the + * necessary bits. + * On completion, we must also free the passed in argument as it is allocated + * at the end of the ioctl handler and won't be freed on completion. 
+ */ +static void +mirror_drl_task(void *arg) +{ + md_mn_rr_clean_params_t *iocp = (md_mn_rr_clean_params_t *)arg; + minor_t mnum = iocp->rr_mnum; + mm_unit_t *un; + + un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL); + + mutex_enter(&un->un_rrp_inflight_mx); + mutex_enter(&un->un_resync_mx); + un->un_waiting_to_clear++; + while (un->un_resync_flg & MM_RF_STALL_CLEAN) + cv_wait(&un->un_resync_cv, &un->un_resync_mx); + un->un_waiting_to_clear--; + + un->un_resync_flg |= MM_RF_GATECLOSED; + mirror_clean_rr_bits(iocp); + un->un_resync_flg &= ~MM_RF_GATECLOSED; + if (un->un_waiting_to_mark != 0 || un->un_waiting_to_clear != 0) { + cv_broadcast(&un->un_resync_cv); + } + mutex_exit(&un->un_resync_mx); + mutex_exit(&un->un_rrp_inflight_mx); + + kmem_free((caddr_t)iocp, MDMN_RR_CLEAN_PARAMS_SIZE(iocp)); +} + +/* + * mirror_set_clean_rr: + * ------------------- + * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap + * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region + * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all + * nodes. + * + * Only the mirror-owner need process this message as it is the only RR updater. + * Non-owner nodes issue this request, but as we have no point-to-point message + * support we will receive the message on all nodes. 
+ */
+int
+mirror_set_clean_rr(md_mn_rr_clean_params_t *iocp)
+{
+
+	minor_t			mnum = iocp->rr_mnum;
+	mm_unit_t		*un;
+	set_t			setno = MD_MIN2SET(mnum);
+	md_mn_nodeid_t		node = iocp->rr_nodeid - 1;
+	int			can_clear = 0;
+	md_mn_rr_clean_params_t	*newiocp;
+	int			rval = 0;
+
+	mdclrerror(&iocp->mde);
+
+	if ((setno >= md_nsets) ||
+	    (MD_MIN2UNIT(mnum) >= md_nunits)) {
+		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
+	}
+
+	/* Must have _NO_ ioctl lock set if we update the RR on-disk */
+	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
+
+	if (un == NULL) {
+		return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
+	}
+	if (un->c.un_type != MD_METAMIRROR) {
+		return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
+	}
+	if (un->un_nsm < 2) {
+		return (0);
+	}
+
+	/*
+	 * Check to see if we're the mirror owner. If not, there's nothing
+	 * for us to do.
+	 */
+	if (!MD_MN_MIRROR_OWNER(un)) {
+		return (0);
+	}
+
+	/*
+	 * Process the to-be-cleaned bitmap. We need to update the pernode_dirty
+	 * bits and pernode_dirty_sum[n], and if, and only if, the sum goes 0
+	 * we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
+	 * we can just defer this cleaning until the next process_resync_regions
+	 * timeout.
+	 */
+	rw_enter(&un->un_pernode_dirty_mx[node], RW_WRITER);
+	if (un->un_pernode_dirty_bm[node] == NULL) {
+		un->un_pernode_dirty_bm[node] = (uchar_t *)kmem_zalloc(
+		    un->un_rrd_num, KM_SLEEP);
+	}
+	rw_exit(&un->un_pernode_dirty_mx[node]);
+
+	/*
+	 * See if we can simply clear the un_dirty_bm[] entries. If we're not
+	 * the issuing node _and_ we aren't in the process of marking/clearing
+	 * the RR bitmaps, we can simply update the bits as needed.
+	 * If we're the owning node and _not_ the issuing node, we should also
+	 * sync the RR if we clear any bits in it.
+	 */
+	mutex_enter(&un->un_resync_mx);
+	can_clear = (un->un_resync_flg & MM_RF_STALL_CLEAN) ?
0 : 1; + if (can_clear) { + un->un_resync_flg |= MM_RF_GATECLOSED; + mirror_clean_rr_bits(iocp); + un->un_resync_flg &= ~MM_RF_GATECLOSED; + if (un->un_waiting_to_mark != 0 || + un->un_waiting_to_clear != 0) { + cv_broadcast(&un->un_resync_cv); + } + } + mutex_exit(&un->un_resync_mx); + + /* + * If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r + * we must schedule a blocking call to update the DRL on this node. + * As we're invoked from an ioctl we are going to have the original data + * disappear (kmem_free) once we return. So, copy the data into a new + * structure and let the taskq routine release it on completion. + */ + if (!can_clear) { + size_t sz = MDMN_RR_CLEAN_PARAMS_SIZE(iocp); + + newiocp = (md_mn_rr_clean_params_t *)kmem_alloc(sz, KM_SLEEP); + + bcopy(iocp, newiocp, sz); + + if (ddi_taskq_dispatch(un->un_drl_task, mirror_drl_task, + newiocp, DDI_NOSLEEP) != DDI_SUCCESS) { + kmem_free(newiocp, sz); + rval = ENOMEM; /* probably starvation */ + } + } + + return (rval); +}
--- a/usr/src/uts/common/io/lvm/softpart/sp.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/io/lvm/softpart/sp.c Wed Dec 24 08:23:40 2008 -0700 @@ -118,6 +118,7 @@ extern kmutex_t md_mx; extern kcondvar_t md_cv; extern md_krwlock_t md_unit_array_rw; +extern clock_t md_hz; static kmem_cache_t *sp_parent_cache = NULL; static kmem_cache_t *sp_child_cache = NULL; @@ -341,15 +342,19 @@ kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG, - (char *)&sp_msg, sizeof (sp_msg), kres); + 0, (char *)&sp_msg, sizeof (sp_msg), kres); if (!MDMN_KSEND_MSG_OK(rval, kres)) { mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2"); - + /* If we're shutting down already, pause things here. */ + if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) { + while (!md_mn_is_commd_present()) { + delay(md_hz); + } + } /* * Panic as we are now in an inconsistent state. */ - cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n", md_shortname(MD_SID(un)), str); }
--- a/usr/src/uts/common/io/lvm/softpart/sp_ioctl.c Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/io/lvm/softpart/sp_ioctl.c Wed Dec 24 08:23:40 2008 -0700 @@ -1150,6 +1150,7 @@ } case MD_IOC_SPUPDATEWM: + case MD_MN_IOC_SPUPDATEWM: { if (! (mode & FWRITE)) return (EACCES);
--- a/usr/src/uts/common/sys/lvm/md_mirror.h Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/sys/lvm/md_mirror.h Wed Dec 24 08:23:40 2008 -0700 @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -30,6 +31,9 @@ #include <sys/lvm/mdvar.h> #include <sys/lvm/md_mirror_shared.h> #include <sys/lvm/md_rename.h> +#ifdef _KERNEL +#include <sys/sunddi.h> +#endif #ifdef __cplusplus extern "C" { @@ -331,9 +335,24 @@ kcondvar_t un_dmr_cv; /* condvar for DMR requests */ int un_dmr_last_read; /* last DMR submirror read */ callb_cpr_t un_rs_cprinfo; /* CPR info for resync thread */ - kmutex_t un_rs_cpr_mx; /* Mutex for CPR info */ + kmutex_t un_rs_cpr_mx; /* mutex for resync CPR info */ + kmutex_t un_prr_cpr_mx; /* mutex for prr CPR info */ uint_t un_resync_completed; /* type of last resync */ int un_abr_count; /* count of sp's with abr set */ + + uchar_t *un_pernode_dirty_bm[MD_MNMAXSIDES]; + uchar_t *un_pernode_dirty_sum; + + krwlock_t un_pernode_dirty_mx[MD_MNMAXSIDES]; + ushort_t un_rr_clean_start_bit; /* where to start next clean */ + +#ifdef _KERNEL + ddi_taskq_t *un_drl_task; /* deferred RR_CLEAN taskq */ +#else + void *un_drl_task; /* deferred RR_CLEAN taskq */ +#endif /* _KERNEL */ + uint_t un_waiting_to_clear; /* Blocked waiting to clear */ + }mm_mirror_ic_t; #define MM_MN_OWNER_SENT 0x0001 /* RPC in progress */ @@ -416,9 +435,15 @@ #define un_dmr_last_read un_mmic.un_dmr_last_read #define un_rs_cprinfo un_mmic.un_rs_cprinfo #define un_rs_cpr_mx un_mmic.un_rs_cpr_mx +#define un_prr_cpr_mx un_mmic.un_prr_cpr_mx #define un_resync_completed un_mmic.un_resync_completed #define un_abr_count un_mmic.un_abr_count - +#define un_pernode_dirty_bm un_mmic.un_pernode_dirty_bm +#define un_pernode_dirty_sum un_mmic.un_pernode_dirty_sum +#define un_pernode_dirty_mx un_mmic.un_pernode_dirty_mx +#define un_rr_clean_start_bit un_mmic.un_rr_clean_start_bit +#define un_drl_task 
un_mmic.un_drl_task +#define un_waiting_to_clear un_mmic.un_waiting_to_clear #define MM_RF_GATECLOSED 0x0001 #define MM_RF_COMMIT_NEEDED 0x0002 @@ -497,6 +522,12 @@ #define IS_KEEPDIRTY(i, un) (isset((un)->un_resync_bm, (i))) #define CLR_KEEPDIRTY(i, un) (clrbit((un)->un_resync_bm, (i))) +#define IS_PERNODE_DIRTY(n, i, un) \ + (isset((un)->un_pernode_dirty_bm[(n)-1], (i))) +#define CLR_PERNODE_DIRTY(n, i, un) \ + (clrbit((un)->un_pernode_dirty_bm[(n)-1], (i))) +#define SET_PERNODE_DIRTY(n, i, un) \ + (setbit((un)->un_pernode_dirty_bm[(n)-1], (i))) /* * Write-On-Write handling. @@ -579,13 +610,15 @@ md_error_t *ep, IOLOCK *); extern int mirror_ioctl_resync(md_resync_ioctl_t *p, IOLOCK *); extern int mirror_mark_resync_region(mm_unit_t *, diskaddr_t, - diskaddr_t); + diskaddr_t, md_mn_nodeid_t); extern void resync_start_timeout(set_t setno); extern int mirror_resize_resync_regions(mm_unit_t *, diskaddr_t); extern int mirror_add_resync_regions(mm_unit_t *, diskaddr_t); extern int mirror_probedevs(md_probedev_t *, IOLOCK *); extern void mirror_copy_rr(int, uchar_t *, uchar_t *); extern void mirror_process_unit_resync(mm_unit_t *); +extern int mirror_set_dirty_rr(md_mn_rr_dirty_params_t *); +extern int mirror_set_clean_rr(md_mn_rr_clean_params_t *); #endif /* _KERNEL */ #ifdef __cplusplus
--- a/usr/src/uts/common/sys/lvm/md_sp.h Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/sys/lvm/md_sp.h Wed Dec 24 08:23:40 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,16 +18,15 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS__MD_SP_H #define _SYS__MD_SP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/lvm/mdvar.h> #ifdef __cplusplus @@ -99,6 +97,7 @@ #define MD_IOC_SPSTATUS (MDIOC_MISC|0) #define MD_IOC_SPUPDATEWM (MDIOC_MISC|1) #define MD_IOC_SPREADWM (MDIOC_MISC|2) +#define MD_MN_IOC_SPUPDATEWM (MDIOC_MISC|3) #ifdef _KERNEL
--- a/usr/src/uts/common/sys/lvm/mdio.h Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/sys/lvm/mdio.h Wed Dec 24 08:23:40 2008 -0700 @@ -18,16 +18,15 @@ * * CDDL HEADER END */ + /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS__MDIO_H #define _SYS__MDIO_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/debug.h> #include <sys/ioctl.h> #include <sys/types.h> @@ -433,6 +432,31 @@ unit_t un; } md_mkdev_params_t; +#define MDMN_RR_CLEAN_PARAMS_DATA(x) ((unsigned char *)(x) + \ + sizeof (md_mn_rr_clean_params_t)) +#define MDMN_RR_CLEAN_PARAMS_SIZE(x) (sizeof (md_mn_rr_clean_params_t) + \ + MDMN_RR_CLEAN_PARAMS_DATA_BYTES(x)) +#define MDMN_RR_CLEAN_PARAMS_START_BIT(x) ((x)->rr_start_size >> 16) +#define MDMN_RR_CLEAN_PARAMS_DATA_BYTES(x) ((x)->rr_start_size & 0xffff) + +typedef struct md_mn_rr_clean_params { + MD_DRIVER + md_error_t mde; + md_mn_nodeid_t rr_nodeid; + minor_t rr_mnum; + unsigned int rr_start_size; /* start_bit (16b) | data_bytes (16b) */ + /* actual data goes here */ +} md_mn_rr_clean_params_t; + +typedef struct md_mn_rr_dirty_params { + MD_DRIVER + md_error_t mde; + minor_t rr_mnum; + md_mn_nodeid_t rr_nodeid; + ushort_t rr_start; /* First RR region to mark */ + ushort_t rr_end; /* Last RR region to mark */ +} md_mn_rr_dirty_params_t; + /* * Flags to coordinate sending device id between kernel and user space. * To get devid from kernel: @@ -756,7 +780,8 @@ #define MD_IOCGET_HSP_NM (MDIOC|105) /* get hsp entry from namespace */ #define MD_IOCREM_DEV (MDIOC|106) /* remove device node for unit */ #define MD_IOCUPDATE_NM_RR_DID (MDIOC|107) /* update remotely repl did in NM */ - +#define MD_MN_RR_DIRTY (MDIOC|108) /* Mark RR range as dirty */ +#define MD_MN_RR_CLEAN (MDIOC|109) /* Clean RR bits from bitmap */ #define MDIOC_MISC (MDIOC|128) /* misc module base */ /* Used in DEBUG_TEST code */
--- a/usr/src/uts/common/sys/lvm/mdmn_commd.x Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/sys/lvm/mdmn_commd.x Wed Dec 24 08:23:40 2008 -0700 @@ -20,11 +20,10 @@ % */ % %/* -% * Copyright 2006 Sun Microsystems, Inc. All rights reserved. +% * Copyright 2008 Sun Microsystems, Inc. All rights reserved. % * Use is subject to license terms. % */ % -%#pragma ident "%Z%%M% %I% %E% SMI" %#include <sys/types.h> %#include <sys/types32.h> @@ -103,6 +102,8 @@ MD_MN_MSG_SETSYNC, /* Set resync status */ MD_MN_MSG_POKE_HOTSPARES, /* Call poke_hotspares */ MD_MN_MSG_ADDMDNAME, /* Add metadevice name */ + MD_MN_MSG_RR_DIRTY, /* Mark RR range as dirty */ + MD_MN_MSG_RR_CLEAN, /* Mark RR range as clean */ MD_MN_NMESSAGES /* insert elements before */ }; @@ -361,6 +362,39 @@ minor_t pokehsp_setno; }; +/* Message format for MD_MN_MSG_RR_DIRTY message */ +struct md_mn_msg_rr_dirty_t { + minor_t rr_mnum; + int rr_nodeid; + u_int rr_range; /* Start(16bits) | End(16bits) */ +}; + +/* Message format for MD_MN_MSG_RR_CLEAN message */ +%#define MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES \ +% ((MDMN_MAX_KMSG_DATA) - \ +% sizeof (struct md_mn_msg_rr_clean_t)) +%#define MDMN_MSG_RR_CLEAN_SIZE_DATA(x) \ +% (sizeof (struct md_mn_msg_rr_clean_t) + (x)) +%#define MDMN_MSG_RR_CLEAN_MSG_SIZE(x) \ +% (sizeof (struct md_mn_msg_rr_clean_t) \ +% + MDMN_MSG_RR_CLEAN_DATA_BYTES(x)) +%#define MDMN_MSG_RR_CLEAN_DATA(x) \ +% ((unsigned char *)(x) + \ +% sizeof (struct md_mn_msg_rr_clean_t)) + +/* since we cannot use ushorts, some macros to extract the parts from an int */ +%#define MDMN_MSG_RR_CLEAN_START_BIT(x) ((x)->rr_start_size >> 16) +%#define MDMN_MSG_RR_CLEAN_DATA_BYTES(x) ((x)->rr_start_size & 0xffff) +%#define MDMN_MSG_RR_CLEAN_START_SIZE_SET(x, start, size) \ +% ((x)->rr_start_size = (start << 16) | size) + +struct md_mn_msg_rr_clean_t { + md_mn_nodeid_t rr_nodeid; + unsigned int rr_mnum; + unsigned int rr_start_size; /* start_bit (16b) | data_bytes (16b) */ + /* actual data goes here */ +}; + 
%#define MD_MSGF_NO_LOG 0x00000001 %#define MD_MSGF_NO_BCAST 0x00000002 %#define MD_MSGF_STOP_ON_ERROR 0x00000004 @@ -373,6 +407,9 @@ %#define MD_MSGF_FAIL_ON_SUSPEND 0x00000200 %#define MD_MSGF_NO_MCT 0x00000400 %#define MD_MSGF_PANIC_WHEN_INCONSISTENT 0x00000800 +%#define MD_MSGF_BLK_SIGNAL 0x00001000 +%#define MD_MSGF_KSEND_NORETRY 0x00002000 +%#define MD_MSGF_DIRECTED 0x00004000 %#define MD_MSGF_VERBOSE 0x10000000 %#define MD_MSGF_VERBOSE_2 0x20000000 @@ -418,7 +455,8 @@ u_int msg_flags; /* See MD_MSGF_* above */ set_t msg_setno; /* which set is involved */ md_mn_msgtype_t msg_type; /* what type of message */ - char msg_spare[32]; /* Always good to hav'em */ + md_mn_nodeid_t msg_recipient; /* who to send DIRECTED message to */ + char msg_spare[28]; /* Always good to hav'em */ opaque msg_event<>; /* the actual event wrapped up */ }; %#define msg_event_data msg_event.msg_event_val @@ -435,7 +473,8 @@ uint32_t msg_flags; /* See MD_MSGF_* above */ set_t msg_setno; /* which set is involved */ md_mn_msgtype_t msg_type; /* what type of message */ - char msg_spare[32]; /* Always good to hav'em */ + md_mn_nodeid_t msg_recipient; /* who to send DIRECTED message to */ + char msg_spare[28]; /* Always good to hav'em */ uint32_t msg_ev_len; char msg_ev_val[MD_MN_MSG_MAXDATALEN]; }; @@ -450,6 +489,7 @@ u_int kmsg_flags; set_t kmsg_setno; md_mn_msgtype_t kmsg_type; + md_mn_nodeid_t kmsg_recipient; /* who to send DIRECTED message to */ int kmsg_size; char kmsg_data[MDMN_MAX_KMSG_DATA]; }; @@ -549,7 +589,7 @@ program MDMN_COMMD { - version ONE { + version TWO { md_mn_result_t mdmn_send(md_mn_msg_t) = 1; @@ -579,5 +619,5 @@ int mdmn_comm_msglock(md_mn_type_and_lock_t) = 10; - } = 1; + } = 2; } = 100422;
--- a/usr/src/uts/common/sys/lvm/mdvar.h Wed Dec 24 05:48:11 2008 -0800 +++ b/usr/src/uts/common/sys/lvm/mdvar.h Wed Dec 24 08:23:40 2008 -0700 @@ -744,8 +744,8 @@ extern int md_check_ioctl_against_unit(int, mdc_unit_t); extern mddb_recid_t md_vtoc_to_efi_record(mddb_recid_t, set_t); -extern int mdmn_ksend_message(set_t, md_mn_msgtype_t, uint_t, char *, int, - md_mn_kresult_t *); +extern int mdmn_ksend_message(set_t, md_mn_msgtype_t, uint_t, + md_mn_nodeid_t, char *, int, md_mn_kresult_t *); extern void mdmn_ksend_show_error(int, md_mn_kresult_t *, const char *); extern int mdmn_send_capability_message(minor_t, volcap_t, IOLOCK *); extern void mdmn_clear_all_capabilities(minor_t); @@ -755,9 +755,11 @@ extern void md_upd_set_unnext(set_t, unit_t); extern int md_rem_selfname(minor_t); extern void md_rem_hspname(set_t, mdkey_t); +extern void *md_create_taskq(set_t, minor_t); /* Externals from md_ioctl.c */ extern int md_mn_is_commd_present(void); +extern int md_mn_is_commd_present_lite(void); extern void md_mn_clear_commd_present(void); extern int md_admin_ioctl(md_dev64_t, int, caddr_t, int, IOLOCK *lockp); extern void md_get_geom(md_unit_t *, struct dk_geom *);