0
|
1 /*
|
|
2 * CDDL HEADER START
|
|
3 *
|
|
4 * The contents of this file are subject to the terms of the
|
|
5 * Common Development and Distribution License (the "License").
|
|
6 * You may not use this file except in compliance with the License.
|
|
7 *
|
|
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
9 * or http://www.opensolaris.org/os/licensing.
|
|
10 * See the License for the specific language governing permissions
|
|
11 * and limitations under the License.
|
|
12 *
|
|
13 * When distributing Covered Code, include this CDDL HEADER in each
|
|
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
15 * If applicable, add the following below this CDDL HEADER, with the
|
|
16 * fields enclosed by brackets "[]" replaced with your own identifying
|
|
17 * information: Portions Copyright [yyyy] [name of copyright owner]
|
|
18 *
|
|
19 * CDDL HEADER END
|
|
20 */
|
|
21 /*
|
|
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
|
|
23 * Use is subject to license terms.
|
|
24 */
|
|
25
|
|
26 #ifndef _CMD_MEM_H
|
|
27 #define _CMD_MEM_H
|
|
28
|
|
29 #pragma ident "@(#)cmd_mem.h 1.14 07/07/26 SMI"
|
|
30
|
|
31 /*
|
|
32 * Support routines for managing state related to memory modules.
|
|
33 *
|
|
34 * Correctable errors generally cause changes to the DIMM-related state (see
|
|
35 * cmd_dimm.c), whereas uncorrectable errors tend to use the bank-related
|
|
36 * routines (see cmd_bank.c). The primary exception to this division (though
|
|
37 * it eventually devolves to one of the two) is the RxE/FRx pair emitted by
|
|
38 * UltraSPARC-IIIi processors. With these errors, a complete pair must be
|
|
39 * received and matched before we know whether we're dealing with a CE or a UE.
|
|
40 */
|
|
41
|
|
42 #include <cmd.h>
|
|
43 #include <cmd_state.h>
|
|
44 #include <cmd_fmri.h>
|
|
45 #include <sys/errclassify.h>
|
|
46 #include <cmd_cpu.h>
|
|
47
|
|
48 #ifdef __cplusplus
|
|
49 extern "C" {
|
|
50 #endif
|
|
51
|
|
/*
 * Flag value 0x1.
 * NOTE(review): presumably marks memory-module state as being faulted; the
 * field this flag is stored in is not visible in this header -- confirm.
 */
52 #define CMD_MEM_F_FAULTING 0x1
|
|
53
|
|
54 /*
|
|
55 * Used to store as-yet unmatched IOxEs, RxEs, and FRxs. When a new IOxE,
|
|
56 * RxE or FRx arrives, we traverse the cmd.cmd_iorxefrx list, looking for
|
|
57 * matching entries. Matching has a cpuid-based component, as well as a
|
|
58 * temporal one. We can compare the cpuids directly, using the cmd_iorxefrx_t
|
|
59 * and the newly-received event. Temporal comparison isn't performed directly.
|
|
60 * Instead, we ensure that entries in the iorxefrx list are removed when they
|
|
61 * expire by means of timers. This frees the matching code from the need to
|
|
62 * worry about time.
|
|
63 */
|
|
64 typedef struct cmd_iorxefrx {
|
|
65 cmd_list_t rf_list; /* List of cmd_iorxefrx_t's */
|
|
66 cmd_errcl_t rf_errcl; /* Error type (CMD_ERRCL_*) */
|
|
67 uint_t rf_afsr_agentid; /* Remote Agent ID (from AFSR) */
|
|
68 uint_t rf_det_agentid; /* Local Agent ID (from detector) */
|
|
69 id_t rf_expid; /* Timer ID for entry expiration */
|
|
70 uint64_t rf_afar; /* Valid for RxE only */
|
|
71 uint8_t rf_afar_status; /* Valid for RxE only */
|
|
72 ce_dispact_t rf_type; /* Valid for RxE only */
|
|
73 uint16_t rf_synd; /* Valid for FRx only */
|
|
74 uint8_t rf_synd_status; /* Valid for FRx only */
|
|
75 uint64_t rf_afsr; /* Valid for FRx only */
|
|
76 uint64_t rf_disp; /* Valid for RCE only */
|
|
77 } cmd_iorxefrx_t;
|
|
78
|
|
/*
 * Forward declarations of incomplete types; the structure bodies are not
 * defined in this header.  cmd_branch_t is declared only when sun4v is
 * defined.
 */
79 typedef struct cmd_dimm cmd_dimm_t;
|
|
80 typedef struct cmd_bank cmd_bank_t;
|
|
81 #ifdef sun4v
|
|
82 typedef struct cmd_branch cmd_branch_t;
|
|
83 #endif
|
|
84
|
|
85 /*
|
|
86 * Correctable and Uncorrectable memory errors
|
|
87 *
|
|
88 * CEs of "Unknown" or "Intermittent" classification are not used in diagnosis.
|
|
89 *
|
|
90 * "Persistent" CEs are added to per-DIMM SERD engines. When the
|
|
91 * engine for a given DIMM fires, the page corresponding to the CE that
|
|
92 * caused the engine to fire is retired, and the SERD engine for that
|
|
93 * DIMM is reset.
|
|
94 *
|
|
95 * "Possibly Persistent" CEs are at least Persistent and so are treated
|
|
96 * as "Persistent" errors above, being added to the same SERD engines.
|
|
97 *
|
|
98 * "Leaky" CEs and "Sticky" CEs trigger immediate page retirement.
|
|
99 *
|
|
100 * "Possibly Sticky" CEs to which no valid partner test has been applied
|
|
101 * are not used in diagnosis. Where a valid partner test has been applied
|
|
102 * but did not confirm "Sticky" status there is a _suggestion_ that the
|
|
103 * original cpu may be a bad reader or writer or suffering from other
|
|
104 * datapath issues. To avoid retiring pages for such non-DIMM problems
|
|
105 * these classifications are also not used in diagnosis.
|
|
106 *
|
|
107 * UEs immediately trigger page retirements, but do not affect the CE SERD
|
|
108 * engines. In addition, UEs are recorded in the UE caches of the detecting
|
|
109 * CPUs. When a page is to be retired, a fault.memory.page fault is
|
|
110 * generated.
|
|
111 *
|
|
112 */
|
|
113
|
|
/*
 * Function type for the common CE/UE handlers: cmd_ce_common() and
 * cmd_ue_common() are declared in this header with exactly this parameter
 * list and return type.
 */
114 typedef cmd_evdisp_t cmd_xe_handler_f(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
|
|
115 const char *, uint64_t, uint8_t, uint16_t, uint8_t, ce_dispact_t, uint64_t,
|
|
116 nvlist_t *);
|
|
117
|
|
/*
 * Conversion helpers.
 * NOTE(review): meanings are inferred from names and types only (a name
 * string to a ce_dispact_t, a 16-bit syndrome to a "upos", a "upos" to a
 * "dram"); the int results may encode failure -- confirm against the
 * implementation.
 */
118 extern ce_dispact_t cmd_mem_name2type(const char *, int);
|
|
119 extern int cmd_synd2upos(uint16_t);
|
|
120 extern int cmd_upos2dram(uint16_t);
|
|
/*
 * CE and UE entry points.  cmd_ce() and cmd_ue() share one five-argument
 * signature ending in a cmd_errcl_t; cmd_ce_common() and cmd_ue_common()
 * match the cmd_xe_handler_f function type.
 */
121 extern cmd_evdisp_t cmd_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
|
|
122 const char *, cmd_errcl_t);
|
|
123 extern cmd_evdisp_t cmd_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
|
|
124 const char *, cmd_errcl_t);
|
|
125 extern cmd_evdisp_t cmd_ce_common(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
|
|
126 const char *, uint64_t, uint8_t, uint16_t, uint8_t,
|
|
127 ce_dispact_t, uint64_t, nvlist_t *);
|
|
128 extern cmd_evdisp_t cmd_ue_common(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
|
|
129 const char *, uint64_t, uint8_t, uint16_t, uint8_t,
|
|
130 ce_dispact_t, uint64_t, nvlist_t *);
|
|
/*
 * Syndrome check plus the per-object close routines.  The close routines
 * take an untyped void * as their second argument.
 * NOTE(review): presumably that pointer is the cmd_dimm_t / cmd_bank_t /
 * cmd_branch_t being closed -- confirm against the callers.
 * cmd_branch_close() and cmd_fb() are declared only when sun4v is defined.
 */
131 extern cmd_evdisp_t cmd_mem_synd_check(fmd_hdl_t *, uint64_t, uint8_t,
|
|
132 uint16_t, uint8_t, cmd_cpu_t *);
|
|
133 extern void cmd_dimm_close(fmd_hdl_t *, void *);
|
|
134 extern void cmd_bank_close(fmd_hdl_t *, void *);
|
|
135 #ifdef sun4v
|
|
136 extern void cmd_branch_close(fmd_hdl_t *, void *);
|
|
137 extern cmd_evdisp_t cmd_fb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
|
|
138 const char *, cmd_errcl_t);
|
|
139 #endif
|
|
140
|
|
141 /*
|
|
142 * US-IIIi I/O, Remote and Foreign Read memory errors
|
|
143 *
|
|
144 * When one processor or I/O bridge attempts to read memory local to
|
|
145 * another processor, one each of IOCE/IOUE/RCE/RUE and FRC/FRU will be
|
|
146 * generated, depending on the type of error. Both the IOxE/RxE and the FRx
|
|
147 * are needed, as each contains data necessary to the diagnosis of the error.
|
|
148 * Upon receipt of one of the errors, we wait until we receive the other.
|
|
149 * When the pair has been successfully received and matched, a CE or UE,
|
|
150 * as appropriate, is synthesized from the data in the matched ereports.
|
|
151 * The synthesized ereports are handled by the normal CE and UE mechanisms.
|
|
152 */
|
|
/*
 * All four handlers below share the parameter list and return type of
 * cmd_ce() and cmd_ue().
 */
153 extern cmd_evdisp_t cmd_frx(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
|
|
154 const char *, cmd_errcl_t);
|
|
155 extern cmd_evdisp_t cmd_rxe(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
|
|
156 const char *, cmd_errcl_t);
|
|
157 extern cmd_evdisp_t cmd_ioxe(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
|
|
158 const char *, cmd_errcl_t);
|
|
159 extern cmd_evdisp_t cmd_ioxe_sec(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
|
|
160 const char *, cmd_errcl_t);
|
|
/*
 * Takes the same five leading arguments as cmd_frx()/cmd_rxe()/cmd_ioxe()
 * plus a second cmd_errcl_t, matchmask.
 * NOTE(review): presumably matchmask selects which queued error classes may
 * pair with this event -- confirm against the implementation.
 *
 * The string parameter is named class_name rather than class: this header
 * wraps its declarations in extern "C" for C++ consumers, and class is a
 * reserved word in C++.  Prototype parameter names are not part of the
 * function's type, so callers and the definition are unaffected.
 */
161 extern cmd_evdisp_t cmd_rxefrx_common(fmd_hdl_t *hdl, fmd_event_t *ep,
|
|
162 nvlist_t *nvl, const char *class_name, cmd_errcl_t clcode,
|
|
163 cmd_errcl_t matchmask);
|
|
164
|
|
165 /*
|
|
166 * A list of received IOxE/RxE/FRx ereports is maintained for correlation
|
|
167 * purposes (see above). These two routines manage the addition of new
|
|
168 * ereports, and the retrieval of existing ones. Pruning of the list is
|
|
169 * handled automatically.
 *
 * NOTE(review): the text above says "retrieval", but the second routine is
 * named cmd_iorxefrx_free and, like cmd_iorxefrx_queue, returns void --
 * confirm whether "freeing" is what is meant.
|
|
170 */
|
|
171 extern void cmd_iorxefrx_queue(fmd_hdl_t *, cmd_iorxefrx_t *);
|
|
172 extern void cmd_iorxefrx_free(fmd_hdl_t *, cmd_iorxefrx_t *);
|
|
173
|
|
/*
 * FMRI helpers: one returns a const string from an nvlist_t, the other two
 * return an nvlist_t pointer.
 * NOTE(review): this header does not state who owns the returned string or
 * nvlist_t, nor whether NULL can be returned -- confirm in the
 * implementation before freeing or dereferencing.
 */
174 extern const char *cmd_fmri_get_unum(nvlist_t *);
|
|
175 extern nvlist_t *cmd_mem_fmri_create(const char *);
|
|
176 extern nvlist_t *cmd_mem_fmri_derive(fmd_hdl_t *, uint64_t, uint64_t, uint16_t);
|
|
177
|
|
/*
 * Case, SERD-name, statistics and threshold helpers.  The two
 * *_serdnm_create() routines return a non-const char *.
 * NOTE(review): presumably the caller must release that string -- confirm
 * the matching free routine in the implementation.
 */
178 extern void cmd_mem_case_restore(fmd_hdl_t *, cmd_case_t *, fmd_case_t *,
|
|
179 const char *, const char *);
|
|
180 extern char *cmd_mem_serdnm_create(fmd_hdl_t *, const char *, const char *);
|
|
181 extern char *cmd_page_serdnm_create(fmd_hdl_t *, const char *, uint64_t);
|
|
182 extern void cmd_mem_retirestat_create(fmd_hdl_t *, fmd_stat_t *, const char *,
|
|
183 uint64_t, const char *);
|
|
184 extern int cmd_mem_thresh_check(fmd_hdl_t *, uint_t);
|
|
185 extern ulong_t cmd_mem_get_phys_pages(fmd_hdl_t *);
|
|
186
|
|
/*
 * Routines taking only the fmd handle (plus an id_t for cmd_mem_timeout()).
 * NOTE(review): presumably called from the module's timer, garbage-collection
 * and teardown paths respectively -- confirm against the callers.
 */
187 extern void cmd_mem_timeout(fmd_hdl_t *, id_t);
|
|
188 extern void cmd_mem_gc(fmd_hdl_t *);
|
|
189 extern void cmd_mem_fini(fmd_hdl_t *);
|
|
190
|
|
191 #ifdef __cplusplus
|
|
192 }
|
|
193 #endif
|
|
194
|
|
195 #endif /* _CMD_MEM_H */
|