Print this page
patch tsoome-feedback
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/fm/modules/sun4v/cpumem-retire/cma_cpu_sun4v.c
+++ new/usr/src/cmd/fm/modules/sun4v/cpumem-retire/cma_cpu_sun4v.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 #include <cma.h>
27 27
28 28 #include <sys/fm/ldom.h>
29 29 #include <sys/fm/protocol.h>
30 30 #include <fm/fmd_fmri.h>
31 31 #include <fm/libtopo.h>
32 32
33 33 #include <assert.h>
34 34 #include <fcntl.h>
35 35 #include <unistd.h>
36 36 #include <errno.h>
37 37 #include <strings.h>
38 38
39 39 #include <sys/types.h>
40 40 #include <sys/processor.h>
41 41
42 42 extern ldom_hdl_t *cma_lhp;
43 43
44 44 /*ARGSUSED*/
45 45 int
46 46 cpu_blacklist_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, boolean_t repair)
47 47 {
48 48 if (repair)
49 49 return (ldom_fmri_unblacklist(cma_lhp, fmri));
50 50 else
51 51 return (ldom_fmri_blacklist(cma_lhp, fmri));
52 52 }
53 53
54 54 int
55 55 cma_cpu_blacklist(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
56 56 boolean_t repair)
57 57 {
58 58 nvlist_t *fmri;
59 59 int rc, err;
60 60
61 61 /*
62 62 * Some platforms have special unums for the E$ DIMMs. If we're dealing
63 63 * with a platform that has these unums, one will have been added to the
64 64 * fault as the resource. We'll use that for the blacklisting. If we
65 65 * can't find a resource, we'll fall back to the ASRU.
66 66 */
67 67 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &fmri) != 0)
68 68 fmri = asru;
69 69
70 70 rc = cpu_blacklist_cmd(hdl, fmri, repair);
71 71 err = errno;
72 72
73 73 if (rc < 0 && err != ENOTSUP) {
74 74 errno = err;
75 75 return (-1);
76 76 }
77 77
78 78 return (0);
79 79 }
80 80
81 81 /*ARGSUSED*/
82 82 static int
83 83 cpu_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, int cmd)
84 84 {
85 85 int rc = 0;
86 86 char *scheme;
87 87
88 88 /*
89 89 * We're using topo retire if the fmri is in "hc" scheme.
90 90 */
91 91 if (nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) == 0 &&
92 92 strcmp(scheme, FM_FMRI_SCHEME_HC) == 0) {
93 93 if (cmd != P_STATUS) {
94 94 errno = EINVAL;
95 95 return (-1);
96 96 }
97 97 rc = fmd_nvl_fmri_service_state(hdl, fmri);
98 98 switch (rc) {
99 99 case FMD_SERVICE_STATE_UNUSABLE:
100 100 return (P_FAULTED);
101 101 case -1:
102 102 return (-1);
103 103 default:
104 104 return (P_ONLINE);
105 105 }
106 106 }
107 107
108 108 switch (cmd & ~P_FORCED) {
109 109 case P_STATUS:
110 110 rc = ldom_fmri_status(cma_lhp, fmri);
111 111 break;
112 112 case P_FAULTED:
113 113 rc = ldom_fmri_retire(cma_lhp, fmri);
114 114 break;
115 115 case P_ONLINE:
116 116 rc = ldom_fmri_unretire(cma_lhp, fmri);
117 117 break;
118 118 default:
119 119 errno = EINVAL;
120 120 return (-1);
121 121 }
122 122
123 123 if (rc != P_OFFLINE && rc != P_ONLINE && rc != P_FAULTED) {
124 124 errno = rc;
125 125 return (-1);
126 126 }
127 127
128 128 return (rc);
129 129 }
130 130
131 131 void
132 132 cma_cpu_start_retry(fmd_hdl_t *hdl, nvlist_t *fmri, const char *uuid,
133 133 boolean_t repair)
134 134 {
135 135 cma_cpu_t *cpu;
136 136 char *scheme;
137 137 uint_t cpuid;
138 138 nvlist_t *asru = NULL;
139 139 topo_hdl_t *thp;
140 140 int err;
141 141
142 142 if (repair || nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) != 0)
143 143 return;
144 144 if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) {
145 145 if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &cpuid) != 0)
146 146 return;
147 147 } else if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) {
148 148 return;
149 149 } else {
150 150 /* lookup cpuid from ASRU */
151 151 thp = fmd_fmri_topo_hold(TOPO_VERSION);
152 152 if (thp != NULL) {
153 153 (void) topo_fmri_asru(thp, fmri, &asru, &err);
154 154 fmd_fmri_topo_rele(thp);
155 155 }
156 156 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
157 157 nvlist_free(asru);
158 158 return;
159 159 }
160 160 }
161 161
162 162 /*
163 163 * check to see if the cpu has been offline.
164 164 */
165 165 fmd_hdl_debug(hdl, "cpu %u is not offline yet - sleeping\n", cpuid);
166 166
167 167 /*
168 168 * Create a cpu node and add to the head of the cpu list
169 169 */
170 170 cpu = fmd_hdl_zalloc(hdl, sizeof (cma_cpu_t), FMD_SLEEP);
171 171 (void) nvlist_dup(fmri, &cpu->cpu_fmri, 0);
172 172 if (uuid != NULL)
173 173 cpu->cpu_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);
174 174
175 175 cpu->cpuid = cpuid;
176 176 cpu->cpu_next = cma.cma_cpus;
177 177 cma.cma_cpus = cpu;
178 178
179 179 if (cma.cma_cpu_timerid != 0)
180 180 fmd_timer_remove(hdl, cma.cma_cpu_timerid);
181 181
182 182 cma.cma_cpu_curdelay = cma.cma_cpu_mindelay;
183 183
184 184 cma.cma_cpu_timerid =
185 185 fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
186 186 }
187 187
188 188
189 189 int
190 190 cma_cpu_statechange(fmd_hdl_t *hdl, nvlist_t *asru, const char *uuid,
191 191 int cpustate, boolean_t repair)
192 192 {
193 193 int i;
194 194 uint_t cpuid;
195 195
196 196 if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
197 197 fmd_hdl_debug(hdl, "missing '%s'\n", FM_FMRI_CPU_ID);
198 198 cma_stats.bad_flts.fmds_value.ui64++;
199 199 return (CMA_RA_FAILURE);
200 200 }
201 201
202 202 /*
203 203 * cpu offlining using ldom_fmri_retire() may be asynchronous, so we
204 204 * have to set the timer and check the cpu status later.
205 205 */
206 206 for (i = 0; i < cma.cma_cpu_tries;
207 207 i++, (void) nanosleep(&cma.cma_cpu_delay, NULL)) {
208 208 if (cpu_cmd(hdl, asru, cpustate) != -1) {
209 209 if (repair)
210 210 cma_stats.cpu_repairs.fmds_value.ui64++;
211 211 else
212 212 cma_stats.cpu_flts.fmds_value.ui64++;
213 213 break;
214 214 }
215 215 }
216 216
217 217 if (i >= cma.cma_cpu_tries) {
218 218 cma_stats.cpu_fails.fmds_value.ui64++;
219 219 }
220 220
221 221 cma_cpu_start_retry(hdl, asru, uuid, repair);
222 222
223 223 return (CMA_RA_FAILURE);
224 224 }
225 225
226 226 static int
227 227 cpu_retry(fmd_hdl_t *hdl, cma_cpu_t *cpu)
228 228 {
229 229 int rc = 0;
230 230
231 231 fmd_hdl_debug(hdl, "cpu_retry()\n");
232 232
233 233 if (cpu->cpu_fmri == NULL) {
234 234 return (1);
235 235 }
236 236
237 237 if (!fmd_nvl_fmri_present(hdl, cpu->cpu_fmri)) {
238 238 fmd_hdl_debug(hdl, "cpu %u is not present", cpu->cpuid);
239 239 return (1);
240 240 }
241 241
242 242 rc = cpu_cmd(hdl, cpu->cpu_fmri, P_STATUS);
243 243 if (rc == P_FAULTED || rc == P_OFFLINE) {
244 244 fmd_hdl_debug(hdl, "cpu %u is offlined on retry %u\n",
245 245 cpu->cpuid, cpu->cpu_nretries);
246 246 cma_stats.cpu_flts.fmds_value.ui64++;
247 247
248 248 if (cpu->cpu_uuid != NULL)
249 249 fmd_case_uuclose(hdl, cpu->cpu_uuid);
250 250 return (1); /* success */
251 251 }
252 252
253 253 if (rc == -1) {
254 254 fmd_hdl_debug(hdl, "failed to retry cpu %u\n", cpu->cpuid);
↓ open down ↓ |
254 lines elided |
↑ open up ↑ |
255 255 cma_stats.page_fails.fmds_value.ui64++;
256 256 return (1); /* give up */
257 257 }
258 258
259 259 return (0);
260 260 }
261 261
262 262 static void
263 263 cma_cpu_free(fmd_hdl_t *hdl, cma_cpu_t *cpu)
264 264 {
265 - if (cpu->cpu_fmri != NULL)
266 - nvlist_free(cpu->cpu_fmri);
265 + nvlist_free(cpu->cpu_fmri);
267 266 if (cpu->cpu_uuid != NULL)
268 267 fmd_hdl_strfree(hdl, cpu->cpu_uuid);
269 268 fmd_hdl_free(hdl, cpu, sizeof (cma_cpu_t));
270 269 }
271 270
272 271 void
273 272 cma_cpu_retry(fmd_hdl_t *hdl)
274 273 {
275 274 cma_cpu_t **cpup;
276 275
277 276 fmd_hdl_debug(hdl, "cma_cpu_retry: timer fired\n");
278 277
279 278 cma.cma_cpu_timerid = 0;
280 279
281 280 cpup = &cma.cma_cpus;
282 281 while (*cpup != NULL) {
283 282 cma_cpu_t *cpu = *cpup;
284 283
285 284 if (cpu_retry(hdl, cpu)) {
286 285 /*
287 286 * Successful retry or we're giving up - remove from
288 287 * the list
289 288 */
290 289 *cpup = cpu->cpu_next;
291 290
292 291 cma_cpu_free(hdl, cpu);
293 292 } else {
294 293 cpu->cpu_nretries++;
295 294 cpup = &cpu->cpu_next;
296 295 }
297 296 }
298 297
299 298 if (cma.cma_cpus == NULL)
300 299 return; /* no more cpus */
301 300
302 301 /*
303 302 * We still have cpus to check. Back the delay
304 303 * off, and schedule a retry.
305 304 */
306 305 cma.cma_cpu_curdelay = MIN(cma.cma_cpu_curdelay * 2,
307 306 cma.cma_cpu_maxdelay);
308 307
309 308 fmd_hdl_debug(hdl, "scheduled cpu offline retry for %llu secs\n",
310 309 (u_longlong_t)(cma.cma_cpu_curdelay / NANOSEC));
311 310
312 311 cma.cma_cpu_timerid =
313 312 fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
314 313 }
315 314
316 315 void
317 316 cma_cpu_fini(fmd_hdl_t *hdl)
318 317 {
319 318 cma_cpu_t *cpu;
320 319
321 320 while ((cpu = cma.cma_cpus) != NULL) {
322 321 cma.cma_cpus = cpu->cpu_next;
323 322 cma_cpu_free(hdl, cpu);
324 323 }
325 324 }
↓ open down ↓ |
49 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX