Print this page
XXXX introduce drv_sectohz
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/sun4u/io/pci/pci_ecc.c
+++ new/usr/src/uts/sun4u/io/pci/pci_ecc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /*
26 26 * PCI ECC support
27 27 */
28 28
29 29 #include <sys/types.h>
30 30 #include <sys/systm.h> /* for strrchr */
31 31 #include <sys/kmem.h>
32 32 #include <sys/sunddi.h>
33 33 #include <sys/intr.h>
34 34 #include <sys/async.h> /* struct async_flt */
35 35 #include <sys/ddi_impldefs.h>
36 36 #include <sys/machsystm.h>
37 37 #include <sys/sysmacros.h>
38 38 #include <sys/fm/protocol.h>
39 39 #include <sys/fm/util.h>
40 40 #include <sys/fm/io/pci.h>
41 41 #include <sys/fm/io/sun4upci.h>
42 42 #include <sys/fm/io/ddi.h>
43 43 #include <sys/pci/pci_obj.h> /* ld/st physio */
44 44 #include <sys/cpuvar.h>
45 45 #include <sys/errclassify.h>
46 46 #include <sys/cpu_module.h>
47 47 #include <sys/async.h>
48 48
49 49 /*LINTLIBRARY*/
50 50
/* Forward declarations for file-local helpers. */
51 51 static void ecc_disable(ecc_t *, int);
52 52 static void ecc_delayed_ce(void *);
53 53 static uint64_t ecc_read_afsr(ecc_intr_info_t *);
54 54 static void ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err);
55 55
/* Busy-wait (passed to DELAY() in ecc_intr()) before fm_panic on a fatal UE */
56 56 clock_t pci_ecc_panic_delay = 200;
57 57 int ecc_ce_delay_secs = 6; /* number of sec to delay reenabling of CEs */
58 58 int ecc_ce_delayed = 1; /* global for enabling/disabling CE delay */
59 59
/*
 * Allocate and initialize the per-bridge ecc_t, record the physical
 * addresses of the ECC control/status and UE/CE fault registers (offsets
 * from the shared control block base), enable error detection via
 * ecc_configure(), and register the nowait-disable hook with the
 * system error handling code.
 */
60 60 void
61 61 ecc_create(pci_t *pci_p)
62 62 {
63 63 #ifdef DEBUG
64 64 dev_info_t *dip = pci_p->pci_dip;
65 65 #endif
66 66 uint64_t cb_base_pa = pci_p->pci_cb_p->cb_base_pa;
67 67 ecc_t *ecc_p;
68 68
69 69 ecc_p = (ecc_t *)kmem_zalloc(sizeof (ecc_t), KM_SLEEP);
70 70 ecc_p->ecc_pci_cmn_p = pci_p->pci_common_p;
71 71 pci_p->pci_ecc_p = ecc_p;
72 72
73 73 ecc_p->ecc_ue.ecc_p = ecc_p;
74 74 ecc_p->ecc_ue.ecc_type = CBNINTR_UE;
75 75 ecc_p->ecc_ce.ecc_p = ecc_p;
76 76 ecc_p->ecc_ce.ecc_type = CBNINTR_CE;
77 77
78 78 pci_ecc_setup(ecc_p);
79 79
80 80 /*
81 81 * Determine the physical addresses of the ECC control/status
82 82 * register and the UE/CE AFSR/AFAR fault registers.
83 83 */
84 84 ecc_p->ecc_csr_pa = cb_base_pa + COMMON_ECC_CSR_OFFSET;
85 85 ecc_p->ecc_ue.ecc_afsr_pa = cb_base_pa + COMMON_UE_AFSR_OFFSET;
86 86 ecc_p->ecc_ue.ecc_afar_pa = cb_base_pa + COMMON_UE_AFAR_OFFSET;
87 87 ecc_p->ecc_ce.ecc_afsr_pa = cb_base_pa + COMMON_CE_AFSR_OFFSET;
88 88 ecc_p->ecc_ce.ecc_afar_pa = cb_base_pa + COMMON_CE_AFAR_OFFSET;
89 89
90 90 DEBUG1(DBG_ATTACH, dip, "ecc_create: csr=%x\n", ecc_p->ecc_csr_pa);
91 91 DEBUG2(DBG_ATTACH, dip, "ecc_create: ue_afsr=%x, ue_afar=%x\n",
92 92 ecc_p->ecc_ue.ecc_afsr_pa, ecc_p->ecc_ue.ecc_afar_pa);
93 93 DEBUG2(DBG_ATTACH, dip, "ecc_create: ce_afsr=%x, ce_afar=%x\n",
94 94 ecc_p->ecc_ce.ecc_afsr_pa, ecc_p->ecc_ce.ecc_afar_pa);
95 95
96 96 ecc_configure(pci_p);
97 97
98 98 /*
99 99 * Register routines to be called from system error handling code.
100 100 */
101 101 bus_func_register(BF_TYPE_ERRDIS, (busfunc_t)ecc_disable_nowait, ecc_p);
102 102 }
103 103
/*
 * Install the UE and CE interrupt handlers for this bridge.
 * Returns DDI_SUCCESS, or the first failing pci_ecc_add_intr() result
 * (a UE handler added before a CE failure is not rolled back here).
 */
104 104 int
105 105 ecc_register_intr(pci_t *pci_p)
106 106 {
107 107 ecc_t *ecc_p = pci_p->pci_ecc_p;
108 108 int ret;
109 109
110 110 /*
111 111 * Install the UE and CE error interrupt handlers.
112 112 */
113 113 if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue)) !=
114 114 DDI_SUCCESS)
115 115 return (ret);
116 116 if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce)) !=
117 117 DDI_SUCCESS)
118 118 return (ret);
119 119
120 120 return (DDI_SUCCESS);
121 121 }
122 122
/*
 * Tear down ECC state created by ecc_create(): quiesce and remove the
 * UE/CE interrupts, unregister the error-handling hook, cancel any
 * pending delayed-CE timeout, and free the ecc_t.
 */
123 123 void
124 124 ecc_destroy(pci_t *pci_p)
125 125 {
126 126 ecc_t *ecc_p = pci_p->pci_ecc_p;
127 127
128 128 DEBUG0(DBG_DETACH, pci_p->pci_dip, "ecc_destroy:\n");
129 129
130 130 /*
131 131 * Disable UE and CE ECC error interrupts.
132 132 */
133 133 ecc_disable_wait(ecc_p);
134 134
135 135 /*
136 136 * Remove the ECC interrupt handlers.
137 137 */
138 138 pci_ecc_rem_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue);
139 139 pci_ecc_rem_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce);
140 140
141 141 /*
142 142 * Unregister our error handling functions.
143 143 */
144 144 bus_func_unregister(BF_TYPE_ERRDIS,
145 145 (busfunc_t)ecc_disable_nowait, ecc_p);
146 146 /*
147 147 * If a timer has been set, unset it.
148 148 * (untimeout(9F) with an expired/zero id is harmless.)
149 149 */
150 150 (void) untimeout(ecc_p->ecc_to_id);
151 151
152 152 kmem_free(ecc_p, sizeof (ecc_t));
153 153 pci_p->pci_ecc_p = NULL;
154 154 }
154 154
/*
 * Clear any latched primary/secondary UE and CE AFSR bits (write-one-to-
 * clear via stdphysio), then enable ECC detection — and, if
 * ecc_error_intr_enable is set, UE/CE interrupt generation — in the CSR.
 */
155 155 void
156 156 ecc_configure(pci_t *pci_p)
157 157 {
158 158 ecc_t *ecc_p = pci_p->pci_ecc_p;
159 159 dev_info_t *dip = pci_p->pci_dip;
160 160 uint64_t l;
161 161
162 162 /*
163 163 * Clear any pending ECC errors.
164 164 */
165 165 DEBUG0(DBG_ATTACH, dip, "ecc_configure: clearing UE and CE errors\n");
166 166 l = (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_PE_SHIFT) |
167 167 (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_SE_SHIFT);
168 168 stdphysio(ecc_p->ecc_ue.ecc_afsr_pa, l);
169 169
170 170 l = (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_PE_SHIFT) |
171 171 (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_SE_SHIFT);
172 172 stdphysio(ecc_p->ecc_ce.ecc_afsr_pa, l);
173 173
174 174 /*
175 175 * Enable ECC error detections via the control register.
176 176 */
177 177 DEBUG0(DBG_ATTACH, dip, "ecc_configure: enabling UE CE detection\n");
178 178 l = COMMON_ECC_CTRL_ECC_EN;
179 179 if (ecc_error_intr_enable)
180 180 l |= COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN;
181 181 stdphysio(ecc_p->ecc_csr_pa, l);
182 182 }
183 183
/* Enable the UE and CE interrupt mappings in the control block. */
184 184 void
185 185 ecc_enable_intr(pci_t *pci_p)
186 186 {
187 187 cb_enable_nintr(pci_p, CBNINTR_UE);
188 188 cb_enable_nintr(pci_p, CBNINTR_CE);
189 189 }
190 190
/* Disable ECC interrupts, waiting for any in-flight handler to finish. */
191 191 void
192 192 ecc_disable_wait(ecc_t *ecc_p)
193 193 {
194 194 ecc_disable(ecc_p, IB_INTR_WAIT);
195 195 }
196 196
/*
 * Non-blocking disable, registered as a bus_func (BF_TYPE_ERRDIS)
 * callback; always reports BF_NONE (no further action needed).
 */
197 197 uint_t
198 198 ecc_disable_nowait(ecc_t *ecc_p)
199 199 {
200 200 ecc_disable(ecc_p, IB_INTR_NOWAIT);
201 201 return (BF_NONE);
202 202 }
203 203
/*
 * Common disable path: clear the UE/CE interrupt-enable bits in the ECC
 * CSR, then disable the control block interrupt mappings.  `wait' is
 * IB_INTR_WAIT or IB_INTR_NOWAIT, passed through to cb_disable_nintr().
 */
204 204 static void
205 205 ecc_disable(ecc_t *ecc_p, int wait)
206 206 {
207 207 cb_t *cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;
208 208 uint64_t csr_pa = ecc_p->ecc_csr_pa;
209 209 uint64_t csr = lddphysio(csr_pa);
210 210
211 211 csr &= ~(COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN);
212 212 stdphysio(csr_pa, csr);
213 213
214 214 cb_disable_nintr(cb_p, CBNINTR_UE, wait);
215 215 cb_disable_nintr(cb_p, CBNINTR_CE, wait);
216 216 }
217 217
218 218 /*
219 219 * I/O ECC error handling:
220 220 *
221 221 * Below are the generic functions that handle PCI(pcisch, pcipsy) detected
222 222 * ECC errors.
223 223 *
224 224 * The registered interrupt handler for both pcisch and pcipsy is ecc_intr(),
225 225 * its function is to receive the error, capture some state, and pass that on
226 226 * to the ecc_err_handler() for reporting purposes.
227 227 *
228 228 * ecc_err_handler() gathers more state(via ecc_errstate_get) and attempts
229 229 * to handle and report the error. ecc_err_handler() must determine if we need
230 230 * to panic due to this error (via pci_ecc_classify, which also decodes the
231 231 * ECC afsr), and if any side effects exist that may have caused or are due
232 232 * to this error. PBM errors related to the ECC error may exist, to report
233 233 * them we call pci_pbm_err_handler() and call ndi_fm_handler_dispatch() so
234 234 * that the child devices can log their pci errors.
235 235 *
236 236 * To report the error we must also get the syndrome and unum, which cannot
237 237 * be done in high level interrupted context. Therefore we have an error
238 238 * queue(pci_ecc_queue) which we dispatch errors to, to report the errors
239 239 * (ecc_err_drain()).
240 240 *
241 241 * ecc_err_drain() will be called when either the softint is triggered
242 242 * or the system is panicking. Either way it will gather more information
243 243 * about the error from the CPU(via ecc_cpu_call(), ecc.c), attempt to
244 244 * retire the faulty page(if error is a UE), and report the detected error.
245 245 *
246 246 * ecc_delayed_ce() is called via timeout from ecc_err_handler() following
247 247 * the receipt of a CE interrupt. It will be called after 6ms and check to
248 248 * see if any new CEs are present, if so we will log and another timeout will
249 249 * be set by ecc_err_handler(). If no CEs are present then it will re-enable
250 250 * CEs by clearing the previous interrupt. This is to keep the system going
251 251 * in the event of a CE storm.
252 252 */
253 253
254 254 /*
255 255 * Function used to get ECC AFSR register
256 256 */
257 257 static uint64_t
258 258 ecc_read_afsr(ecc_intr_info_t *ecc_ii_p)
259 259 {
260 260 uint_t i;
261 261 uint64_t afsr = 0ull;
262 262
263 263 ASSERT((ecc_ii_p->ecc_type == CBNINTR_UE) ||
264 264 (ecc_ii_p->ecc_type == CBNINTR_CE));
        /* No error-pending bits on this chip: a single read suffices. */
265 265 if (!ecc_ii_p->ecc_errpndg_mask)
266 266 return (lddphysio(ecc_ii_p->ecc_afsr_pa));
267 267
        /* Re-read until ERRPNDG clears or we give up after N retries. */
268 268 for (i = 0; i < pci_ecc_afsr_retries; i++) {
269 269
270 270 /*
271 271 * If we timeout, the logging routine will
272 272 * know because it will see the ERRPNDG bits
273 273 * set in the AFSR.
274 274 */
275 275 afsr = lddphysio(ecc_ii_p->ecc_afsr_pa);
276 276 if ((afsr & ecc_ii_p->ecc_errpndg_mask) == 0)
277 277 break;
278 278 }
279 279 return (afsr);
280 280 }
281 281
282 282 /*
283 283 * IO detected ECC error interrupt handler, calls ecc_err_handler to post
284 284 * error reports and handle the interrupt. Re-entry into ecc_err_handler
285 285 * is protected by the per-chip mutex pci_fm_mutex.
286 286 */
287 287 uint_t
288 288 ecc_intr(caddr_t a)
289 289 {
290 290 ecc_intr_info_t *ecc_ii_p = (ecc_intr_info_t *)a;
291 291 ecc_t *ecc_p = ecc_ii_p->ecc_p;
292 292 pci_common_t *cmn_p = ecc_p->ecc_pci_cmn_p;
293 293 ecc_errstate_t ecc_err;
294 294 int ret = DDI_FM_OK;
295 295
        /* Snapshot error state on the stack; ENA ties related ereports. */
296 296 bzero(&ecc_err, sizeof (ecc_errstate_t));
297 297 ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
298 298 ecc_err.ecc_ii_p = *ecc_ii_p;
299 299 ecc_err.ecc_p = ecc_p;
300 300 ecc_err.ecc_caller = PCI_ECC_CALL;
301 301
302 302 mutex_enter(&cmn_p->pci_fm_mutex);
303 303 ret = ecc_err_handler(&ecc_err);
304 304 mutex_exit(&cmn_p->pci_fm_mutex);
305 305 if (ret == DDI_FM_FATAL) {
306 306 /*
307 307 * Need delay here to allow CPUs to handle related traps,
308 308 * such as FRUs for USIIIi systems.
309 309 */
310 310 DELAY(pci_ecc_panic_delay);
311 311 fm_panic("Fatal PCI UE Error");
312 312 }
313 313
314 314 return (DDI_INTR_CLAIMED);
315 315 }
316 316
317 317 /*
318 318 * Function used to gather IO ECC error state.
319 319 */
320 320 static void
321 321 ecc_errstate_get(ecc_errstate_t *ecc_err_p)
322 322 {
323 323 ecc_t *ecc_p;
324 324 uint_t bus_id;
325 325
326 326 ASSERT(ecc_err_p);
327 327
328 328 ecc_p = ecc_err_p->ecc_ii_p.ecc_p;
329 329 bus_id = ecc_p->ecc_pci_cmn_p->pci_common_id;
330 330
331 331 ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));
332 332 /*
333 333 * Read the fault registers.
334 334 */
335 335 ecc_err_p->ecc_afsr = ecc_read_afsr(&ecc_err_p->ecc_ii_p);
336 336 ecc_err_p->ecc_afar = lddphysio(ecc_err_p->ecc_ii_p.ecc_afar_pa);
337 337
        /* Byte offset of the faulting word within the 64-byte AFAR line. */
338 338 ecc_err_p->ecc_offset = ((ecc_err_p->ecc_afsr &
339 339 ecc_err_p->ecc_ii_p.ecc_offset_mask) >>
340 340 ecc_err_p->ecc_ii_p.ecc_offset_shift) <<
341 341 ecc_err_p->ecc_ii_p.ecc_size_log2;
342 342
        /* Populate the generic async_flt used by common ECC code. */
343 343 ecc_err_p->ecc_aflt.flt_id = gethrtime();
344 344 ecc_err_p->ecc_aflt.flt_stat = ecc_err_p->ecc_afsr;
345 345 ecc_err_p->ecc_aflt.flt_addr = P2ALIGN(ecc_err_p->ecc_afar, 64) +
346 346 ecc_err_p->ecc_offset;
347 347 ecc_err_p->ecc_aflt.flt_bus_id = bus_id;
348 348 ecc_err_p->ecc_aflt.flt_inst = CPU->cpu_id;
349 349 ecc_err_p->ecc_aflt.flt_status = ECC_IOBUS;
350 350 ecc_err_p->ecc_aflt.flt_in_memory =
351 351 (pf_is_memory(ecc_err_p->ecc_afar >> MMU_PAGESHIFT))? 1: 0;
352 352 ecc_err_p->ecc_aflt.flt_class = BUS_FAULT;
353 353 }
354 354
355 355 /*
356 356 * ecc_pci_check: Called by ecc_err_handler() this function is responsible
357 357 * for calling pci_pbm_err_handler() for both sides of the schizo/psycho
358 358 * and calling their children error handlers(via ndi_fm_handler_dispatch()).
359 359 */
360 360 static int
361 361 ecc_pci_check(ecc_t *ecc_p, uint64_t fme_ena)
362 362 {
363 363 ddi_fm_error_t derr;
364 364 int i;
365 365 int ret;
366 366
367 367 ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));
368 368
        /* Reuse the caller's ENA so related ereports correlate. */
369 369 bzero(&derr, sizeof (ddi_fm_error_t));
370 370 derr.fme_version = DDI_FME_VERSION;
371 371 derr.fme_ena = fme_ena;
372 372 ret = DDI_FM_NONFATAL;
373 373
374 374 /*
375 375 * Need to report any PBM errors which may have caused or
376 376 * resulted from this error.
377 377 *
378 378 * Each psycho or schizo is represented by a pair of pci nodes
379 379 * in the device tree.
380 380 */
381 381 for (i = 0; i < 2; i++) {
382 382 dev_info_t *dip;
383 383 pci_t *pci_p;
384 384
385 385 /* Make sure PBM PCI node exists */
386 386 pci_p = ecc_p->ecc_pci_cmn_p->pci_p[i];
387 387 if (pci_p == NULL)
388 388 continue;
389 389
390 390 dip = pci_p->pci_dip;
391 391 if (pci_pbm_err_handler(dip, &derr, (void *)pci_p,
392 392 PCI_ECC_CALL) == DDI_FM_FATAL)
393 393 ret = DDI_FM_FATAL;
394 394 }
        /* Fatal on either side wins; otherwise report nonfatal. */
395 395 if (ret == DDI_FM_FATAL)
396 396 return (DDI_FM_FATAL);
397 397 else
398 398 return (DDI_FM_NONFATAL);
399 399 }
400 400
401 401 /*
402 402 * Function used to handle and log IO detected ECC errors, can be called by
403 403 * ecc_intr and pci_err_callback(trap callback). Protected by pci_fm_mutex.
404 404 */
405 405 int
406 406 ecc_err_handler(ecc_errstate_t *ecc_err_p)
407 407 {
408 408 uint64_t pri_err, sec_err;
409 409 ecc_intr_info_t *ecc_ii_p = &ecc_err_p->ecc_ii_p;
410 410 ecc_t *ecc_p = ecc_ii_p->ecc_p;
411 411 pci_t *pci_p;
412 412 cb_t *cb_p;
413 413 int fatal = 0;
414 414 int nonfatal = 0;
415 415 ecc_errstate_t ecc_sec_err;
416 416 uint64_t sec_tmp;
417 417 int i;
        /* Secondary-error bits checked one transaction type at a time. */
418 418 uint64_t afsr_err[] = { COMMON_ECC_AFSR_E_PIO,
419 419 COMMON_ECC_AFSR_E_DRD,
420 420 COMMON_ECC_AFSR_E_DWR };
421 421
422 422
423 423 ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));
424 424
        /* Either side of the bridge pair works for ereport posting. */
425 425 pci_p = ecc_p->ecc_pci_cmn_p->pci_p[0];
426 426 if (pci_p == NULL)
427 427 pci_p = ecc_p->ecc_pci_cmn_p->pci_p[1];
428 428
429 429 cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;
430 430
431 431 ecc_errstate_get(ecc_err_p);
        /* Split AFSR into primary and (cumulative) secondary error bits. */
432 432 pri_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_PE_SHIFT) &
433 433 COMMON_ECC_UE_AFSR_E_MASK;
434 434
435 435 sec_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_SE_SHIFT) &
436 436 COMMON_ECC_UE_AFSR_E_MASK;
437 437
438 438 switch (ecc_ii_p->ecc_type) {
439 439 case CBNINTR_UE:
440 440 if (pri_err) {
441 441 ecc_err_p->ecc_aflt.flt_synd =
442 442 pci_ecc_get_synd(ecc_err_p->ecc_afsr);
443 443 ecc_err_p->ecc_pri = 1;
444 444 pci_ecc_classify(pri_err, ecc_err_p);
445 445 errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
446 446 sizeof (ecc_errstate_t),
447 447 ecc_err_p->ecc_aflt.flt_panic);
448 448 }
449 449 if (sec_err) {
450 450 ecc_sec_err = *ecc_err_p;
451 451 ecc_sec_err.ecc_pri = 0;
452 452 /*
453 453 * Secondary errors are cumulative so we need to loop
454 454 * through to capture them all.
455 455 */
456 456 for (i = 0; i < 3; i++) {
457 457 sec_tmp = sec_err & afsr_err[i];
458 458 if (sec_tmp) {
459 459 pci_ecc_classify(sec_tmp, &ecc_sec_err);
460 460 ecc_ereport_post(pci_p->pci_dip,
461 461 &ecc_sec_err);
462 462 }
463 463 }
464 464 }
465 465 /*
466 466 * Check for PCI bus errors that may have resulted from or
467 467 * caused this UE.
468 468 */
469 469 if (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
470 470 ecc_pci_check(ecc_p, ecc_err_p->ecc_ena) == DDI_FM_FATAL)
471 471 ecc_err_p->ecc_aflt.flt_panic = 1;
472 472
        /* Save the fault for the panic path's memory diagnosis. */
473 473 if (ecc_err_p->ecc_aflt.flt_panic &&
474 474 ecc_err_p->ecc_aflt.flt_in_memory)
475 475 panic_aflt = ecc_err_p->ecc_aflt;
476 476
477 477 if (ecc_err_p->ecc_aflt.flt_panic) {
478 478 /*
479 479 * Disable all further errors since this will be
480 480 * treated as a fatal error.
481 481 */
482 482 (void) ecc_disable_nowait(ecc_p);
483 483 fatal++;
484 484 }
485 485 break;
486 486
487 487 case CBNINTR_CE:
488 488 if (pri_err) {
489 489 ecc_err_p->ecc_pri = 1;
490 490 pci_ecc_classify(pri_err, ecc_err_p);
491 491 ecc_err_p->ecc_aflt.flt_synd =
492 492 pci_ecc_get_synd(ecc_err_p->ecc_afsr);
493 493 ce_scrub(&ecc_err_p->ecc_aflt);
494 494 errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
495 495 sizeof (ecc_errstate_t), ERRORQ_ASYNC);
496 496 nonfatal++;
497 497 }
498 498 if (sec_err) {
499 499 ecc_sec_err = *ecc_err_p;
500 500 ecc_sec_err.ecc_pri = 0;
501 501 /*
502 502 * Secondary errors are cumulative so we need to loop
503 503 * through to capture them all.
504 504 */
505 505 for (i = 0; i < 3; i++) {
506 506 sec_tmp = sec_err & afsr_err[i];
507 507 if (sec_tmp) {
508 508 pci_ecc_classify(sec_tmp, &ecc_sec_err);
509 509 ecc_ereport_post(pci_p->pci_dip,
510 510 &ecc_sec_err);
511 511 }
512 512 }
513 513 nonfatal++;
514 514 }
515 515 break;
516 516
517 517 default:
        /* Unknown interrupt type: nothing to log or clear. */
518 518 return (DDI_FM_OK);
519 519 }
520 520 /* Clear the errors */
521 521 stdphysio(ecc_ii_p->ecc_afsr_pa, ecc_err_p->ecc_afsr);
522 522 /*
523 523 * Clear the interrupt if called by ecc_intr and UE error or if called
524 524 * by ecc_intr and CE error and delayed CE interrupt handling is
525 525 * turned off.
526 526 */
527 527 if ((ecc_err_p->ecc_caller == PCI_ECC_CALL &&
528 528 ecc_ii_p->ecc_type == CBNINTR_UE && !fatal) ||
529 529 (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
530 530 ecc_ii_p->ecc_type == CBNINTR_CE && !ecc_ce_delayed))
531 531 cb_clear_nintr(cb_p, ecc_ii_p->ecc_type);
532 532 if (!fatal && !nonfatal)
533 533 return (DDI_FM_OK);
534 534 else if (fatal)
535 535 return (DDI_FM_FATAL);
536 536 return (DDI_FM_NONFATAL);
537 537 }
538 538
539 539 /*
540 540 * Called from ecc_err_drain below for CBINTR_CE case.
541 541 */
542 542 static int
543 543 ecc_err_cexdiag(ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
544 544 {
545 545 struct async_flt *ecc = &ecc_err->ecc_aflt;
546 546 uint64_t errors;
547 547
        /* Skip CE diagnosis when the page can't be (or is already) retired. */
548 548 if (page_retire_check(ecc->flt_addr, &errors) == EINVAL) {
549 549 CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_NOPP);
550 550 return (0);
551 551 } else if (errors != PR_OK) {
552 552 CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_PAGEDET);
553 553 return (0);
554 554 } else {
        /* Nonzero means the element was recirculated for leaky diagnosis. */
555 555 return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep,
556 556 offsetof(ecc_errstate_t, ecc_aflt)));
557 557 }
558 558 }
559 559
560 560 /*
561 561 * Function used to drain pci_ecc_queue, either during panic or after softint
562 562 * is generated, to log IO detected ECC errors.
563 563 */
564 564 /*ARGSUSED*/
565 565 void
566 566 ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
567 567 {
568 568 struct async_flt *ecc = &ecc_err->ecc_aflt;
569 569 pci_t *pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[0];
570 570 int ecc_type = ecc_err->ecc_ii_p.ecc_type;
571 571
572 572 if (pci_p == NULL)
573 573 pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[1];
574 574
575 575 if (ecc->flt_class == RECIRC_BUS_FAULT) {
576 576 /*
577 577 * Perform any additional actions that occur after the
578 578 * ecc_err_cexdiag below and post the ereport.
579 579 */
580 580 ecc->flt_class = BUS_FAULT;
581 581 ecc_err->ecc_err_type = flt_to_error_type(ecc);
582 582 ecc_ereport_post(pci_p->pci_dip, ecc_err);
583 583 return;
584 584 }
585 585
        /* Get syndrome/unum from the CPU module (not safe at high PIL). */
586 586 ecc_cpu_call(ecc, ecc_err->ecc_unum, (ecc_type == CBNINTR_UE) ?
587 587 ECC_IO_UE : ECC_IO_CE);
588 588
589 589 switch (ecc_type) {
590 590 case CBNINTR_UE:
591 591 if (ecc_err->ecc_pg_ret == 1) {
592 592 (void) page_retire(ecc->flt_addr, PR_UE);
593 593 }
594 594 ecc_err->ecc_err_type = flt_to_error_type(ecc);
595 595 break;
596 596
597 597 case CBNINTR_CE:
↓ open down ↓ |
597 lines elided |
↑ open up ↑ |
598 598 /*
599 599 * Setup timeout (if CE detected via interrupt) to
600 600 * re-enable CE interrupts if no more CEs are detected.
601 601 * This is to protect against CE storms.
602 602 */
603 603 if (ecc_ce_delayed &&
604 604 ecc_err->ecc_caller == PCI_ECC_CALL &&
605 605 ecc_err->ecc_p->ecc_to_id == 0) {
606 606 ecc_err->ecc_p->ecc_to_id = timeout(ecc_delayed_ce,
607 607 (void *)ecc_err->ecc_p,
        /* delay is ecc_ce_delay_secs seconds; drv_sectohz() avoids the
         * manual secs * MICROSEC conversion used previously */
608 - drv_usectohz((clock_t)ecc_ce_delay_secs *
609 - MICROSEC));
608 + drv_sectohz((clock_t)ecc_ce_delay_secs));
610 609 }
611 610
612 611 /* ecc_err_cexdiag returns nonzero to recirculate */
613 612 if (CE_XDIAG_EXT_ALG_APPLIED(ecc->flt_disp) &&
614 613 ecc_err_cexdiag(ecc_err, eqep))
615 614 return;
616 615 ecc_err->ecc_err_type = flt_to_error_type(ecc);
617 616 break;
618 617 }
619 618
620 619 ecc_ereport_post(pci_p->pci_dip, ecc_err);
621 620 }
622 621
/*
 * timeout(9F) callback armed by ecc_err_drain() ecc_ce_delay_secs after a
 * CE interrupt.  If the CE AFSR shows no new primary errors, re-enable CE
 * interrupts by clearing the previously-latched interrupt; otherwise run
 * the new error through ecc_err_handler() (which will re-arm the timeout).
 */
623 622 static void
624 623 ecc_delayed_ce(void *arg)
625 624 {
626 625 ecc_t *ecc_p = (ecc_t *)arg;
627 626 pci_common_t *cmn_p;
628 627 cb_t *cb_p;
629 628
630 629 ASSERT(ecc_p);
631 630
632 631 cmn_p = ecc_p->ecc_pci_cmn_p;
633 632 cb_p = cmn_p->pci_common_cb_p;
634 633 /*
635 634 * If no more CE errors are found then enable interrupts(by
636 635 * clearing the previous interrupt), else send in for logging
637 636 * and the timeout should be set again.
638 637 */
639 638 ecc_p->ecc_to_id = 0;
640 639 if (!((ecc_read_afsr(&ecc_p->ecc_ce) >>
641 640 COMMON_ECC_UE_AFSR_PE_SHIFT) & COMMON_ECC_UE_AFSR_E_MASK)) {
642 641 cb_clear_nintr(cb_p, ecc_p->ecc_ce.ecc_type);
643 642 } else {
644 643 ecc_errstate_t ecc_err;
645 644
646 645 bzero(&ecc_err, sizeof (ecc_errstate_t));
647 646 ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
648 647 ecc_err.ecc_ii_p = ecc_p->ecc_ce;
649 648 ecc_err.ecc_p = ecc_p;
650 649 ecc_err.ecc_caller = PCI_ECC_CALL;
651 650
652 651 mutex_enter(&cmn_p->pci_fm_mutex);
653 652 (void) ecc_err_handler(&ecc_err);
654 653 mutex_exit(&cmn_p->pci_fm_mutex);
655 654 }
656 655 }
657 656
658 657 /*
659 658 * Function used to post IO detected ECC ereports.
660 659 */
661 660 static void
662 661 ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err)
663 662 {
664 663 char buf[FM_MAX_CLASS], dev_path[MAXPATHLEN], *ptr;
665 664 struct i_ddi_fmhdl *fmhdl = DEVI(dip)->devi_fmhdl;
666 665 nvlist_t *ereport, *detector;
667 666 nv_alloc_t *nva;
668 667 errorq_elem_t *eqep;
669 668
670 669 /*
671 670 * We do not use ddi_fm_ereport_post because we need to set a
672 671 * special detector here. Since we do not have a device path for
673 672 * the bridge chip we use what we think it should be to aid in
674 673 * diagnosis. This path fmri is created by pci_fmri_create()
675 674 * during initialization.
676 675 */
677 676 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s.%s", DDI_IO_CLASS,
678 677 ecc_err->ecc_bridge_type, ecc_err->ecc_aflt.flt_erpt_class);
679 678
        /* Generate an ENA only if the caller did not supply one. */
680 679 ecc_err->ecc_ena = ecc_err->ecc_ena ? ecc_err->ecc_ena :
681 680 fm_ena_generate(0, FM_ENA_FMT1);
682 681
        /* Silently drop the ereport if the errorq is exhausted. */
683 682 eqep = errorq_reserve(fmhdl->fh_errorq);
684 683 if (eqep == NULL)
685 684 return;
686 685
687 686 ereport = errorq_elem_nvl(fmhdl->fh_errorq, eqep);
688 687 nva = errorq_elem_nva(fmhdl->fh_errorq, eqep);
689 688 detector = fm_nvlist_create(nva);
690 689
691 690 ASSERT(ereport);
692 691 ASSERT(nva);
693 692 ASSERT(detector);
694 693
        /* Truncate the path at the first ',' (drop the function suffix). */
695 694 ddi_pathname(dip, dev_path);
696 695 ptr = strrchr(dev_path, (int)',');
697 696
698 697 if (ptr)
699 698 *ptr = '\0';
700 699
701 700 fm_fmri_dev_set(detector, FM_DEV_SCHEME_VERSION, NULL, dev_path,
702 701 NULL, NULL);
703 702
        /* Primary errors carry a mem-scheme resource FMRI when available. */
704 703 if (ecc_err->ecc_pri) {
705 704 if ((ecc_err->ecc_fmri = fm_nvlist_create(nva)) != NULL) {
706 705 char sid[DIMM_SERIAL_ID_LEN] = "";
707 706 uint64_t offset = (uint64_t)-1;
708 707 int len;
709 708 int ret;
710 709
711 710 ret = cpu_get_mem_sid(ecc_err->ecc_unum, sid,
712 711 DIMM_SERIAL_ID_LEN, &len);
713 712
714 713 if (ret == 0) {
715 714 (void) cpu_get_mem_offset(
716 715 ecc_err->ecc_aflt.flt_addr, &offset);
717 716 }
718 717
719 718 fm_fmri_mem_set(ecc_err->ecc_fmri,
720 719 FM_MEM_SCHEME_VERSION, NULL, ecc_err->ecc_unum,
721 720 (ret == 0) ? sid : NULL, offset);
722 721 }
723 722 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
724 723 ecc_err->ecc_ena, detector,
725 724 PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
726 725 PCI_ECC_AFAR, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_addr,
727 726 PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
728 727 PCI_ECC_SYND, DATA_TYPE_UINT16, ecc_err->ecc_aflt.flt_synd,
729 728 PCI_ECC_TYPE, DATA_TYPE_STRING, ecc_err->ecc_err_type,
730 729 PCI_ECC_DISP, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_disp,
731 730 PCI_ECC_RESOURCE, DATA_TYPE_NVLIST, ecc_err->ecc_fmri,
732 731 NULL);
733 732 } else {
        /* Secondary errors: AFAR/syndrome are not valid, so post less. */
734 733 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
735 734 ecc_err->ecc_ena, detector,
736 735 PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
737 736 PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
738 737 NULL);
739 738 }
740 739 errorq_commit(fmhdl->fh_errorq, eqep, ERRORQ_ASYNC);
741 740 }
↓ open down ↓ |
122 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX