/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * PCI ECC support
 */

#include <sys/types.h>
#include <sys/systm.h>		/* for strrchr */
#include <sys/kmem.h>
#include <sys/sunddi.h>
#include <sys/intr.h>
#include <sys/async.h>		/* struct async_flt */
#include <sys/ddi_impldefs.h>
#include <sys/machsystm.h>
#include <sys/sysmacros.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/fm/io/pci.h>
#include <sys/fm/io/sun4upci.h>
#include <sys/fm/io/ddi.h>
#include <sys/pci/pci_obj.h>	/* ld/st physio */
#include <sys/cpuvar.h>
#include <sys/errclassify.h>
#include <sys/cpu_module.h>
#include <sys/async.h>

/*LINTLIBRARY*/

static void ecc_disable(ecc_t *, int);
static void ecc_delayed_ce(void *);
static uint64_t ecc_read_afsr(ecc_intr_info_t *);
static void ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err);

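/* usec delay before fm_panic() on a fatal UE (see ecc_intr()) */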
clock_t pci_ecc_panic_delay = 200;
int ecc_ce_delay_secs = 6;	/* number of sec to delay reenabling of CEs */
int ecc_ce_delayed = 1;		/* global for enabling/disabling CE delay */

void
ecc_create(pci_t *pci_p)
{
#ifdef DEBUG
	dev_info_t *dip = pci_p->pci_dip;
#endif
	uint64_t cb_base_pa = pci_p->pci_cb_p->cb_base_pa;
	ecc_t *ecc_p;

	ecc_p = (ecc_t *)kmem_zalloc(sizeof (ecc_t), KM_SLEEP);
	ecc_p->ecc_pci_cmn_p = pci_p->pci_common_p;
	pci_p->pci_ecc_p = ecc_p;

	ecc_p->ecc_ue.ecc_p = ecc_p;
	ecc_p->ecc_ue.ecc_type = CBNINTR_UE;
	ecc_p->ecc_ce.ecc_p = ecc_p;
	ecc_p->ecc_ce.ecc_type = CBNINTR_CE;

	pci_ecc_setup(ecc_p);

	/*
	 * Determine the physical addresses of the ECC control/status
	 * register and the UE and CE AFSR/AFAR registers.
	 */
	ecc_p->ecc_csr_pa = cb_base_pa + COMMON_ECC_CSR_OFFSET;
	ecc_p->ecc_ue.ecc_afsr_pa = cb_base_pa + COMMON_UE_AFSR_OFFSET;
	ecc_p->ecc_ue.ecc_afar_pa = cb_base_pa + COMMON_UE_AFAR_OFFSET;
	ecc_p->ecc_ce.ecc_afsr_pa = cb_base_pa + COMMON_CE_AFSR_OFFSET;
	ecc_p->ecc_ce.ecc_afar_pa = cb_base_pa + COMMON_CE_AFAR_OFFSET;

	DEBUG1(DBG_ATTACH, dip, "ecc_create: csr=%x\n", ecc_p->ecc_csr_pa);
	DEBUG2(DBG_ATTACH, dip, "ecc_create: ue_afsr=%x, ue_afar=%x\n",
	    ecc_p->ecc_ue.ecc_afsr_pa, ecc_p->ecc_ue.ecc_afar_pa);
	DEBUG2(DBG_ATTACH, dip, "ecc_create: ce_afsr=%x, ce_afar=%x\n",
	    ecc_p->ecc_ce.ecc_afsr_pa, ecc_p->ecc_ce.ecc_afar_pa);

	ecc_configure(pci_p);

	/*
	 * Register routines to be called from system error handling code.
	 */
	bus_func_register(BF_TYPE_ERRDIS, (busfunc_t)ecc_disable_nowait, ecc_p);
}

int
ecc_register_intr(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;
	int ret;

	/*
	 * Install the UE and CE error interrupt handlers.
	 */
	if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue)) !=
	    DDI_SUCCESS)
		return (ret);
	if ((ret = pci_ecc_add_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce)) !=
	    DDI_SUCCESS)
		return (ret);

	return (DDI_SUCCESS);
}

void
ecc_destroy(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;

	DEBUG0(DBG_DETACH, pci_p->pci_dip, "ecc_destroy:\n");

	/*
	 * Disable UE and CE ECC error interrupts.
	 */
	ecc_disable_wait(ecc_p);

	/*
	 * Remove the ECC interrupt handlers.
	 */
	pci_ecc_rem_intr(pci_p, CBNINTR_UE, &ecc_p->ecc_ue);
	pci_ecc_rem_intr(pci_p, CBNINTR_CE, &ecc_p->ecc_ce);

	/*
	 * Unregister our error handling functions.
	 */
	bus_func_unregister(BF_TYPE_ERRDIS,
	    (busfunc_t)ecc_disable_nowait, ecc_p);
	/*
	 * If a timer has been set, unset it.
	 */
	(void) untimeout(ecc_p->ecc_to_id);

	kmem_free(ecc_p, sizeof (ecc_t));
	pci_p->pci_ecc_p = NULL;
}

void
ecc_configure(pci_t *pci_p)
{
	ecc_t *ecc_p = pci_p->pci_ecc_p;
	dev_info_t *dip = pci_p->pci_dip;
	uint64_t l;

	/*
	 * Clear any pending ECC errors.
	 */
	DEBUG0(DBG_ATTACH, dip, "ecc_configure: clearing UE and CE errors\n");
	l = (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_PE_SHIFT) |
	    (COMMON_ECC_UE_AFSR_E_MASK << COMMON_ECC_UE_AFSR_SE_SHIFT);
	stdphysio(ecc_p->ecc_ue.ecc_afsr_pa, l);

	l = (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_PE_SHIFT) |
	    (COMMON_ECC_CE_AFSR_E_MASK << COMMON_ECC_CE_AFSR_SE_SHIFT);
	stdphysio(ecc_p->ecc_ce.ecc_afsr_pa, l);

	/*
	 * Enable ECC error detection via the control register.
	 */
	DEBUG0(DBG_ATTACH, dip, "ecc_configure: enabling UE CE detection\n");
	l = COMMON_ECC_CTRL_ECC_EN;
	if (ecc_error_intr_enable)
		l |= COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN;
	stdphysio(ecc_p->ecc_csr_pa, l);
}

void
ecc_enable_intr(pci_t *pci_p)
{
	cb_enable_nintr(pci_p, CBNINTR_UE);
	cb_enable_nintr(pci_p, CBNINTR_CE);
}

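/*
 * ecc_disable_wait() and ecc_disable_nowait() are thin wrappers around
 * ecc_disable(); the nowait form is also registered as the BF_TYPE_ERRDIS
 * bus_func callback in ecc_create().
 */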
void
ecc_disable_wait(ecc_t *ecc_p)
{
	ecc_disable(ecc_p, IB_INTR_WAIT);
}

uint_t
ecc_disable_nowait(ecc_t *ecc_p)
{
	ecc_disable(ecc_p, IB_INTR_NOWAIT);
	return (BF_NONE);
}

static void
ecc_disable(ecc_t *ecc_p, int wait)
{
	cb_t *cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;
	uint64_t csr_pa = ecc_p->ecc_csr_pa;
	uint64_t csr = lddphysio(csr_pa);

	csr &= ~(COMMON_ECC_CTRL_UE_INTEN | COMMON_ECC_CTRL_CE_INTEN);
	stdphysio(csr_pa, csr);

	cb_disable_nintr(cb_p, CBNINTR_UE, wait);
	cb_disable_nintr(cb_p, CBNINTR_CE, wait);
}

/*
 * I/O ECC error handling:
 *
 * Below are the generic functions that handle PCI (pcisch, pcipsy) detected
 * ECC errors.
 *
 * The registered interrupt handler for both pcisch and pcipsy is ecc_intr();
 * its function is to receive the error, capture some state, and pass that on
 * to ecc_err_handler() for reporting purposes.
 *
 * ecc_err_handler() gathers more state (via ecc_errstate_get()) and attempts
 * to handle and report the error. ecc_err_handler() must determine if we need
 * to panic due to this error (via pci_ecc_classify(), which also decodes the
 * ECC AFSR), and if any side effects exist that may have caused or are due
 * to this error. PBM errors related to the ECC error may exist; to report
 * them we call pci_pbm_err_handler() and ndi_fm_handler_dispatch() so that
 * the child devices can log their PCI errors.
 *
 * To report the error we must also get the syndrome and unum, which cannot
 * be done in high-level interrupt context. Therefore we have an error
 * queue (pci_ecc_queue) to which we dispatch errors so that they can be
 * reported (ecc_err_drain()).
 *
 * ecc_err_drain() will be called when either the softint is triggered
 * or the system is panicking. Either way it will gather more information
 * about the error from the CPU (via ecc_cpu_call(), ecc.c), attempt to
 * retire the faulty page (if the error is a UE), and report the detected
 * error.
 *
 * ecc_delayed_ce() is called via timeout from ecc_err_handler() following
 * the receipt of a CE interrupt.  It runs after ecc_ce_delay_secs seconds
 * (6 by default) and checks whether any new CEs are present; if so they are
 * logged and another timeout will be set (by ecc_err_handler()).  If no CEs
 * are present it re-enables CE interrupts by clearing the previous
 * interrupt.  This is to keep the system going in the event of a CE storm.
 */
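
/*
 * In outline:
 *
 *   ecc_intr() / pci_err_callback()
 *     -> ecc_err_handler()          (called with pci_fm_mutex held)
 *          -> ecc_errstate_get()    capture AFSR/AFAR state
 *          -> pci_ecc_classify()    decode the error, decide whether to panic
 *          -> ecc_pci_check()       report related PBM errors (UE case)
 *          -> errorq_dispatch()     queue the error for ecc_err_drain()
 *   ecc_err_drain()                 syndrome/unum, page retirement, ereport
 *   ecc_delayed_ce()                re-enable CEs after the delay
 */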

/*
 * Function used to read the ECC AFSR register.  If the hardware supports it
 * (ecc_errpndg_mask is non-zero), re-read the register until the ERRPNDG
 * bits clear, up to pci_ecc_afsr_retries times, so that an update in
 * progress can complete.
 */
static uint64_t
ecc_read_afsr(ecc_intr_info_t *ecc_ii_p)
{
	uint_t i;
	uint64_t afsr = 0ull;

	ASSERT((ecc_ii_p->ecc_type == CBNINTR_UE) ||
	    (ecc_ii_p->ecc_type == CBNINTR_CE));
	if (!ecc_ii_p->ecc_errpndg_mask)
		return (lddphysio(ecc_ii_p->ecc_afsr_pa));

	for (i = 0; i < pci_ecc_afsr_retries; i++) {

		/*
		 * If we timeout, the logging routine will
		 * know because it will see the ERRPNDG bits
		 * set in the AFSR.
		 */
		afsr = lddphysio(ecc_ii_p->ecc_afsr_pa);
		if ((afsr & ecc_ii_p->ecc_errpndg_mask) == 0)
			break;
	}
	return (afsr);
}

/*
 * IO-detected ECC error interrupt handler; calls ecc_err_handler() to post
 * error reports and handle the interrupt. Re-entry into ecc_err_handler()
 * is protected by the per-chip mutex pci_fm_mutex.
 */
uint_t
ecc_intr(caddr_t a)
{
	ecc_intr_info_t *ecc_ii_p = (ecc_intr_info_t *)a;
	ecc_t *ecc_p = ecc_ii_p->ecc_p;
	pci_common_t *cmn_p = ecc_p->ecc_pci_cmn_p;
	ecc_errstate_t ecc_err;
	int ret = DDI_FM_OK;

	bzero(&ecc_err, sizeof (ecc_errstate_t));
	ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
	ecc_err.ecc_ii_p = *ecc_ii_p;
	ecc_err.ecc_p = ecc_p;
	ecc_err.ecc_caller = PCI_ECC_CALL;

	mutex_enter(&cmn_p->pci_fm_mutex);
	ret = ecc_err_handler(&ecc_err);
	mutex_exit(&cmn_p->pci_fm_mutex);
	if (ret == DDI_FM_FATAL) {
		/*
		 * Need delay here to allow CPUs to handle related traps,
		 * such as FRUs for USIIIi systems.
		 */
		DELAY(pci_ecc_panic_delay);
		fm_panic("Fatal PCI UE Error");
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * Function used to gather IO ECC error state.
 */
static void
ecc_errstate_get(ecc_errstate_t *ecc_err_p)
{
	ecc_t *ecc_p;
	uint_t bus_id;

	ASSERT(ecc_err_p);

	ecc_p = ecc_err_p->ecc_ii_p.ecc_p;
	bus_id = ecc_p->ecc_pci_cmn_p->pci_common_id;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));
	/*
	 * Read the fault registers.
	 */
	ecc_err_p->ecc_afsr = ecc_read_afsr(&ecc_err_p->ecc_ii_p);
	ecc_err_p->ecc_afar = lddphysio(ecc_err_p->ecc_ii_p.ecc_afar_pa);

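	/*
	 * The AFSR offset field identifies which part of the 64-byte
	 * transfer actually failed; extract it, scale it by the per-chip
	 * transfer size (ecc_size_log2), and add it to the 64-byte-aligned
	 * AFAR below to form the precise fault address.
	 */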
	ecc_err_p->ecc_offset = ((ecc_err_p->ecc_afsr &
	    ecc_err_p->ecc_ii_p.ecc_offset_mask) >>
	    ecc_err_p->ecc_ii_p.ecc_offset_shift) <<
	    ecc_err_p->ecc_ii_p.ecc_size_log2;

	ecc_err_p->ecc_aflt.flt_id = gethrtime();
	ecc_err_p->ecc_aflt.flt_stat = ecc_err_p->ecc_afsr;
	ecc_err_p->ecc_aflt.flt_addr = P2ALIGN(ecc_err_p->ecc_afar, 64) +
	    ecc_err_p->ecc_offset;
	ecc_err_p->ecc_aflt.flt_bus_id = bus_id;
	ecc_err_p->ecc_aflt.flt_inst = CPU->cpu_id;
	ecc_err_p->ecc_aflt.flt_status = ECC_IOBUS;
	ecc_err_p->ecc_aflt.flt_in_memory =
	    (pf_is_memory(ecc_err_p->ecc_afar >> MMU_PAGESHIFT))? 1: 0;
	ecc_err_p->ecc_aflt.flt_class = BUS_FAULT;
}

/*
 * ecc_pci_check: called by ecc_err_handler(), this function is responsible
 * for calling pci_pbm_err_handler() for both sides of the schizo/psycho
 * and for calling the children's error handlers (via
 * ndi_fm_handler_dispatch()).
 */
static int
ecc_pci_check(ecc_t *ecc_p, uint64_t fme_ena)
{
	ddi_fm_error_t derr;
	int i;
	int ret;

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	bzero(&derr, sizeof (ddi_fm_error_t));
	derr.fme_version = DDI_FME_VERSION;
	derr.fme_ena = fme_ena;
	ret = DDI_FM_NONFATAL;

	/*
	 * Need to report any PBM errors which may have caused or
	 * resulted from this error.
	 *
	 * Each psycho or schizo is represented by a pair of pci nodes
	 * in the device tree.
	 */
	for (i = 0; i < 2; i++) {
		dev_info_t *dip;
		pci_t *pci_p;

		/* Make sure PBM PCI node exists */
		pci_p = ecc_p->ecc_pci_cmn_p->pci_p[i];
		if (pci_p == NULL)
			continue;

		dip = pci_p->pci_dip;
		if (pci_pbm_err_handler(dip, &derr, (void *)pci_p,
		    PCI_ECC_CALL) == DDI_FM_FATAL)
			ret = DDI_FM_FATAL;
	}
	if (ret == DDI_FM_FATAL)
		return (DDI_FM_FATAL);
	else
		return (DDI_FM_NONFATAL);
}

/*
 * Function used to handle and log IO-detected ECC errors; it can be called
 * by ecc_intr() and by pci_err_callback() (the trap callback).  Protected
 * by pci_fm_mutex.
 */
int
ecc_err_handler(ecc_errstate_t *ecc_err_p)
{
	uint64_t pri_err, sec_err;
	ecc_intr_info_t *ecc_ii_p = &ecc_err_p->ecc_ii_p;
	ecc_t *ecc_p = ecc_ii_p->ecc_p;
	pci_t *pci_p;
	cb_t *cb_p;
	int fatal = 0;
	int nonfatal = 0;
	ecc_errstate_t ecc_sec_err;
	uint64_t sec_tmp;
	int i;
	uint64_t afsr_err[] = { COMMON_ECC_AFSR_E_PIO,
				COMMON_ECC_AFSR_E_DRD,
				COMMON_ECC_AFSR_E_DWR };

	ASSERT(MUTEX_HELD(&ecc_p->ecc_pci_cmn_p->pci_fm_mutex));

	pci_p = ecc_p->ecc_pci_cmn_p->pci_p[0];
	if (pci_p == NULL)
		pci_p = ecc_p->ecc_pci_cmn_p->pci_p[1];

	cb_p = ecc_p->ecc_pci_cmn_p->pci_common_cb_p;

	ecc_errstate_get(ecc_err_p);
	pri_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_PE_SHIFT) &
	    COMMON_ECC_UE_AFSR_E_MASK;

	sec_err = (ecc_err_p->ecc_afsr >> COMMON_ECC_UE_AFSR_SE_SHIFT) &
	    COMMON_ECC_UE_AFSR_E_MASK;

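	/*
	 * Primary errors are classified and dispatched to pci_ecc_queue for
	 * full logging in ecc_err_drain().  Secondary error bits accumulate
	 * per transaction type (PIO/DRD/DWR), so each set bit is classified
	 * and posted as its own ereport here.
	 */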
	switch (ecc_ii_p->ecc_type) {
	case CBNINTR_UE:
		if (pri_err) {
			ecc_err_p->ecc_aflt.flt_synd =
			    pci_ecc_get_synd(ecc_err_p->ecc_afsr);
			ecc_err_p->ecc_pri = 1;
			pci_ecc_classify(pri_err, ecc_err_p);
			errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
			    sizeof (ecc_errstate_t),
			    ecc_err_p->ecc_aflt.flt_panic);
		}
		if (sec_err) {
			ecc_sec_err = *ecc_err_p;
			ecc_sec_err.ecc_pri = 0;
			/*
			 * Secondary errors are cumulative so we need to loop
			 * through to capture them all.
			 */
			for (i = 0; i < 3; i++) {
				sec_tmp = sec_err & afsr_err[i];
				if (sec_tmp) {
					pci_ecc_classify(sec_tmp, &ecc_sec_err);
					ecc_ereport_post(pci_p->pci_dip,
					    &ecc_sec_err);
				}
			}
		}
		/*
		 * Check for PCI bus errors that may have resulted from or
		 * caused this UE.
		 */
		if (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
		    ecc_pci_check(ecc_p, ecc_err_p->ecc_ena) == DDI_FM_FATAL)
			ecc_err_p->ecc_aflt.flt_panic = 1;

		if (ecc_err_p->ecc_aflt.flt_panic &&
		    ecc_err_p->ecc_aflt.flt_in_memory)
			panic_aflt = ecc_err_p->ecc_aflt;

		if (ecc_err_p->ecc_aflt.flt_panic) {
			/*
			 * Disable all further errors since this will be
			 * treated as a fatal error.
			 */
			(void) ecc_disable_nowait(ecc_p);
			fatal++;
		}
		break;

	case CBNINTR_CE:
		if (pri_err) {
			ecc_err_p->ecc_pri = 1;
			pci_ecc_classify(pri_err, ecc_err_p);
			ecc_err_p->ecc_aflt.flt_synd =
			    pci_ecc_get_synd(ecc_err_p->ecc_afsr);
			ce_scrub(&ecc_err_p->ecc_aflt);
			errorq_dispatch(pci_ecc_queue, (void *)ecc_err_p,
			    sizeof (ecc_errstate_t), ERRORQ_ASYNC);
			nonfatal++;
		}
		if (sec_err) {
			ecc_sec_err = *ecc_err_p;
			ecc_sec_err.ecc_pri = 0;
			/*
			 * Secondary errors are cumulative so we need to loop
			 * through to capture them all.
			 */
			for (i = 0; i < 3; i++) {
				sec_tmp = sec_err & afsr_err[i];
				if (sec_tmp) {
					pci_ecc_classify(sec_tmp, &ecc_sec_err);
					ecc_ereport_post(pci_p->pci_dip,
					    &ecc_sec_err);
				}
			}
			nonfatal++;
		}
		break;

	default:
		return (DDI_FM_OK);
	}
	/* Clear the errors */
	stdphysio(ecc_ii_p->ecc_afsr_pa, ecc_err_p->ecc_afsr);
	/*
	 * Clear the interrupt if we were called by ecc_intr() for a UE that
	 * is not fatal, or for a CE when delayed CE interrupt handling is
	 * turned off.
	 */
	if ((ecc_err_p->ecc_caller == PCI_ECC_CALL &&
	    ecc_ii_p->ecc_type == CBNINTR_UE && !fatal) ||
	    (ecc_err_p->ecc_caller == PCI_ECC_CALL &&
	    ecc_ii_p->ecc_type == CBNINTR_CE && !ecc_ce_delayed))
		cb_clear_nintr(cb_p, ecc_ii_p->ecc_type);
	if (!fatal && !nonfatal)
		return (DDI_FM_OK);
	else if (fatal)
		return (DDI_FM_FATAL);
	return (DDI_FM_NONFATAL);
}

/*
 * Called from ecc_err_drain() below for the CBNINTR_CE case.
 */
static int
ecc_err_cexdiag(ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
	struct async_flt *ecc = &ecc_err->ecc_aflt;
	uint64_t errors;

	if (page_retire_check(ecc->flt_addr, &errors) == EINVAL) {
		CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_NOPP);
		return (0);
	} else if (errors != PR_OK) {
		CE_XDIAG_SETSKIPCODE(ecc->flt_disp, CE_XDIAG_SKIP_PAGEDET);
		return (0);
	} else {
		return (ce_scrub_xdiag_recirc(ecc, pci_ecc_queue, eqep,
		    offsetof(ecc_errstate_t, ecc_aflt)));
	}
}

/*
 * Function used to drain pci_ecc_queue, either during panic or after the
 * softint is generated, to log IO-detected ECC errors.
 */
/*ARGSUSED*/
void
ecc_err_drain(void *not_used, ecc_errstate_t *ecc_err, errorq_elem_t *eqep)
{
	struct async_flt *ecc = &ecc_err->ecc_aflt;
	pci_t *pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[0];
	int ecc_type = ecc_err->ecc_ii_p.ecc_type;

	if (pci_p == NULL)
		pci_p = ecc_err->ecc_p->ecc_pci_cmn_p->pci_p[1];

	if (ecc->flt_class == RECIRC_BUS_FAULT) {
		/*
		 * Perform any additional actions that occur after the
		 * ecc_err_cexdiag below and post the ereport.
		 */
		ecc->flt_class = BUS_FAULT;
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		ecc_ereport_post(pci_p->pci_dip, ecc_err);
		return;
	}

	ecc_cpu_call(ecc, ecc_err->ecc_unum, (ecc_type == CBNINTR_UE) ?
	    ECC_IO_UE : ECC_IO_CE);

	switch (ecc_type) {
	case CBNINTR_UE:
		if (ecc_err->ecc_pg_ret == 1) {
			(void) page_retire(ecc->flt_addr, PR_UE);
		}
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		break;

	case CBNINTR_CE:
		/*
		 * Set up a timeout (if the CE was detected via interrupt) to
		 * re-enable CE interrupts if no more CEs are detected.
		 * This is to protect against CE storms.
		 */
		if (ecc_ce_delayed &&
		    ecc_err->ecc_caller == PCI_ECC_CALL &&
		    ecc_err->ecc_p->ecc_to_id == 0) {
			ecc_err->ecc_p->ecc_to_id = timeout(ecc_delayed_ce,
			    (void *)ecc_err->ecc_p,
			    drv_usectohz((clock_t)ecc_ce_delay_secs *
			    MICROSEC));
		}

		/* ecc_err_cexdiag returns nonzero to recirculate */
		if (CE_XDIAG_EXT_ALG_APPLIED(ecc->flt_disp) &&
		    ecc_err_cexdiag(ecc_err, eqep))
			return;
		ecc_err->ecc_err_type = flt_to_error_type(ecc);
		break;
	}

	ecc_ereport_post(pci_p->pci_dip, ecc_err);
}

static void
ecc_delayed_ce(void *arg)
{
	ecc_t *ecc_p = (ecc_t *)arg;
	pci_common_t *cmn_p;
	cb_t *cb_p;

	ASSERT(ecc_p);

	cmn_p = ecc_p->ecc_pci_cmn_p;
	cb_p = cmn_p->pci_common_cb_p;
	/*
	 * If no more CE errors are found, re-enable CE interrupts (by
	 * clearing the previous interrupt); otherwise send the error in
	 * for logging, and the timeout will be set again.
	 */
	ecc_p->ecc_to_id = 0;
	if (!((ecc_read_afsr(&ecc_p->ecc_ce) >>
	    COMMON_ECC_UE_AFSR_PE_SHIFT) & COMMON_ECC_UE_AFSR_E_MASK)) {
		cb_clear_nintr(cb_p, ecc_p->ecc_ce.ecc_type);
	} else {
		ecc_errstate_t ecc_err;

		bzero(&ecc_err, sizeof (ecc_errstate_t));
		ecc_err.ecc_ena = fm_ena_generate(0, FM_ENA_FMT1);
		ecc_err.ecc_ii_p = ecc_p->ecc_ce;
		ecc_err.ecc_p = ecc_p;
		ecc_err.ecc_caller = PCI_ECC_CALL;

		mutex_enter(&cmn_p->pci_fm_mutex);
		(void) ecc_err_handler(&ecc_err);
		mutex_exit(&cmn_p->pci_fm_mutex);
	}
}

/*
 * Function used to post IO-detected ECC ereports.
 */
static void
ecc_ereport_post(dev_info_t *dip, ecc_errstate_t *ecc_err)
{
	char buf[FM_MAX_CLASS], dev_path[MAXPATHLEN], *ptr;
	struct i_ddi_fmhdl *fmhdl = DEVI(dip)->devi_fmhdl;
	nvlist_t *ereport, *detector;
	nv_alloc_t *nva;
	errorq_elem_t *eqep;

	/*
	 * We do not use ddi_fm_ereport_post() because we need to set a
	 * special detector here.  Since we do not have a device path for
	 * the bridge chip, we use what we think it should be to aid in
	 * diagnosis.  This path FMRI is created by pci_fmri_create()
	 * during initialization.
	 */
	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s.%s", DDI_IO_CLASS,
	    ecc_err->ecc_bridge_type, ecc_err->ecc_aflt.flt_erpt_class);

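	/* Generate an ENA now if the caller did not already supply one. */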
	ecc_err->ecc_ena = ecc_err->ecc_ena ? ecc_err->ecc_ena :
	    fm_ena_generate(0, FM_ENA_FMT1);

	eqep = errorq_reserve(fmhdl->fh_errorq);
	if (eqep == NULL)
		return;

	ereport = errorq_elem_nvl(fmhdl->fh_errorq, eqep);
	nva = errorq_elem_nva(fmhdl->fh_errorq, eqep);
	detector = fm_nvlist_create(nva);

	ASSERT(ereport);
	ASSERT(nva);
	ASSERT(detector);

	ddi_pathname(dip, dev_path);
	ptr = strrchr(dev_path, (int)',');

	if (ptr)
		*ptr = '\0';

	fm_fmri_dev_set(detector, FM_DEV_SCHEME_VERSION, NULL, dev_path,
	    NULL, NULL);

	if (ecc_err->ecc_pri) {
		if ((ecc_err->ecc_fmri = fm_nvlist_create(nva)) != NULL) {
			char sid[DIMM_SERIAL_ID_LEN] = "";
			uint64_t offset = (uint64_t)-1;
			int len;
			int ret;

			ret = cpu_get_mem_sid(ecc_err->ecc_unum, sid,
			    DIMM_SERIAL_ID_LEN, &len);

			if (ret == 0) {
				(void) cpu_get_mem_offset(
				    ecc_err->ecc_aflt.flt_addr, &offset);
			}

			fm_fmri_mem_set(ecc_err->ecc_fmri,
			    FM_MEM_SCHEME_VERSION, NULL, ecc_err->ecc_unum,
			    (ret == 0) ? sid : NULL, offset);
		}
		fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
		    ecc_err->ecc_ena, detector,
		    PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
		    PCI_ECC_AFAR, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_addr,
		    PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
		    PCI_ECC_SYND, DATA_TYPE_UINT16, ecc_err->ecc_aflt.flt_synd,
		    PCI_ECC_TYPE, DATA_TYPE_STRING, ecc_err->ecc_err_type,
		    PCI_ECC_DISP, DATA_TYPE_UINT64, ecc_err->ecc_aflt.flt_disp,
		    PCI_ECC_RESOURCE, DATA_TYPE_NVLIST, ecc_err->ecc_fmri,
		    NULL);
	} else {
		fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
		    ecc_err->ecc_ena, detector,
		    PCI_ECC_AFSR, DATA_TYPE_UINT64, ecc_err->ecc_afsr,
		    PCI_ECC_CTRL, DATA_TYPE_UINT64, ecc_err->ecc_ctrl,
		    NULL);
	}
	errorq_commit(fmhdl->fh_errorq, eqep, ERRORQ_ASYNC);
}