1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #pragma ident   "%Z%%M% %I%     %E% SMI"
  28 
  29 /*
  30  * OPL platform specific functions for
  31  * CPU/Memory error diagnosis engine.
  32  */
  33 #include <cmd.h>
  34 #include <cmd_dimm.h>
  35 #include <cmd_bank.h>
  36 #include <cmd_page.h>
  37 #include <cmd_opl.h>
  38 #include <string.h>
  39 #include <errno.h>
  40 #include <fcntl.h>
  41 #include <unistd.h>
  42 #include <dirent.h>
  43 #include <sys/stat.h>
  44 
  45 #include <sys/fm/protocol.h>
  46 #include <sys/fm/io/opl_mc_fm.h>
  47 #include <sys/async.h>
  48 #include <sys/opl_olympus_regs.h>
  49 #include <sys/fm/cpu/SPARC64-VI.h>
  50 #include <sys/int_const.h>
  51 #include <sys/mutex.h>
  52 #include <sys/dditypes.h>
  53 #include <opl/sys/mc-opl.h>
  54 
  55 /*
  56  * The following is the common function for handling
  57  * memory UE with EID=MEM.
  58  * The error could be detected by either CPU/IO.
  59  */
  60 cmd_evdisp_t
  61 opl_ue_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
  62     int hdlr_type)
  63 {
  64         nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
  65         uint64_t ubc_ue_log_reg, pa;
  66         cmd_page_t *page;
  67 
  68         if (nvlist_lookup_nvlist(nvl,
  69             FM_EREPORT_PAYLOAD_NAME_RESOURCE, &rsrc) != 0)
  70                 return (CMD_EVD_BAD);
  71 
  72         switch (hdlr_type) {
  73         case CMD_OPL_HDLR_CPU:
  74 
  75                 if (nvlist_lookup_uint64(nvl,
  76                     FM_EREPORT_PAYLOAD_NAME_SFAR, &pa) != 0)
  77                         return (CMD_EVD_BAD);
  78 
  79                 fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
  80                     (u_longlong_t)pa);
  81                 break;
  82 
  83         case CMD_OPL_HDLR_IO:
  84 
  85                 if (nvlist_lookup_uint64(nvl, OBERON_UBC_MUE,
  86                     &ubc_ue_log_reg) != 0)
  87                         return (CMD_EVD_BAD);
  88 
  89                 pa = (ubc_ue_log_reg & UBC_UE_ADR_MASK);
  90 
  91                 fmd_hdl_debug(hdl, "cmd_ue_mem: ue_log_reg=%llx\n",
  92                     (u_longlong_t)ubc_ue_log_reg);
  93                 fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
  94                     (u_longlong_t)pa);
  95                 break;
  96 
  97         default:
  98 
  99                 return (CMD_EVD_BAD);
 100         }
 101 
 102         if ((page = cmd_page_lookup(pa)) != NULL &&
 103             page->page_case.cc_cp != NULL &&
 104             fmd_case_solved(hdl, page->page_case.cc_cp))
 105                 return (CMD_EVD_REDUND);
 106 
 107         if (nvlist_dup(rsrc, &asru, 0) != 0) {
 108                 fmd_hdl_debug(hdl, "opl_ue_mem nvlist dup failed\n");
 109                 return (CMD_EVD_BAD);
 110         }
 111 
 112         if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
 113                 nvlist_free(asru);
 114                 CMD_STAT_BUMP(bad_mem_asru);
 115                 return (CMD_EVD_BAD);
 116         }
 117 
 118         if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
 119                 nvlist_free(asru);
 120                 return (CMD_EVD_BAD);
 121         }
 122 
 123         cmd_page_fault(hdl, asru, fru, ep, pa);
 124         nvlist_free(asru);
 125         nvlist_free(fru);
 126         return (CMD_EVD_OK);
 127 }
 128 
 129 /*
 130  * The following is the main function to handle generating
 131  * the sibling cpu suspect list for the CPU detected UE
 132  * error cases.  This is to handle the
 133  * multiple strand/core architecture on the OPL platform.
 134  */
 135 cmd_evdisp_t
 136 cmd_opl_ue_cpu(fmd_hdl_t *hdl, fmd_event_t *ep,
 137     const char *class, const char *fltname,
 138     cmd_ptrsubtype_t ptr, cmd_cpu_t *cpu,
 139     cmd_case_t *cc, uint8_t cpumask)
 140 {
 141         const char *uuid;
 142         cmd_cpu_t *main_cpu, *sib_cpu;
 143         nvlist_t *fmri;
 144         cmd_list_t *cpu_list;
 145         opl_cpu_t *opl_cpu;
 146         uint32_t main_cpuid, nsusp = 1;
 147         uint8_t cert;
 148 
 149         fmd_hdl_debug(hdl,
 150             "Enter OPL_CPUUE_HANDLER for class %x\n", class);
 151 
 152         main_cpu = cpu;
 153         main_cpuid = cpu->cpu_cpuid;
 154 
 155         if (strcmp(fltname, "core") == 0)
 156                 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
 157                     IS_CORE);
 158         else if (strcmp(fltname, "chip") == 0)
 159                 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
 160                     IS_CHIP);
 161         else
 162                 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
 163                     IS_STRAND);
 164 
 165         for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
 166             opl_cpu = cmd_list_next(opl_cpu)) {
 167                 if (opl_cpu->oc_cpuid == main_cpuid) {
 168                         sib_cpu = main_cpu;
 169                         opl_cpu->oc_cmd_cpu = main_cpu;
 170                 } else {
 171                         fmri = cmd_cpu_fmri_create(opl_cpu->oc_cpuid, cpumask);
 172                         if (fmri == NULL) {
 173                                 opl_cpu->oc_cmd_cpu = NULL;
 174                                 fmd_hdl_debug(hdl,
 175                                     "missing asru, cpuid %u excluded\n",
 176                                     opl_cpu->oc_cpuid);
 177                                 continue;
 178                         }
 179 
 180                         sib_cpu = cmd_cpu_lookup(hdl, fmri, class,
 181                             CMD_CPU_LEVEL_THREAD);
 182                         if (sib_cpu == NULL || sib_cpu->cpu_faulting) {
 183                                 if (fmri != NULL)
 184                                         nvlist_free(fmri);
 185                                 opl_cpu->oc_cmd_cpu = NULL;
 186                                 fmd_hdl_debug(hdl,
 187                                 "cpu not present, cpuid %u excluded\n",
 188                                     opl_cpu->oc_cpuid);
 189                                 continue;
 190                         }
 191                         opl_cpu->oc_cmd_cpu = sib_cpu;
 192                         if (fmri != NULL)
 193                                 nvlist_free(fmri);
 194                         nsusp++;
 195                 }
 196                 if (cpu->cpu_cpuid == main_cpuid) {
 197                         if (cc->cc_cp != NULL &&
 198                             fmd_case_solved(hdl, cc->cc_cp)) {
 199                                 if (cpu_list != NULL)
 200                                         opl_cpulist_free(hdl, cpu_list);
 201                                 return (CMD_EVD_REDUND);
 202                         }
 203 
 204                         if (cc->cc_cp == NULL)
 205                                 cc->cc_cp = cmd_case_create(hdl,
 206                                     &cpu->cpu_header, ptr, &uuid);
 207 
 208                         if (cc->cc_serdnm != NULL) {
 209                                 fmd_hdl_debug(hdl,
 210                         "destroying existing %s state for class %x\n",
 211                                     cc->cc_serdnm, class);
 212                                 fmd_serd_destroy(hdl, cc->cc_serdnm);
 213                                 fmd_hdl_strfree(hdl, cc->cc_serdnm);
 214                                 cc->cc_serdnm = NULL;
 215                                 fmd_case_reset(hdl, cc->cc_cp);
 216                         }
 217                         fmd_case_add_ereport(hdl, cc->cc_cp, ep);
 218                 }
 219         }
 220         cert = opl_avg(100, nsusp);
 221         for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
 222             opl_cpu = cmd_list_next(opl_cpu)) {
 223                 if (opl_cpu->oc_cmd_cpu != NULL) {
 224                         nvlist_t *cpu_rsrc;
 225 
 226                         cpu_rsrc = opl_cpursrc_create(hdl, opl_cpu->oc_cpuid);
 227                         if (cpu_rsrc == NULL) {
 228                                 fmd_hdl_debug(hdl,
 229                                 "missing rsrc, cpuid %u excluded\n",
 230                                     opl_cpu->oc_cpuid);
 231                                 continue;
 232                         }
 233                         cmd_cpu_create_faultlist(hdl, cc->cc_cp,
 234                             opl_cpu->oc_cmd_cpu, fltname, cpu_rsrc, cert);
 235                         nvlist_free(cpu_rsrc);
 236                 }
 237         }
 238         fmd_case_solve(hdl, cc->cc_cp);
 239         if (cpu_list != NULL)
 240                 opl_cpulist_free(hdl, cpu_list);
 241         return (CMD_EVD_OK);
 242 }
 243 
 244 /*
 245  * Generates DIMM fault if the number of Permanent CE
 246  * threshold is exceeded.
 247  */
 248 static void
 249 opl_ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
 250 {
 251         nvlist_t *dflt;
 252         fmd_case_t *cp;
 253 
 254         fmd_hdl_debug(hdl,
 255             "Permanent CE event threshold checking.\n");
 256 
 257         if (dimm->dimm_flags & CMD_MEM_F_FAULTING) {
 258                 /* We've already complained about this DIMM */
 259                 return;
 260         }
 261 
 262         if (dimm->dimm_nretired >= fmd_prop_get_int32(hdl,
 263             "max_perm_ce_dimm")) {
 264                 dimm->dimm_flags |= CMD_MEM_F_FAULTING;
 265                 cp = fmd_case_open(hdl, NULL);
 266                 dflt = cmd_dimm_create_fault(hdl, dimm, "fault.memory.dimm",
 267                     CMD_FLTMAXCONF);
 268                 fmd_case_add_suspect(hdl, cp, dflt);
 269                 fmd_case_solve(hdl, cp);
 270         }
 271 }
 272 
 273 /*
 274  * Notify fault page information (pa and errlog) to XSCF via mc-opl
 275  */
 276 #define MC_PHYDEV_DIR   "/devices"
 277 #define MC_PHYPREFIX    "pseudo-mc@"
 278 static int
 279 opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl)
 280 {
 281         uint32_t *eadd, *elog;
 282         uint_t n;
 283         uint64_t pa;
 284         char path[MAXPATHLEN];
 285         char *unum;
 286         nvlist_t *rsrc;
 287         DIR *mcdir;
 288         struct dirent *dp;
 289         mc_flt_page_t flt_page;
 290         cmd_page_t *page;
 291         struct stat statbuf;
 292 
 293         /*
 294          * Extract ereport.
 295          * Sanity check of pa is already done at cmd_opl_mac_common().
 296          * mc-opl sets only one entry for MC_OPL_ERR_ADD, MC_OPL_ERR_LOG,
 297          * and MC_OPL_BANK.
 298          */
 299         if ((nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0) ||
 300             (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &n) != 0) ||
 301             (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &n) != 0)) {
 302                 fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n");
 303                 return (-1);
 304         }
 305         if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE,
 306             &rsrc) != 0) {
 307                 fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n");
 308                 return (-1);
 309         }
 310         if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) {
 311                 fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n");
 312                 return (-1);
 313         }
 314 
 315         page = cmd_page_lookup(pa);
 316         if (page != NULL && page->page_flags & CMD_MEM_F_FAULTING) {
 317                 /*
 318                  * fault.memory.page will not be created.
 319                  */
 320                 return (0);
 321         }
 322 
 323         flt_page.err_add = eadd[0];
 324         flt_page.err_log = elog[0];
 325         flt_page.fmri_addr = (uint64_t)(uint32_t)unum;
 326         flt_page.fmri_sz = strlen(unum) + 1;
 327 
 328         fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%d)\n",
 329             unum, strlen(unum) + 1);
 330         fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n",
 331             pa, eadd[0], elog[0]);
 332 
 333         if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) {
 334                 while ((dp = readdir(mcdir)) != NULL) {
 335                         int fd;
 336 
 337                         if (strncmp(dp->d_name, MC_PHYPREFIX,
 338                             strlen(MC_PHYPREFIX)) != 0)
 339                                 continue;
 340 
 341                         (void) snprintf(path, sizeof (path),
 342                             "%s/%s", MC_PHYDEV_DIR, dp->d_name);
 343 
 344                         if (stat(path, &statbuf) != 0 ||
 345                             (statbuf.st_mode & S_IFCHR) == 0) {
 346                                 /* skip if not a character device */
 347                                 continue;
 348                         }
 349 
 350                         if ((fd = open(path, O_RDONLY)) < 0)
 351                                 continue;
 352 
 353                         if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) {
 354                                 fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n",
 355                                     path);
 356                                 (void) close(fd);
 357                                 (void) closedir(mcdir);
 358                                 return (0);
 359                         }
 360                         (void) close(fd);
 361                 }
 362                 (void) closedir(mcdir);
 363         }
 364 
 365         fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n");
 366 
 367         return (-1);
 368 }
 369 
 370 /*
 371  * This is the common function for processing MAC detected
 372  * Intermittent and Permanent CEs.
 373  */
 374 
 375 cmd_evdisp_t
 376 cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class,
 377     nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl)
 378 {
 379         cmd_dimm_t *dimm;
 380         const char *uuid;
 381 
 382         fmd_hdl_debug(hdl,
 383             "Processing CE ereport\n");
 384 
 385         if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
 386             (dimm = cmd_dimm_create(hdl, asru)) == NULL)
 387                 return (CMD_EVD_UNUSED);
 388 
 389         if (dimm->dimm_case.cc_cp == NULL) {
 390                 dimm->dimm_case.cc_cp = cmd_case_create(hdl,
 391                     &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
 392         }
 393 
 394         if (strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
 395                 CMD_STAT_BUMP(ce_interm);
 396                 fmd_hdl_debug(hdl, "adding FJ-Intermittent event "
 397                     "to CE serd engine\n");
 398 
 399                 if (dimm->dimm_case.cc_serdnm == NULL) {
 400                         dimm->dimm_case.cc_serdnm =
 401                             cmd_mem_serdnm_create(hdl,
 402                             "dimm", dimm->dimm_unum);
 403                         fmd_serd_create(hdl, dimm->dimm_case.cc_serdnm,
 404                             fmd_prop_get_int32(hdl, "ce_n"),
 405                             fmd_prop_get_int64(hdl, "ce_t"));
 406                 }
 407 
 408                 if (fmd_serd_record(hdl, dimm->dimm_case.cc_serdnm, ep) ==
 409                     FMD_B_FALSE) {
 410                         return (CMD_EVD_OK); /* engine hasn't fired */
 411                 }
 412                 fmd_hdl_debug(hdl, "ce serd fired\n");
 413                 fmd_case_add_serd(hdl, dimm->dimm_case.cc_cp,
 414                     dimm->dimm_case.cc_serdnm);
 415                 fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm);
 416 
 417                 (void) opl_scf_log(hdl, nvl);
 418         } else {
 419                 CMD_STAT_BUMP(ce_sticky);
 420         }
 421 
 422         dimm->dimm_nretired++;
 423         dimm->dimm_retstat.fmds_value.ui64++;
 424         cmd_dimm_dirty(hdl, dimm);
 425 
 426         cmd_page_fault(hdl, asru, fru, ep, pa);
 427         opl_ce_thresh_check(hdl, dimm);
 428 
 429         return (CMD_EVD_OK);
 430 }
 431 
 432 /*
 433  * This is the common entry for processing MAC detected errors.
 434  * It is responsible for generating the memory page fault event.
 435  * The permanent CE (sticky) in normal mode is handled here also
 436  * in the same way as in the UE case.
 437  */
 438 /*ARGSUSED*/
 439 cmd_evdisp_t
 440 cmd_opl_mac_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 441     const char *class, cmd_errcl_t clcode)
 442 {
 443         uint64_t pa;
 444         nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
 445         cmd_page_t *page;
 446 
 447         fmd_hdl_debug(hdl, "cmd_mac_common: clcode=%ll\n", clcode);
 448 
 449         if (nvlist_lookup_nvlist(nvl, MC_OPL_RESOURCE, &rsrc) != 0)
 450                 return (CMD_EVD_BAD);
 451 
 452         if (nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa)
 453             != 0)
 454                 return (CMD_EVD_BAD);
 455 
 456         /*
 457          * Check for invalid pa.
 458          * The most sig. bit should not be on.
 459          * It would be out of the range of possible pa
 460          * in MAC's view.
 461          */
 462         if (((uint64_t)1 << 63) & pa)
 463                 return (CMD_EVD_BAD);
 464 
 465         if ((page = cmd_page_lookup(pa)) != NULL &&
 466             page->page_case.cc_cp != NULL &&
 467             fmd_case_solved(hdl, page->page_case.cc_cp))
 468                 return (CMD_EVD_REDUND);
 469 
 470         if (nvlist_dup(rsrc, &asru, 0) != 0) {
 471                 fmd_hdl_debug(hdl, "cmd_opl_mac_common nvlist dup failed\n");
 472                 return (CMD_EVD_BAD);
 473         }
 474 
 475         if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
 476                 fmd_hdl_debug(hdl, "cmd_opl_mac_common expand failed\n");
 477                 nvlist_free(asru);
 478                 CMD_STAT_BUMP(bad_mem_asru);
 479                 return (CMD_EVD_BAD);
 480         }
 481 
 482         if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
 483                 fmd_hdl_debug(hdl, "cmd_opl_mac_common fru_create failed\n");
 484                 nvlist_free(asru);
 485                 return (CMD_EVD_BAD);
 486         }
 487 
 488         /*
 489          * process PCE and ICE to create DIMM fault
 490          */
 491         if (strcmp(class, "ereport.asic.mac.mi-ce") == 0 ||
 492             strcmp(class, "ereport.asic.mac.ptrl-ce") == 0 ||
 493             strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
 494                 cmd_evdisp_t ret;
 495 
 496                 ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl);
 497                 nvlist_free(asru);
 498                 nvlist_free(fru);
 499                 if (ret != CMD_EVD_OK) {
 500                         fmd_hdl_debug(hdl,
 501                             "cmd_opl_mac_common: mac_ce failed\n");
 502                         return (CMD_EVD_BAD);
 503                 } else
 504                         return (CMD_EVD_OK);
 505         }
 506 
 507         /* The following code handles page retires for UEs and CMPEs.  */
 508 
 509         cmd_page_fault(hdl, asru, fru, ep, pa);
 510         nvlist_free(asru);
 511         nvlist_free(fru);
 512         return (CMD_EVD_OK);
 513 }
 514 
 515 /*
 516  * Common entry points for handling CPU/IO detected UE with
 517  * respect to EID=MEM.
 518  */
 519 /*ARGSUSED*/
 520 cmd_evdisp_t
 521 cmd_opl_cpu_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 522     const char *class, cmd_errcl_t clcode)
 523 {
 524         return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_CPU));
 525 }
 526 
 527 /*ARGSUSED*/
 528 cmd_evdisp_t
 529 cmd_opl_io_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 530     const char *class, cmd_errcl_t clcode)
 531 {
 532         return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_IO));
 533 }