1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * OPL platform specific functions for 31 * CPU/Memory error diagnosis engine. 32 */ 33 #include <cmd.h> 34 #include <cmd_dimm.h> 35 #include <cmd_bank.h> 36 #include <cmd_page.h> 37 #include <cmd_opl.h> 38 #include <string.h> 39 #include <errno.h> 40 #include <fcntl.h> 41 #include <unistd.h> 42 #include <dirent.h> 43 #include <sys/stat.h> 44 45 #include <sys/fm/protocol.h> 46 #include <sys/fm/io/opl_mc_fm.h> 47 #include <sys/async.h> 48 #include <sys/opl_olympus_regs.h> 49 #include <sys/fm/cpu/SPARC64-VI.h> 50 #include <sys/int_const.h> 51 #include <sys/mutex.h> 52 #include <sys/dditypes.h> 53 #include <opl/sys/mc-opl.h> 54 55 /* 56 * The following is the common function for handling 57 * memory UE with EID=MEM. 58 * The error could be detected by either CPU/IO. 
59 */ 60 cmd_evdisp_t 61 opl_ue_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 62 int hdlr_type) 63 { 64 nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL; 65 uint64_t ubc_ue_log_reg, pa; 66 cmd_page_t *page; 67 68 if (nvlist_lookup_nvlist(nvl, 69 FM_EREPORT_PAYLOAD_NAME_RESOURCE, &rsrc) != 0) 70 return (CMD_EVD_BAD); 71 72 switch (hdlr_type) { 73 case CMD_OPL_HDLR_CPU: 74 75 if (nvlist_lookup_uint64(nvl, 76 FM_EREPORT_PAYLOAD_NAME_SFAR, &pa) != 0) 77 return (CMD_EVD_BAD); 78 79 fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n", 80 (u_longlong_t)pa); 81 break; 82 83 case CMD_OPL_HDLR_IO: 84 85 if (nvlist_lookup_uint64(nvl, OBERON_UBC_MUE, 86 &ubc_ue_log_reg) != 0) 87 return (CMD_EVD_BAD); 88 89 pa = (ubc_ue_log_reg & UBC_UE_ADR_MASK); 90 91 fmd_hdl_debug(hdl, "cmd_ue_mem: ue_log_reg=%llx\n", 92 (u_longlong_t)ubc_ue_log_reg); 93 fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n", 94 (u_longlong_t)pa); 95 break; 96 97 default: 98 99 return (CMD_EVD_BAD); 100 } 101 102 if ((page = cmd_page_lookup(pa)) != NULL && 103 page->page_case.cc_cp != NULL && 104 fmd_case_solved(hdl, page->page_case.cc_cp)) 105 return (CMD_EVD_REDUND); 106 107 if (nvlist_dup(rsrc, &asru, 0) != 0) { 108 fmd_hdl_debug(hdl, "opl_ue_mem nvlist dup failed\n"); 109 return (CMD_EVD_BAD); 110 } 111 112 if (fmd_nvl_fmri_expand(hdl, asru) < 0) { 113 nvlist_free(asru); 114 CMD_STAT_BUMP(bad_mem_asru); 115 return (CMD_EVD_BAD); 116 } 117 118 if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) { 119 nvlist_free(asru); 120 return (CMD_EVD_BAD); 121 } 122 123 cmd_page_fault(hdl, asru, fru, ep, pa); 124 nvlist_free(asru); 125 nvlist_free(fru); 126 return (CMD_EVD_OK); 127 } 128 129 /* 130 * The following is the main function to handle generating 131 * the sibling cpu suspect list for the CPU detected UE 132 * error cases. This is to handle the 133 * multiple strand/core architecture on the OPL platform. 
134 */ 135 cmd_evdisp_t 136 cmd_opl_ue_cpu(fmd_hdl_t *hdl, fmd_event_t *ep, 137 const char *class, const char *fltname, 138 cmd_ptrsubtype_t ptr, cmd_cpu_t *cpu, 139 cmd_case_t *cc, uint8_t cpumask) 140 { 141 const char *uuid; 142 cmd_cpu_t *main_cpu, *sib_cpu; 143 nvlist_t *fmri; 144 cmd_list_t *cpu_list; 145 opl_cpu_t *opl_cpu; 146 uint32_t main_cpuid, nsusp = 1; 147 uint8_t cert; 148 149 fmd_hdl_debug(hdl, 150 "Enter OPL_CPUUE_HANDLER for class %x\n", class); 151 152 main_cpu = cpu; 153 main_cpuid = cpu->cpu_cpuid; 154 155 if (strcmp(fltname, "core") == 0) 156 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid, 157 IS_CORE); 158 else if (strcmp(fltname, "chip") == 0) 159 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid, 160 IS_CHIP); 161 else 162 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid, 163 IS_STRAND); 164 165 for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL; 166 opl_cpu = cmd_list_next(opl_cpu)) { 167 if (opl_cpu->oc_cpuid == main_cpuid) { 168 sib_cpu = main_cpu; 169 opl_cpu->oc_cmd_cpu = main_cpu; 170 } else { 171 fmri = cmd_cpu_fmri_create(opl_cpu->oc_cpuid, cpumask); 172 if (fmri == NULL) { 173 opl_cpu->oc_cmd_cpu = NULL; 174 fmd_hdl_debug(hdl, 175 "missing asru, cpuid %u excluded\n", 176 opl_cpu->oc_cpuid); 177 continue; 178 } 179 180 sib_cpu = cmd_cpu_lookup(hdl, fmri, class, 181 CMD_CPU_LEVEL_THREAD); 182 if (sib_cpu == NULL || sib_cpu->cpu_faulting) { 183 if (fmri != NULL) 184 nvlist_free(fmri); 185 opl_cpu->oc_cmd_cpu = NULL; 186 fmd_hdl_debug(hdl, 187 "cpu not present, cpuid %u excluded\n", 188 opl_cpu->oc_cpuid); 189 continue; 190 } 191 opl_cpu->oc_cmd_cpu = sib_cpu; 192 if (fmri != NULL) 193 nvlist_free(fmri); 194 nsusp++; 195 } 196 if (cpu->cpu_cpuid == main_cpuid) { 197 if (cc->cc_cp != NULL && 198 fmd_case_solved(hdl, cc->cc_cp)) { 199 if (cpu_list != NULL) 200 opl_cpulist_free(hdl, cpu_list); 201 return (CMD_EVD_REDUND); 202 } 203 204 if (cc->cc_cp == NULL) 205 cc->cc_cp = cmd_case_create(hdl, 206 &cpu->cpu_header, ptr, &uuid); 
207 208 if (cc->cc_serdnm != NULL) { 209 fmd_hdl_debug(hdl, 210 "destroying existing %s state for class %x\n", 211 cc->cc_serdnm, class); 212 fmd_serd_destroy(hdl, cc->cc_serdnm); 213 fmd_hdl_strfree(hdl, cc->cc_serdnm); 214 cc->cc_serdnm = NULL; 215 fmd_case_reset(hdl, cc->cc_cp); 216 } 217 fmd_case_add_ereport(hdl, cc->cc_cp, ep); 218 } 219 } 220 cert = opl_avg(100, nsusp); 221 for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL; 222 opl_cpu = cmd_list_next(opl_cpu)) { 223 if (opl_cpu->oc_cmd_cpu != NULL) { 224 nvlist_t *cpu_rsrc; 225 226 cpu_rsrc = opl_cpursrc_create(hdl, opl_cpu->oc_cpuid); 227 if (cpu_rsrc == NULL) { 228 fmd_hdl_debug(hdl, 229 "missing rsrc, cpuid %u excluded\n", 230 opl_cpu->oc_cpuid); 231 continue; 232 } 233 cmd_cpu_create_faultlist(hdl, cc->cc_cp, 234 opl_cpu->oc_cmd_cpu, fltname, cpu_rsrc, cert); 235 nvlist_free(cpu_rsrc); 236 } 237 } 238 fmd_case_solve(hdl, cc->cc_cp); 239 if (cpu_list != NULL) 240 opl_cpulist_free(hdl, cpu_list); 241 return (CMD_EVD_OK); 242 } 243 244 /* 245 * Generates DIMM fault if the number of Permanent CE 246 * threshold is exceeded. 
247 */ 248 static void 249 opl_ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm) 250 { 251 nvlist_t *dflt; 252 fmd_case_t *cp; 253 254 fmd_hdl_debug(hdl, 255 "Permanent CE event threshold checking.\n"); 256 257 if (dimm->dimm_flags & CMD_MEM_F_FAULTING) { 258 /* We've already complained about this DIMM */ 259 return; 260 } 261 262 if (dimm->dimm_nretired >= fmd_prop_get_int32(hdl, 263 "max_perm_ce_dimm")) { 264 dimm->dimm_flags |= CMD_MEM_F_FAULTING; 265 cp = fmd_case_open(hdl, NULL); 266 dflt = cmd_dimm_create_fault(hdl, dimm, "fault.memory.dimm", 267 CMD_FLTMAXCONF); 268 fmd_case_add_suspect(hdl, cp, dflt); 269 fmd_case_solve(hdl, cp); 270 } 271 } 272 273 /* 274 * Notify fault page information (pa and errlog) to XSCF via mc-opl 275 */ 276 #define MC_PHYDEV_DIR "/devices" 277 #define MC_PHYPREFIX "pseudo-mc@" 278 static int 279 opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl) 280 { 281 uint32_t *eadd, *elog; 282 uint_t n; 283 uint64_t pa; 284 char path[MAXPATHLEN]; 285 char *unum; 286 nvlist_t *rsrc; 287 DIR *mcdir; 288 struct dirent *dp; 289 mc_flt_page_t flt_page; 290 cmd_page_t *page; 291 struct stat statbuf; 292 293 /* 294 * Extract ereport. 295 * Sanity check of pa is already done at cmd_opl_mac_common(). 296 * mc-opl sets only one entry for MC_OPL_ERR_ADD, MC_OPL_ERR_LOG, 297 * and MC_OPL_BANK. 
298 */ 299 if ((nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0) || 300 (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &n) != 0) || 301 (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &n) != 0)) { 302 fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n"); 303 return (-1); 304 } 305 if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE, 306 &rsrc) != 0) { 307 fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n"); 308 return (-1); 309 } 310 if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) { 311 fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n"); 312 return (-1); 313 } 314 315 page = cmd_page_lookup(pa); 316 if (page != NULL && page->page_flags & CMD_MEM_F_FAULTING) { 317 /* 318 * fault.memory.page will not be created. 319 */ 320 return (0); 321 } 322 323 flt_page.err_add = eadd[0]; 324 flt_page.err_log = elog[0]; 325 flt_page.fmri_addr = (uint64_t)(uint32_t)unum; 326 flt_page.fmri_sz = strlen(unum) + 1; 327 328 fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%d)\n", 329 unum, strlen(unum) + 1); 330 fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n", 331 pa, eadd[0], elog[0]); 332 333 if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) { 334 while ((dp = readdir(mcdir)) != NULL) { 335 int fd; 336 337 if (strncmp(dp->d_name, MC_PHYPREFIX, 338 strlen(MC_PHYPREFIX)) != 0) 339 continue; 340 341 (void) snprintf(path, sizeof (path), 342 "%s/%s", MC_PHYDEV_DIR, dp->d_name); 343 344 if (stat(path, &statbuf) != 0 || 345 (statbuf.st_mode & S_IFCHR) == 0) { 346 /* skip if not a character device */ 347 continue; 348 } 349 350 if ((fd = open(path, O_RDONLY)) < 0) 351 continue; 352 353 if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) { 354 fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n", 355 path); 356 (void) close(fd); 357 (void) closedir(mcdir); 358 return (0); 359 } 360 (void) close(fd); 361 } 362 (void) closedir(mcdir); 363 } 364 365 fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n"); 366 367 return (-1); 368 } 369 370 /* 371 * 
 This is the common function for processing MAC detected
 * Intermittent and Permanent CEs.
 */

cmd_evdisp_t
cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class,
    nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl)
{
	cmd_dimm_t *dimm;
	const char *uuid;

	fmd_hdl_debug(hdl,
	    "Processing CE ereport\n");

	/* Find (or lazily create) the DIMM state for this asru. */
	if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
	    (dimm = cmd_dimm_create(hdl, asru)) == NULL)
		return (CMD_EVD_UNUSED);

	/* Open the per-DIMM case on first use. */
	if (dimm->dimm_case.cc_cp == NULL) {
		dimm->dimm_case.cc_cp = cmd_case_create(hdl,
		    &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
	}

	/*
	 * Intermittent CEs (ptrl-ice) are counted through a SERD
	 * engine tuned by the ce_n/ce_t properties; only when the
	 * engine fires do we attach it to the case, notify the XSCF,
	 * and fall through to the page retire.  Any other CE class
	 * reaching here is treated as sticky and retires directly.
	 */
	if (strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
		CMD_STAT_BUMP(ce_interm);
		fmd_hdl_debug(hdl, "adding FJ-Intermittent event "
		    "to CE serd engine\n");

		/* Create the engine lazily, named after the DIMM unum. */
		if (dimm->dimm_case.cc_serdnm == NULL) {
			dimm->dimm_case.cc_serdnm =
			    cmd_mem_serdnm_create(hdl,
			    "dimm", dimm->dimm_unum);
			fmd_serd_create(hdl, dimm->dimm_case.cc_serdnm,
			    fmd_prop_get_int32(hdl, "ce_n"),
			    fmd_prop_get_int64(hdl, "ce_t"));
		}

		if (fmd_serd_record(hdl, dimm->dimm_case.cc_serdnm, ep) ==
		    FMD_B_FALSE) {
			return (CMD_EVD_OK); /* engine hasn't fired */
		}
		fmd_hdl_debug(hdl, "ce serd fired\n");
		fmd_case_add_serd(hdl, dimm->dimm_case.cc_cp,
		    dimm->dimm_case.cc_serdnm);
		fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm);

		/* Best effort: failure to notify the XSCF is non-fatal. */
		(void) opl_scf_log(hdl, nvl);
	} else {
		CMD_STAT_BUMP(ce_sticky);
	}

	/* Account the retire and persist the updated DIMM state. */
	dimm->dimm_nretired++;
	dimm->dimm_retstat.fmds_value.ui64++;
	cmd_dimm_dirty(hdl, dimm);

	/* Retire the page, then see if the DIMM itself should fault. */
	cmd_page_fault(hdl, asru, fru, ep, pa);
	opl_ce_thresh_check(hdl, dimm);

	return (CMD_EVD_OK);
}

/*
 * This is the common entry for processing MAC detected errors.
 * It is responsible for generating the memory page fault event.
 * The permanent CE (sticky) in normal mode is handled here also
 * in the same way as in the UE case.
437 */ 438 /*ARGSUSED*/ 439 cmd_evdisp_t 440 cmd_opl_mac_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 441 const char *class, cmd_errcl_t clcode) 442 { 443 uint64_t pa; 444 nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL; 445 cmd_page_t *page; 446 447 fmd_hdl_debug(hdl, "cmd_mac_common: clcode=%ll\n", clcode); 448 449 if (nvlist_lookup_nvlist(nvl, MC_OPL_RESOURCE, &rsrc) != 0) 450 return (CMD_EVD_BAD); 451 452 if (nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) 453 != 0) 454 return (CMD_EVD_BAD); 455 456 /* 457 * Check for invalid pa. 458 * The most sig. bit should not be on. 459 * It would be out of the range of possible pa 460 * in MAC's view. 461 */ 462 if (((uint64_t)1 << 63) & pa) 463 return (CMD_EVD_BAD); 464 465 if ((page = cmd_page_lookup(pa)) != NULL && 466 page->page_case.cc_cp != NULL && 467 fmd_case_solved(hdl, page->page_case.cc_cp)) 468 return (CMD_EVD_REDUND); 469 470 if (nvlist_dup(rsrc, &asru, 0) != 0) { 471 fmd_hdl_debug(hdl, "cmd_opl_mac_common nvlist dup failed\n"); 472 return (CMD_EVD_BAD); 473 } 474 475 if (fmd_nvl_fmri_expand(hdl, asru) < 0) { 476 fmd_hdl_debug(hdl, "cmd_opl_mac_common expand failed\n"); 477 nvlist_free(asru); 478 CMD_STAT_BUMP(bad_mem_asru); 479 return (CMD_EVD_BAD); 480 } 481 482 if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) { 483 fmd_hdl_debug(hdl, "cmd_opl_mac_common fru_create failed\n"); 484 nvlist_free(asru); 485 return (CMD_EVD_BAD); 486 } 487 488 /* 489 * process PCE and ICE to create DIMM fault 490 */ 491 if (strcmp(class, "ereport.asic.mac.mi-ce") == 0 || 492 strcmp(class, "ereport.asic.mac.ptrl-ce") == 0 || 493 strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) { 494 cmd_evdisp_t ret; 495 496 ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl); 497 nvlist_free(asru); 498 nvlist_free(fru); 499 if (ret != CMD_EVD_OK) { 500 fmd_hdl_debug(hdl, 501 "cmd_opl_mac_common: mac_ce failed\n"); 502 return (CMD_EVD_BAD); 503 } else 504 return (CMD_EVD_OK); 505 } 506 507 /* The following code handles page retires 
for UEs and CMPEs. */ 508 509 cmd_page_fault(hdl, asru, fru, ep, pa); 510 nvlist_free(asru); 511 nvlist_free(fru); 512 return (CMD_EVD_OK); 513 } 514 515 /* 516 * Common entry points for handling CPU/IO detected UE with 517 * respect to EID=MEM. 518 */ 519 /*ARGSUSED*/ 520 cmd_evdisp_t 521 cmd_opl_cpu_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 522 const char *class, cmd_errcl_t clcode) 523 { 524 return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_CPU)); 525 } 526 527 /*ARGSUSED*/ 528 cmd_evdisp_t 529 cmd_opl_io_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 530 const char *class, cmd_errcl_t clcode) 531 { 532 return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_IO)); 533 }