/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ---------------------------- <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On UMA based systems, the CMT load balancing algorithm begins by balancing
 * load across the group of top level PGs in the system hierarchy.
 * On NUMA systems, the CMT load balancing algorithm balances load across the
 * group of top level PGs in each leaf lgroup... but for root homed threads,
 * it is willing to balance against all the top level PGs in the system.
 *
 * Groups of top level PGs are maintained to implement the above, one for each
 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
 * root lgroup) that contains all the top level PGs in the system.
 */
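/*
 * As an illustrative (purely hypothetical) example, a single socket with
 * two cores sharing a cache, each core with two hardware threads sharing
 * an instruction pipeline, might yield a lineage like:
 *
 *	chip (4 CPUs)
 *	  `- cache (4 CPUs)
 *	       `- ipipe 0 (2 CPUs)    ipipe 1 (2 CPUs)
 *
 * The exact shape depends on which sharing relationships the platform
 * reports via pg_plat_hw_shared(), and on the policies returned by
 * pg_cmt_policy() below.
 */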
static cmt_lgrp_t       *cmt_lgrps = NULL;      /* cmt_lgrps list head */
static cmt_lgrp_t       *cpu0_lgrp = NULL;      /* boot CPU's initial lgrp */
                                                /* used for null_proc_lpa */
cmt_lgrp_t              *cmt_root = NULL;       /* Reference to root cmt pg */

static int              is_cpu0 = 1; /* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * CMT scheduling optimizations won't be performed for blacklisted sharing
 * relationships.
 */
static int              cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
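/*
 * Illustrative workflow (assumes a system booted under kmdb; the exact
 * invocation may vary): at the kmdb prompt, before startup consumes the
 * variable, write it and then continue:
 *
 *	cmt_sched_disabled/W 1
 *	:c
 */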
int                     cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
        CMT_LINEAGE_VALID,
        CMT_LINEAGE_NON_CONCENTRIC,
        CMT_LINEAGE_PG_SPANS_LGRPS,
        CMT_LINEAGE_NON_PROMOTABLE,
        CMT_LINEAGE_REPAIRED,
        CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t        cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define PG_CMT_HW_SUSPECT(hw)   PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define IS_CMT_PG(pg)   (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t         pg_cmt_class_id;                /* PG class id */

static pg_t             *pg_cmt_alloc();
static void             pg_cmt_free(pg_t *);
static void             pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void             pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void             pg_cmt_cpu_active(cpu_t *);
static void             pg_cmt_cpu_inactive(cpu_t *);
static void             pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void             pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char             *pg_cmt_policy_name(pg_t *);
static void             pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t         *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int              pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int              pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t       *pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t       *pg_cmt_lgrp_create(lgrp_handle_t);
static void             cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
                            kthread_t *, kthread_t *);
static void             cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
                            kthread_t *, kthread_t *);
static void             cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *,
                            cpu_pg_t *);

/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
        pg_cmt_alloc,
        pg_cmt_free,
        pg_cmt_cpu_init,
        pg_cmt_cpu_fini,
        pg_cmt_cpu_active,
        pg_cmt_cpu_inactive,
        pg_cmt_cpupart_in,
        NULL,                   /* cpupart_out */
        pg_cmt_cpupart_move,
        pg_cmt_cpu_belongs,
        pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
        if (cmt_sched_disabled)
                return;

        pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
        pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
            cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
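/*
 * Note: cpu_cacheid is assigned in pg_cmt_cpu_init() below. When the CPU
 * has a PG representing a shared cache, cpu_cacheid is that PG's logical
 * pg_id, so two CPUs compare equal here exactly when they were placed in
 * the same shared cache PG.
 */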
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
        if (from->cpu_physid->cpu_cacheid ==
            to->cpu_physid->cpu_cacheid)
                return (1);
        return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
        return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
        ASSERT(pg != NULL);
        ASSERT(IS_CMT_PG(pg));

        kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
        pg_cmt_policy_t p;

        /*
         * Give the platform a chance to override the default
         */
        if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
                return (p);

        switch (hw) {
        case PGHW_IPIPE:
        case PGHW_FPU:
        case PGHW_PROCNODE:
        case PGHW_CHIP:
                return (CMT_BALANCE);
        case PGHW_CACHE:
                return (CMT_AFFINITY | CMT_BALANCE);
        case PGHW_POW_ACTIVE:
        case PGHW_POW_IDLE:
                return (CMT_BALANCE);
        default:
                return (CMT_NO_POLICY);
        }
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
        pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
        pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

        /*
         * A power domain is only important if CPUPM is enabled.
         */
        if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
                if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
                        return (pg2);
                if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
                        return (pg1);
        }

        /*
         * Otherwise, ask the platform
         */
        if (pg_plat_hw_rank(hw1, hw2) == hw1)
                return (pg1);
        else
                return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
        /*
         * Stick with the default callbacks if there aren't going to be
         * any CMT thread placement optimizations implemented.
         */
        if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
                return;

        switch (((pghw_t *)pg)->pghw_hw) {
        case PGHW_POW_ACTIVE:
                pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
                pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
                break;
        default:
                pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
        }
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
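/*
 * Schematically, promoting PG above its parent P swaps the two nodes in
 * the hierarchy:
 *
 *	... <- P <- PG <- ...    becomes    ... <- PG <- P <- ...
 *
 * The sibling and children groups, and any lineages cached in the per
 * CPU PG data, are patched up below to reflect the swap.
 */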
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
        pg_cmt_t        *parent;
        group_t         *children;
        cpu_t           *cpu;
        group_iter_t    iter;
        pg_cpu_itr_t    cpu_iter;
        int             r;
        int             err;
        int             nchildren;

        ASSERT(MUTEX_HELD(&cpu_lock));

        parent = pg->cmt_parent;
        if (parent == NULL) {
                /*
                 * Nothing to do
                 */
                return;
        }

        ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

        /*
         * We're changing around the hierarchy, which is actively traversed
         * by the dispatcher. Pause CPUs to ensure exclusivity.
         */
        pause_cpus(NULL, NULL);

        /*
         * If necessary, update the parent's sibling set, replacing parent
         * with PG.
         */
        if (parent->cmt_siblings) {
                if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
                    != -1) {
                        r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
                        ASSERT(r != -1);
                }
        }

        /*
         * If the parent is at the top of the hierarchy, replace its entry
         * in the root lgroup's group of top level PGs.
         */
        if (parent->cmt_parent == NULL &&
            parent->cmt_siblings != &cmt_root->cl_pgs) {
                if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
                    != -1) {
                        r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
                        ASSERT(r != -1);
                }
        }

        /*
         * We assume (and therefore assert) that the PG being promoted is an
         * only child of its parent. Update the parent's children set,
         * replacing PG's entry with the parent (since the parent is becoming
         * the child). Then have PG and the parent swap children sets and
         * children counts.
         */
        ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
        if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
                r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
                ASSERT(r != -1);
        }

        children = pg->cmt_children;
        pg->cmt_children = parent->cmt_children;
        parent->cmt_children = children;

        nchildren = pg->cmt_nchildren;
        pg->cmt_nchildren = parent->cmt_nchildren;
        parent->cmt_nchildren = nchildren;

        /*
         * Update the sibling references for PG and its parent
         */
        pg->cmt_siblings = parent->cmt_siblings;
        parent->cmt_siblings = pg->cmt_children;

        /*
         * Update any cached lineages in the per CPU pg data.
         */
        PG_CPU_ITR_INIT(pg, cpu_iter);
        while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
                int             idx;
                int             sz;
                pg_cmt_t        *cpu_pg;
                cpu_pg_t        *pgd;   /* CPU's PG data */

                /*
                 * The CPU whose lineage is under construction still
                 * references the bootstrap CPU PG data structure.
                 */
                if (pg_cpu_is_bootstrapped(cpu))
                        pgd = pgdata;
                else
                        pgd = cpu->cpu_pg;

                /*
                 * Iterate over the CPU's PGs updating the children
                 * of the PG being promoted, since they have a new parent.
                 */
                group_iter_init(&iter);
                while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
                        if (cpu_pg->cmt_parent == pg) {
                                cpu_pg->cmt_parent = parent;
                        }
                }

                /*
                 * Update the CMT load balancing lineage
                 */
                if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
                        /*
                         * Unless this is the CPU whose lineage is being
                         * constructed, the PG being promoted should be
                         * in the lineage.
                         */
                        ASSERT(pg_cpu_is_bootstrapped(cpu));
                        continue;
                }

                ASSERT(idx > 0);
                ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

                /*
                 * Have the child and the parent swap places in the CPU's
                 * lineage
                 */
                group_remove_at(&pgd->cmt_pgs, idx);
                group_remove_at(&pgd->cmt_pgs, idx - 1);
                err = group_add_at(&pgd->cmt_pgs, parent, idx);
                ASSERT(err == 0);
                err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
                ASSERT(err == 0);

                /*
                 * Ensure cmt_lineage references CPU's leaf PG.
                 * Since cmt_pgs is top-down ordered, the bottom is the last
                 * element.
                 */
                if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
                        pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
        }

        /*
         * Update the parent references for PG and its parent
         */
        pg->cmt_parent = parent->cmt_parent;
        parent->cmt_parent = pg;

        start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg (used by the dispatcher to access the CPU's PG data)
 * references a "bootstrap" structure while this routine runs.
 * pg_cmt_cpu_init() and the routines it calls must be careful to operate
 * only on the "pgdata" argument, and not cp->cpu_pg.
 */
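/*
 * In outline, the steps below are:
 *
 *	1. Discover the hw sharing relationships cp participates in, and
 *	   find or create a CMT PG for each one of interest.
 *	2. Build the CPU's lineage of balancing-relevant PGs, sort it by
 *	   number of CPUs, and validate (possibly repairing) it.
 *	3. Link each PG in the lineage to its parent and siblings, and
 *	   record the lineage in the CPU's PG data for the dispatcher.
 *	4. Cache the chip/core/cache IDs in cp->cpu_physid.
 */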
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
        pg_cmt_t        *pg;
        group_t         *cmt_pgs;
        int             levels, level;
        pghw_type_t     hw;
        pg_t            *pg_cache = NULL;
        pg_cmt_t        *cpu_cmt_hier[PGHW_NUM_COMPONENTS];
        lgrp_handle_t   lgrp_handle;
        cmt_lgrp_t      *lgrp;
        cmt_lineage_validation_t        lineage_status;

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(pg_cpu_is_bootstrapped(cp));

        if (cmt_sched_disabled)
                return;

        /*
         * A new CPU is coming into the system.
         * Interrogate the platform to see if the CPU
         * has any performance or efficiency relevant
         * sharing relationships
         */
        cmt_pgs = &pgdata->cmt_pgs;
        pgdata->cmt_lineage = NULL;

        bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
        levels = 0;
        for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

                pg_cmt_policy_t policy;

                /*
                 * We're only interested in the hw sharing relationships
                 * for which we know how to optimize.
                 */
                policy = pg_cmt_policy(hw);
                if (policy == CMT_NO_POLICY ||
                    pg_plat_hw_shared(cp, hw) == 0)
                        continue;

                /*
                 * We will still create the PGs for hardware sharing
                 * relationships that have been blacklisted, but won't
                 * implement CMT thread placement optimizations against them.
                 */
                if (cmt_hw_blacklisted[hw] == 1)
                        policy = CMT_NO_POLICY;

                /*
                 * Find (or create) the PG associated with the hw sharing
                 * relationship in which cp belongs.
                 */
                pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
                if (pg == NULL) {
                        /*
                         * Create a new one.
                         * Initialize the common...
                         */
                        pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

                        /* ... physical ... */
                        pghw_init((pghw_t *)pg, cp, hw);

                        /*
                         * ... and CMT specific portions of the
                         * structure.
                         */
                        pg->cmt_policy = policy;

                        /* CMT event callbacks */
                        cmt_callback_init((pg_t *)pg);

                        bitset_init(&pg->cmt_cpus_actv_set);
                        group_create(&pg->cmt_cpus_actv);
                } else {
                        ASSERT(IS_CMT_PG(pg));
                }

                ((pghw_t *)pg)->pghw_generation++;

                /* Add the CPU to the PG */
                pg_cpu_add((pg_t *)pg, cp, pgdata);

                /*
                 * Ensure capacity of the active CPU group/bitset
                 */
                group_expand(&pg->cmt_cpus_actv,
                    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

                if (cp->cpu_seqid >=
                    bitset_capacity(&pg->cmt_cpus_actv_set)) {
                        bitset_resize(&pg->cmt_cpus_actv_set,
                            cp->cpu_seqid + 1);
                }

                /*
                 * Build a lineage of CMT PGs for load balancing / coalescence
                 */
                if (policy & (CMT_BALANCE | CMT_COALESCE)) {
                        cpu_cmt_hier[levels++] = pg;
                }

                /* Cache this for later */
                if (hw == PGHW_CACHE)
                        pg_cache = (pg_t *)pg;
        }
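        /*
         * At this point cpu_cmt_hier[0 .. levels - 1] holds the PGs over
         * which balancing / coalescence policy applies, in discovery
         * order; they are sorted and validated below before being tied
         * into the hierarchy.
         */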

        group_expand(cmt_pgs, levels);

        if (cmt_root == NULL)
                cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

        /*
         * Find the lgrp that encapsulates this CPU's CMT hierarchy
         */
        lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
        if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
                lgrp = pg_cmt_lgrp_create(lgrp_handle);

        /*
         * Sort the PGs in the lineage by number of CPUs, in ascending order
         */
        pg_cmt_hier_sort(cpu_cmt_hier, levels);

        /*
         * Examine the lineage and validate it.
         * This routine will also try to fix the lineage along with the
         * rest of the PG hierarchy should it detect an issue.
         *
         * If it returns anything other than VALID or REPAIRED, an
         * unrecoverable error has occurred, and we cannot proceed.
         */
        lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
        if ((lineage_status != CMT_LINEAGE_VALID) &&
            (lineage_status != CMT_LINEAGE_REPAIRED)) {
                /*
                 * In the case of an unrecoverable error where CMT scheduling
                 * has been disabled, assert that the under construction CPU's
                 * PG data has an empty CMT load balancing lineage.
                 */
                ASSERT((cmt_sched_disabled == 0) ||
                    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
                return;
        }

        /*
         * For existing PGs in the lineage, verify that the parent is
         * correct, as the generation in the lineage may have changed
         * as a result of the sorting. Start the traversal at the top
         * of the lineage, moving down.
         */
        for (level = levels - 1; level >= 0; ) {
                int reorg;

                reorg = 0;
                pg = cpu_cmt_hier[level];

                /*
                 * Promote PGs at an incorrect generation into place.
                 */
                while (pg->cmt_parent &&
                    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
                        cmt_hier_promote(pg, pgdata);
                        reorg++;
                }
                if (reorg > 0)
                        level = levels - 1;
                else
                        level--;
        }
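        /*
         * The hierarchy now agrees with the sorted lineage: each PG's
         * parent (if any) is the next entry up in cpu_cmt_hier[].
         */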

        /*
         * For each of the PGs in the CPU's lineage:
         *      - Add an entry in the CPU sorted CMT PG group
         *        which is used for top down CMT load balancing
         *      - Tie the PG into the CMT hierarchy by connecting
         *        it to its parent and siblings.
         */
        for (level = 0; level < levels; level++) {
                uint_t          children;
                int             err;

                pg = cpu_cmt_hier[level];
                err = group_add_at(cmt_pgs, pg, levels - level - 1);
                ASSERT(err == 0);

                if (level == 0)
                        pgdata->cmt_lineage = (pg_t *)pg;

                if (pg->cmt_siblings != NULL) {
                        /* Already initialized */
                        ASSERT(pg->cmt_parent == NULL ||
                            pg->cmt_parent == cpu_cmt_hier[level + 1]);
                        ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
                            ((pg->cmt_parent != NULL) &&
                            pg->cmt_siblings == pg->cmt_parent->cmt_children));
                        continue;
                }

                if ((level + 1) == levels) {
                        pg->cmt_parent = NULL;

                        pg->cmt_siblings = &lgrp->cl_pgs;
                        children = ++lgrp->cl_npgs;
                        if (cmt_root != lgrp)
                                cmt_root->cl_npgs++;
                } else {
                        pg->cmt_parent = cpu_cmt_hier[level + 1];

                        /*
                         * A good parent keeps track of their children.
                         * The parent's children group is also the PG's
                         * siblings.
                         */
                        if (pg->cmt_parent->cmt_children == NULL) {
                                pg->cmt_parent->cmt_children =
                                    kmem_zalloc(sizeof (group_t), KM_SLEEP);
                                group_create(pg->cmt_parent->cmt_children);
                        }
                        pg->cmt_siblings = pg->cmt_parent->cmt_children;
                        children = ++pg->cmt_parent->cmt_nchildren;
                }

                group_expand(pg->cmt_siblings, children);
                group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
        }

        /*
         * Cache the chip and core IDs in the cpu_t->cpu_physid structure
         * for fast lookups later.
         */
        if (cp->cpu_physid) {
                cp->cpu_physid->cpu_chipid =
                    pg_plat_hw_instance_id(cp, PGHW_CHIP);
                cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

                /*
                 * If this cpu has a PG representing shared cache, then set
                 * cpu_cacheid to that PG's logical id
                 */
                if (pg_cache)
                        cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
        }

        /* CPU0 only initialization */
        if (is_cpu0) {
                is_cpu0 = 0;
                cpu0_lgrp = lgrp;
        }
}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg (used by the dispatcher to access the CPU's PG data)
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
        group_iter_t    i;
        pg_cmt_t        *pg;
        group_t         *pgs, *cmt_pgs;
        lgrp_handle_t   lgrp_handle;
        cmt_lgrp_t      *lgrp;

        if (cmt_sched_disabled)
                return;

        ASSERT(pg_cpu_is_bootstrapped(cp));

        pgs = &pgdata->pgs;
        cmt_pgs = &pgdata->cmt_pgs;

        /*
         * Find the lgroup that encapsulates this CPU's CMT hierarchy
         */
        lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

        lgrp = pg_cmt_find_lgrp(lgrp_handle);
        if (ncpus == 1 && lgrp != cpu0_lgrp) {
                /*
                 * One might wonder how we could be deconfiguring the
                 * only CPU in the system.
                 *
                 * On Starcat systems when null_proc_lpa is detected,
                 * the boot CPU (which is already configured into a leaf
                 * lgroup) is moved into the root lgroup. This is done by
                 * deconfiguring it (from both lgroups and processor
                 * groups), and then later reconfiguring it back in.  This
                 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
                 *
                 * This special case is detected by noting that the platform
                 * has changed the CPU's lgrp affiliation (since it now
                 * belongs in the root). In this case, use the cmt_lgrp_t
                 * cached for the boot CPU, since this is what needs to be
                 * torn down.
                 */
                lgrp = cpu0_lgrp;
        }

        ASSERT(lgrp != NULL);

        /*
         * First, clean up anything load balancing specific for each of
         * the CPU's PGs that participated in CMT load balancing
         */
        pg = (pg_cmt_t *)pgdata->cmt_lineage;
        while (pg != NULL) {

                ((pghw_t *)pg)->pghw_generation++;

                /*
                 * Remove the PG from the CPU's load balancing lineage
                 */
                (void) group_remove(cmt_pgs, pg, GRP_RESIZE);

                /*
                 * If it's about to become empty, destroy its children
                 * group, and remove its reference from its siblings.
                 * This is done here (rather than below) to avoid removing
                 * our reference from a PG that we just eliminated.
                 */
                if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
                        if (pg->cmt_children != NULL)
                                group_destroy(pg->cmt_children);
                        if (pg->cmt_siblings != NULL) {
                                if (pg->cmt_siblings == &lgrp->cl_pgs)
                                        lgrp->cl_npgs--;
                                else
                                        pg->cmt_parent->cmt_nchildren--;
                        }
                }
                pg = pg->cmt_parent;
        }
        ASSERT(GROUP_SIZE(cmt_pgs) == 0);

        /*
         * Now that the load balancing lineage updates have happened,
         * remove the CPU from all its PGs (destroying any that become
         * empty).
         */
        group_iter_init(&i);
        while ((pg = group_iterate(pgs, &i)) != NULL) {
                if (IS_CMT_PG(pg) == 0)
                        continue;

                pg_cpu_delete((pg_t *)pg, cp, pgdata);
                /*
                 * Deleting the CPU from the PG changes the CPU's
                 * PG group over which we are actively iterating.
                 * Re-initialize the iteration.
                 */
                group_iter_init(&i);

                if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

                        /*
                         * The PG has become zero sized, so destroy it.
                         */
                        group_destroy(&pg->cmt_cpus_actv);
                        bitset_fini(&pg->cmt_cpus_actv_set);
                        pghw_fini((pghw_t *)pg);

                        pg_destroy((pg_t *)pg);
                }
        }
}
/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
        group_t         *pgs;
        pg_t            *pg;
        group_iter_t    i;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled)
                return;

        pgs = &cp->cpu_pg->pgs;

        /*
         * Ensure that the new partition's PG bitset
         * is large enough for all CMT PGs to which cp
         * belongs
         */
        group_iter_init(&i);
        while ((pg = group_iterate(pgs, &i)) != NULL) {
                if (IS_CMT_PG(pg) == 0)
                        continue;

                if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
                        bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
        }
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
        cpu_t           *cpp;
        group_t         *pgs;
        pg_t            *pg;
        group_iter_t    pg_iter;
        pg_cpu_itr_t    cpu_iter;
        boolean_t       found;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled)
                return;

        pgs = &cp->cpu_pg->pgs;
        group_iter_init(&pg_iter);

        /*
         * Iterate over the CPU's CMT PGs
         */
        while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

                if (IS_CMT_PG(pg) == 0)
                        continue;

                /*
                 * Add the PG to the bitset in the new partition.
                 */
                bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

                /*
                 * Remove the PG from the bitset in the old partition
                 * if the last of the PG's CPUs has left.
                 */
                found = B_FALSE;
                PG_CPU_ITR_INIT(pg, cpu_iter);
                while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
                        if (cpp == cp)
                                continue;
                        if (CPU_ACTIVE(cpp) &&
                            cpp->cpu_part->cp_id == oldpp->cp_id) {
                                found = B_TRUE;
                                break;
                        }
                }
                if (!found)
                        bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
        }
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
        int             err;
        group_iter_t    i;
        pg_cmt_t        *pg;
        group_t         *pgs;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled)
                return;

        pgs = &cp->cpu_pg->pgs;
        group_iter_init(&i);

        /*
         * Iterate over the CPU's PGs
         */
        while ((pg = group_iterate(pgs, &i)) != NULL) {

                if (IS_CMT_PG(pg) == 0)
                        continue;

                /*
                 * Move to the next generation since topology is changing
                 */
                ((pghw_t *)pg)->pghw_generation++;

                err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
                ASSERT(err == 0);

                /*
                 * If this is the first active CPU in the PG, and it
                 * represents a hardware sharing relationship over which
                 * CMT load balancing is performed, add it as a candidate
                 * for balancing with its siblings.
                 */
                if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
                    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
                        err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
                        ASSERT(err == 0);

                        /*
                         * If this is a top level PG, add it as a balancing
                         * candidate when balancing within the root lgroup.
                         */
                        if (pg->cmt_parent == NULL &&
                            pg->cmt_siblings != &cmt_root->cl_pgs) {
                                err = group_add(&cmt_root->cl_pgs, pg,
                                    GRP_NORESIZE);
                                ASSERT(err == 0);
                        }
                }

                /*
                 * Notate the CPU in the PG's active CPU bitset.
                 * Also notate the PG as being active in its associated
                 * partition
                 */
                bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
                bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
        }
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
        int             err;
        group_t         *pgs;
        pg_cmt_t        *pg;
        cpu_t           *cpp;
        group_iter_t    i;
        pg_cpu_itr_t    cpu_itr;
        boolean_t       found;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled)
                return;

        pgs = &cp->cpu_pg->pgs;
        group_iter_init(&i);

        while ((pg = group_iterate(pgs, &i)) != NULL) {

                if (IS_CMT_PG(pg) == 0)
                        continue;

                /*
                 * Move to the next generation since topology is changing
                 */
                ((pghw_t *)pg)->pghw_generation++;

                /*
                 * Remove the CPU from the CMT PG's active CPU group
                 * and bitset
                 */
                err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
                ASSERT(err == 0);

                bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

                /*
                 * If there are no more active CPUs in this PG over which
                 * load was balanced, remove it as a balancing candidate.
                 */
                if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
                    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
                        err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
                        ASSERT(err == 0);

                        if (pg->cmt_parent == NULL &&
                            pg->cmt_siblings != &cmt_root->cl_pgs) {
                                err = group_remove(&cmt_root->cl_pgs, pg,
                                    GRP_NORESIZE);
                                ASSERT(err == 0);
                        }
                }

                /*
                 * Assert the number of active CPUs does not exceed
                 * the total number of CPUs in the PG
                 */
                ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
                    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

                /*
                 * Update the PG bitset in the CPU's old partition
                 */
                found = B_FALSE;
                PG_CPU_ITR_INIT(pg, cpu_itr);
                while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
                        if (cpp == cp)
                                continue;
                        if (CPU_ACTIVE(cpp) &&
                            cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
                                found = B_TRUE;
                                break;
                        }
                }
                if (!found) {
                        bitset_del(&cp->cpu_part->cp_cmt_pgs,
                            ((pg_t *)pg)->pg_id);
                }
        }
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
        cpu_t   *pg_cpu;

        pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

        ASSERT(pg_cpu != NULL);

        /*
         * The CPU belongs if, given the nature of the hardware sharing
         * relationship represented by the PG, the CPU has that
         * relationship with some other CPU already in the PG
         */
        if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
                return (1);

        return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
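/*
 * The first pass below is a shell sort on CPU count (gap sequence:
 * size / 2, then inc = inc * 5 / 11); the second pass breaks ties within
 * each run of equal-sized PGs by insertion sorting on the rank returned
 * by pg_cmt_hier_rank().
 */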
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
        int             i, j, inc, sz;
        int             start, end;
        pg_t            *tmp;
        pg_t            **h = (pg_t **)hier;

        /*
         * First sort by number of CPUs
         */
        inc = size / 2;
        while (inc > 0) {
                for (i = inc; i < size; i++) {
                        j = i;
                        tmp = h[i];
                        while ((j >= inc) &&
                            (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
                                h[j] = h[j - inc];
                                j = j - inc;
                        }
                        h[j] = tmp;
                }
                if (inc == 2)
                        inc = 1;
                else
                        inc = (inc * 5) / 11;
        }

        /*
         * Break ties by asking the platform.
         * Determine if h[i] outranks h[i + 1] and if so, swap them.
         */
        for (start = 0; start < size; start++) {

                /*
                 * Find each contiguous set of elements in the array
                 * with the same number of CPUs
                 */
                end = start;
                sz = PG_NUM_CPUS(h[start]);
                while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
                        end++;
                /*
                 * Sort each such set of the array by rank
                 */
                for (i = start + 1; i < end; i++) {
                        j = i - 1;
                        tmp = h[i];
                        while (j >= start &&
                            pg_cmt_hier_rank(hier[j],
                            (pg_cmt_t *)tmp) == hier[j]) {
                                h[j + 1] = h[j];
                                j--;
                        }
                        h[j + 1] = tmp;
                }
        }
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
        cmt_lgrp_t      *lgrp;

        ASSERT(MUTEX_HELD(&cpu_lock));

        lgrp = cmt_lgrps;
        while (lgrp != NULL) {
                if (lgrp->cl_hand == hand)
                        break;
                lgrp = lgrp->cl_next;
        }
        return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
        cmt_lgrp_t      *lgrp;

        ASSERT(MUTEX_HELD(&cpu_lock));

        lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

        lgrp->cl_hand = hand;
        lgrp->cl_npgs = 0;
        lgrp->cl_next = cmt_lgrps;
        cmt_lgrps = lgrp;
        group_create(&lgrp->cl_pgs);

        return (lgrp);
}

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
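/*
 * Illustrative call sequence (hypothetical caller, shown only to make
 * the locking contract concrete):
 *
 *	mutex_enter(&cpu_lock);
 *	if (cmt_pad_enable(PGHW_POW_ACTIVE) != 0) {
 *		... no such power domains exist, or they are blacklisted ...
 *	}
 *	mutex_exit(&cpu_lock);
 */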
int
cmt_pad_enable(pghw_type_t type)
{
        group_t         *hwset;
        group_iter_t    iter;
        pg_cmt_t        *pg;

        ASSERT(PGHW_IS_PM_DOMAIN(type));
        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled == 1)
                return (-1);

        if ((hwset = pghw_set_lookup(type)) == NULL ||
            cmt_hw_blacklisted[type]) {
                /*
                 * Unable to find any instances of the specified type
                 * of power domain, or the power domains have been blacklisted.
                 */
                return (-1);
        }

        /*
         * Iterate over the power domains, setting the default dispatcher
         * policy for power/performance optimization.
         *
         * Simply setting the policy isn't enough in the case where the power
         * domain is an only child of another PG. Because the dispatcher walks
         * the PG hierarchy in a top down fashion, the higher up PG's policy
         * will dominate. So promote the power domain above its parent if both
         * the PG and its parent have the same CPUs, to ensure its policy
         * dominates.
         */
        group_iter_init(&iter);
        while ((pg = group_iterate(hwset, &iter)) != NULL) {
                /*
                 * If the power domain is an only child to a parent
                 * not implementing the same policy, promote the child
                 * above the parent to activate the policy.
                 */
                pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
                while ((pg->cmt_parent != NULL) &&
                    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
                    (PG_NUM_CPUS((pg_t *)pg) ==
                    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
                        cmt_hier_promote(pg, NULL);
                }
        }

        return (0);
}

int
cmt_pad_disable(pghw_type_t type)
{
        group_t         *hwset;
        group_iter_t    iter;
        pg_cmt_t        *pg;
        pg_cmt_t        *child;

        ASSERT(PGHW_IS_PM_DOMAIN(type));
        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled == 1)
                return (-1);

        if ((hwset = pghw_set_lookup(type)) == NULL) {
                /*
                 * Unable to find any instances of the specified type of
                 * power domain.
                 */
                return (-1);
        }
        /*
         * Iterate over the power domains, setting the default dispatcher
         * policy for performance optimization (load balancing).
         */
        group_iter_init(&iter);
        while ((pg = group_iterate(hwset, &iter)) != NULL) {

                /*
                 * If the power domain has an only child that implements
                 * policy other than load balancing, promote the child
                 * above the power domain to ensure its policy dominates.
                 */
                if (pg->cmt_children != NULL &&
                    GROUP_SIZE(pg->cmt_children) == 1) {
                        child = GROUP_ACCESS(pg->cmt_children, 0);
                        if ((child->cmt_policy & CMT_BALANCE) == 0) {
                                cmt_hier_promote(child, NULL);
                        }
                }
                pg->cmt_policy = CMT_BALANCE;
        }
        return (0);
}

/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
        pg_cmt_t        *cmt_pg = (pg_cmt_t *)pg;

        if (old == cp->cpu_idle_thread) {
                atomic_add_32(&cmt_pg->cmt_utilization, 1);
        } else if (new == cp->cpu_idle_thread) {
                atomic_add_32(&cmt_pg->cmt_utilization, -1);
        }
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define THREAD_RUNNABLE_IN_PG(t, pg)                                    \
        ((t)->t_state == TS_RUN &&                                      \
            (t)->t_disp_queue->disp_cpu &&                              \
            bitset_in_set(&(pg)->cmt_cpus_actv_set,                     \
            (t)->t_disp_queue->disp_cpu->cpu_seqid))
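/*
 * That is, a thread is considered "runnable in the PG" when it is sitting
 * on the dispatch queue of a CPU (disp_cpu != NULL) that is in the PG's
 * active CPU set; such a thread is merely migrating within the domain
 * rather than leaving it idle.
 */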
1375 
1376 static void
1377 cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1378     kthread_t *new)
1379 {
1380         pg_cmt_t        *cmt = (pg_cmt_t *)pg;
1381         cpupm_domain_t  *dom;
1382         uint32_t        u;
1383 
1384         if (old == cp->cpu_idle_thread) {
1385                 ASSERT(new != cp->cpu_idle_thread);
1386                 u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
1387                 if (u == 1) {
1388                         /*
1389                          * Notify the CPU power manager that the domain
1390                          * is non-idle.
1391                          */
1392                         dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1393                         cpupm_utilization_event(cp, now, dom,
1394                             CPUPM_DOM_BUSY_FROM_IDLE);
1395                 }
1396         } else if (new == cp->cpu_idle_thread) {
1397                 ASSERT(old != cp->cpu_idle_thread);
1398                 u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
1399                 if (u == 0) {
1400                         /*
1401                          * The domain is idle, notify the CPU power
1402                          * manager.
1403                          *
1404                          * Avoid notifying if the thread is simply migrating
1405                          * between CPUs in the domain.
1406                          */
1407                         if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
1408                                 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1409                                 cpupm_utilization_event(cp, now, dom,
1410                                     CPUPM_DOM_IDLE_FROM_BUSY);
1411                         }
1412                 }
1413         }
1414 }
1415 
1416 /* ARGSUSED */
1417 static void
1418 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
1419 {
1420         pg_cmt_t        *cmt = (pg_cmt_t *)pg;
1421         cpupm_domain_t  *dom;
1422 
1423         dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1424         cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
1425 }
1426 
1427 /*
1428  * Return the name of the CMT scheduling policy
1429  * being implemented across this PG
1430  */
1431 static char *
1432 pg_cmt_policy_name(pg_t *pg)
1433 {
1434         pg_cmt_policy_t policy;
1435 
1436         policy = ((pg_cmt_t *)pg)->cmt_policy;
1437 
1438         if (policy & CMT_AFFINITY) {
1439                 if (policy & CMT_BALANCE)
1440                         return ("Load Balancing & Affinity");
1441                 else if (policy & CMT_COALESCE)
1442                         return ("Load Coalescence & Affinity");
1443                 else
1444                         return ("Affinity");
1445         } else {
1446                 if (policy & CMT_BALANCE)
1447                         return ("Load Balancing");
1448                 else if (policy & CMT_COALESCE)
1449                         return ("Load Coalescence");
1450                 else
1451                         return ("None");
1452         }
1453 }

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship,
 * from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Inform the pghw layer that this PG is pruned.
	 */
	pghw_cmt_fini((pghw_t *)pg_bad);

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event based CPUPM unavailable.");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}
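
	/*
	 * For example (illustrative values): with size == 4 and
	 * lineage == { A, B, pg_bad, C }, the loop above slides C down
	 * one slot, leaving { A, B, C } and *sz == 3.
	 */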

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so future processor groups of this type won't
	 * participate in CMT thread placement.
	 *
	 * XXX
	 * For heterogeneous system configurations, this might be overkill.
	 * We may only need to blacklist the illegal PGs, and other instances
	 * of this hardware sharing relationship may be ok.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned. If it will promote more than one
		 * child into its siblings group, ask for more capacity
		 * in that group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
				cmt_root->cl_npgs += cap_needed;
			}
		}
	}
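
	/*
	 * A worked example of the capacity math above (illustrative): a
	 * pruned PG with 3 children needs cap_needed == 2, since one child
	 * can take the slot the pruned PG itself vacates while the other
	 * two need new slots in the siblings group (and, for a top level
	 * PG, in the root lgrp level grouping as well).
	 */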

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL, NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}

		/*
		 * Indicate that no CMT policy will be implemented across
		 * this PG.
		 */
		pg->cmt_policy = CMT_NO_POLICY;

		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);

					if (pg->cmt_parent == NULL &&
					    pg->cmt_siblings !=
					    &cmt_root->cl_pgs) {
						r = group_add(&cmt_root->cl_pgs,
						    child, GRP_NORESIZE);
						ASSERT(r == 0);
					}
				}
			}
			group_empty(pg->cmt_children);
		}
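
		/*
		 * Pictorially (an illustrative sketch): pruning X promotes
		 * X's children into X's former siblings set, which is the
		 * same group as X's parent's children set.
		 *
		 *        P                    P
		 *       / \                 / | \
		 *      X   S     ==>      c1  c2  S
		 *     / \
		 *    c1  c2
		 */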

		/*
		 * Reset the callbacks to the defaults.
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update the lineage of each of PG's CPUs.
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * If the CPU's lineage is still under construction,
			 * it references the bootstrap CPU PG data structure;
			 * use the under-construction data instead.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs, updating the children
			 * of the pruned PG (which are being promoted), since
			 * they have a new parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineage:
			 *
			 * Remove the PG from the CPU's group used for CMT
			 * scheduling.
			 */
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t		*cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL, NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}
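
/*
 * Note (illustrative, not normative): once each CPU's cmt_pgs group has
 * been emptied and cmt_sched_disabled has been set, the dispatcher simply
 * finds no CMT PGs to consult, so thread placement falls back to the
 * non-CMT policies.
 */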

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate
 * from suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune
 * the PG (and all other instances of the PG's sharing relationship type)
 * from the CMT hierarchy. Further, future instances of that sharing
 * relationship type won't be added. If the grouping definition doesn't
 * originate from suspect sources, then pg_cmt_disable() will be invoked to
 * log an error, and disable CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
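 *
 * For example (an illustrative scenario): suppose a PG in the lineage has
 * an existing ancestor that is not in the lineage (meaning the ancestor
 * won't contain the new CPU). The PG must eventually be promoted above
 * that ancestor, which is only legal if the ancestor holds strictly fewer
 * CPUs than the PG; if it holds the same number or more, validation flags
 * the lineage CMT_LINEAGE_NON_PROMOTABLE (see below).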
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp, *parent;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * The CPU's PG lineage was passed as the first argument to
		 * this routine and contains the sorted list of the CPU's
		 * PGs. Ultimately, the ordering of the PGs in that list, and
		 * the ordering as traversed by the cmt_parent list must be
		 * the same. PG promotion will be used as the mechanism to
		 * achieve this, but first we need to look for cases where
		 * promotion will be necessary, and validate that it will be
		 * possible without violating the subset invariant described
		 * above.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of this CPU's lineage (and therefore should contain
		 * the new CPU). If not, it means that the addition of the
		 * new CPU should have made this PG have more CPUs than its
		 * parent (and other ancestors not in the same lineage) and
		 * it will need to be promoted into place.
		 *
		 * We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		parent = pg->cmt_parent;
		while (parent != NULL) {
			/*
			 * Determine if the parent/ancestor is in this lineage.
			 */
			pg_tmp = NULL;
			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
				pg_tmp = lineage[j];
			}
			if (pg_tmp == parent) {
				/*
				 * It's in the lineage. The concentricity
				 * checks will handle the rest.
				 */
				break;
			}
			/*
			 * If it is not in the lineage, PG will eventually
			 * need to be promoted above it. Verify the ancestor
			 * is a proper subset. There is still an error if
			 * the ancestor has the same number of CPUs as PG,
			 * since that would imply it should be in the lineage,
			 * and we already know it isn't.
			 */
			if (PG_NUM_CPUS((pg_t *)parent) >=
			    PG_NUM_CPUS((pg_t *)pg)) {
				/*
				 * Not a proper subset if the parent/ancestor
				 * has the same or more CPUs than PG.
				 */
				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
				goto handle_error;
			}
			parent = parent->cmt_parent;
		}

		/*
		 * Walk each of the CPUs in the PG's group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or the same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}
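
	/*
	 * An illustrative example of the concentricity check above: if a
	 * PG contains CPUs {0, 1, 2} while the next (at least as large) PG
	 * in the lineage contains {0, 1, 3, 4}, then CPU 2 fails the
	 * pg_cpu_find() test and the lineage is flagged
	 * CMT_LINEAGE_NON_CONCENTRIC.
	 */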

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * If the PG is of a sharing relationship type known to
		 * legitimately span lgroups, specify that no CMT thread
		 * placement policy should be implemented, and prune the PG
		 * from the existing CMT PG hierarchy.
		 *
		 * Otherwise, fall through to the case below for handling.
		 */
		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*LINTED*/
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or until we would have to prune out a level derived from
		 * what we thought was a reliable source, in which case CMT
		 * scheduling is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}