patch setfrontbackdq
--- old/usr/src/uts/common/disp/disp.c
+++ new/usr/src/uts/common/disp/disp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29
30 30 #include <sys/types.h>
31 31 #include <sys/param.h>
32 32 #include <sys/sysmacros.h>
33 33 #include <sys/signal.h>
34 34 #include <sys/user.h>
35 35 #include <sys/systm.h>
36 36 #include <sys/sysinfo.h>
37 37 #include <sys/var.h>
38 38 #include <sys/errno.h>
39 39 #include <sys/cmn_err.h>
40 40 #include <sys/debug.h>
41 41 #include <sys/inline.h>
42 42 #include <sys/disp.h>
43 43 #include <sys/class.h>
44 44 #include <sys/bitmap.h>
45 45 #include <sys/kmem.h>
46 46 #include <sys/cpuvar.h>
47 47 #include <sys/vtrace.h>
48 48 #include <sys/tnf.h>
49 49 #include <sys/cpupart.h>
50 50 #include <sys/lgrp.h>
51 51 #include <sys/pg.h>
52 52 #include <sys/cmt.h>
53 53 #include <sys/bitset.h>
54 54 #include <sys/schedctl.h>
55 55 #include <sys/atomic.h>
56 56 #include <sys/dtrace.h>
57 57 #include <sys/sdt.h>
58 58 #include <sys/archsystm.h>
59 59
60 60 #include <vm/as.h>
61 61
62 62 #define BOUND_CPU 0x1
63 63 #define BOUND_PARTITION 0x2
64 64 #define BOUND_INTR 0x4
65 65
66 66 /* Dispatch queue allocation structure and functions */
67 67 struct disp_queue_info {
68 68 disp_t *dp;
69 69 dispq_t *olddispq;
70 70 dispq_t *newdispq;
71 71 ulong_t *olddqactmap;
72 72 ulong_t *newdqactmap;
73 73 int oldnglobpris;
74 74 };
75 75 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76 76 disp_t *dp);
77 77 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 78 static void disp_dq_free(struct disp_queue_info *dptr);
79 79
80 80 /* platform-specific routine to call when processor is idle */
81 81 static void generic_idle_cpu();
82 82 void (*idle_cpu)() = generic_idle_cpu;
83 83
84 84 /* routines invoked when a CPU enters/exits the idle loop */
85 85 static void idle_enter();
86 86 static void idle_exit();
87 87
88 88 /* platform-specific routine to call when thread is enqueued */
89 89 static void generic_enq_thread(cpu_t *, int);
90 90 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91 91
92 92 pri_t kpreemptpri; /* priority where kernel preemption applies */
93 93 pri_t upreemptpri = 0; /* priority where normal preemption applies */
94 94 pri_t intr_pri; /* interrupt thread priority base level */
95 95
96 96 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
97 97 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
98 98 disp_t cpu0_disp; /* boot CPU's dispatch queue */
99 99 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
100 100 int nswapped; /* total number of swapped threads */
101 101 void disp_swapped_enq(kthread_t *tp);
102 102 static void disp_swapped_setrun(kthread_t *tp);
103 103 static void cpu_resched(cpu_t *cp, pri_t tpri);
104 104
105 105 /*
106 106 * If this is set, only interrupt threads will cause kernel preemptions.
107 107 * This is done by changing the value of kpreemptpri. kpreemptpri
108 108 * will either be the max sysclass pri + 1 or the min interrupt pri.
109 109 */
110 110 int only_intr_kpreempt;
111 111
112 112 extern void set_idle_cpu(int cpun);
113 113 extern void unset_idle_cpu(int cpun);
114 114 static void setkpdq(kthread_t *tp, int borf);
115 115 #define SETKP_BACK 0
116 116 #define SETKP_FRONT 1
117 117 /*
118 118 * Parameter that determines how recently a thread must have run
119 119 * on the CPU to be considered loosely-bound to that CPU to reduce
120 120 * cold cache effects. The interval is in hertz.
121 121 */
122 122 #define RECHOOSE_INTERVAL 3
123 123 int rechoose_interval = RECHOOSE_INTERVAL;
124 124
125 125 /*
126 126 * Parameter that determines how long (in nanoseconds) a thread must
127 127 * be sitting on a run queue before it can be stolen by another CPU
128 128 * to reduce migrations. The interval is in nanoseconds.
129 129 *
130 130 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
131 131 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED
 132 132	 * here indicating it is uninitialized.
133 133 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
134 134 *
135 135 */
136 136 #define NOSTEAL_UNINITIALIZED (-1)
137 137 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
138 138 extern void cmp_set_nosteal_interval(void);
139 139
140 140 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
141 141
142 142 disp_lock_t transition_lock; /* lock on transitioning threads */
143 143 disp_lock_t stop_lock; /* lock on stopped threads */
144 144
145 145 static void cpu_dispqalloc(int numpris);
146 146
147 147 /*
148 148 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
149 149 * a thread because it was sitting on its run queue for a very short
150 150 * period of time.
151 151 */
152 152 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
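
As context for nosteal_nsec and the T_DONTSTEAL sentinel above, here is a minimal sketch of the check the stealing path is expected to perform. It is illustrative only — the real logic lives in disp_getbest(), which is not part of this hunk — and nosteal_too_fresh_sketch() is a hypothetical helper, not an interface in disp.c:

	/*
	 * Sketch only: a stealing CPU compares how long a candidate thread
	 * has been sitting on its run queue against nosteal_nsec; while the
	 * thread is still "too fresh", the caller is expected to give up
	 * and return the T_DONTSTEAL sentinel rather than a thread pointer.
	 */
	static boolean_t
	nosteal_too_fresh_sketch(hrtime_t time_on_queue_nsec)
	{
		/* nosteal_nsec == 0 disables the protection entirely */
		if (nosteal_nsec == 0)
			return (B_FALSE);
		return (time_on_queue_nsec < nosteal_nsec);
	}
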
153 153
154 154 static kthread_t *disp_getwork(cpu_t *to);
155 155 static kthread_t *disp_getbest(disp_t *from);
156 156 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
157 157
158 158 void swtch_to(kthread_t *);
159 159
160 160 /*
161 161 * dispatcher and scheduler initialization
162 162 */
163 163
164 164 /*
165 165 * disp_setup - Common code to calculate and allocate dispatcher
166 166 * variables and structures based on the maximum priority.
167 167 */
168 168 static void
169 169 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
170 170 {
171 171 pri_t newnglobpris;
172 172
173 173 ASSERT(MUTEX_HELD(&cpu_lock));
174 174
175 175 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
176 176
177 177 if (newnglobpris > oldnglobpris) {
178 178 /*
179 179 * Allocate new kp queues for each CPU partition.
180 180 */
181 181 cpupart_kpqalloc(newnglobpris);
182 182
183 183 /*
184 184 * Allocate new dispatch queues for each CPU.
185 185 */
186 186 cpu_dispqalloc(newnglobpris);
187 187
188 188 /*
189 189 * compute new interrupt thread base priority
190 190 */
191 191 intr_pri = maxglobpri;
192 192 if (only_intr_kpreempt) {
193 193 kpreemptpri = intr_pri + 1;
194 194 if (kpqpri == KPQPRI)
195 195 kpqpri = kpreemptpri;
196 196 }
197 197 v.v_nglobpris = newnglobpris;
198 198 }
199 199 }
200 200
201 201 /*
202 202 * dispinit - Called to initialize all loaded classes and the
203 203 * dispatcher framework.
204 204 */
205 205 void
206 206 dispinit(void)
207 207 {
208 208 id_t cid;
209 209 pri_t maxglobpri;
210 210 pri_t cl_maxglobpri;
211 211
212 212 maxglobpri = -1;
213 213
214 214 /*
215 215 * Initialize transition lock, which will always be set.
216 216 */
217 217 DISP_LOCK_INIT(&transition_lock);
218 218 disp_lock_enter_high(&transition_lock);
219 219 DISP_LOCK_INIT(&stop_lock);
220 220
221 221 mutex_enter(&cpu_lock);
222 222 CPU->cpu_disp->disp_maxrunpri = -1;
223 223 CPU->cpu_disp->disp_max_unbound_pri = -1;
224 224
225 225 /*
226 226 * Initialize the default CPU partition.
227 227 */
228 228 cpupart_initialize_default();
229 229 /*
230 230 * Call the class specific initialization functions for
231 231 * all pre-installed schedulers.
232 232 *
233 233 * We pass the size of a class specific parameter
234 234 * buffer to each of the initialization functions
235 235 * to try to catch problems with backward compatibility
236 236 * of class modules.
237 237 *
238 238 * For example a new class module running on an old system
239 239 * which didn't provide sufficiently large parameter buffers
240 240 * would be bad news. Class initialization modules can check for
241 241 * this and take action if they detect a problem.
242 242 */
243 243
244 244 for (cid = 0; cid < nclass; cid++) {
245 245 sclass_t *sc;
246 246
247 247 sc = &sclass[cid];
248 248 if (SCHED_INSTALLED(sc)) {
249 249 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
250 250 &sc->cl_funcs);
251 251 if (cl_maxglobpri > maxglobpri)
252 252 maxglobpri = cl_maxglobpri;
253 253 }
254 254 }
255 255 kpreemptpri = (pri_t)v.v_maxsyspri + 1;
256 256 if (kpqpri == KPQPRI)
257 257 kpqpri = kpreemptpri;
258 258
259 259 ASSERT(maxglobpri >= 0);
260 260 disp_setup(maxglobpri, 0);
261 261
262 262 mutex_exit(&cpu_lock);
263 263
264 264 /*
265 265 * Platform specific sticky scheduler setup.
266 266 */
267 267 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
268 268 cmp_set_nosteal_interval();
269 269
270 270 /*
271 271 * Get the default class ID; this may be later modified via
272 272 * dispadmin(1M). This will load the class (normally TS) and that will
273 273 * call disp_add(), which is why we had to drop cpu_lock first.
274 274 */
275 275 if (getcid(defaultclass, &defaultcid) != 0) {
276 276 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
277 277 defaultclass);
278 278 }
279 279 }
280 280
281 281 /*
282 282 * disp_add - Called with class pointer to initialize the dispatcher
283 283 * for a newly loaded class.
284 284 */
285 285 void
286 286 disp_add(sclass_t *clp)
287 287 {
288 288 pri_t maxglobpri;
289 289 pri_t cl_maxglobpri;
290 290
291 291 mutex_enter(&cpu_lock);
292 292 /*
293 293 * Initialize the scheduler class.
294 294 */
295 295 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
296 296 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
297 297 if (cl_maxglobpri > maxglobpri)
298 298 maxglobpri = cl_maxglobpri;
299 299
300 300 /*
301 301 * Save old queue information. Since we're initializing a
302 302 * new scheduling class which has just been loaded, then
303 303 * the size of the dispq may have changed. We need to handle
304 304 * that here.
305 305 */
306 306 disp_setup(maxglobpri, v.v_nglobpris);
307 307
308 308 mutex_exit(&cpu_lock);
309 309 }
310 310
311 311
312 312 /*
313 313 * For each CPU, allocate new dispatch queues
314 314 * with the stated number of priorities.
315 315 */
316 316 static void
317 317 cpu_dispqalloc(int numpris)
318 318 {
319 319 cpu_t *cpup;
320 320 struct disp_queue_info *disp_mem;
321 321 int i, num;
322 322
323 323 ASSERT(MUTEX_HELD(&cpu_lock));
324 324
325 325 disp_mem = kmem_zalloc(NCPU *
326 326 sizeof (struct disp_queue_info), KM_SLEEP);
327 327
328 328 /*
329 329 * This routine must allocate all of the memory before stopping
330 330 * the cpus because it must not sleep in kmem_alloc while the
331 331 * CPUs are stopped. Locks they hold will not be freed until they
332 332 * are restarted.
333 333 */
334 334 i = 0;
335 335 cpup = cpu_list;
336 336 do {
337 337 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
338 338 i++;
339 339 cpup = cpup->cpu_next;
340 340 } while (cpup != cpu_list);
341 341 num = i;
342 342
343 343 pause_cpus(NULL, NULL);
344 344 for (i = 0; i < num; i++)
345 345 disp_dq_assign(&disp_mem[i], numpris);
346 346 start_cpus();
347 347
348 348 /*
349 349 * I must free all of the memory after starting the cpus because
350 350 * I can not risk sleeping in kmem_free while the cpus are stopped.
351 351 */
352 352 for (i = 0; i < num; i++)
353 353 disp_dq_free(&disp_mem[i]);
354 354
355 355 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
356 356 }
357 357
358 358 static void
359 359 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
360 360 {
361 361 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
362 362 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
363 363 sizeof (long), KM_SLEEP);
364 364 dptr->dp = dp;
365 365 }
366 366
367 367 static void
368 368 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
369 369 {
370 370 disp_t *dp;
371 371
372 372 dp = dptr->dp;
373 373 dptr->olddispq = dp->disp_q;
374 374 dptr->olddqactmap = dp->disp_qactmap;
375 375 dptr->oldnglobpris = dp->disp_npri;
376 376
377 377 ASSERT(dptr->oldnglobpris < numpris);
378 378
379 379 if (dptr->olddispq != NULL) {
380 380 /*
381 381 * Use kcopy because bcopy is platform-specific
382 382 * and could block while we might have paused the cpus.
383 383 */
384 384 (void) kcopy(dptr->olddispq, dptr->newdispq,
385 385 dptr->oldnglobpris * sizeof (dispq_t));
386 386 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
387 387 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
388 388 sizeof (long));
389 389 }
390 390 dp->disp_q = dptr->newdispq;
391 391 dp->disp_qactmap = dptr->newdqactmap;
392 392 dp->disp_q_limit = &dptr->newdispq[numpris];
393 393 dp->disp_npri = numpris;
394 394 }
395 395
396 396 static void
397 397 disp_dq_free(struct disp_queue_info *dptr)
398 398 {
399 399 if (dptr->olddispq != NULL)
400 400 kmem_free(dptr->olddispq,
401 401 dptr->oldnglobpris * sizeof (dispq_t));
402 402 if (dptr->olddqactmap != NULL)
403 403 kmem_free(dptr->olddqactmap,
404 404 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
405 405 }
406 406
407 407 /*
408 408 * For a newly created CPU, initialize the dispatch queue.
409 409 * This is called before the CPU is known through cpu[] or on any lists.
410 410 */
411 411 void
412 412 disp_cpu_init(cpu_t *cp)
413 413 {
414 414 disp_t *dp;
415 415 dispq_t *newdispq;
416 416 ulong_t *newdqactmap;
417 417
418 418 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
419 419
420 420 if (cp == cpu0_disp.disp_cpu)
421 421 dp = &cpu0_disp;
422 422 else
423 423 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
424 424 bzero(dp, sizeof (disp_t));
425 425 cp->cpu_disp = dp;
426 426 dp->disp_cpu = cp;
427 427 dp->disp_maxrunpri = -1;
428 428 dp->disp_max_unbound_pri = -1;
429 429 DISP_LOCK_INIT(&cp->cpu_thread_lock);
430 430 /*
431 431 * Allocate memory for the dispatcher queue headers
432 432 * and the active queue bitmap.
433 433 */
434 434 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
435 435 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
436 436 sizeof (long), KM_SLEEP);
437 437 dp->disp_q = newdispq;
438 438 dp->disp_qactmap = newdqactmap;
439 439 dp->disp_q_limit = &newdispq[v.v_nglobpris];
440 440 dp->disp_npri = v.v_nglobpris;
441 441 }
442 442
443 443 void
444 444 disp_cpu_fini(cpu_t *cp)
445 445 {
446 446 ASSERT(MUTEX_HELD(&cpu_lock));
447 447
448 448 disp_kp_free(cp->cpu_disp);
449 449 if (cp->cpu_disp != &cpu0_disp)
450 450 kmem_free(cp->cpu_disp, sizeof (disp_t));
451 451 }
452 452
453 453 /*
454 454 * Allocate new, larger kpreempt dispatch queue to replace the old one.
455 455 */
456 456 void
457 457 disp_kp_alloc(disp_t *dq, pri_t npri)
458 458 {
459 459 struct disp_queue_info mem_info;
460 460
461 461 if (npri > dq->disp_npri) {
462 462 /*
463 463 * Allocate memory for the new array.
464 464 */
465 465 disp_dq_alloc(&mem_info, npri, dq);
466 466
467 467 /*
468 468 * We need to copy the old structures to the new
469 469 * and free the old.
470 470 */
471 471 disp_dq_assign(&mem_info, npri);
472 472 disp_dq_free(&mem_info);
473 473 }
474 474 }
475 475
476 476 /*
477 477 * Free dispatch queue.
478 478 * Used for the kpreempt queues for a removed CPU partition and
479 479 * for the per-CPU queues of deleted CPUs.
480 480 */
481 481 void
482 482 disp_kp_free(disp_t *dq)
483 483 {
484 484 struct disp_queue_info mem_info;
485 485
486 486 mem_info.olddispq = dq->disp_q;
487 487 mem_info.olddqactmap = dq->disp_qactmap;
488 488 mem_info.oldnglobpris = dq->disp_npri;
489 489 disp_dq_free(&mem_info);
490 490 }
491 491
492 492 /*
493 493 * End dispatcher and scheduler initialization.
494 494 */
495 495
496 496 /*
497 497 * See if there's anything to do other than remain idle.
498 498 * Return non-zero if there is.
499 499 *
500 500 * This function must be called with high spl, or with
501 501 * kernel preemption disabled to prevent the partition's
502 502 * active cpu list from changing while being traversed.
503 503 *
504 504 * This is essentially a simpler version of disp_getwork()
505 505 * to be called by CPUs preparing to "halt".
506 506 */
507 507 int
508 508 disp_anywork(void)
509 509 {
510 510 cpu_t *cp = CPU;
511 511 cpu_t *ocp;
512 512 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
513 513
514 514 if (!(cp->cpu_flags & CPU_OFFLINE)) {
515 515 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
516 516 return (1);
517 517
518 518 for (ocp = cp->cpu_next_part; ocp != cp;
519 519 ocp = ocp->cpu_next_part) {
520 520 ASSERT(CPU_ACTIVE(ocp));
521 521
522 522 /*
523 523 * Something has appeared on the local run queue.
524 524 */
525 525 if (*local_nrunnable > 0)
526 526 return (1);
527 527 /*
528 528 * If we encounter another idle CPU that will
529 529 * soon be trolling around through disp_anywork()
530 530 * terminate our walk here and let this other CPU
531 531 * patrol the next part of the list.
532 532 */
533 533 if (ocp->cpu_dispatch_pri == -1 &&
534 534 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
535 535 return (0);
536 536 /*
537 537 * Work can be taken from another CPU if:
538 538 * - There is unbound work on the run queue
539 539 * - That work isn't a thread undergoing a
540 540 * - context switch on an otherwise empty queue.
541 541 * - The CPU isn't running the idle loop.
542 542 */
543 543 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
544 544 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
545 545 ocp->cpu_disp->disp_nrunnable == 1) &&
546 546 ocp->cpu_dispatch_pri != -1)
547 547 return (1);
548 548 }
549 549 }
550 550 return (0);
551 551 }
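
A brief, hedged illustration of the intended caller of disp_anywork(): a platform halt routine would typically disable kernel preemption (or raise spl), consult disp_anywork(), and enter its low-power wait only when nothing is runnable. The names plat_halt_sketch() and mach_cpu_halt_sketch() below are hypothetical, for illustration only:

	extern void mach_cpu_halt_sketch(void);	/* hypothetical low-power wait */

	static void
	plat_halt_sketch(void)
	{
		kpreempt_disable();		/* keep the active CPU list stable */
		if (!disp_anywork())
			mach_cpu_halt_sketch();	/* nothing runnable; wait until poked */
		kpreempt_enable();
	}
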
552 552
553 553 /*
554 554 * Called when CPU enters the idle loop
555 555 */
556 556 static void
557 557 idle_enter()
558 558 {
559 559 cpu_t *cp = CPU;
560 560
561 561 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
562 562 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
563 563 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
564 564 }
565 565
566 566 /*
567 567 * Called when CPU exits the idle loop
568 568 */
569 569 static void
570 570 idle_exit()
571 571 {
572 572 cpu_t *cp = CPU;
573 573
574 574 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
575 575 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
576 576 }
577 577
578 578 /*
579 579 * Idle loop.
580 580 */
581 581 void
582 582 idle()
583 583 {
584 584 struct cpu *cp = CPU; /* pointer to this CPU */
585 585 kthread_t *t; /* taken thread */
586 586
587 587 idle_enter();
588 588
589 589 /*
590 590 * Uniprocessor version of idle loop.
591 591 * Do this until notified that we're on an actual multiprocessor.
592 592 */
593 593 while (ncpus == 1) {
594 594 if (cp->cpu_disp->disp_nrunnable == 0) {
595 595 (*idle_cpu)();
596 596 continue;
597 597 }
598 598 idle_exit();
599 599 swtch();
600 600
601 601 idle_enter(); /* returned from swtch */
602 602 }
603 603
604 604 /*
605 605 * Multiprocessor idle loop.
606 606 */
607 607 for (;;) {
608 608 /*
609 609 * If CPU is completely quiesced by p_online(2), just wait
610 610 * here with minimal bus traffic until put online.
611 611 */
612 612 while (cp->cpu_flags & CPU_QUIESCED)
613 613 (*idle_cpu)();
614 614
615 615 if (cp->cpu_disp->disp_nrunnable != 0) {
616 616 idle_exit();
617 617 swtch();
618 618 } else {
619 619 if (cp->cpu_flags & CPU_OFFLINE)
620 620 continue;
621 621 if ((t = disp_getwork(cp)) == NULL) {
622 622 if (cp->cpu_chosen_level != -1) {
623 623 disp_t *dp = cp->cpu_disp;
624 624 disp_t *kpq;
625 625
626 626 disp_lock_enter(&dp->disp_lock);
627 627 /*
628 628 * Set kpq under lock to prevent
629 629 * migration between partitions.
630 630 */
631 631 kpq = &cp->cpu_part->cp_kp_queue;
632 632 if (kpq->disp_maxrunpri == -1)
633 633 cp->cpu_chosen_level = -1;
634 634 disp_lock_exit(&dp->disp_lock);
635 635 }
636 636 (*idle_cpu)();
637 637 continue;
638 638 }
639 639 /*
640 640 * If there was a thread but we couldn't steal
641 641 * it, then keep trying.
642 642 */
643 643 if (t == T_DONTSTEAL)
644 644 continue;
645 645 idle_exit();
646 646 swtch_to(t);
647 647 }
648 648 idle_enter(); /* returned from swtch/swtch_to */
649 649 }
650 650 }
651 651
652 652
653 653 /*
654 654 * Preempt the currently running thread in favor of the highest
655 655 * priority thread. The class of the current thread controls
656 656 * where it goes on the dispatcher queues. If panicking, turn
657 657 * preemption off.
658 658 */
659 659 void
660 660 preempt()
661 661 {
662 662 kthread_t *t = curthread;
663 663 klwp_t *lwp = ttolwp(curthread);
664 664
665 665 if (panicstr)
666 666 return;
667 667
668 668 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
669 669
670 670 thread_lock(t);
671 671
672 672 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
673 673 /*
674 674 * this thread has already been chosen to be run on
675 675 * another CPU. Clear kprunrun on this CPU since we're
676 676 * already headed for swtch().
677 677 */
678 678 CPU->cpu_kprunrun = 0;
679 679 thread_unlock_nopreempt(t);
680 680 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
681 681 } else {
682 682 if (lwp != NULL)
683 683 lwp->lwp_ru.nivcsw++;
684 684 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
685 685 THREAD_TRANSITION(t);
686 686 CL_PREEMPT(t);
687 687 DTRACE_SCHED(preempt);
688 688 thread_unlock_nopreempt(t);
689 689
690 690 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
691 691
692 692 swtch(); /* clears CPU->cpu_runrun via disp() */
693 693 }
694 694 }
695 695
696 696 extern kthread_t *thread_unpin();
697 697
698 698 /*
699 699 * disp() - find the highest priority thread for this processor to run, and
700 700 * set it in TS_ONPROC state so that resume() can be called to run it.
701 701 */
702 702 static kthread_t *
703 703 disp()
704 704 {
705 705 cpu_t *cpup;
706 706 disp_t *dp;
707 707 kthread_t *tp;
708 708 dispq_t *dq;
709 709 int maxrunword;
710 710 pri_t pri;
711 711 disp_t *kpq;
712 712
713 713 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
714 714
715 715 cpup = CPU;
716 716 /*
717 717 * Find the highest priority loaded, runnable thread.
718 718 */
719 719 dp = cpup->cpu_disp;
720 720
721 721 reschedule:
722 722 /*
723 723 * If there is more important work on the global queue with a better
724 724 * priority than the maximum on this CPU, take it now.
725 725 */
726 726 kpq = &cpup->cpu_part->cp_kp_queue;
727 727 while ((pri = kpq->disp_maxrunpri) >= 0 &&
728 728 pri >= dp->disp_maxrunpri &&
729 729 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
730 730 (tp = disp_getbest(kpq)) != NULL) {
731 731 if (disp_ratify(tp, kpq) != NULL) {
732 732 TRACE_1(TR_FAC_DISP, TR_DISP_END,
733 733 "disp_end:tid %p", tp);
734 734 return (tp);
735 735 }
736 736 }
737 737
738 738 disp_lock_enter(&dp->disp_lock);
739 739 pri = dp->disp_maxrunpri;
740 740
741 741 /*
742 742 * If there is nothing to run, look at what's runnable on other queues.
743 743 * Choose the idle thread if the CPU is quiesced.
744 744 * Note that CPUs that have the CPU_OFFLINE flag set can still run
745 745 * interrupt threads, which will be the only threads on the CPU's own
746 746 * queue, but cannot run threads from other queues.
747 747 */
748 748 if (pri == -1) {
749 749 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
750 750 disp_lock_exit(&dp->disp_lock);
751 751 if ((tp = disp_getwork(cpup)) == NULL ||
752 752 tp == T_DONTSTEAL) {
753 753 tp = cpup->cpu_idle_thread;
754 754 (void) splhigh();
755 755 THREAD_ONPROC(tp, cpup);
756 756 cpup->cpu_dispthread = tp;
757 757 cpup->cpu_dispatch_pri = -1;
758 758 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
759 759 cpup->cpu_chosen_level = -1;
760 760 }
761 761 } else {
762 762 disp_lock_exit_high(&dp->disp_lock);
763 763 tp = cpup->cpu_idle_thread;
764 764 THREAD_ONPROC(tp, cpup);
765 765 cpup->cpu_dispthread = tp;
766 766 cpup->cpu_dispatch_pri = -1;
767 767 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
768 768 cpup->cpu_chosen_level = -1;
769 769 }
770 770 TRACE_1(TR_FAC_DISP, TR_DISP_END,
771 771 "disp_end:tid %p", tp);
772 772 return (tp);
773 773 }
774 774
775 775 dq = &dp->disp_q[pri];
776 776 tp = dq->dq_first;
777 777
778 778 ASSERT(tp != NULL);
779 779 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */
780 780
781 781 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
782 782
783 783 /*
784 784 * Found it so remove it from queue.
785 785 */
786 786 dp->disp_nrunnable--;
787 787 dq->dq_sruncnt--;
788 788 if ((dq->dq_first = tp->t_link) == NULL) {
789 789 ulong_t *dqactmap = dp->disp_qactmap;
790 790
791 791 ASSERT(dq->dq_sruncnt == 0);
792 792 dq->dq_last = NULL;
793 793
794 794 /*
795 795 * The queue is empty, so the corresponding bit needs to be
796 796 * turned off in dqactmap. If nrunnable != 0 just took the
797 797 * last runnable thread off the
798 798 * highest queue, so recompute disp_maxrunpri.
799 799 */
800 800 maxrunword = pri >> BT_ULSHIFT;
801 801 dqactmap[maxrunword] &= ~BT_BIW(pri);
802 802
803 803 if (dp->disp_nrunnable == 0) {
804 804 dp->disp_max_unbound_pri = -1;
805 805 dp->disp_maxrunpri = -1;
806 806 } else {
807 807 int ipri;
808 808
809 809 ipri = bt_gethighbit(dqactmap, maxrunword);
810 810 dp->disp_maxrunpri = ipri;
811 811 if (ipri < dp->disp_max_unbound_pri)
812 812 dp->disp_max_unbound_pri = ipri;
813 813 }
814 814 } else {
815 815 tp->t_link = NULL;
816 816 }
817 817
818 818 /*
819 819 * Set TS_DONT_SWAP flag to prevent another processor from swapping
820 820 * out this thread before we have a chance to run it.
821 821 * While running, it is protected against swapping by t_lock.
822 822 */
823 823 tp->t_schedflag |= TS_DONT_SWAP;
824 824 cpup->cpu_dispthread = tp; /* protected by spl only */
825 825 cpup->cpu_dispatch_pri = pri;
826 826 ASSERT(pri == DISP_PRIO(tp));
827 827 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
828 828 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
829 829
830 830 ASSERT(tp != NULL);
831 831 TRACE_1(TR_FAC_DISP, TR_DISP_END,
832 832 "disp_end:tid %p", tp);
833 833
834 834 if (disp_ratify(tp, kpq) == NULL)
835 835 goto reschedule;
836 836
837 837 return (tp);
838 838 }
839 839
840 840 /*
841 841 * swtch()
842 842 * Find best runnable thread and run it.
843 843 * Called with the current thread already switched to a new state,
844 844 * on a sleep queue, run queue, stopped, and not zombied.
845 845 * May be called at any spl level less than or equal to LOCK_LEVEL.
846 846 * Always drops spl to the base level (spl0()).
847 847 */
848 848 void
849 849 swtch()
850 850 {
851 851 kthread_t *t = curthread;
852 852 kthread_t *next;
853 853 cpu_t *cp;
854 854
855 855 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
856 856
857 857 if (t->t_flag & T_INTR_THREAD)
858 858 cpu_intr_swtch_enter(t);
859 859
860 860 if (t->t_intr != NULL) {
861 861 /*
862 862 * We are an interrupt thread. Setup and return
863 863 * the interrupted thread to be resumed.
864 864 */
865 865 (void) splhigh(); /* block other scheduler action */
866 866 cp = CPU; /* now protected against migration */
867 867 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
868 868 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
869 869 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
870 870 next = thread_unpin();
871 871 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
872 872 resume_from_intr(next);
873 873 } else {
874 874 #ifdef DEBUG
875 875 if (t->t_state == TS_ONPROC &&
876 876 t->t_disp_queue->disp_cpu == CPU &&
877 877 t->t_preempt == 0) {
878 878 thread_lock(t);
879 879 ASSERT(t->t_state != TS_ONPROC ||
880 880 t->t_disp_queue->disp_cpu != CPU ||
881 881 t->t_preempt != 0); /* cannot migrate */
882 882 thread_unlock_nopreempt(t);
883 883 }
884 884 #endif /* DEBUG */
885 885 cp = CPU;
886 886 next = disp(); /* returns with spl high */
887 887 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
888 888
889 889 /* OK to steal anything left on run queue */
890 890 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
891 891
892 892 if (next != t) {
893 893 hrtime_t now;
894 894
895 895 now = gethrtime_unscaled();
896 896 pg_ev_thread_swtch(cp, now, t, next);
897 897
898 898 /*
899 899 * If t was previously in the TS_ONPROC state,
900 900 * setfrontdq and setbackdq won't have set its t_waitrq.
901 901 * Since we now finally know that we're switching away
902 902 * from this thread, set its t_waitrq if it is on a run
903 903 * queue.
904 904 */
905 905 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
906 906 t->t_waitrq = now;
907 907 }
908 908
909 909 /*
910 910 * restore mstate of thread that we are switching to
911 911 */
912 912 restore_mstate(next);
913 913
914 914 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
915 915 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
916 916 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
917 917
918 918 if (dtrace_vtime_active)
919 919 dtrace_vtime_switch(next);
920 920
921 921 resume(next);
922 922 /*
923 923 * The TR_RESUME_END and TR_SWTCH_END trace points
924 924 * appear at the end of resume(), because we may not
925 925 * return here
926 926 */
927 927 } else {
928 928 if (t->t_flag & T_INTR_THREAD)
929 929 cpu_intr_swtch_exit(t);
930 930 /*
931 931 * Threads that enqueue themselves on a run queue defer
932 932 * setting t_waitrq. It is then either set in swtch()
933 933 * when the CPU is actually yielded, or not at all if it
934 934 * is remaining on the CPU.
935 935 * There is however a window between where the thread
936 936 * placed itself on a run queue, and where it selects
937 937 * itself in disp(), where a third party (eg. clock()
938 938 * doing tick processing) may have re-enqueued this
939 939 * thread, setting t_waitrq in the process. We detect
940 940 * this race by noticing that despite switching to
941 941 * ourself, our t_waitrq has been set, and should be
942 942 * cleared.
943 943 */
944 944 if (t->t_waitrq != 0)
945 945 t->t_waitrq = 0;
946 946
947 947 pg_ev_thread_remain(cp, t);
948 948
949 949 DTRACE_SCHED(remain__cpu);
950 950 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
951 951 (void) spl0();
952 952 }
953 953 }
954 954 }
955 955
956 956 /*
957 957 * swtch_from_zombie()
958 958 * Special case of swtch(), which allows checks for TS_ZOMB to be
959 959 * eliminated from normal resume.
960 960 * Find best runnable thread and run it.
961 961 * Called with the current thread zombied.
962 962 * Zombies cannot migrate, so CPU references are safe.
963 963 */
964 964 void
965 965 swtch_from_zombie()
966 966 {
967 967 kthread_t *next;
968 968 cpu_t *cpu = CPU;
969 969
970 970 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
971 971
972 972 ASSERT(curthread->t_state == TS_ZOMB);
973 973
974 974 next = disp(); /* returns with spl high */
975 975 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
976 976 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
977 977 ASSERT(next != curthread);
978 978 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
979 979
980 980 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
981 981
982 982 restore_mstate(next);
983 983
984 984 if (dtrace_vtime_active)
985 985 dtrace_vtime_switch(next);
986 986
987 987 resume_from_zombie(next);
988 988 /*
989 989 * The TR_RESUME_END and TR_SWTCH_END trace points
990 990 * appear at the end of resume(), because we certainly will not
991 991 * return here
992 992 */
993 993 }
994 994
995 995 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
996 996
997 997 /*
998 998 * search_disp_queues()
999 999 * Search the given dispatch queues for thread tp.
1000 1000 * Return 1 if tp is found, otherwise return 0.
1001 1001 */
1002 1002 static int
1003 1003 search_disp_queues(disp_t *dp, kthread_t *tp)
1004 1004 {
1005 1005 dispq_t *dq;
1006 1006 dispq_t *eq;
1007 1007
1008 1008 disp_lock_enter_high(&dp->disp_lock);
1009 1009
1010 1010 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011 1011 kthread_t *rp;
1012 1012
1013 1013 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1014 1014
1015 1015 for (rp = dq->dq_first; rp; rp = rp->t_link)
1016 1016 if (tp == rp) {
1017 1017 disp_lock_exit_high(&dp->disp_lock);
1018 1018 return (1);
1019 1019 }
1020 1020 }
1021 1021 disp_lock_exit_high(&dp->disp_lock);
1022 1022
1023 1023 return (0);
1024 1024 }
1025 1025
1026 1026 /*
1027 1027 * thread_on_queue()
1028 1028 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1029 1029 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1030 1030 */
1031 1031 static int
1032 1032 thread_on_queue(kthread_t *tp)
1033 1033 {
1034 1034 cpu_t *cp;
1035 1035 struct cpupart *part;
1036 1036
1037 1037 ASSERT(getpil() >= DISP_LEVEL);
1038 1038
1039 1039 /*
1040 1040 * Search the per-CPU dispatch queues for tp.
1041 1041 */
1042 1042 cp = CPU;
1043 1043 do {
1044 1044 if (search_disp_queues(cp->cpu_disp, tp))
1045 1045 return (1);
1046 1046 } while ((cp = cp->cpu_next_onln) != CPU);
1047 1047
1048 1048 /*
1049 1049 * Search the partition-wide kpreempt queues for tp.
1050 1050 */
1051 1051 part = CPU->cpu_part;
1052 1052 do {
1053 1053 if (search_disp_queues(&part->cp_kp_queue, tp))
1054 1054 return (1);
1055 1055 } while ((part = part->cp_next) != CPU->cpu_part);
1056 1056
1057 1057 return (0);
1058 1058 }
1059 1059
1060 1060 #else
1061 1061
1062 1062 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1063 1063
1064 1064 #endif /* DEBUG */
1065 1065
1066 1066 /*
1067 1067 * like swtch(), but switch to a specified thread taken from another CPU.
1068 1068 * called with spl high..
1069 1069 */
1070 1070 void
1071 1071 swtch_to(kthread_t *next)
1072 1072 {
1073 1073 cpu_t *cp = CPU;
1074 1074 hrtime_t now;
1075 1075
1076 1076 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1077 1077
1078 1078 /*
1079 1079 * Update context switch statistics.
1080 1080 */
1081 1081 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1082 1082
1083 1083 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1084 1084
1085 1085 now = gethrtime_unscaled();
1086 1086 pg_ev_thread_swtch(cp, now, curthread, next);
1087 1087
1088 1088 /* OK to steal anything left on run queue */
1089 1089 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1090 1090
1091 1091 /* record last execution time */
1092 1092 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1093 1093
1094 1094 /*
1095 1095 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1096 1096 * won't have set its t_waitrq. Since we now finally know that we're
1097 1097 * switching away from this thread, set its t_waitrq if it is on a run
1098 1098 * queue.
1099 1099 */
1100 1100 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101 1101 curthread->t_waitrq = now;
1102 1102 }
1103 1103
1104 1104 /* restore next thread to previously running microstate */
1105 1105 restore_mstate(next);
1106 1106
1107 1107 if (dtrace_vtime_active)
1108 1108 dtrace_vtime_switch(next);
1109 1109
1110 1110 resume(next);
1111 1111 /*
1112 1112 * The TR_RESUME_END and TR_SWTCH_END trace points
1113 1113 * appear at the end of resume(), because we may not
1114 1114 * return here
1115 1115 */
1116 1116 }
1117 1117
1118 1118 #define CPU_IDLING(pri) ((pri) == -1)
1119 1119
1120 1120 static void
1121 1121 cpu_resched(cpu_t *cp, pri_t tpri)
1122 1122 {
1123 1123 int call_poke_cpu = 0;
1124 1124 pri_t cpupri = cp->cpu_dispatch_pri;
1125 1125
1126 1126 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1127 1127 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128 1128 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129 1129 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130 1130 cp->cpu_runrun = 1;
1131 1131 aston(cp->cpu_dispthread);
1132 1132 if (tpri < kpreemptpri && cp != CPU)
1133 1133 call_poke_cpu = 1;
1134 1134 }
1135 1135 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136 1136 cp->cpu_kprunrun = 1;
1137 1137 if (cp != CPU)
1138 1138 call_poke_cpu = 1;
1139 1139 }
1140 1140 }
1141 1141
1142 1142 /*
1143 1143 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1144 1144 */
1145 1145 membar_enter();
1146 1146
1147 1147 if (call_poke_cpu)
1148 1148 poke_cpu(cp->cpu_id);
1149 1149 }
1150 1150
1151 1151 /*
1152 1152 * setbackdq() keeps runqs balanced such that the difference in length
1153 1153 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1154 1154 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1155 1155 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156 1156 * try to keep runqs perfectly balanced regardless of the thread priority.
1157 1157 */
1158 1158 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1159 1159 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1160 1160 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1161 1161
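
To make the balance rule above easier to follow before it reappears inside the refactored enqueue path below, here is a sketch that restates it as a standalone predicate; runq_prefer_neighbour_sketch() is a hypothetical helper written for illustration, not code added by this patch:

	/*
	 * Sketch: returns B_TRUE when the neighbouring CPU's queue at this
	 * priority is enough shorter that the thread should be placed there.
	 * Threads at or above RUNQ_MATCH_PRI without TS_RUNQMATCH tolerate a
	 * skew of up to RUNQ_MAX_DIFF; lower-priority (or TS_RUNQMATCH)
	 * threads require the queue lengths to match.
	 */
	static boolean_t
	runq_prefer_neighbour_sketch(kthread_t *tp, cpu_t *cp, cpu_t *newcp,
	    pri_t tpri)
	{
		int qlen = RUNQ_LEN(cp, tpri);

		if (tpri >= RUNQ_MATCH_PRI && !(tp->t_schedflag & TS_RUNQMATCH))
			qlen -= RUNQ_MAX_DIFF;

		return (qlen > 0 && RUNQ_LEN(newcp, tpri) < qlen);
	}
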
1162 1162 /*
1163 1163 * Macro that evaluates to true if it is likely that the thread has cache
1164 1164 * warmth. This is based on the amount of time that has elapsed since the
1165 1165 * thread last ran. If that amount of time is less than "rechoose_interval"
1166 1166 * ticks, then we decide that the thread has enough cache warmth to warrant
1167 1167 * some affinity for t->t_cpu.
1168 1168 */
1169 1169 #define THREAD_HAS_CACHE_WARMTH(thread) \
1170 1170 ((thread == curthread) || \
1171 1171 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
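
For a sense of scale: rechoose_interval defaults to RECHOOSE_INTERVAL (3) lbolt ticks, so assuming a hypothetical 100 Hz clock (hz = 100, i.e. 10 ms per tick), a thread is treated as cache-warm for roughly 30 ms after it last ran on a CPU.
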
1172 +
1172 1173 /*
1173 - * Put the specified thread on the back of the dispatcher
1174 - * queue corresponding to its current priority.
1174 + * Put the specified thread on the front/back of the dispatcher queue
1175 + * corresponding to its current priority.
1175 1176 *
1176 - * Called with the thread in transition, onproc or stopped state
1177 - * and locked (transition implies locked) and at high spl.
1178 - * Returns with the thread in TS_RUN state and still locked.
1177 + * Called with the thread in transition, onproc or stopped state and locked
1178 + * (transition implies locked) and at high spl. Returns with the thread in
1179 + * TS_RUN state and still locked.
1179 1180 */
1180 -void
1181 -setbackdq(kthread_t *tp)
1181 +static void
1182 +setfrontbackdq(kthread_t *tp, boolean_t front)
1182 1183 {
1183 - dispq_t *dq;
1184 + dispq_t *dq;
1184 1185 disp_t *dp;
1185 1186 cpu_t *cp;
1186 1187 pri_t tpri;
1187 - int bound;
1188 + boolean_t bound;
1188 1189 boolean_t self;
1189 1190
1190 1191 ASSERT(THREAD_LOCK_HELD(tp));
1191 1192 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1192 1193 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1193 1194
1194 1195 /*
1195 1196 * If thread is "swapped" or on the swap queue don't
1196 1197 * queue it, but wake sched.
1197 1198 */
1198 1199 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1199 1200 disp_swapped_setrun(tp);
1200 1201 return;
1201 1202 }
1202 1203
1203 - self = (tp == curthread);
1204 -
1205 - if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1206 - bound = 1;
1207 - else
1208 - bound = 0;
1204 + self = (tp == curthread);
1205 + bound = (tp->t_bound_cpu || tp->t_weakbound_cpu);
1209 1206
1210 1207 tpri = DISP_PRIO(tp);
1211 1208 if (ncpus == 1)
1212 1209 cp = tp->t_cpu;
1213 1210 else if (!bound) {
1214 1211 if (tpri >= kpqpri) {
1215 - setkpdq(tp, SETKP_BACK);
1212 + setkpdq(tp, front ? SETKP_FRONT : SETKP_BACK);
1216 1213 return;
1217 1214 }
1218 1215
1219 - /*
1220 - * We'll generally let this thread continue to run where
1221 - * it last ran...but will consider migration if:
1222 - * - We thread probably doesn't have much cache warmth.
1223 - * - The CPU where it last ran is the target of an offline
1224 - * request.
1225 - * - The thread last ran outside it's home lgroup.
1226 - */
1227 - if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228 - (tp->t_cpu == cpu_inmotion)) {
1229 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230 - } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232 - self ? tp->t_cpu : NULL);
1233 - } else {
1234 - cp = tp->t_cpu;
1235 - }
1236 -
1237 - if (tp->t_cpupart == cp->cpu_part) {
1238 - int qlen;
1216 + cp = tp->t_cpu;
1239 1217
1218 + if (!front) {
1240 1219 /*
1241 - * Perform any CMT load balancing
1220 + * We'll generally let this thread continue to run where
1221 + * it last ran...but will consider migration if:
1222 +		 * - The thread probably doesn't have much cache warmth.
1223 + * - The CPU where it last ran is the target of an offline
1224 + * request.
1225 +		 * - The thread last ran outside its home lgroup.
1242 1226 */
1243 - cp = cmt_balance(tp, cp);
1227 + if ((!THREAD_HAS_CACHE_WARMTH(tp)) || (cp == cpu_inmotion)) {
1228 + cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri, NULL);
1229 + } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) {
1230 + cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1231 + self ? cp : NULL);
1232 + }
1244 1233
1245 - /*
1246 - * Balance across the run queues
1247 - */
1248 - qlen = RUNQ_LEN(cp, tpri);
1249 - if (tpri >= RUNQ_MATCH_PRI &&
1250 - !(tp->t_schedflag & TS_RUNQMATCH))
1251 - qlen -= RUNQ_MAX_DIFF;
1252 - if (qlen > 0) {
1253 - cpu_t *newcp;
1254 -
1255 - if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256 - newcp = cp->cpu_next_part;
1257 - } else if ((newcp = cp->cpu_next_lpl) == cp) {
1258 - newcp = cp->cpu_next_part;
1234 + }
1235 +
1236 + if (tp->t_cpupart == cp->cpu_part) {
1237 + if (front) {
1238 + /*
1239 + * We'll generally let this thread continue to run
1240 + * where it last ran, but will consider migration if:
1241 +			 * - The thread last ran outside its home lgroup.
1242 + * - The CPU where it last ran is the target of an
1243 + * offline request (a thread_nomigrate() on the in
1244 + * motion CPU relies on this when forcing a preempt).
1245 + * - The thread isn't the highest priority thread where
1246 + * it last ran, and it is considered not likely to
1247 + * have significant cache warmth.
1248 + */
1249 + if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1250 + (cp == cpu_inmotion)) {
1251 + cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1252 + self ? cp : NULL);
1253 + } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1254 + (!THREAD_HAS_CACHE_WARMTH(tp))) {
1255 + cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1256 + NULL);
1259 1257 }
1258 + } else {
1259 + int qlen;
1260 1260
1261 - if (RUNQ_LEN(newcp, tpri) < qlen) {
1262 - DTRACE_PROBE3(runq__balance,
1263 - kthread_t *, tp,
1264 - cpu_t *, cp, cpu_t *, newcp);
1265 - cp = newcp;
1261 + /*
1262 + * Perform any CMT load balancing
1263 + */
1264 + cp = cmt_balance(tp, cp);
1265 +
1266 + /*
1267 + * Balance across the run queues
1268 + */
1269 + qlen = RUNQ_LEN(cp, tpri);
1270 + if (tpri >= RUNQ_MATCH_PRI &&
1271 + !(tp->t_schedflag & TS_RUNQMATCH))
1272 + qlen -= RUNQ_MAX_DIFF;
1273 + if (qlen > 0) {
1274 + cpu_t *newcp;
1275 +
1276 + if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1277 + newcp = cp->cpu_next_part;
1278 + } else if ((newcp = cp->cpu_next_lpl) == cp) {
1279 + newcp = cp->cpu_next_part;
1280 + }
1281 +
1282 + if (RUNQ_LEN(newcp, tpri) < qlen) {
1283 + DTRACE_PROBE3(runq__balance,
1284 + kthread_t *, tp,
1285 + cpu_t *, cp, cpu_t *, newcp);
1286 + cp = newcp;
1287 + }
1266 1288 }
1267 1289 }
1268 1290 } else {
1269 1291 /*
1270 1292 * Migrate to a cpu in the new partition.
1271 1293 */
1272 1294 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273 1295 tp->t_lpl, tp->t_pri, NULL);
1274 1296 }
1297 +
1275 1298 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276 1299 } else {
1277 1300 /*
1278 1301 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279 1302 * a short time until weak binding that existed when the
1280 1303 * strong binding was established has dropped) so we must
1281 1304 * favour weak binding over strong.
1282 1305 */
1283 1306 cp = tp->t_weakbound_cpu ?
1284 1307 tp->t_weakbound_cpu : tp->t_bound_cpu;
1285 1308 }
1309 +
1286 1310 /*
1287 1311 * A thread that is ONPROC may be temporarily placed on the run queue
1288 1312 * but then chosen to run again by disp. If the thread we're placing on
1289 1313 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1290 1314 * replacement process is actually scheduled in swtch(). In this
1291 1315 * situation, curthread is the only thread that could be in the ONPROC
1292 1316 * state.
1293 1317 */
1294 1318 if ((!self) && (tp->t_waitrq == 0)) {
1295 1319 hrtime_t curtime;
1296 1320
1297 1321 curtime = gethrtime_unscaled();
1298 1322 (void) cpu_update_pct(tp, curtime);
1299 1323 tp->t_waitrq = curtime;
1300 1324 } else {
1301 1325 (void) cpu_update_pct(tp, gethrtime_unscaled());
1302 1326 }
1303 1327
1304 1328 dp = cp->cpu_disp;
1305 1329 disp_lock_enter_high(&dp->disp_lock);
1306 1330
1307 - DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1308 - TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1309 - tpri, cp, tp);
1331 + DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, front);
1332 + if (front) {
1333 + TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri,
1334 + tp);
1335 + } else {
1336 + TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1337 + tpri, cp, tp);
1338 + }
1310 1339
1311 1340 #ifndef NPROBE
1312 1341 /* Kernel probe */
1313 1342 if (tnf_tracing_active)
1314 1343 tnf_thread_queue(tp, cp, tpri);
1315 1344 #endif /* NPROBE */
1316 1345
1317 1346 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1318 1347
1319 1348 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1320 1349 tp->t_disp_queue = dp;
1321 1350 tp->t_link = NULL;
1322 1351
1323 1352 dq = &dp->disp_q[tpri];
1324 1353 dp->disp_nrunnable++;
1325 1354 if (!bound)
1326 1355 dp->disp_steal = 0;
1327 1356 membar_enter();
1328 1357
1329 1358 if (dq->dq_sruncnt++ != 0) {
1330 - ASSERT(dq->dq_first != NULL);
1331 - dq->dq_last->t_link = tp;
1332 - dq->dq_last = tp;
1359 + if (front) {
1360 + ASSERT(dq->dq_last != NULL);
1361 + tp->t_link = dq->dq_first;
1362 + dq->dq_first = tp;
1363 + } else {
1364 + ASSERT(dq->dq_first != NULL);
1365 + dq->dq_last->t_link = tp;
1366 + dq->dq_last = tp;
1367 + }
1333 1368 } else {
1334 1369 ASSERT(dq->dq_first == NULL);
1335 1370 ASSERT(dq->dq_last == NULL);
1336 1371 dq->dq_first = dq->dq_last = tp;
1337 1372 BT_SET(dp->disp_qactmap, tpri);
1338 1373 if (tpri > dp->disp_maxrunpri) {
1339 1374 dp->disp_maxrunpri = tpri;
1340 1375 membar_enter();
1341 1376 cpu_resched(cp, tpri);
1342 1377 }
1343 1378 }
1344 1379
1345 1380 if (!bound && tpri > dp->disp_max_unbound_pri) {
1346 1381 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1347 1382 /*
1348 1383 * If there are no other unbound threads on the
1349 1384 * run queue, don't allow other CPUs to steal
1350 1385 * this thread while we are in the middle of a
1351 1386 * context switch. We may just switch to it
1352 1387 * again right away. CPU_DISP_DONTSTEAL is cleared
1353 1388 * in swtch and swtch_to.
1354 1389 */
1355 1390 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1356 1391 }
1357 1392 dp->disp_max_unbound_pri = tpri;
1358 1393 }
1394 +
1359 1395 (*disp_enq_thread)(cp, bound);
1360 1396 }
1361 1397
1362 1398 /*
1399 + * Put the specified thread on the back of the dispatcher
1400 + * queue corresponding to its current priority.
1401 + *
1402 + * Called with the thread in transition, onproc or stopped state
1403 + * and locked (transition implies locked) and at high spl.
1404 + * Returns with the thread in TS_RUN state and still locked.
1405 + */
1406 +void
1407 +setbackdq(kthread_t *tp)
1408 +{
1409 + setfrontbackdq(tp, B_FALSE);
1410 +}
1411 +
1412 +/*
1363 1413 * Put the specified thread on the front of the dispatcher
1364 1414 * queue corresponding to its current priority.
1365 1415 *
1366 1416 * Called with the thread in transition, onproc or stopped state
1367 1417 * and locked (transition implies locked) and at high spl.
1368 1418 * Returns with the thread in TS_RUN state and still locked.
1369 1419 */
1370 1420 void
1371 1421 setfrontdq(kthread_t *tp)
1372 1422 {
1373 - disp_t *dp;
1374 - dispq_t *dq;
1375 - cpu_t *cp;
1376 - pri_t tpri;
1377 - int bound;
1378 -
1379 - ASSERT(THREAD_LOCK_HELD(tp));
1380 - ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1381 - ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1382 -
1383 - /*
1384 - * If thread is "swapped" or on the swap queue don't
1385 - * queue it, but wake sched.
1386 - */
1387 - if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1388 - disp_swapped_setrun(tp);
1389 - return;
1390 - }
1391 -
1392 - if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1393 - bound = 1;
1394 - else
1395 - bound = 0;
1396 -
1397 - tpri = DISP_PRIO(tp);
1398 - if (ncpus == 1)
1399 - cp = tp->t_cpu;
1400 - else if (!bound) {
1401 - if (tpri >= kpqpri) {
1402 - setkpdq(tp, SETKP_FRONT);
1403 - return;
1404 - }
1405 - cp = tp->t_cpu;
1406 - if (tp->t_cpupart == cp->cpu_part) {
1407 - /*
1408 - * We'll generally let this thread continue to run
1409 - * where it last ran, but will consider migration if:
1410 - * - The thread last ran outside it's home lgroup.
1411 - * - The CPU where it last ran is the target of an
1412 - * offline request (a thread_nomigrate() on the in
1413 - * motion CPU relies on this when forcing a preempt).
1414 - * - The thread isn't the highest priority thread where
1415 - * it last ran, and it is considered not likely to
1416 - * have significant cache warmth.
1417 - */
1418 - if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419 - (cp == cpu_inmotion)) {
1420 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421 - (tp == curthread) ? cp : NULL);
1422 - } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423 - (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425 - NULL);
1426 - }
1427 - } else {
1428 - /*
1429 - * Migrate to a cpu in the new partition.
1430 - */
1431 - cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432 - tp->t_lpl, tp->t_pri, NULL);
1433 - }
1434 - ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435 - } else {
1436 - /*
1437 - * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438 - * a short time until weak binding that existed when the
1439 - * strong binding was established has dropped) so we must
1440 - * favour weak binding over strong.
1441 - */
1442 - cp = tp->t_weakbound_cpu ?
1443 - tp->t_weakbound_cpu : tp->t_bound_cpu;
1444 - }
1445 -
1446 - /*
1447 - * A thread that is ONPROC may be temporarily placed on the run queue
1448 - * but then chosen to run again by disp. If the thread we're placing on
1449 - * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1450 - * replacement process is actually scheduled in swtch(). In this
1451 - * situation, curthread is the only thread that could be in the ONPROC
1452 - * state.
1453 - */
1454 - if ((tp != curthread) && (tp->t_waitrq == 0)) {
1455 - hrtime_t curtime;
1456 -
1457 - curtime = gethrtime_unscaled();
1458 - (void) cpu_update_pct(tp, curtime);
1459 - tp->t_waitrq = curtime;
1460 - } else {
1461 - (void) cpu_update_pct(tp, gethrtime_unscaled());
1462 - }
1463 -
1464 - dp = cp->cpu_disp;
1465 - disp_lock_enter_high(&dp->disp_lock);
1466 -
1467 - TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1468 - DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1469 -
1470 -#ifndef NPROBE
1471 - /* Kernel probe */
1472 - if (tnf_tracing_active)
1473 - tnf_thread_queue(tp, cp, tpri);
1474 -#endif /* NPROBE */
1475 -
1476 - ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1477 -
1478 - THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */
1479 - tp->t_disp_queue = dp;
1480 -
1481 - dq = &dp->disp_q[tpri];
1482 - dp->disp_nrunnable++;
1483 - if (!bound)
1484 - dp->disp_steal = 0;
1485 - membar_enter();
1486 -
1487 - if (dq->dq_sruncnt++ != 0) {
1488 - ASSERT(dq->dq_last != NULL);
1489 - tp->t_link = dq->dq_first;
1490 - dq->dq_first = tp;
1491 - } else {
1492 - ASSERT(dq->dq_last == NULL);
1493 - ASSERT(dq->dq_first == NULL);
1494 - tp->t_link = NULL;
1495 - dq->dq_first = dq->dq_last = tp;
1496 - BT_SET(dp->disp_qactmap, tpri);
1497 - if (tpri > dp->disp_maxrunpri) {
1498 - dp->disp_maxrunpri = tpri;
1499 - membar_enter();
1500 - cpu_resched(cp, tpri);
1501 - }
1502 - }
1503 -
1504 - if (!bound && tpri > dp->disp_max_unbound_pri) {
1505 - if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1506 - cp == CPU) {
1507 - /*
1508 - * If there are no other unbound threads on the
1509 - * run queue, don't allow other CPUs to steal
1510 - * this thread while we are in the middle of a
1511 - * context switch. We may just switch to it
1512 - * again right away. CPU_DISP_DONTSTEAL is cleared
1513 - * in swtch and swtch_to.
1514 - */
1515 - cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1516 - }
1517 - dp->disp_max_unbound_pri = tpri;
1518 - }
1519 - (*disp_enq_thread)(cp, bound);
1423 + setfrontbackdq(tp, B_TRUE);
1520 1424 }
1521 1425
1522 1426 /*
1523 1427 * Put a high-priority unbound thread on the kp queue
1524 1428 */
1525 1429 static void
1526 1430 setkpdq(kthread_t *tp, int borf)
1527 1431 {
1528 1432 dispq_t *dq;
1529 1433 disp_t *dp;
1530 1434 cpu_t *cp;
1531 1435 pri_t tpri;
1532 1436
1533 1437 tpri = DISP_PRIO(tp);
1534 1438
1535 1439 dp = &tp->t_cpupart->cp_kp_queue;
1536 1440 disp_lock_enter_high(&dp->disp_lock);
1537 1441
1538 1442 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1539 1443
1540 1444 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1541 1445 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1542 1446 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1543 1447 tp->t_disp_queue = dp;
1544 1448 dp->disp_nrunnable++;
1545 1449 dq = &dp->disp_q[tpri];
1546 1450
1547 1451 if (dq->dq_sruncnt++ != 0) {
1548 1452 if (borf == SETKP_BACK) {
1549 1453 ASSERT(dq->dq_first != NULL);
1550 1454 tp->t_link = NULL;
1551 1455 dq->dq_last->t_link = tp;
1552 1456 dq->dq_last = tp;
1553 1457 } else {
1554 1458 ASSERT(dq->dq_last != NULL);
1555 1459 tp->t_link = dq->dq_first;
1556 1460 dq->dq_first = tp;
1557 1461 }
1558 1462 } else {
1559 1463 if (borf == SETKP_BACK) {
1560 1464 ASSERT(dq->dq_first == NULL);
1561 1465 ASSERT(dq->dq_last == NULL);
1562 1466 dq->dq_first = dq->dq_last = tp;
1563 1467 } else {
1564 1468 ASSERT(dq->dq_last == NULL);
1565 1469 ASSERT(dq->dq_first == NULL);
1566 1470 tp->t_link = NULL;
1567 1471 dq->dq_first = dq->dq_last = tp;
1568 1472 }
1569 1473 BT_SET(dp->disp_qactmap, tpri);
1570 1474 if (tpri > dp->disp_max_unbound_pri)
1571 1475 dp->disp_max_unbound_pri = tpri;
1572 1476 if (tpri > dp->disp_maxrunpri) {
1573 1477 dp->disp_maxrunpri = tpri;
1574 1478 membar_enter();
1575 1479 }
1576 1480 }
1577 1481
1578 1482 cp = tp->t_cpu;
1579 1483 if (tp->t_cpupart != cp->cpu_part) {
1580 1484 /* migrate to a cpu in the new partition */
1581 1485 cp = tp->t_cpupart->cp_cpulist;
1582 1486 }
1583 1487 cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1584 1488 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585 1489 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 1490
1587 1491 #ifndef NPROBE
1588 1492 /* Kernel probe */
1589 1493 if (tnf_tracing_active)
1590 1494 tnf_thread_queue(tp, cp, tpri);
1591 1495 #endif /* NPROBE */
1592 1496
1593 1497 if (cp->cpu_chosen_level < tpri)
1594 1498 cp->cpu_chosen_level = tpri;
1595 1499 cpu_resched(cp, tpri);
1596 1500 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597 1501 (*disp_enq_thread)(cp, 0);
1598 1502 }
1599 1503
1600 1504 /*
1601 1505 * Remove a thread from the dispatcher queue if it is on it.
1602 1506 * It is not an error if it is not found but we return whether
1603 1507 * or not it was found in case the caller wants to check.
1604 1508 */
1605 1509 int
1606 1510 dispdeq(kthread_t *tp)
1607 1511 {
1608 1512 disp_t *dp;
1609 1513 dispq_t *dq;
1610 1514 kthread_t *rp;
1611 1515 kthread_t *trp;
1612 1516 kthread_t **ptp;
1613 1517 int tpri;
1614 1518
1615 1519 ASSERT(THREAD_LOCK_HELD(tp));
1616 1520
1617 1521 if (tp->t_state != TS_RUN)
1618 1522 return (0);
1619 1523
1620 1524 /*
1621 1525 * The thread is "swapped" or is on the swap queue and
1622 1526 * hence no longer on the run queue, so return true.
1623 1527 */
1624 1528 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1625 1529 return (1);
1626 1530
1627 1531 tpri = DISP_PRIO(tp);
1628 1532 dp = tp->t_disp_queue;
1629 1533 ASSERT(tpri < dp->disp_npri);
1630 1534 dq = &dp->disp_q[tpri];
1631 1535 ptp = &dq->dq_first;
1632 1536 rp = *ptp;
1633 1537 trp = NULL;
1634 1538
1635 1539 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1636 1540
1637 1541 /*
1638 1542 * Search for thread in queue.
1639 1543 * Double links would simplify this at the expense of disp/setrun.
1640 1544 */
1641 1545 while (rp != tp && rp != NULL) {
1642 1546 trp = rp;
1643 1547 ptp = &trp->t_link;
1644 1548 rp = trp->t_link;
1645 1549 }
1646 1550
1647 1551 if (rp == NULL) {
1648 1552 panic("dispdeq: thread not on queue");
1649 1553 }
1650 1554
1651 1555 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1652 1556
1653 1557 /*
1654 1558 * Found it so remove it from queue.
1655 1559 */
1656 1560 if ((*ptp = rp->t_link) == NULL)
1657 1561 dq->dq_last = trp;
1658 1562
1659 1563 dp->disp_nrunnable--;
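/*
 * If this was the last loaded thread at this priority, clear the
 * priority's bit in the active-queue bitmap and recompute the
 * cached disp_maxrunpri and disp_max_unbound_pri values.
 */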
1660 1564 if (--dq->dq_sruncnt == 0) {
1661 1565 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1662 1566 if (dp->disp_nrunnable == 0) {
1663 1567 dp->disp_max_unbound_pri = -1;
1664 1568 dp->disp_maxrunpri = -1;
1665 1569 } else if (tpri == dp->disp_maxrunpri) {
1666 1570 int ipri;
1667 1571
1668 1572 ipri = bt_gethighbit(dp->disp_qactmap,
1669 1573 dp->disp_maxrunpri >> BT_ULSHIFT);
1670 1574 if (ipri < dp->disp_max_unbound_pri)
1671 1575 dp->disp_max_unbound_pri = ipri;
1672 1576 dp->disp_maxrunpri = ipri;
1673 1577 }
1674 1578 }
1675 1579 tp->t_link = NULL;
1676 1580 THREAD_TRANSITION(tp); /* put in intermediate state */
1677 1581 return (1);
1678 1582 }
1679 1583
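/*
 * A common caller pattern, sketched here for illustration only
 * (newpri is a hypothetical variable): a scheduling class changing
 * the priority of a runnable thread dequeues it and requeues it at
 * the new level.
 *
 *	thread_lock(tp);
 *	if (dispdeq(tp)) {
 *		tp->t_pri = newpri;
 *		setbackdq(tp);
 *	}
 *	thread_unlock(tp);
 */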
1680 1584
1681 1585 /*
1682 1586 * dq_sruninc and dq_srundec are public functions for
1683 1587 * incrementing/decrementing the sruncnts when a thread on
1684 1588 * a dispatcher queue is made schedulable/unschedulable by
1685 1589 * resetting the TS_LOAD flag.
1686 1590 *
1687 1591 * The caller MUST have the thread lock and therefore the dispatcher
1688 1592 * queue lock so that the operation which changes
1689 1593 * the flag, the operation that checks the status of the thread to
1690 1594 * determine if it's on a disp queue AND the call to this function
1691 1595 * are one atomic operation with respect to interrupts.
1692 1596 */
1693 1597
1694 1598 /*
1695 1599 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1696 1600 */
1697 1601 void
1698 1602 dq_sruninc(kthread_t *t)
1699 1603 {
1700 1604 ASSERT(t->t_state == TS_RUN);
1701 1605 ASSERT(t->t_schedflag & TS_LOAD);
1702 1606
1703 1607 THREAD_TRANSITION(t);
1704 1608 setfrontdq(t);
1705 1609 }
1706 1610
1707 1611 /*
1708 1612 * See comment on calling conventions above.
1709 1613 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1710 1614 */
1711 1615 void
1712 1616 dq_srundec(kthread_t *t)
1713 1617 {
1714 1618 ASSERT(t->t_schedflag & TS_LOAD);
1715 1619
1716 1620 (void) dispdeq(t);
1717 1621 disp_swapped_enq(t);
1718 1622 }
1719 1623
1720 1624 /*
1721 1625 * Change the dispatcher lock of thread to the "swapped_lock"
1722 1626 * and return with thread lock still held.
1723 1627 *
1724 1628 * Called with thread_lock held, in transition state, and at high spl.
1725 1629 */
1726 1630 void
1727 1631 disp_swapped_enq(kthread_t *tp)
1728 1632 {
1729 1633 ASSERT(THREAD_LOCK_HELD(tp));
1730 1634 ASSERT(tp->t_schedflag & TS_LOAD);
1731 1635
1732 1636 switch (tp->t_state) {
1733 1637 case TS_RUN:
1734 1638 disp_lock_enter_high(&swapped_lock);
1735 1639 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1736 1640 break;
1737 1641 case TS_ONPROC:
1738 1642 disp_lock_enter_high(&swapped_lock);
1739 1643 THREAD_TRANSITION(tp);
1740 1644 wake_sched_sec = 1; /* tell clock to wake sched */
1741 1645 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1742 1646 break;
1743 1647 default:
1744 1648 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1745 1649 }
1746 1650 }
1747 1651
1748 1652 /*
1749 1653 * This routine is called by setbackdq/setfrontdq if the thread is
1750 1654 * not loaded or loaded and on the swap queue.
1751 1655 *
1752 1656 * Thread state TS_SLEEP implies that a swapped thread
1753 1657 * has been woken up and needs to be swapped in by the swapper.
1754 1658 *
1755 1659 * Thread state TS_RUN implies that the priority of a swapped
1756 1660 * thread is being increased by its scheduling class (e.g. ts_update).
1757 1661 */
1758 1662 static void
1759 1663 disp_swapped_setrun(kthread_t *tp)
1760 1664 {
1761 1665 ASSERT(THREAD_LOCK_HELD(tp));
1762 1666 ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1763 1667
1764 1668 switch (tp->t_state) {
1765 1669 case TS_SLEEP:
1766 1670 disp_lock_enter_high(&swapped_lock);
1767 1671 /*
1768 1672 * Wakeup sched immediately (i.e., next tick) if the
1769 1673 * thread priority is above maxclsyspri.
1770 1674 */
1771 1675 if (DISP_PRIO(tp) > maxclsyspri)
1772 1676 wake_sched = 1;
1773 1677 else
1774 1678 wake_sched_sec = 1;
1775 1679 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1776 1680 break;
1777 1681 case TS_RUN: /* called from ts_update */
1778 1682 break;
1779 1683 default:
1780 1684 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1781 1685 }
1782 1686 }
1783 1687
1784 1688 /*
1785 1689 * Make a thread give up its processor. Find the processor on
1786 1690 * which this thread is executing, and have that processor
1787 1691 * preempt.
1788 1692 *
1789 1693 * We allow System Duty Cycle (SDC) threads to be preempted even if
1790 1694 * they are running at kernel priorities. To implement this, we always
1791 1695 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1792 1696 * calls cpu_surrender() very often, we only preempt if there is anyone
1793 1697 * competing with us.
1794 1698 */
1795 1699 void
1796 1700 cpu_surrender(kthread_t *tp)
1797 1701 {
1798 1702 cpu_t *cpup;
1799 1703 int max_pri;
1800 1704 int max_run_pri;
1801 1705 klwp_t *lwp;
1802 1706
1803 1707 ASSERT(THREAD_LOCK_HELD(tp));
1804 1708
1805 1709 if (tp->t_state != TS_ONPROC)
1806 1710 return;
1807 1711 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1808 1712 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1809 1713 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1810 1714 if (max_pri < max_run_pri)
1811 1715 max_pri = max_run_pri;
1812 1716
1813 1717 if (tp->t_cid == sysdccid) {
1814 1718 uint_t t_pri = DISP_PRIO(tp);
1815 1719 if (t_pri > max_pri)
1816 1720 return; /* we are not competing w/ anyone */
1817 1721 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1818 1722 } else {
1819 1723 cpup->cpu_runrun = 1;
1820 1724 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1821 1725 cpup->cpu_kprunrun = 1;
1822 1726 }
1823 1727 }
1824 1728
1825 1729 /*
1826 1730 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1827 1731 */
1828 1732 membar_enter();
1829 1733
1830 1734 DTRACE_SCHED1(surrender, kthread_t *, tp);
1831 1735
1832 1736 /*
1833 1737 * Make the target thread take an excursion through trap()
1834 1738 * to do preempt() (unless we're already in trap or post_syscall,
1835 1739 * calling cpu_surrender via CL_TRAPRET).
1836 1740 */
1837 1741 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1838 1742 lwp->lwp_state != LWP_USER) {
1839 1743 aston(tp);
1840 1744 if (cpup != CPU)
1841 1745 poke_cpu(cpup->cpu_id);
1842 1746 }
1843 1747 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1844 1748 "cpu_surrender:tid %p cpu %p", tp, cpup);
1845 1749 }
1846 1750
1847 1751 /*
1848 1752 * Commit to and ratify a scheduling decision
1849 1753 */
1850 1754 /*ARGSUSED*/
1851 1755 static kthread_t *
1852 1756 disp_ratify(kthread_t *tp, disp_t *kpq)
1853 1757 {
1854 1758 pri_t tpri, maxpri;
1855 1759 pri_t maxkpri;
1856 1760 cpu_t *cpup;
1857 1761
1858 1762 ASSERT(tp != NULL);
1859 1763 /*
1860 1764 * Commit to, then ratify scheduling decision
1861 1765 */
1862 1766 cpup = CPU;
1863 1767 if (cpup->cpu_runrun != 0)
1864 1768 cpup->cpu_runrun = 0;
1865 1769 if (cpup->cpu_kprunrun != 0)
1866 1770 cpup->cpu_kprunrun = 0;
1867 1771 if (cpup->cpu_chosen_level != -1)
1868 1772 cpup->cpu_chosen_level = -1;
1869 1773 membar_enter();
1870 1774 tpri = DISP_PRIO(tp);
1871 1775 maxpri = cpup->cpu_disp->disp_maxrunpri;
1872 1776 maxkpri = kpq->disp_maxrunpri;
1873 1777 if (maxpri < maxkpri)
1874 1778 maxpri = maxkpri;
1875 1779 if (tpri < maxpri) {
1876 1780 /*
1877 1781 * should have done better
1878 1782 * put this one back and indicate to try again
1879 1783 */
1880 1784 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1881 1785 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1882 1786 thread_lock_high(tp);
1883 1787 THREAD_TRANSITION(tp);
1884 1788 setfrontdq(tp);
1885 1789 thread_unlock_nopreempt(tp);
1886 1790
1887 1791 tp = NULL;
1888 1792 }
1889 1793 return (tp);
1890 1794 }
1891 1795
1892 1796 /*
1893 1797 * See if there is any work on the dispatcher queue for other CPUs.
1894 1798 * If there is, dequeue the best thread and return.
1895 1799 */
1896 1800 static kthread_t *
1897 1801 disp_getwork(cpu_t *cp)
1898 1802 {
1899 1803 cpu_t *ocp; /* other CPU */
1900 1804 cpu_t *ocp_start;
1901 1805 cpu_t *tcp; /* target local CPU */
1902 1806 kthread_t *tp;
1903 1807 kthread_t *retval = NULL;
1904 1808 pri_t maxpri;
1905 1809 disp_t *kpq; /* kp queue for this partition */
1906 1810 lpl_t *lpl, *lpl_leaf;
1907 1811 int leafidx, startidx;
1908 1812 hrtime_t stealtime;
1909 1813 lgrp_id_t local_id;
1910 1814
1911 1815 maxpri = -1;
1912 1816 tcp = NULL;
1913 1817
1914 1818 kpq = &cp->cpu_part->cp_kp_queue;
1915 1819 while (kpq->disp_maxrunpri >= 0) {
1916 1820 /*
1917 1821 * Try to take a thread from the kp_queue.
1918 1822 */
1919 1823 tp = disp_getbest(kpq);
1920 1824 if (tp)
1921 1825 return (disp_ratify(tp, kpq));
1922 1826 }
1923 1827
1924 1828 kpreempt_disable(); /* protect the cpu_active list */
1925 1829
1926 1830 /*
1927 1831 * Try to find something to do on another CPU's run queue.
1928 1832 * Loop through all other CPUs looking for the one with the highest
1929 1833 * priority unbound thread.
1930 1834 *
1931 1835 * On NUMA machines, the partition's CPUs are consulted in order of
1932 1836 * distance from the current CPU. This way, the first available
1933 1837 * work found is also the closest, and will suffer the least
1934 1838 * from being migrated.
1935 1839 */
1936 1840 lpl = lpl_leaf = cp->cpu_lpl;
1937 1841 local_id = lpl_leaf->lpl_lgrpid;
1938 1842 leafidx = startidx = 0;
1939 1843
1940 1844 /*
1941 1845 * This loop traverses the lpl hierarchy. Higher level lpls represent
1942 1846 * broader levels of locality.
1943 1847 */
1944 1848 do {
1945 1849 /* This loop iterates over the lpl's leaves */
1946 1850 do {
1947 1851 if (lpl_leaf != cp->cpu_lpl)
1948 1852 ocp = lpl_leaf->lpl_cpus;
1949 1853 else
1950 1854 ocp = cp->cpu_next_lpl;
1951 1855
1952 1856 /* This loop iterates over the CPUs in the leaf */
1953 1857 ocp_start = ocp;
1954 1858 do {
1955 1859 pri_t pri;
1956 1860
1957 1861 ASSERT(CPU_ACTIVE(ocp));
1958 1862
1959 1863 /*
1960 1864 * End our stroll around this lpl if:
1961 1865 *
1962 1866 * - Something became runnable on the local
1963 1867 * queue...which also ends our stroll around
1964 1868 * the partition.
1965 1869 *
1966 1870 * - We happen across another idle CPU.
1967 1871 * Since it is patrolling the next portion
1968 1872 * of the lpl's list (assuming it's not
1969 1873 * halted, or busy servicing an interrupt),
1970 1874 * move to the next higher level of locality.
1971 1875 */
1972 1876 if (cp->cpu_disp->disp_nrunnable != 0) {
1973 1877 kpreempt_enable();
1974 1878 return (NULL);
1975 1879 }
1976 1880 if (ocp->cpu_dispatch_pri == -1) {
1977 1881 if (ocp->cpu_disp_flags &
1978 1882 CPU_DISP_HALTED ||
1979 1883 ocp->cpu_intr_actv != 0)
1980 1884 continue;
1981 1885 else
1982 1886 goto next_level;
1983 1887 }
1984 1888
1985 1889 /*
1986 1890 * If there's only one thread and the CPU
1987 1891 * is in the middle of a context switch,
1988 1892 * or it's currently running the idle thread,
1989 1893 * don't steal it.
1990 1894 */
1991 1895 if ((ocp->cpu_disp_flags &
1992 1896 CPU_DISP_DONTSTEAL) &&
1993 1897 ocp->cpu_disp->disp_nrunnable == 1)
1994 1898 continue;
1995 1899
1996 1900 pri = ocp->cpu_disp->disp_max_unbound_pri;
1997 1901 if (pri > maxpri) {
1998 1902 /*
1999 1903 * Don't steal threads that we attempted
2000 1904 * to steal recently until they're ready
2001 1905 * to be stolen again.
2002 1906 */
2003 1907 stealtime = ocp->cpu_disp->disp_steal;
2004 1908 if (stealtime == 0 ||
2005 1909 stealtime - gethrtime() <= 0) {
2006 1910 maxpri = pri;
2007 1911 tcp = ocp;
2008 1912 } else {
2009 1913 /*
2010 1914 * Don't update tcp, just set
2011 1915 * the retval to T_DONTSTEAL, so
2012 1916 * that if no acceptable CPUs
2013 1917 * are found the return value
2014 1918 * will be T_DONTSTEAL rather
2015 1919 * than NULL.
2016 1920 */
2017 1921 retval = T_DONTSTEAL;
2018 1922 }
2019 1923 }
2020 1924 } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2021 1925
2022 1926 /*
2023 1927 * Iterate to the next leaf lpl in the resource set
2024 1928 * at this level of locality. If we hit the end of
2025 1929 * the set, wrap back around to the beginning.
2026 1930 *
2027 1931 * Note: This iteration is NULL terminated for a reason;
2028 1932 * see lpl_topo_bootstrap() in lgrp.c for details.
2029 1933 */
2030 1934 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2031 1935 leafidx = 0;
2032 1936 lpl_leaf = lpl->lpl_rset[leafidx];
2033 1937 }
2034 1938 } while (leafidx != startidx);
2035 1939
2036 1940 next_level:
2037 1941 /*
2038 1942 * Expand the search to include farther away CPUs (next
2039 1943 * locality level). The closer CPUs that have already been
2040 1944 * checked will be checked again. In doing so, idle CPUs
2041 1945 * will tend to be more aggressive about stealing from CPUs
2042 1946 * that are closer (since the closer CPUs will be considered
2043 1947 * more often).
2044 1948 * Begin at this level with the CPUs local leaf lpl.
2045 1949 */
2046 1950 if ((lpl = lpl->lpl_parent) != NULL) {
2047 1951 leafidx = startidx = lpl->lpl_id2rset[local_id];
2048 1952 lpl_leaf = lpl->lpl_rset[leafidx];
2049 1953 }
2050 1954 } while (!tcp && lpl);
2051 1955
2052 1956 kpreempt_enable();
2053 1957
2054 1958 /*
2055 1959 * If another queue looks good, and there is still nothing on
2056 1960 * the local queue, try to transfer one or more threads
2057 1961 * from it to our queue.
2058 1962 */
2059 1963 if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2060 1964 tp = disp_getbest(tcp->cpu_disp);
2061 1965 if (tp == NULL || tp == T_DONTSTEAL)
2062 1966 return (tp);
2063 1967 return (disp_ratify(tp, kpq));
2064 1968 }
2065 1969 return (retval);
2066 1970 }
2067 1971
2068 1972
2069 1973 /*
2070 1974 * disp_fix_unbound_pri()
2071 1975 * Determines the maximum priority of unbound threads on the queue.
2072 1976 * The priority is kept for the queue, but is only increased, never
2073 1977 * reduced unless some CPU is looking for something on that queue.
2074 1978 *
2075 1979 * The priority argument is the known upper limit.
2076 1980 *
2077 1981 * Perhaps this should be kept accurately, but that probably means
2078 1982 * separate bitmaps for bound and unbound threads. Since only idled
2079 1983 * CPUs will have to do this recalculation, it seems better this way.
2080 1984 */
2081 1985 static void
2082 1986 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2083 1987 {
2084 1988 kthread_t *tp;
2085 1989 dispq_t *dq;
2086 1990 ulong_t *dqactmap = dp->disp_qactmap;
2087 1991 ulong_t mapword;
2088 1992 int wx;
2089 1993
2090 1994 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2091 1995
2092 1996 ASSERT(pri >= 0); /* checked by caller */
2093 1997
2094 1998 /*
2095 1999 * Start the search at the next lowest priority below the supplied
2096 2000 * priority. This depends on the bitmap implementation.
2097 2001 */
2098 2002 do {
2099 2003 wx = pri >> BT_ULSHIFT; /* index of word in map */
2100 2004
2101 2005 /*
2102 2006 * Form mask for all lower priorities in the word.
2103 2007 */
2104 2008 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
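/*
 * Example (assuming 64-bit map words): for pri == 67, wx is 1 and
 * (BT_BIW(67) - 1) masks the low three bits of the word, i.e. the
 * bits for priorities 64..66.
 */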
2105 2009
2106 2010 /*
2107 2011 * Get next lower active priority.
2108 2012 */
2109 2013 if (mapword != 0) {
2110 2014 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2111 2015 } else if (wx > 0) {
2112 2016 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2113 2017 if (pri < 0)
2114 2018 break;
2115 2019 } else {
2116 2020 pri = -1;
2117 2021 break;
2118 2022 }
2119 2023
2120 2024 /*
2121 2025 * Search the queue for unbound, runnable threads.
2122 2026 */
2123 2027 dq = &dp->disp_q[pri];
2124 2028 tp = dq->dq_first;
2125 2029
2126 2030 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2127 2031 tp = tp->t_link;
2128 2032 }
2129 2033
2130 2034 /*
2131 2035 * If a thread was found, set the priority and return.
2132 2036 */
2133 2037 } while (tp == NULL);
2134 2038
2135 2039 /*
2136 2040 * pri holds the maximum unbound thread priority or -1.
2137 2041 */
2138 2042 if (dp->disp_max_unbound_pri != pri)
2139 2043 dp->disp_max_unbound_pri = pri;
2140 2044 }
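/*
 * Illustration only (not kernel code; assumes 64-bit map words): the
 * bitmap scan backing the dispatcher's cached priorities amounts to a
 * word scan plus a highbit within the first nonzero word, which is
 * what bt_gethighbit() provides for the real queues:
 *
 *	int
 *	highest_active(ulong_t *map, int nwords)
 *	{
 *		int wx;
 *
 *		for (wx = nwords - 1; wx >= 0; wx--)
 *			if (map[wx] != 0)
 *				return ((wx << 6) + highbit(map[wx]) - 1);
 *		return (-1);
 *	}
 */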
2141 2045
2142 2046 /*
2143 2047 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2144 2048 * check if the CPU to which it was previously bound should have
2145 2049 * its disp_max_unbound_pri increased.
2146 2050 */
2147 2051 void
2148 2052 disp_adjust_unbound_pri(kthread_t *tp)
2149 2053 {
2150 2054 disp_t *dp;
2151 2055 pri_t tpri;
2152 2056
2153 2057 ASSERT(THREAD_LOCK_HELD(tp));
2154 2058
2155 2059 /*
2156 2060 * Don't do anything if the thread is not bound, or
2157 2061 * currently not runnable or swapped out.
2158 2062 */
2159 2063 if (tp->t_bound_cpu == NULL ||
2160 2064 tp->t_state != TS_RUN ||
2161 2065 tp->t_schedflag & TS_ON_SWAPQ)
2162 2066 return;
2163 2067
2164 2068 tpri = DISP_PRIO(tp);
2165 2069 dp = tp->t_bound_cpu->cpu_disp;
2166 2070 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2167 2071 if (tpri > dp->disp_max_unbound_pri)
2168 2072 dp->disp_max_unbound_pri = tpri;
2169 2073 }
2170 2074
2171 2075 /*
2172 2076 * disp_getbest()
2173 2077 * De-queue the highest priority unbound runnable thread.
2174 2078 * Returns with the thread unlocked and onproc but at splhigh (like disp()).
2175 2079 * Returns NULL if nothing found.
2176 2080 * Returns T_DONTSTEAL if the thread was not stealable,
2177 2081 * so that the caller will try again later.
2178 2082 *
2179 2083 * Passed a pointer to a dispatch queue not associated with this
2180 2084 * CPU.
2181 2085 */
2182 2086 static kthread_t *
2183 2087 disp_getbest(disp_t *dp)
2184 2088 {
2185 2089 kthread_t *tp;
2186 2090 dispq_t *dq;
2187 2091 pri_t pri;
2188 2092 cpu_t *cp, *tcp;
2189 2093 boolean_t allbound;
2190 2094
2191 2095 disp_lock_enter(&dp->disp_lock);
2192 2096
2193 2097 /*
2194 2098 * If there is nothing to run, or the CPU is in the middle of a
2195 2099 * context switch of the only thread, return NULL.
2196 2100 */
2197 2101 tcp = dp->disp_cpu;
2198 2102 cp = CPU;
2199 2103 pri = dp->disp_max_unbound_pri;
2200 2104 if (pri == -1 ||
2201 2105 (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2202 2106 tcp->cpu_disp->disp_nrunnable == 1)) {
2203 2107 disp_lock_exit_nopreempt(&dp->disp_lock);
2204 2108 return (NULL);
2205 2109 }
2206 2110
2207 2111 dq = &dp->disp_q[pri];
2208 2112
2209 2113
2210 2114 /*
2211 2115 * Assume that all threads are bound on this queue, and change it
2212 2116 * later when we find out that it is not the case.
2213 2117 */
2214 2118 allbound = B_TRUE;
2215 2119 for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2216 2120 hrtime_t now, nosteal, rqtime;
2217 2121
2218 2122 /*
2219 2123 * Skip over bound threads which could be here even
2220 2124 * though disp_max_unbound_pri indicated this level.
2221 2125 */
2222 2126 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2223 2127 continue;
2224 2128
2225 2129 /*
2226 2130 * We've got some unbound threads on this queue, so turn
2227 2131 * the allbound flag off now.
2228 2132 */
2229 2133 allbound = B_FALSE;
2230 2134
2231 2135 /*
2232 2136 * The thread is a candidate for stealing from its run queue. We
2233 2137 * don't want to steal threads that became runnable just a
2234 2138 * moment ago. This improves CPU affinity for threads that get
2235 2139 * preempted for short periods of time and go back on the run
2236 2140 * queue.
2237 2141 *
2238 2142 * We want to let it stay on its run queue if it was only placed
2239 2143 * there recently and it was running on the same CPU before that
2240 2144 * to preserve its cache investment. For the thread to remain on
2241 2145 * its run queue, ALL of the following conditions must be
2242 2146 * satisfied:
2243 2147 *
2244 2148 * - the disp queue should not be the kernel preemption queue
2245 2149 * - delayed idle stealing should not be disabled
2246 2150 * - nosteal_nsec should be non-zero
2247 2151 * - it should run with user priority
2248 2152 * - it should be on the run queue of the CPU where it was
2249 2153 * running before being placed on the run queue
2250 2154 * - it should be the only thread on the run queue (to prevent
2251 2155 * extra scheduling latency for other threads)
2252 2156 * - it should sit on the run queue for less than per-chip
2253 2157 * nosteal interval or global nosteal interval
2254 2158 * - in case of CPUs with shared cache it should sit in a run
2255 2159 * queue of a CPU from a different chip
2256 2160 *
2257 2161 * The checks are arranged so that the ones that are faster are
2258 2162 * placed earlier.
2259 2163 */
2260 2164 if (tcp == NULL ||
2261 2165 pri >= minclsyspri ||
2262 2166 tp->t_cpu != tcp)
2263 2167 break;
2264 2168
2265 2169 /*
2266 2170 * Steal immediately if, due to the CMT processor architecture,
2267 2171 * migration between cp and tcp would incur no performance
2268 2172 * penalty.
2269 2173 */
2270 2174 if (pg_cmt_can_migrate(cp, tcp))
2271 2175 break;
2272 2176
2273 2177 nosteal = nosteal_nsec;
2274 2178 if (nosteal == 0)
2275 2179 break;
2276 2180
2277 2181 /*
2278 2182 * Calculate time spent sitting on run queue
2279 2183 */
2280 2184 now = gethrtime_unscaled();
2281 2185 rqtime = now - tp->t_waitrq;
2282 2186 scalehrtime(&rqtime);
2283 2187
2284 2188 /*
2285 2189 * Steal immediately if the time spent on this run queue is more
2286 2190 * than allowed nosteal delay.
2287 2191 *
2288 2192 * Negative rqtime check is needed here to avoid infinite
2289 2193 * stealing delays caused by unlikely but not impossible
2290 2194 * drifts between CPU times on different CPUs.
2291 2195 */
2292 2196 if (rqtime > nosteal || rqtime < 0)
2293 2197 break;
2294 2198
2295 2199 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2296 2200 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2297 2201 scalehrtime(&now);
2298 2202 /*
2299 2203 * Calculate when this thread becomes stealable
2300 2204 */
2301 2205 now += (nosteal - rqtime);
2302 2206
2303 2207 /*
2304 2208 * Calculate time when some thread becomes stealable
2305 2209 */
2306 2210 if (now < dp->disp_steal)
2307 2211 dp->disp_steal = now;
2308 2212 }
2309 2213
2310 2214 /*
2311 2215 * If there were no unbound threads on this queue, fix the recorded
2312 2216 * max unbound priority and return later. The value of
2313 2217 * disp_max_unbound_pri is not always accurate because it isn't
2314 2218 * reduced until another idle CPU looks for work.
2315 2219 */
2316 2220 if (allbound)
2317 2221 disp_fix_unbound_pri(dp, pri);
2318 2222
2319 2223 /*
2320 2224 * If we reached the end of the queue and found no unbound threads
2321 2225 * then return NULL so that other CPUs will be considered. If there
2322 2226 * are unbound threads but they cannot yet be stolen, then
2323 2227 * return T_DONTSTEAL and try again later.
2324 2228 */
2325 2229 if (tp == NULL) {
2326 2230 disp_lock_exit_nopreempt(&dp->disp_lock);
2327 2231 return (allbound ? NULL : T_DONTSTEAL);
2328 2232 }
2329 2233
2330 2234 /*
2331 2235 * Found a runnable, unbound thread, so remove it from queue.
2332 2236 * dispdeq() requires that we have the thread locked, and we do,
2333 2237 * by virtue of holding the dispatch queue lock. dispdeq() will
2334 2238 * put the thread in transition state, thereby dropping the dispq
2335 2239 * lock.
2336 2240 */
2337 2241
2338 2242 #ifdef DEBUG
2339 2243 {
2340 2244 int thread_was_on_queue;
2341 2245
2342 2246 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
2343 2247 ASSERT(thread_was_on_queue);
2344 2248 }
2345 2249
2346 2250 #else /* DEBUG */
2347 2251 (void) dispdeq(tp); /* drops disp_lock */
2348 2252 #endif /* DEBUG */
2349 2253
2350 2254 /*
2351 2255 * Reset the disp_queue steal time; we do not know what the smallest
2352 2256 * value across the queue is.
2353 2257 */
2354 2258 dp->disp_steal = 0;
2355 2259
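/*
 * Prevent the thread from being swapped out while it is in transit
 * to this CPU.
 */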
2356 2260 tp->t_schedflag |= TS_DONT_SWAP;
2357 2261
2358 2262 /*
2359 2263 * Setup thread to run on the current CPU.
2360 2264 */
2361 2265 tp->t_disp_queue = cp->cpu_disp;
2362 2266
2363 2267 cp->cpu_dispthread = tp; /* protected by spl only */
2364 2268 cp->cpu_dispatch_pri = pri;
2365 2269
2366 2270 /*
2367 2271 * There can be a memory synchronization race between disp_getbest()
2368 2272 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2369 2273 * to preempt the current thread to run the enqueued thread while
2370 2274 * disp_getbest() and disp_ratify() are changing the current thread
2371 2275 * to the stolen thread. This may lead to a situation where
2372 2276 * cpu_resched() tries to preempt the wrong thread and the
2373 2277 * stolen thread continues to run on the CPU which has been tagged
2374 2278 * for preemption.
2375 2279 * Later the clock thread gets enqueued but doesn't get to run on the
2376 2280 * CPU causing the system to hang.
2377 2281 *
2378 2282 * To avoid this, grabbing and dropping the disp_lock (which does
2379 2283 * a memory barrier) is needed to synchronize the execution of
2380 2284 * cpu_resched() with disp_getbest() and disp_ratify() and
2381 2285 * synchronize the memory read and written by cpu_resched(),
2382 2286 * disp_getbest(), and disp_ratify() with each other.
2383 2287 * (see CR#6482861 for more details).
2384 2288 */
2385 2289 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2386 2290 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2387 2291
2388 2292 ASSERT(pri == DISP_PRIO(tp));
2389 2293
2390 2294 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2391 2295
2392 2296 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
2393 2297
2394 2298 /*
2395 2299 * Return with spl high so that swtch() won't need to raise it.
2396 2300 * The disp_lock was dropped by dispdeq().
2397 2301 */
2398 2302
2399 2303 return (tp);
2400 2304 }
2401 2305
2402 2306 /*
2403 2307 * disp_bound_common() - common routine for higher level functions
2404 2308 * that check for bound threads under certain conditions.
2405 2309 * If 'threadlistsafe' is set then there is no need to acquire
2406 2310 * pidlock to stop the thread list from changing (eg, if
2407 2311 * disp_bound_* is called with cpus paused).
2408 2312 */
2409 2313 static int
2410 2314 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2411 2315 {
2412 2316 int found = 0;
2413 2317 kthread_t *tp;
2414 2318
2415 2319 ASSERT(flag);
2416 2320
2417 2321 if (!threadlistsafe)
2418 2322 mutex_enter(&pidlock);
2419 2323 tp = curthread; /* faster than allthreads */
2420 2324 do {
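/*
 * Walk the circular t_next list of all threads, starting at
 * curthread and stopping when we come back around to it.
 */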
2421 2325 if (tp->t_state != TS_FREE) {
2422 2326 /*
2423 2327 * If an interrupt thread is busy, but the
2424 2328 * caller doesn't care (i.e. BOUND_INTR is off),
2425 2329 * then just ignore it and continue through.
2426 2330 */
2427 2331 if ((tp->t_flag & T_INTR_THREAD) &&
2428 2332 !(flag & BOUND_INTR))
2429 2333 continue;
2430 2334
2431 2335 /*
2432 2336 * Skip the idle thread for the CPU
2433 2337 * we're about to set offline.
2434 2338 */
2435 2339 if (tp == cp->cpu_idle_thread)
2436 2340 continue;
2437 2341
2438 2342 /*
2439 2343 * Skip the pause thread for the CPU
2440 2344 * we're about to set offline.
2441 2345 */
2442 2346 if (tp == cp->cpu_pause_thread)
2443 2347 continue;
2444 2348
2445 2349 if ((flag & BOUND_CPU) &&
2446 2350 (tp->t_bound_cpu == cp ||
2447 2351 tp->t_bind_cpu == cp->cpu_id ||
2448 2352 tp->t_weakbound_cpu == cp)) {
2449 2353 found = 1;
2450 2354 break;
2451 2355 }
2452 2356
2453 2357 if ((flag & BOUND_PARTITION) &&
2454 2358 (tp->t_cpupart == cp->cpu_part)) {
2455 2359 found = 1;
2456 2360 break;
2457 2361 }
2458 2362 }
2459 2363 } while ((tp = tp->t_next) != curthread && found == 0);
2460 2364 if (!threadlistsafe)
2461 2365 mutex_exit(&pidlock);
2462 2366 return (found);
2463 2367 }
2464 2368
2465 2369 /*
2466 2370 * disp_bound_threads - return nonzero if threads are bound to the processor.
2467 2371 * Called infrequently. Keep this simple.
2468 2372 * Includes threads that are asleep or stopped but not onproc.
2469 2373 */
2470 2374 int
2471 2375 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2472 2376 {
2473 2377 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2474 2378 }
2475 2379
2476 2380 /*
2477 2381 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2478 2382 * to the given processor, including interrupt threads.
2479 2383 */
2480 2384 int
2481 2385 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2482 2386 {
2483 2387 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2484 2388 }
2485 2389
2486 2390 /*
2487 2391 * disp_bound_partition - return nonzero if threads are bound to the same
2488 2392 * partition as the processor.
2489 2393 * Called infrequently. Keep this simple.
2490 2394 * Includes threads that are asleep or stopped but not onproc.
2491 2395 */
2492 2396 int
2493 2397 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2494 2398 {
2495 2399 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2496 2400 }
2497 2401
2498 2402 /*
2499 2403 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2500 2404 * threads to other CPUs.
2501 2405 */
2502 2406 void
2503 2407 disp_cpu_inactive(cpu_t *cp)
2504 2408 {
2505 2409 kthread_t *tp;
2506 2410 disp_t *dp = cp->cpu_disp;
2507 2411 dispq_t *dq;
2508 2412 pri_t pri;
2509 2413 int wasonq;
2510 2414
2511 2415 disp_lock_enter(&dp->disp_lock);
2512 2416 while ((pri = dp->disp_max_unbound_pri) != -1) {
2513 2417 dq = &dp->disp_q[pri];
2514 2418 tp = dq->dq_first;
2515 2419
2516 2420 /*
2517 2421 * Skip over bound threads.
2518 2422 */
2519 2423 while (tp != NULL && tp->t_bound_cpu != NULL) {
2520 2424 tp = tp->t_link;
2521 2425 }
2522 2426
2523 2427 if (tp == NULL) {
2524 2428 /* disp_max_unbound_pri must be inaccurate, so fix it */
2525 2429 disp_fix_unbound_pri(dp, pri);
2526 2430 continue;
2527 2431 }
2528 2432
2529 2433 wasonq = dispdeq(tp); /* drops disp_lock */
2530 2434 ASSERT(wasonq);
2531 2435 ASSERT(tp->t_weakbound_cpu == NULL);
2532 2436
2533 2437 setbackdq(tp);
2534 2438 /*
2535 2439 * Called from cpu_offline:
2536 2440 *
2537 2441 * cp has already been removed from the list of active cpus
2538 2442 * and tp->t_cpu has been changed so there is no risk of
2539 2443 * tp ending up back on cp.
2540 2444 *
2541 2445 * Called from cpupart_move_cpu:
2542 2446 *
2543 2447 * The cpu has moved to a new cpupart. Any threads that
2544 2448 * were on its dispatch queues before the move remain
2545 2449 * in the old partition and can't run in the new partition.
2546 2450 */
2547 2451 ASSERT(tp->t_cpu != cp);
2548 2452 thread_unlock(tp);
2549 2453
2550 2454 disp_lock_enter(&dp->disp_lock);
2551 2455 }
2552 2456 disp_lock_exit(&dp->disp_lock);
2553 2457 }
2554 2458
2555 2459 /*
2556 2460 * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557 2461 * The hint passed in is used as a starting point so we don't favor
2558 2462 * CPU 0 or any other CPU. The caller should pass in the most recently
2559 2463 * used CPU for the thread.
2560 2464 *
2561 2465 * The lgroup and priority are used to determine the best CPU to run on
2562 2466 * in a NUMA machine. The lgroup specifies which CPUs are closest while
2563 2467 * the thread priority will indicate whether the thread will actually run
2564 2468 * there. To pick the best CPU, the CPUs inside and outside of the given
2565 2469 * lgroup which are running the lowest priority threads are found. The
2566 2470 * remote CPU is chosen only if the thread will not run locally on a CPU
2567 2471 * within the lgroup, but will run on the remote CPU. If the thread
2568 2472 * cannot immediately run on any CPU, the best local CPU will be chosen.
2569 2473 *
2570 2474 * The lpl specified also identifies the cpu partition from which
2571 2475 * disp_lowpri_cpu should select a CPU.
2572 2476 *
2573 2477 * curcpu is used to indicate that disp_lowpri_cpu is being called on
2574 2478 * behalf of the current thread. (curthread is looking for a new cpu)
2575 2479 * In this case, cpu_dispatch_pri for this thread's cpu should be
2576 2480 * ignored.
2577 2481 *
2578 2482 * If a cpu is the target of an offline request then try to avoid it.
2579 2483 *
2580 2484 * This function must be called at either high SPL, or with preemption
2581 2485 * disabled, so that the "hint" CPU cannot be removed from the online
2582 2486 * CPU list while we are traversing it.
2583 2487 */
2584 2488 cpu_t *
2585 2489 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2586 2490 {
2587 2491 cpu_t *bestcpu;
2588 2492 cpu_t *besthomecpu;
2589 2493 cpu_t *cp, *cpstart;
2590 2494
2591 2495 pri_t bestpri;
2592 2496 pri_t cpupri;
2593 2497
2594 2498 klgrpset_t done;
2595 2499 klgrpset_t cur_set;
2596 2500
2597 2501 lpl_t *lpl_iter, *lpl_leaf;
2598 2502 int i;
2599 2503
2600 2504 /*
2601 2505 * Scan for a CPU currently running the lowest priority thread.
2602 2506 * Cannot get cpu_lock here because it is adaptive.
2603 2507 * We do not require lock on CPU list.
2604 2508 */
2605 2509 ASSERT(hint != NULL);
2606 2510 ASSERT(lpl != NULL);
2607 2511 ASSERT(lpl->lpl_ncpu > 0);
2608 2512
2609 2513 /*
2610 2514 * First examine local CPUs. Note that it's possible the hint CPU
2611 2515 * passed in is remote to the specified home lgroup. If our priority
2612 2516 * isn't high enough for us to run immediately at home, we then
2613 2517 * examine CPUs remote to our home lgroup.
2614 2518 * We would like to give preference to CPUs closest to "home".
2615 2519 * If we can't find a CPU where we'll run at a given level
2616 2520 * of locality, we expand our search to include the next level.
2617 2521 */
2618 2522 bestcpu = besthomecpu = NULL;
2619 2523 klgrpset_clear(done);
2620 2524 /* start with lpl we were passed */
2621 2525
2622 2526 lpl_iter = lpl;
2623 2527
2624 2528 do {
2625 2529
2626 2530 bestpri = SHRT_MAX;
2627 2531 klgrpset_clear(cur_set);
2628 2532
2629 2533 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2630 2534 lpl_leaf = lpl_iter->lpl_rset[i];
2631 2535 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632 2536 continue;
2633 2537
2634 2538 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 2539
2636 2540 if (hint->cpu_lpl == lpl_leaf)
2637 2541 cp = cpstart = hint;
2638 2542 else
2639 2543 cp = cpstart = lpl_leaf->lpl_cpus;
2640 2544
2641 2545 do {
2642 2546 if (cp == curcpu)
2643 2547 cpupri = -1;
2644 2548 else if (cp == cpu_inmotion)
2645 2549 cpupri = SHRT_MAX;
2646 2550 else
2647 2551 cpupri = cp->cpu_dispatch_pri;
2648 2552 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649 2553 cpupri = cp->cpu_disp->disp_maxrunpri;
2650 2554 if (cp->cpu_chosen_level > cpupri)
2651 2555 cpupri = cp->cpu_chosen_level;
2652 2556 if (cpupri < bestpri) {
2653 2557 if (CPU_IDLING(cpupri)) {
2654 2558 ASSERT((cp->cpu_flags &
2655 2559 CPU_QUIESCED) == 0);
2656 2560 return (cp);
2657 2561 }
2658 2562 bestcpu = cp;
2659 2563 bestpri = cpupri;
2660 2564 }
2661 2565 } while ((cp = cp->cpu_next_lpl) != cpstart);
2662 2566 }
2663 2567
2664 2568 if (bestcpu && (tpri > bestpri)) {
2665 2569 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666 2570 return (bestcpu);
2667 2571 }
2668 2572 if (besthomecpu == NULL)
2669 2573 besthomecpu = bestcpu;
2670 2574 /*
2671 2575 * Add the lgrps we just considered to the "done" set
2672 2576 */
2673 2577 klgrpset_or(done, cur_set);
2674 2578
2675 2579 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 2580
2677 2581 /*
2678 2582 * The specified priority isn't high enough to run immediately
2679 2583 * anywhere, so just return the best CPU from the home lgroup.
2680 2584 */
2681 2585 ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682 2586 return (besthomecpu);
2683 2587 }
2684 2588
2685 2589 /*
2686 2590 * This routine provides the generic idle cpu function for all processors.
2687 2591 * If a processor has some specific code to execute when idle (say, to stop
2688 2592 * the pipeline and save power) then that routine should be defined in the
2689 2593 * processors specific code (module_xx.c) and the global variable idle_cpu
2690 2594 * set to that function.
2691 2595 */
2692 2596 static void
2693 2597 generic_idle_cpu(void)
2694 2598 {
2695 2599 }
2696 2600
2697 2601 /*ARGSUSED*/
2698 2602 static void
2699 2603 generic_enq_thread(cpu_t *cpu, int bound)
2700 2604 {
2701 2605 }