patch setfrontbackdq
--- old/usr/src/uts/common/disp/disp.c
+++ new/usr/src/uts/common/disp/disp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29
30 30 #include <sys/types.h>
31 31 #include <sys/param.h>
32 32 #include <sys/sysmacros.h>
33 33 #include <sys/signal.h>
34 34 #include <sys/user.h>
35 35 #include <sys/systm.h>
36 36 #include <sys/sysinfo.h>
37 37 #include <sys/var.h>
38 38 #include <sys/errno.h>
39 39 #include <sys/cmn_err.h>
40 40 #include <sys/debug.h>
41 41 #include <sys/inline.h>
42 42 #include <sys/disp.h>
43 43 #include <sys/class.h>
44 44 #include <sys/bitmap.h>
45 45 #include <sys/kmem.h>
46 46 #include <sys/cpuvar.h>
47 47 #include <sys/vtrace.h>
48 48 #include <sys/tnf.h>
49 49 #include <sys/cpupart.h>
50 50 #include <sys/lgrp.h>
51 51 #include <sys/pg.h>
52 52 #include <sys/cmt.h>
53 53 #include <sys/bitset.h>
54 54 #include <sys/schedctl.h>
55 55 #include <sys/atomic.h>
56 56 #include <sys/dtrace.h>
57 57 #include <sys/sdt.h>
58 58 #include <sys/archsystm.h>
59 59
60 60 #include <vm/as.h>
61 61
62 62 #define BOUND_CPU 0x1
63 63 #define BOUND_PARTITION 0x2
64 64 #define BOUND_INTR 0x4
65 65
66 66 /* Dispatch queue allocation structure and functions */
67 67 struct disp_queue_info {
68 68 disp_t *dp;
69 69 dispq_t *olddispq;
70 70 dispq_t *newdispq;
71 71 ulong_t *olddqactmap;
72 72 ulong_t *newdqactmap;
73 73 int oldnglobpris;
74 74 };
75 75 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76 76 disp_t *dp);
77 77 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 78 static void disp_dq_free(struct disp_queue_info *dptr);
79 79
80 80 /* platform-specific routine to call when processor is idle */
81 81 static void generic_idle_cpu();
82 82 void (*idle_cpu)() = generic_idle_cpu;
83 83
84 84 /* routines invoked when a CPU enters/exits the idle loop */
85 85 static void idle_enter();
86 86 static void idle_exit();
87 87
88 88 /* platform-specific routine to call when thread is enqueued */
89 89 static void generic_enq_thread(cpu_t *, int);
90 90 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91 91
92 92 pri_t kpreemptpri; /* priority where kernel preemption applies */
93 93 pri_t upreemptpri = 0; /* priority where normal preemption applies */
94 94 pri_t intr_pri; /* interrupt thread priority base level */
95 95
96 96 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
97 97 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
98 98 disp_t cpu0_disp; /* boot CPU's dispatch queue */
99 99 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
100 100 int nswapped; /* total number of swapped threads */
101 101 void disp_swapped_enq(kthread_t *tp);
102 102 static void disp_swapped_setrun(kthread_t *tp);
103 103 static void cpu_resched(cpu_t *cp, pri_t tpri);
104 104
105 105 /*
106 106 * If this is set, only interrupt threads will cause kernel preemptions.
107 107 * This is done by changing the value of kpreemptpri. kpreemptpri
108 108 * will either be the max sysclass pri + 1 or the min interrupt pri.
109 109 */
110 110 int only_intr_kpreempt;
111 111
112 112 extern void set_idle_cpu(int cpun);
113 113 extern void unset_idle_cpu(int cpun);
114 114 static void setkpdq(kthread_t *tp, int borf);
115 115 #define SETKP_BACK 0
116 116 #define SETKP_FRONT 1
117 117 /*
118 118 * Parameter that determines how recently a thread must have run
119 119 * on the CPU to be considered loosely-bound to that CPU to reduce
120 120 * cold cache effects. The interval is in hertz.
121 121 */
122 122 #define RECHOOSE_INTERVAL 3
123 123 int rechoose_interval = RECHOOSE_INTERVAL;
124 124
125 125 /*
126 126 * Parameter that determines how long (in nanoseconds) a thread must
127 127 * be sitting on a run queue before it can be stolen by another CPU
128 128 * to reduce migrations. The interval is in nanoseconds.
129 129 *
130 130 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
131 131 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED
 132 132	 * here indicating it is uninitialized.
133 133 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
134 134 *
135 135 */
136 136 #define NOSTEAL_UNINITIALIZED (-1)
137 137 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
138 138 extern void cmp_set_nosteal_interval(void);
139 139
140 140 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
141 141
142 142 disp_lock_t transition_lock; /* lock on transitioning threads */
143 143 disp_lock_t stop_lock; /* lock on stopped threads */
144 144
145 145 static void cpu_dispqalloc(int numpris);
146 146
147 147 /*
148 148 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
149 149 * a thread because it was sitting on its run queue for a very short
150 150 * period of time.
151 151 */
152 152 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
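
As context for nosteal_nsec and the T_DONTSTEAL sentinel above, here is a minimal sketch of the check the stealing path is expected to perform. It is illustrative only — the real logic lives in disp_getbest(), which is not part of this hunk — and nosteal_too_fresh_sketch() is a hypothetical helper, not an interface in disp.c:

	/*
	 * Sketch only: a stealing CPU compares how long a candidate thread
	 * has been sitting on its run queue against nosteal_nsec; while the
	 * thread is still "too fresh", the caller is expected to give up
	 * and return the T_DONTSTEAL sentinel rather than a thread pointer.
	 */
	static boolean_t
	nosteal_too_fresh_sketch(hrtime_t time_on_queue_nsec)
	{
		/* nosteal_nsec == 0 disables the protection entirely */
		if (nosteal_nsec == 0)
			return (B_FALSE);
		return (time_on_queue_nsec < nosteal_nsec);
	}
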
153 153
154 154 static kthread_t *disp_getwork(cpu_t *to);
155 155 static kthread_t *disp_getbest(disp_t *from);
156 156 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
157 157
158 158 void swtch_to(kthread_t *);
159 159
160 160 /*
161 161 * dispatcher and scheduler initialization
162 162 */
163 163
164 164 /*
165 165 * disp_setup - Common code to calculate and allocate dispatcher
166 166 * variables and structures based on the maximum priority.
167 167 */
168 168 static void
169 169 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
170 170 {
171 171 pri_t newnglobpris;
172 172
173 173 ASSERT(MUTEX_HELD(&cpu_lock));
174 174
175 175 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
176 176
177 177 if (newnglobpris > oldnglobpris) {
178 178 /*
179 179 * Allocate new kp queues for each CPU partition.
180 180 */
181 181 cpupart_kpqalloc(newnglobpris);
182 182
183 183 /*
184 184 * Allocate new dispatch queues for each CPU.
185 185 */
186 186 cpu_dispqalloc(newnglobpris);
187 187
188 188 /*
189 189 * compute new interrupt thread base priority
190 190 */
191 191 intr_pri = maxglobpri;
192 192 if (only_intr_kpreempt) {
193 193 kpreemptpri = intr_pri + 1;
194 194 if (kpqpri == KPQPRI)
195 195 kpqpri = kpreemptpri;
196 196 }
197 197 v.v_nglobpris = newnglobpris;
198 198 }
199 199 }
200 200
201 201 /*
202 202 * dispinit - Called to initialize all loaded classes and the
203 203 * dispatcher framework.
204 204 */
205 205 void
206 206 dispinit(void)
207 207 {
208 208 id_t cid;
209 209 pri_t maxglobpri;
210 210 pri_t cl_maxglobpri;
211 211
212 212 maxglobpri = -1;
213 213
214 214 /*
215 215 * Initialize transition lock, which will always be set.
216 216 */
217 217 DISP_LOCK_INIT(&transition_lock);
218 218 disp_lock_enter_high(&transition_lock);
219 219 DISP_LOCK_INIT(&stop_lock);
220 220
221 221 mutex_enter(&cpu_lock);
222 222 CPU->cpu_disp->disp_maxrunpri = -1;
223 223 CPU->cpu_disp->disp_max_unbound_pri = -1;
224 224
225 225 /*
226 226 * Initialize the default CPU partition.
227 227 */
228 228 cpupart_initialize_default();
229 229 /*
230 230 * Call the class specific initialization functions for
231 231 * all pre-installed schedulers.
232 232 *
233 233 * We pass the size of a class specific parameter
234 234 * buffer to each of the initialization functions
235 235 * to try to catch problems with backward compatibility
236 236 * of class modules.
237 237 *
238 238 * For example a new class module running on an old system
239 239 * which didn't provide sufficiently large parameter buffers
240 240 * would be bad news. Class initialization modules can check for
241 241 * this and take action if they detect a problem.
242 242 */
243 243
244 244 for (cid = 0; cid < nclass; cid++) {
245 245 sclass_t *sc;
246 246
247 247 sc = &sclass[cid];
248 248 if (SCHED_INSTALLED(sc)) {
249 249 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
250 250 &sc->cl_funcs);
251 251 if (cl_maxglobpri > maxglobpri)
252 252 maxglobpri = cl_maxglobpri;
253 253 }
254 254 }
255 255 kpreemptpri = (pri_t)v.v_maxsyspri + 1;
256 256 if (kpqpri == KPQPRI)
257 257 kpqpri = kpreemptpri;
258 258
259 259 ASSERT(maxglobpri >= 0);
260 260 disp_setup(maxglobpri, 0);
261 261
262 262 mutex_exit(&cpu_lock);
263 263
264 264 /*
265 265 * Platform specific sticky scheduler setup.
266 266 */
267 267 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
268 268 cmp_set_nosteal_interval();
269 269
270 270 /*
271 271 * Get the default class ID; this may be later modified via
272 272 * dispadmin(1M). This will load the class (normally TS) and that will
273 273 * call disp_add(), which is why we had to drop cpu_lock first.
274 274 */
275 275 if (getcid(defaultclass, &defaultcid) != 0) {
276 276 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
277 277 defaultclass);
278 278 }
279 279 }
280 280
281 281 /*
282 282 * disp_add - Called with class pointer to initialize the dispatcher
283 283 * for a newly loaded class.
284 284 */
285 285 void
286 286 disp_add(sclass_t *clp)
287 287 {
288 288 pri_t maxglobpri;
289 289 pri_t cl_maxglobpri;
290 290
291 291 mutex_enter(&cpu_lock);
292 292 /*
293 293 * Initialize the scheduler class.
294 294 */
295 295 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
296 296 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
297 297 if (cl_maxglobpri > maxglobpri)
298 298 maxglobpri = cl_maxglobpri;
299 299
300 300 /*
301 301 * Save old queue information. Since we're initializing a
302 302 * new scheduling class which has just been loaded, then
303 303 * the size of the dispq may have changed. We need to handle
304 304 * that here.
305 305 */
306 306 disp_setup(maxglobpri, v.v_nglobpris);
307 307
308 308 mutex_exit(&cpu_lock);
309 309 }
310 310
311 311
312 312 /*
313 313 * For each CPU, allocate new dispatch queues
314 314 * with the stated number of priorities.
315 315 */
316 316 static void
317 317 cpu_dispqalloc(int numpris)
318 318 {
319 319 cpu_t *cpup;
320 320 struct disp_queue_info *disp_mem;
321 321 int i, num;
322 322
323 323 ASSERT(MUTEX_HELD(&cpu_lock));
324 324
325 325 disp_mem = kmem_zalloc(NCPU *
326 326 sizeof (struct disp_queue_info), KM_SLEEP);
327 327
328 328 /*
329 329 * This routine must allocate all of the memory before stopping
330 330 * the cpus because it must not sleep in kmem_alloc while the
331 331 * CPUs are stopped. Locks they hold will not be freed until they
332 332 * are restarted.
333 333 */
334 334 i = 0;
335 335 cpup = cpu_list;
336 336 do {
337 337 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
338 338 i++;
339 339 cpup = cpup->cpu_next;
340 340 } while (cpup != cpu_list);
341 341 num = i;
342 342
343 343 pause_cpus(NULL, NULL);
344 344 for (i = 0; i < num; i++)
345 345 disp_dq_assign(&disp_mem[i], numpris);
346 346 start_cpus();
347 347
348 348 /*
349 349 * I must free all of the memory after starting the cpus because
350 350 * I can not risk sleeping in kmem_free while the cpus are stopped.
351 351 */
352 352 for (i = 0; i < num; i++)
353 353 disp_dq_free(&disp_mem[i]);
354 354
355 355 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
356 356 }
357 357
358 358 static void
359 359 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
360 360 {
361 361 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
362 362 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
363 363 sizeof (long), KM_SLEEP);
364 364 dptr->dp = dp;
365 365 }
366 366
367 367 static void
368 368 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
369 369 {
370 370 disp_t *dp;
371 371
372 372 dp = dptr->dp;
373 373 dptr->olddispq = dp->disp_q;
374 374 dptr->olddqactmap = dp->disp_qactmap;
375 375 dptr->oldnglobpris = dp->disp_npri;
376 376
377 377 ASSERT(dptr->oldnglobpris < numpris);
378 378
379 379 if (dptr->olddispq != NULL) {
380 380 /*
381 381 * Use kcopy because bcopy is platform-specific
382 382 * and could block while we might have paused the cpus.
383 383 */
384 384 (void) kcopy(dptr->olddispq, dptr->newdispq,
385 385 dptr->oldnglobpris * sizeof (dispq_t));
386 386 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
387 387 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
388 388 sizeof (long));
389 389 }
390 390 dp->disp_q = dptr->newdispq;
391 391 dp->disp_qactmap = dptr->newdqactmap;
392 392 dp->disp_q_limit = &dptr->newdispq[numpris];
393 393 dp->disp_npri = numpris;
394 394 }
395 395
396 396 static void
397 397 disp_dq_free(struct disp_queue_info *dptr)
398 398 {
399 399 if (dptr->olddispq != NULL)
400 400 kmem_free(dptr->olddispq,
401 401 dptr->oldnglobpris * sizeof (dispq_t));
402 402 if (dptr->olddqactmap != NULL)
403 403 kmem_free(dptr->olddqactmap,
404 404 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
405 405 }
406 406
407 407 /*
408 408 * For a newly created CPU, initialize the dispatch queue.
409 409 * This is called before the CPU is known through cpu[] or on any lists.
410 410 */
411 411 void
412 412 disp_cpu_init(cpu_t *cp)
413 413 {
414 414 disp_t *dp;
415 415 dispq_t *newdispq;
416 416 ulong_t *newdqactmap;
417 417
418 418 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
419 419
420 420 if (cp == cpu0_disp.disp_cpu)
421 421 dp = &cpu0_disp;
422 422 else
423 423 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
424 424 bzero(dp, sizeof (disp_t));
425 425 cp->cpu_disp = dp;
426 426 dp->disp_cpu = cp;
427 427 dp->disp_maxrunpri = -1;
428 428 dp->disp_max_unbound_pri = -1;
429 429 DISP_LOCK_INIT(&cp->cpu_thread_lock);
430 430 /*
431 431 * Allocate memory for the dispatcher queue headers
432 432 * and the active queue bitmap.
433 433 */
434 434 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
435 435 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
436 436 sizeof (long), KM_SLEEP);
437 437 dp->disp_q = newdispq;
438 438 dp->disp_qactmap = newdqactmap;
439 439 dp->disp_q_limit = &newdispq[v.v_nglobpris];
440 440 dp->disp_npri = v.v_nglobpris;
441 441 }
442 442
443 443 void
444 444 disp_cpu_fini(cpu_t *cp)
445 445 {
446 446 ASSERT(MUTEX_HELD(&cpu_lock));
447 447
448 448 disp_kp_free(cp->cpu_disp);
449 449 if (cp->cpu_disp != &cpu0_disp)
450 450 kmem_free(cp->cpu_disp, sizeof (disp_t));
451 451 }
452 452
453 453 /*
454 454 * Allocate new, larger kpreempt dispatch queue to replace the old one.
455 455 */
456 456 void
457 457 disp_kp_alloc(disp_t *dq, pri_t npri)
458 458 {
459 459 struct disp_queue_info mem_info;
460 460
461 461 if (npri > dq->disp_npri) {
462 462 /*
463 463 * Allocate memory for the new array.
464 464 */
465 465 disp_dq_alloc(&mem_info, npri, dq);
466 466
467 467 /*
468 468 * We need to copy the old structures to the new
469 469 * and free the old.
470 470 */
471 471 disp_dq_assign(&mem_info, npri);
472 472 disp_dq_free(&mem_info);
473 473 }
474 474 }
475 475
476 476 /*
477 477 * Free dispatch queue.
478 478 * Used for the kpreempt queues for a removed CPU partition and
479 479 * for the per-CPU queues of deleted CPUs.
480 480 */
481 481 void
482 482 disp_kp_free(disp_t *dq)
483 483 {
484 484 struct disp_queue_info mem_info;
485 485
486 486 mem_info.olddispq = dq->disp_q;
487 487 mem_info.olddqactmap = dq->disp_qactmap;
488 488 mem_info.oldnglobpris = dq->disp_npri;
489 489 disp_dq_free(&mem_info);
490 490 }
491 491
492 492 /*
493 493 * End dispatcher and scheduler initialization.
494 494 */
495 495
496 496 /*
497 497 * See if there's anything to do other than remain idle.
498 498 * Return non-zero if there is.
499 499 *
500 500 * This function must be called with high spl, or with
501 501 * kernel preemption disabled to prevent the partition's
502 502 * active cpu list from changing while being traversed.
503 503 *
504 504 * This is essentially a simpler version of disp_getwork()
505 505 * to be called by CPUs preparing to "halt".
506 506 */
507 507 int
508 508 disp_anywork(void)
509 509 {
510 510 cpu_t *cp = CPU;
511 511 cpu_t *ocp;
512 512 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
513 513
514 514 if (!(cp->cpu_flags & CPU_OFFLINE)) {
515 515 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
516 516 return (1);
517 517
518 518 for (ocp = cp->cpu_next_part; ocp != cp;
519 519 ocp = ocp->cpu_next_part) {
520 520 ASSERT(CPU_ACTIVE(ocp));
521 521
522 522 /*
523 523 * Something has appeared on the local run queue.
524 524 */
525 525 if (*local_nrunnable > 0)
526 526 return (1);
527 527 /*
528 528 * If we encounter another idle CPU that will
529 529 * soon be trolling around through disp_anywork()
530 530 * terminate our walk here and let this other CPU
531 531 * patrol the next part of the list.
532 532 */
533 533 if (ocp->cpu_dispatch_pri == -1 &&
534 534 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
535 535 return (0);
536 536 /*
537 537 * Work can be taken from another CPU if:
538 538 * - There is unbound work on the run queue
539 539 * - That work isn't a thread undergoing a
540 540 * - context switch on an otherwise empty queue.
541 541 * - The CPU isn't running the idle loop.
542 542 */
543 543 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
544 544 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
545 545 ocp->cpu_disp->disp_nrunnable == 1) &&
546 546 ocp->cpu_dispatch_pri != -1)
547 547 return (1);
548 548 }
549 549 }
550 550 return (0);
551 551 }
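
A brief, hedged illustration of the intended caller of disp_anywork(): a platform halt routine would typically disable kernel preemption (or raise spl), consult disp_anywork(), and enter its low-power wait only when nothing is runnable. The names plat_halt_sketch() and mach_cpu_halt_sketch() below are hypothetical, for illustration only:

	extern void mach_cpu_halt_sketch(void);	/* hypothetical low-power wait */

	static void
	plat_halt_sketch(void)
	{
		kpreempt_disable();		/* keep the active CPU list stable */
		if (!disp_anywork())
			mach_cpu_halt_sketch();	/* nothing runnable; wait until poked */
		kpreempt_enable();
	}
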
552 552
553 553 /*
554 554 * Called when CPU enters the idle loop
555 555 */
556 556 static void
557 557 idle_enter()
558 558 {
559 559 cpu_t *cp = CPU;
560 560
561 561 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
562 562 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
563 563 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
564 564 }
565 565
566 566 /*
567 567 * Called when CPU exits the idle loop
568 568 */
569 569 static void
570 570 idle_exit()
571 571 {
572 572 cpu_t *cp = CPU;
573 573
574 574 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
575 575 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
576 576 }
577 577
578 578 /*
579 579 * Idle loop.
580 580 */
581 581 void
582 582 idle()
583 583 {
584 584 struct cpu *cp = CPU; /* pointer to this CPU */
585 585 kthread_t *t; /* taken thread */
586 586
587 587 idle_enter();
588 588
589 589 /*
590 590 * Uniprocessor version of idle loop.
591 591 * Do this until notified that we're on an actual multiprocessor.
592 592 */
593 593 while (ncpus == 1) {
594 594 if (cp->cpu_disp->disp_nrunnable == 0) {
595 595 (*idle_cpu)();
596 596 continue;
597 597 }
598 598 idle_exit();
599 599 swtch();
600 600
601 601 idle_enter(); /* returned from swtch */
602 602 }
603 603
604 604 /*
605 605 * Multiprocessor idle loop.
606 606 */
607 607 for (;;) {
608 608 /*
609 609 * If CPU is completely quiesced by p_online(2), just wait
610 610 * here with minimal bus traffic until put online.
611 611 */
612 612 while (cp->cpu_flags & CPU_QUIESCED)
613 613 (*idle_cpu)();
614 614
615 615 if (cp->cpu_disp->disp_nrunnable != 0) {
616 616 idle_exit();
617 617 swtch();
618 618 } else {
619 619 if (cp->cpu_flags & CPU_OFFLINE)
620 620 continue;
621 621 if ((t = disp_getwork(cp)) == NULL) {
622 622 if (cp->cpu_chosen_level != -1) {
623 623 disp_t *dp = cp->cpu_disp;
624 624 disp_t *kpq;
625 625
626 626 disp_lock_enter(&dp->disp_lock);
627 627 /*
628 628 * Set kpq under lock to prevent
629 629 * migration between partitions.
630 630 */
631 631 kpq = &cp->cpu_part->cp_kp_queue;
632 632 if (kpq->disp_maxrunpri == -1)
633 633 cp->cpu_chosen_level = -1;
634 634 disp_lock_exit(&dp->disp_lock);
635 635 }
636 636 (*idle_cpu)();
637 637 continue;
638 638 }
639 639 /*
640 640 * If there was a thread but we couldn't steal
641 641 * it, then keep trying.
642 642 */
643 643 if (t == T_DONTSTEAL)
644 644 continue;
645 645 idle_exit();
646 646 swtch_to(t);
647 647 }
648 648 idle_enter(); /* returned from swtch/swtch_to */
649 649 }
650 650 }
651 651
652 652
653 653 /*
654 654 * Preempt the currently running thread in favor of the highest
655 655 * priority thread. The class of the current thread controls
656 656 * where it goes on the dispatcher queues. If panicking, turn
657 657 * preemption off.
658 658 */
659 659 void
660 660 preempt()
661 661 {
662 662 kthread_t *t = curthread;
663 663 klwp_t *lwp = ttolwp(curthread);
664 664
665 665 if (panicstr)
666 666 return;
667 667
668 668 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
669 669
670 670 thread_lock(t);
671 671
672 672 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
673 673 /*
674 674 * this thread has already been chosen to be run on
675 675 * another CPU. Clear kprunrun on this CPU since we're
676 676 * already headed for swtch().
677 677 */
678 678 CPU->cpu_kprunrun = 0;
679 679 thread_unlock_nopreempt(t);
680 680 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
681 681 } else {
682 682 if (lwp != NULL)
683 683 lwp->lwp_ru.nivcsw++;
684 684 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
685 685 THREAD_TRANSITION(t);
686 686 CL_PREEMPT(t);
687 687 DTRACE_SCHED(preempt);
688 688 thread_unlock_nopreempt(t);
689 689
690 690 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
691 691
692 692 swtch(); /* clears CPU->cpu_runrun via disp() */
693 693 }
694 694 }
695 695
696 696 extern kthread_t *thread_unpin();
697 697
698 698 /*
699 699 * disp() - find the highest priority thread for this processor to run, and
700 700 * set it in TS_ONPROC state so that resume() can be called to run it.
701 701 */
702 702 static kthread_t *
703 703 disp()
704 704 {
705 705 cpu_t *cpup;
706 706 disp_t *dp;
707 707 kthread_t *tp;
708 708 dispq_t *dq;
709 709 int maxrunword;
710 710 pri_t pri;
711 711 disp_t *kpq;
712 712
713 713 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
714 714
715 715 cpup = CPU;
716 716 /*
717 717 * Find the highest priority loaded, runnable thread.
718 718 */
719 719 dp = cpup->cpu_disp;
720 720
721 721 reschedule:
722 722 /*
723 723 * If there is more important work on the global queue with a better
724 724 * priority than the maximum on this CPU, take it now.
725 725 */
726 726 kpq = &cpup->cpu_part->cp_kp_queue;
727 727 while ((pri = kpq->disp_maxrunpri) >= 0 &&
728 728 pri >= dp->disp_maxrunpri &&
729 729 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
730 730 (tp = disp_getbest(kpq)) != NULL) {
731 731 if (disp_ratify(tp, kpq) != NULL) {
732 732 TRACE_1(TR_FAC_DISP, TR_DISP_END,
733 733 "disp_end:tid %p", tp);
734 734 return (tp);
735 735 }
736 736 }
737 737
738 738 disp_lock_enter(&dp->disp_lock);
739 739 pri = dp->disp_maxrunpri;
740 740
741 741 /*
742 742 * If there is nothing to run, look at what's runnable on other queues.
743 743 * Choose the idle thread if the CPU is quiesced.
744 744 * Note that CPUs that have the CPU_OFFLINE flag set can still run
745 745 * interrupt threads, which will be the only threads on the CPU's own
746 746 * queue, but cannot run threads from other queues.
747 747 */
748 748 if (pri == -1) {
749 749 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
750 750 disp_lock_exit(&dp->disp_lock);
751 751 if ((tp = disp_getwork(cpup)) == NULL ||
752 752 tp == T_DONTSTEAL) {
753 753 tp = cpup->cpu_idle_thread;
754 754 (void) splhigh();
755 755 THREAD_ONPROC(tp, cpup);
756 756 cpup->cpu_dispthread = tp;
757 757 cpup->cpu_dispatch_pri = -1;
758 758 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
759 759 cpup->cpu_chosen_level = -1;
760 760 }
761 761 } else {
762 762 disp_lock_exit_high(&dp->disp_lock);
763 763 tp = cpup->cpu_idle_thread;
764 764 THREAD_ONPROC(tp, cpup);
765 765 cpup->cpu_dispthread = tp;
766 766 cpup->cpu_dispatch_pri = -1;
767 767 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
768 768 cpup->cpu_chosen_level = -1;
769 769 }
770 770 TRACE_1(TR_FAC_DISP, TR_DISP_END,
771 771 "disp_end:tid %p", tp);
772 772 return (tp);
773 773 }
774 774
775 775 dq = &dp->disp_q[pri];
776 776 tp = dq->dq_first;
777 777
778 778 ASSERT(tp != NULL);
779 779 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */
780 780
781 781 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
782 782
783 783 /*
784 784 * Found it so remove it from queue.
785 785 */
786 786 dp->disp_nrunnable--;
787 787 dq->dq_sruncnt--;
788 788 if ((dq->dq_first = tp->t_link) == NULL) {
789 789 ulong_t *dqactmap = dp->disp_qactmap;
790 790
791 791 ASSERT(dq->dq_sruncnt == 0);
792 792 dq->dq_last = NULL;
793 793
794 794 /*
795 795 * The queue is empty, so the corresponding bit needs to be
796 796 * turned off in dqactmap. If nrunnable != 0 just took the
797 797 * last runnable thread off the
798 798 * highest queue, so recompute disp_maxrunpri.
799 799 */
800 800 maxrunword = pri >> BT_ULSHIFT;
801 801 dqactmap[maxrunword] &= ~BT_BIW(pri);
802 802
803 803 if (dp->disp_nrunnable == 0) {
804 804 dp->disp_max_unbound_pri = -1;
805 805 dp->disp_maxrunpri = -1;
806 806 } else {
807 807 int ipri;
808 808
809 809 ipri = bt_gethighbit(dqactmap, maxrunword);
810 810 dp->disp_maxrunpri = ipri;
811 811 if (ipri < dp->disp_max_unbound_pri)
812 812 dp->disp_max_unbound_pri = ipri;
813 813 }
814 814 } else {
815 815 tp->t_link = NULL;
816 816 }
817 817
818 818 /*
819 819 * Set TS_DONT_SWAP flag to prevent another processor from swapping
820 820 * out this thread before we have a chance to run it.
821 821 * While running, it is protected against swapping by t_lock.
822 822 */
823 823 tp->t_schedflag |= TS_DONT_SWAP;
824 824 cpup->cpu_dispthread = tp; /* protected by spl only */
825 825 cpup->cpu_dispatch_pri = pri;
826 826 ASSERT(pri == DISP_PRIO(tp));
827 827 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
828 828 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
829 829
830 830 ASSERT(tp != NULL);
831 831 TRACE_1(TR_FAC_DISP, TR_DISP_END,
832 832 "disp_end:tid %p", tp);
833 833
834 834 if (disp_ratify(tp, kpq) == NULL)
835 835 goto reschedule;
836 836
837 837 return (tp);
838 838 }
839 839
840 840 /*
841 841 * swtch()
842 842 * Find best runnable thread and run it.
843 843 * Called with the current thread already switched to a new state,
844 844 * on a sleep queue, run queue, stopped, and not zombied.
845 845 * May be called at any spl level less than or equal to LOCK_LEVEL.
846 846 * Always drops spl to the base level (spl0()).
847 847 */
848 848 void
849 849 swtch()
850 850 {
851 851 kthread_t *t = curthread;
852 852 kthread_t *next;
853 853 cpu_t *cp;
854 854
855 855 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
856 856
857 857 if (t->t_flag & T_INTR_THREAD)
858 858 cpu_intr_swtch_enter(t);
859 859
860 860 if (t->t_intr != NULL) {
861 861 /*
862 862 * We are an interrupt thread. Setup and return
863 863 * the interrupted thread to be resumed.
864 864 */
865 865 (void) splhigh(); /* block other scheduler action */
866 866 cp = CPU; /* now protected against migration */
867 867 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
868 868 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
869 869 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
870 870 next = thread_unpin();
871 871 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
872 872 resume_from_intr(next);
873 873 } else {
874 874 #ifdef DEBUG
875 875 if (t->t_state == TS_ONPROC &&
876 876 t->t_disp_queue->disp_cpu == CPU &&
877 877 t->t_preempt == 0) {
878 878 thread_lock(t);
879 879 ASSERT(t->t_state != TS_ONPROC ||
880 880 t->t_disp_queue->disp_cpu != CPU ||
881 881 t->t_preempt != 0); /* cannot migrate */
882 882 thread_unlock_nopreempt(t);
883 883 }
884 884 #endif /* DEBUG */
885 885 cp = CPU;
886 886 next = disp(); /* returns with spl high */
887 887 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
888 888
889 889 /* OK to steal anything left on run queue */
890 890 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
891 891
892 892 if (next != t) {
893 893 hrtime_t now;
894 894
895 895 now = gethrtime_unscaled();
896 896 pg_ev_thread_swtch(cp, now, t, next);
897 897
898 898 /*
899 899 * If t was previously in the TS_ONPROC state,
900 900 * setfrontdq and setbackdq won't have set its t_waitrq.
901 901 * Since we now finally know that we're switching away
902 902 * from this thread, set its t_waitrq if it is on a run
903 903 * queue.
904 904 */
905 905 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
906 906 t->t_waitrq = now;
907 907 }
908 908
909 909 /*
910 910 * restore mstate of thread that we are switching to
911 911 */
912 912 restore_mstate(next);
913 913
914 914 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
915 915 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
916 916 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
917 917
918 918 if (dtrace_vtime_active)
919 919 dtrace_vtime_switch(next);
920 920
921 921 resume(next);
922 922 /*
923 923 * The TR_RESUME_END and TR_SWTCH_END trace points
924 924 * appear at the end of resume(), because we may not
925 925 * return here
926 926 */
927 927 } else {
928 928 if (t->t_flag & T_INTR_THREAD)
929 929 cpu_intr_swtch_exit(t);
930 930 /*
931 931 * Threads that enqueue themselves on a run queue defer
932 932 * setting t_waitrq. It is then either set in swtch()
933 933 * when the CPU is actually yielded, or not at all if it
934 934 * is remaining on the CPU.
935 935 * There is however a window between where the thread
936 936 * placed itself on a run queue, and where it selects
937 937 * itself in disp(), where a third party (eg. clock()
938 938 * doing tick processing) may have re-enqueued this
939 939 * thread, setting t_waitrq in the process. We detect
940 940 * this race by noticing that despite switching to
941 941 * ourself, our t_waitrq has been set, and should be
942 942 * cleared.
943 943 */
944 944 if (t->t_waitrq != 0)
945 945 t->t_waitrq = 0;
946 946
947 947 pg_ev_thread_remain(cp, t);
948 948
949 949 DTRACE_SCHED(remain__cpu);
950 950 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
951 951 (void) spl0();
952 952 }
953 953 }
954 954 }
955 955
956 956 /*
957 957 * swtch_from_zombie()
958 958 * Special case of swtch(), which allows checks for TS_ZOMB to be
959 959 * eliminated from normal resume.
960 960 * Find best runnable thread and run it.
961 961 * Called with the current thread zombied.
962 962 * Zombies cannot migrate, so CPU references are safe.
963 963 */
964 964 void
965 965 swtch_from_zombie()
966 966 {
967 967 kthread_t *next;
968 968 cpu_t *cpu = CPU;
969 969
970 970 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
971 971
972 972 ASSERT(curthread->t_state == TS_ZOMB);
973 973
974 974 next = disp(); /* returns with spl high */
975 975 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
976 976 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
977 977 ASSERT(next != curthread);
978 978 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
979 979
980 980 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
981 981
982 982 restore_mstate(next);
983 983
984 984 if (dtrace_vtime_active)
985 985 dtrace_vtime_switch(next);
986 986
987 987 resume_from_zombie(next);
988 988 /*
989 989 * The TR_RESUME_END and TR_SWTCH_END trace points
990 990 * appear at the end of resume(), because we certainly will not
991 991 * return here
992 992 */
993 993 }
994 994
995 995 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
996 996
997 997 /*
998 998 * search_disp_queues()
999 999 * Search the given dispatch queues for thread tp.
1000 1000 * Return 1 if tp is found, otherwise return 0.
1001 1001 */
1002 1002 static int
1003 1003 search_disp_queues(disp_t *dp, kthread_t *tp)
1004 1004 {
1005 1005 dispq_t *dq;
1006 1006 dispq_t *eq;
1007 1007
1008 1008 disp_lock_enter_high(&dp->disp_lock);
1009 1009
1010 1010 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011 1011 kthread_t *rp;
1012 1012
1013 1013 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1014 1014
1015 1015 for (rp = dq->dq_first; rp; rp = rp->t_link)
1016 1016 if (tp == rp) {
1017 1017 disp_lock_exit_high(&dp->disp_lock);
1018 1018 return (1);
1019 1019 }
1020 1020 }
1021 1021 disp_lock_exit_high(&dp->disp_lock);
1022 1022
1023 1023 return (0);
1024 1024 }
1025 1025
1026 1026 /*
1027 1027 * thread_on_queue()
1028 1028 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1029 1029 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1030 1030 */
1031 1031 static int
1032 1032 thread_on_queue(kthread_t *tp)
1033 1033 {
1034 1034 cpu_t *cp;
1035 1035 struct cpupart *part;
1036 1036
1037 1037 ASSERT(getpil() >= DISP_LEVEL);
1038 1038
1039 1039 /*
1040 1040 * Search the per-CPU dispatch queues for tp.
1041 1041 */
1042 1042 cp = CPU;
1043 1043 do {
1044 1044 if (search_disp_queues(cp->cpu_disp, tp))
1045 1045 return (1);
1046 1046 } while ((cp = cp->cpu_next_onln) != CPU);
1047 1047
1048 1048 /*
1049 1049 * Search the partition-wide kpreempt queues for tp.
1050 1050 */
1051 1051 part = CPU->cpu_part;
1052 1052 do {
1053 1053 if (search_disp_queues(&part->cp_kp_queue, tp))
1054 1054 return (1);
1055 1055 } while ((part = part->cp_next) != CPU->cpu_part);
1056 1056
1057 1057 return (0);
1058 1058 }
1059 1059
1060 1060 #else
1061 1061
1062 1062 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1063 1063
1064 1064 #endif /* DEBUG */
1065 1065
1066 1066 /*
1067 1067 * like swtch(), but switch to a specified thread taken from another CPU.
1068 1068 * called with spl high..
1069 1069 */
1070 1070 void
1071 1071 swtch_to(kthread_t *next)
1072 1072 {
1073 1073 cpu_t *cp = CPU;
1074 1074 hrtime_t now;
1075 1075
1076 1076 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1077 1077
1078 1078 /*
1079 1079 * Update context switch statistics.
1080 1080 */
1081 1081 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1082 1082
1083 1083 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1084 1084
1085 1085 now = gethrtime_unscaled();
1086 1086 pg_ev_thread_swtch(cp, now, curthread, next);
1087 1087
1088 1088 /* OK to steal anything left on run queue */
1089 1089 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1090 1090
1091 1091 /* record last execution time */
1092 1092 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1093 1093
1094 1094 /*
1095 1095 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1096 1096 * won't have set its t_waitrq. Since we now finally know that we're
1097 1097 * switching away from this thread, set its t_waitrq if it is on a run
1098 1098 * queue.
1099 1099 */
1100 1100 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101 1101 curthread->t_waitrq = now;
1102 1102 }
1103 1103
1104 1104 /* restore next thread to previously running microstate */
1105 1105 restore_mstate(next);
1106 1106
1107 1107 if (dtrace_vtime_active)
1108 1108 dtrace_vtime_switch(next);
1109 1109
1110 1110 resume(next);
1111 1111 /*
1112 1112 * The TR_RESUME_END and TR_SWTCH_END trace points
1113 1113 * appear at the end of resume(), because we may not
1114 1114 * return here
1115 1115 */
1116 1116 }
1117 1117
1118 1118 #define CPU_IDLING(pri) ((pri) == -1)
1119 1119
1120 1120 static void
1121 1121 cpu_resched(cpu_t *cp, pri_t tpri)
1122 1122 {
1123 1123 int call_poke_cpu = 0;
1124 1124 pri_t cpupri = cp->cpu_dispatch_pri;
1125 1125
1126 1126 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1127 1127 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128 1128 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129 1129 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130 1130 cp->cpu_runrun = 1;
1131 1131 aston(cp->cpu_dispthread);
1132 1132 if (tpri < kpreemptpri && cp != CPU)
1133 1133 call_poke_cpu = 1;
1134 1134 }
1135 1135 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136 1136 cp->cpu_kprunrun = 1;
1137 1137 if (cp != CPU)
1138 1138 call_poke_cpu = 1;
1139 1139 }
1140 1140 }
1141 1141
1142 1142 /*
1143 1143 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1144 1144 */
1145 1145 membar_enter();
1146 1146
1147 1147 if (call_poke_cpu)
1148 1148 poke_cpu(cp->cpu_id);
1149 1149 }
1150 1150
1151 1151 /*
1152 1152 * setbackdq() keeps runqs balanced such that the difference in length
1153 1153 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1154 1154 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1155 1155 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156 1156 * try to keep runqs perfectly balanced regardless of the thread priority.
1157 1157 */
1158 1158 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1159 1159 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1160 1160 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1161 1161
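
To make the balance rule above easier to follow before it reappears inside the refactored enqueue path below, here is a sketch that restates it as a standalone predicate; runq_prefer_neighbour_sketch() is a hypothetical helper written for illustration, not code added by this patch:

	/*
	 * Sketch: returns B_TRUE when the neighbouring CPU's queue at this
	 * priority is enough shorter that the thread should be placed there.
	 * Threads at or above RUNQ_MATCH_PRI without TS_RUNQMATCH tolerate a
	 * skew of up to RUNQ_MAX_DIFF; lower-priority (or TS_RUNQMATCH)
	 * threads require the queue lengths to match.
	 */
	static boolean_t
	runq_prefer_neighbour_sketch(kthread_t *tp, cpu_t *cp, cpu_t *newcp,
	    pri_t tpri)
	{
		int qlen = RUNQ_LEN(cp, tpri);

		if (tpri >= RUNQ_MATCH_PRI && !(tp->t_schedflag & TS_RUNQMATCH))
			qlen -= RUNQ_MAX_DIFF;

		return (qlen > 0 && RUNQ_LEN(newcp, tpri) < qlen);
	}
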
1162 1162 /*
1163 1163 * Macro that evaluates to true if it is likely that the thread has cache
1164 1164 * warmth. This is based on the amount of time that has elapsed since the
1165 1165 * thread last ran. If that amount of time is less than "rechoose_interval"
1166 1166 * ticks, then we decide that the thread has enough cache warmth to warrant
1167 1167 * some affinity for t->t_cpu.
1168 1168 */
1169 1169 #define THREAD_HAS_CACHE_WARMTH(thread) \
1170 1170 ((thread == curthread) || \
1171 1171 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
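
For a sense of scale: rechoose_interval defaults to RECHOOSE_INTERVAL (3) lbolt ticks, so assuming a hypothetical 100 Hz clock (hz = 100, i.e. 10 ms per tick), a thread is treated as cache-warm for roughly 30 ms after it last ran on a CPU.
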
1172 +
1172 1173 /*
1173 - * Put the specified thread on the back of the dispatcher
1174 - * queue corresponding to its current priority.
1174 + * Put the specified thread on the front/back of the dispatcher queue
1175 + * corresponding to its current priority.
1175 1176 *
1176 - * Called with the thread in transition, onproc or stopped state
1177 - * and locked (transition implies locked) and at high spl.
1178 - * Returns with the thread in TS_RUN state and still locked.
1177 + * Called with the thread in transition, onproc or stopped state and locked
1178 + * (transition implies locked) and at high spl. Returns with the thread in
1179 + * TS_RUN state and still locked.
1179 1180 */
1180 -void
1181 -setbackdq(kthread_t *tp)
1181 +static void
1182 +setfrontbackdq(kthread_t *tp, boolean_t front)
1182 1183 {
1183 - dispq_t *dq;
1184 + dispq_t *dq;
1184 1185 disp_t *dp;
1185 1186 cpu_t *cp;
1186 1187 pri_t tpri;
1187 - int bound;
1188 + boolean_t bound;
1188 1189 boolean_t self;
1189 1190
1190 1191 ASSERT(THREAD_LOCK_HELD(tp));
1191 1192 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1192 1193 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1193 1194
1194 1195 /*
1195 1196 * If thread is "swapped" or on the swap queue don't
1196 1197 * queue it, but wake sched.
1197 1198 */
1198 1199 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1199 1200 disp_swapped_setrun(tp);
1200 1201 return;
1201 1202 }
1202 1203
1203 - self = (tp == curthread);
1204 -
1205 - if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1206 - bound = 1;
1207 - else
1208 - bound = 0;
1204 + self = (tp == curthread);
1205 + bound = (tp->t_bound_cpu || tp->t_weakbound_cpu);
1209 1206
1210 1207 tpri = DISP_PRIO(tp);
1211 1208 if (ncpus == 1)
1212 1209 cp = tp->t_cpu;
1213 1210 else if (!bound) {
1214 1211 if (tpri >= kpqpri) {
1215 - setkpdq(tp, SETKP_BACK);
1212 + setkpdq(tp, front ? SETKP_FRONT : SETKP_BACK);
1216 1213 return;
1217 1214 }
1218 1215
1219 - /*
1220 - * We'll generally let this thread continue to run where
1221 - * it last ran...but will consider migration if:
1222 - * - We thread probably doesn't have much cache warmth.
1223 - * - The CPU where it last ran is the target of an offline
1224 - * request.
1225 - * - The thread last ran outside it's home lgroup.
1226 - */
1227 - if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228 - (tp->t_cpu == cpu_inmotion)) {
1229 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230 - } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232 - self ? tp->t_cpu : NULL);
1233 - } else {
1234 - cp = tp->t_cpu;
1235 - }
1236 -
1237 - if (tp->t_cpupart == cp->cpu_part) {
1238 - int qlen;
1216 + cp = tp->t_cpu;
1239 1217
1218 + if (!front) {
1240 1219 /*
1241 - * Perform any CMT load balancing
1220 + * We'll generally let this thread continue to run where
1221 + * it last ran...but will consider migration if:
1222 +		 * - The thread probably doesn't have much cache warmth.
1223 + * - The CPU where it last ran is the target of an offline
1224 + * request.
1225 +		 * - The thread last ran outside its home lgroup.
1242 1226 */
1243 - cp = cmt_balance(tp, cp);
1227 + if ((!THREAD_HAS_CACHE_WARMTH(tp)) || (cp == cpu_inmotion)) {
1228 + cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri, NULL);
1229 + } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) {
1230 + cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1231 + self ? cp : NULL);
1232 + }
1244 1233
1245 - /*
1246 - * Balance across the run queues
1247 - */
1248 - qlen = RUNQ_LEN(cp, tpri);
1249 - if (tpri >= RUNQ_MATCH_PRI &&
1250 - !(tp->t_schedflag & TS_RUNQMATCH))
1251 - qlen -= RUNQ_MAX_DIFF;
1252 - if (qlen > 0) {
1253 - cpu_t *newcp;
1254 -
1255 - if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256 - newcp = cp->cpu_next_part;
1257 - } else if ((newcp = cp->cpu_next_lpl) == cp) {
1258 - newcp = cp->cpu_next_part;
1234 + }
1235 +
1236 + if (tp->t_cpupart == cp->cpu_part) {
1237 + if (front) {
1238 + /*
1239 + * We'll generally let this thread continue to run
1240 + * where it last ran, but will consider migration if:
1241 +			 * - The thread last ran outside its home lgroup.
1242 + * - The CPU where it last ran is the target of an
1243 + * offline request (a thread_nomigrate() on the in
1244 + * motion CPU relies on this when forcing a preempt).
1245 + * - The thread isn't the highest priority thread where
1246 + * it last ran, and it is considered not likely to
1247 + * have significant cache warmth.
1248 + */
1249 + if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1250 + (cp == cpu_inmotion)) {
1251 + cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1252 + self ? cp : NULL);
1253 + } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1254 + (!THREAD_HAS_CACHE_WARMTH(tp))) {
1255 + cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1256 + NULL);
1259 1257 }
1258 + } else {
1259 + int qlen;
1260 1260
1261 - if (RUNQ_LEN(newcp, tpri) < qlen) {
1262 - DTRACE_PROBE3(runq__balance,
1263 - kthread_t *, tp,
1264 - cpu_t *, cp, cpu_t *, newcp);
1265 - cp = newcp;
1261 + /*
1262 + * Perform any CMT load balancing
1263 + */
1264 + cp = cmt_balance(tp, cp);
1265 +
1266 + /*
1267 + * Balance across the run queues
1268 + */
1269 + qlen = RUNQ_LEN(cp, tpri);
1270 + if (tpri >= RUNQ_MATCH_PRI &&
1271 + !(tp->t_schedflag & TS_RUNQMATCH))
1272 + qlen -= RUNQ_MAX_DIFF;
1273 + if (qlen > 0) {
1274 + cpu_t *newcp;
1275 +
1276 + if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1277 + newcp = cp->cpu_next_part;
1278 + } else if ((newcp = cp->cpu_next_lpl) == cp) {
1279 + newcp = cp->cpu_next_part;
1280 + }
1281 +
1282 + if (RUNQ_LEN(newcp, tpri) < qlen) {
1283 + DTRACE_PROBE3(runq__balance,
1284 + kthread_t *, tp,
1285 + cpu_t *, cp, cpu_t *, newcp);
1286 + cp = newcp;
1287 + }
1266 1288 }
1267 1289 }
1268 1290 } else {
1269 1291 /*
1270 1292 * Migrate to a cpu in the new partition.
1271 1293 */
1272 1294 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273 1295 tp->t_lpl, tp->t_pri, NULL);
1274 1296 }
1297 +
1275 1298 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276 1299 } else {
1277 1300 /*
1278 1301 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279 1302 * a short time until weak binding that existed when the
1280 1303 * strong binding was established has dropped) so we must
1281 1304 * favour weak binding over strong.
1282 1305 */
1283 1306 cp = tp->t_weakbound_cpu ?
1284 1307 tp->t_weakbound_cpu : tp->t_bound_cpu;
1285 1308 }
1309 +
1286 1310 /*
1287 1311 * A thread that is ONPROC may be temporarily placed on the run queue
1288 1312 * but then chosen to run again by disp. If the thread we're placing on
1289 1313 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1290 1314 * replacement process is actually scheduled in swtch(). In this
1291 1315 * situation, curthread is the only thread that could be in the ONPROC
1292 1316 * state.
1293 1317 */
1294 1318 if ((!self) && (tp->t_waitrq == 0)) {
1295 1319 hrtime_t curtime;
1296 1320
1297 1321 curtime = gethrtime_unscaled();
1298 1322 (void) cpu_update_pct(tp, curtime);
1299 1323 tp->t_waitrq = curtime;
1300 1324 } else {
1301 1325 (void) cpu_update_pct(tp, gethrtime_unscaled());
1302 1326 }
1303 1327
1304 1328 dp = cp->cpu_disp;
1305 1329 disp_lock_enter_high(&dp->disp_lock);
1306 1330
1307 - DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1308 - TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1309 - tpri, cp, tp);
1331 + DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, front);
1332 + if (front) {
1333 + TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri,
1334 + tp);
1335 + } else {
1336 + TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1337 + tpri, cp, tp);
1338 + }
1310 1339
1311 1340 #ifndef NPROBE
1312 1341 /* Kernel probe */
1313 1342 if (tnf_tracing_active)
1314 1343 tnf_thread_queue(tp, cp, tpri);
1315 1344 #endif /* NPROBE */
1316 1345
1317 1346 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1318 1347
1319 1348 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1320 1349 tp->t_disp_queue = dp;
1321 1350 tp->t_link = NULL;
1322 1351
1323 1352 dq = &dp->disp_q[tpri];
1324 1353 dp->disp_nrunnable++;
1325 1354 if (!bound)
1326 1355 dp->disp_steal = 0;
1327 1356 membar_enter();
1328 1357
1329 1358 if (dq->dq_sruncnt++ != 0) {
1330 - ASSERT(dq->dq_first != NULL);
1331 - dq->dq_last->t_link = tp;
1332 - dq->dq_last = tp;
1359 + if (front) {
1360 + ASSERT(dq->dq_last != NULL);
1361 + tp->t_link = dq->dq_first;
1362 + dq->dq_first = tp;
1363 + } else {
1364 + ASSERT(dq->dq_first != NULL);
1365 + dq->dq_last->t_link = tp;
1366 + dq->dq_last = tp;
1367 + }
1333 1368 } else {
1334 1369 ASSERT(dq->dq_first == NULL);
1335 1370 ASSERT(dq->dq_last == NULL);
1336 1371 dq->dq_first = dq->dq_last = tp;
1337 1372 BT_SET(dp->disp_qactmap, tpri);
1338 1373 if (tpri > dp->disp_maxrunpri) {
1339 1374 dp->disp_maxrunpri = tpri;
1340 1375 membar_enter();
1341 1376 cpu_resched(cp, tpri);
1342 1377 }
1343 1378 }
1344 1379
1345 1380 if (!bound && tpri > dp->disp_max_unbound_pri) {
1346 1381 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1347 1382 /*
1348 1383 * If there are no other unbound threads on the
1349 1384 * run queue, don't allow other CPUs to steal
1350 1385 * this thread while we are in the middle of a
1351 1386 * context switch. We may just switch to it
1352 1387 * again right away. CPU_DISP_DONTSTEAL is cleared
1353 1388 * in swtch and swtch_to.
1354 1389 */
1355 1390 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1356 1391 }
1357 1392 dp->disp_max_unbound_pri = tpri;
1358 1393 }
1394 +
1359 1395 (*disp_enq_thread)(cp, bound);
1360 1396 }
1361 1397
1362 1398 /*
1399 + * Put the specified thread on the back of the dispatcher
1400 + * queue corresponding to its current priority.
1401 + *
1402 + * Called with the thread in transition, onproc or stopped state
1403 + * and locked (transition implies locked) and at high spl.
1404 + * Returns with the thread in TS_RUN state and still locked.
1405 + */
1406 +void
1407 +setbackdq(kthread_t *tp)
1408 +{
1409 + setfrontbackdq(tp, B_FALSE);
1410 +}
1411 +
1412 +/*
1363 1413 * Put the specified thread on the front of the dispatcher
1364 1414 * queue corresponding to its current priority.
1365 1415 *
1366 1416 * Called with the thread in transition, onproc or stopped state
1367 1417 * and locked (transition implies locked) and at high spl.
1368 1418 * Returns with the thread in TS_RUN state and still locked.
1369 1419 */
1370 1420 void
1371 1421 setfrontdq(kthread_t *tp)
1372 1422 {
1373 - disp_t *dp;
1374 - dispq_t *dq;
1375 - cpu_t *cp;
1376 - pri_t tpri;
1377 - int bound;
1378 -
1379 - ASSERT(THREAD_LOCK_HELD(tp));
1380 - ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1381 - ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1382 -
1383 - /*
1384 - * If thread is "swapped" or on the swap queue don't
1385 - * queue it, but wake sched.
1386 - */
1387 - if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1388 - disp_swapped_setrun(tp);
1389 - return;
1390 - }
1391 -
1392 - if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1393 - bound = 1;
1394 - else
1395 - bound = 0;
1396 -
1397 - tpri = DISP_PRIO(tp);
1398 - if (ncpus == 1)
1399 - cp = tp->t_cpu;
1400 - else if (!bound) {
1401 - if (tpri >= kpqpri) {
1402 - setkpdq(tp, SETKP_FRONT);
1403 - return;
1404 - }
1405 - cp = tp->t_cpu;
1406 - if (tp->t_cpupart == cp->cpu_part) {
1407 - /*
1408 - * We'll generally let this thread continue to run
1409 - * where it last ran, but will consider migration if:
1410 - * - The thread last ran outside it's home lgroup.
1411 - * - The CPU where it last ran is the target of an
1412 - * offline request (a thread_nomigrate() on the in
1413 - * motion CPU relies on this when forcing a preempt).
1414 - * - The thread isn't the highest priority thread where
1415 - * it last ran, and it is considered not likely to
1416 - * have significant cache warmth.
1417 - */
1418 - if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419 - (cp == cpu_inmotion)) {
1420 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421 - (tp == curthread) ? cp : NULL);
1422 - } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423 - (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424 - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425 - NULL);
1426 - }
1427 - } else {
1428 - /*
1429 - * Migrate to a cpu in the new partition.
1430 - */
1431 - cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432 - tp->t_lpl, tp->t_pri, NULL);
1433 - }
1434 - ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435 - } else {
1436 - /*
1437 - * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438 - * a short time until weak binding that existed when the
1439 - * strong binding was established has dropped) so we must
1440 - * favour weak binding over strong.
1441 - */
1442 - cp = tp->t_weakbound_cpu ?
1443 - tp->t_weakbound_cpu : tp->t_bound_cpu;
1444 - }
1445 -
1446 - /*
1447 - * A thread that is ONPROC may be temporarily placed on the run queue
1448 - * but then chosen to run again by disp. If the thread we're placing on
1449 - * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1450 - * replacement process is actually scheduled in swtch(). In this
1451 - * situation, curthread is the only thread that could be in the ONPROC
1452 - * state.
1453 - */
1454 - if ((tp != curthread) && (tp->t_waitrq == 0)) {
1455 - hrtime_t curtime;
1456 -
1457 - curtime = gethrtime_unscaled();
1458 - (void) cpu_update_pct(tp, curtime);
1459 - tp->t_waitrq = curtime;
1460 - } else {
1461 - (void) cpu_update_pct(tp, gethrtime_unscaled());
1462 - }
1463 -
1464 - dp = cp->cpu_disp;
1465 - disp_lock_enter_high(&dp->disp_lock);
1466 -
1467 - TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1468 - DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1469 -
1470 -#ifndef NPROBE
1471 - /* Kernel probe */
1472 - if (tnf_tracing_active)
1473 - tnf_thread_queue(tp, cp, tpri);
1474 -#endif /* NPROBE */
1475 -
1476 - ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1477 -
1478 - THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */
1479 - tp->t_disp_queue = dp;
1480 -
1481 - dq = &dp->disp_q[tpri];
1482 - dp->disp_nrunnable++;
1483 - if (!bound)
1484 - dp->disp_steal = 0;
1485 - membar_enter();
1486 -
1487 - if (dq->dq_sruncnt++ != 0) {
1488 - ASSERT(dq->dq_last != NULL);
1489 - tp->t_link = dq->dq_first;
1490 - dq->dq_first = tp;
1491 - } else {
1492 - ASSERT(dq->dq_last == NULL);
1493 - ASSERT(dq->dq_first == NULL);
1494 - tp->t_link = NULL;
1495 - dq->dq_first = dq->dq_last = tp;
1496 - BT_SET(dp->disp_qactmap, tpri);
1497 - if (tpri > dp->disp_maxrunpri) {
1498 - dp->disp_maxrunpri = tpri;
1499 - membar_enter();
1500 - cpu_resched(cp, tpri);
1501 - }
1502 - }
1503 -
1504 - if (!bound && tpri > dp->disp_max_unbound_pri) {
1505 - if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1506 - cp == CPU) {
1507 - /*
1508 - * If there are no other unbound threads on the
1509 - * run queue, don't allow other CPUs to steal
1510 - * this thread while we are in the middle of a
1511 - * context switch. We may just switch to it
1512 - * again right away. CPU_DISP_DONTSTEAL is cleared
1513 - * in swtch and swtch_to.
1514 - */
1515 - cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1516 - }
1517 - dp->disp_max_unbound_pri = tpri;
1518 - }
1519 - (*disp_enq_thread)(cp, bound);
1423 + setfrontbackdq(tp, B_TRUE);
1520 1424 }
1521 1425
1522 1426 /*
1523 1427 * Put a high-priority unbound thread on the kp queue
1524 1428 */
1525 1429 static void
1526 1430 setkpdq(kthread_t *tp, int borf)
1527 1431 {
1528 1432 dispq_t *dq;
1529 1433 disp_t *dp;
1530 1434 cpu_t *cp;
1531 1435 pri_t tpri;
1532 1436
1533 1437 tpri = DISP_PRIO(tp);
1534 1438
1535 1439 dp = &tp->t_cpupart->cp_kp_queue;
1536 1440 disp_lock_enter_high(&dp->disp_lock);
1537 1441
1538 1442 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1539 1443
1540 1444 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1541 1445 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1542 1446 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1543 1447 tp->t_disp_queue = dp;
1544 1448 dp->disp_nrunnable++;
1545 1449 dq = &dp->disp_q[tpri];
1546 1450
1547 1451 if (dq->dq_sruncnt++ != 0) {
1548 1452 if (borf == SETKP_BACK) {
1549 1453 ASSERT(dq->dq_first != NULL);
1550 1454 tp->t_link = NULL;
1551 1455 dq->dq_last->t_link = tp;
1552 1456 dq->dq_last = tp;
1553 1457 } else {
1554 1458 ASSERT(dq->dq_last != NULL);
1555 1459 tp->t_link = dq->dq_first;
1556 1460 dq->dq_first = tp;
1557 1461 }
1558 1462 } else {
1559 1463 if (borf == SETKP_BACK) {
1560 1464 ASSERT(dq->dq_first == NULL);
1561 1465 ASSERT(dq->dq_last == NULL);
1562 1466 dq->dq_first = dq->dq_last = tp;
1563 1467 } else {
1564 1468 ASSERT(dq->dq_last == NULL);
1565 1469 ASSERT(dq->dq_first == NULL);
1566 1470 tp->t_link = NULL;
1567 1471 dq->dq_first = dq->dq_last = tp;
1568 1472 }
1569 1473 BT_SET(dp->disp_qactmap, tpri);
1570 1474 if (tpri > dp->disp_max_unbound_pri)
1571 1475 dp->disp_max_unbound_pri = tpri;
1572 1476 if (tpri > dp->disp_maxrunpri) {
1573 1477 dp->disp_maxrunpri = tpri;
1574 1478 membar_enter();
1575 1479 }
1576 1480 }
1577 1481
1578 1482 cp = tp->t_cpu;
1579 1483 if (tp->t_cpupart != cp->cpu_part) {
1580 1484 /* migrate to a cpu in the new partition */
1581 1485 cp = tp->t_cpupart->cp_cpulist;
1582 1486 }
1583 1487 cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1584 1488 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585 1489 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 1490
1587 1491 #ifndef NPROBE
1588 1492 /* Kernel probe */
1589 1493 if (tnf_tracing_active)
1590 1494 tnf_thread_queue(tp, cp, tpri);
1591 1495 #endif /* NPROBE */
1592 1496
1593 1497 if (cp->cpu_chosen_level < tpri)
1594 1498 cp->cpu_chosen_level = tpri;
1595 1499 cpu_resched(cp, tpri);
1596 1500 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597 1501 (*disp_enq_thread)(cp, 0);
1598 1502 }
1599 1503
1600 1504 /*
1601 1505 * Remove a thread from the dispatcher queue if it is on it.
1602 1506 * It is not an error if it is not found but we return whether
1603 1507 * or not it was found in case the caller wants to check.
1604 1508 */
1605 1509 int
1606 1510 dispdeq(kthread_t *tp)
1607 1511 {
1608 1512 disp_t *dp;
1609 1513 dispq_t *dq;
1610 1514 kthread_t *rp;
1611 1515 kthread_t *trp;
1612 1516 kthread_t **ptp;
1613 1517 int tpri;
1614 1518
1615 1519 ASSERT(THREAD_LOCK_HELD(tp));
1616 1520
1617 1521 if (tp->t_state != TS_RUN)
1618 1522 return (0);
1619 1523
1620 1524 /*
1621 1525 * The thread is "swapped" or is on the swap queue and
1622 1526 * hence no longer on the run queue, so return true.
1623 1527 */
1624 1528 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1625 1529 return (1);
1626 1530
1627 1531 tpri = DISP_PRIO(tp);
1628 1532 dp = tp->t_disp_queue;
1629 1533 ASSERT(tpri < dp->disp_npri);
1630 1534 dq = &dp->disp_q[tpri];
1631 1535 ptp = &dq->dq_first;
1632 1536 rp = *ptp;
1633 1537 trp = NULL;
1634 1538
1635 1539 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1636 1540
1637 1541 /*
1638 1542 * Search for thread in queue.
1639 1543 * Double links would simplify this at the expense of disp/setrun.
1640 1544 */
1641 1545 while (rp != tp && rp != NULL) {
1642 1546 trp = rp;
1643 1547 ptp = &trp->t_link;
1644 1548 rp = trp->t_link;
1645 1549 }
1646 1550
1647 1551 if (rp == NULL) {
1648 1552 panic("dispdeq: thread not on queue");
1649 1553 }
1650 1554
1651 1555 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1652 1556
1653 1557 /*
1654 1558 * Found it so remove it from queue.
1655 1559 */
1656 1560 if ((*ptp = rp->t_link) == NULL)
1657 1561 dq->dq_last = trp;
1658 1562
1659 1563 dp->disp_nrunnable--;
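/*
 * If this was the last loaded thread at this priority, clear the
 * priority's bit in the active-queue bitmap and recompute the
 * cached disp_maxrunpri and disp_max_unbound_pri values.
 */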
1660 1564 if (--dq->dq_sruncnt == 0) {
1661 1565 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1662 1566 if (dp->disp_nrunnable == 0) {
1663 1567 dp->disp_max_unbound_pri = -1;
1664 1568 dp->disp_maxrunpri = -1;
1665 1569 } else if (tpri == dp->disp_maxrunpri) {
1666 1570 int ipri;
1667 1571
1668 1572 ipri = bt_gethighbit(dp->disp_qactmap,
1669 1573 dp->disp_maxrunpri >> BT_ULSHIFT);
1670 1574 if (ipri < dp->disp_max_unbound_pri)
1671 1575 dp->disp_max_unbound_pri = ipri;
1672 1576 dp->disp_maxrunpri = ipri;
1673 1577 }
1674 1578 }
1675 1579 tp->t_link = NULL;
1676 1580 THREAD_TRANSITION(tp); /* put in intermediate state */
1677 1581 return (1);
1678 1582 }
1679 1583
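/*
 * A common caller pattern, sketched here for illustration only
 * (newpri is a hypothetical variable): a scheduling class changing
 * the priority of a runnable thread dequeues it and requeues it at
 * the new level.
 *
 *	thread_lock(tp);
 *	if (dispdeq(tp)) {
 *		tp->t_pri = newpri;
 *		setbackdq(tp);
 *	}
 *	thread_unlock(tp);
 */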
1680 1584
1681 1585 /*
1682 1586 * dq_sruninc and dq_srundec are public functions for
1683 1587 * incrementing/decrementing the sruncnts when a thread on
1684 1588 * a dispatcher queue is made schedulable/unschedulable by
1685 1589 * resetting the TS_LOAD flag.
1686 1590 *
1687 1591 * The caller MUST have the thread lock and therefore the dispatcher
1688 1592 * queue lock so that the operation which changes
1689 1593 * the flag, the operation that checks the status of the thread to
1690 1594 * determine if it's on a disp queue AND the call to this function
1691 1595 * are one atomic operation with respect to interrupts.
1692 1596 */
1693 1597
1694 1598 /*
1695 1599 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1696 1600 */
1697 1601 void
1698 1602 dq_sruninc(kthread_t *t)
1699 1603 {
1700 1604 ASSERT(t->t_state == TS_RUN);
1701 1605 ASSERT(t->t_schedflag & TS_LOAD);
1702 1606
1703 1607 THREAD_TRANSITION(t);
1704 1608 setfrontdq(t);
1705 1609 }
1706 1610
1707 1611 /*
1708 1612 * See comment on calling conventions above.
1709 1613 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1710 1614 */
1711 1615 void
1712 1616 dq_srundec(kthread_t *t)
1713 1617 {
1714 1618 ASSERT(t->t_schedflag & TS_LOAD);
1715 1619
1716 1620 (void) dispdeq(t);
1717 1621 disp_swapped_enq(t);
1718 1622 }
1719 1623
1720 1624 /*
1721 1625 * Change the dispatcher lock of thread to the "swapped_lock"
1722 1626 * and return with thread lock still held.
1723 1627 *
1724 1628 * Called with thread_lock held, in transition state, and at high spl.
1725 1629 */
1726 1630 void
1727 1631 disp_swapped_enq(kthread_t *tp)
1728 1632 {
1729 1633 ASSERT(THREAD_LOCK_HELD(tp));
1730 1634 ASSERT(tp->t_schedflag & TS_LOAD);
1731 1635
1732 1636 switch (tp->t_state) {
1733 1637 case TS_RUN:
1734 1638 disp_lock_enter_high(&swapped_lock);
1735 1639 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1736 1640 break;
1737 1641 case TS_ONPROC:
1738 1642 disp_lock_enter_high(&swapped_lock);
1739 1643 THREAD_TRANSITION(tp);
1740 1644 wake_sched_sec = 1; /* tell clock to wake sched */
1741 1645 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1742 1646 break;
1743 1647 default:
1744 1648 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1745 1649 }
1746 1650 }
1747 1651
1748 1652 /*
1749 1653 * This routine is called by setbackdq/setfrontdq if the thread is
1750 1654 * not loaded or loaded and on the swap queue.
1751 1655 *
1752 1656 * Thread state TS_SLEEP implies that a swapped thread
1753 1657 * has been woken up and needs to be swapped in by the swapper.
1754 1658 *
1755 1659 * Thread state TS_RUN implies that the priority of a swapped
1756 1660 * thread is being increased by its scheduling class (e.g. ts_update).
1757 1661 */
1758 1662 static void
1759 1663 disp_swapped_setrun(kthread_t *tp)
1760 1664 {
1761 1665 ASSERT(THREAD_LOCK_HELD(tp));
1762 1666 ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1763 1667
1764 1668 switch (tp->t_state) {
1765 1669 case TS_SLEEP:
1766 1670 disp_lock_enter_high(&swapped_lock);
1767 1671 /*
1768 1672 * Wakeup sched immediately (i.e., next tick) if the
1769 1673 * thread priority is above maxclsyspri.
1770 1674 */
1771 1675 if (DISP_PRIO(tp) > maxclsyspri)
1772 1676 wake_sched = 1;
1773 1677 else
1774 1678 wake_sched_sec = 1;
1775 1679 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1776 1680 break;
1777 1681 case TS_RUN: /* called from ts_update */
1778 1682 break;
1779 1683 default:
1780 1684 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1781 1685 }
1782 1686 }
1783 1687
1784 1688 /*
1785 1689 * Make a thread give up its processor. Find the processor on
1786 1690 * which this thread is executing, and have that processor
1787 1691 * preempt.
1788 1692 *
1789 1693 * We allow System Duty Cycle (SDC) threads to be preempted even if
1790 1694 * they are running at kernel priorities. To implement this, we always
1791 1695 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1792 1696 * calls cpu_surrender() very often, we only preempt if there is anyone
1793 1697 * competing with us.
1794 1698 */
1795 1699 void
1796 1700 cpu_surrender(kthread_t *tp)
1797 1701 {
1798 1702 cpu_t *cpup;
1799 1703 int max_pri;
1800 1704 int max_run_pri;
1801 1705 klwp_t *lwp;
1802 1706
1803 1707 ASSERT(THREAD_LOCK_HELD(tp));
1804 1708
1805 1709 if (tp->t_state != TS_ONPROC)
1806 1710 return;
1807 1711 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1808 1712 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1809 1713 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1810 1714 if (max_pri < max_run_pri)
1811 1715 max_pri = max_run_pri;
1812 1716
1813 1717 if (tp->t_cid == sysdccid) {
1814 1718 uint_t t_pri = DISP_PRIO(tp);
1815 1719 if (t_pri > max_pri)
1816 1720 return; /* we are not competing w/ anyone */
1817 1721 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1818 1722 } else {
1819 1723 cpup->cpu_runrun = 1;
1820 1724 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1821 1725 cpup->cpu_kprunrun = 1;
1822 1726 }
1823 1727 }
1824 1728
1825 1729 /*
1826 1730 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1827 1731 */
1828 1732 membar_enter();
1829 1733
1830 1734 DTRACE_SCHED1(surrender, kthread_t *, tp);
1831 1735
1832 1736 /*
1833 1737 * Make the target thread take an excursion through trap()
1834 1738 * to do preempt() (unless we're already in trap or post_syscall,
1835 1739 * calling cpu_surrender via CL_TRAPRET).
1836 1740 */
1837 1741 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1838 1742 lwp->lwp_state != LWP_USER) {
1839 1743 aston(tp);
1840 1744 if (cpup != CPU)
1841 1745 poke_cpu(cpup->cpu_id);
1842 1746 }
1843 1747 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1844 1748 "cpu_surrender:tid %p cpu %p", tp, cpup);
1845 1749 }
1846 1750
1847 1751 /*
1848 1752 * Commit to and ratify a scheduling decision
1849 1753 */
1850 1754 /*ARGSUSED*/
1851 1755 static kthread_t *
1852 1756 disp_ratify(kthread_t *tp, disp_t *kpq)
1853 1757 {
1854 1758 pri_t tpri, maxpri;
1855 1759 pri_t maxkpri;
1856 1760 cpu_t *cpup;
1857 1761
1858 1762 ASSERT(tp != NULL);
1859 1763 /*
1860 1764 * Commit to, then ratify scheduling decision
1861 1765 */
1862 1766 cpup = CPU;
1863 1767 if (cpup->cpu_runrun != 0)
1864 1768 cpup->cpu_runrun = 0;
1865 1769 if (cpup->cpu_kprunrun != 0)
1866 1770 cpup->cpu_kprunrun = 0;
1867 1771 if (cpup->cpu_chosen_level != -1)
1868 1772 cpup->cpu_chosen_level = -1;
1869 1773 membar_enter();
1870 1774 tpri = DISP_PRIO(tp);
1871 1775 maxpri = cpup->cpu_disp->disp_maxrunpri;
1872 1776 maxkpri = kpq->disp_maxrunpri;
1873 1777 if (maxpri < maxkpri)
1874 1778 maxpri = maxkpri;
1875 1779 if (tpri < maxpri) {
1876 1780 /*
1877 1781 * should have done better
1878 1782 * put this one back and indicate to try again
1879 1783 */
1880 1784 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1881 1785 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1882 1786 thread_lock_high(tp);
1883 1787 THREAD_TRANSITION(tp);
1884 1788 setfrontdq(tp);
1885 1789 thread_unlock_nopreempt(tp);
1886 1790
1887 1791 tp = NULL;
1888 1792 }
1889 1793 return (tp);
1890 1794 }
1891 1795
1892 1796 /*
1893 1797 * See if there is any work on the dispatcher queue for other CPUs.
1894 1798 * If there is, dequeue the best thread and return.
1895 1799 */
1896 1800 static kthread_t *
1897 1801 disp_getwork(cpu_t *cp)
1898 1802 {
1899 1803 cpu_t *ocp; /* other CPU */
1900 1804 cpu_t *ocp_start;
1901 1805 cpu_t *tcp; /* target local CPU */
1902 1806 kthread_t *tp;
1903 1807 kthread_t *retval = NULL;
1904 1808 pri_t maxpri;
1905 1809 disp_t *kpq; /* kp queue for this partition */
1906 1810 lpl_t *lpl, *lpl_leaf;
1907 1811 int leafidx, startidx;
1908 1812 hrtime_t stealtime;
1909 1813 lgrp_id_t local_id;
1910 1814
1911 1815 maxpri = -1;
1912 1816 tcp = NULL;
1913 1817
1914 1818 kpq = &cp->cpu_part->cp_kp_queue;
1915 1819 while (kpq->disp_maxrunpri >= 0) {
1916 1820 /*
1917 1821 * Try to take a thread from the kp_queue.
1918 1822 */
1919 1823 tp = disp_getbest(kpq);
1920 1824 if (tp)
1921 1825 return (disp_ratify(tp, kpq));
1922 1826 }
1923 1827
1924 1828 kpreempt_disable(); /* protect the cpu_active list */
1925 1829
1926 1830 /*
1927 1831 * Try to find something to do on another CPU's run queue.
1928 1832 * Loop through all other CPUs looking for the one with the highest
1929 1833 * priority unbound thread.
1930 1834 *
1931 1835 * On NUMA machines, the partition's CPUs are consulted in order of
1932 1836 * distance from the current CPU. This way, the first available
1933 1837 * work found is also the closest, and will suffer the least
1934 1838 * from being migrated.
1935 1839 */
1936 1840 lpl = lpl_leaf = cp->cpu_lpl;
1937 1841 local_id = lpl_leaf->lpl_lgrpid;
1938 1842 leafidx = startidx = 0;
1939 1843
1940 1844 /*
1941 1845 * This loop traverses the lpl hierarchy. Higher level lpls represent
1942 1846 * broader levels of locality.
1943 1847 */
1944 1848 do {
1945 1849 /* This loop iterates over the lpl's leaves */
1946 1850 do {
1947 1851 if (lpl_leaf != cp->cpu_lpl)
1948 1852 ocp = lpl_leaf->lpl_cpus;
1949 1853 else
1950 1854 ocp = cp->cpu_next_lpl;
1951 1855
1952 1856 /* This loop iterates over the CPUs in the leaf */
1953 1857 ocp_start = ocp;
1954 1858 do {
1955 1859 pri_t pri;
1956 1860
1957 1861 ASSERT(CPU_ACTIVE(ocp));
1958 1862
1959 1863 /*
1960 1864 * End our stroll around this lpl if:
1961 1865 *
1962 1866 * - Something became runnable on the local
1963 1867 * queue...which also ends our stroll around
1964 1868 * the partition.
1965 1869 *
1966 1870 * - We happen across another idle CPU.
1967 1871 * Since it is patrolling the next portion
1968 1872 * of the lpl's list (assuming it's not
1969 1873 * halted, or busy servicing an interrupt),
1970 1874 * move to the next higher level of locality.
1971 1875 */
1972 1876 if (cp->cpu_disp->disp_nrunnable != 0) {
1973 1877 kpreempt_enable();
1974 1878 return (NULL);
1975 1879 }
1976 1880 if (ocp->cpu_dispatch_pri == -1) {
1977 1881 if (ocp->cpu_disp_flags &
1978 1882 CPU_DISP_HALTED ||
1979 1883 ocp->cpu_intr_actv != 0)
1980 1884 continue;
1981 1885 else
1982 1886 goto next_level;
1983 1887 }
1984 1888
1985 1889 /*
1986 1890 * If there's only one thread and the CPU
1987 1891 * is in the middle of a context switch,
1988 1892 * or it's currently running the idle thread,
1989 1893 * don't steal it.
1990 1894 */
1991 1895 if ((ocp->cpu_disp_flags &
1992 1896 CPU_DISP_DONTSTEAL) &&
1993 1897 ocp->cpu_disp->disp_nrunnable == 1)
1994 1898 continue;
1995 1899
1996 1900 pri = ocp->cpu_disp->disp_max_unbound_pri;
1997 1901 if (pri > maxpri) {
1998 1902 /*
1999 1903 * Don't steal threads that we attempted
2000 1904 * to steal recently until they're ready
2001 1905 * to be stolen again.
2002 1906 */
2003 1907 stealtime = ocp->cpu_disp->disp_steal;
2004 1908 if (stealtime == 0 ||
2005 1909 stealtime - gethrtime() <= 0) {
2006 1910 maxpri = pri;
2007 1911 tcp = ocp;
2008 1912 } else {
2009 1913 /*
2010 1914 * Don't update tcp, just set
2011 1915 * the retval to T_DONTSTEAL, so
2012 1916 * that if no acceptable CPUs
2013 1917 * are found the return value
2014 1918 * will be T_DONTSTEAL rather
2015 1919 * than NULL.
2016 1920 */
2017 1921 retval = T_DONTSTEAL;
2018 1922 }
2019 1923 }
2020 1924 } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2021 1925
2022 1926 /*
2023 1927 * Iterate to the next leaf lpl in the resource set
2024 1928 * at this level of locality. If we hit the end of
2025 1929 * the set, wrap back around to the beginning.
2026 1930 *
2027 1931 * Note: This iteration is NULL terminated for a reason;
2028 1932 * see lpl_topo_bootstrap() in lgrp.c for details.
2029 1933 */
2030 1934 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2031 1935 leafidx = 0;
2032 1936 lpl_leaf = lpl->lpl_rset[leafidx];
2033 1937 }
2034 1938 } while (leafidx != startidx);
2035 1939
2036 1940 next_level:
2037 1941 /*
2038 1942 * Expand the search to include farther away CPUs (next
2039 1943 * locality level). The closer CPUs that have already been
2040 1944 * checked will be checked again. In doing so, idle CPUs
2041 1945 * will tend to be more aggressive about stealing from CPUs
2042 1946 * that are closer (since the closer CPUs will be considered
2043 1947 * more often).
2044 1948 * Begin at this level with the CPUs local leaf lpl.
2045 1949 */
2046 1950 if ((lpl = lpl->lpl_parent) != NULL) {
2047 1951 leafidx = startidx = lpl->lpl_id2rset[local_id];
2048 1952 lpl_leaf = lpl->lpl_rset[leafidx];
2049 1953 }
2050 1954 } while (!tcp && lpl);
2051 1955
2052 1956 kpreempt_enable();
2053 1957
2054 1958 /*
2055 1959 * If another queue looks good, and there is still nothing on
2056 1960 * the local queue, try to transfer one or more threads
2057 1961 * from it to our queue.
2058 1962 */
2059 1963 if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2060 1964 tp = disp_getbest(tcp->cpu_disp);
2061 1965 if (tp == NULL || tp == T_DONTSTEAL)
2062 1966 return (tp);
2063 1967 return (disp_ratify(tp, kpq));
2064 1968 }
2065 1969 return (retval);
2066 1970 }
2067 1971
2068 1972
2069 1973 /*
2070 1974 * disp_fix_unbound_pri()
2071 1975 * Determines the maximum priority of unbound threads on the queue.
2072 1976 * The priority is kept for the queue, but is only increased, never
2073 1977 * reduced unless some CPU is looking for something on that queue.
2074 1978 *
2075 1979 * The priority argument is the known upper limit.
2076 1980 *
2077 1981 * Perhaps this should be kept accurately, but that probably means
2078 1982 * separate bitmaps for bound and unbound threads. Since only idled
2079 1983 * CPUs will have to do this recalculation, it seems better this way.
2080 1984 */
2081 1985 static void
2082 1986 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2083 1987 {
2084 1988 kthread_t *tp;
2085 1989 dispq_t *dq;
2086 1990 ulong_t *dqactmap = dp->disp_qactmap;
2087 1991 ulong_t mapword;
2088 1992 int wx;
2089 1993
2090 1994 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2091 1995
2092 1996 ASSERT(pri >= 0); /* checked by caller */
2093 1997
2094 1998 /*
2095 1999 * Start the search at the next lowest priority below the supplied
2096 2000 * priority. This depends on the bitmap implementation.
2097 2001 */
2098 2002 do {
2099 2003 wx = pri >> BT_ULSHIFT; /* index of word in map */
2100 2004
2101 2005 /*
2102 2006 * Form mask for all lower priorities in the word.
2103 2007 */
2104 2008 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
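/*
 * Example (assuming 64-bit map words): for pri == 67, wx is 1 and
 * (BT_BIW(67) - 1) masks the low three bits of the word, i.e. the
 * bits for priorities 64..66.
 */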
2105 2009
2106 2010 /*
2107 2011 * Get next lower active priority.
2108 2012 */
2109 2013 if (mapword != 0) {
2110 2014 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2111 2015 } else if (wx > 0) {
2112 2016 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2113 2017 if (pri < 0)
2114 2018 break;
2115 2019 } else {
2116 2020 pri = -1;
2117 2021 break;
2118 2022 }
2119 2023
2120 2024 /*
2121 2025 * Search the queue for unbound, runnable threads.
2122 2026 */
2123 2027 dq = &dp->disp_q[pri];
2124 2028 tp = dq->dq_first;
2125 2029
2126 2030 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2127 2031 tp = tp->t_link;
2128 2032 }
2129 2033
2130 2034 /*
2131 2035 * If a thread was found, set the priority and return.
2132 2036 */
2133 2037 } while (tp == NULL);
2134 2038
2135 2039 /*
2136 2040 * pri holds the maximum unbound thread priority or -1.
2137 2041 */
2138 2042 if (dp->disp_max_unbound_pri != pri)
2139 2043 dp->disp_max_unbound_pri = pri;
2140 2044 }
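/*
 * Illustration only (not kernel code; assumes 64-bit map words): the
 * bitmap scan backing the dispatcher's cached priorities amounts to a
 * word scan plus a highbit within the first nonzero word, which is
 * what bt_gethighbit() provides for the real queues:
 *
 *	int
 *	highest_active(ulong_t *map, int nwords)
 *	{
 *		int wx;
 *
 *		for (wx = nwords - 1; wx >= 0; wx--)
 *			if (map[wx] != 0)
 *				return ((wx << 6) + highbit(map[wx]) - 1);
 *		return (-1);
 *	}
 */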
2141 2045
2142 2046 /*
2143 2047 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2144 2048 * check if the CPU to which it was previously bound should have
2145 2049 * its disp_max_unbound_pri increased.
2146 2050 */
2147 2051 void
2148 2052 disp_adjust_unbound_pri(kthread_t *tp)
2149 2053 {
2150 2054 disp_t *dp;
2151 2055 pri_t tpri;
2152 2056
2153 2057 ASSERT(THREAD_LOCK_HELD(tp));
2154 2058
2155 2059 /*
2156 2060 * Don't do anything if the thread is not bound, or
2157 2061 * currently not runnable or swapped out.
2158 2062 */
2159 2063 if (tp->t_bound_cpu == NULL ||
2160 2064 tp->t_state != TS_RUN ||
2161 2065 tp->t_schedflag & TS_ON_SWAPQ)
2162 2066 return;
2163 2067
2164 2068 tpri = DISP_PRIO(tp);
2165 2069 dp = tp->t_bound_cpu->cpu_disp;
2166 2070 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2167 2071 if (tpri > dp->disp_max_unbound_pri)
2168 2072 dp->disp_max_unbound_pri = tpri;
2169 2073 }
2170 2074
2171 2075 /*
2172 2076 * disp_getbest()
2173 2077 * De-queue the highest priority unbound runnable thread.
2174 2078 * Returns with the thread unlocked and onproc but at splhigh (like disp()).
2175 2079 * Returns NULL if nothing found.
2176 2080 * Returns T_DONTSTEAL if the thread was not stealable,
2177 2081 * so that the caller will try again later.
2178 2082 *
2179 2083 * Passed a pointer to a dispatch queue not associated with this
2180 2084 * CPU.
2181 2085 */
2182 2086 static kthread_t *
2183 2087 disp_getbest(disp_t *dp)
2184 2088 {
2185 2089 kthread_t *tp;
2186 2090 dispq_t *dq;
2187 2091 pri_t pri;
2188 2092 cpu_t *cp, *tcp;
2189 2093 boolean_t allbound;
2190 2094
2191 2095 disp_lock_enter(&dp->disp_lock);
2192 2096
2193 2097 /*
2194 2098 * If there is nothing to run, or the CPU is in the middle of a
2195 2099 * context switch of the only thread, return NULL.
2196 2100 */
2197 2101 tcp = dp->disp_cpu;
2198 2102 cp = CPU;
2199 2103 pri = dp->disp_max_unbound_pri;
2200 2104 if (pri == -1 ||
2201 2105 (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2202 2106 tcp->cpu_disp->disp_nrunnable == 1)) {
2203 2107 disp_lock_exit_nopreempt(&dp->disp_lock);
2204 2108 return (NULL);
2205 2109 }
2206 2110
2207 2111 dq = &dp->disp_q[pri];
2208 2112
2209 2113
2210 2114 /*
2211 2115 * Assume that all threads are bound on this queue, and change it
2212 2116 * later when we find out that it is not the case.
2213 2117 */
2214 2118 allbound = B_TRUE;
2215 2119 for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2216 2120 hrtime_t now, nosteal, rqtime;
2217 2121
2218 2122 /*
2219 2123 * Skip over bound threads which could be here even
2220 2124 * though disp_max_unbound_pri indicated this level.
2221 2125 */
2222 2126 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2223 2127 continue;
2224 2128
2225 2129 /*
2226 2130 * We've got some unbound threads on this queue, so turn
2227 2131 * the allbound flag off now.
2228 2132 */
2229 2133 allbound = B_FALSE;
2230 2134
2231 2135 /*
2232 2136 * The thread is a candidate for stealing from its run queue. We
2233 2137 * don't want to steal threads that became runnable just a
2234 2138 * moment ago. This improves CPU affinity for threads that get
2235 2139 * preempted for short periods of time and go back on the run
2236 2140 * queue.
2237 2141 *
2238 2142 * We want to let it stay on its run queue if it was only placed
2239 2143 * there recently and it was running on the same CPU before that
2240 2144 * to preserve its cache investment. For the thread to remain on
2241 2145 * its run queue, ALL of the following conditions must be
2242 2146 * satisfied:
2243 2147 *
2244 2148 * - the disp queue should not be the kernel preemption queue
2245 2149 * - delayed idle stealing should not be disabled
2246 2150 * - nosteal_nsec should be non-zero
2247 2151 * - it should run with user priority
2248 2152 * - it should be on the run queue of the CPU where it was
2249 2153 * running before being placed on the run queue
2250 2154 * - it should be the only thread on the run queue (to prevent
2251 2155 * extra scheduling latency for other threads)
2252 2156 * - it should sit on the run queue for less than per-chip
2253 2157 * nosteal interval or global nosteal interval
2254 2158 * - in case of CPUs with shared cache it should sit in a run
2255 2159 * queue of a CPU from a different chip
2256 2160 *
2257 2161 * The checks are arranged so that the ones that are faster are
2258 2162 * placed earlier.
2259 2163 */
2260 2164 if (tcp == NULL ||
2261 2165 pri >= minclsyspri ||
2262 2166 tp->t_cpu != tcp)
2263 2167 break;
2264 2168
2265 2169 /*
2266 2170 * Steal immediately if, due to the CMT processor architecture,
2267 2171 * migration between cp and tcp would incur no performance
2268 2172 * penalty.
2269 2173 */
2270 2174 if (pg_cmt_can_migrate(cp, tcp))
2271 2175 break;
2272 2176
2273 2177 nosteal = nosteal_nsec;
2274 2178 if (nosteal == 0)
2275 2179 break;
2276 2180
2277 2181 /*
2278 2182 * Calculate time spent sitting on run queue
2279 2183 */
2280 2184 now = gethrtime_unscaled();
2281 2185 rqtime = now - tp->t_waitrq;
2282 2186 scalehrtime(&rqtime);
2283 2187
2284 2188 /*
2285 2189 * Steal immediately if the time spent on this run queue is more
2286 2190 * than allowed nosteal delay.
2287 2191 *
2288 2192 * Negative rqtime check is needed here to avoid infinite
2289 2193 * stealing delays caused by unlikely but not impossible
2290 2194 * drifts between CPU times on different CPUs.
2291 2195 */
2292 2196 if (rqtime > nosteal || rqtime < 0)
2293 2197 break;
2294 2198
2295 2199 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2296 2200 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2297 2201 scalehrtime(&now);
2298 2202 /*
2299 2203 * Calculate when this thread becomes stealable
2300 2204 */
2301 2205 now += (nosteal - rqtime);
2302 2206
2303 2207 /*
2304 2208 * Calculate time when some thread becomes stealable
2305 2209 */
2306 2210 if (now < dp->disp_steal)
2307 2211 dp->disp_steal = now;
2308 2212 }
2309 2213
2310 2214 /*
2311 2215 * If there were no unbound threads on this queue, fix the recorded
2312 2216 * max unbound priority and return later. The value of
2313 2217 * disp_max_unbound_pri is not always accurate because it isn't
2314 2218 * reduced until another idle CPU looks for work.
2315 2219 */
2316 2220 if (allbound)
2317 2221 disp_fix_unbound_pri(dp, pri);
2318 2222
2319 2223 /*
2320 2224 * If we reached the end of the queue and found no unbound threads
2321 2225 * then return NULL so that other CPUs will be considered. If there
2322 2226 * are unbound threads but they cannot yet be stolen, then
2323 2227 * return T_DONTSTEAL and try again later.
2324 2228 */
2325 2229 if (tp == NULL) {
2326 2230 disp_lock_exit_nopreempt(&dp->disp_lock);
2327 2231 return (allbound ? NULL : T_DONTSTEAL);
2328 2232 }
2329 2233
2330 2234 /*
2331 2235 * Found a runnable, unbound thread, so remove it from queue.
2332 2236 * dispdeq() requires that we have the thread locked, and we do,
2333 2237 * by virtue of holding the dispatch queue lock. dispdeq() will
2334 2238 * put the thread in transition state, thereby dropping the dispq
2335 2239 * lock.
2336 2240 */
2337 2241
2338 2242 #ifdef DEBUG
2339 2243 {
2340 2244 int thread_was_on_queue;
2341 2245
2342 2246 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
2343 2247 ASSERT(thread_was_on_queue);
2344 2248 }
2345 2249
2346 2250 #else /* DEBUG */
2347 2251 (void) dispdeq(tp); /* drops disp_lock */
2348 2252 #endif /* DEBUG */
2349 2253
2350 2254 /*
2351 2255 * Reset the disp_queue steal time; we do not know what the smallest
2352 2256 * value across the queue is.
2353 2257 */
2354 2258 dp->disp_steal = 0;
2355 2259
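/*
 * Prevent the thread from being swapped out while it is in transit
 * to this CPU.
 */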
2356 2260 tp->t_schedflag |= TS_DONT_SWAP;
2357 2261
2358 2262 /*
2359 2263 * Setup thread to run on the current CPU.
2360 2264 */
2361 2265 tp->t_disp_queue = cp->cpu_disp;
2362 2266
2363 2267 cp->cpu_dispthread = tp; /* protected by spl only */
2364 2268 cp->cpu_dispatch_pri = pri;
2365 2269
2366 2270 /*
2367 2271 * There can be a memory synchronization race between disp_getbest()
2368 2272 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2369 2273 * to preempt the current thread to run the enqueued thread while
2370 2274 * disp_getbest() and disp_ratify() are changing the current thread
2371 2275 * to the stolen thread. This may lead to a situation where
2372 2276 * cpu_resched() tries to preempt the wrong thread and the
2373 2277 * stolen thread continues to run on the CPU which has been tagged
2374 2278 * for preemption.
2375 2279 * Later the clock thread gets enqueued but doesn't get to run on the
2376 2280 * CPU causing the system to hang.
2377 2281 *
2378 2282 * To avoid this, grabbing and dropping the disp_lock (which does
2379 2283 * a memory barrier) is needed to synchronize the execution of
2380 2284 * cpu_resched() with disp_getbest() and disp_ratify() and
2381 2285 * synchronize the memory read and written by cpu_resched(),
2382 2286 * disp_getbest(), and disp_ratify() with each other.
2383 2287 * (see CR#6482861 for more details).
2384 2288 */
2385 2289 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2386 2290 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2387 2291
2388 2292 ASSERT(pri == DISP_PRIO(tp));
2389 2293
2390 2294 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2391 2295
2392 2296 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
2393 2297
2394 2298 /*
2395 2299 * Return with spl high so that swtch() won't need to raise it.
2396 2300 * The disp_lock was dropped by dispdeq().
2397 2301 */
2398 2302
2399 2303 return (tp);
2400 2304 }
2401 2305
2402 2306 /*
2403 2307 * disp_bound_common() - common routine for higher level functions
2404 2308 * that check for bound threads under certain conditions.
2405 2309 * If 'threadlistsafe' is set then there is no need to acquire
2406 2310 * pidlock to stop the thread list from changing (eg, if
2407 2311 * disp_bound_* is called with cpus paused).
2408 2312 */
2409 2313 static int
2410 2314 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2411 2315 {
2412 2316 int found = 0;
2413 2317 kthread_t *tp;
2414 2318
2415 2319 ASSERT(flag);
2416 2320
2417 2321 if (!threadlistsafe)
2418 2322 mutex_enter(&pidlock);
2419 2323 tp = curthread; /* faster than allthreads */
2420 2324 do {
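/*
 * Walk the circular t_next list of all threads, starting at
 * curthread and stopping when we come back around to it.
 */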
2421 2325 if (tp->t_state != TS_FREE) {
2422 2326 /*
2423 2327 * If an interrupt thread is busy, but the
2424 2328 * caller doesn't care (i.e. BOUND_INTR is off),
2425 2329 * then just ignore it and continue through.
2426 2330 */
2427 2331 if ((tp->t_flag & T_INTR_THREAD) &&
2428 2332 !(flag & BOUND_INTR))
2429 2333 continue;
2430 2334
2431 2335 /*
2432 2336 * Skip the idle thread for the CPU
2433 2337 * we're about to set offline.
2434 2338 */
2435 2339 if (tp == cp->cpu_idle_thread)
2436 2340 continue;
2437 2341
2438 2342 /*
2439 2343 * Skip the pause thread for the CPU
2440 2344 * we're about to set offline.
2441 2345 */
2442 2346 if (tp == cp->cpu_pause_thread)
2443 2347 continue;
2444 2348
2445 2349 if ((flag & BOUND_CPU) &&
2446 2350 (tp->t_bound_cpu == cp ||
2447 2351 tp->t_bind_cpu == cp->cpu_id ||
2448 2352 tp->t_weakbound_cpu == cp)) {
2449 2353 found = 1;
2450 2354 break;
2451 2355 }
2452 2356
2453 2357 if ((flag & BOUND_PARTITION) &&
2454 2358 (tp->t_cpupart == cp->cpu_part)) {
2455 2359 found = 1;
2456 2360 break;
2457 2361 }
2458 2362 }
2459 2363 } while ((tp = tp->t_next) != curthread && found == 0);
2460 2364 if (!threadlistsafe)
2461 2365 mutex_exit(&pidlock);
2462 2366 return (found);
2463 2367 }
2464 2368
2465 2369 /*
2466 2370 * disp_bound_threads - return nonzero if threads are bound to the processor.
2467 2371 * Called infrequently. Keep this simple.
2468 2372 * Includes threads that are asleep or stopped but not onproc.
2469 2373 */
2470 2374 int
2471 2375 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2472 2376 {
2473 2377 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2474 2378 }
2475 2379
2476 2380 /*
2477 2381 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2478 2382 * to the given processor, including interrupt threads.
2479 2383 */
2480 2384 int
2481 2385 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2482 2386 {
2483 2387 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2484 2388 }
2485 2389
2486 2390 /*
2487 2391 * disp_bound_partition - return nonzero if threads are bound to the same
2488 2392 * partition as the processor.
2489 2393 * Called infrequently. Keep this simple.
2490 2394 * Includes threads that are asleep or stopped but not onproc.
2491 2395 */
2492 2396 int
2493 2397 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2494 2398 {
2495 2399 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2496 2400 }
2497 2401
2498 2402 /*
2499 2403 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2500 2404 * threads to other CPUs.
2501 2405 */
2502 2406 void
2503 2407 disp_cpu_inactive(cpu_t *cp)
2504 2408 {
2505 2409 kthread_t *tp;
2506 2410 disp_t *dp = cp->cpu_disp;
2507 2411 dispq_t *dq;
2508 2412 pri_t pri;
2509 2413 int wasonq;
2510 2414
2511 2415 disp_lock_enter(&dp->disp_lock);
2512 2416 while ((pri = dp->disp_max_unbound_pri) != -1) {
2513 2417 dq = &dp->disp_q[pri];
2514 2418 tp = dq->dq_first;
2515 2419
2516 2420 /*
2517 2421 * Skip over bound threads.
2518 2422 */
2519 2423 while (tp != NULL && tp->t_bound_cpu != NULL) {
2520 2424 tp = tp->t_link;
2521 2425 }
2522 2426
2523 2427 if (tp == NULL) {
2524 2428 /* disp_max_unbound_pri must be inaccurate, so fix it */
2525 2429 disp_fix_unbound_pri(dp, pri);
2526 2430 continue;
2527 2431 }
2528 2432
2529 2433 wasonq = dispdeq(tp); /* drops disp_lock */
2530 2434 ASSERT(wasonq);
2531 2435 ASSERT(tp->t_weakbound_cpu == NULL);
2532 2436
2533 2437 setbackdq(tp);
2534 2438 /*
2535 2439 * Called from cpu_offline:
2536 2440 *
2537 2441 * cp has already been removed from the list of active cpus
2538 2442 * and tp->t_cpu has been changed so there is no risk of
2539 2443 * tp ending up back on cp.
2540 2444 *
2541 2445 * Called from cpupart_move_cpu:
2542 2446 *
2543 2447 * The cpu has moved to a new cpupart. Any threads that
2544 2448 * were on its dispatch queues before the move remain
2545 2449 * in the old partition and can't run in the new partition.
2546 2450 */
2547 2451 ASSERT(tp->t_cpu != cp);
2548 2452 thread_unlock(tp);
2549 2453
2550 2454 disp_lock_enter(&dp->disp_lock);
2551 2455 }
2552 2456 disp_lock_exit(&dp->disp_lock);
2553 2457 }
2554 2458
2555 2459 /*
2556 2460 * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557 2461 * The hint passed in is used as a starting point so we don't favor
2558 2462 * CPU 0 or any other CPU. The caller should pass in the most recently
2559 2463 * used CPU for the thread.
2560 2464 *
2561 2465 * The lgroup and priority are used to determine the best CPU to run on
2562 2466 * in a NUMA machine. The lgroup specifies which CPUs are closest while
2563 2467 * the thread priority will indicate whether the thread will actually run
2564 2468 * there. To pick the best CPU, the CPUs inside and outside of the given
2565 2469 * lgroup which are running the lowest priority threads are found. The
2566 2470 * remote CPU is chosen only if the thread will not run locally on a CPU
2567 2471 * within the lgroup, but will run on the remote CPU. If the thread
2568 2472 * cannot immediately run on any CPU, the best local CPU will be chosen.
2569 2473 *
2570 2474 * The lpl specified also identifies the cpu partition from which
2571 2475 * disp_lowpri_cpu should select a CPU.
2572 2476 *
2573 2477 * curcpu is used to indicate that disp_lowpri_cpu is being called on
2574 2478 * behalf of the current thread. (curthread is looking for a new cpu)
2575 2479 * In this case, cpu_dispatch_pri for this thread's cpu should be
2576 2480 * ignored.
2577 2481 *
2578 2482 * If a cpu is the target of an offline request then try to avoid it.
2579 2483 *
2580 2484 * This function must be called at either high SPL, or with preemption
2581 2485 * disabled, so that the "hint" CPU cannot be removed from the online
2582 2486 * CPU list while we are traversing it.
2583 2487 */
2584 2488 cpu_t *
2585 2489 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2586 2490 {
2587 2491 cpu_t *bestcpu;
2588 2492 cpu_t *besthomecpu;
2589 2493 cpu_t *cp, *cpstart;
2590 2494
2591 2495 pri_t bestpri;
2592 2496 pri_t cpupri;
2593 2497
2594 2498 klgrpset_t done;
2595 2499 klgrpset_t cur_set;
2596 2500
2597 2501 lpl_t *lpl_iter, *lpl_leaf;
2598 2502 int i;
2599 2503
2600 2504 /*
2601 2505 * Scan for a CPU currently running the lowest priority thread.
2602 2506 * Cannot get cpu_lock here because it is adaptive.
2603 2507 * We do not require lock on CPU list.
2604 2508 */
2605 2509 ASSERT(hint != NULL);
2606 2510 ASSERT(lpl != NULL);
2607 2511 ASSERT(lpl->lpl_ncpu > 0);
2608 2512
2609 2513 /*
2610 2514 * First examine local CPUs. Note that it's possible the hint CPU
2611 2515 * passed in is remote to the specified home lgroup. If our priority
2612 2516 * isn't high enough for us to run immediately at home, we then
2613 2517 * examine CPUs remote to our home lgroup.
2614 2518 * We would like to give preference to CPUs closest to "home".
2615 2519 * If we can't find a CPU where we'll run at a given level
2616 2520 * of locality, we expand our search to include the next level.
2617 2521 */
2618 2522 bestcpu = besthomecpu = NULL;
2619 2523 klgrpset_clear(done);
2620 2524 /* start with lpl we were passed */
2621 2525
2622 2526 lpl_iter = lpl;
2623 2527
2624 2528 do {
2625 2529
2626 2530 bestpri = SHRT_MAX;
2627 2531 klgrpset_clear(cur_set);
2628 2532
2629 2533 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2630 2534 lpl_leaf = lpl_iter->lpl_rset[i];
2631 2535 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632 2536 continue;
2633 2537
2634 2538 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 2539
2636 2540 if (hint->cpu_lpl == lpl_leaf)
2637 2541 cp = cpstart = hint;
2638 2542 else
2639 2543 cp = cpstart = lpl_leaf->lpl_cpus;
2640 2544
2641 2545 do {
2642 2546 if (cp == curcpu)
2643 2547 cpupri = -1;
2644 2548 else if (cp == cpu_inmotion)
2645 2549 cpupri = SHRT_MAX;
2646 2550 else
2647 2551 cpupri = cp->cpu_dispatch_pri;
2648 2552 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649 2553 cpupri = cp->cpu_disp->disp_maxrunpri;
2650 2554 if (cp->cpu_chosen_level > cpupri)
2651 2555 cpupri = cp->cpu_chosen_level;
2652 2556 if (cpupri < bestpri) {
2653 2557 if (CPU_IDLING(cpupri)) {
2654 2558 ASSERT((cp->cpu_flags &
2655 2559 CPU_QUIESCED) == 0);
2656 2560 return (cp);
2657 2561 }
2658 2562 bestcpu = cp;
2659 2563 bestpri = cpupri;
2660 2564 }
2661 2565 } while ((cp = cp->cpu_next_lpl) != cpstart);
2662 2566 }
2663 2567
2664 2568 if (bestcpu && (tpri > bestpri)) {
2665 2569 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666 2570 return (bestcpu);
2667 2571 }
2668 2572 if (besthomecpu == NULL)
2669 2573 besthomecpu = bestcpu;
2670 2574 /*
2671 2575 * Add the lgrps we just considered to the "done" set
2672 2576 */
2673 2577 klgrpset_or(done, cur_set);
2674 2578
2675 2579 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 2580
2677 2581 /*
2678 2582 * The specified priority isn't high enough to run immediately
2679 2583 * anywhere, so just return the best CPU from the home lgroup.
2680 2584 */
2681 2585 ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682 2586 return (besthomecpu);
2683 2587 }
2684 2588
2685 2589 /*
2686 2590 * This routine provides the generic idle cpu function for all processors.
2687 2591 * If a processor has some specific code to execute when idle (say, to stop
2688 2592 * the pipeline and save power) then that routine should be defined in the
2689 2593 * processors specific code (module_xx.c) and the global variable idle_cpu
2690 2594 * set to that function.
2691 2595 */
2692 2596 static void
2693 2597 generic_idle_cpu(void)
2694 2598 {
2695 2599 }
2696 2600
2697 2601 /*ARGSUSED*/
2698 2602 static void
2699 2603 generic_enq_thread(cpu_t *cpu, int bound)
2700 2604 {
2701 2605 }