1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2013, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/param.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/cred.h>
  31 #include <sys/proc.h>
  32 #include <sys/session.h>
  33 #include <sys/strsubr.h>
  34 #include <sys/user.h>
  35 #include <sys/priocntl.h>
  36 #include <sys/class.h>
  37 #include <sys/disp.h>
  38 #include <sys/procset.h>
  39 #include <sys/debug.h>
  40 #include <sys/kmem.h>
  41 #include <sys/errno.h>
  42 #include <sys/fx.h>
  43 #include <sys/fxpriocntl.h>
  44 #include <sys/cpuvar.h>
  45 #include <sys/systm.h>
  46 #include <sys/vtrace.h>
  47 #include <sys/schedctl.h>
  48 #include <sys/tnf_probe.h>
  49 #include <sys/sunddi.h>
  50 #include <sys/spl.h>
  51 #include <sys/modctl.h>
  52 #include <sys/policy.h>
  53 #include <sys/sdt.h>
  54 #include <sys/cpupart.h>
  55 #include <sys/cpucaps.h>
  56 
  57 static pri_t fx_init(id_t, int, classfuncs_t **);
  58 
  59 static struct sclass csw = {
  60         "FX",
  61         fx_init,
  62         0
  63 };
  64 
  65 static struct modlsched modlsched = {
  66         &mod_schedops, "Fixed priority sched class", &csw
  67 };
  68 
  69 static struct modlinkage modlinkage = {
  70         MODREV_1, (void *)&modlsched, NULL
  71 };
  72 
  73 
  74 /*
  75  * control flags (kparms->fx_cflags).
  76  */
  77 #define FX_DOUPRILIM    0x01    /* change user priority limit */
  78 #define FX_DOUPRI       0x02    /* change user priority */
  79 #define FX_DOTQ         0x04    /* change FX time quantum */
  80 
  81 
  82 #define FXMAXUPRI 60            /* maximum user priority setting */
  83 
#define FX_MAX_UNPRIV_PRI       0       /* maximum unprivileged priority */
  85 
/*
 * The fxproc_t structures that have a registered callback vector
 * are also kept in an array of circular doubly linked lists. A hash on
 * the thread id (from ddi_get_kt_did()) is used to determine the list
 * on which each such fxproc structure should be placed. Each list has
 * a dummy "head" which is never removed, so the list is never empty.
 */
  93 
  94 #define FX_CB_LISTS 16          /* number of lists, must be power of 2 */
  95 #define FX_CB_LIST_HASH(ktid)   ((uint_t)ktid & (FX_CB_LISTS - 1))
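/*
 * For example, a thread id of 0x1234 hashes to list 0x1234 & 0xf == 4;
 * only the low-order bits of the id select the list, which is why
 * FX_CB_LISTS must be a power of 2.
 */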
  96 
  97 /* Insert fxproc into callback list */
  98 #define FX_CB_LIST_INSERT(fxpp)                                         \
  99 {                                                                       \
 100         int index = FX_CB_LIST_HASH(fxpp->fx_ktid);                  \
 101         kmutex_t *lockp = &fx_cb_list_lock[index];                  \
 102         fxproc_t *headp = &fx_cb_plisthead[index];                  \
 103         mutex_enter(lockp);                                             \
 104         fxpp->fx_cb_next = headp->fx_cb_next;                             \
 105         fxpp->fx_cb_prev = headp;                                    \
 106         headp->fx_cb_next->fx_cb_prev = fxpp;                             \
 107         headp->fx_cb_next = fxpp;                                    \
 108         mutex_exit(lockp);                                              \
 109 }
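/*
 * New entries are linked in immediately after the dummy list head while
 * the per-list lock is held; because the head is never removed, neither
 * insertion nor deletion needs an empty-list special case.
 */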
 110 
 111 /*
 112  * Remove thread from callback list.
 113  */
 114 #define FX_CB_LIST_DELETE(fxpp)                                         \
 115 {                                                                       \
 116         int index = FX_CB_LIST_HASH(fxpp->fx_ktid);                  \
 117         kmutex_t *lockp = &fx_cb_list_lock[index];                  \
 118         mutex_enter(lockp);                                             \
 119         fxpp->fx_cb_prev->fx_cb_next = fxpp->fx_cb_next;               \
 120         fxpp->fx_cb_next->fx_cb_prev = fxpp->fx_cb_prev;               \
 121         mutex_exit(lockp);                                              \
 122 }
 123 
 124 #define FX_HAS_CB(fxpp) (fxpp->fx_callback != NULL)
 125 
/* Clamp pri to the range 0 .. fx_maxumdpri. */
 127 
 128 #define FX_ADJUST_PRI(pri)                                              \
 129 {                                                                       \
 130         if (pri < 0)                                                 \
 131                 pri = 0;                                                \
 132         else if (pri > fx_maxumdpri)                                         \
 133                 pri = fx_maxumdpri;                                     \
 134 }
 135 
 136 #define FX_ADJUST_QUANTUM(q)                                            \
 137 {                                                                       \
 138         if (q > INT_MAX)                                             \
 139                 q = INT_MAX;                                            \
 140         else if (q <= 0)                                             \
 141                 q = FX_TQINF;                                           \
 142 }
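/*
 * A non-positive quantum handed back by a callback is treated as an
 * infinite time slice (FX_TQINF); values above INT_MAX are clamped
 * because the quantum is stored in an int.
 */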
 143 
 144 #define FX_ISVALID(pri, quantum) \
 145         (((pri >= 0) || (pri == FX_CB_NOCHANGE)) &&                  \
 146             ((quantum >= 0) || (quantum == FX_NOCHANGE) ||           \
 147                 (quantum == FX_TQDEF) || (quantum == FX_TQINF)))
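/*
 * FX_ISVALID accepts a non-negative value or one of the special tokens:
 * FX_CB_NOCHANGE for the priority, and FX_NOCHANGE, FX_TQDEF or FX_TQINF
 * for the quantum.
 */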
 148 
 149 
 150 static id_t     fx_cid;         /* fixed priority class ID */
 151 static fxdpent_t *fx_dptbl;     /* fixed priority disp parameter table */
 152 
 153 static pri_t    fx_maxupri = FXMAXUPRI;
 154 static pri_t    fx_maxumdpri;   /* max user mode fixed priority */
 155 
 156 static pri_t    fx_maxglobpri;  /* maximum global priority used by fx class */
 157 static kmutex_t fx_dptblock;    /* protects fixed priority dispatch table */
 158 
 159 
 160 static kmutex_t fx_cb_list_lock[FX_CB_LISTS];   /* protects list of fxprocs */
 161                                                 /* that have callbacks */
 162 static fxproc_t fx_cb_plisthead[FX_CB_LISTS];   /* dummy fxproc at head of */
 163                                                 /* list of fxprocs with */
 164                                                 /* callbacks */
 165 
 166 static int      fx_admin(caddr_t, cred_t *);
 167 static int      fx_getclinfo(void *);
 168 static int      fx_parmsin(void *);
 169 static int      fx_parmsout(void *, pc_vaparms_t *);
 170 static int      fx_vaparmsin(void *, pc_vaparms_t *);
 171 static int      fx_vaparmsout(void *, pc_vaparms_t *);
 172 static int      fx_getclpri(pcpri_t *);
 173 static int      fx_alloc(void **, int);
 174 static void     fx_free(void *);
 175 static int      fx_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
 176 static void     fx_exitclass(void *);
 177 static int      fx_canexit(kthread_t *, cred_t *);
 178 static int      fx_fork(kthread_t *, kthread_t *, void *);
 179 static void     fx_forkret(kthread_t *, kthread_t *);
 180 static void     fx_parmsget(kthread_t *, void *);
 181 static int      fx_parmsset(kthread_t *, void *, id_t, cred_t *);
 182 static void     fx_stop(kthread_t *, int, int);
 183 static void     fx_exit(kthread_t *);
 184 static void     fx_trapret(kthread_t *);
 185 static void     fx_preempt(kthread_t *);
 186 static void     fx_setrun(kthread_t *);
 187 static void     fx_sleep(kthread_t *);
 188 static void     fx_tick(kthread_t *);
 189 static void     fx_wakeup(kthread_t *);
 190 static int      fx_donice(kthread_t *, cred_t *, int, int *);
 191 static int      fx_doprio(kthread_t *, cred_t *, int, int *);
 192 static pri_t    fx_globpri(kthread_t *);
 193 static void     fx_yield(kthread_t *);
 194 static void     fx_nullsys();
 195 
 196 extern fxdpent_t *fx_getdptbl(void);
 197 
 198 static void     fx_change_priority(kthread_t *, fxproc_t *);
 199 static fxproc_t *fx_list_lookup(kt_did_t);
 200 static void fx_list_release(fxproc_t *);
 201 
 202 
 203 static struct classfuncs fx_classfuncs = {
 204         /* class functions */
 205         fx_admin,
 206         fx_getclinfo,
 207         fx_parmsin,
 208         fx_parmsout,
 209         fx_vaparmsin,
 210         fx_vaparmsout,
 211         fx_getclpri,
 212         fx_alloc,
 213         fx_free,
 214 
 215         /* thread functions */
 216         fx_enterclass,
 217         fx_exitclass,
 218         fx_canexit,
 219         fx_fork,
 220         fx_forkret,
 221         fx_parmsget,
 222         fx_parmsset,
 223         fx_stop,
 224         fx_exit,
 225         fx_nullsys,     /* active */
 226         fx_nullsys,     /* inactive */
 227         fx_trapret,
 228         fx_preempt,
 229         fx_setrun,
 230         fx_sleep,
 231         fx_tick,
 232         fx_wakeup,
 233         fx_donice,
 234         fx_globpri,
 235         fx_nullsys,     /* set_process_group */
 236         fx_yield,
 237         fx_doprio,
 238 };
 239 
 240 
 241 int
 242 _init()
 243 {
 244         return (mod_install(&modlinkage));
 245 }
 246 
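/*
 * Scheduling-class modules are not unloadable once installed, so
 * _fini() always fails with EBUSY.
 */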
 247 int
 248 _fini()
 249 {
 250         return (EBUSY);
 251 }
 252 
 253 int
 254 _info(struct modinfo *modinfop)
 255 {
 256         return (mod_info(&modlinkage, modinfop));
 257 }
 258 
 259 /*
 260  * Fixed priority class initialization. Called by dispinit() at boot time.
 261  * We can ignore the clparmsz argument since we know that the smallest
 262  * possible parameter buffer is big enough for us.
 263  */
 264 /* ARGSUSED */
 265 static pri_t
 266 fx_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
 267 {
 268         int i;
 269         extern pri_t fx_getmaxumdpri(void);
 270 
 271         fx_dptbl = fx_getdptbl();
 272         fx_maxumdpri = fx_getmaxumdpri();
 273         fx_maxglobpri = fx_dptbl[fx_maxumdpri].fx_globpri;
 274 
 275         fx_cid = cid;           /* Record our class ID */
 276 
 277         /*
 278          * Initialize the hash table for fxprocs with callbacks
 279          */
 280         for (i = 0; i < FX_CB_LISTS; i++) {
 281                 fx_cb_plisthead[i].fx_cb_next = fx_cb_plisthead[i].fx_cb_prev =
 282                     &fx_cb_plisthead[i];
 283         }
 284 
 285         /*
 286          * We're required to return a pointer to our classfuncs
 287          * structure and the highest global priority value we use.
 288          */
 289         *clfuncspp = &fx_classfuncs;
 290         return (fx_maxglobpri);
 291 }
 292 
 293 /*
 294  * Get or reset the fx_dptbl values per the user's request.
 295  */
 296 static int
 297 fx_admin(caddr_t uaddr, cred_t *reqpcredp)
 298 {
 299         fxadmin_t       fxadmin;
 300         fxdpent_t       *tmpdpp;
 301         int             userdpsz;
 302         int             i;
 303         size_t          fxdpsz;
 304 
 305         if (get_udatamodel() == DATAMODEL_NATIVE) {
 306                 if (copyin(uaddr, &fxadmin, sizeof (fxadmin_t)))
 307                         return (EFAULT);
 308         }
 309 #ifdef _SYSCALL32_IMPL
 310         else {
 311                 /* get fxadmin struct from ILP32 caller */
 312                 fxadmin32_t fxadmin32;
 313                 if (copyin(uaddr, &fxadmin32, sizeof (fxadmin32_t)))
 314                         return (EFAULT);
 315                 fxadmin.fx_dpents =
 316                     (struct fxdpent *)(uintptr_t)fxadmin32.fx_dpents;
 317                 fxadmin.fx_ndpents = fxadmin32.fx_ndpents;
 318                 fxadmin.fx_cmd = fxadmin32.fx_cmd;
 319         }
 320 #endif /* _SYSCALL32_IMPL */
 321 
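        /* One fxdpent_t per user priority, 0 through fx_maxumdpri. */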
 322         fxdpsz = (fx_maxumdpri + 1) * sizeof (fxdpent_t);
 323 
 324         switch (fxadmin.fx_cmd) {
 325         case FX_GETDPSIZE:
 326                 fxadmin.fx_ndpents = fx_maxumdpri + 1;
 327 
 328                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 329                         if (copyout(&fxadmin, uaddr, sizeof (fxadmin_t)))
 330                                 return (EFAULT);
 331                 }
 332 #ifdef _SYSCALL32_IMPL
 333                 else {
 334                         /* return fxadmin struct to ILP32 caller */
 335                         fxadmin32_t fxadmin32;
 336                         fxadmin32.fx_dpents =
 337                             (caddr32_t)(uintptr_t)fxadmin.fx_dpents;
 338                         fxadmin32.fx_ndpents = fxadmin.fx_ndpents;
 339                         fxadmin32.fx_cmd = fxadmin.fx_cmd;
 340                         if (copyout(&fxadmin32, uaddr, sizeof (fxadmin32_t)))
 341                                 return (EFAULT);
 342                 }
 343 #endif /* _SYSCALL32_IMPL */
 344                 break;
 345 
 346         case FX_GETDPTBL:
 347                 userdpsz = MIN(fxadmin.fx_ndpents * sizeof (fxdpent_t),
 348                     fxdpsz);
 349                 if (copyout(fx_dptbl, fxadmin.fx_dpents, userdpsz))
 350                         return (EFAULT);
 351 
 352                 fxadmin.fx_ndpents = userdpsz / sizeof (fxdpent_t);
 353 
 354                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 355                         if (copyout(&fxadmin, uaddr, sizeof (fxadmin_t)))
 356                                 return (EFAULT);
 357                 }
 358 #ifdef _SYSCALL32_IMPL
 359                 else {
 360                         /* return fxadmin struct to ILP32 callers */
 361                         fxadmin32_t fxadmin32;
 362                         fxadmin32.fx_dpents =
 363                             (caddr32_t)(uintptr_t)fxadmin.fx_dpents;
 364                         fxadmin32.fx_ndpents = fxadmin.fx_ndpents;
 365                         fxadmin32.fx_cmd = fxadmin.fx_cmd;
 366                         if (copyout(&fxadmin32, uaddr, sizeof (fxadmin32_t)))
 367                                 return (EFAULT);
 368                 }
 369 #endif /* _SYSCALL32_IMPL */
 370                 break;
 371 
 372         case FX_SETDPTBL:
 373                 /*
 374                  * We require that the requesting process has sufficient
 375                  * privileges. We also require that the table supplied by
 376                  * the user exactly match the current fx_dptbl in size.
 377                  */
 378                 if (secpolicy_dispadm(reqpcredp) != 0) {
 379                         return (EPERM);
 380                 }
 381                 if (fxadmin.fx_ndpents * sizeof (fxdpent_t) != fxdpsz) {
 382                         return (EINVAL);
 383                 }
 384 
 385                 /*
 386                  * We read the user supplied table into a temporary buffer
 387                  * where it is validated before being copied over the
 388                  * fx_dptbl.
 389                  */
 390                 tmpdpp = kmem_alloc(fxdpsz, KM_SLEEP);
 391                 if (copyin(fxadmin.fx_dpents, tmpdpp, fxdpsz)) {
 392                         kmem_free(tmpdpp, fxdpsz);
 393                         return (EFAULT);
 394                 }
 395                 for (i = 0; i < fxadmin.fx_ndpents; i++) {
 396 
 397                         /*
 398                          * Validate the user supplied values. All we are doing
 399                          * here is verifying that the values are within their
 400                          * allowable ranges and will not panic the system. We
 401                          * make no attempt to ensure that the resulting
 402                          * configuration makes sense or results in reasonable
 403                          * performance.
 404                          */
 405                         if (tmpdpp[i].fx_quantum <= 0 &&
 406                             tmpdpp[i].fx_quantum != FX_TQINF) {
 407                                 kmem_free(tmpdpp, fxdpsz);
 408                                 return (EINVAL);
 409                         }
 410                 }
 411 
 412                 /*
 413                  * Copy the user supplied values over the current fx_dptbl
 414                  * values. The fx_globpri member is read-only so we don't
 415                  * overwrite it.
 416                  */
 417                 mutex_enter(&fx_dptblock);
 418                 for (i = 0; i < fxadmin.fx_ndpents; i++) {
 419                         fx_dptbl[i].fx_quantum = tmpdpp[i].fx_quantum;
 420                 }
 421                 mutex_exit(&fx_dptblock);
 422                 kmem_free(tmpdpp, fxdpsz);
 423                 break;
 424 
 425         default:
 426                 return (EINVAL);
 427         }
 428         return (0);
 429 }
 430 
/*
 * Initialize the fixed-priority class-specific structure (preallocated
 * and passed in via bufp) with the parameters supplied, and move the
 * thread to the resulting priority.
 */
 436 static int
 437 fx_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
 438     void *bufp)
 439 {
 440         fxkparms_t      *fxkparmsp = (fxkparms_t *)parmsp;
 441         fxproc_t        *fxpp;
 442         pri_t           reqfxupri;
 443         pri_t           reqfxuprilim;
 444 
 445         fxpp = (fxproc_t *)bufp;
 446         ASSERT(fxpp != NULL);
 447 
 448         /*
 449          * Initialize the fxproc structure.
 450          */
 451         fxpp->fx_flags = 0;
 452         fxpp->fx_callback = NULL;
 453         fxpp->fx_cookie = NULL;
 454 
 455         if (fxkparmsp == NULL) {
 456                 /*
 457                  * Use default values.
 458                  */
 459                 fxpp->fx_pri = fxpp->fx_uprilim = 0;
 460                 fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
 461                 fxpp->fx_nice =  NZERO;
 462         } else {
 463                 /*
 464                  * Use supplied values.
 465                  */
 466 
 467                 if ((fxkparmsp->fx_cflags & FX_DOUPRILIM) == 0) {
 468                         reqfxuprilim = 0;
 469                 } else {
 470                         if (fxkparmsp->fx_uprilim > FX_MAX_UNPRIV_PRI &&
 471                             secpolicy_setpriority(reqpcredp) != 0)
 472                                 return (EPERM);
 473                         reqfxuprilim = fxkparmsp->fx_uprilim;
 474                         FX_ADJUST_PRI(reqfxuprilim);
 475                 }
 476 
 477                 if ((fxkparmsp->fx_cflags & FX_DOUPRI) == 0) {
 478                         reqfxupri = reqfxuprilim;
 479                 } else {
 480                         if (fxkparmsp->fx_upri > FX_MAX_UNPRIV_PRI &&
 481                             secpolicy_setpriority(reqpcredp) != 0)
 482                                 return (EPERM);
 483                         /*
 484                          * Set the user priority to the requested value
 485                          * or the upri limit, whichever is lower.
 486                          */
 487                         reqfxupri = fxkparmsp->fx_upri;
 488                         FX_ADJUST_PRI(reqfxupri);
 489 
 490                         if (reqfxupri > reqfxuprilim)
 491                                 reqfxupri = reqfxuprilim;
 492                 }
 493 
 494 
 495                 fxpp->fx_uprilim = reqfxuprilim;
 496                 fxpp->fx_pri = reqfxupri;
 497 
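                /*
                 * Map the user priority onto the nice range: e.g. with the
                 * default FXMAXUPRI of 60 and a NZERO of 20, user priority 0
                 * maps to nice 20 and user priority 60 maps to nice 0.
                 */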
 498                 fxpp->fx_nice = NZERO - (NZERO * reqfxupri) / fx_maxupri;
 499 
 500                 if (((fxkparmsp->fx_cflags & FX_DOTQ) == 0) ||
 501                     (fxkparmsp->fx_tqntm == FX_TQDEF)) {
 502                         fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
 503                 } else {
 504                         if (secpolicy_setpriority(reqpcredp) != 0)
 505                                 return (EPERM);
 506 
 507                         if (fxkparmsp->fx_tqntm == FX_TQINF)
 508                                 fxpp->fx_pquantum = FX_TQINF;
 509                         else {
 510                                 fxpp->fx_pquantum = fxkparmsp->fx_tqntm;
 511                         }
 512                 }
 513 
 514         }
 515 
 516         fxpp->fx_timeleft = fxpp->fx_pquantum;
 517         cpucaps_sc_init(&fxpp->fx_caps);
 518         fxpp->fx_tp = t;
 519 
 520         thread_lock(t);                 /* get dispatcher lock on thread */
 521         t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
 522         t->t_cid = cid;
 523         t->t_cldata = (void *)fxpp;
 524         t->t_schedflag &= ~TS_RUNQMATCH;
 525         fx_change_priority(t, fxpp);
 526         thread_unlock(t);
 527 
 528         return (0);
 529 }
 530 
 531 /*
 532  * The thread is exiting.
 533  */
 534 static void
 535 fx_exit(kthread_t *t)
 536 {
 537         fxproc_t *fxpp;
 538 
 539         thread_lock(t);
 540         fxpp = (fxproc_t *)(t->t_cldata);
 541 
 542         /*
 543          * A thread could be exiting in between clock ticks, so we need to
 544          * calculate how much CPU time it used since it was charged last time.
 545          *
 546          * CPU caps are not enforced on exiting processes - it is usually
 547          * desirable to exit as soon as possible to free resources.
 548          */
 549         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
 550 
 551         if (FX_HAS_CB(fxpp)) {
 552                 FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie);
 553                 fxpp->fx_callback = NULL;
 554                 fxpp->fx_cookie = NULL;
 555                 thread_unlock(t);
 556                 FX_CB_LIST_DELETE(fxpp);
 557                 return;
 558         }
 559 
 560         thread_unlock(t);
 561 }
 562 
/*
 * The thread is leaving the class; free its fxproc structure.
 */
 566 static void
 567 fx_exitclass(void *procp)
 568 {
 569         fxproc_t *fxpp = (fxproc_t *)procp;
 570 
 571         thread_lock(fxpp->fx_tp);
 572         if (FX_HAS_CB(fxpp)) {
 573 
 574                 FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie);
 575 
 576                 fxpp->fx_callback = NULL;
 577                 fxpp->fx_cookie = NULL;
 578                 thread_unlock(fxpp->fx_tp);
 579                 FX_CB_LIST_DELETE(fxpp);
 580         } else
 581                 thread_unlock(fxpp->fx_tp);
 582 
 583         kmem_free(fxpp, sizeof (fxproc_t));
 584 }
 585 
 586 /* ARGSUSED */
 587 static int
 588 fx_canexit(kthread_t *t, cred_t *cred)
 589 {
 590         /*
 591          * A thread can always leave the FX class
 592          */
 593         return (0);
 594 }
 595 
 596 /*
 * Initialize the fixed-priority class-specific structure for a child.
 * Callbacks are not inherited across fork.
 599  */
 600 static int
 601 fx_fork(kthread_t *t, kthread_t *ct, void *bufp)
 602 {
 603         fxproc_t        *pfxpp;         /* ptr to parent's fxproc structure */
 604         fxproc_t        *cfxpp;         /* ptr to child's fxproc structure */
 605 
 606         ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
 607 
 608         cfxpp = (fxproc_t *)bufp;
 609         ASSERT(cfxpp != NULL);
 610         thread_lock(t);
 611         pfxpp = (fxproc_t *)t->t_cldata;
 612         /*
 613          * Initialize child's fxproc structure.
 614          */
 615         cfxpp->fx_timeleft = cfxpp->fx_pquantum = pfxpp->fx_pquantum;
 616         cfxpp->fx_pri = pfxpp->fx_pri;
 617         cfxpp->fx_uprilim = pfxpp->fx_uprilim;
 618         cfxpp->fx_nice = pfxpp->fx_nice;
 619         cfxpp->fx_callback = NULL;
 620         cfxpp->fx_cookie = NULL;
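        /* A pending back-of-queue request (FXBACKQ) is not inherited. */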
 621         cfxpp->fx_flags = pfxpp->fx_flags & ~(FXBACKQ);
 622         cpucaps_sc_init(&cfxpp->fx_caps);
 623 
 624         cfxpp->fx_tp = ct;
 625         ct->t_cldata = (void *)cfxpp;
 626         thread_unlock(t);
 627 
        /*
         * Callbacks are not inherited, so the child is not linked onto
         * any callback list.
         */
 631         return (0);
 632 }
 633 
 634 
/*
 * The child is placed at the back of the dispatcher queue and the parent
 * gives up the processor so that the child runs first after the fork.
 * This allows a child that execs immediately to break the sharing of
 * copy-on-write pages that have no disk home; the parent can then steal
 * them back rather than uselessly copying them.
 */
 642 static void
 643 fx_forkret(kthread_t *t, kthread_t *ct)
 644 {
 645         proc_t  *pp = ttoproc(t);
 646         proc_t  *cp = ttoproc(ct);
 647         fxproc_t *fxpp;
 648 
 649         ASSERT(t == curthread);
 650         ASSERT(MUTEX_HELD(&pidlock));
 651 
 652         /*
 653          * Grab the child's p_lock before dropping pidlock to ensure
 654          * the process does not disappear before we set it running.
 655          */
 656         mutex_enter(&cp->p_lock);
 657         continuelwps(cp);
 658         mutex_exit(&cp->p_lock);
 659 
 660         mutex_enter(&pp->p_lock);
 661         mutex_exit(&pidlock);
 662         continuelwps(pp);
 663 
 664         thread_lock(t);
 665         fxpp = (fxproc_t *)(t->t_cldata);
 666         t->t_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
 667         ASSERT(t->t_pri >= 0 && t->t_pri <= fx_maxglobpri);
 668         THREAD_TRANSITION(t);
 669         fx_setrun(t);
 670         thread_unlock(t);
 671         /*
 672          * Safe to drop p_lock now since it is safe to change
 673          * the scheduling class after this point.
 674          */
 675         mutex_exit(&pp->p_lock);
 676 
 677         swtch();
 678 }
 679 
 680 
 681 /*
 682  * Get information about the fixed-priority class into the buffer
 683  * pointed to by fxinfop. The maximum configured user priority
 684  * is the only information we supply.
 685  */
 686 static int
 687 fx_getclinfo(void *infop)
 688 {
 689         fxinfo_t *fxinfop = (fxinfo_t *)infop;
 690         fxinfop->fx_maxupri = fx_maxupri;
 691         return (0);
 692 }
 693 
 694 
 695 
 696 /*
 697  * Return the user mode scheduling priority range.
 698  */
 699 static int
 700 fx_getclpri(pcpri_t *pcprip)
 701 {
 702         pcprip->pc_clpmax = fx_maxupri;
 703         pcprip->pc_clpmin = 0;
 704         return (0);
 705 }
 706 
 707 
 708 static void
 709 fx_nullsys()
 710 {}
 711 
 712 
/*
 * Get the fixed-priority parameters of the thread t into the
 * buffer pointed to by parmsp.
 */
 717 static void
 718 fx_parmsget(kthread_t *t, void *parmsp)
 719 {
 720         fxproc_t *fxpp = (fxproc_t *)t->t_cldata;
 721         fxkparms_t *fxkparmsp = (fxkparms_t *)parmsp;
 722 
 723         fxkparmsp->fx_upri = fxpp->fx_pri;
 724         fxkparmsp->fx_uprilim = fxpp->fx_uprilim;
 725         fxkparmsp->fx_tqntm = fxpp->fx_pquantum;
 726 }
 727 
 728 
 729 
 730 /*
 731  * Check the validity of the fixed-priority parameters in the buffer
 732  * pointed to by fxparmsp.
 733  */
 734 static int
 735 fx_parmsin(void *parmsp)
 736 {
 737         fxparms_t       *fxparmsp = (fxparms_t *)parmsp;
 738         uint_t          cflags;
 739         longlong_t      ticks;
 740         /*
 741          * Check validity of parameters.
 742          */
 743 
 744         if ((fxparmsp->fx_uprilim > fx_maxupri ||
 745             fxparmsp->fx_uprilim < 0) &&
 746             fxparmsp->fx_uprilim != FX_NOCHANGE)
 747                 return (EINVAL);
 748 
 749         if ((fxparmsp->fx_upri > fx_maxupri ||
 750             fxparmsp->fx_upri < 0) &&
 751             fxparmsp->fx_upri != FX_NOCHANGE)
 752                 return (EINVAL);
 753 
 754         if ((fxparmsp->fx_tqsecs == 0 && fxparmsp->fx_tqnsecs == 0) ||
 755             fxparmsp->fx_tqnsecs >= NANOSEC)
 756                 return (EINVAL);
 757 
 758         cflags = (fxparmsp->fx_upri != FX_NOCHANGE ? FX_DOUPRI : 0);
 759 
 760         if (fxparmsp->fx_uprilim != FX_NOCHANGE) {
 761                 cflags |= FX_DOUPRILIM;
 762         }
 763 
 764         if (fxparmsp->fx_tqnsecs != FX_NOCHANGE)
 765                 cflags |= FX_DOTQ;
 766 
 767         /*
 768          * convert the buffer to kernel format.
 769          */
 770 
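        /*
         * For example, assuming a 100 Hz clock, fx_tqsecs == 1 and
         * fx_tqnsecs == 10000000 (10 ms) convert to 100 + 1 == 101 ticks;
         * NSEC_TO_TICK_ROUNDUP rounds any partial tick up to a whole tick.
         */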
 771         if (fxparmsp->fx_tqnsecs >= 0) {
 772                 if ((ticks = SEC_TO_TICK((longlong_t)fxparmsp->fx_tqsecs) +
 773                     NSEC_TO_TICK_ROUNDUP(fxparmsp->fx_tqnsecs)) > INT_MAX)
 774                         return (ERANGE);
 775 
 776                 ((fxkparms_t *)fxparmsp)->fx_tqntm = (int)ticks;
 777         } else {
 778                 if ((fxparmsp->fx_tqnsecs != FX_NOCHANGE) &&
 779                     (fxparmsp->fx_tqnsecs != FX_TQINF) &&
 780                     (fxparmsp->fx_tqnsecs != FX_TQDEF))
 781                         return (EINVAL);
 782                 ((fxkparms_t *)fxparmsp)->fx_tqntm = fxparmsp->fx_tqnsecs;
 783         }
 784 
 785         ((fxkparms_t *)fxparmsp)->fx_cflags = cflags;
 786 
 787         return (0);
 788 }
 789 
 790 
 791 /*
 792  * Check the validity of the fixed-priority parameters in the pc_vaparms_t
 793  * structure vaparmsp and put them in the buffer pointed to by fxprmsp.
 * pc_vaparms_t contains (key, value) parameter pairs.
 795  */
 796 static int
 797 fx_vaparmsin(void *prmsp, pc_vaparms_t *vaparmsp)
 798 {
 799         uint_t          secs = 0;
 800         uint_t          cnt;
 801         int             nsecs = 0;
 802         int             priflag, secflag, nsecflag, limflag;
 803         longlong_t      ticks;
 804         fxkparms_t      *fxprmsp = (fxkparms_t *)prmsp;
 805         pc_vaparm_t     *vpp = &vaparmsp->pc_parms[0];
 806 
 807 
 808         /*
 809          * First check the validity of parameters and convert them
 810          * from the user supplied format to the internal format.
 811          */
 812         priflag = secflag = nsecflag = limflag = 0;
 813 
 814         fxprmsp->fx_cflags = 0;
 815 
 816         if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
 817                 return (EINVAL);
 818 
 819         for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
 820 
 821                 switch (vpp->pc_key) {
 822                 case FX_KY_UPRILIM:
 823                         if (limflag++)
 824                                 return (EINVAL);
 825                         fxprmsp->fx_cflags |= FX_DOUPRILIM;
 826                         fxprmsp->fx_uprilim = (pri_t)vpp->pc_parm;
 827                         if (fxprmsp->fx_uprilim > fx_maxupri ||
 828                             fxprmsp->fx_uprilim < 0)
 829                                 return (EINVAL);
 830                         break;
 831 
 832                 case FX_KY_UPRI:
 833                         if (priflag++)
 834                                 return (EINVAL);
 835                         fxprmsp->fx_cflags |= FX_DOUPRI;
 836                         fxprmsp->fx_upri = (pri_t)vpp->pc_parm;
 837                         if (fxprmsp->fx_upri > fx_maxupri ||
 838                             fxprmsp->fx_upri < 0)
 839                                 return (EINVAL);
 840                         break;
 841 
 842                 case FX_KY_TQSECS:
 843                         if (secflag++)
 844                                 return (EINVAL);
 845                         fxprmsp->fx_cflags |= FX_DOTQ;
 846                         secs = (uint_t)vpp->pc_parm;
 847                         break;
 848 
 849                 case FX_KY_TQNSECS:
 850                         if (nsecflag++)
 851                                 return (EINVAL);
 852                         fxprmsp->fx_cflags |= FX_DOTQ;
 853                         nsecs = (int)vpp->pc_parm;
 854                         break;
 855 
 856                 default:
 857                         return (EINVAL);
 858                 }
 859         }
 860 
 861         if (vaparmsp->pc_vaparmscnt == 0) {
 862                 /*
 863                  * Use default parameters.
 864                  */
 865                 fxprmsp->fx_upri = 0;
 866                 fxprmsp->fx_uprilim = 0;
 867                 fxprmsp->fx_tqntm = FX_TQDEF;
 868                 fxprmsp->fx_cflags = FX_DOUPRI | FX_DOUPRILIM | FX_DOTQ;
 869         } else if ((fxprmsp->fx_cflags & FX_DOTQ) != 0) {
 870                 if ((secs == 0 && nsecs == 0) || nsecs >= NANOSEC)
 871                         return (EINVAL);
 872 
 873                 if (nsecs >= 0) {
 874                         if ((ticks = SEC_TO_TICK((longlong_t)secs) +
 875                             NSEC_TO_TICK_ROUNDUP(nsecs)) > INT_MAX)
 876                                 return (ERANGE);
 877 
 878                         fxprmsp->fx_tqntm = (int)ticks;
 879                 } else {
 880                         if (nsecs != FX_TQINF && nsecs != FX_TQDEF)
 881                                 return (EINVAL);
 882                         fxprmsp->fx_tqntm = nsecs;
 883                 }
 884         }
 885 
 886         return (0);
 887 }
 888 
 889 
/*
 * Convert the kernel parameter format back to the external fxparms_t
 * layout before it is copied out to the user. Nothing to do for the
 * varargs (pc_vaparms_t) interface.
 */
 893 /* ARGSUSED */
 894 static int
 895 fx_parmsout(void *parmsp, pc_vaparms_t *vaparmsp)
 896 {
        fxkparms_t      *fxkprmsp = (fxkparms_t *)parmsp;
 898 
 899         if (vaparmsp != NULL)
 900                 return (0);
 901 
 902         if (fxkprmsp->fx_tqntm < 0) {
 903                 /*
 904                  * Quantum field set to special value (e.g. FX_TQINF)
 905                  */
 906                 ((fxparms_t *)fxkprmsp)->fx_tqnsecs = fxkprmsp->fx_tqntm;
 907                 ((fxparms_t *)fxkprmsp)->fx_tqsecs = 0;
 908 
 909         } else {
 910                 /* Convert quantum from ticks to seconds-nanoseconds */
 911 
 912                 timestruc_t ts;
 913                 TICK_TO_TIMESTRUC(fxkprmsp->fx_tqntm, &ts);
 914                 ((fxparms_t *)fxkprmsp)->fx_tqsecs = ts.tv_sec;
 915                 ((fxparms_t *)fxkprmsp)->fx_tqnsecs = ts.tv_nsec;
 916         }
 917 
 918         return (0);
 919 }
 920 
 921 
 922 /*
 923  * Copy all selected fixed-priority class parameters to the user.
 924  * The parameters are specified by a key.
 925  */
 926 static int
 927 fx_vaparmsout(void *prmsp, pc_vaparms_t *vaparmsp)
 928 {
 929         fxkparms_t      *fxkprmsp = (fxkparms_t *)prmsp;
 930         timestruc_t     ts;
 931         uint_t          cnt;
 932         uint_t          secs;
 933         int             nsecs;
 934         int             priflag, secflag, nsecflag, limflag;
 935         pc_vaparm_t     *vpp = &vaparmsp->pc_parms[0];
 936 
 937         ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
 938 
 939         priflag = secflag = nsecflag = limflag = 0;
 940 
 941         if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
 942                 return (EINVAL);
 943 
 944         if (fxkprmsp->fx_tqntm < 0) {
 945                 /*
 946                  * Quantum field set to special value (e.g. FX_TQINF).
 947                  */
 948                 secs = 0;
 949                 nsecs = fxkprmsp->fx_tqntm;
 950         } else {
 951                 /*
 952                  * Convert quantum from ticks to seconds-nanoseconds.
 953                  */
 954                 TICK_TO_TIMESTRUC(fxkprmsp->fx_tqntm, &ts);
 955                 secs = ts.tv_sec;
 956                 nsecs = ts.tv_nsec;
 957         }
 958 
 959 
 960         for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
 961 
 962                 switch (vpp->pc_key) {
 963                 case FX_KY_UPRILIM:
 964                         if (limflag++)
 965                                 return (EINVAL);
 966                         if (copyout(&fxkprmsp->fx_uprilim,
 967                             (void *)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
 968                                 return (EFAULT);
 969                         break;
 970 
 971                 case FX_KY_UPRI:
 972                         if (priflag++)
 973                                 return (EINVAL);
 974                         if (copyout(&fxkprmsp->fx_upri,
 975                             (void *)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
 976                                 return (EFAULT);
 977                         break;
 978 
 979                 case FX_KY_TQSECS:
 980                         if (secflag++)
 981                                 return (EINVAL);
 982                         if (copyout(&secs,
 983                             (void *)(uintptr_t)vpp->pc_parm, sizeof (uint_t)))
 984                                 return (EFAULT);
 985                         break;
 986 
 987                 case FX_KY_TQNSECS:
 988                         if (nsecflag++)
 989                                 return (EINVAL);
 990                         if (copyout(&nsecs,
 991                             (void *)(uintptr_t)vpp->pc_parm, sizeof (int)))
 992                                 return (EFAULT);
 993                         break;
 994 
 995                 default:
 996                         return (EINVAL);
 997                 }
 998         }
 999 
1000         return (0);
1001 }
1002 
/*
 * Set the scheduling parameters of the thread tx to those specified
 * in the buffer pointed to by parmsp.
 */
1007 /* ARGSUSED */
1008 static int
1009 fx_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
1010 {
1011         char            nice;
1012         pri_t           reqfxuprilim;
1013         pri_t           reqfxupri;
1014         fxkparms_t      *fxkparmsp = (fxkparms_t *)parmsp;
1015         fxproc_t        *fxpp;
1016 
1017 
1018         ASSERT(MUTEX_HELD(&(ttoproc(tx))->p_lock));
1019 
1020         thread_lock(tx);
1021         fxpp = (fxproc_t *)tx->t_cldata;
1022 
1023         if ((fxkparmsp->fx_cflags & FX_DOUPRILIM) == 0)
1024                 reqfxuprilim = fxpp->fx_uprilim;
1025         else
1026                 reqfxuprilim = fxkparmsp->fx_uprilim;
1027 
1028         /*
1029          * Basic permissions enforced by generic kernel code
1030          * for all classes require that a thread attempting
1031          * to change the scheduling parameters of a target
1032          * thread be privileged or have a real or effective
1033          * UID matching that of the target thread. We are not
1034          * called unless these basic permission checks have
1035          * already passed. The fixed priority class requires in
1036          * addition that the calling thread be privileged if it
 * is attempting to raise the priority above its current
 * value. This may have been checked previously, but if our
1039          * caller passed us a non-NULL credential pointer we assume
1040          * it hasn't and we check it here.
1041          */
1042 
1043         if ((reqpcredp != NULL) &&
1044             (reqfxuprilim > fxpp->fx_uprilim ||
1045             ((fxkparmsp->fx_cflags & FX_DOTQ) != 0)) &&
1046             secpolicy_raisepriority(reqpcredp) != 0) {
1047                 thread_unlock(tx);
1048                 return (EPERM);
1049         }
1050 
1051         FX_ADJUST_PRI(reqfxuprilim);
1052 
1053         if ((fxkparmsp->fx_cflags & FX_DOUPRI) == 0)
1054                 reqfxupri = fxpp->fx_pri;
1055         else
1056                 reqfxupri = fxkparmsp->fx_upri;
1057 
1058 
1059         /*
1060          * Make sure the user priority doesn't exceed the upri limit.
1061          */
1062         if (reqfxupri > reqfxuprilim)
1063                 reqfxupri = reqfxuprilim;
1064 
1065         /*
1066          * Set fx_nice to the nice value corresponding to the user
1067          * priority we are setting.  Note that setting the nice field
1068          * of the parameter struct won't affect upri or nice.
1069          */
1070 
1071         nice = NZERO - (reqfxupri * NZERO) / fx_maxupri;
1072 
1073         if (nice > NZERO)
1074                 nice = NZERO;
1075 
1076         fxpp->fx_uprilim = reqfxuprilim;
1077         fxpp->fx_pri = reqfxupri;
1078 
1079         if (fxkparmsp->fx_tqntm == FX_TQINF)
1080                 fxpp->fx_pquantum = FX_TQINF;
1081         else if (fxkparmsp->fx_tqntm == FX_TQDEF)
1082                 fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1083         else if ((fxkparmsp->fx_cflags & FX_DOTQ) != 0)
1084                 fxpp->fx_pquantum = fxkparmsp->fx_tqntm;
1085 
1086         fxpp->fx_nice = nice;
1087 
1088         fx_change_priority(tx, fxpp);
1089         thread_unlock(tx);
1090         return (0);
1091 }
1092 
1093 
/*
 * Return the global scheduling priority corresponding to the thread's
 * current fixed-priority user priority (fx_pri).
 */
1098 static pri_t
1099 fx_globpri(kthread_t *t)
1100 {
1101         fxproc_t *fxpp;
1102 
1103         ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
1104 
1105         fxpp = (fxproc_t *)t->t_cldata;
1106         return (fx_dptbl[fxpp->fx_pri].fx_globpri);
1107 
1108 }
1109 
1110 /*
1111  * Arrange for thread to be placed in appropriate location
1112  * on dispatcher queue.
1113  *
1114  * This is called with the current thread in TS_ONPROC and locked.
1115  */
1116 static void
1117 fx_preempt(kthread_t *t)
1118 {
1119         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1120 
1121         ASSERT(t == curthread);
1122         ASSERT(THREAD_LOCK_HELD(curthread));
1123 
1124         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1125 
1126         /*
1127          * Check to see if we're doing "preemption control" here.  If
1128          * we are, and if the user has requested that this thread not
1129          * be preempted, and if preemptions haven't been put off for
1130          * too long, let the preemption happen here but try to make
1131          * sure the thread is rescheduled as soon as possible.  We do
1132          * this by putting it on the front of the highest priority run
1133          * queue in the FX class.  If the preemption has been put off
1134          * for too long, clear the "nopreempt" bit and let the thread
1135          * be preempted.
1136          */
1137         if (t->t_schedctl && schedctl_get_nopreempt(t)) {
1138                 if (fxpp->fx_pquantum == FX_TQINF ||
1139                     fxpp->fx_timeleft > -SC_MAX_TICKS) {
1140                         DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
1141                         schedctl_set_yield(t, 1);
1142                         setfrontdq(t);
1143                         return;
1144                 } else {
1145                         schedctl_set_nopreempt(t, 0);
1146                         DTRACE_SCHED1(schedctl__preempt, kthread_t *, t);
1147                         TNF_PROBE_2(schedctl_preempt, "schedctl FX fx_preempt",
1148                             /* CSTYLED */, tnf_pid, pid, ttoproc(t)->p_pid,
1149                             tnf_lwpid, lwpid, t->t_tid);
1150                         /*
1151                          * Fall through and be preempted below.
1152                          */
1153                 }
1154         }
1155 
1156         if (FX_HAS_CB(fxpp)) {
1157                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1158                 pri_t   newpri = fxpp->fx_pri;
1159                 FX_CB_PREEMPT(FX_CALLB(fxpp), fxpp->fx_cookie,
1160                     &new_quantum, &newpri);
1161                 FX_ADJUST_QUANTUM(new_quantum);
1162                 if ((int)new_quantum != fxpp->fx_pquantum) {
1163                         fxpp->fx_pquantum = (int)new_quantum;
1164                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1165                 }
1166                 FX_ADJUST_PRI(newpri);
1167                 fxpp->fx_pri = newpri;
1168                 THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1169         }
1170 
        /*
         * This thread may be placed on a wait queue by CPU caps. In that
         * case we do not need to do anything until it is removed from the
         * wait queue.
         */
1175         if (CPUCAPS_ENFORCE(t)) {
1176                 return;
1177         }
1178 
1179         if ((fxpp->fx_flags & (FXBACKQ)) == FXBACKQ) {
1180                 fxpp->fx_timeleft = fxpp->fx_pquantum;
1181                 fxpp->fx_flags &= ~FXBACKQ;
1182                 setbackdq(t);
1183         } else {
1184                 setfrontdq(t);
1185         }
1186 }
1187 
1188 static void
1189 fx_setrun(kthread_t *t)
1190 {
1191         fxproc_t *fxpp = (fxproc_t *)(t->t_cldata);
1192 
1193         ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */
1194         fxpp->fx_flags &= ~FXBACKQ;
1195 
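        /*
         * Threads whose t_disp_time is not the current lbolt tick are sent
         * to the back of their dispatch queue; a thread dispatched during
         * the current tick goes to the front.
         */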
1196         if (t->t_disp_time != ddi_get_lbolt())
1197                 setbackdq(t);
1198         else
1199                 setfrontdq(t);
1200 }
1201 
1202 
/*
 * Prepare thread for sleep.  The FX class does not boost a sleeping
 * thread's priority; we only account for CPU usage and notify any
 * registered callback.
 */
1207 static void
1208 fx_sleep(kthread_t *t)
1209 {
1210         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1211 
1212         ASSERT(t == curthread);
1213         ASSERT(THREAD_LOCK_HELD(t));
1214 
1215         /*
1216          * Account for time spent on CPU before going to sleep.
1217          */
1218         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1219 
1220         if (FX_HAS_CB(fxpp)) {
1221                 FX_CB_SLEEP(FX_CALLB(fxpp), fxpp->fx_cookie);
1222         }
1223 }
1224 
1225 /* ARGSUSED */
1226 static void
1227 fx_stop(kthread_t *t, int why, int what)
1228 {
1229         fxproc_t *fxpp = (fxproc_t *)(t->t_cldata);
1230 
1231         ASSERT(THREAD_LOCK_HELD(t));
1232 
1233         if (FX_HAS_CB(fxpp)) {
1234                 FX_CB_STOP(FX_CALLB(fxpp), fxpp->fx_cookie);
1235         }
1236 }
1237 
1238 /*
 * Check for time slice expiration.  If the time slice has expired,
 * set runrun to cause preemption.
1241  */
1242 static void
1243 fx_tick(kthread_t *t)
1244 {
1245         boolean_t call_cpu_surrender = B_FALSE;
1246         fxproc_t *fxpp;
1247 
1248         ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1249 
1250         thread_lock(t);
1251 
1252         fxpp = (fxproc_t *)(t->t_cldata);
1253 
1254         if (FX_HAS_CB(fxpp)) {
1255                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1256                 pri_t   newpri = fxpp->fx_pri;
1257                 FX_CB_TICK(FX_CALLB(fxpp), fxpp->fx_cookie,
1258                     &new_quantum, &newpri);
1259                 FX_ADJUST_QUANTUM(new_quantum);
1260                 if ((int)new_quantum != fxpp->fx_pquantum) {
1261                         fxpp->fx_pquantum = (int)new_quantum;
1262                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1263                 }
1264                 FX_ADJUST_PRI(newpri);
1265                 if (newpri != fxpp->fx_pri) {
1266                         fxpp->fx_pri = newpri;
1267                         fx_change_priority(t, fxpp);
1268                 }
1269         }
1270 
1271         /*
1272          * Keep track of thread's project CPU usage.  Note that projects
1273          * get charged even when threads are running in the kernel.
1274          */
1275         call_cpu_surrender =  CPUCAPS_CHARGE(t, &fxpp->fx_caps,
1276             CPUCAPS_CHARGE_ENFORCE);
1277 
1278         if ((fxpp->fx_pquantum != FX_TQINF) &&
1279             (--fxpp->fx_timeleft <= 0)) {
1280                 pri_t   new_pri;
1281 
1282                 /*
1283                  * If we're doing preemption control and trying to
1284                  * avoid preempting this thread, just note that
1285                  * the thread should yield soon and let it keep
1286                  * running (unless it's been a while).
1287                  */
1288                 if (t->t_schedctl && schedctl_get_nopreempt(t)) {
1289                         if (fxpp->fx_timeleft > -SC_MAX_TICKS) {
1290                                 DTRACE_SCHED1(schedctl__nopreempt,
1291                                     kthread_t *, t);
1292                                 schedctl_set_yield(t, 1);
1293                                 thread_unlock_nopreempt(t);
1294                                 return;
1295                         }
1296                         TNF_PROBE_2(schedctl_failsafe,
1297                             "schedctl FX fx_tick", /* CSTYLED */,
1298                             tnf_pid, pid, ttoproc(t)->p_pid,
1299                             tnf_lwpid, lwpid, t->t_tid);
1300                 }
1301                 new_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
1302                 ASSERT(new_pri >= 0 && new_pri <= fx_maxglobpri);
1303                 /*
1304                  * When the priority of a thread is changed,
1305                  * it may be necessary to adjust its position
1306                  * on a sleep queue or dispatch queue. Even
1307                  * when the priority is not changed, we need
1308                  * to preserve round robin on dispatch queue.
1309                  * The function thread_change_pri accomplishes
1310                  * this.
1311                  */
1312                 if (thread_change_pri(t, new_pri, 0)) {
1313                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1314                 } else {
1315                         call_cpu_surrender = B_TRUE;
1316                 }
1317         } else if (t->t_state == TS_ONPROC &&
1318             t->t_pri < t->t_disp_queue->disp_maxrunpri) {
1319                 call_cpu_surrender = B_TRUE;
1320         }
1321 
1322         if (call_cpu_surrender) {
1323                 fxpp->fx_flags |= FXBACKQ;
1324                 cpu_surrender(t);
1325         }
1326         thread_unlock_nopreempt(t);     /* clock thread can't be preempted */
1327 }
1328 
1329 
1330 static void
1331 fx_trapret(kthread_t *t)
1332 {
1333         cpu_t           *cp = CPU;
1334 
1335         ASSERT(THREAD_LOCK_HELD(t));
1336         ASSERT(t == curthread);
1337         ASSERT(cp->cpu_dispthread == t);
1338         ASSERT(t->t_state == TS_ONPROC);
1339 }
1340 
1341 
/*
 * Threads waking up go to the back of their dispatch queue, or to the
 * front if they were last dispatched during the current clock tick.
 */
1345 static void
1346 fx_wakeup(kthread_t *t)
1347 {
1348         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1349 
1350         ASSERT(THREAD_LOCK_HELD(t));
1351 
1352         if (FX_HAS_CB(fxpp)) {
1353                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1354                 pri_t   newpri = fxpp->fx_pri;
1355                 FX_CB_WAKEUP(FX_CALLB(fxpp), fxpp->fx_cookie,
1356                     &new_quantum, &newpri);
1357                 FX_ADJUST_QUANTUM(new_quantum);
1358                 if ((int)new_quantum != fxpp->fx_pquantum) {
1359                         fxpp->fx_pquantum = (int)new_quantum;
1360                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1361                 }
1362 
1363                 FX_ADJUST_PRI(newpri);
1364                 if (newpri != fxpp->fx_pri) {
1365                         fxpp->fx_pri = newpri;
1366                         THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1367                 }
1368         }
1369 
1370         fxpp->fx_flags &= ~FXBACKQ;
1371 
1372         if (t->t_disp_time != ddi_get_lbolt())
1373                 setbackdq(t);
1374         else
1375                 setfrontdq(t);
1376 }
1377 
1378 
1379 /*
1380  * When a thread yields, put it on the back of the run queue.
1381  */
1382 static void
1383 fx_yield(kthread_t *t)
1384 {
1385         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1386 
1387         ASSERT(t == curthread);
1388         ASSERT(THREAD_LOCK_HELD(t));
1389 
        /*
         * Account for CPU time used before yielding the CPU.
         */
1393         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1394 
1395         if (FX_HAS_CB(fxpp))  {
1396                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1397                 pri_t   newpri = fxpp->fx_pri;
1398                 FX_CB_PREEMPT(FX_CALLB(fxpp), fxpp->fx_cookie,
1399                     &new_quantum, &newpri);
1400                 FX_ADJUST_QUANTUM(new_quantum);
1401                 if ((int)new_quantum != fxpp->fx_pquantum) {
1402                         fxpp->fx_pquantum = (int)new_quantum;
1403                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1404                 }
1405                 FX_ADJUST_PRI(newpri);
1406                 fxpp->fx_pri = newpri;
1407                 THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1408         }
1409 
1410         /*
1411          * Clear the preemption control "yield" bit since the user is
1412          * doing a yield.
1413          */
1414         if (t->t_schedctl)
1415                 schedctl_set_yield(t, 0);
1416 
1417         if (fxpp->fx_timeleft <= 0) {
1418                 /*
1419                  * Time slice was artificially extended to avoid
1420                  * preemption, so pretend we're preempting it now.
1421                  */
1422                 DTRACE_SCHED1(schedctl__yield, int, -fxpp->fx_timeleft);
1423                 fxpp->fx_timeleft = fxpp->fx_pquantum;
1424                 THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1425                 ASSERT(t->t_pri >= 0 && t->t_pri <= fx_maxglobpri);
1426         }
1427 
1428         fxpp->fx_flags &= ~FXBACKQ;
1429         setbackdq(t);
1430 }
1431 
1432 /*
1433  * Increment the nice value of the specified thread by incr and
1434  * return the new value in *retvalp.
1435  */
1436 static int
1437 fx_donice(kthread_t *t, cred_t *cr, int incr, int *retvalp)
1438 {
1439         int             newnice;
1440         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1441         fxkparms_t      fxkparms;
1442 
1443         ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1444 
        /* No change to the nice value; just return the current setting */
1446         if (incr == 0) {
1447                 if (retvalp) {
1448                         *retvalp = fxpp->fx_nice - NZERO;
1449                 }
1450                 return (0);
1451         }
1452 
1453         if ((incr < 0 || incr > 2 * NZERO) &&
1454             secpolicy_raisepriority(cr) != 0)
1455                 return (EPERM);
1456 
1457         /*
1458          * Specifying a nice increment greater than the upper limit of
1459          * 2 * NZERO - 1 will result in the thread's nice value being
1460          * set to the upper limit.  We check for this before computing
1461          * the new value because otherwise we could get overflow
1462          * if a privileged user specified some ridiculous increment.
1463          */
1464         if (incr > 2 * NZERO - 1)
1465                 incr = 2 * NZERO - 1;
1466 
1467         newnice = fxpp->fx_nice + incr;
1468         if (newnice > NZERO)
1469                 newnice = NZERO;
1470         else if (newnice < 0)
1471                 newnice = 0;
1472 
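        /*
         * Convert back from nice to user priority; this inverts the
         * mapping used in fx_parmsset(): e.g. with a NZERO of 20 and
         * fx_maxupri of 60, nice 0 yields upri 60 and nice 20 yields
         * upri 0.
         */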
1473         fxkparms.fx_uprilim = fxkparms.fx_upri =
1474             -((newnice - NZERO) * fx_maxupri) / NZERO;
1475 
1476         fxkparms.fx_cflags = FX_DOUPRILIM | FX_DOUPRI;
1477 
1478         fxkparms.fx_tqntm = FX_TQDEF;
1479 
1480         /*
1481          * Reset the uprilim and upri values of the thread. Adjust
1482          * time quantum accordingly.
1483          */
1484 
1485         (void) fx_parmsset(t, (void *)&fxkparms, (id_t)0, (cred_t *)NULL);
1486 
1487         /*
1488          * Although fx_parmsset already reset fx_nice it may
1489          * not have been set to precisely the value calculated above
1490          * because fx_parmsset determines the nice value from the
1491          * user priority and we may have truncated during the integer
1492          * conversion from nice value to user priority and back.
1493          * We reset fx_nice to the value we calculated above.
1494          */
1495         fxpp->fx_nice = (char)newnice;
1496 
1497         if (retvalp)
1498                 *retvalp = newnice - NZERO;
1499 
1500         return (0);
1501 }
1502 
1503 /*
1504  * Increment the priority of the specified thread by incr and
1505  * return the new value in *retvalp.
1506  */
1507 static int
1508 fx_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp)
1509 {
1510         int             newpri;
1511         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1512         fxkparms_t      fxkparms;
1513 
1514         ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1515 
1516         /* If there's no change to priority, just return current setting */
1517         if (incr == 0) {
1518                 *retvalp = fxpp->fx_pri;
1519                 return (0);
1520         }
1521 
1522         newpri = fxpp->fx_pri + incr;
1523         if (newpri > fx_maxupri || newpri < 0)
1524                 return (EINVAL);
1525 
1526         *retvalp = newpri;
1527         fxkparms.fx_uprilim = fxkparms.fx_upri = newpri;
1528         fxkparms.fx_tqntm = FX_NOCHANGE;
1529         fxkparms.fx_cflags = FX_DOUPRILIM | FX_DOUPRI;
1530 
1531         /*
1532          * Reset the uprilim and upri values of the thread.
1533          */
1534         return (fx_parmsset(t, (void *)&fxkparms, (id_t)0, cr));
1535 }
1536 
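     /*
      * Propagate a thread's FX class priority (fx_pri) to its global
      * dispatch priority.  If the thread is running, force it to
      * surrender the CPU when required; otherwise adjust its position
      * on its sleep or dispatch queue and, if it was on a run queue,
      * refresh its time quantum.
      */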
1537 static void
1538 fx_change_priority(kthread_t *t, fxproc_t *fxpp)
1539 {
1540         pri_t   new_pri;
1541 
1542         ASSERT(THREAD_LOCK_HELD(t));
1543         new_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
1544         ASSERT(new_pri >= 0 && new_pri <= fx_maxglobpri);
1545         t->t_cpri = fxpp->fx_pri;
1546         if (t == curthread || t->t_state == TS_ONPROC) {
1547                 /* curthread is always onproc */
1548                 cpu_t   *cp = t->t_disp_queue->disp_cpu;
1549                 THREAD_CHANGE_PRI(t, new_pri);
1550                 if (t == cp->cpu_dispthread)
1551                         cp->cpu_dispatch_pri = DISP_PRIO(t);
1552                 if (DISP_MUST_SURRENDER(t)) {
1553                         fxpp->fx_flags |= FXBACKQ;
1554                         cpu_surrender(t);
1555                 } else {
1556                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1557                 }
1558         } else {
1559                 /*
1560                  * When the priority of a thread is changed,
1561                  * it may be necessary to adjust its position
1562                  * on a sleep queue or dispatch queue.
1563                  * The function thread_change_pri accomplishes
1564                  * this.
1565                  */
1566                 if (thread_change_pri(t, new_pri, 0)) {
1567                         /*
1568                          * The thread was on a run queue. Reset
1569                          * its CPU timeleft from the quantum
1570                          * associated with the new priority.
1571                          */
1572                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1573                 } else {
1574                         fxpp->fx_flags |= FXBACKQ;
1575                 }
1576         }
1577 }
1578 
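     /*
      * Allocate the FX class-specific data (an fxproc_t) for a thread.
      * Returns ENOMEM if the allocation fails.
      */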
1579 static int
1580 fx_alloc(void **p, int flag)
1581 {
1582         void *bufp;
1583 
1584         bufp = kmem_alloc(sizeof (fxproc_t), flag);
1585         if (bufp == NULL) {
1586                 return (ENOMEM);
1587         } else {
1588                 *p = bufp;
1589                 return (0);
1590         }
1591 }
1592 
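     /*
      * Free FX class-specific data previously allocated by fx_alloc.
      */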
1593 static void
1594 fx_free(void *bufp)
1595 {
1596         if (bufp)
1597                 kmem_free(bufp, sizeof (fxproc_t));
1598 }
1599 
1600 /*
1601  * Release the callback list mutex after successful lookup
1602  */
1603 void
1604 fx_list_release(fxproc_t *fxpp)
1605 {
1606         int index = FX_CB_LIST_HASH(fxpp->fx_ktid);
1607         kmutex_t *lockp = &fx_cb_list_lock[index];
1608         mutex_exit(lockp);
1609 }
1610 
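     /*
      * Look up the fxproc structure of the FX thread with the given
      * kernel thread id that has a registered callback vector.  On
      * success the corresponding callback list lock is returned held
      * and must be dropped with fx_list_release(); NULL is returned
      * if no match is found.
      */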
1611 fxproc_t *
1612 fx_list_lookup(kt_did_t ktid)
1613 {
1614         int index = FX_CB_LIST_HASH(ktid);
1615         kmutex_t *lockp = &fx_cb_list_lock[index];
1616         fxproc_t *fxpp;
1617 
1618         mutex_enter(lockp);
1619 
1620         for (fxpp = fx_cb_plisthead[index].fx_cb_next;
1621             fxpp != &fx_cb_plisthead[index]; fxpp = fxpp->fx_cb_next) {
1622                 if (fxpp->fx_tp->t_cid == fx_cid && fxpp->fx_ktid == ktid &&
1623                     fxpp->fx_callback != NULL) {
1624                         /*
1625                          * The caller is responsible for calling
1626                          * fx_list_release to drop the lock upon
1627                          * successful lookup
1628                          */
1629                         return (fxpp);
1630                 }
1631         }
1632         mutex_exit(lockp);
1633         return ((fxproc_t *)NULL);
1634 }
1635 
1636 
1637 /*
1638  * Register a set of callback routines for the current thread.
1639  * The thread must already be in the FX class.
1640  */
1641 int
1642 fx_register_callbacks(fx_callbacks_t *fx_callback, fx_cookie_t cookie,
1643         pri_t pri, clock_t quantum)
1644 {
1645 
1646         fxproc_t        *fxpp;
1647 
1648         if (fx_callback == NULL)
1649                 return (EINVAL);
1650 
1651         if (secpolicy_dispadm(CRED()) != 0)
1652                 return (EPERM);
1653 
1654         if (FX_CB_VERSION(fx_callback) != FX_CALLB_REV)
1655                 return (EINVAL);
1656 
1657         if (!FX_ISVALID(pri, quantum))
1658                 return (EINVAL);
1659 
1660         thread_lock(curthread);         /* get dispatcher lock on thread */
1661 
1662         if (curthread->t_cid != fx_cid) {
1663                 thread_unlock(curthread);
1664                 return (EINVAL);
1665         }
1666 
1667         fxpp = (fxproc_t *)(curthread->t_cldata);
1668         ASSERT(fxpp != NULL);
1669         if (FX_HAS_CB(fxpp)) {
1670                 thread_unlock(curthread);
1671                 return (EINVAL);
1672         }
1673 
1674         fxpp->fx_callback = fx_callback;
1675         fxpp->fx_cookie = cookie;
1676 
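             /*
              * Apply the requested priority and/or time quantum.  FX_TQDEF
              * selects the dispatch table quantum for the new priority,
              * FX_TQINF requests an infinite quantum, and FX_NOCHANGE
              * leaves the current value untouched.
              */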
1677         if (pri != FX_CB_NOCHANGE) {
1678                 fxpp->fx_pri = pri;
1679                 FX_ADJUST_PRI(fxpp->fx_pri);
1680                 if (quantum == FX_TQDEF) {
1681                         fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1682                 } else if (quantum == FX_TQINF) {
1683                         fxpp->fx_pquantum = FX_TQINF;
1684                 } else if (quantum != FX_NOCHANGE) {
1685                         FX_ADJUST_QUANTUM(quantum);
1686                         fxpp->fx_pquantum = quantum;
1687                 }
1688         } else if (quantum != FX_NOCHANGE && quantum != FX_TQDEF) {
1689                 if (quantum == FX_TQINF)
1690                         fxpp->fx_pquantum = FX_TQINF;
1691                 else {
1692                         FX_ADJUST_QUANTUM(quantum);
1693                         fxpp->fx_pquantum = quantum;
1694                 }
1695         }
1696 
1697         fxpp->fx_ktid = ddi_get_kt_did();
1698 
1699         fx_change_priority(curthread, fxpp);
1700 
1701         thread_unlock(curthread);
1702 
1703         /*
1704          * Link new structure into fxproc list.
1705          */
1706         FX_CB_LIST_INSERT(fxpp);
1707         return (0);
1708 }
1709 
1710 /* Unregister the set of callback routines for the current thread. */
1711 int
1712 fx_unregister_callbacks()
1713 {
1714         fxproc_t        *fxpp;
1715 
1716         if ((fxpp = fx_list_lookup(ddi_get_kt_did())) == NULL) {
1717                 /*
1718                  * The current thread does not have a registered callback.
1719                  */
1720                 return (EINVAL);
1721         }
1722 
1723         thread_lock(fxpp->fx_tp);
1724         fxpp->fx_callback = NULL;
1725         fxpp->fx_cookie = NULL;
1726         thread_unlock(fxpp->fx_tp);
1727         fx_list_release(fxpp);
1728 
1729         FX_CB_LIST_DELETE(fxpp);
1730         return (0);
1731 }
1732 
1733 /*
1734  * Modify the priority and/or time quantum of a thread with a callback.
1735  */
1736 int
1737 fx_modify_priority(kt_did_t ktid, clock_t quantum, pri_t pri)
1738 {
1739         fxproc_t        *fxpp;
1740 
1741         if (!FX_ISVALID(pri, quantum))
1742                 return (EINVAL);
1743 
1744         if ((fxpp = fx_list_lookup(ktid)) == NULL) {
1745                 /*
1746                  * The thread has either exited or does not have a
1747                  * registered callback.
1748                  */
1749                 return (ESRCH);
1750         }
1751 
1752         thread_lock(fxpp->fx_tp);
1753 
1754         if (pri != FX_CB_NOCHANGE) {
1755                 fxpp->fx_pri = pri;
1756                 FX_ADJUST_PRI(fxpp->fx_pri);
1757                 if (quantum == FX_TQDEF) {
1758                         fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1759                 } else if (quantum == FX_TQINF) {
1760                         fxpp->fx_pquantum = FX_TQINF;
1761                 } else if (quantum != FX_NOCHANGE) {
1762                         FX_ADJUST_QUANTUM(quantum);
1763                         fxpp->fx_pquantum = quantum;
1764                 }
1765         } else if (quantum != FX_NOCHANGE && quantum != FX_TQDEF) {
1766                 if (quantum == FX_TQINF) {
1767                         fxpp->fx_pquantum = FX_TQINF;
1768                 } else {
1769                         FX_ADJUST_QUANTUM(quantum);
1770                         fxpp->fx_pquantum = quantum;
1771                 }
1772         }
1773 
1774         fx_change_priority(fxpp->fx_tp, fxpp);
1775 
1776         thread_unlock(fxpp->fx_tp);
1777         fx_list_release(fxpp);
1778         return (0);
1779 }
1780 
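     /*
      * A minimal usage sketch of the callback interface (hypothetical
      * caller, for illustration only): a kernel thread that is already
      * in the FX class, with an fx_callbacks_t vector "my_cb" and state
      * "my_state" defined elsewhere, might do:
      *
      *     if (fx_register_callbacks(&my_cb, (fx_cookie_t)my_state,
      *         my_pri, FX_TQDEF) == 0) {
      *             ...
      *             (void) fx_modify_priority(ddi_get_kt_did(),
      *                 FX_NOCHANGE, new_pri);
      *             ...
      *             (void) fx_unregister_callbacks();
      *     }
      *
      * my_cb, my_state, my_pri and new_pri are placeholders, not
      * part of this file.
      */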
1781 
1782 /*
1783  * Return an iblock cookie for initializing mutexes used in callbacks.
1784  */
1785 void *
1786 fx_get_mutex_cookie()
1787 {
1788         return ((void *)(uintptr_t)__ipltospl(DISP_LEVEL));
1789 }
1790 
1791 /*
1792  * return maximum relative priority
1793  */
1794 pri_t
1795 fx_get_maxpri()
1796 {
1797         return (fx_maxumdpri);
1798 }