1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2013, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/param.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/cred.h>
  31 #include <sys/proc.h>
  32 #include <sys/session.h>
  33 #include <sys/strsubr.h>
  34 #include <sys/user.h>
  35 #include <sys/priocntl.h>
  36 #include <sys/class.h>
  37 #include <sys/disp.h>
  38 #include <sys/procset.h>
  39 #include <sys/debug.h>
  40 #include <sys/kmem.h>
  41 #include <sys/errno.h>
  42 #include <sys/fx.h>
  43 #include <sys/fxpriocntl.h>
  44 #include <sys/cpuvar.h>
  45 #include <sys/systm.h>
  46 #include <sys/vtrace.h>
  47 #include <sys/schedctl.h>
  48 #include <sys/tnf_probe.h>
  49 #include <sys/sunddi.h>
  50 #include <sys/spl.h>
  51 #include <sys/modctl.h>
  52 #include <sys/policy.h>
  53 #include <sys/sdt.h>
  54 #include <sys/cpupart.h>
  55 #include <sys/cpucaps.h>
  56 
  57 static pri_t fx_init(id_t, int, classfuncs_t **);
  58 
  59 static struct sclass csw = {
  60         "FX",
  61         fx_init,
  62         0
  63 };
  64 
  65 static struct modlsched modlsched = {
  66         &mod_schedops, "Fixed priority sched class", &csw
  67 };
  68 
  69 static struct modlinkage modlinkage = {
  70         MODREV_1, (void *)&modlsched, NULL
  71 };
  72 
  73 
  74 /*
  75  * control flags (kparms->fx_cflags).
  76  */
  77 #define FX_DOUPRILIM    0x01    /* change user priority limit */
  78 #define FX_DOUPRI       0x02    /* change user priority */
  79 #define FX_DOTQ         0x04    /* change FX time quantum */
  80 
  81 
  82 #define FXMAXUPRI 60            /* maximum user priority setting */
  83 
   84 #define FX_MAX_UNPRIV_PRI       0       /* maximum unprivileged priority */
  85 
  86 /*
   87  * The fxproc_t structures that have a registered callback vector
   88  * are also kept in an array of circular doubly linked lists. A hash on
   89  * the thread id (from ddi_get_kt_did()) determines the list on which
   90  * each such fxproc structure is placed. Each list has a dummy
  91  * "head" which is never removed, so the list is never empty.
  92  */
  93 
  94 #define FX_CB_LISTS 16          /* number of lists, must be power of 2 */
  95 #define FX_CB_LIST_HASH(ktid)   ((uint_t)ktid & (FX_CB_LISTS - 1))
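
      /*
       * Illustrative example (not used by the code): with FX_CB_LISTS == 16,
       * a thread id of 0x1234 obtained from ddi_get_kt_did() hashes to
       * FX_CB_LIST_HASH(0x1234) == (0x1234 & 0xf) == 4, so that fxproc would
       * be inserted on the list headed by fx_cb_plisthead[4].
       */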
  96 
  97 /* Insert fxproc into callback list */
  98 #define FX_CB_LIST_INSERT(fxpp)                                         \
  99 {                                                                       \
 100         int index = FX_CB_LIST_HASH(fxpp->fx_ktid);                  \
 101         kmutex_t *lockp = &fx_cb_list_lock[index];                  \
 102         fxproc_t *headp = &fx_cb_plisthead[index];                  \
 103         mutex_enter(lockp);                                             \
 104         fxpp->fx_cb_next = headp->fx_cb_next;                             \
 105         fxpp->fx_cb_prev = headp;                                    \
 106         headp->fx_cb_next->fx_cb_prev = fxpp;                             \
 107         headp->fx_cb_next = fxpp;                                    \
 108         mutex_exit(lockp);                                              \
 109 }
 110 
 111 /*
 112  * Remove thread from callback list.
 113  */
 114 #define FX_CB_LIST_DELETE(fxpp)                                         \
 115 {                                                                       \
 116         int index = FX_CB_LIST_HASH(fxpp->fx_ktid);                  \
 117         kmutex_t *lockp = &fx_cb_list_lock[index];                  \
 118         mutex_enter(lockp);                                             \
 119         fxpp->fx_cb_prev->fx_cb_next = fxpp->fx_cb_next;               \
 120         fxpp->fx_cb_next->fx_cb_prev = fxpp->fx_cb_prev;               \
 121         mutex_exit(lockp);                                              \
 122 }
 123 
 124 #define FX_HAS_CB(fxpp) (fxpp->fx_callback != NULL)
 125 
  126 /* clamp pri to the range 0 .. fx_maxumdpri */
 127 
 128 #define FX_ADJUST_PRI(pri)                                              \
 129 {                                                                       \
 130         if (pri < 0)                                                 \
 131                 pri = 0;                                                \
 132         else if (pri > fx_maxumdpri)                                         \
 133                 pri = fx_maxumdpri;                                     \
 134 }
 135 
 136 #define FX_ADJUST_QUANTUM(q)                                            \
 137 {                                                                       \
 138         if (q > INT_MAX)                                             \
 139                 q = INT_MAX;                                            \
 140         else if (q <= 0)                                             \
 141                 q = FX_TQINF;                                           \
 142 }
 143 
 144 #define FX_ISVALID(pri, quantum) \
 145         (((pri >= 0) || (pri == FX_CB_NOCHANGE)) &&                  \
 146             ((quantum >= 0) || (quantum == FX_NOCHANGE) ||           \
 147                 (quantum == FX_TQDEF) || (quantum == FX_TQINF)))
 148 
 149 
 150 static id_t     fx_cid;         /* fixed priority class ID */
 151 static fxdpent_t *fx_dptbl;     /* fixed priority disp parameter table */
 152 
 153 static pri_t    fx_maxupri = FXMAXUPRI;
 154 static pri_t    fx_maxumdpri;   /* max user mode fixed priority */
 155 
 156 static pri_t    fx_maxglobpri;  /* maximum global priority used by fx class */
 157 static kmutex_t fx_dptblock;    /* protects fixed priority dispatch table */
 158 
 159 
 160 static kmutex_t fx_cb_list_lock[FX_CB_LISTS];   /* protects list of fxprocs */
 161                                                 /* that have callbacks */
 162 static fxproc_t fx_cb_plisthead[FX_CB_LISTS];   /* dummy fxproc at head of */
 163                                                 /* list of fxprocs with */
 164                                                 /* callbacks */
 165 
 166 static int      fx_admin(caddr_t, cred_t *);
 167 static int      fx_getclinfo(void *);
 168 static int      fx_parmsin(void *);
 169 static int      fx_parmsout(void *, pc_vaparms_t *);
 170 static int      fx_vaparmsin(void *, pc_vaparms_t *);
 171 static int      fx_vaparmsout(void *, pc_vaparms_t *);
 172 static int      fx_getclpri(pcpri_t *);
 173 static int      fx_alloc(void **, int);
 174 static void     fx_free(void *);
 175 static int      fx_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
 176 static void     fx_exitclass(void *);
 177 static int      fx_canexit(kthread_t *, cred_t *);
 178 static int      fx_fork(kthread_t *, kthread_t *, void *);
 179 static void     fx_forkret(kthread_t *, kthread_t *);
 180 static void     fx_parmsget(kthread_t *, void *);
 181 static int      fx_parmsset(kthread_t *, void *, id_t, cred_t *);
 182 static void     fx_stop(kthread_t *, int, int);
 183 static void     fx_exit(kthread_t *);
 184 static pri_t    fx_swapin(kthread_t *, int);
 185 static pri_t    fx_swapout(kthread_t *, int);
 186 static void     fx_trapret(kthread_t *);
 187 static void     fx_preempt(kthread_t *);
 188 static void     fx_setrun(kthread_t *);
 189 static void     fx_sleep(kthread_t *);
 190 static void     fx_tick(kthread_t *);
 191 static void     fx_wakeup(kthread_t *);
 192 static int      fx_donice(kthread_t *, cred_t *, int, int *);
 193 static int      fx_doprio(kthread_t *, cred_t *, int, int *);
 194 static pri_t    fx_globpri(kthread_t *);
 195 static void     fx_yield(kthread_t *);
 196 static void     fx_nullsys();
 197 
 198 extern fxdpent_t *fx_getdptbl(void);
 199 
 200 static void     fx_change_priority(kthread_t *, fxproc_t *);
 201 static fxproc_t *fx_list_lookup(kt_did_t);
 202 static void fx_list_release(fxproc_t *);
 203 
 204 
 205 static struct classfuncs fx_classfuncs = {
 206         /* class functions */
 207         fx_admin,
 208         fx_getclinfo,
 209         fx_parmsin,
 210         fx_parmsout,
 211         fx_vaparmsin,
 212         fx_vaparmsout,
 213         fx_getclpri,
 214         fx_alloc,
 215         fx_free,
 216 
 217         /* thread functions */
 218         fx_enterclass,
 219         fx_exitclass,
 220         fx_canexit,
 221         fx_fork,
 222         fx_forkret,
 223         fx_parmsget,
 224         fx_parmsset,
 225         fx_stop,
 226         fx_exit,
 227         fx_nullsys,     /* active */
 228         fx_nullsys,     /* inactive */
 229         fx_swapin,
 230         fx_swapout,
 231         fx_trapret,
 232         fx_preempt,
 233         fx_setrun,
 234         fx_sleep,
 235         fx_tick,
 236         fx_wakeup,
 237         fx_donice,
 238         fx_globpri,
 239         fx_nullsys,     /* set_process_group */
 240         fx_yield,
 241         fx_doprio,
 242 };
 243 
 244 
 245 int
 246 _init()
 247 {
 248         return (mod_install(&modlinkage));
 249 }
 250 
 251 int
 252 _fini()
 253 {
 254         return (EBUSY);
 255 }
 256 
 257 int
 258 _info(struct modinfo *modinfop)
 259 {
 260         return (mod_info(&modlinkage, modinfop));
 261 }
 262 
 263 /*
 264  * Fixed priority class initialization. Called by dispinit() at boot time.
 265  * We can ignore the clparmsz argument since we know that the smallest
 266  * possible parameter buffer is big enough for us.
 267  */
 268 /* ARGSUSED */
 269 static pri_t
 270 fx_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
 271 {
 272         int i;
 273         extern pri_t fx_getmaxumdpri(void);
 274 
 275         fx_dptbl = fx_getdptbl();
 276         fx_maxumdpri = fx_getmaxumdpri();
 277         fx_maxglobpri = fx_dptbl[fx_maxumdpri].fx_globpri;
 278 
 279         fx_cid = cid;           /* Record our class ID */
 280 
 281         /*
 282          * Initialize the hash table for fxprocs with callbacks
 283          */
 284         for (i = 0; i < FX_CB_LISTS; i++) {
 285                 fx_cb_plisthead[i].fx_cb_next = fx_cb_plisthead[i].fx_cb_prev =
 286                     &fx_cb_plisthead[i];
 287         }
 288 
 289         /*
 290          * We're required to return a pointer to our classfuncs
 291          * structure and the highest global priority value we use.
 292          */
 293         *clfuncspp = &fx_classfuncs;
 294         return (fx_maxglobpri);
 295 }
 296 
 297 /*
 298  * Get or reset the fx_dptbl values per the user's request.
 299  */
 300 static int
 301 fx_admin(caddr_t uaddr, cred_t *reqpcredp)
 302 {
 303         fxadmin_t       fxadmin;
 304         fxdpent_t       *tmpdpp;
 305         int             userdpsz;
 306         int             i;
 307         size_t          fxdpsz;
 308 
 309         if (get_udatamodel() == DATAMODEL_NATIVE) {
 310                 if (copyin(uaddr, &fxadmin, sizeof (fxadmin_t)))
 311                         return (EFAULT);
 312         }
 313 #ifdef _SYSCALL32_IMPL
 314         else {
 315                 /* get fxadmin struct from ILP32 caller */
 316                 fxadmin32_t fxadmin32;
 317                 if (copyin(uaddr, &fxadmin32, sizeof (fxadmin32_t)))
 318                         return (EFAULT);
 319                 fxadmin.fx_dpents =
 320                     (struct fxdpent *)(uintptr_t)fxadmin32.fx_dpents;
 321                 fxadmin.fx_ndpents = fxadmin32.fx_ndpents;
 322                 fxadmin.fx_cmd = fxadmin32.fx_cmd;
 323         }
 324 #endif /* _SYSCALL32_IMPL */
 325 
 326         fxdpsz = (fx_maxumdpri + 1) * sizeof (fxdpent_t);
 327 
 328         switch (fxadmin.fx_cmd) {
 329         case FX_GETDPSIZE:
 330                 fxadmin.fx_ndpents = fx_maxumdpri + 1;
 331 
 332                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 333                         if (copyout(&fxadmin, uaddr, sizeof (fxadmin_t)))
 334                                 return (EFAULT);
 335                 }
 336 #ifdef _SYSCALL32_IMPL
 337                 else {
 338                         /* return fxadmin struct to ILP32 caller */
 339                         fxadmin32_t fxadmin32;
 340                         fxadmin32.fx_dpents =
 341                             (caddr32_t)(uintptr_t)fxadmin.fx_dpents;
 342                         fxadmin32.fx_ndpents = fxadmin.fx_ndpents;
 343                         fxadmin32.fx_cmd = fxadmin.fx_cmd;
 344                         if (copyout(&fxadmin32, uaddr, sizeof (fxadmin32_t)))
 345                                 return (EFAULT);
 346                 }
 347 #endif /* _SYSCALL32_IMPL */
 348                 break;
 349 
 350         case FX_GETDPTBL:
 351                 userdpsz = MIN(fxadmin.fx_ndpents * sizeof (fxdpent_t),
 352                     fxdpsz);
 353                 if (copyout(fx_dptbl, fxadmin.fx_dpents, userdpsz))
 354                         return (EFAULT);
 355 
 356                 fxadmin.fx_ndpents = userdpsz / sizeof (fxdpent_t);
 357 
 358                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 359                         if (copyout(&fxadmin, uaddr, sizeof (fxadmin_t)))
 360                                 return (EFAULT);
 361                 }
 362 #ifdef _SYSCALL32_IMPL
 363                 else {
 364                         /* return fxadmin struct to ILP32 callers */
 365                         fxadmin32_t fxadmin32;
 366                         fxadmin32.fx_dpents =
 367                             (caddr32_t)(uintptr_t)fxadmin.fx_dpents;
 368                         fxadmin32.fx_ndpents = fxadmin.fx_ndpents;
 369                         fxadmin32.fx_cmd = fxadmin.fx_cmd;
 370                         if (copyout(&fxadmin32, uaddr, sizeof (fxadmin32_t)))
 371                                 return (EFAULT);
 372                 }
 373 #endif /* _SYSCALL32_IMPL */
 374                 break;
 375 
 376         case FX_SETDPTBL:
 377                 /*
 378                  * We require that the requesting process has sufficient
 379                  * privileges. We also require that the table supplied by
 380                  * the user exactly match the current fx_dptbl in size.
 381                  */
 382                 if (secpolicy_dispadm(reqpcredp) != 0) {
 383                         return (EPERM);
 384                 }
 385                 if (fxadmin.fx_ndpents * sizeof (fxdpent_t) != fxdpsz) {
 386                         return (EINVAL);
 387                 }
 388 
 389                 /*
 390                  * We read the user supplied table into a temporary buffer
 391                  * where it is validated before being copied over the
 392                  * fx_dptbl.
 393                  */
 394                 tmpdpp = kmem_alloc(fxdpsz, KM_SLEEP);
 395                 if (copyin(fxadmin.fx_dpents, tmpdpp, fxdpsz)) {
 396                         kmem_free(tmpdpp, fxdpsz);
 397                         return (EFAULT);
 398                 }
 399                 for (i = 0; i < fxadmin.fx_ndpents; i++) {
 400 
 401                         /*
 402                          * Validate the user supplied values. All we are doing
 403                          * here is verifying that the values are within their
 404                          * allowable ranges and will not panic the system. We
 405                          * make no attempt to ensure that the resulting
 406                          * configuration makes sense or results in reasonable
 407                          * performance.
 408                          */
 409                         if (tmpdpp[i].fx_quantum <= 0 &&
 410                             tmpdpp[i].fx_quantum != FX_TQINF) {
 411                                 kmem_free(tmpdpp, fxdpsz);
 412                                 return (EINVAL);
 413                         }
 414                 }
 415 
 416                 /*
 417                  * Copy the user supplied values over the current fx_dptbl
 418                  * values. The fx_globpri member is read-only so we don't
 419                  * overwrite it.
 420                  */
 421                 mutex_enter(&fx_dptblock);
 422                 for (i = 0; i < fxadmin.fx_ndpents; i++) {
 423                         fx_dptbl[i].fx_quantum = tmpdpp[i].fx_quantum;
 424                 }
 425                 mutex_exit(&fx_dptblock);
 426                 kmem_free(tmpdpp, fxdpsz);
 427                 break;
 428 
 429         default:
 430                 return (EINVAL);
 431         }
 432         return (0);
 433 }
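
      /*
       * Illustrative userland sketch (not part of this module): roughly how a
       * dispadmin(1M)-style tool might read the FX dispatch table through
       * priocntl(2) with PC_ADMIN, which eventually reaches fx_admin() above.
       * Error handling and most headers are omitted, and the idtype/id
       * arguments are assumed to be ignored for these commands.
       *
       *	#include <sys/priocntl.h>
       *	#include <sys/fxpriocntl.h>
       *
       *	pcinfo_t pcinfo;
       *	fxadmin_t fxadmin;
       *	pcadmin_t pcadmin;
       *
       *	(void) strcpy(pcinfo.pc_clname, "FX");
       *	(void) priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo);
       *
       *	pcadmin.pc_cid = pcinfo.pc_cid;
       *	pcadmin.pc_cladmin = (caddr_t)&fxadmin;
       *
       *	fxadmin.fx_cmd = FX_GETDPSIZE;		(how many entries?)
       *	(void) priocntl(0, 0, PC_ADMIN, (caddr_t)&pcadmin);
       *
       *	fxadmin.fx_dpents = malloc(fxadmin.fx_ndpents * sizeof (fxdpent_t));
       *	fxadmin.fx_cmd = FX_GETDPTBL;		(fetch the table itself)
       *	(void) priocntl(0, 0, PC_ADMIN, (caddr_t)&pcadmin);
       */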
 434 
 435 /*
 436  * Allocate a fixed priority class specific thread structure and
 437  * initialize it with the parameters supplied. Also move the thread
  438  * to the specified priority.
 439  */
 440 static int
 441 fx_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
 442     void *bufp)
 443 {
 444         fxkparms_t      *fxkparmsp = (fxkparms_t *)parmsp;
 445         fxproc_t        *fxpp;
 446         pri_t           reqfxupri;
 447         pri_t           reqfxuprilim;
 448 
 449         fxpp = (fxproc_t *)bufp;
 450         ASSERT(fxpp != NULL);
 451 
 452         /*
 453          * Initialize the fxproc structure.
 454          */
 455         fxpp->fx_flags = 0;
 456         fxpp->fx_callback = NULL;
 457         fxpp->fx_cookie = NULL;
 458 
 459         if (fxkparmsp == NULL) {
 460                 /*
 461                  * Use default values.
 462                  */
 463                 fxpp->fx_pri = fxpp->fx_uprilim = 0;
 464                 fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
 465                 fxpp->fx_nice =  NZERO;
 466         } else {
 467                 /*
 468                  * Use supplied values.
 469                  */
 470 
 471                 if ((fxkparmsp->fx_cflags & FX_DOUPRILIM) == 0) {
 472                         reqfxuprilim = 0;
 473                 } else {
 474                         if (fxkparmsp->fx_uprilim > FX_MAX_UNPRIV_PRI &&
 475                             secpolicy_setpriority(reqpcredp) != 0)
 476                                 return (EPERM);
 477                         reqfxuprilim = fxkparmsp->fx_uprilim;
 478                         FX_ADJUST_PRI(reqfxuprilim);
 479                 }
 480 
 481                 if ((fxkparmsp->fx_cflags & FX_DOUPRI) == 0) {
 482                         reqfxupri = reqfxuprilim;
 483                 } else {
 484                         if (fxkparmsp->fx_upri > FX_MAX_UNPRIV_PRI &&
 485                             secpolicy_setpriority(reqpcredp) != 0)
 486                                 return (EPERM);
 487                         /*
 488                          * Set the user priority to the requested value
 489                          * or the upri limit, whichever is lower.
 490                          */
 491                         reqfxupri = fxkparmsp->fx_upri;
 492                         FX_ADJUST_PRI(reqfxupri);
 493 
 494                         if (reqfxupri > reqfxuprilim)
 495                                 reqfxupri = reqfxuprilim;
 496                 }
 497 
 498 
 499                 fxpp->fx_uprilim = reqfxuprilim;
 500                 fxpp->fx_pri = reqfxupri;
 501 
 502                 fxpp->fx_nice = NZERO - (NZERO * reqfxupri) / fx_maxupri;
 503 
 504                 if (((fxkparmsp->fx_cflags & FX_DOTQ) == 0) ||
 505                     (fxkparmsp->fx_tqntm == FX_TQDEF)) {
 506                         fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
 507                 } else {
 508                         if (secpolicy_setpriority(reqpcredp) != 0)
 509                                 return (EPERM);
 510 
 511                         if (fxkparmsp->fx_tqntm == FX_TQINF)
 512                                 fxpp->fx_pquantum = FX_TQINF;
 513                         else {
 514                                 fxpp->fx_pquantum = fxkparmsp->fx_tqntm;
 515                         }
 516                 }
 517 
 518         }
 519 
 520         fxpp->fx_timeleft = fxpp->fx_pquantum;
 521         cpucaps_sc_init(&fxpp->fx_caps);
 522         fxpp->fx_tp = t;
 523 
 524         thread_lock(t);                 /* get dispatcher lock on thread */
 525         t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
 526         t->t_cid = cid;
 527         t->t_cldata = (void *)fxpp;
 528         t->t_schedflag &= ~TS_RUNQMATCH;
 529         fx_change_priority(t, fxpp);
 530         thread_unlock(t);
 531 
 532         return (0);
 533 }
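
      /*
       * Worked example for the fx_nice computation above, assuming the default
       * fx_maxupri of 60 (FXMAXUPRI) and NZERO of 20: a requested user priority
       * of 30 gives fx_nice = NZERO - (NZERO * 30) / fx_maxupri = 20 - 10 = 10,
       * so higher FX user priorities map to numerically lower nice values.
       */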
 534 
 535 /*
 536  * The thread is exiting.
 537  */
 538 static void
 539 fx_exit(kthread_t *t)
 540 {
 541         fxproc_t *fxpp;
 542 
 543         thread_lock(t);
 544         fxpp = (fxproc_t *)(t->t_cldata);
 545 
 546         /*
 547          * A thread could be exiting in between clock ticks, so we need to
 548          * calculate how much CPU time it used since it was charged last time.
 549          *
 550          * CPU caps are not enforced on exiting processes - it is usually
 551          * desirable to exit as soon as possible to free resources.
 552          */
 553         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
 554 
 555         if (FX_HAS_CB(fxpp)) {
 556                 FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie);
 557                 fxpp->fx_callback = NULL;
 558                 fxpp->fx_cookie = NULL;
 559                 thread_unlock(t);
 560                 FX_CB_LIST_DELETE(fxpp);
 561                 return;
 562         }
 563 
 564         thread_unlock(t);
 565 }
 566 
 567 /*
  568  * The thread is leaving the class. Free its fxproc structure.
 569  */
 570 static void
 571 fx_exitclass(void *procp)
 572 {
 573         fxproc_t *fxpp = (fxproc_t *)procp;
 574 
 575         thread_lock(fxpp->fx_tp);
 576         if (FX_HAS_CB(fxpp)) {
 577 
 578                 FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie);
 579 
 580                 fxpp->fx_callback = NULL;
 581                 fxpp->fx_cookie = NULL;
 582                 thread_unlock(fxpp->fx_tp);
 583                 FX_CB_LIST_DELETE(fxpp);
 584         } else
 585                 thread_unlock(fxpp->fx_tp);
 586 
 587         kmem_free(fxpp, sizeof (fxproc_t));
 588 }
 589 
 590 /* ARGSUSED */
 591 static int
 592 fx_canexit(kthread_t *t, cred_t *cred)
 593 {
 594         /*
 595          * A thread can always leave the FX class
 596          */
 597         return (0);
 598 }
 599 
 600 /*
 601  * Initialize fixed-priority class specific proc structure for a child.
  602  * Callbacks are not inherited upon fork.
 603  */
 604 static int
 605 fx_fork(kthread_t *t, kthread_t *ct, void *bufp)
 606 {
 607         fxproc_t        *pfxpp;         /* ptr to parent's fxproc structure */
 608         fxproc_t        *cfxpp;         /* ptr to child's fxproc structure */
 609 
 610         ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
 611 
 612         cfxpp = (fxproc_t *)bufp;
 613         ASSERT(cfxpp != NULL);
 614         thread_lock(t);
 615         pfxpp = (fxproc_t *)t->t_cldata;
 616         /*
 617          * Initialize child's fxproc structure.
 618          */
 619         cfxpp->fx_timeleft = cfxpp->fx_pquantum = pfxpp->fx_pquantum;
 620         cfxpp->fx_pri = pfxpp->fx_pri;
 621         cfxpp->fx_uprilim = pfxpp->fx_uprilim;
 622         cfxpp->fx_nice = pfxpp->fx_nice;
 623         cfxpp->fx_callback = NULL;
 624         cfxpp->fx_cookie = NULL;
 625         cfxpp->fx_flags = pfxpp->fx_flags & ~(FXBACKQ);
 626         cpucaps_sc_init(&cfxpp->fx_caps);
 627 
 628         cfxpp->fx_tp = ct;
 629         ct->t_cldata = (void *)cfxpp;
 630         thread_unlock(t);
 631 
 632         /*
  633          * The child has no callback and thus is not on any callback list.
 634          */
 635         return (0);
 636 }
 637 
 638 
 639 /*
  640  * The child is placed at the back of the dispatcher queue and the
  641  * parent gives up the processor so that the child runs first after
  642  * the fork. This allows a child that execs immediately to break the
  643  * shared use of copy-on-write pages with no disk home; the parent can
  644  * then steal them back rather than uselessly copying them.
 645  */
 646 static void
 647 fx_forkret(kthread_t *t, kthread_t *ct)
 648 {
 649         proc_t  *pp = ttoproc(t);
 650         proc_t  *cp = ttoproc(ct);
 651         fxproc_t *fxpp;
 652 
 653         ASSERT(t == curthread);
 654         ASSERT(MUTEX_HELD(&pidlock));
 655 
 656         /*
 657          * Grab the child's p_lock before dropping pidlock to ensure
 658          * the process does not disappear before we set it running.
 659          */
 660         mutex_enter(&cp->p_lock);
 661         continuelwps(cp);
 662         mutex_exit(&cp->p_lock);
 663 
 664         mutex_enter(&pp->p_lock);
 665         mutex_exit(&pidlock);
 666         continuelwps(pp);
 667 
 668         thread_lock(t);
 669         fxpp = (fxproc_t *)(t->t_cldata);
 670         t->t_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
 671         ASSERT(t->t_pri >= 0 && t->t_pri <= fx_maxglobpri);
 672         THREAD_TRANSITION(t);
 673         fx_setrun(t);
 674         thread_unlock(t);
 675         /*
 676          * Safe to drop p_lock now since it is safe to change
 677          * the scheduling class after this point.
 678          */
 679         mutex_exit(&pp->p_lock);
 680 
 681         swtch();
 682 }
 683 
 684 
 685 /*
 686  * Get information about the fixed-priority class into the buffer
 687  * pointed to by fxinfop. The maximum configured user priority
 688  * is the only information we supply.
 689  */
 690 static int
 691 fx_getclinfo(void *infop)
 692 {
 693         fxinfo_t *fxinfop = (fxinfo_t *)infop;
 694         fxinfop->fx_maxupri = fx_maxupri;
 695         return (0);
 696 }
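
      /*
       * Illustrative userland sketch: the fxinfo_t filled in above is what a
       * PC_GETCID or PC_GETCLINFO priocntl(2) call returns in pc_clinfo, so a
       * program can discover the configured maximum FX user priority roughly
       * as follows (error handling omitted; the idtype/id arguments are
       * assumed to be ignored for this command):
       *
       *	pcinfo_t pcinfo;
       *	fxinfo_t *fxinfop;
       *
       *	(void) strcpy(pcinfo.pc_clname, "FX");
       *	if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) != -1) {
       *		fxinfop = (fxinfo_t *)pcinfo.pc_clinfo;
       *		(void) printf("FX maxupri = %d\n", fxinfop->fx_maxupri);
       *	}
       */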
 697 
 698 
 699 
 700 /*
 701  * Return the user mode scheduling priority range.
 702  */
 703 static int
 704 fx_getclpri(pcpri_t *pcprip)
 705 {
 706         pcprip->pc_clpmax = fx_maxupri;
 707         pcprip->pc_clpmin = 0;
 708         return (0);
 709 }
 710 
 711 
 712 static void
 713 fx_nullsys()
 714 {}
 715 
 716 
 717 /*
  718  * Get the fixed-priority parameters of the thread t into the buffer
  719  * pointed to by parmsp.
 720  */
 721 static void
 722 fx_parmsget(kthread_t *t, void *parmsp)
 723 {
 724         fxproc_t *fxpp = (fxproc_t *)t->t_cldata;
 725         fxkparms_t *fxkparmsp = (fxkparms_t *)parmsp;
 726 
 727         fxkparmsp->fx_upri = fxpp->fx_pri;
 728         fxkparmsp->fx_uprilim = fxpp->fx_uprilim;
 729         fxkparmsp->fx_tqntm = fxpp->fx_pquantum;
 730 }
 731 
 732 
 733 
 734 /*
 735  * Check the validity of the fixed-priority parameters in the buffer
  736  * pointed to by parmsp.
 737  */
 738 static int
 739 fx_parmsin(void *parmsp)
 740 {
 741         fxparms_t       *fxparmsp = (fxparms_t *)parmsp;
 742         uint_t          cflags;
 743         longlong_t      ticks;
 744         /*
 745          * Check validity of parameters.
 746          */
 747 
 748         if ((fxparmsp->fx_uprilim > fx_maxupri ||
 749             fxparmsp->fx_uprilim < 0) &&
 750             fxparmsp->fx_uprilim != FX_NOCHANGE)
 751                 return (EINVAL);
 752 
 753         if ((fxparmsp->fx_upri > fx_maxupri ||
 754             fxparmsp->fx_upri < 0) &&
 755             fxparmsp->fx_upri != FX_NOCHANGE)
 756                 return (EINVAL);
 757 
 758         if ((fxparmsp->fx_tqsecs == 0 && fxparmsp->fx_tqnsecs == 0) ||
 759             fxparmsp->fx_tqnsecs >= NANOSEC)
 760                 return (EINVAL);
 761 
 762         cflags = (fxparmsp->fx_upri != FX_NOCHANGE ? FX_DOUPRI : 0);
 763 
 764         if (fxparmsp->fx_uprilim != FX_NOCHANGE) {
 765                 cflags |= FX_DOUPRILIM;
 766         }
 767 
 768         if (fxparmsp->fx_tqnsecs != FX_NOCHANGE)
 769                 cflags |= FX_DOTQ;
 770 
 771         /*
 772          * convert the buffer to kernel format.
 773          */
 774 
 775         if (fxparmsp->fx_tqnsecs >= 0) {
 776                 if ((ticks = SEC_TO_TICK((longlong_t)fxparmsp->fx_tqsecs) +
 777                     NSEC_TO_TICK_ROUNDUP(fxparmsp->fx_tqnsecs)) > INT_MAX)
 778                         return (ERANGE);
 779 
 780                 ((fxkparms_t *)fxparmsp)->fx_tqntm = (int)ticks;
 781         } else {
 782                 if ((fxparmsp->fx_tqnsecs != FX_NOCHANGE) &&
 783                     (fxparmsp->fx_tqnsecs != FX_TQINF) &&
 784                     (fxparmsp->fx_tqnsecs != FX_TQDEF))
 785                         return (EINVAL);
 786                 ((fxkparms_t *)fxparmsp)->fx_tqntm = fxparmsp->fx_tqnsecs;
 787         }
 788 
 789         ((fxkparms_t *)fxparmsp)->fx_cflags = cflags;
 790 
 791         return (0);
 792 }
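
      /*
       * Worked example for the quantum conversion above, assuming hz = 100
       * (10ms clock ticks): fx_tqsecs = 2 and fx_tqnsecs = 25000000 (25ms)
       * yield fx_tqntm = SEC_TO_TICK(2) + NSEC_TO_TICK_ROUNDUP(25000000)
       * = 200 + 3 = 203 ticks, since the 25ms remainder is rounded up to
       * the next whole tick.
       */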
 793 
 794 
 795 /*
 796  * Check the validity of the fixed-priority parameters in the pc_vaparms_t
 797  * structure vaparmsp and put them in the buffer pointed to by fxprmsp.
  798  * pc_vaparms_t contains (key, value) parameter pairs.
 799  */
 800 static int
 801 fx_vaparmsin(void *prmsp, pc_vaparms_t *vaparmsp)
 802 {
 803         uint_t          secs = 0;
 804         uint_t          cnt;
 805         int             nsecs = 0;
 806         int             priflag, secflag, nsecflag, limflag;
 807         longlong_t      ticks;
 808         fxkparms_t      *fxprmsp = (fxkparms_t *)prmsp;
 809         pc_vaparm_t     *vpp = &vaparmsp->pc_parms[0];
 810 
 811 
 812         /*
 813          * First check the validity of parameters and convert them
 814          * from the user supplied format to the internal format.
 815          */
 816         priflag = secflag = nsecflag = limflag = 0;
 817 
 818         fxprmsp->fx_cflags = 0;
 819 
 820         if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
 821                 return (EINVAL);
 822 
 823         for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
 824 
 825                 switch (vpp->pc_key) {
 826                 case FX_KY_UPRILIM:
 827                         if (limflag++)
 828                                 return (EINVAL);
 829                         fxprmsp->fx_cflags |= FX_DOUPRILIM;
 830                         fxprmsp->fx_uprilim = (pri_t)vpp->pc_parm;
 831                         if (fxprmsp->fx_uprilim > fx_maxupri ||
 832                             fxprmsp->fx_uprilim < 0)
 833                                 return (EINVAL);
 834                         break;
 835 
 836                 case FX_KY_UPRI:
 837                         if (priflag++)
 838                                 return (EINVAL);
 839                         fxprmsp->fx_cflags |= FX_DOUPRI;
 840                         fxprmsp->fx_upri = (pri_t)vpp->pc_parm;
 841                         if (fxprmsp->fx_upri > fx_maxupri ||
 842                             fxprmsp->fx_upri < 0)
 843                                 return (EINVAL);
 844                         break;
 845 
 846                 case FX_KY_TQSECS:
 847                         if (secflag++)
 848                                 return (EINVAL);
 849                         fxprmsp->fx_cflags |= FX_DOTQ;
 850                         secs = (uint_t)vpp->pc_parm;
 851                         break;
 852 
 853                 case FX_KY_TQNSECS:
 854                         if (nsecflag++)
 855                                 return (EINVAL);
 856                         fxprmsp->fx_cflags |= FX_DOTQ;
 857                         nsecs = (int)vpp->pc_parm;
 858                         break;
 859 
 860                 default:
 861                         return (EINVAL);
 862                 }
 863         }
 864 
 865         if (vaparmsp->pc_vaparmscnt == 0) {
 866                 /*
 867                  * Use default parameters.
 868                  */
 869                 fxprmsp->fx_upri = 0;
 870                 fxprmsp->fx_uprilim = 0;
 871                 fxprmsp->fx_tqntm = FX_TQDEF;
 872                 fxprmsp->fx_cflags = FX_DOUPRI | FX_DOUPRILIM | FX_DOTQ;
 873         } else if ((fxprmsp->fx_cflags & FX_DOTQ) != 0) {
 874                 if ((secs == 0 && nsecs == 0) || nsecs >= NANOSEC)
 875                         return (EINVAL);
 876 
 877                 if (nsecs >= 0) {
 878                         if ((ticks = SEC_TO_TICK((longlong_t)secs) +
 879                             NSEC_TO_TICK_ROUNDUP(nsecs)) > INT_MAX)
 880                                 return (ERANGE);
 881 
 882                         fxprmsp->fx_tqntm = (int)ticks;
 883                 } else {
 884                         if (nsecs != FX_TQINF && nsecs != FX_TQDEF)
 885                                 return (EINVAL);
 886                         fxprmsp->fx_tqntm = nsecs;
 887                 }
 888         }
 889 
 890         return (0);
 891 }
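
      /*
       * Illustrative example of the key-value form consumed above: a caller of
       * priocntl(2) with PC_SETXPARMS would, conceptually, hand the kernel a
       * pc_vaparms_t like the following (how libc actually marshals it is not
       * shown), which fx_vaparmsin() turns into fx_upri = 30 with an infinite
       * time quantum and fx_cflags = FX_DOUPRI | FX_DOTQ:
       *
       *	pc_vaparms_t vaparms;
       *
       *	vaparms.pc_vaparmscnt = 2;
       *	vaparms.pc_parms[0].pc_key = FX_KY_UPRI;
       *	vaparms.pc_parms[0].pc_parm = 30;
       *	vaparms.pc_parms[1].pc_key = FX_KY_TQNSECS;
       *	vaparms.pc_parms[1].pc_parm = (u_longlong_t)FX_TQINF;
       */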
 892 
 893 
 894 /*
  895  * Convert the time quantum from ticks back to seconds and nanoseconds.
 896  */
 897 /* ARGSUSED */
 898 static int
 899 fx_parmsout(void *parmsp, pc_vaparms_t *vaparmsp)
 900 {
 901         register fxkparms_t     *fxkprmsp = (fxkparms_t *)parmsp;
 902 
 903         if (vaparmsp != NULL)
 904                 return (0);
 905 
 906         if (fxkprmsp->fx_tqntm < 0) {
 907                 /*
 908                  * Quantum field set to special value (e.g. FX_TQINF)
 909                  */
 910                 ((fxparms_t *)fxkprmsp)->fx_tqnsecs = fxkprmsp->fx_tqntm;
 911                 ((fxparms_t *)fxkprmsp)->fx_tqsecs = 0;
 912 
 913         } else {
 914                 /* Convert quantum from ticks to seconds-nanoseconds */
 915 
 916                 timestruc_t ts;
 917                 TICK_TO_TIMESTRUC(fxkprmsp->fx_tqntm, &ts);
 918                 ((fxparms_t *)fxkprmsp)->fx_tqsecs = ts.tv_sec;
 919                 ((fxparms_t *)fxkprmsp)->fx_tqnsecs = ts.tv_nsec;
 920         }
 921 
 922         return (0);
 923 }
 924 
 925 
 926 /*
 927  * Copy all selected fixed-priority class parameters to the user.
 928  * The parameters are specified by a key.
 929  */
 930 static int
 931 fx_vaparmsout(void *prmsp, pc_vaparms_t *vaparmsp)
 932 {
 933         fxkparms_t      *fxkprmsp = (fxkparms_t *)prmsp;
 934         timestruc_t     ts;
 935         uint_t          cnt;
 936         uint_t          secs;
 937         int             nsecs;
 938         int             priflag, secflag, nsecflag, limflag;
 939         pc_vaparm_t     *vpp = &vaparmsp->pc_parms[0];
 940 
 941         ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
 942 
 943         priflag = secflag = nsecflag = limflag = 0;
 944 
 945         if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
 946                 return (EINVAL);
 947 
 948         if (fxkprmsp->fx_tqntm < 0) {
 949                 /*
 950                  * Quantum field set to special value (e.g. FX_TQINF).
 951                  */
 952                 secs = 0;
 953                 nsecs = fxkprmsp->fx_tqntm;
 954         } else {
 955                 /*
 956                  * Convert quantum from ticks to seconds-nanoseconds.
 957                  */
 958                 TICK_TO_TIMESTRUC(fxkprmsp->fx_tqntm, &ts);
 959                 secs = ts.tv_sec;
 960                 nsecs = ts.tv_nsec;
 961         }
 962 
 963 
 964         for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
 965 
 966                 switch (vpp->pc_key) {
 967                 case FX_KY_UPRILIM:
 968                         if (limflag++)
 969                                 return (EINVAL);
 970                         if (copyout(&fxkprmsp->fx_uprilim,
 971                             (void *)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
 972                                 return (EFAULT);
 973                         break;
 974 
 975                 case FX_KY_UPRI:
 976                         if (priflag++)
 977                                 return (EINVAL);
 978                         if (copyout(&fxkprmsp->fx_upri,
 979                             (void *)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
 980                                 return (EFAULT);
 981                         break;
 982 
 983                 case FX_KY_TQSECS:
 984                         if (secflag++)
 985                                 return (EINVAL);
 986                         if (copyout(&secs,
 987                             (void *)(uintptr_t)vpp->pc_parm, sizeof (uint_t)))
 988                                 return (EFAULT);
 989                         break;
 990 
 991                 case FX_KY_TQNSECS:
 992                         if (nsecflag++)
 993                                 return (EINVAL);
 994                         if (copyout(&nsecs,
 995                             (void *)(uintptr_t)vpp->pc_parm, sizeof (int)))
 996                                 return (EFAULT);
 997                         break;
 998 
 999                 default:
1000                         return (EINVAL);
1001                 }
1002         }
1003 
1004         return (0);
1005 }
1006 
1007 /*
 1008  * Set the scheduling parameters of the thread tx to those specified
 1009  * in the buffer pointed to by parmsp.
1010  */
1011 /* ARGSUSED */
1012 static int
1013 fx_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
1014 {
1015         char            nice;
1016         pri_t           reqfxuprilim;
1017         pri_t           reqfxupri;
1018         fxkparms_t      *fxkparmsp = (fxkparms_t *)parmsp;
1019         fxproc_t        *fxpp;
1020 
1021 
1022         ASSERT(MUTEX_HELD(&(ttoproc(tx))->p_lock));
1023 
1024         thread_lock(tx);
1025         fxpp = (fxproc_t *)tx->t_cldata;
1026 
1027         if ((fxkparmsp->fx_cflags & FX_DOUPRILIM) == 0)
1028                 reqfxuprilim = fxpp->fx_uprilim;
1029         else
1030                 reqfxuprilim = fxkparmsp->fx_uprilim;
1031 
1032         /*
1033          * Basic permissions enforced by generic kernel code
1034          * for all classes require that a thread attempting
1035          * to change the scheduling parameters of a target
1036          * thread be privileged or have a real or effective
1037          * UID matching that of the target thread. We are not
1038          * called unless these basic permission checks have
1039          * already passed. The fixed priority class requires in
1040          * addition that the calling thread be privileged if it
 1041  * is attempting to raise the priority above its current
 1042  * value. This may have been checked previously but if our
1043          * caller passed us a non-NULL credential pointer we assume
1044          * it hasn't and we check it here.
1045          */
1046 
1047         if ((reqpcredp != NULL) &&
1048             (reqfxuprilim > fxpp->fx_uprilim ||
1049             ((fxkparmsp->fx_cflags & FX_DOTQ) != 0)) &&
1050             secpolicy_raisepriority(reqpcredp) != 0) {
1051                 thread_unlock(tx);
1052                 return (EPERM);
1053         }
1054 
1055         FX_ADJUST_PRI(reqfxuprilim);
1056 
1057         if ((fxkparmsp->fx_cflags & FX_DOUPRI) == 0)
1058                 reqfxupri = fxpp->fx_pri;
1059         else
1060                 reqfxupri = fxkparmsp->fx_upri;
1061 
1062 
1063         /*
1064          * Make sure the user priority doesn't exceed the upri limit.
1065          */
1066         if (reqfxupri > reqfxuprilim)
1067                 reqfxupri = reqfxuprilim;
1068 
1069         /*
1070          * Set fx_nice to the nice value corresponding to the user
1071          * priority we are setting.  Note that setting the nice field
1072          * of the parameter struct won't affect upri or nice.
1073          */
1074 
1075         nice = NZERO - (reqfxupri * NZERO) / fx_maxupri;
1076 
1077         if (nice > NZERO)
1078                 nice = NZERO;
1079 
1080         fxpp->fx_uprilim = reqfxuprilim;
1081         fxpp->fx_pri = reqfxupri;
1082 
1083         if (fxkparmsp->fx_tqntm == FX_TQINF)
1084                 fxpp->fx_pquantum = FX_TQINF;
1085         else if (fxkparmsp->fx_tqntm == FX_TQDEF)
1086                 fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1087         else if ((fxkparmsp->fx_cflags & FX_DOTQ) != 0)
1088                 fxpp->fx_pquantum = fxkparmsp->fx_tqntm;
1089 
1090         fxpp->fx_nice = nice;
1091 
1092         fx_change_priority(tx, fxpp);
1093         thread_unlock(tx);
1094         return (0);
1095 }
1096 
1097 
1098 /*
1099  * Return the global scheduling priority that would be assigned
 1100  * to a thread entering the fixed-priority class at the given fx_pri.
1101  */
1102 static pri_t
1103 fx_globpri(kthread_t *t)
1104 {
1105         fxproc_t *fxpp;
1106 
1107         ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
1108 
1109         fxpp = (fxproc_t *)t->t_cldata;
1110         return (fx_dptbl[fxpp->fx_pri].fx_globpri);
1111 
1112 }
1113 
1114 /*
1115  * Arrange for thread to be placed in appropriate location
1116  * on dispatcher queue.
1117  *
1118  * This is called with the current thread in TS_ONPROC and locked.
1119  */
1120 static void
1121 fx_preempt(kthread_t *t)
1122 {
1123         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1124 
1125         ASSERT(t == curthread);
1126         ASSERT(THREAD_LOCK_HELD(curthread));
1127 
1128         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1129 
1130         /*
1131          * Check to see if we're doing "preemption control" here.  If
1132          * we are, and if the user has requested that this thread not
1133          * be preempted, and if preemptions haven't been put off for
1134          * too long, let the preemption happen here but try to make
1135          * sure the thread is rescheduled as soon as possible.  We do
1136          * this by putting it on the front of the highest priority run
1137          * queue in the FX class.  If the preemption has been put off
1138          * for too long, clear the "nopreempt" bit and let the thread
1139          * be preempted.
1140          */
1141         if (t->t_schedctl && schedctl_get_nopreempt(t)) {
1142                 if (fxpp->fx_pquantum == FX_TQINF ||
1143                     fxpp->fx_timeleft > -SC_MAX_TICKS) {
1144                         DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
1145                         schedctl_set_yield(t, 1);
1146                         setfrontdq(t);
1147                         return;
1148                 } else {
1149                         schedctl_set_nopreempt(t, 0);
1150                         DTRACE_SCHED1(schedctl__preempt, kthread_t *, t);
1151                         TNF_PROBE_2(schedctl_preempt, "schedctl FX fx_preempt",
1152                             /* CSTYLED */, tnf_pid, pid, ttoproc(t)->p_pid,
1153                             tnf_lwpid, lwpid, t->t_tid);
1154                         /*
1155                          * Fall through and be preempted below.
1156                          */
1157                 }
1158         }
1159 
1160         if (FX_HAS_CB(fxpp)) {
1161                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1162                 pri_t   newpri = fxpp->fx_pri;
1163                 FX_CB_PREEMPT(FX_CALLB(fxpp), fxpp->fx_cookie,
1164                     &new_quantum, &newpri);
1165                 FX_ADJUST_QUANTUM(new_quantum);
1166                 if ((int)new_quantum != fxpp->fx_pquantum) {
1167                         fxpp->fx_pquantum = (int)new_quantum;
1168                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1169                 }
1170                 FX_ADJUST_PRI(newpri);
1171                 fxpp->fx_pri = newpri;
1172                 THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1173         }
1174 
1175         /*
1176          * This thread may be placed on wait queue by CPU Caps. In this case we
1177          * do not need to do anything until it is removed from the wait queue.
1178          */
1179         if (CPUCAPS_ENFORCE(t)) {
1180                 return;
1181         }
1182 
1183         if ((fxpp->fx_flags & (FXBACKQ)) == FXBACKQ) {
1184                 fxpp->fx_timeleft = fxpp->fx_pquantum;
1185                 fxpp->fx_flags &= ~FXBACKQ;
1186                 setbackdq(t);
1187         } else {
1188                 setfrontdq(t);
1189         }
1190 }
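
      /*
       * Illustrative userland sketch of the preemption-control interaction
       * handled above (see schedctl(3C)); details are simplified and error
       * handling is omitted:
       *
       *	schedctl_t *scp = schedctl_init();	(map the shared page)
       *
       *	schedctl_start(scp);	(hint: "don't preempt me")
       *	... short critical section, e.g. holding a user-level lock ...
       *	schedctl_stop(scp);	(clear the hint, yield if asked to)
       *
       * While the hint is set, fx_preempt() places the thread back on the
       * front of its dispatch queue instead of preempting it, until the
       * SC_MAX_TICKS grace period is exhausted.
       */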
1191 
1192 static void
1193 fx_setrun(kthread_t *t)
1194 {
1195         fxproc_t *fxpp = (fxproc_t *)(t->t_cldata);
1196 
1197         ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */
1198         fxpp->fx_flags &= ~FXBACKQ;
1199 
1200         if (t->t_disp_time != ddi_get_lbolt())
1201                 setbackdq(t);
1202         else
1203                 setfrontdq(t);
1204 }
1205 
1206 
1207 /*
 1208  * Prepare thread for sleep. Account for the CPU time used since the
 1209  * last tick and notify the registered callback, if any.
1210  */
1211 static void
1212 fx_sleep(kthread_t *t)
1213 {
1214         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1215 
1216         ASSERT(t == curthread);
1217         ASSERT(THREAD_LOCK_HELD(t));
1218 
1219         /*
1220          * Account for time spent on CPU before going to sleep.
1221          */
1222         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1223 
1224         if (FX_HAS_CB(fxpp)) {
1225                 FX_CB_SLEEP(FX_CALLB(fxpp), fxpp->fx_cookie);
1226         }
1227         t->t_stime = ddi_get_lbolt();                /* time stamp for the swapper */
1228 }
1229 
1230 
1231 /*
1232  * Return Values:
1233  *
1234  *      -1 if the thread is loaded or is not eligible to be swapped in.
1235  *
 1236  * FX and RT threads are designed so that they don't swap out; however,
1237  * it is possible that while the thread is swapped out and in another class, it
1238  * can be changed to FX or RT.  Since these threads should be swapped in
1239  * as soon as they're runnable, rt_swapin returns SHRT_MAX, and fx_swapin
1240  * returns SHRT_MAX - 1, so that it gives deference to any swapped out
1241  * RT threads.
1242  */
1243 /* ARGSUSED */
1244 static pri_t
1245 fx_swapin(kthread_t *t, int flags)
1246 {
1247         pri_t   tpri = -1;
1248 
1249         ASSERT(THREAD_LOCK_HELD(t));
1250 
1251         if (t->t_state == TS_RUN && (t->t_schedflag & TS_LOAD) == 0) {
1252                 tpri = (pri_t)SHRT_MAX - 1;
1253         }
1254 
1255         return (tpri);
1256 }
1257 
1258 /*
1259  * Return Values
1260  *      -1 if the thread isn't loaded or is not eligible to be swapped out.
1261  */
1262 /* ARGSUSED */
1263 static pri_t
1264 fx_swapout(kthread_t *t, int flags)
1265 {
1266         ASSERT(THREAD_LOCK_HELD(t));
1267 
1268         return (-1);
1269 
1270 }
1271 
1272 /* ARGSUSED */
1273 static void
1274 fx_stop(kthread_t *t, int why, int what)
1275 {
1276         fxproc_t *fxpp = (fxproc_t *)(t->t_cldata);
1277 
1278         ASSERT(THREAD_LOCK_HELD(t));
1279 
1280         if (FX_HAS_CB(fxpp)) {
1281                 FX_CB_STOP(FX_CALLB(fxpp), fxpp->fx_cookie);
1282         }
1283 }
1284 
1285 /*
1286  * Check for time slice expiration.  If time slice has expired
1287  * set runrun to cause preemption.
1288  */
1289 static void
1290 fx_tick(kthread_t *t)
1291 {
1292         boolean_t call_cpu_surrender = B_FALSE;
1293         fxproc_t *fxpp;
1294 
1295         ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1296 
1297         thread_lock(t);
1298 
1299         fxpp = (fxproc_t *)(t->t_cldata);
1300 
1301         if (FX_HAS_CB(fxpp)) {
1302                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1303                 pri_t   newpri = fxpp->fx_pri;
1304                 FX_CB_TICK(FX_CALLB(fxpp), fxpp->fx_cookie,
1305                     &new_quantum, &newpri);
1306                 FX_ADJUST_QUANTUM(new_quantum);
1307                 if ((int)new_quantum != fxpp->fx_pquantum) {
1308                         fxpp->fx_pquantum = (int)new_quantum;
1309                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1310                 }
1311                 FX_ADJUST_PRI(newpri);
1312                 if (newpri != fxpp->fx_pri) {
1313                         fxpp->fx_pri = newpri;
1314                         fx_change_priority(t, fxpp);
1315                 }
1316         }
1317 
1318         /*
1319          * Keep track of thread's project CPU usage.  Note that projects
1320          * get charged even when threads are running in the kernel.
1321          */
1322         call_cpu_surrender =  CPUCAPS_CHARGE(t, &fxpp->fx_caps,
1323             CPUCAPS_CHARGE_ENFORCE);
1324 
1325         if ((fxpp->fx_pquantum != FX_TQINF) &&
1326             (--fxpp->fx_timeleft <= 0)) {
1327                 pri_t   new_pri;
1328 
1329                 /*
1330                  * If we're doing preemption control and trying to
1331                  * avoid preempting this thread, just note that
1332                  * the thread should yield soon and let it keep
1333                  * running (unless it's been a while).
1334                  */
1335                 if (t->t_schedctl && schedctl_get_nopreempt(t)) {
1336                         if (fxpp->fx_timeleft > -SC_MAX_TICKS) {
1337                                 DTRACE_SCHED1(schedctl__nopreempt,
1338                                     kthread_t *, t);
1339                                 schedctl_set_yield(t, 1);
1340                                 thread_unlock_nopreempt(t);
1341                                 return;
1342                         }
1343                         TNF_PROBE_2(schedctl_failsafe,
1344                             "schedctl FX fx_tick", /* CSTYLED */,
1345                             tnf_pid, pid, ttoproc(t)->p_pid,
1346                             tnf_lwpid, lwpid, t->t_tid);
1347                 }
1348                 new_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
1349                 ASSERT(new_pri >= 0 && new_pri <= fx_maxglobpri);
1350                 /*
1351                  * When the priority of a thread is changed,
1352                  * it may be necessary to adjust its position
1353                  * on a sleep queue or dispatch queue. Even
1354                  * when the priority is not changed, we need
1355                  * to preserve round robin on dispatch queue.
1356                  * The function thread_change_pri accomplishes
1357                  * this.
1358                  */
1359                 if (thread_change_pri(t, new_pri, 0)) {
1360                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1361                 } else {
1362                         call_cpu_surrender = B_TRUE;
1363                 }
1364         } else if (t->t_state == TS_ONPROC &&
1365             t->t_pri < t->t_disp_queue->disp_maxrunpri) {
1366                 call_cpu_surrender = B_TRUE;
1367         }
1368 
1369         if (call_cpu_surrender) {
1370                 fxpp->fx_flags |= FXBACKQ;
1371                 cpu_surrender(t);
1372         }
1373         thread_unlock_nopreempt(t);     /* clock thread can't be preempted */
1374 }
1375 
1376 
1377 static void
1378 fx_trapret(kthread_t *t)
1379 {
1380         cpu_t           *cp = CPU;
1381 
1382         ASSERT(THREAD_LOCK_HELD(t));
1383         ASSERT(t == curthread);
1384         ASSERT(cp->cpu_dispthread == t);
1385         ASSERT(t->t_state == TS_ONPROC);
1386 }
1387 
1388 
1389 /*
 1390  * Threads waking up go to the back of their dispatch queue.
1391  */
1392 static void
1393 fx_wakeup(kthread_t *t)
1394 {
1395         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1396 
1397         ASSERT(THREAD_LOCK_HELD(t));
1398 
1399         t->t_stime = ddi_get_lbolt();                /* time stamp for the swapper */
1400         if (FX_HAS_CB(fxpp)) {
1401                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1402                 pri_t   newpri = fxpp->fx_pri;
1403                 FX_CB_WAKEUP(FX_CALLB(fxpp), fxpp->fx_cookie,
1404                     &new_quantum, &newpri);
1405                 FX_ADJUST_QUANTUM(new_quantum);
1406                 if ((int)new_quantum != fxpp->fx_pquantum) {
1407                         fxpp->fx_pquantum = (int)new_quantum;
1408                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1409                 }
1410 
1411                 FX_ADJUST_PRI(newpri);
1412                 if (newpri != fxpp->fx_pri) {
1413                         fxpp->fx_pri = newpri;
1414                         THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1415                 }
1416         }
1417 
1418         fxpp->fx_flags &= ~FXBACKQ;
1419 
1420         if (t->t_disp_time != ddi_get_lbolt())
1421                 setbackdq(t);
1422         else
1423                 setfrontdq(t);
1424 }
1425 
1426 
1427 /*
1428  * When a thread yields, put it on the back of the run queue.
1429  */
1430 static void
1431 fx_yield(kthread_t *t)
1432 {
1433         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1434 
1435         ASSERT(t == curthread);
1436         ASSERT(THREAD_LOCK_HELD(t));
1437 
1438         /*
1439          * Collect CPU usage spent before yielding CPU.
1440          */
1441         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1442 
1443         if (FX_HAS_CB(fxpp))  {
1444                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1445                 pri_t   newpri = fxpp->fx_pri;
1446                 FX_CB_PREEMPT(FX_CALLB(fxpp), fxpp->fx_cookie,
1447                     &new_quantum, &newpri);
1448                 FX_ADJUST_QUANTUM(new_quantum);
1449                 if ((int)new_quantum != fxpp->fx_pquantum) {
1450                         fxpp->fx_pquantum = (int)new_quantum;
1451                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1452                 }
1453                 FX_ADJUST_PRI(newpri);
1454                 fxpp->fx_pri = newpri;
1455                 THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1456         }
1457 
1458         /*
1459          * Clear the preemption control "yield" bit since the user is
1460          * doing a yield.
1461          */
1462         if (t->t_schedctl)
1463                 schedctl_set_yield(t, 0);
1464 
1465         if (fxpp->fx_timeleft <= 0) {
1466                 /*
1467                  * Time slice was artificially extended to avoid
1468                  * preemption, so pretend we're preempting it now.
1469                  */
1470                 DTRACE_SCHED1(schedctl__yield, int, -fxpp->fx_timeleft);
1471                 fxpp->fx_timeleft = fxpp->fx_pquantum;
1472                 THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1473                 ASSERT(t->t_pri >= 0 && t->t_pri <= fx_maxglobpri);
1474         }
1475 
1476         fxpp->fx_flags &= ~FXBACKQ;
1477         setbackdq(t);
1478 }
1479 
1480 /*
1481  * Increment the nice value of the specified thread by incr and
1482  * return the new value in *retvalp.
1483  */
1484 static int
1485 fx_donice(kthread_t *t, cred_t *cr, int incr, int *retvalp)
1486 {
1487         int             newnice;
1488         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1489         fxkparms_t      fxkparms;
1490 
1491         ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1492 
 1493         /* If there's no change to the nice value, return current setting */
1494         if (incr == 0) {
1495                 if (retvalp) {
1496                         *retvalp = fxpp->fx_nice - NZERO;
1497                 }
1498                 return (0);
1499         }
1500 
1501         if ((incr < 0 || incr > 2 * NZERO) &&
1502             secpolicy_raisepriority(cr) != 0)
1503                 return (EPERM);
1504 
1505         /*
1506          * Specifying a nice increment greater than the upper limit of
1507          * 2 * NZERO - 1 will result in the thread's nice value being
1508          * set to the upper limit.  We check for this before computing
1509          * the new value because otherwise we could get overflow
1510          * if a privileged user specified some ridiculous increment.
1511          */
1512         if (incr > 2 * NZERO - 1)
1513                 incr = 2 * NZERO - 1;
1514 
1515         newnice = fxpp->fx_nice + incr;
1516         if (newnice > NZERO)
1517                 newnice = NZERO;
1518         else if (newnice < 0)
1519                 newnice = 0;
1520 
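             /*
              * Convert the clamped nice value back to a user priority.
              * The mapping is linear: newnice == 0 yields the maximum
              * user priority (fx_maxupri) and newnice == NZERO yields 0,
              * so a "nicer" thread gets a lower FX user priority.
              */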
1521         fxkparms.fx_uprilim = fxkparms.fx_upri =
1522             -((newnice - NZERO) * fx_maxupri) / NZERO;
1523 
1524         fxkparms.fx_cflags = FX_DOUPRILIM | FX_DOUPRI;
1525 
1526         fxkparms.fx_tqntm = FX_TQDEF;
1527 
1528         /*
1529          * Reset the uprilim and upri values of the thread. Adjust
1530          * time quantum accordingly.
1531          */
1532 
1533         (void) fx_parmsset(t, (void *)&fxkparms, (id_t)0, (cred_t *)NULL);
1534 
1535         /*
1536          * Although fx_parmsset() already reset fx_nice, it may not have
1537          * been set to precisely the value calculated above, because
1538          * fx_parmsset() derives the nice value from the user priority
1539          * and the integer conversion from nice value to user priority
1540          * and back may have truncated.  Reset fx_nice to the value
1541          * calculated above.
1542          */
1543         fxpp->fx_nice = (char)newnice;
1544 
1545         if (retvalp)
1546                 *retvalp = newnice - NZERO;
1547 
1548         return (0);
1549 }
1550 
1551 /*
1552  * Increment the priority of the specified thread by incr and
1553  * return the new value in *retvalp.
1554  */
1555 static int
1556 fx_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp)
1557 {
1558         int             newpri;
1559         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1560         fxkparms_t      fxkparms;
1561 
1562         ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1563 
1564         /* If there's no change to priority, just return current setting */
1565         if (incr == 0) {
1566                 *retvalp = fxpp->fx_pri;
1567                 return (0);
1568         }
1569 
1570         newpri = fxpp->fx_pri + incr;
1571         if (newpri > fx_maxupri || newpri < 0)
1572                 return (EINVAL);
1573 
1574         *retvalp = newpri;
1575         fxkparms.fx_uprilim = fxkparms.fx_upri = newpri;
1576         fxkparms.fx_tqntm = FX_NOCHANGE;
1577         fxkparms.fx_cflags = FX_DOUPRILIM | FX_DOUPRI;
1578 
1579         /*
1580          * Reset the uprilim and upri values of the thread.
1581          */
1582         return (fx_parmsset(t, (void *)&fxkparms, (id_t)0, cr));
1583 }
1584 
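     /*
      * Recompute the thread's global priority from its FX class priority
      * (fx_pri) and apply it.  If the thread is running, make it surrender
      * the CPU when the dispatcher requires it; otherwise let
      * thread_change_pri() adjust its position on a sleep or dispatch
      * queue.  Callers must hold the thread lock.
      */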
1585 static void
1586 fx_change_priority(kthread_t *t, fxproc_t *fxpp)
1587 {
1588         pri_t   new_pri;
1589 
1590         ASSERT(THREAD_LOCK_HELD(t));
1591         new_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
1592         ASSERT(new_pri >= 0 && new_pri <= fx_maxglobpri);
1593         t->t_cpri = fxpp->fx_pri;
1594         if (t == curthread || t->t_state == TS_ONPROC) {
1595                 /* curthread is always onproc */
1596                 cpu_t   *cp = t->t_disp_queue->disp_cpu;
1597                 THREAD_CHANGE_PRI(t, new_pri);
1598                 if (t == cp->cpu_dispthread)
1599                         cp->cpu_dispatch_pri = DISP_PRIO(t);
1600                 if (DISP_MUST_SURRENDER(t)) {
1601                         fxpp->fx_flags |= FXBACKQ;
1602                         cpu_surrender(t);
1603                 } else {
1604                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1605                 }
1606         } else {
1607                 /*
1608                  * When the priority of a thread is changed,
1609                  * it may be necessary to adjust its position
1610                  * on a sleep queue or dispatch queue.
1611                  * The function thread_change_pri accomplishes
1612                  * this.
1613                  */
1614                 if (thread_change_pri(t, new_pri, 0)) {
1615                         /*
1616                          * The thread was on a run queue. Reset
1617                          * its CPU timeleft from the quantum
1618                          * associated with the new priority.
1619                          */
1620                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1621                 } else {
1622                         fxpp->fx_flags |= FXBACKQ;
1623                 }
1624         }
1625 }
1626 
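     /*
      * Allocate the per-thread FX class data (an fxproc_t).  Returns
      * ENOMEM if the allocation fails (e.g. with KM_NOSLEEP).
      */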
1627 static int
1628 fx_alloc(void **p, int flag)
1629 {
1630         void *bufp;
1631 
1632         bufp = kmem_alloc(sizeof (fxproc_t), flag);
1633         if (bufp == NULL) {
1634                 return (ENOMEM);
1635         } else {
1636                 *p = bufp;
1637                 return (0);
1638         }
1639 }
1640 
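     /*
      * Free an fxproc_t previously allocated by fx_alloc().
      */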
1641 static void
1642 fx_free(void *bufp)
1643 {
1644         if (bufp)
1645                 kmem_free(bufp, sizeof (fxproc_t));
1646 }
1647 
1648 /*
1649  * Release the callback list mutex acquired by a successful fx_list_lookup().
1650  */
1651 void
1652 fx_list_release(fxproc_t *fxpp)
1653 {
1654         int index = FX_CB_LIST_HASH(fxpp->fx_ktid);
1655         kmutex_t *lockp = &fx_cb_list_lock[index];
1656         mutex_exit(lockp);
1657 }
1658 
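     /*
      * Look up the fxproc_t with a registered callback for the given kernel
      * thread id; the thread must still be in the FX class.  On success the
      * hash bucket's lock is returned held and the caller must drop it with
      * fx_list_release(); on failure the lock is dropped and NULL is
      * returned.
      */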
1659 fxproc_t *
1660 fx_list_lookup(kt_did_t ktid)
1661 {
1662         int index = FX_CB_LIST_HASH(ktid);
1663         kmutex_t *lockp = &fx_cb_list_lock[index];
1664         fxproc_t *fxpp;
1665 
1666         mutex_enter(lockp);
1667 
1668         for (fxpp = fx_cb_plisthead[index].fx_cb_next;
1669             fxpp != &fx_cb_plisthead[index]; fxpp = fxpp->fx_cb_next) {
1670                 if (fxpp->fx_tp->t_cid == fx_cid && fxpp->fx_ktid == ktid &&
1671                     fxpp->fx_callback != NULL) {
1672                         /*
1673                          * The caller is responsible for calling
1674                          * fx_list_release to drop the lock upon
1675                          * successful lookup
1676                          */
1677                         return (fxpp);
1678                 }
1679         }
1680         mutex_exit(lockp);
1681         return ((fxproc_t *)NULL);
1682 }
1683 
1684 
1685 /*
1686  * Register a set of callback routines for the current thread.
1687  * The thread must already be in the FX class.
1688  */
1689 int
1690 fx_register_callbacks(fx_callbacks_t *fx_callback, fx_cookie_t cookie,
1691         pri_t pri, clock_t quantum)
1692 {
1693 
1694         fxproc_t        *fxpp;
1695 
1696         if (fx_callback == NULL)
1697                 return (EINVAL);
1698 
1699         if (secpolicy_dispadm(CRED()) != 0)
1700                 return (EPERM);
1701 
1702         if (FX_CB_VERSION(fx_callback) != FX_CALLB_REV)
1703                 return (EINVAL);
1704 
1705         if (!FX_ISVALID(pri, quantum))
1706                 return (EINVAL);
1707 
1708         thread_lock(curthread);         /* get dispatcher lock on thread */
1709 
1710         if (curthread->t_cid != fx_cid) {
1711                 thread_unlock(curthread);
1712                 return (EINVAL);
1713         }
1714 
1715         fxpp = (fxproc_t *)(curthread->t_cldata);
1716         ASSERT(fxpp != NULL);
1717         if (FX_HAS_CB(fxpp)) {
1718                 thread_unlock(curthread);
1719                 return (EINVAL);
1720         }
1721 
1722         fxpp->fx_callback = fx_callback;
1723         fxpp->fx_cookie = cookie;
1724 
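             /*
              * A new priority, if given, is applied first; FX_TQDEF then
              * selects the default quantum for that new priority.  If only
              * a quantum change is requested, FX_TQDEF leaves the current
              * quantum alone.
              */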
1725         if (pri != FX_CB_NOCHANGE) {
1726                 fxpp->fx_pri = pri;
1727                 FX_ADJUST_PRI(fxpp->fx_pri);
1728                 if (quantum == FX_TQDEF) {
1729                         fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1730                 } else if (quantum == FX_TQINF) {
1731                         fxpp->fx_pquantum = FX_TQINF;
1732                 } else if (quantum != FX_NOCHANGE) {
1733                         FX_ADJUST_QUANTUM(quantum);
1734                         fxpp->fx_pquantum = quantum;
1735                 }
1736         } else if (quantum != FX_NOCHANGE && quantum != FX_TQDEF) {
1737                 if (quantum == FX_TQINF)
1738                         fxpp->fx_pquantum = FX_TQINF;
1739                 else {
1740                         FX_ADJUST_QUANTUM(quantum);
1741                         fxpp->fx_pquantum = quantum;
1742                 }
1743         }
1744 
1745         fxpp->fx_ktid = ddi_get_kt_did();
1746 
1747         fx_change_priority(curthread, fxpp);
1748 
1749         thread_unlock(curthread);
1750 
1751         /*
1752          * Link new structure into fxproc list.
1753          */
1754         FX_CB_LIST_INSERT(fxpp);
1755         return (0);
1756 }
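     /*
      * Illustrative sketch only (not part of the FX class itself): a kernel
      * consumer whose thread is already in the FX class might use the
      * callback interface roughly as follows.  my_fx_cbs and my_state are
      * hypothetical names; the callback vector is assumed to be initialized
      * elsewhere with version FX_CALLB_REV.
      *
      *     extern fx_callbacks_t my_fx_cbs;
      *     fx_cookie_t my_state;
      *     int error;
      *
      *     error = fx_register_callbacks(&my_fx_cbs, my_state,
      *         FX_CB_NOCHANGE, FX_NOCHANGE);
      *     if (error != 0)
      *             return (error);
      *
      *     ... perform work; the FX class invokes the callbacks on
      *     scheduling events ...
      *
      *     (void) fx_unregister_callbacks();
      */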
1757 
1758 /* Unregister the callback routines for the current thread. */
1759 int
1760 fx_unregister_callbacks()
1761 {
1762         fxproc_t        *fxpp;
1763 
1764         if ((fxpp = fx_list_lookup(ddi_get_kt_did())) == NULL) {
1765                 /*
1766                  * The current thread does not have a registered callback.
1767                  */
1768                 return (EINVAL);
1769         }
1770 
1771         thread_lock(fxpp->fx_tp);
1772         fxpp->fx_callback = NULL;
1773         fxpp->fx_cookie = NULL;
1774         thread_unlock(fxpp->fx_tp);
1775         fx_list_release(fxpp);
1776 
1777         FX_CB_LIST_DELETE(fxpp);
1778         return (0);
1779 }
1780 
1781 /*
1782  * Modify the priority and/or quantum of a thread with a registered callback.
1783  */
1784 int
1785 fx_modify_priority(kt_did_t ktid, clock_t quantum, pri_t pri)
1786 {
1787         fxproc_t        *fxpp;
1788 
1789         if (!FX_ISVALID(pri, quantum))
1790                 return (EINVAL);
1791 
1792         if ((fxpp = fx_list_lookup(ktid)) == NULL) {
1793                 /*
1794                  * Either the thread has exited or it does not have a
1795                  * registered callback.
1796                  */
1797                 return (ESRCH);
1798         }
1799 
1800         thread_lock(fxpp->fx_tp);
1801 
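             /*
              * Apply the same pri/quantum precedence rules as
              * fx_register_callbacks().
              */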
1802         if (pri != FX_CB_NOCHANGE) {
1803                 fxpp->fx_pri = pri;
1804                 FX_ADJUST_PRI(fxpp->fx_pri);
1805                 if (quantum == FX_TQDEF) {
1806                         fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1807                 } else if (quantum == FX_TQINF) {
1808                         fxpp->fx_pquantum = FX_TQINF;
1809                 } else if (quantum != FX_NOCHANGE) {
1810                         FX_ADJUST_QUANTUM(quantum);
1811                         fxpp->fx_pquantum = quantum;
1812                 }
1813         } else if (quantum != FX_NOCHANGE && quantum != FX_TQDEF) {
1814                 if (quantum == FX_TQINF) {
1815                         fxpp->fx_pquantum = FX_TQINF;
1816                 } else {
1817                         FX_ADJUST_QUANTUM(quantum);
1818                         fxpp->fx_pquantum = quantum;
1819                 }
1820         }
1821 
1822         fx_change_priority(fxpp->fx_tp, fxpp);
1823 
1824         thread_unlock(fxpp->fx_tp);
1825         fx_list_release(fxpp);
1826         return (0);
1827 }
1828 
1829 
1830 /*
1831  * Return an iblock cookie for initializing mutexes used from callbacks.
1832  */
1833 void *
1834 fx_get_mutex_cookie()
1835 {
1836         return ((void *)(uintptr_t)__ipltospl(DISP_LEVEL));
1837 }
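     /*
      * Illustrative sketch only: a callback consumer might use this cookie
      * to initialize a spin mutex that its callbacks can acquire.  The
      * name my_cb_lock is hypothetical.
      *
      *     kmutex_t my_cb_lock;
      *
      *     mutex_init(&my_cb_lock, NULL, MUTEX_SPIN, fx_get_mutex_cookie());
      */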
1838 
1839 /*
1840  * return maximum relative priority
1841  */
1842 pri_t
1843 fx_get_maxpri()
1844 {
1845         return (fx_maxumdpri);
1846 }