1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /*
  26  * Copyright (c) 2009,  Intel Corporation.
  27  * All Rights Reserved.
  28  */
  29 
  30 /*
  31  * CPU Device driver. The driver is not DDI-compliant.
  32  *
 * The driver supports the following features:
  34  *      - Power management.
  35  */
  36 
  37 #include <sys/types.h>
  38 #include <sys/param.h>
  39 #include <sys/errno.h>
  40 #include <sys/modctl.h>
  41 #include <sys/kmem.h>
  42 #include <sys/conf.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/stat.h>
  45 #include <sys/debug.h>
  46 #include <sys/systm.h>
  47 #include <sys/ddi.h>
  48 #include <sys/sunddi.h>
  49 #include <sys/sdt.h>
  50 #include <sys/epm.h>
  51 #include <sys/machsystm.h>
  52 #include <sys/x_call.h>
  53 #include <sys/cpudrv_mach.h>
  54 #include <sys/msacct.h>
  55 
  56 /*
  57  * CPU power management
  58  *
  59  * The supported power saving model is to slow down the CPU (on SPARC by
  60  * dividing the CPU clock and on x86 by dropping down a P-state).
 * Periodically we determine the amount of time the CPU spent running the
 * idle thread and threads in user mode during the last quantum.  If the idle
 * thread was running less than its low water mark for the current speed for
 * a number of consecutive sampling periods, or the number of running threads
 * in user mode is above its high water mark, we arrange to go to the higher
 * speed.  If the idle thread stayed above its high water mark for a number
 * of consecutive sampling periods without dropping below the mark, and the
 * number of threads running in user mode is below its low water mark, we
 * arrange to go to the next lower speed.  While going down, we go through
 * all the speeds.  While going up, we go to the maximum speed to minimize
 * impact on the user, but have provisions in the driver to go to other
 * speeds.
  72  *
  73  * The driver does not have knowledge of a particular implementation of this
  74  * scheme and will work with all CPUs supporting this model. On SPARC, the
  75  * driver determines supported speeds by looking at 'clock-divisors' property
  76  * created by OBP. On x86, the driver retrieves the supported speeds from
  77  * ACPI.
  78  */
  79 
/*
 * Configuration function prototypes and data structures
 *
 * The attach(9e), detach(9e) and power(9e) entry points are declared here
 * so that they can be named in the dev_ops initializer before they are
 * defined.
 */
static int cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int cpudrv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int cpudrv_power(dev_info_t *dip, int comp, int level);
  86 
/*
 * Device operations vector for the CPU driver.  Only the attach, detach and
 * power slots point at routines implemented in this file; the driver
 * supplies no cb_ops and no bus_ops.
 */
struct dev_ops cpudrv_ops = {
        DEVO_REV,               /* rev */
        0,                      /* refcnt */
        nodev,                  /* getinfo */
        nulldev,                /* identify */
        nulldev,                /* probe */
        cpudrv_attach,          /* attach */
        cpudrv_detach,          /* detach */
        nodev,                  /* reset */
        (struct cb_ops *)NULL,  /* cb_ops */
        (struct bus_ops *)NULL, /* bus_ops */
        cpudrv_power,           /* power */
        ddi_quiesce_not_needed,         /* quiesce */
};
 101 
/*
 * Module linkage information for a device driver: ties the driver's
 * description string to cpudrv_ops.
 */
static struct modldrv modldrv = {
        &mod_driverops,                     /* modops */
        "CPU Driver",                   /* linkinfo */
        &cpudrv_ops,                        /* dev_ops */
};
 107 
/*
 * Top-level module linkage handed to mod_install(), mod_remove() and
 * mod_info() by _init(), _fini() and _info().  The list of linkage
 * structures is NULL-terminated.
 */
static struct modlinkage modlinkage = {
        MODREV_1,               /* rev */
        &modldrv,           /* linkage */
        NULL
};
 113 
/*
 * Function prototypes
 *
 * Forward declarations of the file-local (static) helper routines.
 */
static int cpudrv_init(cpudrv_devstate_t *cpudsp);
static void cpudrv_free(cpudrv_devstate_t *cpudsp);
static int cpudrv_comp_create(cpudrv_devstate_t *cpudsp);
static void cpudrv_monitor_disp(void *arg);
static void cpudrv_monitor(void *arg);
 122 
/*
 * Driver global variables
 */
/* Debug control word, off by default.  Presumably the DPRINTF mask - confirm. */
uint_t cpudrv_debug = 0;
/* Soft state anchor; set up by ddi_soft_state_init() in _init(). */
void *cpudrv_state;
/*
 * Tunables seeded from the CPUDRV_* defaults.  cpudrv_init() applies
 * cpudrv_idle_hwm, cpudrv_idle_lwm, cpudrv_idle_buf_zone and
 * cpudrv_user_hwm as percentages when deriving the per-speed watermarks.
 */
static uint_t cpudrv_idle_hwm = CPUDRV_IDLE_HWM;
static uint_t cpudrv_idle_lwm = CPUDRV_IDLE_LWM;
static uint_t cpudrv_idle_buf_zone = CPUDRV_IDLE_BUF_ZONE;
/*
 * NOTE(review): the two *_cnt_max tunables are not referenced in the part of
 * the file reviewed here; presumably they bound the number of consecutive
 * samples beyond a watermark before a speed change - confirm.
 */
static uint_t cpudrv_idle_bhwm_cnt_max = CPUDRV_IDLE_BHWM_CNT_MAX;
static uint_t cpudrv_idle_blwm_cnt_max = CPUDRV_IDLE_BLWM_CNT_MAX;
static uint_t cpudrv_user_hwm = CPUDRV_USER_HWM;

/*
 * Starts out B_TRUE and is cleared by cpudrv_attach() whenever a DDI_ATTACH
 * fails after the initial enabled check.
 */
boolean_t cpudrv_enabled = B_TRUE;
 136 
/*
 * cpudrv_direct_pm allows user applications to directly control the
 * power state transitions (direct pm) without following the normal
 * direct pm protocol. This is needed because the normal protocol
 * requires that a device only be lowered when it is idle, and be
 * brought up when it requests to do so by calling pm_raise_power().
 * Ignoring this protocol is harmless for CPU (other than speed).
 * Moreover it might be the case that CPU is never idle or wants
 * to be at higher speed because of the additional CPU cycles required
 * to run the user application.
 *
 * The driver will still report idle/busy status to the framework. Although
 * the framework will ignore this information for direct pm devices and not
 * try to bring them down when idle, user applications can still use this
 * information if they want.
 *
 * In the future, provide an ioctl to control setting of this mode. In
 * that case, this variable should move to the state structure and
 * be protected by the lock in the state structure.
 */
int cpudrv_direct_pm = 0;
 158 
/*
 * Arranges for the handler function to be called at the interval suitable
 * for current speed.
 *
 * If cpudrv_is_enabled(cpudsp) is true, arms a timeout(9F) for
 * cpudrv_monitor_disp() with cpudsp as its argument and records the id in
 * cpudrv_pm.timeout_id.  The delay is the current speed's quant_cnt, or
 * CPUDRV_QUANT_CNT_OTHR while the current speed is unknown
 * (cur_spd == NULL).  Otherwise it does nothing.
 *
 * The caller must hold cpudsp->lock (asserted when a timeout is armed).
 * The macro expands to a bare brace-enclosed block, not a do/while (0)
 * statement.
 */
#define CPUDRV_MONITOR_INIT(cpudsp) { \
    if (cpudrv_is_enabled(cpudsp)) {          \
                ASSERT(mutex_owned(&(cpudsp)->lock)); \
                (cpudsp)->cpudrv_pm.timeout_id = \
                    timeout(cpudrv_monitor_disp, \
                    (cpudsp), (((cpudsp)->cpudrv_pm.cur_spd == NULL) ? \
                    CPUDRV_QUANT_CNT_OTHR : \
                    (cpudsp)->cpudrv_pm.cur_spd->quant_cnt)); \
        } \
}
 173 
/*
 * Arranges for the handler function not to be called back.
 *
 * The caller must hold cpudsp->lock on entry and holds it again when the
 * macro completes, but the lock is dropped in between, so state protected
 * by it may change across the macro.  The sequence is:
 *   - save and clear cpudrv_pm.timeout_id while still holding the lock;
 *   - drop the lock;
 *   - if a timeout id was recorded, untimeout(9F) it and then wait on
 *     timeout_cv (under timeout_lock) until timeout_count drops to zero;
 *   - re-acquire the lock.
 *
 * The macro expands to a bare brace-enclosed block that declares its own
 * local (tmp_tid), not a do/while (0) statement.
 */
#define CPUDRV_MONITOR_FINI(cpudsp) { \
        timeout_id_t tmp_tid; \
        ASSERT(mutex_owned(&(cpudsp)->lock)); \
        tmp_tid = (cpudsp)->cpudrv_pm.timeout_id; \
        (cpudsp)->cpudrv_pm.timeout_id = 0; \
        mutex_exit(&(cpudsp)->lock); \
        if (tmp_tid != 0) { \
                (void) untimeout(tmp_tid); \
                mutex_enter(&(cpudsp)->cpudrv_pm.timeout_lock); \
                while ((cpudsp)->cpudrv_pm.timeout_count != 0) \
                        cv_wait(&(cpudsp)->cpudrv_pm.timeout_cv, \
                            &(cpudsp)->cpudrv_pm.timeout_lock); \
                mutex_exit(&(cpudsp)->cpudrv_pm.timeout_lock); \
        } \
        mutex_enter(&(cpudsp)->lock); \
}
 193 
 194 int
 195 _init(void)
 196 {
 197         int     error;
 198 
 199         DPRINTF(D_INIT, (" _init: function called\n"));
 200         if ((error = ddi_soft_state_init(&cpudrv_state,
 201             sizeof (cpudrv_devstate_t), 0)) != 0) {
 202                 return (error);
 203         }
 204 
 205         if ((error = mod_install(&modlinkage)) != 0)  {
 206                 ddi_soft_state_fini(&cpudrv_state);
 207         }
 208 
 209         /*
 210          * Callbacks used by the PPM driver.
 211          */
 212         CPUDRV_SET_PPM_CALLBACKS();
 213         return (error);
 214 }
 215 
 216 int
 217 _fini(void)
 218 {
 219         int     error;
 220 
 221         DPRINTF(D_FINI, (" _fini: function called\n"));
 222         if ((error = mod_remove(&modlinkage)) == 0) {
 223                 ddi_soft_state_fini(&cpudrv_state);
 224         }
 225 
 226         return (error);
 227 }
 228 
 229 int
 230 _info(struct modinfo *modinfop)
 231 {
 232         return (mod_info(&modlinkage, modinfop));
 233 }
 234 
 235 /*
 236  * Driver attach(9e) entry point.
 237  */
 238 static int
 239 cpudrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 240 {
 241         int                     instance;
 242         cpudrv_devstate_t       *cpudsp;
 243 
 244         instance = ddi_get_instance(dip);
 245 
 246         switch (cmd) {
 247         case DDI_ATTACH:
 248                 DPRINTF(D_ATTACH, ("cpudrv_attach: instance %d: "
 249                     "DDI_ATTACH called\n", instance));
 250                 if (!cpudrv_is_enabled(NULL))
 251                         return (DDI_FAILURE);
 252                 if (ddi_soft_state_zalloc(cpudrv_state, instance) !=
 253                     DDI_SUCCESS) {
 254                         cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
 255                             "can't allocate state", instance);
 256                         cpudrv_enabled = B_FALSE;
 257                         return (DDI_FAILURE);
 258                 }
 259                 if ((cpudsp = ddi_get_soft_state(cpudrv_state, instance)) ==
 260                     NULL) {
 261                         cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
 262                             "can't get state", instance);
 263                         ddi_soft_state_free(cpudrv_state, instance);
 264                         cpudrv_enabled = B_FALSE;
 265                         return (DDI_FAILURE);
 266                 }
 267                 cpudsp->dip = dip;
 268 
 269                 /*
 270                  * Find CPU number for this dev_info node.
 271                  */
 272                 if (!cpudrv_get_cpu_id(dip, &(cpudsp->cpu_id))) {
 273                         cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
 274                             "can't convert dip to cpu_id", instance);
 275                         ddi_soft_state_free(cpudrv_state, instance);
 276                         cpudrv_enabled = B_FALSE;
 277                         return (DDI_FAILURE);
 278                 }
 279 
 280                 if (!cpudrv_is_enabled(cpudsp)) {
 281                         cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
 282                             "not supported or it got disabled on us",
 283                             instance);
 284                         cpudrv_enabled = B_FALSE;
 285                         ddi_soft_state_free(cpudrv_state, instance);
 286                         return (DDI_FAILURE);
 287                 }
 288 
 289                 mutex_init(&cpudsp->lock, NULL, MUTEX_DRIVER, NULL);
 290                 if (cpudrv_init(cpudsp) != DDI_SUCCESS) {
 291                         cpudrv_enabled = B_FALSE;
 292                         cpudrv_free(cpudsp);
 293                         ddi_soft_state_free(cpudrv_state, instance);
 294                         return (DDI_FAILURE);
 295                 }
 296                 if (cpudrv_comp_create(cpudsp) != DDI_SUCCESS) {
 297                         cpudrv_enabled = B_FALSE;
 298                         cpudrv_free(cpudsp);
 299                         ddi_soft_state_free(cpudrv_state, instance);
 300                         return (DDI_FAILURE);
 301                 }
 302                 if (ddi_prop_update_string(DDI_DEV_T_NONE,
 303                     dip, "pm-class", "CPU") != DDI_PROP_SUCCESS) {
 304                         cpudrv_enabled = B_FALSE;
 305                         cpudrv_free(cpudsp);
 306                         ddi_soft_state_free(cpudrv_state, instance);
 307                         return (DDI_FAILURE);
 308                 }
 309 
 310                 /*
 311                  * Taskq is used to dispatch routine to monitor CPU
 312                  * activities.
 313                  */
 314                 cpudsp->cpudrv_pm.tq = ddi_taskq_create(dip,
 315                     "cpudrv_monitor", CPUDRV_TASKQ_THREADS,
 316                     TASKQ_DEFAULTPRI, 0);
 317                 if (cpudsp->cpudrv_pm.tq == NULL) {
 318                         cpudrv_enabled = B_FALSE;
 319                         cpudrv_free(cpudsp);
 320                         ddi_soft_state_free(cpudrv_state, instance);
 321                         return (DDI_FAILURE);
 322                 }
 323 
 324                 mutex_init(&cpudsp->cpudrv_pm.timeout_lock, NULL,
 325                     MUTEX_DRIVER, NULL);
 326                 cv_init(&cpudsp->cpudrv_pm.timeout_cv, NULL,
 327                     CV_DEFAULT, NULL);
 328 
 329                 /*
 330                  * Driver needs to assume that CPU is running at
 331                  * unknown speed at DDI_ATTACH and switch it to the
 332                  * needed speed. We assume that initial needed speed
 333                  * is full speed for us.
 334                  */
 335                 /*
 336                  * We need to take the lock because cpudrv_monitor()
 337                  * will start running in parallel with attach().
 338                  */
 339                 mutex_enter(&cpudsp->lock);
 340                 cpudsp->cpudrv_pm.cur_spd = NULL;
 341                 cpudsp->cpudrv_pm.pm_started = B_FALSE;
 342                 /*
 343                  * We don't call pm_raise_power() directly from attach
 344                  * because driver attach for a slave CPU node can
 345                  * happen before the CPU is even initialized. We just
 346                  * start the monitoring system which understands
 347                  * unknown speed and moves CPU to top speed when it
 348                  * has been initialized.
 349                  */
 350                 CPUDRV_MONITOR_INIT(cpudsp);
 351                 mutex_exit(&cpudsp->lock);
 352 
 353                 if (!cpudrv_mach_init(cpudsp)) {
 354                         cmn_err(CE_WARN, "cpudrv_attach: instance %d: "
 355                             "cpudrv_mach_init failed", instance);
 356                         cpudrv_enabled = B_FALSE;
 357                         ddi_taskq_destroy(cpudsp->cpudrv_pm.tq);
 358                         cpudrv_free(cpudsp);
 359                         ddi_soft_state_free(cpudrv_state, instance);
 360                         return (DDI_FAILURE);
 361                 }
 362 
 363                 CPUDRV_INSTALL_MAX_CHANGE_HANDLER(cpudsp);
 364 
 365                 (void) ddi_prop_update_int(DDI_DEV_T_NONE, dip,
 366                     DDI_NO_AUTODETACH, 1);
 367                 ddi_report_dev(dip);
 368                 return (DDI_SUCCESS);
 369 
 370         case DDI_RESUME:
 371                 DPRINTF(D_ATTACH, ("cpudrv_attach: instance %d: "
 372                     "DDI_RESUME called\n", instance));
 373 
 374                 cpudsp = ddi_get_soft_state(cpudrv_state, instance);
 375                 ASSERT(cpudsp != NULL);
 376 
 377                 /*
 378                  * Nothing to do for resume, if not doing active PM.
 379                  */
 380                 if (!cpudrv_is_enabled(cpudsp))
 381                         return (DDI_SUCCESS);
 382 
 383                 mutex_enter(&cpudsp->lock);
 384                 /*
 385                  * Driver needs to assume that CPU is running at unknown speed
 386                  * at DDI_RESUME and switch it to the needed speed. We assume
 387                  * that the needed speed is full speed for us.
 388                  */
 389                 cpudsp->cpudrv_pm.cur_spd = NULL;
 390                 CPUDRV_MONITOR_INIT(cpudsp);
 391                 mutex_exit(&cpudsp->lock);
 392                 CPUDRV_REDEFINE_TOPSPEED(dip);
 393                 return (DDI_SUCCESS);
 394 
 395         default:
 396                 return (DDI_FAILURE);
 397         }
 398 }
 399 
 400 /*
 401  * Driver detach(9e) entry point.
 402  */
 403 static int
 404 cpudrv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 405 {
 406         int                     instance;
 407         cpudrv_devstate_t       *cpudsp;
 408         cpudrv_pm_t             *cpupm;
 409 
 410         instance = ddi_get_instance(dip);
 411 
 412         switch (cmd) {
 413         case DDI_DETACH:
 414                 DPRINTF(D_DETACH, ("cpudrv_detach: instance %d: "
 415                     "DDI_DETACH called\n", instance));
 416 
 417 #if defined(__x86)
 418                 cpudsp = ddi_get_soft_state(cpudrv_state, instance);
 419                 ASSERT(cpudsp != NULL);
 420 
 421                 /*
 422                  * Nothing to do for detach, if no doing active PM.
 423                  */
 424                 if (!cpudrv_is_enabled(cpudsp))
 425                         return (DDI_SUCCESS);
 426 
 427                 /*
 428                  * uninstall PPC/_TPC change notification handler
 429                  */
 430                 CPUDRV_UNINSTALL_MAX_CHANGE_HANDLER(cpudsp);
 431 
 432                 /*
 433                  * destruct platform specific resource
 434                  */
 435                 if (!cpudrv_mach_fini(cpudsp))
 436                         return (DDI_FAILURE);
 437 
 438                 mutex_enter(&cpudsp->lock);
 439                 CPUDRV_MONITOR_FINI(cpudsp);
 440                 cv_destroy(&cpudsp->cpudrv_pm.timeout_cv);
 441                 mutex_destroy(&cpudsp->cpudrv_pm.timeout_lock);
 442                 ddi_taskq_destroy(cpudsp->cpudrv_pm.tq);
 443                 cpudrv_free(cpudsp);
 444                 mutex_exit(&cpudsp->lock);
 445                 mutex_destroy(&cpudsp->lock);
 446                 ddi_soft_state_free(cpudrv_state, instance);
 447                 (void) ddi_prop_update_int(DDI_DEV_T_NONE, dip,
 448                     DDI_NO_AUTODETACH, 0);
 449                 return (DDI_SUCCESS);
 450 
 451 #else
 452                 /*
 453                  * If the only thing supported by the driver is power
 454                  * management, we can in future enhance the driver and
 455                  * framework that loads it to unload the driver when
 456                  * user has disabled CPU power management.
 457                  */
 458                 return (DDI_FAILURE);
 459 #endif
 460 
 461         case DDI_SUSPEND:
 462                 DPRINTF(D_DETACH, ("cpudrv_detach: instance %d: "
 463                     "DDI_SUSPEND called\n", instance));
 464 
 465                 cpudsp = ddi_get_soft_state(cpudrv_state, instance);
 466                 ASSERT(cpudsp != NULL);
 467 
 468                 /*
 469                  * Nothing to do for suspend, if not doing active PM.
 470                  */
 471                 if (!cpudrv_is_enabled(cpudsp))
 472                         return (DDI_SUCCESS);
 473 
 474                 /*
 475                  * During a checkpoint-resume sequence, framework will
 476                  * stop interrupts to quiesce kernel activity. This will
 477                  * leave our monitoring system ineffective. Handle this
 478                  * by stopping our monitoring system and bringing CPU
 479                  * to full speed. In case we are in special direct pm
 480                  * mode, we leave the CPU at whatever speed it is. This
 481                  * is harmless other than speed.
 482                  */
 483                 mutex_enter(&cpudsp->lock);
 484                 cpupm = &(cpudsp->cpudrv_pm);
 485 
 486                 DPRINTF(D_DETACH, ("cpudrv_detach: instance %d: DDI_SUSPEND - "
 487                     "cur_spd %d, topspeed %d\n", instance,
 488                     cpupm->cur_spd->pm_level,
 489                     CPUDRV_TOPSPEED(cpupm)->pm_level));
 490 
 491                 CPUDRV_MONITOR_FINI(cpudsp);
 492 
 493                 if (!cpudrv_direct_pm && (cpupm->cur_spd !=
 494                     CPUDRV_TOPSPEED(cpupm))) {
 495                         if (cpupm->pm_busycnt < 1) {
 496                                 if ((pm_busy_component(dip, CPUDRV_COMP_NUM)
 497                                     == DDI_SUCCESS)) {
 498                                         cpupm->pm_busycnt++;
 499                                 } else {
 500                                         CPUDRV_MONITOR_INIT(cpudsp);
 501                                         mutex_exit(&cpudsp->lock);
 502                                         cmn_err(CE_WARN, "cpudrv_detach: "
 503                                             "instance %d: can't busy CPU "
 504                                             "component", instance);
 505                                         return (DDI_FAILURE);
 506                                 }
 507                         }
 508                         mutex_exit(&cpudsp->lock);
 509                         if (pm_raise_power(dip, CPUDRV_COMP_NUM,
 510                             CPUDRV_TOPSPEED(cpupm)->pm_level) !=
 511                             DDI_SUCCESS) {
 512                                 mutex_enter(&cpudsp->lock);
 513                                 CPUDRV_MONITOR_INIT(cpudsp);
 514                                 mutex_exit(&cpudsp->lock);
 515                                 cmn_err(CE_WARN, "cpudrv_detach: instance %d: "
 516                                     "can't raise CPU power level to %d",
 517                                     instance,
 518                                     CPUDRV_TOPSPEED(cpupm)->pm_level);
 519                                 return (DDI_FAILURE);
 520                         } else {
 521                                 return (DDI_SUCCESS);
 522                         }
 523                 } else {
 524                         mutex_exit(&cpudsp->lock);
 525                         return (DDI_SUCCESS);
 526                 }
 527 
 528         default:
 529                 return (DDI_FAILURE);
 530         }
 531 }
 532 
 533 /*
 534  * Driver power(9e) entry point.
 535  *
 536  * Driver's notion of current power is set *only* in power(9e) entry point
 537  * after actual power change operation has been successfully completed.
 538  */
 539 /* ARGSUSED */
 540 static int
 541 cpudrv_power(dev_info_t *dip, int comp, int level)
 542 {
 543         int                     instance;
 544         cpudrv_devstate_t       *cpudsp;
 545         cpudrv_pm_t             *cpudrvpm;
 546         cpudrv_pm_spd_t         *new_spd;
 547         boolean_t               is_ready;
 548         int                     ret;
 549 
 550         instance = ddi_get_instance(dip);
 551 
 552         DPRINTF(D_POWER, ("cpudrv_power: instance %d: level %d\n",
 553             instance, level));
 554 
 555         if ((cpudsp = ddi_get_soft_state(cpudrv_state, instance)) == NULL) {
 556                 cmn_err(CE_WARN, "cpudrv_power: instance %d: can't "
 557                     "get state", instance);
 558                 return (DDI_FAILURE);
 559         }
 560 
 561         /*
 562          * We're not ready until we can  get a cpu_t
 563          */
 564         is_ready = (cpudrv_get_cpu(cpudsp) == DDI_SUCCESS);
 565 
 566         mutex_enter(&cpudsp->lock);
 567         cpudrvpm = &(cpudsp->cpudrv_pm);
 568 
 569         /*
 570          * In normal operation, we fail if we are busy and request is
 571          * to lower the power level. We let this go through if the driver
 572          * is in special direct pm mode. On x86, we also let this through
 573          * if the change is due to a request to govern the max speed.
 574          */
 575         if (!cpudrv_direct_pm && (cpudrvpm->pm_busycnt >= 1) &&
 576             !cpudrv_is_governor_thread(cpudrvpm)) {
 577                 if ((cpudrvpm->cur_spd != NULL) &&
 578                     (level < cpudrvpm->cur_spd->pm_level)) {
 579                         mutex_exit(&cpudsp->lock);
 580                         return (DDI_FAILURE);
 581                 }
 582         }
 583 
 584         for (new_spd = cpudrvpm->head_spd; new_spd; new_spd =
 585             new_spd->down_spd) {
 586                 if (new_spd->pm_level == level)
 587                         break;
 588         }
 589         if (!new_spd) {
 590                 CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
 591                 mutex_exit(&cpudsp->lock);
 592                 cmn_err(CE_WARN, "cpudrv_power: instance %d: "
 593                     "can't locate new CPU speed", instance);
 594                 return (DDI_FAILURE);
 595         }
 596 
 597         /*
 598          * We currently refuse to power manage if the CPU is not ready to
 599          * take cross calls (cross calls fail silently if CPU is not ready
 600          * for it).
 601          *
 602          * Additionally, for x86 platforms we cannot power manage an instance,
 603          * until it has been initialized.
 604          */
 605         if (is_ready) {
 606                 is_ready = CPUDRV_XCALL_IS_READY(cpudsp->cpu_id);
 607                 if (!is_ready) {
 608                         DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
 609                             "CPU not ready for x-calls\n", instance));
 610                 } else if (!(is_ready = cpudrv_power_ready(cpudsp->cp))) {
 611                         DPRINTF(D_POWER, ("cpudrv_power: instance %d: "
 612                             "waiting for all CPUs to be power manageable\n",
 613                             instance));
 614                 }
 615         }
 616         if (!is_ready) {
 617                 CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
 618                 mutex_exit(&cpudsp->lock);
 619                 return (DDI_FAILURE);
 620         }
 621 
 622         /*
 623          * Execute CPU specific routine on the requested CPU to
 624          * change its speed to normal-speed/divisor.
 625          */
 626         if ((ret = cpudrv_change_speed(cpudsp, new_spd)) != DDI_SUCCESS) {
 627                 cmn_err(CE_WARN, "cpudrv_power: "
 628                     "cpudrv_change_speed() return = %d", ret);
 629                 mutex_exit(&cpudsp->lock);
 630                 return (DDI_FAILURE);
 631         }
 632 
 633         /*
 634          * Reset idle threshold time for the new power level.
 635          */
 636         if ((cpudrvpm->cur_spd != NULL) && (level <
 637             cpudrvpm->cur_spd->pm_level)) {
 638                 if (pm_idle_component(dip, CPUDRV_COMP_NUM) ==
 639                     DDI_SUCCESS) {
 640                         if (cpudrvpm->pm_busycnt >= 1)
 641                                 cpudrvpm->pm_busycnt--;
 642                 } else {
 643                         cmn_err(CE_WARN, "cpudrv_power: instance %d: "
 644                             "can't idle CPU component",
 645                             ddi_get_instance(dip));
 646                 }
 647         }
 648         /*
 649          * Reset various parameters because we are now running at new speed.
 650          */
 651         cpudrvpm->lastquan_mstate[CMS_IDLE] = 0;
 652         cpudrvpm->lastquan_mstate[CMS_SYSTEM] = 0;
 653         cpudrvpm->lastquan_mstate[CMS_USER] = 0;
 654         cpudrvpm->lastquan_ticks = 0;
 655         cpudrvpm->cur_spd = new_spd;
 656         CPUDRV_RESET_GOVERNOR_THREAD(cpudrvpm);
 657         mutex_exit(&cpudsp->lock);
 658 
 659         return (DDI_SUCCESS);
 660 }
 661 
 662 /*
 663  * Initialize power management data.
 664  */
 665 static int
 666 cpudrv_init(cpudrv_devstate_t *cpudsp)
 667 {
 668         cpudrv_pm_t     *cpupm = &(cpudsp->cpudrv_pm);
 669         cpudrv_pm_spd_t *cur_spd;
 670         cpudrv_pm_spd_t *prev_spd = NULL;
 671         int             *speeds;
 672         uint_t          nspeeds;
 673         int             idle_cnt_percent;
 674         int             user_cnt_percent;
 675         int             i;
 676 
 677         CPUDRV_GET_SPEEDS(cpudsp, speeds, nspeeds);
 678         if (nspeeds < 2) {
 679                 /* Need at least two speeds to power manage */
 680                 CPUDRV_FREE_SPEEDS(speeds, nspeeds);
 681                 return (DDI_FAILURE);
 682         }
 683         cpupm->num_spd = nspeeds;
 684 
 685         /*
 686          * Calculate the watermarks and other parameters based on the
 687          * supplied speeds.
 688          *
 689          * One of the basic assumption is that for X amount of CPU work,
 690          * if CPU is slowed down by a factor of N, the time it takes to
 691          * do the same work will be N * X.
 692          *
 693          * The driver declares that a CPU is idle and ready for slowed down,
 694          * if amount of idle thread is more than the current speed idle_hwm
 695          * without dropping below idle_hwm a number of consecutive sampling
 696          * intervals and number of running threads in user mode are below
 697          * user_lwm.  We want to set the current user_lwm such that if we
 698          * just switched to the next slower speed with no change in real work
 699          * load, the amount of user threads at the slower speed will be such
 700          * that it falls below the slower speed's user_hwm.  If we didn't do
 701          * that then we will just come back to the higher speed as soon as we
 702          * go down even with no change in work load.
 703          * The user_hwm is a fixed precentage and not calculated dynamically.
 704          *
 705          * We bring the CPU up if idle thread at current speed is less than
 706          * the current speed idle_lwm for a number of consecutive sampling
 707          * intervals or user threads are above the user_hwm for the current
 708          * speed.
 709          */
 710         for (i = 0; i < nspeeds; i++) {
 711                 cur_spd = kmem_zalloc(sizeof (cpudrv_pm_spd_t), KM_SLEEP);
 712                 cur_spd->speed = speeds[i];
 713                 if (i == 0) {   /* normal speed */
 714                         cpupm->head_spd = cur_spd;
 715                         CPUDRV_TOPSPEED(cpupm) = cur_spd;
 716                         cur_spd->quant_cnt = CPUDRV_QUANT_CNT_NORMAL;
 717                         cur_spd->idle_hwm =
 718                             (cpudrv_idle_hwm * cur_spd->quant_cnt) / 100;
 719                         /* can't speed anymore */
 720                         cur_spd->idle_lwm = 0;
 721                         cur_spd->user_hwm = UINT_MAX;
 722                 } else {
 723                         cur_spd->quant_cnt = CPUDRV_QUANT_CNT_OTHR;
 724                         ASSERT(prev_spd != NULL);
 725                         prev_spd->down_spd = cur_spd;
 726                         cur_spd->up_spd = cpupm->head_spd;
 727 
 728                         /*
 729                          * Let's assume CPU is considered idle at full speed
 730                          * when it is spending I% of time in running the idle
 731                          * thread.  At full speed, CPU will be busy (100 - I) %
 732                          * of times.  This % of busyness increases by factor of
 733                          * N as CPU slows down.  CPU that is idle I% of times
 734                          * in full speed, it is idle (100 - ((100 - I) * N)) %
 735                          * of times in N speed.  The idle_lwm is a fixed
 736                          * percentage.  A large value of N may result in
 737                          * idle_hwm to go below idle_lwm.  We need to make sure
 738                          * that there is at least a buffer zone seperation
 739                          * between the idle_lwm and idle_hwm values.
 740                          */
 741                         idle_cnt_percent = CPUDRV_IDLE_CNT_PERCENT(
 742                             cpudrv_idle_hwm, speeds, i);
 743                         idle_cnt_percent = max(idle_cnt_percent,
 744                             (cpudrv_idle_lwm + cpudrv_idle_buf_zone));
 745                         cur_spd->idle_hwm =
 746                             (idle_cnt_percent * cur_spd->quant_cnt) / 100;
 747                         cur_spd->idle_lwm =
 748                             (cpudrv_idle_lwm * cur_spd->quant_cnt) / 100;
 749 
 750                         /*
 751                          * The lwm for user threads are determined such that
 752                          * if CPU slows down, the load of work in the
 753                          * new speed would still keep the CPU at or below the
 754                          * user_hwm in the new speed.  This is to prevent
 755                          * the quick jump back up to higher speed.
 756                          */
 757                         cur_spd->user_hwm = (cpudrv_user_hwm *
 758                             cur_spd->quant_cnt) / 100;
 759                         user_cnt_percent = CPUDRV_USER_CNT_PERCENT(
 760                             cpudrv_user_hwm, speeds, i);
 761                         prev_spd->user_lwm =
 762                             (user_cnt_percent * prev_spd->quant_cnt) / 100;
 763                 }
 764                 prev_spd = cur_spd;
 765         }
 766         /* Slowest speed. Can't slow down anymore */
 767         cur_spd->idle_hwm = UINT_MAX;
 768         cur_spd->user_lwm = -1;
 769 #ifdef  DEBUG
 770         DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: head_spd spd %d, "
 771             "num_spd %d\n", ddi_get_instance(cpudsp->dip),
 772             cpupm->head_spd->speed, cpupm->num_spd));
 773         for (cur_spd = cpupm->head_spd; cur_spd; cur_spd = cur_spd->down_spd) {
 774                 DPRINTF(D_PM_INIT, ("cpudrv_init: instance %d: speed %d, "
 775                     "down_spd spd %d, idle_hwm %d, user_lwm %d, "
 776                     "up_spd spd %d, idle_lwm %d, user_hwm %d, "
 777                     "quant_cnt %d\n", ddi_get_instance(cpudsp->dip),
 778                     cur_spd->speed,
 779                     (cur_spd->down_spd ? cur_spd->down_spd->speed : 0),
 780                     cur_spd->idle_hwm, cur_spd->user_lwm,
 781                     (cur_spd->up_spd ? cur_spd->up_spd->speed : 0),
 782                     cur_spd->idle_lwm, cur_spd->user_hwm,
 783                     cur_spd->quant_cnt));
 784         }
 785 #endif  /* DEBUG */
 786         CPUDRV_FREE_SPEEDS(speeds, nspeeds);
 787         return (DDI_SUCCESS);
 788 }
 789 
 790 /*
 791  * Free CPU power management data.
 792  */
 793 static void
 794 cpudrv_free(cpudrv_devstate_t *cpudsp)
 795 {
 796         cpudrv_pm_t     *cpupm = &(cpudsp->cpudrv_pm);
 797         cpudrv_pm_spd_t *cur_spd, *next_spd;
 798 
 799         cur_spd = cpupm->head_spd;
 800         while (cur_spd) {
 801                 next_spd = cur_spd->down_spd;
 802                 kmem_free(cur_spd, sizeof (cpudrv_pm_spd_t));
 803                 cur_spd = next_spd;
 804         }
 805         bzero(cpupm, sizeof (cpudrv_pm_t));
 806 }
 807 
 808 /*
 809  * Create pm-components property.
 810  */
 811 static int
 812 cpudrv_comp_create(cpudrv_devstate_t *cpudsp)
 813 {
 814         cpudrv_pm_t     *cpupm = &(cpudsp->cpudrv_pm);
 815         cpudrv_pm_spd_t *cur_spd;
 816         char            **pmc;
 817         int             size;
 818         char            name[] = "NAME=CPU Speed";
 819         int             i, j;
 820         uint_t          comp_spd;
 821         int             result = DDI_FAILURE;
 822 
 823         pmc = kmem_zalloc((cpupm->num_spd + 1) * sizeof (char *), KM_SLEEP);
 824         size = CPUDRV_COMP_SIZE();
 825         if (cpupm->num_spd > CPUDRV_COMP_MAX_VAL) {
 826                 cmn_err(CE_WARN, "cpudrv_comp_create: instance %d: "
 827                     "number of speeds exceeded limits",
 828                     ddi_get_instance(cpudsp->dip));
 829                 kmem_free(pmc, (cpupm->num_spd + 1) * sizeof (char *));
 830                 return (result);
 831         }
 832 
 833         for (i = cpupm->num_spd, cur_spd = cpupm->head_spd; i > 0;
 834             i--, cur_spd = cur_spd->down_spd) {
 835                 cur_spd->pm_level = i;
 836                 pmc[i] = kmem_zalloc((size * sizeof (char)), KM_SLEEP);
 837                 comp_spd = CPUDRV_COMP_SPEED(cpupm, cur_spd);
 838                 if (comp_spd > CPUDRV_COMP_MAX_VAL) {
 839                         cmn_err(CE_WARN, "cpudrv_comp_create: "
 840                             "instance %d: speed exceeded limits",
 841                             ddi_get_instance(cpudsp->dip));
 842                         for (j = cpupm->num_spd; j >= i; j--) {
 843                                 kmem_free(pmc[j], size * sizeof (char));
 844                         }
 845                         kmem_free(pmc, (cpupm->num_spd + 1) *
 846                             sizeof (char *));
 847                         return (result);
 848                 }
 849                 CPUDRV_COMP_SPRINT(pmc[i], cpupm, cur_spd, comp_spd)
 850                 DPRINTF(D_PM_COMP_CREATE, ("cpudrv_comp_create: "
 851                     "instance %d: pm-components power level %d string '%s'\n",
 852                     ddi_get_instance(cpudsp->dip), i, pmc[i]));
 853         }
 854         pmc[0] = kmem_zalloc(sizeof (name), KM_SLEEP);
 855         (void) strcat(pmc[0], name);
 856         DPRINTF(D_PM_COMP_CREATE, ("cpudrv_comp_create: instance %d: "
 857             "pm-components component name '%s'\n",
 858             ddi_get_instance(cpudsp->dip), pmc[0]));
 859 
 860         if (ddi_prop_update_string_array(DDI_DEV_T_NONE, cpudsp->dip,
 861             "pm-components", pmc, cpupm->num_spd + 1) == DDI_PROP_SUCCESS) {
 862                 result = DDI_SUCCESS;
 863         } else {
 864                 cmn_err(CE_WARN, "cpudrv_comp_create: instance %d: "
 865                     "can't create pm-components property",
 866                     ddi_get_instance(cpudsp->dip));
 867         }
 868 
 869         for (i = cpupm->num_spd; i > 0; i--) {
 870                 kmem_free(pmc[i], size * sizeof (char));
 871         }
 872         kmem_free(pmc[0], sizeof (name));
 873         kmem_free(pmc, (cpupm->num_spd + 1) * sizeof (char *));
 874         return (result);
 875 }
 876 
 877 /*
 878  * Mark a component idle.
 879  */
 880 #define CPUDRV_MONITOR_PM_IDLE_COMP(dip, cpupm) { \
 881         if ((cpupm)->pm_busycnt >= 1) { \
 882                 if (pm_idle_component((dip), CPUDRV_COMP_NUM) == \
 883                     DDI_SUCCESS) { \
 884                         DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: " \
 885                             "instance %d: pm_idle_component called\n", \
 886                             ddi_get_instance((dip)))); \
 887                         (cpupm)->pm_busycnt--; \
 888                 } else { \
 889                         cmn_err(CE_WARN, "cpudrv_monitor: instance %d: " \
 890                             "can't idle CPU component", \
 891                             ddi_get_instance((dip))); \
 892                 } \
 893         } \
 894 }
 895 
 896 /*
 897  * Marks a component busy in both PM framework and driver state structure.
 898  */
 899 #define CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm) { \
 900         if ((cpupm)->pm_busycnt < 1) { \
 901                 if (pm_busy_component((dip), CPUDRV_COMP_NUM) == \
 902                     DDI_SUCCESS) { \
 903                         DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: " \
 904                             "instance %d: pm_busy_component called\n", \
 905                             ddi_get_instance((dip)))); \
 906                         (cpupm)->pm_busycnt++; \
 907                 } else { \
 908                         cmn_err(CE_WARN, "cpudrv_monitor: instance %d: " \
 909                             "can't busy CPU component", \
 910                             ddi_get_instance((dip))); \
 911                 } \
 912         } \
 913 }
 914 
 915 /*
 916  * Marks a component busy and calls pm_raise_power().
 917  */
 918 #define CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm, new_spd) { \
 919         int ret; \
 920         /* \
 921          * Mark driver and PM framework busy first so framework doesn't try \
 922          * to bring CPU to lower speed when we need to be at higher speed. \
 923          */ \
 924         CPUDRV_MONITOR_PM_BUSY_COMP((dip), (cpupm)); \
 925         mutex_exit(&(cpudsp)->lock); \
 926         DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: " \
 927             "pm_raise_power called to %d\n", ddi_get_instance((dip)), \
 928                 (new_spd->pm_level))); \
 929         ret = pm_raise_power((dip), CPUDRV_COMP_NUM, (new_spd->pm_level)); \
 930         if (ret != DDI_SUCCESS) { \
 931                 cmn_err(CE_WARN, "cpudrv_monitor: instance %d: can't " \
 932                     "raise CPU power level", ddi_get_instance((dip))); \
 933         } \
 934         mutex_enter(&(cpudsp)->lock); \
 935         if (ret == DDI_SUCCESS && cpudsp->cpudrv_pm.cur_spd == NULL) { \
 936                 cpudsp->cpudrv_pm.cur_spd = new_spd; \
 937         } \
 938 }
 939 
 940 /*
 941  * In order to monitor a CPU, we need to hold cpu_lock to access CPU
 942  * statistics. Holding cpu_lock is not allowed from a callout routine.
 943  * We dispatch a taskq to do that job.
 944  */
 945 static void
 946 cpudrv_monitor_disp(void *arg)
 947 {
 948         cpudrv_devstate_t       *cpudsp = (cpudrv_devstate_t *)arg;
 949 
 950         /*
 951          * We are here because the last task has scheduled a timeout.
 952          * The queue should be empty at this time.
 953          */
 954         mutex_enter(&cpudsp->cpudrv_pm.timeout_lock);
 955         if ((ddi_taskq_dispatch(cpudsp->cpudrv_pm.tq, cpudrv_monitor, arg,
 956             DDI_NOSLEEP)) != DDI_SUCCESS) {
 957                 mutex_exit(&cpudsp->cpudrv_pm.timeout_lock);
 958                 DPRINTF(D_PM_MONITOR, ("cpudrv_monitor_disp: failed to "
 959                     "dispatch the cpudrv_monitor taskq\n"));
 960                 mutex_enter(&cpudsp->lock);
 961                 CPUDRV_MONITOR_INIT(cpudsp);
 962                 mutex_exit(&cpudsp->lock);
 963                 return;
 964         }
 965         cpudsp->cpudrv_pm.timeout_count++;
 966         mutex_exit(&cpudsp->cpudrv_pm.timeout_lock);
 967 }
 968 
 969 /*
 970  * Monitors each CPU for the amount of time idle thread was running in the
 971  * last quantum and arranges for the CPU to go to the lower or higher speed.
 972  * Called at the time interval appropriate for the current speed. The
 973  * time interval for normal speed is CPUDRV_QUANT_CNT_NORMAL. The time
 974  * interval for other speeds (including unknown speed) is
 975  * CPUDRV_QUANT_CNT_OTHR.
 976  */
 977 static void
 978 cpudrv_monitor(void *arg)
 979 {
 980         cpudrv_devstate_t       *cpudsp = (cpudrv_devstate_t *)arg;
 981         cpudrv_pm_t             *cpupm;
 982         cpudrv_pm_spd_t         *cur_spd, *new_spd;
 983         dev_info_t              *dip;
 984         uint_t                  idle_cnt, user_cnt, system_cnt;
 985         clock_t                 ticks;
 986         uint_t                  tick_cnt;
 987         hrtime_t                msnsecs[NCMSTATES];
 988         boolean_t               is_ready;
 989 
 990 #define GET_CPU_MSTATE_CNT(state, cnt) \
 991         msnsecs[state] = NSEC_TO_TICK(msnsecs[state]); \
 992         if (cpupm->lastquan_mstate[state] > msnsecs[state]) \
 993                 msnsecs[state] = cpupm->lastquan_mstate[state]; \
 994         cnt = msnsecs[state] - cpupm->lastquan_mstate[state]; \
 995         cpupm->lastquan_mstate[state] = msnsecs[state]
 996 
 997         /*
 998          * We're not ready until we can  get a cpu_t
 999          */
1000         is_ready = (cpudrv_get_cpu(cpudsp) == DDI_SUCCESS);
1001 
1002         mutex_enter(&cpudsp->lock);
1003         cpupm = &(cpudsp->cpudrv_pm);
1004         if (cpupm->timeout_id == 0) {
1005                 mutex_exit(&cpudsp->lock);
1006                 goto do_return;
1007         }
1008         cur_spd = cpupm->cur_spd;
1009         dip = cpudsp->dip;
1010 
1011         /*
1012          * We assume that a CPU is initialized and has a valid cpu_t
1013          * structure, if it is ready for cross calls. If this changes,
1014          * additional checks might be needed.
1015          *
1016          * Additionally, for x86 platforms we cannot power manage an
1017          * instance, until it has been initialized.
1018          */
1019         if (is_ready) {
1020                 is_ready = CPUDRV_XCALL_IS_READY(cpudsp->cpu_id);
1021                 if (!is_ready) {
1022                         DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
1023                             "CPU not ready for x-calls\n",
1024                             ddi_get_instance(dip)));
1025                 } else if (!(is_ready = cpudrv_power_ready(cpudsp->cp))) {
1026                         DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
1027                             "waiting for all CPUs to be power manageable\n",
1028                             ddi_get_instance(dip)));
1029                 }
1030         }
1031         if (!is_ready) {
1032                 /*
1033                  * Make sure that we are busy so that framework doesn't
1034                  * try to bring us down in this situation.
1035                  */
1036                 CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
1037                 CPUDRV_MONITOR_INIT(cpudsp);
1038                 mutex_exit(&cpudsp->lock);
1039                 goto do_return;
1040         }
1041 
1042         /*
1043          * Make sure that we are still not at unknown power level.
1044          */
1045         if (cur_spd == NULL) {
1046                 DPRINTF(D_PM_MONITOR, ("cpudrv_monitor: instance %d: "
1047                     "cur_spd is unknown\n", ddi_get_instance(dip)));
1048                 CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
1049                     CPUDRV_TOPSPEED(cpupm));
1050                 /*
1051                  * We just changed the speed. Wait till at least next
1052                  * call to this routine before proceeding ahead.
1053                  */
1054                 CPUDRV_MONITOR_INIT(cpudsp);
1055                 mutex_exit(&cpudsp->lock);
1056                 goto do_return;
1057         }
1058 
1059         if (!cpupm->pm_started) {
1060                 cpupm->pm_started = B_TRUE;
1061                 cpudrv_set_supp_freqs(cpudsp);
1062         }
1063 
1064         get_cpu_mstate(cpudsp->cp, msnsecs);
1065         GET_CPU_MSTATE_CNT(CMS_IDLE, idle_cnt);
1066         GET_CPU_MSTATE_CNT(CMS_USER, user_cnt);
1067         GET_CPU_MSTATE_CNT(CMS_SYSTEM, system_cnt);
1068 
1069         /*
1070          * We can't do anything when we have just switched to a state
1071          * because there is no valid timestamp.
1072          */
1073         if (cpupm->lastquan_ticks == 0) {
1074                 cpupm->lastquan_ticks = NSEC_TO_TICK(gethrtime());
1075                 CPUDRV_MONITOR_INIT(cpudsp);
1076                 mutex_exit(&cpudsp->lock);
1077                 goto do_return;
1078         }
1079 
1080         /*
1081          * Various watermarks are based on this routine being called back
1082          * exactly at the requested period. This is not guaranteed
1083          * because this routine is called from a taskq that is dispatched
1084          * from a timeout routine.  Handle this by finding out how many
1085          * ticks have elapsed since the last call and adjusting
1086          * the idle_cnt based on the delay added to the requested period
1087          * by timeout and taskq.
1088          */
1089         ticks = NSEC_TO_TICK(gethrtime());
1090         tick_cnt = ticks - cpupm->lastquan_ticks;
1091         ASSERT(tick_cnt != 0);
1092         cpupm->lastquan_ticks = ticks;
1093 
1094         /*
1095          * Time taken between recording the current counts and
1096          * arranging the next call of this routine is an error in our
1097          * calculation. We minimize the error by calling
1098          * CPUDRV_MONITOR_INIT() here instead of end of this routine.
1099          */
1100         CPUDRV_MONITOR_INIT(cpudsp);
1101         DPRINTF(D_PM_MONITOR_VERBOSE, ("cpudrv_monitor: instance %d: "
1102             "idle count %d, user count %d, system count %d, pm_level %d, "
1103             "pm_busycnt %d\n", ddi_get_instance(dip), idle_cnt, user_cnt,
1104             system_cnt, cur_spd->pm_level, cpupm->pm_busycnt));
1105 
1106 #ifdef  DEBUG
1107         /*
1108          * Notify that timeout and taskq has caused delays and we need to
1109          * scale our parameters accordingly.
1110          *
1111          * To get accurate result, don't turn on other DPRINTFs with
1112          * the following DPRINTF. PROM calls generated by other
1113          * DPRINTFs changes the timing.
1114          */
1115         if (tick_cnt > cur_spd->quant_cnt) {
1116                 DPRINTF(D_PM_MONITOR_DELAY, ("cpudrv_monitor: instance %d: "
1117                     "tick count %d > quantum_count %u\n",
1118                     ddi_get_instance(dip), tick_cnt, cur_spd->quant_cnt));
1119         }
1120 #endif  /* DEBUG */
1121 
1122         /*
1123          * Adjust counts based on the delay added by timeout and taskq.
1124          */
1125         idle_cnt = (idle_cnt * cur_spd->quant_cnt) / tick_cnt;
1126         user_cnt = (user_cnt * cur_spd->quant_cnt) / tick_cnt;
1127 
1128         if ((user_cnt > cur_spd->user_hwm) || (idle_cnt < cur_spd->idle_lwm &&
1129             cur_spd->idle_blwm_cnt >= cpudrv_idle_blwm_cnt_max)) {
1130                 cur_spd->idle_blwm_cnt = 0;
1131                 cur_spd->idle_bhwm_cnt = 0;
1132                 /*
1133                  * In normal situation, arrange to go to next higher speed.
1134                  * If we are running in special direct pm mode, we just stay
1135                  * at the current speed.
1136                  */
1137                 if (cur_spd == cur_spd->up_spd || cpudrv_direct_pm) {
1138                         CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
1139                 } else {
1140                         new_spd = cur_spd->up_spd;
1141                         CPUDRV_MONITOR_PM_BUSY_AND_RAISE(dip, cpudsp, cpupm,
1142                             new_spd);
1143                 }
1144         } else if ((user_cnt <= cur_spd->user_lwm) &&
1145             (idle_cnt >= cur_spd->idle_hwm) || !CPU_ACTIVE(cpudsp->cp)) {
1146                 cur_spd->idle_blwm_cnt = 0;
1147                 cur_spd->idle_bhwm_cnt = 0;
1148                 /*
1149                  * Arrange to go to next lower speed by informing our idle
1150                  * status to the power management framework.
1151                  */
1152                 CPUDRV_MONITOR_PM_IDLE_COMP(dip, cpupm);
1153         } else {
1154                 /*
1155                  * If we are between the idle water marks and have not
1156                  * been here enough consecutive times to be considered
1157                  * busy, just increment the count and return.
1158                  */
1159                 if ((idle_cnt < cur_spd->idle_hwm) &&
1160                     (idle_cnt >= cur_spd->idle_lwm) &&
1161                     (cur_spd->idle_bhwm_cnt < cpudrv_idle_bhwm_cnt_max)) {
1162                         cur_spd->idle_blwm_cnt = 0;
1163                         cur_spd->idle_bhwm_cnt++;
1164                         mutex_exit(&cpudsp->lock);
1165                         goto do_return;
1166                 }
1167                 if (idle_cnt < cur_spd->idle_lwm) {
1168                         cur_spd->idle_blwm_cnt++;
1169                         cur_spd->idle_bhwm_cnt = 0;
1170                 }
1171                 /*
1172                  * Arranges to stay at the current speed.
1173                  */
1174                 CPUDRV_MONITOR_PM_BUSY_COMP(dip, cpupm);
1175         }
1176         mutex_exit(&cpudsp->lock);
1177 do_return:
1178         mutex_enter(&cpupm->timeout_lock);
1179         ASSERT(cpupm->timeout_count > 0);
1180         cpupm->timeout_count--;
1181         cv_signal(&cpupm->timeout_cv);
1182         mutex_exit(&cpupm->timeout_lock);
1183 }
1184 
1185 /*
1186  * get cpu_t structure for cpudrv_devstate_t
1187  */
1188 int
1189 cpudrv_get_cpu(cpudrv_devstate_t *cpudsp)
1190 {
1191         ASSERT(cpudsp != NULL);
1192 
1193         /*
1194          * return DDI_SUCCESS if cpudrv_devstate_t
1195          * already contains cpu_t structure
1196          */
1197         if (cpudsp->cp != NULL)
1198                 return (DDI_SUCCESS);
1199 
1200         if (MUTEX_HELD(&cpu_lock)) {
1201                 cpudsp->cp = cpu_get(cpudsp->cpu_id);
1202         } else {
1203                 mutex_enter(&cpu_lock);
1204                 cpudsp->cp = cpu_get(cpudsp->cpu_id);
1205                 mutex_exit(&cpu_lock);
1206         }
1207 
1208         if (cpudsp->cp == NULL)
1209                 return (DDI_FAILURE);
1210 
1211         return (DDI_SUCCESS);
1212 }