1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * rcapd is a long-running daemon enforcing project-based resource caps (see
  28  * rcapd(1M)).  Each instance of a process aggregate (project or, generically,
  29  * "collection") may have a memory cap.  A single thread monitors the resource
  30  * utilization of capped collections, enforces caps when they are exceeded (and
  31  * other conditions are met), and incorporates changes in configuration or
  32  * caps.  Each of these actions occurs not more frequently than the rate
  33  * specified with rcapadm(1M).
  34  */
  35 
  36 #include <sys/priocntl.h>
  37 #include <sys/proc.h>
  38 #include <sys/resource.h>
  39 #include <sys/sysinfo.h>
  40 #include <sys/stat.h>
  41 #include <sys/sysmacros.h>
  42 #include <sys/time.h>
  43 #include <sys/types.h>
  44 #include <dirent.h>
  45 #include <errno.h>
  46 #include <fcntl.h>
  47 #include <kstat.h>
  48 #include <libintl.h>
  49 #include <limits.h>
  50 #include <locale.h>
  51 #include <priv.h>
  52 #include <signal.h>
  53 #include <stdarg.h>
  54 #include <stdio.h>
  55 #include <stdio_ext.h>
  56 #include <stdlib.h>
  57 #include <libscf.h>
  58 #include <strings.h>
  59 #include <time.h>
  60 #include <unistd.h>
  61 #include <zone.h>
  62 #include <assert.h>
  63 #include <sys/vm_usage.h>
  64 #include "rcapd.h"
  65 #include "rcapd_mapping.h"
  66 #include "rcapd_rfd.h"
  67 #include "rcapd_stat.h"
  68 #include "utils.h"
  69 
  70 #define POSITIVE_MIN(x, y) \
  71         (((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y))
  72 #define NEXT_EVENT_TIME(base, seconds) \
  73         (((int)seconds > 0) ? (base + (hrtime_t)seconds * (hrtime_t)NANOSEC) \
  74         : (hrtime_t)0)
  75 #define NEXT_REPORT_EVENT_TIME(base, seconds) \
  76         ((rcfg.rcfg_stat_file[0] != 0) ?  \
  77             NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0)
  78 #define EVENT_TIME(time, eventtime) \
  79         (((time) > (eventtime)) && (eventtime) != 0)
  80 #define STAT_TEMPLATE_SUFFIX    ".XXXXXX"       /* suffix of mkstemp() arg */
  81 #define DAEMON_UID              1               /* uid to use */
  82 
  83 #define CAPPED_PROJECT  0x01
  84 #define CAPPED_ZONE     0x02
  85 
  86 typedef struct soft_scan_arg {
  87         uint64_t ssa_sum_excess;
  88         int64_t ssa_scan_goal;
  89         boolean_t ssa_project_over_cap;
  90 } soft_scan_arg_t;
  91 
  92 typedef struct sample_col_arg {
  93         boolean_t sca_any_over_cap;
  94         boolean_t sca_project_over_cap;
  95 } sample_col_arg_t;
  96 
  97 
  98 static int debug_mode = 0;              /* debug mode flag */
  99 static pid_t rcapd_pid;                 /* rcapd's pid to ensure it's not */
 100                                         /* scanned */
 101 static kstat_ctl_t *kctl;               /* kstat chain */
 102 static int memory_pressure = 0;         /* physical memory utilization (%) */
 103 static int memory_pressure_sample = 0;  /* count of samples */
 104 static long page_size_kb = 0;           /* system page size in KB */
 105 static size_t nvmu_vals = 0;            /* # of kernel RSS/swap vals in array */
 106 static size_t vmu_vals_len = 0;         /* size of RSS/swap vals array */
 107 static vmusage_t *vmu_vals = NULL;      /* snapshot of kernel RSS/swap values */
 108 static hrtime_t next_report;            /* time of next report */
 109 static int termination_signal = 0;      /* terminating signal */
 110 static zoneid_t my_zoneid = (zoneid_t)-1;
 111 static lcollection_t *gz_col;           /* global zone collection */
 112 
 113 rcfg_t rcfg;
 114 /*
 115  * Updated when we re-read the collection configurations if this rcapd instance
 116  * is running in the global zone and the global zone is capped.
 117  */
 118 boolean_t gz_capped = B_FALSE;
 119 
 120 /*
 121  * Flags.
 122  */
 123 static int ever_ran;
 124 int should_run;
 125 static int should_reconfigure;
 126 
 127 static int verify_statistics(void);
 128 static int update_statistics(void);
 129 
 130 /*
 131  * Checks if a process is marked 'system'.  Returns FALSE only when it is not.
 132  */
 133 static boolean_t
 134 proc_issystem(pid_t pid)
 135 {
 136         char pc_clname[PC_CLNMSZ];
 137 
 138         if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
 139             PC_KY_NULL) != -1) {
 140                 return (strcmp(pc_clname, "SYS") == 0);
 141         } else {
 142                 debug("cannot get class-specific scheduling parameters; "
 143                     "assuming system process\n");
 144                 return (B_TRUE);
 145         }
 146 }
 147 
 148 static void
 149 lprocess_insert_mark(psinfo_t *psinfop)
 150 {
 151         pid_t pid = psinfop->pr_pid;
 152         /* flag indicating whether the process should be scanned. */
 153         int unscannable = psinfop->pr_nlwp == 0;
 154         rcid_t colid;
 155         lcollection_t *lcol;
 156         lprocess_t *lproc;
 157 
 158         /*
 159          * Determine which collection to put this process into.  We only have
 160          * to worry about tracking both zone and project capped processes if
 161          * this rcapd instance is running in the global zone, since we'll only
 162          * see processes in our own projects in a non-global zone.  In the
 163          * global zone, if the process belongs to a non-global zone, we only
 164          * need to track it for the capped non-global zone collection.  For
 165          * global zone processes, we first attempt to put the process into a
 166          * capped project collection.  On the second pass into this function
 167          * the projid will be cleared so we will just track the process for the
 168          * global zone collection as a whole.
 169          */
 170         if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
 171                 colid.rcid_type = RCIDT_PROJECT;
 172                 colid.rcid_val = psinfop->pr_projid;
 173         } else {
 174                 /* try to add to zone collection */
 175                 colid.rcid_type = RCIDT_ZONE;
 176                 colid.rcid_val = psinfop->pr_zoneid;
 177         }
 178 
 179         if ((lcol = lcollection_find(&colid)) == NULL)
 180                 return;
 181 
 182         /*
 183          * If the process is already being tracked, update the unscannable flag,
 184          * as determined by the caller, from the process's psinfo.
 185          */
 186         lproc = lcol->lcol_lprocess;
 187         while (lproc != NULL) {
 188                 if (lproc->lpc_pid == pid) {
 189                         lproc->lpc_mark = 1;
 190                         if (unscannable != 0 && lproc->lpc_unscannable == 0) {
 191                                 debug("process %d: became unscannable\n",
 192                                     (int)lproc->lpc_pid);
 193                                 lproc->lpc_unscannable = 1;
 194                         }
 195                         return;
 196                 }
 197                 lproc = lproc->lpc_next;
 198         }
 199 
 200         /*
 201          * We've fallen off the list without finding our current process;
 202          * insert it at the list head.
 203          */
 204         if ((lproc = malloc(sizeof (*lproc))) == NULL)
 205                 debug("insufficient memory to track new process %d", (int)pid);
 206         else {
 207                 (void) bzero(lproc, sizeof (*lproc));
 208                 lproc->lpc_pid = pid;
 209                 lproc->lpc_mark = 1;
 210                 lproc->lpc_collection = lcol;
 211                 lproc->lpc_psinfo_fd = -1;
 212                 lproc->lpc_pgdata_fd = -1;
 213                 lproc->lpc_xmap_fd = -1;
 214 
 215                 /*
 216                  * If the caller didn't flag this process as unscannable
 217                  * already, do some more checking.
 218                  */
 219                 lproc->lpc_unscannable = unscannable || proc_issystem(pid);
 220 
 221 #ifdef DEBUG
 222                 /*
 223                  * Verify the sanity of lprocess.  It should not contain the
 224                  * process we are about to prepend.
 225                  */
 226                 if (lcollection_member(lcol, lproc)) {
 227                         lprocess_t *cur = lcol->lcol_lprocess;
 228                         debug("The collection %lld already has these members, "
 229                             "including me, %d!\n",
 230                             (long long)lcol->lcol_id.rcid_val,
 231                             (int)lproc->lpc_pid);
 232                         while (cur != NULL) {
 233                                 debug("\t%d\n", (int)cur->lpc_pid);
 234                                 cur = cur->lpc_next;
 235                         }
 236                         info(gettext("process already on lprocess\n"));
 237                         abort();
 238                 }
 239 #endif /* DEBUG */
 240                 lproc->lpc_next = lcol->lcol_lprocess;
 241                 if (lproc->lpc_next != NULL)
 242                         lproc->lpc_next->lpc_prev = lproc;
 243                 lproc->lpc_prev = NULL;
 244                 lcol->lcol_lprocess = lproc;
 245 
 246                 debug("tracking %s %ld %d %s%s\n",
 247                     (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
 248                     (long)colid.rcid_val,
 249                     (int)pid, psinfop->pr_psargs,
 250                     (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
 251                 lcol->lcol_stat.lcols_proc_in++;
 252         }
 253 }
 254 
 255 static int
 256 list_walk_process_cb(lcollection_t *lcol, void *arg)
 257 {
 258         int (*cb)(lcollection_t *, lprocess_t *) =
 259             (int(*)(lcollection_t *, lprocess_t *))arg;
 260         lprocess_t *member;
 261         lprocess_t *next;
 262 
 263         member = lcol->lcol_lprocess;
 264         while (member != NULL) {
 265                 pid_t pid = member->lpc_pid;
 266                 next = member->lpc_next;
 267 
 268                 debug_high("list_walk_all lpc %d\n", (int)pid);
 269                 if (cb(lcol, member) != 0) {
 270                         debug_high("list_walk_all aborted at lpc %d\n",
 271                             (int)pid);
 272                         return (1);
 273                 }
 274                 member = next;
 275         }
 276 
 277         return (0);
 278 }
 279 
 280 /*
 281  * Invoke the given callback for each process in each collection.  Callbacks
 282  * are allowed to change the linkage of the process on which they act.
 283  */
 284 static void
 285 list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
 286 {
 287         list_walk_collection(list_walk_process_cb, (void *)cb);
 288 }
 289 
 290 static void
 291 revoke_psinfo(rfd_t *rfd)
 292 {
 293         lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;
 294 
 295         if (lpc != NULL) {
 296                 debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
 297                 ASSERT(lpc->lpc_psinfo_fd != -1);
 298                 lpc->lpc_psinfo_fd = -1;
 299         } else
 300                 debug("revoking psinfo fd for unknown process\n");
 301 }
 302 
 303 /*
 304  * Retrieve a process's psinfo via an already-opened or new file descriptor.
 305  * The supplied descriptor will be closed on failure.  An optional callback
 306  * will be invoked with the last descriptor tried, and a supplied callback
 307  * argument, as its arguments, such that the new descriptor may be cached, or
 308  * an old one may be invalidated.  If the result of the callback is zero, the
 309  * the caller is to assume responsibility for the file descriptor, to close it
 310  * with rfd_close().
 311  *
 312  * On failure, a nonzero value is returned.
 313  */
 314 int
 315 get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
 316     int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
 317 {
 318         int fd;
 319         int can_try_uncached;
 320 
 321         ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
 322 
 323         do {
 324                 if (cached_fd >= 0) {
 325                         fd = cached_fd;
 326                         can_try_uncached = 1;
 327                         debug_high("%d/psinfo, trying cached fd %d\n",
 328                             (int)pid, fd);
 329                 } else {
 330                         char pathbuf[PROC_PATH_MAX];
 331 
 332                         can_try_uncached = 0;
 333                         (void) snprintf(pathbuf, sizeof (pathbuf),
 334                             "/proc/%d/psinfo", (int)pid);
 335                         if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
 336                             revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
 337                                 debug("cannot open %s", pathbuf);
 338                                 break;
 339                         } else
 340                                 debug_high("opened %s, fd %d\n", pathbuf, fd);
 341                 }
 342 
 343                 if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
 344                     sizeof (*psinfo) && psinfo->pr_pid == pid)
 345                         break;
 346                 else {
 347                         debug_high("closed fd %d\n", fd);
 348                         if (rfd_close(fd) != 0)
 349                                 debug("could not close fd %d", fd);
 350                         fd = cached_fd = -1;
 351                 }
 352         } while (can_try_uncached == 1);
 353 
 354         if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
 355                 if (fd >= 0) {
 356                         debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
 357                             "uncached" : "cached", fd);
 358                         if (rfd_close(fd) != 0)
 359                                 debug("could not close fd %d", fd);
 360                 }
 361 
 362         debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
 363             fd_update_cb != NULL ? "cached" : "uncached");
 364         return ((fd >= 0) ? 0 : -1);
 365 }
 366 
 367 /*
 368  * Retrieve the collection membership of all processes and update the psinfo of
 369  * those non-system, non-zombie ones in collections.  For global zone processes,
 370  * we first attempt to put the process into a capped project collection.  We
 371  * also want to track the process for the global zone collection as a whole.
 372  */
 373 static void
 374 proc_cb(const pid_t pid)
 375 {
 376         psinfo_t psinfo;
 377 
 378         if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) {
 379                 lprocess_insert_mark(&psinfo);
 380                 if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) {
 381                         /*
 382                          * We also want to track this process for the global
 383                          * zone as a whole so add it to the global zone
 384                          * collection as well.
 385                          */
 386                         psinfo.pr_projid = -1;
 387                         lprocess_insert_mark(&psinfo);
 388                 }
 389         }
 390 }
 391 
 392 /*
 393  * Cache the process' psinfo fd, taking responsibility for freeing it.
 394  */
 395 int
 396 lprocess_update_psinfo_fd_cb(void *arg, int fd)
 397 {
 398         lprocess_t *lpc = arg;
 399 
 400         lpc->lpc_psinfo_fd = fd;
 401         return (0);
 402 }
 403 
 404 /*
 405  * Get the system pagesize.
 406  */
 407 static void
 408 get_page_size(void)
 409 {
 410         page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
 411         debug("physical page size: %luKB\n", page_size_kb);
 412 }
 413 
 414 static void
 415 tm_fmt(char *msg, hrtime_t t1, hrtime_t t2)
 416 {
 417         hrtime_t diff = t2 - t1;
 418 
 419         if (diff < MILLISEC)
 420                 debug("%s: %lld nanoseconds\n", msg, diff);
 421         else if (diff < MICROSEC)
 422                 debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC);
 423         else if (diff < NANOSEC)
 424                 debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC);
 425         else
 426                 debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC);
 427 }
 428 
 429 /*
 430  * Get the zone's & project's RSS from the kernel.
 431  */
 432 static void
 433 rss_sample(boolean_t my_zone_only, uint_t col_types)
 434 {
 435         size_t nres;
 436         size_t i;
 437         uint_t flags;
 438         hrtime_t t1, t2;
 439 
 440         if (my_zone_only) {
 441                 flags = VMUSAGE_ZONE;
 442         } else {
 443                 flags = 0;
 444                 if (col_types & CAPPED_PROJECT)
 445                         flags |= VMUSAGE_PROJECTS;
 446                 if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID)
 447                         flags |= VMUSAGE_ALL_ZONES;
 448         }
 449 
 450         debug("vmusage sample flags 0x%x\n", flags);
 451         if (flags == 0)
 452                 return;
 453 
 454 again:
 455         /* try the current buffer to see if the list will fit */
 456         nres = vmu_vals_len;
 457         t1 = gethrtime();
 458         if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval,
 459             vmu_vals, &nres) != 0) {
 460                 if (errno != EOVERFLOW) {
 461                         warn(gettext("can't read RSS from kernel\n"));
 462                         return;
 463                 }
 464         }
 465         t2 = gethrtime();
 466         tm_fmt("getvmusage time", t1, t2);
 467 
 468         debug("kernel nres %lu\n", (ulong_t)nres);
 469 
 470         if (nres > vmu_vals_len) {
 471                 /* array size is now too small, increase it and try again */
 472                 free(vmu_vals);
 473 
 474                 if ((vmu_vals = (vmusage_t *)calloc(nres,
 475                     sizeof (vmusage_t))) == NULL) {
 476                         warn(gettext("out of memory: could not read RSS from "
 477                             "kernel\n"));
 478                         vmu_vals_len = nvmu_vals = 0;
 479                         return;
 480                 }
 481                 vmu_vals_len = nres;
 482                 goto again;
 483         }
 484 
 485         nvmu_vals = nres;
 486 
 487         debug("vmusage_sample\n");
 488         for (i = 0; i < nvmu_vals; i++) {
 489                 debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), "
 490                     "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id,
 491                     vmu_vals[i].vmu_type,
 492                     (unsigned long long)vmu_vals[i].vmu_rss_all,
 493                     (unsigned long long)vmu_vals[i].vmu_rss_all / 1024,
 494                     (unsigned long long)vmu_vals[i].vmu_swap_all);
 495         }
 496 }
 497 
 498 static void
 499 update_col_rss(lcollection_t *lcol)
 500 {
 501         int i;
 502 
 503         lcol->lcol_rss = 0;
 504         lcol->lcol_image_size = 0;
 505 
 506         for (i = 0; i < nvmu_vals; i++) {
 507                 if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val)
 508                         continue;
 509 
 510                 if (vmu_vals[i].vmu_type == VMUSAGE_ZONE &&
 511                     lcol->lcol_id.rcid_type != RCIDT_ZONE)
 512                         continue;
 513 
 514                 if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS &&
 515                     lcol->lcol_id.rcid_type != RCIDT_PROJECT)
 516                         continue;
 517 
 518                 /* we found the right RSS entry, update the collection vals */
 519                 lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024;
 520                 lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024;
 521                 break;
 522         }
 523 }
 524 
 525 /*
 526  * Sample the collection RSS, updating the collection's statistics with the
 527  * results.  Also, sum the rss of all capped projects & return true if
 528  * the collection is over cap.
 529  */
 530 static int
 531 rss_sample_col_cb(lcollection_t *lcol, void *arg)
 532 {
 533         int64_t excess;
 534         uint64_t rss;
 535         sample_col_arg_t *col_argp = (sample_col_arg_t *)arg;
 536 
 537         update_col_rss(lcol);
 538 
 539         lcol->lcol_stat.lcols_rss_sample++;
 540         rss = lcol->lcol_rss;
 541         excess = rss - lcol->lcol_rss_cap;
 542         if (excess > 0) {
 543                 lcol->lcol_stat.lcols_rss_act_sum += rss;
 544                 col_argp->sca_any_over_cap = B_TRUE;
 545                 if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
 546                         col_argp->sca_project_over_cap = B_TRUE;
 547         }
 548         lcol->lcol_stat.lcols_rss_sum += rss;
 549 
 550         if (lcol->lcol_stat.lcols_min_rss > rss)
 551                 lcol->lcol_stat.lcols_min_rss = rss;
 552         if (lcol->lcol_stat.lcols_max_rss < rss)
 553                 lcol->lcol_stat.lcols_max_rss = rss;
 554 
 555         return (0);
 556 }
 557 
 558 /*
 559  * Determine if we have capped projects, capped zones or both.
 560  */
 561 static int
 562 col_type_cb(lcollection_t *lcol, void *arg)
 563 {
 564         uint_t *col_type = (uint_t *)arg;
 565 
 566         /* skip uncapped collections */
 567         if (lcol->lcol_rss_cap == 0)
 568                 return (1);
 569 
 570         if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
 571                 *col_type |= CAPPED_PROJECT;
 572         else
 573                 *col_type |= CAPPED_ZONE;
 574 
 575         /* once we know everything is capped, we can stop looking */
 576         if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT))
 577                 return (1);
 578 
 579         return (0);
 580 }
 581 
 582 /*
 583  * Open /proc and walk entries.
 584  */
 585 static void
 586 proc_walk_all(void (*cb)(const pid_t))
 587 {
 588         DIR *pdir;
 589         struct dirent *dirent;
 590         pid_t pid;
 591 
 592         (void) rfd_reserve(1);
 593         if ((pdir = opendir("/proc")) == NULL)
 594                 die(gettext("couldn't open /proc!"));
 595 
 596         while ((dirent = readdir(pdir)) != NULL) {
 597                 if (strcmp(".", dirent->d_name) == 0 ||
 598                     strcmp("..", dirent->d_name) == 0)
 599                         continue;
 600                 pid = atoi(dirent->d_name);
 601                 ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
 602                 if (pid == rcapd_pid)
 603                         continue;
 604                 else
 605                         cb(pid);
 606         }
 607         (void) closedir(pdir);
 608 }
 609 
 610 /*
 611  * Clear unmarked callback.
 612  */
 613 /*ARGSUSED*/
 614 static int
 615 sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
 616 {
 617         if (lpc->lpc_mark) {
 618                 lpc->lpc_mark = 0;
 619         } else {
 620                 debug("process %d finished\n", (int)lpc->lpc_pid);
 621                 lprocess_free(lpc);
 622         }
 623 
 624         return (0);
 625 }
 626 
 627 /*
 628  * Print, for debugging purposes, a collection's recently-sampled RSS and
 629  * excess.
 630  */
 631 /*ARGSUSED*/
 632 static int
 633 excess_print_cb(lcollection_t *lcol, void *arg)
 634 {
 635         int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
 636 
 637         debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
 638             (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
 639             lcol->lcol_name,
 640             (unsigned long long)lcol->lcol_rss,
 641             (unsigned long long)lcol->lcol_rss_cap,
 642             (long long)excess);
 643 
 644         return (0);
 645 }
 646 
 647 /*
 648  * Scan those collections which have exceeded their caps.
 649  *
 650  * If we're running in the global zone it might have a cap.  We don't want to
 651  * do any capping for the global zone yet since we might get under the cap by
 652  * just capping the projects in the global zone.
 653  */
 654 /*ARGSUSED*/
 655 static int
 656 scan_cb(lcollection_t *lcol, void *arg)
 657 {
 658         int64_t excess;
 659 
 660         /* skip over global zone collection for now but keep track for later */
 661         if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
 662             lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
 663                 gz_col = lcol;
 664                 return (0);
 665         }
 666 
 667         if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
 668                 scan(lcol, excess);
 669                 lcol->lcol_stat.lcols_scan++;
 670         }
 671 
 672         return (0);
 673 }
 674 
 675 /*
 676  * Scan the global zone collection and see if it still exceeds its cap.
 677  * We take into account the effects of capping any global zone projects here.
 678  */
 679 static void
 680 scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
 681 {
 682         int64_t excess;
 683 
 684         /*
 685          * If we had projects over their cap and the global zone was also over
 686          * its cap then we need to get the up-to-date global zone rss to
 687          * determine if we are still over the global zone cap.  We might have
 688          * gone under while we scanned the capped projects.  If there were no
 689          * projects over cap then we can use the rss value we already have for
 690          * the global zone.
 691          */
 692         excess = lcol->lcol_rss - lcol->lcol_rss_cap;
 693         if (project_over_cap && excess > 0) {
 694                 rss_sample(B_TRUE, CAPPED_ZONE);
 695                 update_col_rss(lcol);
 696                 excess = lcol->lcol_rss - lcol->lcol_rss_cap;
 697         }
 698 
 699         if (excess > 0) {
 700                 debug("global zone excess %lldKB\n", (long long)excess);
 701                 scan(lcol, excess);
 702                 lcol->lcol_stat.lcols_scan++;
 703         }
 704 }
 705 
 706 /*
 707  * Do a soft scan of those collections which have excesses.  A soft scan is one
 708  * in which the cap enforcement pressure is taken into account.  The difference
 709  * between the utilized physical memory and the cap enforcement pressure will
 710  * be scanned-for, and each collection will be scanned proportionally by their
 711  * present excesses.
 712  */
 713 static int
 714 soft_scan_cb(lcollection_t *lcol, void *a)
 715 {
 716         int64_t excess;
 717         soft_scan_arg_t *arg = a;
 718 
 719         /* skip over global zone collection for now but keep track for later */
 720         if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
 721             lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
 722                 gz_col = lcol;
 723                 return (0);
 724         }
 725 
 726         if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
 727                 int64_t adjusted_excess =
 728                     excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
 729 
 730                 debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
 731                     "scanning %lld\n",
 732                     (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
 733                     "project" : "zone"),
 734                     (long)lcol->lcol_id.rcid_val,
 735                     (long long)excess, (long long)arg->ssa_scan_goal,
 736                     (unsigned long long)arg->ssa_sum_excess,
 737                     (long long)adjusted_excess);
 738 
 739                 scan(lcol, adjusted_excess);
 740                 lcol->lcol_stat.lcols_scan++;
 741         }
 742 
 743         return (0);
 744 }
 745 
 746 static void
 747 soft_scan_gz(lcollection_t *lcol, void *a)
 748 {
 749         int64_t excess;
 750         soft_scan_arg_t *arg = a;
 751 
 752         /*
 753          * If we had projects over their cap and the global zone was also over
 754          * its cap then we need to get the up-to-date global zone rss to
 755          * determine if we are still over the global zone cap.  We might have
 756          * gone under while we scanned the capped projects.  If there were no
 757          * projects over cap then we can use the rss value we already have for
 758          * the global zone.
 759          */
 760         excess = lcol->lcol_rss - lcol->lcol_rss_cap;
 761         if (arg->ssa_project_over_cap && excess > 0) {
 762                 rss_sample(B_TRUE, CAPPED_ZONE);
 763                 update_col_rss(lcol);
 764                 excess = lcol->lcol_rss - lcol->lcol_rss_cap;
 765         }
 766 
 767         if (excess > 0) {
 768                 int64_t adjusted_excess =
 769                     excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
 770 
 771                 debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
 772                     "scanning %lld\n",
 773                     (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
 774                     "project" : "zone"),
 775                     (long)lcol->lcol_id.rcid_val,
 776                     (long long)excess, (long long)arg->ssa_scan_goal,
 777                     (unsigned long long)arg->ssa_sum_excess,
 778                     (long long)adjusted_excess);
 779 
 780                 scan(lcol, adjusted_excess);
 781                 lcol->lcol_stat.lcols_scan++;
 782         }
 783 }
 784 
 785 /*
 786  * When a scan could happen, but caps aren't enforced tick the
 787  * lcols_unenforced_cap counter.
 788  */
 789 /*ARGSUSED*/
 790 static int
 791 unenforced_cap_cb(lcollection_t *lcol, void *arg)
 792 {
 793         lcol->lcol_stat.lcols_unenforced_cap++;
 794 
 795         return (0);
 796 }
 797 
 798 /*
 799  * Update the count of physically installed memory.
 800  */
 801 static void
 802 update_phys_total(void)
 803 {
 804         uint64_t old_phys_total;
 805 
 806         old_phys_total = phys_total;
 807         phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
 808         if (phys_total != old_phys_total)
 809                 debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
 810                     "" : " adjusted"), (unsigned long long)(phys_total / 1024));
 811 }
 812 
 813 /*
 814  * Unlink a process from its collection, updating relevant statistics, and
 815  * freeing its associated memory.
 816  */
 817 void
 818 lprocess_free(lprocess_t *lpc)
 819 {
 820         pid_t pid;
 821 
 822         lpc->lpc_collection->lcol_stat.lcols_proc_out++;
 823 
 824         if (lpc->lpc_prev != NULL)
 825                 lpc->lpc_prev->lpc_next = lpc->lpc_next;
 826         if (lpc->lpc_next != NULL)
 827                 lpc->lpc_next->lpc_prev = lpc->lpc_prev;
 828         if (lpc->lpc_collection->lcol_lprocess == lpc)
 829                 lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
 830                     lpc ? lpc->lpc_next : NULL);
 831         lpc->lpc_next = lpc->lpc_prev = NULL;
 832 
 833         if (lpc->lpc_prpageheader != NULL)
 834                 free(lpc->lpc_prpageheader);
 835         if (lpc->lpc_xmap != NULL)
 836                 free(lpc->lpc_xmap);
 837         if (lpc->lpc_psinfo_fd >= 0) {
 838                 if (rfd_close(lpc->lpc_psinfo_fd) != 0)
 839                         debug("could not close %d lpc_psinfo_fd %d",
 840                             (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
 841                 lpc->lpc_psinfo_fd = -1;
 842         }
 843         if (lpc->lpc_pgdata_fd >= 0) {
 844                 if (rfd_close(lpc->lpc_pgdata_fd) != 0)
 845                         debug("could not close %d lpc_pgdata_fd %d",
 846                             (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
 847                 lpc->lpc_pgdata_fd = -1;
 848         }
 849         if (lpc->lpc_xmap_fd >= 0) {
 850                 if (rfd_close(lpc->lpc_xmap_fd) != 0)
 851                         debug("could not close %d lpc_xmap_fd %d",
 852                             (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
 853                 lpc->lpc_xmap_fd = -1;
 854         }
 855         if (lpc->lpc_ignore != NULL)
 856                 lmapping_free(&lpc->lpc_ignore);
 857         pid = lpc->lpc_pid;
 858         free(lpc);
 859         debug_high("process %d freed\n", (int)pid);
 860 }
 861 
 862 /*
 863  * Collection clear callback.
 864  */
 865 /*ARGSUSED*/
 866 static int
 867 collection_clear_cb(lcollection_t *lcol, void *arg)
 868 {
 869         lcol->lcol_mark = 0;
 870 
 871         return (0);
 872 }
 873 
 874 /*
 875  * Respond to a terminating signal by setting a termination flag.
 876  */
 877 /*ARGSUSED*/
 878 static void
 879 terminate_signal(int signal)
 880 {
 881         if (termination_signal == 0)
 882                 termination_signal = signal;
 883         should_run = 0;
 884 }
 885 
 886 /*
 887  * Handle any synchronous or asynchronous signals that would ordinarily cause a
 888  * process to abort.
 889  */
 890 /*ARGSUSED*/
 891 static void
 892 abort_signal(int signal)
 893 {
 894         /*
 895          * Allow the scanner to make a last-ditch effort to resume any stopped
 896          * processes.
 897          */
 898         scan_abort();
 899         abort();
 900 }
 901 
 902 /*
 903  * Clean up collections which have been removed due to configuration.  Unlink
 904  * the collection from lcollection and free it.
 905  */
 906 /*ARGSUSED*/
 907 static int
 908 collection_sweep_cb(lcollection_t *lcol, void *arg)
 909 {
 910         if (lcol->lcol_mark == 0) {
 911                 debug("freeing %s %s\n",
 912                     (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
 913                     "project" : "zone"), lcol->lcol_name);
 914                 lcollection_free(lcol);
 915         }
 916 
 917         return (0);
 918 }
 919 
 920 /*
 921  * Set those variables which depend on the global configuration.
 922  */
 923 static void
 924 finish_configuration(void)
 925 {
 926         /*
 927          * Warn that any lnode (or non-project) mode specification (by an SRM
 928          * 1.3 configuration file, for example) is ignored.
 929          */
 930         if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
 931                 warn(gettext("%s mode specification ignored -- using project"
 932                     " mode\n"), rcfg.rcfg_mode_name);
 933                 rcfg.rcfg_mode_name = "project";
 934                 rcfg.rcfg_mode = rctype_project;
 935         }
 936 }
 937 
 938 /*
 939  * Cause the configuration to be reread and applied.
 940  */
 941 static void
 942 reread_configuration(void)
 943 {
 944         rcfg_t rcfg_new;
 945 
 946         if (rcfg_read(&rcfg_new, update_statistics) != E_SUCCESS) {
 947                 warn(gettext("can't reread configuration \n"));
 948                 exit(SMF_EXIT_ERR_CONFIG);
 949         } else {
 950                 /*
 951                  * Done reading configuration.  Remove existing
 952                  * collections in case there is a change in collection type.
 953                  */
 954                 if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
 955                         list_walk_collection(collection_clear_cb, NULL);
 956                         list_walk_collection(collection_sweep_cb, NULL);
 957                 }
 958 
 959                 /*
 960                  * Make the newly-read configuration the global one, and update
 961                  * any variables that depend on it.
 962                  */
 963                 rcfg = rcfg_new;
 964                 finish_configuration();
 965         }
 966 }
 967 
 968 /*
 969  * First, examine changes, additions, and deletions to cap definitions.
 970  * Then, set the next event time.
 971  */
 972 static void
 973 reconfigure(hrtime_t now, hrtime_t *next_configuration,
 974     hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
 975 {
 976         debug("reconfigure...\n");
 977 
 978         /*
 979          * Walk the lcollection, marking active collections so inactive ones
 980          * can be freed.
 981          */
 982         list_walk_collection(collection_clear_cb, NULL);
 983         lcollection_update(LCU_ACTIVE_ONLY); /* mark */
 984         list_walk_collection(collection_sweep_cb, NULL);
 985 
 986         *next_configuration = NEXT_EVENT_TIME(now,
 987             rcfg.rcfg_reconfiguration_interval);
 988 
 989         /*
 990          * Reset each event time to the shorter of the previous and new
 991          * intervals.
 992          */
 993         if (next_report == 0 && rcfg.rcfg_report_interval > 0)
 994                 next_report = now;
 995         else
 996                 next_report = POSITIVE_MIN(next_report,
 997                     NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));
 998 
 999         if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
1000                 *next_proc_walk = now;
1001         else
1002                 *next_proc_walk = POSITIVE_MIN(*next_proc_walk,
1003                     NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));
1004 
1005         if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
1006                 *next_rss_sample = now;
1007         else
1008                 *next_rss_sample = POSITIVE_MIN(*next_rss_sample,
1009                     NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
1010 }
1011 
1012 /*
1013  * Respond to SIGHUP by triggering the rereading the configuration and cap
1014  * definitions.
1015  */
1016 /*ARGSUSED*/
1017 static void
1018 sighup(int signal)
1019 {
1020         should_reconfigure = 1;
1021 }
1022 
1023 /*
1024  * Print, for debugging purposes, each collection's interval statistics.
1025  */
1026 /*ARGSUSED*/
1027 static int
1028 simple_report_collection_cb(lcollection_t *lcol, void *arg)
1029 {
1030 #define DELTA(field) \
1031         (unsigned long long)( \
1032             (lcol->lcol_stat.field - lcol->lcol_stat_old.field))
1033 
1034         debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
1035             "ineffective/scans/unenforced/samplings:  %llu/%llu/%llu/%llu, RSS "
1036             "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
1037             "%llu scans over %llu ms\n",
1038             (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
1039             lcol->lcol_name,
1040             DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
1041             DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
1042             DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
1043             (unsigned long long)lcol->lcol_stat.lcols_min_rss,
1044             (unsigned long long)lcol->lcol_stat.lcols_max_rss,
1045             (unsigned long long)lcol->lcol_rss_cap,
1046             (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
1047             lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
1048             DELTA(lcols_scan_count), DELTA(lcols_scan_time_complete) / (NANOSEC
1049             / MILLISEC));
1050 
1051 #undef DELTA
1052 
1053         return (0);
1054 }
1055 
1056 /*
1057  * Record each collection's interval statistics in the statistics file.
1058  */
1059 static int
1060 report_collection_cb(lcollection_t *lcol, void *arg)
1061 {
1062         lcollection_report_t dc;
1063         int fd = (intptr_t)arg;
1064 
1065         /*
1066          * Copy the relevant fields to the collection's record.
1067          */
1068         bzero(&dc, sizeof (dc));
1069         dc.lcol_id = lcol->lcol_id;
1070         (void) strcpy(dc.lcol_name, lcol->lcol_name);
1071         dc.lcol_rss = lcol->lcol_rss;
1072         dc.lcol_image_size = lcol->lcol_image_size;
1073         dc.lcol_rss_cap = lcol->lcol_rss_cap;
1074         dc.lcol_stat = lcol->lcol_stat;
1075 
1076         if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
1077                 lcol->lcol_stat_old = lcol->lcol_stat;
1078         } else {
1079                 debug("can't write %s %s statistics",
1080                     (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
1081                     "project" : "zone"),
1082                     lcol->lcol_name);
1083         }
1084 
1085         return (0);
1086 }
1087 
1088 /*
1089  * Determine the count of pages scanned by the global page scanner, obtained
1090  * from the cpu_stat:*::scan kstats.  Return zero on success.
1091  */
1092 static int
1093 get_globally_scanned_pages(uint64_t *scannedp)
1094 {
1095         kstat_t *ksp;
1096         uint64_t scanned = 0;
1097 
1098         if (kstat_chain_update(kctl) == -1) {
1099                 warn(gettext("can't update kstat chain"));
1100                 return (0);
1101         }
1102 
1103         for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
1104                 if (strcmp(ksp->ks_module, "cpu_stat") == 0) {
1105                         if (kstat_read(kctl, ksp, NULL) != -1) {
1106                                 scanned += ((cpu_stat_t *)
1107                                     ksp->ks_data)->cpu_vminfo.scan;
1108                         } else {
1109                                 return (-1);
1110                         }
1111                 }
1112         }
1113 
1114         *scannedp = scanned;
1115         return (0);
1116 }
1117 
1118 /*
1119  * Determine if the global page scanner is running, during which no memory
1120  * caps should be enforced, to prevent interference with the global page
1121  * scanner.
1122  */
1123 static boolean_t
1124 is_global_scanner_running()
1125 {
1126         /* measure delta in page scan count */
1127         static uint64_t new_sp = 0;
1128         static uint64_t old_sp = 0;
1129         boolean_t res = B_FALSE;
1130 
1131         if (get_globally_scanned_pages(&new_sp) == 0) {
1132                 if (old_sp != 0 && (new_sp - old_sp) > 0) {
1133                         debug("global memory pressure detected (%llu "
1134                             "pages scanned since last interval)\n",
1135                             (unsigned long long)(new_sp - old_sp));
1136                         res = B_TRUE;
1137                 }
1138                 old_sp = new_sp;
1139         } else {
1140                 warn(gettext("unable to read cpu statistics"));
1141                 new_sp = old_sp;
1142         }
1143 
1144         return (res);
1145 }
1146 
1147 /*
1148  * If soft caps are in use, determine if global memory pressure exceeds the
1149  * configured maximum above which soft caps are enforced.
1150  */
1151 static boolean_t
1152 must_enforce_soft_caps()
1153 {
1154         /*
1155          * Check for changes to the amount of installed physical memory, to
1156          * compute the current memory pressure.
1157          */
1158         update_phys_total();
1159 
1160         memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb)
1161             * 100.0 / phys_total);
1162         memory_pressure_sample++;
1163         if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 &&
1164             memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) {
1165                 return (B_TRUE);
1166         }
1167 
1168         return (B_FALSE);
1169 }
1170 
1171 /*
1172  * Update the shared statistics file with each collection's current statistics.
1173  * Return zero on success.
1174  */
1175 static int
1176 update_statistics(void)
1177 {
1178         int fd, res;
1179         static char template[LINELEN];
1180 
1181         /*
1182          * Try to create a directory irrespective of whether it is existing
1183          * or not. If it is not there then it will create. Otherwise any way
1184          * it will fail at mkstemp call below.
1185          */
1186         (void) mkdir(STAT_FILE_DIR, 0755);
1187 
1188         /*
1189          * Create a temporary file.
1190          */
1191         if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
1192             strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
1193                 debug("temporary file template size too small\n");
1194                 return (-1);
1195         }
1196         (void) strcpy(template, rcfg.rcfg_stat_file);
1197         (void) strcat(template, STAT_TEMPLATE_SUFFIX);
1198         (void) rfd_reserve(1);
1199         fd = mkstemp(template);
1200 
1201         /*
1202          * Write the header and per-collection statistics.
1203          */
1204         if (fd >= 0) {
1205                 rcapd_stat_hdr_t rs;
1206 
1207                 rs.rs_pid = rcapd_pid;
1208                 rs.rs_time = gethrtime();
1209                 ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
1210                 (void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
1211                 rs.rs_pressure_cur = memory_pressure;
1212                 rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
1213                 rs.rs_pressure_sample = memory_pressure_sample;
1214 
1215                 if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
1216                     sizeof (rs)) {
1217                         list_walk_collection(report_collection_cb,
1218                             (void *)(intptr_t)fd);
1219                         /*
1220                          * Replace the existing statistics file with this new
1221                          * one.
1222                          */
1223                         res = rename(template, rcfg.rcfg_stat_file);
1224                 } else
1225                         res = -1;
1226                 (void) close(fd);
1227         } else
1228                 res = -1;
1229 
1230         return (res);
1231 }
1232 
1233 /*
1234  * Verify the statistics file can be created and written to, and die if an
1235  * existing file may be in use by another rcapd.
1236  */
1237 static int
1238 verify_statistics(void)
1239 {
1240         pid_t pid;
1241 
1242         /*
1243          * Warn if another instance of rcapd might be active.
1244          */
1245         (void) rfd_reserve(1);
1246         pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
1247         if (pid != rcapd_pid && pid != -1)
1248                 die(gettext("%s exists; rcapd may already be active\n"),
1249                     rcfg.rcfg_stat_file);
1250 
1251         return (update_statistics());
1252 }
1253 
1254 static int
1255 sum_excess_cb(lcollection_t *lcol, void *arg)
1256 {
1257         uint64_t *sum_excess = arg;
1258 
1259         *sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss -
1260             lcol->lcol_rss_cap));
1261         return (0);
1262 }
1263 
1264 /*
1265  * Compute the quantity of memory (in kilobytes) above the cap enforcement
1266  * pressure.  Set the scan goal to that quantity (or at most the excess).
1267  */
1268 static void
1269 compute_soft_scan_goal(soft_scan_arg_t *argp)
1270 {
1271         /*
1272          * Compute the sum of the collections' excesses, which will be the
1273          * denominator.
1274          */
1275         argp->ssa_sum_excess = 0;
1276         list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess));
1277 
1278         argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) *
1279             (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 -
1280             sysconf(_SC_AVPHYS_PAGES)) * page_size_kb,
1281             argp->ssa_sum_excess);
1282 }
1283 
/*
 * Print the command-line usage message.
 */
static void
rcapd_usage(void)
{
        char *text = gettext("usage: rcapd [-d]\n");

        info(text);
}
1289 
1290 void
1291 check_update_statistics(void)
1292 {
1293         hrtime_t now = gethrtime();
1294 
1295         if (EVENT_TIME(now, next_report)) {
1296                 debug("updating statistics...\n");
1297                 list_walk_collection(simple_report_collection_cb, NULL);
1298                 if (update_statistics() != 0)
1299                         debug("couldn't update statistics");
1300                 next_report = NEXT_REPORT_EVENT_TIME(now,
1301                     rcfg.rcfg_report_interval);
1302         }
1303 }
1304 
1305 static void
1306 verify_and_set_privileges(void)
1307 {
1308         priv_set_t *required =
1309             priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);
1310 
1311         /*
1312          * Ensure the required privileges, suitable for controlling processes,
1313          * are possessed.
1314          */
1315         if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
1316             PRIV_SET, PRIV_EFFECTIVE, required) != 0)
1317                 die(gettext("can't set requisite privileges"));
1318 
1319         /*
1320          * Ensure access to /var/run/daemon.
1321          */
1322         if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
1323                 die(gettext("cannot become user daemon"));
1324 
1325         priv_freeset(required);
1326 }
1327 
1328 /*
1329  * This function does the top-level work to determine if we should do any
1330  * memory capping, and if so, it invokes the right call-backs to do the work.
1331  */
1332 static void
1333 do_capping(hrtime_t now, hrtime_t *next_proc_walk)
1334 {
1335         boolean_t enforce_caps;
1336         /* soft cap enforcement flag, depending on memory pressure */
1337         boolean_t enforce_soft_caps;
1338         /* avoid interference with kernel's page scanner */
1339         boolean_t global_scanner_running;
1340         sample_col_arg_t col_arg;
1341         soft_scan_arg_t arg;
1342         uint_t col_types = 0;
1343 
1344         /* check what kind of collections (project/zone) are capped */
1345         list_walk_collection(col_type_cb, &col_types);
1346         debug("collection types: 0x%x\n", col_types);
1347 
1348         /* no capped collections, skip checking rss */
1349         if (col_types == 0)
1350                 return;
1351 
1352         /* Determine if soft caps are enforced. */
1353         enforce_soft_caps = must_enforce_soft_caps();
1354 
1355         /* Determine if the global page scanner is running. */
1356         global_scanner_running = is_global_scanner_running();
1357 
1358         /*
1359          * Sample collections' member processes RSSes and recompute
1360          * collections' excess.
1361          */
1362         rss_sample(B_FALSE, col_types);
1363 
1364         col_arg.sca_any_over_cap = B_FALSE;
1365         col_arg.sca_project_over_cap = B_FALSE;
1366         list_walk_collection(rss_sample_col_cb, &col_arg);
1367         list_walk_collection(excess_print_cb, NULL);
1368         debug("any collection/project over cap = %d, %d\n",
1369             col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);
1370 
1371         if (enforce_soft_caps)
1372                 debug("memory pressure %d%%\n", memory_pressure);
1373 
1374         /*
1375          * Cap enforcement is determined by the previous conditions.
1376          */
1377         enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
1378             (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
1379             enforce_soft_caps);
1380 
1381         debug("%senforcing caps\n", enforce_caps ? "" : "not ");
1382 
1383         /*
1384          * If soft caps are in use, determine the size of the portion from each
1385          * collection to scan for.
1386          */
1387         if (enforce_caps && enforce_soft_caps)
1388                 compute_soft_scan_goal(&arg);
1389 
1390         /*
1391          * Victimize offending collections.
1392          */
1393         if (enforce_caps && (!enforce_soft_caps ||
1394             (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {
1395 
1396                 /*
1397                  * Since at least one collection is over its cap & needs
1398                  * enforcing, check if it is at least time for a process walk
1399                  * (we could be well past time since we only walk /proc when
1400                  * we need to) and if so, update each collections process list
1401                  * in a single pass through /proc.
1402                  */
1403                 if (EVENT_TIME(now, *next_proc_walk)) {
1404                         debug("scanning process list...\n");
1405                         proc_walk_all(proc_cb);          /* insert & mark */
1406                         list_walk_all(sweep_process_cb); /* free dead procs */
1407                         *next_proc_walk = NEXT_EVENT_TIME(now,
1408                             rcfg.rcfg_proc_walk_interval);
1409                 }
1410 
1411                 gz_col = NULL;
1412                 if (enforce_soft_caps) {
1413                         debug("scan goal is %lldKB\n",
1414                             (long long)arg.ssa_scan_goal);
1415                         list_walk_collection(soft_scan_cb, &arg);
1416                         if (gz_capped && gz_col != NULL) {
1417                                 /* process global zone */
1418                                 arg.ssa_project_over_cap =
1419                                     col_arg.sca_project_over_cap;
1420                                 soft_scan_gz(gz_col, &arg);
1421                         }
1422                 } else {
1423                         list_walk_collection(scan_cb, NULL);
1424                         if (gz_capped && gz_col != NULL) {
1425                                 /* process global zone */
1426                                 scan_gz(gz_col, col_arg.sca_project_over_cap);
1427                         }
1428                 }
1429         } else if (col_arg.sca_any_over_cap) {
1430                 list_walk_collection(unenforced_cap_cb, NULL);
1431         }
1432 }
1433 
1434 int
1435 main(int argc, char *argv[])
1436 {
1437         int res;
1438         int should_fork = 1;    /* fork flag */
1439         hrtime_t now;           /* current time */
1440         hrtime_t next;          /* time of next event */
1441         int sig;                /* signal iteration */
1442         struct rlimit rl;
1443         hrtime_t next_proc_walk;        /* time of next /proc scan */
1444         hrtime_t next_configuration;    /* time of next configuration */
1445         hrtime_t next_rss_sample;       /* (latest) time of next RSS sample */
1446 
1447         (void) set_message_priority(RCM_INFO);
1448         (void) setpname("rcapd");
1449         rcapd_pid = getpid();
1450         (void) chdir("/");
1451         should_run = 1;
1452         ever_ran = 0;
1453 
1454         (void) setlocale(LC_ALL, "");
1455         (void) textdomain(TEXT_DOMAIN);
1456 
1457         /*
1458          * Parse command-line options.
1459          */
1460         while ((res = getopt(argc, argv, "dF")) > 0)
1461                 switch (res) {
1462                 case 'd':
1463                         should_fork = 0;
1464                         if (debug_mode == 0) {
1465                                 debug_mode = 1;
1466                                 (void) set_message_priority(RCM_DEBUG);
1467                         } else
1468                                 (void) set_message_priority(RCM_DEBUG_HIGH);
1469                         break;
1470                 case 'F':
1471                         should_fork = 0;
1472                         break;
1473                 default:
1474                         rcapd_usage();
1475                         return (E_USAGE);
1476                         /*NOTREACHED*/
1477                 }
1478 
1479         /*
1480          * Read the configuration.
1481          */
1482         if (rcfg_read(&rcfg, verify_statistics) != E_SUCCESS) {
1483                 warn(gettext("resource caps not configured\n"));
1484                 return (SMF_EXIT_ERR_CONFIG);
1485         }
1486 
1487         /*
1488          * If not debugging, fork and continue operating, changing the
1489          * destination of messages to syslog().
1490          */
1491         if (should_fork == 1) {
1492                 pid_t child;
1493                 debug("forking\n");
1494                 child = fork();
1495                 if (child == -1)
1496                         die(gettext("cannot fork"));
1497                 if (child > 0)
1498                         return (0);
1499                 else {
1500                         rcapd_pid = getpid();
1501                         (void) set_message_destination(RCD_SYSLOG);
1502                         (void) fclose(stdin);
1503                         (void) fclose(stdout);
1504                         (void) fclose(stderr);
1505                 }
1506                 /*
1507                  * Start a new session and detatch from the controlling tty.
1508                  */
1509                 if (setsid() == (pid_t)-1)
1510                         debug(gettext("setsid() failed; cannot detach from "
1511                             "terminal"));
1512         }
1513 
1514         finish_configuration();
1515         should_reconfigure = 0;
1516 
1517         /*
1518          * Check that required privileges are possessed.
1519          */
1520         verify_and_set_privileges();
1521 
1522         now = next_report = next_proc_walk = next_rss_sample = gethrtime();
1523         next_configuration = NEXT_EVENT_TIME(gethrtime(),
1524             rcfg.rcfg_reconfiguration_interval);
1525 
1526         /*
1527          * Open the kstat chain.
1528          */
1529         kctl = kstat_open();
1530         if (kctl == NULL)
1531                 die(gettext("can't open kstats"));
1532 
1533         /*
1534          * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
1535          * be effectively managed without revoking descriptors (at 3 per
1536          * process).
1537          */
1538         rl.rlim_cur = 32 * 1024;
1539         rl.rlim_max = 32 * 1024;
1540         if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
1541             getrlimit(RLIMIT_NOFILE, &rl) == 0) {
1542                 rl.rlim_cur = rl.rlim_max;
1543                 (void) setrlimit(RLIMIT_NOFILE, &rl);
1544         }
1545         (void) enable_extended_FILE_stdio(-1, -1);
1546 
1547         if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
1548                 debug("fd limit: %lu\n", rl.rlim_cur);
1549         else
1550                 debug("fd limit: unknown\n");
1551 
1552         get_page_size();
1553         my_zoneid = getzoneid();
1554 
1555         /*
1556          * Handle those signals whose (default) exit disposition
1557          * prevents rcapd from finishing scanning before terminating.
1558          */
1559         (void) sigset(SIGINT, terminate_signal);
1560         (void) sigset(SIGQUIT, abort_signal);
1561         (void) sigset(SIGILL, abort_signal);
1562         (void) sigset(SIGEMT, abort_signal);
1563         (void) sigset(SIGFPE, abort_signal);
1564         (void) sigset(SIGBUS, abort_signal);
1565         (void) sigset(SIGSEGV, abort_signal);
1566         (void) sigset(SIGSYS, abort_signal);
1567         (void) sigset(SIGPIPE, terminate_signal);
1568         (void) sigset(SIGALRM, terminate_signal);
1569         (void) sigset(SIGTERM, terminate_signal);
1570         (void) sigset(SIGUSR1, terminate_signal);
1571         (void) sigset(SIGUSR2, terminate_signal);
1572         (void) sigset(SIGPOLL, terminate_signal);
1573         (void) sigset(SIGVTALRM, terminate_signal);
1574         (void) sigset(SIGXCPU, abort_signal);
1575         (void) sigset(SIGXFSZ, abort_signal);
1576         for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
1577                 (void) sigset(sig, terminate_signal);
1578 
1579         /*
1580          * Install a signal handler for reconfiguration processing.
1581          */
1582         (void) sigset(SIGHUP, sighup);
1583 
1584         /*
1585          * Determine which process collections to cap.
1586          */
1587         lcollection_update(LCU_COMPLETE);
1588 
1589         /*
1590          * Loop forever, monitoring collections' resident set sizes and
1591          * enforcing their caps.  Look for changes in caps as well as
1592          * responding to requests to reread the configuration.  Update
1593          * per-collection statistics periodically.
1594          */
1595         while (should_run != 0) {
1596                 struct timespec ts;
1597 
1598                 /*
1599                  * Announce that rcapd is starting.
1600                  */
1601                 if (ever_ran == 0) {
1602                         info(gettext("starting\n"));
1603                         ever_ran = 1;
1604                 }
1605 
1606                 /*
1607                  * Check the configuration at every next_configuration interval.
1608                  * Update the rss data once every next_rss_sample interval.
1609                  * The condition of global memory pressure is also checked at
1610                  * the same frequency, if strict caps are in use.
1611                  */
1612                 now = gethrtime();
1613 
1614                 /*
1615                  * Detect configuration and cap changes only when SIGHUP
1616                  * is received. Call reconfigure to apply new configuration
1617                  * parameters.
1618                  */
1619                 if (should_reconfigure == 1) {
1620                         reread_configuration();
1621                         should_reconfigure = 0;
1622                         reconfigure(now, &next_configuration, &next_proc_walk,
1623                             &next_rss_sample);
1624                 }
1625 
1626                 if (EVENT_TIME(now, next_configuration)) {
1627                         reconfigure(now, &next_configuration, &next_proc_walk,
1628                             &next_rss_sample);
1629                 }
1630 
1631                 /*
1632                  * Do the main work for enforcing caps.
1633                  */
1634                 if (EVENT_TIME(now, next_rss_sample)) {
1635                         do_capping(now, &next_proc_walk);
1636 
1637                         next_rss_sample = NEXT_EVENT_TIME(now,
1638                             rcfg.rcfg_rss_sample_interval);
1639                 }
1640 
1641                 /*
1642                  * Update the statistics file, if it's time.
1643                  */
1644                 check_update_statistics();
1645 
1646                 /*
1647                  * Sleep for some time before repeating.
1648                  */
1649                 now = gethrtime();
1650                 next = next_configuration;
1651                 next = POSITIVE_MIN(next, next_report);
1652                 next = POSITIVE_MIN(next, next_rss_sample);
1653                 if (next > now && should_run != 0) {
1654                         debug("sleeping %-4.2f seconds\n", (float)(next -
1655                             now) / (float)NANOSEC);
1656                         hrt2ts(next - now, &ts);
1657                         (void) nanosleep(&ts, NULL);
1658                 }
1659         }
1660         if (termination_signal != 0)
1661                 debug("exiting due to signal %d\n", termination_signal);
1662         if (ever_ran != 0)
1663                 info(gettext("exiting\n"));
1664 
1665         /*
1666          * Unlink the statistics file before exiting.
1667          */
1668         if (rcfg.rcfg_stat_file[0] != 0)
1669                 (void) unlink(rcfg.rcfg_stat_file);
1670 
1671         return (E_SUCCESS);
1672 }