patch as-lock-macro-simplification
--- old/usr/src/uts/common/vm/vm_usage.c
+++ new/usr/src/uts/common/vm/vm_usage.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /*
28 28 * vm_usage
29 29 *
30 30 * This file implements the getvmusage() private system call.
31 31 * getvmusage() counts the amount of resident memory pages and swap
32 32 * reserved by the specified process collective. A "process collective" is
33 33 * the set of processes owned by a particular zone, project, task, or user.
34 34 *
35 35 * rss and swap are counted so that for a given process collective, a page is
36 36 * only counted once. For example, this means that if multiple processes in
37 37 * the same project map the same page, then the project will only be charged
38 38 * once for that page. On the other hand, if two processes in different
39 39 * projects map the same page, then both projects will be charged
40 40 * for the page.
41 41 *
42 42 * The vm_getusage() calculation is implemented so that the first thread
43 43 * performs the rss/swap counting. Other callers will wait for that thread to
44 44 * finish, copying the results. This enables multiple rcapds and prstats to
45 45 * consume data from the same calculation. The results are also cached so that
46 46 * a caller interested in recent results can just copy them instead of starting
47 47 * a new calculation. The caller passes the maximum age (in seconds) of the
48 48 * data. If the cached data is young enough, the cache is copied; otherwise,
49 49 * a new calculation is executed and the cache is replaced with the new
50 50 * data.
51 51 *
52 52 * The rss calculation for each process collective is as follows:
53 53 *
54 54 * - Inspect flags, determine if counting rss for zones, projects, tasks,
55 55 * and/or users.
56 56 * - For each proc:
57 57 * - Figure out proc's collectives (zone, project, task, and/or user).
58 58 * - For each seg in proc's address space:
59 59 * - If seg is private:
60 60 * - Lookup anons in the amp.
61 61 * - For incore pages not previously visited for each of the
62 62 * proc's collectives, add incore pagesize to each
63 63 * collective.
64 64 * Anons with a refcnt of 1 can be assumed to be not
65 65 * previously visited.
66 66 * - For address ranges without anons in the amp:
67 67 * - Lookup pages in underlying vnode.
68 68 * - For incore pages not previously visited for
69 69 * each of the proc's collectives, add incore
70 70 * pagesize to each collective.
71 71 * - If seg is shared:
72 72 * - Lookup pages in the shared amp or vnode.
73 73 * - For incore pages not previously visited for each of
74 74 * the proc's collectives, add incore pagesize to each
75 75 * collective.
76 76 *
77 77 * Swap is reserved by private segments and shared anonymous segments.
78 78 * The only shared anon segments which do not reserve swap are ISM segments
79 79 * and schedctl segments, both of which can be identified by having
80 80 * amp->swresv == 0.
81 81 *
82 82 * The swap calculation for each collective is as follows:
83 83 *
84 84 * - Inspect flags, determine if counting swap for zones, projects, tasks,
85 85 * and/or users.
86 86 * - For each proc:
87 87 * - Figure out proc's collectives (zone, project, task, and/or user).
88 88 * - For each seg in proc's address space:
89 89 * - If seg is private:
90 90 * - Add svd->swresv pages to swap count for each of the
91 91 * proc's collectives.
92 92 * - If seg is anon, shared, and amp->swresv != 0:
93 93 * - For address ranges in amp not previously visited for
94 94 * each of the proc's collectives, add size of address
95 95 * range to the swap count for each collective.
96 96 *
97 97 * These two calculations are done simultaneously, with most of the work
98 98 * being done in vmu_calculate_seg(). The results of the calculation are
99 99 * copied into "vmu_data.vmu_cache_results".
100 100 *
101 101 * To perform the calculation, various things are tracked and cached:
102 102 *
103 103 * - incore/not-incore page ranges for all vnodes.
104 104 * (vmu_data.vmu_all_vnodes_hash)
105 105 * This eliminates looking up the same page more than once.
106 106 *
107 107 * - incore/not-incore page ranges for all shared amps.
108 108 * (vmu_data.vmu_all_amps_hash)
109 109 * This eliminates looking up the same page more than once.
110 110 *
111 111 * - visited page ranges for each collective.
112 112 * - per vnode (entity->vme_vnode_hash)
113 113 * - per shared amp (entity->vme_amp_hash)
114 114 * For accurate counting of map-shared and COW-shared pages.
115 115 *
116 116 * - visited private anons (refcnt > 1) for each collective.
117 117 * (entity->vme_anon_hash)
118 118 * For accurate counting of COW-shared pages.
119 119 *
120 120 * The common accounting structure is the vmu_entity_t, which represents
121 121 * collectives:
122 122 *
123 123 * - A zone.
124 124 * - A project, task, or user within a zone.
125 125 * - The entire system (vmu_data.vmu_system).
126 126 * - Each collapsed (col) project and user. This means a given projid or
127 127 * uid, regardless of which zone the process is in. For instance,
128 128 * project 0 in the global zone and project 0 in a non-global zone are
129 129 * the same collapsed project.
130 130 *
131 131 * Each entity structure tracks which pages have been already visited for
132 132 * that entity (via previously inspected processes) so that these pages are
133 133 * not double counted.
134 134 */
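For readers coming to this file from the consumer side, the interface described above is the getvmusage(2) call declared in <sys/vm_usage.h> (the consumers the header mentions, rcapd and prstat, use it this way). The sketch below is an illustrative userland caller only, not code from this file: the initial buffer size, the 10-second age, and giving up rather than retrying when the buffer is too small are assumptions made for brevity.

/*
 * Illustrative consumer of getvmusage(2).  Buffer sizing and error
 * handling are simplified assumptions; see getvmusage(2) for the
 * authoritative contract (e.g. how an undersized buffer is reported).
 */
#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

static int
print_zone_usage(void)
{
	size_t nres = 32;	/* guess at the number of results */
	size_t i;
	vmusage_t *buf;

	buf = malloc(sizeof (vmusage_t) * nres);
	if (buf == NULL)
		return (-1);

	/* Accept cached results up to 10 seconds old (see above). */
	if (getvmusage(VMUSAGE_ALL_ZONES, 10, buf, &nres) != 0) {
		free(buf);	/* a real caller would resize and retry */
		return (-1);
	}

	for (i = 0; i < nres; i++) {
		(void) printf("zone %d: rss=%llu swap=%llu\n",
		    (int)buf[i].vmu_id,
		    (unsigned long long)buf[i].vmu_rss_all,
		    (unsigned long long)buf[i].vmu_swap_all);
	}
	free(buf);
	return (0);
}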
135 135
136 136 #include <sys/errno.h>
137 137 #include <sys/types.h>
138 138 #include <sys/zone.h>
139 139 #include <sys/proc.h>
140 140 #include <sys/project.h>
141 141 #include <sys/task.h>
142 142 #include <sys/thread.h>
143 143 #include <sys/time.h>
144 144 #include <sys/mman.h>
145 145 #include <sys/modhash.h>
146 146 #include <sys/modhash_impl.h>
147 147 #include <sys/shm.h>
148 148 #include <sys/swap.h>
149 149 #include <sys/synch.h>
150 150 #include <sys/systm.h>
151 151 #include <sys/var.h>
152 152 #include <sys/vm_usage.h>
153 153 #include <sys/zone.h>
154 154 #include <sys/sunddi.h>
155 155 #include <sys/avl.h>
156 156 #include <vm/anon.h>
157 157 #include <vm/as.h>
158 158 #include <vm/seg_vn.h>
159 159 #include <vm/seg_spt.h>
160 160
161 161 #define VMUSAGE_HASH_SIZE 512
162 162
163 163 #define VMUSAGE_TYPE_VNODE 1
164 164 #define VMUSAGE_TYPE_AMP 2
165 165 #define VMUSAGE_TYPE_ANON 3
166 166
167 167 #define VMUSAGE_BOUND_UNKNOWN 0
168 168 #define VMUSAGE_BOUND_INCORE 1
169 169 #define VMUSAGE_BOUND_NOT_INCORE 2
170 170
171 171 #define ISWITHIN(node, addr) ((node)->vmb_start <= addr && \
172 172 (node)->vmb_end >= addr ? 1 : 0)
173 173
174 174 /*
175 175 * bounds for vnodes and shared amps
176 176 * Each bound is either entirely incore, entirely not in core, or
177 177 * entirely unknown. Bounds are stored in an AVL tree sorted by the start
178 178 * member when in use; otherwise (on free or temporary lists) they're strung
179 179 * together off of vmb_next.
180 180 */
181 181 typedef struct vmu_bound {
182 182 avl_node_t vmb_node;
183 183 struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
184 184 pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */
185 185 pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */
186 186 char vmb_type; /* One of VMUSAGE_BOUND_* */
187 187 } vmu_bound_t;
188 188
189 189 /*
190 190 * hash of visited objects (vnodes or shared amps)
191 191 * key is address of vnode or amp. The bounds tree lists the known
192 192 * incore/not-incore page ranges for the vnode/amp.
193 193 */
194 194 typedef struct vmu_object {
195 195 struct vmu_object *vmo_next; /* free list */
196 196 caddr_t vmo_key;
197 197 short vmo_type;
198 198 avl_tree_t vmo_bounds;
199 199 } vmu_object_t;
200 200
201 201 /*
202 202 * Entity by which to count results.
203 203 *
204 204 * The entity structure keeps the current rss/swap counts for each entity
205 205 * (zone, project, etc), and hashes of vm structures that have already
206 206 * been visited for the entity.
207 207 *
208 208 * vme_next: links the list of all entities currently being counted by
209 209 * vmu_calculate().
210 210 *
211 211 * vme_next_calc: links the list of entities related to the current process
212 212 * being counted by vmu_calculate_proc().
213 213 *
214 214 * vmu_calculate_proc() walks all processes. For each process, it makes a
215 215 * list of the entities related to that process using vme_next_calc. This
216 216 * list changes each time vmu_calculate_proc() is called.
217 217 *
218 218 */
219 219 typedef struct vmu_entity {
220 220 struct vmu_entity *vme_next;
221 221 struct vmu_entity *vme_next_calc;
222 222 mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
223 223 mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
224 224 mod_hash_t *vme_anon_hash; /* COW anons visited for entity */
225 225 vmusage_t vme_result; /* identifies entity and results */
226 226 } vmu_entity_t;
227 227
228 228 /*
229 229 * Hash of entities visited within a zone, and an entity for the zone
230 230 * itself.
231 231 */
232 232 typedef struct vmu_zone {
233 233 struct vmu_zone *vmz_next; /* free list */
234 234 id_t vmz_id;
235 235 vmu_entity_t *vmz_zone;
236 236 mod_hash_t *vmz_projects_hash;
237 237 mod_hash_t *vmz_tasks_hash;
238 238 mod_hash_t *vmz_rusers_hash;
239 239 mod_hash_t *vmz_eusers_hash;
240 240 } vmu_zone_t;
241 241
242 242 /*
243 243 * Cache of results from last calculation
244 244 */
245 245 typedef struct vmu_cache {
246 246 vmusage_t *vmc_results; /* Results from last call to */
247 247 /* vm_getusage(). */
248 248 uint64_t vmc_nresults; /* Count of cached results */
249 249 uint64_t vmc_refcnt; /* refcnt for free */
250 250 uint_t vmc_flags; /* Flags for vm_getusage() */
251 251 hrtime_t vmc_timestamp; /* when cache was created */
252 252 } vmu_cache_t;
253 253
254 254 /*
255 255 * top level rss info for the system
256 256 */
257 257 typedef struct vmu_data {
258 258 kmutex_t vmu_lock; /* Protects vmu_data */
259 259 kcondvar_t vmu_cv; /* Used to signal threads */
260 260 /* Waiting for */
261 261 /* Rss_calc_thread to finish */
262 262 vmu_entity_t *vmu_system; /* Entity for tracking */
263 263 /* rss/swap for all processes */
264 264 /* in all zones */
265 265 mod_hash_t *vmu_zones_hash; /* Zones visited */
266 266 mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */
267 267 mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */
268 268 mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */
269 269 /* to implement VMUSAGE_COL_* */
270 270 /* flags, which aggregate by */
271 271 /* project or user regardless */
272 272 /* of zoneid. */
273 273 mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */
274 274 /* to track incore/not-incore */
275 275 mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */
276 276 /* amps to track incore/not- */
277 277 /* incore */
278 278 vmu_entity_t *vmu_entities; /* Linked list of entities */
279 279 size_t vmu_nentities; /* Count of entities in list */
280 280 vmu_cache_t *vmu_cache; /* Cached results */
281 281 kthread_t *vmu_calc_thread; /* NULL, or thread running */
282 282 /* vmu_calculate() */
283 283 uint_t vmu_calc_flags; /* Flags being used by */
284 284 /* currently running calc */
285 285 /* thread */
286 286 uint_t vmu_pending_flags; /* Flags of vm_getusage() */
287 287 /* threads waiting for */
288 288 /* calc thread to finish */
289 289 uint_t vmu_pending_waiters; /* Number of threads waiting */
290 290 /* for calc thread */
291 291 vmu_bound_t *vmu_free_bounds;
292 292 vmu_object_t *vmu_free_objects;
293 293 vmu_entity_t *vmu_free_entities;
294 294 vmu_zone_t *vmu_free_zones;
295 295 } vmu_data_t;
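The vmu_lock, vmu_cv, vmu_calc_thread and vmu_pending_waiters fields above are what implement the file-header promise that only the first caller computes while later callers wait and then copy the cached results. The following is a schematic sketch of that pattern only; the real logic lives in vm_getusage() later in this file, and the cache-age test, flag matching and result copyout are deliberately elided. The function name is made up for illustration.

/*
 * Schematic sketch (not the actual vm_getusage() body): one thread
 * becomes the calc thread, later callers block on vmu_cv until the
 * results are cached, then copy them.
 */
static void
vmu_calc_or_wait_sketch(void)
{
	mutex_enter(&vmu_data.vmu_lock);
	while (vmu_data.vmu_calc_thread != NULL) {
		/* A calculation is already running; wait for it. */
		vmu_data.vmu_pending_waiters++;
		cv_wait(&vmu_data.vmu_cv, &vmu_data.vmu_lock);
		vmu_data.vmu_pending_waiters--;
	}
	/* Cache-age check elided; assume we must recalculate. */
	vmu_data.vmu_calc_thread = curthread;
	mutex_exit(&vmu_data.vmu_lock);

	vmu_calculate();

	mutex_enter(&vmu_data.vmu_lock);
	vmu_data.vmu_calc_thread = NULL;
	/* Wake waiters so they can copy the freshly cached results. */
	cv_broadcast(&vmu_data.vmu_cv);
	mutex_exit(&vmu_data.vmu_lock);
}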
296 296
297 297 extern struct as kas;
298 298 extern proc_t *practive;
299 299 extern zone_t *global_zone;
300 300 extern struct seg_ops segvn_ops;
301 301 extern struct seg_ops segspt_shmops;
302 302
303 303 static vmu_data_t vmu_data;
304 304 static kmem_cache_t *vmu_bound_cache;
305 305 static kmem_cache_t *vmu_object_cache;
306 306
307 307 /*
308 308 * Comparison routine for AVL tree. We base our comparison on vmb_start.
309 309 */
310 310 static int
311 311 bounds_cmp(const void *bnd1, const void *bnd2)
312 312 {
313 313 const vmu_bound_t *bound1 = bnd1;
314 314 const vmu_bound_t *bound2 = bnd2;
315 315
316 316 if (bound1->vmb_start == bound2->vmb_start) {
317 317 return (0);
318 318 }
319 319 if (bound1->vmb_start < bound2->vmb_start) {
320 320 return (-1);
321 321 }
322 322
323 323 return (1);
324 324 }
325 325
326 326 /*
327 327 * Save a bound on the free list.
328 328 */
329 329 static void
330 330 vmu_free_bound(vmu_bound_t *bound)
331 331 {
332 332 bound->vmb_next = vmu_data.vmu_free_bounds;
333 333 bound->vmb_start = 0;
334 334 bound->vmb_end = 0;
335 335 bound->vmb_type = 0;
336 336 vmu_data.vmu_free_bounds = bound;
337 337 }
338 338
339 339 /*
340 340 * Free an object, and all visited bound info.
341 341 */
342 342 static void
343 343 vmu_free_object(mod_hash_val_t val)
344 344 {
345 345 vmu_object_t *obj = (vmu_object_t *)val;
346 346 avl_tree_t *tree = &(obj->vmo_bounds);
347 347 vmu_bound_t *bound;
348 348 void *cookie = NULL;
349 349
350 350 while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
351 351 vmu_free_bound(bound);
352 352 avl_destroy(tree);
353 353
354 354 obj->vmo_type = 0;
355 355 obj->vmo_next = vmu_data.vmu_free_objects;
356 356 vmu_data.vmu_free_objects = obj;
357 357 }
358 358
359 359 /*
360 360 * Free an entity, and hashes of visited objects for that entity.
361 361 */
362 362 static void
363 363 vmu_free_entity(mod_hash_val_t val)
364 364 {
365 365 vmu_entity_t *entity = (vmu_entity_t *)val;
366 366
367 367 if (entity->vme_vnode_hash != NULL)
368 368 i_mod_hash_clear_nosync(entity->vme_vnode_hash);
369 369 if (entity->vme_amp_hash != NULL)
370 370 i_mod_hash_clear_nosync(entity->vme_amp_hash);
371 371 if (entity->vme_anon_hash != NULL)
372 372 i_mod_hash_clear_nosync(entity->vme_anon_hash);
373 373
374 374 entity->vme_next = vmu_data.vmu_free_entities;
375 375 vmu_data.vmu_free_entities = entity;
376 376 }
377 377
378 378 /*
379 379 * Free zone entity, and all hashes of entities inside that zone,
380 380 * which are projects, tasks, and users.
381 381 */
382 382 static void
383 383 vmu_free_zone(mod_hash_val_t val)
384 384 {
385 385 vmu_zone_t *zone = (vmu_zone_t *)val;
386 386
387 387 if (zone->vmz_zone != NULL) {
388 388 vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
389 389 zone->vmz_zone = NULL;
390 390 }
391 391 if (zone->vmz_projects_hash != NULL)
392 392 i_mod_hash_clear_nosync(zone->vmz_projects_hash);
393 393 if (zone->vmz_tasks_hash != NULL)
394 394 i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
395 395 if (zone->vmz_rusers_hash != NULL)
396 396 i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
397 397 if (zone->vmz_eusers_hash != NULL)
398 398 i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
399 399 zone->vmz_next = vmu_data.vmu_free_zones;
400 400 vmu_data.vmu_free_zones = zone;
401 401 }
402 402
403 403 /*
404 404 * Initialize synchronization primitives and hashes for system-wide tracking
405 405 * of visited vnodes and shared amps. Initialize results cache.
406 406 */
407 407 void
408 408 vm_usage_init()
409 409 {
410 410 mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
411 411 cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
412 412
413 413 vmu_data.vmu_system = NULL;
414 414 vmu_data.vmu_zones_hash = NULL;
415 415 vmu_data.vmu_projects_col_hash = NULL;
416 416 vmu_data.vmu_rusers_col_hash = NULL;
417 417 vmu_data.vmu_eusers_col_hash = NULL;
418 418
419 419 vmu_data.vmu_free_bounds = NULL;
420 420 vmu_data.vmu_free_objects = NULL;
421 421 vmu_data.vmu_free_entities = NULL;
422 422 vmu_data.vmu_free_zones = NULL;
423 423
424 424 vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
425 425 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
426 426 sizeof (vnode_t));
427 427 vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
428 428 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
429 429 sizeof (struct anon_map));
430 430 vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
431 431 "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
432 432 vmu_free_entity);
433 433 vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
434 434 "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
435 435 vmu_free_entity);
436 436 vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
437 437 "vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
438 438 vmu_free_entity);
439 439 vmu_data.vmu_zones_hash = mod_hash_create_idhash(
440 440 "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
441 441
442 442 vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
443 443 sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
444 444 vmu_object_cache = kmem_cache_create("vmu_object_cache",
445 445 sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
446 446
447 447 vmu_data.vmu_entities = NULL;
448 448 vmu_data.vmu_nentities = 0;
449 449
450 450 vmu_data.vmu_cache = NULL;
451 451 vmu_data.vmu_calc_thread = NULL;
452 452 vmu_data.vmu_calc_flags = 0;
453 453 vmu_data.vmu_pending_flags = 0;
454 454 vmu_data.vmu_pending_waiters = 0;
455 455 }
456 456
457 457 /*
458 458 * Allocate hashes for tracking vm objects visited for an entity.
459 459 * Update list of entities.
460 460 */
461 461 static vmu_entity_t *
462 462 vmu_alloc_entity(id_t id, int type, id_t zoneid)
463 463 {
464 464 vmu_entity_t *entity;
465 465
466 466 if (vmu_data.vmu_free_entities != NULL) {
467 467 entity = vmu_data.vmu_free_entities;
468 468 vmu_data.vmu_free_entities =
469 469 vmu_data.vmu_free_entities->vme_next;
470 470 bzero(&entity->vme_result, sizeof (vmusage_t));
471 471 } else {
472 472 entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
473 473 }
474 474 entity->vme_result.vmu_id = id;
475 475 entity->vme_result.vmu_zoneid = zoneid;
476 476 entity->vme_result.vmu_type = type;
477 477
478 478 if (entity->vme_vnode_hash == NULL)
479 479 entity->vme_vnode_hash = mod_hash_create_ptrhash(
480 480 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
481 481 sizeof (vnode_t));
482 482
483 483 if (entity->vme_amp_hash == NULL)
484 484 entity->vme_amp_hash = mod_hash_create_ptrhash(
485 485 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
486 486 sizeof (struct anon_map));
487 487
488 488 if (entity->vme_anon_hash == NULL)
489 489 entity->vme_anon_hash = mod_hash_create_ptrhash(
490 490 "vmusage anon hash", VMUSAGE_HASH_SIZE,
491 491 mod_hash_null_valdtor, sizeof (struct anon));
492 492
493 493 entity->vme_next = vmu_data.vmu_entities;
494 494 vmu_data.vmu_entities = entity;
495 495 vmu_data.vmu_nentities++;
496 496
497 497 return (entity);
498 498 }
499 499
500 500 /*
501 501 * Allocate a zone entity, and hashes for tracking visited vm objects
502 502 * for projects, tasks, and users within that zone.
503 503 */
504 504 static vmu_zone_t *
505 505 vmu_alloc_zone(id_t id)
506 506 {
507 507 vmu_zone_t *zone;
508 508
509 509 if (vmu_data.vmu_free_zones != NULL) {
510 510 zone = vmu_data.vmu_free_zones;
511 511 vmu_data.vmu_free_zones =
512 512 vmu_data.vmu_free_zones->vmz_next;
513 513 zone->vmz_next = NULL;
514 514 zone->vmz_zone = NULL;
515 515 } else {
516 516 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
517 517 }
518 518
519 519 zone->vmz_id = id;
520 520
521 521 if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
522 522 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
523 523
524 524 if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
525 525 VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
526 526 zone->vmz_projects_hash = mod_hash_create_idhash(
527 527 "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
528 528
529 529 if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
530 530 != 0 && zone->vmz_tasks_hash == NULL)
531 531 zone->vmz_tasks_hash = mod_hash_create_idhash(
532 532 "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
533 533
534 534 if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
535 535 != 0 && zone->vmz_rusers_hash == NULL)
536 536 zone->vmz_rusers_hash = mod_hash_create_idhash(
537 537 "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
538 538
539 539 if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
540 540 != 0 && zone->vmz_eusers_hash == NULL)
541 541 zone->vmz_eusers_hash = mod_hash_create_idhash(
542 542 "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
543 543
544 544 return (zone);
545 545 }
546 546
547 547 /*
548 548 * Allocate a structure for tracking visited bounds for a vm object.
549 549 */
550 550 static vmu_object_t *
551 551 vmu_alloc_object(caddr_t key, int type)
552 552 {
553 553 vmu_object_t *object;
554 554
555 555 if (vmu_data.vmu_free_objects != NULL) {
556 556 object = vmu_data.vmu_free_objects;
557 557 vmu_data.vmu_free_objects =
558 558 vmu_data.vmu_free_objects->vmo_next;
559 559 } else {
560 560 object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
561 561 }
562 562
563 563 object->vmo_next = NULL;
564 564 object->vmo_key = key;
565 565 object->vmo_type = type;
566 566 avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);
567 567
568 568 return (object);
569 569 }
570 570
571 571 /*
572 572 * Allocate and return a bound structure.
573 573 */
574 574 static vmu_bound_t *
575 575 vmu_alloc_bound()
576 576 {
577 577 vmu_bound_t *bound;
578 578
579 579 if (vmu_data.vmu_free_bounds != NULL) {
580 580 bound = vmu_data.vmu_free_bounds;
581 581 vmu_data.vmu_free_bounds =
582 582 vmu_data.vmu_free_bounds->vmb_next;
583 583 } else {
584 584 bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
585 585 }
586 586
587 587 bound->vmb_next = NULL;
588 588 bound->vmb_start = 0;
589 589 bound->vmb_end = 0;
590 590 bound->vmb_type = 0;
591 591 return (bound);
592 592 }
593 593
594 594 /*
595 595 * vmu_find_insert_* functions implement hash lookup or allocate and
596 596 * insert operations.
597 597 */
598 598 static vmu_object_t *
599 599 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
600 600 {
601 601 int ret;
602 602 vmu_object_t *object;
603 603
604 604 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
605 605 (mod_hash_val_t *)&object);
606 606 if (ret != 0) {
607 607 object = vmu_alloc_object(key, type);
608 608 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
609 609 (mod_hash_val_t)object, (mod_hash_hndl_t)0);
610 610 ASSERT(ret == 0);
611 611 }
612 612 return (object);
613 613 }
614 614
615 615 static int
616 616 vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
617 617 {
618 618 int ret;
619 619 caddr_t val;
620 620
621 621 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
622 622 (mod_hash_val_t *)&val);
623 623
624 624 if (ret == 0)
625 625 return (0);
626 626
627 627 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
628 628 (mod_hash_val_t)key, (mod_hash_hndl_t)0);
629 629
630 630 ASSERT(ret == 0);
631 631
632 632 return (1);
633 633 }
634 634
635 635 static vmu_entity_t *
636 636 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
637 637 {
638 638 int ret;
639 639 vmu_entity_t *entity;
640 640
641 641 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
642 642 (mod_hash_val_t *)&entity);
643 643 if (ret != 0) {
644 644 entity = vmu_alloc_entity(id, type, zoneid);
645 645 ret = i_mod_hash_insert_nosync(hash,
646 646 (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
647 647 (mod_hash_hndl_t)0);
648 648 ASSERT(ret == 0);
649 649 }
650 650 return (entity);
651 651 }
652 652
653 653
654 654
655 655
656 656 /*
657 657 * Returns list of object bounds between start and end. New bounds inserted
658 658 * by this call are given type.
659 659 *
660 660 * Returns the number of pages covered if new bounds are created. Returns 0
661 661 * if region between start/end consists of all existing bounds.
662 662 */
663 663 static pgcnt_t
664 664 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
665 665 end, char type, vmu_bound_t **first, vmu_bound_t **last)
666 666 {
667 667 avl_tree_t *tree = &(ro->vmo_bounds);
668 668 avl_index_t where;
669 669 vmu_bound_t *walker, *tmp;
670 670 pgcnt_t ret = 0;
671 671
672 672 ASSERT(start <= end);
673 673
674 674 *first = *last = NULL;
675 675
676 676 tmp = vmu_alloc_bound();
677 677 tmp->vmb_start = start;
678 678 tmp->vmb_type = type;
679 679
680 680 /* Hopelessly optimistic case. */
681 681 if (walker = avl_find(tree, tmp, &where)) {
682 682 /* We got lucky. */
683 683 vmu_free_bound(tmp);
684 684 *first = walker;
685 685 }
686 686
687 687 if (walker == NULL) {
688 688 /* Is start in the previous node? */
689 689 walker = avl_nearest(tree, where, AVL_BEFORE);
690 690 if (walker != NULL) {
691 691 if (ISWITHIN(walker, start)) {
692 692 /* We found start. */
693 693 vmu_free_bound(tmp);
694 694 *first = walker;
695 695 }
696 696 }
697 697 }
698 698
699 699 /*
700 700 * At this point, if *first is still NULL, then we
701 701 * didn't get a direct hit and start isn't covered
702 702 * by the previous node. We know that the next node
703 703 * must have a greater start value than we require
704 704 * because avl_find tells us where the AVL routines would
705 705 * insert our new node. We have some gap between the
706 706 * start we want and the next node.
707 707 */
708 708 if (*first == NULL) {
709 709 walker = avl_nearest(tree, where, AVL_AFTER);
710 710 if (walker != NULL && walker->vmb_start <= end) {
711 711 /* Fill the gap. */
712 712 tmp->vmb_end = walker->vmb_start - 1;
713 713 *first = tmp;
714 714 } else {
715 715 /* We have a gap over [start, end]. */
716 716 tmp->vmb_end = end;
717 717 *first = *last = tmp;
718 718 }
719 719 ret += tmp->vmb_end - tmp->vmb_start + 1;
720 720 avl_insert(tree, tmp, where);
721 721 }
722 722
723 723 ASSERT(*first != NULL);
724 724
725 725 if (*last != NULL) {
726 726 /* We're done. */
727 727 return (ret);
728 728 }
729 729
730 730 /*
731 731 * If we are here we still need to set *last and
732 732 * that may involve filling in some gaps.
733 733 */
734 734 *last = *first;
735 735 for (;;) {
736 736 if (ISWITHIN(*last, end)) {
737 737 /* We're done. */
738 738 break;
739 739 }
740 740 walker = AVL_NEXT(tree, *last);
741 741 if (walker == NULL || walker->vmb_start > end) {
742 742 /* Bottom or mid tree with gap. */
743 743 tmp = vmu_alloc_bound();
744 744 tmp->vmb_start = (*last)->vmb_end + 1;
745 745 tmp->vmb_end = end;
746 746 tmp->vmb_type = type;
747 747 ret += tmp->vmb_end - tmp->vmb_start + 1;
748 748 avl_insert_here(tree, tmp, *last, AVL_AFTER);
749 749 *last = tmp;
750 750 break;
751 751 } else {
752 752 if ((*last)->vmb_end + 1 != walker->vmb_start) {
753 753 /* Non-contiguous. */
754 754 tmp = vmu_alloc_bound();
755 755 tmp->vmb_start = (*last)->vmb_end + 1;
756 756 tmp->vmb_end = walker->vmb_start - 1;
757 757 tmp->vmb_type = type;
758 758 ret += tmp->vmb_end - tmp->vmb_start + 1;
759 759 avl_insert_here(tree, tmp, *last, AVL_AFTER);
760 760 *last = tmp;
761 761 } else {
762 762 *last = walker;
763 763 }
764 764 }
765 765 }
766 766
767 767 return (ret);
768 768 }
769 769
770 770 /*
771 771 * vmu_update_bounds()
772 772 *
773 773 * tree: avl_tree in which first and last hang.
774 774 *
775 775 * first, last: list of continuous bounds, of which zero or more are of
776 776 * type VMUSAGE_BOUND_UNKNOWN.
777 777 *
778 778 * new_tree: avl_tree in which new_first and new_last hang.
779 779 *
780 780 * new_first, new_last: list of continuous bounds, of which none are of
781 781 * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
782 782 * update the types of bounds in (first,last) with
783 783 * type VMUSAGE_BOUND_UNKNOWN.
784 784 *
785 785 * For the list of bounds (first,last), this function updates any bounds
786 786 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
787 787 * the list (new_first, new_last).
788 788 *
789 789 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
790 790 * (new_first, new_last), it will be split into multiple bounds.
791 791 *
792 792 * Return value:
793 793 * The number of pages in the list of bounds (first,last) that were of
794 794 * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
795 795 * VMUSAGE_BOUND_INCORE.
796 796 *
797 797 */
798 798 static pgcnt_t
799 799 vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
800 800 avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
801 801 {
802 802 vmu_bound_t *next, *new_next, *tmp;
803 803 pgcnt_t rss = 0;
804 804
805 805 next = *first;
806 806 new_next = new_first;
807 807
808 808 /*
809 809 * Verify first and last bound are covered by new bounds if they
810 810 * have unknown type.
811 811 */
812 812 ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
813 813 (*first)->vmb_start >= new_first->vmb_start);
814 814 ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
815 815 (*last)->vmb_end <= new_last->vmb_end);
816 816 for (;;) {
817 817 /* If bound already has type, proceed to next bound. */
818 818 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
819 819 if (next == *last)
820 820 break;
821 821 next = AVL_NEXT(tree, next);
822 822 continue;
823 823 }
824 824 while (new_next->vmb_end < next->vmb_start)
825 825 new_next = AVL_NEXT(new_tree, new_next);
826 826 ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
827 827 next->vmb_type = new_next->vmb_type;
828 828 if (new_next->vmb_end < next->vmb_end) {
829 829 /* need to split bound */
830 830 tmp = vmu_alloc_bound();
831 831 tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
832 832 tmp->vmb_start = new_next->vmb_end + 1;
833 833 tmp->vmb_end = next->vmb_end;
834 834 avl_insert_here(tree, tmp, next, AVL_AFTER);
835 835 next->vmb_end = new_next->vmb_end;
836 836 if (*last == next)
837 837 *last = tmp;
838 838 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
839 839 rss += next->vmb_end - next->vmb_start + 1;
840 840 next = tmp;
841 841 } else {
842 842 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
843 843 rss += next->vmb_end - next->vmb_start + 1;
844 844 if (next == *last)
845 845 break;
846 846 next = AVL_NEXT(tree, next);
847 847 }
848 848 }
849 849 return (rss);
850 850 }
851 851
852 852 /*
853 853 * Merges adjacent bounds with same type between first and last bound.
854 854 * After merge, last pointer may point to a different bound, as (incoming)
855 855 * last bound may have been merged away.
856 856 */
857 857 static void
858 858 vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
859 859 {
860 860 vmu_bound_t *current;
861 861 vmu_bound_t *next;
862 862
863 863 ASSERT(tree != NULL);
864 864 ASSERT(*first != NULL);
865 865 ASSERT(*last != NULL);
866 866
867 867 current = *first;
868 868 while (current != *last) {
869 869 next = AVL_NEXT(tree, current);
870 870 if ((current->vmb_end + 1) == next->vmb_start &&
871 871 current->vmb_type == next->vmb_type) {
872 872 current->vmb_end = next->vmb_end;
873 873 avl_remove(tree, next);
874 874 vmu_free_bound(next);
875 875 if (next == *last) {
876 876 *last = current;
877 877 }
878 878 } else {
879 879 current = AVL_NEXT(tree, current);
880 880 }
881 881 }
882 882 }
883 883
884 884 /*
885 885 * Given an amp and a list of bounds, updates each bound's type with
886 886 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
887 887 *
888 888 * If a bound is partially incore, it will be split into two bounds.
889 889 * first and last may be modified, as bounds may be split into multiple
890 890 * bounds if they are partially incore/not-incore.
891 891 *
892 892 * Set incore to B_TRUE if the bounds are already known to be incore.
893 893 *
894 894 */
895 895 static void
896 896 vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
897 897 vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
898 898 {
899 899 vmu_bound_t *next;
900 900 vmu_bound_t *tmp;
901 901 pgcnt_t index;
902 902 short bound_type;
903 903 short page_type;
904 904 vnode_t *vn;
905 905 anoff_t off;
906 906 struct anon *ap;
907 907
908 908 next = *first;
909 909 /* Shared anon slots don't change once set. */
910 910 ANON_LOCK_ENTER(&->a_rwlock, RW_READER);
911 911 for (;;) {
912 912 if (incore == B_TRUE)
913 913 next->vmb_type = VMUSAGE_BOUND_INCORE;
914 914
915 915 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
916 916 if (next == *last)
917 917 break;
918 918 next = AVL_NEXT(tree, next);
919 919 continue;
920 920 }
921 921 bound_type = next->vmb_type;
922 922 index = next->vmb_start;
923 923 while (index <= next->vmb_end) {
924 924
925 925 /*
926 926 * These are used to determine how much to increment
927 927 * index when a large page is found.
928 928 */
929 929 page_t *page;
930 930 pgcnt_t pgcnt = 1;
931 931 uint_t pgshft;
932 932 pgcnt_t pgmsk;
933 933
934 934 ap = anon_get_ptr(amp->ahp, index);
935 935 if (ap != NULL)
936 936 swap_xlate(ap, &vn, &off);
937 937
938 938 if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
939 939 (page = page_exists(vn, off)) != NULL) {
940 940 page_type = VMUSAGE_BOUND_INCORE;
941 941 if (page->p_szc > 0) {
942 942 pgcnt = page_get_pagecnt(page->p_szc);
943 943 pgshft = page_get_shift(page->p_szc);
944 944 pgmsk = (0x1 << (pgshft - PAGESHIFT))
945 945 - 1;
946 946 }
947 947 } else {
948 948 page_type = VMUSAGE_BOUND_NOT_INCORE;
949 949 }
950 950 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
951 951 next->vmb_type = page_type;
952 952 } else if (next->vmb_type != page_type) {
953 953 /*
954 954 * If current bound type does not match page
955 955 * type, need to split off new bound.
956 956 */
957 957 tmp = vmu_alloc_bound();
958 958 tmp->vmb_type = page_type;
959 959 tmp->vmb_start = index;
960 960 tmp->vmb_end = next->vmb_end;
961 961 avl_insert_here(tree, tmp, next, AVL_AFTER);
962 962 next->vmb_end = index - 1;
963 963 if (*last == next)
964 964 *last = tmp;
965 965 next = tmp;
966 966 }
967 967 if (pgcnt > 1) {
968 968 /*
969 969 * If inside large page, jump to next large
970 970 * page
971 971 */
972 972 index = (index & ~pgmsk) + pgcnt;
973 973 } else {
974 974 index++;
975 975 }
976 976 }
977 977 if (next == *last) {
978 978 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
979 979 break;
980 980 } else
981 981 next = AVL_NEXT(tree, next);
982 982 }
983 983 ANON_LOCK_EXIT(&->a_rwlock);
984 984 }
985 985
986 986 /*
987 987 * Same as vmu_amp_update_incore_bounds(), except for tracking
988 988 * incore/not-incore for vnodes.
989 989 */
990 990 static void
991 991 vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
992 992 vmu_bound_t **first, vmu_bound_t **last)
993 993 {
994 994 vmu_bound_t *next;
995 995 vmu_bound_t *tmp;
996 996 pgcnt_t index;
997 997 short bound_type;
998 998 short page_type;
999 999
1000 1000 next = *first;
1001 1001 for (;;) {
1002 1002 if (vnode->v_pages == NULL)
1003 1003 next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1004 1004
1005 1005 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1006 1006 if (next == *last)
1007 1007 break;
1008 1008 next = AVL_NEXT(tree, next);
1009 1009 continue;
1010 1010 }
1011 1011
1012 1012 bound_type = next->vmb_type;
1013 1013 index = next->vmb_start;
1014 1014 while (index <= next->vmb_end) {
1015 1015
1016 1016 /*
1017 1017 * These are used to determine how much to increment
1018 1018 * index when a large page is found.
1019 1019 */
1020 1020 page_t *page;
1021 1021 pgcnt_t pgcnt = 1;
1022 1022 uint_t pgshft;
1023 1023 pgcnt_t pgmsk;
1024 1024
1025 1025 if (vnode->v_pages != NULL &&
1026 1026 (page = page_exists(vnode, ptob(index))) != NULL) {
1027 1027 page_type = VMUSAGE_BOUND_INCORE;
1028 1028 if (page->p_szc > 0) {
1029 1029 pgcnt = page_get_pagecnt(page->p_szc);
1030 1030 pgshft = page_get_shift(page->p_szc);
1031 1031 pgmsk = (0x1 << (pgshft - PAGESHIFT))
1032 1032 - 1;
1033 1033 }
1034 1034 } else {
1035 1035 page_type = VMUSAGE_BOUND_NOT_INCORE;
1036 1036 }
1037 1037 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1038 1038 next->vmb_type = page_type;
1039 1039 } else if (next->vmb_type != page_type) {
1040 1040 /*
1041 1041 * If current bound type does not match page
1042 1042 * type, need to split off new bound.
1043 1043 */
1044 1044 tmp = vmu_alloc_bound();
1045 1045 tmp->vmb_type = page_type;
1046 1046 tmp->vmb_start = index;
1047 1047 tmp->vmb_end = next->vmb_end;
1048 1048 avl_insert_here(tree, tmp, next, AVL_AFTER);
1049 1049 next->vmb_end = index - 1;
1050 1050 if (*last == next)
1051 1051 *last = tmp;
1052 1052 next = tmp;
1053 1053 }
1054 1054 if (pgcnt > 1) {
1055 1055 /*
1056 1056 * If inside large page, jump to next large
1057 1057 * page
1058 1058 */
1059 1059 index = (index & ~pgmsk) + pgcnt;
1060 1060 } else {
1061 1061 index++;
1062 1062 }
1063 1063 }
1064 1064 if (next == *last) {
1065 1065 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1066 1066 break;
1067 1067 } else
1068 1068 next = AVL_NEXT(tree, next);
1069 1069 }
1070 1070 }
1071 1071
1072 1072 /*
1073 1073 * Calculate the rss and swap consumed by a segment. vmu_entities is the
1074 1074 * list of entities to visit. For shared segments, the vnode or amp
1075 1075 * is looked up in each entity to see if it has been already counted. Private
1076 1076 * anon pages are checked per entity to ensure that COW pages are not
1077 1077 * double counted.
1078 1078 *
1079 1079 * For private mapped files, first the amp is checked for private pages.
1080 1080 * Bounds not backed by the amp are looked up in the vnode for each entity
1081 1081 * to avoid double counting of private COW vnode pages.
1082 1082 */
1083 1083 static void
1084 1084 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1085 1085 {
1086 1086 struct segvn_data *svd;
1087 1087 struct shm_data *shmd;
1088 1088 struct spt_data *sptd;
1089 1089 vmu_object_t *shared_object = NULL;
1090 1090 vmu_object_t *entity_object = NULL;
1091 1091 vmu_entity_t *entity;
1092 1092 vmusage_t *result;
1093 1093 vmu_bound_t *first = NULL;
1094 1094 vmu_bound_t *last = NULL;
1095 1095 vmu_bound_t *cur = NULL;
1096 1096 vmu_bound_t *e_first = NULL;
1097 1097 vmu_bound_t *e_last = NULL;
1098 1098 vmu_bound_t *tmp;
1099 1099 pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1100 1100 struct anon_map *private_amp = NULL;
1101 1101 boolean_t incore = B_FALSE;
1102 1102 boolean_t shared = B_FALSE;
1103 1103 int file = 0;
1104 1104 pgcnt_t swresv = 0;
1105 1105 pgcnt_t panon = 0;
1106 1106
1107 1107 /* Can zero-length segments exist? Not sure, so paranoia. */
1108 1108 if (seg->s_size <= 0)
1109 1109 return;
1110 1110
1111 1111 /*
1112 1112 * Figure out if there is a shared object (such as a named vnode or
1113 1113 * a shared amp, then figure out if there is a private amp, which
1114 1114 * identifies private pages.
1115 1115 */
1116 1116 if (seg->s_ops == &segvn_ops) {
1117 1117 svd = (struct segvn_data *)seg->s_data;
1118 1118 if (svd->type == MAP_SHARED) {
1119 1119 shared = B_TRUE;
1120 1120 } else {
1121 1121 swresv = svd->swresv;
1122 1122
1123 1123 if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
1124 1124 RW_READER) != 0) {
1125 1125 /*
1126 1126 * Text replication anon maps can be shared
1127 1127 * across all zones. Space used for text
1128 1128 * replication is typically capped as a small %
1129 1129 * of memory. To keep it simple for now we
1130 1130 * don't account for swap and memory space used
1131 1131 * for text replication.
1132 1132 */
1133 1133 if (svd->tr_state == SEGVN_TR_OFF &&
1134 1134 svd->amp != NULL) {
1135 1135 private_amp = svd->amp;
1136 1136 p_start = svd->anon_index;
1137 1137 p_end = svd->anon_index +
1138 1138 btop(seg->s_size) - 1;
1139 1139 }
1140 1140 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1141 1141 }
1142 1142 }
1143 1143 if (svd->vp != NULL) {
1144 1144 file = 1;
1145 1145 shared_object = vmu_find_insert_object(
1146 1146 vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1147 1147 VMUSAGE_TYPE_VNODE);
1148 1148 s_start = btop(svd->offset);
1149 1149 s_end = btop(svd->offset + seg->s_size) - 1;
1150 1150 }
1151 1151 if (svd->amp != NULL && svd->type == MAP_SHARED) {
1152 1152 ASSERT(shared_object == NULL);
1153 1153 shared_object = vmu_find_insert_object(
1154 1154 vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1155 1155 VMUSAGE_TYPE_AMP);
1156 1156 s_start = svd->anon_index;
1157 1157 s_end = svd->anon_index + btop(seg->s_size) - 1;
1158 1158 /* schedctl mappings are always in core */
1159 1159 if (svd->amp->swresv == 0)
1160 1160 incore = B_TRUE;
1161 1161 }
1162 1162 } else if (seg->s_ops == &segspt_shmops) {
1163 1163 shared = B_TRUE;
1164 1164 shmd = (struct shm_data *)seg->s_data;
1165 1165 shared_object = vmu_find_insert_object(
1166 1166 vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1167 1167 VMUSAGE_TYPE_AMP);
1168 1168 s_start = 0;
1169 1169 s_end = btop(seg->s_size) - 1;
1170 1170 sptd = shmd->shm_sptseg->s_data;
1171 1171
1172 1172 /* ism segments are always incore and do not reserve swap */
1173 1173 if (sptd->spt_flags & SHM_SHARE_MMU)
1174 1174 incore = B_TRUE;
1175 1175
1176 1176 } else {
1177 1177 return;
1178 1178 }
1179 1179
1180 1180 /*
1181 1181 * If there is a private amp, count anon pages that exist. If an
1182 1182 * anon has a refcnt > 1 (COW sharing), then save the anon in a
1183 1183 * hash so that it is not double counted.
1184 1184 *
1185 1185 * If there is also a shared object, then figure out the bounds
1186 1186 * which are not mapped by the private amp.
1187 1187 */
1188 1188 if (private_amp != NULL) {
1189 1189
1190 1190 /* Enter as writer to prevent COW anons from being freed */
1191 1191 ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1192 1192
1193 1193 p_index = p_start;
1194 1194 s_index = s_start;
1195 1195
1196 1196 while (p_index <= p_end) {
1197 1197
1198 1198 pgcnt_t p_index_next;
1199 1199 pgcnt_t p_bound_size;
1200 1200 int cnt;
1201 1201 anoff_t off;
1202 1202 struct vnode *vn;
1203 1203 struct anon *ap;
1204 1204 page_t *page; /* For handling of large */
1205 1205 pgcnt_t pgcnt = 1; /* pages */
1206 1206 pgcnt_t pgstart;
1207 1207 pgcnt_t pgend;
1208 1208 uint_t pgshft;
1209 1209 pgcnt_t pgmsk;
1210 1210
1211 1211 p_index_next = p_index;
1212 1212 ap = anon_get_next_ptr(private_amp->ahp,
1213 1213 &p_index_next);
1214 1214
1215 1215 /*
1216 1216 * If next anon is past end of mapping, simulate
1217 1217 * end of anon so loop terminates.
1218 1218 */
1219 1219 if (p_index_next > p_end) {
1220 1220 p_index_next = p_end + 1;
1221 1221 ap = NULL;
1222 1222 }
1223 1223 /*
1224 1224 * For COW segments, keep track of bounds not
1225 1225 * backed by private amp so they can be looked
1226 1226 * up in the backing vnode
1227 1227 */
1228 1228 if (p_index_next != p_index) {
1229 1229
1230 1230 /*
1231 1231 * Compute index difference between anon and
1232 1232 * previous anon.
1233 1233 */
1234 1234 p_bound_size = p_index_next - p_index - 1;
1235 1235
1236 1236 if (shared_object != NULL) {
1237 1237 cur = vmu_alloc_bound();
1238 1238 cur->vmb_start = s_index;
1239 1239 cur->vmb_end = s_index + p_bound_size;
1240 1240 cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1241 1241 if (first == NULL) {
1242 1242 first = cur;
1243 1243 last = cur;
1244 1244 } else {
1245 1245 last->vmb_next = cur;
1246 1246 last = cur;
1247 1247 }
1248 1248 }
1249 1249 p_index = p_index + p_bound_size + 1;
1250 1250 s_index = s_index + p_bound_size + 1;
1251 1251 }
1252 1252
1253 1253 /* Detect end of anons in amp */
1254 1254 if (ap == NULL)
1255 1255 break;
1256 1256
1257 1257 cnt = ap->an_refcnt;
1258 1258 swap_xlate(ap, &vn, &off);
1259 1259
1260 1260 if (vn == NULL || vn->v_pages == NULL ||
1261 1261 (page = page_exists(vn, off)) == NULL) {
1262 1262 p_index++;
1263 1263 s_index++;
1264 1264 continue;
1265 1265 }
1266 1266
1267 1267 /*
1268 1268 * If large page is found, compute portion of large
1269 1269 * page in mapping, and increment indices to the next
1270 1270 * large page.
1271 1271 */
1272 1272 if (page->p_szc > 0) {
1273 1273
1274 1274 pgcnt = page_get_pagecnt(page->p_szc);
1275 1275 pgshft = page_get_shift(page->p_szc);
1276 1276 pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
1277 1277
1278 1278 /* First page in large page */
1279 1279 pgstart = p_index & ~pgmsk;
1280 1280 /* Last page in large page */
1281 1281 pgend = pgstart + pgcnt - 1;
1282 1282 /*
1283 1283 * Artificially end page if page extends past
1284 1284 * end of mapping.
1285 1285 */
1286 1286 if (pgend > p_end)
1287 1287 pgend = p_end;
1288 1288
1289 1289 /*
1290 1290 * Compute number of pages from large page
1291 1291 * which are mapped.
1292 1292 */
1293 1293 pgcnt = pgend - p_index + 1;
1294 1294
1295 1295 /*
1296 1296 * Point indices at page after large page,
1297 1297 * or at page after end of mapping.
1298 1298 */
1299 1299 p_index += pgcnt;
1300 1300 s_index += pgcnt;
1301 1301 } else {
1302 1302 p_index++;
1303 1303 s_index++;
1304 1304 }
1305 1305
1306 1306 /*
1307 1307 * Assume anon structs with a refcnt
1308 1308 * of 1 are not COW shared, so there
1309 1309 * is no reason to track them per entity.
1310 1310 */
1311 1311 if (cnt == 1) {
1312 1312 panon += pgcnt;
1313 1313 continue;
1314 1314 }
1315 1315 for (entity = vmu_entities; entity != NULL;
1316 1316 entity = entity->vme_next_calc) {
1317 1317
1318 1318 result = &entity->vme_result;
1319 1319 /*
1320 1320 * Track COW anons per entity so
1321 1321 * they are not double counted.
1322 1322 */
1323 1323 if (vmu_find_insert_anon(entity->vme_anon_hash,
1324 1324 (caddr_t)ap) == 0)
1325 1325 continue;
1326 1326
1327 1327 result->vmu_rss_all += (pgcnt << PAGESHIFT);
1328 1328 result->vmu_rss_private +=
1329 1329 (pgcnt << PAGESHIFT);
1330 1330 }
1331 1331 }
1332 1332 ANON_LOCK_EXIT(&private_amp->a_rwlock);
1333 1333 }
1334 1334
1335 1335 /* Add up resident anon and swap reserved for private mappings */
1336 1336 if (swresv > 0 || panon > 0) {
1337 1337 for (entity = vmu_entities; entity != NULL;
1338 1338 entity = entity->vme_next_calc) {
1339 1339 result = &entity->vme_result;
1340 1340 result->vmu_swap_all += swresv;
1341 1341 result->vmu_swap_private += swresv;
1342 1342 result->vmu_rss_all += (panon << PAGESHIFT);
1343 1343 result->vmu_rss_private += (panon << PAGESHIFT);
1344 1344 }
1345 1345 }
1346 1346
1347 1347 /* Compute resident pages backing shared amp or named vnode */
1348 1348 if (shared_object != NULL) {
1349 1349 avl_tree_t *tree = &(shared_object->vmo_bounds);
1350 1350
1351 1351 if (first == NULL) {
1352 1352 /*
1353 1353 * No private amp, or private amp has no anon
1354 1354 * structs. This means entire segment is backed by
1355 1355 * the shared object.
1356 1356 */
1357 1357 first = vmu_alloc_bound();
1358 1358 first->vmb_start = s_start;
1359 1359 first->vmb_end = s_end;
1360 1360 first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1361 1361 }
1362 1362 /*
1363 1363 * Iterate bounds not backed by private amp, and compute
1364 1364 * resident pages.
1365 1365 */
1366 1366 cur = first;
1367 1367 while (cur != NULL) {
1368 1368
1369 1369 if (vmu_insert_lookup_object_bounds(shared_object,
1370 1370 cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1371 1371 &first, &last) > 0) {
1372 1372 /* new bounds, find incore/not-incore */
1373 1373 if (shared_object->vmo_type ==
1374 1374 VMUSAGE_TYPE_VNODE) {
1375 1375 vmu_vnode_update_incore_bounds(
1376 1376 tree,
1377 1377 (vnode_t *)
1378 1378 shared_object->vmo_key, &first,
1379 1379 &last);
1380 1380 } else {
1381 1381 vmu_amp_update_incore_bounds(
1382 1382 tree,
1383 1383 (struct anon_map *)
1384 1384 shared_object->vmo_key, &first,
1385 1385 &last, incore);
1386 1386 }
1387 1387 vmu_merge_bounds(tree, &first, &last);
1388 1388 }
1389 1389 for (entity = vmu_entities; entity != NULL;
1390 1390 entity = entity->vme_next_calc) {
1391 1391 avl_tree_t *e_tree;
1392 1392
1393 1393 result = &entity->vme_result;
1394 1394
1395 1395 entity_object = vmu_find_insert_object(
1396 1396 shared_object->vmo_type ==
1397 1397 VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1398 1398 entity->vme_amp_hash,
1399 1399 shared_object->vmo_key,
1400 1400 shared_object->vmo_type);
1401 1401
1402 1402 virt = vmu_insert_lookup_object_bounds(
1403 1403 entity_object, cur->vmb_start, cur->vmb_end,
1404 1404 VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1405 1405
1406 1406 if (virt == 0)
1407 1407 continue;
1408 1408 /*
1409 1409 * Range visited for this entity
1410 1410 */
1411 1411 e_tree = &(entity_object->vmo_bounds);
1412 1412 rss = vmu_update_bounds(e_tree, &e_first,
1413 1413 &e_last, tree, first, last);
1414 1414 result->vmu_rss_all += (rss << PAGESHIFT);
1415 1415 if (shared == B_TRUE && file == B_FALSE) {
1416 1416 /* shared anon mapping */
1417 1417 result->vmu_swap_all +=
1418 1418 (virt << PAGESHIFT);
1419 1419 result->vmu_swap_shared +=
1420 1420 (virt << PAGESHIFT);
1421 1421 result->vmu_rss_shared +=
1422 1422 (rss << PAGESHIFT);
1423 1423 } else if (shared == B_TRUE && file == B_TRUE) {
1424 1424 /* shared file mapping */
1425 1425 result->vmu_rss_shared +=
1426 1426 (rss << PAGESHIFT);
1427 1427 } else if (shared == B_FALSE &&
1428 1428 file == B_TRUE) {
1429 1429 /* private file mapping */
1430 1430 result->vmu_rss_private +=
1431 1431 (rss << PAGESHIFT);
1432 1432 }
1433 1433 vmu_merge_bounds(e_tree, &e_first, &e_last);
1434 1434 }
1435 1435 tmp = cur;
1436 1436 cur = cur->vmb_next;
1437 1437 vmu_free_bound(tmp);
1438 1438 }
1439 1439 }
1440 1440 }
1441 1441
1442 1442 /*
1443 1443 * Based on the current calculation flags, find the relevant entities
1444 1444 * that the process belongs to. Then calculate each segment
1445 1445 * in the process's address space for each relevant entity.
1446 1446 */
1447 1447 static void
1448 1448 vmu_calculate_proc(proc_t *p)
1449 1449 {
1450 1450 vmu_entity_t *entities = NULL;
1451 1451 vmu_zone_t *zone;
1452 1452 vmu_entity_t *tmp;
1453 1453 struct as *as;
1454 1454 struct seg *seg;
1455 1455 int ret;
1456 1456
1457 1457 /* Figure out which entities are being computed */
1458 1458 if ((vmu_data.vmu_system) != NULL) {
1459 1459 tmp = vmu_data.vmu_system;
1460 1460 tmp->vme_next_calc = entities;
1461 1461 entities = tmp;
1462 1462 }
1463 1463 if (vmu_data.vmu_calc_flags &
1464 1464 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
1465 1465 VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1466 1466 VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1467 1467 VMUSAGE_ALL_EUSERS)) {
1468 1468 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1469 1469 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1470 1470 (mod_hash_val_t *)&zone);
1471 1471 if (ret != 0) {
1472 1472 zone = vmu_alloc_zone(p->p_zone->zone_id);
1473 1473 ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1474 1474 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1475 1475 (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1476 1476 ASSERT(ret == 0);
1477 1477 }
1478 1478 if (zone->vmz_zone != NULL) {
1479 1479 tmp = zone->vmz_zone;
1480 1480 tmp->vme_next_calc = entities;
1481 1481 entities = tmp;
1482 1482 }
1483 1483 if (vmu_data.vmu_calc_flags &
1484 1484 (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1485 1485 tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1486 1486 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1487 1487 zone->vmz_id);
1488 1488 tmp->vme_next_calc = entities;
1489 1489 entities = tmp;
1490 1490 }
1491 1491 if (vmu_data.vmu_calc_flags &
1492 1492 (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1493 1493 tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1494 1494 p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1495 1495 tmp->vme_next_calc = entities;
1496 1496 entities = tmp;
1497 1497 }
1498 1498 if (vmu_data.vmu_calc_flags &
1499 1499 (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1500 1500 tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1501 1501 crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1502 1502 tmp->vme_next_calc = entities;
1503 1503 entities = tmp;
1504 1504 }
1505 1505 if (vmu_data.vmu_calc_flags &
1506 1506 (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1507 1507 tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1508 1508 crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1509 1509 tmp->vme_next_calc = entities;
1510 1510 entities = tmp;
1511 1511 }
1512 1512 }
1513 1513 /* Entities which collapse projects and users for all zones */
1514 1514 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1515 1515 tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1516 1516 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1517 1517 tmp->vme_next_calc = entities;
1518 1518 entities = tmp;
1519 1519 }
1520 1520 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1521 1521 tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1522 1522 crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1523 1523 tmp->vme_next_calc = entities;
1524 1524 entities = tmp;
1525 1525 }
1526 1526 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1527 1527 tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1528 1528 crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1529 1529 tmp->vme_next_calc = entities;
1530 1530 entities = tmp;
1531 1531 }
1532 1532
1533 1533 ASSERT(entities != NULL);
1534 1534 /* process all segs in process's address space */
1535 1535 as = p->p_as;
1536 - AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1536 + AS_LOCK_ENTER(as, RW_READER);
1537 1537 for (seg = AS_SEGFIRST(as); seg != NULL;
1538 1538 seg = AS_SEGNEXT(as, seg)) {
1539 1539 vmu_calculate_seg(entities, seg);
1540 1540 }
1541 - AS_LOCK_EXIT(as, &as->a_lock);
1541 + AS_LOCK_EXIT(as);
1542 1542 }
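The two changed lines above are the entire point of this patch (as-lock-macro-simplification): AS_LOCK_ENTER() and AS_LOCK_EXIT() no longer take an explicit &as->a_lock argument, since they always operate on the address space's own lock. Below is a small sketch of the resulting calling convention; the single-argument AS_READ_HELD() form is an assumption that the related held-check macros were simplified the same way, since only the ENTER/EXIT macros appear in this diff.

/*
 * Sketch of the simplified AS_LOCK_* usage this patch adopts.  The
 * AS_READ_HELD() single-argument form is assumed, not shown in this diff.
 */
static void
walk_segs_sketch(struct as *as)
{
	struct seg *seg;

	AS_LOCK_ENTER(as, RW_READER);	/* was AS_LOCK_ENTER(as, &as->a_lock, RW_READER) */
	ASSERT(AS_READ_HELD(as));
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		/* examine seg while the address space is read-locked */
	}
	AS_LOCK_EXIT(as);		/* was AS_LOCK_EXIT(as, &as->a_lock) */
}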
1543 1543
1544 1544 /*
1545 1545 * Free data created by previous call to vmu_calculate().
1546 1546 */
1547 1547 static void
1548 1548 vmu_clear_calc()
1549 1549 {
1550 1550 if (vmu_data.vmu_system != NULL)
1551 1551 vmu_free_entity(vmu_data.vmu_system);
1552 1552 vmu_data.vmu_system = NULL;
1553 1553 if (vmu_data.vmu_zones_hash != NULL)
1554 1554 i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1555 1555 if (vmu_data.vmu_projects_col_hash != NULL)
1556 1556 i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1557 1557 if (vmu_data.vmu_rusers_col_hash != NULL)
1558 1558 i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1559 1559 if (vmu_data.vmu_eusers_col_hash != NULL)
1560 1560 i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1561 1561
1562 1562 i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1563 1563 i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1564 1564 }
1565 1565
1566 1566 /*
1567 1567 * Free unused data structures. These can result if the system workload
1568 1568 * decreases between calculations.
1569 1569 */
1570 1570 static void
1571 1571 vmu_free_extra()
1572 1572 {
1573 1573 vmu_bound_t *tb;
1574 1574 vmu_object_t *to;
1575 1575 vmu_entity_t *te;
1576 1576 vmu_zone_t *tz;
1577 1577
1578 1578 while (vmu_data.vmu_free_bounds != NULL) {
1579 1579 tb = vmu_data.vmu_free_bounds;
1580 1580 vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1581 1581 kmem_cache_free(vmu_bound_cache, tb);
1582 1582 }
1583 1583 while (vmu_data.vmu_free_objects != NULL) {
1584 1584 to = vmu_data.vmu_free_objects;
1585 1585 vmu_data.vmu_free_objects =
1586 1586 vmu_data.vmu_free_objects->vmo_next;
1587 1587 kmem_cache_free(vmu_object_cache, to);
1588 1588 }
1589 1589 while (vmu_data.vmu_free_entities != NULL) {
1590 1590 te = vmu_data.vmu_free_entities;
1591 1591 vmu_data.vmu_free_entities =
1592 1592 vmu_data.vmu_free_entities->vme_next;
1593 1593 if (te->vme_vnode_hash != NULL)
1594 1594 mod_hash_destroy_hash(te->vme_vnode_hash);
1595 1595 if (te->vme_amp_hash != NULL)
1596 1596 mod_hash_destroy_hash(te->vme_amp_hash);
1597 1597 if (te->vme_anon_hash != NULL)
1598 1598 mod_hash_destroy_hash(te->vme_anon_hash);
1599 1599 kmem_free(te, sizeof (vmu_entity_t));
1600 1600 }
1601 1601 while (vmu_data.vmu_free_zones != NULL) {
1602 1602 tz = vmu_data.vmu_free_zones;
1603 1603 vmu_data.vmu_free_zones =
1604 1604 vmu_data.vmu_free_zones->vmz_next;
1605 1605 if (tz->vmz_projects_hash != NULL)
1606 1606 mod_hash_destroy_hash(tz->vmz_projects_hash);
1607 1607 if (tz->vmz_tasks_hash != NULL)
1608 1608 mod_hash_destroy_hash(tz->vmz_tasks_hash);
1609 1609 if (tz->vmz_rusers_hash != NULL)
1610 1610 mod_hash_destroy_hash(tz->vmz_rusers_hash);
1611 1611 if (tz->vmz_eusers_hash != NULL)
1612 1612 mod_hash_destroy_hash(tz->vmz_eusers_hash);
1613 1613 kmem_free(tz, sizeof (vmu_zone_t));
1614 1614 }
1615 1615 }
1616 1616
1617 1617 extern kcondvar_t *pr_pid_cv;
1618 1618
1619 1619 /*
1620 1620 * Determine which entity types are relevant and allocate the hashes to
1621 1621 * track them. Then walk the process table and count rss and swap
1622 1622  * for each process's address space. Address space objects such as
1623 1623 * vnodes, amps and anons are tracked per entity, so that they are
1624 1624 * not double counted in the results.
1625 1625 *
1626 1626 */
1627 1627 static void
1628 1628 vmu_calculate()
1629 1629 {
1630 1630 int i = 0;
1631 1631 int ret;
1632 1632 proc_t *p;
1633 1633
1634 1634 vmu_clear_calc();
1635 1635
1636 1636 if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1637 1637 vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1638 1638 ALL_ZONES);
1639 1639
1640 1640 /*
1641 1641 * Walk process table and calculate rss of each proc.
1642 1642 *
1643 1643 * Pidlock and p_lock cannot be held while doing the rss calculation.
1644 1644 * This is because:
1645 1645 * 1. The calculation allocates using KM_SLEEP.
1646 1646 * 2. The calculation grabs a_lock, which cannot be grabbed
1647 1647 * after p_lock.
1648 1648 *
1649 1649  * Since pidlock must be dropped, we cannot simply walk the
1650 1650 * practive list. Instead, we walk the process table, and sprlock
1651 1651 * each process to ensure that it does not exit during the
1652 1652 * calculation.
1653 1653 */
1654 1654
1655 1655 mutex_enter(&pidlock);
1656 1656 for (i = 0; i < v.v_proc; i++) {
1657 1657 again:
1658 1658 p = pid_entry(i);
1659 1659 if (p == NULL)
1660 1660 continue;
1661 1661
1662 1662 mutex_enter(&p->p_lock);
1663 1663 mutex_exit(&pidlock);
1664 1664
1665 1665 if (panicstr) {
1666 1666 mutex_exit(&p->p_lock);
1667 1667 return;
1668 1668 }
1669 1669
1670 1670 /* Try to set P_PR_LOCK */
1671 1671 ret = sprtrylock_proc(p);
1672 1672 if (ret == -1) {
1673 1673 /* Process in invalid state */
1674 1674 mutex_exit(&p->p_lock);
1675 1675 mutex_enter(&pidlock);
1676 1676 continue;
1677 1677 } else if (ret == 1) {
1678 1678 /*
1679 1679 * P_PR_LOCK is already set. Wait and try again.
1680 1680 * This also drops p_lock.
1681 1681 */
1682 1682 sprwaitlock_proc(p);
1683 1683 mutex_enter(&pidlock);
1684 1684 goto again;
1685 1685 }
1686 1686 mutex_exit(&p->p_lock);
1687 1687
1688 1688 vmu_calculate_proc(p);
1689 1689
1690 1690 mutex_enter(&p->p_lock);
1691 1691 sprunlock(p);
1692 1692 mutex_enter(&pidlock);
1693 1693 }
1694 1694 mutex_exit(&pidlock);
1695 1695
1696 1696 vmu_free_extra();
1697 1697 }
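The lock-ordering rules called out in the comment above (a_lock may not be taken after p_lock, and KM_SLEEP allocations are not allowed under pidlock or p_lock) are what force the sprlock-based walk. A condensed sketch of that pattern, assuming a hypothetical do_work() callback and collapsing the P_PR_LOCK retry case into a skip:

/*
 * Condensed illustration only; vmu_calculate() above additionally waits
 * and retries when P_PR_LOCK is already held (sprtrylock_proc() == 1).
 */
static void
walk_procs(void (*do_work)(proc_t *))
{
	proc_t *p;
	int i;

	mutex_enter(&pidlock);
	for (i = 0; i < v.v_proc; i++) {
		if ((p = pid_entry(i)) == NULL)
			continue;
		mutex_enter(&p->p_lock);	/* take p_lock, then drop pidlock */
		mutex_exit(&pidlock);
		if (sprtrylock_proc(p) != 0) {	/* invalid, or already P_PR_LOCKed */
			mutex_exit(&p->p_lock);
			mutex_enter(&pidlock);
			continue;
		}
		mutex_exit(&p->p_lock);		/* now safe to sleep and take a_lock */
		do_work(p);			/* hypothetical per-process work */
		mutex_enter(&p->p_lock);
		sprunlock(p);			/* clears P_PR_LOCK and drops p_lock */
		mutex_enter(&pidlock);
	}
	mutex_exit(&pidlock);
}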
1698 1698
1699 1699 /*
1700 1700  * Allocate a new cache for nres results satisfying flags.
1701 1701 */
1702 1702 vmu_cache_t *
1703 1703 vmu_cache_alloc(size_t nres, uint_t flags)
1704 1704 {
1705 1705 vmu_cache_t *cache;
1706 1706
1707 1707 cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1708 1708 cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1709 1709 cache->vmc_nresults = nres;
1710 1710 cache->vmc_flags = flags;
1711 1711 cache->vmc_refcnt = 1;
1712 1712 return (cache);
1713 1713 }
1714 1714
1715 1715 /*
1716 1716 * Make sure cached results are not freed
1717 1717  * Take a reference on the cache so its results are not freed.
1718 1718 static void
1719 1719 vmu_cache_hold(vmu_cache_t *cache)
1720 1720 {
1721 1721 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1722 1722 cache->vmc_refcnt++;
1723 1723 }
1724 1724
1725 1725 /*
1726 1726  * Drop a reference on the cache; free it when the refcount reaches zero.
1727 1727 */
1728 1728 static void
1729 1729 vmu_cache_rele(vmu_cache_t *cache)
1730 1730 {
1731 1731 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1732 1732 ASSERT(cache->vmc_refcnt > 0);
1733 1733 cache->vmc_refcnt--;
1734 1734 if (cache->vmc_refcnt == 0) {
1735 1735 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1736 1736 cache->vmc_nresults);
1737 1737 kmem_free(cache, sizeof (vmu_cache_t));
1738 1738 }
1739 1739 }
1740 1740
1741 1741 /*
1742 1742 * Copy out the cached results to a caller. Inspect the callers flags
1743 1743 * and zone to determine which cached results should be copied.
1744 1744 */
1745 1745 static int
1746 1746 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1747 1747 uint_t flags, int cpflg)
1748 1748 {
1749 1749 vmusage_t *result, *out_result;
1750 1750 vmusage_t dummy;
1751 1751 size_t i, count = 0;
1752 1752 size_t bufsize;
1753 1753 int ret = 0;
1754 1754 uint_t types = 0;
1755 1755
1756 1756 if (nres != NULL) {
1757 1757 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1758 1758 return (set_errno(EFAULT));
1759 1759 } else {
1760 1760 bufsize = 0;
1761 1761 }
1762 1762
1763 1763 /* figure out what results the caller is interested in. */
1764 1764 if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1765 1765 types |= VMUSAGE_SYSTEM;
1766 1766 if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
1767 1767 types |= VMUSAGE_ZONE;
1768 1768 if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1769 1769 VMUSAGE_COL_PROJECTS))
1770 1770 types |= VMUSAGE_PROJECTS;
1771 1771 if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1772 1772 types |= VMUSAGE_TASKS;
1773 1773 if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1774 1774 types |= VMUSAGE_RUSERS;
1775 1775 if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1776 1776 types |= VMUSAGE_EUSERS;
1777 1777
1778 1778 	/* Count, filter, and copy out the results visible to the caller. */
1779 1779 out_result = buf;
1780 1780 for (result = cache->vmc_results, i = 0;
1781 1781 i < cache->vmc_nresults; result++, i++) {
1782 1782
1783 1783 /* Do not return "other-zone" results to non-global zones */
1784 1784 if (curproc->p_zone != global_zone &&
1785 1785 curproc->p_zone->zone_id != result->vmu_zoneid)
1786 1786 continue;
1787 1787
1788 1788 /*
1789 1789 * If non-global zone requests VMUSAGE_SYSTEM, fake
1790 1790 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
1791 1791 */
1792 1792 if (curproc->p_zone != global_zone &&
1793 1793 (flags & VMUSAGE_SYSTEM) != 0 &&
1794 1794 result->vmu_type == VMUSAGE_ZONE) {
1795 1795 count++;
1796 1796 if (out_result != NULL) {
1797 1797 if (bufsize < count) {
1798 1798 ret = set_errno(EOVERFLOW);
1799 1799 } else {
1800 1800 dummy = *result;
1801 1801 dummy.vmu_zoneid = ALL_ZONES;
1802 1802 dummy.vmu_id = 0;
1803 1803 dummy.vmu_type = VMUSAGE_SYSTEM;
1804 1804 if (ddi_copyout(&dummy, out_result,
1805 1805 sizeof (vmusage_t), cpflg))
1806 1806 return (set_errno(EFAULT));
1807 1807 out_result++;
1808 1808 }
1809 1809 }
1810 1810 }
1811 1811
1812 1812 /* Skip results that do not match requested type */
1813 1813 if ((result->vmu_type & types) == 0)
1814 1814 continue;
1815 1815
1816 1816 /* Skip collated results if not requested */
1817 1817 if (result->vmu_zoneid == ALL_ZONES) {
1818 1818 if (result->vmu_type == VMUSAGE_PROJECTS &&
1819 1819 (flags & VMUSAGE_COL_PROJECTS) == 0)
1820 1820 continue;
1821 1821 if (result->vmu_type == VMUSAGE_EUSERS &&
1822 1822 (flags & VMUSAGE_COL_EUSERS) == 0)
1823 1823 continue;
1824 1824 if (result->vmu_type == VMUSAGE_RUSERS &&
1825 1825 (flags & VMUSAGE_COL_RUSERS) == 0)
1826 1826 continue;
1827 1827 }
1828 1828
1829 1829 /* Skip "other zone" results if not requested */
1830 1830 if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1831 1831 if (result->vmu_type == VMUSAGE_ZONE &&
1832 1832 (flags & VMUSAGE_ALL_ZONES) == 0)
1833 1833 continue;
1834 1834 if (result->vmu_type == VMUSAGE_PROJECTS &&
1835 1835 (flags & (VMUSAGE_ALL_PROJECTS |
1836 1836 VMUSAGE_COL_PROJECTS)) == 0)
1837 1837 continue;
1838 1838 if (result->vmu_type == VMUSAGE_TASKS &&
1839 1839 (flags & VMUSAGE_ALL_TASKS) == 0)
1840 1840 continue;
1841 1841 if (result->vmu_type == VMUSAGE_RUSERS &&
1842 1842 (flags & (VMUSAGE_ALL_RUSERS |
1843 1843 VMUSAGE_COL_RUSERS)) == 0)
1844 1844 continue;
1845 1845 if (result->vmu_type == VMUSAGE_EUSERS &&
1846 1846 (flags & (VMUSAGE_ALL_EUSERS |
1847 1847 VMUSAGE_COL_EUSERS)) == 0)
1848 1848 continue;
1849 1849 }
1850 1850 count++;
1851 1851 if (out_result != NULL) {
1852 1852 if (bufsize < count) {
1853 1853 ret = set_errno(EOVERFLOW);
1854 1854 } else {
1855 1855 if (ddi_copyout(result, out_result,
1856 1856 sizeof (vmusage_t), cpflg))
1857 1857 return (set_errno(EFAULT));
1858 1858 out_result++;
1859 1859 }
1860 1860 }
1861 1861 }
1862 1862 if (nres != NULL)
1863 1863 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1864 1864 return (set_errno(EFAULT));
1865 1865
1866 1866 return (ret);
1867 1867 }
1868 1868
1869 1869 /*
1870 1870 * vm_getusage()
1871 1871 *
1872 1872 * Counts rss and swap by zone, project, task, and/or user. The flags argument
1873 1873 * determines the type of results structures returned. Flags requesting
1874 1874  * determines the type of result structures returned. Flags requesting
1875 1875 * caller is not the global zone.
1876 1876 *
1877 1877 * args:
1878 1878 * flags: bitmap consisting of one or more of VMUSAGE_*.
1879 1879  *	age: maximum allowable age of the results, in seconds (time since
1880 1880  *		the counting was done). Results from previous callers are
1881 1881  *		cached in the kernel.
1882 1882 * buf: pointer to buffer array of vmusage_t. If NULL, then only nres
1883 1883  *		is set on success.
1884 1884 * nres: Set to number of vmusage_t structures pointed to by buf
1885 1885 * before calling vm_getusage().
1886 1886  *		On return of 0 (success) or EOVERFLOW, set to the number of result
1887 1887  *		structures returned or that the call attempted to return.
1888 1888 *
1889 1889 * returns 0 on success, -1 on failure:
1890 1890 * EINTR (interrupted)
1891 1891  * EOVERFLOW (nres too small for results; nres set to needed value for success)
1892 1892 * EINVAL (flags invalid)
1893 1893 * EFAULT (bad address for buf or nres)
1894 1894 */
1895 1895 int
1896 1896 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1897 1897 {
1898 1898 vmu_entity_t *entity;
1899 1899 vmusage_t *result;
1900 1900 int ret = 0;
1901 1901 int cacherecent = 0;
1902 1902 hrtime_t now;
1903 1903 uint_t flags_orig;
1904 1904
1905 1905 /*
1906 1906 * Non-global zones cannot request system wide and/or collated
1907 1907 * results, or the system result, so munge the flags accordingly.
1908 1908 */
1909 1909 flags_orig = flags;
1910 1910 if (curproc->p_zone != global_zone) {
1911 1911 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1912 1912 flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1913 1913 flags |= VMUSAGE_PROJECTS;
1914 1914 }
1915 1915 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1916 1916 flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1917 1917 flags |= VMUSAGE_RUSERS;
1918 1918 }
1919 1919 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1920 1920 flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1921 1921 flags |= VMUSAGE_EUSERS;
1922 1922 }
1923 1923 if (flags & VMUSAGE_SYSTEM) {
1924 1924 flags &= ~VMUSAGE_SYSTEM;
1925 1925 flags |= VMUSAGE_ZONE;
1926 1926 }
1927 1927 }
1928 1928
1929 1929 /* Check for unknown flags */
1930 1930 if ((flags & (~VMUSAGE_MASK)) != 0)
1931 1931 return (set_errno(EINVAL));
1932 1932
1933 1933 /* Check for no flags */
1934 1934 if ((flags & VMUSAGE_MASK) == 0)
1935 1935 return (set_errno(EINVAL));
1936 1936
1937 1937 mutex_enter(&vmu_data.vmu_lock);
1938 1938 now = gethrtime();
1939 1939
1940 1940 start:
1941 1941 if (vmu_data.vmu_cache != NULL) {
1942 1942
1943 1943 vmu_cache_t *cache;
1944 1944
1945 1945 if ((vmu_data.vmu_cache->vmc_timestamp +
1946 1946 ((hrtime_t)age * NANOSEC)) > now)
1947 1947 cacherecent = 1;
1948 1948
1949 1949 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
1950 1950 cacherecent == 1) {
1951 1951 cache = vmu_data.vmu_cache;
1952 1952 vmu_cache_hold(cache);
1953 1953 mutex_exit(&vmu_data.vmu_lock);
1954 1954
1955 1955 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
1956 1956 cpflg);
1957 1957 mutex_enter(&vmu_data.vmu_lock);
1958 1958 vmu_cache_rele(cache);
1959 1959 if (vmu_data.vmu_pending_waiters > 0)
1960 1960 cv_broadcast(&vmu_data.vmu_cv);
1961 1961 mutex_exit(&vmu_data.vmu_lock);
1962 1962 return (ret);
1963 1963 }
1964 1964 /*
1965 1965 * If the cache is recent, it is likely that there are other
1966 1966 * consumers of vm_getusage running, so add their flags to the
1967 1967 * desired flags for the calculation.
1968 1968 */
1969 1969 if (cacherecent == 1)
1970 1970 flags = vmu_data.vmu_cache->vmc_flags | flags;
1971 1971 }
1972 1972 if (vmu_data.vmu_calc_thread == NULL) {
1973 1973
1974 1974 vmu_cache_t *cache;
1975 1975
1976 1976 vmu_data.vmu_calc_thread = curthread;
1977 1977 vmu_data.vmu_calc_flags = flags;
1978 1978 vmu_data.vmu_entities = NULL;
1979 1979 vmu_data.vmu_nentities = 0;
1980 1980 if (vmu_data.vmu_pending_waiters > 0)
1981 1981 vmu_data.vmu_calc_flags |=
1982 1982 vmu_data.vmu_pending_flags;
1983 1983
1984 1984 vmu_data.vmu_pending_flags = 0;
1985 1985 mutex_exit(&vmu_data.vmu_lock);
1986 1986 vmu_calculate();
1987 1987 mutex_enter(&vmu_data.vmu_lock);
1988 1988 /* copy results to cache */
1989 1989 if (vmu_data.vmu_cache != NULL)
1990 1990 vmu_cache_rele(vmu_data.vmu_cache);
1991 1991 cache = vmu_data.vmu_cache =
1992 1992 vmu_cache_alloc(vmu_data.vmu_nentities,
1993 1993 vmu_data.vmu_calc_flags);
1994 1994
1995 1995 result = cache->vmc_results;
1996 1996 for (entity = vmu_data.vmu_entities; entity != NULL;
1997 1997 entity = entity->vme_next) {
1998 1998 *result = entity->vme_result;
1999 1999 result++;
2000 2000 }
2001 2001 cache->vmc_timestamp = gethrtime();
2002 2002 vmu_cache_hold(cache);
2003 2003
2004 2004 vmu_data.vmu_calc_flags = 0;
2005 2005 vmu_data.vmu_calc_thread = NULL;
2006 2006
2007 2007 if (vmu_data.vmu_pending_waiters > 0)
2008 2008 cv_broadcast(&vmu_data.vmu_cv);
2009 2009
2010 2010 mutex_exit(&vmu_data.vmu_lock);
2011 2011
2012 2012 /* copy cache */
2013 2013 ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
2014 2014 mutex_enter(&vmu_data.vmu_lock);
2015 2015 vmu_cache_rele(cache);
2016 2016 mutex_exit(&vmu_data.vmu_lock);
2017 2017
2018 2018 return (ret);
2019 2019 }
2020 2020 vmu_data.vmu_pending_flags |= flags;
2021 2021 vmu_data.vmu_pending_waiters++;
2022 2022 while (vmu_data.vmu_calc_thread != NULL) {
2023 2023 if (cv_wait_sig(&vmu_data.vmu_cv,
2024 2024 &vmu_data.vmu_lock) == 0) {
2025 2025 vmu_data.vmu_pending_waiters--;
2026 2026 mutex_exit(&vmu_data.vmu_lock);
2027 2027 return (set_errno(EINTR));
2028 2028 }
2029 2029 }
2030 2030 vmu_data.vmu_pending_waiters--;
2031 2031 goto start;
2032 2032 }
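For context, a sketch of how this interface is typically consumed from userland through the getvmusage(2) wrapper; the prototype and the vmusage_t field names (vmu_type, vmu_id, vmu_rss_all, vmu_swap_all) are assumed from <sys/vm_usage.h>, and error handling is abbreviated. The buffer is sized by a first call that passes a NULL buf, as described in the block comment above; a robust consumer would also retry on EOVERFLOW in case the result set grew between the two calls.

#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t nres = 0;
	vmusage_t *buf;
	size_t i;

	/* First call: NULL buf, so only the required result count is returned. */
	if (getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 30, NULL, &nres) != 0)
		return (1);

	if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
		return (1);

	/* Second call: copy out up to nres results no older than 30 seconds. */
	if (getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 30, buf, &nres) != 0)
		return (1);

	for (i = 0; i < nres; i++)
		(void) printf("type %u id %lld rss %llu swap %llu\n",
		    buf[i].vmu_type, (long long)buf[i].vmu_id,
		    (unsigned long long)buf[i].vmu_rss_all,
		    (unsigned long long)buf[i].vmu_swap_all);

	free(buf);
	return (0);
}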