5302 vm: remove 'nopageage' static global
--- old/usr/src/uts/common/vm/vm_page.c
+++ new/usr/src/uts/common/vm/vm_page.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 26 /* All Rights Reserved */
27 27
28 28 /*
29 29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 30 * The Regents of the University of California
31 31 * All Rights Reserved
32 32 *
33 33 * University Acknowledgment- Portions of this document are derived from
34 34 * software developed by the University of California, Berkeley, and its
35 35 * contributors.
36 36 */
37 37
38 38 /*
39 39 * VM - physical page management.
40 40 */
41 41
42 42 #include <sys/types.h>
43 43 #include <sys/t_lock.h>
44 44 #include <sys/param.h>
45 45 #include <sys/systm.h>
46 46 #include <sys/errno.h>
47 47 #include <sys/time.h>
48 48 #include <sys/vnode.h>
49 49 #include <sys/vm.h>
50 50 #include <sys/vtrace.h>
51 51 #include <sys/swap.h>
52 52 #include <sys/cmn_err.h>
53 53 #include <sys/tuneable.h>
54 54 #include <sys/sysmacros.h>
55 55 #include <sys/cpuvar.h>
56 56 #include <sys/callb.h>
57 57 #include <sys/debug.h>
58 58 #include <sys/tnf_probe.h>
59 59 #include <sys/condvar_impl.h>
60 60 #include <sys/mem_config.h>
61 61 #include <sys/mem_cage.h>
62 62 #include <sys/kmem.h>
63 63 #include <sys/atomic.h>
64 64 #include <sys/strlog.h>
65 65 #include <sys/mman.h>
66 66 #include <sys/ontrap.h>
67 67 #include <sys/lgrp.h>
68 68 #include <sys/vfs.h>
69 69
70 70 #include <vm/hat.h>
71 71 #include <vm/anon.h>
72 72 #include <vm/page.h>
73 73 #include <vm/seg.h>
74 74 #include <vm/pvn.h>
75 75 #include <vm/seg_kmem.h>
76 76 #include <vm/vm_dep.h>
77 77 #include <sys/vm_usage.h>
78 78 #include <fs/fs_subr.h>
79 79 #include <sys/ddi.h>
80 80 #include <sys/modctl.h>
81 81
82 -static int nopageage = 0;
83 -
84 82 static pgcnt_t max_page_get; /* max page_get request size in pages */
85 83 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */
86 84
87 85 /*
88 86 * freemem_lock protects all freemem variables:
89 87 * availrmem. Also this lock protects the globals which track the
90 88 * availrmem changes for accurate kernel footprint calculation.
91 89 * See below for an explanation of these
92 90 * globals.
93 91 */
94 92 kmutex_t freemem_lock;
95 93 pgcnt_t availrmem;
96 94 pgcnt_t availrmem_initial;
97 95
98 96 /*
99 97 * These globals track availrmem changes to get a more accurate
100 98 * estimate of the kernel size. Historically pp_kernel is used for
101 99 * kernel size and is based on availrmem. But availrmem is adjusted for
102 100 * locked pages in the system not just for kernel locked pages.
103 101 * These new counters will track the pages locked through segvn and
104 102 * by explicit user locking.
105 103 *
106 104 * pages_locked : How many pages are locked because of user specified
107 105 * locking through mlock or plock.
108 106 *
109 107 * pages_useclaim, pages_claimed: These two variables track the
110 108 * claim adjustments because of the protection changes on a segvn segment.
111 109 *
112 110 * All these globals are protected by the same lock which protects availrmem.
113 111 */
114 112 pgcnt_t pages_locked = 0;
115 113 pgcnt_t pages_useclaim = 0;
116 114 pgcnt_t pages_claimed = 0;
117 115
118 116
119 117 /*
120 118 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
121 119 */
122 120 static kmutex_t new_freemem_lock;
123 121 static uint_t freemem_wait; /* someone waiting for freemem */
124 122 static kcondvar_t freemem_cv;
125 123
126 124 /*
127 125 * The logical page free list is maintained as two lists, the 'free'
128 126 * and the 'cache' lists.
129 127 * The free list contains those pages that should be reused first.
130 128 *
131 129 * The implementation of the lists is machine dependent.
132 130 * page_get_freelist(), page_get_cachelist(),
133 131 * page_list_sub(), and page_list_add()
134 132 * form the interface to the machine dependent implementation.
135 133 *
136 134 * Pages with p_free set are on the cache list.
137 135 * Pages with p_free and p_age set are on the free list,
138 136 *
139 137 * A page may be locked while on either list.
140 138 */
141 139
142 140 /*
143 141 * free list accounting stuff.
144 142 *
145 143 *
146 144 * Spread out the value for the number of pages on the
147 145 * page free and page cache lists. If there is just one
148 146 * value, then it must be under just one lock.
149 147 * The lock contention and cache traffic are a real bother.
150 148 *
151 149 * When we acquire and then drop a single pcf lock
152 150 * we can start in the middle of the array of pcf structures.
153 151 * If we acquire more than one pcf lock at a time, we need to
154 152 * start at the front to avoid deadlocking.
155 153 *
156 154 * pcf_count holds the number of pages in each pool.
157 155 *
158 156 * pcf_block is set when page_create_get_something() has asked the
159 157 * PSM page freelist and page cachelist routines without specifying
160 158 * a color and nothing came back. This is used to block anything
161 159 * else from moving pages from one list to the other while the
162 160 * lists are searched again. If a page is freed while pcf_block is
163 161 * set, then pcf_reserve is incremented. pcgs_unblock() takes care
164 162 * of clearing pcf_block, doing the wakeups, etc.
165 163 */
166 164
167 165 #define MAX_PCF_FANOUT NCPU
168 166 static uint_t pcf_fanout = 1; /* Will get changed at boot time */
169 167 static uint_t pcf_fanout_mask = 0;
170 168
171 169 struct pcf {
172 170 kmutex_t pcf_lock; /* protects the structure */
173 171 uint_t pcf_count; /* page count */
174 172 uint_t pcf_wait; /* number of waiters */
175 173 uint_t pcf_block; /* pcgs flag to page_free() */
176 174 uint_t pcf_reserve; /* pages freed after pcf_block set */
177 175 uint_t pcf_fill[10]; /* to line up on the caches */
178 176 };
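
As an aside on the locking rule spelled out in the comment above, here is a hedged,
standalone user-land sketch (bucket names and pthread locks are illustrative, not
this file's kmutexes): a caller needing a single bucket may start at any index, but
a caller that takes more than one bucket lock must always walk from index 0 upward,
so two multi-bucket callers can never deadlock against each other.

	#include <pthread.h>

	#define	NBUCKETS	8	/* stands in for pcf_fanout */

	/* assume these are initialized before use */
	static pthread_mutex_t bucket_lock[NBUCKETS];
	static unsigned int bucket_count[NBUCKETS];

	/* multi-bucket caller: always lock front to back to avoid deadlock */
	static unsigned long
	drain_all_buckets(void)
	{
		unsigned long total = 0;
		int i;

		for (i = 0; i < NBUCKETS; i++)
			pthread_mutex_lock(&bucket_lock[i]);

		for (i = 0; i < NBUCKETS; i++) {
			total += bucket_count[i];
			bucket_count[i] = 0;
		}

		for (i = 0; i < NBUCKETS; i++)
			pthread_mutex_unlock(&bucket_lock[i]);

		return (total);
	}
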
179 177
180 178 /*
181 179 * PCF_INDEX hash needs to be dynamic (every so often the hash changes where
182 180 * it will hash the cpu to). This is done to prevent a drain condition
183 181 * from happening. This drain condition will occur when pcf_count decrement
184 182 * occurs on cpu A and the increment of pcf_count always occurs on cpu B. An
185 183 * example of this shows up with device interrupts. The dma buffer is allocated
186 184 * by the cpu requesting the IO, thus the pcf_count is decremented based on that.
187 185 * When the memory is returned by the interrupt thread, the pcf_count will be
188 186 * incremented based on the cpu servicing the interrupt.
189 187 */
190 188 static struct pcf pcf[MAX_PCF_FANOUT];
191 189 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
192 190 (randtick() >> 24)) & (pcf_fanout_mask))
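
The drain-avoidance idea behind PCF_INDEX() can be shown with a small user-land
sketch (FANOUT and the function name are illustrative, not from this file): folding
a slowly changing slice of the tick counter into the cpu id periodically remaps cpus
onto different buckets, so a steady decrement-on-cpu-A / increment-on-cpu-B pattern
cannot drain one bucket forever.

	#include <stdint.h>

	#define	FANOUT	8	/* must be a power of two, like pcf_fanout */

	static unsigned int
	pcf_index_sketch(unsigned int cpu_seqid, uint64_t tick)
	{
		/* (tick >> 24) only changes every so often, reshuffling the map */
		return ((cpu_seqid + (unsigned int)(tick >> 24)) & (FANOUT - 1));
	}
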
193 191
194 192 static int pcf_decrement_bucket(pgcnt_t);
195 193 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
196 194
197 195 kmutex_t pcgs_lock; /* serializes page_create_get_ */
198 196 kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */
199 197 kmutex_t pcgs_wait_lock; /* used for delay in pcgs */
200 198 static kcondvar_t pcgs_cv; /* cv for delay in pcgs */
201 199
202 200 #ifdef VM_STATS
203 201
204 202 /*
205 203 * No locks, but so what, they are only statistics.
206 204 */
207 205
208 206 static struct page_tcnt {
209 207 int pc_free_cache; /* free's into cache list */
210 208 int pc_free_dontneed; /* free's with dontneed */
211 209 int pc_free_pageout; /* free's from pageout */
212 210 int pc_free_free; /* free's into free list */
213 211 int pc_free_pages; /* free's into large page free list */
214 212 int pc_destroy_pages; /* large page destroy's */
215 213 int pc_get_cache; /* get's from cache list */
216 214 int pc_get_free; /* get's from free list */
217 215 int pc_reclaim; /* reclaim's */
218 216 int pc_abortfree; /* abort's of free pages */
219 217 int pc_find_hit; /* find's that find page */
220 218 int pc_find_miss; /* find's that don't find page */
221 219 int pc_destroy_free; /* # of free pages destroyed */
222 220 #define PC_HASH_CNT (4*PAGE_HASHAVELEN)
223 221 int pc_find_hashlen[PC_HASH_CNT+1];
224 222 int pc_addclaim_pages;
225 223 int pc_subclaim_pages;
226 224 int pc_free_replacement_page[2];
227 225 int pc_try_demote_pages[6];
228 226 int pc_demote_pages[2];
229 227 } pagecnt;
230 228
231 229 uint_t hashin_count;
232 230 uint_t hashin_not_held;
233 231 uint_t hashin_already;
234 232
235 233 uint_t hashout_count;
236 234 uint_t hashout_not_held;
237 235
238 236 uint_t page_create_count;
239 237 uint_t page_create_not_enough;
240 238 uint_t page_create_not_enough_again;
241 239 uint_t page_create_zero;
242 240 uint_t page_create_hashout;
243 241 uint_t page_create_page_lock_failed;
244 242 uint_t page_create_trylock_failed;
245 243 uint_t page_create_found_one;
246 244 uint_t page_create_hashin_failed;
247 245 uint_t page_create_dropped_phm;
248 246
249 247 uint_t page_create_new;
250 248 uint_t page_create_exists;
251 249 uint_t page_create_putbacks;
252 250 uint_t page_create_overshoot;
253 251
254 252 uint_t page_reclaim_zero;
255 253 uint_t page_reclaim_zero_locked;
256 254
257 255 uint_t page_rename_exists;
258 256 uint_t page_rename_count;
259 257
260 258 uint_t page_lookup_cnt[20];
261 259 uint_t page_lookup_nowait_cnt[10];
262 260 uint_t page_find_cnt;
263 261 uint_t page_exists_cnt;
264 262 uint_t page_exists_forreal_cnt;
265 263 uint_t page_lookup_dev_cnt;
266 264 uint_t get_cachelist_cnt;
267 265 uint_t page_create_cnt[10];
268 266 uint_t alloc_pages[9];
269 267 uint_t page_exphcontg[19];
270 268 uint_t page_create_large_cnt[10];
271 269
272 270 /*
273 271 * Collects statistics.
274 272 */
275 273 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
276 274 uint_t mylen = 0; \
277 275 \
278 276 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
279 277 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
280 278 break; \
281 279 } \
282 280 if ((pp) != NULL) \
283 281 pagecnt.pc_find_hit++; \
284 282 else \
285 283 pagecnt.pc_find_miss++; \
286 284 if (mylen > PC_HASH_CNT) \
287 285 mylen = PC_HASH_CNT; \
288 286 pagecnt.pc_find_hashlen[mylen]++; \
289 287 }
290 288
291 289 #else /* VM_STATS */
292 290
293 291 /*
294 292 * Don't collect statistics
295 293 */
296 294 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
297 295 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
298 296 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
299 297 break; \
300 298 } \
301 299 }
302 300
303 301 #endif /* VM_STATS */
304 302
305 303
306 304
307 305 #ifdef DEBUG
308 306 #define MEMSEG_SEARCH_STATS
309 307 #endif
310 308
311 309 #ifdef MEMSEG_SEARCH_STATS
312 310 struct memseg_stats {
313 311 uint_t nsearch;
314 312 uint_t nlastwon;
315 313 uint_t nhashwon;
316 314 uint_t nnotfound;
317 315 } memseg_stats;
318 316
319 317 #define MEMSEG_STAT_INCR(v) \
320 318 atomic_inc_32(&memseg_stats.v)
321 319 #else
322 320 #define MEMSEG_STAT_INCR(x)
323 321 #endif
324 322
325 323 struct memseg *memsegs; /* list of memory segments */
326 324
327 325 /*
328 326 * /etc/system tunable to control the large page allocation heuristic.
329 327 *
330 328 * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup
331 329 * for large page allocation requests. If a large page is not readily
332 330 * available on the local freelists, we will go through additional effort
333 331 * to create a large page, potentially moving smaller pages around to coalesce
334 332 * larger pages in the local lgroup.
335 333 * Default value of LPAP_DEFAULT will go to remote freelists if large pages
336 334 * are not readily available in the local lgroup.
337 335 */
338 336 enum lpap {
339 337 LPAP_DEFAULT, /* default large page allocation policy */
340 338 LPAP_LOCAL /* local large page allocation policy */
341 339 };
342 340
343 341 enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
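
For illustration, a hedged /etc/system fragment selecting the local policy
(assuming the usual `set variable = value' syntax for a kernel integer global;
LPAP_LOCAL is 1 per the enum above):

	* Prefer the local lgroup when allocating large pages (LPAP_LOCAL).
	set lpg_alloc_prefer = 1
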
344 342
345 343 static void page_init_mem_config(void);
346 344 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
347 345 static void page_do_hashout(page_t *);
348 346 static void page_capture_init();
349 347 int page_capture_take_action(page_t *, uint_t, void *);
350 348
351 349 static void page_demote_vp_pages(page_t *);
352 350
353 351
354 352 void
355 353 pcf_init(void)
356 354
357 355 {
358 356 if (boot_ncpus != -1) {
359 357 pcf_fanout = boot_ncpus;
360 358 } else {
361 359 pcf_fanout = max_ncpus;
362 360 }
363 361 #ifdef sun4v
364 362 /*
365 363 * Force at least 4 buckets if possible for sun4v.
366 364 */
367 365 pcf_fanout = MAX(pcf_fanout, 4);
368 366 #endif /* sun4v */
369 367
370 368 /*
371 369 * Round up to the nearest power of 2.
372 370 */
373 371 pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
374 372 if (!ISP2(pcf_fanout)) {
375 373 pcf_fanout = 1 << highbit(pcf_fanout);
376 374
377 375 if (pcf_fanout > MAX_PCF_FANOUT) {
378 376 pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
379 377 }
380 378 }
381 379 pcf_fanout_mask = pcf_fanout - 1;
382 380 }
383 381
384 382 /*
385 383 * vm subsystem related initialization
386 384 */
387 385 void
388 386 vm_init(void)
389 387 {
390 388 boolean_t callb_vm_cpr(void *, int);
391 389
392 390 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
393 391 page_init_mem_config();
394 392 page_retire_init();
395 393 vm_usage_init();
396 394 page_capture_init();
397 395 }
398 396
399 397 /*
400 398 * This function is called at startup and when memory is added or deleted.
401 399 */
402 400 void
403 401 init_pages_pp_maximum()
404 402 {
405 403 static pgcnt_t p_min;
406 404 static pgcnt_t pages_pp_maximum_startup;
407 405 static pgcnt_t avrmem_delta;
408 406 static int init_done;
409 407 static int user_set; /* true if set in /etc/system */
410 408
411 409 if (init_done == 0) {
412 410
413 411 /* If the user specified a value, save it */
414 412 if (pages_pp_maximum != 0) {
415 413 user_set = 1;
416 414 pages_pp_maximum_startup = pages_pp_maximum;
417 415 }
418 416
419 417 /*
420 418 * Setting of pages_pp_maximum is based first time
421 419 * on the value of availrmem just after the start-up
422 420 * allocations. To preserve this relationship at run
423 421 * time, use a delta from availrmem_initial.
424 422 */
425 423 ASSERT(availrmem_initial >= availrmem);
426 424 avrmem_delta = availrmem_initial - availrmem;
427 425
428 426 /* The allowable floor of pages_pp_maximum */
429 427 p_min = tune.t_minarmem + 100;
430 428
431 429 /* Make sure we don't come through here again. */
432 430 init_done = 1;
433 431 }
434 432 /*
435 433 * Determine pages_pp_maximum, the number of currently available
436 434 * pages (availrmem) that can't be `locked'. If not set by
437 435 * the user, we set it to 4% of the currently available memory
438 436 * plus 4MB.
439 437 * But we also insist that it be greater than tune.t_minarmem;
440 438 * otherwise a process could lock down a lot of memory, get swapped
441 439 * out, and never have enough to get swapped back in.
442 440 */
443 441 if (user_set)
444 442 pages_pp_maximum = pages_pp_maximum_startup;
445 443 else
446 444 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
447 445 + btop(4 * 1024 * 1024);
448 446
449 447 if (pages_pp_maximum <= p_min) {
450 448 pages_pp_maximum = p_min;
451 449 }
452 450 }
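
As a hedged, standalone sketch of the default calculation above (roughly 4% of the
post-startup available memory plus 4MB, floored at tune.t_minarmem + 100; plain
types stand in for pgcnt_t and all names are illustrative): with 8 GB available at
startup and 4K pages this works out to 2097152/25 + 1024, about 84910 pages.

	static unsigned long
	pp_maximum_sketch(unsigned long avail_pages_at_startup,
	    unsigned long minarmem, unsigned long pagesize)
	{
		unsigned long val = avail_pages_at_startup / 25 +	/* ~4% */
		    (4UL * 1024 * 1024) / pagesize;			/* btop(4MB) */
		unsigned long floor = minarmem + 100;

		return (val > floor ? val : floor);
	}
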
453 451
454 452 void
455 453 set_max_page_get(pgcnt_t target_total_pages)
456 454 {
457 455 max_page_get = target_total_pages / 2;
458 456 }
459 457
460 458 static pgcnt_t pending_delete;
461 459
462 460 /*ARGSUSED*/
463 461 static void
464 462 page_mem_config_post_add(
465 463 void *arg,
466 464 pgcnt_t delta_pages)
467 465 {
468 466 set_max_page_get(total_pages - pending_delete);
469 467 init_pages_pp_maximum();
470 468 }
471 469
472 470 /*ARGSUSED*/
473 471 static int
474 472 page_mem_config_pre_del(
475 473 void *arg,
476 474 pgcnt_t delta_pages)
477 475 {
478 476 pgcnt_t nv;
479 477
480 478 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
481 479 set_max_page_get(total_pages - nv);
482 480 return (0);
483 481 }
484 482
485 483 /*ARGSUSED*/
486 484 static void
487 485 page_mem_config_post_del(
488 486 void *arg,
489 487 pgcnt_t delta_pages,
490 488 int cancelled)
491 489 {
492 490 pgcnt_t nv;
493 491
494 492 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
495 493 set_max_page_get(total_pages - nv);
496 494 if (!cancelled)
497 495 init_pages_pp_maximum();
498 496 }
499 497
500 498 static kphysm_setup_vector_t page_mem_config_vec = {
501 499 KPHYSM_SETUP_VECTOR_VERSION,
502 500 page_mem_config_post_add,
503 501 page_mem_config_pre_del,
504 502 page_mem_config_post_del,
505 503 };
506 504
507 505 static void
508 506 page_init_mem_config(void)
509 507 {
510 508 int ret;
511 509
512 510 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
513 511 ASSERT(ret == 0);
514 512 }
515 513
516 514 /*
517 515 * Evenly spread out the PCF counters for large free pages
518 516 */
519 517 static void
520 518 page_free_large_ctr(pgcnt_t npages)
521 519 {
522 520 static struct pcf *p = pcf;
523 521 pgcnt_t lump;
524 522
525 523 freemem += npages;
526 524
527 525 lump = roundup(npages, pcf_fanout) / pcf_fanout;
528 526
529 527 while (npages > 0) {
530 528
531 529 ASSERT(!p->pcf_block);
532 530
533 531 if (lump < npages) {
534 532 p->pcf_count += (uint_t)lump;
535 533 npages -= lump;
536 534 } else {
537 535 p->pcf_count += (uint_t)npages;
538 536 npages = 0;
539 537 }
540 538
541 539 ASSERT(!p->pcf_wait);
542 540
543 541 if (++p > &pcf[pcf_fanout - 1])
544 542 p = pcf;
545 543 }
546 544
547 545 ASSERT(npages == 0);
548 546 }
549 547
550 548 /*
551 549 * Add a physical chunk of memory to the system free lists during startup.
552 550 * Platform specific startup() allocates the memory for the page structs.
553 551 *
554 552 * num - number of page structures
555 553 * base - page number (pfn) to be associated with the first page.
556 554 *
557 555 * Since we are doing this during startup (ie. single threaded), we will
558 556 * use shortcut routines to avoid any locking overhead while putting all
559 557 * these pages on the freelists.
560 558 *
561 559 * NOTE: Any changes performed to page_free(), must also be performed to
562 560 * add_physmem() since this is how we initialize all page_t's at
563 561 * boot time.
564 562 */
565 563 void
566 564 add_physmem(
567 565 page_t *pp,
568 566 pgcnt_t num,
569 567 pfn_t pnum)
570 568 {
571 569 page_t *root = NULL;
572 570 uint_t szc = page_num_pagesizes() - 1;
573 571 pgcnt_t large = page_get_pagecnt(szc);
574 572 pgcnt_t cnt = 0;
575 573
576 574 TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
577 575 "add_physmem:pp %p num %lu", pp, num);
578 576
579 577 /*
580 578 * Arbitrarily limit the max page_get request
581 579 * to 1/2 of the page structs we have.
582 580 */
583 581 total_pages += num;
584 582 set_max_page_get(total_pages);
585 583
586 584 PLCNT_MODIFY_MAX(pnum, (long)num);
587 585
588 586 /*
589 587 * The physical space for the pages array
590 588 * representing ram pages has already been
591 589 * allocated. Here we initialize each lock
592 590 * in the page structure, and put each on
593 591 * the free list
594 592 */
595 593 for (; num; pp++, pnum++, num--) {
596 594
597 595 /*
598 596 * this needs to fill in the page number
599 597 * and do any other arch specific initialization
600 598 */
601 599 add_physmem_cb(pp, pnum);
602 600
603 601 pp->p_lckcnt = 0;
604 602 pp->p_cowcnt = 0;
605 603 pp->p_slckcnt = 0;
606 604
607 605 /*
608 606 * Initialize the page lock as unlocked, since nobody
609 607 * can see or access this page yet.
610 608 */
611 609 pp->p_selock = 0;
612 610
613 611 /*
614 612 * Initialize IO lock
615 613 */
616 614 page_iolock_init(pp);
617 615
618 616 /*
619 617 * initialize other fields in the page_t
620 618 */
621 619 PP_SETFREE(pp);
622 620 page_clr_all_props(pp);
623 621 PP_SETAGED(pp);
624 622 pp->p_offset = (u_offset_t)-1;
625 623 pp->p_next = pp;
626 624 pp->p_prev = pp;
627 625
628 626 /*
629 627 * Simple case: System doesn't support large pages.
630 628 */
631 629 if (szc == 0) {
632 630 pp->p_szc = 0;
633 631 page_free_at_startup(pp);
634 632 continue;
635 633 }
636 634
637 635 /*
638 636 * Handle unaligned pages, we collect them up onto
639 637 * the root page until we have a full large page.
640 638 */
641 639 if (!IS_P2ALIGNED(pnum, large)) {
642 640
643 641 /*
644 642 * If not in a large page,
645 643 * just free as small page.
646 644 */
647 645 if (root == NULL) {
648 646 pp->p_szc = 0;
649 647 page_free_at_startup(pp);
650 648 continue;
651 649 }
652 650
653 651 /*
654 652 * Link a constituent page into the large page.
655 653 */
656 654 pp->p_szc = szc;
657 655 page_list_concat(&root, &pp);
658 656
659 657 /*
660 658 * When large page is fully formed, free it.
661 659 */
662 660 if (++cnt == large) {
663 661 page_free_large_ctr(cnt);
664 662 page_list_add_pages(root, PG_LIST_ISINIT);
665 663 root = NULL;
666 664 cnt = 0;
667 665 }
668 666 continue;
669 667 }
670 668
671 669 /*
672 670 * At this point we have a page number which
673 671 * is aligned. We assert that we aren't already
674 672 * in a different large page.
675 673 */
676 674 ASSERT(IS_P2ALIGNED(pnum, large));
677 675 ASSERT(root == NULL && cnt == 0);
678 676
679 677 /*
680 678 * If insufficient number of pages left to form
681 679 * a large page, just free the small page.
682 680 */
683 681 if (num < large) {
684 682 pp->p_szc = 0;
685 683 page_free_at_startup(pp);
686 684 continue;
687 685 }
688 686
689 687 /*
690 688 * Otherwise start a new large page.
691 689 */
692 690 pp->p_szc = szc;
693 691 cnt++;
694 692 root = pp;
695 693 }
696 694 ASSERT(root == NULL && cnt == 0);
697 695 }
698 696
699 697 /*
700 698 * Find a page representing the specified [vp, offset].
701 699 * If we find the page but it is in transit coming in,
702 700 * it will have an "exclusive" lock and we wait for
703 701 * the i/o to complete. A page found on the free list
704 702 * is always reclaimed and then locked. On success, the page
705 703 * is locked, its data is valid and it isn't on the free
706 704 * list, while a NULL is returned if the page doesn't exist.
707 705 */
708 706 page_t *
709 707 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
710 708 {
711 709 return (page_lookup_create(vp, off, se, NULL, NULL, 0));
712 710 }
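
A hedged usage sketch of the interface just defined (not code from this file; it
leans on the kernel headers included above, and the helper name and ENOENT choice
are illustrative). The create-on-miss path would instead go through
page_lookup_create() with a preallocated page.

	static int
	use_page_sketch(vnode_t *vp, u_offset_t off)
	{
		page_t *pp;

		/* on success the page comes back locked SE_SHARED, off the free list */
		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL)
			return (ENOENT);

		/* ... examine or map the page's data here ... */

		page_unlock(pp);
		return (0);
	}
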
713 711
714 712 /*
715 713 * Find a page representing the specified [vp, offset].
716 714 * We either return the one we found or, if passed in,
717 715 * create one with identity of [vp, offset] of the
718 716 * pre-allocated page. If we find existing page but it is
719 717 * in transit coming in, it will have an "exclusive" lock
720 718 * and we wait for the i/o to complete. A page found on
721 719 * the free list is always reclaimed and then locked.
722 720 * On success, the page is locked, its data is valid and
723 721 * it isn't on the free list, while a NULL is returned
724 722 * if the page doesn't exist and newpp is NULL;
725 723 */
726 724 page_t *
727 725 page_lookup_create(
728 726 vnode_t *vp,
729 727 u_offset_t off,
730 728 se_t se,
731 729 page_t *newpp,
732 730 spgcnt_t *nrelocp,
733 731 int flags)
734 732 {
735 733 page_t *pp;
736 734 kmutex_t *phm;
737 735 ulong_t index;
738 736 uint_t hash_locked;
739 737 uint_t es;
740 738
741 739 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
742 740 VM_STAT_ADD(page_lookup_cnt[0]);
743 741 ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
744 742
745 743 /*
746 744 * Acquire the appropriate page hash lock since
747 745 * we have to search the hash list. Pages that
748 746 * hash to this list can't change identity while
749 747 * this lock is held.
750 748 */
751 749 hash_locked = 0;
752 750 index = PAGE_HASH_FUNC(vp, off);
753 751 phm = NULL;
754 752 top:
755 753 PAGE_HASH_SEARCH(index, pp, vp, off);
756 754 if (pp != NULL) {
757 755 VM_STAT_ADD(page_lookup_cnt[1]);
758 756 es = (newpp != NULL) ? 1 : 0;
759 757 es |= flags;
760 758 if (!hash_locked) {
761 759 VM_STAT_ADD(page_lookup_cnt[2]);
762 760 if (!page_try_reclaim_lock(pp, se, es)) {
763 761 /*
764 762 * On a miss, acquire the phm. Then
765 763 * next time, page_lock() will be called,
766 764 * causing a wait if the page is busy.
767 765 * just looping with page_trylock() would
768 766 * get pretty boring.
769 767 */
770 768 VM_STAT_ADD(page_lookup_cnt[3]);
771 769 phm = PAGE_HASH_MUTEX(index);
772 770 mutex_enter(phm);
773 771 hash_locked = 1;
774 772 goto top;
775 773 }
776 774 } else {
777 775 VM_STAT_ADD(page_lookup_cnt[4]);
778 776 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
779 777 VM_STAT_ADD(page_lookup_cnt[5]);
780 778 goto top;
781 779 }
782 780 }
783 781
784 782 /*
785 783 * Since `pp' is locked it can not change identity now.
786 784 * Reconfirm we locked the correct page.
787 785 *
788 786 * Both the p_vnode and p_offset *must* be cast volatile
789 787 * to force a reload of their values: The PAGE_HASH_SEARCH
790 788 * macro will have stuffed p_vnode and p_offset into
791 789 * registers before calling page_trylock(); another thread,
792 790 * actually holding the hash lock, could have changed the
793 791 * page's identity in memory, but our registers would not
794 792 * be changed, fooling the reconfirmation. If the hash
795 793 * lock was held during the search, the casting would
796 794 * not be needed.
797 795 */
798 796 VM_STAT_ADD(page_lookup_cnt[6]);
799 797 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
800 798 ((volatile u_offset_t)(pp->p_offset) != off)) {
801 799 VM_STAT_ADD(page_lookup_cnt[7]);
802 800 if (hash_locked) {
803 801 panic("page_lookup_create: lost page %p",
804 802 (void *)pp);
805 803 /*NOTREACHED*/
806 804 }
807 805 page_unlock(pp);
808 806 phm = PAGE_HASH_MUTEX(index);
809 807 mutex_enter(phm);
810 808 hash_locked = 1;
811 809 goto top;
812 810 }
813 811
814 812 /*
815 813 * If page_trylock() was called, then pp may still be on
816 814 * the cachelist (can't be on the free list, it would not
817 815 * have been found in the search). If it is on the
818 816 * cachelist it must be pulled now. To pull the page from
819 817 * the cachelist, it must be exclusively locked.
820 818 *
821 819 * The other big difference between page_trylock() and
822 820 * page_lock(), is that page_lock() will pull the
823 821 * page from whatever free list (the cache list in this
824 822 * case) the page is on. If page_trylock() was used
825 823 * above, then we have to do the reclaim ourselves.
826 824 */
827 825 if ((!hash_locked) && (PP_ISFREE(pp))) {
828 826 ASSERT(PP_ISAGED(pp) == 0);
829 827 VM_STAT_ADD(page_lookup_cnt[8]);
830 828
831 829 /*
832 830 * page_reclaim will ensure that we
833 831 * have this page exclusively
834 832 */
835 833
836 834 if (!page_reclaim(pp, NULL)) {
837 835 /*
838 836 * Page_reclaim dropped whatever lock
839 837 * we held.
840 838 */
841 839 VM_STAT_ADD(page_lookup_cnt[9]);
842 840 phm = PAGE_HASH_MUTEX(index);
843 841 mutex_enter(phm);
844 842 hash_locked = 1;
845 843 goto top;
846 844 } else if (se == SE_SHARED && newpp == NULL) {
847 845 VM_STAT_ADD(page_lookup_cnt[10]);
848 846 page_downgrade(pp);
849 847 }
850 848 }
851 849
852 850 if (hash_locked) {
853 851 mutex_exit(phm);
854 852 }
855 853
856 854 if (newpp != NULL && pp->p_szc < newpp->p_szc &&
857 855 PAGE_EXCL(pp) && nrelocp != NULL) {
858 856 ASSERT(nrelocp != NULL);
859 857 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
860 858 NULL);
861 859 if (*nrelocp > 0) {
862 860 VM_STAT_COND_ADD(*nrelocp == 1,
863 861 page_lookup_cnt[11]);
864 862 VM_STAT_COND_ADD(*nrelocp > 1,
865 863 page_lookup_cnt[12]);
866 864 pp = newpp;
867 865 se = SE_EXCL;
868 866 } else {
869 867 if (se == SE_SHARED) {
870 868 page_downgrade(pp);
871 869 }
872 870 VM_STAT_ADD(page_lookup_cnt[13]);
873 871 }
874 872 } else if (newpp != NULL && nrelocp != NULL) {
875 873 if (PAGE_EXCL(pp) && se == SE_SHARED) {
876 874 page_downgrade(pp);
877 875 }
878 876 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
879 877 page_lookup_cnt[14]);
880 878 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
881 879 page_lookup_cnt[15]);
882 880 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
883 881 page_lookup_cnt[16]);
884 882 } else if (newpp != NULL && PAGE_EXCL(pp)) {
885 883 se = SE_EXCL;
886 884 }
887 885 } else if (!hash_locked) {
888 886 VM_STAT_ADD(page_lookup_cnt[17]);
889 887 phm = PAGE_HASH_MUTEX(index);
890 888 mutex_enter(phm);
891 889 hash_locked = 1;
892 890 goto top;
893 891 } else if (newpp != NULL) {
894 892 /*
895 893 * If we have a preallocated page then
896 894 * insert it now and basically behave like
897 895 * page_create.
898 896 */
899 897 VM_STAT_ADD(page_lookup_cnt[18]);
900 898 /*
901 899 * Since we hold the page hash mutex and
902 900 * just searched for this page, page_hashin
903 901 * had better not fail. If it does, that
904 902 * means some thread did not follow the
905 903 * page hash mutex rules. Panic now and
906 904 * get it over with. As usual, go down
907 905 * holding all the locks.
908 906 */
909 907 ASSERT(MUTEX_HELD(phm));
910 908 if (!page_hashin(newpp, vp, off, phm)) {
911 909 ASSERT(MUTEX_HELD(phm));
912 910 panic("page_lookup_create: hashin failed %p %p %llx %p",
913 911 (void *)newpp, (void *)vp, off, (void *)phm);
914 912 /*NOTREACHED*/
915 913 }
916 914 ASSERT(MUTEX_HELD(phm));
917 915 mutex_exit(phm);
918 916 phm = NULL;
919 917 page_set_props(newpp, P_REF);
920 918 page_io_lock(newpp);
921 919 pp = newpp;
922 920 se = SE_EXCL;
923 921 } else {
924 922 VM_STAT_ADD(page_lookup_cnt[19]);
925 923 mutex_exit(phm);
926 924 }
927 925
928 926 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
929 927
930 928 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
931 929
932 930 return (pp);
933 931 }
934 932
935 933 /*
936 934 * Search the hash list for the page representing the
937 935 * specified [vp, offset] and return it locked. Skip
938 936 * free pages and pages that cannot be locked as requested.
939 937 * Used while attempting to kluster pages.
940 938 */
941 939 page_t *
942 940 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
943 941 {
944 942 page_t *pp;
945 943 kmutex_t *phm;
946 944 ulong_t index;
947 945 uint_t locked;
948 946
949 947 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
950 948 VM_STAT_ADD(page_lookup_nowait_cnt[0]);
951 949
952 950 index = PAGE_HASH_FUNC(vp, off);
953 951 PAGE_HASH_SEARCH(index, pp, vp, off);
954 952 locked = 0;
955 953 if (pp == NULL) {
956 954 top:
957 955 VM_STAT_ADD(page_lookup_nowait_cnt[1]);
958 956 locked = 1;
959 957 phm = PAGE_HASH_MUTEX(index);
960 958 mutex_enter(phm);
961 959 PAGE_HASH_SEARCH(index, pp, vp, off);
962 960 }
963 961
964 962 if (pp == NULL || PP_ISFREE(pp)) {
965 963 VM_STAT_ADD(page_lookup_nowait_cnt[2]);
966 964 pp = NULL;
967 965 } else {
968 966 if (!page_trylock(pp, se)) {
969 967 VM_STAT_ADD(page_lookup_nowait_cnt[3]);
970 968 pp = NULL;
971 969 } else {
972 970 VM_STAT_ADD(page_lookup_nowait_cnt[4]);
973 971 /*
974 972 * See the comment in page_lookup()
975 973 */
976 974 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
977 975 ((u_offset_t)(pp->p_offset) != off)) {
978 976 VM_STAT_ADD(page_lookup_nowait_cnt[5]);
979 977 if (locked) {
980 978 panic("page_lookup_nowait %p",
981 979 (void *)pp);
982 980 /*NOTREACHED*/
983 981 }
984 982 page_unlock(pp);
985 983 goto top;
986 984 }
987 985 if (PP_ISFREE(pp)) {
988 986 VM_STAT_ADD(page_lookup_nowait_cnt[6]);
989 987 page_unlock(pp);
990 988 pp = NULL;
991 989 }
992 990 }
993 991 }
994 992 if (locked) {
995 993 VM_STAT_ADD(page_lookup_nowait_cnt[7]);
996 994 mutex_exit(phm);
997 995 }
998 996
999 997 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
1000 998
1001 999 return (pp);
1002 1000 }
1003 1001
1004 1002 /*
1005 1003 * Search the hash list for a page with the specified [vp, off]
1006 1004 * that is known to exist and is already locked. This routine
1007 1005 * is typically used by segment SOFTUNLOCK routines.
1008 1006 */
1009 1007 page_t *
1010 1008 page_find(vnode_t *vp, u_offset_t off)
1011 1009 {
1012 1010 page_t *pp;
1013 1011 kmutex_t *phm;
1014 1012 ulong_t index;
1015 1013
1016 1014 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1017 1015 VM_STAT_ADD(page_find_cnt);
1018 1016
1019 1017 index = PAGE_HASH_FUNC(vp, off);
1020 1018 phm = PAGE_HASH_MUTEX(index);
1021 1019
1022 1020 mutex_enter(phm);
1023 1021 PAGE_HASH_SEARCH(index, pp, vp, off);
1024 1022 mutex_exit(phm);
1025 1023
1026 1024 ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
1027 1025 return (pp);
1028 1026 }
1029 1027
1030 1028 /*
1031 1029 * Determine whether a page with the specified [vp, off]
1032 1030 * currently exists in the system. Obviously this should
1033 1031 * only be considered as a hint since nothing prevents the
1034 1032 * page from disappearing or appearing immediately after
1035 1033 * the return from this routine. Subsequently, we don't
1036 1034 * even bother to lock the list.
1037 1035 */
1038 1036 page_t *
1039 1037 page_exists(vnode_t *vp, u_offset_t off)
1040 1038 {
1041 1039 page_t *pp;
1042 1040 ulong_t index;
1043 1041
1044 1042 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1045 1043 VM_STAT_ADD(page_exists_cnt);
1046 1044
1047 1045 index = PAGE_HASH_FUNC(vp, off);
1048 1046 PAGE_HASH_SEARCH(index, pp, vp, off);
1049 1047
1050 1048 return (pp);
1051 1049 }
1052 1050
1053 1051 /*
1054 1052 * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
1055 1053 * page_size(szc)) range. If they exist and ppa is not NULL, fill the ppa array
1056 1054 * with these pages locked SHARED. If necessary reclaim pages from
1057 1055 * freelist. Return 1 if contiguous pages exist and 0 otherwise.
1058 1056 *
1059 1057 * If we fail to lock the pages, still return 1 if the pages exist and are contiguous.
1060 1058 * But in this case the return value is just a hint. The ppa array won't be filled.
1061 1059 * Caller should initialize ppa[0] as NULL to distinguish return value.
1062 1060 *
1063 1061 * Returns 0 if pages don't exist or not physically contiguous.
1064 1062 *
1065 1063 * This routine doesn't work for anonymous(swapfs) pages.
1066 1064 */
1067 1065 int
1068 1066 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1069 1067 {
1070 1068 pgcnt_t pages;
1071 1069 pfn_t pfn;
1072 1070 page_t *rootpp;
1073 1071 pgcnt_t i;
1074 1072 pgcnt_t j;
1075 1073 u_offset_t save_off = off;
1076 1074 ulong_t index;
1077 1075 kmutex_t *phm;
1078 1076 page_t *pp;
1079 1077 uint_t pszc;
1080 1078 int loopcnt = 0;
1081 1079
1082 1080 ASSERT(szc != 0);
1083 1081 ASSERT(vp != NULL);
1084 1082 ASSERT(!IS_SWAPFSVP(vp));
1085 1083 ASSERT(!VN_ISKAS(vp));
1086 1084
1087 1085 again:
1088 1086 if (++loopcnt > 3) {
1089 1087 VM_STAT_ADD(page_exphcontg[0]);
1090 1088 return (0);
1091 1089 }
1092 1090
1093 1091 index = PAGE_HASH_FUNC(vp, off);
1094 1092 phm = PAGE_HASH_MUTEX(index);
1095 1093
1096 1094 mutex_enter(phm);
1097 1095 PAGE_HASH_SEARCH(index, pp, vp, off);
1098 1096 mutex_exit(phm);
1099 1097
1100 1098 VM_STAT_ADD(page_exphcontg[1]);
1101 1099
1102 1100 if (pp == NULL) {
1103 1101 VM_STAT_ADD(page_exphcontg[2]);
1104 1102 return (0);
1105 1103 }
1106 1104
1107 1105 pages = page_get_pagecnt(szc);
1108 1106 rootpp = pp;
1109 1107 pfn = rootpp->p_pagenum;
1110 1108
1111 1109 if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1112 1110 VM_STAT_ADD(page_exphcontg[3]);
1113 1111 if (!page_trylock(pp, SE_SHARED)) {
1114 1112 VM_STAT_ADD(page_exphcontg[4]);
1115 1113 return (1);
1116 1114 }
1117 1115 /*
1118 1116 * Also check whether p_pagenum was modified by DR.
1119 1117 */
1120 1118 if (pp->p_szc != pszc || pp->p_vnode != vp ||
1121 1119 pp->p_offset != off || pp->p_pagenum != pfn) {
1122 1120 VM_STAT_ADD(page_exphcontg[5]);
1123 1121 page_unlock(pp);
1124 1122 off = save_off;
1125 1123 goto again;
1126 1124 }
1127 1125 /*
1128 1126 * Since szc was non-zero and the vnode and offset matched after we
1129 1127 * locked the page, it can't become free on us.
1130 1128 */
1131 1129 ASSERT(!PP_ISFREE(pp));
1132 1130 if (!IS_P2ALIGNED(pfn, pages)) {
1133 1131 page_unlock(pp);
1134 1132 return (0);
1135 1133 }
1136 1134 ppa[0] = pp;
1137 1135 pp++;
1138 1136 off += PAGESIZE;
1139 1137 pfn++;
1140 1138 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1141 1139 if (!page_trylock(pp, SE_SHARED)) {
1142 1140 VM_STAT_ADD(page_exphcontg[6]);
1143 1141 pp--;
1144 1142 while (i-- > 0) {
1145 1143 page_unlock(pp);
1146 1144 pp--;
1147 1145 }
1148 1146 ppa[0] = NULL;
1149 1147 return (1);
1150 1148 }
1151 1149 if (pp->p_szc != pszc) {
1152 1150 VM_STAT_ADD(page_exphcontg[7]);
1153 1151 page_unlock(pp);
1154 1152 pp--;
1155 1153 while (i-- > 0) {
1156 1154 page_unlock(pp);
1157 1155 pp--;
1158 1156 }
1159 1157 ppa[0] = NULL;
1160 1158 off = save_off;
1161 1159 goto again;
1162 1160 }
1163 1161 /*
1164 1162 * The szc is the same as for the previously locked pages
1165 1163 * with the right identity. Since this page had the correct
1166 1164 * szc after we locked it, it can't get freed or destroyed
1167 1165 * and therefore must have the expected identity.
1168 1166 */
1169 1167 ASSERT(!PP_ISFREE(pp));
1170 1168 if (pp->p_vnode != vp ||
1171 1169 pp->p_offset != off) {
1172 1170 panic("page_exists_physcontig: "
1173 1171 "large page identity doesn't match");
1174 1172 }
1175 1173 ppa[i] = pp;
1176 1174 ASSERT(pp->p_pagenum == pfn);
1177 1175 }
1178 1176 VM_STAT_ADD(page_exphcontg[8]);
1179 1177 ppa[pages] = NULL;
1180 1178 return (1);
1181 1179 } else if (pszc >= szc) {
1182 1180 VM_STAT_ADD(page_exphcontg[9]);
1183 1181 if (!IS_P2ALIGNED(pfn, pages)) {
1184 1182 return (0);
1185 1183 }
1186 1184 return (1);
1187 1185 }
1188 1186
1189 1187 if (!IS_P2ALIGNED(pfn, pages)) {
1190 1188 VM_STAT_ADD(page_exphcontg[10]);
1191 1189 return (0);
1192 1190 }
1193 1191
1194 1192 if (page_numtomemseg_nolock(pfn) !=
1195 1193 page_numtomemseg_nolock(pfn + pages - 1)) {
1196 1194 VM_STAT_ADD(page_exphcontg[11]);
1197 1195 return (0);
1198 1196 }
1199 1197
1200 1198 /*
1201 1199 * We loop across the pages 4 times to promote the page size.
1202 1200 * We're extra cautious to promote page size atomically with respect
1203 1201 * to everybody else. But we can probably optimize into 1 loop if
1204 1202 * this becomes an issue.
1205 1203 */
1206 1204
1207 1205 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1208 1206 if (!page_trylock(pp, SE_EXCL)) {
1209 1207 VM_STAT_ADD(page_exphcontg[12]);
1210 1208 break;
1211 1209 }
1212 1210 /*
1213 1211 * Check whether p_pagenum was modified by DR.
1214 1212 */
1215 1213 if (pp->p_pagenum != pfn) {
1216 1214 page_unlock(pp);
1217 1215 break;
1218 1216 }
1219 1217 if (pp->p_vnode != vp ||
1220 1218 pp->p_offset != off) {
1221 1219 VM_STAT_ADD(page_exphcontg[13]);
1222 1220 page_unlock(pp);
1223 1221 break;
1224 1222 }
1225 1223 if (pp->p_szc >= szc) {
1226 1224 ASSERT(i == 0);
1227 1225 page_unlock(pp);
1228 1226 off = save_off;
1229 1227 goto again;
1230 1228 }
1231 1229 }
1232 1230
1233 1231 if (i != pages) {
1234 1232 VM_STAT_ADD(page_exphcontg[14]);
1235 1233 --pp;
1236 1234 while (i-- > 0) {
1237 1235 page_unlock(pp);
1238 1236 --pp;
1239 1237 }
1240 1238 return (0);
1241 1239 }
1242 1240
1243 1241 pp = rootpp;
1244 1242 for (i = 0; i < pages; i++, pp++) {
1245 1243 if (PP_ISFREE(pp)) {
1246 1244 VM_STAT_ADD(page_exphcontg[15]);
1247 1245 ASSERT(!PP_ISAGED(pp));
1248 1246 ASSERT(pp->p_szc == 0);
1249 1247 if (!page_reclaim(pp, NULL)) {
1250 1248 break;
1251 1249 }
1252 1250 } else {
1253 1251 ASSERT(pp->p_szc < szc);
1254 1252 VM_STAT_ADD(page_exphcontg[16]);
1255 1253 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1256 1254 }
1257 1255 }
1258 1256 if (i < pages) {
1259 1257 VM_STAT_ADD(page_exphcontg[17]);
1260 1258 /*
1261 1259 * page_reclaim failed because we were out of memory.
1262 1260 * Drop the rest of the locks and return because this page
1263 1261 * must already be reallocated anyway.
1264 1262 */
1265 1263 pp = rootpp;
1266 1264 for (j = 0; j < pages; j++, pp++) {
1267 1265 if (j != i) {
1268 1266 page_unlock(pp);
1269 1267 }
1270 1268 }
1271 1269 return (0);
1272 1270 }
1273 1271
1274 1272 off = save_off;
1275 1273 pp = rootpp;
1276 1274 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1277 1275 ASSERT(PAGE_EXCL(pp));
1278 1276 ASSERT(!PP_ISFREE(pp));
1279 1277 ASSERT(!hat_page_is_mapped(pp));
1280 1278 ASSERT(pp->p_vnode == vp);
1281 1279 ASSERT(pp->p_offset == off);
1282 1280 pp->p_szc = szc;
1283 1281 }
1284 1282 pp = rootpp;
1285 1283 for (i = 0; i < pages; i++, pp++) {
1286 1284 if (ppa == NULL) {
1287 1285 page_unlock(pp);
1288 1286 } else {
1289 1287 ppa[i] = pp;
1290 1288 page_downgrade(ppa[i]);
1291 1289 }
1292 1290 }
1293 1291 if (ppa != NULL) {
1294 1292 ppa[pages] = NULL;
1295 1293 }
1296 1294 VM_STAT_ADD(page_exphcontg[18]);
1297 1295 ASSERT(vp->v_pages != NULL);
1298 1296 return (1);
1299 1297 }
1300 1298
1301 1299 /*
1302 1300 * Determine whether a page with the specified [vp, off]
1303 1301 * currently exists in the system and if so return its
1304 1302 * size code. Obviously this should only be considered as
1305 1303 * a hint since nothing prevents the page from disappearing
1306 1304 * or appearing immediately after the return from this routine.
1307 1305 */
1308 1306 int
1309 1307 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1310 1308 {
1311 1309 page_t *pp;
1312 1310 kmutex_t *phm;
1313 1311 ulong_t index;
1314 1312 int rc = 0;
1315 1313
1316 1314 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1317 1315 ASSERT(szc != NULL);
1318 1316 VM_STAT_ADD(page_exists_forreal_cnt);
1319 1317
1320 1318 index = PAGE_HASH_FUNC(vp, off);
1321 1319 phm = PAGE_HASH_MUTEX(index);
1322 1320
1323 1321 mutex_enter(phm);
1324 1322 PAGE_HASH_SEARCH(index, pp, vp, off);
1325 1323 if (pp != NULL) {
1326 1324 *szc = pp->p_szc;
1327 1325 rc = 1;
1328 1326 }
1329 1327 mutex_exit(phm);
1330 1328 return (rc);
1331 1329 }
1332 1330
1333 1331 /* wakeup threads waiting for pages in page_create_get_something() */
1334 1332 void
1335 1333 wakeup_pcgs(void)
1336 1334 {
1337 1335 if (!CV_HAS_WAITERS(&pcgs_cv))
1338 1336 return;
1339 1337 cv_broadcast(&pcgs_cv);
1340 1338 }
1341 1339
1342 1340 /*
1343 1341 * 'freemem' is used all over the kernel as an indication of how many
1344 1342 * pages are free (either on the cache list or on the free page list)
1345 1343 * in the system. In very few places is a really accurate 'freemem'
1346 1344 * needed. To avoid contention on the lock protecting the
1347 1345 * single freemem, it was spread out into NCPU buckets. Set_freemem
1348 1346 * sets freemem to the total of all NCPU buckets. It is called from
1349 1347 * clock() on each TICK.
1350 1348 */
1351 1349 void
1352 1350 set_freemem()
1353 1351 {
1354 1352 struct pcf *p;
1355 1353 ulong_t t;
1356 1354 uint_t i;
1357 1355
1358 1356 t = 0;
1359 1357 p = pcf;
1360 1358 for (i = 0; i < pcf_fanout; i++) {
1361 1359 t += p->pcf_count;
1362 1360 p++;
1363 1361 }
1364 1362 freemem = t;
1365 1363
1366 1364 /*
1367 1365 * Don't worry about grabbing mutex. It's not that
1368 1366 * critical if we miss a tick or two. This is
1369 1367 * where we wake up possible delayers in
1370 1368 * page_create_get_something().
1371 1369 */
1372 1370 wakeup_pcgs();
1373 1371 }
1374 1372
1375 1373 ulong_t
1376 1374 get_freemem()
1377 1375 {
1378 1376 struct pcf *p;
1379 1377 ulong_t t;
1380 1378 uint_t i;
1381 1379
1382 1380 t = 0;
1383 1381 p = pcf;
1384 1382 for (i = 0; i < pcf_fanout; i++) {
1385 1383 t += p->pcf_count;
1386 1384 p++;
1387 1385 }
1388 1386 /*
1389 1387 * We just calculated it, might as well set it.
1390 1388 */
1391 1389 freemem = t;
1392 1390 return (t);
1393 1391 }
1394 1392
1395 1393 /*
1396 1394 * Acquire all of the page cache & free (pcf) locks.
1397 1395 */
1398 1396 void
1399 1397 pcf_acquire_all()
1400 1398 {
1401 1399 struct pcf *p;
1402 1400 uint_t i;
1403 1401
1404 1402 p = pcf;
1405 1403 for (i = 0; i < pcf_fanout; i++) {
1406 1404 mutex_enter(&p->pcf_lock);
1407 1405 p++;
1408 1406 }
1409 1407 }
1410 1408
1411 1409 /*
1412 1410 * Release all the pcf_locks.
1413 1411 */
1414 1412 void
1415 1413 pcf_release_all()
1416 1414 {
1417 1415 struct pcf *p;
1418 1416 uint_t i;
1419 1417
1420 1418 p = pcf;
1421 1419 for (i = 0; i < pcf_fanout; i++) {
1422 1420 mutex_exit(&p->pcf_lock);
1423 1421 p++;
1424 1422 }
1425 1423 }
1426 1424
1427 1425 /*
1428 1426 * Inform the VM system that we need some pages freed up.
1429 1427 * Calls must be symmetric, e.g.:
1430 1428 *
1431 1429 * page_needfree(100);
1432 1430 * wait a bit;
1433 1431 * page_needfree(-100);
1434 1432 */
1435 1433 void
1436 1434 page_needfree(spgcnt_t npages)
1437 1435 {
1438 1436 mutex_enter(&new_freemem_lock);
1439 1437 needfree += npages;
1440 1438 mutex_exit(&new_freemem_lock);
1441 1439 }
1442 1440
1443 1441 /*
1444 1442 * Throttle for page_create(): try to prevent freemem from dropping
1445 1443 * below throttlefree. We can't provide a 100% guarantee because
1446 1444 * KM_NOSLEEP allocations, page_reclaim(), and various other things
1447 1445 * nibble away at the freelist. However, we can block all PG_WAIT
1448 1446 * allocations until memory becomes available. The motivation is
1449 1447 * that several things can fall apart when there's no free memory:
1450 1448 *
1451 1449 * (1) If pageout() needs memory to push a page, the system deadlocks.
1452 1450 *
1453 1451 * (2) By (broken) specification, timeout(9F) can neither fail nor
1454 1452 * block, so it has no choice but to panic the system if it
1455 1453 * cannot allocate a callout structure.
1456 1454 *
1457 1455 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1458 1456 * it panics if it cannot allocate a callback structure.
1459 1457 *
1460 1458 * (4) Untold numbers of third-party drivers have not yet been hardened
1461 1459 * against KM_NOSLEEP and/or allocb() failures; they simply assume
1462 1460 * success and panic the system with a data fault on failure.
1463 1461 * (The long-term solution to this particular problem is to ship
1464 1462 * hostile fault-injecting DEBUG kernels with the DDK.)
1465 1463 *
1466 1464 * It is theoretically impossible to guarantee success of non-blocking
1467 1465 * allocations, but in practice, this throttle is very hard to break.
1468 1466 */
1469 1467 static int
1470 1468 page_create_throttle(pgcnt_t npages, int flags)
1471 1469 {
1472 1470 ulong_t fm;
1473 1471 uint_t i;
1474 1472 pgcnt_t tf; /* effective value of throttlefree */
1475 1473
1476 1474 /*
1477 1475 * Normal priority allocations.
1478 1476 */
1479 1477 if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
1480 1478 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
1481 1479 return (freemem >= npages + throttlefree);
1482 1480 }
1483 1481
1484 1482 /*
1485 1483 * Never deny pages when:
1486 1484 * - it's a thread that cannot block [NOMEMWAIT()]
1487 1485 * - the allocation cannot block and must not fail
1488 1486 * - the allocation cannot block and carries pageout's dispensation (PG_PUSHPAGE)
1489 1487 */
1490 1488 if (NOMEMWAIT() ||
1491 1489 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1492 1490 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1493 1491 return (1);
1494 1492
1495 1493 /*
1496 1494 * If the allocation can't block, we look favorably upon it
1497 1495 * unless we're below pageout_reserve. In that case we fail
1498 1496 * the allocation because we want to make sure there are a few
1499 1497 * pages available for pageout.
1500 1498 */
1501 1499 if ((flags & PG_WAIT) == 0)
1502 1500 return (freemem >= npages + pageout_reserve);
1503 1501
1504 1502 /* Calculate the effective throttlefree value */
1505 1503 tf = throttlefree -
1506 1504 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1507 1505
1508 1506 cv_signal(&proc_pageout->p_cv);
1509 1507
1510 1508 for (;;) {
1511 1509 fm = 0;
1512 1510 pcf_acquire_all();
1513 1511 mutex_enter(&new_freemem_lock);
1514 1512 for (i = 0; i < pcf_fanout; i++) {
1515 1513 fm += pcf[i].pcf_count;
1516 1514 pcf[i].pcf_wait++;
1517 1515 mutex_exit(&pcf[i].pcf_lock);
1518 1516 }
1519 1517 freemem = fm;
1520 1518 if (freemem >= npages + tf) {
1521 1519 mutex_exit(&new_freemem_lock);
1522 1520 break;
1523 1521 }
1524 1522 needfree += npages;
1525 1523 freemem_wait++;
1526 1524 cv_wait(&freemem_cv, &new_freemem_lock);
1527 1525 freemem_wait--;
1528 1526 needfree -= npages;
1529 1527 mutex_exit(&new_freemem_lock);
1530 1528 }
1531 1529 return (1);
1532 1530 }
1533 1531
1534 1532 /*
1535 1533 * page_create_wait() is called to either coalesce pages from the
1536 1534 * different pcf buckets or to wait because there simply are not
1537 1535 * enough pages to satisfy the caller's request.
1538 1536 *
1539 1537 * Sadly, this is called from platform/vm/vm_machdep.c
1540 1538 */
1541 1539 int
1542 1540 page_create_wait(pgcnt_t npages, uint_t flags)
1543 1541 {
1544 1542 pgcnt_t total;
1545 1543 uint_t i;
1546 1544 struct pcf *p;
1547 1545
1548 1546 /*
1549 1547 * Wait until there are enough free pages to satisfy our
1550 1548 * entire request.
1551 1549 * We set needfree += npages before prodding pageout, to make sure
1552 1550 * it does real work when npages > lotsfree > freemem.
1553 1551 */
1554 1552 VM_STAT_ADD(page_create_not_enough);
1555 1553
1556 1554 ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1557 1555 checkagain:
1558 1556 if ((flags & PG_NORELOC) &&
1559 1557 kcage_freemem < kcage_throttlefree + npages)
1560 1558 (void) kcage_create_throttle(npages, flags);
1561 1559
1562 1560 if (freemem < npages + throttlefree)
1563 1561 if (!page_create_throttle(npages, flags))
1564 1562 return (0);
1565 1563
1566 1564 if (pcf_decrement_bucket(npages) ||
1567 1565 pcf_decrement_multiple(&total, npages, 0))
1568 1566 return (1);
1569 1567
1570 1568 /*
1571 1569 * All of the pcf locks are held, there are not enough pages
1572 1570 * to satisfy the request (npages < total).
1573 1571 * Be sure to acquire the new_freemem_lock before dropping
1574 1572 * the pcf locks. This prevents dropping wakeups in page_free().
1575 1573 * The order is always pcf_lock then new_freemem_lock.
1576 1574 *
1577 1575 * Since we hold all the pcf locks, it is a good time to set freemem.
1578 1576 *
1579 1577 * If the caller does not want to wait, return now.
1580 1578 * Else turn the pageout daemon loose to find something
1581 1579 * and wait till it does.
1582 1580 *
1583 1581 */
1584 1582 freemem = total;
1585 1583
1586 1584 if ((flags & PG_WAIT) == 0) {
1587 1585 pcf_release_all();
1588 1586
1589 1587 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
1590 1588 "page_create_nomem:npages %ld freemem %ld", npages, freemem);
1591 1589 return (0);
1592 1590 }
1593 1591
1594 1592 ASSERT(proc_pageout != NULL);
1595 1593 cv_signal(&proc_pageout->p_cv);
1596 1594
1597 1595 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1598 1596 "page_create_sleep_start: freemem %ld needfree %ld",
1599 1597 freemem, needfree);
1600 1598
1601 1599 /*
1602 1600 * We are going to wait.
1603 1601 * We currently hold all of the pcf_locks,
1604 1602 * get the new_freemem_lock (it protects freemem_wait),
1605 1603 * before dropping the pcf_locks.
1606 1604 */
1607 1605 mutex_enter(&new_freemem_lock);
1608 1606
1609 1607 p = pcf;
1610 1608 for (i = 0; i < pcf_fanout; i++) {
1611 1609 p->pcf_wait++;
1612 1610 mutex_exit(&p->pcf_lock);
1613 1611 p++;
1614 1612 }
1615 1613
1616 1614 needfree += npages;
1617 1615 freemem_wait++;
1618 1616
1619 1617 cv_wait(&freemem_cv, &new_freemem_lock);
1620 1618
1621 1619 freemem_wait--;
1622 1620 needfree -= npages;
1623 1621
1624 1622 mutex_exit(&new_freemem_lock);
1625 1623
1626 1624 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1627 1625 "page_create_sleep_end: freemem %ld needfree %ld",
1628 1626 freemem, needfree);
1629 1627
1630 1628 VM_STAT_ADD(page_create_not_enough_again);
1631 1629 goto checkagain;
1632 1630 }
1633 1631 /*
1634 1632 * A routine to do the opposite of page_create_wait().
1635 1633 */
1636 1634 void
1637 1635 page_create_putback(spgcnt_t npages)
1638 1636 {
1639 1637 struct pcf *p;
1640 1638 pgcnt_t lump;
1641 1639 uint_t *which;
1642 1640
1643 1641 /*
1644 1642 * When a contiguous lump is broken up, we have to
1645 1643 * deal with lots of pages (min 64), so let's spread
1646 1644 * the wealth around.
1647 1645 */
1648 1646 lump = roundup(npages, pcf_fanout) / pcf_fanout;
1649 1647 freemem += npages;
1650 1648
1651 1649 for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
1652 1650 which = &p->pcf_count;
1653 1651
1654 1652 mutex_enter(&p->pcf_lock);
1655 1653
1656 1654 if (p->pcf_block) {
1657 1655 which = &p->pcf_reserve;
1658 1656 }
1659 1657
1660 1658 if (lump < npages) {
1661 1659 *which += (uint_t)lump;
1662 1660 npages -= lump;
1663 1661 } else {
1664 1662 *which += (uint_t)npages;
1665 1663 npages = 0;
1666 1664 }
1667 1665
1668 1666 if (p->pcf_wait) {
1669 1667 mutex_enter(&new_freemem_lock);
1670 1668 /*
1671 1669 * Check to see if some other thread
1672 1670 * is actually waiting. Another bucket
1673 1671 * may have woken it up by now. If there
1674 1672 * are no waiters, then set our pcf_wait
1675 1673 * count to zero to avoid coming in here
1676 1674 * next time.
1677 1675 */
1678 1676 if (freemem_wait) {
1679 1677 if (npages > 1) {
1680 1678 cv_broadcast(&freemem_cv);
1681 1679 } else {
1682 1680 cv_signal(&freemem_cv);
1683 1681 }
1684 1682 p->pcf_wait--;
1685 1683 } else {
1686 1684 p->pcf_wait = 0;
1687 1685 }
1688 1686 mutex_exit(&new_freemem_lock);
1689 1687 }
1690 1688 mutex_exit(&p->pcf_lock);
1691 1689 }
1692 1690 ASSERT(npages == 0);
1693 1691 }
1694 1692
1695 1693 /*
1696 1694 * A helper routine for page_create_get_something.
1697 1695 * The indenting got too deep down there.
1698 1696 * Unblock the pcf counters. Any pages freed after
1699 1697 * pcf_block got set are moved to pcf_count and
1700 1698 * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1701 1699 */
1702 1700 static void
1703 1701 pcgs_unblock(void)
1704 1702 {
1705 1703 int i;
1706 1704 struct pcf *p;
1707 1705
1708 1706 /* Update freemem while we're here. */
1709 1707 freemem = 0;
1710 1708 p = pcf;
1711 1709 for (i = 0; i < pcf_fanout; i++) {
1712 1710 mutex_enter(&p->pcf_lock);
1713 1711 ASSERT(p->pcf_count == 0);
1714 1712 p->pcf_count = p->pcf_reserve;
1715 1713 p->pcf_block = 0;
1716 1714 freemem += p->pcf_count;
1717 1715 if (p->pcf_wait) {
1718 1716 mutex_enter(&new_freemem_lock);
1719 1717 if (freemem_wait) {
1720 1718 if (p->pcf_reserve > 1) {
1721 1719 cv_broadcast(&freemem_cv);
1722 1720 p->pcf_wait = 0;
1723 1721 } else {
1724 1722 cv_signal(&freemem_cv);
1725 1723 p->pcf_wait--;
1726 1724 }
1727 1725 } else {
1728 1726 p->pcf_wait = 0;
1729 1727 }
1730 1728 mutex_exit(&new_freemem_lock);
1731 1729 }
1732 1730 p->pcf_reserve = 0;
1733 1731 mutex_exit(&p->pcf_lock);
1734 1732 p++;
1735 1733 }
1736 1734 }
1737 1735
1738 1736 /*
1739 1737 * Called from page_create_va() when both the cache and free lists
1740 1738 * have been checked once.
1741 1739 *
1742 1740 * Either returns a page or panics since the accounting was done
1743 1741 * way before we got here.
1744 1742 *
1745 1743 * We don't come here often, so leave the accounting on permanently.
1746 1744 */
1747 1745
1748 1746 #define MAX_PCGS 100
1749 1747
1750 1748 #ifdef DEBUG
1751 1749 #define PCGS_TRIES 100
1752 1750 #else /* DEBUG */
1753 1751 #define PCGS_TRIES 10
1754 1752 #endif /* DEBUG */
1755 1753
1756 1754 #ifdef VM_STATS
1757 1755 uint_t pcgs_counts[PCGS_TRIES];
1758 1756 uint_t pcgs_too_many;
1759 1757 uint_t pcgs_entered;
1760 1758 uint_t pcgs_entered_noreloc;
1761 1759 uint_t pcgs_locked;
1762 1760 uint_t pcgs_cagelocked;
1763 1761 #endif /* VM_STATS */
1764 1762
1765 1763 static page_t *
1766 1764 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1767 1765 caddr_t vaddr, uint_t flags)
1768 1766 {
1769 1767 uint_t count;
1770 1768 page_t *pp;
1771 1769 uint_t locked, i;
1772 1770 struct pcf *p;
1773 1771 lgrp_t *lgrp;
1774 1772 int cagelocked = 0;
1775 1773
1776 1774 VM_STAT_ADD(pcgs_entered);
1777 1775
1778 1776 /*
1779 1777 * Tap any reserve freelists: if we fail now, we'll die
1780 1778 * since the page(s) we're looking for have already been
1781 1779 * accounted for.
1782 1780 */
1783 1781 flags |= PG_PANIC;
1784 1782
1785 1783 if ((flags & PG_NORELOC) != 0) {
1786 1784 VM_STAT_ADD(pcgs_entered_noreloc);
1787 1785 /*
1788 1786 * Requests for free pages from critical threads
1789 1787 * such as pageout still won't throttle here, but
1790 1788 * we must try again, to give the cageout thread
1791 1789 * another chance to catch up. Since we already
1792 1790 * accounted for the pages, we had better get them
1793 1791 * this time.
1794 1792 *
1795 1793 * N.B. All non-critical threads acquire the pcgs_cagelock
1796 1794 * to serialize access to the freelists. This implements a
1797 1795 * turnstile-type synchronization to avoid starvation of
1798 1796 * critical requests for PG_NORELOC memory by non-critical
1799 1797 * threads: all non-critical threads must acquire a 'ticket'
1800 1798 * before passing through, which entails making sure
1801 1799 * kcage_freemem won't fall below minfree prior to grabbing
1802 1800 * pages from the freelists.
1803 1801 */
1804 1802 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1805 1803 mutex_enter(&pcgs_cagelock);
1806 1804 cagelocked = 1;
1807 1805 VM_STAT_ADD(pcgs_cagelocked);
1808 1806 }
1809 1807 }
1810 1808
1811 1809 /*
1812 1810 * Time to get serious.
1813 1811 * We failed to get a `correctly colored' page from both the
1814 1812 * free and cache lists.
1815 1813 * We escalate in stages.
1816 1814 *
1817 1815 * First try both lists without worrying about color.
1818 1816 *
1819 1817 * Then, grab all page accounting locks (ie. pcf[]) and
1820 1818 * steal any pages that they have and set the pcf_block flag to
1821 1819 * stop deletions from the lists. This will help because
1822 1820 * a page can get added to the free list while we are looking
1823 1821 * at the cache list, then another page could be added to the cache
1824 1822 * list allowing the page on the free list to be removed as we
1825 1823 * move from looking at the cache list to the free list. This
1826 1824 * could happen over and over. We would never find the page
1827 1825 * we have accounted for.
1828 1826 *
1829 1827 * Noreloc pages are a subset of the global (relocatable) page pool.
1830 1828 * They are not tracked separately in the pcf bins, so it is
1831 1829 * impossible to know when doing pcf accounting if the available
1832 1830 * page(s) are noreloc pages or not. When looking for a noreloc page
1833 1831 * it is quite easy to end up here even if the global (relocatable)
1834 1832 * page pool has plenty of free pages but the noreloc pool is empty.
1835 1833 *
1836 1834 * When the noreloc pool is empty (or low), additional noreloc pages
1837 1835 * are created by converting pages from the global page pool. This
1838 1836 * process will stall during pcf accounting if the pcf bins are
1839 1837 * already locked. Such is the case when a noreloc allocation is
1840 1838 * looping here in page_create_get_something waiting for more noreloc
1841 1839 * pages to appear.
1842 1840 *
1843 1841 * Short of adding a new field to the pcf bins to accurately track
1844 1842 * the number of free noreloc pages, we instead do not grab the
1845 1843 * pcgs_lock, do not set the pcf blocks and do not timeout when
1846 1844 * allocating a noreloc page. This allows noreloc allocations to
1847 1845 * loop without blocking global page pool allocations.
1848 1846 *
1849 1847 * NOTE: the behaviour of page_create_get_something has not changed
1850 1848 * for the case of global page pool allocations.
1851 1849 */
1852 1850
1853 1851 flags &= ~PG_MATCH_COLOR;
1854 1852 locked = 0;
1855 1853 #if defined(__i386) || defined(__amd64)
1856 1854 flags = page_create_update_flags_x86(flags);
1857 1855 #endif
1858 1856
1859 1857 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1860 1858
1861 1859 for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1862 1860 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1863 1861 flags, lgrp);
1864 1862 if (pp == NULL) {
1865 1863 pp = page_get_cachelist(vp, off, seg, vaddr,
1866 1864 flags, lgrp);
1867 1865 }
1868 1866 if (pp == NULL) {
1869 1867 /*
1870 1868 * Serialize. Don't fight with other pcgs().
1871 1869 */
1872 1870 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1873 1871 mutex_enter(&pcgs_lock);
1874 1872 VM_STAT_ADD(pcgs_locked);
1875 1873 locked = 1;
1876 1874 p = pcf;
1877 1875 for (i = 0; i < pcf_fanout; i++) {
1878 1876 mutex_enter(&p->pcf_lock);
1879 1877 ASSERT(p->pcf_block == 0);
1880 1878 p->pcf_block = 1;
1881 1879 p->pcf_reserve = p->pcf_count;
1882 1880 p->pcf_count = 0;
1883 1881 mutex_exit(&p->pcf_lock);
1884 1882 p++;
1885 1883 }
1886 1884 freemem = 0;
1887 1885 }
1888 1886
1889 1887 if (count) {
1890 1888 /*
1891 1889 * Since page_free() puts pages on
1892 1890 * a list then accounts for it, we
1893 1891 * just have to wait for page_free()
1894 1892 * to unlock any page it was working
1895 1893 * with. The page_lock()-page_reclaim()
1896 1894 * path falls in the same boat.
1897 1895 *
1898 1896 * We don't need to check on the
1899 1897 * PG_WAIT flag, we have already
1900 1898 * accounted for the page we are
1901 1899 * looking for in page_create_va().
1902 1900 *
1903 1901 * We just wait a moment to let any
1904 1902 * locked pages on the lists free up,
1905 1903 * then continue around and try again.
1906 1904 *
1907 1905 * Will be awakened by set_freemem().
1908 1906 */
1909 1907 mutex_enter(&pcgs_wait_lock);
1910 1908 cv_wait(&pcgs_cv, &pcgs_wait_lock);
1911 1909 mutex_exit(&pcgs_wait_lock);
1912 1910 }
1913 1911 } else {
1914 1912 #ifdef VM_STATS
1915 1913 if (count >= PCGS_TRIES) {
1916 1914 VM_STAT_ADD(pcgs_too_many);
1917 1915 } else {
1918 1916 VM_STAT_ADD(pcgs_counts[count]);
1919 1917 }
1920 1918 #endif
1921 1919 if (locked) {
1922 1920 pcgs_unblock();
1923 1921 mutex_exit(&pcgs_lock);
1924 1922 }
1925 1923 if (cagelocked)
1926 1924 mutex_exit(&pcgs_cagelock);
1927 1925 return (pp);
1928 1926 }
1929 1927 }
1930 1928 /*
1931 1929 * we go down holding the pcf locks.
1932 1930 */
1933 1931 panic("no %spage found %d",
1934 1932 ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1935 1933 /*NOTREACHED*/
1936 1934 }
1937 1935
1938 1936 /*
1939 1937 * Create enough pages for "bytes" worth of data starting at
1940 1938 * "off" in "vp".
1941 1939 *
1942 1940 * Where flag must be one of:
1943 1941 *
1944 1942 * PG_EXCL: Exclusive create (fail if any page already
1945 1943 * exists in the page cache) which does not
1946 1944 * wait for memory to become available.
1947 1945 *
1948 1946 * PG_WAIT: Non-exclusive create which can wait for
1949 1947 * memory to become available.
1950 1948 *
1951 1949 * PG_PHYSCONTIG: Allocate physically contiguous pages.
1952 1950 * (Not Supported)
1953 1951 *
1954 1952 * A doubly linked list of pages is returned to the caller. Each page
1955 1953 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
1956 1954 * lock.
1957 1955 *
1958 1956 * Unable to change the parameters to page_create() in a minor release,
1959 1957 * we renamed page_create() to page_create_va(), changed all known calls
1960 1958 * from page_create() to page_create_va(), and created this wrapper.
1961 1959 *
1962 1960 * Upon a major release, we should break compatibility by deleting this
1963 1961 * wrapper, and replacing all the strings "page_create_va", with "page_create".
1964 1962 *
1965 1963 * NOTE: There is a copy of this interface as page_create_io() in
1966 1964 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1967 1965 * there.
1968 1966 */
1969 1967 page_t *
1970 1968 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1971 1969 {
1972 1970 caddr_t random_vaddr;
1973 1971 struct seg kseg;
1974 1972
1975 1973 #ifdef DEBUG
1976 1974 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1977 1975 (void *)caller());
1978 1976 #endif
1979 1977
1980 1978 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1981 1979 (uintptr_t)(off >> PAGESHIFT));
1982 1980 kseg.s_as = &kas;
1983 1981
1984 1982 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1985 1983 }
1986 1984
1987 1985 #ifdef DEBUG
1988 1986 uint32_t pg_alloc_pgs_mtbf = 0;
1989 1987 #endif
1990 1988
1991 1989 /*
1992 1990 * Used for large page support. It will attempt to allocate
1993 1991 * a large page(s) off the freelist.
1994 1992 *
1995 1993 * Returns non zero on failure.
1996 1994 */
1997 1995 int
1998 1996 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
1999 1997 page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
2000 1998 {
2001 1999 pgcnt_t npgs, curnpgs, totpgs;
2002 2000 size_t pgsz;
2003 2001 page_t *pplist = NULL, *pp;
2004 2002 int err = 0;
2005 2003 lgrp_t *lgrp;
2006 2004
2007 2005 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
2008 2006 ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
2009 2007
2010 2008 /*
2011 2009 * Check if system heavily prefers local large pages over remote
2012 2010 * on systems with multiple lgroups.
2013 2011 */
2014 2012 if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
2015 2013 pgflags = PG_LOCAL;
2016 2014 }
2017 2015
2018 2016 VM_STAT_ADD(alloc_pages[0]);
2019 2017
2020 2018 #ifdef DEBUG
2021 2019 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2022 2020 return (ENOMEM);
2023 2021 }
2024 2022 #endif
2025 2023
2026 2024 /*
2027 2025 * Exactly one of basepp and ppa must be non-NULL;
2028 2026 * the other must be NULL.
2029 2027 */
2030 2028 ASSERT(basepp != NULL || ppa != NULL);
2031 2029 ASSERT(basepp == NULL || ppa == NULL);
2032 2030
2033 2031 #if defined(__i386) || defined(__amd64)
2034 2032 while (page_chk_freelist(szc) == 0) {
2035 2033 VM_STAT_ADD(alloc_pages[8]);
2036 2034 if (anypgsz == 0 || --szc == 0)
2037 2035 return (ENOMEM);
2038 2036 }
2039 2037 #endif
2040 2038
2041 2039 pgsz = page_get_pagesize(szc);
2042 2040 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2043 2041
2044 2042 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2045 2043
2046 2044 (void) page_create_wait(npgs, PG_WAIT);
2047 2045
2048 2046 while (npgs && szc) {
2049 2047 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2050 2048 if (pgflags == PG_LOCAL) {
2051 2049 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2052 2050 pgflags, lgrp);
2053 2051 if (pp == NULL) {
2054 2052 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2055 2053 0, lgrp);
2056 2054 }
2057 2055 } else {
2058 2056 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2059 2057 0, lgrp);
2060 2058 }
2061 2059 if (pp != NULL) {
2062 2060 VM_STAT_ADD(alloc_pages[1]);
2063 2061 page_list_concat(&pplist, &pp);
2064 2062 ASSERT(npgs >= curnpgs);
2065 2063 npgs -= curnpgs;
2066 2064 } else if (anypgsz) {
2067 2065 VM_STAT_ADD(alloc_pages[2]);
2068 2066 szc--;
2069 2067 pgsz = page_get_pagesize(szc);
2070 2068 curnpgs = pgsz >> PAGESHIFT;
2071 2069 } else {
2072 2070 VM_STAT_ADD(alloc_pages[3]);
2073 2071 ASSERT(npgs == totpgs);
2074 2072 page_create_putback(npgs);
2075 2073 return (ENOMEM);
2076 2074 }
2077 2075 }
2078 2076 if (szc == 0) {
2079 2077 VM_STAT_ADD(alloc_pages[4]);
2080 2078 ASSERT(npgs != 0);
2081 2079 page_create_putback(npgs);
2082 2080 err = ENOMEM;
2083 2081 } else if (basepp != NULL) {
2084 2082 ASSERT(npgs == 0);
2085 2083 ASSERT(ppa == NULL);
2086 2084 *basepp = pplist;
2087 2085 }
2088 2086
2089 2087 npgs = totpgs - npgs;
2090 2088 pp = pplist;
2091 2089
2092 2090 /*
2093 2091 * Clear the free and age bits. Also if we were passed in a ppa then
2094 2092 * fill it in with all the constituent pages from the large page. But
2095 2093 * if we failed to allocate all the pages just free what we got.
2096 2094 */
2097 2095 while (npgs != 0) {
2098 2096 ASSERT(PP_ISFREE(pp));
2099 2097 ASSERT(PP_ISAGED(pp));
2100 2098 if (ppa != NULL || err != 0) {
2101 2099 if (err == 0) {
2102 2100 VM_STAT_ADD(alloc_pages[5]);
2103 2101 PP_CLRFREE(pp);
2104 2102 PP_CLRAGED(pp);
2105 2103 page_sub(&pplist, pp);
2106 2104 *ppa++ = pp;
2107 2105 npgs--;
2108 2106 } else {
2109 2107 VM_STAT_ADD(alloc_pages[6]);
2110 2108 ASSERT(pp->p_szc != 0);
2111 2109 curnpgs = page_get_pagecnt(pp->p_szc);
2112 2110 page_list_break(&pp, &pplist, curnpgs);
2113 2111 page_list_add_pages(pp, 0);
2114 2112 page_create_putback(curnpgs);
2115 2113 ASSERT(npgs >= curnpgs);
2116 2114 npgs -= curnpgs;
2117 2115 }
2118 2116 pp = pplist;
2119 2117 } else {
2120 2118 VM_STAT_ADD(alloc_pages[7]);
2121 2119 PP_CLRFREE(pp);
2122 2120 PP_CLRAGED(pp);
2123 2121 pp = pp->p_next;
2124 2122 npgs--;
2125 2123 }
2126 2124 }
2127 2125 return (err);
2128 2126 }
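
The size-demotion loop in page_alloc_pages() (try the requested page size, drop to the next smaller size when anypgsz allows it, fail otherwise) can be sketched in a few lines of user-level C; the page-size table and fake_get_freelist() below are invented for the example.

/*
 * Editorial sketch (not kernel code): allocate npgs small pages by taking
 * chunks at the largest page size the freelists can supply, demoting the
 * request when anypgsz permits it.
 */
#include <stdio.h>

#define	NSIZES	3
static const unsigned long pages_per_szc[NSIZES] = { 1, 512, 262144 };

/* pretend only the two smaller sizes can currently be satisfied */
static int
fake_get_freelist(int szc)
{
	return (szc <= 1);
}

static int
alloc_large(unsigned long npgs, int szc, int anypgsz)
{
	unsigned long chunk = pages_per_szc[szc];

	while (npgs != 0 && szc != 0) {
		if (fake_get_freelist(szc)) {
			npgs -= (chunk <= npgs ? chunk : npgs);
		} else if (anypgsz) {
			szc--;				/* demote the request */
			chunk = pages_per_szc[szc];
		} else {
			return (-1);	/* caller wanted this size only */
		}
	}
	return (npgs == 0 ? 0 : -1);
}

int
main(void)
{
	printf("anypgsz=1 -> %d\n", alloc_large(262144, 2, 1));	/* succeeds */
	printf("anypgsz=0 -> %d\n", alloc_large(262144, 2, 0));	/* fails */
	return (0);
}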
2129 2127
2130 2128 /*
2131 2129 * Get a single large page off of the freelists, and set it up for use.
2132 2130 * Number of bytes requested must be a supported page size.
2133 2131 *
2134 2132 * Note that this call may fail even if there is sufficient
2135 2133 * memory available or PG_WAIT is set, so the caller must
2136 2134 * be willing to fall back on page_create_va(), block and retry,
2137 2135 * or fail the requester.
2138 2136 */
2139 2137 page_t *
2140 2138 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2141 2139 struct seg *seg, caddr_t vaddr, void *arg)
2142 2140 {
2143 2141 pgcnt_t npages;
2144 2142 page_t *pp;
2145 2143 page_t *rootpp;
2146 2144 lgrp_t *lgrp;
2147 2145 lgrp_id_t *lgrpid = (lgrp_id_t *)arg;
2148 2146
2149 2147 ASSERT(vp != NULL);
2150 2148
2151 2149 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2152 2150 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2153 2151 /* but no others */
2154 2152
2155 2153 ASSERT((flags & PG_EXCL) == PG_EXCL);
2156 2154
2157 2155 npages = btop(bytes);
2158 2156
2159 2157 if (!kcage_on || panicstr) {
2160 2158 /*
2161 2159 * Cage is OFF, or we are single threaded in
2162 2160 * panic, so make everything a RELOC request.
2163 2161 */
2164 2162 flags &= ~PG_NORELOC;
2165 2163 }
2166 2164
2167 2165 /*
2168 2166 * Make sure there's adequate physical memory available.
2169 2167 * Note: PG_WAIT is ignored here.
2170 2168 */
2171 2169 if (freemem <= throttlefree + npages) {
2172 2170 VM_STAT_ADD(page_create_large_cnt[1]);
2173 2171 return (NULL);
2174 2172 }
2175 2173
2176 2174 /*
2177 2175 * If cage is on, dampen draw from cage when available
2178 2176 * cage space is low.
2179 2177 */
2180 2178 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) &&
2181 2179 kcage_freemem < kcage_throttlefree + npages) {
2182 2180
2183 2181 /*
2184 2182 * The cage is on, the caller wants PG_NORELOC
2185 2183 * pages and available cage memory is very low.
2186 2184 * Call kcage_create_throttle() to attempt to
2187 2185 * control demand on the cage.
2188 2186 */
2189 2187 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2190 2188 VM_STAT_ADD(page_create_large_cnt[2]);
2191 2189 return (NULL);
2192 2190 }
2193 2191 }
2194 2192
2195 2193 if (!pcf_decrement_bucket(npages) &&
2196 2194 !pcf_decrement_multiple(NULL, npages, 1)) {
2197 2195 VM_STAT_ADD(page_create_large_cnt[4]);
2198 2196 return (NULL);
2199 2197 }
2200 2198
2201 2199 /*
2202 2200 * This is where this function behaves fundamentally differently
2203 2201 * than page_create_va(); since we're intending to map the page
2204 2202 * with a single TTE, we have to get it as a physically contiguous
2205 2203 * hardware pagesize chunk. If we can't, we fail.
2206 2204 */
2207 2205 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2208 2206 LGRP_EXISTS(lgrp_table[*lgrpid]))
2209 2207 lgrp = lgrp_table[*lgrpid];
2210 2208 else
2211 2209 lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2212 2210
2213 2211 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2214 2212 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2215 2213 page_create_putback(npages);
2216 2214 VM_STAT_ADD(page_create_large_cnt[5]);
2217 2215 return (NULL);
2218 2216 }
2219 2217
2220 2218 /*
2221 2219 * If we got the page with the wrong mtype, give it back; this is a
2222 2220 * workaround for CR 6249718. When CR 6249718 is fixed we never get
2223 2221 * inside the "if" and the workaround becomes just a nop.
2224 2222 */
2225 2223 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2226 2224 page_list_add_pages(rootpp, 0);
2227 2225 page_create_putback(npages);
2228 2226 VM_STAT_ADD(page_create_large_cnt[6]);
2229 2227 return (NULL);
2230 2228 }
2231 2229
2232 2230 /*
2233 2231 * If satisfying this request has left us with too little
2234 2232 * memory, start the wheels turning to get some back. The
2235 2233 * first clause of the test prevents waking up the pageout
2236 2234 * daemon in situations where it would decide that there's
2237 2235 * nothing to do.
2238 2236 */
2239 2237 if (nscan < desscan && freemem < minfree) {
2240 2238 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2241 2239 "pageout_cv_signal:freemem %ld", freemem);
2242 2240 cv_signal(&proc_pageout->p_cv);
2243 2241 }
2244 2242
2245 2243 pp = rootpp;
2246 2244 while (npages--) {
2247 2245 ASSERT(PAGE_EXCL(pp));
2248 2246 ASSERT(pp->p_vnode == NULL);
2249 2247 ASSERT(!hat_page_is_mapped(pp));
2250 2248 PP_CLRFREE(pp);
2251 2249 PP_CLRAGED(pp);
2252 2250 if (!page_hashin(pp, vp, off, NULL))
2253 2251 panic("page_create_large: hashin failed: page %p",
2254 2252 (void *)pp);
2255 2253 page_io_lock(pp);
2256 2254 off += PAGESIZE;
2257 2255 pp = pp->p_next;
2258 2256 }
2259 2257
2260 2258 VM_STAT_ADD(page_create_large_cnt[0]);
2261 2259 return (rootpp);
2262 2260 }
2263 2261
2264 2262 page_t *
2265 2263 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2266 2264 struct seg *seg, caddr_t vaddr)
2267 2265 {
2268 2266 page_t *plist = NULL;
2269 2267 pgcnt_t npages;
2270 2268 pgcnt_t found_on_free = 0;
2271 2269 pgcnt_t pages_req;
2272 2270 page_t *npp = NULL;
2273 2271 struct pcf *p;
2274 2272 lgrp_t *lgrp;
2275 2273
2276 2274 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2277 2275 "page_create_start:vp %p off %llx bytes %lu flags %x",
2278 2276 vp, off, bytes, flags);
2279 2277
2280 2278 ASSERT(bytes != 0 && vp != NULL);
2281 2279
2282 2280 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2283 2281 panic("page_create: invalid flags");
2284 2282 /*NOTREACHED*/
2285 2283 }
2286 2284 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2287 2285 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2288 2286 /* but no others */
2289 2287
2290 2288 pages_req = npages = btopr(bytes);
2291 2289 /*
2292 2290 * Try to see whether request is too large to *ever* be
2293 2291 * satisfied, in order to prevent deadlock. We arbitrarily
2294 2292 * decide to limit maximum size requests to max_page_get.
2295 2293 */
2296 2294 if (npages >= max_page_get) {
2297 2295 if ((flags & PG_WAIT) == 0) {
2298 2296 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2299 2297 "page_create_toobig:vp %p off %llx npages "
2300 2298 "%lu max_page_get %lu",
2301 2299 vp, off, npages, max_page_get);
2302 2300 return (NULL);
2303 2301 } else {
2304 2302 cmn_err(CE_WARN,
2305 2303 "Request for too much kernel memory "
2306 2304 "(%lu bytes), will hang forever", bytes);
2307 2305 for (;;)
2308 2306 delay(1000000000);
2309 2307 }
2310 2308 }
2311 2309
2312 2310 if (!kcage_on || panicstr) {
2313 2311 /*
2314 2312 * Cage is OFF, or we are single threaded in
2315 2313 * panic, so make everything a RELOC request.
2316 2314 */
2317 2315 flags &= ~PG_NORELOC;
2318 2316 }
2319 2317
2320 2318 if (freemem <= throttlefree + npages)
2321 2319 if (!page_create_throttle(npages, flags))
2322 2320 return (NULL);
2323 2321
2324 2322 /*
2325 2323 * If cage is on, dampen draw from cage when available
2326 2324 * cage space is low.
2327 2325 */
2328 2326 if ((flags & PG_NORELOC) &&
2329 2327 kcage_freemem < kcage_throttlefree + npages) {
2330 2328
2331 2329 /*
2332 2330 * The cage is on, the caller wants PG_NORELOC
2333 2331 * pages and available cage memory is very low.
2334 2332 * Call kcage_create_throttle() to attempt to
2335 2333 * control demand on the cage.
2336 2334 */
2337 2335 if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2338 2336 return (NULL);
2339 2337 }
2340 2338
2341 2339 VM_STAT_ADD(page_create_cnt[0]);
2342 2340
2343 2341 if (!pcf_decrement_bucket(npages)) {
2344 2342 /*
2345 2343 * Have to look harder. If npages is greater than
2346 2344 * one, then we might have to coalesce the counters.
2347 2345 *
2348 2346 * Go wait. We come back having accounted
2349 2347 * for the memory.
2350 2348 */
2351 2349 VM_STAT_ADD(page_create_cnt[1]);
2352 2350 if (!page_create_wait(npages, flags)) {
2353 2351 VM_STAT_ADD(page_create_cnt[2]);
2354 2352 return (NULL);
2355 2353 }
2356 2354 }
2357 2355
2358 2356 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2359 2357 "page_create_success:vp %p off %llx", vp, off);
2360 2358
2361 2359 /*
2362 2360 * If satisfying this request has left us with too little
2363 2361 * memory, start the wheels turning to get some back. The
2364 2362 * first clause of the test prevents waking up the pageout
2365 2363 * daemon in situations where it would decide that there's
2366 2364 * nothing to do.
2367 2365 */
2368 2366 if (nscan < desscan && freemem < minfree) {
2369 2367 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2370 2368 "pageout_cv_signal:freemem %ld", freemem);
2371 2369 cv_signal(&proc_pageout->p_cv);
2372 2370 }
2373 2371
2374 2372 /*
2375 2373 * Loop around collecting the requested number of pages.
2376 2374 * Most of the time, we have to `create' a new page. With
2377 2375 * this in mind, pull the page off the free list before
2378 2376 * getting the hash lock. This will minimize the hash
2379 2377 * lock hold time, nesting, and the like. If it turns
2380 2378 * out we don't need the page, we put it back at the end.
2381 2379 */
2382 2380 while (npages--) {
2383 2381 page_t *pp;
2384 2382 kmutex_t *phm = NULL;
2385 2383 ulong_t index;
2386 2384
2387 2385 index = PAGE_HASH_FUNC(vp, off);
2388 2386 top:
2389 2387 ASSERT(phm == NULL);
2390 2388 ASSERT(index == PAGE_HASH_FUNC(vp, off));
2391 2389 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2392 2390
2393 2391 if (npp == NULL) {
2394 2392 /*
2395 2393 * Try to get a page from the freelist (ie,
2396 2394 * a page with no [vp, off] tag). If that
2397 2395 * fails, use the cachelist.
2398 2396 *
2399 2397 * During the first attempt at both the free
2400 2398 * and cache lists we try for the correct color.
2401 2399 */
2402 2400 /*
2403 2401 * XXXX-how do we deal with virtually indexed
2404 2402 * caches and colors?
2405 2403 */
2406 2404 VM_STAT_ADD(page_create_cnt[4]);
2407 2405 /*
2408 2406 * Get lgroup to allocate next page of shared memory
2409 2407 * from and use it to specify where to allocate
2410 2408 * the physical memory
2411 2409 */
2412 2410 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2413 2411 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2414 2412 flags | PG_MATCH_COLOR, lgrp);
2415 2413 if (npp == NULL) {
2416 2414 npp = page_get_cachelist(vp, off, seg,
2417 2415 vaddr, flags | PG_MATCH_COLOR, lgrp);
2418 2416 if (npp == NULL) {
2419 2417 npp = page_create_get_something(vp,
2420 2418 off, seg, vaddr,
2421 2419 flags & ~PG_MATCH_COLOR);
2422 2420 }
2423 2421
2424 2422 if (PP_ISAGED(npp) == 0) {
2425 2423 /*
2426 2424 * Since this page came from the
2427 2425 * cachelist, we must destroy the
2428 2426 * old vnode association.
2429 2427 */
2430 2428 page_hashout(npp, NULL);
2431 2429 }
2432 2430 }
2433 2431 }
2434 2432
2435 2433 /*
2436 2434 * We own this page!
2437 2435 */
2438 2436 ASSERT(PAGE_EXCL(npp));
2439 2437 ASSERT(npp->p_vnode == NULL);
2440 2438 ASSERT(!hat_page_is_mapped(npp));
2441 2439 PP_CLRFREE(npp);
2442 2440 PP_CLRAGED(npp);
2443 2441
2444 2442 /*
2445 2443 * Here we have a page in our hot little mitts and are
2446 2444 * just waiting to stuff it on the appropriate lists.
2447 2445 * Get the mutex and check to see if it really does
2448 2446 * not exist.
2449 2447 */
2450 2448 phm = PAGE_HASH_MUTEX(index);
2451 2449 mutex_enter(phm);
2452 2450 PAGE_HASH_SEARCH(index, pp, vp, off);
2453 2451 if (pp == NULL) {
2454 2452 VM_STAT_ADD(page_create_new);
2455 2453 pp = npp;
2456 2454 npp = NULL;
2457 2455 if (!page_hashin(pp, vp, off, phm)) {
2458 2456 /*
2459 2457 * Since we hold the page hash mutex and
2460 2458 * just searched for this page, page_hashin
2461 2459 * had better not fail. If it does, that
2462 2460 * means some thread did not follow the
2463 2461 * page hash mutex rules. Panic now and
2464 2462 * get it over with. As usual, go down
2465 2463 * holding all the locks.
2466 2464 */
2467 2465 ASSERT(MUTEX_HELD(phm));
2468 2466 panic("page_create: "
2469 2467 "hashin failed %p %p %llx %p",
2470 2468 (void *)pp, (void *)vp, off, (void *)phm);
2471 2469 /*NOTREACHED*/
2472 2470 }
2473 2471 ASSERT(MUTEX_HELD(phm));
2474 2472 mutex_exit(phm);
2475 2473 phm = NULL;
2476 2474
2477 2475 /*
2478 2476 * Hat layer locking need not be done to set
2479 2477 * the following bits since the page is not hashed
2480 2478 * and was on the free list (i.e., had no mappings).
2481 2479 *
2482 2480 * Set the reference bit to protect
2483 2481 * against immediate pageout
2484 2482 *
2485 2483 * XXXmh modify freelist code to set reference
2486 2484 * bit so we don't have to do it here.
2487 2485 */
2488 2486 page_set_props(pp, P_REF);
2489 2487 found_on_free++;
2490 2488 } else {
2491 2489 VM_STAT_ADD(page_create_exists);
2492 2490 if (flags & PG_EXCL) {
2493 2491 /*
2494 2492 * Found an existing page, and the caller
2495 2493 * wanted all new pages. Undo all of the work
2496 2494 * we have done.
2497 2495 */
2498 2496 mutex_exit(phm);
2499 2497 phm = NULL;
2500 2498 while (plist != NULL) {
2501 2499 pp = plist;
2502 2500 page_sub(&plist, pp);
2503 2501 page_io_unlock(pp);
2504 2502 /* large pages should not end up here */
2505 2503 ASSERT(pp->p_szc == 0);
2506 2504 /*LINTED: constant in conditional ctx*/
2507 2505 VN_DISPOSE(pp, B_INVAL, 0, kcred);
2508 2506 }
2509 2507 VM_STAT_ADD(page_create_found_one);
2510 2508 goto fail;
2511 2509 }
2512 2510 ASSERT(flags & PG_WAIT);
2513 2511 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2514 2512 /*
2515 2513 * Start all over again if we blocked trying
2516 2514 * to lock the page.
2517 2515 */
2518 2516 mutex_exit(phm);
2519 2517 VM_STAT_ADD(page_create_page_lock_failed);
2520 2518 phm = NULL;
2521 2519 goto top;
2522 2520 }
2523 2521 mutex_exit(phm);
2524 2522 phm = NULL;
2525 2523
2526 2524 if (PP_ISFREE(pp)) {
2527 2525 ASSERT(PP_ISAGED(pp) == 0);
2528 2526 VM_STAT_ADD(pagecnt.pc_get_cache);
2529 2527 page_list_sub(pp, PG_CACHE_LIST);
2530 2528 PP_CLRFREE(pp);
2531 2529 found_on_free++;
2532 2530 }
2533 2531 }
2534 2532
2535 2533 /*
2536 2534 * Got a page! It is locked. Acquire the i/o
2537 2535 * lock since we are going to use the p_next and
2538 2536 * p_prev fields to link the requested pages together.
2539 2537 */
2540 2538 page_io_lock(pp);
2541 2539 page_add(&plist, pp);
2542 2540 plist = plist->p_next;
2543 2541 off += PAGESIZE;
2544 2542 vaddr += PAGESIZE;
2545 2543 }
2546 2544
2547 2545 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2548 2546 fail:
2549 2547 if (npp != NULL) {
2550 2548 /*
2551 2549 * Did not need this page after all.
2552 2550 * Put it back on the free list.
2553 2551 */
2554 2552 VM_STAT_ADD(page_create_putbacks);
2555 2553 PP_SETFREE(npp);
2556 2554 PP_SETAGED(npp);
2557 2555 npp->p_offset = (u_offset_t)-1;
2558 2556 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2559 2557 page_unlock(npp);
2560 2558
2561 2559 }
2562 2560
2563 2561 ASSERT(pages_req >= found_on_free);
2564 2562
2565 2563 {
2566 2564 uint_t overshoot = (uint_t)(pages_req - found_on_free);
2567 2565
2568 2566 if (overshoot) {
2569 2567 VM_STAT_ADD(page_create_overshoot);
2570 2568 p = &pcf[PCF_INDEX()];
2571 2569 mutex_enter(&p->pcf_lock);
2572 2570 if (p->pcf_block) {
2573 2571 p->pcf_reserve += overshoot;
2574 2572 } else {
2575 2573 p->pcf_count += overshoot;
2576 2574 if (p->pcf_wait) {
2577 2575 mutex_enter(&new_freemem_lock);
2578 2576 if (freemem_wait) {
2579 2577 cv_signal(&freemem_cv);
2580 2578 p->pcf_wait--;
2581 2579 } else {
2582 2580 p->pcf_wait = 0;
2583 2581 }
2584 2582 mutex_exit(&new_freemem_lock);
2585 2583 }
2586 2584 }
2587 2585 mutex_exit(&p->pcf_lock);
2588 2586 /* freemem is approximate, so this test OK */
2589 2587 if (!p->pcf_block)
2590 2588 freemem += overshoot;
2591 2589 }
2592 2590 }
2593 2591
2594 2592 return (plist);
2595 2593 }
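
When page_create_va() pre-accounts for pages_req pages but ends up taking fewer off the free lists, the difference is handed back to a single pcf bucket, as the overshoot block above shows. A compact sketch of that give-back, with illustrative field names and starting values:

/*
 * Editorial sketch (not kernel code): return the "overshoot" to one bucket,
 * to its reserve if the bucket is blocked, otherwise to its count and to
 * the (approximate, unlocked) freemem estimate.
 */
#include <stdio.h>

struct bucket {
	unsigned int count;	/* like pcf_count */
	unsigned int reserve;	/* like pcf_reserve */
	int block;		/* like pcf_block */
};

static unsigned long freemem = 100;	/* illustrative starting value */

static void
put_back_overshoot(struct bucket *p, unsigned long pages_req,
    unsigned long found_on_free)
{
	unsigned int overshoot = (unsigned int)(pages_req - found_on_free);

	if (overshoot == 0)
		return;
	if (p->block) {
		p->reserve += overshoot;	/* parked until unblocked */
	} else {
		p->count += overshoot;
		freemem += overshoot;		/* estimate only, no lock */
	}
}

int
main(void)
{
	struct bucket b = { 10, 0, 0 };

	/* accounted for 8 pages, but only 5 were taken off the free lists */
	put_back_overshoot(&b, 8, 5);
	printf("count=%u freemem=%lu\n", b.count, freemem);	/* 13, 103 */
	return (0);
}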
2596 2594
2597 2595 /*
2598 2596 * One or more constituent pages of this large page has been marked
2599 2597 * toxic. Simply demote the large page to PAGESIZE pages and let
2600 2598 * page_free() handle it. This routine should only be called by
2601 2599 * large page free routines (page_free_pages() and page_destroy_pages()).
2602 2600 * All pages are locked SE_EXCL and have already been marked free.
2603 2601 */
2604 2602 static void
2605 2603 page_free_toxic_pages(page_t *rootpp)
2606 2604 {
2607 2605 page_t *tpp;
2608 2606 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2609 2607 uint_t szc = rootpp->p_szc;
2610 2608
2611 2609 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2612 2610 ASSERT(tpp->p_szc == szc);
2613 2611 ASSERT((PAGE_EXCL(tpp) &&
2614 2612 !page_iolock_assert(tpp)) || panicstr);
2615 2613 tpp->p_szc = 0;
2616 2614 }
2617 2615
2618 2616 while (rootpp != NULL) {
2619 2617 tpp = rootpp;
2620 2618 page_sub(&rootpp, tpp);
2621 2619 ASSERT(PP_ISFREE(tpp));
2622 2620 PP_CLRFREE(tpp);
2623 2621 page_free(tpp, 1);
2624 2622 }
2625 2623 }
2626 2624
2627 2625 /*
2628 2626 * Put page on the "free" list.
2629 2627 * The free list is really two lists maintained by
2630 2628 * the PSM of whatever machine we happen to be on.
2631 2629 */
2632 2630 void
2633 2631 page_free(page_t *pp, int dontneed)
2634 2632 {
2635 2633 struct pcf *p;
2636 2634 uint_t pcf_index;
2637 2635
2638 2636 ASSERT((PAGE_EXCL(pp) &&
2639 2637 !page_iolock_assert(pp)) || panicstr);
2640 2638
2641 2639 if (PP_ISFREE(pp)) {
2642 2640 panic("page_free: page %p is free", (void *)pp);
2643 2641 }
2644 2642
2645 2643 if (pp->p_szc != 0) {
2646 2644 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2647 2645 PP_ISKAS(pp)) {
2648 2646 panic("page_free: anon or kernel "
2649 2647 "or no vnode large page %p", (void *)pp);
2650 2648 }
2651 2649 page_demote_vp_pages(pp);
2652 2650 ASSERT(pp->p_szc == 0);
2653 2651 }
2654 2652
2655 2653 /*
2656 2654 * The page_struct_lock need not be acquired to examine these
2657 2655 * fields since the page has an "exclusive" lock.
2658 2656 */
2659 2657 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2660 2658 pp->p_slckcnt != 0) {
2661 2659 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2662 2660 "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt,
2663 2661 pp->p_cowcnt, pp->p_slckcnt);
2664 2662 /*NOTREACHED*/
2665 2663 }
2666 2664
2667 2665 ASSERT(!hat_page_getshare(pp));
2668 2666
2669 2667 PP_SETFREE(pp);
2670 2668 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2671 2669 !hat_ismod(pp));
2672 2670 page_clr_all_props(pp);
2673 2671 ASSERT(!hat_page_getshare(pp));
2674 2672
2675 2673 /*
2676 2674 * Now we add the page to the head of the free list.
2677 2675 * But if this page is associated with a paged vnode
2678 2676 * then we adjust the head forward so that the page is
2679 2677 * effectively at the end of the list.
2680 2678 */
2681 2679 if (pp->p_vnode == NULL) {
2682 2680 /*
2683 2681 * Page has no identity, put it on the free list.
2684 2682 */
2685 2683 PP_SETAGED(pp);
2686 2684 pp->p_offset = (u_offset_t)-1;
2687 2685 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2688 2686 VM_STAT_ADD(pagecnt.pc_free_free);
2689 2687 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2690 2688 "page_free_free:pp %p", pp);
2691 2689 } else {
2692 2690 PP_CLRAGED(pp);
2693 2691
2694 - if (!dontneed || nopageage) {
2692 + if (!dontneed) {
2695 2693 /* move it to the tail of the list */
2696 2694 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2697 2695
2698 2696 VM_STAT_ADD(pagecnt.pc_free_cache);
2699 2697 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2700 2698 "page_free_cache_tail:pp %p", pp);
2701 2699 } else {
2702 2700 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2703 2701
2704 2702 VM_STAT_ADD(pagecnt.pc_free_dontneed);
2705 2703 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2706 2704 "page_free_cache_head:pp %p", pp);
2707 2705 }
2708 2706 }
2709 2707 page_unlock(pp);
2710 2708
2711 2709 /*
2712 2710 * Now do the `freemem' accounting.
2713 2711 */
2714 2712 pcf_index = PCF_INDEX();
2715 2713 p = &pcf[pcf_index];
2716 2714
2717 2715 mutex_enter(&p->pcf_lock);
2718 2716 if (p->pcf_block) {
2719 2717 p->pcf_reserve += 1;
2720 2718 } else {
2721 2719 p->pcf_count += 1;
2722 2720 if (p->pcf_wait) {
2723 2721 mutex_enter(&new_freemem_lock);
2724 2722 /*
2725 2723 * Check to see if some other thread
2726 2724 * is actually waiting. Another bucket
2727 2725 * may have woken it up by now. If there
2728 2726 * are no waiters, then set our pcf_wait
2729 2727 * count to zero to avoid coming in here
2730 2728 * next time. Also, since only one page
2731 2729 * was put on the free list, just wake
2732 2730 * up one waiter.
2733 2731 */
2734 2732 if (freemem_wait) {
2735 2733 cv_signal(&freemem_cv);
2736 2734 p->pcf_wait--;
2737 2735 } else {
2738 2736 p->pcf_wait = 0;
2739 2737 }
2740 2738 mutex_exit(&new_freemem_lock);
2741 2739 }
2742 2740 }
2743 2741 mutex_exit(&p->pcf_lock);
2744 2742
2745 2743 /* freemem is approximate, so this test OK */
2746 2744 if (!p->pcf_block)
2747 2745 freemem += 1;
2748 2746 }
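
With the nopageage override removed by this change, the dontneed argument alone decides whether a vnode page goes to the tail of the cachelist (keep it around) or the head (reclaim it first). A trivial sketch of that decision; the enum and function name are illustrative:

/*
 * Editorial sketch (not kernel code): cachelist placement for a vnode page
 * after the nopageage override was removed.
 */
#include <stdio.h>

enum where { LIST_HEAD, LIST_TAIL };

static enum where
cachelist_placement(int dontneed)
{
	/*
	 * Tail = "keep it around a while"; head = "reclaim me first",
	 * which is what a caller means by dontneed.
	 */
	return (dontneed ? LIST_HEAD : LIST_TAIL);
}

int
main(void)
{
	printf("dontneed=0 -> %s\n",
	    cachelist_placement(0) == LIST_TAIL ? "tail" : "head");
	printf("dontneed=1 -> %s\n",
	    cachelist_placement(1) == LIST_TAIL ? "tail" : "head");
	return (0);
}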
2749 2747
2750 2748 /*
2751 2749 * Put page on the "free" list during initial startup.
2752 2750 * This happens during initial single threaded execution.
2753 2751 */
2754 2752 void
2755 2753 page_free_at_startup(page_t *pp)
2756 2754 {
2757 2755 struct pcf *p;
2758 2756 uint_t pcf_index;
2759 2757
2760 2758 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2761 2759 VM_STAT_ADD(pagecnt.pc_free_free);
2762 2760
2763 2761 /*
2764 2762 * Now do the `freemem' accounting.
2765 2763 */
2766 2764 pcf_index = PCF_INDEX();
2767 2765 p = &pcf[pcf_index];
2768 2766
2769 2767 ASSERT(p->pcf_block == 0);
2770 2768 ASSERT(p->pcf_wait == 0);
2771 2769 p->pcf_count += 1;
2772 2770
2773 2771 /* freemem is approximate, so this is OK */
2774 2772 freemem += 1;
2775 2773 }
2776 2774
2777 2775 void
2778 2776 page_free_pages(page_t *pp)
2779 2777 {
2780 2778 page_t *tpp, *rootpp = NULL;
2781 2779 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2782 2780 pgcnt_t i;
2783 2781 uint_t szc = pp->p_szc;
2784 2782
2785 2783 VM_STAT_ADD(pagecnt.pc_free_pages);
2786 2784 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2787 2785 "page_free_free:pp %p", pp);
2788 2786
2789 2787 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2790 2788 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2791 2789 panic("page_free_pages: not root page %p", (void *)pp);
2792 2790 /*NOTREACHED*/
2793 2791 }
2794 2792
2795 2793 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2796 2794 ASSERT((PAGE_EXCL(tpp) &&
2797 2795 !page_iolock_assert(tpp)) || panicstr);
2798 2796 if (PP_ISFREE(tpp)) {
2799 2797 panic("page_free_pages: page %p is free", (void *)tpp);
2800 2798 /*NOTREACHED*/
2801 2799 }
2802 2800 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2803 2801 tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2804 2802 panic("page_free_pages %p", (void *)tpp);
2805 2803 /*NOTREACHED*/
2806 2804 }
2807 2805
2808 2806 ASSERT(!hat_page_getshare(tpp));
2809 2807 ASSERT(tpp->p_vnode == NULL);
2810 2808 ASSERT(tpp->p_szc == szc);
2811 2809
2812 2810 PP_SETFREE(tpp);
2813 2811 page_clr_all_props(tpp);
2814 2812 PP_SETAGED(tpp);
2815 2813 tpp->p_offset = (u_offset_t)-1;
2816 2814 ASSERT(tpp->p_next == tpp);
2817 2815 ASSERT(tpp->p_prev == tpp);
2818 2816 page_list_concat(&rootpp, &tpp);
2819 2817 }
2820 2818 ASSERT(rootpp == pp);
2821 2819
2822 2820 page_list_add_pages(rootpp, 0);
2823 2821 page_create_putback(pgcnt);
2824 2822 }
2825 2823
2826 2824 int free_pages = 1;
2827 2825
2828 2826 /*
2829 2827 * This routine attempts to return pages to the cachelist via page_release().
2830 2828 * It does not *have* to be successful in all cases, since the pageout scanner
2831 2829 * will catch any pages it misses. It does need to be fast and not introduce
2832 2830 * too much overhead.
2833 2831 *
2834 2832 * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2835 2833 * don't lock and retry. This is ok, since the page scanner will eventually
2836 2834 * find any page we miss in free_vp_pages().
2837 2835 */
2838 2836 void
2839 2837 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2840 2838 {
2841 2839 page_t *pp;
2842 2840 u_offset_t eoff;
2843 2841 extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2844 2842
2845 2843 eoff = off + len;
2846 2844
2847 2845 if (free_pages == 0)
2848 2846 return;
2849 2847 if (swap_in_range(vp, off, len))
2850 2848 return;
2851 2849
2852 2850 for (; off < eoff; off += PAGESIZE) {
2853 2851
2854 2852 /*
2855 2853 * find the page using a fast, but inexact search. It'll be OK
2856 2854 * if a few pages slip through the cracks here.
2857 2855 */
2858 2856 pp = page_exists(vp, off);
2859 2857
2860 2858 /*
2861 2859 * If we didn't find the page (it may not exist), the page
2862 2860 * is free, looks still in use (shared), or we can't lock it,
2863 2861 * just give up.
2864 2862 */
2865 2863 if (pp == NULL ||
2866 2864 PP_ISFREE(pp) ||
2867 2865 page_share_cnt(pp) > 0 ||
2868 2866 !page_trylock(pp, SE_EXCL))
2869 2867 continue;
2870 2868
2871 2869 /*
2872 2870 * Once we have locked pp, verify that it's still the
2873 2871 * correct page and not already free
2874 2872 */
2875 2873 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2876 2874 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2877 2875 page_unlock(pp);
2878 2876 continue;
2879 2877 }
2880 2878
2881 2879 /*
2882 2880 * try to release the page...
2883 2881 */
2884 2882 (void) page_release(pp, 1);
2885 2883 }
2886 2884 }
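
free_vp_pages() relies on a common pattern: an unlocked, inexact lookup, a trylock that simply skips contended entries, and a re-check of the identity once the lock is held. A user-level sketch of that pattern, with an invented record type standing in for page_t:

/*
 * Editorial sketch (not kernel code): lookup without a lock, trylock, then
 * re-verify the identity under the lock before acting on the record.
 */
#include <pthread.h>
#include <stdio.h>

struct rec {
	pthread_mutex_t lock;
	int		id;	/* stands in for the [vp, off] identity */
	int		in_use;	/* stands in for PP_ISFREE / share checks */
};

static int
try_release(struct rec *r, int expected_id)
{
	if (r->in_use)				/* cheap unlocked screen */
		return (0);
	if (pthread_mutex_trylock(&r->lock) != 0)
		return (0);			/* someone else has it: skip */

	/* re-verify identity now that the lock is held */
	if (r->id != expected_id || r->in_use) {
		pthread_mutex_unlock(&r->lock);
		return (0);
	}
	/* ... release the record here ... */
	pthread_mutex_unlock(&r->lock);
	return (1);
}

int
main(void)
{
	struct rec r = { PTHREAD_MUTEX_INITIALIZER, 42, 0 };

	printf("released: %d\n", try_release(&r, 42));
	return (0);
}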
2887 2885
2888 2886 /*
2889 2887 * Reclaim the given page from the free list.
2890 2888 * If pp is part of a large pages, only the given constituent page is reclaimed
2891 2889 * and the large page it belonged to will be demoted. This can only happen
2892 2890 * if the page is not on the cachelist.
2893 2891 *
2894 2892 * Returns 1 on success or 0 on failure.
2895 2893 *
2896 2894 * The page is unlocked if it can't be reclaimed (when freemem == 0).
2897 2895 * If `lock' is non-null, it will be dropped and re-acquired if
2898 2896 * the routine must wait while freemem is 0.
2899 2897 *
2900 2898 * As it turns out, boot_getpages() does this. It picks a page,
2901 2899 * based on where OBP mapped in some address, gets its pfn, searches
2902 2900 * the memsegs, locks the page, then pulls it off the free list!
2903 2901 */
2904 2902 int
2905 2903 page_reclaim(page_t *pp, kmutex_t *lock)
2906 2904 {
2907 2905 struct pcf *p;
2908 2906 struct cpu *cpup;
2909 2907 int enough;
2910 2908 uint_t i;
2911 2909
2912 2910 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2913 2911 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2914 2912
2915 2913 /*
2916 2914 * If `freemem' is 0, we cannot reclaim this page from the
2917 2915 * freelist, so release every lock we might hold: the page,
2918 2916 * and the `lock' before blocking.
2919 2917 *
2920 2918 * The only way `freemem' can become 0 while there are pages
2921 2919 * marked free (have their p->p_free bit set) is when the
2922 2920 * system is low on memory and doing a page_create(). In
2923 2921 * order to guarantee that once page_create() starts acquiring
2924 2922 * pages it will be able to get all that it needs since `freemem'
2925 2923 * was decreased by the requested amount. So, we need to release
2926 2924 * this page, and let page_create() have it.
2927 2925 *
2928 2926 * Since `freemem' being zero is not supposed to happen, just
2929 2927 * use the usual hash stuff as a starting point. If that bucket
2930 2928 * is empty, then assume the worst, and start at the beginning
2931 2929 * of the pcf array. If we always start at the beginning
2932 2930 * when acquiring more than one pcf lock, there won't be any
2933 2931 * deadlock problems.
2934 2932 */
2935 2933
2936 2934 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
2937 2935
2938 2936 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2939 2937 pcf_acquire_all();
2940 2938 goto page_reclaim_nomem;
2941 2939 }
2942 2940
2943 2941 enough = pcf_decrement_bucket(1);
2944 2942
2945 2943 if (!enough) {
2946 2944 VM_STAT_ADD(page_reclaim_zero);
2947 2945 /*
2948 2946 * Check again. It's possible that some other thread
2949 2947 * could have been right behind us, and added one
2950 2948 * to a list somewhere. Acquire each of the pcf locks
2951 2949 * until we find a page.
2952 2950 */
2953 2951 p = pcf;
2954 2952 for (i = 0; i < pcf_fanout; i++) {
2955 2953 mutex_enter(&p->pcf_lock);
2956 2954 if (p->pcf_count >= 1) {
2957 2955 p->pcf_count -= 1;
2958 2956 /*
2959 2957 * freemem is not protected by any lock. Thus,
2960 2958 * we cannot have any assertion containing
2961 2959 * freemem here.
2962 2960 */
2963 2961 freemem -= 1;
2964 2962 enough = 1;
2965 2963 break;
2966 2964 }
2967 2965 p++;
2968 2966 }
2969 2967
2970 2968 if (!enough) {
2971 2969 page_reclaim_nomem:
2972 2970 /*
2973 2971 * We really can't have page `pp'.
2974 2972 * Time for the no-memory dance with
2975 2973 * page_free(). This is just like
2976 2974 * page_create_wait(). Plus the added
2977 2975 * attraction of releasing whatever mutex
2978 2976 * we held when we were called with in `lock'.
2979 2977 * Page_unlock() will wakeup any thread
2980 2978 * waiting around for this page.
2981 2979 */
2982 2980 if (lock) {
2983 2981 VM_STAT_ADD(page_reclaim_zero_locked);
2984 2982 mutex_exit(lock);
2985 2983 }
2986 2984 page_unlock(pp);
2987 2985
2988 2986 /*
2989 2987 * get this before we drop all the pcf locks.
2990 2988 */
2991 2989 mutex_enter(&new_freemem_lock);
2992 2990
2993 2991 p = pcf;
2994 2992 for (i = 0; i < pcf_fanout; i++) {
2995 2993 p->pcf_wait++;
2996 2994 mutex_exit(&p->pcf_lock);
2997 2995 p++;
2998 2996 }
2999 2997
3000 2998 freemem_wait++;
3001 2999 cv_wait(&freemem_cv, &new_freemem_lock);
3002 3000 freemem_wait--;
3003 3001
3004 3002 mutex_exit(&new_freemem_lock);
3005 3003
3006 3004 if (lock) {
3007 3005 mutex_enter(lock);
3008 3006 }
3009 3007 return (0);
3010 3008 }
3011 3009
3012 3010 /*
3013 3011 * The pcf accounting has been done,
3014 3012 * though none of the pcf_wait flags have been set,
3015 3013 * drop the locks and continue on.
3016 3014 */
3017 3015 while (p >= pcf) {
3018 3016 mutex_exit(&p->pcf_lock);
3019 3017 p--;
3020 3018 }
3021 3019 }
3022 3020
3023 3021
3024 3022 VM_STAT_ADD(pagecnt.pc_reclaim);
3025 3023
3026 3024 /*
3027 3025 * page_list_sub will handle the case where pp is a large page.
3028 3026 * It's possible that the page was promoted while on the freelist
3029 3027 */
3030 3028 if (PP_ISAGED(pp)) {
3031 3029 page_list_sub(pp, PG_FREE_LIST);
3032 3030 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3033 3031 "page_reclaim_free:pp %p", pp);
3034 3032 } else {
3035 3033 page_list_sub(pp, PG_CACHE_LIST);
3036 3034 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3037 3035 "page_reclaim_cache:pp %p", pp);
3038 3036 }
3039 3037
3040 3038 /*
3041 3039 * clear the p_free & p_age bits since this page is no longer
3042 3040 * on the free list. Notice that there was a brief time where
3043 3041 * a page is marked as free, but is not on the list.
3044 3042 *
3045 3043 * Set the reference bit to protect against immediate pageout.
3046 3044 */
3047 3045 PP_CLRFREE(pp);
3048 3046 PP_CLRAGED(pp);
3049 3047 page_set_props(pp, P_REF);
3050 3048
3051 3049 CPU_STATS_ENTER_K();
3052 3050 cpup = CPU; /* get cpup now that CPU cannot change */
3053 3051 CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3054 3052 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3055 3053 CPU_STATS_EXIT_K();
3056 3054 ASSERT(pp->p_szc == 0);
3057 3055
3058 3056 return (1);
3059 3057 }
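
The no-memory path above registers a waiter on every pcf bucket and sleeps on freemem_cv until some page_free() signals it. The pthread sketch below shows the same handshake with a single lock instead of the kernel's per-bucket locks; the bucket count and helper names are illustrative.

/*
 * Editorial sketch (not kernel code): a reclaimer that found every bucket
 * empty parks on one condvar, and a later free wakes exactly one waiter.
 */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define	NBUCKETS	4

static pthread_mutex_t new_freemem_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t freemem_cv = PTHREAD_COND_INITIALIZER;
static unsigned int bucket_count[NBUCKETS];	/* like pcf_count */
static unsigned int bucket_wait[NBUCKETS];	/* like pcf_wait */
static int freemem_wait;

static void *
waiter(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&new_freemem_lock);
	for (int i = 0; i < NBUCKETS; i++)
		bucket_wait[i]++;		/* any bucket may wake us */
	freemem_wait++;
	while (freemem_wait > 0)
		pthread_cond_wait(&freemem_cv, &new_freemem_lock);
	pthread_mutex_unlock(&new_freemem_lock);
	printf("waiter: woken, retrying the reclaim\n");
	return (NULL);
}

static void
free_one(int bucket)	/* a page_free()-style wakeup */
{
	pthread_mutex_lock(&new_freemem_lock);
	bucket_count[bucket]++;
	if (bucket_wait[bucket] != 0 && freemem_wait != 0) {
		bucket_wait[bucket]--;
		freemem_wait--;
		pthread_cond_signal(&freemem_cv);
	} else {
		bucket_wait[bucket] = 0;	/* nobody is waiting any more */
	}
	pthread_mutex_unlock(&new_freemem_lock);
}

int
main(void)
{
	pthread_t t;
	int blocked = 0;

	pthread_create(&t, NULL, waiter, NULL);
	while (!blocked) {			/* wait for the waiter to park */
		pthread_mutex_lock(&new_freemem_lock);
		blocked = (freemem_wait != 0);
		pthread_mutex_unlock(&new_freemem_lock);
		sched_yield();
	}
	free_one(2);				/* a page comes back */
	pthread_join(t, NULL);
	return (0);
}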
3060 3058
3061 3059 /*
3062 3060 * Destroy identity of the page and put it back on
3063 3061 * the page free list. Assumes that the caller has
3064 3062 * acquired the "exclusive" lock on the page.
3065 3063 */
3066 3064 void
3067 3065 page_destroy(page_t *pp, int dontfree)
3068 3066 {
3069 3067 ASSERT((PAGE_EXCL(pp) &&
3070 3068 !page_iolock_assert(pp)) || panicstr);
3071 3069 ASSERT(pp->p_slckcnt == 0 || panicstr);
3072 3070
3073 3071 if (pp->p_szc != 0) {
3074 3072 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3075 3073 PP_ISKAS(pp)) {
3076 3074 panic("page_destroy: anon or kernel or no vnode "
3077 3075 "large page %p", (void *)pp);
3078 3076 }
3079 3077 page_demote_vp_pages(pp);
3080 3078 ASSERT(pp->p_szc == 0);
3081 3079 }
3082 3080
3083 3081 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3084 3082
3085 3083 /*
3086 3084 * Unload translations, if any, then hash out the
3087 3085 * page to erase its identity.
3088 3086 */
3089 3087 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3090 3088 page_hashout(pp, NULL);
3091 3089
3092 3090 if (!dontfree) {
3093 3091 /*
3094 3092 * Acquire the "freemem_lock" for availrmem.
3095 3093 * The page_struct_lock need not be acquired for lckcnt
3096 3094 * and cowcnt since the page has an "exclusive" lock.
3097 3095 * We are doing a modified version of page_pp_unlock here.
3098 3096 */
3099 3097 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3100 3098 mutex_enter(&freemem_lock);
3101 3099 if (pp->p_lckcnt != 0) {
3102 3100 availrmem++;
3103 3101 pages_locked--;
3104 3102 pp->p_lckcnt = 0;
3105 3103 }
3106 3104 if (pp->p_cowcnt != 0) {
3107 3105 availrmem += pp->p_cowcnt;
3108 3106 pages_locked -= pp->p_cowcnt;
3109 3107 pp->p_cowcnt = 0;
3110 3108 }
3111 3109 mutex_exit(&freemem_lock);
3112 3110 }
3113 3111 /*
3114 3112 * Put the page on the "free" list.
3115 3113 */
3116 3114 page_free(pp, 0);
3117 3115 }
3118 3116 }
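
The freemem_lock section of page_destroy() gives back the availrmem that was charged when the page was locked or held for copy-on-write: one page for any non-zero lckcnt, plus one per cowcnt hold. A small sketch of that arithmetic with made-up starting values:

/*
 * Editorial sketch (not kernel code): undo the availrmem/pages_locked
 * accounting for a page that still carried lock or COW counts.
 */
#include <stdio.h>

static long availrmem = 1000;	/* illustrative starting values */
static long pages_locked = 40;

struct fake_page {
	unsigned int lckcnt;	/* like p_lckcnt */
	unsigned int cowcnt;	/* like p_cowcnt */
};

static void
return_locked_accounting(struct fake_page *pp)
{
	if (pp->lckcnt != 0) {
		availrmem += 1;		/* one page, however many locks */
		pages_locked -= 1;
		pp->lckcnt = 0;
	}
	if (pp->cowcnt != 0) {
		availrmem += pp->cowcnt;	/* each COW hold charged a page */
		pages_locked -= pp->cowcnt;
		pp->cowcnt = 0;
	}
}

int
main(void)
{
	struct fake_page pg = { 3, 2 };

	return_locked_accounting(&pg);
	printf("availrmem=%ld pages_locked=%ld\n", availrmem, pages_locked);
	/* prints availrmem=1003 pages_locked=37 */
	return (0);
}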
3119 3117
3120 3118 void
3121 3119 page_destroy_pages(page_t *pp)
3122 3120 {
3123 3121
3124 3122 page_t *tpp, *rootpp = NULL;
3125 3123 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
3126 3124 pgcnt_t i, pglcks = 0;
3127 3125 uint_t szc = pp->p_szc;
3128 3126
3129 3127 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3130 3128
3131 3129 VM_STAT_ADD(pagecnt.pc_destroy_pages);
3132 3130
3133 3131 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3134 3132
3135 3133 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3136 3134 panic("page_destroy_pages: not root page %p", (void *)pp);
3137 3135 /*NOTREACHED*/
3138 3136 }
3139 3137
3140 3138 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3141 3139 ASSERT((PAGE_EXCL(tpp) &&
3142 3140 !page_iolock_assert(tpp)) || panicstr);
3143 3141 ASSERT(tpp->p_slckcnt == 0 || panicstr);
3144 3142 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3145 3143 page_hashout(tpp, NULL);
3146 3144 ASSERT(tpp->p_offset == (u_offset_t)-1);
3147 3145 if (tpp->p_lckcnt != 0) {
3148 3146 pglcks++;
3149 3147 tpp->p_lckcnt = 0;
3150 3148 } else if (tpp->p_cowcnt != 0) {
3151 3149 pglcks += tpp->p_cowcnt;
3152 3150 tpp->p_cowcnt = 0;
3153 3151 }
3154 3152 ASSERT(!hat_page_getshare(tpp));
3155 3153 ASSERT(tpp->p_vnode == NULL);
3156 3154 ASSERT(tpp->p_szc == szc);
3157 3155
3158 3156 PP_SETFREE(tpp);
3159 3157 page_clr_all_props(tpp);
3160 3158 PP_SETAGED(tpp);
3161 3159 ASSERT(tpp->p_next == tpp);
3162 3160 ASSERT(tpp->p_prev == tpp);
3163 3161 page_list_concat(&rootpp, &tpp);
3164 3162 }
3165 3163
3166 3164 ASSERT(rootpp == pp);
3167 3165 if (pglcks != 0) {
3168 3166 mutex_enter(&freemem_lock);
3169 3167 availrmem += pglcks;
3170 3168 mutex_exit(&freemem_lock);
3171 3169 }
3172 3170
3173 3171 page_list_add_pages(rootpp, 0);
3174 3172 page_create_putback(pgcnt);
3175 3173 }
3176 3174
3177 3175 /*
3178 3176 * Similar to page_destroy(), but destroys pages which are
3179 3177 * locked and known to be on the page free list. Since
3180 3178 * the page is known to be free and locked, no one can access
3181 3179 * it.
3182 3180 *
3183 3181 * Also, the number of free pages does not change.
3184 3182 */
3185 3183 void
3186 3184 page_destroy_free(page_t *pp)
3187 3185 {
3188 3186 ASSERT(PAGE_EXCL(pp));
3189 3187 ASSERT(PP_ISFREE(pp));
3190 3188 ASSERT(pp->p_vnode);
3191 3189 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3192 3190 ASSERT(!hat_page_is_mapped(pp));
3193 3191 ASSERT(PP_ISAGED(pp) == 0);
3194 3192 ASSERT(pp->p_szc == 0);
3195 3193
3196 3194 VM_STAT_ADD(pagecnt.pc_destroy_free);
3197 3195 page_list_sub(pp, PG_CACHE_LIST);
3198 3196
3199 3197 page_hashout(pp, NULL);
3200 3198 ASSERT(pp->p_vnode == NULL);
3201 3199 ASSERT(pp->p_offset == (u_offset_t)-1);
3202 3200 ASSERT(pp->p_hash == NULL);
3203 3201
3204 3202 PP_SETAGED(pp);
3205 3203 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3206 3204 page_unlock(pp);
3207 3205
3208 3206 mutex_enter(&new_freemem_lock);
3209 3207 if (freemem_wait) {
3210 3208 cv_signal(&freemem_cv);
3211 3209 }
3212 3210 mutex_exit(&new_freemem_lock);
3213 3211 }
3214 3212
3215 3213 /*
3216 3214 * Rename the page "opp" to have an identity specified
3217 3215 * by [vp, off]. If a page already exists with this name
3218 3216 * it is locked and destroyed. Note that the page's
3219 3217 * translations are not unloaded during the rename.
3220 3218 *
3221 3219 * This routine is used by the anon layer to "steal" the
3222 3220 * original page and is not unlike destroying a page and
3223 3221 * creating a new page using the same page frame.
3224 3222 *
3225 3223 * XXX -- Could deadlock if caller 1 tries to rename A to B while
3226 3224 * caller 2 tries to rename B to A.
3227 3225 */
3228 3226 void
3229 3227 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3230 3228 {
3231 3229 page_t *pp;
3232 3230 int olckcnt = 0;
3233 3231 int ocowcnt = 0;
3234 3232 kmutex_t *phm;
3235 3233 ulong_t index;
3236 3234
3237 3235 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3238 3236 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3239 3237 ASSERT(PP_ISFREE(opp) == 0);
3240 3238
3241 3239 VM_STAT_ADD(page_rename_count);
3242 3240
3243 3241 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3244 3242 "page rename:pp %p vp %p off %llx", opp, vp, off);
3245 3243
3246 3244 /*
3247 3245 * CacheFS may call page_rename for a large NFS page
3248 3246 * when both CacheFS and NFS mount points are used
3249 3247 * by applications. Demote this large page before
3250 3248 * renaming it, to ensure that there are no "partial"
3251 3249 * large pages left lying around.
3252 3250 */
3253 3251 if (opp->p_szc != 0) {
3254 3252 vnode_t *ovp = opp->p_vnode;
3255 3253 ASSERT(ovp != NULL);
3256 3254 ASSERT(!IS_SWAPFSVP(ovp));
3257 3255 ASSERT(!VN_ISKAS(ovp));
3258 3256 page_demote_vp_pages(opp);
3259 3257 ASSERT(opp->p_szc == 0);
3260 3258 }
3261 3259
3262 3260 page_hashout(opp, NULL);
3263 3261 PP_CLRAGED(opp);
3264 3262
3265 3263 /*
3266 3264 * Acquire the appropriate page hash lock, since
3267 3265 * we're going to rename the page.
3268 3266 */
3269 3267 index = PAGE_HASH_FUNC(vp, off);
3270 3268 phm = PAGE_HASH_MUTEX(index);
3271 3269 mutex_enter(phm);
3272 3270 top:
3273 3271 /*
3274 3272 * Look for an existing page with this name and destroy it if found.
3275 3273 * By holding the page hash lock all the way to the page_hashin()
3276 3274 * call, we are assured that no page can be created with this
3277 3275 * identity. In the case when the phm lock is dropped to undo any
3278 3276 * hat layer mappings, the existing page is held with an "exclusive"
3279 3277 * lock, again preventing another page from being created with
3280 3278 * this identity.
3281 3279 */
3282 3280 PAGE_HASH_SEARCH(index, pp, vp, off);
3283 3281 if (pp != NULL) {
3284 3282 VM_STAT_ADD(page_rename_exists);
3285 3283
3286 3284 /*
3287 3285 * As it turns out, this is one of only two places where
3288 3286 * page_lock() needs to hold the passed in lock in the
3289 3287 * successful case. In all of the others, the lock could
3290 3288 * be dropped as soon as the attempt is made to lock
3291 3289 * the page. It is tempting to add yet another argument,
3292 3290 * PL_KEEP or PL_DROP, to let page_lock know what to do.
3293 3291 */
3294 3292 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3295 3293 /*
3296 3294 * Went to sleep because the page could not
3297 3295 * be locked. We were woken up when the page
3298 3296 * was unlocked, or when the page was destroyed.
3299 3297 * In either case, `phm' was dropped while we
3300 3298 * slept. Hence we should not just roar through
3301 3299 * this loop.
3302 3300 */
3303 3301 goto top;
3304 3302 }
3305 3303
3306 3304 /*
3307 3305 * If an existing page is a large page, then demote
3308 3306 * it to ensure that no "partial" large pages are
3309 3307 * "created" after page_rename. An existing page
3310 3308 * can be a CacheFS page, and can't belong to swapfs.
3311 3309 */
3312 3310 if (hat_page_is_mapped(pp)) {
3313 3311 /*
3314 3312 * Unload translations. Since we hold the
3315 3313 * exclusive lock on this page, the page
3316 3314 * can not be changed while we drop phm.
3317 3315 * This is also not a lock protocol violation,
3318 3316 * but rather the proper way to do things.
3319 3317 */
3320 3318 mutex_exit(phm);
3321 3319 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3322 3320 if (pp->p_szc != 0) {
3323 3321 ASSERT(!IS_SWAPFSVP(vp));
3324 3322 ASSERT(!VN_ISKAS(vp));
3325 3323 page_demote_vp_pages(pp);
3326 3324 ASSERT(pp->p_szc == 0);
3327 3325 }
3328 3326 mutex_enter(phm);
3329 3327 } else if (pp->p_szc != 0) {
3330 3328 ASSERT(!IS_SWAPFSVP(vp));
3331 3329 ASSERT(!VN_ISKAS(vp));
3332 3330 mutex_exit(phm);
3333 3331 page_demote_vp_pages(pp);
3334 3332 ASSERT(pp->p_szc == 0);
3335 3333 mutex_enter(phm);
3336 3334 }
3337 3335 page_hashout(pp, phm);
3338 3336 }
3339 3337 /*
3340 3338 * Hash in the page with the new identity.
3341 3339 */
3342 3340 if (!page_hashin(opp, vp, off, phm)) {
3343 3341 /*
3344 3342 * We were holding phm while we searched for [vp, off]
3345 3343 * and only dropped phm if we found and locked a page.
3346 3344 * If we can't create this page now, then something
3347 3345 * is really broken.
3348 3346 */
3349 3347 panic("page_rename: Can't hash in page: %p", (void *)pp);
3350 3348 /*NOTREACHED*/
3351 3349 }
3352 3350
3353 3351 ASSERT(MUTEX_HELD(phm));
3354 3352 mutex_exit(phm);
3355 3353
3356 3354 /*
3357 3355 * Now that we have dropped phm, lets get around to finishing up
3358 3356 * with pp.
3359 3357 */
3360 3358 if (pp != NULL) {
3361 3359 ASSERT(!hat_page_is_mapped(pp));
3362 3360 /* for now large pages should not end up here */
3363 3361 ASSERT(pp->p_szc == 0);
3364 3362 /*
3365 3363 * Save the locks for transfer to the new page and then
3366 3364 * clear them so page_free doesn't think they're important.
3367 3365 * The page_struct_lock need not be acquired for lckcnt and
3368 3366 * cowcnt since the page has an "exclusive" lock.
3369 3367 */
3370 3368 olckcnt = pp->p_lckcnt;
3371 3369 ocowcnt = pp->p_cowcnt;
3372 3370 pp->p_lckcnt = pp->p_cowcnt = 0;
3373 3371
3374 3372 /*
3375 3373 * Put the page on the "free" list after we drop
3376 3374 * the lock. The less work under the lock the better.
3377 3375 */
3378 3376 /*LINTED: constant in conditional context*/
3379 3377 VN_DISPOSE(pp, B_FREE, 0, kcred);
3380 3378 }
3381 3379
3382 3380 /*
3383 3381 * Transfer the lock count from the old page (if any).
3384 3382 * The page_struct_lock need not be acquired for lckcnt and
3385 3383 * cowcnt since the page has an "exclusive" lock.
3386 3384 */
3387 3385 opp->p_lckcnt += olckcnt;
3388 3386 opp->p_cowcnt += ocowcnt;
3389 3387 }
3390 3388
3391 3389 /*
3392 3390 * low level routine to add page `pp' to the hash and vp chains for [vp, offset]
3393 3391 *
3394 3392 * Pages are normally inserted at the start of a vnode's v_pages list.
3395 3393 * If the vnode is VMODSORT and the page is modified, it goes at the end.
3396 3394 * This can happen when a modified page is relocated for DR.
3397 3395 *
3398 3396 * Returns 1 on success and 0 on failure.
3399 3397 */
3400 3398 static int
3401 3399 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3402 3400 {
3403 3401 page_t **listp;
3404 3402 page_t *tp;
3405 3403 ulong_t index;
3406 3404
3407 3405 ASSERT(PAGE_EXCL(pp));
3408 3406 ASSERT(vp != NULL);
3409 3407 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3410 3408
3411 3409 /*
3412 3410 * Be sure to set these up before the page is inserted on the hash
3413 3411 * list. As soon as the page is placed on the list some other
3414 3412 * thread might get confused and wonder how this page could
3415 3413 * possibly hash to this list.
3416 3414 */
3417 3415 pp->p_vnode = vp;
3418 3416 pp->p_offset = offset;
3419 3417
3420 3418 /*
3421 3419 * record if this page is on a swap vnode
3422 3420 */
3423 3421 if ((vp->v_flag & VISSWAP) != 0)
3424 3422 PP_SETSWAP(pp);
3425 3423
3426 3424 index = PAGE_HASH_FUNC(vp, offset);
3427 3425 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3428 3426 listp = &page_hash[index];
3429 3427
3430 3428 /*
3431 3429 * If this page is already hashed in, fail this attempt to add it.
3432 3430 */
3433 3431 for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3434 3432 if (tp->p_vnode == vp && tp->p_offset == offset) {
3435 3433 pp->p_vnode = NULL;
3436 3434 pp->p_offset = (u_offset_t)(-1);
3437 3435 return (0);
3438 3436 }
3439 3437 }
3440 3438 pp->p_hash = *listp;
3441 3439 *listp = pp;
3442 3440
3443 3441 /*
3444 3442 * Add the page to the vnode's list of pages
3445 3443 */
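          	/*
          	 * For VMODSORT vnodes a modified page is appended at the tail of
          	 * the circular v_pages list: inserting through the tail's
          	 * p_vpnext pointer leaves v_pages itself still pointing at the
          	 * head.
          	 */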
3446 3444 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3447 3445 listp = &vp->v_pages->p_vpprev->p_vpnext;
3448 3446 else
3449 3447 listp = &vp->v_pages;
3450 3448
3451 3449 page_vpadd(listp, pp);
3452 3450
3453 3451 return (1);
3454 3452 }
3455 3453
3456 3454 /*
3457 3455 * Add page `pp' to both the hash and vp chains for [vp, offset].
3458 3456 *
3459 3457 * Returns 1 on success and 0 on failure.
3460 3458 * If hold is passed in, it is not dropped.
3461 3459 */
3462 3460 int
3463 3461 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3464 3462 {
3465 3463 kmutex_t *phm = NULL;
3466 3464 kmutex_t *vphm;
3467 3465 int rc;
3468 3466
3469 3467 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3470 3468 ASSERT(pp->p_fsdata == 0 || panicstr);
3471 3469
3472 3470 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3473 3471 "page_hashin:pp %p vp %p offset %llx",
3474 3472 pp, vp, offset);
3475 3473
3476 3474 VM_STAT_ADD(hashin_count);
3477 3475
3478 3476 if (hold != NULL)
3479 3477 phm = hold;
3480 3478 else {
3481 3479 VM_STAT_ADD(hashin_not_held);
3482 3480 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3483 3481 mutex_enter(phm);
3484 3482 }
3485 3483
3486 3484 vphm = page_vnode_mutex(vp);
3487 3485 mutex_enter(vphm);
3488 3486 rc = page_do_hashin(pp, vp, offset);
3489 3487 mutex_exit(vphm);
3490 3488 if (hold == NULL)
3491 3489 mutex_exit(phm);
3492 3490 if (rc == 0)
3493 3491 VM_STAT_ADD(hashin_already);
3494 3492 return (rc);
3495 3493 }
3496 3494
3497 3495 /*
3498 3496 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3499 3497 * All mutexes must be held
3500 3498 */
3501 3499 static void
3502 3500 page_do_hashout(page_t *pp)
3503 3501 {
3504 3502 page_t **hpp;
3505 3503 page_t *hp;
3506 3504 vnode_t *vp = pp->p_vnode;
3507 3505
3508 3506 ASSERT(vp != NULL);
3509 3507 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3510 3508
3511 3509 /*
3512 3510 * First, take pp off of its hash chain.
3513 3511 */
3514 3512 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3515 3513
3516 3514 for (;;) {
3517 3515 hp = *hpp;
3518 3516 if (hp == pp)
3519 3517 break;
3520 3518 if (hp == NULL) {
3521 3519 panic("page_do_hashout");
3522 3520 /*NOTREACHED*/
3523 3521 }
3524 3522 hpp = &hp->p_hash;
3525 3523 }
3526 3524 *hpp = pp->p_hash;
3527 3525
3528 3526 /*
3529 3527 * Now remove it from its associated vnode.
3530 3528 */
3531 3529 if (vp->v_pages)
3532 3530 page_vpsub(&vp->v_pages, pp);
3533 3531
3534 3532 pp->p_hash = NULL;
3535 3533 page_clr_all_props(pp);
3536 3534 PP_CLRSWAP(pp);
3537 3535 pp->p_vnode = NULL;
3538 3536 pp->p_offset = (u_offset_t)-1;
3539 3537 pp->p_fsdata = 0;
3540 3538 }
3541 3539
3542 3540 /*
3543 3541 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3544 3542 *
3545 3543 * When `phm' is non-NULL it contains the address of the mutex protecting the
3546 3544 * hash list pp is on. It is not dropped.
3547 3545 */
3548 3546 void
3549 3547 page_hashout(page_t *pp, kmutex_t *phm)
3550 3548 {
3551 3549 vnode_t *vp;
3552 3550 ulong_t index;
3553 3551 kmutex_t *nphm;
3554 3552 kmutex_t *vphm;
3555 3553 kmutex_t *sep;
3556 3554
3557 3555 ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3558 3556 ASSERT(pp->p_vnode != NULL);
3559 3557 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3560 3558 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3561 3559
3562 3560 vp = pp->p_vnode;
3563 3561
3564 3562 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3565 3563 "page_hashout:pp %p vp %p", pp, vp);
3566 3564
3567 3565 /* Kernel probe */
3568 3566 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3569 3567 tnf_opaque, vnode, vp,
3570 3568 tnf_offset, offset, pp->p_offset);
3571 3569
3572 3570 /*
3573 3571 *
3574 3572 */
3575 3573 VM_STAT_ADD(hashout_count);
3576 3574 index = PAGE_HASH_FUNC(vp, pp->p_offset);
3577 3575 if (phm == NULL) {
3578 3576 VM_STAT_ADD(hashout_not_held);
3579 3577 nphm = PAGE_HASH_MUTEX(index);
3580 3578 mutex_enter(nphm);
3581 3579 }
3582 3580 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3583 3581
3584 3582
3585 3583 /*
3586 3584	 * grab the page vnode mutex and remove the page...
3587 3585 */
3588 3586 vphm = page_vnode_mutex(vp);
3589 3587 mutex_enter(vphm);
3590 3588
3591 3589 page_do_hashout(pp);
3592 3590
3593 3591 mutex_exit(vphm);
3594 3592 if (phm == NULL)
3595 3593 mutex_exit(nphm);
3596 3594
3597 3595 /*
3598 3596 * Wake up processes waiting for this page. The page's
3599 3597 * identity has been changed, and is probably not the
3600 3598 * desired page any longer.
3601 3599 */
3602 3600 sep = page_se_mutex(pp);
3603 3601 mutex_enter(sep);
3604 3602 pp->p_selock &= ~SE_EWANTED;
3605 3603 if (CV_HAS_WAITERS(&pp->p_cv))
3606 3604 cv_broadcast(&pp->p_cv);
3607 3605 mutex_exit(sep);
3608 3606 }
3609 3607
3610 3608 /*
3611 3609 * Add the page to the front of a linked list of pages
3612 3610 * using the p_next & p_prev pointers for the list.
3613 3611 * The caller is responsible for protecting the list pointers.
3614 3612 */
3615 3613 void
3616 3614 page_add(page_t **ppp, page_t *pp)
3617 3615 {
3618 3616 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3619 3617
3620 3618 page_add_common(ppp, pp);
3621 3619 }
3622 3620
3623 3621
3624 3622
3625 3623 /*
3626 3624 * Common code for page_add() and mach_page_add()
3627 3625 */
3628 3626 void
3629 3627 page_add_common(page_t **ppp, page_t *pp)
3630 3628 {
3631 3629 if (*ppp == NULL) {
3632 3630 pp->p_next = pp->p_prev = pp;
3633 3631 } else {
3634 3632 pp->p_next = *ppp;
3635 3633 pp->p_prev = (*ppp)->p_prev;
3636 3634 (*ppp)->p_prev = pp;
3637 3635 pp->p_prev->p_next = pp;
3638 3636 }
3639 3637 *ppp = pp;
3640 3638 }
3641 3639
3642 3640
3643 3641 /*
3644 3642 * Remove this page from a linked list of pages
3645 3643 * using the p_next & p_prev pointers for the list.
3646 3644 *
3647 3645 * The caller is responsible for protecting the list pointers.
3648 3646 */
3649 3647 void
3650 3648 page_sub(page_t **ppp, page_t *pp)
3651 3649 {
3652 3650 ASSERT((PP_ISFREE(pp)) ? 1 :
3653 3651 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3654 3652
3655 3653 if (*ppp == NULL || pp == NULL) {
3656 3654 panic("page_sub: bad arg(s): pp %p, *ppp %p",
3657 3655 (void *)pp, (void *)(*ppp));
3658 3656 /*NOTREACHED*/
3659 3657 }
3660 3658
3661 3659 page_sub_common(ppp, pp);
3662 3660 }
3663 3661
3664 3662
3665 3663 /*
3666 3664 * Common code for page_sub() and mach_page_sub()
3667 3665 */
3668 3666 void
3669 3667 page_sub_common(page_t **ppp, page_t *pp)
3670 3668 {
3671 3669 if (*ppp == pp)
3672 3670 *ppp = pp->p_next; /* go to next page */
3673 3671
3674 3672 if (*ppp == pp)
3675 3673 *ppp = NULL; /* page list is gone */
3676 3674 else {
3677 3675 pp->p_prev->p_next = pp->p_next;
3678 3676 pp->p_next->p_prev = pp->p_prev;
3679 3677 }
3680 3678 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
3681 3679 }
3682 3680
3683 3681
3684 3682 /*
3685 3683	 * Break page list oppp into two lists with npages in the first list.
3686 3684 * The tail is returned in nppp.
3687 3685 */
3688 3686 void
3689 3687 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3690 3688 {
3691 3689 page_t *s1pp = *oppp;
3692 3690 page_t *s2pp;
3693 3691 page_t *e1pp, *e2pp;
3694 3692 long n = 0;
3695 3693
3696 3694 if (s1pp == NULL) {
3697 3695 *nppp = NULL;
3698 3696 return;
3699 3697 }
3700 3698 if (npages == 0) {
3701 3699 *nppp = s1pp;
3702 3700 *oppp = NULL;
3703 3701 return;
3704 3702 }
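          	/* Walk npages forward; s2pp will then head the second (tail) list. */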
3705 3703 for (n = 0, s2pp = *oppp; n < npages; n++) {
3706 3704 s2pp = s2pp->p_next;
3707 3705 }
3708 3706 /* Fix head and tail of new lists */
3709 3707 e1pp = s2pp->p_prev;
3710 3708 e2pp = s1pp->p_prev;
3711 3709 s1pp->p_prev = e1pp;
3712 3710 e1pp->p_next = s1pp;
3713 3711 s2pp->p_prev = e2pp;
3714 3712 e2pp->p_next = s2pp;
3715 3713
3716 3714 /* second list empty */
3717 3715 if (s2pp == s1pp) {
3718 3716 *oppp = s1pp;
3719 3717 *nppp = NULL;
3720 3718 } else {
3721 3719 *oppp = s1pp;
3722 3720 *nppp = s2pp;
3723 3721 }
3724 3722 }
3725 3723
3726 3724 /*
3727 3725 * Concatenate page list nppp onto the end of list ppp.
3728 3726 */
3729 3727 void
3730 3728 page_list_concat(page_t **ppp, page_t **nppp)
3731 3729 {
3732 3730 page_t *s1pp, *s2pp, *e1pp, *e2pp;
3733 3731
3734 3732 if (*nppp == NULL) {
3735 3733 return;
3736 3734 }
3737 3735 if (*ppp == NULL) {
3738 3736 *ppp = *nppp;
3739 3737 return;
3740 3738 }
3741 3739 s1pp = *ppp;
3742 3740 e1pp = s1pp->p_prev;
3743 3741 s2pp = *nppp;
3744 3742 e2pp = s2pp->p_prev;
3745 3743 s1pp->p_prev = e2pp;
3746 3744 e2pp->p_next = s1pp;
3747 3745 e1pp->p_next = s2pp;
3748 3746 s2pp->p_prev = e1pp;
3749 3747 }
3750 3748
3751 3749 /*
3752 3750 * return the next page in the page list
3753 3751 */
3754 3752 page_t *
3755 3753 page_list_next(page_t *pp)
3756 3754 {
3757 3755 return (pp->p_next);
3758 3756 }
3759 3757
3760 3758
3761 3759 /*
3762 3760 * Add the page to the front of the linked list of pages
3763 3761 * using p_vpnext/p_vpprev pointers for the list.
3764 3762 *
3765 3763 * The caller is responsible for protecting the lists.
3766 3764 */
3767 3765 void
3768 3766 page_vpadd(page_t **ppp, page_t *pp)
3769 3767 {
3770 3768 if (*ppp == NULL) {
3771 3769 pp->p_vpnext = pp->p_vpprev = pp;
3772 3770 } else {
3773 3771 pp->p_vpnext = *ppp;
3774 3772 pp->p_vpprev = (*ppp)->p_vpprev;
3775 3773 (*ppp)->p_vpprev = pp;
3776 3774 pp->p_vpprev->p_vpnext = pp;
3777 3775 }
3778 3776 *ppp = pp;
3779 3777 }
3780 3778
3781 3779 /*
3782 3780 * Remove this page from the linked list of pages
3783 3781 * using p_vpnext/p_vpprev pointers for the list.
3784 3782 *
3785 3783 * The caller is responsible for protecting the lists.
3786 3784 */
3787 3785 void
3788 3786 page_vpsub(page_t **ppp, page_t *pp)
3789 3787 {
3790 3788 if (*ppp == NULL || pp == NULL) {
3791 3789 panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3792 3790 (void *)pp, (void *)(*ppp));
3793 3791 /*NOTREACHED*/
3794 3792 }
3795 3793
3796 3794 if (*ppp == pp)
3797 3795 *ppp = pp->p_vpnext; /* go to next page */
3798 3796
3799 3797 if (*ppp == pp)
3800 3798 *ppp = NULL; /* page list is gone */
3801 3799 else {
3802 3800 pp->p_vpprev->p_vpnext = pp->p_vpnext;
3803 3801 pp->p_vpnext->p_vpprev = pp->p_vpprev;
3804 3802 }
3805 3803 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */
3806 3804 }
3807 3805
3808 3806 /*
3809 3807 * Lock a physical page into memory "long term". Used to support "lock
3810 3808 * in memory" functions. Accepts the page to be locked, and a cow variable
3811 3809	 * to indicate whether the lock will travel to the new page during
3812 3810 * a potential copy-on-write.
3813 3811 */
3814 3812 int
3815 3813 page_pp_lock(
3816 3814 page_t *pp, /* page to be locked */
3817 3815 int cow, /* cow lock */
3818 3816 int kernel) /* must succeed -- ignore checking */
3819 3817 {
3820 3818 int r = 0; /* result -- assume failure */
3821 3819
3822 3820 ASSERT(PAGE_LOCKED(pp));
3823 3821
3824 3822 page_struct_lock(pp);
3825 3823 /*
3826 3824 * Acquire the "freemem_lock" for availrmem.
3827 3825 */
3828 3826 if (cow) {
3829 3827 mutex_enter(&freemem_lock);
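          		/*
          		 * Grant the lock only if availrmem stays above the
          		 * pages_pp_maximum floor and the cow count has room.
          		 */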
3830 3828 if ((availrmem > pages_pp_maximum) &&
3831 3829 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3832 3830 availrmem--;
3833 3831 pages_locked++;
3834 3832 mutex_exit(&freemem_lock);
3835 3833 r = 1;
3836 3834 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3837 3835 cmn_err(CE_WARN,
3838 3836 "COW lock limit reached on pfn 0x%lx",
3839 3837 page_pptonum(pp));
3840 3838 }
3841 3839 } else
3842 3840 mutex_exit(&freemem_lock);
3843 3841 } else {
3844 3842 if (pp->p_lckcnt) {
3845 3843 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3846 3844 r = 1;
3847 3845 if (++pp->p_lckcnt ==
3848 3846 (ushort_t)PAGE_LOCK_MAXIMUM) {
3849 3847 cmn_err(CE_WARN, "Page lock limit "
3850 3848 "reached on pfn 0x%lx",
3851 3849 page_pptonum(pp));
3852 3850 }
3853 3851 }
3854 3852 } else {
3855 3853 if (kernel) {
3856 3854 /* availrmem accounting done by caller */
3857 3855 ++pp->p_lckcnt;
3858 3856 r = 1;
3859 3857 } else {
3860 3858 mutex_enter(&freemem_lock);
3861 3859 if (availrmem > pages_pp_maximum) {
3862 3860 availrmem--;
3863 3861 pages_locked++;
3864 3862 ++pp->p_lckcnt;
3865 3863 r = 1;
3866 3864 }
3867 3865 mutex_exit(&freemem_lock);
3868 3866 }
3869 3867 }
3870 3868 }
3871 3869 page_struct_unlock(pp);
3872 3870 return (r);
3873 3871 }
3874 3872
3875 3873 /*
3876 3874 * Decommit a lock on a physical page frame. Account for cow locks if
3877 3875 * appropriate.
3878 3876 */
3879 3877 void
3880 3878 page_pp_unlock(
3881 3879 page_t *pp, /* page to be unlocked */
3882 3880 int cow, /* expect cow lock */
3883 3881 int kernel) /* this was a kernel lock */
3884 3882 {
3885 3883 ASSERT(PAGE_LOCKED(pp));
3886 3884
3887 3885 page_struct_lock(pp);
3888 3886 /*
3889 3887 * Acquire the "freemem_lock" for availrmem.
3890 3888	 * If cowcnt or lckcnt is already 0, do nothing; i.e., we
3891 3889 * could be called to unlock even if nothing is locked. This could
3892 3890 * happen if locked file pages were truncated (removing the lock)
3893 3891 * and the file was grown again and new pages faulted in; the new
3894 3892 * pages are unlocked but the segment still thinks they're locked.
3895 3893 */
3896 3894 if (cow) {
3897 3895 if (pp->p_cowcnt) {
3898 3896 mutex_enter(&freemem_lock);
3899 3897 pp->p_cowcnt--;
3900 3898 availrmem++;
3901 3899 pages_locked--;
3902 3900 mutex_exit(&freemem_lock);
3903 3901 }
3904 3902 } else {
3905 3903 if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3906 3904 if (!kernel) {
3907 3905 mutex_enter(&freemem_lock);
3908 3906 availrmem++;
3909 3907 pages_locked--;
3910 3908 mutex_exit(&freemem_lock);
3911 3909 }
3912 3910 }
3913 3911 }
3914 3912 page_struct_unlock(pp);
3915 3913 }
3916 3914
3917 3915 /*
3918 3916 * This routine reserves availrmem for npages;
3919 3917 * flags: KM_NOSLEEP or KM_SLEEP
3920 3918 * returns 1 on success or 0 on failure
3921 3919 */
3922 3920 int
3923 3921 page_resv(pgcnt_t npages, uint_t flags)
3924 3922 {
3925 3923 mutex_enter(&freemem_lock);
3926 3924 while (availrmem < tune.t_minarmem + npages) {
3927 3925 if (flags & KM_NOSLEEP) {
3928 3926 mutex_exit(&freemem_lock);
3929 3927 return (0);
3930 3928 }
3931 3929 mutex_exit(&freemem_lock);
3932 3930 page_needfree(npages);
3933 3931 kmem_reap();
3934 3932 delay(hz >> 2);
3935 3933 page_needfree(-(spgcnt_t)npages);
3936 3934 mutex_enter(&freemem_lock);
3937 3935 }
3938 3936 availrmem -= npages;
3939 3937 mutex_exit(&freemem_lock);
3940 3938 return (1);
3941 3939 }
3942 3940
3943 3941 /*
3944 3942 * This routine unreserves availrmem for npages;
3945 3943 */
3946 3944 void
3947 3945 page_unresv(pgcnt_t npages)
3948 3946 {
3949 3947 mutex_enter(&freemem_lock);
3950 3948 availrmem += npages;
3951 3949 mutex_exit(&freemem_lock);
3952 3950 }
3953 3951
3954 3952 /*
3955 3953 * See Statement at the beginning of segvn_lockop() regarding
3956 3954 * the way we handle cowcnts and lckcnts.
3957 3955 *
3958 3956 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3959 3957 * that breaks COW has PROT_WRITE.
3960 3958 *
3961 3959	 * Note that we may also break COW in case we are softlocking
3962 3960 * on read access during physio;
3963 3961 * in this softlock case, the vpage may not have PROT_WRITE.
3964 3962 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3965 3963 * if the vpage doesn't have PROT_WRITE.
3966 3964 *
3967 3965 * This routine is never called if we are stealing a page
3968 3966 * in anon_private.
3969 3967 *
3970 3968	 * The caller subtracted from availrmem for the read-only mapping.
3971 3969	 * If lckcnt is 1, increment availrmem.
3972 3970 */
3973 3971 void
3974 3972 page_pp_useclaim(
3975 3973 page_t *opp, /* original page frame losing lock */
3976 3974 page_t *npp, /* new page frame gaining lock */
3977 3975 uint_t write_perm) /* set if vpage has PROT_WRITE */
3978 3976 {
3979 3977 int payback = 0;
3980 3978 int nidx, oidx;
3981 3979
3982 3980 ASSERT(PAGE_LOCKED(opp));
3983 3981 ASSERT(PAGE_LOCKED(npp));
3984 3982
3985 3983 /*
3986 3984 * Since we have two pages we probably have two locks. We need to take
3987 3985 * them in a defined order to avoid deadlocks. It's also possible they
3988 3986 * both hash to the same lock in which case this is a non-issue.
3989 3987 */
3990 3988 nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
3991 3989 oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
3992 3990 if (nidx < oidx) {
3993 3991 page_struct_lock(npp);
3994 3992 page_struct_lock(opp);
3995 3993 } else if (oidx < nidx) {
3996 3994 page_struct_lock(opp);
3997 3995 page_struct_lock(npp);
3998 3996 } else { /* The pages hash to the same lock */
3999 3997 page_struct_lock(npp);
4000 3998 }
4001 3999
4002 4000 ASSERT(npp->p_cowcnt == 0);
4003 4001 ASSERT(npp->p_lckcnt == 0);
4004 4002
4005 4003 /* Don't use claim if nothing is locked (see page_pp_unlock above) */
4006 4004 if ((write_perm && opp->p_cowcnt != 0) ||
4007 4005 (!write_perm && opp->p_lckcnt != 0)) {
4008 4006
4009 4007 if (write_perm) {
4010 4008 npp->p_cowcnt++;
4011 4009 ASSERT(opp->p_cowcnt != 0);
4012 4010 opp->p_cowcnt--;
4013 4011 } else {
4014 4012
4015 4013 ASSERT(opp->p_lckcnt != 0);
4016 4014
4017 4015 /*
4018 4016 * We didn't need availrmem decremented if p_lckcnt on
4019 4017			 * the original page is 1. Here, we are unlocking a
4020 4018			 * read-only copy belonging to the original page and
4021 4019			 * locking a copy belonging to the new page.
4022 4020 */
4023 4021 if (opp->p_lckcnt == 1)
4024 4022 payback = 1;
4025 4023
4026 4024 npp->p_lckcnt++;
4027 4025 opp->p_lckcnt--;
4028 4026 }
4029 4027 }
4030 4028 if (payback) {
4031 4029 mutex_enter(&freemem_lock);
4032 4030 availrmem++;
4033 4031 pages_useclaim--;
4034 4032 mutex_exit(&freemem_lock);
4035 4033 }
4036 4034
4037 4035 if (nidx < oidx) {
4038 4036 page_struct_unlock(opp);
4039 4037 page_struct_unlock(npp);
4040 4038 } else if (oidx < nidx) {
4041 4039 page_struct_unlock(npp);
4042 4040 page_struct_unlock(opp);
4043 4041 } else { /* The pages hash to the same lock */
4044 4042 page_struct_unlock(npp);
4045 4043 }
4046 4044 }
4047 4045
4048 4046 /*
4049 4047 * Simple claim adjust functions -- used to support changes in
4050 4048 * claims due to changes in access permissions. Used by segvn_setprot().
4051 4049 */
4052 4050 int
4053 4051 page_addclaim(page_t *pp)
4054 4052 {
4055 4053 int r = 0; /* result */
4056 4054
4057 4055 ASSERT(PAGE_LOCKED(pp));
4058 4056
4059 4057 page_struct_lock(pp);
4060 4058 ASSERT(pp->p_lckcnt != 0);
4061 4059
4062 4060 if (pp->p_lckcnt == 1) {
4063 4061 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4064 4062 --pp->p_lckcnt;
4065 4063 r = 1;
4066 4064 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4067 4065 cmn_err(CE_WARN,
4068 4066 "COW lock limit reached on pfn 0x%lx",
4069 4067 page_pptonum(pp));
4070 4068 }
4071 4069 }
4072 4070 } else {
4073 4071 mutex_enter(&freemem_lock);
4074 4072 if ((availrmem > pages_pp_maximum) &&
4075 4073 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4076 4074 --availrmem;
4077 4075 ++pages_claimed;
4078 4076 mutex_exit(&freemem_lock);
4079 4077 --pp->p_lckcnt;
4080 4078 r = 1;
4081 4079 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4082 4080 cmn_err(CE_WARN,
4083 4081 "COW lock limit reached on pfn 0x%lx",
4084 4082 page_pptonum(pp));
4085 4083 }
4086 4084 } else
4087 4085 mutex_exit(&freemem_lock);
4088 4086 }
4089 4087 page_struct_unlock(pp);
4090 4088 return (r);
4091 4089 }
4092 4090
4093 4091 int
4094 4092 page_subclaim(page_t *pp)
4095 4093 {
4096 4094 int r = 0;
4097 4095
4098 4096 ASSERT(PAGE_LOCKED(pp));
4099 4097
4100 4098 page_struct_lock(pp);
4101 4099 ASSERT(pp->p_cowcnt != 0);
4102 4100
4103 4101 if (pp->p_lckcnt) {
4104 4102 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4105 4103 r = 1;
4106 4104 /*
4107 4105 * for availrmem
4108 4106 */
4109 4107 mutex_enter(&freemem_lock);
4110 4108 availrmem++;
4111 4109 pages_claimed--;
4112 4110 mutex_exit(&freemem_lock);
4113 4111
4114 4112 pp->p_cowcnt--;
4115 4113
4116 4114 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4117 4115 cmn_err(CE_WARN,
4118 4116 "Page lock limit reached on pfn 0x%lx",
4119 4117 page_pptonum(pp));
4120 4118 }
4121 4119 }
4122 4120 } else {
4123 4121 r = 1;
4124 4122 pp->p_cowcnt--;
4125 4123 pp->p_lckcnt++;
4126 4124 }
4127 4125 page_struct_unlock(pp);
4128 4126 return (r);
4129 4127 }
4130 4128
4131 4129 /*
4132 4130 * Variant of page_addclaim(), where ppa[] contains the pages of a single large
4133 4131 * page.
4134 4132 */
4135 4133 int
4136 4134 page_addclaim_pages(page_t **ppa)
4137 4135 {
4138 4136 pgcnt_t lckpgs = 0, pg_idx;
4139 4137
4140 4138 VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4141 4139
4142 4140 /*
4143 4141 * Only need to take the page struct lock on the large page root.
4144 4142 */
4145 4143 page_struct_lock(ppa[0]);
4146 4144 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4147 4145
4148 4146 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4149 4147 ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4150 4148 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4151 4149 page_struct_unlock(ppa[0]);
4152 4150 return (0);
4153 4151 }
4154 4152 if (ppa[pg_idx]->p_lckcnt > 1)
4155 4153 lckpgs++;
4156 4154 }
4157 4155
4158 4156 if (lckpgs != 0) {
4159 4157 mutex_enter(&freemem_lock);
4160 4158 if (availrmem >= pages_pp_maximum + lckpgs) {
4161 4159 availrmem -= lckpgs;
4162 4160 pages_claimed += lckpgs;
4163 4161 } else {
4164 4162 mutex_exit(&freemem_lock);
4165 4163 page_struct_unlock(ppa[0]);
4166 4164 return (0);
4167 4165 }
4168 4166 mutex_exit(&freemem_lock);
4169 4167 }
4170 4168
4171 4169 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4172 4170 ppa[pg_idx]->p_lckcnt--;
4173 4171 ppa[pg_idx]->p_cowcnt++;
4174 4172 }
4175 4173 page_struct_unlock(ppa[0]);
4176 4174 return (1);
4177 4175 }
4178 4176
4179 4177 /*
4180 4178 * Variant of page_subclaim(), where ppa[] contains the pages of a single large
4181 4179 * page.
4182 4180 */
4183 4181 int
4184 4182 page_subclaim_pages(page_t **ppa)
4185 4183 {
4186 4184 pgcnt_t ulckpgs = 0, pg_idx;
4187 4185
4188 4186 VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4189 4187
4190 4188 /*
4191 4189 * Only need to take the page struct lock on the large page root.
4192 4190 */
4193 4191 page_struct_lock(ppa[0]);
4194 4192 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4195 4193
4196 4194 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4197 4195 ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4198 4196 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4199 4197 page_struct_unlock(ppa[0]);
4200 4198 return (0);
4201 4199 }
4202 4200 if (ppa[pg_idx]->p_lckcnt != 0)
4203 4201 ulckpgs++;
4204 4202 }
4205 4203
4206 4204 if (ulckpgs != 0) {
4207 4205 mutex_enter(&freemem_lock);
4208 4206 availrmem += ulckpgs;
4209 4207 pages_claimed -= ulckpgs;
4210 4208 mutex_exit(&freemem_lock);
4211 4209 }
4212 4210
4213 4211 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4214 4212 ppa[pg_idx]->p_cowcnt--;
4215 4213 ppa[pg_idx]->p_lckcnt++;
4216 4214
4217 4215 }
4218 4216 page_struct_unlock(ppa[0]);
4219 4217 return (1);
4220 4218 }
4221 4219
4222 4220 page_t *
4223 4221 page_numtopp(pfn_t pfnum, se_t se)
4224 4222 {
4225 4223 page_t *pp;
4226 4224
4227 4225 retry:
4228 4226 pp = page_numtopp_nolock(pfnum);
4229 4227 if (pp == NULL) {
4230 4228 return ((page_t *)NULL);
4231 4229 }
4232 4230
4233 4231 /*
4234 4232 * Acquire the appropriate lock on the page.
4235 4233 */
4236 4234 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4237 4235 if (page_pptonum(pp) != pfnum)
4238 4236 goto retry;
4239 4237 continue;
4240 4238 }
4241 4239
4242 4240 if (page_pptonum(pp) != pfnum) {
4243 4241 page_unlock(pp);
4244 4242 goto retry;
4245 4243 }
4246 4244
4247 4245 return (pp);
4248 4246 }
4249 4247
4250 4248 page_t *
4251 4249 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4252 4250 {
4253 4251 page_t *pp;
4254 4252
4255 4253 retry:
4256 4254 pp = page_numtopp_nolock(pfnum);
4257 4255 if (pp == NULL) {
4258 4256 return ((page_t *)NULL);
4259 4257 }
4260 4258
4261 4259 /*
4262 4260 * Acquire the appropriate lock on the page.
4263 4261 */
4264 4262 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4265 4263 if (page_pptonum(pp) != pfnum)
4266 4264 goto retry;
4267 4265 continue;
4268 4266 }
4269 4267
4270 4268 if (page_pptonum(pp) != pfnum) {
4271 4269 page_unlock(pp);
4272 4270 goto retry;
4273 4271 }
4274 4272
4275 4273 return (pp);
4276 4274 }
4277 4275
4278 4276 /*
4279 4277 * This routine is like page_numtopp, but will only return page structs
4280 4278 * for pages which are ok for loading into hardware using the page struct.
4281 4279 */
4282 4280 page_t *
4283 4281 page_numtopp_nowait(pfn_t pfnum, se_t se)
4284 4282 {
4285 4283 page_t *pp;
4286 4284
4287 4285 retry:
4288 4286 pp = page_numtopp_nolock(pfnum);
4289 4287 if (pp == NULL) {
4290 4288 return ((page_t *)NULL);
4291 4289 }
4292 4290
4293 4291 /*
4294 4292 * Try to acquire the appropriate lock on the page.
4295 4293 */
4296 4294 if (PP_ISFREE(pp))
4297 4295 pp = NULL;
4298 4296 else {
4299 4297 if (!page_trylock(pp, se))
4300 4298 pp = NULL;
4301 4299 else {
4302 4300 if (page_pptonum(pp) != pfnum) {
4303 4301 page_unlock(pp);
4304 4302 goto retry;
4305 4303 }
4306 4304 if (PP_ISFREE(pp)) {
4307 4305 page_unlock(pp);
4308 4306 pp = NULL;
4309 4307 }
4310 4308 }
4311 4309 }
4312 4310 return (pp);
4313 4311 }
4314 4312
4315 4313 #define SYNC_PROGRESS_NPAGES 1000
4316 4314
4317 4315 /*
4318 4316 * Returns a count of dirty pages that are in the process
4319 4317 * of being written out. If 'cleanit' is set, try to push the page.
4320 4318 */
4321 4319 pgcnt_t
4322 4320 page_busy(int cleanit)
4323 4321 {
4324 4322 page_t *page0 = page_first();
4325 4323 page_t *pp = page0;
4326 4324 pgcnt_t nppbusy = 0;
4327 4325 int counter = 0;
4328 4326 u_offset_t off;
4329 4327
4330 4328 do {
4331 4329 vnode_t *vp = pp->p_vnode;
4332 4330
4333 4331 /*
4334 4332 * Reset the sync timeout. The page list is very long
4335 4333 * on large memory systems.
4336 4334 */
4337 4335 if (++counter > SYNC_PROGRESS_NPAGES) {
4338 4336 counter = 0;
4339 4337 vfs_syncprogress();
4340 4338 }
4341 4339
4342 4340 /*
4343 4341 * A page is a candidate for syncing if it is:
4344 4342 *
4345 4343 * (a) On neither the freelist nor the cachelist
4346 4344 * (b) Hashed onto a vnode
4347 4345 * (c) Not a kernel page
4348 4346 * (d) Dirty
4349 4347 * (e) Not part of a swapfile
4350 4348	 * (f) A page which belongs to a real vnode; e.g., has a non-null
4351 4349 * v_vfsp pointer.
4352 4350 * (g) Backed by a filesystem which doesn't have a
4353 4351 * stubbed-out sync operation
4354 4352 */
4355 4353 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4356 4354 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4357 4355 vfs_can_sync(vp->v_vfsp)) {
4358 4356 nppbusy++;
4359 4357
4360 4358 if (!cleanit)
4361 4359 continue;
4362 4360 if (!page_trylock(pp, SE_EXCL))
4363 4361 continue;
4364 4362
4365 4363 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4366 4364 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4367 4365 !(hat_pagesync(pp,
4368 4366 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4369 4367 page_unlock(pp);
4370 4368 continue;
4371 4369 }
4372 4370 off = pp->p_offset;
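          			/*
          			 * Hold the vnode before dropping the page lock so it
          			 * cannot be freed and reused while we issue the putpage.
          			 */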
4373 4371 VN_HOLD(vp);
4374 4372 page_unlock(pp);
4375 4373 (void) VOP_PUTPAGE(vp, off, PAGESIZE,
4376 4374 B_ASYNC | B_FREE, kcred, NULL);
4377 4375 VN_RELE(vp);
4378 4376 }
4379 4377 } while ((pp = page_next(pp)) != page0);
4380 4378
4381 4379 vfs_syncprogress();
4382 4380 return (nppbusy);
4383 4381 }
4384 4382
4385 4383 void page_invalidate_pages(void);
4386 4384
4387 4385 /*
4388 4386 * callback handler to vm sub-system
4389 4387 *
4390 4388	 * Callers must make sure there are no recursive entries to this function.
4391 4389 */
4392 4390 /*ARGSUSED*/
4393 4391 boolean_t
4394 4392 callb_vm_cpr(void *arg, int code)
4395 4393 {
4396 4394 if (code == CB_CODE_CPR_CHKPT)
4397 4395 page_invalidate_pages();
4398 4396 return (B_TRUE);
4399 4397 }
4400 4398
4401 4399 /*
4402 4400 * Invalidate all pages of the system.
4403 4401	 * It shouldn't be called until all user page activity has stopped.
4404 4402 */
4405 4403 void
4406 4404 page_invalidate_pages()
4407 4405 {
4408 4406 page_t *pp;
4409 4407 page_t *page0;
4410 4408 pgcnt_t nbusypages;
4411 4409 int retry = 0;
4412 4410 const int MAXRETRIES = 4;
4413 4411 top:
4414 4412 /*
4415 4413 * Flush dirty pages and destroy the clean ones.
4416 4414 */
4417 4415 nbusypages = 0;
4418 4416
4419 4417 pp = page0 = page_first();
4420 4418 do {
4421 4419 struct vnode *vp;
4422 4420 u_offset_t offset;
4423 4421 int mod;
4424 4422
4425 4423 /*
4426 4424		 * skip the page if it has no vnode, or if it is associated
4427 4425		 * with the kernel vnode or prom-allocated kernel memory.
4428 4426 */
4429 4427 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4430 4428 continue;
4431 4429
4432 4430 /*
4433 4431		 * skip the page which has already been freed and invalidated.
4434 4432 */
4435 4433 if (PP_ISFREE(pp) && PP_ISAGED(pp))
4436 4434 continue;
4437 4435
4438 4436 /*
4439 4437 * skip pages that are already locked or can't be "exclusively"
4440 4438 * locked or are already free. After we lock the page, check
4441 4439 * the free and age bits again to be sure it's not destroyed
4442 4440 * yet.
4443 4441 * To achieve max. parallelization, we use page_trylock instead
4444 4442		 * of page_lock so that we don't get blocked on individual pages
4445 4443 * while we have thousands of other pages to process.
4446 4444 */
4447 4445 if (!page_trylock(pp, SE_EXCL)) {
4448 4446 nbusypages++;
4449 4447 continue;
4450 4448 } else if (PP_ISFREE(pp)) {
4451 4449 if (!PP_ISAGED(pp)) {
4452 4450 page_destroy_free(pp);
4453 4451 } else {
4454 4452 page_unlock(pp);
4455 4453 }
4456 4454 continue;
4457 4455 }
4458 4456 /*
4459 4457 * Is this page involved in some I/O? shared?
4460 4458 *
4461 4459 * The page_struct_lock need not be acquired to
4462 4460 * examine these fields since the page has an
4463 4461 * "exclusive" lock.
4464 4462 */
4465 4463 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4466 4464 page_unlock(pp);
4467 4465 continue;
4468 4466 }
4469 4467
4470 4468 if (vp->v_type == VCHR) {
4471 4469 panic("vp->v_type == VCHR");
4472 4470 /*NOTREACHED*/
4473 4471 }
4474 4472
4475 4473 if (!page_try_demote_pages(pp)) {
4476 4474 page_unlock(pp);
4477 4475 continue;
4478 4476 }
4479 4477
4480 4478 /*
4481 4479 * Check the modified bit. Leave the bits alone in hardware
4482 4480 * (they will be modified if we do the putpage).
4483 4481 */
4484 4482 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4485 4483 & P_MOD);
4486 4484 if (mod) {
4487 4485 offset = pp->p_offset;
4488 4486 /*
4489 4487 * Hold the vnode before releasing the page lock
4490 4488 * to prevent it from being freed and re-used by
4491 4489 * some other thread.
4492 4490 */
4493 4491 VN_HOLD(vp);
4494 4492 page_unlock(pp);
4495 4493 /*
4496 4494 * No error return is checked here. Callers such as
4497 4495			 * cpr deal with the dirty pages at dump time
4498 4496 * if this putpage fails.
4499 4497 */
4500 4498 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4501 4499 kcred, NULL);
4502 4500 VN_RELE(vp);
4503 4501 } else {
4504 4502 /*LINTED: constant in conditional context*/
4505 4503 VN_DISPOSE(pp, B_INVAL, 0, kcred);
4506 4504 }
4507 4505 } while ((pp = page_next(pp)) != page0);
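          	/*
          	 * Some pages were still busy; pause briefly and rescan,
          	 * up to MAXRETRIES passes.
          	 */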
4508 4506 if (nbusypages && retry++ < MAXRETRIES) {
4509 4507 delay(1);
4510 4508 goto top;
4511 4509 }
4512 4510 }
4513 4511
4514 4512 /*
4515 4513 * Replace the page "old" with the page "new" on the page hash and vnode lists
4516 4514 *
4517 4515	 * the replacement must be done in place, i.e. the equivalent sequence:
4518 4516 *
4519 4517 * vp = old->p_vnode;
4520 4518 * off = old->p_offset;
4521 4519 * page_do_hashout(old)
4522 4520 * page_do_hashin(new, vp, off)
4523 4521 *
4524 4522 * doesn't work, since
4525 4523 * 1) if old is the only page on the vnode, the v_pages list has a window
4526 4524 * where it looks empty. This will break file system assumptions.
4527 4525 * and
4528 4526 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4529 4527 */
4530 4528 static void
4531 4529 page_do_relocate_hash(page_t *new, page_t *old)
4532 4530 {
4533 4531 page_t **hash_list;
4534 4532 vnode_t *vp = old->p_vnode;
4535 4533 kmutex_t *sep;
4536 4534
4537 4535 ASSERT(PAGE_EXCL(old));
4538 4536 ASSERT(PAGE_EXCL(new));
4539 4537 ASSERT(vp != NULL);
4540 4538 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4541 4539 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4542 4540
4543 4541 /*
4544 4542 * First find old page on the page hash list
4545 4543 */
4546 4544 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4547 4545
4548 4546 for (;;) {
4549 4547 if (*hash_list == old)
4550 4548 break;
4551 4549 if (*hash_list == NULL) {
4552 4550 panic("page_do_hashout");
4553 4551 /*NOTREACHED*/
4554 4552 }
4555 4553 hash_list = &(*hash_list)->p_hash;
4556 4554 }
4557 4555
4558 4556 /*
4559 4557 * update new and replace old with new on the page hash list
4560 4558 */
4561 4559 new->p_vnode = old->p_vnode;
4562 4560 new->p_offset = old->p_offset;
4563 4561 new->p_hash = old->p_hash;
4564 4562 *hash_list = new;
4565 4563
4566 4564 if ((new->p_vnode->v_flag & VISSWAP) != 0)
4567 4565 PP_SETSWAP(new);
4568 4566
4569 4567 /*
4570 4568 * replace old with new on the vnode's page list
4571 4569 */
4572 4570 if (old->p_vpnext == old) {
4573 4571 new->p_vpnext = new;
4574 4572 new->p_vpprev = new;
4575 4573 } else {
4576 4574 new->p_vpnext = old->p_vpnext;
4577 4575 new->p_vpprev = old->p_vpprev;
4578 4576 new->p_vpnext->p_vpprev = new;
4579 4577 new->p_vpprev->p_vpnext = new;
4580 4578 }
4581 4579 if (vp->v_pages == old)
4582 4580 vp->v_pages = new;
4583 4581
4584 4582 /*
4585 4583 * clear out the old page
4586 4584 */
4587 4585 old->p_hash = NULL;
4588 4586 old->p_vpnext = NULL;
4589 4587 old->p_vpprev = NULL;
4590 4588 old->p_vnode = NULL;
4591 4589 PP_CLRSWAP(old);
4592 4590 old->p_offset = (u_offset_t)-1;
4593 4591 page_clr_all_props(old);
4594 4592
4595 4593 /*
4596 4594 * Wake up processes waiting for this page. The page's
4597 4595 * identity has been changed, and is probably not the
4598 4596 * desired page any longer.
4599 4597 */
4600 4598 sep = page_se_mutex(old);
4601 4599 mutex_enter(sep);
4602 4600 old->p_selock &= ~SE_EWANTED;
4603 4601 if (CV_HAS_WAITERS(&old->p_cv))
4604 4602 cv_broadcast(&old->p_cv);
4605 4603 mutex_exit(sep);
4606 4604 }
4607 4605
4608 4606 /*
4609 4607 * This function moves the identity of page "pp_old" to page "pp_new".
4610 4608 * Both pages must be locked on entry. "pp_new" is free, has no identity,
4611 4609 * and need not be hashed out from anywhere.
4612 4610 */
4613 4611 void
4614 4612 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4615 4613 {
4616 4614 vnode_t *vp = pp_old->p_vnode;
4617 4615 u_offset_t off = pp_old->p_offset;
4618 4616 kmutex_t *phm, *vphm;
4619 4617
4620 4618 /*
4621 4619 * Rehash two pages
4622 4620 */
4623 4621 ASSERT(PAGE_EXCL(pp_old));
4624 4622 ASSERT(PAGE_EXCL(pp_new));
4625 4623 ASSERT(vp != NULL);
4626 4624 ASSERT(pp_new->p_vnode == NULL);
4627 4625
4628 4626 /*
4629 4627 * hashout then hashin while holding the mutexes
4630 4628 */
4631 4629 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4632 4630 mutex_enter(phm);
4633 4631 vphm = page_vnode_mutex(vp);
4634 4632 mutex_enter(vphm);
4635 4633
4636 4634 page_do_relocate_hash(pp_new, pp_old);
4637 4635
4638 4636 /* The following comment preserved from page_flip(). */
4639 4637 pp_new->p_fsdata = pp_old->p_fsdata;
4640 4638 pp_old->p_fsdata = 0;
4641 4639 mutex_exit(vphm);
4642 4640 mutex_exit(phm);
4643 4641
4644 4642 /*
4645 4643 * The page_struct_lock need not be acquired for lckcnt and
4646 4644 * cowcnt since the page has an "exclusive" lock.
4647 4645 */
4648 4646 ASSERT(pp_new->p_lckcnt == 0);
4649 4647 ASSERT(pp_new->p_cowcnt == 0);
4650 4648 pp_new->p_lckcnt = pp_old->p_lckcnt;
4651 4649 pp_new->p_cowcnt = pp_old->p_cowcnt;
4652 4650 pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4653 4651
4654 4652 }
4655 4653
4656 4654 /*
4657 4655 * Helper routine used to lock all remaining members of a
4658 4656 * large page. The caller is responsible for passing in a locked
4659 4657 * pp. If pp is a large page, then it succeeds in locking all the
4660 4658 * remaining constituent pages or it returns with only the
4661 4659 * original page locked.
4662 4660 *
4663 4661 * Returns 1 on success, 0 on failure.
4664 4662 *
4665 4663 * If success is returned this routine guarantees p_szc for all constituent
4666 4664 * pages of a large page pp belongs to can't change. To achieve this we
4667 4665 * recheck szc of pp after locking all constituent pages and retry if szc
4668 4666 * changed (it could only decrease). Since hat_page_demote() needs an EXCL
4669 4667 * lock on one of constituent pages it can't be running after all constituent
4670 4668 * pages are locked. hat_page_demote() with a lock on a constituent page
4671 4669 * outside of this large page (i.e. pp belonged to a larger large page) is
4672 4670 * already done with all constituent pages of pp since the root's p_szc is
4673 4671 * changed last. Therefore no need to synchronize with hat_page_demote() that
4674 4672 * locked a constituent page outside of pp's current large page.
4675 4673 */
4676 4674 #ifdef DEBUG
4677 4675 uint32_t gpg_trylock_mtbf = 0;
4678 4676 #endif
4679 4677
4680 4678 int
4681 4679 group_page_trylock(page_t *pp, se_t se)
4682 4680 {
4683 4681 page_t *tpp;
4684 4682 pgcnt_t npgs, i, j;
4685 4683 uint_t pszc = pp->p_szc;
4686 4684
4687 4685 #ifdef DEBUG
4688 4686 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4689 4687 return (0);
4690 4688 }
4691 4689 #endif
4692 4690
4693 4691 if (pp != PP_GROUPLEADER(pp, pszc)) {
4694 4692 return (0);
4695 4693 }
4696 4694
4697 4695 retry:
4698 4696 ASSERT(PAGE_LOCKED_SE(pp, se));
4699 4697 ASSERT(!PP_ISFREE(pp));
4700 4698 if (pszc == 0) {
4701 4699 return (1);
4702 4700 }
4703 4701 npgs = page_get_pagecnt(pszc);
4704 4702 tpp = pp + 1;
4705 4703 for (i = 1; i < npgs; i++, tpp++) {
4706 4704 if (!page_trylock(tpp, se)) {
4707 4705 tpp = pp + 1;
4708 4706 for (j = 1; j < i; j++, tpp++) {
4709 4707 page_unlock(tpp);
4710 4708 }
4711 4709 return (0);
4712 4710 }
4713 4711 }
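          	/*
          	 * p_szc may have been demoted while we were acquiring the locks;
          	 * if so, drop the extra locks and retry at the smaller size.
          	 */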
4714 4712 if (pp->p_szc != pszc) {
4715 4713 ASSERT(pp->p_szc < pszc);
4716 4714 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4717 4715 !IS_SWAPFSVP(pp->p_vnode));
4718 4716 tpp = pp + 1;
4719 4717 for (i = 1; i < npgs; i++, tpp++) {
4720 4718 page_unlock(tpp);
4721 4719 }
4722 4720 pszc = pp->p_szc;
4723 4721 goto retry;
4724 4722 }
4725 4723 return (1);
4726 4724 }
4727 4725
4728 4726 void
4729 4727 group_page_unlock(page_t *pp)
4730 4728 {
4731 4729 page_t *tpp;
4732 4730 pgcnt_t npgs, i;
4733 4731
4734 4732 ASSERT(PAGE_LOCKED(pp));
4735 4733 ASSERT(!PP_ISFREE(pp));
4736 4734 ASSERT(pp == PP_PAGEROOT(pp));
4737 4735 npgs = page_get_pagecnt(pp->p_szc);
4738 4736 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4739 4737 page_unlock(tpp);
4740 4738 }
4741 4739 }
4742 4740
4743 4741 /*
4744 4742 * returns
4745 4743 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages
4746 4744 * ERANGE : this is not a base page
4747 4745 * EBUSY : failure to get locks on the page/pages
4748 4746 * ENOMEM : failure to obtain replacement pages
4749 4747 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel
4750 4748 * EIO : An error occurred while trying to copy the page data
4751 4749 *
4752 4750 * Return with all constituent members of target and replacement
4753 4751	 * SE_EXCL locked. It is the caller's responsibility to drop the
4754 4752 * locks.
4755 4753 */
4756 4754 int
4757 4755 do_page_relocate(
4758 4756 page_t **target,
4759 4757 page_t **replacement,
4760 4758 int grouplock,
4761 4759 spgcnt_t *nrelocp,
4762 4760 lgrp_t *lgrp)
4763 4761 {
4764 4762 page_t *first_repl;
4765 4763 page_t *repl;
4766 4764 page_t *targ;
4767 4765 page_t *pl = NULL;
4768 4766 uint_t ppattr;
4769 4767 pfn_t pfn, repl_pfn;
4770 4768 uint_t szc;
4771 4769 spgcnt_t npgs, i;
4772 4770 int repl_contig = 0;
4773 4771 uint_t flags = 0;
4774 4772 spgcnt_t dofree = 0;
4775 4773
4776 4774 *nrelocp = 0;
4777 4775
4778 4776 #if defined(__sparc)
4779 4777 /*
4780 4778 * We need to wait till OBP has completed
4781 4779 * its boot-time handoff of its resources to the kernel
4782 4780 * before we allow page relocation
4783 4781 */
4784 4782 if (page_relocate_ready == 0) {
4785 4783 return (EAGAIN);
4786 4784 }
4787 4785 #endif
4788 4786
4789 4787 /*
4790 4788 * If this is not a base page,
4791 4789 * just return with 0x0 pages relocated.
4792 4790 */
4793 4791 targ = *target;
4794 4792 ASSERT(PAGE_EXCL(targ));
4795 4793 ASSERT(!PP_ISFREE(targ));
4796 4794 szc = targ->p_szc;
4797 4795 ASSERT(szc < mmu_page_sizes);
4798 4796 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4799 4797 pfn = targ->p_pagenum;
4800 4798 if (pfn != PFN_BASE(pfn, szc)) {
4801 4799 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4802 4800 return (ERANGE);
4803 4801 }
4804 4802
4805 4803 if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4806 4804 repl_pfn = repl->p_pagenum;
4807 4805 if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4808 4806 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4809 4807 return (ERANGE);
4810 4808 }
4811 4809 repl_contig = 1;
4812 4810 }
4813 4811
4814 4812 /*
4815 4813 * We must lock all members of this large page or we cannot
4816 4814 * relocate any part of it.
4817 4815 */
4818 4816 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4819 4817 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4820 4818 return (EBUSY);
4821 4819 }
4822 4820
4823 4821	 * reread szc; it could have been decreased before
4824 4822 * reread szc it could have been decreased before
4825 4823 * group_page_trylock() was done.
4826 4824 */
4827 4825 szc = targ->p_szc;
4828 4826 ASSERT(szc < mmu_page_sizes);
4829 4827 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4830 4828 ASSERT(pfn == PFN_BASE(pfn, szc));
4831 4829
4832 4830 npgs = page_get_pagecnt(targ->p_szc);
4833 4831
4834 4832 if (repl == NULL) {
4835 4833 dofree = npgs; /* Size of target page in MMU pages */
4836 4834 if (!page_create_wait(dofree, 0)) {
4837 4835 if (grouplock != 0) {
4838 4836 group_page_unlock(targ);
4839 4837 }
4840 4838 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4841 4839 return (ENOMEM);
4842 4840 }
4843 4841
4844 4842 /*
4845 4843 * seg kmem pages require that the target and replacement
4846 4844 * page be the same pagesize.
4847 4845 */
4848 4846 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4849 4847 repl = page_get_replacement_page(targ, lgrp, flags);
4850 4848 if (repl == NULL) {
4851 4849 if (grouplock != 0) {
4852 4850 group_page_unlock(targ);
4853 4851 }
4854 4852 page_create_putback(dofree);
4855 4853 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4856 4854 return (ENOMEM);
4857 4855 }
4858 4856 }
4859 4857 #ifdef DEBUG
4860 4858 else {
4861 4859 ASSERT(PAGE_LOCKED(repl));
4862 4860 }
4863 4861 #endif /* DEBUG */
4864 4862
4865 4863 #if defined(__sparc)
4866 4864 /*
4867 4865	 * Let hat_page_relocate() complete the relocation if it's a kernel page
4868 4866 */
4869 4867 if (VN_ISKAS(targ->p_vnode)) {
4870 4868 *replacement = repl;
4871 4869 if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4872 4870 if (grouplock != 0) {
4873 4871 group_page_unlock(targ);
4874 4872 }
4875 4873 if (dofree) {
4876 4874 *replacement = NULL;
4877 4875 page_free_replacement_page(repl);
4878 4876 page_create_putback(dofree);
4879 4877 }
4880 4878 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4881 4879 return (EAGAIN);
4882 4880 }
4883 4881 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4884 4882 return (0);
4885 4883 }
4886 4884 #else
4887 4885 #if defined(lint)
4888 4886 dofree = dofree;
4889 4887 #endif
4890 4888 #endif
4891 4889
4892 4890 first_repl = repl;
4893 4891
4894 4892 for (i = 0; i < npgs; i++) {
4895 4893 ASSERT(PAGE_EXCL(targ));
4896 4894 ASSERT(targ->p_slckcnt == 0);
4897 4895 ASSERT(repl->p_slckcnt == 0);
4898 4896
4899 4897 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4900 4898
4901 4899 ASSERT(hat_page_getshare(targ) == 0);
4902 4900 ASSERT(!PP_ISFREE(targ));
4903 4901 ASSERT(targ->p_pagenum == (pfn + i));
4904 4902 ASSERT(repl_contig == 0 ||
4905 4903 repl->p_pagenum == (repl_pfn + i));
4906 4904
4907 4905 /*
4908 4906 * Copy the page contents and attributes then
4909 4907 * relocate the page in the page hash.
4910 4908 */
4911 4909 if (ppcopy(targ, repl) == 0) {
4912 4910 targ = *target;
4913 4911 repl = first_repl;
4914 4912 VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4915 4913 if (grouplock != 0) {
4916 4914 group_page_unlock(targ);
4917 4915 }
4918 4916 if (dofree) {
4919 4917 *replacement = NULL;
4920 4918 page_free_replacement_page(repl);
4921 4919 page_create_putback(dofree);
4922 4920 }
4923 4921 return (EIO);
4924 4922 }
4925 4923
4926 4924 targ++;
4927 4925 if (repl_contig != 0) {
4928 4926 repl++;
4929 4927 } else {
4930 4928 repl = repl->p_next;
4931 4929 }
4932 4930 }
4933 4931
4934 4932 repl = first_repl;
4935 4933 targ = *target;
4936 4934
4937 4935 for (i = 0; i < npgs; i++) {
4938 4936 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4939 4937 page_clr_all_props(repl);
4940 4938 page_set_props(repl, ppattr);
4941 4939 page_relocate_hash(repl, targ);
4942 4940
4943 4941 ASSERT(hat_page_getshare(targ) == 0);
4944 4942 ASSERT(hat_page_getshare(repl) == 0);
4945 4943 /*
4946 4944			 * Now clear the props on targ; after the
4947 4945 * page_relocate_hash(), they no longer
4948 4946 * have any meaning.
4949 4947 */
4950 4948 page_clr_all_props(targ);
4951 4949 ASSERT(targ->p_next == targ);
4952 4950 ASSERT(targ->p_prev == targ);
4953 4951 page_list_concat(&pl, &targ);
4954 4952
4955 4953 targ++;
4956 4954 if (repl_contig != 0) {
4957 4955 repl++;
4958 4956 } else {
4959 4957 repl = repl->p_next;
4960 4958 }
4961 4959 }
4962 4960 /* assert that we have come full circle with repl */
4963 4961 ASSERT(repl_contig == 1 || first_repl == repl);
4964 4962
4965 4963 *target = pl;
4966 4964 if (*replacement == NULL) {
4967 4965 ASSERT(first_repl == repl);
4968 4966 *replacement = repl;
4969 4967 }
4970 4968 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4971 4969 *nrelocp = npgs;
4972 4970 return (0);
4973 4971 }
4974 4972 /*
4975 4973 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
4976 4974 */
4977 4975 int
4978 4976 page_relocate(
4979 4977 page_t **target,
4980 4978 page_t **replacement,
4981 4979 int grouplock,
4982 4980 int freetarget,
4983 4981 spgcnt_t *nrelocp,
4984 4982 lgrp_t *lgrp)
4985 4983 {
4986 4984 spgcnt_t ret;
4987 4985
4988 4986 /* do_page_relocate returns 0 on success or errno value */
4989 4987 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4990 4988
4991 4989 if (ret != 0 || freetarget == 0) {
4992 4990 return (ret);
4993 4991 }
4994 4992 if (*nrelocp == 1) {
4995 4993 ASSERT(*target != NULL);
4996 4994 page_free(*target, 1);
4997 4995 } else {
4998 4996 page_t *tpp = *target;
4999 4997 uint_t szc = tpp->p_szc;
5000 4998 pgcnt_t npgs = page_get_pagecnt(szc);
5001 4999 ASSERT(npgs > 1);
5002 5000 ASSERT(szc != 0);
5003 5001 do {
5004 5002 ASSERT(PAGE_EXCL(tpp));
5005 5003 ASSERT(!hat_page_is_mapped(tpp));
5006 5004 ASSERT(tpp->p_szc == szc);
5007 5005 PP_SETFREE(tpp);
5008 5006 PP_SETAGED(tpp);
5009 5007 npgs--;
5010 5008 } while ((tpp = tpp->p_next) != *target);
5011 5009 ASSERT(npgs == 0);
5012 5010 page_list_add_pages(*target, 0);
5013 5011 npgs = page_get_pagecnt(szc);
5014 5012 page_create_putback(npgs);
5015 5013 }
5016 5014 return (ret);
5017 5015 }
5018 5016
5019 5017 /*
5020 5018 * it is up to the caller to deal with pcf accounting.
5021 5019 */
5022 5020 void
5023 5021 page_free_replacement_page(page_t *pplist)
5024 5022 {
5025 5023 page_t *pp;
5026 5024
5027 5025 while (pplist != NULL) {
5028 5026 /*
5029 5027 * pp_targ is a linked list.
5030 5028 */
5031 5029 pp = pplist;
5032 5030 if (pp->p_szc == 0) {
5033 5031 page_sub(&pplist, pp);
5034 5032 page_clr_all_props(pp);
5035 5033 PP_SETFREE(pp);
5036 5034 PP_SETAGED(pp);
5037 5035 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5038 5036 page_unlock(pp);
5039 5037 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5040 5038 } else {
5041 5039 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5042 5040 page_t *tpp;
5043 5041 page_list_break(&pp, &pplist, curnpgs);
5044 5042 tpp = pp;
5045 5043 do {
5046 5044 ASSERT(PAGE_EXCL(tpp));
5047 5045 ASSERT(!hat_page_is_mapped(tpp));
5048 5046 page_clr_all_props(tpp);
5049 5047 PP_SETFREE(tpp);
5050 5048 PP_SETAGED(tpp);
5051 5049 } while ((tpp = tpp->p_next) != pp);
5052 5050 page_list_add_pages(pp, 0);
5053 5051 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5054 5052 }
5055 5053 }
5056 5054 }
5057 5055
5058 5056 /*
5059 5057 * Relocate target to non-relocatable replacement page.
5060 5058 */
5061 5059 int
5062 5060 page_relocate_cage(page_t **target, page_t **replacement)
5063 5061 {
5064 5062 page_t *tpp, *rpp;
5065 5063 spgcnt_t pgcnt, npgs;
5066 5064 int result;
5067 5065
5068 5066 tpp = *target;
5069 5067
5070 5068 ASSERT(PAGE_EXCL(tpp));
5071 5069 ASSERT(tpp->p_szc == 0);
5072 5070
5073 5071 pgcnt = btop(page_get_pagesize(tpp->p_szc));
5074 5072
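          	/*
          	 * Loop until a non-relocatable (cage) replacement page is
          	 * obtained, waking the cageout thread whenever the allocation
          	 * attempt fails.
          	 */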
5075 5073 do {
5076 5074 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5077 5075 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5078 5076 if (rpp == NULL) {
5079 5077 page_create_putback(pgcnt);
5080 5078 kcage_cageout_wakeup();
5081 5079 }
5082 5080 } while (rpp == NULL);
5083 5081
5084 5082 ASSERT(PP_ISNORELOC(rpp));
5085 5083
5086 5084 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5087 5085
5088 5086 if (result == 0) {
5089 5087 *replacement = rpp;
5090 5088 if (pgcnt != npgs)
5091 5089 panic("page_relocate_cage: partial relocation");
5092 5090 }
5093 5091
5094 5092 return (result);
5095 5093 }
5096 5094
5097 5095 /*
5098 5096 * Release the page lock on a page, place on cachelist
5099 5097 * tail if no longer mapped. Caller can let us know if
5100 5098 * the page is known to be clean.
5101 5099 */
5102 5100 int
5103 5101 page_release(page_t *pp, int checkmod)
5104 5102 {
5105 5103 int status;
5106 5104
5107 5105 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5108 5106 (pp->p_vnode != NULL));
5109 5107
5110 5108 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5111 5109 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5112 5110 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5113 5111 !hat_page_is_mapped(pp)) {
5114 5112
5115 5113 /*
5116 5114 * If page is modified, unlock it
5117 5115 *
5118 5116 * (p_nrm & P_MOD) bit has the latest stuff because:
5119 5117 * (1) We found that this page doesn't have any mappings
5120 5118 * _after_ holding SE_EXCL and
5121 5119 * (2) We didn't drop SE_EXCL lock after the check in (1)
5122 5120 */
5123 5121 if (checkmod && hat_ismod(pp)) {
5124 5122 page_unlock(pp);
5125 5123 status = PGREL_MOD;
5126 5124 } else {
5127 5125 /*LINTED: constant in conditional context*/
5128 5126 VN_DISPOSE(pp, B_FREE, 0, kcred);
5129 5127 status = PGREL_CLEAN;
5130 5128 }
5131 5129 } else {
5132 5130 page_unlock(pp);
5133 5131 status = PGREL_NOTREL;
5134 5132 }
5135 5133 return (status);
5136 5134 }
5137 5135
5138 5136 /*
5139 5137 * Given a constituent page, try to demote the large page on the freelist.
5140 5138 *
5141 5139 * Returns nonzero if the page could be demoted successfully. Returns with
5142 5140 * the constituent page still locked.
5143 5141 */
5144 5142 int
5145 5143 page_try_demote_free_pages(page_t *pp)
5146 5144 {
5147 5145 page_t *rootpp = pp;
5148 5146 pfn_t pfn = page_pptonum(pp);
5149 5147 spgcnt_t npgs;
5150 5148 uint_t szc = pp->p_szc;
5151 5149
5152 5150 ASSERT(PP_ISFREE(pp));
5153 5151 ASSERT(PAGE_EXCL(pp));
5154 5152
5155 5153 /*
5156 5154 * Adjust rootpp and lock it, if `pp' is not the base
5157 5155 * constituent page.
5158 5156 */
5159 5157 npgs = page_get_pagecnt(pp->p_szc);
5160 5158 if (npgs == 1) {
5161 5159 return (0);
5162 5160 }
5163 5161
5164 5162 if (!IS_P2ALIGNED(pfn, npgs)) {
5165 5163 pfn = P2ALIGN(pfn, npgs);
5166 5164 rootpp = page_numtopp_nolock(pfn);
5167 5165 }
5168 5166
5169 5167 if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5170 5168 return (0);
5171 5169 }
5172 5170
5173 5171 if (rootpp->p_szc != szc) {
5174 5172 if (pp != rootpp)
5175 5173 page_unlock(rootpp);
5176 5174 return (0);
5177 5175 }
5178 5176
5179 5177 page_demote_free_pages(rootpp);
5180 5178
5181 5179 if (pp != rootpp)
5182 5180 page_unlock(rootpp);
5183 5181
5184 5182 ASSERT(PP_ISFREE(pp));
5185 5183 ASSERT(PAGE_EXCL(pp));
5186 5184 return (1);
5187 5185 }
5188 5186
5189 5187 /*
5190 5188 * Given a constituent page, try to demote the large page.
5191 5189 *
5192 5190 * Returns nonzero if the page could be demoted successfully. Returns with
5193 5191 * the constituent page still locked.
5194 5192 */
5195 5193 int
5196 5194 page_try_demote_pages(page_t *pp)
5197 5195 {
5198 5196 page_t *tpp, *rootpp = pp;
5199 5197 pfn_t pfn = page_pptonum(pp);
5200 5198 spgcnt_t i, npgs;
5201 5199 uint_t szc = pp->p_szc;
5202 5200 vnode_t *vp = pp->p_vnode;
5203 5201
5204 5202 ASSERT(PAGE_EXCL(pp));
5205 5203
5206 5204 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5207 5205
5208 5206 if (pp->p_szc == 0) {
5209 5207 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5210 5208 return (1);
5211 5209 }
5212 5210
5213 5211 if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
5214 5212 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5215 5213 page_demote_vp_pages(pp);
5216 5214 ASSERT(pp->p_szc == 0);
5217 5215 return (1);
5218 5216 }
5219 5217
5220 5218 /*
5221 5219	 * Adjust rootpp if the page passed in is not the base
5222 5220 * constituent page.
5223 5221 */
5224 5222 npgs = page_get_pagecnt(pp->p_szc);
5225 5223 ASSERT(npgs > 1);
5226 5224 if (!IS_P2ALIGNED(pfn, npgs)) {
5227 5225 pfn = P2ALIGN(pfn, npgs);
5228 5226 rootpp = page_numtopp_nolock(pfn);
5229 5227 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5230 5228 ASSERT(rootpp->p_vnode != NULL);
5231 5229 ASSERT(rootpp->p_szc == szc);
5232 5230 }
5233 5231
5234 5232 /*
5235 5233 * We can't demote kernel pages since we can't hat_unload()
5236 5234 * the mappings.
5237 5235 */
5238 5236 if (VN_ISKAS(rootpp->p_vnode))
5239 5237 return (0);
5240 5238
5241 5239 /*
5242 5240 * Attempt to lock all constituent pages except the page passed
5243 5241 * in since it's already locked.
5244 5242 */
5245 5243 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5246 5244 ASSERT(!PP_ISFREE(tpp));
5247 5245 ASSERT(tpp->p_vnode != NULL);
5248 5246
5249 5247 if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5250 5248 break;
5251 5249 ASSERT(tpp->p_szc == rootpp->p_szc);
5252 5250 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5253 5251 }
5254 5252
5255 5253 /*
5256 5254 * If we failed to lock them all then unlock what we have
5257 5255 * locked so far and bail.
5258 5256 */
5259 5257 if (i < npgs) {
5260 5258 tpp = rootpp;
5261 5259 while (i-- > 0) {
5262 5260 if (tpp != pp)
5263 5261 page_unlock(tpp);
5264 5262 tpp++;
5265 5263 }
5266 5264 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5267 5265 return (0);
5268 5266 }
5269 5267
5270 5268 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5271 5269 ASSERT(PAGE_EXCL(tpp));
5272 5270 ASSERT(tpp->p_slckcnt == 0);
5273 5271 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5274 5272 tpp->p_szc = 0;
5275 5273 }
5276 5274
5277 5275 /*
5278 5276 * Unlock all pages except the page passed in.
5279 5277 */
5280 5278 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5281 5279 ASSERT(!hat_page_is_mapped(tpp));
5282 5280 if (tpp != pp)
5283 5281 page_unlock(tpp);
5284 5282 }
5285 5283
5286 5284 VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5287 5285 return (1);
5288 5286 }
5289 5287
5290 5288 /*
5291 5289 * Called by page_free() and page_destroy() to demote the page size code
5292 5290 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
5293 5291 * p_szc on free list, neither can we just clear p_szc of a single page_t
5294 5292 * within a large page since it will break other code that relies on p_szc
5295 5293 * being the same for all page_t's of a large page). Anonymous pages should
5296 5294 * never end up here because anon_map_getpages() cannot deal with p_szc
5297 5295 * changes after a single constituent page is locked. While anonymous or
5298 5296 * kernel large pages are demoted or freed the entire large page at a time
5299 5297 * with all constituent pages locked EXCL for the file system pages we
5300 5298  * with all constituent pages locked EXCL, for the file system pages we
5301 5299  * have to be able to demote a large page (i.e. decrease all constituent pages
5302 5300  * p_szc) with just an EXCL lock on one of the constituent pages. The reason
5303 5301  * we can easily deal with anonymous page demotion the entire large page at a
5304 5302  * time is that those operations originate at address space level and concern
5305 5303 * not shared with any other processes (therefore we can always get EXCL lock
5306 5304 * on all anonymous constituent pages after clearing segment page
5307 5305 * cache). However file system pages can be truncated or invalidated at a
5308 5306 * PAGESIZE level from the file system side and end up in page_free() or
5309 5307 * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
5310 5308 * and therefore pageout should be able to demote a large page by EXCL locking
5311 5309 * any constituent page that is not under SOFTLOCK). In those cases we cannot
5312 5310 * rely on being able to lock EXCL all constituent pages.
5313 5311 *
5314 5312 * To prevent szc changes on file system pages one has to lock all constituent
5315 5313 * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5316 5314 * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
5317 5315  * prevent szc changes is the hat layer, which uses its own page level mlist
5318 5316  * locks. The hat assumes that szc doesn't change after mlist lock for a page is
5319 5317 * taken. Therefore we need to change szc under hat level locks if we only
5320 5318 * have an EXCL lock on a single constituent page and hat still references any
5321 5319 * of constituent pages. (Note we can't "ignore" hat layer by simply
5322 5320 * hat_pageunload() all constituent pages without having EXCL locks on all of
5323 5321 * constituent pages). We use hat_page_demote() call to safely demote szc of
5324 5322 * all constituent pages under hat locks when we only have an EXCL lock on one
5325 5323 * of constituent pages.
5326 5324 *
5327 5325 * This routine calls page_szc_lock() before calling hat_page_demote() to
5328 5326 * allow segvn in one special case not to lock all constituent pages SHARED
5329 5327 * before calling hat_memload_array() that relies on p_szc not changing even
5330 5328 * before hat level mlist lock is taken. In that case segvn uses
5331 5329 * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
5332 5330 *
5333 5331 * Anonymous or kernel page demotion still has to lock all pages exclusively
5334 5332 * and do hat_pageunload() on all constituent pages before demoting the page
5335 5333 * therefore there's no need for anonymous or kernel page demotion to use
5336 5334 * hat_page_demote() mechanism.
5337 5335 *
5338 5336 * hat_page_demote() removes all large mappings that map pp and then decreases
5339 5337  * p_szc starting from the last constituent page of the large page. Working
5340 5338 * from the tail of a large page in pfn decreasing order allows one looking at
5341 5339 * the root page to know that hat_page_demote() is done for root's szc area.
5342 5340 * e.g. if a root page has szc 1 one knows it only has to lock all constituent
5343 5341 * pages within szc 1 area to prevent szc changes because hat_page_demote()
5344 5342 * that started on this page when it had szc > 1 is done for this szc 1 area.
5345 5343 *
5346 5344 * We are guaranteed that all constituent pages of pp's large page belong to
5347 5345 * the same vnode with the consecutive offsets increasing in the direction of
5348 5346 * the pfn i.e. the identity of constituent pages can't change until their
5349 5347 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5350 5348 * large mappings to pp even though we don't lock any constituent page except
5351 5349 * pp (i.e. we won't unload e.g. kernel locked page).
5352 5350 */
5353 5351 static void
5354 5352 page_demote_vp_pages(page_t *pp)
5355 5353 {
5356 5354 kmutex_t *mtx;
5357 5355
5358 5356 ASSERT(PAGE_EXCL(pp));
5359 5357 ASSERT(!PP_ISFREE(pp));
5360 5358 ASSERT(pp->p_vnode != NULL);
5361 5359 ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5362 5360 ASSERT(!PP_ISKAS(pp));
5363 5361
5364 5362 VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5365 5363
5366 5364 mtx = page_szc_lock(pp);
5367 5365 if (mtx != NULL) {
5368 5366 hat_page_demote(pp);
5369 5367 mutex_exit(mtx);
5370 5368 }
5371 5369 ASSERT(pp->p_szc == 0);
5372 5370 }
5373 5371
5374 5372 /*
5375 5373 * Mark any existing pages for migration in the given range
5376 5374 */
5377 5375 void
5378 5376 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5379 5377 struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5380 5378 u_offset_t vnoff, int rflag)
5381 5379 {
5382 5380 struct anon *ap;
5383 5381 vnode_t *curvp;
5384 5382 lgrp_t *from;
5385 5383 pgcnt_t nlocked;
5386 5384 u_offset_t off;
5387 5385 pfn_t pfn;
5388 5386 size_t pgsz;
5389 5387 size_t segpgsz;
5390 5388 pgcnt_t pages;
5391 5389 uint_t pszc;
5392 5390 page_t *pp0, *pp;
5393 5391 caddr_t va;
5394 5392 ulong_t an_idx;
5395 5393 anon_sync_obj_t cookie;
5396 5394
5397 5395 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5398 5396
5399 5397 /*
5400 5398 	 * Don't do anything if we don't need to do lgroup optimizations
5401 5399 * on this system
5402 5400 */
5403 5401 if (!lgrp_optimizations())
5404 5402 return;
5405 5403
5406 5404 /*
5407 5405 * Align address and length to (potentially large) page boundary
5408 5406 */
5409 5407 segpgsz = page_get_pagesize(seg->s_szc);
5410 5408 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5411 5409 if (rflag)
5412 5410 len = P2ROUNDUP(len, segpgsz);
5413 5411
5414 5412 /*
5415 5413 * Do one (large) page at a time
5416 5414 */
5417 5415 va = addr;
5418 5416 while (va < addr + len) {
5419 5417 /*
5420 5418 * Lookup (root) page for vnode and offset corresponding to
5421 5419 * this virtual address
5422 5420 * Try anonmap first since there may be copy-on-write
5423 5421 * pages, but initialize vnode pointer and offset using
5424 5422 * vnode arguments just in case there isn't an amp.
5425 5423 */
5426 5424 curvp = vp;
5427 5425 off = vnoff + va - seg->s_base;
5428 5426 if (amp) {
5429 5427 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5430 5428 an_idx = anon_index + seg_page(seg, va);
5431 5429 anon_array_enter(amp, an_idx, &cookie);
5432 5430 ap = anon_get_ptr(amp->ahp, an_idx);
5433 5431 if (ap)
5434 5432 swap_xlate(ap, &curvp, &off);
5435 5433 anon_array_exit(&cookie);
5436 5434 			ANON_LOCK_EXIT(&amp->a_rwlock);
5437 5435 }
5438 5436
5439 5437 pp = NULL;
5440 5438 if (curvp)
5441 5439 pp = page_lookup(curvp, off, SE_SHARED);
5442 5440
5443 5441 /*
5444 5442 * If there isn't a page at this virtual address,
5445 5443 * skip to next page
5446 5444 */
5447 5445 if (pp == NULL) {
5448 5446 va += PAGESIZE;
5449 5447 continue;
5450 5448 }
5451 5449
5452 5450 /*
5453 5451 * Figure out which lgroup this page is in for kstats
5454 5452 */
5455 5453 pfn = page_pptonum(pp);
5456 5454 from = lgrp_pfn_to_lgrp(pfn);
5457 5455
5458 5456 /*
5459 5457 * Get page size, and round up and skip to next page boundary
5460 5458 * if unaligned address
5461 5459 */
5462 5460 pszc = pp->p_szc;
5463 5461 pgsz = page_get_pagesize(pszc);
5464 5462 pages = btop(pgsz);
5465 5463 if (!IS_P2ALIGNED(va, pgsz) ||
5466 5464 !IS_P2ALIGNED(pfn, pages) ||
5467 5465 pgsz > segpgsz) {
5468 5466 pgsz = MIN(pgsz, segpgsz);
5469 5467 page_unlock(pp);
5470 5468 pages = btop(P2END((uintptr_t)va, pgsz) -
5471 5469 (uintptr_t)va);
5472 5470 va = (caddr_t)P2END((uintptr_t)va, pgsz);
5473 5471 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages);
5474 5472 continue;
5475 5473 }
5476 5474
5477 5475 /*
5478 5476 * Upgrade to exclusive lock on page
5479 5477 */
5480 5478 if (!page_tryupgrade(pp)) {
5481 5479 page_unlock(pp);
5482 5480 va += pgsz;
5483 5481 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5484 5482 btop(pgsz));
5485 5483 continue;
5486 5484 }
5487 5485
5488 5486 pp0 = pp++;
5489 5487 nlocked = 1;
5490 5488
5491 5489 /*
5492 5490 * Lock constituent pages if this is large page
5493 5491 */
5494 5492 if (pages > 1) {
5495 5493 /*
5496 5494 * Lock all constituents except root page, since it
5497 5495 * should be locked already.
5498 5496 */
5499 5497 for (; nlocked < pages; nlocked++) {
5500 5498 if (!page_trylock(pp, SE_EXCL)) {
5501 5499 break;
5502 5500 }
5503 5501 if (PP_ISFREE(pp) ||
5504 5502 pp->p_szc != pszc) {
5505 5503 /*
5506 5504 * hat_page_demote() raced in with us.
5507 5505 */
5508 5506 ASSERT(!IS_SWAPFSVP(curvp));
5509 5507 page_unlock(pp);
5510 5508 break;
5511 5509 }
5512 5510 pp++;
5513 5511 }
5514 5512 }
5515 5513
5516 5514 /*
5517 5515 * If all constituent pages couldn't be locked,
5518 5516 * unlock pages locked so far and skip to next page.
5519 5517 */
5520 5518 if (nlocked < pages) {
5521 5519 while (pp0 < pp) {
5522 5520 page_unlock(pp0++);
5523 5521 }
5524 5522 va += pgsz;
5525 5523 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5526 5524 btop(pgsz));
5527 5525 continue;
5528 5526 }
5529 5527
5530 5528 /*
5531 5529 * hat_page_demote() can no longer happen
5532 5530 		 * since the last cons page had the right p_szc after
5533 5531 		 * all cons pages were locked. All cons pages
5534 5532 * should now have the same p_szc.
5535 5533 */
5536 5534
5537 5535 /*
5538 5536 * All constituent pages locked successfully, so mark
5539 5537 * large page for migration and unload the mappings of
5540 5538 * constituent pages, so a fault will occur on any part of the
5541 5539 * large page
5542 5540 */
5543 5541 PP_SETMIGRATE(pp0);
5544 5542 while (pp0 < pp) {
5545 5543 (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD);
5546 5544 ASSERT(hat_page_getshare(pp0) == 0);
5547 5545 page_unlock(pp0++);
5548 5546 }
5549 5547 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5550 5548
5551 5549 va += pgsz;
5552 5550 }
5553 5551 }
5554 5552
5555 5553 /*
5556 5554 * Migrate any pages that have been marked for migration in the given range
5557 5555 */
5558 5556 void
5559 5557 page_migrate(
5560 5558 struct seg *seg,
5561 5559 caddr_t addr,
5562 5560 page_t **ppa,
5563 5561 pgcnt_t npages)
5564 5562 {
5565 5563 lgrp_t *from;
5566 5564 lgrp_t *to;
5567 5565 page_t *newpp;
5568 5566 page_t *pp;
5569 5567 pfn_t pfn;
5570 5568 size_t pgsz;
5571 5569 spgcnt_t page_cnt;
5572 5570 spgcnt_t i;
5573 5571 uint_t pszc;
5574 5572
5575 5573 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5576 5574
5577 5575 while (npages > 0) {
5578 5576 pp = *ppa;
5579 5577 pszc = pp->p_szc;
5580 5578 pgsz = page_get_pagesize(pszc);
5581 5579 page_cnt = btop(pgsz);
5582 5580
5583 5581 /*
5584 5582 * Check to see whether this page is marked for migration
5585 5583 *
5586 5584 		 * Assume that the root page of a large page is marked for
5587 5585 * migration and none of the other constituent pages
5588 5586 * are marked. This really simplifies clearing the
5589 5587 * migrate bit by not having to clear it from each
5590 5588 * constituent page.
5591 5589 *
5592 5590 * note we don't want to relocate an entire large page if
5593 5591 * someone is only using one subpage.
5594 5592 */
5595 5593 if (npages < page_cnt)
5596 5594 break;
5597 5595
5598 5596 /*
5599 5597 * Is it marked for migration?
5600 5598 */
5601 5599 if (!PP_ISMIGRATE(pp))
5602 5600 goto next;
5603 5601
5604 5602 /*
5605 5603 * Determine lgroups that page is being migrated between
5606 5604 */
5607 5605 pfn = page_pptonum(pp);
5608 5606 if (!IS_P2ALIGNED(pfn, page_cnt)) {
5609 5607 break;
5610 5608 }
5611 5609 from = lgrp_pfn_to_lgrp(pfn);
5612 5610 to = lgrp_mem_choose(seg, addr, pgsz);
5613 5611
5614 5612 /*
5615 5613 		 * Need to get exclusive locks to migrate
5616 5614 */
5617 5615 for (i = 0; i < page_cnt; i++) {
5618 5616 ASSERT(PAGE_LOCKED(ppa[i]));
5619 5617 if (page_pptonum(ppa[i]) != pfn + i ||
5620 5618 ppa[i]->p_szc != pszc) {
5621 5619 break;
5622 5620 }
5623 5621 if (!page_tryupgrade(ppa[i])) {
5624 5622 lgrp_stat_add(from->lgrp_id,
5625 5623 LGRP_PM_FAIL_LOCK_PGS,
5626 5624 page_cnt);
5627 5625 break;
5628 5626 }
5629 5627
5630 5628 /*
5631 5629 * Check to see whether we are trying to migrate
5632 5630 * page to lgroup where it is allocated already.
5633 5631 * If so, clear the migrate bit and skip to next
5634 5632 * page.
5635 5633 */
5636 5634 if (i == 0 && to == from) {
5637 5635 PP_CLRMIGRATE(ppa[0]);
5638 5636 page_downgrade(ppa[0]);
5639 5637 goto next;
5640 5638 }
5641 5639 }
5642 5640
5643 5641 /*
5644 5642 * If all constituent pages couldn't be locked,
5645 5643 * unlock pages locked so far and skip to next page.
5646 5644 */
5647 5645 if (i != page_cnt) {
5648 5646 while (--i != -1) {
5649 5647 page_downgrade(ppa[i]);
5650 5648 }
5651 5649 goto next;
5652 5650 }
5653 5651
5654 5652 (void) page_create_wait(page_cnt, PG_WAIT);
5655 5653 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5656 5654 if (newpp == NULL) {
5657 5655 page_create_putback(page_cnt);
5658 5656 for (i = 0; i < page_cnt; i++) {
5659 5657 page_downgrade(ppa[i]);
5660 5658 }
5661 5659 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5662 5660 page_cnt);
5663 5661 goto next;
5664 5662 }
5665 5663 ASSERT(newpp->p_szc == pszc);
5666 5664 /*
5667 5665 * Clear migrate bit and relocate page
5668 5666 */
5669 5667 PP_CLRMIGRATE(pp);
5670 5668 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5671 5669 panic("page_migrate: page_relocate failed");
5672 5670 }
5673 5671 ASSERT(page_cnt * PAGESIZE == pgsz);
5674 5672
5675 5673 /*
5676 5674 * Keep stats for number of pages migrated from and to
5677 5675 * each lgroup
5678 5676 */
5679 5677 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5680 5678 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5681 5679 /*
5682 5680 * update the page_t array we were passed in and
5683 5681 * unlink constituent pages of a large page.
5684 5682 */
5685 5683 for (i = 0; i < page_cnt; ++i, ++pp) {
5686 5684 ASSERT(PAGE_EXCL(newpp));
5687 5685 ASSERT(newpp->p_szc == pszc);
5688 5686 ppa[i] = newpp;
5689 5687 pp = newpp;
5690 5688 page_sub(&newpp, pp);
5691 5689 page_downgrade(pp);
5692 5690 }
5693 5691 ASSERT(newpp == NULL);
5694 5692 next:
5695 5693 addr += pgsz;
5696 5694 ppa += page_cnt;
5697 5695 npages -= page_cnt;
5698 5696 }
5699 5697 }
5700 5698
5701 5699 #define MAX_CNT 60 /* max num of iterations */
5702 5700 /*
5703 5701 * Reclaim/reserve availrmem for npages.
5704 5702 * If there is not enough memory start reaping seg, kmem caches.
5705 5703 * Start pageout scanner (via page_needfree()).
5706 5704  * Exit after ~MAX_CNT seconds regardless of how much memory has been released.
5707 5705 * Note: There is no guarantee that any availrmem will be freed as
5708 5706 * this memory typically is locked (kernel heap) or reserved for swap.
5709 5707  * Also, due to memory fragmentation, the kmem allocator may not be able
5710 5708  * to free any memory (a single user-allocated buffer will prevent
5711 5709  * freeing a slab or a page).
5712 5710 */
5713 5711 int
5714 5712 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5715 5713 {
5716 5714 int i = 0;
5717 5715 int ret = 0;
5718 5716 pgcnt_t deficit;
5719 5717 pgcnt_t old_availrmem;
5720 5718
5721 5719 mutex_enter(&freemem_lock);
5722 5720 old_availrmem = availrmem - 1;
5723 5721 while ((availrmem < tune.t_minarmem + npages + epages) &&
5724 5722 (old_availrmem < availrmem) && (i++ < MAX_CNT)) {
5725 5723 old_availrmem = availrmem;
5726 5724 deficit = tune.t_minarmem + npages + epages - availrmem;
5727 5725 mutex_exit(&freemem_lock);
5728 5726 page_needfree(deficit);
5729 5727 kmem_reap();
5730 5728 delay(hz);
5731 5729 page_needfree(-(spgcnt_t)deficit);
5732 5730 mutex_enter(&freemem_lock);
5733 5731 }
5734 5732
5735 5733 if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5736 5734 availrmem -= npages;
5737 5735 ret = 1;
5738 5736 }
5739 5737
5740 5738 mutex_exit(&freemem_lock);
5741 5739
5742 5740 return (ret);
5743 5741 }
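
/*
 * Hypothetical usage sketch (an assumption for illustration; not a caller in
 * this file): reserve npages of availrmem before locking memory down, and
 * return the reservation under freemem_lock when the memory is released.
 *
 *	if (!page_reclaim_mem(npages, 0, 1))
 *		return (ENOMEM);
 *	...
 *	mutex_enter(&freemem_lock);
 *	availrmem += npages;
 *	mutex_exit(&freemem_lock);
 *
 * With adjust set, a successful call has already subtracted npages from
 * availrmem, so the caller is responsible for giving it back.
 */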
5744 5742
5745 5743 /*
5746 5744 * Search the memory segments to locate the desired page. Within a
5747 5745 * segment, pages increase linearly with one page structure per
5748 5746 * physical page frame (size PAGESIZE). The search begins
5749 5747 * with the segment that was accessed last, to take advantage of locality.
5750 5748 * If the hint misses, we start from the beginning of the sorted memseg list
5751 5749  * If the hint misses, we start from the beginning of the sorted memseg list.
5752 5750
5753 5751
5754 5752 /*
5755 5753 * Some data structures for pfn to pp lookup.
5756 5754 */
5757 5755 ulong_t mhash_per_slot;
5758 5756 struct memseg *memseg_hash[N_MEM_SLOTS];
5759 5757
5760 5758 page_t *
5761 5759 page_numtopp_nolock(pfn_t pfnum)
5762 5760 {
5763 5761 struct memseg *seg;
5764 5762 page_t *pp;
5765 5763 vm_cpu_data_t *vc;
5766 5764
5767 5765 /*
5768 5766 * We need to disable kernel preemption while referencing the
5769 5767 * cpu_vm_data field in order to prevent us from being switched to
5770 5768 * another cpu and trying to reference it after it has been freed.
5771 5769 * This will keep us on cpu and prevent it from being removed while
5772 5770 * we are still on it.
5773 5771 *
5774 5772 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5775 5773 	 * which is being reused by DR, which will flush those references
5776 5774 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5777 5775 */
5778 5776 kpreempt_disable();
5779 5777 vc = CPU->cpu_vm_data;
5780 5778 ASSERT(vc != NULL);
5781 5779
5782 5780 MEMSEG_STAT_INCR(nsearch);
5783 5781
5784 5782 /* Try last winner first */
5785 5783 if (((seg = vc->vc_pnum_memseg) != NULL) &&
5786 5784 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5787 5785 MEMSEG_STAT_INCR(nlastwon);
5788 5786 pp = seg->pages + (pfnum - seg->pages_base);
5789 5787 if (pp->p_pagenum == pfnum) {
5790 5788 kpreempt_enable();
5791 5789 return ((page_t *)pp);
5792 5790 }
5793 5791 }
5794 5792
5795 5793 /* Else Try hash */
5796 5794 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5797 5795 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5798 5796 MEMSEG_STAT_INCR(nhashwon);
5799 5797 vc->vc_pnum_memseg = seg;
5800 5798 pp = seg->pages + (pfnum - seg->pages_base);
5801 5799 if (pp->p_pagenum == pfnum) {
5802 5800 kpreempt_enable();
5803 5801 return ((page_t *)pp);
5804 5802 }
5805 5803 }
5806 5804
5807 5805 /* Else Brute force */
5808 5806 for (seg = memsegs; seg != NULL; seg = seg->next) {
5809 5807 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5810 5808 vc->vc_pnum_memseg = seg;
5811 5809 pp = seg->pages + (pfnum - seg->pages_base);
5812 5810 if (pp->p_pagenum == pfnum) {
5813 5811 kpreempt_enable();
5814 5812 return ((page_t *)pp);
5815 5813 }
5816 5814 }
5817 5815 }
5818 5816 vc->vc_pnum_memseg = NULL;
5819 5817 kpreempt_enable();
5820 5818 MEMSEG_STAT_INCR(nnotfound);
5821 5819 return ((page_t *)NULL);
5822 5820
5823 5821 }
5824 5822
5825 5823 struct memseg *
5826 5824 page_numtomemseg_nolock(pfn_t pfnum)
5827 5825 {
5828 5826 struct memseg *seg;
5829 5827 page_t *pp;
5830 5828
5831 5829 /*
5832 5830 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5833 5831 	 * which is being reused by DR, which will flush those references
5834 5832 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5835 5833 */
5836 5834 kpreempt_disable();
5837 5835 /* Try hash */
5838 5836 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5839 5837 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5840 5838 pp = seg->pages + (pfnum - seg->pages_base);
5841 5839 if (pp->p_pagenum == pfnum) {
5842 5840 kpreempt_enable();
5843 5841 return (seg);
5844 5842 }
5845 5843 }
5846 5844
5847 5845 /* Else Brute force */
5848 5846 for (seg = memsegs; seg != NULL; seg = seg->next) {
5849 5847 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5850 5848 pp = seg->pages + (pfnum - seg->pages_base);
5851 5849 if (pp->p_pagenum == pfnum) {
5852 5850 kpreempt_enable();
5853 5851 return (seg);
5854 5852 }
5855 5853 }
5856 5854 }
5857 5855 kpreempt_enable();
5858 5856 return ((struct memseg *)NULL);
5859 5857 }
5860 5858
5861 5859 /*
5862 5860 * Given a page and a count return the page struct that is
5863 5861 * n structs away from the current one in the global page
5864 5862 * list.
5865 5863 *
5866 5864 * This function wraps to the first page upon
5867 5865 * reaching the end of the memseg list.
5868 5866 */
5869 5867 page_t *
5870 5868 page_nextn(page_t *pp, ulong_t n)
5871 5869 {
5872 5870 struct memseg *seg;
5873 5871 page_t *ppn;
5874 5872 vm_cpu_data_t *vc;
5875 5873
5876 5874 /*
5877 5875 * We need to disable kernel preemption while referencing the
5878 5876 * cpu_vm_data field in order to prevent us from being switched to
5879 5877 * another cpu and trying to reference it after it has been freed.
5880 5878 * This will keep us on cpu and prevent it from being removed while
5881 5879 * we are still on it.
5882 5880 *
5883 5881 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5884 5882 	 * which is being reused by DR, which will flush those references
5885 5883 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5886 5884 */
5887 5885 kpreempt_disable();
5888 5886 vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5889 5887
5890 5888 ASSERT(vc != NULL);
5891 5889
5892 5890 if (((seg = vc->vc_pnext_memseg) == NULL) ||
5893 5891 (seg->pages_base == seg->pages_end) ||
5894 5892 !(pp >= seg->pages && pp < seg->epages)) {
5895 5893
5896 5894 for (seg = memsegs; seg; seg = seg->next) {
5897 5895 if (pp >= seg->pages && pp < seg->epages)
5898 5896 break;
5899 5897 }
5900 5898
5901 5899 if (seg == NULL) {
5902 5900 /* Memory delete got in, return something valid. */
5903 5901 /* TODO: fix me. */
5904 5902 seg = memsegs;
5905 5903 pp = seg->pages;
5906 5904 }
5907 5905 }
5908 5906
5909 5907 /* check for wraparound - possible if n is large */
5910 5908 while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5911 5909 n -= seg->epages - pp;
5912 5910 seg = seg->next;
5913 5911 if (seg == NULL)
5914 5912 seg = memsegs;
5915 5913 pp = seg->pages;
5916 5914 }
5917 5915 vc->vc_pnext_memseg = seg;
5918 5916 kpreempt_enable();
5919 5917 return (ppn);
5920 5918 }
5921 5919
5922 5920 /*
5923 5921 * Initialize for a loop using page_next_scan_large().
5924 5922 */
5925 5923 page_t *
5926 5924 page_next_scan_init(void **cookie)
5927 5925 {
5928 5926 ASSERT(cookie != NULL);
5929 5927 *cookie = (void *)memsegs;
5930 5928 return ((page_t *)memsegs->pages);
5931 5929 }
5932 5930
5933 5931 /*
5934 5932 * Return the next page in a scan of page_t's, assuming we want
5935 5933 * to skip over sub-pages within larger page sizes.
5936 5934 *
5937 5935 * The cookie is used to keep track of the current memseg.
5938 5936 */
5939 5937 page_t *
5940 5938 page_next_scan_large(
5941 5939 page_t *pp,
5942 5940 ulong_t *n,
5943 5941 void **cookie)
5944 5942 {
5945 5943 struct memseg *seg = (struct memseg *)*cookie;
5946 5944 page_t *new_pp;
5947 5945 ulong_t cnt;
5948 5946 pfn_t pfn;
5949 5947
5950 5948
5951 5949 /*
5952 5950 * get the count of page_t's to skip based on the page size
5953 5951 */
5954 5952 ASSERT(pp != NULL);
5955 5953 if (pp->p_szc == 0) {
5956 5954 cnt = 1;
5957 5955 } else {
5958 5956 pfn = page_pptonum(pp);
5959 5957 cnt = page_get_pagecnt(pp->p_szc);
5960 5958 cnt -= pfn & (cnt - 1);
5961 5959 }
5962 5960 *n += cnt;
5963 5961 new_pp = pp + cnt;
5964 5962
5965 5963 /*
5966 5964 * Catch if we went past the end of the current memory segment. If so,
5967 5965 * just move to the next segment with pages.
5968 5966 */
5969 5967 if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) {
5970 5968 do {
5971 5969 seg = seg->next;
5972 5970 if (seg == NULL)
5973 5971 seg = memsegs;
5974 5972 } while (seg->pages_base == seg->pages_end);
5975 5973 new_pp = seg->pages;
5976 5974 *cookie = (void *)seg;
5977 5975 }
5978 5976
5979 5977 return (new_pp);
5980 5978 }
5981 5979
5982 5980
5983 5981 /*
5984 5982 * Returns next page in list. Note: this function wraps
5985 5983 * to the first page in the list upon reaching the end
5986 5984 * of the list. Callers should be aware of this fact.
5987 5985 */
5988 5986
5989 5987 /* We should change this to be a #define */
5990 5988
5991 5989 page_t *
5992 5990 page_next(page_t *pp)
5993 5991 {
5994 5992 return (page_nextn(pp, 1));
5995 5993 }
5996 5994
5997 5995 page_t *
5998 5996 page_first()
5999 5997 {
6000 5998 return ((page_t *)memsegs->pages);
6001 5999 }
6002 6000
6003 6001
6004 6002 /*
6005 6003 * This routine is called at boot with the initial memory configuration
6006 6004 * and when memory is added or removed.
6007 6005 */
6008 6006 void
6009 6007 build_pfn_hash()
6010 6008 {
6011 6009 pfn_t cur;
6012 6010 pgcnt_t index;
6013 6011 struct memseg *pseg;
6014 6012 int i;
6015 6013
6016 6014 /*
6017 6015 * Clear memseg_hash array.
6018 6016 * Since memory add/delete is designed to operate concurrently
6019 6017 * with normal operation, the hash rebuild must be able to run
6020 6018 * concurrently with page_numtopp_nolock(). To support this
6021 6019 * functionality, assignments to memseg_hash array members must
6022 6020 * be done atomically.
6023 6021 *
6024 6022 * NOTE: bzero() does not currently guarantee this for kernel
6025 6023 * threads, and cannot be used here.
6026 6024 */
6027 6025 for (i = 0; i < N_MEM_SLOTS; i++)
6028 6026 memseg_hash[i] = NULL;
6029 6027
6030 6028 hat_kpm_mseghash_clear(N_MEM_SLOTS);
6031 6029
6032 6030 /*
6033 6031 * Physmax is the last valid pfn.
6034 6032 */
6035 6033 mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6036 6034 for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6037 6035 index = MEMSEG_PFN_HASH(pseg->pages_base);
6038 6036 cur = pseg->pages_base;
6039 6037 do {
6040 6038 if (index >= N_MEM_SLOTS)
6041 6039 index = MEMSEG_PFN_HASH(cur);
6042 6040
6043 6041 if (memseg_hash[index] == NULL ||
6044 6042 memseg_hash[index]->pages_base > pseg->pages_base) {
6045 6043 memseg_hash[index] = pseg;
6046 6044 hat_kpm_mseghash_update(index, pseg);
6047 6045 }
6048 6046 cur += mhash_per_slot;
6049 6047 index++;
6050 6048 } while (cur < pseg->pages_end);
6051 6049 }
6052 6050 }
6053 6051
6054 6052 /*
6055 6053 * Return the pagenum for the pp
6056 6054 */
6057 6055 pfn_t
6058 6056 page_pptonum(page_t *pp)
6059 6057 {
6060 6058 return (pp->p_pagenum);
6061 6059 }
6062 6060
6063 6061 /*
6064 6062 * interface to the referenced and modified etc bits
6065 6063 * in the PSM part of the page struct
6066 6064 * when no locking is desired.
6067 6065 */
6068 6066 void
6069 6067 page_set_props(page_t *pp, uint_t flags)
6070 6068 {
6071 6069 ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6072 6070 pp->p_nrm |= (uchar_t)flags;
6073 6071 }
6074 6072
6075 6073 void
6076 6074 page_clr_all_props(page_t *pp)
6077 6075 {
6078 6076 pp->p_nrm = 0;
6079 6077 }
6080 6078
6081 6079 /*
6082 6080 * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6083 6081 */
6084 6082 int
6085 6083 page_clear_lck_cow(page_t *pp, int adjust)
6086 6084 {
6087 6085 int f_amount;
6088 6086
6089 6087 ASSERT(PAGE_EXCL(pp));
6090 6088
6091 6089 /*
6092 6090 * The page_struct_lock need not be acquired here since
6093 6091 * we require the caller hold the page exclusively locked.
6094 6092 */
6095 6093 f_amount = 0;
6096 6094 if (pp->p_lckcnt) {
6097 6095 f_amount = 1;
6098 6096 pp->p_lckcnt = 0;
6099 6097 }
6100 6098 if (pp->p_cowcnt) {
6101 6099 f_amount += pp->p_cowcnt;
6102 6100 pp->p_cowcnt = 0;
6103 6101 }
6104 6102
6105 6103 if (adjust && f_amount) {
6106 6104 mutex_enter(&freemem_lock);
6107 6105 availrmem += f_amount;
6108 6106 mutex_exit(&freemem_lock);
6109 6107 }
6110 6108
6111 6109 return (f_amount);
6112 6110 }
6113 6111
6114 6112 /*
6115 6113  * The following function is called from free_vp_pages()
6116 6114 * for an inexact estimate of a newly free'd page...
6117 6115 */
6118 6116 ulong_t
6119 6117 page_share_cnt(page_t *pp)
6120 6118 {
6121 6119 return (hat_page_getshare(pp));
6122 6120 }
6123 6121
6124 6122 int
6125 6123 page_isshared(page_t *pp)
6126 6124 {
6127 6125 return (hat_page_checkshare(pp, 1));
6128 6126 }
6129 6127
6130 6128 int
6131 6129 page_isfree(page_t *pp)
6132 6130 {
6133 6131 return (PP_ISFREE(pp));
6134 6132 }
6135 6133
6136 6134 int
6137 6135 page_isref(page_t *pp)
6138 6136 {
6139 6137 return (hat_page_getattr(pp, P_REF));
6140 6138 }
6141 6139
6142 6140 int
6143 6141 page_ismod(page_t *pp)
6144 6142 {
6145 6143 return (hat_page_getattr(pp, P_MOD));
6146 6144 }
6147 6145
6148 6146 /*
6149 6147 * The following code all currently relates to the page capture logic:
6150 6148 *
6151 6149 * This logic is used for cases where there is a desire to claim a certain
6152 6150 * physical page in the system for the caller. As it may not be possible
6153 6151 * to capture the page immediately, the p_toxic bits are used in the page
6154 6152 * structure to indicate that someone wants to capture this page. When the
6155 6153 * page gets unlocked, the toxic flag will be noted and an attempt to capture
6156 6154  * the page will be made. If it is successful, the original caller's callback
6157 6155 * will be called with the page to do with it what they please.
6158 6156 *
6159 6157  * There is also an async thread which wakes up occasionally to attempt to
6160 6158  * capture pages which have the capture bit set. All of the pages which
6161 6159 * need to be captured asynchronously have been inserted into the
6162 6160 * page_capture_hash and thus this thread walks that hash list. Items in the
6163 6161 * hash have an expiration time so this thread handles that as well by removing
6164 6162 * the item from the hash if it has expired.
6165 6163 *
6166 6164 * Some important things to note are:
6167 6165 * - if the PR_CAPTURE bit is set on a page, then the page is in the
6168 6166 * page_capture_hash. The page_capture_hash_head.pchh_mutex is needed
6169 6167  *   to set and clear this bit, and only while the lock is held can you
6170 6168  *   add or remove an entry from the hash.
6171 6169 * - the PR_CAPTURE bit can only be set and cleared while holding the
6172 6170 * page_capture_hash_head.pchh_mutex
6173 6171 * - the t_flag field of the thread struct is used with the T_CAPTURING
6174 6172 * flag to prevent recursion while dealing with large pages.
6175 6173 * - pages which need to be retired never expire on the page_capture_hash.
6176 6174 */
6177 6175
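/*
 * In outline (a reading of the code below, for orientation):
 * page_itrycapture() tries to take the page immediately; if it cannot get
 * the EXCL lock, the request is queued via page_capture_add_hash() and
 * retried later by the async page_capture_thread(). A page that is
 * successfully locked is cleaned up by page_capture_clean_page() and then
 * handed to the registered callback by page_capture_take_action().
 */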
6178 6176 static void page_capture_thread(void);
6179 6177 static kthread_t *pc_thread_id;
6180 6178 kcondvar_t pc_cv;
6181 6179 static kmutex_t pc_thread_mutex;
6182 6180 static clock_t pc_thread_shortwait;
6183 6181 static clock_t pc_thread_longwait;
6184 6182 static int pc_thread_retry;
6185 6183
6186 6184 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
6187 6185
6188 6186 /* Note that this is a circular linked list */
6189 6187 typedef struct page_capture_hash_bucket {
6190 6188 page_t *pp;
6191 6189 uchar_t szc;
6192 6190 uchar_t pri;
6193 6191 uint_t flags;
6194 6192 clock_t expires; /* lbolt at which this request expires. */
6195 6193 void *datap; /* Cached data passed in for callback */
6196 6194 struct page_capture_hash_bucket *next;
6197 6195 struct page_capture_hash_bucket *prev;
6198 6196 } page_capture_hash_bucket_t;
6199 6197
6200 6198 #define PC_PRI_HI 0 /* capture now */
6201 6199 #define PC_PRI_LO 1 /* capture later */
6202 6200 #define PC_NUM_PRI 2
6203 6201
6204 6202 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI)
6205 6203
6206 6204
6207 6205 /*
6208 6206  * Each hash bucket will have its own mutex and two lists which are:
6209 6207 * active (0): represents requests which have not been processed by
6210 6208 * the page_capture async thread yet.
6211 6209 * walked (1): represents requests which have been processed by the
6212 6210  *   page_capture async thread within its given walk of this bucket.
6213 6211 *
6214 6212 * These are all needed so that we can synchronize all async page_capture
6215 6213 * events. When the async thread moves to a new bucket, it will append the
6216 6214 * walked list to the active list and walk each item one at a time, moving it
6217 6215 * from the active list to the walked list. Thus if there is an async request
6218 6216 * outstanding for a given page, it will always be in one of the two lists.
6219 6217 * New requests will always be added to the active list.
6220 6218 * If we were not able to capture a page before the request expired, we'd free
6221 6219 * up the request structure which would indicate to page_capture that there is
6222 6220 * no longer a need for the given page, and clear the PR_CAPTURE flag if
6223 6221 * possible.
6224 6222 */
6225 6223 typedef struct page_capture_hash_head {
6226 6224 kmutex_t pchh_mutex;
6227 6225 uint_t num_pages[PC_NUM_PRI];
6228 6226 page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
6229 6227 } page_capture_hash_head_t;
6230 6228
6231 6229 #ifdef DEBUG
6232 6230 #define NUM_PAGE_CAPTURE_BUCKETS 4
6233 6231 #else
6234 6232 #define NUM_PAGE_CAPTURE_BUCKETS 64
6235 6233 #endif
6236 6234
6237 6235 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
6238 6236
6239 6237 /* for now use a very simple hash based upon the size of a page struct */
6240 6238 #define PAGE_CAPTURE_HASH(pp) \
6241 6239 ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
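/*
 * (The shift by 7 above strides by roughly sizeof (page_t) -- an
 * approximation, as the comment above notes -- so consecutive page
 * structures tend to hash to different buckets.)
 */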
6242 6240
6243 6241 extern pgcnt_t swapfs_minfree;
6244 6242
6245 6243 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
6246 6244
6247 6245 /*
6248 6246 * a callback function is required for page capture requests.
6249 6247 */
6250 6248 void
6251 6249 page_capture_register_callback(uint_t index, clock_t duration,
6252 6250 int (*cb_func)(page_t *, void *, uint_t))
6253 6251 {
6254 6252 ASSERT(pc_cb[index].cb_active == 0);
6255 6253 ASSERT(cb_func != NULL);
6256 6254 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6257 6255 pc_cb[index].duration = duration;
6258 6256 pc_cb[index].cb_func = cb_func;
6259 6257 pc_cb[index].cb_active = 1;
6260 6258 rw_exit(&pc_cb[index].cb_rwlock);
6261 6259 }
6262 6260
6263 6261 void
6264 6262 page_capture_unregister_callback(uint_t index)
6265 6263 {
6266 6264 int i, j;
6267 6265 struct page_capture_hash_bucket *bp1;
6268 6266 struct page_capture_hash_bucket *bp2;
6269 6267 struct page_capture_hash_bucket *head = NULL;
6270 6268 uint_t flags = (1 << index);
6271 6269
6272 6270 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6273 6271 ASSERT(pc_cb[index].cb_active == 1);
6274 6272 pc_cb[index].duration = 0; /* Paranoia */
6275 6273 pc_cb[index].cb_func = NULL; /* Paranoia */
6276 6274 pc_cb[index].cb_active = 0;
6277 6275 rw_exit(&pc_cb[index].cb_rwlock);
6278 6276
6279 6277 /*
6280 6278 * Just move all the entries to a private list which we can walk
6281 6279 * through without the need to hold any locks.
6282 6280 * No more requests can get added to the hash lists for this consumer
6283 6281 * as the cb_active field for the callback has been cleared.
6284 6282 */
6285 6283 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6286 6284 mutex_enter(&page_capture_hash[i].pchh_mutex);
6287 6285 for (j = 0; j < 2; j++) {
6288 6286 bp1 = page_capture_hash[i].lists[j].next;
6289 6287 /* walk through all but first (sentinel) element */
6290 6288 while (bp1 != &page_capture_hash[i].lists[j]) {
6291 6289 bp2 = bp1;
6292 6290 if (bp2->flags & flags) {
6293 6291 bp1 = bp2->next;
6294 6292 bp1->prev = bp2->prev;
6295 6293 bp2->prev->next = bp1;
6296 6294 bp2->next = head;
6297 6295 head = bp2;
6298 6296 /*
6299 6297 * Clear the PR_CAPTURE bit as we
6300 6298 * hold appropriate locks here.
6301 6299 */
6302 6300 page_clrtoxic(head->pp, PR_CAPTURE);
6303 6301 page_capture_hash[i].
6304 6302 num_pages[bp2->pri]--;
6305 6303 continue;
6306 6304 }
6307 6305 bp1 = bp1->next;
6308 6306 }
6309 6307 }
6310 6308 mutex_exit(&page_capture_hash[i].pchh_mutex);
6311 6309 }
6312 6310
6313 6311 while (head != NULL) {
6314 6312 bp1 = head;
6315 6313 head = head->next;
6316 6314 kmem_free(bp1, sizeof (*bp1));
6317 6315 }
6318 6316 }
6319 6317
6320 6318
6321 6319 /*
6322 6320 * Find pp in the active list and move it to the walked list if it
6323 6321 * exists.
6324 6322 * Note that most often pp should be at the front of the active list
6325 6323 * as it is currently used and thus there is no other sort of optimization
6326 6324 * being done here as this is a linked list data structure.
6327 6325 * Returns 1 on successful move or 0 if page could not be found.
6328 6326 */
6329 6327 static int
6330 6328 page_capture_move_to_walked(page_t *pp)
6331 6329 {
6332 6330 page_capture_hash_bucket_t *bp;
6333 6331 int index;
6334 6332
6335 6333 index = PAGE_CAPTURE_HASH(pp);
6336 6334
6337 6335 mutex_enter(&page_capture_hash[index].pchh_mutex);
6338 6336 bp = page_capture_hash[index].lists[0].next;
6339 6337 while (bp != &page_capture_hash[index].lists[0]) {
6340 6338 if (bp->pp == pp) {
6341 6339 /* Remove from old list */
6342 6340 bp->next->prev = bp->prev;
6343 6341 bp->prev->next = bp->next;
6344 6342
6345 6343 /* Add to new list */
6346 6344 bp->next = page_capture_hash[index].lists[1].next;
6347 6345 bp->prev = &page_capture_hash[index].lists[1];
6348 6346 page_capture_hash[index].lists[1].next = bp;
6349 6347 bp->next->prev = bp;
6350 6348
6351 6349 /*
6352 6350 			 * There is a small probability of a page on a free
6353 6351 			 * list being retired while it is being allocated,
6354 6352 			 * before P_RAF is set on it. The page may
6355 6353 			 * end up marked as a high priority request instead
6356 6354 			 * of a low priority request.
6357 6355 			 * If a P_RAF page is not marked as a low priority
6358 6356 			 * request, change it to low priority.
6359 6357 */
6360 6358 page_capture_hash[index].num_pages[bp->pri]--;
6361 6359 bp->pri = PAGE_CAPTURE_PRIO(pp);
6362 6360 page_capture_hash[index].num_pages[bp->pri]++;
6363 6361 mutex_exit(&page_capture_hash[index].pchh_mutex);
6364 6362 return (1);
6365 6363 }
6366 6364 bp = bp->next;
6367 6365 }
6368 6366 mutex_exit(&page_capture_hash[index].pchh_mutex);
6369 6367 return (0);
6370 6368 }
6371 6369
6372 6370 /*
6373 6371 * Add a new entry to the page capture hash. The only case where a new
6374 6372 * entry is not added is when the page capture consumer is no longer registered.
6375 6373 * In this case, we'll silently not add the page to the hash. We know that
6376 6374 * page retire will always be registered for the case where we are currently
6377 6375 * unretiring a page and thus there are no conflicts.
6378 6376 */
6379 6377 static void
6380 6378 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
6381 6379 {
6382 6380 page_capture_hash_bucket_t *bp1;
6383 6381 page_capture_hash_bucket_t *bp2;
6384 6382 int index;
6385 6383 int cb_index;
6386 6384 int i;
6387 6385 uchar_t pri;
6388 6386 #ifdef DEBUG
6389 6387 page_capture_hash_bucket_t *tp1;
6390 6388 int l;
6391 6389 #endif
6392 6390
6393 6391 ASSERT(!(flags & CAPTURE_ASYNC));
6394 6392
6395 6393 bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
6396 6394
6397 6395 bp1->pp = pp;
6398 6396 bp1->szc = szc;
6399 6397 bp1->flags = flags;
6400 6398 bp1->datap = datap;
6401 6399
6402 6400 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6403 6401 if ((flags >> cb_index) & 1) {
6404 6402 break;
6405 6403 }
6406 6404 }
6407 6405
6408 6406 ASSERT(cb_index != PC_NUM_CALLBACKS);
6409 6407
6410 6408 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6411 6409 if (pc_cb[cb_index].cb_active) {
6412 6410 if (pc_cb[cb_index].duration == -1) {
6413 6411 bp1->expires = (clock_t)-1;
6414 6412 } else {
6415 6413 bp1->expires = ddi_get_lbolt() +
6416 6414 pc_cb[cb_index].duration;
6417 6415 }
6418 6416 } else {
6419 6417 /* There's no callback registered so don't add to the hash */
6420 6418 rw_exit(&pc_cb[cb_index].cb_rwlock);
6421 6419 kmem_free(bp1, sizeof (*bp1));
6422 6420 return;
6423 6421 }
6424 6422
6425 6423 index = PAGE_CAPTURE_HASH(pp);
6426 6424
6427 6425 /*
6428 6426 * Only allow capture flag to be modified under this mutex.
6429 6427 * Prevents multiple entries for same page getting added.
6430 6428 */
6431 6429 mutex_enter(&page_capture_hash[index].pchh_mutex);
6432 6430
6433 6431 /*
6434 6432 * if not already on the hash, set capture bit and add to the hash
6435 6433 */
6436 6434 if (!(pp->p_toxic & PR_CAPTURE)) {
6437 6435 #ifdef DEBUG
6438 6436 /* Check for duplicate entries */
6439 6437 for (l = 0; l < 2; l++) {
6440 6438 tp1 = page_capture_hash[index].lists[l].next;
6441 6439 while (tp1 != &page_capture_hash[index].lists[l]) {
6442 6440 if (tp1->pp == pp) {
6443 6441 panic("page pp 0x%p already on hash "
6444 6442 "at 0x%p\n",
6445 6443 (void *)pp, (void *)tp1);
6446 6444 }
6447 6445 tp1 = tp1->next;
6448 6446 }
6449 6447 }
6450 6448
6451 6449 #endif
6452 6450 page_settoxic(pp, PR_CAPTURE);
6453 6451 pri = PAGE_CAPTURE_PRIO(pp);
6454 6452 bp1->pri = pri;
6455 6453 bp1->next = page_capture_hash[index].lists[0].next;
6456 6454 bp1->prev = &page_capture_hash[index].lists[0];
6457 6455 bp1->next->prev = bp1;
6458 6456 page_capture_hash[index].lists[0].next = bp1;
6459 6457 page_capture_hash[index].num_pages[pri]++;
6460 6458 if (flags & CAPTURE_RETIRE) {
6461 6459 page_retire_incr_pend_count(datap);
6462 6460 }
6463 6461 mutex_exit(&page_capture_hash[index].pchh_mutex);
6464 6462 rw_exit(&pc_cb[cb_index].cb_rwlock);
6465 6463 cv_signal(&pc_cv);
6466 6464 return;
6467 6465 }
6468 6466
6469 6467 /*
6470 6468 * A page retire request will replace any other request.
6471 6469 * A second physmem request which is for a different process than
6472 6470 * the currently registered one will be dropped as there is
6473 6471 * no way to hold the private data for both calls.
6474 6472 * In the future, once there are more callers, this will have to
6475 6473 * be worked out better as there needs to be private storage for
6476 6474 * at least each type of caller (maybe have datap be an array of
6477 6475 * *void's so that we can index based upon callers index).
6478 6476 */
6479 6477
6480 6478 /* walk hash list to update expire time */
6481 6479 for (i = 0; i < 2; i++) {
6482 6480 bp2 = page_capture_hash[index].lists[i].next;
6483 6481 while (bp2 != &page_capture_hash[index].lists[i]) {
6484 6482 if (bp2->pp == pp) {
6485 6483 if (flags & CAPTURE_RETIRE) {
6486 6484 if (!(bp2->flags & CAPTURE_RETIRE)) {
6487 6485 page_retire_incr_pend_count(
6488 6486 datap);
6489 6487 bp2->flags = flags;
6490 6488 bp2->expires = bp1->expires;
6491 6489 bp2->datap = datap;
6492 6490 }
6493 6491 } else {
6494 6492 ASSERT(flags & CAPTURE_PHYSMEM);
6495 6493 if (!(bp2->flags & CAPTURE_RETIRE) &&
6496 6494 (datap == bp2->datap)) {
6497 6495 bp2->expires = bp1->expires;
6498 6496 }
6499 6497 }
6500 6498 mutex_exit(&page_capture_hash[index].
6501 6499 pchh_mutex);
6502 6500 rw_exit(&pc_cb[cb_index].cb_rwlock);
6503 6501 kmem_free(bp1, sizeof (*bp1));
6504 6502 return;
6505 6503 }
6506 6504 bp2 = bp2->next;
6507 6505 }
6508 6506 }
6509 6507
6510 6508 /*
6511 6509 * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
6512 6510 * and thus it either has to be set or not set and can't change
6513 6511 * while holding the mutex above.
6514 6512 */
6515 6513 panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n",
6516 6514 (void *)pp);
6517 6515 }
6518 6516
6519 6517 /*
6520 6518  * We have a page in our hands, let's try and make it ours by turning
6521 6519 * it into a clean page like it had just come off the freelists.
6522 6520 *
6523 6521 * Returns 0 on success, with the page still EXCL locked.
6524 6522  * On failure, the page will be unlocked and EAGAIN is returned.
6525 6523 */
6526 6524 static int
6527 6525 page_capture_clean_page(page_t *pp)
6528 6526 {
6529 6527 page_t *newpp;
6530 6528 int skip_unlock = 0;
6531 6529 spgcnt_t count;
6532 6530 page_t *tpp;
6533 6531 int ret = 0;
6534 6532 int extra;
6535 6533
6536 6534 ASSERT(PAGE_EXCL(pp));
6537 6535 ASSERT(!PP_RETIRED(pp));
6538 6536 ASSERT(curthread->t_flag & T_CAPTURING);
6539 6537
6540 6538 if (PP_ISFREE(pp)) {
6541 6539 if (!page_reclaim(pp, NULL)) {
6542 6540 skip_unlock = 1;
6543 6541 ret = EAGAIN;
6544 6542 goto cleanup;
6545 6543 }
6546 6544 ASSERT(pp->p_szc == 0);
6547 6545 if (pp->p_vnode != NULL) {
6548 6546 /*
6549 6547 * Since this page came from the
6550 6548 * cachelist, we must destroy the
6551 6549 * old vnode association.
6552 6550 */
6553 6551 page_hashout(pp, NULL);
6554 6552 }
6555 6553 goto cleanup;
6556 6554 }
6557 6555
6558 6556 /*
6559 6557 	 * If we know page_relocate will fail, skip it.
6560 6558 * It could still fail due to a UE on another page but we
6561 6559 * can't do anything about that.
6562 6560 */
6563 6561 if (pp->p_toxic & PR_UE) {
6564 6562 goto skip_relocate;
6565 6563 }
6566 6564
6567 6565 /*
6568 6566 * It's possible that pages can not have a vnode as fsflush comes
6569 6567 * through and cleans up these pages. It's ugly but that's how it is.
6570 6568 */
6571 6569 if (pp->p_vnode == NULL) {
6572 6570 goto skip_relocate;
6573 6571 }
6574 6572
6575 6573 /*
6576 6574 	 * Page was not free, so let's try to relocate it.
6577 6575 * page_relocate only works with root pages, so if this is not a root
6578 6576 * page, we need to demote it to try and relocate it.
6579 6577 * Unfortunately this is the best we can do right now.
6580 6578 */
6581 6579 newpp = NULL;
6582 6580 if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6583 6581 if (page_try_demote_pages(pp) == 0) {
6584 6582 ret = EAGAIN;
6585 6583 goto cleanup;
6586 6584 }
6587 6585 }
6588 6586 ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6589 6587 if (ret == 0) {
6590 6588 page_t *npp;
6591 6589 /* unlock the new page(s) */
6592 6590 while (count-- > 0) {
6593 6591 ASSERT(newpp != NULL);
6594 6592 npp = newpp;
6595 6593 page_sub(&newpp, npp);
6596 6594 page_unlock(npp);
6597 6595 }
6598 6596 ASSERT(newpp == NULL);
6599 6597 /*
6600 6598 * Check to see if the page we have is too large.
6601 6599 * If so, demote it freeing up the extra pages.
6602 6600 */
6603 6601 if (pp->p_szc > 0) {
6604 6602 /* For now demote extra pages to szc == 0 */
6605 6603 extra = page_get_pagecnt(pp->p_szc) - 1;
6606 6604 while (extra > 0) {
6607 6605 tpp = pp->p_next;
6608 6606 page_sub(&pp, tpp);
6609 6607 tpp->p_szc = 0;
6610 6608 page_free(tpp, 1);
6611 6609 extra--;
6612 6610 }
6613 6611 /* Make sure to set our page to szc 0 as well */
6614 6612 ASSERT(pp->p_next == pp && pp->p_prev == pp);
6615 6613 pp->p_szc = 0;
6616 6614 }
6617 6615 goto cleanup;
6618 6616 } else if (ret == EIO) {
6619 6617 ret = EAGAIN;
6620 6618 goto cleanup;
6621 6619 } else {
6622 6620 /*
6623 6621 * Need to reset return type as we failed to relocate the page
6624 6622 * but that does not mean that some of the next steps will not
6625 6623 * work.
6626 6624 */
6627 6625 ret = 0;
6628 6626 }
6629 6627
6630 6628 skip_relocate:
6631 6629
6632 6630 if (pp->p_szc > 0) {
6633 6631 if (page_try_demote_pages(pp) == 0) {
6634 6632 ret = EAGAIN;
6635 6633 goto cleanup;
6636 6634 }
6637 6635 }
6638 6636
6639 6637 ASSERT(pp->p_szc == 0);
6640 6638
6641 6639 if (hat_ismod(pp)) {
6642 6640 ret = EAGAIN;
6643 6641 goto cleanup;
6644 6642 }
6645 6643 if (PP_ISKAS(pp)) {
6646 6644 ret = EAGAIN;
6647 6645 goto cleanup;
6648 6646 }
6649 6647 if (pp->p_lckcnt || pp->p_cowcnt) {
6650 6648 ret = EAGAIN;
6651 6649 goto cleanup;
6652 6650 }
6653 6651
6654 6652 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6655 6653 ASSERT(!hat_page_is_mapped(pp));
6656 6654
6657 6655 if (hat_ismod(pp)) {
6658 6656 /*
6659 6657 * This is a semi-odd case as the page is now modified but not
6660 6658 * mapped as we just unloaded the mappings above.
6661 6659 */
6662 6660 ret = EAGAIN;
6663 6661 goto cleanup;
6664 6662 }
6665 6663 if (pp->p_vnode != NULL) {
6666 6664 page_hashout(pp, NULL);
6667 6665 }
6668 6666
6669 6667 /*
6670 6668 * At this point, the page should be in a clean state and
6671 6669 * we can do whatever we want with it.
6672 6670 */
6673 6671
6674 6672 cleanup:
6675 6673 if (ret != 0) {
6676 6674 if (!skip_unlock) {
6677 6675 page_unlock(pp);
6678 6676 }
6679 6677 } else {
6680 6678 ASSERT(pp->p_szc == 0);
6681 6679 ASSERT(PAGE_EXCL(pp));
6682 6680
6683 6681 pp->p_next = pp;
6684 6682 pp->p_prev = pp;
6685 6683 }
6686 6684 return (ret);
6687 6685 }
6688 6686
6689 6687 /*
6690 6688 * Various callers of page_trycapture() can have different restrictions upon
6691 6689 * what memory they have access to.
6692 6690 * Returns 0 on success, with the following error codes on failure:
6693 6691 * EPERM - The requested page is long term locked, and thus repeated
6694 6692 * requests to capture this page will likely fail.
6695 6693 * ENOMEM - There was not enough free memory in the system to safely
6696 6694 * map the requested page.
6697 6695 * ENOENT - The requested page was inside the kernel cage, and the
6698 6696 * PHYSMEM_CAGE flag was not set.
6699 6697 */
6700 6698 int
6701 6699 page_capture_pre_checks(page_t *pp, uint_t flags)
6702 6700 {
6703 6701 ASSERT(pp != NULL);
6704 6702
6705 6703 #if defined(__sparc)
6706 6704 if (pp->p_vnode == &promvp) {
6707 6705 return (EPERM);
6708 6706 }
6709 6707
6710 6708 if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
6711 6709 (flags & CAPTURE_PHYSMEM)) {
6712 6710 return (ENOENT);
6713 6711 }
6714 6712
6715 6713 if (PP_ISNORELOCKERNEL(pp)) {
6716 6714 return (EPERM);
6717 6715 }
6718 6716 #else
6719 6717 if (PP_ISKAS(pp)) {
6720 6718 return (EPERM);
6721 6719 }
6722 6720 #endif /* __sparc */
6723 6721
6724 6722 /* only physmem currently has the restrictions checked below */
6725 6723 if (!(flags & CAPTURE_PHYSMEM)) {
6726 6724 return (0);
6727 6725 }
6728 6726
6729 6727 if (availrmem < swapfs_minfree) {
6730 6728 /*
6731 6729 * We won't try to capture this page as we are
6732 6730 * running low on memory.
6733 6731 */
6734 6732 return (ENOMEM);
6735 6733 }
6736 6734 return (0);
6737 6735 }
6738 6736
6739 6737 /*
6740 6738  * Once we have a page in our mitts, go ahead and complete the capture
6741 6739 * operation.
6742 6740 * Returns 1 on failure where page is no longer needed
6743 6741 * Returns 0 on success
6744 6742 * Returns -1 if there was a transient failure.
6745 6743 * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6746 6744 */
6747 6745 int
6748 6746 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6749 6747 {
6750 6748 int cb_index;
6751 6749 int ret = 0;
6752 6750 page_capture_hash_bucket_t *bp1;
6753 6751 page_capture_hash_bucket_t *bp2;
6754 6752 int index;
6755 6753 int found = 0;
6756 6754 int i;
6757 6755
6758 6756 ASSERT(PAGE_EXCL(pp));
6759 6757 ASSERT(curthread->t_flag & T_CAPTURING);
6760 6758
6761 6759 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6762 6760 if ((flags >> cb_index) & 1) {
6763 6761 break;
6764 6762 }
6765 6763 }
6766 6764 ASSERT(cb_index < PC_NUM_CALLBACKS);
6767 6765
6768 6766 /*
6769 6767 * Remove the entry from the page_capture hash, but don't free it yet
6770 6768 * as we may need to put it back.
6771 6769 * Since we own the page at this point in time, we should find it
6772 6770 * in the hash if this is an ASYNC call. If we don't it's likely
6773 6771 * that the page_capture_async() thread decided that this request
6774 6772 * had expired, in which case we just continue on.
6775 6773 */
6776 6774 if (flags & CAPTURE_ASYNC) {
6777 6775
6778 6776 index = PAGE_CAPTURE_HASH(pp);
6779 6777
6780 6778 mutex_enter(&page_capture_hash[index].pchh_mutex);
6781 6779 for (i = 0; i < 2 && !found; i++) {
6782 6780 bp1 = page_capture_hash[index].lists[i].next;
6783 6781 while (bp1 != &page_capture_hash[index].lists[i]) {
6784 6782 if (bp1->pp == pp) {
6785 6783 bp1->next->prev = bp1->prev;
6786 6784 bp1->prev->next = bp1->next;
6787 6785 page_capture_hash[index].
6788 6786 num_pages[bp1->pri]--;
6789 6787 page_clrtoxic(pp, PR_CAPTURE);
6790 6788 found = 1;
6791 6789 break;
6792 6790 }
6793 6791 bp1 = bp1->next;
6794 6792 }
6795 6793 }
6796 6794 mutex_exit(&page_capture_hash[index].pchh_mutex);
6797 6795 }
6798 6796
6799 6797 /* Synchronize with the unregister func. */
6800 6798 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6801 6799 if (!pc_cb[cb_index].cb_active) {
6802 6800 page_free(pp, 1);
6803 6801 rw_exit(&pc_cb[cb_index].cb_rwlock);
6804 6802 if (found) {
6805 6803 kmem_free(bp1, sizeof (*bp1));
6806 6804 }
6807 6805 return (1);
6808 6806 }
6809 6807
6810 6808 /*
6811 6809 * We need to remove the entry from the page capture hash and turn off
6812 6810 * the PR_CAPTURE bit before calling the callback. We'll need to cache
6813 6811 * the entry here, and then based upon the return value, cleanup
6814 6812 * appropriately or re-add it to the hash, making sure that someone else
6815 6813 * hasn't already done so.
6816 6814 * It should be rare for the callback to fail and thus it's ok for
6817 6815 * the failure path to be a bit complicated as the success path is
6818 6816 * cleaner and the locking rules are easier to follow.
6819 6817 */
6820 6818
6821 6819 ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6822 6820
6823 6821 rw_exit(&pc_cb[cb_index].cb_rwlock);
6824 6822
6825 6823 /*
6826 6824 * If this was an ASYNC request, we need to cleanup the hash if the
6827 6825 * callback was successful or if the request was no longer valid.
6828 6826 * For non-ASYNC requests, we return failure to map and the caller
6829 6827 * will take care of adding the request to the hash.
6830 6828 * Note also that the callback itself is responsible for the page
6831 6829 * at this point in time in terms of locking ... The most common
6832 6830 * case for the failure path should just be a page_free.
6833 6831 */
6834 6832 if (ret >= 0) {
6835 6833 if (found) {
6836 6834 if (bp1->flags & CAPTURE_RETIRE) {
6837 6835 page_retire_decr_pend_count(datap);
6838 6836 }
6839 6837 kmem_free(bp1, sizeof (*bp1));
6840 6838 }
6841 6839 return (ret);
6842 6840 }
6843 6841 if (!found) {
6844 6842 return (ret);
6845 6843 }
6846 6844
6847 6845 ASSERT(flags & CAPTURE_ASYNC);
6848 6846
6849 6847 /*
6850 6848 * Check for expiration time first as we can just free it up if it's
6851 6849 * expired.
6852 6850 */
6853 6851 if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) {
6854 6852 kmem_free(bp1, sizeof (*bp1));
6855 6853 return (ret);
6856 6854 }
6857 6855
6858 6856 /*
6859 6857 * The callback failed and there used to be an entry in the hash for
6860 6858 * this page, so we need to add it back to the hash.
6861 6859 */
6862 6860 mutex_enter(&page_capture_hash[index].pchh_mutex);
6863 6861 if (!(pp->p_toxic & PR_CAPTURE)) {
6864 6862 /* just add bp1 back to head of walked list */
6865 6863 page_settoxic(pp, PR_CAPTURE);
6866 6864 bp1->next = page_capture_hash[index].lists[1].next;
6867 6865 bp1->prev = &page_capture_hash[index].lists[1];
6868 6866 bp1->next->prev = bp1;
6869 6867 bp1->pri = PAGE_CAPTURE_PRIO(pp);
6870 6868 page_capture_hash[index].lists[1].next = bp1;
6871 6869 page_capture_hash[index].num_pages[bp1->pri]++;
6872 6870 mutex_exit(&page_capture_hash[index].pchh_mutex);
6873 6871 return (ret);
6874 6872 }
6875 6873
6876 6874 /*
6877 6875 	 * Otherwise there was a new capture request added to the list.
6878 6876 * Need to make sure that our original data is represented if
6879 6877 * appropriate.
6880 6878 */
6881 6879 for (i = 0; i < 2; i++) {
6882 6880 bp2 = page_capture_hash[index].lists[i].next;
6883 6881 while (bp2 != &page_capture_hash[index].lists[i]) {
6884 6882 if (bp2->pp == pp) {
6885 6883 if (bp1->flags & CAPTURE_RETIRE) {
6886 6884 if (!(bp2->flags & CAPTURE_RETIRE)) {
6887 6885 bp2->szc = bp1->szc;
6888 6886 bp2->flags = bp1->flags;
6889 6887 bp2->expires = bp1->expires;
6890 6888 bp2->datap = bp1->datap;
6891 6889 }
6892 6890 } else {
6893 6891 ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6894 6892 if (!(bp2->flags & CAPTURE_RETIRE)) {
6895 6893 bp2->szc = bp1->szc;
6896 6894 bp2->flags = bp1->flags;
6897 6895 bp2->expires = bp1->expires;
6898 6896 bp2->datap = bp1->datap;
6899 6897 }
6900 6898 }
6901 6899 page_capture_hash[index].num_pages[bp2->pri]--;
6902 6900 bp2->pri = PAGE_CAPTURE_PRIO(pp);
6903 6901 page_capture_hash[index].num_pages[bp2->pri]++;
6904 6902 mutex_exit(&page_capture_hash[index].
6905 6903 pchh_mutex);
6906 6904 kmem_free(bp1, sizeof (*bp1));
6907 6905 return (ret);
6908 6906 }
6909 6907 bp2 = bp2->next;
6910 6908 }
6911 6909 }
6912 6910 panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp);
6913 6911 /*NOTREACHED*/
6914 6912 }
6915 6913
6916 6914 /*
6917 6915 * Try to capture the given page for the caller specified in the flags
6918 6916 * parameter. The page will either be captured and handed over to the
6919 6917 * appropriate callback, or will be queued up in the page capture hash
6920 6918 * to be captured asynchronously.
6921 6919 * If the current request is due to an async capture, the page must be
6922 6920 * exclusively locked before calling this function.
6923 6921 * Currently szc must be 0 but in the future this should be expandable to
6924 6922 * other page sizes.
6925 6923 * Returns 0 on success, with the following error codes on failure:
6926 6924 * EPERM - The requested page is long term locked, and thus repeated
6927 6925 * requests to capture this page will likely fail.
6928 6926 * ENOMEM - There was not enough free memory in the system to safely
6929 6927 * map the requested page.
6930 6928 * ENOENT - The requested page was inside the kernel cage, and the
6931 6929 * CAPTURE_GET_CAGE flag was not set.
6932 6930 * EAGAIN - The requested page could not be captured at this point in
6933 6931 * time but future requests will likely work.
6934 6932 * EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6935 6933 * was not set.
6936 6934 */
6937 6935 int
6938 6936 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6939 6937 {
6940 6938 int ret;
6941 6939 int cb_index;
6942 6940
6943 6941 if (flags & CAPTURE_ASYNC) {
6944 6942 ASSERT(PAGE_EXCL(pp));
6945 6943 goto async;
6946 6944 }
6947 6945
6948 6946 /* Make sure there's enough availrmem ... */
6949 6947 ret = page_capture_pre_checks(pp, flags);
6950 6948 if (ret != 0) {
6951 6949 return (ret);
6952 6950 }
6953 6951
6954 6952 if (!page_trylock(pp, SE_EXCL)) {
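		/*
		 * The low-order bits of flags select the capture callback;
		 * cb_index is only consumed by the ASSERT below.
		 */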
6955 6953 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6956 6954 if ((flags >> cb_index) & 1) {
6957 6955 break;
6958 6956 }
6959 6957 }
6960 6958 ASSERT(cb_index < PC_NUM_CALLBACKS);
6961 6959 ret = EAGAIN;
6962 6960 /* Special case for retired pages */
6963 6961 if (PP_RETIRED(pp)) {
6964 6962 if (flags & CAPTURE_GET_RETIRED) {
6965 6963 if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6966 6964 /*
6967 6965 * Need to set capture bit and add to
6968 6966 * hash so that the page will be
6969 6967 * retired when freed.
6970 6968 */
6971 6969 page_capture_add_hash(pp, szc,
6972 6970 CAPTURE_RETIRE, NULL);
6973 6971 ret = 0;
6974 6972 goto own_page;
6975 6973 }
6976 6974 } else {
6977 6975 return (EBUSY);
6978 6976 }
6979 6977 }
6980 6978 page_capture_add_hash(pp, szc, flags, datap);
6981 6979 return (ret);
6982 6980 }
6983 6981
6984 6982 async:
6985 6983 ASSERT(PAGE_EXCL(pp));
6986 6984
6987 6985 /* For physmem async requests, check that availrmem is sane */
6988 6986 if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6989 6987 (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6990 6988 (availrmem < swapfs_minfree)) {
6991 6989 page_unlock(pp);
6992 6990 return (ENOMEM);
6993 6991 }
6994 6992
6995 6993 ret = page_capture_clean_page(pp);
6996 6994
6997 6995 if (ret != 0) {
6998 6996 /* We failed to get the page, so let's add it to the hash */
6999 6997 if (!(flags & CAPTURE_ASYNC)) {
7000 6998 page_capture_add_hash(pp, szc, flags, datap);
7001 6999 }
7002 7000 return (ret);
7003 7001 }
7004 7002
7005 7003 own_page:
7006 7004 ASSERT(PAGE_EXCL(pp));
7007 7005 ASSERT(pp->p_szc == 0);
7008 7006
7009 7007 /* Call the callback */
7010 7008 ret = page_capture_take_action(pp, flags, datap);
7011 7009
7012 7010 if (ret == 0) {
7013 7011 return (0);
7014 7012 }
7015 7013
7016 7014 /*
7017 7015 * Note that in the failure cases from page_capture_take_action, the
7018 7016 * EXCL lock will have already been dropped.
7019 7017 */
7020 7018 if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
7021 7019 page_capture_add_hash(pp, szc, flags, datap);
7022 7020 }
7023 7021 return (EAGAIN);
7024 7022 }
7025 7023
7026 7024 int
7027 7025 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
7028 7026 {
7029 7027 int ret;
7030 7028
7031 7029 curthread->t_flag |= T_CAPTURING;
7032 7030 ret = page_itrycapture(pp, szc, flags, datap);
7033 7031 curthread->t_flag &= ~T_CAPTURING; /* we know it's set, so clear it */
7034 7032 return (ret);
7035 7033 }
7036 7034
7037 7035 /*
7038 7036 * When unlocking a page which has the PR_CAPTURE bit set, this routine
7039 7037 * gets called to try and capture the page.
7040 7038 */
7041 7039 void
7042 7040 page_unlock_capture(page_t *pp)
7043 7041 {
7044 7042 page_capture_hash_bucket_t *bp;
7045 7043 int index;
7046 7044 int i;
7047 7045 uint_t szc;
7048 7046 uint_t flags = 0;
7049 7047 void *datap;
7050 7048 kmutex_t *mp;
7051 7049 extern vnode_t retired_pages;
7052 7050
7053 7051 /*
7054 7052 * We need to protect against a possible deadlock here where we own
7055 7053 * the vnode page hash mutex and want to acquire it again as there
7056 7054 * are locations in the code where we unlock a page while holding the
7057 7055 * the mutex which can lead to the page being captured and eventually
7058 7056 * end up here. As we may be hashing out the old page and hashing into
7059 7057 * the retire vnode, we need to make sure we don't own them.
7060 7058 * Other callbacks that do hash operations also need to make sure
7061 7059 * that they do not currently own the vphm mutex before they hash
7062 7060 * into a vnode, otherwise there will be a panic.
7063 7061 */
7064 7062 if (mutex_owned(page_vnode_mutex(&retired_pages))) {
7065 7063 page_unlock_nocapture(pp);
7066 7064 return;
7067 7065 }
7068 7066 if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
7069 7067 page_unlock_nocapture(pp);
7070 7068 return;
7071 7069 }
7072 7070
7073 7071 index = PAGE_CAPTURE_HASH(pp);
7074 7072
7075 7073 mp = &page_capture_hash[index].pchh_mutex;
7076 7074 mutex_enter(mp);
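	/*
	 * Search both capture hash lists (new requests and entries already
	 * walked by the async thread) for this page's outstanding request.
	 */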
7077 7075 for (i = 0; i < 2; i++) {
7078 7076 bp = page_capture_hash[index].lists[i].next;
7079 7077 while (bp != &page_capture_hash[index].lists[i]) {
7080 7078 if (bp->pp == pp) {
7081 7079 szc = bp->szc;
7082 7080 flags = bp->flags | CAPTURE_ASYNC;
7083 7081 datap = bp->datap;
7084 7082 mutex_exit(mp);
7085 7083 (void) page_trycapture(pp, szc, flags, datap);
7086 7084 return;
7087 7085 }
7088 7086 bp = bp->next;
7089 7087 }
7090 7088 }
7091 7089
7092 7090 /* Failed to find page in hash so clear flags and unlock it. */
7093 7091 page_clrtoxic(pp, PR_CAPTURE);
7094 7092 page_unlock(pp);
7095 7093
7096 7094 mutex_exit(mp);
7097 7095 }
7098 7096
7099 7097 void
7100 7098 page_capture_init()
7101 7099 {
7102 7100 int i;
7103 7101 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7104 7102 page_capture_hash[i].lists[0].next =
7105 7103 &page_capture_hash[i].lists[0];
7106 7104 page_capture_hash[i].lists[0].prev =
7107 7105 &page_capture_hash[i].lists[0];
7108 7106 page_capture_hash[i].lists[1].next =
7109 7107 &page_capture_hash[i].lists[1];
7110 7108 page_capture_hash[i].lists[1].prev =
7111 7109 &page_capture_hash[i].lists[1];
7112 7110 }
7113 7111
7114 7112 pc_thread_shortwait = 23 * hz;
7115 7113 pc_thread_longwait = 1201 * hz;
7116 7114 pc_thread_retry = 3;
7117 7115 mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
7118 7116 cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
7119 7117 pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
7120 7118 TS_RUN, minclsyspri);
7121 7119 }
7122 7120
7123 7121 /*
7124 7122 * It is necessary to scrub any failing pages prior to reboot in order to
7125 7123 * prevent a latent error trap from occurring on the next boot.
7126 7124 */
7127 7125 void
7128 7126 page_retire_mdboot()
7129 7127 {
7130 7128 page_t *pp;
7131 7129 int i, j;
7132 7130 page_capture_hash_bucket_t *bp;
7133 7131 uchar_t pri;
7134 7132
7135 7133 /* walk lists looking for pages to scrub */
7136 7134 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
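		/* Skip buckets that have no pages queued at any priority. */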
7137 7135 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7138 7136 if (page_capture_hash[i].num_pages[pri] != 0) {
7139 7137 break;
7140 7138 }
7141 7139 }
7142 7140 if (pri == PC_NUM_PRI)
7143 7141 continue;
7144 7142
7145 7143 mutex_enter(&page_capture_hash[i].pchh_mutex);
7146 7144
7147 7145 for (j = 0; j < 2; j++) {
7148 7146 bp = page_capture_hash[i].lists[j].next;
7149 7147 while (bp != &page_capture_hash[i].lists[j]) {
7150 7148 pp = bp->pp;
7151 7149 if (PP_TOXIC(pp)) {
7152 7150 if (page_trylock(pp, SE_EXCL)) {
7153 7151 PP_CLRFREE(pp);
7154 7152 pagescrub(pp, 0, PAGESIZE);
7155 7153 page_unlock(pp);
7156 7154 }
7157 7155 }
7158 7156 bp = bp->next;
7159 7157 }
7160 7158 }
7161 7159 mutex_exit(&page_capture_hash[i].pchh_mutex);
7162 7160 }
7163 7161 }
7164 7162
7165 7163 /*
7166 7164 * Walk the page_capture_hash trying to capture pages and also clean up old
7167 7165 * entries which have expired.
7168 7166 */
7169 7167 void
7170 7168 page_capture_async()
7171 7169 {
7172 7170 page_t *pp;
7173 7171 int i;
7174 7172 int ret;
7175 7173 page_capture_hash_bucket_t *bp1, *bp2;
7176 7174 uint_t szc;
7177 7175 uint_t flags;
7178 7176 void *datap;
7179 7177 uchar_t pri;
7180 7178
7181 7179 /* If there are outstanding pages to be captured, get to work */
7182 7180 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7183 7181 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7184 7182 if (page_capture_hash[i].num_pages[pri] != 0)
7185 7183 break;
7186 7184 }
7187 7185 if (pri == PC_NUM_PRI)
7188 7186 continue;
7189 7187
7190 7188 /* Append list 1 to list 0 and then walk through list 0 */
7191 7189 mutex_enter(&page_capture_hash[i].pchh_mutex);
7192 7190 bp1 = &page_capture_hash[i].lists[1];
7193 7191 bp2 = bp1->next;
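		/*
		 * bp1 is the sentinel of lists[1] and bp2 its first entry.
		 * If lists[1] is non-empty, splice its entries onto the head
		 * of lists[0] and reset lists[1] to an empty list.
		 */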
7194 7192 if (bp1 != bp2) {
7195 7193 bp1->prev->next = page_capture_hash[i].lists[0].next;
7196 7194 bp2->prev = &page_capture_hash[i].lists[0];
7197 7195 page_capture_hash[i].lists[0].next->prev = bp1->prev;
7198 7196 page_capture_hash[i].lists[0].next = bp2;
7199 7197 bp1->next = bp1;
7200 7198 bp1->prev = bp1;
7201 7199 }
7202 7200
7203 7201 /* list[1] will be empty now */
7204 7202
7205 7203 bp1 = page_capture_hash[i].lists[0].next;
7206 7204 while (bp1 != &page_capture_hash[i].lists[0]) {
7207 7205 /* Check expiration time */
7208 7206 if ((ddi_get_lbolt() > bp1->expires &&
7209 7207 bp1->expires != -1) ||
7210 7208 page_deleted(bp1->pp)) {
7211 7209 page_capture_hash[i].lists[0].next = bp1->next;
7212 7210 bp1->next->prev =
7213 7211 &page_capture_hash[i].lists[0];
7214 7212 page_capture_hash[i].num_pages[bp1->pri]--;
7215 7213
7216 7214 /*
7217 7215 * We can safely remove the PR_CAPTURE bit
7218 7216 * without holding the EXCL lock on the page
7219 7217 * as the PR_CAPTURE bit requires that the
7220 7218 * page_capture_hash[].pchh_mutex be held
7221 7219 * to modify it.
7222 7220 */
7223 7221 page_clrtoxic(bp1->pp, PR_CAPTURE);
7224 7222 mutex_exit(&page_capture_hash[i].pchh_mutex);
7225 7223 kmem_free(bp1, sizeof (*bp1));
7226 7224 mutex_enter(&page_capture_hash[i].pchh_mutex);
7227 7225 bp1 = page_capture_hash[i].lists[0].next;
7228 7226 continue;
7229 7227 }
7230 7228 pp = bp1->pp;
7231 7229 szc = bp1->szc;
7232 7230 flags = bp1->flags;
7233 7231 datap = bp1->datap;
7234 7232 mutex_exit(&page_capture_hash[i].pchh_mutex);
7235 7233 if (page_trylock(pp, SE_EXCL)) {
7236 7234 ret = page_trycapture(pp, szc,
7237 7235 flags | CAPTURE_ASYNC, datap);
7238 7236 } else {
7239 7237 ret = 1; /* move to walked hash */
7240 7238 }
7241 7239
7242 7240 if (ret != 0) {
7243 7241 /* Move to walked hash */
7244 7242 (void) page_capture_move_to_walked(pp);
7245 7243 }
7246 7244 mutex_enter(&page_capture_hash[i].pchh_mutex);
7247 7245 bp1 = page_capture_hash[i].lists[0].next;
7248 7246 }
7249 7247
7250 7248 mutex_exit(&page_capture_hash[i].pchh_mutex);
7251 7249 }
7252 7250 }
7253 7251
7254 7252 /*
7255 7253 * This function is called by the page_capture_thread, and is needed
7256 7254 * in order to initiate aio cleanup, so that pages used in aio
7257 7255 * will be unlocked and subsequently retired by page_capture_thread.
7258 7256 */
7259 7257 static int
7260 7258 do_aio_cleanup(void)
7261 7259 {
7262 7260 proc_t *procp;
7263 7261 int (*aio_cleanup_dr_delete_memory)(proc_t *);
7264 7262 int cleaned = 0;
7265 7263
7266 7264 if (modload("sys", "kaio") == -1) {
7267 7265 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
7268 7266 return (0);
7269 7267 }
7270 7268 /*
7271 7269 * We use the aio_cleanup_dr_delete_memory function to
7272 7270 * initiate the actual clean up; this function will wake
7273 7271 * up the per-process aio_cleanup_thread.
7274 7272 */
7275 7273 aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
7276 7274 modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
7277 7275 if (aio_cleanup_dr_delete_memory == NULL) {
7278 7276 cmn_err(CE_WARN,
7279 7277 "aio_cleanup_dr_delete_memory not found in kaio");
7280 7278 return (0);
7281 7279 }
7282 7280 mutex_enter(&pidlock);
7283 7281 for (procp = practive; (procp != NULL); procp = procp->p_next) {
7284 7282 mutex_enter(&procp->p_lock);
7285 7283 if (procp->p_aio != NULL) {
7286 7284 /* cleanup proc's outstanding kaio */
7287 7285 cleaned += (*aio_cleanup_dr_delete_memory)(procp);
7288 7286 }
7289 7287 mutex_exit(&procp->p_lock);
7290 7288 }
7291 7289 mutex_exit(&pidlock);
7292 7290 return (cleaned);
7293 7291 }
7294 7292
7295 7293 /*
7296 7294 * helper function for page_capture_thread
7297 7295 */
7298 7296 static void
7299 7297 page_capture_handle_outstanding(void)
7300 7298 {
7301 7299 int ntry;
7302 7300
7303 7301 /* Reap pages before attempting capture pages */
7304 7302 kmem_reap();
7305 7303
7306 7304 if ((page_retire_pend_count() > page_retire_pend_kas_count()) &&
7307 7305 hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
7308 7306 /*
7309 7307 * Note: Purging is done only for platforms that support
7310 7308 * ISM hat_pageunload() - mainly SPARC. On x86/x64
7311 7309 * platforms ISM pages are SE_SHARED locked until destroyed.
7312 7310 */
7313 7311
7314 7312 /* disable and purge seg_pcache */
7315 7313 (void) seg_p_disable();
7316 7314 for (ntry = 0; ntry < pc_thread_retry; ntry++) {
7317 7315 if (!page_retire_pend_count())
7318 7316 break;
7319 7317 if (do_aio_cleanup()) {
7320 7318 /*
7321 7319 * allow the apps' cleanup threads
7322 7320 * to run
7323 7321 */
7324 7322 delay(pc_thread_shortwait);
7325 7323 }
7326 7324 page_capture_async();
7327 7325 }
7328 7326 /* reenable seg_pcache */
7329 7327 seg_p_enable();
7330 7328
7331 7329 /* completed what can be done. break out */
7332 7330 return;
7333 7331 }
7334 7332
7335 7333 /*
7336 7334 * For kernel pages and/or unsupported HAT_DYNAMIC_ISM_UNMAP, reap
7337 7335 * and then attempt to capture.
7338 7336 */
7339 7337 seg_preap();
7340 7338 page_capture_async();
7341 7339 }
7342 7340
7343 7341 /*
7344 7342 * The page_capture_thread loops forever, looking to see if there are
7345 7343 * pages still waiting to be captured.
7346 7344 */
7347 7345 static void
7348 7346 page_capture_thread(void)
7349 7347 {
7350 7348 callb_cpr_t c;
7351 7349 int i;
7352 7350 int high_pri_pages;
7353 7351 int low_pri_pages;
7354 7352 clock_t timeout;
7355 7353
7356 7354 CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
7357 7355
7358 7356 mutex_enter(&pc_thread_mutex);
7359 7357 for (;;) {
7360 7358 high_pri_pages = 0;
7361 7359 low_pri_pages = 0;
7362 7360 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7363 7361 high_pri_pages +=
7364 7362 page_capture_hash[i].num_pages[PC_PRI_HI];
7365 7363 low_pri_pages +=
7366 7364 page_capture_hash[i].num_pages[PC_PRI_LO];
7367 7365 }
7368 7366
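		/*
		 * High priority pages get the full reap/capture treatment and
		 * a short re-sleep; low priority pages just get another async
		 * capture pass before the long sleep.
		 */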
7369 7367 timeout = pc_thread_longwait;
7370 7368 if (high_pri_pages != 0) {
7371 7369 timeout = pc_thread_shortwait;
7372 7370 page_capture_handle_outstanding();
7373 7371 } else if (low_pri_pages != 0) {
7374 7372 page_capture_async();
7375 7373 }
7376 7374 CALLB_CPR_SAFE_BEGIN(&c);
7377 7375 (void) cv_reltimedwait(&pc_cv, &pc_thread_mutex,
7378 7376 timeout, TR_CLOCK_TICK);
7379 7377 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
7380 7378 }
7381 7379 /*NOTREACHED*/
7382 7380 }
7383 7381 /*
7384 7382 * Attempt to locate a bucket that has enough pages to satisfy the request.
7385 7383 * The initial check is done without the lock to avoid unneeded contention.
7386 7384 * The function returns 1 if enough pages were found, else 0 if it could not
7387 7385 * find enough pages in a bucket.
7388 7386 */
7389 7387 static int
7390 7388 pcf_decrement_bucket(pgcnt_t npages)
7391 7389 {
7392 7390 struct pcf *p;
7393 7391 struct pcf *q;
7394 7392 int i;
7395 7393
7396 7394 p = &pcf[PCF_INDEX()];
7397 7395 q = &pcf[pcf_fanout];
7398 7396 for (i = 0; i < pcf_fanout; i++) {
7399 7397 if (p->pcf_count > npages) {
7400 7398 /*
7401 7399 * a good one to try.
7402 7400 */
7403 7401 mutex_enter(&p->pcf_lock);
7404 7402 if (p->pcf_count > npages) {
7405 7403 p->pcf_count -= (uint_t)npages;
7406 7404 /*
7407 7405 * freemem is not protected by any lock.
7408 7406 * Thus, we cannot have any assertion
7409 7407 * containing freemem here.
7410 7408 */
7411 7409 freemem -= npages;
7412 7410 mutex_exit(&p->pcf_lock);
7413 7411 return (1);
7414 7412 }
7415 7413 mutex_exit(&p->pcf_lock);
7416 7414 }
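		/* Advance to the next bucket, wrapping past the end of pcf[]. */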
7417 7415 p++;
7418 7416 if (p >= q) {
7419 7417 p = pcf;
7420 7418 }
7421 7419 }
7422 7420 return (0);
7423 7421 }
7424 7422
7425 7423 /*
7426 7424 * Arguments:
7427 7425 * pcftotal_ret: If the value is not NULL and we have walked all the
7428 7426 * buckets but did not find enough pages then it will
7429 7427 * be set to the total number of pages in all the pcf
7430 7428 * buckets.
7431 7429 * npages: Is the number of pages we have been requested to
7432 7430 * find.
7433 7431 * unlock: If set to 0 we will leave the buckets locked if the
7434 7432 * requested number of pages is not found.
7435 7433 *
7436 7434 * Go and try to satisfy the page request from any number of buckets.
7437 7435 * This can be a very expensive operation as we have to lock the buckets
7438 7436 * we are checking (and keep them locked), starting at bucket 0.
7439 7437 *
7440 7438 * The function returns 1 if enough pages were found, else 0 if it could not
7441 7439 * find enough pages in the buckets.
7442 7440 *
7443 7441 */
7444 7442 static int
7445 7443 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
7446 7444 {
7447 7445 struct pcf *p;
7448 7446 pgcnt_t pcftotal;
7449 7447 int i;
7450 7448
7451 7449 p = pcf;
7452 7450 /* try to collect pages from several pcf bins */
7453 7451 for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
7454 7452 mutex_enter(&p->pcf_lock);
7455 7453 pcftotal += p->pcf_count;
7456 7454 if (pcftotal >= npages) {
7457 7455 /*
7458 7456 * Wow! There are enough pages lying around
7459 7457 * to satisfy the request. Do the accounting,
7460 7458 * drop the locks we acquired, and go back.
7461 7459 *
7462 7460 * freemem is not protected by any lock. So,
7463 7461 * we cannot have any assertion containing
7464 7462 * freemem.
7465 7463 */
7466 7464 freemem -= npages;
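			/*
			 * Walk back over the buckets locked so far, draining
			 * each until npages is satisfied and dropping each
			 * lock as we go.
			 */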
7467 7465 while (p >= pcf) {
7468 7466 if (p->pcf_count <= npages) {
7469 7467 npages -= p->pcf_count;
7470 7468 p->pcf_count = 0;
7471 7469 } else {
7472 7470 p->pcf_count -= (uint_t)npages;
7473 7471 npages = 0;
7474 7472 }
7475 7473 mutex_exit(&p->pcf_lock);
7476 7474 p--;
7477 7475 }
7478 7476 ASSERT(npages == 0);
7479 7477 return (1);
7480 7478 }
7481 7479 p++;
7482 7480 }
7483 7481 if (unlock) {
7484 7482 /* failed to collect pages - release the locks */
7485 7483 while (--p >= pcf) {
7486 7484 mutex_exit(&p->pcf_lock);
7487 7485 }
7488 7486 }
7489 7487 if (pcftotal_ret != NULL)
7490 7488 *pcftotal_ret = pcftotal;
7491 7489 return (0);
7492 7490 }
↓ open down ↓ |
4788 lines elided |
↑ open up ↑ |