5255 uts shouldn't open-code ISP2
--- old/usr/src/uts/common/vm/vpm.c
+++ new/usr/src/uts/common/vm/vpm.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26
27 27 /*
28 28 * VM - generic vnode page mapping interfaces.
29 29 *
30 30 * Mechanism to provide temporary mappings to vnode pages.
31 31 * The typical use would be to copy/access file data.
32 32 */
33 33
34 34 #include <sys/types.h>
35 35 #include <sys/t_lock.h>
36 36 #include <sys/param.h>
37 37 #include <sys/sysmacros.h>
38 38 #include <sys/buf.h>
39 39 #include <sys/systm.h>
40 40 #include <sys/vnode.h>
41 41 #include <sys/mman.h>
42 42 #include <sys/errno.h>
43 43 #include <sys/cred.h>
44 44 #include <sys/kmem.h>
45 45 #include <sys/vtrace.h>
46 46 #include <sys/cmn_err.h>
47 47 #include <sys/debug.h>
48 48 #include <sys/thread.h>
49 49 #include <sys/dumphdr.h>
50 50 #include <sys/bitmap.h>
51 51 #include <sys/lgrp.h>
52 52
53 53 #include <vm/seg_kmem.h>
54 54 #include <vm/hat.h>
55 55 #include <vm/as.h>
56 56 #include <vm/seg.h>
57 57 #include <vm/seg_kpm.h>
58 58 #include <vm/seg_map.h>
59 59 #include <vm/page.h>
60 60 #include <vm/pvn.h>
61 61 #include <vm/rm.h>
62 62 #include <vm/vpm.h>
63 63
64 64
65 65 #ifdef SEGKPM_SUPPORT
66 66 /*
67 67 * VPM can be disabled by setting vpm_enable = 0 in
68 68 * /etc/system.
69 69 *
70 70 */
71 71 int vpm_enable = 1;
72 72
73 73 #else
74 74
75 75 int vpm_enable = 0;
76 76
77 77 #endif
78 78
79 79 #ifdef SEGKPM_SUPPORT
80 80
81 81
82 82 int vpm_cache_enable = 1;
83 83 long vpm_cache_percent = 12;
84 84 long vpm_cache_size;
85 85 int vpm_nfreelist = 0;
86 86 int vpmd_freemsk = 0;
87 87
88 88 #define VPM_S_PAD 64
89 89 union vpm_cpu {
90 90 struct {
91 91 int vcpu_free_ndx;
92 92 ulong_t vcpu_hits;
93 93 ulong_t vcpu_misses;
94 94 } vcpu;
95 95 char vpm_pad[VPM_S_PAD];
96 96 };
97 97 static union vpm_cpu *vpmd_cpu;
98 98
99 99 #define vfree_ndx vcpu.vcpu_free_ndx
100 100
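
The 64-byte pad in union vpm_cpu gives each CPU's counters a cache line of
their own, so hot per-CPU updates do not false-share a line with a neighbour.
A minimal userland sketch of the same padding idiom; the 64-byte line size
and the type and field names are assumptions for illustration:

#include <assert.h>

#define CACHE_LINE      64              /* assumed coherence granule */

union percpu_stats {
        struct {
                int free_ndx;
                unsigned long hits;
                unsigned long misses;
        } s;
        char pad[CACHE_LINE];           /* pad each slot to a full line */
};

int
main(void)
{
        /* Adjacent array slots never share a cache line. */
        assert(sizeof (union percpu_stats) == CACHE_LINE);
        return (0);
}
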
101 101 int vpm_cachemode = VPMCACHE_LRU;
102 102
103 103 #define PPMTX(pp) (&(pp)->p_ilock)
104 104
105 105 static struct vpmap *vpmd_vpmap; /* list of vpmap structs preallocated */
106 106 static struct vpmfree *vpmd_free;
107 107 #define VPMAPMTX(vpm) (&vpm->vpm_mtx)
108 108 #define VPMAP2VMF(vpm) (&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
109 109 #define VPMAP2VMF_NDX(vpm) (ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
110 110 #define VPMP(id) (&vpmd_vpmap[id - 1])
111 111 #define VPMID(vpm) (uint_t)((vpm - vpmd_vpmap) + 1)
112 112
113 113
114 114 #ifdef DEBUG
115 115
116 116 struct vpm_debug {
117 117 int vpmd_steals;
118 118 int vpmd_contend;
119 119 int vpmd_prevpagelocked;
120 120 int vpmd_getpagefailed;
121 121 int vpmd_zerostart;
122 122 int vpmd_emptyfreelist;
123 123 int vpmd_nofreevpms;
124 124 } vpm_debug;
125 125
126 126 #define VPM_DEBUG(x) ((vpm_debug.x)++)
127 127
128 128 int steals;
129 129 int steals_mtbf = 7;
130 130 int contend;
131 131 int contend_mtbf = 127;
132 132
133 133 #define VPM_MTBF(v, f) (((++(v)) & (f)) != (f))
134 134
135 135 #else /* DEBUG */
136 136
137 137 #define VPM_MTBF(v, f) (1)
138 138 #define VPM_DEBUG(x) /* nothing */
139 139
140 140 #endif
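
On DEBUG kernels, VPM_MTBF(v, f) doubles as a cheap fault-injection hook:
with f of the form 2^n - 1 it evaluates false once every f + 1 calls, which
forces the rarely taken steal and contend paths to be exercised. A standalone
sketch of the counter-and-mask trick:

#include <stdio.h>

/* False once every (f + 1) calls when f is 2^n - 1. */
#define VPM_MTBF(v, f)  (((++(v)) & (f)) != (f))

int
main(void)
{
        int v = 0, i, fired = 0;

        for (i = 0; i < 16; i++) {
                if (!VPM_MTBF(v, 7))
                        fired++;        /* taken when v reaches 7 and 15 */
        }
        printf("fired %d of 16\n", fired);      /* prints: fired 2 of 16 */
        return (0);
}
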
141 141
142 142 /*
143 143 * The vpm cache.
144 144 *
145 145 * The main purpose of having a cache here is to speed up page_lookup()
146 146 * operations and also provide an LRU(default) behaviour of file pages. The
147 147 * page_lookup() operation tends to be expensive if a page has to be
148 148 * reclaimed from the system page cache("cachelist"). Once we speed up the
149 149 * page_lookup()->page_reclaim() path then there should be no need for
150 150 * this cache. The system page cache(cachelist) should effectively serve the
151 151 * purpose of caching file pages.
152 152 *
153 153 * This cache is very similar to segmap's smap cache. Each page in the
154 154 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
155 155 * hash table. The page_t has a reference to the vpmap_t when cached. For a
156 156 * given vnode and offset, the page is found by means of a page_lookup()
157 157 * operation. Any page which has a mapping (i.e., when cached) will not be in
158 158 * the system 'cachelist'. Hence the page_lookup() will not have to do a
159 159 * page_reclaim(). That is how the cache serves to speed up page_lookup()
160 160 * operations.
161 161 *
162 162 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
163 163 */
164 164
165 165 void
166 166 vpm_init()
167 167 {
168 168 long npages;
169 169 struct vpmap *vpm;
170 170 struct vpmfree *vpmflp;
171 171 int i, ndx;
172 172 extern void prefetch_smap_w(void *);
173 173
174 174 if (!kpm_enable) {
175 175 vpm_enable = 0;
176 176 }
177 177
178 178 if (!vpm_enable || !vpm_cache_enable) {
179 179 return;
180 180 }
181 181
182 182 /*
183 183 * Set the size of the cache.
184 184 */
185 185 vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
186 186 if (vpm_cache_size < VPMAP_MINCACHE) {
187 187 vpm_cache_size = VPMAP_MINCACHE;
188 188 }
189 189
190 190 if (vpm_cache_size > VPMAP_MAXCACHE) {
191 191 vpm_cache_size = VPMAP_MAXCACHE;
192 192 }
193 193
194 194 /*
195 195 * Number of freelists.
196 196 */
197 197 if (vpm_nfreelist == 0) {
198 198 vpm_nfreelist = max_ncpus;
199 199 } else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
200 200 cmn_err(CE_WARN, "vpmap create : number of freelist "
201 201 "vpm_nfreelist %d using %d", vpm_nfreelist, 2 * max_ncpus);
202 202 vpm_nfreelist = 2 * max_ncpus;
203 203 }
204 204
205 205 /*
206 206 * Round it up to the next power of 2
207 207 */
208 - if (vpm_nfreelist & (vpm_nfreelist - 1)) {
208 + if (!ISP2(vpm_nfreelist)) {
209 209 vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
210 210 }
211 211 vpmd_freemsk = vpm_nfreelist - 1;
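
This hunk is the whole of the fix: the open-coded power-of-two test
x & (x - 1) is replaced by ISP2() from <sys/sysmacros.h>, which expands to
the same bit trick. The sketch below pairs the check with the round-up step
that follows it, using a userland stand-in for the kernel's highbit() (the
1-based index of the highest set bit); the input value is illustrative:

#include <assert.h>

#define ISP2(x) (((x) & ((x) - 1)) == 0)        /* as in sys/sysmacros.h */

/* Stand-in for the kernel's highbit(); returns 0 for v == 0. */
static int
highbit(unsigned long v)
{
        int h = 0;

        while (v != 0) {
                h++;
                v >>= 1;
        }
        return (h);
}

int
main(void)
{
        unsigned long nfree = 6;        /* e.g. vpm_nfreelist = max_ncpus = 6 */

        if (!ISP2(nfree))
                nfree = 1UL << highbit(nfree);  /* rounds 6 up to 8 */
        assert(nfree == 8);
        assert(ISP2(nfree));    /* so vpmd_freemsk = nfree - 1 is a valid mask */
        return (0);
}
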
212 212
213 213 /*
214 214 * Use a per cpu rotor index to spread the allocations evenly
215 215 * across the available vpm freelists.
216 216 */
217 217 vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
218 218 ndx = 0;
219 219 for (i = 0; i < max_ncpus; i++) {
220 220
221 221 vpmd_cpu[i].vfree_ndx = ndx;
222 222 ndx = (ndx + 1) & vpmd_freemsk;
223 223 }
224 224
225 225 /*
226 226 * Allocate and initialize the freelist.
227 227 */
228 228 vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
229 229 KM_SLEEP);
230 230 for (i = 0; i < vpm_nfreelist; i++) {
231 231
232 232 vpmflp = &vpmd_free[i];
233 233 /*
234 234 * Set up initial queue pointers. They will get flipped
235 235 * back and forth.
236 236 */
237 237 vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
238 238 vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
239 239 }
240 240
241 241 npages = mmu_btop(vpm_cache_size);
242 242
243 243
244 244 /*
245 245 * Allocate and initialize the vpmap structs. We need to
246 246 * walk the array backwards as the prefetch happens in reverse
247 247 * order.
248 248 */
249 249 vpmd_vpmap = kmem_alloc(sizeof (struct vpmap) * npages, KM_SLEEP);
250 250 for (vpm = &vpmd_vpmap[npages - 1]; vpm >= vpmd_vpmap; vpm--) {
251 251 struct vpmfree *vpmflp;
252 252 union vpm_freeq *releq;
253 253 struct vpmap *vpmapf;
254 254
255 255 /*
256 256 * Use prefetch as we have to walk thru a large number of
257 257 * these data structures. We just use the smap's prefetch
258 258 * routine as it does the same.
259 259 */
260 260 prefetch_smap_w((void *)vpm);
261 261
262 262 vpm->vpm_vp = NULL;
263 263 vpm->vpm_off = 0;
264 264 vpm->vpm_pp = NULL;
265 265 vpm->vpm_refcnt = 0;
266 266 mutex_init(&vpm->vpm_mtx, NULL, MUTEX_DEFAULT, NULL);
267 267 vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);
268 268
269 269 vpmflp = VPMAP2VMF(vpm);
270 270 releq = vpmflp->vpm_releq;
271 271
272 272 vpmapf = releq->vpmq_free;
273 273 if (vpmapf == NULL) {
274 274 releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
275 275 } else {
276 276 vpm->vpm_next = vpmapf;
277 277 vpm->vpm_prev = vpmapf->vpm_prev;
278 278 vpmapf->vpm_prev = vpm;
279 279 vpm->vpm_prev->vpm_next = vpm;
280 280 releq->vpmq_free = vpm->vpm_next;
281 281 }
282 282
283 283 /*
284 284 * Indicate that the vpmap is on the releq at start
285 285 */
286 286 vpm->vpm_ndxflg = VPMRELEQ;
287 287 }
288 288 }
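
For scale, vpm_init() sizes the cache at vpm_cache_percent (12 by default)
percent of physmem, converts pages to bytes with mmu_ptob(), and clamps the
result to [VPMAP_MINCACHE, VPMAP_MAXCACHE]. A worked sketch of that
arithmetic on an LP64 system; the page size and the two bounds are
illustrative assumptions, not the real VPMAP_* values:

#include <assert.h>

#define PAGESIZE        4096UL
#define MINCACHE        (64UL << 20)    /* assumed 64 MB floor */
#define MAXCACHE        (8UL << 30)     /* assumed 8 GB ceiling */

int
main(void)
{
        unsigned long physmem = 1048576;        /* pages: 4 GB of 4 KB pages */
        unsigned long size;

        size = ((physmem * 12) / 100) * PAGESIZE;       /* mmu_ptob() */
        if (size < MINCACHE)
                size = MINCACHE;
        if (size > MAXCACHE)
                size = MAXCACHE;
        assert(size == 515395584UL);    /* ~491 MB, already within bounds */
        return (0);
}
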
289 289
290 290
291 291 /*
292 292 * unhooks vpm from the freelist if it is still on the freelist.
293 293 */
294 294 #define VPMAP_RMFREELIST(vpm) \
295 295 { \
296 296 if (vpm->vpm_next != NULL) { \
297 297 union vpm_freeq *freeq; \
298 298 struct vpmfree *vpmflp; \
299 299 vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
300 300 freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
301 301 mutex_enter(&freeq->vpmq_mtx); \
302 302 if (freeq->vpmq_free != vpm) { \
303 303 vpm->vpm_prev->vpm_next = vpm->vpm_next; \
304 304 vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
305 305 } else if (vpm == vpm->vpm_next) { \
306 306 freeq->vpmq_free = NULL; \
307 307 } else { \
308 308 freeq->vpmq_free = vpm->vpm_next; \
309 309 vpm->vpm_prev->vpm_next = vpm->vpm_next; \
310 310 vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
311 311 } \
312 312 mutex_exit(&freeq->vpmq_mtx); \
313 313 vpm->vpm_next = vpm->vpm_prev = NULL; \
314 314 } \
315 315 }
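
vpm_init() links every vpmap into a circular doubly-linked list headed by
vpmq_free, and VPMAP_RMFREELIST() above unlinks one, distinguishing the
interior, sole-element, and head cases. A compact userland sketch of the same
insert and remove logic; the node type is hypothetical and all locking is
omitted:

#include <assert.h>
#include <stddef.h>

struct node {
        struct node *next;
        struct node *prev;
};

/* Insert n before the current head; an empty list points the head at n. */
static void
insert(struct node **head, struct node *n)
{
        if (*head == NULL) {
                *head = n->next = n->prev = n;
        } else {
                n->next = *head;
                n->prev = (*head)->prev;
                (*head)->prev = n;
                n->prev->next = n;
        }
}

/* Remove n, covering the same three cases as VPMAP_RMFREELIST(). */
static void
remove_node(struct node **head, struct node *n)
{
        if (n == n->next) {                     /* sole element */
                *head = NULL;
        } else {
                if (*head == n)                 /* removing the head */
                        *head = n->next;
                n->prev->next = n->next;
                n->next->prev = n->prev;
        }
        n->next = n->prev = NULL;
}

int
main(void)
{
        struct node a, b, *head = NULL;

        insert(&head, &a);
        insert(&head, &b);
        remove_node(&head, &a);
        assert(head == &b && b.next == &b && b.prev == &b);
        remove_node(&head, &b);
        assert(head == NULL);
        return (0);
}
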
316 316
317 317 static int
318 318 get_freelndx(int mode)
319 319 {
320 320 int ndx;
321 321
322 322 ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
323 323 switch (mode) {
324 324
325 325 case VPMCACHE_LRU:
326 326 default:
327 327 vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
328 328 break;
329 329 }
330 330 return (ndx);
331 331 }
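
get_freelndx() is a per-CPU rotor: vpm_init() staggers every CPU's starting
index, and in the default VPMCACHE_LRU mode each call advances that CPU's
private counter, spreading allocations across the freelists with no shared
state to contend on. A minimal sketch, with the CPU id passed in explicitly
instead of read from CPU->cpu_seqid:

#include <stdio.h>

#define NLISTS  4               /* a power of two, as vpm_init() ensures */
#define FREEMSK (NLISTS - 1)

static int rotor[8];            /* one rotor per CPU */

static int
get_freelndx(int cpu)
{
        return (rotor[cpu]++ & FREEMSK);
}

int
main(void)
{
        int cpu, i;

        for (cpu = 0; cpu < 8; cpu++)
                rotor[cpu] = cpu & FREEMSK;     /* staggered start */

        for (i = 0; i < 4; i++)
                printf("cpu0 -> list %d\n", get_freelndx(0));
        /* prints lists 0, 1, 2, 3 in turn */
        return (0);
}
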
332 332
333 333
334 334 /*
335 335 * Find one vpmap structure from the free lists and use it for the newpage.
336 336 * The previous page it cached is dissociated and released. The page_t's
337 337 * p_vpmref is cleared only when the vpm it is pointing to is locked (or,
338 338 * on AMD64, when the page is exclusively locked in page_unload; that is
339 339 * because the p_vpmref is treated as a mapping).
340 340 *
341 341 * The page's p_vpmref is set when the page is
342 342 * locked(at least SHARED locked).
343 343 */
344 344 static struct vpmap *
345 345 get_free_vpmap(page_t *newpage)
346 346 {
347 347 struct vpmfree *vpmflp;
348 348 kmutex_t *vmtx;
349 349 struct vpmap *vpm, *first;
350 350 union vpm_freeq *allocq, *releq;
351 351 page_t *pp = NULL;
352 352 int end_ndx, page_locked = 0;
353 353 int free_ndx;
354 354
355 355 /*
356 356 * get the freelist bin index.
357 357 */
358 358 free_ndx = get_freelndx(vpm_cachemode);
359 359
360 360 end_ndx = free_ndx;
361 361 vpmflp = &vpmd_free[free_ndx];
362 362
363 363 retry_queue:
364 364 allocq = vpmflp->vpm_allocq;
365 365 mutex_enter(&allocq->vpmq_mtx);
366 366
367 367 if ((vpm = allocq->vpmq_free) == NULL) {
368 368
369 369 skip_queue:
370 370 /*
371 371 * The alloc list is empty or this queue is being skipped;
372 372 * first see if the allocq toggled.
373 373 */
374 374 if (vpmflp->vpm_allocq != allocq) {
375 375 /* queue changed */
376 376 mutex_exit(&allocq->vpmq_mtx);
377 377 goto retry_queue;
378 378 }
379 379 releq = vpmflp->vpm_releq;
380 380 if (!mutex_tryenter(&releq->vpmq_mtx)) {
381 381 /* cannot get releq; a free vpmap may be there now */
382 382 mutex_exit(&allocq->vpmq_mtx);
383 383
384 384 /*
385 385 * This loop could spin forever if this thread has
386 386 * higher priority than the thread that is holding
387 387 * releq->vpmq_mtx. In order to force the other thread
388 388 * to run, we'll lock/unlock the mutex which is safe
389 389 * since we just unlocked the allocq mutex.
390 390 */
391 391 mutex_enter(&releq->vpmq_mtx);
392 392 mutex_exit(&releq->vpmq_mtx);
393 393 goto retry_queue;
394 394 }
395 395 if (releq->vpmq_free == NULL) {
396 396 VPM_DEBUG(vpmd_emptyfreelist);
397 397 /*
398 398 * This freelist is empty.
399 399 * This should not happen unless clients
400 400 * are failing to release the vpmap after
401 401 * accessing the data. Before resorting
402 402 * to sleeping, try the next list of the same color.
403 403 */
404 404 free_ndx = (free_ndx + 1) & vpmd_freemsk;
405 405 if (free_ndx != end_ndx) {
406 406 mutex_exit(&releq->vpmq_mtx);
407 407 mutex_exit(&allocq->vpmq_mtx);
408 408 vpmflp = &vpmd_free[free_ndx];
409 409 goto retry_queue;
410 410 }
411 411 /*
412 412 * Tried all freelists.
413 413 * wait on this list and hope something gets freed.
414 414 */
415 415 vpmflp->vpm_want++;
416 416 mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
417 417 cv_wait(&vpmflp->vpm_free_cv,
418 418 &vpmflp->vpm_freeq[0].vpmq_mtx);
419 419 vpmflp->vpm_want--;
420 420 mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
421 421 vpmflp = &vpmd_free[free_ndx];
422 422 VPM_DEBUG(vpmd_nofreevpms);
423 423 goto retry_queue;
424 424 } else {
425 425 /*
426 426 * Something on the rele queue; flip the alloc
427 427 * and rele queues and retry.
428 428 */
429 429 vpmflp->vpm_allocq = releq;
430 430 vpmflp->vpm_releq = allocq;
431 431 mutex_exit(&allocq->vpmq_mtx);
432 432 mutex_exit(&releq->vpmq_mtx);
433 433 if (page_locked) {
434 434 delay(hz >> 2);
435 435 page_locked = 0;
436 436 }
437 437 goto retry_queue;
438 438 }
439 439 } else {
440 440 int gotnewvpm;
441 441 kmutex_t *pmtx;
442 442 uint_t vpmref;
443 443
444 444 /*
445 445 * Fastpath the case we get the vpmap mutex
446 446 * on the first try.
447 447 */
448 448 first = vpm;
449 449 next_vpmap:
450 450 vmtx = VPMAPMTX(vpm);
451 451 if (!mutex_tryenter(vmtx)) {
452 452 /*
453 453 * Another thread is trying to reclaim this slot.
454 454 * Skip to the next queue or vpmap.
455 455 */
456 456 if ((vpm = vpm->vpm_next) == first) {
457 457 goto skip_queue;
458 458 } else {
459 459 goto next_vpmap;
460 460 }
461 461 }
462 462
463 463 /*
464 464 * Assign this vpm to the newpage.
465 465 */
466 466 pmtx = PPMTX(newpage);
467 467 gotnewvpm = 0;
468 468 mutex_enter(pmtx);
469 469
470 470 /*
471 471 * Check if some other thread already assigned a vpm to
472 472 * this page.
473 473 */
474 474 if ((vpmref = newpage->p_vpmref) == 0) {
475 475 newpage->p_vpmref = VPMID(vpm);
476 476 gotnewvpm = 1;
477 477 } else {
478 478 VPM_DEBUG(vpmd_contend);
479 479 mutex_exit(vmtx);
480 480 }
481 481 mutex_exit(pmtx);
482 482
483 483 if (gotnewvpm) {
484 484
485 485 /*
486 486 * At this point, we've selected the vpm. Remove vpm
487 487 * from its freelist. If vpm is the first one in
488 488 * the freelist, update the head of the freelist.
489 489 */
490 490 if (first == vpm) {
491 491 ASSERT(first == allocq->vpmq_free);
492 492 allocq->vpmq_free = vpm->vpm_next;
493 493 }
494 494
495 495 /*
496 496 * If the head of the freelist still points to vpm,
497 497 * then there are no more free vpmaps in that list.
498 498 */
499 499 if (allocq->vpmq_free == vpm)
500 500 /*
501 501 * Took the last one
502 502 */
503 503 allocq->vpmq_free = NULL;
504 504 else {
505 505 vpm->vpm_prev->vpm_next = vpm->vpm_next;
506 506 vpm->vpm_next->vpm_prev = vpm->vpm_prev;
507 507 }
508 508 mutex_exit(&allocq->vpmq_mtx);
509 509 vpm->vpm_prev = vpm->vpm_next = NULL;
510 510
511 511 /*
512 512 * Disassociate the previous page.
513 513 * p_vpmref is used as a mapping reference to the page.
514 514 */
515 515 if ((pp = vpm->vpm_pp) != NULL &&
516 516 vpm->vpm_vp == pp->p_vnode &&
517 517 vpm->vpm_off == pp->p_offset) {
518 518
519 519 pmtx = PPMTX(pp);
520 520 if (page_trylock(pp, SE_SHARED)) {
521 521 /*
522 522 * Now verify that it is the correct
523 523 * page. If not someone else stole it,
524 524 * so just unlock it and leave.
525 525 */
526 526 mutex_enter(pmtx);
527 527 if (PP_ISFREE(pp) ||
528 528 vpm->vpm_vp != pp->p_vnode ||
529 529 vpm->vpm_off != pp->p_offset ||
530 530 pp->p_vpmref != VPMID(vpm)) {
531 531 mutex_exit(pmtx);
532 532
533 533 page_unlock(pp);
534 534 } else {
535 535 /*
536 536 * Release the page.
537 537 */
538 538 pp->p_vpmref = 0;
539 539 mutex_exit(pmtx);
540 540 (void) page_release(pp, 1);
541 541 }
542 542 } else {
543 543 /*
544 544 * If the page cannot be locked, just
545 545 * clear the p_vpmref and go.
546 546 */
547 547 mutex_enter(pmtx);
548 548 if (pp->p_vpmref == VPMID(vpm)) {
549 549 pp->p_vpmref = 0;
550 550 }
551 551 mutex_exit(pmtx);
552 552 VPM_DEBUG(vpmd_prevpagelocked);
553 553 }
554 554 }
555 555
556 556 /*
557 557 * Setup vpm to point to the new page.
558 558 */
559 559 vpm->vpm_pp = newpage;
560 560 vpm->vpm_vp = newpage->p_vnode;
561 561 vpm->vpm_off = newpage->p_offset;
562 562
563 563 } else {
564 564 int steal = !VPM_MTBF(steals, steals_mtbf);
565 565 /*
566 566 * Page already has a vpm assigned just use that.
567 567 * Grab the vpm mutex and verify that it is still
568 568 * the correct one. The pp->p_vpmref should not change
569 569 * once we have the vpm mutex and the page lock.
570 570 */
571 571 mutex_exit(&allocq->vpmq_mtx);
572 572 vpm = VPMP(vpmref);
573 573 vmtx = VPMAPMTX(vpm);
574 574 mutex_enter(vmtx);
575 575 if ((steal && vpm->vpm_refcnt == 0) ||
576 576 vpm->vpm_pp != newpage) {
577 577 /*
578 578 * The vpm got stolen, retry.
579 579 * clear the p_vpmref.
580 580 */
581 581 pmtx = PPMTX(newpage);
582 582 mutex_enter(pmtx);
583 583 if (newpage->p_vpmref == vpmref) {
584 584 newpage->p_vpmref = 0;
585 585 }
586 586 mutex_exit(pmtx);
587 587
588 588 mutex_exit(vmtx);
589 589 VPM_DEBUG(vpmd_steals);
590 590 goto retry_queue;
591 591 } else if (vpm->vpm_refcnt == 0) {
592 592 /*
593 593 * Remove it from the free list if it
594 594 * exists there.
595 595 */
596 596 VPMAP_RMFREELIST(vpm);
597 597 }
598 598 }
599 599 return (vpm);
600 600 }
601 601 }
602 602
603 603 static void
604 604 free_vpmap(struct vpmap *vpm)
605 605 {
606 606 struct vpmfree *vpmflp;
607 607 struct vpmap *vpmfreelist;
608 608 union vpm_freeq *releq;
609 609
610 610 ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));
611 611
612 612 if (vpm->vpm_refcnt != 0) {
613 613 panic("free_vpmap");
614 614 /*NOTREACHED*/
615 615 }
616 616
617 617 vpmflp = &vpmd_free[vpm->vpm_free_ndx];
618 618 /*
619 619 * Add to the tail of the release queue
620 620 * Note that vpm_releq and vpm_allocq could toggle
621 621 * before we get the lock. This does not affect
622 622 * correctness as the 2 queues are only maintained
623 623 * to reduce lock pressure.
624 624 */
625 625 releq = vpmflp->vpm_releq;
626 626 if (releq == &vpmflp->vpm_freeq[0]) {
627 627 vpm->vpm_ndxflg = 0;
628 628 } else {
629 629 vpm->vpm_ndxflg = 1;
630 630 }
631 631 mutex_enter(&releq->vpmq_mtx);
632 632 vpmfreelist = releq->vpmq_free;
633 633 if (vpmfreelist == 0) {
634 634 int want;
635 635
636 636 releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
637 637 /*
638 638 * Both queue mutexes are held to set vpm_want;
639 639 * snapshot the value before dropping releq mutex.
640 640 * If vpm_want appears after the releq mutex is dropped,
641 641 * then the vpmap just freed is already gone.
642 642 */
643 643 want = vpmflp->vpm_want;
644 644 mutex_exit(&releq->vpmq_mtx);
645 645 /*
646 646 * See if there was a waiter before dropping the releq mutex
647 647 * then recheck after obtaining vpm_freeq[0] mutex as
648 648 * another thread may have already signaled.
649 649 */
650 650 if (want) {
651 651 mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
652 652 if (vpmflp->vpm_want)
653 653 cv_signal(&vpmflp->vpm_free_cv);
654 654 mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
655 655 }
656 656 } else {
657 657 vpm->vpm_next = vpmfreelist;
658 658 vpm->vpm_prev = vpmfreelist->vpm_prev;
659 659 vpmfreelist->vpm_prev = vpm;
660 660 vpm->vpm_prev->vpm_next = vpm;
661 661 mutex_exit(&releq->vpmq_mtx);
662 662 }
663 663 }
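
Between them, get_free_vpmap() and free_vpmap() treat each freelist as a pair
of queues under separate mutexes: consumers drain vpm_allocq while freed
vpmaps accumulate on vpm_releq, and when the alloc side runs dry the two
pointers are swapped. Allocators and releasers therefore rarely touch the
same lock. A single-threaded sketch of the flip; the types are hypothetical
and the mutexes are omitted:

#include <assert.h>
#include <stddef.h>

struct item {
        struct item *next;
};

struct twoq {
        struct item *q[2];
        struct item **allocq;   /* consumers take from here */
        struct item **releq;    /* producers free to here */
};

static struct item *
take(struct twoq *t)
{
        struct item *it;

        if (*t->allocq == NULL) {       /* alloc side drained: flip */
                struct item **tmp = t->allocq;

                t->allocq = t->releq;
                t->releq = tmp;
        }
        if ((it = *t->allocq) != NULL)
                *t->allocq = it->next;
        return (it);
}

static void
put(struct twoq *t, struct item *it)
{
        it->next = *t->releq;
        *t->releq = it;
}

int
main(void)
{
        struct twoq t = { { NULL, NULL }, &t.q[0], &t.q[1] };
        struct item a;

        put(&t, &a);                    /* freed item lands on the releq */
        assert(take(&t) == &a);         /* the flip makes it allocatable */
        assert(take(&t) == NULL);
        return (0);
}
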
664 664
665 665 /*
666 666 * Get the vpmap for the page.
667 667 * The refcnt of this vpm is incremented.
668 668 */
669 669 static struct vpmap *
670 670 get_vpmap(page_t *pp)
671 671 {
672 672 struct vpmap *vpm = NULL;
673 673 kmutex_t *vmtx;
674 674 kmutex_t *pmtx;
675 675 unsigned int refid;
676 676
677 677 ASSERT((pp != NULL) && PAGE_LOCKED(pp));
678 678
679 679 if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
680 680 vpm = VPMP(refid);
681 681 vmtx = VPMAPMTX(vpm);
682 682 mutex_enter(vmtx);
683 683 /*
684 684 * Since we have the page lock and the vpm mutex, the
685 685 * pp->p_vpmref cannot change.
686 686 */
687 687 if (vpm->vpm_pp != pp) {
688 688 pmtx = PPMTX(pp);
689 689
690 690 /*
691 691 * Clear the p_vpmref as it is incorrect.
692 692 * This can happen if the page was stolen.
693 693 * On x64 this should not happen as p_vpmref
694 694 * is treated as a mapping on the page. So
695 695 * if the page is stolen, the mapping would have
696 696 * been cleared in page_unload().
697 697 */
698 698 mutex_enter(pmtx);
699 699 if (pp->p_vpmref == refid)
700 700 pp->p_vpmref = 0;
701 701 mutex_exit(pmtx);
702 702
703 703 mutex_exit(vmtx);
704 704 vpm = NULL;
705 705 } else if (vpm->vpm_refcnt == 0) {
706 706 /*
707 707 * Got the vpm, remove it from the free
708 708 * list if it exists there.
709 709 */
710 710 VPMAP_RMFREELIST(vpm);
711 711 }
712 712 }
713 713 if (vpm == NULL) {
714 714 /*
715 715 * get_free_vpmap() returns with the vpmap mutex held.
716 716 */
717 717 vpm = get_free_vpmap(pp);
718 718 vmtx = VPMAPMTX(vpm);
719 719 vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
720 720 } else {
721 721 vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
722 722 }
723 723
724 724 vpm->vpm_refcnt++;
725 725 mutex_exit(vmtx);
726 726
727 727 return (vpm);
728 728 }
729 729
730 730 /* END --- vpm cache ---- */
731 731
732 732 /*
733 733 * The vnode page mapping(vpm) interface routines.
734 734 */
735 735
736 736 /*
737 737 * Find or create the pages starting from baseoff for the specified
738 738 * length 'len'.
739 739 */
740 740 static int
741 741 vpm_pagecreate(
742 742 struct vnode *vp,
743 743 u_offset_t baseoff,
744 744 size_t len,
745 745 vmap_t vml[],
746 746 int nseg,
747 747 int *newpage)
748 748 {
749 749
750 750 page_t *pp = NULL;
751 751 caddr_t base;
752 752 u_offset_t off = baseoff;
753 753 int i;
754 754 ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
755 755
756 756 for (i = 0; len > 0; len -= PAGESIZE, i++) {
757 757 struct vpmap *vpm;
758 758
759 759
760 760 if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
761 761
762 762 base = segkpm_create_va(off);
763 763
764 764 /*
765 765 * The seg pointer passed in is just advisory. Just
766 766 * pass segkmap for now like segmap does with
767 767 * segmap_kpm enabled.
768 768 */
769 769 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
770 770 segkmap, base)) == NULL) {
771 771 panic("segmap_pagecreate_vpm: "
772 772 "page_create failed");
773 773 /*NOTREACHED*/
774 774 }
775 775 if (newpage != NULL)
776 776 *newpage = 1;
777 777
778 778 page_io_unlock(pp);
779 779 }
780 780
781 781 /*
782 782 * Get the vpm for this page_t.
783 783 */
784 784 if (vpm_cache_enable) {
785 785 vpm = get_vpmap(pp);
786 786 vml[i].vs_data = (void *)&vpm->vpm_pp;
787 787 } else {
788 788 vml[i].vs_data = (void *)pp;
789 789 pp->p_vpmref = 0;
790 790 }
791 791
792 792 vml[i].vs_addr = hat_kpm_mapin(pp, 0);
793 793 vml[i].vs_len = PAGESIZE;
794 794
795 795 off += PAGESIZE;
796 796 }
797 797 vml[i].vs_data = NULL;
798 798 vml[i].vs_addr = (caddr_t)NULL;
799 799 return (0);
800 800 }
801 801
802 802
803 803 /*
804 804 * Returns vpm mappings of pages in the range [off, off+len], where
805 805 * len is rounded up to the PAGESIZE boundary. The list of pages and
806 806 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
807 807 * The nseg is the number of vmap_t entries in the array.
808 808 *
809 809 * The segmap's SM_LOCKPROTO usage is not supported by these interfaces.
810 810 * For such cases, use the seg_map interfaces.
811 811 */
812 812 int
813 813 vpm_map_pages(
814 814 struct vnode *vp,
815 815 u_offset_t off,
816 816 size_t len,
817 817 int fetchpage,
818 818 vmap_t *vml,
819 819 int nseg,
820 820 int *newpage,
821 821 enum seg_rw rw)
822 822 {
823 823 extern struct vnode *common_specvp();
824 824 u_offset_t baseoff;
825 825 uint_t prot;
826 826 caddr_t base;
827 827 page_t *pp, *pplist[MAXVMAPS];
828 828 struct vpmap *vpm;
829 829 int i, error = 0;
830 830 size_t tlen;
831 831
832 832 ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
833 833 baseoff = off & (offset_t)PAGEMASK;
834 834 vml[0].vs_data = NULL;
835 835 vml[0].vs_addr = (caddr_t)NULL;
836 836
837 837 tlen = P2ROUNDUP(off + len, PAGESIZE) - baseoff;
838 838 /*
839 839 * Restrict it to VPMMAXLEN.
840 840 */
841 841 if (tlen > (VPMMAXPGS * PAGESIZE)) {
842 842 tlen = VPMMAXPGS * PAGESIZE;
843 843 }
844 844 /*
845 845 * Ensure length fits within the vml[] array. One element of
846 846 * the array is used to mark the end of the scatter/gather list
847 847 * of valid mappings by setting its vs_addr = NULL. Leave space
848 848 * for this element.
849 849 */
850 850 if (tlen > ((nseg - 1) * PAGESIZE)) {
851 851 tlen = ((nseg - 1) * PAGESIZE);
852 852 }
853 853 len = tlen;
854 854
855 855 /*
856 856 * If this is a block device we have to be sure to use the
857 857 * "common" block device vnode for the mapping.
858 858 */
859 859 if (vp->v_type == VBLK)
860 860 vp = common_specvp(vp);
861 861
862 862
863 863 if (!fetchpage)
864 864 return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));
865 865
866 866 for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {
867 867
868 868 pp = page_lookup(vp, baseoff, SE_SHARED);
869 869
870 870 /*
871 871 * If we did not find the page or if this page was not
872 872 * in vpm cache(p_vpmref == 0), then let VOP_GETPAGE get
873 873 * all the pages.
874 874 * We need to call VOP_GETPAGE so that filesystems can do some
875 875 * (un)necessary tracking for sequential access.
876 876 */
877 877
878 878 if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
879 879 (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
880 880 != (P_MOD | P_REF))) {
881 881 int j;
882 882 if (pp != NULL) {
883 883 page_unlock(pp);
884 884 }
885 885 /*
886 886 * If we did not find the desired set of pages
887 887 * in the page cache, just call VOP_GETPAGE to get
888 888 * all the pages.
889 889 */
890 890 for (j = 0; j < i; j++) {
891 891 page_unlock(pplist[j]);
892 892 }
893 893
894 894
895 895 baseoff = off & (offset_t)PAGEMASK;
896 896 /*
897 897 * Pass a dummy address as it will be required
898 898 * by page_create_va(). We pass segkmap as the seg
899 899 * as some file systems(UFS) check it.
900 900 */
901 901 base = segkpm_create_va(baseoff);
902 902
903 903 error = VOP_GETPAGE(vp, baseoff, tlen, &prot, pplist,
904 904 tlen, segkmap, base, rw, CRED(), NULL);
905 905 if (error) {
906 906 VPM_DEBUG(vpmd_getpagefailed);
907 907 pplist[0] = NULL;
908 908 }
909 909 break;
910 910 } else {
911 911 pplist[i] = pp;
912 912 baseoff += PAGESIZE;
913 913 }
914 914 }
915 915
916 916 if (error) {
917 917 for (i = 0; pplist[i] != NULL; i++) {
918 918 page_unlock(pplist[i]);
919 919 pplist[i] = NULL;
920 920 }
921 921 vml[0].vs_addr = NULL;
922 922 vml[0].vs_data = NULL;
923 923 return (error);
924 924 }
925 925
926 926 /*
927 927 * Get the vpm's for pages.
928 928 */
929 929 for (i = 0; pplist[i] != NULL; i++) {
930 930 if (vpm_cache_enable) {
931 931 vpm = get_vpmap(pplist[i]);
932 932 vml[i].vs_data = (void *)&(vpm->vpm_pp);
933 933 } else {
934 934 vml[i].vs_data = (void *)pplist[i];
935 935 pplist[i]->p_vpmref = 0;
936 936 }
937 937
938 938 vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
939 939 vml[i].vs_len = PAGESIZE;
940 940 }
941 941
942 942 vml[i].vs_data = NULL;
943 943 vml[i].vs_addr = (caddr_t)NULL;
944 944
945 945 return (0);
946 946 }
947 947
948 948 /*
949 949 * Release the vpm mappings on the pages and unlock them.
950 950 */
951 951 void
952 952 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
953 953 {
954 954 int i;
955 955 struct vpmap *vpm;
956 956 kmutex_t *mtx;
957 957 page_t *pp;
958 958
959 959 for (i = 0; vml[i].vs_data != NULL; i++) {
960 960 ASSERT(IS_KPM_ADDR(vml[i].vs_addr));
961 961
962 962 if (vpm_cache_enable) {
963 963 pp = *(((page_t **)vml[i].vs_data));
964 964 } else {
965 965 pp = (page_t *)vml[i].vs_data;
966 966 }
967 967
968 968 /*
969 969 * Mark the page as modified or referenced, because vpm pages
970 970 * would not take the faults that normally set these bits.
971 971 */
972 972 if (rw == S_WRITE) {
973 973 hat_setrefmod(pp);
974 974 } else {
975 975 ASSERT(rw == S_READ);
976 976 hat_setref(pp);
977 977 }
978 978
979 979 if (vpm_cache_enable) {
980 980 vpm = (struct vpmap *)((char *)vml[i].vs_data
981 981 - offsetof(struct vpmap, vpm_pp));
982 982 hat_kpm_mapout(pp, 0, vml[i].vs_addr);
983 983 page_unlock(pp);
984 984 mtx = VPMAPMTX(vpm);
985 985 mutex_enter(mtx);
986 986
987 987 if (--vpm->vpm_refcnt == 0) {
988 988 free_vpmap(vpm);
989 989 }
990 990 mutex_exit(mtx);
991 991 } else {
992 992 hat_kpm_mapout(pp, 0, vml[i].vs_addr);
993 993 (void) page_release(pp, 1);
994 994 }
995 995 vml[i].vs_data = NULL;
996 996 vml[i].vs_addr = NULL;
997 997 }
998 998 }
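
When the cache is enabled, vs_data points not at the vpmap itself but at its
vpm_pp field, and the offsetof() subtraction above recovers the enclosing
structure: the familiar container_of pattern. A standalone sketch with a
hypothetical structure:

#include <assert.h>
#include <stddef.h>

struct vpmap_like {
        int     refcnt;
        void    *pp;            /* callers hold a pointer to this field */
};

int
main(void)
{
        struct vpmap_like v;
        void **field = &v.pp;
        struct vpmap_like *back;

        /* Recover the containing structure from the interior pointer. */
        back = (struct vpmap_like *)((char *)field -
            offsetof(struct vpmap_like, pp));
        assert(back == &v);
        return (0);
}
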
999 999
1000 1000 /*
1001 1001 * Given the vp, off and the uio structure, this routine will do the
1002 1002 * copy (uiomove). If the last page created is partially written,
1003 1003 * the rest of the page is zeroed out. It also zeros the beginning of
1004 1004 * the first page up to the start offset if requested (zerostart).
1005 1005 * If pages are to be fetched, it will call the filesystem's getpage
1006 1006 * function (VOP_GETPAGE) to get them, otherwise they will be created if
1007 1007 * not already present in the page cache.
1008 1008 */
1009 1009 int
1010 1010 vpm_data_copy(struct vnode *vp,
1011 1011 u_offset_t off,
1012 1012 size_t len,
1013 1013 struct uio *uio,
1014 1014 int fetchpage,
1015 1015 int *newpage,
1016 1016 int zerostart,
1017 1017 enum seg_rw rw)
1018 1018 {
1019 1019 int error;
1020 1020 struct vmap vml[MINVMAPS];
1021 1021 enum uio_rw uiorw;
1022 1022 int npages = 0;
1023 1023
1024 1024 uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
1025 1025 /*
1026 1026 * 'off' will be the offset where the I/O starts.
1027 1027 * We get the pages starting at the (off & PAGEMASK)
1028 1028 * page boundary.
1029 1029 */
1030 1030 error = vpm_map_pages(vp, off, (uint_t)len,
1031 1031 fetchpage, vml, MINVMAPS, &npages, rw);
1032 1032
1033 1033 if (newpage != NULL)
1034 1034 *newpage = npages;
1035 1035 if (!error) {
1036 1036 int i, pn, slen = len;
1037 1037 int pon = off & PAGEOFFSET;
1038 1038
1039 1039 /*
1040 1040 * Clear from the beginning of the page to start offset
1041 1041 * if requested.
1042 1042 */
1043 1043 if (!fetchpage && zerostart) {
1044 1044 (void) kzero(vml[0].vs_addr, (uint_t)pon);
1045 1045 VPM_DEBUG(vpmd_zerostart);
1046 1046 }
1047 1047
1048 1048 for (i = 0; !error && slen > 0 &&
1049 1049 vml[i].vs_addr != NULL; i++) {
1050 1050 pn = (int)MIN(slen, (PAGESIZE - pon));
1051 1051 error = uiomove(vml[i].vs_addr + pon,
1052 1052 (long)pn, uiorw, uio);
1053 1053 slen -= pn;
1054 1054 pon = 0;
1055 1055 }
1056 1056
1057 1057 /*
1058 1058 * When new pages are created, zero out part of the
1059 1059 * page we did not copy to.
1060 1060 */
1061 1061 if (!fetchpage && npages &&
1062 1062 uio->uio_loffset < roundup(off + len, PAGESIZE)) {
1063 1063 int nzero;
1064 1064
1065 1065 pon = (uio->uio_loffset & PAGEOFFSET);
1066 1066 nzero = PAGESIZE - pon;
1067 1067 i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
1068 1068 (void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
1069 1069 }
1070 1070 vpm_unmap_pages(vml, rw);
1071 1071 }
1072 1072 return (error);
1073 1073 }
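
The tail-zeroing step above is pure page-mask arithmetic: pon is how far the
copy's end sits within its page, nzero is the rest of that page, and i picks
the vml[] slot that maps the page. A worked sketch of those three
expressions; the 4 KB page size and the offsets are illustrative:

#include <assert.h>

#define PAGESIZE        4096UL
#define PAGEOFFSET      (PAGESIZE - 1)
#define PAGEMASK        (~PAGEOFFSET)

int
main(void)
{
        unsigned long off = 8192 + 100;         /* the copy starts here */
        unsigned long loffset = off + 1000;     /* uio offset after the copy */
        unsigned long pon, nzero, i;

        pon = loffset & PAGEOFFSET;             /* 1100 bytes into the page */
        nzero = PAGESIZE - pon;                 /* 2996 bytes left to zero */
        i = (loffset - (off & PAGEMASK)) / PAGESIZE;    /* vml[] index: 0 */
        assert(pon == 1100 && nzero == 2996 && i == 0);
        return (0);
}
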
1074 1074
1075 1075 /*
1076 1076 * Called to flush pages for the given vnode covering
1077 1077 * the [off, off+len] range.
1078 1078 */
1079 1079 int
1080 1080 vpm_sync_pages(struct vnode *vp,
1081 1081 u_offset_t off,
1082 1082 size_t len,
1083 1083 uint_t flags)
1084 1084 {
1085 1085 extern struct vnode *common_specvp();
1086 1086 int bflags = 0;
1087 1087 int error = 0;
1088 1088 size_t psize = roundup(len, PAGESIZE);
1089 1089
1090 1090 /*
1091 1091 * If this is a block device we have to be sure to use the
1092 1092 * "common" block device vnode for the mapping.
1093 1093 */
1094 1094 if (vp->v_type == VBLK)
1095 1095 vp = common_specvp(vp);
1096 1096
1097 1097 if ((flags & ~SM_DONTNEED) != 0) {
1098 1098 if (flags & SM_ASYNC)
1099 1099 bflags |= B_ASYNC;
1100 1100 if (flags & SM_INVAL)
1101 1101 bflags |= B_INVAL;
1102 1102 if (flags & SM_DESTROY)
1103 1103 bflags |= (B_INVAL|B_TRUNC);
1104 1104 if (flags & SM_FREE)
1105 1105 bflags |= B_FREE;
1106 1106 if (flags & SM_DONTNEED)
1107 1107 bflags |= B_DONTNEED;
1108 1108
1109 1109 error = VOP_PUTPAGE(vp, off, psize, bflags, CRED(), NULL);
1110 1110 }
1111 1111
1112 1112 return (error);
1113 1113 }
1114 1114
1115 1115
1116 1116 #else /* SEGKPM_SUPPORT */
1117 1117
1118 1118 /* vpm stubs */
1119 1119 void
1120 1120 vpm_init()
1121 1121 {
1122 1122 }
1123 1123
1124 1124 /*ARGSUSED*/
1125 1125 int
1126 1126 vpm_pagecreate(
1127 1127 struct vnode *vp,
1128 1128 u_offset_t baseoff,
1129 1129 size_t len,
1130 1130 vmap_t vml[],
1131 1131 int nseg,
1132 1132 int *newpage)
1133 1133 {
1134 1134 return (0);
1135 1135 }
1136 1136
1137 1137 /*ARGSUSED*/
1138 1138 int
1139 1139 vpm_map_pages(
1140 1140 struct vnode *vp,
1141 1141 u_offset_t off,
1142 1142 size_t len,
1143 1143 int fetchpage,
1144 1144 vmap_t vml[],
1145 1145 int nseg,
1146 1146 int *newpage,
1147 1147 enum seg_rw rw)
1148 1148 {
1149 1149 return (0);
1150 1150 }
1151 1151
1152 1152 /*ARGSUSED*/
1153 1153 int
1154 1154 vpm_data_copy(struct vnode *vp,
1155 1155 u_offset_t off,
1156 1156 size_t len,
1157 1157 struct uio *uio,
1158 1158 int fetchpage,
1159 1159 int *newpage,
1160 1160 int zerostart,
1161 1161 enum seg_rw rw)
1162 1162 {
1163 1163 return (0);
1164 1164 }
1165 1165
1166 1166 /*ARGSUSED*/
1167 1167 void
1168 1168 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
1169 1169 {
1170 1170 }
1171 1171 /*ARGSUSED*/
1172 1172 int
1173 1173 vpm_sync_pages(struct vnode *vp,
1174 1174 u_offset_t off,
1175 1175 size_t len,
1176 1176 uint_t flags)
1177 1177 {
1178 1178 return (0);
1179 1179 }
1180 1180 #endif /* SEGKPM_SUPPORT */