1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  26 /*      All Rights Reserved   */
  27 
  28 /*
  29  * Portions of this source code were derived from Berkeley 4.3 BSD
  30  * under license from the Regents of the University of California.
  31  */
  32 
  33 /*
  34  * segkp is a segment driver that administers the allocation and deallocation
  35  * of pageable variable size chunks of kernel virtual address space. Each
  36  * allocated resource is page-aligned.
  37  *
  38  * The user may specify whether the resource should be initialized to 0,
  39  * include a redzone, or be locked in memory.
  40  */
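
     /*
      * Illustrative usage sketch (not part of the driver): a client allocates
      * a pageable, zeroed, redzone-protected resource from the global segkp
      * segment and later returns it.  The symbols used (segkp, the KPD_* flags,
      * segkp_get() and segkp_release()) are the ones defined in this file and
      * in <vm/seg_kp.h>; the size chosen below is arbitrary and must be
      * page-aligned.
      *
      *         caddr_t va;
      *
      *         va = segkp_get(segkp, 4 * PAGESIZE,
      *             KPD_ZERO | KPD_HASREDZONE | KPD_LOCKED | KPD_NO_ANON);
      *         if (va == NULL)
      *                 return (ENOMEM);
      *         ... use the mapped portion of the resource ...
      *         segkp_release(segkp, va);
      */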
  41 
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/thread.h>
  45 #include <sys/param.h>
  46 #include <sys/errno.h>
  47 #include <sys/sysmacros.h>
  48 #include <sys/systm.h>
  49 #include <sys/buf.h>
  50 #include <sys/mman.h>
  51 #include <sys/vnode.h>
  52 #include <sys/cmn_err.h>
  53 #include <sys/swap.h>
  54 #include <sys/tuneable.h>
  55 #include <sys/kmem.h>
  56 #include <sys/vmem.h>
  57 #include <sys/cred.h>
  58 #include <sys/dumphdr.h>
  59 #include <sys/debug.h>
  60 #include <sys/vtrace.h>
  61 #include <sys/stack.h>
  62 #include <sys/atomic.h>
  63 #include <sys/archsystm.h>
  64 #include <sys/lgrp.h>
  65 
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/seg_kp.h>
  69 #include <vm/seg_kmem.h>
  70 #include <vm/anon.h>
  71 #include <vm/page.h>
  72 #include <vm/hat.h>
  73 #include <sys/bitmap.h>
  74 
  75 /*
  76  * Private seg op routines
  77  */
  78 static void     segkp_badop(void);
  79 static void     segkp_dump(struct seg *seg);
  80 static int      segkp_checkprot(struct seg *seg, caddr_t addr, size_t len,
  81                         uint_t prot);
  82 static int      segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
  83 static int      segkp_pagelock(struct seg *seg, caddr_t addr, size_t len,
  84                         struct page ***page, enum lock_type type,
  85                         enum seg_rw rw);
  86 static void     segkp_insert(struct seg *seg, struct segkp_data *kpd);
  87 static void     segkp_delete(struct seg *seg, struct segkp_data *kpd);
  88 static caddr_t  segkp_get_internal(struct seg *seg, size_t len, uint_t flags,
  89                         struct segkp_data **tkpd, struct anon_map *amp);
  90 static void     segkp_release_internal(struct seg *seg,
  91                         struct segkp_data *kpd, size_t len);
  92 static int      segkp_unlock(struct hat *hat, struct seg *seg, caddr_t vaddr,
  93                         size_t len, struct segkp_data *kpd, uint_t flags);
  94 static int      segkp_load(struct hat *hat, struct seg *seg, caddr_t vaddr,
  95                         size_t len, struct segkp_data *kpd, uint_t flags);
  96 static struct   segkp_data *segkp_find(struct seg *seg, caddr_t vaddr);
  97 static int      segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
  98 static int      segkp_capable(struct seg *seg, segcapability_t capability);
  99 
 100 /*
 101  * Lock used to protect the hash table(s) and caches.
 102  */
 103 static kmutex_t segkp_lock;
 104 
 105 /*
 106  * The segkp caches
 107  */
 108 static struct segkp_cache segkp_cache[SEGKP_MAX_CACHE];
 109 
 110 #define SEGKP_BADOP(t)  (t(*)())segkp_badop
 111 
 112 /*
 113  * When there are fewer than red_minavail bytes left on the stack,
 114  * segkp_map_red() will map in the redzone (if called).  5000 seems
 115  * to work reasonably well...
 116  */
 117 long            red_minavail = 5000;
 118 
 119 /*
 120  * Will be set to 1 for 32-bit x86 systems only, in startup.c.
 121  */
 122 int     segkp_fromheap = 0;
 123 ulong_t *segkp_bitmap;
 124 
 125 /*
 126  * If segkp_map_red() is called with the redzone already mapped and
 127  * with less than RED_DEEP_THRESHOLD bytes available on the stack,
 128  * then the stack situation has become quite serious;  if much more stack
 129  * is consumed, we have the potential of scrogging the next thread/LWP
 130  * structure.  To help debug the "can't happen" panics which may
 131  * result from this condition, we record hrestime and the calling thread
 132  * in red_deep_hires and red_deep_thread respectively.
 133  */
 134 #define RED_DEEP_THRESHOLD      2000
 135 
 136 hrtime_t        red_deep_hires;
 137 kthread_t       *red_deep_thread;
 138 
 139 uint32_t        red_nmapped;
 140 uint32_t        red_closest = UINT_MAX;
 141 uint32_t        red_ndoubles;
 142 
 143 pgcnt_t anon_segkp_pages_locked;        /* See vm/anon.h */
 144 pgcnt_t anon_segkp_pages_resv;          /* anon reserved by seg_kp */
 145 
 146 static struct   seg_ops segkp_ops = {
 147         .dup            = SEGKP_BADOP(int),
 148         .unmap          = SEGKP_BADOP(int),
 149         .free           = SEGKP_BADOP(void),
 150         .fault          = segkp_fault,
 151         .faulta         = SEGKP_BADOP(faultcode_t),
 152         .setprot        = SEGKP_BADOP(int),
 153         .checkprot      = segkp_checkprot,
 154         .kluster        = segkp_kluster,
 155         .swapout        = SEGKP_BADOP(size_t),
 156         .sync           = SEGKP_BADOP(int),
 157         .incore         = SEGKP_BADOP(size_t),
 158         .lockop         = SEGKP_BADOP(int),
 159         .getprot        = SEGKP_BADOP(int),
 160         .getoffset      = SEGKP_BADOP(u_offset_t),
 161         .gettype        = SEGKP_BADOP(int),
 162         .getvp          = SEGKP_BADOP(int),
 163         .advise         = SEGKP_BADOP(int),
 164         .dump           = segkp_dump,
 165         .pagelock       = segkp_pagelock,
 166         .setpagesize    = SEGKP_BADOP(int),
 167         .getmemid       = segkp_getmemid,
 168         .capable        = segkp_capable,
 169 };
 170 
 171 
 172 static void
 173 segkp_badop(void)
 174 {
 175         panic("segkp_badop");
 176         /*NOTREACHED*/
 177 }
 178 
 179 static void segkpinit_mem_config(struct seg *);
 180 
 181 static uint32_t segkp_indel;
 182 
 183 /*
 184  * Allocate the segment-specific private data struct and fill it in
 185  * with the per-kp-segment vmem arena, anon pointer array and hash table.
 186  */
 187 int
 188 segkp_create(struct seg *seg)
 189 {
 190         struct segkp_segdata *kpsd;
 191         size_t  np;
 192 
 193         ASSERT(seg != NULL && seg->s_as == &kas);
 194         ASSERT(RW_WRITE_HELD(&seg->s_as->a_lock));
 195 
 196         if (seg->s_size & PAGEOFFSET) {
 197                 panic("Bad segkp size");
 198                 /*NOTREACHED*/
 199         }
 200 
 201         kpsd = kmem_zalloc(sizeof (struct segkp_segdata), KM_SLEEP);
 202 
 203         /*
 204          * Allocate the virtual memory for segkp and initialize it
 205          */
 206         if (segkp_fromheap) {
 207                 np = btop(kvseg.s_size);
 208                 segkp_bitmap = kmem_zalloc(BT_SIZEOFMAP(np), KM_SLEEP);
 209                 kpsd->kpsd_arena = vmem_create("segkp", NULL, 0, PAGESIZE,
 210                     vmem_alloc, vmem_free, heap_arena, 5 * PAGESIZE, VM_SLEEP);
 211         } else {
 212                 segkp_bitmap = NULL;
 213                 np = btop(seg->s_size);
 214                 kpsd->kpsd_arena = vmem_create("segkp", seg->s_base,
 215                     seg->s_size, PAGESIZE, NULL, NULL, NULL, 5 * PAGESIZE,
 216                     VM_SLEEP);
 217         }
 218 
 219         kpsd->kpsd_anon = anon_create(np, ANON_SLEEP | ANON_ALLOC_FORCE);
 220 
 221         kpsd->kpsd_hash = kmem_zalloc(SEGKP_HASHSZ * sizeof (struct segkp *),
 222             KM_SLEEP);
 223         seg->s_data = (void *)kpsd;
 224         seg->s_ops = &segkp_ops;
 225         segkpinit_mem_config(seg);
 226         return (0);
 227 }
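
     /*
      * Rough sketch (for illustration only) of how boot code typically brings
      * this segment into existence; the exact base, size and error handling
      * live in the platform startup code, and "segkp_base"/"segkp_size" below
      * are placeholder names rather than symbols defined here.  Note that
      * segkp_create() expects kas.a_lock to be held as writer, per the ASSERT
      * above.
      *
      *         rw_enter(&kas.a_lock, RW_WRITER);
      *         if (seg_attach(&kas, segkp_base, segkp_size, segkp) < 0)
      *                 cmn_err(CE_PANIC, "cannot attach segkp");
      *         if (segkp_create(segkp) != 0)
      *                 cmn_err(CE_PANIC, "segkp_create failed");
      *         rw_exit(&kas.a_lock);
      */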
 228 
 229 
 230 /*
 231  * Find a free 'freelist' and initialize it with the appropriate attributes
 232  */
 233 void *
 234 segkp_cache_init(struct seg *seg, int maxsize, size_t len, uint_t flags)
 235 {
 236         int i;
 237 
 238         if ((flags & KPD_NO_ANON) && !(flags & KPD_LOCKED))
 239                 return ((void *)-1);
 240 
 241         mutex_enter(&segkp_lock);
 242         for (i = 0; i < SEGKP_MAX_CACHE; i++) {
 243                 if (segkp_cache[i].kpf_inuse)
 244                         continue;
 245                 segkp_cache[i].kpf_inuse = 1;
 246                 segkp_cache[i].kpf_max = maxsize;
 247                 segkp_cache[i].kpf_flags = flags;
 248                 segkp_cache[i].kpf_seg = seg;
 249                 segkp_cache[i].kpf_len = len;
 250                 mutex_exit(&segkp_lock);
 251                 return ((void *)(uintptr_t)i);
 252         }
 253         mutex_exit(&segkp_lock);
 254         return ((void *)-1);
 255 }
 256 
 257 /*
 258  * Free all the cache resources.
 259  */
 260 void
 261 segkp_cache_free(void)
 262 {
 263         struct segkp_data *kpd;
 264         struct seg *seg;
 265         int i;
 266 
 267         mutex_enter(&segkp_lock);
 268         for (i = 0; i < SEGKP_MAX_CACHE; i++) {
 269                 if (!segkp_cache[i].kpf_inuse)
 270                         continue;
 271                 /*
 272                  * Disconnect the freelist and process each element
 273                  */
 274                 kpd = segkp_cache[i].kpf_list;
 275                 seg = segkp_cache[i].kpf_seg;
 276                 segkp_cache[i].kpf_list = NULL;
 277                 segkp_cache[i].kpf_count = 0;
 278                 mutex_exit(&segkp_lock);
 279 
 280                 while (kpd != NULL) {
 281                         struct segkp_data *next;
 282 
 283                         next = kpd->kp_next;
 284                         segkp_release_internal(seg, kpd, kpd->kp_len);
 285                         kpd = next;
 286                 }
 287                 mutex_enter(&segkp_lock);
 288         }
 289         mutex_exit(&segkp_lock);
 290 }
 291 
 292 /*
 293  * There are two entry points into segkp_get_internal. The first uses a
 294  * cookie to access a pool of cached segkp resources; the second does not
 295  * use the cache.
 296  */
 297 caddr_t
 298 segkp_get(struct seg *seg, size_t len, uint_t flags)
 299 {
 300         struct segkp_data *kpd = NULL;
 301 
 302         if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) {
 303                 kpd->kp_cookie = -1;
 304                 return (stom(kpd->kp_base, flags));
 305         }
 306         return (NULL);
 307 }
 308 
 309 /*
 310  * Return a 'cached' segkp address
 311  */
 312 caddr_t
 313 segkp_cache_get(void *cookie)
 314 {
 315         struct segkp_cache *freelist = NULL;
 316         struct segkp_data *kpd = NULL;
 317         int index = (int)(uintptr_t)cookie;
 318         struct seg *seg;
 319         size_t len;
 320         uint_t flags;
 321 
 322         if (index < 0 || index >= SEGKP_MAX_CACHE)
 323                 return (NULL);
 324         freelist = &segkp_cache[index];
 325 
 326         mutex_enter(&segkp_lock);
 327         seg = freelist->kpf_seg;
 328         flags = freelist->kpf_flags;
 329         if (freelist->kpf_list != NULL) {
 330                 kpd = freelist->kpf_list;
 331                 freelist->kpf_list = kpd->kp_next;
 332                 freelist->kpf_count--;
 333                 mutex_exit(&segkp_lock);
 334                 kpd->kp_next = NULL;
 335                 segkp_insert(seg, kpd);
 336                 return (stom(kpd->kp_base, flags));
 337         }
 338         len = freelist->kpf_len;
 339         mutex_exit(&segkp_lock);
 340         if (segkp_get_internal(seg, len, flags, &kpd, NULL) != NULL) {
 341                 kpd->kp_cookie = index;
 342                 return (stom(kpd->kp_base, flags));
 343         }
 344         return (NULL);
 345 }
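
     /*
      * Illustrative sketch (not part of the driver) of the cached path: a
      * client creates a freelist once with segkp_cache_init() and then
      * allocates through it.  The cache depth and length below are arbitrary
      * example values.  A cached resource is released through the ordinary
      * segkp_release(), which notices the non-(-1) kp_cookie and returns the
      * resource to the freelist rather than freeing it.
      *
      *         void *cookie;
      *         caddr_t va;
      *
      *         cookie = segkp_cache_init(segkp, 24, DEFAULTSTKSZ,
      *             KPD_HASREDZONE | KPD_LOCKED | KPD_NO_ANON);
      *         if (cookie == (void *)-1)
      *                 ... no free cache slot ...
      *         if ((va = segkp_cache_get(cookie)) == NULL)
      *                 ... allocation failed ...
      *         ... use the resource ...
      *         segkp_release(segkp, va);
      */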
 346 
 347 caddr_t
 348 segkp_get_withanonmap(
 349         struct seg *seg,
 350         size_t len,
 351         uint_t flags,
 352         struct anon_map *amp)
 353 {
 354         struct segkp_data *kpd = NULL;
 355 
 356         ASSERT(amp != NULL);
 357         flags |= KPD_HASAMP;
 358         if (segkp_get_internal(seg, len, flags, &kpd, amp) != NULL) {
 359                 kpd->kp_cookie = -1;
 360                 return (stom(kpd->kp_base, flags));
 361         }
 362         return (NULL);
 363 }
 364 
 365 /*
 366  * This does the real work of segkp allocation.
 367  * The base address is returned to the client; len must be page-aligned. A
 368  * null value is returned if there are no more vm resources (e.g. pages,
 369  * swap). The len and base recorded in the private data structure include
 370  * the redzone and its length (if applicable). If the user requests a
 371  * redzone, either the first or last page is left unmapped, depending on
 372  * whether stacks grow toward low or high memory.
 373  *
 374  * The client may also specify a no-wait flag. If that is set, the request
 375  * will choose a non-blocking path when requesting resources. The default
 376  * is to make the client wait.
 377  */
 378 static caddr_t
 379 segkp_get_internal(
 380         struct seg *seg,
 381         size_t len,
 382         uint_t flags,
 383         struct segkp_data **tkpd,
 384         struct anon_map *amp)
 385 {
 386         struct segkp_segdata    *kpsd = (struct segkp_segdata *)seg->s_data;
 387         struct segkp_data       *kpd;
 388         caddr_t vbase = NULL;   /* always first virtual, may not be mapped */
 389         pgcnt_t np = 0;         /* number of pages in the resource */
 390         pgcnt_t segkpindex;
 391         long i;
 392         caddr_t va;
 393         pgcnt_t pages = 0;
 394         ulong_t anon_idx = 0;
 395         int kmflag = (flags & KPD_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
 396         caddr_t s_base = (segkp_fromheap) ? kvseg.s_base : seg->s_base;
 397 
 398         if (len & PAGEOFFSET) {
 399                 panic("segkp_get: len is not page-aligned");
 400                 /*NOTREACHED*/
 401         }
 402 
 403         ASSERT(((flags & KPD_HASAMP) == 0) == (amp == NULL));
 404 
 405         /* Only allow KPD_NO_ANON if we are going to lock it down */
 406         if ((flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON)
 407                 return (NULL);
 408 
 409         if ((kpd = kmem_zalloc(sizeof (struct segkp_data), kmflag)) == NULL)
 410                 return (NULL);
 411         /*
 412          * Fix up the len to reflect the REDZONE if applicable
 413          */
 414         if (flags & KPD_HASREDZONE)
 415                 len += PAGESIZE;
 416         np = btop(len);
 417 
 418         vbase = vmem_alloc(SEGKP_VMEM(seg), len, kmflag | VM_BESTFIT);
 419         if (vbase == NULL) {
 420                 kmem_free(kpd, sizeof (struct segkp_data));
 421                 return (NULL);
 422         }
 423 
 424         /* If locking, reserve physical memory */
 425         if (flags & KPD_LOCKED) {
 426                 pages = btop(SEGKP_MAPLEN(len, flags));
 427                 if (page_resv(pages, kmflag) == 0) {
 428                         vmem_free(SEGKP_VMEM(seg), vbase, len);
 429                         kmem_free(kpd, sizeof (struct segkp_data));
 430                         return (NULL);
 431                 }
 432                 if ((flags & KPD_NO_ANON) == 0)
 433                         atomic_add_long(&anon_segkp_pages_locked, pages);
 434         }
 435 
 436         /*
 437          * Reserve sufficient swap space for this vm resource.  We'll
 438          * actually allocate it in the loop below, but reserving it
 439          * here allows us to back out more gracefully than if we
 440          * had an allocation failure in the body of the loop.
 441          *
 442          * Note that we don't need swap space for the red zone page.
 443          */
 444         if (amp != NULL) {
 445                 /*
 446                  * The swap reservation has been done, if required, and the
 447                  * anon_hdr is separate.
 448                  */
 449                 anon_idx = 0;
 450                 kpd->kp_anon_idx = anon_idx;
 451                 kpd->kp_anon = amp->ahp;
 452 
 453                 TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
 454                     kpd, vbase, len, flags, 1);
 455 
 456         } else if ((flags & KPD_NO_ANON) == 0) {
 457                 if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) {
 458                         if (flags & KPD_LOCKED) {
 459                                 atomic_add_long(&anon_segkp_pages_locked,
 460                                     -pages);
 461                                 page_unresv(pages);
 462                         }
 463                         vmem_free(SEGKP_VMEM(seg), vbase, len);
 464                         kmem_free(kpd, sizeof (struct segkp_data));
 465                         return (NULL);
 466                 }
 467                 atomic_add_long(&anon_segkp_pages_resv,
 468                     btop(SEGKP_MAPLEN(len, flags)));
 469                 anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT;
 470                 kpd->kp_anon_idx = anon_idx;
 471                 kpd->kp_anon = kpsd->kpsd_anon;
 472 
 473                 TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
 474                     kpd, vbase, len, flags, 1);
 475         } else {
 476                 kpd->kp_anon = NULL;
 477                 kpd->kp_anon_idx = 0;
 478         }
 479 
 480         /*
 481          * Allocate page and anon resources for the virtual address range
 482          * except the redzone
 483          */
 484         if (segkp_fromheap)
 485                 segkpindex = btop((uintptr_t)(vbase - kvseg.s_base));
 486         for (i = 0, va = vbase; i < np; i++, va += PAGESIZE) {
 487                 page_t          *pl[2];
 488                 struct vnode    *vp;
 489                 anoff_t         off;
 490                 int             err;
 491                 page_t          *pp = NULL;
 492 
 493                 /*
 494                  * Mark this page to be a segkp page in the bitmap.
 495                  */
 496                 if (segkp_fromheap) {
 497                         BT_ATOMIC_SET(segkp_bitmap, segkpindex);
 498                         segkpindex++;
 499                 }
 500 
 501                 /*
 502                  * If this page is the red zone page, we don't need swap
 503                  * space for it.  Note that we skip over the code that
 504                  * establishes MMU mappings, so that the page remains
 505                  * invalid.
 506                  */
 507                 if ((flags & KPD_HASREDZONE) && KPD_REDZONE(kpd) == i)
 508                         continue;
 509 
 510                 if (kpd->kp_anon != NULL) {
 511                         struct anon *ap;
 512 
 513                         ASSERT(anon_get_ptr(kpd->kp_anon, anon_idx + i)
 514                             == NULL);
 515                         /*
 516                          * Determine the "vp" and "off" of the anon slot.
 517                          */
 518                         ap = anon_alloc(NULL, 0);
 519                         if (amp != NULL)
 520                                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 521                         (void) anon_set_ptr(kpd->kp_anon, anon_idx + i,
 522                             ap, ANON_SLEEP);
 523                         if (amp != NULL)
 524                                 ANON_LOCK_EXIT(&amp->a_rwlock);
 525                         swap_xlate(ap, &vp, &off);
 526 
 527                         /*
 528                          * Create a page with the specified identity.  The
 529                          * page is returned with the "shared" lock held.
 530                          */
 531                         err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE,
 532                             NULL, pl, PAGESIZE, seg, va, S_CREATE,
 533                             kcred, NULL);
 534                         if (err) {
 535                                 /*
 536                                  * XXX - This should not fail.
 537                                  */
 538                                 panic("segkp_get: no pages");
 539                                 /*NOTREACHED*/
 540                         }
 541                         pp = pl[0];
 542                 } else {
 543                         ASSERT(page_exists(&kvp,
 544                             (u_offset_t)(uintptr_t)va) == NULL);
 545 
 546                         if ((pp = page_create_va(&kvp,
 547                             (u_offset_t)(uintptr_t)va, PAGESIZE,
 548                             (flags & KPD_NOWAIT ? 0 : PG_WAIT) | PG_EXCL |
 549                             PG_NORELOC, seg, va)) == NULL) {
 550                                 /*
 551                                  * Legitimize resource; then destroy it.
 552                                  * Easier than trying to unwind here.
 553                                  */
 554                                 kpd->kp_flags = flags;
 555                                 kpd->kp_base = vbase;
 556                                 kpd->kp_len = len;
 557                                 segkp_release_internal(seg, kpd, va - vbase);
 558                                 return (NULL);
 559                         }
 560                         page_io_unlock(pp);
 561                 }
 562 
 563                 if (flags & KPD_ZERO)
 564                         pagezero(pp, 0, PAGESIZE);
 565 
 566                 /*
 567                  * Load and lock an MMU translation for the page.
 568                  */
 569                 hat_memload(seg->s_as->a_hat, va, pp, (PROT_READ|PROT_WRITE),
 570                     ((flags & KPD_LOCKED) ? HAT_LOAD_LOCK : HAT_LOAD));
 571 
 572                 /*
 573                  * Now, release lock on the page.
 574                  */
 575                 if (flags & KPD_LOCKED) {
 576                         /*
 577                          * Indicate to page_retire framework that this
 578                          * page can only be retired when it is freed.
 579                          */
 580                         PP_SETRAF(pp);
 581                         page_downgrade(pp);
 582                 } else
 583                         page_unlock(pp);
 584         }
 585 
 586         kpd->kp_flags = flags;
 587         kpd->kp_base = vbase;
 588         kpd->kp_len = len;
 589         segkp_insert(seg, kpd);
 590         *tkpd = kpd;
 591         return (stom(kpd->kp_base, flags));
 592 }
 593 
 594 /*
 595  * Release the resource to the cache if the pool (designated by the cookie)
 596  * holds fewer than the maximum allowable entries. If it is inserted in the
 597  * cache, segkp_delete ensures the element is taken off of the active list.
 598  */
 599 void
 600 segkp_release(struct seg *seg, caddr_t vaddr)
 601 {
 602         struct segkp_cache *freelist;
 603         struct segkp_data *kpd = NULL;
 604 
 605         if ((kpd = segkp_find(seg, vaddr)) == NULL) {
 606                 panic("segkp_release: null kpd");
 607                 /*NOTREACHED*/
 608         }
 609 
 610         if (kpd->kp_cookie != -1) {
 611                 freelist = &segkp_cache[kpd->kp_cookie];
 612                 mutex_enter(&segkp_lock);
 613                 if (!segkp_indel && freelist->kpf_count < freelist->kpf_max) {
 614                         segkp_delete(seg, kpd);
 615                         kpd->kp_next = freelist->kpf_list;
 616                         freelist->kpf_list = kpd;
 617                         freelist->kpf_count++;
 618                         mutex_exit(&segkp_lock);
 619                         return;
 620                 } else {
 621                         mutex_exit(&segkp_lock);
 622                         kpd->kp_cookie = -1;
 623                 }
 624         }
 625         segkp_release_internal(seg, kpd, kpd->kp_len);
 626 }
 627 
 628 /*
 629  * Free the entire resource. segkp_unlock gets called with the start of the
 630  * mapped portion of the resource. The length is the size of the mapped
 631  * portion.
 632  */
 633 static void
 634 segkp_release_internal(struct seg *seg, struct segkp_data *kpd, size_t len)
 635 {
 636         caddr_t         va;
 637         long            i;
 638         long            redzone;
 639         size_t          np;
 640         page_t          *pp;
 641         struct vnode    *vp;
 642         anoff_t         off;
 643         struct anon     *ap;
 644         pgcnt_t         segkpindex;
 645 
 646         ASSERT(kpd != NULL);
 647         ASSERT((kpd->kp_flags & KPD_HASAMP) == 0 || kpd->kp_cookie == -1);
 648         np = btop(len);
 649 
 650         /* Remove from active hash list */
 651         if (kpd->kp_cookie == -1) {
 652                 mutex_enter(&segkp_lock);
 653                 segkp_delete(seg, kpd);
 654                 mutex_exit(&segkp_lock);
 655         }
 656 
 657         /*
 658          * Precompute redzone page index.
 659          */
 660         redzone = -1;
 661         if (kpd->kp_flags & KPD_HASREDZONE)
 662                 redzone = KPD_REDZONE(kpd);
 663 
 664 
 665         va = kpd->kp_base;
 666 
 667         hat_unload(seg->s_as->a_hat, va, (np << PAGESHIFT),
 668             ((kpd->kp_flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD));
 669         /*
 670          * Free up those anon resources that are quiescent.
 671          */
 672         if (segkp_fromheap)
 673                 segkpindex = btop((uintptr_t)(va - kvseg.s_base));
 674         for (i = 0; i < np; i++, va += PAGESIZE) {
 675 
 676                 /*
 677                  * Clear the bit for this page from the bitmap.
 678                  */
 679                 if (segkp_fromheap) {
 680                         BT_ATOMIC_CLEAR(segkp_bitmap, segkpindex);
 681                         segkpindex++;
 682                 }
 683 
 684                 if (i == redzone)
 685                         continue;
 686                 if (kpd->kp_anon) {
 687                         /*
 688                          * Free up anon resources and destroy the
 689                          * associated pages.
 690                          *
 691                          * Release the lock if there is one. Have to get the
 692                          * page to do this, unfortunately.
 693                          */
 694                         if (kpd->kp_flags & KPD_LOCKED) {
 695                                 ap = anon_get_ptr(kpd->kp_anon,
 696                                     kpd->kp_anon_idx + i);
 697                                 swap_xlate(ap, &vp, &off);
 698                                 /* Find the shared-locked page. */
 699                                 pp = page_find(vp, (u_offset_t)off);
 700                                 if (pp == NULL) {
 701                                         panic("segkp_release: "
 702                                             "kp_anon: no page to unlock ");
 703                                         /*NOTREACHED*/
 704                                 }
 705                                 if (PP_ISRAF(pp))
 706                                         PP_CLRRAF(pp);
 707 
 708                                 page_unlock(pp);
 709                         }
 710                         if ((kpd->kp_flags & KPD_HASAMP) == 0) {
 711                                 anon_free(kpd->kp_anon, kpd->kp_anon_idx + i,
 712                                     PAGESIZE);
 713                                 anon_unresv_zone(PAGESIZE, NULL);
 714                                 atomic_dec_ulong(&anon_segkp_pages_resv);
 715                         }
 716                         TRACE_5(TR_FAC_VM,
 717                             TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
 718                             kpd, va, PAGESIZE, 0, 0);
 719                 } else {
 720                         if (kpd->kp_flags & KPD_LOCKED) {
 721                                 pp = page_find(&kvp, (u_offset_t)(uintptr_t)va);
 722                                 if (pp == NULL) {
 723                                         panic("segkp_release: "
 724                                             "no page to unlock");
 725                                         /*NOTREACHED*/
 726                                 }
 727                                 if (PP_ISRAF(pp))
 728                                         PP_CLRRAF(pp);
 729                                 /*
 730                                  * We should just upgrade the lock here
 731                                  * but there is no upgrade that waits.
 732                                  */
 733                                 page_unlock(pp);
 734                         }
 735                         pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)va,
 736                             SE_EXCL);
 737                         if (pp != NULL)
 738                                 page_destroy(pp, 0);
 739                 }
 740         }
 741 
 742         /* If locked, release physical memory reservation */
 743         if (kpd->kp_flags & KPD_LOCKED) {
 744                 pgcnt_t pages = btop(SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags));
 745                 if ((kpd->kp_flags & KPD_NO_ANON) == 0)
 746                         atomic_add_long(&anon_segkp_pages_locked, -pages);
 747                 page_unresv(pages);
 748         }
 749 
 750         vmem_free(SEGKP_VMEM(seg), kpd->kp_base, kpd->kp_len);
 751         kmem_free(kpd, sizeof (struct segkp_data));
 752 }
 753 
 754 /*
 755  * segkp_map_red() will check the current frame pointer against the
 756  * stack base.  If the amount of stack remaining is questionable
 757  * (less than red_minavail), then segkp_map_red() will map in the redzone
 758  * and return 1.  Otherwise, it will return 0.  segkp_map_red() can
 759  * _only_ be called when:
 760  *
 761  *   - it is safe to sleep on page_create_va().
 762  *   - the caller is non-swappable.
 763  *
 764  * It is up to the caller to remember whether segkp_map_red() successfully
 765  * mapped the redzone, and, if so, to call segkp_unmap_red() at a later
 766  * time.  Note that the caller must _remain_ non-swappable until after
 767  * calling segkp_unmap_red().
 768  *
 769  * Currently, this routine is only called from pagefault() (which necessarily
 770  * satisfies the above conditions).
 771  */
 772 #if defined(STACK_GROWTH_DOWN)
 773 int
 774 segkp_map_red(void)
 775 {
 776         uintptr_t fp = STACK_BIAS + (uintptr_t)getfp();
 777 #ifndef _LP64
 778         caddr_t stkbase;
 779 #endif
 780 
 781         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
 782 
 783         /*
 784          * Optimize for the common case where we simply return.
 785          */
 786         if ((curthread->t_red_pp == NULL) &&
 787             (fp - (uintptr_t)curthread->t_stkbase >= red_minavail))
 788                 return (0);
 789 
 790 #if defined(_LP64)
 791         /*
 792          * XXX  We probably need something better than this.
 793          */
 794         panic("kernel stack overflow");
 795         /*NOTREACHED*/
 796 #else /* _LP64 */
 797         if (curthread->t_red_pp == NULL) {
 798                 page_t *red_pp;
 799                 struct seg kseg;
 800 
 801                 caddr_t red_va = (caddr_t)
 802                     (((uintptr_t)curthread->t_stkbase & (uintptr_t)PAGEMASK) -
 803                     PAGESIZE);
 804 
 805                 ASSERT(page_exists(&kvp, (u_offset_t)(uintptr_t)red_va) ==
 806                     NULL);
 807 
 808                 /*
 809                  * Allocate physical memory for the red page.
 810                  */
 811                 /*
 812                  * No PG_NORELOC here to avoid waits. Unlikely to get
 813                  * a relocate happening in the short time the page exists
 814                  * and it will be OK anyway.
 815                  */
 816 
 817                 kseg.s_as = &kas;
 818                 red_pp = page_create_va(&kvp, (u_offset_t)(uintptr_t)red_va,
 819                     PAGESIZE, PG_WAIT | PG_EXCL, &kseg, red_va);
 820                 ASSERT(red_pp != NULL);
 821 
 822                 /*
 823                  * So we now have a page to jam into the redzone...
 824                  */
 825                 page_io_unlock(red_pp);
 826 
 827                 hat_memload(kas.a_hat, red_va, red_pp,
 828                     (PROT_READ|PROT_WRITE), HAT_LOAD_LOCK);
 829                 page_downgrade(red_pp);
 830 
 831                 /*
 832                  * The page is left SE_SHARED locked so we can hold on to
 833                  * the page_t pointer.
 834                  */
 835                 curthread->t_red_pp = red_pp;
 836 
 837                 atomic_inc_32(&red_nmapped);
 838                 while (fp - (uintptr_t)curthread->t_stkbase < red_closest) {
 839                         (void) atomic_cas_32(&red_closest, red_closest,
 840                             (uint32_t)(fp - (uintptr_t)curthread->t_stkbase));
 841                 }
 842                 return (1);
 843         }
 844 
 845         stkbase = (caddr_t)(((uintptr_t)curthread->t_stkbase &
 846             (uintptr_t)PAGEMASK) - PAGESIZE);
 847 
 848         atomic_inc_32(&red_ndoubles);
 849 
 850         if (fp - (uintptr_t)stkbase < RED_DEEP_THRESHOLD) {
 851                 /*
 852                  * Oh boy.  We're already deep within the mapped-in
 853                  * redzone page, and the caller is trying to prepare
 854                  * for a deep stack run.  We're running without a
 855                  * redzone right now:  if the caller plows off the
 856                  * end of the stack, it'll plow another thread or
 857                  * LWP structure.  That situation could result in
 858                  * a very hard-to-debug panic, so, in the spirit of
 859                  * recording the name of one's killer in one's own
 860                  * blood, we're going to record hrestime and the calling
 861                  * thread.
 862                  */
 863                 red_deep_hires = hrestime.tv_nsec;
 864                 red_deep_thread = curthread;
 865         }
 866 
 867         /*
 868          * If this is a DEBUG kernel, and we've run too deep for comfort, toss.
 869          */
 870         ASSERT(fp - (uintptr_t)stkbase >= RED_DEEP_THRESHOLD);
 871         return (0);
 872 #endif /* _LP64 */
 873 }
 874 
 875 void
 876 segkp_unmap_red(void)
 877 {
 878         page_t *pp;
 879         caddr_t red_va = (caddr_t)(((uintptr_t)curthread->t_stkbase &
 880             (uintptr_t)PAGEMASK) - PAGESIZE);
 881 
 882         ASSERT(curthread->t_red_pp != NULL);
 883         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
 884 
 885         /*
 886          * Because we locked the mapping down, we can't simply rely
 887          * on page_destroy() to clean everything up;  we need to call
 888          * hat_unload() to explicitly unlock the mapping resources.
 889          */
 890         hat_unload(kas.a_hat, red_va, PAGESIZE, HAT_UNLOAD_UNLOCK);
 891 
 892         pp = curthread->t_red_pp;
 893 
 894         ASSERT(pp == page_find(&kvp, (u_offset_t)(uintptr_t)red_va));
 895 
 896         /*
 897          * Need to upgrade the SE_SHARED lock to SE_EXCL.
 898          */
 899         if (!page_tryupgrade(pp)) {
 900                 /*
 901                  * As there is no upgrade that waits, release the
 902                  * SE_SHARED lock and wait for SE_EXCL.
 903                  */
 904                 page_unlock(pp);
 905                 pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)red_va, SE_EXCL);
 906                 /* pp may be NULL here, hence the test below */
 907         }
 908 
 909         /*
 910          * Destroy the page, with dontfree set to zero (i.e. free it).
 911          */
 912         if (pp != NULL)
 913                 page_destroy(pp, 0);
 914         curthread->t_red_pp = NULL;
 915 }
 916 #else
 917 #error Red stacks only supported with downwards stack growth.
 918 #endif
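
     /*
      * Illustrative caller pattern (not part of the driver) for the two
      * routines above, following the contract spelled out in the
      * segkp_map_red() block comment: the caller must already be
      * non-swappable, must remember whether the redzone was mapped, and must
      * remain non-swappable until after segkp_unmap_red().  "mapped_red" is a
      * placeholder local variable, not a symbol defined in this file.
      *
      *         int mapped_red;
      *
      *         mapped_red = segkp_map_red();
      *         ... do the deep-stack work, e.g. resolve the fault ...
      *         if (mapped_red)
      *                 segkp_unmap_red();
      */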
 919 
 920 /*
 921  * Handle a fault on an address corresponding to one of the
 922  * resources in the segkp segment.
 923  */
 924 faultcode_t
 925 segkp_fault(
 926         struct hat      *hat,
 927         struct seg      *seg,
 928         caddr_t         vaddr,
 929         size_t          len,
 930         enum fault_type type,
 931         enum seg_rw rw)
 932 {
 933         struct segkp_data       *kpd = NULL;
 934         int                     err;
 935 
 936         ASSERT(seg->s_as == &kas && RW_READ_HELD(&seg->s_as->a_lock));
 937 
 938         /*
 939          * Sanity checks.
 940          */
 941         if (type == F_PROT) {
 942                 panic("segkp_fault: unexpected F_PROT fault");
 943                 /*NOTREACHED*/
 944         }
 945 
 946         if ((kpd = segkp_find(seg, vaddr)) == NULL)
 947                 return (FC_NOMAP);
 948 
 949         mutex_enter(&kpd->kp_lock);
 950 
 951         if (type == F_SOFTLOCK) {
 952                 ASSERT(!(kpd->kp_flags & KPD_LOCKED));
 953                 /*
 954                  * The F_SOFTLOCK case has more stringent
 955                  * range requirements: the given range must exactly coincide
 956                  * with the resource's mapped portion. Note that a reference
 957                  * to the redzone is handled, since vaddr would not equal base.
 958                  */
 959                 if (vaddr != stom(kpd->kp_base, kpd->kp_flags) ||
 960                     len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) {
 961                         mutex_exit(&kpd->kp_lock);
 962                         return (FC_MAKE_ERR(EFAULT));
 963                 }
 964 
 965                 if ((err = segkp_load(hat, seg, vaddr, len, kpd, KPD_LOCKED))) {
 966                         mutex_exit(&kpd->kp_lock);
 967                         return (FC_MAKE_ERR(err));
 968                 }
 969                 kpd->kp_flags |= KPD_LOCKED;
 970                 mutex_exit(&kpd->kp_lock);
 971                 return (0);
 972         }
 973 
 974         if (type == F_INVAL) {
 975                 ASSERT(!(kpd->kp_flags & KPD_NO_ANON));
 976 
 977                 /*
 978                  * Check if we touched the redzone. Somewhat optimistic
 979                  * here if we are touching the redzone of our own stack
 980                  * since we wouldn't have a stack to get this far...
 981                  */
 982                 if ((kpd->kp_flags & KPD_HASREDZONE) &&
 983                     btop((uintptr_t)(vaddr - kpd->kp_base)) == KPD_REDZONE(kpd))
 984                         panic("segkp_fault: accessing redzone");
 985 
 986                 /*
 987                  * This fault may occur while the page is being F_SOFTLOCK'ed.
 988                  * Return since a 2nd segkp_load is unnecessary and also would
 989                  * result in the page being locked twice and eventually
 990                  * hang the thread_reaper thread.
 991                  */
 992                 if (kpd->kp_flags & KPD_LOCKED) {
 993                         mutex_exit(&kpd->kp_lock);
 994                         return (0);
 995                 }
 996 
 997                 err = segkp_load(hat, seg, vaddr, len, kpd, kpd->kp_flags);
 998                 mutex_exit(&kpd->kp_lock);
 999                 return (err ? FC_MAKE_ERR(err) : 0);
1000         }
1001 
1002         if (type == F_SOFTUNLOCK) {
1003                 uint_t  flags;
1004 
1005                 /*
1006                  * Make sure the addr is LOCKED and has anon backing
1007                  * before unlocking.
1008                  */
1009                 if ((kpd->kp_flags & (KPD_LOCKED|KPD_NO_ANON)) != KPD_LOCKED) {
1010                         panic("segkp_fault: bad unlock");
1011                         /*NOTREACHED*/
1012                 }
1013 
1014                 if (vaddr != stom(kpd->kp_base, kpd->kp_flags) ||
1015                     len != SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags)) {
1016                         panic("segkp_fault: bad range");
1017                         /*NOTREACHED*/
1018                 }
1019 
1020                 if (rw == S_WRITE)
1021                         flags = kpd->kp_flags | KPD_WRITEDIRTY;
1022                 else
1023                         flags = kpd->kp_flags;
1024                 err = segkp_unlock(hat, seg, vaddr, len, kpd, flags);
1025                 kpd->kp_flags &= ~KPD_LOCKED;
1026                 mutex_exit(&kpd->kp_lock);
1027                 return (err ? FC_MAKE_ERR(err) : 0);
1028         }
1029         mutex_exit(&kpd->kp_lock);
1030         panic("segkp_fault: bogus fault type: %d\n", type);
1031         /*NOTREACHED*/
1032 }
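
     /*
      * Illustrative sketch (not part of the driver) of how the F_SOFTLOCK /
      * F_SOFTUNLOCK cases above are typically driven: a caller locks down the
      * entire mapped portion of a segkp resource via as_fault() on the kernel
      * address space and later unlocks it.  "va" and "maplen" are placeholder
      * names; per the checks above, the range must exactly match the
      * resource's mapped portion.
      *
      *         if (as_fault(kas.a_hat, &kas, va, maplen, F_SOFTLOCK, S_WRITE))
      *                 ... failed to lock the resource ...
      *         ... perform the work that requires the pages to stay resident ...
      *         (void) as_fault(kas.a_hat, &kas, va, maplen, F_SOFTUNLOCK, S_WRITE);
      */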
1033 
1034 /*
1035  * Check that the given protections suffice over the range specified by
1036  * vaddr and len.  For this segment type, the only issue is whether or
1037  * not the range lies completely within the mapped part of an allocated
1038  * resource.
1039  */
1040 /* ARGSUSED */
1041 static int
1042 segkp_checkprot(struct seg *seg, caddr_t vaddr, size_t len, uint_t prot)
1043 {
1044         struct segkp_data *kpd = NULL;
1045         caddr_t mbase;
1046         size_t mlen;
1047 
1048         if ((kpd = segkp_find(seg, vaddr)) == NULL)
1049                 return (EACCES);
1050 
1051         mutex_enter(&kpd->kp_lock);
1052         mbase = stom(kpd->kp_base, kpd->kp_flags);
1053         mlen = SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags);
1054         if (len > mlen || vaddr < mbase ||
1055             ((vaddr + len) > (mbase + mlen))) {
1056                 mutex_exit(&kpd->kp_lock);
1057                 return (EACCES);
1058         }
1059         mutex_exit(&kpd->kp_lock);
1060         return (0);
1061 }
1062 
1063 
1064 /*
1065  * Check to see if it makes sense to do kluster/read ahead to
1066  * addr + delta relative to the mapping at addr.  We assume here
1067  * that delta is a signed PAGESIZE'd multiple (which can be negative).
1068  *
1069  * For segkp we always "approve" of this action from our standpoint.
1070  */
1071 /*ARGSUSED*/
1072 static int
1073 segkp_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
1074 {
1075         return (0);
1076 }
1077 
1078 /*
1079  * Load and possibly lock intra-slot resources in the range given by
1080  * vaddr and len.
1081  */
1082 static int
1083 segkp_load(
1084         struct hat *hat,
1085         struct seg *seg,
1086         caddr_t vaddr,
1087         size_t len,
1088         struct segkp_data *kpd,
1089         uint_t flags)
1090 {
1091         caddr_t va;
1092         caddr_t vlim;
1093         ulong_t i;
1094         uint_t lock;
1095 
1096         ASSERT(MUTEX_HELD(&kpd->kp_lock));
1097 
1098         len = P2ROUNDUP(len, PAGESIZE);
1099 
1100         /* If locking, reserve physical memory */
1101         if (flags & KPD_LOCKED) {
1102                 pgcnt_t pages = btop(len);
1103                 if ((kpd->kp_flags & KPD_NO_ANON) == 0)
1104                         atomic_add_long(&anon_segkp_pages_locked, pages);
1105                 (void) page_resv(pages, KM_SLEEP);
1106         }
1107 
1108         /*
1109          * Loop through the pages in the given range.
1110          */
1111         va = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
1112         vaddr = va;
1113         vlim = va + len;
1114         lock = flags & KPD_LOCKED;
1115         i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT;
1116         for (; va < vlim; va += PAGESIZE, i++) {
1117                 page_t          *pl[2]; /* second element NULL terminator */
1118                 struct vnode    *vp;
1119                 anoff_t         off;
1120                 int             err;
1121                 struct anon     *ap;
1122 
1123                 /*
1124                  * Summon the page.  If it's not resident, arrange
1125                  * for synchronous i/o to pull it in.
1126                  */
1127                 ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i);
1128                 swap_xlate(ap, &vp, &off);
1129 
1130                 /*
1131                  * The returned page list will have exactly one entry,
1132                  * which is returned to us already kept.
1133                  */
1134                 err = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, NULL,
1135                     pl, PAGESIZE, seg, va, S_READ, kcred, NULL);
1136 
1137                 if (err) {
1138                         /*
1139                          * Back out of what we've done so far.
1140                          */
1141                         (void) segkp_unlock(hat, seg, vaddr,
1142                             (va - vaddr), kpd, flags);
1143                         return (err);
1144                 }
1145 
1146                 /*
1147                  * Load an MMU translation for the page.
1148                  */
1149                 hat_memload(hat, va, pl[0], (PROT_READ|PROT_WRITE),
1150                     lock ? HAT_LOAD_LOCK : HAT_LOAD);
1151 
1152                 if (!lock) {
1153                         /*
1154                          * Now, release "shared" lock on the page.
1155                          */
1156                         page_unlock(pl[0]);
1157                 }
1158         }
1159         return (0);
1160 }
1161 
1162 /*
1163  * At the very least, unload the MMU translations and unlock the range if it
1164  * is locked. Can be called with the KPD_WRITEDIRTY flag, which specifies that
1165  * any dirty pages should be written to disk.
1166  */
1167 static int
1168 segkp_unlock(
1169         struct hat *hat,
1170         struct seg *seg,
1171         caddr_t vaddr,
1172         size_t len,
1173         struct segkp_data *kpd,
1174         uint_t flags)
1175 {
1176         caddr_t va;
1177         caddr_t vlim;
1178         ulong_t i;
1179         struct page *pp;
1180         struct vnode *vp;
1181         anoff_t off;
1182         struct anon *ap;
1183 
1184 #ifdef lint
1185         seg = seg;
1186 #endif /* lint */
1187 
1188         ASSERT(MUTEX_HELD(&kpd->kp_lock));
1189 
1190         /*
1191          * Loop through the pages in the given range. It is assumed that
1192          * segkp_unlock is called with a page-aligned base.
1193          */
1194         va = vaddr;
1195         vlim = va + len;
1196         i = ((uintptr_t)(va - kpd->kp_base)) >> PAGESHIFT;
1197         hat_unload(hat, va, len,
1198             ((flags & KPD_LOCKED) ? HAT_UNLOAD_UNLOCK : HAT_UNLOAD));
1199         for (; va < vlim; va += PAGESIZE, i++) {
1200                 /*
1201                  * Find the page associated with this part of the
1202                  * slot, tracking it down through its associated swap
1203                  * space.
1204                  */
1205                 ap = anon_get_ptr(kpd->kp_anon, kpd->kp_anon_idx + i);
1206                 swap_xlate(ap, &vp, &off);
1207 
1208                 if (flags & KPD_LOCKED) {
1209                         if ((pp = page_find(vp, off)) == NULL) {
1210                                 panic("segkp_softunlock: missing page");
1211                                 /*NOTREACHED*/
1212                         }
1215                 } else {
1216                         /*
1217                          * Nothing to do if the slot is not locked and the
1218                          * page doesn't exist.
1219                          */
1220                         if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL)
1221                                 continue;
1222                 }
1223 
1224                 /*
1225                  * If the page doesn't have any translations, is
1226                  * dirty and not being shared, then push it out
1227                  * asynchronously and avoid waiting for the
1228                  * pageout daemon to do it for us.
1229                  *
1230                  * XXX - Do we really need to get the "exclusive"
1231                  * lock via an upgrade?
1232                  */
1233                 if ((flags & KPD_WRITEDIRTY) && !hat_page_is_mapped(pp) &&
1234                     hat_ismod(pp) && page_tryupgrade(pp)) {
1235                         /*
1236                          * Hold the vnode before releasing the page lock to
1237                          * prevent it from being freed and re-used by some
1238                          * other thread.
1239                          */
1240                         VN_HOLD(vp);
1241                         page_unlock(pp);
1242 
1243                         /*
1244                          * Want most powerful credentials we can get so
1245                          * use kcred.
1246                          */
1247                         (void) VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
1248                             B_ASYNC | B_FREE, kcred, NULL);
1249                         VN_RELE(vp);
1250                 } else {
1251                         page_unlock(pp);
1252                 }
1253         }
1254 
1255         /* If unlocking, release physical memory */
1256         if (flags & KPD_LOCKED) {
1257                 pgcnt_t pages = btopr(len);
1258                 if ((kpd->kp_flags & KPD_NO_ANON) == 0)
1259                         atomic_add_long(&anon_segkp_pages_locked, -pages);
1260                 page_unresv(pages);
1261         }
1262         return (0);
1263 }
1264 
1265 /*
1266  * Insert the kpd in the hash table.
1267  */
1268 static void
1269 segkp_insert(struct seg *seg, struct segkp_data *kpd)
1270 {
1271         struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
1272         int index;
1273 
1274         /*
1275          * Insert the kpd based on the address that will be returned
1276          * via segkp_release.
1277          */
1278         index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags));
1279         mutex_enter(&segkp_lock);
1280         kpd->kp_next = kpsd->kpsd_hash[index];
1281         kpsd->kpsd_hash[index] = kpd;
1282         mutex_exit(&segkp_lock);
1283 }
1284 
1285 /*
1286  * Remove kpd from the hash table.
1287  */
1288 static void
1289 segkp_delete(struct seg *seg, struct segkp_data *kpd)
1290 {
1291         struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
1292         struct segkp_data **kpp;
1293         int index;
1294 
1295         ASSERT(MUTEX_HELD(&segkp_lock));
1296 
1297         index = SEGKP_HASH(stom(kpd->kp_base, kpd->kp_flags));
1298         for (kpp = &kpsd->kpsd_hash[index];
1299             *kpp != NULL; kpp = &((*kpp)->kp_next)) {
1300                 if (*kpp == kpd) {
1301                         *kpp = kpd->kp_next;
1302                         return;
1303                 }
1304         }
1305         panic("segkp_delete: unable to find element to delete");
1306         /*NOTREACHED*/
1307 }
1308 
1309 /*
1310  * Find the kpd associated with a vaddr.
1311  *
1312  * Most of the callers of segkp_find will pass the vaddr that
1313  * hashes to the desired index, but there are cases where
1314  * this is not true in which case we have to (potentially) scan
1315  * the whole table looking for it. This should be very rare
1316  * (e.g. a segkp_fault(F_INVAL) on an address somewhere in the
1317  * middle of the segkp_data region).
1318  */
1319 static struct segkp_data *
1320 segkp_find(struct seg *seg, caddr_t vaddr)
1321 {
1322         struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
1323         struct segkp_data *kpd;
1324         int     i;
1325         int     stop;
1326 
1327         i = stop = SEGKP_HASH(vaddr);
1328         mutex_enter(&segkp_lock);
1329         do {
1330                 for (kpd = kpsd->kpsd_hash[i]; kpd != NULL;
1331                     kpd = kpd->kp_next) {
1332                         if (vaddr >= kpd->kp_base &&
1333                             vaddr < kpd->kp_base + kpd->kp_len) {
1334                                 mutex_exit(&segkp_lock);
1335                                 return (kpd);
1336                         }
1337                 }
1338                 if (--i < 0)
1339                         i = SEGKP_HASHSZ - 1;   /* Wrap */
1340         } while (i != stop);
1341         mutex_exit(&segkp_lock);
1342         return (NULL);          /* Not found */
1343 }
1344 
1345 /*
1346  * Returns the size of the swappable area.
1347  */
1348 size_t
1349 swapsize(caddr_t v)
1350 {
1351         struct segkp_data *kpd;
1352 
1353         if ((kpd = segkp_find(segkp, v)) != NULL)
1354                 return (SEGKP_MAPLEN(kpd->kp_len, kpd->kp_flags));
1355         else
1356                 return (0);
1357 }
1358 
1359 /*
1360  * Dump out all the active segkp pages
1361  */
1362 static void
1363 segkp_dump(struct seg *seg)
1364 {
1365         int i;
1366         struct segkp_data *kpd;
1367         struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data;
1368 
1369         for (i = 0; i < SEGKP_HASHSZ; i++) {
1370                 for (kpd = kpsd->kpsd_hash[i];
1371                     kpd != NULL; kpd = kpd->kp_next) {
1372                         pfn_t pfn;
1373                         caddr_t addr;
1374                         caddr_t eaddr;
1375 
1376                         addr = kpd->kp_base;
1377                         eaddr = addr + kpd->kp_len;
1378                         while (addr < eaddr) {
1379                                 ASSERT(seg->s_as == &kas);
1380                                 pfn = hat_getpfnum(seg->s_as->a_hat, addr);
1381                                 if (pfn != PFN_INVALID)
1382                                         dump_addpage(seg->s_as, addr, pfn);
1383                                 addr += PAGESIZE;
1384                                 dump_timeleft = dump_timeout;
1385                         }
1386                 }
1387         }
1388 }
1389 
1390 /*ARGSUSED*/
1391 static int
1392 segkp_pagelock(struct seg *seg, caddr_t addr, size_t len,
1393     struct page ***ppp, enum lock_type type, enum seg_rw rw)
1394 {
1395         return (ENOTSUP);
1396 }
1397 
1398 /*ARGSUSED*/
1399 static int
1400 segkp_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
1401 {
1402         return (ENODEV);
1403 }
1404 
1405 /*ARGSUSED*/
1406 static int
1407 segkp_capable(struct seg *seg, segcapability_t capability)
1408 {
1409         return (0);
1410 }
1411 
1412 #include <sys/mem_config.h>
1413 
1414 /*ARGSUSED*/
1415 static void
1416 segkp_mem_config_post_add(void *arg, pgcnt_t delta_pages)
1417 {}
1418 
1419 /*
1420  * During memory delete, turn off caches so that pages are not held.
1421  * A better solution may be to unlock the pages while they are
1422  * in the cache so that they may be collected naturally.
1423  */
1424 
1425 /*ARGSUSED*/
1426 static int
1427 segkp_mem_config_pre_del(void *arg, pgcnt_t delta_pages)
1428 {
1429         atomic_inc_32(&segkp_indel);
1430         segkp_cache_free();
1431         return (0);
1432 }
1433 
1434 /*ARGSUSED*/
1435 static void
1436 segkp_mem_config_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
1437 {
1438         atomic_dec_32(&segkp_indel);
1439 }
1440 
1441 static kphysm_setup_vector_t segkp_mem_config_vec = {
1442         KPHYSM_SETUP_VECTOR_VERSION,
1443         segkp_mem_config_post_add,
1444         segkp_mem_config_pre_del,
1445         segkp_mem_config_post_del,
1446 };
1447 
1448 static void
1449 segkpinit_mem_config(struct seg *seg)
1450 {
1451         int ret;
1452 
1453         ret = kphysm_setup_func_register(&segkp_mem_config_vec, (void *)seg);
1454         ASSERT(ret == 0);
1455 }