1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2015, Joyent, Inc.  All rights reserved.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39 
  40 /*
  41  * VM - address spaces.
  42  */
  43 
  44 #include <sys/types.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/param.h>
  47 #include <sys/errno.h>
  48 #include <sys/systm.h>
  49 #include <sys/mman.h>
  50 #include <sys/sysmacros.h>
  51 #include <sys/cpuvar.h>
  52 #include <sys/sysinfo.h>
  53 #include <sys/kmem.h>
  54 #include <sys/vnode.h>
  55 #include <sys/vmsystm.h>
  56 #include <sys/cmn_err.h>
  57 #include <sys/debug.h>
  58 #include <sys/tnf_probe.h>
  59 #include <sys/vtrace.h>
  60 
  61 #include <vm/hat.h>
  62 #include <vm/as.h>
  63 #include <vm/seg.h>
  64 #include <vm/seg_vn.h>
  65 #include <vm/seg_dev.h>
  66 #include <vm/seg_kmem.h>
  67 #include <vm/seg_map.h>
  68 #include <vm/seg_spt.h>
  69 #include <vm/page.h>
  70 
  71 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
  72 
  73 static struct kmem_cache *as_cache;
  74 
  75 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
  76 static void as_clearwatchprot(struct as *, caddr_t, size_t);
  77 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
  78 
  79 
  80 /*
  81  * Verifying the segment lists is very time-consuming; it may not always
  82  * be desirable to define VERIFY_SEGLIST when DEBUG is set.
  83  */
  84 #ifdef DEBUG
  85 #define VERIFY_SEGLIST
  86 int do_as_verify = 0;
  87 #endif
  88 
  89 /*
  90  * Allocate a new callback data structure entry and fill in the events of
  91  * interest, the address range of interest, and the callback argument.
  92  * Link the entry on the as->a_callbacks list. A callback entry for the
  93  * entire address space may be specified with vaddr = 0 and size = -1.
  94  *
  95  * CALLER'S RESPONSIBILITY: If not calling from within the process context for
  96  * the specified as, the caller must guarantee persistence of the specified as
  97  * for the duration of this function (e.g. pages being locked within the as
  98  * will guarantee persistence).
  99  */
 100 int
 101 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
 102                 caddr_t vaddr, size_t size, int sleepflag)
 103 {
 104         struct as_callback      *current_head, *cb;
 105         caddr_t                 saddr;
 106         size_t                  rsize;
 107 
 108         /* callback function and an event are mandatory */
 109         if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
 110                 return (EINVAL);
 111 
 112         /* Adding a callback after as_free has been called is not allowed */
 113         if (as == &kas)
 114                 return (ENOMEM);
 115 
 116         /*
 117          * vaddr = 0 and size = -1 is used to indicate that the callback range
 118          * is the entire address space so no rounding is done in that case.
 119          */
 120         if (size != -1) {
 121                 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
 122                 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
 123                     (size_t)saddr;
 124                 /* check for wraparound */
 125                 if (saddr + rsize < saddr)
 126                         return (ENOMEM);
 127         } else {
 128                 if (vaddr != 0)
 129                         return (EINVAL);
 130                 saddr = vaddr;
 131                 rsize = size;
 132         }
 133 
 134         /* Allocate and initialize a callback entry */
 135         cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
 136         if (cb == NULL)
 137                 return (EAGAIN);
 138 
 139         cb->ascb_func = cb_func;
 140         cb->ascb_arg = arg;
 141         cb->ascb_events = events;
 142         cb->ascb_saddr = saddr;
 143         cb->ascb_len = rsize;
 144 
 145         /* Add the entry to the list */
 146         mutex_enter(&as->a_contents);
 147         current_head = as->a_callbacks;
 148         as->a_callbacks = cb;
 149         cb->ascb_next = current_head;
 150 
 151         /*
 152          * The call to this function may lose a race with
 153          * a pertinent event - e.g. a thread does long-term memory locking,
 154          * but before the callback is added another thread executes as_unmap.
 155          * A broadcast here resolves that.
 156          */
 157         if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
 158                 AS_CLRUNMAPWAIT(as);
 159                 cv_broadcast(&as->a_cv);
 160         }
 161 
 162         mutex_exit(&as->a_contents);
 163         return (0);
 164 }
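
     /*
      * Illustrative sketch (not from this file): a client that holds pages
      * locked long-term might register for AS_UNMAP_EVENT so it can release
      * its locks when the range is unmapped; the mydrv_* names below are
      * hypothetical.  Per the contract described in the comments above and
      * in as_execute_callback(), the callback deletes its own entry once it
      * is safe for the blocked operation to continue.
      *
      *    static void
      *    mydrv_as_callback(struct as *as, void *arg, uint_t events)
      *    {
      *            mydrv_state_t *sp = arg;
      *
      *            mydrv_unlock_pages(sp);
      *            (void) as_delete_callback(as, arg);
      *    }
      *
      *    error = as_add_callback(as, mydrv_as_callback, sp, AS_UNMAP_EVENT,
      *        uaddr, len, KM_SLEEP);
      */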
 165 
 166 /*
 167  * Search the callback list for an entry which pertains to arg.
 168  *
 169  * This is called from within the client upon completion of the callback.
 170  * RETURN VALUES:
 171  *      AS_CALLBACK_DELETED  (callback entry found and deleted)
 172  *      AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 173  *      AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 174  *                      entry will be made in as_do_callbacks)
 175  *
 176  * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 177  * set, it indicates that as_do_callbacks is processing this entry.  The
 178  * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 179  * to unblock as_do_callbacks, in case it is blocked.
 180  *
 181  * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 182  * the specified as, the caller must guarantee persistence of the specified as
 183  * for the duration of this function (e.g. pages being locked within the as
 184  * will guarantee persistence).
 185  */
 186 uint_t
 187 as_delete_callback(struct as *as, void *arg)
 188 {
 189         struct as_callback **prevcb = &as->a_callbacks;
 190         struct as_callback *cb;
 191         uint_t rc = AS_CALLBACK_NOTFOUND;
 192 
 193         mutex_enter(&as->a_contents);
 194         for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
 195                 if (cb->ascb_arg != arg)
 196                         continue;
 197 
 198                 /*
 199                  * If the events indicate AS_CALLBACK_CALLED, just clear
 200                  * AS_ALL_EVENT in the events field and wakeup the thread
 201                  * that may be waiting in as_do_callbacks.  as_do_callbacks
 202                  * will take care of removing this entry from the list.  In
 203                  * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
 204                  * (AS_CALLBACK_CALLED not set), just remove it from the
 205                  * list, return the memory and return AS_CALLBACK_DELETED.
 206                  */
 207                 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
 208                         /* leave AS_CALLBACK_CALLED */
 209                         cb->ascb_events &= ~AS_ALL_EVENT;
 210                         rc = AS_CALLBACK_DELETE_DEFERRED;
 211                         cv_broadcast(&as->a_cv);
 212                 } else {
 213                         *prevcb = cb->ascb_next;
 214                         kmem_free(cb, sizeof (struct as_callback));
 215                         rc = AS_CALLBACK_DELETED;
 216                 }
 217                 break;
 218         }
 219         mutex_exit(&as->a_contents);
 220         return (rc);
 221 }
 222 
 223 /*
 224  * Searches the as callback list for a matching entry.
 225  * Returns a pointer to the first matching callback, or NULL if
 226  * nothing is found.
 227  * This function never sleeps so it is ok to call it with locks
 228  * held in addition to the (required) a_contents mutex.
 229  *
 230  * See also comment on as_do_callbacks below.
 231  */
 232 static struct as_callback *
 233 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
 234                         size_t event_len)
 235 {
 236         struct as_callback      *cb;
 237 
 238         ASSERT(MUTEX_HELD(&as->a_contents));
 239         for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
 240                 /*
 241                  * If the callback has not already been called, then
 242                  * check if events or address range pertains.  An event_len
 243                  * of zero means do an unconditional callback.
 244                  */
 245                 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
 246                     ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
 247                     (event_addr + event_len < cb->ascb_saddr) ||
 248                     (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
 249                         continue;
 250                 }
 251                 break;
 252         }
 253         return (cb);
 254 }
 255 
 256 /*
 257  * Executes a given callback and removes it from the callback list for
 258  * this address space.
 259  * This function may sleep so the caller must drop all locks except
 260  * a_contents before calling this func.
 261  *
 262  * See also comments on as_do_callbacks below.
 263  */
 264 static void
 265 as_execute_callback(struct as *as, struct as_callback *cb,
 266                                 uint_t events)
 267 {
 268         struct as_callback **prevcb;
 269         void    *cb_arg;
 270 
 271         ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
 272         cb->ascb_events |= AS_CALLBACK_CALLED;
 273         mutex_exit(&as->a_contents);
 274         (*cb->ascb_func)(as, cb->ascb_arg, events);
 275         mutex_enter(&as->a_contents);
 276         /*
 277          * The callback function is required to delete the callback
 278          * entry when it determines it is OK for
 279          * this thread to continue. as_delete_callback will clear
 280          * AS_ALL_EVENT in the events field when the entry is deleted.
 281          * If the callback function called as_delete_callback,
 282          * events will already be cleared and there will be no blocking.
 283          */
 284         while ((cb->ascb_events & events) != 0) {
 285                 cv_wait(&as->a_cv, &as->a_contents);
 286         }
 287         /*
 288          * This entry needs to be taken off the list. Normally, the
 289          * callback func itself does that, but unfortunately the list
 290          * may have changed while the callback was running because the
 291          * a_contents mutex was dropped and someone else other than the
 292          * callback func itself could have called as_delete_callback,
 293          * so we have to search to find this entry again.  The entry
 294          * must have AS_CALLBACK_CALLED, and have the same 'arg'.
 295          */
 296         cb_arg = cb->ascb_arg;
 297         prevcb = &as->a_callbacks;
 298         for (cb = as->a_callbacks; cb != NULL;
 299             prevcb = &cb->ascb_next, cb = *prevcb) {
 300                 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
 301                     (cb_arg != cb->ascb_arg)) {
 302                         continue;
 303                 }
 304                 *prevcb = cb->ascb_next;
 305                 kmem_free(cb, sizeof (struct as_callback));
 306                 break;
 307         }
 308 }
 309 
 310 /*
 311  * Check the callback list for a matching event and intersection of
 312  * address range. If there is a match invoke the callback.  Skip an entry if:
 313  *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 314  *    - the event is not of interest
 315  *    - the address range is not of interest
 316  *
 317  * An event_len of zero indicates a request for an unconditional callback
 318  * (regardless of event); only AS_CALLBACK_CALLED is checked.  The
 319  * a_contents lock must be dropped before a callback, so only one callback
 320  * can be done before returning. Return -1 (true) if a callback was
 321  * executed and removed from the list, else return 0 (false).
 322  *
 323  * The logically separate parts, i.e. finding a matching callback and
 324  * executing a given callback have been separated into two functions
 325  * so that they can be called with different sets of locks held beyond
 326  * the always-required a_contents. as_find_callback does not sleep so
 327  * it is ok to call it if more locks than a_contents (i.e. the a_lock
 328  * rwlock) are held. as_execute_callback on the other hand may sleep
 329  * so all locks beyond a_contents must be dropped by the caller if one
 330  * does not want to end up comatose.
 331  */
 332 static int
 333 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
 334                         size_t event_len)
 335 {
 336         struct as_callback *cb;
 337 
 338         if ((cb = as_find_callback(as, events, event_addr, event_len))) {
 339                 as_execute_callback(as, cb, events);
 340                 return (-1);
 341         }
 342         return (0);
 343 }
 344 
 345 /*
 346  * Search for the segment containing addr. If a segment containing addr
 347  * exists, that segment is returned.  If no such segment exists, and
 348  * the list spans addresses greater than addr, then the first segment
 349  * whose base is greater than addr is returned; otherwise, NULL is
 350  * returned unless tail is true, in which case the last element of the
 351  * list is returned.
 352  *
 353  * a_seglast is used to cache the last found segment for repeated
 354  * searches to the same addr (which happens frequently).
 355  */
 356 struct seg *
 357 as_findseg(struct as *as, caddr_t addr, int tail)
 358 {
 359         struct seg *seg = as->a_seglast;
 360         avl_index_t where;
 361 
 362         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
 363 
 364         if (seg != NULL &&
 365             seg->s_base <= addr &&
 366             addr < seg->s_base + seg->s_size)
 367                 return (seg);
 368 
 369         seg = avl_find(&as->a_segtree, &addr, &where);
 370         if (seg != NULL)
 371                 return (as->a_seglast = seg);
 372 
 373         seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 374         if (seg == NULL && tail)
 375                 seg = avl_last(&as->a_segtree);
 376         return (as->a_seglast = seg);
 377 }
 378 
 379 #ifdef VERIFY_SEGLIST
 380 /*
 381  * verify that the linked list is coherent
 382  */
 383 static void
 384 as_verify(struct as *as)
 385 {
 386         struct seg *seg, *seglast, *p, *n;
 387         uint_t nsegs = 0;
 388 
 389         if (do_as_verify == 0)
 390                 return;
 391 
 392         seglast = as->a_seglast;
 393 
 394         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 395                 ASSERT(seg->s_as == as);
 396                 p = AS_SEGPREV(as, seg);
 397                 n = AS_SEGNEXT(as, seg);
 398                 ASSERT(p == NULL || p->s_as == as);
 399                 ASSERT(p == NULL || p->s_base < seg->s_base);
 400                 ASSERT(n == NULL || n->s_base > seg->s_base);
 401                 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
 402                 if (seg == seglast)
 403                         seglast = NULL;
 404                 nsegs++;
 405         }
 406         ASSERT(seglast == NULL);
 407         ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
 408 }
 409 #endif /* VERIFY_SEGLIST */
 410 
 411 /*
 412  * Add a new segment to the address space. The avl_find()
 413  * may be expensive so we attempt to use the last segment accessed
 414  * in as_gap() as an insertion point.
 415  */
 416 int
 417 as_addseg(struct as  *as, struct seg *newseg)
 418 {
 419         struct seg *seg;
 420         caddr_t addr;
 421         caddr_t eaddr;
 422         avl_index_t where;
 423 
 424         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
 425 
 426         as->a_updatedir = 1; /* inform /proc */
 427         gethrestime(&as->a_updatetime);
 428 
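             /*
              * Fast path: as_gap() leaves a hint in a_lastgaphl.  If the new
              * segment's base falls strictly between that segment and its
              * neighbor, insert the new segment there directly and skip the
              * avl_find() below.
              */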
 429         if (as->a_lastgaphl != NULL) {
 430                 struct seg *hseg = NULL;
 431                 struct seg *lseg = NULL;
 432 
 433                 if (as->a_lastgaphl->s_base > newseg->s_base) {
 434                         hseg = as->a_lastgaphl;
 435                         lseg = AVL_PREV(&as->a_segtree, hseg);
 436                 } else {
 437                         lseg = as->a_lastgaphl;
 438                         hseg = AVL_NEXT(&as->a_segtree, lseg);
 439                 }
 440 
 441                 if (hseg && lseg && lseg->s_base < newseg->s_base &&
 442                     hseg->s_base > newseg->s_base) {
 443                         avl_insert_here(&as->a_segtree, newseg, lseg,
 444                             AVL_AFTER);
 445                         as->a_lastgaphl = NULL;
 446                         as->a_seglast = newseg;
 447                         return (0);
 448                 }
 449                 as->a_lastgaphl = NULL;
 450         }
 451 
 452         addr = newseg->s_base;
 453         eaddr = addr + newseg->s_size;
 454 again:
 455 
 456         seg = avl_find(&as->a_segtree, &addr, &where);
 457 
 458         if (seg == NULL)
 459                 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 460 
 461         if (seg == NULL)
 462                 seg = avl_last(&as->a_segtree);
 463 
 464         if (seg != NULL) {
 465                 caddr_t base = seg->s_base;
 466 
 467                 /*
 468                  * If top of seg is below the requested address, then
 469                  * the insertion point is at the end of the linked list,
 470                  * and seg points to the tail of the list.  Otherwise,
 471                  * the insertion point is immediately before seg.
 472                  */
 473                 if (base + seg->s_size > addr) {
 474                         if (addr >= base || eaddr > base) {
 475 #ifdef __sparc
 476                                 extern struct seg_ops segnf_ops;
 477 
 478                                 /*
 479                                  * no-fault segs must disappear if overlaid.
 480                                  * XXX need new segment type so
 481                                  * we don't have to check s_ops
 482                                  */
 483                                 if (seg->s_ops == &segnf_ops) {
 484                                         seg_unmap(seg);
 485                                         goto again;
 486                                 }
 487 #endif
 488                                 return (-1);    /* overlapping segment */
 489                         }
 490                 }
 491         }
 492         as->a_seglast = newseg;
 493         avl_insert(&as->a_segtree, newseg, where);
 494 
 495 #ifdef VERIFY_SEGLIST
 496         as_verify(as);
 497 #endif
 498         return (0);
 499 }
 500 
 501 struct seg *
 502 as_removeseg(struct as *as, struct seg *seg)
 503 {
 504         avl_tree_t *t;
 505 
 506         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
 507 
 508         as->a_updatedir = 1; /* inform /proc */
 509         gethrestime(&as->a_updatetime);
 510 
 511         if (seg == NULL)
 512                 return (NULL);
 513 
 514         t = &as->a_segtree;
 515         if (as->a_seglast == seg)
 516                 as->a_seglast = NULL;
 517         as->a_lastgaphl = NULL;
 518 
 519         /*
 520          * if this segment is at or above a_lastgap, set a_lastgap
 521          * to the next segment (NULL if this was the last segment)
 522          */
 523         if (as->a_lastgap &&
 524             (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
 525                 as->a_lastgap = AVL_NEXT(t, seg);
 526 
 527         /*
 528          * remove the segment from the seg tree
 529          */
 530         avl_remove(t, seg);
 531 
 532 #ifdef VERIFY_SEGLIST
 533         as_verify(as);
 534 #endif
 535         return (seg);
 536 }
 537 
 538 /*
 539  * Find a segment containing addr.
 540  */
 541 struct seg *
 542 as_segat(struct as *as, caddr_t addr)
 543 {
 544         struct seg *seg = as->a_seglast;
 545 
 546         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
 547 
 548         if (seg != NULL && seg->s_base <= addr &&
 549             addr < seg->s_base + seg->s_size)
 550                 return (seg);
 551 
 552         seg = avl_find(&as->a_segtree, &addr, NULL);
 553         return (seg);
 554 }
 555 
 556 /*
 557  * Serialize all searches for holes in an address space to
 558  * prevent two or more threads from allocating the same virtual
 559  * address range.  The address space must not be "read/write"
 560  * locked by the caller since we may block.
 561  */
 562 void
 563 as_rangelock(struct as *as)
 564 {
 565         mutex_enter(&as->a_contents);
 566         while (AS_ISCLAIMGAP(as))
 567                 cv_wait(&as->a_cv, &as->a_contents);
 568         AS_SETCLAIMGAP(as);
 569         mutex_exit(&as->a_contents);
 570 }
 571 
 572 /*
 573  * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 574  */
 575 void
 576 as_rangeunlock(struct as *as)
 577 {
 578         mutex_enter(&as->a_contents);
 579         AS_CLRCLAIMGAP(as);
 580         cv_signal(&as->a_cv);
 581         mutex_exit(&as->a_contents);
 582 }
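
     /*
      * Illustrative sketch of the usual pattern (not a quote of any
      * particular caller): hold the range lock while choosing an address
      * and creating the mapping so that another thread cannot claim the
      * same range.  The local names and exact map_addr()/as_map() argument
      * lists here are schematic.
      *
      *    as_rangelock(as);
      *    map_addr(&addr, len, off, vacalign, flags);
      *    if (addr == NULL) {
      *            as_rangeunlock(as);
      *            return (ENOMEM);
      *    }
      *    error = as_map(as, addr, len, segvn_create, &crargs);
      *    as_rangeunlock(as);
      */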
 583 
 584 /*
 585  * compare segments (or just an address) by segment address range
 586  */
 587 static int
 588 as_segcompar(const void *x, const void *y)
 589 {
 590         struct seg *a = (struct seg *)x;
 591         struct seg *b = (struct seg *)y;
 592 
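             /*
              * Callers such as as_segat() pass a bare caddr_t * as the search
              * key; that works because s_base is the first member of struct
              * seg, so only a->s_base is examined for the key, and a key
              * address inside [s_base, s_base + s_size) compares equal to
              * that segment.
              */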
 593         if (a->s_base < b->s_base)
 594                 return (-1);
 595         if (a->s_base >= b->s_base + b->s_size)
 596                 return (1);
 597         return (0);
 598 }
 599 
 600 
 601 void
 602 as_avlinit(struct as *as)
 603 {
 604         avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
 605             offsetof(struct seg, s_tree));
 606         avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
 607             offsetof(struct watched_page, wp_link));
 608 }
 609 
 610 /*ARGSUSED*/
 611 static int
 612 as_constructor(void *buf, void *cdrarg, int kmflags)
 613 {
 614         struct as *as = buf;
 615 
 616         mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
 617         cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
 618         rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
 619         as_avlinit(as);
 620         return (0);
 621 }
 622 
 623 /*ARGSUSED1*/
 624 static void
 625 as_destructor(void *buf, void *cdrarg)
 626 {
 627         struct as *as = buf;
 628 
 629         avl_destroy(&as->a_segtree);
 630         mutex_destroy(&as->a_contents);
 631         cv_destroy(&as->a_cv);
 632         rw_destroy(&as->a_lock);
 633 }
 634 
 635 void
 636 as_init(void)
 637 {
 638         as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
 639             as_constructor, as_destructor, NULL, NULL, NULL, 0);
 640 }
 641 
 642 /*
 643  * Allocate and initialize an address space data structure.
 644  * We call hat_alloc to allow any machine dependent
 645  * information in the hat structure to be initialized.
 646  */
 647 struct as *
 648 as_alloc(void)
 649 {
 650         struct as *as;
 651 
 652         as = kmem_cache_alloc(as_cache, KM_SLEEP);
 653 
 654         as->a_flags          = 0;
 655         as->a_vbits          = 0;
 656         as->a_hrm            = NULL;
 657         as->a_seglast                = NULL;
 658         as->a_size           = 0;
 659         as->a_resvsize               = 0;
 660         as->a_updatedir              = 0;
 661         gethrestime(&as->a_updatetime);
 662         as->a_objectdir              = NULL;
 663         as->a_sizedir                = 0;
 664         as->a_userlimit              = (caddr_t)USERLIMIT;
 665         as->a_lastgap                = NULL;
 666         as->a_lastgaphl              = NULL;
 667         as->a_callbacks              = NULL;
 668 
 669         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 670         as->a_hat = hat_alloc(as);   /* create hat for default system mmu */
 671         AS_LOCK_EXIT(as, &as->a_lock);
 672 
 673         return (as);
 674 }
 675 
 676 /*
 677  * Free an address space data structure.
 678  * Need to free the hat first and then
 679  * all the segments on this as and finally
 680  * the space for the as struct itself.
 681  */
 682 void
 683 as_free(struct as *as)
 684 {
 685         struct hat *hat = as->a_hat;
 686         struct seg *seg, *next;
 687         boolean_t free_started = B_FALSE;
 688 
 689 top:
 690         /*
 691          * Invoke ALL callbacks. as_do_callbacks will do one callback
 692          * per call, and not return (-1) until the callback has completed.
 693          * When as_do_callbacks returns zero, all callbacks have completed.
 694          */
 695         mutex_enter(&as->a_contents);
 696         while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
 697                 ;
 698 
 699         mutex_exit(&as->a_contents);
 700         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 701 
 702         if (!free_started) {
 703                 free_started = B_TRUE;
 704                 hat_free_start(hat);
 705         }
 706         for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
 707                 int err;
 708 
 709                 next = AS_SEGNEXT(as, seg);
 710 retry:
 711                 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
 712                 if (err == EAGAIN) {
 713                         mutex_enter(&as->a_contents);
 714                         if (as->a_callbacks) {
 715                                 AS_LOCK_EXIT(as, &as->a_lock);
 716                         } else if (!AS_ISNOUNMAPWAIT(as)) {
 717                                 /*
 718                                  * Memory is currently locked. Wait for a
 719                                  * cv_signal that it has been unlocked, then
 720                                  * try the operation again.
 721                                  */
 722                                 if (AS_ISUNMAPWAIT(as) == 0)
 723                                         cv_broadcast(&as->a_cv);
 724                                 AS_SETUNMAPWAIT(as);
 725                                 AS_LOCK_EXIT(as, &as->a_lock);
 726                                 while (AS_ISUNMAPWAIT(as))
 727                                         cv_wait(&as->a_cv, &as->a_contents);
 728                         } else {
 729                                 /*
 730                                  * We may have raced with
 731                                  * segvn_reclaim()/segspt_reclaim(). In this
 732                                  * case clean nounmapwait flag and retry since
 733                                  * softlockcnt in this segment may be already
 734                                  * 0.  We don't drop as writer lock so our
 735                                  * number of retries without sleeping should
 736                                  * be very small. See segvn_reclaim() for
 737                                  * more comments.
 738                                  */
 739                                 AS_CLRNOUNMAPWAIT(as);
 740                                 mutex_exit(&as->a_contents);
 741                                 goto retry;
 742                         }
 743                         mutex_exit(&as->a_contents);
 744                         goto top;
 745                 } else {
 746                         /*
 747                          * We do not expect any other error return at this
 748                          * time. This is similar to an ASSERT in seg_unmap()
 749                          */
 750                         ASSERT(err == 0);
 751                 }
 752         }
 753         hat_free_end(hat);
 754         AS_LOCK_EXIT(as, &as->a_lock);
 755 
 756         /* /proc stuff */
 757         ASSERT(avl_numnodes(&as->a_wpage) == 0);
 758         if (as->a_objectdir) {
 759                 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
 760                 as->a_objectdir = NULL;
 761                 as->a_sizedir = 0;
 762         }
 763 
 764         /*
 765          * Free the struct as back to kmem.  Assert it has no segments.
 766          */
 767         ASSERT(avl_numnodes(&as->a_segtree) == 0);
 768         kmem_cache_free(as_cache, as);
 769 }
 770 
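     /*
      * Duplicate the address space of 'as' into a new as for forkedproc
      * (fork).  Each segment is re-created in the new as and copied via
      * SEGOP_DUP(); segments marked S_PURGE are skipped and their size is
      * excluded from the child's a_resvsize.  Returns 0 on success, -1 if a
      * segment could not be allocated, or the error from
      * SEGOP_DUP()/hat_dup().
      */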
 771 int
 772 as_dup(struct as *as, struct proc *forkedproc)
 773 {
 774         struct as *newas;
 775         struct seg *seg, *newseg;
 776         size_t  purgesize = 0;
 777         int error;
 778 
 779         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 780         as_clearwatch(as);
 781         newas = as_alloc();
 782         newas->a_userlimit = as->a_userlimit;
 783         newas->a_proc = forkedproc;
 784 
 785         AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
 786 
 787         (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
 788 
 789         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 790 
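                     /*
                      * Segments marked S_PURGE are not duplicated into the
                      * child; their size is accumulated in purgesize and
                      * subtracted from the child's a_resvsize after the loop.
                      */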
 791                 if (seg->s_flags & S_PURGE) {
 792                         purgesize += seg->s_size;
 793                         continue;
 794                 }
 795 
 796                 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
 797                 if (newseg == NULL) {
 798                         AS_LOCK_EXIT(newas, &newas->a_lock);
 799                         as_setwatch(as);
 800                         AS_LOCK_EXIT(as, &as->a_lock);
 801                         as_free(newas);
 802                         return (-1);
 803                 }
 804                 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
 805                         /*
 806                          * We call seg_free() on the new seg
 807                          * because the segment is not set up
 808                          * completely; i.e. it has no ops.
 809                          */
 810                         as_setwatch(as);
 811                         AS_LOCK_EXIT(as, &as->a_lock);
 812                         seg_free(newseg);
 813                         AS_LOCK_EXIT(newas, &newas->a_lock);
 814                         as_free(newas);
 815                         return (error);
 816                 }
 817                 newas->a_size += seg->s_size;
 818         }
 819         newas->a_resvsize = as->a_resvsize - purgesize;
 820 
 821         error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
 822 
 823         AS_LOCK_EXIT(newas, &newas->a_lock);
 824 
 825         as_setwatch(as);
 826         AS_LOCK_EXIT(as, &as->a_lock);
 827         if (error != 0) {
 828                 as_free(newas);
 829                 return (error);
 830         }
 831         forkedproc->p_as = newas;
 832         return (0);
 833 }
 834 
 835 /*
 836  * Handle a ``fault'' at addr for size bytes.
 837  */
 838 faultcode_t
 839 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
 840         enum fault_type type, enum seg_rw rw)
 841 {
 842         struct seg *seg;
 843         caddr_t raddr;                  /* rounded down addr */
 844         size_t rsize;                   /* rounded up size */
 845         size_t ssize;
 846         faultcode_t res = 0;
 847         caddr_t addrsav;
 848         struct seg *segsav;
 849         int as_lock_held;
 850         klwp_t *lwp = ttolwp(curthread);
 851 
 852 
 853 
 854 retry:
 855         /*
 856          * Indicate that the lwp is not to be stopped while waiting for a
 857          * pagefault.  This is to avoid deadlock while debugging a process
 858          * via /proc over NFS (in particular).
 859          */
 860         if (lwp != NULL)
 861                 lwp->lwp_nostop++;
 862 
 863         /*
 864          * The same length must be used when we softlock and softunlock.  We
 865          * don't support softunlocking lengths less than the original length
 866          * when there is largepage support.  See seg_dev.c for more
 867          * comments.
 868          */
 869         switch (type) {
 870 
 871         case F_SOFTLOCK:
 872                 CPU_STATS_ADD_K(vm, softlock, 1);
 873                 break;
 874 
 875         case F_SOFTUNLOCK:
 876                 break;
 877 
 878         case F_PROT:
 879                 CPU_STATS_ADD_K(vm, prot_fault, 1);
 880                 break;
 881 
 882         case F_INVAL:
 883                 CPU_STATS_ENTER_K();
 884                 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
 885                 if (as == &kas)
 886                         CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
 887                 CPU_STATS_EXIT_K();
 888                 break;
 889         }
 890 
 891         /* Kernel probe */
 892         TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
 893             tnf_opaque, address,        addr,
 894             tnf_fault_type,     fault_type,     type,
 895             tnf_seg_access,     access,         rw);
 896 
 897         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
 898         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
 899             (size_t)raddr;
 900 
 901         /*
 902          * XXX -- Don't grab the as lock for segkmap. We should grab it for
 903          * correctness, but then we could be stuck holding this lock for
 904          * a LONG time if the fault needs to be resolved on a slow
 905          * filesystem, and then no one will be able to exec new commands,
 906          * as exec'ing requires the write lock on the as.
 907          */
 908         if (as == &kas && segkmap && segkmap->s_base <= raddr &&
 909             raddr + size < segkmap->s_base + segkmap->s_size) {
 910                 seg = segkmap;
 911                 as_lock_held = 0;
 912         } else {
 913                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
 914 
 915                 seg = as_segat(as, raddr);
 916                 if (seg == NULL) {
 917                         AS_LOCK_EXIT(as, &as->a_lock);
 918                         if (lwp != NULL)
 919                                 lwp->lwp_nostop--;
 920                         return (FC_NOMAP);
 921                 }
 922 
 923                 as_lock_held = 1;
 924         }
 925 
 926         addrsav = raddr;
 927         segsav = seg;
 928 
 929         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
 930                 if (raddr >= seg->s_base + seg->s_size) {
 931                         seg = AS_SEGNEXT(as, seg);
 932                         if (seg == NULL || raddr != seg->s_base) {
 933                                 res = FC_NOMAP;
 934                                 break;
 935                         }
 936                 }
 937                 if (raddr + rsize > seg->s_base + seg->s_size)
 938                         ssize = seg->s_base + seg->s_size - raddr;
 939                 else
 940                         ssize = rsize;
 941 
 942                 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
 943                 if (res != 0)
 944                         break;
 945         }
 946 
 947         /*
 948          * If we were SOFTLOCKing and encountered a failure,
 949          * we must SOFTUNLOCK the range we already did. (Maybe we
 950          * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
 951          * right here...)
 952          */
 953         if (res != 0 && type == F_SOFTLOCK) {
 954                 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
 955                         if (addrsav >= seg->s_base + seg->s_size)
 956                                 seg = AS_SEGNEXT(as, seg);
 957                         ASSERT(seg != NULL);
 958                         /*
 959                          * Now call the fault routine again to perform the
 960                          * unlock using S_OTHER instead of the rw variable
 961                          * since we never got a chance to touch the pages.
 962                          */
 963                         if (raddr > seg->s_base + seg->s_size)
 964                                 ssize = seg->s_base + seg->s_size - addrsav;
 965                         else
 966                                 ssize = raddr - addrsav;
 967                         (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
 968                             F_SOFTUNLOCK, S_OTHER);
 969                 }
 970         }
 971         if (as_lock_held)
 972                 AS_LOCK_EXIT(as, &as->a_lock);
 973         if (lwp != NULL)
 974                 lwp->lwp_nostop--;
 975 
 976         /*
 977          * If the lower levels returned EDEADLK for a fault,
 978          * it means that we should retry the fault.  Let's also wait
 979          * a bit to let the deadlock-causing condition clear.
 980          * This is part of a gross hack to work around a design flaw
 981          * in the ufs/sds logging code and should go away when the
 982          * logging code is re-designed to fix the problem. See bug
 983          * 4125102 for details of the problem.
 984          */
 985         if (FC_ERRNO(res) == EDEADLK) {
 986                 delay(deadlk_wait);
 987                 res = 0;
 988                 goto retry;
 989         }
 990         return (res);
 991 }
 992 
 993 
 994 
 995 /*
 996  * Asynchronous ``fault'' at addr for size bytes.
 997  */
 998 faultcode_t
 999 as_faulta(struct as *as, caddr_t addr, size_t size)
1000 {
1001         struct seg *seg;
1002         caddr_t raddr;                  /* rounded down addr */
1003         size_t rsize;                   /* rounded up size */
1004         faultcode_t res = 0;
1005         klwp_t *lwp = ttolwp(curthread);
1006 
1007 retry:
1008         /*
1009          * Indicate that the lwp is not to be stopped while waiting
1010          * for a pagefault.  This is to avoid deadlock while debugging
1011          * a process via /proc over NFS (in particular).
1012          */
1013         if (lwp != NULL)
1014                 lwp->lwp_nostop++;
1015 
1016         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1017         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1018             (size_t)raddr;
1019 
1020         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1021         seg = as_segat(as, raddr);
1022         if (seg == NULL) {
1023                 AS_LOCK_EXIT(as, &as->a_lock);
1024                 if (lwp != NULL)
1025                         lwp->lwp_nostop--;
1026                 return (FC_NOMAP);
1027         }
1028 
1029         for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1030                 if (raddr >= seg->s_base + seg->s_size) {
1031                         seg = AS_SEGNEXT(as, seg);
1032                         if (seg == NULL || raddr != seg->s_base) {
1033                                 res = FC_NOMAP;
1034                                 break;
1035                         }
1036                 }
1037                 res = SEGOP_FAULTA(seg, raddr);
1038                 if (res != 0)
1039                         break;
1040         }
1041         AS_LOCK_EXIT(as, &as->a_lock);
1042         if (lwp != NULL)
1043                 lwp->lwp_nostop--;
1044         /*
1045          * If the lower levels returned EDEADLK for a fault,
1046          * it means that we should retry the fault.  Let's also wait
1047          * a bit to let the deadlock-causing condition clear.
1048          * This is part of a gross hack to work around a design flaw
1049          * in the ufs/sds logging code and should go away when the
1050          * logging code is re-designed to fix the problem. See bug
1051          * 4125102 for details of the problem.
1052          */
1053         if (FC_ERRNO(res) == EDEADLK) {
1054                 delay(deadlk_wait);
1055                 res = 0;
1056                 goto retry;
1057         }
1058         return (res);
1059 }
1060 
1061 /*
1062  * Set the virtual mapping for the interval from [addr : addr + size)
1063  * in address space `as' to have the specified protection.
1064  * It is ok for the range to cross over several segments,
1065  * as long as they are contiguous.
1066  */
1067 int
1068 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1069 {
1070         struct seg *seg;
1071         struct as_callback *cb;
1072         size_t ssize;
1073         caddr_t raddr;                  /* rounded down addr */
1074         size_t rsize;                   /* rounded up size */
1075         int error = 0, writer = 0;
1076         caddr_t saveraddr;
1077         size_t saversize;
1078 
1079 setprot_top:
1080         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1081         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1082             (size_t)raddr;
1083 
1084         if (raddr + rsize < raddr)           /* check for wraparound */
1085                 return (ENOMEM);
1086 
1087         saveraddr = raddr;
1088         saversize = rsize;
1089 
1090         /*
1091          * Normally we only lock the as as a reader. But
1092          * if due to setprot the segment driver needs to split
1093          * a segment it will return IE_RETRY. Therefore we re-acquire
1094          * the as lock as a writer so the segment driver can change
1095          * the seg list. Also the segment driver will return IE_RETRY
1096          * after it has changed the segment list, so we keep
1097          * locking as a writer. Since these operations should be rare,
1098          * we want to lock as a writer only when necessary.
1099          */
1100         if (writer || avl_numnodes(&as->a_wpage) != 0) {
1101                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1102         } else {
1103                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1104         }
1105 
1106         as_clearwatchprot(as, raddr, rsize);
1107         seg = as_segat(as, raddr);
1108         if (seg == NULL) {
1109                 as_setwatch(as);
1110                 AS_LOCK_EXIT(as, &as->a_lock);
1111                 return (ENOMEM);
1112         }
1113 
1114         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1115                 if (raddr >= seg->s_base + seg->s_size) {
1116                         seg = AS_SEGNEXT(as, seg);
1117                         if (seg == NULL || raddr != seg->s_base) {
1118                                 error = ENOMEM;
1119                                 break;
1120                         }
1121                 }
1122                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1123                         ssize = seg->s_base + seg->s_size - raddr;
1124                 else
1125                         ssize = rsize;
1126 retry:
1127                 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1128 
1129                 if (error == IE_NOMEM) {
1130                         error = EAGAIN;
1131                         break;
1132                 }
1133 
1134                 if (error == IE_RETRY) {
1135                         AS_LOCK_EXIT(as, &as->a_lock);
1136                         writer = 1;
1137                         goto setprot_top;
1138                 }
1139 
1140                 if (error == EAGAIN) {
1141                         /*
1142                          * Make sure we have a_lock as writer.
1143                          */
1144                         if (writer == 0) {
1145                                 AS_LOCK_EXIT(as, &as->a_lock);
1146                                 writer = 1;
1147                                 goto setprot_top;
1148                         }
1149 
1150                         /*
1151                          * Memory is currently locked.  It must be unlocked
1152                          * before this operation can succeed through a retry.
1153                          * The possible reasons for locked memory and
1154                          * corresponding strategies for unlocking are:
1155                          * (1) Normal I/O
1156                          *      wait for a signal that the I/O operation
1157                          *      has completed and the memory is unlocked.
1158                          * (2) Asynchronous I/O
1159                          *      The aio subsystem does not unlock pages when
1160                          *      the I/O is completed. Those pages are unlocked
1161                          *      when the application calls aiowait/aioerror.
1162                          *      So, to prevent blocking forever, cv_broadcast()
1163                          *      is done to wake up aio_cleanup_thread.
1164                          *      Subsequently, segvn_reclaim will be called, and
1165                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1166                          * (3) Long term page locking:
1167                          *      Drivers intending to have pages locked for a
1168                          *      period considerably longer than for normal I/O
1169                          *      (essentially forever) may have registered for a
1170                          *      callback so they may unlock these pages on
1171                          *      request. This is needed to allow this operation
1172                          *      to succeed. Each entry on the callback list is
1173          *      examined. If the event or address range pertains,
1174          *      the callback is invoked (unless it is already in
1175                          *      progress). The a_contents lock must be dropped
1176                          *      before the callback, so only one callback can
1177                          *      be done at a time. Go to the top and do more
1178                          *      until zero is returned. If zero is returned,
1179                          *      either there were no callbacks for this event
1180                          *      or they were already in progress.
1181                          */
1182                         mutex_enter(&as->a_contents);
1183                         if (as->a_callbacks &&
1184                             (cb = as_find_callback(as, AS_SETPROT_EVENT,
1185                             seg->s_base, seg->s_size))) {
1186                                 AS_LOCK_EXIT(as, &as->a_lock);
1187                                 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1188                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1189                                 if (AS_ISUNMAPWAIT(as) == 0)
1190                                         cv_broadcast(&as->a_cv);
1191                                 AS_SETUNMAPWAIT(as);
1192                                 AS_LOCK_EXIT(as, &as->a_lock);
1193                                 while (AS_ISUNMAPWAIT(as))
1194                                         cv_wait(&as->a_cv, &as->a_contents);
1195                         } else {
1196                                 /*
1197                                  * We may have raced with
1198                                  * segvn_reclaim()/segspt_reclaim(). In this
1199                                  * case clean nounmapwait flag and retry since
1200                                  * softlockcnt in this segment may be already
1201                                  * 0.  We don't drop as writer lock so our
1202                                  * number of retries without sleeping should
1203                                  * be very small. See segvn_reclaim() for
1204                                  * more comments.
1205                                  */
1206                                 AS_CLRNOUNMAPWAIT(as);
1207                                 mutex_exit(&as->a_contents);
1208                                 goto retry;
1209                         }
1210                         mutex_exit(&as->a_contents);
1211                         goto setprot_top;
1212                 } else if (error != 0)
1213                         break;
1214         }
1215         if (error != 0) {
1216                 as_setwatch(as);
1217         } else {
1218                 as_setwatchprot(as, saveraddr, saversize, prot);
1219         }
1220         AS_LOCK_EXIT(as, &as->a_lock);
1221         return (error);
1222 }
1223 
1224 /*
1225  * Check to make sure that the interval [addr, addr + size)
1226  * in address space `as' has at least the specified protection.
1227  * It is ok for the range to cross over several segments, as long
1228  * as they are contiguous.
1229  */
1230 int
1231 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1232 {
1233         struct seg *seg;
1234         size_t ssize;
1235         caddr_t raddr;                  /* rounded down addr */
1236         size_t rsize;                   /* rounded up size */
1237         int error = 0;
1238 
1239         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1240         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1241             (size_t)raddr;
1242 
1243         if (raddr + rsize < raddr)           /* check for wraparound */
1244                 return (ENOMEM);
1245 
1246         /*
1247          * This is ugly as sin...
1248          * Normally, we only acquire the address space readers lock.
1249          * However, if the address space has watchpoints present,
1250          * we must acquire the writer lock on the address space for
1251          * the benefit of as_clearwatchprot() and as_setwatchprot().
1252          */
1253         if (avl_numnodes(&as->a_wpage) != 0)
1254                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1255         else
1256                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1257         as_clearwatchprot(as, raddr, rsize);
1258         seg = as_segat(as, raddr);
1259         if (seg == NULL) {
1260                 as_setwatch(as);
1261                 AS_LOCK_EXIT(as, &as->a_lock);
1262                 return (ENOMEM);
1263         }
1264 
1265         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1266                 if (raddr >= seg->s_base + seg->s_size) {
1267                         seg = AS_SEGNEXT(as, seg);
1268                         if (seg == NULL || raddr != seg->s_base) {
1269                                 error = ENOMEM;
1270                                 break;
1271                         }
1272                 }
1273                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1274                         ssize = seg->s_base + seg->s_size - raddr;
1275                 else
1276                         ssize = rsize;
1277 
1278                 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1279                 if (error != 0)
1280                         break;
1281         }
1282         as_setwatch(as);
1283         AS_LOCK_EXIT(as, &as->a_lock);
1284         return (error);
1285 }
1286 
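     /*
      * Unmap the range [addr, addr + size) from address space 'as', walking
      * the affected segments in address order.  EAGAIN (locked memory) and
      * IE_RETRY from the segment driver cause the operation to be restarted,
      * as described in the comment inside the loop below.  Returns 0 on
      * success, -1 on failure.
      */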
1287 int
1288 as_unmap(struct as *as, caddr_t addr, size_t size)
1289 {
1290         struct seg *seg, *seg_next;
1291         struct as_callback *cb;
1292         caddr_t raddr, eaddr;
1293         size_t ssize, rsize = 0;
1294         int err;
1295 
1296 top:
1297         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1298         eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1299             (uintptr_t)PAGEMASK);
1300 
1301         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1302 
1303         as->a_updatedir = 1; /* inform /proc */
1304         gethrestime(&as->a_updatetime);
1305 
1306         /*
1307          * Use as_findseg to find the first segment in the range, then
1308          * step through the segments in order, following s_next.
1309          */
1310         as_clearwatchprot(as, raddr, eaddr - raddr);
1311 
1312         for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1313                 if (eaddr <= seg->s_base)
1314                         break;          /* eaddr was in a gap; all done */
1315 
1316                 /* this is implied by the test above */
1317                 ASSERT(raddr < eaddr);
1318 
1319                 if (raddr < seg->s_base)
1320                         raddr = seg->s_base;         /* raddr was in a gap */
1321 
1322                 if (eaddr > (seg->s_base + seg->s_size))
1323                         ssize = seg->s_base + seg->s_size - raddr;
1324                 else
1325                         ssize = eaddr - raddr;
1326 
1327                 /*
1328                  * Save next segment pointer since seg can be
1329                  * destroyed during the segment unmap operation.
1330                  */
1331                 seg_next = AS_SEGNEXT(as, seg);
1332 
1333                 /*
1334                  * We didn't count /dev/null mappings, so ignore them here.
1335                  * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1336                  * we have to do this check here while we have seg.)
1337                  */
1338                 rsize = 0;
1339                 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1340                     !SEG_IS_PARTIAL_RESV(seg))
1341                         rsize = ssize;
1342 
1343 retry:
1344                 err = SEGOP_UNMAP(seg, raddr, ssize);
1345                 if (err == EAGAIN) {
1346                         /*
1347                          * Memory is currently locked.  It must be unlocked
1348                          * before this operation can succeed through a retry.
1349                          * The possible reasons for locked memory and
1350                          * corresponding strategies for unlocking are:
1351                          * (1) Normal I/O
1352                          *      wait for a signal that the I/O operation
1353                          *      has completed and the memory is unlocked.
1354                          * (2) Asynchronous I/O
1355                          *      The aio subsystem does not unlock pages when
1356                          *      the I/O is completed. Those pages are unlocked
1357                          *      when the application calls aiowait/aioerror.
1358                          *      So, to prevent blocking forever, cv_broadcast()
1359                          *      is done to wake up aio_cleanup_thread.
1360                          *      Subsequently, segvn_reclaim will be called, and
1361                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1362                          * (3) Long term page locking:
1363                          *      Drivers intending to have pages locked for a
1364                          *      period considerably longer than for normal I/O
1365                          *      (essentially forever) may have registered for a
1366                          *      callback so they may unlock these pages on
1367                          *      request. This is needed to allow this operation
1368                          *      to succeed. Each entry on the callback list is
1369          *      examined. If the event or address range pertains,
1370          *      the callback is invoked (unless it is already in
1371                          *      progress). The a_contents lock must be dropped
1372                          *      before the callback, so only one callback can
1373                          *      be done at a time. Go to the top and do more
1374                          *      until zero is returned. If zero is returned,
1375                          *      either there were no callbacks for this event
1376                          *      or they were already in progress.
1377                          */
1378                         mutex_enter(&as->a_contents);
1379                         if (as->a_callbacks &&
1380                             (cb = as_find_callback(as, AS_UNMAP_EVENT,
1381                             seg->s_base, seg->s_size))) {
1382                                 AS_LOCK_EXIT(as, &as->a_lock);
1383                                 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1384                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1385                                 if (AS_ISUNMAPWAIT(as) == 0)
1386                                         cv_broadcast(&as->a_cv);
1387                                 AS_SETUNMAPWAIT(as);
1388                                 AS_LOCK_EXIT(as, &as->a_lock);
1389                                 while (AS_ISUNMAPWAIT(as))
1390                                         cv_wait(&as->a_cv, &as->a_contents);
1391                         } else {
1392                                 /*
1393                                  * We may have raced with
1394                                  * segvn_reclaim()/segspt_reclaim(). In this
1395                                  * case clean nounmapwait flag and retry since
1396                                  * softlockcnt in this segment may be already
1397                                  * 0.  We don't drop as writer lock so our
1398                                  * number of retries without sleeping should
1399                                  * be very small. See segvn_reclaim() for
1400                                  * more comments.
1401                                  */
1402                                 AS_CLRNOUNMAPWAIT(as);
1403                                 mutex_exit(&as->a_contents);
1404                                 goto retry;
1405                         }
1406                         mutex_exit(&as->a_contents);
1407                         goto top;
1408                 } else if (err == IE_RETRY) {
1409                         AS_LOCK_EXIT(as, &as->a_lock);
1410                         goto top;
1411                 } else if (err) {
1412                         as_setwatch(as);
1413                         AS_LOCK_EXIT(as, &as->a_lock);
1414                         return (-1);
1415                 }
1416 
1417                 as->a_size -= ssize;
1418                 if (rsize)
1419                         as->a_resvsize -= rsize;
1420                 raddr += ssize;
1421         }
1422         AS_LOCK_EXIT(as, &as->a_lock);
1423         return (0);
1424 }
1425 
1426 static int
1427 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1428     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1429 {
1430         uint_t szc;
1431         uint_t nszc;
1432         int error;
1433         caddr_t a;
1434         caddr_t eaddr;
1435         size_t segsize;
1436         struct seg *seg;
1437         size_t pgsz;
1438         int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1439         uint_t save_szcvec;
1440 
1441         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1442         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1443         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1444         ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1445         if (!do_off) {
1446                 vn_a->offset = 0;
1447         }
1448 
1449         if (szcvec <= 1) {
1450                 seg = seg_alloc(as, addr, size);
1451                 if (seg == NULL) {
1452                         return (ENOMEM);
1453                 }
1454                 vn_a->szc = 0;
1455                 error = (*crfp)(seg, vn_a);
1456                 if (error != 0) {
1457                         seg_free(seg);
1458                 } else {
1459                         as->a_size += size;
1460                         as->a_resvsize += size;
1461                 }
1462                 return (error);
1463         }
1464 
1465         eaddr = addr + size;
1466         save_szcvec = szcvec;
1467         szcvec >>= 1;
1468         szc = 0;
1469         nszc = 0;
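             /*
              * Walk the page-size codes in szcvec from smallest to largest.
              * For each size present, carve off a leading sub-segment (mapped
              * with the previous, smaller size) so that the remaining range
              * becomes aligned to the larger page size.
              */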
1470         while (szcvec) {
1471                 if ((szcvec & 0x1) == 0) {
1472                         nszc++;
1473                         szcvec >>= 1;
1474                         continue;
1475                 }
1476                 nszc++;
1477                 pgsz = page_get_pagesize(nszc);
1478                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1479                 if (a != addr) {
1480                         ASSERT(a < eaddr);
1481                         segsize = a - addr;
1482                         seg = seg_alloc(as, addr, segsize);
1483                         if (seg == NULL) {
1484                                 return (ENOMEM);
1485                         }
1486                         vn_a->szc = szc;
1487                         error = (*crfp)(seg, vn_a);
1488                         if (error != 0) {
1489                                 seg_free(seg);
1490                                 return (error);
1491                         }
1492                         as->a_size += segsize;
1493                         as->a_resvsize += segsize;
1494                         *segcreated = 1;
1495                         if (do_off) {
1496                                 vn_a->offset += segsize;
1497                         }
1498                         addr = a;
1499                 }
1500                 szc = nszc;
1501                 szcvec >>= 1;
1502         }
1503 
1504         ASSERT(addr < eaddr);
1505         szcvec = save_szcvec | 1; /* add 8K pages */
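             /*
              * Map the remainder from the largest page size down to PAGESIZE.
              * At each size, map the portion of the range that ends on that
              * size's boundary and leave the unaligned tail for the next
              * smaller size.
              */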
1506         while (szcvec) {
1507                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1508                 ASSERT(a >= addr);
1509                 if (a != addr) {
1510                         segsize = a - addr;
1511                         seg = seg_alloc(as, addr, segsize);
1512                         if (seg == NULL) {
1513                                 return (ENOMEM);
1514                         }
1515                         vn_a->szc = szc;
1516                         error = (*crfp)(seg, vn_a);
1517                         if (error != 0) {
1518                                 seg_free(seg);
1519                                 return (error);
1520                         }
1521                         as->a_size += segsize;
1522                         as->a_resvsize += segsize;
1523                         *segcreated = 1;
1524                         if (do_off) {
1525                                 vn_a->offset += segsize;
1526                         }
1527                         addr = a;
1528                 }
1529                 szcvec &= ~(1 << szc);
1530                 if (szcvec) {
1531                         szc = highbit(szcvec) - 1;
1532                         pgsz = page_get_pagesize(szc);
1533                 }
1534         }
1535         ASSERT(addr == eaddr);
1536 
1537         return (0);
1538 }
1539 
1540 static int
1541 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1542     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1543 {
1544         uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1545         int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1546         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1547             type, 0);
1548         int error;
1549         struct seg *seg;
1550         struct vattr va;
1551         u_offset_t eoff;
1552         size_t save_size = 0;
1553         extern size_t textrepl_size_thresh;
1554 
1555         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1556         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1557         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1558         ASSERT(vn_a->vp != NULL);
1559         ASSERT(vn_a->amp == NULL);
1560 
1561 again:
1562         if (szcvec <= 1) {
1563                 seg = seg_alloc(as, addr, size);
1564                 if (seg == NULL) {
1565                         return (ENOMEM);
1566                 }
1567                 vn_a->szc = 0;
1568                 error = (*crfp)(seg, vn_a);
1569                 if (error != 0) {
1570                         seg_free(seg);
1571                 } else {
1572                         as->a_size += size;
1573                         as->a_resvsize += size;
1574                 }
1575                 return (error);
1576         }
1577 
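             /*
              * Get the file size.  If it cannot be obtained, or the mapping
              * starts at or beyond end-of-file, fall back to base pages.
              */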
1578         va.va_mask = AT_SIZE;
1579         if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1580                 szcvec = 0;
1581                 goto again;
1582         }
1583         eoff = vn_a->offset & PAGEMASK;
1584         if (eoff >= va.va_size) {
1585                 szcvec = 0;
1586                 goto again;
1587         }
1588         eoff += size;
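             /*
              * If the mapping extends past end-of-file, use large pages only
              * for the portion backed by the file; the remainder is mapped
              * with base pages on a second pass (see save_size below).
              */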
1589         if (btopr(va.va_size) < btopr(eoff)) {
1590                 save_size = size;
1591                 size = va.va_size - (vn_a->offset & PAGEMASK);
1592                 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1593                 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1594                     type, 0);
1595                 if (szcvec <= 1) {
1596                         size = save_size;
1597                         goto again;
1598                 }
1599         }
1600 
1601         if (size > textrepl_size_thresh) {
1602                 vn_a->flags |= _MAP_TEXTREPL;
1603         }
1604         error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1605             segcreated);
1606         if (error != 0) {
1607                 return (error);
1608         }
1609         if (save_size) {
1610                 addr += size;
1611                 size = save_size - size;
1612                 szcvec = 0;
1613                 goto again;
1614         }
1615         return (0);
1616 }
1617 
1618 /*
1619  * as_map_ansegs: shared or private anonymous memory.  Note that the flags
1620  * passed to map_pgszcvec() cannot be MAP_INITDATA, for anon.
1621  */
1622 static int
1623 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1624     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1625 {
1626         uint_t szcvec;
1627         uchar_t type;
1628 
1629         ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1630         if (vn_a->type == MAP_SHARED) {
1631                 type = MAPPGSZC_SHM;
1632         } else if (vn_a->type == MAP_PRIVATE) {
1633                 if (vn_a->szc == AS_MAP_HEAP) {
1634                         type = MAPPGSZC_HEAP;
1635                 } else if (vn_a->szc == AS_MAP_STACK) {
1636                         type = MAPPGSZC_STACK;
1637                 } else {
1638                         type = MAPPGSZC_PRIVM;
1639                 }
1640         }
1641         szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1642             (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1643             (vn_a->flags & MAP_TEXT), type, 0);
1644         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1645         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1646         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1647         ASSERT(vn_a->vp == NULL);
1648 
1649         return (as_map_segvn_segs(as, addr, size, szcvec,
1650             crfp, vn_a, segcreated));
1651 }
1652 
1653 int
1654 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1655 {
1656         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1657         return (as_map_locked(as, addr, size, crfp, argsp));
1658 }
1659 
1660 int
1661 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1662                 void *argsp)
1663 {
1664         struct seg *seg = NULL;
1665         caddr_t raddr;                  /* rounded down addr */
1666         size_t rsize;                   /* rounded up size */
1667         int error;
1668         int unmap = 0;
1669         struct proc *p = curproc;
1670         struct segvn_crargs crargs;
1671 
1672         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1673         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1674             (size_t)raddr;
1675 
1676         /*
1677          * check for wrap around
1678          */
1679         if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1680                 AS_LOCK_EXIT(as, &as->a_lock);
1681                 return (ENOMEM);
1682         }
1683 
1684         as->a_updatedir = 1; /* inform /proc */
1685         gethrestime(&as->a_updatetime);
1686 
1687         if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1688                 AS_LOCK_EXIT(as, &as->a_lock);
1689 
1690                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1691                     RCA_UNSAFE_ALL);
1692 
1693                 return (ENOMEM);
1694         }
1695 
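             /*
              * The two checks below select the large-page mapping paths for
              * vnode-backed and anonymous segvn mappings.  A local copy of the
              * creation args is used because the helpers modify them (offset,
              * szc, flags) while carving the range into segments.
              */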
1696         if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1697                 crargs = *(struct segvn_crargs *)argsp;
1698                 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1699                 if (error != 0) {
1700                         AS_LOCK_EXIT(as, &as->a_lock);
1701                         if (unmap) {
1702                                 (void) as_unmap(as, addr, size);
1703                         }
1704                         return (error);
1705                 }
1706         } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1707                 crargs = *(struct segvn_crargs *)argsp;
1708                 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1709                 if (error != 0) {
1710                         AS_LOCK_EXIT(as, &as->a_lock);
1711                         if (unmap) {
1712                                 (void) as_unmap(as, addr, size);
1713                         }
1714                         return (error);
1715                 }
1716         } else {
1717                 seg = seg_alloc(as, addr, size);
1718                 if (seg == NULL) {
1719                         AS_LOCK_EXIT(as, &as->a_lock);
1720                         return (ENOMEM);
1721                 }
1722 
1723                 error = (*crfp)(seg, argsp);
1724                 if (error != 0) {
1725                         seg_free(seg);
1726                         AS_LOCK_EXIT(as, &as->a_lock);
1727                         return (error);
1728                 }
1729                 /*
1730                  * Add size now so as_unmap will work if as_ctl fails.
1731                  */
1732                 as->a_size += rsize;
1733                 as->a_resvsize += rsize;
1734         }
1735 
1736         as_setwatch(as);
1737 
1738         /*
1739          * If the address space is locked,
1740          * establish memory locks for the new segment.
1741          */
1742         mutex_enter(&as->a_contents);
1743         if (AS_ISPGLCK(as)) {
1744                 mutex_exit(&as->a_contents);
1745                 AS_LOCK_EXIT(as, &as->a_lock);
1746                 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1747                 if (error != 0)
1748                         (void) as_unmap(as, addr, size);
1749         } else {
1750                 mutex_exit(&as->a_contents);
1751                 AS_LOCK_EXIT(as, &as->a_lock);
1752         }
1753         return (error);
1754 }
1755 
1756 
1757 /*
1758  * Delete all segments in the address space marked with S_PURGE.
1759  * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1760  * These segments are deleted as a first step before calls to as_gap(), so
1761  * that they don't affect mmap() or shmat().
1762  */
1763 void
1764 as_purge(struct as *as)
1765 {
1766         struct seg *seg;
1767         struct seg *next_seg;
1768 
1769         /*
1770          * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1771          * there is no need to grab the a_contents mutex for this check
1772          */
1773         if ((as->a_flags & AS_NEEDSPURGE) == 0)
1774                 return;
1775 
1776         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1777         next_seg = NULL;
1778         seg = AS_SEGFIRST(as);
1779         while (seg != NULL) {
1780                 next_seg = AS_SEGNEXT(as, seg);
1781                 if (seg->s_flags & S_PURGE)
1782                         SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1783                 seg = next_seg;
1784         }
1785         AS_LOCK_EXIT(as, &as->a_lock);
1786 
1787         mutex_enter(&as->a_contents);
1788         as->a_flags &= ~AS_NEEDSPURGE;
1789         mutex_exit(&as->a_contents);
1790 }
1791 
1792 /*
1793  * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1794  * range of addresses at least "minlen" long, where the base of the range is
1795  * at "off" phase from an "align" boundary and there is space for a
1796  * "redzone"-sized redzone on either side of the range.  Thus,
1797  * if align was 4M and off was 16k, the user wants a hole which will start
1798  * 16k into a 4M page.
1799  *
1800  * If flags specifies AH_HI, the hole will have the highest possible address
1801  * in the range.  We use the as->a_lastgap field to figure out where to
1802  * start looking for a gap.
1803  *
1804  * Otherwise, the gap will have the lowest possible address.
1805  *
1806  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1807  *
1808  * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1809  * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1810  *
1811  * NOTE: This routine is not correct when base+len overflows caddr_t.
1812  */
1813 int
1814 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1815     uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1816 {
1817         caddr_t lobound = *basep;
1818         caddr_t hibound = lobound + *lenp;
1819         struct seg *lseg, *hseg;
1820         caddr_t lo, hi;
1821         int forward;
1822         caddr_t save_base;
1823         size_t save_len;
1824         size_t save_minlen;
1825         size_t save_redzone;
1826         int fast_path = 1;
1827 
1828         save_base = *basep;
1829         save_len = *lenp;
1830         save_minlen = minlen;
1831         save_redzone = redzone;
1832 
1833         /*
1834          * For the first pass/fast_path, just add align and redzone into
1835          * minlen since if we get an allocation, we can guarantee that it
1836          * will fit the alignment and redzone requested.
1837          * This increases the chance that hibound will be adjusted to
1838          * a_lastgap->s_base which will likely allow us to find an
1839          * acceptable hole in the address space quicker.
1840          * If we can't find a hole with this fast_path, then we look for
1841          * smaller holes in which the alignment and offset may allow
1842          * the allocation to fit.
1843          */
1844         minlen += align;
1845         minlen += 2 * redzone;
1846         redzone = 0;
1847 
1848         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1849         if (AS_SEGFIRST(as) == NULL) {
1850                 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1851                     align, redzone, off)) {
1852                         AS_LOCK_EXIT(as, &as->a_lock);
1853                         return (0);
1854                 } else {
1855                         AS_LOCK_EXIT(as, &as->a_lock);
1856                         *basep = save_base;
1857                         *lenp = save_len;
1858                         return (-1);
1859                 }
1860         }
1861 
1862 retry:
1863         /*
1864          * Set up to iterate over all the inter-segment holes in the given
1865          * direction.  lseg is NULL for the lowest-addressed hole and hseg is
1866          * NULL for the highest-addressed hole.  If moving backwards, we reset
1867          * hseg to denote the highest-addressed segment.
1868          */
1869         forward = (flags & AH_DIR) == AH_LO;
1870         if (forward) {
1871                 hseg = as_findseg(as, lobound, 1);
1872                 lseg = AS_SEGPREV(as, hseg);
1873         } else {
1874 
1875                 /*
1876                  * If allocating at least as much as the last allocation,
1877                  * use a_lastgap's base as a better estimate of hibound.
1878                  */
1879                 if (as->a_lastgap &&
1880                     minlen >= as->a_lastgap->s_size &&
1881                     hibound >= as->a_lastgap->s_base)
1882                         hibound = as->a_lastgap->s_base;
1883 
1884                 hseg = as_findseg(as, hibound, 1);
1885                 if (hseg->s_base + hseg->s_size < hibound) {
1886                         lseg = hseg;
1887                         hseg = NULL;
1888                 } else {
1889                         lseg = AS_SEGPREV(as, hseg);
1890                 }
1891         }
1892 
1893         for (;;) {
1894                 /*
1895                  * Set lo and hi to the hole's boundaries.  (We should really
1896                  * use MAXADDR in place of hibound in the expression below,
1897                  * but can't express it easily; using hibound in its place is
1898                  * harmless.)
1899                  */
1900                 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1901                 hi = (hseg == NULL) ? hibound : hseg->s_base;
1902                 /*
1903                  * If the iteration has moved past the interval from lobound
1904                  * to hibound it's pointless to continue.
1905                  */
1906                 if ((forward && lo > hibound) || (!forward && hi < lobound))
1907                         break;
1908                 else if (lo > hibound || hi < lobound)
1909                         goto cont;
1910                 /*
1911                  * Candidate hole lies at least partially within the allowable
1912                  * range.  Restrict it to fall completely within that range,
1913                  * i.e., to [max(lo, lobound), min(hi, hibound)].
1914                  */
1915                 if (lo < lobound)
1916                         lo = lobound;
1917                 if (hi > hibound)
1918                         hi = hibound;
1919                 /*
1920                  * Verify that the candidate hole is big enough and meets
1921                  * hardware constraints.  If the hole is too small, no need
1922                  * to do the further checks since they will fail.
1923                  */
1924                 *basep = lo;
1925                 *lenp = hi - lo;
1926                 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1927                     minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1928                     ((flags & AH_CONTAIN) == 0 ||
1929                     (*basep <= addr && *basep + *lenp > addr))) {
1930                         if (!forward)
1931                                 as->a_lastgap = hseg;
1932                         if (hseg != NULL)
1933                                 as->a_lastgaphl = hseg;
1934                         else
1935                                 as->a_lastgaphl = lseg;
1936                         AS_LOCK_EXIT(as, &as->a_lock);
1937                         return (0);
1938                 }
1939         cont:
1940                 /*
1941                  * Move to the next hole.
1942                  */
1943                 if (forward) {
1944                         lseg = hseg;
1945                         if (lseg == NULL)
1946                                 break;
1947                         hseg = AS_SEGNEXT(as, hseg);
1948                 } else {
1949                         hseg = lseg;
1950                         if (hseg == NULL)
1951                                 break;
1952                         lseg = AS_SEGPREV(as, lseg);
1953                 }
1954         }
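             /*
              * The fast path folded the alignment and redzone into minlen and
              * found nothing.  Retry once with the caller's original minlen
              * and redzone so that smaller holes, where the alignment and
              * offset may still allow the allocation to fit, are considered.
              */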
1955         if (fast_path && (align != 0 || save_redzone != 0)) {
1956                 fast_path = 0;
1957                 minlen = save_minlen;
1958                 redzone = save_redzone;
1959                 goto retry;
1960         }
1961         *basep = save_base;
1962         *lenp = save_len;
1963         AS_LOCK_EXIT(as, &as->a_lock);
1964         return (-1);
1965 }
1966 
1967 /*
1968  * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1969  *
1970  * If flags specifies AH_HI, the hole will have the highest possible address
1971  * in the range.  We use the as->a_lastgap field to figure out where to
1972  * start looking for a gap.
1973  *
1974  * Otherwise, the gap will have the lowest possible address.
1975  *
1976  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1977  *
1978  * If an adequate hole is found, base and len are set to reflect the part of
1979  * the hole that is within range, and 0 is returned, otherwise,
1980  * -1 is returned.
1981  *
1982  * NOTE: This routine is not correct when base+len overflows caddr_t.
1983  */
1984 int
1985 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1986     caddr_t addr)
1987 {
1988 
1989         return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1990 }
1991 
1992 /*
1993  * Return the next range within [base, base + len) that is backed
1994  * with "real memory".  Skip holes and non-seg_vn segments.
1995  * We're lazy and only return one segment at a time.
1996  */
1997 int
1998 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
1999 {
2000         extern struct seg_ops segspt_shmops;    /* needs a header file */
2001         struct seg *seg;
2002         caddr_t addr, eaddr;
2003         caddr_t segend;
2004 
2005         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2006 
2007         addr = *basep;
2008         eaddr = addr + *lenp;
2009 
2010         seg = as_findseg(as, addr, 0);
2011         if (seg != NULL)
2012                 addr = MAX(seg->s_base, addr);
2013 
2014         for (;;) {
2015                 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2016                         AS_LOCK_EXIT(as, &as->a_lock);
2017                         return (EINVAL);
2018                 }
2019 
2020                 if (seg->s_ops == &segvn_ops) {
2021                         segend = seg->s_base + seg->s_size;
2022                         break;
2023                 }
2024 
2025                 /*
2026                  * We do ISM by looking into the private data
2027                  * to determine the real size of the segment.
2028                  */
2029                 if (seg->s_ops == &segspt_shmops) {
2030                         segend = seg->s_base + spt_realsize(seg);
2031                         if (addr < segend)
2032                                 break;
2033                 }
2034 
2035                 seg = AS_SEGNEXT(as, seg);
2036 
2037                 if (seg != NULL)
2038                         addr = seg->s_base;
2039         }
2040 
2041         *basep = addr;
2042 
2043         if (segend > eaddr)
2044                 *lenp = eaddr - addr;
2045         else
2046                 *lenp = segend - addr;
2047 
2048         AS_LOCK_EXIT(as, &as->a_lock);
2049         return (0);
2050 }
2051 
2052 /*
2053  * Swap the pages associated with the address space as out to
2054  * secondary storage, returning the number of bytes actually
2055  * swapped.
2056  *
2057  * The value returned is intended to correlate well with the process's
2058  * memory requirements.  Its usefulness for this purpose depends on
2059  * how well the segment-level routines do at returning accurate
2060  * information.
2061  */
2062 size_t
2063 as_swapout(struct as *as)
2064 {
2065         struct seg *seg;
2066         size_t swpcnt = 0;
2067 
2068         /*
2069          * Kernel-only processes have given up their address
2070          * spaces.  Of course, we shouldn't be attempting to
2071          * swap out such processes in the first place...
2072          */
2073         if (as == NULL)
2074                 return (0);
2075 
2076         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2077 
2078         /*
2079          * Free all mapping resources associated with the address
2080          * space.  The segment-level swapout routines capitalize
2081          * on this unmapping by scavenging pages that have become
2082          * unmapped here.
2083          */
2084         hat_swapout(as->a_hat);
2085 
2086         /*
2087          * Call the swapout routines of all segments in the address
2088          * space to do the actual work, accumulating the amount of
2089          * space reclaimed.
2090          */
2091         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2092                 struct seg_ops *ov = seg->s_ops;
2093 
2094                 /*
2095                  * We have to check to see if the seg has
2096                  * an ops vector because the seg may have
2097                  * been in the middle of being set up when
2098                  * the process was picked for swapout.
2099                  */
2100                 if ((ov != NULL) && (ov->swapout != NULL))
2101                         swpcnt += SEGOP_SWAPOUT(seg);
2102         }
2103         AS_LOCK_EXIT(as, &as->a_lock);
2104         return (swpcnt);
2105 }
2106 
2107 /*
2108  * Determine whether data from the mappings in interval [addr, addr + size)
2109  * are in the primary memory (core) cache.
2110  */
2111 int
2112 as_incore(struct as *as, caddr_t addr,
2113     size_t size, char *vec, size_t *sizep)
2114 {
2115         struct seg *seg;
2116         size_t ssize;
2117         caddr_t raddr;          /* rounded down addr */
2118         size_t rsize;           /* rounded up size */
2119         size_t isize;                   /* iteration size */
2120         int error = 0;          /* result, assume success */
2121 
2122         *sizep = 0;
2123         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2124         rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2125             (size_t)raddr;
2126 
2127         if (raddr + rsize < raddr)           /* check for wraparound */
2128                 return (ENOMEM);
2129 
2130         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2131         seg = as_segat(as, raddr);
2132         if (seg == NULL) {
2133                 AS_LOCK_EXIT(as, &as->a_lock);
2134                 return (-1);
2135         }
2136 
2137         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2138                 if (raddr >= seg->s_base + seg->s_size) {
2139                         seg = AS_SEGNEXT(as, seg);
2140                         if (seg == NULL || raddr != seg->s_base) {
2141                                 error = -1;
2142                                 break;
2143                         }
2144                 }
2145                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2146                         ssize = seg->s_base + seg->s_size - raddr;
2147                 else
2148                         ssize = rsize;
2149                 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2150                 if (isize != ssize) {
2151                         error = -1;
2152                         break;
2153                 }
2154                 vec += btopr(ssize);
2155         }
2156         AS_LOCK_EXIT(as, &as->a_lock);
2157         return (error);
2158 }
2159 
2160 static void
2161 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2162         ulong_t *bitmap, size_t position, size_t npages)
2163 {
2164         caddr_t range_start;
2165         size_t  pos1 = position;
2166         size_t  pos2;
2167         size_t  size;
2168         size_t  end_pos = npages + position;
2169 
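             /*
              * Unlock each contiguous run of pages that the bitmap marks as
              * locked; bt_range() returns the boundaries of each such run.
              */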
2170         while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2171                 size = ptob((pos2 - pos1));
2172                 range_start = (caddr_t)((uintptr_t)addr +
2173                     ptob(pos1 - position));
2174 
2175                 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2176                     (ulong_t *)NULL, (size_t)NULL);
2177                 pos1 = pos2;
2178         }
2179 }
2180 
2181 static void
2182 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2183         caddr_t raddr, size_t rsize)
2184 {
2185         struct seg *seg = as_segat(as, raddr);
2186         size_t ssize;
2187 
2188         while (rsize != 0) {
2189                 if (raddr >= seg->s_base + seg->s_size)
2190                         seg = AS_SEGNEXT(as, seg);
2191 
2192                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2193                         ssize = seg->s_base + seg->s_size - raddr;
2194                 else
2195                         ssize = rsize;
2196 
2197                 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2198 
2199                 rsize -= ssize;
2200                 raddr += ssize;
2201         }
2202 }
2203 
2204 /*
2205  * Cache control operations over the interval [addr, addr + size) in
2206  * address space "as".
2207  */
2208 /*ARGSUSED*/
2209 int
2210 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2211     uintptr_t arg, ulong_t *lock_map, size_t pos)
2212 {
2213         struct seg *seg;        /* working segment */
2214         caddr_t raddr;          /* rounded down addr */
2215         caddr_t initraddr;      /* saved initial rounded down addr */
2216         size_t rsize;           /* rounded up size */
2217         size_t initrsize;       /* saved initial rounded up size */
2218         size_t ssize;           /* size of seg */
2219         int error = 0;                  /* result */
2220         size_t mlock_size;      /* size of bitmap */
2221         ulong_t *mlock_map;     /* pointer to bitmap used */
2222                                 /* to represent the locked */
2223                                 /* pages. */
2224 retry:
2225         if (error == IE_RETRY)
2226                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2227         else
2228                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2229 
2230         /*
2231          * If these are address space lock/unlock operations, loop over
2232          * all segments in the address space, as appropriate.
2233          */
2234         if (func == MC_LOCKAS) {
2235                 size_t npages, idx;
2236                 size_t rlen = 0;        /* rounded as length */
2237 
2238                 idx = pos;
2239 
2240                 if (arg & MCL_FUTURE) {
2241                         mutex_enter(&as->a_contents);
2242                         AS_SETPGLCK(as);
2243                         mutex_exit(&as->a_contents);
2244                 }
2245                 if ((arg & MCL_CURRENT) == 0) {
2246                         AS_LOCK_EXIT(as, &as->a_lock);
2247                         return (0);
2248                 }
2249 
2250                 seg = AS_SEGFIRST(as);
2251                 if (seg == NULL) {
2252                         AS_LOCK_EXIT(as, &as->a_lock);
2253                         return (0);
2254                 }
2255 
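                     /*
                      * Compute the page-rounded length of all segments so we
                      * can size the bitmap that records which pages were
                      * successfully locked (needed to undo a partial lock on
                      * failure).
                      */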
2256                 do {
2257                         raddr = (caddr_t)((uintptr_t)seg->s_base &
2258                             (uintptr_t)PAGEMASK);
2259                         rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2260                             PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2261                 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2262 
2263                 mlock_size = BT_BITOUL(btopr(rlen));
2264                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2265                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2266                                 AS_LOCK_EXIT(as, &as->a_lock);
2267                                 return (EAGAIN);
2268                 }
2269 
2270                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2271                         error = SEGOP_LOCKOP(seg, seg->s_base,
2272                             seg->s_size, attr, MC_LOCK, mlock_map, pos);
2273                         if (error != 0)
2274                                 break;
2275                         pos += seg_pages(seg);
2276                 }
2277 
2278                 if (error) {
2279                         for (seg = AS_SEGFIRST(as); seg != NULL;
2280                             seg = AS_SEGNEXT(as, seg)) {
2281 
2282                                 raddr = (caddr_t)((uintptr_t)seg->s_base &
2283                                     (uintptr_t)PAGEMASK);
2284                                 npages = seg_pages(seg);
2285                                 as_segunlock(seg, raddr, attr, mlock_map,
2286                                     idx, npages);
2287                                 idx += npages;
2288                         }
2289                 }
2290 
2291                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2292                 AS_LOCK_EXIT(as, &as->a_lock);
2293                 goto lockerr;
2294         } else if (func == MC_UNLOCKAS) {
2295                 mutex_enter(&as->a_contents);
2296                 AS_CLRPGLCK(as);
2297                 mutex_exit(&as->a_contents);
2298 
2299                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2300                         error = SEGOP_LOCKOP(seg, seg->s_base,
2301                             seg->s_size, attr, MC_UNLOCK, NULL, 0);
2302                         if (error != 0)
2303                                 break;
2304                 }
2305 
2306                 AS_LOCK_EXIT(as, &as->a_lock);
2307                 goto lockerr;
2308         }
2309 
2310         /*
2311          * Normalize addresses and sizes.
2312          */
2313         initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2314         initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2315             (size_t)raddr;
2316 
2317         if (raddr + rsize < raddr) {         /* check for wraparound */
2318                 AS_LOCK_EXIT(as, &as->a_lock);
2319                 return (ENOMEM);
2320         }
2321 
2322         /*
2323          * Get initial segment.
2324          */
2325         if ((seg = as_segat(as, raddr)) == NULL) {
2326                 AS_LOCK_EXIT(as, &as->a_lock);
2327                 return (ENOMEM);
2328         }
2329 
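             /*
              * Allocate a bitmap tracking which pages get locked so that a
              * partially completed lock can be undone via as_unlockerr().
              */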
2330         if (func == MC_LOCK) {
2331                 mlock_size = BT_BITOUL(btopr(rsize));
2332                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2333                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2334                                 AS_LOCK_EXIT(as, &as->a_lock);
2335                                 return (EAGAIN);
2336                 }
2337         }
2338 
2339         /*
2340          * Loop over all segments.  If a hole in the address range is
2341          * discovered, then fail.  For each segment, perform the appropriate
2342          * control operation.
2343          */
2344         while (rsize != 0) {
2345 
2346                 /*
2347                  * Make sure there's no hole, calculate the portion
2348                  * of the next segment to be operated over.
2349                  */
2350                 if (raddr >= seg->s_base + seg->s_size) {
2351                         seg = AS_SEGNEXT(as, seg);
2352                         if (seg == NULL || raddr != seg->s_base) {
2353                                 if (func == MC_LOCK) {
2354                                         as_unlockerr(as, attr, mlock_map,
2355                                             initraddr, initrsize - rsize);
2356                                         kmem_free(mlock_map,
2357                                             mlock_size * sizeof (ulong_t));
2358                                 }
2359                                 AS_LOCK_EXIT(as, &as->a_lock);
2360                                 return (ENOMEM);
2361                         }
2362                 }
2363                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2364                         ssize = seg->s_base + seg->s_size - raddr;
2365                 else
2366                         ssize = rsize;
2367 
2368                 /*
2369                  * Dispatch on specific function.
2370                  */
2371                 switch (func) {
2372 
2373                 /*
2374                  * Synchronize cached data from mappings with backing
2375                  * objects.
2376                  */
2377                 case MC_SYNC:
2378                         if (error = SEGOP_SYNC(seg, raddr, ssize,
2379                             attr, (uint_t)arg)) {
2380                                 AS_LOCK_EXIT(as, &as->a_lock);
2381                                 return (error);
2382                         }
2383                         break;
2384 
2385                 /*
2386                  * Lock pages in memory.
2387                  */
2388                 case MC_LOCK:
2389                         if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2390                             attr, func, mlock_map, pos)) {
2391                                 as_unlockerr(as, attr, mlock_map, initraddr,
2392                                     initrsize - rsize + ssize);
2393                                 kmem_free(mlock_map, mlock_size *
2394                                     sizeof (ulong_t));
2395                                 AS_LOCK_EXIT(as, &as->a_lock);
2396                                 goto lockerr;
2397                         }
2398                         break;
2399 
2400                 /*
2401                  * Unlock mapped pages.
2402                  */
2403                 case MC_UNLOCK:
2404                         (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2405                             (ulong_t *)NULL, (size_t)NULL);
2406                         break;
2407 
2408                 /*
2409                  * Store VM advise for mapped pages in segment layer.
2410                  */
2411                 case MC_ADVISE:
2412                         error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2413 
2414                         /*
2415                          * Check for regular errors and special retry error
2416                          */
2417                         if (error) {
2418                                 if (error == IE_RETRY) {
2419                                         /*
2420                                          * Need to acquire writers lock, so
2421                                          * have to drop readers lock and start
2422                                          * all over again
2423                                          */
2424                                         AS_LOCK_EXIT(as, &as->a_lock);
2425                                         goto retry;
2426                                 } else if (error == IE_REATTACH) {
2427                                         /*
2428                                          * Find segment for current address
2429                                          * because current segment just got
2430                                          * split or concatenated
2431                                          */
2432                                         seg = as_segat(as, raddr);
2433                                         if (seg == NULL) {
2434                                                 AS_LOCK_EXIT(as, &as->a_lock);
2435                                                 return (ENOMEM);
2436                                         }
2437                                 } else {
2438                                         /*
2439                                          * Regular error
2440                                          */
2441                                         AS_LOCK_EXIT(as, &as->a_lock);
2442                                         return (error);
2443                                 }
2444                         }
2445                         break;
2446 
2447                 case MC_INHERIT_ZERO:
2448                         if (seg->s_ops->inherit == NULL) {
2449                                 error = ENOTSUP;
2450                         } else {
2451                                 error = SEGOP_INHERIT(seg, raddr, ssize,
2452                                     SEGP_INH_ZERO);
2453                         }
2454                         if (error != 0) {
2455                                 AS_LOCK_EXIT(as, &as->a_lock);
2456                                 return (error);
2457                         }
2458                         break;
2459 
2460                 /*
2461                  * Can't happen.
2462                  */
2463                 default:
2464                         panic("as_ctl: bad operation %d", func);
2465                         /*NOTREACHED*/
2466                 }
2467 
2468                 rsize -= ssize;
2469                 raddr += ssize;
2470         }
2471 
2472         if (func == MC_LOCK)
2473                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2474         AS_LOCK_EXIT(as, &as->a_lock);
2475         return (0);
2476 lockerr:
2477 
2478         /*
2479          * If the lower levels returned EDEADLK for a segment lockop,
2480          * it means that we should retry the operation.  Let's wait
2481          * a bit also to let the deadlock causing condition clear.
2482          * This is part of a gross hack to work around a design flaw
2483          * in the ufs/sds logging code and should go away when the
2484          * logging code is re-designed to fix the problem. See bug
2485          * 4125102 for details of the problem.
2486          */
2487         if (error == EDEADLK) {
2488                 delay(deadlk_wait);
2489                 error = 0;
2490                 goto retry;
2491         }
2492         return (error);
2493 }
2494 
2495 int
2496 fc_decode(faultcode_t fault_err)
2497 {
2498         int error = 0;
2499 
2500         switch (FC_CODE(fault_err)) {
2501         case FC_OBJERR:
2502                 error = FC_ERRNO(fault_err);
2503                 break;
2504         case FC_PROT:
2505                 error = EACCES;
2506                 break;
2507         default:
2508                 error = EFAULT;
2509                 break;
2510         }
2511         return (error);
2512 }
2513 
2514 /*
2515  * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
2516  * lists from each segment and copy them to one contiguous shadow list (plist)
2517  * as expected by the caller.  Save pointers to per segment shadow lists at
2518  * the tail of plist so that they can be used during as_pageunlock().
2519  */
2520 static int
2521 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2522     caddr_t addr, size_t size, enum seg_rw rw)
2523 {
2524         caddr_t sv_addr = addr;
2525         size_t sv_size = size;
2526         struct seg *sv_seg = seg;
2527         ulong_t segcnt = 1;
2528         ulong_t cnt;
2529         size_t ssize;
2530         pgcnt_t npages = btop(size);
2531         page_t **plist;
2532         page_t **pl;
2533         int error;
2534         caddr_t eaddr;
2535         faultcode_t fault_err = 0;
2536         pgcnt_t pl_off;
2537         extern struct seg_ops segspt_shmops;
2538 
2539         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2540         ASSERT(seg != NULL);
2541         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2542         ASSERT(addr + size > seg->s_base + seg->s_size);
2543         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2544         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2545 
2546         /*
2547          * Count the number of segments covered by the range we are about to
2548          * lock. The segment count is used to size the shadow list we return
2549          * back to the caller.
2550          */
2551         for (; size != 0; size -= ssize, addr += ssize) {
2552                 if (addr >= seg->s_base + seg->s_size) {
2553 
2554                         seg = AS_SEGNEXT(as, seg);
2555                         if (seg == NULL || addr != seg->s_base) {
2556                                 AS_LOCK_EXIT(as, &as->a_lock);
2557                                 return (EFAULT);
2558                         }
2559                         /*
2560                          * Do a quick check whether subsequent segments
2561                          * are likely to support pagelock.
2562                          */
2563                         if (seg->s_ops == &segvn_ops) {
2564                                 vnode_t *vp;
2565 
2566                                 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2567                                     vp != NULL) {
2568                                         AS_LOCK_EXIT(as, &as->a_lock);
2569                                         goto slow;
2570                                 }
2571                         } else if (seg->s_ops != &segspt_shmops) {
2572                                 AS_LOCK_EXIT(as, &as->a_lock);
2573                                 goto slow;
2574                         }
2575                         segcnt++;
2576                 }
2577                 if (addr + size > seg->s_base + seg->s_size) {
2578                         ssize = seg->s_base + seg->s_size - addr;
2579                 } else {
2580                         ssize = size;
2581                 }
2582         }
2583         ASSERT(segcnt > 1);
2584 
2585         plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2586 
2587         addr = sv_addr;
2588         size = sv_size;
2589         seg = sv_seg;
2590 
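             /*
              * Second pass: lock each segment's portion.  plist[0 .. npages - 1]
              * accumulates the combined shadow list returned to the caller;
              * plist[npages .. npages + segcnt - 1] saves each segment's own
              * shadow list pointer for use by as_pageunlock().
              */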
2591         for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2592                 if (addr >= seg->s_base + seg->s_size) {
2593                         seg = AS_SEGNEXT(as, seg);
2594                         ASSERT(seg != NULL && addr == seg->s_base);
2595                         cnt++;
2596                         ASSERT(cnt < segcnt);
2597                 }
2598                 if (addr + size > seg->s_base + seg->s_size) {
2599                         ssize = seg->s_base + seg->s_size - addr;
2600                 } else {
2601                         ssize = size;
2602                 }
2603                 pl = &plist[npages + cnt];
2604                 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2605                     L_PAGELOCK, rw);
2606                 if (error) {
2607                         break;
2608                 }
2609                 ASSERT(plist[npages + cnt] != NULL);
2610                 ASSERT(pl_off + btop(ssize) <= npages);
2611                 bcopy(plist[npages + cnt], &plist[pl_off],
2612                     btop(ssize) * sizeof (page_t *));
2613                 pl_off += btop(ssize);
2614         }
2615 
2616         if (size == 0) {
2617                 AS_LOCK_EXIT(as, &as->a_lock);
2618                 ASSERT(cnt == segcnt - 1);
2619                 *ppp = plist;
2620                 return (0);
2621         }
2622 
2623         /*
2624          * One of the pagelock calls failed; the error code is in "error".
2625          * Unlock what we've locked so far and retry with F_SOFTLOCK if the
2626          * error is either EFAULT or ENOTSUP. Otherwise just return the error
2627          * back to the caller.
2628          */
2629 
2630         eaddr = addr;
2631         seg = sv_seg;
2632 
2633         for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2634                 if (addr >= seg->s_base + seg->s_size) {
2635                         seg = AS_SEGNEXT(as, seg);
2636                         ASSERT(seg != NULL && addr == seg->s_base);
2637                         cnt++;
2638                         ASSERT(cnt < segcnt);
2639                 }
2640                 if (eaddr > seg->s_base + seg->s_size) {
2641                         ssize = seg->s_base + seg->s_size - addr;
2642                 } else {
2643                         ssize = eaddr - addr;
2644                 }
2645                 pl = &plist[npages + cnt];
2646                 ASSERT(*pl != NULL);
2647                 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2648                     L_PAGEUNLOCK, rw);
2649         }
2650 
2651         AS_LOCK_EXIT(as, &as->a_lock);
2652 
2653         kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2654 
2655         if (error != ENOTSUP && error != EFAULT) {
2656                 return (error);
2657         }
2658 
2659 slow:
2660         /*
2661          * If we are here because pagelock failed due to the need to cow-fault
2662          * in the pages we want to lock, F_SOFTLOCK will do this job, and in
2663          * the next as_pagelock() call for this address range pagelock will
2664          * hopefully succeed.
2665          */
2666         fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2667         if (fault_err != 0) {
2668                 return (fc_decode(fault_err));
2669         }
2670         *ppp = NULL;
2671 
2672         return (0);
2673 }
2674 
2675 /*
2676  * lock pages in a given address space. Return shadow list. If
2677  * the list is NULL, the MMU mapping is also locked.
2678  */
2679 int
2680 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2681     size_t size, enum seg_rw rw)
2682 {
2683         size_t rsize;
2684         caddr_t raddr;
2685         faultcode_t fault_err;
2686         struct seg *seg;
2687         int err;
2688 
2689         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2690             "as_pagelock_start: addr %p size %ld", addr, size);
2691 
2692         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2693         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2694             (size_t)raddr;
2695 
2696         /*
2697          * If the request crosses a segment boundary, let
2698          * as_pagelock_segs() handle it.
2699          */
2700         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2701 
2702         seg = as_segat(as, raddr);
2703         if (seg == NULL) {
2704                 AS_LOCK_EXIT(as, &as->a_lock);
2705                 return (EFAULT);
2706         }
2707         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2708         if (raddr + rsize > seg->s_base + seg->s_size) {
2709                 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2710         }
2711         if (raddr + rsize <= raddr) {
2712                 AS_LOCK_EXIT(as, &as->a_lock);
2713                 return (EFAULT);
2714         }
2715 
2716         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2717             "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2718 
2719         /*
2720          * try to lock pages and pass back shadow list
2721          */
2722         err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2723 
2724         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2725 
2726         AS_LOCK_EXIT(as, &as->a_lock);
2727 
2728         if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2729                 return (err);
2730         }
2731 
2732         /*
2733          * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2734          * to no pagelock support for this segment or because the pages need to
2735          * be cow-faulted in. If a fault is needed, F_SOFTLOCK will do this job
2736          * for this as_pagelock() call, and in the next as_pagelock() call for
2737          * the same address range the pagelock call will hopefully succeed.
2738          */
2739         fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2740         if (fault_err != 0) {
2741                 return (fc_decode(fault_err));
2742         }
2743         *ppp = NULL;
2744 
2745         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2746         return (0);
2747 }
2748 
2749 /*
2750  * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
2751  * lists from the end of plist and call pageunlock interface for each segment.
2752  * Drop as lock and free plist.
2753  */
2754 static void
2755 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2756     struct page **plist, enum seg_rw rw)
2757 {
2758         ulong_t cnt;
2759         caddr_t eaddr = addr + size;
2760         pgcnt_t npages = btop(size);
2761         size_t ssize;
2762         page_t **pl;
2763 
2764         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2765         ASSERT(seg != NULL);
2766         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2767         ASSERT(addr + size > seg->s_base + seg->s_size);
2768         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2769         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2770         ASSERT(plist != NULL);
2771 
2772         for (cnt = 0; addr < eaddr; addr += ssize) {
2773                 if (addr >= seg->s_base + seg->s_size) {
2774                         seg = AS_SEGNEXT(as, seg);
2775                         ASSERT(seg != NULL && addr == seg->s_base);
2776                         cnt++;
2777                 }
2778                 if (eaddr > seg->s_base + seg->s_size) {
2779                         ssize = seg->s_base + seg->s_size - addr;
2780                 } else {
2781                         ssize = eaddr - addr;
2782                 }
2783                 pl = &plist[npages + cnt];
2784                 ASSERT(*pl != NULL);
2785                 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2786                     L_PAGEUNLOCK, rw);
2787         }
2788         ASSERT(cnt > 0);
2789         AS_LOCK_EXIT(as, &as->a_lock);
2790 
2791         cnt++;
2792         kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2793 }
2794 
2795 /*
2796  * unlock pages in a given address range
2797  */
2798 void
2799 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2800     enum seg_rw rw)
2801 {
2802         struct seg *seg;
2803         size_t rsize;
2804         caddr_t raddr;
2805 
2806         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2807             "as_pageunlock_start: addr %p size %ld", addr, size);
2808 
2809         /*
2810          * if the shadow list is NULL, as_pagelock
2811          * fell back to as_fault; undo with F_SOFTUNLOCK
2812          */
2813         if (pp == NULL) {
2814                 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2815                 return;
2816         }
2817 
2818         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2819         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2820             (size_t)raddr;
2821 
2822         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2823         seg = as_segat(as, raddr);
2824         ASSERT(seg != NULL);
2825 
2826         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2827             "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2828 
2829         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2830         if (raddr + rsize <= seg->s_base + seg->s_size) {
2831                 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2832         } else {
2833                 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2834                 return;
2835         }
2836         AS_LOCK_EXIT(as, &as->a_lock);
2837         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2838 }
2839 
2840 int
2841 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2842     boolean_t wait)
2843 {
2844         struct seg *seg;
2845         size_t ssize;
2846         caddr_t raddr;                  /* rounded down addr */
2847         size_t rsize;                   /* rounded up size */
2848         int error = 0;
2849         size_t pgsz = page_get_pagesize(szc);
2850 
2851 setpgsz_top:
2852         if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2853                 return (EINVAL);
2854         }
2855 
2856         raddr = addr;
2857         rsize = size;
2858 
2859         if (raddr + rsize < raddr)           /* check for wraparound */
2860                 return (ENOMEM);
2861 
2862         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2863         as_clearwatchprot(as, raddr, rsize);
2864         seg = as_segat(as, raddr);
2865         if (seg == NULL) {
2866                 as_setwatch(as);
2867                 AS_LOCK_EXIT(as, &as->a_lock);
2868                 return (ENOMEM);
2869         }
2870 
2871         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2872                 if (raddr >= seg->s_base + seg->s_size) {
2873                         seg = AS_SEGNEXT(as, seg);
2874                         if (seg == NULL || raddr != seg->s_base) {
2875                                 error = ENOMEM;
2876                                 break;
2877                         }
2878                 }
2879                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2880                         ssize = seg->s_base + seg->s_size - raddr;
2881                 } else {
2882                         ssize = rsize;
2883                 }
2884 
2885 retry:
2886                 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2887 
2888                 if (error == IE_NOMEM) {
2889                         error = EAGAIN;
2890                         break;
2891                 }
2892 
2893                 if (error == IE_RETRY) {
2894                         AS_LOCK_EXIT(as, &as->a_lock);
2895                         goto setpgsz_top;
2896                 }
2897 
2898                 if (error == ENOTSUP) {
2899                         error = EINVAL;
2900                         break;
2901                 }
2902 
2903                 if (wait && (error == EAGAIN)) {
2904                         /*
2905                          * Memory is currently locked.  It must be unlocked
2906                          * before this operation can succeed through a retry.
2907                          * The possible reasons for locked memory and
2908                          * corresponding strategies for unlocking are:
2909                          * (1) Normal I/O
2910                          *      wait for a signal that the I/O operation
2911                          *      has completed and the memory is unlocked.
2912                          * (2) Asynchronous I/O
2913                          *      The aio subsystem does not unlock pages when
2914                          *      the I/O is completed. Those pages are unlocked
2915                          *      when the application calls aiowait/aioerror.
2916                          *      So, to prevent blocking forever, cv_broadcast()
2917                          *      is done to wake up aio_cleanup_thread.
2918                          *      Subsequently, segvn_reclaim will be called, and
2919                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
2920                          * (3) Long term page locking:
2921                          *      This is not relevant for as_setpagesize()
2922                          *      because we cannot change the page size for
2923                          *      driver memory. The attempt to do so will
2924                          *      fail with a different error than EAGAIN so
2925                          *      there's no need to trigger as callbacks like
2926                          *      as_unmap, as_setprot or as_free would do.
2927                          */
2928                         mutex_enter(&as->a_contents);
2929                         if (!AS_ISNOUNMAPWAIT(as)) {
2930                                 if (AS_ISUNMAPWAIT(as) == 0) {
2931                                         cv_broadcast(&as->a_cv);
2932                                 }
2933                                 AS_SETUNMAPWAIT(as);
2934                                 AS_LOCK_EXIT(as, &as->a_lock);
2935                                 while (AS_ISUNMAPWAIT(as)) {
2936                                         cv_wait(&as->a_cv, &as->a_contents);
2937                                 }
2938                         } else {
2939                                 /*
2940                                  * We may have raced with
2941                                  * segvn_reclaim()/segspt_reclaim(). In this
2942                                  * case clear the nounmapwait flag and retry since
2943                                  * softlockcnt in this segment may already be
2944                                  * 0.  We don't drop the as writer lock so our
2945                                  * number of retries without sleeping should
2946                                  * be very small. See segvn_reclaim() for
2947                                  * more comments.
2948                                  */
2949                                 AS_CLRNOUNMAPWAIT(as);
2950                                 mutex_exit(&as->a_contents);
2951                                 goto retry;
2952                         }
2953                         mutex_exit(&as->a_contents);
2954                         goto setpgsz_top;
2955                 } else if (error != 0) {
2956                         break;
2957                 }
2958         }
2959         as_setwatch(as);
2960         AS_LOCK_EXIT(as, &as->a_lock);
2961         return (error);
2962 }
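
/*
 * A minimal sketch (not compiled, wrapper name hypothetical) of the
 * alignment contract above: both addr and size must be aligned to
 * page_get_pagesize(szc), otherwise as_setpagesize() returns EINVAL.
 */
#if 0
static int
example_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc)
{
        size_t pgsz = page_get_pagesize(szc);

        /* as_setpagesize() rejects unaligned requests with EINVAL */
        if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz))
                return (EINVAL);

        /* wait == B_TRUE: sleep and retry if the memory is locked (EAGAIN) */
        return (as_setpagesize(as, addr, size, szc, B_TRUE));
}
#endif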
2963 
2964 /*
2965  * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2966  * in its chunk where s_szc is less than the szc we want to set.
2967  */
2968 static int
2969 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2970     int *retry)
2971 {
2972         struct seg *seg;
2973         size_t ssize;
2974         int error;
2975 
2976         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
2977 
2978         seg = as_segat(as, raddr);
2979         if (seg == NULL) {
2980                 panic("as_iset3_default_lpsize: no seg");
2981         }
2982 
2983         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2984                 if (raddr >= seg->s_base + seg->s_size) {
2985                         seg = AS_SEGNEXT(as, seg);
2986                         if (seg == NULL || raddr != seg->s_base) {
2987                                 panic("as_iset3_default_lpsize: as changed");
2988                         }
2989                 }
2990                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2991                         ssize = seg->s_base + seg->s_size - raddr;
2992                 } else {
2993                         ssize = rsize;
2994                 }
2995 
2996                 if (szc > seg->s_szc) {
2997                         error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2998                         /* Only retry on EINVAL segments that have no vnode. */
2999                         if (error == EINVAL) {
3000                                 vnode_t *vp = NULL;
3001                                 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3002                                     (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3003                                     vp == NULL)) {
3004                                         *retry = 1;
3005                                 } else {
3006                                         *retry = 0;
3007                                 }
3008                         }
3009                         if (error) {
3010                                 return (error);
3011                         }
3012                 }
3013         }
3014         return (0);
3015 }
3016 
3017 /*
3018  * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3019  * pagesize on each segment in its range, but if any fails with EINVAL,
3020  * then it reduces the pagesizes to the next size in the bitmap and
3021  * retries as_iset3_default_lpsize(). The code retries smaller allowed
3022  * sizes on EINVAL because (a) the anon offset may not match the bigger
3023  * sizes, and (b) it's hard to get this offset (to begin with) to pass
3024  * to map_pgszcvec().
3025  */
3026 static int
3027 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3028     uint_t szcvec)
3029 {
3030         int error;
3031         int retry;
3032 
3033         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3034 
3035         for (;;) {
3036                 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3037                 if (error == EINVAL && retry) {
3038                         szcvec &= ~(1 << szc);
3039                         if (szcvec <= 1) {
3040                                 return (EINVAL);
3041                         }
3042                         szc = highbit(szcvec) - 1;
3043                 } else {
3044                         return (error);
3045                 }
3046         }
3047 }
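
/*
 * Worked example of the retry loop above (illustration only): with
 * szcvec = 0x16 (size codes 1, 2 and 4 allowed) and szc = 4, a retryable
 * EINVAL clears bit 4 and leaves 0x06, so the next pass uses
 * szc = highbit(0x06) - 1 = 2; another retryable EINVAL leaves 0x02 and
 * szc = 1; once szcvec drops to <= 1 the loop gives up with EINVAL.
 */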
3048 
3049 /*
3050  * as_iset1_default_lpsize() breaks its chunk into areas where existing
3051  * segments have a smaller szc than we want to set. For each such area,
3052  * it calls as_iset2_default_lpsize().
3053  */
3054 static int
3055 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3056     uint_t szcvec)
3057 {
3058         struct seg *seg;
3059         size_t ssize;
3060         caddr_t setaddr = raddr;
3061         size_t setsize = 0;
3062         int set;
3063         int error;
3064 
3065         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3066 
3067         seg = as_segat(as, raddr);
3068         if (seg == NULL) {
3069                 panic("as_iset1_default_lpsize: no seg");
3070         }
3071         if (seg->s_szc < szc) {
3072                 set = 1;
3073         } else {
3074                 set = 0;
3075         }
3076 
3077         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3078                 if (raddr >= seg->s_base + seg->s_size) {
3079                         seg = AS_SEGNEXT(as, seg);
3080                         if (seg == NULL || raddr != seg->s_base) {
3081                                 panic("as_iset1_default_lpsize: as changed");
3082                         }
3083                         if (seg->s_szc >= szc && set) {
3084                                 ASSERT(setsize != 0);
3085                                 error = as_iset2_default_lpsize(as,
3086                                     setaddr, setsize, szc, szcvec);
3087                                 if (error) {
3088                                         return (error);
3089                                 }
3090                                 set = 0;
3091                         } else if (seg->s_szc < szc && !set) {
3092                                 setaddr = raddr;
3093                                 setsize = 0;
3094                                 set = 1;
3095                         }
3096                 }
3097                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3098                         ssize = seg->s_base + seg->s_size - raddr;
3099                 } else {
3100                         ssize = rsize;
3101                 }
3102         }
3103         error = 0;
3104         if (set) {
3105                 ASSERT(setsize != 0);
3106                 error = as_iset2_default_lpsize(as, setaddr, setsize,
3107                     szc, szcvec);
3108         }
3109         return (error);
3110 }
3111 
3112 /*
3113  * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3114  * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3115  * chunk to as_iset1_default_lpsize().
3116  */
3117 static int
3118 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3119     int type)
3120 {
3121         int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3122         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3123             flags, rtype, 1);
3124         uint_t szc;
3125         uint_t nszc;
3126         int error;
3127         caddr_t a;
3128         caddr_t eaddr;
3129         size_t segsize;
3130         size_t pgsz;
3131         uint_t save_szcvec;
3132 
3133         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3134         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3135         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3136 
3137         szcvec &= ~1;
3138         if (szcvec <= 1) {   /* skip if base page size */
3139                 return (0);
3140         }
3141 
3142         /* Get the pagesize of the first larger page size. */
3143         szc = lowbit(szcvec) - 1;
3144         pgsz = page_get_pagesize(szc);
3145         eaddr = addr + size;
3146         addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3147         eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3148 
3149         save_szcvec = szcvec;
3150         szcvec >>= (szc + 1);
3151         nszc = szc;
3152         while (szcvec) {
3153                 if ((szcvec & 0x1) == 0) {
3154                         nszc++;
3155                         szcvec >>= 1;
3156                         continue;
3157                 }
3158                 nszc++;
3159                 pgsz = page_get_pagesize(nszc);
3160                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3161                 if (a != addr) {
3162                         ASSERT(szc > 0);
3163                         ASSERT(a < eaddr);
3164                         segsize = a - addr;
3165                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3166                             save_szcvec);
3167                         if (error) {
3168                                 return (error);
3169                         }
3170                         addr = a;
3171                 }
3172                 szc = nszc;
3173                 szcvec >>= 1;
3174         }
3175 
3176         ASSERT(addr < eaddr);
3177         szcvec = save_szcvec;
3178         while (szcvec) {
3179                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3180                 ASSERT(a >= addr);
3181                 if (a != addr) {
3182                         ASSERT(szc > 0);
3183                         segsize = a - addr;
3184                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3185                             save_szcvec);
3186                         if (error) {
3187                                 return (error);
3188                         }
3189                         addr = a;
3190                 }
3191                 szcvec &= ~(1 << szc);
3192                 if (szcvec) {
3193                         szc = highbit(szcvec) - 1;
3194                         pgsz = page_get_pagesize(szc);
3195                 }
3196         }
3197         ASSERT(addr == eaddr);
3198 
3199         return (0);
3200 }
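
/*
 * Illustrative walk-through of the splitting above: suppose szcvec allows
 * two large size codes, a smaller s and a larger l.  addr is rounded up and
 * eaddr rounded down to the s-sized boundary.  The first loop then assigns
 * code s to the sub-range from addr up to the first l-sized boundary, and
 * the second loop works backwards from eaddr, so the l-aligned middle gets
 * code l and the s-aligned tail gets code s.
 */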
3201 
3202 /*
3203  * Set the default large page size for the range. Called via memcntl with
3204  * page size set to 0. as_set_default_lpsize breaks the range down into
3205  * chunks with the same type/flags, ignores non-segvn segments, and passes
3206  * each chunk to as_iset_default_lpsize().
3207  */
3208 int
3209 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3210 {
3211         struct seg *seg;
3212         caddr_t raddr;
3213         size_t rsize;
3214         size_t ssize;
3215         int rtype, rflags;
3216         int stype, sflags;
3217         int error;
3218         caddr_t setaddr;
3219         size_t setsize;
3220         int segvn;
3221 
3222         if (size == 0)
3223                 return (0);
3224 
3225         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3226 again:
3227         error = 0;
3228 
3229         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3230         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3231             (size_t)raddr;
3232 
3233         if (raddr + rsize < raddr) {         /* check for wraparound */
3234                 AS_LOCK_EXIT(as, &as->a_lock);
3235                 return (ENOMEM);
3236         }
3237         as_clearwatchprot(as, raddr, rsize);
3238         seg = as_segat(as, raddr);
3239         if (seg == NULL) {
3240                 as_setwatch(as);
3241                 AS_LOCK_EXIT(as, &as->a_lock);
3242                 return (ENOMEM);
3243         }
3244         if (seg->s_ops == &segvn_ops) {
3245                 rtype = SEGOP_GETTYPE(seg, addr);
3246                 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3247                 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3248                 segvn = 1;
3249         } else {
3250                 segvn = 0;
3251         }
3252         setaddr = raddr;
3253         setsize = 0;
3254 
3255         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3256                 if (raddr >= (seg->s_base + seg->s_size)) {
3257                         seg = AS_SEGNEXT(as, seg);
3258                         if (seg == NULL || raddr != seg->s_base) {
3259                                 error = ENOMEM;
3260                                 break;
3261                         }
3262                         if (seg->s_ops == &segvn_ops) {
3263                                 stype = SEGOP_GETTYPE(seg, raddr);
3264                                 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3265                                 stype &= (MAP_SHARED | MAP_PRIVATE);
3266                                 if (segvn && (rflags != sflags ||
3267                                     rtype != stype)) {
3268                                         /*
3269                                          * The next segment is also segvn but
3270                                          * has different flags and/or type.
3271                                          */
3272                                         ASSERT(setsize != 0);
3273                                         error = as_iset_default_lpsize(as,
3274                                             setaddr, setsize, rflags, rtype);
3275                                         if (error) {
3276                                                 break;
3277                                         }
3278                                         rflags = sflags;
3279                                         rtype = stype;
3280                                         setaddr = raddr;
3281                                         setsize = 0;
3282                                 } else if (!segvn) {
3283                                         rflags = sflags;
3284                                         rtype = stype;
3285                                         setaddr = raddr;
3286                                         setsize = 0;
3287                                         segvn = 1;
3288                                 }
3289                         } else if (segvn) {
3290                                 /* The next segment is not segvn. */
3291                                 ASSERT(setsize != 0);
3292                                 error = as_iset_default_lpsize(as,
3293                                     setaddr, setsize, rflags, rtype);
3294                                 if (error) {
3295                                         break;
3296                                 }
3297                                 segvn = 0;
3298                         }
3299                 }
3300                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3301                         ssize = seg->s_base + seg->s_size - raddr;
3302                 } else {
3303                         ssize = rsize;
3304                 }
3305         }
3306         if (error == 0 && segvn) {
3307                 /* The last chunk when rsize == 0. */
3308                 ASSERT(setsize != 0);
3309                 error = as_iset_default_lpsize(as, setaddr, setsize,
3310                     rflags, rtype);
3311         }
3312 
3313         if (error == IE_RETRY) {
3314                 goto again;
3315         } else if (error == IE_NOMEM) {
3316                 error = EAGAIN;
3317         } else if (error == ENOTSUP) {
3318                 error = EINVAL;
3319         } else if (error == EAGAIN) {
3320                 mutex_enter(&as->a_contents);
3321                 if (!AS_ISNOUNMAPWAIT(as)) {
3322                         if (AS_ISUNMAPWAIT(as) == 0) {
3323                                 cv_broadcast(&as->a_cv);
3324                         }
3325                         AS_SETUNMAPWAIT(as);
3326                         AS_LOCK_EXIT(as, &as->a_lock);
3327                         while (AS_ISUNMAPWAIT(as)) {
3328                                 cv_wait(&as->a_cv, &as->a_contents);
3329                         }
3330                         mutex_exit(&as->a_contents);
3331                         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3332                 } else {
3333                         /*
3334                          * We may have raced with
3335                          * segvn_reclaim()/segspt_reclaim(). In this case
3336                          * clear the nounmapwait flag and retry since softlockcnt
3337                          * in this segment may already be 0.  We don't drop the as
3338                          * writer lock so our number of retries without
3339                          * sleeping should be very small. See segvn_reclaim()
3340                          * for more comments.
3341                          */
3342                         AS_CLRNOUNMAPWAIT(as);
3343                         mutex_exit(&as->a_contents);
3344                 }
3345                 goto again;
3346         }
3347 
3348         as_setwatch(as);
3349         AS_LOCK_EXIT(as, &as->a_lock);
3350         return (error);
3351 }
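
/*
 * A hedged userland sketch (not compiled here) of how this path is
 * typically reached, assuming the memcntl(2) MC_HAT_ADVISE interface:
 * passing mha_pagesize == 0 asks the kernel to choose default large page
 * sizes for the range.
 */
#if 0
        struct memcntl_mha mha;

        mha.mha_cmd = MHA_MAPSIZE_VA;
        mha.mha_flags = 0;
        mha.mha_pagesize = 0;           /* 0 => pick the default lpsize */
        (void) memcntl(addr, len, MC_HAT_ADVISE, (caddr_t)&mha, 0, 0);
#endif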
3352 
3353 /*
3354  * Set up all of the uninitialized watched pages that we can.
3355  */
3356 void
3357 as_setwatch(struct as *as)
3358 {
3359         struct watched_page *pwp;
3360         struct seg *seg;
3361         caddr_t vaddr;
3362         uint_t prot;
3363         int  err, retrycnt;
3364 
3365         if (avl_numnodes(&as->a_wpage) == 0)
3366                 return;
3367 
3368         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3369 
3370         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3371             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3372                 retrycnt = 0;
3373         retry:
3374                 vaddr = pwp->wp_vaddr;
3375                 if (pwp->wp_oprot != 0 ||    /* already set up */
3376                     (seg = as_segat(as, vaddr)) == NULL ||
3377                     SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3378                         continue;
3379 
3380                 pwp->wp_oprot = prot;
3381                 if (pwp->wp_read)
3382                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3383                 if (pwp->wp_write)
3384                         prot &= ~PROT_WRITE;
3385                 if (pwp->wp_exec)
3386                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3387                 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3388                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3389                         if (err == IE_RETRY) {
3390                                 pwp->wp_oprot = 0;
3391                                 ASSERT(retrycnt == 0);
3392                                 retrycnt++;
3393                                 goto retry;
3394                         }
3395                 }
3396                 pwp->wp_prot = prot;
3397         }
3398 }
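
/*
 * For illustration: a write watchpoint (wp_write != 0) on a page whose
 * current protections are PROT_READ|PROT_WRITE records
 * wp_oprot = PROT_READ|PROT_WRITE and installs wp_prot = PROT_READ via
 * SEGOP_SETPROT(), so the next write to the page faults and the watchpoint
 * can be reported.
 */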
3399 
3400 /*
3401  * Clear all of the watched pages in the address space.
3402  */
3403 void
3404 as_clearwatch(struct as *as)
3405 {
3406         struct watched_page *pwp;
3407         struct seg *seg;
3408         caddr_t vaddr;
3409         uint_t prot;
3410         int err, retrycnt;
3411 
3412         if (avl_numnodes(&as->a_wpage) == 0)
3413                 return;
3414 
3415         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3416 
3417         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3418             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3419                 retrycnt = 0;
3420         retry:
3421                 vaddr = pwp->wp_vaddr;
3422                 if (pwp->wp_oprot == 0 ||    /* not set up */
3423                     (seg = as_segat(as, vaddr)) == NULL)
3424                         continue;
3425 
3426                 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3427                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3428                         if (err == IE_RETRY) {
3429                                 ASSERT(retrycnt == 0);
3430                                 retrycnt++;
3431                                 goto retry;
3432                         }
3433                 }
3434                 pwp->wp_oprot = 0;
3435                 pwp->wp_prot = 0;
3436         }
3437 }
3438 
3439 /*
3440  * Force a new setup for all the watched pages in the range.
3441  */
3442 static void
3443 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3444 {
3445         struct watched_page *pwp;
3446         struct watched_page tpw;
3447         caddr_t eaddr = addr + size;
3448         caddr_t vaddr;
3449         struct seg *seg;
3450         int err, retrycnt;
3451         uint_t  wprot;
3452         avl_index_t where;
3453 
3454         if (avl_numnodes(&as->a_wpage) == 0)
3455                 return;
3456 
3457         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3458 
3459         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3460         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3461                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3462 
3463         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3464                 retrycnt = 0;
3465                 vaddr = pwp->wp_vaddr;
3466 
3467                 wprot = prot;
3468                 if (pwp->wp_read)
3469                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3470                 if (pwp->wp_write)
3471                         wprot &= ~PROT_WRITE;
3472                 if (pwp->wp_exec)
3473                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3474                 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3475                 retry:
3476                         seg = as_segat(as, vaddr);
3477                         if (seg == NULL) {
3478                                 panic("as_setwatchprot: no seg");
3479                                 /*NOTREACHED*/
3480                         }
3481                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3482                         if (err == IE_RETRY) {
3483                                 ASSERT(retrycnt == 0);
3484                                 retrycnt++;
3485                                 goto retry;
3486                         }
3487                 }
3488                 pwp->wp_oprot = prot;
3489                 pwp->wp_prot = wprot;
3490 
3491                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3492         }
3493 }
3494 
3495 /*
3496  * Clear all of the watched pages in the range.
3497  */
3498 static void
3499 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3500 {
3501         caddr_t eaddr = addr + size;
3502         struct watched_page *pwp;
3503         struct watched_page tpw;
3504         uint_t prot;
3505         struct seg *seg;
3506         int err, retrycnt;
3507         avl_index_t where;
3508 
3509         if (avl_numnodes(&as->a_wpage) == 0)
3510                 return;
3511 
3512         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3513         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3514                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3515 
3516         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3517 
3518         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3519 
3520                 if ((prot = pwp->wp_oprot) != 0) {
3521                         retrycnt = 0;
3522 
3523                         if (prot != pwp->wp_prot) {
3524                         retry:
3525                                 seg = as_segat(as, pwp->wp_vaddr);
3526                                 if (seg == NULL)
3527                                         continue;
3528                                 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3529                                     PAGESIZE, prot);
3530                                 if (err == IE_RETRY) {
3531                                         ASSERT(retrycnt == 0);
3532                                         retrycnt++;
3533                                         goto retry;
3534 
3535                                 }
3536                         }
3537                         pwp->wp_oprot = 0;
3538                         pwp->wp_prot = 0;
3539                 }
3540 
3541                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3542         }
3543 }
3544 
3545 void
3546 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3547 {
3548         struct proc *p;
3549 
3550         mutex_enter(&pidlock);
3551         for (p = practive; p; p = p->p_next) {
3552                 if (p->p_as == as) {
3553                         mutex_enter(&p->p_lock);
3554                         if (p->p_as == as)
3555                                 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3556                         mutex_exit(&p->p_lock);
3557                 }
3558         }
3559         mutex_exit(&pidlock);
3560 }
3561 
3562 /*
3563  * return memory object ID
3564  */
3565 int
3566 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3567 {
3568         struct seg      *seg;
3569         int             sts;
3570 
3571         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3572         seg = as_segat(as, addr);
3573         if (seg == NULL) {
3574                 AS_LOCK_EXIT(as, &as->a_lock);
3575                 return (EFAULT);
3576         }
3577         /*
3578          * catch old drivers which may not support getmemid
3579          */
3580         if (seg->s_ops->getmemid == NULL) {
3581                 AS_LOCK_EXIT(as, &as->a_lock);
3582                 return (ENODEV);
3583         }
3584 
3585         sts = SEGOP_GETMEMID(seg, addr, memidp);
3586 
3587         AS_LOCK_EXIT(as, &as->a_lock);
3588         return (sts);
3589 }