/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/hat.h>
#include <vm/xhat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/page.h>

clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);

/*
 * Verifying the segment lists is very time-consuming; it may not always be
 * desirable to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define VERIFY_SEGLIST
int do_as_verify = 0;
#endif

/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list. A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
                caddr_t vaddr, size_t size, int sleepflag)
{
        struct as_callback      *current_head, *cb;
        caddr_t                 saddr;
        size_t                  rsize;

        /* callback function and an event are mandatory */
        if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
                return (EINVAL);

        /* Adding a callback after as_free has been called is not allowed */
        if (as == &kas)
                return (ENOMEM);

        /*
         * vaddr = 0 and size = -1 is used to indicate that the callback range
         * is the entire address space so no rounding is done in that case.
         */
        if (size != -1) {
                saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
                rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
                    (size_t)saddr;
                /* check for wraparound */
                if (saddr + rsize < saddr)
                        return (ENOMEM);
        } else {
                if (vaddr != 0)
                        return (EINVAL);
                saddr = vaddr;
                rsize = size;
        }

        /* Allocate and initialize a callback entry */
        cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
        if (cb == NULL)
                return (EAGAIN);

        cb->ascb_func = cb_func;
        cb->ascb_arg = arg;
        cb->ascb_events = events;
        cb->ascb_saddr = saddr;
        cb->ascb_len = rsize;

        /* Add the entry to the list */
        mutex_enter(&as->a_contents);
        current_head = as->a_callbacks;
        as->a_callbacks = cb;
        cb->ascb_next = current_head;

        /*
         * The call to this function may lose a race with
         * a pertinent event - e.g. a thread does long term memory locking
         * but, before the callback is added, another thread executes as_unmap.
         * A broadcast here resolves that.
         */
        if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
                AS_CLRUNMAPWAIT(as);
                cv_broadcast(&as->a_cv);
        }

        mutex_exit(&as->a_contents);
        return (0);
}
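
/*
 * Usage sketch (illustrative only, not part of the kernel build): a driver
 * that has locked pages in `as' for the long term might register to be
 * notified of any event on the entire address space, passing its soft state
 * as the tag later handed to as_delete_callback().  mydrv_cb and mydrv_state
 * are hypothetical names:
 *
 *	if (as_add_callback(as, mydrv_cb, mydrv_state, AS_ALL_EVENT,
 *	    0, (size_t)-1, KM_SLEEP) != 0)
 *		return (EAGAIN);
 */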

/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *      AS_CALLBACK_DELETED  (callback entry found and deleted)
 *      AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 *      AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 *                      entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
        struct as_callback **prevcb = &as->a_callbacks;
        struct as_callback *cb;
        uint_t rc = AS_CALLBACK_NOTFOUND;

        mutex_enter(&as->a_contents);
        for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
                if (cb->ascb_arg != arg)
                        continue;

                /*
                 * If the events indicate AS_CALLBACK_CALLED, just clear
                 * AS_ALL_EVENT in the events field and wakeup the thread
                 * that may be waiting in as_do_callbacks.  as_do_callbacks
                 * will take care of removing this entry from the list.  In
                 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
                 * (AS_CALLBACK_CALLED not set), just remove it from the
                 * list, return the memory and return AS_CALLBACK_DELETED.
                 */
                if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
                        /* leave AS_CALLBACK_CALLED */
                        cb->ascb_events &= ~AS_ALL_EVENT;
                        rc = AS_CALLBACK_DELETE_DEFERRED;
                        cv_broadcast(&as->a_cv);
                } else {
                        *prevcb = cb->ascb_next;
                        kmem_free(cb, sizeof (struct as_callback));
                        rc = AS_CALLBACK_DELETED;
                }
                break;
        }
        mutex_exit(&as->a_contents);
        return (rc);
}
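
/*
 * Usage sketch (illustrative only): the client typically calls
 * as_delete_callback() from its completion path and must tolerate the
 * deferred case, in which as_do_callbacks() frees the entry instead.
 * mydrv_state is a hypothetical tag:
 *
 *	switch (as_delete_callback(as, mydrv_state)) {
 *	case AS_CALLBACK_DELETE_DEFERRED:
 *		... entry will be freed later by as_do_callbacks() ...
 *		break;
 *	default:
 *		... AS_CALLBACK_DELETED or AS_CALLBACK_NOTFOUND; done ...
 *		break;
 *	}
 */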

/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with locks held
 * beyond the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
                        size_t event_len)
{
        struct as_callback      *cb;

        ASSERT(MUTEX_HELD(&as->a_contents));
        for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
                /*
                 * If the callback has not already been called, then
                 * check if events or address range pertains.  An event_len
                 * of zero means do an unconditional callback.
                 */
                if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
                    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
                    (event_addr + event_len < cb->ascb_saddr) ||
                    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
                        continue;
                }
                break;
        }
        return (cb);
}

/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
                                uint_t events)
{
        struct as_callback **prevcb;
        void    *cb_arg;

        ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
        cb->ascb_events |= AS_CALLBACK_CALLED;
        mutex_exit(&as->a_contents);
        (*cb->ascb_func)(as, cb->ascb_arg, events);
        mutex_enter(&as->a_contents);
        /*
         * the callback function is required to delete the callback
         * when the callback function determines it is OK for
         * this thread to continue. as_delete_callback will clear
         * the AS_ALL_EVENT in the events field when it is deleted.
         * If the callback function called as_delete_callback,
         * events will already be cleared and there will be no blocking.
         */
        while ((cb->ascb_events & events) != 0) {
                cv_wait(&as->a_cv, &as->a_contents);
        }
        /*
         * This entry needs to be taken off the list. Normally, the
         * callback func itself does that, but unfortunately the list
         * may have changed while the callback was running because the
         * a_contents mutex was dropped and someone else other than the
         * callback func itself could have called as_delete_callback,
         * so we have to search to find this entry again.  The entry
         * must have AS_CALLBACK_CALLED, and have the same 'arg'.
         */
        cb_arg = cb->ascb_arg;
        prevcb = &as->a_callbacks;
        for (cb = as->a_callbacks; cb != NULL;
            prevcb = &cb->ascb_next, cb = *prevcb) {
                if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
                    (cb_arg != cb->ascb_arg)) {
                        continue;
                }
                *prevcb = cb->ascb_next;
                kmem_free(cb, sizeof (struct as_callback));
                break;
        }
}

/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - no event of interest
 *    - no address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning. Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end up comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
                        size_t event_len)
{
        struct as_callback *cb;

        if ((cb = as_find_callback(as, events, event_addr, event_len))) {
                as_execute_callback(as, cb, events);
                return (-1);
        }
        return (0);
}
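
/*
 * Usage sketch (illustrative only; it mirrors the EAGAIN handling in
 * as_setprot() below): a caller holding a_lock finds a matching callback
 * first, drops a_lock, and only then executes it:
 *
 *	mutex_enter(&as->a_contents);
 *	if ((cb = as_find_callback(as, AS_SETPROT_EVENT,
 *	    seg->s_base, seg->s_size)) != NULL) {
 *		AS_LOCK_EXIT(as, &as->a_lock);
 *		as_execute_callback(as, cb, AS_SETPROT_EVENT);
 *	}
 *	mutex_exit(&as->a_contents);
 */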

/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
        struct seg *seg = as->a_seglast;
        avl_index_t where;

        ASSERT(AS_LOCK_HELD(as, &as->a_lock));

        if (seg != NULL &&
            seg->s_base <= addr &&
            addr < seg->s_base + seg->s_size)
                return (seg);

        seg = avl_find(&as->a_segtree, &addr, &where);
        if (seg != NULL)
                return (as->a_seglast = seg);

        seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
        if (seg == NULL && tail)
                seg = avl_last(&as->a_segtree);
        return (as->a_seglast = seg);
}
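
/*
 * Usage sketch (illustrative only; as_unmap() below uses this pattern):
 * walk every segment that overlaps [raddr, eaddr):
 *
 *	for (seg = as_findseg(as, raddr, 0);
 *	    seg != NULL && seg->s_base < eaddr;
 *	    seg = AS_SEGNEXT(as, seg)) {
 *		... operate on the overlap of seg and [raddr, eaddr) ...
 *	}
 */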

#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
static void
as_verify(struct as *as)
{
        struct seg *seg, *seglast, *p, *n;
        uint_t nsegs = 0;

        if (do_as_verify == 0)
                return;

        seglast = as->a_seglast;

        for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
                ASSERT(seg->s_as == as);
                p = AS_SEGPREV(as, seg);
                n = AS_SEGNEXT(as, seg);
                ASSERT(p == NULL || p->s_as == as);
                ASSERT(p == NULL || p->s_base < seg->s_base);
                ASSERT(n == NULL || n->s_base > seg->s_base);
                ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
                if (seg == seglast)
                        seglast = NULL;
                nsegs++;
        }
        ASSERT(seglast == NULL);
        ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */

/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
        struct seg *seg;
        caddr_t addr;
        caddr_t eaddr;
        avl_index_t where;

        ASSERT(AS_WRITE_HELD(as, &as->a_lock));

        as->a_updatedir = 1; /* inform /proc */
        gethrestime(&as->a_updatetime);

        if (as->a_lastgaphl != NULL) {
                struct seg *hseg = NULL;
                struct seg *lseg = NULL;

                if (as->a_lastgaphl->s_base > newseg->s_base) {
                        hseg = as->a_lastgaphl;
                        lseg = AVL_PREV(&as->a_segtree, hseg);
                } else {
                        lseg = as->a_lastgaphl;
                        hseg = AVL_NEXT(&as->a_segtree, lseg);
                }

                if (hseg && lseg && lseg->s_base < newseg->s_base &&
                    hseg->s_base > newseg->s_base) {
                        avl_insert_here(&as->a_segtree, newseg, lseg,
                            AVL_AFTER);
                        as->a_lastgaphl = NULL;
                        as->a_seglast = newseg;
                        return (0);
                }
                as->a_lastgaphl = NULL;
        }

        addr = newseg->s_base;
        eaddr = addr + newseg->s_size;
again:

        seg = avl_find(&as->a_segtree, &addr, &where);

        if (seg == NULL)
                seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

        if (seg == NULL)
                seg = avl_last(&as->a_segtree);

        if (seg != NULL) {
                caddr_t base = seg->s_base;

                /*
                 * If top of seg is below the requested address, then
                 * the insertion point is at the end of the linked list,
                 * and seg points to the tail of the list.  Otherwise,
                 * the insertion point is immediately before seg.
                 */
                if (base + seg->s_size > addr) {
                        if (addr >= base || eaddr > base) {
#ifdef __sparc
                                extern struct seg_ops segnf_ops;

                                /*
                                 * no-fault segs must disappear if overlaid.
                                 * XXX need new segment type so
                                 * we don't have to check s_ops
                                 */
                                if (seg->s_ops == &segnf_ops) {
                                        seg_unmap(seg);
                                        goto again;
                                }
#endif
                                return (-1);    /* overlapping segment */
                        }
                }
        }
        as->a_seglast = newseg;
        avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
        as_verify(as);
#endif
        return (0);
}

struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
        avl_tree_t *t;

        ASSERT(AS_WRITE_HELD(as, &as->a_lock));

        as->a_updatedir = 1; /* inform /proc */
        gethrestime(&as->a_updatetime);

        if (seg == NULL)
                return (NULL);

        t = &as->a_segtree;
        if (as->a_seglast == seg)
                as->a_seglast = NULL;
        as->a_lastgaphl = NULL;

        /*
         * if this segment is at an address higher than
         * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
         */
        if (as->a_lastgap &&
            (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
                as->a_lastgap = AVL_NEXT(t, seg);

        /*
         * remove the segment from the seg tree
         */
        avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
        as_verify(as);
#endif
        return (seg);
}

/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
        struct seg *seg = as->a_seglast;

        ASSERT(AS_LOCK_HELD(as, &as->a_lock));

        if (seg != NULL && seg->s_base <= addr &&
            addr < seg->s_base + seg->s_size)
                return (seg);

        seg = avl_find(&as->a_segtree, &addr, NULL);
        return (seg);
}

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
        mutex_enter(&as->a_contents);
        while (AS_ISCLAIMGAP(as))
                cv_wait(&as->a_cv, &as->a_contents);
        AS_SETCLAIMGAP(as);
        mutex_exit(&as->a_contents);
}

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
        mutex_enter(&as->a_contents);
        AS_CLRCLAIMGAP(as);
        cv_signal(&as->a_cv);
        mutex_exit(&as->a_contents);
}
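
/*
 * Usage sketch (illustrative only): callers that carve a new mapping out
 * of a hole bracket the hole search and the mapping with the range lock;
 * the argument lists here are schematic:
 *
 *	as_rangelock(as);
 *	... pick addr, e.g. via map_addr()/as_gap() ...
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 */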

/*
 * compare segments (or just an address) by segment address range
 */
static int
as_segcompar(const void *x, const void *y)
{
        struct seg *a = (struct seg *)x;
        struct seg *b = (struct seg *)y;

        if (a->s_base < b->s_base)
                return (-1);
        if (a->s_base >= b->s_base + b->s_size)
                return (1);
        return (0);
}

void
as_avlinit(struct as *as)
{
        avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
            offsetof(struct seg, s_tree));
        avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
            offsetof(struct watched_page, wp_link));
}

/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
        struct as *as = buf;

        mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
        rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
        as_avlinit(as);
        return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
        struct as *as = buf;

        avl_destroy(&as->a_segtree);
        mutex_destroy(&as->a_contents);
        cv_destroy(&as->a_cv);
        rw_destroy(&as->a_lock);
}

void
as_init(void)
{
        as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
            as_constructor, as_destructor, NULL, NULL, NULL, 0);
}

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
        struct as *as;

        as = kmem_cache_alloc(as_cache, KM_SLEEP);

        as->a_flags             = 0;
        as->a_vbits             = 0;
        as->a_hrm               = NULL;
        as->a_seglast           = NULL;
        as->a_size              = 0;
        as->a_resvsize          = 0;
        as->a_updatedir         = 0;
        gethrestime(&as->a_updatetime);
        as->a_objectdir         = NULL;
        as->a_sizedir           = 0;
        as->a_userlimit         = (caddr_t)USERLIMIT;
        as->a_lastgap           = NULL;
        as->a_lastgaphl         = NULL;
        as->a_callbacks         = NULL;

        AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
        as->a_hat = hat_alloc(as);   /* create hat for default system mmu */
        AS_LOCK_EXIT(as, &as->a_lock);

        as->a_xhat = NULL;

        return (as);
}

/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
        struct hat *hat = as->a_hat;
        struct seg *seg, *next;
        int called = 0;

top:
        /*
         * Invoke ALL callbacks. as_do_callbacks will do one callback
         * per call, and not return (-1) until the callback has completed.
         * When as_do_callbacks returns zero, all callbacks have completed.
         */
        mutex_enter(&as->a_contents);
        while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
                ;

        /* This will prevent new XHATs from attaching to as */
        if (!called)
                AS_SETBUSY(as);
        mutex_exit(&as->a_contents);
        AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

        if (!called) {
                called = 1;
                hat_free_start(hat);
                if (as->a_xhat != NULL)
                        xhat_free_start_all(as);
        }
        for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
                int err;

                next = AS_SEGNEXT(as, seg);
retry:
                err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
                if (err == EAGAIN) {
                        mutex_enter(&as->a_contents);
                        if (as->a_callbacks) {
                                AS_LOCK_EXIT(as, &as->a_lock);
                        } else if (!AS_ISNOUNMAPWAIT(as)) {
                                /*
                                 * Memory is currently locked. Wait for a
                                 * cv_signal that it has been unlocked, then
                                 * try the operation again.
                                 */
                                if (AS_ISUNMAPWAIT(as) == 0)
                                        cv_broadcast(&as->a_cv);
                                AS_SETUNMAPWAIT(as);
                                AS_LOCK_EXIT(as, &as->a_lock);
                                while (AS_ISUNMAPWAIT(as))
                                        cv_wait(&as->a_cv, &as->a_contents);
                        } else {
                                /*
                                 * We may have raced with
                                 * segvn_reclaim()/segspt_reclaim(). In this
                                 * case clean nounmapwait flag and retry since
                                 * softlockcnt in this segment may be already
                                 * 0.  We don't drop as writer lock so our
                                 * number of retries without sleeping should
                                 * be very small. See segvn_reclaim() for
                                 * more comments.
                                 */
                                AS_CLRNOUNMAPWAIT(as);
                                mutex_exit(&as->a_contents);
                                goto retry;
                        }
                        mutex_exit(&as->a_contents);
                        goto top;
                } else {
                        /*
                         * We do not expect any other error return at this
                         * time. This is similar to an ASSERT in seg_unmap()
                         */
                        ASSERT(err == 0);
                }
        }
        hat_free_end(hat);
        if (as->a_xhat != NULL)
                xhat_free_end_all(as);
        AS_LOCK_EXIT(as, &as->a_lock);

        /* /proc stuff */
        ASSERT(avl_numnodes(&as->a_wpage) == 0);
        if (as->a_objectdir) {
                kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
                as->a_objectdir = NULL;
                as->a_sizedir = 0;
        }

        /*
         * Free the struct as back to kmem.  Assert it has no segments.
         */
        ASSERT(avl_numnodes(&as->a_segtree) == 0);
        kmem_cache_free(as_cache, as);
}

int
as_dup(struct as *as, struct proc *forkedproc)
{
        struct as *newas;
        struct seg *seg, *newseg;
        size_t  purgesize = 0;
        int error;

        AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
        as_clearwatch(as);
        newas = as_alloc();
        newas->a_userlimit = as->a_userlimit;
        newas->a_proc = forkedproc;

        AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);

        /* This will prevent new XHATs from attaching */
        mutex_enter(&as->a_contents);
        AS_SETBUSY(as);
        mutex_exit(&as->a_contents);
        mutex_enter(&newas->a_contents);
        AS_SETBUSY(newas);
        mutex_exit(&newas->a_contents);

        (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

        for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

                if (seg->s_flags & S_PURGE) {
                        purgesize += seg->s_size;
                        continue;
                }

                newseg = seg_alloc(newas, seg->s_base, seg->s_size);
                if (newseg == NULL) {
                        AS_LOCK_EXIT(newas, &newas->a_lock);
                        as_setwatch(as);
                        mutex_enter(&as->a_contents);
                        AS_CLRBUSY(as);
                        mutex_exit(&as->a_contents);
                        AS_LOCK_EXIT(as, &as->a_lock);
                        as_free(newas);
                        return (-1);
                }
                if ((error = SEGOP_DUP(seg, newseg)) != 0) {
                        /*
                         * We call seg_free() on the new seg
                         * because the segment is not set up
                         * completely; i.e. it has no ops.
                         */
                        as_setwatch(as);
                        mutex_enter(&as->a_contents);
                        AS_CLRBUSY(as);
                        mutex_exit(&as->a_contents);
                        AS_LOCK_EXIT(as, &as->a_lock);
                        seg_free(newseg);
                        AS_LOCK_EXIT(newas, &newas->a_lock);
                        as_free(newas);
                        return (error);
                }
                newas->a_size += seg->s_size;
        }
        newas->a_resvsize = as->a_resvsize - purgesize;

        error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
        if (as->a_xhat != NULL)
                error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);

        mutex_enter(&newas->a_contents);
        AS_CLRBUSY(newas);
        mutex_exit(&newas->a_contents);
        AS_LOCK_EXIT(newas, &newas->a_lock);

        as_setwatch(as);
        mutex_enter(&as->a_contents);
        AS_CLRBUSY(as);
        mutex_exit(&as->a_contents);
        AS_LOCK_EXIT(as, &as->a_lock);
        if (error != 0) {
                as_free(newas);
                return (error);
        }
        forkedproc->p_as = newas;
        return (0);
}

/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
        enum fault_type type, enum seg_rw rw)
{
        struct seg *seg;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        size_t ssize;
        faultcode_t res = 0;
        caddr_t addrsav;
        struct seg *segsav;
        int as_lock_held;
        klwp_t *lwp = ttolwp(curthread);
        int is_xhat = 0;
        int holding_wpage = 0;
        extern struct seg_ops   segdev_ops;

        if (as->a_hat != hat) {
                /* This must be an XHAT then */
                is_xhat = 1;

                if ((type != F_INVAL) || (as == &kas))
                        return (FC_NOSUPPORT);
        }

retry:
        if (!is_xhat) {
                /*
                 * Indicate that the lwp is not to be stopped while waiting
                 * for a pagefault.  This is to avoid deadlock while debugging
                 * a process via /proc over NFS (in particular).
                 */
                if (lwp != NULL)
                        lwp->lwp_nostop++;

                /*
                 * same length must be used when we softlock and softunlock.
                 * We don't support softunlocking lengths less than
                 * the original length when there is largepage support.
                 * See seg_dev.c for more comments.
                 */
                switch (type) {

                case F_SOFTLOCK:
                        CPU_STATS_ADD_K(vm, softlock, 1);
                        break;

                case F_SOFTUNLOCK:
                        break;

                case F_PROT:
                        CPU_STATS_ADD_K(vm, prot_fault, 1);
                        break;

                case F_INVAL:
                        CPU_STATS_ENTER_K();
                        CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
                        if (as == &kas)
                                CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
                        CPU_STATS_EXIT_K();
                        break;
                }
        }

        /* Kernel probe */
        TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
            tnf_opaque, address,        addr,
            tnf_fault_type,     fault_type,     type,
            tnf_seg_access,     access,         rw);

        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        /*
         * XXX -- Don't grab the as lock for segkmap. We should grab it for
         * correctness, but then we could be stuck holding this lock for
         * a LONG time if the fault needs to be resolved on a slow
         * filesystem, and then no-one will be able to exec new commands,
         * as exec'ing requires the write lock on the as.
         */
        if (as == &kas && segkmap && segkmap->s_base <= raddr &&
            raddr + size < segkmap->s_base + segkmap->s_size) {
                /*
                 * if (as==&kas), this can't be XHAT: we've already returned
                 * FC_NOSUPPORT.
                 */
                seg = segkmap;
                as_lock_held = 0;
        } else {
                AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
                if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
                        /*
                         * Grab and hold the writers' lock on the as
                         * if the fault is to a watched page.
                         * This will keep CPUs from "peeking" at the
                         * address range while we're temporarily boosting
                         * the permissions for the XHAT device to
                         * resolve the fault in the segment layer.
                         *
                         * We could check whether faulted address
                         * is within a watched page and only then grab
                         * the writer lock, but this is simpler.
                         */
                        AS_LOCK_EXIT(as, &as->a_lock);
                        AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
                }

                seg = as_segat(as, raddr);
                if (seg == NULL) {
                        AS_LOCK_EXIT(as, &as->a_lock);
                        if ((lwp != NULL) && (!is_xhat))
                                lwp->lwp_nostop--;
                        return (FC_NOMAP);
                }

                as_lock_held = 1;
        }

        addrsav = raddr;
        segsav = seg;

        for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                res = FC_NOMAP;
                                break;
                        }
                }
                if (raddr + rsize > seg->s_base + seg->s_size)
                        ssize = seg->s_base + seg->s_size - raddr;
                else
                        ssize = rsize;

                if (!is_xhat || (seg->s_ops != &segdev_ops)) {

                        if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
                            pr_is_watchpage_as(raddr, rw, as)) {
                                /*
                                 * Handle watch pages.  If we're faulting on a
                                 * watched page from an X-hat, we have to
                                 * restore the original permissions while we
                                 * handle the fault.
                                 */
                                as_clearwatch(as);
                                holding_wpage = 1;
                        }

                        res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);

                        /* Restore watchpoints */
                        if (holding_wpage) {
                                as_setwatch(as);
                                holding_wpage = 0;
                        }

                        if (res != 0)
                                break;
                } else {
                        /* XHAT does not support seg_dev */
                        res = FC_NOSUPPORT;
                        break;
                }
        }

        /*
         * If we were SOFTLOCKing and encountered a failure,
         * we must SOFTUNLOCK the range we already did. (Maybe we
         * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
         * right here...)
         */
        if (res != 0 && type == F_SOFTLOCK) {
                for (seg = segsav; addrsav < raddr; addrsav += ssize) {
                        if (addrsav >= seg->s_base + seg->s_size)
                                seg = AS_SEGNEXT(as, seg);
                        ASSERT(seg != NULL);
                        /*
                         * Now call the fault routine again to perform the
                         * unlock using S_OTHER instead of the rw variable
                         * since we never got a chance to touch the pages.
                         */
                        if (raddr > seg->s_base + seg->s_size)
                                ssize = seg->s_base + seg->s_size - addrsav;
                        else
                                ssize = raddr - addrsav;
                        (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
                            F_SOFTUNLOCK, S_OTHER);
                }
        }
        if (as_lock_held)
                AS_LOCK_EXIT(as, &as->a_lock);
        if ((lwp != NULL) && (!is_xhat))
                lwp->lwp_nostop--;

        /*
         * If the lower levels returned EDEADLK for a fault,
         * it means that we should retry the fault.  Let's wait
         * a bit also to let the deadlock-causing condition clear.
         * This is part of a gross hack to work around a design flaw
         * in the ufs/sds logging code and should go away when the
         * logging code is re-designed to fix the problem. See bug
         * 4125102 for details of the problem.
         */
        if (FC_ERRNO(res) == EDEADLK) {
                delay(deadlk_wait);
                res = 0;
                goto retry;
        }
        return (res);
}
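
/*
 * Usage sketch (illustrative only): F_SOFTLOCK and F_SOFTUNLOCK must be
 * paired over the same [addr, addr + size) range:
 *
 *	if (as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_WRITE) == 0) {
 *		... access the locked pages ...
 *		(void) as_fault(as->a_hat, as, addr, len,
 *		    F_SOFTUNLOCK, S_WRITE);
 *	}
 */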

/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
        struct seg *seg;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        faultcode_t res = 0;
        klwp_t *lwp = ttolwp(curthread);

retry:
        /*
         * Indicate that the lwp is not to be stopped while waiting
         * for a pagefault.  This is to avoid deadlock while debugging
         * a process via /proc over NFS (in particular).
         */
        if (lwp != NULL)
                lwp->lwp_nostop++;

        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
        seg = as_segat(as, raddr);
        if (seg == NULL) {
                AS_LOCK_EXIT(as, &as->a_lock);
                if (lwp != NULL)
                        lwp->lwp_nostop--;
                return (FC_NOMAP);
        }

        for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                res = FC_NOMAP;
                                break;
                        }
                }
                res = SEGOP_FAULTA(seg, raddr);
                if (res != 0)
                        break;
        }
        AS_LOCK_EXIT(as, &as->a_lock);
        if (lwp != NULL)
                lwp->lwp_nostop--;
        /*
         * If the lower levels returned EDEADLK for a fault,
         * it means that we should retry the fault.  Let's wait
         * a bit also to let the deadlock-causing condition clear.
         * This is part of a gross hack to work around a design flaw
         * in the ufs/sds logging code and should go away when the
         * logging code is re-designed to fix the problem. See bug
         * 4125102 for details of the problem.
         */
        if (FC_ERRNO(res) == EDEADLK) {
                delay(deadlk_wait);
                res = 0;
                goto retry;
        }
        return (res);
}

/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
        struct seg *seg;
        struct as_callback *cb;
        size_t ssize;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        int error = 0, writer = 0;
        caddr_t saveraddr;
        size_t saversize;

setprot_top:
        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        if (raddr + rsize < raddr)           /* check for wraparound */
                return (ENOMEM);

        saveraddr = raddr;
        saversize = rsize;

        /*
         * Normally we only lock the as as a reader. But
         * if due to setprot the segment driver needs to split
         * a segment it will return IE_RETRY. Therefore we re-acquire
         * the as lock as a writer so the segment driver can change
         * the seg list. Also the segment driver will return IE_RETRY
         * after it has changed the segment list so we therefore keep
         * locking as a writer. Since these operations should be rare,
         * we want to lock as a writer only when necessary.
         */
1190         if (writer || avl_numnodes(&as->a_wpage) != 0) {
1191                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1192         } else {
1193                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1194         }
1195 
1196         as_clearwatchprot(as, raddr, rsize);
1197         seg = as_segat(as, raddr);
1198         if (seg == NULL) {
1199                 as_setwatch(as);
1200                 AS_LOCK_EXIT(as, &as->a_lock);
1201                 return (ENOMEM);
1202         }
1203 
1204         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1205                 if (raddr >= seg->s_base + seg->s_size) {
1206                         seg = AS_SEGNEXT(as, seg);
1207                         if (seg == NULL || raddr != seg->s_base) {
1208                                 error = ENOMEM;
1209                                 break;
1210                         }
1211                 }
1212                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1213                         ssize = seg->s_base + seg->s_size - raddr;
1214                 else
1215                         ssize = rsize;
1216 retry:
1217                 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1218 
1219                 if (error == IE_NOMEM) {
1220                         error = EAGAIN;
1221                         break;
1222                 }
1223 
1224                 if (error == IE_RETRY) {
1225                         AS_LOCK_EXIT(as, &as->a_lock);
1226                         writer = 1;
1227                         goto setprot_top;
1228                 }
1229 
1230                 if (error == EAGAIN) {
1231                         /*
1232                          * Make sure we have a_lock as writer.
1233                          */
1234                         if (writer == 0) {
1235                                 AS_LOCK_EXIT(as, &as->a_lock);
1236                                 writer = 1;
1237                                 goto setprot_top;
1238                         }
1239 
1240                         /*
1241                          * Memory is currently locked.  It must be unlocked
1242                          * before this operation can succeed through a retry.
1243                          * The possible reasons for locked memory and
1244                          * corresponding strategies for unlocking are:
1245                          * (1) Normal I/O
1246                          *      wait for a signal that the I/O operation
1247                          *      has completed and the memory is unlocked.
1248                          * (2) Asynchronous I/O
1249                          *      The aio subsystem does not unlock pages when
1250                          *      the I/O is completed. Those pages are unlocked
1251                          *      when the application calls aiowait/aioerror.
1252                          *      So, to prevent blocking forever, cv_broadcast()
1253                          *      is done to wake up aio_cleanup_thread.
1254                          *      Subsequently, segvn_reclaim will be called, and
1255                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1256                          * (3) Long term page locking:
1257                          *      Drivers intending to have pages locked for a
1258                          *      period considerably longer than for normal I/O
1259                          *      (essentially forever) may have registered for a
1260                          *      callback so they may unlock these pages on
1261                          *      request. This is needed to allow this operation
1262                          *      to succeed. Each entry on the callback list is
1263                          *      examined. If the event or address range pertains
1264                          *      the callback is invoked (unless it already is in
1265                          *      progress). The a_contents lock must be dropped
1266                          *      before the callback, so only one callback can
1267                          *      be done at a time. Go to the top and do more
1268                          *      until zero is returned. If zero is returned,
1269                          *      either there were no callbacks for this event
1270                          *      or they were already in progress.
1271                          */
1272                         mutex_enter(&as->a_contents);
1273                         if (as->a_callbacks &&
1274                             (cb = as_find_callback(as, AS_SETPROT_EVENT,
1275                             seg->s_base, seg->s_size))) {
1276                                 AS_LOCK_EXIT(as, &as->a_lock);
1277                                 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1278                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1279                                 if (AS_ISUNMAPWAIT(as) == 0)
1280                                         cv_broadcast(&as->a_cv);
1281                                 AS_SETUNMAPWAIT(as);
1282                                 AS_LOCK_EXIT(as, &as->a_lock);
1283                                 while (AS_ISUNMAPWAIT(as))
1284                                         cv_wait(&as->a_cv, &as->a_contents);
1285                         } else {
1286                                 /*
1287                                  * We may have raced with
1288                                  * segvn_reclaim()/segspt_reclaim(). In this
1289                                  * case clean nounmapwait flag and retry since
1290                                  * softlockcnt in this segment may be already
1291                                  * 0.  We don't drop as writer lock so our
1292                                  * number of retries without sleeping should
1293                                  * be very small. See segvn_reclaim() for
1294                                  * more comments.
1295                                  */
1296                                 AS_CLRNOUNMAPWAIT(as);
1297                                 mutex_exit(&as->a_contents);
1298                                 goto retry;
1299                         }
1300                         mutex_exit(&as->a_contents);
1301                         goto setprot_top;
1302                 } else if (error != 0)
1303                         break;
1304         }
1305         if (error != 0) {
1306                 as_setwatch(as);
1307         } else {
1308                 as_setwatchprot(as, saveraddr, saversize, prot);
1309         }
1310         AS_LOCK_EXIT(as, &as->a_lock);
1311         return (error);
1312 }
1313 
1314 /*
1315  * Check to make sure that the interval [addr, addr + size)
1316  * in address space `as' has at least the specified protection.
1317  * It is ok for the range to cross over several segments, as long
1318  * as they are contiguous.
1319  */
1320 int
1321 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1322 {
1323         struct seg *seg;
1324         size_t ssize;
1325         caddr_t raddr;                  /* rounded down addr */
1326         size_t rsize;                   /* rounded up size */
1327         int error = 0;
1328 
1329         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1330         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1331             (size_t)raddr;
1332 
1333         if (raddr + rsize < raddr)           /* check for wraparound */
1334                 return (ENOMEM);
1335 
1336         /*
1337          * This is ugly as sin...
1338          * Normally, we only acquire the address space readers lock.
1339          * However, if the address space has watchpoints present,
1340          * we must acquire the writer lock on the address space for
1341          * the benefit of as_clearwatchprot() and as_setwatchprot().
1342          */
1343         if (avl_numnodes(&as->a_wpage) != 0)
1344                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1345         else
1346                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1347         as_clearwatchprot(as, raddr, rsize);
1348         seg = as_segat(as, raddr);
1349         if (seg == NULL) {
1350                 as_setwatch(as);
1351                 AS_LOCK_EXIT(as, &as->a_lock);
1352                 return (ENOMEM);
1353         }
1354 
1355         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1356                 if (raddr >= seg->s_base + seg->s_size) {
1357                         seg = AS_SEGNEXT(as, seg);
1358                         if (seg == NULL || raddr != seg->s_base) {
1359                                 error = ENOMEM;
1360                                 break;
1361                         }
1362                 }
1363                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1364                         ssize = seg->s_base + seg->s_size - raddr;
1365                 else
1366                         ssize = rsize;
1367 
1368                 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1369                 if (error != 0)
1370                         break;
1371         }
1372         as_setwatch(as);
1373         AS_LOCK_EXIT(as, &as->a_lock);
1374         return (error);
1375 }
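
/*
 * Illustrative sketch (not part of this file): a typical caller validates
 * a user buffer with as_checkprot() before operating on it.  The names
 * "uaddr" and "ulen" below are hypothetical:
 *
 *	struct as *as = curproc->p_as;
 *
 *	if (as_checkprot(as, uaddr, ulen, PROT_READ | PROT_WRITE) != 0)
 *		return (EFAULT);
 */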
1376 
1377 int
1378 as_unmap(struct as *as, caddr_t addr, size_t size)
1379 {
1380         struct seg *seg, *seg_next;
1381         struct as_callback *cb;
1382         caddr_t raddr, eaddr;
1383         size_t ssize, rsize = 0;
1384         int err;
1385 
1386 top:
1387         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1388         eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1389             (uintptr_t)PAGEMASK);
1390 
1391         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1392 
1393         as->a_updatedir = 1; /* inform /proc */
1394         gethrestime(&as->a_updatetime);
1395 
1396         /*
1397          * Use as_findseg to find the first segment in the range, then
1398          * step through the segments in order, following s_next.
1399          */
1400         as_clearwatchprot(as, raddr, eaddr - raddr);
1401 
1402         for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1403                 if (eaddr <= seg->s_base)
1404                         break;          /* eaddr was in a gap; all done */
1405 
1406                 /* this is implied by the test above */
1407                 ASSERT(raddr < eaddr);
1408 
1409                 if (raddr < seg->s_base)
1410                         raddr = seg->s_base;         /* raddr was in a gap */
1411 
1412                 if (eaddr > (seg->s_base + seg->s_size))
1413                         ssize = seg->s_base + seg->s_size - raddr;
1414                 else
1415                         ssize = eaddr - raddr;
1416 
1417                 /*
1418                  * Save next segment pointer since seg can be
1419                  * destroyed during the segment unmap operation.
1420                  */
1421                 seg_next = AS_SEGNEXT(as, seg);
1422 
1423                 /*
1424                  * We didn't count /dev/null mappings, so ignore them here.
1425                  * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1426                  * we have to do this check here while we have seg.)
1427                  */
1428                 rsize = 0;
1429                 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1430                     !SEG_IS_PARTIAL_RESV(seg))
1431                         rsize = ssize;
1432 
1433 retry:
1434                 err = SEGOP_UNMAP(seg, raddr, ssize);
1435                 if (err == EAGAIN) {
1436                         /*
1437                          * Memory is currently locked.  It must be unlocked
1438                          * before this operation can succeed through a retry.
1439                          * The possible reasons for locked memory and
1440                          * corresponding strategies for unlocking are:
1441                          * (1) Normal I/O
1442                          *      wait for a signal that the I/O operation
1443                          *      has completed and the memory is unlocked.
1444                          * (2) Asynchronous I/O
1445                          *      The aio subsystem does not unlock pages when
1446                          *      the I/O is completed. Those pages are unlocked
1447                          *      when the application calls aiowait/aioerror.
1448                          *      So, to prevent blocking forever, cv_broadcast()
1449                          *      is done to wake up aio_cleanup_thread.
1450                          *      Subsequently, segvn_reclaim will be called, and
1451                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1452                          * (3) Long term page locking:
1453                          *      Drivers intending to have pages locked for a
1454                          *      period considerably longer than for normal I/O
1455                          *      (essentially forever) may have registered for a
1456                          *      callback so they may unlock these pages on
1457                          *      request. This is needed to allow this operation
1458                          *      to succeed. Each entry on the callback list is
1459                          * examined. If the event or address range pertains,
1460                          * the callback is invoked (unless it is already in
1461                          *      progress). The a_contents lock must be dropped
1462                          *      before the callback, so only one callback can
1463                          *      be done at a time. Go to the top and do more
1464                          *      until zero is returned. If zero is returned,
1465                          *      either there were no callbacks for this event
1466                          *      or they were already in progress.
1467                          */
1468                         mutex_enter(&as->a_contents);
1469                         if (as->a_callbacks &&
1470                             (cb = as_find_callback(as, AS_UNMAP_EVENT,
1471                             seg->s_base, seg->s_size))) {
1472                                 AS_LOCK_EXIT(as, &as->a_lock);
1473                                 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1474                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1475                                 if (AS_ISUNMAPWAIT(as) == 0)
1476                                         cv_broadcast(&as->a_cv);
1477                                 AS_SETUNMAPWAIT(as);
1478                                 AS_LOCK_EXIT(as, &as->a_lock);
1479                                 while (AS_ISUNMAPWAIT(as))
1480                                         cv_wait(&as->a_cv, &as->a_contents);
1481                         } else {
1482                                 /*
1483                                  * We may have raced with
1484                                  * segvn_reclaim()/segspt_reclaim(). In this
1485                                  * case clean nounmapwait flag and retry since
1486                                  * softlockcnt in this segment may be already
1487                                  * 0.  We don't drop as writer lock so our
1488                                  * number of retries without sleeping should
1489                                  * be very small. See segvn_reclaim() for
1490                                  * more comments.
1491                                  */
1492                                 AS_CLRNOUNMAPWAIT(as);
1493                                 mutex_exit(&as->a_contents);
1494                                 goto retry;
1495                         }
1496                         mutex_exit(&as->a_contents);
1497                         goto top;
1498                 } else if (err == IE_RETRY) {
1499                         AS_LOCK_EXIT(as, &as->a_lock);
1500                         goto top;
1501                 } else if (err) {
1502                         as_setwatch(as);
1503                         AS_LOCK_EXIT(as, &as->a_lock);
1504                         return (-1);
1505                 }
1506 
1507                 as->a_size -= ssize;
1508                 if (rsize)
1509                         as->a_resvsize -= rsize;
1510                 raddr += ssize;
1511         }
1512         AS_LOCK_EXIT(as, &as->a_lock);
1513         return (0);
1514 }
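
/*
 * Worked example of the rounding at "top" above, assuming 8K pages
 * (PAGESIZE 0x2000): as_unmap(as, (caddr_t)0x12345, 0x100) yields
 * raddr 0x12000 and eaddr 0x14000, so the whole page containing the
 * range is unmapped and a_size shrinks by each segment's ssize.
 */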
1515 
1516 static int
1517 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1518     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1519 {
1520         uint_t szc;
1521         uint_t nszc;
1522         int error;
1523         caddr_t a;
1524         caddr_t eaddr;
1525         size_t segsize;
1526         struct seg *seg;
1527         size_t pgsz;
1528         int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1529         uint_t save_szcvec;
1530 
1531         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1532         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1533         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1534         ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1535         if (!do_off) {
1536                 vn_a->offset = 0;
1537         }
1538 
1539         if (szcvec <= 1) {
1540                 seg = seg_alloc(as, addr, size);
1541                 if (seg == NULL) {
1542                         return (ENOMEM);
1543                 }
1544                 vn_a->szc = 0;
1545                 error = (*crfp)(seg, vn_a);
1546                 if (error != 0) {
1547                         seg_free(seg);
1548                 } else {
1549                         as->a_size += size;
1550                         as->a_resvsize += size;
1551                 }
1552                 return (error);
1553         }
1554 
1555         eaddr = addr + size;
1556         save_szcvec = szcvec;
1557         szcvec >>= 1;
1558         szc = 0;
1559         nszc = 0;
1560         while (szcvec) {
1561                 if ((szcvec & 0x1) == 0) {
1562                         nszc++;
1563                         szcvec >>= 1;
1564                         continue;
1565                 }
1566                 nszc++;
1567                 pgsz = page_get_pagesize(nszc);
1568                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1569                 if (a != addr) {
1570                         ASSERT(a < eaddr);
1571                         segsize = a - addr;
1572                         seg = seg_alloc(as, addr, segsize);
1573                         if (seg == NULL) {
1574                                 return (ENOMEM);
1575                         }
1576                         vn_a->szc = szc;
1577                         error = (*crfp)(seg, vn_a);
1578                         if (error != 0) {
1579                                 seg_free(seg);
1580                                 return (error);
1581                         }
1582                         as->a_size += segsize;
1583                         as->a_resvsize += segsize;
1584                         *segcreated = 1;
1585                         if (do_off) {
1586                                 vn_a->offset += segsize;
1587                         }
1588                         addr = a;
1589                 }
1590                 szc = nszc;
1591                 szcvec >>= 1;
1592         }
1593 
1594         ASSERT(addr < eaddr);
1595         szcvec = save_szcvec | 1; /* add 8K pages */
1596         while (szcvec) {
1597                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1598                 ASSERT(a >= addr);
1599                 if (a != addr) {
1600                         segsize = a - addr;
1601                         seg = seg_alloc(as, addr, segsize);
1602                         if (seg == NULL) {
1603                                 return (ENOMEM);
1604                         }
1605                         vn_a->szc = szc;
1606                         error = (*crfp)(seg, vn_a);
1607                         if (error != 0) {
1608                                 seg_free(seg);
1609                                 return (error);
1610                         }
1611                         as->a_size += segsize;
1612                         as->a_resvsize += segsize;
1613                         *segcreated = 1;
1614                         if (do_off) {
1615                                 vn_a->offset += segsize;
1616                         }
1617                         addr = a;
1618                 }
1619                 szcvec &= ~(1 << szc);
1620                 if (szcvec) {
1621                         szc = highbit(szcvec) - 1;
1622                         pgsz = page_get_pagesize(szc);
1623                 }
1624         }
1625         ASSERT(addr == eaddr);
1626 
1627         return (0);
1628 }
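
/*
 * Worked example of the szcvec walk above, with hypothetical page size
 * codes 0/1/2 mapping to 8K/64K/512K: given szcvec = 0x7 and an
 * 8K-aligned addr, the first loop creates an 8K-page (szc 0) segment up
 * to the first 64K boundary, e.g.
 *
 *	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, 0x10000);
 *
 * then a 64K-page (szc 1) segment up to the first 512K boundary.  The
 * second loop works back down from eaddr: a 512K-page segment to
 * P2ALIGN(eaddr, 512K), then 64K pages, then 8K pages for the tail.
 */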
1629 
1630 static int
1631 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1632     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1633 {
1634         uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1635         int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1636         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1637             type, 0);
1638         int error;
1639         struct seg *seg;
1640         struct vattr va;
1641         u_offset_t eoff;
1642         size_t save_size = 0;
1643         extern size_t textrepl_size_thresh;
1644 
1645         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1646         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1647         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1648         ASSERT(vn_a->vp != NULL);
1649         ASSERT(vn_a->amp == NULL);
1650 
1651 again:
1652         if (szcvec <= 1) {
1653                 seg = seg_alloc(as, addr, size);
1654                 if (seg == NULL) {
1655                         return (ENOMEM);
1656                 }
1657                 vn_a->szc = 0;
1658                 error = (*crfp)(seg, vn_a);
1659                 if (error != 0) {
1660                         seg_free(seg);
1661                 } else {
1662                         as->a_size += size;
1663                         as->a_resvsize += size;
1664                 }
1665                 return (error);
1666         }
1667 
1668         va.va_mask = AT_SIZE;
1669         if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1670                 szcvec = 0;
1671                 goto again;
1672         }
1673         eoff = vn_a->offset & PAGEMASK;
1674         if (eoff >= va.va_size) {
1675                 szcvec = 0;
1676                 goto again;
1677         }
1678         eoff += size;
1679         if (btopr(va.va_size) < btopr(eoff)) {
1680                 save_size = size;
1681                 size = va.va_size - (vn_a->offset & PAGEMASK);
1682                 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1683                 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1684                     type, 0);
1685                 if (szcvec <= 1) {
1686                         size = save_size;
1687                         goto again;
1688                 }
1689         }
1690 
1691         if (size > textrepl_size_thresh) {
1692                 vn_a->flags |= _MAP_TEXTREPL;
1693         }
1694         error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1695             segcreated);
1696         if (error != 0) {
1697                 return (error);
1698         }
1699         if (save_size) {
1700                 addr += size;
1701                 size = save_size - size;
1702                 szcvec = 0;
1703                 goto again;
1704         }
1705         return (0);
1706 }
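
/*
 * Worked example of the end-of-file trimming above, with hypothetical
 * numbers: mapping 1M at offset 0 of a 100K file makes
 * btopr(va.va_size) < btopr(eoff), so size is trimmed to
 * P2ROUNDUP(100K, PAGESIZE) and mapped with large pages where possible;
 * the remaining tail is then mapped on the "again" pass with
 * szcvec = 0, i.e. base pages only.
 */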
1707 
1708 /*
1709  * as_map_ansegs: shared or private anonymous memory.  Note that the flags
1710  * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
1711  */
1712 static int
1713 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1714     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1715 {
1716         uint_t szcvec;
1717         uchar_t type;
1718 
1719         ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1720         if (vn_a->type == MAP_SHARED) {
1721                 type = MAPPGSZC_SHM;
1722         } else if (vn_a->type == MAP_PRIVATE) {
1723                 if (vn_a->szc == AS_MAP_HEAP) {
1724                         type = MAPPGSZC_HEAP;
1725                 } else if (vn_a->szc == AS_MAP_STACK) {
1726                         type = MAPPGSZC_STACK;
1727                 } else {
1728                         type = MAPPGSZC_PRIVM;
1729                 }
1730         }
1731         szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1732             (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1733             (vn_a->flags & MAP_TEXT), type, 0);
1734         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1735         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1736         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1737         ASSERT(vn_a->vp == NULL);
1738 
1739         return (as_map_segvn_segs(as, addr, size, szcvec,
1740             crfp, vn_a, segcreated));
1741 }
1742 
1743 int
1744 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1745 {
1746         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1747         return (as_map_locked(as, addr, size, crfp, argsp));
1748 }
1749 
1750 int
1751 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1752                 void *argsp)
1753 {
1754         struct seg *seg = NULL;
1755         caddr_t raddr;                  /* rounded down addr */
1756         size_t rsize;                   /* rounded up size */
1757         int error;
1758         int unmap = 0;
1759         struct proc *p = curproc;
1760         struct segvn_crargs crargs;
1761 
1762         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1763         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1764             (size_t)raddr;
1765 
1766         /*
1767          * check for wrap around
1768          */
1769         if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1770                 AS_LOCK_EXIT(as, &as->a_lock);
1771                 return (ENOMEM);
1772         }
1773 
1774         as->a_updatedir = 1; /* inform /proc */
1775         gethrestime(&as->a_updatetime);
1776 
1777         if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1778                 AS_LOCK_EXIT(as, &as->a_lock);
1779 
1780                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1781                     RCA_UNSAFE_ALL);
1782 
1783                 return (ENOMEM);
1784         }
1785 
1786         if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1787                 crargs = *(struct segvn_crargs *)argsp;
1788                 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1789                 if (error != 0) {
1790                         AS_LOCK_EXIT(as, &as->a_lock);
1791                         if (unmap) {
1792                                 (void) as_unmap(as, addr, size);
1793                         }
1794                         return (error);
1795                 }
1796         } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1797                 crargs = *(struct segvn_crargs *)argsp;
1798                 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1799                 if (error != 0) {
1800                         AS_LOCK_EXIT(as, &as->a_lock);
1801                         if (unmap) {
1802                                 (void) as_unmap(as, addr, size);
1803                         }
1804                         return (error);
1805                 }
1806         } else {
1807                 seg = seg_alloc(as, addr, size);
1808                 if (seg == NULL) {
1809                         AS_LOCK_EXIT(as, &as->a_lock);
1810                         return (ENOMEM);
1811                 }
1812 
1813                 error = (*crfp)(seg, argsp);
1814                 if (error != 0) {
1815                         seg_free(seg);
1816                         AS_LOCK_EXIT(as, &as->a_lock);
1817                         return (error);
1818                 }
1819                 /*
1820                  * Add size now so as_unmap will work if as_ctl fails.
1821                  */
1822                 as->a_size += rsize;
1823                 as->a_resvsize += rsize;
1824         }
1825 
1826         as_setwatch(as);
1827 
1828         /*
1829          * If the address space is locked,
1830          * establish memory locks for the new segment.
1831          */
1832         mutex_enter(&as->a_contents);
1833         if (AS_ISPGLCK(as)) {
1834                 mutex_exit(&as->a_contents);
1835                 AS_LOCK_EXIT(as, &as->a_lock);
1836                 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1837                 if (error != 0)
1838                         (void) as_unmap(as, addr, size);
1839         } else {
1840                 mutex_exit(&as->a_contents);
1841                 AS_LOCK_EXIT(as, &as->a_lock);
1842         }
1843         return (error);
1844 }
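
/*
 * Illustrative sketch (not part of this file): zfod mmap-style callers
 * pass segvn_create and a segvn_crargs describing the mapping, along
 * the lines of:
 *
 *	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 *
 *	error = as_map(p->p_as, addr, len, segvn_create, &crargs);
 */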
1845 
1846 
1847 /*
1848  * Delete all segments in the address space marked with S_PURGE.
1849  * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1850  * These segments are deleted as a first step before calls to as_gap(), so
1851  * that they don't affect mmap() or shmat().
1852  */
1853 void
1854 as_purge(struct as *as)
1855 {
1856         struct seg *seg;
1857         struct seg *next_seg;
1858 
1859         /*
1860          * The setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1861          * there is no need to grab the a_contents mutex for this check.
1862          */
1863         if ((as->a_flags & AS_NEEDSPURGE) == 0)
1864                 return;
1865 
1866         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1867         next_seg = NULL;
1868         seg = AS_SEGFIRST(as);
1869         while (seg != NULL) {
1870                 next_seg = AS_SEGNEXT(as, seg);
1871                 if (seg->s_flags & S_PURGE)
1872                         SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1873                 seg = next_seg;
1874         }
1875         AS_LOCK_EXIT(as, &as->a_lock);
1876 
1877         mutex_enter(&as->a_contents);
1878         as->a_flags &= ~AS_NEEDSPURGE;
1879         mutex_exit(&as->a_contents);
1880 }
1881 
1882 /*
1883  * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1884  * range of addresses at least "minlen" long, where the base of the range is
1885  * at "off" phase from an "align" boundary and there is space for a
1886  * "redzone"-sized redzone on either side of the range.  Thus,
1887  * if align was 4M and off was 16k, the user wants a hole which will start
1888  * 16k into a 4M page.
1889  *
1890  * If flags specifies AH_HI, the hole will have the highest possible address
1891  * in the range.  We use the as->a_lastgap field to figure out where to
1892  * start looking for a gap.
1893  *
1894  * Otherwise, the gap will have the lowest possible address.
1895  *
1896  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1897  *
1898  * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1899  * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1900  *
1901  * NOTE: This routine is not correct when base+len overflows caddr_t.
1902  */
1903 int
1904 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1905     uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1906 {
1907         caddr_t lobound = *basep;
1908         caddr_t hibound = lobound + *lenp;
1909         struct seg *lseg, *hseg;
1910         caddr_t lo, hi;
1911         int forward;
1912         caddr_t save_base;
1913         size_t save_len;
1914         size_t save_minlen;
1915         size_t save_redzone;
1916         int fast_path = 1;
1917 
1918         save_base = *basep;
1919         save_len = *lenp;
1920         save_minlen = minlen;
1921         save_redzone = redzone;
1922 
1923         /*
1924          * For the first pass/fast_path, just add align and redzone into
1925          * minlen since if we get an allocation, we can guarantee that it
1926          * will fit the alignment and redzone requested.
1927          * This increases the chance that hibound will be adjusted to
1928          * a_lastgap->s_base, which will likely allow us to find an
1929          * acceptable hole in the address space more quickly.
1930          * If we can't find a hole with this fast_path, then we look for
1931          * smaller holes in which the alignment and offset may allow
1932          * the allocation to fit.
1933          */
1934         minlen += align;
1935         minlen += 2 * redzone;
1936         redzone = 0;
1937 
1938         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1939         if (AS_SEGFIRST(as) == NULL) {
1940                 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1941                     align, redzone, off)) {
1942                         AS_LOCK_EXIT(as, &as->a_lock);
1943                         return (0);
1944                 } else {
1945                         AS_LOCK_EXIT(as, &as->a_lock);
1946                         *basep = save_base;
1947                         *lenp = save_len;
1948                         return (-1);
1949                 }
1950         }
1951 
1952 retry:
1953         /*
1954          * Set up to iterate over all the inter-segment holes in the given
1955          * direction.  lseg is NULL for the lowest-addressed hole and hseg is
1956          * NULL for the highest-addressed hole.  If moving backwards, we start
1957          * the walk from the highest-addressed segment below hibound.
1958          */
1959         forward = (flags & AH_DIR) == AH_LO;
1960         if (forward) {
1961                 hseg = as_findseg(as, lobound, 1);
1962                 lseg = AS_SEGPREV(as, hseg);
1963         } else {
1964 
1965                 /*
1966                  * If allocating at least as much as the last allocation,
1967                  * use a_lastgap's base as a better estimate of hibound.
1968                  */
1969                 if (as->a_lastgap &&
1970                     minlen >= as->a_lastgap->s_size &&
1971                     hibound >= as->a_lastgap->s_base)
1972                         hibound = as->a_lastgap->s_base;
1973 
1974                 hseg = as_findseg(as, hibound, 1);
1975                 if (hseg->s_base + hseg->s_size < hibound) {
1976                         lseg = hseg;
1977                         hseg = NULL;
1978                 } else {
1979                         lseg = AS_SEGPREV(as, hseg);
1980                 }
1981         }
1982 
1983         for (;;) {
1984                 /*
1985                  * Set lo and hi to the hole's boundaries.  (We should really
1986                  * use MAXADDR in place of hibound in the expression below,
1987                  * but can't express it easily; using hibound in its place is
1988                  * harmless.)
1989                  */
1990                 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1991                 hi = (hseg == NULL) ? hibound : hseg->s_base;
1992                 /*
1993                  * If the iteration has moved past the interval from lobound
1994                  * to hibound it's pointless to continue.
1995                  */
1996                 if ((forward && lo > hibound) || (!forward && hi < lobound))
1997                         break;
1998                 else if (lo > hibound || hi < lobound)
1999                         goto cont;
2000                 /*
2001                  * Candidate hole lies at least partially within the allowable
2002                  * range.  Restrict it to fall completely within that range,
2003                  * i.e., to [max(lo, lobound), min(hi, hibound)].
2004                  */
2005                 if (lo < lobound)
2006                         lo = lobound;
2007                 if (hi > hibound)
2008                         hi = hibound;
2009                 /*
2010                  * Verify that the candidate hole is big enough and meets
2011                  * hardware constraints.  If the hole is too small, no need
2012                  * to do the further checks since they will fail.
2013                  */
2014                 *basep = lo;
2015                 *lenp = hi - lo;
2016                 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
2017                     minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
2018                     ((flags & AH_CONTAIN) == 0 ||
2019                     (*basep <= addr && *basep + *lenp > addr))) {
2020                         if (!forward)
2021                                 as->a_lastgap = hseg;
2022                         if (hseg != NULL)
2023                                 as->a_lastgaphl = hseg;
2024                         else
2025                                 as->a_lastgaphl = lseg;
2026                         AS_LOCK_EXIT(as, &as->a_lock);
2027                         return (0);
2028                 }
2029         cont:
2030                 /*
2031                  * Move to the next hole.
2032                  */
2033                 if (forward) {
2034                         lseg = hseg;
2035                         if (lseg == NULL)
2036                                 break;
2037                         hseg = AS_SEGNEXT(as, hseg);
2038                 } else {
2039                         hseg = lseg;
2040                         if (hseg == NULL)
2041                                 break;
2042                         lseg = AS_SEGPREV(as, lseg);
2043                 }
2044         }
2045         if (fast_path && (align != 0 || save_redzone != 0)) {
2046                 fast_path = 0;
2047                 minlen = save_minlen;
2048                 redzone = save_redzone;
2049                 goto retry;
2050         }
2051         *basep = save_base;
2052         *lenp = save_len;
2053         AS_LOCK_EXIT(as, &as->a_lock);
2054         return (-1);
2055 }
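
/*
 * Worked example of the fast path above, with hypothetical numbers: for
 * minlen = 64K, align = 4M, off = 16K and redzone = 8K, the first pass
 * searches with minlen = 64K + 4M + 16K and redzone = 0; any hole that
 * large can be positioned 16K past a 4M boundary with 8K to spare on
 * both sides.  Only if that fails do we retry with the caller's exact
 * minlen and redzone and let valid_va_range_aligned() do the fitting.
 */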
2056 
2057 /*
2058  * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2059  *
2060  * If flags specifies AH_HI, the hole will have the highest possible address
2061  * in the range.  We use the as->a_lastgap field to figure out where to
2062  * start looking for a gap.
2063  *
2064  * Otherwise, the gap will have the lowest possible address.
2065  *
2066  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2067  *
2068  * If an adequate hole is found, base and len are set to reflect the part of
2069  * the hole that is within range, and 0 is returned, otherwise,
2070  * -1 is returned.
2071  *
2072  * NOTE: This routine is not correct when base+len overflows caddr_t.
2073  */
2074 int
2075 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2076     caddr_t addr)
2077 {
2078 
2079         return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2080 }
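
/*
 * Illustrative sketch (not part of this file): address-selection code
 * typically asks for the highest hole below some upper bound; the name
 * "userlimit" and the final placement are hypothetical:
 *
 *	caddr_t base = 0;
 *	size_t len = (size_t)userlimit;
 *
 *	if (as_gap(as, minlen, &base, &len, AH_HI, NULL) == 0)
 *		addr = base + len - minlen;
 */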
2081 
2082 /*
2083  * Return the next range within [base, base + len) that is backed
2084  * with "real memory".  Skip holes and non-seg_vn segments.
2085  * We're lazy and only return one segment at a time.
2086  */
2087 int
2088 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2089 {
2090         extern struct seg_ops segspt_shmops;    /* needs a header file */
2091         struct seg *seg;
2092         caddr_t addr, eaddr;
2093         caddr_t segend;
2094 
2095         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2096 
2097         addr = *basep;
2098         eaddr = addr + *lenp;
2099 
2100         seg = as_findseg(as, addr, 0);
2101         if (seg != NULL)
2102                 addr = MAX(seg->s_base, addr);
2103 
2104         for (;;) {
2105                 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2106                         AS_LOCK_EXIT(as, &as->a_lock);
2107                         return (EINVAL);
2108                 }
2109 
2110                 if (seg->s_ops == &segvn_ops) {
2111                         segend = seg->s_base + seg->s_size;
2112                         break;
2113                 }
2114 
2115                 /*
2116                  * We do ISM by looking into the private data
2117                  * to determine the real size of the segment.
2118                  */
2119                 if (seg->s_ops == &segspt_shmops) {
2120                         segend = seg->s_base + spt_realsize(seg);
2121                         if (addr < segend)
2122                                 break;
2123                 }
2124 
2125                 seg = AS_SEGNEXT(as, seg);
2126 
2127                 if (seg != NULL)
2128                         addr = seg->s_base;
2129         }
2130 
2131         *basep = addr;
2132 
2133         if (segend > eaddr)
2134                 *lenp = eaddr - addr;
2135         else
2136                 *lenp = segend - addr;
2137 
2138         AS_LOCK_EXIT(as, &as->a_lock);
2139         return (0);
2140 }
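
/*
 * Illustrative sketch (not part of this file): since as_memory() returns
 * one range at a time, callers walk an interval by advancing past each
 * returned range; "start" and "end" are hypothetical bounds:
 *
 *	caddr_t base = start;
 *	size_t len = end - start;
 *
 *	while (as_memory(as, &base, &len) == 0) {
 *		... operate on [base, base + len) ...
 *		base += len;
 *		len = end - base;
 *	}
 */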
2141 
2142 /*
2143  * Swap the pages associated with the address space as out to
2144  * secondary storage, returning the number of bytes actually
2145  * swapped.
2146  *
2147  * The value returned is intended to correlate well with the process's
2148  * memory requirements.  Its usefulness for this purpose depends on
2149  * how well the segment-level routines do at returning accurate
2150  * information.
2151  */
2152 size_t
2153 as_swapout(struct as *as)
2154 {
2155         struct seg *seg;
2156         size_t swpcnt = 0;
2157 
2158         /*
2159          * Kernel-only processes have given up their address
2160          * spaces.  Of course, we shouldn't be attempting to
2161          * swap out such processes in the first place...
2162          */
2163         if (as == NULL)
2164                 return (0);
2165 
2166         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2167 
2168         /* Prevent XHATs from attaching */
2169         mutex_enter(&as->a_contents);
2170         AS_SETBUSY(as);
2171         mutex_exit(&as->a_contents);
2172 
2173 
2174         /*
2175          * Free all mapping resources associated with the address
2176          * space.  The segment-level swapout routines capitalize
2177          * on this unmapping by scavenging pages that have become
2178          * unmapped here.
2179          */
2180         hat_swapout(as->a_hat);
2181         if (as->a_xhat != NULL)
2182                 xhat_swapout_all(as);
2183 
2184         mutex_enter(&as->a_contents);
2185         AS_CLRBUSY(as);
2186         mutex_exit(&as->a_contents);
2187 
2188         /*
2189          * Call the swapout routines of all segments in the address
2190          * space to do the actual work, accumulating the amount of
2191          * space reclaimed.
2192          */
2193         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2194                 struct seg_ops *ov = seg->s_ops;
2195 
2196                 /*
2197                  * We have to check to see if the seg has
2198                  * an ops vector because the seg may have
2199                  * been in the middle of being set up when
2200                  * the process was picked for swapout.
2201                  */
2202                 if ((ov != NULL) && (ov->swapout != NULL))
2203                         swpcnt += SEGOP_SWAPOUT(seg);
2204         }
2205         AS_LOCK_EXIT(as, &as->a_lock);
2206         return (swpcnt);
2207 }
2208 
2209 /*
2210  * Determine whether data from the mappings in interval [addr, addr + size)
2211  * are in the primary memory (core) cache.
2212  */
2213 int
2214 as_incore(struct as *as, caddr_t addr,
2215     size_t size, char *vec, size_t *sizep)
2216 {
2217         struct seg *seg;
2218         size_t ssize;
2219         caddr_t raddr;          /* rounded down addr */
2220         size_t rsize;           /* rounded up size */
2221         size_t isize;                   /* iteration size */
2222         int error = 0;          /* result, assume success */
2223 
2224         *sizep = 0;
2225         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2226         rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2227             (size_t)raddr;
2228 
2229         if (raddr + rsize < raddr)           /* check for wraparound */
2230                 return (ENOMEM);
2231 
2232         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2233         seg = as_segat(as, raddr);
2234         if (seg == NULL) {
2235                 AS_LOCK_EXIT(as, &as->a_lock);
2236                 return (-1);
2237         }
2238 
2239         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2240                 if (raddr >= seg->s_base + seg->s_size) {
2241                         seg = AS_SEGNEXT(as, seg);
2242                         if (seg == NULL || raddr != seg->s_base) {
2243                                 error = -1;
2244                                 break;
2245                         }
2246                 }
2247                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2248                         ssize = seg->s_base + seg->s_size - raddr;
2249                 else
2250                         ssize = rsize;
2251                 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2252                 if (isize != ssize) {
2253                         error = -1;
2254                         break;
2255                 }
2256                 vec += btopr(ssize);
2257         }
2258         AS_LOCK_EXIT(as, &as->a_lock);
2259         return (error);
2260 }
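
/*
 * Illustrative sketch (not part of this file): mincore()-style callers
 * size "vec" at one char per page and inspect the per-page flags the
 * segment drivers deposit there (e.g. SEG_PAGE_INCORE); the names below
 * are hypothetical:
 *
 *	char *vec = kmem_alloc(btopr(len), KM_SLEEP);
 *	size_t incore_bytes;
 *
 *	if (as_incore(as, uaddr, len, vec, &incore_bytes) == 0)
 *		... vec[i] & SEG_PAGE_INCORE means page i is resident ...
 */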
2261 
2262 static void
2263 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2264         ulong_t *bitmap, size_t position, size_t npages)
2265 {
2266         caddr_t range_start;
2267         size_t  pos1 = position;
2268         size_t  pos2;
2269         size_t  size;
2270         size_t  end_pos = npages + position;
2271 
2272         while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2273                 size = ptob((pos2 - pos1));
2274                 range_start = (caddr_t)((uintptr_t)addr +
2275                     ptob(pos1 - position));
2276 
2277                 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2278                     (ulong_t *)NULL, (size_t)NULL);
2279                 pos1 = pos2;
2280         }
2281 }
2282 
2283 static void
2284 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2285         caddr_t raddr, size_t rsize)
2286 {
2287         struct seg *seg = as_segat(as, raddr);
2288         size_t ssize;
2289 
2290         while (rsize != 0) {
2291                 if (raddr >= seg->s_base + seg->s_size)
2292                         seg = AS_SEGNEXT(as, seg);
2293 
2294                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2295                         ssize = seg->s_base + seg->s_size - raddr;
2296                 else
2297                         ssize = rsize;
2298 
2299                 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2300 
2301                 rsize -= ssize;
2302                 raddr += ssize;
2303         }
2304 }
2305 
2306 /*
2307  * Cache control operations over the interval [addr, addr + size) in
2308  * address space "as".
2309  */
2310 /*ARGSUSED*/
2311 int
2312 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2313     uintptr_t arg, ulong_t *lock_map, size_t pos)
2314 {
2315         struct seg *seg;        /* working segment */
2316         caddr_t raddr;          /* rounded down addr */
2317         caddr_t initraddr;      /* saved initial rounded down addr */
2318         size_t rsize;           /* rounded up size */
2319         size_t initrsize;       /* saved initial rounded up size */
2320         size_t ssize;           /* size of seg */
2321         int error = 0;                  /* result */
2322         size_t mlock_size;      /* size of bitmap */
2323         ulong_t *mlock_map;     /* pointer to bitmap used */
2324                                 /* to represent the locked */
2325                                 /* pages. */
2326 retry:
2327         if (error == IE_RETRY)
2328                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2329         else
2330                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2331 
2332         /*
2333          * If these are address space lock/unlock operations, loop over
2334          * all segments in the address space, as appropriate.
2335          */
2336         if (func == MC_LOCKAS) {
2337                 size_t npages, idx;
2338                 size_t rlen = 0;        /* rounded as length */
2339 
2340                 idx = pos;
2341 
2342                 if (arg & MCL_FUTURE) {
2343                         mutex_enter(&as->a_contents);
2344                         AS_SETPGLCK(as);
2345                         mutex_exit(&as->a_contents);
2346                 }
2347                 if ((arg & MCL_CURRENT) == 0) {
2348                         AS_LOCK_EXIT(as, &as->a_lock);
2349                         return (0);
2350                 }
2351 
2352                 seg = AS_SEGFIRST(as);
2353                 if (seg == NULL) {
2354                         AS_LOCK_EXIT(as, &as->a_lock);
2355                         return (0);
2356                 }
2357 
2358                 do {
2359                         raddr = (caddr_t)((uintptr_t)seg->s_base &
2360                             (uintptr_t)PAGEMASK);
2361                         rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2362                             PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2363                 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2364 
2365                 mlock_size = BT_BITOUL(btopr(rlen));
2366                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2367                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2368                                 AS_LOCK_EXIT(as, &as->a_lock);
2369                                 return (EAGAIN);
2370                 }
2371 
2372                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2373                         error = SEGOP_LOCKOP(seg, seg->s_base,
2374                             seg->s_size, attr, MC_LOCK, mlock_map, pos);
2375                         if (error != 0)
2376                                 break;
2377                         pos += seg_pages(seg);
2378                 }
2379 
2380                 if (error) {
2381                         for (seg = AS_SEGFIRST(as); seg != NULL;
2382                             seg = AS_SEGNEXT(as, seg)) {
2383 
2384                                 raddr = (caddr_t)((uintptr_t)seg->s_base &
2385                                     (uintptr_t)PAGEMASK);
2386                                 npages = seg_pages(seg);
2387                                 as_segunlock(seg, raddr, attr, mlock_map,
2388                                     idx, npages);
2389                                 idx += npages;
2390                         }
2391                 }
2392 
2393                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2394                 AS_LOCK_EXIT(as, &as->a_lock);
2395                 goto lockerr;
2396         } else if (func == MC_UNLOCKAS) {
2397                 mutex_enter(&as->a_contents);
2398                 AS_CLRPGLCK(as);
2399                 mutex_exit(&as->a_contents);
2400 
2401                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2402                         error = SEGOP_LOCKOP(seg, seg->s_base,
2403                             seg->s_size, attr, MC_UNLOCK, NULL, 0);
2404                         if (error != 0)
2405                                 break;
2406                 }
2407 
2408                 AS_LOCK_EXIT(as, &as->a_lock);
2409                 goto lockerr;
2410         }
2411 
2412         /*
2413          * Normalize addresses and sizes.
2414          */
2415         initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2416         initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2417             (size_t)raddr;
2418 
2419         if (raddr + rsize < raddr) {         /* check for wraparound */
2420                 AS_LOCK_EXIT(as, &as->a_lock);
2421                 return (ENOMEM);
2422         }
2423 
2424         /*
2425          * Get initial segment.
2426          */
2427         if ((seg = as_segat(as, raddr)) == NULL) {
2428                 AS_LOCK_EXIT(as, &as->a_lock);
2429                 return (ENOMEM);
2430         }
2431 
2432         if (func == MC_LOCK) {
2433                 mlock_size = BT_BITOUL(btopr(rsize));
2434                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2435                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2436                                 AS_LOCK_EXIT(as, &as->a_lock);
2437                                 return (EAGAIN);
2438                 }
2439         }
2440 
2441         /*
2442          * Loop over all segments.  If a hole in the address range is
2443          * discovered, then fail.  For each segment, perform the appropriate
2444          * control operation.
2445          */
2446         while (rsize != 0) {
2447 
2448                 /*
2449                  * Make sure there's no hole, calculate the portion
2450                  * of the next segment to be operated over.
2451                  */
2452                 if (raddr >= seg->s_base + seg->s_size) {
2453                         seg = AS_SEGNEXT(as, seg);
2454                         if (seg == NULL || raddr != seg->s_base) {
2455                                 if (func == MC_LOCK) {
2456                                         as_unlockerr(as, attr, mlock_map,
2457                                             initraddr, initrsize - rsize);
2458                                         kmem_free(mlock_map,
2459                                             mlock_size * sizeof (ulong_t));
2460                                 }
2461                                 AS_LOCK_EXIT(as, &as->a_lock);
2462                                 return (ENOMEM);
2463                         }
2464                 }
2465                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2466                         ssize = seg->s_base + seg->s_size - raddr;
2467                 else
2468                         ssize = rsize;
2469 
2470                 /*
2471                  * Dispatch on specific function.
2472                  */
2473                 switch (func) {
2474 
2475                 /*
2476                  * Synchronize cached data from mappings with backing
2477                  * objects.
2478                  */
2479                 case MC_SYNC:
2480                         if (error = SEGOP_SYNC(seg, raddr, ssize,
2481                             attr, (uint_t)arg)) {
2482                                 AS_LOCK_EXIT(as, &as->a_lock);
2483                                 return (error);
2484                         }
2485                         break;
2486 
2487                 /*
2488                  * Lock pages in memory.
2489                  */
2490                 case MC_LOCK:
2491                         if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2492                             attr, func, mlock_map, pos)) {
2493                                 as_unlockerr(as, attr, mlock_map, initraddr,
2494                                     initrsize - rsize + ssize);
2495                                 kmem_free(mlock_map, mlock_size *
2496                                     sizeof (ulong_t));
2497                                 AS_LOCK_EXIT(as, &as->a_lock);
2498                                 goto lockerr;
2499                         }
2500                         break;
2501 
2502                 /*
2503                  * Unlock mapped pages.
2504                  */
2505                 case MC_UNLOCK:
2506                         (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2507                             (ulong_t *)NULL, (size_t)NULL);
2508                         break;
2509 
2510                 /*
2511                  * Store VM advise for mapped pages in segment layer.
2512                  */
2513                 case MC_ADVISE:
2514                         error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2515 
2516                         /*
2517                          * Check for regular errors and special retry error
2518                          */
2519                         if (error) {
2520                                 if (error == IE_RETRY) {
2521                                         /*
2522                                          * Need to acquire writers lock, so
2523                                          * have to drop readers lock and start
2524                                          * all over again
2525                                          */
2526                                         AS_LOCK_EXIT(as, &as->a_lock);
2527                                         goto retry;
2528                                 } else if (error == IE_REATTACH) {
2529                                         /*
2530                                          * Find segment for current address
2531                                          * because current segment just got
2532                                          * split or concatenated
2533                                          */
2534                                         seg = as_segat(as, raddr);
2535                                         if (seg == NULL) {
2536                                                 AS_LOCK_EXIT(as, &as->a_lock);
2537                                                 return (ENOMEM);
2538                                         }
2539                                 } else {
2540                                         /*
2541                                          * Regular error
2542                                          */
2543                                         AS_LOCK_EXIT(as, &as->a_lock);
2544                                         return (error);
2545                                 }
2546                         }
2547                         break;
2548 
2549                 /*
2550                  * Can't happen.
2551                  */
2552                 default:
2553                         panic("as_ctl: bad operation %d", func);
2554                         /*NOTREACHED*/
2555                 }
2556 
2557                 rsize -= ssize;
2558                 raddr += ssize;
2559         }
2560 
2561         if (func == MC_LOCK)
2562                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2563         AS_LOCK_EXIT(as, &as->a_lock);
2564         return (0);
2565 lockerr:
2566 
2567         /*
2568          * If the lower levels returned EDEADLK for a segment lockop,
2569          * it means that we should retry the operation.  Let's wait
2570          * a bit also to let the deadlock causing condition clear.
2571          * This is part of a gross hack to work around a design flaw
2572          * in the ufs/sds logging code and should go away when the
2573          * logging code is re-designed to fix the problem. See bug
2574          * 4125102 for details of the problem.
2575          */
2576         if (error == EDEADLK) {
2577                 delay(deadlk_wait);
2578                 error = 0;
2579                 goto retry;
2580         }
2581         return (error);
2582 }
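
/*
 * Illustrative sketch (not part of this file): the mlockall() path
 * reaches the MC_LOCKAS branch above for the whole address space,
 * roughly as:
 *
 *	error = as_ctl(as, NULL, 0, MC_LOCKAS, 0,
 *	    MCL_CURRENT | MCL_FUTURE, NULL, 0);
 */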
2583 
2584 int
2585 fc_decode(faultcode_t fault_err)
2586 {
2587         int error = 0;
2588 
2589         switch (FC_CODE(fault_err)) {
2590         case FC_OBJERR:
2591                 error = FC_ERRNO(fault_err);
2592                 break;
2593         case FC_PROT:
2594                 error = EACCES;
2595                 break;
2596         default:
2597                 error = EFAULT;
2598                 break;
2599         }
2600         return (error);
2601 }
2602 
2603 /*
2604  * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
2605  * lists from each segment and copy them to one contiguous shadow list (plist)
2606  * as expected by the caller.  Save pointers to per segment shadow lists at
2607  * the tail of plist so that they can be used during as_pageunlock().
2608  */
2609 static int
2610 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2611     caddr_t addr, size_t size, enum seg_rw rw)
2612 {
2613         caddr_t sv_addr = addr;
2614         size_t sv_size = size;
2615         struct seg *sv_seg = seg;
2616         ulong_t segcnt = 1;
2617         ulong_t cnt;
2618         size_t ssize;
2619         pgcnt_t npages = btop(size);
2620         page_t **plist;
2621         page_t **pl;
2622         int error;
2623         caddr_t eaddr;
2624         faultcode_t fault_err = 0;
2625         pgcnt_t pl_off;
2626         extern struct seg_ops segspt_shmops;
2627 
2628         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2629         ASSERT(seg != NULL);
2630         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2631         ASSERT(addr + size > seg->s_base + seg->s_size);
2632         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2633         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2634 
2635         /*
2636          * Count the number of segments covered by the range we are about to
2637          * lock. The segment count is used to size the shadow list we return
2638          * back to the caller.
2639          */
2640         for (; size != 0; size -= ssize, addr += ssize) {
2641                 if (addr >= seg->s_base + seg->s_size) {
2642 
2643                         seg = AS_SEGNEXT(as, seg);
2644                         if (seg == NULL || addr != seg->s_base) {
2645                                 AS_LOCK_EXIT(as, &as->a_lock);
2646                                 return (EFAULT);
2647                         }
2648                         /*
2649                          * Do a quick check if subsequent segments
2650                          * will most likely support pagelock.
2651                          */
2652                         if (seg->s_ops == &segvn_ops) {
2653                                 vnode_t *vp;
2654 
2655                                 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2656                                     vp != NULL) {
2657                                         AS_LOCK_EXIT(as, &as->a_lock);
2658                                         goto slow;
2659                                 }
2660                         } else if (seg->s_ops != &segspt_shmops) {
2661                                 AS_LOCK_EXIT(as, &as->a_lock);
2662                                 goto slow;
2663                         }
2664                         segcnt++;
2665                 }
2666                 if (addr + size > seg->s_base + seg->s_size) {
2667                         ssize = seg->s_base + seg->s_size - addr;
2668                 } else {
2669                         ssize = size;
2670                 }
2671         }
2672         ASSERT(segcnt > 1);
2673 
2674         plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2675 
2676         addr = sv_addr;
2677         size = sv_size;
2678         seg = sv_seg;
2679 
2680         for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2681                 if (addr >= seg->s_base + seg->s_size) {
2682                         seg = AS_SEGNEXT(as, seg);
2683                         ASSERT(seg != NULL && addr == seg->s_base);
2684                         cnt++;
2685                         ASSERT(cnt < segcnt);
2686                 }
2687                 if (addr + size > seg->s_base + seg->s_size) {
2688                         ssize = seg->s_base + seg->s_size - addr;
2689                 } else {
2690                         ssize = size;
2691                 }
2692                 pl = &plist[npages + cnt];
2693                 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2694                     L_PAGELOCK, rw);
2695                 if (error) {
2696                         break;
2697                 }
2698                 ASSERT(plist[npages + cnt] != NULL);
2699                 ASSERT(pl_off + btop(ssize) <= npages);
2700                 bcopy(plist[npages + cnt], &plist[pl_off],
2701                     btop(ssize) * sizeof (page_t *));
2702                 pl_off += btop(ssize);
2703         }
2704 
2705         if (size == 0) {
2706                 AS_LOCK_EXIT(as, &as->a_lock);
2707                 ASSERT(cnt == segcnt - 1);
2708                 *ppp = plist;
2709                 return (0);
2710         }
2711 
2712         /*
2713          * One of the pagelock calls failed; the error code is in "error".
2714          * Unlock what we've locked so far and retry with F_SOFTLOCK if the
2715          * error is either EFAULT or ENOTSUP.  Otherwise just return the
2716          * error back to the caller.
2717          */
2718 
2719         eaddr = addr;
2720         seg = sv_seg;
2721 
2722         for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2723                 if (addr >= seg->s_base + seg->s_size) {
2724                         seg = AS_SEGNEXT(as, seg);
2725                         ASSERT(seg != NULL && addr == seg->s_base);
2726                         cnt++;
2727                         ASSERT(cnt < segcnt);
2728                 }
2729                 if (eaddr > seg->s_base + seg->s_size) {
2730                         ssize = seg->s_base + seg->s_size - addr;
2731                 } else {
2732                         ssize = eaddr - addr;
2733                 }
2734                 pl = &plist[npages + cnt];
2735                 ASSERT(*pl != NULL);
2736                 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2737                     L_PAGEUNLOCK, rw);
2738         }
2739 
2740         AS_LOCK_EXIT(as, &as->a_lock);
2741 
2742         kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2743 
2744         if (error != ENOTSUP && error != EFAULT) {
2745                 return (error);
2746         }
2747 
2748 slow:
2749         /*
2750          * If we are here, it's because pagelock failed due to the need
2751          * to cow-fault in the pages we want to lock.  F_SOFTLOCK will do
2752          * this job, and in the next as_pagelock() call for this address
2753          * range pagelock will hopefully succeed.
2754          */
2755         fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2756         if (fault_err != 0) {
2757                 return (fc_decode(fault_err));
2758         }
2759         *ppp = NULL;
2760 
2761         return (0);
2762 }
2763 
2764 /*
2765  * Lock pages in a given address space and return the shadow list.  If the
2766  * list is NULL, the pages were softlocked and the MMU mapping is locked too.
2767  */
2768 int
2769 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2770     size_t size, enum seg_rw rw)
2771 {
2772         size_t rsize;
2773         caddr_t raddr;
2774         faultcode_t fault_err;
2775         struct seg *seg;
2776         int err;
2777 
2778         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2779             "as_pagelock_start: addr %p size %ld", addr, size);
2780 
2781         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2782         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2783             (size_t)raddr;
2784 
2785         /*
2786          * If the request spans more than one segment,
2787          * as_pagelock_segs() handles it below.
2788          */
2789         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2790 
2791         seg = as_segat(as, raddr);
2792         if (seg == NULL) {
2793                 AS_LOCK_EXIT(as, &as->a_lock);
2794                 return (EFAULT);
2795         }
2796         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2797         if (raddr + rsize > seg->s_base + seg->s_size) {
2798                 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2799         }
2800         if (raddr + rsize <= raddr) {
2801                 AS_LOCK_EXIT(as, &as->a_lock);
2802                 return (EFAULT);
2803         }
2804 
2805         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2806             "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2807 
2808         /*
2809          * try to lock pages and pass back shadow list
2810          */
2811         err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2812 
2813         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2814 
2815         AS_LOCK_EXIT(as, &as->a_lock);
2816 
2817         if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2818                 return (err);
2819         }
2820 
2821         /*
2822          * Use F_SOFTLOCK to lock the pages because pagelock failed either
2823          * due to no pagelock support for this segment or because pages
2824          * need to be cow-faulted in.  If a fault is needed, F_SOFTLOCK
2825          * does the job for this call, and the next as_pagelock() call for
2826          * the same address range will hopefully succeed.
2827          */
2828         fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2829         if (fault_err != 0) {
2830                 return (fc_decode(fault_err));
2831         }
2832         *ppp = NULL;
2833 
2834         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2835         return (0);
2836 }
2837 
2838 /*
2839  * Unlock pages locked by as_pagelock_segs().  Retrieve the per-segment
2840  * shadow lists from the end of plist and call the pageunlock interface
2841  * for each segment.  Drop the as lock and free plist.
2842  */
2843 static void
2844 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2845     struct page **plist, enum seg_rw rw)
2846 {
2847         ulong_t cnt;
2848         caddr_t eaddr = addr + size;
2849         pgcnt_t npages = btop(size);
2850         size_t ssize;
2851         page_t **pl;
2852 
2853         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2854         ASSERT(seg != NULL);
2855         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2856         ASSERT(addr + size > seg->s_base + seg->s_size);
2857         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2858         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2859         ASSERT(plist != NULL);
2860 
2861         for (cnt = 0; addr < eaddr; addr += ssize) {
2862                 if (addr >= seg->s_base + seg->s_size) {
2863                         seg = AS_SEGNEXT(as, seg);
2864                         ASSERT(seg != NULL && addr == seg->s_base);
2865                         cnt++;
2866                 }
2867                 if (eaddr > seg->s_base + seg->s_size) {
2868                         ssize = seg->s_base + seg->s_size - addr;
2869                 } else {
2870                         ssize = eaddr - addr;
2871                 }
2872                 pl = &plist[npages + cnt];
2873                 ASSERT(*pl != NULL);
2874                 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2875                     L_PAGEUNLOCK, rw);
2876         }
2877         ASSERT(cnt > 0);
2878         AS_LOCK_EXIT(as, &as->a_lock);
2879 
2880         cnt++;
2881         kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2882 }
2883 
2884 /*
2885  * unlock pages in a given address range
2886  */
2887 void
2888 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2889     enum seg_rw rw)
2890 {
2891         struct seg *seg;
2892         size_t rsize;
2893         caddr_t raddr;
2894 
2895         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2896             "as_pageunlock_start: addr %p size %ld", addr, size);
2897 
2898         /*
2899          * If the shadow list is NULL, as_pagelock() fell back
2900          * to as_fault(); undo the lock with F_SOFTUNLOCK.
2901          */
2902         if (pp == NULL) {
2903                 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2904                 return;
2905         }
2906 
2907         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2908         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2909             (size_t)raddr;
2910 
2911         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2912         seg = as_segat(as, raddr);
2913         ASSERT(seg != NULL);
2914 
2915         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2916             "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2917 
2918         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2919         if (raddr + rsize <= seg->s_base + seg->s_size) {
2920                 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2921         } else {
2922                 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2923                 return;
2924         }
2925         AS_LOCK_EXIT(as, &as->a_lock);
2926         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2927 }
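
     /*
      * Usage sketch (illustrative, not compiled into the kernel): a
      * driver-style caller locks down a user range, performs its I/O
      * against the locked pages, and then unlocks the range.  The
      * function name, arguments, and the AS_PAGELOCK_EXAMPLE guard are
      * hypothetical; only as_pagelock()/as_pageunlock() come from this
      * file.
      */
     #ifdef AS_PAGELOCK_EXAMPLE
     static int
     example_as_io(struct as *as, caddr_t uaddr, size_t len, enum seg_rw rw)
     {
             struct page **pplist;
             int error;

             error = as_pagelock(as, &pplist, uaddr, len, rw);
             if (error != 0)
                     return (error);

             /*
              * A NULL shadow list means the range was softlocked via
              * as_fault(); as_pageunlock() handles both cases.
              */

             /* ... perform the I/O on [uaddr, uaddr + len) ... */

             as_pageunlock(as, pplist, uaddr, len, rw);
             return (0);
     }
     #endif /* AS_PAGELOCK_EXAMPLE */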
2928 
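     /*
      * Set the page size code (szc) for the mappings covering
      * [addr, addr + size).  Both addr and size must be aligned to the
      * new page size or EINVAL is returned; unmapped or wrapping ranges
      * return ENOMEM.  If wait is B_TRUE, an EAGAIN caused by locked-down
      * memory is handled by waiting for the memory to be unlocked and
      * retrying.
      */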
2929 int
2930 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2931     boolean_t wait)
2932 {
2933         struct seg *seg;
2934         size_t ssize;
2935         caddr_t raddr;                  /* rounded down addr */
2936         size_t rsize;                   /* rounded up size */
2937         int error = 0;
2938         size_t pgsz = page_get_pagesize(szc);
2939 
2940 setpgsz_top:
2941         if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2942                 return (EINVAL);
2943         }
2944 
2945         raddr = addr;
2946         rsize = size;
2947 
2948         if (raddr + rsize < raddr)           /* check for wraparound */
2949                 return (ENOMEM);
2950 
2951         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2952         as_clearwatchprot(as, raddr, rsize);
2953         seg = as_segat(as, raddr);
2954         if (seg == NULL) {
2955                 as_setwatch(as);
2956                 AS_LOCK_EXIT(as, &as->a_lock);
2957                 return (ENOMEM);
2958         }
2959 
2960         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2961                 if (raddr >= seg->s_base + seg->s_size) {
2962                         seg = AS_SEGNEXT(as, seg);
2963                         if (seg == NULL || raddr != seg->s_base) {
2964                                 error = ENOMEM;
2965                                 break;
2966                         }
2967                 }
2968                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2969                         ssize = seg->s_base + seg->s_size - raddr;
2970                 } else {
2971                         ssize = rsize;
2972                 }
2973 
2974 retry:
2975                 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2976 
2977                 if (error == IE_NOMEM) {
2978                         error = EAGAIN;
2979                         break;
2980                 }
2981 
2982                 if (error == IE_RETRY) {
2983                         AS_LOCK_EXIT(as, &as->a_lock);
2984                         goto setpgsz_top;
2985                 }
2986 
2987                 if (error == ENOTSUP) {
2988                         error = EINVAL;
2989                         break;
2990                 }
2991 
2992                 if (wait && (error == EAGAIN)) {
2993                         /*
2994                          * Memory is currently locked.  It must be unlocked
2995                          * before this operation can succeed through a retry.
2996                          * The possible reasons for locked memory and
2997                          * corresponding strategies for unlocking are:
2998                          * (1) Normal I/O
2999                          *      Wait for a signal that the I/O operation
3000                          *      has completed and the memory is unlocked.
3001                          * (2) Asynchronous I/O
3002                          *      The aio subsystem does not unlock pages when
3003                          *      the I/O is completed. Those pages are unlocked
3004                          *      when the application calls aiowait/aioerror.
3005                          *      So, to prevent blocking forever, cv_broadcast()
3006                          *      is done to wake up aio_cleanup_thread.
3007                          *      Subsequently, segvn_reclaim will be called, and
3008                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
3009                          * (3) Long term page locking
3010                          *      This is not relevant for as_setpagesize()
3011                          *      because we cannot change the page size for
3012                          *      driver memory. The attempt to do so will
3013                          *      fail with a different error than EAGAIN so
3014                          *      there's no need to trigger as callbacks like
3015                          *      as_unmap, as_setprot or as_free would do.
3016                          */
3017                         mutex_enter(&as->a_contents);
3018                         if (!AS_ISNOUNMAPWAIT(as)) {
3019                                 if (AS_ISUNMAPWAIT(as) == 0) {
3020                                         cv_broadcast(&as->a_cv);
3021                                 }
3022                                 AS_SETUNMAPWAIT(as);
3023                                 AS_LOCK_EXIT(as, &as->a_lock);
3024                                 while (AS_ISUNMAPWAIT(as)) {
3025                                         cv_wait(&as->a_cv, &as->a_contents);
3026                                 }
3027                         } else {
3028                                 /*
3029                                  * We may have raced with
3030                                  * segvn_reclaim()/segspt_reclaim().  In this
3031                                  * case clear the nounmapwait flag and retry,
3032                                  * since softlockcnt in this segment may
3033                                  * already be 0.  We don't drop the as writer
3034                                  * lock, so our number of retries without
3035                                  * sleeping should be very small.  See
3036                                  * segvn_reclaim() for more comments.
3037                                  */
3038                                 AS_CLRNOUNMAPWAIT(as);
3039                                 mutex_exit(&as->a_contents);
3040                                 goto retry;
3041                         }
3042                         mutex_exit(&as->a_contents);
3043                         goto setpgsz_top;
3044                 } else if (error != 0) {
3045                         break;
3046                 }
3047         }
3048         as_setwatch(as);
3049         AS_LOCK_EXIT(as, &as->a_lock);
3050         return (error);
3051 }
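
     /*
      * For example (hypothetical values): a caller that wants 4M pages
      * over an aligned range, and is willing to sleep while transiently
      * locked pages are released, could use (assuming page_szc() maps a
      * byte size to a size code on this platform):
      *
      *     error = as_setpagesize(as, addr, size,
      *         page_szc(4 * 1024 * 1024), B_TRUE);
      */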
3052 
3053 /*
3054  * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3055  * in its chunk where s_szc is less than the szc we want to set.
3056  */
3057 static int
3058 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3059     int *retry)
3060 {
3061         struct seg *seg;
3062         size_t ssize;
3063         int error;
3064 
3065         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3066 
3067         seg = as_segat(as, raddr);
3068         if (seg == NULL) {
3069                 panic("as_iset3_default_lpsize: no seg");
3070         }
3071 
3072         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3073                 if (raddr >= seg->s_base + seg->s_size) {
3074                         seg = AS_SEGNEXT(as, seg);
3075                         if (seg == NULL || raddr != seg->s_base) {
3076                                 panic("as_iset3_default_lpsize: as changed");
3077                         }
3078                 }
3079                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3080                         ssize = seg->s_base + seg->s_size - raddr;
3081                 } else {
3082                         ssize = rsize;
3083                 }
3084 
3085                 if (szc > seg->s_szc) {
3086                         error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3087                         /* Only retry on EINVAL segments that have no vnode. */
3088                         if (error == EINVAL) {
3089                                 vnode_t *vp = NULL;
3090                                 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3091                                     (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3092                                     vp == NULL)) {
3093                                         *retry = 1;
3094                                 } else {
3095                                         *retry = 0;
3096                                 }
3097                         }
3098                         if (error) {
3099                                 return (error);
3100                         }
3101                 }
3102         }
3103         return (0);
3104 }
3105 
3106 /*
3107  * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3108  * pagesize on each segment in its range, but if any call fails with
3109  * EINVAL, it reduces the pagesize to the next smaller size in the bitmap
3110  * and retries as_iset3_default_lpsize().  The code retries smaller
3111  * allowed sizes on EINVAL because (a) the anon offset may not match the
3112  * bigger sizes, and (b) it's hard to obtain that offset up front to pass
3113  * to map_pgszcvec().
3114  */
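     /*
      * For example (hypothetical bitmap): if szcvec is 0x0a (szc 1 and
      * szc 3 allowed) and SEGOP_SETPAGESIZE() fails with EINVAL at szc 3
      * on a retryable segment, bit 3 is cleared, leaving 0x02, and the
      * loop retries with szc = highbit(0x02) - 1 = 1.
      */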
3115 static int
3116 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3117     uint_t szcvec)
3118 {
3119         int error;
3120         int retry;
3121 
3122         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3123 
3124         for (;;) {
3125                 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3126                 if (error == EINVAL && retry) {
3127                         szcvec &= ~(1 << szc);
3128                         if (szcvec <= 1) {
3129                                 return (EINVAL);
3130                         }
3131                         szc = highbit(szcvec) - 1;
3132                 } else {
3133                         return (error);
3134                 }
3135         }
3136 }
3137 
3138 /*
3139  * as_iset1_default_lpsize() breaks its chunk into areas where the
3140  * existing segments have a smaller szc than the one we want to set.  For
3141  * each such area, it calls as_iset2_default_lpsize().
3142  */
3143 static int
3144 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3145     uint_t szcvec)
3146 {
3147         struct seg *seg;
3148         size_t ssize;
3149         caddr_t setaddr = raddr;
3150         size_t setsize = 0;
3151         int set;
3152         int error;
3153 
3154         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3155 
3156         seg = as_segat(as, raddr);
3157         if (seg == NULL) {
3158                 panic("as_iset1_default_lpsize: no seg");
3159         }
3160         if (seg->s_szc < szc) {
3161                 set = 1;
3162         } else {
3163                 set = 0;
3164         }
3165 
3166         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3167                 if (raddr >= seg->s_base + seg->s_size) {
3168                         seg = AS_SEGNEXT(as, seg);
3169                         if (seg == NULL || raddr != seg->s_base) {
3170                                 panic("as_iset1_default_lpsize: as changed");
3171                         }
3172                         if (seg->s_szc >= szc && set) {
3173                                 ASSERT(setsize != 0);
3174                                 error = as_iset2_default_lpsize(as,
3175                                     setaddr, setsize, szc, szcvec);
3176                                 if (error) {
3177                                         return (error);
3178                                 }
3179                                 set = 0;
3180                         } else if (seg->s_szc < szc && !set) {
3181                                 setaddr = raddr;
3182                                 setsize = 0;
3183                                 set = 1;
3184                         }
3185                 }
3186                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3187                         ssize = seg->s_base + seg->s_size - raddr;
3188                 } else {
3189                         ssize = rsize;
3190                 }
3191         }
3192         error = 0;
3193         if (set) {
3194                 ASSERT(setsize != 0);
3195                 error = as_iset2_default_lpsize(as, setaddr, setsize,
3196                     szc, szcvec);
3197         }
3198         return (error);
3199 }
3200 
3201 /*
3202  * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3203  * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3204  * chunk to as_iset1_default_lpsize().
3205  */
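     /*
      * Worked example (hypothetical sizes): with 8K base pages, if
      * map_pgszcvec() allows szc 1 (64K) and szc 3 (4M), addr is first
      * rounded up and eaddr rounded down to 64K.  The first loop below
      * then emits a leading chunk at szc 1 up to the first 4M boundary;
      * the second loop emits the large middle chunk at szc 3 and finally
      * the 64K-aligned tail at szc 1 again.
      */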
3206 static int
3207 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3208     int type)
3209 {
3210         int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3211         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3212             flags, rtype, 1);
3213         uint_t szc;
3214         uint_t nszc;
3215         int error;
3216         caddr_t a;
3217         caddr_t eaddr;
3218         size_t segsize;
3219         size_t pgsz;
3220         uint_t save_szcvec;
3221 
3222         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3223         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3224         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3225 
3226         szcvec &= ~1;
3227         if (szcvec <= 1) {   /* only the base pagesize is allowed */
3228                 return (0);
3229         }
3230 
3231         /* Start with the smallest allowed large page size. */
3232         szc = lowbit(szcvec) - 1;
3233         pgsz = page_get_pagesize(szc);
3234         eaddr = addr + size;
3235         addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3236         eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3237 
3238         save_szcvec = szcvec;
3239         szcvec >>= (szc + 1);
3240         nszc = szc;
3241         while (szcvec) {
3242                 if ((szcvec & 0x1) == 0) {
3243                         nszc++;
3244                         szcvec >>= 1;
3245                         continue;
3246                 }
3247                 nszc++;
3248                 pgsz = page_get_pagesize(nszc);
3249                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3250                 if (a != addr) {
3251                         ASSERT(szc > 0);
3252                         ASSERT(a < eaddr);
3253                         segsize = a - addr;
3254                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3255                             save_szcvec);
3256                         if (error) {
3257                                 return (error);
3258                         }
3259                         addr = a;
3260                 }
3261                 szc = nszc;
3262                 szcvec >>= 1;
3263         }
3264 
3265         ASSERT(addr < eaddr);
3266         szcvec = save_szcvec;
3267         while (szcvec) {
3268                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3269                 ASSERT(a >= addr);
3270                 if (a != addr) {
3271                         ASSERT(szc > 0);
3272                         segsize = a - addr;
3273                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3274                             save_szcvec);
3275                         if (error) {
3276                                 return (error);
3277                         }
3278                         addr = a;
3279                 }
3280                 szcvec &= ~(1 << szc);
3281                 if (szcvec) {
3282                         szc = highbit(szcvec) - 1;
3283                         pgsz = page_get_pagesize(szc);
3284                 }
3285         }
3286         ASSERT(addr == eaddr);
3287 
3288         return (0);
3289 }
3290 
3291 /*
3292  * Set the default large page size for the range.  Called via memcntl
3293  * with a page size of 0.  as_set_default_lpsize breaks the range down
3294  * into chunks with the same type/flags, ignores non-segvn segments, and
3295  * passes each chunk to as_iset_default_lpsize().
3296  */
3297 int
3298 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3299 {
3300         struct seg *seg;
3301         caddr_t raddr;
3302         size_t rsize;
3303         size_t ssize;
3304         int rtype, rflags;
3305         int stype, sflags;
3306         int error;
3307         caddr_t setaddr;
3308         size_t setsize;
3309         int segvn;
3310 
3311         if (size == 0)
3312                 return (0);
3313 
3314         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3315 again:
3316         error = 0;
3317 
3318         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3319         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3320             (size_t)raddr;
3321 
3322         if (raddr + rsize < raddr) {         /* check for wraparound */
3323                 AS_LOCK_EXIT(as, &as->a_lock);
3324                 return (ENOMEM);
3325         }
3326         as_clearwatchprot(as, raddr, rsize);
3327         seg = as_segat(as, raddr);
3328         if (seg == NULL) {
3329                 as_setwatch(as);
3330                 AS_LOCK_EXIT(as, &as->a_lock);
3331                 return (ENOMEM);
3332         }
3333         if (seg->s_ops == &segvn_ops) {
3334                 rtype = SEGOP_GETTYPE(seg, addr);
3335                 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3336                 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3337                 segvn = 1;
3338         } else {
3339                 segvn = 0;
3340         }
3341         setaddr = raddr;
3342         setsize = 0;
3343 
3344         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3345                 if (raddr >= (seg->s_base + seg->s_size)) {
3346                         seg = AS_SEGNEXT(as, seg);
3347                         if (seg == NULL || raddr != seg->s_base) {
3348                                 error = ENOMEM;
3349                                 break;
3350                         }
3351                         if (seg->s_ops == &segvn_ops) {
3352                                 stype = SEGOP_GETTYPE(seg, raddr);
3353                                 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3354                                 stype &= (MAP_SHARED | MAP_PRIVATE);
3355                                 if (segvn && (rflags != sflags ||
3356                                     rtype != stype)) {
3357                                         /*
3358                                          * The next segment is also segvn but
3359                                          * has different flags and/or type.
3360                                          */
3361                                         ASSERT(setsize != 0);
3362                                         error = as_iset_default_lpsize(as,
3363                                             setaddr, setsize, rflags, rtype);
3364                                         if (error) {
3365                                                 break;
3366                                         }
3367                                         rflags = sflags;
3368                                         rtype = stype;
3369                                         setaddr = raddr;
3370                                         setsize = 0;
3371                                 } else if (!segvn) {
3372                                         rflags = sflags;
3373                                         rtype = stype;
3374                                         setaddr = raddr;
3375                                         setsize = 0;
3376                                         segvn = 1;
3377                                 }
3378                         } else if (segvn) {
3379                                 /* The next segment is not segvn. */
3380                                 ASSERT(setsize != 0);
3381                                 error = as_iset_default_lpsize(as,
3382                                     setaddr, setsize, rflags, rtype);
3383                                 if (error) {
3384                                         break;
3385                                 }
3386                                 segvn = 0;
3387                         }
3388                 }
3389                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3390                         ssize = seg->s_base + seg->s_size - raddr;
3391                 } else {
3392                         ssize = rsize;
3393                 }
3394         }
3395         if (error == 0 && segvn) {
3396                 /* The last chunk when rsize == 0. */
3397                 ASSERT(setsize != 0);
3398                 error = as_iset_default_lpsize(as, setaddr, setsize,
3399                     rflags, rtype);
3400         }
3401 
3402         if (error == IE_RETRY) {
3403                 goto again;
3404         } else if (error == IE_NOMEM) {
3405                 error = EAGAIN;
3406         } else if (error == ENOTSUP) {
3407                 error = EINVAL;
3408         } else if (error == EAGAIN) {
3409                 mutex_enter(&as->a_contents);
3410                 if (!AS_ISNOUNMAPWAIT(as)) {
3411                         if (AS_ISUNMAPWAIT(as) == 0) {
3412                                 cv_broadcast(&as->a_cv);
3413                         }
3414                         AS_SETUNMAPWAIT(as);
3415                         AS_LOCK_EXIT(as, &as->a_lock);
3416                         while (AS_ISUNMAPWAIT(as)) {
3417                                 cv_wait(&as->a_cv, &as->a_contents);
3418                         }
3419                         mutex_exit(&as->a_contents);
3420                         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3421                 } else {
3422                         /*
3423                          * We may have raced with
3424                          * segvn_reclaim()/segspt_reclaim().  In this case
3425                          * clear the nounmapwait flag and retry, since
3426                          * softlockcnt in this segment may already be 0.
3427                          * We don't drop the as writer lock, so our number
3428                          * of retries without sleeping should be very
3429                          * small.  See segvn_reclaim() for more comments.
3430                          */
3431                         AS_CLRNOUNMAPWAIT(as);
3432                         mutex_exit(&as->a_contents);
3433                 }
3434                 goto again;
3435         }
3436 
3437         as_setwatch(as);
3438         AS_LOCK_EXIT(as, &as->a_lock);
3439         return (error);
3440 }
3441 
3442 /*
3443  * Set up all of the uninitialized watched pages that we can.
3444  */
3445 void
3446 as_setwatch(struct as *as)
3447 {
3448         struct watched_page *pwp;
3449         struct seg *seg;
3450         caddr_t vaddr;
3451         uint_t prot;
3452         int  err, retrycnt;
3453 
3454         if (avl_numnodes(&as->a_wpage) == 0)
3455                 return;
3456 
3457         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3458 
3459         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3460             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3461                 retrycnt = 0;
3462         retry:
3463                 vaddr = pwp->wp_vaddr;
3464                 if (pwp->wp_oprot != 0 ||    /* already set up */
3465                     (seg = as_segat(as, vaddr)) == NULL ||
3466                     SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3467                         continue;
3468 
3469                 pwp->wp_oprot = prot;
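                     /*
                      * A read or exec watchpoint must remove all access
                      * so that any reference traps; a write watchpoint
                      * only needs to make the page unwritable.
                      */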
3470                 if (pwp->wp_read)
3471                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3472                 if (pwp->wp_write)
3473                         prot &= ~PROT_WRITE;
3474                 if (pwp->wp_exec)
3475                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3476                 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3477                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3478                         if (err == IE_RETRY) {
3479                                 pwp->wp_oprot = 0;
3480                                 ASSERT(retrycnt == 0);
3481                                 retrycnt++;
3482                                 goto retry;
3483                         }
3484                 }
3485                 pwp->wp_prot = prot;
3486         }
3487 }
3488 
3489 /*
3490  * Clear all of the watched pages in the address space.
3491  */
3492 void
3493 as_clearwatch(struct as *as)
3494 {
3495         struct watched_page *pwp;
3496         struct seg *seg;
3497         caddr_t vaddr;
3498         uint_t prot;
3499         int err, retrycnt;
3500 
3501         if (avl_numnodes(&as->a_wpage) == 0)
3502                 return;
3503 
3504         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3505 
3506         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3507             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3508                 retrycnt = 0;
3509         retry:
3510                 vaddr = pwp->wp_vaddr;
3511                 if (pwp->wp_oprot == 0 ||    /* not set up */
3512                     (seg = as_segat(as, vaddr)) == NULL)
3513                         continue;
3514 
3515                 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3516                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3517                         if (err == IE_RETRY) {
3518                                 ASSERT(retrycnt == 0);
3519                                 retrycnt++;
3520                                 goto retry;
3521                         }
3522                 }
3523                 pwp->wp_oprot = 0;
3524                 pwp->wp_prot = 0;
3525         }
3526 }
3527 
3528 /*
3529  * Force a new setup for all the watched pages in the range.
3530  */
3531 static void
3532 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3533 {
3534         struct watched_page *pwp;
3535         struct watched_page tpw;
3536         caddr_t eaddr = addr + size;
3537         caddr_t vaddr;
3538         struct seg *seg;
3539         int err, retrycnt;
3540         uint_t  wprot;
3541         avl_index_t where;
3542 
3543         if (avl_numnodes(&as->a_wpage) == 0)
3544                 return;
3545 
3546         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3547 
3548         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3549         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3550                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3551 
3552         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3553                 retrycnt = 0;
3554                 vaddr = pwp->wp_vaddr;
3555 
3556                 wprot = prot;
3557                 if (pwp->wp_read)
3558                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3559                 if (pwp->wp_write)
3560                         wprot &= ~PROT_WRITE;
3561                 if (pwp->wp_exec)
3562                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3563                 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3564                 retry:
3565                         seg = as_segat(as, vaddr);
3566                         if (seg == NULL) {
3567                                 panic("as_setwatchprot: no seg");
3568                                 /*NOTREACHED*/
3569                         }
3570                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3571                         if (err == IE_RETRY) {
3572                                 ASSERT(retrycnt == 0);
3573                                 retrycnt++;
3574                                 goto retry;
3575                         }
3576                 }
3577                 pwp->wp_oprot = prot;
3578                 pwp->wp_prot = wprot;
3579 
3580                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3581         }
3582 }
3583 
3584 /*
3585  * Clear all of the watched pages in the range.
3586  */
3587 static void
3588 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3589 {
3590         caddr_t eaddr = addr + size;
3591         struct watched_page *pwp;
3592         struct watched_page tpw;
3593         uint_t prot;
3594         struct seg *seg;
3595         int err, retrycnt;
3596         avl_index_t where;
3597 
3598         if (avl_numnodes(&as->a_wpage) == 0)
3599                 return;
3600 
3601         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3602         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3603                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3604 
3605         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3606 
3607         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3608 
3609                 if ((prot = pwp->wp_oprot) != 0) {
3610                         retrycnt = 0;
3611 
3612                         if (prot != pwp->wp_prot) {
3613                         retry:
                                     seg = as_segat(as, pwp->wp_vaddr);
                                     if (seg == NULL) {
                                             /* advance first to avoid spinning */
                                             pwp = AVL_NEXT(&as->a_wpage, pwp);
                                             continue;
                                     }
3617                                 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3618                                     PAGESIZE, prot);
3619                                 if (err == IE_RETRY) {
3620                                         ASSERT(retrycnt == 0);
3621                                         retrycnt++;
3622                                         goto retry;
3623 
3624                                 }
3625                         }
3626                         pwp->wp_oprot = 0;
3627                         pwp->wp_prot = 0;
3628                 }
3629 
3630                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3631         }
3632 }
3633 
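     /*
      * Deliver the given siginfo to every process that shares this
      * address space, rechecking p_as under p_lock before queueing
      * the signal.
      */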
3634 void
3635 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3636 {
3637         struct proc *p;
3638 
3639         mutex_enter(&pidlock);
3640         for (p = practive; p; p = p->p_next) {
3641                 if (p->p_as == as) {
3642                         mutex_enter(&p->p_lock);
3643                         if (p->p_as == as)
3644                                 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3645                         mutex_exit(&p->p_lock);
3646                 }
3647         }
3648         mutex_exit(&pidlock);
3649 }
3650 
3651 /*
3652  * return memory object ID
3653  */
3654 int
3655 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3656 {
3657         struct seg      *seg;
3658         int             sts;
3659 
3660         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3661         seg = as_segat(as, addr);
3662         if (seg == NULL) {
3663                 AS_LOCK_EXIT(as, &as->a_lock);
3664                 return (EFAULT);
3665         }
3666         /*
3667          * catch old drivers which may not support getmemid
3668          */
3669         if (seg->s_ops->getmemid == NULL) {
3670                 AS_LOCK_EXIT(as, &as->a_lock);
3671                 return (ENODEV);
3672         }
3673 
3674         sts = SEGOP_GETMEMID(seg, addr, memidp);
3675 
3676         AS_LOCK_EXIT(as, &as->a_lock);
3677         return (sts);
3678 }