1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 
  27 #include <sys/types.h>
  28 #include <sys/modctl.h>
  29 #include <sys/conf.h>
  30 #include <sys/ddi.h>
  31 #include <sys/sunddi.h>
  32 #include <sys/devops.h>
  33 #include <sys/stat.h>
  34 #include <sys/file.h>
  35 #include <sys/cred.h>
  36 #include <sys/policy.h>
  37 #include <sys/errno.h>
  38 #include <vm/seg_dev.h>
  39 #include <vm/seg_vn.h>
  40 #include <vm/page.h>
  41 #include <sys/fs/swapnode.h>
  42 #include <sys/sysmacros.h>
  43 #include <sys/fcntl.h>
  44 #include <sys/vmsystm.h>
  45 #include <sys/physmem.h>
  46 #include <sys/vfs_opreg.h>
  47 
/*
 * devinfo node of the attached physmem instance.  Recorded by
 * physmem_attach() and handed back by physmem_getinfo().
 */
static dev_info_t               *physmem_dip = NULL;
  49 
/*
 * Linked list element hanging off physmem_proc_hash below, which holds all
 * the information for a given segment which has been setup for this process.
 * This is a simple linked list as we are assuming that for a given process
 * the setup ioctl will only be called a handful of times.  If this assumption
 * changes in the future, a quicker to traverse data structure should be used.
 *
 * The address of one of these structures is also what the setup ioctl hands
 * back to the caller as its opaque cookie.
 */
struct physmem_hash {
        struct physmem_hash *ph_next;   /* next segment of the same process */
        uint64_t ph_base_pa;            /* first physical address covered */
        caddr_t ph_base_va;             /* user VA the segment starts at */
        size_t ph_seg_len;              /* length of the segment in bytes */
        struct vnode *ph_vnode;         /* vnode backing this segment */
};
  64 
/*
 * Hash of all of the processes which have setup mappings with the driver with
 * pointers to per process data.
 */
struct physmem_proc_hash {
        struct proc *pph_proc;          /* process this entry describes */
        struct physmem_hash *pph_hash;  /* head of that process's segments */
        struct physmem_proc_hash *pph_next; /* next entry in the same bucket */
};
  74 
  75 
/*
 * Number of buckets in the per-process hash.
 * Needs to be a power of two for simple hash algorithm
 */
#define PPH_SIZE        8
/* Bucket heads; entries within a bucket are chained through pph_next. */
struct physmem_proc_hash *pph[PPH_SIZE];
  79 
/*
 * Lock which protects the pph hash above.  To add an element (either a new
 * process or a new segment) the WRITE lock must be held.  To traverse the
 * list, only a READ lock is needed.
 * The removal routines in this file (physmem_remove_hash_proc and
 * physmem_remove_vnode_hash) also take the lock as WRITER.
 */
krwlock_t pph_rwlock;
  86 
  87 #define PHYSMEM_HASH(procp) ((int)((((uintptr_t)procp) >> 8) & (PPH_SIZE - 1)))
  88 
/*
 * Need to keep a reference count of outstanding physmem vnodes to prevent
 * the driver from disappearing while any of them still exist.  The count is
 * bumped once per setup request and dropped when the corresponding vnode is
 * freed (or when the setup request fails).
 */
uint64_t physmem_vnodecnt;
kmutex_t physmem_mutex;         /* protects physmem_vnodecnt */
  95 
/*
 * Forward declarations of the vnode operations implemented by this driver;
 * they are referenced by physmem_vnodeops_template below before their
 * definitions appear.
 */
static int physmem_getpage(struct vnode *vp, offset_t off, size_t len,
    uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ct);

static int physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct);

static int physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct);

static void physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct);
 109 
/*
 * Template naming the vnode operations this driver provides.  It is handed
 * to vn_make_ops() by physmem_setup_vnops(); the NULL/NULL pair terminates
 * the list.
 */
const fs_operation_def_t physmem_vnodeops_template[] = {
        VOPNAME_GETPAGE,        { .vop_getpage = physmem_getpage },
        VOPNAME_ADDMAP,         { .vop_addmap = physmem_addmap },
        VOPNAME_DELMAP,         { .vop_delmap = physmem_delmap },
        VOPNAME_INACTIVE,       { .vop_inactive = physmem_inactive },
        NULL,                   NULL
};

/*
 * Operations vector built from the template above; NULL until
 * physmem_setup_vnops() has run successfully.
 */
vnodeops_t *physmem_vnodeops = NULL;
 119 
 120 /*
 121  * Removes the current process from the hash if the process has no more
 122  * physmem segments active.
 123  */
 124 void
 125 physmem_remove_hash_proc()
 126 {
 127         int index;
 128         struct physmem_proc_hash **walker;
 129         struct physmem_proc_hash *victim = NULL;
 130 
 131         index = PHYSMEM_HASH(curproc);
 132         rw_enter(&pph_rwlock, RW_WRITER);
 133         walker = &pph[index];
 134         while (*walker != NULL) {
 135                 if ((*walker)->pph_proc == curproc &&
 136                     (*walker)->pph_hash == NULL) {
 137                         victim = *walker;
 138                         *walker = victim->pph_next;
 139                         break;
 140                 }
 141                 walker = &((*walker)->pph_next);
 142         }
 143         rw_exit(&pph_rwlock);
 144         if (victim != NULL)
 145                 kmem_free(victim, sizeof (struct physmem_proc_hash));
 146 }
 147 
 148 /*
 149  * Add a new entry to the hash for the given process to cache the
 150  * address ranges that it is working on.  If this is the first hash
 151  * item to be added for this process, we will create the head pointer
 152  * for this process.
 153  * Returns 0 on success, ERANGE when the physical address is already in the
 154  * hash.
 155  */
 156 int
 157 physmem_add_hash(struct physmem_hash *php)
 158 {
 159         int index;
 160         struct physmem_proc_hash *iterator;
 161         struct physmem_proc_hash *newp = NULL;
 162         struct physmem_hash *temp;
 163         int ret = 0;
 164 
 165         index = PHYSMEM_HASH(curproc);
 166 
 167 insert:
 168         rw_enter(&pph_rwlock, RW_WRITER);
 169         iterator = pph[index];
 170         while (iterator != NULL) {
 171                 if (iterator->pph_proc == curproc) {
 172                         /*
 173                          * check to make sure a single process does not try to
 174                          * map the same region twice.
 175                          */
 176                         for (temp = iterator->pph_hash; temp != NULL;
 177                             temp = temp->ph_next) {
 178                                 if ((php->ph_base_pa >= temp->ph_base_pa &&
 179                                     php->ph_base_pa < temp->ph_base_pa +
 180                                     temp->ph_seg_len) ||
 181                                     (temp->ph_base_pa >= php->ph_base_pa &&
 182                                     temp->ph_base_pa < php->ph_base_pa +
 183                                     php->ph_seg_len)) {
 184                                         ret = ERANGE;
 185                                         break;
 186                                 }
 187                         }
 188                         if (ret == 0) {
 189                                 php->ph_next = iterator->pph_hash;
 190                                 iterator->pph_hash = php;
 191                         }
 192                         rw_exit(&pph_rwlock);
 193                         /* Need to check for two threads in sync */
 194                         if (newp != NULL)
 195                                 kmem_free(newp, sizeof (*newp));
 196                         return (ret);
 197                 }
 198                 iterator = iterator->pph_next;
 199         }
 200 
 201         if (newp != NULL) {
 202                 newp->pph_proc = curproc;
 203                 newp->pph_next = pph[index];
 204                 newp->pph_hash = php;
 205                 php->ph_next = NULL;
 206                 pph[index] = newp;
 207                 rw_exit(&pph_rwlock);
 208                 return (0);
 209         }
 210 
 211         rw_exit(&pph_rwlock);
 212         /* Dropped the lock so we could use KM_SLEEP */
 213         newp = kmem_zalloc(sizeof (struct physmem_proc_hash), KM_SLEEP);
 214         goto insert;
 215 }
 216 
 217 /*
 218  * Will return the pointer to the physmem_hash struct if the setup routine
 219  * has previously been called for this memory.
 220  * Returns NULL on failure.
 221  */
 222 struct physmem_hash *
 223 physmem_get_hash(uint64_t req_paddr, size_t len, proc_t *procp)
 224 {
 225         int index;
 226         struct physmem_proc_hash *proc_hp;
 227         struct physmem_hash *php;
 228 
 229         ASSERT(rw_lock_held(&pph_rwlock));
 230 
 231         index = PHYSMEM_HASH(procp);
 232         proc_hp = pph[index];
 233         while (proc_hp != NULL) {
 234                 if (proc_hp->pph_proc == procp) {
 235                         php = proc_hp->pph_hash;
 236                         while (php != NULL) {
 237                                 if ((req_paddr >= php->ph_base_pa) &&
 238                                     (req_paddr + len <=
 239                                     php->ph_base_pa + php->ph_seg_len)) {
 240                                         return (php);
 241                                 }
 242                                 php = php->ph_next;
 243                         }
 244                 }
 245                 proc_hp = proc_hp->pph_next;
 246         }
 247         return (NULL);
 248 }
 249 
 250 int
 251 physmem_validate_cookie(uint64_t p_cookie)
 252 {
 253         int index;
 254         struct physmem_proc_hash *proc_hp;
 255         struct physmem_hash *php;
 256 
 257         ASSERT(rw_lock_held(&pph_rwlock));
 258 
 259         index = PHYSMEM_HASH(curproc);
 260         proc_hp = pph[index];
 261         while (proc_hp != NULL) {
 262                 if (proc_hp->pph_proc == curproc) {
 263                         php = proc_hp->pph_hash;
 264                         while (php != NULL) {
 265                                 if ((uint64_t)(uintptr_t)php == p_cookie) {
 266                                         return (1);
 267                                 }
 268                                 php = php->ph_next;
 269                         }
 270                 }
 271                 proc_hp = proc_hp->pph_next;
 272         }
 273         return (0);
 274 }
 275 
 276 /*
 277  * Remove the given vnode from the pph hash.  If it exists in the hash the
 278  * process still has to be around as the vnode is obviously still around and
 279  * since it's a physmem vnode, it must be in the hash.
 280  * If it is not in the hash that must mean that the setup ioctl failed.
 281  * Return 0 in this instance, 1 if it is in the hash.
 282  */
 283 int
 284 physmem_remove_vnode_hash(vnode_t *vp)
 285 {
 286         int index;
 287         struct physmem_proc_hash *proc_hp;
 288         struct physmem_hash **phpp;
 289         struct physmem_hash *victim;
 290 
 291         index = PHYSMEM_HASH(curproc);
 292         /* synchronize with the map routine */
 293         rw_enter(&pph_rwlock, RW_WRITER);
 294         proc_hp = pph[index];
 295         while (proc_hp != NULL) {
 296                 if (proc_hp->pph_proc == curproc) {
 297                         phpp = &proc_hp->pph_hash;
 298                         while (*phpp != NULL) {
 299                                 if ((*phpp)->ph_vnode == vp) {
 300                                         victim = *phpp;
 301                                         *phpp = victim->ph_next;
 302 
 303                                         rw_exit(&pph_rwlock);
 304                                         kmem_free(victim, sizeof (*victim));
 305                                         return (1);
 306                                 }
 307                                 phpp = &(*phpp)->ph_next;
 308                         }
 309                 }
 310                 proc_hp = proc_hp->pph_next;
 311         }
 312         rw_exit(&pph_rwlock);
 313 
 314         /* not found */
 315         return (0);
 316 }
 317 
 318 int
 319 physmem_setup_vnops()
 320 {
 321         int error;
 322         char *name = "physmem";
 323         if (physmem_vnodeops != NULL)
 324                 cmn_err(CE_PANIC, "physmem vnodeops already set\n");
 325         error = vn_make_ops(name, physmem_vnodeops_template, &physmem_vnodeops);
 326         if (error != 0) {
 327                 cmn_err(CE_WARN, "physmem_setup_vnops: bad vnode ops template");
 328         }
 329         return (error);
 330 }
 331 
 332 /*
 333  * The guts of the PHYSMEM_SETUP ioctl.
 334  * Create a segment in the address space with the specified parameters.
 335  * If pspp->user_va is NULL, as_gap will be used to find an appropriate VA.
 336  * We do not do bounds checking on the requested physical addresses, if they
 337  * do not exist in the system, they will not be mappable.
 338  * Returns 0 on success with the following error codes on failure:
 339  *      ENOMEM - The VA range requested was already mapped if pspp->user_va is
 340  *              non-NULL or the system was unable to find enough VA space for
 341  *              the desired length if user_va was NULL>
 342  *      EINVAL - The requested PA, VA, or length was not PAGESIZE aligned.
 343  */
 344 int
 345 physmem_setup_addrs(struct physmem_setup_param *pspp)
 346 {
 347         struct as *as = curproc->p_as;
 348         struct segvn_crargs vn_a;
 349         int ret = 0;
 350         uint64_t base_pa;
 351         size_t len;
 352         caddr_t uvaddr;
 353         struct vnode *vp;
 354         struct physmem_hash *php;
 355 
 356         ASSERT(pspp != NULL);
 357         base_pa = pspp->req_paddr;
 358         len = pspp->len;
 359         uvaddr = (caddr_t)(uintptr_t)pspp->user_va;
 360 
 361         /* Sanity checking */
 362         if (!IS_P2ALIGNED(base_pa, PAGESIZE))
 363                 return (EINVAL);
 364         if (!IS_P2ALIGNED(len, PAGESIZE))
 365                 return (EINVAL);
 366         if (uvaddr != NULL && !IS_P2ALIGNED(uvaddr, PAGESIZE))
 367                 return (EINVAL);
 368 
 369         php = kmem_zalloc(sizeof (struct physmem_hash), KM_SLEEP);
 370 
 371         /* Need to bump vnode count so that the driver can not be unloaded */
 372         mutex_enter(&physmem_mutex);
 373         physmem_vnodecnt++;
 374         mutex_exit(&physmem_mutex);
 375 
 376         vp = vn_alloc(KM_SLEEP);
 377         ASSERT(vp != NULL);     /* SLEEP can't return NULL */
 378         vn_setops(vp, physmem_vnodeops);
 379 
 380         php->ph_vnode = vp;
 381 
 382         vn_a.vp = vp;
 383         vn_a.offset = (u_offset_t)base_pa;
 384         vn_a.type = MAP_SHARED;
 385         vn_a.prot = PROT_ALL;
 386         vn_a.maxprot = PROT_ALL;
 387         vn_a.flags = 0;
 388         vn_a.cred = NULL;
 389         vn_a.amp = NULL;
 390         vn_a.szc = 0;
 391         vn_a.lgrp_mem_policy_flags = 0;
 392 
 393         as_rangelock(as);
 394         if (uvaddr != NULL) {
 395                 if (as_gap(as, len, &uvaddr, &len, AH_LO, NULL) == -1) {
 396                         ret = ENOMEM;
 397 fail:
 398                         as_rangeunlock(as);
 399                         vn_free(vp);
 400                         kmem_free(php, sizeof (*php));
 401                         mutex_enter(&physmem_mutex);
 402                         physmem_vnodecnt--;
 403                         mutex_exit(&physmem_mutex);
 404                         return (ret);
 405                 }
 406         } else {
 407                 /* We pick the address for the user */
 408                 map_addr(&uvaddr, len, 0, 1, 0);
 409                 if (uvaddr == NULL) {
 410                         ret = ENOMEM;
 411                         goto fail;
 412                 }
 413         }
 414         ret = as_map(as, uvaddr, len, segvn_create, &vn_a);
 415 
 416         if (ret == 0) {
 417                 as_rangeunlock(as);
 418                 php->ph_base_pa = base_pa;
 419                 php->ph_base_va = uvaddr;
 420                 php->ph_seg_len = len;
 421                 pspp->user_va = (uint64_t)(uintptr_t)uvaddr;
 422                 pspp->cookie = (uint64_t)(uintptr_t)php;
 423                 ret = physmem_add_hash(php);
 424                 if (ret == 0)
 425                         return (0);
 426 
 427                 /* Note that the call to as_unmap will free the vnode */
 428                 (void) as_unmap(as, uvaddr, len);
 429                 kmem_free(php, sizeof (*php));
 430                 return (ret);
 431         }
 432 
 433         goto fail;
 434         /*NOTREACHED*/
 435 }
 436 
 437 /*
 438  * The guts of the PHYSMEM_MAP ioctl.
 439  * Map the given PA to the appropriate VA if PHYSMEM_SETUP ioctl has already
 440  * been called for this PA range.
 441  * Returns 0 on success with the following error codes on failure:
 442  *      EPERM - The requested page is long term locked, and thus repeated
 443  *              requests to allocate this page will likely fail.
 444  *      EAGAIN - The requested page could not be allocated, but it is believed
 445  *              that future attempts could succeed.
 446  *      ENOMEM - There was not enough free memory in the system to safely
 447  *              map the requested page.
 448  *      EINVAL - The requested paddr was not PAGESIZE aligned or the
 449  *              PHYSMEM_SETUP ioctl was not called for this page.
 450  *      ENOENT - The requested page was iniside the kernel cage, and the
 451  *              PHYSMEM_CAGE flag was not set.
 452  *      EBUSY - The requested page is retired and the PHYSMEM_RETIRE flag
 453  *              was not set.
 454  */
 455 static int
 456 physmem_map_addrs(struct physmem_map_param *pmpp)
 457 {
 458         caddr_t uvaddr;
 459         page_t *pp;
 460         uint64_t req_paddr;
 461         struct vnode *vp;
 462         int ret = 0;
 463         struct physmem_hash *php;
 464         uint_t flags = 0;
 465 
 466         ASSERT(pmpp != NULL);
 467         req_paddr = pmpp->req_paddr;
 468 
 469         if (!IS_P2ALIGNED(req_paddr, PAGESIZE))
 470                 return (EINVAL);
 471         /* Find the vnode for this map request */
 472         rw_enter(&pph_rwlock, RW_READER);
 473         php = physmem_get_hash(req_paddr, PAGESIZE, curproc);
 474         if (php == NULL) {
 475                 rw_exit(&pph_rwlock);
 476                 return (EINVAL);
 477         }
 478         vp = php->ph_vnode;
 479         uvaddr = php->ph_base_va + (req_paddr - php->ph_base_pa);
 480         rw_exit(&pph_rwlock);
 481 
 482         pp = page_numtopp_nolock(btop((size_t)req_paddr));
 483         if (pp == NULL) {
 484                 pmpp->ret_va = NULL;
 485                 return (EPERM);
 486         }
 487 
 488         /*
 489          * Check to see if page already mapped correctly.  This can happen
 490          * when we failed to capture a page previously and it was captured
 491          * asynchronously for us.  Return success in this case.
 492          */
 493         if (pp->p_vnode == vp) {
 494                 ASSERT(pp->p_offset == (u_offset_t)req_paddr);
 495                 pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
 496                 return (0);
 497         }
 498 
 499         /*
 500          * physmem should be responsible for checking for cage
 501          * and prom pages.
 502          */
 503         if (pmpp->flags & PHYSMEM_CAGE)
 504                 flags = CAPTURE_GET_CAGE;
 505         if (pmpp->flags & PHYSMEM_RETIRED)
 506                 flags |= CAPTURE_GET_RETIRED;
 507 
 508         ret = page_trycapture(pp, 0, flags | CAPTURE_PHYSMEM, curproc);
 509 
 510         if (ret != 0) {
 511                 pmpp->ret_va = NULL;
 512                 return (ret);
 513         } else {
 514                 pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
 515                 return (0);
 516         }
 517 }
 518 
 519 /*
 520  * Map the given page into the process's address space if possible.
 521  * We actually only hash the page in on the correct vnode as the page
 522  * will be mapped via segvn_pagefault.
 523  * returns 0 on success
 524  * returns 1 if there is no need to map this page anymore (process exited)
 525  * returns -1 if we failed to map the page.
 526  */
 527 int
 528 map_page_proc(page_t *pp, void *arg, uint_t flags)
 529 {
 530         struct vnode *vp;
 531         proc_t *procp = (proc_t *)arg;
 532         int ret;
 533         u_offset_t paddr = (u_offset_t)ptob(pp->p_pagenum);
 534         struct physmem_hash *php;
 535 
 536         ASSERT(pp != NULL);
 537 
 538         /*
 539          * Check against availrmem to make sure that we're not low on memory.
 540          * We check again here as ASYNC requests do not do this check elsewhere.
 541          * We return 1 as we don't want the page to have the PR_CAPTURE bit
 542          * set or be on the page capture hash.
 543          */
 544         if (swapfs_minfree > availrmem + 1) {
 545                 page_free(pp, 1);
 546                 return (1);
 547         }
 548 
 549         /*
 550          * If this is an asynchronous request for the current process,
 551          * we can not map the page as it's possible that we are also in the
 552          * process of unmapping the page which could result in a deadlock
 553          * with the as lock.
 554          */
 555         if ((flags & CAPTURE_ASYNC) && (curproc == procp)) {
 556                 page_free(pp, 1);
 557                 return (-1);
 558         }
 559 
 560         /* only return zeroed out pages */
 561         pagezero(pp, 0, PAGESIZE);
 562 
 563         rw_enter(&pph_rwlock, RW_READER);
 564         php = physmem_get_hash(paddr, PAGESIZE, procp);
 565         if (php == NULL) {
 566                 rw_exit(&pph_rwlock);
 567                 /*
 568                  * Free the page as there is no longer a valid outstanding
 569                  * request for this page.
 570                  */
 571                 page_free(pp, 1);
 572                 return (1);
 573         }
 574 
 575         vp = php->ph_vnode;
 576 
 577         /*
 578          * We need to protect against a possible deadlock here where we own
 579          * the vnode page hash mutex and want to acquire it again as there
 580          * are locations in the code, where we unlock a page while holding
 581          * the mutex which can lead to the page being captured and eventually
 582          * end up here.
 583          */
 584         if (mutex_owned(page_vnode_mutex(vp))) {
 585                 rw_exit(&pph_rwlock);
 586                 page_free(pp, 1);
 587                 return (-1);
 588         }
 589 
 590         ret = page_hashin(pp, vp, paddr, NULL);
 591         rw_exit(&pph_rwlock);
 592         if (ret == 0) {
 593                 page_free(pp, 1);
 594                 return (-1);
 595         }
 596 
 597         page_downgrade(pp);
 598 
 599         mutex_enter(&freemem_lock);
 600         availrmem--;
 601         mutex_exit(&freemem_lock);
 602 
 603         return (0);
 604 }
 605 
 606 /*
 607  * The guts of the PHYSMEM_DESTROY ioctl.
 608  * The cookie passed in will provide all of the information needed to
 609  * free up the address space and physical memory associated with the
 610  * corresponding PHSYMEM_SETUP ioctl.
 611  * Returns 0 on success with the following error codes on failure:
 612  *      EINVAL - The cookie supplied is not valid.
 613  */
 614 int
 615 physmem_destroy_addrs(uint64_t p_cookie)
 616 {
 617         struct as *as = curproc->p_as;
 618         size_t len;
 619         caddr_t uvaddr;
 620 
 621         rw_enter(&pph_rwlock, RW_READER);
 622         if (physmem_validate_cookie(p_cookie) == 0) {
 623                 rw_exit(&pph_rwlock);
 624                 return (EINVAL);
 625         }
 626 
 627         len = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_seg_len;
 628         uvaddr = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_base_va;
 629         rw_exit(&pph_rwlock);
 630 
 631         (void) as_unmap(as, uvaddr, len);
 632 
 633         return (0);
 634 }
 635 
 636 /*
 637  * If the page has been hashed into the physmem vnode, then just look it up
 638  * and return it via pl, otherwise return ENOMEM as the map ioctl has not
 639  * succeeded on the given page.
 640  */
 641 /*ARGSUSED*/
 642 static int
 643 physmem_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
 644     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
 645     struct cred *cr, caller_context_t *ct)
 646 {
 647         page_t *pp;
 648 
 649         ASSERT(len == PAGESIZE);
 650         ASSERT(AS_READ_HELD(seg->s_as, &seg->s_as->a_lock));
 651 
 652         /*
 653          * If the page is in the hash, then we successfully claimed this
 654          * page earlier, so return it to the caller.
 655          */
 656         pp = page_lookup(vp, off, SE_SHARED);
 657         if (pp != NULL) {
 658                 pl[0] = pp;
 659                 pl[1] = NULL;
 660                 *protp = PROT_ALL;
 661                 return (0);
 662         }
 663         return (ENOMEM);
 664 }
 665 
 666 /*
 667  * We can not allow a process mapping /dev/physmem pages to fork as there can
 668  * only be a single mapping to a /dev/physmem page at a given time.  Thus, the
 669  * return of EINVAL when we are not working on our own address space.
 670  * Otherwise we return zero as this function is required for normal operation.
 671  */
 672 /*ARGSUSED*/
 673 static int
 674 physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
 675     caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
 676     struct cred *cred, caller_context_t *ct)
 677 {
 678         if (curproc->p_as != as) {
 679                 return (EINVAL);
 680         }
 681         return (0);
 682 }
 683 
/*
 * VOP_DELMAP for physmem vnodes.
 * Will always get called for removing a whole segment.
 * Always returns 0.
 */
/*ARGSUSED*/
static int
physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct)
{
        /*
         * Release our hold on the vnode so that the final VN_RELE will
         * call physmem_inactive to clean things up.
         */
        VN_RELE(vp);

        return (0);
}
 699 
 700 /*
 701  * Clean up all the pages belonging to this vnode and then free it.
 702  */
 703 /*ARGSUSED*/
 704 static void
 705 physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct)
 706 {
 707         page_t *pp;
 708 
 709         /*
 710          * Remove the vnode from the hash now, to prevent asynchronous
 711          * attempts to map into this vnode.  This avoids a deadlock
 712          * where two threads try to get into this logic at the same
 713          * time and try to map the pages they are destroying into the
 714          * other's address space.
 715          * If it's not in the hash, just free it.
 716          */
 717         if (physmem_remove_vnode_hash(vp) == 0) {
 718                 ASSERT(vp->v_pages == NULL);
 719                 vn_free(vp);
 720                 physmem_remove_hash_proc();
 721                 mutex_enter(&physmem_mutex);
 722                 physmem_vnodecnt--;
 723                 mutex_exit(&physmem_mutex);
 724                 return;
 725         }
 726 
 727         /*
 728          * At this point in time, no other logic can be adding or removing
 729          * pages from the vnode, otherwise the v_pages list could be inaccurate.
 730          */
 731 
 732         while ((pp = vp->v_pages) != NULL) {
 733                 page_t *rpp;
 734                 if (page_tryupgrade(pp)) {
 735                         /*
 736                          * set lckcnt for page_destroy to do availrmem
 737                          * accounting
 738                          */
 739                         pp->p_lckcnt = 1;
 740                         page_destroy(pp, 0);
 741                 } else {
 742                         /* failure to lock should be transient */
 743                         rpp = page_lookup(vp, ptob(pp->p_pagenum), SE_SHARED);
 744                         if (rpp != pp) {
 745                                 page_unlock(rpp);
 746                                 continue;
 747                         }
 748                         page_unlock(pp);
 749                 }
 750         }
 751         vn_free(vp);
 752         physmem_remove_hash_proc();
 753         mutex_enter(&physmem_mutex);
 754         physmem_vnodecnt--;
 755         mutex_exit(&physmem_mutex);
 756 }
 757 
 758 /*ARGSUSED*/
 759 static int
 760 physmem_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
 761     int *rvalp)
 762 {
 763         int ret;
 764 
 765         switch (cmd) {
 766         case PHYSMEM_SETUP:
 767                 {
 768                         struct physmem_setup_param psp;
 769                         if (ddi_copyin((void *)arg, &psp,
 770                             sizeof (struct physmem_setup_param), 0))
 771                                 return (EFAULT);
 772                         ret = physmem_setup_addrs(&psp);
 773                         if (ddi_copyout(&psp, (void *)arg, sizeof (psp), 0))
 774                                 return (EFAULT);
 775                 }
 776                 break;
 777         case PHYSMEM_MAP:
 778                 {
 779                         struct physmem_map_param pmp;
 780                         if (ddi_copyin((void *)arg, &pmp,
 781                             sizeof (struct physmem_map_param), 0))
 782                                 return (EFAULT);
 783                         ret = physmem_map_addrs(&pmp);
 784                         if (ddi_copyout(&pmp, (void *)arg, sizeof (pmp), 0))
 785                                 return (EFAULT);
 786                 }
 787                 break;
 788         case PHYSMEM_DESTROY:
 789                 {
 790                         uint64_t cookie;
 791                         if (ddi_copyin((void *)arg, &cookie,
 792                             sizeof (uint64_t), 0))
 793                                 return (EFAULT);
 794                         ret = physmem_destroy_addrs(cookie);
 795                 }
 796                 break;
 797         default:
 798                 return (ENOTSUP);
 799         }
 800         return (ret);
 801 }
 802 
 803 /*ARGSUSED*/
 804 static int
 805 physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
 806 {
 807         int ret;
 808         static int msg_printed = 0;
 809 
 810         if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
 811                 return (EINVAL);
 812         }
 813 
 814         /* need to make sure we have the right privileges */
 815         if ((ret = secpolicy_resource(credp)) != 0)
 816                 return (ret);
 817         if ((ret = secpolicy_lock_memory(credp)) != 0)
 818                 return (ret);
 819 
 820         if (msg_printed == 0) {
 821                 cmn_err(CE_NOTE, "!driver has been opened. This driver may "
 822                     "take out long term locks on pages which may impact "
 823                     "dynamic reconfiguration events");
 824                 msg_printed = 1;
 825         }
 826 
 827         return (0);
 828 }
 829 
/*
 * close entry point.  There is nothing to undo here, so it always returns
 * 0.
 */
/*ARGSUSED*/
static int
physmem_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
        return (0);
}
 836 
 837 /*ARGSUSED*/
 838 static int
 839 physmem_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd,
 840     void *arg, void **resultp)
 841 {
 842         switch (infocmd) {
 843         case DDI_INFO_DEVT2DEVINFO:
 844                 *resultp = physmem_dip;
 845                 return (DDI_SUCCESS);
 846 
 847         case DDI_INFO_DEVT2INSTANCE:
 848                 *resultp = (void *)(ulong_t)getminor((dev_t)arg);
 849                 return (DDI_SUCCESS);
 850 
 851         default:
 852                 return (DDI_FAILURE);
 853         }
 854 }
 855 
 856 static int
 857 physmem_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 858 {
 859         int i;
 860 
 861         if (cmd == DDI_RESUME) {
 862                 return (DDI_SUCCESS);
 863         }
 864 
 865         if (cmd != DDI_ATTACH)
 866                 return (DDI_FAILURE);
 867 
 868         if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
 869             ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
 870                 return (DDI_FAILURE);
 871 
 872         physmem_dip = dip;
 873 
 874         /* Initialize driver specific data */
 875         if (physmem_setup_vnops()) {
 876                 ddi_remove_minor_node(dip, ddi_get_name(dip));
 877                 return (DDI_FAILURE);
 878         }
 879 
 880         for (i = 0; i < PPH_SIZE; i++)
 881                 pph[i] = NULL;
 882 
 883         page_capture_register_callback(PC_PHYSMEM, 10000,
 884             map_page_proc);
 885 
 886         return (DDI_SUCCESS);
 887 }
 888 
 889 static int
 890 physmem_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 891 {
 892         int ret = DDI_SUCCESS;
 893 
 894         if (cmd == DDI_SUSPEND) {
 895                 return (DDI_SUCCESS);
 896         }
 897 
 898         if (cmd != DDI_DETACH)
 899                 return (DDI_FAILURE);
 900 
 901         ASSERT(physmem_dip == dip);
 902 
 903         mutex_enter(&physmem_mutex);
 904         if (physmem_vnodecnt == 0) {
 905                 if (physmem_vnodeops != NULL) {
 906                         vn_freevnodeops(physmem_vnodeops);
 907                         physmem_vnodeops = NULL;
 908                         page_capture_unregister_callback(PC_PHYSMEM);
 909                 }
 910         } else {
 911                 ret = EBUSY;
 912         }
 913         mutex_exit(&physmem_mutex);
 914         if (ret == DDI_SUCCESS)
 915                 ddi_remove_minor_node(dip, ddi_get_name(dip));
 916         return (ret);
 917 }
 918 
 919 static struct cb_ops physmem_cb_ops = {
 920         physmem_open,   /* open */
 921         physmem_close,  /* close */
 922         nodev,          /* strategy */
 923         nodev,          /* print */
 924         nodev,          /* dump */
 925         nodev,          /* read */
 926         nodev,          /* write */
 927         physmem_ioctl,  /* ioctl */
 928         nodev,          /* devmap */
 929         nodev,          /* mmap */
 930         nodev,          /* segmap */
 931         nochpoll,       /* chpoll */
 932         ddi_prop_op,    /* prop_op */
 933         NULL,           /* cb_str */
 934         D_NEW | D_MP | D_DEVMAP,
 935         CB_REV,
 936         NULL,
 937         NULL
 938 };
 939 
 940 static struct dev_ops physmem_ops = {
 941         DEVO_REV,
 942         0,
 943         physmem_getinfo,
 944         nulldev,
 945         nulldev,
 946         physmem_attach,
 947         physmem_detach,
 948         nodev,
 949         &physmem_cb_ops,
 950         NULL,
 951         NULL,
 952         ddi_quiesce_not_needed,         /* quiesce */
 953 };
 954 
 955 static struct modldrv modldrv = {
 956         &mod_driverops,
 957         "physmem driver",
 958         &physmem_ops
 959 };
 960 
 961 static struct modlinkage modlinkage = {
 962         MODREV_1,
 963         &modldrv,
 964         NULL
 965 };
 966 
 967 int
 968 _init(void)
 969 {
 970         return (mod_install(&modlinkage));
 971 }
 972 
 973 int
 974 _info(struct modinfo *modinfop)
 975 {
 976         return (mod_info(&modlinkage, modinfop));
 977 }
 978 
 979 int
 980 _fini(void)
 981 {
 982         return (mod_remove(&modlinkage));
 983 }