1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 /*
  29  * Vnode operations for the High Sierra filesystem
  30  */
  31 
  32 #include <sys/types.h>
  33 #include <sys/t_lock.h>
  34 #include <sys/param.h>
  35 #include <sys/time.h>
  36 #include <sys/systm.h>
  37 #include <sys/sysmacros.h>
  38 #include <sys/resource.h>
  39 #include <sys/signal.h>
  40 #include <sys/cred.h>
  41 #include <sys/user.h>
  42 #include <sys/buf.h>
  43 #include <sys/vfs.h>
  44 #include <sys/vfs_opreg.h>
  45 #include <sys/stat.h>
  46 #include <sys/vnode.h>
  47 #include <sys/mode.h>
  48 #include <sys/proc.h>
  49 #include <sys/disp.h>
  50 #include <sys/file.h>
  51 #include <sys/fcntl.h>
  52 #include <sys/flock.h>
  53 #include <sys/kmem.h>
  54 #include <sys/uio.h>
  55 #include <sys/conf.h>
  56 #include <sys/errno.h>
  57 #include <sys/mman.h>
  58 #include <sys/pathname.h>
  59 #include <sys/debug.h>
  60 #include <sys/vmsystm.h>
  61 #include <sys/cmn_err.h>
  62 #include <sys/fbuf.h>
  63 #include <sys/dirent.h>
  65 #include <sys/dkio.h>
  67 #include <sys/atomic.h>
  68 
  69 #include <vm/hat.h>
  70 #include <vm/page.h>
  71 #include <vm/pvn.h>
  72 #include <vm/as.h>
  73 #include <vm/seg.h>
  74 #include <vm/seg_map.h>
  75 #include <vm/seg_kmem.h>
  76 #include <vm/seg_vn.h>
  77 #include <vm/rm.h>
  79 #include <sys/swap.h>
  80 #include <sys/avl.h>
  81 #include <sys/sunldi.h>
  82 #include <sys/ddi.h>
  83 #include <sys/sunddi.h>
  84 #include <sys/sdt.h>
  85 
  86 /*
  87  * For struct modlinkage
  88  */
  89 #include <sys/modctl.h>
  90 
  91 #include <sys/fs/hsfs_spec.h>
  92 #include <sys/fs/hsfs_node.h>
  93 #include <sys/fs/hsfs_impl.h>
  94 #include <sys/fs/hsfs_susp.h>
  95 #include <sys/fs/hsfs_rrip.h>
  96 
  97 #include <fs/fs_subr.h>
  98 
  99 /* # of contiguous requests to detect sequential access pattern */
 100 static int seq_contig_requests = 2;
 101 
 102 /*
  103  * This is the max number of taskq threads that will be created
  104  * if required. Since we are using a Dynamic TaskQ, by default only
  105  * one thread is created initially.
  106  *
  107  * NOTE: In the usual hsfs use case this per-fs-instance number
  108  * of taskq threads should not place any undue load on a system.
  109  * Even on an unusual system with, say, 100 CDROM drives, 800 threads
  110  * will not be created unless all the drives are loaded and all
  111  * of them are saturated with I/O at the same time! If such an
  112  * unusual case ever does cause a complaint of system load, it
  113  * should be easy enough to change to one per-machine Dynamic TaskQ
  114  * for all hsfs mounts with an nthreads of, say, 32.
 115  */
 116 static int hsfs_taskq_nthreads = 8;     /* # of taskq threads per fs */
 117 
 118 /* Min count of adjacent bufs that will avoid buf coalescing */
 119 static int hsched_coalesce_min = 2;
 120 
 121 /*
 122  * Kmem caches for heavily used small allocations. Using these kmem
 123  * caches provides a factor of 3 reduction in system time and greatly
  124  * aids overall throughput, especially on SPARC.
 125  */
 126 struct kmem_cache *hio_cache;
 127 struct kmem_cache *hio_info_cache;
 128 
 129 /*
 130  * This tunable allows us to ignore inode numbers from rrip-1.12.
 131  * In this case, we fall back to our default inode algorithm.
 132  */
 133 extern int use_rrip_inodes;
 134 
 135 /*
 136  * Free behind logic from UFS to tame our thirst for
 137  * the page cache.
 138  * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
 139  * explanation.
 140  */
 141 static int      freebehind = 1;
 142 static int      smallfile = 0;
 143 static int      cache_read_ahead = 0;
 144 static u_offset_t smallfile64 = 32 * 1024;
 145 #define SMALLFILE1_D 1000
 146 #define SMALLFILE2_D 10
 147 static u_offset_t smallfile1 = 32 * 1024;
 148 static u_offset_t smallfile2 = 32 * 1024;
 149 static clock_t smallfile_update = 0; /* when to recompute */
 150 static uint_t smallfile1_d = SMALLFILE1_D;
 151 static uint_t smallfile2_d = SMALLFILE2_D;
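      /*
       * Illustrative arithmetic only (the real values are recomputed in
       * hsfs_read() from freemem and ncpus_online): with 2 GB of free
       * memory and 4 online CPUs, percpufreeb = 512 MB, so
       * smallfile1 = 512 MB / SMALLFILE1_D (1000) = ~512 KB and
       * smallfile2 = 512 MB / SMALLFILE2_D (10) = ~51 MB.  A detected
       * sequential reader (dofree) whose offset has passed smallfile2
       * then releases its segmap slots with SM_FREE | SM_ASYNC |
       * SM_DONTNEED instead of leaving them in the page cache.
       */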
 152 
 153 static int hsched_deadline_compare(const void *x1, const void *x2);
 154 static int hsched_offset_compare(const void *x1, const void *x2);
 155 static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
 156 int hsched_invoke_strategy(struct hsfs *fsp);
 157 
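      /*
       * hsfs is a read-only filesystem, so there is never any dirty data
       * to push to the media; fsync therefore has nothing to do.
       */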
 158 /* ARGSUSED */
 159 static int
 160 hsfs_fsync(vnode_t *cp,
 161         int syncflag,
 162         cred_t *cred,
 163         caller_context_t *ct)
 164 {
 165         return (0);
 166 }
 167 
 168 
 169 /*ARGSUSED*/
 170 static int
 171 hsfs_read(struct vnode *vp,
 172         struct uio *uiop,
 173         int ioflag,
 174         struct cred *cred,
 175         struct caller_context *ct)
 176 {
 177         caddr_t base;
 178         offset_t diff;
 179         int error;
 180         struct hsnode *hp;
 181         uint_t filesize;
 182         int dofree;
 183 
 184         hp = VTOH(vp);
 185         /*
 186          * if vp is of type VDIR, make sure dirent
 187          * is filled up with all info (because of ptbl)
 188          */
 189         if (vp->v_type == VDIR) {
 190                 if (hp->hs_dirent.ext_size == 0)
 191                         hs_filldirent(vp, &hp->hs_dirent);
 192         }
 193         filesize = hp->hs_dirent.ext_size;
 194 
 195         /* Sanity checks. */
 196         if (uiop->uio_resid == 0 ||          /* No data wanted. */
 197             uiop->uio_loffset > HS_MAXFILEOFF ||  /* Offset too big. */
 198             uiop->uio_loffset >= filesize)        /* Past EOF. */
 199                 return (0);
 200 
 201         do {
 202                 /*
 203                  * We want to ask for only the "right" amount of data.
 204                  * In this case that means:-
 205                  *
 206                  * We can't get data from beyond our EOF. If asked,
 207                  * we will give a short read.
 208                  *
 209                  * segmap_getmapflt returns buffers of MAXBSIZE bytes.
 210                  * These buffers are always MAXBSIZE aligned.
 211                  * If our starting offset is not MAXBSIZE aligned,
 212                  * we can only ask for less than MAXBSIZE bytes.
 213                  *
 214                  * If our requested offset and length are such that
 215                  * they belong in different MAXBSIZE aligned slots
 216                  * then we'll be making more than one call on
 217                  * segmap_getmapflt.
 218                  *
 219                  * This diagram shows the variables we use and their
 220                  * relationships.
 221                  *
 222                  * |<-----MAXBSIZE----->|
 223                  * +--------------------------...+
 224                  * |.....mapon->|<--n-->|....*...|EOF
 225                  * +--------------------------...+
 226                  * uio_loffset->|
 227                  * uio_resid....|<---------->|
 228                  * diff.........|<-------------->|
 229                  *
 230                  * So, in this case our offset is not aligned
 231                  * and our request takes us outside of the
 232                  * MAXBSIZE window. We will break this up into
 233                  * two segmap_getmapflt calls.
 234                  */
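                      /*
                       * Purely illustrative walk-through, assuming MAXBSIZE
                       * is 8192: with uio_loffset = 10000 and
                       * uio_resid = 16384 on a 20000-byte file,
                       * mapon = 10000 & 8191 = 1808, diff = 10000 and
                       * nbytes = 8192 - 1808 = 6384, so n = 6384 and this
                       * pass moves file bytes 10000..16383.  The next pass
                       * starts MAXBSIZE-aligned at 16384 with mapon = 0.
                       */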
 235                 size_t nbytes;
 236                 offset_t mapon;
 237                 size_t n;
 238                 uint_t flags;
 239 
 240                 mapon = uiop->uio_loffset & MAXBOFFSET;
 241                 diff = filesize - uiop->uio_loffset;
 242                 nbytes = (size_t)MIN(MAXBSIZE - mapon, uiop->uio_resid);
 243                 n = MIN(diff, nbytes);
 244                 if (n <= 0) {
 245                         /* EOF or request satisfied. */
 246                         return (0);
 247                 }
 248 
 249                 /*
 250                  * Freebehind computation taken from:
 251                  * usr/src/uts/common/fs/ufs/ufs_vnops.c
 252                  */
 253                 if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
 254                         uint64_t percpufreeb;
 255                         if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
 256                         if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
 257                         percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
 258                         smallfile1 = percpufreeb / smallfile1_d;
 259                         smallfile2 = percpufreeb / smallfile2_d;
 260                         smallfile1 = MAX(smallfile1, smallfile);
 261                         smallfile1 = MAX(smallfile1, smallfile64);
 262                         smallfile2 = MAX(smallfile1, smallfile2);
 263                         smallfile_update = drv_hztousec(ddi_get_lbolt())
 264                             + 1000000;
 265                 }
 266 
 267                 dofree = freebehind &&
 268                     hp->hs_prev_offset == uiop->uio_loffset &&
 269                     hp->hs_ra_bytes > 0;
 270 
 271                 base = segmap_getmapflt(segkmap, vp,
 272                     (u_offset_t)uiop->uio_loffset, n, 1, S_READ);
 273 
 274                 error = uiomove(base + mapon, n, UIO_READ, uiop);
 275 
 276                 if (error == 0) {
 277                         /*
 278                          * if read a whole block, or read to eof,
 279                          *  won't need this buffer again soon.
 280                          */
 281                         if (n + mapon == MAXBSIZE ||
 282                             uiop->uio_loffset == filesize)
 283                                 flags = SM_DONTNEED;
 284                         else
 285                                 flags = 0;
 286 
 287                         if (dofree) {
 288                                 flags = SM_FREE | SM_ASYNC;
 289                                 if ((cache_read_ahead == 0) &&
 290                                     uiop->uio_loffset > smallfile2)
 291                                         flags |=  SM_DONTNEED;
 292                         }
 293 
 294                         error = segmap_release(segkmap, base, flags);
 295                 } else
 296                         (void) segmap_release(segkmap, base, 0);
 297         } while (error == 0 && uiop->uio_resid > 0);
 298 
 299         return (error);
 300 }
 301 
 302 /*ARGSUSED2*/
 303 static int
 304 hsfs_getattr(
 305         struct vnode *vp,
 306         struct vattr *vap,
 307         int flags,
 308         struct cred *cred,
 309         caller_context_t *ct)
 310 {
 311         struct hsnode *hp;
 312         struct vfs *vfsp;
 313         struct hsfs *fsp;
 314 
 315         hp = VTOH(vp);
 316         fsp = VFS_TO_HSFS(vp->v_vfsp);
 317         vfsp = vp->v_vfsp;
 318 
 319         if ((hp->hs_dirent.ext_size == 0) && (vp->v_type == VDIR)) {
 320                 hs_filldirent(vp, &hp->hs_dirent);
 321         }
 322         vap->va_type = IFTOVT(hp->hs_dirent.mode);
 323         vap->va_mode = hp->hs_dirent.mode;
 324         vap->va_uid = hp->hs_dirent.uid;
 325         vap->va_gid = hp->hs_dirent.gid;
 326 
 327         vap->va_fsid = vfsp->vfs_dev;
 328         vap->va_nodeid = (ino64_t)hp->hs_nodeid;
 329         vap->va_nlink = hp->hs_dirent.nlink;
 330         vap->va_size =       (offset_t)hp->hs_dirent.ext_size;
 331 
 332         vap->va_atime.tv_sec = hp->hs_dirent.adate.tv_sec;
 333         vap->va_atime.tv_nsec = hp->hs_dirent.adate.tv_usec*1000;
 334         vap->va_mtime.tv_sec = hp->hs_dirent.mdate.tv_sec;
 335         vap->va_mtime.tv_nsec = hp->hs_dirent.mdate.tv_usec*1000;
 336         vap->va_ctime.tv_sec = hp->hs_dirent.cdate.tv_sec;
 337         vap->va_ctime.tv_nsec = hp->hs_dirent.cdate.tv_usec*1000;
 338         if (vp->v_type == VCHR || vp->v_type == VBLK)
 339                 vap->va_rdev = hp->hs_dirent.r_dev;
 340         else
 341                 vap->va_rdev = 0;
 342         vap->va_blksize = vfsp->vfs_bsize;
 343         /* no. of blocks = no. of data blocks + no. of xar blocks */
 344         vap->va_nblocks = (fsblkcnt64_t)howmany(vap->va_size + (u_longlong_t)
 345             (hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift), DEV_BSIZE);
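              /*
               * Illustrative example, assuming the usual 512-byte DEV_BSIZE:
               * a 10000-byte file with no extended attribute record (XAR)
               * reports va_nblocks = howmany(10000, 512) = 20.
               */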
 346         vap->va_seq = hp->hs_seq;
 347         return (0);
 348 }
 349 
 350 /*ARGSUSED*/
 351 static int
 352 hsfs_readlink(struct vnode *vp,
 353         struct uio *uiop,
 354         struct cred *cred,
 355         caller_context_t *ct)
 356 {
 357         struct hsnode *hp;
 358 
 359         if (vp->v_type != VLNK)
 360                 return (EINVAL);
 361 
 362         hp = VTOH(vp);
 363 
 364         if (hp->hs_dirent.sym_link == (char *)NULL)
 365                 return (ENOENT);
 366 
 367         return (uiomove(hp->hs_dirent.sym_link,
 368             (size_t)MIN(hp->hs_dirent.ext_size,
 369             uiop->uio_resid), UIO_READ, uiop));
 370 }
 371 
 372 /*ARGSUSED*/
 373 static void
 374 hsfs_inactive(struct vnode *vp,
 375         struct cred *cred,
 376         caller_context_t *ct)
 377 {
 378         struct hsnode *hp;
 379         struct hsfs *fsp;
 380 
 381         int nopage;
 382 
 383         hp = VTOH(vp);
 384         fsp = VFS_TO_HSFS(vp->v_vfsp);
 385         /*
 386          * Note: acquiring and holding v_lock for quite a while
 387          * here serializes on the vnode; this is unfortunate, but
  388          * unlikely to impact performance much, as the underlying
 389          * device (CDROM drive) is quite slow.
 390          */
 391         rw_enter(&fsp->hsfs_hash_lock, RW_WRITER);
 392         mutex_enter(&hp->hs_contents_lock);
 393         mutex_enter(&vp->v_lock);
 394 
 395         if (vp->v_count < 1) {
 396                 panic("hsfs_inactive: v_count < 1");
 397                 /*NOTREACHED*/
 398         }
 399 
 400         if (vp->v_count > 1 || (hp->hs_flags & HREF) == 0) {
 401                 vp->v_count--;       /* release hold from vn_rele */
 402                 mutex_exit(&vp->v_lock);
 403                 mutex_exit(&hp->hs_contents_lock);
 404                 rw_exit(&fsp->hsfs_hash_lock);
 405                 return;
 406         }
 407         vp->v_count--;       /* release hold from vn_rele */
 408         if (vp->v_count == 0) {
 409                 /*
 410                  * Free the hsnode.
 411                  * If there are no pages associated with the
 412                  * hsnode, give it back to the kmem_cache,
 413                  * else put at the end of this file system's
 414                  * internal free list.
 415                  */
 416                 nopage = !vn_has_cached_data(vp);
 417                 hp->hs_flags = 0;
 418                 /*
 419                  * exit these locks now, since hs_freenode may
 420                  * kmem_free the hsnode and embedded vnode
 421                  */
 422                 mutex_exit(&vp->v_lock);
 423                 mutex_exit(&hp->hs_contents_lock);
 424                 hs_freenode(vp, fsp, nopage);
 425         } else {
 426                 mutex_exit(&vp->v_lock);
 427                 mutex_exit(&hp->hs_contents_lock);
 428         }
 429         rw_exit(&fsp->hsfs_hash_lock);
 430 }
 431 
 432 
 433 /*ARGSUSED*/
 434 static int
 435 hsfs_lookup(
 436         struct vnode *dvp,
 437         char *nm,
 438         struct vnode **vpp,
 439         struct pathname *pnp,
 440         int flags,
 441         struct vnode *rdir,
 442         struct cred *cred,
 443         caller_context_t *ct,
 444         int *direntflags,
 445         pathname_t *realpnp)
 446 {
 447         int error;
 448         int namelen = (int)strlen(nm);
 449 
 450         if (*nm == '\0') {
 451                 VN_HOLD(dvp);
 452                 *vpp = dvp;
 453                 return (0);
 454         }
 455 
 456         /*
 457          * If we're looking for ourself, life is simple.
 458          */
 459         if (namelen == 1 && *nm == '.') {
 460                 if (error = hs_access(dvp, (mode_t)VEXEC, cred))
 461                         return (error);
 462                 VN_HOLD(dvp);
 463                 *vpp = dvp;
 464                 return (0);
 465         }
 466 
 467         return (hs_dirlook(dvp, nm, namelen, vpp, cred));
 468 }
 469 
 470 
 471 /*ARGSUSED*/
 472 static int
 473 hsfs_readdir(
 474         struct vnode            *vp,
 475         struct uio              *uiop,
 476         struct cred             *cred,
 477         int                     *eofp,
 478         caller_context_t        *ct,
 479         int                     flags)
 480 {
 481         struct hsnode   *dhp;
 482         struct hsfs     *fsp;
 483         struct hs_direntry hd;
 484         struct dirent64 *nd;
 485         int             error;
 486         uint_t          offset;         /* real offset in directory */
 487         uint_t          dirsiz;         /* real size of directory */
 488         uchar_t         *blkp;
 489         int             hdlen;          /* length of hs directory entry */
 490         long            ndlen;          /* length of dirent entry */
 491         int             bytes_wanted;
 492         size_t          bufsize;        /* size of dirent buffer */
 493         char            *outbuf;        /* ptr to dirent buffer */
 494         char            *dname;
 495         int             dnamelen;
 496         size_t          dname_size;
 497         struct fbuf     *fbp;
 498         uint_t          last_offset;    /* last index into current dir block */
 499         ino64_t         dirino; /* temporary storage before storing in dirent */
 500         off_t           diroff;
 501 
 502         dhp = VTOH(vp);
 503         fsp = VFS_TO_HSFS(vp->v_vfsp);
 504         if (dhp->hs_dirent.ext_size == 0)
 505                 hs_filldirent(vp, &dhp->hs_dirent);
 506         dirsiz = dhp->hs_dirent.ext_size;
 507         if (uiop->uio_loffset >= dirsiz) {        /* at or beyond EOF */
 508                 if (eofp)
 509                         *eofp = 1;
 510                 return (0);
 511         }
 512         ASSERT(uiop->uio_loffset <= HS_MAXFILEOFF);
 513         offset = uiop->uio_loffset;
 514 
 515         dname_size = fsp->hsfs_namemax + 1;  /* 1 for the ending NUL */
 516         dname = kmem_alloc(dname_size, KM_SLEEP);
 517         bufsize = uiop->uio_resid + sizeof (struct dirent64);
 518 
 519         outbuf = kmem_alloc(bufsize, KM_SLEEP);
 520         nd = (struct dirent64 *)outbuf;
 521 
 522         while (offset < dirsiz) {
 523                 bytes_wanted = MIN(MAXBSIZE, dirsiz - (offset & MAXBMASK));
 524 
 525                 error = fbread(vp, (offset_t)(offset & MAXBMASK),
 526                     (unsigned int)bytes_wanted, S_READ, &fbp);
 527                 if (error)
 528                         goto done;
 529 
 530                 blkp = (uchar_t *)fbp->fb_addr;
 531                 last_offset = (offset & MAXBMASK) + fbp->fb_count;
 532 
 533 #define rel_offset(offset) ((offset) & MAXBOFFSET)  /* index into blkp */
 534 
 535                 while (offset < last_offset) {
 536                         /*
 537                          * Very similar validation code is found in
 538                          * process_dirblock(), hsfs_node.c.
 539                          * For an explanation, see there.
 540                          * It may make sense for the future to
 541                          * "consolidate" the code in hs_parsedir(),
 542                          * process_dirblock() and hsfs_readdir() into
 543                          * a single utility function.
 544                          */
 545                         hdlen = (int)((uchar_t)
 546                             HDE_DIR_LEN(&blkp[rel_offset(offset)]));
 547                         if (hdlen < HDE_ROOT_DIR_REC_SIZE ||
 548                             offset + hdlen > last_offset) {
 549                                 /*
 550                                  * advance to next sector boundary
 551                                  */
 552                                 offset = roundup(offset + 1, HS_SECTOR_SIZE);
 553                                 if (hdlen)
 554                                         hs_log_bogus_disk_warning(fsp,
 555                                             HSFS_ERR_TRAILING_JUNK, 0);
 556 
 557                                 continue;
 558                         }
 559 
 560                         bzero(&hd, sizeof (hd));
 561 
 562                         /*
 563                          * Just ignore invalid directory entries.
 564                          * XXX - maybe hs_parsedir() will detect EXISTENCE bit
 565                          */
 566                         if (!hs_parsedir(fsp, &blkp[rel_offset(offset)],
 567                             &hd, dname, &dnamelen, last_offset - offset)) {
 568                                 /*
 569                                  * Determine if there is enough room
 570                                  */
 571                                 ndlen = (long)DIRENT64_RECLEN((dnamelen));
 572 
 573                                 if ((ndlen + ((char *)nd - outbuf)) >
 574                                     uiop->uio_resid) {
 575                                         fbrelse(fbp, S_READ);
 576                                         goto done; /* output buffer full */
 577                                 }
 578 
 579                                 diroff = offset + hdlen;
 580                                 /*
 581                                  * If the media carries rrip-v1.12 or newer,
 582                                  * and we trust the inodes from the rrip data
 583                                  * (use_rrip_inodes != 0), use that data. If the
 584                                  * media has been created by a recent mkisofs
 585                                  * version, we may trust all numbers in the
 586                                  * starting extent number; otherwise, we cannot
 587                                  * do this for zero sized files and symlinks,
 588                                  * because if we did we'd end up mapping all of
 589                                  * them to the same node. We use HS_DUMMY_INO
 590                                  * in this case and make sure that we will not
 591                                  * map all files to the same meta data.
 592                                  */
 593                                 if (hd.inode != 0 && use_rrip_inodes) {
 594                                         dirino = hd.inode;
 595                                 } else if ((hd.ext_size == 0 ||
 596                                     hd.sym_link != (char *)NULL) &&
 597                                     (fsp->hsfs_flags & HSFSMNT_INODE) == 0) {
 598                                         dirino = HS_DUMMY_INO;
 599                                 } else {
 600                                         dirino = hd.ext_lbn;
 601                                 }
 602 
 603                                 /* strncpy(9f) will zero uninitialized bytes */
 604 
 605                                 ASSERT(strlen(dname) + 1 <=
 606                                     DIRENT64_NAMELEN(ndlen));
 607                                 (void) strncpy(nd->d_name, dname,
 608                                     DIRENT64_NAMELEN(ndlen));
 609                                 nd->d_reclen = (ushort_t)ndlen;
 610                                 nd->d_off = (offset_t)diroff;
 611                                 nd->d_ino = dirino;
 612                                 nd = (struct dirent64 *)((char *)nd + ndlen);
 613 
 614                                 /*
 615                                  * free up space allocated for symlink
 616                                  */
 617                                 if (hd.sym_link != (char *)NULL) {
 618                                         kmem_free(hd.sym_link,
 619                                             (size_t)(hd.ext_size+1));
 620                                         hd.sym_link = (char *)NULL;
 621                                 }
 622                         }
 623                         offset += hdlen;
 624                 }
 625                 fbrelse(fbp, S_READ);
 626         }
 627 
 628         /*
 629          * Got here for one of the following reasons:
 630          *      1) outbuf is full (error == 0)
 631          *      2) end of directory reached (error == 0)
 632          *      3) error reading directory sector (error != 0)
 633          *      4) directory entry crosses sector boundary (error == 0)
 634          *
 635          * If any directory entries have been copied, don't report
 636          * case 4.  Instead, return the valid directory entries.
 637          *
 638          * If no entries have been copied, report the error.
  639  * If case 4, this will be indistinguishable from EOF.
 640          */
 641 done:
 642         ndlen = ((char *)nd - outbuf);
 643         if (ndlen != 0) {
 644                 error = uiomove(outbuf, (size_t)ndlen, UIO_READ, uiop);
 645                 uiop->uio_loffset = offset;
 646         }
 647         kmem_free(dname, dname_size);
 648         kmem_free(outbuf, bufsize);
 649         if (eofp && error == 0)
 650                 *eofp = (uiop->uio_loffset >= dirsiz);
 651         return (error);
 652 }
 653 
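      /*
       * Build a file identifier (used, e.g., for NFS file handles) from
       * the directory LBN/offset of the entry plus the node id, so the
       * hsnode can be located again later.
       */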
 654 /*ARGSUSED2*/
 655 static int
 656 hsfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
 657 {
 658         struct hsnode *hp;
 659         struct hsfid *fid;
 660 
 661         if (fidp->fid_len < (sizeof (*fid) - sizeof (fid->hf_len))) {
 662                 fidp->fid_len = sizeof (*fid) - sizeof (fid->hf_len);
 663                 return (ENOSPC);
 664         }
 665 
 666         fid = (struct hsfid *)fidp;
 667         fid->hf_len = sizeof (*fid) - sizeof (fid->hf_len);
 668         hp = VTOH(vp);
 669         mutex_enter(&hp->hs_contents_lock);
 670         fid->hf_dir_lbn = hp->hs_dir_lbn;
 671         fid->hf_dir_off = (ushort_t)hp->hs_dir_off;
 672         fid->hf_ino = hp->hs_nodeid;
 673         mutex_exit(&hp->hs_contents_lock);
 674         return (0);
 675 }
 676 
 677 /*ARGSUSED*/
 678 static int
 679 hsfs_open(struct vnode **vpp,
 680         int flag,
 681         struct cred *cred,
 682         caller_context_t *ct)
 683 {
 684         return (0);
 685 }
 686 
 687 /*ARGSUSED*/
 688 static int
 689 hsfs_close(
 690         struct vnode *vp,
 691         int flag,
 692         int count,
 693         offset_t offset,
 694         struct cred *cred,
 695         caller_context_t *ct)
 696 {
 697         (void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 698         cleanshares(vp, ttoproc(curthread)->p_pid);
 699         return (0);
 700 }
 701 
 702 /*ARGSUSED2*/
 703 static int
 704 hsfs_access(struct vnode *vp,
 705         int mode,
 706         int flags,
 707         cred_t *cred,
 708         caller_context_t *ct)
 709 {
 710         return (hs_access(vp, (mode_t)mode, cred));
 711 }
 712 
 713 /*
  714  * The seek time of a CD-ROM is very slow, and the data transfer
  715  * rate is even worse (max. 150K per sec).  The design
  716  * decision is to reduce access to the CD-ROM as much as possible,
  717  * and to transfer a sizable block (read-ahead) of data at a time.
  718  * The UFS style of reading ahead one block at a time is not
  719  * appropriate here, and is not supported.
 720  */
 721 
 722 /*
 723  * KLUSTSIZE should be a multiple of PAGESIZE and <= MAXPHYS.
 724  */
 725 #define KLUSTSIZE       (56 * 1024)
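      /*
       * For reference (illustrative arithmetic only): 56K is 7 8K pages
       * or 14 4K pages, so KLUSTSIZE satisfies the multiple-of-PAGESIZE
       * constraint for the common page sizes; MAXPHYS is platform
       * dependent.
       */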
  726 /* read ahead is handled by hsfs_getpage_ra() and the I/O scheduler */
 727 int hsfs_lostpage;      /* no. of times we lost original page */
 728 
 729 /*
 730  * Used to prevent biodone() from releasing buf resources that
 731  * we didn't allocate in quite the usual way.
 732  */
 733 /*ARGSUSED*/
 734 int
 735 hsfs_iodone(struct buf *bp)
 736 {
 737         sema_v(&bp->b_io);
 738         return (0);
 739 }
 740 
 741 /*
 742  * The taskq thread that invokes the scheduling function to ensure
  743  * that all readaheads are complete, cleans up the associated
  744  * memory, and releases the page lock.
 745  */
 746 void
 747 hsfs_ra_task(void *arg)
 748 {
 749         struct hio_info *info = arg;
 750         uint_t count;
 751         struct buf *wbuf;
 752 
 753         ASSERT(info->pp != NULL);
 754 
 755         for (count = 0; count < info->bufsused; count++) {
 756                 wbuf = &(info->bufs[count]);
 757 
 758                 DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf);
 759                 while (sema_tryp(&(info->sema[count])) == 0) {
 760                         if (hsched_invoke_strategy(info->fsp)) {
 761                                 sema_p(&(info->sema[count]));
 762                                 break;
 763                         }
 764                 }
 765                 sema_destroy(&(info->sema[count]));
 766                 DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf);
 767                 biofini(&(info->bufs[count]));
 768         }
 769         for (count = 0; count < info->bufsused; count++) {
 770                 if (info->vas[count] != NULL) {
 771                         ppmapout(info->vas[count]);
 772                 }
 773         }
 774         kmem_free(info->vas, info->bufcnt * sizeof (caddr_t));
 775         kmem_free(info->bufs, info->bufcnt * sizeof (struct buf));
 776         kmem_free(info->sema, info->bufcnt * sizeof (ksema_t));
 777 
 778         pvn_read_done(info->pp, 0);
 779         kmem_cache_free(hio_info_cache, info);
 780 }
 781 
 782 /*
 783  * Submit asynchronous readahead requests to the I/O scheduler
 784  * depending on the number of pages to read ahead. These requests
 785  * are asynchronous to the calling thread but I/O requests issued
 786  * subsequently by other threads with higher LBNs must wait for
 787  * these readaheads to complete since we have a single ordered
 788  * I/O pipeline. Thus these readaheads are semi-asynchronous.
 789  * A TaskQ handles waiting for the readaheads to complete.
 790  *
 791  * This function is mostly a copy of hsfs_getapage but somewhat
 792  * simpler. A readahead request is aborted if page allocation
 793  * fails.
 794  */
 795 /*ARGSUSED*/
 796 static int
 797 hsfs_getpage_ra(
 798         struct vnode *vp,
 799         u_offset_t off,
 800         struct seg *seg,
 801         caddr_t addr,
 802         struct hsnode *hp,
 803         struct hsfs *fsp,
 804         int     xarsiz,
 805         offset_t        bof,
 806         int     chunk_lbn_count,
 807         int     chunk_data_bytes)
 808 {
 809         struct buf *bufs;
 810         caddr_t *vas;
 811         caddr_t va;
 812         struct page *pp, *searchp, *lastp;
 813         struct vnode *devvp;
 814         ulong_t byte_offset;
 815         size_t  io_len_tmp;
 816         uint_t  io_off, io_len;
 817         uint_t  xlen;
 818         uint_t  filsiz;
 819         uint_t  secsize;
 820         uint_t  bufcnt;
 821         uint_t  bufsused;
 822         uint_t  count;
 823         uint_t  io_end;
 824         uint_t  which_chunk_lbn;
 825         uint_t  offset_lbn;
 826         uint_t  offset_extra;
 827         offset_t        offset_bytes;
 828         uint_t  remaining_bytes;
 829         uint_t  extension;
 830         int     remainder;      /* must be signed */
 831         diskaddr_t driver_block;
 832         u_offset_t io_off_tmp;
 833         ksema_t *fio_done;
 834         struct hio_info *info;
 835         size_t len;
 836 
 837         ASSERT(fsp->hqueue != NULL);
 838 
 839         if (addr >= seg->s_base + seg->s_size) {
 840                 return (-1);
 841         }
 842 
 843         devvp = fsp->hsfs_devvp;
 844         secsize = fsp->hsfs_vol.lbn_size;  /* bytes per logical block */
 845 
 846         /* file data size */
 847         filsiz = hp->hs_dirent.ext_size;
 848 
 849         if (off >= filsiz)
 850                 return (0);
 851 
 852         extension = 0;
 853         pp = NULL;
 854 
 855         extension += hp->hs_ra_bytes;
 856 
 857         /*
 858          * Some CD writers (e.g. Kodak Photo CD writers)
 859          * create CDs in TAO mode and reserve tracks that
 860          * are not completely written. Some sectors remain
 861          * unreadable for this reason and give I/O errors.
 862          * Also, there's no point in reading sectors
 863          * we'll never look at.  So, if we're asked to go
 864          * beyond the end of a file, truncate to the length
 865          * of that file.
 866          *
 867          * Additionally, this behaviour is required by section
 868          * 6.4.5 of ISO 9660:1988(E).
 869          */
 870         len = MIN(extension ? extension : PAGESIZE, filsiz - off);
 871 
 872         /* A little paranoia */
 873         if (len <= 0)
 874                 return (-1);
 875 
 876         /*
 877          * After all that, make sure we're asking for things in units
 878          * that bdev_strategy() will understand (see bug 4202551).
 879          */
 880         len = roundup(len, DEV_BSIZE);
 881 
 882         pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
 883             &io_len_tmp, off, len, 1);
 884 
 885         if (pp == NULL) {
 886                 hp->hs_num_contig = 0;
 887                 hp->hs_ra_bytes = 0;
 888                 hp->hs_prev_offset = 0;
 889                 return (-1);
 890         }
 891 
 892         io_off = (uint_t)io_off_tmp;
 893         io_len = (uint_t)io_len_tmp;
 894 
 895         /* check for truncation */
 896         /*
 897          * xxx Clean up and return EIO instead?
 898          * xxx Ought to go to u_offset_t for everything, but we
 899          * xxx call lots of things that want uint_t arguments.
 900          */
 901         ASSERT(io_off == io_off_tmp);
 902 
 903         /*
 904          * get enough buffers for worst-case scenario
 905          * (i.e., no coalescing possible).
 906          */
 907         bufcnt = (len + secsize - 1) / secsize;
 908         bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP);
 909         vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
 910 
 911         /*
  912          * Allocate an array of semaphores since we are doing I/O
 913          * scheduling.
 914          */
 915         fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP);
 916 
 917         /*
 918          * If our filesize is not an integer multiple of PAGESIZE,
 919          * we zero that part of the last page that's between EOF and
 920          * the PAGESIZE boundary.
 921          */
 922         xlen = io_len & PAGEOFFSET;
 923         if (xlen != 0)
 924                 pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
 925 
 926         DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len);
 927 
 928         va = NULL;
 929         lastp = NULL;
 930         searchp = pp;
 931         io_end = io_off + io_len;
 932         for (count = 0, byte_offset = io_off;
 933             byte_offset < io_end;
 934             count++) {
 935                 ASSERT(count < bufcnt);
 936 
 937                 bioinit(&bufs[count]);
 938                 bufs[count].b_edev = devvp->v_rdev;
 939                 bufs[count].b_dev = cmpdev(devvp->v_rdev);
 940                 bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
 941                 bufs[count].b_iodone = hsfs_iodone;
 942                 bufs[count].b_vp = vp;
 943                 bufs[count].b_file = vp;
 944 
 945                 /* Compute disk address for interleaving. */
 946 
 947                 /* considered without skips */
 948                 which_chunk_lbn = byte_offset / chunk_data_bytes;
 949 
 950                 /* factor in skips */
 951                 offset_lbn = which_chunk_lbn * chunk_lbn_count;
 952 
 953                 /* convert to physical byte offset for lbn */
 954                 offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
 955 
 956                 /* don't forget offset into lbn */
 957                 offset_extra = byte_offset % chunk_data_bytes;
 958 
 959                 /* get virtual block number for driver */
 960                 driver_block = lbtodb(bof + xarsiz
 961                     + offset_bytes + offset_extra);
 962 
 963                 if (lastp != searchp) {
 964                         /* this branch taken first time through loop */
 965                         va = vas[count] = ppmapin(searchp, PROT_WRITE,
 966                             (caddr_t)-1);
 967                         /* ppmapin() guarantees not to return NULL */
 968                 } else {
 969                         vas[count] = NULL;
 970                 }
 971 
 972                 bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
 973                 bufs[count].b_offset =
 974                     (offset_t)(byte_offset - io_off + off);
 975 
 976                 /*
 977                  * We specifically use the b_lblkno member here
 978                  * as even in the 32 bit world driver_block can
 979                  * get very large in line with the ISO9660 spec.
 980                  */
 981 
 982                 bufs[count].b_lblkno = driver_block;
 983 
 984                 remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes)
 985                     - byte_offset;
 986 
 987                 /*
 988                  * remaining_bytes can't be zero, as we derived
 989                  * which_chunk_lbn directly from byte_offset.
 990                  */
 991                 if ((remaining_bytes + byte_offset) < (off + len)) {
 992                         /* coalesce-read the rest of the chunk */
 993                         bufs[count].b_bcount = remaining_bytes;
 994                 } else {
 995                         /* get the final bits */
 996                         bufs[count].b_bcount = off + len - byte_offset;
 997                 }
 998 
 999                 remainder = PAGESIZE - (byte_offset % PAGESIZE);
1000                 if (bufs[count].b_bcount > remainder) {
1001                         bufs[count].b_bcount = remainder;
1002                 }
1003 
1004                 bufs[count].b_bufsize = bufs[count].b_bcount;
1005                 if (((offset_t)byte_offset + bufs[count].b_bcount) >
1006                     HS_MAXFILEOFF) {
1007                         break;
1008                 }
1009                 byte_offset += bufs[count].b_bcount;
1010 
1011                 /*
1012                  * We are scheduling I/O so we need to enqueue
1013                  * requests rather than calling bdev_strategy
1014                  * here. A later invocation of the scheduling
1015                  * function will take care of doing the actual
1016                  * I/O as it selects requests from the queue as
1017                  * per the scheduling logic.
1018                  */
1019                 struct hio *hsio = kmem_cache_alloc(hio_cache,
1020                     KM_SLEEP);
1021 
1022                 sema_init(&fio_done[count], 0, NULL,
1023                     SEMA_DEFAULT, NULL);
1024                 hsio->bp = &bufs[count];
1025                 hsio->sema = &fio_done[count];
1026                 hsio->io_lblkno = bufs[count].b_lblkno;
1027                 hsio->nblocks = howmany(hsio->bp->b_bcount,
1028                     DEV_BSIZE);
1029 
1030                 /* used for deadline */
1031                 hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());
1032 
1033                 /* for I/O coalescing */
1034                 hsio->contig_chain = NULL;
1035                 hsched_enqueue_io(fsp, hsio, 1);
1036 
1037                 lwp_stat_update(LWP_STAT_INBLK, 1);
1038                 lastp = searchp;
1039                 if ((remainder - bufs[count].b_bcount) < 1) {
1040                         searchp = searchp->p_next;
1041                 }
1042         }
1043 
1044         bufsused = count;
1045         info = kmem_cache_alloc(hio_info_cache, KM_SLEEP);
1046         info->bufs = bufs;
1047         info->vas = vas;
1048         info->sema = fio_done;
1049         info->bufsused = bufsused;
1050         info->bufcnt = bufcnt;
1051         info->fsp = fsp;
1052         info->pp = pp;
1053 
1054         (void) taskq_dispatch(fsp->hqueue->ra_task,
1055             hsfs_ra_task, info, KM_SLEEP);
1056         /*
1057          * The I/O locked pages are unlocked in our taskq thread.
1058          */
1059         return (0);
1060 }
1061 
1062 /*
1063  * Each file may have a different interleaving on disk.  This makes
1064  * things somewhat interesting.  The gist is that there are some
1065  * number of contiguous data sectors, followed by some other number
1066  * of contiguous skip sectors.  The sum of those two sets of sectors
1067  * defines the interleave size.  Unfortunately, it means that we generally
1068  * can't simply read N sectors starting at a given offset to satisfy
1069  * any given request.
1070  *
1071  * What we do is get the relevant memory pages via pvn_read_kluster(),
1072  * then stride through the interleaves, setting up a buf for each
1073  * sector that needs to be brought in.  Instead of kmem_alloc'ing
1074  * space for the sectors, though, we just point at the appropriate
1075  * spot in the relevant page for each of them.  This saves us a bunch
1076  * of copying.
1077  *
1078  * NOTICE: The code below in hsfs_getapage is mostly same as the code
1079  *         in hsfs_getpage_ra above (with some omissions). If you are
1080  *         making any change to this function, please also look at
1081  *         hsfs_getpage_ra.
1082  */
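      /*
       * Purely illustrative example of the interleave arithmetic used
       * below (all numbers made up): with lbn_size = 2048, intlf_sz = 4
       * and intlf_sk = 2, chunk_lbn_count = 6 and chunk_data_bytes = 8192.
       * For a file byte_offset of 20000, which_chunk_lbn = 20000 / 8192 = 2,
       * offset_lbn = 2 * 6 = 12, offset_bytes = 12 * 2048 = 24576 and
       * offset_extra = 20000 % 8192 = 3616, giving
       * driver_block = lbtodb(bof + xarsiz + 24576 + 3616).
       */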
1083 /*ARGSUSED*/
1084 static int
1085 hsfs_getapage(
1086         struct vnode *vp,
1087         u_offset_t off,
1088         size_t len,
1089         uint_t *protp,
1090         struct page *pl[],
1091         size_t plsz,
1092         struct seg *seg,
1093         caddr_t addr,
1094         enum seg_rw rw,
1095         struct cred *cred)
1096 {
1097         struct hsnode *hp;
1098         struct hsfs *fsp;
1099         int     err;
1100         struct buf *bufs;
1101         caddr_t *vas;
1102         caddr_t va;
1103         struct page *pp, *searchp, *lastp;
1104         page_t  *pagefound;
1105         offset_t        bof;
1106         struct vnode *devvp;
1107         ulong_t byte_offset;
1108         size_t  io_len_tmp;
1109         uint_t  io_off, io_len;
1110         uint_t  xlen;
1111         uint_t  filsiz;
1112         uint_t  secsize;
1113         uint_t  bufcnt;
1114         uint_t  bufsused;
1115         uint_t  count;
1116         uint_t  io_end;
1117         uint_t  which_chunk_lbn;
1118         uint_t  offset_lbn;
1119         uint_t  offset_extra;
1120         offset_t        offset_bytes;
1121         uint_t  remaining_bytes;
1122         uint_t  extension;
1123         int     remainder;      /* must be signed */
1124         int     chunk_lbn_count;
1125         int     chunk_data_bytes;
1126         int     xarsiz;
1127         diskaddr_t driver_block;
1128         u_offset_t io_off_tmp;
1129         ksema_t *fio_done;
1130         int     calcdone;
1131 
1132         /*
1133          * We don't support asynchronous operation at the moment, so
1134          * just pretend we did it.  If the pages are ever actually
1135          * needed, they'll get brought in then.
1136          */
1137         if (pl == NULL)
1138                 return (0);
1139 
1140         hp = VTOH(vp);
1141         fsp = VFS_TO_HSFS(vp->v_vfsp);
1142         devvp = fsp->hsfs_devvp;
1143         secsize = fsp->hsfs_vol.lbn_size;  /* bytes per logical block */
1144 
1145         /* file data size */
1146         filsiz = hp->hs_dirent.ext_size;
1147 
1148         /* disk addr for start of file */
1149         bof = LBN_TO_BYTE((offset_t)hp->hs_dirent.ext_lbn, vp->v_vfsp);
1150 
1151         /* xarsiz byte must be skipped for data */
1152         xarsiz = hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift;
1153 
1154         /* how many logical blocks in an interleave (data+skip) */
1155         chunk_lbn_count = hp->hs_dirent.intlf_sz + hp->hs_dirent.intlf_sk;
1156 
1157         if (chunk_lbn_count == 0) {
1158                 chunk_lbn_count = 1;
1159         }
1160 
1161         /*
1162          * Convert interleaving size into bytes.  The zero case
1163          * (no interleaving) optimization is handled as a side-
1164          * effect of the read-ahead logic.
1165          */
1166         if (hp->hs_dirent.intlf_sz == 0) {
1167                 chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
1168                 /*
1169                  * Optimization: If our pagesize is a multiple of LBN
1170                  * bytes, we can avoid breaking up a page into individual
1171                  * lbn-sized requests.
1172                  */
1173                 if (PAGESIZE % chunk_data_bytes == 0) {
1174                         chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp);
1175                         chunk_data_bytes = PAGESIZE;
1176                 }
1177         } else {
1178                 chunk_data_bytes =
1179                     LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp);
1180         }
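              /*
               * Example of the non-interleaved optimization above, assuming
               * a 2048-byte lbn_size and an 8192-byte PAGESIZE: PAGESIZE is
               * an exact multiple of the lbn size, so chunk_lbn_count
               * becomes 4 and chunk_data_bytes becomes PAGESIZE, and a page
               * never has to be split into individual lbn-sized buf
               * requests.
               */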
1181 
1182 reread:
1183         err = 0;
1184         pagefound = 0;
1185         calcdone = 0;
1186 
1187         /*
1188          * Do some read-ahead.  This mostly saves us a bit of
1189          * system cpu time more than anything else when doing
1190          * sequential reads.  At some point, could do the
1191          * read-ahead asynchronously which might gain us something
1192          * on wall time, but it seems unlikely....
1193          *
1194          * We do the easy case here, which is to read through
1195          * the end of the chunk, minus whatever's at the end that
1196          * won't exactly fill a page.
1197          */
1198         if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) {
1199                 which_chunk_lbn = (off + len) / chunk_data_bytes;
1200                 extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
1201                 extension -= (extension % PAGESIZE);
1202         } else {
1203                 extension = roundup(len, PAGESIZE);
1204         }
1205 
1206         atomic_inc_64(&fsp->total_pages_requested);
1207 
1208         pp = NULL;
1209 again:
1210         /* search for page in buffer */
1211         if ((pagefound = page_exists(vp, off)) == 0) {
1212                 /*
1213                  * Need to really do disk IO to get the page.
1214                  */
1215                 if (!calcdone) {
1216                         extension += hp->hs_ra_bytes;
1217 
1218                         /*
1219                          * Some cd writers don't write sectors that aren't
1220                          * used. Also, there's no point in reading sectors
1221                          * we'll never look at.  So, if we're asked to go
1222                          * beyond the end of a file, truncate to the length
1223                          * of that file.
1224                          *
1225                          * Additionally, this behaviour is required by section
1226                          * 6.4.5 of ISO 9660:1988(E).
1227                          */
1228                         len = MIN(extension ? extension : PAGESIZE,
1229                             filsiz - off);
1230 
1231                         /* A little paranoia. */
1232                         ASSERT(len > 0);
1233 
1234                         /*
1235                          * After all that, make sure we're asking for things
1236                          * in units that bdev_strategy() will understand
1237                          * (see bug 4202551).
1238                          */
1239                         len = roundup(len, DEV_BSIZE);
1240                         calcdone = 1;
1241                 }
1242 
1243                 pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
1244                     &io_len_tmp, off, len, 0);
1245 
1246                 if (pp == NULL) {
1247                         /*
1248                          * Pressure on memory, roll back readahead
1249                          */
1250                         hp->hs_num_contig = 0;
1251                         hp->hs_ra_bytes = 0;
1252                         hp->hs_prev_offset = 0;
1253                         goto again;
1254                 }
1255 
1256                 io_off = (uint_t)io_off_tmp;
1257                 io_len = (uint_t)io_len_tmp;
1258 
1259                 /* check for truncation */
1260                 /*
1261                  * xxx Clean up and return EIO instead?
1262                  * xxx Ought to go to u_offset_t for everything, but we
1263                  * xxx call lots of things that want uint_t arguments.
1264                  */
1265                 ASSERT(io_off == io_off_tmp);
1266 
1267                 /*
1268                  * get enough buffers for worst-case scenario
1269                  * (i.e., no coalescing possible).
1270                  */
1271                 bufcnt = (len + secsize - 1) / secsize;
1272                 bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
1273                 vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
1274 
1275                 /*
 1276                  * Allocate an array of semaphores if we are doing I/O
1277                  * scheduling.
1278                  */
1279                 if (fsp->hqueue != NULL)
1280                         fio_done = kmem_alloc(bufcnt * sizeof (ksema_t),
1281                             KM_SLEEP);
1282                 for (count = 0; count < bufcnt; count++) {
1283                         bioinit(&bufs[count]);
1284                         bufs[count].b_edev = devvp->v_rdev;
1285                         bufs[count].b_dev = cmpdev(devvp->v_rdev);
1286                         bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
1287                         bufs[count].b_iodone = hsfs_iodone;
1288                         bufs[count].b_vp = vp;
1289                         bufs[count].b_file = vp;
1290                 }
1291 
1292                 /*
1293                  * If our filesize is not an integer multiple of PAGESIZE,
1294                  * we zero that part of the last page that's between EOF and
1295                  * the PAGESIZE boundary.
1296                  */
1297                 xlen = io_len & PAGEOFFSET;
1298                 if (xlen != 0)
1299                         pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
1300 
1301                 va = NULL;
1302                 lastp = NULL;
1303                 searchp = pp;
1304                 io_end = io_off + io_len;
1305                 for (count = 0, byte_offset = io_off;
1306                     byte_offset < io_end; count++) {
1307                         ASSERT(count < bufcnt);
1308 
1309                         /* Compute disk address for interleaving. */
1310 
1311                         /* considered without skips */
1312                         which_chunk_lbn = byte_offset / chunk_data_bytes;
1313 
1314                         /* factor in skips */
1315                         offset_lbn = which_chunk_lbn * chunk_lbn_count;
1316 
1317                         /* convert to physical byte offset for lbn */
1318                         offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
1319 
1320                         /* don't forget offset into lbn */
1321                         offset_extra = byte_offset % chunk_data_bytes;
1322 
1323                         /* get virtual block number for driver */
1324                         driver_block =
1325                             lbtodb(bof + xarsiz + offset_bytes + offset_extra);
1326 
1327                         if (lastp != searchp) {
1328                                 /* this branch taken first time through loop */
1329                                 va = vas[count] =
1330                                     ppmapin(searchp, PROT_WRITE, (caddr_t)-1);
1331                                 /* ppmapin() guarantees not to return NULL */
1332                         } else {
1333                                 vas[count] = NULL;
1334                         }
1335 
1336                         bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
1337                         bufs[count].b_offset =
1338                             (offset_t)(byte_offset - io_off + off);
1339 
1340                         /*
1341                          * We specifically use the b_lblkno member here
1342                          * as even in the 32 bit world driver_block can
1343                          * get very large in line with the ISO9660 spec.
1344                          */
1345 
1346                         bufs[count].b_lblkno = driver_block;
1347 
1348                         remaining_bytes =
1349                             ((which_chunk_lbn + 1) * chunk_data_bytes)
1350                             - byte_offset;
1351 
1352                         /*
1353                          * remaining_bytes can't be zero, as we derived
1354                          * which_chunk_lbn directly from byte_offset.
1355                          */
1356                         if ((remaining_bytes + byte_offset) < (off + len)) {
1357                                 /* coalesce-read the rest of the chunk */
1358                                 bufs[count].b_bcount = remaining_bytes;
1359                         } else {
1360                                 /* get the final bits */
1361                                 bufs[count].b_bcount = off + len - byte_offset;
1362                         }
1363 
1364                         /*
1365                          * It would be nice to do multiple pages'
1366                          * worth at once here when the opportunity
1367                          * arises, as that has been shown to improve
1368                          * our wall time.  However, to do that
1369                          * requires that we use the pageio subsystem,
1370                          * which doesn't mix well with what we're
1371                          * already using here.  We can't use pageio
1372                          * all the time, because that subsystem
1373                          * assumes that a page is stored in N
1374                          * contiguous blocks on the device.
1375                          * Interleaving violates that assumption.
1376                          *
1377                          * Update: This is now not so big a problem
1378                          * because of the I/O scheduler sitting below
1379                          * that can re-order and coalesce I/O requests.
1380                          */
1381 
1382                         remainder = PAGESIZE - (byte_offset % PAGESIZE);
1383                         if (bufs[count].b_bcount > remainder) {
1384                                 bufs[count].b_bcount = remainder;
1385                         }
1386 
1387                         bufs[count].b_bufsize = bufs[count].b_bcount;
1388                         if (((offset_t)byte_offset + bufs[count].b_bcount) >
1389                             HS_MAXFILEOFF) {
1390                                 break;
1391                         }
1392                         byte_offset += bufs[count].b_bcount;
1393 
1394                         if (fsp->hqueue == NULL) {
1395                                 (void) bdev_strategy(&bufs[count]);
1396 
1397                         } else {
1398                                 /*
1399                                  * We are scheduling I/O so we need to enqueue
1400                                  * requests rather than calling bdev_strategy
1401                                  * here. A later invocation of the scheduling
1402                                  * function will take care of doing the actual
1403                                  * I/O as it selects requests from the queue as
1404                                  * per the scheduling logic.
1405                                  */
1406                                 struct hio *hsio = kmem_cache_alloc(hio_cache,
1407                                     KM_SLEEP);
1408 
1409                                 sema_init(&fio_done[count], 0, NULL,
1410                                     SEMA_DEFAULT, NULL);
1411                                 hsio->bp = &bufs[count];
1412                                 hsio->sema = &fio_done[count];
1413                                 hsio->io_lblkno = bufs[count].b_lblkno;
1414                                 hsio->nblocks = howmany(hsio->bp->b_bcount,
1415                                     DEV_BSIZE);
1416 
1417                                 /* used for deadline */
1418                                 hsio->io_timestamp =
1419                                     drv_hztousec(ddi_get_lbolt());
1420 
1421                                 /* for I/O coalescing */
1422                                 hsio->contig_chain = NULL;
1423                                 hsched_enqueue_io(fsp, hsio, 0);
1424                         }
1425 
1426                         lwp_stat_update(LWP_STAT_INBLK, 1);
1427                         lastp = searchp;
1428                         if ((remainder - bufs[count].b_bcount) < 1) {
1429                                 searchp = searchp->p_next;
1430                         }
1431                 }
1432 
1433                 bufsused = count;
1434                 /* Now wait for everything to come in */
1435                 if (fsp->hqueue == NULL) {
1436                         for (count = 0; count < bufsused; count++) {
1437                                 if (err == 0) {
1438                                         err = biowait(&bufs[count]);
1439                                 } else
1440                                         (void) biowait(&bufs[count]);
1441                         }
1442                 } else {
1443                         for (count = 0; count < bufsused; count++) {
1444                                 struct buf *wbuf;
1445 
1446                                 /*
1447                                  * Invoke the scheduling function till our buf
1448                                  * is processed. In doing this it might also
1449                                  * process bufs enqueued by other threads,
1450                                  * which is good.
1451                                  */
1452                                 wbuf = &bufs[count];
1453                                 DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf);
1454                                 while (sema_tryp(&fio_done[count]) == 0) {
1455                                         /*
1456                                          * hsched_invoke_strategy will return 1
1457                                          * if the I/O queue is empty. This means
1458                                          * that there is another thread who has
1459                                          * issued our buf and is waiting. So we
1460                                          * just block instead of spinning.
1461                                          */
1462                                         if (hsched_invoke_strategy(fsp)) {
1463                                                 sema_p(&fio_done[count]);
1464                                                 break;
1465                                         }
1466                                 }
1467                                 sema_destroy(&fio_done[count]);
1468                                 DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf);
1469 
1470                                 if (err == 0) {
1471                                         err = geterror(wbuf);
1472                                 }
1473                         }
1474                         kmem_free(fio_done, bufcnt * sizeof (ksema_t));
1475                 }
1476 
1477                 /* Don't leak resources */
1478                 for (count = 0; count < bufcnt; count++) {
1479                         biofini(&bufs[count]);
1480                         if (count < bufsused && vas[count] != NULL) {
1481                                 ppmapout(vas[count]);
1482                         }
1483                 }
1484 
1485                 kmem_free(vas, bufcnt * sizeof (caddr_t));
1486                 kmem_free(bufs, bufcnt * sizeof (struct buf));
1487         }
1488 
1489         if (err) {
1490                 pvn_read_done(pp, B_ERROR);
1491                 return (err);
1492         }
1493 
1494         /*
1495          * Lock the requested page, and the one after it if possible.
1496          * Don't bother if our caller hasn't given us a place to stash
1497          * the page pointers, since otherwise we'd lock pages that would
1498          * never get unlocked.
1499          */
1500         if (pagefound) {
1501                 int index;
1502                 ulong_t soff;
1503 
1504                 /*
1505                  * Make sure it's in memory before we say it's here.
1506                  */
1507                 if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
1508                         hsfs_lostpage++;
1509                         goto reread;
1510                 }
1511 
1512                 pl[0] = pp;
1513                 index = 1;
1514                 atomic_inc_64(&fsp->cache_read_pages);
1515 
1516                 /*
1517                  * Try to lock the next page, if it exists, without
1518                  * blocking.
1519                  */
1520                 plsz -= PAGESIZE;
1521                 /* LINTED (plsz is unsigned) */
1522                 for (soff = off + PAGESIZE; plsz > 0;
1523                     soff += PAGESIZE, plsz -= PAGESIZE) {
1524                         pp = page_lookup_nowait(vp, (u_offset_t)soff,
1525                             SE_SHARED);
1526                         if (pp == NULL)
1527                                 break;
1528                         pl[index++] = pp;
1529                 }
1530                 pl[index] = NULL;
1531 
1532                 /*
1533                  * Schedule a semi-asynchronous readahead if we are
1534                  * accessing the last cached page for the current
1535                  * file.
1536                  *
1537                  * Doing this here means that readaheads will be
1538                  * issued only if cache-hits occur. This is an advantage
1539                  * since cache-hits would mean that readahead is giving
1540                  * the desired benefit. If cache-hits do not occur there
1541                  * is no point in reading ahead of time - the system
1542                  * is loaded anyway.
1543                  */
1544                 if (fsp->hqueue != NULL &&
1545                     hp->hs_prev_offset - off == PAGESIZE &&
1546                     hp->hs_prev_offset < filsiz &&
1547                     hp->hs_ra_bytes > 0 &&
1548                     !page_exists(vp, hp->hs_prev_offset)) {
1549                         (void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg,
1550                             addr + PAGESIZE, hp, fsp, xarsiz, bof,
1551                             chunk_lbn_count, chunk_data_bytes);
1552                 }
1553 
1554                 return (0);
1555         }
1556 
1557         if (pp != NULL) {
1558                 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
1559         }
1560 
1561         return (err);
1562 }
1563 
1564 /*ARGSUSED*/
1565 static int
1566 hsfs_getpage(
1567         struct vnode *vp,
1568         offset_t off,
1569         size_t len,
1570         uint_t *protp,
1571         struct page *pl[],
1572         size_t plsz,
1573         struct seg *seg,
1574         caddr_t addr,
1575         enum seg_rw rw,
1576         struct cred *cred,
1577         caller_context_t *ct)
1578 {
1579         uint_t filsiz;
1580         struct hsfs *fsp;
1581         struct hsnode *hp;
1582 
1583         fsp = VFS_TO_HSFS(vp->v_vfsp);
1584         hp = VTOH(vp);
1585 
1586         /* does not support write */
1587         if (rw == S_WRITE) {
1588                 return (EROFS);
1589         }
1590 
1591         if (vp->v_flag & VNOMAP) {
1592                 return (ENOSYS);
1593         }
1594 
1595         ASSERT(off <= HS_MAXFILEOFF);
1596 
1597         /*
1598          * Determine file data size for EOF check.
1599          */
1600         filsiz = hp->hs_dirent.ext_size;
1601         if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap)
1602                 return (EFAULT);        /* beyond EOF */
1603 
1604         /*
1605          * Async Read-ahead computation.
1606          * This attempts to detect a sequential access pattern and
1607          * enables reading extra pages ahead of time.
1608          */
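     	/*
     	 * Illustrative sketch only (the numbers are hypothetical): with
     	 * an 8K PAGESIZE and seq_contig_requests set to 2, an application
     	 * reading 8K at offsets 0, 8K, 16K, ... matches hs_prev_offset on
     	 * its second request, and from the third request onward
     	 * hs_ra_bytes grows by PAGESIZE per request until it reaches
     	 * hqueue->max_ra_bytes.
     	 */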
1609         if (fsp->hqueue != NULL) {
1610                 /*
1611                  * This check for sequential access also takes into
1612                  * account segmap weirdness when reading in chunks
1613                  * less than the segmap size of 8K.
1614                  */
1615                 if (hp->hs_prev_offset == off || (off <
1616                     hp->hs_prev_offset && off + MAX(len, PAGESIZE)
1617                     >= hp->hs_prev_offset)) {
1618                         if (hp->hs_num_contig <
1619                             (seq_contig_requests - 1)) {
1620                                 hp->hs_num_contig++;
1621 
1622                         } else {
1623                                 /*
1624                                  * We increase the readahead quantum till a
1625                                  * predefined maximum, max_ra_bytes, which
1626                                  * is a multiple of PAGESIZE.
1627                                  */
1628                                 if (hp->hs_ra_bytes <
1629                                     fsp->hqueue->max_ra_bytes) {
1630                                         hp->hs_ra_bytes += PAGESIZE;
1631                                 }
1632                         }
1633                 } else {
1634                         /*
1635                          * Not contiguous, so reduce the readahead counters.
1636                          */
1637                         if (hp->hs_ra_bytes > 0)
1638                                 hp->hs_ra_bytes -= PAGESIZE;
1639 
1640                         if (hp->hs_ra_bytes <= 0) {
1641                                 hp->hs_ra_bytes = 0;
1642                                 if (hp->hs_num_contig > 0)
1643                                         hp->hs_num_contig--;
1644                         }
1645                 }
1646                 /*
1647                  * The length must be rounded up to a page boundary,
1648                  * since we read in units of pages.
1649                  */
1650                 hp->hs_prev_offset = off + roundup(len, PAGESIZE);
1651                 DTRACE_PROBE1(hsfs_compute_ra, struct hsnode *, hp);
1652         }
1653         if (protp != NULL)
1654                 *protp = PROT_ALL;
1655 
1656         return (pvn_getpages(hsfs_getapage, vp, off, len, protp, pl, plsz,
1657             seg, addr, rw, cred));
1658 }
1659 
1660 
1661 
1662 /*
1663  * This function should never be called. We only need it so that it can
1664  * be passed as an argument to other functions.
1665  */
1666 /*ARGSUSED*/
1667 int
1668 hsfs_putapage(
1669         vnode_t         *vp,
1670         page_t          *pp,
1671         u_offset_t      *offp,
1672         size_t          *lenp,
1673         int             flags,
1674         cred_t          *cr)
1675 {
1676         /* should never happen - just destroy it */
1677         cmn_err(CE_NOTE, "hsfs_putapage: dirty HSFS page");
1678         pvn_write_done(pp, B_ERROR | B_WRITE | B_INVAL | B_FORCE | flags);
1679         return (0);
1680 }
1681 
1682 
1683 /*
1684  * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
1685  * B_INVAL is set by:
1686  *
1687  *      1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
1688  *      2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
1689  *         which translates to an MC_SYNC with the MS_INVALIDATE flag.
1690  *
1691  * The B_FREE (as well as the B_DONTNEED) flag is set when the
1692  * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
1693  * from SEGVN to release pages behind a pagefault.
1694  */
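     /*
      * For example (illustrative only), a process that has a mapping of an
      * HSFS file can call memcntl(2) with MC_SYNC and the MS_INVALIDATE flag
      * (or msync(3C) with MS_INVALIDATE) on that mapping; the resulting
      * VOP_PUTPAGE arrives here with B_INVAL set.
      */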
1695 /*ARGSUSED*/
1696 static int
1697 hsfs_putpage(
1698         struct vnode            *vp,
1699         offset_t                off,
1700         size_t                  len,
1701         int                     flags,
1702         struct cred             *cr,
1703         caller_context_t        *ct)
1704 {
1705         int error = 0;
1706 
1707         if (vp->v_count == 0) {
1708                 panic("hsfs_putpage: bad v_count");
1709                 /*NOTREACHED*/
1710         }
1711 
1712         if (vp->v_flag & VNOMAP)
1713                 return (ENOSYS);
1714 
1715         ASSERT(off <= HS_MAXFILEOFF);
1716 
1717         if (!vn_has_cached_data(vp))    /* no pages mapped */
1718                 return (0);
1719 
1720         if (len == 0) {         /* from 'off' to EOF */
1721                 error = pvn_vplist_dirty(vp, off, hsfs_putapage, flags, cr);
1722         } else {
1723                 offset_t end_off = off + len;
1724                 offset_t file_size = VTOH(vp)->hs_dirent.ext_size;
1725                 offset_t io_off;
1726 
1727                 file_size = (file_size + PAGESIZE - 1) & PAGEMASK;
1728                 if (end_off > file_size)
1729                         end_off = file_size;
1730 
1731                 for (io_off = off; io_off < end_off; io_off += PAGESIZE) {
1732                         page_t *pp;
1733 
1734                         /*
1735                          * We insist on getting the page only if we are
1736                          * about to invalidate, free or write it and
1737                          * the B_ASYNC flag is not set.
1738                          */
1739                         if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
1740                                 pp = page_lookup(vp, io_off,
1741                                     (flags & (B_INVAL | B_FREE)) ?
1742                                     SE_EXCL : SE_SHARED);
1743                         } else {
1744                                 pp = page_lookup_nowait(vp, io_off,
1745                                     (flags & B_FREE) ? SE_EXCL : SE_SHARED);
1746                         }
1747 
1748                         if (pp == NULL)
1749                                 continue;
1750 
1751                         /*
1752                          * Normally pvn_getdirty() should return 0, which
1753                          * implies that it has done the job for us.
1754                          * The shouldn't-happen scenario is when it returns 1.
1755                          * This means that the page has been modified and
1756                          * needs to be put back.
1757                          * Since we can't write on a CD, we fake a failed
1758                          * I/O and force pvn_write_done() to destroy the page.
1759                          */
1760                         if (pvn_getdirty(pp, flags) == 1) {
1761                                 cmn_err(CE_NOTE,
1762                                     "hsfs_putpage: dirty HSFS page");
1763                                 pvn_write_done(pp, flags |
1764                                     B_ERROR | B_WRITE | B_INVAL | B_FORCE);
1765                         }
1766                 }
1767         }
1768         return (error);
1769 }
1770 
1771 
1772 /*ARGSUSED*/
1773 static int
1774 hsfs_map(
1775         struct vnode *vp,
1776         offset_t off,
1777         struct as *as,
1778         caddr_t *addrp,
1779         size_t len,
1780         uchar_t prot,
1781         uchar_t maxprot,
1782         uint_t flags,
1783         struct cred *cred,
1784         caller_context_t *ct)
1785 {
1786         struct segvn_crargs vn_a;
1787         int error;
1788 
1789         /* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */
1790 
1791         if (vp->v_flag & VNOMAP)
1792                 return (ENOSYS);
1793 
1794         if ((prot & PROT_WRITE) && (flags & MAP_SHARED))
1795                 return (ENOSYS);
1796 
1797         if (off > HS_MAXFILEOFF || off < 0 ||
1798             (off + len) < 0 || (off + len) > HS_MAXFILEOFF)
1799                 return (ENXIO);
1800 
1801         if (vp->v_type != VREG) {
1802                 return (ENODEV);
1803         }
1804 
1805         /*
1806          * If the file is being locked, disallow mapping.
1807          */
1808         if (vn_has_mandatory_locks(vp, VTOH(vp)->hs_dirent.mode))
1809                 return (EAGAIN);
1810 
1811         as_rangelock(as);
1812         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
1813         if (error != 0) {
1814                 as_rangeunlock(as);
1815                 return (error);
1816         }
1817 
1818         vn_a.vp = vp;
1819         vn_a.offset = off;
1820         vn_a.type = flags & MAP_TYPE;
1821         vn_a.prot = prot;
1822         vn_a.maxprot = maxprot;
1823         vn_a.flags = flags & ~MAP_TYPE;
1824         vn_a.cred = cred;
1825         vn_a.amp = NULL;
1826         vn_a.szc = 0;
1827         vn_a.lgrp_mem_policy_flags = 0;
1828 
1829         error = as_map(as, *addrp, len, segvn_create, &vn_a);
1830         as_rangeunlock(as);
1831         return (error);
1832 }
1833 
1834 /* ARGSUSED */
1835 static int
1836 hsfs_addmap(
1837         struct vnode *vp,
1838         offset_t off,
1839         struct as *as,
1840         caddr_t addr,
1841         size_t len,
1842         uchar_t prot,
1843         uchar_t maxprot,
1844         uint_t flags,
1845         struct cred *cr,
1846         caller_context_t *ct)
1847 {
1848         struct hsnode *hp;
1849 
1850         if (vp->v_flag & VNOMAP)
1851                 return (ENOSYS);
1852 
1853         hp = VTOH(vp);
1854         mutex_enter(&hp->hs_contents_lock);
1855         hp->hs_mapcnt += btopr(len);
1856         mutex_exit(&hp->hs_contents_lock);
1857         return (0);
1858 }
1859 
1860 /*ARGSUSED*/
1861 static int
1862 hsfs_delmap(
1863         struct vnode *vp,
1864         offset_t off,
1865         struct as *as,
1866         caddr_t addr,
1867         size_t len,
1868         uint_t prot,
1869         uint_t maxprot,
1870         uint_t flags,
1871         struct cred *cr,
1872         caller_context_t *ct)
1873 {
1874         struct hsnode *hp;
1875 
1876         if (vp->v_flag & VNOMAP)
1877                 return (ENOSYS);
1878 
1879         hp = VTOH(vp);
1880         mutex_enter(&hp->hs_contents_lock);
1881         hp->hs_mapcnt -= btopr(len); /* Count released mappings */
1882         ASSERT(hp->hs_mapcnt >= 0);
1883         mutex_exit(&hp->hs_contents_lock);
1884         return (0);
1885 }
1886 
1887 /* ARGSUSED */
1888 static int
1889 hsfs_seek(
1890         struct vnode *vp,
1891         offset_t ooff,
1892         offset_t *noffp,
1893         caller_context_t *ct)
1894 {
1895         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1896 }
1897 
1898 /* ARGSUSED */
1899 static int
1900 hsfs_frlock(
1901         struct vnode *vp,
1902         int cmd,
1903         struct flock64 *bfp,
1904         int flag,
1905         offset_t offset,
1906         struct flk_callback *flk_cbp,
1907         cred_t *cr,
1908         caller_context_t *ct)
1909 {
1910         struct hsnode *hp = VTOH(vp);
1911 
1912         /*
1913          * If the file is being mapped, disallow fs_frlock.
1914          * We are not holding the hs_contents_lock while checking
1915          * hs_mapcnt because the current locking strategy drops all
1916          * locks before calling fs_frlock.
1917          * So, hs_mapcnt could change before we enter fs_frlock, making
1918          * it meaningless to have held hs_contents_lock in the first place.
1919          */
1920         if (hp->hs_mapcnt > 0 && MANDLOCK(vp, hp->hs_dirent.mode))
1921                 return (EAGAIN);
1922 
1923         return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1924 }
1925 
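     /*
      * AVL comparator for the deadline tree: order primarily by request age
      * (io_timestamp), then by logical block number, and finally by the
      * structure address itself so that distinct requests never compare
      * equal, as required for AVL insertion.
      */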
1926 static int
1927 hsched_deadline_compare(const void *x1, const void *x2)
1928 {
1929         const struct hio *h1 = x1;
1930         const struct hio *h2 = x2;
1931 
1932         if (h1->io_timestamp < h2->io_timestamp)
1933                 return (-1);
1934         if (h1->io_timestamp > h2->io_timestamp)
1935                 return (1);
1936 
1937         if (h1->io_lblkno < h2->io_lblkno)
1938                 return (-1);
1939         if (h1->io_lblkno > h2->io_lblkno)
1940                 return (1);
1941 
1942         if (h1 < h2)
1943                 return (-1);
1944         if (h1 > h2)
1945                 return (1);
1946 
1947         return (0);
1948 }
1949 
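     /*
      * AVL comparator for the read tree: order by logical block number so
      * that hsched_invoke_strategy() can walk requests in ascending block
      * order (C-LOOK); comparing the structure addresses again guarantees
      * a total order.
      */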
1950 static int
1951 hsched_offset_compare(const void *x1, const void *x2)
1952 {
1953         const struct hio *h1 = x1;
1954         const struct hio *h2 = x2;
1955 
1956         if (h1->io_lblkno < h2->io_lblkno)
1957                 return (-1);
1958         if (h1->io_lblkno > h2->io_lblkno)
1959                 return (1);
1960 
1961         if (h1 < h2)
1962                 return (-1);
1963         if (h1 > h2)
1964                 return (1);
1965 
1966         return (0);
1967 }
1968 
1969 void
1970 hsched_init_caches(void)
1971 {
1972         hio_cache = kmem_cache_create("hsfs_hio_cache",
1973             sizeof (struct hio), 0, NULL,
1974             NULL, NULL, NULL, NULL, 0);
1975 
1976         hio_info_cache = kmem_cache_create("hsfs_hio_info_cache",
1977             sizeof (struct hio_info), 0, NULL,
1978             NULL, NULL, NULL, NULL, 0);
1979 }
1980 
1981 void
1982 hsched_fini_caches(void)
1983 {
1984         kmem_cache_destroy(hio_cache);
1985         kmem_cache_destroy(hio_info_cache);
1986 }
1987 
1988 /*
1989  * Initialize I/O scheduling structures. This is called via hsfs_mount.
1990  */
1991 void
1992 hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage)
1993 {
1994         struct hsfs_queue *hqueue = fsp->hqueue;
1995         struct vnode *vp = fsp->hsfs_devvp;
1996 
1997         /* TaskQ name of the form: hsched_task_ + stringof(int) */
1998         char namebuf[23];
1999         int error, err;
2000         struct dk_cinfo info;
2001         ldi_handle_t lh;
2002         ldi_ident_t li;
2003 
2004         /*
2005          * Default maxtransfer = 16k chunk
2006          */
2007         hqueue->dev_maxtransfer = 16384;
2008 
2009         /*
2010          * Try to fetch the maximum device transfer size. This is used to
2011          * ensure that a coalesced block does not exceed the maxtransfer.
2012          */
2013         err  = ldi_ident_from_mod(modlinkage, &li);
2014         if (err) {
2015                 cmn_err(CE_NOTE, "hsched_init: Querying device failed");
2016                 cmn_err(CE_NOTE, "hsched_init: ldi_ident_from_mod err=%d\n",
2017                     err);
2018                 goto set_ra;
2019         }
2020 
2021         err = ldi_open_by_dev(&(vp->v_rdev), OTYP_CHR, FREAD, CRED(), &lh, li);
2022         ldi_ident_release(li);
2023         if (err) {
2024                 cmn_err(CE_NOTE, "hsched_init: Querying device failed");
2025                 cmn_err(CE_NOTE, "hsched_init: ldi_open err=%d\n", err);
2026                 goto set_ra;
2027         }
2028 
2029         error = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&info, FKIOCTL,
2030             CRED(), &err);
2031         err = ldi_close(lh, FREAD, CRED());
2032         if (err) {
2033                 cmn_err(CE_NOTE, "hsched_init: Querying device failed");
2034                 cmn_err(CE_NOTE, "hsched_init: ldi_close err=%d\n", err);
2035         }
2036 
2037         if (error == 0) {
2038                 hqueue->dev_maxtransfer = ldbtob(info.dki_maxtransfer);
2039         }
2040 
2041 set_ra:
2042         /*
2043          * Max size of data to read ahead for a sequential access pattern.
2044          * This is conservative, to avoid letting the underlying CD drive spin
2045          * down in case the application is reading slowly.
2046          * We read ahead up to a max of 8 pages (PAGESIZE * 8).
2047          */
2048         hqueue->max_ra_bytes = PAGESIZE * 8;
2049 
2050         mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL);
2051         mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL);
2052         avl_create(&(hqueue->read_tree), hsched_offset_compare,
2053             sizeof (struct hio), offsetof(struct hio, io_offset_node));
2054         avl_create(&(hqueue->deadline_tree), hsched_deadline_compare,
2055             sizeof (struct hio), offsetof(struct hio, io_deadline_node));
2056 
2057         (void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid);
2058         hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads,
2059             minclsyspri + 2, 1, 104857600 / PAGESIZE, TASKQ_DYNAMIC);
2060 
2061         hqueue->next = NULL;
2062         hqueue->nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
2063 }
2064 
2065 void
2066 hsched_fini(struct hsfs_queue *hqueue)
2067 {
2068         if (hqueue != NULL) {
2069                 /*
2070                  * Remove the sentinel if there was one.
2071                  */
2072                 if (hqueue->next != NULL) {
2073                         avl_remove(&hqueue->read_tree, hqueue->next);
2074                         kmem_cache_free(hio_cache, hqueue->next);
2075                 }
2076                 avl_destroy(&(hqueue->read_tree));
2077                 avl_destroy(&(hqueue->deadline_tree));
2078                 mutex_destroy(&(hqueue->hsfs_queue_lock));
2079                 mutex_destroy(&(hqueue->strategy_lock));
2080 
2081                 /*
2082                  * If there are any existing readahead threads running
2083                  * taskq_destroy will wait for them to finish.
2084                  */
2085                 taskq_destroy(hqueue->ra_task);
2086                 kmem_free(hqueue->nbuf, sizeof (struct buf));
2087         }
2088 }
2089 
2090 /*
2091  * Determine if two I/O requests are adjacent to each other so
2092  * that they can be coalesced.
2093  */
2094 #define IS_ADJACENT(io, nio) \
2095         (((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \
2096         (io)->bp->b_edev == (nio)->bp->b_edev)
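     /*
      * For example (hypothetical values): a request at io_lblkno 100 spanning
      * nblocks 8 is adjacent to one starting at io_lblkno 108 on the same
      * device, so the two can be chained and issued as one larger transfer.
      */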
2097 
2098 /*
2099  * This performs the actual I/O scheduling logic. We use the Circular
2100  * Look algorithm here. Sort the I/O requests in ascending order of
2101  * logical block number and process them starting with the lowest
2102  * numbered block and progressing towards higher block numbers in the
2103  * queue. Once there are no more higher numbered blocks, start again
2104  * with the lowest one. This is good for CD/DVD media since the head
2105  * keeps moving in one direction along the outward spiral track,
2106  * avoiding seeks as much as possible. The re-ordering also allows
2107  * us to coalesce adjacent requests into one larger request.
2108  * This is thus essentially a 1-way Elevator with front merging.
2109  *
2110  * In addition each read request here has a deadline and will be
2111  * processed out of turn if the deadline (500ms) expires.
2112  *
2113  * This function is necessarily serialized via hqueue->strategy_lock.
2114  * This function sits just below hsfs_getapage and processes all read
2115  * requests originating from that function.
2116  */
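     /*
      * A hypothetical walk of the queue: with requests pending at blocks 40,
      * 10 and 55, and block 35 just serviced, C-LOOK issues 40, then 55, and
      * only then wraps around to 10; an expired deadline (HSFS_READ_DEADLINE)
      * on the request at 10 would instead pull it to the front immediately.
      */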
2117 int
2118 hsched_invoke_strategy(struct hsfs *fsp)
2119 {
2120         struct hsfs_queue *hqueue;
2121         struct buf *nbuf;
2122         struct hio *fio, *nio, *tio, *prev, *last;
2123         size_t bsize, soffset, offset, data;
2124         int bioret, bufcount;
2125         struct vnode *fvp;
2126         ksema_t *io_done;
2127         caddr_t iodata;
2128 
2129         hqueue = fsp->hqueue;
2130         mutex_enter(&hqueue->strategy_lock);
2131         mutex_enter(&hqueue->hsfs_queue_lock);
2132 
2133         /*
2134          * Check for Deadline expiration first
2135          */
2136         fio = avl_first(&hqueue->deadline_tree);
2137 
2138         /*
2139          * Paranoid check for an empty I/O queue. The deadline
2140          * and read trees contain the same data sorted in different
2141          * ways, so an empty deadline tree implies an empty read tree.
2142          */
2143         if (fio == NULL) {
2144                 /*
2145                  * Remove the sentinel if there was one.
2146                  */
2147                 if (hqueue->next != NULL) {
2148                         avl_remove(&hqueue->read_tree, hqueue->next);
2149                         kmem_cache_free(hio_cache, hqueue->next);
2150                         hqueue->next = NULL;
2151                 }
2152                 mutex_exit(&hqueue->hsfs_queue_lock);
2153                 mutex_exit(&hqueue->strategy_lock);
2154                 return (1);
2155         }
2156 
2157         if (drv_hztousec(ddi_get_lbolt()) - fio->io_timestamp
2158             < HSFS_READ_DEADLINE) {
2159                 /*
2160                  * Apply standard scheduling logic. This uses the
2161                  * C-LOOK approach. Process I/O requests in ascending
2162                  * order of logical block address till no subsequent
2163                  * higher numbered block request remains. Then start
2164                  * again from the lowest numbered block in the queue.
2165                  *
2166                  * We do this cheaply here by means of a sentinel.
2167                  * The last processed I/O structure from the previous
2168                  * invocation of this function is left dangling in the
2169                  * read_tree so that we can easily scan to the next
2170                  * higher numbered request and remove the sentinel.
2171                  */
2172                 fio = NULL;
2173                 if (hqueue->next != NULL) {
2174                         fio = AVL_NEXT(&hqueue->read_tree, hqueue->next);
2175                         avl_remove(&hqueue->read_tree, hqueue->next);
2176                         kmem_cache_free(hio_cache, hqueue->next);
2177                         hqueue->next = NULL;
2178                 }
2179                 if (fio == NULL) {
2180                         fio = avl_first(&hqueue->read_tree);
2181                 }
2182         } else if (hqueue->next != NULL) {
2183                 DTRACE_PROBE1(hsfs_deadline_expiry, struct hio *, fio);
2184 
2185                 avl_remove(&hqueue->read_tree, hqueue->next);
2186                 kmem_cache_free(hio_cache, hqueue->next);
2187                 hqueue->next = NULL;
2188         }
2189 
2190         /*
2191          * In addition we try to coalesce contiguous
2192          * requests into one bigger request.
2193          */
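     	/*
     	 * Hypothetical example: three adjacent 8K requests (16 512-byte
     	 * blocks each) at lblkno 100, 116 and 132 are linked one after
     	 * another via contig_chain, as long as the accumulated size stays
     	 * below dev_maxtransfer, and are later issued as a single larger
     	 * transfer once bufcount exceeds hsched_coalesce_min.
     	 */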
2194         bufcount = 1;
2195         bsize = ldbtob(fio->nblocks);
2196         fvp = fio->bp->b_file;
2197         nio = AVL_NEXT(&hqueue->read_tree, fio);
2198         tio = fio;
2199         while (nio != NULL && IS_ADJACENT(tio, nio) &&
2200             bsize < hqueue->dev_maxtransfer) {
2201                 avl_remove(&hqueue->deadline_tree, tio);
2202                 avl_remove(&hqueue->read_tree, tio);
2203                 tio->contig_chain = nio;
2204                 bsize += ldbtob(nio->nblocks);
2205                 prev = tio;
2206                 tio = nio;
2207 
2208                 /*
2209                  * This check is required to detect the case where
2210                  * we are merging adjacent buffers belonging to
2211                  * different files. fvp is used to set the b_file
2212                  * parameter in the coalesced buf. b_file is used
2213                  * by DTrace, so we do not want DTrace to attribute
2214                  * requests for two different files to any one file.
2215                  */
2216                 if (fvp && tio->bp->b_file != fvp) {
2217                         fvp = NULL;
2218                 }
2219 
2220                 nio = AVL_NEXT(&hqueue->read_tree, nio);
2221                 bufcount++;
2222         }
2223 
2224         /*
2225          * tio is not removed from the read_tree as it serves as a sentinel
2226          * to cheaply allow us to scan to the next higher numbered I/O
2227          * request.
2228          */
2229         hqueue->next = tio;
2230         avl_remove(&hqueue->deadline_tree, tio);
2231         mutex_exit(&hqueue->hsfs_queue_lock);
2232         DTRACE_PROBE3(hsfs_io_dequeued, struct hio *, fio, int, bufcount,
2233             size_t, bsize);
2234 
2235         /*
2236  * The benefit of coalescing occurs if the savings in I/O outweigh
2237          * the cost of doing the additional work below.
2238          * It was observed that coalescing 2 buffers results in diminishing
2239          * returns, so we do coalescing if we have >2 adjacent bufs.
2240          */
2241         if (bufcount > hsched_coalesce_min) {
2242                 /*
2243                  * We have coalesced blocks. First allocate mem and buf for
2244                  * the entire coalesced chunk.
2245                  * Since we are guaranteed single-threaded here we pre-allocate
2246                  * one buf at mount time and that is re-used every time. This
2247                  * is a synthesized buf structure that uses a kmem_alloc'ed chunk,
2248                  * not quite a normal buf attached to pages.
2249                  */
2250                 fsp->coalesced_bytes += bsize;
2251                 nbuf = hqueue->nbuf;
2252                 bioinit(nbuf);
2253                 nbuf->b_edev = fio->bp->b_edev;
2254                 nbuf->b_dev = fio->bp->b_dev;
2255                 nbuf->b_flags = fio->bp->b_flags;
2256                 nbuf->b_iodone = fio->bp->b_iodone;
2257                 iodata = kmem_alloc(bsize, KM_SLEEP);
2258                 nbuf->b_un.b_addr = iodata;
2259                 nbuf->b_lblkno = fio->bp->b_lblkno;
2260                 nbuf->b_vp = fvp;
2261                 nbuf->b_file = fvp;
2262                 nbuf->b_bcount = bsize;
2263                 nbuf->b_bufsize = bsize;
2264 
2265                 DTRACE_PROBE3(hsfs_coalesced_io_start, struct hio *, fio, int,
2266                     bufcount, size_t, bsize);
2267 
2268                 /*
2269                  * Perform I/O for the coalesced block.
2270                  */
2271                 (void) bdev_strategy(nbuf);
2272 
2273                 /*
2274                  * Duplicate the last IO node to leave the sentinel alone.
2275                  * The sentinel is freed in the next invocation of this
2276                  * function.
2277                  */
2278                 prev->contig_chain = kmem_cache_alloc(hio_cache, KM_SLEEP);
2279                 prev->contig_chain->bp = tio->bp;
2280                 prev->contig_chain->sema = tio->sema;
2281                 tio = prev->contig_chain;
2282                 tio->contig_chain = NULL;
2283                 soffset = ldbtob(fio->bp->b_lblkno);
2284                 nio = fio;
2285 
2286                 bioret = biowait(nbuf);
2287                 data = bsize - nbuf->b_resid;
2288                 biofini(nbuf);
2289                 mutex_exit(&hqueue->strategy_lock);
2290 
2291                 /*
2292                  * We use the b_resid parameter to detect how much
2293                  * data was successfully transferred. We will signal
2294                  * success to all the actual (pre-coalescing) bufs that
2295                  * were fully retrieved; the rest are signaled as errors,
2296                  * if any.
2297                  */
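                 /*
                  * For instance (hypothetical sizes): if bsize is 32K and the
                  * device reports b_resid of 8K, then data is 24K; the bufs
                  * wholly contained in the first 24K complete successfully
                  * below, and the remainder are zero-filled past the
                  * transferred data and completed with the error, if any.
                  */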
2298                 tio = nio;
2299                 DTRACE_PROBE3(hsfs_coalesced_io_done, struct hio *, nio,
2300                     int, bioret, size_t, data);
2301 
2302                 /*
2303                  * Copy data and signal success to all the bufs
2304                  * which can be fully satisfied from b_resid.
2305                  */
2306                 while (nio != NULL && data >= nio->bp->b_bcount) {
2307                         offset = ldbtob(nio->bp->b_lblkno) - soffset;
2308                         bcopy(iodata + offset, nio->bp->b_un.b_addr,
2309                             nio->bp->b_bcount);
2310                         data -= nio->bp->b_bcount;
2311                         bioerror(nio->bp, 0);
2312                         biodone(nio->bp);
2313                         sema_v(nio->sema);
2314                         tio = nio;
2315                         nio = nio->contig_chain;
2316                         kmem_cache_free(hio_cache, tio);
2317                 }
2318 
2319                 /*
2320                  * Signal error to all the leftover bufs (if any)
2321                  * after the transferred data (bsize - b_resid) is exhausted.
2322                  */
2323                 while (nio != NULL) {
2324                         nio->bp->b_resid = nio->bp->b_bcount - data;
2325                         bzero(nio->bp->b_un.b_addr + data, nio->bp->b_resid);
2326                         bioerror(nio->bp, bioret);
2327                         biodone(nio->bp);
2328                         sema_v(nio->sema);
2329                         tio = nio;
2330                         nio = nio->contig_chain;
2331                         kmem_cache_free(hio_cache, tio);
2332                         data = 0;
2333                 }
2334                 kmem_free(iodata, bsize);
2335         } else {
2336 
2337                 nbuf = tio->bp;
2338                 io_done = tio->sema;
2339                 nio = fio;
2340                 last = tio;
2341 
2342                 while (nio != NULL) {
2343                         (void) bdev_strategy(nio->bp);
2344                         nio = nio->contig_chain;
2345                 }
2346                 nio = fio;
2347                 mutex_exit(&hqueue->strategy_lock);
2348 
2349                 while (nio != NULL) {
2350                         if (nio == last) {
2351                                 (void) biowait(nbuf);
2352                                 sema_v(io_done);
2353                                 break;
2354                                 /* sentinel last not freed. See above. */
2355                         } else {
2356                                 (void) biowait(nio->bp);
2357                                 sema_v(nio->sema);
2358                         }
2359                         tio = nio;
2360                         nio = nio->contig_chain;
2361                         kmem_cache_free(hio_cache, tio);
2362                 }
2363         }
2364         return (0);
2365 }
2366 
2367 /*
2368  * Insert an I/O request into the I/O scheduler's pipeline.
2369  * Using an AVL tree makes it easy to reorder the I/O requests
2370  * based on logical block number.
2371  */
2372 static void
2373 hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra)
2374 {
2375         struct hsfs_queue *hqueue = fsp->hqueue;
2376 
2377         mutex_enter(&hqueue->hsfs_queue_lock);
2378 
2379         fsp->physical_read_bytes += hsio->bp->b_bcount;
2380         if (ra)
2381                 fsp->readahead_bytes += hsio->bp->b_bcount;
2382 
2383         avl_add(&hqueue->deadline_tree, hsio);
2384         avl_add(&hqueue->read_tree, hsio);
2385 
2386         DTRACE_PROBE3(hsfs_io_enqueued, struct hio *, hsio,
2387             struct hsfs_queue *, hqueue, int, ra);
2388 
2389         mutex_exit(&hqueue->hsfs_queue_lock);
2390 }
2391 
2392 /* ARGSUSED */
2393 static int
2394 hsfs_pathconf(struct vnode *vp,
2395         int cmd,
2396         ulong_t *valp,
2397         struct cred *cr,
2398         caller_context_t *ct)
2399 {
2400         struct hsfs     *fsp;
2401 
2402         int             error = 0;
2403 
2404         switch (cmd) {
2405 
2406         case _PC_NAME_MAX:
2407                 fsp = VFS_TO_HSFS(vp->v_vfsp);
2408                 *valp = fsp->hsfs_namemax;
2409                 break;
2410 
2411         case _PC_FILESIZEBITS:
2412                 *valp = 33;     /* Without multi extent support: 4 GB - 2k */
2413                 break;
2414 
2415         case _PC_TIMESTAMP_RESOLUTION:
2416                 /*
2417                  * HSFS keeps, at best, 1/100 second timestamp resolution.
2418                  */
2419                 *valp = 10000000L;
2420                 break;
2421 
2422         default:
2423                 error = fs_pathconf(vp, cmd, valp, cr, ct);
2424                 break;
2425         }
2426 
2427         return (error);
2428 }
2429 
2430 
2431 
2432 const fs_operation_def_t hsfs_vnodeops_template[] = {
2433         VOPNAME_OPEN,           { .vop_open = hsfs_open },
2434         VOPNAME_CLOSE,          { .vop_close = hsfs_close },
2435         VOPNAME_READ,           { .vop_read = hsfs_read },
2436         VOPNAME_GETATTR,        { .vop_getattr = hsfs_getattr },
2437         VOPNAME_ACCESS,         { .vop_access = hsfs_access },
2438         VOPNAME_LOOKUP,         { .vop_lookup = hsfs_lookup },
2439         VOPNAME_READDIR,        { .vop_readdir = hsfs_readdir },
2440         VOPNAME_READLINK,       { .vop_readlink = hsfs_readlink },
2441         VOPNAME_FSYNC,          { .vop_fsync = hsfs_fsync },
2442         VOPNAME_INACTIVE,       { .vop_inactive = hsfs_inactive },
2443         VOPNAME_FID,            { .vop_fid = hsfs_fid },
2444         VOPNAME_SEEK,           { .vop_seek = hsfs_seek },
2445         VOPNAME_FRLOCK,         { .vop_frlock = hsfs_frlock },
2446         VOPNAME_GETPAGE,        { .vop_getpage = hsfs_getpage },
2447         VOPNAME_PUTPAGE,        { .vop_putpage = hsfs_putpage },
2448         VOPNAME_MAP,            { .vop_map = hsfs_map },
2449         VOPNAME_ADDMAP,         { .vop_addmap = hsfs_addmap },
2450         VOPNAME_DELMAP,         { .vop_delmap = hsfs_delmap },
2451         VOPNAME_PATHCONF,       { .vop_pathconf = hsfs_pathconf },
2452         NULL,                   NULL
2453 };
2454 
2455 struct vnodeops *hsfs_vnodeops;