/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/buf.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/vmsystm.h>
#include <sys/open.h>
#include <sys/swap.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/zmod.h>
#include <sys/fs/decomp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>

#include <fs/fs_subr.h>

/*
 * dcfs - A filesystem for automatic decompression of fiocompressed files
 *
 * This filesystem is a layered filesystem that sits on top of a normal
 * persistent filesystem and provides automatic decompression of files
 * that have been previously compressed and stored on the host filesystem.
 * It is a pseudo filesystem in that it does not persist data; rather, it
 * intercepts file lookup requests on the host filesystem and provides
 * transparent decompression of those files. Currently the only supported
 * host filesystem is ufs.
 *
 * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
 * and marked by fiocompress as a compressed file via a flag in the on-disk
 * inode (set via the _FIO_COMPRESSED ufs ioctl() - see ufs_ioctl() in
 * ufs_vnops.c). ufs_lookup() checks for this flag and, if it is set, passes
 * control to decompvp(), a function defined in this (dcfs) filesystem.
 * decompvp() uncompresses the file and returns a dcfs vnode to the VFS layer.
 *
 * dcfs is layered on top of ufs and passes requests involving persistence
 * to the underlying ufs filesystem. The compressed files currently cannot
 * be written to.
 */
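
/*
 * For reference, the on-disk header consumed here is struct comphdr from
 * sys/fs/decomp.h.  The sketch below is an editorial paraphrase based on
 * the fields this file actually uses; consult decomp.h for the
 * authoritative definition:
 *
 *      struct comphdr {
 *              uint64_t ch_magic;        magic number (CH_MAGIC_ZLIB)
 *              uint64_t ch_version;      header version (CH_VERSION)
 *              uint64_t ch_algorithm;    compression algorithm (CH_ALG_ZLIB)
 *              uint64_t ch_fsize;        uncompressed file size
 *              uint64_t ch_blksize;      uncompressed block size, power of 2
 *              uint64_t ch_blkmap[];     offsets of the compressed blocks
 *      };
 *
 * ch_blkmap[i] holds the underlying-file offset of compressed block i, so
 * block i occupies bytes [ch_blkmap[i], ch_blkmap[i + 1]) of the host file;
 * decompvp() below appends one extra entry so this also holds for the
 * last block.
 */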


/*
 * Define data structures within this file.
 */
#define DCSHFT          5
#define DCTABLESIZE     16

#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
#define DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif

#define DCLRUSIZE       16

#define DCCACHESIZE     4

#define rounddown(x, y) ((x) & ~((y) - 1))
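
/*
 * Worked example of the two macros above (illustrative addresses only):
 * with DCSHFT == 5 and DCTABLESIZE == 16, a vnode at 0xffffff01234567c0
 * hashes to bucket (0xffffff01234567c0 >> 5) & 15 == 0xe; shifting out
 * the low DCSHFT bits keeps nearby heap addresses from clustering in a
 * single bucket.  Likewise rounddown(0x2300, 0x2000) == 0x2000; note that
 * y must be a power of two for the mask in rounddown() to be correct.
 */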

struct dcnode   *dctable[DCTABLESIZE];

struct dcnode   *dclru;
static int      dclru_len;

kmutex_t        dctable_lock;

dev_t           dcdev;
struct vfs      dc_vfs;

struct kmem_cache *dcnode_cache;
struct kmem_cache *dcbuf_cache[DCCACHESIZE];

kmutex_t        dccache_lock;

static int dcinit(int, char *);

static struct dcnode    *dcnode_alloc(void);
static void             dcnode_free(struct dcnode *);
static void             dcnode_recycle(struct dcnode *);

static void             dcinsert(struct dcnode *);
static void             dcdelete(struct dcnode *);
static struct dcnode    *dcfind(struct vnode *);
static void             dclru_add(struct dcnode *);
static void             dclru_sub(struct dcnode *);


/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

struct vfsops *dc_vfsops;

static vfsdef_t vfw = {
        VFSDEF_VERSION,
        "dcfs",
        dcinit,
        VSW_ZMOUNT,
        NULL
};

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_fsops;

static struct modlfs modlfs = {
        &mod_fsops, "compressed filesystem", &vfw
};

static struct modlinkage modlinkage = {
        MODREV_1, (void *)&modlfs, NULL
};

int
_init()
{
        return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}


static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
static int dc_close(struct vnode *, int, int, offset_t,
    struct cred *, caller_context_t *);
static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    caller_context_t *);
static int dc_getattr(struct vnode *, struct vattr *, int,
    struct cred *, caller_context_t *);
static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    caller_context_t *);
static int dc_access(struct vnode *, int, int,
    struct cred *, caller_context_t *);
static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *, caller_context_t *);
static int dc_realvp(struct vnode *, struct vnode **, caller_context_t *);
static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    struct cred *, caller_context_t *);
static int dc_putpage(struct vnode *, offset_t, size_t, int,
    struct cred *, caller_context_t *);
static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *, caller_context_t *);

struct vnodeops *dc_vnodeops;

const fs_operation_def_t dc_vnodeops_template[] = {
        VOPNAME_OPEN,                   { .vop_open = dc_open },
        VOPNAME_CLOSE,                  { .vop_close = dc_close },
        VOPNAME_READ,                   { .vop_read = dc_read },
        VOPNAME_GETATTR,                { .vop_getattr = dc_getattr },
        VOPNAME_SETATTR,                { .vop_setattr = dc_setattr },
        VOPNAME_ACCESS,                 { .vop_access = dc_access },
        VOPNAME_FSYNC,                  { .vop_fsync = dc_fsync },
        VOPNAME_INACTIVE,               { .vop_inactive = dc_inactive },
        VOPNAME_FID,                    { .vop_fid = dc_fid },
        VOPNAME_SEEK,                   { .vop_seek = dc_seek },
        VOPNAME_FRLOCK,                 { .vop_frlock = dc_frlock },
        VOPNAME_REALVP,                 { .vop_realvp = dc_realvp },
        VOPNAME_GETPAGE,                { .vop_getpage = dc_getpage },
        VOPNAME_PUTPAGE,                { .vop_putpage = dc_putpage },
        VOPNAME_MAP,                    { .vop_map = dc_map },
        VOPNAME_ADDMAP,                 { .vop_addmap = dc_addmap },
        VOPNAME_DELMAP,                 { .vop_delmap = dc_delmap },
        NULL,                           NULL
};
 243 
 244 /*ARGSUSED*/
 245 static int
 246 dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
 247 {
 248         return (0);
 249 }
 250 
 251 /*ARGSUSED*/
 252 static int
 253 dc_close(struct vnode *vp, int flag, int count, offset_t off,
 254     struct cred *cr, caller_context_t *ctp)
 255 {
 256         (void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 257         cleanshares(vp, ttoproc(curthread)->p_pid);
 258         return (0);
 259 }
 260 
 261 /*ARGSUSED*/
 262 static int
 263 dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
 264         struct caller_context *ct)
 265 {
 266         struct dcnode *dp = VTODC(vp);
 267         size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
 268         size_t fsize = dp->dc_hdr->ch_fsize;
 269         int error;
 270 
 271         /*
 272          * Loop through file with segmap, decompression will occur
 273          * in dc_getapage
 274          */
 275         do {
 276                 caddr_t base;
 277                 size_t n;
 278                 offset_t mapon;
 279 
 280                 /*
 281                  * read to end of block or file
 282                  */
 283                 mapon = uiop->uio_loffset & (rdsize - 1);
 284                 n = MIN(rdsize - mapon, uiop->uio_resid);
 285                 n = MIN(n, fsize - uiop->uio_loffset);
 286                 if (n == 0)
 287                         return (0);     /* at EOF */
 288 
 289                 base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
 290                     S_READ);
 291                 error = uiomove(base + mapon, n, UIO_READ, uiop);
 292                 if (!error) {
 293                         uint_t flags;
 294 
 295                         if (n + mapon == rdsize || uiop->uio_loffset == fsize)
 296                                 flags = SM_DONTNEED;
 297                         else
 298                                 flags = 0;
 299                         error = segmap_release(segkmap, base, flags);
 300                 } else
 301                         (void) segmap_release(segkmap, base, 0);
 302         } while (!error && uiop->uio_resid);
 303 
 304         return (error);
 305 }
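
/*
 * Worked example of the windowing arithmetic in dc_read() (illustrative
 * numbers, assuming rdsize == 8192): a read at uio_loffset == 5000 with
 * uio_resid == 16384 gives mapon == 5000 & 8191 == 5000 and
 * n == MIN(8192 - 5000, 16384) == 3192, so the first uiomove() copies the
 * tail of the first window and later iterations proceed block-aligned.
 * SM_DONTNEED is passed once a window has been fully consumed, since a
 * sequential reader will not revisit it.
 */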

static int
dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
    cred_t *cred, caller_context_t *ctp)
{
        struct dcnode *dp = VTODC(vp);
        struct vnode *subvp = dp->dc_subvp;
        int error;

        error = VOP_GETATTR(subvp, vap, flags, cred, ctp);

        /* substitute uncompressed size */
        vap->va_size = dp->dc_hdr->ch_fsize;
        return (error);
}

static int
dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
    caller_context_t *ctp)
{
        struct dcnode *dp = VTODC(vp);
        struct vnode *subvp = dp->dc_subvp;

        return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
}

static int
dc_access(struct vnode *vp, int mode, int flags,
    cred_t *cred, caller_context_t *ctp)
{
        struct dcnode *dp = VTODC(vp);
        struct vnode *subvp = dp->dc_subvp;

        return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
}

/*ARGSUSED*/
static int
dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
{
        return (0);
}

/*ARGSUSED*/
static void
dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
{
        struct dcnode *dp = VTODC(vp);

        mutex_enter(&dctable_lock);
        mutex_enter(&vp->v_lock);
        ASSERT(vp->v_count >= 1);
        if (--vp->v_count != 0) {
                /*
                 * Somebody accessed the dcnode before we got a chance to
                 * remove it.  They will remove it when they do a vn_rele.
                 */
                mutex_exit(&vp->v_lock);
                mutex_exit(&dctable_lock);
                return;
        }
        mutex_exit(&vp->v_lock);

        dcnode_free(dp);

        mutex_exit(&dctable_lock);
}

static int
dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
{
        struct dcnode *dp = VTODC(vp);
        struct vnode *subvp = dp->dc_subvp;

        return (VOP_FID(subvp, fidp, ctp));
}

static int
dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
{
        struct dcnode *dp = VTODC(vp);
        struct vnode *subvp = dp->dc_subvp;

        return (VOP_SEEK(subvp, oof, noffp, ctp));
}

static int
dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp,
    cred_t *cr, caller_context_t *ctp)
{
        struct dcnode *dp = VTODC(vp);
        int error;
        struct vattr vattr;

        /*
         * If file is being mapped, disallow frlock.
         */
        vattr.va_mask = AT_MODE;
        if (error = VOP_GETATTR(dp->dc_subvp, &vattr, 0, cr, ctp))
                return (error);
        if (dp->dc_mapcnt > 0 && MANDLOCK(vp, vattr.va_mode))
                return (EAGAIN);

        return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
}

/*ARGSUSED*/
static int
dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
        struct dcnode *dp = VTODC(vp);
        struct comphdr *hdr = dp->dc_hdr;
        struct page *pp;
        struct buf *bp;
        caddr_t saddr;
        off_t cblkno;
        size_t rdoff, rdsize, dsize;
        long xlen;
        int error, zerr;

        ASSERT(len == hdr->ch_blksize);
        /*
         * Get destination pages and make them addressable
         */
        pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
        bp = pageio_setup(pp, len, vp, B_READ);
        bp_mapin(bp);

        /*
         * read compressed data from subordinate vnode
         */
        saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
        cblkno = off / len;
        rdoff = hdr->ch_blkmap[cblkno];
        rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
        error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
            UIO_SYSSPACE, 0, 0, cr, NULL);
        if (error)
                goto cleanup;

        /*
         * Uncompress
         */
        dsize = len;
        zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
        if (zerr != Z_OK) {
                error = EIO;
                goto cleanup;
        }

        /*
         * Handle EOF
         */
        xlen = hdr->ch_fsize - off;
        if (xlen < len) {
                bzero(bp->b_un.b_addr + xlen, len - xlen);
                if (dsize != xlen)
                        error = EIO;
        } else if (dsize != len)
                error = EIO;

        /*
         * Clean up
         */
cleanup:
        kmem_cache_free(dp->dc_bufcache, saddr);
        pageio_done(bp);
        *ppp = pp;
        return (error);
}
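
/*
 * Illustrative blkmap arithmetic for dc_getblock_miss() (made-up numbers):
 * with ch_blksize == 8192, the block covering file offset 16384 is
 * cblkno == 16384 / 8192 == 2, stored compressed at underlying-file offsets
 * [ch_blkmap[2], ch_blkmap[3]); vn_rdwr() fetches exactly that span and
 * z_uncompress() expands it back to at most one full block.
 */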

static int
dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
        struct page *pp, *plist = NULL;
        offset_t pgoff;
        int rdblk;

        /*
         * pvn_read_kluster() doesn't quite do what we want, since it
         * thinks sub block reads are ok.  Here we always decompress
         * a full block.
         */

        /*
         * Check page cache
         */
        rdblk = 0;
        for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
                pp = page_lookup(vp, pgoff, SE_EXCL);
                if (pp == NULL) {
                        rdblk = 1;
                        break;
                }
                page_io_lock(pp);
                page_add(&plist, pp);
                plist = plist->p_next;
        }
        if (!rdblk) {
                *ppp = plist;
                return (0);     /* all pages in cache */
        }

        /*
         * Undo any locks so getblock_miss has an open field
         */
        if (plist != NULL)
                pvn_io_done(plist);

        return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
}

static int
dc_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
{
        struct vnode *rvp;

        vp = VTODC(vp)->dc_subvp;
        if (VOP_REALVP(vp, &rvp, ct) == 0)
                vp = rvp;
        *vpp = vp;
        return (0);
}

/*ARGSUSED10*/
static int
dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
{
        struct dcnode *dp = VTODC(vp);
        struct comphdr *hdr = dp->dc_hdr;
        struct page *pp, *plist = NULL;
        caddr_t vp_baddr;
        offset_t vp_boff, vp_bend;
        size_t bsize = hdr->ch_blksize;
        int nblks, error;

        /* does not support write */
        if (rw == S_WRITE) {
                panic("write attempt on compressed file");
                /*NOTREACHED*/
        }

        if (protp)
                *protp = PROT_ALL;
        /*
         * We don't support asynchronous operation at the moment, so
         * just pretend we did it.  If the pages are ever actually
         * needed, they'll get brought in then.
         */
        if (pl == NULL)
                return (0);

        /*
         * Calc block start and end offsets
         */
        vp_boff = rounddown(off, bsize);
        vp_bend = roundup(off + len, bsize);
        vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);

        /*
         * Fetch each block; stop at the first failure so a later
         * success cannot mask an earlier error.
         */
        nblks = (vp_bend - vp_boff) / bsize;
        error = 0;
        while (nblks--) {
                error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
                    rw, cr);
                page_list_concat(&plist, &pp);
                if (error)
                        break;
                vp_boff += bsize;
                vp_baddr += bsize;
        }
        if (!error)
                pvn_plist_init(plist, pl, plsz, off, len, rw);
        else
                pvn_read_done(plist, B_ERROR);
        return (error);
}

/*
 * This function should never be called. It exists only to be passed
 * as an argument to pvn_vplist_dirty().
 */
/*ARGSUSED*/
static int
dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
    int flags, struct cred *cr)
{
        /* should never happen */
        cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
        /*NOTREACHED*/
        return (0);
}


/*
 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
 * B_INVAL is set by:
 *
 *      1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
 *      2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
 *         which translates to an MC_SYNC with the MS_INVALIDATE flag.
 *
 * The B_FREE (as well as the B_DONTNEED) flag is set when the
 * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
 * from SEGVN to release pages behind a pagefault.
 */
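
/*
 * For context, a userland sequence that reaches dc_putpage() with B_INVAL
 * set might look like the following (illustrative userland code, not part
 * of this module):
 *
 *      void *p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
 *      (void) memcntl(p, len, MC_SYNC, (caddr_t)MS_INVALIDATE, 0, 0);
 *
 * msync(p, len, MS_INVALIDATE) is the equivalent POSIX spelling.
 */
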
/*ARGSUSED5*/
static int
dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ctp)
{
        int error = 0;

        if (vp->v_count == 0) {
                panic("dcfs_putpage: bad v_count");
                /*NOTREACHED*/
        }

        if (vp->v_flag & VNOMAP)
                return (ENOSYS);

        if (!vn_has_cached_data(vp))    /* no pages mapped */
                return (0);

        if (len == 0)           /* from 'off' to EOF */
                error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
        else {
                offset_t io_off;
                se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

                for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
                        page_t *pp;

                        /*
                         * We insist on getting the page only if we are
                         * about to invalidate, free or write it and
                         * the B_ASYNC flag is not set.
                         */
                        if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
                                pp = page_lookup(vp, io_off, se);
                        else
                                pp = page_lookup_nowait(vp, io_off, se);

                        if (pp == NULL)
                                continue;
                        /*
                         * Normally pvn_getdirty() should return 0, which
                         * implies that it has done the job for us.
                         * The shouldn't-happen scenario is when it returns 1.
                         * This means that the page has been modified and
                         * needs to be put back.
                         * Since we can't write to a dcfs compressed file,
                         * we fake a failed I/O and force pvn_write_done()
                         * to destroy the page.
                         */
                        if (pvn_getdirty(pp, flags) == 1) {
                                cmn_err(CE_NOTE, "dc_putpage: dirty page");
                                pvn_write_done(pp, flags |
                                    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
                        }
                }
        }
        return (error);
}

static int
dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ctp)
{
        struct vattr vattr;
        struct segvn_crargs vn_a;
        int error;

        if (vp->v_flag & VNOMAP)
                return (ENOSYS);

        if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
                return (ENXIO);

        /*
         * If file is being locked, disallow mapping.
         */
        if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
                return (error);
        if (vn_has_mandatory_locks(vp, vattr.va_mode))
                return (EAGAIN);

        as_rangelock(as);

        if ((flags & MAP_FIXED) == 0) {
                map_addr(addrp, len, off, 1, flags);
                if (*addrp == NULL) {
                        as_rangeunlock(as);
                        return (ENOMEM);
                }
        } else {
                /*
                 * User specified address - blow away any previous mappings
                 */
                (void) as_unmap(as, *addrp, len);
        }

        vn_a.vp = vp;
        vn_a.offset = off;
        vn_a.type = flags & MAP_TYPE;
        vn_a.prot = prot;
        vn_a.maxprot = maxprot;
        vn_a.flags = flags & ~MAP_TYPE;
        vn_a.cred = cred;
        vn_a.amp = NULL;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;

        error = as_map(as, *addrp, len, segvn_create, &vn_a);
        as_rangeunlock(as);
        return (error);
}

/*ARGSUSED*/
static int
dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
        struct dcnode *dp;

        if (vp->v_flag & VNOMAP)
                return (ENOSYS);

        dp = VTODC(vp);
        mutex_enter(&dp->dc_lock);
        dp->dc_mapcnt += btopr(len);
        mutex_exit(&dp->dc_lock);
        return (0);
}

/*ARGSUSED*/
static int
dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
        struct dcnode *dp;

        if (vp->v_flag & VNOMAP)
                return (ENOSYS);

        dp = VTODC(vp);
        mutex_enter(&dp->dc_lock);
        dp->dc_mapcnt -= btopr(len);
        ASSERT(dp->dc_mapcnt >= 0);
        mutex_exit(&dp->dc_lock);
        return (0);
}

/*
 * Constructor/destructor routines for dcnodes
 */
/*ARGSUSED1*/
static int
dcnode_constructor(void *buf, void *cdrarg, int kmflags)
{
        struct dcnode *dp = buf;
        struct vnode *vp;

        vp = dp->dc_vp = vn_alloc(kmflags);
        if (vp == NULL) {
                return (-1);
        }
        vp->v_data = dp;
        vp->v_type = VREG;
        vp->v_flag = VNOSWAP;
        vp->v_vfsp = &dc_vfs;
        vn_setops(vp, dc_vnodeops);
        vn_exists(vp);

        mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
        dp->dc_mapcnt = 0;
        dp->dc_lrunext = dp->dc_lruprev = NULL;
        dp->dc_hdr = NULL;
        dp->dc_subvp = NULL;
        return (0);
}

/*ARGSUSED*/
static void
dcnode_destructor(void *buf, void *cdrarg)
{
        struct dcnode *dp = buf;
        struct vnode *vp = DCTOV(dp);

        mutex_destroy(&dp->dc_lock);

        VERIFY(dp->dc_hdr == NULL);
        VERIFY(dp->dc_subvp == NULL);
        vn_invalid(vp);
        vn_free(vp);
}

static struct dcnode *
dcnode_alloc(void)
{
        struct dcnode *dp;

        /*
         * If the LRU list is at or above DCLRUSIZE,
         * re-use a dcnode from it
         */
        mutex_enter(&dctable_lock);
        if (dclru_len < DCLRUSIZE) {
                mutex_exit(&dctable_lock);
                dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
        } else {
                ASSERT(dclru != NULL);
                dp = dclru;
                dclru_sub(dp);
                dcdelete(dp);
                mutex_exit(&dctable_lock);
                dcnode_recycle(dp);
        }
        return (dp);
}

static void
dcnode_free(struct dcnode *dp)
{
        struct vnode *vp = DCTOV(dp);

        ASSERT(MUTEX_HELD(&dctable_lock));

        /*
         * If there are no cached pages, there is no need to put it on the LRU
         */
        if (!vn_has_cached_data(vp)) {
                dcdelete(dp);
                dcnode_recycle(dp);
                kmem_cache_free(dcnode_cache, dp);
                return;
        }

        /*
         * Add to the LRU; if that pushes it over the limit, free from the head
         */
        dclru_add(dp);
        if (dclru_len > DCLRUSIZE) {
                dp = dclru;
                dclru_sub(dp);
                dcdelete(dp);
                dcnode_recycle(dp);
                kmem_cache_free(dcnode_cache, dp);
        }
}

static void
dcnode_recycle(struct dcnode *dp)
{
        struct vnode *vp;

        vp = DCTOV(dp);

        VN_RELE(dp->dc_subvp);
        dp->dc_subvp = NULL;
        (void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
        kmem_free(dp->dc_hdr, dp->dc_hdrsize);
        dp->dc_hdr = NULL;
        dp->dc_hdrsize = dp->dc_zmax = 0;
        dp->dc_bufcache = NULL;
        dp->dc_mapcnt = 0;
        vn_reinit(vp);
        vp->v_type = VREG;
        vp->v_flag = VNOSWAP;
        vp->v_vfsp = &dc_vfs;
}

static int
dcinit(int fstype, char *name)
{
        static const fs_operation_def_t dc_vfsops_template[] = {
                NULL, NULL
        };
        int error;
        major_t dev;

        error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
        if (error) {
                cmn_err(CE_WARN, "dcinit: bad vfs ops template");
                return (error);
        }
        VFS_INIT(&dc_vfs, dc_vfsops, NULL);
        dc_vfs.vfs_flag = VFS_RDONLY;
        dc_vfs.vfs_fstype = fstype;
        if ((dev = getudev()) == (major_t)-1)
                dev = 0;
        dcdev = makedevice(dev, 0);
        dc_vfs.vfs_dev = dcdev;

        error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
        if (error != 0) {
                (void) vfs_freevfsops_by_type(fstype);
                cmn_err(CE_WARN, "dcinit: bad vnode ops template");
                return (error);
        }

        mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
        dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
            0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);

        return (0);
}

/*
 * Return a shadow vnode with the given vp as its subordinate
 */
struct vnode *
decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
{
        struct dcnode *dp, *ndp;
        struct comphdr thdr, *hdr;
        struct kmem_cache **cpp;
        struct vattr vattr;
        size_t hdrsize, bsize;
        int error;

        /*
         * See if we have an existing shadow.
         * If none, we have to manufacture one.
         */
        mutex_enter(&dctable_lock);
        dp = dcfind(vp);
        mutex_exit(&dctable_lock);
        if (dp != NULL)
                return (DCTOV(dp));

        /*
         * Make sure it's a valid compressed file
         */
        hdr = &thdr;
        error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
            UIO_SYSSPACE, 0, 0, cred, NULL);
        if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
            hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
            hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
            hdr->ch_blksize > ptob(DCCACHESIZE) || !ISP2(hdr->ch_blksize))
                return (NULL);

        /* get underlying file size */
        if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
                return (NULL);

        /*
         * Re-read the entire header
         */
        hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
        hdr = kmem_alloc(hdrsize, KM_SLEEP);
        error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
            0, 0, cred, NULL);
        if (error) {
                kmem_free(hdr, hdrsize);
                return (NULL);
        }

        /*
         * Add an extra blkmap entry to make dc_getblock()'s
         * life easier
         */
        bsize = hdr->ch_blksize;
        hdr->ch_blkmap[((hdr->ch_fsize - 1) / bsize) + 1] = vattr.va_size;

        ndp = dcnode_alloc();
        ndp->dc_subvp = vp;
        VN_HOLD(vp);
        ndp->dc_hdr = hdr;
        ndp->dc_hdrsize = hdrsize;

        /*
         * Allocate a kmem cache if none is there already
         */
        ndp->dc_zmax = ZMAXBUF(bsize);
        cpp = &dcbuf_cache[btop(bsize)];
        mutex_enter(&dccache_lock);
        if (*cpp == NULL)
                *cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
                    NULL, NULL, NULL, NULL, 0);
        mutex_exit(&dccache_lock);
        ndp->dc_bufcache = *cpp;

        /*
         * Recheck the table in case someone else created the shadow
         * while we were blocked above.
         */
        mutex_enter(&dctable_lock);
        dp = dcfind(vp);
        if (dp != NULL) {
                mutex_exit(&dctable_lock);
                dcnode_recycle(ndp);
                kmem_cache_free(dcnode_cache, ndp);
                return (DCTOV(dp));
        }
        dcinsert(ndp);
        mutex_exit(&dctable_lock);

        return (DCTOV(ndp));
}
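
/*
 * Hedged sketch of how a host filesystem hands off to decompvp(); the real
 * caller is the _FIO_COMPRESSED handling in ufs_lookup(), and the flag test
 * and hold handling shown here are illustrative, not verbatim:
 *
 *      if (inode_is_fiocompressed(ip)) {       hypothetical helper
 *              struct vnode *cvp = decompvp(vp, cr, NULL);
 *
 *              if (cvp != NULL) {
 *                      VN_RELE(vp);            drop the host vnode hold
 *                      vp = cvp;               hand the shadow to VFS
 *              }
 *      }
 */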


/*
 * dcnode lookup table
 *
 * These routines maintain a table of dcnodes hashed by their
 * subordinate vnode so that they can be found if they already
 * exist in the vnode cache.
 */

/*
 * Put a dcnode in the table.
 */
static void
dcinsert(struct dcnode *newdp)
{
        int idx = DCHASH(newdp->dc_subvp);

        ASSERT(MUTEX_HELD(&dctable_lock));
        newdp->dc_hash = dctable[idx];
        dctable[idx] = newdp;
}

/*
 * Remove a dcnode from the hash table.
 */
static void
dcdelete(struct dcnode *deldp)
{
        int idx = DCHASH(deldp->dc_subvp);
        struct dcnode *dp, *prevdp;

        ASSERT(MUTEX_HELD(&dctable_lock));
        dp = dctable[idx];
        if (dp == deldp)
                dctable[idx] = dp->dc_hash;
        else {
                for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
                    prevdp = dp, dp = dp->dc_hash) {
                        if (dp == deldp) {
                                prevdp->dc_hash = dp->dc_hash;
                                break;
                        }
                }
        }
        ASSERT(dp != NULL);
}

/*
 * Find a shadow vnode in the dctable hash list.
 */
static struct dcnode *
dcfind(struct vnode *vp)
{
        struct dcnode *dp;

        ASSERT(MUTEX_HELD(&dctable_lock));
        for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
                if (dp->dc_subvp == vp) {
                        VN_HOLD(DCTOV(dp));
                        if (dp->dc_lrunext)
                                dclru_sub(dp);
                        return (dp);
                }
        return (NULL);
}

#ifdef  DEBUG
static int
dclru_count(void)
{
        struct dcnode *dp;
        int i = 0;

        if (dclru == NULL)
                return (0);
        for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext)
                i++;
        return (i + 1);
}
#endif

static void
dclru_add(struct dcnode *dp)
{
        /*
         * Add to dclru as a doubly linked circular chain
         */
        ASSERT(MUTEX_HELD(&dctable_lock));
        if (dclru == NULL) {
                dclru = dp;
                dp->dc_lruprev = dp->dc_lrunext = dp;
        } else {
                struct dcnode *last = dclru->dc_lruprev;

                dclru->dc_lruprev = dp;
                last->dc_lrunext = dp;
                dp->dc_lruprev = last;
                dp->dc_lrunext = dclru;
        }
        dclru_len++;
        ASSERT(dclru_len == dclru_count());
}

static void
dclru_sub(struct dcnode *dp)
{
        ASSERT(MUTEX_HELD(&dctable_lock));
        dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
        dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
        if (dp == dclru)
                dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
        dp->dc_lrunext = dp->dc_lruprev = NULL;
        dclru_len--;
        ASSERT(dclru_len == dclru_count());
}