1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
  28  *      All rights reserved.
  29  */
  30 
  31 /*
  32  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  33  */
  34 
  35 #include <sys/param.h>
  36 #include <sys/types.h>
  37 #include <sys/systm.h>
  38 #include <sys/cred.h>
  39 #include <sys/time.h>
  40 #include <sys/vnode.h>
  41 #include <sys/vfs.h>
  42 #include <sys/vfs_opreg.h>
  43 #include <sys/file.h>
  44 #include <sys/filio.h>
  45 #include <sys/uio.h>
  46 #include <sys/buf.h>
  47 #include <sys/mman.h>
  48 #include <sys/pathname.h>
  49 #include <sys/dirent.h>
  50 #include <sys/debug.h>
  51 #include <sys/vmsystm.h>
  52 #include <sys/fcntl.h>
  53 #include <sys/flock.h>
  54 #include <sys/swap.h>
  55 #include <sys/errno.h>
  56 #include <sys/strsubr.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/kmem.h>
  59 #include <sys/cmn_err.h>
  60 #include <sys/pathconf.h>
  61 #include <sys/utsname.h>
  62 #include <sys/dnlc.h>
  63 #include <sys/acl.h>
  64 #include <sys/systeminfo.h>
  65 #include <sys/atomic.h>
  66 #include <sys/policy.h>
  67 #include <sys/sdt.h>
  68 #include <sys/zone.h>
  69 
  70 #include <rpc/types.h>
  71 #include <rpc/auth.h>
  72 #include <rpc/clnt.h>
  73 #include <rpc/rpc_rdma.h>
  74 
  75 #include <nfs/nfs.h>
  76 #include <nfs/nfs_clnt.h>
  77 #include <nfs/rnode.h>
  78 #include <nfs/nfs_acl.h>
  79 #include <nfs/lm.h>
  80 
  81 #include <vm/hat.h>
  82 #include <vm/as.h>
  83 #include <vm/page.h>
  84 #include <vm/pvn.h>
  85 #include <vm/seg.h>
  86 #include <vm/seg_map.h>
  87 #include <vm/seg_kpm.h>
  88 #include <vm/seg_vn.h>
  89 
  90 #include <fs/fs_subr.h>
  91 
  92 #include <sys/ddi.h>
  93 
  94 static int      nfs3_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
  95                         cred_t *);
  96 static int      nfs3write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
  97                         stable_how *);
  98 static int      nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *, cred_t *);
  99 static int      nfs3setattr(vnode_t *, struct vattr *, int, cred_t *);
 100 static int      nfs3_accessx(void *, int, cred_t *);
 101 static int      nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
 102 static int      nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
 103 static int      nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl,
 104                         int, vnode_t **, cred_t *, int);
 105 static int      nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *);
 106 static int      nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
 107                         int, vnode_t **, cred_t *);
 108 static int      nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
 109                         caller_context_t *);
 110 static int      do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
 111 static void     nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
 112 static void     nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *);
 113 static int      nfs3_bio(struct buf *, stable_how *, cred_t *);
 114 static int      nfs3_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
 115                         page_t *[], size_t, struct seg *, caddr_t,
 116                         enum seg_rw, cred_t *);
 117 static void     nfs3_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
 118                         cred_t *);
 119 static int      nfs3_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
 120                         int, cred_t *);
 121 static int      nfs3_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
 122                         int, cred_t *);
 123 static int      nfs3_commit(vnode_t *, offset3, count3, cred_t *);
 124 static void     nfs3_set_mod(vnode_t *);
 125 static void     nfs3_get_commit(vnode_t *);
 126 static void     nfs3_get_commit_range(vnode_t *, u_offset_t, size_t);
 127 static int      nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
 128 static int      nfs3_commit_vp(vnode_t *, u_offset_t, size_t,  cred_t *);
 129 static int      nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
 130                         cred_t *);
 131 static void     nfs3_async_commit(vnode_t *, page_t *, offset3, count3,
 132                         cred_t *);
 133 static void     nfs3_delmap_callback(struct as *, void *, uint_t);
 134 
 135 /*
 136  * Error flags used to pass information about certain special errors
 137  * which need to be handled specially.
 138  */
 139 #define NFS_EOF                 -98
 140 #define NFS_VERF_MISMATCH       -97
 141 
 142 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
 143 #define ALIGN64(x, ptr, sz)                                             \
 144         x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);           \
 145         if (x) {                                                        \
 146                 x = sizeof (uint64_t) - (x);                            \
 147                 sz -= (x);                                              \
 148                 ptr += (x);                                             \
 149         }
 150 
 151 /*
 152  * These are the vnode ops routines which implement the vnode interface to
 153  * the networked file system.  These routines just take their parameters,
 154  * make them look networkish by putting the right info into interface structs,
 155  * and then calling the appropriate remote routine(s) to do the work.
 156  *
  157  * Note on directory name lookup caching:  If we detect a stale fhandle,
 158  * we purge the directory cache relative to that vnode.  This way, the
 159  * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 160  * more details on rnode locking.
 161  */
 162 
 163 static int      nfs3_open(vnode_t **, int, cred_t *, caller_context_t *);
 164 static int      nfs3_close(vnode_t *, int, int, offset_t, cred_t *,
 165                         caller_context_t *);
 166 static int      nfs3_read(vnode_t *, struct uio *, int, cred_t *,
 167                         caller_context_t *);
 168 static int      nfs3_write(vnode_t *, struct uio *, int, cred_t *,
 169                         caller_context_t *);
 170 static int      nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
 171                         caller_context_t *);
 172 static int      nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *,
 173                         caller_context_t *);
 174 static int      nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *,
 175                         caller_context_t *);
 176 static int      nfs3_access(vnode_t *, int, int, cred_t *, caller_context_t *);
 177 static int      nfs3_readlink(vnode_t *, struct uio *, cred_t *,
 178                         caller_context_t *);
 179 static int      nfs3_fsync(vnode_t *, int, cred_t *, caller_context_t *);
 180 static void     nfs3_inactive(vnode_t *, cred_t *, caller_context_t *);
 181 static int      nfs3_lookup(vnode_t *, char *, vnode_t **,
 182                         struct pathname *, int, vnode_t *, cred_t *,
 183                         caller_context_t *, int *, pathname_t *);
 184 static int      nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl,
 185                         int, vnode_t **, cred_t *, int, caller_context_t *,
 186                         vsecattr_t *);
 187 static int      nfs3_remove(vnode_t *, char *, cred_t *, caller_context_t *,
 188                         int);
 189 static int      nfs3_link(vnode_t *, vnode_t *, char *, cred_t *,
 190                         caller_context_t *, int);
 191 static int      nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
 192                         caller_context_t *, int);
 193 static int      nfs3_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
 194                         cred_t *, caller_context_t *, int, vsecattr_t *);
 195 static int      nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
 196                         caller_context_t *, int);
 197 static int      nfs3_symlink(vnode_t *, char *, struct vattr *, char *,
 198                         cred_t *, caller_context_t *, int);
 199 static int      nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *,
 200                         caller_context_t *, int);
 201 static int      nfs3_fid(vnode_t *, fid_t *, caller_context_t *);
 202 static int      nfs3_rwlock(vnode_t *, int, caller_context_t *);
 203 static void     nfs3_rwunlock(vnode_t *, int, caller_context_t *);
 204 static int      nfs3_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
 205 static int      nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *,
 206                         page_t *[], size_t, struct seg *, caddr_t,
 207                         enum seg_rw, cred_t *, caller_context_t *);
 208 static int      nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
 209                         caller_context_t *);
 210 static int      nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
 211                         uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 212 static int      nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 213                         uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 214 static int      nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
 215                         struct flk_callback *, cred_t *, caller_context_t *);
 216 static int      nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t,
 217                         cred_t *, caller_context_t *);
 218 static int      nfs3_realvp(vnode_t *, vnode_t **, caller_context_t *);
 219 static int      nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 220                         uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
 221 static int      nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *,
 222                         caller_context_t *);
 223 static int      nfs3_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
 224                         cred_t *, caller_context_t *);
 225 static void     nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *,
 226                         caller_context_t *);
 227 static int      nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 228                         caller_context_t *);
 229 static int      nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 230                         caller_context_t *);
 231 static int      nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
 232                         caller_context_t *);
 233 
 234 struct vnodeops *nfs3_vnodeops;
 235 
 236 const fs_operation_def_t nfs3_vnodeops_template[] = {
 237         VOPNAME_OPEN,           { .vop_open = nfs3_open },
 238         VOPNAME_CLOSE,          { .vop_close = nfs3_close },
 239         VOPNAME_READ,           { .vop_read = nfs3_read },
 240         VOPNAME_WRITE,          { .vop_write = nfs3_write },
 241         VOPNAME_IOCTL,          { .vop_ioctl = nfs3_ioctl },
 242         VOPNAME_GETATTR,        { .vop_getattr = nfs3_getattr },
 243         VOPNAME_SETATTR,        { .vop_setattr = nfs3_setattr },
 244         VOPNAME_ACCESS,         { .vop_access = nfs3_access },
 245         VOPNAME_LOOKUP,         { .vop_lookup = nfs3_lookup },
 246         VOPNAME_CREATE,         { .vop_create = nfs3_create },
 247         VOPNAME_REMOVE,         { .vop_remove = nfs3_remove },
 248         VOPNAME_LINK,           { .vop_link = nfs3_link },
 249         VOPNAME_RENAME,         { .vop_rename = nfs3_rename },
 250         VOPNAME_MKDIR,          { .vop_mkdir = nfs3_mkdir },
 251         VOPNAME_RMDIR,          { .vop_rmdir = nfs3_rmdir },
 252         VOPNAME_READDIR,        { .vop_readdir = nfs3_readdir },
 253         VOPNAME_SYMLINK,        { .vop_symlink = nfs3_symlink },
 254         VOPNAME_READLINK,       { .vop_readlink = nfs3_readlink },
 255         VOPNAME_FSYNC,          { .vop_fsync = nfs3_fsync },
 256         VOPNAME_INACTIVE,       { .vop_inactive = nfs3_inactive },
 257         VOPNAME_FID,            { .vop_fid = nfs3_fid },
 258         VOPNAME_RWLOCK,         { .vop_rwlock = nfs3_rwlock },
 259         VOPNAME_RWUNLOCK,       { .vop_rwunlock = nfs3_rwunlock },
 260         VOPNAME_SEEK,           { .vop_seek = nfs3_seek },
 261         VOPNAME_FRLOCK,         { .vop_frlock = nfs3_frlock },
 262         VOPNAME_SPACE,          { .vop_space = nfs3_space },
 263         VOPNAME_REALVP,         { .vop_realvp = nfs3_realvp },
 264         VOPNAME_GETPAGE,        { .vop_getpage = nfs3_getpage },
 265         VOPNAME_PUTPAGE,        { .vop_putpage = nfs3_putpage },
 266         VOPNAME_MAP,            { .vop_map = nfs3_map },
 267         VOPNAME_ADDMAP,         { .vop_addmap = nfs3_addmap },
 268         VOPNAME_DELMAP,         { .vop_delmap = nfs3_delmap },
 269         /* no separate nfs3_dump */
 270         VOPNAME_DUMP,           { .vop_dump = nfs_dump },
 271         VOPNAME_PATHCONF,       { .vop_pathconf = nfs3_pathconf },
 272         VOPNAME_PAGEIO,         { .vop_pageio = nfs3_pageio },
 273         VOPNAME_DISPOSE,        { .vop_dispose = nfs3_dispose },
 274         VOPNAME_SETSECATTR,     { .vop_setsecattr = nfs3_setsecattr },
 275         VOPNAME_GETSECATTR,     { .vop_getsecattr = nfs3_getsecattr },
 276         VOPNAME_SHRLOCK,        { .vop_shrlock = nfs3_shrlock },
 277         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
 278         NULL,                   NULL
 279 };
 280 
 281 /*
 282  * XXX:  This is referenced in modstubs.s
 283  */
 284 struct vnodeops *
 285 nfs3_getvnodeops(void)
 286 {
 287         return (nfs3_vnodeops);
 288 }
 289 
 290 /* ARGSUSED */
 291 static int
 292 nfs3_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 293 {
 294         int error;
 295         struct vattr va;
 296         rnode_t *rp;
 297         vnode_t *vp;
 298 
 299         vp = *vpp;
 300         if (nfs_zone() != VTOMI(vp)->mi_zone)
 301                 return (EIO);
 302         rp = VTOR(vp);
 303         mutex_enter(&rp->r_statelock);
 304         if (rp->r_cred == NULL) {
 305                 crhold(cr);
 306                 rp->r_cred = cr;
 307         }
 308         mutex_exit(&rp->r_statelock);
 309 
 310         /*
 311          * If there is no cached data or if close-to-open
 312          * consistency checking is turned off, we can avoid
 313          * the over the wire getattr.  Otherwise, if the
 314          * file system is mounted readonly, then just verify
 315          * the caches are up to date using the normal mechanism.
 316          * Else, if the file is not mmap'd, then just mark
 317          * the attributes as timed out.  They will be refreshed
 318          * and the caches validated prior to being used.
 319          * Else, the file system is mounted writeable so
 320          * force an over the wire GETATTR in order to ensure
 321          * that all cached data is valid.
 322          */
 323         if (vp->v_count > 1 ||
 324             ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
 325             !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
 326                 if (vn_is_readonly(vp))
 327                         error = nfs3_validate_caches(vp, cr);
 328                 else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
 329                         PURGE_ATTRCACHE(vp);
 330                         error = 0;
 331                 } else {
 332                         va.va_mask = AT_ALL;
 333                         error = nfs3_getattr_otw(vp, &va, cr);
 334                 }
 335         } else
 336                 error = 0;
 337 
 338         return (error);
 339 }
 340 
 341 /* ARGSUSED */
 342 static int
 343 nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 344                 caller_context_t *ct)
 345 {
 346         rnode_t *rp;
 347         int error;
 348         struct vattr va;
 349 
 350         /*
 351          * zone_enter(2) prevents processes from changing zones with NFS files
 352          * open; if we happen to get here from the wrong zone we can't do
 353          * anything over the wire.
 354          */
 355         if (VTOMI(vp)->mi_zone != nfs_zone()) {
 356                 /*
 357                  * We could attempt to clean up locks, except we're sure
 358                  * that the current process didn't acquire any locks on
 359                  * the file: any attempt to lock a file belong to another zone
 360                  * will fail, and one can't lock an NFS file and then change
 361                  * zones, as that fails too.
 362                  *
 363                  * Returning an error here is the sane thing to do.  A
 364                  * subsequent call to VN_RELE() which translates to a
 365                  * nfs3_inactive() will clean up state: if the zone of the
 366                  * vnode's origin is still alive and kicking, an async worker
 367                  * thread will handle the request (from the correct zone), and
 368                  * everything (minus the commit and final nfs3_getattr_otw()
 369                  * call) should be OK. If the zone is going away
 370                  * nfs_async_inactive() will throw away cached pages inline.
 371                  */
 372                 return (EIO);
 373         }
 374 
 375         /*
 376          * If we are using local locking for this filesystem, then
 377          * release all of the SYSV style record locks.  Otherwise,
 378          * we are doing network locking and we need to release all
 379          * of the network locks.  All of the locks held by this
 380          * process on this file are released no matter what the
 381          * incoming reference count is.
 382          */
 383         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
 384                 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 385                 cleanshares(vp, ttoproc(curthread)->p_pid);
 386         } else
 387                 nfs_lockrelease(vp, flag, offset, cr);
 388 
 389         if (count > 1)
 390                 return (0);
 391 
 392         /*
 393          * If the file has been `unlinked', then purge the
 394          * DNLC so that this vnode will get reycled quicker
 395          * and the .nfs* file on the server will get removed.
 396          */
 397         rp = VTOR(vp);
 398         if (rp->r_unldvp != NULL)
 399                 dnlc_purge_vp(vp);
 400 
 401         /*
 402          * If the file was open for write and there are pages,
 403          * then if the file system was mounted using the "no-close-
 404          *      to-open" semantics, then start an asynchronous flush
 405          *      of the all of the pages in the file.
 406          * else the file system was not mounted using the "no-close-
 407          *      to-open" semantics, then do a synchronous flush and
 408          *      commit of all of the dirty and uncommitted pages.
 409          *
 410          * The asynchronous flush of the pages in the "nocto" path
 411          * mostly just associates a cred pointer with the rnode so
 412          * writes which happen later will have a better chance of
 413          * working.  It also starts the data being written to the
 414          * server, but without unnecessarily delaying the application.
 415          */
 416         if ((flag & FWRITE) && vn_has_cached_data(vp)) {
 417                 if (VTOMI(vp)->mi_flags & MI_NOCTO) {
 418                         error = nfs3_putpage(vp, (offset_t)0, 0, B_ASYNC,
 419                             cr, ct);
 420                         if (error == EAGAIN)
 421                                 error = 0;
 422                 } else
 423                         error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
 424                 if (!error) {
 425                         mutex_enter(&rp->r_statelock);
 426                         error = rp->r_error;
 427                         rp->r_error = 0;
 428                         mutex_exit(&rp->r_statelock);
 429                 }
 430         } else {
 431                 mutex_enter(&rp->r_statelock);
 432                 error = rp->r_error;
 433                 rp->r_error = 0;
 434                 mutex_exit(&rp->r_statelock);
 435         }
 436 
 437         /*
 438          * If RWRITEATTR is set, then issue an over the wire GETATTR to
 439          * refresh the attribute cache with a set of attributes which
 440          * weren't returned from a WRITE.  This will enable the close-
 441          * to-open processing to work.
 442          */
 443         if (rp->r_flags & RWRITEATTR)
 444                 (void) nfs3_getattr_otw(vp, &va, cr);
 445 
 446         return (error);
 447 }
 448 
 449 /* ARGSUSED */
 450 static int
 451 nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr)
 452 {
 453         mntinfo_t *mi;
 454         READ3args args;
 455         READ3uiores res;
 456         int tsize;
 457         offset_t offset;
 458         ssize_t count;
 459         int error;
 460         int douprintf;
 461         failinfo_t fi;
 462         char *sv_hostname;
 463 
 464         mi = VTOMI(vp);
 465         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 466         sv_hostname = VTOR(vp)->r_server->sv_hostname;
 467 
 468         douprintf = 1;
 469         args.file = *VTOFH3(vp);
 470         fi.vp = vp;
 471         fi.fhp = (caddr_t)&args.file;
 472         fi.copyproc = nfs3copyfh;
 473         fi.lookupproc = nfs3lookup;
 474         fi.xattrdirproc = acl_getxattrdir3;
 475 
 476         res.uiop = uiop;
 477 
 478         res.wlist = NULL;
 479 
 480         offset = uiop->uio_loffset;
 481         count = uiop->uio_resid;
 482 
 483         do {
 484                 if (mi->mi_io_kstats) {
 485                         mutex_enter(&mi->mi_lock);
 486                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
 487                         mutex_exit(&mi->mi_lock);
 488                 }
 489 
 490                 do {
 491                         tsize = MIN(mi->mi_tsize, count);
 492                         args.offset = (offset3)offset;
 493                         args.count = (count3)tsize;
 494                         res.size = (uint_t)tsize;
 495                         args.res_uiop = uiop;
 496                         args.res_data_val_alt = NULL;
 497 
 498                         error = rfs3call(mi, NFSPROC3_READ,
 499                             xdr_READ3args, (caddr_t)&args,
 500                             xdr_READ3uiores, (caddr_t)&res, cr,
 501                             &douprintf, &res.status, 0, &fi);
 502                 } while (error == ENFS_TRYAGAIN);
 503 
 504                 if (mi->mi_io_kstats) {
 505                         mutex_enter(&mi->mi_lock);
 506                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
 507                         mutex_exit(&mi->mi_lock);
 508                 }
 509 
 510                 if (error)
 511                         return (error);
 512 
 513                 error = geterrno3(res.status);
 514                 if (error)
 515                         return (error);
 516 
 517                 if (res.count != res.size) {
 518                         zcmn_err(getzoneid(), CE_WARN,
 519 "nfs3_directio_read: server %s returned incorrect amount",
 520                             sv_hostname);
 521                         return (EIO);
 522                 }
 523                 count -= res.count;
 524                 offset += res.count;
 525                 if (mi->mi_io_kstats) {
 526                         mutex_enter(&mi->mi_lock);
 527                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
 528                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
 529                         mutex_exit(&mi->mi_lock);
 530                 }
 531                 lwp_stat_update(LWP_STAT_INBLK, 1);
 532         } while (count && !res.eof);
 533 
 534         return (0);
 535 }
 536 
 537 /* ARGSUSED */
 538 static int
 539 nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 540         caller_context_t *ct)
 541 {
 542         rnode_t *rp;
 543         u_offset_t off;
 544         offset_t diff;
 545         int on;
 546         size_t n;
 547         caddr_t base;
 548         uint_t flags;
 549         int error = 0;
 550         mntinfo_t *mi;
 551 
 552         rp = VTOR(vp);
 553         mi = VTOMI(vp);
 554 
 555         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
 556 
 557         if (nfs_zone() != mi->mi_zone)
 558                 return (EIO);
 559 
 560         if (vp->v_type != VREG)
 561                 return (EISDIR);
 562 
 563         if (uiop->uio_resid == 0)
 564                 return (0);
 565 
 566         if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
 567                 return (EINVAL);
 568 
 569         /*
 570          * Bypass VM if caching has been disabled (e.g., locking) or if
 571          * using client-side direct I/O and the file is not mmap'd and
 572          * there are no cached pages.
 573          */
 574         if ((vp->v_flag & VNOCACHE) ||
 575             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 576             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 577             !vn_has_cached_data(vp))) {
 578                 return (nfs3_directio_read(vp, uiop, cr));
 579         }
 580 
 581         do {
 582                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 583                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 584                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 585 
 586                 error = nfs3_validate_caches(vp, cr);
 587                 if (error)
 588                         break;
 589 
 590                 mutex_enter(&rp->r_statelock);
 591                 while (rp->r_flags & RINCACHEPURGE) {
 592                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 593                                 mutex_exit(&rp->r_statelock);
 594                                 return (EINTR);
 595                         }
 596                 }
 597                 diff = rp->r_size - uiop->uio_loffset;
 598                 mutex_exit(&rp->r_statelock);
 599                 if (diff <= 0)
 600                         break;
 601                 if (diff < n)
 602                         n = (size_t)diff;
 603 
 604                 if (vpm_enable) {
 605                         /*
 606                          * Copy data.
 607                          */
 608                         error = vpm_data_copy(vp, off + on, n, uiop,
 609                             1, NULL, 0, S_READ);
 610                 } else {
 611                         base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
 612                             S_READ);
 613 
 614                         error = uiomove(base + on, n, UIO_READ, uiop);
 615                 }
 616 
 617                 if (!error) {
 618                         /*
 619                          * If read a whole block or read to eof,
 620                          * won't need this buffer again soon.
 621                          */
 622                         mutex_enter(&rp->r_statelock);
 623                         if (n + on == MAXBSIZE ||
 624                             uiop->uio_loffset == rp->r_size)
 625                                 flags = SM_DONTNEED;
 626                         else
 627                                 flags = 0;
 628                         mutex_exit(&rp->r_statelock);
 629                         if (vpm_enable) {
 630                                 error = vpm_sync_pages(vp, off, n, flags);
 631                         } else {
 632                                 error = segmap_release(segkmap, base, flags);
 633                         }
 634                 } else {
 635                         if (vpm_enable) {
 636                                 (void) vpm_sync_pages(vp, off, n, 0);
 637                         } else {
 638                                 (void) segmap_release(segkmap, base, 0);
 639                         }
 640                 }
 641         } while (!error && uiop->uio_resid > 0);
 642 
 643         return (error);
 644 }
 645 
 646 /* ARGSUSED */
 647 static int
 648 nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 649         caller_context_t *ct)
 650 {
 651         rlim64_t limit = uiop->uio_llimit;
 652         rnode_t *rp;
 653         u_offset_t off;
 654         caddr_t base;
 655         uint_t flags;
 656         int remainder;
 657         size_t n;
 658         int on;
 659         int error;
 660         int resid;
 661         offset_t offset;
 662         mntinfo_t *mi;
 663         uint_t bsize;
 664 
 665         rp = VTOR(vp);
 666 
 667         if (vp->v_type != VREG)
 668                 return (EISDIR);
 669 
 670         mi = VTOMI(vp);
 671         if (nfs_zone() != mi->mi_zone)
 672                 return (EIO);
 673         if (uiop->uio_resid == 0)
 674                 return (0);
 675 
 676         if (ioflag & FAPPEND) {
 677                 struct vattr va;
 678 
 679                 /*
 680                  * Must serialize if appending.
 681                  */
 682                 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
 683                         nfs_rw_exit(&rp->r_rwlock);
 684                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
 685                             INTR(vp)))
 686                                 return (EINTR);
 687                 }
 688 
 689                 va.va_mask = AT_SIZE;
 690                 error = nfs3getattr(vp, &va, cr);
 691                 if (error)
 692                         return (error);
 693                 uiop->uio_loffset = va.va_size;
 694         }
 695 
 696         offset = uiop->uio_loffset + uiop->uio_resid;
 697 
 698         if (uiop->uio_loffset < 0 || offset < 0)
 699                 return (EINVAL);
 700 
 701         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 702                 limit = MAXOFFSET_T;
 703 
 704         /*
 705          * Check to make sure that the process will not exceed
 706          * its limit on file size.  It is okay to write up to
 707          * the limit, but not beyond.  Thus, the write which
 708          * reaches the limit will be short and the next write
 709          * will return an error.
 710          */
 711         remainder = 0;
 712         if (offset > limit) {
 713                 remainder = offset - limit;
 714                 uiop->uio_resid = limit - uiop->uio_loffset;
 715                 if (uiop->uio_resid <= 0) {
 716                         proc_t *p = ttoproc(curthread);
 717 
 718                         uiop->uio_resid += remainder;
 719                         mutex_enter(&p->p_lock);
 720                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 721                             p->p_rctls, p, RCA_UNSAFE_SIGINFO);
 722                         mutex_exit(&p->p_lock);
 723                         return (EFBIG);
 724                 }
 725         }
 726 
 727         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
 728                 return (EINTR);
 729 
 730         /*
 731          * Bypass VM if caching has been disabled (e.g., locking) or if
 732          * using client-side direct I/O and the file is not mmap'd and
 733          * there are no cached pages.
 734          */
 735         if ((vp->v_flag & VNOCACHE) ||
 736             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 737             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 738             !vn_has_cached_data(vp))) {
 739                 size_t bufsize;
 740                 int count;
 741                 u_offset_t org_offset;
 742                 stable_how stab_comm;
 743 
 744 nfs3_fwrite:
 745                 if (rp->r_flags & RSTALE) {
 746                         resid = uiop->uio_resid;
 747                         offset = uiop->uio_loffset;
 748                         error = rp->r_error;
 749                         /*
 750                          * A close may have cleared r_error, if so,
 751                          * propagate ESTALE error return properly
 752                          */
 753                         if (error == 0)
 754                                 error = ESTALE;
 755                         goto bottom;
 756                 }
 757                 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
 758                 base = kmem_alloc(bufsize, KM_SLEEP);
 759                 do {
 760                         if (ioflag & FDSYNC)
 761                                 stab_comm = DATA_SYNC;
 762                         else
 763                                 stab_comm = FILE_SYNC;
 764                         resid = uiop->uio_resid;
 765                         offset = uiop->uio_loffset;
 766                         count = MIN(uiop->uio_resid, bufsize);
 767                         org_offset = uiop->uio_loffset;
 768                         error = uiomove(base, count, UIO_WRITE, uiop);
 769                         if (!error) {
 770                                 error = nfs3write(vp, base, org_offset,
 771                                     count, cr, &stab_comm);
 772                         }
 773                 } while (!error && uiop->uio_resid > 0);
 774                 kmem_free(base, bufsize);
 775                 goto bottom;
 776         }
 777 
 778 
 779         bsize = vp->v_vfsp->vfs_bsize;
 780 
 781         do {
 782                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 783                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 784                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 785 
 786                 resid = uiop->uio_resid;
 787                 offset = uiop->uio_loffset;
 788 
 789                 if (rp->r_flags & RSTALE) {
 790                         error = rp->r_error;
 791                         /*
 792                          * A close may have cleared r_error, if so,
 793                          * propagate ESTALE error return properly
 794                          */
 795                         if (error == 0)
 796                                 error = ESTALE;
 797                         break;
 798                 }
 799 
 800                 /*
 801                  * Don't create dirty pages faster than they
 802                  * can be cleaned so that the system doesn't
 803                  * get imbalanced.  If the async queue is
 804                  * maxed out, then wait for it to drain before
 805                  * creating more dirty pages.  Also, wait for
 806                  * any threads doing pagewalks in the vop_getattr
 807                  * entry points so that they don't block for
 808                  * long periods.
 809                  */
 810                 mutex_enter(&rp->r_statelock);
 811                 while ((mi->mi_max_threads != 0 &&
 812                     rp->r_awcount > 2 * mi->mi_max_threads) ||
 813                     rp->r_gcount > 0) {
 814                         if (INTR(vp)) {
 815                                 klwp_t *lwp = ttolwp(curthread);
 816 
 817                                 if (lwp != NULL)
 818                                         lwp->lwp_nostop++;
 819                                 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 820                                         mutex_exit(&rp->r_statelock);
 821                                         if (lwp != NULL)
 822                                                 lwp->lwp_nostop--;
 823                                         error = EINTR;
 824                                         goto bottom;
 825                                 }
 826                                 if (lwp != NULL)
 827                                         lwp->lwp_nostop--;
 828                         } else
 829                                 cv_wait(&rp->r_cv, &rp->r_statelock);
 830                 }
 831                 mutex_exit(&rp->r_statelock);
 832 
 833                 /*
 834                  * Touch the page and fault it in if it is not in core
 835                  * before segmap_getmapflt or vpm_data_copy can lock it.
 836                  * This is to avoid the deadlock if the buffer is mapped
 837                  * to the same file through mmap which we want to write.
 838                  */
 839                 uio_prefaultpages((long)n, uiop);
 840 
 841                 if (vpm_enable) {
 842                         /*
 843                          * It will use kpm mappings, so no need to
 844                          * pass an address.
 845                          */
 846                         error = writerp(rp, NULL, n, uiop, 0);
 847                 } else  {
 848                         if (segmap_kpm) {
 849                                 int pon = uiop->uio_loffset & PAGEOFFSET;
 850                                 size_t pn = MIN(PAGESIZE - pon,
 851                                     uiop->uio_resid);
 852                                 int pagecreate;
 853 
 854                                 mutex_enter(&rp->r_statelock);
 855                                 pagecreate = (pon == 0) && (pn == PAGESIZE ||
 856                                     uiop->uio_loffset + pn >= rp->r_size);
 857                                 mutex_exit(&rp->r_statelock);
 858 
 859                                 base = segmap_getmapflt(segkmap, vp, off + on,
 860                                     pn, !pagecreate, S_WRITE);
 861 
 862                                 error = writerp(rp, base + pon, n, uiop,
 863                                     pagecreate);
 864 
 865                         } else {
 866                                 base = segmap_getmapflt(segkmap, vp, off + on,
 867                                     n, 0, S_READ);
 868                                 error = writerp(rp, base + on, n, uiop, 0);
 869                         }
 870                 }
 871 
 872                 if (!error) {
 873                         if (mi->mi_flags & MI_NOAC)
 874                                 flags = SM_WRITE;
 875                         else if ((uiop->uio_loffset % bsize) == 0 ||
 876                             IS_SWAPVP(vp)) {
 877                                 /*
 878                                  * Have written a whole block.
 879                                  * Start an asynchronous write
 880                                  * and mark the buffer to
 881                                  * indicate that it won't be
 882                                  * needed again soon.
 883                                  */
 884                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
 885                         } else
 886                                 flags = 0;
 887                         if ((ioflag & (FSYNC|FDSYNC)) ||
 888                             (rp->r_flags & ROUTOFSPACE)) {
 889                                 flags &= ~SM_ASYNC;
 890                                 flags |= SM_WRITE;
 891                         }
 892                         if (vpm_enable) {
 893                                 error = vpm_sync_pages(vp, off, n, flags);
 894                         } else {
 895                                 error = segmap_release(segkmap, base, flags);
 896                         }
 897                 } else {
 898                         if (vpm_enable) {
 899                                 (void) vpm_sync_pages(vp, off, n, 0);
 900                         } else {
 901                                 (void) segmap_release(segkmap, base, 0);
 902                         }
 903                         /*
 904                          * In the event that we got an access error while
 905                          * faulting in a page for a write-only file just
 906                          * force a write.
 907                          */
 908                         if (error == EACCES)
 909                                 goto nfs3_fwrite;
 910                 }
 911         } while (!error && uiop->uio_resid > 0);
 912 
 913 bottom:
 914         if (error) {
 915                 uiop->uio_resid = resid + remainder;
 916                 uiop->uio_loffset = offset;
 917         } else
 918                 uiop->uio_resid += remainder;
 919 
 920         nfs_rw_exit(&rp->r_lkserlock);
 921 
 922         return (error);
 923 }
 924 
 925 /*
 926  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 927  */
 928 static int
 929 nfs3_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
 930         int flags, cred_t *cr)
 931 {
 932         struct buf *bp;
 933         int error;
 934         page_t *savepp;
 935         uchar_t fsdata;
 936         stable_how stab_comm;
 937 
 938         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 939         bp = pageio_setup(pp, len, vp, flags);
 940         ASSERT(bp != NULL);
 941 
 942         /*
 943          * pageio_setup should have set b_addr to 0.  This
 944          * is correct since we want to do I/O on a page
 945          * boundary.  bp_mapin will use this addr to calculate
 946          * an offset, and then set b_addr to the kernel virtual
 947          * address it allocated for us.
 948          */
 949         ASSERT(bp->b_un.b_addr == 0);
 950 
 951         bp->b_edev = 0;
 952         bp->b_dev = 0;
 953         bp->b_lblkno = lbtodb(off);
 954         bp->b_file = vp;
 955         bp->b_offset = (offset_t)off;
 956         bp_mapin(bp);
 957 
 958         /*
 959          * Calculate the desired level of stability to write data
 960          * on the server and then mark all of the pages to reflect
 961          * this.
 962          */
 963         if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
 964             freemem > desfree) {
 965                 stab_comm = UNSTABLE;
 966                 fsdata = C_DELAYCOMMIT;
 967         } else {
 968                 stab_comm = FILE_SYNC;
 969                 fsdata = C_NOCOMMIT;
 970         }
 971 
 972         savepp = pp;
 973         do {
 974                 pp->p_fsdata = fsdata;
 975         } while ((pp = pp->p_next) != savepp);
 976 
 977         error = nfs3_bio(bp, &stab_comm, cr);
 978 
 979         bp_mapout(bp);
 980         pageio_done(bp);
 981 
 982         /*
 983          * If the server wrote pages in a more stable fashion than
 984          * was requested, then clear all of the marks in the pages
 985          * indicating that COMMIT operations were required.
 986          */
 987         if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) {
 988                 do {
 989                         pp->p_fsdata = C_NOCOMMIT;
 990                 } while ((pp = pp->p_next) != savepp);
 991         }
 992 
 993         return (error);
 994 }
 995 
 996 /*
 997  * Write to file.  Writes to remote server in largest size
 998  * chunks that the server can handle.  Write is synchronous.
 999  */
1000 static int
1001 nfs3write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
1002         stable_how *stab_comm)
1003 {
1004         mntinfo_t *mi;
1005         WRITE3args args;
1006         WRITE3res res;
1007         int error;
1008         int tsize;
1009         rnode_t *rp;
1010         int douprintf;
1011 
1012         rp = VTOR(vp);
1013         mi = VTOMI(vp);
1014 
1015         ASSERT(nfs_zone() == mi->mi_zone);
1016 
1017         args.file = *VTOFH3(vp);
1018         args.stable = *stab_comm;
1019 
1020         *stab_comm = FILE_SYNC;
1021 
1022         douprintf = 1;
1023 
1024         do {
1025                 if ((vp->v_flag & VNOCACHE) ||
1026                     (rp->r_flags & RDIRECTIO) ||
1027                     (mi->mi_flags & MI_DIRECTIO))
1028                         tsize = MIN(mi->mi_stsize, count);
1029                 else
1030                         tsize = MIN(mi->mi_curwrite, count);
1031                 args.offset = (offset3)offset;
1032                 args.count = (count3)tsize;
1033                 args.data.data_len = (uint_t)tsize;
1034                 args.data.data_val = base;
1035 
1036                 if (mi->mi_io_kstats) {
1037                         mutex_enter(&mi->mi_lock);
1038                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1039                         mutex_exit(&mi->mi_lock);
1040                 }
1041                 args.mblk = NULL;
1042                 do {
1043                         error = rfs3call(mi, NFSPROC3_WRITE,
1044                             xdr_WRITE3args, (caddr_t)&args,
1045                             xdr_WRITE3res, (caddr_t)&res, cr,
1046                             &douprintf, &res.status, 0, NULL);
1047                 } while (error == ENFS_TRYAGAIN);
1048                 if (mi->mi_io_kstats) {
1049                         mutex_enter(&mi->mi_lock);
1050                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1051                         mutex_exit(&mi->mi_lock);
1052                 }
1053 
1054                 if (error)
1055                         return (error);
1056                 error = geterrno3(res.status);
1057                 if (!error) {
1058                         if (res.resok.count > args.count) {
1059                                 zcmn_err(getzoneid(), CE_WARN,
1060                                     "nfs3write: server %s wrote %u, "
1061                                     "requested was %u",
1062                                     rp->r_server->sv_hostname,
1063                                     res.resok.count, args.count);
1064                                 return (EIO);
1065                         }
1066                         if (res.resok.committed == UNSTABLE) {
1067                                 *stab_comm = UNSTABLE;
1068                                 if (args.stable == DATA_SYNC ||
1069                                     args.stable == FILE_SYNC) {
1070                                         zcmn_err(getzoneid(), CE_WARN,
1071                         "nfs3write: server %s did not commit to stable storage",
1072                                             rp->r_server->sv_hostname);
1073                                         return (EIO);
1074                                 }
1075                         }
1076                         tsize = (int)res.resok.count;
1077                         count -= tsize;
1078                         base += tsize;
1079                         offset += tsize;
1080                         if (mi->mi_io_kstats) {
1081                                 mutex_enter(&mi->mi_lock);
1082                                 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
1083                                 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
1084                                     tsize;
1085                                 mutex_exit(&mi->mi_lock);
1086                         }
1087                         lwp_stat_update(LWP_STAT_OUBLK, 1);
1088                         mutex_enter(&rp->r_statelock);
1089                         if (rp->r_flags & RHAVEVERF) {
1090                                 if (rp->r_verf != res.resok.verf) {
1091                                         nfs3_set_mod(vp);
1092                                         rp->r_verf = res.resok.verf;
1093                                         /*
1094                                          * If the data was written UNSTABLE,
1095                                          * then might as well stop because
1096                                          * the whole block will have to get
1097                                          * rewritten anyway.
1098                                          */
1099                                         if (*stab_comm == UNSTABLE) {
1100                                                 mutex_exit(&rp->r_statelock);
1101                                                 break;
1102                                         }
1103                                 }
1104                         } else {
1105                                 rp->r_verf = res.resok.verf;
1106                                 rp->r_flags |= RHAVEVERF;
1107                         }
1108                         /*
1109                          * Mark the attribute cache as timed out and
1110                          * set RWRITEATTR to indicate that the file
1111                          * was modified with a WRITE operation and
1112                          * that the attributes can not be trusted.
1113                          */
1114                         PURGE_ATTRCACHE_LOCKED(rp);
1115                         rp->r_flags |= RWRITEATTR;
1116                         mutex_exit(&rp->r_statelock);
1117                 }
1118         } while (!error && count);
1119 
1120         return (error);
1121 }
1122 
1123 /*
1124  * Read from a file.  Reads data in largest chunks our interface can handle.
1125  */
1126 static int
1127 nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count,
1128         size_t *residp, cred_t *cr)
1129 {
1130         mntinfo_t *mi;
1131         READ3args args;
1132         READ3vres res;
1133         int tsize;
1134         int error;
1135         int douprintf;
1136         failinfo_t fi;
1137         rnode_t *rp;
1138         struct vattr va;
1139         hrtime_t t;
1140 
1141         rp = VTOR(vp);
1142         mi = VTOMI(vp);
1143         ASSERT(nfs_zone() == mi->mi_zone);
1144         douprintf = 1;
1145 
1146         args.file = *VTOFH3(vp);
1147         fi.vp = vp;
1148         fi.fhp = (caddr_t)&args.file;
1149         fi.copyproc = nfs3copyfh;
1150         fi.lookupproc = nfs3lookup;
1151         fi.xattrdirproc = acl_getxattrdir3;
1152 
1153         res.pov.fres.vp = vp;
1154         res.pov.fres.vap = &va;
1155 
1156         res.wlist = NULL;
1157         *residp = count;
1158         do {
1159                 if (mi->mi_io_kstats) {
1160                         mutex_enter(&mi->mi_lock);
1161                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1162                         mutex_exit(&mi->mi_lock);
1163                 }
1164 
1165                 do {
1166                         if ((vp->v_flag & VNOCACHE) ||
1167                             (rp->r_flags & RDIRECTIO) ||
1168                             (mi->mi_flags & MI_DIRECTIO))
1169                                 tsize = MIN(mi->mi_tsize, count);
1170                         else
1171                                 tsize = MIN(mi->mi_curread, count);
1172                         res.data.data_val = base;
1173                         res.data.data_len = tsize;
1174                         args.offset = (offset3)offset;
1175                         args.count = (count3)tsize;
1176                         args.res_uiop = NULL;
1177                         args.res_data_val_alt = base;
1178 
1179                         t = gethrtime();
1180                         error = rfs3call(mi, NFSPROC3_READ,
1181                             xdr_READ3args, (caddr_t)&args,
1182                             xdr_READ3vres, (caddr_t)&res, cr,
1183                             &douprintf, &res.status, 0, &fi);
1184                 } while (error == ENFS_TRYAGAIN);
1185 
1186                 if (mi->mi_io_kstats) {
1187                         mutex_enter(&mi->mi_lock);
1188                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1189                         mutex_exit(&mi->mi_lock);
1190                 }
1191 
1192                 if (error)
1193                         return (error);
1194 
1195                 error = geterrno3(res.status);
1196                 if (error)
1197                         return (error);
1198 
1199                 if (res.count != res.data.data_len) {
1200                         zcmn_err(getzoneid(), CE_WARN,
1201                             "nfs3read: server %s returned incorrect amount",
1202                             rp->r_server->sv_hostname);
1203                         return (EIO);
1204                 }
1205 
1206                 count -= res.count;
1207                 *residp = count;
1208                 base += res.count;
1209                 offset += res.count;
1210                 if (mi->mi_io_kstats) {
1211                         mutex_enter(&mi->mi_lock);
1212                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1213                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
1214                         mutex_exit(&mi->mi_lock);
1215                 }
1216                 lwp_stat_update(LWP_STAT_INBLK, 1);
1217         } while (count && !res.eof);
1218 
1219         if (res.pov.attributes) {
1220                 mutex_enter(&rp->r_statelock);
1221                 if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
1222                         mutex_exit(&rp->r_statelock);
1223                         PURGE_ATTRCACHE(vp);
1224                 } else {
1225                         if (rp->r_mtime <= t)
1226                                 nfs_attrcache_va(vp, &va);
1227                         mutex_exit(&rp->r_statelock);
1228                 }
1229         }
1230 
1231         return (0);
1232 }
1233 
1234 /* ARGSUSED */
1235 static int
1236 nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1237         caller_context_t *ct)
1238 {
1239 
1240         if (nfs_zone() != VTOMI(vp)->mi_zone)
1241                 return (EIO);
1242         switch (cmd) {
1243                 case _FIODIRECTIO:
1244                         return (nfs_directio(vp, (int)arg, cr));
1245                 default:
1246                         return (ENOTTY);
1247         }
1248 }
1249 
1250 /* ARGSUSED */
1251 static int
1252 nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1253         caller_context_t *ct)
1254 {
1255         int error;
1256         rnode_t *rp;
1257 
1258         if (nfs_zone() != VTOMI(vp)->mi_zone)
1259                 return (EIO);
1260         /*
1261          * If it has been specified that the return value will
1262          * just be used as a hint, and we are only being asked
1263          * for size, fsid or rdevid, then return the client's
1264          * notion of these values without checking to make sure
1265          * that the attribute cache is up to date.
1266          * The whole point is to avoid an over the wire GETATTR
1267          * call.
1268          */
1269         rp = VTOR(vp);
1270         if (flags & ATTR_HINT) {
1271                 if (vap->va_mask ==
1272                     (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1273                         mutex_enter(&rp->r_statelock);
1274                         if (vap->va_mask | AT_SIZE)
1275                                 vap->va_size = rp->r_size;
1276                         if (vap->va_mask | AT_FSID)
1277                                 vap->va_fsid = rp->r_attr.va_fsid;
1278                         if (vap->va_mask | AT_RDEV)
1279                                 vap->va_rdev = rp->r_attr.va_rdev;
1280                         mutex_exit(&rp->r_statelock);
1281                         return (0);
1282                 }
1283         }
1284 
1285         /*
1286          * Only need to flush pages if asking for the mtime
1287          * and if there any dirty pages or any outstanding
1288          * asynchronous (write) requests for this file.
1289          */
1290         if (vap->va_mask & AT_MTIME) {
1291                 if (vn_has_cached_data(vp) &&
1292                     ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1293                         mutex_enter(&rp->r_statelock);
1294                         rp->r_gcount++;
1295                         mutex_exit(&rp->r_statelock);
1296                         error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1297                         mutex_enter(&rp->r_statelock);
1298                         if (error && (error == ENOSPC || error == EDQUOT)) {
1299                                 if (!rp->r_error)
1300                                         rp->r_error = error;
1301                         }
1302                         if (--rp->r_gcount == 0)
1303                                 cv_broadcast(&rp->r_cv);
1304                         mutex_exit(&rp->r_statelock);
1305                 }
1306         }
1307 
1308         return (nfs3getattr(vp, vap, cr));
1309 }
1310 
1311 /*ARGSUSED4*/
1312 static int
1313 nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1314                 caller_context_t *ct)
1315 {
1316         int error;
1317         struct vattr va;
1318 
1319         if (vap->va_mask & AT_NOSET)
1320                 return (EINVAL);
1321         if (nfs_zone() != VTOMI(vp)->mi_zone)
1322                 return (EIO);
1323 
1324         va.va_mask = AT_UID | AT_MODE;
1325         error = nfs3getattr(vp, &va, cr);
1326         if (error)
1327                 return (error);
1328 
1329         error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx,
1330             vp);
1331         if (error)
1332                 return (error);
1333 
1334         error = nfs3setattr(vp, vap, flags, cr);
1335 
1336         if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0)
1337                 vnevent_truncate(vp, ct);
1338 
1339         return (error);
1340 }
1341 
1342 static int
1343 nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1344 {
1345         int error;
1346         uint_t mask;
1347         SETATTR3args args;
1348         SETATTR3res res;
1349         int douprintf;
1350         rnode_t *rp;
1351         struct vattr va;
1352         mode_t omode;
1353         vsecattr_t *vsp;
1354         hrtime_t t;
1355 
1356         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1357         mask = vap->va_mask;
1358 
1359         rp = VTOR(vp);
1360 
1361         /*
1362          * Only need to flush pages if there are any pages and
1363          * if the file is marked as dirty in some fashion.  The
1364          * file must be flushed so that we can accurately
1365          * determine the size of the file and the cached data
1366          * after the SETATTR returns.  A file is considered to
1367          * be dirty if it is either marked with RDIRTY, has
1368          * outstanding i/o's active, or is mmap'd.  In this
1369          * last case, we can't tell whether there are dirty
1370          * pages, so we flush just to be sure.
1371          */
1372         if (vn_has_cached_data(vp) &&
1373             ((rp->r_flags & RDIRTY) ||
1374             rp->r_count > 0 ||
1375             rp->r_mapcnt > 0)) {
1376                 ASSERT(vp->v_type != VCHR);
1377                 error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
1378                 if (error && (error == ENOSPC || error == EDQUOT)) {
1379                         mutex_enter(&rp->r_statelock);
1380                         if (!rp->r_error)
1381                                 rp->r_error = error;
1382                         mutex_exit(&rp->r_statelock);
1383                 }
1384         }
1385 
1386         args.object = *RTOFH3(rp);
1387         /*
         * If the intent is for the server to set the times,
         * there is no point in having the mask indicate set mtime or
         * atime, because the vap values may be junk, and so result
         * in an overflow error. Remove these flags from the vap mask
         * before calling in this case, and restore them afterwards.
1393          */
1394         if ((mask & (AT_ATIME | AT_MTIME)) && !(flags & ATTR_UTIME)) {
1395                 /* Use server times, so don't set the args time fields */
1396                 vap->va_mask &= ~(AT_ATIME | AT_MTIME);
1397                 error = vattr_to_sattr3(vap, &args.new_attributes);
1398                 vap->va_mask |= (mask & (AT_ATIME | AT_MTIME));
1399                 if (mask & AT_ATIME) {
1400                         args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
1401                 }
1402                 if (mask & AT_MTIME) {
1403                         args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
1404                 }
1405         } else {
1406                 /* Either do not set times or use the client specified times */
1407                 error = vattr_to_sattr3(vap, &args.new_attributes);
1408         }
1409 
1410         if (error) {
1411                 /* req time field(s) overflow - return immediately */
1412                 return (error);
1413         }
1414 
1415         va.va_mask = AT_MODE | AT_CTIME;
1416         error = nfs3getattr(vp, &va, cr);
1417         if (error)
1418                 return (error);
1419         omode = va.va_mode;
1420 
1421 tryagain:
1422         if (mask & AT_SIZE) {
1423                 args.guard.check = TRUE;
1424                 args.guard.obj_ctime.seconds = va.va_ctime.tv_sec;
1425                 args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec;
1426         } else
1427                 args.guard.check = FALSE;
1428 
1429         douprintf = 1;
1430 
1431         t = gethrtime();
1432 
1433         error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
1434             xdr_SETATTR3args, (caddr_t)&args,
1435             xdr_SETATTR3res, (caddr_t)&res, cr,
1436             &douprintf, &res.status, 0, NULL);
1437 
1438         /*
1439          * Purge the access cache and ACL cache if changing either the
1440          * owner of the file, the group owner, or the mode.  These may
1441          * change the access permissions of the file, so purge old
1442          * information and start over again.
1443          */
1444         if (mask & (AT_UID | AT_GID | AT_MODE)) {
1445                 (void) nfs_access_purge_rp(rp);
1446                 if (rp->r_secattr != NULL) {
1447                         mutex_enter(&rp->r_statelock);
1448                         vsp = rp->r_secattr;
1449                         rp->r_secattr = NULL;
1450                         mutex_exit(&rp->r_statelock);
1451                         if (vsp != NULL)
1452                                 nfs_acl_free(vsp);
1453                 }
1454         }
1455 
1456         if (error) {
1457                 PURGE_ATTRCACHE(vp);
1458                 return (error);
1459         }
1460 
1461         error = geterrno3(res.status);
1462         if (!error) {
1463                 /*
1464                  * If changing the size of the file, invalidate
1465                  * any local cached data which is no longer part
1466                  * of the file.  We also possibly invalidate the
1467                  * last page in the file.  We could use
1468                  * pvn_vpzero(), but this would mark the page as
1469                  * modified and require it to be written back to
1470                  * the server for no particularly good reason.
1471                  * This way, if we access it, then we bring it
1472                  * back in.  A read should be cheaper than a
1473                  * write.
1474                  */
1475                 if (mask & AT_SIZE) {
1476                         nfs_invalidate_pages(vp,
1477                             (vap->va_size & PAGEMASK), cr);
1478                 }
1479                 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
1480                 /*
1481                  * Some servers will change the mode to clear the setuid
1482                  * and setgid bits when changing the uid or gid.  The
1483                  * client needs to compensate appropriately.
1484                  */
1485                 if (mask & (AT_UID | AT_GID)) {
1486                         int terror;
1487 
1488                         va.va_mask = AT_MODE;
1489                         terror = nfs3getattr(vp, &va, cr);
1490                         if (!terror &&
1491                             (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
1492                             (!(mask & AT_MODE) && va.va_mode != omode))) {
1493                                 va.va_mask = AT_MODE;
1494                                 if (mask & AT_MODE)
1495                                         va.va_mode = vap->va_mode;
1496                                 else
1497                                         va.va_mode = omode;
1498                                 (void) nfs3setattr(vp, &va, 0, cr);
1499                         }
1500                 }
1501         } else {
1502                 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
1503                 /*
1504                  * If we got back a "not synchronized" error, then
1505                  * we need to retry with a new guard value.  The
1506                  * guard value used is the change time.  If the
1507                  * server returned post_op_attr, then we can just
1508                  * retry because we have the latest attributes.
1509                  * Otherwise, we issue a GETATTR to get the latest
1510                  * attributes and then retry.  If we couldn't get
1511                  * the attributes this way either, then we give
1512                  * up because we can't complete the operation as
1513                  * required.
1514                  */
1515                 if (res.status == NFS3ERR_NOT_SYNC) {
1516                         va.va_mask = AT_CTIME;
1517                         if (nfs3getattr(vp, &va, cr) == 0)
1518                                 goto tryagain;
1519                 }
1520                 PURGE_STALE_FH(error, vp, cr);
1521         }
1522 
1523         return (error);
1524 }
1525 
1526 static int
1527 nfs3_accessx(void *vp, int mode, cred_t *cr)
1528 {
1529         ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1530         return (nfs3_access(vp, mode, 0, cr, NULL));
1531 }
1532 
1533 /* ARGSUSED */
1534 static int
1535 nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1536 {
1537         int error;
1538         ACCESS3args args;
1539         ACCESS3res res;
1540         int douprintf;
1541         uint32 acc;
1542         rnode_t *rp;
1543         cred_t *cred, *ncr, *ncrfree = NULL;
1544         failinfo_t fi;
1545         nfs_access_type_t cacc;
1546         hrtime_t t;
1547 
1548         acc = 0;
1549         if (nfs_zone() != VTOMI(vp)->mi_zone)
1550                 return (EIO);
1551         if (mode & VREAD)
1552                 acc |= ACCESS3_READ;
1553         if (mode & VWRITE) {
1554                 if (vn_is_readonly(vp) && !IS_DEVVP(vp))
1555                         return (EROFS);
1556                 if (vp->v_type == VDIR)
1557                         acc |= ACCESS3_DELETE;
1558                 acc |= ACCESS3_MODIFY | ACCESS3_EXTEND;
1559         }
1560         if (mode & VEXEC) {
1561                 if (vp->v_type == VDIR)
1562                         acc |= ACCESS3_LOOKUP;
1563                 else
1564                         acc |= ACCESS3_EXECUTE;
1565         }
1566 
1567         rp = VTOR(vp);
1568         args.object = *VTOFH3(vp);
1569         if (vp->v_type == VDIR) {
1570                 args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY |
1571                     ACCESS3_EXTEND | ACCESS3_LOOKUP;
1572         } else {
1573                 args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND |
1574                     ACCESS3_EXECUTE;
1575         }
1576         fi.vp = vp;
1577         fi.fhp = (caddr_t)&args.object;
1578         fi.copyproc = nfs3copyfh;
1579         fi.lookupproc = nfs3lookup;
1580         fi.xattrdirproc = acl_getxattrdir3;
1581 
1582         cred = cr;
1583         /*
1584          * ncr and ncrfree both initially
1585          * point to the memory area returned
1586          * by crnetadjust();
1587          * ncrfree not NULL when exiting means
1588          * that we need to release it
1589          */
1590         ncr = crnetadjust(cred);
1591         ncrfree = ncr;
1592 tryagain:
1593         if (rp->r_acache != NULL) {
1594                 cacc = nfs_access_check(rp, acc, cred);
1595                 if (cacc == NFS_ACCESS_ALLOWED) {
1596                         if (ncrfree != NULL)
1597                                 crfree(ncrfree);
1598                         return (0);
1599                 }
1600                 if (cacc == NFS_ACCESS_DENIED) {
1601                         /*
1602                          * If the cred can be adjusted, try again
1603                          * with the new cred.
1604                          */
1605                         if (ncr != NULL) {
1606                                 cred = ncr;
1607                                 ncr = NULL;
1608                                 goto tryagain;
1609                         }
1610                         if (ncrfree != NULL)
1611                                 crfree(ncrfree);
1612                         return (EACCES);
1613                 }
1614         }
1615 
1616         douprintf = 1;
1617 
1618         t = gethrtime();
1619 
1620         error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS,
1621             xdr_ACCESS3args, (caddr_t)&args,
1622             xdr_ACCESS3res, (caddr_t)&res, cred,
1623             &douprintf, &res.status, 0, &fi);
1624 
1625         if (error) {
1626                 if (ncrfree != NULL)
1627                         crfree(ncrfree);
1628                 return (error);
1629         }
1630 
1631         error = geterrno3(res.status);
1632         if (!error) {
1633                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
1634                 nfs_access_cache(rp, args.access, res.resok.access, cred);
1635                 /*
1636                  * we just cached results with cred; if cred is the
1637                  * adjusted credentials from crnetadjust, we do not want
1638                  * to release them before exiting: hence setting ncrfree
1639                  * to NULL
1640                  */
1641                 if (cred != cr)
1642                         ncrfree = NULL;
1643                 if ((acc & res.resok.access) != acc) {
1644                         /*
1645                          * If the cred can be adjusted, try again
1646                          * with the new cred.
1647                          */
1648                         if (ncr != NULL) {
1649                                 cred = ncr;
1650                                 ncr = NULL;
1651                                 goto tryagain;
1652                         }
1653                         error = EACCES;
1654                 }
1655         } else {
1656                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
1657                 PURGE_STALE_FH(error, vp, cr);
1658         }
1659 
1660         if (ncrfree != NULL)
1661                 crfree(ncrfree);
1662 
1663         return (error);
1664 }
1665 
1666 static int nfs3_do_symlink_cache = 1;
1667 
1668 /* ARGSUSED */
1669 static int
1670 nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1671 {
1672         int error;
1673         READLINK3args args;
1674         READLINK3res res;
1675         nfspath3 resdata_backup;
1676         rnode_t *rp;
1677         int douprintf;
1678         int len;
1679         failinfo_t fi;
1680         hrtime_t t;
1681 
1682         /*
1683          * Can't readlink anything other than a symbolic link.
1684          */
1685         if (vp->v_type != VLNK)
1686                 return (EINVAL);
1687         if (nfs_zone() != VTOMI(vp)->mi_zone)
1688                 return (EIO);
1689 
1690         rp = VTOR(vp);
1691         if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) {
1692                 error = nfs3_validate_caches(vp, cr);
1693                 if (error)
1694                         return (error);
1695                 mutex_enter(&rp->r_statelock);
1696                 if (rp->r_symlink.contents != NULL) {
1697                         error = uiomove(rp->r_symlink.contents,
1698                             rp->r_symlink.len, UIO_READ, uiop);
1699                         mutex_exit(&rp->r_statelock);
1700                         return (error);
1701                 }
1702                 mutex_exit(&rp->r_statelock);
1703         }
1704 
1705         args.symlink = *VTOFH3(vp);
1706         fi.vp = vp;
1707         fi.fhp = (caddr_t)&args.symlink;
1708         fi.copyproc = nfs3copyfh;
1709         fi.lookupproc = nfs3lookup;
1710         fi.xattrdirproc = acl_getxattrdir3;
1711 
1712         res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1713 
1714         resdata_backup = res.resok.data;
1715 
1716         douprintf = 1;
1717 
1718         t = gethrtime();
1719 
1720         error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
1721             xdr_READLINK3args, (caddr_t)&args,
1722             xdr_READLINK3res, (caddr_t)&res, cr,
1723             &douprintf, &res.status, 0, &fi);
1724 
1725         if (res.resok.data == nfs3nametoolong)
1726                 error = EINVAL;
1727 
1728         if (error) {
1729                 kmem_free(resdata_backup, MAXPATHLEN);
1730                 return (error);
1731         }
1732 
1733         error = geterrno3(res.status);
1734         if (!error) {
1735                 nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t,
1736                     cr);
1737                 len = strlen(res.resok.data);
1738                 error = uiomove(res.resok.data, len, UIO_READ, uiop);
1739                 if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) {
1740                         mutex_enter(&rp->r_statelock);
1741                                 if (rp->r_symlink.contents == NULL) {
1742                                 rp->r_symlink.contents = res.resok.data;
1743                                 rp->r_symlink.len = len;
1744                                 rp->r_symlink.size = MAXPATHLEN;
1745                                 mutex_exit(&rp->r_statelock);
1746                         } else {
1747                                 mutex_exit(&rp->r_statelock);
1748 
1749                                 kmem_free((void *)res.resok.data, MAXPATHLEN);
1750                         }
1751                 } else {
1752                         kmem_free((void *)res.resok.data, MAXPATHLEN);
1753                 }
1754         } else {
1755                 nfs3_cache_post_op_attr(vp,
1756                     &res.resfail.symlink_attributes, t, cr);
1757                 PURGE_STALE_FH(error, vp, cr);
1758 
1759                 kmem_free((void *)res.resok.data, MAXPATHLEN);
1760 
1761         }
1762 
1763         /*
1764          * The over the wire error for attempting to readlink something
1765          * other than a symbolic link is ENXIO.  However, we need to
1766          * return EINVAL instead of ENXIO, so we map it here.
1767          */
1768         return (error == ENXIO ? EINVAL : error);
1769 }
1770 
1771 /*
1772  * Flush local dirty pages to stable storage on the server.
1773  *
1774  * If FNODSYNC is specified, then there is nothing to do because
1775  * metadata changes are not cached on the client before being
1776  * sent to the server.
1777  */
1778 /* ARGSUSED */
1779 static int
1780 nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1781 {
1782         int error;
1783 
1784         if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1785                 return (0);
1786         if (nfs_zone() != VTOMI(vp)->mi_zone)
1787                 return (EIO);
1788 
1789         error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
1790         if (!error)
1791                 error = VTOR(vp)->r_error;
1792         return (error);
1793 }
1794 
1795 /*
1796  * Weirdness: if the file was removed or the target of a rename
1797  * operation while it was open, it got renamed instead.  Here we
1798  * remove the renamed file.
1799  */
1800 /* ARGSUSED */
1801 static void
1802 nfs3_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1803 {
1804         rnode_t *rp;
1805 
1806         ASSERT(vp != DNLC_NO_VNODE);
1807 
1808         /*
1809          * If this is coming from the wrong zone, we let someone in the right
1810          * zone take care of it asynchronously.  We can get here due to
1811          * VN_RELE() being called from pageout() or fsflush().  This call may
1812          * potentially turn into an expensive no-op if, for instance, v_count
1813          * gets incremented in the meantime, but it's still correct.
1814          */
1815         if (nfs_zone() != VTOMI(vp)->mi_zone) {
1816                 nfs_async_inactive(vp, cr, nfs3_inactive);
1817                 return;
1818         }
1819 
1820         rp = VTOR(vp);
1821 redo:
1822         if (rp->r_unldvp != NULL) {
1823                 /*
1824                  * Save the vnode pointer for the directory where the
1825                  * unlinked-open file got renamed, then set it to NULL
1826                  * to prevent another thread from getting here before
1827                  * we're done with the remove.  While we have the
1828                  * statelock, make local copies of the pertinent rnode
1829                  * fields.  If we weren't to do this in an atomic way, the
1830                  * the unl* fields could become inconsistent with respect
1831                  * to each other due to a race condition between this
1832                  * code and nfs_remove().  See bug report 1034328.
1833                  */
1834                 mutex_enter(&rp->r_statelock);
1835                 if (rp->r_unldvp != NULL) {
1836                         vnode_t *unldvp;
1837                         char *unlname;
1838                         cred_t *unlcred;
1839                         REMOVE3args args;
1840                         REMOVE3res res;
1841                         int douprintf;
1842                         int error;
1843                         hrtime_t t;
1844 
1845                         unldvp = rp->r_unldvp;
1846                         rp->r_unldvp = NULL;
1847                         unlname = rp->r_unlname;
1848                         rp->r_unlname = NULL;
1849                         unlcred = rp->r_unlcred;
1850                         rp->r_unlcred = NULL;
1851                         mutex_exit(&rp->r_statelock);
1852 
1853                         /*
1854                          * If there are any dirty pages left, then flush
1855                          * them.  This is unfortunate because they just
1856                          * may get thrown away during the remove operation,
1857                          * but we have to do this for correctness.
1858                          */
1859                         if (vn_has_cached_data(vp) &&
1860                             ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1861                                 ASSERT(vp->v_type != VCHR);
1862                                 error = nfs3_putpage(vp, (offset_t)0, 0, 0,
1863                                     cr, ct);
1864                                 if (error) {
1865                                         mutex_enter(&rp->r_statelock);
1866                                         if (!rp->r_error)
1867                                                 rp->r_error = error;
1868                                         mutex_exit(&rp->r_statelock);
1869                                 }
1870                         }
1871 
1872                         /*
1873                          * Do the remove operation on the renamed file
1874                          */
1875                         setdiropargs3(&args.object, unlname, unldvp);
1876 
1877                         douprintf = 1;
1878 
1879                         t = gethrtime();
1880 
1881                         error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE,
1882                             xdr_diropargs3, (caddr_t)&args,
1883                             xdr_REMOVE3res, (caddr_t)&res, unlcred,
1884                             &douprintf, &res.status, 0, NULL);
1885 
1886                         if (error) {
1887                                 PURGE_ATTRCACHE(unldvp);
1888                         } else {
1889                                 error = geterrno3(res.status);
1890                                 if (!error) {
1891                                         nfs3_cache_wcc_data(unldvp,
1892                                             &res.resok.dir_wcc, t, cr);
1893                                         if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1894                                                 nfs_purge_rddir_cache(unldvp);
1895                                 } else {
1896                                         nfs3_cache_wcc_data(unldvp,
1897                                             &res.resfail.dir_wcc, t, cr);
1898                                         PURGE_STALE_FH(error, unldvp, cr);
1899                                 }
1900                         }
1901 
1902                         /*
1903                          * Release stuff held for the remove
1904                          */
1905                         VN_RELE(unldvp);
1906                         kmem_free(unlname, MAXNAMELEN);
1907                         crfree(unlcred);
1908                         goto redo;
1909                 }
1910                 mutex_exit(&rp->r_statelock);
1911         }
1912 
1913         rp_addfree(rp, cr);
1914 }
1915 
1916 /*
1917  * Remote file system operations having to do with directory manipulation.
1918  */
1919 
1920 /* ARGSUSED */
1921 static int
1922 nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1923         int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1924         int *direntflags, pathname_t *realpnp)
1925 {
1926         int error;
1927         vnode_t *vp;
1928         vnode_t *avp = NULL;
1929         rnode_t *drp;
1930 
1931         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1932                 return (EPERM);
1933 
1934         drp = VTOR(dvp);
1935 
1936         /*
1937          * Are we looking up extended attributes?  If so, "dvp" is
1938          * the file or directory for which we want attributes, and
1939          * we need a lookup of the hidden attribute directory
1940          * before we lookup the rest of the path.
1941          */
1942         if (flags & LOOKUP_XATTR) {
1943                 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1944                 mntinfo_t *mi;
1945 
1946                 mi = VTOMI(dvp);
1947                 if (!(mi->mi_flags & MI_EXTATTR))
1948                         return (EINVAL);
1949 
1950                 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1951                         return (EINTR);
1952 
1953                 (void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1954                 if (avp == NULL)
1955                         error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0);
1956                 else
1957                         error = 0;
1958 
1959                 nfs_rw_exit(&drp->r_rwlock);
1960 
1961                 if (error) {
1962                         if (mi->mi_flags & MI_EXTATTR)
1963                                 return (error);
1964                         return (EINVAL);
1965                 }
1966                 dvp = avp;
1967                 drp = VTOR(dvp);
1968         }
1969 
1970         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1971                 error = EINTR;
1972                 goto out;
1973         }
1974 
1975         error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1976 
1977         nfs_rw_exit(&drp->r_rwlock);
1978 
1979         /*
1980          * If vnode is a device, create special vnode.
1981          */
1982         if (!error && IS_DEVVP(*vpp)) {
1983                 vp = *vpp;
1984                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1985                 VN_RELE(vp);
1986         }
1987 
1988 out:
1989         if (avp != NULL)
1990                 VN_RELE(avp);
1991 
1992         return (error);
1993 }
1994 
/*
 * When non-zero, an ENOENT answer from an over the wire lookup is
 * recorded in the DNLC as a negative entry (DNLC_NO_VNODE).
 */
static int nfs3_lookup_neg_cache = 1;

#ifdef DEBUG
/*
 * DNLC lookup statistics, maintained by nfs3lookup_dnlc():
 *
 *   lookups     calls made
 *   hits        name found, vnode returned
 *   neg_hits    negative entry found, ENOENT returned
 *   misses      name not in the DNLC
 *   disappears  entry was gone again after the caches were validated
 */
static int nfs3_lookup_dnlc_lookups = 0;
static int nfs3_lookup_dnlc_hits = 0;
static int nfs3_lookup_dnlc_neg_hits = 0;
static int nfs3_lookup_dnlc_misses = 0;
static int nfs3_lookup_dnlc_disappears = 0;
#endif
2004 
2005 /* ARGSUSED */
2006 int
2007 nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
2008         int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
2009 {
2010         int error;
2011         rnode_t *drp;
2012 
2013         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2014         /*
2015          * If lookup is for "", just return dvp.  Don't need
2016          * to send it over the wire, look it up in the dnlc,
2017          * or perform any access checks.
2018          */
2019         if (*nm == '\0') {
2020                 VN_HOLD(dvp);
2021                 *vpp = dvp;
2022                 return (0);
2023         }
2024 
2025         /*
2026          * Can't do lookups in non-directories.
2027          */
2028         if (dvp->v_type != VDIR)
2029                 return (ENOTDIR);
2030 
2031         /*
2032          * If we're called with RFSCALL_SOFT, it's important that
2033          * the only rfscall is one we make directly; if we permit
2034          * an access call because we're looking up "." or validating
2035          * a dnlc hit, we'll deadlock because that rfscall will not
2036          * have the RFSCALL_SOFT set.
2037          */
2038         if (rfscall_flags & RFSCALL_SOFT)
2039                 goto callit;
2040 
2041         /*
2042          * If lookup is for ".", just return dvp.  Don't need
2043          * to send it over the wire or look it up in the dnlc,
2044          * just need to check access.
2045          */
2046         if (strcmp(nm, ".") == 0) {
2047                 error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2048                 if (error)
2049                         return (error);
2050                 VN_HOLD(dvp);
2051                 *vpp = dvp;
2052                 return (0);
2053         }
2054 
2055         drp = VTOR(dvp);
2056         if (!(drp->r_flags & RLOOKUP)) {
2057                 mutex_enter(&drp->r_statelock);
2058                 drp->r_flags |= RLOOKUP;
2059                 mutex_exit(&drp->r_statelock);
2060         }
2061 
2062         /*
2063          * Lookup this name in the DNLC.  If there was a valid entry,
2064          * then return the results of the lookup.
2065          */
2066         error = nfs3lookup_dnlc(dvp, nm, vpp, cr);
2067         if (error || *vpp != NULL)
2068                 return (error);
2069 
2070 callit:
2071         error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags);
2072 
2073         return (error);
2074 }
2075 
2076 static int
2077 nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
2078 {
2079         int error;
2080         vnode_t *vp;
2081 
2082         ASSERT(*nm != '\0');
2083         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2084         /*
2085          * Lookup this name in the DNLC.  If successful, then validate
2086          * the caches and then recheck the DNLC.  The DNLC is rechecked
2087          * just in case this entry got invalidated during the call
2088          * to nfs3_validate_caches.
2089          *
2090          * An assumption is being made that it is safe to say that a
2091          * file exists which may not on the server.  Any operations to
2092          * the server will fail with ESTALE.
2093          */
2094 #ifdef DEBUG
2095         nfs3_lookup_dnlc_lookups++;
2096 #endif
2097         vp = dnlc_lookup(dvp, nm);
2098         if (vp != NULL) {
2099                 VN_RELE(vp);
2100                 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
2101                         PURGE_ATTRCACHE(dvp);
2102                 }
2103                 error = nfs3_validate_caches(dvp, cr);
2104                 if (error)
2105                         return (error);
2106                 vp = dnlc_lookup(dvp, nm);
2107                 if (vp != NULL) {
2108                         error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2109                         if (error) {
2110                                 VN_RELE(vp);
2111                                 return (error);
2112                         }
2113                         if (vp == DNLC_NO_VNODE) {
2114                                 VN_RELE(vp);
2115 #ifdef DEBUG
2116                                 nfs3_lookup_dnlc_neg_hits++;
2117 #endif
2118                                 return (ENOENT);
2119                         }
2120                         *vpp = vp;
2121 #ifdef DEBUG
2122                         nfs3_lookup_dnlc_hits++;
2123 #endif
2124                         return (0);
2125                 }
2126 #ifdef DEBUG
2127                 nfs3_lookup_dnlc_disappears++;
2128 #endif
2129         }
2130 #ifdef DEBUG
2131         else
2132                 nfs3_lookup_dnlc_misses++;
2133 #endif
2134 
2135         *vpp = NULL;
2136 
2137         return (0);
2138 }
2139 
2140 static int
2141 nfs3lookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
2142         int rfscall_flags)
2143 {
2144         int error;
2145         LOOKUP3args args;
2146         LOOKUP3vres res;
2147         int douprintf;
2148         struct vattr vattr;
2149         struct vattr dvattr;
2150         vnode_t *vp;
2151         failinfo_t fi;
2152         hrtime_t t;
2153 
2154         ASSERT(*nm != '\0');
2155         ASSERT(dvp->v_type == VDIR);
2156         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2157 
2158         setdiropargs3(&args.what, nm, dvp);
2159 
2160         fi.vp = dvp;
2161         fi.fhp = (caddr_t)&args.what.dir;
2162         fi.copyproc = nfs3copyfh;
2163         fi.lookupproc = nfs3lookup;
2164         fi.xattrdirproc = acl_getxattrdir3;
2165         res.obj_attributes.fres.vp = dvp;
2166         res.obj_attributes.fres.vap = &vattr;
2167         res.dir_attributes.fres.vp = dvp;
2168         res.dir_attributes.fres.vap = &dvattr;
2169 
2170         douprintf = 1;
2171 
2172         t = gethrtime();
2173 
2174         error = rfs3call(VTOMI(dvp), NFSPROC3_LOOKUP,
2175             xdr_diropargs3, (caddr_t)&args,
2176             xdr_LOOKUP3vres, (caddr_t)&res, cr,
2177             &douprintf, &res.status, rfscall_flags, &fi);
2178 
2179         if (error)
2180                 return (error);
2181 
2182         nfs3_cache_post_op_vattr(dvp, &res.dir_attributes, t, cr);
2183 
2184         error = geterrno3(res.status);
2185         if (error) {
2186                 PURGE_STALE_FH(error, dvp, cr);
2187                 if (error == ENOENT && nfs3_lookup_neg_cache)
2188                         dnlc_enter(dvp, nm, DNLC_NO_VNODE);
2189                 return (error);
2190         }
2191 
2192         if (res.obj_attributes.attributes) {
2193                 vp = makenfs3node_va(&res.object, res.obj_attributes.fres.vap,
2194                     dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2195         } else {
2196                 vp = makenfs3node_va(&res.object, NULL,
2197                     dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2198                 if (vp->v_type == VNON) {
2199                         vattr.va_mask = AT_TYPE;
2200                         error = nfs3getattr(vp, &vattr, cr);
2201                         if (error) {
2202                                 VN_RELE(vp);
2203                                 return (error);
2204                         }
2205                         vp->v_type = vattr.va_type;
2206                 }
2207         }
2208 
2209         if (!(rfscall_flags & RFSCALL_SOFT))
2210                 dnlc_update(dvp, nm, vp);
2211 
2212         *vpp = vp;
2213 
2214         return (error);
2215 }
2216 
#ifdef DEBUG
/*
 * DEBUG-only statistic.
 * NOTE(review): presumably counted by nfs3_create(); the code that
 * updates it is outside this excerpt.
 */
static int nfs3_create_misses = 0;
#endif
2220 
2221 /* ARGSUSED */
2222 static int
2223 nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2224         int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
2225         vsecattr_t *vsecp)
2226 {
2227         int error;
2228         vnode_t *vp;
2229         rnode_t *rp;
2230         struct vattr vattr;
2231         rnode_t *drp;
2232         vnode_t *tempvp;
2233 
2234         drp = VTOR(dvp);
2235         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2236                 return (EPERM);
2237         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2238                 return (EINTR);
2239 
2240 top:
2241         /*
2242          * We make a copy of the attributes because the caller does not
2243          * expect us to change what va points to.
2244          */
2245         vattr = *va;
2246 
2247         /*
2248          * If the pathname is "", just use dvp.  Don't need
2249          * to send it over the wire, look it up in the dnlc,
2250          * or perform any access checks.
2251          */
2252         if (*nm == '\0') {
2253                 error = 0;
2254                 VN_HOLD(dvp);
2255                 vp = dvp;
2256         /*
2257          * If the pathname is ".", just use dvp.  Don't need
2258          * to send it over the wire or look it up in the dnlc,
2259          * just need to check access.
2260          */
2261         } else if (strcmp(nm, ".") == 0) {
2262                 error = nfs3_access(dvp, VEXEC, 0, cr, ct);
2263                 if (error) {
2264                         nfs_rw_exit(&drp->r_rwlock);
2265                         return (error);
2266                 }
2267                 VN_HOLD(dvp);
2268                 vp = dvp;
2269         /*
2270          * We need to go over the wire, just to be sure whether the
2271          * file exists or not.  Using the DNLC can be dangerous in
2272          * this case when making a decision regarding existence.
2273          */
2274         } else {
2275                 error = nfs3lookup_otw(dvp, nm, &vp, cr, 0);
2276         }
2277         if (!error) {
2278                 if (exclusive == EXCL)
2279                         error = EEXIST;
2280                 else if (vp->v_type == VDIR && (mode & VWRITE))
2281                         error = EISDIR;
2282                 else {
2283                         /*
2284                          * If vnode is a device, create special vnode.
2285                          */
2286                         if (IS_DEVVP(vp)) {
2287                                 tempvp = vp;
2288                                 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2289                                 VN_RELE(tempvp);
2290                         }
2291                         if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2292                                 if ((vattr.va_mask & AT_SIZE) &&
2293                                     vp->v_type == VREG) {
2294                                         rp = VTOR(vp);
2295                                         /*
2296                                          * Check here for large file handled
2297                                          * by LF-unaware process (as
2298                                          * ufs_create() does)
2299                                          */
2300                                         if (!(lfaware & FOFFMAX)) {
2301                                                 mutex_enter(&rp->r_statelock);
2302                                                 if (rp->r_size > MAXOFF32_T)
2303                                                         error = EOVERFLOW;
2304                                                 mutex_exit(&rp->r_statelock);
2305                                         }
2306                                         if (!error) {
2307                                                 vattr.va_mask = AT_SIZE;
2308                                                 error = nfs3setattr(vp,
2309                                                     &vattr, 0, cr);
2310 
2311                                                 /*
2312                                                  * Existing file was truncated;
2313                                                  * emit a create event.
2314                                                  */
2315                                                 vnevent_create(vp, ct);
2316                                         }
2317                                 }
2318                         }
2319                 }
2320                 nfs_rw_exit(&drp->r_rwlock);
2321                 if (error) {
2322                         VN_RELE(vp);
2323                 } else {
2324                         *vpp = vp;
2325                 }
2326 
2327                 return (error);
2328         }
2329 
2330         dnlc_remove(dvp, nm);
2331 
2332         /*
2333          * Decide what the group-id of the created file should be.
2334          * Set it in attribute list as advisory...
2335          */
2336         error = setdirgid(dvp, &vattr.va_gid, cr);
2337         if (error) {
2338                 nfs_rw_exit(&drp->r_rwlock);
2339                 return (error);
2340         }
2341         vattr.va_mask |= AT_GID;
2342 
2343         ASSERT(vattr.va_mask & AT_TYPE);
2344         if (vattr.va_type == VREG) {
2345                 ASSERT(vattr.va_mask & AT_MODE);
2346                 if (MANDMODE(vattr.va_mode)) {
2347                         nfs_rw_exit(&drp->r_rwlock);
2348                         return (EACCES);
2349                 }
2350                 error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr,
2351                     lfaware);
2352                 /*
2353                  * If this is not an exclusive create, then the CREATE
2354                  * request will be made with the GUARDED mode set.  This
2355                  * means that the server will return EEXIST if the file
2356                  * exists.  The file could exist because of a retransmitted
2357                  * request.  In this case, we recover by starting over and
2358                  * checking to see whether the file exists.  This second
2359                  * time through it should and a CREATE request will not be
2360                  * sent.
2361                  *
2362                  * This handles the problem of a dangling CREATE request
2363                  * which contains attributes which indicate that the file
2364                  * should be truncated.  This retransmitted request could
2365                  * possibly truncate valid data in the file if not caught
2366                  * by the duplicate request mechanism on the server or if
2367                  * not caught by other means.  The scenario is:
2368                  *
2369                  * Client transmits CREATE request with size = 0
2370                  * Client times out, retransmits request.
2371                  * Response to the first request arrives from the server
2372                  *  and the client proceeds on.
2373                  * Client writes data to the file.
2374                  * The server now processes retransmitted CREATE request
2375                  *  and truncates file.
2376                  *
2377                  * The use of the GUARDED CREATE request prevents this from
2378                  * happening because the retransmitted CREATE would fail
2379                  * with EEXIST and would not truncate the file.
2380                  */
2381                 if (error == EEXIST && exclusive == NONEXCL) {
2382 #ifdef DEBUG
2383                         nfs3_create_misses++;
2384 #endif
2385                         goto top;
2386                 }
2387                 nfs_rw_exit(&drp->r_rwlock);
2388                 return (error);
2389         }
2390         error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
2391         nfs_rw_exit(&drp->r_rwlock);
2392         return (error);
2393 }
2394 
2395 /* ARGSUSED */
2396 static int
2397 nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2398         int mode, vnode_t **vpp, cred_t *cr, int lfaware)
2399 {
2400         int error;
2401         CREATE3args args;
2402         CREATE3res res;
2403         int douprintf;
2404         vnode_t *vp;
2405         struct vattr vattr;
2406         nfstime3 *verfp;
2407         rnode_t *rp;
2408         timestruc_t now;
2409         hrtime_t t;
2410 
2411         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2412         setdiropargs3(&args.where, nm, dvp);
2413         if (exclusive == EXCL) {
2414                 args.how.mode = EXCLUSIVE;
2415                 /*
2416                  * Construct the create verifier.  This verifier needs
2417                  * to be unique between different clients.  It also needs
2418                  * to vary for each exclusive create request generated
2419                  * from the client to the server.
2420                  *
2421                  * The first attempt is made to use the hostid and a
2422                  * unique number on the client.  If the hostid has not
2423                  * been set, the high resolution time that the exclusive
2424                  * create request is being made is used.  This will work
2425                  * unless two different clients, both with the hostid
2426                  * not set, attempt an exclusive create request on the
2427                  * same file, at exactly the same clock time.  The
2428                  * chances of this happening seem small enough to be
2429                  * reasonable.
2430                  */
2431                 verfp = (nfstime3 *)&args.how.createhow3_u.verf;
2432                 verfp->seconds = zone_get_hostid(NULL);
2433                 if (verfp->seconds != 0)
2434                         verfp->nseconds = newnum();
2435                 else {
2436                         gethrestime(&now);
2437                         verfp->seconds = now.tv_sec;
2438                         verfp->nseconds = now.tv_nsec;
2439                 }
2440                 /*
2441                  * Since the server will use this value for the mtime,
2442                  * make sure that it can't overflow. Zero out the MSB.
2443                  * The actual value does not matter here, only its uniqeness.
2444                  */
2445                 verfp->seconds %= INT32_MAX;
2446         } else {
2447                 /*
2448                  * Issue the non-exclusive create in guarded mode.  This
2449                  * may result in some false EEXIST responses for
2450                  * retransmitted requests, but these will be handled at
2451                  * a higher level.  By using GUARDED, duplicate requests
2452                  * to do file truncation and possible access problems
2453                  * can be avoided.
2454                  */
2455                 args.how.mode = GUARDED;
2456                 error = vattr_to_sattr3(va,
2457                     &args.how.createhow3_u.obj_attributes);
2458                 if (error) {
2459                         /* req time field(s) overflow - return immediately */
2460                         return (error);
2461                 }
2462         }
2463 
2464         douprintf = 1;
2465 
2466         t = gethrtime();
2467 
2468         error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE,
2469             xdr_CREATE3args, (caddr_t)&args,
2470             xdr_CREATE3res, (caddr_t)&res, cr,
2471             &douprintf, &res.status, 0, NULL);
2472 
2473         if (error) {
2474                 PURGE_ATTRCACHE(dvp);
2475                 return (error);
2476         }
2477 
2478         error = geterrno3(res.status);
2479         if (!error) {
2480                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2481                 if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2482                         nfs_purge_rddir_cache(dvp);
2483 
2484                 /*
2485                  * On exclusive create the times need to be explicitly
2486                  * set to clear any potential verifier that may be stored
2487                  * in one of these fields (see comment below).  This
2488                  * is done here to cover the case where no post op attrs
2489                  * were returned or a 'invalid' time was returned in
2490                  * the attributes.
2491                  */
2492                 if (exclusive == EXCL)
2493                         va->va_mask |= (AT_MTIME | AT_ATIME);
2494 
2495                 if (!res.resok.obj.handle_follows) {
2496                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2497                         if (error)
2498                                 return (error);
2499                 } else {
2500                         if (res.resok.obj_attributes.attributes) {
2501                                 vp = makenfs3node(&res.resok.obj.handle,
2502                                     &res.resok.obj_attributes.attr,
2503                                     dvp->v_vfsp, t, cr, NULL, NULL);
2504                         } else {
2505                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
2506                                     dvp->v_vfsp, t, cr, NULL, NULL);
2507 
2508                                 /*
2509                                  * On an exclusive create, it is possible
2510                                  * that attributes were returned but those
2511                                  * postop attributes failed to decode
2512                                  * properly.  If this is the case,
2513                                  * then most likely the atime or mtime
2514                                  * were invalid for our client; this
2515                                  * is caused by the server storing the
2516                                  * create verifier in one of the time
2517                                  * fields(most likely mtime).
2518                                  * So... we are going to setattr just the
2519                                  * atime/mtime to clear things up.
2520                                  */
2521                                 if (exclusive == EXCL) {
2522                                         if (error =
2523                                             nfs3excl_create_settimes(vp,
2524                                             va, cr)) {
2525                                                 /*
2526                                                  * Setting the times failed.
2527                                                  * Remove the file and return
2528                                                  * the error.
2529                                                  */
2530                                                 VN_RELE(vp);
2531                                                 (void) nfs3_remove(dvp,
2532                                                     nm, cr, NULL, 0);
2533                                                 return (error);
2534                                         }
2535                                 }
2536 
2537                                 /*
2538                                  * This handles the non-exclusive case
2539                                  * and the exclusive case where no post op
2540                                  * attrs were returned.
2541                                  */
2542                                 if (vp->v_type == VNON) {
2543                                         vattr.va_mask = AT_TYPE;
2544                                         error = nfs3getattr(vp, &vattr, cr);
2545                                         if (error) {
2546                                                 VN_RELE(vp);
2547                                                 return (error);
2548                                         }
2549                                         vp->v_type = vattr.va_type;
2550                                 }
2551                         }
2552                         dnlc_update(dvp, nm, vp);
2553                 }
2554 
2555                 rp = VTOR(vp);
2556 
2557                 /*
2558                  * Check here for large file handled by
2559                  * LF-unaware process (as ufs_create() does)
2560                  */
2561                 if ((va->va_mask & AT_SIZE) && vp->v_type == VREG &&
2562                     !(lfaware & FOFFMAX)) {
2563                         mutex_enter(&rp->r_statelock);
2564                         if (rp->r_size > MAXOFF32_T) {
2565                                 mutex_exit(&rp->r_statelock);
2566                                 VN_RELE(vp);
2567                                 return (EOVERFLOW);
2568                         }
2569                         mutex_exit(&rp->r_statelock);
2570                 }
2571 
2572                 if (exclusive == EXCL &&
2573                     (va->va_mask & ~(AT_GID | AT_SIZE))) {
2574                         /*
2575                          * If doing an exclusive create, then generate
2576                          * a SETATTR to set the initial attributes.
2577                          * Try to set the mtime and the atime to the
2578                          * server's current time.  It is somewhat
2579                          * expected that these fields will be used to
2580                          * store the exclusive create cookie.  If not,
2581                          * server implementors will need to know that
2582                          * a SETATTR will follow an exclusive create
2583                          * and the cookie should be destroyed if
2584                          * appropriate. This work may have been done
2585                          * earlier in this function if post op attrs
2586                          * were not available.
2587                          *
2588                          * The AT_GID and AT_SIZE bits are turned off
2589                          * so that the SETATTR request will not attempt
2590                          * to process these.  The gid will be set
2591                          * separately if appropriate.  The size is turned
2592                          * off because it is assumed that a new file will
2593                          * be created empty and if the file wasn't empty,
2594                          * then the exclusive create will have failed
2595                          * because the file must have existed already.
2596                          * Therefore, no truncate operation is needed.
2597                          */
2598                         va->va_mask &= ~(AT_GID | AT_SIZE);
2599                         error = nfs3setattr(vp, va, 0, cr);
2600                         if (error) {
2601                                 /*
2602                                  * Couldn't correct the attributes of
2603                                  * the newly created file and the
2604                                  * attributes are wrong.  Remove the
2605                                  * file and return an error to the
2606                                  * application.
2607                                  */
2608                                 VN_RELE(vp);
2609                                 (void) nfs3_remove(dvp, nm, cr, NULL, 0);
2610                                 return (error);
2611                         }
2612                 }
2613 
2614                 if (va->va_gid != rp->r_attr.va_gid) {
2615                         /*
2616                          * If the gid on the file isn't right, then
2617                          * generate a SETATTR to attempt to change
2618                          * it.  This may or may not work, depending
2619                          * upon the server's semantics for allowing
2620                          * file ownership changes.
2621                          */
2622                         va->va_mask = AT_GID;
2623                         (void) nfs3setattr(vp, va, 0, cr);
2624                 }
2625 
2626                 /*
2627                  * If vnode is a device create special vnode
2628                  */
2629                 if (IS_DEVVP(vp)) {
2630                         *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2631                         VN_RELE(vp);
2632                 } else
2633                         *vpp = vp;
2634         } else {
2635                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2636                 PURGE_STALE_FH(error, dvp, cr);
2637         }
2638 
2639         return (error);
2640 }
2641 
2642 /*
2643  * Special setattr function to take care of rest of atime/mtime
2644  * after successful exclusive create.  This function exists to avoid
2645  * handling attributes from the server; exclusive the atime/mtime fields
2646  * may be 'invalid' in client's view and therefore can not be trusted.
2647  */
2648 static int
2649 nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr)
2650 {
2651         int error;
2652         uint_t mask;
2653         SETATTR3args args;
2654         SETATTR3res res;
2655         int douprintf;
2656         rnode_t *rp;
2657         hrtime_t t;
2658 
2659         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
2660         /* save the caller's mask so that it can be reset later */
2661         mask = vap->va_mask;
2662 
2663         rp = VTOR(vp);
2664 
2665         args.object = *RTOFH3(rp);
2666         args.guard.check = FALSE;
2667 
2668         /* Use the mask to initialize the arguments */
2669         vap->va_mask = 0;
2670         error = vattr_to_sattr3(vap, &args.new_attributes);
2671 
2672         /* We want to set just atime/mtime on this request */
2673         args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
2674         args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
2675 
2676         douprintf = 1;
2677 
2678         t = gethrtime();
2679 
2680         error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
2681             xdr_SETATTR3args, (caddr_t)&args,
2682             xdr_SETATTR3res, (caddr_t)&res, cr,
2683             &douprintf, &res.status, 0, NULL);
2684 
2685         if (error) {
2686                 vap->va_mask = mask;
2687                 return (error);
2688         }
2689 
2690         error = geterrno3(res.status);
2691         if (!error) {
2692                 /*
2693                  * It is important to pick up the attributes.
2694                  * Since this is the exclusive create path, the
2695                  * attributes on the initial create were ignored
2696                  * and we need these to have the correct info.
2697                  */
2698                 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
2699                 /*
2700                  * No need to do the atime/mtime work again so clear
2701                  * the bits.
2702                  */
2703                 mask &= ~(AT_ATIME | AT_MTIME);
2704         } else {
2705                 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
2706         }
2707 
2708         vap->va_mask = mask;
2709 
2710         return (error);
2711 }
2712 
2713 /* ARGSUSED */
2714 static int
2715 nfs3mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2716         int mode, vnode_t **vpp, cred_t *cr)
2717 {
2718         int error;
2719         MKNOD3args args;
2720         MKNOD3res res;
2721         int douprintf;
2722         vnode_t *vp;
2723         struct vattr vattr;
2724         hrtime_t t;
2725 
2726         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2727         switch (va->va_type) {
2728         case VCHR:
2729         case VBLK:
2730                 setdiropargs3(&args.where, nm, dvp);
2731                 args.what.type = (va->va_type == VCHR) ? NF3CHR : NF3BLK;
2732                 error = vattr_to_sattr3(va,
2733                     &args.what.mknoddata3_u.device.dev_attributes);
2734                 if (error) {
2735                         /* req time field(s) overflow - return immediately */
2736                         return (error);
2737                 }
2738                 args.what.mknoddata3_u.device.spec.specdata1 =
2739                     getmajor(va->va_rdev);
2740                 args.what.mknoddata3_u.device.spec.specdata2 =
2741                     getminor(va->va_rdev);
2742                 break;
2743 
2744         case VFIFO:
2745         case VSOCK:
2746                 setdiropargs3(&args.where, nm, dvp);
2747                 args.what.type = (va->va_type == VFIFO) ? NF3FIFO : NF3SOCK;
2748                 error = vattr_to_sattr3(va,
2749                     &args.what.mknoddata3_u.pipe_attributes);
2750                 if (error) {
2751                         /* req time field(s) overflow - return immediately */
2752                         return (error);
2753                 }
2754                 break;
2755 
2756         default:
2757                 return (EINVAL);
2758         }
2759 
2760         douprintf = 1;
2761 
2762         t = gethrtime();
2763 
2764         error = rfs3call(VTOMI(dvp), NFSPROC3_MKNOD,
2765             xdr_MKNOD3args, (caddr_t)&args,
2766             xdr_MKNOD3res, (caddr_t)&res, cr,
2767             &douprintf, &res.status, 0, NULL);
2768 
2769         if (error) {
2770                 PURGE_ATTRCACHE(dvp);
2771                 return (error);
2772         }
2773 
2774         error = geterrno3(res.status);
2775         if (!error) {
2776                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2777                 if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2778                         nfs_purge_rddir_cache(dvp);
2779 
2780                 if (!res.resok.obj.handle_follows) {
2781                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2782                         if (error)
2783                                 return (error);
2784                 } else {
2785                         if (res.resok.obj_attributes.attributes) {
2786                                 vp = makenfs3node(&res.resok.obj.handle,
2787                                     &res.resok.obj_attributes.attr,
2788                                     dvp->v_vfsp, t, cr, NULL, NULL);
2789                         } else {
2790                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
2791                                     dvp->v_vfsp, t, cr, NULL, NULL);
2792                                 if (vp->v_type == VNON) {
2793                                         vattr.va_mask = AT_TYPE;
2794                                         error = nfs3getattr(vp, &vattr, cr);
2795                                         if (error) {
2796                                                 VN_RELE(vp);
2797                                                 return (error);
2798                                         }
2799                                         vp->v_type = vattr.va_type;
2800                                 }
2801 
2802                         }
2803                         dnlc_update(dvp, nm, vp);
2804                 }
2805 
2806                 if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
2807                         va->va_mask = AT_GID;
2808                         (void) nfs3setattr(vp, va, 0, cr);
2809                 }
2810 
2811                 /*
2812                  * If vnode is a device create special vnode
2813                  */
2814                 if (IS_DEVVP(vp)) {
2815                         *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2816                         VN_RELE(vp);
2817                 } else
2818                         *vpp = vp;
2819         } else {
2820                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2821                 PURGE_STALE_FH(error, dvp, cr);
2822         }
2823         return (error);
2824 }
2825 
2826 /*
2827  * Weirdness: if the vnode to be removed is open
2828  * we rename it instead of removing it and nfs_inactive
2829  * will remove the new name.
2830  */
2831 /* ARGSUSED */
2832 static int
2833 nfs3_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2834 {
2835         int error;
2836         REMOVE3args args;
2837         REMOVE3res res;
2838         vnode_t *vp;
2839         char *tmpname;
2840         int douprintf;
2841         rnode_t *rp;
2842         rnode_t *drp;
2843         hrtime_t t;
2844 
2845         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2846                 return (EPERM);
2847         drp = VTOR(dvp);
2848         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2849                 return (EINTR);
2850 
2851         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2852         if (error) {
2853                 nfs_rw_exit(&drp->r_rwlock);
2854                 return (error);
2855         }
2856 
2857         if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2858                 VN_RELE(vp);
2859                 nfs_rw_exit(&drp->r_rwlock);
2860                 return (EPERM);
2861         }
2862 
2863         /*
2864          * First just remove the entry from the name cache, as it
2865          * is most likely the only entry for this vp.
2866          */
2867         dnlc_remove(dvp, nm);
2868 
2869         /*
2870          * If the file has a v_count > 1 then there may be more than one
2871          * entry in the name cache due multiple links or an open file,
2872          * but we don't have the real reference count so flush all
2873          * possible entries.
2874          */
2875         if (vp->v_count > 1)
2876                 dnlc_purge_vp(vp);
2877 
2878         /*
2879          * Now we have the real reference count on the vnode
2880          */
2881         rp = VTOR(vp);
2882         mutex_enter(&rp->r_statelock);
2883         if (vp->v_count > 1 &&
2884             (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2885                 mutex_exit(&rp->r_statelock);
2886                 tmpname = newname();
2887                 error = nfs3rename(dvp, nm, dvp, tmpname, cr, ct);
2888                 if (error)
2889                         kmem_free(tmpname, MAXNAMELEN);
2890                 else {
2891                         mutex_enter(&rp->r_statelock);
2892                         if (rp->r_unldvp == NULL) {
2893                                 VN_HOLD(dvp);
2894                                 rp->r_unldvp = dvp;
2895                                 if (rp->r_unlcred != NULL)
2896                                         crfree(rp->r_unlcred);
2897                                 crhold(cr);
2898                                 rp->r_unlcred = cr;
2899                                 rp->r_unlname = tmpname;
2900                         } else {
2901                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2902                                 rp->r_unlname = tmpname;
2903                         }
2904                         mutex_exit(&rp->r_statelock);
2905                 }
2906         } else {
2907                 mutex_exit(&rp->r_statelock);
2908                 /*
2909                  * We need to flush any dirty pages which happen to
2910                  * be hanging around before removing the file.  This
2911                  * shouldn't happen very often and mostly on file
2912                  * systems mounted "nocto".
2913                  */
2914                 if (vn_has_cached_data(vp) &&
2915                     ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2916                         error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2917                         if (error && (error == ENOSPC || error == EDQUOT)) {
2918                                 mutex_enter(&rp->r_statelock);
2919                                 if (!rp->r_error)
2920                                         rp->r_error = error;
2921                                 mutex_exit(&rp->r_statelock);
2922                         }
2923                 }
2924 
2925                 setdiropargs3(&args.object, nm, dvp);
2926 
2927                 douprintf = 1;
2928 
2929                 t = gethrtime();
2930 
2931                 error = rfs3call(VTOMI(dvp), NFSPROC3_REMOVE,
2932                     xdr_diropargs3, (caddr_t)&args,
2933                     xdr_REMOVE3res, (caddr_t)&res, cr,
2934                     &douprintf, &res.status, 0, NULL);
2935 
2936                 /*
2937                  * The xattr dir may be gone after last attr is removed,
2938                  * so flush it from dnlc.
2939                  */
2940                 if (dvp->v_flag & V_XATTRDIR)
2941                         dnlc_purge_vp(dvp);
2942 
2943                 PURGE_ATTRCACHE(vp);
2944 
2945                 if (error) {
2946                         PURGE_ATTRCACHE(dvp);
2947                 } else {
2948                         error = geterrno3(res.status);
2949                         if (!error) {
2950                                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t,
2951                                     cr);
2952                                 if (HAVE_RDDIR_CACHE(drp))
2953                                         nfs_purge_rddir_cache(dvp);
2954                         } else {
2955                                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc,
2956                                     t, cr);
2957                                 PURGE_STALE_FH(error, dvp, cr);
2958                         }
2959                 }
2960         }
2961 
2962         if (error == 0) {
2963                 vnevent_remove(vp, dvp, nm, ct);
2964         }
2965         VN_RELE(vp);
2966 
2967         nfs_rw_exit(&drp->r_rwlock);
2968 
2969         return (error);
2970 }
2971 
2972 /* ARGSUSED */
2973 static int
2974 nfs3_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2975         caller_context_t *ct, int flags)
2976 {
2977         int error;
2978         LINK3args args;
2979         LINK3res res;
2980         vnode_t *realvp;
2981         int douprintf;
2982         mntinfo_t *mi;
2983         rnode_t *tdrp;
2984         hrtime_t t;
2985 
2986         if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2987                 return (EPERM);
2988         if (VOP_REALVP(svp, &realvp, ct) == 0)
2989                 svp = realvp;
2990 
2991         mi = VTOMI(svp);
2992 
2993         if (!(mi->mi_flags & MI_LINK))
2994                 return (EOPNOTSUPP);
2995 
2996         args.file = *VTOFH3(svp);
2997         setdiropargs3(&args.link, tnm, tdvp);
2998 
2999         tdrp = VTOR(tdvp);
3000         if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
3001                 return (EINTR);
3002 
3003         dnlc_remove(tdvp, tnm);
3004 
3005         douprintf = 1;
3006 
3007         t = gethrtime();
3008 
3009         error = rfs3call(mi, NFSPROC3_LINK,
3010             xdr_LINK3args, (caddr_t)&args,
3011             xdr_LINK3res, (caddr_t)&res, cr,
3012             &douprintf, &res.status, 0, NULL);
3013 
3014         if (error) {
3015                 PURGE_ATTRCACHE(tdvp);
3016                 PURGE_ATTRCACHE(svp);
3017                 nfs_rw_exit(&tdrp->r_rwlock);
3018                 return (error);
3019         }
3020 
3021         error = geterrno3(res.status);
3022 
3023         if (!error) {
3024                 nfs3_cache_post_op_attr(svp, &res.resok.file_attributes, t, cr);
3025                 nfs3_cache_wcc_data(tdvp, &res.resok.linkdir_wcc, t, cr);
3026                 if (HAVE_RDDIR_CACHE(tdrp))
3027                         nfs_purge_rddir_cache(tdvp);
3028                 dnlc_update(tdvp, tnm, svp);
3029         } else {
3030                 nfs3_cache_post_op_attr(svp, &res.resfail.file_attributes, t,
3031                     cr);
3032                 nfs3_cache_wcc_data(tdvp, &res.resfail.linkdir_wcc, t, cr);
3033                 if (error == EOPNOTSUPP) {
3034                         mutex_enter(&mi->mi_lock);
3035                         mi->mi_flags &= ~MI_LINK;
3036                         mutex_exit(&mi->mi_lock);
3037                 }
3038         }
3039 
3040         nfs_rw_exit(&tdrp->r_rwlock);
3041 
3042         if (!error) {
3043                 /*
3044                  * Notify the source file of this link operation.
3045                  */
3046                 vnevent_link(svp, ct);
3047         }
3048         return (error);
3049 }
3050 
3051 /* ARGSUSED */
3052 static int
3053 nfs3_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3054         caller_context_t *ct, int flags)
3055 {
3056         vnode_t *realvp;
3057 
3058         if (nfs_zone() != VTOMI(odvp)->mi_zone)
3059                 return (EPERM);
3060         if (VOP_REALVP(ndvp, &realvp, ct) == 0)
3061                 ndvp = realvp;
3062 
3063         return (nfs3rename(odvp, onm, ndvp, nnm, cr, ct));
3064 }
3065 
3066 /*
3067  * nfs3rename does the real work of renaming in NFS Version 3.
3068  */
3069 static int
3070 nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3071     caller_context_t *ct)
3072 {
3073         int error;
3074         RENAME3args args;
3075         RENAME3res res;
3076         int douprintf;
3077         vnode_t *nvp = NULL;
3078         vnode_t *ovp = NULL;
3079         char *tmpname;
3080         rnode_t *rp;
3081         rnode_t *odrp;
3082         rnode_t *ndrp;
3083         hrtime_t t;
3084 
3085         ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
3086 
3087         if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
3088             strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
3089                 return (EINVAL);
3090 
3091         odrp = VTOR(odvp);
3092         ndrp = VTOR(ndvp);
3093         if ((intptr_t)odrp < (intptr_t)ndrp) {
3094                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
3095                         return (EINTR);
3096                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
3097                         nfs_rw_exit(&odrp->r_rwlock);
3098                         return (EINTR);
3099                 }
3100         } else {
3101                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
3102                         return (EINTR);
3103                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
3104                         nfs_rw_exit(&ndrp->r_rwlock);
3105                         return (EINTR);
3106                 }
3107         }
3108 
3109         /*
3110          * Lookup the target file.  If it exists, it needs to be
3111          * checked to see whether it is a mount point and whether
3112          * it is active (open).
3113          */
3114         error = nfs3lookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
3115         if (!error) {
3116                 /*
3117                  * If this file has been mounted on, then just
3118                  * return busy because renaming to it would remove
3119                  * the mounted file system from the name space.
3120                  */
3121                 if (vn_mountedvfs(nvp) != NULL) {
3122                         VN_RELE(nvp);
3123                         nfs_rw_exit(&odrp->r_rwlock);
3124                         nfs_rw_exit(&ndrp->r_rwlock);
3125                         return (EBUSY);
3126                 }
3127 
3128                 /*
3129                  * Purge the name cache of all references to this vnode
3130                  * so that we can check the reference count to infer
3131                  * whether it is active or not.
3132                  */
3133                 /*
3134                  * First just remove the entry from the name cache, as it
3135                  * is most likely the only entry for this vp.
3136                  */
3137                 dnlc_remove(ndvp, nnm);
3138                 /*
3139                  * If the file has a v_count > 1 then there may be more
3140                  * than one entry in the name cache due multiple links
3141                  * or an open file, but we don't have the real reference
3142                  * count so flush all possible entries.
3143                  */
3144                 if (nvp->v_count > 1)
3145                         dnlc_purge_vp(nvp);
3146 
3147                 /*
3148                  * If the vnode is active and is not a directory,
3149                  * arrange to rename it to a
3150                  * temporary file so that it will continue to be
3151                  * accessible.  This implements the "unlink-open-file"
3152                  * semantics for the target of a rename operation.
3153                  * Before doing this though, make sure that the
3154                  * source and target files are not already the same.
3155                  */
3156                 if (nvp->v_count > 1 && nvp->v_type != VDIR) {
3157                         /*
3158                          * Lookup the source name.
3159                          */
3160                         error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL,
3161                             cr, 0);
3162 
3163                         /*
3164                          * The source name *should* already exist.
3165                          */
3166                         if (error) {
3167                                 VN_RELE(nvp);
3168                                 nfs_rw_exit(&odrp->r_rwlock);
3169                                 nfs_rw_exit(&ndrp->r_rwlock);
3170                                 return (error);
3171                         }
3172 
3173                         /*
3174                          * Compare the two vnodes.  If they are the same,
3175                          * just release all held vnodes and return success.
3176                          */
3177                         if (ovp == nvp) {
3178                                 VN_RELE(ovp);
3179                                 VN_RELE(nvp);
3180                                 nfs_rw_exit(&odrp->r_rwlock);
3181                                 nfs_rw_exit(&ndrp->r_rwlock);
3182                                 return (0);
3183                         }
3184 
3185                         /*
3186                          * Can't mix and match directories and non-
3187                          * directories in rename operations.  We already
3188                          * know that the target is not a directory.  If
3189                          * the source is a directory, return an error.
3190                          */
3191                         if (ovp->v_type == VDIR) {
3192                                 VN_RELE(ovp);
3193                                 VN_RELE(nvp);
3194                                 nfs_rw_exit(&odrp->r_rwlock);
3195                                 nfs_rw_exit(&ndrp->r_rwlock);
3196                                 return (ENOTDIR);
3197                         }
3198 
3199                         /*
3200                          * The target file exists, is not the same as
3201                          * the source file, and is active.  Link it
3202                          * to a temporary filename to avoid having
3203                          * the server removing the file completely.
3204                          */
3205                         tmpname = newname();
3206                         error = nfs3_link(ndvp, nvp, tmpname, cr, NULL, 0);
3207                         if (error == EOPNOTSUPP) {
3208                                 error = nfs3_rename(ndvp, nnm, ndvp, tmpname,
3209                                     cr, NULL, 0);
3210                         }
3211                         if (error) {
3212                                 kmem_free(tmpname, MAXNAMELEN);
3213                                 VN_RELE(ovp);
3214                                 VN_RELE(nvp);
3215                                 nfs_rw_exit(&odrp->r_rwlock);
3216                                 nfs_rw_exit(&ndrp->r_rwlock);
3217                                 return (error);
3218                         }
3219                         rp = VTOR(nvp);
3220                         mutex_enter(&rp->r_statelock);
3221                         if (rp->r_unldvp == NULL) {
3222                                 VN_HOLD(ndvp);
3223                                 rp->r_unldvp = ndvp;
3224                                 if (rp->r_unlcred != NULL)
3225                                         crfree(rp->r_unlcred);
3226                                 crhold(cr);
3227                                 rp->r_unlcred = cr;
3228                                 rp->r_unlname = tmpname;
3229                         } else {
3230                                 kmem_free(rp->r_unlname, MAXNAMELEN);
3231                                 rp->r_unlname = tmpname;
3232                         }
3233                         mutex_exit(&rp->r_statelock);
3234                 }
3235         }
3236 
3237         if (ovp == NULL) {
3238                 /*
3239                  * When renaming directories to be a subdirectory of a
3240                  * different parent, the dnlc entry for ".." will no
3241                  * longer be valid, so it must be removed.
3242                  *
3243                  * We do a lookup here to determine whether we are renaming
3244                  * a directory and we need to check if we are renaming
3245                  * an unlinked file.  This might have already been done
3246                  * in previous code, so we check ovp == NULL to avoid
3247                  * doing it twice.
3248                  */
3249 
3250                 error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
3251                 /*
3252                  * The source name *should* already exist.
3253                  */
3254                 if (error) {
3255                         nfs_rw_exit(&odrp->r_rwlock);
3256                         nfs_rw_exit(&ndrp->r_rwlock);
3257                         if (nvp) {
3258                                 VN_RELE(nvp);
3259                         }
3260                         return (error);
3261                 }
3262                 ASSERT(ovp != NULL);
3263         }
3264 
3265         dnlc_remove(odvp, onm);
3266         dnlc_remove(ndvp, nnm);
3267 
3268         setdiropargs3(&args.from, onm, odvp);
3269         setdiropargs3(&args.to, nnm, ndvp);
3270 
3271         douprintf = 1;
3272 
3273         t = gethrtime();
3274 
3275         error = rfs3call(VTOMI(odvp), NFSPROC3_RENAME,
3276             xdr_RENAME3args, (caddr_t)&args,
3277             xdr_RENAME3res, (caddr_t)&res, cr,
3278             &douprintf, &res.status, 0, NULL);
3279 
3280         if (error) {
3281                 PURGE_ATTRCACHE(odvp);
3282                 PURGE_ATTRCACHE(ndvp);
3283                 VN_RELE(ovp);
3284                 nfs_rw_exit(&odrp->r_rwlock);
3285                 nfs_rw_exit(&ndrp->r_rwlock);
3286                 if (nvp) {
3287                         VN_RELE(nvp);
3288                 }
3289                 return (error);
3290         }
3291 
3292         error = geterrno3(res.status);
3293 
3294         if (!error) {
3295                 nfs3_cache_wcc_data(odvp, &res.resok.fromdir_wcc, t, cr);
3296                 if (HAVE_RDDIR_CACHE(odrp))
3297                         nfs_purge_rddir_cache(odvp);
3298                 if (ndvp != odvp) {
3299                         nfs3_cache_wcc_data(ndvp, &res.resok.todir_wcc, t, cr);
3300                         if (HAVE_RDDIR_CACHE(ndrp))
3301                                 nfs_purge_rddir_cache(ndvp);
3302                 }
3303                 /*
3304                  * when renaming directories to be a subdirectory of a
3305                  * different parent, the dnlc entry for ".." will no
3306                  * longer be valid, so it must be removed
3307                  */
3308                 rp = VTOR(ovp);
3309                 if (ndvp != odvp) {
3310                         if (ovp->v_type == VDIR) {
3311                                 dnlc_remove(ovp, "..");
3312                                 if (HAVE_RDDIR_CACHE(rp))
3313                                         nfs_purge_rddir_cache(ovp);
3314                         }
3315                 }
3316 
3317                 /*
3318                  * If we are renaming the unlinked file, update the
3319                  * r_unldvp and r_unlname as needed.
3320                  */
3321                 mutex_enter(&rp->r_statelock);
3322                 if (rp->r_unldvp != NULL) {
3323                         if (strcmp(rp->r_unlname, onm) == 0) {
3324                                 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
3325                                 rp->r_unlname[MAXNAMELEN - 1] = '\0';
3326 
3327                                 if (ndvp != rp->r_unldvp) {
3328                                         VN_RELE(rp->r_unldvp);
3329                                         rp->r_unldvp = ndvp;
3330                                         VN_HOLD(ndvp);
3331                                 }
3332                         }
3333                 }
3334                 mutex_exit(&rp->r_statelock);
3335         } else {
3336                 nfs3_cache_wcc_data(odvp, &res.resfail.fromdir_wcc, t, cr);
3337                 if (ndvp != odvp) {
3338                         nfs3_cache_wcc_data(ndvp, &res.resfail.todir_wcc, t,
3339                             cr);
3340                 }
3341                 /*
3342                  * System V defines rename to return EEXIST, not
3343                  * ENOTEMPTY if the target directory is not empty.
3344                  * Over the wire, the error is NFSERR_ENOTEMPTY
3345                  * which geterrno maps to ENOTEMPTY.
3346                  */
3347                 if (error == ENOTEMPTY)
3348                         error = EEXIST;
3349         }
3350 
3351         if (error == 0) {
3352                 if (nvp)
3353                         vnevent_rename_dest(nvp, ndvp, nnm, ct);
3354 
3355                 if (odvp != ndvp)
3356                         vnevent_rename_dest_dir(ndvp, ct);
3357                 ASSERT(ovp != NULL);
3358                 vnevent_rename_src(ovp, odvp, onm, ct);
3359         }
3360 
3361         if (nvp) {
3362                 VN_RELE(nvp);
3363         }
3364         VN_RELE(ovp);
3365 
3366         nfs_rw_exit(&odrp->r_rwlock);
3367         nfs_rw_exit(&ndrp->r_rwlock);
3368 
3369         return (error);
3370 }
3371 
3372 /* ARGSUSED */
3373 static int
3374 nfs3_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
3375         caller_context_t *ct, int flags, vsecattr_t *vsecp)
3376 {
3377         int error;
3378         MKDIR3args args;
3379         MKDIR3res res;
3380         int douprintf;
3381         struct vattr vattr;
3382         vnode_t *vp;
3383         rnode_t *drp;
3384         hrtime_t t;
3385 
3386         if (nfs_zone() != VTOMI(dvp)->mi_zone)
3387                 return (EPERM);
3388         setdiropargs3(&args.where, nm, dvp);
3389 
3390         /*
3391          * Decide what the group-id and set-gid bit of the created directory
3392          * should be.  May have to do a setattr to get the gid right.
3393          */
3394         error = setdirgid(dvp, &va->va_gid, cr);
3395         if (error)
3396                 return (error);
3397         error = setdirmode(dvp, &va->va_mode, cr);
3398         if (error)
3399                 return (error);
3400         va->va_mask |= AT_MODE|AT_GID;
3401 
3402         error = vattr_to_sattr3(va, &args.attributes);
3403         if (error) {
3404                 /* req time field(s) overflow - return immediately */
3405                 return (error);
3406         }
3407 
3408         drp = VTOR(dvp);
3409         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3410                 return (EINTR);
3411 
3412         dnlc_remove(dvp, nm);
3413 
3414         douprintf = 1;
3415 
3416         t = gethrtime();
3417 
3418         error = rfs3call(VTOMI(dvp), NFSPROC3_MKDIR,
3419             xdr_MKDIR3args, (caddr_t)&args,
3420             xdr_MKDIR3res, (caddr_t)&res, cr,
3421             &douprintf, &res.status, 0, NULL);
3422 
3423         if (error) {
3424                 PURGE_ATTRCACHE(dvp);
3425                 nfs_rw_exit(&drp->r_rwlock);
3426                 return (error);
3427         }
3428 
3429         error = geterrno3(res.status);
3430         if (!error) {
3431                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3432                 if (HAVE_RDDIR_CACHE(drp))
3433                         nfs_purge_rddir_cache(dvp);
3434 
3435                 if (!res.resok.obj.handle_follows) {
3436                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3437                         if (error) {
3438                                 nfs_rw_exit(&drp->r_rwlock);
3439                                 return (error);
3440                         }
3441                 } else {
3442                         if (res.resok.obj_attributes.attributes) {
3443                                 vp = makenfs3node(&res.resok.obj.handle,
3444                                     &res.resok.obj_attributes.attr,
3445                                     dvp->v_vfsp, t, cr, NULL, NULL);
3446                         } else {
3447                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
3448                                     dvp->v_vfsp, t, cr, NULL, NULL);
3449                                 if (vp->v_type == VNON) {
3450                                         vattr.va_mask = AT_TYPE;
3451                                         error = nfs3getattr(vp, &vattr, cr);
3452                                         if (error) {
3453                                                 VN_RELE(vp);
3454                                                 nfs_rw_exit(&drp->r_rwlock);
3455                                                 return (error);
3456                                         }
3457                                         vp->v_type = vattr.va_type;
3458                                 }
3459                         }
3460                         dnlc_update(dvp, nm, vp);
3461                 }
3462                 if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
3463                         va->va_mask = AT_GID;
3464                         (void) nfs3setattr(vp, va, 0, cr);
3465                 }
3466                 *vpp = vp;
3467         } else {
3468                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3469                 PURGE_STALE_FH(error, dvp, cr);
3470         }
3471 
3472         nfs_rw_exit(&drp->r_rwlock);
3473 
3474         return (error);
3475 }
3476 
3477 /* ARGSUSED */
3478 static int
3479 nfs3_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
3480         caller_context_t *ct, int flags)
3481 {
3482         int error;
3483         RMDIR3args args;
3484         RMDIR3res res;
3485         vnode_t *vp;
3486         int douprintf;
3487         rnode_t *drp;
3488         hrtime_t t;
3489 
3490         if (nfs_zone() != VTOMI(dvp)->mi_zone)
3491                 return (EPERM);
3492         drp = VTOR(dvp);
3493         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3494                 return (EINTR);
3495 
3496         /*
3497          * Attempt to prevent a rmdir(".") from succeeding.
3498          */
3499         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3500         if (error) {
3501                 nfs_rw_exit(&drp->r_rwlock);
3502                 return (error);
3503         }
3504 
3505         if (vp == cdir) {
3506                 VN_RELE(vp);
3507                 nfs_rw_exit(&drp->r_rwlock);
3508                 return (EINVAL);
3509         }
3510 
3511         setdiropargs3(&args.object, nm, dvp);
3512 
3513         /*
3514          * First just remove the entry from the name cache, as it
3515          * is most likely an entry for this vp.
3516          */
3517         dnlc_remove(dvp, nm);
3518 
3519         /*
3520          * If there vnode reference count is greater than one, then
3521          * there may be additional references in the DNLC which will
3522          * need to be purged.  First, trying removing the entry for
3523          * the parent directory and see if that removes the additional
3524          * reference(s).  If that doesn't do it, then use dnlc_purge_vp
3525          * to completely remove any references to the directory which
3526          * might still exist in the DNLC.
3527          */
3528         if (vp->v_count > 1) {
3529                 dnlc_remove(vp, "..");
3530                 if (vp->v_count > 1)
3531                         dnlc_purge_vp(vp);
3532         }
3533 
3534         douprintf = 1;
3535 
3536         t = gethrtime();
3537 
3538         error = rfs3call(VTOMI(dvp), NFSPROC3_RMDIR,
3539             xdr_diropargs3, (caddr_t)&args,
3540             xdr_RMDIR3res, (caddr_t)&res, cr,
3541             &douprintf, &res.status, 0, NULL);
3542 
3543         PURGE_ATTRCACHE(vp);
3544 
3545         if (error) {
3546                 PURGE_ATTRCACHE(dvp);
3547                 VN_RELE(vp);
3548                 nfs_rw_exit(&drp->r_rwlock);
3549                 return (error);
3550         }
3551 
3552         error = geterrno3(res.status);
3553         if (!error) {
3554                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3555                 if (HAVE_RDDIR_CACHE(drp))
3556                         nfs_purge_rddir_cache(dvp);
3557                 if (HAVE_RDDIR_CACHE(VTOR(vp)))
3558                         nfs_purge_rddir_cache(vp);
3559         } else {
3560                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3561                 PURGE_STALE_FH(error, dvp, cr);
3562                 /*
3563                  * System V defines rmdir to return EEXIST, not
3564                  * ENOTEMPTY if the directory is not empty.  Over
3565                  * the wire, the error is NFSERR_ENOTEMPTY which
3566                  * geterrno maps to ENOTEMPTY.
3567                  */
3568                 if (error == ENOTEMPTY)
3569                         error = EEXIST;
3570         }
3571 
3572         if (error == 0) {
3573                 vnevent_rmdir(vp, dvp, nm, ct);
3574         }
3575         VN_RELE(vp);
3576 
3577         nfs_rw_exit(&drp->r_rwlock);
3578 
3579         return (error);
3580 }
3581 
3582 /* ARGSUSED */
3583 static int
3584 nfs3_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
3585         caller_context_t *ct, int flags)
3586 {
3587         int error;
3588         SYMLINK3args args;
3589         SYMLINK3res res;
3590         int douprintf;
3591         mntinfo_t *mi;
3592         vnode_t *vp;
3593         rnode_t *rp;
3594         char *contents;
3595         rnode_t *drp;
3596         hrtime_t t;
3597 
3598         mi = VTOMI(dvp);
3599 
3600         if (nfs_zone() != mi->mi_zone)
3601                 return (EPERM);
3602         if (!(mi->mi_flags & MI_SYMLINK))
3603                 return (EOPNOTSUPP);
3604 
3605         setdiropargs3(&args.where, lnm, dvp);
3606         error = vattr_to_sattr3(tva, &args.symlink.symlink_attributes);
3607         if (error) {
3608                 /* req time field(s) overflow - return immediately */
3609                 return (error);
3610         }
3611         args.symlink.symlink_data = tnm;
3612 
3613         drp = VTOR(dvp);
3614         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3615                 return (EINTR);
3616 
3617         dnlc_remove(dvp, lnm);
3618 
3619         douprintf = 1;
3620 
3621         t = gethrtime();
3622 
3623         error = rfs3call(mi, NFSPROC3_SYMLINK,
3624             xdr_SYMLINK3args, (caddr_t)&args,
3625             xdr_SYMLINK3res, (caddr_t)&res, cr,
3626             &douprintf, &res.status, 0, NULL);
3627 
3628         if (error) {
3629                 PURGE_ATTRCACHE(dvp);
3630                 nfs_rw_exit(&drp->r_rwlock);
3631                 return (error);
3632         }
3633 
3634         error = geterrno3(res.status);
3635         if (!error) {
3636                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3637                 if (HAVE_RDDIR_CACHE(drp))
3638                         nfs_purge_rddir_cache(dvp);
3639 
3640                 if (res.resok.obj.handle_follows) {
3641                         if (res.resok.obj_attributes.attributes) {
3642                                 vp = makenfs3node(&res.resok.obj.handle,
3643                                     &res.resok.obj_attributes.attr,
3644                                     dvp->v_vfsp, t, cr, NULL, NULL);
3645                         } else {
3646                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
3647                                     dvp->v_vfsp, t, cr, NULL, NULL);
3648                                 vp->v_type = VLNK;
3649                                 vp->v_rdev = 0;
3650                         }
3651                         dnlc_update(dvp, lnm, vp);
3652                         rp = VTOR(vp);
3653                         if (nfs3_do_symlink_cache &&
3654                             rp->r_symlink.contents == NULL) {
3655 
3656                                 contents = kmem_alloc(MAXPATHLEN,
3657                                     KM_NOSLEEP);
3658 
3659                                 if (contents != NULL) {
3660                                         mutex_enter(&rp->r_statelock);
3661                                         if (rp->r_symlink.contents == NULL) {
3662                                                 rp->r_symlink.len = strlen(tnm);
3663                                                 bcopy(tnm, contents,
3664                                                     rp->r_symlink.len);
3665                                                 rp->r_symlink.contents =
3666                                                     contents;
3667                                                 rp->r_symlink.size = MAXPATHLEN;
3668                                                 mutex_exit(&rp->r_statelock);
3669                                         } else {
3670                                                 mutex_exit(&rp->r_statelock);
3671                                                 kmem_free((void *)contents,
3672                                                     MAXPATHLEN);
3673                                         }
3674                                 }
3675                         }
3676                         VN_RELE(vp);
3677                 }
3678         } else {
3679                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3680                 PURGE_STALE_FH(error, dvp, cr);
3681                 if (error == EOPNOTSUPP) {
3682                         mutex_enter(&mi->mi_lock);
3683                         mi->mi_flags &= ~MI_SYMLINK;
3684                         mutex_exit(&mi->mi_lock);
3685                 }
3686         }
3687 
3688         nfs_rw_exit(&drp->r_rwlock);
3689 
3690         return (error);
3691 }
3692 
#ifdef DEBUG
/*
 * Event counters for nfs3_readdir(), present only in DEBUG builds.  They
 * are bumped with plain increments.
 *
 * NOTE(review): the uses of the hits, misses and readahead counters are
 * not visible in this part of the file; their meaning is presumably what
 * the names suggest - confirm against the rest of nfs3_readdir().
 */
static int nfs3_readdir_cache_hits = 0;
/* Requests short-circuited because the offset matched r_direof's cookie. */
static int nfs3_readdir_cache_shorts = 0;
/* Times a caller found a cache entry still being filled and had to wait. */
static int nfs3_readdir_cache_waits = 0;
static int nfs3_readdir_cache_misses = 0;
static int nfs3_readdir_readahead = 0;
#endif

/*
 * When non-zero, nfs3_readdir() limits the request size to 1024 bytes
 * instead of MAXBSIZE, as a workaround for servers that cannot handle
 * large READDIR or READDIRPLUS requests.
 */
static int nfs3_shrinkreaddir = 0;
3702 
3703 /*
3704  * Read directory entries.
3705  * There are some weird things to look out for here.  The uio_loffset
3706  * field is either 0 or it is the offset returned from a previous
3707  * readdir.  It is an opaque value used by the server to find the
3708  * correct directory block to read. The count field is the number
3709  * of blocks to read on the server.  This is advisory only, the server
3710  * may return only one block's worth of entries.  Entries may be compressed
3711  * on the server.
3712  */
3713 /* ARGSUSED */
3714 static int
3715 nfs3_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
3716         caller_context_t *ct, int flags)
3717 {
3718         int error;
3719         size_t count;
3720         rnode_t *rp;
3721         rddir_cache *rdc;
3722         rddir_cache *nrdc;
3723         rddir_cache *rrdc;
3724 #ifdef DEBUG
3725         int missed;
3726 #endif
3727         int doreadahead;
3728         rddir_cache srdc;
3729         avl_index_t where;
3730 
3731         if (nfs_zone() != VTOMI(vp)->mi_zone)
3732                 return (EIO);
3733         rp = VTOR(vp);
3734 
3735         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
3736 
3737         /*
3738          * Make sure that the directory cache is valid.
3739          */
3740         if (HAVE_RDDIR_CACHE(rp)) {
3741                 if (nfs_disable_rddir_cache) {
3742                         /*
3743                          * Setting nfs_disable_rddir_cache in /etc/system
3744                          * allows interoperability with servers that do not
3745                          * properly update the attributes of directories.
3746                          * Any cached information gets purged before an
3747                          * access is made to it.
3748                          */
3749                         nfs_purge_rddir_cache(vp);
3750                 } else {
3751                         error = nfs3_validate_caches(vp, cr);
3752                         if (error)
3753                                 return (error);
3754                 }
3755         }
3756 
3757         /*
3758          * It is possible that some servers may not be able to correctly
3759          * handle a large READDIR or READDIRPLUS request due to bugs in
3760          * their implementation.  In order to continue to interoperate
3761          * with them, this workaround is provided to limit the maximum
3762          * size of a READDIRPLUS request to 1024.  In any case, the request
3763          * size is limited to MAXBSIZE.
3764          */
3765         count = MIN(uiop->uio_iov->iov_len,
3766             nfs3_shrinkreaddir ? 1024 : MAXBSIZE);
3767 
3768         nrdc = NULL;
3769 #ifdef DEBUG
3770         missed = 0;
3771 #endif
3772 top:
3773         /*
3774          * Short circuit last readdir which always returns 0 bytes.
3775          * This can be done after the directory has been read through
3776          * completely at least once.  This will set r_direof which
3777          * can be used to find the value of the last cookie.
3778          */
3779         mutex_enter(&rp->r_statelock);
3780         if (rp->r_direof != NULL &&
3781             uiop->uio_loffset == rp->r_direof->nfs3_ncookie) {
3782                 mutex_exit(&rp->r_statelock);
3783 #ifdef DEBUG
3784                 nfs3_readdir_cache_shorts++;
3785 #endif
3786                 if (eofp)
3787                         *eofp = 1;
3788                 if (nrdc != NULL)
3789                         rddir_cache_rele(nrdc);
3790                 return (0);
3791         }
3792         /*
3793          * Look for a cache entry.  Cache entries are identified
3794          * by the NFS cookie value and the byte count requested.
3795          */
3796         srdc.nfs3_cookie = uiop->uio_loffset;
3797         srdc.buflen = count;
3798         rdc = avl_find(&rp->r_dir, &srdc, &where);
3799         if (rdc != NULL) {
3800                 rddir_cache_hold(rdc);
3801                 /*
3802                  * If the cache entry is in the process of being
3803                  * filled in, wait until this completes.  The
3804                  * RDDIRWAIT bit is set to indicate that someone
                 * is waiting and when the thread currently
3806                  * filling the entry is done, it should do a
3807                  * cv_broadcast to wakeup all of the threads
3808                  * waiting for it to finish.
3809                  */
3810                 if (rdc->flags & RDDIR) {
3811                         nfs_rw_exit(&rp->r_rwlock);
3812                         rdc->flags |= RDDIRWAIT;
3813 #ifdef DEBUG
3814                         nfs3_readdir_cache_waits++;
3815 #endif
3816                         if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3817                                 /*
3818                                  * We got interrupted, probably
3819                                  * the user typed ^C or an alarm
3820                                  * fired.  We free the new entry
3821                                  * if we allocated one.
3822                                  */
3823                                 mutex_exit(&rp->r_statelock);
3824                                 (void) nfs_rw_enter_sig(&rp->r_rwlock,
3825                                     RW_READER, FALSE);
3826                                 rddir_cache_rele(rdc);
3827                                 if (nrdc != NULL)
3828                                         rddir_cache_rele(nrdc);
3829                                 return (EINTR);
3830                         }
3831                         mutex_exit(&rp->r_statelock);
3832                         (void) nfs_rw_enter_sig(&rp->r_rwlock,
3833                             RW_READER, FALSE);
3834                         rddir_cache_rele(rdc);
3835                         goto top;
3836                 }
3837                 /*
3838                  * Check to see if a readdir is required to
3839                  * fill the entry.  If so, mark this entry
3840                  * as being filled, remove our reference,
3841                  * and branch to the code to fill the entry.
3842                  */
3843                 if (rdc->flags & RDDIRREQ) {
3844                         rdc->flags &= ~RDDIRREQ;
3845                         rdc->flags |= RDDIR;
3846                         if (nrdc != NULL)
3847                                 rddir_cache_rele(nrdc);
3848                         nrdc = rdc;
3849                         mutex_exit(&rp->r_statelock);
3850                         goto bottom;
3851                 }
3852 #ifdef DEBUG
3853                 if (!missed)
3854                         nfs3_readdir_cache_hits++;
3855 #endif
3856                 /*
3857                  * If an error occurred while attempting
3858                  * to fill the cache entry, just return it.
3859                  */
3860                 if (rdc->error) {
3861                         error = rdc->error;
3862                         mutex_exit(&rp->r_statelock);
3863                         rddir_cache_rele(rdc);
3864                         if (nrdc != NULL)
3865                                 rddir_cache_rele(nrdc);
3866                         return (error);
3867                 }
3868 
3869                 /*
3870                  * The cache entry is complete and good,
3871                  * copyout the dirent structs to the calling
3872                  * thread.
3873                  */
3874                 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3875 
3876                 /*
3877                  * If no error occurred during the copyout,
3878                  * update the offset in the uio struct to
3879                  * contain the value of the next cookie
3880                  * and set the eof value appropriately.
3881                  */
3882                 if (!error) {
3883                         uiop->uio_loffset = rdc->nfs3_ncookie;
3884                         if (eofp)
3885                                 *eofp = rdc->eof;
3886                 }
3887 
3888                 /*
3889                  * Decide whether to do readahead.
3890                  *
3891                  * Don't if have already read to the end of
3892                  * directory.  There is nothing more to read.
3893                  *
3894                  * Don't if the application is not doing
3895                  * lookups in the directory.  The readahead
3896                  * is only effective if the application can
3897                  * be doing work while an async thread is
3898                  * handling the over the wire request.
3899                  */
3900                 if (rdc->eof) {
3901                         rp->r_direof = rdc;
3902                         doreadahead = FALSE;
3903                 } else if (!(rp->r_flags & RLOOKUP))
3904                         doreadahead = FALSE;
3905                 else
3906                         doreadahead = TRUE;
3907 
3908                 if (!doreadahead) {
3909                         mutex_exit(&rp->r_statelock);
3910                         rddir_cache_rele(rdc);
3911                         if (nrdc != NULL)
3912                                 rddir_cache_rele(nrdc);
3913                         return (error);
3914                 }
3915 
3916                 /*
3917                  * Check to see whether we found an entry
3918                  * for the readahead.  If so, we don't need
3919                  * to do anything further, so free the new
3920                  * entry if one was allocated.  Otherwise,
3921                  * allocate a new entry, add it to the cache,
3922                  * and then initiate an asynchronous readdir
3923                  * operation to fill it.
3924                  */
3925                 srdc.nfs3_cookie = rdc->nfs3_ncookie;
3926                 srdc.buflen = count;
3927                 rrdc = avl_find(&rp->r_dir, &srdc, &where);
3928                 if (rrdc != NULL) {
3929                         if (nrdc != NULL)
3930                                 rddir_cache_rele(nrdc);
3931                 } else {
3932                         if (nrdc != NULL)
3933                                 rrdc = nrdc;
3934                         else {
3935                                 rrdc = rddir_cache_alloc(KM_NOSLEEP);
3936                         }
3937                         if (rrdc != NULL) {
3938                                 rrdc->nfs3_cookie = rdc->nfs3_ncookie;
3939                                 rrdc->buflen = count;
3940                                 avl_insert(&rp->r_dir, rrdc, where);
3941                                 rddir_cache_hold(rrdc);
3942                                 mutex_exit(&rp->r_statelock);
3943                                 rddir_cache_rele(rdc);
3944 #ifdef DEBUG
3945                                 nfs3_readdir_readahead++;
3946 #endif
3947                                 nfs_async_readdir(vp, rrdc, cr, do_nfs3readdir);
3948                                 return (error);
3949                         }
3950                 }
3951 
3952                 mutex_exit(&rp->r_statelock);
3953                 rddir_cache_rele(rdc);
3954                 return (error);
3955         }
3956 
3957         /*
3958          * Didn't find an entry in the cache.  Construct a new empty
3959          * entry and link it into the cache.  Other processes attempting
3960          * to access this entry will need to wait until it is filled in.
3961          *
3962          * Since kmem_alloc may block, another pass through the cache
3963          * will need to be taken to make sure that another process
3964          * hasn't already added an entry to the cache for this request.
3965          */
3966         if (nrdc == NULL) {
3967                 mutex_exit(&rp->r_statelock);
3968                 nrdc = rddir_cache_alloc(KM_SLEEP);
3969                 nrdc->nfs3_cookie = uiop->uio_loffset;
3970                 nrdc->buflen = count;
3971                 goto top;
3972         }
3973 
3974         /*
3975          * Add this entry to the cache.
3976          */
3977         avl_insert(&rp->r_dir, nrdc, where);
3978         rddir_cache_hold(nrdc);
3979         mutex_exit(&rp->r_statelock);
3980 
3981 bottom:
3982 #ifdef DEBUG
3983         missed = 1;
3984         nfs3_readdir_cache_misses++;
3985 #endif
3986         /*
3987          * Do the readdir.  This routine decides whether to use
3988          * READDIR or READDIRPLUS.
3989          */
3990         error = do_nfs3readdir(vp, nrdc, cr);
3991 
3992         /*
3993          * If this operation failed, just return the error which occurred.
3994          */
3995         if (error != 0)
3996                 return (error);
3997 
3998         /*
3999          * Since the RPC operation will have taken sometime and blocked
4000          * this process, another pass through the cache will need to be
4001          * taken to find the correct cache entry.  It is possible that
4002          * the correct cache entry will not be there (although one was
4003          * added) because the directory changed during the RPC operation
4004          * and the readdir cache was flushed.  In this case, just start
4005          * over.  It is hoped that this will not happen too often... :-)
4006          */
4007         nrdc = NULL;
4008         goto top;
4009         /* NOTREACHED */
4010 }
4011 
4012 static int
4013 do_nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4014 {
4015         int error;
4016         rnode_t *rp;
4017         mntinfo_t *mi;
4018 
4019         rp = VTOR(vp);
4020         mi = VTOMI(vp);
4021         ASSERT(nfs_zone() == mi->mi_zone);
4022         /*
4023          * Issue the proper request.
4024          *
4025          * If the server does not support READDIRPLUS, then use READDIR.
4026          *
4027          * Otherwise --
4028          * Issue a READDIRPLUS if reading to fill an empty cache or if
4029          * an application has performed a lookup in the directory which
4030          * required an over the wire lookup.  The use of READDIRPLUS
4031          * will help to (re)populate the DNLC.
4032          */
4033         if (!(mi->mi_flags & MI_READDIRONLY) &&
4034             (rp->r_flags & (RLOOKUP | RREADDIRPLUS))) {
4035                 if (rp->r_flags & RREADDIRPLUS) {
4036                         mutex_enter(&rp->r_statelock);
4037                         rp->r_flags &= ~RREADDIRPLUS;
4038                         mutex_exit(&rp->r_statelock);
4039                 }
4040                 nfs3readdirplus(vp, rdc, cr);
4041                 if (rdc->error == EOPNOTSUPP)
4042                         nfs3readdir(vp, rdc, cr);
4043         } else
4044                 nfs3readdir(vp, rdc, cr);
4045 
4046         mutex_enter(&rp->r_statelock);
4047         rdc->flags &= ~RDDIR;
4048         if (rdc->flags & RDDIRWAIT) {
4049                 rdc->flags &= ~RDDIRWAIT;
4050                 cv_broadcast(&rdc->cv);
4051         }
4052         error = rdc->error;
4053         if (error)
4054                 rdc->flags |= RDDIRREQ;
4055         mutex_exit(&rp->r_statelock);
4056 
4057         rddir_cache_rele(rdc);
4058 
4059         return (error);
4060 }
4061 
4062 static void
4063 nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4064 {
4065         int error;
4066         READDIR3args args;
4067         READDIR3vres res;
4068         vattr_t dva;
4069         rnode_t *rp;
4070         int douprintf;
4071         failinfo_t fi, *fip = NULL;
4072         mntinfo_t *mi;
4073         hrtime_t t;
4074 
4075         rp = VTOR(vp);
4076         mi = VTOMI(vp);
4077         ASSERT(nfs_zone() == mi->mi_zone);
4078 
4079         args.dir = *RTOFH3(rp);
4080         args.cookie = (cookie3)rdc->nfs3_cookie;
4081         args.cookieverf = rp->r_cookieverf;
4082         args.count = rdc->buflen;
4083 
4084         /*
4085          * NFS client failover support
4086          * suppress failover unless we have a zero cookie
4087          */
4088         if (args.cookie == (cookie3) 0) {
4089                 fi.vp = vp;
4090                 fi.fhp = (caddr_t)&args.dir;
4091                 fi.copyproc = nfs3copyfh;
4092                 fi.lookupproc = nfs3lookup;
4093                 fi.xattrdirproc = acl_getxattrdir3;
4094                 fip = &fi;
4095         }
4096 
4097 #ifdef DEBUG
4098         rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4099 #else
4100         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4101 #endif
4102 
4103         res.entries = (dirent64_t *)rdc->entries;
4104         res.entries_size = rdc->buflen;
4105         res.dir_attributes.fres.vap = &dva;
4106         res.dir_attributes.fres.vp = vp;
4107         res.loff = rdc->nfs3_cookie;
4108 
4109         douprintf = 1;
4110 
4111         if (mi->mi_io_kstats) {
4112                 mutex_enter(&mi->mi_lock);
4113                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4114                 mutex_exit(&mi->mi_lock);
4115         }
4116 
4117         t = gethrtime();
4118 
4119         error = rfs3call(VTOMI(vp), NFSPROC3_READDIR,
4120             xdr_READDIR3args, (caddr_t)&args,
4121             xdr_READDIR3vres, (caddr_t)&res, cr,
4122             &douprintf, &res.status, 0, fip);
4123 
4124         if (mi->mi_io_kstats) {
4125                 mutex_enter(&mi->mi_lock);
4126                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4127                 mutex_exit(&mi->mi_lock);
4128         }
4129 
4130         if (error)
4131                 goto err;
4132 
4133         nfs3_cache_post_op_vattr(vp, &res.dir_attributes, t, cr);
4134 
4135         error = geterrno3(res.status);
4136         if (error) {
4137                 PURGE_STALE_FH(error, vp, cr);
4138                 goto err;
4139         }
4140 
4141         if (mi->mi_io_kstats) {
4142                 mutex_enter(&mi->mi_lock);
4143                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4144                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4145                 mutex_exit(&mi->mi_lock);
4146         }
4147 
4148         rdc->nfs3_ncookie = res.loff;
4149         rp->r_cookieverf = res.cookieverf;
4150         rdc->eof = res.eof ? 1 : 0;
4151         rdc->entlen = res.size;
4152         ASSERT(rdc->entlen <= rdc->buflen);
4153         rdc->error = 0;
4154         return;
4155 
4156 err:
4157         kmem_free(rdc->entries, rdc->buflen);
4158         rdc->entries = NULL;
4159         rdc->error = error;
4160 }
4161 
4162 /*
4163  * Read directory entries.
4164  * There are some weird things to look out for here.  The uio_loffset
4165  * field is either 0 or it is the offset returned from a previous
4166  * readdir.  It is an opaque value used by the server to find the
4167  * correct directory block to read. The count field is the number
4168  * of blocks to read on the server.  This is advisory only, the server
4169  * may return only one block's worth of entries.  Entries may be compressed
4170  * on the server.
4171  */
4172 static void
4173 nfs3readdirplus(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4174 {
4175         int error;
4176         READDIRPLUS3args args;
4177         READDIRPLUS3vres res;
4178         vattr_t dva;
4179         rnode_t *rp;
4180         mntinfo_t *mi;
4181         int douprintf;
4182         failinfo_t fi, *fip = NULL;
4183 
4184         rp = VTOR(vp);
4185         mi = VTOMI(vp);
4186         ASSERT(nfs_zone() == mi->mi_zone);
4187 
4188         args.dir = *RTOFH3(rp);
4189         args.cookie = (cookie3)rdc->nfs3_cookie;
4190         args.cookieverf = rp->r_cookieverf;
4191         args.dircount = rdc->buflen;
4192         args.maxcount = mi->mi_tsize;
4193 
4194         /*
4195          * NFS client failover support
4196          * suppress failover unless we have a zero cookie
4197          */
4198         if (args.cookie == (cookie3)0) {
4199                 fi.vp = vp;
4200                 fi.fhp = (caddr_t)&args.dir;
4201                 fi.copyproc = nfs3copyfh;
4202                 fi.lookupproc = nfs3lookup;
4203                 fi.xattrdirproc = acl_getxattrdir3;
4204                 fip = &fi;
4205         }
4206 
4207 #ifdef DEBUG
4208         rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4209 #else
4210         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4211 #endif
4212 
4213         res.entries = (dirent64_t *)rdc->entries;
4214         res.entries_size = rdc->buflen;
4215         res.dir_attributes.fres.vap = &dva;
4216         res.dir_attributes.fres.vp = vp;
4217         res.loff = rdc->nfs3_cookie;
4218         res.credentials = cr;
4219 
4220         douprintf = 1;
4221 
4222         if (mi->mi_io_kstats) {
4223                 mutex_enter(&mi->mi_lock);
4224                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4225                 mutex_exit(&mi->mi_lock);
4226         }
4227 
4228         res.time = gethrtime();
4229 
4230         error = rfs3call(mi, NFSPROC3_READDIRPLUS,
4231             xdr_READDIRPLUS3args, (caddr_t)&args,
4232             xdr_READDIRPLUS3vres, (caddr_t)&res, cr,
4233             &douprintf, &res.status, 0, fip);
4234 
4235         if (mi->mi_io_kstats) {
4236                 mutex_enter(&mi->mi_lock);
4237                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4238                 mutex_exit(&mi->mi_lock);
4239         }
4240 
4241         if (error) {
4242                 goto err;
4243         }
4244 
4245         nfs3_cache_post_op_vattr(vp, &res.dir_attributes, res.time, cr);
4246 
4247         error = geterrno3(res.status);
4248         if (error) {
4249                 PURGE_STALE_FH(error, vp, cr);
4250                 if (error == EOPNOTSUPP) {
4251                         mutex_enter(&mi->mi_lock);
4252                         mi->mi_flags |= MI_READDIRONLY;
4253                         mutex_exit(&mi->mi_lock);
4254                 }
4255                 goto err;
4256         }
4257 
4258         if (mi->mi_io_kstats) {
4259                 mutex_enter(&mi->mi_lock);
4260                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4261                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4262                 mutex_exit(&mi->mi_lock);
4263         }
4264 
4265         rdc->nfs3_ncookie = res.loff;
4266         rp->r_cookieverf = res.cookieverf;
4267         rdc->eof = res.eof ? 1 : 0;
4268         rdc->entlen = res.size;
4269         ASSERT(rdc->entlen <= rdc->buflen);
4270         rdc->error = 0;
4271 
4272         return;
4273 
4274 err:
4275         kmem_free(rdc->entries, rdc->buflen);
4276         rdc->entries = NULL;
4277         rdc->error = error;
4278 }
4279 
#ifdef DEBUG
/*
 * DEBUG only: when nonzero, nfs3_bio() calls debug_enter() after it
 * has warned about a zero length write.
 */
static int nfs3_bio_do_stop = 0;
#endif
4283 
/*
 * Perform the over-the-wire read or write described by the buf `bp'.
 *
 * bp        - buffer to transfer; B_READ in b_flags selects a read,
 *             otherwise a write is done.  The file offset is derived
 *             from b_lblkno.
 * stab_comm - passed through to nfs3write() for writes.
 * cr        - caller's credentials; used when the rnode has no cached
 *             credentials, or as a second attempt when the cached
 *             ones get EACCES.
 *
 * Returns 0, an errno value, or NFS_EOF when a read transferred
 * nothing and started at or beyond r_size.  B_ERROR is set in
 * b_flags for any error other than NFS_EOF.
 */
static int
nfs3_bio(struct buf *bp, stable_how *stab_comm, cred_t *cr)
{
        rnode_t *rp = VTOR(bp->b_vp);
        int count;
        int error;
        cred_t *cred;
        offset_t offset;

        ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
        offset = ldbtob(bp->b_lblkno);

        DTRACE_IO1(start, struct buf *, bp);

        if (bp->b_flags & B_READ) {
                /*
                 * Prefer the credentials cached in the rnode.  If
                 * there are none, cache the caller's; one hold is
                 * for r_cred and one for our local use.
                 */
                mutex_enter(&rp->r_statelock);
                if (rp->r_cred != NULL) {
                        cred = rp->r_cred;
                        crhold(cred);
                } else {
                        rp->r_cred = cr;
                        crhold(cr);
                        cred = cr;
                        crhold(cred);
                }
                mutex_exit(&rp->r_statelock);
        read_again:
                error = bp->b_error = nfs3read(bp->b_vp, bp->b_un.b_addr,
                    offset, bp->b_bcount, &bp->b_resid, cred);
                /*
                 * Drop our local hold now; below `cred' is only
                 * compared against `cr', never dereferenced.
                 */
                crfree(cred);
                if (!error) {
                        if (bp->b_resid) {
                                /*
                                 * Didn't get it all because we hit EOF,
                                 * zero all the memory beyond the EOF.
                                 */
                                bzero(bp->b_un.b_addr +
                                    bp->b_bcount - bp->b_resid, bp->b_resid);
                        }
                        mutex_enter(&rp->r_statelock);
                        if (bp->b_resid == bp->b_bcount &&
                            offset >= rp->r_size) {
                                /*
                                 * We didn't read anything at all as we are
                                 * past EOF.  Return an error indicator back
                                 * but don't destroy the pages (yet).
                                 */
                                error = NFS_EOF;
                        }
                        mutex_exit(&rp->r_statelock);
                } else if (error == EACCES) {
                        /*
                         * The credentials used were refused.  If they
                         * were not the caller's own, make the caller's
                         * the cached ones and try once more with them.
                         */
                        mutex_enter(&rp->r_statelock);
                        if (cred != cr) {
                                if (rp->r_cred != NULL)
                                        crfree(rp->r_cred);
                                rp->r_cred = cr;
                                crhold(cr);
                                cred = cr;
                                crhold(cred);
                                mutex_exit(&rp->r_statelock);
                                goto read_again;
                        }
                        mutex_exit(&rp->r_statelock);
                }
        } else {
                if (!(rp->r_flags & RSTALE)) {
                        /*
                         * Same credential selection as for reads; the
                         * local hold is kept until the write path is
                         * finished.
                         */
                        mutex_enter(&rp->r_statelock);
                        if (rp->r_cred != NULL) {
                                cred = rp->r_cred;
                                crhold(cred);
                        } else {
                                rp->r_cred = cr;
                                crhold(cr);
                                cred = cr;
                                crhold(cred);
                        }
                        mutex_exit(&rp->r_statelock);
                write_again:
                        /*
                         * Never write beyond the file size recorded
                         * in the rnode.
                         */
                        mutex_enter(&rp->r_statelock);
                        count = MIN(bp->b_bcount, rp->r_size - offset);
                        mutex_exit(&rp->r_statelock);
                        if (count < 0)
                                cmn_err(CE_PANIC, "nfs3_bio: write count < 0");
#ifdef DEBUG
                        if (count == 0) {
                                zcmn_err(getzoneid(), CE_WARN,
                                    "nfs3_bio: zero length write at %lld",
                                    offset);
                                nfs_printfhandle(&rp->r_fh);
                                if (nfs3_bio_do_stop)
                                        debug_enter("nfs3_bio");
                        }
#endif
                        error = nfs3write(bp->b_vp, bp->b_un.b_addr, offset,
                            count, cred, stab_comm);
                        if (error == EACCES) {
                                /*
                                 * Retry with the caller's credentials
                                 * if those were not the ones just used.
                                 */
                                mutex_enter(&rp->r_statelock);
                                if (cred != cr) {
                                        if (rp->r_cred != NULL)
                                                crfree(rp->r_cred);
                                        rp->r_cred = cr;
                                        crhold(cr);
                                        crfree(cred);
                                        cred = cr;
                                        crhold(cred);
                                        mutex_exit(&rp->r_statelock);
                                        goto write_again;
                                }
                                mutex_exit(&rp->r_statelock);
                        }
                        bp->b_error = error;
                        if (error && error != EINTR) {
                                /*
                                 * Don't print EDQUOT errors on the console.
                                 * Don't print asynchronous EACCES errors.
                                 * Don't print EFBIG errors.
                                 * Print all other write errors.
                                 */
                                if (error != EDQUOT && error != EFBIG &&
                                    (error != EACCES ||
                                    !(bp->b_flags & B_ASYNC)))
                                        nfs_write_error(bp->b_vp, error, cred);
                                /*
                                 * Update r_error and r_flags as appropriate.
                                 * If the error was ESTALE, then mark the
                                 * rnode as not being writeable and save
                                 * the error status.  Otherwise, save any
                                 * errors which occur from asynchronous
                                 * page invalidations.  Any errors occurring
                                 * from other operations should be saved
                                 * by the caller.
                                 */
                                mutex_enter(&rp->r_statelock);
                                if (error == ESTALE) {
                                        rp->r_flags |= RSTALE;
                                        if (!rp->r_error)
                                                rp->r_error = error;
                                } else if (!rp->r_error &&
                                    (bp->b_flags &
                                    (B_INVAL|B_FORCE|B_ASYNC)) ==
                                    (B_INVAL|B_FORCE|B_ASYNC)) {
                                        rp->r_error = error;
                                }
                                mutex_exit(&rp->r_statelock);
                        }
                        /* Done with the credentials held for the write. */
                        crfree(cred);
                } else {
                        /* The rnode is marked stale; no write is sent. */
                        error = rp->r_error;
                        /*
                         * A close may have cleared r_error, if so,
                         * propagate ESTALE error return properly
                         */
                        if (error == 0)
                                error = ESTALE;
                }
        }

        /* NFS_EOF is an indicator for the caller, not an I/O error. */
        if (error != 0 && error != NFS_EOF)
                bp->b_flags |= B_ERROR;

        DTRACE_IO1(done, struct buf *, bp);

        return (error);
}
4449 
4450 /* ARGSUSED */
4451 static int
4452 nfs3_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4453 {
4454         rnode_t *rp;
4455 
4456         if (nfs_zone() != VTOMI(vp)->mi_zone)
4457                 return (EIO);
4458         rp = VTOR(vp);
4459 
4460         if (fidp->fid_len < (ushort_t)rp->r_fh.fh_len) {
4461                 fidp->fid_len = rp->r_fh.fh_len;
4462                 return (ENOSPC);
4463         }
4464         fidp->fid_len = rp->r_fh.fh_len;
4465         bcopy(rp->r_fh.fh_buf, fidp->fid_data, fidp->fid_len);
4466         return (0);
4467 }
4468 
4469 /* ARGSUSED2 */
4470 static int
4471 nfs3_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4472 {
4473         rnode_t *rp = VTOR(vp);
4474 
4475         if (!write_lock) {
4476                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4477                 return (V_WRITELOCK_FALSE);
4478         }
4479 
4480         if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
4481                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4482                 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
4483                         return (V_WRITELOCK_FALSE);
4484                 nfs_rw_exit(&rp->r_rwlock);
4485         }
4486 
4487         (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
4488         return (V_WRITELOCK_TRUE);
4489 }
4490 
4491 /* ARGSUSED */
4492 static void
4493 nfs3_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4494 {
4495         rnode_t *rp = VTOR(vp);
4496 
4497         nfs_rw_exit(&rp->r_rwlock);
4498 }
4499 
4500 /* ARGSUSED */
4501 static int
4502 nfs3_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4503 {
4504 
4505         /*
4506          * Because we stuff the readdir cookie into the offset field
4507          * someone may attempt to do an lseek with the cookie which
4508          * we want to succeed.
4509          */
4510         if (vp->v_type == VDIR)
4511                 return (0);
4512         if (*noffp < 0)
4513                 return (EINVAL);
4514         return (0);
4515 }
4516 
/*
 * number of nfs3_bsize blocks to read ahead.
 *
 * NOTE(review): no reader of this variable is visible in this part of
 * the file; presumably the page read path consults it -- confirm.
 */
static int nfs3_nra = 4;

#ifdef DEBUG
/*
 * DEBUG only counter.
 * NOTE(review): the code that increments it is not visible in this
 * part of the file -- confirm where it is maintained.
 */
static int nfs3_lostpage = 0;   /* number of times we lost original page */
#endif
4525 
4526 /*
4527  * Return all the pages from [off..off+len) in file
4528  */
4529 /* ARGSUSED */
4530 static int
4531 nfs3_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4532         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4533         enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4534 {
4535         rnode_t *rp;
4536         int error;
4537         mntinfo_t *mi;
4538 
4539         if (vp->v_flag & VNOMAP)
4540                 return (ENOSYS);
4541 
4542         if (nfs_zone() != VTOMI(vp)->mi_zone)
4543                 return (EIO);
4544         if (protp != NULL)
4545                 *protp = PROT_ALL;
4546 
4547         /*
4548          * Now valididate that the caches are up to date.
4549          */
4550         error = nfs3_validate_caches(vp, cr);
4551         if (error)
4552                 return (error);
4553 
4554         rp = VTOR(vp);
4555         mi = VTOMI(vp);
4556 retry:
4557         mutex_enter(&rp->r_statelock);
4558 
4559         /*
4560          * Don't create dirty pages faster than they
4561          * can be cleaned so that the system doesn't
4562          * get imbalanced.  If the async queue is
4563          * maxed out, then wait for it to drain before
4564          * creating more dirty pages.  Also, wait for
4565          * any threads doing pagewalks in the vop_getattr
4566          * entry points so that they don't block for
4567          * long periods.
4568          */
4569         if (rw == S_CREATE) {
4570                 while ((mi->mi_max_threads != 0 &&
4571                     rp->r_awcount > 2 * mi->mi_max_threads) ||
4572                     rp->r_gcount > 0)
4573                         cv_wait(&rp->r_cv, &rp->r_statelock);
4574         }
4575 
4576         /*
4577          * If we are getting called as a side effect of an nfs_write()
4578          * operation the local file size might not be extended yet.
4579          * In this case we want to be able to return pages of zeroes.
4580          */
4581         if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
4582                 mutex_exit(&rp->r_statelock);
4583                 return (EFAULT);                /* beyond EOF */
4584         }
4585 
4586         mutex_exit(&rp->r_statelock);
4587 
4588         if (len <= PAGESIZE) {
4589                 error = nfs3_getapage(vp, off, len, protp, pl, plsz,
4590                     seg, addr, rw, cr);
4591         } else {
4592                 error = pvn_getpages(nfs3_getapage, vp, off, len, protp,
4593                     pl, plsz, seg, addr, rw, cr);
4594         }
4595 
4596         switch (error) {
4597         case NFS_EOF:
4598                 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
4599                 goto retry;
4600         case ESTALE:
4601                 PURGE_STALE_FH(error, vp, cr);
4602         }
4603 
4604         return (error);
4605 }
4606 
/*
 * Called from pvn_getpages or nfs3_getpage to get a particular page.
 *
 * Outline, as implemented below:
 *   - Fails with EIO when the caller's zone is not the mount's zone.
 *   - For block-aligned, non-S_CREATE requests on vnodes without
 *     VNOCACHE, queues up to nfs3_nra asynchronous readaheads first.
 *   - If the page at off is not yet in the page cache:
 *       pl == NULL      only an asynchronous readahead is queued;
 *       rw == S_CREATE  a new page is created without going to the
 *                       server;
 *       otherwise       a kluster of pages is read with nfs3_bio(), or
 *                       zero-filled when the i/o starts at or past
 *                       r_size and seg is segkmap.
 *   - If the page is already cached it is looked up with SE_EXCL for
 *     S_CREATE and SE_SHARED otherwise; if that lookup fails the whole
 *     procedure restarts.
 *
 * Returns 0 on success (with pl[] filled in when pl is non-NULL) or an
 * errno value; NFS_EOF from nfs3_bio() is converted to 0 for segkmap and
 * to EFAULT for any other segment.
 */
/* ARGSUSED */
static int
nfs3_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
        page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
        enum seg_rw rw, cred_t *cr)
{
        rnode_t *rp;
        uint_t bsize;
        struct buf *bp;
        page_t *pp;
        u_offset_t lbn;
        u_offset_t io_off;
        u_offset_t blkoff;
        u_offset_t rablkoff;
        size_t io_len;
        uint_t blksize;
        int error;
        int readahead;
        int readahead_issued = 0;
        int ra_window; /* readahead window */
        page_t *pagefound;
        page_t *savepp;

        if (nfs_zone() != VTOMI(vp)->mi_zone)
                return (EIO);
        rp = VTOR(vp);
        /* the block size used here is never smaller than a page */
        bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

        /* restart point when a cached page vanished before it was locked */
reread:
        bp = NULL;
        pp = NULL;
        pagefound = NULL;

        if (pl != NULL)
                pl[0] = NULL;

        error = 0;
        /* blkoff is off rounded down to the start of its block */
        lbn = off / bsize;
        blkoff = lbn * bsize;

        /*
         * Queueing up the readahead before doing the synchronous read
         * results in a significant increase in read throughput because
         * of the increased parallelism between the async threads and
         * the process context.
         */
        if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
            rw != S_CREATE &&
            !(vp->v_flag & VNOCACHE)) {
                mutex_enter(&rp->r_statelock);

                /*
                 * Calculate the number of readaheads to do.
                 * a) No readaheads at offset = 0.
                 * b) Do maximum(nfs3_nra) readaheads when the readahead
                 *    window is closed.
                 * c) Do readaheads between 1 to (nfs3_nra - 1) depending
                 *    upon how far the readahead window is open or close.
                 * d) No readaheads if rp->r_nextr is not within the scope
                 *    of the readahead window (random i/o).
                 */

                if (off == 0)
                        readahead = 0;
                else if (blkoff == rp->r_nextr)
                        readahead = nfs3_nra;
                else if (rp->r_nextr > blkoff &&
                    ((ra_window = (rp->r_nextr - blkoff) / bsize)
                    <= (nfs3_nra - 1)))
                        readahead = nfs3_nra - ra_window;
                else
                        readahead = 0;

                /*
                 * r_statelock is held whenever the loop condition is
                 * evaluated and is dropped around each call to
                 * nfs_async_readahead().
                 */
                rablkoff = rp->r_nextr;
                while (readahead > 0 && rablkoff + bsize < rp->r_size) {
                        mutex_exit(&rp->r_statelock);
                        if (nfs_async_readahead(vp, rablkoff + bsize,
                            addr + (rablkoff + bsize - off), seg, cr,
                            nfs3_readahead) < 0) {
                                /* retake the lock so the exit below balances */
                                mutex_enter(&rp->r_statelock);
                                break;
                        }
                        readahead--;
                        rablkoff += bsize;
                        /*
                         * Indicate that we did a readahead so
                         * readahead offset is not updated
                         * by the synchronous read below.
                         */
                        readahead_issued = 1;
                        mutex_enter(&rp->r_statelock);
                        /*
                         * set readahead offset to
                         * offset of last async readahead
                         * request.
                         */
                        rp->r_nextr = rablkoff;
                }
                mutex_exit(&rp->r_statelock);
        }

        /* retry point when pvn_read_kluster() could not supply pages */
again:
        if ((pagefound = page_exists(vp, off)) == NULL) {
                if (pl == NULL) {
                        /* no page list wanted: just queue an async read */
                        (void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
                            nfs3_readahead);
                } else if (rw == S_CREATE) {
                        /*
                         * Block for this page is not allocated, or the offset
                         * is beyond the current allocation size, or we're
                         * allocating a swap slot and the page was not found,
                         * so allocate it and return a zero page.
                         */
                        if ((pp = page_create_va(vp, off,
                            PAGESIZE, PG_WAIT, seg, addr)) == NULL)
                                cmn_err(CE_PANIC, "nfs3_getapage: page_create");
                        io_len = PAGESIZE;
                        mutex_enter(&rp->r_statelock);
                        rp->r_nextr = off + PAGESIZE;
                        mutex_exit(&rp->r_statelock);
                } else {
                        /*
                         * Need to go to server to get a BLOCK, exception to
                         * that being while reading at offset = 0 or doing
                         * random i/o, in that case read only a PAGE.
                         */
                        mutex_enter(&rp->r_statelock);
                        if (blkoff < rp->r_size &&
                            blkoff + bsize >= rp->r_size) {
                                /*
                                 * If only a block or less is left in
                                 * the file, read all that is remaining.
                                 */
                                if (rp->r_size <= off) {
                                        /*
                                         * Trying to access beyond EOF,
                                         * set up to get at least one page.
                                         */
                                        blksize = off + PAGESIZE - blkoff;
                                } else
                                        blksize = rp->r_size - blkoff;
                        } else if ((off == 0) ||
                            (off != rp->r_nextr && !readahead_issued)) {
                                blksize = PAGESIZE;
                                blkoff = off; /* block = page here */
                        } else
                                blksize = bsize;
                        mutex_exit(&rp->r_statelock);

                        pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
                            &io_len, blkoff, blksize, 0);

                        /*
                         * Some other thread has entered the page,
                         * so just use it.
                         */
                        if (pp == NULL)
                                goto again;

                        /*
                         * Now round the request size up to page boundaries.
                         * This ensures that the entire page will be
                         * initialized to zeroes if EOF is encountered.
                         */
                        io_len = ptob(btopr(io_len));

                        bp = pageio_setup(pp, io_len, vp, B_READ);
                        ASSERT(bp != NULL);

                        /*
                         * pageio_setup should have set b_addr to 0.  This
                         * is correct since we want to do I/O on a page
                         * boundary.  bp_mapin will use this addr to calculate
                         * an offset, and then set b_addr to the kernel virtual
                         * address it allocated for us.
                         */
                        ASSERT(bp->b_un.b_addr == 0);

                        bp->b_edev = 0;
                        bp->b_dev = 0;
                        bp->b_lblkno = lbtodb(io_off);
                        bp->b_file = vp;
                        bp->b_offset = (offset_t)off;
                        bp_mapin(bp);

                        /*
                         * If doing a write beyond what we believe is EOF,
                         * don't bother trying to read the pages from the
                         * server, we'll just zero the pages here.  We
                         * don't check that the rw flag is S_WRITE here
                         * because some implementations may attempt a
                         * read access to the buffer before copying data.
                         */
                        mutex_enter(&rp->r_statelock);
                        if (io_off >= rp->r_size && seg == segkmap) {
                                mutex_exit(&rp->r_statelock);
                                bzero(bp->b_un.b_addr, io_len);
                        } else {
                                mutex_exit(&rp->r_statelock);
                                error = nfs3_bio(bp, NULL, cr);
                        }

                        /*
                         * Unmap the buffer before freeing it.
                         */
                        bp_mapout(bp);
                        pageio_done(bp);

                        /*
                         * Tag every page on the circular p_next list with
                         * C_NOCOMMIT; the walk ends with pp back at the
                         * page it started from.
                         */
                        savepp = pp;
                        do {
                                pp->p_fsdata = C_NOCOMMIT;
                        } while ((pp = pp->p_next) != savepp);

                        if (error == NFS_EOF) {
                                /*
                                 * If doing a write system call just return
                                 * zeroed pages, else user tried to get pages
                                 * beyond EOF, return error.  We don't check
                                 * that the rw flag is S_WRITE here because
                                 * some implementations may attempt a read
                                 * access to the buffer before copying data.
                                 */
                                if (seg == segkmap)
                                        error = 0;
                                else
                                        error = EFAULT;
                        }

                        /*
                         * Advance the readahead offset past this i/o only
                         * when no async readahead already moved it above.
                         */
                        if (!readahead_issued && !error) {
                                mutex_enter(&rp->r_statelock);
                                rp->r_nextr = io_off + io_len;
                                mutex_exit(&rp->r_statelock);
                        }
                }
        }

        /* NOTE(review): no goto in this function targets this label. */
out:
        if (pl == NULL)
                return (error);

        if (error) {
                /* hand the pages that were set up for i/o back as failed */
                if (pp != NULL)
                        pvn_read_done(pp, B_ERROR);
                return (error);
        }

        if (pagefound) {
                se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

                /*
                 * Page exists in the cache, acquire the appropriate lock.
                 * If this fails, start all over again.
                 */
                if ((pp = page_lookup(vp, off, se)) == NULL) {
#ifdef DEBUG
                        nfs3_lostpage++;
#endif
                        goto reread;
                }
                pl[0] = pp;
                pl[1] = NULL;
                return (0);
        }

        /* pages were created or read above: build the caller's page list */
        if (pp != NULL)
                pvn_plist_init(pp, pl, plsz, off, io_len, rw);

        return (error);
}
4879 
4880 static void
4881 nfs3_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
4882         cred_t *cr)
4883 {
4884         int error;
4885         page_t *pp;
4886         u_offset_t io_off;
4887         size_t io_len;
4888         struct buf *bp;
4889         uint_t bsize, blksize;
4890         rnode_t *rp = VTOR(vp);
4891         page_t *savepp;
4892 
4893         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4894         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4895 
4896         mutex_enter(&rp->r_statelock);
4897         if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
4898                 /*
4899                  * If less than a block left in file read less
4900                  * than a block.
4901                  */
4902                 blksize = rp->r_size - blkoff;
4903         } else
4904                 blksize = bsize;
4905         mutex_exit(&rp->r_statelock);
4906 
4907         pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
4908             &io_off, &io_len, blkoff, blksize, 1);
4909         /*
4910          * The isra flag passed to the kluster function is 1, we may have
4911          * gotten a return value of NULL for a variety of reasons (# of free
4912          * pages < minfree, someone entered the page on the vnode etc). In all
4913          * cases, we want to punt on the readahead.
4914          */
4915         if (pp == NULL)
4916                 return;
4917 
4918         /*
4919          * Now round the request size up to page boundaries.
4920          * This ensures that the entire page will be
4921          * initialized to zeroes if EOF is encountered.
4922          */
4923         io_len = ptob(btopr(io_len));
4924 
4925         bp = pageio_setup(pp, io_len, vp, B_READ);
4926         ASSERT(bp != NULL);
4927 
4928         /*
4929          * pageio_setup should have set b_addr to 0.  This is correct since
4930          * we want to do I/O on a page boundary. bp_mapin() will use this addr
4931          * to calculate an offset, and then set b_addr to the kernel virtual
4932          * address it allocated for us.
4933          */
4934         ASSERT(bp->b_un.b_addr == 0);
4935 
4936         bp->b_edev = 0;
4937         bp->b_dev = 0;
4938         bp->b_lblkno = lbtodb(io_off);
4939         bp->b_file = vp;
4940         bp->b_offset = (offset_t)blkoff;
4941         bp_mapin(bp);
4942 
4943         /*
4944          * If doing a write beyond what we believe is EOF, don't bother trying
4945          * to read the pages from the server, we'll just zero the pages here.
4946          * We don't check that the rw flag is S_WRITE here because some
4947          * implementations may attempt a read access to the buffer before
4948          * copying data.
4949          */
4950         mutex_enter(&rp->r_statelock);
4951         if (io_off >= rp->r_size && seg == segkmap) {
4952                 mutex_exit(&rp->r_statelock);
4953                 bzero(bp->b_un.b_addr, io_len);
4954                 error = 0;
4955         } else {
4956                 mutex_exit(&rp->r_statelock);
4957                 error = nfs3_bio(bp, NULL, cr);
4958                 if (error == NFS_EOF)
4959                         error = 0;
4960         }
4961 
4962         /*
4963          * Unmap the buffer before freeing it.
4964          */
4965         bp_mapout(bp);
4966         pageio_done(bp);
4967 
4968         savepp = pp;
4969         do {
4970                 pp->p_fsdata = C_NOCOMMIT;
4971         } while ((pp = pp->p_next) != savepp);
4972 
4973         pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4974 
4975         /*
4976          * In case of error set readahead offset
4977          * to the lowest offset.
4978          * pvn_read_done() calls VN_DISPOSE to destroy the pages
4979          */
4980         if (error && rp->r_nextr > io_off) {
4981                 mutex_enter(&rp->r_statelock);
4982                 if (rp->r_nextr > io_off)
4983                         rp->r_nextr = io_off;
4984                 mutex_exit(&rp->r_statelock);
4985         }
4986 }
4987 
4988 /*
4989  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4990  * If len == 0, do from off to EOF.
4991  *
4992  * The normal cases should be len == 0 && off == 0 (entire vp list),
4993  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4994  * (from pageout).
4995  */
4996 /* ARGSUSED */
4997 static int
4998 nfs3_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4999         caller_context_t *ct)
5000 {
5001         int error;
5002         rnode_t *rp;
5003 
5004         ASSERT(cr != NULL);
5005 
5006         /*
5007          * XXX - Why should this check be made here?
5008          */
5009         if (vp->v_flag & VNOMAP)
5010                 return (ENOSYS);
5011         if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
5012                 return (0);
5013         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5014                 return (EIO);
5015 
5016         rp = VTOR(vp);
5017         mutex_enter(&rp->r_statelock);
5018         rp->r_count++;
5019         mutex_exit(&rp->r_statelock);
5020         error = nfs_putpages(vp, off, len, flags, cr);
5021         mutex_enter(&rp->r_statelock);
5022         rp->r_count--;
5023         cv_broadcast(&rp->r_cv);
5024         mutex_exit(&rp->r_statelock);
5025 
5026         return (error);
5027 }
5028 
5029 /*
5030  * Write out a single page, possibly klustering adjacent dirty pages.
5031  */
5032 int
5033 nfs3_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
5034         int flags, cred_t *cr)
5035 {
5036         u_offset_t io_off;
5037         u_offset_t lbn_off;
5038         u_offset_t lbn;
5039         size_t io_len;
5040         uint_t bsize;
5041         int error;
5042         rnode_t *rp;
5043 
5044         ASSERT(!vn_is_readonly(vp));
5045         ASSERT(pp != NULL);
5046         ASSERT(cr != NULL);
5047         ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
5048 
5049         rp = VTOR(vp);
5050         ASSERT(rp->r_count > 0);
5051 
5052         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
5053         lbn = pp->p_offset / bsize;
5054         lbn_off = lbn * bsize;
5055 
5056         /*
5057          * Find a kluster that fits in one block, or in
5058          * one page if pages are bigger than blocks.  If
5059          * there is less file space allocated than a whole
5060          * page, we'll shorten the i/o request below.
5061          */
5062         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
5063             roundup(bsize, PAGESIZE), flags);
5064 
5065         /*
5066          * pvn_write_kluster shouldn't have returned a page with offset
5067          * behind the original page we were given.  Verify that.
5068          */
5069         ASSERT((pp->p_offset / bsize) >= lbn);
5070 
5071         /*
5072          * Now pp will have the list of kept dirty pages marked for
5073          * write back.  It will also handle invalidation and freeing
5074          * of pages that are not dirty.  Check for page length rounding
5075          * problems.
5076          */
5077         if (io_off + io_len > lbn_off + bsize) {
5078                 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
5079                 io_len = lbn_off + bsize - io_off;
5080         }
5081         /*
5082          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5083          * consistent value of r_size. RMODINPROGRESS is set in writerp().
5084          * When RMODINPROGRESS is set it indicates that a uiomove() is in
5085          * progress and the r_size has not been made consistent with the
5086          * new size of the file. When the uiomove() completes the r_size is
5087          * updated and the RMODINPROGRESS flag is cleared.
5088          *
5089          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5090          * consistent value of r_size. Without this handshaking, it is
5091          * possible that nfs(3)_bio() picks  up the old value of r_size
5092          * before the uiomove() in writerp() completes. This will result
5093          * in the write through nfs(3)_bio() being dropped.
5094          *
5095          * More precisely, there is a window between the time the uiomove()
5096          * completes and the time the r_size is updated. If a VOP_PUTPAGE()
5097          * operation intervenes in this window, the page will be picked up,
5098          * because it is dirty (it will be unlocked, unless it was
5099          * pagecreate'd). When the page is picked up as dirty, the dirty
5100          * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
5101          * checked. This will still be the old size. Therefore the page will
5102          * not be written out. When segmap_release() calls VOP_PUTPAGE(),
5103          * the page will be found to be clean and the write will be dropped.
5104          */
5105         if (rp->r_flags & RMODINPROGRESS) {
5106                 mutex_enter(&rp->r_statelock);
5107                 if ((rp->r_flags & RMODINPROGRESS) &&
5108                     rp->r_modaddr + MAXBSIZE > io_off &&
5109                     rp->r_modaddr < io_off + io_len) {
5110                         page_t *plist;
5111                         /*
5112                          * A write is in progress for this region of the file.
5113                          * If we did not detect RMODINPROGRESS here then this
5114                          * path through nfs_putapage() would eventually go to
5115                          * nfs(3)_bio() and may not write out all of the data
5116                          * in the pages. We end up losing data. So we decide
5117                          * to set the modified bit on each page in the page
5118                          * list and mark the rnode with RDIRTY. This write
5119                          * will be restarted at some later time.
5120                          */
5121                         plist = pp;
5122                         while (plist != NULL) {
5123                                 pp = plist;
5124                                 page_sub(&plist, pp);
5125                                 hat_setmod(pp);
5126                                 page_io_unlock(pp);
5127                                 page_unlock(pp);
5128                         }
5129                         rp->r_flags |= RDIRTY;
5130                         mutex_exit(&rp->r_statelock);
5131                         if (offp)
5132                                 *offp = io_off;
5133                         if (lenp)
5134                                 *lenp = io_len;
5135                         return (0);
5136                 }
5137                 mutex_exit(&rp->r_statelock);
5138         }
5139 
5140         if (flags & B_ASYNC) {
5141                 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
5142                     nfs3_sync_putapage);
5143         } else
5144                 error = nfs3_sync_putapage(vp, pp, io_off, io_len, flags, cr);
5145 
5146         if (offp)
5147                 *offp = io_off;
5148         if (lenp)
5149                 *lenp = io_len;
5150         return (error);
5151 }
5152 
5153 static int
5154 nfs3_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5155         int flags, cred_t *cr)
5156 {
5157         int error;
5158         rnode_t *rp;
5159 
5160         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5161 
5162         flags |= B_WRITE;
5163 
5164         error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5165 
5166         rp = VTOR(vp);
5167 
5168         if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
5169             error == EACCES) &&
5170             (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
5171                 if (!(rp->r_flags & ROUTOFSPACE)) {
5172                         mutex_enter(&rp->r_statelock);
5173                         rp->r_flags |= ROUTOFSPACE;
5174                         mutex_exit(&rp->r_statelock);
5175                 }
5176                 flags |= B_ERROR;
5177                 pvn_write_done(pp, flags);
5178                 /*
5179                  * If this was not an async thread, then try again to
5180                  * write out the pages, but this time, also destroy
5181                  * them whether or not the write is successful.  This
5182                  * will prevent memory from filling up with these
5183                  * pages and destroying them is the only alternative
5184                  * if they can't be written out.
5185                  *
5186                  * Don't do this if this is an async thread because
5187                  * when the pages are unlocked in pvn_write_done,
5188                  * some other thread could have come along, locked
5189                  * them, and queued for an async thread.  It would be
5190                  * possible for all of the async threads to be tied
5191                  * up waiting to lock the pages again and they would
5192                  * all already be locked and waiting for an async
5193                  * thread to handle them.  Deadlock.
5194                  */
5195                 if (!(flags & B_ASYNC)) {
5196                         error = nfs3_putpage(vp, io_off, io_len,
5197                             B_INVAL | B_FORCE, cr, NULL);
5198                 }
5199         } else {
5200                 if (error)
5201                         flags |= B_ERROR;
5202                 else if (rp->r_flags & ROUTOFSPACE) {
5203                         mutex_enter(&rp->r_statelock);
5204                         rp->r_flags &= ~ROUTOFSPACE;
5205                         mutex_exit(&rp->r_statelock);
5206                 }
5207                 pvn_write_done(pp, flags);
5208                 if (freemem < desfree)
5209                         (void) nfs3_commit_vp(vp, (u_offset_t)0, 0, cr);
5210         }
5211 
5212         return (error);
5213 }
5214 
5215 /* ARGSUSED */
5216 static int
5217 nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5218         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5219         cred_t *cr, caller_context_t *ct)
5220 {
5221         struct segvn_crargs vn_a;
5222         int error;
5223         rnode_t *rp;
5224         struct vattr va;
5225 
5226         if (nfs_zone() != VTOMI(vp)->mi_zone)
5227                 return (EIO);
5228 
5229         if (vp->v_flag & VNOMAP)
5230                 return (ENOSYS);
5231 
5232         if (off < 0 || off + len < 0)
5233                 return (ENXIO);
5234 
5235         if (vp->v_type != VREG)
5236                 return (ENODEV);
5237 
5238         /*
5239          * If there is cached data and if close-to-open consistency
5240          * checking is not turned off and if the file system is not
5241          * mounted readonly, then force an over the wire getattr.
5242          * Otherwise, just invoke nfs3getattr to get a copy of the
5243          * attributes.  The attribute cache will be used unless it
5244          * is timed out and if it is, then an over the wire getattr
5245          * will be issued.
5246          */
5247         va.va_mask = AT_ALL;
5248         if (vn_has_cached_data(vp) &&
5249             !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
5250                 error = nfs3_getattr_otw(vp, &va, cr);
5251         else
5252                 error = nfs3getattr(vp, &va, cr);
5253         if (error)
5254                 return (error);
5255 
5256         /*
5257          * Check to see if the vnode is currently marked as not cachable.
5258          * This means portions of the file are locked (through VOP_FRLOCK).
5259          * In this case the map request must be refused.  We use
5260          * rp->r_lkserlock to avoid a race with concurrent lock requests.
5261          */
5262         rp = VTOR(vp);
5263 
5264         /*
5265          * Atomically increment r_inmap after acquiring r_rwlock. The
5266          * idea here is to acquire r_rwlock to block read/write and
5267          * not to protect r_inmap. r_inmap will inform nfs3_read/write()
5268          * that we are in nfs3_map(). Now, r_rwlock is acquired in order
5269          * and we can prevent the deadlock that would have occurred
5270          * when nfs3_addmap() would have acquired it out of order.
5271          *
5272          * Since we are not protecting r_inmap by any lock, we do not
5273          * hold any lock when we decrement it. We atomically decrement
5274          * r_inmap after we release r_lkserlock.
5275          */
5276 
5277         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
5278                 return (EINTR);
5279         atomic_inc_uint(&rp->r_inmap);
5280         nfs_rw_exit(&rp->r_rwlock);
5281 
5282         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
5283                 atomic_dec_uint(&rp->r_inmap);
5284                 return (EINTR);
5285         }
5286 
5287         if (vp->v_flag & VNOCACHE) {
5288                 error = EAGAIN;
5289                 goto done;
5290         }
5291 
5292         /*
5293          * Don't allow concurrent locks and mapping if mandatory locking is
5294          * enabled.
5295          */
5296         if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
5297             MANDLOCK(vp, va.va_mode)) {
5298                 error = EAGAIN;
5299                 goto done;
5300         }
5301 
5302         as_rangelock(as);
5303         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5304         if (error != 0) {
5305                 as_rangeunlock(as);
5306                 goto done;
5307         }
5308 
5309         vn_a.vp = vp;
5310         vn_a.offset = off;
5311         vn_a.type = (flags & MAP_TYPE);
5312         vn_a.prot = (uchar_t)prot;
5313         vn_a.maxprot = (uchar_t)maxprot;
5314         vn_a.flags = (flags & ~MAP_TYPE);
5315         vn_a.cred = cr;
5316         vn_a.amp = NULL;
5317         vn_a.szc = 0;
5318         vn_a.lgrp_mem_policy_flags = 0;
5319 
5320         error = as_map(as, *addrp, len, segvn_create, &vn_a);
5321         as_rangeunlock(as);
5322 
5323 done:
5324         nfs_rw_exit(&rp->r_lkserlock);
5325         atomic_dec_uint(&rp->r_inmap);
5326         return (error);
5327 }
5328 
5329 /* ARGSUSED */
5330 static int
5331 nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5332         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5333         cred_t *cr, caller_context_t *ct)
5334 {
5335         rnode_t *rp;
5336 
5337         if (vp->v_flag & VNOMAP)
5338                 return (ENOSYS);
5339         if (nfs_zone() != VTOMI(vp)->mi_zone)
5340                 return (EIO);
5341 
5342         rp = VTOR(vp);
5343         atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
5344 
5345         return (0);
5346 }
5347 
5348 /* ARGSUSED */
5349 static int
5350 nfs3_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5351         offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
5352         caller_context_t *ct)
5353 {
5354         netobj lm_fh3;
5355         int rc;
5356         u_offset_t start, end;
5357         rnode_t *rp;
5358         int error = 0, intr = INTR(vp);
5359 
5360         if (nfs_zone() != VTOMI(vp)->mi_zone)
5361                 return (EIO);
5362         /* check for valid cmd parameter */
5363         if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
5364                 return (EINVAL);
5365 
5366         /* Verify l_type. */
5367         switch (bfp->l_type) {
5368         case F_RDLCK:
5369                 if (cmd != F_GETLK && !(flag & FREAD))
5370                         return (EBADF);
5371                 break;
5372         case F_WRLCK:
5373                 if (cmd != F_GETLK && !(flag & FWRITE))
5374                         return (EBADF);
5375                 break;
5376         case F_UNLCK:
5377                 intr = 0;
5378                 break;
5379 
5380         default:
5381                 return (EINVAL);
5382         }
5383 
5384         /* check the validity of the lock range */
5385         if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
5386                 return (rc);
5387         if (rc = flk_check_lock_data(start, end, MAXEND))
5388                 return (rc);
5389 
5390         /*
5391          * If the filesystem is mounted using local locking, pass the
5392          * request off to the local locking code.
5393          */
5394         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
5395                 if (cmd == F_SETLK || cmd == F_SETLKW) {
5396                         /*
5397                          * For complete safety, we should be holding
5398                          * r_lkserlock.  However, we can't call
5399                          * lm_safelock and then fs_frlock while
5400                          * holding r_lkserlock, so just invoke
5401                          * lm_safelock and expect that this will
5402                          * catch enough of the cases.
5403                          */
5404                         if (!lm_safelock(vp, bfp, cr))
5405                                 return (EAGAIN);
5406                 }
5407                 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
5408         }
5409 
5410         rp = VTOR(vp);
5411 
5412         /*
5413          * Check whether the given lock request can proceed, given the
5414          * current file mappings.
5415          */
5416         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
5417                 return (EINTR);
5418         if (cmd == F_SETLK || cmd == F_SETLKW) {
5419                 if (!lm_safelock(vp, bfp, cr)) {
5420                         rc = EAGAIN;
5421                         goto done;
5422                 }
5423         }
5424 
5425         /*
5426          * Flush the cache after waiting for async I/O to finish.  For new
5427          * locks, this is so that the process gets the latest bits from the
5428          * server.  For unlocks, this is so that other clients see the
5429          * latest bits once the file has been unlocked.  If currently dirty
5430          * pages can't be flushed, then don't allow a lock to be set.  But
5431          * allow unlocks to succeed, to avoid having orphan locks on the
5432          * server.
5433          */
5434         if (cmd != F_GETLK) {
5435                 mutex_enter(&rp->r_statelock);
5436                 while (rp->r_count > 0) {
5437                         if (intr) {
5438                                 klwp_t *lwp = ttolwp(curthread);
5439 
5440                                 if (lwp != NULL)
5441                                         lwp->lwp_nostop++;
5442                                 if (cv_wait_sig(&rp->r_cv,
5443                                     &rp->r_statelock) == 0) {
5444                                         if (lwp != NULL)
5445                                                 lwp->lwp_nostop--;
5446                                         rc = EINTR;
5447                                         break;
5448                                 }
5449                                 if (lwp != NULL)
5450                                         lwp->lwp_nostop--;
5451                         } else
5452                                 cv_wait(&rp->r_cv, &rp->r_statelock);
5453                 }
5454                 mutex_exit(&rp->r_statelock);
5455                 if (rc != 0)
5456                         goto done;
5457                 error = nfs3_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
5458                 if (error) {
5459                         if (error == ENOSPC || error == EDQUOT) {
5460                                 mutex_enter(&rp->r_statelock);
5461                                 if (!rp->r_error)
5462                                         rp->r_error = error;
5463                                 mutex_exit(&rp->r_statelock);
5464                         }
5465                         if (bfp->l_type != F_UNLCK) {
5466                                 rc = ENOLCK;
5467                                 goto done;
5468                         }
5469                 }
5470         }
5471 
5472         lm_fh3.n_len = VTOFH3(vp)->fh3_length;
5473         lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
5474 
5475         /*
5476          * Call the lock manager to do the real work of contacting
5477          * the server and obtaining the lock.
5478          */
5479         rc = lm4_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh3, flk_cbp);
5480 
5481         if (rc == 0)
5482                 nfs_lockcompletion(vp, cmd);
5483 
5484 done:
5485         nfs_rw_exit(&rp->r_lkserlock);
5486         return (rc);
5487 }
5488 
5489 /*
5490  * Free storage space associated with the specified vnode.  The portion
5491  * to be freed is specified by bfp->l_start and bfp->l_len (already
5492  * normalized to a "whence" of 0).
5493  *
5494  * This is an experimental facility whose continued existence is not
5495  * guaranteed.  Currently, we only support the special case
5496  * of l_len == 0, meaning free to end of file.
5497  */
5498 /* ARGSUSED */
5499 static int
5500 nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5501         offset_t offset, cred_t *cr, caller_context_t *ct)
5502 {
5503         int error;
5504 
5505         ASSERT(vp->v_type == VREG);
5506         if (cmd != F_FREESP)
5507                 return (EINVAL);
5508         if (nfs_zone() != VTOMI(vp)->mi_zone)
5509                 return (EIO);
5510 
5511         error = convoff(vp, bfp, 0, offset);
5512         if (!error) {
5513                 ASSERT(bfp->l_start >= 0);
5514                 if (bfp->l_len == 0) {
5515                         struct vattr va;
5516 
5517                         /*
5518                          * ftruncate should not change the ctime and
5519                          * mtime if we truncate the file to its
5520                          * previous size.
5521                          */
5522                         va.va_mask = AT_SIZE;
5523                         error = nfs3getattr(vp, &va, cr);
5524                         if (error || va.va_size == bfp->l_start)
5525                                 return (error);
5526                         va.va_mask = AT_SIZE;
5527                         va.va_size = bfp->l_start;
5528                         error = nfs3setattr(vp, &va, 0, cr);
5529 
5530                         if (error == 0 && bfp->l_start == 0)
5531                                 vnevent_truncate(vp, ct);
5532                 } else
5533                         error = EINVAL;
5534         }
5535 
5536         return (error);
5537 }
5538 
5539 /* ARGSUSED */
5540 static int
5541 nfs3_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
5542 {
5543 
5544         return (EINVAL);
5545 }
5546 
5547 /*
5548  * Setup and add an address space callback to do the work of the delmap call.
5549  * The callback will (and must be) deleted in the actual callback function.
5550  *
5551  * This is done in order to take care of the problem that we have with holding
5552  * the address space's a_lock for a long period of time (e.g. if the NFS server
5553  * is down).  Callbacks will be executed in the address space code while the
5554  * a_lock is not held.  Holding the address space's a_lock causes things such
5555  * as ps and fork to hang because they are trying to acquire this lock as well.
5556  */
5557 /* ARGSUSED */
5558 static int
5559 nfs3_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5560         size_t len, uint_t prot, uint_t maxprot, uint_t flags,
5561         cred_t *cr, caller_context_t *ct)
5562 {
5563         int                     caller_found;
5564         int                     error;
5565         rnode_t                 *rp;
5566         nfs_delmap_args_t       *dmapp;
5567         nfs_delmapcall_t        *delmap_call;
5568 
5569         if (vp->v_flag & VNOMAP)
5570                 return (ENOSYS);
5571         /*
5572          * A process may not change zones if it has NFS pages mmap'ed
5573          * in, so we can't legitimately get here from the wrong zone.
5574          */
5575         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5576 
5577         rp = VTOR(vp);
5578 
5579         /*
5580          * The way that the address space of this process deletes its mapping
5581          * of this file is via the following call chains:
5582          * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5583          * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5584          *
5585          * With the use of address space callbacks we are allowed to drop the
5586          * address space lock, a_lock, while executing the NFS operations that
5587          * need to go over the wire.  Returning EAGAIN to the caller of this
5588          * function is what drives the execution of the callback that we add
5589          * below.  The callback will be executed by the address space code
5590          * after dropping the a_lock.  When the callback is finished, since
5591          * we dropped the a_lock, it must be re-acquired and segvn_unmap()
5592          * is called again on the same segment to finish the rest of the work
5593          * that needs to happen during unmapping.
5594          *
5595          * This action of calling back into the segment driver causes
5596          * nfs3_delmap() to get called again, but since the callback was
5597          * already executed at this point, it already did the work and there
5598          * is nothing left for us to do.
5599          *
5600          * To Summarize:
5601          * - The first time nfs3_delmap is called by the current thread is when
5602          * we add the caller associated with this delmap to the delmap caller
5603          * list, add the callback, and return EAGAIN.
5604          * - The second time in this call chain when nfs3_delmap is called we
5605          * will find this caller in the delmap caller list and realize there
5606          * is no more work to do thus removing this caller from the list and
5607          * returning the error that was set in the callback execution.
5608          */
5609         caller_found = nfs_find_and_delete_delmapcall(rp, &error);
5610         if (caller_found) {
5611                 /*
5612                  * 'error' is from the actual delmap operations.  To avoid
5613                  * hangs, we need to handle the return of EAGAIN differently
5614                  * since this is what drives the callback execution.
5615                  * In this case, we don't want to return EAGAIN and do the
5616                  * callback execution because there are none to execute.
5617                  */
5618                 if (error == EAGAIN)
5619                         return (0);
5620                 else
5621                         return (error);
5622         }
5623 
5624         /* current caller was not in the list */
5625         delmap_call = nfs_init_delmapcall();
5626 
5627         mutex_enter(&rp->r_statelock);
5628         list_insert_tail(&rp->r_indelmap, delmap_call);
5629         mutex_exit(&rp->r_statelock);
5630 
5631         dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
5632 
5633         dmapp->vp = vp;
5634         dmapp->off = off;
5635         dmapp->addr = addr;
5636         dmapp->len = len;
5637         dmapp->prot = prot;
5638         dmapp->maxprot = maxprot;
5639         dmapp->flags = flags;
5640         dmapp->cr = cr;
5641         dmapp->caller = delmap_call;
5642 
5643         error = as_add_callback(as, nfs3_delmap_callback, dmapp,
5644             AS_UNMAP_EVENT, addr, len, KM_SLEEP);
5645 
5646         return (error ? error : EAGAIN);
5647 }
5648 
5649 /*
5650  * Remove some pages from an mmap'd vnode.  Just update the
5651  * count of pages.  If doing close-to-open, then flush and
5652  * commit all of the pages associated with this file.
5653  * Otherwise, start an asynchronous page flush to write out
5654  * any dirty pages.  This will also associate a credential
5655  * with the rnode which can be used to write the pages.
5656  */
5657 /* ARGSUSED */
5658 static void
5659 nfs3_delmap_callback(struct as *as, void *arg, uint_t event)
5660 {
5661         int                     error;
5662         rnode_t                 *rp;
5663         mntinfo_t               *mi;
5664         nfs_delmap_args_t       *dmapp = (nfs_delmap_args_t *)arg;
5665 
5666         rp = VTOR(dmapp->vp);
5667         mi = VTOMI(dmapp->vp);
5668 
5669         atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
5670         ASSERT(rp->r_mapcnt >= 0);
5671 
5672         /*
5673          * Initiate a page flush and potential commit if there are
5674          * pages, the file system was not mounted readonly, the segment
5675          * was mapped shared, and the pages themselves were writeable.
5676          */
5677         if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
5678             dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
5679                 mutex_enter(&rp->r_statelock);
5680                 rp->r_flags |= RDIRTY;
5681                 mutex_exit(&rp->r_statelock);
5682                 /*
5683                  * If this is a cross-zone access a sync putpage won't work, so
5684                  * the best we can do is try an async putpage.  That seems
5685                  * better than something more draconian such as discarding the
5686                  * dirty pages.
5687                  */
5688                 if ((mi->mi_flags & MI_NOCTO) ||
5689                     nfs_zone() != mi->mi_zone)
5690                         error = nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5691                             B_ASYNC, dmapp->cr, NULL);
5692                 else
5693                         error = nfs3_putpage_commit(dmapp->vp, dmapp->off,
5694                             dmapp->len, dmapp->cr);
5695                 if (!error) {
5696                         mutex_enter(&rp->r_statelock);
5697                         error = rp->r_error;
5698                         rp->r_error = 0;
5699                         mutex_exit(&rp->r_statelock);
5700                 }
5701         } else
5702                 error = 0;
5703 
5704         if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
5705                 (void) nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5706                     B_INVAL, dmapp->cr, NULL);
5707 
5708         dmapp->caller->error = error;
5709         (void) as_delete_callback(as, arg);
5710         kmem_free(dmapp, sizeof (nfs_delmap_args_t));
5711 }
5712 
5713 static int nfs3_pathconf_disable_cache = 0;
5714 
5715 #ifdef DEBUG
5716 static int nfs3_pathconf_cache_hits = 0;
5717 static int nfs3_pathconf_cache_misses = 0;
5718 #endif
5719 
5720 /* ARGSUSED */
5721 static int
5722 nfs3_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5723         caller_context_t *ct)
5724 {
5725         int error;
5726         PATHCONF3args args;
5727         PATHCONF3res res;
5728         int douprintf;
5729         failinfo_t fi;
5730         rnode_t *rp;
5731         hrtime_t t;
5732 
5733         if (nfs_zone() != VTOMI(vp)->mi_zone)
5734                 return (EIO);
5735         /*
5736          * Large file spec - need to base answer on info stored
5737          * on original FSINFO response.
5738          */
5739         if (cmd == _PC_FILESIZEBITS) {
5740                 unsigned long long ll;
5741                 long l = 1;
5742 
5743                 ll = VTOMI(vp)->mi_maxfilesize;
5744 
5745                 if (ll == 0) {
5746                         *valp = 0;
5747                         return (0);
5748                 }
5749 
5750                 if (ll & 0xffffffff00000000) {
5751                         l += 32; ll >>= 32;
5752                 }
5753                 if (ll & 0xffff0000) {
5754                         l += 16; ll >>= 16;
5755                 }
5756                 if (ll & 0xff00) {
5757                         l += 8; ll >>= 8;
5758                 }
5759                 if (ll & 0xf0) {
5760                         l += 4; ll >>= 4;
5761                 }
5762                 if (ll & 0xc) {
5763                         l += 2; ll >>= 2;
5764                 }
5765                 if (ll & 0x2)
5766                         l += 2;
5767                 else if (ll & 0x1)
5768                         l += 1;
5769                 *valp = l;
5770                 return (0);
5771         }
5772 
5773         if (cmd == _PC_ACL_ENABLED) {
5774                 *valp = _ACL_ACLENT_ENABLED;
5775                 return (0);
5776         }
5777 
5778         if (cmd == _PC_XATTR_EXISTS) {
5779                 error = 0;
5780                 *valp = 0;
5781                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5782                         vnode_t *avp;
5783                         rnode_t *rp;
5784                         int error = 0;
5785                         mntinfo_t *mi = VTOMI(vp);
5786 
5787                         if (!(mi->mi_flags & MI_EXTATTR))
5788                                 return (0);
5789 
5790                         rp = VTOR(vp);
5791                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
5792                             INTR(vp)))
5793                                 return (EINTR);
5794 
5795                         error = nfs3lookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
5796                         if (error || avp == NULL)
5797                                 error = acl_getxattrdir3(vp, &avp, 0, cr, 0);
5798 
5799                         nfs_rw_exit(&rp->r_rwlock);
5800 
5801                         if (error == 0 && avp != NULL) {
5802                                 error = do_xattr_exists_check(avp, valp, cr);
5803                                 VN_RELE(avp);
5804                         } else if (error == ENOENT) {
5805                                 error = 0;
5806                                 *valp = 0;
5807                         }
5808                 }
5809                 return (error);
5810         }
5811 
5812         rp = VTOR(vp);
5813         if (rp->r_pathconf != NULL) {
5814                 mutex_enter(&rp->r_statelock);
5815                 if (rp->r_pathconf != NULL && nfs3_pathconf_disable_cache) {
5816                         kmem_free(rp->r_pathconf, sizeof (*rp->r_pathconf));
5817                         rp->r_pathconf = NULL;
5818                 }
5819                 if (rp->r_pathconf != NULL) {
5820                         error = 0;
5821                         switch (cmd) {
5822                         case _PC_LINK_MAX:
5823                                 *valp = rp->r_pathconf->link_max;
5824                                 break;
5825                         case _PC_NAME_MAX:
5826                                 *valp = rp->r_pathconf->name_max;
5827                                 break;
5828                         case _PC_PATH_MAX:
5829                         case _PC_SYMLINK_MAX:
5830                                 *valp = MAXPATHLEN;
5831                                 break;
5832                         case _PC_CHOWN_RESTRICTED:
5833                                 *valp = rp->r_pathconf->chown_restricted;
5834                                 break;
5835                         case _PC_NO_TRUNC:
5836                                 *valp = rp->r_pathconf->no_trunc;
5837                                 break;
5838                         default:
5839                                 error = EINVAL;
5840                                 break;
5841                         }
5842                         mutex_exit(&rp->r_statelock);
5843 #ifdef DEBUG
5844                         nfs3_pathconf_cache_hits++;
5845 #endif
5846                         return (error);
5847                 }
5848                 mutex_exit(&rp->r_statelock);
5849         }
5850 #ifdef DEBUG
5851         nfs3_pathconf_cache_misses++;
5852 #endif
5853 
5854         args.object = *VTOFH3(vp);
5855         fi.vp = vp;
5856         fi.fhp = (caddr_t)&args.object;
5857         fi.copyproc = nfs3copyfh;
5858         fi.lookupproc = nfs3lookup;
5859         fi.xattrdirproc = acl_getxattrdir3;
5860 
5861         douprintf = 1;
5862 
5863         t = gethrtime();
5864 
5865         error = rfs3call(VTOMI(vp), NFSPROC3_PATHCONF,
5866             xdr_nfs_fh3, (caddr_t)&args,
5867             xdr_PATHCONF3res, (caddr_t)&res, cr,
5868             &douprintf, &res.status, 0, &fi);
5869 
5870         if (error)
5871                 return (error);
5872 
5873         error = geterrno3(res.status);
5874 
5875         if (!error) {
5876                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
5877                 if (!nfs3_pathconf_disable_cache) {
5878                         mutex_enter(&rp->r_statelock);
5879                         if (rp->r_pathconf == NULL) {
5880                                 rp->r_pathconf = kmem_alloc(
5881                                     sizeof (*rp->r_pathconf), KM_NOSLEEP);
5882                                 if (rp->r_pathconf != NULL)
5883                                         *rp->r_pathconf = res.resok.info;
5884                         }
5885                         mutex_exit(&rp->r_statelock);
5886                 }
5887                 switch (cmd) {
5888                 case _PC_LINK_MAX:
5889                         *valp = res.resok.info.link_max;
5890                         break;
5891                 case _PC_NAME_MAX:
5892                         *valp = res.resok.info.name_max;
5893                         break;
5894                 case _PC_PATH_MAX:
5895                 case _PC_SYMLINK_MAX:
5896                         *valp = MAXPATHLEN;
5897                         break;
5898                 case _PC_CHOWN_RESTRICTED:
5899                         *valp = res.resok.info.chown_restricted;
5900                         break;
5901                 case _PC_NO_TRUNC:
5902                         *valp = res.resok.info.no_trunc;
5903                         break;
5904                 default:
5905                         return (EINVAL);
5906                 }
5907         } else {
5908                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
5909                 PURGE_STALE_FH(error, vp, cr);
5910         }
5911 
5912         return (error);
5913 }
5914 
5915 /*
5916  * Called by async thread to do synchronous pageio. Do the i/o, wait
5917  * for it to complete, and cleanup the page list when done.
5918  */
5919 static int
5920 nfs3_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5921         int flags, cred_t *cr)
5922 {
5923         int error;
5924 
5925         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5926         error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5927         if (flags & B_READ)
5928                 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
5929         else
5930                 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
5931         return (error);
5932 }
5933 
5934 /* ARGSUSED */
5935 static int
5936 nfs3_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5937         int flags, cred_t *cr, caller_context_t *ct)
5938 {
5939         int error;
5940         rnode_t *rp;
5941 
5942         if (pp == NULL)
5943                 return (EINVAL);
5944         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5945                 return (EIO);
5946 
5947         rp = VTOR(vp);
5948         mutex_enter(&rp->r_statelock);
5949         rp->r_count++;
5950         mutex_exit(&rp->r_statelock);
5951 
5952         if (flags & B_ASYNC) {
5953                 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
5954                     nfs3_sync_pageio);
5955         } else
5956                 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5957         mutex_enter(&rp->r_statelock);
5958         rp->r_count--;
5959         cv_broadcast(&rp->r_cv);
5960         mutex_exit(&rp->r_statelock);
5961         return (error);
5962 }
5963 
5964 /* ARGSUSED */
5965 static void
5966 nfs3_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
5967         caller_context_t *ct)
5968 {
5969         int error;
5970         rnode_t *rp;
5971         page_t *plist;
5972         page_t *pptr;
5973         offset3 offset;
5974         count3 len;
5975         k_sigset_t smask;
5976 
5977         /*
5978          * We should get called with fl equal to either B_FREE or
5979          * B_INVAL.  Any other value is illegal.
5980          *
5981          * The page that we are either supposed to free or destroy
5982          * should be exclusive locked and its io lock should not
5983          * be held.
5984          */
5985         ASSERT(fl == B_FREE || fl == B_INVAL);
5986         ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
5987         rp = VTOR(vp);
5988 
5989         /*
5990          * If the page doesn't need to be committed or we shouldn't
5991          * even bother attempting to commit it, then just make sure
5992          * that the p_fsdata byte is clear and then either free or
5993          * destroy the page as appropriate.
5994          */
5995         if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & RSTALE)) {
5996                 pp->p_fsdata = C_NOCOMMIT;
5997                 if (fl == B_FREE)
5998                         page_free(pp, dn);
5999                 else
6000                         page_destroy(pp, dn);
6001                 return;
6002         }
6003 
6004         /*
6005          * If there is a page invalidation operation going on, then
6006          * if this is one of the pages being destroyed, then just
6007          * clear the p_fsdata byte and then either free or destroy
6008          * the page as appropriate.
6009          */
6010         mutex_enter(&rp->r_statelock);
6011         if ((rp->r_flags & RTRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
6012                 mutex_exit(&rp->r_statelock);
6013                 pp->p_fsdata = C_NOCOMMIT;
6014                 if (fl == B_FREE)
6015                         page_free(pp, dn);
6016                 else
6017                         page_destroy(pp, dn);
6018                 return;
6019         }
6020 
6021         /*
6022          * If we are freeing this page and someone else is already
6023          * waiting to do a commit, then just unlock the page and
6024          * return.  That other thread will take care of commiting
6025          * this page.  The page can be freed sometime after the
6026          * commit has finished.  Otherwise, if the page is marked
6027          * as delay commit, then we may be getting called from
6028          * pvn_write_done, one page at a time.   This could result
6029          * in one commit per page, so we end up doing lots of small
6030          * commits instead of fewer larger commits.  This is bad,
6031          * we want do as few commits as possible.
6032          */
6033         if (fl == B_FREE) {
6034                 if (rp->r_flags & RCOMMITWAIT) {
6035                         page_unlock(pp);
6036                         mutex_exit(&rp->r_statelock);
6037                         return;
6038                 }
6039                 if (pp->p_fsdata == C_DELAYCOMMIT) {
6040                         pp->p_fsdata = C_COMMIT;
6041                         page_unlock(pp);
6042                         mutex_exit(&rp->r_statelock);
6043                         return;
6044                 }
6045         }
6046 
6047         /*
6048          * Check to see if there is a signal which would prevent an
6049          * attempt to commit the pages from being successful.  If so,
6050          * then don't bother with all of the work to gather pages and
6051          * generate the unsuccessful RPC.  Just return from here and
6052          * let the page be committed at some later time.
6053          */
6054         sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
6055         if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
6056                 sigunintr(&smask);
6057                 page_unlock(pp);
6058                 mutex_exit(&rp->r_statelock);
6059                 return;
6060         }
6061         sigunintr(&smask);
6062 
6063         /*
6064          * We are starting to need to commit pages, so let's try
6065          * to commit as many as possible at once to reduce the
6066          * overhead.
6067          *
6068          * Set the `commit inprogress' state bit.  We must
6069          * first wait until any current one finishes.  Then
6070          * we initialize the c_pages list with this page.
6071          */
6072         while (rp->r_flags & RCOMMIT) {
6073                 rp->r_flags |= RCOMMITWAIT;
6074                 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6075                 rp->r_flags &= ~RCOMMITWAIT;
6076         }
6077         rp->r_flags |= RCOMMIT;
6078         mutex_exit(&rp->r_statelock);
6079         ASSERT(rp->r_commit.c_pages == NULL);
6080         rp->r_commit.c_pages = pp;
6081         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6082         rp->r_commit.c_commlen = PAGESIZE;
6083 
6084         /*
6085          * Gather together all other pages which can be committed.
6086          * They will all be chained off r_commit.c_pages.
6087          */
6088         nfs3_get_commit(vp);
6089 
6090         /*
6091          * Clear the `commit inprogress' status and disconnect
6092          * the list of pages to be committed from the rnode.
6093          * At this same time, we also save the starting offset
6094          * and length of data to be committed on the server.
6095          */
6096         plist = rp->r_commit.c_pages;
6097         rp->r_commit.c_pages = NULL;
6098         offset = rp->r_commit.c_commbase;
6099         len = rp->r_commit.c_commlen;
6100         mutex_enter(&rp->r_statelock);
6101         rp->r_flags &= ~RCOMMIT;
6102         cv_broadcast(&rp->r_commit.c_cv);
6103         mutex_exit(&rp->r_statelock);
6104 
6105         if (curproc == proc_pageout || curproc == proc_fsflush ||
6106             nfs_zone() != VTOMI(vp)->mi_zone) {
6107                 nfs_async_commit(vp, plist, offset, len, cr, nfs3_async_commit);
6108                 return;
6109         }
6110 
6111         /*
6112          * Actually generate the COMMIT3 over the wire operation.
6113          */
6114         error = nfs3_commit(vp, offset, len, cr);
6115 
6116         /*
6117          * If we got an error during the commit, just unlock all
6118          * of the pages.  The pages will get retransmitted to the
6119          * server during a putpage operation.
6120          */
6121         if (error) {
6122                 while (plist != NULL) {
6123                         pptr = plist;
6124                         page_sub(&plist, pptr);
6125                         page_unlock(pptr);
6126                 }
6127                 return;
6128         }
6129 
6130         /*
6131          * We've tried as hard as we can to commit the data to stable
6132          * storage on the server.  We release the rest of the pages
6133          * and clear the commit required state.  They will be put
6134          * onto the tail of the cachelist if they are no longer
6135          * mapped.
6136          */
6137         while (plist != pp) {
6138                 pptr = plist;
6139                 page_sub(&plist, pptr);
6140                 pptr->p_fsdata = C_NOCOMMIT;
6141                 (void) page_release(pptr, 1);
6142         }
6143 
6144         /*
6145          * It is possible that nfs3_commit didn't return error but
6146          * some other thread has modified the page we are going
6147          * to free/destroy.
6148          *    In this case we need to rewrite the page. Do an explicit check
6149          * before attempting to free/destroy the page. If modified, needs to
6150          * be rewritten so unlock the page and return.
6151          */
6152         if (hat_ismod(pp)) {
6153                 pp->p_fsdata = C_NOCOMMIT;
6154                 page_unlock(pp);
6155                 return;
6156         }
6157 
6158         /*
6159          * Now, as appropriate, either free or destroy the page
6160          * that we were called with.
6161          */
6162         pp->p_fsdata = C_NOCOMMIT;
6163         if (fl == B_FREE)
6164                 page_free(pp, dn);
6165         else
6166                 page_destroy(pp, dn);
6167 }
6168 
6169 static int
6170 nfs3_commit(vnode_t *vp, offset3 offset, count3 count, cred_t *cr)
6171 {
6172         int error;
6173         rnode_t *rp;
6174         COMMIT3args args;
6175         COMMIT3res res;
6176         int douprintf;
6177         cred_t *cred;
6178 
6179         rp = VTOR(vp);
6180         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6181 
6182         mutex_enter(&rp->r_statelock);
6183         if (rp->r_cred != NULL) {
6184                 cred = rp->r_cred;
6185                 crhold(cred);
6186         } else {
6187                 rp->r_cred = cr;
6188                 crhold(cr);
6189                 cred = cr;
6190                 crhold(cred);
6191         }
6192         mutex_exit(&rp->r_statelock);
6193 
6194         args.file = *VTOFH3(vp);
6195         args.offset = offset;
6196         args.count = count;
6197 
6198 doitagain:
6199         douprintf = 1;
6200         error = rfs3call(VTOMI(vp), NFSPROC3_COMMIT,
6201             xdr_COMMIT3args, (caddr_t)&args,
6202             xdr_COMMIT3res, (caddr_t)&res, cred,
6203             &douprintf, &res.status, 0, NULL);
6204 
6205         crfree(cred);
6206 
6207         if (error)
6208                 return (error);
6209 
6210         error = geterrno3(res.status);
6211         if (!error) {
6212                 ASSERT(rp->r_flags & RHAVEVERF);
6213                 mutex_enter(&rp->r_statelock);
6214                 if (rp->r_verf == res.resok.verf) {
6215                         mutex_exit(&rp->r_statelock);
6216                         return (0);
6217                 }
6218                 nfs3_set_mod(vp);
6219                 rp->r_verf = res.resok.verf;
6220                 mutex_exit(&rp->r_statelock);
6221                 error = NFS_VERF_MISMATCH;
6222         } else {
6223                 if (error == EACCES) {
6224                         mutex_enter(&rp->r_statelock);
6225                         if (cred != cr) {
6226                                 if (rp->r_cred != NULL)
6227                                         crfree(rp->r_cred);
6228                                 rp->r_cred = cr;
6229                                 crhold(cr);
6230                                 cred = cr;
6231                                 crhold(cred);
6232                                 mutex_exit(&rp->r_statelock);
6233                                 goto doitagain;
6234                         }
6235                         mutex_exit(&rp->r_statelock);
6236                 }
6237                 /*
6238                  * Can't do a PURGE_STALE_FH here because this
6239                  * can cause a deadlock.  nfs3_commit can
6240                  * be called from nfs3_dispose which can be called
6241                  * indirectly via pvn_vplist_dirty.  PURGE_STALE_FH
6242                  * can call back to pvn_vplist_dirty.
6243                  */
6244                 if (error == ESTALE) {
6245                         mutex_enter(&rp->r_statelock);
6246                         rp->r_flags |= RSTALE;
6247                         if (!rp->r_error)
6248                                 rp->r_error = error;
6249                         mutex_exit(&rp->r_statelock);
6250                         PURGE_ATTRCACHE(vp);
6251                 } else {
6252                         mutex_enter(&rp->r_statelock);
6253                         if (!rp->r_error)
6254                                 rp->r_error = error;
6255                         mutex_exit(&rp->r_statelock);
6256                 }
6257         }
6258 
6259         return (error);
6260 }
6261 
6262 static void
6263 nfs3_set_mod(vnode_t *vp)
6264 {
6265         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6266 
6267         pvn_vplist_setdirty(vp, nfs_setmod_check);
6268 }
6269 
6270 /*
6271  * This routine is used to gather together a page list of the pages
6272  * which are to be committed on the server.  This routine must not
6273  * be called if the calling thread holds any locked pages.
6274  *
6275  * The calling thread must have set RCOMMIT.  This bit is used to
6276  * serialize access to the commit structure in the rnode.  As long
6277  * as the thread has set RCOMMIT, then it can manipulate the commit
6278  * structure without requiring any other locks.
6279  */
6280 static void
6281 nfs3_get_commit(vnode_t *vp)
6282 {
6283         rnode_t *rp;
6284         page_t *pp;
6285         kmutex_t *vphm;
6286 
6287         rp = VTOR(vp);
6288 
6289         ASSERT(rp->r_flags & RCOMMIT);
6290 
6291         vphm = page_vnode_mutex(vp);
6292         mutex_enter(vphm);
6293 
6294         /*
6295          * If there are no pages associated with this vnode, then
6296          * just return.
6297          */
6298         if ((pp = vp->v_pages) == NULL) {
6299                 mutex_exit(vphm);
6300                 return;
6301         }
6302 
6303         /*
6304          * Step through all of the pages associated with this vnode
6305          * looking for pages which need to be committed.
6306          */
6307         do {
6308                 /* Skip marker pages. */
6309                 if (pp->p_hash == PVN_VPLIST_HASH_TAG)
6310                         continue;
6311 
6312                 /*
6313                  * If this page does not need to be committed or is
6314                  * modified, then just skip it.
6315                  */
6316                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
6317                         continue;
6318 
6319                 /*
6320                  * Attempt to lock the page.  If we can't, then
6321                  * someone else is messing with it and we will
6322                  * just skip it.
6323                  */
6324                 if (!page_trylock(pp, SE_EXCL))
6325                         continue;
6326 
6327                 /*
6328                  * If this page does not need to be committed or is
6329                  * modified, then just skip it.  Recheck now that
6330                  * the page is locked.
6331                  */
6332                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6333                         page_unlock(pp);
6334                         continue;
6335                 }
6336 
6337                 if (PP_ISFREE(pp)) {
6338                         cmn_err(CE_PANIC, "nfs3_get_commit: %p is free",
6339                             (void *)pp);
6340                 }
6341 
6342                 /*
6343                  * The page needs to be committed and we locked it.
6344                  * Update the base and length parameters and add it
6345                  * to r_pages.
6346                  */
6347                 if (rp->r_commit.c_pages == NULL) {
6348                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6349                         rp->r_commit.c_commlen = PAGESIZE;
6350                 } else if (pp->p_offset < rp->r_commit.c_commbase) {
6351                         rp->r_commit.c_commlen = rp->r_commit.c_commbase -
6352                             (offset3)pp->p_offset + rp->r_commit.c_commlen;
6353                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6354                 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
6355                     <= pp->p_offset) {
6356                         rp->r_commit.c_commlen = (offset3)pp->p_offset -
6357                             rp->r_commit.c_commbase + PAGESIZE;
6358                 }
6359                 page_add(&rp->r_commit.c_pages, pp);
6360         } while ((pp = pp->p_vpnext) != vp->v_pages);
6361 
6362         mutex_exit(vphm);
6363 }
6364 
6365 /*
6366  * This routine is used to gather together a page list of the pages
6367  * which are to be committed on the server.  This routine must not
6368  * be called if the calling thread holds any locked pages.
6369  *
6370  * The calling thread must have set RCOMMIT.  This bit is used to
6371  * serialize access to the commit structure in the rnode.  As long
6372  * as the thread has set RCOMMIT, then it can manipulate the commit
6373  * structure without requiring any other locks.
6374  */
6375 static void
6376 nfs3_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
6377 {
6378 
6379         rnode_t *rp;
6380         page_t *pp;
6381         u_offset_t end;
6382         u_offset_t off;
6383 
6384         ASSERT(len != 0);
6385 
6386         rp = VTOR(vp);
6387 
6388         ASSERT(rp->r_flags & RCOMMIT);
6389         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6390 
6391         /*
6392          * If there are no pages associated with this vnode, then
6393          * just return.
6394          */
6395         if ((pp = vp->v_pages) == NULL)
6396                 return;
6397 
6398         /*
6399          * Calculate the ending offset.
6400          */
6401         end = soff + len;
6402 
6403         for (off = soff; off < end; off += PAGESIZE) {
6404                 /*
6405                  * Lookup each page by vp, offset.
6406                  */
6407                 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
6408                         continue;
6409 
6410                 /*
6411                  * If this page does not need to be committed or is
6412                  * modified, then just skip it.
6413                  */
6414                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6415                         page_unlock(pp);
6416                         continue;
6417                 }
6418 
6419                 ASSERT(PP_ISFREE(pp) == 0);
6420 
6421                 /*
6422                  * The page needs to be committed and we locked it.
6423                  * Update the base and length parameters and add it
6424                  * to r_pages.
6425                  */
6426                 if (rp->r_commit.c_pages == NULL) {
6427                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6428                         rp->r_commit.c_commlen = PAGESIZE;
6429                 } else {
6430                         rp->r_commit.c_commlen = (offset3)pp->p_offset -
6431                             rp->r_commit.c_commbase + PAGESIZE;
6432                 }
6433                 page_add(&rp->r_commit.c_pages, pp);
6434         }
6435 }
6436 
6437 static int
6438 nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
6439 {
6440         int error;
6441         writeverf3 write_verf;
6442         rnode_t *rp = VTOR(vp);
6443 
6444         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6445         /*
6446          * Flush the data portion of the file and then commit any
6447          * portions which need to be committed.  This may need to
6448          * be done twice if the server has changed state since
6449          * data was last written.  The data will need to be
6450          * rewritten to the server and then a new commit done.
6451          *
6452          * In fact, this may need to be done several times if the
6453          * server is having problems and crashing while we are
6454          * attempting to do this.
6455          */
6456 
6457 top:
6458         /*
6459          * Do a flush based on the poff and plen arguments.  This
6460          * will asynchronously write out any modified pages in the
6461          * range specified by (poff, plen).  This starts all of the
6462          * i/o operations which will be waited for in the next
6463          * call to nfs3_putpage
6464          */
6465 
6466         mutex_enter(&rp->r_statelock);
6467         write_verf = rp->r_verf;
6468         mutex_exit(&rp->r_statelock);
6469 
6470         error = nfs3_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
6471         if (error == EAGAIN)
6472                 error = 0;
6473 
6474         /*
6475          * Do a flush based on the poff and plen arguments.  This
6476          * will synchronously write out any modified pages in the
6477          * range specified by (poff, plen) and wait until all of
6478          * the asynchronous i/o's in that range are done as well.
6479          */
6480         if (!error)
6481                 error = nfs3_putpage(vp, poff, plen, 0, cr, NULL);
6482 
6483         if (error)
6484                 return (error);
6485 
6486         mutex_enter(&rp->r_statelock);
6487         if (rp->r_verf != write_verf) {
6488                 mutex_exit(&rp->r_statelock);
6489                 goto top;
6490         }
6491         mutex_exit(&rp->r_statelock);
6492 
6493         /*
6494          * Now commit any pages which might need to be committed.
6495          * If the error, NFS_VERF_MISMATCH, is returned, then
6496          * start over with the flush operation.
6497          */
6498 
6499         error = nfs3_commit_vp(vp, poff, plen, cr);
6500 
6501         if (error == NFS_VERF_MISMATCH)
6502                 goto top;
6503 
6504         return (error);
6505 }
6506 
6507 static int
6508 nfs3_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, cred_t *cr)
6509 {
6510         rnode_t *rp;
6511         page_t *plist;
6512         offset3 offset;
6513         count3 len;
6514 
6515 
6516         rp = VTOR(vp);
6517 
6518         if (nfs_zone() != VTOMI(vp)->mi_zone)
6519                 return (EIO);
6520         /*
6521          * Set the `commit inprogress' state bit.  We must
6522          * first wait until any current one finishes.
6523          */
6524         mutex_enter(&rp->r_statelock);
6525         while (rp->r_flags & RCOMMIT) {
6526                 rp->r_flags |= RCOMMITWAIT;
6527                 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6528                 rp->r_flags &= ~RCOMMITWAIT;
6529         }
6530         rp->r_flags |= RCOMMIT;
6531         mutex_exit(&rp->r_statelock);
6532 
6533         /*
6534          * Gather together all of the pages which need to be
6535          * committed.
6536          */
6537         if (plen == 0)
6538                 nfs3_get_commit(vp);
6539         else
6540                 nfs3_get_commit_range(vp, poff, plen);
6541 
6542         /*
6543          * Clear the `commit inprogress' bit and disconnect the
6544          * page list which was gathered together in nfs3_get_commit.
6545          */
6546         plist = rp->r_commit.c_pages;
6547         rp->r_commit.c_pages = NULL;
6548         offset = rp->r_commit.c_commbase;
6549         len = rp->r_commit.c_commlen;
6550         mutex_enter(&rp->r_statelock);
6551         rp->r_flags &= ~RCOMMIT;
6552         cv_broadcast(&rp->r_commit.c_cv);
6553         mutex_exit(&rp->r_statelock);
6554 
6555         /*
6556          * If any pages need to be committed, commit them and
6557          * then unlock them so that they can be freed some
6558          * time later.
6559          */
6560         if (plist != NULL) {
6561                 /*
6562                  * No error occurred during the flush portion
6563                  * of this operation, so now attempt to commit
6564                  * the data to stable storage on the server.
6565                  *
6566                  * This will unlock all of the pages on the list.
6567                  */
6568                 return (nfs3_sync_commit(vp, plist, offset, len, cr));
6569         }
6570         return (0);
6571 }
6572 
6573 static int
6574 nfs3_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6575         cred_t *cr)
6576 {
6577         int error;
6578         page_t *pp;
6579 
6580         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6581         error = nfs3_commit(vp, offset, count, cr);
6582 
6583         /*
6584          * If we got an error, then just unlock all of the pages
6585          * on the list.
6586          */
6587         if (error) {
6588                 while (plist != NULL) {
6589                         pp = plist;
6590                         page_sub(&plist, pp);
6591                         page_unlock(pp);
6592                 }
6593                 return (error);
6594         }
6595         /*
6596          * We've tried as hard as we can to commit the data to stable
6597          * storage on the server.  We just unlock the pages and clear
6598          * the commit required state.  They will get freed later.
6599          */
6600         while (plist != NULL) {
6601                 pp = plist;
6602                 page_sub(&plist, pp);
6603                 pp->p_fsdata = C_NOCOMMIT;
6604                 page_unlock(pp);
6605         }
6606 
6607         return (error);
6608 }
6609 
6610 static void
6611 nfs3_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6612         cred_t *cr)
6613 {
6614         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6615         (void) nfs3_sync_commit(vp, plist, offset, count, cr);
6616 }
6617 
6618 /* ARGSUSED */
6619 static int
6620 nfs3_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6621         caller_context_t *ct)
6622 {
6623         int error;
6624         mntinfo_t *mi;
6625 
6626         mi = VTOMI(vp);
6627 
6628         if (nfs_zone() != mi->mi_zone)
6629                 return (EIO);
6630 
6631         if (mi->mi_flags & MI_ACL) {
6632                 error = acl_setacl3(vp, vsecattr, flag, cr);
6633                 if (mi->mi_flags & MI_ACL)
6634                         return (error);
6635         }
6636 
6637         return (ENOSYS);
6638 }
6639 
6640 /* ARGSUSED */
6641 static int
6642 nfs3_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6643         caller_context_t *ct)
6644 {
6645         int error;
6646         mntinfo_t *mi;
6647 
6648         mi = VTOMI(vp);
6649 
6650         if (nfs_zone() != mi->mi_zone)
6651                 return (EIO);
6652 
6653         if (mi->mi_flags & MI_ACL) {
6654                 error = acl_getacl3(vp, vsecattr, flag, cr);
6655                 if (mi->mi_flags & MI_ACL)
6656                         return (error);
6657         }
6658 
6659         return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
6660 }
6661 
6662 /* ARGSUSED */
6663 static int
6664 nfs3_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
6665         caller_context_t *ct)
6666 {
6667         int error;
6668         struct shrlock nshr;
6669         struct nfs_owner nfs_owner;
6670         netobj lm_fh3;
6671 
6672         if (nfs_zone() != VTOMI(vp)->mi_zone)
6673                 return (EIO);
6674 
6675         /*
6676          * check for valid cmd parameter
6677          */
6678         if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
6679                 return (EINVAL);
6680 
6681         /*
6682          * Check access permissions
6683          */
6684         if (cmd == F_SHARE &&
6685             (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
6686             ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
6687                 return (EBADF);
6688 
6689         /*
6690          * If the filesystem is mounted using local locking, pass the
6691          * request off to the local share code.
6692          */
6693         if (VTOMI(vp)->mi_flags & MI_LLOCK)
6694                 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
6695 
6696         switch (cmd) {
6697         case F_SHARE:
6698         case F_UNSHARE:
6699                 lm_fh3.n_len = VTOFH3(vp)->fh3_length;
6700                 lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
6701 
6702                 /*
6703                  * If passed an owner that is too large to fit in an
6704                  * nfs_owner it is likely a recursive call from the
6705                  * lock manager client and pass it straight through.  If
6706                  * it is not a nfs_owner then simply return an error.
6707                  */
6708                 if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
6709                         if (((struct nfs_owner *)shr->s_owner)->magic !=
6710                             NFS_OWNER_MAGIC)
6711                                 return (EINVAL);
6712 
6713                         if (error = lm4_shrlock(vp, cmd, shr, flag, &lm_fh3)) {
6714                                 error = set_errno(error);
6715                         }
6716                         return (error);
6717                 }
6718                 /*
6719                  * Remote share reservations owner is a combination of
6720                  * a magic number, hostname, and the local owner
6721                  */
6722                 bzero(&nfs_owner, sizeof (nfs_owner));
6723                 nfs_owner.magic = NFS_OWNER_MAGIC;
6724                 (void) strncpy(nfs_owner.hname, uts_nodename(),
6725                     sizeof (nfs_owner.hname));
6726                 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
6727                 nshr.s_access = shr->s_access;
6728                 nshr.s_deny = shr->s_deny;
6729                 nshr.s_sysid = 0;
6730                 nshr.s_pid = ttoproc(curthread)->p_pid;
6731                 nshr.s_own_len = sizeof (nfs_owner);
6732                 nshr.s_owner = (caddr_t)&nfs_owner;
6733 
6734                 if (error = lm4_shrlock(vp, cmd, &nshr, flag, &lm_fh3)) {
6735                         error = set_errno(error);
6736                 }
6737 
6738                 break;
6739 
6740         case F_HASREMOTELOCKS:
6741                 /*
6742                  * NFS client can't store remote locks itself
6743                  */
6744                 shr->s_access = 0;
6745                 error = 0;
6746                 break;
6747 
6748         default:
6749                 error = EINVAL;
6750                 break;
6751         }
6752 
6753         return (error);
6754 }