1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  28  *      All rights reserved.
  29  */
  30 
  31 #include <sys/param.h>
  32 #include <sys/types.h>
  33 #include <sys/systm.h>
  34 #include <sys/cred.h>
  35 #include <sys/buf.h>
  36 #include <sys/vfs.h>
  37 #include <sys/vnode.h>
  38 #include <sys/uio.h>
  39 #include <sys/stat.h>
  40 #include <sys/errno.h>
  41 #include <sys/sysmacros.h>
  42 #include <sys/statvfs.h>
  43 #include <sys/kmem.h>
  44 #include <sys/kstat.h>
  45 #include <sys/dirent.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/debug.h>
  48 #include <sys/vtrace.h>
  49 #include <sys/mode.h>
  50 #include <sys/acl.h>
  51 #include <sys/nbmlock.h>
  52 #include <sys/policy.h>
  53 #include <sys/sdt.h>
  54 
  55 #include <rpc/types.h>
  56 #include <rpc/auth.h>
  57 #include <rpc/svc.h>
  58 
  59 #include <nfs/nfs.h>
  60 #include <nfs/export.h>
  61 #include <nfs/nfs_cmd.h>
  62 
  63 #include <vm/hat.h>
  64 #include <vm/as.h>
  65 #include <vm/seg.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/seg_kmem.h>
  68 
  69 #include <sys/strsubr.h>
  70 
  71 /*
  72  * These are the interface routines for the server side of the
  73  * Network File System.  See the NFS version 2 protocol specification
  74  * for a description of this interface.
  75  */
  76 
  77 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  78 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
  79                         cred_t *);
  80 
  81 /*
  82  * Some "over the wire" UNIX file types.  These are encoded
  83  * into the mode.  This needs to be fixed in the next rev.
  84  */
  85 #define IFMT            0170000         /* type of file */
  86 #define IFCHR           0020000         /* character special */
  87 #define IFBLK           0060000         /* block special */
  88 #define IFSOCK          0140000         /* socket */
  89 
  90 u_longlong_t nfs2_srv_caller_id;
  91 
  92 /*
  93  * Get file attributes.
  94  * Returns the current attributes of the file with the given fhandle.
  95  */
  96 /* ARGSUSED */
  97 void
  98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
  99     struct svc_req *req, cred_t *cr, bool_t ro)
 100 {
 101         int error;
 102         vnode_t *vp;
 103         struct vattr va;
 104 
 105         vp = nfs_fhtovp(fhp, exi);
 106         if (vp == NULL) {
 107                 ns->ns_status = NFSERR_STALE;
 108                 return;
 109         }
 110 
 111         /*
 112          * Do the getattr.
 113          */
 114         va.va_mask = AT_ALL;    /* we want all the attributes */
 115 
 116         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 117 
 118         /* check for overflows */
 119         if (!error) {
 120                 /* Lie about the object type for a referral */
 121                 if (vn_is_nfs_reparse(vp, cr))
 122                         va.va_type = VLNK;
 123 
 124                 acl_perm(vp, exi, &va, cr);
 125                 error = vattr_to_nattr(&va, &ns->ns_attr);
 126         }
 127 
 128         VN_RELE(vp);
 129 
 130         ns->ns_status = puterrno(error);
 131 }
/*
 * Dispatch helper: return the file handle embedded in the GETATTR
 * arguments so common dispatch code can locate the export.
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
 137 
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* ATTR_UTIME iff client sent explicit times */
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* No attribute changes allowed on a read-only export. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the over-the-wire sattr into a native vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * Set up the caller context: identify this caller with the
	 * NFSv2 server's id and request non-blocking behavior; a
	 * delegation conflict is reported back via CC_WOULDBLOCK
	 * (checked after VOP_SPACE/VOP_SETATTR below).
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region being changed is the span between
			 * the old and new sizes, whichever order they
			 * come in.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * The owner may resize the file regardless of mode bits;
		 * use VOP_SPACE (F_FREESP) to bypass the access check in
		 * VOP_SETATTR, and drop AT_SIZE so VOP_SETATTR below does
		 * not repeat the size change.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Return the post-setattr attributes to the client. */
	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
/*
 * Dispatch helper: return the file handle embedded in the SETATTR
 * arguments so common dispatch code can locate the export.
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
 330 
 331 /*
 332  * Directory lookup.
 333  * Returns an fhandle and file attributes for file name in a directory.
 334  */
 335 /* ARGSUSED */
 336 void
 337 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 338     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 339 {
 340         int error;
 341         vnode_t *dvp;
 342         vnode_t *vp;
 343         struct vattr va;
 344         fhandle_t *fhp = da->da_fhandle;
 345         struct sec_ol sec = {0, 0};
 346         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 347         char *name;
 348         struct sockaddr *ca;
 349 
 350         /*
 351          * Trusted Extension doesn't support NFSv2. MOUNT
 352          * will reject v2 clients. Need to prevent v2 client
 353          * access via WebNFS here.
 354          */
 355         if (is_system_labeled() && req->rq_vers == 2) {
 356                 dr->dr_status = NFSERR_ACCES;
 357                 return;
 358         }
 359 
 360         /*
 361          * Disallow NULL paths
 362          */
 363         if (da->da_name == NULL || *da->da_name == '\0') {
 364                 dr->dr_status = NFSERR_ACCES;
 365                 return;
 366         }
 367 
 368         /*
 369          * Allow lookups from the root - the default
 370          * location of the public filehandle.
 371          */
 372         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 373                 dvp = rootdir;
 374                 VN_HOLD(dvp);
 375         } else {
 376                 dvp = nfs_fhtovp(fhp, exi);
 377                 if (dvp == NULL) {
 378                         dr->dr_status = NFSERR_STALE;
 379                         return;
 380                 }
 381         }
 382 
 383         /*
 384          * Not allow lookup beyond root.
 385          * If the filehandle matches a filehandle of the exi,
 386          * then the ".." refers beyond the root of an exported filesystem.
 387          */
 388         if (strcmp(da->da_name, "..") == 0 &&
 389             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 390                 VN_RELE(dvp);
 391                 dr->dr_status = NFSERR_NOENT;
 392                 return;
 393         }
 394 
 395         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 396         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 397             MAXPATHLEN);
 398 
 399         if (name == NULL) {
 400                 dr->dr_status = NFSERR_ACCES;
 401                 return;
 402         }
 403 
 404         /*
 405          * If the public filehandle is used then allow
 406          * a multi-component lookup, i.e. evaluate
 407          * a pathname and follow symbolic links if
 408          * necessary.
 409          *
 410          * This may result in a vnode in another filesystem
 411          * which is OK as long as the filesystem is exported.
 412          */
 413         if (PUBLIC_FH2(fhp)) {
 414                 publicfh_flag = TRUE;
 415                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 416                     &sec);
 417         } else {
 418                 /*
 419                  * Do a normal single component lookup.
 420                  */
 421                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 422                     NULL, NULL, NULL);
 423         }
 424 
 425         if (name != da->da_name)
 426                 kmem_free(name, MAXPATHLEN);
 427 
 428 
 429         if (!error) {
 430                 va.va_mask = AT_ALL;    /* we want everything */
 431 
 432                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 433 
 434                 /* check for overflows */
 435                 if (!error) {
 436                         acl_perm(vp, exi, &va, cr);
 437                         error = vattr_to_nattr(&va, &dr->dr_attr);
 438                         if (!error) {
 439                                 if (sec.sec_flags & SEC_QUERY)
 440                                         error = makefh_ol(&dr->dr_fhandle, exi,
 441                                             sec.sec_index);
 442                                 else {
 443                                         error = makefh(&dr->dr_fhandle, vp,
 444                                             exi);
 445                                         if (!error && publicfh_flag &&
 446                                             !chk_clnt_sec(exi, req))
 447                                                 auth_weak = TRUE;
 448                                 }
 449                         }
 450                 }
 451                 VN_RELE(vp);
 452         }
 453 
 454         VN_RELE(dvp);
 455 
 456         /*
 457          * If publicfh_flag is true then we have called rfs_publicfh_mclookup
 458          * and have obtained a new exportinfo in exi which needs to be
 459          * released. Note the the original exportinfo pointed to by exi
 460          * will be released by the caller, comon_dispatch.
 461          */
 462         if (publicfh_flag && exi != NULL)
 463                 exi_rele(exi);
 464 
 465         /*
 466          * If it's public fh, no 0x81, and client's flavor is
 467          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 468          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 469          */
 470         if (auth_weak)
 471                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 472         else
 473                 dr->dr_status = puterrno(error);
 474 }
/*
 * Dispatch helper: return the directory file handle from the LOOKUP
 * arguments so common dispatch code can locate the export.
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
 480 
/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;	/* object is a referral faked as a symlink */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link text consumed the entire
		 * buffer (uio_resid == 0), rl_count == NFS_MAXPATHLEN and
		 * this store writes one byte past the NFS_MAXPATHLEN
		 * allocation above — confirm whether VOP_READLINK can
		 * ever return a full-length result here.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/*
	 * Convert the link text to the client's character set; if a
	 * conversion happened, a new buffer was allocated and replaces
	 * rl_data (the old buffer is freed here).
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
/*
 * Dispatch helper: return the READLINK file handle so common
 * dispatch code can locate the export.
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
 616 /*
 617  * Free data allocated by rfs_readlink
 618  */
 619 void
 620 rfs_rlfree(struct nfsrdlnres *rl)
 621 {
 622         if (rl->rl_data != NULL)
 623                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 624 }
 625 
 626 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 627 
/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;		/* reply data; freed later by rfs_rdfree */
	int alloc_err = 0;
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	/*
	 * Identify this caller with the NFSv2 server's id and request
	 * non-blocking behavior; a delegation conflict is reported back
	 * via CC_WOULDBLOCK.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	/* Refuse objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* Reading at or past EOF: success, but zero bytes of data. */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA: read directly into the client-provided chunk. */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for his cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/* Bytes actually read = requested count minus what was left over. */
	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
 880 
 881 /*
 882  * Free data allocated by rfs_read
 883  */
 884 void
 885 rfs_rdfree(struct nfsrdresult *rr)
 886 {
 887         mblk_t *mp;
 888 
 889         if (rr->rr_status == NFS_OK) {
 890                 mp = rr->rr_mp;
 891                 if (mp != NULL)
 892                         freeb(mp);
 893         }
 894 }
 895 
/*
 * Dispatch helper: return the file handle embedded in the READ
 * arguments so common dispatch code can locate the export.
 */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
 901 
/* Number of iovec entries kept on the stack for a synchronous write. */
#define MAX_IOVECS      12

#ifdef DEBUG
/* Counters: writes whose iovecs fit on the stack vs. needed kmem_alloc(). */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
 908 
 909 /*
 910  * Write data to file.
 911  * Returns attributes of a file after writing some data to it.
 912  *
 913  * Any changes made here, especially in error handling might have
 914  * to also be done in rfs_write (which clusters write requests).
 915  */
 916 /* ARGSUSED */
 917 void
 918 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
 919     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 920 {
 921         int error;
 922         vnode_t *vp;
 923         rlim64_t rlimit;
 924         struct vattr va;
 925         struct uio uio;
 926         struct iovec iov[MAX_IOVECS];
 927         mblk_t *m;
 928         struct iovec *iovp;
 929         int iovcnt;
 930         cred_t *savecred;
 931         int in_crit = 0;
 932         caller_context_t ct;
 933 
 934         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
 935         if (vp == NULL) {
 936                 ns->ns_status = NFSERR_STALE;
 937                 return;
 938         }
 939 
 940         if (rdonly(ro, vp)) {
 941                 VN_RELE(vp);
 942                 ns->ns_status = NFSERR_ROFS;
 943                 return;
 944         }
 945 
 946         if (vp->v_type != VREG) {
 947                 VN_RELE(vp);
 948                 ns->ns_status = NFSERR_ISDIR;
 949                 return;
 950         }
 951 
 952         ct.cc_sysid = 0;
 953         ct.cc_pid = 0;
 954         ct.cc_caller_id = nfs2_srv_caller_id;
 955         ct.cc_flags = CC_DONTBLOCK;
 956 
 957         va.va_mask = AT_UID|AT_MODE;
 958 
 959         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 960 
 961         if (error) {
 962                 VN_RELE(vp);
 963                 ns->ns_status = puterrno(error);
 964 
 965                 return;
 966         }
 967 
 968         if (crgetuid(cr) != va.va_uid) {
 969                 /*
 970                  * This is a kludge to allow writes of files created
 971                  * with read only permission.  The owner of the file
 972                  * is always allowed to write it.
 973                  */
 974                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
 975 
 976                 if (error) {
 977                         VN_RELE(vp);
 978                         ns->ns_status = puterrno(error);
 979                         return;
 980                 }
 981         }
 982 
 983         /*
 984          * Can't access a mandatory lock file.  This might cause
 985          * the NFS service thread to block forever waiting for a
 986          * lock to be released that will never be released.
 987          */
 988         if (MANDLOCK(vp, va.va_mode)) {
 989                 VN_RELE(vp);
 990                 ns->ns_status = NFSERR_ACCES;
 991                 return;
 992         }
 993 
 994         /*
 995          * We have to enter the critical region before calling VOP_RWLOCK
 996          * to avoid a deadlock with ufs.
 997          */
 998         if (nbl_need_check(vp)) {
 999                 nbl_start_crit(vp, RW_READER);
1000                 in_crit = 1;
1001                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1002                     wa->wa_count, 0, NULL)) {
1003                         error = EACCES;
1004                         goto out;
1005                 }
1006         }
1007 
1008         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1009 
1010         /* check if a monitor detected a delegation conflict */
1011         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1012                 VN_RELE(vp);
1013                 /* mark as wouldblock so response is dropped */
1014                 curthread->t_flag |= T_WOULDBLOCK;
1015                 return;
1016         }
1017 
1018         if (wa->wa_data || wa->wa_rlist) {
1019                 /* Do the RDMA thing if necessary */
1020                 if (wa->wa_rlist) {
1021                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1022                         iov[0].iov_len = wa->wa_count;
1023                 } else  {
1024                         iov[0].iov_base = wa->wa_data;
1025                         iov[0].iov_len = wa->wa_count;
1026                 }
1027                 uio.uio_iov = iov;
1028                 uio.uio_iovcnt = 1;
1029                 uio.uio_segflg = UIO_SYSSPACE;
1030                 uio.uio_extflg = UIO_COPY_DEFAULT;
1031                 uio.uio_loffset = (offset_t)wa->wa_offset;
1032                 uio.uio_resid = wa->wa_count;
1033                 /*
1034                  * The limit is checked on the client. We
1035                  * should allow any size writes here.
1036                  */
1037                 uio.uio_llimit = curproc->p_fsz_ctl;
1038                 rlimit = uio.uio_llimit - wa->wa_offset;
1039                 if (rlimit < (rlim64_t)uio.uio_resid)
1040                         uio.uio_resid = (uint_t)rlimit;
1041 
1042                 /*
1043                  * for now we assume no append mode
1044                  */
1045                 /*
1046                  * We're changing creds because VM may fault and we need
1047                  * the cred of the current thread to be used if quota
1048                  * checking is enabled.
1049                  */
1050                 savecred = curthread->t_cred;
1051                 curthread->t_cred = cr;
1052                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1053                 curthread->t_cred = savecred;
1054         } else {
1055                 iovcnt = 0;
1056                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1057                         iovcnt++;
1058                 if (iovcnt <= MAX_IOVECS) {
1059 #ifdef DEBUG
1060                         rfs_write_sync_hits++;
1061 #endif
1062                         iovp = iov;
1063                 } else {
1064 #ifdef DEBUG
1065                         rfs_write_sync_misses++;
1066 #endif
1067                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1068                 }
1069                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1070                 uio.uio_iov = iovp;
1071                 uio.uio_iovcnt = iovcnt;
1072                 uio.uio_segflg = UIO_SYSSPACE;
1073                 uio.uio_extflg = UIO_COPY_DEFAULT;
1074                 uio.uio_loffset = (offset_t)wa->wa_offset;
1075                 uio.uio_resid = wa->wa_count;
1076                 /*
1077                  * The limit is checked on the client. We
1078                  * should allow any size writes here.
1079                  */
1080                 uio.uio_llimit = curproc->p_fsz_ctl;
1081                 rlimit = uio.uio_llimit - wa->wa_offset;
1082                 if (rlimit < (rlim64_t)uio.uio_resid)
1083                         uio.uio_resid = (uint_t)rlimit;
1084 
1085                 /*
1086                  * For now we assume no append mode.
1087                  */
1088                 /*
1089                  * We're changing creds because VM may fault and we need
1090                  * the cred of the current thread to be used if quota
1091                  * checking is enabled.
1092                  */
1093                 savecred = curthread->t_cred;
1094                 curthread->t_cred = cr;
1095                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1096                 curthread->t_cred = savecred;
1097 
1098                 if (iovp != iov)
1099                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1100         }
1101 
1102         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1103 
1104         if (!error) {
1105                 /*
1106                  * Get attributes again so we send the latest mod
1107                  * time to the client side for his cache.
1108                  */
1109                 va.va_mask = AT_ALL;    /* now we want everything */
1110 
1111                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1112 
1113                 /* check for overflows */
1114                 if (!error) {
1115                         acl_perm(vp, exi, &va, cr);
1116                         error = vattr_to_nattr(&va, &ns->ns_attr);
1117                 }
1118         }
1119 
1120 out:
1121         if (in_crit)
1122                 nbl_end_crit(vp);
1123         VN_RELE(vp);
1124 
1125         /* check if a monitor detected a delegation conflict */
1126         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1127                 /* mark as wouldblock so response is dropped */
1128                 curthread->t_flag |= T_WOULDBLOCK;
1129         else
1130                 ns->ns_status = puterrno(error);
1131 
1132 }
1133 
/*
 * One queued WRITE request awaiting service as part of a cluster.
 * The structure lives on the stack of the requesting service thread,
 * which sleeps until ns->ns_status is filled in.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of this write */
	struct nfsattrstat *ns;		/* reply to fill in for the waiter */
	struct svc_req *req;		/* RPC request handle */
	cred_t *cr;			/* credentials of the requester */
	bool_t ro;			/* TRUE if export is read-only */
	kthread_t *thread;		/* service thread waiting on this */
	struct rfs_async_write *list;	/* next request in the cluster */
};
1143 
/*
 * A cluster of write requests to the same file, identified by file
 * handle.  Also stack-allocated, by the thread that services the
 * cluster; waiters are woken via cv when their status is set.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle common to the cluster */
	kcondvar_t cv;			/* broadcast when requests complete */
	struct rfs_async_write *list;	/* requests, sorted by offset */
	struct rfs_async_write_list *next;	/* next active cluster */
};
1150 
/* List of in-progress write clusters, protected by rfs_async_write_lock. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Number of iovec entries kept on the stack for a clustered write. */
#define MAXCLIOVECS     42
/* "Not yet processed" marker for ns_status; 0 would read as NFS_OK. */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* Counters: clusters whose iovecs fit on the stack vs. needed kmem_alloc(). */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1162 
1163 /*
1164  * Write data to file.
1165  * Returns attributes of a file after writing some data to it.
1166  */
1167 void
1168 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1169     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1170 {
1171         int error;
1172         vnode_t *vp;
1173         rlim64_t rlimit;
1174         struct vattr va;
1175         struct uio uio;
1176         struct rfs_async_write_list *lp;
1177         struct rfs_async_write_list *nlp;
1178         struct rfs_async_write *rp;
1179         struct rfs_async_write *nrp;
1180         struct rfs_async_write *trp;
1181         struct rfs_async_write *lrp;
1182         int data_written;
1183         int iovcnt;
1184         mblk_t *m;
1185         struct iovec *iovp;
1186         struct iovec *niovp;
1187         struct iovec iov[MAXCLIOVECS];
1188         int count;
1189         int rcount;
1190         uint_t off;
1191         uint_t len;
1192         struct rfs_async_write nrpsp;
1193         struct rfs_async_write_list nlpsp;
1194         ushort_t t_flag;
1195         cred_t *savecred;
1196         int in_crit = 0;
1197         caller_context_t ct;
1198 
1199         if (!rfs_write_async) {
1200                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1201                 return;
1202         }
1203 
1204         /*
1205          * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1206          * is considered an OK.
1207          */
1208         ns->ns_status = RFSWRITE_INITVAL;
1209 
1210         nrp = &nrpsp;
1211         nrp->wa = wa;
1212         nrp->ns = ns;
1213         nrp->req = req;
1214         nrp->cr = cr;
1215         nrp->ro = ro;
1216         nrp->thread = curthread;
1217 
1218         ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1219 
1220         /*
1221          * Look to see if there is already a cluster started
1222          * for this file.
1223          */
1224         mutex_enter(&rfs_async_write_lock);
1225         for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1226                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1227                     sizeof (fhandle_t)) == 0)
1228                         break;
1229         }
1230 
1231         /*
1232          * If lp is non-NULL, then there is already a cluster
1233          * started.  We need to place ourselves in the cluster
1234          * list in the right place as determined by starting
1235          * offset.  Conflicts with non-blocking mandatory locked
1236          * regions will be checked when the cluster is processed.
1237          */
1238         if (lp != NULL) {
1239                 rp = lp->list;
1240                 trp = NULL;
1241                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1242                         trp = rp;
1243                         rp = rp->list;
1244                 }
1245                 nrp->list = rp;
1246                 if (trp == NULL)
1247                         lp->list = nrp;
1248                 else
1249                         trp->list = nrp;
1250                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1251                         cv_wait(&lp->cv, &rfs_async_write_lock);
1252                 mutex_exit(&rfs_async_write_lock);
1253 
1254                 return;
1255         }
1256 
1257         /*
1258          * No cluster started yet, start one and add ourselves
1259          * to the list of clusters.
1260          */
1261         nrp->list = NULL;
1262 
1263         nlp = &nlpsp;
1264         nlp->fhp = &wa->wa_fhandle;
1265         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1266         nlp->list = nrp;
1267         nlp->next = NULL;
1268 
1269         if (rfs_async_write_head == NULL) {
1270                 rfs_async_write_head = nlp;
1271         } else {
1272                 lp = rfs_async_write_head;
1273                 while (lp->next != NULL)
1274                         lp = lp->next;
1275                 lp->next = nlp;
1276         }
1277         mutex_exit(&rfs_async_write_lock);
1278 
1279         /*
1280          * Convert the file handle common to all of the requests
1281          * in this cluster to a vnode.
1282          */
1283         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1284         if (vp == NULL) {
1285                 mutex_enter(&rfs_async_write_lock);
1286                 if (rfs_async_write_head == nlp)
1287                         rfs_async_write_head = nlp->next;
1288                 else {
1289                         lp = rfs_async_write_head;
1290                         while (lp->next != nlp)
1291                                 lp = lp->next;
1292                         lp->next = nlp->next;
1293                 }
1294                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1295                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1296                         rp->ns->ns_status = NFSERR_STALE;
1297                         rp->thread->t_flag |= t_flag;
1298                 }
1299                 cv_broadcast(&nlp->cv);
1300                 mutex_exit(&rfs_async_write_lock);
1301 
1302                 return;
1303         }
1304 
1305         /*
1306          * Can only write regular files.  Attempts to write any
1307          * other file types fail with EISDIR.
1308          */
1309         if (vp->v_type != VREG) {
1310                 VN_RELE(vp);
1311                 mutex_enter(&rfs_async_write_lock);
1312                 if (rfs_async_write_head == nlp)
1313                         rfs_async_write_head = nlp->next;
1314                 else {
1315                         lp = rfs_async_write_head;
1316                         while (lp->next != nlp)
1317                                 lp = lp->next;
1318                         lp->next = nlp->next;
1319                 }
1320                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1321                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1322                         rp->ns->ns_status = NFSERR_ISDIR;
1323                         rp->thread->t_flag |= t_flag;
1324                 }
1325                 cv_broadcast(&nlp->cv);
1326                 mutex_exit(&rfs_async_write_lock);
1327 
1328                 return;
1329         }
1330 
1331         /*
1332          * Enter the critical region before calling VOP_RWLOCK, to avoid a
1333          * deadlock with ufs.
1334          */
1335         if (nbl_need_check(vp)) {
1336                 nbl_start_crit(vp, RW_READER);
1337                 in_crit = 1;
1338         }
1339 
1340         ct.cc_sysid = 0;
1341         ct.cc_pid = 0;
1342         ct.cc_caller_id = nfs2_srv_caller_id;
1343         ct.cc_flags = CC_DONTBLOCK;
1344 
1345         /*
1346          * Lock the file for writing.  This operation provides
1347          * the delay which allows clusters to grow.
1348          */
1349         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1350 
1351         /* check if a monitor detected a delegation conflict */
1352         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1353                 if (in_crit)
1354                         nbl_end_crit(vp);
1355                 VN_RELE(vp);
1356                 /* mark as wouldblock so response is dropped */
1357                 curthread->t_flag |= T_WOULDBLOCK;
1358                 mutex_enter(&rfs_async_write_lock);
1359                 if (rfs_async_write_head == nlp)
1360                         rfs_async_write_head = nlp->next;
1361                 else {
1362                         lp = rfs_async_write_head;
1363                         while (lp->next != nlp)
1364                                 lp = lp->next;
1365                         lp->next = nlp->next;
1366                 }
1367                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1368                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1369                                 rp->ns->ns_status = puterrno(error);
1370                                 rp->thread->t_flag |= T_WOULDBLOCK;
1371                         }
1372                 }
1373                 cv_broadcast(&nlp->cv);
1374                 mutex_exit(&rfs_async_write_lock);
1375 
1376                 return;
1377         }
1378 
1379         /*
1380          * Disconnect this cluster from the list of clusters.
1381          * The cluster that is being dealt with must be fixed
1382          * in size after this point, so there is no reason
1383          * to leave it on the list so that new requests can
1384          * find it.
1385          *
1386          * The algorithm is that the first write request will
1387          * create a cluster, convert the file handle to a
1388          * vnode pointer, and then lock the file for writing.
1389          * This request is not likely to be clustered with
1390          * any others.  However, the next request will create
1391          * a new cluster and be blocked in VOP_RWLOCK while
1392          * the first request is being processed.  This delay
1393          * will allow more requests to be clustered in this
1394          * second cluster.
1395          */
1396         mutex_enter(&rfs_async_write_lock);
1397         if (rfs_async_write_head == nlp)
1398                 rfs_async_write_head = nlp->next;
1399         else {
1400                 lp = rfs_async_write_head;
1401                 while (lp->next != nlp)
1402                         lp = lp->next;
1403                 lp->next = nlp->next;
1404         }
1405         mutex_exit(&rfs_async_write_lock);
1406 
1407         /*
1408          * Step through the list of requests in this cluster.
1409          * We need to check permissions to make sure that all
1410          * of the requests have sufficient permission to write
1411          * the file.  A cluster can be composed of requests
1412          * from different clients and different users on each
1413          * client.
1414          *
1415          * As a side effect, we also calculate the size of the
1416          * byte range that this cluster encompasses.
1417          */
1418         rp = nlp->list;
1419         off = rp->wa->wa_offset;
1420         len = (uint_t)0;
1421         do {
1422                 if (rdonly(rp->ro, vp)) {
1423                         rp->ns->ns_status = NFSERR_ROFS;
1424                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1425                         rp->thread->t_flag |= t_flag;
1426                         continue;
1427                 }
1428 
1429                 va.va_mask = AT_UID|AT_MODE;
1430 
1431                 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1432 
1433                 if (!error) {
1434                         if (crgetuid(rp->cr) != va.va_uid) {
1435                                 /*
1436                                  * This is a kludge to allow writes of files
1437                                  * created with read only permission.  The
1438                                  * owner of the file is always allowed to
1439                                  * write it.
1440                                  */
1441                                 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1442                         }
1443                         if (!error && MANDLOCK(vp, va.va_mode))
1444                                 error = EACCES;
1445                 }
1446 
1447                 /*
1448                  * Check for a conflict with a nbmand-locked region.
1449                  */
1450                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1451                     rp->wa->wa_count, 0, NULL)) {
1452                         error = EACCES;
1453                 }
1454 
1455                 if (error) {
1456                         rp->ns->ns_status = puterrno(error);
1457                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1458                         rp->thread->t_flag |= t_flag;
1459                         continue;
1460                 }
1461                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1462                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1463         } while ((rp = rp->list) != NULL);
1464 
1465         /*
1466          * Step through the cluster attempting to gather as many
1467          * requests which are contiguous as possible.  These
1468          * contiguous requests are handled via one call to VOP_WRITE
1469          * instead of different calls to VOP_WRITE.  We also keep
1470          * track of the fact that any data was written.
1471          */
1472         rp = nlp->list;
1473         data_written = 0;
1474         do {
1475                 /*
1476                  * Skip any requests which are already marked as having an
1477                  * error.
1478                  */
1479                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1480                         rp = rp->list;
1481                         continue;
1482                 }
1483 
1484                 /*
1485                  * Count the number of iovec's which are required
1486                  * to handle this set of requests.  One iovec is
1487                  * needed for each data buffer, whether addressed
1488                  * by wa_data or by the b_rptr pointers in the
1489                  * mblk chains.
1490                  */
1491                 iovcnt = 0;
1492                 lrp = rp;
1493                 for (;;) {
1494                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1495                                 iovcnt++;
1496                         else {
1497                                 m = lrp->wa->wa_mblk;
1498                                 while (m != NULL) {
1499                                         iovcnt++;
1500                                         m = m->b_cont;
1501                                 }
1502                         }
1503                         if (lrp->list == NULL ||
1504                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1505                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1506                             lrp->list->wa->wa_offset) {
1507                                 lrp = lrp->list;
1508                                 break;
1509                         }
1510                         lrp = lrp->list;
1511                 }
1512 
1513                 if (iovcnt <= MAXCLIOVECS) {
1514 #ifdef DEBUG
1515                         rfs_write_hits++;
1516 #endif
1517                         niovp = iov;
1518                 } else {
1519 #ifdef DEBUG
1520                         rfs_write_misses++;
1521 #endif
1522                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1523                 }
1524                 /*
1525                  * Put together the scatter/gather iovecs.
1526                  */
1527                 iovp = niovp;
1528                 trp = rp;
1529                 count = 0;
1530                 do {
1531                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1532                                 if (trp->wa->wa_rlist) {
1533                                         iovp->iov_base =
1534                                             (char *)((trp->wa->wa_rlist)->
1535                                             u.c_daddr3);
1536                                         iovp->iov_len = trp->wa->wa_count;
1537                                 } else  {
1538                                         iovp->iov_base = trp->wa->wa_data;
1539                                         iovp->iov_len = trp->wa->wa_count;
1540                                 }
1541                                 iovp++;
1542                         } else {
1543                                 m = trp->wa->wa_mblk;
1544                                 rcount = trp->wa->wa_count;
1545                                 while (m != NULL) {
1546                                         iovp->iov_base = (caddr_t)m->b_rptr;
1547                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1548                                         rcount -= iovp->iov_len;
1549                                         if (rcount < 0)
1550                                                 iovp->iov_len += rcount;
1551                                         iovp++;
1552                                         if (rcount <= 0)
1553                                                 break;
1554                                         m = m->b_cont;
1555                                 }
1556                         }
1557                         count += trp->wa->wa_count;
1558                         trp = trp->list;
1559                 } while (trp != lrp);
1560 
1561                 uio.uio_iov = niovp;
1562                 uio.uio_iovcnt = iovcnt;
1563                 uio.uio_segflg = UIO_SYSSPACE;
1564                 uio.uio_extflg = UIO_COPY_DEFAULT;
1565                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1566                 uio.uio_resid = count;
1567                 /*
1568                  * The limit is checked on the client. We
1569                  * should allow any size writes here.
1570                  */
1571                 uio.uio_llimit = curproc->p_fsz_ctl;
1572                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1573                 if (rlimit < (rlim64_t)uio.uio_resid)
1574                         uio.uio_resid = (uint_t)rlimit;
1575 
1576                 /*
1577                  * For now we assume no append mode.
1578                  */
1579 
1580                 /*
1581                  * We're changing creds because VM may fault
1582                  * and we need the cred of the current
1583                  * thread to be used if quota * checking is
1584                  * enabled.
1585                  */
1586                 savecred = curthread->t_cred;
1587                 curthread->t_cred = cr;
1588                 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1589                 curthread->t_cred = savecred;
1590 
1591                 /* check if a monitor detected a delegation conflict */
1592                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1593                         /* mark as wouldblock so response is dropped */
1594                         curthread->t_flag |= T_WOULDBLOCK;
1595 
1596                 if (niovp != iov)
1597                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1598 
1599                 if (!error) {
1600                         data_written = 1;
1601                         /*
1602                          * Get attributes again so we send the latest mod
1603                          * time to the client side for his cache.
1604                          */
1605                         va.va_mask = AT_ALL;    /* now we want everything */
1606 
1607                         error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1608 
1609                         if (!error)
1610                                 acl_perm(vp, exi, &va, rp->cr);
1611                 }
1612 
1613                 /*
1614                  * Fill in the status responses for each request
1615                  * which was just handled.  Also, copy the latest
1616                  * attributes in to the attribute responses if
1617                  * appropriate.
1618                  */
1619                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1620                 do {
1621                         rp->thread->t_flag |= t_flag;
1622                         /* check for overflows */
1623                         if (!error) {
1624                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1625                         }
1626                         rp->ns->ns_status = puterrno(error);
1627                         rp = rp->list;
1628                 } while (rp != lrp);
1629         } while (rp != NULL);
1630 
1631         /*
1632          * If any data was written at all, then we need to flush
1633          * the data and metadata to stable storage.
1634          */
1635         if (data_written) {
1636                 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1637 
1638                 if (!error) {
1639                         error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1640                 }
1641         }
1642 
1643         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1644 
1645         if (in_crit)
1646                 nbl_end_crit(vp);
1647         VN_RELE(vp);
1648 
1649         t_flag = curthread->t_flag & T_WOULDBLOCK;
1650         mutex_enter(&rfs_async_write_lock);
1651         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1652                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1653                         rp->ns->ns_status = puterrno(error);
1654                         rp->thread->t_flag |= t_flag;
1655                 }
1656         }
1657         cv_broadcast(&nlp->cv);
1658         mutex_exit(&rfs_async_write_lock);
1659 
1660 }
1661 
1662 void *
1663 rfs_write_getfh(struct nfswriteargs *wa)
1664 {
1665         return (&wa->wa_fhandle);
1666 }
1667 
1668 /*
1669  * Create a file.
1670  * Creates a file with given attributes and returns those attributes
1671  * and an fhandle for the new file.
1672  */
1673 void
1674 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1675     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1676 {
1677         int error;
1678         int lookuperr;
1679         int in_crit = 0;
1680         struct vattr va;
1681         vnode_t *vp;
1682         vnode_t *realvp;
1683         vnode_t *dvp;
1684         char *name = args->ca_da.da_name;
1685         vnode_t *tvp = NULL;
1686         int mode;
1687         int lookup_ok;
1688         bool_t trunc;
1689         struct sockaddr *ca;
1690 
1691         /*
1692          * Disallow NULL paths
1693          */
1694         if (name == NULL || *name == '\0') {
1695                 dr->dr_status = NFSERR_ACCES;
1696                 return;
1697         }
1698 
1699         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1700         if (dvp == NULL) {
1701                 dr->dr_status = NFSERR_STALE;
1702                 return;
1703         }
1704 
1705         error = sattr_to_vattr(args->ca_sa, &va);
1706         if (error) {
1707                 dr->dr_status = puterrno(error);
1708                 return;
1709         }
1710 
1711         /*
1712          * Must specify the mode.
1713          */
1714         if (!(va.va_mask & AT_MODE)) {
1715                 VN_RELE(dvp);
1716                 dr->dr_status = NFSERR_INVAL;
1717                 return;
1718         }
1719 
1720         /*
1721          * This is a completely gross hack to make mknod
1722          * work over the wire until we can wack the protocol
1723          */
1724         if ((va.va_mode & IFMT) == IFCHR) {
1725                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1726                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1727                 else {
1728                         va.va_type = VCHR;
1729                         /*
1730                          * uncompress the received dev_t
1731                          * if the top half is zero indicating a request
1732                          * from an `older style' OS.
1733                          */
1734                         if ((va.va_size & 0xffff0000) == 0)
1735                                 va.va_rdev = nfsv2_expdev(va.va_size);
1736                         else
1737                                 va.va_rdev = (dev_t)va.va_size;
1738                 }
1739                 va.va_mask &= ~AT_SIZE;
1740         } else if ((va.va_mode & IFMT) == IFBLK) {
1741                 va.va_type = VBLK;
1742                 /*
1743                  * uncompress the received dev_t
1744                  * if the top half is zero indicating a request
1745                  * from an `older style' OS.
1746                  */
1747                 if ((va.va_size & 0xffff0000) == 0)
1748                         va.va_rdev = nfsv2_expdev(va.va_size);
1749                 else
1750                         va.va_rdev = (dev_t)va.va_size;
1751                 va.va_mask &= ~AT_SIZE;
1752         } else if ((va.va_mode & IFMT) == IFSOCK) {
1753                 va.va_type = VSOCK;
1754         } else {
1755                 va.va_type = VREG;
1756         }
1757         va.va_mode &= ~IFMT;
1758         va.va_mask |= AT_TYPE;
1759 
1760         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1761         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1762             MAXPATHLEN);
1763         if (name == NULL) {
1764                 dr->dr_status = puterrno(EINVAL);
1765                 return;
1766         }
1767 
1768         /*
1769          * Why was the choice made to use VWRITE as the mode to the
1770          * call to VOP_CREATE ? This results in a bug.  When a client
1771          * opens a file that already exists and is RDONLY, the second
1772          * open fails with an EACESS because of the mode.
1773          * bug ID 1054648.
1774          */
1775         lookup_ok = 0;
1776         mode = VWRITE;
1777         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1778                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1779                     NULL, NULL, NULL);
1780                 if (!error) {
1781                         struct vattr at;
1782 
1783                         lookup_ok = 1;
1784                         at.va_mask = AT_MODE;
1785                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1786                         if (!error)
1787                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1788                         VN_RELE(tvp);
1789                         tvp = NULL;
1790                 }
1791         }
1792 
1793         if (!lookup_ok) {
1794                 if (rdonly(ro, dvp)) {
1795                         error = EROFS;
1796                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1797                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1798                         error = EPERM;
1799                 } else {
1800                         error = 0;
1801                 }
1802         }
1803 
1804         /*
1805          * If file size is being modified on an already existing file
1806          * make sure that there are no conflicting non-blocking mandatory
1807          * locks in the region being manipulated. Return EACCES if there
1808          * are conflicting locks.
1809          */
1810         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1811                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1812                     NULL, NULL, NULL);
1813 
1814                 if (!lookuperr &&
1815                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1816                         VN_RELE(tvp);
1817                         curthread->t_flag |= T_WOULDBLOCK;
1818                         goto out;
1819                 }
1820 
1821                 if (!lookuperr && nbl_need_check(tvp)) {
1822                         /*
1823                          * The file exists. Now check if it has any
1824                          * conflicting non-blocking mandatory locks
1825                          * in the region being changed.
1826                          */
1827                         struct vattr bva;
1828                         u_offset_t offset;
1829                         ssize_t length;
1830 
1831                         nbl_start_crit(tvp, RW_READER);
1832                         in_crit = 1;
1833 
1834                         bva.va_mask = AT_SIZE;
1835                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1836                         if (!error) {
1837                                 if (va.va_size < bva.va_size) {
1838                                         offset = va.va_size;
1839                                         length = bva.va_size - va.va_size;
1840                                 } else {
1841                                         offset = bva.va_size;
1842                                         length = va.va_size - bva.va_size;
1843                                 }
1844                                 if (length) {
1845                                         if (nbl_conflict(tvp, NBL_WRITE,
1846                                             offset, length, 0, NULL)) {
1847                                                 error = EACCES;
1848                                         }
1849                                 }
1850                         }
1851                         if (error) {
1852                                 nbl_end_crit(tvp);
1853                                 VN_RELE(tvp);
1854                                 in_crit = 0;
1855                         }
1856                 } else if (tvp != NULL) {
1857                         VN_RELE(tvp);
1858                 }
1859         }
1860 
1861         if (!error) {
1862                 /*
1863                  * If filesystem is shared with nosuid the remove any
1864                  * setuid/setgid bits on create.
1865                  */
1866                 if (va.va_type == VREG &&
1867                     exi->exi_export.ex_flags & EX_NOSUID)
1868                         va.va_mode &= ~(VSUID | VSGID);
1869 
1870                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1871                     NULL, NULL);
1872 
1873                 if (!error) {
1874 
1875                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1876                                 trunc = TRUE;
1877                         else
1878                                 trunc = FALSE;
1879 
1880                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1881                                 VN_RELE(vp);
1882                                 curthread->t_flag |= T_WOULDBLOCK;
1883                                 goto out;
1884                         }
1885                         va.va_mask = AT_ALL;
1886 
1887                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1888 
1889                         /* check for overflows */
1890                         if (!error) {
1891                                 acl_perm(vp, exi, &va, cr);
1892                                 error = vattr_to_nattr(&va, &dr->dr_attr);
1893                                 if (!error) {
1894                                         error = makefh(&dr->dr_fhandle, vp,
1895                                             exi);
1896                                 }
1897                         }
1898                         /*
1899                          * Force modified metadata out to stable storage.
1900                          *
1901                          * if a underlying vp exists, pass it to VOP_FSYNC
1902                          */
1903                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
1904                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1905                         else
1906                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1907                         VN_RELE(vp);
1908                 }
1909 
1910                 if (in_crit) {
1911                         nbl_end_crit(tvp);
1912                         VN_RELE(tvp);
1913                 }
1914         }
1915 
1916         /*
1917          * Force modified data and metadata out to stable storage.
1918          */
1919         (void) VOP_FSYNC(dvp, 0, cr, NULL);
1920 
1921 out:
1922 
1923         VN_RELE(dvp);
1924 
1925         dr->dr_status = puterrno(error);
1926 
1927         if (name != args->ca_da.da_name)
1928                 kmem_free(name, MAXPATHLEN);
1929 }
1930 void *
1931 rfs_create_getfh(struct nfscreatargs *args)
1932 {
1933         return (args->ca_da.da_fhandle);
1934 }
1935 
1936 /*
1937  * Remove a file.
1938  * Remove named file from parent directory.
1939  */
1940 /* ARGSUSED */
1941 void
1942 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1943     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1944 {
1945         int error = 0;
1946         vnode_t *vp;
1947         vnode_t *targvp;
1948         int in_crit = 0;
1949 
1950         /*
1951          * Disallow NULL paths
1952          */
1953         if (da->da_name == NULL || *da->da_name == '\0') {
1954                 *status = NFSERR_ACCES;
1955                 return;
1956         }
1957 
1958         vp = nfs_fhtovp(da->da_fhandle, exi);
1959         if (vp == NULL) {
1960                 *status = NFSERR_STALE;
1961                 return;
1962         }
1963 
1964         if (rdonly(ro, vp)) {
1965                 VN_RELE(vp);
1966                 *status = NFSERR_ROFS;
1967                 return;
1968         }
1969 
1970         /*
1971          * Check for a conflict with a non-blocking mandatory share reservation.
1972          */
1973         error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1974             NULL, cr, NULL, NULL, NULL);
1975         if (error != 0) {
1976                 VN_RELE(vp);
1977                 *status = puterrno(error);
1978                 return;
1979         }
1980 
1981         /*
1982          * If the file is delegated to an v4 client, then initiate
1983          * recall and drop this request (by setting T_WOULDBLOCK).
1984          * The client will eventually re-transmit the request and
1985          * (hopefully), by then, the v4 client will have returned
1986          * the delegation.
1987          */
1988 
1989         if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1990                 VN_RELE(vp);
1991                 VN_RELE(targvp);
1992                 curthread->t_flag |= T_WOULDBLOCK;
1993                 return;
1994         }
1995 
1996         if (nbl_need_check(targvp)) {
1997                 nbl_start_crit(targvp, RW_READER);
1998                 in_crit = 1;
1999                 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2000                         error = EACCES;
2001                         goto out;
2002                 }
2003         }
2004 
2005         error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2006 
2007         /*
2008          * Force modified data and metadata out to stable storage.
2009          */
2010         (void) VOP_FSYNC(vp, 0, cr, NULL);
2011 
2012 out:
2013         if (in_crit)
2014                 nbl_end_crit(targvp);
2015         VN_RELE(targvp);
2016         VN_RELE(vp);
2017 
2018         *status = puterrno(error);
2019 
2020 }
2021 
2022 void *
2023 rfs_remove_getfh(struct nfsdiropargs *da)
2024 {
2025         return (da->da_fhandle);
2026 }
2027 
2028 /*
2029  * rename a file
2030  * Give a file (from) a new name (to).
2031  */
2032 /* ARGSUSED */
2033 void
2034 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2035     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2036 {
2037         int error = 0;
2038         vnode_t *fromvp;
2039         vnode_t *tovp;
2040         struct exportinfo *to_exi;
2041         fhandle_t *fh;
2042         vnode_t *srcvp;
2043         vnode_t *targvp;
2044         int in_crit = 0;
2045 
2046         fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2047         if (fromvp == NULL) {
2048                 *status = NFSERR_STALE;
2049                 return;
2050         }
2051 
2052         fh = args->rna_to.da_fhandle;
2053         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2054         if (to_exi == NULL) {
2055                 VN_RELE(fromvp);
2056                 *status = NFSERR_ACCES;
2057                 return;
2058         }
2059         exi_rele(to_exi);
2060 
2061         if (to_exi != exi) {
2062                 VN_RELE(fromvp);
2063                 *status = NFSERR_XDEV;
2064                 return;
2065         }
2066 
2067         tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2068         if (tovp == NULL) {
2069                 VN_RELE(fromvp);
2070                 *status = NFSERR_STALE;
2071                 return;
2072         }
2073 
2074         if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2075                 VN_RELE(tovp);
2076                 VN_RELE(fromvp);
2077                 *status = NFSERR_NOTDIR;
2078                 return;
2079         }
2080 
2081         /*
2082          * Disallow NULL paths
2083          */
2084         if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2085             args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2086                 VN_RELE(tovp);
2087                 VN_RELE(fromvp);
2088                 *status = NFSERR_ACCES;
2089                 return;
2090         }
2091 
2092         if (rdonly(ro, tovp)) {
2093                 VN_RELE(tovp);
2094                 VN_RELE(fromvp);
2095                 *status = NFSERR_ROFS;
2096                 return;
2097         }
2098 
2099         /*
2100          * Check for a conflict with a non-blocking mandatory share reservation.
2101          */
2102         error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2103             NULL, cr, NULL, NULL, NULL);
2104         if (error != 0) {
2105                 VN_RELE(tovp);
2106                 VN_RELE(fromvp);
2107                 *status = puterrno(error);
2108                 return;
2109         }
2110 
2111         /* Check for delegations on the source file */
2112 
2113         if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2114                 VN_RELE(tovp);
2115                 VN_RELE(fromvp);
2116                 VN_RELE(srcvp);
2117                 curthread->t_flag |= T_WOULDBLOCK;
2118                 return;
2119         }
2120 
2121         /* Check for delegation on the file being renamed over, if it exists */
2122 
2123         if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2124             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2125             NULL, NULL, NULL) == 0) {
2126 
2127                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2128                         VN_RELE(tovp);
2129                         VN_RELE(fromvp);
2130                         VN_RELE(srcvp);
2131                         VN_RELE(targvp);
2132                         curthread->t_flag |= T_WOULDBLOCK;
2133                         return;
2134                 }
2135                 VN_RELE(targvp);
2136         }
2137 
2138 
2139         if (nbl_need_check(srcvp)) {
2140                 nbl_start_crit(srcvp, RW_READER);
2141                 in_crit = 1;
2142                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2143                         error = EACCES;
2144                         goto out;
2145                 }
2146         }
2147 
2148         error = VOP_RENAME(fromvp, args->rna_from.da_name,
2149             tovp, args->rna_to.da_name, cr, NULL, 0);
2150 
2151         if (error == 0)
2152                 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2153                     strlen(args->rna_to.da_name));
2154 
2155         /*
2156          * Force modified data and metadata out to stable storage.
2157          */
2158         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2159         (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2160 
2161 out:
2162         if (in_crit)
2163                 nbl_end_crit(srcvp);
2164         VN_RELE(srcvp);
2165         VN_RELE(tovp);
2166         VN_RELE(fromvp);
2167 
2168         *status = puterrno(error);
2169 
2170 }
2171 void *
2172 rfs_rename_getfh(struct nfsrnmargs *args)
2173 {
2174         return (args->rna_from.da_fhandle);
2175 }
2176 
2177 /*
2178  * Link to a file.
2179  * Create a file (to) which is a hard link to the given file (from).
2180  */
2181 /* ARGSUSED */
2182 void
2183 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2184     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2185 {
2186         int error;
2187         vnode_t *fromvp;
2188         vnode_t *tovp;
2189         struct exportinfo *to_exi;
2190         fhandle_t *fh;
2191 
2192         fromvp = nfs_fhtovp(args->la_from, exi);
2193         if (fromvp == NULL) {
2194                 *status = NFSERR_STALE;
2195                 return;
2196         }
2197 
2198         fh = args->la_to.da_fhandle;
2199         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2200         if (to_exi == NULL) {
2201                 VN_RELE(fromvp);
2202                 *status = NFSERR_ACCES;
2203                 return;
2204         }
2205         exi_rele(to_exi);
2206 
2207         if (to_exi != exi) {
2208                 VN_RELE(fromvp);
2209                 *status = NFSERR_XDEV;
2210                 return;
2211         }
2212 
2213         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2214         if (tovp == NULL) {
2215                 VN_RELE(fromvp);
2216                 *status = NFSERR_STALE;
2217                 return;
2218         }
2219 
2220         if (tovp->v_type != VDIR) {
2221                 VN_RELE(tovp);
2222                 VN_RELE(fromvp);
2223                 *status = NFSERR_NOTDIR;
2224                 return;
2225         }
2226         /*
2227          * Disallow NULL paths
2228          */
2229         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2230                 VN_RELE(tovp);
2231                 VN_RELE(fromvp);
2232                 *status = NFSERR_ACCES;
2233                 return;
2234         }
2235 
2236         if (rdonly(ro, tovp)) {
2237                 VN_RELE(tovp);
2238                 VN_RELE(fromvp);
2239                 *status = NFSERR_ROFS;
2240                 return;
2241         }
2242 
2243         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2244 
2245         /*
2246          * Force modified data and metadata out to stable storage.
2247          */
2248         (void) VOP_FSYNC(tovp, 0, cr, NULL);
2249         (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2250 
2251         VN_RELE(tovp);
2252         VN_RELE(fromvp);
2253 
2254         *status = puterrno(error);
2255 
2256 }
2257 void *
2258 rfs_link_getfh(struct nfslinkargs *args)
2259 {
2260         return (args->la_from);
2261 }
2262 
2263 /*
2264  * Symbolicly link to a file.
2265  * Create a file (to) with the given attributes which is a symbolic link
2266  * to the given path name (to).
2267  */
2268 void
2269 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2270     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2271 {
2272         int error;
2273         struct vattr va;
2274         vnode_t *vp;
2275         vnode_t *svp;
2276         int lerror;
2277         struct sockaddr *ca;
2278         char *name = NULL;
2279 
2280         /*
2281          * Disallow NULL paths
2282          */
2283         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2284                 *status = NFSERR_ACCES;
2285                 return;
2286         }
2287 
2288         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2289         if (vp == NULL) {
2290                 *status = NFSERR_STALE;
2291                 return;
2292         }
2293 
2294         if (rdonly(ro, vp)) {
2295                 VN_RELE(vp);
2296                 *status = NFSERR_ROFS;
2297                 return;
2298         }
2299 
2300         error = sattr_to_vattr(args->sla_sa, &va);
2301         if (error) {
2302                 VN_RELE(vp);
2303                 *status = puterrno(error);
2304                 return;
2305         }
2306 
2307         if (!(va.va_mask & AT_MODE)) {
2308                 VN_RELE(vp);
2309                 *status = NFSERR_INVAL;
2310                 return;
2311         }
2312 
2313         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2314         name = nfscmd_convname(ca, exi, args->sla_tnm,
2315             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2316 
2317         if (name == NULL) {
2318                 *status = NFSERR_ACCES;
2319                 return;
2320         }
2321 
2322         va.va_type = VLNK;
2323         va.va_mask |= AT_TYPE;
2324 
2325         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2326 
2327         /*
2328          * Force new data and metadata out to stable storage.
2329          */
2330         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2331             NULL, cr, NULL, NULL, NULL);
2332 
2333         if (!lerror) {
2334                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2335                 VN_RELE(svp);
2336         }
2337 
2338         /*
2339          * Force modified data and metadata out to stable storage.
2340          */
2341         (void) VOP_FSYNC(vp, 0, cr, NULL);
2342 
2343         VN_RELE(vp);
2344 
2345         *status = puterrno(error);
2346         if (name != args->sla_tnm)
2347                 kmem_free(name, MAXPATHLEN);
2348 
2349 }
2350 void *
2351 rfs_symlink_getfh(struct nfsslargs *args)
2352 {
2353         return (args->sla_from.da_fhandle);
2354 }
2355 
2356 /*
2357  * Make a directory.
2358  * Create a directory with the given name, parent directory, and attributes.
2359  * Returns a file handle and attributes for the new directory.
2360  */
2361 /* ARGSUSED */
2362 void
2363 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2364     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2365 {
2366         int error;
2367         struct vattr va;
2368         vnode_t *dvp = NULL;
2369         vnode_t *vp;
2370         char *name = args->ca_da.da_name;
2371 
2372         /*
2373          * Disallow NULL paths
2374          */
2375         if (name == NULL || *name == '\0') {
2376                 dr->dr_status = NFSERR_ACCES;
2377                 return;
2378         }
2379 
2380         vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2381         if (vp == NULL) {
2382                 dr->dr_status = NFSERR_STALE;
2383                 return;
2384         }
2385 
2386         if (rdonly(ro, vp)) {
2387                 VN_RELE(vp);
2388                 dr->dr_status = NFSERR_ROFS;
2389                 return;
2390         }
2391 
2392         error = sattr_to_vattr(args->ca_sa, &va);
2393         if (error) {
2394                 VN_RELE(vp);
2395                 dr->dr_status = puterrno(error);
2396                 return;
2397         }
2398 
2399         if (!(va.va_mask & AT_MODE)) {
2400                 VN_RELE(vp);
2401                 dr->dr_status = NFSERR_INVAL;
2402                 return;
2403         }
2404 
2405         va.va_type = VDIR;
2406         va.va_mask |= AT_TYPE;
2407 
2408         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2409 
2410         if (!error) {
2411                 /*
2412                  * Attribtutes of the newly created directory should
2413                  * be returned to the client.
2414                  */
2415                 va.va_mask = AT_ALL; /* We want everything */
2416                 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2417 
2418                 /* check for overflows */
2419                 if (!error) {
2420                         acl_perm(vp, exi, &va, cr);
2421                         error = vattr_to_nattr(&va, &dr->dr_attr);
2422                         if (!error) {
2423                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2424                         }
2425                 }
2426                 /*
2427                  * Force new data and metadata out to stable storage.
2428                  */
2429                 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2430                 VN_RELE(dvp);
2431         }
2432 
2433         /*
2434          * Force modified data and metadata out to stable storage.
2435          */
2436         (void) VOP_FSYNC(vp, 0, cr, NULL);
2437 
2438         VN_RELE(vp);
2439 
2440         dr->dr_status = puterrno(error);
2441 
2442 }
2443 void *
2444 rfs_mkdir_getfh(struct nfscreatargs *args)
2445 {
2446         return (args->ca_da.da_fhandle);
2447 }
2448 
2449 /*
2450  * Remove a directory.
2451  * Remove the given directory name from the given parent directory.
2452  */
2453 /* ARGSUSED */
2454 void
2455 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2456     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2457 {
2458         int error;
2459         vnode_t *vp;
2460 
2461         /*
2462          * Disallow NULL paths
2463          */
2464         if (da->da_name == NULL || *da->da_name == '\0') {
2465                 *status = NFSERR_ACCES;
2466                 return;
2467         }
2468 
2469         vp = nfs_fhtovp(da->da_fhandle, exi);
2470         if (vp == NULL) {
2471                 *status = NFSERR_STALE;
2472                 return;
2473         }
2474 
2475         if (rdonly(ro, vp)) {
2476                 VN_RELE(vp);
2477                 *status = NFSERR_ROFS;
2478                 return;
2479         }
2480 
2481         /*
2482          * VOP_RMDIR takes a third argument (the current
2483          * directory of the process).  That's because someone
2484          * wants to return EINVAL if one tries to remove ".".
2485          * Of course, NFS servers have no idea what their
2486          * clients' current directories are.  We fake it by
2487          * supplying a vnode known to exist and illegal to
2488          * remove.
2489          */
2490         error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2491 
2492         /*
2493          * Force modified data and metadata out to stable storage.
2494          */
2495         (void) VOP_FSYNC(vp, 0, cr, NULL);
2496 
2497         VN_RELE(vp);
2498 
2499         /*
2500          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2501          * if the directory is not empty.  A System V NFS server
2502          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2503          * over the wire.
2504          */
2505         if (error == EEXIST)
2506                 *status = NFSERR_NOTEMPTY;
2507         else
2508                 *status = puterrno(error);
2509 
2510 }
2511 void *
2512 rfs_rmdir_getfh(struct nfsdiropargs *da)
2513 {
2514         return (da->da_fhandle);
2515 }
2516 
2517 /* ARGSUSED */
2518 void
2519 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2520     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2521 {
2522         int error;
2523         int iseof;
2524         struct iovec iov;
2525         struct uio uio;
2526         vnode_t *vp;
2527         char *ndata = NULL;
2528         struct sockaddr *ca;
2529         size_t nents;
2530         int ret;
2531 
2532         vp = nfs_fhtovp(&rda->rda_fh, exi);
2533         if (vp == NULL) {
2534                 rd->rd_entries = NULL;
2535                 rd->rd_status = NFSERR_STALE;
2536                 return;
2537         }
2538 
2539         if (vp->v_type != VDIR) {
2540                 VN_RELE(vp);
2541                 rd->rd_entries = NULL;
2542                 rd->rd_status = NFSERR_NOTDIR;
2543                 return;
2544         }
2545 
2546         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2547 
2548         error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2549 
2550         if (error) {
2551                 rd->rd_entries = NULL;
2552                 goto bad;
2553         }
2554 
2555         if (rda->rda_count == 0) {
2556                 rd->rd_entries = NULL;
2557                 rd->rd_size = 0;
2558                 rd->rd_eof = FALSE;
2559                 goto bad;
2560         }
2561 
2562         rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2563 
2564         /*
2565          * Allocate data for entries.  This will be freed by rfs_rddirfree.
2566          */
2567         rd->rd_bufsize = (uint_t)rda->rda_count;
2568         rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2569 
2570         /*
2571          * Set up io vector to read directory data
2572          */
2573         iov.iov_base = (caddr_t)rd->rd_entries;
2574         iov.iov_len = rda->rda_count;
2575         uio.uio_iov = &iov;
2576         uio.uio_iovcnt = 1;
2577         uio.uio_segflg = UIO_SYSSPACE;
2578         uio.uio_extflg = UIO_COPY_CACHED;
2579         uio.uio_loffset = (offset_t)rda->rda_offset;
2580         uio.uio_resid = rda->rda_count;
2581 
2582         /*
2583          * read directory
2584          */
2585         error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2586 
2587         /*
2588          * Clean up
2589          */
2590         if (!error) {
2591                 /*
2592                  * set size and eof
2593                  */
2594                 if (uio.uio_resid == rda->rda_count) {
2595                         rd->rd_size = 0;
2596                         rd->rd_eof = TRUE;
2597                 } else {
2598                         rd->rd_size = (uint32_t)(rda->rda_count -
2599                             uio.uio_resid);
2600                         rd->rd_eof = iseof ? TRUE : FALSE;
2601                 }
2602         }
2603 
2604         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2605         nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2606         ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2607             rda->rda_count, &ndata);
2608 
2609         if (ret != 0) {
2610                 size_t dropbytes;
2611                 /*
2612                  * We had to drop one or more entries in order to fit
2613                  * during the character conversion.  We need to patch
2614                  * up the size and eof info.
2615                  */
2616                 if (rd->rd_eof)
2617                         rd->rd_eof = FALSE;
2618                 dropbytes = nfscmd_dropped_entrysize(
2619                     (struct dirent64 *)rd->rd_entries, nents, ret);
2620                 rd->rd_size -= dropbytes;
2621         }
2622         if (ndata == NULL) {
2623                 ndata = (char *)rd->rd_entries;
2624         } else if (ndata != (char *)rd->rd_entries) {
2625                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2626                 rd->rd_entries = (void *)ndata;
2627                 rd->rd_bufsize = rda->rda_count;
2628         }
2629 
2630 bad:
2631         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2632 
2633 #if 0 /* notyet */
2634         /*
2635          * Don't do this.  It causes local disk writes when just
2636          * reading the file and the overhead is deemed larger
2637          * than the benefit.
2638          */
2639         /*
2640          * Force modified metadata out to stable storage.
2641          */
2642         (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2643 #endif
2644 
2645         VN_RELE(vp);
2646 
2647         rd->rd_status = puterrno(error);
2648 
2649 }
/*
 * Return a pointer to the file handle embedded in the READDIR arguments,
 * so the dispatch code can resolve the export before calling rfs_readdir().
 */
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
2655 void
2656 rfs_rddirfree(struct nfsrddirres *rd)
2657 {
2658         if (rd->rd_entries != NULL)
2659                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2660 }
2661 
2662 /* ARGSUSED */
2663 void
2664 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2665     struct svc_req *req, cred_t *cr, bool_t ro)
2666 {
2667         int error;
2668         struct statvfs64 sb;
2669         vnode_t *vp;
2670 
2671         vp = nfs_fhtovp(fh, exi);
2672         if (vp == NULL) {
2673                 fs->fs_status = NFSERR_STALE;
2674                 return;
2675         }
2676 
2677         error = VFS_STATVFS(vp->v_vfsp, &sb);
2678 
2679         if (!error) {
2680                 fs->fs_tsize = nfstsize();
2681                 fs->fs_bsize = sb.f_frsize;
2682                 fs->fs_blocks = sb.f_blocks;
2683                 fs->fs_bfree = sb.f_bfree;
2684                 fs->fs_bavail = sb.f_bavail;
2685         }
2686 
2687         VN_RELE(vp);
2688 
2689         fs->fs_status = puterrno(error);
2690 
2691 }
/*
 * Return a pointer to the STATFS argument's file handle for the
 * common dispatch code.
 */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
2697 
/*
 * Convert an NFSv2 over-the-wire sattr into a vattr, setting va_mask
 * bits only for the fields the client actually supplied (the protocol
 * uses all-ones sentinels for "don't change").  Returns 0 on success,
 * or EOVERFLOW if a supplied time cannot be represented in time_t on
 * a 32-bit kernel.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both halves of a time must be non-sentinel for it to count. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2764 
/*
 * Map vnode types (indexed by vtype_t: VNON, VREG, VDIR, VBLK, VCHR,
 * VLNK, VFIFO, VDOOR, VPROC, VSOCK, VBAD) to NFSv2 wire file types.
 * Types with no NFSv2 representation map to 0; VFIFO is special-cased
 * by NA_SETFIFO() in vattr_to_nattr() rather than here.
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2768 
2769 /*
2770  * check the following fields for overflow: nodeid, size, and time.
2771  * There could be a problem when converting 64-bit LP64 fields
2772  * into 32-bit ones.  Return an error if there is an overflow.
2773  */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* Propagate the "unset" sentinel; otherwise fold in the type bits. */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* Map the local nobody ids to the NFS wire nobody ids. */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* NFSv2 times are second/microsecond pairs; truncate nanoseconds. */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone. See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2875 
2876 /*
2877  * acl v2 support: returns approximate permission.
2878  *      default: returns minimal permission (more restrictive)
2879  *      aclok: returns maximal permission (less restrictive)
2880  *      This routine changes the permissions that are alaredy in *va.
2881  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2882  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
2883  */
2884 static void
2885 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2886 {
2887         vsecattr_t      vsa;
2888         int             aclcnt;
2889         aclent_t        *aclentp;
2890         mode_t          mask_perm;
2891         mode_t          grp_perm;
2892         mode_t          other_perm;
2893         mode_t          other_orig;
2894         int             error;
2895 
2896         /* dont care default acl */
2897         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2898         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2899 
2900         if (!error) {
2901                 aclcnt = vsa.vsa_aclcnt;
2902                 if (aclcnt > MIN_ACL_ENTRIES) {
2903                         /* non-trivial ACL */
2904                         aclentp = vsa.vsa_aclentp;
2905                         if (exi->exi_export.ex_flags & EX_ACLOK) {
2906                                 /* maximal permissions */
2907                                 grp_perm = 0;
2908                                 other_perm = 0;
2909                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2910                                         switch (aclentp->a_type) {
2911                                         case USER_OBJ:
2912                                                 break;
2913                                         case USER:
2914                                                 grp_perm |=
2915                                                     aclentp->a_perm << 3;
2916                                                 other_perm |= aclentp->a_perm;
2917                                                 break;
2918                                         case GROUP_OBJ:
2919                                                 grp_perm |=
2920                                                     aclentp->a_perm << 3;
2921                                                 break;
2922                                         case GROUP:
2923                                                 other_perm |= aclentp->a_perm;
2924                                                 break;
2925                                         case OTHER_OBJ:
2926                                                 other_orig = aclentp->a_perm;
2927                                                 break;
2928                                         case CLASS_OBJ:
2929                                                 mask_perm = aclentp->a_perm;
2930                                                 break;
2931                                         default:
2932                                                 break;
2933                                         }
2934                                 }
2935                                 grp_perm &= mask_perm << 3;
2936                                 other_perm &= mask_perm;
2937                                 other_perm |= other_orig;
2938 
2939                         } else {
2940                                 /* minimal permissions */
2941                                 grp_perm = 070;
2942                                 other_perm = 07;
2943                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2944                                         switch (aclentp->a_type) {
2945                                         case USER_OBJ:
2946                                                 break;
2947                                         case USER:
2948                                         case CLASS_OBJ:
2949                                                 grp_perm &=
2950                                                     aclentp->a_perm << 3;
2951                                                 other_perm &=
2952                                                     aclentp->a_perm;
2953                                                 break;
2954                                         case GROUP_OBJ:
2955                                                 grp_perm &=
2956                                                     aclentp->a_perm << 3;
2957                                                 break;
2958                                         case GROUP:
2959                                                 other_perm &=
2960                                                     aclentp->a_perm;
2961                                                 break;
2962                                         case OTHER_OBJ:
2963                                                 other_perm &=
2964                                                     aclentp->a_perm;
2965                                                 break;
2966                                         default:
2967                                                 break;
2968                                         }
2969                                 }
2970                         }
2971                         /* copy to va */
2972                         va->va_mode &= ~077;
2973                         va->va_mode |= grp_perm | other_perm;
2974                 }
2975                 if (vsa.vsa_aclcnt)
2976                         kmem_free(vsa.vsa_aclentp,
2977                             vsa.vsa_aclcnt * sizeof (aclent_t));
2978         }
2979 }
2980 
2981 void
2982 rfs_srvrinit(void)
2983 {
2984         mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2985         nfs2_srv_caller_id = fs_new_caller_id();
2986 }
2987 
/*
 * Tear down NFSv2 server module state set up by rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
2993 
2994 static int
2995 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2996 {
2997         struct clist    *wcl;
2998         int             wlist_len;
2999         uint32_t        count = rr->rr_count;
3000 
3001         wcl = ra->ra_wlist;
3002 
3003         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3004                 return (FALSE);
3005         }
3006 
3007         wcl = ra->ra_wlist;
3008         rr->rr_ok.rrok_wlist_len = wlist_len;
3009         rr->rr_ok.rrok_wlist = wcl;
3010 
3011         return (TRUE);
3012 }