5045-use-atomic_inc_*-atomic_dec_*-instead-of-atomic_add_* Wdiff usr/src/uts/common/fs/nfs/nfs4_client.c

Print this page

5045 use atomic_{inc,dec}_* instead of atomic_add_*

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/nfs/nfs4_client.c
          +++ new/usr/src/uts/common/fs/nfs/nfs4_client.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   */
  24   24  
  25   25  /*
  26   26   *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  27   27   *      All Rights Reserved
  28   28   */
  29   29  
  30   30  #include <sys/param.h>
  31   31  #include <sys/types.h>
  32   32  #include <sys/systm.h>
  33   33  #include <sys/thread.h>
  34   34  #include <sys/t_lock.h>
  35   35  #include <sys/time.h>
  36   36  #include <sys/vnode.h>
  37   37  #include <sys/vfs.h>
  38   38  #include <sys/errno.h>
  39   39  #include <sys/buf.h>
  40   40  #include <sys/stat.h>
  41   41  #include <sys/cred.h>
  42   42  #include <sys/kmem.h>
  43   43  #include <sys/debug.h>
  44   44  #include <sys/dnlc.h>
  45   45  #include <sys/vmsystm.h>
  46   46  #include <sys/flock.h>
  47   47  #include <sys/share.h>
  48   48  #include <sys/cmn_err.h>
  49   49  #include <sys/tiuser.h>
  50   50  #include <sys/sysmacros.h>
  51   51  #include <sys/callb.h>
  52   52  #include <sys/acl.h>
  53   53  #include <sys/kstat.h>
  54   54  #include <sys/signal.h>
  55   55  #include <sys/disp.h>
  56   56  #include <sys/atomic.h>
  57   57  #include <sys/list.h>
  58   58  #include <sys/sdt.h>
  59   59  
  60   60  #include <rpc/types.h>
  61   61  #include <rpc/xdr.h>
  62   62  #include <rpc/auth.h>
  63   63  #include <rpc/clnt.h>
  64   64  
  65   65  #include <nfs/nfs.h>
  66   66  #include <nfs/nfs_clnt.h>
  67   67  #include <nfs/nfs_acl.h>
  68   68  
  69   69  #include <nfs/nfs4.h>
  70   70  #include <nfs/rnode4.h>
  71   71  #include <nfs/nfs4_clnt.h>
  72   72  
  73   73  #include <vm/hat.h>
  74   74  #include <vm/as.h>
  75   75  #include <vm/page.h>
  76   76  #include <vm/pvn.h>
  77   77  #include <vm/seg.h>
  78   78  #include <vm/seg_map.h>
  79   79  #include <vm/seg_vn.h>
  80   80  
  81   81  #include <sys/ddi.h>
  82   82  
  83   83  /*
  84   84   * Arguments to page-flush thread.
  85   85   */
   86   86  typedef struct {
   87   87          vnode_t *vp;    /* vnode to flush; nfs4_purge_caches() does VN_HOLD for the thread */
   88   88          cred_t *cr;     /* credentials for the flush; nfs4_purge_caches() does crhold */
   89   89  } pgflush_t;
  90   90  
   91   91  #ifdef DEBUG
            /*
             * Debug-only switches.  Their consumers are outside the part of the
             * file visible here; presumably each one enables extra diagnostics for
             * the area it is named after (TODO confirm).
             */
   92   92  int nfs4_client_lease_debug;
   93   93  int nfs4_sharedfh_debug;
   94   94  int nfs4_fname_debug;
   95   95  
   96   96  /* temporary: panic if v_type is inconsistent with r_attr va_type */
   97   97  int nfs4_vtype_debug;
   98   98  
            /*
             * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that the
             * value stored under this key is NULL as its check that the
             * ..._end_op() call has been done.
             */
   99   99  uint_t nfs4_tsd_key;
  100  100  #endif
  101  101  
            /*
             * NOTE(review): presumably the time of the last resume and the id of
             * the registered nfs4_client_cpr_callb() callback; neither is used in
             * the part of the file visible here -- confirm.
             */
  102  102  static time_t   nfs4_client_resumed = 0;
  103  103  static  callb_id_t cid = 0;
  104  104  
            /* Forward declarations of file-local routines. */
  105  105  static int      nfs4renew(nfs4_server_t *);
  106  106  static void     nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
  107  107  static void     nfs4_pgflush_thread(pgflush_t *);
  108  108  
  109  109  static boolean_t nfs4_client_cpr_callb(void *, int);
 110  110  
  111  111  struct mi4_globals {
            /* Bookkeeping for the NFS v4 mounts that belong to one zone. */
  112  112          kmutex_t        mig_lock;  /* lock protecting mig_list */
  113  113          list_t          mig_list;  /* list of NFS v4 mounts in zone */
            /* NOTE(review): presumably set once zone teardown has run -- confirm */
  114  114          boolean_t       mig_destructor_called;
  115  115  };
  116  116  
            /*
             * NOTE(review): presumably the zone key under which each zone's
             * struct mi4_globals is stored; its uses are not visible here.
             */
  117  117  static zone_key_t mi4_list_key;
 118  118  
 119  119  /*
 120  120   * Attributes caching:
 121  121   *
 122  122   * Attributes are cached in the rnode in struct vattr form.
 123  123   * There is a time associated with the cached attributes (r_time_attr_inval)
 124  124   * which tells whether the attributes are valid. The time is initialized
 125  125   * to the difference between current time and the modify time of the vnode
 126  126   * when new attributes are cached. This allows the attributes for
 127  127   * files that have changed recently to be timed out sooner than for files
 128  128   * that have not changed for a long time. There are minimum and maximum
 129  129   * timeout values that can be set per mount point.
 130  130   */
 131  131  
  132  132  /*
  133  133   * If a cache purge is in progress, wait for it to finish.
  134  134   *
  135  135   * The current thread must not be in the middle of an
  136  136   * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
  137  137   * between this thread, a recovery thread, and the page flush thread.
             *
             * A purge counts as in progress while some other thread is recorded
             * in r_serial, or while R4PGFLUSH is set and r_pgflush is not the
             * current thread.
             *
             * Returns 0 once neither condition holds, or EINTR if cv_wait_sig()
             * returns 0 while waiting.
  138  138   */
  139  139  int
  140  140  nfs4_waitfor_purge_complete(vnode_t *vp)
  141  141  {
  142  142          rnode4_t *rp;
  143  143          k_sigset_t smask;
  144  144  
  145  145          rp = VTOR4(vp);
                    /*
                     * First test is made without r_statelock so the common case
                     * (nothing in progress) takes no lock; the same condition is
                     * re-evaluated under the lock in the loop below.
                     */
  146  146          if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
  147  147              ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
  148  148                  mutex_enter(&rp->r_statelock);
                            /*
                             * Signal mask is adjusted according to the mount's
                             * MI4_INT flag and restored by sigunintr() on every
                             * exit path below.
                             */
  149  149                  sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
  150  150                  while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
  151  151                      ((rp->r_flags & R4PGFLUSH) &&
  152  152                      rp->r_pgflush != curthread)) {
                                    /* woken by cv_broadcast(&rp->r_cv) from the purger */
  153  153                          if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
  154  154                                  sigunintr(&smask);
  155  155                                  mutex_exit(&rp->r_statelock);
  156  156                                  return (EINTR);
  157  157                          }
  158  158                  }
  159  159                  sigunintr(&smask);
  160  160                  mutex_exit(&rp->r_statelock);
  161  161          }
  162  162          return (0);
  163  163  }
 164  164  
  165  165  /*
  166  166   * Validate caches by checking cached attributes. If they have timed out,
  167  167   * then get new attributes from the server.  As a side effect, cache
  168  168   * invalidation is done if the attributes have changed.
  169  169   *
  170  170   * If the attributes have not timed out and if there is a cache
  171  171   * invalidation being done by some other thread, then wait until that
  172  172   * thread has completed the cache invalidation.
             *
             * Returns 0 when the cached attributes are valid and any pending purge
             * has completed, the error from nfs4_waitfor_purge_complete() if that
             * wait fails, or otherwise whatever nfs4_getattr_otw() returns.
  173  173   */
  174  174  int
  175  175  nfs4_validate_caches(vnode_t *vp, cred_t *cr)
  176  176  {
  177  177          int error;
  178  178          nfs4_ga_res_t gar;
  179  179  
  180  180          if (ATTRCACHE4_VALID(vp)) {
  181  181                  error = nfs4_waitfor_purge_complete(vp);
  182  182                  if (error)
  183  183                          return (error);
  184  184                  return (0);
  185  185          }
  186  186  
                    /* only the mask is set up here; the rest of gar is left to the callee */
  187  187          gar.n4g_va.va_mask = AT_ALL;
  188  188          return (nfs4_getattr_otw(vp, &gar, cr, 0));
  189  189  }
 190  190  
  191  191  /*
  192  192   * Fill in attribute from the cache.
  193  193   * If valid, then return 0 to indicate that no error occurred,
  194  194   * otherwise return 1 to indicate that an error occurred.
             *
             * *vap is written only in the valid case; it is left untouched when 1
             * is returned.
  195  195   */
  196  196  static int
  197  197  nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
  198  198  {
  199  199          rnode4_t *rp;
  200  200  
  201  201          rp = VTOR4(vp);
                    /* lock order used here: r_statelock first, then r_statev4_lock */
  202  202          mutex_enter(&rp->r_statelock);
  203  203          mutex_enter(&rp->r_statev4_lock);
  204  204          if (ATTRCACHE4_VALID(vp)) {
                            /* r_statev4_lock only covers the validity test */
  205  205                  mutex_exit(&rp->r_statev4_lock);
  206  206                  /*
  207  207                   * Cached attributes are valid
  208  208                   */
                            /* copy is made while still holding r_statelock */
  209  209                  *vap = rp->r_attr;
  210  210                  mutex_exit(&rp->r_statelock);
  211  211                  return (0);
  212  212          }
  213  213          mutex_exit(&rp->r_statev4_lock);
  214  214          mutex_exit(&rp->r_statelock);
  215  215          return (1);
  216  216  }
 217  217  
 218  218  
  219  219  /*
  220  220   * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
  221  221   * call is synchronous because all the pages were invalidated by the
  222  222   * nfs4_invalidate_pages() call.
             *
             * errno is the error returned by the preceding operation; any value
             * other than ESTALE makes this routine a no-op.  For ESTALE the rnode
             * is flagged R4STALE, the error is recorded in r_error unless an
             * earlier error is already stored there, cached pages (if any) are
             * invalidated, and the remaining caches, including the DNLC, are
             * purged using credentials cr.
  223  223   */
  224  224  void
  225  225  nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
  226  226  {
  227  227          struct rnode4 *rp = VTOR4(vp);
  228  228  
  229  229          /* Ensure that the ..._end_op() call has been done */
  230  230          ASSERT(tsd_get(nfs4_tsd_key) == NULL);
  231  231  
  232  232          if (errno != ESTALE)
  233  233                  return;
  234  234  
  235  235          mutex_enter(&rp->r_statelock);
  236  236          rp->r_flags |= R4STALE;
                    /* keep the first error recorded; do not overwrite it */
  237  237          if (!rp->r_error)
  238  238                  rp->r_error = errno;
  239  239          mutex_exit(&rp->r_statelock);
  240  240          if (nfs4_has_pages(vp))
  241  241                  nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
                    /* asyncpg == FALSE: do not start a page flush thread */
  242  242          nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
  243  243  }
 244  244  
  245  245  /*
  246  246   * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
  247  247   * page purge is done asynchronously.
             *
             * vp          vnode whose caches are purged
             * purge_dnlc  NFS4_PURGE_DNLC requests a DNLC purge for any vnode type;
             *             directories get one regardless of this argument.  In
             *             both cases the purge is skipped when v_count is not
             *             greater than 1.
             * cr          credentials used for the page flush
             * asyncpg     nonzero: flush pages from a separate thread
             *             (nfs4_pgflush_thread); zero: wait for any purge in
             *             progress and flush in the calling thread
             *
             * Besides the DNLC this drops the cached readlink contents, the cached
             * extended attribute directory vnode, the pathconf cache validity bits,
             * the page cache and the readdir cache.
  248  248   */
  249  249  void
  250  250  nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
  251  251  {
  252  252          rnode4_t *rp;
  253  253          char *contents;
  254  254          vnode_t *xattr;
  255  255          int size;
  256  256          int pgflush;                    /* are we the page flush thread? */
  257  257  
  258  258          /*
  259  259           * Purge the DNLC for any entries which refer to this file.
  260  260           */
  261  261          if (vp->v_count > 1 &&
  262  262              (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
  263  263                  dnlc_purge_vp(vp);
  264  264  
  265  265          /*
  266  266           * Clear any readdir state bits and purge the readlink response cache.
  267  267           */
  268  268          rp = VTOR4(vp);
  269  269          mutex_enter(&rp->r_statelock);
  270  270          rp->r_flags &= ~R4LOOKUP;
                    /*
                     * Detach the symlink buffer and the xattr directory from the
                     * rnode while holding r_statelock; they are freed/released
                     * below, after the lock has been dropped.
                     */
  271  271          contents = rp->r_symlink.contents;
  272  272          size = rp->r_symlink.size;
  273  273          rp->r_symlink.contents = NULL;
  274  274  
  275  275          xattr = rp->r_xattr_dir;
  276  276          rp->r_xattr_dir = NULL;
  277  277  
  278  278          /*
  279  279           * Purge pathconf cache too.
  280  280           */
  281  281          rp->r_pathconf.pc4_xattr_valid = 0;
  282  282          rp->r_pathconf.pc4_cache_valid = 0;
  283  283  
  284  284          pgflush = (curthread == rp->r_pgflush);
  285  285          mutex_exit(&rp->r_statelock);
  286  286  
  287  287          if (contents != NULL) {
  288  288  
  289  289                  kmem_free((void *)contents, size);
  290  290          }
  291  291  
  292  292          if (xattr != NULL)
  293  293                  VN_RELE(xattr);
  294  294  
  295  295          /*
  296  296           * Flush the page cache.  If the current thread is the page flush
  297  297           * thread, don't initiate a new page flush.  There's no need for
  298  298           * it, and doing it correctly is hard.
  299  299           */
  300  300          if (nfs4_has_pages(vp) && !pgflush) {
  301  301                  if (!asyncpg) {
                                    /* return value (0 or EINTR) is deliberately ignored */
  302  302                          (void) nfs4_waitfor_purge_complete(vp);
  303  303                          nfs4_flush_pages(vp, cr);
  304  304                  } else {
  305  305                          pgflush_t *args;
  306  306  
  307  307                          /*
  308  308                           * We don't hold r_statelock while creating the
  309  309                           * thread, in case the call blocks.  So we use a
  310  310                           * flag to indicate that a page flush thread is
  311  311                           * active.
  312  312                           */
  313  313                          mutex_enter(&rp->r_statelock);
  314  314                          if (rp->r_flags & R4PGFLUSH) {
                                            /* a flush thread is already marked active */
  315  315                                  mutex_exit(&rp->r_statelock);
  316  316                          } else {
  317  317                                  rp->r_flags |= R4PGFLUSH;
  318  318                                  mutex_exit(&rp->r_statelock);
  319  319  
                                            /*
                                             * The hold on vp and cr taken here is
                                             * dropped by nfs4_pgflush_thread(), which
                                             * also frees args.
                                             */
  320  320                                  args = kmem_alloc(sizeof (pgflush_t),
  321  321                                      KM_SLEEP);
  322  322                                  args->vp = vp;
  323  323                                  VN_HOLD(args->vp);
  324  324                                  args->cr = cr;
  325  325                                  crhold(args->cr);
  326  326                                  (void) zthread_create(NULL, 0,
  327  327                                      nfs4_pgflush_thread, args, 0,
  328  328                                      minclsyspri);
  329  329                          }
  330  330                  }
  331  331          }
  332  332  
  333  333          /*
  334  334           * Flush the readdir response cache.
  335  335           */
  336  336          nfs4_purge_rddir_cache(vp);
  337  337  }
 338  338  
  339  339  /*
  340  340   * Invalidate all pages for the given file, after writing back the dirty
  341  341   * ones.
             *
             * This is done with a single VOP_PUTPAGE(B_INVAL) call using offset 0
             * and length 0 and credentials cr.  Only ENOSPC and EDQUOT results are
             * acted on: they are saved in r_error unless an error is already
             * recorded there.  Any other result is discarded.
  342  342   */
  343  343  
  344  344  void
  345  345  nfs4_flush_pages(vnode_t *vp, cred_t *cr)
  346  346  {
  347  347          int error;
  348  348          rnode4_t *rp = VTOR4(vp);
  349  349  
  350  350          error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
  351  351          if (error == ENOSPC || error == EDQUOT) {
  352  352                  mutex_enter(&rp->r_statelock);
                            /* keep the first error recorded; do not overwrite it */
  353  353                  if (!rp->r_error)
  354  354                          rp->r_error = error;
  355  355                  mutex_exit(&rp->r_statelock);
  356  356          }
  357  357  }
 358  358  
  359  359  /*
  360  360   * Page flush thread.
             *
             * Started by nfs4_purge_caches() through zthread_create() after it has
             * set R4PGFLUSH.  Flushes the pages of args->vp with args->cr, then
             * clears r_pgflush and R4PGFLUSH and wakes all waiters on r_cv.
             * Releases the vnode and credential holds taken by the creator, frees
             * args and exits the thread.
  361  361   */
  362  362  
  363  363  static void
  364  364  nfs4_pgflush_thread(pgflush_t *args)
  365  365  {
  366  366          rnode4_t *rp = VTOR4(args->vp);
  367  367  
  368  368          /* remember which thread we are, so we don't deadlock ourselves */
  369  369          mutex_enter(&rp->r_statelock);
  370  370          ASSERT(rp->r_pgflush == NULL);
  371  371          rp->r_pgflush = curthread;
  372  372          mutex_exit(&rp->r_statelock);
  373  373  
  374  374          nfs4_flush_pages(args->vp, args->cr);
  375  375  
  376  376          mutex_enter(&rp->r_statelock);
  377  377          rp->r_pgflush = NULL;
  378  378          rp->r_flags &= ~R4PGFLUSH;
                    /* wake threads waiting on r_cv for the flush to finish */
  379  379          cv_broadcast(&rp->r_cv);
  380  380          mutex_exit(&rp->r_statelock);
  381  381  
  382  382          VN_RELE(args->vp);
  383  383          crfree(args->cr);
  384  384          kmem_free(args, sizeof (pgflush_t));
  385  385          zthread_exit();
  386  386  }
 387  387  
  388  388  /*
  389  389   * Purge the readdir cache of all entries which are not currently
  390  390   * being filled.
             *
             * Under r_statelock this also resets r_direof, clears R4LOOKUP and
             * sets R4READDIRWATTR before calling rddir4_cache_purge().
  391  391   */
  392  392  void
  393  393  nfs4_purge_rddir_cache(vnode_t *vp)
  394  394  {
  395  395          rnode4_t *rp;
  396  396  
  397  397          rp = VTOR4(vp);
  398  398  
  399  399          mutex_enter(&rp->r_statelock);
  400  400          rp->r_direof = NULL;
  401  401          rp->r_flags &= ~R4LOOKUP;
                    /* NOTE(review): presumably asks the next readdir to fetch attrs -- confirm */
  402  402          rp->r_flags |= R4READDIRWATTR;
                    /* called with r_statelock held */
  403  403          rddir4_cache_purge(rp);
  404  404          mutex_exit(&rp->r_statelock);
  405  405  }
 406  406  
  407  407  /*
  408  408   * Set attributes cache for given vnode using virtual attributes.  There is
  409  409   * no cache validation, but if the attributes are deemed to be stale, they
  410  410   * are ignored.  This corresponds to nfs3_attrcache().
  411  411   *
  412  412   * Set the timeout value on the attribute cache and fill it
  413  413   * with the passed in attributes.
             *
             * The attributes in garp are deemed stale, and dropped, when
             * r_time_attr_saved is later than t.  (t is presumably the hrtime at
             * which the caller obtained the attributes -- confirm against callers.)
  414  414   */
  415  415  void
  416  416  nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
  417  417  {
  418  418          rnode4_t *rp = VTOR4(vp);
  419  419  
                    /* nfs4_attrcache_va() must be called with r_statelock held */
  420  420          mutex_enter(&rp->r_statelock);
  421  421          if (rp->r_time_attr_saved <= t)
                            /* FALSE: do not establish a cache invalidation timeout */
  422  422                  nfs4_attrcache_va(vp, garp, FALSE);
  423  423          mutex_exit(&rp->r_statelock);
  424  424  }
 425  425  
  426  426  /*
  427  427   * Use the passed in virtual attributes to check to see whether the
  428  428   * data and metadata caches are valid, cache the new attributes, and
  429  429   * then do the cache invalidation if required.
  430  430   *
  431  431   * The cache validation and caching of the new attributes is done
  432  432   * atomically via the use of the mutex, r_statelock.  If required,
  433  433   * the cache invalidation is done atomically w.r.t. the cache
  434  434   * validation and caching of the attributes via the pseudo lock,
  435  435   * r_serial.
  436  436   *
  437  437   * This routine is used to do cache validation and attributes caching
  438  438   * for operations with a single set of post operation attributes.
             *
             * vp     vnode the attributes belong to
             * garp   attributes to cache; n4g_va.va_fsid is asserted to equal the
             *        mount's vfs_dev
             * t      time stamp compared against r_time_attr_saved and
             *        r_time_cache_inval (presumably the hrtime at which the
             *        attributes were obtained -- confirm against callers)
             * cr     credentials handed to nfs4_purge_caches()
             * async  passed to nfs4_purge_caches() as "asyncpg", except that the
             *        recovery thread always purges asynchronously
             * cinfo  change info from a directory-modifying operation, or NULL
             *
             * The routine returns without caching anything when the wait for
             * r_serial is interrupted, when it is the recovery thread and another
             * thread owns r_serial, when a page flush thread other than curthread
             * is active, or when newer attributes were already cached after t.  In
             * all of those cases except the interrupted wait the attribute cache
             * may be purged instead.
  439  439   */
  440  440  
  441  441  void
  442  442  nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
  443  443      hrtime_t t, cred_t *cr, int async,
  444  444      change_info4 *cinfo)
  445  445  {
  446  446          rnode4_t *rp;
  447  447          int mtime_changed = 0;
  448  448          int ctime_changed = 0;
  449  449          vsecattr_t *vsp;
  450  450          int was_serial, set_time_cache_inval, recov;
  451  451          vattr_t *vap = &garp->n4g_va;
  452  452          mntinfo4_t *mi = VTOMI4(vp);
  453  453          len_t preattr_rsize;
  454  454          boolean_t writemodify_set = B_FALSE;
  455  455          boolean_t cachepurge_set = B_FALSE;
  456  456  
  457  457          ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
  458  458  
  459  459          /* Is curthread the recovery thread? */
  460  460          mutex_enter(&mi->mi_lock);
  461  461          recov = (VTOMI4(vp)->mi_recovthread == curthread);
  462  462          mutex_exit(&mi->mi_lock);
  463  463  
  464  464          rp = VTOR4(vp);
  465  465          mutex_enter(&rp->r_statelock);
                    /*
                     * was_serial: curthread already owns r_serial on entry.  In that
                     * case there is nothing to wait for, and ownership is not given
                     * up at the end of this routine.
                     */
  466  466          was_serial = (rp->r_serial == curthread);
  467  467          if (rp->r_serial && !was_serial) {
  468  468                  klwp_t *lwp = ttolwp(curthread);
  469  469  
  470  470                  /*
  471  471                   * If we're the recovery thread, then purge current attrs
  472  472                   * and bail out to avoid potential deadlock between another
  473  473                   * thread caching attrs (r_serial thread), recov thread,
  474  474                   * and an async writer thread.
  475  475                   */
  476  476                  if (recov) {
  477  477                          PURGE_ATTRCACHE4_LOCKED(rp);
  478  478                          mutex_exit(&rp->r_statelock);
  479  479                          return;
  480  480                  }
  481  481  
                            /*
                             * lwp_nostop is raised for the duration of the wait and
                             * lowered again on both exit paths.
                             * NOTE(review): presumably keeps the lwp from being
                             * stopped while it sleeps here -- confirm.
                             */
  482  482                  if (lwp != NULL)
  483  483                          lwp->lwp_nostop++;
  484  484                  while (rp->r_serial != NULL) {
  485  485                          if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
                                            /* wait interrupted: give up, cache nothing */
  486  486                                  mutex_exit(&rp->r_statelock);
  487  487                                  if (lwp != NULL)
  488  488                                          lwp->lwp_nostop--;
  489  489                                  return;
  490  490                          }
  491  491                  }
  492  492                  if (lwp != NULL)
  493  493                          lwp->lwp_nostop--;
  494  494          }
  495  495  
  496  496          /*
  497  497           * If there is a page flush thread, the current thread needs to
  498  498           * bail out, to prevent a possible deadlock between the current
  499  499           * thread (which might be in a start_op/end_op region), the
  500  500           * recovery thread, and the page flush thread.  Expire the
  501  501           * attribute cache, so that any attributes the current thread was
  502  502           * going to set are not lost.
  503  503           */
  504  504          if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
  505  505                  PURGE_ATTRCACHE4_LOCKED(rp);
  506  506                  mutex_exit(&rp->r_statelock);
  507  507                  return;
  508  508          }
  509  509  
  510  510          if (rp->r_time_attr_saved > t) {
  511  511                  /*
  512  512                   * Attributes have been cached since these attributes were
  513  513                   * probably made. If there is an inconsistency in what is
  514  514                   * cached, mark them invalid. If not, don't act on them.
  515  515                   */
  516  516                  if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
  517  517                          PURGE_ATTRCACHE4_LOCKED(rp);
  518  518                  mutex_exit(&rp->r_statelock);
  519  519                  return;
  520  520          }
  521  521          set_time_cache_inval = 0;
  522  522          if (cinfo) {
  523  523                  /*
  524  524                   * Only directory modifying callers pass non-NULL cinfo.
  525  525                   */
  526  526                  ASSERT(vp->v_type == VDIR);
  527  527                  /*
  528  528                   * If the cache timeout either doesn't exist or hasn't expired,
  529  529                   * and dir didn't change on server before dirmod op
  530  530                   * and dir didn't change after dirmod op but before getattr
  531  531                   * then there's a chance that the client's cached data for
  532  532                   * this object is current (not stale).  No immediate cache
  533  533                   * flush is required.
  534  534                   *
  535  535                   */
  536  536                  if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
  537  537                      cinfo->before == rp->r_change &&
  538  538                      (garp->n4g_change_valid &&
  539  539                      cinfo->after == garp->n4g_change)) {
  540  540  
  541  541                          /*
  542  542                           * If atomic isn't set, then the before/after info
  543  543                           * cannot be blindly trusted.  For this case, we tell
  544  544                           * nfs4_attrcache_va to cache the attrs but also
  545  545                           * establish an absolute maximum cache timeout.  When
  546  546                           * the timeout is reached, caches will be flushed.
  547  547                           */
  548  548                          if (! cinfo->atomic)
  549  549                                  set_time_cache_inval = 1;
  550  550                  } else {
  551  551  
  552  552                          /*
  553  553                           * We're not sure exactly what changed, but we know
  554  554                           * what to do.  flush all caches for dir.  remove the
  555  555                           * attr timeout.
  556  556                           *
  557  557                           * a) timeout expired.  flush all caches.
  558  558                           * b) r_change != cinfo.before.  flush all caches.
  559  559                           * c) r_change == cinfo.before, but cinfo.after !=
  560  560                           *    post-op getattr(change).  flush all caches.
  561  561                           * d) post-op getattr(change) not provided by server.
  562  562                           *    flush all caches.
  563  563                           */
  564  564                          mtime_changed = 1;
  565  565                          ctime_changed = 1;
  566  566                          rp->r_time_cache_inval = 0;
  567  567                  }
  568  568          } else {
  569  569                  /*
  570  570                   * Write thread after writing data to file on remote server,
  571  571                   * will always set R4WRITEMODIFIED to indicate that file on
  572  572                   * remote server was modified with a WRITE operation and would
  573  573                   * have marked attribute cache as timed out. If R4WRITEMODIFIED
  574  574                   * is set, then do not check for mtime and ctime change.
  575  575                   */
  576  576                  if (!(rp->r_flags & R4WRITEMODIFIED)) {
  577  577                          if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
  578  578                                  mtime_changed = 1;
  579  579  
  580  580                          if (rp->r_attr.va_ctime.tv_sec !=
  581  581                              vap->va_ctime.tv_sec ||
  582  582                              rp->r_attr.va_ctime.tv_nsec !=
  583  583                              vap->va_ctime.tv_nsec)
  584  584                                  ctime_changed = 1;
  585  585                  } else {
  586  586                          writemodify_set = B_TRUE;
  587  587                  }
  588  588          }
  589  589  
                    /* remember r_size so a change made by nfs4_attrcache_va() is detectable */
  590  590          preattr_rsize = rp->r_size;
  591  591  
  592  592          nfs4_attrcache_va(vp, garp, set_time_cache_inval);
  593  593  
  594  594          /*
  595  595           * If we have updated filesize in nfs4_attrcache_va, as soon as we
  596  596           * drop statelock we will be in transition of purging all
  597  597           * our caches and updating them. It is possible for another
  598  598           * thread to pick this new file size and read in zeroed data.
  599  599           * stall other threads till cache purge is complete.
  600  600           */
  601  601          if ((!cinfo) && (rp->r_size != preattr_rsize)) {
  602  602                  /*
  603  603                   * If R4WRITEMODIFIED was set and we have updated the file
  604  604                   * size, Server's returned file size need not necessarily
  605  605                   * be because of this Client's WRITE. We need to purge
  606  606                   * all caches.
  607  607                   */
  608  608                  if (writemodify_set)
  609  609                          mtime_changed = 1;
  610  610  
                            /*
                             * cachepurge_set records that this call set
                             * R4INCACHEPURGE, so that only this call clears it
                             * again after the purge below.
                             */
  611  611                  if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
  612  612                          rp->r_flags |= R4INCACHEPURGE;
  613  613                          cachepurge_set = B_TRUE;
  614  614                  }
  615  615          }
  616  616  
                    /* nothing changed: the new attributes are cached, no purge needed */
  617  617          if (!mtime_changed && !ctime_changed) {
  618  618                  mutex_exit(&rp->r_statelock);
  619  619                  return;
  620  620          }
  621  621  
                    /*
                     * Take the r_serial pseudo lock before dropping r_statelock, so
                     * that other threads wait on r_cv until the invalidation below
                     * has finished.
                     */
  622  622          rp->r_serial = curthread;
  623  623  
  624  624          mutex_exit(&rp->r_statelock);
  625  625  
  626  626          /*
  627  627           * If we're the recov thread, then force async nfs4_purge_caches
  628  628           * to avoid potential deadlock.
  629  629           */
  630  630          if (mtime_changed)
  631  631                  nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
  632  632  
  633  633          if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
  634  634                  mutex_enter(&rp->r_statelock);
  635  635                  rp->r_flags &= ~R4INCACHEPURGE;
  636  636                  cv_broadcast(&rp->r_cv);
  637  637                  mutex_exit(&rp->r_statelock);
  638  638                  cachepurge_set = B_FALSE;
  639  639          }
  640  640  
                    /* ctime change: drop the cached access results and the cached ACL */
  641  641          if (ctime_changed) {
  642  642                  (void) nfs4_access_purge_rp(rp);
  643  643                  if (rp->r_secattr != NULL) {
  644  644                          mutex_enter(&rp->r_statelock);
                                    /* re-read under the lock; freed after it is dropped */
  645  645                          vsp = rp->r_secattr;
  646  646                          rp->r_secattr = NULL;
  647  647                          mutex_exit(&rp->r_statelock);
  648  648                          if (vsp != NULL)
  649  649                                  nfs4_acl_free_cache(vsp);
  650  650                  }
  651  651          }
  652  652  
                    /* release r_serial only if this call acquired it, and wake waiters */
  653  653          if (!was_serial) {
  654  654                  mutex_enter(&rp->r_statelock);
  655  655                  rp->r_serial = NULL;
  656  656                  cv_broadcast(&rp->r_cv);
  657  657                  mutex_exit(&rp->r_statelock);
  658  658          }
  659  659  }
 660  660  
 661  661  /*
 662  662   * Set attributes cache for given vnode using virtual attributes.
 663  663   *
 664  664   * Set the timeout value on the attribute cache and fill it
 665  665   * with the passed in attributes.
 666  666   *
 667  667   * The caller must be holding r_statelock.
 668  668   */
 669  669  static void
 670  670  nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
 671  671  {
 672  672          rnode4_t *rp;
 673  673          mntinfo4_t *mi;
 674  674          hrtime_t delta;
 675  675          hrtime_t now;
 676  676          vattr_t *vap = &garp->n4g_va;
 677  677  
 678  678          rp = VTOR4(vp);
 679  679  
 680  680          ASSERT(MUTEX_HELD(&rp->r_statelock));
 681  681          ASSERT(vap->va_mask == AT_ALL);
 682  682  
 683  683          /* Switch to master before checking v_flag */
 684  684          if (IS_SHADOW(vp, rp))
 685  685                  vp = RTOV4(rp);
 686  686  
 687  687          now = gethrtime();
 688  688  
 689  689          mi = VTOMI4(vp);
 690  690  
 691  691          /*
 692  692           * Only establish a new cache timeout (if requested).  Never
 693  693           * extend a timeout.  Never clear a timeout.  Clearing a timeout
 694  694           * is done by nfs4_update_dircaches (ancestor in our call chain)
 695  695           */
 696  696          if (set_cache_timeout && ! rp->r_time_cache_inval)
 697  697                  rp->r_time_cache_inval = now + mi->mi_acdirmax;
 698  698  
 699  699          /*
 700  700           * Delta is the number of nanoseconds that we will
 701  701           * cache the attributes of the file.  It is based on
 702  702           * the number of nanoseconds since the last time that
 703  703           * we detected a change.  The assumption is that files
 704  704           * that changed recently are likely to change again.
 705  705           * There is a minimum and a maximum for regular files
 706  706           * and for directories which is enforced though.
 707  707           *
 708  708           * Using the time since last change was detected
 709  709           * eliminates direct comparison or calculation
 710  710           * using mixed client and server times.  NFS does
 711  711           * not make any assumptions regarding the client
 712  712           * and server clocks being synchronized.
 713  713           */
 714  714          if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
 715  715              vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
 716  716              vap->va_size != rp->r_attr.va_size) {
 717  717                  rp->r_time_attr_saved = now;
 718  718          }
 719  719  
 720  720          if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
 721  721                  delta = 0;
 722  722          else {
 723  723                  delta = now - rp->r_time_attr_saved;
 724  724                  if (vp->v_type == VDIR) {
 725  725                          if (delta < mi->mi_acdirmin)
 726  726                                  delta = mi->mi_acdirmin;
 727  727                          else if (delta > mi->mi_acdirmax)
 728  728                                  delta = mi->mi_acdirmax;
 729  729                  } else {
 730  730                          if (delta < mi->mi_acregmin)
 731  731                                  delta = mi->mi_acregmin;
 732  732                          else if (delta > mi->mi_acregmax)
 733  733                                  delta = mi->mi_acregmax;
 734  734                  }
 735  735          }
 736  736          rp->r_time_attr_inval = now + delta;
 737  737  
 738  738          rp->r_attr = *vap;
 739  739          if (garp->n4g_change_valid)
 740  740                  rp->r_change = garp->n4g_change;
 741  741  
 742  742          /*
 743  743           * The attributes that were returned may be valid and can
 744  744           * be used, but they may not be allowed to be cached.
 745  745           * Reset the timers to cause immediate invalidation and
 746  746           * clear r_change so no VERIFY operations will succeed
 747  747           */
 748  748          if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
 749  749                  rp->r_time_attr_inval = now;
 750  750                  rp->r_time_attr_saved = now;
 751  751                  rp->r_change = 0;
 752  752          }
 753  753  
 754  754          /*
 755  755           * If mounted_on_fileid returned AND the object is a stub,
 756  756           * then set object's va_nodeid to the mounted over fid
 757  757           * returned by server.
 758  758           *
 759  759           * If mounted_on_fileid not provided/supported, then
 760  760           * just set it to 0 for now.  Eventually it would be
 761  761           * better to set it to a hashed version of FH.  This
 762  762           * would probably be good enough to provide a unique
 763  763           * fid/d_ino within a dir.
 764  764           *
 765  765           * We don't need to carry mounted_on_fileid in the
 766  766           * rnode as long as the client never requests fileid
 767  767           * without also requesting mounted_on_fileid.  For
 768  768           * now, it stays.
 769  769           */
 770  770          if (garp->n4g_mon_fid_valid) {
 771  771                  rp->r_mntd_fid = garp->n4g_mon_fid;
 772  772  
 773  773                  if (RP_ISSTUB(rp))
 774  774                          rp->r_attr.va_nodeid = rp->r_mntd_fid;
 775  775          }
 776  776  
 777  777          /*
 778  778           * Check to see if there are valid pathconf bits to
 779  779           * cache in the rnode.
 780  780           */
 781  781          if (garp->n4g_ext_res) {
 782  782                  if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
 783  783                          rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
 784  784                  } else {
 785  785                          if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
 786  786                                  rp->r_pathconf.pc4_xattr_valid = TRUE;
 787  787                                  rp->r_pathconf.pc4_xattr_exists =
 788  788                                      garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
 789  789                          }
 790  790                  }
 791  791          }
 792  792          /*
 793  793           * Update the size of the file if there is no cached data or if
 794  794           * the cached data is clean and there is no data being written
 795  795           * out.
 796  796           */
 797  797          if (rp->r_size != vap->va_size &&
 798  798              (!vn_has_cached_data(vp) ||
 799  799              (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
 800  800                  rp->r_size = vap->va_size;
 801  801          }
 802  802          nfs_setswaplike(vp, vap);
 803  803          rp->r_flags &= ~R4WRITEMODIFIED;
 804  804  }
 805  805  
 806  806  /*
 807  807   * Get attributes over-the-wire and update attributes cache
 808  808   * if no error occurred in the over-the-wire operation.
 809  809   * Return 0 if successful, otherwise error.
            *
            * The over-the-wire call is bracketed by nfs4_start_fop() and
            * nfs4_end_fop().  If the result needs recovery and
            * nfs4_start_recovery() returns FALSE, the fop is ended and the
            * operation is retried from recov_retry.
            *
            * When the call itself succeeds but the NFS4 status is not NFS4_OK,
            * the status is mapped to an errno with geterrno4() and
            * nfs4_purge_stale_fh() is called with that errno.
            *
            * get_acl is passed through unchanged to
            * nfs4_getattr_otw_norecovery().
 810  810   */
 811  811  int
 812  812  nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
 813  813  {
 814  814          mntinfo4_t *mi = VTOMI4(vp);
 815  815          hrtime_t t;
 816  816          nfs4_recov_state_t recov_state;
 817  817          nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 818  818  
 819  819          recov_state.rs_flags = 0;
 820  820          recov_state.rs_num_retry_despite_err = 0;
 821  821  
 822  822          /* Save the original mount point security flavor */
 823  823          (void) save_mnt_secinfo(mi->mi_curr_serv);
 824  824  
 825  825  recov_retry:
 826  826  
 827  827          if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
 828  828              &recov_state, NULL))) {
                           /*
                            * check_mnt_secinfo() is called on this early exit
                            * as well as on the normal return path below.
                            */
 829  829                  (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
 830  830                  return (e.error);
 831  831          }
 832  832  
                   /*
                    * Timestamp taken before the call goes out; it is handed to
                    * nfs4_attr_cache() below if the call succeeds.
                    */
 833  833          t = gethrtime();
 834  834  
 835  835          nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
 836  836  
 837  837          if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
 838  838                  if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
 839  839                      NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
 840  840                          nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
 841  841                              &recov_state, 1);
 842  842                          goto recov_retry;
 843  843                  }
                           /*
                            * Any other return from nfs4_start_recovery() falls
                            * through to the normal end-of-fop handling below.
                            */
 844  844          }
 845  845  
 846  846          nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
 847  847  
 848  848          if (!e.error) {
 849  849                  if (e.stat == NFS4_OK) {
 850  850                          nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
 851  851                  } else {
 852  852                          e.error = geterrno4(e.stat);
 853  853  
 854  854                          nfs4_purge_stale_fh(e.error, vp, cr);
 855  855                  }
 856  856          }
 857  857  
 858  858          /*
 859  859           * If getattr a node that is a stub for a crossed
 860  860           * mount point, keep the original secinfo flavor for
 861  861           * the current file system, not the crossed one.
 862  862           */
 863  863          (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
 864  864  
 865  865          return (e.error);
 866  866  }
 867  867  
 868  868  /*
 869  869   * Generate a compound to get attributes over-the-wire.
            *
            * The compound has two operations: CPUTFH using the rnode's file
            * handle, followed by GETATTR requesting NFS4_VATTR_MASK (plus
            * FATTR4_ACL_MASK when get_acl is non-zero).  It is sent with
            * rfs4call(), which is given ep to record the outcome.
            *
            * *garp is written only when ep->error is zero and the compound
            * status is NFS4_OK; on any other outcome it is left untouched.
            * This routine does no fop bracketing or recovery of its own.
 870  870   */
 871  871  void
 872  872  nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
 873  873      nfs4_error_t *ep, cred_t *cr, int get_acl)
 874  874  {
 875  875          COMPOUND4args_clnt args;
 876  876          COMPOUND4res_clnt res;
 877  877          int doqueue;
 878  878          rnode4_t *rp = VTOR4(vp);
 879  879          nfs_argop4 argop[2];
 880  880  
 881  881          args.ctag = TAG_GETATTR;
 882  882  
 883  883          args.array_len = 2;
 884  884          args.array = argop;
 885  885  
 886  886          /* putfh */
 887  887          argop[0].argop = OP_CPUTFH;
 888  888          argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
 889  889  
 890  890          /* getattr */
 891  891          /*
 892  892           * Unlike nfs version 2 and 3, where getattr returns all the
 893  893           * attributes, nfs version 4 returns only the ones explicitly
 894  894           * asked for. This creates problems, as some system functions
 895  895           * (e.g. cache check) require certain attributes and if the
 896  896           * cached node lacks some attributes such as uid/gid, it can
 897  897           * affect system utilities (e.g. "ls") that rely on the information
 898  898           * to be there. This can lead to anything from system crashes to
 899  899           * corrupted information processed by user apps.
 900  900           * So to ensure that all bases are covered, request at least
 901  901           * the AT_ALL attribute mask.
 902  902           */
 903  903          argop[1].argop = OP_GETATTR;
 904  904          argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
 905  905          if (get_acl)
 906  906                  argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
 907  907          argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
 908  908  
 909  909          doqueue = 1;
 910  910  
 911  911          rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
 912  912  
                   /* On a call-level error res is not freed here. */
 913  913          if (ep->error)
 914  914                  return;
 915  915  
 916  916          if (res.status != NFS4_OK) {
 917  917                  (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
 918  918                  return;
 919  919          }
 920  920  
                   /*
                    * Structure copy of the GETATTR result (op index 1) into the
                    * caller's buffer.
                    *
                    * NOTE(review): this is a shallow copy made just before
                    * xdr_free(); confirm that nothing *garp still points to is
                    * released by that free.
                    */
 921  921          *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
 922  922  
 923  923          (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
 924  924  }
 925  925  
 926  926  /*
 927  927   * Return either cached or remote attributes. If get remote attr
 928  928   * use them to check and invalidate caches, then cache the new attributes.
            *
            * Returns 0, or the error returned by nfs4_getattr_otw().  On error
            * *vap is not filled in from the rnode, but vap->va_size is still
            * overwritten with the rnode's r_size (see below).
 929  929   */
 930  930  int
 931  931  nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
 932  932  {
 933  933          int error;
 934  934          rnode4_t *rp;
 935  935          nfs4_ga_res_t gar;
 936  936  
 937  937          ASSERT(nfs4_consistent_type(vp));
 938  938  
 939  939          /*
 940  940           * If we've got cached attributes, we're done, otherwise go
 941  941           * to the server to get attributes, which will update the cache
 942  942           * in the process. Either way, use the cached attributes for
 943  943           * the caller's vattr_t.
 944  944           *
 945  945           * Note that we ignore the gar set by the OTW call: the attr caching
 946  946           * code may make adjustments when storing to the rnode, and we want
 947  947           * to see those changes here.
 948  948           */
 949  949          rp = VTOR4(vp);
 950  950          error = 0;
 951  951          mutex_enter(&rp->r_statelock);
 952  952          if (!ATTRCACHE4_VALID(vp)) {
                           /* r_statelock is dropped across the OTW call. */
 953  953                  mutex_exit(&rp->r_statelock);
 954  954                  error = nfs4_getattr_otw(vp, &gar, cr, 0);
 955  955                  mutex_enter(&rp->r_statelock);
 956  956          }
 957  957  
 958  958          if (!error)
 959  959                  *vap = rp->r_attr;
 960  960  
 961  961          /* Return the client's view of file size */
                   /* This assignment is unconditional, so it also runs on error. */
 962  962          vap->va_size = rp->r_size;
 963  963  
 964  964          mutex_exit(&rp->r_statelock);
 965  965  
 966  966          ASSERT(nfs4_consistent_type(vp));
 967  967  
 968  968          return (error);
 969  969  }
 970  970  
           /*
            * Fetch the attributes named by reqbitmap over-the-wire with a
            * two-op compound (CPUTFH on the rnode's file handle, then GETATTR),
            * tagged with tag_type.
            *
            * The call is bracketed by nfs4_start_fop()/nfs4_end_fop().  When
            * the result needs recovery, recovery is started and, if
            * nfs4_start_recovery() returns FALSE, the whole operation is
            * retried from recov_retry; otherwise the error is returned.
            *
            * On success the GETATTR result is copied into *garp.  The caller's
            * garp->n4g_ext_res pointer is preserved across that copy, and the
            * extended result contents are copied into it only when both the
            * caller's pointer and the result's pointer are non-NULL.
            *
            * Returns 0 on success, otherwise an error number (from
            * nfs4_start_fop(), from the call itself, or mapped from the NFS4
            * status by geterrno4()).
            */
 971  971  int
 972  972  nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
 973  973      nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
 974  974  {
 975  975          COMPOUND4args_clnt args;
 976  976          COMPOUND4res_clnt res;
 977  977          int doqueue;
 978  978          nfs_argop4 argop[2];
 979  979          mntinfo4_t *mi = VTOMI4(vp);
 980  980          bool_t needrecov = FALSE;
 981  981          nfs4_recov_state_t recov_state;
 982  982          nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 983  983          nfs4_ga_ext_res_t *gerp;
 984  984  
 985  985          recov_state.rs_flags = 0;
 986  986          recov_state.rs_num_retry_despite_err = 0;
 987  987  
 988  988  recov_retry:
 989  989          args.ctag = tag_type;
 990  990  
 991  991          args.array_len = 2;
 992  992          args.array = argop;
 993  993  
 994  994          e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
 995  995          if (e.error)
 996  996                  return (e.error);
 997  997  
 998  998          /* putfh */
 999  999          argop[0].argop = OP_CPUTFH;
1000 1000          argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1001 1001  
1002 1002          /* getattr */
1003 1003          argop[1].argop = OP_GETATTR;
1004 1004          argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1005 1005          argop[1].nfs_argop4_u.opgetattr.mi = mi;
1006 1006  
1007 1007          doqueue = 1;
1008 1008  
                   /*
                    * needrecov is FALSE on the first pass; on a retry it still
                    * holds the value computed for the previous attempt.
                    */
1009 1009          NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1010 1010              "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1011 1011              rnode4info(VTOR4(vp))));
1012 1012  
1013 1013          rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1014 1014  
1015 1015          needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
                   /* Call-level error with no recovery needed: res is not freed. */
1016 1016          if (!needrecov && e.error) {
1017 1017                  nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1018 1018                      needrecov);
1019 1019                  return (e.error);
1020 1020          }
1021 1021  
1022 1022          if (needrecov) {
1023 1023                  bool_t abort;
1024 1024  
1025 1025                  NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1026 1026                      "nfs4_attr_otw: initiating recovery\n"));
1027 1027  
1028 1028                  abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1029 1029                      NULL, OP_GETATTR, NULL, NULL, NULL);
1030 1030                  nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1031 1031                      needrecov);
                           /*
                            * NOTE(review): res.status is read after xdr_free();
                            * presumably the free leaves the scalar status field
                            * intact -- confirm.
                            */
1032 1032                  if (!e.error) {
1033 1033                          (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1034 1034                          e.error = geterrno4(res.status);
1035 1035                  }
1036 1036                  if (abort == FALSE)
1037 1037                          goto recov_retry;
1038 1038                  return (e.error);
1039 1039          }
1040 1040  
1041 1041          if (res.status) {
1042 1042                  e.error = geterrno4(res.status);
1043 1043          } else {
                           /*
                            * Save the caller's extended-result pointer, since the
                            * bcopy of the whole structure overwrites it, then
                            * restore it and copy the extended contents separately.
                            */
1044 1044                  gerp = garp->n4g_ext_res;
1045 1045                  bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1046 1046                      garp, sizeof (nfs4_ga_res_t));
1047 1047                  garp->n4g_ext_res = gerp;
1048 1048                  if (garp->n4g_ext_res &&
1049 1049                      res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1050 1050                          bcopy(res.array[1].nfs_resop4_u.opgetattr.
1051 1051                              ga_res.n4g_ext_res,
1052 1052                              garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1053 1053          }
1054 1054          (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1055 1055          nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1056 1056              needrecov);
1057 1057          return (e.error);
1058 1058  }
1059 1059  
1060 1060  /*
1061 1061   * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1062 1062   * for the demand-based allocation of async threads per-mount.  The
1063 1063   * nfs_async_timeout is the amount of time a thread will live after it
1064 1064   * becomes idle, unless new I/O requests are received before the thread
1065 1065   * dies.  See nfs4_async_putpage and nfs4_async_start.
1066 1066   */
1067 1067  
           /*
            * Forward declarations.  nfs4_async_start() and
            * nfs4_async_pgops_start() are the thread entry points passed to
            * zthread_create(); both call nfs4_async_common_start().
            */
1068 1068  static void     nfs4_async_start(struct vfs *);
1069 1069  static void     nfs4_async_pgops_start(struct vfs *);
1070 1070  static void     nfs4_async_common_start(struct vfs *, int);
1071 1071  
           /*
            * Release an async request structure and the references it carries.
            *
            * For every request type other than NFS4_INACTIVE, the rnode's
            * r_count is decremented (and r_awcount too for NFS4_PUTAPAGE and
            * NFS4_PAGEIO) under r_statelock, waiters on r_cv are woken, and the
            * vnode hold is released.  The credential reference and the
            * structure itself are always freed.
            */
1072 1072  static void
1073 1073  free_async_args4(struct nfs4_async_reqs *args)
1074 1074  {
1075 1075          rnode4_t *rp;
1076 1076  
1077 1077          if (args->a_io != NFS4_INACTIVE) {
1078 1078                  rp = VTOR4(args->a_vp);
1079 1079                  mutex_enter(&rp->r_statelock);
1080 1080                  rp->r_count--;
1081 1081                  if (args->a_io == NFS4_PUTAPAGE ||
1082 1082                      args->a_io == NFS4_PAGEIO)
1083 1083                          rp->r_awcount--;
1084 1084                  cv_broadcast(&rp->r_cv);
1085 1085                  mutex_exit(&rp->r_statelock);
1086 1086                  VN_RELE(args->a_vp);
1087 1087          }
1088 1088          crfree(args->a_cred);
1089 1089          kmem_free(args, sizeof (*args));
1090 1090  }
1091 1091  
1092 1092  /*
1093 1093   * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1094 1094   * pageout(), running in the global zone, have legitimate reasons to do
1095 1095   * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1096 1096   * use of a per-mount "asynchronous requests manager thread" which is
1097 1097   * signaled by the various asynchronous work routines when there is
1098 1098   * asynchronous work to be done.  It is responsible for creating new
1099 1099   * worker threads if necessary, and notifying existing worker threads
1100 1100   * that there is work to be done.
1101 1101   *
1102 1102   * In other words, it will "take the specifications from the customers and
1103 1103   * give them to the engineers."
1104 1104   *
1105 1105   * Worker threads die off of their own accord if they are no longer
1106 1106   * needed.
1107 1107   *
1108 1108   * This thread is killed when the zone is going away or the filesystem
1109 1109   * is being unmounted.
            *
            * On exit this thread clears mi_manager_thread, wakes waiters on
            * mi_inact_req_cv and mi_async_cv, and releases its own holds on
            * vfsp and mi.
1110 1110   */
1111 1111  void
1112 1112  nfs4_async_manager(vfs_t *vfsp)
1113 1113  {
1114 1114          callb_cpr_t cprinfo;
1115 1115          mntinfo4_t *mi;
1116 1116          uint_t max_threads;
1117 1117  
1118 1118          mi = VFTOMI4(vfsp);
1119 1119  
1120 1120          CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1121 1121              "nfs4_async_manager");
1122 1122  
1123 1123          mutex_enter(&mi->mi_async_lock);
1124 1124          /*
1125 1125           * We want to stash the max number of threads that this mount was
1126 1126           * allowed so we can use it later when the variable is set to zero as
1127 1127           * part of the zone/mount going away.
1128 1128           *
1129 1129           * We want to be able to create at least one thread to handle
1130 1130           * asynchronous inactive calls.
1131 1131           */
1132 1132          max_threads = MAX(mi->mi_max_threads, 1);
1133 1133          /*
1134 1134           * We don't want to wait for mi_max_threads to go to zero, since that
1135 1135           * happens as part of a failed unmount, but this thread should only
1136 1136           * exit when the mount is really going away.
1137 1137           *
1138 1138           * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1139 1139           * attempted: the various _async_*() functions know to do things
1140 1140           * inline if mi_max_threads == 0.  Henceforth we just drain out the
1141 1141           * outstanding requests.
1142 1142           *
1143 1143           * Note that we still create zthreads even if we notice the zone is
1144 1144           * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1145 1145           * shutdown sequence to take slightly longer in some cases, but
1146 1146           * doesn't violate the protocol, as all threads will exit as soon as
1147 1147           * they're done processing the remaining requests.
1148 1148           */
1149 1149          for (;;) {
1150 1150                  while (mi->mi_async_req_count > 0) {
1151 1151                          /*
1152 1152                           * Paranoia: If the mount started out having
1153 1153                           * (mi->mi_max_threads == 0), and the value was
1154 1154                           * later changed (via a debugger or somesuch),
1155 1155                           * we could be confused since we will think we
1156 1156                           * can't create any threads, and the calling
1157 1157                           * code (which looks at the current value of
1158 1158                           * mi->mi_max_threads, now non-zero) thinks we
1159 1159                           * can.
1160 1160                           *
1161 1161                           * So, because we're paranoid, we create threads
1162 1162                           * up to the maximum of the original and the
1163 1163                           * current value. This means that future
1164 1164                           * (debugger-induced) alterations of
1165 1165                           * mi->mi_max_threads are ignored for our
1166 1166                           * purposes, but who told them they could change
1167 1167                           * random values on a live kernel anyhow?
1168 1168                           */
1169 1169                          if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1170 1170                              MAX(mi->mi_max_threads, max_threads)) {
                                           /*
                                            * The thread count is bumped while the
                                            * lock is held; the lock is then dropped
                                            * across the holds and thread creation.
                                            */
1171 1171                                  mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1172 1172                                  mutex_exit(&mi->mi_async_lock);
1173 1173                                  MI4_HOLD(mi);
1174 1174                                  VFS_HOLD(vfsp); /* hold for new thread */
1175 1175                                  (void) zthread_create(NULL, 0, nfs4_async_start,
1176 1176                                      vfsp, 0, minclsyspri);
1177 1177                                  mutex_enter(&mi->mi_async_lock);
1178 1178                          } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1179 1179                              NUM_ASYNC_PGOPS_THREADS) {
                                           /*
                                            * Reached only when the NFS4_ASYNC_QUEUE
                                            * thread count is already at its limit.
                                            */
1180 1180                                  mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1181 1181                                  mutex_exit(&mi->mi_async_lock);
1182 1182                                  MI4_HOLD(mi);
1183 1183                                  VFS_HOLD(vfsp); /* hold for new thread */
1184 1184                                  (void) zthread_create(NULL, 0,
1185 1185                                      nfs4_async_pgops_start, vfsp, 0,
1186 1186                                      minclsyspri);
1187 1187                                  mutex_enter(&mi->mi_async_lock);
1188 1188                          }
                                   /* One pending request is accounted for per pass. */
1189 1189                          NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1190 1190                          ASSERT(mi->mi_async_req_count != 0);
1191 1191                          mi->mi_async_req_count--;
1192 1192                  }
1193 1193  
                           /* mi_flags is examined under mi_lock. */
1194 1194                  mutex_enter(&mi->mi_lock);
1195 1195                  if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1196 1196                          mutex_exit(&mi->mi_lock);
1197 1197                          break;
1198 1198                  }
1199 1199                  mutex_exit(&mi->mi_lock);
1200 1200  
                           /* Sleep until a request is queued or a stop is signaled. */
1201 1201                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
1202 1202                  cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1203 1203                  CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1204 1204          }
1205 1205  
1206 1206          NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1207 1207              "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1208 1208          /*
1209 1209           * Let everyone know we're done.
1210 1210           */
1211 1211          mi->mi_manager_thread = NULL;
1212 1212          /*
1213 1213           * Wake up the inactive thread.
1214 1214           */
1215 1215          cv_broadcast(&mi->mi_inact_req_cv);
1216 1216          /*
1217 1217           * Wake up anyone sitting in nfs4_async_manager_stop()
1218 1218           */
1219 1219          cv_broadcast(&mi->mi_async_cv);
1220 1220          /*
1221 1221           * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1222 1222           * since CALLB_CPR_EXIT is actually responsible for releasing
1223 1223           * 'mi_async_lock'.
1224 1224           */
1225 1225          CALLB_CPR_EXIT(&cprinfo);
1226 1226          VFS_RELE(vfsp); /* release thread's hold */
1227 1227          MI4_RELE(mi);
1228 1228          zthread_exit();
1229 1229  }
1230 1230  
1231 1231  /*
1232 1232   * Signal (and wait for) the async manager thread to clean up and go away.
            *
            * MI4_ASYNC_MGR_STOP is set in mi_flags under mi_lock while
            * mi_async_lock is held, and waiters on mi_async_reqs_cv are woken.
            * The caller then blocks on mi_async_cv until mi_manager_thread is
            * NULL.
1233 1233   */
1234 1234  void
1235 1235  nfs4_async_manager_stop(vfs_t *vfsp)
1236 1236  {
1237 1237          mntinfo4_t *mi = VFTOMI4(vfsp);
1238 1238  
1239 1239          mutex_enter(&mi->mi_async_lock);
1240 1240          mutex_enter(&mi->mi_lock);
1241 1241          mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1242 1242          mutex_exit(&mi->mi_lock);
1243 1243          cv_broadcast(&mi->mi_async_reqs_cv);
1244 1244          /*
1245 1245           * Wait for the async manager thread to die.
1246 1246           */
1247 1247          while (mi->mi_manager_thread != NULL)
1248 1248                  cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1249 1249          mutex_exit(&mi->mi_async_lock);
1250 1250  }
1251 1251  
           /*
            * Queue an asynchronous readahead request for vp on the mount's
            * NFS4_READ_AHEAD list and signal mi_async_reqs_cv.
            *
            * The request records the readahead callback together with blkoff,
            * seg and addr, and takes a hold on vp and on cr.  The rnode's
            * r_count is incremented for the duration of the request.
            *
            * Returns 0 if the request was queued.  Returns -1 without queueing
            * if addr lies at or beyond the end of seg, if the request structure
            * cannot be allocated without sleeping, if r_lkserlock cannot be
            * taken as reader without blocking, or if mi_max_threads is 0.
            */
1252 1252  int
1253 1253  nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1254 1254      struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1255 1255      u_offset_t, caddr_t, struct seg *, cred_t *))
1256 1256  {
1257 1257          rnode4_t *rp;
1258 1258          mntinfo4_t *mi;
1259 1259          struct nfs4_async_reqs *args;
1260 1260  
1261 1261          rp = VTOR4(vp);
1262 1262          ASSERT(rp->r_freef == NULL);
1263 1263  
1264 1264          mi = VTOMI4(vp);
1265 1265  
1266 1266          /*
1267 1267           * If addr falls in a different segment, don't bother doing readahead.
1268 1268           */
1269 1269          if (addr >= seg->s_base + seg->s_size)
1270 1270                  return (-1);
1271 1271  
1272 1272          /*
1273 1273           * If we can't allocate a request structure, punt on the readahead.
1274 1274           */
1275 1275          if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1276 1276                  return (-1);
1277 1277  
1278 1278          /*
1279 1279           * If a lock operation is pending, don't initiate any new
1280 1280           * readaheads.  Otherwise, bump r_count to indicate the new
1281 1281           * asynchronous I/O.
1282 1282           */
1283 1283          if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1284 1284                  kmem_free(args, sizeof (*args));
1285 1285                  return (-1);
1286 1286          }
1287 1287          mutex_enter(&rp->r_statelock);
1288 1288          rp->r_count++;
1289 1289          mutex_exit(&rp->r_statelock);
1290 1290          nfs_rw_exit(&rp->r_lkserlock);
1291 1291  
1292 1292          args->a_next = NULL;
1293 1293  #ifdef DEBUG
1294 1294          args->a_queuer = curthread;
1295 1295  #endif
1296 1296          VN_HOLD(vp);
1297 1297          args->a_vp = vp;
1298 1298          ASSERT(cr != NULL);
1299 1299          crhold(cr);
1300 1300          args->a_cred = cr;
1301 1301          args->a_io = NFS4_READ_AHEAD;
1302 1302          args->a_nfs4_readahead = readahead;
1303 1303          args->a_nfs4_blkoff = blkoff;
1304 1304          args->a_nfs4_seg = seg;
1305 1305          args->a_nfs4_addr = addr;
1306 1306  
1307 1307          mutex_enter(&mi->mi_async_lock);
1308 1308  
1309 1309          /*
1310 1310           * If asyncio has been disabled, don't bother readahead.
1311 1311           */
1312 1312          if (mi->mi_max_threads == 0) {
1313 1313                  mutex_exit(&mi->mi_async_lock);
1314 1314                  goto noasync;
1315 1315          }
1316 1316  
1317 1317          /*
1318 1318           * Link request structure into the async list and
1319 1319           * wakeup async thread to do the i/o.
1320 1320           */
1321 1321          if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1322 1322                  mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1323 1323                  mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1324 1324          } else {
1325 1325                  mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1326 1326                  mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1327 1327          }
1328 1328  
                   /* The wait-queue kstat is updated under mi_lock. */
1329 1329          if (mi->mi_io_kstats) {
1330 1330                  mutex_enter(&mi->mi_lock);
1331 1331                  kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1332 1332                  mutex_exit(&mi->mi_lock);
1333 1333          }
1334 1334  
1335 1335          mi->mi_async_req_count++;
1336 1336          ASSERT(mi->mi_async_req_count != 0);
1337 1337          cv_signal(&mi->mi_async_reqs_cv);
1338 1338          mutex_exit(&mi->mi_async_lock);
1339 1339          return (0);
1340 1340  
1341 1341  noasync:
                   /*
                    * Undo what was set up above: drop the r_count bump, release
                    * the vnode and credential holds, and free the request.
                    */
1342 1342          mutex_enter(&rp->r_statelock);
1343 1343          rp->r_count--;
1344 1344          cv_broadcast(&rp->r_cv);
1345 1345          mutex_exit(&rp->r_statelock);
1346 1346          VN_RELE(vp);
1347 1347          crfree(cr);
1348 1348          kmem_free(args, sizeof (*args));
1349 1349          return (-1);
1350 1350  }
1351 1351  
           /*
            * Thread entry point: run the common worker loop for
            * NFS4_ASYNC_QUEUE.
            */
1352 1352  static void
1353 1353  nfs4_async_start(struct vfs *vfsp)
1354 1354  {
1355 1355          nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1356 1356  }
1357 1357  
           /*
            * Thread entry point: run the common worker loop for
            * NFS4_ASYNC_PGOPS_QUEUE.
            */
1358 1358  static void
1359 1359  nfs4_async_pgops_start(struct vfs *vfsp)
1360 1360  {
1361 1361          nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1362 1362  }
1363 1363  
1364 1364  /*
1365 1365   * The async queues for each mounted file system are arranged as a
1366 1366   * set of queues, one for each async i/o type.  Requests are taken
1367 1367   * from the queues in a round-robin fashion.  A number of consecutive
1368 1368   * requests are taken from each queue before moving on to the next
1369 1369   * queue.  This functionality may allow the NFS Version 2 server to do
1370 1370   * write clustering, even if the client is mixing writes and reads
1371 1371   * because it will take multiple write requests from the queue
1372 1372   * before processing any of the other async i/o types.
1373 1373   *
1374 1374   * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1375 1375   * model defined by cpr to suspend the system. Specifically over the
1376 1376   * wire calls are cpr-unsafe. The thread should be reevaluated in
1377 1377   * case of future updates to the cpr model.
1378 1378   */
1379 1379  static void
1380 1380  nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1381 1381  {
1382 1382          struct nfs4_async_reqs *args;
1383 1383          mntinfo4_t *mi = VFTOMI4(vfsp);
1384 1384          clock_t time_left = 1;
1385 1385          callb_cpr_t cprinfo;
1386 1386          int i;
1387 1387          extern int nfs_async_timeout;
1388 1388          int async_types;
1389 1389          kcondvar_t *async_work_cv;
1390 1390  
                   /*
                    * Choose how many request types this worker scans and which
                    * condition variable it sleeps on, based on the queue it was
                    * started for.  Any value other than NFS4_ASYNC_QUEUE selects
                    * the NFS4_ASYNC_PGOPS_QUEUE settings.
                    */
1391 1391          if (async_queue == NFS4_ASYNC_QUEUE) {
1392 1392                  async_types = NFS4_ASYNC_TYPES;
1393 1393                  async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1394 1394          } else {
1395 1395                  async_types = NFS4_ASYNC_PGOPS_TYPES;
1396 1396                  async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1397 1397          }
1398 1398  
1399 1399          /*
1400 1400           * Dynamic initialization of nfs_async_timeout to allow nfs to be
1401 1401           * built in an implementation independent manner.
1402 1402           */
1403 1403          if (nfs_async_timeout == -1)
1404 1404                  nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1405 1405  
1406 1406          CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1407 1407  
                   /*
                    * mi_async_lock is held at the top of every loop iteration; it
                    * is dropped only around the actual request processing below.
                    */
1408 1408          mutex_enter(&mi->mi_async_lock);
1409 1409          for (;;) {
1410 1410                  /*
1411 1411                   * Find the next queue containing an entry.  We start
1412 1412                   * at the current queue pointer and then round robin
1413 1413                   * through all of them until we either find a non-empty
1414 1414                   * queue or have looked through all of them.
1415 1415                   */
1416 1416                  for (i = 0; i < async_types; i++) {
1417 1417                          args = *mi->mi_async_curr[async_queue];
1418 1418                          if (args != NULL)
1419 1419                                  break;
1420 1420                          mi->mi_async_curr[async_queue]++;
1421 1421                          if (mi->mi_async_curr[async_queue] ==
1422 1422                              &mi->mi_async_reqs[async_types]) {
1423 1423                                  mi->mi_async_curr[async_queue] =
1424 1424                                      &mi->mi_async_reqs[0];
1425 1425                          }
1426 1426                  }
1427 1427                  /*
1428 1428                   * If we didn't find an entry, then block until woken up
1429 1429                   * again and then look through the queues again.
1430 1430                   */
1431 1431                  if (args == NULL) {
1432 1432                          /*
1433 1433                           * Exiting is considered to be safe for CPR as well
1434 1434                           */
1435 1435                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
1436 1436  
1437 1437                          /*
1438 1438                           * Wakeup thread waiting to unmount the file
1439 1439                           * system only if all async threads are inactive.
1440 1440                           *
1441 1441                           * If we've timed-out and there's nothing to do,
1442 1442                           * then get rid of this thread.
1443 1443                           */
1444 1444                          if (mi->mi_max_threads == 0 || time_left <= 0) {
1445 1445                                  --mi->mi_threads[async_queue];
1446 1446  
1447 1447                                  if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1448 1448                                      mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1449 1449                                          cv_signal(&mi->mi_async_cv);
1450 1450                                  CALLB_CPR_EXIT(&cprinfo);
1451 1451                                  VFS_RELE(vfsp); /* release thread's hold */
1452 1452                                  MI4_RELE(mi);
1453 1453                                  zthread_exit();
1454 1454                                  /* NOTREACHED */
1455 1455                          }
                                   /*
                                    * Sleep until signalled or until nfs_async_timeout
                                    * passes; the result is checked on the next idle
                                    * pass through the test above.
                                    */
1456 1456                          time_left = cv_reltimedwait(async_work_cv,
1457 1457                              &mi->mi_async_lock, nfs_async_timeout,
1458 1458                              TR_CLOCK_TICK);
1459 1459  
1460 1460                          CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1461 1461  
1462 1462                          continue;
1463 1463                  } else {
                                   /*
                                    * Work was found; reset time_left so a later
                                    * idle pass waits again instead of exiting.
                                    */
1464 1464                          time_left = 1;
1465 1465                  }
1466 1466  
1467 1467                  /*
1468 1468                   * Remove the request from the async queue and then
1469 1469                   * update the current async request queue pointer.  If
1470 1470                   * the current queue is empty or we have removed enough
1471 1471                   * consecutive entries from it, then reset the counter
1472 1472                   * for this queue and then move the current pointer to
1473 1473                   * the next queue.
1474 1474                   */
1475 1475                  *mi->mi_async_curr[async_queue] = args->a_next;
1476 1476                  if (*mi->mi_async_curr[async_queue] == NULL ||
1477 1477                      --mi->mi_async_clusters[args->a_io] == 0) {
1478 1478                          mi->mi_async_clusters[args->a_io] =
1479 1479                              mi->mi_async_init_clusters;
1480 1480                          mi->mi_async_curr[async_queue]++;
1481 1481                          if (mi->mi_async_curr[async_queue] ==
1482 1482                              &mi->mi_async_reqs[async_types]) {
1483 1483                                  mi->mi_async_curr[async_queue] =
1484 1484                                      &mi->mi_async_reqs[0];
1485 1485                          }
1486 1486                  }
1487 1487  
                           /*
                            * Every request type except NFS4_INACTIVE leaves the kstat
                            * wait queue here (when I/O kstats exist).
                            */
1488 1488                  if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1489 1489                          mutex_enter(&mi->mi_lock);
1490 1490                          kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1491 1491                          mutex_exit(&mi->mi_lock);
1492 1492                  }
1493 1493  
1494 1494                  mutex_exit(&mi->mi_async_lock);
1495 1495  
1496 1496                  /*
1497 1497                   * Obtain arguments from the async request structure.
                            * The lock is not held across the call.  A read-ahead is
                            * only issued while mi_max_threads is non-zero; otherwise
                            * no handler runs and the request is just freed below.
1498 1498                   */
1499 1499                  if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1500 1500                          (*args->a_nfs4_readahead)(args->a_vp,
1501 1501                              args->a_nfs4_blkoff, args->a_nfs4_addr,
1502 1502                              args->a_nfs4_seg, args->a_cred);
1503 1503                  } else if (args->a_io == NFS4_PUTAPAGE) {
1504 1504                          (void) (*args->a_nfs4_putapage)(args->a_vp,
1505 1505                              args->a_nfs4_pp, args->a_nfs4_off,
1506 1506                              args->a_nfs4_len, args->a_nfs4_flags,
1507 1507                              args->a_cred);
1508 1508                  } else if (args->a_io == NFS4_PAGEIO) {
1509 1509                          (void) (*args->a_nfs4_pageio)(args->a_vp,
1510 1510                              args->a_nfs4_pp, args->a_nfs4_off,
1511 1511                              args->a_nfs4_len, args->a_nfs4_flags,
1512 1512                              args->a_cred);
1513 1513                  } else if (args->a_io == NFS4_READDIR) {
1514 1514                          (void) ((*args->a_nfs4_readdir)(args->a_vp,
1515 1515                              args->a_nfs4_rdc, args->a_cred));
1516 1516                  } else if (args->a_io == NFS4_COMMIT) {
1517 1517                          (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1518 1518                              args->a_nfs4_offset, args->a_nfs4_count,
1519 1519                              args->a_cred);
1520 1520                  } else if (args->a_io == NFS4_INACTIVE) {
1521 1521                          nfs4_inactive_otw(args->a_vp, args->a_cred);
1522 1522                  }
1523 1523  
1524 1524                  /*
1525 1525                   * Now, release the vnode and free the credentials
1526 1526                   * structure.
1527 1527                   */
1528 1528                  free_async_args4(args);
1529 1529                  /*
1530 1530                   * Reacquire the mutex because it will be needed above.
1531 1531                   */
1532 1532                  mutex_enter(&mi->mi_async_lock);
1533 1533          }
1534 1534  }
1535 1535  
1536 1536  /*
1537 1537   * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1538 1538   * part of VOP_INACTIVE.
            *
            * Loops taking requests off mi_async_reqs[NFS4_INACTIVE] and passing each
            * one to nfs4_inactive_otw().  When the queue is empty the thread sets
            * MI4_INACTIVE_IDLE, signals mi_async_cv and sleeps on mi_inact_req_cv;
            * if mi_manager_thread is NULL at that point it exits instead, clearing
            * mi_inactive_thread and calling MI4_RELE() on mi.
1539 1539   */
1540 1540  
1541 1541  void
1542 1542  nfs4_inactive_thread(mntinfo4_t *mi)
1543 1543  {
1544 1544          struct nfs4_async_reqs *args;
1545 1545          callb_cpr_t cprinfo;
1546 1546          vfs_t *vfsp = mi->mi_vfsp;      
1547 1547  
1548 1548          CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1549 1549              "nfs4_inactive_thread");
1550 1550  
1551 1551          for (;;) {
1552 1552                  mutex_enter(&mi->mi_async_lock);
1553 1553                  args = mi->mi_async_reqs[NFS4_INACTIVE];
1554 1554                  if (args == NULL) {
1555 1555                          mutex_enter(&mi->mi_lock);
1556 1556                          /*
1557 1557                           * We don't want to exit until the async manager is done
1558 1558                           * with its work; hence the check for mi_manager_thread
1559 1559                           * being NULL.
1560 1560                           *
1561 1561                           * The async manager thread will cv_broadcast() on
1562 1562                           * mi_inact_req_cv when it's done, at which point we'll
1563 1563                           * wake up and exit.
1564 1564                           */
1565 1565                          if (mi->mi_manager_thread == NULL)
1566 1566                                  goto die;
1567 1567                          mi->mi_flags |= MI4_INACTIVE_IDLE;
1568 1568                          mutex_exit(&mi->mi_lock);
                                   /*
                                    * Let anyone sleeping on mi_async_cv re-check the
                                    * idle flag (nfs4_async_stop() and
                                    * nfs4_async_stop_sig() wait on it for that).
                                    */
1569 1569                          cv_signal(&mi->mi_async_cv);
1570 1570                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
1571 1571                          cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1572 1572                          CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1573 1573                          mutex_exit(&mi->mi_async_lock);
1574 1574                  } else {
                                   /*
                                    * Clear the idle flag, unlink the head request while
                                    * still holding mi_async_lock, then do the
                                    * over-the-wire work and free the request with no
                                    * locks held.
                                    */
1575 1575                          mutex_enter(&mi->mi_lock);
1576 1576                          mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1577 1577                          mutex_exit(&mi->mi_lock);
1578 1578                          mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1579 1579                          mutex_exit(&mi->mi_async_lock);
1580 1580                          nfs4_inactive_otw(args->a_vp, args->a_cred);
1581 1581                          crfree(args->a_cred);
1582 1582                          kmem_free(args, sizeof (*args));
1583 1583                  }
1584 1584          }
1585 1585  die:
                   /*
                    * Reached only via the goto above, with both mi_async_lock and
                    * mi_lock held.
                    */
1586 1586          mutex_exit(&mi->mi_lock);
1587 1587          mi->mi_inactive_thread = NULL;
1588 1588          cv_signal(&mi->mi_async_cv);
1589 1589  
1590 1590          /*
1591 1591           * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1592 1592           * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1593 1593           */
1594 1594          CALLB_CPR_EXIT(&cprinfo);
1595 1595  
                   /* vfsp is used only for this debug message. */
1596 1596          NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1597 1597              "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1598 1598  
1599 1599          MI4_RELE(mi);
1600 1600          zthread_exit();
1601 1601          /* NOTREACHED */
1602 1602  }
1603 1603  
1604 1604  /*
1605 1605   * nfs4_async_stop:
1606 1606   * Wait for all outstanding putpage operations and the inactive thread to
1607 1607   * complete; nfs4_async_stop_sig() without interruptibility.
            *
            * Sets mi_max_threads to 0 and leaves it there.
1608 1608   */
1609 1609  void
1610 1610  nfs4_async_stop(struct vfs *vfsp)
1611 1611  {
1612 1612          mntinfo4_t *mi = VFTOMI4(vfsp);
1613 1613  
1614 1614          /*
1615 1615           * Wait for all outstanding async operations to complete and for
1616 1616           * worker threads to exit.
                    *
                    * Idle workers in nfs4_async_common_start() exit once they see
                    * mi_max_threads == 0, and the last one to go signals mi_async_cv.
1617 1617           */
1618 1618          mutex_enter(&mi->mi_async_lock);
1619 1619          mi->mi_max_threads = 0;
1620 1620          NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1621 1621          while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1622 1622              mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1623 1623                  cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1624 1624  
1625 1625          /*
1626 1626           * Wait for the inactive thread to finish doing what it's doing.  It
1627 1627           * won't exit until the last reference to the vfs_t goes away.
                    *
                    * mi_flags is examined under mi_lock, which is dropped around each
                    * sleep because the wait on mi_async_cv uses mi_async_lock.
1628 1628           */
1629 1629          if (mi->mi_inactive_thread != NULL) {
1630 1630                  mutex_enter(&mi->mi_lock);
1631 1631                  while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1632 1632                      (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1633 1633                          mutex_exit(&mi->mi_lock);
1634 1634                          cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1635 1635                          mutex_enter(&mi->mi_lock);
1636 1636                  }
1637 1637                  mutex_exit(&mi->mi_lock);
1638 1638          }
1639 1639          mutex_exit(&mi->mi_async_lock);
1640 1640  }
1641 1641  
1642 1642  /*
1643 1643   * nfs4_async_stop_sig:
1644 1644   * Wait for all outstanding putpage operations and the inactive thread to
1645 1645   * complete. If a signal is delivered we will abort and return non-zero;
1646 1646   * otherwise return 0. Since this routine is called from nfs4_unmount, we
1647 1647   * need to make it interruptible.
            *
            * On interruption mi_max_threads is restored to the value it had on entry;
            * otherwise it is left at 0.
1648 1648   */
1649 1649  int
1650 1650  nfs4_async_stop_sig(struct vfs *vfsp)
1651 1651  {
1652 1652          mntinfo4_t *mi = VFTOMI4(vfsp);
1653 1653          ushort_t omax;
1654 1654          bool_t intr = FALSE;
1655 1655  
1656 1656          /*
1657 1657           * Wait for all outstanding putpage operations to complete and for
1658 1658           * worker threads to exit.
                    *
                    * The previous thread limit is saved in omax so that it can be put
                    * back if the wait is interrupted.
1659 1659           */
1660 1660          mutex_enter(&mi->mi_async_lock);
1661 1661          omax = mi->mi_max_threads;
1662 1662          mi->mi_max_threads = 0;
1663 1663          NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1664 1664          while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1665 1665              mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1666 1666                  if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1667 1667                          intr = TRUE;
1668 1668                          goto interrupted;
1669 1669                  }
1670 1670          }
1671 1671  
1672 1672          /*
1673 1673           * Wait for the inactive thread to finish doing what it's doing.  It
1674 1674           * won't exit until the last reference to the vfs_t goes away.
                    *
                    * mi_lock is dropped before each sleep, so it is not held when we
                    * jump to "interrupted".
1675 1675           */
1676 1676          if (mi->mi_inactive_thread != NULL) {
1677 1677                  mutex_enter(&mi->mi_lock);
1678 1678                  while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1679 1679                      (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1680 1680                          mutex_exit(&mi->mi_lock);
1681 1681                          if (!cv_wait_sig(&mi->mi_async_cv,
1682 1682                              &mi->mi_async_lock)) {
1683 1683                                  intr = TRUE;
1684 1684                                  goto interrupted;
1685 1685                          }
1686 1686                          mutex_enter(&mi->mi_lock);
1687 1687                  }
1688 1688                  mutex_exit(&mi->mi_lock);
1689 1689          }
1690 1690  interrupted:
                   /*
                    * Also reached by falling through on normal completion, in which
                    * case intr is still FALSE and mi_max_threads stays at 0.
                    */
1691 1691          if (intr)
1692 1692                  mi->mi_max_threads = omax;
1693 1693          mutex_exit(&mi->mi_async_lock);
1694 1694  
1695 1695          return (intr);
1696 1696  }
1697 1697  
           /*
            * nfs4_async_putapage:
            * Queue an asynchronous putapage request on the NFS4_PUTAPAGE list of
            * the mount that vp belongs to.  The request takes its own holds on vp
            * and cr.
            *
            * If no request structure can be allocated, or mi_max_threads is 0, the
            * request is not queued and is handled in the caller's context instead
            * (see the "noasync" path).
            *
            * Returns 0 when the request was queued, or when the caller is
            * pageout/fsflush and the pages were handed back via pvn_write_done();
            * EPERM when the caller's zone differs from mi_zone; otherwise the
            * return value of the synchronous (*putapage)() call.
            */
1698 1698  int
1699 1699  nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1700 1700      int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1701 1701      u_offset_t, size_t, int, cred_t *))
1702 1702  {
1703 1703          rnode4_t *rp;
1704 1704          mntinfo4_t *mi;
1705 1705          struct nfs4_async_reqs *args;
1706 1706  
1707 1707          ASSERT(flags & B_ASYNC);
1708 1708          ASSERT(vp->v_vfsp != NULL);
1709 1709  
1710 1710          rp = VTOR4(vp);
1711 1711          ASSERT(rp->r_count > 0);
1712 1712  
1713 1713          mi = VTOMI4(vp);
1714 1714  
1715 1715          /*
1716 1716           * If we can't allocate a request structure, do the putpage
1717 1717           * operation synchronously in this thread's context.
1718 1718           */
1719 1719          if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1720 1720                  goto noasync;
1721 1721  
1722 1722          args->a_next = NULL;
1723 1723  #ifdef DEBUG
1724 1724          args->a_queuer = curthread;
1725 1725  #endif
1726 1726          VN_HOLD(vp);
1727 1727          args->a_vp = vp;
1728 1728          ASSERT(cr != NULL);
1729 1729          crhold(cr);
1730 1730          args->a_cred = cr;
1731 1731          args->a_io = NFS4_PUTAPAGE;
1732 1732          args->a_nfs4_putapage = putapage;
1733 1733          args->a_nfs4_pp = pp;
1734 1734          args->a_nfs4_off = off;
                   /* NOTE(review): len is narrowed from size_t to uint_t here. */
1735 1735          args->a_nfs4_len = (uint_t)len;
1736 1736          args->a_nfs4_flags = flags;
1737 1737  
1738 1738          mutex_enter(&mi->mi_async_lock);
1739 1739  
1740 1740          /*
1741 1741           * If asyncio has been disabled, then make a synchronous request.
1742 1742           * This check is done a second time in case async io was disabled
1743 1743           * while this thread was blocked waiting for memory pressure to
1744 1744           * reduce or for the queue to drain.
                    *
                    * The holds and the request structure set up above are undone
                    * before falling back.
1745 1745           */
1746 1746          if (mi->mi_max_threads == 0) {
1747 1747                  mutex_exit(&mi->mi_async_lock);
1748 1748  
1749 1749                  VN_RELE(vp);
1750 1750                  crfree(cr);
1751 1751                  kmem_free(args, sizeof (*args));
1752 1752                  goto noasync;
1753 1753          }
1754 1754  
1755 1755          /*
1756 1756           * Link request structure into the async list and
1757 1757           * wakeup async thread to do the i/o.
1758 1758           */
1759 1759          if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1760 1760                  mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1761 1761                  mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1762 1762          } else {
1763 1763                  mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1764 1764                  mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1765 1765          }
1766 1766  
                   /* Bump r_count and r_awcount for the queued request. */
1767 1767          mutex_enter(&rp->r_statelock);
1768 1768          rp->r_count++;
1769 1769          rp->r_awcount++;
1770 1770          mutex_exit(&rp->r_statelock);
1771 1771  
1772 1772          if (mi->mi_io_kstats) {
1773 1773                  mutex_enter(&mi->mi_lock);
1774 1774                  kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1775 1775                  mutex_exit(&mi->mi_lock);
1776 1776          }
1777 1777  
                   /*
                    * Count the request and signal mi_async_reqs_cv.  The waiter is
                    * not visible in this function; presumably the async manager.
                    */
1778 1778          mi->mi_async_req_count++;
1779 1779          ASSERT(mi->mi_async_req_count != 0);
1780 1780          cv_signal(&mi->mi_async_reqs_cv);
1781 1781          mutex_exit(&mi->mi_async_lock);
1782 1782          return (0);
1783 1783  
1784 1784  noasync:
1785 1785  
1786 1786          if (curproc == proc_pageout || curproc == proc_fsflush) {
1787 1787                  /*
1788 1788                   * If we get here in the context of the pageout/fsflush (having
1789 1789                   * run out of memory or found async i/o disabled, e.g. during an
1790 1790                   * unmount), we refuse to do a sync write, because this may
1791 1791                   * hang pageout/fsflush and the machine. In this case,
1792 1792                   * we just re-mark the page as dirty and punt on the page.
1793 1793                   *
1794 1794                   * Make sure B_FORCE isn't set.  We can re-mark the
1795 1795                   * pages as dirty and unlock the pages in one swoop by
1796 1796                   * passing in B_ERROR to pvn_write_done().  However,
1797 1797                   * we should make sure B_FORCE isn't set - we don't
1798 1798                   * want the page tossed before it gets written out.
1799 1799                   */
1800 1800                  if (flags & B_FORCE)
1801 1801                          flags &= ~(B_INVAL | B_FORCE);
1802 1802                  pvn_write_done(pp, flags | B_ERROR);
1803 1803                  return (0);
1804 1804          }
1805 1805  
1806 1806          if (nfs_zone() != mi->mi_zone) {
1807 1807                  /*
1808 1808                   * So this was a cross-zone sync putpage.
1809 1809                   *
1810 1810                   * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1811 1811                   * as dirty and unlock them.
1812 1812                   *
1813 1813                   * We don't want to clear B_FORCE here as the caller presumably
1814 1814                   * knows what they're doing if they set it.
1815 1815                   */
1816 1816                  pvn_write_done(pp, flags | B_ERROR);
1817 1817                  return (EPERM);
1818 1818          }
1819 1819          return ((*putapage)(vp, pp, off, len, flags, cr));
1820 1820  }
1821 1821  
           /*
            * nfs4_async_pageio:
            * Queue an asynchronous pageio request on the NFS4_PAGEIO list of the
            * mount that vp belongs to.  The request takes its own holds on vp and
            * cr.
            *
            * If no request structure can be allocated, or mi_max_threads is 0, the
            * request is not queued and the "noasync" path runs instead.
            *
            * Returns 0 when the request was queued, when a B_READ request was
            * dropped via pvn_read_done(), or when the caller is pageout/fsflush
            * and the pages were handed back via pvn_write_done(); EPERM when the
            * caller's zone differs from mi_zone; otherwise the return value of the
            * synchronous (*pageio)() call.
            */
1822 1822  int
1823 1823  nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1824 1824      int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1825 1825      size_t, int, cred_t *))
1826 1826  {
1827 1827          rnode4_t *rp;
1828 1828          mntinfo4_t *mi;
1829 1829          struct nfs4_async_reqs *args;
1830 1830  
1831 1831          ASSERT(flags & B_ASYNC);
1832 1832          ASSERT(vp->v_vfsp != NULL);
1833 1833  
1834 1834          rp = VTOR4(vp);
1835 1835          ASSERT(rp->r_count > 0);
1836 1836  
1837 1837          mi = VTOMI4(vp);
1838 1838  
1839 1839          /*
1840 1840           * If we can't allocate a request structure, do the pageio
1841 1841           * request synchronously in this thread's context.
1842 1842           */
1843 1843          if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1844 1844                  goto noasync;
1845 1845  
1846 1846          args->a_next = NULL;
1847 1847  #ifdef DEBUG
1848 1848          args->a_queuer = curthread;
1849 1849  #endif
1850 1850          VN_HOLD(vp);
1851 1851          args->a_vp = vp;
1852 1852          ASSERT(cr != NULL);
1853 1853          crhold(cr);
1854 1854          args->a_cred = cr;
1855 1855          args->a_io = NFS4_PAGEIO;
1856 1856          args->a_nfs4_pageio = pageio;
1857 1857          args->a_nfs4_pp = pp;
1858 1858          args->a_nfs4_off = io_off;
                   /* NOTE(review): io_len is narrowed from size_t to uint_t here. */
1859 1859          args->a_nfs4_len = (uint_t)io_len;
1860 1860          args->a_nfs4_flags = flags;
1861 1861  
1862 1862          mutex_enter(&mi->mi_async_lock);
1863 1863  
1864 1864          /*
1865 1865           * If asyncio has been disabled, then make a synchronous request.
1866 1866           * This check is done a second time in case async io was disabled
1867 1867           * while this thread was blocked waiting for memory pressure to
1868 1868           * reduce or for the queue to drain.
                    *
                    * The holds and the request structure set up above are undone
                    * before falling back.
1869 1869           */
1870 1870          if (mi->mi_max_threads == 0) {
1871 1871                  mutex_exit(&mi->mi_async_lock);
1872 1872  
1873 1873                  VN_RELE(vp);
1874 1874                  crfree(cr);
1875 1875                  kmem_free(args, sizeof (*args));
1876 1876                  goto noasync;
1877 1877          }
1878 1878  
1879 1879          /*
1880 1880           * Link request structure into the async list and
1881 1881           * wakeup async thread to do the i/o.
1882 1882           */
1883 1883          if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1884 1884                  mi->mi_async_reqs[NFS4_PAGEIO] = args;
1885 1885                  mi->mi_async_tail[NFS4_PAGEIO] = args;
1886 1886          } else {
1887 1887                  mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1888 1888                  mi->mi_async_tail[NFS4_PAGEIO] = args;
1889 1889          }
1890 1890  
                   /* Bump r_count and r_awcount for the queued request. */
1891 1891          mutex_enter(&rp->r_statelock);
1892 1892          rp->r_count++;
1893 1893          rp->r_awcount++;
1894 1894          mutex_exit(&rp->r_statelock);
1895 1895  
1896 1896          if (mi->mi_io_kstats) {
1897 1897                  mutex_enter(&mi->mi_lock);
1898 1898                  kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1899 1899                  mutex_exit(&mi->mi_lock);
1900 1900          }
1901 1901  
                   /*
                    * Count the request and signal mi_async_reqs_cv.  The waiter is
                    * not visible in this function; presumably the async manager.
                    */
1902 1902          mi->mi_async_req_count++;
1903 1903          ASSERT(mi->mi_async_req_count != 0);
1904 1904          cv_signal(&mi->mi_async_reqs_cv);
1905 1905          mutex_exit(&mi->mi_async_lock);
1906 1906          return (0);
1907 1907  
1908 1908  noasync:
1909 1909          /*
1910 1910           * If we can't do it ASYNC, for reads we do nothing (but cleanup
1911 1911           * the page list), for writes we do it synchronously, except for
1912 1912           * proc_pageout/proc_fsflush as described below.
1913 1913           */
1914 1914          if (flags & B_READ) {
1915 1915                  pvn_read_done(pp, flags | B_ERROR);
1916 1916                  return (0);
1917 1917          }
1918 1918  
1919 1919          if (curproc == proc_pageout || curproc == proc_fsflush) {
1920 1920                  /*
1921 1921                   * If we get here in the context of the pageout/fsflush,
1922 1922                   * we refuse to do a sync write, because this may hang
1923 1923                   * pageout/fsflush (and the machine). In this case, we just
1924 1924                   * re-mark the page as dirty and punt on the page.
1925 1925                   *
1926 1926                   * Make sure B_FORCE isn't set.  We can re-mark the
1927 1927                   * pages as dirty and unlock the pages in one swoop by
1928 1928                   * passing in B_ERROR to pvn_write_done().  However,
1929 1929                   * we should make sure B_FORCE isn't set - we don't
1930 1930                   * want the page tossed before it gets written out.
1931 1931                   */
1932 1932                  if (flags & B_FORCE)
1933 1933                          flags &= ~(B_INVAL | B_FORCE);
1934 1934                  pvn_write_done(pp, flags | B_ERROR);
1935 1935                  return (0);
1936 1936          }
1937 1937  
1938 1938          if (nfs_zone() != mi->mi_zone) {
1939 1939                  /*
1940 1940                   * So this was a cross-zone sync pageio.  We pass in B_ERROR
1941 1941                   * to pvn_write_done() to re-mark the pages as dirty and unlock
1942 1942                   * them.
1943 1943                   *
1944 1944                   * We don't want to clear B_FORCE here as the caller presumably
1945 1945                   * knows what they're doing if they set it.
1946 1946                   */
1947 1947                  pvn_write_done(pp, flags | B_ERROR);
1948 1948                  return (EPERM);
1949 1949          }
1950 1950          return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1951 1951  }
1952 1952  
           /*
            * nfs4_async_readdir:
            * Queue an asynchronous readdir request for the cache entry rdc on the
            * NFS4_READDIR list of the mount that vp belongs to.  The request takes
            * its own holds on vp and cr.
            *
            * If no request structure can be allocated, or mi_max_threads is 0, the
            * readdir is skipped: rdc is marked as still needing to be filled and
            * is released with rddir4_cache_rele().  Nothing is returned either way.
            */
1953 1953  void
1954 1954  nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1955 1955      int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1956 1956  {
1957 1957          rnode4_t *rp;
1958 1958          mntinfo4_t *mi;
1959 1959          struct nfs4_async_reqs *args;
1960 1960  
1961 1961          rp = VTOR4(vp);
1962 1962          ASSERT(rp->r_freef == NULL);
1963 1963  
1964 1964          mi = VTOMI4(vp);
1965 1965  
1966 1966          /*
1967 1967           * If we can't allocate a request structure, skip the readdir.
1968 1968           */
1969 1969          if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1970 1970                  goto noasync;
1971 1971  
1972 1972          args->a_next = NULL;
1973 1973  #ifdef DEBUG
1974 1974          args->a_queuer = curthread;
1975 1975  #endif
1976 1976          VN_HOLD(vp);
1977 1977          args->a_vp = vp;
1978 1978          ASSERT(cr != NULL);
1979 1979          crhold(cr);
1980 1980          args->a_cred = cr;
1981 1981          args->a_io = NFS4_READDIR;
1982 1982          args->a_nfs4_readdir = readdir;
1983 1983          args->a_nfs4_rdc = rdc;
1984 1984  
1985 1985          mutex_enter(&mi->mi_async_lock);
1986 1986  
1987 1987          /*
1988 1988           * If asyncio has been disabled, then skip this request
                    * after undoing the holds and allocation made above.
1989 1989           */
1990 1990          if (mi->mi_max_threads == 0) {
1991 1991                  mutex_exit(&mi->mi_async_lock);
1992 1992  
1993 1993                  VN_RELE(vp);
1994 1994                  crfree(cr);
1995 1995                  kmem_free(args, sizeof (*args));
1996 1996                  goto noasync;
1997 1997          }
1998 1998  
1999 1999          /*
2000 2000           * Link request structure into the async list and
2001 2001           * wakeup async thread to do the i/o.
2002 2002           */
2003 2003          if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2004 2004                  mi->mi_async_reqs[NFS4_READDIR] = args;
2005 2005                  mi->mi_async_tail[NFS4_READDIR] = args;
2006 2006          } else {
2007 2007                  mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2008 2008                  mi->mi_async_tail[NFS4_READDIR] = args;
2009 2009          }
2010 2010  
                   /* Only r_count is bumped here; r_awcount is left alone. */
2011 2011          mutex_enter(&rp->r_statelock);
2012 2012          rp->r_count++;
2013 2013          mutex_exit(&rp->r_statelock);
2014 2014  
2015 2015          if (mi->mi_io_kstats) {
2016 2016                  mutex_enter(&mi->mi_lock);
2017 2017                  kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2018 2018                  mutex_exit(&mi->mi_lock);
2019 2019          }
2020 2020  
                   /*
                    * Count the request and signal mi_async_reqs_cv.  The waiter is
                    * not visible in this function; presumably the async manager.
                    */
2021 2021          mi->mi_async_req_count++;
2022 2022          ASSERT(mi->mi_async_req_count != 0);
2023 2023          cv_signal(&mi->mi_async_reqs_cv);
2024 2024          mutex_exit(&mi->mi_async_lock);
2025 2025          return;
2026 2026  
2027 2027  noasync:
                   /* The cache entry is updated and released under r_statelock. */
2028 2028          mutex_enter(&rp->r_statelock);
2029 2029          rdc->entries = NULL;
2030 2030          /*
2031 2031           * Indicate that no one is trying to fill this entry and
2032 2032           * it still needs to be filled.
2033 2033           */
2034 2034          rdc->flags &= ~RDDIR;
2035 2035          rdc->flags |= RDDIRREQ;
2036 2036          rddir4_cache_rele(rp, rdc);
2037 2037          mutex_exit(&rp->r_statelock);
2038 2038  }
2039 2039  
2040 2040  void
2041 2041  nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2042 2042      cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2043 2043      cred_t *))
2044 2044  {
2045 2045          rnode4_t *rp;
2046 2046          mntinfo4_t *mi;
2047 2047          struct nfs4_async_reqs *args;
2048 2048          page_t *pp;
2049 2049  
2050 2050          rp = VTOR4(vp);
2051 2051          mi = VTOMI4(vp);
2052 2052  
2053 2053          /*
2054 2054           * If we can't allocate a request structure, do the commit
2055 2055           * operation synchronously in this thread's context.
2056 2056           */
2057 2057          if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2058 2058                  goto noasync;
2059 2059  
2060 2060          args->a_next = NULL;
2061 2061  #ifdef DEBUG
2062 2062          args->a_queuer = curthread;
2063 2063  #endif
2064 2064          VN_HOLD(vp);
2065 2065          args->a_vp = vp;
2066 2066          ASSERT(cr != NULL);
2067 2067          crhold(cr);
2068 2068          args->a_cred = cr;
2069 2069          args->a_io = NFS4_COMMIT;
2070 2070          args->a_nfs4_commit = commit;
2071 2071          args->a_nfs4_plist = plist;
2072 2072          args->a_nfs4_offset = offset;
2073 2073          args->a_nfs4_count = count;
2074 2074  
2075 2075          mutex_enter(&mi->mi_async_lock);
2076 2076  
2077 2077          /*
2078 2078           * If asyncio has been disabled, then make a synchronous request.
2079 2079           * This check is done a second time in case async io was disabled
2080 2080           * while this thread was blocked waiting for memory pressure to
2081 2081           * reduce or for the queue to drain.
2082 2082           */
2083 2083          if (mi->mi_max_threads == 0) {
2084 2084                  mutex_exit(&mi->mi_async_lock);
2085 2085  
2086 2086                  VN_RELE(vp);
2087 2087                  crfree(cr);
2088 2088                  kmem_free(args, sizeof (*args));
2089 2089                  goto noasync;
2090 2090          }
2091 2091  
2092 2092          /*
2093 2093           * Link request structure into the async list and
2094 2094           * wakeup async thread to do the i/o.
2095 2095           */
2096 2096          if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2097 2097                  mi->mi_async_reqs[NFS4_COMMIT] = args;
2098 2098                  mi->mi_async_tail[NFS4_COMMIT] = args;
2099 2099          } else {
2100 2100                  mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2101 2101                  mi->mi_async_tail[NFS4_COMMIT] = args;
2102 2102          }
2103 2103  
2104 2104          mutex_enter(&rp->r_statelock);
2105 2105          rp->r_count++;
2106 2106          mutex_exit(&rp->r_statelock);
2107 2107  
2108 2108          if (mi->mi_io_kstats) {
2109 2109                  mutex_enter(&mi->mi_lock);
2110 2110                  kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2111 2111                  mutex_exit(&mi->mi_lock);
2112 2112          }
2113 2113  
2114 2114          mi->mi_async_req_count++;
2115 2115          ASSERT(mi->mi_async_req_count != 0);
2116 2116          cv_signal(&mi->mi_async_reqs_cv);
2117 2117          mutex_exit(&mi->mi_async_lock);
2118 2118          return;
2119 2119  
2120 2120  noasync:
2121 2121          if (curproc == proc_pageout || curproc == proc_fsflush ||
2122 2122              nfs_zone() != mi->mi_zone) {
2123 2123                  while (plist != NULL) {
2124 2124                          pp = plist;
2125 2125                          page_sub(&plist, pp);
2126 2126                          pp->p_fsdata = C_COMMIT;
2127 2127                          page_unlock(pp);
2128 2128                  }
2129 2129                  return;
2130 2130          }
2131 2131          (*commit)(vp, plist, offset, count, cr);
2132 2132  }
2133 2133  
2134 2134  /*
2135 2135   * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2136 2136   * reference to the vnode is handed over to the thread; the caller should
2137 2137   * no longer refer to the vnode.
2138 2138   *
2139 2139   * Unlike most of the async routines, this handoff is needed for
2140 2140   * correctness reasons, not just performance.  So doing operations in the
2141 2141   * context of the current thread is not an option.
2142 2142   */
2143 2143  void
2144 2144  nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2145 2145  {
2146 2146          mntinfo4_t *mi;
2147 2147          struct nfs4_async_reqs *args;
2148 2148          boolean_t signal_inactive_thread = B_FALSE;
2149 2149  
2150 2150          mi = VTOMI4(vp);
2151 2151  
2152 2152          args = kmem_alloc(sizeof (*args), KM_SLEEP);
2153 2153          args->a_next = NULL;
2154 2154  #ifdef DEBUG
2155 2155          args->a_queuer = curthread;
2156 2156  #endif
2157 2157          args->a_vp = vp;
2158 2158          ASSERT(cr != NULL);
2159 2159          crhold(cr);
2160 2160          args->a_cred = cr;
2161 2161          args->a_io = NFS4_INACTIVE;
2162 2162  
2163 2163          /*
2164 2164           * Note that we don't check mi->mi_max_threads here, since we
2165 2165           * *need* to get rid of this vnode regardless of whether someone
2166 2166           * set nfs4_max_threads to zero in /etc/system.
2167 2167           *
2168 2168           * The manager thread knows about this and is willing to create
2169 2169           * at least one thread to accommodate us.
2170 2170           */
2171 2171          mutex_enter(&mi->mi_async_lock);
2172 2172          if (mi->mi_inactive_thread == NULL) {
2173 2173                  rnode4_t *rp;
2174 2174                  vnode_t *unldvp = NULL;
2175 2175                  char *unlname;
2176 2176                  cred_t *unlcred;
2177 2177  
2178 2178                  mutex_exit(&mi->mi_async_lock);
2179 2179                  /*
2180 2180                   * We just need to free up the memory associated with the
2181 2181                   * vnode, which can be safely done from within the current
2182 2182                   * context.
2183 2183                   */
2184 2184                  crfree(cr);     /* drop our reference */
2185 2185                  kmem_free(args, sizeof (*args));
2186 2186                  rp = VTOR4(vp);
2187 2187                  mutex_enter(&rp->r_statelock);
2188 2188                  if (rp->r_unldvp != NULL) {
2189 2189                          unldvp = rp->r_unldvp;
2190 2190                          rp->r_unldvp = NULL;
2191 2191                          unlname = rp->r_unlname;
2192 2192                          rp->r_unlname = NULL;
2193 2193                          unlcred = rp->r_unlcred;
2194 2194                          rp->r_unlcred = NULL;
2195 2195                  }
2196 2196                  mutex_exit(&rp->r_statelock);
2197 2197                  /*
2198 2198                   * No need to explicitly throw away any cached pages.  The
2199 2199                   * eventual r4inactive() will attempt a synchronous
2200 2200                   * VOP_PUTPAGE() which will immediately fail since the request
2201 2201                   * is coming from the wrong zone, and then will proceed to call
2202 2202                   * nfs4_invalidate_pages() which will clean things up for us.
2203 2203                   *
2204 2204                   * Throw away the delegation here so rp4_addfree()'s attempt to
2205 2205                   * return any existing delegations becomes a no-op.
2206 2206                   */
2207 2207                  if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2208 2208                          (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2209 2209                              FALSE);
2210 2210                          (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2211 2211                          nfs_rw_exit(&mi->mi_recovlock);
2212 2212                  }
2213 2213                  nfs4_clear_open_streams(rp);
2214 2214  
2215 2215                  rp4_addfree(rp, cr);
2216 2216                  if (unldvp != NULL) {
2217 2217                          kmem_free(unlname, MAXNAMELEN);
2218 2218                          VN_RELE(unldvp);
2219 2219                          crfree(unlcred);
2220 2220                  }
2221 2221                  return;
2222 2222          }
2223 2223  
2224 2224          if (mi->mi_manager_thread == NULL) {
2225 2225                  /*
2226 2226                   * We want to talk to the inactive thread.
2227 2227                   */
2228 2228                  signal_inactive_thread = B_TRUE;
2229 2229          }
2230 2230  
2231 2231          /*
2232 2232           * Enqueue the vnode and wake up either the special thread (empty
2233 2233           * list) or an async thread.
2234 2234           */
2235 2235          if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2236 2236                  mi->mi_async_reqs[NFS4_INACTIVE] = args;
2237 2237                  mi->mi_async_tail[NFS4_INACTIVE] = args;
2238 2238                  signal_inactive_thread = B_TRUE;
2239 2239          } else {
2240 2240                  mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2241 2241                  mi->mi_async_tail[NFS4_INACTIVE] = args;
2242 2242          }
2243 2243          if (signal_inactive_thread) {
2244 2244                  cv_signal(&mi->mi_inact_req_cv);
2245 2245          } else  {
2246 2246                  mi->mi_async_req_count++;
2247 2247                  ASSERT(mi->mi_async_req_count != 0);
2248 2248                  cv_signal(&mi->mi_async_reqs_cv);
2249 2249          }
2250 2250  
2251 2251          mutex_exit(&mi->mi_async_lock);
2252 2252  }
2253 2253  
/*
 * writerp4 - move up to tcount bytes described by uio into the cached
 * pages of the file backing rp, one page-sized piece at a time.
 *
 *   rp         rnode being written; r_rwlock must be held as writer.
 *   base       kernel address of the mapping to copy into (only used
 *              when vpm_enable is clear).
 *   tcount     number of bytes to move; at most MAXBSIZE and at most
 *              uio->uio_resid.
 *   uio        source of the data; uio_loffset supplies the file offset.
 *   pgcreated  nonzero if the caller has already created and mapped the
 *              first page at base.
 *
 * Returns 0 on success, otherwise the first error reported by
 * vpm_data_copy()/uiomove() or, failing that, by segmap_fault().
 * r_size is advanced to cover the data written and R4DIRTY is set.
 */
int
writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
{
        int pagecreate;
        int n;
        int saved_n;
        caddr_t saved_base;
        u_offset_t offset;
        int error;
        int sm_error;
        vnode_t *vp = RTOV(rp);

        ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
        ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
        if (!vpm_enable) {
                /* The copy must stay inside a single MAXBSIZE mapping. */
                ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
        }

        /*
         * Move bytes in at most PAGESIZE chunks. We must avoid
         * spanning pages in uiomove() because page faults may cause
         * the cache to be invalidated out from under us. The r_size is not
         * updated until after the uiomove. If we push the last page of a
         * file before r_size is correct, we will lose the data written past
         * the current (and invalid) r_size.
         */
        do {
                offset = uio->uio_loffset;
                pagecreate = 0;

                /*
                 * n is the number of bytes required to satisfy the request
                 *   or the number of bytes to fill out the page.
                 */
                n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);

                /*
                 * Check to see if we can skip reading in the page
                 * and just allocate the memory.  We can do this
                 * if we are going to rewrite the entire mapping
                 * or if we are going to write to or beyond the current
                 * end of file from the beginning of the mapping.
                 *
                 * The read of r_size is now protected by r_statelock.
                 */
                mutex_enter(&rp->r_statelock);
                /*
                 * When pgcreated is nonzero the caller has already done
                 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
                 * segkpm this means we already have at least one page
                 * created and mapped at base.
                 */
                pagecreate = pgcreated ||
                    ((offset & PAGEOFFSET) == 0 &&
                    (n == PAGESIZE || ((offset + n) >= rp->r_size)));

                mutex_exit(&rp->r_statelock);

                if (!vpm_enable && pagecreate) {
                        /*
                         * The last argument tells segmap_pagecreate() to
                         * always lock the page, as opposed to sometimes
                         * returning with the page locked. This way we avoid a
                         * fault on the ensuing uiomove(), but also
                         * more importantly (to fix bug 1094402) we can
                         * call segmap_fault() to unlock the page in all
                         * cases. An alternative would be to modify
                         * segmap_pagecreate() to tell us when it is
                         * locking a page, but that's a fairly major
                         * interface change.
                         */
                        if (pgcreated == 0)
                                (void) segmap_pagecreate(segkmap, base,
                                    (uint_t)n, 1);
                        /*
                         * Remember the range for the F_SOFTUNLOCK below;
                         * base and n are both modified after the copy.
                         */
                        saved_base = base;
                        saved_n = n;
                }

                /*
                 * The number of bytes of data in the last page cannot
                 * be accurately determined while the page is being
                 * uiomove'd to and the size of the file is being updated.
                 * Thus, inform threads which need to know accurately
                 * how much data is in the last page of the file.  They
                 * will not do the i/o immediately, but will arrange for
                 * the i/o to happen later when this modify operation
                 * will have finished.
                 */
                ASSERT(!(rp->r_flags & R4MODINPROGRESS));
                mutex_enter(&rp->r_statelock);
                rp->r_flags |= R4MODINPROGRESS;
                rp->r_modaddr = (offset & MAXBMASK);
                mutex_exit(&rp->r_statelock);

                if (vpm_enable) {
                        /*
                         * Copy data. If new pages are created, the part of
                         * the page that is not written will be initialized
                         * with zeros.
                         */
                        error = vpm_data_copy(vp, offset, n, uio,
                            !pagecreate, NULL, 0, S_WRITE);
                } else {
                        error = uiomove(base, n, UIO_WRITE, uio);
                }

                /*
                 * r_size is the maximum number of
                 * bytes known to be in the file.
                 * Make sure it is at least as high as the
                 * first unwritten byte pointed to by uio_loffset.
                 */
                mutex_enter(&rp->r_statelock);
                if (rp->r_size < uio->uio_loffset)
                        rp->r_size = uio->uio_loffset;
                rp->r_flags &= ~R4MODINPROGRESS;
                rp->r_flags |= R4DIRTY;
                mutex_exit(&rp->r_statelock);

                /* n = # of bytes written */
                n = (int)(uio->uio_loffset - offset);

                if (!vpm_enable) {
                        base += n;
                }

                tcount -= n;
                /*
                 * If we created pages w/o initializing them completely,
                 * we need to zero the part that wasn't set up.
                 * This happens in most EOF write cases and if
                 * we had some sort of error during the uiomove.
                 */
                if (!vpm_enable && pagecreate) {
                        if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
                                (void) kzero(base, PAGESIZE - n);

                        if (pgcreated) {
                                /*
                                 * Caller is responsible for this page,
                                 * it was not created in this loop.
                                 * Only the first iteration can see the
                                 * caller's page, so clear the flag.
                                 */
                                pgcreated = 0;
                        } else {
                                /*
                                 * For bug 1094402: segmap_pagecreate locks
                                 * page. Unlock it. This also unlocks the
                                 * pages allocated by page_create_va() in
                                 * segmap_pagecreate().
                                 */
                                sm_error = segmap_fault(kas.a_hat, segkmap,
                                    saved_base, saved_n,
                                    F_SOFTUNLOCK, S_WRITE);
                                /* A copy error takes precedence. */
                                if (error == 0)
                                        error = sm_error;
                        }
                }
        } while (tcount > 0 && error == 0);

        return (error);
}
2415 2415  
2416 2416  int
2417 2417  nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2418 2418  {
2419 2419          rnode4_t *rp;
2420 2420          page_t *pp;
2421 2421          u_offset_t eoff;
2422 2422          u_offset_t io_off;
2423 2423          size_t io_len;
2424 2424          int error;
2425 2425          int rdirty;
2426 2426          int err;
2427 2427  
2428 2428          rp = VTOR4(vp);
2429 2429          ASSERT(rp->r_count > 0);
2430 2430  
2431 2431          if (!nfs4_has_pages(vp))
2432 2432                  return (0);
2433 2433  
2434 2434          ASSERT(vp->v_type != VCHR);
2435 2435  
2436 2436          /*
2437 2437           * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2438 2438           * writes.  B_FORCE is set to force the VM system to actually
2439 2439           * invalidate the pages, even if the i/o failed.  The pages
2440 2440           * need to get invalidated because they can't be written out
2441 2441           * because there isn't any space left on either the server's
2442 2442           * file system or in the user's disk quota.  The B_FREE bit
2443 2443           * is cleared to avoid confusion as to whether this is a
2444 2444           * request to place the page on the freelist or to destroy
2445 2445           * it.
2446 2446           */
2447 2447          if ((rp->r_flags & R4OUTOFSPACE) ||
2448 2448              (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2449 2449                  flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2450 2450  
2451 2451          if (len == 0) {
2452 2452                  /*
2453 2453                   * If doing a full file synchronous operation, then clear
2454 2454                   * the R4DIRTY bit.  If a page gets dirtied while the flush
2455 2455                   * is happening, then R4DIRTY will get set again.  The
2456 2456                   * R4DIRTY bit must get cleared before the flush so that
2457 2457                   * we don't lose this information.
2458 2458                   *
2459 2459                   * If there are no full file async write operations
2460 2460                   * pending and RDIRTY bit is set, clear it.
2461 2461                   */
2462 2462                  if (off == (u_offset_t)0 &&
2463 2463                      !(flags & B_ASYNC) &&
2464 2464                      (rp->r_flags & R4DIRTY)) {
2465 2465                          mutex_enter(&rp->r_statelock);
2466 2466                          rdirty = (rp->r_flags & R4DIRTY);
2467 2467                          rp->r_flags &= ~R4DIRTY;
2468 2468                          mutex_exit(&rp->r_statelock);
2469 2469                  } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2470 2470                          mutex_enter(&rp->r_statelock);
2471 2471                          if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2472 2472                                  rdirty = (rp->r_flags & R4DIRTY);
2473 2473                                  rp->r_flags &= ~R4DIRTY;
2474 2474                          }
2475 2475                          mutex_exit(&rp->r_statelock);
2476 2476                  } else
2477 2477                          rdirty = 0;
2478 2478  
2479 2479                  /*
2480 2480                   * Search the entire vp list for pages >= off, and flush
2481 2481                   * the dirty pages.
2482 2482                   */
2483 2483                  error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2484 2484                      flags, cr);
2485 2485  
2486 2486                  /*
2487 2487                   * If an error occurred and the file was marked as dirty
2488 2488                   * before and we aren't forcibly invalidating pages, then
2489 2489                   * reset the R4DIRTY flag.
2490 2490                   */
2491 2491                  if (error && rdirty &&
2492 2492                      (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2493 2493                          mutex_enter(&rp->r_statelock);
2494 2494                          rp->r_flags |= R4DIRTY;
2495 2495                          mutex_exit(&rp->r_statelock);
2496 2496                  }
2497 2497          } else {
2498 2498                  /*
2499 2499                   * Do a range from [off...off + len) looking for pages
2500 2500                   * to deal with.
2501 2501                   */
2502 2502                  error = 0;
2503 2503                  io_len = 0;
2504 2504                  eoff = off + len;
2505 2505                  mutex_enter(&rp->r_statelock);
2506 2506                  for (io_off = off; io_off < eoff && io_off < rp->r_size;
2507 2507                      io_off += io_len) {
2508 2508                          mutex_exit(&rp->r_statelock);
2509 2509                          /*
2510 2510                           * If we are not invalidating, synchronously
2511 2511                           * freeing or writing pages use the routine
2512 2512                           * page_lookup_nowait() to prevent reclaiming
2513 2513                           * them from the free list.
2514 2514                           */
2515 2515                          if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2516 2516                                  pp = page_lookup(vp, io_off,
2517 2517                                      (flags & (B_INVAL | B_FREE)) ?
2518 2518                                      SE_EXCL : SE_SHARED);
2519 2519                          } else {
2520 2520                                  pp = page_lookup_nowait(vp, io_off,
2521 2521                                      (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2522 2522                          }
2523 2523  
2524 2524                          if (pp == NULL || !pvn_getdirty(pp, flags))
2525 2525                                  io_len = PAGESIZE;
2526 2526                          else {
2527 2527                                  err = (*rp->r_putapage)(vp, pp, &io_off,
2528 2528                                      &io_len, flags, cr);
2529 2529                                  if (!error)
2530 2530                                          error = err;
2531 2531                                  /*
2532 2532                                   * "io_off" and "io_len" are returned as
2533 2533                                   * the range of pages we actually wrote.
2534 2534                                   * This allows us to skip ahead more quickly
2535 2535                                   * since several pages may've been dealt
2536 2536                                   * with by this iteration of the loop.
2537 2537                                   */
2538 2538                          }
2539 2539                          mutex_enter(&rp->r_statelock);
2540 2540                  }
2541 2541                  mutex_exit(&rp->r_statelock);
2542 2542          }
2543 2543  
2544 2544          return (error);
2545 2545  }
2546 2546  
2547 2547  void
2548 2548  nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2549 2549  {
2550 2550          rnode4_t *rp;
2551 2551  
2552 2552          rp = VTOR4(vp);
2553 2553          if (IS_SHADOW(vp, rp))
2554 2554                  vp = RTOV4(rp);
2555 2555          mutex_enter(&rp->r_statelock);
2556 2556          while (rp->r_flags & R4TRUNCATE)
2557 2557                  cv_wait(&rp->r_cv, &rp->r_statelock);
2558 2558          rp->r_flags |= R4TRUNCATE;
2559 2559          if (off == (u_offset_t)0) {
2560 2560                  rp->r_flags &= ~R4DIRTY;
2561 2561                  if (!(rp->r_flags & R4STALE))
2562 2562                          rp->r_error = 0;
2563 2563          }
2564 2564          rp->r_truncaddr = off;
2565 2565          mutex_exit(&rp->r_statelock);
2566 2566          (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2567 2567              B_INVAL | B_TRUNC, cr);
2568 2568          mutex_enter(&rp->r_statelock);
2569 2569          rp->r_flags &= ~R4TRUNCATE;
2570 2570          cv_broadcast(&rp->r_cv);
2571 2571          mutex_exit(&rp->r_statelock);
2572 2572  }
2573 2573  
2574 2574  static int
2575 2575  nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2576 2576  {
2577 2577          mntinfo4_t *mi;
2578 2578          struct mntinfo_kstat *mik;
2579 2579          vfs_t *vfsp;
2580 2580  
2581 2581          /* this is a read-only kstat. Bail out on a write */
2582 2582          if (rw == KSTAT_WRITE)
2583 2583                  return (EACCES);
2584 2584  
2585 2585  
2586 2586          /*
2587 2587           * We don't want to wait here as kstat_chain_lock could be held by
2588 2588           * dounmount(). dounmount() takes vfs_reflock before the chain lock
2589 2589           * and thus could lead to a deadlock.
2590 2590           */
2591 2591          vfsp = (struct vfs *)ksp->ks_private;
2592 2592  
2593 2593          mi = VFTOMI4(vfsp);
2594 2594          mik = (struct mntinfo_kstat *)ksp->ks_data;
2595 2595  
2596 2596          (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2597 2597  
2598 2598          mik->mik_vers = (uint32_t)mi->mi_vers;
2599 2599          mik->mik_flags = mi->mi_flags;
2600 2600          /*
2601 2601           * The sv_secdata holds the flavor the client specifies.
2602 2602           * If the client uses default and a security negotiation
2603 2603           * occurs, sv_currsec will point to the current flavor
2604 2604           * selected from the server flavor list.
2605 2605           * sv_currsec is NULL if no security negotiation takes place.
2606 2606           */
2607 2607          mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2608 2608              mi->mi_curr_serv->sv_currsec->secmod :
2609 2609              mi->mi_curr_serv->sv_secdata->secmod;
2610 2610          mik->mik_curread = (uint32_t)mi->mi_curread;
2611 2611          mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2612 2612          mik->mik_retrans = mi->mi_retrans;
2613 2613          mik->mik_timeo = mi->mi_timeo;
2614 2614          mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2615 2615          mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2616 2616          mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2617 2617          mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2618 2618          mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2619 2619          mik->mik_failover = (uint32_t)mi->mi_failover;
2620 2620          mik->mik_remap = (uint32_t)mi->mi_remap;
2621 2621  
2622 2622          (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2623 2623  
2624 2624          return (0);
2625 2625  }
2626 2626  
2627 2627  void
2628 2628  nfs4_mnt_kstat_init(struct vfs *vfsp)
2629 2629  {
2630 2630          mntinfo4_t *mi = VFTOMI4(vfsp);
2631 2631  
2632 2632          /*
2633 2633           * PSARC 2001/697 Contract Private Interface
2634 2634           * All nfs kstats are under SunMC contract
2635 2635           * Please refer to the PSARC listed above and contact
2636 2636           * SunMC before making any changes!
2637 2637           *
2638 2638           * Changes must be reviewed by Solaris File Sharing
2639 2639           * Changes must be communicated to contract-2001-697@sun.com
2640 2640           *
2641 2641           */
2642 2642  
2643 2643          mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2644 2644              NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2645 2645          if (mi->mi_io_kstats) {
2646 2646                  if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2647 2647                          kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2648 2648                  mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2649 2649                  kstat_install(mi->mi_io_kstats);
2650 2650          }
2651 2651  
2652 2652          if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2653 2653              getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2654 2654              sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2655 2655                  if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2656 2656                          kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2657 2657                  mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2658 2658                  mi->mi_ro_kstats->ks_private = (void *)vfsp;
2659 2659                  kstat_install(mi->mi_ro_kstats);
2660 2660          }
2661 2661  
2662 2662          nfs4_mnt_recov_kstat_init(vfsp);
2663 2663  }
2664 2664  
2665 2665  void
2666 2666  nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2667 2667  {
2668 2668          mntinfo4_t *mi;
2669 2669          clock_t now = ddi_get_lbolt();
2670 2670  
2671 2671          mi = VTOMI4(vp);
2672 2672          /*
2673 2673           * In case of forced unmount, do not print any messages
2674 2674           * since it can flood the console with error messages.
2675 2675           */
2676 2676          if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2677 2677                  return;
2678 2678  
2679 2679          /*
2680 2680           * If the mount point is dead, not recoverable, do not
2681 2681           * print error messages that can flood the console.
2682 2682           */
2683 2683          if (mi->mi_flags & MI4_RECOV_FAIL)
2684 2684                  return;
2685 2685  
2686 2686          /*
2687 2687           * No use in flooding the console with ENOSPC
2688 2688           * messages from the same file system.
2689 2689           */
2690 2690          if ((error != ENOSPC && error != EDQUOT) ||
2691 2691              now - mi->mi_printftime > 0) {
2692 2692                  zoneid_t zoneid = mi->mi_zone->zone_id;
2693 2693  
2694 2694  #ifdef DEBUG
2695 2695                  nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2696 2696                      mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2697 2697  #else
2698 2698                  nfs_perror(error, "NFS write error on host %s: %m.\n",
2699 2699                      VTOR4(vp)->r_server->sv_hostname, NULL);
2700 2700  #endif
2701 2701                  if (error == ENOSPC || error == EDQUOT) {
2702 2702                          zcmn_err(zoneid, CE_CONT,
2703 2703                              "^File: userid=%d, groupid=%d\n",
2704 2704                              crgetuid(cr), crgetgid(cr));
2705 2705                          if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2706 2706                              crgetgid(curthread->t_cred) != crgetgid(cr)) {
2707 2707                                  zcmn_err(zoneid, CE_CONT,
2708 2708                                      "^User: userid=%d, groupid=%d\n",
2709 2709                                      crgetuid(curthread->t_cred),
2710 2710                                      crgetgid(curthread->t_cred));
2711 2711                          }
2712 2712                          mi->mi_printftime = now +
2713 2713                              nfs_write_error_interval * hz;
2714 2714                  }
2715 2715                  sfh4_printfhandle(VTOR4(vp)->r_fh);
2716 2716  #ifdef DEBUG
2717 2717                  if (error == EACCES) {
2718 2718                          zcmn_err(zoneid, CE_CONT,
2719 2719                              "nfs_bio: cred is%s kcred\n",
2720 2720                              cr == kcred ? "" : " not");
2721 2721                  }
2722 2722  #endif
2723 2723          }
2724 2724  }
2725 2725  
2726 2726  /*
2727 2727   * Return non-zero if the given file can be safely memory mapped.  Locks
2728 2728   * are safe if whole-file (length and offset are both zero).
            *
            * The caller must hold rp->r_lkserlock as writer (asserted below).  The
            * result is 1 when every entry returned by flk_active_locks_for_vp()
            * satisfies SAFE_LOCK(), and 0 if any entry does not.
2729 2729   */
2730 2730  
           /* True when a flock describes a whole-file lock (l_start and l_len both 0). */
2731 2731  #define SAFE_LOCK(flk)  ((flk).l_start == 0 && (flk).l_len == 0)
2732 2732  
2733 2733  static int
2734 2734  nfs4_safemap(const vnode_t *vp)
2735 2735  {
2736 2736          locklist_t      *llp, *next_llp;
2737 2737          int             safe = 1;
2738 2738          rnode4_t        *rp = VTOR4(vp);
2739 2739  
2740 2740          ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2741 2741  
2742 2742          NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2743 2743              "vp = %p", (void *)vp));
2744 2744  
2745 2745          /*
2746 2746           * Review all the locks for the vnode, both ones that have been
2747 2747           * acquired and ones that are pending.  We assume that
2748 2748           * flk_active_locks_for_vp() has merged any locks that can be
2749 2749           * merged (so that if a process has the entire file locked, it is
2750 2750           * represented as a single lock).
2751 2751           *
2752 2752           * Note that we can't bail out of the loop if we find a non-safe
2753 2753           * lock, because we have to free all the elements in the llp list.
2754 2754           * We might be able to speed up this code slightly by not looking
2755 2755           * at each lock's l_start and l_len fields once we've found a
2756 2756           * non-safe lock.
2757 2757           */
2758 2758  
                   /*
                    * NOTE(review): only flk_active_locks_for_vp() is consulted here;
                    * whether its result also covers pending locks, as the comment
                    * above says, cannot be confirmed from this function.
                    */
2759 2759          llp = flk_active_locks_for_vp(vp);
2760 2760          while (llp) {
2761 2761                  NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2762 2762                      "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2763 2763                      llp->ll_flock.l_start, llp->ll_flock.l_len));
2764 2764                  if (!SAFE_LOCK(llp->ll_flock)) {
2765 2765                          safe = 0;
2766 2766                          NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2767 2767                              "nfs4_safemap: unsafe active lock (%" PRId64
2768 2768                              ", %" PRId64 ")", llp->ll_flock.l_start,
2769 2769                              llp->ll_flock.l_len));
2770 2770                  }
                           /* Save the link first, then VN_RELE and free this entry. */
2771 2771                  next_llp = llp->ll_next;
2772 2772                  VN_RELE(llp->ll_vp);
2773 2773                  kmem_free(llp, sizeof (*llp));
2774 2774                  llp = next_llp;
2775 2775          }
2776 2776  
2777 2777          NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2778 2778              safe ? "safe" : "unsafe"));
2779 2779          return (safe);
2780 2780  }
2781 2781  
2782 2782  /*
2783 2783   * Return whether there is a lost LOCK or LOCKU queued up for the given
2784 2784   * file that would make an mmap request unsafe.  cf. nfs4_safemap().
            *
            * The mount's mi_lost_state list is scanned with mi_lock held.  Entries
            * whose lr_op is neither OP_LOCK nor OP_LOCKU, or whose lr_vp does not
            * match vp according to VOP_CMP(), are skipped.  TRUE is returned at the
            * first remaining entry whose lr_flk is not a whole-file lock (as
            * tested by SAFE_LOCK); otherwise FALSE is returned.
2785 2785   */
2786 2786  
2787 2787  bool_t
2788 2788  nfs4_map_lost_lock_conflict(vnode_t *vp)
2789 2789  {
2790 2790          bool_t conflict = FALSE;
2791 2791          nfs4_lost_rqst_t *lrp;
2792 2792          mntinfo4_t *mi = VTOMI4(vp);
2793 2793  
2794 2794          mutex_enter(&mi->mi_lock);
2795 2795          for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2796 2796              lrp = list_next(&mi->mi_lost_state, lrp)) {
2797 2797                  if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2798 2798                          continue;
2799 2799                  ASSERT(lrp->lr_vp != NULL);
2800 2800                  if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2801 2801                          continue;       /* different file */
2802 2802                  if (!SAFE_LOCK(*lrp->lr_flk)) {
                                   /* One partial-file lost lock is enough; stop. */
2803 2803                          conflict = TRUE;
2804 2804                          break;
2805 2805                  }
2806 2806          }
2807 2807  
2808 2808          mutex_exit(&mi->mi_lock);
2809 2809          return (conflict);
2810 2810  }
2811 2811  
2812 2812  /*
2813 2813   * nfs4_lockcompletion:
2814 2814   *
2815 2815   * If the vnode has a lock that makes it unsafe to cache the file, mark it
2816 2816   * as non-cacheable (set VNOCACHE bit).
            *
            * For cmd F_SETLK or F_SETLKW, VNOCACHE is set when nfs4_safemap()
            * reports an unsafe lock and cleared otherwise; other cmd values leave
            * v_flag untouched.  The attribute cache is purged for every cmd.
            *
            * The caller must hold rp->r_lkserlock as writer and IS_SHADOW(vp, rp)
            * must be false (both asserted).
2817 2817   */
2818 2818  
2819 2819  void
2820 2820  nfs4_lockcompletion(vnode_t *vp, int cmd)
2821 2821  {
2822 2822          rnode4_t *rp = VTOR4(vp);
2823 2823  
2824 2824          ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2825 2825          ASSERT(!IS_SHADOW(vp, rp));
2826 2826  
2827 2827          if (cmd == F_SETLK || cmd == F_SETLKW) {
2828 2828  
                           /* v_flag is only modified with v_lock held. */
2829 2829                  if (!nfs4_safemap(vp)) {
2830 2830                          mutex_enter(&vp->v_lock);
2831 2831                          vp->v_flag |= VNOCACHE;
2832 2832                          mutex_exit(&vp->v_lock);
2833 2833                  } else {
2834 2834                          mutex_enter(&vp->v_lock);
2835 2835                          vp->v_flag &= ~VNOCACHE;
2836 2836                          mutex_exit(&vp->v_lock);
2837 2837                  }
2838 2838          }
2839 2839          /*
2840 2840           * The cached attributes of the file are stale after acquiring
2841 2841           * the lock on the file. They were updated when the file was
2842 2842           * opened, but not updated when the lock was acquired. Therefore the
2843 2843           * cached attributes are invalidated after the lock is obtained.
2844 2844           */
2845 2845          PURGE_ATTRCACHE4(vp);
2846 2846  }
2847 2847  
           /*
            * Build the per-zone mi4_globals.  This function is passed to
            * zone_key_create() in nfs4_clnt_init() as the first callback.
            *
            * Allocates the structure with KM_SLEEP, initialises mig_lock, creates
            * the empty mig_list of mntinfo4_t (linked through mi_zone_node) and
            * clears mig_destructor_called.  zoneid is not used.  Returns the new
            * structure.
            */
2848 2848  /* ARGSUSED */
2849 2849  static void *
2850 2850  nfs4_mi_init(zoneid_t zoneid)
2851 2851  {
2852 2852          struct mi4_globals *mig;
2853 2853  
2854 2854          mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2855 2855          mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2856 2856          list_create(&mig->mig_list, sizeof (mntinfo4_t),
2857 2857              offsetof(mntinfo4_t, mi_zone_node));
2858 2858          mig->mig_destructor_called = B_FALSE;
2859 2859          return (mig);
2860 2860  }
2861 2861  
2862 2862  /*
2863 2863   * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2864 2864   * state and killing off threads.
            *
            * data is the zone's mi4_globals.  Mounts are taken from the head of
            * mig_list one at a time until the list is empty.  For each mount this
            * routine: purges the DNLC for its vfs; sets mi_max_threads to 0 and
            * wakes the async workers; sets MI4_ASYNC_MGR_STOP and MI4_DEAD; stops
            * the async manager if there is one; signals the inactive thread and
            * waits until mi_inactive_thread is NULL; waits until mi_in_recovery is
            * no longer positive; then removes the mount from the list and drops
            * the zone reference, the vfs hold and the mi hold.
            *
            * mig_lock is held from the list_head() call until the mount has been
            * removed from the list, including across the waits above.
            *
            * Afterwards every nfs4_server_t on nfs4_server_lst whose zoneid matches
            * gets an extra s_refcnt and is passed to nfs4_mark_srv_dead().
2865 2865   */
2866 2866  /* ARGSUSED */
2867 2867  static void
2868 2868  nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2869 2869  {
2870 2870          struct mi4_globals *mig = data;
2871 2871          mntinfo4_t *mi;
2872 2872          nfs4_server_t *np;
2873 2873  
2874 2874          NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2875 2875              "nfs4_mi_shutdown zone %d\n", zoneid));
2876 2876          ASSERT(mig != NULL);
2877 2877          for (;;) {
2878 2878                  mutex_enter(&mig->mig_lock);
2879 2879                  mi = list_head(&mig->mig_list);
2880 2880                  if (mi == NULL) {
2881 2881                          mutex_exit(&mig->mig_lock);
2882 2882                          break;
2883 2883                  }
2884 2884  
2885 2885                  NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2886 2886                      "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2887 2887                  /*
2888 2888                   * purge the DNLC for this filesystem
2889 2889                   */
2890 2890                  (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2891 2891                  /*
2892 2892                   * Tell existing async worker threads to exit.
2893 2893                   */
2894 2894                  mutex_enter(&mi->mi_async_lock);
2895 2895                  mi->mi_max_threads = 0;
2896 2896                  NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2897 2897                  /*
2898 2898                   * Set the appropriate flags, signal and wait for both the
2899 2899                   * async manager and the inactive thread to exit when they're
2900 2900                   * done with their current work.
2901 2901                   */
2902 2902                  mutex_enter(&mi->mi_lock);
2903 2903                  mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2904 2904                  mutex_exit(&mi->mi_lock);
2905 2905                  mutex_exit(&mi->mi_async_lock);
2906 2906                  if (mi->mi_manager_thread) {
2907 2907                          nfs4_async_manager_stop(mi->mi_vfsp);
2908 2908                  }
2909 2909                  if (mi->mi_inactive_thread) {
2910 2910                          mutex_enter(&mi->mi_async_lock);
2911 2911                          cv_signal(&mi->mi_inact_req_cv);
2912 2912                          /*
2913 2913                           * Wait for the inactive thread to exit.
2914 2914                           */
2915 2915                          while (mi->mi_inactive_thread != NULL) {
2916 2916                                  cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2917 2917                          }
2918 2918                          mutex_exit(&mi->mi_async_lock);
2919 2919                  }
2920 2920                  /*
2921 2921                   * Wait for the recovery thread to complete, that is, it will
2922 2922                   * signal when it is done using the "mi" structure and about
2923 2923                   * to exit
2924 2924                   */
2925 2925                  mutex_enter(&mi->mi_lock);
2926 2926                  while (mi->mi_in_recovery > 0)
2927 2927                          cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2928 2928                  mutex_exit(&mi->mi_lock);
2929 2929                  /*
2930 2930                   * We're done when every mi has been done or the list is empty.
2931 2931                   * This one is done, remove it from the list.
2932 2932                   */
2933 2933                  list_remove(&mig->mig_list, mi);
2934 2934                  mutex_exit(&mig->mig_lock);
2935 2935                  zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2936 2936  
2937 2937                  /*
2938 2938                   * Release hold on vfs and mi done to prevent race with zone
2939 2939                   * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2940 2940                   */
2941 2941                  VFS_RELE(mi->mi_vfsp);
2942 2942                  MI4_RELE(mi);
2943 2943          }
2944 2944          /*
2945 2945           * Tell each renew thread in the zone to exit
2946 2946           */
2947 2947          mutex_enter(&nfs4_server_lst_lock);
2948 2948          for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2949 2949                  mutex_enter(&np->s_lock);
2950 2950                  if (np->zoneid == zoneid) {
2951 2951                          /*
2952 2952                           * We add another hold onto the nfs4_server_t
2953 2953                           * because this will make sure that the nfs4_server_t
2954 2954                           * stays around until nfs4_callback_fini_zone destroys
2955 2955                           * the zone. This way, the renew thread can
2956 2956                           * unconditionally release its holds on the
2957 2957                           * nfs4_server_t.
2958 2958                           */
2959 2959                          np->s_refcnt++;
2960 2960                          nfs4_mark_srv_dead(np);
2961 2961                  }
2962 2962                  mutex_exit(&np->s_lock);
2963 2963          }
2964 2964          mutex_exit(&nfs4_server_lst_lock);
2965 2965  }
2966 2966  
           /*
            * Destroy and free a zone's mi4_globals: the mount list, then mig_lock,
            * then the structure itself.
            *
            * Both callers in this file (nfs4_mi_destroy() and
            * nfs4_mi_zonelist_remove()) call this with mig_lock held and do not
            * release it afterwards; the lock goes away with the structure.
            */
2967 2967  static void
2968 2968  nfs4_mi_free_globals(struct mi4_globals *mig)
2969 2969  {
2970 2970          list_destroy(&mig->mig_list);   /* makes sure the list is empty */
2971 2971          mutex_destroy(&mig->mig_lock);
2972 2972          kmem_free(mig, sizeof (*mig));
2973 2973  }
2974 2974  
           /*
            * Dispose of a zone's mi4_globals.  This function is passed to
            * zone_key_create() in nfs4_clnt_init() as the last callback; data is
            * the mi4_globals.
            *
            * If mounts are still on mig_list, only mig_destructor_called is set and
            * the free is left to nfs4_mi_zonelist_remove(), which checks that flag
            * once the list is empty.  Otherwise the globals are freed here.
            */
2975 2975  /* ARGSUSED */
2976 2976  static void
2977 2977  nfs4_mi_destroy(zoneid_t zoneid, void *data)
2978 2978  {
2979 2979          struct mi4_globals *mig = data;
2980 2980  
2981 2981          NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2982 2982              "nfs4_mi_destroy zone %d\n", zoneid));
2983 2983          ASSERT(mig != NULL);
2984 2984          mutex_enter(&mig->mig_lock);
2985 2985          if (list_head(&mig->mig_list) != NULL) {
2986 2986                  /* Still waiting for VFS_FREEVFS() */
2987 2987                  mig->mig_destructor_called = B_TRUE;
2988 2988                  mutex_exit(&mig->mig_lock);
2989 2989                  return;
2990 2990          }
                   /* mig_lock is still held; it is destroyed along with mig. */
2991 2991          nfs4_mi_free_globals(mig);
2992 2992  }
2993 2993  
2994 2994  /*
2995 2995   * Add an NFS mount to the per-zone list of NFS mounts.
            *
            * The mi4_globals for mi->mi_zone is looked up through mi4_list_key.
            * With mig_lock held, mi is inserted at the head of mig_list and a hold
            * is taken on both mi (MI4_HOLD) and its vfs (VFS_HOLD).
2996 2996   */
2997 2997  void
2998 2998  nfs4_mi_zonelist_add(mntinfo4_t *mi)
2999 2999  {
3000 3000          struct mi4_globals *mig;
3001 3001  
3002 3002          mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3003 3003          mutex_enter(&mig->mig_lock);
3004 3004          list_insert_head(&mig->mig_list, mi);
3005 3005          /*
3006 3006           * hold added to eliminate race with zone shutdown -this will be
3007 3007           * released in mi_shutdown
3008 3008           */
3009 3009          MI4_HOLD(mi);
3010 3010          VFS_HOLD(mi->mi_vfsp);
3011 3011          mutex_exit(&mig->mig_lock);
3012 3012  }
3013 3013  
3014 3014  /*
3015 3015   * Remove an NFS mount from the per-zone list of NFS mounts.
            *
            * Returns 1 if mi was removed from the list here (and the holds taken in
            * nfs4_mi_zonelist_add() were released), or 0 if MI4_DEAD was already set
            * on mi, in which case the list and holds are left alone.
            *
            * mig_lock is taken before mi_lock.  If the list is empty afterwards and
            * mig_destructor_called is set, the zone's mi4_globals is freed before
            * returning.
3016 3016   */
3017 3017  int
3018 3018  nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3019 3019  {
3020 3020          struct mi4_globals *mig;
3021 3021          int ret = 0;
3022 3022  
3023 3023          mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3024 3024          mutex_enter(&mig->mig_lock);
3025 3025          mutex_enter(&mi->mi_lock);
3026 3026          /* if this mi is marked dead, then the zone already released it */
3027 3027          if (!(mi->mi_flags & MI4_DEAD)) {
3028 3028                  list_remove(&mig->mig_list, mi);
3029 3029                  mutex_exit(&mi->mi_lock);
3030 3030  
3031 3031                  /* release the holds put on in zonelist_add(). */
3032 3032                  VFS_RELE(mi->mi_vfsp);
3033 3033                  MI4_RELE(mi);
3034 3034                  ret = 1;
3035 3035          } else {
3036 3036                  mutex_exit(&mi->mi_lock);
3037 3037          }
3038 3038  
3039 3039          /*
3040 3040           * We can be called asynchronously by VFS_FREEVFS() after the zone
3041 3041           * shutdown/destroy callbacks have executed; if so, clean up the zone's
3042 3042           * mi globals.
3043 3043           */
3044 3044          if (list_head(&mig->mig_list) == NULL &&
3045 3045              mig->mig_destructor_called == B_TRUE) {
                           /* mig_lock is destroyed with mig, so it is not released. */
3046 3046                  nfs4_mi_free_globals(mig);
3047 3047                  return (ret);
3048 3048          }
3049 3049          mutex_exit(&mig->mig_lock);
3050 3050          return (ret);
3051 3051  }
3052 3052  
           /*
            * Release everything owned by a mntinfo4_t and free the structure.
            * mi_rele() calls this when the reference count reaches zero.
            *
            * In order: assert that the recovery thread, async worker threads and
            * async manager are gone; delete the kstats; drain and destroy the
            * message list; release mi_fname, mi_rootfh and mi_srvparentfh when
            * set; free the server list; destroy the mutexes, rwlocks and condition
            * variables; destroy any remaining open owners and their lists; destroy
            * the remaining lists and the filehandle AVL tree; free mi.
            */
3053 3053  void
3054 3054  nfs_free_mi4(mntinfo4_t *mi)
3055 3055  {
3056 3056          nfs4_open_owner_t       *foop;
3057 3057          nfs4_oo_hash_bucket_t   *bucketp;
3058 3058          nfs4_debug_msg_t        *msgp;
3059 3059          int i;
3060 3060          servinfo4_t             *svp;
3061 3061  
3062 3062          /*
3063 3063           * Code introduced here should be carefully evaluated to make
3064 3064           * sure none of the freed resources are accessed either directly
3065 3065           * or indirectly after freeing them. For eg: Introducing calls to
3066 3066           * NFS4_DEBUG that use mntinfo4_t structure member after freeing
3067 3067           * the structure members or other routines calling back into NFS
3068 3068           * accessing freed mntinfo4_t structure member.
3069 3069           */
3070 3070          mutex_enter(&mi->mi_lock);
3071 3071          ASSERT(mi->mi_recovthread == NULL);
3072 3072          ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3073 3073          mutex_exit(&mi->mi_lock);
3074 3074          mutex_enter(&mi->mi_async_lock);
3075 3075          ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3076 3076              mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3077 3077          ASSERT(mi->mi_manager_thread == NULL);
3078 3078          mutex_exit(&mi->mi_async_lock);
3079 3079          if (mi->mi_io_kstats) {
3080 3080                  kstat_delete(mi->mi_io_kstats);
3081 3081                  mi->mi_io_kstats = NULL;
3082 3082          }
3083 3083          if (mi->mi_ro_kstats) {
3084 3084                  kstat_delete(mi->mi_ro_kstats);
3085 3085                  mi->mi_ro_kstats = NULL;
3086 3086          }
3087 3087          if (mi->mi_recov_ksp) {
3088 3088                  kstat_delete(mi->mi_recov_ksp);
3089 3089                  mi->mi_recov_ksp = NULL;
3090 3090          }
3091 3091          mutex_enter(&mi->mi_msg_list_lock);
                   /* Assignment is intentional: drain until list_head() is NULL. */
3092 3092          while (msgp = list_head(&mi->mi_msg_list)) {
3093 3093                  list_remove(&mi->mi_msg_list, msgp);
3094 3094                  nfs4_free_msg(msgp);
3095 3095          }
3096 3096          mutex_exit(&mi->mi_msg_list_lock);
3097 3097          list_destroy(&mi->mi_msg_list);
3098 3098          if (mi->mi_fname != NULL)
3099 3099                  fn_rele(&mi->mi_fname);
3100 3100          if (mi->mi_rootfh != NULL)
3101 3101                  sfh4_rele(&mi->mi_rootfh);
3102 3102          if (mi->mi_srvparentfh != NULL)
3103 3103                  sfh4_rele(&mi->mi_srvparentfh);
3104 3104          svp = mi->mi_servers;
3105 3105          sv4_free(svp);
3106 3106          mutex_destroy(&mi->mi_lock);
3107 3107          mutex_destroy(&mi->mi_async_lock);
3108 3108          mutex_destroy(&mi->mi_msg_list_lock);
3109 3109          nfs_rw_destroy(&mi->mi_recovlock);
3110 3110          nfs_rw_destroy(&mi->mi_rename_lock);
3111 3111          nfs_rw_destroy(&mi->mi_fh_lock);
3112 3112          cv_destroy(&mi->mi_failover_cv);
3113 3113          cv_destroy(&mi->mi_async_reqs_cv);
3114 3114          cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3115 3115          cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3116 3116          cv_destroy(&mi->mi_async_cv);
3117 3117          cv_destroy(&mi->mi_inact_req_cv);
3118 3118          /*
3119 3119           * Destroy the oo hash lists and mutexes for the cred hash table.
3120 3120           */
3121 3121          for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3122 3122                  bucketp = &(mi->mi_oo_list[i]);
3123 3123                  /* Destroy any remaining open owners on the list */
3124 3124                  foop = list_head(&bucketp->b_oo_hash_list);
3125 3125                  while (foop != NULL) {
3126 3126                          list_remove(&bucketp->b_oo_hash_list, foop);
3127 3127                          nfs4_destroy_open_owner(foop);
3128 3128                          foop = list_head(&bucketp->b_oo_hash_list);
3129 3129                  }
3130 3130                  list_destroy(&bucketp->b_oo_hash_list);
3131 3131                  mutex_destroy(&bucketp->b_lock);
3132 3132          }
3133 3133          /*
3134 3134           * Empty and destroy the freed open owner list.
3135 3135           */
3136 3136          foop = list_head(&mi->mi_foo_list);
3137 3137          while (foop != NULL) {
3138 3138                  list_remove(&mi->mi_foo_list, foop);
3139 3139                  nfs4_destroy_open_owner(foop);
3140 3140                  foop = list_head(&mi->mi_foo_list);

↓ open down ↓

3140 lines elided

↑ open up ↑

3141 3141          }
3142 3142          list_destroy(&mi->mi_foo_list);
3143 3143          list_destroy(&mi->mi_bseqid_list);
3144 3144          list_destroy(&mi->mi_lost_state);
3145 3145          avl_destroy(&mi->mi_filehandles);
                   /* mi must not be touched after this point. */
3146 3146          kmem_free(mi, sizeof (*mi));
3147 3147  }
3148 3148  void
3149 3149  mi_hold(mntinfo4_t *mi)
3150 3150  {
                   /*
                    * Take a reference on mi by atomically incrementing mi_count.
                    *
                    * The '-' and '+' lines below are the old and new sides of the
                    * same statement in this diff view: atomic_add_32(..., 1) is
                    * replaced by atomic_inc_32().
                    *
                    * The ASSERT checks that the count is non-zero after the
                    * increment.
                    */
3151      -        atomic_add_32(&mi->mi_count, 1);
     3151 +        atomic_inc_32(&mi->mi_count);
3152 3152          ASSERT(mi->mi_count != 0);
3153 3153  }
3154 3154  
           /*
            * Drop a reference on mi.  mi_count is asserted non-zero on entry and is
            * decremented atomically; when the decrement returns 0, mi is freed with
            * nfs_free_mi4().
            *
            * The '-' and '+' lines below are the old and new sides of the same
            * statement in this diff view: atomic_add_32_nv(..., -1) is replaced by
            * atomic_dec_32_nv().
            */
3155 3155  void
3156 3156  mi_rele(mntinfo4_t *mi)
3157 3157  {
3158 3158          ASSERT(mi->mi_count != 0);
3159      -        if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
     3159 +        if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3160 3160                  nfs_free_mi4(mi);
3161 3161          }
3162 3162  }
3163 3163  
           /* Global vnode whose v_count is set to 1 by nfs4_clnt_init(). */
3164 3164  vnode_t    nfs4_xattr_notsupp_vnode;
3165 3165  
           /*
            * Initialise the NFSv4 client: run the subsystem init routines below,
            * create the TSD key under DEBUG, register nfs4_client_cpr_callb() as a
            * CPR callback (its id is saved in cid), create the per-zone key
            * mi4_list_key with nfs4_mi_init/nfs4_mi_shutdown/nfs4_mi_destroy as its
            * callbacks, and set the notsupp xattr vnode's reference count to 1.
            *
            * The return values of the (void)-cast init calls are ignored.
            */
3166 3166  void
3167 3167  nfs4_clnt_init(void)
3168 3168  {
3169 3169          nfs4_vnops_init();

3170 3170          (void) nfs4_rnode_init();
3171 3171          (void) nfs4_shadow_init();
3172 3172          (void) nfs4_acache_init();
3173 3173          (void) nfs4_subr_init();
3174 3174          nfs4_acl_init();
3175 3175          nfs_idmap_init();
3176 3176          nfs4_callback_init();
3177 3177          nfs4_secinfo_init();
3178 3178  #ifdef  DEBUG
3179 3179          tsd_create(&nfs4_tsd_key, NULL);
3180 3180  #endif
3181 3181  
3182 3182          /*
3183 3183           * Add a CPR callback so that we can update client
3184 3184           * lease after a suspend and resume.
3185 3185           */
3186 3186          cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3187 3187  
3188 3188          zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3189 3189              nfs4_mi_destroy);
3190 3190  
3191 3191          /*
3192 3192           * Initialise the reference count of the notsupp xattr cache vnode to 1
3193 3193           * so that it never goes away (VOP_INACTIVE isn't called on it).
3194 3194           */
3195 3195          nfs4_xattr_notsupp_vnode.v_count = 1;
3196 3196  }
3197 3197  
           /*
            * Undo nfs4_clnt_init(): delete the per-zone key, run the subsystem fini
            * routines, destroy the TSD key under DEBUG, and delete the CPR callback
            * if cid is non-zero.
            *
            * NOTE(review): nfs4_clnt_init() calls nfs4_acl_init(), but no
            * counterpart is called here; confirm that none is needed.
            */
3198 3198  void
3199 3199  nfs4_clnt_fini(void)
3200 3200  {
3201 3201          (void) zone_key_delete(mi4_list_key);
3202 3202          nfs4_vnops_fini();
3203 3203          (void) nfs4_rnode_fini();
3204 3204          (void) nfs4_shadow_fini();
3205 3205          (void) nfs4_acache_fini();
3206 3206          (void) nfs4_subr_fini();
3207 3207          nfs_idmap_fini();
3208 3208          nfs4_callback_fini();
3209 3209          nfs4_secinfo_fini();
3210 3210  #ifdef  DEBUG
3211 3211          tsd_destroy(&nfs4_tsd_key);
3212 3212  #endif
3213 3213          if (cid)
3214 3214                  (void) callb_delete(cid);
3215 3215  }
3216 3216  
           /*
            * CPR callback registered by nfs4_clnt_init() through callb_add().
            *
            * For code CB_CODE_CPR_CHKPT nothing is done.  For any other code the
            * current time from gethrestime_sec() is stored in nfs4_client_resumed,
            * which nfs4_renew_lease_thread() compares against a server's
            * last_renewal_time.  arg is unused.  Always returns B_TRUE.
            */
3217 3217  /*ARGSUSED*/
3218 3218  static boolean_t
3219 3219  nfs4_client_cpr_callb(void *arg, int code)
3220 3220  {
3221 3221          /*
3222 3222           * We get called for Suspend and Resume events.
3223 3223           * For the suspend case we simply don't care!
3224 3224           */
3225 3225          if (code == CB_CODE_CPR_CHKPT) {
3226 3226                  return (B_TRUE);
3227 3227          }
3228 3228  
3229 3229          /*
3230 3230           * When we get to here we are in the process of
3231 3231           * resuming the system from a previous suspend.
3232 3232           */
3233 3233          nfs4_client_resumed = gethrestime_sec();
3234 3234          return (B_TRUE);
3235 3235  }
3236 3236  
           /*
            * Lease renewal thread for the nfs4_server_t sp.
            *
            * With sp->s_lock held, loops until s_thread_exit is NFS4_THREAD_EXIT:
            *
            *  - If state_ref_count is zero or lease_valid is not NFS4_LEASE_VALID,
            *    waits on cv_thread_exit for
            *    MAX(s_lease_time / 2 - 3 * propagation_delay.tv_sec, 1) seconds
            *    and then re-evaluates.
            *
            *  - Otherwise waits MAX(s_lease_time / 2 - tmp_time, 1) seconds, where
            *    tmp_time is the time since last_renewal_time plus
            *    3 * propagation_delay.tv_sec.  After the wait, nfs4renew() is
            *    called if last_renewal_time did not change while waiting, or if
            *    nfs4_client_resumed is non-zero and later than last_renewal_time.
            *    On success last_renewal_time is set to the time taken just before
            *    the call, unless it changed while the call was outstanding.
            *
            * Each wait is bracketed by CALLB_CPR_SAFE_BEGIN/CALLB_CPR_SAFE_END.
            *
            * On exit the thread waits for s_otw_call_count to reach zero, drops
            * s_lock, releases two references on sp, calls CALLB_CPR_EXIT and then
            * zthread_exit().
            */
3237 3237  void
3238 3238  nfs4_renew_lease_thread(nfs4_server_t *sp)
3239 3239  {
3240 3240          int     error = 0;
3241 3241          time_t  tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3242 3242          clock_t tick_delay = 0;
3243 3243          clock_t time_left = 0;
3244 3244          callb_cpr_t cpr_info;
3245 3245          kmutex_t cpr_lock;
3246 3246  
3247 3247          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3248 3248              "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3249 3249          mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3250 3250          CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3251 3251  
3252 3252          mutex_enter(&sp->s_lock);
3253 3253          /* sp->s_lease_time is set via a GETATTR */
3254 3254          sp->last_renewal_time = gethrestime_sec();
3255 3255          sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3256 3256          ASSERT(sp->s_refcnt >= 1);
3257 3257  
3258 3258          for (;;) {
3259 3259                  if (!sp->state_ref_count ||
3260 3260                      sp->lease_valid != NFS4_LEASE_VALID) {
3261 3261  
3262 3262                          kip_secs = MAX((sp->s_lease_time >> 1) -
3263 3263                              (3 * sp->propagation_delay.tv_sec), 1);
3264 3264  
3265 3265                          tick_delay = SEC_TO_TICK(kip_secs);
3266 3266  
3267 3267                          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3268 3268                              "nfs4_renew_lease_thread: no renew : thread "
3269 3269                              "wait %ld secs", kip_secs));
3270 3270  
3271 3271                          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3272 3272                              "nfs4_renew_lease_thread: no renew : "
3273 3273                              "state_ref_count %d, lease_valid %d",
3274 3274                              sp->state_ref_count, sp->lease_valid));
3275 3275  
3276 3276                          mutex_enter(&cpr_lock);
3277 3277                          CALLB_CPR_SAFE_BEGIN(&cpr_info);
3278 3278                          mutex_exit(&cpr_lock);
3279 3279                          time_left = cv_reltimedwait(&sp->cv_thread_exit,
3280 3280                              &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3281 3281                          mutex_enter(&cpr_lock);
3282 3282                          CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3283 3283                          mutex_exit(&cpr_lock);
3284 3284  
3285 3285                          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3286 3286                              "nfs4_renew_lease_thread: no renew: "
3287 3287                              "time left %ld", time_left));
3288 3288  
3289 3289                          if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3290 3290                                  goto die;
3291 3291                          continue;
3292 3292                  }
3293 3293  
                           /* Remember the value so a change during the wait shows. */
3294 3294                  tmp_last_renewal_time = sp->last_renewal_time;
3295 3295  
3296 3296                  tmp_time = gethrestime_sec() - sp->last_renewal_time +
3297 3297                      (3 * sp->propagation_delay.tv_sec);
3298 3298  
3299 3299                  NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3300 3300                      "nfs4_renew_lease_thread: tmp_time %ld, "
3301 3301                      "sp->last_renewal_time %ld", tmp_time,
3302 3302                      sp->last_renewal_time));
3303 3303  
3304 3304                  kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3305 3305  
3306 3306                  tick_delay = SEC_TO_TICK(kip_secs);
3307 3307  
3308 3308                  NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3309 3309                      "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3310 3310                      "secs", kip_secs));
3311 3311  
3312 3312                  mutex_enter(&cpr_lock);
3313 3313                  CALLB_CPR_SAFE_BEGIN(&cpr_info);
3314 3314                  mutex_exit(&cpr_lock);
3315 3315                  time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3316 3316                      tick_delay, TR_CLOCK_TICK);
3317 3317                  mutex_enter(&cpr_lock);
3318 3318                  CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3319 3319                  mutex_exit(&cpr_lock);
3320 3320  
3321 3321                  NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3322 3322                      "nfs4_renew_lease_thread: valid lease: time left %ld :"
3323 3323                      "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3324 3324                      "tmp_last_renewal_time %ld", time_left,
3325 3325                      sp->last_renewal_time, nfs4_client_resumed,
3326 3326                      tmp_last_renewal_time));
3327 3327  
3328 3328                  if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3329 3329                          goto die;
3330 3330  
3331 3331                  if (tmp_last_renewal_time == sp->last_renewal_time ||
3332 3332                      (nfs4_client_resumed != 0 &&
3333 3333                      nfs4_client_resumed > sp->last_renewal_time)) {
3334 3334                          /*
3335 3335                           * Issue RENEW op since we haven't renewed the lease
3336 3336                           * since we slept.
3337 3337                           */
3338 3338                          tmp_now_time = gethrestime_sec();
3339 3339                          error = nfs4renew(sp);
3340 3340                          /*
3341 3341                           * Need to re-acquire sp's lock, nfs4renew()
3342 3342                           * relinquishes it.
3343 3343                           */
3344 3344                          mutex_enter(&sp->s_lock);
3345 3345  
3346 3346                          /*
3347 3347                           * See if someone changed s_thread_exit while we gave
3348 3348                           * up s_lock.
3349 3349                           */
3350 3350                          if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3351 3351                                  goto die;
3352 3352  
3353 3353                          if (!error) {
3354 3354                                  /*
3355 3355                                   * check to see if we implicitly renewed while
3356 3356                                   * we waited for a reply for our RENEW call.
3357 3357                                   */
3358 3358                                  if (tmp_last_renewal_time ==
3359 3359                                      sp->last_renewal_time) {
3360 3360                                          /* no implicit renew came */
3361 3361                                          sp->last_renewal_time = tmp_now_time;
3362 3362                                  } else {
3363 3363                                          NFS4_DEBUG(nfs4_client_lease_debug,
3364 3364                                              (CE_NOTE, "renew_thread: did "
3365 3365                                              "implicit renewal before reply "
3366 3366                                              "from server for RENEW"));
3367 3367                                  }
3368 3368                          } else {
3369 3369                                  /* figure out error */
3370 3370                                  NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3371 3371                                      "renew_thread: nfs4renew returned error"
3372 3372                                      " %d", error));
3373 3373                          }
3374 3374  
3375 3375                  }
3376 3376          }
3377 3377  
           /* Reached with sp->s_lock held. */
3378 3378  die:
3379 3379          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3380 3380              "nfs4_renew_lease_thread: thread exiting"));
3381 3381  
3382 3382          while (sp->s_otw_call_count != 0) {
3383 3383                  NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3384 3384                      "nfs4_renew_lease_thread: waiting for outstanding "
3385 3385                      "otw calls to finish for sp 0x%p, current "
3386 3386                      "s_otw_call_count %d", (void *)sp,
3387 3387                      sp->s_otw_call_count));
3388 3388                  mutex_enter(&cpr_lock);
3389 3389                  CALLB_CPR_SAFE_BEGIN(&cpr_info);
3390 3390                  mutex_exit(&cpr_lock);
3391 3391                  cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3392 3392                  mutex_enter(&cpr_lock);
3393 3393                  CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3394 3394                  mutex_exit(&cpr_lock);
3395 3395          }
3396 3396          mutex_exit(&sp->s_lock);
3397 3397  
3398 3398          nfs4_server_rele(sp);           /* free the thread's reference */
3399 3399          nfs4_server_rele(sp);           /* free the list's reference */
3400 3400          sp = NULL;
3401 3401  
           /* NOTE(review): no goto in this function targets this label. */
3402 3402  done:
3403 3403          mutex_enter(&cpr_lock);
3404 3404          CALLB_CPR_EXIT(&cpr_info);      /* drops cpr_lock */
3405 3405          mutex_destroy(&cpr_lock);
3406 3406  
3407 3407          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3408 3408              "nfs4_renew_lease_thread: renew thread exit officially"));
3409 3409  
3410 3410          zthread_exit();
3411 3411          /* NOT REACHED */
3412 3412  }
3413 3413  
3414 3414  /*
3415 3415   * Send out a RENEW op to the server.
3416 3416   * Assumes sp is locked down.
3417 3417   */
3418 3418  static int
3419 3419  nfs4renew(nfs4_server_t *sp)
3420 3420  {
3421 3421          COMPOUND4args_clnt args;
3422 3422          COMPOUND4res_clnt res;
3423 3423          nfs_argop4 argop[1];
3424 3424          int doqueue = 1;
3425 3425          int rpc_error;
3426 3426          cred_t *cr;
3427 3427          mntinfo4_t *mi;
3428 3428          timespec_t prop_time, after_time;
3429 3429          int needrecov = FALSE;
3430 3430          nfs4_recov_state_t recov_state;
3431 3431          nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3432 3432  
3433 3433          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3434 3434  
3435 3435          recov_state.rs_flags = 0;
3436 3436          recov_state.rs_num_retry_despite_err = 0;
3437 3437  
3438 3438  recov_retry:
3439 3439          mi = sp->mntinfo4_list;
3440 3440          VFS_HOLD(mi->mi_vfsp);
3441 3441          mutex_exit(&sp->s_lock);
3442 3442          ASSERT(mi != NULL);
3443 3443  
3444 3444          e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3445 3445          if (e.error) {
3446 3446                  VFS_RELE(mi->mi_vfsp);
3447 3447                  return (e.error);
3448 3448          }
3449 3449  
3450 3450          /* Check to see if we're dealing with a marked-dead sp */
3451 3451          mutex_enter(&sp->s_lock);
3452 3452          if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3453 3453                  mutex_exit(&sp->s_lock);
3454 3454                  nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3455 3455                  VFS_RELE(mi->mi_vfsp);
3456 3456                  return (0);
3457 3457          }
3458 3458  
3459 3459          /* Make sure mi hasn't changed on us */
3460 3460          if (mi != sp->mntinfo4_list) {
3461 3461                  /* Must drop sp's lock to avoid a recursive mutex enter */
3462 3462                  mutex_exit(&sp->s_lock);
3463 3463                  nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3464 3464                  VFS_RELE(mi->mi_vfsp);
3465 3465                  mutex_enter(&sp->s_lock);
3466 3466                  goto recov_retry;
3467 3467          }
3468 3468          mutex_exit(&sp->s_lock);
3469 3469  
3470 3470          args.ctag = TAG_RENEW;
3471 3471  
3472 3472          args.array_len = 1;
3473 3473          args.array = argop;
3474 3474  
3475 3475          argop[0].argop = OP_RENEW;
3476 3476  
3477 3477          mutex_enter(&sp->s_lock);
3478 3478          argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3479 3479          cr = sp->s_cred;
3480 3480          crhold(cr);
3481 3481          mutex_exit(&sp->s_lock);
3482 3482  
3483 3483          ASSERT(cr != NULL);
3484 3484  
3485 3485          /* used to figure out RTT for sp */
3486 3486          gethrestime(&prop_time);
3487 3487  
3488 3488          NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3489 3489              "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3490 3490              (void*)sp));
3491 3491          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3492 3492              prop_time.tv_sec, prop_time.tv_nsec));
3493 3493  
3494 3494          DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3495 3495              mntinfo4_t *, mi);
3496 3496  
3497 3497          rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3498 3498          crfree(cr);
3499 3499  
3500 3500          DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3501 3501              mntinfo4_t *, mi);
3502 3502  
3503 3503          gethrestime(&after_time);
3504 3504  
3505 3505          mutex_enter(&sp->s_lock);
3506 3506          sp->propagation_delay.tv_sec =
3507 3507              MAX(1, after_time.tv_sec - prop_time.tv_sec);
3508 3508          mutex_exit(&sp->s_lock);
3509 3509  
3510 3510          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3511 3511              after_time.tv_sec, after_time.tv_nsec));
3512 3512  
3513 3513          if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3514 3514                  (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3515 3515                  nfs4_delegreturn_all(sp);
3516 3516                  nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3517 3517                  VFS_RELE(mi->mi_vfsp);
3518 3518                  /*
3519 3519                   * If the server returns CB_PATH_DOWN, it has renewed
3520 3520                   * the lease and informed us that the callback path is
3521 3521                   * down.  Since the lease is renewed, just return 0 and
3522 3522                   * let the renew thread proceed as normal.
3523 3523                   */
3524 3524                  return (0);
3525 3525          }
3526 3526  
3527 3527          needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3528 3528          if (!needrecov && e.error) {
3529 3529                  nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3530 3530                  VFS_RELE(mi->mi_vfsp);
3531 3531                  return (e.error);
3532 3532          }
3533 3533  
3534 3534          rpc_error = e.error;
3535 3535  
3536 3536          if (needrecov) {
3537 3537                  NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3538 3538                      "nfs4renew: initiating recovery\n"));
3539 3539  
3540 3540                  if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3541 3541                      OP_RENEW, NULL, NULL, NULL) == FALSE) {
3542 3542                          nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3543 3543                          VFS_RELE(mi->mi_vfsp);
3544 3544                          if (!e.error)
3545 3545                                  (void) xdr_free(xdr_COMPOUND4res_clnt,
3546 3546                                      (caddr_t)&res);
3547 3547                          mutex_enter(&sp->s_lock);
3548 3548                          goto recov_retry;
3549 3549                  }
3550 3550                  /* fall through for res.status case */
3551 3551          }
3552 3552  
3553 3553          if (res.status) {
3554 3554                  if (res.status == NFS4ERR_LEASE_MOVED) {
3555 3555                          /*EMPTY*/
3556 3556                          /*
3557 3557                           * XXX need to try every mntinfo4 in sp->mntinfo4_list
3558 3558                           * to renew the lease on that server
3559 3559                           */
3560 3560                  }
3561 3561                  e.error = geterrno4(res.status);
3562 3562          }
3563 3563  
3564 3564          if (!rpc_error)
3565 3565                  (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3566 3566  
3567 3567          nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3568 3568  
3569 3569          VFS_RELE(mi->mi_vfsp);
3570 3570  
3571 3571          return (e.error);
3572 3572  }
3573 3573  
3574 3574  void
3575 3575  nfs4_inc_state_ref_count(mntinfo4_t *mi)
3576 3576  {
3577 3577          nfs4_server_t   *sp;
3578 3578  
3579 3579          /* this locks down sp if it is found */
3580 3580          sp = find_nfs4_server(mi);
3581 3581  
3582 3582          if (sp != NULL) {
3583 3583                  nfs4_inc_state_ref_count_nolock(sp, mi);
3584 3584                  mutex_exit(&sp->s_lock);
3585 3585                  nfs4_server_rele(sp);
3586 3586          }
3587 3587  }
3588 3588  
3589 3589  /*
3590 3590   * Bump the number of OPEN files (ie: those with state) so we know if this
3591 3591   * nfs4_server has any state to maintain a lease for or not.
3592 3592   *
3593 3593   * Also, marks the nfs4_server's lease valid if that hasn't been done already.
3594 3594   */
3595 3595  void
3596 3596  nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3597 3597  {
3598 3598          ASSERT(mutex_owned(&sp->s_lock));
3599 3599  
3600 3600          sp->state_ref_count++;
3601 3601          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3602 3602              "nfs4_inc_state_ref_count: state_ref_count now %d",
3603 3603              sp->state_ref_count));
3604 3604  
3605 3605          if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3606 3606                  sp->lease_valid = NFS4_LEASE_VALID;
3607 3607  
3608 3608          /*
3609 3609           * If this call caused the lease to be marked valid and/or
3610 3610           * took the state_ref_count from 0 to 1, then start the time
3611 3611           * on lease renewal.
3612 3612           */
3613 3613          if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3614 3614                  sp->last_renewal_time = gethrestime_sec();
3615 3615  
3616 3616          /* update the number of open files for mi */
3617 3617          mi->mi_open_files++;
3618 3618  }
3619 3619  
3620 3620  void
3621 3621  nfs4_dec_state_ref_count(mntinfo4_t *mi)
3622 3622  {
3623 3623          nfs4_server_t   *sp;
3624 3624  
3625 3625          /* this locks down sp if it is found */
3626 3626          sp = find_nfs4_server_all(mi, 1);
3627 3627  
3628 3628          if (sp != NULL) {
3629 3629                  nfs4_dec_state_ref_count_nolock(sp, mi);
3630 3630                  mutex_exit(&sp->s_lock);
3631 3631                  nfs4_server_rele(sp);
3632 3632          }
3633 3633  }
3634 3634  
3635 3635  /*
3636 3636   * Decrement the number of OPEN files (ie: those with state) so we know if
3637 3637   * this nfs4_server has any state to maintain a lease for or not.
3638 3638   */
3639 3639  void
3640 3640  nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3641 3641  {
3642 3642          ASSERT(mutex_owned(&sp->s_lock));
3643 3643          ASSERT(sp->state_ref_count != 0);
3644 3644          sp->state_ref_count--;
3645 3645  
3646 3646          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3647 3647              "nfs4_dec_state_ref_count: state ref count now %d",
3648 3648              sp->state_ref_count));
3649 3649  
3650 3650          mi->mi_open_files--;
3651 3651          NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3652 3652              "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3653 3653              mi->mi_open_files, mi->mi_flags));
3654 3654  
3655 3655          /* We don't have to hold the mi_lock to test mi_flags */
3656 3656          if (mi->mi_open_files == 0 &&
3657 3657              (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3658 3658                  NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3659 3659                      "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3660 3660                      "we have closed the last open file", (void*)mi));
3661 3661                  nfs4_remove_mi_from_server(mi, sp);
3662 3662          }
3663 3663  }
3664 3664  
3665 3665  bool_t
3666 3666  inlease(nfs4_server_t *sp)
3667 3667  {
3668 3668          bool_t result;
3669 3669  
3670 3670          ASSERT(mutex_owned(&sp->s_lock));
3671 3671  
3672 3672          if (sp->lease_valid == NFS4_LEASE_VALID &&
3673 3673              gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3674 3674                  result = TRUE;
3675 3675          else
3676 3676                  result = FALSE;
3677 3677  
3678 3678          return (result);
3679 3679  }
3680 3680  
3681 3681  
3682 3682  /*
3683 3683   * Return non-zero if the given nfs4_server_t is going through recovery.
3684 3684   */
3685 3685  
3686 3686  int
3687 3687  nfs4_server_in_recovery(nfs4_server_t *sp)
3688 3688  {
3689 3689          return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3690 3690  }
3691 3691  
3692 3692  /*
3693 3693   * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3694 3694   * first is less than, equal to, or greater than the second.
3695 3695   */
3696 3696  
3697 3697  int
3698 3698  sfh4cmp(const void *p1, const void *p2)
3699 3699  {
3700 3700          const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3701 3701          const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3702 3702  
3703 3703          return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3704 3704  }
3705 3705  
3706 3706  /*
3707 3707   * Create a table for shared filehandle objects.
3708 3708   */
3709 3709  
3710 3710  void
3711 3711  sfh4_createtab(avl_tree_t *tab)
3712 3712  {
3713 3713          avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3714 3714              offsetof(nfs4_sharedfh_t, sfh_tree));
3715 3715  }
3716 3716  
3717 3717  /*
3718 3718   * Return a shared filehandle object for the given filehandle.  The caller
3719 3719   * is responsible for eventually calling sfh4_rele().
3720 3720   */
3721 3721  
3722 3722  nfs4_sharedfh_t *
3723 3723  sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3724 3724  {
3725 3725          nfs4_sharedfh_t *sfh, *nsfh;
3726 3726          avl_index_t where;
3727 3727          nfs4_sharedfh_t skey;
3728 3728  
3729 3729          if (!key) {
3730 3730                  skey.sfh_fh = *fh;
3731 3731                  key = &skey;
3732 3732          }
3733 3733  
3734 3734          nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3735 3735          nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3736 3736          /*
3737 3737           * We allocate the largest possible filehandle size because it's
3738 3738           * not that big, and it saves us from possibly having to resize the
3739 3739           * buffer later.
3740 3740           */
3741 3741          nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3742 3742          bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3743 3743          mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3744 3744          nsfh->sfh_refcnt = 1;
3745 3745          nsfh->sfh_flags = SFH4_IN_TREE;
3746 3746          nsfh->sfh_mi = mi;
3747 3747          NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3748 3748              (void *)nsfh));
3749 3749  
3750 3750          (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3751 3751          sfh = avl_find(&mi->mi_filehandles, key, &where);
3752 3752          if (sfh != NULL) {
3753 3753                  mutex_enter(&sfh->sfh_lock);
3754 3754                  sfh->sfh_refcnt++;
3755 3755                  mutex_exit(&sfh->sfh_lock);
3756 3756                  nfs_rw_exit(&mi->mi_fh_lock);
3757 3757                  /* free our speculative allocs */
3758 3758                  kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3759 3759                  kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3760 3760                  return (sfh);
3761 3761          }
3762 3762  
3763 3763          avl_insert(&mi->mi_filehandles, nsfh, where);
3764 3764          nfs_rw_exit(&mi->mi_fh_lock);
3765 3765  
3766 3766          return (nsfh);
3767 3767  }
3768 3768  
3769 3769  /*
3770 3770   * Return a shared filehandle object for the given filehandle.  The caller
3771 3771   * is responsible for eventually calling sfh4_rele().
3772 3772   */
3773 3773  
3774 3774  nfs4_sharedfh_t *
3775 3775  sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3776 3776  {
3777 3777          nfs4_sharedfh_t *sfh;
3778 3778          nfs4_sharedfh_t key;
3779 3779  
3780 3780          ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3781 3781  
3782 3782  #ifdef DEBUG
3783 3783          if (nfs4_sharedfh_debug) {
3784 3784                  nfs4_fhandle_t fhandle;
3785 3785  
3786 3786                  fhandle.fh_len = fh->nfs_fh4_len;
3787 3787                  bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3788 3788                  zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3789 3789                  nfs4_printfhandle(&fhandle);
3790 3790          }
3791 3791  #endif
3792 3792  
3793 3793          /*
3794 3794           * If there's already an object for the given filehandle, bump the
3795 3795           * reference count and return it.  Otherwise, create a new object
3796 3796           * and add it to the AVL tree.
3797 3797           */
3798 3798  
3799 3799          key.sfh_fh = *fh;
3800 3800  
3801 3801          (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3802 3802          sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3803 3803          if (sfh != NULL) {
3804 3804                  mutex_enter(&sfh->sfh_lock);
3805 3805                  sfh->sfh_refcnt++;
3806 3806                  NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3807 3807                      "sfh4_get: found existing %p, new refcnt=%d",
3808 3808                      (void *)sfh, sfh->sfh_refcnt));
3809 3809                  mutex_exit(&sfh->sfh_lock);
3810 3810                  nfs_rw_exit(&mi->mi_fh_lock);
3811 3811                  return (sfh);
3812 3812          }
3813 3813          nfs_rw_exit(&mi->mi_fh_lock);
3814 3814  
3815 3815          return (sfh4_put(fh, mi, &key));
3816 3816  }
3817 3817  
3818 3818  /*
3819 3819   * Get a reference to the given shared filehandle object.
3820 3820   */
3821 3821  
3822 3822  void
3823 3823  sfh4_hold(nfs4_sharedfh_t *sfh)
3824 3824  {
3825 3825          ASSERT(sfh->sfh_refcnt > 0);
3826 3826  
3827 3827          mutex_enter(&sfh->sfh_lock);
3828 3828          sfh->sfh_refcnt++;
3829 3829          NFS4_DEBUG(nfs4_sharedfh_debug,
3830 3830              (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3831 3831              (void *)sfh, sfh->sfh_refcnt));
3832 3832          mutex_exit(&sfh->sfh_lock);
3833 3833  }
3834 3834  
3835 3835  /*
3836 3836   * Release a reference to the given shared filehandle object and null out
3837 3837   * the given pointer.
3838 3838   */
3839 3839  
3840 3840  void
3841 3841  sfh4_rele(nfs4_sharedfh_t **sfhpp)
3842 3842  {
3843 3843          mntinfo4_t *mi;
3844 3844          nfs4_sharedfh_t *sfh = *sfhpp;
3845 3845  
3846 3846          ASSERT(sfh->sfh_refcnt > 0);
3847 3847  
3848 3848          mutex_enter(&sfh->sfh_lock);
3849 3849          if (sfh->sfh_refcnt > 1) {
3850 3850                  sfh->sfh_refcnt--;
3851 3851                  NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3852 3852                      "sfh4_rele %p, new refcnt=%d",
3853 3853                      (void *)sfh, sfh->sfh_refcnt));
3854 3854                  mutex_exit(&sfh->sfh_lock);
3855 3855                  goto finish;
3856 3856          }
3857 3857          mutex_exit(&sfh->sfh_lock);
3858 3858  
3859 3859          /*
3860 3860           * Possibly the last reference, so get the lock for the table in
3861 3861           * case it's time to remove the object from the table.
3862 3862           */
3863 3863          mi = sfh->sfh_mi;
3864 3864          (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3865 3865          mutex_enter(&sfh->sfh_lock);
3866 3866          sfh->sfh_refcnt--;
3867 3867          if (sfh->sfh_refcnt > 0) {
3868 3868                  NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3869 3869                      "sfh4_rele %p, new refcnt=%d",
3870 3870                      (void *)sfh, sfh->sfh_refcnt));
3871 3871                  mutex_exit(&sfh->sfh_lock);
3872 3872                  nfs_rw_exit(&mi->mi_fh_lock);
3873 3873                  goto finish;
3874 3874          }
3875 3875  
3876 3876          NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3877 3877              "sfh4_rele %p, last ref", (void *)sfh));
3878 3878          if (sfh->sfh_flags & SFH4_IN_TREE) {
3879 3879                  avl_remove(&mi->mi_filehandles, sfh);
3880 3880                  sfh->sfh_flags &= ~SFH4_IN_TREE;
3881 3881          }
3882 3882          mutex_exit(&sfh->sfh_lock);
3883 3883          nfs_rw_exit(&mi->mi_fh_lock);
3884 3884          mutex_destroy(&sfh->sfh_lock);
3885 3885          kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3886 3886          kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3887 3887  
3888 3888  finish:
3889 3889          *sfhpp = NULL;
3890 3890  }
3891 3891  
3892 3892  /*
3893 3893   * Update the filehandle for the given shared filehandle object.
3894 3894   */
3895 3895  
3896 3896  int nfs4_warn_dupfh = 0;        /* if set, always warn about dup fhs below */
3897 3897  
3898 3898  void
3899 3899  sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3900 3900  {
3901 3901          mntinfo4_t *mi = sfh->sfh_mi;
3902 3902          nfs4_sharedfh_t *dupsfh;
3903 3903          avl_index_t where;
3904 3904          nfs4_sharedfh_t key;
3905 3905  
3906 3906  #ifdef DEBUG
3907 3907          mutex_enter(&sfh->sfh_lock);
3908 3908          ASSERT(sfh->sfh_refcnt > 0);
3909 3909          mutex_exit(&sfh->sfh_lock);
3910 3910  #endif
3911 3911          ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3912 3912  
3913 3913          /*
3914 3914           * The basic plan is to remove the shared filehandle object from
3915 3915           * the table, update it to have the new filehandle, then reinsert
3916 3916           * it.
3917 3917           */
3918 3918  
3919 3919          (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3920 3920          mutex_enter(&sfh->sfh_lock);
3921 3921          if (sfh->sfh_flags & SFH4_IN_TREE) {
3922 3922                  avl_remove(&mi->mi_filehandles, sfh);
3923 3923                  sfh->sfh_flags &= ~SFH4_IN_TREE;
3924 3924          }
3925 3925          mutex_exit(&sfh->sfh_lock);
3926 3926          sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3927 3927          bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3928 3928              sfh->sfh_fh.nfs_fh4_len);
3929 3929  
3930 3930          /*
3931 3931           * XXX If there is already a shared filehandle object with the new
3932 3932           * filehandle, we're in trouble, because the rnode code assumes
3933 3933           * that there is only one shared filehandle object for a given
3934 3934           * filehandle.  So issue a warning (for read-write mounts only)
3935 3935           * and don't try to re-insert the given object into the table.
3936 3936           * Hopefully the given object will quickly go away and everyone
3937 3937           * will use the new object.
3938 3938           */
3939 3939          key.sfh_fh = *newfh;
3940 3940          dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3941 3941          if (dupsfh != NULL) {
3942 3942                  if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3943 3943                          zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3944 3944                              "duplicate filehandle detected");
3945 3945                          sfh4_printfhandle(dupsfh);
3946 3946                  }
3947 3947          } else {
3948 3948                  avl_insert(&mi->mi_filehandles, sfh, where);
3949 3949                  mutex_enter(&sfh->sfh_lock);
3950 3950                  sfh->sfh_flags |= SFH4_IN_TREE;
3951 3951                  mutex_exit(&sfh->sfh_lock);
3952 3952          }
3953 3953          nfs_rw_exit(&mi->mi_fh_lock);
3954 3954  }
3955 3955  
3956 3956  /*
3957 3957   * Copy out the current filehandle for the given shared filehandle object.
3958 3958   */
3959 3959  
3960 3960  void
3961 3961  sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3962 3962  {
3963 3963          mntinfo4_t *mi = sfh->sfh_mi;
3964 3964  
3965 3965          ASSERT(sfh->sfh_refcnt > 0);
3966 3966  
3967 3967          (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3968 3968          fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3969 3969          ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3970 3970          bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3971 3971          nfs_rw_exit(&mi->mi_fh_lock);
3972 3972  }
3973 3973  
3974 3974  /*
3975 3975   * Print out the filehandle for the given shared filehandle object.
3976 3976   */
3977 3977  
3978 3978  void
3979 3979  sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3980 3980  {
3981 3981          nfs4_fhandle_t fhandle;
3982 3982  
3983 3983          sfh4_copyval(sfh, &fhandle);
3984 3984          nfs4_printfhandle(&fhandle);
3985 3985  }
3986 3986  
3987 3987  /*
3988 3988   * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3989 3989   * if they're the same, +1 if the first is "greater" than the second.  The
3990 3990   * caller (or whoever's calling the AVL package) is responsible for
3991 3991   * handling locking issues.
3992 3992   */
3993 3993  
3994 3994  static int
3995 3995  fncmp(const void *p1, const void *p2)
3996 3996  {
3997 3997          const nfs4_fname_t *f1 = p1;
3998 3998          const nfs4_fname_t *f2 = p2;
3999 3999          int res;
4000 4000  
4001 4001          res = strcmp(f1->fn_name, f2->fn_name);
4002 4002          /*
4003 4003           * The AVL package wants +/-1, not arbitrary positive or negative
4004 4004           * integers.
4005 4005           */
4006 4006          if (res > 0)
4007 4007                  res = 1;
4008 4008          else if (res < 0)
4009 4009                  res = -1;
4010 4010          return (res);
4011 4011  }
4012 4012  
4013 4013  /*
4014 4014   * Get or create an fname with the given name, as a child of the given
4015 4015   * fname.  The caller is responsible for eventually releasing the reference
4016 4016   * (fn_rele()).  parent may be NULL.
4017 4017   */
4018 4018  
4019 4019  nfs4_fname_t *
4020 4020  fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4021 4021  {
4022 4022          nfs4_fname_t key;
4023 4023          nfs4_fname_t *fnp;
4024 4024          avl_index_t where;
4025 4025  
4026 4026          key.fn_name = name;
4027 4027  
4028 4028          /*
4029 4029           * If there's already an fname registered with the given name, bump
4030 4030           * its reference count and return it.  Otherwise, create a new one
4031 4031           * and add it to the parent's AVL tree.
4032 4032           *
4033 4033           * fname entries we are looking for should match both name
4034 4034           * and sfh stored in the fname.
4035 4035           */
4036 4036  again:
4037 4037          if (parent != NULL) {
4038 4038                  mutex_enter(&parent->fn_lock);
4039 4039                  fnp = avl_find(&parent->fn_children, &key, &where);
4040 4040                  if (fnp != NULL) {
4041 4041                          /*
4042 4042                           * This hold on fnp is released below later,
4043 4043                           * in case this is not the fnp we want.
4044 4044                           */
4045 4045                          fn_hold(fnp);
4046 4046  
4047 4047                          if (fnp->fn_sfh == sfh) {
4048 4048                                  /*
4049 4049                                   * We have found our entry.
4050 4050                                   * Keep the hold taken above and return it.
4051 4051                                   */
4052 4052                                  mutex_exit(&parent->fn_lock);
4053 4053                                  return (fnp);
4054 4054                          }
4055 4055  
4056 4056                          /*
4057 4057                           * We have found an entry that has a mismatching
4058 4058                           * fn_sfh. This could be a stale entry due to
4059 4059                           * server side rename. We will remove this entry
4060 4060                           * and make sure no such entries exist.
4061 4061                           */
4062 4062                          mutex_exit(&parent->fn_lock);
4063 4063                          mutex_enter(&fnp->fn_lock);
4064 4064                          if (fnp->fn_parent == parent) {
4065 4065                                  /*
4066 4066                                   * Remove ourselves from parent's
4067 4067                                   * fn_children tree.
4068 4068                                   */
4069 4069                                  mutex_enter(&parent->fn_lock);
4070 4070                                  avl_remove(&parent->fn_children, fnp);
4071 4071                                  mutex_exit(&parent->fn_lock);
4072 4072                                  fn_rele(&fnp->fn_parent);
4073 4073                          }
4074 4074                          mutex_exit(&fnp->fn_lock);
4075 4075                          fn_rele(&fnp);
4076 4076                          goto again;
4077 4077                  }
4078 4078          }
4079 4079  
4080 4080          fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4081 4081          mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4082 4082          fnp->fn_parent = parent;
4083 4083          if (parent != NULL)
4084 4084                  fn_hold(parent);
4085 4085          fnp->fn_len = strlen(name);
4086 4086          ASSERT(fnp->fn_len < MAXNAMELEN);
4087 4087          fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4088 4088          (void) strcpy(fnp->fn_name, name);
4089 4089          fnp->fn_refcnt = 1;
4090 4090  
4091 4091          /*
4092 4092           * This hold on sfh is later released
4093 4093           * when we do the final fn_rele() on this fname.
4094 4094           */
4095 4095          sfh4_hold(sfh);
4096 4096          fnp->fn_sfh = sfh;
4097 4097  
4098 4098          avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4099 4099              offsetof(nfs4_fname_t, fn_tree));
4100 4100          NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4101 4101              "fn_get %p:%s, a new nfs4_fname_t!",
4102 4102              (void *)fnp, fnp->fn_name));
4103 4103          if (parent != NULL) {

↓ open down ↓

934 lines elided

↑ open up ↑

4104 4104                  avl_insert(&parent->fn_children, fnp, where);
4105 4105                  mutex_exit(&parent->fn_lock);
4106 4106          }
4107 4107  
4108 4108          return (fnp);
4109 4109  }
4110 4110  
4111 4111  void
4112 4112  fn_hold(nfs4_fname_t *fnp)
4113 4113  {
4114      -        atomic_add_32(&fnp->fn_refcnt, 1);
     4114 +        atomic_inc_32(&fnp->fn_refcnt);
4115 4115          NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4116 4116              "fn_hold %p:%s, new refcnt=%d",
4117 4117              (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4118 4118  }
4119 4119  
4120 4120  /*
4121 4121   * Decrement the reference count of the given fname, and destroy it if its
4122 4122   * reference count goes to zero.  Nulls out the given pointer.
4123 4123   */
4124 4124

4125 4125  void
4126 4126  fn_rele(nfs4_fname_t **fnpp)
4127 4127  {
4128 4128          nfs4_fname_t *parent;
4129 4129          uint32_t newref;

↓ open down ↓

5 lines elided

↑ open up ↑

4130 4130          nfs4_fname_t *fnp;
4131 4131  
4132 4132  recur:
4133 4133          fnp = *fnpp;
4134 4134          *fnpp = NULL;
4135 4135  
4136 4136          mutex_enter(&fnp->fn_lock);
4137 4137          parent = fnp->fn_parent;
4138 4138          if (parent != NULL)
4139 4139                  mutex_enter(&parent->fn_lock);  /* prevent new references */
4140      -        newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
     4140 +        newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4141 4141          if (newref > 0) {
4142 4142                  NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4143 4143                      "fn_rele %p:%s, new refcnt=%d",
4144 4144                      (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4145 4145                  if (parent != NULL)
4146 4146                          mutex_exit(&parent->fn_lock);
4147 4147                  mutex_exit(&fnp->fn_lock);
4148 4148                  return;
4149 4149          }
4150 4150

4151 4151          NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4152 4152              "fn_rele %p:%s, last reference, deleting...",
4153 4153              (void *)fnp, fnp->fn_name));
4154 4154          if (parent != NULL) {
4155 4155                  avl_remove(&parent->fn_children, fnp);
4156 4156                  mutex_exit(&parent->fn_lock);
4157 4157          }
4158 4158          kmem_free(fnp->fn_name, fnp->fn_len + 1);
4159 4159          sfh4_rele(&fnp->fn_sfh);
4160 4160          mutex_destroy(&fnp->fn_lock);
4161 4161          avl_destroy(&fnp->fn_children);
4162 4162          kmem_free(fnp, sizeof (nfs4_fname_t));
4163 4163          /*
4164 4164           * Recursively fn_rele the parent.
4165 4165           * Use goto instead of a recursive call to avoid stack overflow.
4166 4166           */
4167 4167          if (parent != NULL) {
4168 4168                  fnpp = &parent;
4169 4169                  goto recur;
4170 4170          }
4171 4171  }
4172 4172  
4173 4173  /*
4174 4174   * Returns the single component name of the given fname, in a MAXNAMELEN
4175 4175   * string buffer, which the caller is responsible for freeing.  Note that
4176 4176   * the name may become invalid as a result of fn_move().
4177 4177   */
4178 4178  
4179 4179  char *
4180 4180  fn_name(nfs4_fname_t *fnp)
4181 4181  {
4182 4182          char *name;
4183 4183  
4184 4184          ASSERT(fnp->fn_len < MAXNAMELEN);
4185 4185          name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4186 4186          mutex_enter(&fnp->fn_lock);
4187 4187          (void) strcpy(name, fnp->fn_name);
4188 4188          mutex_exit(&fnp->fn_lock);
4189 4189  
4190 4190          return (name);
4191 4191  }
4192 4192  
4193 4193  
4194 4194  /*
4195 4195   * fn_path_realloc
4196 4196   *
4197 4197   * This function, used only by fn_path, constructs
4198 4198   * a new string which looks like "prepend" + "/" + "current"
4199 4199   * by allocating a new string and freeing the old one.
4200 4200   */
4201 4201  static void
4202 4202  fn_path_realloc(char **curses, char *prepend)
4203 4203  {
4204 4204          int len, curlen = 0;
4205 4205          char *news;
4206 4206  
4207 4207          if (*curses == NULL) {
4208 4208                  /*
4209 4209                   * Prime the pump, allocate just the
4210 4210                   * space for prepend and return that.
4211 4211                   */
4212 4212                  len = strlen(prepend) + 1;
4213 4213                  news = kmem_alloc(len, KM_SLEEP);
4214 4214                  (void) strncpy(news, prepend, len);
4215 4215          } else {
4216 4216                  /*
4217 4217                   * Allocate the space for a new string
4218 4218                   * +1 +1 is for the "/" and the NUL
4219 4219                   * byte at the end of it all.
4220 4220                   */
4221 4221                  curlen = strlen(*curses);
4222 4222                  len = curlen + strlen(prepend) + 1 + 1;
4223 4223                  news = kmem_alloc(len, KM_SLEEP);
4224 4224                  (void) strncpy(news, prepend, len);
4225 4225                  (void) strcat(news, "/");
4226 4226                  (void) strcat(news, *curses);
4227 4227                  kmem_free(*curses, curlen + 1);
4228 4228          }
4229 4229          *curses = news;
4230 4230  }
4231 4231  
4232 4232  /*
4233 4233   * Returns the path name (starting from the fs root) for the given fname.
4234 4234   * The caller is responsible for freeing.  Note that the path may be or
4235 4235   * become invalid as a result of fn_move().
4236 4236   */
4237 4237  
4238 4238  char *
4239 4239  fn_path(nfs4_fname_t *fnp)
4240 4240  {
4241 4241          char *path;
4242 4242          nfs4_fname_t *nextfnp;
4243 4243  
4244 4244          if (fnp == NULL)
4245 4245                  return (NULL);
4246 4246  
4247 4247          path = NULL;
4248 4248  
4249 4249          /* walk up the tree constructing the pathname.  */
4250 4250  
4251 4251          fn_hold(fnp);                   /* adjust for later rele */
4252 4252          do {
4253 4253                  mutex_enter(&fnp->fn_lock);
4254 4254                  /*
4255 4255                   * Add fn_name in front of the current path
4256 4256                   */
4257 4257                  fn_path_realloc(&path, fnp->fn_name);
4258 4258                  nextfnp = fnp->fn_parent;
4259 4259                  if (nextfnp != NULL)
4260 4260                          fn_hold(nextfnp);
4261 4261                  mutex_exit(&fnp->fn_lock);
4262 4262                  fn_rele(&fnp);
4263 4263                  fnp = nextfnp;
4264 4264          } while (fnp != NULL);
4265 4265  
4266 4266          return (path);
4267 4267  }
4268 4268  
4269 4269  /*
4270 4270   * Return a reference to the parent of the given fname, which the caller is
4271 4271   * responsible for eventually releasing.
4272 4272   */
4273 4273  
4274 4274  nfs4_fname_t *
4275 4275  fn_parent(nfs4_fname_t *fnp)
4276 4276  {
4277 4277          nfs4_fname_t *parent;
4278 4278  
4279 4279          mutex_enter(&fnp->fn_lock);
4280 4280          parent = fnp->fn_parent;
4281 4281          if (parent != NULL)
4282 4282                  fn_hold(parent);
4283 4283          mutex_exit(&fnp->fn_lock);
4284 4284  
4285 4285          return (parent);
4286 4286  }
4287 4287  
4288 4288  /*
4289 4289   * Update fnp so that its parent is newparent and its name is newname.
4290 4290   */
4291 4291  
4292 4292  void
4293 4293  fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4294 4294  {
4295 4295          nfs4_fname_t *parent, *tmpfnp;
4296 4296          ssize_t newlen;
4297 4297          nfs4_fname_t key;
4298 4298          avl_index_t where;
4299 4299  
4300 4300          /*
4301 4301           * This assert exists to catch the client trying to rename
4302 4302           * a dir to be a child of itself.  This happened at a recent
4303 4303           * bakeoff against a 3rd party (broken) server which allowed
4304 4304           * the rename to succeed.  If it trips it means that:
4305 4305           *      a) the code in nfs4rename that detects this case is broken
4306 4306           *      b) the server is broken (since it allowed the bogus rename)
4307 4307           *
4308 4308           * For non-DEBUG kernels, prepare for a recursive mutex_enter
4309 4309           * panic below from:  mutex_enter(&newparent->fn_lock);
4310 4310           */
4311 4311          ASSERT(fnp != newparent);
4312 4312  
4313 4313          /*
4314 4314           * Remove fnp from its current parent, change its name, then add it
4315 4315           * to newparent. It might happen that fnp was replaced by another
4316 4316           * nfs4_fname_t with the same fn_name in parent->fn_children.
4317 4317           * In such a case, fnp->fn_parent is NULL and we skip the removal
4318 4318           * of fnp from its current parent.
4319 4319           */
4320 4320          mutex_enter(&fnp->fn_lock);
4321 4321          parent = fnp->fn_parent;
4322 4322          if (parent != NULL) {
4323 4323                  mutex_enter(&parent->fn_lock);
4324 4324                  avl_remove(&parent->fn_children, fnp);
4325 4325                  mutex_exit(&parent->fn_lock);
4326 4326                  fn_rele(&fnp->fn_parent);
4327 4327          }
4328 4328  
4329 4329          newlen = strlen(newname);
4330 4330          if (newlen != fnp->fn_len) {
4331 4331                  ASSERT(newlen < MAXNAMELEN);
4332 4332                  kmem_free(fnp->fn_name, fnp->fn_len + 1);
4333 4333                  fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4334 4334                  fnp->fn_len = newlen;
4335 4335          }
4336 4336          (void) strcpy(fnp->fn_name, newname);
4337 4337  
4338 4338  again:
4339 4339          mutex_enter(&newparent->fn_lock);
4340 4340          key.fn_name = fnp->fn_name;
4341 4341          tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4342 4342          if (tmpfnp != NULL) {
4343 4343                  /*
4344 4344                   * This could be due to a file that was unlinked while
4345 4345                   * open, or perhaps the rnode is in the free list.  Remove
4346 4346                   * it from newparent and let it go away on its own.  The
4347 4347                   * contorted code is to deal with lock order issues and
4348 4348                   * race conditions.
4349 4349                   */
4350 4350                  fn_hold(tmpfnp);
4351 4351                  mutex_exit(&newparent->fn_lock);
4352 4352                  mutex_enter(&tmpfnp->fn_lock);
4353 4353                  if (tmpfnp->fn_parent == newparent) {
4354 4354                          mutex_enter(&newparent->fn_lock);
4355 4355                          avl_remove(&newparent->fn_children, tmpfnp);
4356 4356                          mutex_exit(&newparent->fn_lock);
4357 4357                          fn_rele(&tmpfnp->fn_parent);
4358 4358                  }
4359 4359                  mutex_exit(&tmpfnp->fn_lock);
4360 4360                  fn_rele(&tmpfnp);
4361 4361                  goto again;
4362 4362          }
4363 4363          fnp->fn_parent = newparent;
4364 4364          fn_hold(newparent);
4365 4365          avl_insert(&newparent->fn_children, fnp, where);
4366 4366          mutex_exit(&newparent->fn_lock);
4367 4367          mutex_exit(&fnp->fn_lock);
4368 4368  }
4369 4369  
4370 4370  #ifdef DEBUG
4371 4371  /*
4372 4372   * Return non-zero if the type information makes sense for the given vnode.
4373 4373   * Otherwise panic.
4374 4374   */
4375 4375  int
4376 4376  nfs4_consistent_type(vnode_t *vp)
4377 4377  {
4378 4378          rnode4_t *rp = VTOR4(vp);
4379 4379  
4380 4380          if (nfs4_vtype_debug && vp->v_type != VNON &&
4381 4381              rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4382 4382                  cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4383 4383                      "rnode attr type=%d", (void *)vp, vp->v_type,
4384 4384                      rp->r_attr.va_type);
4385 4385          }
4386 4386  
4387 4387          return (1);
4388 4388  }
4389 4389  #endif /* DEBUG */

↓ open down ↓

239 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX