Print this page
5045 use atomic_{inc,dec}_* instead of atomic_add_*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs4_client.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_client.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /*
26 26 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
27 27 * All Rights Reserved
28 28 */
29 29
30 30 #include <sys/param.h>
31 31 #include <sys/types.h>
32 32 #include <sys/systm.h>
33 33 #include <sys/thread.h>
34 34 #include <sys/t_lock.h>
35 35 #include <sys/time.h>
36 36 #include <sys/vnode.h>
37 37 #include <sys/vfs.h>
38 38 #include <sys/errno.h>
39 39 #include <sys/buf.h>
40 40 #include <sys/stat.h>
41 41 #include <sys/cred.h>
42 42 #include <sys/kmem.h>
43 43 #include <sys/debug.h>
44 44 #include <sys/dnlc.h>
45 45 #include <sys/vmsystm.h>
46 46 #include <sys/flock.h>
47 47 #include <sys/share.h>
48 48 #include <sys/cmn_err.h>
49 49 #include <sys/tiuser.h>
50 50 #include <sys/sysmacros.h>
51 51 #include <sys/callb.h>
52 52 #include <sys/acl.h>
53 53 #include <sys/kstat.h>
54 54 #include <sys/signal.h>
55 55 #include <sys/disp.h>
56 56 #include <sys/atomic.h>
57 57 #include <sys/list.h>
58 58 #include <sys/sdt.h>
59 59
60 60 #include <rpc/types.h>
61 61 #include <rpc/xdr.h>
62 62 #include <rpc/auth.h>
63 63 #include <rpc/clnt.h>
64 64
65 65 #include <nfs/nfs.h>
66 66 #include <nfs/nfs_clnt.h>
67 67 #include <nfs/nfs_acl.h>
68 68
69 69 #include <nfs/nfs4.h>
70 70 #include <nfs/rnode4.h>
71 71 #include <nfs/nfs4_clnt.h>
72 72
73 73 #include <vm/hat.h>
74 74 #include <vm/as.h>
75 75 #include <vm/page.h>
76 76 #include <vm/pvn.h>
77 77 #include <vm/seg.h>
78 78 #include <vm/seg_map.h>
79 79 #include <vm/seg_vn.h>
80 80
81 81 #include <sys/ddi.h>
82 82
83 83 /*
84 84 * Arguments to page-flush thread; nfs4_purge_caches() allocates them and nfs4_pgflush_thread() frees them.
85 85 */
86 86 typedef struct {
87 87 vnode_t *vp; /* vnode to flush; held by the creator, released by the thread */
88 88 cred_t *cr; /* credentials for the flush; held by the creator, freed by the thread */
89 89 } pgflush_t;
90 90
91 91 #ifdef DEBUG /* everything up to the matching #endif exists only in DEBUG builds */
92 92 int nfs4_client_lease_debug; /* NOTE(review): presumably a debug switch; not referenced in this part of the file */
93 93 int nfs4_sharedfh_debug; /* NOTE(review): presumably a debug switch; not referenced in this part of the file */
94 94 int nfs4_fname_debug; /* NOTE(review): presumably a debug switch; not referenced in this part of the file */
95 95
96 96 /* temporary: panic if v_type is inconsistent with r_attr va_type */
97 97 int nfs4_vtype_debug;
98 98
99 99 uint_t nfs4_tsd_key; /* TSD key; nfs4_purge_stale_fh() asserts that its value is NULL */
100 100 #endif /* DEBUG */
101 101
102 102 static time_t nfs4_client_resumed = 0; /* NOTE(review): presumably maintained by nfs4_client_cpr_callb() - confirm */
103 103 static callb_id_t cid = 0; /* NOTE(review): presumably the id of a registered callb - confirm */
104 104
105 105 static int nfs4renew(nfs4_server_t *); /* forward declaration; defined outside the part of the file shown here */
106 106 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int); /* forward declaration; caller must hold r_statelock */
107 107 static void nfs4_pgflush_thread(pgflush_t *); /* forward declaration; thread body started by nfs4_purge_caches() */
108 108
109 109 static boolean_t nfs4_client_cpr_callb(void *, int); /* forward declaration; defined outside the part of the file shown here */
110 110
111 111 struct mi4_globals { /* per-zone bookkeeping for NFS v4 mounts (see mig_list) */
112 112 kmutex_t mig_lock; /* lock protecting mig_list */
113 113 list_t mig_list; /* list of NFS v4 mounts in zone */
114 114 boolean_t mig_destructor_called; /* NOTE(review): presumably set once the zone destructor has run - confirm */
115 115 };
116 116
117 117 static zone_key_t mi4_list_key; /* NOTE(review): presumably the zone key used to look up struct mi4_globals - confirm */
118 118
119 119 /*
120 120 * Attributes caching:
121 121 *
122 122 * Attributes are cached in the rnode in struct vattr form.
123 123 * There is a time associated with the cached attributes (r_time_attr_inval)
124 124 * which tells whether the attributes are valid. The time is initialized
125 125 * to the difference between current time and the modify time of the vnode
126 126 * when new attributes are cached. This allows the attributes for
127 127 * files that have changed recently to be timed out sooner than for files
128 128 * that have not changed for a long time. There are minimum and maximum
129 129 * timeout values that can be set per mount point.
130 130 */
131 131
132 132 /*
133 133 * If a cache purge is in progress, wait for it to finish.
134 134 *
135 135 * The current thread must not be in the middle of an
136 136 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock
137 137 * between this thread, a recovery thread, and the page flush thread.
138 138 */
139 139 int
140 140 nfs4_waitfor_purge_complete(vnode_t *vp)
141 141 {
142 142 rnode4_t *rp;
143 143 k_sigset_t smask;
144 144
145 145 rp = VTOR4(vp);
146 146 if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
147 147 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
148 148 mutex_enter(&rp->r_statelock);
149 149 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
150 150 while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
151 151 ((rp->r_flags & R4PGFLUSH) &&
152 152 rp->r_pgflush != curthread)) {
153 153 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
154 154 sigunintr(&smask);
155 155 mutex_exit(&rp->r_statelock);
156 156 return (EINTR);
157 157 }
158 158 }
159 159 sigunintr(&smask);
160 160 mutex_exit(&rp->r_statelock);
161 161 }
162 162 return (0);
163 163 }
164 164
165 165 /*
166 166 * Validate caches by checking cached attributes. If they have timed out,
167 167 * then get new attributes from the server. As a side effect, cache
168 168 * invalidation is done if the attributes have changed.
169 169 *
170 170 * If the attributes have not timed out and if there is a cache
171 171 * invalidation being done by some other thread, then wait until that
172 172 * thread has completed the cache invalidation.
173 173 */
174 174 int
175 175 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
176 176 {
177 177 int error;
178 178 nfs4_ga_res_t gar;
179 179
180 180 if (ATTRCACHE4_VALID(vp)) {
181 181 error = nfs4_waitfor_purge_complete(vp);
182 182 if (error)
183 183 return (error);
184 184 return (0);
185 185 }
186 186
187 187 gar.n4g_va.va_mask = AT_ALL;
188 188 return (nfs4_getattr_otw(vp, &gar, cr, 0));
189 189 }
190 190
191 191 /*
192 192 * Fill in attribute from the cache.
193 193 * If valid, then return 0 to indicate that no error occurred,
194 194 * otherwise return 1 to indicate that an error occurred.
195 195 */
196 196 static int
197 197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
198 198 {
199 199 rnode4_t *rp;
200 200
201 201 rp = VTOR4(vp);
202 202 mutex_enter(&rp->r_statelock);
203 203 mutex_enter(&rp->r_statev4_lock);
204 204 if (ATTRCACHE4_VALID(vp)) {
205 205 mutex_exit(&rp->r_statev4_lock);
206 206 /*
207 207 * Cached attributes are valid
208 208 */
209 209 *vap = rp->r_attr;
210 210 mutex_exit(&rp->r_statelock);
211 211 return (0);
212 212 }
213 213 mutex_exit(&rp->r_statev4_lock);
214 214 mutex_exit(&rp->r_statelock);
215 215 return (1);
216 216 }
217 217
218 218
219 219 /*
220 220 * If returned error is ESTALE flush all caches. The nfs4_purge_caches()
221 221 * call is synchronous because all the pages were invalidated by the
222 222 * nfs4_invalidate_pages() call.
223 223 */
224 224 void
225 225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
226 226 {
227 227 struct rnode4 *rp = VTOR4(vp);
228 228
229 229 /* Ensure that the ..._end_op() call has been done */
230 230 ASSERT(tsd_get(nfs4_tsd_key) == NULL);
231 231
232 232 if (errno != ESTALE)
233 233 return;
234 234
235 235 mutex_enter(&rp->r_statelock);
236 236 rp->r_flags |= R4STALE;
237 237 if (!rp->r_error)
238 238 rp->r_error = errno;
239 239 mutex_exit(&rp->r_statelock);
240 240 if (nfs4_has_pages(vp))
241 241 nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
242 242 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
243 243 }
244 244
245 245 /*
246 246 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the
247 247 * page purge is done asynchronously.
248 248 */
249 249 void
250 250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
251 251 {
252 252 rnode4_t *rp;
253 253 char *contents;
254 254 vnode_t *xattr;
255 255 int size;
256 256 int pgflush; /* are we the page flush thread? */
257 257
258 258 /*
259 259 * Purge the DNLC for any entries which refer to this file.
260 260 */
261 261 if (vp->v_count > 1 &&
262 262 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
263 263 dnlc_purge_vp(vp);
264 264
265 265 /*
266 266 * Clear any readdir state bits and purge the readlink response cache.
267 267 */
268 268 rp = VTOR4(vp);
269 269 mutex_enter(&rp->r_statelock);
270 270 rp->r_flags &= ~R4LOOKUP;
271 271 contents = rp->r_symlink.contents;
272 272 size = rp->r_symlink.size;
273 273 rp->r_symlink.contents = NULL;
274 274
275 275 xattr = rp->r_xattr_dir;
276 276 rp->r_xattr_dir = NULL;
277 277
278 278 /*
279 279 * Purge pathconf cache too.
280 280 */
281 281 rp->r_pathconf.pc4_xattr_valid = 0;
282 282 rp->r_pathconf.pc4_cache_valid = 0;
283 283
284 284 pgflush = (curthread == rp->r_pgflush);
285 285 mutex_exit(&rp->r_statelock);
286 286
287 287 if (contents != NULL) {
288 288
289 289 kmem_free((void *)contents, size);
290 290 }
291 291
292 292 if (xattr != NULL)
293 293 VN_RELE(xattr);
294 294
295 295 /*
296 296 * Flush the page cache. If the current thread is the page flush
297 297 * thread, don't initiate a new page flush. There's no need for
298 298 * it, and doing it correctly is hard.
299 299 */
300 300 if (nfs4_has_pages(vp) && !pgflush) {
301 301 if (!asyncpg) {
302 302 (void) nfs4_waitfor_purge_complete(vp);
303 303 nfs4_flush_pages(vp, cr);
304 304 } else {
305 305 pgflush_t *args;
306 306
307 307 /*
308 308 * We don't hold r_statelock while creating the
309 309 * thread, in case the call blocks. So we use a
310 310 * flag to indicate that a page flush thread is
311 311 * active.
312 312 */
313 313 mutex_enter(&rp->r_statelock);
314 314 if (rp->r_flags & R4PGFLUSH) {
315 315 mutex_exit(&rp->r_statelock);
316 316 } else {
317 317 rp->r_flags |= R4PGFLUSH;
318 318 mutex_exit(&rp->r_statelock);
319 319
320 320 args = kmem_alloc(sizeof (pgflush_t),
321 321 KM_SLEEP);
322 322 args->vp = vp;
323 323 VN_HOLD(args->vp);
324 324 args->cr = cr;
325 325 crhold(args->cr);
326 326 (void) zthread_create(NULL, 0,
327 327 nfs4_pgflush_thread, args, 0,
328 328 minclsyspri);
329 329 }
330 330 }
331 331 }
332 332
333 333 /*
334 334 * Flush the readdir response cache.
335 335 */
336 336 nfs4_purge_rddir_cache(vp);
337 337 }
338 338
339 339 /*
340 340 * Invalidate all pages for the given file, after writing back the dirty
341 341 * ones.
342 342 */
343 343
344 344 void
345 345 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
346 346 {
347 347 int error;
348 348 rnode4_t *rp = VTOR4(vp);
349 349
350 350 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
351 351 if (error == ENOSPC || error == EDQUOT) {
352 352 mutex_enter(&rp->r_statelock);
353 353 if (!rp->r_error)
354 354 rp->r_error = error;
355 355 mutex_exit(&rp->r_statelock);
356 356 }
357 357 }
358 358
359 359 /*
360 360 * Page flush thread.
361 361 */
362 362
363 363 static void
364 364 nfs4_pgflush_thread(pgflush_t *args)
365 365 {
366 366 rnode4_t *rp = VTOR4(args->vp);
367 367
368 368 /* remember which thread we are, so we don't deadlock ourselves */
369 369 mutex_enter(&rp->r_statelock);
370 370 ASSERT(rp->r_pgflush == NULL);
371 371 rp->r_pgflush = curthread;
372 372 mutex_exit(&rp->r_statelock);
373 373
374 374 nfs4_flush_pages(args->vp, args->cr);
375 375
376 376 mutex_enter(&rp->r_statelock);
377 377 rp->r_pgflush = NULL;
378 378 rp->r_flags &= ~R4PGFLUSH;
379 379 cv_broadcast(&rp->r_cv);
380 380 mutex_exit(&rp->r_statelock);
381 381
382 382 VN_RELE(args->vp);
383 383 crfree(args->cr);
384 384 kmem_free(args, sizeof (pgflush_t));
385 385 zthread_exit();
386 386 }
387 387
388 388 /*
389 389 * Purge the readdir cache of all entries which are not currently
390 390 * being filled.
391 391 */
392 392 void
393 393 nfs4_purge_rddir_cache(vnode_t *vp)
394 394 {
395 395 rnode4_t *rp;
396 396
397 397 rp = VTOR4(vp);
398 398
399 399 mutex_enter(&rp->r_statelock);
400 400 rp->r_direof = NULL;
401 401 rp->r_flags &= ~R4LOOKUP;
402 402 rp->r_flags |= R4READDIRWATTR;
403 403 rddir4_cache_purge(rp);
404 404 mutex_exit(&rp->r_statelock);
405 405 }
406 406
407 407 /*
408 408 * Set attributes cache for given vnode using virtual attributes. There is
409 409 * no cache validation, but if the attributes are deemed to be stale, they
410 410 * are ignored. This corresponds to nfs3_attrcache().
411 411 *
412 412 * Set the timeout value on the attribute cache and fill it
413 413 * with the passed in attributes.
414 414 */
415 415 void
416 416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
417 417 {
418 418 rnode4_t *rp = VTOR4(vp);
419 419
420 420 mutex_enter(&rp->r_statelock);
421 421 if (rp->r_time_attr_saved <= t)
422 422 nfs4_attrcache_va(vp, garp, FALSE);
423 423 mutex_exit(&rp->r_statelock);
424 424 }
425 425
426 426 /*
427 427 * Use the passed in virtual attributes to check to see whether the
428 428 * data and metadata caches are valid, cache the new attributes, and
429 429 * then do the cache invalidation if required.
430 430 *
431 431 * The cache validation and caching of the new attributes is done
432 432 * atomically via the use of the mutex, r_statelock. If required,
433 433 * the cache invalidation is done atomically w.r.t. the cache
434 434 * validation and caching of the attributes via the pseudo lock,
435 435 * r_serial.
436 436 *
437 437 * This routine is used to do cache validation and attributes caching
438 438 * for operations with a single set of post operation attributes.
439 439 */
440 440
441 441 void
442 442 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
443 443 hrtime_t t, cred_t *cr, int async,
444 444 change_info4 *cinfo)
445 445 {
446 446 rnode4_t *rp;
447 447 int mtime_changed = 0;
448 448 int ctime_changed = 0;
449 449 vsecattr_t *vsp;
450 450 int was_serial, set_time_cache_inval, recov;
451 451 vattr_t *vap = &garp->n4g_va;
452 452 mntinfo4_t *mi = VTOMI4(vp);
453 453 len_t preattr_rsize;
454 454 boolean_t writemodify_set = B_FALSE;
455 455 boolean_t cachepurge_set = B_FALSE;
456 456
457 457 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
458 458
459 459 /* Is curthread the recovery thread? */
460 460 mutex_enter(&mi->mi_lock);
461 461 recov = (VTOMI4(vp)->mi_recovthread == curthread);
462 462 mutex_exit(&mi->mi_lock);
463 463
464 464 rp = VTOR4(vp);
465 465 mutex_enter(&rp->r_statelock);
466 466 was_serial = (rp->r_serial == curthread);
467 467 if (rp->r_serial && !was_serial) {
468 468 klwp_t *lwp = ttolwp(curthread);
469 469
470 470 /*
471 471 * If we're the recovery thread, then purge current attrs
472 472 * and bail out to avoid potential deadlock between another
473 473 * thread caching attrs (r_serial thread), recov thread,
474 474 * and an async writer thread.
475 475 */
476 476 if (recov) {
477 477 PURGE_ATTRCACHE4_LOCKED(rp);
478 478 mutex_exit(&rp->r_statelock);
479 479 return;
480 480 }
481 481
482 482 if (lwp != NULL)
483 483 lwp->lwp_nostop++;
484 484 while (rp->r_serial != NULL) {
485 485 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
486 486 mutex_exit(&rp->r_statelock);
487 487 if (lwp != NULL)
488 488 lwp->lwp_nostop--;
489 489 return;
490 490 }
491 491 }
492 492 if (lwp != NULL)
493 493 lwp->lwp_nostop--;
494 494 }
495 495
496 496 /*
497 497 * If there is a page flush thread, the current thread needs to
498 498 * bail out, to prevent a possible deadlock between the current
499 499 * thread (which might be in a start_op/end_op region), the
500 500 * recovery thread, and the page flush thread. Expire the
501 501 * attribute cache, so that any attributes the current thread was
502 502 * going to set are not lost.
503 503 */
504 504 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
505 505 PURGE_ATTRCACHE4_LOCKED(rp);
506 506 mutex_exit(&rp->r_statelock);
507 507 return;
508 508 }
509 509
510 510 if (rp->r_time_attr_saved > t) {
511 511 /*
512 512 * Attributes have been cached since these attributes were
513 513 * probably made. If there is an inconsistency in what is
514 514 * cached, mark them invalid. If not, don't act on them.
515 515 */
516 516 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
517 517 PURGE_ATTRCACHE4_LOCKED(rp);
518 518 mutex_exit(&rp->r_statelock);
519 519 return;
520 520 }
521 521 set_time_cache_inval = 0;
522 522 if (cinfo) {
523 523 /*
524 524 * Only directory modifying callers pass non-NULL cinfo.
525 525 */
526 526 ASSERT(vp->v_type == VDIR);
527 527 /*
528 528 * If the cache timeout either doesn't exist or hasn't expired,
529 529 * and dir didn't changed on server before dirmod op
530 530 * and dir didn't change after dirmod op but before getattr
531 531 * then there's a chance that the client's cached data for
532 532 * this object is current (not stale). No immediate cache
533 533 * flush is required.
534 534 *
535 535 */
536 536 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
537 537 cinfo->before == rp->r_change &&
538 538 (garp->n4g_change_valid &&
539 539 cinfo->after == garp->n4g_change)) {
540 540
541 541 /*
542 542 * If atomic isn't set, then the before/after info
543 543 * cannot be blindly trusted. For this case, we tell
544 544 * nfs4_attrcache_va to cache the attrs but also
545 545 * establish an absolute maximum cache timeout. When
546 546 * the timeout is reached, caches will be flushed.
547 547 */
548 548 if (! cinfo->atomic)
549 549 set_time_cache_inval = 1;
550 550 } else {
551 551
552 552 /*
553 553 * We're not sure exactly what changed, but we know
554 554 * what to do. flush all caches for dir. remove the
555 555 * attr timeout.
556 556 *
557 557 * a) timeout expired. flush all caches.
558 558 * b) r_change != cinfo.before. flush all caches.
559 559 * c) r_change == cinfo.before, but cinfo.after !=
560 560 * post-op getattr(change). flush all caches.
561 561 * d) post-op getattr(change) not provided by server.
562 562 * flush all caches.
563 563 */
564 564 mtime_changed = 1;
565 565 ctime_changed = 1;
566 566 rp->r_time_cache_inval = 0;
567 567 }
568 568 } else {
569 569 /*
570 570 * Write thread after writing data to file on remote server,
571 571 * will always set R4WRITEMODIFIED to indicate that file on
572 572 * remote server was modified with a WRITE operation and would
573 573 * have marked attribute cache as timed out. If R4WRITEMODIFIED
574 574 * is set, then do not check for mtime and ctime change.
575 575 */
576 576 if (!(rp->r_flags & R4WRITEMODIFIED)) {
577 577 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
578 578 mtime_changed = 1;
579 579
580 580 if (rp->r_attr.va_ctime.tv_sec !=
581 581 vap->va_ctime.tv_sec ||
582 582 rp->r_attr.va_ctime.tv_nsec !=
583 583 vap->va_ctime.tv_nsec)
584 584 ctime_changed = 1;
585 585 } else {
586 586 writemodify_set = B_TRUE;
587 587 }
588 588 }
589 589
590 590 preattr_rsize = rp->r_size;
591 591
592 592 nfs4_attrcache_va(vp, garp, set_time_cache_inval);
593 593
594 594 /*
595 595 * If we have updated filesize in nfs4_attrcache_va, as soon as we
596 596 * drop statelock we will be in transition of purging all
597 597 * our caches and updating them. It is possible for another
598 598 * thread to pick this new file size and read in zeroed data.
599 599 * stall other threads till cache purge is complete.
600 600 */
601 601 if ((!cinfo) && (rp->r_size != preattr_rsize)) {
602 602 /*
603 603 * If R4WRITEMODIFIED was set and we have updated the file
604 604 * size, Server's returned file size need not necessarily
605 605 * be because of this Client's WRITE. We need to purge
606 606 * all caches.
607 607 */
608 608 if (writemodify_set)
609 609 mtime_changed = 1;
610 610
611 611 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
612 612 rp->r_flags |= R4INCACHEPURGE;
613 613 cachepurge_set = B_TRUE;
614 614 }
615 615 }
616 616
617 617 if (!mtime_changed && !ctime_changed) {
618 618 mutex_exit(&rp->r_statelock);
619 619 return;
620 620 }
621 621
622 622 rp->r_serial = curthread;
623 623
624 624 mutex_exit(&rp->r_statelock);
625 625
626 626 /*
627 627 * If we're the recov thread, then force async nfs4_purge_caches
628 628 * to avoid potential deadlock.
629 629 */
630 630 if (mtime_changed)
631 631 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
632 632
633 633 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
634 634 mutex_enter(&rp->r_statelock);
635 635 rp->r_flags &= ~R4INCACHEPURGE;
636 636 cv_broadcast(&rp->r_cv);
637 637 mutex_exit(&rp->r_statelock);
638 638 cachepurge_set = B_FALSE;
639 639 }
640 640
641 641 if (ctime_changed) {
642 642 (void) nfs4_access_purge_rp(rp);
643 643 if (rp->r_secattr != NULL) {
644 644 mutex_enter(&rp->r_statelock);
645 645 vsp = rp->r_secattr;
646 646 rp->r_secattr = NULL;
647 647 mutex_exit(&rp->r_statelock);
648 648 if (vsp != NULL)
649 649 nfs4_acl_free_cache(vsp);
650 650 }
651 651 }
652 652
653 653 if (!was_serial) {
654 654 mutex_enter(&rp->r_statelock);
655 655 rp->r_serial = NULL;
656 656 cv_broadcast(&rp->r_cv);
657 657 mutex_exit(&rp->r_statelock);
658 658 }
659 659 }
660 660
661 661 /*
662 662 * Set attributes cache for given vnode using virtual attributes.
663 663 *
664 664 * Set the timeout value on the attribute cache and fill it
665 665 * with the passed in attributes.
666 666 *
667 667 * The caller must be holding r_statelock.
668 668 */
669 669 static void
670 670 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
671 671 {
672 672 rnode4_t *rp;
673 673 mntinfo4_t *mi;
674 674 hrtime_t delta;
675 675 hrtime_t now;
676 676 vattr_t *vap = &garp->n4g_va;
677 677
678 678 rp = VTOR4(vp);
679 679
680 680 ASSERT(MUTEX_HELD(&rp->r_statelock));
681 681 ASSERT(vap->va_mask == AT_ALL);
682 682
683 683 /* Switch to master before checking v_flag */
684 684 if (IS_SHADOW(vp, rp))
685 685 vp = RTOV4(rp);
686 686
687 687 now = gethrtime();
688 688
689 689 mi = VTOMI4(vp);
690 690
691 691 /*
692 692 * Only establish a new cache timeout (if requested). Never
693 693 * extend a timeout. Never clear a timeout. Clearing a timeout
694 694 * is done by nfs4_update_dircaches (ancestor in our call chain)
695 695 */
696 696 if (set_cache_timeout && ! rp->r_time_cache_inval)
697 697 rp->r_time_cache_inval = now + mi->mi_acdirmax;
698 698
699 699 /*
700 700 * Delta is the number of nanoseconds that we will
701 701 * cache the attributes of the file. It is based on
702 702 * the number of nanoseconds since the last time that
703 703 * we detected a change. The assumption is that files
704 704 * that changed recently are likely to change again.
705 705 * There is a minimum and a maximum for regular files
706 706 * and for directories which is enforced though.
707 707 *
708 708 * Using the time since last change was detected
709 709 * eliminates direct comparison or calculation
710 710 * using mixed client and server times. NFS does
711 711 * not make any assumptions regarding the client
712 712 * and server clocks being synchronized.
713 713 */
714 714 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
715 715 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
716 716 vap->va_size != rp->r_attr.va_size) {
717 717 rp->r_time_attr_saved = now;
718 718 }
719 719
720 720 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
721 721 delta = 0;
722 722 else {
723 723 delta = now - rp->r_time_attr_saved;
724 724 if (vp->v_type == VDIR) {
725 725 if (delta < mi->mi_acdirmin)
726 726 delta = mi->mi_acdirmin;
727 727 else if (delta > mi->mi_acdirmax)
728 728 delta = mi->mi_acdirmax;
729 729 } else {
730 730 if (delta < mi->mi_acregmin)
731 731 delta = mi->mi_acregmin;
732 732 else if (delta > mi->mi_acregmax)
733 733 delta = mi->mi_acregmax;
734 734 }
735 735 }
736 736 rp->r_time_attr_inval = now + delta;
737 737
738 738 rp->r_attr = *vap;
739 739 if (garp->n4g_change_valid)
740 740 rp->r_change = garp->n4g_change;
741 741
742 742 /*
743 743 * The attributes that were returned may be valid and can
744 744 * be used, but they may not be allowed to be cached.
745 745 * Reset the timers to cause immediate invalidation and
746 746 * clear r_change so no VERIFY operations will suceed
747 747 */
748 748 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
749 749 rp->r_time_attr_inval = now;
750 750 rp->r_time_attr_saved = now;
751 751 rp->r_change = 0;
752 752 }
753 753
754 754 /*
755 755 * If mounted_on_fileid returned AND the object is a stub,
756 756 * then set object's va_nodeid to the mounted over fid
757 757 * returned by server.
758 758 *
759 759 * If mounted_on_fileid not provided/supported, then
760 760 * just set it to 0 for now. Eventually it would be
761 761 * better to set it to a hashed version of FH. This
762 762 * would probably be good enough to provide a unique
763 763 * fid/d_ino within a dir.
764 764 *
765 765 * We don't need to carry mounted_on_fileid in the
766 766 * rnode as long as the client never requests fileid
767 767 * without also requesting mounted_on_fileid. For
768 768 * now, it stays.
769 769 */
770 770 if (garp->n4g_mon_fid_valid) {
771 771 rp->r_mntd_fid = garp->n4g_mon_fid;
772 772
773 773 if (RP_ISSTUB(rp))
774 774 rp->r_attr.va_nodeid = rp->r_mntd_fid;
775 775 }
776 776
777 777 /*
778 778 * Check to see if there are valid pathconf bits to
779 779 * cache in the rnode.
780 780 */
781 781 if (garp->n4g_ext_res) {
782 782 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
783 783 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
784 784 } else {
785 785 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
786 786 rp->r_pathconf.pc4_xattr_valid = TRUE;
787 787 rp->r_pathconf.pc4_xattr_exists =
788 788 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
789 789 }
790 790 }
791 791 }
792 792 /*
793 793 * Update the size of the file if there is no cached data or if
794 794 * the cached data is clean and there is no data being written
795 795 * out.
796 796 */
797 797 if (rp->r_size != vap->va_size &&
798 798 (!vn_has_cached_data(vp) ||
799 799 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
800 800 rp->r_size = vap->va_size;
801 801 }
802 802 nfs_setswaplike(vp, vap);
803 803 rp->r_flags &= ~R4WRITEMODIFIED;
804 804 }
805 805
806 806 /*
807 807 * Get attributes over-the-wire and update attributes cache
808 808 * if no error occurred in the over-the-wire operation.
809 809 * Return 0 if successful, otherwise error.
810 810 */
811 811 int
812 812 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
813 813 {
814 814 mntinfo4_t *mi = VTOMI4(vp);
815 815 hrtime_t t;
816 816 nfs4_recov_state_t recov_state;
817 817 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
818 818
819 819 recov_state.rs_flags = 0;
820 820 recov_state.rs_num_retry_despite_err = 0;
821 821
822 822 /* Save the original mount point security flavor */
823 823 (void) save_mnt_secinfo(mi->mi_curr_serv);
824 824
825 825 recov_retry:
826 826
827 827 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
828 828 &recov_state, NULL))) {
829 829 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
830 830 return (e.error);
831 831 }
832 832
833 833 t = gethrtime();
834 834
835 835 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
836 836
837 837 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
838 838 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
839 839 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) {
840 840 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
841 841 &recov_state, 1);
842 842 goto recov_retry;
843 843 }
844 844 }
845 845
846 846 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
847 847
848 848 if (!e.error) {
849 849 if (e.stat == NFS4_OK) {
850 850 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
851 851 } else {
852 852 e.error = geterrno4(e.stat);
853 853
854 854 nfs4_purge_stale_fh(e.error, vp, cr);
855 855 }
856 856 }
857 857
858 858 /*
859 859 * If getattr a node that is a stub for a crossed
860 860 * mount point, keep the original secinfo flavor for
861 861 * the current file system, not the crossed one.
862 862 */
863 863 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
864 864
865 865 return (e.error);
866 866 }
867 867
868 868 /*
869 869 * Generate a compound to get attributes over-the-wire.
870 870 */
871 871 void
872 872 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
873 873 nfs4_error_t *ep, cred_t *cr, int get_acl)
874 874 {
875 875 COMPOUND4args_clnt args;
876 876 COMPOUND4res_clnt res;
877 877 int doqueue;
878 878 rnode4_t *rp = VTOR4(vp);
879 879 nfs_argop4 argop[2];
880 880
881 881 args.ctag = TAG_GETATTR;
882 882
883 883 args.array_len = 2;
884 884 args.array = argop;
885 885
886 886 /* putfh */
887 887 argop[0].argop = OP_CPUTFH;
888 888 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
889 889
890 890 /* getattr */
891 891 /*
892 892 * Unlike nfs version 2 and 3, where getattr returns all the
893 893 * attributes, nfs version 4 returns only the ones explicitly
894 894 * asked for. This creates problems, as some system functions
895 895 * (e.g. cache check) require certain attributes and if the
896 896 * cached node lacks some attributes such as uid/gid, it can
897 897 * affect system utilities (e.g. "ls") that rely on the information
898 898 * to be there. This can lead to anything from system crashes to
899 899 * corrupted information processed by user apps.
900 900 * So to ensure that all bases are covered, request at least
901 901 * the AT_ALL attribute mask.
902 902 */
903 903 argop[1].argop = OP_GETATTR;
904 904 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
905 905 if (get_acl)
906 906 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
907 907 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
908 908
909 909 doqueue = 1;
910 910
911 911 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
912 912
913 913 if (ep->error)
914 914 return;
915 915
916 916 if (res.status != NFS4_OK) {
917 917 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
918 918 return;
919 919 }
920 920
921 921 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
922 922
923 923 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
924 924 }
925 925
926 926 /*
927 927 * Return either cached or remote attributes. If get remote attr
928 928 * use them to check and invalidate caches, then cache the new attributes.
929 929 */
930 930 int
931 931 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
932 932 {
933 933 int error;
934 934 rnode4_t *rp;
935 935 nfs4_ga_res_t gar;
936 936
937 937 ASSERT(nfs4_consistent_type(vp));
938 938
939 939 /*
940 940 * If we've got cached attributes, we're done, otherwise go
941 941 * to the server to get attributes, which will update the cache
942 942 * in the process. Either way, use the cached attributes for
943 943 * the caller's vattr_t.
944 944 *
945 945 * Note that we ignore the gar set by the OTW call: the attr caching
946 946 * code may make adjustments when storing to the rnode, and we want
947 947 * to see those changes here.
948 948 */
949 949 rp = VTOR4(vp);
950 950 error = 0;
951 951 mutex_enter(&rp->r_statelock);
952 952 if (!ATTRCACHE4_VALID(vp)) {
953 953 mutex_exit(&rp->r_statelock);
954 954 error = nfs4_getattr_otw(vp, &gar, cr, 0);
955 955 mutex_enter(&rp->r_statelock);
956 956 }
957 957
958 958 if (!error)
959 959 *vap = rp->r_attr;
960 960
961 961 /* Return the client's view of file size */
962 962 vap->va_size = rp->r_size;
963 963
964 964 mutex_exit(&rp->r_statelock);
965 965
966 966 ASSERT(nfs4_consistent_type(vp));
967 967
968 968 return (error);
969 969 }
970 970
971 971 int
972 972 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
973 973 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
974 974 {
975 975 COMPOUND4args_clnt args;
976 976 COMPOUND4res_clnt res;
977 977 int doqueue;
978 978 nfs_argop4 argop[2];
979 979 mntinfo4_t *mi = VTOMI4(vp);
980 980 bool_t needrecov = FALSE;
981 981 nfs4_recov_state_t recov_state;
982 982 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
983 983 nfs4_ga_ext_res_t *gerp;
984 984
985 985 recov_state.rs_flags = 0;
986 986 recov_state.rs_num_retry_despite_err = 0;
987 987
988 988 recov_retry:
989 989 args.ctag = tag_type;
990 990
991 991 args.array_len = 2;
992 992 args.array = argop;
993 993
994 994 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
995 995 if (e.error)
996 996 return (e.error);
997 997
998 998 /* putfh */
999 999 argop[0].argop = OP_CPUTFH;
1000 1000 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1001 1001
1002 1002 /* getattr */
1003 1003 argop[1].argop = OP_GETATTR;
1004 1004 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1005 1005 argop[1].nfs_argop4_u.opgetattr.mi = mi;
1006 1006
1007 1007 doqueue = 1;
1008 1008
1009 1009 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1010 1010 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1011 1011 rnode4info(VTOR4(vp))));
1012 1012
1013 1013 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1014 1014
1015 1015 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1016 1016 if (!needrecov && e.error) {
1017 1017 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1018 1018 needrecov);
1019 1019 return (e.error);
1020 1020 }
1021 1021
1022 1022 if (needrecov) {
1023 1023 bool_t abort;
1024 1024
1025 1025 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1026 1026 "nfs4_attr_otw: initiating recovery\n"));
1027 1027
1028 1028 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1029 1029 NULL, OP_GETATTR, NULL, NULL, NULL);
1030 1030 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1031 1031 needrecov);
1032 1032 if (!e.error) {
1033 1033 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1034 1034 e.error = geterrno4(res.status);
1035 1035 }
1036 1036 if (abort == FALSE)
1037 1037 goto recov_retry;
1038 1038 return (e.error);
1039 1039 }
1040 1040
1041 1041 if (res.status) {
1042 1042 e.error = geterrno4(res.status);
1043 1043 } else {
1044 1044 gerp = garp->n4g_ext_res;
1045 1045 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1046 1046 garp, sizeof (nfs4_ga_res_t));
1047 1047 garp->n4g_ext_res = gerp;
1048 1048 if (garp->n4g_ext_res &&
1049 1049 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1050 1050 bcopy(res.array[1].nfs_resop4_u.opgetattr.
1051 1051 ga_res.n4g_ext_res,
1052 1052 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1053 1053 }
1054 1054 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1055 1055 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1056 1056 needrecov);
1057 1057 return (e.error);
1058 1058 }
1059 1059
1060 1060 /*
1061 1061 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1062 1062 * for the demand-based allocation of async threads per-mount. The
1063 1063 * nfs_async_timeout is the amount of time a thread will live after it
1064 1064 * becomes idle, unless new I/O requests are received before the thread
1065 1065 * dies. See nfs4_async_putpage and nfs4_async_start.
1066 1066 */
1067 1067
1068 1068 static void nfs4_async_start(struct vfs *);
1069 1069 static void nfs4_async_pgops_start(struct vfs *);
1070 1070 static void nfs4_async_common_start(struct vfs *, int);
1071 1071
1072 1072 static void
1073 1073 free_async_args4(struct nfs4_async_reqs *args)
1074 1074 {
1075 1075 rnode4_t *rp;
1076 1076
1077 1077 if (args->a_io != NFS4_INACTIVE) {
1078 1078 rp = VTOR4(args->a_vp);
1079 1079 mutex_enter(&rp->r_statelock);
1080 1080 rp->r_count--;
1081 1081 if (args->a_io == NFS4_PUTAPAGE ||
1082 1082 args->a_io == NFS4_PAGEIO)
1083 1083 rp->r_awcount--;
1084 1084 cv_broadcast(&rp->r_cv);
1085 1085 mutex_exit(&rp->r_statelock);
1086 1086 VN_RELE(args->a_vp);
1087 1087 }
1088 1088 crfree(args->a_cred);
1089 1089 kmem_free(args, sizeof (*args));
1090 1090 }
1091 1091
1092 1092 /*
1093 1093 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1094 1094 * pageout(), running in the global zone, have legitimate reasons to do
1095 1095 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1096 1096 * use of a a per-mount "asynchronous requests manager thread" which is
1097 1097 * signaled by the various asynchronous work routines when there is
1098 1098 * asynchronous work to be done. It is responsible for creating new
1099 1099 * worker threads if necessary, and notifying existing worker threads
1100 1100 * that there is work to be done.
1101 1101 *
1102 1102 * In other words, it will "take the specifications from the customers and
1103 1103 * give them to the engineers."
1104 1104 *
1105 1105 * Worker threads die off of their own accord if they are no longer
1106 1106 * needed.
1107 1107 *
1108 1108 * This thread is killed when the zone is going away or the filesystem
1109 1109 * is being unmounted.
1110 1110 */
1111 1111 void
1112 1112 nfs4_async_manager(vfs_t *vfsp)
1113 1113 {
1114 1114 callb_cpr_t cprinfo;
1115 1115 mntinfo4_t *mi;
1116 1116 uint_t max_threads;
1117 1117
1118 1118 mi = VFTOMI4(vfsp);
1119 1119
1120 1120 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1121 1121 "nfs4_async_manager");
1122 1122
1123 1123 mutex_enter(&mi->mi_async_lock);
1124 1124 /*
1125 1125 * We want to stash the max number of threads that this mount was
1126 1126 * allowed so we can use it later when the variable is set to zero as
1127 1127 * part of the zone/mount going away.
1128 1128 *
1129 1129 * We want to be able to create at least one thread to handle
1130 1130 * asynchronous inactive calls.
1131 1131 */
1132 1132 max_threads = MAX(mi->mi_max_threads, 1);
1133 1133 /*
1134 1134 * We don't want to wait for mi_max_threads to go to zero, since that
1135 1135 * happens as part of a failed unmount, but this thread should only
1136 1136 * exit when the mount is really going away.
1137 1137 *
1138 1138 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1139 1139 * attempted: the various _async_*() functions know to do things
1140 1140 * inline if mi_max_threads == 0. Henceforth we just drain out the
1141 1141 * outstanding requests.
1142 1142 *
1143 1143 * Note that we still create zthreads even if we notice the zone is
1144 1144 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1145 1145 * shutdown sequence to take slightly longer in some cases, but
1146 1146 * doesn't violate the protocol, as all threads will exit as soon as
1147 1147 * they're done processing the remaining requests.
1148 1148 */
1149 1149 for (;;) {
1150 1150 while (mi->mi_async_req_count > 0) {
1151 1151 /*
1152 1152 * Paranoia: If the mount started out having
1153 1153 * (mi->mi_max_threads == 0), and the value was
1154 1154 * later changed (via a debugger or somesuch),
1155 1155 * we could be confused since we will think we
1156 1156 * can't create any threads, and the calling
1157 1157 * code (which looks at the current value of
1158 1158 * mi->mi_max_threads, now non-zero) thinks we
1159 1159 * can.
1160 1160 *
1161 1161 * So, because we're paranoid, we create threads
1162 1162 * up to the maximum of the original and the
1163 1163 * current value. This means that future
1164 1164 * (debugger-induced) alterations of
1165 1165 * mi->mi_max_threads are ignored for our
1166 1166 * purposes, but who told them they could change
1167 1167 * random values on a live kernel anyhow?
1168 1168 */
1169 1169 if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1170 1170 MAX(mi->mi_max_threads, max_threads)) {
1171 1171 mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1172 1172 mutex_exit(&mi->mi_async_lock);
1173 1173 MI4_HOLD(mi);
1174 1174 VFS_HOLD(vfsp); /* hold for new thread */
1175 1175 (void) zthread_create(NULL, 0, nfs4_async_start,
1176 1176 vfsp, 0, minclsyspri);
1177 1177 mutex_enter(&mi->mi_async_lock);
1178 1178 } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1179 1179 NUM_ASYNC_PGOPS_THREADS) {
1180 1180 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1181 1181 mutex_exit(&mi->mi_async_lock);
1182 1182 MI4_HOLD(mi);
1183 1183 VFS_HOLD(vfsp); /* hold for new thread */
1184 1184 (void) zthread_create(NULL, 0,
1185 1185 nfs4_async_pgops_start, vfsp, 0,
1186 1186 minclsyspri);
1187 1187 mutex_enter(&mi->mi_async_lock);
1188 1188 }
1189 1189 NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1190 1190 ASSERT(mi->mi_async_req_count != 0);
1191 1191 mi->mi_async_req_count--;
1192 1192 }
1193 1193
1194 1194 mutex_enter(&mi->mi_lock);
1195 1195 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1196 1196 mutex_exit(&mi->mi_lock);
1197 1197 break;
1198 1198 }
1199 1199 mutex_exit(&mi->mi_lock);
1200 1200
1201 1201 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1202 1202 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1203 1203 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1204 1204 }
1205 1205
1206 1206 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1207 1207 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1208 1208 /*
1209 1209 * Let everyone know we're done.
1210 1210 */
1211 1211 mi->mi_manager_thread = NULL;
1212 1212 /*
1213 1213 * Wake up the inactive thread.
1214 1214 */
1215 1215 cv_broadcast(&mi->mi_inact_req_cv);
1216 1216 /*
1217 1217 * Wake up anyone sitting in nfs4_async_manager_stop()
1218 1218 */
1219 1219 cv_broadcast(&mi->mi_async_cv);
1220 1220 /*
1221 1221 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1222 1222 * since CALLB_CPR_EXIT is actually responsible for releasing
1223 1223 * 'mi_async_lock'.
1224 1224 */
1225 1225 CALLB_CPR_EXIT(&cprinfo);
1226 1226 VFS_RELE(vfsp); /* release thread's hold */
1227 1227 MI4_RELE(mi);
1228 1228 zthread_exit();
1229 1229 }
1230 1230
1231 1231 /*
1232 1232 * Signal (and wait for) the async manager thread to clean up and go away.
1233 1233 */
1234 1234 void
1235 1235 nfs4_async_manager_stop(vfs_t *vfsp)
1236 1236 {
1237 1237 mntinfo4_t *mi = VFTOMI4(vfsp);
1238 1238
1239 1239 mutex_enter(&mi->mi_async_lock);
1240 1240 mutex_enter(&mi->mi_lock);
1241 1241 mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1242 1242 mutex_exit(&mi->mi_lock);
1243 1243 cv_broadcast(&mi->mi_async_reqs_cv);
1244 1244 /*
1245 1245 * Wait for the async manager thread to die.
1246 1246 */
1247 1247 while (mi->mi_manager_thread != NULL)
1248 1248 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1249 1249 mutex_exit(&mi->mi_async_lock);
1250 1250 }
1251 1251
1252 1252 int
1253 1253 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1254 1254 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1255 1255 u_offset_t, caddr_t, struct seg *, cred_t *))
1256 1256 {
1257 1257 rnode4_t *rp;
1258 1258 mntinfo4_t *mi;
1259 1259 struct nfs4_async_reqs *args;
1260 1260
1261 1261 rp = VTOR4(vp);
1262 1262 ASSERT(rp->r_freef == NULL);
1263 1263
1264 1264 mi = VTOMI4(vp);
1265 1265
1266 1266 /*
1267 1267 * If addr falls in a different segment, don't bother doing readahead.
1268 1268 */
1269 1269 if (addr >= seg->s_base + seg->s_size)
1270 1270 return (-1);
1271 1271
1272 1272 /*
1273 1273 * If we can't allocate a request structure, punt on the readahead.
1274 1274 */
1275 1275 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1276 1276 return (-1);
1277 1277
1278 1278 /*
1279 1279 * If a lock operation is pending, don't initiate any new
1280 1280 * readaheads. Otherwise, bump r_count to indicate the new
1281 1281 * asynchronous I/O.
1282 1282 */
1283 1283 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1284 1284 kmem_free(args, sizeof (*args));
1285 1285 return (-1);
1286 1286 }
1287 1287 mutex_enter(&rp->r_statelock);
1288 1288 rp->r_count++;
1289 1289 mutex_exit(&rp->r_statelock);
1290 1290 nfs_rw_exit(&rp->r_lkserlock);
1291 1291
1292 1292 args->a_next = NULL;
1293 1293 #ifdef DEBUG
1294 1294 args->a_queuer = curthread;
1295 1295 #endif
1296 1296 VN_HOLD(vp);
1297 1297 args->a_vp = vp;
1298 1298 ASSERT(cr != NULL);
1299 1299 crhold(cr);
1300 1300 args->a_cred = cr;
1301 1301 args->a_io = NFS4_READ_AHEAD;
1302 1302 args->a_nfs4_readahead = readahead;
1303 1303 args->a_nfs4_blkoff = blkoff;
1304 1304 args->a_nfs4_seg = seg;
1305 1305 args->a_nfs4_addr = addr;
1306 1306
1307 1307 mutex_enter(&mi->mi_async_lock);
1308 1308
1309 1309 /*
1310 1310 * If asyncio has been disabled, don't bother readahead.
1311 1311 */
1312 1312 if (mi->mi_max_threads == 0) {
1313 1313 mutex_exit(&mi->mi_async_lock);
1314 1314 goto noasync;
1315 1315 }
1316 1316
1317 1317 /*
1318 1318 * Link request structure into the async list and
1319 1319 * wakeup async thread to do the i/o.
1320 1320 */
1321 1321 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1322 1322 mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1323 1323 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1324 1324 } else {
1325 1325 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1326 1326 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1327 1327 }
1328 1328
1329 1329 if (mi->mi_io_kstats) {
1330 1330 mutex_enter(&mi->mi_lock);
1331 1331 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1332 1332 mutex_exit(&mi->mi_lock);
1333 1333 }
1334 1334
1335 1335 mi->mi_async_req_count++;
1336 1336 ASSERT(mi->mi_async_req_count != 0);
1337 1337 cv_signal(&mi->mi_async_reqs_cv);
1338 1338 mutex_exit(&mi->mi_async_lock);
1339 1339 return (0);
1340 1340
1341 1341 noasync:
1342 1342 mutex_enter(&rp->r_statelock);
1343 1343 rp->r_count--;
1344 1344 cv_broadcast(&rp->r_cv);
1345 1345 mutex_exit(&rp->r_statelock);
1346 1346 VN_RELE(vp);
1347 1347 crfree(cr);
1348 1348 kmem_free(args, sizeof (*args));
1349 1349 return (-1);
1350 1350 }
1351 1351
1352 1352 static void
1353 1353 nfs4_async_start(struct vfs *vfsp)
1354 1354 {
1355 1355 nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1356 1356 }
1357 1357
1358 1358 static void
1359 1359 nfs4_async_pgops_start(struct vfs *vfsp)
1360 1360 {
1361 1361 nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1362 1362 }
1363 1363
/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type.  Requests are taken
 * from the queues in a round-robin fashion.  A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.  This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs4_async_common_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system.  Specifically over the
 * wire calls are cpr-unsafe.  The thread should be reevaluated in
 * case of future updates to the cpr model.
 *
 * This is the body of every async worker thread.  vfsp identifies the
 * mount being served; async_queue (NFS4_ASYNC_QUEUE or
 * NFS4_ASYNC_PGOPS_QUEUE) selects which per-queue cursor, thread count,
 * condition variable and number of request types the worker uses.
 *
 * The function does not return: the thread exits via zthread_exit() once
 * it finds no work and either async i/o has been disabled
 * (mi_max_threads == 0) or its timed wait expired.  On exit it drops the
 * vfs and mntinfo4 holds.
 */
static void
nfs4_async_common_start(struct vfs *vfsp, int async_queue)
{
	struct nfs4_async_reqs *args;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	/* Starts positive so the very first empty pass waits, not exits. */
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	extern int nfs_async_timeout;
	int async_types;
	kcondvar_t *async_work_cv;

	/* Pick the number of request types and the cv for our queue. */
	if (async_queue == NFS4_ASYNC_QUEUE) {
		async_types = NFS4_ASYNC_TYPES;
		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
	} else {
		async_types = NFS4_ASYNC_PGOPS_TYPES;
		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
	}

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	/* mi_async_lock is held at the top of every loop iteration. */
	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < async_types; i++) {
			args = *mi->mi_async_curr[async_queue];
			if (args != NULL)
				break;
			mi->mi_async_curr[async_queue]++;
			/* Wrap the cursor back to the first request list. */
			if (mi->mi_async_curr[async_queue] ==
			    &mi->mi_async_reqs[async_types]) {
				mi->mi_async_curr[async_queue] =
				    &mi->mi_async_reqs[0];
			}
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				--mi->mi_threads[async_queue];

				if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
				    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
					cv_signal(&mi->mi_async_cv);
				/*
				 * No explicit mutex_exit() of mi_async_lock
				 * here; the lock was handed to CALLB_CPR_INIT
				 * above and CALLB_CPR_EXIT is the last thing
				 * done with it.
				 */
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				MI4_RELE(mi);
				zthread_exit();
				/* NOTREACHED */
			}
			/*
			 * Wait for work on this queue's cv; the result is
			 * checked on the next pass if the queues are still
			 * empty.
			 */
			time_left = cv_reltimedwait(async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout,
			    TR_CLOCK_TICK);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		} else {
			/* Found work: reset so a later idle pass waits again. */
			time_left = 1;
		}

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr[async_queue] = args->a_next;
		if (*mi->mi_async_curr[async_queue] == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr[async_queue]++;
			if (mi->mi_async_curr[async_queue] ==
			    &mi->mi_async_reqs[async_types]) {
				mi->mi_async_curr[async_queue] =
				    &mi->mi_async_reqs[0];
			}
		}

		/*
		 * NFS4_INACTIVE requests are excluded from the kstat wait
		 * queue accounting here.
		 */
		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		/* The request itself is processed without mi_async_lock. */
		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 *
		 * A read-ahead is only performed while async i/o is still
		 * enabled; otherwise the request is simply freed below.
		 */
		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs4_readahead)(args->a_vp,
			    args->a_nfs4_blkoff, args->a_nfs4_addr,
			    args->a_nfs4_seg, args->a_cred);
		} else if (args->a_io == NFS4_PUTAPAGE) {
			(void) (*args->a_nfs4_putapage)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_PAGEIO) {
			(void) (*args->a_nfs4_pageio)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_READDIR) {
			(void) ((*args->a_nfs4_readdir)(args->a_vp,
			    args->a_nfs4_rdc, args->a_cred));
		} else if (args->a_io == NFS4_COMMIT) {
			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
			    args->a_nfs4_offset, args->a_nfs4_count,
			    args->a_cred);
		} else if (args->a_io == NFS4_INACTIVE) {
			nfs4_inactive_otw(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args4(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}
1535 1535
1536 1536 /*
1537 1537 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1538 1538 * part of VOP_INACTIVE.
1539 1539 */
1540 1540
1541 1541 void
1542 1542 nfs4_inactive_thread(mntinfo4_t *mi)
1543 1543 {
1544 1544 struct nfs4_async_reqs *args;
1545 1545 callb_cpr_t cprinfo;
1546 1546 vfs_t *vfsp = mi->mi_vfsp;
1547 1547
1548 1548 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1549 1549 "nfs4_inactive_thread");
1550 1550
1551 1551 for (;;) {
1552 1552 mutex_enter(&mi->mi_async_lock);
1553 1553 args = mi->mi_async_reqs[NFS4_INACTIVE];
1554 1554 if (args == NULL) {
1555 1555 mutex_enter(&mi->mi_lock);
1556 1556 /*
1557 1557 * We don't want to exit until the async manager is done
1558 1558 * with its work; hence the check for mi_manager_thread
1559 1559 * being NULL.
1560 1560 *
1561 1561 * The async manager thread will cv_broadcast() on
1562 1562 * mi_inact_req_cv when it's done, at which point we'll
1563 1563 * wake up and exit.
1564 1564 */
1565 1565 if (mi->mi_manager_thread == NULL)
1566 1566 goto die;
1567 1567 mi->mi_flags |= MI4_INACTIVE_IDLE;
1568 1568 mutex_exit(&mi->mi_lock);
1569 1569 cv_signal(&mi->mi_async_cv);
1570 1570 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1571 1571 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1572 1572 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1573 1573 mutex_exit(&mi->mi_async_lock);
1574 1574 } else {
1575 1575 mutex_enter(&mi->mi_lock);
1576 1576 mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1577 1577 mutex_exit(&mi->mi_lock);
1578 1578 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1579 1579 mutex_exit(&mi->mi_async_lock);
1580 1580 nfs4_inactive_otw(args->a_vp, args->a_cred);
1581 1581 crfree(args->a_cred);
1582 1582 kmem_free(args, sizeof (*args));
1583 1583 }
1584 1584 }
1585 1585 die:
1586 1586 mutex_exit(&mi->mi_lock);
1587 1587 mi->mi_inactive_thread = NULL;
1588 1588 cv_signal(&mi->mi_async_cv);
1589 1589
1590 1590 /*
1591 1591 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1592 1592 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1593 1593 */
1594 1594 CALLB_CPR_EXIT(&cprinfo);
1595 1595
1596 1596 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1597 1597 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1598 1598
1599 1599 MI4_RELE(mi);
1600 1600 zthread_exit();
1601 1601 /* NOTREACHED */
1602 1602 }
1603 1603
1604 1604 /*
1605 1605 * nfs_async_stop:
1606 1606 * Wait for all outstanding putpage operations and the inactive thread to
1607 1607 * complete; nfs4_async_stop_sig() without interruptibility.
1608 1608 */
1609 1609 void
1610 1610 nfs4_async_stop(struct vfs *vfsp)
1611 1611 {
1612 1612 mntinfo4_t *mi = VFTOMI4(vfsp);
1613 1613
1614 1614 /*
1615 1615 * Wait for all outstanding async operations to complete and for
1616 1616 * worker threads to exit.
1617 1617 */
1618 1618 mutex_enter(&mi->mi_async_lock);
1619 1619 mi->mi_max_threads = 0;
1620 1620 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1621 1621 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1622 1622 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1623 1623 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1624 1624
1625 1625 /*
1626 1626 * Wait for the inactive thread to finish doing what it's doing. It
1627 1627 * won't exit until the last reference to the vfs_t goes away.
1628 1628 */
1629 1629 if (mi->mi_inactive_thread != NULL) {
1630 1630 mutex_enter(&mi->mi_lock);
1631 1631 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1632 1632 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1633 1633 mutex_exit(&mi->mi_lock);
1634 1634 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1635 1635 mutex_enter(&mi->mi_lock);
1636 1636 }
1637 1637 mutex_exit(&mi->mi_lock);
1638 1638 }
1639 1639 mutex_exit(&mi->mi_async_lock);
1640 1640 }
1641 1641
1642 1642 /*
1643 1643 * nfs_async_stop_sig:
1644 1644 * Wait for all outstanding putpage operations and the inactive thread to
1645 1645 * complete. If a signal is delivered we will abort and return non-zero;
1646 1646 * otherwise return 0. Since this routine is called from nfs4_unmount, we
1647 1647 * need to make it interruptible.
1648 1648 */
1649 1649 int
1650 1650 nfs4_async_stop_sig(struct vfs *vfsp)
1651 1651 {
1652 1652 mntinfo4_t *mi = VFTOMI4(vfsp);
1653 1653 ushort_t omax;
1654 1654 bool_t intr = FALSE;
1655 1655
1656 1656 /*
1657 1657 * Wait for all outstanding putpage operations to complete and for
1658 1658 * worker threads to exit.
1659 1659 */
1660 1660 mutex_enter(&mi->mi_async_lock);
1661 1661 omax = mi->mi_max_threads;
1662 1662 mi->mi_max_threads = 0;
1663 1663 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1664 1664 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1665 1665 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1666 1666 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1667 1667 intr = TRUE;
1668 1668 goto interrupted;
1669 1669 }
1670 1670 }
1671 1671
1672 1672 /*
1673 1673 * Wait for the inactive thread to finish doing what it's doing. It
1674 1674 * won't exit until the a last reference to the vfs_t goes away.
1675 1675 */
1676 1676 if (mi->mi_inactive_thread != NULL) {
1677 1677 mutex_enter(&mi->mi_lock);
1678 1678 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1679 1679 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1680 1680 mutex_exit(&mi->mi_lock);
1681 1681 if (!cv_wait_sig(&mi->mi_async_cv,
1682 1682 &mi->mi_async_lock)) {
1683 1683 intr = TRUE;
1684 1684 goto interrupted;
1685 1685 }
1686 1686 mutex_enter(&mi->mi_lock);
1687 1687 }
1688 1688 mutex_exit(&mi->mi_lock);
1689 1689 }
1690 1690 interrupted:
1691 1691 if (intr)
1692 1692 mi->mi_max_threads = omax;
1693 1693 mutex_exit(&mi->mi_async_lock);
1694 1694
1695 1695 return (intr);
1696 1696 }
1697 1697
1698 1698 int
1699 1699 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1700 1700 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1701 1701 u_offset_t, size_t, int, cred_t *))
1702 1702 {
1703 1703 rnode4_t *rp;
1704 1704 mntinfo4_t *mi;
1705 1705 struct nfs4_async_reqs *args;
1706 1706
1707 1707 ASSERT(flags & B_ASYNC);
1708 1708 ASSERT(vp->v_vfsp != NULL);
1709 1709
1710 1710 rp = VTOR4(vp);
1711 1711 ASSERT(rp->r_count > 0);
1712 1712
1713 1713 mi = VTOMI4(vp);
1714 1714
1715 1715 /*
1716 1716 * If we can't allocate a request structure, do the putpage
1717 1717 * operation synchronously in this thread's context.
1718 1718 */
1719 1719 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1720 1720 goto noasync;
1721 1721
1722 1722 args->a_next = NULL;
1723 1723 #ifdef DEBUG
1724 1724 args->a_queuer = curthread;
1725 1725 #endif
1726 1726 VN_HOLD(vp);
1727 1727 args->a_vp = vp;
1728 1728 ASSERT(cr != NULL);
1729 1729 crhold(cr);
1730 1730 args->a_cred = cr;
1731 1731 args->a_io = NFS4_PUTAPAGE;
1732 1732 args->a_nfs4_putapage = putapage;
1733 1733 args->a_nfs4_pp = pp;
1734 1734 args->a_nfs4_off = off;
1735 1735 args->a_nfs4_len = (uint_t)len;
1736 1736 args->a_nfs4_flags = flags;
1737 1737
1738 1738 mutex_enter(&mi->mi_async_lock);
1739 1739
1740 1740 /*
1741 1741 * If asyncio has been disabled, then make a synchronous request.
1742 1742 * This check is done a second time in case async io was diabled
1743 1743 * while this thread was blocked waiting for memory pressure to
1744 1744 * reduce or for the queue to drain.
1745 1745 */
1746 1746 if (mi->mi_max_threads == 0) {
1747 1747 mutex_exit(&mi->mi_async_lock);
1748 1748
1749 1749 VN_RELE(vp);
1750 1750 crfree(cr);
1751 1751 kmem_free(args, sizeof (*args));
1752 1752 goto noasync;
1753 1753 }
1754 1754
1755 1755 /*
1756 1756 * Link request structure into the async list and
1757 1757 * wakeup async thread to do the i/o.
1758 1758 */
1759 1759 if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1760 1760 mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1761 1761 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1762 1762 } else {
1763 1763 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1764 1764 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1765 1765 }
1766 1766
1767 1767 mutex_enter(&rp->r_statelock);
1768 1768 rp->r_count++;
1769 1769 rp->r_awcount++;
1770 1770 mutex_exit(&rp->r_statelock);
1771 1771
1772 1772 if (mi->mi_io_kstats) {
1773 1773 mutex_enter(&mi->mi_lock);
1774 1774 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1775 1775 mutex_exit(&mi->mi_lock);
1776 1776 }
1777 1777
1778 1778 mi->mi_async_req_count++;
1779 1779 ASSERT(mi->mi_async_req_count != 0);
1780 1780 cv_signal(&mi->mi_async_reqs_cv);
1781 1781 mutex_exit(&mi->mi_async_lock);
1782 1782 return (0);
1783 1783
1784 1784 noasync:
1785 1785
1786 1786 if (curproc == proc_pageout || curproc == proc_fsflush) {
1787 1787 /*
1788 1788 * If we get here in the context of the pageout/fsflush,
1789 1789 * or we have run out of memory or we're attempting to
1790 1790 * unmount we refuse to do a sync write, because this may
1791 1791 * hang pageout/fsflush and the machine. In this case,
1792 1792 * we just re-mark the page as dirty and punt on the page.
1793 1793 *
1794 1794 * Make sure B_FORCE isn't set. We can re-mark the
1795 1795 * pages as dirty and unlock the pages in one swoop by
1796 1796 * passing in B_ERROR to pvn_write_done(). However,
1797 1797 * we should make sure B_FORCE isn't set - we don't
1798 1798 * want the page tossed before it gets written out.
1799 1799 */
1800 1800 if (flags & B_FORCE)
1801 1801 flags &= ~(B_INVAL | B_FORCE);
1802 1802 pvn_write_done(pp, flags | B_ERROR);
1803 1803 return (0);
1804 1804 }
1805 1805
1806 1806 if (nfs_zone() != mi->mi_zone) {
1807 1807 /*
1808 1808 * So this was a cross-zone sync putpage.
1809 1809 *
1810 1810 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1811 1811 * as dirty and unlock them.
1812 1812 *
1813 1813 * We don't want to clear B_FORCE here as the caller presumably
1814 1814 * knows what they're doing if they set it.
1815 1815 */
1816 1816 pvn_write_done(pp, flags | B_ERROR);
1817 1817 return (EPERM);
1818 1818 }
1819 1819 return ((*putapage)(vp, pp, off, len, flags, cr));
1820 1820 }
1821 1821
1822 1822 int
1823 1823 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1824 1824 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1825 1825 size_t, int, cred_t *))
1826 1826 {
1827 1827 rnode4_t *rp;
1828 1828 mntinfo4_t *mi;
1829 1829 struct nfs4_async_reqs *args;
1830 1830
1831 1831 ASSERT(flags & B_ASYNC);
1832 1832 ASSERT(vp->v_vfsp != NULL);
1833 1833
1834 1834 rp = VTOR4(vp);
1835 1835 ASSERT(rp->r_count > 0);
1836 1836
1837 1837 mi = VTOMI4(vp);
1838 1838
1839 1839 /*
1840 1840 * If we can't allocate a request structure, do the pageio
1841 1841 * request synchronously in this thread's context.
1842 1842 */
1843 1843 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1844 1844 goto noasync;
1845 1845
1846 1846 args->a_next = NULL;
1847 1847 #ifdef DEBUG
1848 1848 args->a_queuer = curthread;
1849 1849 #endif
1850 1850 VN_HOLD(vp);
1851 1851 args->a_vp = vp;
1852 1852 ASSERT(cr != NULL);
1853 1853 crhold(cr);
1854 1854 args->a_cred = cr;
1855 1855 args->a_io = NFS4_PAGEIO;
1856 1856 args->a_nfs4_pageio = pageio;
1857 1857 args->a_nfs4_pp = pp;
1858 1858 args->a_nfs4_off = io_off;
1859 1859 args->a_nfs4_len = (uint_t)io_len;
1860 1860 args->a_nfs4_flags = flags;
1861 1861
1862 1862 mutex_enter(&mi->mi_async_lock);
1863 1863
1864 1864 /*
1865 1865 * If asyncio has been disabled, then make a synchronous request.
1866 1866 * This check is done a second time in case async io was diabled
1867 1867 * while this thread was blocked waiting for memory pressure to
1868 1868 * reduce or for the queue to drain.
1869 1869 */
1870 1870 if (mi->mi_max_threads == 0) {
1871 1871 mutex_exit(&mi->mi_async_lock);
1872 1872
1873 1873 VN_RELE(vp);
1874 1874 crfree(cr);
1875 1875 kmem_free(args, sizeof (*args));
1876 1876 goto noasync;
1877 1877 }
1878 1878
1879 1879 /*
1880 1880 * Link request structure into the async list and
1881 1881 * wakeup async thread to do the i/o.
1882 1882 */
1883 1883 if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1884 1884 mi->mi_async_reqs[NFS4_PAGEIO] = args;
1885 1885 mi->mi_async_tail[NFS4_PAGEIO] = args;
1886 1886 } else {
1887 1887 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1888 1888 mi->mi_async_tail[NFS4_PAGEIO] = args;
1889 1889 }
1890 1890
1891 1891 mutex_enter(&rp->r_statelock);
1892 1892 rp->r_count++;
1893 1893 rp->r_awcount++;
1894 1894 mutex_exit(&rp->r_statelock);
1895 1895
1896 1896 if (mi->mi_io_kstats) {
1897 1897 mutex_enter(&mi->mi_lock);
1898 1898 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1899 1899 mutex_exit(&mi->mi_lock);
1900 1900 }
1901 1901
1902 1902 mi->mi_async_req_count++;
1903 1903 ASSERT(mi->mi_async_req_count != 0);
1904 1904 cv_signal(&mi->mi_async_reqs_cv);
1905 1905 mutex_exit(&mi->mi_async_lock);
1906 1906 return (0);
1907 1907
1908 1908 noasync:
1909 1909 /*
1910 1910 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1911 1911 * the page list), for writes we do it synchronously, except for
1912 1912 * proc_pageout/proc_fsflush as described below.
1913 1913 */
1914 1914 if (flags & B_READ) {
1915 1915 pvn_read_done(pp, flags | B_ERROR);
1916 1916 return (0);
1917 1917 }
1918 1918
1919 1919 if (curproc == proc_pageout || curproc == proc_fsflush) {
1920 1920 /*
1921 1921 * If we get here in the context of the pageout/fsflush,
1922 1922 * we refuse to do a sync write, because this may hang
1923 1923 * pageout/fsflush (and the machine). In this case, we just
1924 1924 * re-mark the page as dirty and punt on the page.
1925 1925 *
1926 1926 * Make sure B_FORCE isn't set. We can re-mark the
1927 1927 * pages as dirty and unlock the pages in one swoop by
1928 1928 * passing in B_ERROR to pvn_write_done(). However,
1929 1929 * we should make sure B_FORCE isn't set - we don't
1930 1930 * want the page tossed before it gets written out.
1931 1931 */
1932 1932 if (flags & B_FORCE)
1933 1933 flags &= ~(B_INVAL | B_FORCE);
1934 1934 pvn_write_done(pp, flags | B_ERROR);
1935 1935 return (0);
1936 1936 }
1937 1937
1938 1938 if (nfs_zone() != mi->mi_zone) {
1939 1939 /*
1940 1940 * So this was a cross-zone sync pageio. We pass in B_ERROR
1941 1941 * to pvn_write_done() to re-mark the pages as dirty and unlock
1942 1942 * them.
1943 1943 *
1944 1944 * We don't want to clear B_FORCE here as the caller presumably
1945 1945 * knows what they're doing if they set it.
1946 1946 */
1947 1947 pvn_write_done(pp, flags | B_ERROR);
1948 1948 return (EPERM);
1949 1949 }
1950 1950 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1951 1951 }
1952 1952
1953 1953 void
1954 1954 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1955 1955 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1956 1956 {
1957 1957 rnode4_t *rp;
1958 1958 mntinfo4_t *mi;
1959 1959 struct nfs4_async_reqs *args;
1960 1960
1961 1961 rp = VTOR4(vp);
1962 1962 ASSERT(rp->r_freef == NULL);
1963 1963
1964 1964 mi = VTOMI4(vp);
1965 1965
1966 1966 /*
1967 1967 * If we can't allocate a request structure, skip the readdir.
1968 1968 */
1969 1969 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1970 1970 goto noasync;
1971 1971
1972 1972 args->a_next = NULL;
1973 1973 #ifdef DEBUG
1974 1974 args->a_queuer = curthread;
1975 1975 #endif
1976 1976 VN_HOLD(vp);
1977 1977 args->a_vp = vp;
1978 1978 ASSERT(cr != NULL);
1979 1979 crhold(cr);
1980 1980 args->a_cred = cr;
1981 1981 args->a_io = NFS4_READDIR;
1982 1982 args->a_nfs4_readdir = readdir;
1983 1983 args->a_nfs4_rdc = rdc;
1984 1984
1985 1985 mutex_enter(&mi->mi_async_lock);
1986 1986
1987 1987 /*
1988 1988 * If asyncio has been disabled, then skip this request
1989 1989 */
1990 1990 if (mi->mi_max_threads == 0) {
1991 1991 mutex_exit(&mi->mi_async_lock);
1992 1992
1993 1993 VN_RELE(vp);
1994 1994 crfree(cr);
1995 1995 kmem_free(args, sizeof (*args));
1996 1996 goto noasync;
1997 1997 }
1998 1998
1999 1999 /*
2000 2000 * Link request structure into the async list and
2001 2001 * wakeup async thread to do the i/o.
2002 2002 */
2003 2003 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2004 2004 mi->mi_async_reqs[NFS4_READDIR] = args;
2005 2005 mi->mi_async_tail[NFS4_READDIR] = args;
2006 2006 } else {
2007 2007 mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2008 2008 mi->mi_async_tail[NFS4_READDIR] = args;
2009 2009 }
2010 2010
2011 2011 mutex_enter(&rp->r_statelock);
2012 2012 rp->r_count++;
2013 2013 mutex_exit(&rp->r_statelock);
2014 2014
2015 2015 if (mi->mi_io_kstats) {
2016 2016 mutex_enter(&mi->mi_lock);
2017 2017 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2018 2018 mutex_exit(&mi->mi_lock);
2019 2019 }
2020 2020
2021 2021 mi->mi_async_req_count++;
2022 2022 ASSERT(mi->mi_async_req_count != 0);
2023 2023 cv_signal(&mi->mi_async_reqs_cv);
2024 2024 mutex_exit(&mi->mi_async_lock);
2025 2025 return;
2026 2026
2027 2027 noasync:
2028 2028 mutex_enter(&rp->r_statelock);
2029 2029 rdc->entries = NULL;
2030 2030 /*
2031 2031 * Indicate that no one is trying to fill this entry and
2032 2032 * it still needs to be filled.
2033 2033 */
2034 2034 rdc->flags &= ~RDDIR;
2035 2035 rdc->flags |= RDDIRREQ;
2036 2036 rddir4_cache_rele(rp, rdc);
2037 2037 mutex_exit(&rp->r_statelock);
2038 2038 }
2039 2039
2040 2040 void
2041 2041 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2042 2042 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2043 2043 cred_t *))
2044 2044 {
2045 2045 rnode4_t *rp;
2046 2046 mntinfo4_t *mi;
2047 2047 struct nfs4_async_reqs *args;
2048 2048 page_t *pp;
2049 2049
2050 2050 rp = VTOR4(vp);
2051 2051 mi = VTOMI4(vp);
2052 2052
2053 2053 /*
2054 2054 * If we can't allocate a request structure, do the commit
2055 2055 * operation synchronously in this thread's context.
2056 2056 */
2057 2057 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2058 2058 goto noasync;
2059 2059
2060 2060 args->a_next = NULL;
2061 2061 #ifdef DEBUG
2062 2062 args->a_queuer = curthread;
2063 2063 #endif
2064 2064 VN_HOLD(vp);
2065 2065 args->a_vp = vp;
2066 2066 ASSERT(cr != NULL);
2067 2067 crhold(cr);
2068 2068 args->a_cred = cr;
2069 2069 args->a_io = NFS4_COMMIT;
2070 2070 args->a_nfs4_commit = commit;
2071 2071 args->a_nfs4_plist = plist;
2072 2072 args->a_nfs4_offset = offset;
2073 2073 args->a_nfs4_count = count;
2074 2074
2075 2075 mutex_enter(&mi->mi_async_lock);
2076 2076
2077 2077 /*
2078 2078 * If asyncio has been disabled, then make a synchronous request.
2079 2079 * This check is done a second time in case async io was diabled
2080 2080 * while this thread was blocked waiting for memory pressure to
2081 2081 * reduce or for the queue to drain.
2082 2082 */
2083 2083 if (mi->mi_max_threads == 0) {
2084 2084 mutex_exit(&mi->mi_async_lock);
2085 2085
2086 2086 VN_RELE(vp);
2087 2087 crfree(cr);
2088 2088 kmem_free(args, sizeof (*args));
2089 2089 goto noasync;
2090 2090 }
2091 2091
2092 2092 /*
2093 2093 * Link request structure into the async list and
2094 2094 * wakeup async thread to do the i/o.
2095 2095 */
2096 2096 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2097 2097 mi->mi_async_reqs[NFS4_COMMIT] = args;
2098 2098 mi->mi_async_tail[NFS4_COMMIT] = args;
2099 2099 } else {
2100 2100 mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2101 2101 mi->mi_async_tail[NFS4_COMMIT] = args;
2102 2102 }
2103 2103
2104 2104 mutex_enter(&rp->r_statelock);
2105 2105 rp->r_count++;
2106 2106 mutex_exit(&rp->r_statelock);
2107 2107
2108 2108 if (mi->mi_io_kstats) {
2109 2109 mutex_enter(&mi->mi_lock);
2110 2110 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2111 2111 mutex_exit(&mi->mi_lock);
2112 2112 }
2113 2113
2114 2114 mi->mi_async_req_count++;
2115 2115 ASSERT(mi->mi_async_req_count != 0);
2116 2116 cv_signal(&mi->mi_async_reqs_cv);
2117 2117 mutex_exit(&mi->mi_async_lock);
2118 2118 return;
2119 2119
2120 2120 noasync:
2121 2121 if (curproc == proc_pageout || curproc == proc_fsflush ||
2122 2122 nfs_zone() != mi->mi_zone) {
2123 2123 while (plist != NULL) {
2124 2124 pp = plist;
2125 2125 page_sub(&plist, pp);
2126 2126 pp->p_fsdata = C_COMMIT;
2127 2127 page_unlock(pp);
2128 2128 }
2129 2129 return;
2130 2130 }
2131 2131 (*commit)(vp, plist, offset, count, cr);
2132 2132 }
2133 2133
2134 2134 /*
2135 2135 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The
2136 2136 * reference to the vnode is handed over to the thread; the caller should
2137 2137 * no longer refer to the vnode.
2138 2138 *
2139 2139 * Unlike most of the async routines, this handoff is needed for
2140 2140 * correctness reasons, not just performance. So doing operations in the
2141 2141 * context of the current thread is not an option.
2142 2142 */
2143 2143 void
2144 2144 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2145 2145 {
2146 2146 mntinfo4_t *mi;
2147 2147 struct nfs4_async_reqs *args;
2148 2148 boolean_t signal_inactive_thread = B_FALSE;
2149 2149
2150 2150 mi = VTOMI4(vp);
2151 2151
2152 2152 args = kmem_alloc(sizeof (*args), KM_SLEEP);
2153 2153 args->a_next = NULL;
2154 2154 #ifdef DEBUG
2155 2155 args->a_queuer = curthread;
2156 2156 #endif
2157 2157 args->a_vp = vp;
2158 2158 ASSERT(cr != NULL);
2159 2159 crhold(cr);
2160 2160 args->a_cred = cr;
2161 2161 args->a_io = NFS4_INACTIVE;
2162 2162
2163 2163 /*
2164 2164 * Note that we don't check mi->mi_max_threads here, since we
2165 2165 * *need* to get rid of this vnode regardless of whether someone
2166 2166 * set nfs4_max_threads to zero in /etc/system.
2167 2167 *
2168 2168 * The manager thread knows about this and is willing to create
2169 2169 * at least one thread to accommodate us.
2170 2170 */
2171 2171 mutex_enter(&mi->mi_async_lock);
2172 2172 if (mi->mi_inactive_thread == NULL) {
2173 2173 rnode4_t *rp;
2174 2174 vnode_t *unldvp = NULL;
2175 2175 char *unlname;
2176 2176 cred_t *unlcred;
2177 2177
2178 2178 mutex_exit(&mi->mi_async_lock);
2179 2179 /*
2180 2180 * We just need to free up the memory associated with the
2181 2181 * vnode, which can be safely done from within the current
2182 2182 * context.
2183 2183 */
2184 2184 crfree(cr); /* drop our reference */
2185 2185 kmem_free(args, sizeof (*args));
2186 2186 rp = VTOR4(vp);
2187 2187 mutex_enter(&rp->r_statelock);
2188 2188 if (rp->r_unldvp != NULL) {
2189 2189 unldvp = rp->r_unldvp;
2190 2190 rp->r_unldvp = NULL;
2191 2191 unlname = rp->r_unlname;
2192 2192 rp->r_unlname = NULL;
2193 2193 unlcred = rp->r_unlcred;
2194 2194 rp->r_unlcred = NULL;
2195 2195 }
2196 2196 mutex_exit(&rp->r_statelock);
2197 2197 /*
2198 2198 * No need to explicitly throw away any cached pages. The
2199 2199 * eventual r4inactive() will attempt a synchronous
2200 2200 * VOP_PUTPAGE() which will immediately fail since the request
2201 2201 * is coming from the wrong zone, and then will proceed to call
2202 2202 * nfs4_invalidate_pages() which will clean things up for us.
2203 2203 *
2204 2204 * Throw away the delegation here so rp4_addfree()'s attempt to
2205 2205 * return any existing delegations becomes a no-op.
2206 2206 */
2207 2207 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2208 2208 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2209 2209 FALSE);
2210 2210 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2211 2211 nfs_rw_exit(&mi->mi_recovlock);
2212 2212 }
2213 2213 nfs4_clear_open_streams(rp);
2214 2214
2215 2215 rp4_addfree(rp, cr);
2216 2216 if (unldvp != NULL) {
2217 2217 kmem_free(unlname, MAXNAMELEN);
2218 2218 VN_RELE(unldvp);
2219 2219 crfree(unlcred);
2220 2220 }
2221 2221 return;
2222 2222 }
2223 2223
2224 2224 if (mi->mi_manager_thread == NULL) {
2225 2225 /*
2226 2226 * We want to talk to the inactive thread.
2227 2227 */
2228 2228 signal_inactive_thread = B_TRUE;
2229 2229 }
2230 2230
2231 2231 /*
2232 2232 * Enqueue the vnode and wake up either the special thread (empty
2233 2233 * list) or an async thread.
2234 2234 */
2235 2235 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2236 2236 mi->mi_async_reqs[NFS4_INACTIVE] = args;
2237 2237 mi->mi_async_tail[NFS4_INACTIVE] = args;
2238 2238 signal_inactive_thread = B_TRUE;
2239 2239 } else {
2240 2240 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2241 2241 mi->mi_async_tail[NFS4_INACTIVE] = args;
2242 2242 }
2243 2243 if (signal_inactive_thread) {
2244 2244 cv_signal(&mi->mi_inact_req_cv);
2245 2245 } else {
2246 2246 mi->mi_async_req_count++;
2247 2247 ASSERT(mi->mi_async_req_count != 0);
2248 2248 cv_signal(&mi->mi_async_reqs_cv);
2249 2249 }
2250 2250
2251 2251 mutex_exit(&mi->mi_async_lock);
2252 2252 }
2253 2253
2254 2254 int
2255 2255 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2256 2256 {
2257 2257 int pagecreate;
2258 2258 int n;
2259 2259 int saved_n;
2260 2260 caddr_t saved_base;
2261 2261 u_offset_t offset;
2262 2262 int error;
2263 2263 int sm_error;
2264 2264 vnode_t *vp = RTOV(rp);
2265 2265
2266 2266 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2267 2267 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2268 2268 if (!vpm_enable) {
2269 2269 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2270 2270 }
2271 2271
2272 2272 /*
2273 2273 * Move bytes in at most PAGESIZE chunks. We must avoid
2274 2274 * spanning pages in uiomove() because page faults may cause
2275 2275 * the cache to be invalidated out from under us. The r_size is not
2276 2276 * updated until after the uiomove. If we push the last page of a
2277 2277 * file before r_size is correct, we will lose the data written past
2278 2278 * the current (and invalid) r_size.
2279 2279 */
2280 2280 do {
2281 2281 offset = uio->uio_loffset;
2282 2282 pagecreate = 0;
2283 2283
2284 2284 /*
2285 2285 * n is the number of bytes required to satisfy the request
2286 2286 * or the number of bytes to fill out the page.
2287 2287 */
2288 2288 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2289 2289
2290 2290 /*
2291 2291 * Check to see if we can skip reading in the page
2292 2292 * and just allocate the memory. We can do this
2293 2293 * if we are going to rewrite the entire mapping
2294 2294 * or if we are going to write to or beyond the current
2295 2295 * end of file from the beginning of the mapping.
2296 2296 *
2297 2297 * The read of r_size is now protected by r_statelock.
2298 2298 */
2299 2299 mutex_enter(&rp->r_statelock);
2300 2300 /*
2301 2301 * When pgcreated is nonzero the caller has already done
2302 2302 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2303 2303 * segkpm this means we already have at least one page
2304 2304 * created and mapped at base.
2305 2305 */
2306 2306 pagecreate = pgcreated ||
2307 2307 ((offset & PAGEOFFSET) == 0 &&
2308 2308 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2309 2309
2310 2310 mutex_exit(&rp->r_statelock);
2311 2311
2312 2312 if (!vpm_enable && pagecreate) {
2313 2313 /*
2314 2314 * The last argument tells segmap_pagecreate() to
2315 2315 * always lock the page, as opposed to sometimes
2316 2316 * returning with the page locked. This way we avoid a
2317 2317 * fault on the ensuing uiomove(), but also
2318 2318 * more importantly (to fix bug 1094402) we can
2319 2319 * call segmap_fault() to unlock the page in all
2320 2320 * cases. An alternative would be to modify
2321 2321 * segmap_pagecreate() to tell us when it is
2322 2322 * locking a page, but that's a fairly major
2323 2323 * interface change.
2324 2324 */
2325 2325 if (pgcreated == 0)
2326 2326 (void) segmap_pagecreate(segkmap, base,
2327 2327 (uint_t)n, 1);
2328 2328 saved_base = base;
2329 2329 saved_n = n;
2330 2330 }
2331 2331
2332 2332 /*
2333 2333 * The number of bytes of data in the last page can not
2334 2334 * be accurately be determined while page is being
2335 2335 * uiomove'd to and the size of the file being updated.
2336 2336 * Thus, inform threads which need to know accurately
2337 2337 * how much data is in the last page of the file. They
2338 2338 * will not do the i/o immediately, but will arrange for
2339 2339 * the i/o to happen later when this modify operation
2340 2340 * will have finished.
2341 2341 */
2342 2342 ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2343 2343 mutex_enter(&rp->r_statelock);
2344 2344 rp->r_flags |= R4MODINPROGRESS;
2345 2345 rp->r_modaddr = (offset & MAXBMASK);
2346 2346 mutex_exit(&rp->r_statelock);
2347 2347
2348 2348 if (vpm_enable) {
2349 2349 /*
2350 2350 * Copy data. If new pages are created, part of
2351 2351 * the page that is not written will be initizliazed
2352 2352 * with zeros.
2353 2353 */
2354 2354 error = vpm_data_copy(vp, offset, n, uio,
2355 2355 !pagecreate, NULL, 0, S_WRITE);
2356 2356 } else {
2357 2357 error = uiomove(base, n, UIO_WRITE, uio);
2358 2358 }
2359 2359
2360 2360 /*
2361 2361 * r_size is the maximum number of
2362 2362 * bytes known to be in the file.
2363 2363 * Make sure it is at least as high as the
2364 2364 * first unwritten byte pointed to by uio_loffset.
2365 2365 */
2366 2366 mutex_enter(&rp->r_statelock);
2367 2367 if (rp->r_size < uio->uio_loffset)
2368 2368 rp->r_size = uio->uio_loffset;
2369 2369 rp->r_flags &= ~R4MODINPROGRESS;
2370 2370 rp->r_flags |= R4DIRTY;
2371 2371 mutex_exit(&rp->r_statelock);
2372 2372
2373 2373 /* n = # of bytes written */
2374 2374 n = (int)(uio->uio_loffset - offset);
2375 2375
2376 2376 if (!vpm_enable) {
2377 2377 base += n;
2378 2378 }
2379 2379
2380 2380 tcount -= n;
2381 2381 /*
2382 2382 * If we created pages w/o initializing them completely,
2383 2383 * we need to zero the part that wasn't set up.
2384 2384 * This happens on a most EOF write cases and if
2385 2385 * we had some sort of error during the uiomove.
2386 2386 */
2387 2387 if (!vpm_enable && pagecreate) {
2388 2388 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2389 2389 (void) kzero(base, PAGESIZE - n);
2390 2390
2391 2391 if (pgcreated) {
2392 2392 /*
2393 2393 * Caller is responsible for this page,
2394 2394 * it was not created in this loop.
2395 2395 */
2396 2396 pgcreated = 0;
2397 2397 } else {
2398 2398 /*
2399 2399 * For bug 1094402: segmap_pagecreate locks
2400 2400 * page. Unlock it. This also unlocks the
2401 2401 * pages allocated by page_create_va() in
2402 2402 * segmap_pagecreate().
2403 2403 */
2404 2404 sm_error = segmap_fault(kas.a_hat, segkmap,
2405 2405 saved_base, saved_n,
2406 2406 F_SOFTUNLOCK, S_WRITE);
2407 2407 if (error == 0)
2408 2408 error = sm_error;
2409 2409 }
2410 2410 }
2411 2411 } while (tcount > 0 && error == 0);
2412 2412
2413 2413 return (error);
2414 2414 }
2415 2415
2416 2416 int
2417 2417 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2418 2418 {
2419 2419 rnode4_t *rp;
2420 2420 page_t *pp;
2421 2421 u_offset_t eoff;
2422 2422 u_offset_t io_off;
2423 2423 size_t io_len;
2424 2424 int error;
2425 2425 int rdirty;
2426 2426 int err;
2427 2427
2428 2428 rp = VTOR4(vp);
2429 2429 ASSERT(rp->r_count > 0);
2430 2430
2431 2431 if (!nfs4_has_pages(vp))
2432 2432 return (0);
2433 2433
2434 2434 ASSERT(vp->v_type != VCHR);
2435 2435
2436 2436 /*
2437 2437 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2438 2438 * writes. B_FORCE is set to force the VM system to actually
2439 2439 * invalidate the pages, even if the i/o failed. The pages
2440 2440 * need to get invalidated because they can't be written out
2441 2441 * because there isn't any space left on either the server's
2442 2442 * file system or in the user's disk quota. The B_FREE bit
2443 2443 * is cleared to avoid confusion as to whether this is a
2444 2444 * request to place the page on the freelist or to destroy
2445 2445 * it.
2446 2446 */
2447 2447 if ((rp->r_flags & R4OUTOFSPACE) ||
2448 2448 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2449 2449 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2450 2450
2451 2451 if (len == 0) {
2452 2452 /*
2453 2453 * If doing a full file synchronous operation, then clear
2454 2454 * the R4DIRTY bit. If a page gets dirtied while the flush
2455 2455 * is happening, then R4DIRTY will get set again. The
2456 2456 * R4DIRTY bit must get cleared before the flush so that
2457 2457 * we don't lose this information.
2458 2458 *
2459 2459 * If there are no full file async write operations
2460 2460 * pending and RDIRTY bit is set, clear it.
2461 2461 */
2462 2462 if (off == (u_offset_t)0 &&
2463 2463 !(flags & B_ASYNC) &&
2464 2464 (rp->r_flags & R4DIRTY)) {
2465 2465 mutex_enter(&rp->r_statelock);
2466 2466 rdirty = (rp->r_flags & R4DIRTY);
2467 2467 rp->r_flags &= ~R4DIRTY;
2468 2468 mutex_exit(&rp->r_statelock);
2469 2469 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2470 2470 mutex_enter(&rp->r_statelock);
2471 2471 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2472 2472 rdirty = (rp->r_flags & R4DIRTY);
2473 2473 rp->r_flags &= ~R4DIRTY;
2474 2474 }
2475 2475 mutex_exit(&rp->r_statelock);
2476 2476 } else
2477 2477 rdirty = 0;
2478 2478
2479 2479 /*
2480 2480 * Search the entire vp list for pages >= off, and flush
2481 2481 * the dirty pages.
2482 2482 */
2483 2483 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2484 2484 flags, cr);
2485 2485
2486 2486 /*
2487 2487 * If an error occurred and the file was marked as dirty
2488 2488 * before and we aren't forcibly invalidating pages, then
2489 2489 * reset the R4DIRTY flag.
2490 2490 */
2491 2491 if (error && rdirty &&
2492 2492 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2493 2493 mutex_enter(&rp->r_statelock);
2494 2494 rp->r_flags |= R4DIRTY;
2495 2495 mutex_exit(&rp->r_statelock);
2496 2496 }
2497 2497 } else {
2498 2498 /*
2499 2499 * Do a range from [off...off + len) looking for pages
2500 2500 * to deal with.
2501 2501 */
2502 2502 error = 0;
2503 2503 io_len = 0;
2504 2504 eoff = off + len;
2505 2505 mutex_enter(&rp->r_statelock);
2506 2506 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2507 2507 io_off += io_len) {
2508 2508 mutex_exit(&rp->r_statelock);
2509 2509 /*
2510 2510 * If we are not invalidating, synchronously
2511 2511 * freeing or writing pages use the routine
2512 2512 * page_lookup_nowait() to prevent reclaiming
2513 2513 * them from the free list.
2514 2514 */
2515 2515 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2516 2516 pp = page_lookup(vp, io_off,
2517 2517 (flags & (B_INVAL | B_FREE)) ?
2518 2518 SE_EXCL : SE_SHARED);
2519 2519 } else {
2520 2520 pp = page_lookup_nowait(vp, io_off,
2521 2521 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2522 2522 }
2523 2523
2524 2524 if (pp == NULL || !pvn_getdirty(pp, flags))
2525 2525 io_len = PAGESIZE;
2526 2526 else {
2527 2527 err = (*rp->r_putapage)(vp, pp, &io_off,
2528 2528 &io_len, flags, cr);
2529 2529 if (!error)
2530 2530 error = err;
2531 2531 /*
2532 2532 * "io_off" and "io_len" are returned as
2533 2533 * the range of pages we actually wrote.
2534 2534 * This allows us to skip ahead more quickly
2535 2535 * since several pages may've been dealt
2536 2536 * with by this iteration of the loop.
2537 2537 */
2538 2538 }
2539 2539 mutex_enter(&rp->r_statelock);
2540 2540 }
2541 2541 mutex_exit(&rp->r_statelock);
2542 2542 }
2543 2543
2544 2544 return (error);
2545 2545 }
2546 2546
2547 2547 void
2548 2548 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2549 2549 {
2550 2550 rnode4_t *rp;
2551 2551
2552 2552 rp = VTOR4(vp);
2553 2553 if (IS_SHADOW(vp, rp))
2554 2554 vp = RTOV4(rp);
2555 2555 mutex_enter(&rp->r_statelock);
2556 2556 while (rp->r_flags & R4TRUNCATE)
2557 2557 cv_wait(&rp->r_cv, &rp->r_statelock);
2558 2558 rp->r_flags |= R4TRUNCATE;
2559 2559 if (off == (u_offset_t)0) {
2560 2560 rp->r_flags &= ~R4DIRTY;
2561 2561 if (!(rp->r_flags & R4STALE))
2562 2562 rp->r_error = 0;
2563 2563 }
2564 2564 rp->r_truncaddr = off;
2565 2565 mutex_exit(&rp->r_statelock);
2566 2566 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2567 2567 B_INVAL | B_TRUNC, cr);
2568 2568 mutex_enter(&rp->r_statelock);
2569 2569 rp->r_flags &= ~R4TRUNCATE;
2570 2570 cv_broadcast(&rp->r_cv);
2571 2571 mutex_exit(&rp->r_statelock);
2572 2572 }
2573 2573
2574 2574 static int
2575 2575 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2576 2576 {
2577 2577 mntinfo4_t *mi;
2578 2578 struct mntinfo_kstat *mik;
2579 2579 vfs_t *vfsp;
2580 2580
2581 2581 /* this is a read-only kstat. Bail out on a write */
2582 2582 if (rw == KSTAT_WRITE)
2583 2583 return (EACCES);
2584 2584
2585 2585
2586 2586 /*
2587 2587 * We don't want to wait here as kstat_chain_lock could be held by
2588 2588 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2589 2589 * and thus could lead to a deadlock.
2590 2590 */
2591 2591 vfsp = (struct vfs *)ksp->ks_private;
2592 2592
2593 2593 mi = VFTOMI4(vfsp);
2594 2594 mik = (struct mntinfo_kstat *)ksp->ks_data;
2595 2595
2596 2596 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2597 2597
2598 2598 mik->mik_vers = (uint32_t)mi->mi_vers;
2599 2599 mik->mik_flags = mi->mi_flags;
2600 2600 /*
2601 2601 * The sv_secdata holds the flavor the client specifies.
2602 2602 * If the client uses default and a security negotiation
2603 2603 * occurs, sv_currsec will point to the current flavor
2604 2604 * selected from the server flavor list.
2605 2605 * sv_currsec is NULL if no security negotiation takes place.
2606 2606 */
2607 2607 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2608 2608 mi->mi_curr_serv->sv_currsec->secmod :
2609 2609 mi->mi_curr_serv->sv_secdata->secmod;
2610 2610 mik->mik_curread = (uint32_t)mi->mi_curread;
2611 2611 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2612 2612 mik->mik_retrans = mi->mi_retrans;
2613 2613 mik->mik_timeo = mi->mi_timeo;
2614 2614 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2615 2615 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2616 2616 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2617 2617 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2618 2618 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2619 2619 mik->mik_failover = (uint32_t)mi->mi_failover;
2620 2620 mik->mik_remap = (uint32_t)mi->mi_remap;
2621 2621
2622 2622 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2623 2623
2624 2624 return (0);
2625 2625 }
2626 2626
2627 2627 void
2628 2628 nfs4_mnt_kstat_init(struct vfs *vfsp)
2629 2629 {
2630 2630 mntinfo4_t *mi = VFTOMI4(vfsp);
2631 2631
2632 2632 /*
2633 2633 * PSARC 2001/697 Contract Private Interface
2634 2634 * All nfs kstats are under SunMC contract
2635 2635 * Please refer to the PSARC listed above and contact
2636 2636 * SunMC before making any changes!
2637 2637 *
2638 2638 * Changes must be reviewed by Solaris File Sharing
2639 2639 * Changes must be communicated to contract-2001-697@sun.com
2640 2640 *
2641 2641 */
2642 2642
2643 2643 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2644 2644 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2645 2645 if (mi->mi_io_kstats) {
2646 2646 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2647 2647 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2648 2648 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2649 2649 kstat_install(mi->mi_io_kstats);
2650 2650 }
2651 2651
2652 2652 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2653 2653 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2654 2654 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2655 2655 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2656 2656 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2657 2657 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2658 2658 mi->mi_ro_kstats->ks_private = (void *)vfsp;
2659 2659 kstat_install(mi->mi_ro_kstats);
2660 2660 }
2661 2661
2662 2662 nfs4_mnt_recov_kstat_init(vfsp);
2663 2663 }
2664 2664
2665 2665 void
2666 2666 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2667 2667 {
2668 2668 mntinfo4_t *mi;
2669 2669 clock_t now = ddi_get_lbolt();
2670 2670
2671 2671 mi = VTOMI4(vp);
2672 2672 /*
2673 2673 * In case of forced unmount, do not print any messages
2674 2674 * since it can flood the console with error messages.
2675 2675 */
2676 2676 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2677 2677 return;
2678 2678
2679 2679 /*
2680 2680 * If the mount point is dead, not recoverable, do not
2681 2681 * print error messages that can flood the console.
2682 2682 */
2683 2683 if (mi->mi_flags & MI4_RECOV_FAIL)
2684 2684 return;
2685 2685
2686 2686 /*
2687 2687 * No use in flooding the console with ENOSPC
2688 2688 * messages from the same file system.
2689 2689 */
2690 2690 if ((error != ENOSPC && error != EDQUOT) ||
2691 2691 now - mi->mi_printftime > 0) {
2692 2692 zoneid_t zoneid = mi->mi_zone->zone_id;
2693 2693
2694 2694 #ifdef DEBUG
2695 2695 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2696 2696 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2697 2697 #else
2698 2698 nfs_perror(error, "NFS write error on host %s: %m.\n",
2699 2699 VTOR4(vp)->r_server->sv_hostname, NULL);
2700 2700 #endif
2701 2701 if (error == ENOSPC || error == EDQUOT) {
2702 2702 zcmn_err(zoneid, CE_CONT,
2703 2703 "^File: userid=%d, groupid=%d\n",
2704 2704 crgetuid(cr), crgetgid(cr));
2705 2705 if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2706 2706 crgetgid(curthread->t_cred) != crgetgid(cr)) {
2707 2707 zcmn_err(zoneid, CE_CONT,
2708 2708 "^User: userid=%d, groupid=%d\n",
2709 2709 crgetuid(curthread->t_cred),
2710 2710 crgetgid(curthread->t_cred));
2711 2711 }
2712 2712 mi->mi_printftime = now +
2713 2713 nfs_write_error_interval * hz;
2714 2714 }
2715 2715 sfh4_printfhandle(VTOR4(vp)->r_fh);
2716 2716 #ifdef DEBUG
2717 2717 if (error == EACCES) {
2718 2718 zcmn_err(zoneid, CE_CONT,
2719 2719 "nfs_bio: cred is%s kcred\n",
2720 2720 cr == kcred ? "" : " not");
2721 2721 }
2722 2722 #endif
2723 2723 }
2724 2724 }
2725 2725
2726 2726 /*
2727 2727 * Return non-zero if the given file can be safely memory mapped. Locks
2728 2728 * are safe if whole-file (length and offset are both zero).
2729 2729 */
2730 2730
2731 2731 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0)
2732 2732
2733 2733 static int
2734 2734 nfs4_safemap(const vnode_t *vp)
2735 2735 {
2736 2736 locklist_t *llp, *next_llp;
2737 2737 int safe = 1;
2738 2738 rnode4_t *rp = VTOR4(vp);
2739 2739
2740 2740 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2741 2741
2742 2742 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2743 2743 "vp = %p", (void *)vp));
2744 2744
2745 2745 /*
2746 2746 * Review all the locks for the vnode, both ones that have been
2747 2747 * acquired and ones that are pending. We assume that
2748 2748 * flk_active_locks_for_vp() has merged any locks that can be
2749 2749 * merged (so that if a process has the entire file locked, it is
2750 2750 * represented as a single lock).
2751 2751 *
2752 2752 * Note that we can't bail out of the loop if we find a non-safe
2753 2753 * lock, because we have to free all the elements in the llp list.
2754 2754 * We might be able to speed up this code slightly by not looking
2755 2755 * at each lock's l_start and l_len fields once we've found a
2756 2756 * non-safe lock.
2757 2757 */
2758 2758
2759 2759 llp = flk_active_locks_for_vp(vp);
2760 2760 while (llp) {
2761 2761 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2762 2762 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2763 2763 llp->ll_flock.l_start, llp->ll_flock.l_len));
2764 2764 if (!SAFE_LOCK(llp->ll_flock)) {
2765 2765 safe = 0;
2766 2766 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2767 2767 "nfs4_safemap: unsafe active lock (%" PRId64
2768 2768 ", %" PRId64 ")", llp->ll_flock.l_start,
2769 2769 llp->ll_flock.l_len));
2770 2770 }
2771 2771 next_llp = llp->ll_next;
2772 2772 VN_RELE(llp->ll_vp);
2773 2773 kmem_free(llp, sizeof (*llp));
2774 2774 llp = next_llp;
2775 2775 }
2776 2776
2777 2777 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2778 2778 safe ? "safe" : "unsafe"));
2779 2779 return (safe);
2780 2780 }
2781 2781
2782 2782 /*
2783 2783 * Return whether there is a lost LOCK or LOCKU queued up for the given
2784 2784 * file that would make an mmap request unsafe. cf. nfs4_safemap().
2785 2785 */
2786 2786
2787 2787 bool_t
2788 2788 nfs4_map_lost_lock_conflict(vnode_t *vp)
2789 2789 {
2790 2790 bool_t conflict = FALSE;
2791 2791 nfs4_lost_rqst_t *lrp;
2792 2792 mntinfo4_t *mi = VTOMI4(vp);
2793 2793
2794 2794 mutex_enter(&mi->mi_lock);
2795 2795 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2796 2796 lrp = list_next(&mi->mi_lost_state, lrp)) {
2797 2797 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2798 2798 continue;
2799 2799 ASSERT(lrp->lr_vp != NULL);
2800 2800 if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2801 2801 continue; /* different file */
2802 2802 if (!SAFE_LOCK(*lrp->lr_flk)) {
2803 2803 conflict = TRUE;
2804 2804 break;
2805 2805 }
2806 2806 }
2807 2807
2808 2808 mutex_exit(&mi->mi_lock);
2809 2809 return (conflict);
2810 2810 }
2811 2811
2812 2812 /*
2813 2813 * nfs_lockcompletion:
2814 2814 *
2815 2815 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2816 2816 * as non cachable (set VNOCACHE bit).
2817 2817 */
2818 2818
2819 2819 void
2820 2820 nfs4_lockcompletion(vnode_t *vp, int cmd)
2821 2821 {
2822 2822 rnode4_t *rp = VTOR4(vp);
2823 2823
2824 2824 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2825 2825 ASSERT(!IS_SHADOW(vp, rp));
2826 2826
2827 2827 if (cmd == F_SETLK || cmd == F_SETLKW) {
2828 2828
2829 2829 if (!nfs4_safemap(vp)) {
2830 2830 mutex_enter(&vp->v_lock);
2831 2831 vp->v_flag |= VNOCACHE;
2832 2832 mutex_exit(&vp->v_lock);
2833 2833 } else {
2834 2834 mutex_enter(&vp->v_lock);
2835 2835 vp->v_flag &= ~VNOCACHE;
2836 2836 mutex_exit(&vp->v_lock);
2837 2837 }
2838 2838 }
2839 2839 /*
2840 2840 * The cached attributes of the file are stale after acquiring
2841 2841 * the lock on the file. They were updated when the file was
2842 2842 * opened, but not updated when the lock was acquired. Therefore the
2843 2843 * cached attributes are invalidated after the lock is obtained.
2844 2844 */
2845 2845 PURGE_ATTRCACHE4(vp);
2846 2846 }
2847 2847
2848 2848 /* ARGSUSED */
2849 2849 static void *
2850 2850 nfs4_mi_init(zoneid_t zoneid)
2851 2851 {
2852 2852 struct mi4_globals *mig;
2853 2853
2854 2854 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2855 2855 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2856 2856 list_create(&mig->mig_list, sizeof (mntinfo4_t),
2857 2857 offsetof(mntinfo4_t, mi_zone_node));
2858 2858 mig->mig_destructor_called = B_FALSE;
2859 2859 return (mig);
2860 2860 }
2861 2861
2862 2862 /*
2863 2863 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2864 2864 * state and killing off threads.
2865 2865 */
2866 2866 /* ARGSUSED */
2867 2867 static void
2868 2868 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2869 2869 {
2870 2870 struct mi4_globals *mig = data;
2871 2871 mntinfo4_t *mi;
2872 2872 nfs4_server_t *np;
2873 2873
2874 2874 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2875 2875 "nfs4_mi_shutdown zone %d\n", zoneid));
2876 2876 ASSERT(mig != NULL);
2877 2877 for (;;) {
2878 2878 mutex_enter(&mig->mig_lock);
2879 2879 mi = list_head(&mig->mig_list);
2880 2880 if (mi == NULL) {
2881 2881 mutex_exit(&mig->mig_lock);
2882 2882 break;
2883 2883 }
2884 2884
2885 2885 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2886 2886 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2887 2887 /*
2888 2888 * purge the DNLC for this filesystem
2889 2889 */
2890 2890 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2891 2891 /*
2892 2892 * Tell existing async worker threads to exit.
2893 2893 */
2894 2894 mutex_enter(&mi->mi_async_lock);
2895 2895 mi->mi_max_threads = 0;
2896 2896 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2897 2897 /*
2898 2898 * Set the appropriate flags, signal and wait for both the
2899 2899 * async manager and the inactive thread to exit when they're
2900 2900 * done with their current work.
2901 2901 */
2902 2902 mutex_enter(&mi->mi_lock);
2903 2903 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2904 2904 mutex_exit(&mi->mi_lock);
2905 2905 mutex_exit(&mi->mi_async_lock);
2906 2906 if (mi->mi_manager_thread) {
2907 2907 nfs4_async_manager_stop(mi->mi_vfsp);
2908 2908 }
2909 2909 if (mi->mi_inactive_thread) {
2910 2910 mutex_enter(&mi->mi_async_lock);
2911 2911 cv_signal(&mi->mi_inact_req_cv);
2912 2912 /*
2913 2913 * Wait for the inactive thread to exit.
2914 2914 */
2915 2915 while (mi->mi_inactive_thread != NULL) {
2916 2916 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2917 2917 }
2918 2918 mutex_exit(&mi->mi_async_lock);
2919 2919 }
2920 2920 /*
2921 2921 * Wait for the recovery thread to complete, that is, it will
2922 2922 * signal when it is done using the "mi" structure and about
2923 2923 * to exit
2924 2924 */
2925 2925 mutex_enter(&mi->mi_lock);
2926 2926 while (mi->mi_in_recovery > 0)
2927 2927 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2928 2928 mutex_exit(&mi->mi_lock);
2929 2929 /*
2930 2930 * We're done when every mi has been done or the list is empty.
2931 2931 * This one is done, remove it from the list.
2932 2932 */
2933 2933 list_remove(&mig->mig_list, mi);
2934 2934 mutex_exit(&mig->mig_lock);
2935 2935 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2936 2936
2937 2937 /*
2938 2938 * Release hold on vfs and mi done to prevent race with zone
2939 2939 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2940 2940 */
2941 2941 VFS_RELE(mi->mi_vfsp);
2942 2942 MI4_RELE(mi);
2943 2943 }
2944 2944 /*
2945 2945 * Tell each renew thread in the zone to exit
2946 2946 */
2947 2947 mutex_enter(&nfs4_server_lst_lock);
2948 2948 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2949 2949 mutex_enter(&np->s_lock);
2950 2950 if (np->zoneid == zoneid) {
2951 2951 /*
2952 2952 * We add another hold onto the nfs4_server_t
2953 2953 * because this will make sure tha the nfs4_server_t
2954 2954 * stays around until nfs4_callback_fini_zone destroys
2955 2955 * the zone. This way, the renew thread can
2956 2956 * unconditionally release its holds on the
2957 2957 * nfs4_server_t.
2958 2958 */
2959 2959 np->s_refcnt++;
2960 2960 nfs4_mark_srv_dead(np);
2961 2961 }
2962 2962 mutex_exit(&np->s_lock);
2963 2963 }
2964 2964 mutex_exit(&nfs4_server_lst_lock);
2965 2965 }
2966 2966
2967 2967 static void
2968 2968 nfs4_mi_free_globals(struct mi4_globals *mig)
2969 2969 {
2970 2970 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2971 2971 mutex_destroy(&mig->mig_lock);
2972 2972 kmem_free(mig, sizeof (*mig));
2973 2973 }
2974 2974
2975 2975 /* ARGSUSED */
2976 2976 static void
2977 2977 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2978 2978 {
2979 2979 struct mi4_globals *mig = data;
2980 2980
2981 2981 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2982 2982 "nfs4_mi_destroy zone %d\n", zoneid));
2983 2983 ASSERT(mig != NULL);
2984 2984 mutex_enter(&mig->mig_lock);
2985 2985 if (list_head(&mig->mig_list) != NULL) {
2986 2986 /* Still waiting for VFS_FREEVFS() */
2987 2987 mig->mig_destructor_called = B_TRUE;
2988 2988 mutex_exit(&mig->mig_lock);
2989 2989 return;
2990 2990 }
2991 2991 nfs4_mi_free_globals(mig);
2992 2992 }
2993 2993
2994 2994 /*
2995 2995 * Add an NFS mount to the per-zone list of NFS mounts.
2996 2996 */
2997 2997 void
2998 2998 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2999 2999 {
3000 3000 struct mi4_globals *mig;
3001 3001
3002 3002 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3003 3003 mutex_enter(&mig->mig_lock);
3004 3004 list_insert_head(&mig->mig_list, mi);
3005 3005 /*
3006 3006 * hold added to eliminate race with zone shutdown -this will be
3007 3007 * released in mi_shutdown
3008 3008 */
3009 3009 MI4_HOLD(mi);
3010 3010 VFS_HOLD(mi->mi_vfsp);
3011 3011 mutex_exit(&mig->mig_lock);
3012 3012 }
3013 3013
3014 3014 /*
3015 3015 * Remove an NFS mount from the per-zone list of NFS mounts.
3016 3016 */
3017 3017 int
3018 3018 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3019 3019 {
3020 3020 struct mi4_globals *mig;
3021 3021 int ret = 0;
3022 3022
3023 3023 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3024 3024 mutex_enter(&mig->mig_lock);
3025 3025 mutex_enter(&mi->mi_lock);
3026 3026 /* if this mi is marked dead, then the zone already released it */
3027 3027 if (!(mi->mi_flags & MI4_DEAD)) {
3028 3028 list_remove(&mig->mig_list, mi);
3029 3029 mutex_exit(&mi->mi_lock);
3030 3030
3031 3031 /* release the holds put on in zonelist_add(). */
3032 3032 VFS_RELE(mi->mi_vfsp);
3033 3033 MI4_RELE(mi);
3034 3034 ret = 1;
3035 3035 } else {
3036 3036 mutex_exit(&mi->mi_lock);
3037 3037 }
3038 3038
3039 3039 /*
3040 3040 * We can be called asynchronously by VFS_FREEVFS() after the zone
3041 3041 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3042 3042 * mi globals.
3043 3043 */
3044 3044 if (list_head(&mig->mig_list) == NULL &&
3045 3045 mig->mig_destructor_called == B_TRUE) {
3046 3046 nfs4_mi_free_globals(mig);
3047 3047 return (ret);
3048 3048 }
3049 3049 mutex_exit(&mig->mig_lock);
3050 3050 return (ret);
3051 3051 }
3052 3052
3053 3053 void
3054 3054 nfs_free_mi4(mntinfo4_t *mi)
3055 3055 {
3056 3056 nfs4_open_owner_t *foop;
3057 3057 nfs4_oo_hash_bucket_t *bucketp;
3058 3058 nfs4_debug_msg_t *msgp;
3059 3059 int i;
3060 3060 servinfo4_t *svp;
3061 3061
3062 3062 /*
3063 3063 * Code introduced here should be carefully evaluated to make
3064 3064 * sure none of the freed resources are accessed either directly
3065 3065 * or indirectly after freeing them. For eg: Introducing calls to
3066 3066 * NFS4_DEBUG that use mntinfo4_t structure member after freeing
3067 3067 * the structure members or other routines calling back into NFS
3068 3068 * accessing freed mntinfo4_t structure member.
3069 3069 */
3070 3070 mutex_enter(&mi->mi_lock);
3071 3071 ASSERT(mi->mi_recovthread == NULL);
3072 3072 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3073 3073 mutex_exit(&mi->mi_lock);
3074 3074 mutex_enter(&mi->mi_async_lock);
3075 3075 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3076 3076 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3077 3077 ASSERT(mi->mi_manager_thread == NULL);
3078 3078 mutex_exit(&mi->mi_async_lock);
3079 3079 if (mi->mi_io_kstats) {
3080 3080 kstat_delete(mi->mi_io_kstats);
3081 3081 mi->mi_io_kstats = NULL;
3082 3082 }
3083 3083 if (mi->mi_ro_kstats) {
3084 3084 kstat_delete(mi->mi_ro_kstats);
3085 3085 mi->mi_ro_kstats = NULL;
3086 3086 }
3087 3087 if (mi->mi_recov_ksp) {
3088 3088 kstat_delete(mi->mi_recov_ksp);
3089 3089 mi->mi_recov_ksp = NULL;
3090 3090 }
3091 3091 mutex_enter(&mi->mi_msg_list_lock);
3092 3092 while (msgp = list_head(&mi->mi_msg_list)) {
3093 3093 list_remove(&mi->mi_msg_list, msgp);
3094 3094 nfs4_free_msg(msgp);
3095 3095 }
3096 3096 mutex_exit(&mi->mi_msg_list_lock);
3097 3097 list_destroy(&mi->mi_msg_list);
3098 3098 if (mi->mi_fname != NULL)
3099 3099 fn_rele(&mi->mi_fname);
3100 3100 if (mi->mi_rootfh != NULL)
3101 3101 sfh4_rele(&mi->mi_rootfh);
3102 3102 if (mi->mi_srvparentfh != NULL)
3103 3103 sfh4_rele(&mi->mi_srvparentfh);
3104 3104 svp = mi->mi_servers;
3105 3105 sv4_free(svp);
3106 3106 mutex_destroy(&mi->mi_lock);
3107 3107 mutex_destroy(&mi->mi_async_lock);
3108 3108 mutex_destroy(&mi->mi_msg_list_lock);
3109 3109 nfs_rw_destroy(&mi->mi_recovlock);
3110 3110 nfs_rw_destroy(&mi->mi_rename_lock);
3111 3111 nfs_rw_destroy(&mi->mi_fh_lock);
3112 3112 cv_destroy(&mi->mi_failover_cv);
3113 3113 cv_destroy(&mi->mi_async_reqs_cv);
3114 3114 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3115 3115 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3116 3116 cv_destroy(&mi->mi_async_cv);
3117 3117 cv_destroy(&mi->mi_inact_req_cv);
3118 3118 /*
3119 3119 * Destroy the oo hash lists and mutexes for the cred hash table.
3120 3120 */
3121 3121 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3122 3122 bucketp = &(mi->mi_oo_list[i]);
3123 3123 /* Destroy any remaining open owners on the list */
3124 3124 foop = list_head(&bucketp->b_oo_hash_list);
3125 3125 while (foop != NULL) {
3126 3126 list_remove(&bucketp->b_oo_hash_list, foop);
3127 3127 nfs4_destroy_open_owner(foop);
3128 3128 foop = list_head(&bucketp->b_oo_hash_list);
3129 3129 }
3130 3130 list_destroy(&bucketp->b_oo_hash_list);
3131 3131 mutex_destroy(&bucketp->b_lock);
3132 3132 }
3133 3133 /*
3134 3134 * Empty and destroy the freed open owner list.
3135 3135 */
3136 3136 foop = list_head(&mi->mi_foo_list);
3137 3137 while (foop != NULL) {
3138 3138 list_remove(&mi->mi_foo_list, foop);
3139 3139 nfs4_destroy_open_owner(foop);
3140 3140 foop = list_head(&mi->mi_foo_list);
↓ open down ↓ |
3140 lines elided |
↑ open up ↑ |
3141 3141 }
3142 3142 list_destroy(&mi->mi_foo_list);
3143 3143 list_destroy(&mi->mi_bseqid_list);
3144 3144 list_destroy(&mi->mi_lost_state);
3145 3145 avl_destroy(&mi->mi_filehandles);
3146 3146 kmem_free(mi, sizeof (*mi));
3147 3147 }
3148 3148 void
3149 3149 mi_hold(mntinfo4_t *mi)
3150 3150 {
3151 - atomic_add_32(&mi->mi_count, 1);
3151 + atomic_inc_32(&mi->mi_count);
3152 3152 ASSERT(mi->mi_count != 0);
3153 3153 }
3154 3154
3155 3155 void
3156 3156 mi_rele(mntinfo4_t *mi)
3157 3157 {
3158 3158 ASSERT(mi->mi_count != 0);
3159 - if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
3159 + if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3160 3160 nfs_free_mi4(mi);
3161 3161 }
3162 3162 }
3163 3163
3164 3164 vnode_t nfs4_xattr_notsupp_vnode;
3165 3165
3166 3166 void
3167 3167 nfs4_clnt_init(void)
3168 3168 {
3169 3169 nfs4_vnops_init();
3170 3170 (void) nfs4_rnode_init();
3171 3171 (void) nfs4_shadow_init();
3172 3172 (void) nfs4_acache_init();
3173 3173 (void) nfs4_subr_init();
3174 3174 nfs4_acl_init();
3175 3175 nfs_idmap_init();
3176 3176 nfs4_callback_init();
3177 3177 nfs4_secinfo_init();
3178 3178 #ifdef DEBUG
3179 3179 tsd_create(&nfs4_tsd_key, NULL);
3180 3180 #endif
3181 3181
3182 3182 /*
3183 3183 * Add a CPR callback so that we can update client
3184 3184 * lease after a suspend and resume.
3185 3185 */
3186 3186 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3187 3187
3188 3188 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3189 3189 nfs4_mi_destroy);
3190 3190
3191 3191 /*
3192 3192 * Initialise the reference count of the notsupp xattr cache vnode to 1
3193 3193 * so that it never goes away (VOP_INACTIVE isn't called on it).
3194 3194 */
3195 3195 nfs4_xattr_notsupp_vnode.v_count = 1;
3196 3196 }
3197 3197
3198 3198 void
3199 3199 nfs4_clnt_fini(void)
3200 3200 {
3201 3201 (void) zone_key_delete(mi4_list_key);
3202 3202 nfs4_vnops_fini();
3203 3203 (void) nfs4_rnode_fini();
3204 3204 (void) nfs4_shadow_fini();
3205 3205 (void) nfs4_acache_fini();
3206 3206 (void) nfs4_subr_fini();
3207 3207 nfs_idmap_fini();
3208 3208 nfs4_callback_fini();
3209 3209 nfs4_secinfo_fini();
3210 3210 #ifdef DEBUG
3211 3211 tsd_destroy(&nfs4_tsd_key);
3212 3212 #endif
3213 3213 if (cid)
3214 3214 (void) callb_delete(cid);
3215 3215 }
3216 3216
3217 3217 /*ARGSUSED*/
3218 3218 static boolean_t
3219 3219 nfs4_client_cpr_callb(void *arg, int code)
3220 3220 {
3221 3221 /*
3222 3222 * We get called for Suspend and Resume events.
3223 3223 * For the suspend case we simply don't care!
3224 3224 */
3225 3225 if (code == CB_CODE_CPR_CHKPT) {
3226 3226 return (B_TRUE);
3227 3227 }
3228 3228
3229 3229 /*
3230 3230 * When we get to here we are in the process of
3231 3231 * resuming the system from a previous suspend.
3232 3232 */
3233 3233 nfs4_client_resumed = gethrestime_sec();
3234 3234 return (B_TRUE);
3235 3235 }
3236 3236
3237 3237 void
3238 3238 nfs4_renew_lease_thread(nfs4_server_t *sp)
3239 3239 {
3240 3240 int error = 0;
3241 3241 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3242 3242 clock_t tick_delay = 0;
3243 3243 clock_t time_left = 0;
3244 3244 callb_cpr_t cpr_info;
3245 3245 kmutex_t cpr_lock;
3246 3246
3247 3247 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3248 3248 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3249 3249 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3250 3250 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3251 3251
3252 3252 mutex_enter(&sp->s_lock);
3253 3253 /* sp->s_lease_time is set via a GETATTR */
3254 3254 sp->last_renewal_time = gethrestime_sec();
3255 3255 sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3256 3256 ASSERT(sp->s_refcnt >= 1);
3257 3257
3258 3258 for (;;) {
3259 3259 if (!sp->state_ref_count ||
3260 3260 sp->lease_valid != NFS4_LEASE_VALID) {
3261 3261
3262 3262 kip_secs = MAX((sp->s_lease_time >> 1) -
3263 3263 (3 * sp->propagation_delay.tv_sec), 1);
3264 3264
3265 3265 tick_delay = SEC_TO_TICK(kip_secs);
3266 3266
3267 3267 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3268 3268 "nfs4_renew_lease_thread: no renew : thread "
3269 3269 "wait %ld secs", kip_secs));
3270 3270
3271 3271 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3272 3272 "nfs4_renew_lease_thread: no renew : "
3273 3273 "state_ref_count %d, lease_valid %d",
3274 3274 sp->state_ref_count, sp->lease_valid));
3275 3275
3276 3276 mutex_enter(&cpr_lock);
3277 3277 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3278 3278 mutex_exit(&cpr_lock);
3279 3279 time_left = cv_reltimedwait(&sp->cv_thread_exit,
3280 3280 &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3281 3281 mutex_enter(&cpr_lock);
3282 3282 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3283 3283 mutex_exit(&cpr_lock);
3284 3284
3285 3285 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3286 3286 "nfs4_renew_lease_thread: no renew: "
3287 3287 "time left %ld", time_left));
3288 3288
3289 3289 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3290 3290 goto die;
3291 3291 continue;
3292 3292 }
3293 3293
3294 3294 tmp_last_renewal_time = sp->last_renewal_time;
3295 3295
3296 3296 tmp_time = gethrestime_sec() - sp->last_renewal_time +
3297 3297 (3 * sp->propagation_delay.tv_sec);
3298 3298
3299 3299 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3300 3300 "nfs4_renew_lease_thread: tmp_time %ld, "
3301 3301 "sp->last_renewal_time %ld", tmp_time,
3302 3302 sp->last_renewal_time));
3303 3303
3304 3304 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3305 3305
3306 3306 tick_delay = SEC_TO_TICK(kip_secs);
3307 3307
3308 3308 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3309 3309 "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3310 3310 "secs", kip_secs));
3311 3311
3312 3312 mutex_enter(&cpr_lock);
3313 3313 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3314 3314 mutex_exit(&cpr_lock);
3315 3315 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3316 3316 tick_delay, TR_CLOCK_TICK);
3317 3317 mutex_enter(&cpr_lock);
3318 3318 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3319 3319 mutex_exit(&cpr_lock);
3320 3320
3321 3321 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3322 3322 "nfs4_renew_lease_thread: valid lease: time left %ld :"
3323 3323 "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3324 3324 "tmp_last_renewal_time %ld", time_left,
3325 3325 sp->last_renewal_time, nfs4_client_resumed,
3326 3326 tmp_last_renewal_time));
3327 3327
3328 3328 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3329 3329 goto die;
3330 3330
3331 3331 if (tmp_last_renewal_time == sp->last_renewal_time ||
3332 3332 (nfs4_client_resumed != 0 &&
3333 3333 nfs4_client_resumed > sp->last_renewal_time)) {
3334 3334 /*
3335 3335 * Issue RENEW op since we haven't renewed the lease
3336 3336 * since we slept.
3337 3337 */
3338 3338 tmp_now_time = gethrestime_sec();
3339 3339 error = nfs4renew(sp);
3340 3340 /*
3341 3341 * Need to re-acquire sp's lock, nfs4renew()
3342 3342 * relinqueshes it.
3343 3343 */
3344 3344 mutex_enter(&sp->s_lock);
3345 3345
3346 3346 /*
3347 3347 * See if someone changed s_thread_exit while we gave
3348 3348 * up s_lock.
3349 3349 */
3350 3350 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3351 3351 goto die;
3352 3352
3353 3353 if (!error) {
3354 3354 /*
3355 3355 * check to see if we implicitly renewed while
3356 3356 * we waited for a reply for our RENEW call.
3357 3357 */
3358 3358 if (tmp_last_renewal_time ==
3359 3359 sp->last_renewal_time) {
3360 3360 /* no implicit renew came */
3361 3361 sp->last_renewal_time = tmp_now_time;
3362 3362 } else {
3363 3363 NFS4_DEBUG(nfs4_client_lease_debug,
3364 3364 (CE_NOTE, "renew_thread: did "
3365 3365 "implicit renewal before reply "
3366 3366 "from server for RENEW"));
3367 3367 }
3368 3368 } else {
3369 3369 /* figure out error */
3370 3370 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3371 3371 "renew_thread: nfs4renew returned error"
3372 3372 " %d", error));
3373 3373 }
3374 3374
3375 3375 }
3376 3376 }
3377 3377
3378 3378 die:
3379 3379 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3380 3380 "nfs4_renew_lease_thread: thread exiting"));
3381 3381
3382 3382 while (sp->s_otw_call_count != 0) {
3383 3383 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3384 3384 "nfs4_renew_lease_thread: waiting for outstanding "
3385 3385 "otw calls to finish for sp 0x%p, current "
3386 3386 "s_otw_call_count %d", (void *)sp,
3387 3387 sp->s_otw_call_count));
3388 3388 mutex_enter(&cpr_lock);
3389 3389 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3390 3390 mutex_exit(&cpr_lock);
3391 3391 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3392 3392 mutex_enter(&cpr_lock);
3393 3393 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3394 3394 mutex_exit(&cpr_lock);
3395 3395 }
3396 3396 mutex_exit(&sp->s_lock);
3397 3397
3398 3398 nfs4_server_rele(sp); /* free the thread's reference */
3399 3399 nfs4_server_rele(sp); /* free the list's reference */
3400 3400 sp = NULL;
3401 3401
3402 3402 done:
3403 3403 mutex_enter(&cpr_lock);
3404 3404 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */
3405 3405 mutex_destroy(&cpr_lock);
3406 3406
3407 3407 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3408 3408 "nfs4_renew_lease_thread: renew thread exit officially"));
3409 3409
3410 3410 zthread_exit();
3411 3411 /* NOT REACHED */
3412 3412 }
3413 3413
3414 3414 /*
3415 3415 * Send out a RENEW op to the server.
3416 3416 * Assumes sp is locked down.
3417 3417 */
3418 3418 static int
3419 3419 nfs4renew(nfs4_server_t *sp)
3420 3420 {
3421 3421 COMPOUND4args_clnt args;
3422 3422 COMPOUND4res_clnt res;
3423 3423 nfs_argop4 argop[1];
3424 3424 int doqueue = 1;
3425 3425 int rpc_error;
3426 3426 cred_t *cr;
3427 3427 mntinfo4_t *mi;
3428 3428 timespec_t prop_time, after_time;
3429 3429 int needrecov = FALSE;
3430 3430 nfs4_recov_state_t recov_state;
3431 3431 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3432 3432
3433 3433 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3434 3434
3435 3435 recov_state.rs_flags = 0;
3436 3436 recov_state.rs_num_retry_despite_err = 0;
3437 3437
3438 3438 recov_retry:
3439 3439 mi = sp->mntinfo4_list;
3440 3440 VFS_HOLD(mi->mi_vfsp);
3441 3441 mutex_exit(&sp->s_lock);
3442 3442 ASSERT(mi != NULL);
3443 3443
3444 3444 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3445 3445 if (e.error) {
3446 3446 VFS_RELE(mi->mi_vfsp);
3447 3447 return (e.error);
3448 3448 }
3449 3449
3450 3450 /* Check to see if we're dealing with a marked-dead sp */
3451 3451 mutex_enter(&sp->s_lock);
3452 3452 if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3453 3453 mutex_exit(&sp->s_lock);
3454 3454 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3455 3455 VFS_RELE(mi->mi_vfsp);
3456 3456 return (0);
3457 3457 }
3458 3458
3459 3459 /* Make sure mi hasn't changed on us */
3460 3460 if (mi != sp->mntinfo4_list) {
3461 3461 /* Must drop sp's lock to avoid a recursive mutex enter */
3462 3462 mutex_exit(&sp->s_lock);
3463 3463 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3464 3464 VFS_RELE(mi->mi_vfsp);
3465 3465 mutex_enter(&sp->s_lock);
3466 3466 goto recov_retry;
3467 3467 }
3468 3468 mutex_exit(&sp->s_lock);
3469 3469
3470 3470 args.ctag = TAG_RENEW;
3471 3471
3472 3472 args.array_len = 1;
3473 3473 args.array = argop;
3474 3474
3475 3475 argop[0].argop = OP_RENEW;
3476 3476
3477 3477 mutex_enter(&sp->s_lock);
3478 3478 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3479 3479 cr = sp->s_cred;
3480 3480 crhold(cr);
3481 3481 mutex_exit(&sp->s_lock);
3482 3482
3483 3483 ASSERT(cr != NULL);
3484 3484
3485 3485 /* used to figure out RTT for sp */
3486 3486 gethrestime(&prop_time);
3487 3487
3488 3488 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3489 3489 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3490 3490 (void*)sp));
3491 3491 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3492 3492 prop_time.tv_sec, prop_time.tv_nsec));
3493 3493
3494 3494 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3495 3495 mntinfo4_t *, mi);
3496 3496
3497 3497 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3498 3498 crfree(cr);
3499 3499
3500 3500 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3501 3501 mntinfo4_t *, mi);
3502 3502
3503 3503 gethrestime(&after_time);
3504 3504
3505 3505 mutex_enter(&sp->s_lock);
3506 3506 sp->propagation_delay.tv_sec =
3507 3507 MAX(1, after_time.tv_sec - prop_time.tv_sec);
3508 3508 mutex_exit(&sp->s_lock);
3509 3509
3510 3510 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3511 3511 after_time.tv_sec, after_time.tv_nsec));
3512 3512
3513 3513 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3514 3514 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3515 3515 nfs4_delegreturn_all(sp);
3516 3516 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3517 3517 VFS_RELE(mi->mi_vfsp);
3518 3518 /*
3519 3519 * If the server returns CB_PATH_DOWN, it has renewed
3520 3520 * the lease and informed us that the callback path is
3521 3521 * down. Since the lease is renewed, just return 0 and
3522 3522 * let the renew thread proceed as normal.
3523 3523 */
3524 3524 return (0);
3525 3525 }
3526 3526
3527 3527 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3528 3528 if (!needrecov && e.error) {
3529 3529 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3530 3530 VFS_RELE(mi->mi_vfsp);
3531 3531 return (e.error);
3532 3532 }
3533 3533
3534 3534 rpc_error = e.error;
3535 3535
3536 3536 if (needrecov) {
3537 3537 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3538 3538 "nfs4renew: initiating recovery\n"));
3539 3539
3540 3540 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3541 3541 OP_RENEW, NULL, NULL, NULL) == FALSE) {
3542 3542 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3543 3543 VFS_RELE(mi->mi_vfsp);
3544 3544 if (!e.error)
3545 3545 (void) xdr_free(xdr_COMPOUND4res_clnt,
3546 3546 (caddr_t)&res);
3547 3547 mutex_enter(&sp->s_lock);
3548 3548 goto recov_retry;
3549 3549 }
3550 3550 /* fall through for res.status case */
3551 3551 }
3552 3552
3553 3553 if (res.status) {
3554 3554 if (res.status == NFS4ERR_LEASE_MOVED) {
3555 3555 /*EMPTY*/
3556 3556 /*
3557 3557 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3558 3558 * to renew the lease on that server
3559 3559 */
3560 3560 }
3561 3561 e.error = geterrno4(res.status);
3562 3562 }
3563 3563
3564 3564 if (!rpc_error)
3565 3565 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3566 3566
3567 3567 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3568 3568
3569 3569 VFS_RELE(mi->mi_vfsp);
3570 3570
3571 3571 return (e.error);
3572 3572 }
3573 3573
3574 3574 void
3575 3575 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3576 3576 {
3577 3577 nfs4_server_t *sp;
3578 3578
3579 3579 /* this locks down sp if it is found */
3580 3580 sp = find_nfs4_server(mi);
3581 3581
3582 3582 if (sp != NULL) {
3583 3583 nfs4_inc_state_ref_count_nolock(sp, mi);
3584 3584 mutex_exit(&sp->s_lock);
3585 3585 nfs4_server_rele(sp);
3586 3586 }
3587 3587 }
3588 3588
3589 3589 /*
3590 3590 * Bump the number of OPEN files (ie: those with state) so we know if this
3591 3591 * nfs4_server has any state to maintain a lease for or not.
3592 3592 *
3593 3593 * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3594 3594 */
3595 3595 void
3596 3596 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3597 3597 {
3598 3598 ASSERT(mutex_owned(&sp->s_lock));
3599 3599
3600 3600 sp->state_ref_count++;
3601 3601 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3602 3602 "nfs4_inc_state_ref_count: state_ref_count now %d",
3603 3603 sp->state_ref_count));
3604 3604
3605 3605 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3606 3606 sp->lease_valid = NFS4_LEASE_VALID;
3607 3607
3608 3608 /*
3609 3609 * If this call caused the lease to be marked valid and/or
3610 3610 * took the state_ref_count from 0 to 1, then start the time
3611 3611 * on lease renewal.
3612 3612 */
3613 3613 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3614 3614 sp->last_renewal_time = gethrestime_sec();
3615 3615
3616 3616 /* update the number of open files for mi */
3617 3617 mi->mi_open_files++;
3618 3618 }
3619 3619
3620 3620 void
3621 3621 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3622 3622 {
3623 3623 nfs4_server_t *sp;
3624 3624
3625 3625 /* this locks down sp if it is found */
3626 3626 sp = find_nfs4_server_all(mi, 1);
3627 3627
3628 3628 if (sp != NULL) {
3629 3629 nfs4_dec_state_ref_count_nolock(sp, mi);
3630 3630 mutex_exit(&sp->s_lock);
3631 3631 nfs4_server_rele(sp);
3632 3632 }
3633 3633 }
3634 3634
3635 3635 /*
3636 3636 * Decrement the number of OPEN files (ie: those with state) so we know if
3637 3637 * this nfs4_server has any state to maintain a lease for or not.
3638 3638 */
3639 3639 void
3640 3640 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3641 3641 {
3642 3642 ASSERT(mutex_owned(&sp->s_lock));
3643 3643 ASSERT(sp->state_ref_count != 0);
3644 3644 sp->state_ref_count--;
3645 3645
3646 3646 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3647 3647 "nfs4_dec_state_ref_count: state ref count now %d",
3648 3648 sp->state_ref_count));
3649 3649
3650 3650 mi->mi_open_files--;
3651 3651 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3652 3652 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3653 3653 mi->mi_open_files, mi->mi_flags));
3654 3654
3655 3655 /* We don't have to hold the mi_lock to test mi_flags */
3656 3656 if (mi->mi_open_files == 0 &&
3657 3657 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3658 3658 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3659 3659 "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3660 3660 "we have closed the last open file", (void*)mi));
3661 3661 nfs4_remove_mi_from_server(mi, sp);
3662 3662 }
3663 3663 }
3664 3664
3665 3665 bool_t
3666 3666 inlease(nfs4_server_t *sp)
3667 3667 {
3668 3668 bool_t result;
3669 3669
3670 3670 ASSERT(mutex_owned(&sp->s_lock));
3671 3671
3672 3672 if (sp->lease_valid == NFS4_LEASE_VALID &&
3673 3673 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3674 3674 result = TRUE;
3675 3675 else
3676 3676 result = FALSE;
3677 3677
3678 3678 return (result);
3679 3679 }
3680 3680
3681 3681
3682 3682 /*
3683 3683 * Return non-zero if the given nfs4_server_t is going through recovery.
3684 3684 */
3685 3685
3686 3686 int
3687 3687 nfs4_server_in_recovery(nfs4_server_t *sp)
3688 3688 {
3689 3689 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3690 3690 }
3691 3691
3692 3692 /*
3693 3693 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the
3694 3694 * first is less than, equal to, or greater than the second.
3695 3695 */
3696 3696
3697 3697 int
3698 3698 sfh4cmp(const void *p1, const void *p2)
3699 3699 {
3700 3700 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3701 3701 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3702 3702
3703 3703 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3704 3704 }
3705 3705
3706 3706 /*
3707 3707 * Create a table for shared filehandle objects.
3708 3708 */
3709 3709
3710 3710 void
3711 3711 sfh4_createtab(avl_tree_t *tab)
3712 3712 {
3713 3713 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3714 3714 offsetof(nfs4_sharedfh_t, sfh_tree));
3715 3715 }
3716 3716
3717 3717 /*
3718 3718 * Return a shared filehandle object for the given filehandle. The caller
3719 3719 * is responsible for eventually calling sfh4_rele().
3720 3720 */
3721 3721
3722 3722 nfs4_sharedfh_t *
3723 3723 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3724 3724 {
3725 3725 nfs4_sharedfh_t *sfh, *nsfh;
3726 3726 avl_index_t where;
3727 3727 nfs4_sharedfh_t skey;
3728 3728
3729 3729 if (!key) {
3730 3730 skey.sfh_fh = *fh;
3731 3731 key = &skey;
3732 3732 }
3733 3733
3734 3734 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3735 3735 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3736 3736 /*
3737 3737 * We allocate the largest possible filehandle size because it's
3738 3738 * not that big, and it saves us from possibly having to resize the
3739 3739 * buffer later.
3740 3740 */
3741 3741 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3742 3742 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3743 3743 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3744 3744 nsfh->sfh_refcnt = 1;
3745 3745 nsfh->sfh_flags = SFH4_IN_TREE;
3746 3746 nsfh->sfh_mi = mi;
3747 3747 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3748 3748 (void *)nsfh));
3749 3749
3750 3750 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3751 3751 sfh = avl_find(&mi->mi_filehandles, key, &where);
3752 3752 if (sfh != NULL) {
3753 3753 mutex_enter(&sfh->sfh_lock);
3754 3754 sfh->sfh_refcnt++;
3755 3755 mutex_exit(&sfh->sfh_lock);
3756 3756 nfs_rw_exit(&mi->mi_fh_lock);
3757 3757 /* free our speculative allocs */
3758 3758 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3759 3759 kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3760 3760 return (sfh);
3761 3761 }
3762 3762
3763 3763 avl_insert(&mi->mi_filehandles, nsfh, where);
3764 3764 nfs_rw_exit(&mi->mi_fh_lock);
3765 3765
3766 3766 return (nsfh);
3767 3767 }
3768 3768
3769 3769 /*
3770 3770 * Return a shared filehandle object for the given filehandle. The caller
3771 3771 * is responsible for eventually calling sfh4_rele().
3772 3772 */
3773 3773
3774 3774 nfs4_sharedfh_t *
3775 3775 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3776 3776 {
3777 3777 nfs4_sharedfh_t *sfh;
3778 3778 nfs4_sharedfh_t key;
3779 3779
3780 3780 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3781 3781
3782 3782 #ifdef DEBUG
3783 3783 if (nfs4_sharedfh_debug) {
3784 3784 nfs4_fhandle_t fhandle;
3785 3785
3786 3786 fhandle.fh_len = fh->nfs_fh4_len;
3787 3787 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3788 3788 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3789 3789 nfs4_printfhandle(&fhandle);
3790 3790 }
3791 3791 #endif
3792 3792
3793 3793 /*
3794 3794 * If there's already an object for the given filehandle, bump the
3795 3795 * reference count and return it. Otherwise, create a new object
3796 3796 * and add it to the AVL tree.
3797 3797 */
3798 3798
3799 3799 key.sfh_fh = *fh;
3800 3800
3801 3801 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3802 3802 sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3803 3803 if (sfh != NULL) {
3804 3804 mutex_enter(&sfh->sfh_lock);
3805 3805 sfh->sfh_refcnt++;
3806 3806 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3807 3807 "sfh4_get: found existing %p, new refcnt=%d",
3808 3808 (void *)sfh, sfh->sfh_refcnt));
3809 3809 mutex_exit(&sfh->sfh_lock);
3810 3810 nfs_rw_exit(&mi->mi_fh_lock);
3811 3811 return (sfh);
3812 3812 }
3813 3813 nfs_rw_exit(&mi->mi_fh_lock);
3814 3814
3815 3815 return (sfh4_put(fh, mi, &key));
3816 3816 }
3817 3817
3818 3818 /*
3819 3819 * Get a reference to the given shared filehandle object.
3820 3820 */
3821 3821
3822 3822 void
3823 3823 sfh4_hold(nfs4_sharedfh_t *sfh)
3824 3824 {
3825 3825 ASSERT(sfh->sfh_refcnt > 0);
3826 3826
3827 3827 mutex_enter(&sfh->sfh_lock);
3828 3828 sfh->sfh_refcnt++;
3829 3829 NFS4_DEBUG(nfs4_sharedfh_debug,
3830 3830 (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3831 3831 (void *)sfh, sfh->sfh_refcnt));
3832 3832 mutex_exit(&sfh->sfh_lock);
3833 3833 }
3834 3834
3835 3835 /*
3836 3836 * Release a reference to the given shared filehandle object and null out
3837 3837 * the given pointer.
3838 3838 */
3839 3839
3840 3840 void
3841 3841 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3842 3842 {
3843 3843 mntinfo4_t *mi;
3844 3844 nfs4_sharedfh_t *sfh = *sfhpp;
3845 3845
3846 3846 ASSERT(sfh->sfh_refcnt > 0);
3847 3847
3848 3848 mutex_enter(&sfh->sfh_lock);
3849 3849 if (sfh->sfh_refcnt > 1) {
3850 3850 sfh->sfh_refcnt--;
3851 3851 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3852 3852 "sfh4_rele %p, new refcnt=%d",
3853 3853 (void *)sfh, sfh->sfh_refcnt));
3854 3854 mutex_exit(&sfh->sfh_lock);
3855 3855 goto finish;
3856 3856 }
3857 3857 mutex_exit(&sfh->sfh_lock);
3858 3858
3859 3859 /*
3860 3860 * Possibly the last reference, so get the lock for the table in
3861 3861 * case it's time to remove the object from the table.
3862 3862 */
3863 3863 mi = sfh->sfh_mi;
3864 3864 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3865 3865 mutex_enter(&sfh->sfh_lock);
3866 3866 sfh->sfh_refcnt--;
3867 3867 if (sfh->sfh_refcnt > 0) {
3868 3868 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3869 3869 "sfh4_rele %p, new refcnt=%d",
3870 3870 (void *)sfh, sfh->sfh_refcnt));
3871 3871 mutex_exit(&sfh->sfh_lock);
3872 3872 nfs_rw_exit(&mi->mi_fh_lock);
3873 3873 goto finish;
3874 3874 }
3875 3875
3876 3876 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3877 3877 "sfh4_rele %p, last ref", (void *)sfh));
3878 3878 if (sfh->sfh_flags & SFH4_IN_TREE) {
3879 3879 avl_remove(&mi->mi_filehandles, sfh);
3880 3880 sfh->sfh_flags &= ~SFH4_IN_TREE;
3881 3881 }
3882 3882 mutex_exit(&sfh->sfh_lock);
3883 3883 nfs_rw_exit(&mi->mi_fh_lock);
3884 3884 mutex_destroy(&sfh->sfh_lock);
3885 3885 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3886 3886 kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3887 3887
3888 3888 finish:
3889 3889 *sfhpp = NULL;
3890 3890 }
3891 3891
3892 3892 /*
3893 3893 * Update the filehandle for the given shared filehandle object.
3894 3894 */
3895 3895
3896 3896 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */
3897 3897
3898 3898 void
3899 3899 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3900 3900 {
3901 3901 mntinfo4_t *mi = sfh->sfh_mi;
3902 3902 nfs4_sharedfh_t *dupsfh;
3903 3903 avl_index_t where;
3904 3904 nfs4_sharedfh_t key;
3905 3905
3906 3906 #ifdef DEBUG
3907 3907 mutex_enter(&sfh->sfh_lock);
3908 3908 ASSERT(sfh->sfh_refcnt > 0);
3909 3909 mutex_exit(&sfh->sfh_lock);
3910 3910 #endif
3911 3911 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3912 3912
3913 3913 /*
3914 3914 * The basic plan is to remove the shared filehandle object from
3915 3915 * the table, update it to have the new filehandle, then reinsert
3916 3916 * it.
3917 3917 */
3918 3918
3919 3919 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3920 3920 mutex_enter(&sfh->sfh_lock);
3921 3921 if (sfh->sfh_flags & SFH4_IN_TREE) {
3922 3922 avl_remove(&mi->mi_filehandles, sfh);
3923 3923 sfh->sfh_flags &= ~SFH4_IN_TREE;
3924 3924 }
3925 3925 mutex_exit(&sfh->sfh_lock);
3926 3926 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3927 3927 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3928 3928 sfh->sfh_fh.nfs_fh4_len);
3929 3929
3930 3930 /*
3931 3931 * XXX If there is already a shared filehandle object with the new
3932 3932 * filehandle, we're in trouble, because the rnode code assumes
3933 3933 * that there is only one shared filehandle object for a given
3934 3934 * filehandle. So issue a warning (for read-write mounts only)
3935 3935 * and don't try to re-insert the given object into the table.
3936 3936 * Hopefully the given object will quickly go away and everyone
3937 3937 * will use the new object.
3938 3938 */
3939 3939 key.sfh_fh = *newfh;
3940 3940 dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3941 3941 if (dupsfh != NULL) {
3942 3942 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3943 3943 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3944 3944 "duplicate filehandle detected");
3945 3945 sfh4_printfhandle(dupsfh);
3946 3946 }
3947 3947 } else {
3948 3948 avl_insert(&mi->mi_filehandles, sfh, where);
3949 3949 mutex_enter(&sfh->sfh_lock);
3950 3950 sfh->sfh_flags |= SFH4_IN_TREE;
3951 3951 mutex_exit(&sfh->sfh_lock);
3952 3952 }
3953 3953 nfs_rw_exit(&mi->mi_fh_lock);
3954 3954 }
3955 3955
3956 3956 /*
3957 3957 * Copy out the current filehandle for the given shared filehandle object.
3958 3958 */
3959 3959
3960 3960 void
3961 3961 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3962 3962 {
3963 3963 mntinfo4_t *mi = sfh->sfh_mi;
3964 3964
3965 3965 ASSERT(sfh->sfh_refcnt > 0);
3966 3966
3967 3967 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3968 3968 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3969 3969 ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3970 3970 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3971 3971 nfs_rw_exit(&mi->mi_fh_lock);
3972 3972 }
3973 3973
3974 3974 /*
3975 3975 * Print out the filehandle for the given shared filehandle object.
3976 3976 */
3977 3977
3978 3978 void
3979 3979 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3980 3980 {
3981 3981 nfs4_fhandle_t fhandle;
3982 3982
3983 3983 sfh4_copyval(sfh, &fhandle);
3984 3984 nfs4_printfhandle(&fhandle);
3985 3985 }
3986 3986
3987 3987 /*
3988 3988 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0
3989 3989 * if they're the same, +1 if the first is "greater" than the second. The
3990 3990 * caller (or whoever's calling the AVL package) is responsible for
3991 3991 * handling locking issues.
3992 3992 */
3993 3993
3994 3994 static int
3995 3995 fncmp(const void *p1, const void *p2)
3996 3996 {
3997 3997 const nfs4_fname_t *f1 = p1;
3998 3998 const nfs4_fname_t *f2 = p2;
3999 3999 int res;
4000 4000
4001 4001 res = strcmp(f1->fn_name, f2->fn_name);
4002 4002 /*
4003 4003 * The AVL package wants +/-1, not arbitrary positive or negative
4004 4004 * integers.
4005 4005 */
4006 4006 if (res > 0)
4007 4007 res = 1;
4008 4008 else if (res < 0)
4009 4009 res = -1;
4010 4010 return (res);
4011 4011 }
4012 4012
4013 4013 /*
4014 4014 * Get or create an fname with the given name, as a child of the given
4015 4015 * fname. The caller is responsible for eventually releasing the reference
4016 4016 * (fn_rele()). parent may be NULL.
4017 4017 */
4018 4018
4019 4019 nfs4_fname_t *
4020 4020 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4021 4021 {
4022 4022 nfs4_fname_t key;
4023 4023 nfs4_fname_t *fnp;
4024 4024 avl_index_t where;
4025 4025
4026 4026 key.fn_name = name;
4027 4027
4028 4028 /*
4029 4029 * If there's already an fname registered with the given name, bump
4030 4030 * its reference count and return it. Otherwise, create a new one
4031 4031 * and add it to the parent's AVL tree.
4032 4032 *
4033 4033 * fname entries we are looking for should match both name
4034 4034 * and sfh stored in the fname.
4035 4035 */
4036 4036 again:
4037 4037 if (parent != NULL) {
4038 4038 mutex_enter(&parent->fn_lock);
4039 4039 fnp = avl_find(&parent->fn_children, &key, &where);
4040 4040 if (fnp != NULL) {
4041 4041 /*
4042 4042 * This hold on fnp is released below later,
4043 4043 * in case this is not the fnp we want.
4044 4044 */
4045 4045 fn_hold(fnp);
4046 4046
4047 4047 if (fnp->fn_sfh == sfh) {
4048 4048 /*
4049 4049 * We have found our entry.
4050 4050 * put an hold and return it.
4051 4051 */
4052 4052 mutex_exit(&parent->fn_lock);
4053 4053 return (fnp);
4054 4054 }
4055 4055
4056 4056 /*
4057 4057 * We have found an entry that has a mismatching
4058 4058 * fn_sfh. This could be a stale entry due to
4059 4059 * server side rename. We will remove this entry
4060 4060 * and make sure no such entries exist.
4061 4061 */
4062 4062 mutex_exit(&parent->fn_lock);
4063 4063 mutex_enter(&fnp->fn_lock);
4064 4064 if (fnp->fn_parent == parent) {
4065 4065 /*
4066 4066 * Remove ourselves from parent's
4067 4067 * fn_children tree.
4068 4068 */
4069 4069 mutex_enter(&parent->fn_lock);
4070 4070 avl_remove(&parent->fn_children, fnp);
4071 4071 mutex_exit(&parent->fn_lock);
4072 4072 fn_rele(&fnp->fn_parent);
4073 4073 }
4074 4074 mutex_exit(&fnp->fn_lock);
4075 4075 fn_rele(&fnp);
4076 4076 goto again;
4077 4077 }
4078 4078 }
4079 4079
4080 4080 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4081 4081 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4082 4082 fnp->fn_parent = parent;
4083 4083 if (parent != NULL)
4084 4084 fn_hold(parent);
4085 4085 fnp->fn_len = strlen(name);
4086 4086 ASSERT(fnp->fn_len < MAXNAMELEN);
4087 4087 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4088 4088 (void) strcpy(fnp->fn_name, name);
4089 4089 fnp->fn_refcnt = 1;
4090 4090
4091 4091 /*
4092 4092 * This hold on sfh is later released
4093 4093 * when we do the final fn_rele() on this fname.
4094 4094 */
4095 4095 sfh4_hold(sfh);
4096 4096 fnp->fn_sfh = sfh;
4097 4097
4098 4098 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4099 4099 offsetof(nfs4_fname_t, fn_tree));
4100 4100 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4101 4101 "fn_get %p:%s, a new nfs4_fname_t!",
4102 4102 (void *)fnp, fnp->fn_name));
4103 4103 if (parent != NULL) {
↓ open down ↓ |
934 lines elided |
↑ open up ↑ |
4104 4104 avl_insert(&parent->fn_children, fnp, where);
4105 4105 mutex_exit(&parent->fn_lock);
4106 4106 }
4107 4107
4108 4108 return (fnp);
4109 4109 }
4110 4110
4111 4111 void
4112 4112 fn_hold(nfs4_fname_t *fnp)
4113 4113 {
4114 - atomic_add_32(&fnp->fn_refcnt, 1);
4114 + atomic_inc_32(&fnp->fn_refcnt);
4115 4115 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4116 4116 "fn_hold %p:%s, new refcnt=%d",
4117 4117 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4118 4118 }
4119 4119
4120 4120 /*
4121 4121 * Decrement the reference count of the given fname, and destroy it if its
4122 4122 * reference count goes to zero. Nulls out the given pointer.
4123 4123 */
4124 4124
4125 4125 void
4126 4126 fn_rele(nfs4_fname_t **fnpp)
4127 4127 {
4128 4128 nfs4_fname_t *parent;
4129 4129 uint32_t newref;
↓ open down ↓ |
5 lines elided |
↑ open up ↑ |
4130 4130 nfs4_fname_t *fnp;
4131 4131
4132 4132 recur:
4133 4133 fnp = *fnpp;
4134 4134 *fnpp = NULL;
4135 4135
4136 4136 mutex_enter(&fnp->fn_lock);
4137 4137 parent = fnp->fn_parent;
4138 4138 if (parent != NULL)
4139 4139 mutex_enter(&parent->fn_lock); /* prevent new references */
4140 - newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
4140 + newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4141 4141 if (newref > 0) {
4142 4142 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4143 4143 "fn_rele %p:%s, new refcnt=%d",
4144 4144 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4145 4145 if (parent != NULL)
4146 4146 mutex_exit(&parent->fn_lock);
4147 4147 mutex_exit(&fnp->fn_lock);
4148 4148 return;
4149 4149 }
4150 4150
4151 4151 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4152 4152 "fn_rele %p:%s, last reference, deleting...",
4153 4153 (void *)fnp, fnp->fn_name));
4154 4154 if (parent != NULL) {
4155 4155 avl_remove(&parent->fn_children, fnp);
4156 4156 mutex_exit(&parent->fn_lock);
4157 4157 }
4158 4158 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4159 4159 sfh4_rele(&fnp->fn_sfh);
4160 4160 mutex_destroy(&fnp->fn_lock);
4161 4161 avl_destroy(&fnp->fn_children);
4162 4162 kmem_free(fnp, sizeof (nfs4_fname_t));
4163 4163 /*
4164 4164 * Recursivly fn_rele the parent.
4165 4165 * Use goto instead of a recursive call to avoid stack overflow.
4166 4166 */
4167 4167 if (parent != NULL) {
4168 4168 fnpp = &parent;
4169 4169 goto recur;
4170 4170 }
4171 4171 }
4172 4172
4173 4173 /*
4174 4174 * Returns the single component name of the given fname, in a MAXNAMELEN
4175 4175 * string buffer, which the caller is responsible for freeing. Note that
4176 4176 * the name may become invalid as a result of fn_move().
4177 4177 */
4178 4178
4179 4179 char *
4180 4180 fn_name(nfs4_fname_t *fnp)
4181 4181 {
4182 4182 char *name;
4183 4183
4184 4184 ASSERT(fnp->fn_len < MAXNAMELEN);
4185 4185 name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4186 4186 mutex_enter(&fnp->fn_lock);
4187 4187 (void) strcpy(name, fnp->fn_name);
4188 4188 mutex_exit(&fnp->fn_lock);
4189 4189
4190 4190 return (name);
4191 4191 }
4192 4192
4193 4193
4194 4194 /*
4195 4195 * fn_path_realloc
4196 4196 *
4197 4197 * This function, used only by fn_path, constructs
4198 4198 * a new string which looks like "prepend" + "/" + "current".
4199 4199 * by allocating a new string and freeing the old one.
4200 4200 */
4201 4201 static void
4202 4202 fn_path_realloc(char **curses, char *prepend)
4203 4203 {
4204 4204 int len, curlen = 0;
4205 4205 char *news;
4206 4206
4207 4207 if (*curses == NULL) {
4208 4208 /*
4209 4209 * Prime the pump, allocate just the
4210 4210 * space for prepend and return that.
4211 4211 */
4212 4212 len = strlen(prepend) + 1;
4213 4213 news = kmem_alloc(len, KM_SLEEP);
4214 4214 (void) strncpy(news, prepend, len);
4215 4215 } else {
4216 4216 /*
4217 4217 * Allocate the space for a new string
4218 4218 * +1 +1 is for the "/" and the NULL
4219 4219 * byte at the end of it all.
4220 4220 */
4221 4221 curlen = strlen(*curses);
4222 4222 len = curlen + strlen(prepend) + 1 + 1;
4223 4223 news = kmem_alloc(len, KM_SLEEP);
4224 4224 (void) strncpy(news, prepend, len);
4225 4225 (void) strcat(news, "/");
4226 4226 (void) strcat(news, *curses);
4227 4227 kmem_free(*curses, curlen + 1);
4228 4228 }
4229 4229 *curses = news;
4230 4230 }
4231 4231
4232 4232 /*
4233 4233 * Returns the path name (starting from the fs root) for the given fname.
4234 4234 * The caller is responsible for freeing. Note that the path may be or
4235 4235 * become invalid as a result of fn_move().
4236 4236 */
4237 4237
4238 4238 char *
4239 4239 fn_path(nfs4_fname_t *fnp)
4240 4240 {
4241 4241 char *path;
4242 4242 nfs4_fname_t *nextfnp;
4243 4243
4244 4244 if (fnp == NULL)
4245 4245 return (NULL);
4246 4246
4247 4247 path = NULL;
4248 4248
4249 4249 /* walk up the tree constructing the pathname. */
4250 4250
4251 4251 fn_hold(fnp); /* adjust for later rele */
4252 4252 do {
4253 4253 mutex_enter(&fnp->fn_lock);
4254 4254 /*
4255 4255 * Add fn_name in front of the current path
4256 4256 */
4257 4257 fn_path_realloc(&path, fnp->fn_name);
4258 4258 nextfnp = fnp->fn_parent;
4259 4259 if (nextfnp != NULL)
4260 4260 fn_hold(nextfnp);
4261 4261 mutex_exit(&fnp->fn_lock);
4262 4262 fn_rele(&fnp);
4263 4263 fnp = nextfnp;
4264 4264 } while (fnp != NULL);
4265 4265
4266 4266 return (path);
4267 4267 }
4268 4268
4269 4269 /*
4270 4270 * Return a reference to the parent of the given fname, which the caller is
4271 4271 * responsible for eventually releasing.
4272 4272 */
4273 4273
4274 4274 nfs4_fname_t *
4275 4275 fn_parent(nfs4_fname_t *fnp)
4276 4276 {
4277 4277 nfs4_fname_t *parent;
4278 4278
4279 4279 mutex_enter(&fnp->fn_lock);
4280 4280 parent = fnp->fn_parent;
4281 4281 if (parent != NULL)
4282 4282 fn_hold(parent);
4283 4283 mutex_exit(&fnp->fn_lock);
4284 4284
4285 4285 return (parent);
4286 4286 }
4287 4287
4288 4288 /*
4289 4289 * Update fnp so that its parent is newparent and its name is newname.
4290 4290 */
4291 4291
4292 4292 void
4293 4293 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4294 4294 {
4295 4295 nfs4_fname_t *parent, *tmpfnp;
4296 4296 ssize_t newlen;
4297 4297 nfs4_fname_t key;
4298 4298 avl_index_t where;
4299 4299
4300 4300 /*
4301 4301 * This assert exists to catch the client trying to rename
4302 4302 * a dir to be a child of itself. This happened at a recent
4303 4303 * bakeoff against a 3rd party (broken) server which allowed
4304 4304 * the rename to succeed. If it trips it means that:
4305 4305 * a) the code in nfs4rename that detects this case is broken
4306 4306 * b) the server is broken (since it allowed the bogus rename)
4307 4307 *
4308 4308 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4309 4309 * panic below from: mutex_enter(&newparent->fn_lock);
4310 4310 */
4311 4311 ASSERT(fnp != newparent);
4312 4312
4313 4313 /*
4314 4314 * Remove fnp from its current parent, change its name, then add it
4315 4315 * to newparent. It might happen that fnp was replaced by another
4316 4316 * nfs4_fname_t with the same fn_name in parent->fn_children.
4317 4317 * In such case, fnp->fn_parent is NULL and we skip the removal
4318 4318 * of fnp from its current parent.
4319 4319 */
4320 4320 mutex_enter(&fnp->fn_lock);
4321 4321 parent = fnp->fn_parent;
4322 4322 if (parent != NULL) {
4323 4323 mutex_enter(&parent->fn_lock);
4324 4324 avl_remove(&parent->fn_children, fnp);
4325 4325 mutex_exit(&parent->fn_lock);
4326 4326 fn_rele(&fnp->fn_parent);
4327 4327 }
4328 4328
4329 4329 newlen = strlen(newname);
4330 4330 if (newlen != fnp->fn_len) {
4331 4331 ASSERT(newlen < MAXNAMELEN);
4332 4332 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4333 4333 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4334 4334 fnp->fn_len = newlen;
4335 4335 }
4336 4336 (void) strcpy(fnp->fn_name, newname);
4337 4337
4338 4338 again:
4339 4339 mutex_enter(&newparent->fn_lock);
4340 4340 key.fn_name = fnp->fn_name;
4341 4341 tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4342 4342 if (tmpfnp != NULL) {
4343 4343 /*
4344 4344 * This could be due to a file that was unlinked while
4345 4345 * open, or perhaps the rnode is in the free list. Remove
4346 4346 * it from newparent and let it go away on its own. The
4347 4347 * contorted code is to deal with lock order issues and
4348 4348 * race conditions.
4349 4349 */
4350 4350 fn_hold(tmpfnp);
4351 4351 mutex_exit(&newparent->fn_lock);
4352 4352 mutex_enter(&tmpfnp->fn_lock);
4353 4353 if (tmpfnp->fn_parent == newparent) {
4354 4354 mutex_enter(&newparent->fn_lock);
4355 4355 avl_remove(&newparent->fn_children, tmpfnp);
4356 4356 mutex_exit(&newparent->fn_lock);
4357 4357 fn_rele(&tmpfnp->fn_parent);
4358 4358 }
4359 4359 mutex_exit(&tmpfnp->fn_lock);
4360 4360 fn_rele(&tmpfnp);
4361 4361 goto again;
4362 4362 }
4363 4363 fnp->fn_parent = newparent;
4364 4364 fn_hold(newparent);
4365 4365 avl_insert(&newparent->fn_children, fnp, where);
4366 4366 mutex_exit(&newparent->fn_lock);
4367 4367 mutex_exit(&fnp->fn_lock);
4368 4368 }
4369 4369
#ifdef DEBUG
/*
 * Sanity-check the type information of the given vnode.  Always returns
 * 1.  If nfs4_vtype_debug is set and the vnode's v_type and the rnode's
 * cached attribute type are both known (not VNON) yet differ, panic
 * instead.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	if (!nfs4_vtype_debug)
		return (1);

	/* A type of VNON on either side means there is nothing to check. */
	if (vp->v_type == VNON || rp->r_attr.va_type == VNON)
		return (1);

	if (vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
↓ open down ↓ |
239 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX