1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 #pragma ident "%Z%%M% %I% %E% SMI" 31 32 #include <sys/flock_impl.h> 33 #include <sys/vfs.h> 34 #include <sys/t_lock.h> /* for <sys/callb.h> */ 35 #include <sys/callb.h> 36 #include <sys/clconf.h> 37 #include <sys/cladm.h> 38 #include <sys/nbmlock.h> 39 #include <sys/cred.h> 40 #include <sys/policy.h> 41 42 /* 43 * The following four variables are for statistics purposes and they are 44 * not protected by locks. They may not be accurate but will at least be 45 * close to the actual value. 46 */ 47 48 int flk_lock_allocs; 49 int flk_lock_frees; 50 int edge_allocs; 51 int edge_frees; 52 int flk_proc_vertex_allocs; 53 int flk_proc_edge_allocs; 54 int flk_proc_vertex_frees; 55 int flk_proc_edge_frees; 56 57 static kmutex_t flock_lock; 58 59 #ifdef DEBUG 60 int check_debug = 0; 61 #define CHECK_ACTIVE_LOCKS(gp) if (check_debug) \ 62 check_active_locks(gp); 63 #define CHECK_SLEEPING_LOCKS(gp) if (check_debug) \ 64 check_sleeping_locks(gp); 65 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) \ 66 if (check_debug) \ 67 check_owner_locks(gp, pid, sysid, vp); 68 #define CHECK_LOCK_TRANSITION(old_state, new_state) \ 69 { \ 70 if (check_lock_transition(old_state, new_state)) { \ 71 cmn_err(CE_PANIC, "Illegal lock transition \ 72 from %d to %d", old_state, new_state); \ 73 } \ 74 } 75 #else 76 77 #define CHECK_ACTIVE_LOCKS(gp) 78 #define CHECK_SLEEPING_LOCKS(gp) 79 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 80 #define CHECK_LOCK_TRANSITION(old_state, new_state) 81 82 #endif /* DEBUG */ 83 84 struct kmem_cache *flk_edge_cache; 85 86 graph_t *lock_graph[HASH_SIZE]; 87 proc_graph_t pgraph; 88 89 /* 90 * Clustering. 91 * 92 * NLM REGISTRY TYPE IMPLEMENTATION 93 * 94 * Assumptions: 95 * 1. Nodes in a cluster are numbered starting at 1; always non-negative 96 * integers; maximum node id is returned by clconf_maximum_nodeid(). 97 * 2. We use this node id to identify the node an NLM server runs on. 98 */ 99 100 /* 101 * NLM registry object keeps track of NLM servers via their 102 * nlmids (which are the node ids of the node in the cluster they run on) 103 * that have requested locks at this LLM with which this registry is 104 * associated. 105 * 106 * Representation of abstraction: 107 * rep = record[ states: array[nlm_state], 108 * lock: mutex] 109 * 110 * Representation invariants: 111 * 1. 
index i of rep.states is between 0 and n - 1 where n is the number 112 * of elements in the array, which happens to be the maximum number 113 * of nodes in the cluster configuration + 1. 114 * 2. map nlmid to index i of rep.states 115 * 0 -> 0 116 * 1 -> 1 117 * 2 -> 2 118 * n-1 -> clconf_maximum_nodeid()+1 119 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting 120 * from forgetting to subtract 1 from the index. 121 * 4. The reason we keep the 0th index is the following. A legitimate 122 * cluster configuration includes making a UFS file system NFS 123 * exportable. The code is structured so that if you're in a cluster 124 * you do one thing; otherwise, you do something else. The problem 125 * is what to do if you think you're in a cluster with PXFS loaded, 126 * but you're using UFS not PXFS? The upper two bytes of the sysid 127 * encode the node id of the node where NLM server runs; these bytes 128 * are zero for UFS. Since the nodeid is used to index into the 129 * registry, we can record the NLM server state information at index 130 * 0 using the same mechanism used for PXFS file locks! 131 */ 132 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */ 133 static kmutex_t nlm_reg_lock; /* lock to protect array */ 134 static uint_t nlm_status_size; /* size of state array */ 135 136 /* 137 * Although we need a global lock dependency graph (and associated data 138 * structures), we also need a per-zone notion of whether the lock manager is 139 * running, and so whether to allow lock manager requests or not. 140 * 141 * Thus, on a per-zone basis we maintain a ``global'' variable 142 * (flk_lockmgr_status), protected by flock_lock, and set when the lock 143 * manager is determined to be changing state (starting or stopping). 144 * 145 * Each graph/zone pair also has a copy of this variable, which is protected by 146 * the graph's mutex. 147 * 148 * The per-graph copies are used to synchronize lock requests with shutdown 149 * requests. The global copy is used to initialize the per-graph field when a 150 * new graph is created.
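 *
 * A minimal sketch of the intended protocol (the state-change code
 * elsewhere in the lock manager is more careful about NULL graphs and
 * about waking sleepers; this is only an illustration, with fg being
 * the zone's struct flock_globals):
 *
 *	mutex_enter(&flock_lock);
 *	fg->flk_lockmgr_status = FLK_LOCKMGR_DOWN;	(global copy)
 *	mutex_exit(&flock_lock);
 *
 *	for (i = 0; i < HASH_SIZE; i++) {
 *		if (lock_graph[i] == NULL)
 *			continue;
 *		mutex_enter(&lock_graph[i]->gp_mutex);
 *		fg->lockmgr_status[i] = fg->flk_lockmgr_status;
 *		mutex_exit(&lock_graph[i]->gp_mutex);
 *	}
 *
 * The sleeping-lock path then only needs the per-graph copy, read
 * under gp_mutex, to decide whether to proceed or fail with ENOLCK.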
151 */ 152 struct flock_globals { 153 flk_lockmgr_status_t flk_lockmgr_status; 154 flk_lockmgr_status_t lockmgr_status[HASH_SIZE]; 155 }; 156 157 zone_key_t flock_zone_key; 158 159 static void create_flock(lock_descriptor_t *, flock64_t *); 160 static lock_descriptor_t *flk_get_lock(void); 161 static void flk_free_lock(lock_descriptor_t *lock); 162 static void flk_get_first_blocking_lock(lock_descriptor_t *request); 163 static int flk_process_request(lock_descriptor_t *); 164 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int); 165 static edge_t *flk_get_edge(void); 166 static int flk_wait_execute_request(lock_descriptor_t *); 167 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *); 168 static void flk_insert_active_lock(lock_descriptor_t *); 169 static void flk_delete_active_lock(lock_descriptor_t *, int); 170 static void flk_insert_sleeping_lock(lock_descriptor_t *); 171 static void flk_graph_uncolor(graph_t *); 172 static void flk_wakeup(lock_descriptor_t *, int); 173 static void flk_free_edge(edge_t *); 174 static void flk_recompute_dependencies(lock_descriptor_t *, 175 lock_descriptor_t **, int, int); 176 static int flk_find_barriers(lock_descriptor_t *); 177 static void flk_update_barriers(lock_descriptor_t *); 178 static int flk_color_reachables(lock_descriptor_t *); 179 static int flk_canceled(lock_descriptor_t *); 180 static void flk_delete_locks_by_sysid(lock_descriptor_t *); 181 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *); 182 static void wait_for_lock(lock_descriptor_t *); 183 static void unlock_lockmgr_granted(struct flock_globals *); 184 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *); 185 186 /* Clustering hooks */ 187 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t); 188 static void cl_flk_wakeup_sleeping_nlm_locks(int); 189 static void cl_flk_unlock_nlm_granted(int); 190 191 #ifdef DEBUG 192 static int check_lock_transition(int, int); 193 static void check_sleeping_locks(graph_t *); 194 static void check_active_locks(graph_t *); 195 static int no_path(lock_descriptor_t *, lock_descriptor_t *); 196 static void path(lock_descriptor_t *, lock_descriptor_t *); 197 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *); 198 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *); 199 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int); 200 #endif 201 202 /* proc_graph function definitions */ 203 static int flk_check_deadlock(lock_descriptor_t *); 204 static void flk_proc_graph_uncolor(void); 205 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *); 206 static proc_edge_t *flk_get_proc_edge(void); 207 static void flk_proc_release(proc_vertex_t *); 208 static void flk_free_proc_edge(proc_edge_t *); 209 static void flk_update_proc_graph(edge_t *, int); 210 211 /* Non-blocking mandatory locking */ 212 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t, 213 u_offset_t); 214 215 static struct flock_globals * 216 flk_get_globals(void) 217 { 218 /* 219 * The KLM module had better be loaded if we're attempting to handle 220 * lockmgr requests. 
221 */ 222 ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED); 223 return (zone_getspecific(flock_zone_key, curproc->p_zone)); 224 } 225 226 static flk_lockmgr_status_t 227 flk_get_lockmgr_status(void) 228 { 229 struct flock_globals *fg; 230 231 ASSERT(MUTEX_HELD(&flock_lock)); 232 233 if (flock_zone_key == ZONE_KEY_UNINITIALIZED) { 234 /* 235 * KLM module not loaded; lock manager definitely not running. 236 */ 237 return (FLK_LOCKMGR_DOWN); 238 } 239 fg = flk_get_globals(); 240 return (fg->flk_lockmgr_status); 241 } 242 243 /* 244 * Routine called from fs_frlock in fs/fs_subr.c 245 */ 246 247 int 248 reclock(vnode_t *vp, 249 flock64_t *lckdat, 250 int cmd, 251 int flag, 252 u_offset_t offset, 253 flk_callback_t *flk_cbp) 254 { 255 lock_descriptor_t stack_lock_request; 256 lock_descriptor_t *lock_request; 257 int error = 0; 258 graph_t *gp; 259 int nlmid; 260 261 /* 262 * Check access permissions 263 */ 264 if ((cmd & SETFLCK) && 265 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) || 266 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0))) 267 return (EBADF); 268 269 /* 270 * for query and unlock we use the stack_lock_request 271 */ 272 273 if ((lckdat->l_type == F_UNLCK) || 274 !((cmd & INOFLCK) || (cmd & SETFLCK))) { 275 lock_request = &stack_lock_request; 276 (void) bzero((caddr_t)lock_request, 277 sizeof (lock_descriptor_t)); 278 279 /* 280 * following is added to make the assertions in 281 * flk_execute_request() to pass through 282 */ 283 284 lock_request->l_edge.edge_in_next = &lock_request->l_edge; 285 lock_request->l_edge.edge_in_prev = &lock_request->l_edge; 286 lock_request->l_edge.edge_adj_next = &lock_request->l_edge; 287 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge; 288 lock_request->l_status = FLK_INITIAL_STATE; 289 } else { 290 lock_request = flk_get_lock(); 291 } 292 lock_request->l_state = 0; 293 lock_request->l_vnode = vp; 294 lock_request->l_zoneid = getzoneid(); 295 296 /* 297 * Convert the request range into the canonical start and end 298 * values. The NLM protocol supports locking over the entire 299 * 32-bit range, so there's no range checking for remote requests, 300 * but we still need to verify that local requests obey the rules. 301 */ 302 /* Clustering */ 303 if ((cmd & (RCMDLCK | PCMDLCK)) != 0) { 304 ASSERT(lckdat->l_whence == 0); 305 lock_request->l_start = lckdat->l_start; 306 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T : 307 lckdat->l_start + (lckdat->l_len - 1); 308 } else { 309 /* check the validity of the lock range */ 310 error = flk_convert_lock_data(vp, lckdat, 311 &lock_request->l_start, &lock_request->l_end, 312 offset); 313 if (error) { 314 goto done; 315 } 316 error = flk_check_lock_data(lock_request->l_start, 317 lock_request->l_end, MAXEND); 318 if (error) { 319 goto done; 320 } 321 } 322 323 ASSERT(lock_request->l_end >= lock_request->l_start); 324 325 lock_request->l_type = lckdat->l_type; 326 if (cmd & INOFLCK) 327 lock_request->l_state |= IO_LOCK; 328 if (cmd & SLPFLCK) 329 lock_request->l_state |= WILLING_TO_SLEEP_LOCK; 330 if (cmd & RCMDLCK) 331 lock_request->l_state |= LOCKMGR_LOCK; 332 if (cmd & NBMLCK) 333 lock_request->l_state |= NBMAND_LOCK; 334 /* 335 * Clustering: set flag for PXFS locks 336 * We do not _only_ check for the PCMDLCK flag because PXFS locks could 337 * also be of type 'RCMDLCK'. 338 * We do not _only_ check the GETPXFSID() macro because local PXFS 339 * clients use a pxfsid of zero to permit deadlock detection in the LLM. 
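 *
 * As an illustration of the sysid layout assumed here (see the NLM
 * registry comment near the top of this file: the upper two bytes of
 * the sysid carry the node id), a hypothetical remote request might
 * decompose as
 *
 *	l_sysid = 0x00020007
 *	GETNLMID(l_sysid) -> 2		NLM server runs on node 2
 *
 * while GETPXFSID() pulls the PXFS identity out of the same sysid and
 * is zero both for UFS and for local PXFS clients, which is why the
 * PCMDLCK flag is also checked below.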
340 */ 341 342 if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) { 343 lock_request->l_state |= PXFS_LOCK; 344 } 345 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) { 346 if (lock_request->l_type == F_RDLCK || 347 lock_request->l_type == F_WRLCK) 348 lock_request->l_state |= QUERY_LOCK; 349 } 350 lock_request->l_flock = (*lckdat); 351 lock_request->l_callbacks = flk_cbp; 352 353 /* 354 * We are ready for processing the request 355 */ 356 if (IS_LOCKMGR(lock_request)) { 357 /* 358 * If the lock request is an NLM server request .... 359 */ 360 if (nlm_status_size == 0) { /* not booted as cluster */ 361 mutex_enter(&flock_lock); 362 /* 363 * Bail out if this is a lock manager request and the 364 * lock manager is not supposed to be running. 365 */ 366 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) { 367 mutex_exit(&flock_lock); 368 error = ENOLCK; 369 goto done; 370 } 371 mutex_exit(&flock_lock); 372 } else { /* booted as a cluster */ 373 nlmid = GETNLMID(lock_request->l_flock.l_sysid); 374 ASSERT(nlmid <= nlm_status_size && nlmid >= 0); 375 376 mutex_enter(&nlm_reg_lock); 377 /* 378 * If the NLM registry does not know about this 379 * NLM server making the request, add its nlmid 380 * to the registry. 381 */ 382 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, 383 nlmid)) { 384 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid); 385 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status, 386 nlmid)) { 387 /* 388 * If the NLM server is already known (has made 389 * previous lock requests) and its state is 390 * not NLM_UP (means that NLM server is 391 * shutting down), then bail out with an 392 * error to deny the lock request. 393 */ 394 mutex_exit(&nlm_reg_lock); 395 error = ENOLCK; 396 goto done; 397 } 398 mutex_exit(&nlm_reg_lock); 399 } 400 } 401 402 /* Now get the lock graph for a particular vnode */ 403 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH); 404 405 /* 406 * We drop rwlock here otherwise this might end up causing a 407 * deadlock if this IOLOCK sleeps. (bugid # 1183392). 408 */ 409 410 if (IS_IO_LOCK(lock_request)) { 411 VOP_RWUNLOCK(vp, 412 (lock_request->l_type == F_RDLCK) ? 413 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL); 414 } 415 mutex_enter(&gp->gp_mutex); 416 417 lock_request->l_state |= REFERENCED_LOCK; 418 lock_request->l_graph = gp; 419 420 switch (lock_request->l_type) { 421 case F_RDLCK: 422 case F_WRLCK: 423 if (IS_QUERY_LOCK(lock_request)) { 424 flk_get_first_blocking_lock(lock_request); 425 (*lckdat) = lock_request->l_flock; 426 break; 427 } 428 429 /* process the request now */ 430 431 error = flk_process_request(lock_request); 432 break; 433 434 case F_UNLCK: 435 /* unlock request will not block so execute it immediately */ 436 437 if (IS_LOCKMGR(lock_request) && 438 flk_canceled(lock_request)) { 439 error = 0; 440 } else { 441 error = flk_execute_request(lock_request); 442 } 443 break; 444 445 case F_UNLKSYS: 446 /* 447 * Recovery mechanism to release lock manager locks when 448 * NFS client crashes and restart. NFS server will clear 449 * old locks and grant new locks. 
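 *
 * A hedged sketch of the caller's side (the actual call site lives in
 * the NFS/NLM code and is assumed here, not defined by this file): the
 * server fills in a flock64_t naming the rebooted client and hands it
 * down through VOP_FRLOCK(), which reaches this switch via
 * fs_frlock()/reclock():
 *
 *	flock64_t fl;
 *
 *	bzero(&fl, sizeof (fl));
 *	fl.l_type = F_UNLKSYS;
 *	fl.l_whence = 0;
 *	fl.l_sysid = crashed_sysid;	(hypothetical variable)
 *
 * l_sysid must be nonzero and the caller must pass the secpolicy_nfs()
 * check below, otherwise the request fails with EINVAL or EPERM.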
450 */ 451 452 if (lock_request->l_flock.l_sysid == 0) { 453 mutex_exit(&gp->gp_mutex); 454 return (EINVAL); 455 } 456 if (secpolicy_nfs(CRED()) != 0) { 457 mutex_exit(&gp->gp_mutex); 458 return (EPERM); 459 } 460 flk_delete_locks_by_sysid(lock_request); 461 lock_request->l_state &= ~REFERENCED_LOCK; 462 flk_set_state(lock_request, FLK_DEAD_STATE); 463 flk_free_lock(lock_request); 464 mutex_exit(&gp->gp_mutex); 465 return (0); 466 467 default: 468 error = EINVAL; 469 break; 470 } 471 472 /* Clustering: For blocked PXFS locks, return */ 473 if (error == PXFS_LOCK_BLOCKED) { 474 lock_request->l_state &= ~REFERENCED_LOCK; 475 mutex_exit(&gp->gp_mutex); 476 return (error); 477 } 478 479 /* 480 * Now that we have seen the status of locks in the system for 481 * this vnode we acquire the rwlock if it is an IO_LOCK. 482 */ 483 484 if (IS_IO_LOCK(lock_request)) { 485 (void) VOP_RWLOCK(vp, 486 (lock_request->l_type == F_RDLCK) ? 487 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL); 488 if (!error) { 489 lckdat->l_type = F_UNLCK; 490 491 /* 492 * This wake up is needed otherwise 493 * if IO_LOCK has slept the dependents on this 494 * will not be woken up at all. (bugid # 1185482). 495 */ 496 497 flk_wakeup(lock_request, 1); 498 flk_set_state(lock_request, FLK_DEAD_STATE); 499 flk_free_lock(lock_request); 500 } 501 /* 502 * else if error had occurred either flk_process_request() 503 * has returned EDEADLK in which case there will be no 504 * dependents for this lock or EINTR from flk_wait_execute_ 505 * request() in which case flk_cancel_sleeping_lock() 506 * would have been done. same is true with EBADF. 507 */ 508 } 509 510 if (lock_request == &stack_lock_request) { 511 flk_set_state(lock_request, FLK_DEAD_STATE); 512 } else { 513 lock_request->l_state &= ~REFERENCED_LOCK; 514 if ((error != 0) || IS_DELETED(lock_request)) { 515 flk_set_state(lock_request, FLK_DEAD_STATE); 516 flk_free_lock(lock_request); 517 } 518 } 519 520 mutex_exit(&gp->gp_mutex); 521 return (error); 522 523 done: 524 flk_set_state(lock_request, FLK_DEAD_STATE); 525 if (lock_request != &stack_lock_request) 526 flk_free_lock(lock_request); 527 return (error); 528 } 529 530 /* 531 * Invoke the callbacks in the given list. If before sleeping, invoke in 532 * list order. If after sleeping, invoke in reverse order. 533 * 534 * CPR (suspend/resume) support: if one of the callbacks returns a 535 * callb_cpr_t, return it. This will be used to make the thread CPR-safe 536 * while it is sleeping. There should be at most one callb_cpr_t for the 537 * thread. 538 * XXX This is unnecessarily complicated. The CPR information should just 539 * get passed in directly through VOP_FRLOCK and reclock, rather than 540 * sneaking it in via a callback. 
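 *
 * For example, with a callback list A -> B -> C, the callbacks run as
 *
 *	FLK_BEFORE_SLEEP:	A, B, C		(list order)
 *	FLK_AFTER_SLEEP:	C, B, A		(reverse order)
 *
 * and if, say, B returns a callb_cpr_t pointer, that pointer is what
 * this routine hands back so the sleeping thread can be made CPR-safe.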
541 */ 542 543 callb_cpr_t * 544 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when) 545 { 546 callb_cpr_t *cpr_callbackp = NULL; 547 callb_cpr_t *one_result; 548 flk_callback_t *cb; 549 550 if (cblist == NULL) 551 return (NULL); 552 553 if (when == FLK_BEFORE_SLEEP) { 554 cb = cblist; 555 do { 556 one_result = (*cb->cb_callback)(when, cb->cb_data); 557 if (one_result != NULL) { 558 ASSERT(cpr_callbackp == NULL); 559 cpr_callbackp = one_result; 560 } 561 cb = cb->cb_next; 562 } while (cb != cblist); 563 } else { 564 cb = cblist->cb_prev; 565 do { 566 one_result = (*cb->cb_callback)(when, cb->cb_data); 567 if (one_result != NULL) { 568 cpr_callbackp = one_result; 569 } 570 cb = cb->cb_prev; 571 } while (cb != cblist->cb_prev); 572 } 573 574 return (cpr_callbackp); 575 } 576 577 /* 578 * Initialize a flk_callback_t to hold the given callback. 579 */ 580 581 void 582 flk_init_callback(flk_callback_t *flk_cb, 583 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata) 584 { 585 flk_cb->cb_next = flk_cb; 586 flk_cb->cb_prev = flk_cb; 587 flk_cb->cb_callback = cb_fcn; 588 flk_cb->cb_data = cbdata; 589 } 590 591 /* 592 * Initialize an flk_callback_t and then link it into the head of an 593 * existing list (which may be NULL). 594 */ 595 596 void 597 flk_add_callback(flk_callback_t *newcb, 598 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), 599 void *cbdata, flk_callback_t *cblist) 600 { 601 flk_init_callback(newcb, cb_fcn, cbdata); 602 603 if (cblist == NULL) 604 return; 605 606 newcb->cb_prev = cblist->cb_prev; 607 newcb->cb_next = cblist; 608 cblist->cb_prev->cb_next = newcb; 609 cblist->cb_prev = newcb; 610 } 611 612 /* 613 * Initialize the flk_edge_cache data structure and create the 614 * nlm_reg_status array. 615 */ 616 617 void 618 flk_init(void) 619 { 620 uint_t i; 621 622 flk_edge_cache = kmem_cache_create("flk_edges", 623 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0); 624 if (flk_edge_cache == NULL) { 625 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n"); 626 } 627 /* 628 * Create the NLM registry object. 629 */ 630 631 if (cluster_bootflags & CLUSTER_BOOTED) { 632 /* 633 * This routine tells you the maximum node id that will be used 634 * in the cluster. This number will be the size of the nlm 635 * registry status array. We add 1 because we will be using 636 * all entries indexed from 0 to maxnodeid; e.g., from 0 637 * to 64, for a total of 65 entries. 638 */ 639 nlm_status_size = clconf_maximum_nodeid() + 1; 640 } else { 641 nlm_status_size = 0; 642 } 643 644 if (nlm_status_size != 0) { /* booted as a cluster */ 645 nlm_reg_status = (flk_nlm_status_t *) 646 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size, 647 KM_SLEEP); 648 649 /* initialize all NLM states in array to NLM_UNKNOWN */ 650 for (i = 0; i < nlm_status_size; i++) { 651 nlm_reg_status[i] = FLK_NLM_UNKNOWN; 652 } 653 } 654 } 655 656 /* 657 * Zone constructor/destructor callbacks to be executed when a zone is 658 * created/destroyed. 
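 *
 * A sketch of how these are expected to be hooked up; the actual
 * registration is done outside this file (presumably when the KLM
 * module loads), so the call below is illustrative only:
 *
 *	zone_key_create(&flock_zone_key, flk_zone_init, NULL,
 *	    flk_zone_fini);
 *
 * After that, zone_getspecific(flock_zone_key, zone) returns the
 * per-zone struct flock_globals allocated by flk_zone_init().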
659 */ 660 /* ARGSUSED */ 661 void * 662 flk_zone_init(zoneid_t zoneid) 663 { 664 struct flock_globals *fg; 665 uint_t i; 666 667 fg = kmem_alloc(sizeof (*fg), KM_SLEEP); 668 fg->flk_lockmgr_status = FLK_LOCKMGR_UP; 669 for (i = 0; i < HASH_SIZE; i++) 670 fg->lockmgr_status[i] = FLK_LOCKMGR_UP; 671 return (fg); 672 } 673 674 /* ARGSUSED */ 675 void 676 flk_zone_fini(zoneid_t zoneid, void *data) 677 { 678 struct flock_globals *fg = data; 679 680 kmem_free(fg, sizeof (*fg)); 681 } 682 683 /* 684 * Get a lock_descriptor structure with initialization of edge lists. 685 */ 686 687 static lock_descriptor_t * 688 flk_get_lock(void) 689 { 690 lock_descriptor_t *l; 691 692 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP); 693 694 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL); 695 l->l_edge.edge_in_next = &l->l_edge; 696 l->l_edge.edge_in_prev = &l->l_edge; 697 l->l_edge.edge_adj_next = &l->l_edge; 698 l->l_edge.edge_adj_prev = &l->l_edge; 699 l->pvertex = -1; 700 l->l_status = FLK_INITIAL_STATE; 701 flk_lock_allocs++; 702 return (l); 703 } 704 705 /* 706 * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag 707 * when some thread has a reference to it as in reclock(). 708 */ 709 710 void 711 flk_free_lock(lock_descriptor_t *lock) 712 { 713 ASSERT(IS_DEAD(lock)); 714 if (IS_REFERENCED(lock)) { 715 lock->l_state |= DELETED_LOCK; 716 return; 717 } 718 flk_lock_frees++; 719 kmem_free((void *)lock, sizeof (lock_descriptor_t)); 720 } 721 722 void 723 flk_set_state(lock_descriptor_t *lock, int new_state) 724 { 725 /* 726 * Locks in the sleeping list may be woken up in a number of ways, 727 * and more than once. If a sleeping lock is signaled awake more 728 * than once, then it may or may not change state depending on its 729 * current state. 730 * Also note that NLM locks that are sleeping could be moved to an 731 * interrupted state more than once if the unlock request is 732 * retransmitted by the NLM client - the second time around, this is 733 * just a nop. 734 * The ordering of being signaled awake is: 735 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE. 736 * The checks below implement this ordering. 737 */ 738 if (IS_INTERRUPTED(lock)) { 739 if ((new_state == FLK_CANCELLED_STATE) || 740 (new_state == FLK_GRANTED_STATE) || 741 (new_state == FLK_INTERRUPTED_STATE)) { 742 return; 743 } 744 } 745 if (IS_CANCELLED(lock)) { 746 if ((new_state == FLK_GRANTED_STATE) || 747 (new_state == FLK_CANCELLED_STATE)) { 748 return; 749 } 750 } 751 CHECK_LOCK_TRANSITION(lock->l_status, new_state); 752 if (IS_PXFS(lock)) { 753 cl_flk_state_transition_notify(lock, lock->l_status, new_state); 754 } 755 lock->l_status = new_state; 756 } 757 758 /* 759 * Routine that checks whether there are any blocking locks in the system. 760 * 761 * The policy followed is if a write lock is sleeping we don't allow read 762 * locks before this write lock even though there may not be any active 763 * locks corresponding to the read locks' region. 764 * 765 * flk_add_edge() function adds an edge between l1 and l2 iff there 766 * is no path between l1 and l2. This is done to have a "minimum 767 * storage representation" of the dependency graph. 768 * 769 * Another property of the graph is since only the new request throws 770 * edges to the existing locks in the graph, the graph is always topologically 771 * ordered. 
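 *
 * A small worked example of the fairness policy: an active read lock
 * R1 covers [0,99], a write request W1 for the same range is already
 * sleeping behind it, and a new read request R2 for [0,99] arrives.
 * R2 conflicts with no active lock, yet it is still made to wait: an
 * edge R2 -> W1 is added so the writer cannot be starved by a steady
 * stream of readers.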
772 */ 773 774 static int 775 flk_process_request(lock_descriptor_t *request) 776 { 777 graph_t *gp = request->l_graph; 778 lock_descriptor_t *lock; 779 int request_blocked_by_active = 0; 780 int request_blocked_by_granted = 0; 781 int request_blocked_by_sleeping = 0; 782 vnode_t *vp = request->l_vnode; 783 int error = 0; 784 int request_will_wait = 0; 785 int found_covering_lock = 0; 786 lock_descriptor_t *covered_by = NULL; 787 788 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 789 request_will_wait = IS_WILLING_TO_SLEEP(request); 790 791 /* 792 * check active locks 793 */ 794 795 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 796 797 798 if (lock) { 799 do { 800 if (BLOCKS(lock, request)) { 801 if (!request_will_wait) 802 return (EAGAIN); 803 request_blocked_by_active = 1; 804 break; 805 } 806 /* 807 * Grant lock if it is for the same owner holding active 808 * lock that covers the request. 809 */ 810 811 if (SAME_OWNER(lock, request) && 812 COVERS(lock, request) && 813 (request->l_type == F_RDLCK)) 814 return (flk_execute_request(request)); 815 lock = lock->l_next; 816 } while (lock->l_vnode == vp); 817 } 818 819 if (!request_blocked_by_active) { 820 lock_descriptor_t *lk[1]; 821 lock_descriptor_t *first_glock = NULL; 822 /* 823 * Shall we grant this?! NO!! 824 * What about those locks that were just granted and still 825 * in sleep queue. Those threads are woken up and so locks 826 * are almost active. 827 */ 828 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 829 if (lock) { 830 do { 831 if (BLOCKS(lock, request)) { 832 if (IS_GRANTED(lock)) { 833 request_blocked_by_granted = 1; 834 } else { 835 request_blocked_by_sleeping = 1; 836 } 837 } 838 839 lock = lock->l_next; 840 } while ((lock->l_vnode == vp)); 841 first_glock = lock->l_prev; 842 ASSERT(first_glock->l_vnode == vp); 843 } 844 845 if (request_blocked_by_granted) 846 goto block; 847 848 if (!request_blocked_by_sleeping) { 849 /* 850 * If the request isn't going to be blocked by a 851 * sleeping request, we know that it isn't going to 852 * be blocked; we can just execute the request -- 853 * without performing costly deadlock detection. 854 */ 855 ASSERT(!request_blocked_by_active); 856 return (flk_execute_request(request)); 857 } else if (request->l_type == F_RDLCK) { 858 /* 859 * If we have a sleeping writer in the requested 860 * lock's range, block. 861 */ 862 goto block; 863 } 864 865 lk[0] = request; 866 request->l_state |= RECOMPUTE_LOCK; 867 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 868 if (lock) { 869 do { 870 flk_recompute_dependencies(lock, lk, 1, 0); 871 lock = lock->l_next; 872 } while (lock->l_vnode == vp); 873 } 874 lock = first_glock; 875 if (lock) { 876 do { 877 if (IS_GRANTED(lock)) { 878 flk_recompute_dependencies(lock, lk, 1, 0); 879 } 880 lock = lock->l_prev; 881 } while ((lock->l_vnode == vp)); 882 } 883 request->l_state &= ~RECOMPUTE_LOCK; 884 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request)) 885 return (EDEADLK); 886 return (flk_execute_request(request)); 887 } 888 889 block: 890 if (request_will_wait) 891 flk_graph_uncolor(gp); 892 893 /* check sleeping locks */ 894 895 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 896 897 /* 898 * If we find a sleeping write lock that is a superset of the 899 * region wanted by request we can be assured that by adding an 900 * edge to this write lock we have paths to all locks in the 901 * graph that blocks the request except in one case and that is why 902 * another check for SAME_OWNER in the loop below. 
The exception 903 * case is when this process that owns the sleeping write lock 'l1' 904 * has other locks l2, l3, l4 that are in the system and arrived 905 * before l1. l1 does not have path to these locks as they are from 906 * same process. We break when we find a second covering sleeping 907 * lock l5 owned by a process different from that owning l1, because 908 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if 909 * it has l1 would have produced a deadlock already. 910 */ 911 912 if (lock) { 913 do { 914 if (BLOCKS(lock, request)) { 915 if (!request_will_wait) 916 return (EAGAIN); 917 if (COVERS(lock, request) && 918 lock->l_type == F_WRLCK) { 919 if (found_covering_lock && 920 !SAME_OWNER(lock, covered_by)) { 921 found_covering_lock++; 922 break; 923 } 924 found_covering_lock = 1; 925 covered_by = lock; 926 } 927 if (found_covering_lock && 928 !SAME_OWNER(lock, covered_by)) { 929 lock = lock->l_next; 930 continue; 931 } 932 if ((error = flk_add_edge(request, lock, 933 !found_covering_lock, 0))) 934 return (error); 935 } 936 lock = lock->l_next; 937 } while (lock->l_vnode == vp); 938 } 939 940 /* 941 * found_covering_lock == 2 iff at this point 'request' has paths 942 * to all locks that blocks 'request'. found_covering_lock == 1 iff at this 943 * point 'request' has paths to all locks that blocks 'request' whose owners 944 * are not same as the one that covers 'request' (covered_by above) and 945 * we can have locks whose owner is same as covered_by in the active list. 946 */ 947 948 if (request_blocked_by_active && found_covering_lock != 2) { 949 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 950 ASSERT(lock != NULL); 951 do { 952 if (BLOCKS(lock, request)) { 953 if (found_covering_lock && 954 !SAME_OWNER(lock, covered_by)) { 955 lock = lock->l_next; 956 continue; 957 } 958 if ((error = flk_add_edge(request, lock, 959 CHECK_CYCLE, 0))) 960 return (error); 961 } 962 lock = lock->l_next; 963 } while (lock->l_vnode == vp); 964 } 965 966 if (NOT_BLOCKED(request)) { 967 /* 968 * request not dependent on any other locks 969 * so execute this request 970 */ 971 return (flk_execute_request(request)); 972 } else { 973 /* 974 * check for deadlock 975 */ 976 if (flk_check_deadlock(request)) 977 return (EDEADLK); 978 /* 979 * this thread has to sleep 980 */ 981 return (flk_wait_execute_request(request)); 982 } 983 } 984 985 /* 986 * The actual execution of the request in the simple case is only to 987 * insert the 'request' in the list of active locks if it is not an 988 * UNLOCK. 989 * We have to consider the existing active locks' relation to 990 * this 'request' if they are owned by same process. flk_relation() does 991 * this job and sees to that the dependency graph information is maintained 992 * properly. 
993 */ 994 995 int 996 flk_execute_request(lock_descriptor_t *request) 997 { 998 graph_t *gp = request->l_graph; 999 vnode_t *vp = request->l_vnode; 1000 lock_descriptor_t *lock, *lock1; 1001 int done_searching = 0; 1002 1003 CHECK_SLEEPING_LOCKS(gp); 1004 CHECK_ACTIVE_LOCKS(gp); 1005 1006 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1007 1008 flk_set_state(request, FLK_START_STATE); 1009 1010 ASSERT(NOT_BLOCKED(request)); 1011 1012 /* IO_LOCK requests are only to check status */ 1013 1014 if (IS_IO_LOCK(request)) 1015 return (0); 1016 1017 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1018 1019 if (lock == NULL && request->l_type == F_UNLCK) 1020 return (0); 1021 if (lock == NULL) { 1022 flk_insert_active_lock(request); 1023 return (0); 1024 } 1025 1026 do { 1027 lock1 = lock->l_next; 1028 if (SAME_OWNER(request, lock)) { 1029 done_searching = flk_relation(lock, request); 1030 } 1031 lock = lock1; 1032 } while (lock->l_vnode == vp && !done_searching); 1033 1034 /* 1035 * insert in active queue 1036 */ 1037 1038 if (request->l_type != F_UNLCK) 1039 flk_insert_active_lock(request); 1040 1041 return (0); 1042 } 1043 1044 /* 1045 * 'request' is blocked by some one therefore we put it into sleep queue. 1046 */ 1047 static int 1048 flk_wait_execute_request(lock_descriptor_t *request) 1049 { 1050 graph_t *gp = request->l_graph; 1051 callb_cpr_t *cprp; /* CPR info from callback */ 1052 struct flock_globals *fg; 1053 int index; 1054 1055 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1056 ASSERT(IS_WILLING_TO_SLEEP(request)); 1057 1058 flk_insert_sleeping_lock(request); 1059 1060 if (IS_LOCKMGR(request)) { 1061 index = HASH_INDEX(request->l_vnode); 1062 fg = flk_get_globals(); 1063 1064 if (nlm_status_size == 0) { /* not booted as a cluster */ 1065 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) { 1066 flk_cancel_sleeping_lock(request, 1); 1067 return (ENOLCK); 1068 } 1069 } else { /* booted as a cluster */ 1070 /* 1071 * If the request is an NLM server lock request, 1072 * and the NLM state of the lock request is not 1073 * NLM_UP (because the NLM server is shutting 1074 * down), then cancel the sleeping lock and 1075 * return error ENOLCK that will encourage the 1076 * client to retransmit. 1077 */ 1078 if (!IS_NLM_UP(request)) { 1079 flk_cancel_sleeping_lock(request, 1); 1080 return (ENOLCK); 1081 } 1082 } 1083 } 1084 1085 /* Clustering: For blocking PXFS locks, return */ 1086 if (IS_PXFS(request)) { 1087 /* 1088 * PXFS locks sleep on the client side. 1089 * The callback argument is used to wake up the sleeper 1090 * when the lock is granted. 1091 * We return -1 (rather than an errno value) to indicate 1092 * the client side should sleep 1093 */ 1094 return (PXFS_LOCK_BLOCKED); 1095 } 1096 1097 if (request->l_callbacks != NULL) { 1098 /* 1099 * To make sure the shutdown code works correctly, either 1100 * the callback must happen after putting the lock on the 1101 * sleep list, or we must check the shutdown status after 1102 * returning from the callback (and before sleeping). At 1103 * least for now, we'll use the first option. If a 1104 * shutdown or signal or whatever happened while the graph 1105 * mutex was dropped, that will be detected by 1106 * wait_for_lock(). 
1107 */ 1108 mutex_exit(&gp->gp_mutex); 1109 1110 cprp = flk_invoke_callbacks(request->l_callbacks, 1111 FLK_BEFORE_SLEEP); 1112 1113 mutex_enter(&gp->gp_mutex); 1114 1115 if (cprp == NULL) { 1116 wait_for_lock(request); 1117 } else { 1118 mutex_enter(cprp->cc_lockp); 1119 CALLB_CPR_SAFE_BEGIN(cprp); 1120 mutex_exit(cprp->cc_lockp); 1121 wait_for_lock(request); 1122 mutex_enter(cprp->cc_lockp); 1123 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp); 1124 mutex_exit(cprp->cc_lockp); 1125 } 1126 1127 mutex_exit(&gp->gp_mutex); 1128 (void) flk_invoke_callbacks(request->l_callbacks, 1129 FLK_AFTER_SLEEP); 1130 mutex_enter(&gp->gp_mutex); 1131 } else { 1132 wait_for_lock(request); 1133 } 1134 1135 if (IS_LOCKMGR(request)) { 1136 /* 1137 * If the lock manager is shutting down, return an 1138 * error that will encourage the client to retransmit. 1139 */ 1140 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP && 1141 !IS_GRANTED(request)) { 1142 flk_cancel_sleeping_lock(request, 1); 1143 return (ENOLCK); 1144 } 1145 } 1146 1147 if (IS_INTERRUPTED(request)) { 1148 /* we got a signal, or act like we did */ 1149 flk_cancel_sleeping_lock(request, 1); 1150 return (EINTR); 1151 } 1152 1153 /* Cancelled if some other thread has closed the file */ 1154 1155 if (IS_CANCELLED(request)) { 1156 flk_cancel_sleeping_lock(request, 1); 1157 return (EBADF); 1158 } 1159 1160 request->l_state &= ~GRANTED_LOCK; 1161 REMOVE_SLEEP_QUEUE(request); 1162 return (flk_execute_request(request)); 1163 } 1164 1165 /* 1166 * This routine adds an edge between from and to because from depends 1167 * on to. If asked to check for deadlock it checks whether there are any 1168 * reachable locks from "from_lock" that are owned by the same process 1169 * as "from_lock". 1170 * NOTE: It is the caller's responsibility to make sure that the color 1171 * of the graph is consistent between the calls to flk_add_edge as done 1172 * in flk_process_request. This routine does not color and check for 1173 * deadlock explicitly. 1174 */ 1175 1176 static int 1177 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock, 1178 int check_cycle, int update_graph) 1179 { 1180 edge_t *edge; 1181 edge_t *ep; 1182 lock_descriptor_t *vertex; 1183 lock_descriptor_t *vertex_stack; 1184 1185 STACK_INIT(vertex_stack); 1186 1187 /* 1188 * if the to vertex already has mark_color, just return; 1189 * don't add an edge, as it is already reachable from the 1190 * from vertex.
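 *
 * (For orientation: the edge built below records that from_lock waits
 * on to_lock.  It is linked onto the adjacency (out) list of from_lock
 * and onto the in list of to_lock, so FIRST_ADJ(from_lock) walks the
 * locks from_lock is blocked by, while FIRST_IN(to_lock) walks the
 * locks that are blocked by to_lock.)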
1191 */ 1192 1193 if (COLORED(to_lock)) 1194 return (0); 1195 1196 edge = flk_get_edge(); 1197 1198 /* 1199 * set the from and to vertex 1200 */ 1201 1202 edge->from_vertex = from_lock; 1203 edge->to_vertex = to_lock; 1204 1205 /* 1206 * put in adjacency list of from vertex 1207 */ 1208 1209 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge; 1210 edge->edge_adj_next = from_lock->l_edge.edge_adj_next; 1211 edge->edge_adj_prev = &from_lock->l_edge; 1212 from_lock->l_edge.edge_adj_next = edge; 1213 1214 /* 1215 * put in in list of to vertex 1216 */ 1217 1218 to_lock->l_edge.edge_in_next->edge_in_prev = edge; 1219 edge->edge_in_next = to_lock->l_edge.edge_in_next; 1220 to_lock->l_edge.edge_in_next = edge; 1221 edge->edge_in_prev = &to_lock->l_edge; 1222 1223 1224 if (update_graph) { 1225 flk_update_proc_graph(edge, 0); 1226 return (0); 1227 } 1228 if (!check_cycle) { 1229 return (0); 1230 } 1231 1232 STACK_PUSH(vertex_stack, from_lock, l_stack); 1233 1234 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 1235 1236 STACK_POP(vertex_stack, l_stack); 1237 1238 for (ep = FIRST_ADJ(vertex); 1239 ep != HEAD(vertex); 1240 ep = NEXT_ADJ(ep)) { 1241 if (COLORED(ep->to_vertex)) 1242 continue; 1243 COLOR(ep->to_vertex); 1244 if (SAME_OWNER(ep->to_vertex, from_lock)) 1245 goto dead_lock; 1246 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack); 1247 } 1248 } 1249 return (0); 1250 1251 dead_lock: 1252 1253 /* 1254 * remove all edges 1255 */ 1256 1257 ep = FIRST_ADJ(from_lock); 1258 1259 while (ep != HEAD(from_lock)) { 1260 IN_LIST_REMOVE(ep); 1261 from_lock->l_sedge = NEXT_ADJ(ep); 1262 ADJ_LIST_REMOVE(ep); 1263 flk_free_edge(ep); 1264 ep = from_lock->l_sedge; 1265 } 1266 return (EDEADLK); 1267 } 1268 1269 /* 1270 * Get an edge structure for representing the dependency between two locks. 1271 */ 1272 1273 static edge_t * 1274 flk_get_edge() 1275 { 1276 edge_t *ep; 1277 1278 ASSERT(flk_edge_cache != NULL); 1279 1280 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP); 1281 edge_allocs++; 1282 return (ep); 1283 } 1284 1285 /* 1286 * Free the edge structure. 1287 */ 1288 1289 static void 1290 flk_free_edge(edge_t *ep) 1291 { 1292 edge_frees++; 1293 kmem_cache_free(flk_edge_cache, (void *)ep); 1294 } 1295 1296 /* 1297 * Check the relationship of request with lock and perform the 1298 * recomputation of dependencies, break lock if required, and return 1299 * 1 if request cannot have any more relationship with the next 1300 * active locks. 1301 * The 'lock' and 'request' are compared and in case of overlap we 1302 * delete the 'lock' and form new locks to represent the non-overlapped 1303 * portion of original 'lock'. This function has side effects such as 1304 * 'lock' will be freed, new locks will be added to the active list. 
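 *
 * A worked example (same owner throughout): the active list holds a
 * write lock L on [0,99] and the request is an unlock of [40,59].
 * The request falls strictly inside L and the effect is FLK_UNLOCK, so
 * two new descriptors are cut from L:
 *
 *	lock1 = copy of L over [0,39]
 *	lock2 = copy of L over [60,99]
 *
 * topology[] starts out as { lock1, lock2, request }; the unlock
 * request itself is dropped from the recomputation and is never
 * inserted.  L is removed from the active list, dependencies are
 * recomputed against the new pieces, lock1 and lock2 become active,
 * sleepers that only conflicted with [40,59] are woken, and L is
 * freed.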
1305 */ 1306 1307 static int 1308 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request) 1309 { 1310 int lock_effect; 1311 lock_descriptor_t *lock1, *lock2; 1312 lock_descriptor_t *topology[3]; 1313 int nvertex = 0; 1314 int i; 1315 edge_t *ep; 1316 graph_t *gp = (lock->l_graph); 1317 1318 1319 CHECK_SLEEPING_LOCKS(gp); 1320 CHECK_ACTIVE_LOCKS(gp); 1321 1322 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1323 1324 topology[0] = topology[1] = topology[2] = NULL; 1325 1326 if (request->l_type == F_UNLCK) 1327 lock_effect = FLK_UNLOCK; 1328 else if (request->l_type == F_RDLCK && 1329 lock->l_type == F_WRLCK) 1330 lock_effect = FLK_DOWNGRADE; 1331 else if (request->l_type == F_WRLCK && 1332 lock->l_type == F_RDLCK) 1333 lock_effect = FLK_UPGRADE; 1334 else 1335 lock_effect = FLK_STAY_SAME; 1336 1337 if (lock->l_end < request->l_start) { 1338 if (lock->l_end == request->l_start - 1 && 1339 lock_effect == FLK_STAY_SAME) { 1340 topology[0] = request; 1341 request->l_start = lock->l_start; 1342 nvertex = 1; 1343 goto recompute; 1344 } else { 1345 return (0); 1346 } 1347 } 1348 1349 if (lock->l_start > request->l_end) { 1350 if (request->l_end == lock->l_start - 1 && 1351 lock_effect == FLK_STAY_SAME) { 1352 topology[0] = request; 1353 request->l_end = lock->l_end; 1354 nvertex = 1; 1355 goto recompute; 1356 } else { 1357 return (1); 1358 } 1359 } 1360 1361 if (request->l_end < lock->l_end) { 1362 if (request->l_start > lock->l_start) { 1363 if (lock_effect == FLK_STAY_SAME) { 1364 request->l_start = lock->l_start; 1365 request->l_end = lock->l_end; 1366 topology[0] = request; 1367 nvertex = 1; 1368 } else { 1369 lock1 = flk_get_lock(); 1370 lock2 = flk_get_lock(); 1371 COPY(lock1, lock); 1372 COPY(lock2, lock); 1373 lock1->l_start = lock->l_start; 1374 lock1->l_end = request->l_start - 1; 1375 lock2->l_start = request->l_end + 1; 1376 lock2->l_end = lock->l_end; 1377 topology[0] = lock1; 1378 topology[1] = lock2; 1379 topology[2] = request; 1380 nvertex = 3; 1381 } 1382 } else if (request->l_start < lock->l_start) { 1383 if (lock_effect == FLK_STAY_SAME) { 1384 request->l_end = lock->l_end; 1385 topology[0] = request; 1386 nvertex = 1; 1387 } else { 1388 lock1 = flk_get_lock(); 1389 COPY(lock1, lock); 1390 lock1->l_start = request->l_end + 1; 1391 topology[0] = lock1; 1392 topology[1] = request; 1393 nvertex = 2; 1394 } 1395 } else { 1396 if (lock_effect == FLK_STAY_SAME) { 1397 request->l_start = lock->l_start; 1398 request->l_end = lock->l_end; 1399 topology[0] = request; 1400 nvertex = 1; 1401 } else { 1402 lock1 = flk_get_lock(); 1403 COPY(lock1, lock); 1404 lock1->l_start = request->l_end + 1; 1405 topology[0] = lock1; 1406 topology[1] = request; 1407 nvertex = 2; 1408 } 1409 } 1410 } else if (request->l_end > lock->l_end) { 1411 if (request->l_start > lock->l_start) { 1412 if (lock_effect == FLK_STAY_SAME) { 1413 request->l_start = lock->l_start; 1414 topology[0] = request; 1415 nvertex = 1; 1416 } else { 1417 lock1 = flk_get_lock(); 1418 COPY(lock1, lock); 1419 lock1->l_end = request->l_start - 1; 1420 topology[0] = lock1; 1421 topology[1] = request; 1422 nvertex = 2; 1423 } 1424 } else if (request->l_start < lock->l_start) { 1425 topology[0] = request; 1426 nvertex = 1; 1427 } else { 1428 topology[0] = request; 1429 nvertex = 1; 1430 } 1431 } else { 1432 if (request->l_start > lock->l_start) { 1433 if (lock_effect == FLK_STAY_SAME) { 1434 request->l_start = lock->l_start; 1435 topology[0] = request; 1436 nvertex = 1; 1437 } else { 1438 lock1 = flk_get_lock(); 1439 COPY(lock1, lock); 1440 
lock1->l_end = request->l_start - 1; 1441 topology[0] = lock1; 1442 topology[1] = request; 1443 nvertex = 2; 1444 } 1445 } else if (request->l_start < lock->l_start) { 1446 topology[0] = request; 1447 nvertex = 1; 1448 } else { 1449 if (lock_effect != FLK_UNLOCK) { 1450 topology[0] = request; 1451 nvertex = 1; 1452 } else { 1453 flk_delete_active_lock(lock, 0); 1454 flk_wakeup(lock, 1); 1455 flk_free_lock(lock); 1456 CHECK_SLEEPING_LOCKS(gp); 1457 CHECK_ACTIVE_LOCKS(gp); 1458 return (1); 1459 } 1460 } 1461 } 1462 1463 recompute: 1464 1465 /* 1466 * For unlock we don't send the 'request' to for recomputing 1467 * dependencies because no lock will add an edge to this. 1468 */ 1469 1470 if (lock_effect == FLK_UNLOCK) { 1471 topology[nvertex-1] = NULL; 1472 nvertex--; 1473 } 1474 for (i = 0; i < nvertex; i++) { 1475 topology[i]->l_state |= RECOMPUTE_LOCK; 1476 topology[i]->l_color = NO_COLOR; 1477 } 1478 1479 ASSERT(FIRST_ADJ(lock) == HEAD(lock)); 1480 1481 /* 1482 * we remove the adjacent edges for all vertices' to this vertex 1483 * 'lock'. 1484 */ 1485 1486 ep = FIRST_IN(lock); 1487 while (ep != HEAD(lock)) { 1488 ADJ_LIST_REMOVE(ep); 1489 ep = NEXT_IN(ep); 1490 } 1491 1492 flk_delete_active_lock(lock, 0); 1493 1494 /* We are ready for recomputing the dependencies now */ 1495 1496 flk_recompute_dependencies(lock, topology, nvertex, 1); 1497 1498 for (i = 0; i < nvertex; i++) { 1499 topology[i]->l_state &= ~RECOMPUTE_LOCK; 1500 topology[i]->l_color = NO_COLOR; 1501 } 1502 1503 1504 if (lock_effect == FLK_UNLOCK) { 1505 nvertex++; 1506 } 1507 for (i = 0; i < nvertex - 1; i++) { 1508 flk_insert_active_lock(topology[i]); 1509 } 1510 1511 1512 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) { 1513 flk_wakeup(lock, 0); 1514 } else { 1515 ep = FIRST_IN(lock); 1516 while (ep != HEAD(lock)) { 1517 lock->l_sedge = NEXT_IN(ep); 1518 IN_LIST_REMOVE(ep); 1519 flk_update_proc_graph(ep, 1); 1520 flk_free_edge(ep); 1521 ep = lock->l_sedge; 1522 } 1523 } 1524 flk_free_lock(lock); 1525 1526 CHECK_SLEEPING_LOCKS(gp); 1527 CHECK_ACTIVE_LOCKS(gp); 1528 return (0); 1529 } 1530 1531 /* 1532 * Insert a lock into the active queue. 1533 */ 1534 1535 static void 1536 flk_insert_active_lock(lock_descriptor_t *new_lock) 1537 { 1538 graph_t *gp = new_lock->l_graph; 1539 vnode_t *vp = new_lock->l_vnode; 1540 lock_descriptor_t *first_lock, *lock; 1541 1542 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1543 1544 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1545 first_lock = lock; 1546 1547 if (first_lock != NULL) { 1548 for (; (lock->l_vnode == vp && 1549 lock->l_start < new_lock->l_start); lock = lock->l_next) 1550 ; 1551 } else { 1552 lock = ACTIVE_HEAD(gp); 1553 } 1554 1555 lock->l_prev->l_next = new_lock; 1556 new_lock->l_next = lock; 1557 new_lock->l_prev = lock->l_prev; 1558 lock->l_prev = new_lock; 1559 1560 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) { 1561 vp->v_filocks = (struct filock *)new_lock; 1562 } 1563 flk_set_state(new_lock, FLK_ACTIVE_STATE); 1564 new_lock->l_state |= ACTIVE_LOCK; 1565 1566 CHECK_ACTIVE_LOCKS(gp); 1567 CHECK_SLEEPING_LOCKS(gp); 1568 } 1569 1570 /* 1571 * Delete the active lock : Performs two functions depending on the 1572 * value of second parameter. One is to remove from the active lists 1573 * only and other is to both remove and free the lock. 
1574 */ 1575 1576 static void 1577 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock) 1578 { 1579 vnode_t *vp = lock->l_vnode; 1580 graph_t *gp = lock->l_graph; 1581 1582 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1583 if (free_lock) 1584 ASSERT(NO_DEPENDENTS(lock)); 1585 ASSERT(NOT_BLOCKED(lock)); 1586 ASSERT(IS_ACTIVE(lock)); 1587 1588 ASSERT((vp->v_filocks != NULL)); 1589 1590 if (vp->v_filocks == (struct filock *)lock) { 1591 vp->v_filocks = (struct filock *) 1592 ((lock->l_next->l_vnode == vp) ? lock->l_next : 1593 NULL); 1594 } 1595 lock->l_next->l_prev = lock->l_prev; 1596 lock->l_prev->l_next = lock->l_next; 1597 lock->l_next = lock->l_prev = NULL; 1598 flk_set_state(lock, FLK_DEAD_STATE); 1599 lock->l_state &= ~ACTIVE_LOCK; 1600 1601 if (free_lock) 1602 flk_free_lock(lock); 1603 CHECK_ACTIVE_LOCKS(gp); 1604 CHECK_SLEEPING_LOCKS(gp); 1605 } 1606 1607 /* 1608 * Insert into the sleep queue. 1609 */ 1610 1611 static void 1612 flk_insert_sleeping_lock(lock_descriptor_t *request) 1613 { 1614 graph_t *gp = request->l_graph; 1615 vnode_t *vp = request->l_vnode; 1616 lock_descriptor_t *lock; 1617 1618 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1619 ASSERT(IS_INITIAL(request)); 1620 1621 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks && 1622 lock->l_vnode < vp); lock = lock->l_next) 1623 ; 1624 1625 lock->l_prev->l_next = request; 1626 request->l_prev = lock->l_prev; 1627 lock->l_prev = request; 1628 request->l_next = lock; 1629 flk_set_state(request, FLK_SLEEPING_STATE); 1630 request->l_state |= SLEEPING_LOCK; 1631 } 1632 1633 /* 1634 * Cancelling a sleeping lock implies removing a vertex from the 1635 * dependency graph and therefore we should recompute the dependencies 1636 * of all vertices that have a path to this vertex, w.r.t. all 1637 * vertices reachable from this vertex. 1638 */ 1639 1640 void 1641 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue) 1642 { 1643 graph_t *gp = request->l_graph; 1644 vnode_t *vp = request->l_vnode; 1645 lock_descriptor_t **topology = NULL; 1646 edge_t *ep; 1647 lock_descriptor_t *vertex, *lock; 1648 int nvertex = 0; 1649 int i; 1650 lock_descriptor_t *vertex_stack; 1651 1652 STACK_INIT(vertex_stack); 1653 1654 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1655 /* 1656 * count number of vertex pointers that has to be allocated 1657 * All vertices that are reachable from request. 1658 */ 1659 1660 STACK_PUSH(vertex_stack, request, l_stack); 1661 1662 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 1663 STACK_POP(vertex_stack, l_stack); 1664 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex); 1665 ep = NEXT_ADJ(ep)) { 1666 if (IS_RECOMPUTE(ep->to_vertex)) 1667 continue; 1668 ep->to_vertex->l_state |= RECOMPUTE_LOCK; 1669 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack); 1670 nvertex++; 1671 } 1672 } 1673 1674 /* 1675 * allocate memory for holding the vertex pointers 1676 */ 1677 1678 if (nvertex) { 1679 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *), 1680 KM_SLEEP); 1681 } 1682 1683 /* 1684 * one more pass to actually store the vertices in the 1685 * allocated array. 1686 * We first check sleeping locks and then active locks 1687 * so that topology array will be in a topological 1688 * order. 
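 *
 * (Edges always run from a waiting lock to the lock it waits on: a
 * sleeping lock may wait on an active lock but never the reverse, and
 * newer sleepers wait on older ones, so putting the sleeping locks
 * first keeps every edge pointing from a lower index to a higher index
 * in topology[], as flk_recompute_dependencies() expects.)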
1689 */ 1690 1691 nvertex = 0; 1692 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 1693 1694 if (lock) { 1695 do { 1696 if (IS_RECOMPUTE(lock)) { 1697 lock->l_index = nvertex; 1698 topology[nvertex++] = lock; 1699 } 1700 lock->l_color = NO_COLOR; 1701 lock = lock->l_next; 1702 } while (lock->l_vnode == vp); 1703 } 1704 1705 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1706 1707 if (lock) { 1708 do { 1709 if (IS_RECOMPUTE(lock)) { 1710 lock->l_index = nvertex; 1711 topology[nvertex++] = lock; 1712 } 1713 lock->l_color = NO_COLOR; 1714 lock = lock->l_next; 1715 } while (lock->l_vnode == vp); 1716 } 1717 1718 /* 1719 * remove in and out edges of request 1720 * They are freed after updating proc_graph below. 1721 */ 1722 1723 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) { 1724 ADJ_LIST_REMOVE(ep); 1725 } 1726 1727 1728 if (remove_from_queue) 1729 REMOVE_SLEEP_QUEUE(request); 1730 1731 /* we are ready to recompute */ 1732 1733 flk_recompute_dependencies(request, topology, nvertex, 1); 1734 1735 ep = FIRST_ADJ(request); 1736 while (ep != HEAD(request)) { 1737 IN_LIST_REMOVE(ep); 1738 request->l_sedge = NEXT_ADJ(ep); 1739 ADJ_LIST_REMOVE(ep); 1740 flk_update_proc_graph(ep, 1); 1741 flk_free_edge(ep); 1742 ep = request->l_sedge; 1743 } 1744 1745 1746 /* 1747 * unset the RECOMPUTE flag in those vertices 1748 */ 1749 1750 for (i = 0; i < nvertex; i++) { 1751 topology[i]->l_state &= ~RECOMPUTE_LOCK; 1752 } 1753 1754 /* 1755 * free the topology 1756 */ 1757 if (nvertex) 1758 kmem_free((void *)topology, 1759 (nvertex * sizeof (lock_descriptor_t *))); 1760 /* 1761 * Possibility of some locks unblocked now 1762 */ 1763 1764 flk_wakeup(request, 0); 1765 1766 /* 1767 * we expect to have a correctly recomputed graph now. 1768 */ 1769 flk_set_state(request, FLK_DEAD_STATE); 1770 flk_free_lock(request); 1771 CHECK_SLEEPING_LOCKS(gp); 1772 CHECK_ACTIVE_LOCKS(gp); 1773 1774 } 1775 1776 /* 1777 * Uncoloring the graph is simply to increment the mark value of the graph 1778 * And only when wrap round takes place will we color all vertices in 1779 * the graph explicitly. 1780 */ 1781 1782 static void 1783 flk_graph_uncolor(graph_t *gp) 1784 { 1785 lock_descriptor_t *lock; 1786 1787 if (gp->mark == UINT_MAX) { 1788 gp->mark = 1; 1789 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp); 1790 lock = lock->l_next) 1791 lock->l_color = 0; 1792 1793 for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp); 1794 lock = lock->l_next) 1795 lock->l_color = 0; 1796 } else { 1797 gp->mark++; 1798 } 1799 } 1800 1801 /* 1802 * Wake up locks that are blocked on the given lock. 1803 */ 1804 1805 static void 1806 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove) 1807 { 1808 edge_t *ep; 1809 graph_t *gp = lock->l_graph; 1810 lock_descriptor_t *lck; 1811 1812 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1813 if (NO_DEPENDENTS(lock)) 1814 return; 1815 ep = FIRST_IN(lock); 1816 do { 1817 /* 1818 * delete the edge from the adjacency list 1819 * of from vertex. if no more adjacent edges 1820 * for this vertex wake this process. 
1821 */ 1822 lck = ep->from_vertex; 1823 if (adj_list_remove) 1824 ADJ_LIST_REMOVE(ep); 1825 flk_update_proc_graph(ep, 1); 1826 if (NOT_BLOCKED(lck)) { 1827 GRANT_WAKEUP(lck); 1828 } 1829 lock->l_sedge = NEXT_IN(ep); 1830 IN_LIST_REMOVE(ep); 1831 flk_free_edge(ep); 1832 ep = lock->l_sedge; 1833 } while (ep != HEAD(lock)); 1834 ASSERT(NO_DEPENDENTS(lock)); 1835 } 1836 1837 /* 1838 * The dependents of request, is checked for its dependency against the 1839 * locks in topology (called topology because the array is and should be in 1840 * topological order for this algorithm, if not in topological order the 1841 * inner loop below might add more edges than necessary. Topological ordering 1842 * of vertices satisfies the property that all edges will be from left to 1843 * right i.e., topology[i] can have an edge to topology[j], iff i<j) 1844 * If lock l1 in the dependent set of request is dependent (blocked by) 1845 * on lock l2 in topology but does not have a path to it, we add an edge 1846 * in the inner loop below. 1847 * 1848 * We don't want to add an edge between l1 and l2 if there exists 1849 * already a path from l1 to l2, so care has to be taken for those vertices 1850 * that have two paths to 'request'. These vertices are referred to here 1851 * as barrier locks. 1852 * 1853 * The barriers has to be found (those vertex that originally had two paths 1854 * to request) because otherwise we may end up adding edges unnecessarily 1855 * to vertices in topology, and thus barrier vertices can have an edge 1856 * to a vertex in topology as well a path to it. 1857 */ 1858 1859 static void 1860 flk_recompute_dependencies(lock_descriptor_t *request, 1861 lock_descriptor_t **topology, 1862 int nvertex, int update_graph) 1863 { 1864 lock_descriptor_t *vertex, *lock; 1865 graph_t *gp = request->l_graph; 1866 int i, count; 1867 int barrier_found = 0; 1868 edge_t *ep; 1869 lock_descriptor_t *vertex_stack; 1870 1871 STACK_INIT(vertex_stack); 1872 1873 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1874 if (nvertex == 0) 1875 return; 1876 flk_graph_uncolor(request->l_graph); 1877 barrier_found = flk_find_barriers(request); 1878 request->l_state |= RECOMPUTE_DONE; 1879 1880 STACK_PUSH(vertex_stack, request, l_stack); 1881 request->l_sedge = FIRST_IN(request); 1882 1883 1884 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 1885 if (vertex->l_state & RECOMPUTE_DONE) { 1886 count = 0; 1887 goto next_in_edge; 1888 } 1889 if (IS_BARRIER(vertex)) { 1890 /* decrement the barrier count */ 1891 if (vertex->l_index) { 1892 vertex->l_index--; 1893 /* this guy will be pushed again anyway ? */ 1894 STACK_POP(vertex_stack, l_stack); 1895 if (vertex->l_index == 0) { 1896 /* 1897 * barrier is over we can recompute 1898 * dependencies for this lock in the 1899 * next stack pop 1900 */ 1901 vertex->l_state &= ~BARRIER_LOCK; 1902 } 1903 continue; 1904 } 1905 } 1906 vertex->l_state |= RECOMPUTE_DONE; 1907 flk_graph_uncolor(gp); 1908 count = flk_color_reachables(vertex); 1909 for (i = 0; i < nvertex; i++) { 1910 lock = topology[i]; 1911 if (COLORED(lock)) 1912 continue; 1913 if (BLOCKS(lock, vertex)) { 1914 (void) flk_add_edge(vertex, lock, 1915 NO_CHECK_CYCLE, update_graph); 1916 COLOR(lock); 1917 count++; 1918 count += flk_color_reachables(lock); 1919 } 1920 1921 } 1922 1923 next_in_edge: 1924 if (count == nvertex || 1925 vertex->l_sedge == HEAD(vertex)) { 1926 /* prune the tree below this */ 1927 STACK_POP(vertex_stack, l_stack); 1928 vertex->l_state &= ~RECOMPUTE_DONE; 1929 /* update the barrier locks below this! 
*/ 1930 if (vertex->l_sedge != HEAD(vertex) && barrier_found) { 1931 flk_graph_uncolor(gp); 1932 flk_update_barriers(vertex); 1933 } 1934 continue; 1935 } 1936 1937 ep = vertex->l_sedge; 1938 lock = ep->from_vertex; 1939 STACK_PUSH(vertex_stack, lock, l_stack); 1940 lock->l_sedge = FIRST_IN(lock); 1941 vertex->l_sedge = NEXT_IN(ep); 1942 } 1943 1944 } 1945 1946 /* 1947 * Color all reachable vertices from vertex that belongs to topology (here 1948 * those that have RECOMPUTE_LOCK set in their state) and yet uncolored. 1949 * 1950 * Note: we need to use a different stack_link l_stack1 because this is 1951 * called from flk_recompute_dependencies() that already uses a stack with 1952 * l_stack as stack_link. 1953 */ 1954 1955 static int 1956 flk_color_reachables(lock_descriptor_t *vertex) 1957 { 1958 lock_descriptor_t *ver, *lock; 1959 int count; 1960 edge_t *ep; 1961 lock_descriptor_t *vertex_stack; 1962 1963 STACK_INIT(vertex_stack); 1964 1965 STACK_PUSH(vertex_stack, vertex, l_stack1); 1966 count = 0; 1967 while ((ver = STACK_TOP(vertex_stack)) != NULL) { 1968 1969 STACK_POP(vertex_stack, l_stack1); 1970 for (ep = FIRST_ADJ(ver); ep != HEAD(ver); 1971 ep = NEXT_ADJ(ep)) { 1972 lock = ep->to_vertex; 1973 if (COLORED(lock)) 1974 continue; 1975 COLOR(lock); 1976 if (IS_RECOMPUTE(lock)) 1977 count++; 1978 STACK_PUSH(vertex_stack, lock, l_stack1); 1979 } 1980 1981 } 1982 return (count); 1983 } 1984 1985 /* 1986 * Called from flk_recompute_dependencies() this routine decrements 1987 * the barrier count of barrier vertices that are reachable from lock. 1988 */ 1989 1990 static void 1991 flk_update_barriers(lock_descriptor_t *lock) 1992 { 1993 lock_descriptor_t *vertex, *lck; 1994 edge_t *ep; 1995 lock_descriptor_t *vertex_stack; 1996 1997 STACK_INIT(vertex_stack); 1998 1999 STACK_PUSH(vertex_stack, lock, l_stack1); 2000 2001 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 2002 STACK_POP(vertex_stack, l_stack1); 2003 for (ep = FIRST_IN(vertex); ep != HEAD(vertex); 2004 ep = NEXT_IN(ep)) { 2005 lck = ep->from_vertex; 2006 if (COLORED(lck)) { 2007 if (IS_BARRIER(lck)) { 2008 ASSERT(lck->l_index > 0); 2009 lck->l_index--; 2010 if (lck->l_index == 0) 2011 lck->l_state &= ~BARRIER_LOCK; 2012 } 2013 continue; 2014 } 2015 COLOR(lck); 2016 if (IS_BARRIER(lck)) { 2017 ASSERT(lck->l_index > 0); 2018 lck->l_index--; 2019 if (lck->l_index == 0) 2020 lck->l_state &= ~BARRIER_LOCK; 2021 } 2022 STACK_PUSH(vertex_stack, lck, l_stack1); 2023 } 2024 } 2025 } 2026 2027 /* 2028 * Finds all vertices that are reachable from 'lock' more than once and 2029 * mark them as barrier vertices and increment their barrier count. 2030 * The barrier count is one minus the total number of paths from lock 2031 * to that vertex. 
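 *
 * For example, if there are three distinct paths from 'lock' to some
 * vertex v, the walk below reaches v three times: the first visit
 * colors it and zeroes v->l_index, and each later visit marks it
 * BARRIER_LOCK and bumps v->l_index, leaving a barrier count of 2
 * (one less than the number of paths, as stated above).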
2032 */ 2033 2034 static int 2035 flk_find_barriers(lock_descriptor_t *lock) 2036 { 2037 lock_descriptor_t *vertex, *lck; 2038 int found = 0; 2039 edge_t *ep; 2040 lock_descriptor_t *vertex_stack; 2041 2042 STACK_INIT(vertex_stack); 2043 2044 STACK_PUSH(vertex_stack, lock, l_stack1); 2045 2046 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 2047 STACK_POP(vertex_stack, l_stack1); 2048 for (ep = FIRST_IN(vertex); ep != HEAD(vertex); 2049 ep = NEXT_IN(ep)) { 2050 lck = ep->from_vertex; 2051 if (COLORED(lck)) { 2052 /* this is a barrier */ 2053 lck->l_state |= BARRIER_LOCK; 2054 /* index will have barrier count */ 2055 lck->l_index++; 2056 if (!found) 2057 found = 1; 2058 continue; 2059 } 2060 COLOR(lck); 2061 lck->l_index = 0; 2062 STACK_PUSH(vertex_stack, lck, l_stack1); 2063 } 2064 } 2065 return (found); 2066 } 2067 2068 /* 2069 * Finds the first lock that is mainly responsible for blocking this 2070 * request. If there is no such lock, request->l_flock.l_type is set to 2071 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars 2072 * of the blocking lock. 2073 * 2074 * Note: It is possible a request is blocked by a sleeping lock because 2075 * of the fairness policy used in flk_process_request() to construct the 2076 * dependencies. (see comments before flk_process_request()). 2077 */ 2078 2079 static void 2080 flk_get_first_blocking_lock(lock_descriptor_t *request) 2081 { 2082 graph_t *gp = request->l_graph; 2083 vnode_t *vp = request->l_vnode; 2084 lock_descriptor_t *lock, *blocker; 2085 2086 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 2087 blocker = NULL; 2088 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2089 2090 if (lock) { 2091 do { 2092 if (BLOCKS(lock, request)) { 2093 blocker = lock; 2094 break; 2095 } 2096 lock = lock->l_next; 2097 } while (lock->l_vnode == vp); 2098 } 2099 2100 if (blocker) { 2101 report_blocker(blocker, request); 2102 } else 2103 request->l_flock.l_type = F_UNLCK; 2104 } 2105 2106 /* 2107 * Get the graph_t structure associated with a vnode. 2108 * If 'initialize' is non-zero, and the graph_t structure for this vnode has 2109 * not yet been initialized, then a new element is allocated and returned. 2110 */ 2111 graph_t * 2112 flk_get_lock_graph(vnode_t *vp, int initialize) 2113 { 2114 graph_t *gp; 2115 graph_t *gp_alloc = NULL; 2116 int index = HASH_INDEX(vp); 2117 2118 if (initialize == FLK_USE_GRAPH) { 2119 mutex_enter(&flock_lock); 2120 gp = lock_graph[index]; 2121 mutex_exit(&flock_lock); 2122 return (gp); 2123 } 2124 2125 ASSERT(initialize == FLK_INIT_GRAPH); 2126 2127 if (lock_graph[index] == NULL) { 2128 2129 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP); 2130 2131 /* Initialize the graph */ 2132 2133 gp_alloc->active_locks.l_next = 2134 gp_alloc->active_locks.l_prev = 2135 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc); 2136 gp_alloc->sleeping_locks.l_next = 2137 gp_alloc->sleeping_locks.l_prev = 2138 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc); 2139 gp_alloc->index = index; 2140 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL); 2141 } 2142 2143 mutex_enter(&flock_lock); 2144 2145 gp = lock_graph[index]; 2146 2147 /* Recheck the value within flock_lock */ 2148 if (gp == NULL) { 2149 struct flock_globals *fg; 2150 2151 /* We must have previously allocated the graph_t structure */ 2152 ASSERT(gp_alloc != NULL); 2153 lock_graph[index] = gp = gp_alloc; 2154 /* 2155 * The lockmgr status is only needed if KLM is loaded. 
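 * (If flock_zone_key is still ZONE_KEY_UNINITIALIZED, the lock manager has
 * presumably never been initialized, so there is no per-zone status to
 * record for this graph.)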
2156 */ 2157 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) { 2158 fg = flk_get_globals(); 2159 fg->lockmgr_status[index] = fg->flk_lockmgr_status; 2160 } 2161 } 2162 2163 mutex_exit(&flock_lock); 2164 2165 if ((gp_alloc != NULL) && (gp != gp_alloc)) { 2166 /* There was a race to allocate the graph_t and we lost */ 2167 mutex_destroy(&gp_alloc->gp_mutex); 2168 kmem_free(gp_alloc, sizeof (graph_t)); 2169 } 2170 2171 return (gp); 2172 } 2173 2174 /* 2175 * PSARC case 1997/292 2176 */ 2177 int 2178 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid) 2179 { 2180 lock_descriptor_t *lock; 2181 int result = 0; 2182 graph_t *gp; 2183 int lock_nlmid; 2184 2185 /* 2186 * Check to see if node is booted as a cluster. If not, return. 2187 */ 2188 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { 2189 return (0); 2190 } 2191 2192 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2193 if (gp == NULL) { 2194 return (0); 2195 } 2196 2197 mutex_enter(&gp->gp_mutex); 2198 2199 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2200 2201 if (lock) { 2202 while (lock->l_vnode == vp) { 2203 /* get NLM id from sysid */ 2204 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 2205 2206 /* 2207 * If NLM server request _and_ nlmid of lock matches 2208 * nlmid of argument, then we've found a remote lock. 2209 */ 2210 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 2211 result = 1; 2212 goto done; 2213 } 2214 lock = lock->l_next; 2215 } 2216 } 2217 2218 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2219 2220 if (lock) { 2221 while (lock->l_vnode == vp) { 2222 /* get NLM id from sysid */ 2223 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 2224 2225 /* 2226 * If NLM server request _and_ nlmid of lock matches 2227 * nlmid of argument, then we've found a remote lock. 2228 */ 2229 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 2230 result = 1; 2231 goto done; 2232 } 2233 lock = lock->l_next; 2234 } 2235 } 2236 2237 done: 2238 mutex_exit(&gp->gp_mutex); 2239 return (result); 2240 } 2241 2242 /* 2243 * Determine whether there are any locks for the given vnode with a remote 2244 * sysid. Returns zero if not, non-zero if there are. 2245 * 2246 * Note that the return value from this function is potentially invalid 2247 * once it has been returned. The caller is responsible for providing its 2248 * own synchronization mechanism to ensure that the return value is useful 2249 * (e.g., see nfs_lockcompletion()). 2250 */ 2251 int 2252 flk_has_remote_locks(vnode_t *vp) 2253 { 2254 lock_descriptor_t *lock; 2255 int result = 0; 2256 graph_t *gp; 2257 2258 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2259 if (gp == NULL) { 2260 return (0); 2261 } 2262 2263 mutex_enter(&gp->gp_mutex); 2264 2265 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2266 2267 if (lock) { 2268 while (lock->l_vnode == vp) { 2269 if (IS_REMOTE(lock)) { 2270 result = 1; 2271 goto done; 2272 } 2273 lock = lock->l_next; 2274 } 2275 } 2276 2277 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2278 2279 if (lock) { 2280 while (lock->l_vnode == vp) { 2281 if (IS_REMOTE(lock)) { 2282 result = 1; 2283 goto done; 2284 } 2285 lock = lock->l_next; 2286 } 2287 } 2288 2289 done: 2290 mutex_exit(&gp->gp_mutex); 2291 return (result); 2292 } 2293 2294 /* 2295 * Determine if there are any locks owned by the given sysid. 2296 * Returns zero if not, non-zero if there are. Note that this return code 2297 * could be derived from flk_get_{sleeping,active}_locks, but this routine 2298 * avoids all the memory allocations of those routines. 
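 *
 * Illustrative use (hypothetical caller, not from this file): decide whether
 * the state kept for a remote client, identified by its sysid, may be
 * discarded:
 *
 *	if (!flk_sysid_has_locks(sysid, FLK_QUERY_ACTIVE | FLK_QUERY_SLEEPING))
 *		... no locks remain on behalf of this client ...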
2299 * 2300 * This routine has the same synchronization issues as 2301 * flk_has_remote_locks. 2302 */ 2303 2304 int 2305 flk_sysid_has_locks(int sysid, int lck_type) 2306 { 2307 int has_locks = 0; 2308 lock_descriptor_t *lock; 2309 graph_t *gp; 2310 int i; 2311 2312 for (i = 0; i < HASH_SIZE && !has_locks; i++) { 2313 mutex_enter(&flock_lock); 2314 gp = lock_graph[i]; 2315 mutex_exit(&flock_lock); 2316 if (gp == NULL) { 2317 continue; 2318 } 2319 2320 mutex_enter(&gp->gp_mutex); 2321 2322 if (lck_type & FLK_QUERY_ACTIVE) { 2323 for (lock = ACTIVE_HEAD(gp)->l_next; 2324 lock != ACTIVE_HEAD(gp) && !has_locks; 2325 lock = lock->l_next) { 2326 if (lock->l_flock.l_sysid == sysid) 2327 has_locks = 1; 2328 } 2329 } 2330 2331 if (lck_type & FLK_QUERY_SLEEPING) { 2332 for (lock = SLEEPING_HEAD(gp)->l_next; 2333 lock != SLEEPING_HEAD(gp) && !has_locks; 2334 lock = lock->l_next) { 2335 if (lock->l_flock.l_sysid == sysid) 2336 has_locks = 1; 2337 } 2338 } 2339 mutex_exit(&gp->gp_mutex); 2340 } 2341 2342 return (has_locks); 2343 } 2344 2345 2346 /* 2347 * PSARC case 1997/292 2348 * 2349 * Requires: "sysid" is a pair [nlmid, sysid]. The lower half is 16-bit 2350 * quantity, the real sysid generated by the NLM server; the upper half 2351 * identifies the node of the cluster where the NLM server ran. 2352 * This routine is only called by an NLM server running in a cluster. 2353 * Effects: Remove all locks held on behalf of the client identified 2354 * by "sysid." 2355 */ 2356 void 2357 cl_flk_remove_locks_by_sysid(int sysid) 2358 { 2359 graph_t *gp; 2360 int i; 2361 lock_descriptor_t *lock, *nlock; 2362 2363 /* 2364 * Check to see if node is booted as a cluster. If not, return. 2365 */ 2366 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { 2367 return; 2368 } 2369 2370 ASSERT(sysid != 0); 2371 for (i = 0; i < HASH_SIZE; i++) { 2372 mutex_enter(&flock_lock); 2373 gp = lock_graph[i]; 2374 mutex_exit(&flock_lock); 2375 2376 if (gp == NULL) 2377 continue; 2378 2379 mutex_enter(&gp->gp_mutex); /* get mutex on lock graph */ 2380 2381 /* signal sleeping requests so that they bail out */ 2382 lock = SLEEPING_HEAD(gp)->l_next; 2383 while (lock != SLEEPING_HEAD(gp)) { 2384 nlock = lock->l_next; 2385 if (lock->l_flock.l_sysid == sysid) { 2386 INTERRUPT_WAKEUP(lock); 2387 } 2388 lock = nlock; 2389 } 2390 2391 /* delete active locks */ 2392 lock = ACTIVE_HEAD(gp)->l_next; 2393 while (lock != ACTIVE_HEAD(gp)) { 2394 nlock = lock->l_next; 2395 if (lock->l_flock.l_sysid == sysid) { 2396 flk_delete_active_lock(lock, 0); 2397 flk_wakeup(lock, 1); 2398 flk_free_lock(lock); 2399 } 2400 lock = nlock; 2401 } 2402 mutex_exit(&gp->gp_mutex); /* release mutex on lock graph */ 2403 } 2404 } 2405 2406 /* 2407 * Delete all locks in the system that belongs to the sysid of the request. 
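 *
 * The caller holds the graph mutex of the request's own graph; that mutex is
 * dropped while all the lock graphs are scanned and is reacquired before
 * this function returns.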
2408 */ 2409 2410 static void 2411 flk_delete_locks_by_sysid(lock_descriptor_t *request) 2412 { 2413 int sysid = request->l_flock.l_sysid; 2414 lock_descriptor_t *lock, *nlock; 2415 graph_t *gp; 2416 int i; 2417 2418 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex)); 2419 ASSERT(sysid != 0); 2420 2421 mutex_exit(&request->l_graph->gp_mutex); 2422 2423 for (i = 0; i < HASH_SIZE; i++) { 2424 mutex_enter(&flock_lock); 2425 gp = lock_graph[i]; 2426 mutex_exit(&flock_lock); 2427 2428 if (gp == NULL) 2429 continue; 2430 2431 mutex_enter(&gp->gp_mutex); 2432 2433 /* signal sleeping requests so that they bail out */ 2434 lock = SLEEPING_HEAD(gp)->l_next; 2435 while (lock != SLEEPING_HEAD(gp)) { 2436 nlock = lock->l_next; 2437 if (lock->l_flock.l_sysid == sysid) { 2438 INTERRUPT_WAKEUP(lock); 2439 } 2440 lock = nlock; 2441 } 2442 2443 /* delete active locks */ 2444 lock = ACTIVE_HEAD(gp)->l_next; 2445 while (lock != ACTIVE_HEAD(gp)) { 2446 nlock = lock->l_next; 2447 if (lock->l_flock.l_sysid == sysid) { 2448 flk_delete_active_lock(lock, 0); 2449 flk_wakeup(lock, 1); 2450 flk_free_lock(lock); 2451 } 2452 lock = nlock; 2453 } 2454 mutex_exit(&gp->gp_mutex); 2455 } 2456 2457 mutex_enter(&request->l_graph->gp_mutex); 2458 } 2459 2460 /* 2461 * Clustering: Deletes PXFS locks 2462 * Effects: Delete all locks on files in the given file system and with the 2463 * given PXFS id. 2464 */ 2465 void 2466 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid) 2467 { 2468 lock_descriptor_t *lock, *nlock; 2469 graph_t *gp; 2470 int i; 2471 2472 for (i = 0; i < HASH_SIZE; i++) { 2473 mutex_enter(&flock_lock); 2474 gp = lock_graph[i]; 2475 mutex_exit(&flock_lock); 2476 2477 if (gp == NULL) 2478 continue; 2479 2480 mutex_enter(&gp->gp_mutex); 2481 2482 /* signal sleeping requests so that they bail out */ 2483 lock = SLEEPING_HEAD(gp)->l_next; 2484 while (lock != SLEEPING_HEAD(gp)) { 2485 nlock = lock->l_next; 2486 if (lock->l_vnode->v_vfsp == vfsp) { 2487 ASSERT(IS_PXFS(lock)); 2488 if (GETPXFSID(lock->l_flock.l_sysid) == 2489 pxfsid) { 2490 flk_set_state(lock, 2491 FLK_CANCELLED_STATE); 2492 flk_cancel_sleeping_lock(lock, 1); 2493 } 2494 } 2495 lock = nlock; 2496 } 2497 2498 /* delete active locks */ 2499 lock = ACTIVE_HEAD(gp)->l_next; 2500 while (lock != ACTIVE_HEAD(gp)) { 2501 nlock = lock->l_next; 2502 if (lock->l_vnode->v_vfsp == vfsp) { 2503 ASSERT(IS_PXFS(lock)); 2504 if (GETPXFSID(lock->l_flock.l_sysid) == 2505 pxfsid) { 2506 flk_delete_active_lock(lock, 0); 2507 flk_wakeup(lock, 1); 2508 flk_free_lock(lock); 2509 } 2510 } 2511 lock = nlock; 2512 } 2513 mutex_exit(&gp->gp_mutex); 2514 } 2515 } 2516 2517 /* 2518 * Search for a sleeping lock manager lock which matches exactly this lock 2519 * request; if one is found, fake a signal to cancel it. 2520 * 2521 * Return 1 if a matching lock was found, 0 otherwise. 
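 * (The wakeup is delivered with INTERRUPT_WAKEUP(), so the sleeping thread
 * unwinds just as if it had been interrupted by a signal.)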
2522 */ 2523 2524 static int 2525 flk_canceled(lock_descriptor_t *request) 2526 { 2527 lock_descriptor_t *lock, *nlock; 2528 graph_t *gp = request->l_graph; 2529 vnode_t *vp = request->l_vnode; 2530 2531 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 2532 ASSERT(IS_LOCKMGR(request)); 2533 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2534 2535 if (lock) { 2536 while (lock->l_vnode == vp) { 2537 nlock = lock->l_next; 2538 if (SAME_OWNER(lock, request) && 2539 lock->l_start == request->l_start && 2540 lock->l_end == request->l_end) { 2541 INTERRUPT_WAKEUP(lock); 2542 return (1); 2543 } 2544 lock = nlock; 2545 } 2546 } 2547 return (0); 2548 } 2549 2550 /* 2551 * Remove all the locks for the vnode belonging to the given pid and sysid. 2552 */ 2553 2554 void 2555 cleanlocks(vnode_t *vp, pid_t pid, int sysid) 2556 { 2557 graph_t *gp; 2558 lock_descriptor_t *lock, *nlock; 2559 lock_descriptor_t *link_stack; 2560 2561 STACK_INIT(link_stack); 2562 2563 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2564 2565 if (gp == NULL) 2566 return; 2567 mutex_enter(&gp->gp_mutex); 2568 2569 CHECK_SLEEPING_LOCKS(gp); 2570 CHECK_ACTIVE_LOCKS(gp); 2571 2572 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2573 2574 if (lock) { 2575 do { 2576 nlock = lock->l_next; 2577 if ((lock->l_flock.l_pid == pid || 2578 pid == IGN_PID) && 2579 lock->l_flock.l_sysid == sysid) { 2580 CANCEL_WAKEUP(lock); 2581 } 2582 lock = nlock; 2583 } while (lock->l_vnode == vp); 2584 } 2585 2586 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2587 2588 if (lock) { 2589 do { 2590 nlock = lock->l_next; 2591 if ((lock->l_flock.l_pid == pid || 2592 pid == IGN_PID) && 2593 lock->l_flock.l_sysid == sysid) { 2594 flk_delete_active_lock(lock, 0); 2595 STACK_PUSH(link_stack, lock, l_stack); 2596 } 2597 lock = nlock; 2598 } while (lock->l_vnode == vp); 2599 } 2600 2601 while ((lock = STACK_TOP(link_stack)) != NULL) { 2602 STACK_POP(link_stack, l_stack); 2603 flk_wakeup(lock, 1); 2604 flk_free_lock(lock); 2605 } 2606 2607 CHECK_SLEEPING_LOCKS(gp); 2608 CHECK_ACTIVE_LOCKS(gp); 2609 CHECK_OWNER_LOCKS(gp, pid, sysid, vp); 2610 mutex_exit(&gp->gp_mutex); 2611 } 2612 2613 2614 /* 2615 * Called from 'fs' read and write routines for files that have mandatory 2616 * locking enabled. 2617 */ 2618 2619 int 2620 chklock( 2621 struct vnode *vp, 2622 int iomode, 2623 u_offset_t offset, 2624 ssize_t len, 2625 int fmode, 2626 caller_context_t *ct) 2627 { 2628 register int i; 2629 struct flock64 bf; 2630 int error = 0; 2631 2632 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK; 2633 bf.l_whence = 0; 2634 bf.l_start = offset; 2635 bf.l_len = len; 2636 if (ct == NULL) { 2637 bf.l_pid = curproc->p_pid; 2638 bf.l_sysid = 0; 2639 } else { 2640 bf.l_pid = ct->cc_pid; 2641 bf.l_sysid = ct->cc_sysid; 2642 } 2643 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK; 2644 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 || 2645 bf.l_type != F_UNLCK) 2646 error = i ? i : EAGAIN; 2647 return (error); 2648 } 2649 2650 /* 2651 * convoff - converts the given data (start, whence) to the 2652 * given whence. 
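 *
 * For example, to rewrite an flock64 expressed relative to the current file
 * offset (l_whence == 1, SEEK_CUR) as an absolute range (whence 0, SEEK_SET),
 * a caller might do the following (illustrative only; 'bf' and 'fileoff' are
 * hypothetical names):
 *
 *	if ((error = convoff(vp, &bf, 0, fileoff)) != 0)
 *		return (error);
 *
 * after which bf.l_start is an absolute offset and bf.l_whence is 0.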
2653 */ 2654 int 2655 convoff(vp, lckdat, whence, offset) 2656 struct vnode *vp; 2657 struct flock64 *lckdat; 2658 int whence; 2659 offset_t offset; 2660 { 2661 int error; 2662 struct vattr vattr; 2663 2664 if ((lckdat->l_whence == 2) || (whence == 2)) { 2665 vattr.va_mask = AT_SIZE; 2666 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) 2667 return (error); 2668 } 2669 2670 switch (lckdat->l_whence) { 2671 case 1: 2672 lckdat->l_start += offset; 2673 break; 2674 case 2: 2675 lckdat->l_start += vattr.va_size; 2676 /* FALLTHRU */ 2677 case 0: 2678 break; 2679 default: 2680 return (EINVAL); 2681 } 2682 2683 if (lckdat->l_start < 0) 2684 return (EINVAL); 2685 2686 switch (whence) { 2687 case 1: 2688 lckdat->l_start -= offset; 2689 break; 2690 case 2: 2691 lckdat->l_start -= vattr.va_size; 2692 /* FALLTHRU */ 2693 case 0: 2694 break; 2695 default: 2696 return (EINVAL); 2697 } 2698 2699 lckdat->l_whence = (short)whence; 2700 return (0); 2701 } 2702 2703 2704 /* proc_graph function definitions */ 2705 2706 /* 2707 * Function checks for deadlock due to the new 'lock'. If deadlock found 2708 * edges of this lock are freed and returned. 2709 */ 2710 2711 static int 2712 flk_check_deadlock(lock_descriptor_t *lock) 2713 { 2714 proc_vertex_t *start_vertex, *pvertex; 2715 proc_vertex_t *dvertex; 2716 proc_edge_t *pep, *ppep; 2717 edge_t *ep, *nep; 2718 proc_vertex_t *process_stack; 2719 2720 STACK_INIT(process_stack); 2721 2722 mutex_enter(&flock_lock); 2723 start_vertex = flk_get_proc_vertex(lock); 2724 ASSERT(start_vertex != NULL); 2725 2726 /* construct the edges from this process to other processes */ 2727 2728 ep = FIRST_ADJ(lock); 2729 while (ep != HEAD(lock)) { 2730 proc_vertex_t *adj_proc; 2731 2732 adj_proc = flk_get_proc_vertex(ep->to_vertex); 2733 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) { 2734 if (pep->to_proc == adj_proc) { 2735 ASSERT(pep->refcount); 2736 pep->refcount++; 2737 break; 2738 } 2739 } 2740 if (pep == NULL) { 2741 pep = flk_get_proc_edge(); 2742 pep->to_proc = adj_proc; 2743 pep->refcount = 1; 2744 adj_proc->incount++; 2745 pep->next = start_vertex->edge; 2746 start_vertex->edge = pep; 2747 } 2748 ep = NEXT_ADJ(ep); 2749 } 2750 2751 ep = FIRST_IN(lock); 2752 2753 while (ep != HEAD(lock)) { 2754 proc_vertex_t *in_proc; 2755 2756 in_proc = flk_get_proc_vertex(ep->from_vertex); 2757 2758 for (pep = in_proc->edge; pep != NULL; pep = pep->next) { 2759 if (pep->to_proc == start_vertex) { 2760 ASSERT(pep->refcount); 2761 pep->refcount++; 2762 break; 2763 } 2764 } 2765 if (pep == NULL) { 2766 pep = flk_get_proc_edge(); 2767 pep->to_proc = start_vertex; 2768 pep->refcount = 1; 2769 start_vertex->incount++; 2770 pep->next = in_proc->edge; 2771 in_proc->edge = pep; 2772 } 2773 ep = NEXT_IN(ep); 2774 } 2775 2776 if (start_vertex->incount == 0) { 2777 mutex_exit(&flock_lock); 2778 return (0); 2779 } 2780 2781 flk_proc_graph_uncolor(); 2782 2783 start_vertex->p_sedge = start_vertex->edge; 2784 2785 STACK_PUSH(process_stack, start_vertex, p_stack); 2786 2787 while ((pvertex = STACK_TOP(process_stack)) != NULL) { 2788 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) { 2789 dvertex = pep->to_proc; 2790 if (!PROC_ARRIVED(dvertex)) { 2791 STACK_PUSH(process_stack, dvertex, p_stack); 2792 dvertex->p_sedge = dvertex->edge; 2793 PROC_ARRIVE(pvertex); 2794 pvertex->p_sedge = pep->next; 2795 break; 2796 } 2797 if (!PROC_DEPARTED(dvertex)) 2798 goto deadlock; 2799 } 2800 if (pep == NULL) { 2801 PROC_DEPART(pvertex); 2802 STACK_POP(process_stack, p_stack); 2803 } 2804 
} 2805 mutex_exit(&flock_lock); 2806 return (0); 2807 2808 deadlock: 2809 2810 /* we remove all lock edges and proc edges */ 2811 2812 ep = FIRST_ADJ(lock); 2813 while (ep != HEAD(lock)) { 2814 proc_vertex_t *adj_proc; 2815 adj_proc = flk_get_proc_vertex(ep->to_vertex); 2816 nep = NEXT_ADJ(ep); 2817 IN_LIST_REMOVE(ep); 2818 ADJ_LIST_REMOVE(ep); 2819 flk_free_edge(ep); 2820 ppep = start_vertex->edge; 2821 for (pep = start_vertex->edge; pep != NULL; ppep = pep, 2822 pep = ppep->next) { 2823 if (pep->to_proc == adj_proc) { 2824 pep->refcount--; 2825 if (pep->refcount == 0) { 2826 if (pep == ppep) { 2827 start_vertex->edge = pep->next; 2828 } else { 2829 ppep->next = pep->next; 2830 } 2831 adj_proc->incount--; 2832 flk_proc_release(adj_proc); 2833 flk_free_proc_edge(pep); 2834 } 2835 break; 2836 } 2837 } 2838 ep = nep; 2839 } 2840 ep = FIRST_IN(lock); 2841 while (ep != HEAD(lock)) { 2842 proc_vertex_t *in_proc; 2843 in_proc = flk_get_proc_vertex(ep->from_vertex); 2844 nep = NEXT_IN(ep); 2845 IN_LIST_REMOVE(ep); 2846 ADJ_LIST_REMOVE(ep); 2847 flk_free_edge(ep); 2848 ppep = in_proc->edge; 2849 for (pep = in_proc->edge; pep != NULL; ppep = pep, 2850 pep = ppep->next) { 2851 if (pep->to_proc == start_vertex) { 2852 pep->refcount--; 2853 if (pep->refcount == 0) { 2854 if (pep == ppep) { 2855 in_proc->edge = pep->next; 2856 } else { 2857 ppep->next = pep->next; 2858 } 2859 start_vertex->incount--; 2860 flk_proc_release(in_proc); 2861 flk_free_proc_edge(pep); 2862 } 2863 break; 2864 } 2865 } 2866 ep = nep; 2867 } 2868 flk_proc_release(start_vertex); 2869 mutex_exit(&flock_lock); 2870 return (1); 2871 } 2872 2873 /* 2874 * Get a proc vertex. If lock's pvertex value gets a correct proc vertex 2875 * from the list we return that, otherwise we allocate one. If necessary, 2876 * we grow the list of vertices also. 2877 */ 2878 2879 static proc_vertex_t * 2880 flk_get_proc_vertex(lock_descriptor_t *lock) 2881 { 2882 int i; 2883 proc_vertex_t *pv; 2884 proc_vertex_t **palloc; 2885 2886 ASSERT(MUTEX_HELD(&flock_lock)); 2887 if (lock->pvertex != -1) { 2888 ASSERT(lock->pvertex >= 0); 2889 pv = pgraph.proc[lock->pvertex]; 2890 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) { 2891 return (pv); 2892 } 2893 } 2894 for (i = 0; i < pgraph.gcount; i++) { 2895 pv = pgraph.proc[i]; 2896 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) { 2897 lock->pvertex = pv->index = i; 2898 return (pv); 2899 } 2900 } 2901 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP); 2902 pv->pid = lock->l_flock.l_pid; 2903 pv->sysid = lock->l_flock.l_sysid; 2904 flk_proc_vertex_allocs++; 2905 if (pgraph.free != 0) { 2906 for (i = 0; i < pgraph.gcount; i++) { 2907 if (pgraph.proc[i] == NULL) { 2908 pgraph.proc[i] = pv; 2909 lock->pvertex = pv->index = i; 2910 pgraph.free--; 2911 return (pv); 2912 } 2913 } 2914 } 2915 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) * 2916 sizeof (proc_vertex_t *), KM_SLEEP); 2917 2918 if (pgraph.proc) { 2919 bcopy(pgraph.proc, palloc, 2920 pgraph.gcount * sizeof (proc_vertex_t *)); 2921 2922 kmem_free(pgraph.proc, 2923 pgraph.gcount * sizeof (proc_vertex_t *)); 2924 } 2925 pgraph.proc = palloc; 2926 pgraph.free += (PROC_CHUNK - 1); 2927 pv->index = lock->pvertex = pgraph.gcount; 2928 pgraph.gcount += PROC_CHUNK; 2929 pgraph.proc[pv->index] = pv; 2930 return (pv); 2931 } 2932 2933 /* 2934 * Allocate a proc edge. 
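 * The allocation may sleep (KM_SLEEP); both callers in this file invoke it
 * while holding flock_lock.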
2935 */ 2936 2937 static proc_edge_t * 2938 flk_get_proc_edge() 2939 { 2940 proc_edge_t *pep; 2941 2942 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP); 2943 flk_proc_edge_allocs++; 2944 return (pep); 2945 } 2946 2947 /* 2948 * Free the proc edge. Called whenever its reference count goes to zero. 2949 */ 2950 2951 static void 2952 flk_free_proc_edge(proc_edge_t *pep) 2953 { 2954 ASSERT(pep->refcount == 0); 2955 kmem_free((void *)pep, sizeof (proc_edge_t)); 2956 flk_proc_edge_frees++; 2957 } 2958 2959 /* 2960 * Color the graph explicitly done only when the mark value hits max value. 2961 */ 2962 2963 static void 2964 flk_proc_graph_uncolor() 2965 { 2966 int i; 2967 2968 if (pgraph.mark == UINT_MAX) { 2969 for (i = 0; i < pgraph.gcount; i++) 2970 if (pgraph.proc[i] != NULL) { 2971 pgraph.proc[i]->atime = 0; 2972 pgraph.proc[i]->dtime = 0; 2973 } 2974 pgraph.mark = 1; 2975 } else { 2976 pgraph.mark++; 2977 } 2978 } 2979 2980 /* 2981 * Release the proc vertex iff both there are no in edges and out edges 2982 */ 2983 2984 static void 2985 flk_proc_release(proc_vertex_t *proc) 2986 { 2987 ASSERT(MUTEX_HELD(&flock_lock)); 2988 if (proc->edge == NULL && proc->incount == 0) { 2989 pgraph.proc[proc->index] = NULL; 2990 pgraph.free++; 2991 kmem_free(proc, sizeof (proc_vertex_t)); 2992 flk_proc_vertex_frees++; 2993 } 2994 } 2995 2996 /* 2997 * Updates process graph to reflect change in a lock_graph. 2998 * Note: We should call this function only after we have a correctly 2999 * recomputed lock graph. Otherwise we might miss a deadlock detection. 3000 * eg: in function flk_relation() we call this function after flk_recompute_ 3001 * dependencies() otherwise if a process tries to lock a vnode hashed 3002 * into another graph it might sleep for ever. 3003 */ 3004 3005 static void 3006 flk_update_proc_graph(edge_t *ep, int delete) 3007 { 3008 proc_vertex_t *toproc, *fromproc; 3009 proc_edge_t *pep, *prevpep; 3010 3011 mutex_enter(&flock_lock); 3012 toproc = flk_get_proc_vertex(ep->to_vertex); 3013 fromproc = flk_get_proc_vertex(ep->from_vertex); 3014 3015 if (!delete) 3016 goto add; 3017 pep = prevpep = fromproc->edge; 3018 3019 ASSERT(pep != NULL); 3020 while (pep != NULL) { 3021 if (pep->to_proc == toproc) { 3022 ASSERT(pep->refcount > 0); 3023 pep->refcount--; 3024 if (pep->refcount == 0) { 3025 if (pep == prevpep) { 3026 fromproc->edge = pep->next; 3027 } else { 3028 prevpep->next = pep->next; 3029 } 3030 toproc->incount--; 3031 flk_proc_release(toproc); 3032 flk_free_proc_edge(pep); 3033 } 3034 break; 3035 } 3036 prevpep = pep; 3037 pep = pep->next; 3038 } 3039 flk_proc_release(fromproc); 3040 mutex_exit(&flock_lock); 3041 return; 3042 add: 3043 3044 pep = fromproc->edge; 3045 3046 while (pep != NULL) { 3047 if (pep->to_proc == toproc) { 3048 ASSERT(pep->refcount > 0); 3049 pep->refcount++; 3050 break; 3051 } 3052 pep = pep->next; 3053 } 3054 if (pep == NULL) { 3055 pep = flk_get_proc_edge(); 3056 pep->to_proc = toproc; 3057 pep->refcount = 1; 3058 toproc->incount++; 3059 pep->next = fromproc->edge; 3060 fromproc->edge = pep; 3061 } 3062 mutex_exit(&flock_lock); 3063 } 3064 3065 /* 3066 * Set the control status for lock manager requests. 3067 * 3068 */ 3069 3070 /* 3071 * PSARC case 1997/292 3072 * 3073 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid(). 3074 * Effects: Set the state of the NLM server identified by "nlmid" 3075 * in the NLM registry to state "nlm_state." 3076 * Raises exception no_such_nlm if "nlmid" doesn't identify a known 3077 * NLM server to this LLM. 
3078 * Note that when this routine is called with NLM_SHUTTING_DOWN there 3079 * may be locks requests that have gotten started but not finished. In 3080 * particular, there may be blocking requests that are in the callback code 3081 * before sleeping (so they're not holding the lock for the graph). If 3082 * such a thread reacquires the graph's lock (to go to sleep) after 3083 * NLM state in the NLM registry is set to a non-up value, 3084 * it will notice the status and bail out. If the request gets 3085 * granted before the thread can check the NLM registry, let it 3086 * continue normally. It will get flushed when we are called with NLM_DOWN. 3087 * 3088 * Modifies: nlm_reg_obj (global) 3089 * Arguments: 3090 * nlmid (IN): id uniquely identifying an NLM server 3091 * nlm_state (IN): NLM server state to change "nlmid" to 3092 */ 3093 void 3094 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state) 3095 { 3096 /* 3097 * Check to see if node is booted as a cluster. If not, return. 3098 */ 3099 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { 3100 return; 3101 } 3102 3103 /* 3104 * Check for development/debugging. It is possible to boot a node 3105 * in non-cluster mode, and then run a special script, currently 3106 * available only to developers, to bring up the node as part of a 3107 * cluster. The problem is that running such a script does not 3108 * result in the routine flk_init() being called and hence global array 3109 * nlm_reg_status is NULL. The NLM thinks it's in cluster mode, 3110 * but the LLM needs to do an additional check to see if the global 3111 * array has been created or not. If nlm_reg_status is NULL, then 3112 * return, else continue. 3113 */ 3114 if (nlm_reg_status == NULL) { 3115 return; 3116 } 3117 3118 ASSERT(nlmid <= nlm_status_size && nlmid >= 0); 3119 mutex_enter(&nlm_reg_lock); 3120 3121 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) { 3122 /* 3123 * If the NLM server "nlmid" is unknown in the NLM registry, 3124 * add it to the registry in the nlm shutting down state. 3125 */ 3126 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, 3127 FLK_NLM_SHUTTING_DOWN); 3128 } else { 3129 /* 3130 * Change the state of the NLM server identified by "nlmid" 3131 * in the NLM registry to the argument "nlm_state." 3132 */ 3133 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, 3134 nlm_state); 3135 } 3136 3137 /* 3138 * The reason we must register the NLM server that is shutting down 3139 * with an LLM that doesn't already know about it (never sent a lock 3140 * request) is to handle correctly a race between shutdown and a new 3141 * lock request. Suppose that a shutdown request from the NLM server 3142 * invokes this routine at the LLM, and a thread is spawned to 3143 * service the request. Now suppose a new lock request is in 3144 * progress and has already passed the first line of defense in 3145 * reclock(), which denies new locks requests from NLM servers 3146 * that are not in the NLM_UP state. After the current routine 3147 * is invoked for both phases of shutdown, the routine will return, 3148 * having done nothing, and the lock request will proceed and 3149 * probably be granted. The problem is that the shutdown was ignored 3150 * by the lock request because there was no record of that NLM server 3151 * shutting down. We will be in the peculiar position of thinking 3152 * that we've shutdown the NLM server and all locks at all LLMs have 3153 * been discarded, but in fact there's still one lock held. 
3154 * The solution is to record the existence of NLM server and change 3155 * its state immediately to NLM_SHUTTING_DOWN. The lock request in 3156 * progress may proceed because the next phase NLM_DOWN will catch 3157 * this lock and discard it. 3158 */ 3159 mutex_exit(&nlm_reg_lock); 3160 3161 switch (nlm_state) { 3162 case FLK_NLM_UP: 3163 /* 3164 * Change the NLM state of all locks still held on behalf of 3165 * the NLM server identified by "nlmid" to NLM_UP. 3166 */ 3167 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP); 3168 break; 3169 3170 case FLK_NLM_SHUTTING_DOWN: 3171 /* 3172 * Wake up all sleeping locks for the NLM server identified 3173 * by "nlmid." Note that eventually all woken threads will 3174 * have their lock requests cancelled and descriptors 3175 * removed from the sleeping lock list. Note that the NLM 3176 * server state associated with each lock descriptor is 3177 * changed to FLK_NLM_SHUTTING_DOWN. 3178 */ 3179 cl_flk_wakeup_sleeping_nlm_locks(nlmid); 3180 break; 3181 3182 case FLK_NLM_DOWN: 3183 /* 3184 * Discard all active, granted locks for this NLM server 3185 * identified by "nlmid." 3186 */ 3187 cl_flk_unlock_nlm_granted(nlmid); 3188 break; 3189 3190 default: 3191 panic("cl_set_nlm_status: bad status (%d)", nlm_state); 3192 } 3193 } 3194 3195 /* 3196 * Set the control status for lock manager requests. 3197 * 3198 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there 3199 * may be locks requests that have gotten started but not finished. In 3200 * particular, there may be blocking requests that are in the callback code 3201 * before sleeping (so they're not holding the lock for the graph). If 3202 * such a thread reacquires the graph's lock (to go to sleep) after 3203 * flk_lockmgr_status is set to a non-up value, it will notice the status 3204 * and bail out. If the request gets granted before the thread can check 3205 * flk_lockmgr_status, let it continue normally. It will get flushed when 3206 * we are called with FLK_LOCKMGR_DOWN. 3207 */ 3208 3209 void 3210 flk_set_lockmgr_status(flk_lockmgr_status_t status) 3211 { 3212 int i; 3213 graph_t *gp; 3214 struct flock_globals *fg; 3215 3216 fg = flk_get_globals(); 3217 ASSERT(fg != NULL); 3218 3219 mutex_enter(&flock_lock); 3220 fg->flk_lockmgr_status = status; 3221 mutex_exit(&flock_lock); 3222 3223 /* 3224 * If the lock manager is coming back up, all that's needed is to 3225 * propagate this information to the graphs. If the lock manager 3226 * is going down, additional action is required, and each graph's 3227 * copy of the state is updated atomically with this other action. 3228 */ 3229 switch (status) { 3230 case FLK_LOCKMGR_UP: 3231 for (i = 0; i < HASH_SIZE; i++) { 3232 mutex_enter(&flock_lock); 3233 gp = lock_graph[i]; 3234 mutex_exit(&flock_lock); 3235 if (gp == NULL) 3236 continue; 3237 mutex_enter(&gp->gp_mutex); 3238 fg->lockmgr_status[i] = status; 3239 mutex_exit(&gp->gp_mutex); 3240 } 3241 break; 3242 case FLK_WAKEUP_SLEEPERS: 3243 wakeup_sleeping_lockmgr_locks(fg); 3244 break; 3245 case FLK_LOCKMGR_DOWN: 3246 unlock_lockmgr_granted(fg); 3247 break; 3248 default: 3249 panic("flk_set_lockmgr_status: bad status (%d)", status); 3250 break; 3251 } 3252 } 3253 3254 /* 3255 * This routine returns all the locks that are active or sleeping and are 3256 * associated with a particular set of identifiers. If lock_state != 0, then 3257 * only locks that match the lock_state are returned. If lock_state == 0, then 3258 * all locks are returned. If pid == NOPID, the pid is ignored. 
If 3259 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the 3260 * vnode pointer is ignored. 3261 * 3262 * A list containing the vnode pointer and an flock structure 3263 * describing the lock is returned. Each element in the list is 3264 * dynamically allocated and must be freed by the caller. The 3265 * last item in the list is denoted by a NULL value in the ll_next 3266 * field. 3267 * 3268 * The vnode pointers returned are held. The caller is responsible 3269 * for releasing these. Note that the returned list is only a snapshot of 3270 * the current lock information, and that it is a snapshot of a moving 3271 * target (only one graph is locked at a time). 3272 */ 3273 3274 locklist_t * 3275 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid, 3276 pid_t pid, const vnode_t *vp, zoneid_t zoneid) 3277 { 3278 lock_descriptor_t *lock; 3279 lock_descriptor_t *graph_head; 3280 locklist_t listhead; 3281 locklist_t *llheadp; 3282 locklist_t *llp; 3283 locklist_t *lltp; 3284 graph_t *gp; 3285 int i; 3286 int first_index; /* graph index */ 3287 int num_indexes; /* graph index */ 3288 3289 ASSERT((list_type == FLK_ACTIVE_STATE) || 3290 (list_type == FLK_SLEEPING_STATE)); 3291 3292 /* 3293 * Get a pointer to something to use as a list head while building 3294 * the rest of the list. 3295 */ 3296 llheadp = &listhead; 3297 lltp = llheadp; 3298 llheadp->ll_next = (locklist_t *)NULL; 3299 3300 /* Figure out which graphs we want to look at. */ 3301 if (vp == NULL) { 3302 first_index = 0; 3303 num_indexes = HASH_SIZE; 3304 } else { 3305 first_index = HASH_INDEX(vp); 3306 num_indexes = 1; 3307 } 3308 3309 for (i = first_index; i < first_index + num_indexes; i++) { 3310 mutex_enter(&flock_lock); 3311 gp = lock_graph[i]; 3312 mutex_exit(&flock_lock); 3313 if (gp == NULL) { 3314 continue; 3315 } 3316 3317 mutex_enter(&gp->gp_mutex); 3318 graph_head = (list_type == FLK_ACTIVE_STATE) ? 3319 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp); 3320 for (lock = graph_head->l_next; 3321 lock != graph_head; 3322 lock = lock->l_next) { 3323 if (use_sysid && lock->l_flock.l_sysid != sysid) 3324 continue; 3325 if (pid != NOPID && lock->l_flock.l_pid != pid) 3326 continue; 3327 if (vp != NULL && lock->l_vnode != vp) 3328 continue; 3329 if (lock_state && !(lock_state & lock->l_state)) 3330 continue; 3331 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES) 3332 continue; 3333 /* 3334 * A matching lock was found. Allocate 3335 * space for a new locklist entry and fill 3336 * it in. 3337 */ 3338 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP); 3339 lltp->ll_next = llp; 3340 VN_HOLD(lock->l_vnode); 3341 llp->ll_vp = lock->l_vnode; 3342 create_flock(lock, &(llp->ll_flock)); 3343 llp->ll_next = (locklist_t *)NULL; 3344 lltp = llp; 3345 } 3346 mutex_exit(&gp->gp_mutex); 3347 } 3348 3349 llp = llheadp->ll_next; 3350 return (llp); 3351 } 3352 3353 /* 3354 * These two functions are simply interfaces to get_lock_list. They return 3355 * a list of sleeping or active locks for the given sysid and pid. See 3356 * get_lock_list for details. 3357 * 3358 * In either case we don't particularly care to specify the zone of interest; 3359 * the sysid-space is global across zones, so the sysid will map to exactly one 3360 * zone, and we'll return information for that zone. 
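 *
 * Illustrative use (hypothetical caller): walk the returned snapshot and then
 * free it; flk_free_locklist() also releases the vnode holds that were taken
 * on the caller's behalf:
 *
 *	locklist_t *llhead, *llp;
 *
 *	llhead = flk_get_active_locks(sysid, pid);
 *	for (llp = llhead; llp != NULL; llp = llp->ll_next)
 *		... examine llp->ll_vp and llp->ll_flock ...
 *	flk_free_locklist(llhead);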
3361 */ 3362 3363 locklist_t * 3364 flk_get_sleeping_locks(int sysid, pid_t pid) 3365 { 3366 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL, 3367 ALL_ZONES)); 3368 } 3369 3370 locklist_t * 3371 flk_get_active_locks(int sysid, pid_t pid) 3372 { 3373 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL, 3374 ALL_ZONES)); 3375 } 3376 3377 /* 3378 * Another interface to get_lock_list. This one returns all the active 3379 * locks for a given vnode. Again, see get_lock_list for details. 3380 * 3381 * We don't need to specify which zone's locks we're interested in. The matter 3382 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't 3383 * be used by multiple zones, so the list of locks will all be from the right 3384 * zone. 3385 */ 3386 3387 locklist_t * 3388 flk_active_locks_for_vp(const vnode_t *vp) 3389 { 3390 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp, 3391 ALL_ZONES)); 3392 } 3393 3394 /* 3395 * Another interface to get_lock_list. This one returns all the active 3396 * nbmand locks for a given vnode. Again, see get_lock_list for details. 3397 * 3398 * See the comment for flk_active_locks_for_vp() for why we don't care to 3399 * specify the particular zone of interest. 3400 */ 3401 locklist_t * 3402 flk_active_nbmand_locks_for_vp(const vnode_t *vp) 3403 { 3404 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE, 3405 NOPID, vp, ALL_ZONES)); 3406 } 3407 3408 /* 3409 * Another interface to get_lock_list. This one returns all the active 3410 * nbmand locks for a given pid. Again, see get_lock_list for details. 3411 * 3412 * The zone doesn't need to be specified here; the locks held by a 3413 * particular process will either be local (ie, non-NFS) or from the zone 3414 * the process is executing in. This is because other parts of the system 3415 * ensure that an NFS vnode can't be used in a zone other than that in 3416 * which it was opened. 3417 */ 3418 locklist_t * 3419 flk_active_nbmand_locks(pid_t pid) 3420 { 3421 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE, 3422 pid, NULL, ALL_ZONES)); 3423 } 3424 3425 /* 3426 * Free up all entries in the locklist. 3427 */ 3428 void 3429 flk_free_locklist(locklist_t *llp) 3430 { 3431 locklist_t *next_llp; 3432 3433 while (llp) { 3434 next_llp = llp->ll_next; 3435 VN_RELE(llp->ll_vp); 3436 kmem_free(llp, sizeof (*llp)); 3437 llp = next_llp; 3438 } 3439 } 3440 3441 static void 3442 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state) 3443 { 3444 /* 3445 * For each graph "lg" in the hash table lock_graph do 3446 * a. Get the list of sleeping locks 3447 * b. For each lock descriptor in the list do 3448 * i. If the requested lock is an NLM server request AND 3449 * the nlmid is the same as the routine argument then 3450 * change the lock descriptor's state field to 3451 * "nlm_state." 3452 * c. Get the list of active locks 3453 * d. For each lock descriptor in the list do 3454 * i. If the requested lock is an NLM server request AND 3455 * the nlmid is the same as the routine argument then 3456 * change the lock descriptor's state field to 3457 * "nlm_state." 
3458 */ 3459 3460 int i; 3461 graph_t *gp; /* lock graph */ 3462 lock_descriptor_t *lock; /* lock */ 3463 lock_descriptor_t *nlock = NULL; /* next lock */ 3464 int lock_nlmid; 3465 3466 for (i = 0; i < HASH_SIZE; i++) { 3467 mutex_enter(&flock_lock); 3468 gp = lock_graph[i]; 3469 mutex_exit(&flock_lock); 3470 if (gp == NULL) { 3471 continue; 3472 } 3473 3474 /* Get list of sleeping locks in current lock graph. */ 3475 mutex_enter(&gp->gp_mutex); 3476 for (lock = SLEEPING_HEAD(gp)->l_next; 3477 lock != SLEEPING_HEAD(gp); 3478 lock = nlock) { 3479 nlock = lock->l_next; 3480 /* get NLM id */ 3481 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 3482 3483 /* 3484 * If NLM server request AND nlmid of lock matches 3485 * nlmid of argument, then set the NLM state of the 3486 * lock to "nlm_state." 3487 */ 3488 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 3489 SET_NLM_STATE(lock, nlm_state); 3490 } 3491 } 3492 3493 /* Get list of active locks in current lock graph. */ 3494 for (lock = ACTIVE_HEAD(gp)->l_next; 3495 lock != ACTIVE_HEAD(gp); 3496 lock = nlock) { 3497 nlock = lock->l_next; 3498 /* get NLM id */ 3499 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 3500 3501 /* 3502 * If NLM server request AND nlmid of lock matches 3503 * nlmid of argument, then set the NLM state of the 3504 * lock to "nlm_state." 3505 */ 3506 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 3507 ASSERT(IS_ACTIVE(lock)); 3508 SET_NLM_STATE(lock, nlm_state); 3509 } 3510 } 3511 mutex_exit(&gp->gp_mutex); 3512 } 3513 } 3514 3515 /* 3516 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid(). 3517 * Effects: Find all sleeping lock manager requests _only_ for the NLM server 3518 * identified by "nlmid." Poke those lock requests. 3519 */ 3520 static void 3521 cl_flk_wakeup_sleeping_nlm_locks(int nlmid) 3522 { 3523 lock_descriptor_t *lock; 3524 lock_descriptor_t *nlock = NULL; /* next lock */ 3525 int i; 3526 graph_t *gp; 3527 int lock_nlmid; 3528 3529 for (i = 0; i < HASH_SIZE; i++) { 3530 mutex_enter(&flock_lock); 3531 gp = lock_graph[i]; 3532 mutex_exit(&flock_lock); 3533 if (gp == NULL) { 3534 continue; 3535 } 3536 3537 mutex_enter(&gp->gp_mutex); 3538 for (lock = SLEEPING_HEAD(gp)->l_next; 3539 lock != SLEEPING_HEAD(gp); 3540 lock = nlock) { 3541 nlock = lock->l_next; 3542 /* 3543 * If NLM server request _and_ nlmid of lock matches 3544 * nlmid of argument, then set the NLM state of the 3545 * lock to NLM_SHUTTING_DOWN, and wake up sleeping 3546 * request. 3547 */ 3548 if (IS_LOCKMGR(lock)) { 3549 /* get NLM id */ 3550 lock_nlmid = 3551 GETNLMID(lock->l_flock.l_sysid); 3552 if (nlmid == lock_nlmid) { 3553 SET_NLM_STATE(lock, 3554 FLK_NLM_SHUTTING_DOWN); 3555 INTERRUPT_WAKEUP(lock); 3556 } 3557 } 3558 } 3559 mutex_exit(&gp->gp_mutex); 3560 } 3561 } 3562 3563 /* 3564 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid() 3565 * Effects: Find all active (granted) lock manager locks _only_ for the 3566 * NLM server identified by "nlmid" and release them. 
3567 */ 3568 static void 3569 cl_flk_unlock_nlm_granted(int nlmid) 3570 { 3571 lock_descriptor_t *lock; 3572 lock_descriptor_t *nlock = NULL; /* next lock */ 3573 int i; 3574 graph_t *gp; 3575 int lock_nlmid; 3576 3577 for (i = 0; i < HASH_SIZE; i++) { 3578 mutex_enter(&flock_lock); 3579 gp = lock_graph[i]; 3580 mutex_exit(&flock_lock); 3581 if (gp == NULL) { 3582 continue; 3583 } 3584 3585 mutex_enter(&gp->gp_mutex); 3586 for (lock = ACTIVE_HEAD(gp)->l_next; 3587 lock != ACTIVE_HEAD(gp); 3588 lock = nlock) { 3589 nlock = lock->l_next; 3590 ASSERT(IS_ACTIVE(lock)); 3591 3592 /* 3593 * If it's an NLM server request _and_ nlmid of 3594 * the lock matches nlmid of argument, then 3595 * remove the active lock the list, wakup blocked 3596 * threads, and free the storage for the lock. 3597 * Note that there's no need to mark the NLM state 3598 * of this lock to NLM_DOWN because the lock will 3599 * be deleted anyway and its storage freed. 3600 */ 3601 if (IS_LOCKMGR(lock)) { 3602 /* get NLM id */ 3603 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 3604 if (nlmid == lock_nlmid) { 3605 flk_delete_active_lock(lock, 0); 3606 flk_wakeup(lock, 1); 3607 flk_free_lock(lock); 3608 } 3609 } 3610 } 3611 mutex_exit(&gp->gp_mutex); 3612 } 3613 } 3614 3615 /* 3616 * Find all sleeping lock manager requests and poke them. 3617 */ 3618 static void 3619 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg) 3620 { 3621 lock_descriptor_t *lock; 3622 lock_descriptor_t *nlock = NULL; /* next lock */ 3623 int i; 3624 graph_t *gp; 3625 zoneid_t zoneid = getzoneid(); 3626 3627 for (i = 0; i < HASH_SIZE; i++) { 3628 mutex_enter(&flock_lock); 3629 gp = lock_graph[i]; 3630 mutex_exit(&flock_lock); 3631 if (gp == NULL) { 3632 continue; 3633 } 3634 3635 mutex_enter(&gp->gp_mutex); 3636 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS; 3637 for (lock = SLEEPING_HEAD(gp)->l_next; 3638 lock != SLEEPING_HEAD(gp); 3639 lock = nlock) { 3640 nlock = lock->l_next; 3641 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) { 3642 INTERRUPT_WAKEUP(lock); 3643 } 3644 } 3645 mutex_exit(&gp->gp_mutex); 3646 } 3647 } 3648 3649 3650 /* 3651 * Find all active (granted) lock manager locks and release them. 3652 */ 3653 static void 3654 unlock_lockmgr_granted(struct flock_globals *fg) 3655 { 3656 lock_descriptor_t *lock; 3657 lock_descriptor_t *nlock = NULL; /* next lock */ 3658 int i; 3659 graph_t *gp; 3660 zoneid_t zoneid = getzoneid(); 3661 3662 for (i = 0; i < HASH_SIZE; i++) { 3663 mutex_enter(&flock_lock); 3664 gp = lock_graph[i]; 3665 mutex_exit(&flock_lock); 3666 if (gp == NULL) { 3667 continue; 3668 } 3669 3670 mutex_enter(&gp->gp_mutex); 3671 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN; 3672 for (lock = ACTIVE_HEAD(gp)->l_next; 3673 lock != ACTIVE_HEAD(gp); 3674 lock = nlock) { 3675 nlock = lock->l_next; 3676 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) { 3677 ASSERT(IS_ACTIVE(lock)); 3678 flk_delete_active_lock(lock, 0); 3679 flk_wakeup(lock, 1); 3680 flk_free_lock(lock); 3681 } 3682 } 3683 mutex_exit(&gp->gp_mutex); 3684 } 3685 } 3686 3687 3688 /* 3689 * Wait until a lock is granted, cancelled, or interrupted. 
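 * If cv_wait_sig() is interrupted by a signal (returns zero), the request is
 * marked FLK_INTERRUPTED_STATE, which terminates the loop.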
3690 */ 3691 3692 static void 3693 wait_for_lock(lock_descriptor_t *request) 3694 { 3695 graph_t *gp = request->l_graph; 3696 3697 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 3698 3699 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) && 3700 !(IS_INTERRUPTED(request))) { 3701 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) { 3702 flk_set_state(request, FLK_INTERRUPTED_STATE); 3703 request->l_state |= INTERRUPTED_LOCK; 3704 } 3705 } 3706 } 3707 3708 /* 3709 * Create an flock structure from the existing lock information 3710 * 3711 * This routine is used to create flock structures for the lock manager 3712 * to use in a reclaim request. Since the lock was originated on this 3713 * host, it must be conforming to UNIX semantics, so no checking is 3714 * done to make sure it falls within the lower half of the 32-bit range. 3715 */ 3716 3717 static void 3718 create_flock(lock_descriptor_t *lp, flock64_t *flp) 3719 { 3720 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND); 3721 ASSERT(lp->l_end >= lp->l_start); 3722 3723 flp->l_type = lp->l_type; 3724 flp->l_whence = 0; 3725 flp->l_start = lp->l_start; 3726 flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 : 3727 (lp->l_end - lp->l_start + 1); 3728 flp->l_sysid = lp->l_flock.l_sysid; 3729 flp->l_pid = lp->l_flock.l_pid; 3730 } 3731 3732 /* 3733 * Convert flock_t data describing a lock range into unsigned long starting 3734 * and ending points, which are put into lock_request. Returns 0 or an 3735 * errno value. 3736 * Large Files: max is passed by the caller and we return EOVERFLOW 3737 * as defined by LFS API. 3738 */ 3739 3740 int 3741 flk_convert_lock_data(vnode_t *vp, flock64_t *flp, 3742 u_offset_t *start, u_offset_t *end, offset_t offset) 3743 { 3744 struct vattr vattr; 3745 int error; 3746 3747 /* 3748 * Determine the starting point of the request 3749 */ 3750 switch (flp->l_whence) { 3751 case 0: /* SEEK_SET */ 3752 *start = (u_offset_t)flp->l_start; 3753 break; 3754 case 1: /* SEEK_CUR */ 3755 *start = (u_offset_t)(flp->l_start + offset); 3756 break; 3757 case 2: /* SEEK_END */ 3758 vattr.va_mask = AT_SIZE; 3759 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) 3760 return (error); 3761 *start = (u_offset_t)(flp->l_start + vattr.va_size); 3762 break; 3763 default: 3764 return (EINVAL); 3765 } 3766 3767 /* 3768 * Determine the range covered by the request. 3769 */ 3770 if (flp->l_len == 0) 3771 *end = MAX_U_OFFSET_T; 3772 else if ((offset_t)flp->l_len > 0) { 3773 *end = (u_offset_t)(*start + (flp->l_len - 1)); 3774 } else { 3775 /* 3776 * Negative length; why do we even allow this ? 3777 * Because this allows easy specification of 3778 * the last n bytes of the file. 3779 */ 3780 *end = *start; 3781 *start += (u_offset_t)flp->l_len; 3782 (*start)++; 3783 } 3784 return (0); 3785 } 3786 3787 /* 3788 * Check the validity of lock data. This can used by the NFS 3789 * frlock routines to check data before contacting the server. The 3790 * server must support semantics that aren't as restrictive as 3791 * the UNIX API, so the NFS client is required to check. 3792 * The maximum is now passed in by the caller. 3793 */ 3794 3795 int 3796 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max) 3797 { 3798 /* 3799 * The end (length) for local locking should never be greater 3800 * than MAXEND. However, the representation for 3801 * the entire file is MAX_U_OFFSET_T. 
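 * (Hence the check below allows end == MAX_U_OFFSET_T even when it exceeds
 * the caller-supplied maximum.)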
3802 */ 3803 if ((start > max) || 3804 ((end > max) && (end != MAX_U_OFFSET_T))) { 3805 return (EINVAL); 3806 } 3807 if (start > end) { 3808 return (EINVAL); 3809 } 3810 return (0); 3811 } 3812 3813 /* 3814 * Fill in request->l_flock with information about the lock blocking the 3815 * request. The complexity here is that lock manager requests are allowed 3816 * to see into the upper part of the 32-bit address range, whereas local 3817 * requests are only allowed to see signed values. 3818 * 3819 * What should be done when "blocker" is a lock manager lock that uses the 3820 * upper portion of the 32-bit range, but "request" is local? Since the 3821 * request has already been determined to have been blocked by the blocker, 3822 * at least some portion of "blocker" must be in the range of the request, 3823 * or the request extends to the end of file. For the first case, the 3824 * portion in the lower range is returned with the indication that it goes 3825 * "to EOF." For the second case, the last byte of the lower range is 3826 * returned with the indication that it goes "to EOF." 3827 */ 3828 3829 static void 3830 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request) 3831 { 3832 flock64_t *flrp; /* l_flock portion of request */ 3833 3834 ASSERT(blocker != NULL); 3835 3836 flrp = &request->l_flock; 3837 flrp->l_whence = 0; 3838 flrp->l_type = blocker->l_type; 3839 flrp->l_pid = blocker->l_flock.l_pid; 3840 flrp->l_sysid = blocker->l_flock.l_sysid; 3841 3842 if (IS_LOCKMGR(request)) { 3843 flrp->l_start = blocker->l_start; 3844 if (blocker->l_end == MAX_U_OFFSET_T) 3845 flrp->l_len = 0; 3846 else 3847 flrp->l_len = blocker->l_end - blocker->l_start + 1; 3848 } else { 3849 if (blocker->l_start > MAXEND) { 3850 flrp->l_start = MAXEND; 3851 flrp->l_len = 0; 3852 } else { 3853 flrp->l_start = blocker->l_start; 3854 if (blocker->l_end == MAX_U_OFFSET_T) 3855 flrp->l_len = 0; 3856 else 3857 flrp->l_len = blocker->l_end - 3858 blocker->l_start + 1; 3859 } 3860 } 3861 } 3862 3863 /* 3864 * PSARC case 1997/292 3865 */ 3866 /* 3867 * This is the public routine exported by flock.h. 3868 */ 3869 void 3870 cl_flk_change_nlm_state_to_unknown(int nlmid) 3871 { 3872 /* 3873 * Check to see if node is booted as a cluster. If not, return. 3874 */ 3875 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { 3876 return; 3877 } 3878 3879 /* 3880 * See comment in cl_flk_set_nlm_status(). 3881 */ 3882 if (nlm_reg_status == NULL) { 3883 return; 3884 } 3885 3886 /* 3887 * protect NLM registry state with a mutex. 3888 */ 3889 ASSERT(nlmid <= nlm_status_size && nlmid >= 0); 3890 mutex_enter(&nlm_reg_lock); 3891 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN); 3892 mutex_exit(&nlm_reg_lock); 3893 } 3894 3895 /* 3896 * Return non-zero if the given I/O request conflicts with an active NBMAND 3897 * lock. 3898 * If svmand is non-zero, it means look at all active locks, not just NBMAND 3899 * locks. 
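 *
 * Illustrative use (hypothetical caller in a file system's read path; the
 * surrounding names are not from this file):
 *
 *	if (nbl_lock_conflict(vp, NBL_READ, uiop->uio_loffset,
 *	    uiop->uio_resid, svmand, ct))
 *		return (EACCES);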
3900 */ 3901 3902 int 3903 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset, 3904 ssize_t length, int svmand, caller_context_t *ct) 3905 { 3906 int conflict = 0; 3907 graph_t *gp; 3908 lock_descriptor_t *lock; 3909 pid_t pid; 3910 int sysid; 3911 3912 if (ct == NULL) { 3913 pid = curproc->p_pid; 3914 sysid = 0; 3915 } else { 3916 pid = ct->cc_pid; 3917 sysid = ct->cc_sysid; 3918 } 3919 3920 mutex_enter(&flock_lock); 3921 gp = lock_graph[HASH_INDEX(vp)]; 3922 mutex_exit(&flock_lock); 3923 if (gp == NULL) 3924 return (0); 3925 3926 mutex_enter(&gp->gp_mutex); 3927 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 3928 3929 for (; lock && lock->l_vnode == vp; lock = lock->l_next) { 3930 if ((svmand || (lock->l_state & NBMAND_LOCK)) && 3931 (lock->l_flock.l_sysid != sysid || 3932 lock->l_flock.l_pid != pid) && 3933 lock_blocks_io(op, offset, length, 3934 lock->l_type, lock->l_start, lock->l_end)) { 3935 conflict = 1; 3936 break; 3937 } 3938 } 3939 mutex_exit(&gp->gp_mutex); 3940 3941 return (conflict); 3942 } 3943 3944 /* 3945 * Return non-zero if the given I/O request conflicts with the given lock. 3946 */ 3947 3948 static int 3949 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length, 3950 int lock_type, u_offset_t lock_start, u_offset_t lock_end) 3951 { 3952 ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE); 3953 ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK); 3954 3955 if (op == NBL_READ && lock_type == F_RDLCK) 3956 return (0); 3957 3958 if (offset <= lock_start && lock_start < offset + length) 3959 return (1); 3960 if (lock_start <= offset && offset <= lock_end) 3961 return (1); 3962 3963 return (0); 3964 } 3965 3966 #ifdef DEBUG 3967 static void 3968 check_active_locks(graph_t *gp) 3969 { 3970 lock_descriptor_t *lock, *lock1; 3971 edge_t *ep; 3972 3973 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp); 3974 lock = lock->l_next) { 3975 ASSERT(IS_ACTIVE(lock)); 3976 ASSERT(NOT_BLOCKED(lock)); 3977 ASSERT(!IS_BARRIER(lock)); 3978 3979 ep = FIRST_IN(lock); 3980 3981 while (ep != HEAD(lock)) { 3982 ASSERT(IS_SLEEPING(ep->from_vertex)); 3983 ASSERT(!NOT_BLOCKED(ep->from_vertex)); 3984 ep = NEXT_IN(ep); 3985 } 3986 3987 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp); 3988 lock1 = lock1->l_next) { 3989 if (lock1->l_vnode == lock->l_vnode) { 3990 if (BLOCKS(lock1, lock)) { 3991 cmn_err(CE_PANIC, 3992 "active lock %p blocks %p", 3993 (void *)lock1, (void *)lock); 3994 } else if (BLOCKS(lock, lock1)) { 3995 cmn_err(CE_PANIC, 3996 "active lock %p blocks %p", 3997 (void *)lock, (void *)lock1); 3998 } 3999 } 4000 } 4001 } 4002 } 4003 4004 /* 4005 * Effect: This functions checks to see if the transition from 'old_state' to 4006 * 'new_state' is a valid one. It returns 0 if the transition is valid 4007 * and 1 if it is not. 
4008 * For a map of valid transitions, see sys/flock_impl.h 4009 */ 4010 static int 4011 check_lock_transition(int old_state, int new_state) 4012 { 4013 switch (old_state) { 4014 case FLK_INITIAL_STATE: 4015 if ((new_state == FLK_START_STATE) || 4016 (new_state == FLK_SLEEPING_STATE) || 4017 (new_state == FLK_ACTIVE_STATE) || 4018 (new_state == FLK_DEAD_STATE)) { 4019 return (0); 4020 } else { 4021 return (1); 4022 } 4023 case FLK_START_STATE: 4024 if ((new_state == FLK_ACTIVE_STATE) || 4025 (new_state == FLK_DEAD_STATE)) { 4026 return (0); 4027 } else { 4028 return (1); 4029 } 4030 case FLK_ACTIVE_STATE: 4031 if (new_state == FLK_DEAD_STATE) { 4032 return (0); 4033 } else { 4034 return (1); 4035 } 4036 case FLK_SLEEPING_STATE: 4037 if ((new_state == FLK_GRANTED_STATE) || 4038 (new_state == FLK_INTERRUPTED_STATE) || 4039 (new_state == FLK_CANCELLED_STATE)) { 4040 return (0); 4041 } else { 4042 return (1); 4043 } 4044 case FLK_GRANTED_STATE: 4045 if ((new_state == FLK_START_STATE) || 4046 (new_state == FLK_INTERRUPTED_STATE) || 4047 (new_state == FLK_CANCELLED_STATE)) { 4048 return (0); 4049 } else { 4050 return (1); 4051 } 4052 case FLK_CANCELLED_STATE: 4053 if ((new_state == FLK_INTERRUPTED_STATE) || 4054 (new_state == FLK_DEAD_STATE)) { 4055 return (0); 4056 } else { 4057 return (1); 4058 } 4059 case FLK_INTERRUPTED_STATE: 4060 if (new_state == FLK_DEAD_STATE) { 4061 return (0); 4062 } else { 4063 return (1); 4064 } 4065 case FLK_DEAD_STATE: 4066 /* May be set more than once */ 4067 if (new_state == FLK_DEAD_STATE) { 4068 return (0); 4069 } else { 4070 return (1); 4071 } 4072 default: 4073 return (1); 4074 } 4075 } 4076 4077 static void 4078 check_sleeping_locks(graph_t *gp) 4079 { 4080 lock_descriptor_t *lock1, *lock2; 4081 edge_t *ep; 4082 for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp); 4083 lock1 = lock1->l_next) { 4084 ASSERT(!IS_BARRIER(lock1)); 4085 for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp); 4086 lock2 = lock2->l_next) { 4087 if (lock1->l_vnode == lock2->l_vnode) { 4088 if (BLOCKS(lock2, lock1)) { 4089 ASSERT(!IS_GRANTED(lock1)); 4090 ASSERT(!NOT_BLOCKED(lock1)); 4091 path(lock1, lock2); 4092 } 4093 } 4094 } 4095 4096 for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp); 4097 lock2 = lock2->l_next) { 4098 ASSERT(!IS_BARRIER(lock1)); 4099 if (lock1->l_vnode == lock2->l_vnode) { 4100 if (BLOCKS(lock2, lock1)) { 4101 ASSERT(!IS_GRANTED(lock1)); 4102 ASSERT(!NOT_BLOCKED(lock1)); 4103 path(lock1, lock2); 4104 } 4105 } 4106 } 4107 ep = FIRST_ADJ(lock1); 4108 while (ep != HEAD(lock1)) { 4109 ASSERT(BLOCKS(ep->to_vertex, lock1)); 4110 ep = NEXT_ADJ(ep); 4111 } 4112 } 4113 } 4114 4115 static int 4116 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path) 4117 { 4118 edge_t *ep; 4119 lock_descriptor_t *vertex; 4120 lock_descriptor_t *vertex_stack; 4121 4122 STACK_INIT(vertex_stack); 4123 4124 flk_graph_uncolor(lock1->l_graph); 4125 ep = FIRST_ADJ(lock1); 4126 ASSERT(ep != HEAD(lock1)); 4127 while (ep != HEAD(lock1)) { 4128 if (no_path) 4129 ASSERT(ep->to_vertex != lock2); 4130 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack); 4131 COLOR(ep->to_vertex); 4132 ep = NEXT_ADJ(ep); 4133 } 4134 4135 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 4136 STACK_POP(vertex_stack, l_dstack); 4137 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex); 4138 ep = NEXT_ADJ(ep)) { 4139 if (COLORED(ep->to_vertex)) 4140 continue; 4141 COLOR(ep->to_vertex); 4142 if (ep->to_vertex == lock2) 4143 return (1); 4144 4145 STACK_PUSH(vertex_stack, 
ep->to_vertex, l_dstack); 4146 } 4147 } 4148 return (0); 4149 } 4150 4151 static void 4152 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp) 4153 { 4154 lock_descriptor_t *lock; 4155 4156 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 4157 4158 if (lock) { 4159 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) { 4160 if (lock->l_flock.l_pid == pid && 4161 lock->l_flock.l_sysid == sysid) 4162 cmn_err(CE_PANIC, 4163 "owner pid %d's lock %p in active queue", 4164 pid, (void *)lock); 4165 lock = lock->l_next; 4166 } 4167 } 4168 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 4169 4170 if (lock) { 4171 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) { 4172 if (lock->l_flock.l_pid == pid && 4173 lock->l_flock.l_sysid == sysid) 4174 cmn_err(CE_PANIC, 4175 "owner pid %d's lock %p in sleep queue", 4176 pid, (void *)lock); 4177 lock = lock->l_next; 4178 } 4179 } 4180 } 4181 4182 static int 4183 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2) 4184 { 4185 edge_t *ep = FIRST_ADJ(lock1); 4186 4187 while (ep != HEAD(lock1)) { 4188 if (ep->to_vertex == lock2) 4189 return (1); 4190 else 4191 ep = NEXT_ADJ(ep); 4192 } 4193 return (0); 4194 } 4195 4196 static int 4197 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2) 4198 { 4199 return (!level_two_path(lock1, lock2, 1)); 4200 } 4201 4202 static void 4203 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2) 4204 { 4205 if (level_one_path(lock1, lock2)) { 4206 if (level_two_path(lock1, lock2, 0) != 0) { 4207 cmn_err(CE_WARN, 4208 "one edge one path from lock1 %p lock2 %p", 4209 (void *)lock1, (void *)lock2); 4210 } 4211 } else if (no_path(lock1, lock2)) { 4212 cmn_err(CE_PANIC, 4213 "No path from lock1 %p to lock2 %p", 4214 (void *)lock1, (void *)lock2); 4215 } 4216 } 4217 #endif /* DEBUG */