/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/note.h>
#include <sys/t_lock.h>
#include <sys/cmn_err.h>
#include <sys/instance.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/ddi.h>
#include <sys/hwconf.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/ddi_impldefs.h>
#include <sys/ndi_impldefs.h>
#include <sys/modctl.h>
#include <sys/dacf.h>
#include <sys/promif.h>
#include <sys/cpuvar.h>
#include <sys/pathname.h>
#include <sys/kobj.h>
#include <sys/devcache.h>
#include <sys/devcache_impl.h>
#include <sys/sysmacros.h>
#include <sys/varargs.h>
#include <sys/callb.h>

/*
 * This facility provides interfaces for clients to register,
 * read and update cache data in persistent backing store files,
 * usually in /etc/devices.  The data persisted through this
 * mechanism should be stateless data, functioning in the sense
 * of a cache.  Writes are performed by a background daemon
 * thread, permitting a client to schedule an update without
 * blocking and then continue updating the data state in
 * parallel.  The data is only locked by the daemon thread
 * to pack the data in preparation for the write.
 *
 * Data persisted through this mechanism should be capable
 * of being regenerated through normal system operation.
 * For example, attaching all disk devices causes all their
 * devids to be registered.  By caching a devid-device tuple,
 * the system can operate in a more optimal way, directly
 * attaching the device mapped to a devid rather than
 * burdensomely driving attach of the entire device tree
 * to discover a single device.
 *
 * Note that a client should only need to include
 * <sys/devcache.h> for the supported interfaces.
 *
 * The data per client is entirely within the control of
 * the client.  When reading, data unpacked from the backing
 * store should be inserted in the list.  The pointer to
 * the list can be retrieved via nvf_list().  When writing,
 * the data on the list is to be packed and returned to the
 * nvpdaemon as an nvlist.
 *
 * Obvious restrictions are imposed by the limits of the
 * nvlist format.  The data cannot be read or written
 * piecemeal, and large amounts of data aren't recommended.
 * However, nvlists allow data to be named and typed, are
 * invariant with respect to integer sizes, and allow the
 * cached data to be versioned conveniently.
 *
 * Registration involves two steps.  First, a handle is
 * allocated by calling the registration function, which
 * sets up the data referenced by the handle and initializes
 * the lock.  Second, the client must initialize the data
 * list, providing the list element size and the offset of
 * the node link within it.  The format of the list element
 * is otherwise under the control of the client.
 *
 * Locking: the address of the data list r/w lock can be
 * retrieved with nvf_lock().  The lock must be held as
 * reader when traversing the list or checking state,
 * such as nvf_is_dirty().  The lock must be held as
 * writer when updating the list or marking it dirty.
 * The lock must not be held when waking the daemon.
 *
 * The data r/w lock is held as writer when the pack,
 * unpack and free list handlers are called.  The handlers
 * must not drop the lock, and it must still be held upon
 * return.  The client must likewise hold the lock as
 * writer when initiating a read via nvf_read_file().
 *
 * The asynchronous nature of updates allows for the
 * possibility that the data may continue to be updated
 * once the daemon has been notified that an update is
 * desired.  The data only needs to be locked against
 * updates when packing the data into the form to be
 * written.  When the write of the packed data has
 * completed, the daemon will automatically reschedule
 * an update if the data was marked dirty after the
 * point at which it was packed.  Before beginning an
 * update, the daemon attempts to lock the data as
 * writer; if the writer lock is already held, it
 * backs off and retries later.  The model gives
 * priority to the kernel processes generating the
 * data: since the data does not change often and can
 * be regenerated when needed, updates should be
 * infrequent and can be delayed until the data stops
 * changing.  The client may update the list or mark it
 * dirty at any time it can first acquire the lock as
 * writer.
 *
 * A failed write will be retried after some delay, in
 * the hope that the cause of the error is transient,
 * for example a filesystem with no space available.
 * An update on a read-only filesystem fails silently
 * and is not retried; this is the case when booted
 * off install media.
 *
 * There is no unregister mechanism as of yet; so far
 * it hasn't been needed.
 */
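
/*
 * Illustrative sketch only (not an existing client): a hypothetical
 * client registers a cache file, reads it at startup, and later
 * schedules an update.  The ops vector supplies the cache file path
 * and the unpack/pack/free callbacks declared in devcache_impl.h;
 * my_ops, my_ent_t, my_hdl and ep are example names, not part of
 * this interface.
 *
 *	typedef struct my_ent {
 *		list_node_t	me_link;
 *		...client data...
 *	} my_ent_t;
 *
 *	static nvf_ops_t	my_ops = { ...path and callbacks... };
 *	static nvf_handle_t	my_hdl;
 *
 *	my_hdl = nvf_register_file(&my_ops);
 *	list_create(nvf_list(my_hdl), sizeof (my_ent_t),
 *	    offsetof(my_ent_t, me_link));
 *
 *	rw_enter(nvf_lock(my_hdl), RW_WRITER);
 *	(void) nvf_read_file(my_hdl);
 *	rw_exit(nvf_lock(my_hdl));
 *
 *	...later, after allocating and filling in a new element ep...
 *	rw_enter(nvf_lock(my_hdl), RW_WRITER);
 *	list_insert_tail(nvf_list(my_hdl), ep);
 *	nvf_mark_dirty(my_hdl);
 *	rw_exit(nvf_lock(my_hdl));
 *	nvf_wake_daemon();
 */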

/*
 * Global list of files registered and updated by the nvpflush
 * daemon, protected by the nvf_cache_mutex.  While an
 * update is taking place, a file is temporarily moved to
 * the dirty list to avoid locking the primary list for
 * the duration of the update.
 */
list_t		nvf_cache_files;
list_t		nvf_dirty_files;
kmutex_t	nvf_cache_mutex;


/*
 * Allow some delay between an update of the data and the flush so
 * that multiple updates can be combined into a single write.
 * Changes in the data are expected to be bursty, e.g. on reconfig
 * or hot-plug of a new adapter.
 *
 * kfio_report_error (default 0)
 *	Set to 1 to enable some error messages related to low-level
 *	kernel file i/o operations.
 *
 * nvpflush_delay (default 10)
 *	The number of seconds after data is marked dirty before the
 *	flush daemon is triggered to flush the data.  A longer period
 *	permits more data updates per write.  Note that every update
 *	resets the timer, so no repository write will occur while
 *	data is being updated continuously.
 *
 * nvpdaemon_idle_time (default 60)
 *	The number of seconds the daemon will sleep idle before exiting.
 *
 */
#define	NVPFLUSH_DELAY		10
#define	NVPDAEMON_IDLE_TIME	60

#define	TICKS_PER_SECOND	drv_sectohz(1)

/*
 * Tunables
 */
int kfio_report_error = 0;		/* kernel file i/o operations */
int kfio_disable_read = 0;		/* disable all reads */
int kfio_disable_write = 0;		/* disable all writes */

int nvpflush_delay	= NVPFLUSH_DELAY;
int nvpdaemon_idle_time	= NVPDAEMON_IDLE_TIME;
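
/*
 * These are plain global variables; as an illustration only, a longer
 * flush delay could be set at boot time with an /etc/system entry such
 * as:
 *
 *	set nvpflush_delay = 30
 */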

static timeout_id_t	nvpflush_id = 0;
static int		nvpflush_timer_busy = 0;
static int		nvpflush_daemon_active = 0;
static kthread_t	*nvpflush_thr_id = 0;

static int		do_nvpflush = 0;
static int		nvpbusy = 0;
static kmutex_t		nvpflush_lock;
static kcondvar_t	nvpflush_cv;
static kthread_id_t	nvpflush_thread;
static clock_t		nvpticks;

static void nvpflush_daemon(void);

#ifdef	DEBUG
int nvpdaemon_debug = 0;
int kfio_debug = 0;
#endif	/* DEBUG */

extern int modrootloaded;
extern void mdi_read_devices_files(void);
extern void mdi_clean_vhcache(void);
extern int sys_shutdown;

/*
 * Initialize the overall cache file management
 */
void
i_ddi_devices_init(void)
{
	list_create(&nvf_cache_files, sizeof (nvfd_t),
	    offsetof(nvfd_t, nvf_link));
	list_create(&nvf_dirty_files, sizeof (nvfd_t),
	    offsetof(nvfd_t, nvf_link));
	mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
	retire_store_init();
	devid_cache_init();
}

/*
 * Read cache files
 * The files read here should be restricted to those
 * that may be required to mount root.
 */
void
i_ddi_read_devices_files(void)
{
	/*
	 * The retire store should be the first file read as it
	 * may need to offline devices. kfio_disable_read is not
	 * used for retire. For the rationale see the tunable
	 * ddi_retire_store_bypass and comments in:
	 *	uts/common/os/retire_store.c
	 */

	retire_store_read();

	if (!kfio_disable_read) {
		mdi_read_devices_files();
		devid_cache_read();
	}
}

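/*
 * Set up the flush daemon synchronization primitives and wake the
 * daemon if any registered file was already marked dirty before this
 * point.
 */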
void
i_ddi_start_flush_daemon(void)
{
	nvfd_t	*nvfdp;

	ASSERT(i_ddi_io_initialized());

	mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);

	mutex_enter(&nvf_cache_mutex);
	for (nvfdp = list_head(&nvf_cache_files); nvfdp;
	    nvfdp = list_next(&nvf_cache_files, nvfdp)) {
		if (NVF_IS_DIRTY(nvfdp)) {
			nvf_wake_daemon();
			break;
		}
	}
	mutex_exit(&nvf_cache_mutex);
}

void
i_ddi_clean_devices_files(void)
{
	devid_cache_cleanup();
	mdi_clean_vhcache();
}

/*
 * Register a cache file to be managed and updated by the nvpflush daemon.
 * All operations are performed through the returned handle.
 * There is no unregister mechanism for now.
 */
nvf_handle_t
nvf_register_file(nvf_ops_t *ops)
{
	nvfd_t *nvfdp;

	nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP);

	nvfdp->nvf_ops = ops;
	nvfdp->nvf_flags = 0;
	rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL);

	mutex_enter(&nvf_cache_mutex);
	list_insert_tail(&nvf_cache_files, nvfdp);
	mutex_exit(&nvf_cache_mutex);

	return ((nvf_handle_t)nvfdp);
}

/*PRINTFLIKE1*/
void
nvf_error(const char *fmt, ...)
{
	va_list ap;

	if (kfio_report_error) {
		va_start(ap, fmt);
		vcmn_err(CE_NOTE, fmt, ap);
		va_end(ap);
	}
}

/*
 * Some operations clients may use to manage the data
 * to be persisted in a cache file.
 */
char *
nvf_cache_name(nvf_handle_t handle)
{
	return (((nvfd_t *)handle)->nvf_cache_path);
}

krwlock_t *
nvf_lock(nvf_handle_t handle)
{
	return (&(((nvfd_t *)handle)->nvf_lock));
}

list_t *
nvf_list(nvf_handle_t handle)
{
	return (&(((nvfd_t *)handle)->nvf_data_list));
}

void
nvf_mark_dirty(nvf_handle_t handle)
{
	ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock)));
	NVF_MARK_DIRTY((nvfd_t *)handle);
}

int
nvf_is_dirty(nvf_handle_t handle)
{
	ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock)));
	return (NVF_IS_DIRTY((nvfd_t *)handle));
}

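/*
 * Simple 16-bit checksum: XOR the buffer as a sequence of 16-bit
 * words, folding in a trailing odd byte separately.  Used for both
 * the header checksum and the packed-nvlist checksum of the cache
 * files handled below.
 */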
static uint16_t
nvp_cksum(uchar_t *buf, int64_t buflen)
{
	uint16_t cksum = 0;
	uint16_t *p = (uint16_t *)buf;
	int64_t n;

	if ((buflen & 0x01) != 0) {
		buflen--;
		cksum = buf[buflen];
	}
	n = buflen / 2;
	while (n-- > 0)
		cksum ^= *p++;
	return (cksum);
}

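/*
 * On-disk layout of a cache file: an nvpf_hdr_t followed immediately
 * by the packed nvlist.  The header records a magic number, a
 * version, the size of the packed nvlist and two checksums:
 * nvpf_chksum over the packed data and nvpf_hdr_chksum over the
 * header itself, computed with the nvpf_hdr_chksum field zeroed.
 * fread_nvlist() verifies all of these before unpacking;
 * fwrite_nvlist() constructs them.
 */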
int
fread_nvlist(char *filename, nvlist_t **ret_nvlist)
{
	struct _buf	*file;
	nvpf_hdr_t	hdr;
	char		*buf;
	nvlist_t	*nvl;
	int		rval;
	uint_t		offset;
	int		n;
	char		c;
	uint16_t	cksum, hdrsum;

	*ret_nvlist = NULL;

	file = kobj_open_file(filename);
	if (file == (struct _buf *)-1) {
		KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
		return (ENOENT);
	}

	offset = 0;
	n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
	if (n != sizeof (hdr)) {
		kobj_close_file(file);
		if (n < 0) {
			nvf_error("error reading header: %s\n", filename);
			return (EIO);
		} else if (n == 0) {
			KFDEBUG((CE_CONT, "file empty: %s\n", filename));
		} else {
			nvf_error("header size incorrect: %s\n", filename);
		}
		return (EINVAL);
	}
	offset += n;

	KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
	KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
	KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
	    (longlong_t)hdr.nvpf_size));
	KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
	    hdr.nvpf_hdr_chksum));
	KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));

	cksum = hdr.nvpf_hdr_chksum;
	hdr.nvpf_hdr_chksum = 0;
	hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));

	if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
	    hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
		kobj_close_file(file);
		if (hdrsum != cksum) {
			nvf_error("%s: checksum error "
			    "(actual 0x%x, expected 0x%x)\n",
			    filename, hdrsum, cksum);
		}
		nvf_error("%s: header information incorrect", filename);
		return (EINVAL);
	}

	ASSERT(hdr.nvpf_size >= 0);

	buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
	n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
	if (n != hdr.nvpf_size) {
		kmem_free(buf, hdr.nvpf_size);
		kobj_close_file(file);
		if (n < 0) {
			nvf_error("%s: read error %d", filename, n);
		} else {
			nvf_error("%s: incomplete read %d/%lld",
			    filename, n, (longlong_t)hdr.nvpf_size);
		}
		return (EINVAL);
	}
	offset += n;

	rval = kobj_read_file(file, &c, 1, offset);
	kobj_close_file(file);
	if (rval > 0) {
		nvf_error("%s is larger than %lld\n",
		    filename, (longlong_t)hdr.nvpf_size);
		kmem_free(buf, hdr.nvpf_size);
		return (EINVAL);
	}

	cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
	if (hdr.nvpf_chksum != cksum) {
		nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
		    filename, hdr.nvpf_chksum, cksum);
		kmem_free(buf, hdr.nvpf_size);
		return (EINVAL);
	}

	nvl = NULL;
	rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
	if (rval != 0) {
		nvf_error("%s: error %d unpacking nvlist\n",
		    filename, rval);
		kmem_free(buf, hdr.nvpf_size);
		return (EINVAL);
	}

	kmem_free(buf, hdr.nvpf_size);
	*ret_nvlist = nvl;
	return (0);
}

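/*
 * The kf*() routines below are thin wrappers over the vnode
 * interfaces used to create, read, write, remove and rename cache
 * files.  kfcreate/kfread/kfwrite/kfclose operate on a kfile_t
 * handle; the first read or write error is latched in kf_state,
 * turning subsequent operations on that handle into no-ops so an
 * error is reported once.  kfremove() and kfrename() operate
 * directly on path names.
 */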
static int
kfcreate(char *filename, kfile_t **kfilep)
{
	kfile_t	*fp;
	int	rval;

	ASSERT(modrootloaded);

	fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP);

	fp->kf_vnflags = FCREAT | FWRITE | FTRUNC;
	fp->kf_fname = filename;
	fp->kf_fpos = 0;
	fp->kf_state = 0;

	KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
	    filename, fp->kf_vnflags));
	rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags,
	    0444, &fp->kf_vp, CRCREAT, 0);
	if (rval != 0) {
		kmem_free(fp, sizeof (kfile_t));
		KFDEBUG((CE_CONT, "%s: create error %d\n",
		    filename, rval));
		return (rval);
	}

	*kfilep = fp;
	return (0);
}

static int
kfremove(char *filename)
{
	int rval;

	KFDEBUG((CE_CONT, "remove: %s\n", filename));
	rval = vn_remove(filename, UIO_SYSSPACE, RMFILE);
	if (rval != 0) {
		KFDEBUG((CE_CONT, "%s: remove error %d\n",
		    filename, rval));
	}
	return (rval);
}

static int
kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
{
	ssize_t		resid;
	int		err;
	ssize_t		n;

	ASSERT(modrootloaded);

	if (fp->kf_state != 0)
		return (fp->kf_state);

	err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
	    UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
	if (err != 0) {
		KFDEBUG((CE_CONT, "%s: read error %d\n",
		    fp->kf_fname, err));
		fp->kf_state = err;
		return (err);
	}

	ASSERT(resid >= 0 && resid <= bufsiz);
	n = bufsiz - resid;

	KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
	    fp->kf_fname, n, bufsiz, resid));

	fp->kf_fpos += n;
	*ret_n = n;
	return (0);
}

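/*
 * Write a buffer, looping on partial writes: each pass advances the
 * buffer and file offset by the amount the filesystem accepted.  If
 * a pass accepts nothing at all, the filesystem is presumed full and
 * the write fails with ENOSPC.
 */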
static int
kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
{
	rlim64_t	rlimit;
	ssize_t		resid;
	int		err;
	ssize_t		len;
	ssize_t		n = 0;

	ASSERT(modrootloaded);

	if (fp->kf_state != 0)
		return (fp->kf_state);

	len = bufsiz;
	rlimit = bufsiz + 1;
	for (;;) {
		err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
		    UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
		if (err) {
			KFDEBUG((CE_CONT, "%s: write error %d\n",
			    fp->kf_fname, err));
			fp->kf_state = err;
			return (err);
		}

		KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
		    fp->kf_fname, len-resid, resid));

		ASSERT(resid >= 0 && resid <= len);

		n += (len - resid);
		if (resid == 0)
			break;

		if (resid == len) {
			KFDEBUG((CE_CONT, "%s: filesystem full?\n",
			    fp->kf_fname));
			fp->kf_state = ENOSPC;
			return (ENOSPC);
		}

		len -= resid;
		buf += len;
		fp->kf_fpos += len;
		len = resid;
	}

	ASSERT(n == bufsiz);
	KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));

	*ret_n = n;
	return (0);
}


static int
kfclose(kfile_t *fp)
{
	int		rval;

	KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));

	if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
		rval = VOP_FSYNC(fp->kf_vp, FSYNC, kcred, NULL);
		if (rval != 0) {
			nvf_error("%s: sync error %d\n",
			    fp->kf_fname, rval);
		}
		KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
	}

	rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1,
	    (offset_t)0, kcred, NULL);
	if (rval != 0) {
		if (fp->kf_state == 0) {
			nvf_error("%s: close error %d\n",
			    fp->kf_fname, rval);
		}
	} else {
		if (fp->kf_state == 0)
			KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
	}

	VN_RELE(fp->kf_vp);
	kmem_free(fp, sizeof (kfile_t));
	return (rval);
}

static int
kfrename(char *oldname, char *newname)
{
	int rval;

	ASSERT(modrootloaded);

	KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));

	if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) {
		KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
		    oldname, newname, rval));
	}

	return (rval);
}

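/*
 * Write an nvlist to a cache file: pack the nvlist, prepend an
 * nvpf_hdr_t describing and checksumming it, write the result to a
 * temporary file alongside the target, then rename the temporary
 * file over the previous contents.  Returns 0 or an errno.
 */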
int
fwrite_nvlist(char *filename, nvlist_t *nvl)
{
	char	*buf;
	char	*nvbuf;
	kfile_t	*fp;
	char	*newname;
	int	len, err, err1;
	size_t	buflen;
	ssize_t	n;

	ASSERT(modrootloaded);

	nvbuf = NULL;
	err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
	if (err != 0) {
		nvf_error("%s: error %d packing nvlist\n",
		    filename, err);
		return (err);
	}

	buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
	bzero(buf, sizeof (nvpf_hdr_t));

	((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
	((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
	((nvpf_hdr_t *)buf)->nvpf_size = buflen;
	((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
	((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
	    nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));

	bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
	kmem_free(nvbuf, buflen);
	buflen += sizeof (nvpf_hdr_t);

	len = strlen(filename) + MAX_SUFFIX_LEN + 2;
	newname = kmem_alloc(len, KM_SLEEP);


	(void) sprintf(newname, "%s.%s", filename, NEW_FILENAME_SUFFIX);

	/*
	 * To make it unlikely we suffer data loss, write
	 * data to a new temporary file.  Once successful,
	 * complete the transaction by renaming the new file
	 * to replace the previous one.
	 */

	if ((err = kfcreate(newname, &fp)) == 0) {
		err = kfwrite(fp, buf, buflen, &n);
		if (err) {
			nvf_error("%s: write error - %d\n",
			    newname, err);
		} else {
			if (n != buflen) {
				nvf_error(
				    "%s: partial write %ld of %ld bytes\n",
				    newname, n, buflen);
				nvf_error("%s: filesystem may be full?\n",
				    newname);
				err = EIO;
			}
		}
		if ((err1 = kfclose(fp)) != 0) {
			nvf_error("%s: close error\n", newname);
			if (err == 0)
				err = err1;
		}
		if (err != 0) {
			if (kfremove(newname) != 0) {
				nvf_error("%s: remove failed\n",
				    newname);
			}
		}
	} else {
		nvf_error("%s: create failed - %d\n", newname, err);
	}

	if (err == 0) {
		if ((err = kfrename(newname, filename)) != 0) {
			nvf_error("%s: rename to %s failed\n",
			    newname, filename);
		}
	}

	kmem_free(newname, len);
	kmem_free(buf, buflen);

	return (err);
}

static int
e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
{
	int err;

	if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0)
		return (DDI_SUCCESS);
	else {
		if (err == EROFS)
			NVF_MARK_READONLY(nvfd);
		return (DDI_FAILURE);
	}
}

static void
nvp_list_free(nvfd_t *nvf)
{
	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
	(nvf->nvf_list_free)((nvf_handle_t)nvf);
	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
}

/*
 * Read a file in the nvlist format
 *	EIO - i/o error during read
 *	ENOENT - file not found
 *	EINVAL - file contents corrupted
 */
static int
fread_nvp_list(nvfd_t *nvfd)
{
	nvlist_t	*nvl;
	nvpair_t	*nvp;
	char		*name;
	nvlist_t	*sublist;
	int		rval;
	int		rv;

	ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));

	rval = fread_nvlist(nvfd->nvf_cache_path, &nvl);
	if (rval != 0)
		return (rval);
	ASSERT(nvl != NULL);

	nvp = NULL;
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		name = nvpair_name(nvp);
		ASSERT(strlen(name) > 0);

		switch (nvpair_type(nvp)) {
		case DATA_TYPE_NVLIST:
			rval = nvpair_value_nvlist(nvp, &sublist);
			if (rval != 0) {
				nvf_error(
				    "nvpair_value_nvlist error %s %d\n",
				    name, rval);
				goto error;
			}

			/*
			 * unpack nvlist for this device and
			 * add elements to data list.
			 */
			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
			rv = (nvfd->nvf_unpack_nvlist)
			    ((nvf_handle_t)nvfd, sublist, name);
			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
			if (rv != 0) {
				nvf_error(
				    "%s: %s invalid list element\n",
				    nvfd->nvf_cache_path, name);
				rval = EINVAL;
				goto error;
			}
			break;

		default:
			nvf_error("%s: %s unsupported data type %d\n",
			    nvfd->nvf_cache_path, name, nvpair_type(nvp));
			rval = EINVAL;
			goto error;
		}
	}

	nvlist_free(nvl);

	return (0);

error:
	nvlist_free(nvl);
	nvp_list_free(nvfd);
	return (rval);
}


int
nvf_read_file(nvf_handle_t nvf_handle)
{
	nvfd_t *nvfd = (nvfd_t *)nvf_handle;
	int rval;

	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));

	if (kfio_disable_read)
		return (0);

	KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));

	rval = fread_nvp_list(nvfd);
	if (rval) {
		switch (rval) {
		case EIO:
			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
			cmn_err(CE_WARN, "%s: I/O error",
			    nvfd->nvf_cache_path);
			break;
		case ENOENT:
			nvfd->nvf_flags |= NVF_F_CREATE_MSG;
			nvf_error("%s: not found\n",
			    nvfd->nvf_cache_path);
			break;
		case EINVAL:
		default:
			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
			cmn_err(CE_WARN, "%s: data file corrupted",
			    nvfd->nvf_cache_path);
			break;
		}
	}
	return (rval);
}

static void
nvf_write_is_complete(nvfd_t *fd)
{
	if (fd->nvf_write_complete) {
		(fd->nvf_write_complete)((nvf_handle_t)fd);
	}
}

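/*
 * Timeout handler armed by nvf_wake_daemon().  Each wakeup pushes the
 * deadline (nvpticks) further out, so the handler simply re-arms
 * itself until the deadline is nearly reached, at which point it
 * signals the daemon to flush.  This is what prevents repeated writes
 * while a cache is being updated continuously.
 */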
/*ARGSUSED*/
static void
nvpflush_timeout(void *arg)
{
	clock_t nticks;

	mutex_enter(&nvpflush_lock);
	nticks = nvpticks - ddi_get_lbolt();
	if (nticks > 4) {
		nvpflush_timer_busy = 1;
		mutex_exit(&nvpflush_lock);
		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks);
	} else {
		do_nvpflush = 1;
		NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n"));
		cv_signal(&nvpflush_cv);
		nvpflush_id = 0;
		nvpflush_timer_busy = 0;
		mutex_exit(&nvpflush_lock);
	}
}

/*
 * After marking a list as dirty, wake the nvpflush daemon
 * to perform the update.
 */
void
nvf_wake_daemon(void)
{
	clock_t nticks;

	/*
	 * If the system isn't up yet or is shutting down,
	 * don't even think about starting a flush.
	 */
	if (!i_ddi_io_initialized() || sys_shutdown)
		return;

	mutex_enter(&nvpflush_lock);

	if (nvpflush_daemon_active == 0) {
		nvpflush_daemon_active = 1;
		mutex_exit(&nvpflush_lock);
		NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n"));
		nvpflush_thr_id = thread_create(NULL, 0,
		    (void (*)())nvpflush_daemon,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
		mutex_enter(&nvpflush_lock);
	}

	nticks = nvpflush_delay * TICKS_PER_SECOND;
	nvpticks = ddi_get_lbolt() + nticks;
	if (nvpflush_timer_busy == 0) {
		nvpflush_timer_busy = 1;
		mutex_exit(&nvpflush_lock);
		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4);
	} else
		mutex_exit(&nvpflush_lock);
}

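/*
 * Flush a single cache file.  The data lock is taken as reader and
 * upgraded to writer just long enough to pack the list into an
 * nvlist; the dirty flag is cleared and NVF_F_FLUSHING set before the
 * lock is dropped for the actual file write, so clients may continue
 * updating in parallel.  Returns DDI_FAILURE if the write failed or
 * the data was marked dirty again, in which case the caller
 * reschedules another flush.
 */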
static int
nvpflush_one(nvfd_t *nvfd)
{
	int rval = DDI_SUCCESS;
	nvlist_t *nvl;

	rw_enter(&nvfd->nvf_lock, RW_READER);

	ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0);

	if (!NVF_IS_DIRTY(nvfd) ||
	    NVF_IS_READONLY(nvfd) || kfio_disable_write || sys_shutdown) {
		NVF_CLEAR_DIRTY(nvfd);
		rw_exit(&nvfd->nvf_lock);
		return (DDI_SUCCESS);
	}

	if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
		nvf_error("nvpflush: "
		    "%s rw upgrade failed\n", nvfd->nvf_cache_path);
		rw_exit(&nvfd->nvf_lock);
		return (DDI_FAILURE);
	}
	if (((nvfd->nvf_pack_list)
	    ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) {
		nvf_error("nvpflush: "
		    "%s nvlist construction failed\n", nvfd->nvf_cache_path);
		ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
		rw_exit(&nvfd->nvf_lock);
		return (DDI_FAILURE);
	}
	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));

	NVF_CLEAR_DIRTY(nvfd);
	nvfd->nvf_flags |= NVF_F_FLUSHING;
	rw_exit(&nvfd->nvf_lock);

	rval = e_fwrite_nvlist(nvfd, nvl);
	nvlist_free(nvl);

	rw_enter(&nvfd->nvf_lock, RW_WRITER);
	nvfd->nvf_flags &= ~NVF_F_FLUSHING;
	if (rval == DDI_FAILURE) {
		if (NVF_IS_READONLY(nvfd)) {
			rval = DDI_SUCCESS;
			nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY);
		} else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) {
			cmn_err(CE_CONT,
			    "%s: update failed\n", nvfd->nvf_cache_path);
			nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY;
		}
	} else {
		if (nvfd->nvf_flags & NVF_F_CREATE_MSG) {
			cmn_err(CE_CONT,
			    "!Creating %s\n", nvfd->nvf_cache_path);
			nvfd->nvf_flags &= ~NVF_F_CREATE_MSG;
		}
		if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) {
			cmn_err(CE_CONT,
			    "!Rebuilding %s\n", nvfd->nvf_cache_path);
			nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG;
		}
		if (nvfd->nvf_flags & NVF_F_ERROR) {
			cmn_err(CE_CONT,
			    "%s: update now ok\n", nvfd->nvf_cache_path);
			nvfd->nvf_flags &= ~NVF_F_ERROR;
		}
		/*
		 * The file may need to be flushed again if the cached
		 * data was touched while writing the earlier contents.
		 */
		if (NVF_IS_DIRTY(nvfd))
			rval = DDI_FAILURE;
	}

	rw_exit(&nvfd->nvf_lock);
	return (rval);
}


static void
nvpflush_daemon(void)
{
	callb_cpr_t cprinfo;
	nvfd_t *nvfdp, *nextfdp;
	clock_t clk;
	int rval;
	int want_wakeup;
	int is_now_clean;

	ASSERT(modrootloaded);

	nvpflush_thread = curthread;
	NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n"));

	CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp");
	mutex_enter(&nvpflush_lock);
	for (;;) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (do_nvpflush == 0) {
			clk = cv_reltimedwait(&nvpflush_cv, &nvpflush_lock,
			    (nvpdaemon_idle_time * TICKS_PER_SECOND),
			    TR_CLOCK_TICK);
			if ((clk == -1 && do_nvpflush == 0 &&
			    nvpflush_timer_busy == 0) || sys_shutdown) {
				/*
				 * Note that CALLB_CPR_EXIT calls mutex_exit()
				 * on the lock passed in to CALLB_CPR_INIT,
				 * so the lock must be held when invoking it.
				 */
				CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
				NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n"));
				ASSERT(mutex_owned(&nvpflush_lock));
				nvpflush_thr_id = NULL;
				nvpflush_daemon_active = 0;
				CALLB_CPR_EXIT(&cprinfo);
				thread_exit();
			}
		}
		CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);

		nvpbusy = 1;
		want_wakeup = 0;
		do_nvpflush = 0;
		mutex_exit(&nvpflush_lock);

		/*
		 * Try flushing whatever is dirty; reschedule if a flush
		 * fails or data gets marked dirty again.  First move each
		 * file marked dirty to the dirty list, to avoid holding
		 * the primary list locked across the writes.
		 */
		mutex_enter(&nvf_cache_mutex);
		for (nvfdp = list_head(&nvf_cache_files);
		    nvfdp; nvfdp = nextfdp) {
			nextfdp = list_next(&nvf_cache_files, nvfdp);
			rw_enter(&nvfdp->nvf_lock, RW_READER);
			if (NVF_IS_DIRTY(nvfdp)) {
				list_remove(&nvf_cache_files, nvfdp);
				list_insert_tail(&nvf_dirty_files, nvfdp);
				rw_exit(&nvfdp->nvf_lock);
			} else {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: not dirty %s\n",
				    nvfdp->nvf_cache_path));
				rw_exit(&nvfdp->nvf_lock);
			}
		}
		mutex_exit(&nvf_cache_mutex);

		/*
		 * Now go through the dirty list
		 */
		for (nvfdp = list_head(&nvf_dirty_files);
		    nvfdp; nvfdp = nextfdp) {
			nextfdp = list_next(&nvf_dirty_files, nvfdp);

			is_now_clean = 0;
			rw_enter(&nvfdp->nvf_lock, RW_READER);
			if (NVF_IS_DIRTY(nvfdp)) {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: flush %s\n",
				    nvfdp->nvf_cache_path));
				rw_exit(&nvfdp->nvf_lock);
				rval = nvpflush_one(nvfdp);
				rw_enter(&nvfdp->nvf_lock, RW_READER);
				if (rval != DDI_SUCCESS ||
				    NVF_IS_DIRTY(nvfdp)) {
					rw_exit(&nvfdp->nvf_lock);
					NVPDAEMON_DEBUG((CE_CONT,
					    "nvpdaemon: %s dirty again\n",
					    nvfdp->nvf_cache_path));
					want_wakeup = 1;
				} else {
					rw_exit(&nvfdp->nvf_lock);
					nvf_write_is_complete(nvfdp);
					is_now_clean = 1;
				}
			} else {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: not dirty %s\n",
				    nvfdp->nvf_cache_path));
				rw_exit(&nvfdp->nvf_lock);
				is_now_clean = 1;
			}

			if (is_now_clean) {
				mutex_enter(&nvf_cache_mutex);
				list_remove(&nvf_dirty_files, nvfdp);
				list_insert_tail(&nvf_cache_files,
				    nvfdp);
				mutex_exit(&nvf_cache_mutex);
			}
		}

		if (want_wakeup)
			nvf_wake_daemon();

		mutex_enter(&nvpflush_lock);
		nvpbusy = 0;
	}
}