1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
  24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
  25  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/ksynch.h>
  30 #include <sys/kmem.h>
  31 #include <sys/file.h>
  32 #include <sys/errno.h>
  33 #include <sys/open.h>
  34 #include <sys/buf.h>
  35 #include <sys/uio.h>
  36 #include <sys/aio_req.h>
  37 #include <sys/cred.h>
  38 #include <sys/modctl.h>
  39 #include <sys/cmlb.h>
  40 #include <sys/conf.h>
  41 #include <sys/devops.h>
  42 #include <sys/list.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/dkio.h>
  45 #include <sys/vtoc.h>
  46 #include <sys/scsi/scsi.h>        /* for DTYPE_DIRECT */
  47 #include <sys/kstat.h>
  48 #include <sys/fs/dv_node.h>
  49 #include <sys/ddi.h>
  50 #include <sys/sunddi.h>
  51 #include <sys/note.h>
  52 #include <sys/blkdev.h>
  53 
/*
 * Minor number encoding: the low bits select one of BD_MAXPART
 * partitions, the remaining bits select the driver instance.
 */
#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)

typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
  60 
/*
 * Per-instance soft state for a blkdev child device.  Allocated in
 * bd_attach(), reached via the soft state table (index BDINST(dev))
 * or via ddi_get_driver_private().
 */
struct bd {
	void		*d_private;	/* parent driver's per-drive cookie */
	dev_info_t	*d_dip;		/* our devinfo node */
	kmutex_t	d_ocmutex;	/* protects open/close bookkeeping */
	kmutex_t	d_iomutex;	/* protects queues; also ks_lock */
	kmutex_t	d_statemutex;	/* protects d_state/d_statecv */
	kcondvar_t	d_statecv;	/* signalled on media state change */
	enum dkio_state	d_state;	/* DKIO_* media state */
	cmlb_handle_t	d_cmlbh;	/* common label (cmlb) handle */
	unsigned	d_open_lyr[BD_MAXPART]; /* open count */
	uint64_t	d_open_excl;	/* bit mask indexed by partition */
	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */

	uint32_t	d_qsize;	/* queue depth reported by parent */
	uint32_t	d_qactive;	/* in-flight transfer count */
	uint32_t	d_maxxfer;	/* maximum bytes per transfer */
	uint32_t	d_blkshift;	/* log2 of logical block size */
	uint32_t	d_pblkshift;	/* log2 of physical block size */
	uint64_t	d_numblks;	/* capacity in logical blocks */
	ddi_devid_t	d_devid;	/* registered devid, if any */

	kmem_cache_t	*d_cache;	/* bd_xfer_impl_t cache (w/ DMA hdl) */
	list_t		d_runq;		/* xfers handed to the parent */
	list_t		d_waitq;	/* xfers waiting for queue space */
	kstat_t		*d_ksp;		/* disk I/O kstat (may be NULL) */
	kstat_io_t	*d_kiop;	/* kstat data; scratch if no kstat */

	boolean_t	d_rdonly;	/* media write protected */
	boolean_t	d_ssd;		/* solid state device */
	boolean_t	d_removable;	/* removable media */
	boolean_t	d_hotpluggable;	/* hotpluggable device */
	boolean_t	d_use_dma;	/* parent supplied DMA attributes */

	ddi_dma_attr_t	d_dma;		/* copy of parent DMA attributes */
	bd_ops_t	d_ops;		/* copy of parent entry points */
	bd_handle_t	d_handle;	/* back pointer to parent's handle */
};
  98 
/*
 * Handle shared between a parent (HBA/nexus) driver and blkdev.  The
 * parent stores it as our parent data; bd_attach() reads the ops
 * vector, DMA attributes and private cookie from it, and records the
 * attached soft state in h_bd.
 */
struct bd_handle {
	bd_ops_t	h_ops;		/* parent driver entry points */
	ddi_dma_attr_t	*h_dma;		/* parent DMA attrs; NULL => PIO */
	dev_info_t	*h_parent;
	dev_info_t	*h_child;
	void		*h_private;	/* parent's per-drive cookie */
	bd_t		*h_bd;		/* soft state, set at attach */
	char		*h_name;
	char		h_addr[20];	/* enough for %X,%X */
};
 109 
/*
 * Internal transfer state wrapping the public bd_xfer_t handed to the
 * parent driver.  Allocated from the per-device kmem cache; the cache
 * constructor pre-allocates the DMA handle when DMA is in use.
 */
struct bd_xfer_impl {
	bd_xfer_t	i_public;	/* portion visible to the parent */
	list_node_t	i_linkage;	/* runq/waitq linkage */
	bd_t		*i_bd;		/* owning device */
	buf_t		*i_bp;		/* originating buf */
	uint_t		i_num_win;	/* number of transfer windows */
	uint_t		i_cur_win;	/* current window index */
	off_t		i_offset;	/* byte offset of current window */
	int		(*i_func)(void *, bd_xfer_t *);	/* o_read/o_write */
	uint32_t	i_blkshift;	/* block shift at allocation time */
	size_t		i_len;		/* bytes in the current window */
	size_t		i_resid;	/* bytes remaining overall */
};

/*
 * Shorthand for the public bd_xfer_t fields embedded in the impl.
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
 131 
 132 
/*
 * Private prototypes.
 */

/* DDI autoconfiguration entry points */
static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
static int bd_detach(dev_info_t *, ddi_detach_cmd_t);

/* cb_ops entry points */
static int bd_open(dev_t *, int, int, cred_t *);
static int bd_close(dev_t, int, int, cred_t *);
static int bd_strategy(struct buf *);
static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int bd_dump(dev_t, caddr_t, daddr_t, int);
static int bd_read(dev_t, struct uio *, cred_t *);
static int bd_write(dev_t, struct uio *, cred_t *);
static int bd_aread(dev_t, struct aio_req *, cred_t *);
static int bd_awrite(dev_t, struct aio_req *, cred_t *);
static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
    caddr_t, int *);

/* cmlb target ops and internal helpers */
static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
    void *);
static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
static int bd_xfer_ctor(void *, void *, int);
static void bd_xfer_dtor(void *, void *);
static void bd_sched(bd_t *);
static void bd_submit(bd_t *, bd_xfer_impl_t *);
static void bd_runq_exit(bd_xfer_impl_t *, int);
static void bd_update_state(bd_t *);
static int bd_check_state(bd_t *, enum dkio_state *);
static int bd_flush_write_cache(bd_t *, struct dk_callback *);
 164 
/*
 * Target ops vector handed to cmlb so it can read/write the disk label
 * and query geometry through this driver.
 */
struct cmlb_tg_ops bd_tg_ops = {
	TG_DK_OPS_VERSION_1,
	bd_tg_rdwr,
	bd_tg_getinfo,
};
 170 
/*
 * Character/block device entry points.  read/write/aread/awrite are
 * physio wrappers that funnel all I/O through bd_strategy.
 */
static struct cb_ops bd_cb_ops = {
	bd_open,		/* open */
	bd_close,		/* close */
	bd_strategy,		/* strategy */
	nodev,			/* print */
	bd_dump,		/* dump */
	bd_read,		/* read */
	bd_write,		/* write */
	bd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	bd_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_64BIT | D_MP,		/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	bd_aread,		/* async read */
	bd_awrite		/* async write */
};
 191 
/*
 * Autoconfiguration entry points.  No quiesce is needed: the parent
 * driver owns the hardware.
 */
struct dev_ops bd_dev_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	bd_getinfo,		/* getinfo */
	nulldev,		/* identify */
	nulldev,		/* probe */
	bd_attach,		/* attach */
	bd_detach,		/* detach */
	nodev,			/* reset */
	&bd_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};
 206 
static struct modldrv modldrv = {
	&mod_driverops,
	"Generic Block Device",
	&bd_dev_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, { &modldrv, NULL }
};

/* Soft state table of bd_t, indexed by instance number. */
static void *bd_state;
/*
 * Taken as reader by the open/close/dump paths to hold the set of
 * registered devices steady; presumably taken as writer by the DR
 * (attach/detach) code elsewhere in this file — not visible here.
 */
static krwlock_t bd_lock;
 219 
 220 int
 221 _init(void)
 222 {
 223         int     rv;
 224 
 225         rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
 226         if (rv != DDI_SUCCESS) {
 227                 return (rv);
 228         }
 229         rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
 230         rv = mod_install(&modlinkage);
 231         if (rv != DDI_SUCCESS) {
 232                 rw_destroy(&bd_lock);
 233                 ddi_soft_state_fini(&bd_state);
 234         }
 235         return (rv);
 236 }
 237 
 238 int
 239 _fini(void)
 240 {
 241         int     rv;
 242 
 243         rv = mod_remove(&modlinkage);
 244         if (rv == DDI_SUCCESS) {
 245                 rw_destroy(&bd_lock);
 246                 ddi_soft_state_fini(&bd_state);
 247         }
 248         return (rv);
 249 }
 250 
 251 int
 252 _info(struct modinfo *modinfop)
 253 {
 254         return (mod_info(&modlinkage, modinfop));
 255 }
 256 
 257 static int
 258 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
 259 {
 260         bd_t    *bd;
 261         minor_t inst;
 262 
 263         _NOTE(ARGUNUSED(dip));
 264 
 265         inst = BDINST((dev_t)arg);
 266 
 267         switch (cmd) {
 268         case DDI_INFO_DEVT2DEVINFO:
 269                 bd = ddi_get_soft_state(bd_state, inst);
 270                 if (bd == NULL) {
 271                         return (DDI_FAILURE);
 272                 }
 273                 *resultp = (void *)bd->d_dip;
 274                 break;
 275 
 276         case DDI_INFO_DEVT2INSTANCE:
 277                 *resultp = (void *)(intptr_t)inst;
 278                 break;
 279 
 280         default:
 281                 return (DDI_FAILURE);
 282         }
 283         return (DDI_SUCCESS);
 284 }
 285 
/*
 * Attach a blkdev child.  The parent (nexus) driver has previously
 * stashed a bd_handle_t in our parent-data slot; from it we inherit
 * the DMA attributes, ops vector and private cookie, then stand up
 * locks, queues, the xfer cache, kstats and cmlb label handling.
 */
static int
bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int		inst;
	bd_handle_t	hdl;
	bd_t		*bd;
	bd_drive_t	drive;
	int		rv;
	char		name[16];
	char		kcache[32];

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* We don't do anything native for suspend/resume */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	inst = ddi_get_instance(dip);
	hdl = ddi_get_parent_data(dip);

	(void) snprintf(name, sizeof (name), "%s%d",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);

	if (hdl == NULL) {
		cmn_err(CE_WARN, "%s: missing parent data!", name);
		return (DDI_FAILURE);
	}

	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
		return (DDI_FAILURE);
	}
	bd = ddi_get_soft_state(bd_state, inst);

	if (hdl->h_dma) {
		/*
		 * DMA-capable parent: copy its attributes, but never
		 * allow the granularity below the disk block size.
		 */
		bd->d_dma = *(hdl->h_dma);
		bd->d_dma.dma_attr_granular =
		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
		bd->d_use_dma = B_TRUE;

		/*
		 * NOTE(review): bd was just zalloc'd, so d_maxxfer is 0
		 * here and the warning branch can never fire; both arms
		 * end up adopting the DMA attribute's maxxfer.
		 */
		if (bd->d_maxxfer &&
		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
			cmn_err(CE_WARN,
			    "%s: inconsistent maximum transfer size!",
			    name);
			/* We force it */
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		} else {
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		}
	} else {
		/* PIO parent: default to a 1 MB maximum transfer. */
		bd->d_use_dma = B_FALSE;
		if (bd->d_maxxfer == 0) {
			bd->d_maxxfer = 1024 * 1024;
		}
	}
	bd->d_ops = hdl->h_ops;
	bd->d_private = hdl->h_private;
	bd->d_blkshift = 9;	/* 512 bytes, to start */

	if (bd->d_maxxfer % DEV_BSIZE) {
		/* Round down to a whole number of disk blocks. */
		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
	}
	if (bd->d_maxxfer < DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
		ddi_soft_state_free(bd_state, inst);
		return (DDI_FAILURE);
	}

	bd->d_dip = dip;
	bd->d_handle = hdl;
	hdl->h_bd = bd;
	ddi_set_driver_private(dip, bd);

	mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);

	list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
	    offsetof(struct bd_xfer_impl, i_linkage));
	list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
	    offsetof(struct bd_xfer_impl, i_linkage));

	/*
	 * Each cached xfer carries a pre-allocated DMA handle (see
	 * bd_xfer_ctor), keeping allocation in the I/O path cheap.
	 */
	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);

	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (bd->d_ksp != NULL) {
		bd->d_ksp->ks_lock = &bd->d_iomutex;
		kstat_install(bd->d_ksp);
		bd->d_kiop = bd->d_ksp->ks_data;
	} else {
		/*
		 * Even if we cannot create the kstat, we create a
		 * scratch kstat.  The reason for this is to ensure
		 * that we can update the kstat all of the time,
		 * without adding an extra branch instruction.
		 */
		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
	}

	cmlb_alloc_handle(&bd->d_cmlbh);

	bd->d_state = DKIO_NONE;

	/* Ask the parent driver about this drive. */
	bzero(&drive, sizeof (drive));
	bd->d_ops.o_drive_info(bd->d_private, &drive);
	bd->d_qsize = drive.d_qsize;
	bd->d_removable = drive.d_removable;
	bd->d_hotpluggable = drive.d_hotpluggable;

	/* The drive itself may impose a smaller transfer limit. */
	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
		bd->d_maxxfer = drive.d_maxxfer;


	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
	    bd->d_removable, bd->d_hotpluggable,
	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
	if (rv != 0) {
		/* Tear down everything built above, in reverse order. */
		cmlb_free_handle(&bd->d_cmlbh);
		kmem_cache_destroy(bd->d_cache);
		mutex_destroy(&bd->d_iomutex);
		mutex_destroy(&bd->d_ocmutex);
		mutex_destroy(&bd->d_statemutex);
		cv_destroy(&bd->d_statecv);
		list_destroy(&bd->d_waitq);
		list_destroy(&bd->d_runq);
		if (bd->d_ksp != NULL) {
			kstat_delete(bd->d_ksp);
			bd->d_ksp = NULL;
		} else {
			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
		}
		ddi_soft_state_free(bd_state, inst);
		return (DDI_FAILURE);
	}

	/* Register a devid if the parent knows how to make one. */
	if (bd->d_ops.o_devid_init != NULL) {
		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
		if (rv == DDI_SUCCESS) {
			if (ddi_devid_register(dip, bd->d_devid) !=
			    DDI_SUCCESS) {
				cmn_err(CE_WARN,
				    "%s: unable to register devid", name);
			}
		}
	}

	/*
	 * Add a zero-length attribute to tell the world we support
	 * kernel ioctls (for layered drivers).  Also set up properties
	 * used by HAL to identify removable media.
	 */
	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0);
	if (bd->d_removable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "removable-media", NULL, 0);
	}
	if (bd->d_hotpluggable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "hotpluggable", NULL, 0);
	}

	ddi_report_dev(dip);

	return (DDI_SUCCESS);
}
 463 
/*
 * Detach a blkdev child: remove kstats, cmlb state, the devid, the
 * xfer cache and the synchronization primitives, then free the soft
 * state.  Teardown mirrors the setup in bd_attach().
 */
static int
bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	bd_t	*bd;

	bd = ddi_get_driver_private(dip);

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
		/* We don't suspend, but our parent does */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
	if (bd->d_ksp != NULL) {
		kstat_delete(bd->d_ksp);
		bd->d_ksp = NULL;
	} else {
		/* No real kstat; free the scratch one from bd_attach(). */
		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
	}
	cmlb_detach(bd->d_cmlbh, 0);
	cmlb_free_handle(&bd->d_cmlbh);
	if (bd->d_devid)
		ddi_devid_free(bd->d_devid);
	kmem_cache_destroy(bd->d_cache);
	mutex_destroy(&bd->d_iomutex);
	mutex_destroy(&bd->d_ocmutex);
	mutex_destroy(&bd->d_statemutex);
	cv_destroy(&bd->d_statecv);
	list_destroy(&bd->d_waitq);
	list_destroy(&bd->d_runq);
	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
	return (DDI_SUCCESS);
}
 500 
 501 static int
 502 bd_xfer_ctor(void *buf, void *arg, int kmflag)
 503 {
 504         bd_xfer_impl_t  *xi;
 505         bd_t            *bd = arg;
 506         int             (*dcb)(caddr_t);
 507 
 508         if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
 509                 dcb = DDI_DMA_SLEEP;
 510         } else {
 511                 dcb = DDI_DMA_DONTWAIT;
 512         }
 513 
 514         xi = buf;
 515         bzero(xi, sizeof (*xi));
 516         xi->i_bd = bd;
 517 
 518         if (bd->d_use_dma) {
 519                 if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
 520                     &xi->i_dmah) != DDI_SUCCESS) {
 521                         return (-1);
 522                 }
 523         }
 524 
 525         return (0);
 526 }
 527 
 528 static void
 529 bd_xfer_dtor(void *buf, void *arg)
 530 {
 531         bd_xfer_impl_t  *xi = buf;
 532 
 533         _NOTE(ARGUNUSED(arg));
 534 
 535         if (xi->i_dmah)
 536                 ddi_dma_free_handle(&xi->i_dmah);
 537         xi->i_dmah = NULL;
 538 }
 539 
 540 static bd_xfer_impl_t *
 541 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
 542     int kmflag)
 543 {
 544         bd_xfer_impl_t          *xi;
 545         int                     rv = 0;
 546         int                     status;
 547         unsigned                dir;
 548         int                     (*cb)(caddr_t);
 549         size_t                  len;
 550         uint32_t                shift;
 551 
 552         if (kmflag == KM_SLEEP) {
 553                 cb = DDI_DMA_SLEEP;
 554         } else {
 555                 cb = DDI_DMA_DONTWAIT;
 556         }
 557 
 558         xi = kmem_cache_alloc(bd->d_cache, kmflag);
 559         if (xi == NULL) {
 560                 bioerror(bp, ENOMEM);
 561                 return (NULL);
 562         }
 563 
 564         ASSERT(bp);
 565 
 566         xi->i_bp = bp;
 567         xi->i_func = func;
 568         xi->i_blkno = bp->b_lblkno;
 569 
 570         if (bp->b_bcount == 0) {
 571                 xi->i_len = 0;
 572                 xi->i_nblks = 0;
 573                 xi->i_kaddr = NULL;
 574                 xi->i_resid = 0;
 575                 xi->i_num_win = 0;
 576                 goto done;
 577         }
 578 
 579         if (bp->b_flags & B_READ) {
 580                 dir = DDI_DMA_READ;
 581                 xi->i_func = bd->d_ops.o_read;
 582         } else {
 583                 dir = DDI_DMA_WRITE;
 584                 xi->i_func = bd->d_ops.o_write;
 585         }
 586 
 587         shift = bd->d_blkshift;
 588         xi->i_blkshift = shift;
 589 
 590         if (!bd->d_use_dma) {
 591                 bp_mapin(bp);
 592                 rv = 0;
 593                 xi->i_offset = 0;
 594                 xi->i_num_win =
 595                     (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
 596                 xi->i_cur_win = 0;
 597                 xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
 598                 xi->i_nblks = xi->i_len >> shift;
 599                 xi->i_kaddr = bp->b_un.b_addr;
 600                 xi->i_resid = bp->b_bcount;
 601         } else {
 602 
 603                 /*
 604                  * We have to use consistent DMA if the address is misaligned.
 605                  */
 606                 if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
 607                     ((uintptr_t)bp->b_un.b_addr & 0x7)) {
 608                         dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
 609                 } else {
 610                         dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
 611                 }
 612 
 613                 status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
 614                     NULL, &xi->i_dmac, &xi->i_ndmac);
 615                 switch (status) {
 616                 case DDI_DMA_MAPPED:
 617                         xi->i_num_win = 1;
 618                         xi->i_cur_win = 0;
 619                         xi->i_offset = 0;
 620                         xi->i_len = bp->b_bcount;
 621                         xi->i_nblks = xi->i_len >> shift;
 622                         xi->i_resid = bp->b_bcount;
 623                         rv = 0;
 624                         break;
 625                 case DDI_DMA_PARTIAL_MAP:
 626                         xi->i_cur_win = 0;
 627 
 628                         if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
 629                             DDI_SUCCESS) ||
 630                             (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
 631                             &len, &xi->i_dmac, &xi->i_ndmac) !=
 632                             DDI_SUCCESS) ||
 633                             (P2PHASE(len, shift) != 0)) {
 634                                 (void) ddi_dma_unbind_handle(xi->i_dmah);
 635                                 rv = EFAULT;
 636                                 goto done;
 637                         }
 638                         xi->i_len = len;
 639                         xi->i_nblks = xi->i_len >> shift;
 640                         xi->i_resid = bp->b_bcount;
 641                         rv = 0;
 642                         break;
 643                 case DDI_DMA_NORESOURCES:
 644                         rv = EAGAIN;
 645                         goto done;
 646                 case DDI_DMA_TOOBIG:
 647                         rv = EINVAL;
 648                         goto done;
 649                 case DDI_DMA_NOMAPPING:
 650                 case DDI_DMA_INUSE:
 651                 default:
 652                         rv = EFAULT;
 653                         goto done;
 654                 }
 655         }
 656 
 657 done:
 658         if (rv != 0) {
 659                 kmem_cache_free(bd->d_cache, xi);
 660                 bioerror(bp, rv);
 661                 return (NULL);
 662         }
 663 
 664         return (xi);
 665 }
 666 
 667 static void
 668 bd_xfer_free(bd_xfer_impl_t *xi)
 669 {
 670         if (xi->i_dmah) {
 671                 (void) ddi_dma_unbind_handle(xi->i_dmah);
 672         }
 673         kmem_cache_free(xi->i_bd->d_cache, xi);
 674 }
 675 
 676 static int
 677 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
 678 {
 679         dev_t           dev = *devp;
 680         bd_t            *bd;
 681         minor_t         part;
 682         minor_t         inst;
 683         uint64_t        mask;
 684         boolean_t       ndelay;
 685         int             rv;
 686         diskaddr_t      nblks;
 687         diskaddr_t      lba;
 688 
 689         _NOTE(ARGUNUSED(credp));
 690 
 691         part = BDPART(dev);
 692         inst = BDINST(dev);
 693 
 694         if (otyp >= OTYPCNT)
 695                 return (EINVAL);
 696 
 697         ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
 698 
 699         /*
 700          * Block any DR events from changing the set of registered
 701          * devices while we function.
 702          */
 703         rw_enter(&bd_lock, RW_READER);
 704         if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
 705                 rw_exit(&bd_lock);
 706                 return (ENXIO);
 707         }
 708 
 709         mutex_enter(&bd->d_ocmutex);
 710 
 711         ASSERT(part < 64);
 712         mask = (1U << part);
 713 
 714         bd_update_state(bd);
 715 
 716         if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
 717 
 718                 /* non-blocking opens are allowed to succeed */
 719                 if (!ndelay) {
 720                         rv = ENXIO;
 721                         goto done;
 722                 }
 723         } else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
 724             NULL, NULL, 0) == 0) {
 725 
 726                 /*
 727                  * We read the partinfo, verify valid ranges.  If the
 728                  * partition is invalid, and we aren't blocking or
 729                  * doing a raw access, then fail. (Non-blocking and
 730                  * raw accesses can still succeed to allow a disk with
 731                  * bad partition data to opened by format and fdisk.)
 732                  */
 733                 if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
 734                         rv = ENXIO;
 735                         goto done;
 736                 }
 737         } else if (!ndelay) {
 738                 /*
 739                  * cmlb_partinfo failed -- invalid partition or no
 740                  * disk label.
 741                  */
 742                 rv = ENXIO;
 743                 goto done;
 744         }
 745 
 746         if ((flag & FWRITE) && bd->d_rdonly) {
 747                 rv = EROFS;
 748                 goto done;
 749         }
 750 
 751         if ((bd->d_open_excl) & (mask)) {
 752                 rv = EBUSY;
 753                 goto done;
 754         }
 755         if (flag & FEXCL) {
 756                 if (bd->d_open_lyr[part]) {
 757                         rv = EBUSY;
 758                         goto done;
 759                 }
 760                 for (int i = 0; i < OTYP_LYR; i++) {
 761                         if (bd->d_open_reg[i] & mask) {
 762                                 rv = EBUSY;
 763                                 goto done;
 764                         }
 765                 }
 766         }
 767 
 768         if (otyp == OTYP_LYR) {
 769                 bd->d_open_lyr[part]++;
 770         } else {
 771                 bd->d_open_reg[otyp] |= mask;
 772         }
 773         if (flag & FEXCL) {
 774                 bd->d_open_excl |= mask;
 775         }
 776 
 777         rv = 0;
 778 done:
 779         mutex_exit(&bd->d_ocmutex);
 780         rw_exit(&bd_lock);
 781 
 782         return (rv);
 783 }
 784 
 785 static int
 786 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
 787 {
 788         bd_t            *bd;
 789         minor_t         inst;
 790         minor_t         part;
 791         uint64_t        mask;
 792         boolean_t       last = B_TRUE;
 793 
 794         _NOTE(ARGUNUSED(flag));
 795         _NOTE(ARGUNUSED(credp));
 796 
 797         part = BDPART(dev);
 798         inst = BDINST(dev);
 799 
 800         ASSERT(part < 64);
 801         mask = (1U << part);
 802 
 803         rw_enter(&bd_lock, RW_READER);
 804 
 805         if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
 806                 rw_exit(&bd_lock);
 807                 return (ENXIO);
 808         }
 809 
 810         mutex_enter(&bd->d_ocmutex);
 811         if (bd->d_open_excl & mask) {
 812                 bd->d_open_excl &= ~mask;
 813         }
 814         if (otyp == OTYP_LYR) {
 815                 bd->d_open_lyr[part]--;
 816         } else {
 817                 bd->d_open_reg[otyp] &= ~mask;
 818         }
 819         for (int i = 0; i < 64; i++) {
 820                 if (bd->d_open_lyr[part]) {
 821                         last = B_FALSE;
 822                 }
 823         }
 824         for (int i = 0; last && (i < OTYP_LYR); i++) {
 825                 if (bd->d_open_reg[i]) {
 826                         last = B_FALSE;
 827                 }
 828         }
 829         mutex_exit(&bd->d_ocmutex);
 830 
 831         if (last) {
 832                 cmlb_invalidate(bd->d_cmlbh, 0);
 833         }
 834         rw_exit(&bd_lock);
 835 
 836         return (0);
 837 }
 838 
/*
 * dump(9E): write crash dump data to the given partition using polled
 * (BD_XFER_POLL) I/O, since normal interrupt-driven completion cannot
 * be relied on at dump time.
 */
static int
bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
{
	minor_t		inst;
	minor_t		part;
	diskaddr_t	pstart;
	diskaddr_t	psize;
	bd_t		*bd;
	bd_xfer_impl_t	*xi;
	buf_t		*bp;
	int		rv;

	rw_enter(&bd_lock, RW_READER);

	part = BDPART(dev);
	inst = BDINST(dev);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}
	/*
	 * do cmlb, but do it synchronously unless we already have the
	 * partition (which we probably should.)
	 */
	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
	    (void *)1)) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}

	/* The dump must fit inside the partition. */
	if ((blkno + nblk) > psize) {
		rw_exit(&bd_lock);
		return (EINVAL);
	}
	bp = getrbuf(KM_NOSLEEP);
	if (bp == NULL) {
		rw_exit(&bd_lock);
		return (ENOMEM);
	}

	/*
	 * NOTE(review): nblk is an int, so "nblk << d_blkshift" is a
	 * 32-bit shift; presumably dump sizes stay well under 2 GB —
	 * confirm against dump subsystem limits.
	 */
	bp->b_bcount = nblk << bd->d_blkshift;
	bp->b_resid = bp->b_bcount;
	bp->b_lblkno = blkno;
	bp->b_un.b_addr = caddr;

	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
	if (xi == NULL) {
		rw_exit(&bd_lock);
		freerbuf(bp);
		return (ENOMEM);
	}
	/* Bias the block number by the partition start; poll completion. */
	xi->i_blkno = blkno + pstart;
	xi->i_flags = BD_XFER_POLL;
	bd_submit(bd, xi);
	rw_exit(&bd_lock);

	/*
	 * Generally, we should have run this entirely synchronously
	 * at this point and the biowait call should be a no-op.  If
	 * it didn't happen this way, it's a bug in the underlying
	 * driver not honoring BD_XFER_POLL.
	 */
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);
	return (rv);
}
 907 
 908 void
 909 bd_minphys(struct buf *bp)
 910 {
 911         minor_t inst;
 912         bd_t    *bd;
 913         inst = BDINST(bp->b_edev);
 914 
 915         bd = ddi_get_soft_state(bd_state, inst);
 916 
 917         /*
 918          * In a non-debug kernel, bd_strategy will catch !bd as
 919          * well, and will fail nicely.
 920          */
 921         ASSERT(bd);
 922 
 923         if (bp->b_bcount > bd->d_maxxfer)
 924                 bp->b_bcount = bd->d_maxxfer;
 925 }
 926 
/*
 * Character device read(9e): hand the request to physio(), which
 * breaks it into bd_minphys-sized chunks and drives bd_strategy.
 */
static int
bd_read(dev_t dev, struct uio *uio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
}
 933 
/*
 * Character device write(9e): counterpart of bd_read, using B_WRITE.
 */
static int
bd_write(dev_t dev, struct uio *uio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
}
 940 
/*
 * Asynchronous read entry point; aphysio() with no cancel support.
 */
static int
bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
}
 947 
/*
 * Asynchronous write entry point; aphysio() with no cancel support.
 */
static int
bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
}
 954 
/*
 * Strategy(9e) entry point.  Validates the request against the
 * partition map, truncates requests that extend past the end of the
 * partition (recording the shortfall in b_resid), and queues the
 * transfer.  All failures are reported through bioerror()/biodone();
 * the function itself always returns 0 per strategy(9e) convention.
 */
static int
bd_strategy(struct buf *bp)
{
	minor_t		inst;
	minor_t		part;
	bd_t		*bd;
	diskaddr_t	p_lba;
	diskaddr_t	p_nblks;
	diskaddr_t	b_nblks;
	bd_xfer_impl_t	*xi;
	uint32_t	shift;
	int		(*func)(void *, bd_xfer_t *);

	part = BDPART(bp->b_edev);
	inst = BDINST(bp->b_edev);

	ASSERT(bp);

	bp->b_resid = bp->b_bcount;

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	/* Look up the partition's size and starting LBA. */
	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
	    NULL, NULL, 0)) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	shift = bd->d_blkshift;

	/*
	 * The byte count must be a whole number of blocks and the
	 * starting block must lie within (or at the end of) the
	 * partition.
	 */
	if ((P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
	    (bp->b_lblkno > p_nblks)) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}
	b_nblks = bp->b_bcount >> shift;
	/* Zero-length I/O (or I/O at exactly end-of-partition) is a no-op. */
	if ((bp->b_lblkno == p_nblks) || (bp->b_bcount == 0)) {
		biodone(bp);
		return (0);
	}

	/* Truncate transfers extending beyond the partition. */
	if ((b_nblks + bp->b_lblkno) > p_nblks) {
		bp->b_resid = ((bp->b_lblkno + b_nblks - p_nblks) << shift);
		bp->b_bcount -= bp->b_resid;
	} else {
		bp->b_resid = 0;
	}
	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;

	/*
	 * Try a non-sleeping allocation first; fall back to KM_PUSHPAGE
	 * so that writes needed to relieve memory pressure can proceed.
	 */
	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
	if (xi == NULL) {
		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
	}
	if (xi == NULL) {
		/* bd_request_alloc will have done bioerror */
		biodone(bp);
		return (0);
	}
	/* Convert the partition-relative block to an absolute LBA. */
	xi->i_blkno = bp->b_lblkno + p_lba;

	bd_submit(bd, xi);

	return (0);
}
1025 
/*
 * Ioctl(9e) entry point.  Partition/label ioctls are delegated to
 * cmlb first; anything cmlb does not recognize (ENOTTY) is handled
 * here.  Unrecognized commands return ENOTTY.
 */
static int
bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
{
	minor_t		inst;
	uint16_t	part;
	bd_t		*bd;
	void		*ptr = (void *)arg;
	int		rv;

	part = BDPART(dev);
	inst = BDINST(dev);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		return (ENXIO);
	}

	/* Give cmlb first crack at label/partition related ioctls. */
	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
	if (rv != ENOTTY)
		return (rv);

	if (rvalp != NULL) {
		/* the return value of the ioctl is 0 by default */
		*rvalp = 0;
	}

	switch (cmd) {
	case DKIOCGMEDIAINFO: {
		struct dk_minfo minfo;

		/* make sure our state information is current */
		bd_update_state(bd);
		bzero(&minfo, sizeof (minfo));
		minfo.dki_media_type = DK_FIXED_DISK;
		minfo.dki_lbsize = (1U << bd->d_blkshift);
		minfo.dki_capacity = bd->d_numblks;
		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCGMEDIAINFOEXT: {
		struct dk_minfo_ext miext;

		/* make sure our state information is current */
		bd_update_state(bd);
		bzero(&miext, sizeof (miext));
		miext.dki_media_type = DK_FIXED_DISK;
		miext.dki_lbsize = (1U << bd->d_blkshift);
		/* Extended info additionally reports the physical block size. */
		miext.dki_pbsize = (1U << bd->d_pblkshift);
		miext.dki_capacity = bd->d_numblks;
		if (ddi_copyout(&miext, ptr, sizeof (miext), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCINFO: {
		/* Controller/drive identification for utilities like format. */
		struct dk_cinfo cinfo;
		bzero(&cinfo, sizeof (cinfo));
		cinfo.dki_ctype = DKC_BLKDEV;
		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
		    "%s", ddi_driver_name(bd->d_dip));
		cinfo.dki_unit = inst;
		cinfo.dki_flags = DKI_FMTVOL;
		cinfo.dki_partition = part;
		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
		cinfo.dki_addr = 0;
		cinfo.dki_slave = 0;
		cinfo.dki_space = 0;
		cinfo.dki_prio = 0;
		cinfo.dki_vec = 0;
		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCREMOVABLE: {
		int i;
		i = bd->d_removable ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCHOTPLUGGABLE: {
		int i;
		i = bd->d_hotpluggable ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCREADONLY: {
		int i;
		i = bd->d_rdonly ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCSOLIDSTATE: {
		int i;
		i = bd->d_ssd ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCSTATE: {
		/*
		 * Block until the media state differs from the state the
		 * caller passed in, then return the new state.
		 */
		enum dkio_state state;
		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
			return (EFAULT);
		}
		if ((rv = bd_check_state(bd, &state)) != 0) {
			return (rv);
		}
		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCFLUSHWRITECACHE: {
		struct dk_callback *dkc = NULL;

		/* A callback pointer can only be trusted from kernel callers. */
		if (flag & FKIOCTL)
			dkc = (void *)arg;

		rv = bd_flush_write_cache(bd, dkc);
		return (rv);
	}

	default:
		break;

	}
	return (ENOTTY);
}
1165 
1166 static int
1167 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1168     char *name, caddr_t valuep, int *lengthp)
1169 {
1170         bd_t    *bd;
1171 
1172         bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1173         if (bd == NULL)
1174                 return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1175                     name, valuep, lengthp));
1176 
1177         return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1178             valuep, lengthp, BDPART(dev), 0));
1179 }
1180 
1181 
/*
 * cmlb target-geometry read/write callback: transfer "length" bytes
 * at absolute block "start".  A non-NULL tg_cookie indicates polled
 * mode (e.g. during dump), in which case allocations must not sleep
 * and the request is submitted with BD_XFER_POLL.
 *
 * NOTE(review): bd is dereferenced without a NULL check; this assumes
 * cmlb only invokes the callback while the instance is attached --
 * confirm against the cmlb registration lifecycle.
 */
static int
bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
    size_t length, void *tg_cookie)
{
	bd_t		*bd;
	buf_t		*bp;
	bd_xfer_impl_t	*xi;
	int		rv;
	int		(*func)(void *, bd_xfer_t *);
	int		kmflag;

	/*
	 * If we are running in polled mode (such as during dump(9e)
	 * execution), then we cannot sleep for kernel allocations.
	 */
	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;

	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));

	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
		/* We can only transfer whole blocks at a time! */
		return (EINVAL);
	}

	if ((bp = getrbuf(kmflag)) == NULL) {
		return (ENOMEM);
	}

	/* Select the transfer direction and the matching driver entry. */
	switch (cmd) {
	case TG_READ:
		bp->b_flags = B_READ;
		func = bd->d_ops.o_read;
		break;
	case TG_WRITE:
		bp->b_flags = B_WRITE;
		func = bd->d_ops.o_write;
		break;
	default:
		freerbuf(bp);
		return (EINVAL);
	}

	bp->b_un.b_addr = bufaddr;
	bp->b_bcount = length;
	xi = bd_xfer_alloc(bd, bp, func, kmflag);
	if (xi == NULL) {
		/* bd_xfer_alloc records the failure on the buf. */
		rv = geterror(bp);
		freerbuf(bp);
		return (rv);
	}
	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
	xi->i_blkno = start;
	bd_submit(bd, xi);
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);

	return (rv);
}
1241 
1242 static int
1243 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1244 {
1245         bd_t            *bd;
1246 
1247         _NOTE(ARGUNUSED(tg_cookie));
1248         bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1249 
1250         switch (cmd) {
1251         case TG_GETPHYGEOM:
1252         case TG_GETVIRTGEOM:
1253                 /*
1254                  * We don't have any "geometry" as such, let cmlb
1255                  * fabricate something.
1256                  */
1257                 return (ENOTTY);
1258 
1259         case TG_GETCAPACITY:
1260                 bd_update_state(bd);
1261                 *(diskaddr_t *)arg = bd->d_numblks;
1262                 return (0);
1263 
1264         case TG_GETBLOCKSIZE:
1265                 *(uint32_t *)arg = (1U << bd->d_blkshift);
1266                 return (0);
1267 
1268         case TG_GETATTR:
1269                 /*
1270                  * It turns out that cmlb really doesn't do much for
1271                  * non-writable media, but lets make the information
1272                  * available for it in case it does more in the
1273                  * future.  (The value is currently used for
1274                  * triggering special behavior for CD-ROMs.)
1275                  */
1276                 bd_update_state(bd);
1277                 ((tg_attribute_t *)arg)->media_is_writable =
1278                     bd->d_rdonly ? B_FALSE : B_TRUE;
1279                 ((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1280                 return (0);
1281 
1282         default:
1283                 return (EINVAL);
1284         }
1285 }
1286 
1287 
/*
 * Move jobs from the wait queue to the run queue and hand them to the
 * underlying driver, up to the driver's queue depth (d_qsize).  The
 * I/O mutex is dropped around the driver call, so completion may
 * re-enter this framework synchronously.
 */
static void
bd_sched(bd_t *bd)
{
	bd_xfer_impl_t	*xi;
	struct buf	*bp;
	int		rv;

	mutex_enter(&bd->d_iomutex);

	while ((bd->d_qactive < bd->d_qsize) &&
	    ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
		bd->d_qactive++;
		/* Keep the kstat queue accounting in step with the lists. */
		kstat_waitq_to_runq(bd->d_kiop);
		list_insert_tail(&bd->d_runq, xi);

		/*
		 * Submit the job to the driver.  We drop the I/O mutex
		 * so that we can deal with the case where the driver
		 * completion routine calls back into us synchronously.
		 */

		mutex_exit(&bd->d_iomutex);

		rv = xi->i_func(bd->d_private, &xi->i_public);
		if (rv != 0) {
			/* Immediate failure: fail the buf and undo queuing. */
			bp = xi->i_bp;
			bioerror(bp, rv);
			biodone(bp);

			mutex_enter(&bd->d_iomutex);
			bd->d_qactive--;
			kstat_runq_exit(bd->d_kiop);
			list_remove(&bd->d_runq, xi);
			bd_xfer_free(xi);
		} else {
			mutex_enter(&bd->d_iomutex);
		}
	}

	mutex_exit(&bd->d_iomutex);
}
1329 
/*
 * Enqueue a transfer on the wait queue (with matching kstat
 * accounting) and kick the scheduler to dispatch it if the driver
 * queue has room.
 */
static void
bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
{
	mutex_enter(&bd->d_iomutex);
	list_insert_tail(&bd->d_waitq, xi);
	kstat_waitq_enter(bd->d_kiop);
	mutex_exit(&bd->d_iomutex);

	bd_sched(bd);
}
1340 
/*
 * Remove a completed transfer from the run queue, update the I/O
 * kstats (byte counts only on success), and reschedule so a waiting
 * job can take the freed queue slot.
 */
static void
bd_runq_exit(bd_xfer_impl_t *xi, int err)
{
	bd_t	*bd = xi->i_bd;
	buf_t	*bp = xi->i_bp;

	mutex_enter(&bd->d_iomutex);
	bd->d_qactive--;
	kstat_runq_exit(bd->d_kiop);
	list_remove(&bd->d_runq, xi);
	mutex_exit(&bd->d_iomutex);

	if (err == 0) {
		/* Count only the bytes actually transferred. */
		if (bp->b_flags & B_READ) {
			bd->d_kiop->reads++;
			bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
		} else {
			bd->d_kiop->writes++;
			bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
		}
	}
	bd_sched(bd);
}
1364 
/*
 * Query the underlying driver for current media information and
 * refresh our cached state (block shift, capacity, read-only/SSD
 * flags).  On any media-state or geometry change, wake DKIOCSTATE
 * waiters and (re)validate or invalidate the cmlb label outside the
 * state mutex.
 */
static void
bd_update_state(bd_t *bd)
{
	enum	dkio_state	state = DKIO_INSERTED;
	boolean_t		docmlb = B_FALSE;
	bd_media_t		media;

	bzero(&media, sizeof (media));

	mutex_enter(&bd->d_statemutex);
	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
		/* Driver could not describe the media: treat as ejected. */
		bd->d_numblks = 0;
		state = DKIO_EJECTED;
		goto done;
	}

	/*
	 * Sanity-check the logical block size: at least 512 bytes, a
	 * power of two, and one that evenly divides the max transfer.
	 */
	if ((media.m_blksize < 512) ||
	    (!ISP2(media.m_blksize)) ||
	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
		cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)",
		    ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip),
		    media.m_blksize);
		/*
		 * We can't use the media, treat it as not present.
		 */
		state = DKIO_EJECTED;
		bd->d_numblks = 0;
		goto done;
	}

	if (((1U << bd->d_blkshift) != media.m_blksize) ||
	    (bd->d_numblks != media.m_nblks)) {
		/* Device size changed */
		docmlb = B_TRUE;
	}

	/* ddi_ffs(x) - 1 is log2(x) for the power-of-two block size. */
	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
	bd->d_pblkshift = bd->d_blkshift;
	bd->d_numblks = media.m_nblks;
	bd->d_rdonly = media.m_readonly;
	bd->d_ssd = media.m_solidstate;

	/*
	 * Only use the supplied physical block size if it is non-zero,
	 * greater or equal to the block size, and a power of 2. Ignore it
	 * if not, it's just informational and we can still use the media.
	 */
	if ((media.m_pblksize != 0) &&
	    (media.m_pblksize >= media.m_blksize) &&
	    (ISP2(media.m_pblksize)))
		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;

done:
	if (state != bd->d_state) {
		/* Media state changed: wake DKIOCSTATE waiters. */
		bd->d_state = state;
		cv_broadcast(&bd->d_statecv);
		docmlb = B_TRUE;
	}
	mutex_exit(&bd->d_statemutex);

	/* cmlb calls can block; do them after dropping the state mutex. */
	if (docmlb) {
		if (state == DKIO_INSERTED) {
			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
		} else {
			cmlb_invalidate(bd->d_cmlbh, 0);
		}
	}
}
1433 
/*
 * Implementation of DKIOCSTATE: block until the media state differs
 * from *state, then return the new state through *state.  Polls the
 * driver once a second in case a state change occurs without a
 * cv_broadcast.  Returns EINTR if the wait is interrupted by a
 * signal.
 */
static int
bd_check_state(bd_t *bd, enum dkio_state *state)
{
	clock_t		when;

	for (;;) {

		/* Refresh cached state before comparing. */
		bd_update_state(bd);

		mutex_enter(&bd->d_statemutex);

		if (bd->d_state != *state) {
			*state = bd->d_state;
			mutex_exit(&bd->d_statemutex);
			break;
		}

		/* Wait up to one second, waking early on broadcast or signal. */
		when = drv_usectohz(1000000);
		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
		    when, TR_CLOCK_TICK) == 0) {
			/* Zero return indicates a signal interrupted the wait. */
			mutex_exit(&bd->d_statemutex);
			return (EINTR);
		}

		mutex_exit(&bd->d_statemutex);
	}

	return (0);
}
1463 
/*
 * b_iodone handler for asynchronous cache flushes: invoke the
 * caller's dk_callback with the flush result, then free the private
 * callback copy and the buf allocated by bd_flush_write_cache().
 */
static int
bd_flush_write_cache_done(struct buf *bp)
{
	struct dk_callback *dc = (void *)bp->b_private;

	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
	kmem_free(dc, sizeof (*dc));
	freerbuf(bp);
	return (0);
}
1474 
/*
 * Issue a write-cache flush via the driver's o_sync_cache entry.
 * If the caller supplied a dk_callback, the flush is asynchronous and
 * bd_flush_write_cache_done() owns freeing the buf; otherwise the
 * flush is performed synchronously and the result returned here.
 * Returns ENOTSUP when the driver has no sync-cache support.
 */
static int
bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
{
	buf_t			*bp;
	struct dk_callback	*dc;
	bd_xfer_impl_t		*xi;
	int			rv;

	if (bd->d_ops.o_sync_cache == NULL) {
		return (ENOTSUP);
	}
	/* A flush moves no data; use a zero-length buf. */
	bp = getrbuf(KM_SLEEP);
	bp->b_resid = 0;
	bp->b_bcount = 0;

	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
	if (xi == NULL) {
		rv = geterror(bp);
		freerbuf(bp);
		return (rv);
	}

	/* Make an asynchronous flush, but only if there is a callback */
	if (dkc != NULL && dkc->dkc_callback != NULL) {
		/* Make a private copy of the callback structure */
		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
		*dc = *dkc;
		bp->b_private = dc;
		bp->b_iodone = bd_flush_write_cache_done;

		/* Ownership of bp/dc passes to the iodone handler. */
		bd_submit(bd, xi);
		return (0);
	}

	/* In case there is no callback, perform a synchronous flush */
	bd_submit(bd, xi);
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);

	return (rv);
}
1517 
1518 /*
1519  * Nexus support.
1520  */
1521 int
1522 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1523     void *arg, void *result)
1524 {
1525         bd_handle_t     hdl;
1526 
1527         switch (ctlop) {
1528         case DDI_CTLOPS_REPORTDEV:
1529                 cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1530                     ddi_node_name(rdip), ddi_get_name_addr(rdip),
1531                     ddi_driver_name(rdip), ddi_get_instance(rdip));
1532                 return (DDI_SUCCESS);
1533 
1534         case DDI_CTLOPS_INITCHILD:
1535                 hdl = ddi_get_parent_data((dev_info_t *)arg);
1536                 if (hdl == NULL) {
1537                         return (DDI_NOT_WELL_FORMED);
1538                 }
1539                 ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1540                 return (DDI_SUCCESS);
1541 
1542         case DDI_CTLOPS_UNINITCHILD:
1543                 ddi_set_name_addr((dev_info_t *)arg, NULL);
1544                 ndi_prop_remove_all((dev_info_t *)arg);
1545                 return (DDI_SUCCESS);
1546 
1547         default:
1548                 return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1549         }
1550 }
1551 
1552 /*
1553  * Functions for device drivers.
1554  */
1555 bd_handle_t
1556 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1557 {
1558         bd_handle_t     hdl;
1559 
1560         hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1561         if (hdl != NULL) {
1562                 hdl->h_ops = *ops;
1563                 hdl->h_dma = dma;
1564                 hdl->h_private = private;
1565         }
1566 
1567         return (hdl);
1568 }
1569 
/*
 * Release a handle previously obtained from bd_alloc_handle().
 */
void
bd_free_handle(bd_handle_t hdl)
{
	kmem_free(hdl, sizeof (*hdl));
}
1575 
1576 int
1577 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
1578 {
1579         dev_info_t      *child;
1580         bd_drive_t      drive;
1581 
1582         /* if drivers don't override this, make it assume none */
1583         drive.d_lun = -1;
1584         hdl->h_ops.o_drive_info(hdl->h_private, &drive);
1585 
1586         hdl->h_parent = dip;
1587         hdl->h_name = "blkdev";
1588 
1589         if (drive.d_lun >= 0) {
1590                 (void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X,%X",
1591                     drive.d_target, drive.d_lun);
1592         } else {
1593                 (void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X",
1594                     drive.d_target);
1595         }
1596         if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
1597             &child) != NDI_SUCCESS) {
1598                 cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
1599                     ddi_driver_name(dip), ddi_get_instance(dip),
1600                     "blkdev", hdl->h_addr);
1601                 return (DDI_FAILURE);
1602         }
1603 
1604         ddi_set_parent_data(child, hdl);
1605         hdl->h_child = child;
1606 
1607         if (ndi_devi_online(child, 0) == NDI_FAILURE) {
1608                 cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
1609                     ddi_driver_name(dip), ddi_get_instance(dip),
1610                     hdl->h_name, hdl->h_addr);
1611                 (void) ndi_devi_free(child);
1612                 return (DDI_FAILURE);
1613         }
1614 
1615         return (DDI_SUCCESS);
1616 }
1617 
1618 int
1619 bd_detach_handle(bd_handle_t hdl)
1620 {
1621         int     circ;
1622         int     rv;
1623         char    *devnm;
1624 
1625         if (hdl->h_child == NULL) {
1626                 return (DDI_SUCCESS);
1627         }
1628         ndi_devi_enter(hdl->h_parent, &circ);
1629         if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
1630                 rv = ddi_remove_child(hdl->h_child, 0);
1631         } else {
1632                 devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
1633                 (void) ddi_deviname(hdl->h_child, devnm);
1634                 (void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
1635                 rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
1636                     NDI_DEVI_REMOVE | NDI_UNCONFIG);
1637                 kmem_free(devnm, MAXNAMELEN + 1);
1638         }
1639         if (rv == 0) {
1640                 hdl->h_child = NULL;
1641         }
1642 
1643         ndi_devi_exit(hdl->h_parent, circ);
1644         return (rv = NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
1645 }
1646 
1647 void
1648 bd_xfer_done(bd_xfer_t *xfer, int err)
1649 {
1650         bd_xfer_impl_t  *xi = (void *)xfer;
1651         buf_t           *bp = xi->i_bp;
1652         int             rv = DDI_SUCCESS;
1653         bd_t            *bd = xi->i_bd;
1654         size_t          len;
1655 
1656         if (err != 0) {
1657                 bd_runq_exit(xi, err);
1658 
1659                 bp->b_resid += xi->i_resid;
1660                 bd_xfer_free(xi);
1661                 bioerror(bp, err);
1662                 biodone(bp);
1663                 return;
1664         }
1665 
1666         xi->i_cur_win++;
1667         xi->i_resid -= xi->i_len;
1668 
1669         if (xi->i_resid == 0) {
1670                 /* Job completed succcessfully! */
1671                 bd_runq_exit(xi, 0);
1672 
1673                 bd_xfer_free(xi);
1674                 biodone(bp);
1675                 return;
1676         }
1677 
1678         xi->i_blkno += xi->i_nblks;
1679 
1680         if (bd->d_use_dma) {
1681                 /* More transfer still pending... advance to next DMA window. */
1682                 rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
1683                     &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
1684         } else {
1685                 /* Advance memory window. */
1686                 xi->i_kaddr += xi->i_len;
1687                 xi->i_offset += xi->i_len;
1688                 len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
1689         }
1690 
1691 
1692         if ((rv != DDI_SUCCESS) ||
1693             (P2PHASE(len, (1U << xi->i_blkshift) != 0))) {
1694                 bd_runq_exit(xi, EFAULT);
1695 
1696                 bp->b_resid += xi->i_resid;
1697                 bd_xfer_free(xi);
1698                 bioerror(bp, EFAULT);
1699                 biodone(bp);
1700                 return;
1701         }
1702         xi->i_len = len;
1703         xi->i_nblks = len >> xi->i_blkshift;
1704 
1705         /* Submit next window to hardware. */
1706         rv = xi->i_func(bd->d_private, &xi->i_public);
1707         if (rv != 0) {
1708                 bd_runq_exit(xi, rv);
1709 
1710                 bp->b_resid += xi->i_resid;
1711                 bd_xfer_free(xi);
1712                 bioerror(bp, rv);
1713                 biodone(bp);
1714         }
1715 }
1716 
1717 void
1718 bd_state_change(bd_handle_t hdl)
1719 {
1720         bd_t            *bd;
1721 
1722         if ((bd = hdl->h_bd) != NULL) {
1723                 bd_update_state(bd);
1724         }
1725 }
1726 
/*
 * Called by client drivers from their _init(9E): install the blkdev
 * nexus bus_ops vector into the driver's dev_ops so its children are
 * managed by this framework.
 */
void
bd_mod_init(struct dev_ops *devops)
{
	static struct bus_ops bd_bus_ops = {
		BUSO_REV,		/* busops_rev */
		nullbusmap,		/* bus_map */
		NULL,			/* bus_get_intrspec (OBSOLETE) */
		NULL,			/* bus_add_intrspec (OBSOLETE) */
		NULL,			/* bus_remove_intrspec (OBSOLETE) */
		i_ddi_map_fault,	/* bus_map_fault */
		NULL,			/* bus_dma_map (OBSOLETE) */
		ddi_dma_allochdl,	/* bus_dma_allochdl */
		ddi_dma_freehdl,	/* bus_dma_freehdl */
		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
		ddi_dma_flush,		/* bus_dma_flush */
		ddi_dma_win,		/* bus_dma_win */
		ddi_dma_mctl,		/* bus_dma_ctl */
		bd_bus_ctl,		/* bus_ctl */
		ddi_bus_prop_op,	/* bus_prop_op */
		NULL,			/* bus_get_eventcookie */
		NULL,			/* bus_add_eventcall */
		NULL,			/* bus_remove_eventcall */
		NULL,			/* bus_post_event */
		NULL,			/* bus_intr_ctl (OBSOLETE) */
		NULL,			/* bus_config */
		NULL,			/* bus_unconfig */
		NULL,			/* bus_fm_init */
		NULL,			/* bus_fm_fini */
		NULL,			/* bus_fm_access_enter */
		NULL,			/* bus_fm_access_exit */
		NULL,			/* bus_power */
		NULL,			/* bus_intr_op */
	};

	devops->devo_bus_ops = &bd_bus_ops;

	/*
	 * NB: The device driver is free to supply its own
	 * character entry device support.
	 */
}
1769 
/*
 * Called by client drivers from their _fini(9E): undo bd_mod_init()
 * by clearing the bus_ops vector from the driver's dev_ops.
 */
void
bd_mod_fini(struct dev_ops *devops)
{
	devops->devo_bus_ops = NULL;
}