/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>

/*
 * Number of times that zfs_free_range() took the slow path while doing
 * a zfs receive. A nonzero value indicates a potential performance problem.
 */
uint64_t zfs_free_range_recv_miss;

static void dbuf_destroy(dmu_buf_impl_t *db);
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

#ifndef __lint
extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
    dmu_buf_evict_func_t *evict_func_prep, dmu_buf_evict_func_t *evict_func,
    dmu_buf_t **clear_on_evict_dbufp);
#endif /* ! __lint */

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;
static taskq_t *dbu_evict_taskq;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);

	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

static dmu_buf_impl_t *
dbuf_find_bonus(objset_t *os, uint64_t object)
{
	dnode_t *dn;
	dmu_buf_impl_t *db = NULL;

	if (dnode_hold(os, object, FTAG, &dn) == 0) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		if (dn->dn_bonus != NULL) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
		}
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);
	}
	return (db);
}
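
/*
 * Note on the lookup helpers above (a summary of the code, not new
 * behavior): on success, both dbuf_find() and dbuf_find_bonus() return
 * the dbuf with db_mtx held; the caller is responsible for dropping it.
 */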

/*
 * Insert an entry into the hash table. If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_inc_64(&dbuf_hash_count);

	return (NULL);
}

/*
 * Remove an entry from the hash table. It must be in the EVICTING state.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_dec_64(&dbuf_hash_count);
}

static arc_evict_func_t dbuf_do_evict;

typedef enum {
	DBVU_EVICTING,
	DBVU_NOT_EVICTING
} dbvu_verify_type_t;

static void
dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
{
#ifdef ZFS_DEBUG
	int64_t holds;

	if (db->db_user == NULL)
		return;

	/* Only data blocks support the attachment of user data. */
	ASSERT(db->db_level == 0);

	/* Clients must resolve a dbuf before attaching user data. */
	ASSERT(db->db.db_data != NULL);
	ASSERT3U(db->db_state, ==, DB_CACHED);

	holds = refcount_count(&db->db_holds);
	if (verify_type == DBVU_EVICTING) {
		/*
		 * Immediate eviction occurs when holds == dirtycnt.
		 * For normal eviction buffers, holds is zero on
		 * eviction, except when dbuf_fix_old_data() calls
		 * dbuf_clear_data(). However, the hold count can grow
		 * during eviction even though db_mtx is held (see
		 * dmu_bonus_hold() for an example), so we can only
		 * test the generic invariant that holds >= dirtycnt.
		 */
		ASSERT3U(holds, >=, db->db_dirtycnt);
	} else {
		if (db->db_user_immediate_evict == TRUE)
			ASSERT3U(holds, >=, db->db_dirtycnt);
		else
			ASSERT3U(holds, >, 0);
	}
#endif
}

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	dmu_buf_user_t *dbu = db->db_user;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (dbu == NULL)
		return;

	dbuf_verify_user(db, DBVU_EVICTING);
	db->db_user = NULL;

#ifdef ZFS_DEBUG
	if (dbu->dbu_clear_on_evict_dbufp != NULL)
		*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif

	if (dbu->dbu_evict_func_prep != NULL)
		dbu->dbu_evict_func_prep(dbu);

	/*
	 * Invoke the callback from a taskq to avoid lock order reversals
	 * and limit stack depth.
	 */
	taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
	    &dbu->dbu_tqent);
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	if (db->db_level > 0) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size. The table will take up
	 * totalmem * sizeof (void *) / 4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

	/*
	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
	 * configuration is not required.
	 */
	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
	taskq_destroy(dbu_evict_taskq);
}
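
/*
 * Illustrative sizing for dbuf_init() above (hypothetical numbers, not
 * taken from the source): on a machine with 16GB of physical memory,
 * the loop stops at the first hsize satisfying hsize * 4096 >= 2^34,
 * i.e. hsize = 2^22 buckets, costing 2^22 * 8 = 32MB of bucket
 * pointers -- consistent with the 2MB/GB figure in the comment there.
 */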

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock. XXX indblksz no longer
			 * grows. safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_clear_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	dbuf_evict_user(db);
	db->db_buf = NULL;
	db->db.db_data = NULL;
	if (db->db_state != DB_NOFILL)
		db->db_state = DB_UNCACHED;
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf != NULL);

	db->db_buf = buf;
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
	if (!arc_released(buf))
		arc_set_callback(buf, dbuf_do_evict, db);
}

/*
 * Loan out an arc_buf for read. Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_clear_data(db);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

/*
 * Calculate which level n block references the data at the level 0 offset
 * provided.
 */
uint64_t
dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{
	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
		/*
		 * The level n blkid is equal to the level 0 blkid divided by
		 * the number of level 0s in a level n block.
		 *
		 * The level 0 blkid is offset >> datablkshift =
		 * offset / 2^datablkshift.
		 *
		 * The number of level 0s in a level n is the number of block
		 * pointers in an indirect block, raised to the power of level.
		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
		 *
		 * Thus, the level n blkid is: offset /
		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
		 * = offset / 2^(datablkshift + level *
		 * (indblkshift - SPA_BLKPTRSHIFT))
		 * = offset >> (datablkshift + level *
		 * (indblkshift - SPA_BLKPTRSHIFT))
		 */
		return (offset >> (dn->dn_datablkshift + level *
		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}
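
/*
 * Worked example for dbuf_whichblock() (hypothetical geometry, for
 * illustration only): with 128K data blocks (datablkshift = 17) and 16K
 * indirect blocks (indblkshift = 14), an indirect block holds
 * 2^(14 - SPA_BLKPTRSHIFT) = 2^7 = 128 block pointers, so a level-1
 * block spans 2^(17 + 7) = 16MB of file data. For level = 1 and
 * offset = 48MB, the function returns 48MB >> 24 = 3.
 */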

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	arc_flags_t aflags = ARC_FLAG_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		DB_DNODE_EXIT(db);
		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_FLAG_L2CACHE;
	if (DBUF_IS_L2COMPRESSIBLE(db))
		aflags |= ARC_FLAG_L2COMPRESS;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
}
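
/*
 * Sketch of the dbuf state transitions that the read path relies on,
 * summarized from the code in this file (a reading aid, not a formal
 * specification):
 *
 *	DB_UNCACHED --dbuf_read_impl()--> DB_READ --dbuf_read_done()-->
 *	DB_CACHED on success, or back to DB_UNCACHED on I/O error.
 *
 * DB_FILL and DB_NOFILL are entered from the write path (dbuf_noread()
 * and dmu_buf_will_not_fill() below), and DB_EVICTING from dbuf_clear().
 */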

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED. Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED. Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
				    db, zio_t *, zio);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_clear_data(db);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function. It makes a copy of buffers
 * that have been modified in a previous transaction group before we
 * modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_clear_data(db);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state. Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release(). Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t db_search;
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	avl_index_t where;

	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
		end_blkid = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
		/* There can't be any dbufs in this range; no need to search. */
#ifdef DEBUG
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		ASSERT3P(db, ==, NULL);
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
		ASSERT(db == NULL || db->db_level > 0);
#endif
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order. If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db != NULL; db = db_next) {
		db_next = AVL_NEXT(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0 || db->db_blkid > end_blkid) {
			break;
		}
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 *
	 * This logic ensures that only block births for
	 * filled blocks are considered.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty && (db->db_blkptr == NULL ||
	    !BP_IS_HOLE(db->db_blkptr))) {
		birth_txg = db->db_last_dirty->dr_txg;
	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
		birth_txg = db->db_blkptr->blk_birth;
	}

	/*
	 * If this block doesn't exist or is in a snapshot, it can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg != 0)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (B_FALSE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size - osize, tx);
	DB_DNODE_EXIT(db);
}

void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}

/*
 * We already have a dirty record for this TXG, and we are being
 * dirtied again.
 */
static void
dbuf_redirty(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this buffer has already been written out,
		 * we now need to reset its state.
		 */
		dbuf_unoverride(dr);
		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
		    db->db_state != DB_NOFILL) {
			/* Already released on initial dirty, so just thaw. */
			ASSERT(arc_released(db->db_buf));
			arc_buf_thaw(db->db_buf);
		}
	}
}
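
/*
 * Overview of dbuf_dirty() below, summarized from its body as a reading
 * aid: (1) if a dirty record for this txg already exists, call
 * dbuf_redirty() and return it; (2) otherwise allocate a new
 * dbuf_dirty_record_t, preserving data still referenced by an older txg
 * via dbuf_fix_old_data(); (3) link the record either into the parent
 * indirect's dr_children or onto dn_dirty_records[txgoff], and finally
 * dnode_setdirty().
 */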
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context. Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty. They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too? The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		dbuf_redirty(dr);
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context. Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx. This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block. Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
		dr->dr_accounted = db->db.db_size;
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty. We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_free_ranges[txgoff] != NULL) {
			range_tree_clear(dn->dn_free_ranges[txgoff],
			    db->db_blkid, 1);
		}
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet. We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/*
		 * Since we've dropped the mutex, it's possible that
		 * dbuf_undirty() might have changed this out from under us.
		 */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
}
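
/*
 * Typical open-context use of the dirty path, as a sketch (see
 * dmu_buf_will_dirty() below, which is the canonical caller):
 *
 *	dmu_buf_will_dirty(&db->db, tx);
 *	    -> dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)
 *	    -> dbuf_dirty(db, tx)
 */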

/*
 * Undirty a buffer in the transaction group referenced by the given
 * transaction. Return whether this evicted the dbuf.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);

	/*
	 * Due to our use of dn_nlevels below, this can only be called
	 * in open context, unless we are operating on the MOS.
	 * From syncing context, dn_nlevels may be different from the
	 * dn_nlevels used when dbuf was dirtied.
	 */
	ASSERT(db->db_objset ==
	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
	    dr->dr_accounted, txg);

	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level + 1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}

	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_clear_data(db);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}

void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	/*
	 * Quick check for dirtiness. For already dirty blocks, this
	 * reduces runtime of this function by >90%, and overall performance
	 * by 50% for some workloads (e.g. file deletion with indirect blocks
	 * cached).
	 */
	mutex_enter(&db->db_mtx);
	dbuf_dirty_record_t *dr;
	for (dr = db->db_last_dirty;
	    dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
		/*
		 * It's possible that it is already dirty but not cached,
		 * because there are some calls to dbuf_dirty() that don't
		 * go through dmu_buf_will_dirty().
		 */
		if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
			/* This dbuf is already dirty and cached. */
			dbuf_redirty(dr);
			mutex_exit(&db->db_mtx);
			return;
		}
	}
	mutex_exit(&db->db_mtx);

	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
    bp_embedded_type_t etype, enum zio_compress comp,
    int uncompressed_size, int compressed_size, int byteorder,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	struct dirty_leaf *dl;
	dmu_object_type_t type;

	if (etype == BP_EMBEDDED_TYPE_DATA) {
		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
		    SPA_FEATURE_EMBEDDED_DATA));
	}

	DB_DNODE_ENTER(db);
	type = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	dmu_buf_will_not_fill(dbuf, tx);

	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
	dl = &db->db_last_dirty->dt.dl;
	encode_embedded_bp_compressed(&dl->dr_overridden_by,
	    data, comp, uncompressed_size, compressed_size);
	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
	BP_SET_TYPE(&dl->dr_overridden_by, type);
	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);

	dl->dr_override_state = DR_OVERRIDDEN;
	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dmu_buf_fill_done(&db->db, tx);
}
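
/*
 * Usage sketch for dbuf_assign_arcbuf() above (a hypothetical caller,
 * for illustration): a writer that obtained a loaned buffer, e.g. via
 * dbuf_loan_arcbuf(), can hand it back without a copy:
 *
 *	arc_buf_t *abuf = dbuf_loan_arcbuf(db);
 *	... fill abuf->b_data ...
 *	dbuf_assign_arcbuf(db, abuf, tx);
 */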

/*
 * "Clear" the contents of this dbuf. This will mark the dbuf
 * EVICTING and clear *most* of its references. Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
 * in this case. For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_clear_callback()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * This routine will dissociate the dbuf from the arc, by calling
 * arc_clear_callback(), but will not evict the data from the ARC.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	boolean_t dbuf_gone = B_FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		avl_remove(&dn->dn_dbufs, db);
		atomic_dec_32(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_clear_callback(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

/*
 * Note: While bpp will always be updated if the function returns success,
 * parentp will not be updated if the dnode does not have dn_dbuf filled in;
 * this happens when the dnode is the meta-dnode, or a userused or groupused
 * object.
 */
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (SET_ERROR(ENOENT));
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user = NULL;
	db->db_user_immediate_evict = FALSE;
	db->db_freed_in_flight = FALSE;
	db->db_pending_evict = FALSE;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user = NULL;
	db->db_user_immediate_evict = FALSE;
	db->db_freed_in_flight = FALSE;
	db->db_pending_evict = FALSE;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we insert the new dbuf into the hash
	 * table *and* add it to the dn_dbufs list.  This prevents a possible
	 * deadlock with someone trying to look up this dbuf before it's
	 * added to the dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	avl_add(&dn->dn_dbufs, db);
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	atomic_inc_32(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_do_evict(void *private)
{
	dmu_buf_impl_t *db = private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			avl_remove(&dn->dn_dbufs, db);
			atomic_dec_32(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
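/*
 * Lifecycle note (illustrative summary): every dbuf_create() above is
 * balanced by a dbuf_destroy(), and the
 * arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER) in the former
 * is paired with the arc_space_return() in the latter, so the ARC's "other"
 * space accounting tracks the number of live dbuf headers.
 */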
typedef struct dbuf_prefetch_arg {
	spa_t *dpa_spa;			/* The spa to issue the prefetch in. */
	zbookmark_phys_t dpa_zb;	/* The target block to prefetch. */
	int dpa_epbs;			/* Entries (blkptr_t's) Per Block Shift. */
	int dpa_curlevel;		/* The current level that we're reading. */
	zio_priority_t dpa_prio;	/* The priority I/Os should be issued at. */
	zio_t *dpa_zio;			/* The parent zio_t for all prefetches. */
	arc_flags_t dpa_aflags;		/* Flags to pass to the final prefetch. */
} dbuf_prefetch_arg_t;

/*
 * Actually issue the prefetch read for the block given.
 */
static void
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
{
	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
		return;

	arc_flags_t aflags =
	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;

	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
	ASSERT(dpa->dpa_zio != NULL);
	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
	    &aflags, &dpa->dpa_zb);
}

/*
 * Called when an indirect block above our prefetch target is read in.  This
 * will either read in the next indirect block down the tree or issue the
 * actual prefetch if the next block down is our target.
 */
static void
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
{
	dbuf_prefetch_arg_t *dpa = private;

	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
	ASSERT3S(dpa->dpa_curlevel, >, 0);
	if (zio != NULL) {
		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
		ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
	}

	dpa->dpa_curlevel--;

	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
		kmem_free(dpa, sizeof (*dpa));
	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
		dbuf_issue_final_prefetch(dpa, bp);
		kmem_free(dpa, sizeof (*dpa));
	} else {
		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
		zbookmark_phys_t zb;

		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));

		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);

		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
		    &iter_aflags, &zb);
	}
	(void) arc_buf_remove_ref(abuf, private);
}
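/*
 * Worked example of the nextblkid computation above (assuming epbs == 10 and
 * a target L0 blkid of 5000000): after the L3 indirect is read and
 * dpa_curlevel drops to 2,
 *
 *	nextblkid = 5000000 >> (10 * (2 - 0)) = 4
 *
 * names the L2 indirect covering the target, found at slot
 * P2PHASE(4, 1024) = 4 of the L3 block just read.  One level further down,
 * nextblkid = 5000000 >> 10 = 4882, at slot P2PHASE(4882, 1024) = 786 of
 * that L2 block.
 */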
/*
 * Issue prefetch reads for the given block on the given level.  If the
 * indirect blocks above that block are not in memory, we will read them in
 * asynchronously.  As a result, this call never blocks waiting for a read to
 * complete.
 */
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
    arc_flags_t aflags)
{
	blkptr_t bp;
	int epbs, nlevels, curlevel;
	uint64_t curblkid;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (blkid > dn->dn_maxblkid)
		return;

	if (dnode_block_freed(dn, blkid))
		return;

	/*
	 * This dnode hasn't been written to disk yet, so there's nothing to
	 * prefetch.
	 */
	nlevels = dn->dn_phys->dn_nlevels;
	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
		return;

	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
		return;

	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
	    level, blkid);
	if (db != NULL) {
		mutex_exit(&db->db_mtx);
		/*
		 * This dbuf already exists.  It is either CACHED, or
		 * (we assume) about to be read or filled.
		 */
		return;
	}

	/*
	 * Find the closest ancestor (indirect block) of the target block
	 * that is present in the cache.  In this indirect block, we will
	 * find the bp that is at curlevel, curblkid.
	 */
	curlevel = level;
	curblkid = blkid;
	while (curlevel < nlevels - 1) {
		int parent_level = curlevel + 1;
		uint64_t parent_blkid = curblkid >> epbs;
		dmu_buf_impl_t *db;

		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
		    FALSE, TRUE, FTAG, &db) == 0) {
			blkptr_t *bpp = db->db_buf->b_data;
			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
			dbuf_rele(db, FTAG);
			break;
		}

		curlevel = parent_level;
		curblkid = parent_blkid;
	}

	if (curlevel == nlevels - 1) {
		/* No cached indirect blocks found. */
		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
		bp = dn->dn_phys->dn_blkptr[curblkid];
	}
	if (BP_IS_HOLE(&bp))
		return;

	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));

	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
	    ZIO_FLAG_CANFAIL);

	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
	    dn->dn_object, level, blkid);
	dpa->dpa_curlevel = curlevel;
	dpa->dpa_prio = prio;
	dpa->dpa_aflags = aflags;
	dpa->dpa_spa = dn->dn_objset->os_spa;
	dpa->dpa_epbs = epbs;
	dpa->dpa_zio = pio;

	/*
	 * If we have the indirect just above us, no need to do the
	 * asynchronous prefetch chain; we'll just run the last step
	 * ourselves.  If we're at a higher level, though, we want to issue
	 * the prefetches for all the indirect blocks asynchronously, so we
	 * can go on with whatever we were doing.
	 */
	if (curlevel == level) {
		ASSERT3U(curblkid, ==, blkid);
		dbuf_issue_final_prefetch(dpa, &bp);
		kmem_free(dpa, sizeof (*dpa));
	} else {
		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
		    dn->dn_object, curlevel, curblkid);
		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
		    &iter_aflags, &zb);
	}
	/*
	 * We use pio here instead of dpa_zio since it's possible that
	 * dpa may have already been freed.
	 */
	zio_nowait(pio);
}
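/*
 * A minimal usage sketch (illustrative only): a caller such as the zfetch
 * code can fire-and-forget a level-0 prefetch while holding the dnode's
 * struct lock:
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	dbuf_prefetch(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ, 0);
 *	rw_exit(&dn->dn_struct_rwlock);
 *
 * The call returns immediately; any indirect reads it needs are chained
 * through dbuf_prefetch_indirect_done() above.
 */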
/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
    boolean_t fail_sparse, boolean_t fail_uncached,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		if (fail_uncached)
			return (SET_ERROR(ENOENT));

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (fail_uncached && db->db_state != DB_CACHED) {
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(ENOENT));
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}
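/*
 * A minimal hold/release sketch for the function above (illustrative only):
 *
 *	dmu_buf_impl_t *db;
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	if (dbuf_hold_impl(dn, 0, blkid, FALSE, FALSE, FTAG, &db) == 0) {
 *		... db_holds has been incremented; db_mtx is not held ...
 *		dbuf_rele(db, FTAG);
 *	}
 *	rw_exit(&dn->dn_struct_rwlock);
 */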
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	return (dbuf_hold_level(dn, 0, blkid, tag));
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
	return (err ? NULL : db);
}

void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}

int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	if (db->db_blkid != DMU_SPILL_BLKID)
		return (SET_ERROR(ENOTSUP));
	if (blksz == 0)
		blksz = SPA_MINBLOCKSIZE;
	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dbuf_new_size(db, blksz, tx);
	rw_exit(&dn->dn_struct_rwlock);
	DB_DNODE_EXIT(db);

	return (0);
}

void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}

#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}

#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
boolean_t
dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
    void *tag)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dmu_buf_impl_t *found_db;
	boolean_t result = B_FALSE;

	if (db->db_blkid == DMU_BONUS_BLKID)
		found_db = dbuf_find_bonus(os, obj);
	else
		found_db = dbuf_find(os, obj, 0, blkid);

	if (found_db != NULL) {
		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
			(void) refcount_add(&db->db_holds, tag);
			result = B_TRUE;
		}
		mutex_exit(&db->db_mtx);
	}
	return (result);
}

/*
 * If you call dbuf_rele() you had better not be referencing the dnode handle
 * unless you have some other direct or indirect hold on the dnode.  (An
 * indirect hold is a hold on one of the dnode's dbufs, including the bonus
 * buffer.)  Without that, the dbuf_rele() could lead to a dnode_rele()
 * followed by the dnode's parent dbuf evicting its dnode handles.
 */
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}

void
dmu_buf_rele(dmu_buf_t *db, void *tag)
{
	dbuf_rele((dmu_buf_impl_t *)db, tag);
}
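/*
 * Hold accounting note (illustrative summary, relying on dbuf_dirty()
 * taking one hold per in-flight txg): db_holds counts all active references,
 * and each dirty record contributes one of them.  That is why
 * dbuf_try_add_ref() above only succeeds when
 * dbuf_refcount(db) > db->db_dirtycnt, i.e. when someone other than the
 * dirty records still holds the buffer.
 */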
/*
 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
 * db_dirtycnt and db_holds to be updated atomically.
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_user_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			dnode_t *dn;
			boolean_t evict_dbuf = db->db_pending_evict;

			/*
			 * If the dnode moves here, we cannot cross this
			 * barrier until the move completes.
			 */
			DB_DNODE_ENTER(db);

			dn = DB_DNODE(db);
			atomic_dec_32(&dn->dn_dbufs_count);

			/*
			 * Decrementing the dbuf count means that the bonus
			 * buffer's dnode hold is no longer discounted in
			 * dnode_move().  The dnode cannot move until after
			 * the dnode_rele() below.
			 */
			DB_DNODE_EXIT(db);

			/*
			 * Do not reference db after its lock is dropped.
			 * Another thread may evict it.
			 */
			mutex_exit(&db->db_mtx);

			if (evict_dbuf)
				dnode_evict_bonus(dn);

			dnode_rele(dn, db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_clear_data(db);
			VERIFY(arc_buf_remove_ref(buf, db));
			dbuf_evict(db);
		} else {
			VERIFY(!arc_buf_remove_ref(db->db_buf, db));

			/*
			 * A dbuf will be eligible for eviction if either the
			 * 'primarycache' property is set or a duplicate
			 * copy of this buffer is already cached in the arc.
			 *
			 * In the case of the 'primarycache' property, a
			 * buffer is considered for eviction if it matches
			 * the criteria set in the property.
			 *
			 * To decide if our buffer is considered a
			 * duplicate, we must call into the arc to determine
			 * if multiple buffers are referencing the same
			 * block on-disk.  If so, then we simply evict
			 * ourselves.
			 */
			if (!DBUF_IS_CACHEABLE(db)) {
				if (db->db_blkptr != NULL &&
				    !BP_IS_HOLE(db->db_blkptr) &&
				    !BP_IS_EMBEDDED(db->db_blkptr)) {
					spa_t *spa =
					    dmu_objset_spa(db->db_objset);
					blkptr_t bp = *db->db_blkptr;
					dbuf_clear(db);
					arc_freed(spa, &bp);
				} else {
					dbuf_clear(db);
				}
			} else if (db->db_pending_evict ||
			    arc_buf_eviction_needed(db->db_buf)) {
				dbuf_clear(db);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}

#pragma weak dmu_buf_refcount = dbuf_refcount
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

void *
dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
    dmu_buf_user_t *new_user)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	mutex_enter(&db->db_mtx);
	dbuf_verify_user(db, DBVU_NOT_EVICTING);
	if (db->db_user == old_user)
		db->db_user = new_user;
	else
		old_user = db->db_user;
	dbuf_verify_user(db, DBVU_NOT_EVICTING);
	mutex_exit(&db->db_mtx);

	return (old_user);
}

void *
dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
	return (dmu_buf_replace_user(db_fake, NULL, user));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_user_immediate_evict = TRUE;
	return (dmu_buf_set_user(db_fake, user));
}

void *
dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
	return (dmu_buf_replace_user(db_fake, user, NULL));
}

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	dbuf_verify_user(db, DBVU_NOT_EVICTING);
	return (db->db_user);
}

void
dmu_buf_user_evict_wait()
{
	taskq_wait(dbu_evict_taskq);
}

boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
	boolean_t res = B_FALSE;
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;

	if (db->db_blkptr)
		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
		    db->db_blkptr, db->db_blkptr->blk_birth);

	return (res);
}

blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	return (dbi->db_blkptr);
}
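/*
 * A minimal sketch of the dbuf user API above (illustrative only; the
 * dmu_buf_user_t is assumed to be embedded in some hypothetical client
 * structure):
 *
 *	typedef struct my_client {
 *		dmu_buf_user_t mc_dbu;
 *		...
 *	} my_client_t;
 *
 *	dmu_buf_init_user(&mc->mc_dbu, NULL, my_evict_cb, NULL);
 *	winner = dmu_buf_set_user(db, &mc->mc_dbu);
 *	if (winner != NULL)
 *		... someone else attached a user first; use theirs ...
 */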
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx)) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there were
		 * no blkptrs available in the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			parent = dbuf_hold_level(dn, db->db_level + 1,
			    db->db_blkid >> epbs, db);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}

static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
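/*
 * Ordering note for the function above (illustrative summary): the parent's
 * zio is created by dbuf_write() *before* the children are synced, so each
 * child's write can attach to it; zio_nowait() is only called once every
 * child has been processed and dr_children is empty.  Roughly:
 *
 *	dbuf_write(dr, ...)		creates dr->dr_zio
 *	dbuf_sync_list(children)	children attach to dr->dr_zio
 *	zio_nowait(dr->dr_zio)		the indirect write may now proceed
 */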
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in.  As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DMU_OT_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO.  If this is an empty write it seems
		 * plausible that the IO could actually be completed before
		 * the nowait returns.  We need to DB_DNODE_EXIT() first in
		 * case zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}
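/*
 * The copy made above is what keeps syncing and open-context writers apart;
 * a rough timeline (illustrative only):
 *
 *	txg N is syncing; *datap (== db_buf) is about to be written
 *	the buffer still has active holds, so a copy is made for the write
 *	a writer dirties db_buf again in the open txg N+1
 *	because *datap was copied, the txg N write sees only txg N data
 */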
void
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while ((dr = list_head(list)) != NULL) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT3P(db->db_blkptr, ==, bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (bp->blk_birth != 0) {
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
		    BP_IS_EMBEDDED(bp));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += BP_GET_FILL(ibp);
		}
	}
	DB_DNODE_EXIT(db);

	if (!BP_IS_EMBEDDED(bp))
		bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}
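/*
 * Worked example of the fill count computed above (illustrative): for a
 * 16K level-0 dnode block, db_size >> DNODE_SHIFT = 16384 >> 9 = 32, so
 * fill is the number of those 32 dnode slots actually in use.  For an
 * indirect block with epbs == 10, fill is the sum of BP_GET_FILL() over up
 * to 1024 child blkptrs, i.e. the number of data blocks reachable below it.
 */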
/*
 * The SPA will call this callback several times for each zio - once
 * for every physical child i/o (zio->io_phys_children times).  This
 * allows the DMU to monitor the progress of each logical i/o.  For example,
 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
 * block.  There may be a long delay before all copies/fragments are completed,
 * so this callback allows us to retire dirty space gradually, as the physical
 * i/os complete.
 */
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
	dmu_buf_impl_t *db = arg;
	objset_t *os = db->db_objset;
	dsl_pool_t *dp = dmu_objset_pool(os);
	dbuf_dirty_record_t *dr;
	int delta = 0;

	dr = db->db_data_pending;
	ASSERT3U(dr->dr_txg, ==, zio->io_txg);

	/*
	 * The callback will be called io_phys_children times.  Retire one
	 * portion of our dirty space each time we are called.  Any rounding
	 * error will be cleaned up by dsl_pool_sync()'s call to
	 * dsl_pool_undirty_space().
	 */
	delta = dr->dr_accounted / zio->io_phys_children;
	dsl_pool_undirty_space(dp, delta, zio->io_txg);
}

/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	blkptr_t *bp = db->db_blkptr;
	objset_t *os = db->db_objset;
	dmu_tx_t *tx = os->os_synctx;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db));
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_blkid, <=,
			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}

static void
dbuf_write_nofill_ready(zio_t *zio)
{
	dbuf_write_ready(zio, NULL, zio->io_private);
}

static void
dbuf_write_nofill_done(zio_t *zio)
{
	dbuf_write_done(zio, NULL, zio->io_private);
}

static void
dbuf_write_override_ready(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}
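/*
 * Callback ordering for a single dbuf write (illustrative summary of the
 * functions above): "ready" fires once when the bp has been filled in,
 * "physdone" fires once per physical child i/o, and "done" fires once when
 * the logical i/o completes:
 *
 *	dbuf_write_ready	-> fix up blk_fill, space accounting
 *	dbuf_write_physdone	-> retire dirty space incrementally (xN)
 *	dbuf_write_done		-> unlink the dirty record, drop the hold
 */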
static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}

/* Issue I/O to commit a dirty buffer to disk. */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);
	if (db->db_level == 0 &&
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * The BP for this block has been provided by open context
		 * (by dmu_sync() or dmu_buf_write_embedded()).
		 */
		void *contents = (data != NULL) ? data->b_data : NULL;

		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, contents, db->db.db_size, &zp,
		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
		    dbuf_write_physdone, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}
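/*
 * Summary of the three write paths above (illustrative): an overridden
 * level-0 block reuses the bp already written by dmu_sync()/embedded data
 * via zio_write_override(); a DB_NOFILL buffer issues a data-less
 * zio_write(); everything else goes through arc_write(), which lets the
 * ARC share the buffer with the in-flight i/o.
 */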