Print this page
patch nuke-the-dbuf-hash
patch make-the-merge-easy
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/zfs/dbuf.c
+++ new/usr/src/uts/common/fs/zfs/dbuf.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 27 */
28 28
29 29 #include <sys/zfs_context.h>
30 30 #include <sys/dmu.h>
31 31 #include <sys/dmu_send.h>
32 32 #include <sys/dmu_impl.h>
33 33 #include <sys/dbuf.h>
34 34 #include <sys/dmu_objset.h>
35 35 #include <sys/dsl_dataset.h>
36 36 #include <sys/dsl_dir.h>
37 37 #include <sys/dmu_tx.h>
38 38 #include <sys/spa.h>
39 39 #include <sys/spa_impl.h>
40 40 #include <sys/zio.h>
41 41 #include <sys/dmu_zfetch.h>
42 42 #include <sys/sa.h>
43 43 #include <sys/sa_impl.h>
44 44 #include <sys/zfeature.h>
45 45 #include <sys/blkptr.h>
46 46 #include <sys/range_tree.h>
47 47
48 48 /*
49 49 * Number of times that zfs_free_range() took the slow path while doing
50 50 * a zfs receive. A nonzero value indicates a potential performance problem.
51 51 */
52 52 uint64_t zfs_free_range_recv_miss;
53 53
54 54 static void dbuf_destroy(dmu_buf_impl_t *db);
55 55 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
56 56 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
57 57
58 58 /*
59 59 * Global data structures and functions for the dbuf cache.
60 60 */
61 61 static kmem_cache_t *dbuf_cache;
62 62
63 63 /* ARGSUSED */
64 64 static int
65 65 dbuf_cons(void *vdb, void *unused, int kmflag)
66 66 {
67 67 dmu_buf_impl_t *db = vdb;
68 68 bzero(db, sizeof (dmu_buf_impl_t));
69 69
70 70 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
71 71 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
72 72 refcount_create(&db->db_holds);
73 73
74 74 return (0);
75 75 }
76 76
↓ open down ↓ |
76 lines elided |
↑ open up ↑ |
77 77 /* ARGSUSED */
78 78 static void
79 79 dbuf_dest(void *vdb, void *unused)
80 80 {
81 81 dmu_buf_impl_t *db = vdb;
82 82 mutex_destroy(&db->db_mtx);
83 83 cv_destroy(&db->db_changed);
84 84 refcount_destroy(&db->db_holds);
85 85 }
86 86
87 -/*
88 - * dbuf hash table routines
89 - */
90 -#pragma align 64(dbuf_hash_table)
91 -static dbuf_hash_table_t dbuf_hash_table;
92 -
93 -static uint64_t dbuf_hash_count;
94 -
95 -static uint64_t
96 -dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
97 -{
98 - uintptr_t osv = (uintptr_t)os;
99 - uint64_t crc = -1ULL;
100 -
101 - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
102 - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
103 - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
104 - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
105 - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
106 - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
107 - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
108 -
109 - crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
110 -
111 - return (crc);
112 -}
113 -
114 -#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
115 -
116 -#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
117 - ((dbuf)->db.db_object == (obj) && \
118 - (dbuf)->db_objset == (os) && \
119 - (dbuf)->db_level == (level) && \
120 - (dbuf)->db_blkid == (blkid))
121 -
122 87 dmu_buf_impl_t *
123 88 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
124 89 {
125 - dbuf_hash_table_t *h = &dbuf_hash_table;
126 90 objset_t *os = dn->dn_objset;
127 91 uint64_t obj = dn->dn_object;
128 - uint64_t hv = DBUF_HASH(os, obj, level, blkid);
129 - uint64_t idx = hv & h->hash_table_mask;
130 92 dmu_buf_impl_t *db;
93 + dmu_buf_impl_t key;
94 + avl_index_t where;
131 95
132 - mutex_enter(DBUF_HASH_MUTEX(h, idx));
133 - for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
134 - if (DBUF_EQUAL(db, os, obj, level, blkid)) {
135 - mutex_enter(&db->db_mtx);
136 - if (db->db_state != DB_EVICTING) {
137 - mutex_exit(DBUF_HASH_MUTEX(h, idx));
138 - return (db);
139 - }
140 - mutex_exit(&db->db_mtx);
141 - }
142 - }
143 - mutex_exit(DBUF_HASH_MUTEX(h, idx));
144 - return (NULL);
145 -}
96 + key.db_level = level;
97 + key.db_blkid = blkid;
98 + key.db_state = DB_SEARCH;
146 99
147 -/*
148 - * Insert an entry into the hash table. If there is already an element
149 - * equal to elem in the hash table, then the already existing element
150 - * will be returned and the new element will not be inserted.
151 - * Otherwise returns NULL.
152 - */
153 -static dmu_buf_impl_t *
154 -dbuf_hash_insert(dmu_buf_impl_t *db)
155 -{
156 - dbuf_hash_table_t *h = &dbuf_hash_table;
157 - objset_t *os = db->db_objset;
158 - uint64_t obj = db->db.db_object;
159 - int level = db->db_level;
160 - uint64_t blkid = db->db_blkid;
161 - uint64_t hv = DBUF_HASH(os, obj, level, blkid);
162 - uint64_t idx = hv & h->hash_table_mask;
163 - dmu_buf_impl_t *dbf;
164 -
165 - mutex_enter(DBUF_HASH_MUTEX(h, idx));
166 - for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
167 - if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
168 - mutex_enter(&dbf->db_mtx);
169 - if (dbf->db_state != DB_EVICTING) {
170 - mutex_exit(DBUF_HASH_MUTEX(h, idx));
171 - return (dbf);
172 - }
173 - mutex_exit(&dbf->db_mtx);
100 + mutex_enter(&dn->dn_dbufs_mtx);
101 + db = avl_find(&dn->dn_dbufs, &key, &where);
102 + ASSERT3P(db, ==, NULL);
103 + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
104 +
105 + for (; db; db = AVL_NEXT(&dn->dn_dbufs, db)) {
106 + if ((db->db_level != level) || (db->db_blkid != blkid))
107 + break;
108 +
109 + mutex_enter(&db->db_mtx);
110 + if (db->db_state != DB_EVICTING) {
111 + mutex_exit(&dn->dn_dbufs_mtx);
112 + return (db);
174 113 }
114 + mutex_exit(&db->db_mtx);
175 115 }
176 116
177 - mutex_enter(&db->db_mtx);
178 - db->db_hash_next = h->hash_table[idx];
179 - h->hash_table[idx] = db;
180 - mutex_exit(DBUF_HASH_MUTEX(h, idx));
181 - atomic_inc_64(&dbuf_hash_count);
182 -
117 + mutex_exit(&dn->dn_dbufs_mtx);
183 118 return (NULL);
184 119 }
185 120
186 -/*
187 - * Remove an entry from the hash table. It must be in the EVICTING state.
188 - */
189 -static void
190 -dbuf_hash_remove(dmu_buf_impl_t *db)
191 -{
192 - dbuf_hash_table_t *h = &dbuf_hash_table;
193 - uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
194 - db->db_level, db->db_blkid);
195 - uint64_t idx = hv & h->hash_table_mask;
196 - dmu_buf_impl_t *dbf, **dbp;
197 -
198 - /*
199 - * We musn't hold db_mtx to maintain lock ordering:
200 - * DBUF_HASH_MUTEX > db_mtx.
201 - */
202 - ASSERT(refcount_is_zero(&db->db_holds));
203 - ASSERT(db->db_state == DB_EVICTING);
204 - ASSERT(!MUTEX_HELD(&db->db_mtx));
205 -
206 - mutex_enter(DBUF_HASH_MUTEX(h, idx));
207 - dbp = &h->hash_table[idx];
208 - while ((dbf = *dbp) != db) {
209 - dbp = &dbf->db_hash_next;
210 - ASSERT(dbf != NULL);
211 - }
212 - *dbp = db->db_hash_next;
213 - db->db_hash_next = NULL;
214 - mutex_exit(DBUF_HASH_MUTEX(h, idx));
215 - atomic_dec_64(&dbuf_hash_count);
216 -}
217 -
218 121 static arc_evict_func_t dbuf_do_evict;
219 122
220 123 static void
221 124 dbuf_evict_user(dmu_buf_impl_t *db)
222 125 {
223 126 ASSERT(MUTEX_HELD(&db->db_mtx));
224 127
225 128 if (db->db_level != 0 || db->db_evict_func == NULL)
226 129 return;
227 130
228 131 if (db->db_user_data_ptr_ptr)
229 132 *db->db_user_data_ptr_ptr = db->db.db_data;
230 133 db->db_evict_func(&db->db, db->db_user_ptr);
231 134 db->db_user_ptr = NULL;
232 135 db->db_user_data_ptr_ptr = NULL;
233 136 db->db_evict_func = NULL;
234 137 }
235 138
236 139 boolean_t
237 140 dbuf_is_metadata(dmu_buf_impl_t *db)
238 141 {
239 142 if (db->db_level > 0) {
240 143 return (B_TRUE);
241 144 } else {
242 145 boolean_t is_metadata;
243 146
244 147 DB_DNODE_ENTER(db);
245 148 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
246 149 DB_DNODE_EXIT(db);
247 150
248 151 return (is_metadata);
249 152 }
250 153 }
251 154
252 155 void
253 156 dbuf_evict(dmu_buf_impl_t *db)
254 157 {
255 158 ASSERT(MUTEX_HELD(&db->db_mtx));
↓ open down ↓ |
28 lines elided |
↑ open up ↑ |
256 159 ASSERT(db->db_buf == NULL);
257 160 ASSERT(db->db_data_pending == NULL);
258 161
259 162 dbuf_clear(db);
260 163 dbuf_destroy(db);
261 164 }
262 165
263 166 void
264 167 dbuf_init(void)
265 168 {
266 - uint64_t hsize = 1ULL << 16;
267 - dbuf_hash_table_t *h = &dbuf_hash_table;
268 - int i;
269 -
270 - /*
271 - * The hash table is big enough to fill all of physical memory
272 - * with an average 4K block size. The table will take up
273 - * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
274 - */
275 - while (hsize * 4096 < physmem * PAGESIZE)
276 - hsize <<= 1;
277 -
278 -retry:
279 - h->hash_table_mask = hsize - 1;
280 - h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
281 - if (h->hash_table == NULL) {
282 - /* XXX - we should really return an error instead of assert */
283 - ASSERT(hsize > (1ULL << 10));
284 - hsize >>= 1;
285 - goto retry;
286 - }
287 -
288 169 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
289 170 sizeof (dmu_buf_impl_t),
290 171 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
291 -
292 - for (i = 0; i < DBUF_MUTEXES; i++)
293 - mutex_init(DBUF_HASH_MUTEX(h, i), NULL, MUTEX_DEFAULT, NULL);
294 172 }
295 173
296 174 void
297 175 dbuf_fini(void)
298 176 {
299 - dbuf_hash_table_t *h = &dbuf_hash_table;
300 - int i;
301 -
302 - for (i = 0; i < DBUF_MUTEXES; i++)
303 - mutex_destroy(DBUF_HASH_MUTEX(h, i));
304 - kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
305 177 kmem_cache_destroy(dbuf_cache);
306 178 }
307 179
308 180 /*
309 181 * Other stuff.
310 182 */
311 183
312 184 #ifdef ZFS_DEBUG
313 185 static void
314 186 dbuf_verify(dmu_buf_impl_t *db)
315 187 {
316 188 dnode_t *dn;
317 189 dbuf_dirty_record_t *dr;
318 190
319 191 ASSERT(MUTEX_HELD(&db->db_mtx));
320 192
321 193 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
322 194 return;
323 195
324 196 ASSERT(db->db_objset != NULL);
325 197 DB_DNODE_ENTER(db);
326 198 dn = DB_DNODE(db);
327 199 if (dn == NULL) {
328 200 ASSERT(db->db_parent == NULL);
329 201 ASSERT(db->db_blkptr == NULL);
330 202 } else {
331 203 ASSERT3U(db->db.db_object, ==, dn->dn_object);
332 204 ASSERT3P(db->db_objset, ==, dn->dn_objset);
333 205 ASSERT3U(db->db_level, <, dn->dn_nlevels);
334 206 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
335 207 db->db_blkid == DMU_SPILL_BLKID ||
336 208 !avl_is_empty(&dn->dn_dbufs));
337 209 }
338 210 if (db->db_blkid == DMU_BONUS_BLKID) {
339 211 ASSERT(dn != NULL);
340 212 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
341 213 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
342 214 } else if (db->db_blkid == DMU_SPILL_BLKID) {
343 215 ASSERT(dn != NULL);
344 216 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
345 217 ASSERT0(db->db.db_offset);
346 218 } else {
347 219 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
348 220 }
349 221
350 222 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
351 223 ASSERT(dr->dr_dbuf == db);
352 224
353 225 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
354 226 ASSERT(dr->dr_dbuf == db);
355 227
356 228 /*
357 229 * We can't assert that db_size matches dn_datablksz because it
358 230 * can be momentarily different when another thread is doing
359 231 * dnode_set_blksz().
360 232 */
361 233 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
362 234 dr = db->db_data_pending;
363 235 /*
364 236 * It should only be modified in syncing context, so
365 237 * make sure we only have one copy of the data.
366 238 */
367 239 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
368 240 }
369 241
370 242 /* verify db->db_blkptr */
371 243 if (db->db_blkptr) {
372 244 if (db->db_parent == dn->dn_dbuf) {
373 245 /* db is pointed to by the dnode */
374 246 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
375 247 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
376 248 ASSERT(db->db_parent == NULL);
377 249 else
378 250 ASSERT(db->db_parent != NULL);
379 251 if (db->db_blkid != DMU_SPILL_BLKID)
380 252 ASSERT3P(db->db_blkptr, ==,
381 253 &dn->dn_phys->dn_blkptr[db->db_blkid]);
382 254 } else {
383 255 /* db is pointed to by an indirect block */
384 256 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
385 257 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
386 258 ASSERT3U(db->db_parent->db.db_object, ==,
387 259 db->db.db_object);
388 260 /*
389 261 * dnode_grow_indblksz() can make this fail if we don't
390 262 * have the struct_rwlock. XXX indblksz no longer
391 263 * grows. safe to do this now?
392 264 */
393 265 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
394 266 ASSERT3P(db->db_blkptr, ==,
395 267 ((blkptr_t *)db->db_parent->db.db_data +
396 268 db->db_blkid % epb));
397 269 }
398 270 }
399 271 }
400 272 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
401 273 (db->db_buf == NULL || db->db_buf->b_data) &&
402 274 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
403 275 db->db_state != DB_FILL && !dn->dn_free_txg) {
404 276 /*
405 277 * If the blkptr isn't set but they have nonzero data,
406 278 * it had better be dirty, otherwise we'll lose that
407 279 * data when we evict this buffer.
408 280 */
409 281 if (db->db_dirtycnt == 0) {
410 282 uint64_t *buf = db->db.db_data;
411 283 int i;
412 284
413 285 for (i = 0; i < db->db.db_size >> 3; i++) {
414 286 ASSERT(buf[i] == 0);
415 287 }
416 288 }
417 289 }
418 290 DB_DNODE_EXIT(db);
419 291 }
420 292 #endif
421 293
422 294 static void
423 295 dbuf_update_data(dmu_buf_impl_t *db)
424 296 {
425 297 ASSERT(MUTEX_HELD(&db->db_mtx));
426 298 if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
427 299 ASSERT(!refcount_is_zero(&db->db_holds));
428 300 *db->db_user_data_ptr_ptr = db->db.db_data;
429 301 }
430 302 }
431 303
432 304 static void
433 305 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
434 306 {
435 307 ASSERT(MUTEX_HELD(&db->db_mtx));
436 308 db->db_buf = buf;
437 309 if (buf != NULL) {
438 310 ASSERT(buf->b_data != NULL);
439 311 db->db.db_data = buf->b_data;
440 312 if (!arc_released(buf))
441 313 arc_set_callback(buf, dbuf_do_evict, db);
442 314 dbuf_update_data(db);
443 315 } else {
444 316 dbuf_evict_user(db);
445 317 db->db.db_data = NULL;
446 318 if (db->db_state != DB_NOFILL)
447 319 db->db_state = DB_UNCACHED;
448 320 }
449 321 }
450 322
451 323 /*
452 324 * Loan out an arc_buf for read. Return the loaned arc_buf.
453 325 */
454 326 arc_buf_t *
455 327 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
456 328 {
457 329 arc_buf_t *abuf;
458 330
459 331 mutex_enter(&db->db_mtx);
460 332 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
461 333 int blksz = db->db.db_size;
462 334 spa_t *spa = db->db_objset->os_spa;
463 335
464 336 mutex_exit(&db->db_mtx);
465 337 abuf = arc_loan_buf(spa, blksz);
466 338 bcopy(db->db.db_data, abuf->b_data, blksz);
467 339 } else {
468 340 abuf = db->db_buf;
469 341 arc_loan_inuse_buf(abuf, db);
470 342 dbuf_set_data(db, NULL);
471 343 mutex_exit(&db->db_mtx);
472 344 }
473 345 return (abuf);
474 346 }
475 347
476 348 uint64_t
477 349 dbuf_whichblock(dnode_t *dn, uint64_t offset)
478 350 {
479 351 if (dn->dn_datablkshift) {
480 352 return (offset >> dn->dn_datablkshift);
481 353 } else {
482 354 ASSERT3U(offset, <, dn->dn_datablksz);
483 355 return (0);
484 356 }
485 357 }
486 358
487 359 static void
488 360 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
489 361 {
490 362 dmu_buf_impl_t *db = vdb;
491 363
492 364 mutex_enter(&db->db_mtx);
493 365 ASSERT3U(db->db_state, ==, DB_READ);
494 366 /*
495 367 * All reads are synchronous, so we must have a hold on the dbuf
496 368 */
497 369 ASSERT(refcount_count(&db->db_holds) > 0);
498 370 ASSERT(db->db_buf == NULL);
499 371 ASSERT(db->db.db_data == NULL);
500 372 if (db->db_level == 0 && db->db_freed_in_flight) {
501 373 /* we were freed in flight; disregard any error */
502 374 arc_release(buf, db);
503 375 bzero(buf->b_data, db->db.db_size);
504 376 arc_buf_freeze(buf);
505 377 db->db_freed_in_flight = FALSE;
506 378 dbuf_set_data(db, buf);
507 379 db->db_state = DB_CACHED;
508 380 } else if (zio == NULL || zio->io_error == 0) {
509 381 dbuf_set_data(db, buf);
510 382 db->db_state = DB_CACHED;
511 383 } else {
512 384 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
513 385 ASSERT3P(db->db_buf, ==, NULL);
514 386 VERIFY(arc_buf_remove_ref(buf, db));
515 387 db->db_state = DB_UNCACHED;
516 388 }
517 389 cv_broadcast(&db->db_changed);
518 390 dbuf_rele_and_unlock(db, NULL);
519 391 }
520 392
521 393 static void
522 394 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
523 395 {
524 396 dnode_t *dn;
525 397 zbookmark_phys_t zb;
526 398 uint32_t aflags = ARC_NOWAIT;
527 399
528 400 DB_DNODE_ENTER(db);
529 401 dn = DB_DNODE(db);
530 402 ASSERT(!refcount_is_zero(&db->db_holds));
531 403 /* We need the struct_rwlock to prevent db_blkptr from changing. */
532 404 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
533 405 ASSERT(MUTEX_HELD(&db->db_mtx));
534 406 ASSERT(db->db_state == DB_UNCACHED);
535 407 ASSERT(db->db_buf == NULL);
536 408
537 409 if (db->db_blkid == DMU_BONUS_BLKID) {
538 410 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
539 411
540 412 ASSERT3U(bonuslen, <=, db->db.db_size);
541 413 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
542 414 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
543 415 if (bonuslen < DN_MAX_BONUSLEN)
544 416 bzero(db->db.db_data, DN_MAX_BONUSLEN);
545 417 if (bonuslen)
546 418 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
547 419 DB_DNODE_EXIT(db);
548 420 dbuf_update_data(db);
549 421 db->db_state = DB_CACHED;
550 422 mutex_exit(&db->db_mtx);
551 423 return;
552 424 }
553 425
554 426 /*
555 427 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
556 428 * processes the delete record and clears the bp while we are waiting
557 429 * for the dn_mtx (resulting in a "no" from block_freed).
558 430 */
559 431 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
560 432 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
561 433 BP_IS_HOLE(db->db_blkptr)))) {
562 434 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
563 435
564 436 DB_DNODE_EXIT(db);
565 437 dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
566 438 db->db.db_size, db, type));
567 439 bzero(db->db.db_data, db->db.db_size);
568 440 db->db_state = DB_CACHED;
569 441 *flags |= DB_RF_CACHED;
570 442 mutex_exit(&db->db_mtx);
571 443 return;
572 444 }
573 445
574 446 DB_DNODE_EXIT(db);
575 447
576 448 db->db_state = DB_READ;
577 449 mutex_exit(&db->db_mtx);
578 450
579 451 if (DBUF_IS_L2CACHEABLE(db))
580 452 aflags |= ARC_L2CACHE;
581 453 if (DBUF_IS_L2COMPRESSIBLE(db))
582 454 aflags |= ARC_L2COMPRESS;
583 455
584 456 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
585 457 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
586 458 db->db.db_object, db->db_level, db->db_blkid);
587 459
588 460 dbuf_add_ref(db, NULL);
589 461
590 462 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
591 463 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
592 464 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
593 465 &aflags, &zb);
594 466 if (aflags & ARC_CACHED)
595 467 *flags |= DB_RF_CACHED;
596 468 }
597 469
598 470 int
599 471 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
600 472 {
601 473 int err = 0;
602 474 boolean_t havepzio = (zio != NULL);
603 475 boolean_t prefetch;
604 476 dnode_t *dn;
605 477
606 478 /*
607 479 * We don't have to hold the mutex to check db_state because it
608 480 * can't be freed while we have a hold on the buffer.
609 481 */
610 482 ASSERT(!refcount_is_zero(&db->db_holds));
611 483
612 484 if (db->db_state == DB_NOFILL)
613 485 return (SET_ERROR(EIO));
614 486
615 487 DB_DNODE_ENTER(db);
616 488 dn = DB_DNODE(db);
617 489 if ((flags & DB_RF_HAVESTRUCT) == 0)
618 490 rw_enter(&dn->dn_struct_rwlock, RW_READER);
619 491
620 492 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
621 493 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
622 494 DBUF_IS_CACHEABLE(db);
623 495
624 496 mutex_enter(&db->db_mtx);
625 497 if (db->db_state == DB_CACHED) {
626 498 mutex_exit(&db->db_mtx);
627 499 if (prefetch)
628 500 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
629 501 db->db.db_size, TRUE);
630 502 if ((flags & DB_RF_HAVESTRUCT) == 0)
631 503 rw_exit(&dn->dn_struct_rwlock);
632 504 DB_DNODE_EXIT(db);
633 505 } else if (db->db_state == DB_UNCACHED) {
634 506 spa_t *spa = dn->dn_objset->os_spa;
635 507
636 508 if (zio == NULL)
637 509 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
638 510 dbuf_read_impl(db, zio, &flags);
639 511
640 512 /* dbuf_read_impl has dropped db_mtx for us */
641 513
642 514 if (prefetch)
643 515 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
644 516 db->db.db_size, flags & DB_RF_CACHED);
645 517
646 518 if ((flags & DB_RF_HAVESTRUCT) == 0)
647 519 rw_exit(&dn->dn_struct_rwlock);
648 520 DB_DNODE_EXIT(db);
649 521
650 522 if (!havepzio)
651 523 err = zio_wait(zio);
652 524 } else {
653 525 /*
654 526 * Another reader came in while the dbuf was in flight
655 527 * between UNCACHED and CACHED. Either a writer will finish
656 528 * writing the buffer (sending the dbuf to CACHED) or the
657 529 * first reader's request will reach the read_done callback
658 530 * and send the dbuf to CACHED. Otherwise, a failure
659 531 * occurred and the dbuf went to UNCACHED.
660 532 */
661 533 mutex_exit(&db->db_mtx);
662 534 if (prefetch)
663 535 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
664 536 db->db.db_size, TRUE);
665 537 if ((flags & DB_RF_HAVESTRUCT) == 0)
666 538 rw_exit(&dn->dn_struct_rwlock);
667 539 DB_DNODE_EXIT(db);
668 540
669 541 /* Skip the wait per the caller's request. */
670 542 mutex_enter(&db->db_mtx);
671 543 if ((flags & DB_RF_NEVERWAIT) == 0) {
672 544 while (db->db_state == DB_READ ||
673 545 db->db_state == DB_FILL) {
674 546 ASSERT(db->db_state == DB_READ ||
675 547 (flags & DB_RF_HAVESTRUCT) == 0);
676 548 cv_wait(&db->db_changed, &db->db_mtx);
677 549 }
678 550 if (db->db_state == DB_UNCACHED)
679 551 err = SET_ERROR(EIO);
680 552 }
681 553 mutex_exit(&db->db_mtx);
682 554 }
683 555
684 556 ASSERT(err || havepzio || db->db_state == DB_CACHED);
685 557 return (err);
686 558 }
687 559
688 560 static void
689 561 dbuf_noread(dmu_buf_impl_t *db)
690 562 {
691 563 ASSERT(!refcount_is_zero(&db->db_holds));
692 564 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
693 565 mutex_enter(&db->db_mtx);
694 566 while (db->db_state == DB_READ || db->db_state == DB_FILL)
695 567 cv_wait(&db->db_changed, &db->db_mtx);
696 568 if (db->db_state == DB_UNCACHED) {
697 569 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
698 570 spa_t *spa = db->db_objset->os_spa;
699 571
700 572 ASSERT(db->db_buf == NULL);
701 573 ASSERT(db->db.db_data == NULL);
702 574 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
703 575 db->db_state = DB_FILL;
704 576 } else if (db->db_state == DB_NOFILL) {
705 577 dbuf_set_data(db, NULL);
706 578 } else {
707 579 ASSERT3U(db->db_state, ==, DB_CACHED);
708 580 }
709 581 mutex_exit(&db->db_mtx);
710 582 }
711 583
712 584 /*
713 585 * This is our just-in-time copy function. It makes a copy of
714 586 * buffers that have been modified in a previous transaction
715 587 * group, before we modify them in the current active group.
716 588 *
717 589 * This function is used in two places: when we are dirtying a
718 590 * buffer for the first time in a txg, and when we are freeing
719 591 * a range in a dnode that includes this buffer.
720 592 *
721 593 * Note that when we are called from dbuf_free_range() we do
722 594 * not put a hold on the buffer, we just traverse the active
723 595 * dbuf list for the dnode.
724 596 */
725 597 static void
726 598 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
727 599 {
728 600 dbuf_dirty_record_t *dr = db->db_last_dirty;
729 601
730 602 ASSERT(MUTEX_HELD(&db->db_mtx));
731 603 ASSERT(db->db.db_data != NULL);
732 604 ASSERT(db->db_level == 0);
733 605 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
734 606
735 607 if (dr == NULL ||
736 608 (dr->dt.dl.dr_data !=
737 609 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
738 610 return;
739 611
740 612 /*
741 613 * If the last dirty record for this dbuf has not yet synced
742 614 * and it's referencing the dbuf data, either:
743 615 * reset the reference to point to a new copy,
744 616 * or (if there are no active holders)
745 617 * just null out the current db_data pointer.
746 618 */
747 619 ASSERT(dr->dr_txg >= txg - 2);
748 620 if (db->db_blkid == DMU_BONUS_BLKID) {
749 621 /* Note that the data bufs here are zio_bufs */
750 622 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
751 623 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
752 624 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
753 625 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
754 626 int size = db->db.db_size;
755 627 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
756 628 spa_t *spa = db->db_objset->os_spa;
757 629
758 630 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
759 631 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
760 632 } else {
761 633 dbuf_set_data(db, NULL);
762 634 }
763 635 }
764 636
765 637 void
766 638 dbuf_unoverride(dbuf_dirty_record_t *dr)
767 639 {
768 640 dmu_buf_impl_t *db = dr->dr_dbuf;
769 641 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
770 642 uint64_t txg = dr->dr_txg;
771 643
772 644 ASSERT(MUTEX_HELD(&db->db_mtx));
773 645 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
774 646 ASSERT(db->db_level == 0);
775 647
776 648 if (db->db_blkid == DMU_BONUS_BLKID ||
777 649 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
778 650 return;
779 651
780 652 ASSERT(db->db_data_pending != dr);
781 653
782 654 /* free this block */
783 655 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
784 656 zio_free(db->db_objset->os_spa, txg, bp);
785 657
786 658 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
787 659 dr->dt.dl.dr_nopwrite = B_FALSE;
788 660
789 661 /*
790 662 * Release the already-written buffer, so we leave it in
791 663 * a consistent dirty state. Note that all callers are
792 664 * modifying the buffer, so they will immediately do
793 665 * another (redundant) arc_release(). Therefore, leave
794 666 * the buf thawed to save the effort of freezing &
795 667 * immediately re-thawing it.
796 668 */
797 669 arc_release(dr->dt.dl.dr_data, db);
798 670 }
799 671
800 672 /*
801 673 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
802 674 * data blocks in the free range, so that any future readers will find
803 675 * empty blocks.
804 676 *
805 677 * This is a no-op if the dataset is in the middle of an incremental
806 678 * receive; see comment below for details.
807 679 */
808 680 void
809 681 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
810 682 dmu_tx_t *tx)
811 683 {
812 684 dmu_buf_impl_t *db, *db_next, db_search;
813 685 uint64_t txg = tx->tx_txg;
814 686 avl_index_t where;
815 687
816 688 if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
817 689 end_blkid = dn->dn_maxblkid;
818 690 dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
819 691
820 692 db_search.db_level = 0;
821 693 db_search.db_blkid = start_blkid;
822 694 db_search.db_state = DB_SEARCH;
823 695
824 696 mutex_enter(&dn->dn_dbufs_mtx);
825 697 if (start_blkid >= dn->dn_unlisted_l0_blkid) {
826 698 /* There can't be any dbufs in this range; no need to search. */
827 699 #ifdef DEBUG
828 700 db = avl_find(&dn->dn_dbufs, &db_search, &where);
829 701 ASSERT3P(db, ==, NULL);
830 702 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
831 703 ASSERT(db == NULL || db->db_level > 0);
832 704 #endif
833 705 mutex_exit(&dn->dn_dbufs_mtx);
834 706 return;
835 707 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
836 708 /*
837 709 * If we are receiving, we expect there to be no dbufs in
838 710 * the range to be freed, because receive modifies each
839 711 * block at most once, and in offset order. If this is
840 712 * not the case, it can lead to performance problems,
841 713 * so note that we unexpectedly took the slow path.
842 714 */
843 715 atomic_inc_64(&zfs_free_range_recv_miss);
844 716 }
845 717
846 718 db = avl_find(&dn->dn_dbufs, &db_search, &where);
847 719 ASSERT3P(db, ==, NULL);
848 720 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
849 721
850 722 for (; db != NULL; db = db_next) {
851 723 db_next = AVL_NEXT(&dn->dn_dbufs, db);
852 724 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
853 725
854 726 if (db->db_level != 0 || db->db_blkid > end_blkid) {
855 727 break;
856 728 }
857 729 ASSERT3U(db->db_blkid, >=, start_blkid);
858 730
859 731 /* found a level 0 buffer in the range */
860 732 mutex_enter(&db->db_mtx);
861 733 if (dbuf_undirty(db, tx)) {
862 734 /* mutex has been dropped and dbuf destroyed */
863 735 continue;
864 736 }
865 737
866 738 if (db->db_state == DB_UNCACHED ||
867 739 db->db_state == DB_NOFILL ||
868 740 db->db_state == DB_EVICTING) {
869 741 ASSERT(db->db.db_data == NULL);
870 742 mutex_exit(&db->db_mtx);
871 743 continue;
872 744 }
873 745 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
874 746 /* will be handled in dbuf_read_done or dbuf_rele */
875 747 db->db_freed_in_flight = TRUE;
876 748 mutex_exit(&db->db_mtx);
877 749 continue;
878 750 }
879 751 if (refcount_count(&db->db_holds) == 0) {
880 752 ASSERT(db->db_buf);
881 753 dbuf_clear(db);
882 754 continue;
883 755 }
884 756 /* The dbuf is referenced */
885 757
886 758 if (db->db_last_dirty != NULL) {
887 759 dbuf_dirty_record_t *dr = db->db_last_dirty;
888 760
889 761 if (dr->dr_txg == txg) {
890 762 /*
891 763 * This buffer is "in-use", re-adjust the file
892 764 * size to reflect that this buffer may
893 765 * contain new data when we sync.
894 766 */
895 767 if (db->db_blkid != DMU_SPILL_BLKID &&
896 768 db->db_blkid > dn->dn_maxblkid)
897 769 dn->dn_maxblkid = db->db_blkid;
898 770 dbuf_unoverride(dr);
899 771 } else {
900 772 /*
901 773 * This dbuf is not dirty in the open context.
902 774 * Either uncache it (if its not referenced in
903 775 * the open context) or reset its contents to
904 776 * empty.
905 777 */
906 778 dbuf_fix_old_data(db, txg);
907 779 }
908 780 }
909 781 /* clear the contents if its cached */
910 782 if (db->db_state == DB_CACHED) {
911 783 ASSERT(db->db.db_data != NULL);
912 784 arc_release(db->db_buf, db);
913 785 bzero(db->db.db_data, db->db.db_size);
914 786 arc_buf_freeze(db->db_buf);
915 787 }
916 788
917 789 mutex_exit(&db->db_mtx);
918 790 }
919 791 mutex_exit(&dn->dn_dbufs_mtx);
920 792 }
921 793
922 794 static int
923 795 dbuf_block_freeable(dmu_buf_impl_t *db)
924 796 {
925 797 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
926 798 uint64_t birth_txg = 0;
927 799
928 800 /*
929 801 * We don't need any locking to protect db_blkptr:
930 802 * If it's syncing, then db_last_dirty will be set
931 803 * so we'll ignore db_blkptr.
932 804 *
933 805 * This logic ensures that only block births for
934 806 * filled blocks are considered.
935 807 */
936 808 ASSERT(MUTEX_HELD(&db->db_mtx));
937 809 if (db->db_last_dirty && (db->db_blkptr == NULL ||
938 810 !BP_IS_HOLE(db->db_blkptr))) {
939 811 birth_txg = db->db_last_dirty->dr_txg;
940 812 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
941 813 birth_txg = db->db_blkptr->blk_birth;
942 814 }
943 815
944 816 /*
945 817 * If this block don't exist or is in a snapshot, it can't be freed.
946 818 * Don't pass the bp to dsl_dataset_block_freeable() since we
947 819 * are holding the db_mtx lock and might deadlock if we are
948 820 * prefetching a dedup-ed block.
949 821 */
950 822 if (birth_txg != 0)
951 823 return (ds == NULL ||
952 824 dsl_dataset_block_freeable(ds, NULL, birth_txg));
953 825 else
954 826 return (B_FALSE);
955 827 }
956 828
/*
 * Resize the in-core buffer of a (non-bonus) dbuf to "size" bytes in the
 * context of transaction "tx": dirty the dbuf, allocate a new ARC buffer,
 * copy across the old contents (zero-filling any growth), retarget the
 * level-0 dirty record at the new buffer, and account the size delta
 * against the dnode.  Caller must hold dn_struct_rwlock as writer.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		/*
		 * The dirty record created above must point at the new
		 * buffer so the resized data is what gets synced out.
		 */
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}
1009 881
/*
 * Release this dbuf's data buffer from any shared ARC state via
 * arc_release().  May only be called from syncing context, and only
 * once the objset buffer (or this dataset) has itself been released.
 * NOTE(review): the name suggests this prepares the dbuf's bp to be
 * overwritten during sync — confirm against callers; the body itself
 * only asserts sync context and releases the buffer.
 */
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}
1022 894
/*
 * Mark this (held) dbuf dirty in transaction "tx" and return its dirty
 * record, creating and linking a new record if the dbuf is not already
 * dirty in this txg.  "usesc" is recorded in dr_usesc and propagated to
 * parent dbufs and the dnode (special-class usage of the dirty data may
 * change between calls, so it is refreshed even on the already-dirty
 * path).  Dirtying recurses up the indirect-block tree.  Caller must
 * NOT hold db_mtx; it is taken and dropped internally.
 */
dbuf_dirty_record_t *
dbuf_dirty_sc(dmu_buf_impl_t *db, dmu_tx_t *tx, boolean_t usesc)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context. Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty. They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too? The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 * The dirty-record list is sorted by descending txg; walk to the
	 * first record at or below this txg.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}

		/*
		 * Special class usage of dirty dbuf could be changed,
		 * update the dirty entry.
		 */
		dr->dr_usesc = usesc;
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context. Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx. This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block. Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		/* Indirect blocks track their dirty children in a list. */
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
		dr->dr_accounted = db->db.db_size;
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	dr->dr_usesc = usesc;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty. We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_free_ranges[txgoff] != NULL) {
			range_tree_clear(dn->dn_free_ranges[txgoff],
			    db->db_blkid, 1);
		}
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		/* Bonus/spill blocks hang directly off the dnode. */
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty_sc(dn, tx, usesc);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet. We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, usesc, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		/* Recursively dirty the parent indirect block. */
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty_sc(parent, tx, usesc);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/*
		 * Since we've dropped the mutex, it's possible that
		 * dbuf_undirty() might have changed this out from under us.
		 */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}

		/*
		 * Special class usage of dirty dbuf could be changed,
		 * update the dirty entry.
		 */
		dr->dr_usesc = usesc;
		mutex_exit(&db->db_mtx);
	} else {
		/* Top-level block: its bp lives in the dnode itself. */
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty_sc(dn, tx, usesc);
	DB_DNODE_EXIT(db);
	return (dr);
}
1316 1188
1317 1189 dbuf_dirty_record_t *
1318 1190 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1319 1191 {
1320 1192 spa_t *spa;
1321 1193
1322 1194 ASSERT(db->db_objset != NULL);
1323 1195 spa = db->db_objset->os_spa;
1324 1196
1325 1197 return (dbuf_dirty_sc(db, tx, spa->spa_usesc));
1326 1198 }
1327 1199
/*
 * Undirty a buffer in the transaction group referenced by the given
 * transaction.  Return whether this evicted the dbuf (if the last hold
 * was the dirty hold, the dbuf is destroyed and B_TRUE is returned —
 * in that case db_mtx has been dropped and db must not be touched).
 * Caller must hold db_mtx; only level-0, non-bonus dbufs are eligible.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/*
	 * Any space we accounted for in dp_dirty_* will be cleaned up by
	 * dsl_pool_sync(). This is relatively rare so the discrepancy
	 * is not a big deal.
	 */

	/* Unlink the record from the dbuf's dirty list. */
	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		/* Drop the private copy made by dbuf_fix_old_data(), if any. */
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}

	if (db->db_level != 0) {
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}

	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	/*
	 * Drop the hold that dbuf_dirty() took for this txg; if it was
	 * the last hold, tear the dbuf down entirely.
	 */
	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}
1420 1292
/*
 * Dirty this (held) dbuf in transaction "tx".  The B_TRUE is the
 * "usesc" flag forwarded to dbuf_dirty_sc() (special-class usage of
 * the dirty data; see dr_usesc).
 */
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_will_dirty_sc(db_fake, tx, B_TRUE);
}
1426 1298
1427 1299 void
1428 1300 dmu_buf_will_dirty_sc(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t usesc)
1429 1301 {
1430 1302 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1431 1303 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1432 1304
1433 1305 ASSERT(tx->tx_txg != 0);
1434 1306 ASSERT(!refcount_is_zero(&db->db_holds));
1435 1307
1436 1308 DB_DNODE_ENTER(db);
1437 1309 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1438 1310 rf |= DB_RF_HAVESTRUCT;
1439 1311 DB_DNODE_EXIT(db);
1440 1312 (void) dbuf_read(db, NULL, rf);
1441 1313 (void) dbuf_dirty_sc(db, tx, usesc);
1442 1314 }
1443 1315
1444 1316
/*
 * Dirty this dbuf for transaction "tx" without its contents being
 * supplied by the caller: put it in DB_NOFILL state, then run the
 * normal will-fill path.
 */
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}
1454 1326
/*
 * Prepare this (held, level-0, non-bonus) dbuf to be entirely
 * overwritten by the caller in transaction "tx": skip reading the old
 * contents (dbuf_noread) and mark the dbuf dirty.  The caller finishes
 * the fill via dmu_buf_fill_done() (see dbuf_fill_done()).
 */
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}
1471 1343
1472 1344 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1473 1345 /* ARGSUSED */
1474 1346 void
1475 1347 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1476 1348 {
1477 1349 mutex_enter(&db->db_mtx);
1478 1350 DBUF_VERIFY(db);
1479 1351
1480 1352 if (db->db_state == DB_FILL) {
1481 1353 if (db->db_level == 0 && db->db_freed_in_flight) {
1482 1354 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1483 1355 /* we were freed while filling */
1484 1356 /* XXX dbuf_undirty? */
1485 1357 bzero(db->db.db_data, db->db.db_size);
1486 1358 db->db_freed_in_flight = FALSE;
1487 1359 }
1488 1360 db->db_state = DB_CACHED;
1489 1361 cv_broadcast(&db->db_changed);
1490 1362 }
1491 1363 mutex_exit(&db->db_mtx);
1492 1364 }
1493 1365
/*
 * Write "data" into this dbuf's dirty record as an embedded block
 * pointer: the (compressed) payload is encoded directly in the bp, so
 * no block is allocated on disk.  The dbuf goes through the NOFILL
 * path to create the dirty record, whose leaf state is then marked
 * DR_OVERRIDDEN with the constructed embedded bp.
 */
void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
    bp_embedded_type_t etype, enum zio_compress comp,
    int uncompressed_size, int compressed_size, int byteorder,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	struct dirty_leaf *dl;
	dmu_object_type_t type;

	DB_DNODE_ENTER(db);
	type = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	/* Creates the dirty record for this txg that we fill in below. */
	dmu_buf_will_not_fill(dbuf, tx);

	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
	dl = &db->db_last_dirty->dt.dl;
	encode_embedded_bp_compressed(&dl->dr_overridden_by,
	    data, comp, uncompressed_size, compressed_size);
	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
	BP_SET_TYPE(&dl->dr_overridden_by, type);
	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);

	dl->dr_override_state = DR_OVERRIDDEN;
	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}
1525 1397
/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	/* Wait for any in-flight read or fill to settle first. */
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	/*
	 * If holders beyond our caller and the dirty records exist, we
	 * can't swap the buffer out from under them: copy the data into
	 * the existing buffer and release the caller's instead.
	 */
	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			/* Already dirty in this txg: retarget the record. */
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	/* Install the caller's buffer and dirty it in this transaction. */
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dmu_buf_fill_done(&db->db, tx);
}
1588 1460
/*
 * "Clear" the contents of this dbuf. This will mark the dbuf
 * EVICTING and clear *most* of its references. Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
 * in this case. For callers from the DMU we will usually see:
 * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 * DMU: dbuf_clear()->arc_clear_callback()
 * ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * This routine will dissociate the dbuf from the arc, by calling
 * arc_clear_callback(), but will not evict the data from the ARC.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	boolean_t dbuf_gone = B_FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			/* Bonus data is allocated directly, not via the ARC. */
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	/* Only detach from the dnode here if the caller holds dn_dbufs_mtx. */
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		avl_remove(&dn->dn_dbufs, db);
		atomic_dec_32(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_clear_callback(db->db_buf);

	/*
	 * If arc_clear_callback() destroyed the dbuf, the evict path
	 * (dbuf_do_evict) has already handled db_mtx; don't touch db.
	 */
	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}
1669 1541
/*
 * Locate the parent dbuf and block pointer that reference the block at
 * (level, blkid) of dnode "dn".  On success, *bpp points at the bp
 * (inside the parent's data, or in the dnode phys for top-level and
 * spill blocks) and *parentp holds a reference the caller must release
 * with dbuf_rele() — *parentp may be NULL for a top-level block when
 * the dnode has no dbuf.  Returns ENOENT if the block has no parent
 * yet (level beyond the tree, or blkid past dn_maxblkid).
 * Caller must hold dn_struct_rwlock.
 */
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	/* epbs: log2 of block pointers per indirect block */
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (SET_ERROR(ENOENT));
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}
1736 1608
/*
 * Allocate and initialize a new dbuf for block "blkid" at "level" of
 * dnode "dn", and insert it into the dnode's dn_dbufs AVL tree.  If a
 * racing thread inserted an equal dbuf first, the new one is freed and
 * the pre-existing dbuf is returned instead.  In both of those cases
 * the returned dbuf's db_mtx is held on return.  The bonus dbuf is a
 * special case: it is never entered into the tree and is returned
 * without its mutex held.  Caller must hold dn_struct_rwlock.
 */
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;
	avl_index_t where;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		/* Bonus size is whatever the blkptr array leaves room for. */
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed into the dnode's dbuf tree */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	mutex_enter(&dn->dn_dbufs_mtx);
	mutex_enter(&db->db_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = avl_find(&dn->dn_dbufs, db, &where))) {
		/* someone else inserted it first */
		mutex_exit(&db->db_mtx);
		kmem_cache_free(dbuf_cache, db);
		/* return the winner with its db_mtx held */
		mutex_enter(&odb->db_mtx);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	/* insert at the position avl_find() computed */
	avl_insert(&dn->dn_dbufs, db, where);
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	atomic_inc_32(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}
1821 1690
1822 1691 static int
1823 1692 dbuf_do_evict(void *private)
1824 1693 {
1825 1694 dmu_buf_impl_t *db = private;
1826 1695
1827 1696 if (!MUTEX_HELD(&db->db_mtx))
1828 1697 mutex_enter(&db->db_mtx);
1829 1698
1830 1699 ASSERT(refcount_is_zero(&db->db_holds));
1831 1700
1832 1701 if (db->db_state != DB_EVICTING) {
1833 1702 ASSERT(db->db_state == DB_CACHED);
1834 1703 DBUF_VERIFY(db);
1835 1704 db->db_buf = NULL;
1836 1705 dbuf_evict(db);
1837 1706 } else {
1838 1707 mutex_exit(&db->db_mtx);
1839 1708 dbuf_destroy(db);
1840 1709 }
1841 1710 return (0);
1842 1711 }
1843 1712
/*
 * Final teardown of a dbuf: detach it from its dnode's dbuf tree,
 * drop the dnode hold it carries, and return the structure to the
 * kmem cache.  The caller must guarantee there are no remaining holds.
 */
static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	/* The bonus dbuf is never linked into dn_dbufs; skip the unlink. */
	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			avl_remove(&dn->dn_dbufs, db);
			atomic_dec_32(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
1886 1753
1887 1754 void
1888 1755 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1889 1756 {
1890 1757 dmu_buf_impl_t *db = NULL;
1891 1758 blkptr_t *bp = NULL;
1892 1759
1893 1760 ASSERT(blkid != DMU_BONUS_BLKID);
1894 1761 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1895 1762
1896 1763 if (dnode_block_freed(dn, blkid))
1897 1764 return;
1898 1765
1899 1766 /* dbuf_find() returns with db_mtx held */
1900 1767 if (db = dbuf_find(dn, 0, blkid)) {
1901 1768 /*
1902 1769 * This dbuf is already in the cache. We assume that
1903 1770 * it is already CACHED, or else about to be either
1904 1771 * read or filled.
1905 1772 */
1906 1773 mutex_exit(&db->db_mtx);
1907 1774 return;
1908 1775 }
1909 1776
1910 1777 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1911 1778 if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
1912 1779 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1913 1780 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1914 1781 zbookmark_phys_t zb;
1915 1782
1916 1783 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1917 1784 dn->dn_object, 0, blkid);
1918 1785
1919 1786 (void) arc_read(NULL, dn->dn_objset->os_spa,
1920 1787 bp, NULL, NULL, prio,
1921 1788 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1922 1789 &aflags, &zb);
1923 1790 }
1924 1791 if (db)
1925 1792 dbuf_rele(db, NULL);
1926 1793 }
1927 1794 }
1928 1795
1929 1796 /*
1930 1797 * Returns with db_holds incremented, and db_mtx not held.
1931 1798 * Note: dn_struct_rwlock must be held.
1932 1799 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			/* Caller wants ENOENT for a hole, not a dbuf. */
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			/* The ARC dropped our data; clear and retry. */
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}
2020 1887
2021 1888 dmu_buf_impl_t *
2022 1889 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2023 1890 {
2024 1891 dmu_buf_impl_t *db;
2025 1892 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
2026 1893 return (err ? NULL : db);
2027 1894 }
2028 1895
2029 1896 dmu_buf_impl_t *
2030 1897 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2031 1898 {
2032 1899 dmu_buf_impl_t *db;
2033 1900 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
2034 1901 return (err ? NULL : db);
2035 1902 }
2036 1903
/*
 * Create the bonus dbuf for "dn" and record it in dn->dn_bonus.
 * The caller must hold dn_struct_rwlock as writer and must not
 * already have a bonus dbuf.
 */
void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}
2045 1912
2046 1913 int
2047 1914 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
2048 1915 {
2049 1916 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2050 1917 dnode_t *dn;
2051 1918
2052 1919 if (db->db_blkid != DMU_SPILL_BLKID)
2053 1920 return (SET_ERROR(ENOTSUP));
2054 1921 if (blksz == 0)
2055 1922 blksz = SPA_MINBLOCKSIZE;
2056 1923 if (blksz > SPA_MAXBLOCKSIZE)
2057 1924 blksz = SPA_MAXBLOCKSIZE;
2058 1925 else
2059 1926 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2060 1927
2061 1928 DB_DNODE_ENTER(db);
2062 1929 dn = DB_DNODE(db);
2063 1930 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2064 1931 dbuf_new_size(db, blksz, tx);
2065 1932 rw_exit(&dn->dn_struct_rwlock);
2066 1933 DB_DNODE_EXIT(db);
2067 1934
2068 1935 return (0);
2069 1936 }
2070 1937
/* Free the spill block of "dn" in transaction "tx". */
void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}
2076 1943
2077 1944 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2078 1945 void
2079 1946 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2080 1947 {
2081 1948 int64_t holds = refcount_add(&db->db_holds, tag);
2082 1949 ASSERT(holds > 1);
2083 1950 }
2084 1951
2085 1952 /*
2086 1953 * If you call dbuf_rele() you had better not be referencing the dnode handle
2087 1954 * unless you have some other direct or indirect hold on the dnode. (An indirect
2088 1955 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2089 1956 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2090 1957 * dnode's parent dbuf evicting its dnode handles.
2091 1958 */
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	/* dbuf_rele_and_unlock() expects db_mtx held and drops it. */
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}
2098 1965
/* Public (dmu_buf_t) wrapper around dbuf_rele(). */
void
dmu_buf_rele(dmu_buf_t *db, void *tag)
{
	dbuf_rele((dmu_buf_impl_t *)db, tag);
}
2104 1971
2105 1972 /*
2106 1973 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2107 1974 * db_dirtycnt and db_holds to be updated atomically.
2108 1975 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	/* The branches below decide the dbuf's fate after the last hold. */
	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);

			/*
			 * If the dnode moves here, we cannot cross this barrier
			 * until the move completes.
			 */
			DB_DNODE_ENTER(db);
			atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count);
			DB_DNODE_EXIT(db);
			/*
			 * The bonus buffer's dnode hold is no longer discounted
			 * in dnode_move(). The dnode cannot move until after
			 * the dnode_rele().
			 */
			dnode_rele(DB_DNODE(db), db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db));
			dbuf_evict(db);
		} else {
			VERIFY(!arc_buf_remove_ref(db->db_buf, db));

			/*
			 * A dbuf will be eligible for eviction if either the
			 * 'primarycache' property is set or a duplicate
			 * copy of this buffer is already cached in the arc.
			 *
			 * In the case of the 'primarycache' a buffer
			 * is considered for eviction if it matches the
			 * criteria set in the property.
			 *
			 * To decide if our buffer is considered a
			 * duplicate, we must call into the arc to determine
			 * if multiple buffers are referencing the same
			 * block on-disk. If so, then we simply evict
			 * ourselves.
			 */
			if (!DBUF_IS_CACHEABLE(db)) {
				if (db->db_blkptr != NULL &&
				    !BP_IS_HOLE(db->db_blkptr) &&
				    !BP_IS_EMBEDDED(db->db_blkptr)) {
					spa_t *spa =
					    dmu_objset_spa(db->db_objset);
					blkptr_t bp = *db->db_blkptr;
					dbuf_clear(db);
					arc_freed(spa, &bp);
				} else {
					dbuf_clear(db);
				}
			} else if (arc_buf_eviction_needed(db->db_buf)) {
				dbuf_clear(db);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}
2209 2076
#pragma weak dmu_buf_refcount = dbuf_refcount
/* Return the current number of holds on "db". */
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}
2216 2083
/*
 * Install user data and an eviction callback on "db_fake", but only
 * if no user pointer is currently set (delegates to
 * dmu_buf_update_user() with old_user_ptr == NULL).  Returns the
 * previously-set user pointer.
 */
void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}
2224 2091
/*
 * Like dmu_buf_set_user(), but additionally flag the dbuf for
 * immediate user eviction (consulted in dbuf_rele_and_unlock()).
 */
void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	/* NOTE(review): flag is set without db_mtx held — confirm safe. */
	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}
2235 2102
2236 2103 void *
2237 2104 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2238 2105 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2239 2106 {
2240 2107 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2241 2108 ASSERT(db->db_level == 0);
2242 2109
2243 2110 ASSERT((user_ptr == NULL) == (evict_func == NULL));
2244 2111
2245 2112 mutex_enter(&db->db_mtx);
2246 2113
2247 2114 if (db->db_user_ptr == old_user_ptr) {
2248 2115 db->db_user_ptr = user_ptr;
2249 2116 db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2250 2117 db->db_evict_func = evict_func;
2251 2118
2252 2119 dbuf_update_data(db);
2253 2120 } else {
2254 2121 old_user_ptr = db->db_user_ptr;
2255 2122 }
2256 2123
2257 2124 mutex_exit(&db->db_mtx);
2258 2125 return (old_user_ptr);
2259 2126 }
2260 2127
/*
 * Return the user pointer attached to "db_fake".
 * The caller must have a hold on the dbuf.
 */
void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}
2269 2136
2270 2137 boolean_t
2271 2138 dmu_buf_freeable(dmu_buf_t *dbuf)
2272 2139 {
2273 2140 boolean_t res = B_FALSE;
2274 2141 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2275 2142
2276 2143 if (db->db_blkptr)
2277 2144 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2278 2145 db->db_blkptr, db->db_blkptr->blk_birth);
2279 2146
2280 2147 return (res);
2281 2148 }
2282 2149
/* Return the dbuf's block pointer (NULL if none has been assigned). */
blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	return (dbi->db_blkptr);
}
2289 2156
/*
 * Bind db->db_blkptr to the location that backs "db", hooking up
 * db_parent along the way if necessary.  No-op if already bound.
 */
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		/* Spill blocks live in a dedicated slot in the dnode. */
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			/* NB: db_mtx is dropped while holding the parent. */
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}
2334 2201
/*
 * Sync an indirect-block dirty record: issue the write for this
 * block, then recursively sync the dirty children recorded under it.
 */
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
2380 2247
/*
 * Sync a level-0 dirty record: copy bonus data into the dnode, or
 * issue the block write, copying the buffer first if it is still
 * referenced from the open context.
 */
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied. But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode. It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		/* Unlink this dirty record from the dbuf's dirty list. */
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DNONE_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO. If this is an empty write it seems plausible
		 * that the IO could actually be completed before the nowait
		 * returns. We need to DB_DNODE_EXIT() first in case
		 * zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}
2515 2382
2516 2383 void
2517 2384 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2518 2385 {
2519 2386 dbuf_dirty_record_t *dr;
2520 2387
2521 2388 while (dr = list_head(list)) {
2522 2389 if (dr->dr_zio != NULL) {
2523 2390 /*
2524 2391 * If we find an already initialized zio then we
2525 2392 * are processing the meta-dnode, and we have finished.
2526 2393 * The dbufs for all dnodes are put back on the list
2527 2394 * during processing, so that we can zio_wait()
2528 2395 * these IOs after initiating all child IOs.
2529 2396 */
2530 2397 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2531 2398 DMU_META_DNODE_OBJECT);
2532 2399 break;
2533 2400 }
2534 2401 list_remove(list, dr);
2535 2402 if (dr->dr_dbuf->db_level > 0)
2536 2403 dbuf_sync_indirect(dr, tx);
2537 2404 else
2538 2405 dbuf_sync_leaf(dr, tx);
2539 2406 }
2540 2407 }
2541 2408
/*
 * zio "ready" callback: account for the space delta of the new block
 * pointer and compute its fill count (number of non-hole entries
 * beneath it).
 */
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT3P(db->db_blkptr, ==, bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Charge the dnode for the change in on-disk size. */
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (bp->blk_birth != 0) {
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
		    BP_IS_EMBEDDED(bp));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/* Fill = number of allocated dnodes in the block. */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		/* Fill = sum of the fill counts of all child blkptrs. */
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += BP_GET_FILL(ibp);
		}
	}
	DB_DNODE_EXIT(db);

	if (!BP_IS_EMBEDDED(bp))
		bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}
2619 2486
2620 2487 /*
2621 2488 * The SPA will call this callback several times for each zio - once
2622 2489 * for every physical child i/o (zio->io_phys_children times). This
2623 2490 * allows the DMU to monitor the progress of each logical i/o. For example,
2624 2491 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2625 2492 * block. There may be a long delay before all copies/fragments are completed,
2626 2493 * so this callback allows us to retire dirty space gradually, as the physical
2627 2494 * i/os complete.
2628 2495 */
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
	dmu_buf_impl_t *db = arg;
	objset_t *os = db->db_objset;
	dsl_pool_t *dp = dmu_objset_pool(os);
	dbuf_dirty_record_t *dr;
	int delta = 0;

	dr = db->db_data_pending;
	ASSERT3U(dr->dr_txg, ==, zio->io_txg);

	/*
	 * The callback will be called io_phys_children times. Retire one
	 * portion of our dirty space each time we are called. Any rounding
	 * error will be cleaned up by dsl_pool_sync()'s call to
	 * dsl_pool_undirty_space().
	 */
	delta = dr->dr_accounted / zio->io_phys_children;
	dsl_pool_undirty_space(dp, delta, zio->io_txg);
}
2651 2518
/*
 * zio "done" callback: perform block birth/kill accounting, retire
 * the dirty record, and release the hold taken when it was dirtied.
 */
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	blkptr_t *bp = db->db_blkptr;
	objset_t *os = db->db_objset;
	dmu_tx_t *tx = os->os_synctx;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/* Unlink the completed dirty record from the dbuf's dirty list. */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db));
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_blkid, <=,
			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}
2742 2609
2743 2610 static void
2744 2611 dbuf_write_nofill_ready(zio_t *zio)
2745 2612 {
2746 2613 dbuf_write_ready(zio, NULL, zio->io_private);
2747 2614 }
2748 2615
2749 2616 static void
2750 2617 dbuf_write_nofill_done(zio_t *zio)
2751 2618 {
2752 2619 dbuf_write_done(zio, NULL, zio->io_private);
2753 2620 }
2754 2621
2755 2622 static void
2756 2623 dbuf_write_override_ready(zio_t *zio)
2757 2624 {
2758 2625 dbuf_dirty_record_t *dr = zio->io_private;
2759 2626 dmu_buf_impl_t *db = dr->dr_dbuf;
2760 2627
2761 2628 dbuf_write_ready(zio, NULL, db);
2762 2629 }
2763 2630
2764 2631 static void
2765 2632 dbuf_write_override_done(zio_t *zio)
2766 2633 {
2767 2634 dbuf_dirty_record_t *dr = zio->io_private;
2768 2635 dmu_buf_impl_t *db = dr->dr_dbuf;
2769 2636 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2770 2637
2771 2638 mutex_enter(&db->db_mtx);
2772 2639 if (!BP_EQUAL(zio->io_bp, obp)) {
2773 2640 if (!BP_IS_HOLE(obp))
2774 2641 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2775 2642 arc_release(dr->dt.dl.dr_data, db);
2776 2643 }
2777 2644 mutex_exit(&db->db_mtx);
2778 2645
2779 2646 dbuf_write_done(zio, NULL, db);
2780 2647 }
2781 2648
/*
 * Issue I/O to commit a dirty buffer to disk.
 *
 * Builds and issues the child zio that writes out the dirty record
 * 'dr' for txg 'tx->tx_txg'.  'data' is the ARC buffer holding the
 * data to write (for level-0 buffers it may differ from db->db_buf,
 * e.g. for an override write).  The resulting zio is stored in
 * dr->dr_zio and parented either to the dirty parent indirect
 * block's zio or to the dnode's zio, so the parent's write cannot
 * complete before ours does.
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				/* Never written before: nothing to release. */
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	/* Identify this block to the I/O pipeline: objset/object/level/blkid. */
	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	/* Build the write-policy flags that dmu_write_policy() consumes. */
	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
	WP_SET_SPECIALCLASS(wp_flag, dr->dr_usesc);

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	if (db->db_level == 0 &&
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * The BP for this block has been provided by open context
		 * (by dmu_sync() or dmu_buf_write_embedded()).
		 */
		void *contents = (data != NULL) ? data->b_data : NULL;

		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, contents, db->db.db_size, &zp,
		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		/*
		 * Consume the override state under db_mtx, then tell the
		 * zio to use the already-written BP instead of allocating.
		 */
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		/* NOFILL buffers carry no data; issue a data-less write. */
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		/* Normal case: write the ARC buffer through arc_write(). */
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
		    dbuf_write_physdone, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}
↓ open down ↓ |
999 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX