diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index f1b5a17f337e..0a9fe361df7f 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1193,7 +1193,8 @@ dbuf_verify(dmu_buf_impl_t *db)
 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
 	    (db->db_buf == NULL || db->db_buf->b_data) &&
 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
-	    db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
+	    db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg) &&
+	    RW_LOCK_HELD(&db->db_rwlock)) {
 		/*
 		 * If the blkptr isn't set but they have nonzero data,
 		 * it had better be dirty, otherwise we'll lose that
@@ -1697,7 +1698,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 		int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
 		dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
 		arc_space_consume(bonuslen, ARC_SPACE_BONUS);
+		rw_enter(&db->db_rwlock, RW_READER);
 		memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
+		rw_exit(&db->db_rwlock);
 	} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		dnode_t *dn = DB_DNODE(db);
 		int size = arc_buf_size(db->db_buf);
@@ -1727,7 +1730,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 		} else {
 			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
 		}
+		rw_enter(&db->db_rwlock, RW_READER);
 		memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
+		rw_exit(&db->db_rwlock);
 	} else {
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
@@ -2999,7 +3004,9 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		/* we were freed while filling */
 		/* XXX dbuf_undirty? */
+		rw_enter(&db->db_rwlock, RW_WRITER);
 		memset(db->db.db_data, 0, db->db.db_size);
+		rw_exit(&db->db_rwlock);
 		db->db_freed_in_flight = FALSE;
 		db->db_state = DB_CACHED;
 		DTRACE_SET_STATE(db,
@@ -3374,8 +3381,10 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 			*parentp = NULL;
 			return (err);
 		}
+		mutex_enter(&(*parentp)->db_mtx);
 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
 		    (blkid & ((1ULL << epbs) - 1));
+		mutex_exit(&(*parentp)->db_mtx);
 		return (0);
 	} else {
 		/* the block is referenced from the dnode */
@@ -4560,10 +4569,12 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
 		return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
 	} else {
 		dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
+		ASSERT(MUTEX_HELD(&parent_db->db_mtx));
 		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 		VERIFY3U(parent_db->db_level, ==, 1);
 		VERIFY3P(DB_DNODE(parent_db), ==, dn);
 		VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
+		ASSERT(RW_LOCK_HELD(&parent_db->db_rwlock));
 		blkptr_t *bp = parent_db->db.db_data;
 		return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
 	}
 }
@@ -4574,12 +4585,22 @@ dbuf_lightweight_ready(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
+	dmu_buf_impl_t *parent_db = NULL;
 
 	if (zio->io_error != 0)
 		return;
 
 	dnode_t *dn = dr->dr_dnode;
+	EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
+	if (dr->dr_parent == NULL) {
+		parent_db = dn->dn_dbuf;
+	} else {
+		parent_db = dr->dr_parent->dr_dbuf;
+	}
+	mutex_enter(&parent_db->db_mtx);
+
+	rw_enter(&parent_db->db_rwlock, RW_READER);
 	blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
 	int64_t delta = bp_get_dsize_sync(spa, bp) -
@@ -4599,16 +4620,13 @@ dbuf_lightweight_ready(zio_t *zio)
 		BP_SET_FILL(bp, fill);
 	}
 
-	dmu_buf_impl_t *parent_db;
-	EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
-	if (dr->dr_parent == NULL) {
-		parent_db = dn->dn_dbuf;
-	} else {
-		parent_db = dr->dr_parent->dr_dbuf;
+	if (!rw_tryupgrade(&parent_db->db_rwlock)) {
+		rw_exit(&parent_db->db_rwlock);
+		rw_enter(&parent_db->db_rwlock, RW_WRITER);
 	}
-	rw_enter(&parent_db->db_rwlock, RW_WRITER);
 	*bp_orig = *bp;
 	rw_exit(&parent_db->db_rwlock);
+	mutex_exit(&parent_db->db_mtx);
 }
 
 static void
@@ -4640,6 +4658,7 @@ noinline static void
 dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dnode_t *dn = dr->dr_dnode;
+	dmu_buf_impl_t *parent_db = NULL;
 	zio_t *pio;
 	if (dn->dn_phys->dn_nlevels == 1) {
 		pio = dn->dn_zio;
@@ -4658,6 +4677,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	 * See comment in dbuf_write(). This is so that zio->io_bp_orig
 	 * will have the old BP in dbuf_lightweight_done().
 	 */
+	if (dr->dr_dnode->dn_phys->dn_nlevels != 1) {
+		parent_db = dr->dr_parent->dr_dbuf;
+		mutex_enter(&parent_db->db_mtx);
+		rw_enter(&parent_db->db_rwlock, RW_READER);
+	}
 	dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
 
 	dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
@@ -4667,6 +4691,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	    dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
 	    ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
 
+	if (parent_db) {
+		rw_exit(&parent_db->db_rwlock);
+		mutex_exit(&parent_db->db_mtx);
+	}
+
 	zio_nowait(dr->dr_zio);
 }
 
@@ -4823,7 +4852,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 		} else {
 			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
 		}
+		rw_enter(&db->db_rwlock, RW_READER);
 		memcpy((*datap)->b_data, db->db.db_data, psize);
+		rw_exit(&db->db_rwlock);
 	}
 	db->db_data_pending = dr;
 
@@ -4929,6 +4960,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			i = 0;
+			rw_enter(&db->db_rwlock, RW_READER);
 			while (i < db->db.db_size) {
 				dnode_phys_t *dnp =
 				    (void *)(((char *)db->db.db_data) + i);
@@ -4954,6 +4986,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 					    DNODE_MIN_SIZE;
 				}
 			}
+			rw_exit(&db->db_rwlock);
 		} else {
 			if (BP_IS_HOLE(bp)) {
 				fill = 0;
@@ -4962,6 +4995,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 			}
 		}
 	} else {
+		rw_enter(&db->db_rwlock, RW_READER);
 		blkptr_t *ibp = db->db.db_data;
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
@@ -4971,6 +5005,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 			    BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
 			fill += BP_GET_FILL(ibp);
 		}
+		rw_exit(&db->db_rwlock);
 	}
 
 	DB_DNODE_EXIT(db);
@@ -5005,6 +5040,8 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 	DB_DNODE_EXIT(db);
 	ASSERT3U(epbs, <, 31);
 
+	mutex_enter(&db->db_mtx);
+	rw_enter(&db->db_rwlock, RW_READER);
 	/* Determine if all our children are holes */
 	for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
 		if (!BP_IS_HOLE(bp))
@@ -5021,10 +5058,14 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 		 * anybody from reading the blocks we're about to
 		 * zero out.
 		 */
-		rw_enter(&db->db_rwlock, RW_WRITER);
+		if (!rw_tryupgrade(&db->db_rwlock)) {
+			rw_exit(&db->db_rwlock);
+			rw_enter(&db->db_rwlock, RW_WRITER);
+		}
 		memset(db->db.db_data, 0, db->db.db_size);
-		rw_exit(&db->db_rwlock);
 	}
+	rw_exit(&db->db_rwlock);
+	mutex_exit(&db->db_mtx);
 }
 
 static void
@@ -5220,11 +5261,11 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 		 * avoid lock contention, only grab it when we are actually
 		 * changing the BP.
 		 */
-		if (rw != NULL)
+		if (rw != NULL && !rw_tryupgrade(rw)) {
+			rw_exit(rw);
 			rw_enter(rw, RW_WRITER);
+		}
 		*bp = bp_copy;
-		if (rw != NULL)
-			rw_exit(rw);
 	}
 }
 
@@ -5240,6 +5281,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
 		return;
 
+	mutex_enter(&db->db_mtx);
+	rw_enter(&db->db_rwlock, RW_READER);
 	if (db->db_level > 0) {
 		blkptr_t *bp = db->db.db_data;
 		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
@@ -5258,6 +5301,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 			}
 		}
 	}
+	rw_exit(&db->db_rwlock);
+	mutex_exit(&db->db_mtx);
 }
 
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index b3f792e4ae6b..5acb4787a4da 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -2190,6 +2190,7 @@ void
 dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
+	krwlock_t *rw = NULL;
 	void *data = NULL;
 	dmu_buf_impl_t *db = NULL;
 	int flags = dn->dn_id_flags;
@@ -2234,8 +2235,12 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 		    FTAG, (dmu_buf_t **)&db);
 		ASSERT(error == 0);
 		mutex_enter(&db->db_mtx);
-		data = (before) ? db->db.db_data :
-		    dmu_objset_userquota_find_data(db, tx);
+		if (before) {
+			rw = &db->db_rwlock;
+			data = db->db.db_data;
+		} else {
+			data = dmu_objset_userquota_find_data(db, tx);
+		}
 		have_spill = B_TRUE;
 	} else {
 		mutex_enter(&dn->dn_mtx);
@@ -2249,7 +2254,11 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 	 * type has changed and that type isn't an object type to track
 	 */
 	zfs_file_info_t zfi;
+	if (rw)
+		rw_enter(rw, RW_READER);
 	error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
+	if (rw)
+		rw_exit(rw);
 
 	if (before) {
 		ASSERT(data);
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 904a039edf95..5df9e7dff533 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -436,11 +436,15 @@ dnode_verify(dnode_t *dn)
 		if (dn->dn_phys->dn_type != DMU_OT_NONE)
 			ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
 		ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
+#ifdef DEBUG
 		if (dn->dn_dbuf != NULL) {
+			mutex_enter(&dn->dn_dbuf->db_mtx);
 			ASSERT3P(dn->dn_phys, ==,
 			    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
 			    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+			mutex_exit(&dn->dn_dbuf->db_mtx);
 		}
+#endif
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 	}
@@ -1521,7 +1525,6 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 	epb = db->db.db_size >> DNODE_SHIFT;
 
 	idx = object & (epb - 1);
-	dn_block = (dnode_phys_t *)db->db.db_data;
 
 	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
 	dnc = dmu_buf_get_user(&db->db);
@@ -1535,7 +1538,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		dnc->dnc_count = epb;
 		dnh = &dnc->dnc_children[0];
 
+		mutex_enter(&db->db_mtx);
+		dn_block = (dnode_phys_t *)db->db.db_data;
+
 		/* Initialize dnode slot status from dnode_phys_t */
+		rw_enter(&db->db_rwlock, RW_READER);
 		for (int i = 0; i < epb; i++) {
 			zrl_init(&dnh[i].dnh_zrlock);
 
@@ -1556,6 +1563,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 				skip = 0;
 			}
 		}
+		rw_exit(&db->db_rwlock);
+		mutex_exit(&db->db_mtx);
 
 		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
 		    dnode_buf_evict_async, NULL);
@@ -1572,6 +1581,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 	}
 	ASSERT(dnc->dnc_count == epb);
 
+	mutex_enter(&db->db_mtx);
+	dn_block = (dnode_phys_t *)db->db.db_data;
 	if (flag & DNODE_MUST_BE_ALLOCATED) {
 		slots = 1;
 
@@ -1584,11 +1595,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
 			DNODE_STAT_BUMP(dnode_hold_alloc_interior);
 			dnode_slots_rele(dnc, idx, slots);
+			mutex_exit(&db->db_mtx);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(EEXIST));
 		} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
 			DNODE_STAT_BUMP(dnode_hold_alloc_misses);
 			dnode_slots_rele(dnc, idx, slots);
+			mutex_exit(&db->db_mtx);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(ENOENT));
 		} else {
@@ -1607,8 +1620,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 			DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
 			dn = dnh->dnh_dnode;
 		} else {
+			rw_enter(&db->db_rwlock, RW_READER);
 			dn = dnode_create(os, dn_block + idx, db,
 			    object, dnh);
+			rw_exit(&db->db_rwlock);
 			dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
 		}
 
@@ -1619,6 +1634,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 			DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
 			mutex_exit(&dn->dn_mtx);
 			dnode_slots_rele(dnc, idx, slots);
+			mutex_exit(&db->db_mtx);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(ENOENT));
 		}
@@ -1627,6 +1643,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		if (flag & DNODE_DRY_RUN) {
 			mutex_exit(&dn->dn_mtx);
 			dnode_slots_rele(dnc, idx, slots);
+			mutex_exit(&db->db_mtx);
 			dbuf_rele(db, FTAG);
 			return (0);
 		}
@@ -1636,6 +1653,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 
 		if (idx + slots - 1 >= DNODES_PER_BLOCK) {
 			DNODE_STAT_BUMP(dnode_hold_free_overflow);
+			mutex_exit(&db->db_mtx);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(ENOSPC));
 		}
@@ -1645,6 +1663,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		if (!dnode_check_slots_free(dnc, idx, slots)) {
 			DNODE_STAT_BUMP(dnode_hold_free_misses);
 			dnode_slots_rele(dnc, idx, slots);
+			mutex_exit(&db->db_mtx);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(ENOSPC));
 		}
@@ -1658,6 +1677,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		if (!dnode_check_slots_free(dnc, idx, slots)) {
 			DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
 			dnode_slots_rele(dnc, idx, slots);
+			mutex_exit(&db->db_mtx);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(ENOSPC));
 		}
@@ -1680,8 +1700,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
 			dn = dnh->dnh_dnode;
 		} else {
+			rw_enter(&db->db_rwlock, RW_READER);
 			dn = dnode_create(os, dn_block + idx, db,
 			    object, dnh);
+			rw_exit(&db->db_rwlock);
 			dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
 		}
 
@@ -1690,6 +1712,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 			DNODE_STAT_BUMP(dnode_hold_free_refcount);
 			mutex_exit(&dn->dn_mtx);
 			dnode_slots_rele(dnc, idx, slots);
+			mutex_exit(&db->db_mtx);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(EEXIST));
 		}
@@ -1698,6 +1721,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		if (flag & DNODE_DRY_RUN) {
 			mutex_exit(&dn->dn_mtx);
 			dnode_slots_rele(dnc, idx, slots);
+			mutex_exit(&db->db_mtx);
 			dbuf_rele(db, FTAG);
 			return (0);
 		}
@@ -1705,9 +1729,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
 		DNODE_STAT_BUMP(dnode_hold_free_hits);
 	} else {
+		mutex_exit(&db->db_mtx);
 		dbuf_rele(db, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
+	mutex_exit(&db->db_mtx);
 
 	ASSERT0(dn->dn_free_txg);
 
@@ -2588,6 +2614,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 			dbuf_rele(db, FTAG);
 			return (error);
 		}
+		mutex_enter(&db->db_mtx);
 		data = db->db.db_data;
 		rw_enter(&db->db_rwlock, RW_READER);
 	}
@@ -2667,6 +2694,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 
 	if (db != NULL) {
 		rw_exit(&db->db_rwlock);
+		mutex_exit(&db->db_mtx);
 		dbuf_rele(db, FTAG);
 	} else {
 		if (dn->dn_dbuf != NULL)
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 4067f221f1bf..21059b64270b 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -79,6 +79,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
 	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
 	if (dn->dn_dbuf != NULL)
 		rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
+	mutex_enter(&db->db_mtx);
 	rw_enter(&db->db_rwlock, RW_WRITER);
 	ASSERT(db->db.db_data);
 	ASSERT(arc_released(db->db_buf));
@@ -123,6 +124,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
 	memset(dn->dn_phys->dn_blkptr, 0,
 	    sizeof (blkptr_t) * nblkptr);
 	rw_exit(&db->db_rwlock);
+	mutex_exit(&db->db_mtx);
 	if (dn->dn_dbuf != NULL)
 		rw_exit(&dn->dn_dbuf->db_rwlock);
 
@@ -233,6 +235,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 		 * future txg.
 		 */
 		mutex_enter(&child->db_mtx);
+		rw_enter(&child->db_rwlock, RW_READER);
 		buf = child->db.db_data;
 		if (buf != NULL && child->db_state != DB_FILL &&
 		    list_is_empty(&child->db_dirty_records)) {
@@ -247,6 +250,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 				}
 			}
 		}
+		rw_exit(&child->db_rwlock);
 		mutex_exit(&child->db_mtx);
 
 		dbuf_rele(child, FTAG);
@@ -310,6 +314,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 	dmu_buf_unlock_parent(db, dblt, FTAG);
 
 	dbuf_release_bp(db);
+	/*
+	 * XXX db_mtx isn't held, but should be. But locking it here causes a
+	 * recurse-on-non-recursive mutex panic many levels downstack:
+	 * free_verify->dbuf_hold_impl->dbuf_findbp->dbuf_hold_impl->dbuf_find
+	 */
+	/* mutex_enter(&db->db_mtx); */
 	bp = db->db.db_data;
 
 	DB_DNODE_ENTER(db);
@@ -338,6 +348,10 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 		rw_exit(&db->db_rwlock);
 	} else {
 		for (uint64_t id = start; id <= end; id++, bp++) {
+			/*
+			 * XXX should really have db_rwlock here. But we can't
+			 * hold it when we recurse into free_children.
+			 */
 			if (BP_IS_HOLE(bp))
 				continue;
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);