Skip to content

Commit a359e6c

Browse files
committed
Be more careful with locking around db.db_data
Lock db_mtx in some places that access db->db_data. But don't lock it in free_children, even though it does access db->db_data, because that leads to a recurse-on-non-recursive panic. Lock db_rwlock in some places that access db->db.db_data's contents. Closes #16626 Sponsored by: ConnectWise Signed-off-by: Alan Somers <[email protected]>
1 parent 0a2163d commit a359e6c

File tree

4 files changed

+112
-16
lines changed

4 files changed

+112
-16
lines changed

module/zfs/dbuf.c

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1193,7 +1193,8 @@ dbuf_verify(dmu_buf_impl_t *db)
11931193
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
11941194
(db->db_buf == NULL || db->db_buf->b_data) &&
11951195
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
1196-
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
1196+
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg) &&
1197+
RW_LOCK_HELD(&db->db_rwlock)) {
11971198
/*
11981199
* If the blkptr isn't set but they have nonzero data,
11991200
* it had better be dirty, otherwise we'll lose that
@@ -1697,7 +1698,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
16971698
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
16981699
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
16991700
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
1701+
rw_enter(&db->db_rwlock, RW_READER);
17001702
memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
1703+
rw_exit(&db->db_rwlock);
17011704
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
17021705
dnode_t *dn = DB_DNODE(db);
17031706
int size = arc_buf_size(db->db_buf);
@@ -1727,7 +1730,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
17271730
} else {
17281731
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
17291732
}
1733+
rw_enter(&db->db_rwlock, RW_READER);
17301734
memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
1735+
rw_exit(&db->db_rwlock);
17311736
} else {
17321737
db->db_buf = NULL;
17331738
dbuf_clear_data(db);
@@ -2999,7 +3004,9 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
29993004
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
30003005
/* we were freed while filling */
30013006
/* XXX dbuf_undirty? */
3007+
rw_enter(&db->db_rwlock, RW_WRITER);
30023008
memset(db->db.db_data, 0, db->db.db_size);
3009+
rw_exit(&db->db_rwlock);
30033010
db->db_freed_in_flight = FALSE;
30043011
db->db_state = DB_CACHED;
30053012
DTRACE_SET_STATE(db,
@@ -3374,8 +3381,10 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
33743381
*parentp = NULL;
33753382
return (err);
33763383
}
3384+
mutex_enter(&(*parentp)->db_mtx);
33773385
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
33783386
(blkid & ((1ULL << epbs) - 1));
3387+
mutex_exit(&(*parentp)->db_mtx);
33793388
return (0);
33803389
} else {
33813390
/* the block is referenced from the dnode */
@@ -4560,10 +4569,12 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
45604569
return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
45614570
} else {
45624571
dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
4572+
ASSERT(MUTEX_HELD(&parent_db->db_mtx));
45634573
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
45644574
VERIFY3U(parent_db->db_level, ==, 1);
45654575
VERIFY3P(DB_DNODE(parent_db), ==, dn);
45664576
VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
4577+
ASSERT(RW_LOCK_HELD(&parent_db->db_rwlock));
45674578
blkptr_t *bp = parent_db->db.db_data;
45684579
return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
45694580
}
@@ -4574,12 +4585,22 @@ dbuf_lightweight_ready(zio_t *zio)
45744585
{
45754586
dbuf_dirty_record_t *dr = zio->io_private;
45764587
blkptr_t *bp = zio->io_bp;
4588+
dmu_buf_impl_t *parent_db = NULL;
45774589

45784590
if (zio->io_error != 0)
45794591
return;
45804592

45814593
dnode_t *dn = dr->dr_dnode;
45824594

4595+
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
4596+
if (dr->dr_parent == NULL) {
4597+
parent_db = dn->dn_dbuf;
4598+
} else {
4599+
parent_db = dr->dr_parent->dr_dbuf;
4600+
}
4601+
mutex_enter(&parent_db->db_mtx);
4602+
4603+
rw_enter(&parent_db->db_rwlock, RW_READER);
45834604
blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
45844605
spa_t *spa = dmu_objset_spa(dn->dn_objset);
45854606
int64_t delta = bp_get_dsize_sync(spa, bp) -
@@ -4599,16 +4620,13 @@ dbuf_lightweight_ready(zio_t *zio)
45994620
BP_SET_FILL(bp, fill);
46004621
}
46014622

4602-
dmu_buf_impl_t *parent_db;
4603-
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
4604-
if (dr->dr_parent == NULL) {
4605-
parent_db = dn->dn_dbuf;
4606-
} else {
4607-
parent_db = dr->dr_parent->dr_dbuf;
4623+
if (!rw_tryupgrade(&parent_db->db_rwlock)) {
4624+
rw_exit(&parent_db->db_rwlock);
4625+
rw_enter(&parent_db->db_rwlock, RW_WRITER);
46084626
}
4609-
rw_enter(&parent_db->db_rwlock, RW_WRITER);
46104627
*bp_orig = *bp;
46114628
rw_exit(&parent_db->db_rwlock);
4629+
mutex_exit(&parent_db->db_mtx);
46124630
}
46134631

46144632
static void
@@ -4640,6 +4658,7 @@ noinline static void
46404658
dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
46414659
{
46424660
dnode_t *dn = dr->dr_dnode;
4661+
dmu_buf_impl_t *parent_db = NULL;
46434662
zio_t *pio;
46444663
if (dn->dn_phys->dn_nlevels == 1) {
46454664
pio = dn->dn_zio;
@@ -4658,6 +4677,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
46584677
* See comment in dbuf_write(). This is so that zio->io_bp_orig
46594678
* will have the old BP in dbuf_lightweight_done().
46604679
*/
4680+
if (dr->dr_dnode->dn_phys->dn_nlevels != 1) {
4681+
parent_db = dr->dr_parent->dr_dbuf;
4682+
mutex_enter(&parent_db->db_mtx);
4683+
rw_enter(&parent_db->db_rwlock, RW_READER);
4684+
}
46614685
dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
46624686

46634687
dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
@@ -4667,6 +4691,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
46674691
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
46684692
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
46694693

4694+
if (parent_db) {
4695+
rw_exit(&parent_db->db_rwlock);
4696+
mutex_exit(&parent_db->db_mtx);
4697+
}
4698+
46704699
zio_nowait(dr->dr_zio);
46714700
}
46724701

@@ -4823,7 +4852,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
48234852
} else {
48244853
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
48254854
}
4855+
rw_enter(&db->db_rwlock, RW_READER);
48264856
memcpy((*datap)->b_data, db->db.db_data, psize);
4857+
rw_exit(&db->db_rwlock);
48274858
}
48284859
db->db_data_pending = dr;
48294860

@@ -4929,6 +4960,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49294960

49304961
if (dn->dn_type == DMU_OT_DNODE) {
49314962
i = 0;
4963+
rw_enter(&db->db_rwlock, RW_READER);
49324964
while (i < db->db.db_size) {
49334965
dnode_phys_t *dnp =
49344966
(void *)(((char *)db->db.db_data) + i);
@@ -4954,6 +4986,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49544986
DNODE_MIN_SIZE;
49554987
}
49564988
}
4989+
rw_exit(&db->db_rwlock);
49574990
} else {
49584991
if (BP_IS_HOLE(bp)) {
49594992
fill = 0;
@@ -4962,6 +4995,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49624995
}
49634996
}
49644997
} else {
4998+
rw_enter(&db->db_rwlock, RW_READER);
49654999
blkptr_t *ibp = db->db.db_data;
49665000
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
49675001
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
@@ -4971,6 +5005,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49715005
BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
49725006
fill += BP_GET_FILL(ibp);
49735007
}
5008+
rw_exit(&db->db_rwlock);
49745009
}
49755010
DB_DNODE_EXIT(db);
49765011

@@ -5005,6 +5040,8 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
50055040
DB_DNODE_EXIT(db);
50065041
ASSERT3U(epbs, <, 31);
50075042

5043+
mutex_enter(&db->db_mtx);
5044+
rw_enter(&db->db_rwlock, RW_READER);
50085045
/* Determine if all our children are holes */
50095046
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
50105047
if (!BP_IS_HOLE(bp))
@@ -5021,10 +5058,14 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
50215058
* anybody from reading the blocks we're about to
50225059
* zero out.
50235060
*/
5024-
rw_enter(&db->db_rwlock, RW_WRITER);
5061+
if (!rw_tryupgrade(&db->db_rwlock)) {
5062+
rw_exit(&db->db_rwlock);
5063+
rw_enter(&db->db_rwlock, RW_WRITER);
5064+
}
50255065
memset(db->db.db_data, 0, db->db.db_size);
5026-
rw_exit(&db->db_rwlock);
50275066
}
5067+
rw_exit(&db->db_rwlock);
5068+
mutex_exit(&db->db_mtx);
50285069
}
50295070

50305071
static void
@@ -5220,11 +5261,11 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
52205261
* avoid lock contention, only grab it when we are actually
52215262
* changing the BP.
52225263
*/
5223-
if (rw != NULL)
5264+
if (rw != NULL && !rw_tryupgrade(rw)) {
5265+
rw_exit(rw);
52245266
rw_enter(rw, RW_WRITER);
5267+
}
52255268
*bp = bp_copy;
5226-
if (rw != NULL)
5227-
rw_exit(rw);
52285269
}
52295270
}
52305271

@@ -5240,6 +5281,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
52405281
if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
52415282
return;
52425283

5284+
mutex_enter(&db->db_mtx);
5285+
rw_enter(&db->db_rwlock, RW_READER);
52435286
if (db->db_level > 0) {
52445287
blkptr_t *bp = db->db.db_data;
52455288
for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
@@ -5258,6 +5301,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
52585301
}
52595302
}
52605303
}
5304+
rw_exit(&db->db_rwlock);
5305+
mutex_exit(&db->db_mtx);
52615306
}
52625307

52635308

module/zfs/dmu_objset.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2190,6 +2190,7 @@ void
21902190
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
21912191
{
21922192
objset_t *os = dn->dn_objset;
2193+
krwlock_t *rw = NULL;
21932194
void *data = NULL;
21942195
dmu_buf_impl_t *db = NULL;
21952196
int flags = dn->dn_id_flags;
@@ -2234,8 +2235,12 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
22342235
FTAG, (dmu_buf_t **)&db);
22352236
ASSERT(error == 0);
22362237
mutex_enter(&db->db_mtx);
2237-
data = (before) ? db->db.db_data :
2238-
dmu_objset_userquota_find_data(db, tx);
2238+
if (before) {
2239+
rw = &db->db_rwlock;
2240+
data = db->db.db_data;
2241+
} else {
2242+
data = dmu_objset_userquota_find_data(db, tx);
2243+
}
22392244
have_spill = B_TRUE;
22402245
} else {
22412246
mutex_enter(&dn->dn_mtx);
@@ -2249,7 +2254,11 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
22492254
* type has changed and that type isn't an object type to track
22502255
*/
22512256
zfs_file_info_t zfi;
2257+
if (rw)
2258+
rw_enter(rw, RW_READER);
22522259
error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
2260+
if (rw)
2261+
rw_exit(rw);
22532262

22542263
if (before) {
22552264
ASSERT(data);

0 commit comments

Comments
 (0)