-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Be more careful with locking db.db_mtx #17418
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1193,7 +1193,8 @@ dbuf_verify(dmu_buf_impl_t *db) | |
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && | ||
(db->db_buf == NULL || db->db_buf->b_data) && | ||
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && | ||
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) { | ||
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg) && | ||
RW_LOCK_HELD(&db->db_rwlock)) { | ||
/* | ||
* If the blkptr isn't set but they have nonzero data, | ||
* it had better be dirty, otherwise we'll lose that | ||
|
@@ -1697,7 +1698,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) | |
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); | ||
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); | ||
arc_space_consume(bonuslen, ARC_SPACE_BONUS); | ||
rw_enter(&db->db_rwlock, RW_READER); | ||
memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen); | ||
rw_exit(&db->db_rwlock); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not about indirects or meta-dnode. |
||
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { | ||
dnode_t *dn = DB_DNODE(db); | ||
int size = arc_buf_size(db->db_buf); | ||
|
@@ -1727,7 +1730,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) | |
} else { | ||
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); | ||
} | ||
rw_enter(&db->db_rwlock, RW_READER); | ||
memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size); | ||
rw_exit(&db->db_rwlock); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, this is not about indirects or meta-dnode, since those are never "fixed". |
||
} else { | ||
db->db_buf = NULL; | ||
dbuf_clear_data(db); | ||
|
@@ -2999,7 +3004,9 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed) | |
ASSERT(db->db_blkid != DMU_BONUS_BLKID); | ||
/* we were freed while filling */ | ||
/* XXX dbuf_undirty? */ | ||
rw_enter(&db->db_rwlock, RW_WRITER); | ||
memset(db->db.db_data, 0, db->db.db_size); | ||
rw_exit(&db->db_rwlock); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, not indirect or meta-dnode. |
||
db->db_freed_in_flight = FALSE; | ||
db->db_state = DB_CACHED; | ||
DTRACE_SET_STATE(db, | ||
|
@@ -3374,8 +3381,10 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, | |
*parentp = NULL; | ||
return (err); | ||
} | ||
mutex_enter(&(*parentp)->db_mtx); | ||
*bpp = ((blkptr_t *)(*parentp)->db.db_data) + | ||
(blkid & ((1ULL << epbs) - 1)); | ||
mutex_exit(&(*parentp)->db_mtx); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indirect dbufs are never relocated. These pointers are constant. |
||
return (0); | ||
} else { | ||
/* the block is referenced from the dnode */ | ||
|
@@ -4560,10 +4569,12 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr) | |
return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]); | ||
} else { | ||
dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf; | ||
ASSERT(MUTEX_HELD(&parent_db->db_mtx)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure why do we need locked indirect's db_mtx here. Indirects don't relocate. |
||
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; | ||
VERIFY3U(parent_db->db_level, ==, 1); | ||
VERIFY3P(DB_DNODE(parent_db), ==, dn); | ||
VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid); | ||
ASSERT(RW_LOCK_HELD(&parent_db->db_rwlock)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We just return pointer here. Since indirect dbufs do nto relocate, we should not enforce the lock being held until the actual dereference point. |
||
blkptr_t *bp = parent_db->db.db_data; | ||
return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]); | ||
} | ||
|
@@ -4574,12 +4585,22 @@ dbuf_lightweight_ready(zio_t *zio) | |
{ | ||
dbuf_dirty_record_t *dr = zio->io_private; | ||
blkptr_t *bp = zio->io_bp; | ||
dmu_buf_impl_t *parent_db = NULL; | ||
|
||
if (zio->io_error != 0) | ||
return; | ||
|
||
dnode_t *dn = dr->dr_dnode; | ||
|
||
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1); | ||
if (dr->dr_parent == NULL) { | ||
parent_db = dn->dn_dbuf; | ||
} else { | ||
parent_db = dr->dr_parent->dr_dbuf; | ||
} | ||
mutex_enter(&parent_db->db_mtx); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct me if I am wrong, but why do we need this lock in context of above comments? |
||
|
||
rw_enter(&parent_db->db_rwlock, RW_READER); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, this we might need for purposes of |
||
blkptr_t *bp_orig = dbuf_lightweight_bp(dr); | ||
spa_t *spa = dmu_objset_spa(dn->dn_objset); | ||
int64_t delta = bp_get_dsize_sync(spa, bp) - | ||
|
@@ -4599,16 +4620,13 @@ dbuf_lightweight_ready(zio_t *zio) | |
BP_SET_FILL(bp, fill); | ||
} | ||
|
||
dmu_buf_impl_t *parent_db; | ||
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1); | ||
if (dr->dr_parent == NULL) { | ||
parent_db = dn->dn_dbuf; | ||
} else { | ||
parent_db = dr->dr_parent->dr_dbuf; | ||
if (!rw_tryupgrade(&parent_db->db_rwlock)) { | ||
rw_exit(&parent_db->db_rwlock); | ||
rw_enter(&parent_db->db_rwlock, RW_WRITER); | ||
} | ||
rw_enter(&parent_db->db_rwlock, RW_WRITER); | ||
*bp_orig = *bp; | ||
rw_exit(&parent_db->db_rwlock); | ||
mutex_exit(&parent_db->db_mtx); | ||
} | ||
|
||
static void | ||
|
@@ -4640,6 +4658,7 @@ noinline static void | |
dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) | ||
{ | ||
dnode_t *dn = dr->dr_dnode; | ||
dmu_buf_impl_t *parent_db = NULL; | ||
zio_t *pio; | ||
if (dn->dn_phys->dn_nlevels == 1) { | ||
pio = dn->dn_zio; | ||
|
@@ -4658,6 +4677,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) | |
* See comment in dbuf_write(). This is so that zio->io_bp_orig | ||
* will have the old BP in dbuf_lightweight_done(). | ||
*/ | ||
if (dr->dr_dnode->dn_phys->dn_nlevels != 1) { | ||
parent_db = dr->dr_parent->dr_dbuf; | ||
mutex_enter(&parent_db->db_mtx); | ||
rw_enter(&parent_db->db_rwlock, RW_READER); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again I don't think we need |
||
} | ||
dr->dr_bp_copy = *dbuf_lightweight_bp(dr); | ||
|
||
dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset), | ||
|
@@ -4667,6 +4691,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) | |
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE, | ||
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); | ||
|
||
if (parent_db) { | ||
rw_exit(&parent_db->db_rwlock); | ||
mutex_exit(&parent_db->db_mtx); | ||
} | ||
|
||
zio_nowait(dr->dr_zio); | ||
} | ||
|
||
|
@@ -4823,7 +4852,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) | |
} else { | ||
*datap = arc_alloc_buf(os->os_spa, db, type, psize); | ||
} | ||
rw_enter(&db->db_rwlock, RW_READER); | ||
memcpy((*datap)->b_data, db->db.db_data, psize); | ||
rw_exit(&db->db_rwlock); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not a meta-dnode or indirect. |
||
} | ||
db->db_data_pending = dr; | ||
|
||
|
@@ -4929,6 +4960,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) | |
|
||
if (dn->dn_type == DMU_OT_DNODE) { | ||
i = 0; | ||
rw_enter(&db->db_rwlock, RW_READER); | ||
while (i < db->db.db_size) { | ||
dnode_phys_t *dnp = | ||
(void *)(((char *)db->db.db_data) + i); | ||
|
@@ -4954,6 +4986,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) | |
DNODE_MIN_SIZE; | ||
} | ||
} | ||
rw_exit(&db->db_rwlock); | ||
} else { | ||
if (BP_IS_HOLE(bp)) { | ||
fill = 0; | ||
|
@@ -4962,6 +4995,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) | |
} | ||
} | ||
} else { | ||
rw_enter(&db->db_rwlock, RW_READER); | ||
blkptr_t *ibp = db->db.db_data; | ||
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); | ||
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { | ||
|
@@ -4971,6 +5005,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) | |
BLK_CONFIG_SKIP, BLK_VERIFY_HALT); | ||
fill += BP_GET_FILL(ibp); | ||
} | ||
rw_exit(&db->db_rwlock); | ||
} | ||
DB_DNODE_EXIT(db); | ||
|
||
|
@@ -5005,6 +5040,8 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) | |
DB_DNODE_EXIT(db); | ||
ASSERT3U(epbs, <, 31); | ||
|
||
mutex_enter(&db->db_mtx); | ||
rw_enter(&db->db_rwlock, RW_READER); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we need |
||
/* Determine if all our children are holes */ | ||
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) { | ||
if (!BP_IS_HOLE(bp)) | ||
|
@@ -5021,10 +5058,14 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) | |
* anybody from reading the blocks we're about to | ||
* zero out. | ||
*/ | ||
rw_enter(&db->db_rwlock, RW_WRITER); | ||
if (!rw_tryupgrade(&db->db_rwlock)) { | ||
rw_exit(&db->db_rwlock); | ||
rw_enter(&db->db_rwlock, RW_WRITER); | ||
} | ||
memset(db->db.db_data, 0, db->db.db_size); | ||
rw_exit(&db->db_rwlock); | ||
} | ||
rw_exit(&db->db_rwlock); | ||
mutex_exit(&db->db_mtx); | ||
} | ||
|
||
static void | ||
|
@@ -5220,11 +5261,11 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) | |
* avoid lock contention, only grab it when we are actually | ||
* changing the BP. | ||
*/ | ||
if (rw != NULL) | ||
if (rw != NULL && !rw_tryupgrade(rw)) { | ||
rw_exit(rw); | ||
rw_enter(rw, RW_WRITER); | ||
} | ||
*bp = bp_copy; | ||
if (rw != NULL) | ||
rw_exit(rw); | ||
} | ||
} | ||
|
||
|
@@ -5240,6 +5281,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) | |
if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) | ||
return; | ||
|
||
mutex_enter(&db->db_mtx); | ||
rw_enter(&db->db_rwlock, RW_READER); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is called only for meta-dnode or indirects, so we should not need the |
||
if (db->db_level > 0) { | ||
blkptr_t *bp = db->db.db_data; | ||
for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { | ||
|
@@ -5258,6 +5301,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) | |
} | ||
} | ||
} | ||
rw_exit(&db->db_rwlock); | ||
mutex_exit(&db->db_mtx); | ||
} | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2190,6 +2190,7 @@ void | |
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) | ||
{ | ||
objset_t *os = dn->dn_objset; | ||
krwlock_t *rw = NULL; | ||
void *data = NULL; | ||
dmu_buf_impl_t *db = NULL; | ||
int flags = dn->dn_id_flags; | ||
|
@@ -2234,8 +2235,12 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) | |
FTAG, (dmu_buf_t **)&db); | ||
ASSERT(error == 0); | ||
mutex_enter(&db->db_mtx); | ||
data = (before) ? db->db.db_data : | ||
dmu_objset_userquota_find_data(db, tx); | ||
if (before) { | ||
rw = &db->db_rwlock; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a bonus not meta-dnode or indirect. |
||
data = db->db.db_data; | ||
} else { | ||
data = dmu_objset_userquota_find_data(db, tx); | ||
} | ||
have_spill = B_TRUE; | ||
} else { | ||
mutex_enter(&dn->dn_mtx); | ||
|
@@ -2249,7 +2254,11 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) | |
* type has changed and that type isn't an object type to track | ||
*/ | ||
zfs_file_info_t zfi; | ||
if (rw) | ||
rw_enter(rw, RW_READER); | ||
error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi); | ||
if (rw) | ||
rw_exit(rw); | ||
|
||
if (before) { | ||
ASSERT(data); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't believe this condition is needed (or even correct). If the
db_dirtycnt
below is zero (and it should be protected bydb_mtx
), then the buffer must be empty.