Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 58 additions & 13 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1193,7 +1193,8 @@ dbuf_verify(dmu_buf_impl_t *db)
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
(db->db_buf == NULL || db->db_buf->b_data) &&
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg) &&
RW_LOCK_HELD(&db->db_rwlock)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't believe this condition is needed (or even correct). If the db_dirtycnt below is zero (and it should be protected by db_mtx), then the buffer must be empty.

/*
* If the blkptr isn't set but they have nonzero data,
* it had better be dirty, otherwise we'll lose that
Expand Down Expand Up @@ -1697,7 +1698,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
rw_enter(&db->db_rwlock, RW_READER);
memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
rw_exit(&db->db_rwlock);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not about indirects or meta-dnode. db_rwlock didn't promise to protect every dbuf.

} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
dnode_t *dn = DB_DNODE(db);
int size = arc_buf_size(db->db_buf);
Expand Down Expand Up @@ -1727,7 +1730,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else {
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
}
rw_enter(&db->db_rwlock, RW_READER);
memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
rw_exit(&db->db_rwlock);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, this is not about indirects or meta-dnode, since those are never "fixed". db_rwlock didn't promise to protect every dbuf.

} else {
db->db_buf = NULL;
dbuf_clear_data(db);
Expand Down Expand Up @@ -2999,7 +3004,9 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
rw_enter(&db->db_rwlock, RW_WRITER);
memset(db->db.db_data, 0, db->db.db_size);
rw_exit(&db->db_rwlock);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, not indirect or meta-dnode.

db->db_freed_in_flight = FALSE;
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db,
Expand Down Expand Up @@ -3374,8 +3381,10 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
*parentp = NULL;
return (err);
}
mutex_enter(&(*parentp)->db_mtx);
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
(blkid & ((1ULL << epbs) - 1));
mutex_exit(&(*parentp)->db_mtx);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indirect dbufs are never relocated. These pointers are constant.

return (0);
} else {
/* the block is referenced from the dnode */
Expand Down Expand Up @@ -4560,10 +4569,12 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
} else {
dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
ASSERT(MUTEX_HELD(&parent_db->db_mtx));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure why we need the indirect's db_mtx locked here. Indirects don't relocate.

int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
VERIFY3U(parent_db->db_level, ==, 1);
VERIFY3P(DB_DNODE(parent_db), ==, dn);
VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
ASSERT(RW_LOCK_HELD(&parent_db->db_rwlock));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We just return a pointer here. Since indirect dbufs do not relocate, we should not enforce the lock being held until the actual dereference point.

blkptr_t *bp = parent_db->db.db_data;
return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
}
Expand All @@ -4574,12 +4585,22 @@ dbuf_lightweight_ready(zio_t *zio)
{
dbuf_dirty_record_t *dr = zio->io_private;
blkptr_t *bp = zio->io_bp;
dmu_buf_impl_t *parent_db = NULL;

if (zio->io_error != 0)
return;

dnode_t *dn = dr->dr_dnode;

EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
if (dr->dr_parent == NULL) {
parent_db = dn->dn_dbuf;
} else {
parent_db = dr->dr_parent->dr_dbuf;
}
mutex_enter(&parent_db->db_mtx);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct me if I am wrong, but why do we need this lock in the context of the above comments?


rw_enter(&parent_db->db_rwlock, RW_READER);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, we might need this for the purposes of bp_get_dsize_sync() and later. But since we are going to upgrade it anyway, is there a reason not to take RW_WRITER from the beginning?

blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
spa_t *spa = dmu_objset_spa(dn->dn_objset);
int64_t delta = bp_get_dsize_sync(spa, bp) -
Expand All @@ -4599,16 +4620,13 @@ dbuf_lightweight_ready(zio_t *zio)
BP_SET_FILL(bp, fill);
}

dmu_buf_impl_t *parent_db;
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
if (dr->dr_parent == NULL) {
parent_db = dn->dn_dbuf;
} else {
parent_db = dr->dr_parent->dr_dbuf;
if (!rw_tryupgrade(&parent_db->db_rwlock)) {
rw_exit(&parent_db->db_rwlock);
rw_enter(&parent_db->db_rwlock, RW_WRITER);
}
rw_enter(&parent_db->db_rwlock, RW_WRITER);
*bp_orig = *bp;
rw_exit(&parent_db->db_rwlock);
mutex_exit(&parent_db->db_mtx);
}

static void
Expand Down Expand Up @@ -4640,6 +4658,7 @@ noinline static void
dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dnode_t *dn = dr->dr_dnode;
dmu_buf_impl_t *parent_db = NULL;
zio_t *pio;
if (dn->dn_phys->dn_nlevels == 1) {
pio = dn->dn_zio;
Expand All @@ -4658,6 +4677,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
* See comment in dbuf_write(). This is so that zio->io_bp_orig
* will have the old BP in dbuf_lightweight_done().
*/
if (dr->dr_dnode->dn_phys->dn_nlevels != 1) {
parent_db = dr->dr_parent->dr_dbuf;
mutex_enter(&parent_db->db_mtx);
rw_enter(&parent_db->db_rwlock, RW_READER);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, I don't think we need db_mtx locked here; about db_rwlock I am not sure, since I don't think there should be concurrent updates of the same block pointer anyway.

}
dr->dr_bp_copy = *dbuf_lightweight_bp(dr);

dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
Expand All @@ -4667,6 +4691,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);

if (parent_db) {
rw_exit(&parent_db->db_rwlock);
mutex_exit(&parent_db->db_mtx);
}

zio_nowait(dr->dr_zio);
}

Expand Down Expand Up @@ -4823,7 +4852,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
} else {
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
}
rw_enter(&db->db_rwlock, RW_READER);
memcpy((*datap)->b_data, db->db.db_data, psize);
rw_exit(&db->db_rwlock);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a meta-dnode or indirect. db_rwlock didn't promise to protect every dbuf.

}
db->db_data_pending = dr;

Expand Down Expand Up @@ -4929,6 +4960,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)

if (dn->dn_type == DMU_OT_DNODE) {
i = 0;
rw_enter(&db->db_rwlock, RW_READER);
while (i < db->db.db_size) {
dnode_phys_t *dnp =
(void *)(((char *)db->db.db_data) + i);
Expand All @@ -4954,6 +4986,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DNODE_MIN_SIZE;
}
}
rw_exit(&db->db_rwlock);
} else {
if (BP_IS_HOLE(bp)) {
fill = 0;
Expand All @@ -4962,6 +4995,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
}
}
} else {
rw_enter(&db->db_rwlock, RW_READER);
blkptr_t *ibp = db->db.db_data;
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
Expand All @@ -4971,6 +5005,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
fill += BP_GET_FILL(ibp);
}
rw_exit(&db->db_rwlock);
}
DB_DNODE_EXIT(db);

Expand Down Expand Up @@ -5005,6 +5040,8 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_EXIT(db);
ASSERT3U(epbs, <, 31);

mutex_enter(&db->db_mtx);
rw_enter(&db->db_rwlock, RW_READER);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need the db_mtx lock here; about db_rwlock I am not sure, since at this point all modifications must be complete because the block is about to be written. I think the existing db_rwlock acquisition below should be sufficient.

/* Determine if all our children are holes */
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
if (!BP_IS_HOLE(bp))
Expand All @@ -5021,10 +5058,14 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
* anybody from reading the blocks we're about to
* zero out.
*/
rw_enter(&db->db_rwlock, RW_WRITER);
if (!rw_tryupgrade(&db->db_rwlock)) {
rw_exit(&db->db_rwlock);
rw_enter(&db->db_rwlock, RW_WRITER);
}
memset(db->db.db_data, 0, db->db.db_size);
rw_exit(&db->db_rwlock);
}
rw_exit(&db->db_rwlock);
mutex_exit(&db->db_mtx);
}

static void
Expand Down Expand Up @@ -5220,11 +5261,11 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
* avoid lock contention, only grab it when we are actually
* changing the BP.
*/
if (rw != NULL)
if (rw != NULL && !rw_tryupgrade(rw)) {
rw_exit(rw);
rw_enter(rw, RW_WRITER);
}
*bp = bp_copy;
if (rw != NULL)
rw_exit(rw);
}
}

Expand All @@ -5240,6 +5281,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
return;

mutex_enter(&db->db_mtx);
rw_enter(&db->db_rwlock, RW_READER);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is called only for the meta-dnode or indirects, so we should not need the db_mtx. About db_rwlock I am not sure. I think by this time modifications of this block should be complete, and the existing locking inside dbuf_remap_impl() should protect our modifications from other readers.

if (db->db_level > 0) {
blkptr_t *bp = db->db.db_data;
for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
Expand All @@ -5258,6 +5301,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
}
}
}
rw_exit(&db->db_rwlock);
mutex_exit(&db->db_mtx);
}


Expand Down
13 changes: 11 additions & 2 deletions module/zfs/dmu_objset.c
Original file line number Diff line number Diff line change
Expand Up @@ -2190,6 +2190,7 @@ void
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
{
objset_t *os = dn->dn_objset;
krwlock_t *rw = NULL;
void *data = NULL;
dmu_buf_impl_t *db = NULL;
int flags = dn->dn_id_flags;
Expand Down Expand Up @@ -2234,8 +2235,12 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
FTAG, (dmu_buf_t **)&db);
ASSERT(error == 0);
mutex_enter(&db->db_mtx);
data = (before) ? db->db.db_data :
dmu_objset_userquota_find_data(db, tx);
if (before) {
rw = &db->db_rwlock;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a bonus buffer, not a meta-dnode or indirect.

data = db->db.db_data;
} else {
data = dmu_objset_userquota_find_data(db, tx);
}
have_spill = B_TRUE;
} else {
mutex_enter(&dn->dn_mtx);
Expand All @@ -2249,7 +2254,11 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
* type has changed and that type isn't an object type to track
*/
zfs_file_info_t zfi;
if (rw)
rw_enter(rw, RW_READER);
error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
if (rw)
rw_exit(rw);

if (before) {
ASSERT(data);
Expand Down
Loading
Loading