Skip to content

Commit 928eccc

Browse files
authored
DDT: Reduce global DDT lock scope during writes
Before this change DDT lock was taken 4 times per written block, and as effectively a pool-wide lock it can be highly congested. This change introduces a new per-entry dde_io_lock, protecting some fields during I/O ready and done stages, so that we don't need the global lock there. According to my write tests on 64-thread system with 4KB blocks this significantly reduce the global lock contention, reducing CPU usage from 100% to expected ~80%, and increasing write throughput by 10%. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Rob Norris <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Closes #17960
1 parent a5b665d commit 928eccc

File tree

3 files changed

+87
-37
lines changed

3 files changed

+87
-37
lines changed

include/sys/ddt.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,9 @@ typedef enum {
219219
* because its relatively rarely used.
220220
*/
221221
typedef struct {
222+
/* protects dde_phys, dde_orig_phys and dde_lead_zio during I/O */
223+
kmutex_t dde_io_lock;
224+
222225
/* copy of data after a repair read, to be rewritten */
223226
abd_t *dde_repair_abd;
224227

module/zfs/ddt.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,6 +1010,7 @@ ddt_alloc_entry_io(ddt_entry_t *dde)
10101010
return;
10111011

10121012
dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP);
1013+
mutex_init(&dde->dde_io->dde_io_lock, NULL, MUTEX_DEFAULT, NULL);
10131014
}
10141015

10151016
static void
@@ -1022,6 +1023,7 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
10221023
if (dde->dde_io->dde_repair_abd != NULL)
10231024
abd_free(dde->dde_io->dde_repair_abd);
10241025

1026+
mutex_destroy(&dde->dde_io->dde_io_lock);
10251027
kmem_free(dde->dde_io, sizeof (ddt_entry_io_t));
10261028
}
10271029

module/zfs/zio.c

Lines changed: 82 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3683,7 +3683,7 @@ zio_ddt_child_write_done(zio_t *zio)
36833683
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
36843684
ddt_univ_phys_t *ddp = dde->dde_phys;
36853685

3686-
ddt_enter(ddt);
3686+
mutex_enter(&dde->dde_io->dde_io_lock);
36873687

36883688
/* we're the lead, so once we're done there's no one else outstanding */
36893689
if (dde->dde_io->dde_lead_zio[p] == zio)
@@ -3700,21 +3700,24 @@ zio_ddt_child_write_done(zio_t *zio)
37003700
ddt_phys_unextend(ddp, orig, v);
37013701
ddt_phys_clear(orig, v);
37023702

3703+
mutex_exit(&dde->dde_io->dde_io_lock);
3704+
3705+
/*
3706+
* Undo the optimistic refcount increments that were done in
3707+
* zio_ddt_write() for all non-DDT-child parents. Since errors
3708+
* are rare, taking the global lock here is acceptable.
3709+
*/
3710+
ddt_enter(ddt);
3711+
zio_t *pio;
3712+
zl = NULL;
3713+
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
3714+
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
3715+
ddt_phys_decref(ddp, v);
3716+
}
37033717
ddt_exit(ddt);
37043718
return;
37053719
}
37063720

3707-
/*
3708-
* Add references for all dedup writes that were waiting on the
3709-
* physical one, skipping any other physical writes that are waiting.
3710-
*/
3711-
zio_t *pio;
3712-
zl = NULL;
3713-
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
3714-
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
3715-
ddt_phys_addref(ddp, v);
3716-
}
3717-
37183721
/*
37193722
* We've successfully added new DVAs to the entry. Clear the saved
37203723
* state or, if there's still outstanding IO, remember it so we can
@@ -3725,7 +3728,7 @@ zio_ddt_child_write_done(zio_t *zio)
37253728
else
37263729
ddt_phys_copy(orig, ddp, v);
37273730

3728-
ddt_exit(ddt);
3731+
mutex_exit(&dde->dde_io->dde_io_lock);
37293732
}
37303733

37313734
static void
@@ -3753,7 +3756,7 @@ zio_ddt_child_write_ready(zio_t *zio)
37533756
if (zio->io_error != 0)
37543757
return;
37553758

3756-
ddt_enter(ddt);
3759+
mutex_enter(&dde->dde_io->dde_io_lock);
37573760

37583761
ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
37593762

@@ -3764,7 +3767,7 @@ zio_ddt_child_write_ready(zio_t *zio)
37643767
ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
37653768
}
37663769

3767-
ddt_exit(ddt);
3770+
mutex_exit(&dde->dde_io->dde_io_lock);
37683771
}
37693772

37703773
static zio_t *
@@ -3799,11 +3802,11 @@ zio_ddt_write(zio_t *zio)
37993802
dde = ddt_lookup(ddt, bp, B_FALSE);
38003803
if (dde == NULL) {
38013804
/* DDT size is over its quota so no new entries */
3805+
ddt_exit(ddt);
38023806
zp->zp_dedup = B_FALSE;
38033807
BP_SET_DEDUP(bp, B_FALSE);
38043808
if (zio->io_bp_override == NULL)
38053809
zio->io_pipeline = ZIO_WRITE_PIPELINE;
3806-
ddt_exit(ddt);
38073810
return (zio);
38083811
}
38093812

@@ -3814,6 +3817,7 @@ zio_ddt_write(zio_t *zio)
38143817
* we can't resolve it, so just convert to an ordinary write.
38153818
* (And automatically e-mail a paper to Nature?)
38163819
*/
3820+
ddt_exit(ddt);
38173821
if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
38183822
ZCHECKSUM_FLAG_DEDUP)) {
38193823
zp->zp_checksum = spa_dedup_checksum(spa);
@@ -3826,7 +3830,6 @@ zio_ddt_write(zio_t *zio)
38263830
}
38273831
ASSERT(!BP_GET_DEDUP(bp));
38283832
zio->io_pipeline = ZIO_WRITE_PIPELINE;
3829-
ddt_exit(ddt);
38303833
return (zio);
38313834
}
38323835

@@ -3877,10 +3880,15 @@ zio_ddt_write(zio_t *zio)
38773880
uint8_t parent_dvas = 0;
38783881

38793882
/*
3880-
* What we do next depends on whether or not there's IO outstanding that
3881-
* will update this entry.
3883+
* What we do next depends on whether or not there's IO outstanding
3884+
* that will update this entry. If dde_io exists, we need to hold
3885+
* its lock to safely check and use dde_lead_zio.
38823886
*/
3883-
if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
3887+
ddt_entry_io_t *dde_io = dde->dde_io;
3888+
if (dde_io != NULL)
3889+
mutex_enter(&dde_io->dde_io_lock);
3890+
3891+
if (dde_io == NULL || dde_io->dde_lead_zio[p] == NULL) {
38843892
/*
38853893
* No IO outstanding, so we only need to worry about ourselves.
38863894
*/
@@ -3895,6 +3903,8 @@ zio_ddt_write(zio_t *zio)
38953903
* block and leave.
38963904
*/
38973905
if (have_dvas == 0) {
3906+
if (dde_io != NULL)
3907+
mutex_exit(&dde_io->dde_io_lock);
38983908
ASSERT(BP_GET_BIRTH(bp) == txg);
38993909
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
39003910
ddt_phys_extend(ddp, v, bp);
@@ -3923,6 +3933,9 @@ zio_ddt_write(zio_t *zio)
39233933
* then we can just use them as-is.
39243934
*/
39253935
if (have_dvas >= need_dvas) {
3936+
if (dde_io != NULL)
3937+
mutex_exit(&dde_io->dde_io_lock);
3938+
39263939
/*
39273940
* For rewrite operations, try preserving the original
39283941
* logical birth time. If the result matches the
@@ -3934,8 +3947,8 @@ zio_ddt_write(zio_t *zio)
39343947
ddt_bp_fill(ddp, v, bp, orig_logical_birth);
39353948
if (BP_EQUAL(bp, &zio->io_bp_orig)) {
39363949
/* We can skip accounting. */
3937-
zio->io_flags |= ZIO_FLAG_NOPWRITE;
39383950
ddt_exit(ddt);
3951+
zio->io_flags |= ZIO_FLAG_NOPWRITE;
39393952
return (zio);
39403953
}
39413954
}
@@ -3997,7 +4010,16 @@ zio_ddt_write(zio_t *zio)
39974010
* missed out.
39984011
*/
39994012
ddt_bp_fill(ddp, v, bp, txg);
4000-
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
4013+
piggyback:
4014+
zio_add_child(zio, dde_io->dde_lead_zio[p]);
4015+
4016+
/*
4017+
* Optimistically increment refcount for this parent.
4018+
* If the write fails, zio_ddt_child_write_done() will
4019+
* decrement for all non-DDT-child parents.
4020+
*/
4021+
ddt_phys_addref(ddp, v);
4022+
mutex_exit(&dde_io->dde_io_lock);
40014023
ddt_exit(ddt);
40024024
return (zio);
40034025
}
@@ -4023,11 +4045,8 @@ zio_ddt_write(zio_t *zio)
40234045
ASSERT(pio);
40244046
parent_dvas = pio->io_prop.zp_copies;
40254047

4026-
if (parent_dvas >= need_dvas) {
4027-
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
4028-
ddt_exit(ddt);
4029-
return (zio);
4030-
}
4048+
if (parent_dvas >= need_dvas)
4049+
goto piggyback;
40314050

40324051
/*
40334052
* Still not enough, so we will need to issue to get the
@@ -4037,10 +4056,12 @@ zio_ddt_write(zio_t *zio)
40374056
}
40384057

40394058
if (is_ganged) {
4059+
if (dde_io != NULL)
4060+
mutex_exit(&dde_io->dde_io_lock);
4061+
ddt_exit(ddt);
40404062
zp->zp_dedup = B_FALSE;
40414063
BP_SET_DEDUP(bp, B_FALSE);
40424064
zio->io_pipeline = ZIO_WRITE_PIPELINE;
4043-
ddt_exit(ddt);
40444065
return (zio);
40454066
}
40464067

@@ -4070,21 +4091,45 @@ zio_ddt_write(zio_t *zio)
40704091
* We are the new lead zio, because our parent has the highest
40714092
* zp_copies that has been requested for this entry so far.
40724093
*/
4073-
ddt_alloc_entry_io(dde);
4074-
if (dde->dde_io->dde_lead_zio[p] == NULL) {
4094+
if (dde_io == NULL) {
4095+
/*
4096+
* New dde_io. No lock needed since no other thread can have
4097+
* a reference yet.
4098+
*/
4099+
ddt_alloc_entry_io(dde);
4100+
dde_io = dde->dde_io;
40754101
/*
40764102
* First time out, take a copy of the stable entry to revert
40774103
* to if there's an error (see zio_ddt_child_write_done())
40784104
*/
4079-
ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
4105+
ddt_phys_copy(&dde_io->dde_orig_phys, dde->dde_phys, v);
4106+
dde_io->dde_lead_zio[p] = cio;
40804107
} else {
4081-
/*
4082-
* Make the existing chain our child, because it cannot
4083-
* complete until we have.
4084-
*/
4085-
zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
4108+
if (dde_io->dde_lead_zio[p] == NULL) {
4109+
/*
4110+
* First time out, take a copy of the stable entry
4111+
* to revert to if there's an error (see
4112+
* zio_ddt_child_write_done())
4113+
*/
4114+
ddt_phys_copy(&dde_io->dde_orig_phys, dde->dde_phys,
4115+
v);
4116+
} else {
4117+
/*
4118+
* Make the existing chain our child, because it
4119+
* cannot complete until we have.
4120+
*/
4121+
zio_add_child(cio, dde_io->dde_lead_zio[p]);
4122+
}
4123+
dde_io->dde_lead_zio[p] = cio;
4124+
mutex_exit(&dde_io->dde_io_lock);
40864125
}
4087-
dde->dde_io->dde_lead_zio[p] = cio;
4126+
4127+
/*
4128+
* Optimistically increment the refcount for this dedup write.
4129+
* If the write fails, zio_ddt_child_write_done() will decrement
4130+
* for all non-DDT-child parents.
4131+
*/
4132+
ddt_phys_addref(ddp, v);
40884133

40894134
ddt_exit(ddt);
40904135

0 commit comments

Comments
 (0)