Skip to content

Commit 07e8336

Browse files
liu-song-6 authored and shligit committed
md/r5cache: shift complex rmw from read path to write path
Write back cache requires a complex RMW mechanism, where old data is read into dev->orig_page for prexor, and then xor is done with dev->page. This logic is already implemented in the write path. However, the current read path is not aware of this requirement. When the array is optimal, the RMW is not required, as the data are read from raid disks. However, when the target stripe is degraded, complex RMW is required to generate the right data. To keep the read path as clean as possible, we handle the read path by flushing degraded, in-journal stripes before processing reads to the missing dev. Specifically, when there are read requests to a degraded stripe with data in journal, handle_stripe_fill() calls r5c_make_stripe_write_out() and exits. Then handle_stripe_dirtying() will do the complex RMW and flush the stripe to RAID disks. After that, read requests are handled. There is one more corner case when there is a non-overwrite bio for the missing (or out of sync) dev. handle_stripe_dirtying() will not be able to process the non-overwrite bios without constructing the data in handle_stripe_fill(). This is fixed by delaying non-overwrite bios in handle_stripe_dirtying(), so that handle_stripe_fill() works on these bios after the stripe is flushed to raid disks. Signed-off-by: Song Liu <[email protected]> Signed-off-by: Shaohua Li <[email protected]>
1 parent a85dd7b commit 07e8336

File tree

1 file changed

+45
-4
lines changed

1 file changed

+45
-4
lines changed

drivers/md/raid5.c

Lines changed: 45 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2897,6 +2897,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
28972897
return r_sector;
28982898
}
28992899

2900+
/*
2901+
* There are cases where we want handle_stripe_dirtying() and
2902+
* schedule_reconstruction() to delay towrite to some dev of a stripe.
2903+
*
2904+
* This function checks whether we want to delay the towrite. Specifically,
2905+
* we delay the towrite when:
2906+
*
2907+
* 1. degraded stripe has a non-overwrite to the missing dev, AND this
2908+
* stripe has data in journal (for other devices).
2909+
*
2910+
* In this case, when reading data for the non-overwrite dev, it is
2911+
* necessary to handle complex rmw of write back cache (prexor with
2912+
* orig_page, and xor with page). To keep read path simple, we would
2913+
* like to flush data in journal to RAID disks first, so complex rmw
2914+
* is handled in the write path (handle_stripe_dirtying).
2915+
*
2916+
*/
2917+
static inline bool delay_towrite(struct r5dev *dev,
2918+
struct stripe_head_state *s)
2919+
{
2920+
return !test_bit(R5_OVERWRITE, &dev->flags) &&
2921+
!test_bit(R5_Insync, &dev->flags) && s->injournal;
2922+
}
2923+
29002924
static void
29012925
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
29022926
int rcw, int expand)
@@ -2917,7 +2941,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
29172941
for (i = disks; i--; ) {
29182942
struct r5dev *dev = &sh->dev[i];
29192943

2920-
if (dev->towrite) {
2944+
if (dev->towrite && !delay_towrite(dev, s)) {
29212945
set_bit(R5_LOCKED, &dev->flags);
29222946
set_bit(R5_Wantdrain, &dev->flags);
29232947
if (!expand)
@@ -3494,10 +3518,26 @@ static void handle_stripe_fill(struct stripe_head *sh,
34943518
* midst of changing due to a write
34953519
*/
34963520
if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3497-
!sh->reconstruct_state)
3521+
!sh->reconstruct_state) {
3522+
3523+
/*
3524+
* For degraded stripe with data in journal, do not handle
3525+
* read requests yet, instead, flush the stripe to raid
3526+
* disks first, this avoids handling complex rmw of write
3527+
* back cache (prexor with orig_page, and then xor with
3528+
* page) in the read path
3529+
*/
3530+
if (s->injournal && s->failed) {
3531+
if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3532+
r5c_make_stripe_write_out(sh);
3533+
goto out;
3534+
}
3535+
34983536
for (i = disks; i--; )
34993537
if (fetch_block(sh, s, i, disks))
35003538
break;
3539+
}
3540+
out:
35013541
set_bit(STRIPE_HANDLE, &sh->state);
35023542
}
35033543

@@ -3653,7 +3693,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
36533693
} else for (i = disks; i--; ) {
36543694
/* would I have to read this buffer for read_modify_write */
36553695
struct r5dev *dev = &sh->dev[i];
3656-
if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
3696+
if (((dev->towrite && !delay_towrite(dev, s)) ||
3697+
i == sh->pd_idx || i == sh->qd_idx ||
36573698
test_bit(R5_InJournal, &dev->flags)) &&
36583699
!test_bit(R5_LOCKED, &dev->flags) &&
36593700
!(uptodate_for_rmw(dev) ||
@@ -3717,7 +3758,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
37173758

37183759
for (i = disks; i--; ) {
37193760
struct r5dev *dev = &sh->dev[i];
3720-
if ((dev->towrite ||
3761+
if (((dev->towrite && !delay_towrite(dev, s)) ||
37213762
i == sh->pd_idx || i == sh->qd_idx ||
37223763
test_bit(R5_InJournal, &dev->flags)) &&
37233764
!test_bit(R5_LOCKED, &dev->flags) &&

0 commit comments

Comments
 (0)