Skip to content

Commit 0091c5a

Browse files
YuKuai-huaweiliu-song-6
authored and committed
md/raid1: factor out helpers to choose the best rdev from read_balance()
The way that best rdev is chosen: 1) If the read is sequential from one rdev: - if rdev is rotational, use this rdev; - if rdev is non-rotational, use this rdev until total read length exceeds disk opt io size; 2) If the read is not sequential: - if there is idle disk, use it, otherwise: - if the array has non-rotational disk, choose the rdev with minimal inflight IO; - if all the underlying disks are rotational disks, choose the rdev with closest IO; There are no functional changes, just to make code cleaner and prepare for following refactor. Co-developed-by: Paul Luse <[email protected]> Signed-off-by: Paul Luse <[email protected]> Signed-off-by: Yu Kuai <[email protected]> Reviewed-by: Xiao Ni <[email protected]> Signed-off-by: Song Liu <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent ba58f57 commit 0091c5a

File tree

1 file changed

+98
-77
lines changed

1 file changed

+98
-77
lines changed

drivers/md/raid1.c

Lines changed: 98 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -730,74 +730,71 @@ static bool should_choose_next(struct r1conf *conf, int disk)
730730
mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
731731
}
732732

733-
/*
734-
* This routine returns the disk from which the requested read should
735-
* be done. There is a per-array 'next expected sequential IO' sector
736-
* number - if this matches on the next IO then we use the last disk.
737-
* There is also a per-disk 'last know head position' sector that is
738-
* maintained from IRQ contexts, both the normal and the resync IO
739-
* completion handlers update this position correctly. If there is no
740-
* perfect sequential match then we pick the disk whose head is closest.
741-
*
742-
* If there are 2 mirrors in the same 2 devices, performance degrades
743-
* because position is mirror, not device based.
744-
*
745-
* The rdev for the device selected will have nr_pending incremented.
746-
*/
747-
static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
733+
static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
748734
{
749-
const sector_t this_sector = r1_bio->sector;
750-
int sectors;
751-
int best_good_sectors;
752-
int best_disk, best_dist_disk, best_pending_disk, sequential_disk;
753-
int disk;
754-
sector_t best_dist;
755-
unsigned int min_pending;
756-
struct md_rdev *rdev;
735+
if (!rdev || test_bit(Faulty, &rdev->flags))
736+
return false;
757737

758-
retry:
759-
sectors = r1_bio->sectors;
760-
best_disk = -1;
761-
best_dist_disk = -1;
762-
sequential_disk = -1;
763-
best_dist = MaxSector;
764-
best_pending_disk = -1;
765-
min_pending = UINT_MAX;
766-
best_good_sectors = 0;
767-
clear_bit(R1BIO_FailFast, &r1_bio->state);
738+
/* still in recovery */
739+
if (!test_bit(In_sync, &rdev->flags) &&
740+
rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
741+
return false;
768742

769-
if (raid1_should_read_first(conf->mddev, this_sector, sectors))
770-
return choose_first_rdev(conf, r1_bio, max_sectors);
743+
/* don't read from slow disk unless have to */
744+
if (test_bit(WriteMostly, &rdev->flags))
745+
return false;
746+
747+
/* don't split IO for bad blocks unless have to */
748+
if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors))
749+
return false;
750+
751+
return true;
752+
}
753+
754+
struct read_balance_ctl {
755+
sector_t closest_dist;
756+
int closest_dist_disk;
757+
int min_pending;
758+
int min_pending_disk;
759+
int sequential_disk;
760+
int readable_disks;
761+
};
762+
763+
static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
764+
{
765+
int disk;
766+
struct read_balance_ctl ctl = {
767+
.closest_dist_disk = -1,
768+
.closest_dist = MaxSector,
769+
.min_pending_disk = -1,
770+
.min_pending = UINT_MAX,
771+
.sequential_disk = -1,
772+
};
771773

772774
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
775+
struct md_rdev *rdev;
773776
sector_t dist;
774777
unsigned int pending;
775778

776-
rdev = conf->mirrors[disk].rdev;
777-
if (r1_bio->bios[disk] == IO_BLOCKED
778-
|| rdev == NULL
779-
|| test_bit(Faulty, &rdev->flags))
780-
continue;
781-
if (!test_bit(In_sync, &rdev->flags) &&
782-
rdev->recovery_offset < this_sector + sectors)
783-
continue;
784-
if (test_bit(WriteMostly, &rdev->flags))
779+
if (r1_bio->bios[disk] == IO_BLOCKED)
785780
continue;
786-
if (rdev_has_badblock(rdev, this_sector, sectors))
781+
782+
rdev = conf->mirrors[disk].rdev;
783+
if (!rdev_readable(rdev, r1_bio))
787784
continue;
788785

789-
if (best_disk >= 0)
790-
/* At least two disks to choose from so failfast is OK */
786+
/* At least two disks to choose from so failfast is OK */
787+
if (ctl.readable_disks++ == 1)
791788
set_bit(R1BIO_FailFast, &r1_bio->state);
792789

793790
pending = atomic_read(&rdev->nr_pending);
794-
dist = abs(this_sector - conf->mirrors[disk].head_position);
791+
dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
792+
795793
/* Don't change to another disk for sequential reads */
796794
if (is_sequential(conf, disk, r1_bio)) {
797-
if (!should_choose_next(conf, disk)) {
798-
best_disk = disk;
799-
break;
800-
}
795+
if (!should_choose_next(conf, disk))
796+
return disk;
797+
801798
/*
802799
* Add 'pending' to avoid choosing this disk if
803800
* there is other idle disk.
@@ -807,52 +804,76 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
807804
* If there is no other idle disk, this disk
808805
* will be chosen.
809806
*/
810-
sequential_disk = disk;
807+
ctl.sequential_disk = disk;
811808
}
812809

813-
if (min_pending > pending) {
814-
min_pending = pending;
815-
best_pending_disk = disk;
810+
if (ctl.min_pending > pending) {
811+
ctl.min_pending = pending;
812+
ctl.min_pending_disk = disk;
816813
}
817814

818-
if (dist < best_dist) {
819-
best_dist = dist;
820-
best_dist_disk = disk;
815+
if (ctl.closest_dist > dist) {
816+
ctl.closest_dist = dist;
817+
ctl.closest_dist_disk = disk;
821818
}
822819
}
823820

824821
/*
825822
* sequential IO size exceeds optimal iosize, however, there is no other
826823
* idle disk, so choose the sequential disk.
827824
*/
828-
if (best_disk == -1 && min_pending != 0)
829-
best_disk = sequential_disk;
825+
if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
826+
return ctl.sequential_disk;
830827

831828
/*
832829
* If all disks are rotational, choose the closest disk. If any disk is
833830
* non-rotational, choose the disk with less pending request even the
834831
* disk is rotational, which might/might not be optimal for raids with
835832
* mixed rotational/non-rotational disks depending on workload.
836833
*/
837-
if (best_disk == -1) {
838-
if (READ_ONCE(conf->nonrot_disks) || min_pending == 0)
839-
best_disk = best_pending_disk;
840-
else
841-
best_disk = best_dist_disk;
842-
}
834+
if (ctl.min_pending_disk != -1 &&
835+
(READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
836+
return ctl.min_pending_disk;
837+
else
838+
return ctl.closest_dist_disk;
839+
}
843840

844-
if (best_disk >= 0) {
845-
rdev = conf->mirrors[best_disk].rdev;
846-
if (!rdev)
847-
goto retry;
841+
/*
842+
* This routine returns the disk from which the requested read should be done.
843+
*
844+
* 1) If resync is in progress, find the first usable disk and use it even if it
845+
* has some bad blocks.
846+
*
847+
* 2) Now that there is no resync, loop through all disks and skipping slow
848+
* disks and disks with bad blocks for now. Only pay attention to key disk
849+
* choice.
850+
*
851+
* 3) If we've made it this far, now look for disks with bad blocks and choose
852+
* the one with most number of sectors.
853+
*
854+
* 4) If we are all the way at the end, we have no choice but to use a disk even
855+
* if it is write mostly.
856+
*
857+
* The rdev for the device selected will have nr_pending incremented.
858+
*/
859+
static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
860+
int *max_sectors)
861+
{
862+
int disk;
848863

849-
sectors = best_good_sectors;
850-
update_read_sectors(conf, disk, this_sector, sectors);
851-
}
852-
*max_sectors = sectors;
864+
clear_bit(R1BIO_FailFast, &r1_bio->state);
865+
866+
if (raid1_should_read_first(conf->mddev, r1_bio->sector,
867+
r1_bio->sectors))
868+
return choose_first_rdev(conf, r1_bio, max_sectors);
853869

854-
if (best_disk >= 0)
855-
return best_disk;
870+
disk = choose_best_rdev(conf, r1_bio);
871+
if (disk >= 0) {
872+
*max_sectors = r1_bio->sectors;
873+
update_read_sectors(conf, disk, r1_bio->sector,
874+
r1_bio->sectors);
875+
return disk;
876+
}
856877

857878
/*
858879
* If we are here it means we didn't find a perfectly good disk so

0 commit comments

Comments
 (0)