@@ -730,74 +730,71 @@ static bool should_choose_next(struct r1conf *conf, int disk)
730
730
mirror -> next_seq_sect - opt_iosize >= mirror -> seq_start ;
731
731
}
732
732
733
- /*
734
- * This routine returns the disk from which the requested read should
735
- * be done. There is a per-array 'next expected sequential IO' sector
736
- * number - if this matches on the next IO then we use the last disk.
737
- * There is also a per-disk 'last know head position' sector that is
738
- * maintained from IRQ contexts, both the normal and the resync IO
739
- * completion handlers update this position correctly. If there is no
740
- * perfect sequential match then we pick the disk whose head is closest.
741
- *
742
- * If there are 2 mirrors in the same 2 devices, performance degrades
743
- * because position is mirror, not device based.
744
- *
745
- * The rdev for the device selected will have nr_pending incremented.
746
- */
747
- static int read_balance (struct r1conf * conf , struct r1bio * r1_bio , int * max_sectors )
733
+ static bool rdev_readable (struct md_rdev * rdev , struct r1bio * r1_bio )
748
734
{
749
- const sector_t this_sector = r1_bio -> sector ;
750
- int sectors ;
751
- int best_good_sectors ;
752
- int best_disk , best_dist_disk , best_pending_disk , sequential_disk ;
753
- int disk ;
754
- sector_t best_dist ;
755
- unsigned int min_pending ;
756
- struct md_rdev * rdev ;
735
+ if (!rdev || test_bit (Faulty , & rdev -> flags ))
736
+ return false;
757
737
758
- retry :
759
- sectors = r1_bio -> sectors ;
760
- best_disk = -1 ;
761
- best_dist_disk = -1 ;
762
- sequential_disk = -1 ;
763
- best_dist = MaxSector ;
764
- best_pending_disk = -1 ;
765
- min_pending = UINT_MAX ;
766
- best_good_sectors = 0 ;
767
- clear_bit (R1BIO_FailFast , & r1_bio -> state );
738
+ /* still in recovery */
739
+ if (!test_bit (In_sync , & rdev -> flags ) &&
740
+ rdev -> recovery_offset < r1_bio -> sector + r1_bio -> sectors )
741
+ return false;
768
742
769
- if (raid1_should_read_first (conf -> mddev , this_sector , sectors ))
770
- return choose_first_rdev (conf , r1_bio , max_sectors );
743
+ /* don't read from a slow disk unless we have to */
744
+ if (test_bit (WriteMostly , & rdev -> flags ))
745
+ return false;
746
+
747
+ /* don't split IO for bad blocks unless we have to */
748
+ if (rdev_has_badblock (rdev , r1_bio -> sector , r1_bio -> sectors ))
749
+ return false;
750
+
751
+ return true;
752
+ }
753
+
754
+ struct read_balance_ctl {
755
+ sector_t closest_dist ;
756
+ int closest_dist_disk ;
757
+ int min_pending ;
758
+ int min_pending_disk ;
759
+ int sequential_disk ;
760
+ int readable_disks ;
761
+ };
762
+
763
+ static int choose_best_rdev (struct r1conf * conf , struct r1bio * r1_bio )
764
+ {
765
+ int disk ;
766
+ struct read_balance_ctl ctl = {
767
+ .closest_dist_disk = -1 ,
768
+ .closest_dist = MaxSector ,
769
+ .min_pending_disk = -1 ,
770
+ .min_pending = UINT_MAX ,
771
+ .sequential_disk = -1 ,
772
+ };
771
773
772
774
for (disk = 0 ; disk < conf -> raid_disks * 2 ; disk ++ ) {
775
+ struct md_rdev * rdev ;
773
776
sector_t dist ;
774
777
unsigned int pending ;
775
778
776
- rdev = conf -> mirrors [disk ].rdev ;
777
- if (r1_bio -> bios [disk ] == IO_BLOCKED
778
- || rdev == NULL
779
- || test_bit (Faulty , & rdev -> flags ))
780
- continue ;
781
- if (!test_bit (In_sync , & rdev -> flags ) &&
782
- rdev -> recovery_offset < this_sector + sectors )
783
- continue ;
784
- if (test_bit (WriteMostly , & rdev -> flags ))
779
+ if (r1_bio -> bios [disk ] == IO_BLOCKED )
785
780
continue ;
786
- if (rdev_has_badblock (rdev , this_sector , sectors ))
781
+
782
+ rdev = conf -> mirrors [disk ].rdev ;
783
+ if (!rdev_readable (rdev , r1_bio ))
787
784
continue ;
788
785
789
- if ( best_disk >= 0 )
790
- /* At least two disks to choose from so failfast is OK */
786
+ /* At least two disks to choose from so failfast is OK */
787
+ if ( ctl . readable_disks ++ == 1 )
791
788
set_bit (R1BIO_FailFast , & r1_bio -> state );
792
789
793
790
pending = atomic_read (& rdev -> nr_pending );
794
- dist = abs (this_sector - conf -> mirrors [disk ].head_position );
791
+ dist = abs (r1_bio -> sector - conf -> mirrors [disk ].head_position );
792
+
795
793
/* Don't change to another disk for sequential reads */
796
794
if (is_sequential (conf , disk , r1_bio )) {
797
- if (!should_choose_next (conf , disk )) {
798
- best_disk = disk ;
799
- break ;
800
- }
795
+ if (!should_choose_next (conf , disk ))
796
+ return disk ;
797
+
801
798
/*
802
799
* Add 'pending' to avoid choosing this disk if
803
800
* there is another idle disk.
@@ -807,52 +804,76 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
807
804
* If there is no other idle disk, this disk
808
805
* will be chosen.
809
806
*/
810
- sequential_disk = disk ;
807
+ ctl . sequential_disk = disk ;
811
808
}
812
809
813
- if (min_pending > pending ) {
814
- min_pending = pending ;
815
- best_pending_disk = disk ;
810
+ if (ctl . min_pending > pending ) {
811
+ ctl . min_pending = pending ;
812
+ ctl . min_pending_disk = disk ;
816
813
}
817
814
818
- if (dist < best_dist ) {
819
- best_dist = dist ;
820
- best_dist_disk = disk ;
815
+ if (ctl . closest_dist > dist ) {
816
+ ctl . closest_dist = dist ;
817
+ ctl . closest_dist_disk = disk ;
821
818
}
822
819
}
823
820
824
821
/*
825
822
* sequential IO size exceeds optimal iosize, however, there is no other
826
823
* idle disk, so choose the sequential disk.
827
824
*/
828
- if (best_disk == -1 && min_pending != 0 )
829
- best_disk = sequential_disk ;
825
+ if (ctl . sequential_disk != -1 && ctl . min_pending != 0 )
826
+ return ctl . sequential_disk ;
830
827
831
828
/*
832
829
* If all disks are rotational, choose the closest disk. If any disk is
833
830
* non-rotational, choose the disk with fewer pending requests even if the
834
831
* disk is rotational, which might/might not be optimal for raids with
835
832
* mixed rotational/non-rotational disks depending on workload.
836
833
*/
837
- if (best_disk == -1 ) {
838
- if (READ_ONCE (conf -> nonrot_disks ) || min_pending == 0 )
839
- best_disk = best_pending_disk ;
840
- else
841
- best_disk = best_dist_disk ;
842
- }
834
+ if (ctl . min_pending_disk != -1 &&
835
+ (READ_ONCE (conf -> nonrot_disks ) || ctl . min_pending == 0 ) )
836
+ return ctl . min_pending_disk ;
837
+ else
838
+ return ctl . closest_dist_disk ;
839
+ }
843
840
844
- if (best_disk >= 0 ) {
845
- rdev = conf -> mirrors [best_disk ].rdev ;
846
- if (!rdev )
847
- goto retry ;
841
+ /*
842
+ * This routine returns the disk from which the requested read should be done.
843
+ *
844
+ * 1) If resync is in progress, find the first usable disk and use it even if it
845
+ * has some bad blocks.
846
+ *
847
+ * 2) Now that there is no resync, loop through all disks, skipping slow
848
+ * disks and disks with bad blocks for now. Only pay attention to key disk
849
+ * choice.
850
+ *
851
+ * 3) If we've made it this far, now look for disks with bad blocks and choose
852
+ * the one with the largest number of sectors.
853
+ *
854
+ * 4) If we are all the way at the end, we have no choice but to use a disk even
855
+ * if it is write mostly.
856
+ *
857
+ * The rdev for the device selected will have nr_pending incremented.
858
+ */
859
+ static int read_balance (struct r1conf * conf , struct r1bio * r1_bio ,
860
+ int * max_sectors )
861
+ {
862
+ int disk ;
848
863
849
- sectors = best_good_sectors ;
850
- update_read_sectors (conf , disk , this_sector , sectors );
851
- }
852
- * max_sectors = sectors ;
864
+ clear_bit (R1BIO_FailFast , & r1_bio -> state );
865
+
866
+ if (raid1_should_read_first (conf -> mddev , r1_bio -> sector ,
867
+ r1_bio -> sectors ))
868
+ return choose_first_rdev (conf , r1_bio , max_sectors );
853
869
854
- if (best_disk >= 0 )
855
- return best_disk ;
870
+ disk = choose_best_rdev (conf , r1_bio );
871
+ if (disk >= 0 ) {
872
+ * max_sectors = r1_bio -> sectors ;
873
+ update_read_sectors (conf , disk , r1_bio -> sector ,
874
+ r1_bio -> sectors );
875
+ return disk ;
876
+ }
856
877
857
878
/*
858
879
* If we are here it means we didn't find a perfectly good disk so
0 commit comments