
Commit b75197e

YuKuai-huawei authored and liu-song-6 committed
md: Remove flush handling
For flush requests, md has special handling that merges concurrent
flush requests into a single one. However, the whole mechanism is built
on a disk-level spinlock, 'mddev->lock', and fsync() can be called
quite often in some workloads; as a consequence, taking a spinlock in
the IO fast path can cause performance degradation.

Fortunately, the block layer already merges concurrent flush requests,
and it only takes an hctx-level spinlock (see blk-flush.c for details).
This patch removes the flush handling in md and relies on the general
block-layer flush handling of the underlying disks instead.

Flush test for 4 nvme raid10: start 128 threads that each call fsync()
100000 times, on arm64, and measure how long it takes.

Test script (THREADS is 128 and FSYNC_COUNT is 100000, per the setup
above):

#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>

#define THREADS 128
#define FSYNC_COUNT 100000

void *thread_func(void *arg)
{
	int fd = *(int *)arg;

	for (int i = 0; i < FSYNC_COUNT; i++)
		fsync(fd);

	return NULL;
}

int main(void)
{
	pthread_t threads[THREADS];
	struct timeval start, end;
	int fd;

	fd = open("/dev/md0", O_RDWR);
	if (fd < 0) {
		perror("open");
		exit(1);
	}

	gettimeofday(&start, NULL);
	for (int i = 0; i < THREADS; i++)
		pthread_create(&threads[i], NULL, thread_func, &fd);
	for (int i = 0; i < THREADS; i++)
		pthread_join(threads[i], NULL);
	gettimeofday(&end, NULL);

	close(fd);

	long long elapsed = (end.tv_sec - start.tv_sec) * 1000000LL +
			    (end.tv_usec - start.tv_usec);
	printf("Elapsed time: %lld microseconds\n", elapsed);
	return 0;
}

Test result: about 10 times faster.

Before this patch: 50943374 microseconds
After this patch:   5096347 microseconds

Signed-off-by: Yu Kuai <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Song Liu <[email protected]>
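To reproduce the numbers (hedged: the commit describes the array but not the exact commands, so the device names and flags here are illustrative), a 4-disk raid10 can be created with something like 'mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/nvme0n1 /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1', and the benchmark above built with 'gcc -O2 -pthread fsync_bench.c -o fsync_bench' and run against /dev/md0 as root, once on a kernel without this patch and once with it.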
1 parent 2d389a7 commit b75197e

File tree

2 files changed: +15 −133 lines

  drivers/md/md.c
  drivers/md/md.h

drivers/md/md.c

Lines changed: 15 additions & 123 deletions
@@ -546,137 +546,30 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n
 	return 0;
 }
 
-/*
- * Generic flush handling for md
- */
-
-static void md_end_flush(struct bio *bio)
-{
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
-
-	bio_put(bio);
-
-	rdev_dec_pending(rdev, mddev);
-
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws);
-
-static void submit_flushes(struct work_struct *ws)
+bool md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
-
-	mddev->start_flush = ktime_get_boottime();
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
-	rcu_read_lock();
-	rdev_for_each_rcu(rdev, mddev)
-		if (rdev->raid_disk >= 0 &&
-		    !test_bit(Faulty, &rdev->flags)) {
-			struct bio *bi;
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-			bi = bio_alloc_bioset(rdev->bdev, 0,
-					      REQ_OP_WRITE | REQ_PREFLUSH,
-					      GFP_NOIO, &mddev->bio_set);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
-			atomic_inc(&mddev->flush_pending);
-			submit_bio(bi);
-			rcu_read_lock();
-		}
-	rcu_read_unlock();
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
+	struct bio *new;
 
 	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
+	 * md_flush_reqeust() should be called under md_handle_request() and
+	 * 'active_io' is already grabbed. Hence it's safe to get rdev directly
+	 * without rcu protection.
 	 */
-	spin_lock_irq(&mddev->lock);
-	mddev->prev_flush_start = mddev->start_flush;
-	mddev->flush_bio = NULL;
-	spin_unlock_irq(&mddev->lock);
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0) {
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	} else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
+	WARN_ON(percpu_ref_is_zero(&mddev->active_io));
 
-		/*
-		 * make_requst() will never return error here, it only
-		 * returns error in raid5_make_request() by dm-raid.
-		 * Since dm always splits data and flush operation into
-		 * two separate io, io size of flush submitted by dm
-		 * always is 0, make_request() will not be called here.
-		 */
-		if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio)))
-			bio_io_error(bio);
-	}
-
-	/* The pair is percpu_ref_get() from md_flush_request() */
-	percpu_ref_put(&mddev->active_io);
-}
+	rdev_for_each(rdev, mddev) {
+		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+			continue;
 
-/*
- * Manages consolidation of flushes and submitting any flushes needed for
- * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
- * being finished in another context. Returns false if the flushing is
- * complete but still needs the I/O portion of the bio to be processed.
- */
-bool md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	ktime_t req_start = ktime_get_boottime();
-	spin_lock_irq(&mddev->lock);
-	/* flush requests wait until ongoing flush completes,
-	 * hence coalescing all the pending requests.
-	 */
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio ||
-			    ktime_before(req_start, mddev->prev_flush_start),
-			    mddev->lock);
-	/* new request after previous flush is completed */
-	if (ktime_after(req_start, mddev->prev_flush_start)) {
-		WARN_ON(mddev->flush_bio);
-		/*
-		 * Grab a reference to make sure mddev_suspend() will wait for
-		 * this flush to be done.
-		 *
-		 * md_flush_reqeust() is called under md_handle_request() and
-		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
-		 * won't pass, percpu_ref_tryget_live() can't be used because
-		 * percpu_ref_kill() can be called by mddev_suspend()
-		 * concurrently.
-		 */
-		WARN_ON(percpu_ref_is_zero(&mddev->active_io));
-		percpu_ref_get(&mddev->active_io);
-		mddev->flush_bio = bio;
-		spin_unlock_irq(&mddev->lock);
-		INIT_WORK(&mddev->flush_work, submit_flushes);
-		queue_work(md_wq, &mddev->flush_work);
-		return true;
+		new = bio_alloc_bioset(rdev->bdev, 0,
+				       REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
+				       &mddev->bio_set);
+		bio_chain(new, bio);
+		submit_bio(new);
 	}
 
-	/* flush was performed for some other bio while we waited. */
-	spin_unlock_irq(&mddev->lock);
-	if (bio->bi_iter.bi_size == 0) {
-		/* pure flush without data - all done */
+	if (bio_sectors(bio) == 0) {
 		bio_endio(bio);
 		return true;
 	}
@@ -763,7 +656,6 @@ int mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->sync_seq, 0);
 	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
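Some context for the new md_flush_request(), beyond what the diff shows: bio_chain(new, bio) points each cloned preflush at the parent bio and bumps the parent's remaining-completion count, so the parent only truly completes once every member disk has finished its flush. Below is a hedged sketch of the caller-side contract; md_handle_request() in md.c does something along these lines, but this is illustrative, not the verbatim kernel source:

static void handle_request_sketch(struct mddev *mddev, struct bio *bio)
{
	/*
	 * true: flush-only bio; bio_endio() was already called and the
	 * completion is held back until every chained per-disk preflush
	 * has finished.
	 */
	if ((bio->bi_opf & REQ_PREFLUSH) && md_flush_request(mddev, bio))
		return;

	/*
	 * false: the preflush was already fanned out to the member disks
	 * and REQ_PREFLUSH cleared (in the tail of md_flush_request(),
	 * just past the hunk shown above), so the personality handles
	 * the remaining data portion as an ordinary write.
	 */
	mddev->pers->make_request(mddev, bio);
}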

drivers/md/md.h

Lines changed: 0 additions & 10 deletions
@@ -571,16 +571,6 @@ struct mddev {
 	 */
 	struct bio_set			io_clone_set;
 
-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
-	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed
-						 * flush was started.
-						 */
-	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	mempool_t *serial_info_pool;
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
