Skip to content

Commit 6a218b9

Browse files
Mike SnitzerAnna Schumaker
authored andcommitted
nfs/localio: do not issue misaligned DIO out-of-order
From https://lore.kernel.org/linux-nfs/[email protected]/ On Wed, Oct 29, 2025 at 12:20:40AM -0700, Christoph Hellwig wrote: > On Mon, Oct 27, 2025 at 12:18:30PM -0400, Mike Snitzer wrote: > > LOCALIO's misaligned DIO will issue head/tail followed by O_DIRECT > > middle (via AIO completion of that aligned middle). So out of order > > relative to file offset. > > That's in general a really bad idea. It will obviously work, but > both on SSDs and out of place write file systems it is a sure way > to increase your garbage collection overhead a lot down the line. Fix this by never issuing misaligned DIO out of order. This fix means the DIO-aligned middle will only use AIO completion if there is no misaligned end segment. Otherwise, all 3 segments of a misaligned DIO will be issued without AIO completion to ensure file offset increases properly for all partial READ or WRITE situations. Factoring out nfs_local_iter_setup() helps standardize repetitive nfs_local_iters_setup_dio() code and is inspired by cleanup work that Chuck Lever did on the NFSD Direct code. Fixes: c817248 ("nfs/localio: add proper O_DIRECT support for READ and WRITE") Reported-by: Christoph Hellwig <[email protected]> Signed-off-by: Mike Snitzer <[email protected]> Signed-off-by: Anna Schumaker <[email protected]>
1 parent d32ddfe commit 6a218b9

File tree

1 file changed

+52
-76
lines changed

1 file changed

+52
-76
lines changed

fs/nfs/localio.c

Lines changed: 52 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,7 @@ struct nfs_local_kiocb {
4444
short int end_iter_index;
4545
atomic_t n_iters;
4646
bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
47-
loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
48-
struct iov_iter iters[NFSLOCAL_MAX_IOS];
47+
struct iov_iter iters[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
4948
/* End mostly DIO-specific members */
5049
};
5150

@@ -314,6 +313,7 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
314313
init_sync_kiocb(&iocb->kiocb, file);
315314

316315
iocb->hdr = hdr;
316+
iocb->kiocb.ki_pos = hdr->args.offset;
317317
iocb->kiocb.ki_flags &= ~IOCB_APPEND;
318318
iocb->kiocb.ki_complete = NULL;
319319
iocb->aio_complete_work = NULL;
@@ -389,67 +389,65 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
389389
return true;
390390
}
391391

392+
static void
393+
nfs_local_iter_setup(struct iov_iter *iter, int rw, struct bio_vec *bvec,
394+
unsigned int nvecs, unsigned long total,
395+
size_t start, size_t len)
396+
{
397+
iov_iter_bvec(iter, rw, bvec, nvecs, total);
398+
if (start)
399+
iov_iter_advance(iter, start);
400+
iov_iter_truncate(iter, len);
401+
}
402+
392403
/*
393404
* Setup as many as 3 iov_iter based on extents described by @local_dio.
394405
* Returns the number of iov_iter that were setup.
395406
*/
396407
static int
397408
nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
398-
unsigned int nvecs, size_t len,
409+
unsigned int nvecs, unsigned long total,
399410
struct nfs_local_dio *local_dio)
400411
{
401412
int n_iters = 0;
402413
struct iov_iter *iters = iocb->iters;
403414

404415
/* Setup misaligned start? */
405416
if (local_dio->start_len) {
406-
iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
407-
iters[n_iters].count = local_dio->start_len;
408-
iocb->offset[n_iters] = iocb->hdr->args.offset;
409-
iocb->iter_is_dio_aligned[n_iters] = false;
410-
atomic_inc(&iocb->n_iters);
417+
nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec,
418+
nvecs, total, 0, local_dio->start_len);
411419
++n_iters;
412420
}
413421

414-
/* Setup misaligned end?
415-
* If so, the end is purposely setup to be issued using buffered IO
416-
* before the middle (which will use DIO, if DIO-aligned, with AIO).
417-
* This creates problems if/when the end results in short read or write.
418-
* So must save index and length of end to handle this corner case.
419-
*/
420-
if (local_dio->end_len) {
421-
iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
422-
iocb->offset[n_iters] = local_dio->end_offset;
423-
iov_iter_advance(&iters[n_iters],
424-
local_dio->start_len + local_dio->middle_len);
425-
iocb->iter_is_dio_aligned[n_iters] = false;
426-
/* Save index and length of end */
427-
iocb->end_iter_index = n_iters;
428-
iocb->end_len = local_dio->end_len;
429-
atomic_inc(&iocb->n_iters);
430-
++n_iters;
431-
}
432-
433-
/* Setup DIO-aligned middle to be issued last, to allow for
434-
* DIO with AIO completion (see nfs_local_call_{read,write}).
422+
/*
423+
* Setup DIO-aligned middle, if there is no misaligned end (below)
424+
* then AIO completion is used, see nfs_local_call_{read,write}
435425
*/
436-
iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
437-
if (local_dio->start_len)
438-
iov_iter_advance(&iters[n_iters], local_dio->start_len);
439-
iters[n_iters].count -= local_dio->end_len;
440-
iocb->offset[n_iters] = local_dio->middle_offset;
426+
nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, nvecs,
427+
total, local_dio->start_len, local_dio->middle_len);
441428

442429
iocb->iter_is_dio_aligned[n_iters] =
443430
nfs_iov_iter_aligned_bvec(&iters[n_iters],
444431
local_dio->mem_align-1, local_dio->offset_align-1);
445432

446433
if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) {
447434
trace_nfs_local_dio_misaligned(iocb->hdr->inode,
448-
iocb->hdr->args.offset, len, local_dio);
435+
local_dio->start_len, local_dio->middle_len, local_dio);
449436
return 0; /* no DIO-aligned IO possible */
450437
}
438+
iocb->end_iter_index = n_iters;
451439
++n_iters;
452440

441+
/* Setup misaligned end? */
442+
if (local_dio->end_len) {
443+
nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec,
444+
nvecs, total, local_dio->start_len +
445+
local_dio->middle_len, local_dio->end_len);
446+
iocb->end_iter_index = n_iters;
447+
++n_iters;
448+
}
449+
450+
atomic_set(&iocb->n_iters, n_iters);
453451
return n_iters;
454452
}
455453

@@ -476,7 +474,7 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
476474
len = hdr->args.count - total;
477475

478476
/*
479-
* For each iocb, iocb->n_iter is always at least 1 and we always
477+
* For each iocb, iocb->n_iters is always at least 1 and we always
480478
* end io after first nfs_local_pgio_done call unless misaligned DIO.
481479
*/
482480
atomic_set(&iocb->n_iters, 1);
@@ -494,9 +492,7 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
494492
}
495493

496494
/* Use buffered IO */
497-
iocb->offset[0] = hdr->args.offset;
498495
iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
499-
iocb->iter_is_dio_aligned[0] = false;
500496
}
501497

502498
static void
@@ -633,30 +629,20 @@ static void nfs_local_call_read(struct work_struct *work)
633629

634630
n_iters = atomic_read(&iocb->n_iters);
635631
for (int i = 0; i < n_iters ; i++) {
636-
/* DIO-aligned middle is always issued last with AIO completion */
637632
if (iocb->iter_is_dio_aligned[i]) {
638633
iocb->kiocb.ki_flags |= IOCB_DIRECT;
639-
iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
640-
iocb->aio_complete_work = nfs_local_read_aio_complete_work;
641-
}
634+
/* Only use AIO completion if DIO-aligned segment is last */
635+
if (i == iocb->end_iter_index) {
636+
iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
637+
iocb->aio_complete_work = nfs_local_read_aio_complete_work;
638+
}
639+
} else
640+
iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
642641

643-
iocb->kiocb.ki_pos = iocb->offset[i];
644642
status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
645643
if (status != -EIOCBQUEUED) {
646-
if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
647-
/* partial read */
648-
if (i == iocb->end_iter_index) {
649-
/* Must not account DIO partial end, otherwise (due
650-
* to end being issued before middle): the partial
651-
* read accounting in nfs_local_read_done()
652-
* would incorrectly advance hdr->args.offset
653-
*/
654-
status = 0;
655-
} else {
656-
/* Partial read at start or middle, force done */
657-
force_done = true;
658-
}
659-
}
644+
if (unlikely(status >= 0 && status < iocb->iters[i].count))
645+
force_done = true; /* Partial read */
660646
if (nfs_local_pgio_done(iocb, status, force_done)) {
661647
nfs_local_read_iocb_done(iocb);
662648
break;
@@ -851,30 +837,20 @@ static void nfs_local_call_write(struct work_struct *work)
851837
file_start_write(filp);
852838
n_iters = atomic_read(&iocb->n_iters);
853839
for (int i = 0; i < n_iters ; i++) {
854-
/* DIO-aligned middle is always issued last with AIO completion */
855840
if (iocb->iter_is_dio_aligned[i]) {
856841
iocb->kiocb.ki_flags |= IOCB_DIRECT;
857-
iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
858-
iocb->aio_complete_work = nfs_local_write_aio_complete_work;
859-
}
842+
/* Only use AIO completion if DIO-aligned segment is last */
843+
if (i == iocb->end_iter_index) {
844+
iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
845+
iocb->aio_complete_work = nfs_local_write_aio_complete_work;
846+
}
847+
} else
848+
iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
860849

861-
iocb->kiocb.ki_pos = iocb->offset[i];
862850
status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
863851
if (status != -EIOCBQUEUED) {
864-
if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
865-
/* partial write */
866-
if (i == iocb->end_iter_index) {
867-
/* Must not account DIO partial end, otherwise (due
868-
* to end being issued before middle): the partial
869-
* write accounting in nfs_local_write_done()
870-
* would incorrectly advance hdr->args.offset
871-
*/
872-
status = 0;
873-
} else {
874-
/* Partial write at start or middle, force done */
875-
force_done = true;
876-
}
877-
}
852+
if (unlikely(status >= 0 && status < iocb->iters[i].count))
853+
force_done = true; /* Partial write */
878854
if (nfs_local_pgio_done(iocb, status, force_done)) {
879855
nfs_local_write_iocb_done(iocb);
880856
break;

0 commit comments

Comments
 (0)