Skip to content

Commit c817248

Browse files
Mike Snitzer, Anna Schumaker
authored and committed
nfs/localio: add proper O_DIRECT support for READ and WRITE
Because the NFS client will already happily handle misaligned O_DIRECT IO (by sending it out to NFSD via RPC) this commit's new capabilities are for the benefit of LOCALIO. LOCALIO will make a best effort to transform misaligned IO to DIO-aligned extents when possible. LOCALIO's READ and WRITE DIO that is misaligned will be split into as many as 3 component IOs (@start, @middle and @end) as needed -- IFF the @middle extent is verified to be DIO-aligned, and then the @start and/or @end are misaligned (due to each being a partial page). Otherwise if the @middle isn't DIO-aligned the code will fall back to issuing only a single contiguous buffered IO. The @middle is only DIO-aligned if both the memory and on-disk offsets for the IO are aligned relative to the underlying local filesystem's block device limits (@dma_alignment and @logical_block_size respectively). The misaligned @start and/or @end extents are issued using buffered IO and the DIO-aligned @middle is issued using O_DIRECT. The @start and @end IOs are issued first using buffered IO with IOCB_SYNC and then the @middle is issued last using direct IO with async completion (AIO). This out of order IO completion means that LOCALIO's IO completion code (nfs_local_read_done and nfs_local_write_done) is only called for the IO's last associated iov_iter completion. And in the case of DIO-aligned @middle it completes last using AIO. nfs_local_pgio_done() is updated to handle piece-wise partial completion of each iov_iter. This implementation for LOCALIO's misaligned DIO handling uses 3 iov_iter that share the same backing pages in their bio_vecs (so unfortunately 'struct nfs_local_kiocb' has 3 instead of only 1). [Reducing LOCALIO's per-IO (struct nfs_local_kiocb) memory use can be explored in the future.
One logical progression to improve this code, and eliminate explicit loops over up to 3 iov_iter, is by extending 'struct iov_iter' to support iov_iter_clone() and iov_iter_chain() interfaces that are comparable to what 'struct bio' is able to support in the block layer. But even that wouldn't avoid the need to allocate/use up to 3 iov_iter] Signed-off-by: Mike Snitzer <[email protected]> Signed-off-by: Anna Schumaker <[email protected]>
1 parent e43e9a3 commit c817248

File tree

1 file changed

+202
-47
lines changed

1 file changed

+202
-47
lines changed

fs/nfs/localio.c

Lines changed: 202 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,23 @@
3030

3131
#define NFSDBG_FACILITY NFSDBG_VFS
3232

33+
#define NFSLOCAL_MAX_IOS 3
34+
3335
struct nfs_local_kiocb {
3436
struct kiocb kiocb;
3537
struct bio_vec *bvec;
3638
struct nfs_pgio_header *hdr;
3739
struct work_struct work;
3840
void (*aio_complete_work)(struct work_struct *);
39-
struct iov_iter iter ____cacheline_aligned;
4041
struct nfsd_file *localio;
42+
/* Begin mostly DIO-specific members */
43+
size_t end_len;
44+
short int end_iter_index;
45+
short int n_iters;
46+
bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
47+
loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
48+
struct iov_iter iters[NFSLOCAL_MAX_IOS];
49+
/* End mostly DIO-specific members */
4150
};
4251

4352
struct nfs_local_fsync_ctx {
@@ -291,7 +300,7 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
291300
{
292301
struct nfs_local_kiocb *iocb;
293302

294-
iocb = kmalloc(sizeof(*iocb), flags);
303+
iocb = kzalloc(sizeof(*iocb), flags);
295304
if (iocb == NULL)
296305
return NULL;
297306

@@ -303,25 +312,72 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
303312
}
304313

305314
init_sync_kiocb(&iocb->kiocb, file);
306-
if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags))
307-
iocb->kiocb.ki_flags = IOCB_DIRECT;
308315

309-
iocb->kiocb.ki_pos = hdr->args.offset;
310316
iocb->hdr = hdr;
311317
iocb->kiocb.ki_flags &= ~IOCB_APPEND;
312318
iocb->aio_complete_work = NULL;
313319

320+
iocb->end_iter_index = -1;
321+
314322
return iocb;
315323
}
316324

325+
/*
 * Describes how one IO is split into up to three extents (start, middle,
 * end). Filled in by nfs_is_local_dio_possible() and consumed by
 * nfs_local_iters_setup_dio().
 */
struct nfs_local_dio {
326+
/* Memory alignment reported by nfsd_file_dio_alignment() */
u32 mem_align;
327+
/* File-offset alignment reported by nfsd_file_dio_alignment() */
u32 offset_align;
328+
/* File offset of the middle extent: the IO offset rounded up to offset_align */
loff_t middle_offset;
329+
/* File offset of the end extent: the IO's end rounded down to offset_align */
loff_t end_offset;
330+
ssize_t start_len; /* Length for misaligned first extent */
331+
ssize_t middle_len; /* Length for DIO-aligned middle extent */
332+
ssize_t end_len; /* Length for misaligned last extent */
333+
};
334+
335+
/*
 * Decide whether an IO of @len bytes at hdr->args.offset can be split for
 * direct IO, and if so describe the split in @local_dio.
 *
 * Returns false (leaving @local_dio untouched) when either alignment is
 * reported as zero, when the offset alignment exceeds PAGE_SIZE, or when
 * @len is smaller than the offset alignment. Otherwise fills in every
 * member of @local_dio and returns true.
 */
static bool
336+
nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
337+
size_t len, struct nfs_local_dio *local_dio)
338+
{
339+
struct nfs_pgio_header *hdr = iocb->hdr;
340+
loff_t offset = hdr->args.offset;
341+
u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
342+
loff_t start_end, orig_end, middle_end;
343+
344+
nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
345+
&nf_dio_offset_align, &nf_dio_read_offset_align);
346+
/* ITER_DEST (presumably a READ) uses the read-specific offset alignment */
if (rw == ITER_DEST)
347+
nf_dio_offset_align = nf_dio_read_offset_align;
348+
349+
if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align))
350+
return false;
351+
if (unlikely(nf_dio_offset_align > PAGE_SIZE))
352+
return false;
353+
if (unlikely(len < nf_dio_offset_align))
354+
return false;
355+
356+
local_dio->mem_align = nf_dio_mem_align;
357+
local_dio->offset_align = nf_dio_offset_align;
358+
359+
/*
 * [offset, start_end) is the misaligned start, [start_end, middle_end)
 * the middle, and [middle_end, orig_end) the misaligned end.
 */
start_end = round_up(offset, nf_dio_offset_align);
360+
orig_end = offset + len;
361+
middle_end = round_down(orig_end, nf_dio_offset_align);
362+
363+
local_dio->middle_offset = start_end;
364+
local_dio->end_offset = middle_end;
365+
366+
local_dio->start_len = start_end - offset;
367+
/*
 * NOTE(review): middle_len can be 0 here even though len >= alignment,
 * e.g. offset=100, len=600, alignment=512 gives start_end == middle_end
 * == 512. Confirm the caller copes with an empty middle extent.
 */
local_dio->middle_len = middle_end - start_end;
368+
local_dio->end_len = orig_end - middle_end;
369+
370+
return true;
371+
}
372+
317373
static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
318-
loff_t offset, unsigned int addr_mask, unsigned int len_mask)
374+
unsigned int addr_mask, unsigned int len_mask)
319375
{
320376
const struct bio_vec *bvec = i->bvec;
321377
size_t skip = i->iov_offset;
322378
size_t size = i->count;
323379

324-
if ((offset | size) & len_mask)
380+
if (size & len_mask)
325381
return false;
326382
do {
327383
size_t len = bvec->bv_len;
@@ -338,8 +394,68 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
338394
return true;
339395
}
340396

341-
static void
342-
nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
397+
/*
398+
* Setup as many as 3 iov_iter based on extents described by @local_dio.
399+
* Returns the number of iov_iter that were setup.
400+
*/
401+
/*
 * Note: returns 0 rather than a count when the middle extent fails the
 * alignment check; iocb->n_iters is only written on the success path.
 * Slots are filled in issue order: start (if any), end (if any), middle.
 */
static int
402+
nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
403+
unsigned int nvecs, size_t len,
404+
struct nfs_local_dio *local_dio)
405+
{
406+
int n_iters = 0;
407+
struct iov_iter *iters = iocb->iters;
408+
409+
/* Setup misaligned start? */
410+
if (local_dio->start_len) {
411+
iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
412+
/* Trim the iter to only the leading start_len bytes */
iters[n_iters].count = local_dio->start_len;
413+
iocb->offset[n_iters] = iocb->hdr->args.offset;
414+
iocb->iter_is_dio_aligned[n_iters] = false;
415+
++n_iters;
416+
}
417+
418+
/* Setup misaligned end?
419+
* If so, the end is purposely setup to be issued using buffered IO
420+
* before the middle (which will use DIO, if DIO-aligned, with AIO).
421+
* This creates problems if/when the end results in a partial write.
422+
* So must save index and length of end to handle this corner case.
423+
*/
424+
if (local_dio->end_len) {
425+
iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
426+
iocb->offset[n_iters] = local_dio->end_offset;
427+
/* Skip the start and middle extents so only the end remains */
iov_iter_advance(&iters[n_iters],
428+
local_dio->start_len + local_dio->middle_len);
429+
iocb->iter_is_dio_aligned[n_iters] = false;
430+
/* Save index and length of end */
431+
iocb->end_iter_index = n_iters;
432+
iocb->end_len = local_dio->end_len;
433+
++n_iters;
434+
}
435+
436+
/* Setup DIO-aligned middle to be issued last, to allow for
437+
* DIO with AIO completion (see nfs_local_call_{read,write}).
438+
*/
439+
iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
440+
if (local_dio->start_len)
441+
iov_iter_advance(&iters[n_iters], local_dio->start_len);
442+
/* Drop the end extent from the count, leaving only the middle */
iters[n_iters].count -= local_dio->end_len;
443+
iocb->offset[n_iters] = local_dio->middle_offset;
444+
445+
/* The alignments are passed as masks (alignment - 1) */
iocb->iter_is_dio_aligned[n_iters] =
446+
nfs_iov_iter_aligned_bvec(&iters[n_iters],
447+
local_dio->mem_align-1, local_dio->offset_align-1);
448+
449+
/*
 * NOTE(review): on this early return iocb->end_iter_index and
 * iocb->end_len (plus earlier iters[]/offset[] slots) may already have
 * been written above. The buffered fallback in nfs_local_iters_init()
 * re-initialises offset[0], iters[0] and n_iters but not
 * end_iter_index; confirm the "i == iocb->end_iter_index" partial-write
 * check in nfs_local_call_write() cannot misfire on that path.
 */
if (unlikely(!iocb->iter_is_dio_aligned[n_iters]))
450+
return 0; /* no DIO-aligned IO possible */
451+
++n_iters;
452+
453+
iocb->n_iters = n_iters;
454+
return n_iters;
455+
}
456+
457+
static noinline_for_stack void
458+
nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
343459
{
344460
struct nfs_pgio_header *hdr = iocb->hdr;
345461
struct page **pagevec = hdr->page_array.pagevec;
@@ -360,26 +476,18 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int rw)
360476
}
361477
len = hdr->args.count - total;
362478

363-
iov_iter_bvec(i, rw, iocb->bvec, v, len);
364-
365-
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
366-
u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
367-
/* Verify the IO is DIO-aligned as required */
368-
nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
369-
&nf_dio_offset_align,
370-
&nf_dio_read_offset_align);
371-
if (rw == ITER_DEST)
372-
nf_dio_offset_align = nf_dio_read_offset_align;
373-
374-
if (nf_dio_mem_align && nf_dio_offset_align &&
375-
nfs_iov_iter_aligned_bvec(i, hdr->args.offset,
376-
nf_dio_mem_align - 1,
377-
nf_dio_offset_align - 1))
378-
return; /* is DIO-aligned */
479+
if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
480+
struct nfs_local_dio local_dio;
379481

380-
/* Fallback to using buffered for this misaligned IO */
381-
iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
482+
if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
483+
nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0)
484+
return; /* is DIO-aligned */
382485
}
486+
487+
/* Use buffered IO */
488+
iocb->offset[0] = hdr->args.offset;
489+
iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
490+
iocb->n_iters = 1;
383491
}
384492

385493
static void
@@ -402,10 +510,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr,
402510
static void
403511
nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
404512
{
513+
/* Must handle partial completions */
405514
if (status >= 0) {
406-
hdr->res.count = status;
407-
hdr->res.op_status = NFS4_OK;
408-
hdr->task.tk_status = 0;
515+
hdr->res.count += status;
516+
/* @hdr was initialized to 0 (zeroed during allocation) */
517+
if (hdr->task.tk_status == 0)
518+
hdr->res.op_status = NFS4_OK;
409519
} else {
410520
hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
411521
hdr->task.tk_status = status;
@@ -451,8 +561,6 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
451561
pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n");
452562
}
453563

454-
nfs_local_pgio_done(hdr, status);
455-
456564
/*
457565
* Must clear replen otherwise NFSv3 data corruption will occur
458566
* if/when switching from LOCALIO back to using normal RPC.
@@ -480,6 +588,7 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
480588
struct nfs_local_kiocb *iocb =
481589
container_of(kiocb, struct nfs_local_kiocb, kiocb);
482590

591+
nfs_local_pgio_done(iocb->hdr, ret);
483592
nfs_local_read_done(iocb, ret);
484593
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */
485594
}
@@ -494,12 +603,21 @@ static void nfs_local_call_read(struct work_struct *work)
494603

495604
save_cred = override_creds(filp->f_cred);
496605

497-
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
498-
iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
499-
iocb->aio_complete_work = nfs_local_read_aio_complete_work;
500-
}
606+
for (int i = 0; i < iocb->n_iters ; i++) {
607+
if (iocb->iter_is_dio_aligned[i]) {
608+
iocb->kiocb.ki_flags |= IOCB_DIRECT;
609+
iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
610+
iocb->aio_complete_work = nfs_local_read_aio_complete_work;
611+
}
501612

502-
status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iter);
613+
iocb->kiocb.ki_pos = iocb->offset[i];
614+
status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
615+
if (status != -EIOCBQUEUED) {
616+
nfs_local_pgio_done(iocb->hdr, status);
617+
if (iocb->hdr->task.tk_status)
618+
break;
619+
}
620+
}
503621

504622
revert_creds(save_cred);
505623

@@ -635,18 +753,19 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
635753
}
636754

637755
/* Handle short writes as if they are ENOSPC */
756+
status = hdr->res.count;
638757
if (status > 0 && status < hdr->args.count) {
639758
hdr->mds_offset += status;
640759
hdr->args.offset += status;
641760
hdr->args.pgbase += status;
642761
hdr->args.count -= status;
643762
nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
644763
status = -ENOSPC;
764+
/* record -ENOSPC in terms of nfs_local_pgio_done */
765+
nfs_local_pgio_done(hdr, status);
645766
}
646-
if (status < 0)
767+
if (hdr->task.tk_status < 0)
647768
nfs_reset_boot_verifier(inode);
648-
649-
nfs_local_pgio_done(hdr, status);
650769
}
651770

652771
static void nfs_local_write_aio_complete_work(struct work_struct *work)
@@ -663,6 +782,7 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
663782
struct nfs_local_kiocb *iocb =
664783
container_of(kiocb, struct nfs_local_kiocb, kiocb);
665784

785+
nfs_local_pgio_done(iocb->hdr, ret);
666786
nfs_local_write_done(iocb, ret);
667787
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */
668788
}
@@ -679,13 +799,48 @@ static void nfs_local_call_write(struct work_struct *work)
679799
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
680800
save_cred = override_creds(filp->f_cred);
681801

682-
if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
683-
iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
684-
iocb->aio_complete_work = nfs_local_write_aio_complete_work;
685-
}
686-
687802
file_start_write(filp);
688-
status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iter);
803+
for (int i = 0; i < iocb->n_iters ; i++) {
804+
if (iocb->iter_is_dio_aligned[i]) {
805+
iocb->kiocb.ki_flags |= IOCB_DIRECT;
806+
iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
807+
iocb->aio_complete_work = nfs_local_write_aio_complete_work;
808+
}
809+
retry:
810+
iocb->kiocb.ki_pos = iocb->offset[i];
811+
status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
812+
if (status != -EIOCBQUEUED) {
813+
if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
814+
/* partial write */
815+
if (i == iocb->end_iter_index) {
816+
/* Must not account partial end, otherwise, due
817+
* to end being issued before middle: the partial
818+
* write accounting in nfs_local_write_done()
819+
* would incorrectly advance hdr->args.offset
820+
*/
821+
status = 0;
822+
} else {
823+
/* Partial write at start or buffered middle,
824+
* exit early.
825+
*/
826+
nfs_local_pgio_done(iocb->hdr, status);
827+
break;
828+
}
829+
} else if (unlikely(status == -ENOTBLK &&
830+
(iocb->kiocb.ki_flags & IOCB_DIRECT))) {
831+
/* VFS will return -ENOTBLK if DIO WRITE fails to
832+
* invalidate the page cache. Retry using buffered IO.
833+
*/
834+
iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
835+
iocb->kiocb.ki_complete = NULL;
836+
iocb->aio_complete_work = NULL;
837+
goto retry;
838+
}
839+
nfs_local_pgio_done(iocb->hdr, status);
840+
if (iocb->hdr->task.tk_status)
841+
break;
842+
}
843+
}
689844
file_end_write(filp);
690845

691846
revert_creds(save_cred);
@@ -754,7 +909,7 @@ nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio)
754909
iocb->hdr = hdr;
755910
iocb->localio = localio;
756911

757-
nfs_local_iter_init(&iocb->iter, iocb, rw);
912+
nfs_local_iters_init(iocb, rw);
758913

759914
return iocb;
760915
}

0 commit comments

Comments
 (0)