Skip to content

Commit 377698d

Browse files
author
Darrick J. Wong
committed
Merge tag 'xfs-async-dio.6-2023-08-01' of git://git.kernel.dk/linux into iomap-6.6-mergeA
Improve iomap/xfs async dio write performance iomap always punts async dio write completions to a workqueue, which has a cost in terms of efficiency (now you need an unrelated worker to process it) and latency (now you're bouncing a completion through an async worker, which is a classic slowdown scenario). io_uring handles IRQ completions via task_work, and for writes that don't need to do extra IO at completion time, we can safely complete them inline from that. This patchset adds IOCB_DIO_CALLER_COMP, which an IO issuer can set to inform the completion side that any extra work that needs doing for that completion can be punted to a safe task context. The iomap dio completion will happen in hard/soft irq context, and we need a saner context to process these completions. IOCB_DIO_CALLER_COMP is added, which can be set in a struct kiocb->ki_flags by the issuer. If the completion side of the iocb handling understands this flag, it can choose to set a kiocb->dio_complete() handler and just call ki_complete from IRQ context. The issuer must then ensure that this callback is processed from a task. io_uring punts IRQ completions to task_work already, so it's trivial to wire it up to run more of the completion before posting a CQE. This is good for up to a 37% improvement in throughput/latency for low queue depth IO, patch 5 has the details. If we need to do real work at completion time, iomap will clear the IOMAP_DIO_CALLER_COMP flag. This work came about when Andres tested low queue depth dio writes for postgres and compared it to doing sync dio writes, showing that the async processing slows us down a lot. 
* tag 'xfs-async-dio.6-2023-08-01' of git://git.kernel.dk/linux: iomap: support IOCB_DIO_CALLER_COMP io_uring/rw: add write support for IOCB_DIO_CALLER_COMP fs: add IOCB flags related to passing back dio completions iomap: add IOMAP_DIO_INLINE_COMP iomap: only set iocb->private for polled bio iomap: treat a write through cache the same as FUA iomap: use an unsigned type for IOMAP_DIO_* defines iomap: cleanup up iomap_dio_bio_end_io() Signed-off-by: Darrick J. Wong <[email protected]>
2 parents a67371b + 8c052fb commit 377698d

File tree

3 files changed

+180
-45
lines changed

3 files changed

+180
-45
lines changed

fs/iomap/direct-io.c

Lines changed: 123 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@
2020
* Private flags for iomap_dio, must not overlap with the public ones in
2121
* iomap.h:
2222
*/
23-
#define IOMAP_DIO_WRITE_FUA (1 << 28)
24-
#define IOMAP_DIO_NEED_SYNC (1 << 29)
25-
#define IOMAP_DIO_WRITE (1 << 30)
26-
#define IOMAP_DIO_DIRTY (1 << 31)
23+
#define IOMAP_DIO_CALLER_COMP (1U << 26)
24+
#define IOMAP_DIO_INLINE_COMP (1U << 27)
25+
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
26+
#define IOMAP_DIO_NEED_SYNC (1U << 29)
27+
#define IOMAP_DIO_WRITE (1U << 30)
28+
#define IOMAP_DIO_DIRTY (1U << 31)
2729

2830
struct iomap_dio {
2931
struct kiocb *iocb;
@@ -41,7 +43,6 @@ struct iomap_dio {
4143
struct {
4244
struct iov_iter *iter;
4345
struct task_struct *waiter;
44-
struct bio *poll_bio;
4546
} submit;
4647

4748
/* used for aio completion: */
@@ -63,12 +64,14 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
6364
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
6465
struct iomap_dio *dio, struct bio *bio, loff_t pos)
6566
{
67+
struct kiocb *iocb = dio->iocb;
68+
6669
atomic_inc(&dio->ref);
6770

6871
/* Sync dio can't be polled reliably */
69-
if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) {
70-
bio_set_polled(bio, dio->iocb);
71-
dio->submit.poll_bio = bio;
72+
if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) {
73+
bio_set_polled(bio, iocb);
74+
WRITE_ONCE(iocb->private, bio);
7275
}
7376

7477
if (dio->dops && dio->dops->submit_io)
@@ -130,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
130133
}
131134
EXPORT_SYMBOL_GPL(iomap_dio_complete);
132135

136+
static ssize_t iomap_dio_deferred_complete(void *data)
137+
{
138+
return iomap_dio_complete(data);
139+
}
140+
133141
static void iomap_dio_complete_work(struct work_struct *work)
134142
{
135143
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -152,27 +160,69 @@ void iomap_dio_bio_end_io(struct bio *bio)
152160
{
153161
struct iomap_dio *dio = bio->bi_private;
154162
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
163+
struct kiocb *iocb = dio->iocb;
155164

156165
if (bio->bi_status)
157166
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
167+
if (!atomic_dec_and_test(&dio->ref))
168+
goto release_bio;
158169

159-
if (atomic_dec_and_test(&dio->ref)) {
160-
if (dio->wait_for_completion) {
161-
struct task_struct *waiter = dio->submit.waiter;
162-
WRITE_ONCE(dio->submit.waiter, NULL);
163-
blk_wake_io_task(waiter);
164-
} else if (dio->flags & IOMAP_DIO_WRITE) {
165-
struct inode *inode = file_inode(dio->iocb->ki_filp);
166-
167-
WRITE_ONCE(dio->iocb->private, NULL);
168-
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
169-
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
170-
} else {
171-
WRITE_ONCE(dio->iocb->private, NULL);
172-
iomap_dio_complete_work(&dio->aio.work);
173-
}
170+
/*
171+
* Synchronous dio, task itself will handle any completion work
172+
* that needs after IO. All we need to do is wake the task.
173+
*/
174+
if (dio->wait_for_completion) {
175+
struct task_struct *waiter = dio->submit.waiter;
176+
177+
WRITE_ONCE(dio->submit.waiter, NULL);
178+
blk_wake_io_task(waiter);
179+
goto release_bio;
180+
}
181+
182+
/*
183+
* Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
184+
*/
185+
if (dio->flags & IOMAP_DIO_INLINE_COMP) {
186+
WRITE_ONCE(iocb->private, NULL);
187+
iomap_dio_complete_work(&dio->aio.work);
188+
goto release_bio;
189+
}
190+
191+
/*
192+
* If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
193+
* our completion that way to avoid an async punt to a workqueue.
194+
*/
195+
if (dio->flags & IOMAP_DIO_CALLER_COMP) {
196+
/* only polled IO cares about private cleared */
197+
iocb->private = dio;
198+
iocb->dio_complete = iomap_dio_deferred_complete;
199+
200+
/*
201+
* Invoke ->ki_complete() directly. We've assigned our
202+
* dio_complete callback handler, and since the issuer set
203+
* IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
204+
* notice ->dio_complete being set and will defer calling that
205+
* handler until it can be done from a safe task context.
206+
*
207+
* Note that the 'res' being passed in here is not important
208+
* for this case. The actual completion value of the request
209+
* will be gotten from dio_complete when that is run by the
210+
* issuer.
211+
*/
212+
iocb->ki_complete(iocb, 0);
213+
goto release_bio;
174214
}
175215

216+
/*
217+
* Async DIO completion that requires filesystem level completion work
218+
* gets punted to a work queue to complete as the operation may require
219+
* more IO to be issued to finalise filesystem metadata changes or
220+
* guarantee data integrity.
221+
*/
222+
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
223+
queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
224+
&dio->aio.work);
225+
release_bio:
176226
if (should_dirty) {
177227
bio_check_pages_dirty(bio);
178228
} else {
@@ -203,7 +253,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
203253
/*
204254
* Figure out the bio's operation flags from the dio request, the
205255
* mapping, and whether or not we want FUA. Note that we can end up
206-
* clearing the WRITE_FUA flag in the dio request.
256+
* clearing the WRITE_THROUGH flag in the dio request.
207257
*/
208258
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
209259
const struct iomap *iomap, bool use_fua)
@@ -217,7 +267,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
217267
if (use_fua)
218268
opflags |= REQ_FUA;
219269
else
220-
dio->flags &= ~IOMAP_DIO_WRITE_FUA;
270+
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
221271

222272
return opflags;
223273
}
@@ -257,12 +307,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
257307
* Use a FUA write if we need datasync semantics, this is a pure
258308
* data IO that doesn't require any metadata updates (including
259309
* after IO completion such as unwritten extent conversion) and
260-
* the underlying device supports FUA. This allows us to avoid
261-
* cache flushes on IO completion.
310+
* the underlying device either supports FUA or doesn't have
311+
* a volatile write cache. This allows us to avoid cache flushes
312+
* on IO completion. If we can't use writethrough and need to
313+
* sync, disable in-task completions as dio completion will
314+
* need to call generic_write_sync() which will do a blocking
315+
* fsync / cache flush call.
262316
*/
263317
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
264-
(dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
318+
(dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
319+
(bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
265320
use_fua = true;
321+
else if (dio->flags & IOMAP_DIO_NEED_SYNC)
322+
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
266323
}
267324

268325
/*
@@ -277,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
277334
goto out;
278335

279336
/*
280-
* We can only poll for single bio I/Os.
337+
* We can only do deferred completion for pure overwrites that
338+
* don't require additional IO at completion. This rules out
339+
* writes that need zeroing or extent conversion, extend
340+
* the file size, or issue journal IO or cache flushes
341+
* during completion processing.
281342
*/
282343
if (need_zeroout ||
344+
((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
283345
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
346+
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
347+
348+
/*
349+
* The rules for polled IO completions follow the guidelines as the
350+
* ones we set for inline and deferred completions. If none of those
351+
* are available for this IO, clear the polled flag.
352+
*/
353+
if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
284354
dio->iocb->ki_flags &= ~IOCB_HIPRI;
285355

286356
if (need_zeroout) {
@@ -505,12 +575,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
505575

506576
dio->submit.iter = iter;
507577
dio->submit.waiter = current;
508-
dio->submit.poll_bio = NULL;
509578

510579
if (iocb->ki_flags & IOCB_NOWAIT)
511580
iomi.flags |= IOMAP_NOWAIT;
512581

513582
if (iov_iter_rw(iter) == READ) {
583+
/* reads can always complete inline */
584+
dio->flags |= IOMAP_DIO_INLINE_COMP;
585+
514586
if (iomi.pos >= dio->i_size)
515587
goto out_free_dio;
516588

@@ -524,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
524596
iomi.flags |= IOMAP_WRITE;
525597
dio->flags |= IOMAP_DIO_WRITE;
526598

599+
/*
600+
* Flag as supporting deferred completions, if the issuer
601+
* groks it. This can avoid a workqueue punt for writes.
602+
* We may later clear this flag if we need to do other IO
603+
* as part of this IO completion.
604+
*/
605+
if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
606+
dio->flags |= IOMAP_DIO_CALLER_COMP;
607+
527608
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
528609
ret = -EAGAIN;
529610
if (iomi.pos >= dio->i_size ||
@@ -537,13 +618,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
537618
dio->flags |= IOMAP_DIO_NEED_SYNC;
538619

539620
/*
540-
* For datasync only writes, we optimistically try
541-
* using FUA for this IO. Any non-FUA write that
542-
* occurs will clear this flag, hence we know before
543-
* completion whether a cache flush is necessary.
621+
* For datasync only writes, we optimistically try using
622+
* WRITE_THROUGH for this IO. This flag requires either
623+
* FUA writes through the device's write cache, or a
624+
* normal write to a device without a volatile write
625+
* cache. For the former, Any non-FUA write that occurs
626+
* will clear this flag, hence we know before completion
627+
* whether a cache flush is necessary.
544628
*/
545629
if (!(iocb->ki_flags & IOCB_SYNC))
546-
dio->flags |= IOMAP_DIO_WRITE_FUA;
630+
dio->flags |= IOMAP_DIO_WRITE_THROUGH;
547631
}
548632

549633
/*
@@ -605,14 +689,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
605689
iomap_dio_set_error(dio, ret);
606690

607691
/*
608-
* If all the writes we issued were FUA, we don't need to flush the
609-
* cache on IO completion. Clear the sync flag for this case.
692+
* If all the writes we issued were already written through to the
693+
* media, we don't need to flush the cache on IO completion. Clear the
694+
* sync flag for this case.
610695
*/
611-
if (dio->flags & IOMAP_DIO_WRITE_FUA)
696+
if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
612697
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
613698

614-
WRITE_ONCE(iocb->private, dio->submit.poll_bio);
615-
616699
/*
617700
* We are about to drop our additional submission reference, which
618701
* might be the last reference to the dio. There are three different

include/linux/fs.h

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,20 @@ enum rw_hint {
338338
#define IOCB_NOIO (1 << 20)
339339
/* can use bio alloc cache */
340340
#define IOCB_ALLOC_CACHE (1 << 21)
341+
/*
342+
* IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
343+
* iocb completion can be passed back to the owner for execution from a safe
344+
* context rather than needing to be punted through a workqueue. If this
345+
* flag is set, the bio completion handling may set iocb->dio_complete to a
346+
* handler function and iocb->private to context information for that handler.
347+
* The issuer should call the handler with that context information from task
348+
* context to complete the processing of the iocb. Note that while this
349+
* provides a task context for the dio_complete() callback, it should only be
350+
* used on the completion side for non-IO generating completions. It's fine to
351+
* call blocking functions from this callback, but they should not wait for
352+
* unrelated IO (like cache flushing, new IO generation, etc).
353+
*/
354+
#define IOCB_DIO_CALLER_COMP (1 << 22)
341355

342356
/* for use in trace events */
343357
#define TRACE_IOCB_STRINGS \
@@ -351,7 +365,8 @@ enum rw_hint {
351365
{ IOCB_WRITE, "WRITE" }, \
352366
{ IOCB_WAITQ, "WAITQ" }, \
353367
{ IOCB_NOIO, "NOIO" }, \
354-
{ IOCB_ALLOC_CACHE, "ALLOC_CACHE" }
368+
{ IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \
369+
{ IOCB_DIO_CALLER_COMP, "CALLER_COMP" }
355370

356371
struct kiocb {
357372
struct file *ki_filp;
@@ -360,7 +375,23 @@ struct kiocb {
360375
void *private;
361376
int ki_flags;
362377
u16 ki_ioprio; /* See linux/ioprio.h */
363-
struct wait_page_queue *ki_waitq; /* for async buffered IO */
378+
union {
379+
/*
380+
* Only used for async buffered reads, where it denotes the
381+
* page waitqueue associated with completing the read. Valid
382+
* IFF IOCB_WAITQ is set.
383+
*/
384+
struct wait_page_queue *ki_waitq;
385+
/*
386+
* Can be used for O_DIRECT IO, where the completion handling
387+
* is punted back to the issuer of the IO. May only be set
388+
* if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
389+
* must then check for presence of this handler when ki_complete
390+
* is invoked. The data passed in to this handler must be
391+
* assigned to ->private when dio_complete is assigned.
392+
*/
393+
ssize_t (*dio_complete)(void *data);
394+
};
364395
};
365396

366397
static inline bool is_sync_kiocb(struct kiocb *kiocb)

io_uring/rw.c

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
105105
} else {
106106
rw->kiocb.ki_ioprio = get_current_ioprio();
107107
}
108+
rw->kiocb.dio_complete = NULL;
108109

109110
rw->addr = READ_ONCE(sqe->addr);
110111
rw->len = READ_ONCE(sqe->len);
@@ -285,6 +286,15 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
285286

286287
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
287288
{
289+
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
290+
struct kiocb *kiocb = &rw->kiocb;
291+
292+
if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
293+
long res = kiocb->dio_complete(rw->kiocb.private);
294+
295+
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
296+
}
297+
288298
io_req_io_end(req);
289299

290300
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
@@ -300,9 +310,11 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
300310
struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
301311
struct io_kiocb *req = cmd_to_io_kiocb(rw);
302312

303-
if (__io_complete_rw_common(req, res))
304-
return;
305-
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
313+
if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
314+
if (__io_complete_rw_common(req, res))
315+
return;
316+
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
317+
}
306318
req->io_task_work.func = io_req_rw_complete;
307319
__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
308320
}
@@ -916,6 +928,15 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
916928
}
917929
kiocb->ki_flags |= IOCB_WRITE;
918930

931+
/*
932+
* For non-polled IO, set IOCB_DIO_CALLER_COMP, stating that our handler
933+
* groks deferring the completion to task context. This isn't
934+
* necessary and useful for polled IO as that can always complete
935+
* directly.
936+
*/
937+
if (!(kiocb->ki_flags & IOCB_HIPRI))
938+
kiocb->ki_flags |= IOCB_DIO_CALLER_COMP;
939+
919940
if (likely(req->file->f_op->write_iter))
920941
ret2 = call_write_iter(req->file, kiocb, &s->iter);
921942
else if (req->file->f_op->write)

0 commit comments

Comments
 (0)