Skip to content

Commit 8c052fb

Browse files
committed
iomap: support IOCB_DIO_CALLER_COMP
If IOCB_DIO_CALLER_COMP is set, utilize that to set kiocb->dio_complete
handler and data for that callback. Rather than punt the completion to a
workqueue, we pass back the handler and data to the issuer and will get a
callback from a safe task context.

Using the following fio job to randomly dio write 4k blocks at queue
depths of 1..16:

fio --name=dio-write --filename=/data1/file --time_based=1 \
	--runtime=10 --bs=4096 --rw=randwrite --norandommap --buffered=0 \
	--cpus_allowed=4 --ioengine=io_uring --iodepth=$depth

shows the following results before and after this patch:

	Stock	Patched	Diff
=======================================
QD1	155K	162K	+ 4.5%
QD2	290K	313K	+ 7.9%
QD4	533K	597K	+12.0%
QD8	604K	827K	+36.9%
QD16	615K	845K	+37.4%

which shows nice wins all around. If we factored in per-IOP efficiency,
the wins look even nicer. This becomes apparent as queue depth rises, as
the offloaded workqueue completions run out of steam.

Reviewed-by: Darrick J. Wong <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Reviewed-by: Dave Chinner <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
1 parent 099ada2 commit 8c052fb

File tree

1 file changed

+60
-2
lines changed

1 file changed

+60
-2
lines changed

fs/iomap/direct-io.c

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
* Private flags for iomap_dio, must not overlap with the public ones in
2121
* iomap.h:
2222
*/
23+
#define IOMAP_DIO_CALLER_COMP (1U << 26)
2324
#define IOMAP_DIO_INLINE_COMP (1U << 27)
2425
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
2526
#define IOMAP_DIO_NEED_SYNC (1U << 29)
@@ -132,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
132133
}
133134
EXPORT_SYMBOL_GPL(iomap_dio_complete);
134135

136+
/*
 * Adapter giving iomap_dio_complete() the void * signature expected of
 * the kiocb->dio_complete callback (assigned in the end_io path below).
 * 'data' is presumably the struct iomap_dio * stashed in iocb->private —
 * NOTE(review): the issuer-side call that supplies 'data' is not visible
 * in this hunk; confirm against the IOCB_DIO_CALLER_COMP issuer code.
 */
static ssize_t iomap_dio_deferred_complete(void *data)
137+
{
138+
return iomap_dio_complete(data);
139+
}
140+
135141
static void iomap_dio_complete_work(struct work_struct *work)
136142
{
137143
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -182,6 +188,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
182188
goto release_bio;
183189
}
184190

191+
/*
192+
* If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
193+
* our completion that way to avoid an async punt to a workqueue.
194+
*/
195+
if (dio->flags & IOMAP_DIO_CALLER_COMP) {
196+
/* only polled IO cares about private cleared */
197+
iocb->private = dio;
198+
iocb->dio_complete = iomap_dio_deferred_complete;
199+
200+
/*
201+
* Invoke ->ki_complete() directly. We've assigned our
202+
* dio_complete callback handler, and since the issuer set
203+
* IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
204+
* notice ->dio_complete being set and will defer calling that
205+
* handler until it can be done from a safe task context.
206+
*
207+
* Note that the 'res' being passed in here is not important
208+
* for this case. The actual completion value of the request
209+
* will be gotten from dio_complete when that is run by the
210+
* issuer.
211+
*/
212+
iocb->ki_complete(iocb, 0);
213+
goto release_bio;
214+
}
215+
185216
/*
186217
* Async DIO completion that requires filesystem level completion work
187218
* gets punted to a work queue to complete as the operation may require
@@ -278,12 +309,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
278309
* after IO completion such as unwritten extent conversion) and
279310
* the underlying device either supports FUA or doesn't have
280311
* a volatile write cache. This allows us to avoid cache flushes
281-
* on IO completion.
312+
* on IO completion. If we can't use writethrough and need to
313+
* sync, disable in-task completions as dio completion will
314+
* need to call generic_write_sync() which will do a blocking
315+
* fsync / cache flush call.
282316
*/
283317
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
284318
(dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
285319
(bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
286320
use_fua = true;
321+
else if (dio->flags & IOMAP_DIO_NEED_SYNC)
322+
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
287323
}
288324

289325
/*
@@ -298,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
298334
goto out;
299335

300336
/*
301-
* We can only poll for single bio I/Os.
337+
* We can only do deferred completion for pure overwrites that
338+
* don't require additional IO at completion. This rules out
339+
* writes that need zeroing or extent conversion, extend
340+
* the file size, or issue journal IO or cache flushes
341+
* during completion processing.
302342
*/
303343
if (need_zeroout ||
344+
((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
304345
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
346+
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
347+
348+
/*
349+
* The rules for polled IO completions follow the guidelines as the
350+
* ones we set for inline and deferred completions. If none of those
351+
* are available for this IO, clear the polled flag.
352+
*/
353+
if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
305354
dio->iocb->ki_flags &= ~IOCB_HIPRI;
306355

307356
if (need_zeroout) {
@@ -547,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
547596
iomi.flags |= IOMAP_WRITE;
548597
dio->flags |= IOMAP_DIO_WRITE;
549598

599+
/*
600+
* Flag as supporting deferred completions, if the issuer
601+
* groks it. This can avoid a workqueue punt for writes.
602+
* We may later clear this flag if we need to do other IO
603+
* as part of this IO completion.
604+
*/
605+
if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
606+
dio->flags |= IOMAP_DIO_CALLER_COMP;
607+
550608
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
551609
ret = -EAGAIN;
552610
if (iomi.pos >= dio->i_size ||

0 commit comments

Comments
 (0)