
Commit 1100c4a

io_uring: add support for batch wait timeout
Waiting for events with io_uring has two knobs that can be set:

1) The number of events to wake for
2) The timeout associated with the event

Waiting will abort when either of those conditions is met, as expected.

This adds support for a third condition, which is associated with the number of events to wait for. Applications generally like to handle batches of completions, and right now they'd set a number of events to wait for and the timeout for that. If no events have been received but the timeout triggers, control is returned to the application and it can wait again. However, if the application doesn't have anything to do until events are reaped, then it's possible to make this waiting more efficient.

For example, the application may have a latency budget of 50 usecs and want to handle a batch of 8 requests at a time. If it uses 50 usecs as the timeout, then it'll be doing 20K context switches per second even if nothing is happening.

This introduces the notion of a min batch wait time. If the min batch wait time expires, then we'll return to userspace if we have any events at all. If none are available, the general wait time is applied. Any request arriving after the min batch wait time will cause waiting to stop and return control to the application.

Signed-off-by: Jens Axboe <[email protected]>
1 parent cebf123 commit 1100c4a
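To put the example numbers in context: a bare 50 usec timeout means up to 1s / 50us = 20,000 sleep/wake cycles per second even when the ring is idle, whereas a 50 usec min batch wait only cuts the sleep short when completions actually arrive. Below is a rough userspace sketch of the intended usage pattern. It assumes a liburing helper along the lines of io_uring_submit_and_wait_min_timeout(), which was added to liburing separately from this kernel change; the helper name, argument order, and the unit of the min-wait argument are assumptions here, not something this commit defines.

/*
 * Hypothetical usage sketch (assumed helper, see note above): submit pending
 * SQEs, then wait for up to 8 completions, giving the batch 50 usecs to fill
 * before returning with whatever has arrived; block at most 1 msec overall.
 */
#include <errno.h>
#include <liburing.h>

static int reap_batch(struct io_uring *ring)
{
	struct __kernel_timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };
	struct io_uring_cqe *cqe;
	unsigned head, handled = 0;
	int ret;

	/* the min-wait argument is assumed to be in microseconds */
	ret = io_uring_submit_and_wait_min_timeout(ring, &cqe, 8, &ts, 50, NULL);
	if (ret < 0 && ret != -ETIME)
		return ret;

	/* reap whatever is available, whether the full batch of 8 or fewer */
	io_uring_for_each_cqe(ring, head, cqe) {
		/* handle cqe->user_data / cqe->res here */
		handled++;
	}
	io_uring_cq_advance(ring, handled);
	return handled;
}

In this model the task wakes when the full batch of 8 is available, when the 50 usec min wait elapses with at least one completion pending, or when the 1 msec overall timeout expires with nothing pending.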

2 files changed: 82 additions & 13 deletions

io_uring/io_uring.c

Lines changed: 80 additions & 13 deletions
@@ -2355,17 +2355,70 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
 	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
 
 	WRITE_ONCE(iowq->hit_timeout, 1);
+	iowq->min_timeout = 0;
 	wake_up_process(iowq->wq.private);
 	return HRTIMER_NORESTART;
 }
 
+/*
+ * Doing min_timeout portion. If we saw any timeouts, events, or have work,
+ * wake up. If not, and we have a normal timeout, switch to that and keep
+ * sleeping.
+ */
+static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
+{
+	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	/* no general timeout, or shorter (or equal), we are done */
+	if (iowq->timeout == KTIME_MAX ||
+	    ktime_compare(iowq->min_timeout, iowq->timeout) >= 0)
+		goto out_wake;
+	/* work we may need to run, wake function will see if we need to wake */
+	if (io_has_work(ctx))
+		goto out_wake;
+	/* got events since we started waiting, min timeout is done */
+	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
+		goto out_wake;
+	/* if we have any events and min timeout expired, we're done */
+	if (io_cqring_events(ctx))
+		goto out_wake;
+
+	/*
+	 * If using deferred task_work running and application is waiting on
+	 * more than one request, ensure we reset it now where we are switching
+	 * to normal sleeps. Any request completion post min_wait should wake
+	 * the task and return.
+	 */
+	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
+		atomic_set(&ctx->cq_wait_nr, 1);
+		smp_mb();
+		if (!llist_empty(&ctx->work_llist))
+			goto out_wake;
+	}
+
+	iowq->t.function = io_cqring_timer_wakeup;
+	hrtimer_set_expires(timer, iowq->timeout);
+	return HRTIMER_RESTART;
+out_wake:
+	return io_cqring_timer_wakeup(timer);
+}
+
 static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
-				      clockid_t clock_id)
+				      clockid_t clock_id, ktime_t start_time)
 {
-	iowq->hit_timeout = 0;
+	ktime_t timeout;
+
 	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
-	iowq->t.function = io_cqring_timer_wakeup;
-	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
+	if (iowq->min_timeout) {
+		timeout = ktime_add_ns(iowq->min_timeout, start_time);
+		iowq->t.function = io_cqring_min_timer_wakeup;
+	} else {
+		timeout = iowq->timeout;
+		iowq->t.function = io_cqring_timer_wakeup;
+	}
+
+	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
 	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
 
 	if (!READ_ONCE(iowq->hit_timeout))
@@ -2379,7 +2432,8 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
 }
 
 static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
-				     struct io_wait_queue *iowq)
+				     struct io_wait_queue *iowq,
+				     ktime_t start_time)
 {
 	int ret = 0;
 
@@ -2390,8 +2444,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 */
 	if (current_pending_io())
 		current->in_iowait = 1;
-	if (iowq->timeout != KTIME_MAX)
-		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
+	if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
+		ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
 	else
 		schedule();
 	current->in_iowait = 0;
@@ -2400,7 +2454,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 
 /* If this returns > 0, the caller should retry */
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
-					  struct io_wait_queue *iowq)
+					  struct io_wait_queue *iowq,
+					  ktime_t start_time)
 {
 	if (unlikely(READ_ONCE(ctx->check_cq)))
 		return 1;
@@ -2413,7 +2468,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	if (unlikely(io_should_wake(iowq)))
 		return 0;
 
-	return __io_cqring_wait_schedule(ctx, iowq);
+	return __io_cqring_wait_schedule(ctx, iowq, start_time);
 }
 
 struct ext_arg {
@@ -2431,6 +2486,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 {
 	struct io_wait_queue iowq;
 	struct io_rings *rings = ctx->rings;
+	ktime_t start_time;
 	int ret;
 
 	if (!io_allowed_run_tw(ctx))
@@ -2448,9 +2504,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	iowq.wq.private = current;
 	INIT_LIST_HEAD(&iowq.wq.entry);
 	iowq.ctx = ctx;
-	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
+	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
+	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	iowq.hit_timeout = 0;
+	iowq.min_timeout = 0;
 	iowq.timeout = KTIME_MAX;
+	start_time = io_get_time(ctx);
 
 	if (ext_arg->ts) {
 		struct timespec64 ts;
@@ -2460,7 +2520,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 		iowq.timeout = timespec64_to_ktime(ts);
 		if (!(flags & IORING_ENTER_ABS_TIMER))
-			iowq.timeout = ktime_add(iowq.timeout, io_get_time(ctx));
+			iowq.timeout = ktime_add(iowq.timeout, start_time);
 	}
 
 	if (ext_arg->sig) {
@@ -2480,8 +2540,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
-		int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
 		unsigned long check_cq;
+		int nr_wait;
+
+		/* if min timeout has been hit, don't reset wait count */
+		if (!iowq.hit_timeout)
+			nr_wait = (int) iowq.cq_tail -
+					READ_ONCE(ctx->rings->cq.tail);
+		else
+			nr_wait = 1;
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
 			atomic_set(&ctx->cq_wait_nr, nr_wait);
@@ -2491,7 +2558,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 					TASK_INTERRUPTIBLE);
 		}
 
-		ret = io_cqring_wait_schedule(ctx, &iowq);
+		ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
 		__set_current_state(TASK_RUNNING);
 		atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 
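For readability outside of the diff, here is a compact model of the choice io_cqring_min_timer_wakeup() makes when the min batch timer fires. This is a standalone userspace sketch for illustration only, not kernel code; the struct and enum names are made up, both timeouts are treated as relative durations, and it collapses the task_work and CQ checks into simple flags while ignoring the memory-ordering details handled in the real function.

/*
 * Illustrative model of the min-timer decision: wake if anything happened
 * during the min batch wait, otherwise fall back to the general timeout.
 */
#include <stdbool.h>
#include <stdint.h>

enum min_timer_action {
	WAKE_NOW,		/* return control to the waiting task */
	ARM_NORMAL_TIMEOUT,	/* keep sleeping until the general timeout */
};

struct wait_snapshot {
	uint64_t min_timeout;	/* min batch wait */
	uint64_t timeout;	/* general timeout, UINT64_MAX if unset */
	unsigned cq_tail_start;	/* CQ tail when the wait began */
	unsigned cq_tail_now;	/* CQ tail when the min timer fired */
	bool has_work;		/* pending work the task may need to run */
	unsigned cq_events;	/* completions currently available */
};

static enum min_timer_action min_timer_expired(const struct wait_snapshot *w)
{
	/* no general timeout to fall back to, or it is no later than min */
	if (w->timeout == UINT64_MAX || w->min_timeout >= w->timeout)
		return WAKE_NOW;
	/* something happened during the min wait: hand it to the task */
	if (w->has_work || w->cq_tail_now != w->cq_tail_start || w->cq_events)
		return WAKE_NOW;
	/* nothing to report yet: switch to the normal, longer sleep */
	return ARM_NORMAL_TIMEOUT;
}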
io_uring/io_uring.h

Lines changed: 2 additions & 0 deletions
@@ -39,8 +39,10 @@ struct io_wait_queue {
 	struct wait_queue_entry wq;
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
+	unsigned cq_min_tail;
 	unsigned nr_timeouts;
 	int hit_timeout;
+	ktime_t min_timeout;
 	ktime_t timeout;
 	struct hrtimer t;
 