
Commit 54e60e5

Merge tag 'for-6.2/io_uring-2022-12-08' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

 - Always ensure proper ordering in case of CQ ring overflow, which
   then means we can remove some work-arounds for that (Dylan)

 - Support completion batching for multishot, greatly increasing the
   efficiency for those (Dylan)

 - Flag epoll/eventfd wakeups done from io_uring, so that we can easily
   tell if we're recursing into io_uring again.

   Previously, this would have resulted in repeated multishot
   notifications if we had a dependency there. That could happen if an
   eventfd was registered as the ring eventfd, and we multishot polled
   for events on it. Or if an io_uring fd was added to epoll, and
   io_uring had a multishot request for the epoll fd.

   Test cases here:

	https://git.kernel.dk/cgit/liburing/commit/?id=919755a7d0096fda08fb6d65ac54ad8d0fe027cd

   Previously these got terminated when the CQ ring eventually
   overflowed, now it's handled gracefully (me).

 - Tightening of the IOPOLL based completions (Pavel)

 - Optimizations of the networking zero-copy paths (Pavel)

 - Various tweaks and fixes (Dylan, Pavel)

* tag 'for-6.2/io_uring-2022-12-08' of git://git.kernel.dk/linux: (41 commits)
  io_uring: keep unlock_post inlined in hot path
  io_uring: don't use complete_post in kbuf
  io_uring: spelling fix
  io_uring: remove io_req_complete_post_tw
  io_uring: allow multishot polled reqs to defer completion
  io_uring: remove overflow param from io_post_aux_cqe
  io_uring: add lockdep assertion in io_fill_cqe_aux
  io_uring: make io_fill_cqe_aux static
  io_uring: add io_aux_cqe which allows deferred completion
  io_uring: allow defer completion for aux posted cqes
  io_uring: defer all io_req_complete_failed
  io_uring: always lock in io_apoll_task_func
  io_uring: remove iopoll spinlock
  io_uring: iopoll protect complete_post
  io_uring: inline __io_req_complete_put()
  io_uring: remove io_req_tw_post_queue
  io_uring: use io_req_task_complete() in timeout
  io_uring: hold locks for io_req_complete_failed
  io_uring: add completion locking for iopoll
  io_uring: kill io_cqring_ev_posted() and __io_cq_unlock_post()
  ...
2 parents d523ec4 + 5d77291 commit 54e60e5
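For reference, the eventfd recursion described above can be set up from userspace along these lines (a minimal liburing-based sketch of the scenario, not the linked test case; error handling omitted):

#include <poll.h>
#include <sys/eventfd.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	int efd = eventfd(0, EFD_CLOEXEC);

	io_uring_queue_init(8, &ring, 0);

	/* The eventfd that gets signalled whenever a CQE is posted... */
	io_uring_register_eventfd(&ring, efd);

	/*
	 * ...is also multishot-polled through the same ring. Each CQE posted
	 * for the poll signals the eventfd, which wakes the poll again;
	 * before 6.2 this looped until the CQ ring overflowed, now the
	 * EPOLL_URING_WAKE tagging lets the kernel break the cycle.
	 */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_multishot(sqe, efd, POLLIN);
	io_uring_submit(&ring);

	io_uring_queue_exit(&ring);
	return 0;
}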

18 files changed: +355 −207 lines changed

fs/eventfd.c

Lines changed: 21 additions & 16 deletions
@@ -43,21 +43,7 @@ struct eventfd_ctx {
 	int id;
 };
 
-/**
- * eventfd_signal - Adds @n to the eventfd counter.
- * @ctx: [in] Pointer to the eventfd context.
- * @n: [in] Value of the counter to be added to the eventfd internal counter.
- *          The value cannot be negative.
- *
- * This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a EPOLLERR
- * to poll(2).
- *
- * Returns the amount by which the counter was incremented. This will be less
- * than @n if the counter has overflowed.
- */
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
 {
 	unsigned long flags;
 
@@ -78,12 +64,31 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 		n = ULLONG_MAX - ctx->count;
 	ctx->count += n;
 	if (waitqueue_active(&ctx->wqh))
-		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+		wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
 	current->in_eventfd = 0;
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return n;
 }
+
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *          The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returning a EPOLLERR
+ * to poll(2).
+ *
+ * Returns the amount by which the counter was incremented. This will be less
+ * than @n if the counter has overflowed.
+ */
+__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+{
+	return eventfd_signal_mask(ctx, n, 0);
+}
 EXPORT_SYMBOL_GPL(eventfd_signal);
 
 static void eventfd_free_ctx(struct eventfd_ctx *ctx)
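With eventfd_signal() now a thin wrapper around eventfd_signal_mask(), a caller that wants its wakeup tagged passes extra poll flags through the new mask argument. A hedged sketch of how a ring-eventfd signal might be issued (simplified, not the exact io_uring code; signal_ring_eventfd is a made-up name):

#include <linux/eventfd.h>
#include <linux/eventpoll.h>

static void signal_ring_eventfd(struct eventfd_ctx *ev_fd_ctx)
{
	/*
	 * Tag the wakeup so poll handlers (e.g. an io_uring multishot poll
	 * armed on this very eventfd) can tell it originated from io_uring
	 * and avoid re-triggering themselves.
	 */
	eventfd_signal_mask(ev_fd_ctx, 1, EPOLL_URING_WAKE);
}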

fs/eventpoll.c

Lines changed: 10 additions & 8 deletions
@@ -491,7 +491,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
+			     unsigned pollflags)
 {
 	struct eventpoll *ep_src;
 	unsigned long flags;
@@ -522,16 +523,17 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
 	}
 	spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
 	ep->nests = nests + 1;
-	wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+	wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
 	ep->nests = 0;
 	spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
 }
 
 #else
 
-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
+			     unsigned pollflags)
 {
-	wake_up_poll(&ep->poll_wait, EPOLLIN);
+	wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
 }
 
 #endif
@@ -742,7 +744,7 @@ static void ep_free(struct eventpoll *ep)
 
 	/* We need to release all tasks waiting for these file */
 	if (waitqueue_active(&ep->poll_wait))
-		ep_poll_safewake(ep, NULL);
+		ep_poll_safewake(ep, NULL, 0);
 
 	/*
 	 * We need to lock this because we could be hit by
@@ -1208,7 +1210,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(ep, epi);
+		ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
 
 	if (!(epi->event.events & EPOLLEXCLUSIVE))
 		ewake = 1;
@@ -1553,7 +1555,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(ep, NULL);
+		ep_poll_safewake(ep, NULL, 0);
 
 	return 0;
 }
@@ -1629,7 +1631,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(ep, NULL);
+		ep_poll_safewake(ep, NULL, 0);
 
 	return 0;
 }
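On the receiving side, a poll wake callback can now look at the poll mask it was woken with and spot an io_uring-originated wakeup. A rough sketch of the idea only (the real handling lives in io_uring's poll code; handle_uring_wake and handle_normal_wake are hypothetical helpers):

#include <linux/poll.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>

static int example_poll_wake(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *key)
{
	__poll_t mask = key_to_poll(key);

	/*
	 * Woken by io_uring itself (ring eventfd signal, or an epoll wakeup
	 * forwarded with EPOLL_URING_WAKE): don't re-trigger a multishot
	 * request from here, or we would recurse back into CQE posting.
	 */
	if (mask & EPOLL_URING_WAKE)
		return handle_uring_wake(wait);		/* hypothetical */

	return handle_normal_wake(wait, mask);		/* hypothetical */
}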

include/linux/eventfd.h

Lines changed: 7 additions & 0 deletions
@@ -40,6 +40,7 @@ struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
 __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt);
 void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
@@ -66,6 +67,12 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 	return -ENOSYS;
 }
 
+static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n,
+				      unsigned mask)
+{
+	return -ENOSYS;
+}
+
 static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
 {
 

include/linux/io_uring_types.h

Lines changed: 2 additions & 0 deletions
@@ -174,7 +174,9 @@ struct io_submit_state {
 	bool			plug_started;
 	bool			need_plug;
 	unsigned short		submit_nr;
+	unsigned int		cqes_count;
 	struct blk_plug		plug;
+	struct io_uring_cqe	cqes[16];
 };
 
 struct io_ev_fd {
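The new cqes_count/cqes[] members back the multishot completion batching: auxiliary CQEs are staged in the submit state, up to 16 at a time, and flushed in a single locked pass instead of taking the completion lock per CQE. A simplified sketch of the flush idea (names approximate, not the exact kernel helpers):

static void flush_deferred_aux_cqes(struct io_ring_ctx *ctx)
{
	struct io_submit_state *state = &ctx->submit_state;
	unsigned int i;

	if (!state->cqes_count)
		return;

	spin_lock(&ctx->completion_lock);
	for (i = 0; i < state->cqes_count; i++) {
		struct io_uring_cqe *cqe = &state->cqes[i];

		/* post_one_cqe() stands in for the real CQE fill helper */
		post_one_cqe(ctx, cqe->user_data, cqe->res, cqe->flags);
	}
	state->cqes_count = 0;
	spin_unlock(&ctx->completion_lock);
}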

include/uapi/linux/eventpoll.h

Lines changed: 6 additions & 0 deletions
@@ -41,6 +41,12 @@
 #define EPOLLMSG	(__force __poll_t)0x00000400
 #define EPOLLRDHUP	(__force __poll_t)0x00002000
 
+/*
+ * Internal flag - wakeup generated by io_uring, used to detect recursion back
+ * into the io_uring poll handler.
+ */
+#define EPOLL_URING_WAKE	((__force __poll_t)(1U << 27))
+
 /* Set exclusive wakeup mode for the target file descriptor */
 #define EPOLLEXCLUSIVE	((__force __poll_t)(1U << 28))
 

include/uapi/linux/io_uring.h

Lines changed: 18 additions & 0 deletions
@@ -296,10 +296,28 @@ enum io_uring_op {
  *
  * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
  *				the buf_index field.
+ *
+ * IORING_SEND_ZC_REPORT_USAGE
+ *				If set, SEND[MSG]_ZC should report
+ *				the zerocopy usage in cqe.res
+ *				for the IORING_CQE_F_NOTIF cqe.
+ *				0 is reported if zerocopy was actually possible.
+ *				IORING_NOTIF_USAGE_ZC_COPIED if data was copied
+ *				(at least partially).
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
 #define IORING_RECVSEND_FIXED_BUF	(1U << 2)
+#define IORING_SEND_ZC_REPORT_USAGE	(1U << 3)
+
+/*
+ * cqe.res for IORING_CQE_F_NOTIF if
+ * IORING_SEND_ZC_REPORT_USAGE was requested
+ *
+ * It should be treated as a flag, all other
+ * bits of cqe.res should be treated as reserved!
+ */
+#define IORING_NOTIF_USAGE_ZC_COPIED	(1U << 31)
 
 /*
  * accept flags stored in sqe->ioprio
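From userspace, the new flag is requested per zero-copy send and read back from the notification CQE. A hedged liburing-style example (assumes a 6.2+ kernel and liburing headers that carry io_uring_prep_send_zc() and these flag definitions; error handling omitted):

#include <liburing.h>

/* Ask the kernel to report whether the send really went out zero-copy. */
static void send_zc_with_report(struct io_uring *ring, int sockfd,
				const void *buf, size_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_send_zc(sqe, sockfd, buf, len, 0,
			      IORING_SEND_ZC_REPORT_USAGE);
	io_uring_submit(ring);
}

/* For a reaped CQE: returns 1 if the kernel fell back to copying. */
static int zc_was_copied(const struct io_uring_cqe *cqe)
{
	if (!(cqe->flags & IORING_CQE_F_NOTIF))
		return 0;	/* only the notification CQE carries the report */
	/* Only this bit is defined; all other res bits are reserved. */
	return !!(cqe->res & IORING_NOTIF_USAGE_ZC_COPIED);
}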
