Skip to content

Commit 49fffac

Browse files
committed
Merge tag 'for-6.16/io_uring-20250523' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: - Avoid indirect function calls in io-wq for executing and freeing work. The design of io-wq is such that it can be a generic mechanism, but as it's just used by io_uring now, may as well avoid these indirect calls - Clean up registered buffers for networking - Add support for IORING_OP_PIPE. Pretty straightforward, allows creating pipes with io_uring, particularly useful for having these be instantiated as direct descriptors - Clean up the coalescing support for registered buffers - Add support for multiple interface queues for zero-copy rx networking. When this feature was merged for 6.15, it supported just a single ifq per ring - Clean up the eventfd support - Add dma-buf support to zero-copy rx - Clean up and improve the request draining support - Clean up provided buffer support, most notably with an eye toward making the legacy support less intrusive - Minor fdinfo cleanups, dropping support for dumping what credentials are registered - Improve support for overflow CQE handling, getting rid of GFP_ATOMIC for allocating overflow entries where possible - Improve detection of cases where io-wq doesn't need to spawn a new worker - Various little cleanups * tag 'for-6.16/io_uring-20250523' of git://git.kernel.dk/linux: (59 commits) io_uring/cmd: warn on reg buf imports by ineligible cmds io_uring/io-wq: only create a new worker if it can make progress io_uring/io-wq: ignore non-busy worker going to sleep io_uring/io-wq: move hash helpers to the top trace/io_uring: fix io_uring_local_work_run ctx documentation io_uring: finish IOU_OK -> IOU_COMPLETE transition io_uring: add new helpers for posting overflows io_uring: pass in struct io_big_cqe to io_alloc_ocqe() io_uring: make io_alloc_ocqe() take a struct io_cqe pointer io_uring: split alloc and add of overflow io_uring: open code io_req_cqe_overflow() io_uring/fdinfo: get rid of dumping credentials io_uring/fdinfo: only compile if CONFIG_PROC_FS is set 
io_uring/kbuf: unify legacy buf provision and removal io_uring/kbuf: refactor __io_remove_buffers io_uring/kbuf: don't compute size twice on prep io_uring/kbuf: drop extra vars in io_register_pbuf_ring io_uring/kbuf: use mem_is_zero() io_uring/kbuf: account ring io_buffer_list memory io_uring: drain based on allocates reqs ...
2 parents 6f59de9 + 6faaf6e commit 49fffac

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+958
-714
lines changed

include/linux/io_uring_types.h

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@ enum io_uring_cmd_flags {
4040
IO_URING_F_TASK_DEAD = (1 << 13),
4141
};
4242

43-
struct io_zcrx_ifq;
44-
4543
struct io_wq_work_node {
4644
struct io_wq_work_node *next;
4745
};
@@ -343,7 +341,6 @@ struct io_ring_ctx {
343341
unsigned cached_cq_tail;
344342
unsigned cq_entries;
345343
struct io_ev_fd __rcu *io_ev_fd;
346-
unsigned cq_extra;
347344

348345
void *cq_wait_arg;
349346
size_t cq_wait_size;
@@ -394,7 +391,8 @@ struct io_ring_ctx {
394391
struct wait_queue_head poll_wq;
395392
struct io_restriction restrictions;
396393

397-
struct io_zcrx_ifq *ifq;
394+
/* Stores zcrx object pointers of type struct io_zcrx_ifq */
395+
struct xarray zcrx_ctxs;
398396

399397
u32 pers_next;
400398
struct xarray personalities;
@@ -418,6 +416,7 @@ struct io_ring_ctx {
418416

419417
struct callback_head poll_wq_task_work;
420418
struct list_head defer_list;
419+
unsigned nr_drained;
421420

422421
struct io_alloc_cache msg_cache;
423422
spinlock_t msg_lock;
@@ -436,6 +435,7 @@ struct io_ring_ctx {
436435

437436
/* protected by ->completion_lock */
438437
unsigned evfd_last_cq_tail;
438+
unsigned nr_req_allocated;
439439

440440
/*
441441
* Protection for resize vs mmap races - both the mmap and resize
@@ -448,8 +448,6 @@ struct io_ring_ctx {
448448
struct io_mapped_region ring_region;
449449
/* used for optimised request parameter and wait argument passing */
450450
struct io_mapped_region param_region;
451-
/* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */
452-
struct io_mapped_region zcrx_region;
453451
};
454452

455453
/*
@@ -653,8 +651,7 @@ struct io_kiocb {
653651
u8 iopoll_completed;
654652
/*
655653
* Can be either a fixed buffer index, or used with provided buffers.
656-
* For the latter, before issue it points to the buffer group ID,
657-
* and after selection it points to the buffer ID itself.
654+
* For the latter, it points to the selected buffer ID.
658655
*/
659656
u16 buf_index;
660657

@@ -713,7 +710,7 @@ struct io_kiocb {
713710
const struct cred *creds;
714711
struct io_wq_work work;
715712

716-
struct {
713+
struct io_big_cqe {
717714
u64 extra1;
718715
u64 extra2;
719716
} big_cqe;

include/trace/events/io_uring.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -645,7 +645,7 @@ TRACE_EVENT(io_uring_short_write,
645645
/*
646646
* io_uring_local_work_run - ran ring local task work
647647
*
648-
* @tctx: pointer to a io_uring_ctx
648+
* @ctx: pointer to an io_ring_ctx
649649
* @count: how many functions it ran
650650
* @loops: how many loops it ran
651651
*

include/uapi/linux/io_uring.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ struct io_uring_sqe {
7373
__u32 futex_flags;
7474
__u32 install_fd_flags;
7575
__u32 nop_flags;
76+
__u32 pipe_flags;
7677
};
7778
__u64 user_data; /* data to be passed back at completion time */
7879
/* pack this to avoid bogus arm OABI complaints */
@@ -287,6 +288,7 @@ enum io_uring_op {
287288
IORING_OP_EPOLL_WAIT,
288289
IORING_OP_READV_FIXED,
289290
IORING_OP_WRITEV_FIXED,
291+
IORING_OP_PIPE,
290292

291293
/* this goes last, obviously */
292294
IORING_OP_LAST,
@@ -992,12 +994,16 @@ struct io_uring_zcrx_offsets {
992994
__u64 __resv[2];
993995
};
994996

997+
enum io_uring_zcrx_area_flags {
998+
IORING_ZCRX_AREA_DMABUF = 1,
999+
};
1000+
9951001
struct io_uring_zcrx_area_reg {
9961002
__u64 addr;
9971003
__u64 len;
9981004
__u64 rq_area_token;
9991005
__u32 flags;
1000-
__u32 __resv1;
1006+
__u32 dmabuf_fd;
10011007
__u64 __resv2[2];
10021008
};
10031009

io_uring/Makefile

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,17 @@ GCOV_PROFILE := y
77
endif
88

99
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
10-
tctx.o filetable.o rw.o net.o poll.o \
10+
tctx.o filetable.o rw.o poll.o \
1111
eventfd.o uring_cmd.o openclose.o \
1212
sqpoll.o xattr.o nop.o fs.o splice.o \
1313
sync.o msg_ring.o advise.o openclose.o \
14-
statx.o timeout.o fdinfo.o cancel.o \
14+
statx.o timeout.o cancel.o \
1515
waitid.o register.o truncate.o \
1616
memmap.o alloc_cache.o
1717
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
1818
obj-$(CONFIG_IO_WQ) += io-wq.o
1919
obj-$(CONFIG_FUTEX) += futex.o
2020
obj-$(CONFIG_EPOLL) += epoll.o
2121
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
22+
obj-$(CONFIG_NET) += net.o cmd_net.o
23+
obj-$(CONFIG_PROC_FS) += fdinfo.o

io_uring/advise.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
5858

5959
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
6060
io_req_set_res(req, ret, 0);
61-
return IOU_OK;
61+
return IOU_COMPLETE;
6262
#else
6363
return -EOPNOTSUPP;
6464
#endif
@@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
104104
if (ret < 0)
105105
req_set_fail(req);
106106
io_req_set_res(req, ret, 0);
107-
return IOU_OK;
107+
return IOU_COMPLETE;
108108
}

io_uring/cancel.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
229229
if (ret < 0)
230230
req_set_fail(req);
231231
io_req_set_res(req, ret, 0);
232-
return IOU_OK;
232+
return IOU_COMPLETE;
233233
}
234234

235235
static int __io_sync_cancel(struct io_uring_task *tctx,

io_uring/cmd_net.c

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#include <asm/ioctls.h>
2+
#include <linux/io_uring/net.h>
3+
#include <net/sock.h>
4+
5+
#include "uring_cmd.h"
6+
7+
static inline int io_uring_cmd_getsockopt(struct socket *sock,
8+
struct io_uring_cmd *cmd,
9+
unsigned int issue_flags)
10+
{
11+
const struct io_uring_sqe *sqe = cmd->sqe;
12+
bool compat = !!(issue_flags & IO_URING_F_COMPAT);
13+
int optlen, optname, level, err;
14+
void __user *optval;
15+
16+
level = READ_ONCE(sqe->level);
17+
if (level != SOL_SOCKET)
18+
return -EOPNOTSUPP;
19+
20+
optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
21+
optname = READ_ONCE(sqe->optname);
22+
optlen = READ_ONCE(sqe->optlen);
23+
24+
err = do_sock_getsockopt(sock, compat, level, optname,
25+
USER_SOCKPTR(optval),
26+
KERNEL_SOCKPTR(&optlen));
27+
if (err)
28+
return err;
29+
30+
/* On success, return optlen */
31+
return optlen;
32+
}
33+
34+
static inline int io_uring_cmd_setsockopt(struct socket *sock,
35+
struct io_uring_cmd *cmd,
36+
unsigned int issue_flags)
37+
{
38+
const struct io_uring_sqe *sqe = cmd->sqe;
39+
bool compat = !!(issue_flags & IO_URING_F_COMPAT);
40+
int optname, optlen, level;
41+
void __user *optval;
42+
sockptr_t optval_s;
43+
44+
optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
45+
optname = READ_ONCE(sqe->optname);
46+
optlen = READ_ONCE(sqe->optlen);
47+
level = READ_ONCE(sqe->level);
48+
optval_s = USER_SOCKPTR(optval);
49+
50+
return do_sock_setsockopt(sock, compat, level, optname, optval_s,
51+
optlen);
52+
}
53+
54+
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
55+
{
56+
struct socket *sock = cmd->file->private_data;
57+
struct sock *sk = sock->sk;
58+
struct proto *prot = READ_ONCE(sk->sk_prot);
59+
int ret, arg = 0;
60+
61+
if (!prot || !prot->ioctl)
62+
return -EOPNOTSUPP;
63+
64+
switch (cmd->cmd_op) {
65+
case SOCKET_URING_OP_SIOCINQ:
66+
ret = prot->ioctl(sk, SIOCINQ, &arg);
67+
if (ret)
68+
return ret;
69+
return arg;
70+
case SOCKET_URING_OP_SIOCOUTQ:
71+
ret = prot->ioctl(sk, SIOCOUTQ, &arg);
72+
if (ret)
73+
return ret;
74+
return arg;
75+
case SOCKET_URING_OP_GETSOCKOPT:
76+
return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
77+
case SOCKET_URING_OP_SETSOCKOPT:
78+
return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
79+
default:
80+
return -EOPNOTSUPP;
81+
}
82+
}
83+
EXPORT_SYMBOL_GPL(io_uring_cmd_sock);

io_uring/epoll.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
6161
if (ret < 0)
6262
req_set_fail(req);
6363
io_req_set_res(req, ret, 0);
64-
return IOU_OK;
64+
return IOU_COMPLETE;
6565
}
6666

6767
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -88,5 +88,5 @@ int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
8888
req_set_fail(req);
8989

9090
io_req_set_res(req, ret, 0);
91-
return IOU_OK;
91+
return IOU_COMPLETE;
9292
}

io_uring/eventfd.c

Lines changed: 14 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
4747
io_eventfd_put(ev_fd);
4848
}
4949

50-
static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
51-
{
52-
if (put_ref)
53-
io_eventfd_put(ev_fd);
54-
rcu_read_unlock();
55-
}
56-
5750
/*
5851
* Returns true if the caller should put the ev_fd reference, false if not.
5952
*/
@@ -72,63 +65,34 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
7265

7366
/*
7467
* Trigger if eventfd_async isn't set, or if it's set and the caller is
75-
* an async worker. If ev_fd isn't valid, obviously return false.
68+
* an async worker.
7669
*/
7770
static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
7871
{
79-
if (ev_fd)
80-
return !ev_fd->eventfd_async || io_wq_current_is_worker();
81-
return false;
72+
return !ev_fd->eventfd_async || io_wq_current_is_worker();
8273
}
8374

84-
/*
85-
* On success, returns with an ev_fd reference grabbed and the RCU read
86-
* lock held.
87-
*/
88-
static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
75+
void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
8976
{
77+
bool skip = false;
9078
struct io_ev_fd *ev_fd;
9179

9280
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
93-
return NULL;
94-
95-
rcu_read_lock();
81+
return;
9682

97-
/*
98-
* rcu_dereference ctx->io_ev_fd once and use it for both for checking
99-
* and eventfd_signal
100-
*/
83+
guard(rcu)();
10184
ev_fd = rcu_dereference(ctx->io_ev_fd);
102-
10385
/*
10486
* Check again if ev_fd exists in case an io_eventfd_unregister call
10587
* completed between the NULL check of ctx->io_ev_fd at the start of
10688
* the function and rcu_read_lock.
10789
*/
108-
if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
109-
return ev_fd;
110-
111-
rcu_read_unlock();
112-
return NULL;
113-
}
114-
115-
void io_eventfd_signal(struct io_ring_ctx *ctx)
116-
{
117-
struct io_ev_fd *ev_fd;
118-
119-
ev_fd = io_eventfd_grab(ctx);
120-
if (ev_fd)
121-
io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
122-
}
123-
124-
void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
125-
{
126-
struct io_ev_fd *ev_fd;
127-
128-
ev_fd = io_eventfd_grab(ctx);
129-
if (ev_fd) {
130-
bool skip, put_ref = true;
90+
if (!ev_fd)
91+
return;
92+
if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs))
93+
return;
13194

95+
if (cqe_event) {
13296
/*
13397
* Eventfd should only get triggered when at least one event
13498
* has been posted. Some applications rely on the eventfd
@@ -142,12 +106,10 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
142106
skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
143107
ev_fd->last_cq_tail = ctx->cached_cq_tail;
144108
spin_unlock(&ctx->completion_lock);
145-
146-
if (!skip)
147-
put_ref = __io_eventfd_signal(ev_fd);
148-
149-
io_eventfd_release(ev_fd, put_ref);
150109
}
110+
111+
if (skip || __io_eventfd_signal(ev_fd))
112+
io_eventfd_put(ev_fd);
151113
}
152114

153115
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,

io_uring/eventfd.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
44
unsigned int eventfd_async);
55
int io_eventfd_unregister(struct io_ring_ctx *ctx);
66

7-
void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
8-
void io_eventfd_signal(struct io_ring_ctx *ctx);
7+
void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event);

0 commit comments

Comments
 (0)