Skip to content

Commit 3a56e24

Browse files
committed
Merge tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
 "Here are the io_uring updates queued up for 6.11.

  Nothing major this time around, various minor improvements and
  cleanups/fixes. This contains:

   - Add bind/listen opcodes. Main motivation is to support direct
     descriptors, to avoid needing a regular fd just for doing these
     two operations (Gabriel)

   - Probe fixes (Gabriel)

   - Treat io-wq work flags as atomics. Not fixing a real issue, but
     may as well and it silences a KCSAN warning (me)

   - Cleanup of rsrc __set_current_state() usage (me)

   - Add 64-bit for {m,f}advise operations (me)

   - Improve performance of data ring messages (me)

   - Fix for ring message overflow posting (Pavel)

   - Fix for freezer interaction with TWA_NOTIFY_SIGNAL. Not strictly
     an io_uring thing, but since TWA_NOTIFY_SIGNAL was originally
     added for faster task_work signaling for io_uring, bundling it
     with this pull (Pavel)

   - Add Pavel as a co-maintainer

   - Various cleanups (me, Thorsten)"

* tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux: (28 commits)
  io_uring/net: check socket is valid in io_bind()/io_listen()
  kernel: rerun task_work while freezing in get_signal()
  io_uring/io-wq: limit retrying worker initialisation
  io_uring/napi: Remove unnecessary s64 cast
  io_uring/net: cleanup io_recv_finish() bundle handling
  io_uring/msg_ring: fix overflow posting
  MAINTAINERS: change Pavel Begunkov from io_uring reviewer to maintainer
  io_uring/msg_ring: use kmem_cache_free() to free request
  io_uring/msg_ring: check for dead submitter task
  io_uring/msg_ring: add an alloc cache for io_kiocb entries
  io_uring/msg_ring: improve handling of target CQE posting
  io_uring: add io_add_aux_cqe() helper
  io_uring: add remote task_work execution helper
  io_uring/msg_ring: tighten requirement for remote posting
  io_uring: Allocate only necessary memory in io_probe
  io_uring: Fix probe of disabled operations
  io_uring: Introduce IORING_OP_LISTEN
  io_uring: Introduce IORING_OP_BIND
  net: Split a __sys_listen helper for io_uring
  net: Split a __sys_bind helper for io_uring
  ...
2 parents 4f5e249 + ad00e62 commit 3a56e24

File tree

23 files changed

+538
-310
lines changed

23 files changed

+538
-310
lines changed

MAINTAINERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11551,7 +11551,7 @@ F: include/linux/iosys-map.h
1155111551

1155211552
IO_URING
1155311553
M: Jens Axboe <axboe@kernel.dk>
11554-
R: Pavel Begunkov <asml.silence@gmail.com>
11554+
M: Pavel Begunkov <asml.silence@gmail.com>
1155511555
1155611556
S: Maintained
1155711557
T: git git://git.kernel.dk/linux-block

include/linux/io_uring_types.h

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ struct io_wq_work_list {
5050

5151
struct io_wq_work {
5252
struct io_wq_work_node list;
53-
unsigned flags;
53+
atomic_t flags;
5454
/* place it here instead of io_kiocb as it fills padding and saves 4B */
5555
int cancel_seq;
5656
};
@@ -210,14 +210,6 @@ struct io_submit_state {
210210
struct blk_plug plug;
211211
};
212212

213-
struct io_ev_fd {
214-
struct eventfd_ctx *cq_ev_fd;
215-
unsigned int eventfd_async: 1;
216-
struct rcu_head rcu;
217-
atomic_t refs;
218-
atomic_t ops;
219-
};
220-
221213
struct io_alloc_cache {
222214
void **entries;
223215
unsigned int nr_cached;
@@ -372,7 +364,6 @@ struct io_ring_ctx {
372364
struct io_restriction restrictions;
373365

374366
/* slow path rsrc auxiliary data, used by update/register */
375-
struct io_mapped_ubuf *dummy_ubuf;
376367
struct io_rsrc_data *file_data;
377368
struct io_rsrc_data *buf_data;
378369

@@ -405,6 +396,9 @@ struct io_ring_ctx {
405396
struct callback_head poll_wq_task_work;
406397
struct list_head defer_list;
407398

399+
struct io_alloc_cache msg_cache;
400+
spinlock_t msg_lock;
401+
408402
#ifdef CONFIG_NET_RX_BUSY_POLL
409403
struct list_head napi_list; /* track busy poll napi_id */
410404
spinlock_t napi_lock; /* napi_list lock */

include/linux/socket.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,11 +442,14 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
442442
extern int __sys_socket(int family, int type, int protocol);
443443
extern struct file *__sys_socket_file(int family, int type, int protocol);
444444
extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen);
445+
extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
446+
int addrlen);
445447
extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr,
446448
int addrlen, int file_flags);
447449
extern int __sys_connect(int fd, struct sockaddr __user *uservaddr,
448450
int addrlen);
449451
extern int __sys_listen(int fd, int backlog);
452+
extern int __sys_listen_socket(struct socket *sock, int backlog);
450453
extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
451454
int __user *usockaddr_len);
452455
extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,

include/uapi/linux/io_uring.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,8 @@ enum io_uring_op {
257257
IORING_OP_FUTEX_WAITV,
258258
IORING_OP_FIXED_FD_INSTALL,
259259
IORING_OP_FTRUNCATE,
260+
IORING_OP_BIND,
261+
IORING_OP_LISTEN,
260262

261263
/* this goes last, obviously */
262264
IORING_OP_LAST,

io_uring/Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
66
tctx.o filetable.o rw.o net.o poll.o \
7-
uring_cmd.o openclose.o sqpoll.o \
8-
xattr.o nop.o fs.o splice.o sync.o \
9-
msg_ring.o advise.o openclose.o \
7+
eventfd.o uring_cmd.o openclose.o \
8+
sqpoll.o xattr.o nop.o fs.o splice.o \
9+
sync.o msg_ring.o advise.o openclose.o \
1010
epoll.o statx.o timeout.o fdinfo.o \
1111
cancel.o waitid.o register.o \
1212
truncate.o memmap.o

io_uring/advise.c

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@
1717
struct io_fadvise {
1818
struct file *file;
1919
u64 offset;
20-
u32 len;
20+
u64 len;
2121
u32 advice;
2222
};
2323

2424
struct io_madvise {
2525
struct file *file;
2626
u64 addr;
27-
u32 len;
27+
u64 len;
2828
u32 advice;
2929
};
3030

@@ -33,11 +33,13 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3333
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3434
struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
3535

36-
if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
36+
if (sqe->buf_index || sqe->splice_fd_in)
3737
return -EINVAL;
3838

3939
ma->addr = READ_ONCE(sqe->addr);
40-
ma->len = READ_ONCE(sqe->len);
40+
ma->len = READ_ONCE(sqe->off);
41+
if (!ma->len)
42+
ma->len = READ_ONCE(sqe->len);
4143
ma->advice = READ_ONCE(sqe->fadvise_advice);
4244
req->flags |= REQ_F_FORCE_ASYNC;
4345
return 0;
@@ -78,11 +80,13 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
7880
{
7981
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
8082

81-
if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
83+
if (sqe->buf_index || sqe->splice_fd_in)
8284
return -EINVAL;
8385

8486
fa->offset = READ_ONCE(sqe->off);
85-
fa->len = READ_ONCE(sqe->len);
87+
fa->len = READ_ONCE(sqe->addr);
88+
if (!fa->len)
89+
fa->len = READ_ONCE(sqe->len);
8690
fa->advice = READ_ONCE(sqe->fadvise_advice);
8791
if (io_fadvise_force_async(fa))
8892
req->flags |= REQ_F_FORCE_ASYNC;

io_uring/eventfd.c

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
#include <linux/kernel.h>
3+
#include <linux/errno.h>
4+
#include <linux/mm.h>
5+
#include <linux/slab.h>
6+
#include <linux/eventfd.h>
7+
#include <linux/eventpoll.h>
8+
#include <linux/io_uring.h>
9+
#include <linux/io_uring_types.h>
10+
11+
#include "io-wq.h"
12+
#include "eventfd.h"
13+
14+
struct io_ev_fd {
15+
struct eventfd_ctx *cq_ev_fd;
16+
unsigned int eventfd_async: 1;
17+
struct rcu_head rcu;
18+
atomic_t refs;
19+
atomic_t ops;
20+
};
21+
22+
enum {
23+
IO_EVENTFD_OP_SIGNAL_BIT,
24+
};
25+
26+
static void io_eventfd_free(struct rcu_head *rcu)
27+
{
28+
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
29+
30+
eventfd_ctx_put(ev_fd->cq_ev_fd);
31+
kfree(ev_fd);
32+
}
33+
34+
static void io_eventfd_do_signal(struct rcu_head *rcu)
35+
{
36+
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
37+
38+
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
39+
40+
if (atomic_dec_and_test(&ev_fd->refs))
41+
io_eventfd_free(rcu);
42+
}
43+
44+
void io_eventfd_signal(struct io_ring_ctx *ctx)
45+
{
46+
struct io_ev_fd *ev_fd = NULL;
47+
48+
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
49+
return;
50+
51+
guard(rcu)();
52+
53+
/*
54+
* rcu_dereference ctx->io_ev_fd once and use it both for checking
55+
* and eventfd_signal
56+
*/
57+
ev_fd = rcu_dereference(ctx->io_ev_fd);
58+
59+
/*
60+
* Check again if ev_fd exists in case an io_eventfd_unregister call
61+
* completed between the NULL check of ctx->io_ev_fd at the start of
62+
* the function and rcu_read_lock.
63+
*/
64+
if (unlikely(!ev_fd))
65+
return;
66+
if (!atomic_inc_not_zero(&ev_fd->refs))
67+
return;
68+
if (ev_fd->eventfd_async && !io_wq_current_is_worker())
69+
goto out;
70+
71+
if (likely(eventfd_signal_allowed())) {
72+
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
73+
} else {
74+
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
75+
call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
76+
return;
77+
}
78+
}
79+
out:
80+
if (atomic_dec_and_test(&ev_fd->refs))
81+
call_rcu(&ev_fd->rcu, io_eventfd_free);
82+
}
83+
84+
void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
85+
{
86+
bool skip;
87+
88+
spin_lock(&ctx->completion_lock);
89+
90+
/*
91+
* Eventfd should only get triggered when at least one event has been
92+
* posted. Some applications rely on the eventfd notification count
93+
* only changing IFF a new CQE has been added to the CQ ring. There's
94+
* no dependency on a 1:1 relationship between how many times this
95+
* function is called (and hence the eventfd count) and number of CQEs
96+
* posted to the CQ ring.
97+
*/
98+
skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
99+
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
100+
spin_unlock(&ctx->completion_lock);
101+
if (skip)
102+
return;
103+
104+
io_eventfd_signal(ctx);
105+
}
106+
107+
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
108+
unsigned int eventfd_async)
109+
{
110+
struct io_ev_fd *ev_fd;
111+
__s32 __user *fds = arg;
112+
int fd;
113+
114+
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
115+
lockdep_is_held(&ctx->uring_lock));
116+
if (ev_fd)
117+
return -EBUSY;
118+
119+
if (copy_from_user(&fd, fds, sizeof(*fds)))
120+
return -EFAULT;
121+
122+
ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
123+
if (!ev_fd)
124+
return -ENOMEM;
125+
126+
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
127+
if (IS_ERR(ev_fd->cq_ev_fd)) {
128+
int ret = PTR_ERR(ev_fd->cq_ev_fd);
129+
kfree(ev_fd);
130+
return ret;
131+
}
132+
133+
spin_lock(&ctx->completion_lock);
134+
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
135+
spin_unlock(&ctx->completion_lock);
136+
137+
ev_fd->eventfd_async = eventfd_async;
138+
ctx->has_evfd = true;
139+
atomic_set(&ev_fd->refs, 1);
140+
atomic_set(&ev_fd->ops, 0);
141+
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
142+
return 0;
143+
}
144+
145+
int io_eventfd_unregister(struct io_ring_ctx *ctx)
146+
{
147+
struct io_ev_fd *ev_fd;
148+
149+
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
150+
lockdep_is_held(&ctx->uring_lock));
151+
if (ev_fd) {
152+
ctx->has_evfd = false;
153+
rcu_assign_pointer(ctx->io_ev_fd, NULL);
154+
if (atomic_dec_and_test(&ev_fd->refs))
155+
call_rcu(&ev_fd->rcu, io_eventfd_free);
156+
return 0;
157+
}
158+
159+
return -ENXIO;
160+
}

io_uring/eventfd.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
2+
struct io_ring_ctx;
3+
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
4+
unsigned int eventfd_async);
5+
int io_eventfd_unregister(struct io_ring_ctx *ctx);
6+
7+
void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
8+
void io_eventfd_signal(struct io_ring_ctx *ctx);

0 commit comments

Comments
 (0)