Commit 6d0f8dc
Merge branch 'for-6.12/io_uring' into for-6.12/io_uring-discard
* for-6.12/io_uring: (31 commits)
  io_uring/io-wq: inherit cpuset of cgroup in io worker
  io_uring/io-wq: do not allow pinning outside of cpuset
  io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common()
  io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN
  io_uring/sqpoll: do not allow pinning outside of cpuset
  io_uring/eventfd: move refs to refcount_t
  io_uring: remove unused rsrc_put_fn
  io_uring: add new line after variable declaration
  io_uring: add GCOV_PROFILE_URING Kconfig option
  io_uring/kbuf: add support for incremental buffer consumption
  io_uring/kbuf: pass in 'len' argument for buffer commit
  Revert "io_uring: Require zeroed sqe->len on provided-buffers send"
  io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h
  io_uring/kbuf: add io_kbuf_commit() helper
  io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg
  io_uring: wire up min batch wake timeout
  io_uring: add support for batch wait timeout
  io_uring: implement our own schedule timeout handling
  io_uring: move schedule wait logic into helper
  io_uring: encapsulate extraneous wait flags into a separate struct
  ...
2 parents 318ad42 + 84eacf1 commit 6d0f8dc

File tree

19 files changed: +573 -238 lines changed

include/linux/io_uring_types.h

Lines changed: 3 additions & 0 deletions

@@ -239,6 +239,9 @@ struct io_ring_ctx {
 		struct io_rings			*rings;
 		struct percpu_ref		refs;

+		clockid_t			clockid;
+		enum tk_offsets			clock_offset;
+
 		enum task_work_notify_mode	notify_method;
 		unsigned			sq_thread_idle;
 	} ____cacheline_aligned_in_smp;
include/uapi/linux/io_uring.h

Lines changed: 28 additions & 1 deletion

@@ -440,11 +440,21 @@ struct io_uring_cqe {
  * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
  * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinct
  * 			them from sends.
+ * IORING_CQE_F_BUF_MORE	If set, the buffer ID set in the completion will get
+ *			more completions. In other words, the buffer is being
+ *			partially consumed, and will be used by the kernel for
+ *			more completions. This is only set for buffers used via
+ *			the incremental buffer consumption, as provided by
+ *			a ring buffer setup with IOU_PBUF_RING_INC. For any
+ *			other provided buffer type, all completions with a
+ *			buffer passed back is automatically returned to the
+ *			application.
  */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 #define IORING_CQE_F_NOTIF		(1U << 3)
+#define IORING_CQE_F_BUF_MORE		(1U << 4)

 #define IORING_CQE_BUFFER_SHIFT		16

@@ -507,6 +517,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_SQ_WAIT		(1U << 2)
 #define IORING_ENTER_EXT_ARG		(1U << 3)
 #define IORING_ENTER_REGISTERED_RING	(1U << 4)
+#define IORING_ENTER_ABS_TIMER		(1U << 5)

 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -542,6 +553,7 @@ struct io_uring_params {
 #define IORING_FEAT_LINKED_FILE		(1U << 12)
 #define IORING_FEAT_REG_REG_RING	(1U << 13)
 #define IORING_FEAT_RECVSEND_BUNDLE	(1U << 14)
+#define IORING_FEAT_MIN_TIMEOUT		(1U << 15)

 /*
  * io_uring_register(2) opcodes and arguments
@@ -595,6 +607,8 @@ enum io_uring_register_op {
 	IORING_REGISTER_NAPI			= 27,
 	IORING_UNREGISTER_NAPI			= 28,

+	IORING_REGISTER_CLOCK			= 29,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,

@@ -675,6 +689,11 @@ struct io_uring_restriction {
 	__u32 resv2[3];
 };

+struct io_uring_clock_register {
+	__u32	clockid;
+	__u32	__resv[3];
+};
+
 struct io_uring_buf {
 	__u64	addr;
 	__u32	len;
@@ -707,9 +726,17 @@ struct io_uring_buf_ring {
  *			mmap(2) with the offset set as:
  *			IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
  *			to get a virtual mapping for the ring.
+ * IOU_PBUF_RING_INC:	If set, buffers consumed from this buffer ring can be
+ *			consumed incrementally. Normally one (or more) buffers
+ *			are fully consumed. With incremental consumptions, it's
+ *			feasible to register big ranges of buffers, and each
+ *			use of it will consume only as much as it needs. This
+ *			requires that both the kernel and application keep
+ *			track of where the current read/recv index is at.
  */
 enum io_uring_register_pbuf_ring_flags {
 	IOU_PBUF_RING_MMAP	= 1,
+	IOU_PBUF_RING_INC	= 2,
 };

 /* argument for IORING_(UN)REGISTER_PBUF_RING */
@@ -758,7 +785,7 @@ enum io_uring_register_restriction_op {
 struct io_uring_getevents_arg {
 	__u64	sigmask;
 	__u32	sigmask_sz;
-	__u32	pad;
+	__u32	min_wait_usec;
 	__u64	ts;
 };
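
Taken together, IOU_PBUF_RING_INC and IORING_CQE_F_BUF_MORE change the buffer lifecycle on the application side: a buffer ID can now repeat across completions, with the application tracking its own offset into the buffer and only recycling the buffer once BUF_MORE is no longer set. A sketch of the resulting reap loop follows; buf_base(), consume() and recycle_buffer() are hypothetical application helpers, not part of the uapi:

#include <linux/io_uring.h>
#include <stddef.h>

/* Hypothetical application-side helpers, assumed to exist elsewhere. */
void *buf_base(unsigned int bid);
void consume(const void *data, int len);
void recycle_buffer(unsigned int bid);

static void reap_one(const struct io_uring_cqe *cqe, size_t *buf_offset)
{
	unsigned int bid;

	if (!(cqe->flags & IORING_CQE_F_BUFFER) || cqe->res < 0)
		return;

	bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	consume((const char *)buf_base(bid) + *buf_offset, cqe->res);
	*buf_offset += cqe->res;	/* application tracks the read index */

	if (!(cqe->flags & IORING_CQE_F_BUF_MORE)) {
		/* kernel is done with this buffer; hand it back */
		recycle_buffer(bid);
		*buf_offset = 0;
	}
}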

init/Kconfig

Lines changed: 13 additions & 0 deletions

@@ -1687,6 +1687,19 @@ config IO_URING
 	  applications to submit and complete IO through submission and
 	  completion rings that are shared between the kernel and application.

+config GCOV_PROFILE_URING
+	bool "Enable GCOV profiling on the io_uring subsystem"
+	depends on GCOV_KERNEL
+	help
+	  Enable GCOV profiling on the io_uring subsystem, to facilitate
+	  code coverage testing.
+
+	  If unsure, say N.
+
+	  Note that this will have a negative impact on the performance of
+	  the io_uring subsystem, hence this should only be enabled for
+	  specific test purposes.
+
 config ADVISE_SYSCALLS
 	bool "Enable madvise/fadvise syscalls" if EXPERT
 	default y

io_uring/Makefile

Lines changed: 4 additions & 0 deletions

@@ -2,6 +2,10 @@
 #
 # Makefile for io_uring

+ifdef CONFIG_GCOV_PROFILE_URING
+GCOV_PROFILE := y
+endif
+
 obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \
 					tctx.o filetable.o rw.o net.o poll.o \
 					eventfd.o uring_cmd.o openclose.o \
io_uring/eventfd.c

Lines changed: 7 additions & 6 deletions

@@ -15,7 +15,7 @@ struct io_ev_fd {
 	struct eventfd_ctx	*cq_ev_fd;
 	unsigned int		eventfd_async: 1;
 	struct rcu_head		rcu;
-	atomic_t		refs;
+	refcount_t		refs;
 	atomic_t		ops;
 };

@@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)

 	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);

-	if (atomic_dec_and_test(&ev_fd->refs))
+	if (refcount_dec_and_test(&ev_fd->refs))
 		io_eventfd_free(rcu);
 }

@@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 	 */
 	if (unlikely(!ev_fd))
 		return;
-	if (!atomic_inc_not_zero(&ev_fd->refs))
+	if (!refcount_inc_not_zero(&ev_fd->refs))
 		return;
 	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
 		goto out;
@@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 	}
 }
 out:
-	if (atomic_dec_and_test(&ev_fd->refs))
+	if (refcount_dec_and_test(&ev_fd->refs))
 		call_rcu(&ev_fd->rcu, io_eventfd_free);
 }

@@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
 	if (IS_ERR(ev_fd->cq_ev_fd)) {
 		int ret = PTR_ERR(ev_fd->cq_ev_fd);
+
 		kfree(ev_fd);
 		return ret;
 	}
@@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,

 	ev_fd->eventfd_async = eventfd_async;
 	ctx->has_evfd = true;
-	atomic_set(&ev_fd->refs, 1);
+	refcount_set(&ev_fd->refs, 1);
 	atomic_set(&ev_fd->ops, 0);
 	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
 	return 0;
@@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
 	if (ev_fd) {
 		ctx->has_evfd = false;
 		rcu_assign_pointer(ctx->io_ev_fd, NULL);
-		if (atomic_dec_and_test(&ev_fd->refs))
+		if (refcount_dec_and_test(&ev_fd->refs))
 			call_rcu(&ev_fd->rcu, io_eventfd_free);
 		return 0;
 	}
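
For readers skimming the conversion: the shape of the code is unchanged, only the reference-count type is. The point of refcount_t is that it saturates with a warning on over/underflow instead of silently wrapping the way a raw atomic_t would, and refcount_inc_not_zero() refuses to resurrect an object once the last reference has been dropped. A condensed restatement of the signal path after this change (a distillation of the diff above, not new kernel code):

	rcu_read_lock();
	ev_fd = rcu_dereference(ctx->io_ev_fd);
	/* only take a reference if teardown has not already dropped the last one */
	if (ev_fd && refcount_inc_not_zero(&ev_fd->refs)) {
		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
		if (refcount_dec_and_test(&ev_fd->refs))
			call_rcu(&ev_fd->rcu, io_eventfd_free);
	}
	rcu_read_unlock();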

io_uring/fdinfo.c

Lines changed: 13 additions & 1 deletion

@@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 			cqe->user_data, cqe->res, cqe->flags);

 	}
-
 	spin_unlock(&ctx->completion_lock);
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (ctx->napi_enabled) {
+		seq_puts(m, "NAPI:\tenabled\n");
+		seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
+		if (ctx->napi_prefer_busy_poll)
+			seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
+		else
+			seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
+	} else {
+		seq_puts(m, "NAPI:\tdisabled\n");
+	}
+#endif
 }
 #endif
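
With CONFIG_NET_RX_BUSY_POLL built in and NAPI busy polling enabled on a ring, the tail of /proc/<pid>/fdinfo/<ring-fd> would gain lines along these lines (values illustrative, derived from the format strings above):

	NAPI:	enabled
	napi_busy_poll_dt:	50000
	napi_prefer_busy_poll:	true

With NAPI disabled, a single "NAPI:	disabled" line is printed instead.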

io_uring/io-wq.c

Lines changed: 19 additions & 6 deletions

@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/rculist_nulls.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/task_work.h>
 #include <linux/audit.h>
 #include <linux/mmu_context.h>
@@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

 	if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL))
 		goto err;
-	cpumask_copy(wq->cpu_mask, cpu_possible_mask);
+	cpuset_cpus_allowed(data->task, wq->cpu_mask);
 	wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
 	wq->acct[IO_WQ_ACCT_UNBOUND].max_workers =
 				task_rlimit(current, RLIMIT_NPROC);
@@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)

 int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
 {
+	cpumask_var_t allowed_mask;
+	int ret = 0;
+
 	if (!tctx || !tctx->io_wq)
 		return -EINVAL;

+	if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
+		return -ENOMEM;
+
 	rcu_read_lock();
-	if (mask)
-		cpumask_copy(tctx->io_wq->cpu_mask, mask);
-	else
-		cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask);
+	cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask);
+	if (mask) {
+		if (cpumask_subset(mask, allowed_mask))
+			cpumask_copy(tctx->io_wq->cpu_mask, mask);
+		else
+			ret = -EINVAL;
+	} else {
+		cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask);
+	}
 	rcu_read_unlock();

-	return 0;
+	free_cpumask_var(allowed_mask);
+	return ret;
 }

 /*
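
The userspace-visible effect of the cpuset checks: a mask passed via the existing IORING_REGISTER_IOWQ_AFF opcode must now be a subset of the task's cpuset, and the no-mask reset case falls back to the cpuset rather than cpu_possible_mask. A minimal sketch of a call that this change can newly reject, assuming a raw syscall and a valid ring fd:

#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sched.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Pin io-wq workers to CPU 3. After this change the kernel returns
 * -EINVAL if CPU 3 lies outside the calling task's cpuset, where it
 * previously validated only against cpu_possible_mask. */
static int pin_iowq_workers(int ring_fd)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(3, &mask);
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
}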
