
Commit 3147a06

Merge tag 'for-6.12/io_uring-20240922' of git://git.kernel.dk/linux
Pull more io_uring updates from Jens Axboe:
 "Mostly just a set of fixes in here, or little changes that didn't get
  included in the initial pull request. This contains:

   - Move the SQPOLL napi polling outside the submission lock (Olivier)

   - Rename of the "copy buffers" API that got added in the 6.12 merge
     window. There's really no copying going on, it's just referencing
     the buffers. After a bit of consideration, decided that it was
     better to simply rename this to avoid potential confusion (me)

   - Shrink struct io_mapped_ubuf from 48 to 32 bytes, by changing it to
     start + len tracking rather than having start / end in there, and
     by removing the caching of folio_mask when we can just calculate it
     from folio_shift when we need it (me)

   - Fixes for the SQPOLL affinity checking (me, Felix)

   - Fix for how cqring waiting checks for the presence of task_work.
     Just check it directly rather than check for a specific
     notification mechanism (me)

   - Tweak to how request linking is represented in tracing (me)

   - Fix a syzbot report that deliberately sets up a huge list of
     overflow entries, and then hits rcu stalls when flushing this list.
     Just check for the need to preempt, and drop/reacquire locks in the
     loop. There's no state maintained over the loop itself, and each
     entry is yanked from head-of-list (me)"

* tag 'for-6.12/io_uring-20240922' of git://git.kernel.dk/linux:
  io_uring: check if we need to reschedule during overflow flush
  io_uring: improve request linking trace
  io_uring: check for presence of task_work rather than TIF_NOTIFY_SIGNAL
  io_uring/sqpoll: do the napi busy poll outside the submission block
  io_uring: clean up a type in io_uring_register_get_file()
  io_uring/sqpoll: do not put cpumask on stack
  io_uring/sqpoll: retain test for whether the CPU is valid
  io_uring/rsrc: change ubuf->ubuf_end to length tracking
  io_uring/rsrc: get rid of io_mapped_ubuf->folio_mask
  io_uring: rename "copy buffers" to "clone buffers"
2 parents 172d513 + eac2ca2

8 files changed: +55 -35 lines changed


include/uapi/linux/io_uring.h

Lines changed: 3 additions & 3 deletions

@@ -609,8 +609,8 @@ enum io_uring_register_op {
 
 	IORING_REGISTER_CLOCK = 29,
 
-	/* copy registered buffers from source ring to current ring */
-	IORING_REGISTER_COPY_BUFFERS = 30,
+	/* clone registered buffers from source ring to current ring */
+	IORING_REGISTER_CLONE_BUFFERS = 30,
 
 	/* this goes last */
 	IORING_REGISTER_LAST,
@@ -701,7 +701,7 @@ enum {
 	IORING_REGISTER_SRC_REGISTERED = 1,
 };
 
-struct io_uring_copy_buffers {
+struct io_uring_clone_buffers {
 	__u32 src_fd;
 	__u32 flags;
 	__u32 pad[6];
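
For orientation, a minimal userspace sketch of driving the renamed opcode through the raw io_uring_register(2) syscall. This assumes installed uapi headers new enough to carry the renamed struct, two rings whose fds are dst_fd/src_fd with buffers already registered on the source, and elides error handling; it is an illustration, not a liburing helper.

	/* Sketch: clone the buffers registered on src_fd into dst_fd. */
	#include <linux/io_uring.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int clone_ring_buffers(int dst_fd, int src_fd)
	{
		struct io_uring_clone_buffers buf;

		memset(&buf, 0, sizeof(buf));	/* pad[] must stay zeroed */
		buf.src_fd = src_fd;		/* plain fd here; set flags to
						 * IORING_REGISTER_SRC_REGISTERED
						 * if src_fd is a registered
						 * ring index instead */
		buf.flags = 0;

		/* nr_args must be 1, per the register.c dispatch below */
		return syscall(__NR_io_uring_register, dst_fd,
			       IORING_REGISTER_CLONE_BUFFERS, &buf, 1);
	}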

io_uring/fdinfo.c

Lines changed: 1 addition & 2 deletions

@@ -177,9 +177,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
 	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
 		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
-		unsigned int len = buf->ubuf_end - buf->ubuf;
 
-		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
+		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
 	}
 	if (has_lock && !xa_empty(&ctx->personalities)) {
 		unsigned long index;
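
For illustration, with one 4 KiB buffer registered, the loop above would print something like the following in the ring fd's fdinfo (address and values hypothetical, per the "%5u: 0x%llx/%u" format):

	UserBufs:	1
	    0: 0x7f1a2b3c0000/4096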

io_uring/io_uring.c

Lines changed: 18 additions & 3 deletions

@@ -635,6 +635,21 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
 		}
 		list_del(&ocqe->list);
 		kfree(ocqe);
+
+		/*
+		 * For silly syzbot cases that deliberately overflow by huge
+		 * amounts, check if we need to resched and drop and
+		 * reacquire the locks if so. Nothing real would ever hit this.
+		 * Ideally we'd have a non-posting unlock for this, but hard
+		 * to care for a non-real case.
+		 */
+		if (need_resched()) {
+			io_cq_unlock_post(ctx);
+			mutex_unlock(&ctx->uring_lock);
+			cond_resched();
+			mutex_lock(&ctx->uring_lock);
+			io_cq_lock(ctx);
+		}
 	}
 
 	if (list_empty(&ctx->cq_overflow_list)) {
@@ -2164,7 +2179,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * conditions are true (normal request), then just queue it.
 	 */
 	if (unlikely(link->head)) {
-		trace_io_uring_link(req, link->head);
+		trace_io_uring_link(req, link->last);
 		link->last->link = req;
 		link->last = req;
 
@@ -2472,7 +2487,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 		return 1;
 	if (unlikely(!llist_empty(&ctx->work_llist)))
 		return 1;
-	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
+	if (unlikely(task_work_pending(current)))
 		return 1;
 	if (unlikely(task_sigpending(current)))
 		return -EINTR;
@@ -2579,9 +2594,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	 * If we got woken because of task_work being processed, run it
 	 * now rather than let the caller do another wait loop.
 	 */
-	io_run_task_work();
 	if (!llist_empty(&ctx->work_llist))
 		io_run_local_work(ctx, nr_wait);
+	io_run_task_work();
 
 	/*
 	 * Non-local task_work will be run on exit to userspace, but
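
For context on the cqring-wait change: TIF_NOTIFY_SIGNAL is only one of the notification mechanisms that may accompany queued task_work, so the wait loop now asks the direct question instead. A sketch of the helper as it appears in include/linux/task_work.h around this merge (paraphrased from memory; check the tree for the authoritative definition):

	static inline bool task_work_pending(struct task_struct *task)
	{
		return READ_ONCE(task->task_works);
	}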

io_uring/register.c

Lines changed: 3 additions & 3 deletions

@@ -542,11 +542,11 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_clock(ctx, arg);
 		break;
-	case IORING_REGISTER_COPY_BUFFERS:
+	case IORING_REGISTER_CLONE_BUFFERS:
 		ret = -EINVAL;
 		if (!arg || nr_args != 1)
 			break;
-		ret = io_register_copy_buffers(ctx, arg);
+		ret = io_register_clone_buffers(ctx, arg);
 		break;
 	default:
 		ret = -EINVAL;
@@ -561,7 +561,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
  * true, then the registered index is used. Otherwise, the normal fd table.
  * Caller must call fput() on the returned file, unless it's an ERR_PTR.
  */
-struct file *io_uring_register_get_file(int fd, bool registered)
+struct file *io_uring_register_get_file(unsigned int fd, bool registered)
 {
 	struct file *file;
io_uring/register.h

Lines changed: 1 addition & 1 deletion

@@ -4,6 +4,6 @@
 
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
-struct file *io_uring_register_get_file(int fd, bool registered);
+struct file *io_uring_register_get_file(unsigned int fd, bool registered);
 
 #endif

io_uring/rsrc.c

Lines changed: 10 additions & 13 deletions

@@ -38,7 +38,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 static const struct io_mapped_ubuf dummy_ubuf = {
 	/* set invalid range, so io_import_fixed() fails meeting it */
 	.ubuf = -1UL,
-	.ubuf_end = 0,
+	.len = UINT_MAX,
 };
 
 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
@@ -991,16 +991,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	size = iov->iov_len;
 	/* store original address for later verification */
 	imu->ubuf = (unsigned long) iov->iov_base;
-	imu->ubuf_end = imu->ubuf + iov->iov_len;
+	imu->len = iov->iov_len;
 	imu->nr_bvecs = nr_pages;
 	imu->folio_shift = PAGE_SHIFT;
-	imu->folio_mask = PAGE_MASK;
-	if (coalesced) {
+	if (coalesced)
 		imu->folio_shift = data.folio_shift;
-		imu->folio_mask = ~((1UL << data.folio_shift) - 1);
-	}
 	refcount_set(&imu->refs, 1);
-	off = (unsigned long) iov->iov_base & ~imu->folio_mask;
+	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
 	*pimu = imu;
 	ret = 0;
 
@@ -1100,7 +1097,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
 		return -EFAULT;
 	/* not inside the mapped region */
-	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
+	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
 		return -EFAULT;
 
 	/*
@@ -1143,14 +1140,14 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 			iter->bvec = bvec + seg_skip;
 			iter->nr_segs -= seg_skip;
 			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ~imu->folio_mask;
+			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
 		}
 	}
 
 	return 0;
 }
 
-static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
+static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
 {
 	struct io_mapped_ubuf **user_bufs;
 	struct io_rsrc_data *data;
@@ -1214,9 +1211,9 @@ static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
  *
  * Since the memory is already accounted once, don't account it again.
  */
-int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
+int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 {
-	struct io_uring_copy_buffers buf;
+	struct io_uring_clone_buffers buf;
 	bool registered_src;
 	struct file *file;
 	int ret;
@@ -1234,7 +1231,7 @@ int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	file = io_uring_register_get_file(buf.src_fd, registered_src);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
-	ret = io_copy_buffers(ctx, file->private_data);
+	ret = io_clone_buffers(ctx, file->private_data);
 	if (!registered_src)
 		fput(file);
 	return ret;
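
The rsrc.c changes drop the cached folio_mask and derive the low-bits mask from folio_shift at the point of use. A standalone demonstration of the equivalence, with hypothetical values, compilable as plain C:

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int folio_shift = 16;		/* e.g. a 64K folio */
		unsigned long addr = 0x7f00deadbeefUL;

		/* old scheme: cache folio_mask, offset is addr & ~mask */
		unsigned long folio_mask = ~((1UL << folio_shift) - 1);
		unsigned long off_old = addr & ~folio_mask;

		/* new scheme: build the low-bits mask from the shift */
		unsigned long off_new = addr & ((1UL << folio_shift) - 1);

		assert(off_old == off_new);
		printf("offset within folio: 0x%lx\n", off_new);
		return 0;
	}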

io_uring/rsrc.h

Lines changed: 3 additions & 4 deletions

@@ -42,12 +42,11 @@ struct io_rsrc_node {
 
 struct io_mapped_ubuf {
 	u64		ubuf;
-	u64		ubuf_end;
+	unsigned int	len;
 	unsigned int	nr_bvecs;
 	unsigned int	folio_shift;
-	unsigned long	acct_pages;
-	unsigned long	folio_mask;
 	refcount_t	refs;
+	unsigned long	acct_pages;
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 
@@ -68,7 +67,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		   struct io_mapped_ubuf *imu,
 		   u64 buf_addr, size_t len);
 
-int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg);
+int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
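
Back-of-envelope accounting for the 48-to-32-byte claim in the pull message, assuming 64-bit natural alignment and a 4-byte refcount_t (actual padding may vary by config):

	/*
	 * before (48 bytes):                after (32 bytes):
	 *   u64           ubuf;        8     u64           ubuf;        8
	 *   u64           ubuf_end;    8     unsigned int  len;         4
	 *   unsigned int  nr_bvecs;    4     unsigned int  nr_bvecs;    4
	 *   unsigned int  folio_shift; 4     unsigned int  folio_shift; 4
	 *   unsigned long acct_pages;  8     refcount_t    refs;        4
	 *   unsigned long folio_mask;  8     unsigned long acct_pages;  8
	 *   refcount_t    refs;        4  (+ 4 tail padding)
	 */

Moving refs ahead of acct_pages lets the four 32-bit fields pack after ubuf with no tail padding before the flexible bvec[] array.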

io_uring/sqpoll.c

Lines changed: 16 additions & 6 deletions

@@ -196,9 +196,6 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 		ret = io_submit_sqes(ctx, to_submit);
 		mutex_unlock(&ctx->uring_lock);
 
-		if (io_napi(ctx))
-			ret += io_napi_sqpoll_busy_poll(ctx);
-
 		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
 			wake_up(&ctx->sqo_sq_wait);
 		if (creds)
@@ -323,6 +320,10 @@ static int io_sq_thread(void *data)
 			if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
 				sqt_spin = true;
 
+		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+			if (io_napi(ctx))
+				io_napi_sqpoll_busy_poll(ctx);
+
 		if (sqt_spin || !time_after(jiffies, timeout)) {
 			if (sqt_spin) {
 				io_sq_update_worktime(sqd, &start);
@@ -461,13 +462,22 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 		return 0;
 
 	if (p->flags & IORING_SETUP_SQ_AFF) {
-		struct cpumask allowed_mask;
+		cpumask_var_t allowed_mask;
 		int cpu = p->sq_thread_cpu;
 
 		ret = -EINVAL;
-		cpuset_cpus_allowed(current, &allowed_mask);
-		if (!cpumask_test_cpu(cpu, &allowed_mask))
+		if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+			goto err_sqpoll;
+		ret = -ENOMEM;
+		if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
+			goto err_sqpoll;
+		ret = -EINVAL;
+		cpuset_cpus_allowed(current, allowed_mask);
+		if (!cpumask_test_cpu(cpu, allowed_mask)) {
+			free_cpumask_var(allowed_mask);
 			goto err_sqpoll;
+		}
+		free_cpumask_var(allowed_mask);
 		sqd->sq_cpu = cpu;
 	} else {
 		sqd->sq_cpu = -1;
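
On the design choice: with CONFIG_CPUMASK_OFFSTACK=y a struct cpumask is sized for NR_CPUS (8192 CPUs would mean a 1 KiB object), which is unfriendly to kernel stack frames, so cpumask_var_t plus alloc/free is the usual pattern. A generic sketch of that pattern (hypothetical helper, not the io_uring code):

	#include <linux/cpumask.h>
	#include <linux/cpuset.h>
	#include <linux/gfp.h>
	#include <linux/sched.h>

	/* Probe whether @cpu is in the calling task's cpuset without
	 * placing a cpumask on the stack. */
	static int cpu_allowed_for_task(unsigned int cpu)
	{
		cpumask_var_t mask;	/* pointer-sized when OFFSTACK=y */
		int ret;

		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
			return -ENOMEM;
		cpuset_cpus_allowed(current, mask);	/* no '&' needed */
		ret = cpumask_test_cpu(cpu, mask) ? 0 : -EINVAL;
		free_cpumask_var(mask);
		return ret;
	}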
