
Commit 4f72ed4

Merge tag 'io_uring-6.9-20240405' of git://git.kernel.dk/linux
Pull io_uring fixes from Jens Axboe:

 - Backport of some fixes that came up during development of the 6.10
   io_uring patches. This includes some kbuf cleanups and reference fixes.

 - Disable multishot read if we don't have NOWAIT support on the target

 - Fix for a dependency issue with workqueue flushing

* tag 'io_uring-6.9-20240405' of git://git.kernel.dk/linux:
  io_uring/kbuf: hold io_buffer_list reference over mmap
  io_uring/kbuf: protect io_buffer_list teardown with a reference
  io_uring/kbuf: get rid of bl->is_ready
  io_uring/kbuf: get rid of lower BGID lists
  io_uring: use private workqueue for exit work
  io_uring: disable io-wq execution of multishot NOWAIT requests
  io_uring/rw: don't allow multishot reads without NOWAIT support
2 parents: 4de2ff2 + 561e4f9

5 files changed, 73 insertions(+), 94 deletions(-)

include/linux/io_uring_types.h

Lines changed: 0 additions & 1 deletion
@@ -294,7 +294,6 @@ struct io_ring_ctx {
 
                 struct io_submit_state          submit_state;
 
-                struct io_buffer_list           *io_bl;
                 struct xarray                   io_bl_xa;
 
                 struct io_hash_table            cancel_table_locked;

io_uring/io_uring.c

Lines changed: 19 additions & 12 deletions
@@ -147,6 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 static void io_queue_sqe(struct io_kiocb *req);
 
 struct kmem_cache *req_cachep;
+static struct workqueue_struct *iou_wq __ro_after_init;
 
 static int __read_mostly sysctl_io_uring_disabled;
 static int __read_mostly sysctl_io_uring_group = -1;
@@ -350,7 +351,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 err:
         kfree(ctx->cancel_table.hbs);
         kfree(ctx->cancel_table_locked.hbs);
-        kfree(ctx->io_bl);
         xa_destroy(&ctx->io_bl_xa);
         kfree(ctx);
         return NULL;
@@ -1982,10 +1982,15 @@ void io_wq_submit_work(struct io_wq_work *work)
                 err = -EBADFD;
                 if (!io_file_can_poll(req))
                         goto fail;
-                err = -ECANCELED;
-                if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK)
-                        goto fail;
-                return;
+                if (req->file->f_flags & O_NONBLOCK ||
+                    req->file->f_mode & FMODE_NOWAIT) {
+                        err = -ECANCELED;
+                        if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK)
+                                goto fail;
+                        return;
+                } else {
+                        req->flags &= ~REQ_F_APOLL_MULTISHOT;
+                }
         }
 
         if (req->flags & REQ_F_FORCE_ASYNC) {
@@ -2926,7 +2931,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
         io_napi_free(ctx);
         kfree(ctx->cancel_table.hbs);
         kfree(ctx->cancel_table_locked.hbs);
-        kfree(ctx->io_bl);
         xa_destroy(&ctx->io_bl_xa);
         kfree(ctx);
 }
@@ -3161,7 +3165,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
          * noise and overhead, there's no discernable change in runtime
          * over using system_wq.
          */
-        queue_work(system_unbound_wq, &ctx->exit_work);
+        queue_work(iou_wq, &ctx->exit_work);
 }
 
 static int io_uring_release(struct inode *inode, struct file *file)
@@ -3443,14 +3447,15 @@ static void *io_uring_validate_mmap_request(struct file *file,
                 ptr = ctx->sq_sqes;
                 break;
         case IORING_OFF_PBUF_RING: {
+                struct io_buffer_list *bl;
                 unsigned int bgid;
 
                 bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-                rcu_read_lock();
-                ptr = io_pbuf_get_address(ctx, bgid);
-                rcu_read_unlock();
-                if (!ptr)
-                        return ERR_PTR(-EINVAL);
+                bl = io_pbuf_get_bl(ctx, bgid);
+                if (IS_ERR(bl))
+                        return bl;
+                ptr = bl->buf_ring;
+                io_put_bl(ctx, bl);
                 break;
                 }
         default:
@@ -4185,6 +4190,8 @@ static int __init io_uring_init(void)
         io_buf_cachep = KMEM_CACHE(io_buffer,
                                    SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
 
+        iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64);
+
 #ifdef CONFIG_SYSCTL
         register_sysctl_init("kernel", kernel_io_uring_disabled_table);
 #endif
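
The hunks above move ctx->exit_work off system_unbound_wq and onto a dedicated "iou_exit" workqueue allocated in io_uring_init(), addressing the workqueue-flushing dependency noted in the merge message. As a rough, hypothetical sketch of that pattern (a private unbound workqueue whose flush does not wait on unrelated system-workqueue items), the module below uses invented demo_* names rather than io_uring symbols:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_work;

static void demo_work_fn(struct work_struct *work)
{
        pr_info("demo work ran on its own unbound workqueue\n");
}

static int __init demo_init(void)
{
        /* private queue, mirroring alloc_workqueue("iou_exit", WQ_UNBOUND, 64) */
        demo_wq = alloc_workqueue("demo_exit", WQ_UNBOUND, 64);
        if (!demo_wq)
                return -ENOMEM;

        INIT_WORK(&demo_work, demo_work_fn);
        queue_work(demo_wq, &demo_work);
        return 0;
}

static void __exit demo_exit(void)
{
        /* drains and frees only the work queued on this private queue */
        destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");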

io_uring/kbuf.c

Lines changed: 41 additions & 77 deletions
@@ -17,8 +17,6 @@
 
 #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
 
-#define BGID_ARRAY      64
-
 /* BIDs are addressed by a 16-bit field in a CQE */
 #define MAX_BIDS_PER_BGID (1 << 16)
 
@@ -40,13 +38,9 @@ struct io_buf_free {
         int                             inuse;
 };
 
-static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
-                                                   struct io_buffer_list *bl,
-                                                   unsigned int bgid)
+static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
+                                                          unsigned int bgid)
 {
-        if (bl && bgid < BGID_ARRAY)
-                return &bl[bgid];
-
         return xa_load(&ctx->io_bl_xa, bgid);
 }
 
@@ -55,7 +49,7 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
 {
         lockdep_assert_held(&ctx->uring_lock);
 
-        return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
+        return __io_buffer_get_list(ctx, bgid);
 }
 
 static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -67,11 +61,7 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
          * always under the ->uring_lock, but the RCU lookup from mmap does.
          */
         bl->bgid = bgid;
-        smp_store_release(&bl->is_ready, 1);
-
-        if (bgid < BGID_ARRAY)
-                return 0;
-
+        atomic_set(&bl->refs, 1);
         return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
 }
 
@@ -208,24 +198,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
         return ret;
 }
 
-static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
-{
-        struct io_buffer_list *bl;
-        int i;
-
-        bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
-        if (!bl)
-                return -ENOMEM;
-
-        for (i = 0; i < BGID_ARRAY; i++) {
-                INIT_LIST_HEAD(&bl[i].buf_list);
-                bl[i].bgid = i;
-        }
-
-        smp_store_release(&ctx->io_bl, bl);
-        return 0;
-}
-
 /*
  * Mark the given mapped range as free for reuse
  */
@@ -294,24 +266,24 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
         return i;
 }
 
+void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+        if (atomic_dec_and_test(&bl->refs)) {
+                __io_remove_buffers(ctx, bl, -1U);
+                kfree_rcu(bl, rcu);
+        }
+}
+
 void io_destroy_buffers(struct io_ring_ctx *ctx)
 {
         struct io_buffer_list *bl;
         struct list_head *item, *tmp;
         struct io_buffer *buf;
         unsigned long index;
-        int i;
-
-        for (i = 0; i < BGID_ARRAY; i++) {
-                if (!ctx->io_bl)
-                        break;
-                __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
-        }
 
         xa_for_each(&ctx->io_bl_xa, index, bl) {
                 xa_erase(&ctx->io_bl_xa, bl->bgid);
-                __io_remove_buffers(ctx, bl, -1U);
-                kfree_rcu(bl, rcu);
+                io_put_bl(ctx, bl);
         }
 
         /*
@@ -489,12 +461,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 
         io_ring_submit_lock(ctx, issue_flags);
 
-        if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
-                ret = io_init_bl_list(ctx);
-                if (ret)
-                        goto err;
-        }
-
         bl = io_buffer_get_list(ctx, p->bgid);
         if (unlikely(!bl)) {
                 bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
@@ -507,14 +473,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
                 if (ret) {
                         /*
                          * Doesn't need rcu free as it was never visible, but
-                         * let's keep it consistent throughout. Also can't
-                         * be a lower indexed array group, as adding one
-                         * where lookup failed cannot happen.
+                         * let's keep it consistent throughout.
                          */
-                        if (p->bgid >= BGID_ARRAY)
-                                kfree_rcu(bl, rcu);
-                        else
-                                WARN_ON_ONCE(1);
+                        kfree_rcu(bl, rcu);
                         goto err;
                 }
         }
@@ -679,12 +640,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
         if (reg.ring_entries >= 65536)
                 return -EINVAL;
 
-        if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
-                int ret = io_init_bl_list(ctx);
-                if (ret)
-                        return ret;
-        }
-
         bl = io_buffer_get_list(ctx, reg.bgid);
         if (bl) {
                 /* if mapped buffer ring OR classic exists, don't allow */
@@ -733,11 +688,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
         if (!bl->is_buf_ring)
                 return -EINVAL;
 
-        __io_remove_buffers(ctx, bl, -1U);
-        if (bl->bgid >= BGID_ARRAY) {
-                xa_erase(&ctx->io_bl_xa, bl->bgid);
-                kfree_rcu(bl, rcu);
-        }
+        xa_erase(&ctx->io_bl_xa, bl->bgid);
+        io_put_bl(ctx, bl);
         return 0;
 }
 
@@ -767,23 +719,35 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
         return 0;
 }
 
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
+struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
+                                      unsigned long bgid)
 {
         struct io_buffer_list *bl;
+        bool ret;
 
-        bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
-
-        if (!bl || !bl->is_mmap)
-                return NULL;
         /*
-         * Ensure the list is fully setup. Only strictly needed for RCU lookup
-         * via mmap, and in that case only for the array indexed groups. For
-         * the xarray lookups, it's either visible and ready, or not at all.
+         * We have to be a bit careful here - we're inside mmap and cannot grab
+         * the uring_lock. This means the buffer_list could be simultaneously
+         * going away, if someone is trying to be sneaky. Look it up under rcu
+         * so we know it's not going away, and attempt to grab a reference to
+         * it. If the ref is already zero, then fail the mapping. If successful,
+         * the caller will call io_put_bl() to drop the reference at the
+         * end. This may then safely free the buffer_list (and drop the pages)
+         * at that point, vm_insert_pages() would've already grabbed the
+         * necessary vma references.
          */
-        if (!smp_load_acquire(&bl->is_ready))
-                return NULL;
-
-        return bl->buf_ring;
+        rcu_read_lock();
+        bl = xa_load(&ctx->io_bl_xa, bgid);
+        /* must be a mmap'able buffer ring and have pages */
+        ret = false;
+        if (bl && bl->is_mmap)
+                ret = atomic_inc_not_zero(&bl->refs);
+        rcu_read_unlock();
+
+        if (ret)
+                return bl;
+
+        return ERR_PTR(-EINVAL);
 }
 
 /*
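
Taken together, these kbuf.c hunks drop the special lower-BGID array and the is_ready flag in favour of a single xarray keyed by buffer group ID plus a per-list reference count: the mmap path looks the list up under RCU and only proceeds if atomic_inc_not_zero() succeeds, and the final io_put_bl() tears the list down via kfree_rcu(). The condensed sketch below shows that lookup/teardown pattern in isolation; demo_list, demo_xa, demo_get() and demo_put() are invented names used purely for illustration, not io_uring code:

#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>
#include <linux/slab.h>

struct demo_list {
        struct rcu_head rcu;
        atomic_t refs;
        /* ... payload ... */
};

static DEFINE_XARRAY(demo_xa);

/* Lockless lookup: pin the object with a reference, or fail. */
static struct demo_list *demo_get(unsigned long id)
{
        struct demo_list *dl;
        bool got = false;

        rcu_read_lock();
        dl = xa_load(&demo_xa, id);
        /* refuse the lookup if teardown already dropped the last reference */
        if (dl)
                got = atomic_inc_not_zero(&dl->refs);
        rcu_read_unlock();

        return got ? dl : NULL;
}

/* Drop a reference; the final put frees only after an RCU grace period. */
static void demo_put(struct demo_list *dl)
{
        if (atomic_dec_and_test(&dl->refs))
                kfree_rcu(dl, rcu);
}

A caller that obtained the object through demo_get() knows it stays valid until the matching demo_put(), which is the property the io_uring_validate_mmap_request() hunk above relies on while it reads bl->buf_ring.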

io_uring/kbuf.h

Lines changed: 5 additions & 3 deletions
@@ -25,12 +25,12 @@ struct io_buffer_list {
         __u16 head;
         __u16 mask;
 
+        atomic_t refs;
+
         /* ring mapped provided buffers */
         __u8 is_buf_ring;
         /* ring mapped provided buffers, but mmap'ed by application */
         __u8 is_mmap;
-        /* bl is visible from an RCU point of view for lookup */
-        __u8 is_ready;
 };
 
 struct io_buffer {
@@ -61,7 +61,9 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
 
 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
 
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
+void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
+struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
+                                      unsigned long bgid);
 
 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
 {

io_uring/rw.c

Lines changed: 8 additions & 1 deletion
@@ -936,6 +936,13 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
 
         ret = __io_read(req, issue_flags);
 
+        /*
+         * If the file doesn't support proper NOWAIT, then disable multishot
+         * and stay in single shot mode.
+         */
+        if (!io_file_supports_nowait(req))
+                req->flags &= ~REQ_F_APOLL_MULTISHOT;
+
         /*
          * If we get -EAGAIN, recycle our buffer and just let normal poll
          * handling arm it.
@@ -955,7 +962,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
         /*
          * Any successful return value will keep the multishot read armed.
          */
-        if (ret > 0) {
+        if (ret > 0 && req->flags & REQ_F_APOLL_MULTISHOT) {
                 /*
                  * Put our buffer and post a CQE. If we fail to post a CQE, then
                  * jump to the termination path. This request is then done.
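
The rw.c change keeps a multishot read armed only while REQ_F_APOLL_MULTISHOT is still set, and clears that flag up front when io_file_supports_nowait() reports that the target cannot do NOWAIT I/O; the io_wq_submit_work() hunk in io_uring.c applies the equivalent O_NONBLOCK/FMODE_NOWAIT test before re-arming poll. A small hypothetical helper built around the same test might look like the sketch below (demo_req, DEMO_F_MULTISHOT and the demo_* functions are invented for illustration and are not io_uring APIs):

#include <linux/bits.h>
#include <linux/fcntl.h>
#include <linux/fs.h>

#define DEMO_F_MULTISHOT        BIT(0)

struct demo_req {
        struct file *file;
        unsigned int flags;
};

/* same per-file test as the io_wq_submit_work() hunk above */
static bool demo_file_can_nowait(const struct demo_req *req)
{
        return (req->file->f_flags & O_NONBLOCK) ||
               (req->file->f_mode & FMODE_NOWAIT);
}

static void demo_prep_multishot(struct demo_req *req)
{
        /* without NOWAIT support, demote the request to single-shot mode */
        if (!demo_file_can_nowait(req))
                req->flags &= ~DEMO_F_MULTISHOT;
}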
