
Commit 8350142

Merge tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

 - Cleanups of the eventfd handling code, making it fully private.

 - Support for sending a sync message to another ring, without having a
   ring available to send a normal async message.

 - Get rid of the separate unlocked hash table, unify everything around
   the single locked one.

 - Add support for ring resizing. It can be hard to appropriately size
   the CQ ring upfront, if the application doesn't know how busy it
   will be. This results in applications sizing rings for the most busy
   case, which can be wasteful. With ring resizing, they can start small
   and grow the ring, if needed.

 - Add support for fixed wait regions, rather than needing to copy the
   same wait data tons of times for each wait operation.

 - Rewrite the resource node handling, which before was serialized per
   ring. This caused issues with particularly fixed files, where one
   file waiting on IO could hold up putting and freeing of other
   unrelated files. Now each node is handled separately. New code is
   much simpler too, and was a net 250 line reduction in code.

 - Add support for just doing partial buffer clones, rather than always
   cloning the entire buffer table.

 - Series adding static NAPI support, where a specific NAPI instance is
   used rather than having a list of them available that need lookup.

 - Add support for mapped regions, and also convert the fixed wait
   support mentioned above to that concept. This avoids doing special
   mappings for various planned features, and folds the existing
   registered wait into that too.

 - Add support for hybrid IO polling, which is a variant of strict
   IOPOLL but with an initial sleep delay to avoid spinning too early
   and wasting resources on devices that aren't necessarily in the
   < 5 usec category wrt latencies.

 - Various cleanups and little fixes.

* tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux: (79 commits)
  io_uring/region: fix error codes after failed vmap
  io_uring: restore back registered wait arguments
  io_uring: add memory region registration
  io_uring: introduce concept of memory regions
  io_uring: temporarily disable registered waits
  io_uring: disable ENTER_EXT_ARG_REG for IOPOLL
  io_uring: fortify io_pin_pages with a warning
  switch io_msg_ring() to CLASS(fd)
  io_uring: fix invalid hybrid polling ctx leaks
  io_uring/uring_cmd: fix buffer index retrieval
  io_uring/rsrc: add & apply io_req_assign_buf_node()
  io_uring/rsrc: remove '->ctx_ptr' of 'struct io_rsrc_node'
  io_uring/rsrc: pass 'struct io_ring_ctx' reference to rsrc helpers
  io_uring: avoid normal tw intermediate fallback
  io_uring/napi: add static napi tracking strategy
  io_uring/napi: clean up __io_napi_do_busy_loop
  io_uring/napi: Use lock guards
  io_uring/napi: improve __io_napi_add
  io_uring/napi: fix io_napi_entry RCU accesses
  io_uring/napi: protect concurrent io_napi_entry timeout accesses
  ...
2 parents 77a0cfa + a652958 commit 8350142

40 files changed: +1825, −1223 lines
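To make the ring resizing feature described in the merge message concrete, here is a minimal userspace sketch of growing the CQ ring on a live ring. It assumes the IORING_REGISTER_RESIZE_RINGS opcode introduced by this series and that the argument is a struct io_uring_params carrying the desired entry counts; the exact parameter semantics, the role of IORING_SETUP_CQSIZE here, and whether the ring memory must be remapped afterwards are assumptions to be checked against the 6.13 uapi headers or liburing, not a verified API reference.

/*
 * Hedged sketch: grow the CQ ring at runtime via io_uring_register(2).
 * Opcode usage and field meanings are assumptions based on the merge
 * description above.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int resize_cq_ring(int ring_fd, unsigned sq_entries, unsigned cq_entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.sq_entries = sq_entries;	/* SQ size (assumed it must be re-specified) */
	p.cq_entries = cq_entries;	/* new, larger CQ size */
	p.flags = IORING_SETUP_CQSIZE;	/* assumed: marks cq_entries as valid */

	/* io_uring_register(2): (fd, opcode, arg, nr_args) */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_RESIZE_RINGS, &p, 1);
}

An application can then start with a small CQ ring and only pay for a larger one once it sees overflow; the resize_lock added in the io_uring_types.h diff below exists precisely to serialize such a resize against concurrent mmap of the ring.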

include/linux/io_uring/cmd.h

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
 
 static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
 {
-	return cmd_to_io_kiocb(cmd)->task;
+	return cmd_to_io_kiocb(cmd)->tctx->task;
 }
 
 #endif /* _LINUX_IO_URING_CMD_H */

include/linux/io_uring_types.h

Lines changed: 56 additions & 32 deletions
@@ -56,19 +56,18 @@ struct io_wq_work {
 	int cancel_seq;
 };
 
-struct io_fixed_file {
-	/* file * with additional FFS_* flags */
-	unsigned long file_ptr;
+struct io_rsrc_data {
+	unsigned int nr;
+	struct io_rsrc_node **nodes;
 };
 
 struct io_file_table {
-	struct io_fixed_file *files;
+	struct io_rsrc_data data;
 	unsigned long *bitmap;
 	unsigned int alloc_hint;
 };
 
 struct io_hash_bucket {
-	spinlock_t lock;
 	struct hlist_head list;
 } ____cacheline_aligned_in_smp;
 
@@ -77,6 +76,12 @@ struct io_hash_table {
 	unsigned hash_bits;
 };
 
+struct io_mapped_region {
+	struct page **pages;
+	void *vmap_ptr;
+	size_t nr_pages;
+};
+
 /*
  * Arbitrary limit, can be raised if need be
  */
@@ -86,6 +91,7 @@ struct io_uring_task {
 	/* submission side */
 	int cached_refs;
 	const struct io_ring_ctx *last;
+	struct task_struct *task;
 	struct io_wq *io_wq;
 	struct file *registered_rings[IO_RINGFD_REG_MAX];
 
@@ -271,7 +277,6 @@ struct io_ring_ctx {
 		 * Fixed resources fast path, should be accessed only under
 		 * uring_lock, and updated through io_uring_register(2)
 		 */
-		struct io_rsrc_node *rsrc_node;
 		atomic_t cancel_seq;
 
 		/*
@@ -284,15 +289,13 @@ struct io_ring_ctx {
 		struct io_wq_work_list iopoll_list;
 
 		struct io_file_table file_table;
-		struct io_mapped_ubuf **user_bufs;
-		unsigned nr_user_files;
-		unsigned nr_user_bufs;
+		struct io_rsrc_data buf_table;
 
 		struct io_submit_state submit_state;
 
 		struct xarray io_bl_xa;
 
-		struct io_hash_table cancel_table_locked;
+		struct io_hash_table cancel_table;
 		struct io_alloc_cache apoll_cache;
 		struct io_alloc_cache netmsg_cache;
 		struct io_alloc_cache rw_cache;
@@ -303,6 +306,11 @@ struct io_ring_ctx {
 		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
 		 */
 		struct hlist_head cancelable_uring_cmd;
+		/*
+		 * For Hybrid IOPOLL, runtime in hybrid polling, without
+		 * scheduling time
+		 */
+		u64 hybrid_poll_time;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
@@ -317,6 +325,9 @@ struct io_ring_ctx {
 		unsigned cq_entries;
 		struct io_ev_fd __rcu *io_ev_fd;
 		unsigned cq_extra;
+
+		void *cq_wait_arg;
+		size_t cq_wait_size;
 	} ____cacheline_aligned_in_smp;
 
 	/*
@@ -343,7 +354,6 @@ struct io_ring_ctx {
 
 	struct list_head io_buffers_comp;
 	struct list_head cq_overflow_list;
-	struct io_hash_table cancel_table;
 
 	struct hlist_head waitid_list;
 
@@ -367,16 +377,6 @@ struct io_ring_ctx {
 	struct wait_queue_head poll_wq;
 	struct io_restriction restrictions;
 
-	/* slow path rsrc auxilary data, used by update/register */
-	struct io_rsrc_data *file_data;
-	struct io_rsrc_data *buf_data;
-
-	/* protected by ->uring_lock */
-	struct list_head rsrc_ref_list;
-	struct io_alloc_cache rsrc_node_cache;
-	struct wait_queue_head rsrc_quiesce_wq;
-	unsigned rsrc_quiesce;
-
 	u32 pers_next;
 	struct xarray personalities;
 
@@ -410,14 +410,21 @@ struct io_ring_ctx {
 	/* napi busy poll default timeout */
 	ktime_t napi_busy_poll_dt;
 	bool napi_prefer_busy_poll;
-	bool napi_enabled;
+	u8 napi_track_mode;
 
 	DECLARE_HASHTABLE(napi_ht, 4);
 #endif
 
 	/* protected by ->completion_lock */
 	unsigned evfd_last_cq_tail;
 
+	/*
+	 * Protection for resize vs mmap races - both the mmap and resize
+	 * side will need to grab this lock, to prevent either side from
+	 * being run concurrently with the other.
+	 */
+	struct mutex resize_lock;
+
 	/*
 	 * If IORING_SETUP_NO_MMAP is used, then the below holds
 	 * the gup'ed pages for the two rings, and the sqes.
@@ -426,6 +433,9 @@ struct io_ring_ctx {
 	unsigned short n_sqe_pages;
 	struct page **ring_pages;
 	struct page **sqe_pages;
+
+	/* used for optimised request parameter and wait argument passing */
+	struct io_mapped_region param_region;
 };
 
 struct io_tw_state {
@@ -448,6 +458,7 @@ enum {
 	REQ_F_LINK_TIMEOUT_BIT,
 	REQ_F_NEED_CLEANUP_BIT,
 	REQ_F_POLLED_BIT,
+	REQ_F_HYBRID_IOPOLL_STATE_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_BUFFER_RING_BIT,
 	REQ_F_REISSUE_BIT,
@@ -460,7 +471,6 @@ enum {
 	REQ_F_DOUBLE_POLL_BIT,
 	REQ_F_APOLL_MULTISHOT_BIT,
 	REQ_F_CLEAR_POLLIN_BIT,
-	REQ_F_HASH_LOCKED_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
@@ -469,6 +479,7 @@ enum {
 	REQ_F_BL_EMPTY_BIT,
 	REQ_F_BL_NO_RECYCLE_BIT,
 	REQ_F_BUFFERS_COMMIT_BIT,
+	REQ_F_BUF_NODE_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -507,6 +518,8 @@ enum {
 	REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
 	/* already went through poll handler */
 	REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT),
+	/* every req only blocks once in hybrid poll */
+	REQ_F_IOPOLL_STATE = IO_REQ_FLAG(REQ_F_HYBRID_IOPOLL_STATE_BIT),
 	/* buffer already selected */
 	REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
 	/* buffer selected from ring, needs commit */
@@ -535,8 +548,6 @@ enum {
 	REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
 	/* recvmsg special flag, clear EPOLLIN */
 	REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
-	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
-	REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
 	/* don't use lazy poll wake for this request */
 	REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
 	/* file is pollable */
@@ -547,6 +558,8 @@ enum {
 	REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
 	/* buffer ring head needs incrementing on put */
 	REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
+	/* buf node is valid */
+	REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -616,12 +629,9 @@ struct io_kiocb {
 	struct io_cqe cqe;
 
 	struct io_ring_ctx *ctx;
-	struct task_struct *task;
+	struct io_uring_task *tctx;
 
 	union {
-		/* store used ubuf, so we can prevent reloading */
-		struct io_mapped_ubuf *imu;
-
 		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
 		struct io_buffer *kbuf;
 
@@ -630,6 +640,8 @@ struct io_kiocb {
 		 * REQ_F_BUFFER_RING is set.
 		 */
 		struct io_buffer_list *buf_list;
+
+		struct io_rsrc_node *buf_node;
 	};
 
 	union {
@@ -639,13 +651,20 @@ struct io_kiocb {
 		__poll_t apoll_events;
 	};
 
-	struct io_rsrc_node *rsrc_node;
+	struct io_rsrc_node *file_node;
 
 	atomic_t refs;
 	bool cancel_seq_set;
 	struct io_task_work io_task_work;
-	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
-	struct hlist_node hash_node;
+	union {
+		/*
+		 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
+		 * poll
+		 */
+		struct hlist_node hash_node;
+		/* For IOPOLL setup queues, with hybrid polling */
+		u64 iopoll_start;
+	};
 	/* internal polling, see IORING_FEAT_FAST_POLL */
 	struct async_poll *apoll;
 	/* opcode allocated if it needs to store data for async defer */
@@ -668,4 +687,9 @@ struct io_overflow_cqe {
 	struct io_uring_cqe cqe;
 };
 
+static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
+{
+	return ctx->flags & IORING_SETUP_CQE32;
+}
+
 #endif
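The new struct io_rsrc_data and the per-request file_node/buf_node pointers above replace the old per-ring rsrc_node plus the separate user_bufs and fixed-file tables. Purely as an illustration of that layout (the helper name below is hypothetical, not a kernel function), resolving a fixed-resource index now amounts to a bounds check and an array dereference into a table of individually refcounted nodes:

/*
 * Hypothetical sketch of the io_rsrc_data layout shown in the diff above;
 * demo_rsrc_lookup is not a real kernel helper.
 */
struct io_rsrc_node;			/* one refcounted node per registered resource */

struct io_rsrc_data {
	unsigned int nr;		/* number of slots in the table */
	struct io_rsrc_node **nodes;	/* array of per-resource nodes */
};

static inline struct io_rsrc_node *
demo_rsrc_lookup(struct io_rsrc_data *data, unsigned int index)
{
	if (index >= data->nr)
		return NULL;
	/* a slot may be empty (NULL); callers must check */
	return data->nodes[index];
}

Because each node carries its own lifetime, one fixed file stuck behind slow IO no longer delays putting and freeing unrelated entries, which is the serialization problem the merge message calls out.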

include/trace/events/io_uring.h

Lines changed: 9 additions & 15 deletions
@@ -315,20 +315,14 @@ TRACE_EVENT(io_uring_fail_link,
  * io_uring_complete - called when completing an SQE
  *
  * @ctx: pointer to a ring context structure
- * @req: pointer to a submitted request
- * @user_data: user data associated with the request
- * @res: result of the request
- * @cflags: completion flags
- * @extra1: extra 64-bit data for CQE32
- * @extra2: extra 64-bit data for CQE32
- *
+ * @req: (optional) pointer to a submitted request
+ * @cqe: pointer to the filled in CQE being posted
  */
 TRACE_EVENT(io_uring_complete,
 
-	TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags,
-		 u64 extra1, u64 extra2),
+	TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe),
 
-	TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2),
+	TP_ARGS(ctx, req, cqe),
 
 	TP_STRUCT__entry (
 		__field( void *, ctx )
@@ -343,11 +337,11 @@ TRACE_EVENT(io_uring_complete,
 	TP_fast_assign(
 		__entry->ctx = ctx;
 		__entry->req = req;
-		__entry->user_data = user_data;
-		__entry->res = res;
-		__entry->cflags = cflags;
-		__entry->extra1 = extra1;
-		__entry->extra2 = extra2;
+		__entry->user_data = cqe->user_data;
+		__entry->res = cqe->res;
+		__entry->cflags = cqe->flags;
+		__entry->extra1 = io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0;
+		__entry->extra2 = io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0;
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
