Skip to content

Commit 5b9a7bb

Browse files
committed
Merge tag 'for-6.4/io_uring-2023-04-21' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: - Cleanup of the io-wq per-node mapping, notably getting rid of it so we just have a single io_wq entry per ring (Breno) - Followup to the above, move accounting to io_wq as well and completely drop struct io_wqe (Gabriel) - Enable KASAN for the internal io_uring caches (Breno) - Add support for multishot timeouts. Some applications use timeouts to wake someone waiting on completion entries, and this makes it a bit easier to just have a recurring timer rather than needing to rearm it every time (David) - Support archs that have shared cache coloring between userspace and the kernel, and hence have strict address requirements for mmap'ing the ring into userspace. This should only be parisc/hppa. (Helge, me) - XFS has supported O_DIRECT writes without needing to lock the inode exclusively for a long time, and ext4 now supports it as well. This is true for the common cases of not extending the file size. Flag the fs as having that feature, and utilize that to avoid serializing those writes in io_uring (me) - Enable completion batching for uring commands (me) - Revert patch adding io_uring restriction to what can be GUP mapped or not. This does not belong in io_uring, as io_uring isn't really special in this regard. Since this is also getting in the way of cleanups and improvements to the GUP code, get rid of it (me) - A few series greatly reducing the complexity of registered resources, like buffers or files. 
Not only does this clean up the code a lot, the simplified code is also a LOT more efficient (Pavel) - Series optimizing how we wait for events and run task_work related to it (Pavel) - Fixes for file/buffer unregistration with DEFER_TASKRUN (Pavel) - Misc cleanups and improvements (Pavel, me) * tag 'for-6.4/io_uring-2023-04-21' of git://git.kernel.dk/linux: (71 commits) Revert "io_uring/rsrc: disallow multi-source reg buffers" io_uring: add support for multishot timeouts io_uring/rsrc: disassociate nodes and rsrc_data io_uring/rsrc: devirtualise rsrc put callbacks io_uring/rsrc: pass node to io_rsrc_put_work() io_uring/rsrc: inline io_rsrc_put_work() io_uring/rsrc: add empty flag in rsrc_node io_uring/rsrc: merge nodes and io_rsrc_put io_uring/rsrc: infer node from ctx on io_queue_rsrc_removal io_uring/rsrc: remove unused io_rsrc_node::llist io_uring/rsrc: refactor io_queue_rsrc_removal io_uring/rsrc: simplify single file node switching io_uring/rsrc: clean up __io_sqe_buffers_update() io_uring/rsrc: inline switch_start fast path io_uring/rsrc: remove rsrc_data refs io_uring/rsrc: fix DEFER_TASKRUN rsrc quiesce io_uring/rsrc: use wq for quiescing io_uring/rsrc: refactor io_rsrc_ref_quiesce io_uring/rsrc: remove io_rsrc_node::done io_uring/rsrc: use nospec'ed indexes ...
2 parents 5c7ecad + 3c85cc4 commit 5b9a7bb

File tree

22 files changed

+949
-847
lines changed

22 files changed

+949
-847
lines changed

fs/ext4/file.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -899,7 +899,8 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
899899
return ret;
900900
}
901901

902-
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
902+
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC |
903+
FMODE_DIO_PARALLEL_WRITE;
903904
return dquot_file_open(inode, filp);
904905
}
905906

fs/xfs/xfs_file.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1171,7 +1171,8 @@ xfs_file_open(
11711171
{
11721172
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
11731173
return -EIO;
1174-
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
1174+
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
1175+
FMODE_DIO_PARALLEL_WRITE;
11751176
return generic_file_open(inode, file);
11761177
}
11771178

include/linux/fs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
168168

169169
#define FMODE_NOREUSE ((__force fmode_t)0x800000)
170170

171+
/* File supports non-exclusive O_DIRECT writes from multiple threads */
172+
#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000)
173+
171174
/* File was opened by fanotify and shouldn't generate fanotify events */
172175
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
173176

include/linux/io_uring_types.h

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,10 @@ struct io_ev_fd {
188188
};
189189

190190
struct io_alloc_cache {
191-
struct hlist_head list;
191+
struct io_wq_work_node list;
192192
unsigned int nr_cached;
193+
unsigned int max_cached;
194+
size_t elem_size;
193195
};
194196

195197
struct io_ring_ctx {
@@ -239,7 +241,6 @@ struct io_ring_ctx {
239241
* uring_lock, and updated through io_uring_register(2)
240242
*/
241243
struct io_rsrc_node *rsrc_node;
242-
int rsrc_cached_refs;
243244
atomic_t cancel_seq;
244245
struct io_file_table file_table;
245246
unsigned nr_user_files;
@@ -295,7 +296,7 @@ struct io_ring_ctx {
295296
spinlock_t completion_lock;
296297

297298
bool poll_multi_queue;
298-
bool cq_waiting;
299+
atomic_t cq_wait_nr;
299300

300301
/*
301302
* ->iopoll_list is protected by the ctx->uring_lock for
@@ -325,16 +326,15 @@ struct io_ring_ctx {
325326
struct io_restriction restrictions;
326327

327328
/* slow path rsrc auxilary data, used by update/register */
328-
struct io_rsrc_node *rsrc_backup_node;
329329
struct io_mapped_ubuf *dummy_ubuf;
330330
struct io_rsrc_data *file_data;
331331
struct io_rsrc_data *buf_data;
332332

333-
struct delayed_work rsrc_put_work;
334-
struct callback_head rsrc_put_tw;
335-
struct llist_head rsrc_put_llist;
333+
/* protected by ->uring_lock */
336334
struct list_head rsrc_ref_list;
337-
spinlock_t rsrc_ref_lock;
335+
struct io_alloc_cache rsrc_node_cache;
336+
struct wait_queue_head rsrc_quiesce_wq;
337+
unsigned rsrc_quiesce;
338338

339339
struct list_head io_buffers_pages;
340340

@@ -366,6 +366,11 @@ struct io_ring_ctx {
366366
unsigned evfd_last_cq_tail;
367367
};
368368

369+
struct io_tw_state {
370+
/* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */
371+
bool locked;
372+
};
373+
369374
enum {
370375
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
371376
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
@@ -472,7 +477,7 @@ enum {
472477
REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
473478
};
474479

475-
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
480+
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
476481

477482
struct io_task_work {
478483
struct llist_node node;
@@ -562,6 +567,7 @@ struct io_kiocb {
562567
atomic_t refs;
563568
atomic_t poll_refs;
564569
struct io_task_work io_task_work;
570+
unsigned nr_tw;
565571
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
566572
union {
567573
struct hlist_node hash_node;

include/trace/events/io_uring.h

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -360,27 +360,25 @@ TRACE_EVENT(io_uring_complete,
360360
);
361361

362362
/**
363-
* io_uring_submit_sqe - called before submitting one SQE
363+
* io_uring_submit_req - called before submitting a request
364364
*
365365
* @req: pointer to a submitted request
366-
* @force_nonblock: whether a context blocking or not
367366
*
368367
* Allows to track SQE submitting, to understand what was the source of it, SQ
369368
* thread or io_uring_enter call.
370369
*/
371-
TRACE_EVENT(io_uring_submit_sqe,
370+
TRACE_EVENT(io_uring_submit_req,
372371

373-
TP_PROTO(struct io_kiocb *req, bool force_nonblock),
372+
TP_PROTO(struct io_kiocb *req),
374373

375-
TP_ARGS(req, force_nonblock),
374+
TP_ARGS(req),
376375

377376
TP_STRUCT__entry (
378377
__field( void *, ctx )
379378
__field( void *, req )
380379
__field( unsigned long long, user_data )
381380
__field( u8, opcode )
382381
__field( u32, flags )
383-
__field( bool, force_nonblock )
384382
__field( bool, sq_thread )
385383

386384
__string( op_str, io_uring_get_opcode(req->opcode) )
@@ -392,16 +390,15 @@ TRACE_EVENT(io_uring_submit_sqe,
392390
__entry->user_data = req->cqe.user_data;
393391
__entry->opcode = req->opcode;
394392
__entry->flags = req->flags;
395-
__entry->force_nonblock = force_nonblock;
396393
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
397394

398395
__assign_str(op_str, io_uring_get_opcode(req->opcode));
399396
),
400397

401398
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
402-
"non block %d, sq_thread %d", __entry->ctx, __entry->req,
399+
"sq_thread %d", __entry->ctx, __entry->req,
403400
__entry->user_data, __get_str(op_str),
404-
__entry->flags, __entry->force_nonblock, __entry->sq_thread)
401+
__entry->flags, __entry->sq_thread)
405402
);
406403

407404
/*

include/uapi/linux/io_uring.h

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ enum io_uring_op {
250250
#define IORING_TIMEOUT_REALTIME (1U << 3)
251251
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
252252
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
253+
#define IORING_TIMEOUT_MULTISHOT (1U << 6)
253254
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
254255
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
255256
/*
@@ -389,6 +390,9 @@ enum {
389390
#define IORING_OFF_SQ_RING 0ULL
390391
#define IORING_OFF_CQ_RING 0x8000000ULL
391392
#define IORING_OFF_SQES 0x10000000ULL
393+
#define IORING_OFF_PBUF_RING 0x80000000ULL
394+
#define IORING_OFF_PBUF_SHIFT 16
395+
#define IORING_OFF_MMAP_MASK 0xf8000000ULL
392396

393397
/*
394398
* Filled with the offset for mmap(2)
@@ -568,19 +572,6 @@ struct io_uring_rsrc_update2 {
568572
__u32 resv2;
569573
};
570574

571-
struct io_uring_notification_slot {
572-
__u64 tag;
573-
__u64 resv[3];
574-
};
575-
576-
struct io_uring_notification_register {
577-
__u32 nr_slots;
578-
__u32 resv;
579-
__u64 resv2;
580-
__u64 data;
581-
__u64 resv3;
582-
};
583-
584575
/* Skip updating fd indexes set to this value in the fd table */
585576
#define IORING_REGISTER_FILES_SKIP (-2)
586577

@@ -635,12 +626,26 @@ struct io_uring_buf_ring {
635626
};
636627
};
637628

629+
/*
630+
* Flags for IORING_REGISTER_PBUF_RING.
631+
*
632+
* IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring.
633+
* The application must not set a ring_addr in struct
634+
* io_uring_buf_reg, instead it must subsequently call
635+
* mmap(2) with the offset set as:
636+
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
637+
* to get a virtual mapping for the ring.
638+
*/
639+
enum {
640+
IOU_PBUF_RING_MMAP = 1,
641+
};
642+
638643
/* argument for IORING_(UN)REGISTER_PBUF_RING */
639644
struct io_uring_buf_reg {
640645
__u64 ring_addr;
641646
__u32 ring_entries;
642647
__u16 bgid;
643-
__u16 pad;
648+
__u16 flags;
644649
__u64 resv[3];
645650
};
646651

io_uring/alloc_cache.h

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,47 +7,60 @@
77
#define IO_ALLOC_CACHE_MAX 512
88

99
struct io_cache_entry {
10-
struct hlist_node node;
10+
struct io_wq_work_node node;
1111
};
1212

1313
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
1414
struct io_cache_entry *entry)
1515
{
16-
if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
16+
if (cache->nr_cached < cache->max_cached) {
1717
cache->nr_cached++;
18-
hlist_add_head(&entry->node, &cache->list);
18+
wq_stack_add_head(&entry->node, &cache->list);
19+
/* KASAN poisons object */
20+
kasan_slab_free_mempool(entry);
1921
return true;
2022
}
2123
return false;
2224
}
2325

26+
static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache)
27+
{
28+
return !cache->list.next;
29+
}
30+
2431
static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
2532
{
26-
if (!hlist_empty(&cache->list)) {
27-
struct hlist_node *node = cache->list.first;
33+
if (cache->list.next) {
34+
struct io_cache_entry *entry;
2835

29-
hlist_del(node);
36+
entry = container_of(cache->list.next, struct io_cache_entry, node);
37+
kasan_unpoison_range(entry, cache->elem_size);
38+
cache->list.next = cache->list.next->next;
3039
cache->nr_cached--;
31-
return container_of(node, struct io_cache_entry, node);
40+
return entry;
3241
}
3342

3443
return NULL;
3544
}
3645

37-
static inline void io_alloc_cache_init(struct io_alloc_cache *cache)
46+
static inline void io_alloc_cache_init(struct io_alloc_cache *cache,
47+
unsigned max_nr, size_t size)
3848
{
39-
INIT_HLIST_HEAD(&cache->list);
49+
cache->list.next = NULL;
4050
cache->nr_cached = 0;
51+
cache->max_cached = max_nr;
52+
cache->elem_size = size;
4153
}
4254

4355
static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
4456
void (*free)(struct io_cache_entry *))
4557
{
46-
while (!hlist_empty(&cache->list)) {
47-
struct hlist_node *node = cache->list.first;
58+
while (1) {
59+
struct io_cache_entry *entry = io_alloc_cache_get(cache);
4860

49-
hlist_del(node);
50-
free(container_of(node, struct io_cache_entry, node));
61+
if (!entry)
62+
break;
63+
free(entry);
5164
}
5265
cache->nr_cached = 0;
5366
}

io_uring/filetable.c

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
6464
u32 slot_index)
6565
__must_hold(&req->ctx->uring_lock)
6666
{
67-
bool needs_switch = false;
6867
struct io_fixed_file *file_slot;
6968
int ret;
7069

@@ -81,18 +80,13 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
8180
if (file_slot->file_ptr) {
8281
struct file *old_file;
8382

84-
ret = io_rsrc_node_switch_start(ctx);
85-
if (ret)
86-
goto err;
87-
8883
old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
89-
ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
90-
ctx->rsrc_node, old_file);
84+
ret = io_queue_rsrc_removal(ctx->file_data, slot_index, old_file);
9185
if (ret)
92-
goto err;
86+
return ret;
87+
9388
file_slot->file_ptr = 0;
9489
io_file_bitmap_clear(&ctx->file_table, slot_index);
95-
needs_switch = true;
9690
}
9791

9892
ret = io_scm_file_account(ctx, file);
@@ -101,9 +95,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
10195
io_fixed_file_set(file_slot, file);
10296
io_file_bitmap_set(&ctx->file_table, slot_index);
10397
}
104-
err:
105-
if (needs_switch)
106-
io_rsrc_node_switch(ctx, ctx->file_data);
10798
return ret;
10899
}
109100

@@ -156,23 +147,19 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
156147
return -ENXIO;
157148
if (offset >= ctx->nr_user_files)
158149
return -EINVAL;
159-
ret = io_rsrc_node_switch_start(ctx);
160-
if (ret)
161-
return ret;
162150

163151
offset = array_index_nospec(offset, ctx->nr_user_files);
164152
file_slot = io_fixed_file_slot(&ctx->file_table, offset);
165153
if (!file_slot->file_ptr)
166154
return -EBADF;
167155

168156
file = (struct file *)(file_slot->file_ptr & FFS_MASK);
169-
ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
157+
ret = io_queue_rsrc_removal(ctx->file_data, offset, file);
170158
if (ret)
171159
return ret;
172160

173161
file_slot->file_ptr = 0;
174162
io_file_bitmap_clear(&ctx->file_table, offset);
175-
io_rsrc_node_switch(ctx, ctx->file_data);
176163
return 0;
177164
}
178165

0 commit comments

Comments
 (0)