@@ -540,7 +540,6 @@ enum {
         REQ_F_ISREG_BIT,
         REQ_F_COMP_LOCKED_BIT,
         REQ_F_NEED_CLEANUP_BIT,
-        REQ_F_OVERFLOW_BIT,
         REQ_F_POLLED_BIT,
         REQ_F_BUFFER_SELECTED_BIT,
         REQ_F_NO_FILE_TABLE_BIT,
@@ -583,8 +582,6 @@ enum {
         REQ_F_COMP_LOCKED       = BIT(REQ_F_COMP_LOCKED_BIT),
         /* needs cleanup */
         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
-        /* in overflow list */
-        REQ_F_OVERFLOW          = BIT(REQ_F_OVERFLOW_BIT),
         /* already went through poll handler */
         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
         /* buffer already selected */
@@ -946,7 +943,8 @@ static void io_get_req_task(struct io_kiocb *req)
 
 static inline void io_clean_op(struct io_kiocb *req)
 {
-        if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
+        if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
+                          REQ_F_INFLIGHT))
                 __io_clean_op(req);
 }
 
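This hunk widens the flag mask that gates the out-of-line cleanup: io_clean_op() stays a cheap inline check, and __io_clean_op() only runs when one of the cleanup-related flags is set. Adding REQ_F_INFLIGHT to the mask is what lets the inflight-list teardown move from io_dismantle_req() into __io_clean_op() later in this patch. A minimal standalone illustration of that inline-guard pattern (hypothetical names, plain userspace C, not io_uring code):

#include <stdio.h>

#define F_NEEDS_CLEANUP (1u << 0)
#define F_BUFFER_SEL    (1u << 1)
#define F_INFLIGHT      (1u << 2)       /* the flag newly added to the mask */

struct obj { unsigned int flags; };

static void clean_slow(struct obj *o)           /* stands in for __io_clean_op() */
{
        printf("slow-path cleanup, flags=%#x\n", o->flags);
        o->flags = 0;
}

static inline void clean_fast(struct obj *o)    /* stands in for io_clean_op() */
{
        if (o->flags & (F_NEEDS_CLEANUP | F_BUFFER_SEL | F_INFLIGHT))
                clean_slow(o);
}

int main(void)
{
        struct obj a = { .flags = F_INFLIGHT };
        struct obj b = { .flags = 0 };

        clean_fast(&a);         /* takes the slow path */
        clean_fast(&b);         /* no flags set, stays a no-op */
        return 0;
}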
@@ -1366,7 +1364,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
                 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
                                        compl.list);
                 list_move(&req->compl.list, &list);
-                req->flags &= ~REQ_F_OVERFLOW;
                 if (cqe) {
                         WRITE_ONCE(cqe->user_data, req->user_data);
                         WRITE_ONCE(cqe->res, req->result);
@@ -1419,7 +1416,6 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
                         ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
                 }
                 io_clean_op(req);
-                req->flags |= REQ_F_OVERFLOW;
                 req->result = res;
                 req->compl.cflags = cflags;
                 refcount_inc(&req->refs);
@@ -1563,17 +1559,6 @@ static bool io_dismantle_req(struct io_kiocb *req)
         if (req->file)
                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 
-        if (req->flags & REQ_F_INFLIGHT) {
-                struct io_ring_ctx *ctx = req->ctx;
-                unsigned long flags;
-
-                spin_lock_irqsave(&ctx->inflight_lock, flags);
-                list_del(&req->inflight_entry);
-                if (waitqueue_active(&ctx->inflight_wait))
-                        wake_up(&ctx->inflight_wait);
-                spin_unlock_irqrestore(&ctx->inflight_lock, flags);
-        }
-
         return io_req_clean_work(req);
 }
 
@@ -2819,22 +2804,15 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
         return __io_iov_buffer_select(req, iov, needs_lock);
 }
 
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
-                               struct iovec **iovec, struct iov_iter *iter,
-                               bool needs_lock)
+static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
-                                 struct iovec **iovec, struct iov_iter *iter,
+                                 bool needs_lock)
 {
         void __user *buf = u64_to_user_ptr(req->rw.addr);
         size_t sqe_len = req->rw.len;
         ssize_t ret;
         u8 opcode;
 
-        if (req->io) {
-                struct io_async_rw *iorw = &req->io->rw;
-
-                *iovec = NULL;
-                return iov_iter_count(&iorw->iter);
-        }
-
         opcode = req->opcode;
         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
                 *iovec = NULL;
@@ -2848,10 +2826,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
                 if (req->flags & REQ_F_BUFFER_SELECT) {
                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
-                        if (IS_ERR(buf)) {
-                                *iovec = NULL;
+                        if (IS_ERR(buf))
                                 return PTR_ERR(buf);
-                        }
                         req->rw.len = sqe_len;
                 }
 
@@ -2879,6 +2855,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
 }
 
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+                               struct iovec **iovec, struct iov_iter *iter,
+                               bool needs_lock)
+{
+        if (!req->io)
+                return __io_import_iovec(rw, req, iovec, iter, needs_lock);
+        *iovec = NULL;
+        return iov_iter_count(&req->io->rw.iter);
+}
+
 /*
  * For files that don't have ->read_iter() and ->write_iter(), handle them
  * by looping over ->read() or ->write() manually.
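The surrounding hunks split the iovec import in two: __io_import_iovec() always does the parsing work, while the new io_import_iovec() wrapper short-circuits when a prepared async context (req->io) already holds the imported state, replacing the special case that used to sit inside the importer (and that io_rw_prep_async() had to dodge by temporarily clearing req->io). A rough standalone sketch of that cached-fast-path wrapper shape (hypothetical names and types, not the real io_uring structures):

#include <stdio.h>
#include <sys/types.h>

struct async_ctx { ssize_t cached_len; };       /* stands in for req->io->rw */
struct request { struct async_ctx *io; ssize_t len; };

static ssize_t __do_import(struct request *req) /* ~ __io_import_iovec() */
{
        /* pretend to build the iovec/iter from the request itself */
        return req->len;
}

static ssize_t do_import(struct request *req)   /* ~ io_import_iovec() */
{
        if (!req->io)
                return __do_import(req);
        /* already imported during prep: reuse the cached byte count */
        return req->io->cached_len;
}

int main(void)
{
        struct async_ctx cached = { .cached_len = 4096 };
        struct request fresh = { .io = NULL, .len = 512 };
        struct request prepped = { .io = &cached, .len = 512 };

        printf("%zd %zd\n", do_import(&fresh), do_import(&prepped));
        return 0;
}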
@@ -3001,11 +2987,8 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
         ssize_t ret;
 
         iorw->iter.iov = iorw->fast_iov;
-        /* reset ->io around the iovec import, we don't want to use it */
-        req->io = NULL;
-        ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov,
+        ret = __io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov,
                                 &iorw->iter, !force_nonblock);
-        req->io = container_of(iorw, struct io_async_ctx, rw);
         if (unlikely(ret < 0))
                 return ret;
 
@@ -3074,27 +3057,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
         return 1;
 }
 
-static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
-                                             struct wait_page_queue *wait,
-                                             wait_queue_func_t func,
-                                             void *data)
-{
-        /* Can't support async wakeup with polled IO */
-        if (kiocb->ki_flags & IOCB_HIPRI)
-                return -EINVAL;
-        if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) {
-                wait->wait.func = func;
-                wait->wait.private = data;
-                wait->wait.flags = 0;
-                INIT_LIST_HEAD(&wait->wait.entry);
-                kiocb->ki_flags |= IOCB_WAITQ;
-                kiocb->ki_waitq = wait;
-                return 0;
-        }
-
-        return -EOPNOTSUPP;
-}
-
 /*
  * This controls whether a given IO request should be armed for async page
  * based retry. If we return false here, the request is handed to the async
@@ -3109,31 +3071,33 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
  */
 static bool io_rw_should_retry(struct io_kiocb *req)
 {
+        struct wait_page_queue *wait = &req->io->rw.wpq;
         struct kiocb *kiocb = &req->rw.kiocb;
-        int ret;
 
         /* never retry for NOWAIT, we just complete with -EAGAIN */
         if (req->flags & REQ_F_NOWAIT)
                 return false;
 
         /* Only for buffered IO */
-        if (kiocb->ki_flags & IOCB_DIRECT)
+        if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
                 return false;
+
         /*
          * just use poll if we can, and don't attempt if the fs doesn't
          * support callback based unlocks
          */
         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
                 return false;
 
-        ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
-                                         io_async_buf_func, req);
-        if (!ret) {
-                io_get_req_task(req);
-                return true;
-        }
+        wait->wait.func = io_async_buf_func;
+        wait->wait.private = req;
+        wait->wait.flags = 0;
+        INIT_LIST_HEAD(&wait->wait.entry);
+        kiocb->ki_flags |= IOCB_WAITQ;
+        kiocb->ki_waitq = wait;
 
-        return false;
+        io_get_req_task(req);
+        return true;
 }
 
 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
@@ -3238,6 +3202,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
                 kiocb_done(kiocb, ret, cs);
                 ret = 0;
 out_free:
+        /* it's reportedly faster than delegating the null check to kfree() */
         if (iovec)
                 kfree(iovec);
         return ret;
@@ -3334,6 +3299,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
                 return -EAGAIN;
         }
 out_free:
+        /* it's reportedly faster than delegating the null check to kfree() */
         if (iovec)
                 kfree(iovec);
         return ret;
@@ -5653,6 +5619,18 @@ static void __io_clean_op(struct io_kiocb *req)
                 }
                 req->flags &= ~REQ_F_NEED_CLEANUP;
         }
+
+        if (req->flags & REQ_F_INFLIGHT) {
+                struct io_ring_ctx *ctx = req->ctx;
+                unsigned long flags;
+
+                spin_lock_irqsave(&ctx->inflight_lock, flags);
+                list_del(&req->inflight_entry);
+                if (waitqueue_active(&ctx->inflight_wait))
+                        wake_up(&ctx->inflight_wait);
+                spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+                req->flags &= ~REQ_F_INFLIGHT;
+        }
 }
 
 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
@@ -7979,7 +7957,13 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
                              ACCT_LOCKED);
 
         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
-        queue_work(system_wq, &ctx->exit_work);
+        /*
+         * Use system_unbound_wq to avoid spawning tons of event kworkers
+         * if we're exiting a ton of rings at the same time. It just adds
+         * noise and overhead, there's no discernable change in runtime
+         * over using system_wq.
+         */
+        queue_work(system_unbound_wq, &ctx->exit_work);
 }
 
 static int io_uring_release(struct inode *inode, struct file *file)
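The new comment above carries the rationale: ring teardown is already deferred to a work item, and queueing it on system_unbound_wq keeps a burst of exiting rings from spawning piles of per-CPU event kworkers. The general deferred-teardown pattern it builds on, sketched with hypothetical names (a guess at a minimal in-kernel usage, not io_uring's actual code):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_ctx {
        struct work_struct exit_work;
        /* ... resources that need process context to tear down ... */
};

static void my_exit_work(struct work_struct *work)
{
        struct my_ctx *ctx = container_of(work, struct my_ctx, exit_work);

        /* potentially blocking teardown runs here, off the caller's context */
        kfree(ctx);
}

static void my_ctx_kill(struct my_ctx *ctx)
{
        INIT_WORK(&ctx->exit_work, my_exit_work);
        /* unbound: not tied to the per-CPU kworker pools used by system_wq */
        queue_work(system_unbound_wq, &ctx->exit_work);
}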
@@ -8063,6 +8047,33 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
         return found;
 }
 
+static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
+{
+        return io_match_link(container_of(work, struct io_kiocb, work), data);
+}
+
+static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+        enum io_wq_cancel cret;
+
+        /* cancel this particular work, if it's running */
+        cret = io_wq_cancel_work(ctx->io_wq, &req->work);
+        if (cret != IO_WQ_CANCEL_NOTFOUND)
+                return;
+
+        /* find links that hold this pending, cancel those */
+        cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
+        if (cret != IO_WQ_CANCEL_NOTFOUND)
+                return;
+
+        /* if we have a poll link holding this pending, cancel that */
+        if (io_poll_remove_link(ctx, req))
+                return;
+
+        /* final option, timeout link is holding this req pending */
+        io_timeout_remove_link(ctx, req);
+}
+
 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
                                   struct files_struct *files)
 {
@@ -8094,35 +8105,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
                 /* We need to keep going until we don't find a matching req */
                 if (!cancel_req)
                         break;
-
-                if (cancel_req->flags & REQ_F_OVERFLOW) {
-                        spin_lock_irq(&ctx->completion_lock);
-                        list_del(&cancel_req->compl.list);
-                        cancel_req->flags &= ~REQ_F_OVERFLOW;
-
-                        io_cqring_mark_overflow(ctx);
-                        WRITE_ONCE(ctx->rings->cq_overflow,
-                                   atomic_inc_return(&ctx->cached_cq_overflow));
-                        io_commit_cqring(ctx);
-                        spin_unlock_irq(&ctx->completion_lock);
-
-                        /*
-                         * Put inflight ref and overflow ref. If that's
-                         * all we had, then we're done with this request.
-                         */
-                        if (refcount_sub_and_test(2, &cancel_req->refs)) {
-                                io_free_req(cancel_req);
-                                finish_wait(&ctx->inflight_wait, &wait);
-                                continue;
-                        }
-                } else {
-                        io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
-                        /* could be a link, check and remove if it is */
-                        if (!io_poll_remove_link(ctx, cancel_req))
-                                io_timeout_remove_link(ctx, cancel_req);
-                        io_put_req(cancel_req);
-                }
-
+                /* cancel this request, or head link requests */
+                io_attempt_cancel(ctx, cancel_req);
+                io_put_req(cancel_req);
                 schedule();
                 finish_wait(&ctx->inflight_wait, &wait);
         }