Skip to content

Commit 83d6c39

Browse files
committed
io-wq: fix race between worker exiting and activating free worker
Nadav correctly reports that we have a race between a worker exiting and new work being queued. This can lead to work being queued behind an existing worker that could be sleeping on an event before it can run to completion, and hence introducing potential big latency gaps if we hit this race condition:

cpu0                                    cpu1
----                                    ----
io_wqe_worker()
schedule_timeout()
 // timed out
                                        io_wqe_enqueue()
                                        io_wqe_wake_worker()
                                        // work_flags & IO_WQ_WORK_CONCURRENT
                                        io_wqe_activate_free_worker()
io_worker_exit()

Fix this by having the exiting worker go through the normal decrement of a running worker, which will spawn a new one if needed. The free worker activation is modified to only return success if we were able to find a sleeping worker — if not, we keep looking through the list. If we fail, we create a new worker as per usual.

Cc: [email protected]
Link: https://lore.kernel.org/io-uring/[email protected]/
Reported-by: Nadav Amit <[email protected]>
Tested-by: Nadav Amit <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
1 parent a890d01 commit 83d6c39

File tree

1 file changed

+19
-19
lines changed

1 file changed

+19
-19
lines changed

fs/io-wq.c

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ struct io_cb_cancel_data {
130130
};
131131

132132
static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
133+
static void io_wqe_dec_running(struct io_worker *worker);
133134

134135
static bool io_worker_get(struct io_worker *worker)
135136
{
@@ -168,26 +169,21 @@ static void io_worker_exit(struct io_worker *worker)
168169
{
169170
struct io_wqe *wqe = worker->wqe;
170171
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
171-
unsigned flags;
172172

173173
if (refcount_dec_and_test(&worker->ref))
174174
complete(&worker->ref_done);
175175
wait_for_completion(&worker->ref_done);
176176

177-
preempt_disable();
178-
current->flags &= ~PF_IO_WORKER;
179-
flags = worker->flags;
180-
worker->flags = 0;
181-
if (flags & IO_WORKER_F_RUNNING)
182-
atomic_dec(&acct->nr_running);
183-
worker->flags = 0;
184-
preempt_enable();
185-
186177
raw_spin_lock_irq(&wqe->lock);
187-
if (flags & IO_WORKER_F_FREE)
178+
if (worker->flags & IO_WORKER_F_FREE)
188179
hlist_nulls_del_rcu(&worker->nulls_node);
189180
list_del_rcu(&worker->all_list);
190181
acct->nr_workers--;
182+
preempt_disable();
183+
io_wqe_dec_running(worker);
184+
worker->flags = 0;
185+
current->flags &= ~PF_IO_WORKER;
186+
preempt_enable();
191187
raw_spin_unlock_irq(&wqe->lock);
192188

193189
kfree_rcu(worker, rcu);
@@ -214,15 +210,19 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
214210
struct hlist_nulls_node *n;
215211
struct io_worker *worker;
216212

217-
n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
218-
if (is_a_nulls(n))
219-
return false;
220-
221-
worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
222-
if (io_worker_get(worker)) {
223-
wake_up_process(worker->task);
213+
/*
214+
* Iterate free_list and see if we can find an idle worker to
215+
* activate. If a given worker is on the free_list but in the process
216+
* of exiting, keep trying.
217+
*/
218+
hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
219+
if (!io_worker_get(worker))
220+
continue;
221+
if (wake_up_process(worker->task)) {
222+
io_worker_release(worker);
223+
return true;
224+
}
224225
io_worker_release(worker);
225-
return true;
226226
}
227227

228228
return false;

0 commit comments

Comments
 (0)