
Commit e9e64f8

Merge branch 'for-5.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq
Pull workqueue updates from Tejun Heo:

 - The code around workqueue scheduler hooks got reorganized in early 2019,
   which unfortunately introduced a couple of subtle and rare race conditions
   where preemption can mangle internal workqueue state, triggering a WARN
   and possibly causing a stall or at least a delay in execution. Frederic
   fixed both in early December and the fixes were sitting in for-5.16-fixes,
   which I forgot to push. They are here now. I'll forward them to stable
   after they land.

 - The scheduler hook reorganization has more implications for workqueue
   code in that the hooks are now more strictly synchronized and thus the
   interacting operations can become more straightforward. Lai is in the
   process of simplifying workqueue code and this pull request contains some
   of the patches.

* 'for-5.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
  workqueue: Remove the cacheline_aligned for nr_running
  workqueue: Move the code of waking a worker up in unbind_workers()
  workqueue: Remove schedule() in unbind_workers()
  workqueue: Remove outdated comment about exceptional workers in unbind_workers()
  workqueue: Remove the advanced kicking of the idle workers in rebind_workers()
  workqueue: Remove the outdated comment before wq_worker_sleeping()
  workqueue: Fix unbind_workers() VS wq_worker_sleeping() race
  workqueue: Fix unbind_workers() VS wq_worker_running() race
  workqueue: Upgrade queue_work_on() comment
2 parents ea1ca66 + 2a8ab0f commit e9e64f8
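
A condensed sketch of the first race described above, pieced together from the
wq_worker_running() hunk in the diff below, may help orient the reader. The
prologue (kthread_data()) and the surrounding kernel context are assumed and
not part of this merge; the fragment is illustrative rather than a complete
copy of the function.

    /*
     * Wakeup scheduler hook, as fixed by this merge. Without the
     * preempt_disable()/preempt_enable() pair, the worker could be preempted
     * between the WORKER_NOT_RUNNING test and the atomic_inc(); if
     * unbind_workers() runs in that window, sets WORKER_UNBOUND and zaps
     * pool->nr_running, the late increment leaves nr_running == 1 on a pool
     * that expects it to be zero.
     */
    void wq_worker_running(struct task_struct *task)
    {
            struct worker *worker = kthread_data(task);  /* assumed prologue */

            if (!worker->sleeping)
                    return;

            preempt_disable();              /* close the race window */
            if (!(worker->flags & WORKER_NOT_RUNNING))
                    atomic_inc(&worker->pool->nr_running);
            preempt_enable();
            worker->sleeping = 0;
    }

The sleeping side gets the mirror-image treatment in the diff: once pool->lock
is held, wq_worker_sleeping() rechecks WORKER_NOT_RUNNING so a worker that was
unbound while preempted does not decrement nr_running after it has been reset.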

File tree: 1 file changed, +45 -56 lines

kernel/workqueue.c

Lines changed: 45 additions & 56 deletions
@@ -154,6 +154,9 @@ struct worker_pool {
 
         unsigned long watchdog_ts;      /* L: watchdog timestamp */
 
+        /* The current concurrency level. */
+        atomic_t nr_running;
+
         struct list_head worklist;      /* L: list of pending works */
 
         int nr_workers;                 /* L: total number of workers */
@@ -177,19 +180,12 @@ struct worker_pool {
         struct hlist_node hash_node;    /* PL: unbound_pool_hash node */
         int refcnt;                     /* PL: refcnt for unbound pools */
 
-        /*
-         * The current concurrency level. As it's likely to be accessed
-         * from other CPUs during try_to_wake_up(), put it in a separate
-         * cacheline.
-         */
-        atomic_t nr_running ____cacheline_aligned_in_smp;
-
         /*
          * Destruction of pool is RCU protected to allow dereferences
          * from get_work_pool().
          */
         struct rcu_head rcu;
-} ____cacheline_aligned_in_smp;
+};
 
 /*
  * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
@@ -868,8 +864,17 @@ void wq_worker_running(struct task_struct *task)
 
         if (!worker->sleeping)
                 return;
+
+        /*
+         * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
+         * and the nr_running increment below, we may ruin the nr_running reset
+         * and leave with an unexpected pool->nr_running == 1 on the newly unbound
+         * pool. Protect against such race.
+         */
+        preempt_disable();
         if (!(worker->flags & WORKER_NOT_RUNNING))
                 atomic_inc(&worker->pool->nr_running);
+        preempt_enable();
         worker->sleeping = 0;
 }
 
@@ -878,8 +883,7 @@ void wq_worker_running(struct task_struct *task)
  * @task: task going to sleep
  *
  * This function is called from schedule() when a busy worker is
- * going to sleep. Preemption needs to be disabled to protect ->sleeping
- * assignment.
+ * going to sleep.
  */
 void wq_worker_sleeping(struct task_struct *task)
 {
@@ -903,6 +907,16 @@ void wq_worker_sleeping(struct task_struct *task)
         worker->sleeping = 1;
         raw_spin_lock_irq(&pool->lock);
 
+        /*
+         * Recheck in case unbind_workers() preempted us. We don't
+         * want to decrement nr_running after the worker is unbound
+         * and nr_running has been reset.
+         */
+        if (worker->flags & WORKER_NOT_RUNNING) {
+                raw_spin_unlock_irq(&pool->lock);
+                return;
+        }
+
         /*
          * The counterpart of the following dec_and_test, implied mb,
          * worklist not empty test sequence is in insert_work().
@@ -1531,7 +1545,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
  * @work: work to queue
  *
  * We queue the work to a specific CPU, the caller must ensure it
- * can't go away.
+ * can't go away. Callers that fail to ensure that the specified
+ * CPU cannot go away will execute on a randomly chosen CPU.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
@@ -1811,14 +1826,8 @@ static void worker_enter_idle(struct worker *worker)
         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
 
-        /*
-         * Sanity check nr_running. Because unbind_workers() releases
-         * pool->lock between setting %WORKER_UNBOUND and zapping
-         * nr_running, the warning may trigger spuriously. Check iff
-         * unbind is not in progress.
-         */
-        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
-                     pool->nr_workers == pool->nr_idle &&
+        /* Sanity check nr_running. */
+        WARN_ON_ONCE(pool->nr_workers == pool->nr_idle &&
                      atomic_read(&pool->nr_running));
 }
 
@@ -4979,38 +4988,22 @@ static void unbind_workers(int cpu)
                 /*
                  * We've blocked all attach/detach operations. Make all workers
                  * unbound and set DISASSOCIATED. Before this, all workers
-                 * except for the ones which are still executing works from
-                 * before the last CPU down must be on the cpu. After
-                 * this, they may become diasporas.
+                 * must be on the cpu. After this, they may become diasporas.
+                 * And the preemption disabled section in their sched callbacks
+                 * are guaranteed to see WORKER_UNBOUND since the code here
+                 * is on the same cpu.
                  */
                 for_each_pool_worker(worker, pool)
                         worker->flags |= WORKER_UNBOUND;
 
                 pool->flags |= POOL_DISASSOCIATED;
 
-                raw_spin_unlock_irq(&pool->lock);
-
-                for_each_pool_worker(worker, pool) {
-                        kthread_set_per_cpu(worker->task, -1);
-                        WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
-                }
-
-                mutex_unlock(&wq_pool_attach_mutex);
-
                 /*
-                 * Call schedule() so that we cross rq->lock and thus can
-                 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
-                 * This is necessary as scheduler callbacks may be invoked
-                 * from other cpus.
-                 */
-                schedule();
-
-                /*
-                 * Sched callbacks are disabled now. Zap nr_running.
-                 * After this, nr_running stays zero and need_more_worker()
-                 * and keep_working() are always true as long as the
-                 * worklist is not empty. This pool now behaves as an
-                 * unbound (in terms of concurrency management) pool which
+                 * The handling of nr_running in sched callbacks are disabled
+                 * now. Zap nr_running. After this, nr_running stays zero and
+                 * need_more_worker() and keep_working() are always true as
+                 * long as the worklist is not empty. This pool now behaves as
+                 * an unbound (in terms of concurrency management) pool which
                  * are served by workers tied to the pool.
                  */
                 atomic_set(&pool->nr_running, 0);
@@ -5020,9 +5013,16 @@ static void unbind_workers(int cpu)
                  * worker blocking could lead to lengthy stalls. Kick off
                  * unbound chain execution of currently pending work items.
                  */
-                raw_spin_lock_irq(&pool->lock);
                 wake_up_worker(pool);
+
                 raw_spin_unlock_irq(&pool->lock);
+
+                for_each_pool_worker(worker, pool) {
+                        kthread_set_per_cpu(worker->task, -1);
+                        WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+                }
+
+                mutex_unlock(&wq_pool_attach_mutex);
         }
 }
 
@@ -5058,17 +5058,6 @@ static void rebind_workers(struct worker_pool *pool)
         for_each_pool_worker(worker, pool) {
                 unsigned int worker_flags = worker->flags;
 
-                /*
-                 * A bound idle worker should actually be on the runqueue
-                 * of the associated CPU for local wake-ups targeting it to
-                 * work. Kick all idle workers so that they migrate to the
-                 * associated CPU. Doing this in the same loop as
-                 * replacing UNBOUND with REBOUND is safe as no worker will
-                 * be bound before @pool->lock is released.
-                 */
-                if (worker_flags & WORKER_IDLE)
-                        wake_up_process(worker->task);
-
                 /*
                  * We want to clear UNBOUND but can't directly call
                  * worker_clr_flags() or adjust nr_running. Atomically