@@ -154,6 +154,9 @@ struct worker_pool {
 
         unsigned long           watchdog_ts;    /* L: watchdog timestamp */
 
+        /* The current concurrency level. */
+        atomic_t                nr_running;
+
         struct list_head        worklist;       /* L: list of pending works */
 
         int                     nr_workers;     /* L: total number of workers */
@@ -177,19 +180,12 @@ struct worker_pool {
         struct hlist_node       hash_node;      /* PL: unbound_pool_hash node */
         int                     refcnt;         /* PL: refcnt for unbound pools */
 
-        /*
-         * The current concurrency level. As it's likely to be accessed
-         * from other CPUs during try_to_wake_up(), put it in a separate
-         * cacheline.
-         */
-        atomic_t                nr_running ____cacheline_aligned_in_smp;
-
         /*
          * Destruction of pool is RCU protected to allow dereferences
          * from get_work_pool().
          */
         struct rcu_head         rcu;
-} ____cacheline_aligned_in_smp;
+};
 
 /*
  * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
@@ -868,8 +864,17 @@ void wq_worker_running(struct task_struct *task)
 
         if (!worker->sleeping)
                 return;
+
+        /*
+         * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
+         * and the nr_running increment below, we may ruin the nr_running reset
+         * and leave with an unexpected pool->nr_running == 1 on the newly unbound
+         * pool. Protect against such race.
+         */
+        preempt_disable();
         if (!(worker->flags & WORKER_NOT_RUNNING))
                 atomic_inc(&worker->pool->nr_running);
+        preempt_enable();
         worker->sleeping = 0;
 }
 
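To make the window concrete, here is a small standalone C11 model of the interleaving the hunk above closes. It is not kernel code: fake_pool, fake_worker and the collapsed WORKER_NOT_RUNNING mask are illustrative stand-ins. Without the preempt_disable()/preempt_enable() pair, the worker can pass the flag test, be preempted while unbind_workers() sets WORKER_UNBOUND and zaps nr_running, and then perform a stale increment, leaving the freshly unbound pool with nr_running == 1.

/* Standalone model of the wq_worker_running() vs unbind_workers() race;
 * names are illustrative, not the kernel's. Build: cc -std=c11 model.c */
#include <stdatomic.h>
#include <stdio.h>

#define WORKER_UNBOUND          0x80
#define WORKER_NOT_RUNNING      WORKER_UNBOUND  /* collapsed mask, illustration only */

struct fake_pool   { atomic_int nr_running; };
struct fake_worker { unsigned int flags; struct fake_pool *pool; };

int main(void)
{
        struct fake_pool pool = { .nr_running = 0 };
        struct fake_worker worker = { .flags = 0, .pool = &pool };

        /* wq_worker_running(), step 1: the flag test passes (still bound). */
        int may_inc = !(worker.flags & WORKER_NOT_RUNNING);

        /* "Preemption point": unbind_workers() marks the worker unbound and
         * zaps nr_running while the worker sits between test and increment. */
        worker.flags |= WORKER_UNBOUND;
        atomic_store(&pool.nr_running, 0);

        /* wq_worker_running(), step 2: the stale increment leaks a count. */
        if (may_inc)
                atomic_fetch_add(&pool.nr_running, 1);

        printf("nr_running on the unbound pool: %d (expected 0)\n",
               atomic_load(&pool.nr_running));
        return 0;
}

With the fix, preemption is held off across the test-and-increment pair, and since unbind_workers() runs on the outgoing CPU itself, it can only observe the pair either entirely before or entirely after its reset.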
@@ -878,8 +883,7 @@ void wq_worker_running(struct task_struct *task)
  * @task: task going to sleep
  *
  * This function is called from schedule() when a busy worker is
- * going to sleep. Preemption needs to be disabled to protect ->sleeping
- * assignment.
+ * going to sleep.
  */
 void wq_worker_sleeping(struct task_struct *task)
 {
@@ -903,6 +907,16 @@ void wq_worker_sleeping(struct task_struct *task)
         worker->sleeping = 1;
         raw_spin_lock_irq(&pool->lock);
 
+        /*
+         * Recheck in case unbind_workers() preempted us. We don't
+         * want to decrement nr_running after the worker is unbound
+         * and nr_running has been reset.
+         */
+        if (worker->flags & WORKER_NOT_RUNNING) {
+                raw_spin_unlock_irq(&pool->lock);
+                return;
+        }
+
         /*
          * The counterpart of the following dec_and_test, implied mb,
          * worklist not empty test sequence is in insert_work().
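The sleeping side gets the mirror-image treatment: instead of disabling preemption, it rechecks WORKER_NOT_RUNNING after taking pool->lock, because the unbind side sets the flag and resets the counter under that same lock. A rough userspace model of the pattern, using made-up model_* types and a single NOT_RUNNING bit rather than the kernel's structures, might look like this:

/* Standalone model of the "recheck under the lock" pattern; link with -pthread. */
#include <pthread.h>
#include <stdio.h>

#define NOT_RUNNING 0x1

struct model_pool {
        pthread_mutex_t lock;
        int nr_running;
};

struct model_worker {
        unsigned int flags;
        struct model_pool *pool;
};

/* Mirrors the fixed sleeping path: decrement only if still bound. */
static void model_worker_sleeping(struct model_worker *w)
{
        struct model_pool *pool = w->pool;

        pthread_mutex_lock(&pool->lock);
        if (w->flags & NOT_RUNNING) {           /* unbound meanwhile: leave count alone */
                pthread_mutex_unlock(&pool->lock);
                return;
        }
        pool->nr_running--;                     /* still bound: account the sleep */
        pthread_mutex_unlock(&pool->lock);
}

/* Mirrors the unbind side: flag and reset under the same lock. */
static void model_unbind(struct model_worker *w)
{
        pthread_mutex_lock(&w->pool->lock);
        w->flags |= NOT_RUNNING;
        w->pool->nr_running = 0;
        pthread_mutex_unlock(&w->pool->lock);
}

int main(void)
{
        static struct model_pool pool = { .lock = PTHREAD_MUTEX_INITIALIZER, .nr_running = 1 };
        struct model_worker worker = { 0, &pool };

        model_unbind(&worker);                  /* unbind wins the race... */
        model_worker_sleeping(&worker);         /* ...so the decrement is skipped */
        printf("nr_running: %d (no underflow after unbind)\n", pool.nr_running);
        return 0;
}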
@@ -1531,7 +1545,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
  * @work: work to queue
  *
  * We queue the work to a specific CPU, the caller must ensure it
- * can't go away.
+ * can't go away. If the caller fails to ensure that the specified
+ * CPU cannot go away, the work will execute on a randomly chosen CPU.
  *
  * Return: %false if @work was already on a queue, %true otherwise.
  */
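As a purely illustrative sketch (my_work_fn, my_work, queue_on_pinned_cpu and the choice of system_wq are hypothetical, not part of this patch), one way a caller can satisfy the comment's requirement is to hold off CPU hotplug around the queueing:

#include <linux/cpu.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
        /* Runs on the CPU it was queued to, as long as that CPU stayed online. */
}
static DECLARE_WORK(my_work, my_work_fn);

static void queue_on_pinned_cpu(int target_cpu)
{
        /* Block CPU hotplug so @target_cpu cannot go away underneath us. */
        cpus_read_lock();
        if (cpu_online(target_cpu))
                queue_work_on(target_cpu, system_wq, &my_work);
        cpus_read_unlock();
}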
@@ -1811,14 +1826,8 @@ static void worker_enter_idle(struct worker *worker)
         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
 
-        /*
-         * Sanity check nr_running. Because unbind_workers() releases
-         * pool->lock between setting %WORKER_UNBOUND and zapping
-         * nr_running, the warning may trigger spuriously. Check iff
-         * unbind is not in progress.
-         */
-        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
-                     pool->nr_workers == pool->nr_idle &&
+        /* Sanity check nr_running. */
+        WARN_ON_ONCE(pool->nr_workers == pool->nr_idle &&
                      atomic_read(&pool->nr_running));
 }
 
@@ -4979,38 +4988,22 @@ static void unbind_workers(int cpu)
                 /*
                  * We've blocked all attach/detach operations. Make all workers
                  * unbound and set DISASSOCIATED. Before this, all workers
-                 * except for the ones which are still executing works from
-                 * before the last CPU down must be on the cpu. After
-                 * this, they may become diasporas.
+                 * must be on the cpu. After this, they may become diasporas.
+                 * And the preemption-disabled sections in their sched callbacks
+                 * are guaranteed to see WORKER_UNBOUND since the code here
+                 * is on the same cpu.
                  */
                 for_each_pool_worker(worker, pool)
                         worker->flags |= WORKER_UNBOUND;
 
                 pool->flags |= POOL_DISASSOCIATED;
 
-                raw_spin_unlock_irq(&pool->lock);
-
-                for_each_pool_worker(worker, pool) {
-                        kthread_set_per_cpu(worker->task, -1);
-                        WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
-                }
-
-                mutex_unlock(&wq_pool_attach_mutex);
-
                 /*
-                 * Call schedule() so that we cross rq->lock and thus can
-                 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
-                 * This is necessary as scheduler callbacks may be invoked
-                 * from other cpus.
-                 */
-                schedule();
-
-                /*
-                 * Sched callbacks are disabled now. Zap nr_running.
-                 * After this, nr_running stays zero and need_more_worker()
-                 * and keep_working() are always true as long as the
-                 * worklist is not empty. This pool now behaves as an
-                 * unbound (in terms of concurrency management) pool which
+                 * The handling of nr_running in sched callbacks is disabled
+                 * now. Zap nr_running. After this, nr_running stays zero and
+                 * need_more_worker() and keep_working() are always true as
+                 * long as the worklist is not empty. This pool now behaves as
+                 * an unbound (in terms of concurrency management) pool which
                  * are served by workers tied to the pool.
                  */
                 atomic_set(&pool->nr_running, 0);
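To see why the zap makes the two predicates the comment mentions unconditionally true, here is a tiny standalone model; the predicate bodies only paraphrase need_more_worker() and keep_working() from this era's workqueue.c for illustration, they are not the kernel functions themselves.

/* Standalone model: with nr_running pinned at zero, a non-empty worklist
 * keeps both predicates true. Build: cc model_predicates.c */
#include <stdbool.h>
#include <stdio.h>

struct pool_model { int nr_running; int worklist_len; };

static bool need_more_worker(const struct pool_model *p)
{
        return p->worklist_len > 0 && p->nr_running == 0;
}

static bool keep_working(const struct pool_model *p)
{
        return p->worklist_len > 0 && p->nr_running <= 1;
}

int main(void)
{
        /* After unbind_workers(): nr_running stays zero, works still pending. */
        struct pool_model unbound = { .nr_running = 0, .worklist_len = 3 };

        printf("need_more_worker: %d, keep_working: %d\n",
               need_more_worker(&unbound), keep_working(&unbound));
        return 0;
}

With nr_running frozen at zero, any pending work keeps both predicates true, which is exactly the "unbound in terms of concurrency management" behaviour the comment describes.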
@@ -5020,9 +5013,16 @@ static void unbind_workers(int cpu)
                  * worker blocking could lead to lengthy stalls. Kick off
                  * unbound chain execution of currently pending work items.
                  */
-                raw_spin_lock_irq(&pool->lock);
                 wake_up_worker(pool);
+
                 raw_spin_unlock_irq(&pool->lock);
+
+                for_each_pool_worker(worker, pool) {
+                        kthread_set_per_cpu(worker->task, -1);
+                        WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+                }
+
+                mutex_unlock(&wq_pool_attach_mutex);
         }
 }
 
@@ -5058,17 +5058,6 @@ static void rebind_workers(struct worker_pool *pool)
         for_each_pool_worker(worker, pool) {
                 unsigned int worker_flags = worker->flags;
 
-                /*
-                 * A bound idle worker should actually be on the runqueue
-                 * of the associated CPU for local wake-ups targeting it to
-                 * work. Kick all idle workers so that they migrate to the
-                 * associated CPU. Doing this in the same loop as
-                 * replacing UNBOUND with REBOUND is safe as no worker will
-                 * be bound before @pool->lock is released.
-                 */
-                if (worker_flags & WORKER_IDLE)
-                        wake_up_process(worker->task);
-
                 /*
                  * We want to clear UNBOUND but can't directly call
                  * worker_clr_flags() or adjust nr_running. Atomically