@@ -179,6 +179,7 @@ struct worker_pool {
 
 	struct worker		*manager;	/* L: purely informational */
 	struct list_head	workers;	/* A: attached workers */
+	struct list_head	dying_workers;	/* A: workers about to die */
 	struct completion	*detach_completion; /* all workers detached */
 
 	struct ida		worker_ida;	/* worker IDs for task name */
@@ -1906,7 +1907,7 @@ static void worker_detach_from_pool(struct worker *worker)
 	list_del(&worker->node);
 	worker->pool = NULL;
 
-	if (list_empty(&pool->workers))
+	if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
 		detach_completion = pool->detach_completion;
 	mutex_unlock(&wq_pool_attach_mutex);
 
@@ -1995,21 +1996,44 @@ static void rebind_worker(struct worker *worker, struct worker_pool *pool)
 	WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0);
 }
 
+static void wake_dying_workers(struct list_head *cull_list)
+{
+	struct worker *worker, *tmp;
+
+	list_for_each_entry_safe(worker, tmp, cull_list, entry) {
+		list_del_init(&worker->entry);
+		unbind_worker(worker);
+		/*
+		 * If the worker was somehow already running, then it had to be
+		 * in pool->idle_list when set_worker_dying() happened or we
+		 * wouldn't have gotten here.
+		 *
+		 * Thus, the worker must either have observed the WORKER_DIE
+		 * flag, or have set its state to TASK_IDLE. Either way, the
+		 * below will be observed by the worker and is safe to do
+		 * outside of pool->lock.
+		 */
+		wake_up_process(worker->task);
+	}
+}
+
 /**
- * destroy_worker - destroy a workqueue worker
+ * set_worker_dying - Tag a worker for destruction
  * @worker: worker to be destroyed
+ * @list: transfer worker away from its pool->idle_list and into list
  *
- * Destroy @worker and adjust @pool stats accordingly. The worker should
- * be idle.
+ * Tag @worker for destruction and adjust @pool stats accordingly. The worker
+ * should be idle.
  *
  * CONTEXT:
  * raw_spin_lock_irq(pool->lock).
  */
-static void destroy_worker(struct worker *worker)
+static void set_worker_dying(struct worker *worker, struct list_head *list)
 {
 	struct worker_pool *pool = worker->pool;
 
 	lockdep_assert_held(&pool->lock);
+	lockdep_assert_held(&wq_pool_attach_mutex);
 
 	/* sanity check frenzy */
 	if (WARN_ON(worker->current_work) ||
@@ -2020,9 +2044,10 @@ static void destroy_worker(struct worker *worker)
 	pool->nr_workers--;
 	pool->nr_idle--;
 
-	list_del_init(&worker->entry);
 	worker->flags |= WORKER_DIE;
-	wake_up_process(worker->task);
+
+	list_move(&worker->entry, list);
+	list_move(&worker->node, &pool->dying_workers);
 }
 
 /**
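As a rough user-space analogue of the two-phase pattern in the hunks above (tag the worker and move it to a private cull list under the lock, then wake it once the lock is released), here is a minimal sketch using POSIX threads. The names struct pool, struct worker and worker_fn, and the per-worker condition variable standing in for wake_up_process(), are illustrative assumptions rather than kernel APIs.

/* Minimal pthread sketch of "tag under the lock, wake outside it". */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_WORKERS 4

struct worker {
	pthread_t thread;
	pthread_cond_t wake;		/* stands in for wake_up_process() */
	bool die;			/* stands in for WORKER_DIE */
	struct worker *next;		/* idle list / cull list linkage */
};

struct pool {
	pthread_mutex_t lock;		/* stands in for pool->lock */
	struct worker *idle;		/* stands in for pool->idle_list */
};

static struct pool pool = { .lock = PTHREAD_MUTEX_INITIALIZER };

static void *worker_fn(void *arg)
{
	struct worker *w = arg;

	pthread_mutex_lock(&pool.lock);
	while (!w->die)			/* "idle" until tagged for death */
		pthread_cond_wait(&w->wake, &pool.lock);
	pthread_mutex_unlock(&pool.lock);
	return NULL;			/* self-destruct path */
}

int main(void)
{
	struct worker workers[NR_WORKERS];
	struct worker *cull_list = NULL, *w;

	for (int i = 0; i < NR_WORKERS; i++) {
		w = &workers[i];
		w->die = false;
		pthread_cond_init(&w->wake, NULL);
		pthread_create(&w->thread, NULL, worker_fn, w);
		pthread_mutex_lock(&pool.lock);
		w->next = pool.idle;	/* park the worker on the idle list */
		pool.idle = w;
		pthread_mutex_unlock(&pool.lock);
	}

	/*
	 * Phase 1: tag the workers and move them to a private cull list,
	 * all while holding the pool lock (set_worker_dying() analogue).
	 */
	pthread_mutex_lock(&pool.lock);
	while ((w = pool.idle)) {
		pool.idle = w->next;
		w->die = true;
		w->next = cull_list;
		cull_list = w;
	}
	pthread_mutex_unlock(&pool.lock);

	/*
	 * Phase 2: wake them with the lock dropped
	 * (wake_dying_workers() analogue).
	 */
	while ((w = cull_list)) {
		cull_list = w->next;
		pthread_cond_signal(&w->wake);
		pthread_join(w->thread, NULL);
		pthread_cond_destroy(&w->wake);
	}

	puts("all workers culled");
	return 0;
}

Build with cc -pthread. The sketch only demonstrates the ordering the patch relies on: the die flag and the list moves happen under the lock, while the actual wake-ups happen only after it is released.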
@@ -2069,11 +2094,24 @@ static void idle_worker_timeout(struct timer_list *t)
  *
  * This goes through a pool's idle workers and gets rid of those that have been
  * idle for at least IDLE_WORKER_TIMEOUT seconds.
+ *
+ * We don't want to disturb isolated CPUs because of a pcpu kworker being
+ * culled, so this also resets worker affinity. This requires a sleepable
+ * context, hence the split between timer callback and work item.
  */
 static void idle_cull_fn(struct work_struct *work)
 {
 	struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
+	struct list_head cull_list;
 
+	INIT_LIST_HEAD(&cull_list);
+	/*
+	 * Grabbing wq_pool_attach_mutex here ensures an already-running worker
+	 * cannot proceed beyond worker_detach_from_pool() in its self-destruct
+	 * path. This is required as a previously-preempted worker could run after
+	 * set_worker_dying() has happened but before wake_dying_workers() did.
+	 */
+	mutex_lock(&wq_pool_attach_mutex);
 	raw_spin_lock_irq(&pool->lock);
 
 	while (too_many_workers(pool)) {
@@ -2088,10 +2126,12 @@ static void idle_cull_fn(struct work_struct *work)
 			break;
 		}
 
-		destroy_worker(worker);
+		set_worker_dying(worker, &cull_list);
 	}
 
 	raw_spin_unlock_irq(&pool->lock);
+	wake_dying_workers(&cull_list);
+	mutex_unlock(&wq_pool_attach_mutex);
 }
 
 static void send_mayday(struct work_struct *work)
@@ -2455,12 +2495,12 @@ static int worker_thread(void *__worker)
 		/* am I supposed to die? */
 		if (unlikely(worker->flags & WORKER_DIE)) {
 			raw_spin_unlock_irq(&pool->lock);
-			WARN_ON_ONCE(!list_empty(&worker->entry));
 			set_pf_worker(false);
 
 			set_task_comm(worker->task, "kworker/dying");
 			ida_free(&pool->worker_ida, worker->id);
 			worker_detach_from_pool(worker);
+			WARN_ON_ONCE(!list_empty(&worker->entry));
 			kfree(worker);
 			return 0;
 		}
@@ -3534,6 +3574,7 @@ static int init_worker_pool(struct worker_pool *pool)
 	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
 
 	INIT_LIST_HEAD(&pool->workers);
+	INIT_LIST_HEAD(&pool->dying_workers);
 
 	ida_init(&pool->worker_ida);
 	INIT_HLIST_NODE(&pool->hash_node);
@@ -3622,8 +3663,11 @@ static void rcu_free_pool(struct rcu_head *rcu)
 static void put_unbound_pool(struct worker_pool *pool)
 {
 	DECLARE_COMPLETION_ONSTACK(detach_completion);
+	struct list_head cull_list;
 	struct worker *worker;
 
+	INIT_LIST_HEAD(&cull_list);
+
 	lockdep_assert_held(&wq_pool_mutex);
 
 	if (--pool->refcnt)
@@ -3656,21 +3700,25 @@ static void put_unbound_pool(struct worker_pool *pool)
 		rcuwait_wait_event(&manager_wait,
 				   !(pool->flags & POOL_MANAGER_ACTIVE),
 				   TASK_UNINTERRUPTIBLE);
+
+		mutex_lock(&wq_pool_attach_mutex);
 		raw_spin_lock_irq(&pool->lock);
 		if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
 			pool->flags |= POOL_MANAGER_ACTIVE;
 			break;
 		}
 		raw_spin_unlock_irq(&pool->lock);
+		mutex_unlock(&wq_pool_attach_mutex);
 	}
 
 	while ((worker = first_idle_worker(pool)))
-		destroy_worker(worker);
+		set_worker_dying(worker, &cull_list);
 	WARN_ON(pool->nr_workers || pool->nr_idle);
 	raw_spin_unlock_irq(&pool->lock);
 
-	mutex_lock(&wq_pool_attach_mutex);
-	if (!list_empty(&pool->workers))
+	wake_dying_workers(&cull_list);
+
+	if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
 		pool->detach_completion = &detach_completion;
 	mutex_unlock(&wq_pool_attach_mutex);
 