Skip to content

Commit 2125c00

Browse files
Waiman-Long authored and htejun (Tejun Heo) committed
cgroup/cpuset: Make cpuset hotplug processing synchronous
Since commit 3a5a6d0 ("cpuset: don't nest cgroup_mutex inside get_online_cpus()"), cpuset hotplug was done asynchronously via a work function. This is to avoid recursive locking of cgroup_mutex. Since then, the cgroup locking scheme has changed quite a bit. A cpuset_mutex was introduced to protect cpuset specific operations. The cpuset_mutex is then replaced by a cpuset_rwsem. With commit d74b27d ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock order"), cpu_hotplug_lock is acquired before cpuset_rwsem. Later on, cpuset_rwsem is reverted back to cpuset_mutex. All these locking changes allow the hotplug code to call into cpuset core directly. The following commits were also merged due to the asynchronous nature of cpuset hotplug processing. - commit b22afcd ("cpu/hotplug: Cure the cpusets trainwreck") - commit 50e7663 ("sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs") - commit 28b89b9 ("cpuset: handle race between CPU hotplug and cpuset_hotplug_work") Clean up all these bandages by making cpuset hotplug processing synchronous again with the exception that the call to cgroup_transfer_tasks() to transfer tasks out of an empty cgroup v1 cpuset, if necessary, will still be done via a work function due to the existing cgroup_mutex -> cpu_hotplug_lock dependency. It is possible to reverse that dependency, but that will require updating a number of different cgroup controllers. This special hotplug code path should be rarely taken anyway. As all the cpuset states will be updated by the end of the hotplug operation, we can revert most of the above commits except commit 50e7663 ("sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs") which is partially reverted. Also removing some cpus_read_lock trylock attempts in the cpuset partition code as they are no longer necessary since the cpu_hotplug_lock is now held for the whole duration of the cpuset hotplug code path. 
Signed-off-by: Waiman Long <[email protected]> Tested-by: Valentin Schneider <[email protected]> Signed-off-by: Tejun Heo <[email protected]>
1 parent 4793cb5 commit 2125c00

File tree

4 files changed

+56
-138
lines changed

4 files changed

+56
-138
lines changed

include/linux/cpuset.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ extern int cpuset_init(void);
7070
extern void cpuset_init_smp(void);
7171
extern void cpuset_force_rebuild(void);
7272
extern void cpuset_update_active_cpus(void);
73-
extern void cpuset_wait_for_hotplug(void);
7473
extern void inc_dl_tasks_cs(struct task_struct *task);
7574
extern void dec_dl_tasks_cs(struct task_struct *task);
7675
extern void cpuset_lock(void);
@@ -185,8 +184,6 @@ static inline void cpuset_update_active_cpus(void)
185184
partition_sched_domains(1, NULL, NULL);
186185
}
187186

188-
static inline void cpuset_wait_for_hotplug(void) { }
189-
190187
static inline void inc_dl_tasks_cs(struct task_struct *task) { }
191188
static inline void dec_dl_tasks_cs(struct task_struct *task) { }
192189
static inline void cpuset_lock(void) { }

kernel/cgroup/cpuset.c

Lines changed: 56 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,14 @@ struct cpuset {
201201
struct list_head remote_sibling;
202202
};
203203

204+
/*
205+
* Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
206+
*/
207+
struct cpuset_remove_tasks_struct {
208+
struct work_struct work;
209+
struct cpuset *cs;
210+
};
211+
204212
/*
205213
* Exclusive CPUs distributed out to sub-partitions of top_cpuset
206214
*/
@@ -449,12 +457,6 @@ static DEFINE_SPINLOCK(callback_lock);
449457

450458
static struct workqueue_struct *cpuset_migrate_mm_wq;
451459

452-
/*
453-
* CPU / memory hotplug is handled asynchronously.
454-
*/
455-
static void cpuset_hotplug_workfn(struct work_struct *work);
456-
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
457-
458460
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
459461

460462
static inline void check_insane_mems_config(nodemask_t *nodes)
@@ -540,22 +542,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
540542
rcu_read_lock();
541543
cs = task_cs(tsk);
542544

543-
while (!cpumask_intersects(cs->effective_cpus, pmask)) {
545+
while (!cpumask_intersects(cs->effective_cpus, pmask))
544546
cs = parent_cs(cs);
545-
if (unlikely(!cs)) {
546-
/*
547-
* The top cpuset doesn't have any online cpu as a
548-
* consequence of a race between cpuset_hotplug_work
549-
* and cpu hotplug notifier. But we know the top
550-
* cpuset's effective_cpus is on its way to be
551-
* identical to cpu_online_mask.
552-
*/
553-
goto out_unlock;
554-
}
555-
}
556-
cpumask_and(pmask, pmask, cs->effective_cpus);
557547

558-
out_unlock:
548+
cpumask_and(pmask, pmask, cs->effective_cpus);
559549
rcu_read_unlock();
560550
}
561551

@@ -1217,7 +1207,7 @@ static void rebuild_sched_domains_locked(void)
12171207
/*
12181208
* If we have raced with CPU hotplug, return early to avoid
12191209
* passing doms with offlined cpu to partition_sched_domains().
1220-
* Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
1210+
* Anyways, cpuset_handle_hotplug() will rebuild sched domains.
12211211
*
12221212
* With no CPUs in any subpartitions, top_cpuset's effective CPUs
12231213
* should be the same as the active CPUs, so checking only top_cpuset
@@ -1260,12 +1250,17 @@ static void rebuild_sched_domains_locked(void)
12601250
}
12611251
#endif /* CONFIG_SMP */
12621252

1263-
void rebuild_sched_domains(void)
1253+
static void rebuild_sched_domains_cpuslocked(void)
12641254
{
1265-
cpus_read_lock();
12661255
mutex_lock(&cpuset_mutex);
12671256
rebuild_sched_domains_locked();
12681257
mutex_unlock(&cpuset_mutex);
1258+
}
1259+
1260+
void rebuild_sched_domains(void)
1261+
{
1262+
cpus_read_lock();
1263+
rebuild_sched_domains_cpuslocked();
12691264
cpus_read_unlock();
12701265
}
12711266

@@ -2079,14 +2074,11 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
20792074

20802075
/*
20812076
* For partcmd_update without newmask, it is being called from
2082-
* cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
2083-
* Update the load balance flag and scheduling domain if
2084-
* cpus_read_trylock() is successful.
2077+
* cpuset_handle_hotplug(). Update the load balance flag and
2078+
* scheduling domain accordingly.
20852079
*/
2086-
if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
2080+
if ((cmd == partcmd_update) && !newmask)
20872081
update_partition_sd_lb(cs, old_prs);
2088-
cpus_read_unlock();
2089-
}
20902082

20912083
notify_partition_change(cs, old_prs);
20922084
return 0;
@@ -3599,8 +3591,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
35993591
* proceeding, so that we don't end up keep removing tasks added
36003592
* after execution capability is restored.
36013593
*
3602-
* cpuset_hotplug_work calls back into cgroup core via
3603-
* cgroup_transfer_tasks() and waiting for it from a cgroupfs
3594+
* cpuset_handle_hotplug may call back into cgroup core asynchronously
3595+
* via cgroup_transfer_tasks() and waiting for it from a cgroupfs
36043596
* operation like this one can lead to a deadlock through kernfs
36053597
* active_ref protection. Let's break the protection. Losing the
36063598
* protection is okay as we check whether @cs is online after
@@ -3609,7 +3601,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
36093601
*/
36103602
css_get(&cs->css);
36113603
kernfs_break_active_protection(of->kn);
3612-
flush_work(&cpuset_hotplug_work);
36133604

36143605
cpus_read_lock();
36153606
mutex_lock(&cpuset_mutex);
@@ -4354,6 +4345,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
43544345
}
43554346
}
43564347

4348+
static void cpuset_migrate_tasks_workfn(struct work_struct *work)
4349+
{
4350+
struct cpuset_remove_tasks_struct *s;
4351+
4352+
s = container_of(work, struct cpuset_remove_tasks_struct, work);
4353+
remove_tasks_in_empty_cpuset(s->cs);
4354+
css_put(&s->cs->css);
4355+
kfree(s);
4356+
}
4357+
43574358
static void
43584359
hotplug_update_tasks_legacy(struct cpuset *cs,
43594360
struct cpumask *new_cpus, nodemask_t *new_mems,
@@ -4383,12 +4384,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
43834384
/*
43844385
* Move tasks to the nearest ancestor with execution resources,
43854386
* This is full cgroup operation which will also call back into
4386-
* cpuset. Should be done outside any lock.
4387+
* cpuset. Execute it asynchronously using workqueue.
43874388
*/
4388-
if (is_empty) {
4389-
mutex_unlock(&cpuset_mutex);
4390-
remove_tasks_in_empty_cpuset(cs);
4391-
mutex_lock(&cpuset_mutex);
4389+
if (is_empty && cs->css.cgroup->nr_populated_csets &&
4390+
css_tryget_online(&cs->css)) {
4391+
struct cpuset_remove_tasks_struct *s;
4392+
4393+
s = kzalloc(sizeof(*s), GFP_KERNEL);
4394+
if (WARN_ON_ONCE(!s)) {
4395+
css_put(&cs->css);
4396+
return;
4397+
}
4398+
4399+
s->cs = cs;
4400+
INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
4401+
schedule_work(&s->work);
43924402
}
43934403
}
43944404

@@ -4421,30 +4431,6 @@ void cpuset_force_rebuild(void)
44214431
force_rebuild = true;
44224432
}
44234433

4424-
/*
4425-
* Attempt to acquire a cpus_read_lock while a hotplug operation may be in
4426-
* progress.
4427-
* Return: true if successful, false otherwise
4428-
*
4429-
* To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
4430-
* cpus_read_trylock() is used here to acquire the lock.
4431-
*/
4432-
static bool cpuset_hotplug_cpus_read_trylock(void)
4433-
{
4434-
int retries = 0;
4435-
4436-
while (!cpus_read_trylock()) {
4437-
/*
4438-
* CPU hotplug still in progress. Retry 5 times
4439-
* with a 10ms wait before bailing out.
4440-
*/
4441-
if (++retries > 5)
4442-
return false;
4443-
msleep(10);
4444-
}
4445-
return true;
4446-
}
4447-
44484434
/**
44494435
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
44504436
* @cs: cpuset in interest
@@ -4493,13 +4479,11 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
44934479
compute_partition_effective_cpumask(cs, &new_cpus);
44944480

44954481
if (remote && cpumask_empty(&new_cpus) &&
4496-
partition_is_populated(cs, NULL) &&
4497-
cpuset_hotplug_cpus_read_trylock()) {
4482+
partition_is_populated(cs, NULL)) {
44984483
remote_partition_disable(cs, tmp);
44994484
compute_effective_cpumask(&new_cpus, cs, parent);
45004485
remote = false;
45014486
cpuset_force_rebuild();
4502-
cpus_read_unlock();
45034487
}
45044488

45054489
/*
@@ -4519,18 +4503,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
45194503
else if (is_partition_valid(parent) && is_partition_invalid(cs))
45204504
partcmd = partcmd_update;
45214505

4522-
/*
4523-
* cpus_read_lock needs to be held before calling
4524-
* update_parent_effective_cpumask(). To avoid circular lock
4525-
* dependency between cpuset_mutex and cpus_read_lock,
4526-
* cpus_read_trylock() is used here to acquire the lock.
4527-
*/
45284506
if (partcmd >= 0) {
4529-
if (!cpuset_hotplug_cpus_read_trylock())
4530-
goto update_tasks;
4531-
45324507
update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
4533-
cpus_read_unlock();
45344508
if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
45354509
compute_partition_effective_cpumask(cs, &new_cpus);
45364510
cpuset_force_rebuild();
@@ -4558,8 +4532,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
45584532
}
45594533

45604534
/**
4561-
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
4562-
* @work: unused
4535+
* cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
45634536
*
45644537
* This function is called after either CPU or memory configuration has
45654538
* changed and updates cpuset accordingly. The top_cpuset is always
@@ -4573,8 +4546,10 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
45734546
*
45744547
* Note that CPU offlining during suspend is ignored. We don't modify
45754548
* cpusets across suspend/resume cycles at all.
4549+
*
4550+
* CPU / memory hotplug is handled synchronously.
45764551
*/
4577-
static void cpuset_hotplug_workfn(struct work_struct *work)
4552+
static void cpuset_handle_hotplug(void)
45784553
{
45794554
static cpumask_t new_cpus;
45804555
static nodemask_t new_mems;
@@ -4585,6 +4560,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
45854560
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
45864561
ptmp = &tmp;
45874562

4563+
lockdep_assert_cpus_held();
45884564
mutex_lock(&cpuset_mutex);
45894565

45904566
/* fetch the available cpus/mems and find out which changed how */
@@ -4666,7 +4642,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
46664642
/* rebuild sched domains if cpus_allowed has changed */
46674643
if (cpus_updated || force_rebuild) {
46684644
force_rebuild = false;
4669-
rebuild_sched_domains();
4645+
rebuild_sched_domains_cpuslocked();
46704646
}
46714647

46724648
free_cpumasks(NULL, ptmp);
@@ -4679,12 +4655,7 @@ void cpuset_update_active_cpus(void)
46794655
* inside cgroup synchronization. Bounce actual hotplug processing
46804656
* to a work item to avoid reverse locking order.
46814657
*/
4682-
schedule_work(&cpuset_hotplug_work);
4683-
}
4684-
4685-
void cpuset_wait_for_hotplug(void)
4686-
{
4687-
flush_work(&cpuset_hotplug_work);
4658+
cpuset_handle_hotplug();
46884659
}
46894660

46904661
/*
@@ -4695,7 +4666,7 @@ void cpuset_wait_for_hotplug(void)
46954666
static int cpuset_track_online_nodes(struct notifier_block *self,
46964667
unsigned long action, void *arg)
46974668
{
4698-
schedule_work(&cpuset_hotplug_work);
4669+
cpuset_handle_hotplug();
46994670
return NOTIFY_OK;
47004671
}
47014672

kernel/cpu.c

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1208,52 +1208,6 @@ void __init cpuhp_threads_init(void)
12081208
kthread_unpark(this_cpu_read(cpuhp_state.thread));
12091209
}
12101210

1211-
/*
1212-
*
1213-
* Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
1214-
* protected region.
1215-
*
1216-
* The operation is still serialized against concurrent CPU hotplug via
1217-
* cpu_add_remove_lock, i.e. CPU map protection. But it is _not_
1218-
* serialized against other hotplug related activity like adding or
1219-
* removing of state callbacks and state instances, which invoke either the
1220-
* startup or the teardown callback of the affected state.
1221-
*
1222-
* This is required for subsystems which are unfixable vs. CPU hotplug and
1223-
* evade lock inversion problems by scheduling work which has to be
1224-
* completed _before_ cpu_up()/_cpu_down() returns.
1225-
*
1226-
* Don't even think about adding anything to this for any new code or even
1227-
* drivers. It's only purpose is to keep existing lock order trainwrecks
1228-
* working.
1229-
*
1230-
* For cpu_down() there might be valid reasons to finish cleanups which are
1231-
* not required to be done under cpu_hotplug_lock, but that's a different
1232-
* story and would be not invoked via this.
1233-
*/
1234-
static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
1235-
{
1236-
/*
1237-
* cpusets delegate hotplug operations to a worker to "solve" the
1238-
* lock order problems. Wait for the worker, but only if tasks are
1239-
* _not_ frozen (suspend, hibernate) as that would wait forever.
1240-
*
1241-
* The wait is required because otherwise the hotplug operation
1242-
* returns with inconsistent state, which could even be observed in
1243-
* user space when a new CPU is brought up. The CPU plug uevent
1244-
* would be delivered and user space reacting on it would fail to
1245-
* move tasks to the newly plugged CPU up to the point where the
1246-
* work has finished because up to that point the newly plugged CPU
1247-
* is not assignable in cpusets/cgroups. On unplug that's not
1248-
* necessarily a visible issue, but it is still inconsistent state,
1249-
* which is the real problem which needs to be "fixed". This can't
1250-
* prevent the transient state between scheduling the work and
1251-
* returning from waiting for it.
1252-
*/
1253-
if (!tasks_frozen)
1254-
cpuset_wait_for_hotplug();
1255-
}
1256-
12571211
#ifdef CONFIG_HOTPLUG_CPU
12581212
#ifndef arch_clear_mm_cpumask_cpu
12591213
#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
@@ -1494,7 +1448,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
14941448
*/
14951449
lockup_detector_cleanup();
14961450
arch_smt_update();
1497-
cpu_up_down_serialize_trainwrecks(tasks_frozen);
14981451
return ret;
14991452
}
15001453

@@ -1728,7 +1681,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
17281681
out:
17291682
cpus_write_unlock();
17301683
arch_smt_update();
1731-
cpu_up_down_serialize_trainwrecks(tasks_frozen);
17321684
return ret;
17331685
}
17341686

kernel/power/process.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,8 +194,6 @@ void thaw_processes(void)
194194
__usermodehelper_set_disable_depth(UMH_FREEZING);
195195
thaw_workqueues();
196196

197-
cpuset_wait_for_hotplug();
198-
199197
read_lock(&tasklist_lock);
200198
for_each_process_thread(g, p) {
201199
/* No other threads should have PF_SUSPEND_TASK set */

0 commit comments

Comments
 (0)