@@ -201,6 +201,14 @@ struct cpuset {
201
201
struct list_head remote_sibling ;
202
202
};
203
203
204
+ /*
205
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
206
+ */
207
+ struct cpuset_remove_tasks_struct {
208
+ struct work_struct work ;
209
+ struct cpuset * cs ;
210
+ };
211
+
204
212
/*
205
213
* Exclusive CPUs distributed out to sub-partitions of top_cpuset
206
214
*/
@@ -449,12 +457,6 @@ static DEFINE_SPINLOCK(callback_lock);
449
457
450
458
static struct workqueue_struct * cpuset_migrate_mm_wq ;
451
459
452
- /*
453
- * CPU / memory hotplug is handled asynchronously.
454
- */
455
- static void cpuset_hotplug_workfn (struct work_struct * work );
456
- static DECLARE_WORK (cpuset_hotplug_work , cpuset_hotplug_workfn ) ;
457
-
458
460
static DECLARE_WAIT_QUEUE_HEAD (cpuset_attach_wq );
459
461
460
462
static inline void check_insane_mems_config (nodemask_t * nodes )
@@ -540,22 +542,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
540
542
rcu_read_lock ();
541
543
cs = task_cs (tsk );
542
544
543
- while (!cpumask_intersects (cs -> effective_cpus , pmask )) {
545
+ while (!cpumask_intersects (cs -> effective_cpus , pmask ))
544
546
cs = parent_cs (cs );
545
- if (unlikely (!cs )) {
546
- /*
547
- * The top cpuset doesn't have any online cpu as a
548
- * consequence of a race between cpuset_hotplug_work
549
- * and cpu hotplug notifier. But we know the top
550
- * cpuset's effective_cpus is on its way to be
551
- * identical to cpu_online_mask.
552
- */
553
- goto out_unlock ;
554
- }
555
- }
556
- cpumask_and (pmask , pmask , cs -> effective_cpus );
557
547
558
- out_unlock :
548
+ cpumask_and ( pmask , pmask , cs -> effective_cpus );
559
549
rcu_read_unlock ();
560
550
}
561
551
@@ -1217,7 +1207,7 @@ static void rebuild_sched_domains_locked(void)
1217
1207
/*
1218
1208
* If we have raced with CPU hotplug, return early to avoid
1219
1209
* passing doms with offlined cpu to partition_sched_domains().
1220
- * Anyways, cpuset_hotplug_workfn () will rebuild sched domains.
1210
+ * Anyways, cpuset_handle_hotplug () will rebuild sched domains.
1221
1211
*
1222
1212
* With no CPUs in any subpartitions, top_cpuset's effective CPUs
1223
1213
* should be the same as the active CPUs, so checking only top_cpuset
@@ -1260,12 +1250,17 @@ static void rebuild_sched_domains_locked(void)
1260
1250
}
1261
1251
#endif /* CONFIG_SMP */
1262
1252
1263
- void rebuild_sched_domains (void )
1253
+ static void rebuild_sched_domains_cpuslocked (void )
1264
1254
{
1265
- cpus_read_lock ();
1266
1255
mutex_lock (& cpuset_mutex );
1267
1256
rebuild_sched_domains_locked ();
1268
1257
mutex_unlock (& cpuset_mutex );
1258
+ }
1259
+
1260
+ void rebuild_sched_domains (void )
1261
+ {
1262
+ cpus_read_lock ();
1263
+ rebuild_sched_domains_cpuslocked ();
1269
1264
cpus_read_unlock ();
1270
1265
}
1271
1266
@@ -2079,14 +2074,11 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
2079
2074
2080
2075
/*
2081
2076
* For partcmd_update without newmask, it is being called from
2082
- * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
2083
- * Update the load balance flag and scheduling domain if
2084
- * cpus_read_trylock() is successful.
2077
+ * cpuset_handle_hotplug(). Update the load balance flag and
2078
+ * scheduling domain accordingly.
2085
2079
*/
2086
- if ((cmd == partcmd_update ) && !newmask && cpus_read_trylock ()) {
2080
+ if ((cmd == partcmd_update ) && !newmask )
2087
2081
update_partition_sd_lb (cs , old_prs );
2088
- cpus_read_unlock ();
2089
- }
2090
2082
2091
2083
notify_partition_change (cs , old_prs );
2092
2084
return 0 ;
@@ -3599,8 +3591,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
3599
3591
* proceeding, so that we don't end up keep removing tasks added
3600
3592
* after execution capability is restored.
3601
3593
*
3602
- * cpuset_hotplug_work calls back into cgroup core via
3603
- * cgroup_transfer_tasks() and waiting for it from a cgroupfs
3594
+ * cpuset_handle_hotplug may call back into cgroup core asynchronously
3595
+ * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
3604
3596
* operation like this one can lead to a deadlock through kernfs
3605
3597
* active_ref protection. Let's break the protection. Losing the
3606
3598
* protection is okay as we check whether @cs is online after
@@ -3609,7 +3601,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
3609
3601
*/
3610
3602
css_get (& cs -> css );
3611
3603
kernfs_break_active_protection (of -> kn );
3612
- flush_work (& cpuset_hotplug_work );
3613
3604
3614
3605
cpus_read_lock ();
3615
3606
mutex_lock (& cpuset_mutex );
@@ -4354,6 +4345,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
4354
4345
}
4355
4346
}
4356
4347
4348
+ static void cpuset_migrate_tasks_workfn (struct work_struct * work )
4349
+ {
4350
+ struct cpuset_remove_tasks_struct * s ;
4351
+
4352
+ s = container_of (work , struct cpuset_remove_tasks_struct , work );
4353
+ remove_tasks_in_empty_cpuset (s -> cs );
4354
+ css_put (& s -> cs -> css );
4355
+ kfree (s );
4356
+ }
4357
+
4357
4358
static void
4358
4359
hotplug_update_tasks_legacy (struct cpuset * cs ,
4359
4360
struct cpumask * new_cpus , nodemask_t * new_mems ,
@@ -4383,12 +4384,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
4383
4384
/*
4384
4385
* Move tasks to the nearest ancestor with execution resources,
4385
4386
* This is full cgroup operation which will also call back into
4386
- * cpuset. Should be done outside any lock .
4387
+ * cpuset. Execute it asynchronously using workqueue .
4387
4388
*/
4388
- if (is_empty ) {
4389
- mutex_unlock (& cpuset_mutex );
4390
- remove_tasks_in_empty_cpuset (cs );
4391
- mutex_lock (& cpuset_mutex );
4389
+ if (is_empty && cs -> css .cgroup -> nr_populated_csets &&
4390
+ css_tryget_online (& cs -> css )) {
4391
+ struct cpuset_remove_tasks_struct * s ;
4392
+
4393
+ s = kzalloc (sizeof (* s ), GFP_KERNEL );
4394
+ if (WARN_ON_ONCE (!s )) {
4395
+ css_put (& cs -> css );
4396
+ return ;
4397
+ }
4398
+
4399
+ s -> cs = cs ;
4400
+ INIT_WORK (& s -> work , cpuset_migrate_tasks_workfn );
4401
+ schedule_work (& s -> work );
4392
4402
}
4393
4403
}
4394
4404
@@ -4421,30 +4431,6 @@ void cpuset_force_rebuild(void)
4421
4431
force_rebuild = true;
4422
4432
}
4423
4433
4424
- /*
4425
- * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
4426
- * progress.
4427
- * Return: true if successful, false otherwise
4428
- *
4429
- * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
4430
- * cpus_read_trylock() is used here to acquire the lock.
4431
- */
4432
- static bool cpuset_hotplug_cpus_read_trylock (void )
4433
- {
4434
- int retries = 0 ;
4435
-
4436
- while (!cpus_read_trylock ()) {
4437
- /*
4438
- * CPU hotplug still in progress. Retry 5 times
4439
- * with a 10ms wait before bailing out.
4440
- */
4441
- if (++ retries > 5 )
4442
- return false;
4443
- msleep (10 );
4444
- }
4445
- return true;
4446
- }
4447
-
4448
4434
/**
4449
4435
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
4450
4436
* @cs: cpuset in interest
@@ -4493,13 +4479,11 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
4493
4479
compute_partition_effective_cpumask (cs , & new_cpus );
4494
4480
4495
4481
if (remote && cpumask_empty (& new_cpus ) &&
4496
- partition_is_populated (cs , NULL ) &&
4497
- cpuset_hotplug_cpus_read_trylock ()) {
4482
+ partition_is_populated (cs , NULL )) {
4498
4483
remote_partition_disable (cs , tmp );
4499
4484
compute_effective_cpumask (& new_cpus , cs , parent );
4500
4485
remote = false;
4501
4486
cpuset_force_rebuild ();
4502
- cpus_read_unlock ();
4503
4487
}
4504
4488
4505
4489
/*
@@ -4519,18 +4503,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
4519
4503
else if (is_partition_valid (parent ) && is_partition_invalid (cs ))
4520
4504
partcmd = partcmd_update ;
4521
4505
4522
- /*
4523
- * cpus_read_lock needs to be held before calling
4524
- * update_parent_effective_cpumask(). To avoid circular lock
4525
- * dependency between cpuset_mutex and cpus_read_lock,
4526
- * cpus_read_trylock() is used here to acquire the lock.
4527
- */
4528
4506
if (partcmd >= 0 ) {
4529
- if (!cpuset_hotplug_cpus_read_trylock ())
4530
- goto update_tasks ;
4531
-
4532
4507
update_parent_effective_cpumask (cs , partcmd , NULL , tmp );
4533
- cpus_read_unlock ();
4534
4508
if ((partcmd == partcmd_invalidate ) || is_partition_valid (cs )) {
4535
4509
compute_partition_effective_cpumask (cs , & new_cpus );
4536
4510
cpuset_force_rebuild ();
@@ -4558,8 +4532,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
4558
4532
}
4559
4533
4560
4534
/**
4561
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
4562
- * @work: unused
4535
+ * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
4563
4536
*
4564
4537
* This function is called after either CPU or memory configuration has
4565
4538
* changed and updates cpuset accordingly. The top_cpuset is always
@@ -4573,8 +4546,10 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
4573
4546
*
4574
4547
* Note that CPU offlining during suspend is ignored. We don't modify
4575
4548
* cpusets across suspend/resume cycles at all.
4549
+ *
4550
+ * CPU / memory hotplug is handled synchronously.
4576
4551
*/
4577
- static void cpuset_hotplug_workfn ( struct work_struct * work )
4552
+ static void cpuset_handle_hotplug ( void )
4578
4553
{
4579
4554
static cpumask_t new_cpus ;
4580
4555
static nodemask_t new_mems ;
@@ -4585,6 +4560,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
4585
4560
if (on_dfl && !alloc_cpumasks (NULL , & tmp ))
4586
4561
ptmp = & tmp ;
4587
4562
4563
+ lockdep_assert_cpus_held ();
4588
4564
mutex_lock (& cpuset_mutex );
4589
4565
4590
4566
/* fetch the available cpus/mems and find out which changed how */
@@ -4666,7 +4642,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
4666
4642
/* rebuild sched domains if cpus_allowed has changed */
4667
4643
if (cpus_updated || force_rebuild ) {
4668
4644
force_rebuild = false;
4669
- rebuild_sched_domains ();
4645
+ rebuild_sched_domains_cpuslocked ();
4670
4646
}
4671
4647
4672
4648
free_cpumasks (NULL , ptmp );
@@ -4679,12 +4655,7 @@ void cpuset_update_active_cpus(void)
4679
4655
* inside cgroup synchronization. Hotplug processing is now done
4680
4656
* synchronously by calling cpuset_handle_hotplug() directly.
4681
4657
*/
4682
- schedule_work (& cpuset_hotplug_work );
4683
- }
4684
-
4685
- void cpuset_wait_for_hotplug (void )
4686
- {
4687
- flush_work (& cpuset_hotplug_work );
4658
+ cpuset_handle_hotplug ();
4688
4659
}
4689
4660
4690
4661
/*
@@ -4695,7 +4666,7 @@ void cpuset_wait_for_hotplug(void)
4695
4666
static int cpuset_track_online_nodes (struct notifier_block * self ,
4696
4667
unsigned long action , void * arg )
4697
4668
{
4698
- schedule_work ( & cpuset_hotplug_work );
4669
+ cpuset_handle_hotplug ( );
4699
4670
return NOTIFY_OK ;
4700
4671
}
4701
4672
0 commit comments