Skip to content

Commit 24c56ee

Browse files
committed
Merge tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Borislav Petkov:

 - Correct the marking of kthreads which are supposed to run on a
   specific, single CPU vs such which are affine to only one CPU, mark
   per-cpu workqueue threads as such and make sure that marking
   "survives" CPU hotplug. Fix CPU hotplug issues with such kthreads.

 - A fix to not push away tasks on CPUs coming online.

 - Have workqueue CPU hotplug code use cpu_possible_mask when breaking
   affinity on CPU offlining so that pending workers can finish on newly
   arrived onlined CPUs too.

 - Dump tasks which haven't vacated a CPU which is currently being
   unplugged.

 - Register a special scale invariance callback which gets called on
   resume from RAM to read out APERF/MPERF after resume and thus make
   the schedutil scaling governor more precise.

* tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Relax the set_cpus_allowed_ptr() semantics
  sched: Fix CPU hotplug / tighten is_per_cpu_kthread()
  sched: Prepare to use balance_push in ttwu()
  workqueue: Restrict affinity change to rescuer
  workqueue: Tag bound workers with KTHREAD_IS_PER_CPU
  kthread: Extract KTHREAD_IS_PER_CPU
  sched: Don't run cpu-online with balance_push() enabled
  workqueue: Use cpu_possible_mask instead of cpu_active_mask to break affinity
  sched/core: Print out straggler tasks in sched_cpu_dying()
  x86: PM: Register syscore_ops for scale invariance
2 parents 025929f + 741ba80 commit 24c56ee

File tree

7 files changed

+151
-33
lines changed

7 files changed

+151
-33
lines changed

arch/x86/kernel/smpboot.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
#include <linux/numa.h>
5757
#include <linux/pgtable.h>
5858
#include <linux/overflow.h>
59+
#include <linux/syscore_ops.h>
5960

6061
#include <asm/acpi.h>
6162
#include <asm/desc.h>
@@ -2083,6 +2084,23 @@ static void init_counter_refs(void)
20832084
this_cpu_write(arch_prev_mperf, mperf);
20842085
}
20852086

2087+
#ifdef CONFIG_PM_SLEEP
2088+
static struct syscore_ops freq_invariance_syscore_ops = {
2089+
.resume = init_counter_refs,
2090+
};
2091+
2092+
static void register_freq_invariance_syscore_ops(void)
2093+
{
2094+
/* Bail out if registered already. */
2095+
if (freq_invariance_syscore_ops.node.prev)
2096+
return;
2097+
2098+
register_syscore_ops(&freq_invariance_syscore_ops);
2099+
}
2100+
#else
2101+
static inline void register_freq_invariance_syscore_ops(void) {}
2102+
#endif
2103+
20862104
static void init_freq_invariance(bool secondary, bool cppc_ready)
20872105
{
20882106
bool ret = false;
@@ -2109,6 +2127,7 @@ static void init_freq_invariance(bool secondary, bool cppc_ready)
21092127
if (ret) {
21102128
init_counter_refs();
21112129
static_branch_enable(&arch_scale_freq_key);
2130+
register_freq_invariance_syscore_ops();
21122131
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
21132132
} else {
21142133
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");

include/linux/kthread.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
3333
unsigned int cpu,
3434
const char *namefmt);
3535

36+
void kthread_set_per_cpu(struct task_struct *k, int cpu);
37+
bool kthread_is_per_cpu(struct task_struct *k);
38+
3639
/**
3740
* kthread_run - create and wake a thread.
3841
* @threadfn: the function to run until signal_pending(current).

kernel/kthread.c

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -493,11 +493,36 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
493493
return p;
494494
kthread_bind(p, cpu);
495495
/* CPU hotplug need to bind once again when unparking the thread. */
496-
set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
497496
to_kthread(p)->cpu = cpu;
498497
return p;
499498
}
500499

500+
void kthread_set_per_cpu(struct task_struct *k, int cpu)
501+
{
502+
struct kthread *kthread = to_kthread(k);
503+
if (!kthread)
504+
return;
505+
506+
WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
507+
508+
if (cpu < 0) {
509+
clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
510+
return;
511+
}
512+
513+
kthread->cpu = cpu;
514+
set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
515+
}
516+
517+
bool kthread_is_per_cpu(struct task_struct *k)
518+
{
519+
struct kthread *kthread = to_kthread(k);
520+
if (!kthread)
521+
return false;
522+
523+
return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
524+
}
525+
501526
/**
502527
* kthread_unpark - unpark a thread created by kthread_create().
503528
* @k: thread created by kthread_create().

kernel/sched/core.c

Lines changed: 88 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1796,13 +1796,28 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
17961796
*/
17971797
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
17981798
{
1799+
/* When not in the task's cpumask, no point in looking further. */
17991800
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
18001801
return false;
18011802

1802-
if (is_per_cpu_kthread(p) || is_migration_disabled(p))
1803+
/* migrate_disabled() must be allowed to finish. */
1804+
if (is_migration_disabled(p))
18031805
return cpu_online(cpu);
18041806

1805-
return cpu_active(cpu);
1807+
/* Non kernel threads are not allowed during either online or offline. */
1808+
if (!(p->flags & PF_KTHREAD))
1809+
return cpu_active(cpu);
1810+
1811+
/* KTHREAD_IS_PER_CPU is always allowed. */
1812+
if (kthread_is_per_cpu(p))
1813+
return cpu_online(cpu);
1814+
1815+
/* Regular kernel threads don't get to stay during offline. */
1816+
if (cpu_rq(cpu)->balance_push)
1817+
return false;
1818+
1819+
/* But are allowed during online. */
1820+
return cpu_online(cpu);
18061821
}
18071822

18081823
/*
@@ -2327,7 +2342,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
23272342

23282343
if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
23292344
/*
2330-
* Kernel threads are allowed on online && !active CPUs.
2345+
* Kernel threads are allowed on online && !active CPUs,
2346+
* however, during cpu-hot-unplug, even these might get pushed
2347+
* away if not KTHREAD_IS_PER_CPU.
23312348
*
23322349
* Specifically, migration_disabled() tasks must not fail the
23332350
* cpumask_any_and_distribute() pick below, esp. so on
@@ -2371,16 +2388,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
23712388

23722389
__do_set_cpus_allowed(p, new_mask, flags);
23732390

2374-
if (p->flags & PF_KTHREAD) {
2375-
/*
2376-
* For kernel threads that do indeed end up on online &&
2377-
* !active we want to ensure they are strict per-CPU threads.
2378-
*/
2379-
WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
2380-
!cpumask_intersects(new_mask, cpu_active_mask) &&
2381-
p->nr_cpus_allowed != 1);
2382-
}
2383-
23842391
return affine_move_task(rq, p, &rf, dest_cpu, flags);
23852392

23862393
out:
@@ -3121,6 +3128,13 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
31213128

31223129
static inline bool ttwu_queue_cond(int cpu, int wake_flags)
31233130
{
3131+
/*
3132+
* Do not complicate things with the async wake_list while the CPU is
3133+
* in hotplug state.
3134+
*/
3135+
if (!cpu_active(cpu))
3136+
return false;
3137+
31243138
/*
31253139
* If the CPU does not share cache, then queue the task on the
31263140
* remote rqs wakelist to avoid accessing remote data.
@@ -7276,8 +7290,14 @@ static void balance_push(struct rq *rq)
72767290
/*
72777291
* Both the cpu-hotplug and stop task are in this case and are
72787292
* required to complete the hotplug process.
7293+
*
7294+
* XXX: the idle task does not match kthread_is_per_cpu() due to
7295+
* histerical raisins.
72797296
*/
7280-
if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
7297+
if (rq->idle == push_task ||
7298+
((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
7299+
is_migration_disabled(push_task)) {
7300+
72817301
/*
72827302
* If this is the idle task on the outgoing CPU try to wake
72837303
* up the hotplug control thread which might wait for the
@@ -7309,7 +7329,7 @@ static void balance_push(struct rq *rq)
73097329
/*
73107330
* At this point need_resched() is true and we'll take the loop in
73117331
* schedule(). The next pick is obviously going to be the stop task
7312-
* which is_per_cpu_kthread() and will push this task away.
7332+
* which kthread_is_per_cpu() and will push this task away.
73137333
*/
73147334
raw_spin_lock(&rq->lock);
73157335
}
@@ -7320,10 +7340,13 @@ static void balance_push_set(int cpu, bool on)
73207340
struct rq_flags rf;
73217341

73227342
rq_lock_irqsave(rq, &rf);
7323-
if (on)
7343+
rq->balance_push = on;
7344+
if (on) {
7345+
WARN_ON_ONCE(rq->balance_callback);
73247346
rq->balance_callback = &balance_push_callback;
7325-
else
7347+
} else if (rq->balance_callback == &balance_push_callback) {
73267348
rq->balance_callback = NULL;
7349+
}
73277350
rq_unlock_irqrestore(rq, &rf);
73287351
}
73297352

@@ -7441,6 +7464,10 @@ int sched_cpu_activate(unsigned int cpu)
74417464
struct rq *rq = cpu_rq(cpu);
74427465
struct rq_flags rf;
74437466

7467+
/*
7468+
* Make sure that when the hotplug state machine does a roll-back
7469+
* we clear balance_push. Ideally that would happen earlier...
7470+
*/
74447471
balance_push_set(cpu, false);
74457472

74467473
#ifdef CONFIG_SCHED_SMT
@@ -7483,17 +7510,27 @@ int sched_cpu_deactivate(unsigned int cpu)
74837510
int ret;
74847511

74857512
set_cpu_active(cpu, false);
7513+
7514+
/*
7515+
* From this point forward, this CPU will refuse to run any task that
7516+
* is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
7517+
* push those tasks away until this gets cleared, see
7518+
* sched_cpu_dying().
7519+
*/
7520+
balance_push_set(cpu, true);
7521+
74867522
/*
7487-
* We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
7488-
* users of this state to go away such that all new such users will
7489-
* observe it.
7523+
* We've cleared cpu_active_mask / set balance_push, wait for all
7524+
* preempt-disabled and RCU users of this state to go away such that
7525+
* all new such users will observe it.
7526+
*
7527+
* Specifically, we rely on ttwu to no longer target this CPU, see
7528+
* ttwu_queue_cond() and is_cpu_allowed().
74907529
*
74917530
* Do sync before park smpboot threads to take care the rcu boost case.
74927531
*/
74937532
synchronize_rcu();
74947533

7495-
balance_push_set(cpu, true);
7496-
74977534
rq_lock_irqsave(rq, &rf);
74987535
if (rq->rd) {
74997536
update_rq_clock(rq);
@@ -7574,6 +7611,25 @@ static void calc_load_migrate(struct rq *rq)
75747611
atomic_long_add(delta, &calc_load_tasks);
75757612
}
75767613

7614+
static void dump_rq_tasks(struct rq *rq, const char *loglvl)
7615+
{
7616+
struct task_struct *g, *p;
7617+
int cpu = cpu_of(rq);
7618+
7619+
lockdep_assert_held(&rq->lock);
7620+
7621+
printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
7622+
for_each_process_thread(g, p) {
7623+
if (task_cpu(p) != cpu)
7624+
continue;
7625+
7626+
if (!task_on_rq_queued(p))
7627+
continue;
7628+
7629+
printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
7630+
}
7631+
}
7632+
75777633
int sched_cpu_dying(unsigned int cpu)
75787634
{
75797635
struct rq *rq = cpu_rq(cpu);
@@ -7583,9 +7639,18 @@ int sched_cpu_dying(unsigned int cpu)
75837639
sched_tick_stop(cpu);
75847640

75857641
rq_lock_irqsave(rq, &rf);
7586-
BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
7642+
if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
7643+
WARN(true, "Dying CPU not properly vacated!");
7644+
dump_rq_tasks(rq, KERN_WARNING);
7645+
}
75877646
rq_unlock_irqrestore(rq, &rf);
75887647

7648+
/*
7649+
* Now that the CPU is offline, make sure we're welcome
7650+
* to new tasks once we come back up.
7651+
*/
7652+
balance_push_set(cpu, false);
7653+
75897654
calc_load_migrate(rq);
75907655
update_max_interval();
75917656
nohz_balance_exit_idle(rq);

kernel/sched/sched.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -975,6 +975,7 @@ struct rq {
975975
unsigned long cpu_capacity_orig;
976976

977977
struct callback_head *balance_callback;
978+
unsigned char balance_push;
978979

979980
unsigned char nohz_idle_balance;
980981
unsigned char idle_balance;

kernel/smpboot.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
188188
kfree(td);
189189
return PTR_ERR(tsk);
190190
}
191+
kthread_set_per_cpu(tsk, cpu);
191192
/*
192193
* Park the thread so that it could start right on the CPU
193194
* when it is available.

kernel/workqueue.c

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1848,19 +1848,18 @@ static void worker_attach_to_pool(struct worker *worker,
18481848
{
18491849
mutex_lock(&wq_pool_attach_mutex);
18501850

1851-
/*
1852-
* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1853-
* online CPUs. It'll be re-applied when any of the CPUs come up.
1854-
*/
1855-
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1856-
18571851
/*
18581852
* The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
18591853
* stable across this function. See the comments above the flag
18601854
* definition for details.
18611855
*/
18621856
if (pool->flags & POOL_DISASSOCIATED)
18631857
worker->flags |= WORKER_UNBOUND;
1858+
else
1859+
kthread_set_per_cpu(worker->task, pool->cpu);
1860+
1861+
if (worker->rescue_wq)
1862+
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
18641863

18651864
list_add_tail(&worker->node, &pool->workers);
18661865
worker->pool = pool;
@@ -1883,6 +1882,7 @@ static void worker_detach_from_pool(struct worker *worker)
18831882

18841883
mutex_lock(&wq_pool_attach_mutex);
18851884

1885+
kthread_set_per_cpu(worker->task, -1);
18861886
list_del(&worker->node);
18871887
worker->pool = NULL;
18881888

@@ -4919,8 +4919,10 @@ static void unbind_workers(int cpu)
49194919

49204920
raw_spin_unlock_irq(&pool->lock);
49214921

4922-
for_each_pool_worker(worker, pool)
4923-
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
4922+
for_each_pool_worker(worker, pool) {
4923+
kthread_set_per_cpu(worker->task, -1);
4924+
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
4925+
}
49244926

49254927
mutex_unlock(&wq_pool_attach_mutex);
49264928

@@ -4972,9 +4974,11 @@ static void rebind_workers(struct worker_pool *pool)
49724974
* of all workers first and then clear UNBOUND. As we're called
49734975
* from CPU_ONLINE, the following shouldn't fail.
49744976
*/
4975-
for_each_pool_worker(worker, pool)
4977+
for_each_pool_worker(worker, pool) {
4978+
kthread_set_per_cpu(worker->task, pool->cpu);
49764979
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
49774980
pool->attrs->cpumask) < 0);
4981+
}
49784982

49794983
raw_spin_lock_irq(&pool->lock);
49804984

0 commit comments

Comments
 (0)