
Commit d1a8919

Author: Frederic Weisbecker (committed)
kthread: Default affine kthread to its preferred NUMA node
Kthreads attached to a preferred NUMA node for their task structure allocation can also be assumed to run preferably within that same node.

A more precise affinity is usually notified by calling kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup.

For the others, a default affinity to the node is desired and is sometimes implemented, with more or less success, when it comes to dealing with hotplug events and nohz_full / CPU isolation interactions:

- kcompactd is affine to its node and handles hotplug but not CPU isolation
- kswapd is affine to its node and ignores both hotplug and CPU isolation
- A bunch of drivers create their kthreads on a specific node and don't affine them any further.

Handle that default node affinity preference at the generic level instead, provided the kthread is created on an actual node and no specific affinity, such as a given CPU or a custom cpumask to bind to, is applied before its first wake-up.

This generic handling is aware of CPU hotplug events and CPU isolation such that:

* When a housekeeping CPU that is part of the node of a given kthread goes up, the related task is re-affined to its own node if it was previously running on the default last-resort set of online housekeeping CPUs from other nodes.

* When a housekeeping CPU that is part of the node of a kthread goes down, the running task is migrated (or the sleeping task is woken up) automatically by the scheduler to other housekeepers within the same node or, as a last resort, to all housekeepers from other nodes.

Acked-by: Vlastimil Babka <[email protected]>
Signed-off-by: Frederic Weisbecker <[email protected]>
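For illustration only (not part of this commit): a minimal sketch of what the new default means for a driver that allocates its worker thread close to a device's NUMA node. The foo_* names are hypothetical; the point is that kthread_create_on_node() with a real node is now enough to get a node-scoped default affinity, with no manual cpumask handling.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Hypothetical worker loop, only here to keep the sketch self-contained. */
static int foo_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *foo_start_worker(int node)
{
	struct task_struct *tsk;

	/*
	 * The task structure is allocated on @node; with this commit the
	 * kthread's first wakeup also defaults its affinity to the
	 * housekeeping CPUs of @node (falling back to all housekeeping
	 * CPUs if the node has none online).
	 */
	tsk = kthread_create_on_node(foo_thread_fn, NULL, node, "foo/%d", node);
	if (IS_ERR(tsk))
		return tsk;

	/* No manual set_cpus_allowed_ptr(tsk, cpumask_of_node(node)) needed. */
	wake_up_process(tsk);
	return tsk;
}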
1 parent 5eacb68 commit d1a8919

2 files changed, 106 additions (+) and 1 deletion (-)


include/linux/cpuhotplug.h

Lines changed: 1 addition & 0 deletions
@@ -240,6 +240,7 @@ enum cpuhp_state {
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RANDOM_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
+	CPUHP_AP_KTHREADS_ONLINE,
 	CPUHP_AP_BASE_CACHEINFO_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 40,

kernel/kthread.c

Lines changed: 105 additions & 1 deletion
@@ -35,6 +35,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
+static LIST_HEAD(kthreads_hotplug);
+static DEFINE_MUTEX(kthreads_hotplug_lock);
+
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */
@@ -53,6 +56,7 @@ struct kthread_create_info
 struct kthread {
 	unsigned long flags;
 	unsigned int cpu;
+	unsigned int node;
 	int started;
 	int result;
 	int (*threadfn)(void *);
@@ -64,6 +68,8 @@ struct kthread {
 #endif
 	/* To store the full name if task comm is truncated. */
 	char *full_name;
+	struct task_struct *task;
+	struct list_head hotplug_node;
 };
 
 enum KTHREAD_BITS {
@@ -122,8 +128,11 @@ bool set_kthread_struct(struct task_struct *p)
 
 	init_completion(&kthread->exited);
 	init_completion(&kthread->parked);
+	INIT_LIST_HEAD(&kthread->hotplug_node);
 	p->vfork_done = &kthread->exited;
 
+	kthread->task = p;
+	kthread->node = tsk_fork_get_node(current);
 	p->worker_private = kthread;
 	return true;
 }
@@ -314,6 +323,11 @@ void __noreturn kthread_exit(long result)
 {
 	struct kthread *kthread = to_kthread(current);
 	kthread->result = result;
+	if (!list_empty(&kthread->hotplug_node)) {
+		mutex_lock(&kthreads_hotplug_lock);
+		list_del(&kthread->hotplug_node);
+		mutex_unlock(&kthreads_hotplug_lock);
+	}
 	do_exit(0);
 }
 EXPORT_SYMBOL(kthread_exit);
@@ -339,6 +353,48 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
 }
 EXPORT_SYMBOL(kthread_complete_and_exit);
 
+static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask)
+{
+	cpumask_and(cpumask, cpumask_of_node(kthread->node),
+		    housekeeping_cpumask(HK_TYPE_KTHREAD));
+
+	if (cpumask_empty(cpumask))
+		cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+}
+
+static void kthread_affine_node(void)
+{
+	struct kthread *kthread = to_kthread(current);
+	cpumask_var_t affinity;
+
+	WARN_ON_ONCE(kthread_is_per_cpu(current));
+
+	if (kthread->node == NUMA_NO_NODE) {
+		housekeeping_affine(current, HK_TYPE_KTHREAD);
+	} else {
+		if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
+			WARN_ON_ONCE(1);
+			return;
+		}
+
+		mutex_lock(&kthreads_hotplug_lock);
+		WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+		list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+		/*
+		 * The node cpumask is racy when read from kthread() but:
+		 * - a racing CPU going down will either fail on the subsequent
+		 *   call to set_cpus_allowed_ptr() or be migrated to housekeepers
+		 *   afterwards by the scheduler.
+		 * - a racing CPU going up will be handled by kthreads_online_cpu()
+		 */
+		kthread_fetch_affinity(kthread, affinity);
+		set_cpus_allowed_ptr(current, affinity);
+		mutex_unlock(&kthreads_hotplug_lock);
+
+		free_cpumask_var(affinity);
+	}
+}
+
 static int kthread(void *_create)
 {
 	static const struct sched_param param = { .sched_priority = 0 };
@@ -369,7 +425,6 @@ static int kthread(void *_create)
 	 * back to default in case they have been changed.
 	 */
 	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
-	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -385,6 +440,9 @@ static int kthread(void *_create)
 
 	self->started = 1;
 
+	if (!(current->flags & PF_NO_SETAFFINITY))
+		kthread_affine_node();
+
 	ret = -EINTR;
 	if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
 		cgroup_kthread_ready();
@@ -781,6 +839,52 @@ int kthreadd(void *unused)
 	return 0;
 }
 
+/*
+ * Re-affine kthreads according to their preferences
+ * and the newly online CPU. The CPU down part is handled
+ * by select_fallback_rq() which default re-affines to
+ * housekeepers in case the preferred affinity doesn't
+ * apply anymore.
+ */
+static int kthreads_online_cpu(unsigned int cpu)
+{
+	cpumask_var_t affinity;
+	struct kthread *k;
+	int ret;
+
+	guard(mutex)(&kthreads_hotplug_lock);
+
+	if (list_empty(&kthreads_hotplug))
+		return 0;
+
+	if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = 0;
+
+	list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+		if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
+				 kthread_is_per_cpu(k->task) ||
+				 k->node == NUMA_NO_NODE)) {
+			ret = -EINVAL;
+			continue;
+		}
+		kthread_fetch_affinity(k, affinity);
+		set_cpus_allowed_ptr(k->task, affinity);
+	}
+
+	free_cpumask_var(affinity);
+
+	return ret;
+}
+
+static int kthreads_init(void)
+{
+	return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
+				 kthreads_online_cpu, NULL);
+}
+early_initcall(kthreads_init);
+
 void __kthread_init_worker(struct kthread_worker *worker,
 			   const char *name,
 			   struct lock_class_key *key)
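A hedged counter-example, also not part of the commit: explicitly bound kthreads keep their affinity. kthread_bind() (and kthread_create_on_cpu(), which uses it) marks the task PF_NO_SETAFFINITY before the first wakeup, which is exactly the flag kthread() checks before calling kthread_affine_node(), so the node default never overrides an explicit binding. The baz_* names below are hypothetical.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Hypothetical thread function, only here to keep the sketch self-contained. */
static int baz_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *baz_start_pinned(unsigned int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(baz_thread_fn, NULL, "baz/%u", cpu);
	if (IS_ERR(tsk))
		return tsk;

	/*
	 * kthread_bind() sets PF_NO_SETAFFINITY, so the generic
	 * kthread_affine_node() path above is skipped and the explicit
	 * per-CPU binding survives the first wakeup.
	 */
	kthread_bind(tsk, cpu);
	wake_up_process(tsk);
	return tsk;
}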
