
Commit 860a452

arighi authored and htejun committed
sched_ext: Introduce NUMA awareness to the default idle selection policy
Similarly to commit dfa4ed2 ("sched_ext: Introduce LLC awareness to the default idle selection policy"), extend the built-in idle CPU selection policy to also prioritize CPUs within the same NUMA node.

With this change applied, the built-in CPU idle selection policy follows this logic:

 - always prioritize CPUs from fully idle SMT cores,
 - select the same CPU if possible,
 - select a CPU within the same LLC domain,
 - select a CPU within the same NUMA node.

Both NUMA and LLC awareness features are enabled only when the system has multiple NUMA nodes or multiple LLC domains.

In the future, we may want to improve the NUMA node selection to account for the node distance from prev_cpu. Currently, the logic only tries to keep tasks running on the same NUMA node. If all CPUs within a node are busy, the next NUMA node is chosen randomly.

Signed-off-by: Andrea Righi <[email protected]>
Signed-off-by: Tejun Heo <[email protected]>
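For illustration only, the fallback order above can be condensed into the following C sketch. It is not the kernel code: pick_idle(), is_idle() and PICK_IDLE_CORE are hypothetical placeholders standing in for scx_pick_idle_cpu(), the per-CPU idle-mask test and SCX_PICK_IDLE_CORE, and the WAKE_SYNC shortcut and other SMT details are omitted.

/*
 * Condensed sketch of the built-in idle selection order (illustrative only;
 * pick_idle(), is_idle() and PICK_IDLE_CORE are hypothetical placeholders).
 */
static s32 select_cpu_sketch(s32 prev_cpu,
			     const struct cpumask *llc_cpus,  /* same-LLC CPUs, or NULL */
			     const struct cpumask *numa_cpus, /* same-node CPUs, or NULL */
			     const struct cpumask *allowed)   /* p->cpus_ptr */
{
	s32 cpu;

	/* 1) fully idle SMT cores, searched from the closest domain outward */
	if (llc_cpus && (cpu = pick_idle(llc_cpus, PICK_IDLE_CORE)) >= 0)
		return cpu;
	if (numa_cpus && (cpu = pick_idle(numa_cpus, PICK_IDLE_CORE)) >= 0)
		return cpu;
	if ((cpu = pick_idle(allowed, PICK_IDLE_CORE)) >= 0)
		return cpu;

	/* 2) reuse prev_cpu if it is still idle */
	if (is_idle(prev_cpu))
		return prev_cpu;

	/* 3) any idle CPU in the same LLC domain */
	if (llc_cpus && (cpu = pick_idle(llc_cpus, 0)) >= 0)
		return cpu;

	/* 4) any idle CPU in the same NUMA node */
	if (numa_cpus && (cpu = pick_idle(numa_cpus, 0)) >= 0)
		return cpu;

	/* otherwise, any idle CPU allowed for the task (or an error) */
	return pick_idle(allowed, 0);
}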
1 parent b7d0bbc commit 860a452

File tree

1 file changed (+119, -17 lines)


kernel/sched/ext.c

Lines changed: 119 additions & 17 deletions
@@ -870,6 +870,11 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 
+#ifdef CONFIG_SMP
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+#endif
+
 static struct static_key_false scx_has_op[SCX_OPI_END] =
 	{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };

@@ -3124,39 +3129,88 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
 		goto retry;
 }
 
-#ifdef CONFIG_SCHED_MC
 /*
- * Return the cpumask of CPUs usable by task @p in the same LLC domain of @cpu,
- * or NULL if the LLC domain cannot be determined.
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
  */
-static const struct cpumask *llc_domain(const struct task_struct *p, s32 cpu)
+static void update_selcpu_topology(void)
 {
-	struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu));
-	const struct cpumask *llc_cpus = sd ? sched_domain_span(sd) : NULL;
+	bool enable_llc = false, enable_numa = false;
+	struct sched_domain *sd;
+	const struct cpumask *cpus;
+	s32 cpu = cpumask_first(cpu_online_mask);
 
 	/*
-	 * Return the LLC domain only if the task is allowed to run on all
-	 * CPUs.
+	 * We only need to check the NUMA node and LLC domain of the first
+	 * available CPU to determine if they cover all CPUs.
+	 *
+	 * If all CPUs belong to the same NUMA node or share the same LLC
+	 * domain, enabling NUMA or LLC optimizations is unnecessary.
+	 * Otherwise, these optimizations can be enabled.
 	 */
-	return p->nr_cpus_allowed == nr_cpu_ids ? llc_cpus : NULL;
-}
-#else /* CONFIG_SCHED_MC */
-static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu)
-{
-	return NULL;
+	rcu_read_lock();
+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
+	if (sd) {
+		cpus = sched_domain_span(sd);
+		if (cpumask_weight(cpus) < num_possible_cpus())
+			enable_llc = true;
+	}
+	sd = highest_flag_domain(cpu, SD_NUMA);
+	if (sd) {
+		cpus = sched_group_span(sd->groups);
+		if (cpumask_weight(cpus) < num_possible_cpus())
+			enable_numa = true;
+	}
+	rcu_read_unlock();
+
+	pr_debug("sched_ext: LLC idle selection %s\n",
+		 enable_llc ? "enabled" : "disabled");
+	pr_debug("sched_ext: NUMA idle selection %s\n",
+		 enable_numa ? "enabled" : "disabled");
+
+	if (enable_llc)
+		static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+	else
+		static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+	if (enable_numa)
+		static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+	else
+		static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
 }
-#endif /* CONFIG_SCHED_MC */
 
 /*
- * Built-in cpu idle selection policy.
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ *   - always prioritize CPUs from fully idle cores (both logical CPUs are
+ *     idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ *   - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ *     branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ *   - if the above conditions aren't met, pick a CPU that shares the same LLC
+ *     to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ *   - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * Step 3 and 4 are performed only if the system has, respectively, multiple
+ * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa).
  *
  * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
  * we never call ops.select_cpu() for them, see select_task_rq().
  */
 static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 			      u64 wake_flags, bool *found)
 {
-	const struct cpumask *llc_cpus = llc_domain(p, prev_cpu);
+	const struct cpumask *llc_cpus = NULL;
+	const struct cpumask *numa_cpus = NULL;
 	s32 cpu;
 
 	*found = false;
@@ -3166,6 +3220,30 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 		return prev_cpu;
 	}
 
+	/*
+	 * Determine the scheduling domain only if the task is allowed to run
+	 * on all CPUs.
+	 *
+	 * This is done primarily for efficiency, as it avoids the overhead of
+	 * updating a cpumask every time we need to select an idle CPU (which
+	 * can be costly in large SMP systems), but it also aligns logically:
+	 * if a task's scheduling domain is restricted by user-space (through
+	 * CPU affinity), the task will simply use the flat scheduling domain
+	 * defined by user-space.
+	 */
+	if (p->nr_cpus_allowed >= num_possible_cpus()) {
+		if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+			numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
+
+		if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+			struct sched_domain *sd;
+
+			sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
+			if (sd)
+				llc_cpus = sched_domain_span(sd);
+		}
+	}
+
 	/*
 	 * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
 	 */
@@ -3226,6 +3304,15 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 				goto cpu_found;
 		}
 
+		/*
+		 * Search for any fully idle core in the same NUMA node.
+		 */
+		if (numa_cpus) {
+			cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
+			if (cpu >= 0)
+				goto cpu_found;
+		}
+
 		/*
 		 * Search for any full idle core usable by the task.
 		 */
@@ -3251,6 +3338,15 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 			goto cpu_found;
 	}
 
+	/*
+	 * Search for any idle CPU in the same NUMA node.
+	 */
+	if (numa_cpus) {
+		cpu = scx_pick_idle_cpu(numa_cpus, 0);
+		if (cpu >= 0)
+			goto cpu_found;
+	}
+
 	/*
 	 * Search for any idle CPU usable by the task.
 	 */
@@ -3383,6 +3479,9 @@ static void handle_hotplug(struct rq *rq, bool online)
 
 	atomic_long_inc(&scx_hotplug_seq);
 
+	if (scx_enabled())
+		update_selcpu_topology();
+
 	if (online && SCX_HAS_OP(cpu_online))
 		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
 	else if (!online && SCX_HAS_OP(cpu_offline))
@@ -5202,6 +5301,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
 
 	check_hotplug_seq(ops);
+#ifdef CONFIG_SMP
+	update_selcpu_topology();
+#endif
 	cpus_read_unlock();
 
 	ret = validate_ops(ops);
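A BPF scheduler picks up this policy by delegating to the default selection from its ops.select_cpu() callback (schedulers that do not implement ops.select_cpu() at all get the built-in behavior automatically). The sketch below is modeled on the scx example schedulers such as scx_simple and has not been built against this tree; treat it as illustrative.

/*
 * Illustrative ops.select_cpu() that defers to the built-in policy and
 * therefore inherits the LLC/NUMA-aware idle selection added here.
 * (Sketch modeled on the scx example schedulers, not part of this commit.)
 */
s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}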
