@@ -870,6 +870,11 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 
+#ifdef CONFIG_SMP
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+#endif
+
 static struct static_key_false scx_has_op[SCX_OPI_END] =
 	{ [0 ... SCX_OPI_END - 1] = STATIC_KEY_FALSE_INIT };
 
@@ -3124,39 +3129,88 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
 	goto retry;
 }
 
-#ifdef CONFIG_SCHED_MC
 /*
- * Return the cpumask of CPUs usable by task @p in the same LLC domain of @cpu,
- * or NULL if the LLC domain cannot be determined.
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
  */
-static const struct cpumask *llc_domain(const struct task_struct *p, s32 cpu)
+static void update_selcpu_topology(void)
 {
-	struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu));
-	const struct cpumask *llc_cpus = sd ? sched_domain_span(sd) : NULL;
+	bool enable_llc = false, enable_numa = false;
+	struct sched_domain *sd;
+	const struct cpumask *cpus;
+	s32 cpu = cpumask_first(cpu_online_mask);
 
 	/*
-	 * Return the LLC domain only if the task is allowed to run on all
-	 * CPUs.
+	 * We only need to check the NUMA node and LLC domain of the first
+	 * available CPU to determine if they cover all CPUs.
+	 *
+	 * If all CPUs belong to the same NUMA node or share the same LLC
+	 * domain, enabling NUMA or LLC optimizations is unnecessary.
+	 * Otherwise, these optimizations can be enabled.
 	 */
-	return p->nr_cpus_allowed == nr_cpu_ids ? llc_cpus : NULL;
-}
-#else /* CONFIG_SCHED_MC */
-static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu)
-{
-	return NULL;
+	rcu_read_lock();
+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
+	if (sd) {
+		cpus = sched_domain_span(sd);
+		if (cpumask_weight(cpus) < num_possible_cpus())
+			enable_llc = true;
+	}
+	sd = highest_flag_domain(cpu, SD_NUMA);
+	if (sd) {
+		cpus = sched_group_span(sd->groups);
+		if (cpumask_weight(cpus) < num_possible_cpus())
+			enable_numa = true;
+	}
+	rcu_read_unlock();
+
+	pr_debug("sched_ext: LLC idle selection %s\n",
+		 enable_llc ? "enabled" : "disabled");
+	pr_debug("sched_ext: NUMA idle selection %s\n",
+		 enable_numa ? "enabled" : "disabled");
+
+	if (enable_llc)
+		static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+	else
+		static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+	if (enable_numa)
+		static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+	else
+		static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
 }
-#endif /* CONFIG_SCHED_MC */
 
 /*
- * Built-in cpu idle selection policy.
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ *   - always prioritize CPUs from fully idle cores (both logical CPUs are
+ *     idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ *   - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ *     branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ *   - if the above conditions aren't met, pick a CPU that shares the same LLC
+ *     to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ *   - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * Steps 3 and 4 are performed only if the system has, respectively, multiple
+ * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa).
  *
  * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
  * we never call ops.select_cpu() for them, see select_task_rq().
  */
 static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 			      u64 wake_flags, bool *found)
 {
-	const struct cpumask *llc_cpus = llc_domain(p, prev_cpu);
+	const struct cpumask *llc_cpus = NULL;
+	const struct cpumask *numa_cpus = NULL;
 	s32 cpu;
 
 	*found = false;
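
The update_selcpu_topology() hunk above boils down to one predicate: a topology level is worth optimizing for only if the corresponding domain around the first online CPU does not already span every possible CPU. A minimal sketch of that predicate follows; the helper name is made up for illustration and is not part of the patch. As a worked example, on a hypothetical two-socket machine with two 32-CPU NUMA nodes, each forming a single LLC, the LLC span of CPU 0 weighs 32 while num_possible_cpus() is 64, so both static keys get enabled; on a single 16-CPU socket sharing one L3 cache the span weighs 16 and both keys stay disabled.

    #include <linux/cpumask.h>

    /* Hypothetical helper, for illustration only (the patch open-codes this check). */
    static bool topo_level_is_partial(const struct cpumask *span)
    {
            /* True when the domain covers only a subset of the machine. */
            return span && cpumask_weight(span) < num_possible_cpus();
    }
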
@@ -3166,6 +3220,30 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 		return prev_cpu;
 	}
 
+	/*
+	 * Determine the scheduling domain only if the task is allowed to run
+	 * on all CPUs.
+	 *
+	 * This is done primarily for efficiency, as it avoids the overhead of
+	 * updating a cpumask every time we need to select an idle CPU (which
+	 * can be costly in large SMP systems), but it also aligns logically:
+	 * if a task's scheduling domain is restricted by user-space (through
+	 * CPU affinity), the task will simply use the flat scheduling domain
+	 * defined by user-space.
+	 */
+	if (p->nr_cpus_allowed >= num_possible_cpus()) {
+		if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+			numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
+
+		if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+			struct sched_domain *sd;
+
+			sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
+			if (sd)
+				llc_cpus = sched_domain_span(sd);
+		}
+	}
+
 	/*
 	 * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
 	 */
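
With the static-key and Kconfig gating stripped away, the lookup added in the hunk above is intended to read roughly like the sketch below (hypothetical helper, illustration only; it assumes the caller holds rcu_read_lock(), as ops.select_cpu() callers do): a task that may run anywhere searches within prev_cpu's LLC and NUMA node first, while an affinity-restricted task keeps both masks NULL and falls back to the plain p->cpus_ptr scans.

    /* Hypothetical, un-gated restatement of the lookup above (illustration only). */
    static void lookup_topo_masks(struct task_struct *p, s32 prev_cpu,
                                  const struct cpumask **llc_cpus,
                                  const struct cpumask **numa_cpus)
    {
            struct sched_domain *sd;

            *llc_cpus = *numa_cpus = NULL;

            /* Affinity-restricted tasks use the flat domain set by user-space. */
            if (p->nr_cpus_allowed < num_possible_cpus())
                    return;

            *numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));

            sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
            if (sd)
                    *llc_cpus = sched_domain_span(sd);
    }
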
@@ -3226,6 +3304,15 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 				goto cpu_found;
 		}
 
+		/*
+		 * Search for any fully idle core in the same NUMA node.
+		 */
+		if (numa_cpus) {
+			cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
+			if (cpu >= 0)
+				goto cpu_found;
+		}
+
 		/*
 		 * Search for any full idle core usable by the task.
 		 */
@@ -3251,6 +3338,15 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 			goto cpu_found;
 	}
 
+	/*
+	 * Search for any idle CPU in the same NUMA node.
+	 */
+	if (numa_cpus) {
+		cpu = scx_pick_idle_cpu(numa_cpus, 0);
+		if (cpu >= 0)
+			goto cpu_found;
+	}
+
 	/*
 	 * Search for any idle CPU usable by the task.
 	 */
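
Putting the hunks together with the unchanged parts of scx_select_cpu_dfl() (not all of which are visible above), the resulting search order can be summarized roughly as follows; this is a condensed sketch, not the literal kernel code, and it leaves out the WAKE_SYNC fast path.

    /*
     * Condensed sketch of the resulting search order (illustration only):
     *
     *   if (sched_smt_active()) {
     *           keep prev_cpu if its whole core is idle;            // steps 1 + 2
     *           pick a fully idle core in llc_cpus, if set;         // steps 1 + 3
     *           pick a fully idle core in numa_cpus, if set;        // steps 1 + 4
     *           pick any fully idle core in p->cpus_ptr;            // step 1
     *   }
     *   keep prev_cpu if it is idle;                                // step 2
     *   pick an idle CPU in llc_cpus, if set;                       // step 3
     *   pick an idle CPU in numa_cpus, if set;                      // step 4
     *   pick any idle CPU in p->cpus_ptr;
     */
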
@@ -3383,6 +3479,9 @@ static void handle_hotplug(struct rq *rq, bool online)
 
 	atomic_long_inc(&scx_hotplug_seq);
 
+	if (scx_enabled())
+		update_selcpu_topology();
+
 	if (online && SCX_HAS_OP(cpu_online))
 		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
 	else if (!online && SCX_HAS_OP(cpu_offline))
@@ -5202,6 +5301,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
 
 	check_hotplug_seq(ops);
+#ifdef CONFIG_SMP
+	update_selcpu_topology();
+#endif
 	cpus_read_unlock();
 
 	ret = validate_ops(ops);
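
For context on how this policy is consumed: a BPF scheduler gets the default (now topology-aware) behavior by calling the scx_bpf_select_cpu_dfl() kfunc from its ops.select_cpu() callback. The sketch below is modeled on the in-tree scx_simple example; the surrounding skeleton, helper macros, and includes from tools/sched_ext are assumed.

    s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p,
                       s32 prev_cpu, u64 wake_flags)
    {
            bool is_idle = false;
            s32 cpu;

            /* Ask the built-in policy for an idle CPU near prev_cpu. */
            cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
            if (is_idle)
                    /* An idle CPU was found: queue directly on its local DSQ. */
                    scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

            return cpu;
    }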