Skip to content

Commit 8881e16

Browse files
Barry Song authored and Peter Zijlstra committed
sched/fair: Scan cluster before scanning LLC in wake-up path
For platforms having clusters like Kunpeng920, CPUs within the same cluster have lower latency when synchronizing and accessing shared resources like cache. Thus, this patch tries to find an idle cpu within the cluster of the target CPU before scanning the whole LLC to gain lower latency. This will be implemented in 2 steps in select_idle_sibling(): 1. When the prev_cpu/recent_used_cpu are good wakeup candidates, use them if they're sharing cluster with the target CPU. Otherwise trying to scan for an idle CPU in the target's cluster. 2. Scanning the cluster prior to the LLC of the target CPU for an idle CPU to wakeup. Testing has been done on Kunpeng920 by pinning tasks to one numa and two numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs. With this patch, We noticed enhancement on tbench and netperf within one numa or cross two numa on top of tip-sched-core commit 9b46f1a ("sched/debug: Print 'tgid' in sched_show_task()") tbench results (node 0): baseline patched 1: 327.2833 372.4623 ( 13.80%) 4: 1320.5933 1479.8833 ( 12.06%) 8: 2638.4867 2921.5267 ( 10.73%) 16: 5282.7133 5891.5633 ( 11.53%) 32: 9810.6733 9877.3400 ( 0.68%) 64: 7408.9367 7447.9900 ( 0.53%) 128: 6203.2600 6191.6500 ( -0.19%) tbench results (node 0-1): baseline patched 1: 332.0433 372.7223 ( 12.25%) 4: 1325.4667 1477.6733 ( 11.48%) 8: 2622.9433 2897.9967 ( 10.49%) 16: 5218.6100 5878.2967 ( 12.64%) 32: 10211.7000 11494.4000 ( 12.56%) 64: 13313.7333 16740.0333 ( 25.74%) 128: 13959.1000 14533.9000 ( 4.12%) netperf results TCP_RR (node 0): baseline patched 1: 76546.5033 90649.9867 ( 18.42%) 4: 77292.4450 90932.7175 ( 17.65%) 8: 77367.7254 90882.3467 ( 17.47%) 16: 78519.9048 90938.8344 ( 15.82%) 32: 72169.5035 72851.6730 ( 0.95%) 64: 25911.2457 25882.2315 ( -0.11%) 128: 10752.6572 10768.6038 ( 0.15%) netperf results TCP_RR (node 0-1): baseline patched 1: 76857.6667 90892.2767 ( 18.26%) 4: 78236.6475 90767.3017 ( 16.02%) 8: 77929.6096 90684.1633 ( 16.37%) 16: 77438.5873 90502.5787 
( 16.87%) 32: 74205.6635 88301.5612 ( 19.00%) 64: 69827.8535 71787.6706 ( 2.81%) 128: 25281.4366 25771.3023 ( 1.94%) netperf results UDP_RR (node 0): baseline patched 1: 96869.8400 110800.8467 ( 14.38%) 4: 97744.9750 109680.5425 ( 12.21%) 8: 98783.9863 110409.9637 ( 11.77%) 16: 99575.0235 110636.2435 ( 11.11%) 32: 95044.7250 97622.8887 ( 2.71%) 64: 32925.2146 32644.4991 ( -0.85%) 128: 12859.2343 12824.0051 ( -0.27%) netperf results UDP_RR (node 0-1): baseline patched 1: 97202.4733 110190.1200 ( 13.36%) 4: 95954.0558 106245.7258 ( 10.73%) 8: 96277.1958 105206.5304 ( 9.27%) 16: 97692.7810 107927.2125 ( 10.48%) 32: 79999.6702 103550.2999 ( 29.44%) 64: 80592.7413 87284.0856 ( 8.30%) 128: 27701.5770 29914.5820 ( 7.99%) Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so the SMT branch in the code has not been tested but it supposed to work. Chen Yu also noticed this will improve the performance of tbench and netperf on a 24 CPUs Jacobsville machine, there are 4 CPUs in one cluster sharing L2 Cache. [https://lore.kernel.org/lkml/[email protected]] Suggested-by: Peter Zijlstra <[email protected]> Signed-off-by: Barry Song <[email protected]> Signed-off-by: Yicong Yang <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Reviewed-by: Tim Chen <[email protected]> Reviewed-by: Chen Yu <[email protected]> Reviewed-by: Gautham R. Shenoy <[email protected]> Reviewed-by: Vincent Guittot <[email protected]> Tested-and-reviewed-by: Chen Yu <[email protected]> Tested-by: Yicong Yang <[email protected]> Link: https://lkml.kernel.org/r/[email protected]
1 parent b95303e commit 8881e16

File tree

3 files changed

+49
-4
lines changed

3 files changed

+49
-4
lines changed

kernel/sched/fair.c

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7259,14 +7259,38 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
72597259
}
72607260
}
72617261

7262+
if (static_branch_unlikely(&sched_cluster_active)) {
7263+
struct sched_group *sg = sd->groups;
7264+
7265+
if (sg->flags & SD_CLUSTER) {
7266+
for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
7267+
if (!cpumask_test_cpu(cpu, cpus))
7268+
continue;
7269+
7270+
if (has_idle_core) {
7271+
i = select_idle_core(p, cpu, cpus, &idle_cpu);
7272+
if ((unsigned int)i < nr_cpumask_bits)
7273+
return i;
7274+
} else {
7275+
if (--nr <= 0)
7276+
return -1;
7277+
idle_cpu = __select_idle_cpu(cpu, p);
7278+
if ((unsigned int)idle_cpu < nr_cpumask_bits)
7279+
return idle_cpu;
7280+
}
7281+
}
7282+
cpumask_andnot(cpus, cpus, sched_group_span(sg));
7283+
}
7284+
}
7285+
72627286
for_each_cpu_wrap(cpu, cpus, target + 1) {
72637287
if (has_idle_core) {
72647288
i = select_idle_core(p, cpu, cpus, &idle_cpu);
72657289
if ((unsigned int)i < nr_cpumask_bits)
72667290
return i;
72677291

72687292
} else {
7269-
if (!--nr)
7293+
if (--nr <= 0)
72707294
return -1;
72717295
idle_cpu = __select_idle_cpu(cpu, p);
72727296
if ((unsigned int)idle_cpu < nr_cpumask_bits)
@@ -7395,8 +7419,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
73957419
*/
73967420
if (prev != target && cpus_share_cache(prev, target) &&
73977421
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
7398-
asym_fits_cpu(task_util, util_min, util_max, prev))
7399-
return prev;
7422+
asym_fits_cpu(task_util, util_min, util_max, prev)) {
7423+
7424+
if (!static_branch_unlikely(&sched_cluster_active) ||
7425+
cpus_share_resources(prev, target))
7426+
return prev;
7427+
}
74007428

74017429
/*
74027430
* Allow a per-cpu kthread to stack with the wakee if the
@@ -7423,7 +7451,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
74237451
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
74247452
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
74257453
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
7426-
return recent_used_cpu;
7454+
7455+
if (!static_branch_unlikely(&sched_cluster_active) ||
7456+
cpus_share_resources(recent_used_cpu, target))
7457+
return recent_used_cpu;
7458+
74277459
}
74287460

74297461
/*

kernel/sched/sched.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1859,6 +1859,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
18591859
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
18601860
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
18611861
extern struct static_key_false sched_asym_cpucapacity;
1862+
extern struct static_key_false sched_cluster_active;
18621863

18631864
static __always_inline bool sched_asym_cpucap_active(void)
18641865
{

kernel/sched/topology.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
673673
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
674674
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
675675
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
676+
676677
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
678+
DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
677679

678680
static void update_top_cache_domain(int cpu)
679681
{
@@ -2386,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
23862388
struct rq *rq = NULL;
23872389
int i, ret = -ENOMEM;
23882390
bool has_asym = false;
2391+
bool has_cluster = false;
23892392

23902393
if (WARN_ON(cpumask_empty(cpu_map)))
23912394
goto error;
@@ -2514,12 +2517,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
25142517
WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
25152518

25162519
cpu_attach_domain(sd, d.rd, i);
2520+
2521+
if (lowest_flag_domain(i, SD_CLUSTER))
2522+
has_cluster = true;
25172523
}
25182524
rcu_read_unlock();
25192525

25202526
if (has_asym)
25212527
static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
25222528

2529+
if (has_cluster)
2530+
static_branch_inc_cpuslocked(&sched_cluster_active);
2531+
25232532
if (rq && sched_debug_verbose) {
25242533
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
25252534
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
@@ -2619,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
26192628
if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
26202629
static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
26212630

2631+
if (static_branch_unlikely(&sched_cluster_active))
2632+
static_branch_dec_cpuslocked(&sched_cluster_active);
2633+
26222634
rcu_read_lock();
26232635
for_each_cpu(i, cpu_map)
26242636
cpu_attach_domain(NULL, &def_root_domain, i);

0 commit comments

Comments
 (0)