
Commit 6843d6d

NickJackolson authored and Vasily Gorbik committed
s390/hiperdispatch: Introduce hiperdispatch
When an LPAR is in vertical polarization, its CPUs get different polarization values, namely vertical high, vertical medium and vertical low. These values represent the likelihood of a CPU getting physical runtime. Vertical high CPUs will always get runtime, and the others get varying runtime depending on the load the CEC is under. Vertical high and vertical medium CPUs are considered the CPUs which the current LPAR has the entitlement to run on. Vertical low CPUs, on the other hand, are borrowed CPUs which are only given to the LPAR by the hypervisor when the other LPARs are not utilizing them.

Using the CPU capacities, hint to the Linux scheduler when it should prioritise vertical high and vertical medium CPUs over vertical low CPUs. By tracking various system statistics, hiperdispatch determines when to adjust CPU capacities. After each adjustment, the scheduler domains must be rebuilt to notify the scheduler about the capacity changes, but since this operation is costly it should be done as rarely as possible.

Acked-by: Vasily Gorbik <[email protected]>
Co-developed-by: Tobias Huschle <[email protected]>
Signed-off-by: Tobias Huschle <[email protected]>
Signed-off-by: Mete Durlu <[email protected]>
Signed-off-by: Vasily Gorbik <[email protected]>
1 parent 26ceef5 commit 6843d6d
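The policy described in the message boils down to a threshold rule on steal time. As a rough standalone illustration (invented names and sample values; the actual kernel implementation follows in the diff below):

#include <stdio.h>

#define STEAL_THRESHOLD 30      /* percent; mirrors HD_STEAL_THRESHOLD below */

/*
 * Hypothetical helper: decide how many cores should get high capacity.
 * High steal means the LPAR is contending for physical CPUs, so fall
 * back to the entitled (vertical high + medium) cores only; low steal
 * means the borrowed vertical low cores are usable too.
 */
static int high_capacity_cores(int online, int entitled, unsigned long steal_pct)
{
	if (steal_pct >= STEAL_THRESHOLD)
		return entitled;
	return online;
}

int main(void)
{
	printf("%d\n", high_capacity_cores(12, 8, 10));  /* prints 12 */
	printf("%d\n", high_capacity_cores(12, 8, 45));  /* prints 8 */
	return 0;
}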

File tree

4 files changed, +228 −5 lines changed


arch/s390/include/asm/hiperdispatch.h

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2024
+ */
+
+#ifndef _ASM_HIPERDISPATCH_H
+#define _ASM_HIPERDISPATCH_H
+
+void hd_reset_state(void);
+void hd_add_core(int cpu);
+void hd_disable_hiperdispatch(void);
+int hd_enable_hiperdispatch(void);
+
+#endif /* _ASM_HIPERDISPATCH_H */

arch/s390/kernel/Makefile

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ obj-$(CONFIG_SYSFS) += nospec-sysfs.o
 CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE)

 obj-$(CONFIG_MODULES)		+= module.o
-obj-$(CONFIG_SCHED_TOPOLOGY)	+= topology.o
+obj-$(CONFIG_SCHED_TOPOLOGY)	+= topology.o hiperdispatch.o
 obj-$(CONFIG_NUMA)		+= numa.o
 obj-$(CONFIG_AUDIT)		+= audit.o
 compat-obj-$(CONFIG_AUDIT)	+= compat_audit.o

arch/s390/kernel/hiperdispatch.c

Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2024
+ */
+
+#define KMSG_COMPONENT "hd"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+/*
+ * Hiperdispatch:
+ * Dynamically calculates the optimum number of high capacity COREs
+ * by considering the state the system is in. When hiperdispatch decides
+ * that a capacity update is necessary, it schedules a topology update.
+ * During topology updates the CPU capacities are always re-adjusted.
+ *
+ * There are two places where CPU capacities are accessed within
+ * hiperdispatch:
+ * -> hiperdispatch's recurring work function reads CPU capacities to
+ *    determine the high capacity CPU count.
+ * -> during a topology update hiperdispatch's adjustment function
+ *    updates CPU capacities.
+ * These two can run on different CPUs in parallel, which can cause
+ * hiperdispatch to make wrong decisions. This can potentially cause
+ * some overhead by leading to extra rebuild_sched_domains() calls
+ * for correction. Access to capacities within hiperdispatch has to be
+ * serialized to prevent the overhead.
+ *
+ * Hiperdispatch decision making revolves around steal time.
+ * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
+ * crosses the threshold value, hiperdispatch falls back to giving high
+ * capacities to entitled CPUs. When steal time drops below the
+ * threshold boundary, hiperdispatch utilizes all CPUs by giving all
+ * of them high capacity.
+ *
+ * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
+ * performance. Comparing the throughput of:
+ * - a single CORE, with N threads, running N tasks
+ * - N separate COREs running N tasks,
+ * using individual COREs for individual tasks yields better
+ * performance. This performance difference is roughly ~30% (can change
+ * between machine generations).
+ *
+ * Hiperdispatch tries to hint the scheduler to use individual COREs for
+ * each task, as long as steal time on those COREs is less than 30%,
+ * therefore delaying the throughput loss caused by using SMP threads.
+ */
+
+#include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
+#include <linux/ktime.h>
+#include <linux/workqueue.h>
+#include <asm/hiperdispatch.h>
+#include <asm/smp.h>
+#include <asm/topology.h>
+
+#define HD_DELAY_FACTOR		(4)
+#define HD_DELAY_INTERVAL	(HZ / 4)
+#define HD_STEAL_THRESHOLD	30
+
+static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
+static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
+static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
+static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
+static int hd_online_cores;		/* Current online CORE count */
+
+static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
+
+static void hd_capacity_work_fn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
+
+void hd_reset_state(void)
+{
+	cpumask_clear(&hd_vl_coremask);
+	cpumask_clear(&hd_vmvl_cpumask);
+	hd_entitled_cores = 0;
+	hd_online_cores = 0;
+}
+
+void hd_add_core(int cpu)
+{
+	const struct cpumask *siblings;
+	int polarization;
+
+	hd_online_cores++;
+	polarization = smp_cpu_get_polarization(cpu);
+	siblings = topology_sibling_cpumask(cpu);
+	switch (polarization) {
+	case POLARIZATION_VH:
+		hd_entitled_cores++;
+		break;
+	case POLARIZATION_VM:
+		hd_entitled_cores++;
+		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
+		break;
+	case POLARIZATION_VL:
+		cpumask_set_cpu(cpu, &hd_vl_coremask);
+		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
+		break;
+	}
+}
+
+static void hd_update_capacities(void)
+{
+	int cpu, upscaling_cores;
+	unsigned long capacity;
+
+	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
+	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
+	hd_high_capacity_cores = hd_entitled_cores;
+	for_each_cpu(cpu, &hd_vl_coremask) {
+		smp_set_core_capacity(cpu, capacity);
+		if (capacity != CPU_CAPACITY_HIGH)
+			continue;
+		hd_high_capacity_cores++;
+		upscaling_cores--;
+		if (upscaling_cores == 0)
+			capacity = CPU_CAPACITY_LOW;
+	}
+}
+
+void hd_disable_hiperdispatch(void)
+{
+	cancel_delayed_work_sync(&hd_capacity_work);
+	hd_high_capacity_cores = hd_online_cores;
+	hd_previous_steal = 0;
+}
+
+int hd_enable_hiperdispatch(void)
+{
+	if (hd_entitled_cores == 0)
+		return 0;
+	if (hd_online_cores <= hd_entitled_cores)
+		return 0;
+	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * HD_DELAY_FACTOR);
+	hd_update_capacities();
+	return 1;
+}
+
+static unsigned long hd_calculate_steal_percentage(void)
+{
+	unsigned long time_delta, steal_delta, steal, percentage;
+	static ktime_t prev;
+	int cpus, cpu;
+	ktime_t now;
+
+	cpus = 0;
+	steal = 0;
+	percentage = 0;
+	for_each_cpu(cpu, &hd_vmvl_cpumask) {
+		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+		cpus++;
+	}
+	/*
+	 * If there are no vertical medium and low CPUs, steal time
+	 * is 0, as vertical high CPUs shouldn't experience steal time.
+	 */
+	if (cpus == 0)
+		return percentage;
+	now = ktime_get();
+	time_delta = ktime_to_ns(ktime_sub(now, prev));
+	if (steal > hd_previous_steal && hd_previous_steal != 0) {
+		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
+		percentage = steal_delta / cpus;
+	}
+	hd_previous_steal = steal;
+	prev = now;
+	return percentage;
+}
+
+static void hd_capacity_work_fn(struct work_struct *work)
+{
+	unsigned long steal_percentage, new_cores;
+
+	mutex_lock(&smp_cpu_state_mutex);
+	/*
+	 * If online cores are less than or equal to entitled cores,
+	 * hiperdispatch does not need to make any adjustments; call a
+	 * topology update to disable hiperdispatch.
+	 * Normally this check is handled on topology update, but during cpu
+	 * hot-unplug, topology and cpu mask updates are done in reverse
+	 * order, causing hd_enable_hiperdispatch() to get stale data.
+	 */
+	if (hd_online_cores <= hd_entitled_cores) {
+		topology_schedule_update();
+		mutex_unlock(&smp_cpu_state_mutex);
+		return;
+	}
+	steal_percentage = hd_calculate_steal_percentage();
+	if (steal_percentage < HD_STEAL_THRESHOLD)
+		new_cores = hd_online_cores;
+	else
+		new_cores = hd_entitled_cores;
+	if (hd_high_capacity_cores != new_cores) {
+		hd_high_capacity_cores = new_cores;
+		topology_schedule_update();
+	}
+	mutex_unlock(&smp_cpu_state_mutex);
+	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
+}
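To make the arithmetic in hd_calculate_steal_percentage() concrete, here is a standalone sketch with invented sample values (both the cumulative steal counters and the time delta are nanosecond quantities):

#include <stdio.h>

int main(void)
{
	/* Invented sample: two readings taken ~250 ms apart. */
	unsigned long prev_steal = 1000000000UL;	/* cumulative steal, ns */
	unsigned long steal      = 1200000000UL;
	unsigned long time_delta =  250000000UL;	/* elapsed wall time, ns */
	unsigned long cpus = 4;				/* VM + VL CPUs sampled */

	/* (200e6 * 100) / 250e6 = 80% aggregate, i.e. 20% per CPU. */
	unsigned long steal_delta = (steal - prev_steal) * 100 / time_delta;
	unsigned long percentage  = steal_delta / cpus;

	/* 20 < HD_STEAL_THRESHOLD (30), so all cores keep high capacity. */
	printf("%lu%% steal per CPU\n", percentage);
	return 0;
}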

arch/s390/kernel/topology.c

Lines changed: 14 additions & 4 deletions
@@ -24,6 +24,7 @@
 #include <linux/mm.h>
 #include <linux/nodemask.h>
 #include <linux/node.h>
+#include <asm/hiperdispatch.h>
 #include <asm/sysinfo.h>

 #define PTF_HORIZONTAL	(0UL)
@@ -47,6 +48,7 @@ static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED;
 static void set_topology_timer(void);
 static void topology_work_fn(struct work_struct *work);
 static struct sysinfo_15_1_x *tl_info;
+static int cpu_management;

 static DECLARE_WORK(topology_work, topology_work_fn);

@@ -144,6 +146,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core,
 			cpumask_set_cpu(cpu, &book->mask);
 			cpumask_set_cpu(cpu, &socket->mask);
 			smp_cpu_set_polarization(cpu, tl_core->pp);
+			smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH);
 		}
 	}
 }
@@ -270,6 +273,7 @@ void update_cpu_masks(void)
 			topo->drawer_id = id;
 		}
 	}
+	hd_reset_state();
 	for_each_online_cpu(cpu) {
 		topo = &cpu_topology[cpu];
 		pkg_first = cpumask_first(&topo->core_mask);
@@ -278,8 +282,10 @@ void update_cpu_masks(void)
 			for_each_cpu(sibling, &topo->core_mask) {
 				topo_sibling = &cpu_topology[sibling];
 				smt_first = cpumask_first(&topo_sibling->thread_mask);
-				if (sibling == smt_first)
+				if (sibling == smt_first) {
 					topo_package->booted_cores++;
+					hd_add_core(sibling);
+				}
 			}
 		} else {
 			topo->booted_cores = topo_package->booted_cores;
@@ -303,8 +309,10 @@ static void __arch_update_dedicated_flag(void *arg)
 static int __arch_update_cpu_topology(void)
 {
 	struct sysinfo_15_1_x *info = tl_info;
-	int rc = 0;
+	int rc, hd_status;

+	hd_status = 0;
+	rc = 0;
 	mutex_lock(&smp_cpu_state_mutex);
 	if (MACHINE_HAS_TOPOLOGY) {
 		rc = 1;
@@ -314,7 +322,11 @@ static int __arch_update_cpu_topology(void)
 	update_cpu_masks();
 	if (!MACHINE_HAS_TOPOLOGY)
 		topology_update_polarization_simple();
+	if (cpu_management == 1)
+		hd_status = hd_enable_hiperdispatch();
 	mutex_unlock(&smp_cpu_state_mutex);
+	if (hd_status == 0)
+		hd_disable_hiperdispatch();
 	return rc;
 }

@@ -374,8 +386,6 @@ void topology_expect_change(void)
 	set_topology_timer();
 }

-static int cpu_management;
-
 static int set_polarization(int polarization)
 {
 	int rc = 0;
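Taken together, the topology.c changes gate hiperdispatch on the dispatching mode: cpu_management == 1 corresponds to vertical polarization, and hd_enable_hiperdispatch() only succeeds when there are entitled cores plus extra (vertical low) ones. A condensed user-space sketch of that decision, with invented stand-in state:

#include <stdio.h>

/* Stand-ins for kernel state; the values are invented for illustration. */
static int cpu_management = 1;	/* 1: vertical polarization is active */
static int entitled_cores = 8;	/* vertical high + vertical medium cores */
static int online_cores = 12;

/* Mirrors the conditions checked by hd_enable_hiperdispatch() above. */
static int enable_hiperdispatch(void)
{
	if (entitled_cores == 0)
		return 0;
	if (online_cores <= entitled_cores)
		return 0;
	return 1;	/* periodic steal-time sampling would start here */
}

int main(void)
{
	int hd_status = 0;

	if (cpu_management == 1)
		hd_status = enable_hiperdispatch();
	if (hd_status == 0)
		printf("hiperdispatch disabled\n");
	else
		printf("hiperdispatch enabled\n");
	return 0;
}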
