Commit 7e71a13

Merge branches 'pm-cpuidle', 'pm-core' and 'pm-sleep'
Merge cpuidle updates, PM core updates and changes related to system
sleep handling for 6.3-rc1:

 - Make the TEO cpuidle governor check CPU utilization in order to refine
   idle state selection (Kajetan Puchalski).

 - Make Kconfig select the haltpoll cpuidle governor when the haltpoll
   cpuidle driver is selected and replace a default_idle() call in that
   driver with arch_cpu_idle() which allows MWAIT to be used (Li RongQing).

 - Add Emerald Rapids Xeon support to the intel_idle driver (Artem
   Bityutskiy).

 - Add ARCH_SUSPEND_POSSIBLE dependencies for ARMv4 cpuidle drivers to
   avoid randconfig build failures (Arnd Bergmann).

 - Make kobj_type structures used in the cpuidle sysfs interface constant
   (Thomas Weißschuh).

 - Make the cpuidle driver registration code update microsecond values of
   idle state parameters in accordance with their nanosecond values if
   they are provided (Rafael Wysocki).

 - Make the PSCI cpuidle driver prevent topology CPUs from being suspended
   on PREEMPT_RT (Krzysztof Kozlowski).

 - Document that pm_runtime_force_suspend() cannot be used with
   DPM_FLAG_SMART_SUSPEND (Richard Fitzgerald).

 - Add EXPORT macros for exporting PM functions from drivers (Richard
   Fitzgerald).

 - Drop "select SRCU" from system sleep Kconfig (Paul E. McKenney).

 - Remove /** from non-kernel-doc comments in hibernation code (Randy
   Dunlap).

* pm-cpuidle:
  cpuidle: psci: Do not suspend topology CPUs on PREEMPT_RT
  cpuidle: driver: Update microsecond values of state parameters as needed
  cpuidle: sysfs: make kobj_type structures constant
  cpuidle: add ARCH_SUSPEND_POSSIBLE dependencies
  intel_idle: add Emerald Rapids Xeon support
  cpuidle-haltpoll: Replace default_idle() with arch_cpu_idle()
  cpuidle-haltpoll: select haltpoll governor
  cpuidle: teo: Introduce util-awareness
  cpuidle: teo: Optionally skip polling states in teo_find_shallower_state()

* pm-core:
  PM: Add EXPORT macros for exporting PM functions
  PM: runtime: Document that force_suspend() is incompatible with SMART_SUSPEND

* pm-sleep:
  PM: sleep: Remove "select SRCU"
  PM: hibernate: swap: don't use /** for non-kernel-doc comments
4 parents: 73dd320 + f9901f6 + 41a337b + 52e0452

14 files changed: 144 additions (+), 19 deletions (-)


arch/x86/kernel/process.c

Lines changed: 1 addition & 0 deletions
@@ -721,6 +721,7 @@ void arch_cpu_idle(void)
 {
         x86_idle();
 }
+EXPORT_SYMBOL_GPL(arch_cpu_idle);
 
 /*
  * We use this if we don't have any better idle routine..

drivers/base/power/runtime.c

Lines changed: 4 additions & 0 deletions
@@ -1864,6 +1864,10 @@ static bool pm_runtime_need_not_resume(struct device *dev)
  * sure the device is put into low power state and it should only be used during
  * system-wide PM transitions to sleep states. It assumes that the analogous
  * pm_runtime_force_resume() will be used to resume the device.
+ *
+ * Do not use with DPM_FLAG_SMART_SUSPEND as this can lead to an inconsistent
+ * state where this function has called the ->runtime_suspend callback but the
+ * PM core marks the driver as runtime active.
  */
 int pm_runtime_force_suspend(struct device *dev)
 {
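
For context, here is a minimal sketch of the driver pattern this new paragraph is about: reusing pm_runtime_force_suspend()/pm_runtime_force_resume() as the system sleep callbacks. The foo_* callbacks and the surrounding driver are hypothetical and not part of this commit; the point is only that such a device must not also use DPM_FLAG_SMART_SUSPEND.

#include <linux/pm.h>
#include <linux/pm_runtime.h>

/* Hypothetical runtime PM callbacks that power the device down and up. */
static int foo_runtime_suspend(struct device *dev) { return 0; }
static int foo_runtime_resume(struct device *dev) { return 0; }

static const struct dev_pm_ops foo_pm_ops = {
        /*
         * System sleep reuses the runtime PM path; per the kerneldoc added
         * above, this must not be combined with DPM_FLAG_SMART_SUSPEND.
         */
        SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
                                pm_runtime_force_resume)
        SET_RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
};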

drivers/cpuidle/Kconfig

Lines changed: 1 addition & 0 deletions
@@ -74,6 +74,7 @@ endmenu
 config HALTPOLL_CPUIDLE
         tristate "Halt poll cpuidle driver"
         depends on X86 && KVM_GUEST
+        select CPU_IDLE_GOV_HALTPOLL
         default y
         help
           This option enables halt poll cpuidle driver, which allows to poll

drivers/cpuidle/Kconfig.arm

Lines changed: 10 additions & 0 deletions
@@ -24,6 +24,14 @@ config ARM_PSCI_CPUIDLE
           It provides an idle driver that is capable of detecting and
           managing idle states through the PSCI firmware interface.
 
+          The driver has limitations when used with PREEMPT_RT:
+          - If the idle states are described with the non-hierarchical layout,
+            all idle states are still available.
+
+          - If the idle states are described with the hierarchical layout,
+            only the idle states defined per CPU are available, but not the ones
+            being shared among a group of CPUs (aka cluster idle states).
+
 config ARM_PSCI_CPUIDLE_DOMAIN
         bool "PSCI CPU idle Domain"
         depends on ARM_PSCI_CPUIDLE
@@ -102,6 +110,7 @@ config ARM_MVEBU_V7_CPUIDLE
 config ARM_TEGRA_CPUIDLE
         bool "CPU Idle Driver for NVIDIA Tegra SoCs"
         depends on (ARCH_TEGRA || COMPILE_TEST) && !ARM64 && MMU
+        depends on ARCH_SUSPEND_POSSIBLE
         select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP
         select ARM_CPU_SUSPEND
         help
@@ -110,6 +119,7 @@ config ARM_TEGRA_CPUIDLE
 config ARM_QCOM_SPM_CPUIDLE
         bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)"
         depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU
+        depends on ARCH_SUSPEND_POSSIBLE
         select ARM_CPU_SUSPEND
         select CPU_IDLE_MULTIPLE_DRIVERS
         select DT_IDLE_STATES

drivers/cpuidle/cpuidle-haltpoll.c

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ static int default_enter_idle(struct cpuidle_device *dev,
                 local_irq_enable();
                 return index;
         }
-        default_idle();
+        arch_cpu_idle();
         return index;
 }

drivers/cpuidle/cpuidle-psci-domain.c

Lines changed: 5 additions & 2 deletions
@@ -64,8 +64,11 @@ static int psci_pd_init(struct device_node *np, bool use_osi)
 
         pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN;
 
-        /* Allow power off when OSI has been successfully enabled. */
-        if (use_osi)
+        /*
+         * Allow power off when OSI has been successfully enabled.
+         * PREEMPT_RT is not yet ready to enter domain idle states.
+         */
+        if (use_osi && !IS_ENABLED(CONFIG_PREEMPT_RT))
                 pd->power_off = psci_pd_power_off;
         else
                 pd->flags |= GENPD_FLAG_ALWAYS_ON;

drivers/cpuidle/cpuidle-psci.c

Lines changed: 3 additions & 0 deletions
@@ -231,6 +231,9 @@ static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv,
         if (!psci_has_osi_support())
                 return 0;
 
+        if (IS_ENABLED(CONFIG_PREEMPT_RT))
+                return 0;
+
         data->dev = psci_dt_attach_cpu(cpu);
         if (IS_ERR_OR_NULL(data->dev))
                 return PTR_ERR_OR_ZERO(data->dev);

drivers/cpuidle/driver.c

Lines changed: 4 additions & 0 deletions
@@ -183,11 +183,15 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
                         s->target_residency_ns = s->target_residency * NSEC_PER_USEC;
                 else if (s->target_residency_ns < 0)
                         s->target_residency_ns = 0;
+                else
+                        s->target_residency = div_u64(s->target_residency_ns, NSEC_PER_USEC);
 
                 if (s->exit_latency > 0)
                         s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC;
                 else if (s->exit_latency_ns < 0)
                         s->exit_latency_ns = 0;
+                else
+                        s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC);
         }
 }
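
To illustrate the effect of the two new "else" branches: an idle-state definition that only fills in the nanosecond parameters now has the legacy microsecond fields derived for it when the driver is registered. A hypothetical entry follows (values made up for illustration, .enter callback omitted from the sketch):

#include <linux/cpuidle.h>

static struct cpuidle_state example_state = {
        .name                   = "EXAMPLE",
        .exit_latency_ns        = 50 * NSEC_PER_USEC,   /*  50,000 ns */
        .target_residency_ns    = 200 * NSEC_PER_USEC,  /* 200,000 ns */
};

/*
 * With this change, __cpuidle_driver_init() also fills in the microsecond
 * mirrors used by sysfs and older interfaces:
 *   exit_latency     = 50000 / NSEC_PER_USEC  = 50
 *   target_residency = 200000 / NSEC_PER_USEC = 200
 */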

drivers/cpuidle/governors/teo.c

Lines changed: 98 additions & 4 deletions
@@ -2,8 +2,13 @@
 /*
  * Timer events oriented CPU idle governor
  *
+ * TEO governor:
  * Copyright (C) 2018 - 2021 Intel Corporation
  * Author: Rafael J. Wysocki <[email protected]>
+ *
+ * Util-awareness mechanism:
+ * Copyright (C) 2022 Arm Ltd.
+ * Author: Kajetan Puchalski <[email protected]>
  */
 
 /**
@@ -99,14 +104,55 @@
  * select the given idle state instead of the candidate one.
  *
  * 3. By default, select the candidate state.
+ *
+ * Util-awareness mechanism:
+ *
+ * The idea behind the util-awareness extension is that there are two distinct
+ * scenarios for the CPU which should result in two different approaches to idle
+ * state selection - utilized and not utilized.
+ *
+ * In this case, 'utilized' means that the average runqueue util of the CPU is
+ * above a certain threshold.
+ *
+ * When the CPU is utilized while going into idle, more likely than not it will
+ * be woken up to do more work soon and so a shallower idle state should be
+ * selected to minimise latency and maximise performance. When the CPU is not
+ * being utilized, the usual metrics-based approach to selecting the deepest
+ * available idle state should be preferred to take advantage of the power
+ * saving.
+ *
+ * In order to achieve this, the governor uses a utilization threshold.
+ * The threshold is computed per-CPU as a percentage of the CPU's capacity
+ * by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%)
+ * seems to be getting the best results.
+ *
+ * Before selecting the next idle state, the governor compares the current CPU
+ * util to the precomputed util threshold. If it's below, it defaults to the
+ * TEO metrics mechanism. If it's above, the closest shallower idle state will
+ * be selected instead, as long as is not a polling state.
  */
 
 #include <linux/cpuidle.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/topology.h>
 #include <linux/tick.h>
 
+/*
+ * The number of bits to shift the CPU's capacity by in order to determine
+ * the utilized threshold.
+ *
+ * 6 was chosen based on testing as the number that achieved the best balance
+ * of power and performance on average.
+ *
+ * The resulting threshold is high enough to not be triggered by background
+ * noise and low enough to react quickly when activity starts to ramp up.
+ */
+#define UTIL_THRESHOLD_SHIFT 6
+
+
 /*
  * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
  * is used for decreasing metrics on a regular basis.
@@ -137,9 +183,11 @@ struct teo_bin {
  * @time_span_ns: Time between idle state selection and post-wakeup update.
  * @sleep_length_ns: Time till the closest timer event (at the selection time).
  * @state_bins: Idle state data bins for this CPU.
- * @total: Grand total of the "intercepts" and "hits" mertics for all bins.
+ * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
  * @next_recent_idx: Index of the next @recent_idx entry to update.
  * @recent_idx: Indices of bins corresponding to recent "intercepts".
+ * @util_threshold: Threshold above which the CPU is considered utilized
+ * @utilized: Whether the last sleep on the CPU happened while utilized
  */
 struct teo_cpu {
         s64 time_span_ns;
@@ -148,10 +196,29 @@ struct teo_cpu {
         unsigned int total;
         int next_recent_idx;
         int recent_idx[NR_RECENT];
+        unsigned long util_threshold;
+        bool utilized;
 };
 
 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
 
+/**
+ * teo_cpu_is_utilized - Check if the CPU's util is above the threshold
+ * @cpu: Target CPU
+ * @cpu_data: Governor CPU data for the target CPU
+ */
+#ifdef CONFIG_SMP
+static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
+{
+        return sched_cpu_util(cpu) > cpu_data->util_threshold;
+}
+#else
+static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
+{
+        return false;
+}
+#endif
+
 /**
  * teo_update - Update CPU metrics after wakeup.
  * @drv: cpuidle driver containing state data.
@@ -258,15 +325,17 @@ static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv)
  * @dev: Target CPU.
  * @state_idx: Index of the capping idle state.
  * @duration_ns: Idle duration value to match.
+ * @no_poll: Don't consider polling states.
  */
 static int teo_find_shallower_state(struct cpuidle_driver *drv,
                                     struct cpuidle_device *dev, int state_idx,
-                                    s64 duration_ns)
+                                    s64 duration_ns, bool no_poll)
 {
         int i;
 
         for (i = state_idx - 1; i >= 0; i--) {
-                if (dev->states_usage[i].disable)
+                if (dev->states_usage[i].disable ||
+                    (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
                         continue;
 
                 state_idx = i;
@@ -321,6 +390,22 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
                 goto end;
         }
 
+        cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data);
+        /*
+         * If the CPU is being utilized over the threshold and there are only 2
+         * states to choose from, the metrics need not be considered, so choose
+         * the shallowest non-polling state and exit.
+         */
+        if (drv->state_count < 3 && cpu_data->utilized) {
+                for (i = 0; i < drv->state_count; ++i) {
+                        if (!dev->states_usage[i].disable &&
+                            !(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) {
+                                idx = i;
+                                goto end;
+                        }
+                }
+        }
+
         /*
          * Find the deepest idle state whose target residency does not exceed
          * the current sleep length and the deepest idle state not deeper than
@@ -452,6 +537,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
         if (idx > constraint_idx)
                 idx = constraint_idx;
 
+        /*
+         * If the CPU is being utilized over the threshold, choose a shallower
+         * non-polling state to improve latency
+         */
+        if (cpu_data->utilized)
+                idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true);
+
 end:
         /*
          * Don't stop the tick if the selected state is a polling one or if the
@@ -469,7 +561,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
                  */
                 if (idx > idx0 &&
                     drv->states[idx].target_residency_ns > delta_tick)
-                        idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
+                        idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);
         }
 
         return idx;
@@ -508,9 +600,11 @@ static int teo_enable_device(struct cpuidle_driver *drv,
                              struct cpuidle_device *dev)
 {
         struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
+        unsigned long max_capacity = arch_scale_cpu_capacity(dev->cpu);
         int i;
 
         memset(cpu_data, 0, sizeof(*cpu_data));
+        cpu_data->util_threshold = max_capacity >> UTIL_THRESHOLD_SHIFT;
 
         for (i = 0; i < NR_RECENT; i++)
                 cpu_data->recent_idx[i] = -1;
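
As a worked example of the threshold arithmetic described in the util-awareness comment above, here is a small standalone sketch in plain C (hypothetical capacity and utilization numbers, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define UTIL_THRESHOLD_SHIFT 6  /* same shift as the governor */

/* Mirrors teo_cpu_is_utilized(): util must exceed capacity >> 6. */
static bool cpu_is_utilized(unsigned long util, unsigned long capacity)
{
        return util > (capacity >> UTIL_THRESHOLD_SHIFT);
}

int main(void)
{
        /*
         * A CPU with capacity 1024 gets a threshold of 1024 >> 6 = 16
         * (~1.56% of capacity), so a runqueue util of 42 counts as utilized.
         */
        printf("threshold=%lu utilized=%d\n",
               1024UL >> UTIL_THRESHOLD_SHIFT, cpu_is_utilized(42, 1024));
        return 0;
}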

drivers/cpuidle/sysfs.c

Lines changed: 3 additions & 3 deletions
@@ -200,7 +200,7 @@ static void cpuidle_sysfs_release(struct kobject *kobj)
         complete(&kdev->kobj_unregister);
 }
 
-static struct kobj_type ktype_cpuidle = {
+static const struct kobj_type ktype_cpuidle = {
         .sysfs_ops = &cpuidle_sysfs_ops,
         .release = cpuidle_sysfs_release,
 };
@@ -447,7 +447,7 @@ static void cpuidle_state_sysfs_release(struct kobject *kobj)
         complete(&state_obj->kobj_unregister);
 }
 
-static struct kobj_type ktype_state_cpuidle = {
+static const struct kobj_type ktype_state_cpuidle = {
         .sysfs_ops = &cpuidle_state_sysfs_ops,
         .default_groups = cpuidle_state_default_groups,
         .release = cpuidle_state_sysfs_release,
@@ -594,7 +594,7 @@ static struct attribute *cpuidle_driver_default_attrs[] = {
 };
 ATTRIBUTE_GROUPS(cpuidle_driver_default);
 
-static struct kobj_type ktype_driver_cpuidle = {
+static const struct kobj_type ktype_driver_cpuidle = {
         .sysfs_ops = &cpuidle_driver_sysfs_ops,
         .default_groups = cpuidle_driver_default_groups,
         .release = cpuidle_driver_sysfs_release,
