Skip to content

Commit ebf5197

Browse files
spandruvadarafaeljw
authored andcommitted
thermal: intel: powerclamp: Add two module parameters
In some use cases, it is desirable to only inject idle on a certain set of CPUs. For example on Alder Lake systems, it is possible that we force idle only on P-Cores for thermal reasons. Also the idle percent can be more than 50% if we only choose a partial set of CPUs in the system. Introduce 2 new module parameters for this purpose. They can only be changed when the cooling device is inactive. cpumask (Read/Write): A bit mask of CPUs to inject idle. The format of this bitmask is the same as used in other subsystems like in /proc/irq/*/smp_affinity. The mask is comma separated 32 bit groups. Each CPU is one bit. For example for a 256 CPU system the full mask is: ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff The rightmost mask is for CPUs 0-31. max_idle (Read/Write): Maximum injected idle time to the total CPU time ratio in percent range from 1 to 100. Even if the cooling device max_state is always 100 (100%), this parameter allows to add a max idle percent limit. The default is 50, to match the current implementation of the powerclamp driver. Also doesn't allow a value of more than 75, if the cpumask includes every CPU present in the system. Also when the cpumask doesn't include every CPU, there is no use of compensation using package C-state idle counters. Hence don't start package C-state polling thread even for a single package or a single die system in this case. Signed-off-by: Srinivas Pandruvada <[email protected]> Signed-off-by: Rafael J. Wysocki <[email protected]>
1 parent 707bf8e commit ebf5197

File tree

2 files changed

+178
-20
lines changed

2 files changed

+178
-20
lines changed

Documentation/admin-guide/thermal/intel_powerclamp.rst

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ By:
2626
- Generic Thermal Layer (sysfs)
2727
- Kernel APIs (TBD)
2828
29+
(*) Module Parameters
30+
2931
INTRODUCTION
3032
============
3133

@@ -318,3 +320,23 @@ device, a PID based userspace thermal controller can manage to
318320
control CPU temperature effectively, when no other thermal influence
319321
is added. For example, a UltraBook user can compile the kernel under
320322
certain temperature (below most active trip points).
323+
324+
Module Parameters
325+
=================
326+
327+
``cpumask`` (RW)
328+
A bit mask of CPUs to inject idle. The format of the bitmask is same as
329+
used in other subsystems like in /proc/irq/*/smp_affinity. The mask is
330+
comma separated 32 bit groups. Each CPU is one bit. For example for a 256
331+
CPU system the full mask is:
332+
ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff
333+
334+
The rightmost mask is for CPUs 0-31.
335+
336+
``max_idle`` (RW)
337+
Maximum injected idle time to the total CPU time ratio in percent range
338+
from 1 to 100. Even if the cooling device max_state is always 100 (100%),
339+
this parameter allows to add a max idle percent limit. The default is 50,
340+
to match the current implementation of powerclamp driver. Also doesn't
341+
allow value more than 75, if the cpumask includes every CPU present in
342+
the system.

drivers/thermal/intel/intel_powerclamp.c

Lines changed: 156 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
#include <asm/mwait.h>
3838
#include <asm/cpu_device_id.h>
3939

40-
#define MAX_TARGET_RATIO (50U)
40+
#define MAX_TARGET_RATIO (100U)
4141
/* For each undisturbed clamping period (no extra wake ups during idle time),
4242
* we increment the confidence counter for the given target ratio.
4343
* CONFIDENCE_OK defines the level where runtime calibration results are
@@ -121,6 +121,141 @@ static const struct kernel_param_ops duration_ops = {
121121
module_param_cb(duration, &duration_ops, NULL, 0644);
122122
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
123123

124+
/* Initial value, in percent, of the max_idle module parameter */
#define DEFAULT_MAX_IDLE 50
125+
/* Highest idle percent accepted while the cpumask equals cpu_present_mask */
#define MAX_ALL_CPU_IDLE 75
126+
127+
/* Backing variable of the max_idle module parameter (percent) */
static u8 max_idle = DEFAULT_MAX_IDLE;
128+
129+
/* CPUs selected for idle injection; allocated on first use */
static cpumask_var_t idle_injection_cpu_mask;
130+
131+
static int allocate_copy_idle_injection_mask(const struct cpumask *copy_mask)
132+
{
133+
if (cpumask_available(idle_injection_cpu_mask))
134+
goto copy_mask;
135+
136+
/* This mask is allocated only one time and freed during module exit */
137+
if (!alloc_cpumask_var(&idle_injection_cpu_mask, GFP_KERNEL))
138+
return -ENOMEM;
139+
140+
copy_mask:
141+
cpumask_copy(idle_injection_cpu_mask, copy_mask);
142+
143+
return 0;
144+
}
145+
146+
/* Return true if the cpumask and idle percent combination is invalid */
147+
static bool check_invalid(cpumask_var_t mask, u8 idle)
148+
{
149+
if (cpumask_equal(cpu_present_mask, mask) && idle > MAX_ALL_CPU_IDLE)
150+
return true;
151+
152+
return false;
153+
}
154+
155+
static int cpumask_set(const char *arg, const struct kernel_param *kp)
156+
{
157+
cpumask_var_t new_mask;
158+
int ret;
159+
160+
mutex_lock(&powerclamp_lock);
161+
162+
/* Can't set mask when cooling device is in use */
163+
if (powerclamp_data.clamping) {
164+
ret = -EAGAIN;
165+
goto skip_cpumask_set;
166+
}
167+
168+
ret = alloc_cpumask_var(&new_mask, GFP_KERNEL);
169+
if (!ret)
170+
goto skip_cpumask_set;
171+
172+
ret = bitmap_parse(arg, strlen(arg), cpumask_bits(new_mask),
173+
nr_cpumask_bits);
174+
if (ret)
175+
goto free_cpumask_set;
176+
177+
if (cpumask_empty(new_mask) || check_invalid(new_mask, max_idle)) {
178+
ret = -EINVAL;
179+
goto free_cpumask_set;
180+
}
181+
182+
/*
183+
* When module parameters are passed from kernel command line
184+
* during insmod, the module parameter callback is called
185+
* before powerclamp_init(), so we can't assume that some
186+
* cpumask can be allocated and copied before here. Also
187+
* in this case this cpumask is used as the default mask.
188+
*/
189+
ret = allocate_copy_idle_injection_mask(new_mask);
190+
191+
free_cpumask_set:
192+
free_cpumask_var(new_mask);
193+
skip_cpumask_set:
194+
mutex_unlock(&powerclamp_lock);
195+
196+
return ret;
197+
}
198+
199+
static int cpumask_get(char *buf, const struct kernel_param *kp)
200+
{
201+
if (!cpumask_available(idle_injection_cpu_mask))
202+
return -ENODEV;
203+
204+
return bitmap_print_to_pagebuf(false, buf, cpumask_bits(idle_injection_cpu_mask),
205+
nr_cpumask_bits);
206+
}
207+
208+
static const struct kernel_param_ops cpumask_ops = {
209+
.set = cpumask_set,
210+
.get = cpumask_get,
211+
};
212+
213+
module_param_cb(cpumask, &cpumask_ops, NULL, 0644);
214+
MODULE_PARM_DESC(cpumask, "Mask of CPUs to use for idle injection.");
215+
216+
static int max_idle_set(const char *arg, const struct kernel_param *kp)
217+
{
218+
u8 new_max_idle;
219+
int ret = 0;
220+
221+
mutex_lock(&powerclamp_lock);
222+
223+
/* Can't set mask when cooling device is in use */
224+
if (powerclamp_data.clamping) {
225+
ret = -EAGAIN;
226+
goto skip_limit_set;
227+
}
228+
229+
ret = kstrtou8(arg, 10, &new_max_idle);
230+
if (ret)
231+
goto skip_limit_set;
232+
233+
if (new_max_idle > MAX_TARGET_RATIO) {
234+
ret = -EINVAL;
235+
goto skip_limit_set;
236+
}
237+
238+
if (check_invalid(idle_injection_cpu_mask, new_max_idle)) {
239+
ret = -EINVAL;
240+
goto skip_limit_set;
241+
}
242+
243+
max_idle = new_max_idle;
244+
245+
skip_limit_set:
246+
mutex_unlock(&powerclamp_lock);
247+
248+
return ret;
249+
}
250+
251+
static const struct kernel_param_ops max_idle_ops = {
252+
.set = max_idle_set,
253+
.get = param_get_int,
254+
};
255+
256+
module_param_cb(max_idle, &max_idle_ops, &max_idle, 0644);
257+
MODULE_PARM_DESC(max_idle, "maximum injected idle time to the total CPU time ratio in percent range:1-100");
258+
124259
struct powerclamp_calibration_data {
125260
unsigned long confidence; /* used for calibration, basically a counter
126261
* gets incremented each time a clamping
@@ -472,21 +607,15 @@ static void trigger_idle_injection(void)
472607
*/
473608
static int powerclamp_idle_injection_register(void)
474609
{
475-
/*
476-
* The idle inject core will only inject for online CPUs,
477-
* So we can register for all present CPUs. In this way
478-
* if some CPU goes online/offline while idle inject
479-
* is registered, nothing additional calls are required.
480-
* The same runtime and idle time is applicable for
481-
* newly onlined CPUs if any.
482-
*
483-
* Here cpu_present_mask can be used as is.
484-
* cast to (struct cpumask *) is required as the
485-
* cpu_present_mask is const struct cpumask *, otherwise
486-
* there will be compiler warnings.
487-
*/
488-
ii_dev = idle_inject_register_full((struct cpumask *)cpu_present_mask,
489-
idle_inject_update);
610+
poll_pkg_cstate_enable = false;
611+
if (cpumask_equal(cpu_present_mask, idle_injection_cpu_mask)) {
612+
ii_dev = idle_inject_register_full(idle_injection_cpu_mask, idle_inject_update);
613+
if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
614+
poll_pkg_cstate_enable = true;
615+
} else {
616+
ii_dev = idle_inject_register(idle_injection_cpu_mask);
617+
}
618+
490619
if (!ii_dev) {
491620
pr_err("powerclamp: idle_inject_register failed\n");
492621
return -EAGAIN;
@@ -567,7 +696,7 @@ static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
567696
mutex_lock(&powerclamp_lock);
568697

569698
new_target_ratio = clamp(new_target_ratio, 0UL,
570-
(unsigned long) (MAX_TARGET_RATIO - 1));
699+
(unsigned long) (max_idle - 1));
571700
if (!powerclamp_data.target_ratio && new_target_ratio > 0) {
572701
pr_info("Start idle injection to reduce power\n");
573702
powerclamp_data.target_ratio = new_target_ratio;
@@ -658,15 +787,19 @@ static int __init powerclamp_init(void)
658787

659788
/* probe cpu features and ids here */
660789
retval = powerclamp_probe();
790+
if (retval)
791+
return retval;
792+
793+
mutex_lock(&powerclamp_lock);
794+
retval = allocate_copy_idle_injection_mask(cpu_present_mask);
795+
mutex_unlock(&powerclamp_lock);
796+
661797
if (retval)
662798
return retval;
663799

664800
/* set default limit, maybe adjusted during runtime based on feedback */
665801
window_size = 2;
666802

667-
if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
668-
poll_pkg_cstate_enable = true;
669-
670803
cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
671804
&powerclamp_cooling_ops);
672805
if (IS_ERR(cooling_dev))
@@ -691,6 +824,9 @@ static void __exit powerclamp_exit(void)
691824

692825
cancel_delayed_work_sync(&poll_pkg_cstate_work);
693826
debugfs_remove_recursive(debug_dir);
827+
828+
if (cpumask_available(idle_injection_cpu_mask))
829+
free_cpumask_var(idle_injection_cpu_mask);
694830
}
695831
module_exit(powerclamp_exit);
696832

0 commit comments

Comments
 (0)