
Commit 13685c4

Qais Yousef authored and Peter Zijlstra committed
sched/uclamp: Add a new sysctl to control RT default boost value
RT tasks by default run at the highest capacity/performance level. When uclamp is selected, this default behavior is retained by enforcing the requested uclamp.min (p->uclamp_req[UCLAMP_MIN]) of RT tasks to be uclamp_none(UCLAMP_MAX), which is SCHED_CAPACITY_SCALE, the maximum value.

This is also referred to as 'the default boost value of RT tasks'.

See commit 1a00d99 ("sched/uclamp: Set default clamps for RT tasks").

On battery powered devices, it is desirable to control this default (currently hardcoded) behavior at runtime to reduce the energy consumed by RT tasks. For example, for a mobile device manufacturer where the big.LITTLE architecture is dominant, the performance of the little cores varies across SoCs, and on high end SoCs the big cores could be too power hungry.

Given the diversity of SoCs, the new knob allows manufacturers to tune the best performance/power trade-off for RT tasks on the particular hardware they run on. They could opt to further tune the value when the user selects a different power saving mode or when the device is actively charging. The runtime aspect further helps in creating a single kernel image that can run on multiple devices that require different tuning.

Keep in mind that a lot of RT tasks in the system are created by the kernel. On Android, for instance, I can see over 50 RT tasks, only a handful of which are created by the Android framework.

To let system admins and device integrators control the default behavior globally, introduce the new sysctl_sched_uclamp_util_min_rt_default to change the default boost value of RT tasks. I anticipate this to be done mostly by modifying the init script of a particular device.

To avoid polluting the fast path with unnecessary code, the approach taken is to do the update synchronously by traversing all the existing tasks in the system. This could race with a concurrent fork(), which is dealt with by introducing the sched_post_fork() function, which ensures that a racy fork gets the right update applied.

Tested on Juno-r2 in combination with the RT capacity awareness [1]. By default an RT task will go to the highest capacity CPU and run at the maximum frequency, which is particularly energy inefficient on high end mobile devices because the biggest core[s] are 'huge' and power hungry.

With this patch the RT task can be controlled to run anywhere by default, and doesn't cause the frequency to be maximum all the time. Yet any task that really needs to be boosted can easily escape this default behavior by modifying its requested uclamp.min value (p->uclamp_req[UCLAMP_MIN]) via the sched_setattr() syscall.

[1] commit 804d402 ("sched/rt: Make RT capacity-aware")

Signed-off-by: Qais Yousef <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent e65855a commit 13685c4
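
The escape hatch mentioned in the message works because sched_setattr() marks the task's requested uclamp.min as user_defined, which __uclamp_update_util_min_rt_default() below then refuses to touch. A minimal userspace sketch of an RT task pinning its own boost back to maximum; this is not part of this commit, and struct sched_attr plus the SCHED_FLAG_* values are copied from the uapi headers because glibc provides no wrapper:

/* rt_boost.c - sketch: an RT task opting back in to full boost.
 * Build: gcc -o rt_boost rt_boost.c
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Layout matches include/uapi/linux/sched/types.h (v5.3+). */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;
	uint32_t sched_util_max;
};

#define SCHED_FLAG_KEEP_ALL       0x18 /* KEEP_POLICY | KEEP_PARAMS */
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	/* Keep the current policy/priority and only update the requested
	 * uclamp.min. This sets p->uclamp_req[UCLAMP_MIN].user_defined,
	 * so the new sysctl no longer applies to this task. */
	attr.sched_flags = SCHED_FLAG_KEEP_ALL | SCHED_FLAG_UTIL_CLAMP_MIN;
	attr.sched_util_min = 1024; /* SCHED_CAPACITY_SCALE: full boost */

	if (syscall(SYS_sched_setattr, 0 /* self */, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}
	return 0;
}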

File tree

6 files changed: +131 -8 lines

include/linux/sched.h

Lines changed: 8 additions & 2 deletions

@@ -686,9 +686,15 @@ struct task_struct {
 	struct sched_dl_entity		dl;
 
 #ifdef CONFIG_UCLAMP_TASK
-	/* Clamp values requested for a scheduling entity */
+	/*
+	 * Clamp values requested for a scheduling entity.
+	 * Must be updated with task_rq_lock() held.
+	 */
 	struct uclamp_se		uclamp_req[UCLAMP_CNT];
-	/* Effective clamp values used for a scheduling entity */
+	/*
+	 * Effective clamp values used for a scheduling entity.
+	 * Must be updated with task_rq_lock() held.
+	 */
 	struct uclamp_se		uclamp[UCLAMP_CNT];
 #endif
 

include/linux/sched/sysctl.h

Lines changed: 1 addition & 0 deletions

@@ -67,6 +67,7 @@ extern unsigned int sysctl_sched_dl_period_min;
 #ifdef CONFIG_UCLAMP_TASK
 extern unsigned int sysctl_sched_uclamp_util_min;
 extern unsigned int sysctl_sched_uclamp_util_max;
+extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
 #endif
 
 #ifdef CONFIG_CFS_BANDWIDTH

include/linux/sched/task.h

Lines changed: 1 addition & 0 deletions

@@ -55,6 +55,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
+extern void sched_post_fork(struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
 
 void __noreturn do_task_dead(void);

kernel/fork.c

Lines changed: 1 addition & 0 deletions

@@ -2304,6 +2304,7 @@ static __latent_entropy struct task_struct *copy_process(
 	write_unlock_irq(&tasklist_lock);
 
 	proc_fork_connector(p);
+	sched_post_fork(p);
 	cgroup_post_fork(p, args);
 	perf_event_fork(p);

kernel/sched/core.c

Lines changed: 113 additions & 6 deletions

@@ -889,6 +889,23 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
 /* Max allowed maximum utilization */
 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
 
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks that their uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
 /* All clamps are required to be less or equal than these values */
 static struct uclamp_se uclamp_default[UCLAMP_CNT];
 
@@ -991,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
 	return uclamp_idle_value(rq, clamp_id, clamp_value);
 }
 
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+	unsigned int default_util_min;
+	struct uclamp_se *uc_se;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+	/* Only sync if user didn't override the default */
+	if (uc_se->user_defined)
+		return;
+
+	default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+	uclamp_se_set(uc_se, default_util_min, false);
+}
+
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+
+	if (!rt_task(p))
+		return;
+
+	/* Protect updates to p->uclamp_* */
+	rq = task_rq_lock(p, &rf);
+	__uclamp_update_util_min_rt_default(p);
+	task_rq_unlock(rq, p, &rf);
+}
+
+static void uclamp_sync_util_min_rt_default(void)
+{
+	struct task_struct *g, *p;
+
+	/*
+	 * copy_process()			sysctl_uclamp
+	 *					  uclamp_min_rt = X;
+	 *   write_lock(&tasklist_lock)		  read_lock(&tasklist_lock)
+	 *   // link thread			  smp_mb__after_spinlock()
+	 *   write_unlock(&tasklist_lock)	  read_unlock(&tasklist_lock);
+	 *   sched_post_fork()			  for_each_process_thread()
+	 *     __uclamp_sync_rt()		    __uclamp_sync_rt()
+	 *
+	 * Ensures that either sched_post_fork() will observe the new
+	 * uclamp_min_rt or for_each_process_thread() will observe the new
+	 * task.
+	 */
+	read_lock(&tasklist_lock);
+	smp_mb__after_spinlock();
+	read_unlock(&tasklist_lock);
+
+	rcu_read_lock();
+	for_each_process_thread(g, p)
+		uclamp_update_util_min_rt_default(p);
+	rcu_read_unlock();
+}
+
 static inline struct uclamp_se
 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 {
@@ -1278,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
 	bool update_root_tg = false;
-	int old_min, old_max;
+	int old_min, old_max, old_min_rt;
 	int result;
 
 	mutex_lock(&uclamp_mutex);
 	old_min = sysctl_sched_uclamp_util_min;
 	old_max = sysctl_sched_uclamp_util_max;
+	old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
 
 	result = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (result)
@@ -1292,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 		goto done;
 
 	if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
-	    sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+	    sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE	||
+	    sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
+
 		result = -EINVAL;
 		goto undo;
 	}
@@ -1313,6 +1391,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 		uclamp_update_root_tg();
 	}
 
+	if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
+		static_branch_enable(&sched_uclamp_used);
+		uclamp_sync_util_min_rt_default();
+	}
+
 	/*
 	 * We update all RUNNABLE tasks only when task groups are in use.
 	 * Otherwise, keep it simple and do just a lazy update at each next
@@ -1324,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 undo:
 	sysctl_sched_uclamp_util_min = old_min;
 	sysctl_sched_uclamp_util_max = old_max;
+	sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
 done:
 	mutex_unlock(&uclamp_mutex);
 
@@ -1369,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p,
 	 */
 	for_each_clamp_id(clamp_id) {
 		struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
-		unsigned int clamp_value = uclamp_none(clamp_id);
 
 		/* Keep using defined clamps across class changes */
 		if (uc_se->user_defined)
 			continue;
 
-		/* By default, RT tasks always get 100% boost */
+		/*
+		 * RT by default have a 100% boost value that could be modified
+		 * at runtime.
+		 */
 		if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
-			clamp_value = uclamp_none(UCLAMP_MAX);
+			__uclamp_update_util_min_rt_default(p);
+		else
+			uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
 
-		uclamp_se_set(uc_se, clamp_value, false);
 	}
 
 	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
@@ -1400,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p)
 {
 	enum uclamp_id clamp_id;
 
+	/*
+	 * We don't need to hold task_rq_lock() when updating p->uclamp_* here
+	 * as the task is still at its early fork stages.
+	 */
 	for_each_clamp_id(clamp_id)
 		p->uclamp[clamp_id].active = false;
 
@@ -1412,6 +1503,11 @@ static void uclamp_fork(struct task_struct *p)
 	}
 }
 
+static void uclamp_post_fork(struct task_struct *p)
+{
+	uclamp_update_util_min_rt_default(p);
+}
+
 static void __init init_uclamp_rq(struct rq *rq)
 {
 	enum uclamp_id clamp_id;
@@ -1462,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p,
 static void __setscheduler_uclamp(struct task_struct *p,
 				  const struct sched_attr *attr) { }
 static inline void uclamp_fork(struct task_struct *p) { }
+static inline void uclamp_post_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
 
@@ -3205,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	return 0;
 }
 
+void sched_post_fork(struct task_struct *p)
+{
+	uclamp_post_fork(p);
+}
+
 unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
@@ -5724,6 +5826,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	kattr.sched_nice = task_nice(p);
 
 #ifdef CONFIG_UCLAMP_TASK
+	/*
+	 * This could race with another potential updater, but this is fine
+	 * because it'll correctly read the old or the new value. We don't need
+	 * to guarantee who wins the race as long as it doesn't return garbage.
+	 */
 	kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 	kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
 #endif

kernel/sysctl.c

Lines changed: 7 additions & 0 deletions

@@ -1815,6 +1815,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sysctl_sched_uclamp_handler,
 	},
+	{
+		.procname	= "sched_util_clamp_min_rt_default",
+		.data		= &sysctl_sched_uclamp_util_min_rt_default,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_sched_uclamp_handler,
+	},
 #endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
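
Since the knob is registered in kern_table, it appears as /proc/sys/kernel/sched_util_clamp_min_rt_default, and the handler above rejects any value greater than SCHED_CAPACITY_SCALE (1024) with -EINVAL. A minimal sketch, not part of this commit, of how a device's init code might lower the RT default boost at boot; the value 128 is purely illustrative:

/* set_rt_default.c - sketch: lower the default RT boost at boot. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/sched_util_clamp_min_rt_default", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* 128 is an illustrative, per-SoC tuning choice: 0 disables the
	 * default boost entirely, 1024 keeps today's behavior. Writes
	 * above 1024 fail with -EINVAL. */
	fprintf(f, "%u\n", 128u);
	return fclose(f) ? 1 : 0;
}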
