Commit 10c64a0

Merge tag 'sched_urgent_for_v5.17_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Borislav Petkov:
 "A bunch of fixes: forced idle time accounting, utilization values
  propagation in the sched hierarchies and other minor cleanups and
  improvements"

* tag 'sched_urgent_for_v5.17_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  kernel/sched: Remove dl_boosted flag comment
  sched: Avoid double preemption in __cond_resched_*lock*()
  sched/fair: Fix all kernel-doc warnings
  sched/core: Accounting forceidle time for all tasks except idle task
  sched/pelt: Relax the sync of load_sum with load_avg
  sched/pelt: Relax the sync of runnable_sum with runnable_avg
  sched/pelt: Continue to relax the sync of util_sum with util_avg
  sched/pelt: Relax the sync of util_sum with util_avg
  psi: Fix uaf issue when psi trigger is destroyed while being polled
2 parents 0f9e042 + 0e38724 commit 10c64a0

File tree

10 files changed, +125 -103 lines changed


Documentation/accounting/psi.rst

Lines changed: 2 additions & 1 deletion
@@ -92,7 +92,8 @@ Triggers can be set on more than one psi metric and more than one trigger
 for the same psi metric can be specified. However for each trigger a separate
 file descriptor is required to be able to poll it separately from others,
 therefore for each trigger a separate open() syscall should be made even
-when opening the same psi interface file.
+when opening the same psi interface file. Write operations to a file descriptor
+with an already existing psi trigger will fail with EBUSY.
 
 Monitors activate only when system enters stall state for the monitored
 psi metric and deactivates upon exit from the stall state. While system is
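
The documentation change above is the user-visible part of the psi trigger rework in this pull: a trigger is tied to its file descriptor for the descriptor's whole lifetime, and writing a second trigger to the same descriptor now fails with EBUSY. A minimal user-space sketch of that behaviour, loosely following the example in Documentation/accounting/psi.rst (the path, threshold and window values are illustrative, error handling is abbreviated):

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char trig[] = "some 150000 1000000";  /* 150ms of stall within a 1s window */
        struct pollfd pfd;
        int fd;

        fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
        if (fd < 0)
                return 1;

        /* the first write arms the trigger for this file descriptor */
        if (write(fd, trig, strlen(trig) + 1) < 0)
                return 1;

        /* with this series, a second trigger on the same fd is rejected */
        if (write(fd, trig, strlen(trig) + 1) < 0 && errno == EBUSY)
                fprintf(stderr, "second write on same fd: EBUSY as documented\n");

        pfd.fd = fd;
        pfd.events = POLLPRI;
        poll(&pfd, 1, 10000);   /* wait up to 10s for a stall event */

        close(fd);
        return 0;
}

Registering more than one trigger on the same interface file still requires a separate open() per trigger, as the surrounding documentation text keeps stating.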

include/linux/psi.h

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to);
 
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
                         char *buf, size_t nbytes, enum psi_res res);
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t);
+void psi_trigger_destroy(struct psi_trigger *t);
 
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
                         poll_table *wait);

include/linux/psi_types.h

Lines changed: 0 additions & 3 deletions
@@ -141,9 +141,6 @@ struct psi_trigger {
          * events to one per window
          */
         u64 last_event_time;
-
-        /* Refcounting to prevent premature destruction */
-        struct kref refcount;
 };
 
 struct psi_group {

include/linux/sched.h

Lines changed: 0 additions & 4 deletions
@@ -619,10 +619,6 @@ struct sched_dl_entity {
          * task has to wait for a replenishment to be performed at the
          * next firing of dl_timer.
          *
-         * @dl_boosted tells if we are boosted due to DI. If so we are
-         * outside bandwidth enforcement mechanism (but only until we
-         * exit the critical section);
-         *
          * @dl_yielded tells if task gave up the CPU before consuming
          * all its available runtime during the last job.
          *

kernel/cgroup/cgroup.c

Lines changed: 8 additions & 3 deletions
@@ -3643,15 +3643,20 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
         cgroup_get(cgrp);
         cgroup_kn_unlock(of->kn);
 
+        /* Allow only one trigger per file descriptor */
+        if (ctx->psi.trigger) {
+                cgroup_put(cgrp);
+                return -EBUSY;
+        }
+
         psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
         new = psi_trigger_create(psi, buf, nbytes, res);
         if (IS_ERR(new)) {
                 cgroup_put(cgrp);
                 return PTR_ERR(new);
         }
 
-        psi_trigger_replace(&ctx->psi.trigger, new);
-
+        smp_store_release(&ctx->psi.trigger, new);
         cgroup_put(cgrp);
 
         return nbytes;
@@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
         struct cgroup_file_ctx *ctx = of->priv;
 
-        psi_trigger_replace(&ctx->psi.trigger, NULL);
+        psi_trigger_destroy(ctx->psi.trigger);
 }
 
 bool cgroup_psi_enabled(void)

kernel/sched/core.c

Lines changed: 4 additions & 11 deletions
@@ -5822,8 +5822,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
         }
 
         if (schedstat_enabled() && rq->core->core_forceidle_count) {
-                if (cookie)
-                        rq->core->core_forceidle_start = rq_clock(rq->core);
+                rq->core->core_forceidle_start = rq_clock(rq->core);
                 rq->core->core_forceidle_occupation = occ;
         }
 
@@ -8219,9 +8218,7 @@ int __cond_resched_lock(spinlock_t *lock)
 
         if (spin_needbreak(lock) || resched) {
                 spin_unlock(lock);
-                if (resched)
-                        preempt_schedule_common();
-                else
+                if (!_cond_resched())
                         cpu_relax();
                 ret = 1;
                 spin_lock(lock);
@@ -8239,9 +8236,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock)
 
         if (rwlock_needbreak(lock) || resched) {
                 read_unlock(lock);
-                if (resched)
-                        preempt_schedule_common();
-                else
+                if (!_cond_resched())
                         cpu_relax();
                 ret = 1;
                 read_lock(lock);
@@ -8259,9 +8254,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
 
         if (rwlock_needbreak(lock) || resched) {
                 write_unlock(lock);
-                if (resched)
-                        preempt_schedule_common();
-                else
+                if (!_cond_resched())
                         cpu_relax();
                 ret = 1;
                 write_lock(lock);

kernel/sched/core_sched.c

Lines changed: 1 addition & 1 deletion
@@ -277,7 +277,7 @@ void __sched_core_account_forceidle(struct rq *rq)
                 rq_i = cpu_rq(i);
                 p = rq_i->core_pick ?: rq_i->curr;
 
-                if (!p->core_cookie)
+                if (p == rq_i->idle)
                         continue;
 
                 __schedstat_add(p->stats.core_forceidle_sum, delta);

kernel/sched/fair.c

Lines changed: 77 additions & 41 deletions
@@ -3028,9 +3028,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        u32 divider = get_pelt_divider(&se->avg);
         sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
-        cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+        sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+        /* See update_cfs_rq_load_avg() */
+        cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+                                          cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
 }
 #else
 static inline void
@@ -3381,7 +3383,6 @@ void set_task_rq_fair(struct sched_entity *se,
         se->avg.last_update_time = n_last_update_time;
 }
 
-
 /*
  * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
  * propagate its contribution. The key to this propagation is the invariant
@@ -3449,15 +3450,14 @@ void set_task_rq_fair(struct sched_entity *se,
  * XXX: only do this for the part of runnable > running ?
  *
  */
-
 static inline void
 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-        long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
-        u32 divider;
+        long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+        u32 new_sum, divider;
 
         /* Nothing to update */
-        if (!delta)
+        if (!delta_avg)
                 return;
 
         /*
@@ -3466,23 +3466,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
          */
         divider = get_pelt_divider(&cfs_rq->avg);
 
+
         /* Set new sched_entity's utilization */
         se->avg.util_avg = gcfs_rq->avg.util_avg;
-        se->avg.util_sum = se->avg.util_avg * divider;
+        new_sum = se->avg.util_avg * divider;
+        delta_sum = (long)new_sum - (long)se->avg.util_sum;
+        se->avg.util_sum = new_sum;
 
         /* Update parent cfs_rq utilization */
-        add_positive(&cfs_rq->avg.util_avg, delta);
-        cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+        add_positive(&cfs_rq->avg.util_avg, delta_avg);
+        add_positive(&cfs_rq->avg.util_sum, delta_sum);
+
+        /* See update_cfs_rq_load_avg() */
+        cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+                                          cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
 }
 
 static inline void
 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-        long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
-        u32 divider;
+        long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+        u32 new_sum, divider;
 
         /* Nothing to update */
-        if (!delta)
+        if (!delta_avg)
                 return;
 
         /*
@@ -3493,19 +3500,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
 
         /* Set new sched_entity's runnable */
         se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
-        se->avg.runnable_sum = se->avg.runnable_avg * divider;
+        new_sum = se->avg.runnable_avg * divider;
+        delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
+        se->avg.runnable_sum = new_sum;
 
         /* Update parent cfs_rq runnable */
-        add_positive(&cfs_rq->avg.runnable_avg, delta);
-        cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+        add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
+        add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
+        /* See update_cfs_rq_load_avg() */
+        cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+                                              cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
 }
 
 static inline void
 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-        long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+        long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
         unsigned long load_avg;
         u64 load_sum = 0;
+        s64 delta_sum;
         u32 divider;
 
         if (!runnable_sum)
@@ -3532,7 +3545,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
          * assuming all tasks are equally runnable.
          */
         if (scale_load_down(gcfs_rq->load.weight)) {
-                load_sum = div_s64(gcfs_rq->avg.load_sum,
+                load_sum = div_u64(gcfs_rq->avg.load_sum,
                         scale_load_down(gcfs_rq->load.weight));
         }
 
@@ -3549,19 +3562,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
         running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
         runnable_sum = max(runnable_sum, running_sum);
 
-        load_sum = (s64)se_weight(se) * runnable_sum;
-        load_avg = div_s64(load_sum, divider);
-
-        se->avg.load_sum = runnable_sum;
+        load_sum = se_weight(se) * runnable_sum;
+        load_avg = div_u64(load_sum, divider);
 
-        delta = load_avg - se->avg.load_avg;
-        if (!delta)
+        delta_avg = load_avg - se->avg.load_avg;
+        if (!delta_avg)
                 return;
 
-        se->avg.load_avg = load_avg;
+        delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
 
-        add_positive(&cfs_rq->avg.load_avg, delta);
-        cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+        se->avg.load_sum = runnable_sum;
+        se->avg.load_avg = load_avg;
+        add_positive(&cfs_rq->avg.load_avg, delta_avg);
+        add_positive(&cfs_rq->avg.load_sum, delta_sum);
+        /* See update_cfs_rq_load_avg() */
+        cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+                                          cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
 }
 
 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3652,7 +3668,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
  *
  * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
  *
- * Returns true if the load decayed or we removed load.
+ * Return: true if the load decayed or we removed load.
  *
  * Since both these conditions indicate a changed cfs_rq->avg.load we should
  * call update_tg_load_avg() when this function returns true.
@@ -3677,15 +3693,32 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 
                 r = removed_load;
                 sub_positive(&sa->load_avg, r);
-                sa->load_sum = sa->load_avg * divider;
+                sub_positive(&sa->load_sum, r * divider);
+                /* See sa->util_sum below */
+                sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
 
                 r = removed_util;
                 sub_positive(&sa->util_avg, r);
-                sa->util_sum = sa->util_avg * divider;
+                sub_positive(&sa->util_sum, r * divider);
+                /*
+                 * Because of rounding, se->util_sum might ends up being +1 more than
+                 * cfs->util_sum. Although this is not a problem by itself, detaching
+                 * a lot of tasks with the rounding problem between 2 updates of
+                 * util_avg (~1ms) can make cfs->util_sum becoming null whereas
+                 * cfs_util_avg is not.
+                 * Check that util_sum is still above its lower bound for the new
+                 * util_avg. Given that period_contrib might have moved since the last
+                 * sync, we are only sure that util_sum must be above or equal to
+                 *    util_avg * minimum possible divider
+                 */
+                sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
 
                 r = removed_runnable;
                 sub_positive(&sa->runnable_avg, r);
-                sa->runnable_sum = sa->runnable_avg * divider;
+                sub_positive(&sa->runnable_sum, r * divider);
+                /* See sa->util_sum above */
+                sa->runnable_sum = max_t(u32, sa->runnable_sum,
+                                              sa->runnable_avg * PELT_MIN_DIVIDER);
 
                 /*
                  * removed_runnable is the unweighted version of removed_load so we
@@ -3772,17 +3805,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        /*
-         * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
-         * See ___update_load_avg() for details.
-         */
-        u32 divider = get_pelt_divider(&cfs_rq->avg);
-
         dequeue_load_avg(cfs_rq, se);
         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
-        cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+        sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+        /* See update_cfs_rq_load_avg() */
+        cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+                                          cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+
         sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
-        cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+        sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+        /* See update_cfs_rq_load_avg() */
+        cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+                                              cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
 
         add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
 
@@ -8539,6 +8573,8 @@ group_type group_classify(unsigned int imbalance_pct,
  *
  * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
  * of @dst_cpu are idle and @sg has lower priority.
+ *
+ * Return: true if @dst_cpu can pull tasks, false otherwise.
  */
 static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
                                     struct sg_lb_stats *sgs,
@@ -8614,6 +8650,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
+ * @sds: Load-balancing data with statistics of the local group.
  * @group: sched_group whose statistics are to be updated.
  * @sgs: variable to hold the statistics for this group.
  * @sg_status: Holds flag indicating the status of the sched_group
@@ -9421,12 +9458,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 /**
  * find_busiest_group - Returns the busiest group within the sched_domain
  * if there is an imbalance.
+ * @env: The load balancing environment.
  *
  * Also calculates the amount of runnable load which should be moved
  * to restore balance.
  *
- * @env: The load balancing environment.
- *
  * Return: - The busiest group if imbalance exists.
  */
 static struct sched_group *find_busiest_group(struct lb_env *env)

kernel/sched/pelt.h

Lines changed: 3 additions & 1 deletion
@@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running)
 }
 #endif
 
+#define PELT_MIN_DIVIDER        (LOAD_AVG_MAX - 1024)
+
 static inline u32 get_pelt_divider(struct sched_avg *avg)
 {
-        return LOAD_AVG_MAX - 1024 + avg->period_contrib;
+        return PELT_MIN_DIVIDER + avg->period_contrib;
 }
 
 static inline void cfs_se_util_change(struct sched_avg *avg)
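
The new PELT_MIN_DIVIDER define is what the max_t() clamps added throughout kernel/sched/fair.c lean on: the divider returned by get_pelt_divider() is PELT_MIN_DIVIDER plus a period_contrib that is always below 1024, so any *_avg derived from a *_sum can never exceed *_sum / PELT_MIN_DIVIDER, and *_avg * PELT_MIN_DIVIDER is therefore a safe lower bound to restore after the relaxed sub_positive() updates. A standalone user-space sketch of that invariant (the sample util_sum value is made up; only LOAD_AVG_MAX mirrors the kernel constant):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LOAD_AVG_MAX            47742
#define PELT_MIN_DIVIDER        (LOAD_AVG_MAX - 1024)

/* same shape as the kernel helper, minus struct sched_avg */
static uint32_t get_pelt_divider(uint32_t period_contrib)
{
        return PELT_MIN_DIVIDER + period_contrib;
}

int main(void)
{
        uint64_t util_sum = 123456;     /* arbitrary sample accumulator */
        uint32_t contrib;

        for (contrib = 0; contrib < 1024; contrib++) {
                uint32_t divider = get_pelt_divider(contrib);
                uint64_t util_avg = util_sum / divider;

                /* the lower bound the new max_t() clamps restore */
                assert(util_sum >= util_avg * PELT_MIN_DIVIDER);
        }
        printf("util_sum >= util_avg * PELT_MIN_DIVIDER for any period_contrib\n");
        return 0;
}

This is why the series can subtract per-entity sums directly instead of fully re-deriving them from the averages: even if rounding lets a sum drift slightly low between syncs, clamping it back to avg * PELT_MIN_DIVIDER keeps the sum/avg pair consistent.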
