Commit 99f9259

Merge tag 'sched-urgent-2021-06-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
 "Misc fixes:

   - Fix performance regression caused by lack of intended batching of
     RCU callbacks by over-eager NOHZ-full code.

   - Fix cgroups related corruption of load_avg and load_sum metrics.

   - Three fixes to fix blocked load, util_sum/runnable_sum and util_est
     tracking bugs"

* tag 'sched-urgent-2021-06-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling
  sched/pelt: Ensure that *_sum is always synced with *_avg
  tick/nohz: Only check for RCU deferred wakeup on user/guest entry when needed
  sched/fair: Make sure to update tg contrib for blocked load
  sched/fair: Keep load_avg and load_sum synced
2 parents 191aaf6 + 68d7a19 commit 99f9259

8 files changed, 41 insertions(+), 25 deletions(-)

include/linux/entry-kvm.h

Lines changed: 2 additions & 1 deletion
@@ -3,6 +3,7 @@
 #define __LINUX_ENTRYKVM_H
 
 #include <linux/entry-common.h>
+#include <linux/tick.h>
 
 /* Transfer to guest mode work */
 #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
@@ -57,7 +58,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
 static inline void xfer_to_guest_mode_prepare(void)
 {
 	lockdep_assert_irqs_disabled();
-	rcu_nocb_flush_deferred_wakeup();
+	tick_nohz_user_enter_prepare();
 }
 
 /**

include/linux/sched.h

Lines changed: 8 additions & 0 deletions
@@ -350,11 +350,19 @@ struct load_weight {
  * Only for tasks we track a moving average of the past instantaneous
  * estimated utilization. This allows to absorb sporadic drops in utilization
  * of an otherwise almost periodic task.
+ *
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est.enqueued at dequeue
+ * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
+ * for a task) it is safe to use MSB.
  */
 struct util_est {
 	unsigned int			enqueued;
 	unsigned int			ewma;
 #define UTIL_EST_WEIGHT_SHIFT		2
+#define UTIL_AVG_UNCHANGED		0x80000000
 } __attribute__((__aligned__(sizeof(u64))));
 
 /*
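
The new comment documents the flag's lifecycle: it is set when util_est is written at dequeue, cleared once util_avg has actually been updated, and masked out by every reader. The user-space sketch below models that lifecycle; UTIL_AVG_UNCHANGED and the util_est fields mirror the kernel definitions, while the helper names and the main() harness are purely illustrative.

#include <stdio.h>

#define UTIL_AVG_UNCHANGED 0x80000000u	/* MSB of util_est.enqueued */

struct util_est { unsigned int enqueued; unsigned int ewma; };

/* Dequeue path: store the estimate and mark util_avg as "unchanged since". */
static void util_est_store(struct util_est *ue, unsigned int task_util)
{
	ue->enqueued = task_util | UTIL_AVG_UNCHANGED;
}

/* PELT update path: util_avg changed, so drop the flag (cf. cfs_se_util_change). */
static void util_avg_changed(struct util_est *ue)
{
	ue->enqueued &= ~UTIL_AVG_UNCHANGED;
}

/* Reader path: always mask the flag out before using the value. */
static unsigned int util_est_value(const struct util_est *ue)
{
	unsigned int enq = ue->enqueued & ~UTIL_AVG_UNCHANGED;

	return ue->ewma > enq ? ue->ewma : enq;
}

int main(void)
{
	struct util_est ue = { .enqueued = 0, .ewma = 100 };

	util_est_store(&ue, 300);		/* dequeue with util 300 */
	printf("reader sees %u\n", util_est_value(&ue));	/* 300, flag hidden */
	util_avg_changed(&ue);			/* next util_avg update clears the MSB */
	printf("flag is %s\n", (ue.enqueued & UTIL_AVG_UNCHANGED) ? "set" : "clear");
	return 0;
}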

include/linux/tick.h

Lines changed: 7 additions & 0 deletions
@@ -11,6 +11,7 @@
 #include <linux/context_tracking_state.h>
 #include <linux/cpumask.h>
 #include <linux/sched.h>
+#include <linux/rcupdate.h>
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 extern void __init tick_init(void);
@@ -300,4 +301,10 @@ static inline void tick_nohz_task_switch(void)
 		__tick_nohz_task_switch();
 }
 
+static inline void tick_nohz_user_enter_prepare(void)
+{
+	if (tick_nohz_full_cpu(smp_processor_id()))
+		rcu_nocb_flush_deferred_wakeup();
+}
+
 #endif
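
The new helper flushes deferred rcuog wakeups only when the current CPU actually runs in nohz_full mode, which restores RCU callback batching everywhere else. A small user-space model of that behaviour follows; the bitmap, the counter and the explicit cpu argument are stand-ins (the kernel helper uses smp_processor_id() and the real tick_nohz_full_cpu()/tick_nohz_full_mask).

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for tick_nohz_full_running / tick_nohz_full_mask. */
static bool nohz_full_running;
static unsigned long nohz_full_mask;
static int deferred_flushes;

static bool tick_nohz_full_cpu(int cpu)		/* simplified check */
{
	return nohz_full_running && (nohz_full_mask & (1UL << cpu));
}

static void rcu_nocb_flush_deferred_wakeup(void)	/* stand-in: count calls */
{
	deferred_flushes++;
}

/* Model of the new helper: only nohz_full CPUs flush on user/guest entry. */
static void tick_nohz_user_enter_prepare(int cpu)
{
	if (tick_nohz_full_cpu(cpu))
		rcu_nocb_flush_deferred_wakeup();
}

int main(void)
{
	nohz_full_running = true;
	nohz_full_mask = 1UL << 3;		/* pretend CPU 3 runs nohz_full */

	tick_nohz_user_enter_prepare(0);	/* housekeeping CPU: no flush */
	tick_nohz_user_enter_prepare(3);	/* nohz_full CPU: flush */
	printf("flushes = %d\n", deferred_flushes);	/* prints 1 */
	return 0;
}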

kernel/entry/common.c

Lines changed: 3 additions & 2 deletions
@@ -5,6 +5,7 @@
 #include <linux/highmem.h>
 #include <linux/livepatch.h>
 #include <linux/audit.h>
+#include <linux/tick.h>
 
 #include "common.h"
 
@@ -186,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 		local_irq_disable_exit_to_user();
 
 		/* Check if any of the above work has queued a deferred wakeup */
-		rcu_nocb_flush_deferred_wakeup();
+		tick_nohz_user_enter_prepare();
 
 		ti_work = READ_ONCE(current_thread_info()->flags);
 	}
@@ -202,7 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
 	lockdep_assert_irqs_disabled();
 
 	/* Flush pending rcuog wakeup before the last need_resched() check */
-	rcu_nocb_flush_deferred_wakeup();
+	tick_nohz_user_enter_prepare();
 
 	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
 		ti_work = exit_to_user_mode_loop(regs, ti_work);

kernel/sched/debug.c

Lines changed: 2 additions & 1 deletion
@@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = {
 #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
 #define __P(F) __PS(#F, F)
 #define   P(F) __PS(#F, p->F)
+#define   PM(F, M) __PS(#F, p->F & (M))
 #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
 #define __PN(F) __PSN(#F, F)
 #define   PN(F) __PSN(#F, p->F)
@@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 		P(se.avg.util_avg);
 		P(se.avg.last_update_time);
 		P(se.avg.util_est.ewma);
-		P(se.avg.util_est.enqueued);
+		PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
 #endif
 #ifdef CONFIG_UCLAMP_TASK
 	__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);

kernel/sched/fair.c

Lines changed: 17 additions & 11 deletions
@@ -3499,10 +3499,9 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
 static inline void
 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+	long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
 	unsigned long load_avg;
 	u64 load_sum = 0;
-	s64 delta_sum;
 	u32 divider;
 
 	if (!runnable_sum)
@@ -3549,13 +3548,13 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
 	load_sum = (s64)se_weight(se) * runnable_sum;
 	load_avg = div_s64(load_sum, divider);
 
-	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
-	delta_avg = load_avg - se->avg.load_avg;
+	delta = load_avg - se->avg.load_avg;
 
 	se->avg.load_sum = runnable_sum;
 	se->avg.load_avg = load_avg;
-	add_positive(&cfs_rq->avg.load_avg, delta_avg);
-	add_positive(&cfs_rq->avg.load_sum, delta_sum);
+
+	add_positive(&cfs_rq->avg.load_avg, delta);
+	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
 }
 
 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3766,11 +3765,17 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	/*
+	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+	 * See ___update_load_avg() for details.
+	 */
+	u32 divider = get_pelt_divider(&cfs_rq->avg);
+
 	dequeue_load_avg(cfs_rq, se);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
-	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
 	sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
-	sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
 
 	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
 
@@ -3902,7 +3907,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
 {
 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 
-	return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -4002,7 +4007,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * Reset EWMA on utilization increases, the moving average is used only
 	 * to smooth utilization decreases.
 	 */
-	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+	ue.enqueued = task_util(p);
 	if (sched_feat(UTIL_EST_FASTUP)) {
 		if (ue.ewma < ue.enqueued) {
 			ue.ewma = ue.enqueued;
@@ -4051,6 +4056,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	ue.ewma  += last_ewma_diff;
 	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
+	ue.enqueued |= UTIL_AVG_UNCHANGED;
 	WRITE_ONCE(p->se.avg.util_est, ue);
 
 	trace_sched_util_est_se_tp(&p->se);
@@ -8030,7 +8036,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
 		/* Propagate pending load changes to the parent, if any: */
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
-			update_load_avg(cfs_rq_of(se), se, 0);
+			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
 
 		/*
 		 * There can be a lot of idle CPU cgroups. Don't let fully
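
The common pattern in the *_sum hunks above is that a PELT sum is no longer decremented independently of its average; once the *_avg has been updated, the *_sum is rebuilt from it using the PELT divider, so the pair cannot drift apart. Below is a minimal user-space arithmetic sketch of that invariant; LOAD_AVG_MAX and the divider formula come from kernel/sched/pelt.h, the sample values are made up.

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX 47742	/* maximum of the PELT geometric series */

/* Same formula as get_pelt_divider() in kernel/sched/pelt.h. */
static uint32_t pelt_divider(uint32_t period_contrib)
{
	return LOAD_AVG_MAX - 1024 + period_contrib;
}

int main(void)
{
	uint32_t divider = pelt_divider(512);	/* example period_contrib */
	unsigned long util_avg = 123;		/* made-up, freshly updated avg */

	/* After the fix, the sum is recomputed from the updated avg rather
	 * than adjusted by a separate delta, keeping the pair consistent. */
	uint64_t util_sum = (uint64_t)util_avg * divider;

	printf("divider=%u util_sum=%llu\n", divider,
	       (unsigned long long)util_sum);
	return 0;
}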

kernel/sched/pelt.h

Lines changed: 1 addition & 10 deletions
@@ -42,23 +42,14 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
 	return LOAD_AVG_MAX - 1024 + avg->period_contrib;
 }
 
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
-
 static inline void cfs_se_util_change(struct sched_avg *avg)
 {
 	unsigned int enqueued;
 
 	if (!sched_feat(UTIL_EST))
 		return;
 
-	/* Avoid store if the flag has been already set */
+	/* Avoid store if the flag has been already reset */
 	enqueued = avg->util_est.enqueued;
 	if (!(enqueued & UTIL_AVG_UNCHANGED))
 		return;
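
UTIL_AVG_UNCHANGED and its comment move from pelt.h to include/linux/sched.h, and the flag moves from the LSB to the MSB: with the old 0x1 encoding the flag could leak into the reported utilization (see the old _task_util_est(), which ORed it into the return value), whereas a task's util_est.enqueued never exceeds 1024, so bit 31 cannot collide with a real value. A tiny stand-alone check of that claim follows; SCHED_CAPACITY_SCALE is the kernel's 1024 task-utilization ceiling, and the program itself is only illustrative.

#include <assert.h>

#define UTIL_AVG_UNCHANGED	0x80000000u
#define SCHED_CAPACITY_SCALE	1024u	/* ceiling of a task's util_est.enqueued */

int main(void)
{
	/* The largest legitimate value shares no bits with the flag... */
	assert((SCHED_CAPACITY_SCALE & UTIL_AVG_UNCHANGED) == 0);

	/* ...so masking the flag out always restores the stored value. */
	unsigned int stored = 777u | UTIL_AVG_UNCHANGED;

	assert((stored & ~UTIL_AVG_UNCHANGED) == 777u);
	return 0;
}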

kernel/time/tick-sched.c

Lines changed: 1 addition & 0 deletions
@@ -230,6 +230,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 
 #ifdef CONFIG_NO_HZ_FULL
 cpumask_var_t tick_nohz_full_mask;
+EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
 bool tick_nohz_full_running;
 EXPORT_SYMBOL_GPL(tick_nohz_full_running);
 static atomic_t tick_dep_mask;
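
Exporting tick_nohz_full_mask is presumably what allows the new inline helper to be reached from modular code: xfer_to_guest_mode_prepare() in entry-kvm.h can be inlined into KVM when it is built as a module, and it now tests the mask through tick_nohz_full_cpu(). A sketch of the dependency chain, with the intermediate details simplified:

/*
 * Dependency chain that now reaches the mask from modular code (sketch):
 *
 *   kvm.ko -> xfer_to_guest_mode_prepare()       [inline, entry-kvm.h]
 *          -> tick_nohz_user_enter_prepare()     [inline, tick.h]
 *          -> tick_nohz_full_cpu(smp_processor_id())
 *          -> test of tick_nohz_full_mask
 *
 * Without the EXPORT_SYMBOL_GPL(), a modular KVM build would fail to link
 * against that last reference.
 */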
