Skip to content

Commit 3840cbe

Browse files
hnaztorvalds
authored andcommitted
sched: psi: fix bogus pressure spikes from aggregation race
Brandon reports sporadic, non-sensical spikes in cumulative pressure time (total=) when reading cpu.pressure at a high rate. This is due to a race condition between reader aggregation and tasks changing states. While it affects all states and all resources captured by PSI, in practice it most likely triggers with CPU pressure, since scheduling events are so frequent compared to other resource events. The race context is the live snooping of ongoing stalls during a pressure read. The read aggregates per-cpu records for stalls that have concluded, but will also incorporate ad-hoc the duration of any active state that hasn't been recorded yet. This is important to get timely measurements of ongoing stalls. Those ad-hoc samples are calculated on-the-fly up to the current time on that CPU; since the stall hasn't concluded, it's expected that this is the minimum amount of stall time that will enter the per-cpu records once it does. The problem is that the path that concludes the state uses a CPU clock read that is not synchronized against aggregators; the clock is read outside of the seqlock protection. This allows aggregators to race and snoop a stall with a longer duration than will actually be recorded. With the recorded stall time being less than the last snapshot remembered by the aggregator, a subsequent sample will underflow and observe a bogus delta value, resulting in an erratic jump in pressure. Fix this by moving the clock read of the state change into the seqlock protection. This ensures no aggregation can snoop live stalls past the time that's recorded when the state concludes. Reported-by: Brandon Duffany <[email protected]> Link: https://bugzilla.kernel.org/show_bug.cgi?id=219194 Link: https://lore.kernel.org/lkml/[email protected]/ Fixes: df77430 ("psi: Reduce calls to sched_clock() in psi") Cc: [email protected] Signed-off-by: Johannes Weiner <[email protected]> Reviewed-by: Chengming Zhou <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 8c245fe commit 3840cbe

File tree

1 file changed

+12
-14
lines changed

1 file changed

+12
-14
lines changed

kernel/sched/psi.c

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -769,12 +769,13 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
769769
}
770770

771771
static void psi_group_change(struct psi_group *group, int cpu,
772-
unsigned int clear, unsigned int set, u64 now,
772+
unsigned int clear, unsigned int set,
773773
bool wake_clock)
774774
{
775775
struct psi_group_cpu *groupc;
776776
unsigned int t, m;
777777
u32 state_mask;
778+
u64 now;
778779

779780
lockdep_assert_rq_held(cpu_rq(cpu));
780781
groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -789,6 +790,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
789790
* SOME and FULL time these may have resulted in.
790791
*/
791792
write_seqcount_begin(&groupc->seq);
793+
now = cpu_clock(cpu);
792794

793795
/*
794796
* Start with TSK_ONCPU, which doesn't have a corresponding
@@ -899,18 +901,15 @@ void psi_task_change(struct task_struct *task, int clear, int set)
899901
{
900902
int cpu = task_cpu(task);
901903
struct psi_group *group;
902-
u64 now;
903904

904905
if (!task->pid)
905906
return;
906907

907908
psi_flags_change(task, clear, set);
908909

909-
now = cpu_clock(cpu);
910-
911910
group = task_psi_group(task);
912911
do {
913-
psi_group_change(group, cpu, clear, set, now, true);
912+
psi_group_change(group, cpu, clear, set, true);
914913
} while ((group = group->parent));
915914
}
916915

@@ -919,7 +918,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
919918
{
920919
struct psi_group *group, *common = NULL;
921920
int cpu = task_cpu(prev);
922-
u64 now = cpu_clock(cpu);
923921

924922
if (next->pid) {
925923
psi_flags_change(next, 0, TSK_ONCPU);
@@ -936,7 +934,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
936934
break;
937935
}
938936

939-
psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
937+
psi_group_change(group, cpu, 0, TSK_ONCPU, true);
940938
} while ((group = group->parent));
941939
}
942940

@@ -974,7 +972,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
974972
do {
975973
if (group == common)
976974
break;
977-
psi_group_change(group, cpu, clear, set, now, wake_clock);
975+
psi_group_change(group, cpu, clear, set, wake_clock);
978976
} while ((group = group->parent));
979977

980978
/*
@@ -986,7 +984,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
986984
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
987985
clear &= ~TSK_ONCPU;
988986
for (; group; group = group->parent)
989-
psi_group_change(group, cpu, clear, set, now, wake_clock);
987+
psi_group_change(group, cpu, clear, set, wake_clock);
990988
}
991989
}
992990
}
@@ -997,8 +995,8 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
997995
int cpu = task_cpu(curr);
998996
struct psi_group *group;
999997
struct psi_group_cpu *groupc;
1000-
u64 now, irq;
1001998
s64 delta;
999+
u64 irq;
10021000

10031001
if (static_branch_likely(&psi_disabled))
10041002
return;
@@ -1011,20 +1009,22 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
10111009
if (prev && task_psi_group(prev) == group)
10121010
return;
10131011

1014-
now = cpu_clock(cpu);
10151012
irq = irq_time_read(cpu);
10161013
delta = (s64)(irq - rq->psi_irq_time);
10171014
if (delta < 0)
10181015
return;
10191016
rq->psi_irq_time = irq;
10201017

10211018
do {
1019+
u64 now;
1020+
10221021
if (!group->enabled)
10231022
continue;
10241023

10251024
groupc = per_cpu_ptr(group->pcpu, cpu);
10261025

10271026
write_seqcount_begin(&groupc->seq);
1027+
now = cpu_clock(cpu);
10281028

10291029
record_times(groupc, now);
10301030
groupc->times[PSI_IRQ_FULL] += delta;
@@ -1223,11 +1223,9 @@ void psi_cgroup_restart(struct psi_group *group)
12231223
for_each_possible_cpu(cpu) {
12241224
struct rq *rq = cpu_rq(cpu);
12251225
struct rq_flags rf;
1226-
u64 now;
12271226

12281227
rq_lock_irq(rq, &rf);
1229-
now = cpu_clock(cpu);
1230-
psi_group_change(group, cpu, 0, 0, now, true);
1228+
psi_group_change(group, cpu, 0, 0, true);
12311229
rq_unlock_irq(rq, &rf);
12321230
}
12331231
}

0 commit comments

Comments
 (0)