Skip to content

Commit 2fcd7bb

Browse files
Chengming Zhou authored and Peter Zijlstra committed
sched/psi: Fix avgs_work re-arm in psi_avgs_work()
Pavan reported a problem that PSI avgs_work idle shutoff is not working at all, because the PSI_NONIDLE condition would be observed in psi_avgs_work()->collect_percpu_times()->get_recent_times() even if only the kworker running avgs_work is on the CPU. Although commit 1b69ac6 ("psi: fix aggregation idle shut-off") avoided the ping-pong wake problem when the worker sleeps, psi_avgs_work() will still always re-arm the avgs_work, so shutoff is not working. This patch changes to use PSI_STATE_RESCHEDULE to flag whether to re-arm avgs_work in get_recent_times(). For the current CPU, we re-arm avgs_work only when (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0); for other CPUs we can just check the PSI_NONIDLE delta. The new flag is only used in psi_avgs_work(), so we check in get_recent_times() that current_work() is avgs_work. One potential problem is that the brief period of non-idle time incurred between the aggregation run and the kworker's dequeue will be stranded in the per-cpu buckets until avgs_work runs next time. The buckets can hold 4s worth of time, and future activity will wake the avgs_work with a 2s delay, giving us 2s worth of data we can leave behind when shutting off the avgs_work. If the kworker runs other work after avgs_work is shut off and doesn't have any scheduler activity for 2s, this may be a problem. Reported-by: Pavan Kondeti <[email protected]> Signed-off-by: Chengming Zhou <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Acked-by: Johannes Weiner <[email protected]> Acked-by: Suren Baghdasaryan <[email protected]> Tested-by: Chengming Zhou <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent e38f89a commit 2fcd7bb

File tree

2 files changed

+30
-3
lines changed

2 files changed

+30
-3
lines changed

include/linux/psi_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ enum psi_states {
7272
/* Use one bit in the state mask to track TSK_ONCPU */
7373
#define PSI_ONCPU (1 << NR_PSI_STATES)
7474

75+
/* Flag whether to re-arm avgs_work, see details in get_recent_times() */
76+
#define PSI_STATE_RESCHEDULE (1 << (NR_PSI_STATES + 1))
77+
7578
enum psi_aggregators {
7679
PSI_AVGS = 0,
7780
PSI_POLL,

kernel/sched/psi.c

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
242242
u32 *pchanged_states)
243243
{
244244
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
245+
int current_cpu = raw_smp_processor_id();
246+
unsigned int tasks[NR_PSI_TASK_COUNTS];
245247
u64 now, state_start;
246248
enum psi_states s;
247249
unsigned int seq;
@@ -256,6 +258,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
256258
memcpy(times, groupc->times, sizeof(groupc->times));
257259
state_mask = groupc->state_mask;
258260
state_start = groupc->state_start;
261+
if (cpu == current_cpu)
262+
memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
259263
} while (read_seqcount_retry(&groupc->seq, seq));
260264

261265
/* Calculate state time deltas against the previous snapshot */
@@ -280,6 +284,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
280284
if (delta)
281285
*pchanged_states |= (1 << s);
282286
}
287+
288+
/*
289+
* When collect_percpu_times() from the avgs_work, we don't want to
290+
* re-arm avgs_work when all CPUs are IDLE. But the current CPU running
291+
* this avgs_work is never IDLE, cause avgs_work can't be shut off.
292+
* So for the current CPU, we need to re-arm avgs_work only when
293+
* (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
294+
* we can just check PSI_NONIDLE delta.
295+
*/
296+
if (current_work() == &group->avgs_work.work) {
297+
bool reschedule;
298+
299+
if (cpu == current_cpu)
300+
reschedule = tasks[NR_RUNNING] +
301+
tasks[NR_IOWAIT] +
302+
tasks[NR_MEMSTALL] > 1;
303+
else
304+
reschedule = *pchanged_states & (1 << PSI_NONIDLE);
305+
306+
if (reschedule)
307+
*pchanged_states |= PSI_STATE_RESCHEDULE;
308+
}
283309
}
284310

285311
static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -415,7 +441,6 @@ static void psi_avgs_work(struct work_struct *work)
415441
struct delayed_work *dwork;
416442
struct psi_group *group;
417443
u32 changed_states;
418-
bool nonidle;
419444
u64 now;
420445

421446
dwork = to_delayed_work(work);
@@ -426,7 +451,6 @@ static void psi_avgs_work(struct work_struct *work)
426451
now = sched_clock();
427452

428453
collect_percpu_times(group, PSI_AVGS, &changed_states);
429-
nonidle = changed_states & (1 << PSI_NONIDLE);
430454
/*
431455
* If there is task activity, periodically fold the per-cpu
432456
* times and feed samples into the running averages. If things
@@ -437,7 +461,7 @@ static void psi_avgs_work(struct work_struct *work)
437461
if (now >= group->avg_next_update)
438462
group->avg_next_update = update_averages(group, now);
439463

440-
if (nonidle) {
464+
if (changed_states & PSI_STATE_RESCHEDULE) {
441465
schedule_delayed_work(dwork, nsecs_to_jiffies(
442466
group->avg_next_update - now) + 1);
443467
}

0 commit comments

Comments
 (0)