
Commit 36b238d

hnaz authored and Peter Zijlstra committed
psi: Optimize switching tasks inside shared cgroups
When switching tasks running on a CPU, the psi state of a cgroup containing both of these tasks does not change. Right now, we don't exploit that, and can perform many unnecessary state changes in nested hierarchies, especially when most activity comes from one leaf cgroup.

This patch implements an optimization where we only update cgroups whose state actually changes during a task switch. These are all cgroups that contain one task but not the other, up to the first shared ancestor. When both tasks are in the same group, we don't need to update anything at all.

We can identify the first shared ancestor by walking the groups of the incoming task until we see TSK_ONCPU set on the local CPU; that's the first group that also contains the outgoing task.

The new psi_task_switch() is similar to psi_task_change(). To allow code reuse, move the task flag maintenance code into a new function and the poll/avg worker wakeups into the shared psi_group_change().

Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Johannes Weiner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent b05e75d · commit 36b238d
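To make the ancestor walk concrete, here is a minimal userspace sketch of the idea (an illustration only, assuming a toy struct group with a parent link and a single-CPU nr_oncpu counter; none of these names are the kernel's):

#include <stdio.h>

struct group {
	const char *name;
	struct group *parent;
	int nr_oncpu;		/* tasks in this subtree currently on the CPU */
};

static struct group *set_oncpu(struct group *g)
{
	/* Update the incoming task's groups until one already has an
	 * on-CPU task: that group contains the outgoing task too, so
	 * neither it nor any ancestor changes state. */
	for (; g; g = g->parent) {
		if (g->nr_oncpu)
			return g;	/* first shared ancestor */
		g->nr_oncpu++;
		printf("update %s: set ONCPU\n", g->name);
	}
	return NULL;
}

static void clear_oncpu(struct group *g, struct group *common)
{
	/* Clear the outgoing task's groups, stopping at the shared
	 * ancestor found above. */
	for (; g && g != common; g = g->parent) {
		g->nr_oncpu--;
		printf("update %s: clear ONCPU\n", g->name);
	}
}

int main(void)
{
	/* Hypothetical hierarchy root <- A <- {B, C}; prev runs in B,
	 * so B, A and root each count one on-CPU task. */
	struct group root = { "root", NULL, 1 };
	struct group A    = { "A", &root, 1 };
	struct group B    = { "B", &A, 1 };
	struct group C    = { "C", &A, 0 };

	/* Preemption switch from prev (in B) to next (in C): only the
	 * leaves B and C are updated; A and root are never touched. */
	struct group *common = set_oncpu(&C);
	clear_oncpu(&B, common);
	printf("stopped at shared ancestor %s\n",
	       common ? common->name : "(none)");
	return 0;
}

Run on this hypothetical root <- A <- {B, C} hierarchy, the sketch updates only C and B and stops at A, which corresponds to where psi_task_switch() below breaks out of its loop once it sees tasks[NR_ONCPU] set.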

File tree

3 files changed: +70 -28


include/linux/psi.h

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,8 @@ extern struct psi_group psi_system;
 void psi_init(void);
 
 void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+		     bool sleep);
 
 void psi_memstall_tick(struct task_struct *task, int cpu);
 void psi_memstall_enter(unsigned long *flags);

kernel/sched/psi.c

Lines changed: 67 additions & 20 deletions
@@ -669,13 +669,14 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 	groupc->times[PSI_NONIDLE] += delta;
 }
 
-static u32 psi_group_change(struct psi_group *group, int cpu,
-			    unsigned int clear, unsigned int set)
+static void psi_group_change(struct psi_group *group, int cpu,
+			     unsigned int clear, unsigned int set,
+			     bool wake_clock)
 {
 	struct psi_group_cpu *groupc;
+	u32 state_mask = 0;
 	unsigned int t, m;
 	enum psi_states s;
-	u32 state_mask = 0;
 
 	groupc = per_cpu_ptr(group->pcpu, cpu);
 
@@ -717,7 +718,11 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
 
 	write_seqcount_end(&groupc->seq);
 
-	return state_mask;
+	if (state_mask & group->poll_states)
+		psi_schedule_poll_work(group, 1);
+
+	if (wake_clock && !delayed_work_pending(&group->avgs_work))
+		schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 }
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -744,27 +749,32 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
 	return &psi_system;
 }
 
-void psi_task_change(struct task_struct *task, int clear, int set)
+static void psi_flags_change(struct task_struct *task, int clear, int set)
 {
-	int cpu = task_cpu(task);
-	struct psi_group *group;
-	bool wake_clock = true;
-	void *iter = NULL;
-
-	if (!task->pid)
-		return;
-
 	if (((task->psi_flags & set) ||
 	     (task->psi_flags & clear) != clear) &&
 	    !psi_bug) {
 		printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
-				task->pid, task->comm, cpu,
+				task->pid, task->comm, task_cpu(task),
 				task->psi_flags, clear, set);
 		psi_bug = 1;
 	}
 
 	task->psi_flags &= ~clear;
 	task->psi_flags |= set;
+}
+
+void psi_task_change(struct task_struct *task, int clear, int set)
+{
+	int cpu = task_cpu(task);
+	struct psi_group *group;
+	bool wake_clock = true;
+	void *iter = NULL;
+
+	if (!task->pid)
+		return;
+
+	psi_flags_change(task, clear, set);
 
 	/*
 	 * Periodic aggregation shuts off if there is a period of no
@@ -777,14 +787,51 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 	    wq_worker_last_func(task) == psi_avgs_work))
 		wake_clock = false;
 
-	while ((group = iterate_groups(task, &iter))) {
-		u32 state_mask = psi_group_change(group, cpu, clear, set);
+	while ((group = iterate_groups(task, &iter)))
+		psi_group_change(group, cpu, clear, set, wake_clock);
+}
+
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+		     bool sleep)
+{
+	struct psi_group *group, *common = NULL;
+	int cpu = task_cpu(prev);
+	void *iter;
+
+	if (next->pid) {
+		psi_flags_change(next, 0, TSK_ONCPU);
+		/*
+		 * When moving state between tasks, the group that
+		 * contains them both does not change: we can stop
+		 * updating the tree once we reach the first common
+		 * ancestor. Iterate @next's ancestors until we
+		 * encounter @prev's state.
+		 */
+		iter = NULL;
+		while ((group = iterate_groups(next, &iter))) {
+			if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+				common = group;
+				break;
+			}
+
+			psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+		}
+	}
+
+	/*
+	 * If this is a voluntary sleep, dequeue will have taken care
+	 * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
+	 * only need to deal with it during preemption.
+	 */
+	if (sleep)
+		return;
 
-		if (state_mask & group->poll_states)
-			psi_schedule_poll_work(group, 1);
+	if (prev->pid) {
+		psi_flags_change(prev, TSK_ONCPU, 0);
 
-		if (wake_clock && !delayed_work_pending(&group->avgs_work))
-			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+		iter = NULL;
+		while ((group = iterate_groups(prev, &iter)) && group != common)
+			psi_group_change(group, cpu, TSK_ONCPU, 0, true);
 	}
 }
 

kernel/sched/stats.h

Lines changed: 1 addition & 8 deletions
@@ -141,14 +141,7 @@ static inline void psi_sched_switch(struct task_struct *prev,
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	/*
-	 * Clear the TSK_ONCPU state if the task was preempted. If
-	 * it's a voluntary sleep, dequeue will have taken care of it.
-	 */
-	if (!sleep)
-		psi_task_change(prev, TSK_ONCPU, 0);
-
-	psi_task_change(next, 0, TSK_ONCPU);
+	psi_task_switch(prev, next, sleep);
 }
 
 static inline void psi_task_tick(struct rq *rq)
