
Commit 710ffe6

Suren Baghdasaryan authored and Peter Zijlstra committed
sched/psi: Stop relying on timer_pending() for poll_work rescheduling
The psi polling mechanism tries to minimize the number of wakeups needed to run psi_poll_work and currently relies on timer_pending() to detect when this work is already scheduled. This leaves a window of opportunity for psi_group_change to schedule an immediate psi_poll_work after poll_timer_fn got called but before psi_poll_work could reschedule itself. Below is a depiction of this entire window:

    poll_timer_fn
      wake_up_interruptible(&group->poll_wait);

                                        psi_poll_worker
                                          wait_event_interruptible(group->poll_wait, ...)
                                          psi_poll_work
                                            psi_schedule_poll_work
                                              if (timer_pending(&group->poll_timer)) return;
                                              ...
                                              mod_timer(&group->poll_timer, jiffies + delay);

Prior to 461daba we used to rely on the poll_scheduled atomic, which was reset and set back inside psi_poll_work, and therefore this race window was much smaller. The larger window causes an increased number of wakeups, and our partners report a visible power regression of ~10mA after applying 461daba.

Bring back the poll_scheduled atomic and make this race window even narrower by resetting poll_scheduled only when we reach the polling expiration time. This does not completely eliminate the possibility of extra wakeups caused by a race with psi_group_change, but it limits it to the worst-case scenario of one extra wakeup per tracking window (0.5s in the worst case).

This patch also ensures correct ordering between clearing the poll_scheduled flag and obtaining changed_states using a memory barrier. Correct ordering between updating changed_states and setting poll_scheduled is ensured by the atomic_xchg operation.

By tracing the number of immediate rescheduling attempts performed by psi_group_change and the number of these attempts being blocked because the psi monitor was already active, we can assess the effects of this change:

Before the patch:
                                         Run#1    Run#2    Run#3
Immediate reschedules attempted:         684365   1385156  1261240
Immediate reschedules blocked:           682846   1381654  1258682
Immediate reschedules (delta):           1519     3502     2558
Immediate reschedules (% of attempted):  0.22%    0.25%    0.20%

After the patch:
                                         Run#1    Run#2    Run#3
Immediate reschedules attempted:         882244   770298   426218
Immediate reschedules blocked:           881996   769796   426074
Immediate reschedules (delta):           248      502      144
Immediate reschedules (% of attempted):  0.03%    0.07%    0.03%

The number of non-blocked immediate reschedules dropped from 0.22-0.25% to 0.03-0.07%. The drop is attributed to the smaller race window and to the fact that we allow this race only when psi monitors reach the polling window expiration time.

Fixes: 461daba ("psi: eliminate kthread_worker from psi trigger scheduling mechanism")
Reported-by: Kathleen Chang <[email protected]>
Reported-by: Wenju Xu <[email protected]>
Reported-by: Jonathan Chen <[email protected]>
Signed-off-by: Suren Baghdasaryan <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Chengming Zhou <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Tested-by: SH Chen <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent 2fcd7bb commit 710ffe6
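To make the ordering argument in the commit message concrete, here is a minimal user-space sketch of the same protocol, assuming C11 atomics in place of the kernel's atomic_t, atomic_xchg() and smp_mb(). Only the names poll_scheduled and states come from the patch; the function names and the sequential main() are illustrative, not kernel code.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int poll_scheduled;       /* 1 while poll work is pending */
static atomic_int states;               /* stand-in for the monitored state bits */

/* Task-change side: publish the new state, then try to schedule the worker. */
static bool task_change_side(int new_states)
{
        atomic_store_explicit(&states, new_states, memory_order_relaxed);
        /* atomic_exchange() is a full barrier, like the kernel's atomic_xchg(). */
        return atomic_exchange(&poll_scheduled, 1) == 0;   /* true: wake the worker */
}

/* Poll-worker side: clear the flag, full barrier, then read the state. */
static int poll_worker_side(void)
{
        atomic_store_explicit(&poll_scheduled, 0, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);          /* kernel: smp_mb() */
        return atomic_load_explicit(&states, memory_order_relaxed);
}

int main(void)
{
        /*
         * Sequential demo only; the guarantee of interest is for concurrent use:
         * either the worker's load sees the new state, or the task-change side
         * reads poll_scheduled == 0 and schedules the worker again.
         */
        if (task_change_side(0x1))
                return poll_worker_side() == 0x1 ? 0 : 1;
        return 0;
}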

File tree

2 files changed: +53, -10 lines


include/linux/psi_types.h

Lines changed: 1 addition & 0 deletions
@@ -180,6 +180,7 @@ struct psi_group {
 	struct timer_list poll_timer;
 	wait_queue_head_t poll_wait;
 	atomic_t poll_wakeup;
+	atomic_t poll_scheduled;
 
 	/* Protects data used by the monitor */
 	struct mutex trigger_lock;

kernel/sched/psi.c

Lines changed: 52 additions & 10 deletions
@@ -189,6 +189,7 @@ static void group_init(struct psi_group *group)
 	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
 	/* Init trigger-related members */
+	atomic_set(&group->poll_scheduled, 0);
 	mutex_init(&group->trigger_lock);
 	INIT_LIST_HEAD(&group->triggers);
 	group->poll_min_period = U32_MAX;
@@ -589,18 +590,17 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }
 
-/* Schedule polling if it's not already scheduled. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+/* Schedule polling if it's not already scheduled or forced. */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
+				   bool force)
 {
 	struct task_struct *task;
 
 	/*
-	 * Do not reschedule if already scheduled.
-	 * Possible race with a timer scheduled after this check but before
-	 * mod_timer below can be tolerated because group->polling_next_update
-	 * will keep updates on schedule.
+	 * atomic_xchg should be called even when !force to provide a
+	 * full memory barrier (see the comment inside psi_poll_work).
 	 */
-	if (timer_pending(&group->poll_timer))
+	if (atomic_xchg(&group->poll_scheduled, 1) && !force)
 		return;
 
 	rcu_read_lock();
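As a quick illustration of the new gate above, the following user-space mock (illustrative only, not kernel code; the timer is replaced by a printout and the names mirror the diff) shows why atomic_exchange() runs unconditionally while an already-set flag only stops callers that did not pass force:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int poll_scheduled;

static void schedule_poll_work(unsigned long delay, bool force)
{
        /* Always exchange: the full barrier is needed even when force is set. */
        if (atomic_exchange(&poll_scheduled, 1) && !force)
                return;                 /* already scheduled and not forced */

        printf("poll work (re)armed, delay=%lu jiffies\n", delay);
}

int main(void)
{
        schedule_poll_work(1, false);   /* first caller arms the work */
        schedule_poll_work(1, false);   /* flag already set: silently dropped */
        schedule_poll_work(5, true);    /* forced: rearmed despite the set flag */
        return 0;
}

In the patch, psi_group_change() and psi_account_irqtime() are the force=false callers, while psi_poll_work() passes force_reschedule, which stays true as long as the polling window has not expired.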
@@ -612,19 +612,59 @@ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
 	 */
 	if (likely(task))
 		mod_timer(&group->poll_timer, jiffies + delay);
+	else
+		atomic_set(&group->poll_scheduled, 0);
 
 	rcu_read_unlock();
 }
 
 static void psi_poll_work(struct psi_group *group)
 {
+	bool force_reschedule = false;
 	u32 changed_states;
 	u64 now;
 
 	mutex_lock(&group->trigger_lock);
 
 	now = sched_clock();
 
+	if (now > group->polling_until) {
+		/*
+		 * We are either about to start or might stop polling if no
+		 * state change was recorded. Resetting poll_scheduled leaves
+		 * a small window for psi_group_change to sneak in and schedule
+		 * an immediate poll_work before we get to rescheduling. One
+		 * potential extra wakeup at the end of the polling window
+		 * should be negligible and polling_next_update still keeps
+		 * updates correctly on schedule.
+		 */
+		atomic_set(&group->poll_scheduled, 0);
+		/*
+		 * A task change can race with the poll worker that is supposed to
+		 * report on it. To avoid missing events, ensure ordering between
+		 * poll_scheduled and the task state accesses, such that if the poll
+		 * worker misses the state update, the task change is guaranteed to
+		 * reschedule the poll worker:
+		 *
+		 * poll worker:
+		 *   atomic_set(poll_scheduled, 0)
+		 *   smp_mb()
+		 *   LOAD states
+		 *
+		 * task change:
+		 *   STORE states
+		 *   if atomic_xchg(poll_scheduled, 1) == 0:
+		 *     schedule poll worker
+		 *
+		 * The atomic_xchg() implies a full barrier.
+		 */
+		smp_mb();
+	} else {
+		/* Polling window is not over, keep rescheduling */
+		force_reschedule = true;
+	}
+
+
 	collect_percpu_times(group, PSI_POLL, &changed_states);
 
 	if (changed_states & group->poll_states) {
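The comment block above compresses the design decision into the two branches of one if statement; this small stand-alone mock (again user space, with plain nanosecond timestamps instead of sched_clock(), and an illustrative function name) separates the two outcomes:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int poll_scheduled;

/* Returns the force_reschedule value psi_poll_work would use. */
static bool poll_window_check(unsigned long long now_ns,
                              unsigned long long polling_until_ns)
{
        if (now_ns > polling_until_ns) {
                /*
                 * Window over: drop the flag so opportunistic callers can
                 * schedule us again, and pair with a full fence before the
                 * state read (see the comment in the diff above).
                 */
                atomic_store(&poll_scheduled, 0);
                atomic_thread_fence(memory_order_seq_cst);      /* kernel: smp_mb() */
                return false;
        }
        /* Still inside the polling window: keep rescheduling ourselves. */
        return true;
}

int main(void)
{
        return (poll_window_check(2000, 1000) == false &&
                poll_window_check(500, 1000) == true) ? 0 : 1;
}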
@@ -650,7 +690,8 @@ static void psi_poll_work(struct psi_group *group)
 		group->polling_next_update = update_triggers(group, now);
 
 	psi_schedule_poll_work(group,
-		nsecs_to_jiffies(group->polling_next_update - now) + 1);
+		nsecs_to_jiffies(group->polling_next_update - now) + 1,
+		force_reschedule);
 
 out:
 	mutex_unlock(&group->trigger_lock);
@@ -811,7 +852,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	write_seqcount_end(&groupc->seq);
 
 	if (state_mask & group->poll_states)
-		psi_schedule_poll_work(group, 1);
+		psi_schedule_poll_work(group, 1, false);
 
 	if (wake_clock && !delayed_work_pending(&group->avgs_work))
 		schedule_delayed_work(&group->avgs_work, PSI_FREQ);
@@ -965,7 +1006,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 		write_seqcount_end(&groupc->seq);
 
 		if (group->poll_states & (1 << PSI_IRQ_FULL))
-			psi_schedule_poll_work(group, 1);
+			psi_schedule_poll_work(group, 1, false);
 	} while ((group = group->parent));
 }
 #endif
@@ -1351,6 +1392,7 @@ void psi_trigger_destroy(struct psi_trigger *t)
 		 * can no longer be found through group->poll_task.
 		 */
 		kthread_stop(task_to_destroy);
+		atomic_set(&group->poll_scheduled, 0);
 	}
 	kfree(t);
 }
