Skip to content

Commit 461daba

Browse files
surenbaghdasaryan and Peter Zijlstra
authored and committed
psi: eliminate kthread_worker from psi trigger scheduling mechanism
Each psi group requires a dedicated kthread_delayed_work and kthread_worker. Since no other work can be performed using psi_group's kthread_worker, the same result can be obtained using a task_struct and a timer directly. This makes psi triggering simpler by removing lists and locks involved with kthread_worker usage and eliminates the need for poll_scheduled atomic use in the hot path. Signed-off-by: Suren Baghdasaryan <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Link: https://lkml.kernel.org/r/[email protected]
1 parent f4291df commit 461daba

File tree

2 files changed

+68
-52
lines changed

2 files changed

+68
-52
lines changed

include/linux/psi_types.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,10 @@ struct psi_group {
153153
unsigned long avg[NR_PSI_STATES - 1][3];
154154

155155
/* Monitor work control */
156-
atomic_t poll_scheduled;
157-
struct kthread_worker __rcu *poll_kworker;
158-
struct kthread_delayed_work poll_work;
156+
struct task_struct __rcu *poll_task;
157+
struct timer_list poll_timer;
158+
wait_queue_head_t poll_wait;
159+
atomic_t poll_wakeup;
159160

160161
/* Protects data used by the monitor */
161162
struct mutex trigger_lock;

kernel/sched/psi.c

Lines changed: 64 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,6 @@ static void group_init(struct psi_group *group)
190190
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
191191
mutex_init(&group->avgs_lock);
192192
/* Init trigger-related members */
193-
atomic_set(&group->poll_scheduled, 0);
194193
mutex_init(&group->trigger_lock);
195194
INIT_LIST_HEAD(&group->triggers);
196195
memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
@@ -199,7 +198,7 @@ static void group_init(struct psi_group *group)
199198
memset(group->polling_total, 0, sizeof(group->polling_total));
200199
group->polling_next_update = ULLONG_MAX;
201200
group->polling_until = 0;
202-
rcu_assign_pointer(group->poll_kworker, NULL);
201+
rcu_assign_pointer(group->poll_task, NULL);
203202
}
204203

205204
void __init psi_init(void)
@@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now)
547546
return now + group->poll_min_period;
548547
}
549548

550-
/*
551-
* Schedule polling if it's not already scheduled. It's safe to call even from
552-
* hotpath because even though kthread_queue_delayed_work takes worker->lock
553-
* spinlock that spinlock is never contended due to poll_scheduled atomic
554-
* preventing such competition.
555-
*/
549+
/* Schedule polling if it's not already scheduled. */
556550
static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
557551
{
558-
struct kthread_worker *kworker;
552+
struct task_struct *task;
559553

560-
/* Do not reschedule if already scheduled */
561-
if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
554+
/*
555+
* Do not reschedule if already scheduled.
556+
* Possible race with a timer scheduled after this check but before
557+
* mod_timer below can be tolerated because group->polling_next_update
558+
* will keep updates on schedule.
559+
*/
560+
if (timer_pending(&group->poll_timer))
562561
return;
563562

564563
rcu_read_lock();
565564

566-
kworker = rcu_dereference(group->poll_kworker);
565+
task = rcu_dereference(group->poll_task);
567566
/*
568567
* kworker might be NULL in case psi_trigger_destroy races with
569568
* psi_task_change (hotpath) which can't use locks
570569
*/
571-
if (likely(kworker))
572-
kthread_queue_delayed_work(kworker, &group->poll_work, delay);
573-
else
574-
atomic_set(&group->poll_scheduled, 0);
570+
if (likely(task))
571+
mod_timer(&group->poll_timer, jiffies + delay);
575572

576573
rcu_read_unlock();
577574
}
578575

579-
static void psi_poll_work(struct kthread_work *work)
576+
static void psi_poll_work(struct psi_group *group)
580577
{
581-
struct kthread_delayed_work *dwork;
582-
struct psi_group *group;
583578
u32 changed_states;
584579
u64 now;
585580

586-
dwork = container_of(work, struct kthread_delayed_work, work);
587-
group = container_of(dwork, struct psi_group, poll_work);
588-
589-
atomic_set(&group->poll_scheduled, 0);
590-
591581
mutex_lock(&group->trigger_lock);
592582

593583
now = sched_clock();
@@ -623,6 +613,35 @@ static void psi_poll_work(struct kthread_work *work)
623613
mutex_unlock(&group->trigger_lock);
624614
}
625615

616+
static int psi_poll_worker(void *data)
617+
{
618+
struct psi_group *group = (struct psi_group *)data;
619+
struct sched_param param = {
620+
.sched_priority = 1,
621+
};
622+
623+
sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
624+
625+
while (true) {
626+
wait_event_interruptible(group->poll_wait,
627+
atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
628+
kthread_should_stop());
629+
if (kthread_should_stop())
630+
break;
631+
632+
psi_poll_work(group);
633+
}
634+
return 0;
635+
}
636+
637+
static void poll_timer_fn(struct timer_list *t)
638+
{
639+
struct psi_group *group = from_timer(group, t, poll_timer);
640+
641+
atomic_set(&group->poll_wakeup, 1);
642+
wake_up_interruptible(&group->poll_wait);
643+
}
644+
626645
static void record_times(struct psi_group_cpu *groupc, int cpu,
627646
bool memstall_tick)
628647
{
@@ -1099,22 +1118,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
10991118

11001119
mutex_lock(&group->trigger_lock);
11011120

1102-
if (!rcu_access_pointer(group->poll_kworker)) {
1103-
struct sched_param param = {
1104-
.sched_priority = 1,
1105-
};
1106-
struct kthread_worker *kworker;
1121+
if (!rcu_access_pointer(group->poll_task)) {
1122+
struct task_struct *task;
11071123

1108-
kworker = kthread_create_worker(0, "psimon");
1109-
if (IS_ERR(kworker)) {
1124+
task = kthread_create(psi_poll_worker, group, "psimon");
1125+
if (IS_ERR(task)) {
11101126
kfree(t);
11111127
mutex_unlock(&group->trigger_lock);
1112-
return ERR_CAST(kworker);
1128+
return ERR_CAST(task);
11131129
}
1114-
sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
1115-
kthread_init_delayed_work(&group->poll_work,
1116-
psi_poll_work);
1117-
rcu_assign_pointer(group->poll_kworker, kworker);
1130+
atomic_set(&group->poll_wakeup, 0);
1131+
init_waitqueue_head(&group->poll_wait);
1132+
wake_up_process(task);
1133+
timer_setup(&group->poll_timer, poll_timer_fn, 0);
1134+
rcu_assign_pointer(group->poll_task, task);
11181135
}
11191136

11201137
list_add(&t->node, &group->triggers);
@@ -1132,7 +1149,7 @@ static void psi_trigger_destroy(struct kref *ref)
11321149
{
11331150
struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
11341151
struct psi_group *group = t->group;
1135-
struct kthread_worker *kworker_to_destroy = NULL;
1152+
struct task_struct *task_to_destroy = NULL;
11361153

11371154
if (static_branch_likely(&psi_disabled))
11381155
return;
@@ -1158,39 +1175,37 @@ static void psi_trigger_destroy(struct kref *ref)
11581175
period = min(period, div_u64(tmp->win.size,
11591176
UPDATES_PER_WINDOW));
11601177
group->poll_min_period = period;
1161-
/* Destroy poll_kworker when the last trigger is destroyed */
1178+
/* Destroy poll_task when the last trigger is destroyed */
11621179
if (group->poll_states == 0) {
11631180
group->polling_until = 0;
1164-
kworker_to_destroy = rcu_dereference_protected(
1165-
group->poll_kworker,
1181+
task_to_destroy = rcu_dereference_protected(
1182+
group->poll_task,
11661183
lockdep_is_held(&group->trigger_lock));
1167-
rcu_assign_pointer(group->poll_kworker, NULL);
1184+
rcu_assign_pointer(group->poll_task, NULL);
11681185
}
11691186
}
11701187

11711188
mutex_unlock(&group->trigger_lock);
11721189

11731190
/*
11741191
* Wait for both *trigger_ptr from psi_trigger_replace and
1175-
* poll_kworker RCUs to complete their read-side critical sections
1176-
* before destroying the trigger and optionally the poll_kworker
1192+
* poll_task RCUs to complete their read-side critical sections
1193+
* before destroying the trigger and optionally the poll_task
11771194
*/
11781195
synchronize_rcu();
11791196
/*
11801197
* Destroy the kworker after releasing trigger_lock to prevent a
11811198
* deadlock while waiting for psi_poll_work to acquire trigger_lock
11821199
*/
1183-
if (kworker_to_destroy) {
1200+
if (task_to_destroy) {
11841201
/*
11851202
* After the RCU grace period has expired, the worker
1186-
* can no longer be found through group->poll_kworker.
1203+
* can no longer be found through group->poll_task.
11871204
* But it might have been already scheduled before
11881205
* that - deschedule it cleanly before destroying it.
11891206
*/
1190-
kthread_cancel_delayed_work_sync(&group->poll_work);
1191-
atomic_set(&group->poll_scheduled, 0);
1192-
1193-
kthread_destroy_worker(kworker_to_destroy);
1207+
del_timer_sync(&group->poll_timer);
1208+
kthread_stop(task_to_destroy);
11941209
}
11951210
kfree(t);
11961211
}

0 commit comments

Comments
 (0)