Skip to content

Commit 8c2090c

Browse files
committed
sched_ext: Initialize in bypass mode
scx_ops_enable() used preempt_disable() around the task iteration loop to switch tasks into SCX to guarantee forward progress of the task which is running scx_ops_enable(). However, in the gap between setting __scx_ops_enabled and preempt_disable(), an external entity can put tasks including the enabling one into SCX prematurely, which can lead to malfunctions including stalls. The bypass mode can wrap the entire enabling operation and guarantee forward progress no matter what the BPF scheduler does. Use the bypass mode instead to guarantee forward progress while enabling. While at it, release and regrab scx_tasks_lock between the two task iteration loops in scx_ops_enable() for clarity as there is no reason to keep holding the lock between them. Signed-off-by: Tejun Heo <[email protected]>
1 parent fc1fceb commit 8c2090c

File tree

1 file changed

+15
-10
lines changed

1 file changed

+15
-10
lines changed

kernel/sched/ext.c

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5075,6 +5075,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
50755075
queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
50765076
scx_watchdog_timeout / 2);
50775077

5078+
/*
5079+
* Once __scx_ops_enabled is set, %current can be switched to SCX
5080+
* anytime. This can lead to stalls as some BPF schedulers (e.g.
5081+
* userspace scheduling) may not function correctly before all tasks are
5082+
* switched. Init in bypass mode to guarantee forward progress.
5083+
*/
5084+
scx_ops_bypass(true);
5085+
50785086
/*
50795087
* Lock out forks, cgroup on/offlining and moves before opening the
50805088
* floodgate so that they don't wander into the operations prematurely.
@@ -5134,7 +5142,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
51345142
* disabled.
51355143
*/
51365144
spin_lock_irq(&scx_tasks_lock);
5137-
51385145
scx_task_iter_init(&sti);
51395146
while ((p = scx_task_iter_next_locked(&sti))) {
51405147
/*
@@ -5163,22 +5170,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
51635170
spin_lock_irq(&scx_tasks_lock);
51645171
}
51655172
scx_task_iter_exit(&sti);
5173+
spin_unlock_irq(&scx_tasks_lock);
51665174

51675175
/*
5168-
* All tasks are prepped but are still ops-disabled. Ensure that
5169-
* %current can't be scheduled out and switch everyone.
5170-
* preempt_disable() is necessary because we can't guarantee that
5171-
* %current won't be starved if scheduled out while switching.
5176+
* All tasks are prepped but the tasks are not enabled. Switch everyone.
51725177
*/
5173-
preempt_disable();
5178+
WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
51745179

51755180
/*
51765181
* We're fully committed and can't fail. The PREPPED -> ENABLED
51775182
* transitions here are synchronized against sched_ext_free() through
51785183
* scx_tasks_lock.
51795184
*/
5180-
WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
5181-
5185+
spin_lock_irq(&scx_tasks_lock);
51825186
scx_task_iter_init(&sti);
51835187
while ((p = scx_task_iter_next_locked(&sti))) {
51845188
const struct sched_class *old_class = p->sched_class;
@@ -5195,12 +5199,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
51955199
check_class_changed(task_rq(p), p, old_class, p->prio);
51965200
}
51975201
scx_task_iter_exit(&sti);
5198-
51995202
spin_unlock_irq(&scx_tasks_lock);
5200-
preempt_enable();
5203+
52015204
scx_cgroup_unlock();
52025205
cpus_read_unlock();
52035206
percpu_up_write(&scx_fork_rwsem);
5207+
scx_ops_bypass(false);
52045208

52055209
/*
52065210
* Returning an error code here would lose the recorded error
@@ -5241,6 +5245,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
52415245
err_disable_unlock_all:
52425246
scx_cgroup_unlock();
52435247
percpu_up_write(&scx_fork_rwsem);
5248+
scx_ops_bypass(false);
52445249
err_disable_unlock_cpus:
52455250
cpus_read_unlock();
52465251
err_disable:

0 commit comments

Comments
 (0)