Skip to content

Commit a6250aa

Browse files
committed
sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
sched_ext dispatches tasks from the BPF scheduler from balance_scx() and thus every pick_task_scx() call must be preceded by balance_scx(). While this usually holds, due to a bug, there are cases where the fair class's balance() returns true indicating that it has tasks to run on the CPU and thus terminating balance() calls but fails to actually find the next task to run when pick_task() is called. In such cases, pick_task_scx() can be called without preceding balance_scx(). Detect this condition using SCX_RQ_BAL_PENDING flags. If detected, keep running the previous task if possible and avoid stalling from entering idle without balancing. Signed-off-by: Tejun Heo <[email protected]> Cc: Peter Zijlstra <[email protected]> Link: http://lkml.kernel.org/r/[email protected]
1 parent a759bf0 commit a6250aa

File tree

3 files changed

+42
-20
lines changed

3 files changed

+42
-20
lines changed

kernel/sched/core.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5914,12 +5914,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
59145914

59155915
#ifdef CONFIG_SCHED_CLASS_EXT
59165916
/*
5917-
* SCX requires a balance() call before every pick_next_task() including
5918-
* when waking up from SCHED_IDLE. If @start_class is below SCX, start
5919-
* from SCX instead.
5917+
* SCX requires a balance() call before every pick_task() including when
5918+
* waking up from SCHED_IDLE. If @start_class is below SCX, start from
5919+
* SCX instead. Also, set a flag to detect missing balance() call.
59205920
*/
5921-
if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
5922-
start_class = &ext_sched_class;
5921+
if (scx_enabled()) {
5922+
rq->scx.flags |= SCX_RQ_BAL_PENDING;
5923+
if (sched_class_above(&ext_sched_class, start_class))
5924+
start_class = &ext_sched_class;
5925+
}
59235926
#endif
59245927

59255928
/*

kernel/sched/ext.c

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2634,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
26342634

26352635
lockdep_assert_rq_held(rq);
26362636
rq->scx.flags |= SCX_RQ_IN_BALANCE;
2637-
rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
2637+
rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
26382638

26392639
if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
26402640
unlikely(rq->scx.cpu_released)) {
@@ -2948,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
29482948
{
29492949
struct task_struct *prev = rq->curr;
29502950
struct task_struct *p;
2951+
bool prev_on_scx = prev->sched_class == &ext_sched_class;
2952+
bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
2953+
bool kick_idle = false;
29512954

29522955
/*
2953-
* If balance_scx() is telling us to keep running @prev, replenish slice
2954-
* if necessary and keep running @prev. Otherwise, pop the first one
2955-
* from the local DSQ.
2956-
*
29572956
* WORKAROUND:
29582957
*
29592958
* %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@@ -2962,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
29622961
* which then ends up calling pick_task_scx() without preceding
29632962
* balance_scx().
29642963
*
2965-
* For now, ignore cases where $prev is not on SCX. This isn't great and
2966-
* can theoretically lead to stalls. However, for switch_all cases, this
2967-
* happens only while a BPF scheduler is being loaded or unloaded, and,
2968-
* for partial cases, fair will likely keep triggering this CPU.
2964+
* Keep running @prev if possible and avoid stalling from entering idle
2965+
* without balancing.
29692966
*
2970-
* Once fair is fixed, restore WARN_ON_ONCE().
2967+
* Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
2968+
* if pick_task_scx() is called without preceding balance_scx().
29712969
*/
2972-
if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
2973-
prev->sched_class == &ext_sched_class) {
2970+
if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
2971+
if (prev_on_scx) {
2972+
keep_prev = true;
2973+
} else {
2974+
keep_prev = false;
2975+
kick_idle = true;
2976+
}
2977+
} else if (unlikely(keep_prev && !prev_on_scx)) {
2978+
/* only allowed during transitions */
2979+
WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
2980+
keep_prev = false;
2981+
}
2982+
2983+
/*
2984+
* If balance_scx() is telling us to keep running @prev, replenish slice
2985+
* if necessary and keep running @prev. Otherwise, pop the first one
2986+
* from the local DSQ.
2987+
*/
2988+
if (keep_prev) {
29742989
p = prev;
29752990
if (!p->scx.slice)
29762991
p->scx.slice = SCX_SLICE_DFL;
29772992
} else {
29782993
p = first_local_task(rq);
2979-
if (!p)
2994+
if (!p) {
2995+
if (kick_idle)
2996+
scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
29802997
return NULL;
2998+
}
29812999

29823000
if (unlikely(!p->scx.slice)) {
29833001
if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {

kernel/sched/sched.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -751,8 +751,9 @@ enum scx_rq_flags {
751751
*/
752752
SCX_RQ_ONLINE = 1 << 0,
753753
SCX_RQ_CAN_STOP_TICK = 1 << 1,
754-
SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */
755-
SCX_RQ_BYPASSING = 1 << 3,
754+
SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
755+
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
756+
SCX_RQ_BYPASSING = 1 << 4,
756757

757758
SCX_RQ_IN_WAKEUP = 1 << 16,
758759
SCX_RQ_IN_BALANCE = 1 << 17,

0 commit comments

Comments
 (0)