Skip to content

Commit 3022e9d

Browse files
committed
Merge tag 'sched_ext-for-6.12-rc7-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext fixes from Tejun Heo:

 - The fair sched class currently has a bug where its balance() returns true, telling the sched core that it has tasks to run, but then returns NULL from pick_task(). This makes the sched core call sched_ext's pick_task() without a preceding balance(), which can lead to stalls in partial mode. For now, work around it by detecting the condition and forcing the CPU to go through another scheduling cycle.

 - Add a missing newline to an error message and fix the drgn introspection tool, which went out of sync.

* tag 'sched_ext-for-6.12-rc7-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
    sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
    sched_ext: Update scx_show_state.py to match scx_ops_bypass_depth's new type
    sched_ext: Add a missing newline at the end of an error message
2 parents 0ccd733 + a6250aa commit 3022e9d

File tree

4 files changed

+44
-22
lines changed

4 files changed

+44
-22
lines changed

kernel/sched/core.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5920,12 +5920,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
59205920

59215921
#ifdef CONFIG_SCHED_CLASS_EXT
59225922
/*
5923-
* SCX requires a balance() call before every pick_next_task() including
5924-
* when waking up from SCHED_IDLE. If @start_class is below SCX, start
5925-
* from SCX instead.
5923+
* SCX requires a balance() call before every pick_task() including when
5924+
* waking up from SCHED_IDLE. If @start_class is below SCX, start from
5925+
* SCX instead. Also, set a flag to detect missing balance() call.
59265926
*/
5927-
if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
5928-
start_class = &ext_sched_class;
5927+
if (scx_enabled()) {
5928+
rq->scx.flags |= SCX_RQ_BAL_PENDING;
5929+
if (sched_class_above(&ext_sched_class, start_class))
5930+
start_class = &ext_sched_class;
5931+
}
59295932
#endif
59305933

59315934
/*

kernel/sched/ext.c

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2634,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
26342634

26352635
lockdep_assert_rq_held(rq);
26362636
rq->scx.flags |= SCX_RQ_IN_BALANCE;
2637-
rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
2637+
rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
26382638

26392639
if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
26402640
unlikely(rq->scx.cpu_released)) {
@@ -2948,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
29482948
{
29492949
struct task_struct *prev = rq->curr;
29502950
struct task_struct *p;
2951+
bool prev_on_scx = prev->sched_class == &ext_sched_class;
2952+
bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
2953+
bool kick_idle = false;
29512954

29522955
/*
2953-
* If balance_scx() is telling us to keep running @prev, replenish slice
2954-
* if necessary and keep running @prev. Otherwise, pop the first one
2955-
* from the local DSQ.
2956-
*
29572956
* WORKAROUND:
29582957
*
29592958
* %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@@ -2962,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
29622961
* which then ends up calling pick_task_scx() without preceding
29632962
* balance_scx().
29642963
*
2965-
* For now, ignore cases where $prev is not on SCX. This isn't great and
2966-
* can theoretically lead to stalls. However, for switch_all cases, this
2967-
* happens only while a BPF scheduler is being loaded or unloaded, and,
2968-
* for partial cases, fair will likely keep triggering this CPU.
2964+
* Keep running @prev if possible and avoid stalling from entering idle
2965+
* without balancing.
29692966
*
2970-
* Once fair is fixed, restore WARN_ON_ONCE().
2967+
* Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
2968+
* if pick_task_scx() is called without preceding balance_scx().
29712969
*/
2972-
if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
2973-
prev->sched_class == &ext_sched_class) {
2970+
if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
2971+
if (prev_on_scx) {
2972+
keep_prev = true;
2973+
} else {
2974+
keep_prev = false;
2975+
kick_idle = true;
2976+
}
2977+
} else if (unlikely(keep_prev && !prev_on_scx)) {
2978+
/* only allowed during transitions */
2979+
WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
2980+
keep_prev = false;
2981+
}
2982+
2983+
/*
2984+
* If balance_scx() is telling us to keep running @prev, replenish slice
2985+
* if necessary and keep running @prev. Otherwise, pop the first one
2986+
* from the local DSQ.
2987+
*/
2988+
if (keep_prev) {
29742989
p = prev;
29752990
if (!p->scx.slice)
29762991
p->scx.slice = SCX_SLICE_DFL;
29772992
} else {
29782993
p = first_local_task(rq);
2979-
if (!p)
2994+
if (!p) {
2995+
if (kick_idle)
2996+
scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
29802997
return NULL;
2998+
}
29812999

29823000
if (unlikely(!p->scx.slice)) {
29833001
if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
@@ -4979,7 +4997,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
49794997

49804998
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
49814999
cpu_possible_mask)) {
4982-
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
5000+
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
49835001
return -EINVAL;
49845002
}
49855003

kernel/sched/sched.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -751,8 +751,9 @@ enum scx_rq_flags {
751751
*/
752752
SCX_RQ_ONLINE = 1 << 0,
753753
SCX_RQ_CAN_STOP_TICK = 1 << 1,
754-
SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */
755-
SCX_RQ_BYPASSING = 1 << 3,
754+
SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
755+
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
756+
SCX_RQ_BYPASSING = 1 << 4,
756757

757758
SCX_RQ_IN_WAKEUP = 1 << 16,
758759
SCX_RQ_IN_BALANCE = 1 << 17,

tools/sched_ext/scx_show_state.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,6 @@ def ops_state_str(state):
3535
print(f'switching_all : {read_int("scx_switching_all")}')
3636
print(f'switched_all : {read_static_key("__scx_switched_all")}')
3737
print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
38-
print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
38+
print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}')
3939
print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
4040
print(f'enable_seq : {read_atomic("scx_enable_seq")}')

0 commit comments

Comments
 (0)