
Commit 04f41cb

Merge tag 'sched_ext-for-6.14-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext fixes from Tejun Heo:

 - Fix lock imbalance in a corner case of dispatch_to_local_dsq()

 - Migration disabled tasks were confusing some BPF schedulers and their
   handling had a bug. Fix it and simplify the default behavior by
   dispatching them automatically

 - ops.tick(), ops.disable() and ops.exit_task() were incorrectly
   disallowing kfuncs that require the task argument to be the task the
   rq operation is currently operating on and thus is rq-locked. Allow
   them.

 - Fix an autogroup migration handling bug which was occasionally
   triggering a warning in the cgroup migration path

 - tools/sched_ext, selftest and other misc updates

* tag 'sched_ext-for-6.14-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Use SCX_CALL_OP_TASK in task_tick_scx
  sched_ext: Fix the incorrect bpf_list kfunc API in common.bpf.h.
  sched_ext: selftests: Fix grammar in tests description
  sched_ext: Fix incorrect assumption about migration disabled tasks in task_can_run_on_remote_rq()
  sched_ext: Fix migration disabled handling in targeted dispatches
  sched_ext: Implement auto local dispatching of migration disabled tasks
  sched_ext: Fix incorrect time delta calculation in time_delta()
  sched_ext: Fix lock imbalance in dispatch_to_local_dsq()
  sched_ext: selftests/dsp_local_on: Fix selftest on UP systems
  tools/sched_ext: Add helper to check task migration state
  sched_ext: Fix incorrect autogroup migration detection
  sched_ext: selftests/dsp_local_on: Fix sporadic failures
  selftests/sched_ext: Fix enum resolution
  sched_ext: Include task weight in the error state dump
  sched_ext: Fixes typos in comments
2 parents 80868f5 + f5717c9 commit 04f41cb

27 files changed: +198, -118 lines

kernel/sched/autogroup.c

Lines changed: 2 additions & 2 deletions
@@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p)
 	 * see this thread after that: we can no longer use signal->autogroup.
 	 * See the PF_EXITING check in task_wants_autogroup().
 	 */
-	sched_move_task(p);
+	sched_move_task(p, true);
 }
 
 static void
@@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 	 * sched_autogroup_exit_task().
 	 */
 	for_each_thread(p, t)
-		sched_move_task(t);
+		sched_move_task(t, true);
 
 	unlock_task_sighand(p, &flags);
 	autogroup_kref_put(prev);

kernel/sched/core.c

Lines changed: 4 additions & 3 deletions
@@ -9050,7 +9050,7 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group)
  * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
  * its new group.
  */
-void sched_move_task(struct task_struct *tsk)
+void sched_move_task(struct task_struct *tsk, bool for_autogroup)
 {
 	int queued, running, queue_flags =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -9079,7 +9079,8 @@ void sched_move_task(struct task_struct *tsk)
 	put_prev_task(rq, tsk);
 
 	sched_change_group(tsk, group);
-	scx_move_task(tsk);
+	if (!for_autogroup)
+		scx_cgroup_move_task(tsk);
 
 	if (queued)
 		enqueue_task(rq, tsk, queue_flags);
@@ -9180,7 +9181,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 	struct cgroup_subsys_state *css;
 
 	cgroup_taskset_for_each(task, css, tset)
-		sched_move_task(task);
+		sched_move_task(task, false);
 
 	scx_cgroup_finish_attach();
 }
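
Previously, scx_move_task() tried to detect autogroup moves from inside sched_ext (see the task_group_is_autogroup()/PF_EXITING check removed in ext.c below); the "Fix incorrect autogroup migration detection" commit in this merge makes the caller state its intent instead. A hedged sketch of the resulting convention; the two wrapper helpers are hypothetical and exist only to illustrate the boolean:

	/* Sketch only: hypothetical wrappers around the new sched_move_task(). */
	static inline void move_task_for_autogroup(struct task_struct *p)
	{
		/* autogroup regrouping is not a cgroup migration, so the
		 * sched_ext cgroup-move callback is skipped */
		sched_move_task(p, true);
	}

	static inline void move_task_for_cgroup_attach(struct task_struct *p)
	{
		/* genuine cgroup migration, scx_cgroup_move_task() runs */
		sched_move_task(p, false);
	}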

kernel/sched/ext.c

Lines changed: 76 additions & 37 deletions
@@ -122,6 +122,19 @@ enum scx_ops_flags {
 	 */
 	SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
 
+	/*
+	 * A migration disabled task can only execute on its current CPU. By
+	 * default, such tasks are automatically put on the CPU's local DSQ with
+	 * the default slice on enqueue. If this ops flag is set, they also go
+	 * through ops.enqueue().
+	 *
+	 * A migration disabled task never invokes ops.select_cpu() as it can
+	 * only select the current CPU. Also, p->cpus_ptr will only contain its
+	 * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+	 * and thus may disagree with cpumask_weight(p->cpus_ptr).
+	 */
+	SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
 	/*
 	 * CPU cgroup support flags
 	 */
@@ -130,6 +143,7 @@ enum scx_ops_flags {
 	SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
 			    SCX_OPS_ENQ_LAST |
 			    SCX_OPS_ENQ_EXITING |
+			    SCX_OPS_ENQ_MIGRATION_DISABLED |
 			    SCX_OPS_SWITCH_PARTIAL |
 			    SCX_OPS_HAS_CGROUP_WEIGHT,
 };
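
The comment block added above documents the new default: migration disabled tasks are dispatched to their CPU's local DSQ automatically unless the scheduler sets SCX_OPS_ENQ_MIGRATION_DISABLED. As a hedged illustration that is not part of this commit, a BPF scheduler built on tools/sched_ext's common.bpf.h could opt back into handling such tasks in ops.enqueue() roughly as follows; example_enqueue and example_ops are made-up names and the enqueue logic is intentionally minimal:

	/* Sketch only: minimal scheduler opting into SCX_OPS_ENQ_MIGRATION_DISABLED. */
	#include <scx/common.bpf.h>

	char _license[] SEC("license") = "GPL";

	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/* A migration disabled task can only run on task_cpu(p); with the
		 * flag set it still reaches ops.enqueue(), so keep it local.
		 * (p->migration_disabled exists on CONFIG_SMP kernels.) */
		if (p->migration_disabled) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
			return;
		}

		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
	}

	SEC(".struct_ops.link")
	struct sched_ext_ops example_ops = {
		.enqueue	= (void *)example_enqueue,
		.flags		= SCX_OPS_ENQ_MIGRATION_DISABLED,
		.name		= "example",
	};

Note that this merge also adds a tools/sched_ext helper to check task migration state ("tools/sched_ext: Add helper to check task migration state"), which a real scheduler would likely use instead of reading the field directly.
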
@@ -416,7 +430,7 @@ struct sched_ext_ops {
 
 	/**
 	 * @update_idle: Update the idle state of a CPU
-	 * @cpu: CPU to udpate the idle state for
+	 * @cpu: CPU to update the idle state for
 	 * @idle: whether entering or exiting the idle state
 	 *
 	 * This operation is called when @rq's CPU goes or leaves the idle
@@ -882,6 +896,7 @@ static bool scx_warned_zero_slice;
 
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 
@@ -1214,7 +1229,7 @@ static bool scx_kf_allowed_if_unlocked(void)
 
 /**
  * nldsq_next_task - Iterate to the next task in a non-local DSQ
- * @dsq: user dsq being interated
+ * @dsq: user dsq being iterated
  * @cur: current position, %NULL to start iteration
  * @rev: walk backwards
  *
@@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	    unlikely(p->flags & PF_EXITING))
 		goto local;
 
+	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+	if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+	    is_migration_disabled(p))
+		goto local;
+
 	if (!SCX_HAS_OP(enqueue))
 		goto global;
 
@@ -2078,7 +2098,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
 
 	/*
 	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
-	 * appened to the runnable_list.
+	 * appended to the runnable_list.
 	 */
 	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
 }
@@ -2313,12 +2333,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
  *
  * - The BPF scheduler is bypassed while the rq is offline and we can always say
  *   no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
  */
 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 				      bool trigger_error)
 {
 	int cpu = cpu_of(rq);
 
+	SCHED_WARN_ON(task_cpu(p) == cpu);
+
+	/*
+	 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+	 * the pinned CPU in migrate_disable_switch() while @p is being switched
+	 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+	 * updated and thus another CPU may see @p on a DSQ inbetween leading to
+	 * @p passing the below task_allowed_on_cpu() check while migration is
+	 * disabled.
+	 *
+	 * Test the migration disabled state first as the race window is narrow
+	 * and the BPF scheduler failing to check migration disabled state can
+	 * easily be masked if task_allowed_on_cpu() is done first.
+	 */
+	if (unlikely(is_migration_disabled(p))) {
+		if (trigger_error)
+			scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+				      p->comm, p->pid, task_cpu(p), cpu);
+		return false;
+	}
+
 	/*
 	 * We don't require the BPF scheduler to avoid dispatching to offline
 	 * CPUs mostly for convenience but also because CPUs can go offline
@@ -2327,14 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 	 */
 	if (!task_allowed_on_cpu(p, cpu)) {
 		if (trigger_error)
-			scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
-				      cpu_of(rq), p->comm, p->pid);
+			scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+				      cpu, p->comm, p->pid);
 		return false;
 	}
 
-	if (unlikely(is_migration_disabled(p)))
-		return false;
-
 	if (!scx_rq_online(rq))
 		return false;
 
@@ -2437,7 +2477,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
 
 	if (dst_dsq->id == SCX_DSQ_LOCAL) {
 		dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
-		if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+		if (src_rq != dst_rq &&
+		    unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
 			dst_dsq = find_global_dsq(p);
 			dst_rq = src_rq;
 		}
@@ -2480,7 +2521,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
 /*
  * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
  * banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artifical delays while the
+ * to the bypass mode can take a long time. Inject artificial delays while the
  * bypass mode is switching to guarantee timely completion.
  */
 static void scx_ops_breather(struct rq *rq)
@@ -2575,6 +2616,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 {
 	struct rq *src_rq = task_rq(p);
 	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+#ifdef CONFIG_SMP
+	struct rq *locked_rq = rq;
+#endif
 
 	/*
 	 * We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2588,7 +2632,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	}
 
 #ifdef CONFIG_SMP
-	if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+	if (src_rq != dst_rq &&
+	    unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
 		dispatch_enqueue(find_global_dsq(p), p,
 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
 		return;
@@ -2611,8 +2656,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
 
 	/* switch to @src_rq lock */
-	if (rq != src_rq) {
-		raw_spin_rq_unlock(rq);
+	if (locked_rq != src_rq) {
+		raw_spin_rq_unlock(locked_rq);
+		locked_rq = src_rq;
 		raw_spin_rq_lock(src_rq);
 	}
 
@@ -2630,6 +2676,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	} else {
 		move_remote_task_to_local_dsq(p, enq_flags,
 					      src_rq, dst_rq);
+		/* task has been moved to dst_rq, which is now locked */
+		locked_rq = dst_rq;
 	}
 
 	/* if the destination CPU is idle, wake it up */
@@ -2638,8 +2686,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	}
 
 	/* switch back to @rq lock */
-	if (rq != dst_rq) {
-		raw_spin_rq_unlock(dst_rq);
+	if (locked_rq != rq) {
+		raw_spin_rq_unlock(locked_rq);
 		raw_spin_rq_lock(rq);
 	}
 #else /* CONFIG_SMP */
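
The dispatch_to_local_dsq() hunks above are the lock imbalance fix called out in the merge description: the old code assumed a specific rq lock was held at each unlock point, while the lock actually held depends on which path was taken (moving the task across rqs leaves dst_rq locked). A simplified, hedged sketch of the pattern, not the kernel function itself:

	/* Sketch only: track the rq lock currently held and always release that
	 * one. Assumes the cross-rq move helper drops src_rq's lock and returns
	 * with dst_rq's lock held, as in the code above. */
	static void lock_tracking_sketch(struct rq *rq, struct rq *src_rq,
					 struct rq *dst_rq, bool moved_across_rqs)
	{
		struct rq *locked_rq = rq;		/* lock held on entry */

		/* switch to @src_rq's lock if it is not the one already held */
		if (locked_rq != src_rq) {
			raw_spin_rq_unlock(locked_rq);
			locked_rq = src_rq;
			raw_spin_rq_lock(src_rq);
		}

		/* a cross-rq move hands back @dst_rq locked instead of @src_rq */
		if (moved_across_rqs)
			locked_rq = dst_rq;

		/* restore @rq's lock, releasing whichever lock is actually held */
		if (locked_rq != rq) {
			raw_spin_rq_unlock(locked_rq);
			raw_spin_rq_lock(rq);
		}
	}
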
@@ -3144,7 +3192,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 *
 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
 * to implement the default task ordering. The older the timestamp, the higher
- * prority the task - the global FIFO ordering matching the default scheduling
+ * priority the task - the global FIFO ordering matching the default scheduling
 * behavior.
 *
 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
@@ -3851,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
 		curr->scx.slice = 0;
 		touch_core_sched(rq, curr);
 	} else if (SCX_HAS_OP(tick)) {
-		SCX_CALL_OP(SCX_KF_REST, tick, curr);
+		SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
 	}
 
 	if (!curr->scx.slice)
@@ -3998,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p)
 	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
 
 	if (SCX_HAS_OP(disable))
-		SCX_CALL_OP(SCX_KF_REST, disable, p);
+		SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
 	scx_set_task_state(p, SCX_TASK_READY);
 }
 
@@ -4027,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p)
 	}
 
 	if (SCX_HAS_OP(exit_task))
-		SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
+		SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
 	scx_set_task_state(p, SCX_TASK_NONE);
 }
 
@@ -4323,24 +4371,11 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
 	return ops_sanitize_err("cgroup_prep_move", ret);
 }
 
-void scx_move_task(struct task_struct *p)
+void scx_cgroup_move_task(struct task_struct *p)
 {
 	if (!scx_cgroup_enabled)
 		return;
 
-	/*
-	 * We're called from sched_move_task() which handles both cgroup and
-	 * autogroup moves. Ignore the latter.
-	 *
-	 * Also ignore exiting tasks, because in the exit path tasks transition
-	 * from the autogroup to the root group, so task_group_is_autogroup()
-	 * alone isn't able to catch exiting autogroup tasks. This is safe for
-	 * cgroup_move(), because cgroup migrations never happen for PF_EXITING
-	 * tasks.
-	 */
-	if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
-		return;
-
 	/*
 	 * @p must have ops.cgroup_prep_move() called on it and thus
 	 * cgrp_moving_from set.
@@ -4590,7 +4625,7 @@ static int scx_cgroup_init(void)
 	cgroup_warned_missing_idle = false;
 
 	/*
-	 * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
+	 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
 	 * cgroups and init, all online cgroups are initialized.
 	 */
 	rcu_read_lock();
@@ -5059,6 +5094,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 		static_branch_disable(&scx_has_op[i]);
 	static_branch_disable(&scx_ops_enq_last);
 	static_branch_disable(&scx_ops_enq_exiting);
+	static_branch_disable(&scx_ops_enq_migration_disabled);
 	static_branch_disable(&scx_ops_cpu_preempt);
 	static_branch_disable(&scx_builtin_idle_enabled);
 	synchronize_rcu();
@@ -5277,9 +5313,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
 		  scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
 		  p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
 		  ops_state >> SCX_OPSS_QSEQ_SHIFT);
-	dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu",
-		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
-		  p->scx.dsq_vtime, p->scx.slice);
+	dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
+		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+	dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
+		  p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
 	dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
 
 	if (SCX_HAS_OP(dump_task)) {
@@ -5667,6 +5704,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	if (ops->flags & SCX_OPS_ENQ_EXITING)
 		static_branch_enable(&scx_ops_enq_exiting);
+	if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
+		static_branch_enable(&scx_ops_enq_migration_disabled);
 	if (scx_ops.cpu_acquire || scx_ops.cpu_release)
 		static_branch_enable(&scx_ops_cpu_preempt);
 
kernel/sched/ext.h

Lines changed: 2 additions & 2 deletions
@@ -73,7 +73,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
 int scx_tg_online(struct task_group *tg);
 void scx_tg_offline(struct task_group *tg);
 int scx_cgroup_can_attach(struct cgroup_taskset *tset);
-void scx_move_task(struct task_struct *p);
+void scx_cgroup_move_task(struct task_struct *p);
 void scx_cgroup_finish_attach(void);
 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
 void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
@@ -82,7 +82,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle);
 static inline int scx_tg_online(struct task_group *tg) { return 0; }
 static inline void scx_tg_offline(struct task_group *tg) {}
 static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
-static inline void scx_move_task(struct task_struct *p) {}
+static inline void scx_cgroup_move_task(struct task_struct *p) {}
 static inline void scx_cgroup_finish_attach(void) {}
 static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
 static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}

kernel/sched/sched.h

Lines changed: 1 addition & 1 deletion
@@ -572,7 +572,7 @@ extern void sched_online_group(struct task_group *tg,
 extern void sched_destroy_group(struct task_group *tg);
 extern void sched_release_group(struct task_group *tg);
 
-extern void sched_move_task(struct task_struct *tsk);
+extern void sched_move_task(struct task_struct *tsk, bool for_autogroup);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
