Commit 02baaa6
Merge tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo:

 - Improve recovery from misbehaving BPF schedulers. When a scheduler puts
   many tasks with varying affinity restrictions on a shared DSQ, CPUs
   scanning through tasks they cannot run can overwhelm the system, causing
   lockups. Bypass mode now uses per-CPU DSQs with a load balancer to avoid
   this, and hooks into the hardlockup detector to attempt recovery. Add
   scx_cpu0 example scheduler to demonstrate this scenario.

 - Add lockless peek operation for DSQs to reduce lock contention for
   schedulers that need to query queue state during load balancing.

 - Allow scx_bpf_reenqueue_local() to be called from anywhere in
   preparation for deprecating cpu_acquire/release() callbacks in favor of
   generic BPF hooks.

 - Prepare for hierarchical scheduler support: add scx_bpf_task_set_slice()
   and scx_bpf_task_set_dsq_vtime() kfuncs, make scx_bpf_dsq_insert*()
   return bool (sketched below), and wrap kfunc args in structs for future
   aux__prog parameter.

 - Implement cgroup_set_idle() callback to notify BPF schedulers when a
   cgroup's idle state changes.

 - Fix migration tasks being incorrectly downgraded from stop_sched_class
   to rt_sched_class across sched_ext enable/disable. Applied late as the
   fix is low risk and the bug subtle but needs stable backporting.

 - Various fixes and cleanups including cgroup exit ordering, SCX_KICK_WAIT
   reliability, and backward compatibility improvements.

* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
  sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
  sched_ext: tools: Removing duplicate targets during non-cross compilation
  sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
  sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
  sched_ext: Update comments replacing breather with aborting mechanism
  sched_ext: Implement load balancer for bypass mode
  sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
  sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
  sched_ext: Add scx_cpu0 example scheduler
  sched_ext: Hook up hardlockup detector
  sched_ext: Make handle_lockup() propagate scx_verror() result
  sched_ext: Refactor lockup handlers into handle_lockup()
  sched_ext: Make scx_exit() and scx_vexit() return bool
  sched_ext: Exit dispatch and move operations immediately when aborting
  sched_ext: Simplify breather mechanism with scx_aborting flag
  sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
  sched_ext: Refactor do_enqueue_task() local and global DSQ paths
  sched_ext: Use shorter slice in bypass mode
  sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
  sched_ext: Minor cleanups to scx_task_iter
  ...
2 parents 8449d32 + 1dd6c84 commit 02baaa6
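One of the prep items above, scx_bpf_dsq_insert*() returning bool, is directly visible to BPF schedulers. A minimal sketch of an enqueue path that checks the result; the DSQ id, counter, and ops name are illustrative, not part of this merge:

#define SHARED_DSQ	0	/* hypothetical user-defined DSQ id */

static u64 nr_insert_failed;	/* hypothetical failure counter */

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * As of this merge, scx_bpf_dsq_insert() reports whether the
	 * task was actually inserted instead of returning void.
	 */
	if (!scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags))
		__sync_fetch_and_add(&nr_insert_failed, 1);
}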

20 files changed: +1893 −411 lines

include/linux/sched/ext.h

Lines changed: 25 additions & 2 deletions
@@ -17,7 +17,18 @@
 enum scx_public_consts {
 	SCX_OPS_NAME_LEN	= 128,
 
+	/*
+	 * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses
+	 * to set the slice for a task that is selected for execution.
+	 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice
+	 * refill has been triggered.
+	 *
+	 * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass
+	 * mode. As making forward progress for all tasks is the main goal of
+	 * the bypass mode, a shorter slice is used.
+	 */
 	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
+	SCX_SLICE_BYPASS	= 5 * 1000000,	/* 5ms */
 	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
 };

@@ -46,6 +57,7 @@ enum scx_dsq_id_flags {
 	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
 	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
 	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
+	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,
 	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
 	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
 };
@@ -58,6 +70,7 @@ enum scx_dsq_id_flags {
  */
 struct scx_dispatch_q {
 	raw_spinlock_t		lock;
+	struct task_struct __rcu *first_task;	/* lockless peek at head */
 	struct list_head	list;	/* tasks in dispatch order */
 	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
 	u32			nr;
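The new ->first_task field is what enables the lockless peek from the merge description: writers update it under dsq->lock while readers can sample it under RCU. A sketch of what a peek helper could look like, assuming the caller holds rcu_read_lock(); the helper name is illustrative, the real one lives in kernel/sched/ext.c:

static struct task_struct *dsq_peek_first(struct scx_dispatch_q *dsq)
{
	/*
	 * ->first_task is written under dsq->lock; rcu_dereference()
	 * lets a reader sample the head without taking the lock. The
	 * result is a hint: by the time the caller acts on it, the
	 * task may already have been dispatched elsewhere.
	 */
	return rcu_dereference(dsq->first_task);
}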
@@ -136,6 +149,13 @@ struct scx_dsq_list_node {
 	u32			priv;		/* can be used by iter cursor */
 };
 
+#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv)			\
+	(struct scx_dsq_list_node) {					\
+		.node = LIST_HEAD_INIT((__node).node),			\
+		.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags),		\
+		.priv = (__priv),					\
+	}
+
 /*
  * The following is embedded in task_struct and contains all fields necessary
  * for a task to be scheduled by SCX.
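INIT_DSQ_LIST_CURSOR replaces open-coded cursor setup at iterator init sites. A hypothetical iterator embedding a cursor might use it like this (the surrounding struct and function are illustrative):

struct example_dsq_iter {			/* hypothetical */
	struct scx_dsq_list_node	cursor;	/* linked into dsq->list */
	u32				dsq_seq;
};

static void example_iter_init(struct example_dsq_iter *it, u32 iter_flags)
{
	/*
	 * The macro expands to a compound literal: the list node is
	 * self-linked, SCX_DSQ_LNODE_ITER_CURSOR is OR'd into flags so
	 * list walkers can tell cursors from tasks, and priv carries
	 * iterator-private state.
	 */
	it->cursor = INIT_DSQ_LIST_CURSOR(it->cursor, 0, iter_flags);
}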
@@ -207,16 +227,18 @@ struct sched_ext_entity {
 	struct list_head	tasks_node;
 };
 
-void sched_ext_free(struct task_struct *p);
+void sched_ext_dead(struct task_struct *p);
 void print_scx_info(const char *log_lvl, struct task_struct *p);
 void scx_softlockup(u32 dur_s);
+bool scx_hardlockup(int cpu);
 bool scx_rcu_cpu_stall(void);
 
 #else	/* !CONFIG_SCHED_CLASS_EXT */
 
-static inline void sched_ext_free(struct task_struct *p) {}
+static inline void sched_ext_dead(struct task_struct *p) {}
 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
 static inline void scx_softlockup(u32 dur_s) {}
+static inline bool scx_hardlockup(int cpu) { return false; }
 static inline bool scx_rcu_cpu_stall(void) { return false; }
 
 #endif	/* CONFIG_SCHED_CLASS_EXT */
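scx_hardlockup() is the hook the merge description mentions: the hardlockup detector can give sched_ext a chance to abort a misbehaving scheduler before declaring the CPU dead. A sketch of the expected call pattern; the surrounding function is illustrative, the real call site is in the watchdog code:

static void hardlockup_detected_sketch(int cpu)
{
	/*
	 * If a BPF scheduler is loaded and plausibly responsible,
	 * scx_hardlockup() kicks off recovery (aborting the scheduler
	 * and falling back to bypass mode) and returns true; the stub
	 * above returns false when sched_ext is compiled out.
	 */
	if (scx_hardlockup(cpu))
		return;

	/* ... regular hardlockup reporting / panic path ... */
}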
@@ -228,6 +250,7 @@ struct scx_task_group {
 	u64			bw_period_us;
 	u64			bw_quota_us;
 	u64			bw_burst_us;
+	bool			idle;
 #endif
 };
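The new scx_task_group::idle flag backs the cgroup_set_idle() callback from the merge description. On the BPF side, a scheduler implementing the callback might look like this; the signature is assumed to parallel cgroup_set_weight(), and the body is illustrative:

void BPF_STRUCT_OPS(example_cgroup_set_idle, struct cgroup *cgrp, bool idle)
{
	/*
	 * Assumed semantics: called whenever @cgrp flips between idle
	 * and non-idle so the scheduler can stop or resume considering
	 * the group when distributing bandwidth.
	 */
	bpf_printk("cgroup %llu idle=%d", cgrp->kn->id, idle);
}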

include/trace/events/sched_ext.h

Lines changed: 39 additions & 0 deletions
@@ -45,6 +45,45 @@ TRACE_EVENT(sched_ext_event,
 	)
 );
 
+TRACE_EVENT(sched_ext_bypass_lb,
+
+	TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced,
+		 __u32 before_min, __u32 before_max,
+		 __u32 after_min, __u32 after_max),
+
+	TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced,
+		before_min, before_max, after_min, after_max),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	node		)
+		__field(	__u32,	nr_cpus		)
+		__field(	__u32,	nr_tasks	)
+		__field(	__u32,	nr_balanced	)
+		__field(	__u32,	before_min	)
+		__field(	__u32,	before_max	)
+		__field(	__u32,	after_min	)
+		__field(	__u32,	after_max	)
+	),
+
+	TP_fast_assign(
+		__entry->node		= node;
+		__entry->nr_cpus	= nr_cpus;
+		__entry->nr_tasks	= nr_tasks;
+		__entry->nr_balanced	= nr_balanced;
+		__entry->before_min	= before_min;
+		__entry->before_max	= before_max;
+		__entry->after_min	= after_min;
+		__entry->after_max	= after_max;
+	),
+
+	TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u",
+		  __entry->node, __entry->nr_cpus,
+		  __entry->nr_tasks, __entry->nr_balanced,
+		  __entry->before_min, __entry->after_min,
+		  __entry->before_max, __entry->after_max
+	)
+);
+
 #endif /* _TRACE_SCHED_EXT_H */
 
 /* This part must be outside protection */
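TRACE_EVENT() generates trace_sched_ext_bypass_lb(), which the bypass-mode load balancer fires once per node pass. A sketch of an emission site, with before/after per-CPU queue lengths gathered by the caller (the function and its arguments are illustrative; the real emitter is in kernel/sched/ext.c):

static void bypass_lb_emit_sketch(u32 node, u32 nr_cpus, u32 nr_tasks,
				  u32 nr_balanced,
				  const u32 *before, const u32 *after)
{
	u32 bmin = U32_MAX, bmax = 0, amin = U32_MAX, amax = 0, i;

	/* summarize how tightly the per-CPU queue lengths converged */
	for (i = 0; i < nr_cpus; i++) {
		bmin = min(bmin, before[i]);
		bmax = max(bmax, before[i]);
		amin = min(amin, after[i]);
		amax = max(amax, after[i]);
	}

	trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced,
				  bmin, bmax, amin, amax);
}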

kernel/fork.c

Lines changed: 0 additions & 1 deletion
@@ -736,7 +736,6 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(tsk == current);
 
 	unwind_task_free(tsk);
-	sched_ext_free(tsk);
 	io_uring_free(tsk);
 	cgroup_task_free(tsk);
 	task_numa_free(tsk, true);

kernel/sched/core.c

Lines changed: 6 additions & 0 deletions
@@ -5143,6 +5143,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	if (prev->sched_class->task_dead)
 		prev->sched_class->task_dead(prev);
 
+	/*
+	 * sched_ext_dead() must come before cgroup_task_dead() to
+	 * prevent cgroups from being removed while its member tasks are
+	 * visible to SCX schedulers.
+	 */
+	sched_ext_dead(prev);
 	cgroup_task_dead(prev);
 
 	/* Task is done with its stack. */
