
Commit e32c260

sched_ext: Enable the ops breather and eject BPF scheduler on softlockup
On 2 x Intel Sapphire Rapids machines with 224 logical CPUs, a poorly
behaving BPF scheduler can live-lock the system by making multiple CPUs bang
on the same DSQ to the point where soft-lockup detection triggers before
SCX's own watchdog can take action.

It also seems possible that the machine can be live-locked enough to prevent
scx_ops_helper, which is an RT task, from running in a timely manner.

Implement scx_softlockup() which is called when three quarters of the
soft-lockup threshold has passed. The function immediately enables the ops
breather and triggers an ops error to initiate ejection of the BPF
scheduler.

Combined with the previous patch, this enables the kernel to reliably
recover the system from live-lock conditions that can be triggered by a
poorly behaving BPF scheduler on Intel dual-socket systems.

Signed-off-by: Tejun Heo <[email protected]>
Cc: Douglas Anderson <[email protected]>
Cc: Andrew Morton <[email protected]>
1 parent 62dcbab commit e32c260
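
The pathological pattern the message describes is, roughly, a scheduler that funnels every CPU through one shared DSQ. Below is a hypothetical minimal sketch in the style of the scx example schedulers of this era; the sloppy_* names and the code are illustrative and not part of this commit:

    #include <scx/common.bpf.h>

    #define SHARED_DSQ 0    /* a single DSQ contended by all CPUs */

    s32 BPF_STRUCT_OPS_SLEEPABLE(sloppy_init)
    {
            /* create one global DSQ; -1 selects any NUMA node */
            return scx_bpf_create_dsq(SHARED_DSQ, -1);
    }

    void BPF_STRUCT_OPS(sloppy_enqueue, struct task_struct *p, u64 enq_flags)
    {
            /* every runnable task from every CPU lands in the same DSQ... */
            scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
    }

    void BPF_STRUCT_OPS(sloppy_dispatch, s32 cpu, struct task_struct *prev)
    {
            /* ...and every CPU looking for work contends on it again */
            scx_bpf_consume(SHARED_DSQ);
    }

With 224 logical CPUs banging on one DSQ lock, CPUs can stall in the dispatch path long enough for soft-lockup detection to trigger before SCX's own watchdog acts, which is the window this commit closes.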

File tree

include/linux/sched/ext.h
kernel/sched/ext.c
kernel/watchdog.c
tools/sched_ext/scx_show_state.py

4 files changed: +57 -0 lines changed

include/linux/sched/ext.h

Lines changed: 2 additions & 0 deletions
@@ -205,11 +205,13 @@ struct sched_ext_entity {
 
 void sched_ext_free(struct task_struct *p);
 void print_scx_info(const char *log_lvl, struct task_struct *p);
+void scx_softlockup(u32 dur_s);
 
 #else	/* !CONFIG_SCHED_CLASS_EXT */
 
 static inline void sched_ext_free(struct task_struct *p) {}
 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
+static inline void scx_softlockup(u32 dur_s) {}
 
 #endif	/* CONFIG_SCHED_CLASS_EXT */
 #endif	/* _LINUX_SCHED_EXT_H */
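
Note the empty static inline stub in the !CONFIG_SCHED_CLASS_EXT branch: it lets the call site added to kernel/watchdog.c below invoke scx_softlockup() unconditionally, with the call compiling away entirely when sched_ext is not built in.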

kernel/sched/ext.c

Lines changed: 45 additions & 0 deletions
@@ -867,6 +867,7 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static unsigned long scx_in_softlockup;
 static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
 static int scx_ops_bypass_depth;
 static bool scx_ops_init_task_enabled;

@@ -4614,6 +4615,49 @@ bool task_should_scx(struct task_struct *p)
 	return p->policy == SCHED_EXT;
 }
 
+/**
+ * scx_softlockup - sched_ext softlockup handler
+ *
+ * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
+ * live-lock the system by making many CPUs target the same DSQ to the point
+ * where soft-lockup detection triggers. This function is called from
+ * soft-lockup watchdog when the triggering point is close and tries to unjam
+ * the system by enabling the breather and aborting the BPF scheduler.
+ */
+void scx_softlockup(u32 dur_s)
+{
+	switch (scx_ops_enable_state()) {
+	case SCX_OPS_ENABLING:
+	case SCX_OPS_ENABLED:
+		break;
+	default:
+		return;
+	}
+
+	/* allow only one instance, cleared at the end of scx_ops_bypass() */
+	if (test_and_set_bit(0, &scx_in_softlockup))
+		return;
+
+	printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
+			smp_processor_id(), dur_s, scx_ops.name);
+
+	/*
+	 * Some CPUs may be trapped in the dispatch paths. Enable breather
+	 * immediately; otherwise, we might not even be able to get to
+	 * scx_ops_bypass().
+	 */
+	atomic_inc(&scx_ops_breather_depth);
+
+	scx_ops_error("soft lockup - CPU#%d stuck for %us",
+		      smp_processor_id(), dur_s);
+}
+
+static void scx_clear_softlockup(void)
+{
+	if (test_and_clear_bit(0, &scx_in_softlockup))
+		atomic_dec(&scx_ops_breather_depth);
+}
+
 /**
  * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
  *

@@ -4724,6 +4768,7 @@ static void scx_ops_bypass(bool bypass)
 	atomic_dec(&scx_ops_breather_depth);
 unlock:
 	raw_spin_unlock_irqrestore(&bypass_lock, flags);
+	scx_clear_softlockup();
 }
 
 static void free_exit_info(struct scx_exit_info *ei)
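
A note on the lifecycle here: bit 0 of scx_in_softlockup ensures only one softlockup response is in flight at a time, and the extra scx_ops_breather_depth reference taken above is dropped by the scx_clear_softlockup() call added at the end of scx_ops_bypass(), after the disable path initiated by scx_ops_error() has run. printk_deferred() is used because this function is reached from watchdog timer context, where synchronous console output could itself deadlock.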

kernel/watchdog.c

Lines changed: 8 additions & 0 deletions
@@ -644,6 +644,14 @@ static int is_softlockup(unsigned long touch_ts,
 		    need_counting_irqs())
 			start_counting_irqs();
 
+		/*
+		 * A poorly behaving BPF scheduler can live-lock the system into
+		 * soft lockups. Tell sched_ext to try ejecting the BPF
+		 * scheduler when close to a soft lockup.
+		 */
+		if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
+			scx_softlockup(now - touch_ts);
+
 		/* Warn about unreasonable delays. */
 		if (time_after(now, period_ts + get_softlockup_thresh()))
 			return now - touch_ts;
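
A worked example of the new threshold, assuming the default watchdog_thresh of 10 seconds: get_softlockup_thresh() returns watchdog_thresh * 2 = 20 seconds, so scx_softlockup() is called once a CPU has been stuck for 15 seconds, giving sched_ext a 5-second head start on the soft-lockup warning itself.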

tools/sched_ext/scx_show_state.py

Lines changed: 2 additions & 0 deletions
@@ -35,6 +35,8 @@ def ops_state_str(state):
 print(f'switching_all : {read_int("scx_switching_all")}')
 print(f'switched_all  : {read_static_key("__scx_switched_all")}')
 print(f'enable_state  : {ops_state_str(enable_state)} ({enable_state})')
+print(f'in_softlockup : {prog["scx_in_softlockup"].value_()}')
+print(f'breather_depth: {read_atomic("scx_ops_breather_depth")}')
 print(f'bypass_depth  : {prog["scx_ops_bypass_depth"].value_()}')
 print(f'nr_rejected   : {read_atomic("scx_nr_rejected")}')
 print(f'enable_seq    : {read_atomic("scx_enable_seq")}')
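
For reference, this is a drgn script run against a live kernel, e.g. drgn tools/sched_ext/scx_show_state.py. On a healthy system the new fields would read roughly as follows (values illustrative, earlier fields elided):

    # drgn tools/sched_ext/scx_show_state.py
    ...
    switching_all : 1
    switched_all  : 1
    enable_state  : enabled (2)
    in_softlockup : 0
    breather_depth: 0
    bypass_depth  : 0
    nr_rejected   : 0
    enable_seq    : 1

A nonzero in_softlockup together with a positive breather_depth would indicate that the softlockup path above has fired and ejection of the BPF scheduler is in progress.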
