Skip to content

Commit b7b3b2d

Browse files
committed
sched_ext: Split the global DSQ per NUMA node
In the bypass mode, the global DSQ is used to schedule all tasks in simple FIFO order. All tasks are queued into the global DSQ and all CPUs try to execute tasks from it. This creates a lot of cross-node cacheline accesses and scheduling across the node boundaries, and can lead to live-lock conditions where the system takes tens of minutes to disable the BPF scheduler while executing in the bypass mode.

Split the global DSQ per NUMA node. Each node has its own global DSQ. When a task is dispatched to SCX_DSQ_GLOBAL, it's put into the global DSQ local to the task's CPU and all CPUs in a node only consume its node-local global DSQ.

This resolves a livelock condition which could be reliably triggered on a 2x EPYC 7642 system by running `stress-ng --race-sched 1024` together with `stress-ng --workload 80 --workload-threads 10` while repeatedly enabling and disabling a SCX scheduler.

Signed-off-by: Tejun Heo <[email protected]> Acked-by: David Vernet <[email protected]>
1 parent bba26bf commit b7b3b2d

File tree

1 file changed

+60
-13
lines changed

1 file changed

+60
-13
lines changed

kernel/sched/ext.c

Lines changed: 60 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -925,8 +925,15 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
925925
*/
926926
static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
927927

928-
/* dispatch queues */
929-
static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
928+
/*
929+
* Dispatch queues.
930+
*
931+
* The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
932+
* to avoid live-locking in bypass mode where all tasks are dispatched to
933+
* %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
934+
* sufficient, it can be further split.
935+
*/
936+
static struct scx_dispatch_q **global_dsqs;
930937

931938
static const struct rhashtable_params dsq_hash_params = {
932939
.key_len = 8,
@@ -1029,6 +1036,11 @@ static bool u32_before(u32 a, u32 b)
10291036
return (s32)(a - b) < 0;
10301037
}
10311038

1039+
static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
1040+
{
1041+
return global_dsqs[cpu_to_node(task_cpu(p))];
1042+
}
1043+
10321044
static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
10331045
{
10341046
return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
@@ -1642,7 +1654,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
16421654
scx_ops_error("attempting to dispatch to a destroyed dsq");
16431655
/* fall back to the global dsq */
16441656
raw_spin_unlock(&dsq->lock);
1645-
dsq = &scx_dsq_global;
1657+
dsq = find_global_dsq(p);
16461658
raw_spin_lock(&dsq->lock);
16471659
}
16481660
}
@@ -1820,20 +1832,20 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
18201832
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
18211833

18221834
if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
1823-
return &scx_dsq_global;
1835+
return find_global_dsq(p);
18241836

18251837
return &cpu_rq(cpu)->scx.local_dsq;
18261838
}
18271839

18281840
if (dsq_id == SCX_DSQ_GLOBAL)
1829-
dsq = &scx_dsq_global;
1841+
dsq = find_global_dsq(p);
18301842
else
18311843
dsq = find_user_dsq(dsq_id);
18321844

18331845
if (unlikely(!dsq)) {
18341846
scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
18351847
dsq_id, p->comm, p->pid);
1836-
return &scx_dsq_global;
1848+
return find_global_dsq(p);
18371849
}
18381850

18391851
return dsq;
@@ -2005,7 +2017,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
20052017
global:
20062018
touch_core_sched(rq, p); /* see the comment in local: */
20072019
p->scx.slice = SCX_SLICE_DFL;
2008-
dispatch_enqueue(&scx_dsq_global, p, enq_flags);
2020+
dispatch_enqueue(find_global_dsq(p), p, enq_flags);
20092021
}
20102022

20112023
static bool task_runnable(const struct task_struct *p)
@@ -2391,6 +2403,13 @@ static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
23912403
return false;
23922404
}
23932405

2406+
static bool consume_global_dsq(struct rq *rq)
2407+
{
2408+
int node = cpu_to_node(cpu_of(rq));
2409+
2410+
return consume_dispatch_q(rq, global_dsqs[node]);
2411+
}
2412+
23942413
/**
23952414
* dispatch_to_local_dsq - Dispatch a task to a local dsq
23962415
* @rq: current rq which is locked
@@ -2424,7 +2443,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
24242443

24252444
#ifdef CONFIG_SMP
24262445
if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
2427-
dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
2446+
dispatch_enqueue(find_global_dsq(p), p,
2447+
enq_flags | SCX_ENQ_CLEAR_OPSS);
24282448
return;
24292449
}
24302450

@@ -2624,7 +2644,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
26242644
if (rq->scx.local_dsq.nr)
26252645
goto has_tasks;
26262646

2627-
if (consume_dispatch_q(rq, &scx_dsq_global))
2647+
if (consume_global_dsq(rq))
26282648
goto has_tasks;
26292649

26302650
if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
@@ -2649,7 +2669,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
26492669

26502670
if (rq->scx.local_dsq.nr)
26512671
goto has_tasks;
2652-
if (consume_dispatch_q(rq, &scx_dsq_global))
2672+
if (consume_global_dsq(rq))
26532673
goto has_tasks;
26542674

26552675
/*
@@ -4924,7 +4944,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
49244944
struct scx_task_iter sti;
49254945
struct task_struct *p;
49264946
unsigned long timeout;
4927-
int i, cpu, ret;
4947+
int i, cpu, node, ret;
49284948

49294949
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
49304950
cpu_possible_mask)) {
@@ -4943,6 +4963,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
49434963
}
49444964
}
49454965

4966+
if (!global_dsqs) {
4967+
struct scx_dispatch_q **dsqs;
4968+
4969+
dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
4970+
if (!dsqs) {
4971+
ret = -ENOMEM;
4972+
goto err_unlock;
4973+
}
4974+
4975+
for_each_node_state(node, N_POSSIBLE) {
4976+
struct scx_dispatch_q *dsq;
4977+
4978+
dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
4979+
if (!dsq) {
4980+
for_each_node_state(node, N_POSSIBLE)
4981+
kfree(dsqs[node]);
4982+
kfree(dsqs);
4983+
ret = -ENOMEM;
4984+
goto err_unlock;
4985+
}
4986+
4987+
init_dsq(dsq, SCX_DSQ_GLOBAL);
4988+
dsqs[node] = dsq;
4989+
}
4990+
4991+
global_dsqs = dsqs;
4992+
}
4993+
49464994
if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
49474995
ret = -EBUSY;
49484996
goto err_unlock;
@@ -5777,7 +5825,6 @@ void __init init_sched_ext_class(void)
57775825
SCX_TG_ONLINE);
57785826

57795827
BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
5780-
init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL);
57815828
#ifdef CONFIG_SMP
57825829
BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
57835830
BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
@@ -6053,7 +6100,7 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
60536100
if (dst_dsq->id == SCX_DSQ_LOCAL) {
60546101
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
60556102
if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
6056-
dst_dsq = &scx_dsq_global;
6103+
dst_dsq = find_global_dsq(p);
60576104
dst_rq = src_rq;
60586105
}
60596106
} else {

0 commit comments

Comments
 (0)