@@ -122,6 +122,19 @@ enum scx_ops_flags {
         */
        SCX_OPS_SWITCH_PARTIAL  = 1LLU << 3,

+       /*
+        * A migration disabled task can only execute on its current CPU. By
+        * default, such tasks are automatically put on the CPU's local DSQ with
+        * the default slice on enqueue. If this ops flag is set, they also go
+        * through ops.enqueue().
+        *
+        * A migration disabled task never invokes ops.select_cpu() as it can
+        * only select the current CPU. Also, p->cpus_ptr will only contain its
+        * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+        * and thus may disagree with cpumask_weight(p->cpus_ptr).
+        */
+       SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
        /*
         * CPU cgroup support flags
         */
@@ -130,6 +143,7 @@ enum scx_ops_flags {
        SCX_OPS_ALL_FLAGS       = SCX_OPS_KEEP_BUILTIN_IDLE |
                                  SCX_OPS_ENQ_LAST |
                                  SCX_OPS_ENQ_EXITING |
+                                 SCX_OPS_ENQ_MIGRATION_DISABLED |
                                  SCX_OPS_SWITCH_PARTIAL |
                                  SCX_OPS_HAS_CGROUP_WEIGHT,
 };
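
Note (illustrative, not part of the diff): a minimal sketch of how a BPF scheduler might opt in to the new flag, assuming the tools/sched_ext common BPF headers and that p->migration_disabled is readable from the struct_ops program; the "migdis_example" names are made up.

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

void BPF_STRUCT_OPS(migdis_example_enqueue, struct task_struct *p, u64 enq_flags)
{
        /*
         * With SCX_OPS_ENQ_MIGRATION_DISABLED set, migration disabled tasks
         * reach ops.enqueue() instead of being placed on the local DSQ
         * automatically. They can only run on task_cpu(p), so keep them on
         * the current CPU's local DSQ.
         */
        if (p->migration_disabled) {
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
                return;
        }

        scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SCX_OPS_DEFINE(migdis_example_ops,
               .enqueue         = (void *)migdis_example_enqueue,
               .flags           = SCX_OPS_ENQ_MIGRATION_DISABLED,
               .name            = "migdis_example");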
@@ -416,7 +430,7 @@ struct sched_ext_ops {

        /**
         * @update_idle: Update the idle state of a CPU
-        * @cpu: CPU to udpate the idle state for
+        * @cpu: CPU to update the idle state for
         * @idle: whether entering or exiting the idle state
         *
         * This operation is called when @rq's CPU goes or leaves the idle
@@ -882,6 +896,7 @@ static bool scx_warned_zero_slice;

 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);

@@ -1214,7 +1229,7 @@ static bool scx_kf_allowed_if_unlocked(void)

 /**
  * nldsq_next_task - Iterate to the next task in a non-local DSQ
- * @dsq: user dsq being interated
+ * @dsq: user dsq being iterated
  * @cur: current position, %NULL to start iteration
  * @rev: walk backwards
  *
@@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
            unlikely(p->flags & PF_EXITING))
                goto local;

+       /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+       if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+           is_migration_disabled(p))
+               goto local;
+
        if (!SCX_HAS_OP(enqueue))
                goto global;

@@ -2078,7 +2098,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)

        /*
         * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
-        * appened to the runnable_list.
+        * appended to the runnable_list.
         */
        list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
 }
@@ -2313,12 +2333,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
  *
  * - The BPF scheduler is bypassed while the rq is offline and we can always say
  *   no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
  */
 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
                                       bool trigger_error)
 {
        int cpu = cpu_of(rq);

+       SCHED_WARN_ON(task_cpu(p) == cpu);
+
+       /*
+        * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+        * the pinned CPU in migrate_disable_switch() while @p is being switched
+        * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+        * updated and thus another CPU may see @p on a DSQ inbetween leading to
+        * @p passing the below task_allowed_on_cpu() check while migration is
+        * disabled.
+        *
+        * Test the migration disabled state first as the race window is narrow
+        * and the BPF scheduler failing to check migration disabled state can
+        * easily be masked if task_allowed_on_cpu() is done first.
+        */
+       if (unlikely(is_migration_disabled(p))) {
+               if (trigger_error)
+                       scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+                                     p->comm, p->pid, task_cpu(p), cpu);
+               return false;
+       }
+
        /*
         * We don't require the BPF scheduler to avoid dispatching to offline
         * CPUs mostly for convenience but also because CPUs can go offline
@@ -2327,14 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
         */
        if (!task_allowed_on_cpu(p, cpu)) {
                if (trigger_error)
-                       scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
-                                     cpu_of(rq), p->comm, p->pid);
+                       scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+                                     cpu, p->comm, p->pid);
                return false;
        }

-       if (unlikely(is_migration_disabled(p)))
-               return false;
-
        if (!scx_rq_online(rq))
                return false;

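
Illustrative note (not from the patch): with the error path above, a scheduler that targets remote CPUs via SCX_DSQ_LOCAL_ON is expected to filter migration disabled tasks itself. A rough sketch extending the earlier example, where pick_any_idle_cpu() is a hypothetical helper:

void BPF_STRUCT_OPS(migdis_example_enqueue_remote, struct task_struct *p,
                    u64 enq_flags)
{
        s32 cpu = pick_any_idle_cpu(p);         /* hypothetical helper */

        /*
         * A migration disabled task may only run on task_cpu(p); inserting
         * it into another CPU's local DSQ would trigger the
         * "cannot move migration disabled" error added above.
         */
        if (p->migration_disabled || cpu < 0) {
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
                return;
        }

        scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, enq_flags);
}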
@@ -2437,7 +2477,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,

        if (dst_dsq->id == SCX_DSQ_LOCAL) {
                dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
-               if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+               if (src_rq != dst_rq &&
+                   unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
                        dst_dsq = find_global_dsq(p);
                        dst_rq = src_rq;
                }
@@ -2480,7 +2521,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
 /*
  * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
  * banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artifical delays while the
+ * to the bypass mode can take a long time. Inject artificial delays while the
  * bypass mode is switching to guarantee timely completion.
  */
 static void scx_ops_breather(struct rq *rq)
@@ -2575,6 +2616,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 {
        struct rq *src_rq = task_rq(p);
        struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+#ifdef CONFIG_SMP
+       struct rq *locked_rq = rq;
+#endif

        /*
         * We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2588,7 +2632,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
        }

 #ifdef CONFIG_SMP
-       if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+       if (src_rq != dst_rq &&
+           unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
                dispatch_enqueue(find_global_dsq(p), p,
                                 enq_flags | SCX_ENQ_CLEAR_OPSS);
                return;
@@ -2611,8 +2656,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
        atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);

        /* switch to @src_rq lock */
-       if (rq != src_rq) {
-               raw_spin_rq_unlock(rq);
+       if (locked_rq != src_rq) {
+               raw_spin_rq_unlock(locked_rq);
+               locked_rq = src_rq;
                raw_spin_rq_lock(src_rq);
        }

@@ -2630,6 +2676,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
        } else {
                move_remote_task_to_local_dsq(p, enq_flags,
                                              src_rq, dst_rq);
+               /* task has been moved to dst_rq, which is now locked */
+               locked_rq = dst_rq;
        }

        /* if the destination CPU is idle, wake it up */
@@ -2638,8 +2686,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
        }

        /* switch back to @rq lock */
-       if (rq != dst_rq) {
-               raw_spin_rq_unlock(dst_rq);
+       if (locked_rq != rq) {
+               raw_spin_rq_unlock(locked_rq);
                raw_spin_rq_lock(rq);
        }
 #else  /* CONFIG_SMP */
@@ -3144,7 +3192,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
  *
  * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
  * to implement the default task ordering. The older the timestamp, the higher
- * prority the task - the global FIFO ordering matching the default scheduling
+ * priority the task - the global FIFO ordering matching the default scheduling
  * behavior.
  *
  * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
@@ -3851,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
                curr->scx.slice = 0;
                touch_core_sched(rq, curr);
        } else if (SCX_HAS_OP(tick)) {
-               SCX_CALL_OP(SCX_KF_REST, tick, curr);
+               SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
        }

        if (!curr->scx.slice)
@@ -3998,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p)
        WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);

        if (SCX_HAS_OP(disable))
-               SCX_CALL_OP(SCX_KF_REST, disable, p);
+               SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
        scx_set_task_state(p, SCX_TASK_READY);
 }

@@ -4027,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p)
        }

        if (SCX_HAS_OP(exit_task))
-               SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
+               SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
        scx_set_task_state(p, SCX_TASK_NONE);
 }

@@ -4323,24 +4371,11 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
        return ops_sanitize_err("cgroup_prep_move", ret);
 }

-void scx_move_task(struct task_struct *p)
+void scx_cgroup_move_task(struct task_struct *p)
 {
        if (!scx_cgroup_enabled)
                return;

-       /*
-        * We're called from sched_move_task() which handles both cgroup and
-        * autogroup moves. Ignore the latter.
-        *
-        * Also ignore exiting tasks, because in the exit path tasks transition
-        * from the autogroup to the root group, so task_group_is_autogroup()
-        * alone isn't able to catch exiting autogroup tasks. This is safe for
-        * cgroup_move(), because cgroup migrations never happen for PF_EXITING
-        * tasks.
-        */
-       if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
-               return;
-
        /*
         * @p must have ops.cgroup_prep_move() called on it and thus
         * cgrp_moving_from set.
@@ -4590,7 +4625,7 @@ static int scx_cgroup_init(void)
        cgroup_warned_missing_idle = false;

        /*
-        * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
+        * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
         * cgroups and init, all online cgroups are initialized.
         */
        rcu_read_lock();
@@ -5059,6 +5094,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
                static_branch_disable(&scx_has_op[i]);
        static_branch_disable(&scx_ops_enq_last);
        static_branch_disable(&scx_ops_enq_exiting);
+       static_branch_disable(&scx_ops_enq_migration_disabled);
        static_branch_disable(&scx_ops_cpu_preempt);
        static_branch_disable(&scx_builtin_idle_enabled);
        synchronize_rcu();
@@ -5277,9 +5313,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
                  scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
                  p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
                  ops_state >> SCX_OPSS_QSEQ_SHIFT);
-       dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu",
-                 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
-                 p->scx.dsq_vtime, p->scx.slice);
+       dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
+                 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+       dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
+                 p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
        dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));

        if (SCX_HAS_OP(dump_task)) {
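
Aside (illustrative, not part of the diff): these dump_line() entries appear next to whatever the scheduler's optional ops.dump_task() callback emits through scx_bpf_dump() from the common BPF headers, e.g.:

/* Sketch only: add one scheduler-specific line per task to the debug dump. */
void BPF_STRUCT_OPS(migdis_example_dump_task, struct scx_dump_ctx *dctx,
                    struct task_struct *p)
{
        scx_bpf_dump("example: nr_cpus_allowed=%d", p->nr_cpus_allowed);
}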
@@ -5667,6 +5704,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)

        if (ops->flags & SCX_OPS_ENQ_EXITING)
                static_branch_enable(&scx_ops_enq_exiting);
+       if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
+               static_branch_enable(&scx_ops_enq_migration_disabled);
        if (scx_ops.cpu_acquire || scx_ops.cpu_release)
                static_branch_enable(&scx_ops_cpu_preempt);
