@@ -122,6 +122,19 @@ enum scx_ops_flags {
 	 */
 	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,
 
+	/*
+	 * A migration disabled task can only execute on its current CPU. By
+	 * default, such tasks are automatically put on the CPU's local DSQ with
+	 * the default slice on enqueue. If this ops flag is set, they also go
+	 * through ops.enqueue().
+	 *
+	 * A migration disabled task never invokes ops.select_cpu() as it can
+	 * only select the current CPU. Also, p->cpus_ptr will only contain its
+	 * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+	 * and thus may disagree with cpumask_weight(p->cpus_ptr).
+	 */
+	SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
 	/*
 	 * CPU cgroup support flags
 	 */
@@ -130,6 +143,7 @@ enum scx_ops_flags {
 	SCX_OPS_ALL_FLAGS	= SCX_OPS_KEEP_BUILTIN_IDLE |
 				  SCX_OPS_ENQ_LAST |
 				  SCX_OPS_ENQ_EXITING |
+				  SCX_OPS_ENQ_MIGRATION_DISABLED |
 				  SCX_OPS_SWITCH_PARTIAL |
 				  SCX_OPS_HAS_CGROUP_WEIGHT,
 };
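Editor's note: for context, a scheduler opts into this behavior by setting the flag in its sched_ext_ops and can then see migration disabled tasks in ops.enqueue(). The fragment below is a minimal illustrative sketch, not part of this patch: the names sample_enqueue/sample_ops are hypothetical, and it assumes the scx common BPF headers, a kernel that exposes SCX_OPS_ENQ_MIGRATION_DISABLED, the scx_bpf_dsq_insert() kfunc (scx_bpf_dispatch() on older kernels), and CO-RE access to p->migration_disabled on CONFIG_SMP kernels.

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

void BPF_STRUCT_OPS(sample_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * With SCX_OPS_ENQ_MIGRATION_DISABLED set, migration disabled tasks
	 * reach ops.enqueue() instead of being bounced to the local DSQ by
	 * the core. They can only run on task_cpu(p), so keep them local.
	 * Note that p->cpus_ptr contains only the current CPU here while
	 * p->nr_cpus_allowed keeps tracking p->user_cpus_ptr.
	 */
	if (p->migration_disabled) {
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
		return;
	}

	/* everything else goes to the global DSQ in this toy example */
	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SCX_OPS_DEFINE(sample_ops,
	       .enqueue	= (void *)sample_enqueue,
	       .flags	= SCX_OPS_ENQ_MIGRATION_DISABLED,
	       .name	= "sample");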
@@ -416,7 +430,7 @@ struct sched_ext_ops {
 
 	/**
 	 * @update_idle: Update the idle state of a CPU
-	 * @cpu: CPU to udpate the idle state for
+	 * @cpu: CPU to update the idle state for
 	 * @idle: whether entering or exiting the idle state
 	 *
 	 * This operation is called when @rq's CPU goes or leaves the idle
@@ -882,6 +896,7 @@ static bool scx_warned_zero_slice;
 
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 
@@ -1214,7 +1229,7 @@ static bool scx_kf_allowed_if_unlocked(void)
 
 /**
  * nldsq_next_task - Iterate to the next task in a non-local DSQ
- * @dsq: user dsq being interated
+ * @dsq: user dsq being iterated
  * @cur: current position, %NULL to start iteration
  * @rev: walk backwards
  *
@@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	    unlikely(p->flags & PF_EXITING))
 		goto local;
 
+	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+	if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+	    is_migration_disabled(p))
+		goto local;
+
 	if (!SCX_HAS_OP(enqueue))
 		goto global;
 
@@ -2078,7 +2098,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
 
 	/*
 	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
-	 * appened to the runnable_list.
+	 * appended to the runnable_list.
 	 */
 	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
 }
@@ -2313,12 +2333,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
  *
  * - The BPF scheduler is bypassed while the rq is offline and we can always say
  *   no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
  */
 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 				      bool trigger_error)
 {
 	int cpu = cpu_of(rq);
 
+	SCHED_WARN_ON(task_cpu(p) == cpu);
+
+	/*
+	 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+	 * the pinned CPU in migrate_disable_switch() while @p is being switched
+	 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+	 * updated and thus another CPU may see @p on a DSQ inbetween leading to
+	 * @p passing the below task_allowed_on_cpu() check while migration is
+	 * disabled.
+	 *
+	 * Test the migration disabled state first as the race window is narrow
+	 * and the BPF scheduler failing to check migration disabled state can
+	 * easily be masked if task_allowed_on_cpu() is done first.
+	 */
+	if (unlikely(is_migration_disabled(p))) {
+		if (trigger_error)
+			scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+				      p->comm, p->pid, task_cpu(p), cpu);
+		return false;
+	}
+
 	/*
 	 * We don't require the BPF scheduler to avoid dispatching to offline
 	 * CPUs mostly for convenience but also because CPUs can go offline
@@ -2327,14 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 	 */
 	if (!task_allowed_on_cpu(p, cpu)) {
 		if (trigger_error)
-			scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
-				      cpu_of(rq), p->comm, p->pid);
+			scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+				      cpu, p->comm, p->pid);
 		return false;
 	}
 
-	if (unlikely(is_migration_disabled(p)))
-		return false;
-
 	if (!scx_rq_online(rq))
 		return false;
 
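Editor's note: with the check above, an SCX_DSQ_LOCAL[_ON] dispatch that would move a migration disabled task off its CPU now raises an ops error instead of failing silently, so BPF schedulers that target remote CPUs should filter such tasks themselves. A possible guard is sketched below; the helper name can_dispatch_to_cpu is hypothetical, and the sketch assumes CO-RE access to p->migration_disabled plus the scx_bpf_task_cpu() and bpf_cpumask_test_cpu() kfuncs. It deliberately checks the migration disabled state directly rather than inferring it from p->cpus_ptr, which, per the comment above, may still show the full affinity mask in a narrow window.

static __always_inline bool can_dispatch_to_cpu(struct task_struct *p, s32 cpu)
{
	/*
	 * A migration disabled task must stay on task_cpu(p); moving it via
	 * SCX_DSQ_LOCAL_ON would now trigger scx_ops_error() and abort the
	 * scheduler.
	 */
	if (p->migration_disabled)
		return cpu == scx_bpf_task_cpu(p);

	return bpf_cpumask_test_cpu(cpu, p->cpus_ptr);
}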
@@ -2437,7 +2477,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
 
 	if (dst_dsq->id == SCX_DSQ_LOCAL) {
 		dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
-		if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+		if (src_rq != dst_rq &&
+		    unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
 			dst_dsq = find_global_dsq(p);
 			dst_rq = src_rq;
 		}
@@ -2480,7 +2521,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
 /*
  * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
  * banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artifical delays while the
+ * to the bypass mode can take a long time. Inject artificial delays while the
  * bypass mode is switching to guarantee timely completion.
  */
 static void scx_ops_breather(struct rq *rq)
@@ -2575,6 +2616,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 {
 	struct rq *src_rq = task_rq(p);
 	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+#ifdef CONFIG_SMP
+	struct rq *locked_rq = rq;
+#endif
 
 	/*
 	 * We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2588,7 +2632,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	}
 
 #ifdef CONFIG_SMP
-	if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+	if (src_rq != dst_rq &&
+	    unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
 		dispatch_enqueue(find_global_dsq(p), p,
 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
 		return;
@@ -2611,8 +2656,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
 
 	/* switch to @src_rq lock */
-	if (rq != src_rq) {
-		raw_spin_rq_unlock(rq);
+	if (locked_rq != src_rq) {
+		raw_spin_rq_unlock(locked_rq);
+		locked_rq = src_rq;
 		raw_spin_rq_lock(src_rq);
 	}
 
@@ -2630,6 +2676,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 		} else {
 			move_remote_task_to_local_dsq(p, enq_flags,
 						      src_rq, dst_rq);
+			/* task has been moved to dst_rq, which is now locked */
+			locked_rq = dst_rq;
 		}
 
 		/* if the destination CPU is idle, wake it up */
@@ -2638,8 +2686,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
 	}
 
 	/* switch back to @rq lock */
-	if (rq != dst_rq) {
-		raw_spin_rq_unlock(dst_rq);
+	if (locked_rq != rq) {
+		raw_spin_rq_unlock(locked_rq);
 		raw_spin_rq_lock(rq);
 	}
 #else	/* CONFIG_SMP */
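Editor's note: the locked_rq bookkeeping introduced above addresses a subtle issue: after move_remote_task_to_local_dsq() the lock actually held is dst_rq's, so deciding what to unlock by comparing against @rq can drop the wrong lock. The standalone userspace sketch below (pthreads, hypothetical names) illustrates only the bookkeeping pattern of tracking the currently held lock in a local variable; it is not kernel code.

#include <pthread.h>

static pthread_mutex_t home = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t src  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dst  = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for a step that hands back a different lock than it was given */
static pthread_mutex_t *move_work(void)
{
	pthread_mutex_unlock(&src);
	pthread_mutex_lock(&dst);
	return &dst;
}

/*
 * Must be entered and left with @home held, like dispatch_to_local_dsq()
 * with the caller's @rq lock.
 */
static void do_dispatch(void)
{
	pthread_mutex_t *locked = &home;	/* what we currently hold */

	if (locked != &src) {			/* switch to @src */
		pthread_mutex_unlock(locked);
		locked = &src;
		pthread_mutex_lock(&src);
	}

	locked = move_work();			/* may now hold @dst instead */

	if (locked != &home) {			/* restore the caller's lock */
		pthread_mutex_unlock(locked);
		pthread_mutex_lock(&home);
	}
}

int main(void)
{
	pthread_mutex_lock(&home);
	do_dispatch();
	pthread_mutex_unlock(&home);
	return 0;
}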
@@ -3144,7 +3192,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
  *
  * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
  * to implement the default task ordering. The older the timestamp, the higher
- * prority the task - the global FIFO ordering matching the default scheduling
+ * priority the task - the global FIFO ordering matching the default scheduling
  * behavior.
  *
  * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
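Editor's note: the comment above describes the default core-sched ordering by @p->scx.core_sched_at. For illustration only, a scheduler running on a CONFIG_SCHED_CORE kernel can override that ordering via ops.core_sched_before(); the sketch below (hypothetical name sample_core_sched_before) orders SMT sibling picks by the scheduler's own dsq_vtime instead, returning %true when @a should run before @b.

bool BPF_STRUCT_OPS(sample_core_sched_before, struct task_struct *a,
		    struct task_struct *b)
{
	/* smaller virtual time runs first; wraparound-safe comparison */
	return (s64)(a->scx.dsq_vtime - b->scx.dsq_vtime) < 0;
}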
@@ -3851,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
 		curr->scx.slice = 0;
 		touch_core_sched(rq, curr);
 	} else if (SCX_HAS_OP(tick)) {
-		SCX_CALL_OP(SCX_KF_REST, tick, curr);
+		SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
 	}
 
 	if (!curr->scx.slice)
@@ -3998,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p)
 	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
 
 	if (SCX_HAS_OP(disable))
-		SCX_CALL_OP(SCX_KF_REST, disable, p);
+		SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
 	scx_set_task_state(p, SCX_TASK_READY);
 }
 
@@ -4027,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p)
 	}
 
 	if (SCX_HAS_OP(exit_task))
-		SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
+		SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
 	scx_set_task_state(p, SCX_TASK_NONE);
 }
 
@@ -4323,24 +4371,11 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
 	return ops_sanitize_err("cgroup_prep_move", ret);
 }
 
-void scx_move_task(struct task_struct *p)
+void scx_cgroup_move_task(struct task_struct *p)
 {
 	if (!scx_cgroup_enabled)
 		return;
 
-	/*
-	 * We're called from sched_move_task() which handles both cgroup and
-	 * autogroup moves. Ignore the latter.
-	 *
-	 * Also ignore exiting tasks, because in the exit path tasks transition
-	 * from the autogroup to the root group, so task_group_is_autogroup()
-	 * alone isn't able to catch exiting autogroup tasks. This is safe for
-	 * cgroup_move(), because cgroup migrations never happen for PF_EXITING
-	 * tasks.
-	 */
-	if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
-		return;
-
 	/*
 	 * @p must have ops.cgroup_prep_move() called on it and thus
 	 * cgrp_moving_from set.
@@ -4590,7 +4625,7 @@ static int scx_cgroup_init(void)
 	cgroup_warned_missing_idle = false;
 
 	/*
-	 * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
+	 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
 	 * cgroups and init, all online cgroups are initialized.
 	 */
 	rcu_read_lock();
@@ -5059,6 +5094,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 		static_branch_disable(&scx_has_op[i]);
 	static_branch_disable(&scx_ops_enq_last);
 	static_branch_disable(&scx_ops_enq_exiting);
+	static_branch_disable(&scx_ops_enq_migration_disabled);
 	static_branch_disable(&scx_ops_cpu_preempt);
 	static_branch_disable(&scx_builtin_idle_enabled);
 	synchronize_rcu();
@@ -5277,9 +5313,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
 		  scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
 		  p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
 		  ops_state >> SCX_OPSS_QSEQ_SHIFT);
-	dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu",
-		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
-		  p->scx.dsq_vtime, p->scx.slice);
+	dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
+		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+	dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
+		  p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
 	dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
 
 	if (SCX_HAS_OP(dump_task)) {
@@ -5667,6 +5704,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	if (ops->flags & SCX_OPS_ENQ_EXITING)
 		static_branch_enable(&scx_ops_enq_exiting);
+	if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
+		static_branch_enable(&scx_ops_enq_migration_disabled);
 	if (scx_ops.cpu_acquire || scx_ops.cpu_release)
 		static_branch_enable(&scx_ops_cpu_preempt);
 