Skip to content

Commit cfe32da

Browse files
arighi authored and gregkh committed
sched_ext: idle: Refresh idle masks during idle-to-idle transitions
[ Upstream commit a2a3374 ] With the consolidation of put_prev_task/set_next_task(), see commit 436f3ee ("sched: Combine the last put_prev_task() and the first set_next_task()"), we are now skipping the transition between these two functions when the previous and the next tasks are the same. As a result, the scx idle state of a CPU is updated only when transitioning to or from the idle thread. While this is generally correct, it can lead to uneven and inefficient core utilization in certain scenarios [1]. A typical scenario involves proactive wake-ups: scx_bpf_pick_idle_cpu() selects and marks an idle CPU as busy, followed by a wake-up via scx_bpf_kick_cpu(), without dispatching any tasks. In this case, the CPU continues running the idle thread, returns to idle, but remains marked as busy, preventing it from being selected again as an idle CPU (until a task eventually runs on it and releases the CPU). For example, running a workload that uses 20% of each CPU, combined with an scx scheduler using proactive wake-ups, results in the following core utilization: CPU 0: 25.7% CPU 1: 29.3% CPU 2: 26.5% CPU 3: 25.5% CPU 4: 0.0% CPU 5: 25.5% CPU 6: 0.0% CPU 7: 10.5% To address this, refresh the idle state also in pick_task_idle(), during idle-to-idle transitions, but only trigger ops.update_idle() on actual state changes to prevent unnecessary updates to the scx scheduler and maintain balanced state transitions. With this change in place, the core utilization in the previous example becomes the following: CPU 0: 18.8% CPU 1: 19.4% CPU 2: 18.0% CPU 3: 18.7% CPU 4: 19.3% CPU 5: 18.9% CPU 6: 18.7% CPU 7: 19.3% [1] sched-ext/scx#1139 Fixes: 7c65ae8 ("sched_ext: Don't call put_prev_task_scx() before picking the next task") Signed-off-by: Andrea Righi <[email protected]> Signed-off-by: Tejun Heo <[email protected]> Signed-off-by: Sasha Levin <[email protected]>
1 parent 11cb1d6 commit cfe32da

File tree

3 files changed

+59
-15
lines changed

3 files changed

+59
-15
lines changed

kernel/sched/ext.c

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3240,16 +3240,8 @@ static void reset_idle_masks(void)
32403240
cpumask_copy(idle_masks.smt, cpu_online_mask);
32413241
}
32423242

3243-
void __scx_update_idle(struct rq *rq, bool idle)
3243+
static void update_builtin_idle(int cpu, bool idle)
32443244
{
3245-
int cpu = cpu_of(rq);
3246-
3247-
if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
3248-
SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
3249-
if (!static_branch_unlikely(&scx_builtin_idle_enabled))
3250-
return;
3251-
}
3252-
32533245
if (idle)
32543246
cpumask_set_cpu(cpu, idle_masks.cpu);
32553247
else
@@ -3276,6 +3268,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
32763268
#endif
32773269
}
32783270

3271+
/*
3272+
* Update the idle state of a CPU to @idle.
3273+
*
3274+
* If @do_notify is true, ops.update_idle() is invoked to notify the scx
3275+
* scheduler of an actual idle state transition (idle to busy or vice
3276+
* versa). If @do_notify is false, only the idle state in the idle masks is
3277+
* refreshed without invoking ops.update_idle().
3278+
*
3279+
* This distinction is necessary, because an idle CPU can be "reserved" and
3280+
* awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
3281+
* busy even if no tasks are dispatched. In this case, the CPU may return
3282+
* to idle without a true state transition. Refreshing the idle masks
3283+
* without invoking ops.update_idle() ensures accurate idle state tracking
3284+
* while avoiding unnecessary updates and maintaining balanced state
3285+
* transitions.
3286+
*/
3287+
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
3288+
{
3289+
int cpu = cpu_of(rq);
3290+
3291+
lockdep_assert_rq_held(rq);
3292+
3293+
/*
3294+
* Trigger ops.update_idle() only when transitioning from a task to
3295+
* the idle thread and vice versa.
3296+
*
3297+
* Idle transitions are indicated by do_notify being set to true,
3298+
* managed by put_prev_task_idle()/set_next_task_idle().
3299+
*/
3300+
if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
3301+
SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
3302+
3303+
/*
3304+
* Update the idle masks:
3305+
* - for real idle transitions (do_notify == true)
3306+
* - for idle-to-idle transitions (indicated by the previous task
3307+
* being the idle thread, managed by pick_task_idle())
3308+
*
3309+
* Skip updating idle masks if the previous task is not the idle
3310+
* thread, since set_next_task_idle() has already handled it when
3311+
* transitioning from a task to the idle thread (calling this
3312+
* function with do_notify == true).
3313+
*
3314+
* In this way we can avoid updating the idle masks twice,
3315+
* unnecessarily.
3316+
*/
3317+
if (static_branch_likely(&scx_builtin_idle_enabled))
3318+
if (do_notify || is_idle_task(rq->curr))
3319+
update_builtin_idle(cpu, idle);
3320+
}
3321+
32793322
static void handle_hotplug(struct rq *rq, bool online)
32803323
{
32813324
int cpu = cpu_of(rq);

kernel/sched/ext.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
5757
#endif /* CONFIG_SCHED_CLASS_EXT */
5858

5959
#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
60-
void __scx_update_idle(struct rq *rq, bool idle);
60+
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
6161

62-
static inline void scx_update_idle(struct rq *rq, bool idle)
62+
static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
6363
{
6464
if (scx_enabled())
65-
__scx_update_idle(rq, idle);
65+
__scx_update_idle(rq, idle, do_notify);
6666
}
6767
#else
68-
static inline void scx_update_idle(struct rq *rq, bool idle) {}
68+
static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
6969
#endif
7070

7171
#ifdef CONFIG_CGROUP_SCHED

kernel/sched/idle.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -453,19 +453,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
453453
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
454454
{
455455
dl_server_update_idle_time(rq, prev);
456-
scx_update_idle(rq, false);
456+
scx_update_idle(rq, false, true);
457457
}
458458

459459
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
460460
{
461461
update_idle_core(rq);
462-
scx_update_idle(rq, true);
462+
scx_update_idle(rq, true, true);
463463
schedstat_inc(rq->sched_goidle);
464464
next->se.exec_start = rq_clock_task(rq);
465465
}
466466

467467
struct task_struct *pick_task_idle(struct rq *rq)
468468
{
469+
scx_update_idle(rq, true, false);
469470
return rq->idle;
470471
}
471472

0 commit comments

Comments (0)