Skip to content

Commit 3dcac25

Browse files
author
Peter Zijlstra
committed
sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
Since commit b2a02fc ("smp: Optimize send_call_function_single_ipi()") an idle CPU in TIF_POLLING_NRFLAG mode can be pulled out of idle by setting TIF_NEED_RESCHED flag to service an IPI without actually sending an interrupt. Even in cases where the IPI handler does not queue a task on the idle CPU, do_idle() will call __schedule() since need_resched() returns true in these cases.

Introduce and use SM_IDLE to identify call to __schedule() from schedule_idle() and shorten the idle re-entry time by skipping pick_next_task() when nr_running is 0 and the previous task is the idle task.

With the SM_IDLE fast-path, the time taken to complete a fixed set of IPIs using ipistorm improves noticeably. Following are the numbers from a dual socket Intel Ice Lake Xeon server (2 x 32C/64T) and 3rd Generation AMD EPYC system (2 x 64C/128T) (boost on, C2 disabled) running ipistorm between CPU8 and CPU16:

cmdline: insmod ipistorm.ko numipi=100000 single=1 offset=8 cpulist=8 wait=1

  ==================================================================
  Test          : ipistorm (modified)
  Units         : Normalized runtime
  Interpretation: Lower is better
  Statistic     : AMean
  ======================= Intel Ice Lake Xeon ======================
  kernel:                       time [pct imp]
  tip:sched/core                1.00 [baseline]
  tip:sched/core + SM_IDLE      0.80 [20.51%]
  ==================== 3rd Generation AMD EPYC =====================
  kernel:                       time [pct imp]
  tip:sched/core                1.00 [baseline]
  tip:sched/core + SM_IDLE      0.90 [10.17%]
  ==================================================================

[ kprateek: Commit message, SM_RTLOCK_WAIT fix ]

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Not-yet-signed-off-by: Peter Zijlstra <[email protected]>
Signed-off-by: K Prateek Nayak <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Vincent Guittot <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent b2d7022 commit 3dcac25

File tree

1 file changed

+26
-19
lines changed

1 file changed

+26
-19
lines changed

kernel/sched/core.c

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6410,19 +6410,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  * Constants for the sched_mode argument of __schedule().
  *
  * The mode argument allows RT enabled kernels to differentiate a
- * preemption from blocking on an 'sleeping' spin/rwlock. Note that
- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
- * optimize the AND operation out and just check for zero.
+ * preemption from blocking on an 'sleeping' spin/rwlock.
  */
-#define SM_NONE			0x0
-#define SM_PREEMPT		0x1
-#define SM_RTLOCK_WAIT		0x2
-
-#ifndef CONFIG_PREEMPT_RT
-# define SM_MASK_PREEMPT	(~0U)
-#else
-# define SM_MASK_PREEMPT	SM_PREEMPT
-#endif
+#define SM_IDLE			(-1)
+#define SM_NONE			0
+#define SM_PREEMPT		1
+#define SM_RTLOCK_WAIT		2
 
 /*
  * __schedule() is the main scheduler function.
@@ -6463,9 +6456,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched notrace __schedule(unsigned int sched_mode)
+static void __sched notrace __schedule(int sched_mode)
 {
 	struct task_struct *prev, *next;
+	/*
+	 * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
+	 * as a preemption by schedule_debug() and RCU.
+	 */
+	bool preempt = sched_mode > SM_NONE;
 	unsigned long *switch_count;
 	unsigned long prev_state;
 	struct rq_flags rf;
@@ -6476,13 +6474,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
 
-	schedule_debug(prev, !!sched_mode);
+	schedule_debug(prev, preempt);
 
 	if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
 		hrtick_clear(rq);
 
 	local_irq_disable();
-	rcu_note_context_switch(!!sched_mode);
+	rcu_note_context_switch(preempt);
 
 	/*
 	 * Make sure that signal_pending_state()->signal_pending() below
@@ -6511,12 +6509,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 
 	switch_count = &prev->nivcsw;
 
+	/* Task state changes only considers SM_PREEMPT as preemption */
+	preempt = sched_mode == SM_PREEMPT;
+
 	/*
 	 * We must load prev->state once (task_struct::state is volatile), such
 	 * that we form a control dependency vs deactivate_task() below.
 	 */
 	prev_state = READ_ONCE(prev->__state);
-	if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
+	if (sched_mode == SM_IDLE) {
+		if (!rq->nr_running) {
+			next = prev;
+			goto picked;
+		}
+	} else if (!preempt && prev_state) {
 		if (signal_pending_state(prev_state, prev)) {
 			WRITE_ONCE(prev->__state, TASK_RUNNING);
 		} else {
@@ -6547,6 +6553,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	}
 
 	next = pick_next_task(rq, prev, &rf);
+picked:
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 #ifdef CONFIG_SCHED_DEBUG
@@ -6588,7 +6595,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	psi_account_irqtime(rq, prev, next);
 	psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-	trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
+	trace_sched_switch(preempt, prev, next, prev_state);
 
 	/* Also unlocks the rq: */
 	rq = context_switch(rq, prev, next, &rf);
@@ -6664,7 +6671,7 @@ static void sched_update_worker(struct task_struct *tsk)
 	}
 }
 
-static __always_inline void __schedule_loop(unsigned int sched_mode)
+static __always_inline void __schedule_loop(int sched_mode)
 {
 	do {
 		preempt_disable();
@@ -6709,7 +6716,7 @@ void __sched schedule_idle(void)
 	 */
 	WARN_ON_ONCE(current->__state);
 	do {
-		__schedule(SM_NONE);
+		__schedule(SM_IDLE);
 	} while (need_resched());
 }

0 commit comments

Comments
 (0)