Skip to content

Commit fa2c325

Browse files
Valentin SchneiderPeter Zijlstra
authored andcommitted
sched/tracing: Don't re-read p->state when emitting sched_switch event
As of commit c6e7bd7 ("sched/core: Optimize ttwu() spinning on p->on_cpu") the following sequence becomes possible: p->__state = TASK_INTERRUPTIBLE; __schedule() deactivate_task(p); ttwu() READ !p->on_rq p->__state=TASK_WAKING trace_sched_switch() __trace_sched_switch_state() task_state_index() return 0; TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in the trace event. Prevent this by pushing the value read from __schedule() down the trace event. Reported-by: Abhijeet Dharmapurikar <[email protected]> Signed-off-by: Valentin Schneider <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Reviewed-by: Steven Rostedt (Google) <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 49bef33 commit fa2c325

File tree

9 files changed

+34
-14
lines changed

9 files changed

+34
-14
lines changed

include/linux/sched.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1620,10 +1620,10 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
16201620
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
16211621
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
16221622

1623-
static inline unsigned int task_state_index(struct task_struct *tsk)
1623+
static inline unsigned int __task_state_index(unsigned int tsk_state,
1624+
unsigned int tsk_exit_state)
16241625
{
1625-
unsigned int tsk_state = READ_ONCE(tsk->__state);
1626-
unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
1626+
unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;
16271627

16281628
BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
16291629

@@ -1633,6 +1633,11 @@ static inline unsigned int task_state_index(struct task_struct *tsk)
16331633
return fls(state);
16341634
}
16351635

1636+
static inline unsigned int task_state_index(struct task_struct *tsk)
1637+
{
1638+
return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state);
1639+
}
1640+
16361641
static inline char task_index_to_char(unsigned int state)
16371642
{
16381643
static const char state_char[] = "RSDTtXZPI";

include/trace/events/sched.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,9 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
187187
TP_ARGS(p));
188188

189189
#ifdef CREATE_TRACE_POINTS
190-
static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
190+
static inline long __trace_sched_switch_state(bool preempt,
191+
unsigned int prev_state,
192+
struct task_struct *p)
191193
{
192194
unsigned int state;
193195

@@ -208,7 +210,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
208210
* it for left shift operation to get the correct task->state
209211
* mapping.
210212
*/
211-
state = task_state_index(p);
213+
state = __task_state_index(prev_state, p->exit_state);
212214

213215
return state ? (1 << (state - 1)) : state;
214216
}
@@ -220,10 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
220222
TRACE_EVENT(sched_switch,
221223

222224
TP_PROTO(bool preempt,
225+
unsigned int prev_state,
223226
struct task_struct *prev,
224227
struct task_struct *next),
225228

226-
TP_ARGS(preempt, prev, next),
229+
TP_ARGS(preempt, prev_state, prev, next),
227230

228231
TP_STRUCT__entry(
229232
__array( char, prev_comm, TASK_COMM_LEN )
@@ -239,7 +242,7 @@ TRACE_EVENT(sched_switch,
239242
memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
240243
__entry->prev_pid = prev->pid;
241244
__entry->prev_prio = prev->prio;
242-
__entry->prev_state = __trace_sched_switch_state(preempt, prev);
245+
__entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
243246
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
244247
__entry->next_pid = next->pid;
245248
__entry->next_prio = next->prio;

kernel/sched/core.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4836,7 +4836,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
48364836
{
48374837
struct rq *rq = this_rq();
48384838
struct mm_struct *mm = rq->prev_mm;
4839-
long prev_state;
4839+
unsigned int prev_state;
48404840

48414841
/*
48424842
* The previous task will have left us with a preempt_count of 2
@@ -6300,7 +6300,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
63006300
migrate_disable_switch(rq, prev);
63016301
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
63026302

6303-
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
6303+
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
63046304

63056305
/* Also unlocks the rq: */
63066306
rq = context_switch(rq, prev, next, &rf);

kernel/trace/fgraph.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,9 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
415415

416416
static void
417417
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
418-
struct task_struct *prev, struct task_struct *next)
418+
unsigned int prev_state,
419+
struct task_struct *prev,
420+
struct task_struct *next)
419421
{
420422
unsigned long long timestamp;
421423
int index;

kernel/trace/ftrace.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7347,7 +7347,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
73477347

73487348
static void
73497349
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
7350-
struct task_struct *prev, struct task_struct *next)
7350+
unsigned int prev_state,
7351+
struct task_struct *prev,
7352+
struct task_struct *next)
73517353
{
73527354
struct trace_array *tr = data;
73537355
struct trace_pid_list *pid_list;

kernel/trace/trace_events.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -759,7 +759,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
759759

760760
static void
761761
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
762-
struct task_struct *prev, struct task_struct *next)
762+
unsigned int prev_state,
763+
struct task_struct *prev,
764+
struct task_struct *next)
763765
{
764766
struct trace_array *tr = data;
765767
struct trace_pid_list *no_pid_list;
@@ -783,7 +785,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
783785

784786
static void
785787
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
786-
struct task_struct *prev, struct task_struct *next)
788+
unsigned int prev_state,
789+
struct task_struct *prev,
790+
struct task_struct *next)
787791
{
788792
struct trace_array *tr = data;
789793
struct trace_pid_list *no_pid_list;

kernel/trace/trace_osnoise.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
11671167
* used to record the beginning and to report the end of a thread noise window.
11681168
*/
11691169
static void
1170-
trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
1170+
trace_sched_switch_callback(void *data, bool preempt,
1171+
unsigned int prev_state,
1172+
struct task_struct *p,
11711173
struct task_struct *n)
11721174
{
11731175
struct osnoise_variables *osn_var = this_cpu_osn_var();

kernel/trace/trace_sched_switch.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);
2222

2323
static void
2424
probe_sched_switch(void *ignore, bool preempt,
25+
unsigned int prev_state,
2526
struct task_struct *prev, struct task_struct *next)
2627
{
2728
int flags;

kernel/trace/trace_sched_wakeup.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
426426

427427
static void notrace
428428
probe_wakeup_sched_switch(void *ignore, bool preempt,
429+
unsigned int prev_state,
429430
struct task_struct *prev, struct task_struct *next)
430431
{
431432
struct trace_array_cpu *data;

0 commit comments

Comments
 (0)