
Commit 3c9880f

ftrace: Use a running sleeptime instead of saving on shadow stack
The fgraph "sleep-time" option tells the function graph tracer and the profiler whether to include the time a function "sleeps" (is scheduled off the CPU) in the duration reported for that function. It defaults to true, which means a function's duration is simply the timestamp of when the function exits minus the timestamp of when it was entered. If the "sleep-time" option is disabled, the time the task spent off the CPU during the function must be removed.

Currently that is done in a sched_switch tracepoint probe, which moves the "calltime" (time of entry of the function) forward by the calculated sleep time. It updates every calltime on the shadow stack, which is time consuming for users of the function graph tracer who do not care about the sleep time.

Instead, add a "ftrace_sleeptime" field to the task_struct that has the sleep time added to it every time the task wakes up. Have the function entry save the current "ftrace_sleeptime", and on function exit move the calltime forward by the difference between the current "ftrace_sleeptime" and the saved one.

This removes one dependency on "calltime" having to live on the shadow stack. It also simplifies the code that removes the sleep time of functions.

TODO: Only enable the sched_switch tracepoint when this is needed.

Cc: Mark Rutland <[email protected]>
Cc: Mathieu Desnoyers <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Jiri Olsa <[email protected]>
Link: https://lore.kernel.org/[email protected]
Acked-by: Masami Hiramatsu (Google) <[email protected]>
Signed-off-by: Steven Rostedt (Google) <[email protected]>
Parent: a312a0f
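The change is easiest to see in miniature. Below is a hypothetical user-space sketch of the bookkeeping the commit describes: a per-task counter accumulates sleep at every wakeup, a traced function records the counter on entry, and the exit path subtracts only the sleep that happened inside that function. The struct, the fake clock, and all names here are illustrative stand-ins, not the kernel's APIs; in the kernel, the behavior is controlled by the function_graph tracer's "sleep-time" option.

#include <stdio.h>

struct task {
	unsigned long long now;		/* fake clock, in ns (stand-in for trace_clock_local()) */
	unsigned long long sleeptime;	/* running total of time spent asleep */
};

/* Scheduler switches the task back in after it slept for 'slept' ns:
 * one addition to the running counter, no shadow-stack walk needed. */
static void sched_in(struct task *t, unsigned long long slept)
{
	t->now += slept;
	t->sleeptime += slept;
}

int main(void)
{
	struct task t = { .now = 0, .sleeptime = 0 };

	/* Function entry: save the clock and the running sleeptime. */
	unsigned long long calltime = t.now;
	unsigned long long saved_sleep = t.sleeptime;

	t.now += 100;			/* 100 ns of real work */
	sched_in(&t, 5000);		/* slept 5000 ns in the middle */
	t.now += 200;			/* 200 ns more work */

	/* Function exit: subtract only the sleep that happened while
	 * this function was on the stack. */
	unsigned long long duration = (t.now - calltime) -
				      (t.sleeptime - saved_sleep);

	printf("duration without sleep: %llu ns\n", duration); /* prints 300 */
	return 0;
}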

5 files changed, 60 insertions(+), 25 deletions(-)

include/linux/sched.h

Lines changed: 1 addition & 0 deletions
@@ -1441,6 +1441,7 @@ struct task_struct {
 
 	/* Timestamp for last schedule: */
 	unsigned long long		ftrace_timestamp;
+	unsigned long long		ftrace_sleeptime;
 
 	/*
 	 * Number of functions that haven't been traced

kernel/trace/fgraph.c

Lines changed: 2 additions & 14 deletions
@@ -495,7 +495,7 @@ void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth)
 }
 
 /* Both enabled by default (can be cleared by function_graph tracer flags */
-static bool fgraph_sleep_time = true;
+bool fgraph_sleep_time = true;
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 /*
@@ -1046,9 +1046,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
 				struct task_struct *next,
 				unsigned int prev_state)
 {
-	struct ftrace_ret_stack *ret_stack;
 	unsigned long long timestamp;
-	int offset;
 
 	/*
 	 * Does the user want to count the time a function was asleep.
@@ -1065,17 +1063,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
 	if (!next->ftrace_timestamp)
 		return;
 
-	/*
-	 * Update all the counters in next to make up for the
-	 * time next was sleeping.
-	 */
-	timestamp -= next->ftrace_timestamp;
-
-	for (offset = next->curr_ret_stack; offset > 0; ) {
-		ret_stack = get_ret_stack(next, offset, &offset);
-		if (ret_stack)
-			ret_stack->calltime += timestamp;
-	}
+	next->ftrace_sleeptime += timestamp - next->ftrace_timestamp;
 }
 
 static DEFINE_PER_CPU(unsigned long *, idle_ret_stack);

kernel/trace/ftrace.c

Lines changed: 28 additions & 11 deletions
@@ -820,30 +820,39 @@ void ftrace_graph_graph_time_control(bool enable)
 	fgraph_graph_time = enable;
 }
 
+struct profile_fgraph_data {
+	unsigned long long		subtime;
+	unsigned long long		sleeptime;
+};
+
 static int profile_graph_entry(struct ftrace_graph_ent *trace,
 			       struct fgraph_ops *gops)
 {
-	unsigned long long *subtime;
+	struct profile_fgraph_data *profile_data;
 
 	function_profile_call(trace->func, 0, NULL, NULL);
 
 	/* If function graph is shutting down, ret_stack can be NULL */
 	if (!current->ret_stack)
 		return 0;
 
-	subtime = fgraph_reserve_data(gops->idx, sizeof(*subtime));
-	if (subtime)
-		*subtime = 0;
+	profile_data = fgraph_reserve_data(gops->idx, sizeof(*profile_data));
+	if (!profile_data)
+		return 0;
+
+	profile_data->subtime = 0;
+	profile_data->sleeptime = current->ftrace_sleeptime;
 
 	return 1;
 }
 
 static void profile_graph_return(struct ftrace_graph_ret *trace,
 				 struct fgraph_ops *gops)
 {
+	struct profile_fgraph_data *profile_data;
+	struct profile_fgraph_data *parent_data;
 	struct ftrace_profile_stat *stat;
 	unsigned long long calltime;
-	unsigned long long *subtime;
 	struct ftrace_profile *rec;
 	unsigned long flags;
 	int size;
@@ -859,16 +868,24 @@ static void profile_graph_return(struct ftrace_graph_ret *trace,
 
 	calltime = trace->rettime - trace->calltime;
 
+	if (!fgraph_sleep_time) {
+		profile_data = fgraph_retrieve_data(gops->idx, &size);
+		if (profile_data && current->ftrace_sleeptime)
+			calltime -= current->ftrace_sleeptime - profile_data->sleeptime;
+	}
+
 	if (!fgraph_graph_time) {
 
 		/* Append this call time to the parent time to subtract */
-		subtime = fgraph_retrieve_parent_data(gops->idx, &size, 1);
-		if (subtime)
-			*subtime += calltime;
+		parent_data = fgraph_retrieve_parent_data(gops->idx, &size, 1);
+		if (parent_data)
+			parent_data->subtime += calltime;
+
+		if (!profile_data)
+			profile_data = fgraph_retrieve_data(gops->idx, &size);
 
-		subtime = fgraph_retrieve_data(gops->idx, &size);
-		if (subtime && *subtime && *subtime < calltime)
-			calltime -= *subtime;
+		if (profile_data && profile_data->subtime && profile_data->subtime < calltime)
+			calltime -= profile_data->subtime;
 		else
 			calltime = 0;
 	}

kernel/trace/trace.h

Lines changed: 1 addition & 0 deletions
@@ -1048,6 +1048,7 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 extern unsigned int fgraph_max_depth;
+extern bool fgraph_sleep_time;
 
 static inline bool
 ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace)

kernel/trace/trace_functions_graph.c

Lines changed: 28 additions & 0 deletions
@@ -133,6 +133,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
 	unsigned long *task_var = fgraph_get_task_var(gops);
 	struct trace_array *tr = gops->private;
 	struct trace_array_cpu *data;
+	unsigned long *sleeptime;
 	unsigned long flags;
 	unsigned int trace_ctx;
 	long disabled;
@@ -167,6 +168,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
 	if (ftrace_graph_ignore_irqs())
 		return 0;
 
+	/* save the current sleep time if we are to ignore it */
+	if (!fgraph_sleep_time) {
+		sleeptime = fgraph_reserve_data(gops->idx, sizeof(*sleeptime));
+		if (sleeptime)
+			*sleeptime = current->ftrace_sleeptime;
+	}
+
 	/*
 	 * Stop here if tracing_threshold is set. We only write function return
 	 * events to the ring buffer.
@@ -238,6 +246,22 @@ void __trace_graph_return(struct trace_array *tr,
 	trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
+static void handle_nosleeptime(struct ftrace_graph_ret *trace,
+			       struct fgraph_ops *gops)
+{
+	unsigned long long *sleeptime;
+	int size;
+
+	if (fgraph_sleep_time)
+		return;
+
+	sleeptime = fgraph_retrieve_data(gops->idx, &size);
+	if (!sleeptime)
+		return;
+
+	trace->calltime += current->ftrace_sleeptime - *sleeptime;
+}
+
 void trace_graph_return(struct ftrace_graph_ret *trace,
 			struct fgraph_ops *gops)
 {
@@ -256,6 +280,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
 		return;
 	}
 
+	handle_nosleeptime(trace, gops);
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = per_cpu_ptr(tr->array_buffer.data, cpu);
@@ -278,6 +304,8 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
 		return;
 	}
 
+	handle_nosleeptime(trace, gops);
+
 	if (tracing_thresh &&
 	    (trace->rettime - trace->calltime < tracing_thresh))
 		return;
