Skip to content

Commit 2dffa35

Browse files
jpoimboe and rostedt
authored and committed
unwind_user/deferred: Add deferred unwinding interface
Add an interface for scheduling task work to unwind the user space stack before returning to user space. This solves several problems for its callers: - Ensure the unwind happens in task context even if the caller may be running in interrupt context. - Avoid duplicate unwinds, whether called multiple times by the same caller or by different callers. - Create a "context cookie" which allows trace post-processing to correlate kernel unwinds/traces with the user unwind. A concept of a "cookie" is created to detect when the stacktrace is the same. A cookie is generated the first time a user space stacktrace is requested after the task enters the kernel. As the stacktrace is saved on the task_struct while the task is in the kernel, if another request comes in and the cookie is still the same, it will use the saved stacktrace, and not have to regenerate one. The cookie is passed to the caller on request, and when the stacktrace is generated upon returning to user space, it calls the requester's callback with the cookie as well as the stacktrace. The cookie is cleared when it goes back to user space. Note, this currently adds another conditional to the unwind_reset_info() path that is always called returning to user space, but future changes will put this back to a single conditional. A global list, protected by a global mutex, is created to hold the tracers that register with the unwind infrastructure. The number of registered tracers will be limited in future changes. Each perf program or ftrace instance will register its own descriptor to use for deferred unwind stack traces. Note, the function unwind_deferred_task_work() that gets called when returning to user space uses a global mutex for synchronization, which will cause a big bottleneck. This will be replaced by SRCU, but that change adds some complex synchronization that deserves its own commit. 
Cc: Masami Hiramatsu <[email protected]> Cc: Mathieu Desnoyers <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: Jiri Olsa <[email protected]> Cc: Arnaldo Carvalho de Melo <[email protected]> Cc: Namhyung Kim <[email protected]> Cc: Thomas Gleixner <[email protected]> Cc: Andrii Nakryiko <[email protected]> Cc: Indu Bhagat <[email protected]> Cc: "Jose E. Marchesi" <[email protected]> Cc: Beau Belgrave <[email protected]> Cc: Jens Remus <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Andrew Morton <[email protected]> Cc: Jens Axboe <[email protected]> Cc: Florian Weimer <[email protected]> Cc: Sam James <[email protected]> Link: https://lore.kernel.org/[email protected] Co-developed-by: Steven Rostedt (Google) <[email protected]> Signed-off-by: Josh Poimboeuf <[email protected]> Signed-off-by: Steven Rostedt (Google) <[email protected]>
1 parent b9c7352 commit 2dffa35

File tree

3 files changed

+203
-1
lines changed

3 files changed

+203
-1
lines changed

include/linux/unwind_deferred.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,39 @@
22
#ifndef _LINUX_UNWIND_USER_DEFERRED_H
33
#define _LINUX_UNWIND_USER_DEFERRED_H
44

5+
#include <linux/task_work.h>
56
#include <linux/unwind_user.h>
67
#include <linux/unwind_deferred_types.h>
78

9+
struct unwind_work;
10+
11+
typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie);
12+
13+
struct unwind_work {
14+
struct list_head list;
15+
unwind_callback_t func;
16+
};
17+
818
#ifdef CONFIG_UNWIND_USER
919

1020
void unwind_task_init(struct task_struct *task);
1121
void unwind_task_free(struct task_struct *task);
1222

1323
int unwind_user_faultable(struct unwind_stacktrace *trace);
1424

25+
int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
26+
int unwind_deferred_request(struct unwind_work *work, u64 *cookie);
27+
void unwind_deferred_cancel(struct unwind_work *work);
28+
1529
static __always_inline void unwind_reset_info(void)
1630
{
31+
if (unlikely(current->unwind_info.id.id))
32+
current->unwind_info.id.id = 0;
33+
/*
34+
* As unwind_user_faultable() can be called directly and
35+
* depends on nr_entries being cleared on exit to user,
36+
* this needs to be a separate conditional.
37+
*/
1738
if (unlikely(current->unwind_info.cache))
1839
current->unwind_info.cache->nr_entries = 0;
1940
}
@@ -24,6 +45,9 @@ static inline void unwind_task_init(struct task_struct *task) {}
2445
static inline void unwind_task_free(struct task_struct *task) {}
2546

2647
static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; }
48+
static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
49+
static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
50+
static inline void unwind_deferred_cancel(struct unwind_work *work) {}
2751

2852
static inline void unwind_reset_info(void) {}
2953

include/linux/unwind_deferred_types.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,32 @@ struct unwind_cache {
77
unsigned long entries[];
88
};
99

10+
/*
11+
* The unwind_task_id is a unique identifier that maps to a user space
12+
* stacktrace. It is generated the first time a deferred user space
13+
* stacktrace is requested after a task has entered the kerenl and
14+
* is cleared to zero when it exits. The mapped id will be a non-zero
15+
* number.
16+
*
17+
* To simplify the generation of the 64 bit number, 32 bits will be
18+
* the CPU it was generated on, and the other 32 bits will be a per
19+
* cpu counter that gets incremented by two every time a new identifier
20+
* is generated. The LSB will always be set to keep the value
21+
* from being zero.
22+
*/
23+
union unwind_task_id {
24+
struct {
25+
u32 cpu;
26+
u32 cnt;
27+
};
28+
u64 id;
29+
};
30+
1031
struct unwind_task_info {
1132
struct unwind_cache *cache;
33+
struct callback_head work;
34+
union unwind_task_id id;
35+
int pending;
1236
};
1337

1438
#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */

kernel/unwind/deferred.c

Lines changed: 155 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,63 @@
22
/*
33
* Deferred user space unwinding
44
*/
5+
#include <linux/sched/task_stack.h>
6+
#include <linux/unwind_deferred.h>
7+
#include <linux/sched/clock.h>
8+
#include <linux/task_work.h>
59
#include <linux/kernel.h>
610
#include <linux/sched.h>
711
#include <linux/sizes.h>
812
#include <linux/slab.h>
9-
#include <linux/unwind_deferred.h>
13+
#include <linux/mm.h>
1014

1115
/* Number of entries that still lets a struct unwind_cache fit in one 4K page */
#define UNWIND_MAX_ENTRIES					\
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))

/* List of registered callbacks; callback_mutex guards both adding and reading */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

/*
 * Per CPU counter from which the "entry context" identifiers are taken.
 *
 * The mental model is a counter that ticks on every entry of the CPU into
 * the kernel from user space, giving each entry context its own ID. To
 * avoid that cost it is in fact only advanced on demand, by the first
 * deferred unwind request that follows an entry from user space.
 *
 * Paired with the CPU id it forms the systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);
33+
34+
/*
35+
* The context cookie is a unique identifier that is assigned to a user
36+
* space stacktrace. As the user space stacktrace remains the same while
37+
* the task is in the kernel, the cookie is an identifier for the stacktrace.
38+
* Although it is possible for the stacktrace to get another cookie if another
39+
* request is made after the cookie was cleared and before reentering user
40+
* space.
41+
*/
42+
static u64 get_cookie(struct unwind_task_info *info)
43+
{
44+
u32 cnt = 1;
45+
u32 old = 0;
46+
47+
if (info->id.cpu)
48+
return info->id.id;
49+
50+
/* LSB is always set to ensure 0 is an invalid value */
51+
cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
52+
if (try_cmpxchg(&info->id.cnt, &old, cnt)) {
53+
/* Update the per cpu counter */
54+
__this_cpu_write(unwind_ctx_ctr, cnt);
55+
}
56+
/* Interrupts are disabled, the CPU will always be same */
57+
info->id.cpu = smp_processor_id() + 1; /* Must be non zero */
58+
59+
return info->id.id;
60+
}
61+
1562
/**
1663
* unwind_user_faultable - Produce a user stacktrace in faultable context
1764
* @trace: The descriptor that will store the user stacktrace
@@ -62,16 +109,123 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
62109
return 0;
63110
}
64111

112+
static void unwind_deferred_task_work(struct callback_head *head)
113+
{
114+
struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
115+
struct unwind_stacktrace trace;
116+
struct unwind_work *work;
117+
u64 cookie;
118+
119+
if (WARN_ON_ONCE(!info->pending))
120+
return;
121+
122+
/* Allow work to come in again */
123+
WRITE_ONCE(info->pending, 0);
124+
125+
/*
126+
* From here on out, the callback must always be called, even if it's
127+
* just an empty trace.
128+
*/
129+
trace.nr = 0;
130+
trace.entries = NULL;
131+
132+
unwind_user_faultable(&trace);
133+
134+
cookie = info->id.id;
135+
136+
guard(mutex)(&callback_mutex);
137+
list_for_each_entry(work, &callbacks, list) {
138+
work->func(work, &trace, cookie);
139+
}
140+
}
141+
142+
/**
143+
* unwind_deferred_request - Request a user stacktrace on task kernel exit
144+
* @work: Unwind descriptor requesting the trace
145+
* @cookie: The cookie of the first request made for this task
146+
*
147+
* Schedule a user space unwind to be done in task work before exiting the
148+
* kernel.
149+
*
150+
* The returned @cookie output is the generated cookie of the very first
151+
* request for a user space stacktrace for this task since it entered the
152+
* kernel. It can be from a request by any caller of this infrastructure.
153+
* Its value will also be passed to the callback function. It can be
154+
* used to stitch kernel and user stack traces together in post-processing.
155+
*
156+
* It's valid to call this function multiple times for the same @work within
157+
* the same task entry context. Each call will return the same cookie
158+
* while the task hasn't left the kernel. If the callback is not pending
159+
* because it has already been previously called for the same entry context,
160+
* it will be called again with the same stack trace and cookie.
161+
*
162+
* Return: 1 if the the callback was already queued.
163+
* 0 if the callback successfully was queued.
164+
* Negative if there's an error.
165+
* @cookie holds the cookie of the first request by any user
166+
*/
167+
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
168+
{
169+
struct unwind_task_info *info = &current->unwind_info;
170+
int ret;
171+
172+
*cookie = 0;
173+
174+
if (WARN_ON_ONCE(in_nmi()))
175+
return -EINVAL;
176+
177+
if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
178+
!user_mode(task_pt_regs(current)))
179+
return -EINVAL;
180+
181+
guard(irqsave)();
182+
183+
*cookie = get_cookie(info);
184+
185+
/* callback already pending? */
186+
if (info->pending)
187+
return 1;
188+
189+
/* The work has been claimed, now schedule it. */
190+
ret = task_work_add(current, &info->work, TWA_RESUME);
191+
if (WARN_ON_ONCE(ret))
192+
return ret;
193+
194+
info->pending = 1;
195+
return 0;
196+
}
197+
198+
void unwind_deferred_cancel(struct unwind_work *work)
199+
{
200+
if (!work)
201+
return;
202+
203+
guard(mutex)(&callback_mutex);
204+
list_del(&work->list);
205+
}
206+
207+
/**
 * unwind_deferred_init - Register a deferred unwind descriptor
 * @work: Descriptor to initialize and register
 * @func: Callback to invoke with the user stacktrace and its cookie
 *
 * Clears @work, stores @func in it and adds it to the global list of
 * callbacks under callback_mutex.
 *
 * Return: always 0.
 */
int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
	memset(work, 0, sizeof(*work));

	guard(mutex)(&callback_mutex);
	/* Fill in the callback before the descriptor is linked into the list */
	work->func = func;
	list_add(&work->list, &callbacks);
	return 0;
}
216+
65217
void unwind_task_init(struct task_struct *task)
66218
{
67219
struct unwind_task_info *info = &task->unwind_info;
68220

69221
memset(info, 0, sizeof(*info));
222+
init_task_work(&info->work, unwind_deferred_task_work);
70223
}
71224

72225
void unwind_task_free(struct task_struct *task)
73226
{
74227
struct unwind_task_info *info = &task->unwind_info;
75228

76229
kfree(info->cache);
230+
task_work_cancel(task, &info->work);
77231
}

0 commit comments

Comments
 (0)