Commit be3d526
unwind deferred: Use bitmask to determine which callbacks to call
In order to know which registered callback requested a stacktrace for when the task goes back to user space, add a bitmask to keep track of all registered tracers. The bitmask is the size of long, which means that on a 32 bit machine it can have at most 32 registered tracers, and on 64 bit it can have at most 64 registered tracers. This should not be an issue as there should not be more than 10 (unless BPF can abuse this?).

When a tracer registers with unwind_deferred_init() it will get a bit number assigned to it. When a tracer requests a stacktrace, it will have its bit set within the task_struct. When the task returns back to user space, it will call the callbacks for all the registered tracers where their bits are set in the task's mask.

When a tracer is removed by unwind_deferred_cancel(), all current tasks will clear the associated bit, just in case another tracer gets registered immediately afterward and then gets its callback called unexpectedly.

To prevent live locks when an event that fires between the task_work and the task's return to user space triggers another deferred unwind, clear the unwind_mask on exit to user space rather than after the callback is made.

Move the pending bit from a value on the task_struct to bit zero of the unwind_mask (saves space on the task_struct). This allows modifying the pending bit along with the work bits atomically.

Instead of clearing a work's bit after its callback is called, the clearing is delayed until exit. If the work is requested again, the task_work is not queued again and the request is notified that the task has already been called by returning a positive number (the same as if it was already pending). The pending bit is cleared before calling the callback functions, but the current work bits remain. If one of the called works requests again, it will not trigger a task_work if its bit is still present in the task's unwind_mask. If a new work requests a deferred unwind, then it will set both the pending bit and its own bit. Note this will also cause any work that was previously queued and already had its callback executed to be executed again. Future work will remove these spurious callbacks.

The use of atomic_long bit operations was suggested by Peter Zijlstra:
Link: https://lore.kernel.org/all/[email protected]/

The unwind_mask could not be converted to atomic_long_t due to atomic_long not having all the bit operations needed by unwind_mask. Instead, this follows other use cases in the kernel and just typecasts the unwind_mask to atomic_long_t when using the two atomic_long functions.

Cc: Masami Hiramatsu <[email protected]>
Cc: Mathieu Desnoyers <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Andrii Nakryiko <[email protected]>
Cc: Indu Bhagat <[email protected]>
Cc: "Jose E. Marchesi" <[email protected]>
Cc: Beau Belgrave <[email protected]>
Cc: Jens Remus <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Florian Weimer <[email protected]>
Cc: Sam James <[email protected]>
Link: https://lore.kernel.org/[email protected]
Signed-off-by: Steven Rostedt (Google) <[email protected]>
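Editor's illustration (not part of this commit): a minimal sketch of how a tracer might use the deferred unwind API described above. The my_tracer_* names and the callback body are hypothetical; unwind_deferred_init(), unwind_deferred_request() and unwind_deferred_cancel() are the interfaces this patch modifies, and the return-value meanings follow the updated kerneldoc in the diff below.

#include <linux/unwind_deferred.h>

static struct unwind_work my_tracer_unwind;

/* Runs from task_work on return to user space, once per entry context */
static void my_tracer_callback(struct unwind_work *work,
                               struct unwind_stacktrace *trace, u64 cookie)
{
        /* Consume the user stacktrace here; correlate it by cookie */
}

static int my_tracer_start(void)
{
        /* Gets a bit in the global unwind_mask assigned, or -EBUSY if none left */
        return unwind_deferred_init(&my_tracer_unwind, my_tracer_callback);
}

/* Called from an event handler, possibly in NMI context */
static int my_tracer_event(void)
{
        u64 cookie;
        int ret;

        ret = unwind_deferred_request(&my_tracer_unwind, &cookie);
        /*
         * ret == 0: callback queued for this entry context
         * ret  > 0: already pending or already executed for this context
         * ret  < 0: error
         */
        return ret;
}

static void my_tracer_stop(void)
{
        /* Also clears this tracer's bit from every task's unwind_mask */
        unwind_deferred_cancel(&my_tracer_unwind);
}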
1 parent 055c706 commit be3d526

3 files changed: +92, -23 lines
include/linux/unwind_deferred.h

Lines changed: 23 additions & 3 deletions
@@ -13,10 +13,19 @@ typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stackt
 struct unwind_work {
 	struct list_head list;
 	unwind_callback_t func;
+	int bit;
 };

 #ifdef CONFIG_UNWIND_USER

+enum {
+	UNWIND_PENDING_BIT = 0,
+};
+
+enum {
+	UNWIND_PENDING = BIT(UNWIND_PENDING_BIT),
+};
+
 void unwind_task_init(struct task_struct *task);
 void unwind_task_free(struct task_struct *task);

@@ -28,15 +37,26 @@ void unwind_deferred_cancel(struct unwind_work *work);

 static __always_inline void unwind_reset_info(void)
 {
-	if (unlikely(current->unwind_info.id.id))
+	struct unwind_task_info *info = &current->unwind_info;
+	unsigned long bits;
+
+	/* Was there any unwinding? */
+	if (unlikely(info->unwind_mask)) {
+		bits = info->unwind_mask;
+		do {
+			/* Is a task_work going to run again before going back */
+			if (bits & UNWIND_PENDING)
+				return;
+		} while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));
 		current->unwind_info.id.id = 0;
+	}
 	/*
 	 * As unwind_user_faultable() can be called directly and
 	 * depends on nr_entries being cleared on exit to user,
 	 * this needs to be a separate conditional.
 	 */
-	if (unlikely(current->unwind_info.cache))
-		current->unwind_info.cache->nr_entries = 0;
+	if (unlikely(info->cache))
+		info->cache->nr_entries = 0;
 }

 #else /* !CONFIG_UNWIND_USER */
include/linux/unwind_deferred_types.h

Lines changed: 1 addition & 1 deletion
@@ -29,10 +29,10 @@ union unwind_task_id {
 };

 struct unwind_task_info {
+	unsigned long unwind_mask;
 	struct unwind_cache *cache;
 	struct callback_head work;
 	union unwind_task_id id;
-	int pending;
 };

 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */

kernel/unwind/deferred.c

Lines changed: 68 additions & 19 deletions
@@ -45,6 +45,16 @@ static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
 static DEFINE_MUTEX(callback_mutex);
 static LIST_HEAD(callbacks);

+#define RESERVED_BITS	(UNWIND_PENDING)
+
+/* Zero'd bits are available for assigning callback users */
+static unsigned long unwind_mask = RESERVED_BITS;
+
+static inline bool unwind_pending(struct unwind_task_info *info)
+{
+	return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
+}
+
 /*
  * This is a unique percpu identifier for a given task entry context.
  * Conceptually, it's incremented every time the CPU enters the kernel from
@@ -138,14 +148,15 @@ static void unwind_deferred_task_work(struct callback_head *head)
 	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
 	struct unwind_stacktrace trace;
 	struct unwind_work *work;
+	unsigned long bits;
 	u64 cookie;

-	if (WARN_ON_ONCE(!info->pending))
+	if (WARN_ON_ONCE(!unwind_pending(info)))
 		return;

-	/* Allow work to come in again */
-	WRITE_ONCE(info->pending, 0);
-
+	/* Clear pending bit but make sure to have the current bits */
+	bits = atomic_long_fetch_andnot(UNWIND_PENDING,
+					(atomic_long_t *)&info->unwind_mask);
 	/*
 	 * From here on out, the callback must always be called, even if it's
 	 * just an empty trace.
@@ -159,7 +170,8 @@ static void unwind_deferred_task_work(struct callback_head *head)

 	guard(mutex)(&callback_mutex);
 	list_for_each_entry(work, &callbacks, list) {
-		work->func(work, &trace, cookie);
+		if (test_bit(work->bit, &bits))
+			work->func(work, &trace, cookie);
 	}
 }

@@ -183,15 +195,16 @@
  * because it has already been previously called for the same entry context,
  * it will be called again with the same stack trace and cookie.
  *
- * Return: 1 if the the callback was already queued.
- *         0 if the callback successfully was queued.
+ * Return: 0 if the callback successfully was queued.
+ *         1 if the callback is pending or was already executed.
  *         Negative if there's an error.
  *         @cookie holds the cookie of the first request by any user
  */
 int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 {
 	struct unwind_task_info *info = &current->unwind_info;
-	long pending;
+	unsigned long old, bits;
+	unsigned long bit = BIT(work->bit);
 	int ret;

 	*cookie = 0;
@@ -212,39 +225,74 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)

 	*cookie = get_cookie(info);

-	/* callback already pending? */
-	pending = READ_ONCE(info->pending);
-	if (pending)
-		return 1;
+	old = READ_ONCE(info->unwind_mask);

-	/* Claim the work unless an NMI just now swooped in to do so. */
-	if (!try_cmpxchg(&info->pending, &pending, 1))
+	/* Is this already queued or executed */
+	if (old & bit)
 		return 1;

+	/*
+	 * This work's bit hasn't been set yet. Now set it with the PENDING
+	 * bit and fetch the current value of unwind_mask. If either the
+	 * work's bit or PENDING was already set, then this is already queued
+	 * to have a callback.
+	 */
+	bits = UNWIND_PENDING | bit;
+	old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+	if (old & bits) {
+		/*
+		 * If the work's bit was set, whatever set it had better
+		 * have also set pending and queued a callback.
+		 */
+		WARN_ON_ONCE(!(old & UNWIND_PENDING));
+		return old & bit;
+	}
+
 	/* The work has been claimed, now schedule it. */
 	ret = task_work_add(current, &info->work, TWA_RESUME);
-	if (WARN_ON_ONCE(ret)) {
-		WRITE_ONCE(info->pending, 0);
-		return ret;
-	}

-	return 0;
+	if (WARN_ON_ONCE(ret))
+		WRITE_ONCE(info->unwind_mask, 0);
+
+	return ret;
 }

 void unwind_deferred_cancel(struct unwind_work *work)
 {
+	struct task_struct *g, *t;
+
 	if (!work)
 		return;

+	/* No work should be using a reserved bit */
+	if (WARN_ON_ONCE(BIT(work->bit) & RESERVED_BITS))
+		return;
+
 	guard(mutex)(&callback_mutex);
 	list_del(&work->list);
+
+	__clear_bit(work->bit, &unwind_mask);
+
+	guard(rcu)();
+	/* Clear this bit from all threads */
+	for_each_process_thread(g, t) {
+		clear_bit(work->bit, &t->unwind_info.unwind_mask);
+	}
 }

 int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
 {
 	memset(work, 0, sizeof(*work));

 	guard(mutex)(&callback_mutex);
+
+	/* See if there's a bit in the mask available */
+	if (unwind_mask == ~0UL)
+		return -EBUSY;
+
+	work->bit = ffz(unwind_mask);
+	__set_bit(work->bit, &unwind_mask);
+
 	list_add(&work->list, &callbacks);
 	work->func = func;
 	return 0;
@@ -256,6 +304,7 @@ void unwind_task_init(struct task_struct *task)

 	memset(info, 0, sizeof(*info));
 	init_task_work(&info->work, unwind_deferred_task_work);
+	info->unwind_mask = 0;
 }

 void unwind_task_free(struct task_struct *task)
