Skip to content

Commit 3a50d6f

Browse files
authored
signals: acquire loader lock during stackwalk on Win32 (#59840)
Lock the Windows loader lock in jl_with_stackwalk_lock to prevent certain deadlocks when unwinding the stack, similar to how macOS acquires the keymgr lock in jl_lock_profile_mach. The loader lock prevents the loader from modifying internal data structures while we're walking the stack, which could otherwise cause crashes or hangs during backtrace collection. Fixes some of #59650 🤖 Generated with Claude Code
1 parent 20c787d commit 3a50d6f

File tree

5 files changed

+45
-65
lines changed

5 files changed

+45
-65
lines changed

src/julia_internal.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,6 @@ JL_DLLEXPORT int jl_lock_profile(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
240240
JL_DLLEXPORT void jl_unlock_profile(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
241241
JL_DLLEXPORT int jl_lock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
242242
JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
243-
void jl_with_stackwalk_lock(void (*f)(void*) JL_NOTSAFEPOINT, void *ctx) JL_NOTSAFEPOINT;
244243

245244
arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT;
246245
typedef struct {
@@ -1533,8 +1532,8 @@ void jl_fprint_bt_entry_codeloc(ios_t *s, jl_bt_element_t *bt_data) JL_NOTSAFEPO
15331532
#ifdef _OS_WINDOWS_
15341533
JL_DLLEXPORT void jl_refresh_dbg_module_list(void);
15351534
#endif
1536-
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) JL_NOTSAFEPOINT;
15371535
void jl_thread_resume(int tid) JL_NOTSAFEPOINT;
1536+
int jl_thread_suspend(int16_t tid, bt_context_t *ctx) JL_NOTSAFEPOINT;
15381537

15391538
// *to is NULL or malloc'd pointer, from is allowed to be NULL
15401539
STATIC_INLINE char *jl_copy_str(char **to, const char *from) JL_NOTSAFEPOINT

src/signals-mach.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) J
527527
return 1;
528528
}
529529

530-
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
530+
static int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
531531
{
532532
(void)timeout;
533533
host_thread_state_t state;
@@ -712,6 +712,14 @@ static void jl_unlock_profile_mach(int dlsymlock, int keymgr_locked)
712712
jl_unlock_profile();
713713
}
714714

715+
int jl_thread_suspend(int16_t tid, bt_context_t *ctx)
716+
{
717+
int lockret = jl_lock_profile_mach(1);
718+
int success = jl_thread_suspend_and_get_state(tid, 1, ctx);
719+
jl_unlock_profile_mach(1, lockret);
720+
return success;
721+
}
722+
715723
void jl_with_stackwalk_lock(void (*f)(void*), void *ctx)
716724
{
717725
int lockret = jl_lock_profile_mach(1);

src/signals-unix.c

Lines changed: 19 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -303,41 +303,34 @@ int exc_reg_is_write_fault(uintptr_t esr) {
303303
}
304304
#endif
305305

306+
static int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx);
307+
306308
#if defined(HAVE_MACH)
307309
#include "signals-mach.c"
308310
#else
309311
#include <poll.h>
310312
#include <sys/eventfd.h>
311313
#include <link.h>
312314

313-
#ifndef _OS_FREEBSD_
314315
typedef struct {
315-
void (*f)(void*) JL_NOTSAFEPOINT;
316-
void *ctx;
317-
} callback_t;
316+
int16_t tid;
317+
bt_context_t *ctx;
318+
int success;
319+
} callback_data_t;
318320
static int with_dl_iterate_phdr_lock(struct dl_phdr_info *info, size_t size, void *data)
319321
{
320322
jl_lock_profile();
321-
callback_t *callback = (callback_t*)data;
322-
callback->f(callback->ctx);
323+
callback_data_t *cb_data = (callback_data_t*)data;
324+
cb_data->success = jl_thread_suspend_and_get_state(cb_data->tid, 1, cb_data->ctx);
323325
jl_unlock_profile();
324326
return 1; // only call this once
325327
}
326-
#endif
327328

328-
void jl_with_stackwalk_lock(void (*f)(void*), void *ctx)
329+
int jl_thread_suspend(int16_t tid, bt_context_t *ctx)
329330
{
330-
#ifndef _OS_FREEBSD_
331-
callback_t callback = {f, ctx};
332-
dl_iterate_phdr(with_dl_iterate_phdr_lock, &callback);
333-
#else
334-
// FreeBSD makes the questionable decisions to use a terrible implementation of a spin
335-
// lock and to block all signals while a lock is held. However, that also means it is
336-
// not currently vulnerable to this libunwind bug that other platforms can encounter.
337-
jl_lock_profile();
338-
f(ctx);
339-
jl_unlock_profile();
340-
#endif
331+
callback_data_t cb_data = {tid, ctx, 0};
332+
dl_iterate_phdr(with_dl_iterate_phdr_lock, &cb_data);
333+
return cb_data.success;
341334
}
342335

343336
#if defined(_OS_LINUX_) && (defined(_CPU_X86_64_) || defined(_CPU_X86_))
@@ -458,7 +451,7 @@ static int exit_signal_cond = -1;
458451
static int signal_caught_cond = -1;
459452
static int signals_inflight = 0;
460453

461-
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
454+
static int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
462455
{
463456
int err;
464457
pthread_mutex_lock(&in_signal_lock);
@@ -845,15 +838,15 @@ void trigger_profile_peek(void)
845838

846839
static jl_bt_element_t signal_bt_data[JL_MAX_BT_SIZE + 1];
847840
static size_t signal_bt_size = 0;
848-
static void do_critical_profile(void *ctx)
841+
static void do_critical_profile(void)
849842
{
850843
bt_context_t signal_context;
851844
// sample each thread, round-robin style in reverse order
852845
// (so that thread zero gets notified last)
853846
int nthreads = jl_atomic_load_acquire(&jl_n_threads);
854847
for (int i = nthreads; i-- > 0; ) {
855848
// notify thread to stop
856-
if (!jl_thread_suspend_and_get_state(i, 1, &signal_context))
849+
if (!jl_thread_suspend(i, &signal_context))
857850
continue;
858851

859852
// do backtrace on thread contexts for critical signals
@@ -866,7 +859,7 @@ static void do_critical_profile(void *ctx)
866859
}
867860
}
868861

869-
static void do_profile(void *ctx)
862+
static void do_profile(void)
870863
{
871864
bt_context_t signal_context;
872865
int nthreads = jl_atomic_load_acquire(&jl_n_threads);
@@ -883,7 +876,7 @@ static void do_profile(void *ctx)
883876
return;
884877
}
885878
// notify thread to stop
886-
if (!jl_thread_suspend_and_get_state(tid, 1, &signal_context))
879+
if (!jl_thread_suspend(tid, &signal_context))
887880
return;
888881
// unwinding can fail, so keep track of the current state
889882
// and restore from the SEGV handler if anything happens.
@@ -1062,15 +1055,15 @@ static void *signal_listener(void *arg)
10621055
signal_bt_size = 0;
10631056
#if !defined(JL_DISABLE_LIBUNWIND)
10641057
if (critical) {
1065-
jl_with_stackwalk_lock(do_critical_profile, NULL);
1058+
do_critical_profile();
10661059
}
10671060
else if (profile) {
10681061
if (profile_all_tasks) {
10691062
// Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace`
10701063
jl_profile_task();
10711064
}
10721065
else {
1073-
jl_with_stackwalk_lock(do_profile, NULL);
1066+
do_profile();
10741067
}
10751068
}
10761069
#ifndef HAVE_MACH

src/signals-win.c

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
// Note that this file is `#include`d by "signal-handling.c"
55
#include <mmsystem.h> // hidden by LEAN_AND_MEAN
66

7+
// Loader lock functions from ntdll
8+
// See https://devblogs.microsoft.com/oldnewthing/20140808-00/?p=293
9+
extern NTSTATUS NTAPI LdrLockLoaderLock(ULONG Flags, ULONG *State, ULONG_PTR *Cookie);
10+
extern NTSTATUS NTAPI LdrUnlockLoaderLock(ULONG Flags, ULONG_PTR Cookie);
11+
712
static const size_t sig_stack_size = 131072; // 128k reserved for backtrace_fiber for stack overflow handling
813

914
// Copied from MINGW_FLOAT_H which may not be found due to a collision with the builtin gcc float.h
@@ -439,26 +444,19 @@ void jl_thread_resume(int tid)
439444
}
440445
}
441446

442-
void jl_lock_stackwalk(void)
447+
int jl_thread_suspend(int16_t tid, bt_context_t *ctx)
443448
{
444449
uv_mutex_lock(&jl_in_stackwalk);
445450
jl_lock_profile();
446-
}
447-
448-
void jl_unlock_stackwalk(void)
449-
{
451+
ULONG_PTR lock_cookie = 0;
452+
LdrLockLoaderLock(0x1, NULL, &lock_cookie);
453+
int success = jl_thread_suspend_and_get_state(tid, 0, ctx);
454+
LdrUnlockLoaderLock(0x1, lock_cookie);
450455
jl_unlock_profile();
451456
uv_mutex_unlock(&jl_in_stackwalk);
457+
return success;
452458
}
453459

454-
void jl_with_stackwalk_lock(void (*f)(void*), void *ctx)
455-
{
456-
jl_lock_stackwalk();
457-
f(ctx);
458-
jl_unlock_stackwalk();
459-
}
460-
461-
462460
static DWORD WINAPI profile_bt( LPVOID lparam )
463461
{
464462
// Note: illegal to use jl_* functions from this thread except for profiling-specific functions
@@ -477,17 +475,15 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
477475
}
478476
else {
479477
// TODO: bring this up to parity with other OS by adding loop over tid here
480-
jl_lock_stackwalk();
481-
CONTEXT ctxThread;
482-
if (!jl_thread_suspend_and_get_state(0, 0, &ctxThread)) {
483-
jl_unlock_stackwalk();
478+
bt_context_t c;
479+
if (!jl_thread_suspend(0, &c)) {
484480
fputs("failed to suspend main thread. aborting profiling.", stderr);
485481
jl_profile_stop_timer();
486482
break;
487483
}
488484
// Get backtrace data
489485
profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur,
490-
profile_bt_size_max - profile_bt_size_cur - 1, &ctxThread, NULL);
486+
profile_bt_size_max - profile_bt_size_cur - 1, &c, NULL);
491487

492488
jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread
493489

@@ -507,7 +503,6 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
507503
// Mark the end of this block with two 0's
508504
profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
509505
profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
510-
jl_unlock_stackwalk();
511506
jl_thread_resume(0);
512507
jl_check_profile_autostop();
513508
}

src/stackwalk.c

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1257,26 +1257,13 @@ return 0;
12571257
#endif
12581258
}
12591259

1260-
typedef struct {
1261-
int16_t old;
1262-
bt_context_t *c;
1263-
int success;
1264-
} suspend_t;
1265-
static void suspend(void *ctx)
1266-
{
1267-
suspend_t *suspenddata = (suspend_t*)ctx;
1268-
suspenddata->success = jl_thread_suspend_and_get_state(suspenddata->old, 1, suspenddata->c);
1269-
}
1270-
12711260
JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT
12721261
{
12731262
int16_t tid = ptls2->tid;
12741263
jl_task_t *t = NULL;
12751264
bt_context_t *context = NULL;
12761265
bt_context_t c;
1277-
suspend_t suspenddata = {tid, &c};
1278-
jl_with_stackwalk_lock(suspend, &suspenddata);
1279-
if (!suspenddata.success) {
1266+
if (!jl_thread_suspend(tid, &c)) {
12801267
return 0;
12811268
}
12821269
// thread is stopped, safe to read the task it was running before we stopped it
@@ -1309,9 +1296,7 @@ JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_b
13091296
int16_t old;
13101297
for (old = -1; !jl_atomic_cmpswap(&t->tid, &old, tid) && old != tid; old = -1) {
13111298
// if this task is already running somewhere, we need to stop the thread it is running on and query its state
1312-
suspend_t suspenddata = {old, &c};
1313-
jl_with_stackwalk_lock(suspend, &suspenddata);
1314-
if (!suspenddata.success) {
1299+
if (!jl_thread_suspend(old, &c)) {
13151300
if (jl_atomic_load_relaxed(&t->tid) != old)
13161301
continue;
13171302
return result;

0 commit comments

Comments
 (0)