Skip to content

Commit 11c692e

Browse files
authored
add option to dump backtrace of stop-the-world straggler (#57045)
This is still a work in progress, but it should help determine what a straggler thread was doing during the stop-the-world phase and why it failed to reach a safepoint in a timely manner. We've encountered long TTSP issues in production, and this tool should provide a valuable means to accurately diagnose them.
1 parent 647b9f4 commit 11c692e

File tree

8 files changed

+93
-4
lines changed

8 files changed

+93
-4
lines changed

base/options.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ struct JLOptions
6262
trace_compile_timing::Int8
6363
trim::Int8
6464
task_metrics::Int8
65+
timeout_for_safepoint_straggler_s::Int16
6566
end
6667

6768
# This runs early in the sysimage != is not defined yet

src/jloptions.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ JL_DLLEXPORT void jl_init_options(void)
153153
0, // trace_compile_timing
154154
JL_TRIM_NO, // trim
155155
0, // task_metrics
156+
-1, // timeout_for_safepoint_straggler_s
156157
};
157158
jl_options_initialized = 1;
158159
}
@@ -311,6 +312,8 @@ static const char opts_hidden[] =
311312
" --output-asm <name> Generate an assembly file (.s)\n"
312313
" --output-incremental={yes|no*} Generate an incremental output file (rather than\n"
313314
" complete)\n"
315+
" --timeout-for-safepoint-straggler <seconds> If this value is set, then we will dump the backtrace for a thread\n"
316+
" that fails to reach a safepoint within the specified time\n"
314317
" --trace-compile={stderr|name} Print precompile statements for methods compiled\n"
315318
" during execution or save to stderr or a path. Methods that\n"
316319
" were recompiled are printed in yellow or with a trailing\n"
@@ -346,6 +349,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
346349
opt_warn_scope,
347350
opt_inline,
348351
opt_polly,
352+
opt_timeout_for_safepoint_straggler,
349353
opt_trace_compile,
350354
opt_trace_compile_timing,
351355
opt_trace_dispatch,
@@ -427,6 +431,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
427431
{ "warn-scope", required_argument, 0, opt_warn_scope },
428432
{ "inline", required_argument, 0, opt_inline },
429433
{ "polly", required_argument, 0, opt_polly },
434+
{ "timeout-for-safepoint-straggler", required_argument, 0, opt_timeout_for_safepoint_straggler },
430435
{ "trace-compile", required_argument, 0, opt_trace_compile },
431436
{ "trace-compile-timing", no_argument, 0, opt_trace_compile_timing },
432437
{ "trace-dispatch", required_argument, 0, opt_trace_dispatch },
@@ -970,6 +975,13 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
970975
else
971976
jl_errorf("julia: invalid argument to --permalloc-pkgimg={yes|no} (%s)", optarg);
972977
break;
978+
case opt_timeout_for_safepoint_straggler:
979+
errno = 0;
980+
long timeout = strtol(optarg, &endptr, 10);
981+
if (errno != 0 || optarg == endptr || timeout < 1 || timeout > INT16_MAX)
982+
jl_errorf("julia: --timeout-for-safepoint-straggler=<seconds>; seconds must be an integer between 1 and %d", INT16_MAX);
983+
jl_options.timeout_for_safepoint_straggler_s = (int16_t)timeout;
984+
break;
973985
case opt_trim:
974986
if (optarg == NULL || !strcmp(optarg,"safe"))
975987
jl_options.trim = JL_TRIM_SAFE;

src/jloptions.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ typedef struct {
6666
int8_t trace_compile_timing;
6767
int8_t trim;
6868
int8_t task_metrics;
69+
int16_t timeout_for_safepoint_straggler_s;
6970
} jl_options_t;
7071

7172
#endif

src/julia_internal.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,8 @@ typedef struct {
215215
size_t bt_size;
216216
int tid;
217217
} jl_record_backtrace_result_t;
218+
JL_DLLEXPORT JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, struct _jl_bt_element_t *bt_data,
219+
size_t max_bt_size) JL_NOTSAFEPOINT;
218220
JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, struct _jl_bt_element_t *bt_data,
219221
size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT;
220222
extern volatile struct _jl_bt_element_t *profile_bt_data_prof;

src/safepoint.c

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,33 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
149149
// Use system mutexes rather than spin locking to minimize wasted CPU time
150150
// while we wait for other threads reach a safepoint.
151151
// This is particularly important when run under rr.
152-
uv_mutex_lock(&safepoint_lock);
153-
if (!jl_atomic_load_relaxed(&ptls2->gc_state))
154-
uv_cond_wait(&safepoint_cond_begin, &safepoint_lock);
155-
uv_mutex_unlock(&safepoint_lock);
152+
if (jl_options.timeout_for_safepoint_straggler_s == -1) { // timeout was not specified: no need to dump the backtrace
153+
uv_mutex_lock(&safepoint_lock);
154+
if (!jl_atomic_load_relaxed(&ptls2->gc_state)) {
155+
uv_cond_wait(&safepoint_cond_begin, &safepoint_lock);
156+
}
157+
uv_mutex_unlock(&safepoint_lock);
158+
}
159+
else {
160+
const int64_t timeout = jl_options.timeout_for_safepoint_straggler_s * 1000000000; // convert to nanoseconds
161+
int ret = 0;
162+
uv_mutex_lock(&safepoint_lock);
163+
if (!jl_atomic_load_relaxed(&ptls2->gc_state)) {
164+
ret = uv_cond_timedwait(&safepoint_cond_begin, &safepoint_lock, timeout);
165+
}
166+
uv_mutex_unlock(&safepoint_lock);
167+
// If we woke up because of a timeout, print the backtrace of the straggler
168+
if (ret == UV_ETIMEDOUT) {
169+
jl_safe_printf("===== Thread %d failed to reach safepoint after %d seconds, printing backtrace below =====\n", ptls2->tid + 1, jl_options.timeout_for_safepoint_straggler_s);
170+
// Try to record the backtrace of the straggler using `jl_try_record_thread_backtrace`
171+
jl_ptls_t ptls = jl_current_task->ptls;
172+
size_t bt_size = jl_try_record_thread_backtrace(ptls2, ptls->bt_data, JL_MAX_BT_SIZE);
173+
// Print the backtrace of the straggler
174+
for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(ptls->bt_data + i)) {
175+
jl_print_bt_entry_codeloc(ptls->bt_data + i);
176+
}
177+
}
178+
}
156179
}
157180
}
158181
}

src/stackwalk.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1260,6 +1260,25 @@ static void suspend(void *ctx)
12601260
suspenddata->success = jl_thread_suspend_and_get_state(suspenddata->old, 1, suspenddata->c);
12611261
}
12621262

1263+
JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT
1264+
{
1265+
int16_t tid = ptls2->tid;
1266+
jl_task_t *t = NULL;
1267+
bt_context_t *context = NULL;
1268+
bt_context_t c;
1269+
suspend_t suspenddata = {tid, &c};
1270+
jl_with_stackwalk_lock(suspend, &suspenddata);
1271+
if (!suspenddata.success) {
1272+
return 0;
1273+
}
1274+
// thread is stopped, safe to read the task it was running before we stopped it
1275+
t = jl_atomic_load_relaxed(&ptls2->current_task);
1276+
context = &c;
1277+
size_t bt_size = rec_backtrace_ctx(bt_data, max_bt_size, context, ptls2->previous_task ? NULL : t->gcstack);
1278+
jl_thread_resume(tid);
1279+
return bt_size;
1280+
}
1281+
12631282
JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT
12641283
{
12651284
int16_t tid = INT16_MAX;

test/cmdlineargs.jl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1238,3 +1238,9 @@ end
12381238
@test parse(UInt64,read(`$exename --heap-size-hint=$str -E "Base.JLOptions().heap_size_hint"`, String)) == val
12391239
end
12401240
end
1241+
1242+
@testset "--timeout-for-safepoint-straggler" begin
1243+
exename = `$(Base.julia_cmd())`
1244+
timeout = 120
1245+
@test parse(Int,read(`$exename --timeout-for-safepoint-straggler=$timeout -E "Base.JLOptions().timeout_for_safepoint_straggler_s"`, String)) == timeout
1246+
end

test/threads_exec.jl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,4 +1536,29 @@ end
15361536
end
15371537
end
15381538

1539+
@testset "--timeout-for-safepoint-straggler command-line flag" begin
1540+
program = "
1541+
function main()
1542+
t = Threads.@spawn begin
1543+
ccall(:uv_sleep, Cvoid, (Cuint,), 5000)
1544+
end
1545+
# Force a GC
1546+
ccall(:uv_sleep, Cvoid, (Cuint,), 1000)
1547+
GC.gc()
1548+
wait(t)
1549+
end
1550+
main()
1551+
"
1552+
tmp_output_filename = tempname()
1553+
tmp_output_file = open(tmp_output_filename, "w")
1554+
if isnothing(tmp_output_file)
1555+
error("Failed to open file $tmp_output_filename")
1556+
end
1557+
run(pipeline(`$(Base.julia_cmd()) --threads=4 --timeout-for-safepoint-straggler=1 -e $program`, stderr=tmp_output_file))
1558+
# Check whether we printed the straggler's backtrace
1559+
@test !isempty(read(tmp_output_filename, String))
1560+
close(tmp_output_file)
1561+
rm(tmp_output_filename)
1562+
end
1563+
15391564
end # main testset

0 commit comments

Comments
 (0)