Skip to content

Commit 6e11ffc

Browse files
authored
make threaded exit and cache file loading safer by pausing other threads (JuliaLang#55105)
1 parent 0fd1f04 commit 6e11ffc

File tree

8 files changed

+63
-15
lines changed

8 files changed

+63
-15
lines changed

src/init.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -334,15 +334,15 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) JL_NOTSAFEPOINT_ENTER
334334
// we would like to guarantee this, but cannot currently, so there is still a small race window
335335
// that needs to be fixed in libuv
336336
}
337-
if (ct)
338-
(void)jl_gc_safe_enter(ct->ptls); // park in gc-safe
339337
if (loop != NULL) {
340338
// TODO: consider uv_loop_close(loop) here, before shutdown?
341339
uv_library_shutdown();
342340
// no JL_UV_UNLOCK(), since it is now torn down
343341
}
344-
345-
// TODO: Destroy threads?
342+
if (ct)
343+
jl_safepoint_suspend_all_threads(ct); // Destroy other threads, so that they don't segfault
344+
if (ct)
345+
(void)jl_gc_safe_enter(ct->ptls); // park in gc-safe
346346

347347
jl_destroy_timing(); // cleans up the current timing_stack for noreturn
348348
#ifdef USE_TIMING_COUNTS

src/julia.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,6 +1107,8 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_
11071107
JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz);
11081108
JL_DLLEXPORT void jl_gc_safepoint(void);
11091109
JL_DLLEXPORT int jl_safepoint_suspend_thread(int tid, int waitstate);
1110+
JL_DLLEXPORT void jl_safepoint_suspend_all_threads(struct _jl_task_t *ct);
1111+
JL_DLLEXPORT void jl_safepoint_resume_all_threads(struct _jl_task_t *ct);
11101112
JL_DLLEXPORT int jl_safepoint_resume_thread(int tid) JL_NOTSAFEPOINT;
11111113

11121114
void *mtarraylist_get(small_arraylist_t *_a, size_t idx) JL_NOTSAFEPOINT;

src/julia_internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -959,6 +959,7 @@ extern JL_DLLEXPORT ssize_t jl_tls_offset;
959959
extern JL_DLLEXPORT const int jl_tls_elf_support;
960960
void jl_init_threading(void);
961961
void jl_start_threads(void);
962+
extern uv_mutex_t safepoint_lock;
962963

963964
// Whether the GC is running
964965
extern char *jl_safepoint_pages;

src/safepoint.c

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -267,12 +267,12 @@ void jl_safepoint_wait_thread_resume(void)
267267
uv_cond_broadcast(&safepoint_cond_begin);
268268
uv_mutex_unlock(&safepoint_lock);
269269
uv_mutex_lock(&ct->ptls->sleep_lock);
270+
while (jl_atomic_load_relaxed(&ct->ptls->suspend_count))
271+
uv_cond_wait(&ct->ptls->wake_signal, &ct->ptls->sleep_lock);
270272
}
271-
while (jl_atomic_load_relaxed(&ct->ptls->suspend_count))
272-
uv_cond_wait(&ct->ptls->wake_signal, &ct->ptls->sleep_lock);
273-
// must while still holding the mutex_unlock, so we know other threads in
274-
// jl_safepoint_suspend_thread will observe this thread in the correct GC
275-
// state, and not still stuck in JL_GC_STATE_WAITING
273+
// must exit gc while still holding the mutex_unlock, so we know other
274+
// threads in jl_safepoint_suspend_thread will observe this thread in the
275+
// correct GC state, and not still stuck in JL_GC_STATE_WAITING
276276
jl_atomic_store_release(&ct->ptls->gc_state, state);
277277
uv_mutex_unlock(&ct->ptls->sleep_lock);
278278
}
@@ -290,12 +290,20 @@ int jl_safepoint_suspend_thread(int tid, int waitstate)
290290
if (0 > tid || tid >= jl_atomic_load_acquire(&jl_n_threads))
291291
return 0;
292292
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
293+
jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
294+
if (ct2 == NULL) {
295+
// this thread is not alive yet or already dead
296+
return 0;
297+
}
298+
uv_mutex_lock(&safepoint_lock);
293299
uv_mutex_lock(&ptls2->sleep_lock);
294300
int16_t suspend_count = jl_atomic_load_relaxed(&ptls2->suspend_count) + 1;
295301
jl_atomic_store_relaxed(&ptls2->suspend_count, suspend_count);
296302
if (suspend_count == 1) { // first to suspend
297303
jl_safepoint_enable(3);
298304
jl_atomic_store_relaxed(&ptls2->safepoint, (size_t*)(jl_safepoint_pages + jl_page_size * 3 + sizeof(void*)));
305+
if (jl_atomic_load(&_threadedregion) != 0 || tid == jl_atomic_load_relaxed(&io_loop_tid))
306+
jl_wake_libuv(); // our integration with libuv right now doesn't handle except by waking it
299307
}
300308
uv_mutex_unlock(&ptls2->sleep_lock);
301309
if (waitstate) {
@@ -305,17 +313,20 @@ int jl_safepoint_suspend_thread(int tid, int waitstate)
305313
// not, so assume it is running GC and wait for GC to finish first.
306314
// It will be unable to reenter helping with GC because we have
307315
// changed its safepoint page.
316+
uv_mutex_unlock(&safepoint_lock);
308317
jl_set_gc_and_wait();
318+
uv_mutex_lock(&safepoint_lock);
309319
}
310320
while (jl_atomic_load_acquire(&ptls2->suspend_count) != 0) {
311321
int8_t state2 = jl_atomic_load_acquire(&ptls2->gc_state);
312322
if (waitstate <= 2 && state2 != JL_GC_STATE_UNSAFE)
313323
break;
314324
if (waitstate == 3 && state2 == JL_GC_STATE_WAITING)
315325
break;
316-
jl_cpu_pause(); // yield (wait for safepoint_cond_begin, for example)?
326+
uv_cond_wait(&safepoint_cond_begin, &safepoint_lock);
317327
}
318328
}
329+
uv_mutex_unlock(&safepoint_lock);
319330
return suspend_count;
320331
}
321332

@@ -326,6 +337,11 @@ int jl_safepoint_resume_thread(int tid) JL_NOTSAFEPOINT
326337
if (0 > tid || tid >= jl_atomic_load_acquire(&jl_n_threads))
327338
return 0;
328339
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
340+
jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
341+
if (ct2 == NULL) {
342+
// this thread is not alive yet or already dead
343+
return 0;
344+
}
329345
uv_mutex_lock(&safepoint_lock);
330346
uv_mutex_lock(&ptls2->sleep_lock);
331347
int16_t suspend_count = jl_atomic_load_relaxed(&ptls2->suspend_count);
@@ -338,6 +354,7 @@ int jl_safepoint_resume_thread(int tid) JL_NOTSAFEPOINT
338354
#ifdef _OS_DARWIN_
339355
jl_safepoint_resume_thread_mach(ptls2, tid);
340356
#endif
357+
uv_cond_broadcast(&safepoint_cond_begin);
341358
}
342359
if (suspend_count != 0) {
343360
jl_atomic_store_relaxed(&ptls2->suspend_count, suspend_count - 1);

src/signals-mach.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ static void attach_exception_port(thread_port_t thread, int segv_only);
4545

4646
// low 16 bits are the thread id, the next 8 bits are the original gc_state
4747
static arraylist_t suspended_threads;
48-
extern uv_mutex_t safepoint_lock;
4948
extern uv_cond_t safepoint_cond_begin;
5049

5150
#define GC_STATE_SHIFT 8*sizeof(int16_t)

src/staticdata.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3132,6 +3132,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl
31323132
jl_array_t **ext_targets, jl_array_t **edges,
31333133
char **base, arraylist_t *ccallable_list, pkgcachesizes *cachesizes) JL_GC_DISABLED
31343134
{
3135+
jl_task_t *ct = jl_current_task;
31353136
int en = jl_gc_enable(0);
31363137
ios_t sysimg, const_data, symbols, relocs, gvar_record, fptr_record;
31373138
jl_serializer_state s = {0};
@@ -3143,7 +3144,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl
31433144
s.relocs = &relocs;
31443145
s.gvar_record = &gvar_record;
31453146
s.fptr_record = &fptr_record;
3146-
s.ptls = jl_current_task->ptls;
3147+
s.ptls = ct->ptls;
31473148
jl_value_t **const*const tags = get_tags();
31483149
htable_t new_dt_objs;
31493150
htable_new(&new_dt_objs, 0);
@@ -3316,6 +3317,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl
33163317
arraylist_new(&cleanup_list, 0);
33173318
arraylist_t delay_list;
33183319
arraylist_new(&delay_list, 0);
3320+
JL_LOCK(&typecache_lock); // Might GC--prevent other threads from changing any type caches while we inspect them all
33193321
for (size_t i = 0; i < s.uniquing_types.len; i++) {
33203322
uintptr_t item = (uintptr_t)s.uniquing_types.items[i];
33213323
// check whether we are operating on the typetag
@@ -3443,6 +3445,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl
34433445
}
34443446
arraylist_grow(&cleanup_list, -cleanup_list.len);
34453447
// finally cache all our new types now
3448+
jl_safepoint_suspend_all_threads(ct); // past this point, it is now not safe to observe the intermediate states on other threads via reflection, so temporarily pause those
34463449
for (size_t i = 0; i < new_dt_objs.size; i += 2) {
34473450
void *dt = table[i + 1];
34483451
if (dt != HT_NOTFOUND) {
@@ -3456,6 +3459,8 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl
34563459
assert(jl_is_datatype(obj));
34573460
jl_cache_type_((jl_datatype_t*)obj);
34583461
}
3462+
JL_UNLOCK(&typecache_lock); // Might GC
3463+
jl_safepoint_resume_all_threads(ct); // TODO: move this later to also protect MethodInstance allocations, but we would need to acquire all jl_specializations_get_linfo and jl_module_globalref locks, which is hard
34593464
// Perform fixups: things like updating world ages, inserting methods & specializations, etc.
34603465
for (size_t i = 0; i < s.uniquing_objs.len; i++) {
34613466
uintptr_t item = (uintptr_t)s.uniquing_objs.items[i];

src/threading.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,30 @@ JL_DLLEXPORT jl_gcframe_t **jl_adopt_thread(void)
432432
return &ct->gcstack;
433433
}
434434

435+
436+
void jl_safepoint_suspend_all_threads(jl_task_t *ct)
437+
{
438+
// TODO: prevent jl_n_threads changing or jl_safepoint_resume_thread calls on another thread
439+
//uv_mutex_lock(&tls_lock);
440+
//disallow_resume = ct->tid;
441+
//uv_mutex_unlock(&tls_lock);
442+
for (int16_t tid = 0; tid < jl_atomic_load_relaxed(&jl_n_threads); tid++) {
443+
if (tid != jl_atomic_load_relaxed(&ct->tid))
444+
jl_safepoint_suspend_thread(tid, 1);
445+
};
446+
}
447+
448+
void jl_safepoint_resume_all_threads(jl_task_t *ct)
449+
{
450+
//uv_mutex_lock(&tls_lock);
451+
//if (disallow_resume != ct->tid) return;
452+
//uv_mutex_unlock(&tls_lock);
453+
for (int16_t tid = 0; tid < jl_atomic_load_relaxed(&jl_n_threads); tid++) {
454+
if (tid != jl_atomic_load_relaxed(&ct->tid))
455+
jl_safepoint_resume_thread(tid);
456+
};
457+
}
458+
435459
void jl_task_frame_noreturn(jl_task_t *ct) JL_NOTSAFEPOINT;
436460
void scheduler_delete_thread(jl_ptls_t ptls) JL_NOTSAFEPOINT;
437461

test/atexit.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ using Test
220220
# Block until the atexit hooks have all finished. We use a manual "spin
221221
# lock" because task switch is disallowed inside the finalizer, below.
222222
atexit_has_finished[] = 1
223-
while atexit_has_finished[] == 1 end
223+
while atexit_has_finished[] == 1; GC.safepoint(); end
224224
try
225225
# By the time this runs, all the atexit hooks will be done.
226226
# So this will throw.
@@ -232,7 +232,7 @@ using Test
232232
exit(22)
233233
end
234234
end
235-
while atexit_has_finished[] == 0 end
235+
while atexit_has_finished[] == 0; GC.safepoint(); end
236236
end
237237
# Finalizers run after the atexit hooks, so this blocks exit until the spawned
238238
# task above gets a chance to run.
@@ -241,7 +241,7 @@ using Test
241241
# Allow the spawned task to finish
242242
atexit_has_finished[] = 2
243243
# Then spin forever to prevent exit.
244-
while atexit_has_finished[] == 2 end
244+
while atexit_has_finished[] == 2; GC.safepoint(); end
245245
end
246246
exit(0)
247247
""" => 22,

0 commit comments

Comments
 (0)