Skip to content

Commit 849694f

Browse files
committed
Fix SMP deadlock by avoiding nested monitors/processes_table locks
https://ampcode.com/threads/T-019ba1c2-2a7f-77c7-bd33-ce9f303152a2

Fix lock ordering inversion causing deadlocks on ESP32 SMP under heavy socket load.

The functions enif_demonitor_process and destroy_resource_monitors previously acquired locks in the order: monitors -> processes_table. However, enif_monitor_process and context_get_process_info (via resource_monitor_to_resource) use the opposite order: processes_table -> monitors. This created an ABBA deadlock under SMP when these code paths ran concurrently.

The fix uses a two-phase approach in both functions:
- Phase 1: Acquire monitors lock, find/remove entries, release monitors lock
- Phase 2: Acquire process lock, send signals, release process lock

This ensures the two locks are never held simultaneously, eliminating the deadlock while preserving correct semantics. If the target process dies between phases, the demonitor signal is simply skipped (harmless no-op).

Also fixes a pre-existing bug where enif_demonitor_process failed to unlock the monitors list on the resource_type mismatch error path.

Signed-off-by: Peter M <petermm@gmail.com>
1 parent 6fd67cb commit 849694f

File tree

1 file changed

+40
-17
lines changed

1 file changed

+40
-17
lines changed

src/libAtomVM/resources.c

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -476,54 +476,77 @@ int enif_demonitor_process(ErlNifEnv *env, void *obj, const ErlNifMonitor *mon)
476476
return -1;
477477
}
478478

479+
int32_t target_process_id = INVALID_PROCESS_ID;
480+
uint64_t ref_ticks = 0;
481+
482+
// Phase 1: Find and remove from monitors list while holding monitors lock
479483
struct ListHead *monitors = synclist_wrlock(&resource_type->monitors);
480484
struct ListHead *item;
481485
LIST_FOR_EACH (item, monitors) {
482486
struct ResourceMonitor *monitor = GET_LIST_ENTRY(item, struct ResourceMonitor, resource_list_head);
483487
if (monitor->ref_ticks == mon->ref_ticks) {
484488
struct RefcBinary *resource = refc_binary_from_data(obj);
485489
if (resource->resource_type != mon->resource_type) {
490+
synclist_unlock(&resource_type->monitors);
486491
return -1;
487492
}
488493

489-
Context *target = globalcontext_get_process_lock(global, monitor->process_id);
490-
if (target) {
491-
mailbox_send_ref_signal(target, DemonitorSignal, monitor->ref_ticks);
492-
globalcontext_get_process_unlock(global, target);
493-
}
494-
494+
target_process_id = monitor->process_id;
495+
ref_ticks = monitor->ref_ticks;
495496
list_remove(&monitor->resource_list_head);
496497
free(monitor);
497-
synclist_unlock(&resource_type->monitors);
498-
return 0;
498+
break;
499499
}
500500
}
501-
502501
synclist_unlock(&resource_type->monitors);
503502

504-
return -1;
503+
if (target_process_id == INVALID_PROCESS_ID) {
504+
return -1;
505+
}
506+
507+
// Phase 2: Send demonitor signal without holding monitors lock
508+
// This avoids lock order inversion with processes_table
509+
Context *target = globalcontext_get_process_lock(global, target_process_id);
510+
if (target) {
511+
mailbox_send_ref_signal(target, DemonitorSignal, ref_ticks);
512+
globalcontext_get_process_unlock(global, target);
513+
}
514+
515+
return 0;
505516
}
506517

507518
// Tear down every monitor attached to `resource`, notifying the monitored
// processes with a DemonitorSignal.
//
// Locking discipline: the monitors list lock and the per-process lock
// (processes_table) must never be held at the same time, because other code
// paths (enif_monitor_process, context_get_process_info) take them in the
// order processes_table -> monitors. Holding them nested here in the reverse
// order would reintroduce an ABBA deadlock under SMP. Hence the two phases:
// unlink matching entries under the monitors lock first, then signal and
// free them with only the process lock held. If a target process has died
// in between, globalcontext_get_process_lock returns NULL and the signal is
// simply skipped, which is a harmless no-op.
void destroy_resource_monitors(struct RefcBinary *resource, GlobalContext *global)
{
    struct ResourceType *resource_type = resource->resource_type;

    // Phase 1: under the monitors write lock, move every monitor that
    // belongs to this resource onto a private local list.
    struct ListHead pending;
    list_init(&pending);

    struct ListHead *monitors = synclist_wrlock(&resource_type->monitors);
    struct ListHead *cur;
    struct ListHead *next;
    MUTABLE_LIST_FOR_EACH (cur, next, monitors) {
        struct ResourceMonitor *monitor = GET_LIST_ENTRY(cur, struct ResourceMonitor, resource_list_head);
        if (monitor->resource == resource) {
            list_remove(&monitor->resource_list_head);
            list_append(&pending, &monitor->resource_list_head);
        }
    }
    synclist_unlock(&resource_type->monitors);

    // Phase 2: monitors lock released; now take each process lock in turn,
    // deliver the demonitor signal, and free the entry.
    MUTABLE_LIST_FOR_EACH (cur, next, &pending) {
        struct ResourceMonitor *monitor = GET_LIST_ENTRY(cur, struct ResourceMonitor, resource_list_head);
        Context *target = globalcontext_get_process_lock(global, monitor->process_id);
        if (target) {
            mailbox_send_ref_signal(target, DemonitorSignal, monitor->ref_ticks);
            globalcontext_get_process_unlock(global, target);
        }
        list_remove(&monitor->resource_list_head);
        free(monitor);
    }
}
528551

529552
int enif_compare_monitors(const ErlNifMonitor *monitor1, const ErlNifMonitor *monitor2)

0 commit comments

Comments
 (0)