From 6fd67cb090b5aba2fdd7dd7b6137910e374235e3 Mon Sep 17 00:00:00 2001 From: Peter M Date: Fri, 9 Jan 2026 09:57:59 +0100 Subject: [PATCH 1/2] Fix SMP deadlock: consistent lock ordering in enif_monitor_process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://ampcode.com/threads/T-019ba1c2-2a7f-77c7-bd33-ce9f303152a2 ## Summary Fix a lock ordering inversion that causes deadlocks under SMP on ESP32 (and potentially other platforms) when sockets are used under heavy load. ## Problem `enif_monitor_process` and `enif_demonitor_process` acquire locks in opposite orders: | Function | Lock Order | |----------|------------| | `enif_monitor_process` | `processes_table` → `monitors` | | `enif_demonitor_process` | `monitors` → `processes_table` | | `destroy_resource_monitors` | `monitors` → `processes_table` | This creates an ABBA deadlock when two threads call these functions concurrently—one holds `processes_table` waiting for `monitors`, while the other holds `monitors` waiting for `processes_table`. The issue is triggered by `otp_socket.c` which calls both monitor/demonitor from NIFs, the select thread, and monitor callbacks under load. With `AVM_NO_SMP`, `synclist_wrlock` is a no-op so no deadlock occurs, which explains why disabling SMP works around the issue. ## Fix Change `enif_monitor_process` to acquire locks in the same order as the other functions: `monitors` → `processes_table`. 
## Testing - Tested on ESP32 with SMP enabled under heavy socket load - Verified no regression on non-SMP builds Signed-off-by: Peter M --- src/libAtomVM/resources.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/libAtomVM/resources.c b/src/libAtomVM/resources.c index 1b0c537fc..35194ab7e 100644 --- a/src/libAtomVM/resources.c +++ b/src/libAtomVM/resources.c @@ -397,14 +397,17 @@ int enif_monitor_process(ErlNifEnv *env, void *obj, const ErlNifPid *target_pid, return -1; } + struct ListHead *monitors_head = synclist_wrlock(&resource_type->monitors); Context *target = globalcontext_get_process_lock(env->global, *target_pid); if (IS_NULL_PTR(target)) { + synclist_unlock(&resource_type->monitors); free(resource_monitor); free(monitor); return 1; } - synclist_append(&resource_type->monitors, &resource_monitor->resource_list_head); + list_append(monitors_head, &resource_monitor->resource_list_head); + synclist_unlock(&resource_type->monitors); mailbox_send_monitor_signal(target, MonitorSignal, monitor); globalcontext_get_process_unlock(env->global, target); From 849694fe73428835f46f477b2d55ec0d63393b32 Mon Sep 17 00:00:00 2001 From: Peter M Date: Mon, 12 Jan 2026 09:00:11 +0100 Subject: [PATCH 2/2] Fix SMP deadlock by avoiding nested monitors/processes_table locks https://ampcode.com/threads/T-019ba1c2-2a7f-77c7-bd33-ce9f303152a2 Fix lock ordering inversion causing deadlocks on ESP32 SMP under heavy socket load. The functions enif_demonitor_process and destroy_resource_monitors previously acquired locks in the order: monitors -> processes_table. However, enif_monitor_process (prior to the previous commit in this series) and context_get_process_info (via resource_monitor_to_resource) used the opposite order: processes_table -> monitors. This created an ABBA deadlock under SMP when these code paths ran concurrently. 
The fix uses a two-phase approach in both functions: - Phase 1: Acquire monitors lock, find/remove entries, release monitors lock - Phase 2: Acquire process lock, send signals, release process lock This ensures the two locks are never held simultaneously, eliminating the deadlock while preserving correct semantics. If the target process dies between phases, the demonitor signal is simply skipped (harmless no-op). Also fixes a pre-existing bug where enif_demonitor_process failed to unlock the monitors list on the resource_type mismatch error path. Signed-off-by: Peter M --- src/libAtomVM/resources.c | 57 +++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/src/libAtomVM/resources.c b/src/libAtomVM/resources.c index 35194ab7e..0dd1b2df2 100644 --- a/src/libAtomVM/resources.c +++ b/src/libAtomVM/resources.c @@ -476,6 +476,10 @@ int enif_demonitor_process(ErlNifEnv *env, void *obj, const ErlNifMonitor *mon) return -1; } + int32_t target_process_id = INVALID_PROCESS_ID; + uint64_t ref_ticks = 0; + + // Phase 1: Find and remove from monitors list while holding monitors lock struct ListHead *monitors = synclist_wrlock(&resource_type->monitors); struct ListHead *item; LIST_FOR_EACH (item, monitors) { @@ -483,47 +487,66 @@ int enif_demonitor_process(ErlNifEnv *env, void *obj, const ErlNifMonitor *mon) if (monitor->ref_ticks == mon->ref_ticks) { struct RefcBinary *resource = refc_binary_from_data(obj); if (resource->resource_type != mon->resource_type) { + synclist_unlock(&resource_type->monitors); return -1; } - Context *target = globalcontext_get_process_lock(global, monitor->process_id); - if (target) { - mailbox_send_ref_signal(target, DemonitorSignal, monitor->ref_ticks); - globalcontext_get_process_unlock(global, target); - } - + target_process_id = monitor->process_id; + ref_ticks = monitor->ref_ticks; list_remove(&monitor->resource_list_head); free(monitor); - synclist_unlock(&resource_type->monitors); - return 0; + 
break; } } - synclist_unlock(&resource_type->monitors); - return -1; + if (target_process_id == INVALID_PROCESS_ID) { + return -1; + } + + // Phase 2: Send demonitor signal without holding monitors lock + // This avoids lock order inversion with processes_table + Context *target = globalcontext_get_process_lock(global, target_process_id); + if (target) { + mailbox_send_ref_signal(target, DemonitorSignal, ref_ticks); + globalcontext_get_process_unlock(global, target); + } + + return 0; } void destroy_resource_monitors(struct RefcBinary *resource, GlobalContext *global) { struct ResourceType *resource_type = resource->resource_type; + + // Phase 1: Collect monitors to destroy while holding monitors lock + struct ListHead to_signal; + list_init(&to_signal); + struct ListHead *monitors = synclist_wrlock(&resource_type->monitors); struct ListHead *item; struct ListHead *tmp; MUTABLE_LIST_FOR_EACH (item, tmp, monitors) { struct ResourceMonitor *monitor = GET_LIST_ENTRY(item, struct ResourceMonitor, resource_list_head); if (monitor->resource == resource) { - Context *target = globalcontext_get_process_lock(global, monitor->process_id); - if (target) { - mailbox_send_ref_signal(target, DemonitorSignal, monitor->ref_ticks); - globalcontext_get_process_unlock(global, target); - } list_remove(&monitor->resource_list_head); - free(monitor); + list_append(&to_signal, &monitor->resource_list_head); } } - synclist_unlock(&resource_type->monitors); + + // Phase 2: Send demonitor signals without holding monitors lock + // This avoids lock order inversion with processes_table + MUTABLE_LIST_FOR_EACH (item, tmp, &to_signal) { + struct ResourceMonitor *monitor = GET_LIST_ENTRY(item, struct ResourceMonitor, resource_list_head); + Context *target = globalcontext_get_process_lock(global, monitor->process_id); + if (target) { + mailbox_send_ref_signal(target, DemonitorSignal, monitor->ref_ticks); + globalcontext_get_process_unlock(global, target); + } + 
list_remove(&monitor->resource_list_head); + free(monitor); + } } int enif_compare_monitors(const ErlNifMonitor *monitor1, const ErlNifMonitor *monitor2)