Skip to content

Commit e09f33e

Browse files
[3.14] gh-137433: Fix deadlock with stop-the-world and daemon threads (gh-137735) (GH-138965)
There was a deadlock originally seen by Memray when a daemon thread enabled or disabled profiling while the interpreter was shutting down. I think this could also happen with garbage collection, but I haven't seen that in practice. The daemon thread could be hung while trying acquire the global rwmutex that prevents overlapping global and per-interpreter stop-the-world events. Since it already held the main interpreter's stop-the-world lock, it also deadlocked the main thread, which is trying to perform interpreter finalization. Swap the order of lock acquisition to prevent this deadlock. Additionally, refactor `_PyParkingLot_Park` so that the global buckets hashtable is left in a clean state if the thread is hung in `PyEval_AcquireThread`. (cherry picked from commit 90fe325) Co-authored-by: Sam Gross <[email protected]>
1 parent db8b943 commit e09f33e

File tree

4 files changed

+51
-4
lines changed

4 files changed

+51
-4
lines changed

Lib/test/test_threading.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1381,6 +1381,33 @@ def test_native_id_after_fork(self):
13811381
self.assertEqual(len(native_ids), 2)
13821382
self.assertNotEqual(native_ids[0], native_ids[1])
13831383

1384+
def test_stop_the_world_during_finalization(self):
1385+
# gh-137433: Test functions that trigger a stop-the-world in the free
1386+
# threading build concurrent with interpreter finalization.
1387+
script = """if True:
1388+
import gc
1389+
import sys
1390+
import threading
1391+
NUM_THREADS = 5
1392+
b = threading.Barrier(NUM_THREADS + 1)
1393+
def run_in_bg():
1394+
b.wait()
1395+
while True:
1396+
sys.setprofile(None)
1397+
gc.collect()
1398+
1399+
for _ in range(NUM_THREADS):
1400+
t = threading.Thread(target=run_in_bg, daemon=True)
1401+
t.start()
1402+
1403+
b.wait()
1404+
print("Exiting...")
1405+
"""
1406+
rc, out, err = assert_python_ok('-c', script)
1407+
self.assertEqual(rc, 0)
1408+
self.assertEqual(err, b"")
1409+
self.assertEqual(out.strip(), b"Exiting...")
1410+
13841411
class ThreadJoinOnShutdown(BaseTestCase):
13851412

13861413
def _run_and_join(self, script):
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix a potential deadlock in the :term:`free threading` build when daemon
2+
threads enable or disable profiling or tracing while the main thread is
3+
shutting down the interpreter.

Python/parking_lot.c

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,19 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size,
342342
enqueue(bucket, addr, &wait);
343343
_PyRawMutex_Unlock(&bucket->mutex);
344344

345-
int res = _PySemaphore_Wait(&wait.sema, timeout_ns, detach);
345+
PyThreadState *tstate = NULL;
346+
if (detach) {
347+
tstate = _PyThreadState_GET();
348+
if (tstate && _PyThreadState_IsAttached(tstate)) {
349+
// Only detach if we are attached
350+
PyEval_ReleaseThread(tstate);
351+
}
352+
else {
353+
tstate = NULL;
354+
}
355+
}
356+
357+
int res = _PySemaphore_Wait(&wait.sema, timeout_ns, 0);
346358
if (res == Py_PARK_OK) {
347359
goto done;
348360
}
@@ -354,7 +366,7 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size,
354366
// Another thread has started to unpark us. Wait until we process the
355367
// wakeup signal.
356368
do {
357-
res = _PySemaphore_Wait(&wait.sema, -1, detach);
369+
res = _PySemaphore_Wait(&wait.sema, -1, 0);
358370
} while (res != Py_PARK_OK);
359371
goto done;
360372
}
@@ -366,6 +378,9 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size,
366378

367379
done:
368380
_PySemaphore_Destroy(&wait.sema);
381+
if (tstate) {
382+
PyEval_AcquireThread(tstate);
383+
}
369384
return res;
370385

371386
}

Python/pystate.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2345,13 +2345,15 @@ stop_the_world(struct _stoptheworld_state *stw)
23452345
{
23462346
_PyRuntimeState *runtime = &_PyRuntime;
23472347

2348-
PyMutex_Lock(&stw->mutex);
2348+
// gh-137433: Acquire the rwmutex first to avoid deadlocks with daemon
2349+
// threads that may hang when blocked on lock acquisition.
23492350
if (stw->is_global) {
23502351
_PyRWMutex_Lock(&runtime->stoptheworld_mutex);
23512352
}
23522353
else {
23532354
_PyRWMutex_RLock(&runtime->stoptheworld_mutex);
23542355
}
2356+
PyMutex_Lock(&stw->mutex);
23552357

23562358
HEAD_LOCK(runtime);
23572359
stw->requested = 1;
@@ -2417,13 +2419,13 @@ start_the_world(struct _stoptheworld_state *stw)
24172419
}
24182420
stw->requester = NULL;
24192421
HEAD_UNLOCK(runtime);
2422+
PyMutex_Unlock(&stw->mutex);
24202423
if (stw->is_global) {
24212424
_PyRWMutex_Unlock(&runtime->stoptheworld_mutex);
24222425
}
24232426
else {
24242427
_PyRWMutex_RUnlock(&runtime->stoptheworld_mutex);
24252428
}
2426-
PyMutex_Unlock(&stw->mutex);
24272429
}
24282430
#endif // Py_GIL_DISABLED
24292431

0 commit comments

Comments
 (0)