Skip to content

Commit 90fe325

Browse files
authored
gh-137433: Fix deadlock with stop-the-world and daemon threads (gh-137735)
There was a deadlock originally seen by Memray when a daemon thread enabled or disabled profiling while the interpreter was shutting down. I think this could also happen with garbage collection, but I haven't seen that in practice. The daemon thread could be hung while trying acquire the global rwmutex that prevents overlapping global and per-interpreter stop-the-world events. Since it already held the main interpreter's stop-the-world lock, it also deadlocked the main thread, which is trying to perform interpreter finalization. Swap the order of lock acquisition to prevent this deadlock. Additionally, refactor `_PyParkingLot_Park` so that the global buckets hashtable is left in a clean state if the thread is hung in `PyEval_AcquireThread`.
1 parent 4c0d7bc commit 90fe325

File tree

6 files changed

+55
-31
lines changed

6 files changed

+55
-31
lines changed

Include/internal/pycore_semaphore.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,8 @@ typedef struct _PySemaphore {
4646
} _PySemaphore;
4747

4848
// Puts the current thread to sleep until _PySemaphore_Wakeup() is called.
49-
// If `detach` is true, then the thread will detach/release the GIL while
50-
// sleeping.
5149
PyAPI_FUNC(int)
52-
_PySemaphore_Wait(_PySemaphore *sema, PyTime_t timeout_ns, int detach);
50+
_PySemaphore_Wait(_PySemaphore *sema, PyTime_t timeout_ns);
5351

5452
// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup()
5553
// can be called before _PySemaphore_Wait().

Lib/test/test_threading.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1430,6 +1430,33 @@ def test_native_id_after_fork(self):
14301430
self.assertEqual(len(native_ids), 2)
14311431
self.assertNotEqual(native_ids[0], native_ids[1])
14321432

1433+
def test_stop_the_world_during_finalization(self):
1434+
# gh-137433: Test functions that trigger a stop-the-world in the free
1435+
# threading build concurrent with interpreter finalization.
1436+
script = """if True:
1437+
import gc
1438+
import sys
1439+
import threading
1440+
NUM_THREADS = 5
1441+
b = threading.Barrier(NUM_THREADS + 1)
1442+
def run_in_bg():
1443+
b.wait()
1444+
while True:
1445+
sys.setprofile(None)
1446+
gc.collect()
1447+
1448+
for _ in range(NUM_THREADS):
1449+
t = threading.Thread(target=run_in_bg, daemon=True)
1450+
t.start()
1451+
1452+
b.wait()
1453+
print("Exiting...")
1454+
"""
1455+
rc, out, err = assert_python_ok('-c', script)
1456+
self.assertEqual(rc, 0)
1457+
self.assertEqual(err, b"")
1458+
self.assertEqual(out.strip(), b"Exiting...")
1459+
14331460
class ThreadJoinOnShutdown(BaseTestCase):
14341461

14351462
def _run_and_join(self, script):
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix a potential deadlock in the :term:`free threading` build when daemon
2+
threads enable or disable profiling or tracing while the main thread is
3+
shutting down the interpreter.

Python/lock.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ _PyRawMutex_LockSlow(_PyRawMutex *m)
227227

228228
// Wait for us to be woken up. Note that we still have to lock the
229229
// mutex ourselves: it is NOT handed off to us.
230-
_PySemaphore_Wait(&waiter.sema, -1, /*detach=*/0);
230+
_PySemaphore_Wait(&waiter.sema, -1);
231231
}
232232

233233
_PySemaphore_Destroy(&waiter.sema);

Python/parking_lot.c

Lines changed: 19 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,8 @@ _PySemaphore_Destroy(_PySemaphore *sema)
9191
#endif
9292
}
9393

94-
static int
95-
_PySemaphore_PlatformWait(_PySemaphore *sema, PyTime_t timeout)
94+
int
95+
_PySemaphore_Wait(_PySemaphore *sema, PyTime_t timeout)
9696
{
9797
int res;
9898
#if defined(MS_WINDOWS)
@@ -225,27 +225,6 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, PyTime_t timeout)
225225
return res;
226226
}
227227

228-
int
229-
_PySemaphore_Wait(_PySemaphore *sema, PyTime_t timeout, int detach)
230-
{
231-
PyThreadState *tstate = NULL;
232-
if (detach) {
233-
tstate = _PyThreadState_GET();
234-
if (tstate && _PyThreadState_IsAttached(tstate)) {
235-
// Only detach if we are attached
236-
PyEval_ReleaseThread(tstate);
237-
}
238-
else {
239-
tstate = NULL;
240-
}
241-
}
242-
int res = _PySemaphore_PlatformWait(sema, timeout);
243-
if (tstate) {
244-
PyEval_AcquireThread(tstate);
245-
}
246-
return res;
247-
}
248-
249228
void
250229
_PySemaphore_Wakeup(_PySemaphore *sema)
251230
{
@@ -342,7 +321,19 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size,
342321
enqueue(bucket, addr, &wait);
343322
_PyRawMutex_Unlock(&bucket->mutex);
344323

345-
int res = _PySemaphore_Wait(&wait.sema, timeout_ns, detach);
324+
PyThreadState *tstate = NULL;
325+
if (detach) {
326+
tstate = _PyThreadState_GET();
327+
if (tstate && _PyThreadState_IsAttached(tstate)) {
328+
// Only detach if we are attached
329+
PyEval_ReleaseThread(tstate);
330+
}
331+
else {
332+
tstate = NULL;
333+
}
334+
}
335+
336+
int res = _PySemaphore_Wait(&wait.sema, timeout_ns);
346337
if (res == Py_PARK_OK) {
347338
goto done;
348339
}
@@ -354,7 +345,7 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size,
354345
// Another thread has started to unpark us. Wait until we process the
355346
// wakeup signal.
356347
do {
357-
res = _PySemaphore_Wait(&wait.sema, -1, detach);
348+
res = _PySemaphore_Wait(&wait.sema, -1);
358349
} while (res != Py_PARK_OK);
359350
goto done;
360351
}
@@ -366,6 +357,9 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size,
366357

367358
done:
368359
_PySemaphore_Destroy(&wait.sema);
360+
if (tstate) {
361+
PyEval_AcquireThread(tstate);
362+
}
369363
return res;
370364

371365
}

Python/pystate.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2253,13 +2253,15 @@ stop_the_world(struct _stoptheworld_state *stw)
22532253
{
22542254
_PyRuntimeState *runtime = &_PyRuntime;
22552255

2256-
PyMutex_Lock(&stw->mutex);
2256+
// gh-137433: Acquire the rwmutex first to avoid deadlocks with daemon
2257+
// threads that may hang when blocked on lock acquisition.
22572258
if (stw->is_global) {
22582259
_PyRWMutex_Lock(&runtime->stoptheworld_mutex);
22592260
}
22602261
else {
22612262
_PyRWMutex_RLock(&runtime->stoptheworld_mutex);
22622263
}
2264+
PyMutex_Lock(&stw->mutex);
22632265

22642266
HEAD_LOCK(runtime);
22652267
stw->requested = 1;
@@ -2325,13 +2327,13 @@ start_the_world(struct _stoptheworld_state *stw)
23252327
}
23262328
stw->requester = NULL;
23272329
HEAD_UNLOCK(runtime);
2330+
PyMutex_Unlock(&stw->mutex);
23282331
if (stw->is_global) {
23292332
_PyRWMutex_Unlock(&runtime->stoptheworld_mutex);
23302333
}
23312334
else {
23322335
_PyRWMutex_RUnlock(&runtime->stoptheworld_mutex);
23332336
}
2334-
PyMutex_Unlock(&stw->mutex);
23352337
}
23362338
#endif // Py_GIL_DISABLED
23372339

0 commit comments

Comments
 (0)