Skip to content

Commit 5bce448

Browse files
committed
backport the changes I made for the Rust version of test_pipe_inheritance
1 parent e95178b commit 5bce448

File tree

2 files changed

+110
-51
lines changed

2 files changed

+110
-51
lines changed

.github/workflows/ci.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,7 @@ jobs:
2020
- uses: actions/checkout@v4
2121
- uses: astral-sh/setup-uv@v5
2222
- run: uv run --python ${{ matrix.python-version }} ci.py
23+
- name: run test_pipe_inheritance for 5 minutes
24+
run: uv run --python ${{ matrix.python-version }} ci.py
25+
env:
26+
DUCT_RACE_TEST_SECONDS: 300

test_duct.py

Lines changed: 106 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1008,55 +1008,110 @@ def test_zombies_reaped():
10081008

10091009

10101010
def test_pipe_inheritance():
1011-
"""Spawn 300 child processes: 100 writers, 100 readers, and 100 waiters.
1012-
The goal is to see whether any of the waiters inherits a write end of one
1013-
of the pipes. If so, it will deadlock a reader and deadlock this test.
1014-
1015-
You can make this fail on Windows and macOS by passing `close_fds=False` to
1016-
`subprocess.Popen` in duct.py. The Windows deadlock is because pipes are
1017-
briefly marked inheritable process-wide during a critical section when
1018-
spawning child processes, and `close_fds=False` turns off the
1019-
whitelist/allowlist that works around that. I think the macOS deadlock is
1020-
because macOS doesn't support `pipe2`, so pipe creation itself has a brief
1021-
race with child spawning, before `FD_CLOEXEC` takes effect."""
1022-
children = []
1023-
threads = []
1024-
1025-
def reader_writer_fn():
1026-
# Open a pipe for the child to read from. This can deadlock if a waiter
1027-
# inherits this pipe and keeps it open.
1028-
pipe_reader_fd, pipe_writer_fd = os.pipe()
1029-
pipe_reader = os.fdopen(pipe_reader_fd, "rb")
1030-
pipe_writer = os.fdopen(pipe_writer_fd, "wb")
1031-
writer_child = echo_cmd("foo").stdout_file(pipe_writer).start()
1032-
children.append(writer_child)
1033-
pipe_writer.close()
1034-
reader_child = cat_cmd().stdin_file(pipe_reader).reader()
1035-
children.append(reader_child)
1036-
pipe_reader.close()
1037-
writer_child.wait()
1038-
assert reader_child.read().strip() == b"foo"
1039-
1040-
for _ in range(100):
1041-
thread = threading.Thread(target=reader_writer_fn, daemon=True)
1042-
thread.start()
1043-
threads.append(thread)
1044-
1045-
# Spawn a child that won't exit until we kill it. This is intended to
1046-
# keep pipes open if they're inherited unintentionally. Without
1047-
# something like this, we'd need an inheritance *cycle* between
1048-
# readers, which might be unlikely.
1049-
children.append(sleep_cmd(365 * 24 * 60 * 60).unchecked().start())
1011+
"""Spawn lots of child processes, 100 `cat`s and 100 `sleep`s at a time,
1012+
for a fixed number of seconds (longer in CI). The goal is to see whether
1013+
any of the `sleep`s inherits the write end of a pipe that one of the `cat`s
1014+
is trying to read. If so, that `cat` will hang. Detect that with a
1015+
`wait_timeout`, and fail if we see it.
1016+
1017+
This test is most relevant in Python, where the standard library doesn't
1018+
protect `subprocess.Popen` with a global mutex, which means that spawning
1019+
child processes from multiple threads is prone to deadlocks on Windows. The
1020+
updated `close_fds=True` default in Python 3.7+ is a workaround, and the
1021+
Python version of this tests exercises that. The Rust standard library has
1022+
a global `Mutex` that prevents that particular race. However, Python's
1023+
`close_fds` feature also works around a bug on macOS, where there's no
1024+
`pipe2` syscall, so we can't open pipes and mark them `CLOEXEC` atomically.
1025+
Rust is vulnerable to this race."""
10501026

1051-
try:
1052-
for thread in threads:
1053-
# These threads should terminate very quickly in the absence of
1054-
# deadlocks. 10 seconds is generous.
1055-
thread.join(timeout=10)
1056-
assert not thread.is_alive(), "deadlock!"
1057-
finally:
1058-
# `daemon=True` above means that we technically don't need to worry
1059-
# about background threads outliving this test, but it's better to
1060-
# unblock them.
1061-
for child in children:
1062-
child.kill()
1027+
test_duration_secs = 1
1028+
if duration_env := os.environ.get("DUCT_RACE_TEST_SECONDS"):
1029+
test_duration_secs = int(duration_env)
1030+
print(f"test duration: {test_duration_secs} seconds")
1031+
test_start = time.monotonic()
1032+
test_deadline = test_start + test_duration_secs
1033+
spawns_per_iteration = 100
1034+
# If they don't hang, the `cat` processes should exit almost immediately,
1035+
# so a 1 second wait is generous.
1036+
deadlock_timeout = 1.0
1037+
start_barrier = threading.Barrier(2)
1038+
end_barrier = threading.Barrier(2)
1039+
deadlocked = False
1040+
finished = False
1041+
iterations = 0
1042+
1043+
# We don't yet support waiting on a Duct handle with a timeout in Python.
1044+
# (We do in Rust though.) Use a daemon thread to fake it.
1045+
def wait_with_timeout(handle, timeout):
1046+
wait_thread = threading.Thread(target=handle.wait, daemon=True)
1047+
wait_thread.start()
1048+
wait_thread.join(timeout)
1049+
still_running = wait_thread.is_alive()
1050+
return still_running
1051+
1052+
def spawn_sleepers():
1053+
# A background thread spawns `sleep`s.
1054+
sleep_expr = sleep_cmd(1_000_000).unchecked()
1055+
while not deadlocked and not finished:
1056+
sleeps = []
1057+
start_barrier.wait()
1058+
# Spawn all the `sleep`s.
1059+
for _ in range(spawns_per_iteration):
1060+
sleeps.append(sleep_expr.start())
1061+
end_barrier.wait()
1062+
# Clean up `sleep`s *after* the end barrier, so that we wait until the main thread
1063+
# has confirmed there are no deadlocks.
1064+
for sleep_handle in sleeps:
1065+
sleep_handle.kill()
1066+
sleep_handle.wait()
1067+
1068+
# Not a daemon thread, so that we don't exit with `sleep`s running.
1069+
sleep_thread = threading.Thread(target=spawn_sleepers)
1070+
sleep_thread.start()
1071+
1072+
# This thread spawns `cat`s.
1073+
while not deadlocked and not finished:
1074+
iterations += 1
1075+
cats = []
1076+
cat_expr = (
1077+
cat_cmd()
1078+
# `stdin_bytes` opens a pipe
1079+
.stdin_bytes(b"foo")
1080+
# `pipe` opens a pipe (of course)
1081+
.pipe(cat_cmd())
1082+
# Capturing output also opens a pipe.
1083+
.stdout_capture().unchecked()
1084+
)
1085+
start_barrier.wait()
1086+
# Spawn all the `cat`s.
1087+
for _ in range(spawns_per_iteration):
1088+
cats.append(cat_expr.start())
1089+
# Check for deadlocks *before* the end barrier, so that `sleep` cleanup doesn't happen
1090+
# until this loop is done.
1091+
for cat_handle in cats:
1092+
# Only do a `wait_timeout` if we haven't seen a deadlock yet, so that we exit
1093+
# quickly once we know the test has failed.
1094+
if not deadlocked:
1095+
still_running = wait_with_timeout(cat_handle, deadlock_timeout)
1096+
if still_running:
1097+
deadlocked = True
1098+
# Check the deadline *before* the end barrier, so that both loops are guaranteed to
1099+
# see the flag in the next iteration.
1100+
if time.monotonic() >= test_deadline:
1101+
finished = True
1102+
end_barrier.wait()
1103+
# Kill and reap `cat`s *after* the end barrier, because `wait` blocks on their IO
1104+
# threads, which (if there are inheritance bugs) might be kept running by a `sleep`.
1105+
# Deadlocking this test isn't the end of the world, because either way it's a CI
1106+
# failure, but it's a lot nicer to return a clear message quickly. Note that we don't
1107+
# have to worry about a cycle of `cat`s inheriting each other's pipes, because only
1108+
# this thread is spawning `cat`s.
1109+
for cat_handle in cats:
1110+
cat_handle.kill()
1111+
cat_handle.wait()
1112+
1113+
elapsed = time.monotonic() - test_start
1114+
assert not deadlocked, (
1115+
f"deadlock after {iterations} iterations "
1116+
f"({elapsed - deadlock_timeout:.3f} seconds)"
1117+
)

0 commit comments

Comments
 (0)