@@ -1008,55 +1008,110 @@ def test_zombies_reaped():
10081008
10091009
10101010def test_pipe_inheritance ():
1011- """Spawn 300 child processes: 100 writers, 100 readers, and 100 waiters.
1012- The goal is to see whether any of the waiters inherits a write end of one
1013- of the pipes. If so, it will deadlock a reader and deadlock this test.
1014-
1015- You can make this fail on Windows and macOS by passing `close_fds=False` to
1016- `subprocess.Popen` in duct.py. The Windows deadlock is because pipes are
1017- briefly marked inheritable process-wide during a critical section when
1018- spawning child processes, and `close_fds=False` turns off the
1019- whitelist/allowlist that works around that. I think the macOS deadlock is
1020- because macOS doesn't support `pipe2`, so pipe creation itself has a brief
1021- race with child spawning, before `FD_CLOEXEC` takes effect."""
1022- children = []
1023- threads = []
1024-
1025- def reader_writer_fn ():
1026- # Open a pipe for the child to read from. This can deadlock if a waiter
1027- # inherits this pipe and keeps it open.
1028- pipe_reader_fd , pipe_writer_fd = os .pipe ()
1029- pipe_reader = os .fdopen (pipe_reader_fd , "rb" )
1030- pipe_writer = os .fdopen (pipe_writer_fd , "wb" )
1031- writer_child = echo_cmd ("foo" ).stdout_file (pipe_writer ).start ()
1032- children .append (writer_child )
1033- pipe_writer .close ()
1034- reader_child = cat_cmd ().stdin_file (pipe_reader ).reader ()
1035- children .append (reader_child )
1036- pipe_reader .close ()
1037- writer_child .wait ()
1038- assert reader_child .read ().strip () == b"foo"
1039-
1040- for _ in range (100 ):
1041- thread = threading .Thread (target = reader_writer_fn , daemon = True )
1042- thread .start ()
1043- threads .append (thread )
1044-
1045- # Spawn a child that won't exit until we kill it. This is intended to
1046- # keep pipes open if they're inherited unintentionally. Without
1047- # something like this, we'd need an inheritance *cycle* between
1048- # readers, which might be unlikely.
1049- children .append (sleep_cmd (365 * 24 * 60 * 60 ).unchecked ().start ())
1011+ """Spawn lots of child processes, 100 `cat`s and 100 `sleep`s at a time,
1012+ for a fixed number of seconds (longer in CI). The goal is to see whether
1013+ any of the `sleep`s inherits the write end of a pipe that one of the `cat`s
1014+ is trying to read. If so, that `cat` will hang. Detect that with a
1015+ `wait_timeout`, and fail if we see it.
1016+
1017+ This test is most relevant in Python, where the standard library doesn't
1018+ protect `subprocess.Popen` with a global mutex, which means that spawning
1019+ child processes from multiple threads is prone to deadlocks on Windows. The
1020+ updated `close_fds=True` default in Python 3.7+ is a workaround, and the
1021+ Python version of this tests exercises that. The Rust standard library has
1022+ a global `Mutex` that prevents that particular race. However, Python's
1023+ `close_fds` feature also works around a bug on macOS, where there's no
1024+ `pipe2` syscall, so we can't open pipes and mark them `CLOEXEC` atomically.
1025+ Rust is vulnerable to this race."""
10501026
1051- try :
1052- for thread in threads :
1053- # These threads should terminate very quickly in the absence of
1054- # deadlocks. 10 seconds is generous.
1055- thread .join (timeout = 10 )
1056- assert not thread .is_alive (), "deadlock!"
1057- finally :
1058- # `daemon=True` above means that we technically don't need to worry
1059- # about background threads outliving this test, but it's better to
1060- # unblock them.
1061- for child in children :
1062- child .kill ()
1027+ test_duration_secs = 1
1028+ if duration_env := os .environ .get ("DUCT_RACE_TEST_SECONDS" ):
1029+ test_duration_secs = int (duration_env )
1030+ print (f"test duration: { test_duration_secs } seconds" )
1031+ test_start = time .monotonic ()
1032+ test_deadline = test_start + test_duration_secs
1033+ spawns_per_iteration = 100
1034+ # If they don't hang, the `cat` processes should exit almost immediately,
1035+ # so a 1 second wait is generous.
1036+ deadlock_timeout = 1.0
1037+ start_barrier = threading .Barrier (2 )
1038+ end_barrier = threading .Barrier (2 )
1039+ deadlocked = False
1040+ finished = False
1041+ iterations = 0
1042+
1043+ # We don't yet support waiting on a Duct handle with a timeout in Python.
1044+ # (We do in Rust though.) Use a daemon thread to fake it.
1045+ def wait_with_timeout (handle , timeout ):
1046+ wait_thread = threading .Thread (target = handle .wait , daemon = True )
1047+ wait_thread .start ()
1048+ wait_thread .join (timeout )
1049+ still_running = wait_thread .is_alive ()
1050+ return still_running
1051+
1052+ def spawn_sleepers ():
1053+ # A background thread spawns `sleep`s.
1054+ sleep_expr = sleep_cmd (1_000_000 ).unchecked ()
1055+ while not deadlocked and not finished :
1056+ sleeps = []
1057+ start_barrier .wait ()
1058+ # Spawn all the `sleep`s.
1059+ for _ in range (spawns_per_iteration ):
1060+ sleeps .append (sleep_expr .start ())
1061+ end_barrier .wait ()
1062+ # Clean up `sleep`s *after* the end barrier, so that we wait until the main thread
1063+ # has confirmed there are no deadlocks.
1064+ for sleep_handle in sleeps :
1065+ sleep_handle .kill ()
1066+ sleep_handle .wait ()
1067+
1068+ # Not a daemon thread, so that we don't exit with `sleep`s running.
1069+ sleep_thread = threading .Thread (target = spawn_sleepers )
1070+ sleep_thread .start ()
1071+
1072+ # This thread spawns `cat`s.
1073+ while not deadlocked and not finished :
1074+ iterations += 1
1075+ cats = []
1076+ cat_expr = (
1077+ cat_cmd ()
1078+ # `stdin_bytes` opens a pipe
1079+ .stdin_bytes (b"foo" )
1080+ # `pipe` opens a pipe (of course)
1081+ .pipe (cat_cmd ())
1082+ # Capturing output also opens a pipe.
1083+ .stdout_capture ().unchecked ()
1084+ )
1085+ start_barrier .wait ()
1086+ # Spawn all the `cat`s.
1087+ for _ in range (spawns_per_iteration ):
1088+ cats .append (cat_expr .start ())
1089+ # Check for deadlocks *before* the end barrier, so that `sleep` cleanup doesn't happen
1090+ # until this loop is done.
1091+ for cat_handle in cats :
1092+ # Only do a `wait_timeout` if we haven't seen a deadlock yet, so that we exit
1093+ # quickly once we know the test has failed.
1094+ if not deadlocked :
1095+ still_running = wait_with_timeout (cat_handle , deadlock_timeout )
1096+ if still_running :
1097+ deadlocked = True
1098+ # Check the deadline *before* the end barrier, so that both loops are guaranteed to
1099+ # see the flag in the next iteration.
1100+ if time .monotonic () >= test_deadline :
1101+ finished = True
1102+ end_barrier .wait ()
1103+ # Kill and reap `cat`s *after* the end barrier, because `wait` blocks on their IO
1104+ # threads, which (if there are inheritance bugs) might be kept running by a `sleep`.
1105+ # Deadlocking this test isn't the end of the world, because either way it's a CI
1106+ # failure, but it's a lot nicer to return a clear message quickly. Note that we don't
1107+ # have to worry about a cycle of `cat`s inheriting each other's pipes, because only
1108+ # this thread is spawning `cat`s.
1109+ for cat_handle in cats :
1110+ cat_handle .kill ()
1111+ cat_handle .wait ()
1112+
1113+ elapsed = time .monotonic () - test_start
1114+ assert not deadlocked , (
1115+ f"deadlock after { iterations } iterations "
1116+ f"({ elapsed - deadlock_timeout :.3f} seconds)"
1117+ )
0 commit comments