Skip to content

Commit 0a008c6

Browse files
committed
Try to fix the race condition in the test script
1 parent c86d9bb commit 0a008c6

File tree

2 files changed

+15
-16
lines changed

2 files changed

+15
-16
lines changed

test-scripts/run_libcxx_tests.py

Lines changed: 12 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,7 @@ def run_shard(q: Queue, barrier: Barrier, num, total, ssh_port_queue, kernel, di
7070

7171
# sys.argv.append("--pretend")
7272
print("Starting shard", num, sys.argv)
73+
run_remote_lit_test.CURRENT_STAGE = run_remote_lit_test.MultiprocessStages.FINDING_SSH_PORT
7374
boot_cheribsd.MESSAGE_PREFIX = "\033[0;34m" + "shard" + str(num) + ": \033[0m"
7475
if pretend:
7576
boot_cheribsd.QEMU_LOGFILE = Path(os.devnull)
@@ -95,7 +96,7 @@ def set_cmdline_args(args: argparse.Namespace):
9596
if mp_queue:
9697
# check that we don't get a conflict
9798
mp_debug(args, "Syncing shard ", shard_num, " with main process. Stage: assign SSH port")
98-
99+
assert run_remote_lit_test.CURRENT_STAGE == run_remote_lit_test.MultiprocessStages.FINDING_SSH_PORT
99100
ssh_port_queue.put((args.ssh_port, shard_num)) # check that we don't get a conflict
100101
run_remote_lit_test.notify_main_process(
101102
args,
@@ -296,13 +297,6 @@ def run_parallel_impl(
296297
starttime = datetime.datetime.now()
297298
ssh_ports = [] # check that we don't have multiple parallel jobs trying to use the same port
298299
assert not mp_barrier.broken, mp_barrier
299-
# FIXME: without this sleep it fails in jenkins (is the python version there broken?)
300-
# Works just fine everywhere else where I test it...
301-
boot_cheribsd.info("Waiting 5 seconds before releasing barrier")
302-
if not get_global_config().pretend:
303-
time.sleep(5)
304-
mp_debug(args, "Waiting for SSH port barrier")
305-
mp_barrier.wait(timeout=10) # wait for ssh ports to be assigned
306300
for i in range(len(processes)):
307301
try:
308302
ssh_port, index = ssh_port_queue.get(timeout=1)
@@ -317,6 +311,7 @@ def run_parallel_impl(
317311
timed_out = True # kill all child processes
318312
boot_cheribsd.failure("ERROR: Could not determine SSH port for one of the processes!", exit=False)
319313

314+
mp_barrier.wait() # allow shards to start running
320315
# wait for the success/failure message from the process:
321316
# if the shard takes longer than 4 hours to run something went wrong
322317
start_time = datetime.datetime.utcnow()
@@ -380,8 +375,9 @@ def run_parallel_impl(
380375
remaining_processes.remove(target_process)
381376
target_process.stage = run_remote_lit_test.MultiprocessStages.EXITED
382377
elif shard_result[0] == run_remote_lit_test.NEXT_STAGE:
383-
mp_debug(args, "===> Shard ", shard_result[1], " reached next stage: ", shard_result[2])
384-
if target_process.stage == run_remote_lit_test.MultiprocessStages.BOOTING_CHERIBSD:
378+
mp_debug(args, "===> Shard ", shard_result[1], " complated stage: ", shard_result[2])
379+
assert target_process.stage == shard_result[2]
380+
if shard_result[2] == run_remote_lit_test.MultiprocessStages.BOOTING_CHERIBSD:
385381
not_booted_processes.remove(target_process)
386382
boot_cheribsd.success(
387383
"Shard ",
@@ -398,9 +394,12 @@ def run_parallel_impl(
398394
assert mp_barrier.n_waiting == len(processes), f"{mp_barrier.n_waiting} != {len(processes)}"
399395
mp_barrier.wait(timeout=10)
400396
boot_cheribsd.success("Barrier has been released, tests should run now.")
401-
# assert target_process.stage < shard_result[2], "STAGE WENT BACKWARDS?"
402-
target_process.stage = shard_result[2]
397+
target_process.stage = shard_result[3]
403398
elif shard_result[0] == run_remote_lit_test.FAILURE:
399+
boot_cheribsd.failure(
400+
f"ERROR: Shard {target_process} faied in stage: {target_process.stage}",
401+
exit=False,
402+
)
404403
previous_stage = target_process.stage
405404
target_process.stage = run_remote_lit_test.MultiprocessStages.FAILED
406405
target_process.error_message = shard_result[2]
@@ -423,7 +422,7 @@ def run_parallel_impl(
423422
shard_result[1],
424423
" failed while running tests: ",
425424
shard_result[2],
426-
exit=True,
425+
exit=False,
427426
)
428427
else:
429428
boot_cheribsd.failure("===> FATAL: Received invalid shard result message: ", shard_result, exit=True)

test-scripts/run_remote_lit_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ class MultiprocessStages(Enum):
6262
TIMED_OUT = "timed out"
6363

6464

65-
CURRENT_STAGE: MultiprocessStages = MultiprocessStages.FINDING_SSH_PORT
65+
CURRENT_STAGE: MultiprocessStages = MultiprocessStages.BOOTING_CHERIBSD
6666

6767

6868
def add_common_cmdline_args(parser: argparse.ArgumentParser, default_xunit_output: str, allow_multiprocessing: bool):
@@ -111,14 +111,13 @@ def notify_main_process(
111111
if mp_q:
112112
global CURRENT_STAGE # noqa: PLW0603
113113
mp_debug(cmdline_args, "Next stage: ", CURRENT_STAGE, "->", stage)
114-
mp_q.put((NEXT_STAGE, cmdline_args.internal_shard, stage))
114+
mp_q.put((NEXT_STAGE, cmdline_args.internal_shard, CURRENT_STAGE, stage))
115115
CURRENT_STAGE = stage
116116
if barrier:
117117
assert mp_q
118118
mp_debug(cmdline_args, "Waiting for main process to release barrier for stage ", stage)
119119
barrier.wait()
120120
mp_debug(cmdline_args, "Barrier released for stage ", stage)
121-
time.sleep(1)
122121

123122

124123
def flush_thread(f, qemu: boot_cheribsd.QemuCheriBSDInstance, should_exit_event: threading.Event):
@@ -210,6 +209,7 @@ def run_remote_lit_tests_impl(
210209
time.sleep(10)
211210
if mp_q:
212211
assert barrier is not None
212+
assert CURRENT_STAGE == MultiprocessStages.BOOTING_CHERIBSD
213213
notify_main_process(args, MultiprocessStages.TESTING_SSH_CONNECTION, mp_q, barrier=barrier)
214214
if get_global_config().pretend and os.getenv("FAIL_RAISE_EXCEPTION") and args.internal_shard == 1:
215215
raise RuntimeError("SOMETHING WENT WRONG!")

0 commit comments

Comments
 (0)