@@ -70,6 +70,7 @@ def run_shard(q: Queue, barrier: Barrier, num, total, ssh_port_queue, kernel, di
70
70
71
71
# sys.argv.append("--pretend")
72
72
print ("Starting shard" , num , sys .argv )
73
+ run_remote_lit_test .CURRENT_STAGE = run_remote_lit_test .MultiprocessStages .FINDING_SSH_PORT
73
74
boot_cheribsd .MESSAGE_PREFIX = "\033 [0;34m" + "shard" + str (num ) + ": \033 [0m"
74
75
if pretend :
75
76
boot_cheribsd .QEMU_LOGFILE = Path (os .devnull )
@@ -95,7 +96,7 @@ def set_cmdline_args(args: argparse.Namespace):
95
96
if mp_queue :
96
97
# check that we don't get a conflict
97
98
mp_debug (args , "Syncing shard " , shard_num , " with main process. Stage: assign SSH port" )
98
-
99
+ assert run_remote_lit_test . CURRENT_STAGE == run_remote_lit_test . MultiprocessStages . FINDING_SSH_PORT
99
100
ssh_port_queue .put ((args .ssh_port , shard_num )) # check that we don't get a conflict
100
101
run_remote_lit_test .notify_main_process (
101
102
args ,
@@ -296,13 +297,6 @@ def run_parallel_impl(
296
297
starttime = datetime .datetime .now ()
297
298
ssh_ports = [] # check that we don't have multiple parallel jobs trying to use the same port
298
299
assert not mp_barrier .broken , mp_barrier
299
- # FIXME: without this sleep it fails in jenkins (is the python version there broken?)
300
- # Works just fine everywhere else where I test it...
301
- boot_cheribsd .info ("Waiting 5 seconds before releasing barrier" )
302
- if not get_global_config ().pretend :
303
- time .sleep (5 )
304
- mp_debug (args , "Waiting for SSH port barrier" )
305
- mp_barrier .wait (timeout = 10 ) # wait for ssh ports to be assigned
306
300
for i in range (len (processes )):
307
301
try :
308
302
ssh_port , index = ssh_port_queue .get (timeout = 1 )
@@ -317,6 +311,7 @@ def run_parallel_impl(
317
311
timed_out = True # kill all child processes
318
312
boot_cheribsd .failure ("ERROR: Could not determine SSH port for one of the processes!" , exit = False )
319
313
314
+ mp_barrier .wait () # allow shards to start running
320
315
# wait for the success/failure message from the process:
321
316
# if the shard takes longer than 4 hours to run, something went wrong
322
317
start_time = datetime .datetime .utcnow ()
@@ -380,8 +375,9 @@ def run_parallel_impl(
380
375
remaining_processes .remove (target_process )
381
376
target_process .stage = run_remote_lit_test .MultiprocessStages .EXITED
382
377
elif shard_result [0 ] == run_remote_lit_test .NEXT_STAGE :
383
- mp_debug (args , "===> Shard " , shard_result [1 ], " reached next stage: " , shard_result [2 ])
384
- if target_process .stage == run_remote_lit_test .MultiprocessStages .BOOTING_CHERIBSD :
378
+ mp_debug (args , "===> Shard " , shard_result [1 ], " completed stage: " , shard_result [2 ])
379
+ assert target_process .stage == shard_result [2 ]
380
+ if shard_result [2 ] == run_remote_lit_test .MultiprocessStages .BOOTING_CHERIBSD :
385
381
not_booted_processes .remove (target_process )
386
382
boot_cheribsd .success (
387
383
"Shard " ,
@@ -398,9 +394,12 @@ def run_parallel_impl(
398
394
assert mp_barrier .n_waiting == len (processes ), f"{ mp_barrier .n_waiting } != { len (processes )} "
399
395
mp_barrier .wait (timeout = 10 )
400
396
boot_cheribsd .success ("Barrier has been released, tests should run now." )
401
- # assert target_process.stage < shard_result[2], "STAGE WENT BACKWARDS?"
402
- target_process .stage = shard_result [2 ]
397
+ target_process .stage = shard_result [3 ]
403
398
elif shard_result [0 ] == run_remote_lit_test .FAILURE :
399
+ boot_cheribsd .failure (
400
+ f"ERROR: Shard { target_process } failed in stage: { target_process .stage } " ,
401
+ exit = False ,
402
+ )
404
403
previous_stage = target_process .stage
405
404
target_process .stage = run_remote_lit_test .MultiprocessStages .FAILED
406
405
target_process .error_message = shard_result [2 ]
@@ -423,7 +422,7 @@ def run_parallel_impl(
423
422
shard_result [1 ],
424
423
" failed while running tests: " ,
425
424
shard_result [2 ],
426
- exit = True ,
425
+ exit = False ,
427
426
)
428
427
else :
429
428
boot_cheribsd .failure ("===> FATAL: Received invalid shard result message: " , shard_result , exit = True )
0 commit comments