|
1 | | -import os, typing, shutil, time, itertools |
| 1 | +import os, typing, shutil, time, itertools, subprocess |
2 | 2 | from random import sample, seed |
3 | 3 |
|
4 | 4 | import rich, rich.table |
@@ -194,10 +194,31 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): |
194 | 194 | cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}") |
195 | 195 | return |
196 | 196 |
|
197 | | - cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) |
198 | | - |
199 | 197 | out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt") |
200 | 198 |
|
| 199 | + try: |
| 200 | + cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices) |
| 201 | + except subprocess.TimeoutExpired as exc: |
| 202 | + # Save any partial stdout we have |
| 203 | + partial_output = "" |
| 204 | + if exc.stdout: |
| 205 | + try: |
| 206 | + partial_output = exc.stdout.decode() if isinstance(exc.stdout, bytes) else exc.stdout |
| 207 | + except Exception: |
| 208 | + partial_output = str(exc.stdout) |
| 209 | + |
| 210 | + if partial_output: |
| 211 | + common.file_write(out_filepath, partial_output) |
| 212 | + |
| 213 | + raise MFCException( |
| 214 | + f"Test {case} (2-rank case): Timed out after {ARG('timeout')} seconds.\n" |
| 215 | + f"This suggests the MPI job may have hung or deadlocked.\n" |
| 216 | + f"Partial output (if any) saved to {out_filepath}.\n" |
| 217 | + f"Case dictionary: {case.get_filepath()}.\n" |
| 218 | + f"Command: {' '.join(str(x) for x in exc.cmd)}" |
| 219 | + ) from exc |
| 220 | + |
| 221 | + # On normal completion, write full stdout |
201 | 222 | common.file_write(out_filepath, cmd.stdout) |
202 | 223 |
|
203 | 224 | if cmd.returncode != 0: |
@@ -238,8 +259,30 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): |
238 | 259 |
|
239 | 260 | if ARG("test_all"): |
240 | 261 | case.delete_output() |
241 | | - cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) |
242 | 262 | out_filepath = os.path.join(case.get_dirpath(), "out_post.txt") |
| 263 | + |
| 264 | + try: |
| 265 | + cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices) |
| 266 | + except subprocess.TimeoutExpired as exc: |
| 267 | + # Save any partial stdout we have |
| 268 | + partial_output = "" |
| 269 | + if exc.stdout: |
| 270 | + try: |
| 271 | + partial_output = exc.stdout.decode() if isinstance(exc.stdout, bytes) else exc.stdout |
| 272 | + except Exception: |
| 273 | + partial_output = str(exc.stdout) |
| 274 | + |
| 275 | + if partial_output: |
| 276 | + common.file_write(out_filepath, partial_output) |
| 277 | + |
| 278 | + raise MFCException( |
| 279 | + f"Test {case} (2-rank case, post-process): Timed out after {ARG('timeout')} seconds.\n" |
| 280 | + f"This suggests the MPI job may have hung or deadlocked.\n" |
| 281 | + f"Partial output (if any) saved to {out_filepath}.\n" |
| 282 | + f"Case dictionary: {case.get_filepath()}.\n" |
| 283 | + f"Command: {' '.join(str(x) for x in exc.cmd)}" |
| 284 | + ) from exc |
| 285 | + |
243 | 286 | common.file_write(out_filepath, cmd.stdout) |
244 | 287 |
|
245 | 288 | for silo_filepath in os.listdir(os.path.join(case.get_dirpath(), 'silo_hdf5', 'p0')): |
|
0 commit comments