|
 # Fault tolerance related items
 export FAULT_TOL_CFG_PATH="{{fault_tol_cfg_path}}"
 export FAULT_TOL_FINISHED_FLAG_FILE="{{fault_tol_finished_flag_file}}"
-ANY_JOB_STEP_FAILED=0
-export TORCHX_MAX_RETRIES=3
-
-# Automatic job resubmission related items
-JOB_RESULTS_FILE="{{fault_tol_job_results_file}}"
-# For k8s, we use the pod restart count or a custom retry counter
-RETRY_COUNT=${RETRY_COUNT:-0}
-# Use a unique identifier for this job/pod
-JOB_ID=${HOSTNAME:-${TORCHX_REPLICA_ID:-unknown}}
-
-is_job_failures_limit_reached() {
-    # If TORCHX_MAX_RETRIES is 0 or unset, never reach the limit (infinite retries)
-    if [ "${TORCHX_MAX_RETRIES:-0}" -eq 0 ]; then
-        return 1  # Limit not reached, allow retries
-    fi
-
-    # If the job results file doesn't exist yet, the limit is not reached
-    if [ ! -f "$JOB_RESULTS_FILE" ]; then
-        return 1
-    fi
-
-    # Check whether the last TORCHX_MAX_RETRIES entries in the log are all failures
-    tail -n "${TORCHX_MAX_RETRIES}" "$JOB_RESULTS_FILE" 2>/dev/null | \
-        awk "/^[[:alnum:]_-]+[[:space:]]+[[:alnum:]]+[[:space:]]+[XF]$/{f++} END{exit !(f>=${TORCHX_MAX_RETRIES})}"
-}
-
-is_training_finished() {
-    test -f "$(dirname "$JOB_RESULTS_FILE")/$(basename "$FAULT_TOL_FINISHED_FLAG_FILE")"
-}
-
-# Check if training is already finished
-if is_training_finished ; then
-    echo "Training is finished"
-    exit 0
-fi
-
-# Check if we've hit the failure limit
-if is_job_failures_limit_reached ; then
-    echo "Job failures limit reached (${TORCHX_MAX_RETRIES:-0})"
-    exit 1
-fi
-
-# Only clean up job results on the very first run
-if [ "$RETRY_COUNT" -eq 0 ] ; then
-    rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"
-fi
-
-# Ensure the job results directory exists
-mkdir -p "$(dirname "$JOB_RESULTS_FILE")"
-
-# Write an unknown (X) job status to the job log; it is fixed up at the end
-echo "$JOB_ID $RETRY_COUNT X" >> "$JOB_RESULTS_FILE"
 {%- endmacro %}
 
|
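For reference, the setup logic removed above keeps one "<job_id> <retry_count> <status>" line per attempt in $JOB_RESULTS_FILE, where the status is X (unknown, still in progress), S (success) or F (failure), and treats the failure limit as reached when the last $TORCHX_MAX_RETRIES entries are all X or F. The standalone sketch below is illustrative only and not part of the template; the pod name, retry counts, and temp file are invented for the example.

#!/usr/bin/env bash
# Illustrative sketch only (not part of the template): exercise the
# results-file bookkeeping and failure-limit check in isolation.
RESULTS=$(mktemp)        # stand-in for $JOB_RESULTS_FILE
MAX_RETRIES=3            # stand-in for $TORCHX_MAX_RETRIES

limit_reached() {
    # Reached when the most recent MAX_RETRIES entries are all X or F,
    # i.e. none of the last attempts succeeded.
    [ "$MAX_RETRIES" -eq 0 ] && return 1
    [ -f "$RESULTS" ] || return 1
    tail -n "$MAX_RETRIES" "$RESULTS" | \
        awk -v n="$MAX_RETRIES" \
            '/^[[:alnum:]_-]+[[:space:]]+[[:alnum:]]+[[:space:]]+[XF]$/ {f++} END {exit !(f>=n)}'
}

echo "pod-0 0 S" >> "$RESULTS"
echo "pod-0 1 F" >> "$RESULTS"
echo "pod-0 2 F" >> "$RESULTS"
limit_reached; echo "after 2 failures: $?"   # 1 -> limit not reached, retry allowed

echo "pod-0 3 F" >> "$RESULTS"
limit_reached; echo "after 3 failures: $?"   # 0 -> limit reached, give up

rm -f "$RESULTS"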
 {% macro ft_launcher_teardown() -%}
-if [ $exitcode -ne 0 ]; then ANY_JOB_STEP_FAILED=1 ; fi
-
-# Fix the job log entry ("JOB_ID X" -> "JOB_ID S/F"), depending on the job result
-JOB_ID=${HOSTNAME:-${TORCHX_REPLICA_ID:-unknown}}
-RETRY_COUNT=${RETRY_COUNT:-0}
-
-if [ "$ANY_JOB_STEP_FAILED" = "0" ] ; then
-    sed -i "s/$JOB_ID $RETRY_COUNT X/$JOB_ID $RETRY_COUNT S/" "$JOB_RESULTS_FILE"
-else
-    sed -i "s/$JOB_ID $RETRY_COUNT X/$JOB_ID $RETRY_COUNT F/" "$JOB_RESULTS_FILE"
-fi
-
-# Check final state
-if is_training_finished ; then
-    echo "Training completed successfully"
-    exit 0
-elif is_job_failures_limit_reached ; then
-    echo "Job failures limit reached, giving up"
-    exit 1
-else
-    # Training not finished and we haven't hit retry limit
-    # Exit with failure code to trigger pod restart
-    echo "Training incomplete, exiting with code $exitcode to trigger retry"
-    exit $exitcode
-fi
+exit $exitcode
 {%- endmacro %}
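The replacement teardown simply propagates the job step's exit code, presumably leaving retry and resubmission decisions to whatever launched the pod. For comparison, the removed teardown rewrote the X placeholder written at setup time once the result was known. A minimal, self-contained sketch of that fix-up (ids, paths, and the exit code are invented; GNU sed is assumed for -i):

#!/usr/bin/env bash
# Illustrative sketch only (not part of the template): rewrite the "X"
# placeholder written by the setup macro to S or F based on the exit code.
RESULTS=$(mktemp)
JOB_ID="pod-0"
RETRY_COUNT=2
exitcode=1                                   # pretend the job step failed

echo "$JOB_ID $RETRY_COUNT X" >> "$RESULTS"  # written before the job step ran

if [ "$exitcode" -eq 0 ]; then STATUS=S; else STATUS=F; fi
sed -i "s/^$JOB_ID $RETRY_COUNT X$/$JOB_ID $RETRY_COUNT $STATUS/" "$RESULTS"

cat "$RESULTS"                               # -> pod-0 2 F
rm -f "$RESULTS"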