@@ -13,30 +13,51 @@ RETRY_COUNT=${RETRY_COUNT:-0}
 JOB_ID=${HOSTNAME:-${TORCHX_REPLICA_ID:-unknown}}
 
 is_job_failures_limit_reached() {
-    if [ $TORCHX_MAX_RETRIES -eq 0 ]; then
-        true
+    if [ ! -f "$JOB_RESULTS_FILE" ]; then
+        return 1  # File doesn't exist, limit not reached
+    fi
+    if [ "${TORCHX_MAX_RETRIES:-0}" -eq 0 ]; then
+        return 0  # 0 retries means limit is always reached
     else
-        tail -n $TORCHX_MAX_RETRIES "$JOB_RESULTS_FILE" 2>/dev/null | \
-            awk "/^[[:alnum:]_-]+[[:space:]]+[[:alnum:]]+[[:space:]]+[XF]$/{f++} END{exit !(f>=$TORCHX_MAX_RETRIES)}"
+        tail -n "${TORCHX_MAX_RETRIES}" "$JOB_RESULTS_FILE" 2>/dev/null | \
+            awk "/^[[:alnum:]_-]+[[:space:]]+[[:alnum:]]+[[:space:]]+[XF]$/{f++} END{exit !(f>=${TORCHX_MAX_RETRIES})}"
     fi
 }
+
 is_training_finished() {
-    test -f "$(dirname $JOB_RESULTS_FILE)/$(basename $FAULT_TOL_FINISHED_FLAG_FILE)"
+    test -f "$(dirname "$JOB_RESULTS_FILE")/$(basename "$FAULT_TOL_FINISHED_FLAG_FILE")"
 }
-# Exit immediately if finished flag file exists and this job is a continuation
-if [ "$RETRY_COUNT" -gt 0 ] ; then
-    if is_training_finished ; then echo "Training is finished" ; exit 0 ; fi
-    if is_job_failures_limit_reached ; then echo "Job failures limit reached ($TORCHX_MAX_RETRIES)" ; exit 1 ; fi
-else
+
+# Check if training is already finished
+if is_training_finished ; then
+    echo "Training is finished"
+    exit 0
+fi
+
+# Check if we've hit the failure limit
+if is_job_failures_limit_reached ; then
+    echo "Job failures limit reached (${TORCHX_MAX_RETRIES:-0})"
+    exit 1
+fi
+
+# Only clean up job results on the very first run
+if [ "$RETRY_COUNT" -eq 0 ] ; then
     rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"
 fi
 
+# Ensure directory exists
+mkdir -p "$(dirname "$JOB_RESULTS_FILE")"
+
 # Write unknown job status to the job log, we will fix it at the end
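+# Each line is "<job_id> <retry_count> <status>", where status is X (unknown), S (succeeded), or F (failed)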
 echo "$JOB_ID $RETRY_COUNT X" >> "$JOB_RESULTS_FILE"
 {%- endmacro %}
 
 {% macro ft_launcher_teardown() -%}
 if [ $exitcode -ne 0 ]; then ANY_JOB_STEP_FAILED=1 ; fi
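+# NOTE: $exitcode is assumed to be set by the job step that runs just before this teardown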
+
 # Fix the job log entry ("JOB_ID X" -> "JOB_ID S/F"), depending on the job result
 JOB_ID=${HOSTNAME:-${TORCHX_REPLICA_ID:-unknown}}
 RETRY_COUNT=${RETRY_COUNT:-0}
@@ -47,9 +68,17 @@
     sed -i "s/$JOB_ID $RETRY_COUNT X/$JOB_ID $RETRY_COUNT F/" "$JOB_RESULTS_FILE"
 fi
 
-# On k8s, we exit with the appropriate code and let the retry policy handle resubmission
-# Rather than explicitly requeueing like SLURM
-if ! (is_training_finished || is_job_failures_limit_reached); then
+# Check final state
+if is_training_finished ; then
+    echo "Training completed successfully"
+    exit 0
+elif is_job_failures_limit_reached ; then
+    echo "Job failures limit reached, giving up"
+    exit 1
+else
+    # Training is not finished and we have not hit the retry limit;
+    # exit with a failure code to trigger a pod restart
+    echo "Training incomplete, exiting with code $exitcode to trigger retry"
     exit $exitcode
 fi
 {%- endmacro %}