Skip to content

Commit 5f2fb8a

Browse files
committed
test
Signed-off-by: oliver könig <[email protected]>
1 parent bd000a4 commit 5f2fb8a

File tree

1 file changed

+39
-13
lines changed

1 file changed

+39
-13
lines changed

nemo_run/core/execution/templates/ft_launcher_dgxc.j2

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,30 +13,48 @@ RETRY_COUNT=${RETRY_COUNT:-0}
1313
JOB_ID=${HOSTNAME:-${TORCHX_REPLICA_ID:-unknown}}
1414

1515
# Decide whether the job has used up its retry budget.
#
# The budget is exhausted when each of the last TORCHX_MAX_RETRIES entries in
# the results file is marked X (status still unknown) or F (failed).
#
# Globals:
#   JOB_RESULTS_FILE   (read) log with one "<job id> <retry count> <status>"
#                      line per attempt
#   TORCHX_MAX_RETRIES (read) retry budget; unset or empty counts as 0
# Returns:
#   0 if the limit has been reached, non-zero otherwise
is_job_failures_limit_reached() {
    local max_retries="${TORCHX_MAX_RETRIES:-0}"

    # Without a results file no attempt has been recorded yet, so the limit
    # cannot have been reached. This deliberately wins over the "0 retries"
    # rule below, otherwise the very first attempt could never start.
    if [ ! -f "$JOB_RESULTS_FILE" ]; then
        return 1
    fi

    # Anything that is not a plain non-negative integer would break both the
    # numeric test and "tail -n"; fall back to the strictest setting instead.
    case "$max_retries" in
        *[!0-9]*)
            echo "Invalid TORCHX_MAX_RETRIES '$max_retries', treating it as 0" >&2
            max_retries=0
            ;;
    esac

    # A budget of 0 means a recorded attempt is never retried.
    if [ "$max_retries" -eq 0 ]; then
        return 0
    fi

    # Hand the limit to awk with -v rather than splicing it into the program
    # text, so the awk source stays a constant, single-quoted string.
    tail -n "$max_retries" "$JOB_RESULTS_FILE" 2>/dev/null | \
        awk -v limit="$max_retries" \
            '/^[[:alnum:]_-]+[[:space:]]+[[:alnum:]]+[[:space:]]+[XF]$/ { failed++ } END { exit !(failed >= limit) }'
}
26+
2327
# Report whether training has been marked as complete.
#
# The flag is looked up by its file name inside the directory that holds
# JOB_RESULTS_FILE, not at the full FAULT_TOL_FINISHED_FLAG_FILE path.
#
# Globals:
#   JOB_RESULTS_FILE             (read) only its directory part is used
#   FAULT_TOL_FINISHED_FLAG_FILE (read) only its file name part is used
# Returns:
#   0 if the flag file exists, non-zero otherwise
is_training_finished() {
    # Every expansion is quoted so paths containing whitespace stay intact;
    # "--" stops a path beginning with "-" from being read as an option.
    test -f "$(dirname -- "$JOB_RESULTS_FILE")/$(basename -- "$FAULT_TOL_FINISHED_FLAG_FILE")"
}

# Start-up gate: stop right away when there is nothing left to do.
# NOTE(review): both checks run before the first-run cleanup below, so a flag
# or results file left over from an earlier launch is honoured even when
# RETRY_COUNT is 0 - confirm this is intended.
if is_training_finished; then
    echo "Training is finished"
    exit 0
elif is_job_failures_limit_reached; then
    echo "Job failures limit reached (${TORCHX_MAX_RETRIES:-0})"
    exit 1
fi

# A first attempt (retry count 0) starts from a clean slate.
[ "$RETRY_COUNT" -eq 0 ] && rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"

# The results file is appended to below, so its directory has to exist.
mkdir -p "$(dirname "$JOB_RESULTS_FILE")"

# Record this attempt with the placeholder status X; the teardown step
# rewrites the status once the outcome is known.
echo "$JOB_ID $RETRY_COUNT X" >> "$JOB_RESULTS_FILE"
3653
{%- endmacro %}
3754

3855
{% macro ft_launcher_teardown() -%}
3956
if [ $exitcode -ne 0 ]; then ANY_JOB_STEP_FAILED=1 ; fi
57+
4058
# Fix the job log entry ("JOB_ID RETRY_COUNT X" -> "JOB_ID RETRY_COUNT S/F"), depending on the job result
4159
JOB_ID=${HOSTNAME:-${TORCHX_REPLICA_ID:-unknown}}
4260
RETRY_COUNT=${RETRY_COUNT:-0}
@@ -47,9 +65,17 @@ else
4765
sed -i "s/$JOB_ID $RETRY_COUNT X/$JOB_ID $RETRY_COUNT F/" "$JOB_RESULTS_FILE"
4866
fi
4967

# Final verdict for this attempt, expressed through the exit code.

# Training is done: report success.
if is_training_finished; then
    echo "Training completed successfully"
    exit 0
fi

# Retry budget is spent: report a hard failure.
if is_job_failures_limit_reached; then
    echo "Job failures limit reached, giving up"
    exit 1
fi

# Neither finished nor out of retries: pass the job's own exit code on so the
# surrounding retry policy can restart the pod.
# NOTE(review): if $exitcode is 0 here the script exits successfully although
# training is incomplete - confirm the retry policy copes with that.
echo "Training incomplete, exiting with code $exitcode to trigger retry"
exit $exitcode
5581
{%- endmacro %}

0 commit comments

Comments
 (0)