Commit cdffd23

cleanup
Signed-off-by: oliver könig <[email protected]>
1 parent 6d3c34f commit cdffd23

File tree

1 file changed (+1, -77 lines)


nemo_run/core/execution/templates/ft_launcher_dgxc.j2

Lines changed: 1 addition & 77 deletions
@@ -3,84 +3,8 @@
 # Fault tolerance related items
 export FAULT_TOL_CFG_PATH="{{fault_tol_cfg_path}}"
 export FAULT_TOL_FINISHED_FLAG_FILE="{{fault_tol_finished_flag_file}}"
-ANY_JOB_STEP_FAILED=0
-export TORCHX_MAX_RETRIES=3
-
-# Automatic job resubmission related items
-JOB_RESULTS_FILE="{{fault_tol_job_results_file}}"
-# For k8s, we use pod restart count or a custom retry counter
-RETRY_COUNT=${RETRY_COUNT:-0}
-# Use a unique identifier for this job/pod
-JOB_ID=${HOSTNAME:-${TORCHX_REPLICA_ID:-unknown}}
-
-is_job_failures_limit_reached() {
-    # If TORCHX_MAX_RETRIES is 0 or unset, never reach the limit (infinite retries)
-    if [ "${TORCHX_MAX_RETRIES:-0}" -eq 0 ]; then
-        return 1 # Limit not reached, allow retries
-    fi
-
-    # If job results file doesn't exist yet, limit not reached
-    if [ ! -f "$JOB_RESULTS_FILE" ]; then
-        return 1
-    fi
-
-    # Check if we have TORCHX_MAX_RETRIES failures in the log
-    tail -n "${TORCHX_MAX_RETRIES}" "$JOB_RESULTS_FILE" 2>/dev/null | \
-        awk "/^[[:alnum:]_-]+[[:space:]]+[[:alnum:]]+[[:space:]]+[XF]$/{f++} END{exit !(f>=${TORCHX_MAX_RETRIES})}"
-}
-
-is_training_finished() {
-    test -f "$(dirname "$JOB_RESULTS_FILE")/$(basename "$FAULT_TOL_FINISHED_FLAG_FILE")"
-}
-
-# Check if training is already finished
-if is_training_finished ; then
-    echo "Training is finished"
-    exit 0
-fi
-
-# Check if we've hit the failure limit
-if is_job_failures_limit_reached ; then
-    echo "Job failures limit reached (${TORCHX_MAX_RETRIES:-0})"
-    exit 1
-fi
-
-# Only clean up job results on the very first run
-if [ "$RETRY_COUNT" -eq 0 ] ; then
-    rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"
-fi
-
-# Ensure directory exists
-mkdir -p "$(dirname "$JOB_RESULTS_FILE")"
-
-# Write unknown job status to the job log, we will fix it at the end
-echo "$JOB_ID $RETRY_COUNT X" >> "$JOB_RESULTS_FILE"
 {%- endmacro %}
 
 {% macro ft_launcher_teardown() -%}
-if [ $exitcode -ne 0 ]; then ANY_JOB_STEP_FAILED=1 ; fi
-
-# Fix the job log entry ("JOB_ID X" -> "JOB_ID S/F"), depending on the job result
-JOB_ID=${HOSTNAME:-${TORCHX_REPLICA_ID:-unknown}}
-RETRY_COUNT=${RETRY_COUNT:-0}
-
-if [ "$ANY_JOB_STEP_FAILED" = "0" ] ; then
-    sed -i "s/$JOB_ID $RETRY_COUNT X/$JOB_ID $RETRY_COUNT S/" "$JOB_RESULTS_FILE"
-else
-    sed -i "s/$JOB_ID $RETRY_COUNT X/$JOB_ID $RETRY_COUNT F/" "$JOB_RESULTS_FILE"
-fi
-
-# Check final state
-if is_training_finished ; then
-    echo "Training completed successfully"
-    exit 0
-elif is_job_failures_limit_reached ; then
-    echo "Job failures limit reached, giving up"
-    exit 1
-else
-    # Training not finished and we haven't hit retry limit
-    # Exit with failure code to trigger pod restart
-    echo "Training incomplete, exiting with code $exitcode to trigger retry"
-    exit $exitcode
-fi
+exit $exitcode
 {%- endmacro %}
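
For reference, here is a sketch of the touched region of ft_launcher_dgxc.j2 as it reads after this cleanup, reconstructed only from the context and added lines of the hunk above (file lines 1-2 sit outside the hunk and are not shown):

    # Fault tolerance related items
    export FAULT_TOL_CFG_PATH="{{fault_tol_cfg_path}}"
    export FAULT_TOL_FINISHED_FLAG_FILE="{{fault_tol_finished_flag_file}}"
    {%- endmacro %}

    {% macro ft_launcher_teardown() -%}
    exit $exitcode
    {%- endmacro %}

In other words, the setup macro now only exports the two fault-tolerance environment variables, and the teardown macro simply propagates $exitcode; the job-results bookkeeping and retry-limit checks previously embedded in this template are removed.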
