Skip to content

Commit c7ab843

Browse files
committed
fix
Signed-off-by: oliver könig <[email protected]>
1 parent 096e0af commit c7ab843

File tree

1 file changed

+2
-6
lines changed

1 file changed

+2
-6
lines changed

nemo_run/core/execution/templates/ft_launcher_dgxc.j2

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -19,12 +19,8 @@ is_training_finished() {
1919
test -f "$(dirname $JOB_RESULTS_FILE)/$(basename $FAULT_TOL_FINISHED_FLAG_FILE)"
2020
}
2121
# Exit immediately if finished flag file exists and this job is a continuation
22-
if [ -v RETRY_COUNT ] && [ "$RETRY_COUNT" -gt 0 ] ; then
23-
if is_training_finished ; then echo "Training is finished" ; exit 0 ; fi
24-
if is_job_failures_limit_reached ; then echo "Job failures limit reached ($TORCHX_MAX_RETRIES)" ; exit 1 ; fi
25-
else
26-
rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"
27-
fi
22+
if is_training_finished ; then echo "Training is finished" ; exit 0 ; fi
23+
if is_job_failures_limit_reached ; then echo "Job failures limit reached ($TORCHX_MAX_RETRIES)" ; exit 1 ; fi
2824

2925
# Write unknown job status to the job log, we will fix it at the end
3026
echo "$TORCHX_REPLICA_ID ${RETRY_COUNT:-0} X" >> "$JOB_RESULTS_FILE"

0 commit comments

Comments (0)