File tree: 1 file changed, +2 -6 lines changed
nemo_run/core/execution/templates: 1 file changed, +2 -6 lines changed
Original file line number | Diff line number | Diff line change
@@ -19,12 +19,8 @@ is_training_finished() {
19 19 test -f "$(dirname $JOB_RESULTS_FILE)/$(basename $FAULT_TOL_FINISHED_FLAG_FILE)"
20 20 }
21 21 # Exit immediately if finished flag file exists and this job is a continuation
22- if [ -v RETRY_COUNT ] && [ "$RETRY_COUNT" -gt 0 ] ; then
23- if is_training_finished ; then echo "Training is finished" ; exit 0 ; fi
24- if is_job_failures_limit_reached ; then echo "Job failures limit reached ($TORCHX_MAX_RETRIES)" ; exit 1 ; fi
25- else
26- rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"
27- fi
22+ if is_training_finished ; then echo "Training is finished" ; exit 0 ; fi
23+ if is_job_failures_limit_reached ; then echo "Job failures limit reached ($TORCHX_MAX_RETRIES)" ; exit 1 ; fi
28 24
29 25 # Write unknown job status to the job log, we will fix it at the end
30 26 echo "$TORCHX_REPLICA_ID ${RETRY_COUNT:-0} X" >> "$JOB_RESULTS_FILE"
You can’t perform that action at this time.
0 commit comments