Skip to content

Commit b5f0a1a

Browse files
committed
fix
Signed-off-by: oliver könig <[email protected]>
1 parent cdffd23 commit b5f0a1a

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

nemo_run/core/execution/templates/ft_launcher_dgxc.j2

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,20 @@
33
# Fault tolerance related items
44
export FAULT_TOL_CFG_PATH="{{fault_tol_cfg_path}}"
55
export FAULT_TOL_FINISHED_FLAG_FILE="{{fault_tol_finished_flag_file}}"
6+
7+
# Job results file path, filled in from the fault_tol_job_results_file template
# variable. Its directory is where is_training_finished looks for the finished
# flag; the file itself is removed below when training is not finished.
JOB_RESULTS_FILE="{{fault_tol_job_results_file}}"
8+
9+
#######################################
# Check whether the "finished" flag file is present next to the job results.
# The flag is looked up by the base name of FAULT_TOL_FINISHED_FLAG_FILE
# inside the directory that holds JOB_RESULTS_FILE.
# Globals:   JOB_RESULTS_FILE (read), FAULT_TOL_FINISHED_FLAG_FILE (read)
# Arguments: none
# Returns:   0 if that path exists and is a regular file, non-zero otherwise
#######################################
is_training_finished() {
  # Quote both expansions: an unquoted path containing whitespace would be
  # split into several operands for dirname/basename and yield a wrong path.
  test -f "$(dirname -- "$JOB_RESULTS_FILE")/$(basename -- "$FAULT_TOL_FINISHED_FLAG_FILE")"
}
12+
13+
# If training has not finished, clear any leftover flag/results files from a
# previous attempt; otherwise report completion and stop the script here.
if ! is_training_finished; then
  rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"
else
  echo "Training is finished"
  exit 0
fi
19+
620
{%- endmacro %}
721

822
{% macro ft_launcher_teardown() -%}

0 commit comments

Comments
 (0)