We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent cdffd23 · commit b5f0a1a · Copy full SHA for b5f0a1a
nemo_run/core/execution/templates/ft_launcher_dgxc.j2
@@ -3,6 +3,20 @@
3
# Fault tolerance related items
# The {{...}} placeholders are Jinja template variables; this file is a .j2
# template, so they are presumably substituted when the template is rendered.
4
export FAULT_TOL_CFG_PATH="{{fault_tol_cfg_path}}"
5
export FAULT_TOL_FINISHED_FLAG_FILE="{{fault_tol_finished_flag_file}}"
6
+
7
# JOB_RESULTS_FILE is not exported. is_training_finished uses only its
# directory part, and the else-branch below removes the file itself.
+JOB_RESULTS_FILE="{{fault_tol_job_results_file}}"
8
9
#######################################
# Check whether the finished-flag file is present next to the job results.
# Globals:
#   JOB_RESULTS_FILE (read)             - only its directory part is used
#   FAULT_TOL_FINISHED_FLAG_FILE (read) - only its file-name part is used
# Returns:
#   0 if a regular file named like the finished flag exists in the directory
#   of JOB_RESULTS_FILE, non-zero otherwise.
#######################################
is_training_finished() {
  # Quote the inner expansions: unquoted, a path containing whitespace or
  # glob characters is word-split into several dirname/basename operands and
  # the resulting test path is wrong.
  test -f "$(dirname -- "$JOB_RESULTS_FILE")/$(basename -- "$FAULT_TOL_FINISHED_FLAG_FILE")"
}
12
13
# When the finished flag is not found, delete the flag file and the job
# results file (rm -f ignores missing files). When it is found, report that
# and exit with status 0.
if ! is_training_finished; then
  rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"
else
  echo "Training is finished"
  exit 0
fi
19
20
{%- endmacro %}
21
22
{% macro ft_launcher_teardown() -%}
0 commit comments