File tree (Expand file tree / Collapse file tree): 1 file changed, +16 -1 lines changed
nemo_run/core/execution/templates (Expand file tree / Collapse file tree): 1 file changed, +16 -1 lines changed
Original file line number | Diff line number | Diff line change
@@ -66,4 +66,19 @@ if [ "$exitcode" -eq "0" ]; then
6666 echo "[FT-Teardown] Job finished successfully and flag file exists."
6767 exit 0
6868 else
69- # Edge Case: The python script exited 0, but didn't write the flag
69+ # Edge Case: The python script exited 0, but didn't write the flag file.
70+ # This usually means a silent crash or partial run. We must force a retry.
71+ echo "[FT-Teardown] WARNING: Process exited 0 but finished flag is MISSING."
72+ echo "[FT-Teardown] Forcing exit 1 to trigger Kubernetes restart."
73+ exit 1
74+ fi
75+ else
76+ # Case B: Script crashed (exitcode != 0).
77+ echo "[FT-Teardown] Job failed with exit code $exitcode."
78+
79+ # We exit with the error code.
80+ # The K8s 'backoffLimit' (in PyTorchJob spec) will determine if we restart.
81+ # We do NOT calculate retry counts manually here.
82+ exit $exitcode
83+ fi
84+ {% - endmacro %}
You can’t perform that action at this time.
0 commit comments