Skip to content

Commit eb98cb5

Browse files
committed
cleanup
Signed-off-by: oliver könig <[email protected]>
1 parent 620e073 commit eb98cb5

File tree

2 files changed

+3
-56
lines changed

2 files changed

+3
-56
lines changed
Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,31 @@
11
{%- import "ft_launcher_dgxc.j2" as fault_tolerance -%}
22
#!/bin/bash
3-
#
4-
# Generated by NeMo Run for Kubernetes (PyTorchJob)
5-
#
63

7-
# 1. Basic Shell Setup
84
set -evx # Echo and trace commands; exit on error (auto-exit is suspended with 'set +e' around the training command below so its exit code can be captured)
95
export PYTHONUNBUFFERED=1
106
export TORCHX_MAX_RETRIES={{max_retries}}
117

12-
# 2. Environment Variables
13-
# These are strictly user-defined vars (e.g. HYDRA_FULL_ERROR).
14-
# Note: MASTER_ADDR, WORLD_SIZE, RANK are injected automatically by the PyTorchJob operator.
158
{%- for env_var in env_vars %}
169
{{env_var}}
1710
{%- endfor %}
1811

19-
# 3. Fault Tolerance: SETUP (Check-in)
20-
# Checks if we are resuming or if we are already finished.
2112
{%- if ft_enabled %}
2213
{{ fault_tolerance.ft_launcher_setup(fault_tol_cfg_path, fault_tol_finished_flag_file, fault_tol_job_results_file) }}
2314
{%- endif %}
2415

25-
# 4. Main Execution
26-
# In PyTorchJob, we usually have exactly one main command (torchrun).
27-
# We assume the variable 'training_command' contains the full torchrun string.
28-
2916
echo "Starting training command..."
3017
set +e # Turn off auto-exit so we can capture the code
31-
# ---------------------------------------------------------
18+
3219
{{ training_command }}
33-
# ---------------------------------------------------------
20+
3421
exitcode=$?
3522
set -e
3623

3724
echo "Main command exited with code $exitcode"
3825

39-
# 5. Fault Tolerance: TEARDOWN (Check-out)
40-
# Decides if we should exit 0 (complete) or exit 1 (retry via K8s backoffLimit).
4126
{%- if ft_enabled %}
4227
{{ fault_tolerance.ft_launcher_teardown() }}
4328
{%- else %}
44-
# If FT is disabled, simply pass the exit code through.
45-
# K8s will restart if exitcode != 0 and backoffLimit > 0.
29+
4630
exit $exitcode
4731
{%- endif %}
Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,20 @@
11
{% macro ft_launcher_setup(fault_tol_cfg_path, fault_tol_finished_flag_file, fault_tol_job_results_file) -%}
2-
# -------------------------------------------------------------------------
3-
# K8s Fault Tolerance Setup (The "Check-In" Desk)
4-
# -------------------------------------------------------------------------
52

6-
# 1. Export Paths
7-
# IMPORTANT: These paths must reside on a ReadWriteMany (RWX) Persistent Volume
8-
# mounted to all Pods so state is preserved across pod restarts/rescheduling.
93
export FAULT_TOL_CFG_PATH="{{fault_tol_cfg_path}}"
104
export FAULT_TOL_FINISHED_FLAG_FILE="{{fault_tol_finished_flag_file}}"
115
export FAULT_TOL_JOB_RESULTS_FILE="{{fault_tol_job_results_file}}"
126

13-
# 2. Define Helper Functions
147
is_training_finished() {
158
test -f "$FAULT_TOL_FINISHED_FLAG_FILE"
169
}
1710

18-
# 3. Check for Previous Success
19-
# In K8s, a Pod might be restarted due to node maintenance even if the job
20-
# logic was done. If the flag file exists, we exit immediately with 0.
2111
if is_training_finished ; then
2212
echo "[FT-Setup] Found finished flag at $FAULT_TOL_FINISHED_FLAG_FILE."
2313
echo "[FT-Setup] Training is already complete. Exiting successfully."
2414
exit 0
2515
fi
2616

27-
# 4. Logging Start
28-
# We use HOSTNAME (usually pod-name) as the identifier since SLURM_JOB_ID is gone.
29-
# We append 'X' (Running/Unknown) to the log.
3017
echo "[FT-Setup] Starting training on $(hostname)..."
31-
# Optional: Log attempt to shared file (Using X for Running)
32-
# Note: In high-scale K8s, writing to a single file from 1000 pods can cause lock contention.
33-
# If scale is small, this is fine.
3418
if [ -n "$FAULT_TOL_JOB_RESULTS_FILE" ]; then
3519
mkdir -p "$(dirname "$FAULT_TOL_JOB_RESULTS_FILE")"
3620
echo "$(hostname) $(date +%s) X" >> "$FAULT_TOL_JOB_RESULTS_FILE"
@@ -39,49 +23,28 @@ fi
3923
{%- endmacro %}
4024

4125
{% macro ft_launcher_teardown() -%}
42-
# -------------------------------------------------------------------------
43-
# K8s Fault Tolerance Teardown (The "Check-Out" Desk)
44-
# -------------------------------------------------------------------------
45-
46-
# 1. Analyze Exit Code from the Main Command
47-
# 'exitcode' is captured in the main script before calling this macro.
4826
if [ "$exitcode" -eq "0" ]; then
4927
RESULT_STATUS="S" # Success
5028
else
5129
RESULT_STATUS="F" # Failure
5230
fi
5331

54-
# 2. Update Log (Optional but helpful for debugging)
5532
if [ -n "$FAULT_TOL_JOB_RESULTS_FILE" ]; then
56-
# We update the specific entry for this host from X to S or F
57-
# Note: 'sed -i' on a shared PVC can be risky with concurrency.
58-
# Appending a new status line is safer in K8s.
5933
mkdir -p "$(dirname "$FAULT_TOL_JOB_RESULTS_FILE")"
60-
6134
echo "$(hostname) $(date +%s) $RESULT_STATUS" >> "$FAULT_TOL_JOB_RESULTS_FILE"
6235
fi
6336

64-
# 3. The Requeue Decision Logic
6537
if [ "$exitcode" -eq "0" ]; then
66-
# Case A: Script exited successfully.
67-
# Verification: Did it actually finish (create the flag file)?
6838
if is_training_finished; then
6939
echo "[FT-Teardown] Job finished successfully and flag file exists."
7040
exit 0
7141
else
72-
# Edge Case: The python script exited 0, but didn't write the flag file.
73-
# This usually means a silent crash or partial run. We must force a retry.
7442
echo "[FT-Teardown] WARNING: Process exited 0 but finished flag is MISSING."
7543
echo "[FT-Teardown] Forcing exit 1 to trigger Kubernetes restart."
7644
exit 1
7745
fi
7846
else
79-
# Case B: Script crashed (exitcode != 0).
8047
echo "[FT-Teardown] Job failed with exit code $exitcode."
81-
82-
# We exit with the error code.
83-
# The K8s 'backoffLimit' (in PyTorchJob spec) will determine if we restart.
84-
# We do NOT calculate retry counts manually here.
8548
exit $exitcode
8649
fi
8750
{%- endmacro %}

0 commit comments

Comments
 (0)