cleanup

ko3n1g · ko3n1g · commit eb98cb51cc8b · 2025-12-12T00:43:38.000Z
Signed-off-by: oliver könig <okoenig@nvidia.com>
diff --git a/nemo_run/core/execution/templates/dgxc.sh.j2 b/nemo_run/core/execution/templates/dgxc.sh.j2
@@ -1,47 +1,31 @@
 {%- import "ft_launcher_dgxc.j2" as fault_tolerance -%}
 #!/bin/bash
-#
-# Generated by NeMo Run for Kubernetes (PyTorchJob)
-#
 
-# 1. Basic Shell Setup
 set -evx  # Print commands, but DO NOT exit immediately on error (we handle that below)
 export PYTHONUNBUFFERED=1
 export TORCHX_MAX_RETRIES={{max_retries}}
 
-# 2. Environment Variables
-# These are strictly user-defined vars (e.g. HYDRA_FULL_ERROR).
-# Note: MASTER_ADDR, WORLD_SIZE, RANK are injected automatically by the PyTorchJob operator.
 {%- for env_var in env_vars %}
 {{env_var}}
 {%- endfor %}
 
-# 3. Fault Tolerance: SETUP (Check-in)
-# Checks if we are resuming or if we are already finished.
 {%- if ft_enabled %}
 {{ fault_tolerance.ft_launcher_setup(fault_tol_cfg_path, fault_tol_finished_flag_file, fault_tol_job_results_file) }}
 {%- endif %}
 
-# 4. Main Execution
-# In PyTorchJob, we usually have exactly one main command (torchrun).
-# We assume the variable 'training_command' contains the full torchrun string.
-
 echo "Starting training command..."
 set +e # Turn off auto-exit so we can capture the code
-# ---------------------------------------------------------
+
 {{ training_command }}
-# ---------------------------------------------------------
+
 exitcode=$?
 set -e
 
 echo "Main command exited with code $exitcode"
 
-# 5. Fault Tolerance: TEARDOWN (Check-out)
-# Decides if we should exit 0 (complete) or exit 1 (retry via K8s backoffLimit).
 {%- if ft_enabled %}
 {{ fault_tolerance.ft_launcher_teardown() }}
 {%- else %}
-# If FT is disabled, simply pass the exit code through.
-# K8s will restart if exitcode != 0 and backoffLimit > 0.
+
 exit $exitcode
 {%- endif %}
diff --git a/nemo_run/core/execution/templates/ft_launcher_dgxc.j2 b/nemo_run/core/execution/templates/ft_launcher_dgxc.j2
@@ -1,36 +1,20 @@
 {% macro ft_launcher_setup(fault_tol_cfg_path, fault_tol_finished_flag_file, fault_tol_job_results_file) -%}
-# -------------------------------------------------------------------------
-# K8s Fault Tolerance Setup (The "Check-In" Desk)
-# -------------------------------------------------------------------------
 
-# 1. Export Paths
-# IMPORTANT: These paths must reside on a ReadWriteMany (RWX) Persistent Volume
-# mounted to all Pods so state is preserved across pod restarts/rescheduling.
 export FAULT_TOL_CFG_PATH="{{fault_tol_cfg_path}}"
 export FAULT_TOL_FINISHED_FLAG_FILE="{{fault_tol_finished_flag_file}}"
 export FAULT_TOL_JOB_RESULTS_FILE="{{fault_tol_job_results_file}}"
 
-# 2. Define Helper Functions
 is_training_finished() {
     test -f "$FAULT_TOL_FINISHED_FLAG_FILE"
 }
 
-# 3. Check for Previous Success
-# In K8s, a Pod might be restarted due to node maintenance even if the job
-# logic was done. If the flag file exists, we exit immediately with 0.
 if is_training_finished ; then
     echo "[FT-Setup] Found finished flag at $FAULT_TOL_FINISHED_FLAG_FILE."
     echo "[FT-Setup] Training is already complete. Exiting successfully."
     exit 0
 fi
 
-# 4. Logging Start
-# We use HOSTNAME (usually pod-name) as the identifier since SLURM_JOB_ID is gone.
-# We append 'X' (Running/Unknown) to the log.
 echo "[FT-Setup] Starting training on $(hostname)..."
-# Optional: Log attempt to shared file (Using X for Running)
-# Note: In high-scale K8s, writing to a single file from 1000 pods can cause lock contention.
-# If scale is small, this is fine.
 if [ -n "$FAULT_TOL_JOB_RESULTS_FILE" ]; then
     mkdir -p "$(dirname "$FAULT_TOL_JOB_RESULTS_FILE")"
     echo "$(hostname) $(date +%s) X" >> "$FAULT_TOL_JOB_RESULTS_FILE"
@@ -39,49 +23,28 @@ fi
 {%- endmacro %}
 
 {% macro ft_launcher_teardown() -%}
-# -------------------------------------------------------------------------
-# K8s Fault Tolerance Teardown (The "Check-Out" Desk)
-# -------------------------------------------------------------------------
-
-# 1. Analyze Exit Code from the Main Command
-# 'exitcode' is captured in the main script before calling this macro.
 if [ "$exitcode" -eq "0" ]; then
     RESULT_STATUS="S" # Success
 else
     RESULT_STATUS="F" # Failure
 fi
 
-# 2. Update Log (Optional but helpful for debugging)
 if [ -n "$FAULT_TOL_JOB_RESULTS_FILE" ]; then
-    # We update the specific entry for this host from X to S or F
-    # Note: 'sed -i' on a shared PVC can be risky with concurrency.
-    # Appending a new status line is safer in K8s.
     mkdir -p "$(dirname "$FAULT_TOL_JOB_RESULTS_FILE")"
-
     echo "$(hostname) $(date +%s) $RESULT_STATUS" >> "$FAULT_TOL_JOB_RESULTS_FILE"
 fi
 
-# 3. The Requeue Decision Logic
 if [ "$exitcode" -eq "0" ]; then
-    # Case A: Script exited successfully.
-    # Verification: Did it actually finish (create the flag file)?
     if is_training_finished; then
         echo "[FT-Teardown] Job finished successfully and flag file exists."
         exit 0
     else
-        # Edge Case: The python script exited 0, but didn't write the flag file.
-        # This usually means a silent crash or partial run. We must force a retry.
         echo "[FT-Teardown] WARNING: Process exited 0 but finished flag is MISSING."
         echo "[FT-Teardown] Forcing exit 1 to trigger Kubernetes restart."
         exit 1
     fi
 else
-    # Case B: Script crashed (exitcode != 0).
     echo "[FT-Teardown] Job failed with exit code $exitcode."
-
-    # We exit with the error code.
-    # The K8s 'backoffLimit' (in PyTorchJob spec) will determine if we restart.
-    # We do NOT calculate retry counts manually here.
     exit $exitcode
 fi
 {%- endmacro %}