|
1 | 1 | {% macro ft_launcher_setup(fault_tol_cfg_path, fault_tol_finished_flag_file, fault_tol_job_results_file) -%} |
2 | | -# ------------------------------------------------------------------------- |
3 | | -# K8s Fault Tolerance Setup (The "Check-In" Desk) |
4 | | -# ------------------------------------------------------------------------- |
5 | 2 |
|
6 | | -# 1. Export Paths |
7 | | -# IMPORTANT: These paths must reside on a ReadWriteMany (RWX) Persistent Volume |
8 | | -# mounted to all Pods so state is preserved across pod restarts/rescheduling. |
9 | 3 | export FAULT_TOL_CFG_PATH="{{fault_tol_cfg_path}}" |
10 | 4 | export FAULT_TOL_FINISHED_FLAG_FILE="{{fault_tol_finished_flag_file}}" |
11 | 5 | export FAULT_TOL_JOB_RESULTS_FILE="{{fault_tol_job_results_file}}" |
12 | 6 |
|
13 | | -# 2. Define Helper Functions |
14 | 7 | is_training_finished() { |
15 | 8 | test -f "$FAULT_TOL_FINISHED_FLAG_FILE" |
16 | 9 | } |
17 | 10 |
|
18 | | -# 3. Check for Previous Success |
19 | | -# In K8s, a Pod might be restarted due to node maintenance even if the job |
20 | | -# logic was done. If the flag file exists, we exit immediately with 0. |
21 | 11 | if is_training_finished ; then |
22 | 12 | echo "[FT-Setup] Found finished flag at $FAULT_TOL_FINISHED_FLAG_FILE." |
23 | 13 | echo "[FT-Setup] Training is already complete. Exiting successfully." |
24 | 14 | exit 0 |
25 | 15 | fi |
26 | 16 |
|
27 | | -# 4. Logging Start |
28 | | -# We use HOSTNAME (usually pod-name) as the identifier since SLURM_JOB_ID is gone. |
29 | | -# We append 'X' (Running/Unknown) to the log. |
30 | 17 | echo "[FT-Setup] Starting training on $(hostname)..." |
31 | | -# Optional: Log attempt to shared file (Using X for Running) |
32 | | -# Note: In high-scale K8s, writing to a single file from 1000 pods can cause lock contention. |
33 | | -# If scale is small, this is fine. |
34 | 18 | if [ -n "$FAULT_TOL_JOB_RESULTS_FILE" ]; then |
35 | 19 | mkdir -p "$(dirname "$FAULT_TOL_JOB_RESULTS_FILE")" |
36 | 20 | echo "$(hostname) $(date +%s) X" >> "$FAULT_TOL_JOB_RESULTS_FILE" |
|
39 | 23 | {%- endmacro %} |
40 | 24 |
|
{% macro ft_launcher_teardown() -%}
# -------------------------------------------------------------------------
# Fault-tolerance teardown: record the outcome and choose the exit status.
#
# Reads:  exitcode                   - status of the main command; the calling
#                                      script is expected to set it before
#                                      this section runs.
#         FAULT_TOL_JOB_RESULTS_FILE - optional results log (appended to).
# Calls:  is_training_finished       - must already be defined in the script.
# Exits:  0          when exitcode is 0 and the finished flag exists;
#         1          when exitcode is 0 but the finished flag is missing,
#                    or when exitcode is unset / not a plain number;
#         exitcode   in every other case.
# -------------------------------------------------------------------------

# Normalise exitcode before using it. With an empty or non-numeric value the
# numeric tests below error out, and an unquoted 'exit $exitcode' degrades to
# a bare 'exit', which returns the status of the preceding echo (0) and would
# report a failed run as a success.
case "${exitcode:-}" in
  '' | *[!0-9]*)
    echo "[FT-Teardown] WARNING: exitcode is unset or not numeric ('${exitcode:-}'); treating as failure."
    exitcode=1
    ;;
esac

# Single-letter status for the results log: S = success, F = failure.
if [ "$exitcode" -eq 0 ]; then
  RESULT_STATUS="S"
else
  RESULT_STATUS="F"
fi

# Append a new status line instead of rewriting an earlier entry, so the log
# is only ever appended to.
if [ -n "${FAULT_TOL_JOB_RESULTS_FILE:-}" ]; then
  mkdir -p "$(dirname "$FAULT_TOL_JOB_RESULTS_FILE")"
  echo "$(hostname) $(date +%s) $RESULT_STATUS" >> "$FAULT_TOL_JOB_RESULTS_FILE"
fi

# Failure: propagate the command's own status unchanged. Retry accounting is
# deliberately not done here; it is left to whatever supervises this script.
if [ "$exitcode" -ne 0 ]; then
  echo "[FT-Teardown] Job failed with exit code $exitcode."
  exit "$exitcode"
fi

# Success is only trusted when the finished flag confirms it.
if is_training_finished; then
  echo "[FT-Teardown] Job finished successfully and flag file exists."
  exit 0
fi

# Exit status 0 without the flag: the run did not complete, so force a
# non-zero status to request a restart.
echo "[FT-Teardown] WARNING: Process exited 0 but finished flag is MISSING."
echo "[FT-Teardown] Forcing exit 1 to trigger Kubernetes restart."
exit 1
{%- endmacro %}
0 commit comments