Skip to content

Commit 223c9e9

Browse files
nagpalar and KeitaW
authored
Support FSDP2 for FSDP test case (#951)
* FSDP2: updated files from the sample repo * updated sbatch files with conda activate command * Refactor FSDP training configuration - Downgrade PyTorch from 2.9.1 to 2.7.0 - Split Dockerfile into multi-stage build (Base and HTPO stages) - Remove venv activation from Slurm training scripts - Reverting Kubernetes template configuration * fixing typo HTPO to HPTO * Docker Target does not accept upercase so changed it Lowercase base or hpto and 2.7.0 is not supported so switching it to 2.6.0 for FSDP2 and Hyperpod Elastic agent * Update PyTorch and related package versions Let's come back to this in a separate PR. For now, let me merge the PR before we address #959 --------- Co-authored-by: Keita Watanabe <mlkeita@amazon.com>
1 parent 46710bd commit 223c9e9

13 files changed

+155
-143
lines changed
Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM public.ecr.aws/hpc-cloud/nccl-tests:latest
1+
FROM public.ecr.aws/hpc-cloud/nccl-tests:latest AS base
22

33
RUN apt update && apt install -y nvtop
44

@@ -7,6 +7,10 @@ RUN ln -s /usr/bin/python3 /usr/bin/python
77

88
COPY src/ /fsdp/
99
RUN --mount=type=cache,target=/root/.cache/pip pip install -r /fsdp/requirements.txt
10-
RUN pip install hyperpod-elastic-agent
1110

12-
WORKDIR /fsdp
11+
WORKDIR /fsdp
12+
13+
# HyperPod-specific image with elastic agent
14+
FROM base AS hpto
15+
16+
RUN pip install hyperpod-elastic-agent

3.test_cases/pytorch/FSDP/kubernetes/fsdp.yaml-template

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ spec:
66
elasticPolicy:
77
rdzvBackend: c10d
88
minReplicas: 1
9-
maxReplicas: 64
9+
maxReplicas: $MAX_NODES
1010
maxRestarts: 100
1111
metrics:
1212
- type: Resource
@@ -17,7 +17,7 @@ spec:
1717
averageUtilization: 90
1818
pytorchReplicaSpecs:
1919
Worker:
20-
replicas: $NUM_NODES
20+
replicas: $MAX_NODES
2121
restartPolicy: OnFailure
2222
template:
2323
metadata:
@@ -31,6 +31,9 @@ spec:
3131
- name: local
3232
hostPath:
3333
path: /mnt/k8s-disks/0
34+
- name: checkpoints
35+
persistentVolumeClaim:
36+
claimName: fsx-claim
3437
#nodeSelector:
3538
# node.kubernetes.io/instance-type: "${INSTANCE_TYPE}"
3639
containers:
@@ -78,16 +81,20 @@ spec:
7881
value: "1"
7982
- name: HF_TOKEN
8083
value: "${HF_TOKEN}"
81-
#- name: TORCH_DIST_INIT_BARRIER
82-
# value: "1"
84+
- name: TORCH_DIST_INIT_BARRIER
85+
value: "1"
86+
- name: GLOO_SOCKET_IFNAME
87+
value: "eth0"
88+
- name: NCCL_SOCKET_TIMEOUT_MS
89+
value: "600000"
8390
#- name: NCCL_IGNORE_DISABLED_P2P
8491
# value: "1"
8592
#- name: NCCL_NVLS_ENABLE
8693
# value: "0"
8794
command:
8895
- /usr/local/bin/torchrun
8996
- --nproc_per_node=$GPU_PER_NODE
90-
- --nnodes=$NUM_NODES
97+
- --nnodes=$MIN_NODES:$MAX_NODES
9198
- /fsdp/train.py
9299
- --max_context_width=4096
93100
- --num_key_value_heads=32
@@ -113,3 +120,5 @@ spec:
113120
mountPath: /dev/shm
114121
- name: local
115122
mountPath: /local
123+
- name: checkpoints
124+
mountPath: /checkpoints

3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama3_1_70b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama3_1_8b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama3_2_1b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama3_2_3b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,4 +132,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
132132
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
133133
AUTO_RESUME="--auto-resume=1"
134134
fi
135-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
135+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

0 commit comments

Comments
 (0)