Skip to content

Commit 88bd137

Browse files
authored
Change FSDP PyTorch to 2.7.1 (#739)
Close #738
1 parent 4153337 commit 88bd137

File tree

7 files changed: 10 additions (+10) and 13 deletions (-13).

3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
43 | 43 |   # LD_PRELOAD is required for PyTorch to find the NCCL library
44 | 44 |   # This path assumes you are using the Deep Learning AMI
45 | 45 |   # If you are not using the DLAMI, you may need to update this path
46 |    | - export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
   | 46 | + export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
47 | 47 |   export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
48 | 48 |
49 | 49 |   ## Set HuggingFace metadata timeout (in seconds) for large clusters

3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
43 | 43 |   # LD_PRELOAD is required for PyTorch to find the NCCL library
44 | 44 |   # This path assumes you are using the Deep Learning AMI
45 | 45 |   # If you are not using the DLAMI, you may need to update this path
46 |    | - export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
   | 46 | + export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
47 | 47 |   export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
48 | 48 |
49 | 49 |   ## Set HuggingFace metadata timeout (in seconds) for large clusters

3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
43 | 43 |   # LD_PRELOAD is required for PyTorch to find the NCCL library
44 | 44 |   # This path assumes you are using the Deep Learning AMI
45 | 45 |   # If you are not using the DLAMI, you may need to update this path
46 |    | - export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
   | 46 | + export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
47 | 47 |   export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
48 | 48 |
49 | 49 |   ## Set HuggingFace metadata timeout (in seconds) for large clusters

3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
43 | 43 |   # LD_PRELOAD is required for PyTorch to find the NCCL library
44 | 44 |   # This path assumes you are using the Deep Learning AMI
45 | 45 |   # If you are not using the DLAMI, you may need to update this path
46 |    | - export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
   | 46 | + export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
47 | 47 |   export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
48 | 48 |
49 | 49 |   ## Set HuggingFace metadata timeout (in seconds) for large clusters

3.test_cases/pytorch/FSDP/slurm/mistral_8x7b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
43 | 43 |   # LD_PRELOAD is required for PyTorch to find the NCCL library
44 | 44 |   # This path assumes you are using the Deep Learning AMI
45 | 45 |   # If you are not using the DLAMI, you may need to update this path
46 |    | - export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
   | 46 | + export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
47 | 47 |   export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
48 | 48 |
49 | 49 |   ## Set HuggingFace metadata timeout (in seconds) for large clusters

3.test_cases/pytorch/FSDP/slurm/training-sub.template

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
43 | 43 |   # LD_PRELOAD is required for PyTorch to find the NCCL library
44 | 44 |   # This path assumes you are using the Deep Learning AMI
45 | 45 |   # If you are not using the DLAMI, you may need to update this path
46 |    | - export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
   | 46 | + export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
47 | 47 |   export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
48 | 48 |
49 | 49 |   ## Set HuggingFace metadata timeout (in seconds) for large clusters
Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,5 @@
1 | 1 |   datasets
2 |   | - fsspec==2023.9.2
3 |   | - numpy==1.*
4 |   | - python-etcd
5 |   | - torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124
6 |   | - torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
7 |   | - torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu124
8 |   | - transformers==4.50.3
  | 2 | + torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128
  | 3 | + torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128
  | 4 | + torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu128
  | 5 | + transformers==4.52.4

0 commit comments

Comments
 (0)