diff --git a/3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch b/3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch
index 6304b2d9a..63751de75 100644
--- a/3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch
+++ b/3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
 # LD_PRELOAD is required for PyTorch to find the NCCL library
 # This path assumes you are using the Deep Learning AMI
 # If you are not using the DLAMI, you may need to update this path
-export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
+export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
 export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
 
 ## Set HuggingFace metadata timeout (in seconds) for large clusters
diff --git a/3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch b/3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch
index e529db7cc..cc9a00047 100644
--- a/3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch
+++ b/3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
 # LD_PRELOAD is required for PyTorch to find the NCCL library
 # This path assumes you are using the Deep Learning AMI
 # If you are not using the DLAMI, you may need to update this path
-export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
+export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
 export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
 
 ## Set HuggingFace metadata timeout (in seconds) for large clusters
diff --git a/3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch b/3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch
index 37b949fdc..8fdcca400 100644
--- a/3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch
+++ b/3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
 # LD_PRELOAD is required for PyTorch to find the NCCL library
 # This path assumes you are using the Deep Learning AMI
 # If you are not using the DLAMI, you may need to update this path
-export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
+export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
 export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
 
 ## Set HuggingFace metadata timeout (in seconds) for large clusters
diff --git a/3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch b/3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch
index 8a33bd44a..bb9a9c242 100644
--- a/3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch
+++ b/3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
 # LD_PRELOAD is required for PyTorch to find the NCCL library
 # This path assumes you are using the Deep Learning AMI
 # If you are not using the DLAMI, you may need to update this path
-export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
+export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
 export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
 
 ## Set HuggingFace metadata timeout (in seconds) for large clusters
diff --git a/3.test_cases/pytorch/FSDP/slurm/mistral_8x7b-training.sbatch b/3.test_cases/pytorch/FSDP/slurm/mistral_8x7b-training.sbatch
index 8748db15c..64bea32c0 100644
--- a/3.test_cases/pytorch/FSDP/slurm/mistral_8x7b-training.sbatch
+++ b/3.test_cases/pytorch/FSDP/slurm/mistral_8x7b-training.sbatch
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
 # LD_PRELOAD is required for PyTorch to find the NCCL library
 # This path assumes you are using the Deep Learning AMI
 # If you are not using the DLAMI, you may need to update this path
-export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
+export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
 export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
 
 ## Set HuggingFace metadata timeout (in seconds) for large clusters
diff --git a/3.test_cases/pytorch/FSDP/slurm/training-sub.template b/3.test_cases/pytorch/FSDP/slurm/training-sub.template
index 3a952c775..f8b078477 100644
--- a/3.test_cases/pytorch/FSDP/slurm/training-sub.template
+++ b/3.test_cases/pytorch/FSDP/slurm/training-sub.template
@@ -43,7 +43,7 @@ export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
 # LD_PRELOAD is required for PyTorch to find the NCCL library
 # This path assumes you are using the Deep Learning AMI
 # If you are not using the DLAMI, you may need to update this path
-export LD_PRELOAD=/usr/local/cuda-12.4/lib/libnccl.so
+export LD_PRELOAD=/usr/local/cuda-12.8/lib/libnccl.so
 export NCCL_SOCKET_IFNAME=^docker,lo,veth,eth
 
 ## Set HuggingFace metadata timeout (in seconds) for large clusters
diff --git a/3.test_cases/pytorch/FSDP/src/requirements.txt b/3.test_cases/pytorch/FSDP/src/requirements.txt
index 25971272d..5c4bbb74c 100644
--- a/3.test_cases/pytorch/FSDP/src/requirements.txt
+++ b/3.test_cases/pytorch/FSDP/src/requirements.txt
@@ -1,8 +1,5 @@
 datasets
-fsspec==2023.9.2
-numpy==1.*
-python-etcd
-torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124
-torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
-torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu124
-transformers==4.50.3
\ No newline at end of file
+torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128
+torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128
+torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu128
+transformers==4.52.4