Skip to content

Commit 7180242

Browse files
committed
fixing deepspeed on 64 gpus
1 parent b38d938 commit 7180242

File tree

4 files changed

+568
-527
lines changed

4 files changed

+568
-527
lines changed

scripts/out.png

24.4 KB
Loading

scripts/run.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@ export MPICH_GPU_SUPPORT_ENABLED=0
1717
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CRAY_MPICH_ROOTDIR}/gtl/lib"
1818

1919
## This enables the Slingshot-11 plugin for RCCL (crucial for inter-node bandwidth)
20-
# export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/lustre/orion/scratch/adityaranjan/csc547/my-venv/aws-ofi-rccl/build/lib"
20+
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/lustre/orion/scratch/adityaranjan/csc547/my-venv/aws-ofi-rccl/build/lib"
2121
# export NCCL_DEBUG=INFO
22-
# export FI_CXI_ATS=0
22+
export FI_CXI_ATS=0
2323

2424
## This improves cross-node bandwidth in some cases
25-
# export NCCL_CROSS_NIC=1
25+
export NCCL_CROSS_NIC=1
2626

27-
# export CUDA_DEVICE_MAX_CONNECTIONS=1
27+
export CUDA_DEVICE_MAX_CONNECTIONS=1
2828

2929
export PYTHONPATH="${PYTHONPATH}:/ccs/home/adityaranjan/scratch/my-venv/improved-diffusion"
3030

0 commit comments

Comments
 (0)