Skip to content

Commit d78115d

Browse files
authored
fix nccl for docker cu11 (#3896)
1 parent 2e768c2 commit d78115d

File tree

3 files changed: 23 additions and 3 deletions.

docker/Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ ENV CUDA_VERSION_SHORT=cu118
1414
# Builder image
1515
FROM ${CUDA_VERSION} AS dev
1616
ARG PYTHON_VERSION=3.10
17+
ARG NCCL_BRANCH=v2.26.6-1
1718

1819
ENV PATH=/opt/py3/bin:/root/.local/bin:${PATH}
1920
ENV DEBIAN_FRONTEND=noninteractive
@@ -58,11 +59,12 @@ COPY docker/install.sh /tmp/install.sh
5859

5960
RUN --mount=type=cache,target=/root/.cache \
6061
--mount=type=cache,target=/wheels,from=builder,source=/wheels \
62+
--mount=type=cache,target=/nccl,from=builder,source=/nccl \
6163
/tmp/install.sh
6264

6365
# explicitly set ptxas path for triton
6466
ENV PATH=/opt/py3/bin:$PATH
6567
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
6668
ENV NCCL_LAUNCH_MODE=GROUP
67-
69+
ENV LD_LIBRARY_PATH=/nccl/lib:$LD_LIBRARY_PATH
6870
FROM ${IMAGE_TYPE}

docker/build.sh

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,23 @@
11
#!/bin/bash -ex
22

3-
mkdir -p /wheels
3+
mkdir -p /wheels /nccl
44

55
if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
66
pip install nvidia-nccl-cu12
77
else
8-
pip install nvidia-nccl-cu11
8+
NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90"
9+
pushd /tmp >/dev/null
10+
git clone --depth=1 --branch ${NCCL_BRANCH} https://github.com/NVIDIA/nccl.git
11+
pushd nccl >/dev/null
12+
make NVCC_GENCODE="$NVCC_GENCODE" -j$(nproc) src.build
13+
mkdir -p /nccl/include /nccl/lib
14+
mv build/include/* /nccl/include/
15+
mv build/lib/lib* /nccl/lib/
16+
popd >/dev/null
17+
popd >/dev/null
18+
rm -rf /tmp/nccl
19+
export LD_LIBRARY_PATH=/nccl/lib:$LD_LIBRARY_PATH
920
fi
1021

22+
pip install --upgrade pip build
1123
python3 -m build -w -o /wheels -v .

docker/install.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,9 @@ BASE_URL="https://github.com/Dao-AILab/flash-attention/releases/download/v${FA_V
7070
FULL_URL="${BASE_URL}/${WHEEL}"
7171

7272
pip install "$FULL_URL"
73+
74+
# cu118 only: replace site-packages/nvidia/nccl with the NCCL tree mounted at /nccl
75+
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
76+
rm -rf /opt/py3/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl
77+
cp -R /nccl /opt/py3/lib/python${PYTHON_VERSION}/site-packages/nvidia/
78+
fi

0 commit comments

Comments
 (0)