Skip to content

Commit 1f979e0

Browse files
authored
build: Fix ngc pytorch build with deep-ep (#1234)
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
1 parent 5efbe4f commit 1f979e0

File tree

1 file changed

+4
-0
lines changed

1 file changed

+4
-0
lines changed

docker/Dockerfile.ngc_pytorch

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ ARG BASE_IMAGE
8383
ARG UV_NO_INSTALL_PACKAGES="--no-install-package torch --no-install-package torchvision --no-install-package triton --no-install-package nvidia-cublas-cu12 --no-install-package nvidia-cuda-cupti-cu12 --no-install-package nvidia-cuda-nvrtc-cu12 --no-install-package nvidia-cuda-runtime-cu12 --no-install-package nvidia-cudnn-cu12 --no-install-package nvidia-cufft-cu12 --no-install-package nvidia-cufile-cu12 --no-install-package nvidia-curand-cu12 --no-install-package nvidia-cusolver-cu12 --no-install-package nvidia-cusparse-cu12 --no-install-package nvidia-cusparselt-cu12 --no-install-package nvidia-nccl-cu12 --no-install-package vllm --no-install-package flash-attn --no-install-package transformer-engine --no-install-package transformer-engine-cu12 --no-install-package transformer-engine-torch --no-install-package numpy"
8484
ENV UV_NO_INSTALL_PACKAGES=${UV_NO_INSTALL_PACKAGES}
8585
ENV PATH="/opt/nemo_rl_venv/bin:$PATH"
86+
# Ensure DeepEP is built for H100 (compute capability 9.0) and B200 (compute capability 10.0)
87+
ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"
8688

8789
# First copy only the dependency files
8890
COPY --from=nemo-rl pyproject.toml uv.lock ./
@@ -95,6 +97,8 @@ RUN --mount=type=bind,from=build_vllm,source=/opt/,target=/tmp/build_vllm/ <<"EOF"
9597
# The venv is symlinked to avoid bloating the layer size
9698
uv venv --system-site-packages ${UV_PROJECT_ENVIRONMENT}
9799
uv pip install --no-cache-dir --no-deps /tmp/build_vllm/vllm/vllm*.whl
100+
# Run a base sync (no extras) first so that nvshmem is installed before the sync with extras builds DeepEP
101+
uv sync --link-mode symlink --locked --inexact --no-install-project $UV_NO_INSTALL_PACKAGES
98102
uv sync --link-mode symlink --locked --inexact --extra vllm --extra mcore --extra automodel --all-groups --no-install-project $UV_NO_INSTALL_PACKAGES
99103
EOF
100104

0 commit comments

Comments
 (0)