2 files changed: +13 −1 lines changed

@@ -30,11 +30,15 @@ COPY requirements.txt requirements.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm/__init__.py vllm/__init__.py
 
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # max jobs used by Ninja to build extensions
-ENV MAX_JOBS=$max_jobs
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
 # number of threads used by nvcc
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
+
 RUN python3 setup.py build_ext --inplace
 
 # image to run unit testing suite
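The new ARG/ENV pairs make the target CUDA architectures and the Ninja job count overridable at build time instead of being hard-coded in the Dockerfile. A minimal sketch of passing them on the command line, reusing the ``vllm-openai`` target and ``vllm/vllm-openai`` tag from the docs change below; the ``'8.0'`` architecture value is only an illustration, not part of this diff:

    $ # build only for compute capability 8.0 and raise the parallel build job count
    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai \
          --build-arg torch_cuda_arch_list='8.0' \
          --build-arg max_jobs=8 \
          --build-arg nvcc_threads=2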
@@ -31,6 +31,14 @@ You can build and run vLLM from source via the provided dockerfile. To build vLLM:
 
     $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
 
+
+.. note::
+
+    By default vLLM will build for all GPU types for widest distribution. If you are just building for the
+    current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""``
+    for vLLM to find the current GPU type and build for that.
+
+
 To run vLLM:
 
 .. code-block:: console
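Following the added note, the same build can be restricted to the GPU present on the build machine by passing an empty architecture list; a hedged example, reusing the target and tag from the command above:

    $ # empty arch list: let the build detect the current GPU type and build only for that
    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai \
          --build-arg torch_cuda_arch_list=""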