Commit 9261c49

vLLM version 0.4.2 Docker image (#521)
1 parent 6cac2db commit 9261c49

3 files changed: +49 −1 lines changed

model-engine/model_engine_server/inference/vllm/Dockerfile

Lines changed: 40 additions & 0 deletions
@@ -1,3 +1,37 @@
+#################### BASE BUILD IMAGE ####################
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+RUN apt-get update -y \
+    && apt-get install -y python3-pip git
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.1/compat/
+WORKDIR /workspace
+
+COPY requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-build.txt
+#################### BASE BUILD IMAGE ####################
+
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# max jobs used for build
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# flash attention version
+ARG flash_attn_version=v2.4.2
+ENV FLASH_ATTN_VERSION=${flash_attn_version}
+
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################
+
+#################### Runtime IMAGE ####################
 FROM nvcr.io/nvidia/pytorch:23.09-py3
 
 RUN apt-get update \
@@ -7,6 +41,10 @@ RUN apt-get update \
     && apt-get autoremove -y \
     && rm -rf /var/lib/apt/lists/*
 
+# Install flash attention (from pre-built wheel)
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+
 RUN pip uninstall torch -y
 COPY requirements.txt /workspace/requirements.txt
 RUN pip install -r requirements.txt
@@ -15,3 +53,5 @@ RUN wget https://github.com/peak/s5cmd/releases/download/v2.2.1/s5cmd_2.2.1_Linu
 RUN tar -xvzf s5cmd_2.2.1_Linux-64bit.tar.gz
 
 COPY vllm_server.py /workspace/vllm_server.py
+
+#################### Runtime IMAGE ####################
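
The Dockerfile now uses a multi-stage build: the dev and flash-attn-builder stages produce a flash-attn wheel that is bind-mounted into the runtime image, so the build toolchain never lands in the final layers. A minimal build sketch, assuming it is run from the Dockerfile's directory; the tag llm-engine-vllm:0.4.2 and the max_jobs override are illustrative and not part of the commit:

# BuildKit is required for the RUN --mount=type=cache / type=bind instructions above.
DOCKER_BUILDKIT=1 docker build \
    --build-arg max_jobs=4 \
    --build-arg flash_attn_version=v2.4.2 \
    -t llm-engine-vllm:0.4.2 \
    .
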
model-engine/model_engine_server/inference/vllm/requirements-build.txt

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+# Copied from https://github.com/vllm-project/vllm/blob/main/requirements-build.txt
+# Needed to build flash-attn into docker image
+cmake>=3.21
+ninja
+packaging
+setuptools>=49.4.0
+torch==2.3.0
+wheel
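
requirements-build.txt is copied from vLLM's own build requirements; because the wheel is built with --no-build-isolation, these packages (cmake, ninja, torch, etc.) must already be present in the builder stage before pip wheel runs. A rough local equivalent of the flash-attn-builder stage, sketched under the assumption that a CUDA 12.1 toolchain and Python 3 are on the host (the virtualenv name is illustrative):

# Hypothetical out-of-Docker reproduction of the flash-attn wheel build.
python3 -m venv flash-attn-build && . flash-attn-build/bin/activate
pip install -r requirements-build.txt
pip --verbose wheel flash-attn==v2.4.2 \
    --no-build-isolation --no-deps --no-cache-dir
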
model-engine/model_engine_server/inference/vllm/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-vllm==0.4.1
+vllm==0.4.2
 pydantic>=2.0
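
With the pin bumped to vllm==0.4.2 and flash-attn installed from the pre-built wheel, a quick smoke test of the built image could look like the following sketch (the image tag reuses the illustrative one above; --gpus all assumes the NVIDIA Container Toolkit is installed):

# Both packages expose __version__, so a one-liner confirms the install.
docker run --rm --gpus all llm-engine-vllm:0.4.2 \
    python -c "import vllm, flash_attn; print(vllm.__version__, flash_attn.__version__)"
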
