Commit 9261c49

vLLM version 0.4.2 Docker image (#521)
1 parent 6cac2db commit 9261c49

3 files changed: +49 −1 lines changed

model-engine/model_engine_server/inference/vllm/Dockerfile

Lines changed: 40 additions & 0 deletions
@@ -1,3 +1,37 @@
+#################### BASE BUILD IMAGE ####################
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+RUN apt-get update -y \
+    && apt-get install -y python3-pip git
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.1/compat/
+WORKDIR /workspace
+
+COPY requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-build.txt
+#################### BASE BUILD IMAGE ####################
+
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# max jobs used for build
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# flash attention version
+ARG flash_attn_version=v2.4.2
+ENV FLASH_ATTN_VERSION=${flash_attn_version}
+
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################
+
+#################### Runtime IMAGE ####################
 FROM nvcr.io/nvidia/pytorch:23.09-py3
 
 RUN apt-get update \
@@ -7,6 +41,10 @@ RUN apt-get update \
     && apt-get autoremove -y \
     && rm -rf /var/lib/apt/lists/*
 
+# Install flash attention (from pre-built wheel)
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+
 RUN pip uninstall torch -y
 COPY requirements.txt /workspace/requirements.txt
 RUN pip install -r requirements.txt
@@ -15,3 +53,5 @@ RUN wget https://github.com/peak/s5cmd/releases/download/v2.2.1/s5cmd_2.2.1_Linu
 RUN tar -xvzf s5cmd_2.2.1_Linux-64bit.tar.gz
 
 COPY vllm_server.py /workspace/vllm_server.py
+
+#################### Runtime IMAGE ####################
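
The Dockerfile now uses a multi-stage build: the dev and flash-attn-builder stages produce a flash-attn wheel that is bind-mounted into the runtime image, so the build toolchain never lands in the final layers. A minimal build sketch, assuming it is run from the Dockerfile's directory; the tag llm-engine-vllm:0.4.2 and the max_jobs override are illustrative and not part of the commit:

# BuildKit is required for the RUN --mount=type=cache / type=bind instructions above.
DOCKER_BUILDKIT=1 docker build \
    --build-arg max_jobs=4 \
    --build-arg flash_attn_version=v2.4.2 \
    -t llm-engine-vllm:0.4.2 \
    .
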
model-engine/model_engine_server/inference/vllm/requirements-build.txt

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+# Copied from https://github.com/vllm-project/vllm/blob/main/requirements-build.txt
+# Needed to build flash-attn into docker image
+cmake>=3.21
+ninja
+packaging
+setuptools>=49.4.0
+torch==2.3.0
+wheel
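
requirements-build.txt is copied from vLLM's own build requirements; because the wheel is built with --no-build-isolation, these packages (cmake, ninja, torch, etc.) must already be present in the builder stage before pip wheel runs. A rough local equivalent of the flash-attn-builder stage, sketched under the assumption that a CUDA 12.1 toolchain and Python 3 are on the host (the virtualenv name is illustrative):

# Hypothetical out-of-Docker reproduction of the flash-attn wheel build.
python3 -m venv flash-attn-build && . flash-attn-build/bin/activate
pip install -r requirements-build.txt
pip --verbose wheel flash-attn==v2.4.2 \
    --no-build-isolation --no-deps --no-cache-dir
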
model-engine/model_engine_server/inference/vllm/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-vllm==0.4.1
+vllm==0.4.2
 pydantic>=2.0
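
With the pin bumped to vllm==0.4.2 and flash-attn installed from the pre-built wheel, a quick smoke test of the built image could look like the following sketch (the image tag reuses the illustrative one above; --gpus all assumes the NVIDIA Container Toolkit is installed):

# Both packages expose __version__, so a one-liner confirms the install.
docker run --rm --gpus all llm-engine-vllm:0.4.2 \
    python -c "import vllm, flash_attn; print(vllm.__version__, flash_attn.__version__)"
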
