Commit 04d554d

Upgrade vLLM version for batch completion (#518)

* bump pydantic==2.7.1
* Add fallback to v1 models if pydantic >2
* version bump vllm
* Update docker image to manually install flash attention
* skip coverage

1 parent 43107e3 commit 04d554d

File tree: 5 files changed, +58 -4 lines changed

model-engine/model_engine_server/common/dtos/llms.py
6 additions, 1 deletion
@@ -4,6 +4,7 @@
 
 from typing import Any, Dict, List, Optional
 
+import pydantic
 from model_engine_server.common.dtos.model_endpoints import (
     CpuSpecificationType,
     GetModelEndpointV1Response,
@@ -21,7 +22,11 @@
     ModelEndpointStatus,
     Quantization,
 )
-from pydantic import BaseModel, Field, HttpUrl
+
+if int(pydantic.__version__.split(".")[0]) > 1:
+    from pydantic.v1 import BaseModel, Field, HttpUrl  # pragma: no cover
+else:
+    from pydantic import BaseModel, Field, HttpUrl
 
 
 class CreateLLMModelEndpointV1Request(BaseModel):
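The gated import keeps these DTOs on pydantic's v1 API even after the vLLM upgrade pulls in pydantic 2.x: pydantic 2 ships a pydantic.v1 compatibility namespace that preserves v1 behavior. A minimal standalone sketch of the same fallback pattern (illustrative only; BatchJob is a hypothetical model, not one of the repo's DTOs):

import pydantic

# Under pydantic 2.x, import the bundled v1 shim so existing v1-style
# models keep validating as before; under pydantic 1.x, import directly.
if int(pydantic.__version__.split(".")[0]) > 1:
    from pydantic.v1 import BaseModel, Field
else:
    from pydantic import BaseModel, Field

class BatchJob(BaseModel):
    # Hypothetical model for illustration; v1-style Field constraints
    # still apply under the v1 shim.
    model_name: str = Field(..., min_length=1)
    max_tokens: int = Field(100, gt=0)

print(BatchJob(model_name="llama-2-7b"))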

model-engine/model_engine_server/inference/batch_inference/Dockerfile_vllm
40 additions, 0 deletions
@@ -1,3 +1,37 @@
+#################### BASE BUILD IMAGE ####################
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+RUN apt-get update -y \
+    && apt-get install -y python3-pip git
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.1/compat/
+WORKDIR /workspace
+
+COPY model-engine/model_engine_server/inference/batch_inference/requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-build.txt
+#################### BASE BUILD IMAGE ####################
+
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# max jobs used for build
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# flash attention version
+ARG flash_attn_version=v2.5.6
+ENV FLASH_ATTN_VERSION=${flash_attn_version}
+
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################
+
+#################### Runtime IMAGE ####################
 FROM nvcr.io/nvidia/pytorch:23.09-py3
 
 RUN apt-get update && \
@@ -6,6 +40,10 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/* && \
     apt-get clean
 
+# Install flash attention (from pre-built wheel)
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+
 RUN pip uninstall torch -y
 RUN pip install torch==2.1.1 --index-url https://download.pytorch.org/whl/cu121
 
@@ -21,3 +59,5 @@ RUN pip install -r requirements.txt
 COPY model-engine /workspace/model-engine
 RUN pip install -e /workspace/model-engine
 COPY model-engine/model_engine_server/inference/batch_inference/vllm_batch.py /workspace/vllm_batch.py
+
+#################### Runtime IMAGE ####################
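The multi-stage build produces the flash-attn wheel in a CUDA 12.1 dev stage, then bind-mounts only the finished wheel into the runtime image, so the build toolchain never lands in the final layer. A quick smoke test one might run inside the built image to confirm the wheel installed against the image's torch stack (a hypothetical check, not part of the commit):

# Verify the pre-built flash-attn wheel is importable and matches the
# pinned FLASH_ATTN_VERSION (v2.5.6) from the builder stage.
import torch
import flash_attn

print("torch:", torch.__version__, "| cuda:", torch.version.cuda)
print("flash-attn:", flash_attn.__version__)  # expected: 2.5.6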
model-engine/model_engine_server/inference/batch_inference/requirements-build.txt (new file)
8 additions, 0 deletions

@@ -0,0 +1,8 @@
+# Copied from https://github.com/vllm-project/vllm/blob/main/requirements-build.txt
+# Needed to build flash-attn into docker image
+cmake>=3.21
+ninja
+packaging
+setuptools>=49.4.0
+torch==2.3.0
+wheel
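Because the builder stage invokes pip wheel with --no-build-isolation, flash-attn's build runs against the already-installed environment, so each of these build dependencies must be importable beforehand. A small pre-flight check in that spirit (a sketch; the module names assume the standard PyPI packages above):

import importlib

# With --no-build-isolation, pip skips the temporary build env, so a
# missing build dep fails here rather than mid-compile.
for module in ("cmake", "ninja", "packaging", "setuptools", "torch", "wheel"):
    importlib.import_module(module)  # raises ImportError if absent
print("all flash-attn build dependencies are importable")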
(new file)
1 addition, 0 deletions

@@ -0,0 +1 @@
+-e ../../.. # Need to install model_engine_server as a package

model-engine/model_engine_server/inference/batch_inference/requirements.txt
3 additions, 3 deletions

@@ -1,6 +1,6 @@
-vllm==0.2.5
-pydantic==1.10.13
-boto3==1.34.15
+vllm==0.4.2
+pydantic==2.7.1
+boto3>=1.34.105
 smart-open==6.4.0
 ddtrace==2.4.0
 docker==7.0.0
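The pins move vllm and pydantic forward in lockstep; per the commit message, the pydantic 2 bump is what motivates the v1 fallback in llms.py above. A quick way to confirm an environment matches the new pins (illustrative sketch, not part of the commit):

from importlib.metadata import version

# Compare installed versions against the new pins in requirements.txt.
expected = {"vllm": "0.4.2", "pydantic": "2.7.1"}
for package, pin in expected.items():
    installed = version(package)  # raises PackageNotFoundError if missing
    status = "OK" if installed == pin else f"MISMATCH (pinned {pin})"
    print(f"{package}=={installed}: {status}")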
