Commit 04d554d

Upgrade vLLM version for batch completion (#518)

* bump pydantic==2.7.1
* Add fallback to v1 models if pydantic >2
* version bump vllm
* Update docker image to manually install flash attention
* skip coverage

1 parent 43107e3 commit 04d554d

File tree: 5 files changed, +58 -4 lines changed

model-engine/model_engine_server/common/dtos/llms.py
6 additions, 1 deletion
@@ -4,6 +4,7 @@
 
 from typing import Any, Dict, List, Optional
 
+import pydantic
 from model_engine_server.common.dtos.model_endpoints import (
     CpuSpecificationType,
     GetModelEndpointV1Response,
@@ -21,7 +22,11 @@
     ModelEndpointStatus,
     Quantization,
 )
-from pydantic import BaseModel, Field, HttpUrl
+
+if int(pydantic.__version__.split(".")[0]) > 1:
+    from pydantic.v1 import BaseModel, Field, HttpUrl  # pragma: no cover
+else:
+    from pydantic import BaseModel, Field, HttpUrl
 
 
 class CreateLLMModelEndpointV1Request(BaseModel):
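The gated import keeps these DTOs on pydantic's v1 API even after the vLLM upgrade pulls in pydantic 2.x: pydantic 2 ships a pydantic.v1 compatibility namespace that preserves v1 behavior. A minimal standalone sketch of the same fallback pattern (illustrative only; BatchJob is a hypothetical model, not one of the repo's DTOs):

import pydantic

# Under pydantic 2.x, import the bundled v1 shim so existing v1-style
# models keep validating as before; under pydantic 1.x, import directly.
if int(pydantic.__version__.split(".")[0]) > 1:
    from pydantic.v1 import BaseModel, Field
else:
    from pydantic import BaseModel, Field

class BatchJob(BaseModel):
    # Hypothetical model for illustration; v1-style Field constraints
    # still apply under the v1 shim.
    model_name: str = Field(..., min_length=1)
    max_tokens: int = Field(100, gt=0)

print(BatchJob(model_name="llama-2-7b"))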

model-engine/model_engine_server/inference/batch_inference/Dockerfile_vllm
40 additions, 0 deletions
@@ -1,3 +1,37 @@
+#################### BASE BUILD IMAGE ####################
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+RUN apt-get update -y \
+    && apt-get install -y python3-pip git
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.1/compat/
+WORKDIR /workspace
+
+COPY model-engine/model_engine_server/inference/batch_inference/requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-build.txt
+#################### BASE BUILD IMAGE ####################
+
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# max jobs used for build
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# flash attention version
+ARG flash_attn_version=v2.5.6
+ENV FLASH_ATTN_VERSION=${flash_attn_version}
+
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################
+
+#################### Runtime IMAGE ####################
 FROM nvcr.io/nvidia/pytorch:23.09-py3
 
 RUN apt-get update && \
@@ -6,6 +40,10 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/* && \
     apt-get clean
 
+# Install flash attention (from pre-built wheel)
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+
 RUN pip uninstall torch -y
 RUN pip install torch==2.1.1 --index-url https://download.pytorch.org/whl/cu121
 
@@ -21,3 +59,5 @@ RUN pip install -r requirements.txt
 COPY model-engine /workspace/model-engine
 RUN pip install -e /workspace/model-engine
 COPY model-engine/model_engine_server/inference/batch_inference/vllm_batch.py /workspace/vllm_batch.py
+
+#################### Runtime IMAGE ####################
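The multi-stage build produces the flash-attn wheel in a CUDA 12.1 dev stage, then bind-mounts only the finished wheel into the runtime image, so the build toolchain never lands in the final layer. A quick smoke test one might run inside the built image to confirm the wheel installed against the image's torch stack (a hypothetical check, not part of the commit):

# Verify the pre-built flash-attn wheel is importable and matches the
# pinned FLASH_ATTN_VERSION (v2.5.6) from the builder stage.
import torch
import flash_attn

print("torch:", torch.__version__, "| cuda:", torch.version.cuda)
print("flash-attn:", flash_attn.__version__)  # expected: 2.5.6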
model-engine/model_engine_server/inference/batch_inference/requirements-build.txt (new file)
8 additions, 0 deletions

@@ -0,0 +1,8 @@
+# Copied from https://github.com/vllm-project/vllm/blob/main/requirements-build.txt
+# Needed to build flash-attn into docker image
+cmake>=3.21
+ninja
+packaging
+setuptools>=49.4.0
+torch==2.3.0
+wheel
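Because the builder stage invokes pip wheel with --no-build-isolation, flash-attn's build runs against the already-installed environment, so each of these build dependencies must be importable beforehand. A small pre-flight check in that spirit (a sketch; the module names assume the standard PyPI packages above):

import importlib

# With --no-build-isolation, pip skips the temporary build env, so a
# missing build dep fails here rather than mid-compile.
for module in ("cmake", "ninja", "packaging", "setuptools", "torch", "wheel"):
    importlib.import_module(module)  # raises ImportError if absent
print("all flash-attn build dependencies are importable")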
(new file)
1 addition, 0 deletions

@@ -0,0 +1 @@
+-e ../../.. # Need to install model_engine_server as a package

model-engine/model_engine_server/inference/batch_inference/requirements.txt
3 additions, 3 deletions

@@ -1,6 +1,6 @@
-vllm==0.2.5
-pydantic==1.10.13
-boto3==1.34.15
+vllm==0.4.2
+pydantic==2.7.1
+boto3>=1.34.105
 smart-open==6.4.0
 ddtrace==2.4.0
 docker==7.0.0
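The pins move vllm and pydantic forward in lockstep; per the commit message, the pydantic 2 bump is what motivates the v1 fallback in llms.py above. A quick way to confirm an environment matches the new pins (illustrative sketch, not part of the commit):

from importlib.metadata import version

# Compare installed versions against the new pins in requirements.txt.
expected = {"vllm": "0.4.2", "pydantic": "2.7.1"}
for package, pin in expected.items():
    installed = version(package)  # raises PackageNotFoundError if missing
    status = "OK" if installed == pin else f"MISMATCH (pinned {pin})"
    print(f"{package}=={installed}: {status}")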
