Skip to content

Commit a4982c5

Browse files
authored
Revert "Upgrade vLLM version for batch completion (#518)" (#520)
This reverts commit 04d554d.
1 parent 04d554d commit a4982c5

File tree

5 files changed: +4 additions, -58 deletions

model-engine/model_engine_server/common/dtos/llms.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
from typing import Any, Dict, List, Optional
66

7-
import pydantic
87
from model_engine_server.common.dtos.model_endpoints import (
98
CpuSpecificationType,
109
GetModelEndpointV1Response,
@@ -22,11 +21,7 @@
2221
ModelEndpointStatus,
2322
Quantization,
2423
)
25-
26-
if int(pydantic.__version__.split(".")[0]) > 1:
27-
from pydantic.v1 import BaseModel, Field, HttpUrl # pragma: no cover
28-
else:
29-
from pydantic import BaseModel, Field, HttpUrl
24+
from pydantic import BaseModel, Field, HttpUrl
3025

3126

3227
class CreateLLMModelEndpointV1Request(BaseModel):
Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,3 @@
1-
#################### BASE BUILD IMAGE ####################
2-
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
3-
RUN apt-get update -y \
4-
&& apt-get install -y python3-pip git
5-
# Workaround for https://github.com/openai/triton/issues/2507 and
6-
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
7-
# this won't be needed for future versions of this docker image
8-
# or future versions of triton.
9-
RUN ldconfig /usr/local/cuda-12.1/compat/
10-
WORKDIR /workspace
11-
12-
COPY model-engine/model_engine_server/inference/batch_inference/requirements-build.txt requirements-build.txt
13-
RUN --mount=type=cache,target=/root/.cache/pip \
14-
pip install -r requirements-build.txt
15-
#################### BASE BUILD IMAGE ####################
16-
17-
#################### FLASH_ATTENTION Build IMAGE ####################
18-
FROM dev as flash-attn-builder
19-
# max jobs used for build
20-
ARG max_jobs=2
21-
ENV MAX_JOBS=${max_jobs}
22-
# flash attention version
23-
ARG flash_attn_version=v2.5.6
24-
ENV FLASH_ATTN_VERSION=${flash_attn_version}
25-
26-
WORKDIR /usr/src/flash-attention-v2
27-
28-
# Download the wheel or build it if a pre-compiled release doesn't exist
29-
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
30-
--no-build-isolation --no-deps --no-cache-dir
31-
32-
#################### FLASH_ATTENTION Build IMAGE ####################
33-
34-
#################### Runtime IMAGE ####################
351
FROM nvcr.io/nvidia/pytorch:23.09-py3
362

373
RUN apt-get update && \
@@ -40,10 +6,6 @@ RUN apt-get update && \
406
rm -rf /var/lib/apt/lists/* && \
417
apt-get clean
428

43-
# Install flash attention (from pre-built wheel)
44-
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
45-
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
46-
479
RUN pip uninstall torch -y
4810
RUN pip install torch==2.1.1 --index-url https://download.pytorch.org/whl/cu121
4911

@@ -59,5 +21,3 @@ RUN pip install -r requirements.txt
5921
COPY model-engine /workspace/model-engine
6022
RUN pip install -e /workspace/model-engine
6123
COPY model-engine/model_engine_server/inference/batch_inference/vllm_batch.py /workspace/vllm_batch.py
62-
63-
#################### Runtime IMAGE ####################

model-engine/model_engine_server/inference/batch_inference/requirements-build.txt

Lines changed: 0 additions & 8 deletions
This file was deleted.

model-engine/model_engine_server/inference/batch_inference/requirements-dev.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

model-engine/model_engine_server/inference/batch_inference/requirements.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
vllm==0.4.2
2-
pydantic==2.7.1
3-
boto3>=1.34.105
1+
vllm==0.2.5
2+
pydantic==1.10.13
3+
boto3==1.34.15
44
smart-open==6.4.0
55
ddtrace==2.4.0
66
docker==7.0.0

0 commit comments

Comments (0)