#################### BASE BUILD IMAGE ####################
# CUDA 12.1 devel base used as the shared build stage ("dev") for the
# wheel-building stage below.
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

# Install Python tooling and git in a single layer; skip recommended
# packages and drop the apt lists in the same layer so the image stays
# small (hadolint DL3009/DL3015).
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        git \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/

WORKDIR /workspace

# Copy only the build-requirements manifest before installing, so this
# layer stays cached until the manifest itself changes.
COPY model-engine/model_engine_server/inference/batch_inference/requirements-build.txt requirements-build.txt
# Cache mount keeps pip's download cache on the build host without
# bloating the image layer.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build.txt
#################### BASE BUILD IMAGE ####################
16-
#################### FLASH_ATTENTION Build IMAGE ####################
# Builds (or downloads) the flash-attn wheel using the dev stage's CUDA
# toolchain; the runtime stage installs the wheel from
# /usr/src/flash-attention-v2.
# "AS" uppercased for consistency with the dev stage (BuildKit
# FromAsCasing check).
FROM dev AS flash-attn-builder

# max jobs used for build (flash-attn's setup reads MAX_JOBS to limit
# parallel nvcc compile jobs)
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.6
ENV FLASH_ATTN_VERSION=${flash_attn_version}

WORKDIR /usr/src/flash-attention-v2

# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
    --no-build-isolation --no-deps --no-cache-dir

#################### FLASH_ATTENTION Build IMAGE ####################
33-
34- #################### Runtime IMAGE ####################
# NOTE(review): everything below is diff-view extraction residue — old/new
# line numbers are fused into the instructions ("351FROM", "479RUN") and
# "@@" hunk headers mark lines that are missing entirely. This stage cannot
# build as-is; recover the clean Dockerfile from version control. The
# comments below annotate what each fragment appears to be — confirm
# against the original file.
351FROM nvcr.io/nvidia/pytorch:23.09-py3
362
# NOTE(review): the hunk header below means several lines of this apt-get
# RUN (presumably its package-install list) are not visible here.
373RUN apt-get update && \
@@ -40,10 +6,6 @@ RUN apt-get update && \
406 rm -rf /var/lib/apt/lists/* && \
417 apt-get clean
428
43- # Install flash attention (from pre-built wheel)
44- RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
45- pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
46-
479RUN pip uninstall torch -y
# NOTE(review): replaces the NGC base image's torch with a pinned
# cu121-built torch 2.1.1 wheel.
4810RUN pip install torch==2.1.1 --index-url https://download.pytorch.org/whl/cu121
4911
# NOTE(review): second hunk gap — the steps between here and the COPY below
# (including the "RUN pip install -r requirements.txt" named in the hunk
# header, and presumably the COPY of that requirements file) are missing
# from this view.
@@ -59,5 +21,3 @@ RUN pip install -r requirements.txt
5921COPY model-engine /workspace/model-engine
6022RUN pip install -e /workspace/model-engine
6123COPY model-engine/model_engine_server/inference/batch_inference/vllm_batch.py /workspace/vllm_batch.py
62-
63- #################### Runtime IMAGE ####################
# NOTE(review): web-page residue from the diff view, not Dockerfile content.
0 commit comments