Commit e46cbd4

Update vllm batch job to work with vllm > 0.5.0 (#550)
* Update vllm batch job to work with vllm > 0.5.0
* Fix test
* Add comments
1 parent dd4a0f9 commit e46cbd4

File tree

5 files changed: +211 additions, -35 deletions

model-engine/model_engine_server/common/dtos/llms.py

Lines changed: 11 additions & 0 deletions
@@ -1,5 +1,7 @@
 """
 DTOs for LLM APIs.
+
+Make sure to keep this in sync with inference/batch_inference/dto.py.
 """

 from typing import Any, Dict, List, Optional
@@ -553,6 +555,14 @@ class CreateBatchCompletionsEngineRequest(CreateBatchCompletionsRequest):
     hidden from the DTO exposed to the client.
     """

+    model_cfg: CreateBatchCompletionsModelConfig
+    """
+    Model configuration for the batch inference. Hardware configurations are inferred.
+
+    We rename model_config from api to model_cfg in engine since engine uses pydantic v2 which
+    reserves model_config as a keyword.
+    """
+
     max_gpu_memory_utilization: Optional[float] = Field(default=0.9, le=1.0)
     """
     Maximum GPU memory utilization for the batch inference. Default to 90%.
@@ -565,6 +575,7 @@ def from_api(request: CreateBatchCompletionsRequest) -> "CreateBatchCompletionsEngineRequest":
             output_data_path=request.output_data_path,
             content=request.content,
             model_config=request.model_config,
+            model_cfg=request.model_config,
             data_parallelism=request.data_parallelism,
             max_runtime_sec=request.max_runtime_sec,
             tool_config=request.tool_config,
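
For context on the `model_cfg` rename above: pydantic v2 reserves the attribute name `model_config` for the model's configuration, so a field by that name cannot be declared on a v2 BaseModel. A minimal sketch (not repo code) of the pattern the engine-side DTO uses, keeping the old `model_config` JSON key as an alias:

# Sketch only: the field type and payload here are placeholders.
from pydantic import BaseModel, Field


class EngineRequestSketch(BaseModel):
    # A field literally named `model_config` would fail at class-definition time
    # under pydantic v2, so the field is renamed and the original key is kept as
    # an alias for deserialization.
    model_cfg: dict = Field(alias="model_config")


# Payloads that still use the "model_config" key parse via the alias.
req = EngineRequestSketch.model_validate({"model_config": {"model": "llama"}})
print(req.model_cfg)  # {'model': 'llama'}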

model-engine/model_engine_server/inference/batch_inference/dto.py (new file)

Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@
# This is a copy of model_engine_server.common.dtos.llm
# This is done to decouple the pydantic requirements since vllm requires pydantic >2
# while model engine is on 1.x
from enum import Enum
from typing import Dict, List, Optional

from pydantic import BaseModel, Field


class TokenOutput(BaseModel):
    token: str
    log_prob: float


class CompletionOutput(BaseModel):
    text: str
    num_prompt_tokens: int
    num_completion_tokens: int
    tokens: Optional[List[TokenOutput]] = None


class CreateBatchCompletionsRequestContent(BaseModel):
    prompts: List[str]
    max_new_tokens: int
    temperature: float = Field(ge=0.0, le=1.0)
    """
    Temperature of the sampling. Setting to 0 equals to greedy sampling.
    """
    stop_sequences: Optional[List[str]] = None
    """
    List of sequences to stop the completion at.
    """
    return_token_log_probs: Optional[bool] = False
    """
    Whether to return the log probabilities of the tokens.
    """
    presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    """
    Only supported in vllm, lightllm
    Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty
    """
    frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    """
    Only supported in vllm, lightllm
    Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty
    """
    top_k: Optional[int] = Field(default=None, ge=-1)
    """
    Controls the number of top tokens to consider. -1 means consider all tokens.
    """
    top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0)
    """
    Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens.
    """
    skip_special_tokens: Optional[bool] = True
    """
    Whether to skip special tokens in the output.
    """


class Quantization(str, Enum):
    BITSANDBYTES = "bitsandbytes"
    AWQ = "awq"


class CreateBatchCompletionsModelConfig(BaseModel):
    model: str
    checkpoint_path: Optional[str] = None
    """
    Path to the checkpoint to load the model from.
    """
    labels: Dict[str, str]
    """
    Labels to attach to the batch inference job.
    """
    num_shards: Optional[int] = 1
    """
    Suggested number of shards to distribute the model. When not specified, will infer the number of shards based on model config.
    System may decide to use a different number than the given value.
    """
    quantize: Optional[Quantization] = None
    """
    Whether to quantize the model.
    """
    seed: Optional[int] = None
    """
    Random seed for the model.
    """


class ToolConfig(BaseModel):
    """
    Configuration for tool use.
    NOTE: this config is highly experimental and signature will change significantly in future iterations.
    """

    name: str
    """
    Name of the tool to use for the batch inference.
    """
    max_iterations: Optional[int] = 10
    """
    Maximum number of iterations to run the tool.
    """
    execution_timeout_seconds: Optional[int] = 60
    """
    Maximum runtime of the tool in seconds.
    """
    should_retry_on_error: Optional[bool] = True
    """
    Whether to retry the tool on error.
    """


class CreateBatchCompletionsRequest(BaseModel):
    """
    Request object for batch completions.
    """

    input_data_path: Optional[str]
    output_data_path: str
    """
    Path to the output file. The output file will be a JSON file of type List[CompletionOutput].
    """
    content: Optional[CreateBatchCompletionsRequestContent] = None
    """
    Either `input_data_path` or `content` needs to be provided.
    When input_data_path is provided, the input file should be a JSON file of type BatchCompletionsRequestContent.
    """

    data_parallelism: Optional[int] = Field(default=1, ge=1, le=64)
    """
    Number of replicas to run the batch inference. More replicas are slower to schedule but faster to inference.
    """
    max_runtime_sec: Optional[int] = Field(default=24 * 3600, ge=1, le=2 * 24 * 3600)
    """
    Maximum runtime of the batch inference in seconds. Default to one day.
    """
    tool_config: Optional[ToolConfig] = None
    """
    Configuration for tool use.
    NOTE: this config is highly experimental and signature will change significantly in future iterations.
    """


class CreateBatchCompletionsEngineRequest(CreateBatchCompletionsRequest):
    """
    Internal model for representing request to the llm engine. This contains additional fields that we want
    hidden from the DTO exposed to the client.
    """

    model_cfg: CreateBatchCompletionsModelConfig = Field(alias="model_config")
    """
    Model configuration for the batch inference. Hardware configurations are inferred.

    We rename model_config from api to model_cfg in engine since engine uses pydantic v2 which
    reserves model_config as a keyword.

    We alias `model_config` for deserialization for backwards compatibility.
    """

    max_gpu_memory_utilization: Optional[float] = Field(default=0.9, le=1.0)
    """
    Maximum GPU memory utilization for the batch inference. Default to 90%.
    """

model-engine/model_engine_server/inference/batch_inference/requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-vllm==0.2.5
-pydantic==1.10.13
+vllm==0.5.0.post1
+pydantic>=2
 boto3==1.34.15
 smart-open==6.4.0
 ddtrace==2.4.0
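
The pin bump is what forces the DTO copy above: per the comment in dto.py, vllm now pulls in pydantic v2 while the rest of model engine stays on 1.x. A purely illustrative runtime sanity check for the batch image:

# Illustrative check, not part of the repo.
from importlib.metadata import version

vllm_version = version("vllm")
pydantic_major = int(version("pydantic").split(".")[0])

assert vllm_version.startswith("0.5."), f"unexpected vllm version: {vllm_version}"
assert pydantic_major >= 2, f"vllm >= 0.5 expects pydantic v2, got {version('pydantic')}"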

model-engine/model_engine_server/inference/batch_inference/vllm_batch.py

Lines changed: 16 additions & 20 deletions
@@ -13,7 +13,7 @@
 import boto3
 import smart_open
 from func_timeout import FunctionTimedOut, func_set_timeout
-from model_engine_server.common.dtos.llms import (
+from model_engine_server.inference.batch_inference.dto import (
     CompletionOutput,
     CreateBatchCompletionsEngineRequest,
     CreateBatchCompletionsRequestContent,
@@ -150,9 +150,9 @@ def get_vllm_engine(model: str, request: CreateBatchCompletionsEngineRequest):

     engine_args = AsyncEngineArgs(
         model=model,
-        quantization=request.model_config.quantize,
-        tensor_parallel_size=request.model_config.num_shards,
-        seed=request.model_config.seed or 0,
+        quantization=request.model_cfg.quantize,
+        tensor_parallel_size=request.model_cfg.num_shards,
+        seed=request.model_cfg.seed or 0,
         disable_log_requests=True,
         gpu_memory_utilization=request.max_gpu_memory_utilization or 0.9,
     )
@@ -316,18 +316,16 @@ async def batch_inference():

     request = CreateBatchCompletionsEngineRequest.parse_file(CONFIG_FILE)

-    if request.model_config.checkpoint_path is not None:
-        download_model(request.model_config.checkpoint_path, MODEL_WEIGHTS_FOLDER)
+    if request.model_cfg.checkpoint_path is not None:
+        download_model(request.model_cfg.checkpoint_path, MODEL_WEIGHTS_FOLDER)

     content = request.content
     if content is None:
         with smart_open.open(request.input_data_path, "r") as f:
             content = CreateBatchCompletionsRequestContent.parse_raw(f.read())

-    model = (
-        MODEL_WEIGHTS_FOLDER if request.model_config.checkpoint_path else request.model_config.model
-    )
-    is_finetuned = request.model_config.checkpoint_path is not None
+    model = MODEL_WEIGHTS_FOLDER if request.model_cfg.checkpoint_path else request.model_cfg.model
+    is_finetuned = request.model_cfg.checkpoint_path is not None

     llm = get_vllm_engine(model, request)

@@ -352,7 +350,7 @@ async def batch_inference():
             prompts,
             tool,
             is_finetuned,
-            request.model_config.model,
+            request.model_cfg.model,
         )
     else:
         bar = tqdm(total=len(prompts), desc="Processed prompts")
@@ -372,7 +370,7 @@ async def batch_inference():
             bar,
             use_tool=False,
             is_finetuned=is_finetuned,
-            model=request.model_config.model,
+            model=request.model_cfg.model,
         )

         bar.close()
@@ -430,27 +428,25 @@ async def generate_with_vllm(
             skip_special_tokens=skip_special_tokens if skip_special_tokens is not None else True,
         )
         results_generator = await engine.add_request(
-            request_id, prompt, sampling_params, None, time.monotonic()
+            request_id, prompt, sampling_params, time.monotonic(), None
        )
         results_generators.append(results_generator)

     outputs = []
     for generator in results_generators:
-        last_output_text = ""
         tokens = []
         async for request_output in generator:
             if request_output.finished:
                 bar.update(1)

-            token_text = request_output.outputs[-1].text[len(last_output_text) :]
-            log_probs = request_output.outputs[0].logprobs[-1] if return_token_log_probs else None
-            last_output_text = request_output.outputs[-1].text
-
             if return_token_log_probs:
+                output = request_output.outputs[0]
+                log_probs = output.logprobs[-1] if return_token_log_probs else None
+                token_id = output.token_ids[-1]
                 tokens.append(
                     TokenOutput(
-                        token=token_text,
-                        log_prob=log_probs[request_output.outputs[0].token_ids[-1]],
+                        token=log_probs[token_id].decoded_token,
+                        log_prob=log_probs[token_id].logprob,
                     )
                 )
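
Two behavioural changes in the hunks above are easy to miss: the positional arguments to `engine.add_request` are reordered to match the newer vllm signature, and per-token log probabilities now come back as vllm `Logprob` objects keyed by token id, so the token text is read from `.decoded_token` rather than diffing the cumulative output text. A condensed sketch of the new extraction path, assuming vllm >= 0.5, an already-initialized `AsyncLLMEngine` passed in as `engine`, and log probs requested via `SamplingParams(logprobs=1)`:

# Sketch only: mirrors the updated generate_with_vllm loop, trimmed to the
# token/log-prob handling.
import time

from vllm import SamplingParams


async def collect_token_logprobs(engine, prompt: str, request_id: str):
    sampling_params = SamplingParams(max_tokens=16, logprobs=1)
    generator = await engine.add_request(
        request_id, prompt, sampling_params, time.monotonic(), None
    )
    tokens = []
    async for request_output in generator:
        output = request_output.outputs[0]
        log_probs = output.logprobs[-1]  # dict: token_id -> vllm Logprob
        token_id = output.token_ids[-1]
        tokens.append((log_probs[token_id].decoded_token, log_probs[token_id].logprob))
    return tokens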

model-engine/tests/unit/inference/conftest.py

Lines changed: 17 additions & 13 deletions
@@ -1,11 +1,10 @@
 from unittest.mock import MagicMock

 import pytest
-from model_engine_server.common.dtos.llms import (
+from model_engine_server.inference.batch_inference.dto import (
     CompletionOutput,
     CreateBatchCompletionsEngineRequest,
     CreateBatchCompletionsModelConfig,
-    CreateBatchCompletionsRequest,
     CreateBatchCompletionsRequestContent,
     TokenOutput,
     ToolConfig,
@@ -14,16 +13,18 @@

 @pytest.fixture
 def create_batch_completions_engine_request() -> CreateBatchCompletionsEngineRequest:
+    model_config = CreateBatchCompletionsModelConfig(
+        model="model",
+        checkpoint_path="checkpoint_path",
+        labels={},
+        seed=123,
+        num_shards=4,
+    )
     return CreateBatchCompletionsEngineRequest(
         input_data_path="input_data_path",
         output_data_path="output_data_path",
-        model_config=CreateBatchCompletionsModelConfig(
-            model="model",
-            checkpoint_path="checkpoint_path",
-            labels={},
-            seed=123,
-            num_shards=4,
-        ),
+        model_cfg=model_config,
+        model_config=model_config,
         data_parallelism=1,
         max_runtime_sec=86400,
         max_gpu_memory_utilization=0.95,
@@ -32,10 +33,13 @@ def create_batch_completions_engine_request() -> CreateBatchCompletionsEngineRequest:

 @pytest.fixture
 def create_batch_completions_tool_completion_request():
-    return CreateBatchCompletionsRequest(
-        model_config=CreateBatchCompletionsModelConfig(
-            checkpoint_path="checkpoint_path", model="model", num_shards=4, seed=123, labels={}
-        ),
+    model_config = CreateBatchCompletionsModelConfig(
+        checkpoint_path="checkpoint_path", model="model", num_shards=4, seed=123, labels={}
+    )
+
+    return CreateBatchCompletionsEngineRequest(
+        model_cfg=model_config,
+        model_config=model_config,
         data_parallelism=1,
         input_data_path="input_data_path",
         output_data_path="output_data_path",
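
Downstream tests then read the renamed field off the fixture; a hypothetical example (not in the repo) of consuming it:

# Hypothetical test body; the assertions reuse the fixture values set above.
def test_engine_request_exposes_model_cfg(create_batch_completions_engine_request):
    request = create_batch_completions_engine_request
    assert request.model_cfg.num_shards == 4
    assert request.max_gpu_memory_utilization == 0.95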
