Address review comments and rebase onto r25.10 (V1 API)

yinggeh · yinggeh · commit 943ee5f8c0e8 · 2025-10-30T15:45:48.000-07:00
diff --git a/src/model.py b/src/model.py
@@ -40,7 +40,7 @@
 )
 
 from utils.metrics import VllmStatLoggerFactory
-from utils.vllm_backend_utils import TritonSamplingParams
+from utils.request import EmbedRequest, GenerateRequest
 
 _VLLM_ENGINE_ARGS_FILENAME = "model.json"
 _MULTI_LORA_ARGS_FILENAME = "multi_lora.json"
@@ -249,6 +249,11 @@ def _init_engine(self):
                 self._event_thread = None
             raise e
 
+        # Get supported tasks from the engine running in another thread
+        self.supported_tasks = asyncio.run_coroutine_threadsafe(
+            self._llm_engine.get_supported_tasks(), self._event_loop
+        ).result()
+
     async def _run_llm_engine(self):
         # Counter to keep track of ongoing request counts.
         self._ongoing_request_count = 0
@@ -453,11 +458,11 @@ async def _infer(self, request):
             request_task_name = self._validate_request_task_name(request)
             if request_task_name == "generate":
                 request = GenerateRequest(
-                    request, self._llm_engine.generate, self.output_dtype
+                    request, self._llm_engine.generate, self.output_dtype, self.logger
                 )
             elif request_task_name == "embed":
                 request = EmbedRequest(
-                    request, self._llm_engine.encode, self.output_dtype
+                    request, self._llm_engine.encode, self.output_dtype, self.logger
                 )
             else:
                 raise ValueError(
@@ -499,10 +504,9 @@ async def _infer(self, request):
                 # Send each response if streaming.
                 if request.stream:
                     response = request.create_response(
-                        request_output_state,
                         request_output,
+                        request_output_state,
                         prepend_input=False,
-                        additional_outputs=request.additional_outputs,
                     )
                     flags = 0
                     if request_output.finished:
@@ -515,10 +519,9 @@ async def _infer(self, request):
             if not request.stream:
                 if request_task_name == "generate":
                     response = request.create_response(
-                        request_output_state={},
                         request_output=request_output,
+                        request_output_state={},
                         prepend_input=request.prepend_input,
-                        additional_outputs=request.additional_outputs,
                     )
                 else:
                     response = request.create_response(
diff --git a/src/utils/request.py b/src/utils/request.py
@@ -28,23 +28,33 @@
 import json
 from abc import abstractmethod
 from io import BytesIO
+from typing import Callable
 
 import numpy as np
 import triton_python_backend_utils as pb_utils
 from PIL import Image
 from vllm.inputs.data import TokensPrompt
 from vllm.lora.request import LoRARequest
+from vllm.outputs import (
+    EmbeddingOutput,
+    EmbeddingRequestOutput,
+    PoolingRequestOutput,
+    RequestOutput,
+)
 from vllm.pooling_params import PoolingParams
 from vllm.utils import random_uuid
 
 from utils.vllm_backend_utils import TritonSamplingParams
 
 
 class RequestBase:
-    def __init__(self, request, executor_callback, output_dtype):
+    def __init__(
+        self, request, executor_callback: Callable, output_dtype: np.dtype, logger
+    ):
         self.request = request
         self.executor_callback = executor_callback
         self.output_dtype = output_dtype
+        self.logger = logger
         self.id = random_uuid()
         self.stream = False
         self.prepend_input = False
@@ -58,13 +68,15 @@ def execute(self):
         raise NotImplementedError
 
     @abstractmethod
-    def create_response(self, *args, **kwargs):
+    def create_response(self, request_output, *args, **kwargs):
         raise NotImplementedError
 
 
 class GenerateRequest(RequestBase):
-    def __init__(self, request, executor_callback, output_dtype):
-        super().__init__(request, executor_callback, output_dtype)
+    def __init__(
+        self, request, executor_callback: Callable, output_dtype: np.dtype, logger
+    ):
+        super().__init__(request, executor_callback, output_dtype, logger)
 
     def _get_input_tensors(self):
         # prompt
@@ -166,7 +178,12 @@ async def execute(self):
         async for response in response_iterator:
             yield response
 
-    def create_response(self, request_output_state, request_output, prepend_input):
+    def create_response(
+        self,
+        request_output: RequestOutput,
+        request_output_state: dict,
+        prepend_input: bool,
+    ):
         output_tensors = []
 
         # text_output
@@ -278,8 +295,10 @@ def create_response(self, request_output_state, request_output, prepend_input):
 
 
 class EmbedRequest(RequestBase):
-    def __init__(self, request, executor_callback, output_dtype):
-        super().__init__(request, executor_callback, output_dtype)
+    def __init__(
+        self, request, executor_callback: Callable, output_dtype: np.dtype, logger
+    ):
+        super().__init__(request, executor_callback, output_dtype, logger)
 
     def _get_input_tensors(self):
         embedding_request = pb_utils.get_input_tensor_by_name(
@@ -338,32 +357,16 @@ def _to_pooling_params(self, embedding_request: dict):
             pooling_params = PoolingParams(dimensions=dims, task="embed")
         return pooling_params
 
-    def create_response(self, request_output):
+    def create_response(self, request_output: PoolingRequestOutput[EmbeddingOutput]):
         output_tensors = []
+        request_output = EmbeddingRequestOutput.from_base(request_output)
 
-        # Extract embedding vector from output
-        # PoolingRequestOutput.outputs is a PoolingOutput with .data (torch.Tensor)
-        pooling_data = request_output.outputs.data
-
-        # Convert torch tensor to numpy array then to list for JSON serialization
-        if hasattr(pooling_data, "cpu"):
-            # It's a torch tensor - move to CPU and convert to numpy
-            embedding_array = pooling_data.cpu().numpy()
-        else:
-            # Already numpy or list
-            embedding_array = np.array(pooling_data, dtype=np.float32)
-
-        # Create response tensor - for embeddings, we use text_output to return the vector
-        # (This is a simplification - you may want to define a proper embedding output tensor)
-        embedding_list = (
-            embedding_array.tolist()
-            if hasattr(embedding_array, "tolist")
-            else list(embedding_array)
-        )
-        embedding_str = json.dumps(embedding_list)
+        # Extract embedding list from output
+        embedding: list[float] = request_output.outputs.embedding
         output_tensors.append(
             pb_utils.Tensor(
-                "text_output", np.asarray([embedding_str], dtype=self.output_dtype)
+                "text_output",
+                np.asarray([json.dumps(embedding)], dtype=self.output_dtype),
             )
         )