feat: add the text embedding endpoint for LLM serving

baixiac · baixiac · commit fab113b6828b · 2025-09-16T16:32:58.000+01:00
diff --git a/app/api/routers/generative.py b/app/api/routers/generative.py
@@ -10,8 +10,16 @@
 from fastapi import APIRouter, Depends, Request, Body, Query
 from fastapi.encoders import jsonable_encoder
 from fastapi.responses import PlainTextResponse, StreamingResponse, JSONResponse
-from starlette.status import HTTP_200_OK, HTTP_400_BAD_REQUEST
-from app.domain import Tags, OpenAIChatRequest, OpenAIChatResponse, PromptMessage, PromptRole
+from starlette.status import HTTP_200_OK, HTTP_400_BAD_REQUEST, HTTP_500_INTERNAL_SERVER_ERROR
+from app.domain import (
+    Tags,
+    OpenAIChatRequest,
+    OpenAIChatResponse,
+    OpenAIEmbeddingsRequest,
+    OpenAIEmbeddingsResponse,
+    PromptMessage,
+    PromptRole,
+)
 from app.model_services.base import AbstractModelService
 from app.utils import get_settings, get_prompt_from_messages
 from app.api.utils import get_rate_limiter
@@ -21,6 +29,7 @@
 PATH_GENERATE = "/generate"
 PATH_GENERATE_ASYNC = "/stream/generate"
 PATH_OPENAI_COMPLETIONS = "/v1/chat/completions"
+PATH_OPENAI_EMBEDDINGS = "/v1/embeddings"
 
 router = APIRouter()
 config = get_settings()
@@ -134,7 +143,7 @@ async def generate_text_stream(
 
 @router.post(
     PATH_OPENAI_COMPLETIONS,
-    tags=[Tags.Generative.name],
+    tags=[Tags.OpenAICompatible.name],
     response_model=None,
     dependencies=[Depends(cms_globals.props.current_active_user)],
     description="Generate chat response based on messages, similar to OpenAI's /v1/chat/completions",
@@ -162,6 +171,7 @@ def generate_chat_completions(
     """
 
     messages = request_data.messages
+    model = model_service.model_name if request_data.model != model_service.model_name else request_data.model
     stream = request_data.stream
     max_tokens = request_data.max_tokens
     temperature = request_data.temperature
@@ -224,7 +234,7 @@ async def _stream(prompt: str, max_tokens: int, temperature: float) -> AsyncGene
             id=tracking_id,
             object="chat.completion",
             created=int(time.time()),
-            model=model_service.model_name,
+            model=model,
             choices=[
                 {
                     "index": 0,
@@ -239,14 +249,100 @@ async def _stream(prompt: str, max_tokens: int, temperature: float) -> AsyncGene
         return JSONResponse(content=jsonable_encoder(completion), headers={"x-cms-tracking-id": tracking_id})
 
 
+@router.post(
+    PATH_OPENAI_EMBEDDINGS,
+    tags=[Tags.OpenAICompatible.name],
+    response_model=None,
+    dependencies=[Depends(cms_globals.props.current_active_user)],
+    description="Create embeddings based on text(s), similar to OpenAI's /v1/embeddings endpoint",
+)
+def embed_texts(
+    request: Request,
+    request_data: Annotated[OpenAIEmbeddingsRequest, Body(
+        description="Text(s) to be embedded", media_type="application/json"
+    )],
+    tracking_id: Union[str, None] = Depends(validate_tracking_id),
+    model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
+) -> JSONResponse:
+    """
+    Embeds text or a list of texts, mimicking OpenAI's /v1/embeddings endpoint.
+
+    The response always reports the served model's name, whatever name the client requested.
+
+    Args:
+        request (Request): The request object.
+        request_data (OpenAIEmbeddingsRequest): The request data containing model and input text(s).
+        tracking_id (Union[str, None]): An optional tracking ID of the requested task.
+        model_service (AbstractModelService): The model service dependency.
+
+    Returns:
+        JSONResponse: The embeddings on success, or an OpenAI-style error body with status 500
+            when the model service cannot create embeddings or fails while creating them.
+    """
+    tracking_id = tracking_id or str(uuid.uuid4())
+    headers = {"x-cms-tracking-id": tracking_id}
+
+    # The base model service defines create_embeddings and raises NotImplementedError, so an
+    # attribute check alone never fires for it; both cases are reported as "model_not_supported".
+    unsupported_error = {
+        "error": {
+            "message": "Model does not support embeddings",
+            "type": "invalid_request_error",
+            "param": "model",
+            "code": "model_not_supported",
+        }
+    }
+
+    create_embeddings = getattr(model_service, "create_embeddings", None)
+    if create_embeddings is None:
+        return JSONResponse(
+            content=unsupported_error,
+            status_code=HTTP_500_INTERNAL_SERVER_ERROR,
+            headers=headers,
+        )
+
+    input_text = request_data.input
+    input_texts = [input_text] if isinstance(input_text, str) else input_text
+    model = model_service.model_name
+
+    try:
+        embeddings_data = [
+            {"object": "embedding", "embedding": embedding, "index": i}
+            for i, embedding in enumerate(create_embeddings(input_texts))
+        ]
+        response = OpenAIEmbeddingsResponse(object="list", data=embeddings_data, model=model)
+        return JSONResponse(content=jsonable_encoder(response), headers=headers)
+    except NotImplementedError:
+        return JSONResponse(
+            content=unsupported_error,
+            status_code=HTTP_500_INTERNAL_SERVER_ERROR,
+            headers=headers,
+        )
+    except Exception as e:
+        # logger.exception logs the message together with the active traceback in one entry.
+        logger.exception("Failed to create embeddings")
+        error_response = {
+            "error": {
+                "message": f"Failed to create embeddings: {str(e)}",
+                "type": "server_error",
+                "code": "internal_error",
+            }
+        }
+        return JSONResponse(
+            content=error_response,
+            status_code=HTTP_500_INTERNAL_SERVER_ERROR,
+            headers=headers,
+        )
+
+
 def _empty_prompt_error() -> Iterable[str]:
     yield "ERROR: No prompt text provided\n"
 
 
 def _send_usage_metrics(handler: str, prompt_token_num: int, completion_token_num: int) -> None:
     cms_prompt_tokens.labels(handler=handler).observe(prompt_token_num)
-    logger.debug(f"Sent prompt tokens usage: {prompt_token_num}")
+    logger.debug("Sent prompt tokens usage: %s", prompt_token_num)
     cms_completion_tokens.labels(handler=handler).observe(completion_token_num)
-    logger.debug(f"Sent completion tokens usage: {completion_token_num}")
+    logger.debug("Sent completion tokens usage: %s", completion_token_num)
     cms_total_tokens.labels(handler=handler).observe(prompt_token_num + completion_token_num)
-    logger.debug(f"Sent total tokens usage: {prompt_token_num + completion_token_num}")
+    logger.debug("Sent total tokens usage: %s", prompt_token_num + completion_token_num)
diff --git a/app/domain.py b/app/domain.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import List, Optional, Dict, Any
+from typing import List, Optional, Dict, Any, Union
 
 from fastapi import HTTPException
 from starlette.status import HTTP_400_BAD_REQUEST
@@ -27,6 +27,7 @@ class Tags(str, Enum):
     Evaluating = "Evaluate the deployed model with trainer export"
     Authentication = "Authenticate registered users"
     Generative = "Generate text based on the input prompt"
+    OpenAICompatible = "Compatible with OpenAI APIs"
 
 
 class TagsStreamable(str, Enum):
@@ -185,6 +186,7 @@ class OpenAIChatRequest(BaseModel):
     messages: List[PromptMessage] = Field(..., description="A list of messages to be sent to the model")
     stream: bool = Field(..., description="Whether to stream the response")
     max_tokens: int = Field(512, description="The maximum number of tokens to generate", gt=0)
+    model: str = Field(..., description="The name of the model used for generating the completion")
     temperature: float = Field(0.7, description="The temperature of the generated text", ge=0.0, le=1.0)
 
 
@@ -194,3 +196,14 @@ class OpenAIChatResponse(BaseModel):
     created: int = Field(..., description="The timestamp when the completion was generated")
     model: str = Field(..., description="The name of the model used for generating the completion")
     choices: List = Field(..., description="The generated messages and their metadata")
+
+
+class OpenAIEmbeddingsRequest(BaseModel):
+    input: Union[str, List[str]] = Field(..., description="Input text or list of texts to embed")
+    model: str = Field(..., description="The name of the model used for creating the embeddings")
+
+
+class OpenAIEmbeddingsResponse(BaseModel):
+    object: str = Field(..., description="The type of the response")
+    data: List[Dict[str, Any]] = Field(..., description="List of embedding objects")
+    model: str = Field(..., description="The name of the model used for creating the embeddings")
diff --git a/app/model_services/base.py b/app/model_services/base.py
@@ -1,6 +1,6 @@
 import asyncio
 from abc import ABC, abstractmethod
-from typing import Any, List, Iterable, Tuple, final, Optional, Generic, TypeVar, Protocol, AsyncIterable
+from typing import Any, List, Iterable, Tuple, final, Optional, Generic, TypeVar, Protocol, AsyncIterable, Union
 from app.config import Settings
 from app.domain import ModelCard, Annotation
 
@@ -17,7 +17,7 @@ def tracker_client(self) -> Any:
 T = TypeVar("T", bound=_TrainerCommon)
 
 class AbstractModelService(ABC, Generic[T]):
-    """An abstract base class defining the common interface for all model services."""
+    """An abstract base class defining the common interface for NER model services."""
 
     @abstractmethod
     def __init__(self, config: Settings, *args: Any, **kwargs: Any) -> None:
@@ -200,6 +200,29 @@ def generate_async(self, prompt: str, *args: Any, **kwargs: Any) -> AsyncIterabl
 
         raise NotImplementedError
 
+    def create_embeddings(
+        self,
+        text: Union[str, List[str]],
+        *args: Any,
+        **kwargs: Any
+    ) -> Union[List[float], List[List[float]]]:
+        """
+        Creates embeddings for a given text or list of texts (optional capability, not abstract).
+
+        Args:
+            text (Union[str, List[str]]): The text(s) to be embedded.
+            *args (Any): Additional positional arguments to be passed to this method.
+            **kwargs (Any): Additional keyword arguments to be passed to this method.
+
+        Returns:
+            Union[List[float], List[List[float]]]: The embedding vector(s) for the text(s).
+
+        Raises:
+            NotImplementedError: Always here; subclasses supporting embeddings should override it.
+        """
+
+        raise NotImplementedError
+
     def train_supervised(self, *args: Any, **kwargs: Any) -> Tuple[bool, str, str]:
         """
         Initiates supervised training on the model.
diff --git a/app/model_services/huggingface_llm_model.py b/app/model_services/huggingface_llm_model.py
@@ -1,8 +1,9 @@
 import os
 import logging
 import asyncio
+import torch
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, List, Optional, Tuple, Any, AsyncIterable, Callable
+from typing import Dict, List, Optional, Tuple, Any, AsyncIterable, Callable, Union
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -307,3 +308,50 @@ async def generate_async(
             return
         finally:
             logger.debug("Chat response generation completed")
+
+    def create_embeddings(
+        self,
+        text: Union[str, List[str]],
+        *args: Any,
+        **kwargs: Any
+    ) -> Union[List[float], List[List[float]]]:
+        """
+        Creates embeddings by mean-pooling the model's last hidden states over non-padding tokens.
+
+        Args:
+            text (Union[str, List[str]]): The text(s) to be embedded.
+            *args (Any): Additional positional arguments (currently unused).
+            **kwargs (Any): Additional keyword arguments (currently unused).
+
+        Returns:
+            Union[List[float], List[List[float]]]: One vector for a single string, otherwise one
+                vector per input text. A text that yields no tokens gets an all-zero vector.
+        """
+
+        self.model.eval()
+
+        # NOTE(review): padding=True needs the tokenizer to define a padding token - confirm at load.
+        inputs = self.tokenizer(
+            text,
+            add_special_tokens=False,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+        )
+
+        # Use the same configured device for the availability check and for the transfer.
+        if non_default_device_is_available(self._config.DEVICE):
+            inputs.to(self._config.DEVICE)
+
+        with torch.no_grad():
+            outputs = self.model(**inputs, output_hidden_states=True)
+
+        last_hidden_state = outputs.hidden_states[-1]
+        attention_mask = inputs["attention_mask"]
+        sum_hidden_states = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1)
+        # Clamp so a text with no tokens averages to zeros instead of 0/0 = NaN (invalid in JSON).
+        num_tokens = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
+        embeddings = sum_hidden_states / num_tokens
+
+        results = embeddings.cpu().numpy().tolist()
+        return results[0] if isinstance(text, str) else results
diff --git a/tests/app/api/test_serving_hf_llm.py b/tests/app/api/test_serving_hf_llm.py
@@ -31,7 +31,9 @@ def llm_app(llm_model_service):
 
 @pytest.fixture(scope="function")
 def client(llm_model_service):
+    llm_model_service.model_name = "HuggingFace LLM model"
     llm_model_service.generate.return_value = "Yeah."
+    llm_model_service.create_embeddings.return_value = [[1.0, 2.0, 3.0]]
     app = get_generative_server(config, msd_overwritten=lambda: llm_model_service)
     app.dependency_overrides[cms_globals.props.current_active_user] = lambda: None
     client = TestClient(app)
@@ -82,6 +84,7 @@ async def test_generate_chat_completions(llm_model_service, llm_app):
           "content": "Who are you?"
         }
       ],
+      "model": "HuggingFace LLM model",
       "stream": True,
       "max_tokens": 128,
       "temperature": 0.7
@@ -98,3 +101,22 @@ async def test_generate_chat_completions(llm_model_service, llm_app):
     assert response.text.startswith("data:")
     assert "id" in response.text
     assert "chat.completion.chunk" in response.text
+
+
+def test_create_embeddings(client):
+    # The client fixture stubs create_embeddings to return [[1.0, 2.0, 3.0]].
+    payload = json.dumps({"input": ["Alright"], "model": "HuggingFace LLM model"})
+    expected_body = {
+        "object": "list",
+        "data": [{"object": "embedding", "embedding": [1.0, 2.0, 3.0], "index": 0}],
+        "model": "HuggingFace LLM model",
+    }
+    resp = client.post(
+        "/v1/embeddings",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+    )
+
+    assert resp.status_code == 200
+    assert resp.headers["content-type"] == "application/json"
+    assert resp.json() == expected_body
diff --git a/tests/app/model_services/test_huggingface_llm_model.py b/tests/app/model_services/test_huggingface_llm_model.py