
Commit 54fe4a8

Feature: Adds support for OpenAISpec in litgpt serve (#1943)
1 parent 552ac10 commit 54fe4a8

File tree (3 files changed, +197 -32 lines):

  litgpt/deploy/serve.py
  pyproject.toml
  tests/test_serve.py

litgpt/deploy/serve.py

Lines changed: 71 additions & 32 deletions
@@ -1,4 +1,5 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+import json
 import sys
 from pathlib import Path
 from pprint import pprint
@@ -11,8 +12,10 @@
 from litgpt.utils import auto_download_checkpoint
 
 _LITSERVE_AVAILABLE = RequirementCache("litserve")
+_JINJA2_AVAILABLE = RequirementCache("jinja2")
 if _LITSERVE_AVAILABLE:
     from litserve import LitAPI, LitServer
+    from litserve.specs.openai import ChatCompletionRequest, OpenAISpec
 else:
     LitAPI, LitServer = object, object
 
@@ -129,6 +132,55 @@ def encode_response(self, output):
             yield {"output": out}
 
 
+class OpenAISpecLitAPI(BaseLitAPI):
+    def __init__(
+        self,
+        checkpoint_dir: Path,
+        quantize: Optional[str] = None,
+        precision: Optional[str] = None,
+        temperature: float = 0.8,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        max_new_tokens: int = 50,
+        devices: int = 1,
+    ):
+        super().__init__(checkpoint_dir, quantize, precision, temperature, top_k, top_p, max_new_tokens, devices)
+
+    def setup(self, device: str):
+        super().setup(device)
+        if not _JINJA2_AVAILABLE:
+            raise ImportError(str(_JINJA2_AVAILABLE))
+        from jinja2 import Template
+
+        config_path = self.checkpoint_dir / "tokenizer_config.json"
+        if not config_path.is_file():
+            raise FileNotFoundError(f"Tokenizer config file not found at {config_path}")
+
+        with open(config_path, encoding="utf-8") as fp:
+            config = json.load(fp)
+        chat_template = config.get("chat_template", None)
+        if chat_template is None:
+            raise ValueError("chat_template not found in tokenizer config file.")
+        self.chat_template = chat_template
+
+        self.template = Template(self.chat_template)
+
+    def decode_request(self, request: "ChatCompletionRequest") -> Any:
+        # Apply chat template to request messages
+        return self.template.render(messages=request.messages)
+
+    def predict(self, inputs: str, context: dict) -> Any:
+        # Extract parameters from context with fallback to instance attributes
+        temperature = context.get("temperature") or self.temperature
+        top_p = context.get("top_p", self.top_p) or self.top_p
+        max_new_tokens = context.get("max_completion_tokens") or self.max_new_tokens
+
+        # Run the model on the input and return the output.
+        yield from self.llm.generate(
+            inputs, temperature=temperature, top_k=self.top_k, top_p=top_p, max_new_tokens=max_new_tokens, stream=True
+        )
+
+
 def run_server(
     checkpoint_dir: Path,
     quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None,
@@ -141,6 +193,7 @@ def run_server(
     accelerator: str = "auto",
     port: int = 8000,
     stream: bool = False,
+    openai_spec: bool = False,
     access_token: Optional[str] = None,
 ) -> None:
     """Serve a LitGPT model using LitServe.
@@ -179,42 +232,28 @@ def run_server(
             The "auto" setting (default) chooses a GPU if available, and otherwise uses a CPU.
         port: The network port number on which the model is configured to be served.
         stream: Whether to stream the responses.
+        openai_spec: Whether to use the OpenAISpec.
         access_token: Optional API token to access models with restrictions.
     """
     checkpoint_dir = auto_download_checkpoint(model_name=checkpoint_dir, access_token=access_token)
     pprint(locals())
 
-    if not stream:
-        server = LitServer(
-            SimpleLitAPI(
-                checkpoint_dir=checkpoint_dir,
-                quantize=quantize,
-                precision=precision,
-                temperature=temperature,
-                top_k=top_k,
-                top_p=top_p,
-                max_new_tokens=max_new_tokens,
-                devices=devices,
-            ),
-            accelerator=accelerator,
-            devices=1,  # We need to use the devives inside the `SimpleLitAPI` class
-        )
-
-    else:
-        server = LitServer(
-            StreamLitAPI(
-                checkpoint_dir=checkpoint_dir,
-                quantize=quantize,
-                precision=precision,
-                temperature=temperature,
-                top_k=top_k,
-                top_p=top_p,
-                max_new_tokens=max_new_tokens,
-                devices=devices,  # We need to use the devives inside the `StreamLitAPI` class
-            ),
-            accelerator=accelerator,
-            devices=1,
-            stream=True,
-        )
+    api_class = OpenAISpecLitAPI if openai_spec else StreamLitAPI if stream else SimpleLitAPI
+    server = LitServer(
+        api_class(
+            checkpoint_dir=checkpoint_dir,
+            quantize=quantize,
+            precision=precision,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            max_new_tokens=max_new_tokens,
+            devices=devices,
+        ),
+        spec=OpenAISpec() if openai_spec else None,
+        accelerator=accelerator,
+        devices=1,
+        stream=stream,
+    )
 
     server.run(port=port, generate_client_file=False)
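
Note: the new OpenAISpecLitAPI builds the prompt by rendering the tokenizer's chat_template with jinja2 inside decode_request(), then streams tokens from self.llm.generate() with temperature, top_p, and max_completion_tokens taken from the request context (falling back to the values given at server start). A minimal standalone sketch of the rendering step follows; the template string here is a hypothetical stand-in for the "chat_template" entry in tokenizer_config.json, not the template of any particular checkpoint.

# Standalone sketch of the chat-template rendering done in decode_request().
# Assumes jinja2 is installed; the template string is a made-up example.
from jinja2 import Template

chat_template = (
    "{% for message in messages %}"
    "<|{{ message['role'] }}|>\n{{ message['content'] }}\n"
    "{% endfor %}"
    "<|assistant|>\n"
)

template = Template(chat_template)
prompt = template.render(messages=[{"role": "user", "content": "Hello!"}])
print(prompt)
# <|user|>
# Hello!
# <|assistant|>

From the CLI, the new path is enabled with "litgpt serve <checkpoint_dir> --openai_spec true", as exercised by the tests added below.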

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -75,6 +75,7 @@ optional-dependencies.extra = [
 optional-dependencies.test = [
   "einops>=0.7",
   "protobuf>=4.23.4",
+  "pydantic>=2.11",
   "pytest>=8.1.1",
   "pytest-benchmark>=5.1",
   "pytest-dependency>=0.6",

tests/test_serve.py

Lines changed: 125 additions & 0 deletions
@@ -1,4 +1,5 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+import json
 import platform
 import shutil
 import subprocess
@@ -136,3 +137,127 @@ def run_server():
         if process:
             kill_process_tree(process.pid)
         server_thread.join()
+
+
+@_RunIf(min_cuda_gpus=1)
+def test_serve_with_openai_spec_missing_chat_template(tmp_path):
+    seed_everything(123)
+    ours_config = Config.from_name("pythia-14m")
+    download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path)
+    shutil.move(str(tmp_path / "EleutherAI" / "pythia-14m" / "tokenizer.json"), str(tmp_path))
+    shutil.move(str(tmp_path / "EleutherAI" / "pythia-14m" / "tokenizer_config.json"), str(tmp_path))
+    ours_model = GPT(ours_config)
+    checkpoint_path = tmp_path / "lit_model.pth"
+    torch.save(ours_model.state_dict(), checkpoint_path)
+    config_path = tmp_path / "model_config.yaml"
+    with open(config_path, "w", encoding="utf-8") as fp:
+        yaml.dump(asdict(ours_config), fp)
+
+    run_command = ["litgpt", "serve", tmp_path, "--openai_spec", "true"]
+
+    process = None
+
+    def run_server():
+        nonlocal process
+        try:
+            process = subprocess.Popen(run_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        except subprocess.TimeoutExpired:
+            print("Server start-up timeout expired")
+            return None, None
+
+    server_thread = threading.Thread(target=run_server)
+    server_thread.start()
+
+    time.sleep(30)  # Give the server some time to start and raise the error
+
+    try:
+        stdout = process.stdout.read().strip() if process.stdout else ""
+        stderr = process.stderr.read().strip() if process.stderr else ""
+        output = (stdout or "") + (stderr or "")
+        assert "ValueError: chat_template not found in tokenizer config file." in output, (
+            "Expected ValueError for missing chat_template not found."
+        )
+    finally:
+        if process:
+            kill_process_tree(process.pid)
+        server_thread.join()
+
+
+@_RunIf(min_cuda_gpus=1)
+def test_serve_with_openai_spec(tmp_path):
+    seed_everything(123)
+    ours_config = Config.from_name("SmolLM2-135M-Instruct")
+    download_from_hub(repo_id="HuggingFaceTB/SmolLM2-135M-Instruct", tokenizer_only=True, checkpoint_dir=tmp_path)
+    shutil.move(str(tmp_path / "HuggingFaceTB" / "SmolLM2-135M-Instruct" / "tokenizer.json"), str(tmp_path))
+    shutil.move(str(tmp_path / "HuggingFaceTB" / "SmolLM2-135M-Instruct" / "tokenizer_config.json"), str(tmp_path))
+    ours_model = GPT(ours_config)
+    checkpoint_path = tmp_path / "lit_model.pth"
+    torch.save(ours_model.state_dict(), checkpoint_path)
+    config_path = tmp_path / "model_config.yaml"
+    with open(config_path, "w", encoding="utf-8") as fp:
+        yaml.dump(asdict(ours_config), fp)
+
+    run_command = ["litgpt", "serve", tmp_path, "--openai_spec", "true"]
+
+    process = None
+
+    def run_server():
+        nonlocal process
+        try:
+            process = subprocess.Popen(run_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        except subprocess.TimeoutExpired:
+            print("Server start-up timeout expired")
+
+    server_thread = threading.Thread(target=run_server)
+    server_thread.start()
+
+    _wait_and_check_response()
+
+    try:
+        # Test server health
+        response = requests.get("http://127.0.0.1:8000/health")
+        assert response.status_code == 200, f"Server health check failed with status code {response.status_code}"
+        assert response.text == "ok", "Server did not respond as expected."
+
+        # Test non-streaming chat completion
+        response = requests.post(
+            "http://127.0.0.1:8000/v1/chat/completions",
+            json={
+                "model": "SmolLM2-135M-Instruct",
+                "messages": [{"role": "user", "content": "Hello!"}],
+            },
+        )
+        assert response.status_code == 200, (
+            f"Non-streaming chat completion failed with status code {response.status_code}"
+        )
+        response_json = response.json()
+        assert "choices" in response_json, "Response JSON does not contain 'choices'."
+        assert "message" in response_json["choices"][0], "Response JSON does not contain 'message' in 'choices'."
+        assert "content" in response_json["choices"][0]["message"], (
+            "Response JSON does not contain 'content' in 'message'."
+        )
+        assert response_json["choices"][0]["message"]["content"], "Content is empty in the response."
+
+        # Test streaming chat completion
+        stream_response = requests.post(
+            "http://127.0.0.1:8000/v1/chat/completions",
+            json={
+                "model": "SmolLM2-135M-Instruct",
+                "messages": [{"role": "user", "content": "Hello!"}],
+                "stream": True,
+            },
+        )
+        assert stream_response.status_code == 200, (
+            f"Streaming chat completion failed with status code {stream_response.status_code}"
+        )
+        for line in stream_response.iter_lines():
+            decoded = line.decode("utf-8").replace("data: ", "").replace("[DONE]", "").strip()
+            if decoded:
+                data = json.loads(decoded)
+                assert "choices" in data, "Response JSON does not contain 'choices'."
+                assert "delta" in data["choices"][0], "Response JSON does not contain 'delta' in 'choices'."
+                assert "content" in data["choices"][0]["delta"], "Response JSON does not contain 'content' in 'delta'."
+    finally:
+        if process:
+            kill_process_tree(process.pid)
+        server_thread.join()
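
Note: the tests above exercise the OpenAI-compatible /v1/chat/completions route with plain requests calls. Because the route follows the OpenAI chat-completions schema, a standard OpenAI client should also be able to talk to the server; below is a minimal sketch assuming the server was started with --openai_spec true on the default port 8000 (the base_url, the placeholder api_key, and the use of the openai package are illustrative assumptions, not part of this commit).

# Sketch: querying the litgpt server through the official `openai` client.
# Assumes a local server started with --openai_spec true on port 8000;
# base_url and the placeholder api_key are assumptions for illustration.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="not-needed")

completion = client.chat.completions.create(
    model="SmolLM2-135M-Instruct",  # model name as used in the test above
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)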
