
Commit 88eec24

fix: enhance llm-katan OpenAI API compatibility for issue #241 (#354)
* fix: enhance llm-katan OpenAI API compatibility for issue #241

  - Add missing OpenAI API response fields (system_fingerprint, logprobs, detailed usage)
  - Fix streaming response Content-Type from text/plain to text/event-stream
  - Ensure both static and streaming responses include all compatibility fields
  - Add token_usage alias for better SDK compatibility
  - Apply fixes to both TransformersBackend and VLLMBackend

  Resolves OpenWebUI hanging issue when connecting to llm-katan endpoints.

  Signed-off-by: Yossi Ovadia <[email protected]>

* bump llm-katan version to 0.1.9 for PyPI release

  Published llm-katan v0.1.9 to PyPI with OpenAI API compatibility fixes.

  Signed-off-by: Yossi Ovadia <[email protected]>

* chore: trigger CI re-run to check pre-commit status

  Trigger CI re-run to verify if Black formatting issues are resolved.

  Signed-off-by: Yossi Ovadia <[email protected]>

* trigger pre-commit formatting fix

  Signed-off-by: Yossi Ovadia <[email protected]>

* fix: apply black formatting to llm-katan Python files

  Signed-off-by: Yossi Ovadia <[email protected]>

* fix: apply black formatting to llm-katan Python files for CI compliance

  Signed-off-by: Yossi Ovadia <[email protected]>

---------

Signed-off-by: Yossi Ovadia <[email protected]>
1 parent 02fd9d8 commit 88eec24
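
The response-field changes described above can be sanity-checked from the client side. The sketch below is a minimal, hypothetical check against a locally running llm-katan instance; the base URL, port, and served model name are assumptions and not part of this commit, and it assumes the standard /v1/chat/completions route.

# Hedged sketch: verify the fields this commit adds to non-streaming responses.
# The base URL and model name below are assumptions, not taken from the commit.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # hypothetical llm-katan address
    json={
        "model": "my-served-model",  # hypothetical served model name
        "messages": [{"role": "user", "content": "hello"}],
    },
    timeout=30,
)
data = resp.json()

assert "system_fingerprint" in data
assert data["choices"][0]["logprobs"] is None
assert "prompt_tokens_details" in data["usage"]
assert "completion_tokens_details" in data["usage"]
assert data["token_usage"] == data["usage"]  # alias added for SDK compatibility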

File tree: 3 files changed (+49, -5 lines)


e2e-tests/llm-katan/llm_katan/model.py

Lines changed: 41 additions & 2 deletions
@@ -136,20 +136,27 @@ async def generate(
             "object": "chat.completion",
             "created": int(time.time()),
             "model": self.config.served_model_name,
+            "system_fingerprint": "llm-katan-transformers",
             "choices": [
                 {
                     "index": 0,
                     "message": {"role": "assistant", "content": generated_text},
+                    "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
             "usage": {
                 "prompt_tokens": prompt_tokens,
                 "completion_tokens": completion_tokens,
                 "total_tokens": total_tokens,
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
             },
         }
 
+        # Add token_usage as alias for better SDK compatibility
+        response_data["token_usage"] = response_data["usage"]
+
         if stream:
             # For streaming, yield chunks
             words = generated_text.split()
@@ -159,12 +166,14 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-transformers",
                     "choices": [
                         {
                             "index": 0,
                             "delta": {
                                 "content": word + " " if i < len(words) - 1 else word
                             },
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -178,7 +187,17 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-transformers",
+                "choices": [
+                    {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}
+                ],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
@@ -295,10 +314,12 @@ async def generate(
             "object": "chat.completion",
             "created": int(time.time()),
             "model": self.config.served_model_name,
+            "system_fingerprint": "llm-katan-vllm",
             "choices": [
                 {
                     "index": 0,
                     "message": {"role": "assistant", "content": generated_text},
+                    "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
@@ -307,9 +328,14 @@ async def generate(
                 "completion_tokens": len(output.outputs[0].token_ids),
                 "total_tokens": len(output.prompt_token_ids)
                 + len(output.outputs[0].token_ids),
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
             },
         }
 
+        # Add token_usage as alias for better SDK compatibility
+        response_data["token_usage"] = response_data["usage"]
+
         if stream:
             # For streaming, yield chunks (simplified for now)
             words = generated_text.split()
@@ -319,12 +345,14 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-vllm",
                     "choices": [
                         {
                             "index": 0,
                             "delta": {
                                 "content": word + " " if i < len(words) - 1 else word
                             },
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -338,7 +366,18 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-vllm",
+                "choices": [
+                    {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}
+                ],
+                "usage": {
+                    "prompt_tokens": len(output.prompt_token_ids),
+                    "completion_tokens": len(output.outputs[0].token_ids),
+                    "total_tokens": len(output.prompt_token_ids)
+                    + len(output.outputs[0].token_ids),
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
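
Since both backends now publish the usage block under "usage" and under the "token_usage" alias, a client can read token counts from whichever key it expects. A small hedged helper, illustrative only and not part of this commit:

# Illustrative helper, not part of this commit: read token counts from either
# the standard "usage" key or the "token_usage" alias added above.
def read_usage(data: dict) -> tuple[int, int]:
    usage = data.get("usage") or data.get("token_usage") or {}
    total = usage.get("total_tokens", 0)
    cached = usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
    return total, cached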

e2e-tests/llm-katan/llm_katan/server.py

Lines changed: 7 additions & 2 deletions
@@ -181,8 +181,13 @@ async def generate_stream():
 
         return StreamingResponse(
             generate_stream(),
-            media_type="text/plain",
-            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "Access-Control-Allow-Origin": "*",
+                "Access-Control-Allow-Headers": "Content-Type",
+            },
         )
     else:
         # Non-streaming response
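
The media_type change above is the part the commit message credits with fixing the OpenWebUI hang: SSE clients expect a text/event-stream Content-Type before they start processing data lines. A minimal, hypothetical consumer is sketched below; the URL and model name are assumptions, and it assumes the server emits standard OpenAI-style "data:" lines terminated by "[DONE]".

# Hedged sketch of consuming the stream; the URL, model name, and the
# "data: ... [DONE]" framing are assumptions about the server's output.
import json
import requests

with requests.post(
    "http://localhost:8000/v1/chat/completions",  # hypothetical address
    json={
        "model": "my-served-model",  # hypothetical served model name
        "messages": [{"role": "user", "content": "hello"}],
        "stream": True,
    },
    stream=True,
    timeout=30,
) as resp:
    assert resp.headers["Content-Type"].startswith("text/event-stream")
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        chunk = json.loads(payload)
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)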

e2e-tests/llm-katan/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "llm-katan"
-version = "0.1.8"
+version = "0.1.9"
 description = "LLM Katan - Lightweight LLM Server for Testing - Real tiny models with FastAPI and HuggingFace"
 readme = "README.md"
 authors = [
