From a6b308c2c81593702683aea6bb94aa60568d4c09 Mon Sep 17 00:00:00 2001 From: Yossi Ovadia Date: Mon, 6 Oct 2025 12:36:09 -0700 Subject: [PATCH 1/6] fix: enhance llm-katan OpenAI API compatibility for issue #241 - Add missing OpenAI API response fields (system_fingerprint, logprobs, detailed usage) - Fix streaming response Content-Type from text/plain to text/event-stream - Ensure both static and streaming responses include all compatibility fields - Add token_usage alias for better SDK compatibility - Apply fixes to both TransformersBackend and VLLMBackend Resolves OpenWebUI hanging issue when connecting to llm-katan endpoints. Signed-off-by: Yossi Ovadia --- e2e-tests/llm-katan/llm_katan/model.py | 61 +++++++++++++++++-------- e2e-tests/llm-katan/llm_katan/server.py | 17 +++---- 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/e2e-tests/llm-katan/llm_katan/model.py b/e2e-tests/llm-katan/llm_katan/model.py index 27d42ebc..2ccc4fa0 100644 --- a/e2e-tests/llm-katan/llm_katan/model.py +++ b/e2e-tests/llm-katan/llm_katan/model.py @@ -103,9 +103,7 @@ async def generate( raise RuntimeError("Model not loaded. Call load_model() first.") max_tokens = max_tokens or self.config.max_tokens - temperature = ( - temperature if temperature is not None else self.config.temperature - ) + temperature = temperature if temperature is not None else self.config.temperature # Convert messages to prompt prompt = self._messages_to_prompt(messages) @@ -136,10 +134,12 @@ async def generate( "object": "chat.completion", "created": int(time.time()), "model": self.config.served_model_name, + "system_fingerprint": "llm-katan-transformers", "choices": [ { "index": 0, "message": {"role": "assistant", "content": generated_text}, + "logprobs": None, "finish_reason": "stop", } ], @@ -147,9 +147,14 @@ async def generate( "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens, + "prompt_tokens_details": {"cached_tokens": 0}, + "completion_tokens_details": {"reasoning_tokens": 0}, }, } + # Add token_usage as alias for better SDK compatibility + response_data["token_usage"] = response_data["usage"] + if stream: # For streaming, yield chunks words = generated_text.split() @@ -159,12 +164,12 @@ async def generate( "object": "chat.completion.chunk", "created": response_data["created"], "model": self.config.served_model_name, + "system_fingerprint": "llm-katan-transformers", "choices": [ { "index": 0, - "delta": { - "content": word + " " if i < len(words) - 1 else word - }, + "delta": {"content": word + " " if i < len(words) - 1 else word}, + "logprobs": None, "finish_reason": None, } ], @@ -178,7 +183,15 @@ async def generate( "object": "chat.completion.chunk", "created": response_data["created"], "model": self.config.served_model_name, - "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], + "system_fingerprint": "llm-katan-transformers", + "choices": [{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "prompt_tokens_details": {"cached_tokens": 0}, + "completion_tokens_details": {"reasoning_tokens": 0}, + }, } yield final_chunk else: @@ -268,9 +281,7 @@ async def generate( from vllm.sampling_params import SamplingParams max_tokens = max_tokens or self.config.max_tokens - temperature = ( - temperature if temperature is not None else self.config.temperature - ) + temperature = temperature if temperature is not None else 
self.config.temperature # Convert messages to prompt prompt = self._messages_to_prompt(messages) @@ -282,9 +293,7 @@ async def generate( # Generate loop = asyncio.get_event_loop() - outputs = await loop.run_in_executor( - None, self.engine.generate, [prompt], sampling_params - ) + outputs = await loop.run_in_executor(None, self.engine.generate, [prompt], sampling_params) output = outputs[0] generated_text = output.outputs[0].text.strip() @@ -295,21 +304,27 @@ async def generate( "object": "chat.completion", "created": int(time.time()), "model": self.config.served_model_name, + "system_fingerprint": "llm-katan-vllm", "choices": [ { "index": 0, "message": {"role": "assistant", "content": generated_text}, + "logprobs": None, "finish_reason": "stop", } ], "usage": { "prompt_tokens": len(output.prompt_token_ids), "completion_tokens": len(output.outputs[0].token_ids), - "total_tokens": len(output.prompt_token_ids) - + len(output.outputs[0].token_ids), + "total_tokens": len(output.prompt_token_ids) + len(output.outputs[0].token_ids), + "prompt_tokens_details": {"cached_tokens": 0}, + "completion_tokens_details": {"reasoning_tokens": 0}, }, } + # Add token_usage as alias for better SDK compatibility + response_data["token_usage"] = response_data["usage"] + if stream: # For streaming, yield chunks (simplified for now) words = generated_text.split() @@ -319,12 +334,12 @@ async def generate( "object": "chat.completion.chunk", "created": response_data["created"], "model": self.config.served_model_name, + "system_fingerprint": "llm-katan-vllm", "choices": [ { "index": 0, - "delta": { - "content": word + " " if i < len(words) - 1 else word - }, + "delta": {"content": word + " " if i < len(words) - 1 else word}, + "logprobs": None, "finish_reason": None, } ], @@ -338,7 +353,15 @@ async def generate( "object": "chat.completion.chunk", "created": response_data["created"], "model": self.config.served_model_name, - "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], + "system_fingerprint": "llm-katan-vllm", + "choices": [{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}], + "usage": { + "prompt_tokens": len(output.prompt_token_ids), + "completion_tokens": len(output.outputs[0].token_ids), + "total_tokens": len(output.prompt_token_ids) + len(output.outputs[0].token_ids), + "prompt_tokens_details": {"cached_tokens": 0}, + "completion_tokens_details": {"reasoning_tokens": 0}, + }, } yield final_chunk else: diff --git a/e2e-tests/llm-katan/llm_katan/server.py b/e2e-tests/llm-katan/llm_katan/server.py index f96b748a..a2c8d625 100644 --- a/e2e-tests/llm-katan/llm_katan/server.py +++ b/e2e-tests/llm-katan/llm_katan/server.py @@ -160,9 +160,7 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request try: # Convert messages to dict format - messages = [ - {"role": msg.role, "content": msg.content} for msg in request.messages - ] + messages = [{"role": msg.role, "content": msg.content} for msg in request.messages] # Update metrics metrics["total_requests"] += 1 @@ -181,8 +179,13 @@ async def generate_stream(): return StreamingResponse( generate_stream(), - media_type="text/plain", - headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}, + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "Access-Control-Allow-Origin": "*", + "Access-Control-Allow-Headers": "Content-Type", + }, ) else: # Non-streaming response @@ -198,9 +201,7 @@ async def generate_stream(): response_time = time.time() - 
start_time metrics["response_times"].append(response_time) if "choices" in response and response["choices"]: - generated_text = ( - response["choices"][0].get("message", {}).get("content", "") - ) + generated_text = response["choices"][0].get("message", {}).get("content", "") token_count = len(generated_text.split()) # Rough token estimate metrics["total_tokens_generated"] += token_count From 415346ad85af60c6a97031aa65d288f4e38a7799 Mon Sep 17 00:00:00 2001 From: Yossi Ovadia Date: Mon, 6 Oct 2025 13:27:39 -0700 Subject: [PATCH 2/6] bump llm-katan version to 0.1.9 for PyPI release Published llm-katan v0.1.9 to PyPI with OpenAI API compatibility fixes. Signed-off-by: Yossi Ovadia --- e2e-tests/llm-katan/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e-tests/llm-katan/pyproject.toml b/e2e-tests/llm-katan/pyproject.toml index a33a835f..f9708ec4 100644 --- a/e2e-tests/llm-katan/pyproject.toml +++ b/e2e-tests/llm-katan/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llm-katan" -version = "0.1.8" +version = "0.1.9" description = "LLM Katan - Lightweight LLM Server for Testing - Real tiny models with FastAPI and HuggingFace" readme = "README.md" authors = [ From 29ffb27daf7af5d9ba169db7cae03cf4d5c33ffc Mon Sep 17 00:00:00 2001 From: Yossi Ovadia Date: Mon, 6 Oct 2025 14:48:31 -0700 Subject: [PATCH 3/6] chore: trigger CI re-run to check pre-commit status Trigger CI re-run to verify if Black formatting issues are resolved. Signed-off-by: Yossi Ovadia From d3dcf384dffb294e978f7f9e6e138261fef60ab0 Mon Sep 17 00:00:00 2001 From: Yossi Ovadia Date: Mon, 6 Oct 2025 20:57:21 -0700 Subject: [PATCH 4/6] trigger pre-commit formatting fix Signed-off-by: Yossi Ovadia From 6886362a67467df6defbdf6ddf12fd74e37d3659 Mon Sep 17 00:00:00 2001 From: Yossi Ovadia Date: Mon, 6 Oct 2025 21:01:59 -0700 Subject: [PATCH 5/6] fix: apply black formatting to llm-katan Python files Signed-off-by: Yossi Ovadia --- e2e-tests/llm-katan/llm_katan/cli.py | 3 +-- e2e-tests/llm-katan/llm_katan/config.py | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/e2e-tests/llm-katan/llm_katan/cli.py b/e2e-tests/llm-katan/llm_katan/cli.py index 2ee48e7e..13cc6626 100644 --- a/e2e-tests/llm-katan/llm_katan/cli.py +++ b/e2e-tests/llm-katan/llm_katan/cli.py @@ -161,8 +161,7 @@ def main( import transformers # noqa: F401 except ImportError: click.echo( - "❌ Required dependencies missing. " - "Install with: pip install transformers torch", + "❌ Required dependencies missing. " "Install with: pip install transformers torch", err=True, ) sys.exit(1) diff --git a/e2e-tests/llm-katan/llm_katan/config.py b/e2e-tests/llm-katan/llm_katan/config.py index bfebbe90..e5bb3301 100644 --- a/e2e-tests/llm-katan/llm_katan/config.py +++ b/e2e-tests/llm-katan/llm_katan/config.py @@ -36,9 +36,7 @@ def __post_init__(self): # Validate backend if self.backend not in ["transformers", "vllm"]: - raise ValueError( - f"Invalid backend: {self.backend}. Must be 'transformers' or 'vllm'" - ) + raise ValueError(f"Invalid backend: {self.backend}. 
Must be 'transformers' or 'vllm'") @property def device_auto(self) -> str: From c5968f09d3ce90fdc66f7c7575a6f5737ba4f085 Mon Sep 17 00:00:00 2001 From: Yossi Ovadia Date: Mon, 6 Oct 2025 21:17:40 -0700 Subject: [PATCH 6/6] fix: apply black formatting to llm-katan Python files for CI compliance Signed-off-by: Yossi Ovadia --- e2e-tests/llm-katan/llm_katan/cli.py | 3 ++- e2e-tests/llm-katan/llm_katan/config.py | 4 ++- e2e-tests/llm-katan/llm_katan/model.py | 34 ++++++++++++++++++------- e2e-tests/llm-katan/llm_katan/server.py | 8 ++++-- 4 files changed, 36 insertions(+), 13 deletions(-) diff --git a/e2e-tests/llm-katan/llm_katan/cli.py b/e2e-tests/llm-katan/llm_katan/cli.py index 13cc6626..2ee48e7e 100644 --- a/e2e-tests/llm-katan/llm_katan/cli.py +++ b/e2e-tests/llm-katan/llm_katan/cli.py @@ -161,7 +161,8 @@ def main( import transformers # noqa: F401 except ImportError: click.echo( - "❌ Required dependencies missing. " "Install with: pip install transformers torch", + "❌ Required dependencies missing. " + "Install with: pip install transformers torch", err=True, ) sys.exit(1) diff --git a/e2e-tests/llm-katan/llm_katan/config.py b/e2e-tests/llm-katan/llm_katan/config.py index e5bb3301..bfebbe90 100644 --- a/e2e-tests/llm-katan/llm_katan/config.py +++ b/e2e-tests/llm-katan/llm_katan/config.py @@ -36,7 +36,9 @@ def __post_init__(self): # Validate backend if self.backend not in ["transformers", "vllm"]: - raise ValueError(f"Invalid backend: {self.backend}. Must be 'transformers' or 'vllm'") + raise ValueError( + f"Invalid backend: {self.backend}. Must be 'transformers' or 'vllm'" + ) @property def device_auto(self) -> str: diff --git a/e2e-tests/llm-katan/llm_katan/model.py b/e2e-tests/llm-katan/llm_katan/model.py index 2ccc4fa0..8f37c335 100644 --- a/e2e-tests/llm-katan/llm_katan/model.py +++ b/e2e-tests/llm-katan/llm_katan/model.py @@ -103,7 +103,9 @@ async def generate( raise RuntimeError("Model not loaded. 
Call load_model() first.") max_tokens = max_tokens or self.config.max_tokens - temperature = temperature if temperature is not None else self.config.temperature + temperature = ( + temperature if temperature is not None else self.config.temperature + ) # Convert messages to prompt prompt = self._messages_to_prompt(messages) @@ -168,7 +170,9 @@ async def generate( "choices": [ { "index": 0, - "delta": {"content": word + " " if i < len(words) - 1 else word}, + "delta": { + "content": word + " " if i < len(words) - 1 else word + }, "logprobs": None, "finish_reason": None, } @@ -184,7 +188,9 @@ async def generate( "created": response_data["created"], "model": self.config.served_model_name, "system_fingerprint": "llm-katan-transformers", - "choices": [{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}], + "choices": [ + {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"} + ], "usage": { "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, @@ -281,7 +287,9 @@ async def generate( from vllm.sampling_params import SamplingParams max_tokens = max_tokens or self.config.max_tokens - temperature = temperature if temperature is not None else self.config.temperature + temperature = ( + temperature if temperature is not None else self.config.temperature + ) # Convert messages to prompt prompt = self._messages_to_prompt(messages) @@ -293,7 +301,9 @@ async def generate( # Generate loop = asyncio.get_event_loop() - outputs = await loop.run_in_executor(None, self.engine.generate, [prompt], sampling_params) + outputs = await loop.run_in_executor( + None, self.engine.generate, [prompt], sampling_params + ) output = outputs[0] generated_text = output.outputs[0].text.strip() @@ -316,7 +326,8 @@ async def generate( "usage": { "prompt_tokens": len(output.prompt_token_ids), "completion_tokens": len(output.outputs[0].token_ids), - "total_tokens": len(output.prompt_token_ids) + len(output.outputs[0].token_ids), + "total_tokens": len(output.prompt_token_ids) + + len(output.outputs[0].token_ids), "prompt_tokens_details": {"cached_tokens": 0}, "completion_tokens_details": {"reasoning_tokens": 0}, }, @@ -338,7 +349,9 @@ async def generate( "choices": [ { "index": 0, - "delta": {"content": word + " " if i < len(words) - 1 else word}, + "delta": { + "content": word + " " if i < len(words) - 1 else word + }, "logprobs": None, "finish_reason": None, } @@ -354,11 +367,14 @@ async def generate( "created": response_data["created"], "model": self.config.served_model_name, "system_fingerprint": "llm-katan-vllm", - "choices": [{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}], + "choices": [ + {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"} + ], "usage": { "prompt_tokens": len(output.prompt_token_ids), "completion_tokens": len(output.outputs[0].token_ids), - "total_tokens": len(output.prompt_token_ids) + len(output.outputs[0].token_ids), + "total_tokens": len(output.prompt_token_ids) + + len(output.outputs[0].token_ids), "prompt_tokens_details": {"cached_tokens": 0}, "completion_tokens_details": {"reasoning_tokens": 0}, }, diff --git a/e2e-tests/llm-katan/llm_katan/server.py b/e2e-tests/llm-katan/llm_katan/server.py index a2c8d625..375ae589 100644 --- a/e2e-tests/llm-katan/llm_katan/server.py +++ b/e2e-tests/llm-katan/llm_katan/server.py @@ -160,7 +160,9 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request try: # Convert messages to dict format - messages = [{"role": msg.role, "content": msg.content} for msg 
in request.messages] + messages = [ + {"role": msg.role, "content": msg.content} for msg in request.messages + ] # Update metrics metrics["total_requests"] += 1 @@ -201,7 +203,9 @@ async def generate_stream(): response_time = time.time() - start_time metrics["response_times"].append(response_time) if "choices" in response and response["choices"]: - generated_text = response["choices"][0].get("message", {}).get("content", "") + generated_text = ( + response["choices"][0].get("message", {}).get("content", "") + ) token_count = len(generated_text.split()) # Rough token estimate metrics["total_tokens_generated"] += token_count
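
Verification sketch for the non-streaming fields added in PATCH 1/6. This is a minimal probe, not part of the patch itself: the base URL, port, and served model name are placeholders, and it assumes the FastAPI layer returns the backend payload (including the `token_usage` alias) without stripping extra keys.

```python
# Sanity-check the OpenAI-compatibility fields added in PATCH 1/6.
# Assumes a local llm-katan instance is already running on
# http://localhost:8000 with served model name "qwen3"; both the port and
# the model name are placeholders, not values taken from the patch.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "qwen3",
        "messages": [{"role": "user", "content": "Say hello"}],
        "stream": False,
    },
    timeout=60,
)
body = resp.json()

# Fields the patch adds for OpenAI SDK / OpenWebUI compatibility.
assert "system_fingerprint" in body
assert body["choices"][0].get("logprobs", "missing") is None
assert "prompt_tokens_details" in body["usage"]
assert "completion_tokens_details" in body["usage"]
# token_usage is the alias the patch adds next to usage (assumes the server
# forwards it unchanged).
assert body.get("token_usage") == body["usage"]
print("non-streaming response exposes the expected compatibility fields")
```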
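Streaming sketch for the Content-Type fix in PATCH 1/6. The patch only shows the media_type change from text/plain to text/event-stream, not how chunks are serialized, so the `data: <json>` framing and the `[DONE]` sentinel below are assumptions; the URL and model name are placeholders as above.

```python
# Exercise the streaming path after the text/event-stream fix in PATCH 1/6.
# The SSE framing ("data: <json>" lines ending with "data: [DONE]") is an
# assumption; adjust if the server serializes chunks differently.
import json
import requests

with requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "qwen3",
        "messages": [{"role": "user", "content": "Stream a short reply"}],
        "stream": True,
    },
    stream=True,
    timeout=60,
) as resp:
    # The fix: streaming responses now advertise SSE instead of text/plain,
    # which is what OpenWebUI and the OpenAI SDKs expect.
    assert resp.headers["Content-Type"].startswith("text/event-stream")

    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        chunk = json.loads(payload)
        # Each chunk carries the same compatibility fields as the final
        # response (system_fingerprint, logprobs, per-chunk finish_reason).
        delta = chunk["choices"][0]["delta"].get("content", "")
        print(delta, end="", flush=True)
```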