diff --git a/e2e-tests/llm-katan/llm_katan/model.py b/e2e-tests/llm-katan/llm_katan/model.py
index 27d42ebc..8f37c335 100644
--- a/e2e-tests/llm-katan/llm_katan/model.py
+++ b/e2e-tests/llm-katan/llm_katan/model.py
@@ -136,10 +136,12 @@ async def generate(
             "object": "chat.completion",
             "created": int(time.time()),
             "model": self.config.served_model_name,
+            "system_fingerprint": "llm-katan-transformers",
             "choices": [
                 {
                     "index": 0,
                     "message": {"role": "assistant", "content": generated_text},
+                    "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
@@ -147,9 +149,14 @@ async def generate(
                 "prompt_tokens": prompt_tokens,
                 "completion_tokens": completion_tokens,
                 "total_tokens": total_tokens,
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
             },
         }
 
+        # Add token_usage as alias for better SDK compatibility
+        response_data["token_usage"] = response_data["usage"]
+
         if stream:
             # For streaming, yield chunks
             words = generated_text.split()
@@ -159,12 +166,14 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-transformers",
                     "choices": [
                         {
                             "index": 0,
                             "delta": {
                                 "content": word + " " if i < len(words) - 1 else word
                             },
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -178,7 +187,17 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-transformers",
+                "choices": [
+                    {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}
+                ],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
@@ -295,10 +314,12 @@ async def generate(
             "object": "chat.completion",
             "created": int(time.time()),
             "model": self.config.served_model_name,
+            "system_fingerprint": "llm-katan-vllm",
             "choices": [
                 {
                     "index": 0,
                     "message": {"role": "assistant", "content": generated_text},
+                    "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
@@ -307,9 +328,14 @@ async def generate(
                 "completion_tokens": len(output.outputs[0].token_ids),
                 "total_tokens": len(output.prompt_token_ids)
                 + len(output.outputs[0].token_ids),
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
             },
         }
 
+        # Add token_usage as alias for better SDK compatibility
+        response_data["token_usage"] = response_data["usage"]
+
         if stream:
             # For streaming, yield chunks (simplified for now)
             words = generated_text.split()
@@ -319,12 +345,14 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-vllm",
                     "choices": [
                         {
                             "index": 0,
                             "delta": {
                                 "content": word + " " if i < len(words) - 1 else word
                             },
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -338,7 +366,18 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-vllm",
+                "choices": [
+                    {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}
+                ],
+                "usage": {
+                    "prompt_tokens": len(output.prompt_token_ids),
+                    "completion_tokens": len(output.outputs[0].token_ids),
+                    "total_tokens": len(output.prompt_token_ids)
+                    + len(output.outputs[0].token_ids),
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
diff --git a/e2e-tests/llm-katan/llm_katan/server.py b/e2e-tests/llm-katan/llm_katan/server.py
index f96b748a..375ae589 100644
--- a/e2e-tests/llm-katan/llm_katan/server.py
+++ b/e2e-tests/llm-katan/llm_katan/server.py
@@ -181,8 +181,13 @@ async def generate_stream():
 
         return StreamingResponse(
             generate_stream(),
-            media_type="text/plain",
-            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "Access-Control-Allow-Origin": "*",
+                "Access-Control-Allow-Headers": "Content-Type",
+            },
         )
     else:
         # Non-streaming response
diff --git a/e2e-tests/llm-katan/pyproject.toml b/e2e-tests/llm-katan/pyproject.toml
index a33a835f..f9708ec4 100644
--- a/e2e-tests/llm-katan/pyproject.toml
+++ b/e2e-tests/llm-katan/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "llm-katan"
-version = "0.1.8"
+version = "0.1.9"
 description = "LLM Katan - Lightweight LLM Server for Testing - Real tiny models with FastAPI and HuggingFace"
 readme = "README.md"
 authors = [