43 changes: 41 additions & 2 deletions e2e-tests/llm-katan/llm_katan/model.py
@@ -136,20 +136,27 @@ async def generate(
"object": "chat.completion",
"created": int(time.time()),
"model": self.config.served_model_name,
"system_fingerprint": "llm-katan-transformers",
"choices": [
{
"index": 0,
"message": {"role": "assistant", "content": generated_text},
"logprobs": None,
"finish_reason": "stop",
}
],
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": total_tokens,
"prompt_tokens_details": {"cached_tokens": 0},
"completion_tokens_details": {"reasoning_tokens": 0},
},
}

# Add token_usage as alias for better SDK compatibility
response_data["token_usage"] = response_data["usage"]

if stream:
# For streaming, yield chunks
words = generated_text.split()
@@ -159,12 +166,14 @@
"object": "chat.completion.chunk",
"created": response_data["created"],
"model": self.config.served_model_name,
"system_fingerprint": "llm-katan-transformers",
"choices": [
{
"index": 0,
"delta": {
"content": word + " " if i < len(words) - 1 else word
},
"logprobs": None,
"finish_reason": None,
}
],
@@ -178,7 +187,17 @@
"object": "chat.completion.chunk",
"created": response_data["created"],
"model": self.config.served_model_name,
"choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
"system_fingerprint": "llm-katan-transformers",
"choices": [
{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}
],
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": total_tokens,
"prompt_tokens_details": {"cached_tokens": 0},
"completion_tokens_details": {"reasoning_tokens": 0},
},
}
yield final_chunk
else:
@@ -295,10 +314,12 @@ async def generate(
"object": "chat.completion",
"created": int(time.time()),
"model": self.config.served_model_name,
"system_fingerprint": "llm-katan-vllm",
"choices": [
{
"index": 0,
"message": {"role": "assistant", "content": generated_text},
"logprobs": None,
"finish_reason": "stop",
}
],
@@ -307,9 +328,14 @@
"completion_tokens": len(output.outputs[0].token_ids),
"total_tokens": len(output.prompt_token_ids)
+ len(output.outputs[0].token_ids),
"prompt_tokens_details": {"cached_tokens": 0},
"completion_tokens_details": {"reasoning_tokens": 0},
},
}

# Add token_usage as alias for better SDK compatibility
response_data["token_usage"] = response_data["usage"]

if stream:
# For streaming, yield chunks (simplified for now)
words = generated_text.split()
@@ -319,12 +345,14 @@
"object": "chat.completion.chunk",
"created": response_data["created"],
"model": self.config.served_model_name,
"system_fingerprint": "llm-katan-vllm",
"choices": [
{
"index": 0,
"delta": {
"content": word + " " if i < len(words) - 1 else word
},
"logprobs": None,
"finish_reason": None,
}
],
@@ -338,7 +366,18 @@
"object": "chat.completion.chunk",
"created": response_data["created"],
"model": self.config.served_model_name,
"choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
"system_fingerprint": "llm-katan-vllm",
"choices": [
{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}
],
"usage": {
"prompt_tokens": len(output.prompt_token_ids),
"completion_tokens": len(output.outputs[0].token_ids),
"total_tokens": len(output.prompt_token_ids)
+ len(output.outputs[0].token_ids),
"prompt_tokens_details": {"cached_tokens": 0},
"completion_tokens_details": {"reasoning_tokens": 0},
},
}
yield final_chunk
else:
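Note: for reference, below is a minimal sketch of the enriched non-streaming response shape both backends now aim to emit. All values are placeholders and the served model name is assumed, not taken from this diff.

# Illustrative sketch only: approximate shape of the enriched response.
# Placeholder values; "qwen3" stands in for config.served_model_name.
example_response = {
    "id": "chatcmpl-example",
    "object": "chat.completion",
    "created": 1700000000,
    "model": "qwen3",
    "system_fingerprint": "llm-katan-transformers",
    "choices": [
        {
            "index": 0,
            "message": {"role": "assistant", "content": "Hello!"},
            "logprobs": None,
            "finish_reason": "stop",
        }
    ],
    "usage": {
        "prompt_tokens": 5,
        "completion_tokens": 2,
        "total_tokens": 7,
        "prompt_tokens_details": {"cached_tokens": 0},
        "completion_tokens_details": {"reasoning_tokens": 0},
    },
}
# The diff mirrors "usage" under "token_usage" for SDKs that read that key.
example_response["token_usage"] = example_response["usage"]
assert example_response["token_usage"]["total_tokens"] == 7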
9 changes: 7 additions & 2 deletions e2e-tests/llm-katan/llm_katan/server.py
@@ -181,8 +181,13 @@ async def generate_stream():

return StreamingResponse(
generate_stream(),
media_type="text/plain",
headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Access-Control-Allow-Origin": "*",
"Access-Control-Allow-Headers": "Content-Type",
},
)
else:
# Non-streaming response
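Note: with the media type switched to text/event-stream, a plain HTTP client can consume the stream directly. Below is a minimal client sketch; it assumes a local llm-katan instance on port 8000, an OpenAI-style /v1/chat/completions route, and standard "data: ..." / "data: [DONE]" SSE framing, none of which are shown in this diff.

# Minimal streaming client sketch. Assumptions: server at localhost:8000,
# OpenAI-style route, "data: ..." lines terminated by "data: [DONE]".
import json
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "qwen3",  # placeholder served model name
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": True,
    },
    stream=True,
)
for raw in resp.iter_lines():
    if not raw or not raw.startswith(b"data: "):
        continue
    payload = raw[len(b"data: "):]
    if payload.strip() == b"[DONE]":
        break
    chunk = json.loads(payload)
    delta = chunk["choices"][0].get("delta", {})
    print(delta.get("content", ""), end="", flush=True)
print()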
2 changes: 1 addition & 1 deletion e2e-tests/llm-katan/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "llm-katan"
version = "0.1.8"
version = "0.1.9"
description = "LLM Katan - Lightweight LLM Server for Testing - Real tiny models with FastAPI and HuggingFace"
readme = "README.md"
authors = [