
Commit 88eec24

fix: enhance llm-katan OpenAI API compatibility for issue #241 (#354)
* fix: enhance llm-katan OpenAI API compatibility for issue #241

  - Add missing OpenAI API response fields (system_fingerprint, logprobs, detailed usage)
  - Fix streaming response Content-Type from text/plain to text/event-stream
  - Ensure both static and streaming responses include all compatibility fields
  - Add token_usage alias for better SDK compatibility
  - Apply fixes to both TransformersBackend and VLLMBackend

  Resolves OpenWebUI hanging issue when connecting to llm-katan endpoints.

  Signed-off-by: Yossi Ovadia <[email protected]>

* bump llm-katan version to 0.1.9 for PyPI release

  Published llm-katan v0.1.9 to PyPI with OpenAI API compatibility fixes.

  Signed-off-by: Yossi Ovadia <[email protected]>

* chore: trigger CI re-run to check pre-commit status

  Trigger CI re-run to verify if Black formatting issues are resolved.

  Signed-off-by: Yossi Ovadia <[email protected]>

* trigger pre-commit formatting fix

  Signed-off-by: Yossi Ovadia <[email protected]>

* fix: apply black formatting to llm-katan Python files

  Signed-off-by: Yossi Ovadia <[email protected]>

* fix: apply black formatting to llm-katan Python files for CI compliance

  Signed-off-by: Yossi Ovadia <[email protected]>

---------

Signed-off-by: Yossi Ovadia <[email protected]>
1 parent 02fd9d8 commit 88eec24
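
The response-field changes described above can be sanity-checked from the client side. The sketch below is a minimal, hypothetical check against a locally running llm-katan instance; the base URL, port, and served model name are assumptions and not part of this commit, and it assumes the standard /v1/chat/completions route.

# Hedged sketch: verify the fields this commit adds to non-streaming responses.
# The base URL and model name below are assumptions, not taken from the commit.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # hypothetical llm-katan address
    json={
        "model": "my-served-model",  # hypothetical served model name
        "messages": [{"role": "user", "content": "hello"}],
    },
    timeout=30,
)
data = resp.json()

assert "system_fingerprint" in data
assert data["choices"][0]["logprobs"] is None
assert "prompt_tokens_details" in data["usage"]
assert "completion_tokens_details" in data["usage"]
assert data["token_usage"] == data["usage"]  # alias added for SDK compatibility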

File tree: 3 files changed (+49, -5 lines)


e2e-tests/llm-katan/llm_katan/model.py

Lines changed: 41 additions & 2 deletions
@@ -136,20 +136,27 @@ async def generate(
             "object": "chat.completion",
             "created": int(time.time()),
             "model": self.config.served_model_name,
+            "system_fingerprint": "llm-katan-transformers",
             "choices": [
                 {
                     "index": 0,
                     "message": {"role": "assistant", "content": generated_text},
+                    "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
             "usage": {
                 "prompt_tokens": prompt_tokens,
                 "completion_tokens": completion_tokens,
                 "total_tokens": total_tokens,
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
             },
         }
 
+        # Add token_usage as alias for better SDK compatibility
+        response_data["token_usage"] = response_data["usage"]
+
         if stream:
             # For streaming, yield chunks
             words = generated_text.split()
@@ -159,12 +166,14 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-transformers",
                     "choices": [
                         {
                             "index": 0,
                             "delta": {
                                 "content": word + " " if i < len(words) - 1 else word
                             },
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -178,7 +187,17 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-transformers",
+                "choices": [
+                    {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}
+                ],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
@@ -295,10 +314,12 @@ async def generate(
             "object": "chat.completion",
             "created": int(time.time()),
             "model": self.config.served_model_name,
+            "system_fingerprint": "llm-katan-vllm",
             "choices": [
                 {
                     "index": 0,
                     "message": {"role": "assistant", "content": generated_text},
+                    "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
@@ -307,9 +328,14 @@ async def generate(
                 "completion_tokens": len(output.outputs[0].token_ids),
                 "total_tokens": len(output.prompt_token_ids)
                 + len(output.outputs[0].token_ids),
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
             },
         }
 
+        # Add token_usage as alias for better SDK compatibility
+        response_data["token_usage"] = response_data["usage"]
+
         if stream:
             # For streaming, yield chunks (simplified for now)
             words = generated_text.split()
@@ -319,12 +345,14 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-vllm",
                     "choices": [
                         {
                             "index": 0,
                             "delta": {
                                 "content": word + " " if i < len(words) - 1 else word
                             },
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -338,7 +366,18 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-vllm",
+                "choices": [
+                    {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}
+                ],
+                "usage": {
+                    "prompt_tokens": len(output.prompt_token_ids),
+                    "completion_tokens": len(output.outputs[0].token_ids),
+                    "total_tokens": len(output.prompt_token_ids)
+                    + len(output.outputs[0].token_ids),
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
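
Since both backends now publish the usage block under "usage" and under the "token_usage" alias, a client can read token counts from whichever key it expects. A small hedged helper, illustrative only and not part of this commit:

# Illustrative helper, not part of this commit: read token counts from either
# the standard "usage" key or the "token_usage" alias added above.
def read_usage(data: dict) -> tuple[int, int]:
    usage = data.get("usage") or data.get("token_usage") or {}
    total = usage.get("total_tokens", 0)
    cached = usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
    return total, cached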

e2e-tests/llm-katan/llm_katan/server.py

Lines changed: 7 additions & 2 deletions
@@ -181,8 +181,13 @@ async def generate_stream():
 
         return StreamingResponse(
             generate_stream(),
-            media_type="text/plain",
-            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "Access-Control-Allow-Origin": "*",
+                "Access-Control-Allow-Headers": "Content-Type",
+            },
         )
     else:
         # Non-streaming response
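
The media_type change above is the part the commit message credits with fixing the OpenWebUI hang: SSE clients expect a text/event-stream Content-Type before they start processing data lines. A minimal, hypothetical consumer is sketched below; the URL and model name are assumptions, and it assumes the server emits standard OpenAI-style "data:" lines terminated by "[DONE]".

# Hedged sketch of consuming the stream; the URL, model name, and the
# "data: ... [DONE]" framing are assumptions about the server's output.
import json
import requests

with requests.post(
    "http://localhost:8000/v1/chat/completions",  # hypothetical address
    json={
        "model": "my-served-model",  # hypothetical served model name
        "messages": [{"role": "user", "content": "hello"}],
        "stream": True,
    },
    stream=True,
    timeout=30,
) as resp:
    assert resp.headers["Content-Type"].startswith("text/event-stream")
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        chunk = json.loads(payload)
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)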

e2e-tests/llm-katan/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "llm-katan"
-version = "0.1.8"
+version = "0.1.9"
 description = "LLM Katan - Lightweight LLM Server for Testing - Real tiny models with FastAPI and HuggingFace"
 readme = "README.md"
 authors = [
