Commit a6b308c

fix: enhance llm-katan OpenAI API compatibility for issue #241
- Add missing OpenAI API response fields (system_fingerprint, logprobs, detailed usage)
- Fix streaming response Content-Type from text/plain to text/event-stream
- Ensure both static and streaming responses include all compatibility fields
- Add token_usage alias for better SDK compatibility
- Apply fixes to both TransformersBackend and VLLMBackend

Resolves OpenWebUI hanging issue when connecting to llm-katan endpoints.

Signed-off-by: Yossi Ovadia <[email protected]>
1 parent 0270850 commit a6b308c
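
For a quick check of the compatibility fields listed above, the sketch below queries a running llm-katan instance with the requests library. The base URL, the /v1/chat/completions route, and the model name are assumptions for illustration, not values taken from this commit.

# Minimal verification sketch for the non-streaming compatibility fields.
# Assumed (not part of the commit): base URL, route, and served model name.
import requests

BASE_URL = "http://localhost:8000"   # assumed local llm-katan endpoint
MODEL = "Qwen/Qwen3-0.6B"            # assumed served model name

resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={"model": MODEL, "messages": [{"role": "user", "content": "Hello"}]},
    timeout=60,
)
data = resp.json()

# Fields this commit adds to the static (non-streaming) response:
checks = {
    "system_fingerprint": "system_fingerprint" in data,
    "logprobs field on choice": "logprobs" in data["choices"][0],
    "prompt_tokens_details": "prompt_tokens_details" in data.get("usage", {}),
    "completion_tokens_details": "completion_tokens_details" in data.get("usage", {}),
    "token_usage alias": data.get("token_usage") == data.get("usage"),
}
for name, ok in checks.items():
    print(f"{name}: {'present' if ok else 'missing'}")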

2 files changed: 51 additions & 27 deletions


e2e-tests/llm-katan/llm_katan/model.py

Lines changed: 42 additions & 19 deletions
@@ -103,9 +103,7 @@ async def generate(
             raise RuntimeError("Model not loaded. Call load_model() first.")
 
         max_tokens = max_tokens or self.config.max_tokens
-        temperature = (
-            temperature if temperature is not None else self.config.temperature
-        )
+        temperature = temperature if temperature is not None else self.config.temperature
 
         # Convert messages to prompt
         prompt = self._messages_to_prompt(messages)
@@ -136,20 +134,27 @@ async def generate(
             "object": "chat.completion",
             "created": int(time.time()),
             "model": self.config.served_model_name,
+            "system_fingerprint": "llm-katan-transformers",
             "choices": [
                 {
                     "index": 0,
                     "message": {"role": "assistant", "content": generated_text},
+                    "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
             "usage": {
                 "prompt_tokens": prompt_tokens,
                 "completion_tokens": completion_tokens,
                 "total_tokens": total_tokens,
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
             },
         }
 
+        # Add token_usage as alias for better SDK compatibility
+        response_data["token_usage"] = response_data["usage"]
+
         if stream:
             # For streaming, yield chunks
             words = generated_text.split()
@@ -159,12 +164,12 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-transformers",
                     "choices": [
                         {
                             "index": 0,
-                            "delta": {
-                                "content": word + " " if i < len(words) - 1 else word
-                            },
+                            "delta": {"content": word + " " if i < len(words) - 1 else word},
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -178,7 +183,15 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-transformers",
+                "choices": [{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
@@ -268,9 +281,7 @@ async def generate(
         from vllm.sampling_params import SamplingParams
 
         max_tokens = max_tokens or self.config.max_tokens
-        temperature = (
-            temperature if temperature is not None else self.config.temperature
-        )
+        temperature = temperature if temperature is not None else self.config.temperature
 
         # Convert messages to prompt
        prompt = self._messages_to_prompt(messages)
@@ -282,9 +293,7 @@ async def generate(
 
         # Generate
         loop = asyncio.get_event_loop()
-        outputs = await loop.run_in_executor(
-            None, self.engine.generate, [prompt], sampling_params
-        )
+        outputs = await loop.run_in_executor(None, self.engine.generate, [prompt], sampling_params)
 
         output = outputs[0]
         generated_text = output.outputs[0].text.strip()
@@ -295,21 +304,27 @@ async def generate(
             "object": "chat.completion",
             "created": int(time.time()),
             "model": self.config.served_model_name,
+            "system_fingerprint": "llm-katan-vllm",
             "choices": [
                 {
                     "index": 0,
                     "message": {"role": "assistant", "content": generated_text},
+                    "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
             "usage": {
                 "prompt_tokens": len(output.prompt_token_ids),
                 "completion_tokens": len(output.outputs[0].token_ids),
-                "total_tokens": len(output.prompt_token_ids)
-                + len(output.outputs[0].token_ids),
+                "total_tokens": len(output.prompt_token_ids) + len(output.outputs[0].token_ids),
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
             },
         }
 
+        # Add token_usage as alias for better SDK compatibility
+        response_data["token_usage"] = response_data["usage"]
+
         if stream:
             # For streaming, yield chunks (simplified for now)
             words = generated_text.split()
@@ -319,12 +334,12 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-vllm",
                     "choices": [
                         {
                             "index": 0,
-                            "delta": {
-                                "content": word + " " if i < len(words) - 1 else word
-                            },
+                            "delta": {"content": word + " " if i < len(words) - 1 else word},
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -338,7 +353,15 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-vllm",
+                "choices": [{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}],
+                "usage": {
+                    "prompt_tokens": len(output.prompt_token_ids),
+                    "completion_tokens": len(output.outputs[0].token_ids),
+                    "total_tokens": len(output.prompt_token_ids) + len(output.outputs[0].token_ids),
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
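
As context for how these backend changes surface to a typed client, here is a minimal sketch using the official openai Python package. The base URL, the placeholder API key (assuming the local server does not validate it), and the model name are assumptions, not values taken from this diff.

# Sketch: reading the fields added above through the openai client.
# Assumed: llm-katan reachable at http://localhost:8000/v1 and a served
# model named "Qwen/Qwen3-0.6B".
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

completion = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B",
    messages=[{"role": "user", "content": "Say hi"}],
)

print(completion.system_fingerprint)   # e.g. "llm-katan-transformers" or "llm-katan-vllm"
print(completion.choices[0].logprobs)  # None, but the field is now present
print(completion.usage.total_tokens)   # prompt_tokens + completion_tokens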

e2e-tests/llm-katan/llm_katan/server.py

Lines changed: 9 additions & 8 deletions
@@ -160,9 +160,7 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
 
     try:
         # Convert messages to dict format
-        messages = [
-            {"role": msg.role, "content": msg.content} for msg in request.messages
-        ]
+        messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]
 
         # Update metrics
         metrics["total_requests"] += 1
@@ -181,8 +179,13 @@ async def generate_stream():
 
             return StreamingResponse(
                 generate_stream(),
-                media_type="text/plain",
-                headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+                media_type="text/event-stream",
+                headers={
+                    "Cache-Control": "no-cache",
+                    "Connection": "keep-alive",
+                    "Access-Control-Allow-Origin": "*",
+                    "Access-Control-Allow-Headers": "Content-Type",
+                },
             )
         else:
             # Non-streaming response
@@ -198,9 +201,7 @@ async def generate_stream():
         response_time = time.time() - start_time
         metrics["response_times"].append(response_time)
         if "choices" in response and response["choices"]:
-            generated_text = (
-                response["choices"][0].get("message", {}).get("content", "")
-            )
+            generated_text = response["choices"][0].get("message", {}).get("content", "")
             token_count = len(generated_text.split())  # Rough token estimate
             metrics["total_tokens_generated"] += token_count
 
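With the media type corrected to text/event-stream, an SSE-aware client can consume the stream directly. The sketch below assumes the same local endpoint and model name as above, plus the conventional "data: <json>" framing with an optional "[DONE]" sentinel; the framing itself is not shown in this diff.

# Sketch: consuming the streaming endpoint as server-sent events.
# Assumed: local endpoint, model name, and "data: <json>" framing.
import json
import requests

with requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "Qwen/Qwen3-0.6B",
        "messages": [{"role": "user", "content": "Stream a short reply"}],
        "stream": True,
    },
    stream=True,
    timeout=60,
) as resp:
    # The fix changes this header from text/plain to text/event-stream.
    print(resp.headers.get("Content-Type"))
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload.strip() == "[DONE]":
            break
        chunk = json.loads(payload)
        choice = chunk["choices"][0]
        print(choice["delta"].get("content", ""), end="", flush=True)
        if choice.get("finish_reason") == "stop":
            break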