@@ -103,9 +103,7 @@ async def generate(
             raise RuntimeError("Model not loaded. Call load_model() first.")
 
         max_tokens = max_tokens or self.config.max_tokens
-        temperature = (
-            temperature if temperature is not None else self.config.temperature
-        )
+        temperature = temperature if temperature is not None else self.config.temperature
 
         # Convert messages to prompt
         prompt = self._messages_to_prompt(messages)
@@ -136,20 +134,27 @@ async def generate(
136134 "object" : "chat.completion" ,
137135 "created" : int (time .time ()),
138136 "model" : self .config .served_model_name ,
137+ "system_fingerprint" : "llm-katan-transformers" ,
139138 "choices" : [
140139 {
141140 "index" : 0 ,
142141 "message" : {"role" : "assistant" , "content" : generated_text },
142+ "logprobs" : None ,
143143 "finish_reason" : "stop" ,
144144 }
145145 ],
146146 "usage" : {
147147 "prompt_tokens" : prompt_tokens ,
148148 "completion_tokens" : completion_tokens ,
149149 "total_tokens" : total_tokens ,
150+ "prompt_tokens_details" : {"cached_tokens" : 0 },
151+ "completion_tokens_details" : {"reasoning_tokens" : 0 },
150152 },
151153 }
152154
155+ # Add token_usage as alias for better SDK compatibility
156+ response_data ["token_usage" ] = response_data ["usage" ]
157+
153158 if stream :
154159 # For streaming, yield chunks
155160 words = generated_text .split ()
@@ -159,12 +164,12 @@ async def generate(
159164 "object" : "chat.completion.chunk" ,
160165 "created" : response_data ["created" ],
161166 "model" : self .config .served_model_name ,
167+ "system_fingerprint" : "llm-katan-transformers" ,
162168 "choices" : [
163169 {
164170 "index" : 0 ,
165- "delta" : {
166- "content" : word + " " if i < len (words ) - 1 else word
167- },
171+ "delta" : {"content" : word + " " if i < len (words ) - 1 else word },
172+ "logprobs" : None ,
168173 "finish_reason" : None ,
169174 }
170175 ],
@@ -178,7 +183,15 @@ async def generate(
178183 "object" : "chat.completion.chunk" ,
179184 "created" : response_data ["created" ],
180185 "model" : self .config .served_model_name ,
181- "choices" : [{"index" : 0 , "delta" : {}, "finish_reason" : "stop" }],
186+ "system_fingerprint" : "llm-katan-transformers" ,
187+ "choices" : [{"index" : 0 , "delta" : {}, "logprobs" : None , "finish_reason" : "stop" }],
188+ "usage" : {
189+ "prompt_tokens" : prompt_tokens ,
190+ "completion_tokens" : completion_tokens ,
191+ "total_tokens" : total_tokens ,
192+ "prompt_tokens_details" : {"cached_tokens" : 0 },
193+ "completion_tokens_details" : {"reasoning_tokens" : 0 },
194+ },
182195 }
183196 yield final_chunk
184197 else :
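
The hunks above give the Transformers backend's non-streaming response the OpenAI-style `system_fingerprint`, `logprobs`, and token-detail fields, plus a `token_usage` alias of `usage`. A rough consumer sketch, assuming the non-streaming branch (outside this diff) yields `response_data` once; the `show_usage` helper is hypothetical:

```python
async def show_usage(backend, messages):
    """Sketch: read the usage block and the token_usage alias added above."""
    async for response in backend.generate(messages, stream=False):
        usage = response["usage"]
        # token_usage mirrors usage; added in this commit for SDK compatibility.
        assert response["token_usage"] == usage
        print(usage["prompt_tokens"], usage["completion_tokens"], usage["total_tokens"])

# Usage (hypothetical): asyncio.run(show_usage(backend, [{"role": "user", "content": "hi"}]))
```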
@@ -268,9 +281,7 @@ async def generate(
         from vllm.sampling_params import SamplingParams
 
         max_tokens = max_tokens or self.config.max_tokens
-        temperature = (
-            temperature if temperature is not None else self.config.temperature
-        )
+        temperature = temperature if temperature is not None else self.config.temperature
 
         # Convert messages to prompt
         prompt = self._messages_to_prompt(messages)
@@ -282,9 +293,7 @@ async def generate(
 
         # Generate
         loop = asyncio.get_event_loop()
-        outputs = await loop.run_in_executor(
-            None, self.engine.generate, [prompt], sampling_params
-        )
+        outputs = await loop.run_in_executor(None, self.engine.generate, [prompt], sampling_params)
 
         output = outputs[0]
         generated_text = output.outputs[0].text.strip()
@@ -295,21 +304,27 @@ async def generate(
295304 "object" : "chat.completion" ,
296305 "created" : int (time .time ()),
297306 "model" : self .config .served_model_name ,
307+ "system_fingerprint" : "llm-katan-vllm" ,
298308 "choices" : [
299309 {
300310 "index" : 0 ,
301311 "message" : {"role" : "assistant" , "content" : generated_text },
312+ "logprobs" : None ,
302313 "finish_reason" : "stop" ,
303314 }
304315 ],
305316 "usage" : {
306317 "prompt_tokens" : len (output .prompt_token_ids ),
307318 "completion_tokens" : len (output .outputs [0 ].token_ids ),
308- "total_tokens" : len (output .prompt_token_ids )
309- + len (output .outputs [0 ].token_ids ),
319+ "total_tokens" : len (output .prompt_token_ids ) + len (output .outputs [0 ].token_ids ),
320+ "prompt_tokens_details" : {"cached_tokens" : 0 },
321+ "completion_tokens_details" : {"reasoning_tokens" : 0 },
310322 },
311323 }
312324
325+ # Add token_usage as alias for better SDK compatibility
326+ response_data ["token_usage" ] = response_data ["usage" ]
327+
313328 if stream :
314329 # For streaming, yield chunks (simplified for now)
315330 words = generated_text .split ()
@@ -319,12 +334,12 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-vllm",
                     "choices": [
                         {
                             "index": 0,
-                            "delta": {
-                                "content": word + " " if i < len(words) - 1 else word
-                            },
+                            "delta": {"content": word + " " if i < len(words) - 1 else word},
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -338,7 +353,15 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-vllm",
+                "choices": [{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}],
+                "usage": {
+                    "prompt_tokens": len(output.prompt_token_ids),
+                    "completion_tokens": len(output.outputs[0].token_ids),
+                    "total_tokens": len(output.prompt_token_ids) + len(output.outputs[0].token_ids),
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
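
The vLLM streaming path now mirrors the Transformers one: each chunk carries `system_fingerprint` and a per-choice `logprobs` field, and the terminating `finish_reason: "stop"` chunk also carries the full `usage` block. A small client-side sketch, relying only on the chunk shape visible in this diff (the `collect_stream` helper is hypothetical):

```python
async def collect_stream(backend, messages):
    """Sketch: reassemble streamed deltas and capture the final usage block."""
    parts, usage = [], None
    async for chunk in backend.generate(messages, stream=True):
        choice = chunk["choices"][0]
        content = choice.get("delta", {}).get("content")
        if content:
            parts.append(content)
        if choice.get("finish_reason") == "stop":
            # The final chunk added in this commit carries the usage totals.
            usage = chunk.get("usage")
    return "".join(parts), usage
```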