@@ -103,9 +103,7 @@ async def generate(
             raise RuntimeError("Model not loaded. Call load_model() first.")

         max_tokens = max_tokens or self.config.max_tokens
-        temperature = (
-            temperature if temperature is not None else self.config.temperature
-        )
+        temperature = temperature if temperature is not None else self.config.temperature

         # Convert messages to prompt
         prompt = self._messages_to_prompt(messages)
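Note the asymmetry kept by this reformat: `max_tokens` can use a plain `or` fallback, but `temperature` needs the explicit `is not None` check so that a caller-supplied `0.0` (greedy decoding) is not silently replaced by the config default. A standalone sketch of the difference (placeholder values, not llm-katan code):

```python
config_temperature = 0.7  # stand-in for self.config.temperature
requested = 0.0           # caller explicitly asks for greedy decoding

# Falsy fallback: 0.0 is falsy, so the explicit request is silently overridden.
wrong = requested or config_temperature                             # -> 0.7

# None check (what the diff keeps): only a missing value falls back to the default.
right = requested if requested is not None else config_temperature  # -> 0.0
```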
@@ -136,20 +134,27 @@ async def generate(
             "object": "chat.completion",
             "created": int(time.time()),
             "model": self.config.served_model_name,
+            "system_fingerprint": "llm-katan-transformers",
             "choices": [
                 {
                     "index": 0,
                     "message": {"role": "assistant", "content": generated_text},
+                    "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
             "usage": {
                 "prompt_tokens": prompt_tokens,
                 "completion_tokens": completion_tokens,
                 "total_tokens": total_tokens,
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
             },
         }

+        # Add token_usage as alias for better SDK compatibility
+        response_data["token_usage"] = response_data["usage"]
+
         if stream:
             # For streaming, yield chunks
             words = generated_text.split()
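The `token_usage` key is assigned by reference, so it is the same dict as `usage` rather than a copy and the two can never drift apart. A minimal, self-contained sketch of the resulting shape (token counts are illustrative placeholders):

```python
# Stand-in for the "usage" block built above (illustrative token counts).
response_data = {
    "usage": {
        "prompt_tokens": 12,
        "completion_tokens": 3,
        "total_tokens": 15,
        "prompt_tokens_details": {"cached_tokens": 0},
        "completion_tokens_details": {"reasoning_tokens": 0},
    },
}
response_data["token_usage"] = response_data["usage"]

usage = response_data["usage"]
assert response_data["token_usage"] is usage  # alias references the same dict, not a copy
assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]
```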
@@ -159,12 +164,12 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-transformers",
                     "choices": [
                         {
                             "index": 0,
-                            "delta": {
-                                "content": word + " " if i < len(words) - 1 else word
-                            },
+                            "delta": {"content": word + " " if i < len(words) - 1 else word},
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -178,7 +183,15 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-transformers",
+                "choices": [{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
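Only the terminating chunk carries `usage`, so a consumer of the stream can concatenate the deltas and pick the totals off the last chunk it sees. A minimal sketch of that pattern (assumes `backend.generate(messages, stream=True)` is the async generator patched in this diff and that intermediate chunks omit `usage`):

```python
import asyncio  # needed only if you drive the coroutine yourself

async def collect_stream(backend, messages):
    """Concatenate streamed deltas; usage appears only on the final chunk."""
    text_parts, usage = [], None
    async for chunk in backend.generate(messages, stream=True):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            text_parts.append(delta["content"])
        usage = chunk.get("usage", usage)  # stays None until the final chunk
    return "".join(text_parts), usage

# Example (assuming an initialized backend):
#   text, usage = asyncio.run(collect_stream(backend, [{"role": "user", "content": "hi"}]))
```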
@@ -268,9 +281,7 @@ async def generate(
         from vllm.sampling_params import SamplingParams

         max_tokens = max_tokens or self.config.max_tokens
-        temperature = (
-            temperature if temperature is not None else self.config.temperature
-        )
+        temperature = temperature if temperature is not None else self.config.temperature

         # Convert messages to prompt
         prompt = self._messages_to_prompt(messages)
@@ -282,9 +293,7 @@ async def generate(

         # Generate
         loop = asyncio.get_event_loop()
-        outputs = await loop.run_in_executor(
-            None, self.engine.generate, [prompt], sampling_params
-        )
+        outputs = await loop.run_in_executor(None, self.engine.generate, [prompt], sampling_params)

         output = outputs[0]
         generated_text = output.outputs[0].text.strip()
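The reformatted call keeps the same behaviour: the synchronous, potentially slow `engine.generate` is pushed onto the default thread-pool executor so it does not block the event loop. The pattern in isolation, with a stand-in blocking function rather than llm-katan code:

```python
import asyncio
import time

def blocking_generate(prompt: str) -> str:
    """Stand-in for a synchronous, potentially slow generate call."""
    time.sleep(1)
    return f"echo: {prompt}"

async def main() -> None:
    loop = asyncio.get_event_loop()
    # None selects the default ThreadPoolExecutor; positional args follow the callable.
    result = await loop.run_in_executor(None, blocking_generate, "hello")
    print(result)

asyncio.run(main())
```

On Python 3.9+, `await asyncio.to_thread(blocking_generate, "hello")` is an equivalent, shorter spelling of the same offload.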
@@ -295,21 +304,27 @@ async def generate(
             "object": "chat.completion",
             "created": int(time.time()),
             "model": self.config.served_model_name,
+            "system_fingerprint": "llm-katan-vllm",
             "choices": [
                 {
                     "index": 0,
                     "message": {"role": "assistant", "content": generated_text},
+                    "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
             "usage": {
                 "prompt_tokens": len(output.prompt_token_ids),
                 "completion_tokens": len(output.outputs[0].token_ids),
-                "total_tokens": len(output.prompt_token_ids)
-                + len(output.outputs[0].token_ids),
+                "total_tokens": len(output.prompt_token_ids) + len(output.outputs[0].token_ids),
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
             },
         }

+        # Add token_usage as alias for better SDK compatibility
+        response_data["token_usage"] = response_data["usage"]
+
         if stream:
             # For streaming, yield chunks (simplified for now)
             words = generated_text.split()
@@ -319,12 +334,12 @@ async def generate(
                     "object": "chat.completion.chunk",
                     "created": response_data["created"],
                     "model": self.config.served_model_name,
+                    "system_fingerprint": "llm-katan-vllm",
                     "choices": [
                         {
                             "index": 0,
-                            "delta": {
-                                "content": word + " " if i < len(words) - 1 else word
-                            },
+                            "delta": {"content": word + " " if i < len(words) - 1 else word},
+                            "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
@@ -338,7 +353,15 @@ async def generate(
                 "object": "chat.completion.chunk",
                 "created": response_data["created"],
                 "model": self.config.served_model_name,
-                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                "system_fingerprint": "llm-katan-vllm",
+                "choices": [{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop"}],
+                "usage": {
+                    "prompt_tokens": len(output.prompt_token_ids),
+                    "completion_tokens": len(output.outputs[0].token_ids),
+                    "total_tokens": len(output.prompt_token_ids) + len(output.outputs[0].token_ids),
+                    "prompt_tokens_details": {"cached_tokens": 0},
+                    "completion_tokens_details": {"reasoning_tokens": 0},
+                },
             }
             yield final_chunk
         else:
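A quick way to see the new fields end to end is to hit the running server's OpenAI-compatible chat-completions endpoint and inspect the body. A sketch using `requests`; the host, port, and model name are assumptions for illustration, and it presumes the server returns the backend payload unmodified:

```python
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # assumed host/port; match your llm-katan invocation
    json={
        "model": "my-served-model",               # placeholder for your configured served_model_name
        "messages": [{"role": "user", "content": "Say hi"}],
    },
    timeout=60,
)
body = resp.json()

print(body["system_fingerprint"])                 # "llm-katan-transformers" or "llm-katan-vllm"
usage = body["usage"]
assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]
assert body["token_usage"] == usage               # alias added in this change
```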