@@ -123,7 +123,7 @@ async def completions(self, request: Dict[Any, Any]):
         # Prepare inference inputs with proper parameter mapping
         inference_inputs = {
             "prompts": request.get("prompts", []),
-            "max_length": request.get("max_tokens", 256),
+            "max_output_len": request.get("max_tokens", 256),
             "temperature": request.get("temperature", 1.0),
             "top_k": request.get("top_k", 0),
             "top_p": request.get("top_p", 0.0),
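
For reference, a minimal sketch of the mapping this hunk establishes (the sample request values are hypothetical; the keys and defaults mirror the handler above): an OpenAI-style `max_tokens` field now populates the `max_output_len` key instead of `max_length`.

    request = {"prompts": ["Hello"], "max_tokens": 128, "temperature": 0.7}

    inference_inputs = {
        "prompts": request.get("prompts", []),
        "max_output_len": request.get("max_tokens", 256),  # renamed from "max_length"
        "temperature": request.get("temperature", 1.0),
        "top_k": request.get("top_k", 0),
        "top_p": request.get("top_p", 0.0),
    }
    assert inference_inputs["max_output_len"] == 128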
@@ -197,7 +197,7 @@ async def chat_completions(self, request: Dict[Any, Any]):
 
         inference_inputs = {
             "prompts": [messages],  # Wrap messages in a list so apply_chat_template gets the full conversation
-            "max_length": request.get("max_tokens", 256),
+            "max_output_len": request.get("max_tokens", 256),
             "temperature": request.get("temperature", 1.0),
             "top_k": request.get("top_k", 0),
             "top_p": request.get("top_p", 0.0),
@@ -248,7 +248,7 @@ async def chat_completions(self, request: Dict[Any, Any]):
             ),
             "finish_reason": (
                 "length"
-                if generated_texts and len(str(generated_texts[0])) >= inference_inputs["max_length"]
+                if generated_texts and len(str(generated_texts[0])) >= inference_inputs["max_output_len"]
                 else "stop"
             ),
         }
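
The renamed key also drives the `finish_reason` heuristic. A standalone sketch of that logic (the `finish_reason` helper name is hypothetical; as in the diff, it compares the character length of the first decoded text against the token budget):

    def finish_reason(generated_texts, inference_inputs):
        # "length" once the decoded text reaches the budget, else "stop";
        # the comparison is on string length, as written in the diff above.
        return (
            "length"
            if generated_texts
            and len(str(generated_texts[0])) >= inference_inputs["max_output_len"]
            else "stop"
        )

    print(finish_reason(["a" * 300], {"max_output_len": 256}))  # length
    print(finish_reason(["short"], {"max_output_len": 256}))    # stop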