@@ -222,13 +222,16 @@ async def _infer_embedding_async(self, template: Template, inputs: Dict[str, Any
 
     async def _infer_full_async(self, template: Template, inputs: Dict[str, Any], generation_config: Dict[str, Any],
                                 request_config: RequestConfig) -> ChatCompletionResponse:
-        output = await self.engine.async_generate(**inputs, sampling_params=generation_config)
+        engine_inputs = {k: v for k, v in inputs.items() if k != 'template_inputs'}
+        output = await self.engine.async_generate(**engine_inputs, sampling_params=generation_config)
         output['prompt_token_ids'] = inputs['input_ids']
         return self._create_chat_completion_response(output, inputs, template, request_config.return_details)
 
     async def _infer_stream_async(self, template: Template, inputs: Dict[str, Any], generation_config: Dict[str, Any],
                                   **kwargs) -> AsyncIterator[ChatCompletionStreamResponse]:
-        result_generator = await self.engine.async_generate(**inputs, sampling_params=generation_config, stream=True)
+        engine_inputs = {k: v for k, v in inputs.items() if k != 'template_inputs'}
+        result_generator = await self.engine.async_generate(
+            **engine_inputs, sampling_params=generation_config, stream=True)
         infer_streamer = InferStreamer(template)
         async for output in result_generator:
             res = self._create_chat_completion_stream_response(output, template, infer_streamer)
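
For context, here is a minimal, self-contained sketch of the failure mode this diff addresses: `inputs` carries a bookkeeping key (`template_inputs`) that the engine's generate call does not declare as a parameter, so unpacking the whole dict with `**inputs` raises a `TypeError`; filtering the dict first avoids that. The `fake_async_generate` coroutine below is a hypothetical stand-in for the engine API, not its real signature.

```python
# Sketch only: `fake_async_generate` is a hypothetical stand-in for the
# engine's async_generate; it accepts only the keyword arguments it declares.
import asyncio
from typing import Any, Dict, List


async def fake_async_generate(input_ids: List[int], sampling_params: Dict[str, Any]) -> Dict[str, Any]:
    # A strict-signature engine entry point: any unexpected kwarg is a TypeError.
    return {'prompt_token_ids': input_ids, 'params': sampling_params}


async def main() -> None:
    # `template_inputs` is extra bookkeeping state kept alongside the model inputs.
    inputs = {'input_ids': [1, 2, 3], 'template_inputs': object()}

    # `await fake_async_generate(**inputs, sampling_params={})` would raise:
    #   TypeError: fake_async_generate() got an unexpected keyword argument 'template_inputs'
    # Dropping the extra key before unpacking, as in the diff, avoids this:
    engine_inputs = {k: v for k, v in inputs.items() if k != 'template_inputs'}
    output = await fake_async_generate(**engine_inputs, sampling_params={'max_new_tokens': 8})
    print(output)


asyncio.run(main())
```

The same filter is applied in both the full and streaming paths because both forward `inputs` via `**` unpacking.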