@@ -158,7 +158,7 @@ def create_app(
 def prepare_request_resources(
     body: CreateCompletionRequest | CreateChatCompletionRequest,
     llama_proxy: LlamaProxy,
-    body_model: str,
+    body_model: str | None,
     kwargs,
 ) -> llama_cpp.Llama:
     if llama_proxy is None:
@@ -192,18 +192,15 @@ async def get_event_publisher(
     request: Request,
     inner_send_chan: MemoryObjectSendStream[typing.Any],
     body: CreateCompletionRequest | CreateChatCompletionRequest,
-    body_model: str,
+    body_model: str | None,
     llama_call,
     kwargs,
 ):
     server_settings = next(get_server_settings())
     interrupt_requests = (
         server_settings.interrupt_requests if server_settings else False
     )
-    async with contextlib.AsyncExitStack() as exit_stack:
-        llama_proxy: LlamaProxy = await exit_stack.enter_async_context(
-            contextlib.asynccontextmanager(get_llama_proxy)()
-        )
+    async with contextlib.asynccontextmanager(get_llama_proxy)() as llama_proxy:
         llama = prepare_request_resources(body, llama_proxy, body_model, kwargs)
         async with inner_send_chan:
             try:
@@ -345,10 +342,7 @@ async def create_completion(
         )
 
     # handle regular request
-    async with contextlib.AsyncExitStack() as exit_stack:
-        llama_proxy: LlamaProxy = await exit_stack.enter_async_context(
-            contextlib.asynccontextmanager(get_llama_proxy)()
-        )
+    async with contextlib.asynccontextmanager(get_llama_proxy)() as llama_proxy:
         llama = prepare_request_resources(body, llama_proxy, body_model, kwargs)
 
         if await request.is_disconnected():
@@ -517,10 +511,7 @@ async def create_chat_completion(
         )
 
     # handle regular request
-    async with contextlib.AsyncExitStack() as exit_stack:
-        llama_proxy: LlamaProxy = await exit_stack.enter_async_context(
-            contextlib.asynccontextmanager(get_llama_proxy)()
-        )
+    async with contextlib.asynccontextmanager(get_llama_proxy)() as llama_proxy:
         llama = prepare_request_resources(body, llama_proxy, body_model, kwargs)
 
         if await request.is_disconnected():
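
The event publisher and both route handlers previously entered a single context manager through `contextlib.AsyncExitStack`; since only one resource is acquired, the stack adds nothing and the wrapped `get_llama_proxy` dependency can be entered directly. The `body_model` annotation is also widened to `str | None`, presumably because a request may omit the model field. A minimal, self-contained sketch of the before/after pattern (the `get_llama_proxy` stand-in below is hypothetical, not the project's actual dependency):

```python
# Minimal sketch, not the project's code: why the AsyncExitStack indirection
# can be dropped when only a single async resource is entered.
import asyncio
import contextlib


async def get_llama_proxy():
    """Hypothetical stand-in for the real dependency: an async generator that
    yields a proxy object and releases it once the caller is finished."""
    proxy = "llama-proxy"  # placeholder for a LlamaProxy instance
    try:
        yield proxy
    finally:
        pass  # locks / model handles would be released here


async def before() -> None:
    # Old pattern: one context manager entered via an AsyncExitStack.
    async with contextlib.AsyncExitStack() as exit_stack:
        llama_proxy = await exit_stack.enter_async_context(
            contextlib.asynccontextmanager(get_llama_proxy)()
        )
        print("before:", llama_proxy)


async def after() -> None:
    # New pattern: enter the wrapped generator directly; cleanup in the
    # generator's finally block still runs when the with-block exits.
    async with contextlib.asynccontextmanager(get_llama_proxy)() as llama_proxy:
        print("after:", llama_proxy)


asyncio.run(before())
asyncio.run(after())
```

Both forms acquire and release the resource the same way; the new one simply removes a level of indirection and the now-unneeded local `LlamaProxy` annotation.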