22import asyncio
33import threading
44from concurrent .futures import Future
5+ from sys import stderr
56from typing import Any , Callable , Generator , TypeVar
67
78import httpx
@@ -86,6 +87,7 @@ def generate_text(
8687 messages : ModelInput ,
8788 parameters : dict [str , Any ],
8889 ) -> tuple [LazyMessage , PdlLazy [Any ]]:
90+ print ("Asynchronous model call started" , file = stderr )
8991 # global _BACKGROUND_TASKS
9092 future = asyncio .run_coroutine_threadsafe (
9193 LitellmModel .async_generate_text (
@@ -108,6 +110,25 @@ def update_end_nanos(future):
108110 if block .pdl__timing is not None :
109111 block .pdl__timing .end_nanos = time .time_ns ()
110112
113+ # report call completion and its duration (and if available queueing time)
114+ res = future .result ()[1 ]
115+ start = (
116+ block .pdl__timing .start_nanos
117+ if block .pdl__timing .start_nanos is not None
118+ else 0
119+ )
120+ exec_nanos = block .pdl__timing .end_nanos - start
121+ queue_nanos = 0
122+ if "created" in res :
123+ queue_nanos = (
124+ res ["created" ] * 1000000000 - block .pdl__timing .start_nanos
125+ )
126+ exec_nanos = exec_nanos - queue_nanos
127+ print (
128+ f"Asynchronous model call to { block .model } completed in { (exec_nanos )/ 1000000 } ms queued for { queue_nanos / 1000000 } ms" ,
129+ file = stderr ,
130+ )
131+
111132 future .add_done_callback (update_end_nanos )
112133
113134 return message , response
0 commit comments