@@ -41,7 +41,9 @@ def __init__(
4141 self .id_to_event : Dict [int , asyncio .Event ] = {}
4242 self .session = None
4343 self .first_time_costs = MovingAverage ()
44- self .create_session_costs = MovingAverage ()
44+ self .prefill_create_session_costs = MovingAverage ()
45+ self .decode_create_session_costs = MovingAverage ()
46+ self .per_token_costs = MovingAverage ()
4547 return
4648
4749 async def register_pd (self , pd_info_json ):
@@ -181,7 +183,7 @@ async def fetch_stream(
181183 req = await self ._to_req_info (prompt , sampling_params , multimodal_params )
182184 create_start_time = time .time ()
183185 async with self .session .post (p_node .to_llm_url (), json = req ) as response :
184- self .create_session_costs .add ((time .time () - create_start_time ) * 1000 )
186+ self .prefill_create_session_costs .add ((time .time () - create_start_time ) * 1000 )
185187 if response .status == 200 :
186188 async for line in response .content :
187189 line = line .decode ("utf-8" ).strip ()
@@ -217,7 +219,9 @@ async def fetch_stream(
217219 sampling_params .suggested_dp_index = event .upkv_status .dp_index
218220
219221 req = await self ._to_req_info (prompt_ids , sampling_params , multimodal_params )
222+ create_start_time = time .time ()
220223 async with self .session .post (d_node .to_llm_url (), json = req ) as response :
224+ self .decode_create_session_costs .add ((time .time () - create_start_time ) * 1000 )
221225 if response .status == 200 :
222226 async for line in response .content :
223227 line = line .decode ("utf-8" ).strip ()
@@ -269,6 +273,7 @@ async def _wait_to_token_package(
269273
270274 total_cost_time_ms = (time .time () - start_time ) * 1000
271275 mean_per_token_cost_time_ms = (total_cost_time_ms - first_token_cost_ms ) / out_token_counter
276+ self .per_token_costs .add (mean_per_token_cost_time_ms )
272277 x_request_id = request .headers .get ("X-Request-Id" , "" )
273278 x_session_id = request .headers .get ("X-Session-Id" , "" )
274279 prompt_cache_len = metadata .pop ("prompt_cache_len" , 0 )
@@ -312,5 +317,7 @@ async def handle_loop(self):
312317 # This could be turned into a scheduled (periodic) task.
313318 await asyncio .sleep (20 )
314319 logger .info (f"mean first cost: { self .first_time_costs .average ()} ms" )
315- logger .info (f"create_session_costs: { self .create_session_costs .average ()} ms" )
320+ logger .info (f"prefill mean create_session_costs: { self .prefill_create_session_costs .average ()} ms" )
321+ logger .info (f"decode mean create_session_costs: { self .decode_create_session_costs .average ()} ms" )
322+ logger .info (f"mean per token cost: { self .per_token_costs .average ()} ms" )
316323 return
0 commit comments