@@ -546,9 +546,7 @@ async def route_request(
546
546
msg = FullLookupMsg (event_id = "" , tokens = token_ids )
547
547
ret_msg = await self .kv_manager .handle_orchestration_message (msg )
548
548
matched_infos = ret_msg .matched_info
549
- # print(f">>>>>>>>>>>>>>>>>>>>>>> matched_infos={matched_infos}")
550
549
best_matched_info = self ._find_best_matched (matched_infos )
551
- # print(f">>>>>>>>>>>>>>>>>>>>>>> best_matched_info={best_matched_info}")
552
550
self .uncached_prefix_tokens = len (token_ids ) - best_matched_info [1 ][- 1 ][1 ]
553
551
best_ttft_url = await self ._find_best_ttft (endpoints , matched_infos ,
554
552
best_matched_info , request_stats )
@@ -586,7 +584,7 @@ async def _find_best_ttft(self, endpoints, matched_infos, best_matched_info,
586
584
best_ttft = float ('inf' )
587
585
best_ttft_url = None
588
586
for i , matched_info in enumerate (matched_infos ):
589
- print (f"-------------- URL:{ matched_urls [i ]} --------------" )
587
+ logger . debug (f"-------------- URL:{ matched_urls [i ]} --------------" )
590
588
ttft = self ._estimate_ttft (matched_info , best_matched_info ,
591
589
matched_stats [i ])
592
590
if best_ttft_url is None or ttft <= best_ttft :
@@ -601,7 +599,7 @@ async def _find_best_ttft(self, endpoints, matched_infos, best_matched_info,
601
599
stats = request_stats .get (url , None )
602
600
if stats is None :
603
601
raise ValueError (f"{ url } provides no request stats " )
604
- print (f"-------------- URL:{ url } --------------" )
602
+ logger . debug (f"-------------- URL:{ url } --------------" )
605
603
ttft = self ._estimate_ttft (None , best_matched_info , stats )
606
604
if best_ttft_url is None or ttft <= best_ttft :
607
605
best_ttft = ttft
@@ -621,12 +619,12 @@ def _estimate_ttft(self, matched_info, best_matched_info, stats):
621
619
stats .engine_prefill_tps )
622
620
ttft = forecasted_queue_time + transfer_time
623
621
624
- print (f"-------------- time estimations --------------" )
625
- print (f"uncomputed_prefix_tokens: { stats .uncomputed_prefix_tokens } " )
626
- print (f"engine_prefill_tps: { stats .engine_prefill_tps } " )
627
- print (f"transfer_time: { transfer_time } " )
628
- print (f"forecasted_queue_time: { forecasted_queue_time } " )
629
- print (f"ttft: { ttft } " )
622
+ logger . debug (f"-------------- time estimations --------------" )
623
+ logger . debug (f"uncomputed_prefix_tokens: { stats .uncomputed_prefix_tokens } " )
624
+ logger . debug (f"engine_prefill_tps: { stats .engine_prefill_tps } " )
625
+ logger . debug (f"transfer_time: { transfer_time } " )
626
+ logger . debug (f"forecasted_queue_time: { forecasted_queue_time } " )
627
+ logger . debug (f"ttft: { ttft } " )
630
628
return ttft
631
629
632
630
async def _get_instance_url (self , endpoints , instance_id ):
@@ -649,11 +647,8 @@ async def _get_instance_url(self, endpoints, instance_id):
649
647
return url
650
648
651
649
def _calc_transfer_time (self , matched_info , best_matched_info ):
652
- #print(f"matched_info[1][-1][1]: {matched_info[1][-1][1]}")
653
650
transfer_time = 0
654
651
for chunk in best_matched_info [1 ]:
655
- #print(f"chunk[0]: {chunk[0]}")
656
- #print(f"chunk[1]: {chunk[1]}")
657
652
if matched_info is not None and chunk [1 ] <= matched_info [1 ][- 1 ][1 ]:
658
653
continue
659
654
# TODO better estimations
@@ -663,7 +658,6 @@ def _calc_transfer_time(self, matched_info, best_matched_info):
663
658
transfer_time += 0.015
664
659
else :
665
660
transfer_time += 0.01
666
- #print(f"transfer_time: {transfer_time}")
667
661
return transfer_time
668
662
669
663
def _fallback_routing (self , endpoints , request_stats , request ):
0 commit comments