@@ -29,9 +29,9 @@
 import csv
 import json
 from enum import Enum, auto
-from itertools import pairwise
+from itertools import tee
 from pathlib import Path
-from typing import List
+from typing import Dict, List, Tuple, Union
 
 import numpy as np
 import pandas as pd
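
Note: this import swap is the heart of the patch. itertools.pairwise only exists on Python 3.10+, and the annotation styles being replaced below need recent interpreters too (PEP 585 builtin generics like list[int] require 3.9, PEP 604 unions like int | float require 3.10), so the diff falls back to typing-module generics plus a tee-based pairwise. A minimal sketch of the 3.8-safe spelling (the function name f is illustrative, not from the patch):

    # On Python 3.8, these raise at import/definition time:
    #   from itertools import pairwise         # ImportError (3.10+ only)
    #   def f(x: list[int | float]) -> None:   # TypeError (PEP 585/604 syntax)
    from typing import List, Union

    def f(x: List[Union[int, float]]) -> None:
        """Same annotation, spelled with typing generics that work on 3.8."""
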
@@ -115,7 +115,7 @@ def __init__(
         request_throughputs: List[float] = [],
         request_latencies: List[int] = [],
         time_to_first_tokens: List[int] = [],
-        inter_token_latencies: List[list[int]] = [[]],
+        inter_token_latencies: List[List[int]] = [[]],
         output_token_throughputs: List[float] = [],
         output_token_throughputs_per_request: List[int] = [],
         num_output_tokens: List[int] = [],
@@ -170,7 +170,7 @@ def __init__(self, metrics: Metrics):
             self._calculate_minmax(data, attr)
             self._calculate_std(data, attr)
 
-    def _preprocess_data(self, data: list, attr: str) -> list[int | float]:
+    def _preprocess_data(self, data: List, attr: str) -> List[Union[int, float]]:
         new_data = []
         if attr == "inter_token_latency":
             # flatten inter token latencies to 1D
@@ -180,11 +180,11 @@ def _preprocess_data(self, data: list, attr: str) -> list[int | float]:
             new_data = data
         return new_data
 
-    def _calculate_mean(self, data: list[int | float], attr: str) -> None:
+    def _calculate_mean(self, data: List[Union[int, float]], attr: str) -> None:
         avg = np.mean(data)
         setattr(self, "avg_" + attr, avg)
 
-    def _calculate_percentiles(self, data: list[int | float], attr: str) -> None:
+    def _calculate_percentiles(self, data: List[Union[int, float]], attr: str) -> None:
         p25, p50, p75 = np.percentile(data, [25, 50, 75])
         p90, p95, p99 = np.percentile(data, [90, 95, 99])
         setattr(self, "p25_" + attr, p25)
@@ -194,12 +194,12 @@ def _calculate_percentiles(self, data: list[int | float], attr: str) -> None:
         setattr(self, "p95_" + attr, p95)
         setattr(self, "p99_" + attr, p99)
 
-    def _calculate_minmax(self, data: list[int | float], attr: str) -> None:
+    def _calculate_minmax(self, data: List[Union[int, float]], attr: str) -> None:
         min, max = np.min(data), np.max(data)
         setattr(self, "min_" + attr, min)
         setattr(self, "max_" + attr, max)
 
-    def _calculate_std(self, data: list[int | float], attr: str) -> None:
+    def _calculate_std(self, data: List[Union[int, float]], attr: str) -> None:
         std = np.std(data)
         setattr(self, "std_" + attr, std)
 
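
For reference, _calculate_percentiles leans on np.percentile's default linear interpolation, and setattr then exposes the results as attributes such as p50_request_latency. A quick worked example with hypothetical latency values (not from this repo):

    import numpy as np

    latencies = [10, 20, 30, 40, 100]  # hypothetical request latencies
    p50, p99 = np.percentile(latencies, [50, 99])
    print(p50, p99)  # 30.0 97.6
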
@@ -460,7 +460,7 @@ def get_statistics(self, infer_mode: str, load_level: str) -> Statistics:
             raise KeyError(f"Profile with {infer_mode}={load_level} does not exist.")
         return self._profile_results[(infer_mode, load_level)]
 
-    def get_profile_load_info(self) -> list[tuple[str, str]]:
+    def get_profile_load_info(self) -> List[Tuple[str, str]]:
         """Return available (infer_mode, load_level) tuple keys."""
         return [k for k, _ in self._profile_results.items()]
 
@@ -547,7 +547,9 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
 
         # inter token latency
         itl_per_request = []
-        for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, num_output_tokens)):
+        for (t1, _), (t2, n2) in self._pairwise(
+            zip(res_timestamps, num_output_tokens)
+        ):
             # TMA-1676: handle empty first/last responses
             # if the latter response has zero token (e.g. empty string),
             # then set it default to one for the sake of inter token latency
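
The loop body that consumes these pairs is elided from the hunk, but per the comment it divides each timestamp delta by the latter response's token count, defaulting zero-token responses to one. A hedged sketch of that arithmetic, using the same tee-based pairwise the patch adds below:

    from itertools import tee

    def pairwise(it):
        a, b = tee(it)
        next(b, None)
        return zip(a, b)

    res_timestamps = [0, 5_000_000, 9_000_000]  # hypothetical ns timestamps
    num_output_tokens = [1, 2, 2]
    itl = []
    for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, num_output_tokens)):
        n2 = max(n2, 1)  # empty responses count as one token
        itl.append((t2 - t1) / n2)
    print(itl)  # [2500000.0, 2000000.0]
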
@@ -572,8 +574,14 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             num_input_tokens,
         )
 
+    def _pairwise(self, iterable):
+        """Generate pairs of consecutive elements from the given iterable."""
+        a, b = tee(iterable)
+        next(b, None)
+        return zip(a, b)
+
     def _preprocess_response(
-        self, res_timestamps: list[int], res_outputs: list[dict[str, str]]
+        self, res_timestamps: List[int], res_outputs: List[Dict[str, str]]
     ) -> None:
         """Helper function to preprocess responses of a request."""
         if self._service_kind == "openai":
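
The _pairwise helper added above is the classic tee recipe from the itertools documentation, and it matches itertools.pairwise on 3.10+: tee duplicates the iterator, next(b, None) advances the copy by one element, and zip truncates at the shorter stream. The mechanism, step by step:

    >>> from itertools import tee
    >>> a, b = tee([1, 2, 3, 4])
    >>> next(b, None)  # skip the first element of the copy
    1
    >>> list(zip(a, b))
    [(1, 2), (2, 3), (3, 4)]
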
@@ -604,7 +612,7 @@ def _preprocess_response(
                 res_timestamps.pop()
                 res_outputs.pop()
 
-    def _tokenize_request_inputs(self, req_inputs: dict) -> list[int]:
+    def _tokenize_request_inputs(self, req_inputs: dict) -> List[int]:
         """Deserialize the request input and return tokenized inputs."""
         if self._service_kind == "triton":
             return self._tokenize_triton_request_input(req_inputs)
@@ -613,12 +621,12 @@ def _tokenize_request_inputs(self, req_inputs: dict) -> list[int]:
         else:
             raise ValueError(f"Unknown service kind: '{self._service_kind}'.")
 
-    def _tokenize_triton_request_input(self, req_inputs: dict) -> list[int]:
+    def _tokenize_triton_request_input(self, req_inputs: dict) -> List[int]:
         """Tokenize the Triton request input texts."""
         encodings = self._tokenizer(req_inputs["text_input"])
         return encodings.data["input_ids"]
 
-    def _tokenize_openai_request_input(self, req_inputs: dict) -> list[int]:
+    def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]:
         """Tokenize the OpenAI request input texts."""
         payload = json.loads(req_inputs["payload"])
         if self._response_format == ResponseFormat.OPENAI_CHAT_COMPLETIONS:
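
Both request-input paths end by calling self._tokenizer and reading encodings.data["input_ids"]. A hedged sketch of what that assumes about the tokenizer object (that it is a Hugging Face tokenizer returning a BatchEncoding is my assumption, not stated in the hunk):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed HF tokenizer
    encodings = tokenizer("hello world")
    # BatchEncoding.data is a plain dict holding the token ids
    print(encodings.data["input_ids"])  # e.g. [31373, 995]
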
@@ -632,7 +640,7 @@ def _tokenize_openai_request_input(self, req_inputs: dict) -> list[int]:
         encodings = self._tokenizer(input_text)
         return encodings.data["input_ids"]
 
-    def _tokenize_response_outputs(self, res_outputs: dict) -> list[list[int]]:
+    def _tokenize_response_outputs(self, res_outputs: dict) -> List[List[int]]:
         """Deserialize the response output and return tokenized outputs."""
         if self._service_kind == "triton":
             return self._tokenize_triton_response_output(res_outputs)
@@ -641,22 +649,22 @@ def _tokenize_response_outputs(self, res_outputs: dict) -> list[list[int]]:
         else:
             raise ValueError(f"Unknown service kind: '{self._service_kind}'.")
 
-    def _tokenize_triton_response_output(self, res_outputs: dict) -> list[list[int]]:
+    def _tokenize_triton_response_output(self, res_outputs: dict) -> List[List[int]]:
         """Tokenize the Triton response output texts."""
         output_texts = []
         for output in res_outputs:
             output_texts.append(output["text_output"])
         return self._run_tokenizer(output_texts)
 
-    def _tokenize_openai_response_output(self, res_outputs: dict) -> list[list[int]]:
+    def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]]:
         """Tokenize the OpenAI response output texts."""
         output_texts = []
         for output in res_outputs:
             text = self._extract_openai_text_output(output["response"])
             output_texts.append(text)
         return self._run_tokenizer(output_texts)
 
-    def _run_tokenizer(self, output_texts: list[str]) -> list[list[int]]:
+    def _run_tokenizer(self, output_texts: List[str]) -> List[List[int]]:
         # exclamation mark trick forces the llama tokenization to consistently
         # start each output with a specific token which allows us to safely skip
         # the first token of every tokenized output and get only the ones that
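
The comment describes the trick but the method body is cut off in this view. A hedged sketch of how such a trick is commonly implemented (my reconstruction under the assumptions above, not necessarily the patch's exact code):

    def run_tokenizer(tokenizer, output_texts):
        # Prepend "!" so the tokenizer emits a fixed, known first token for
        # every output, then drop that first token from each result.
        encodings = tokenizer(["!" + text for text in output_texts])
        return [ids[1:] for ids in encodings.data["input_ids"]]
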