@@ -31,11 +31,15 @@
 from itertools import pairwise

 import numpy as np
+from genai_perf.llm_inputs.llm_inputs import OutputFormat
 from genai_perf.tokenizer import AutoTokenizer
 from genai_perf.utils import load_json, remove_sse_prefix
 from rich.console import Console
 from rich.table import Table

+_OPENAI_CHAT_COMPLETIONS = OutputFormat.OPENAI_CHAT_COMPLETIONS
+_OPENAI_COMPLETIONS = OutputFormat.OPENAI_COMPLETIONS
+

 class Metrics:
4145 """A base class for all the metrics class that contains common metrics."""
@@ -48,6 +52,7 @@ class Metrics:
         "output_token_throughput_per_request",
         "request_throughput",
         "num_output_token",
+        "num_input_token",
     ]

     time_fields = [
@@ -108,13 +113,15 @@ def __init__(
         output_token_throughputs: list[float] = [],
         output_token_throughputs_per_request: list[int] = [],
         num_output_tokens: list[int] = [],
+        num_input_tokens: list[int] = [],
     ) -> None:
         super().__init__(request_throughputs, request_latencies)
         self.time_to_first_tokens = time_to_first_tokens
         self.inter_token_latencies = inter_token_latencies
         self.output_token_throughputs = output_token_throughputs
         self.output_token_throughputs_per_request = output_token_throughputs_per_request
         self.num_output_tokens = num_output_tokens
+        self.num_input_tokens = num_input_tokens

         # add base name mapping
         self._base_names["time_to_first_tokens"] = "time_to_first_token"
@@ -124,6 +131,7 @@ def __init__(
         self._base_names[
             "output_token_throughputs_per_request"
         ] = "output_token_throughput_per_request"
         self._base_names["num_output_tokens"] = "num_output_token"
+        self._base_names["num_input_tokens"] = "num_input_token"


 class Statistics:
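
For orientation, here is a minimal sketch of constructing the extended LLMMetrics. Every value is invented for illustration, and the parameters not shown in the hunks above (request_throughputs through inter_token_latencies) are inferred from the assignments they feed.

# Illustrative only: all values are fabricated, not taken from a real run.
metrics = LLMMetrics(
    request_throughputs=[10.0],
    request_latencies=[100, 120],
    time_to_first_tokens=[20, 25],
    inter_token_latencies=[4, 5],
    output_token_throughputs=[300.0],
    output_token_throughputs_per_request=[150, 160],
    num_output_tokens=[15, 16],
    num_input_tokens=[8, 9],  # the new per-request prompt-token counts
)
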
@@ -424,10 +432,15 @@ class LLMProfileDataParser(ProfileDataParser):
     """

     def __init__(
-        self, filename: str, service_kind: str, tokenizer: AutoTokenizer
+        self,
+        filename: str,
+        service_kind: str,
+        output_format: OutputFormat,
+        tokenizer: AutoTokenizer,
     ) -> None:
         self._tokenizer = tokenizer
         self._service_kind = service_kind
+        self._output_format = output_format
         super().__init__(filename)

     def _parse_requests(self, requests: dict) -> LLMMetrics:
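
Call sites must now supply the output format used to generate the requests. A minimal usage sketch: the file name and model name are placeholders, and AutoTokenizer.from_pretrained is assumed to follow the HuggingFace API that the import at the top of this file suggests genai_perf.tokenizer re-exports.

# Sketch: "profile_export.json" and "gpt2" are placeholders.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
parser = LLMProfileDataParser(
    filename="profile_export.json",
    service_kind="openai",
    output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS,
    tokenizer=tokenizer,
)
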
@@ -437,14 +450,21 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
         time_to_first_tokens = []
         inter_token_latencies = []
         output_token_throughputs_per_request = []
+        num_input_tokens = []
         num_generated_tokens = []
         for request in requests:
             req_timestamp = request["timestamp"]
+            req_inputs = request["request_inputs"]
             res_timestamps = request["response_timestamps"]
             res_outputs = request["response_outputs"]

             self._preprocess_response(res_timestamps, res_outputs)

+            # Skip requests with an empty response. This sometimes happens
+            # when the model returns a single response with an empty string.
+            if not res_timestamps:
+                continue
+
             # track entire benchmark duration
             min_req_timestamp = min(min_req_timestamp, req_timestamp)
             max_res_timestamp = max(max_res_timestamp, res_timestamps[-1])
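
The loop reads four keys from each exported request record. A hypothetical record that would satisfy it for the "openai" service kind; the field contents and timestamp units are assumptions for illustration, not taken from this diff.

request = {
    "timestamp": 100,                        # when the request was sent
    "request_inputs": {                      # consumed by _tokenize_request_inputs
        "payload": '{"messages": [{"role": "user", "content": "Hi"}]}',
    },
    "response_timestamps": [120, 140, 160],  # one entry per streamed response
    "response_outputs": [...],               # raw responses; shape depends on service kind
}
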
@@ -457,6 +477,10 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             # time to first token
             time_to_first_tokens.append(res_timestamps[0] - req_timestamp)

+            # number of input tokens
+            input_tokens = self._tokenize_request_inputs(req_inputs)
+            num_input_tokens.append(len(input_tokens))
+
             # output token throughput per request
             output_tokens = self._tokenize_response_outputs(res_outputs)
             num_output_tokens = list(map(len, output_tokens))
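
The input token count is simply the length of the tokenizer's input_ids for the prompt. A quick sketch, assuming genai_perf's AutoTokenizer mirrors the HuggingFace callable API (as the calls in this diff suggest) and using a placeholder model:

# "gpt2" is a placeholder; token counts vary by tokenizer.
tok = AutoTokenizer.from_pretrained("gpt2")
input_ids = tok("What is the capital of France?")["input_ids"]
num_input_tokens = len(input_ids)  # 7 with GPT-2's BPE vocabulary
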
@@ -490,6 +514,7 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             output_token_throughputs,
             output_token_throughputs_per_request,
             num_generated_tokens,
+            num_input_tokens,
         )

     def _preprocess_response(
@@ -513,6 +538,32 @@ def _preprocess_response(
             res_timestamps.pop()
             res_outputs.pop()

+    def _tokenize_request_inputs(self, req_inputs: dict) -> list[int]:
+        """Deserialize the request input and return the tokenized input."""
+        if self._service_kind == "triton":
+            return self._tokenize_triton_request_input(req_inputs)
+        elif self._service_kind == "openai":
+            return self._tokenize_openai_request_input(req_inputs)
+        else:
+            raise ValueError(f"Unknown service kind: '{self._service_kind}'.")
+
+    def _tokenize_triton_request_input(self, req_inputs: dict) -> list[int]:
+        """Tokenize the Triton request input text."""
+        return self._tokenizer(req_inputs["text_input"])["input_ids"]
+
+    def _tokenize_openai_request_input(self, req_inputs: dict) -> list[int]:
+        """Tokenize the OpenAI request input text."""
+        payload = json.loads(req_inputs["payload"])
+        if self._output_format == _OPENAI_CHAT_COMPLETIONS:
+            input_text = payload["messages"][0]["content"]
+        elif self._output_format == _OPENAI_COMPLETIONS:
+            input_text = payload["prompt"][0]
+        else:
+            raise ValueError(
+                "Failed to parse OpenAI request input in profile export file."
+            )
+        return self._tokenizer(input_text)["input_ids"]
+
     def _tokenize_response_outputs(self, res_outputs: dict) -> list[list[int]]:
         """Deserialize the response output and return tokenized outputs."""
         if self._service_kind == "triton":
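
The two payload shapes the new OpenAI branch distinguishes, shown side by side; the prompt text is invented for illustration.

import json

# Chat completions: the prompt lives in messages[0]["content"].
chat = json.loads('{"messages": [{"role": "user", "content": "Hello"}]}')
assert chat["messages"][0]["content"] == "Hello"

# Legacy completions: the prompt is a list; only its first element is tokenized.
legacy = json.loads('{"prompt": ["Hello"]}')
assert legacy["prompt"][0] == "Hello"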