Commit c63d09e

Add number of input tokens metric (#559)

* calculate number of input tokens
* Calculate input tokens for triton
* skip empty response
* Remove unused imports

1 parent 1b1d24b · commit c63d09e

3 files changed: +100 −6 lines changed
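For orientation, here is a minimal standalone sketch of the counting logic this commit adds: the request prompt is run through the tokenizer, and the length of the resulting input_ids becomes the per-request num_input_token value. The tokenizer name and the direct use of Hugging Face transformers are illustrative; genai-perf resolves its tokenizer through its own genai_perf.tokenizer module.

from transformers import AutoTokenizer

# Illustrative tokenizer; genai-perf picks the tokenizer from the --tokenizer CLI option.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

prompt = "Explain the Triton Inference Server in one sentence."
input_ids = tokenizer(prompt)["input_ids"]  # token ids for the prompt text
num_input_token = len(input_ids)            # the value recorded for this request
print(num_input_token)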

src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
52 additions & 1 deletion

@@ -31,11 +31,15 @@
 from itertools import pairwise

 import numpy as np
+from genai_perf.llm_inputs.llm_inputs import OutputFormat
 from genai_perf.tokenizer import AutoTokenizer
 from genai_perf.utils import load_json, remove_sse_prefix
 from rich.console import Console
 from rich.table import Table

+_OPENAI_CHAT_COMPLETIONS = OutputFormat.OPENAI_CHAT_COMPLETIONS
+_OPENAI_COMPLETIONS = OutputFormat.OPENAI_COMPLETIONS
+

 class Metrics:
     """A base class for all the metrics class that contains common metrics."""

@@ -48,6 +52,7 @@ class Metrics:
         "output_token_throughput_per_request",
         "request_throughput",
         "num_output_token",
+        "num_input_token",
     ]

     time_fields = [

@@ -108,13 +113,15 @@ def __init__(
         output_token_throughputs: list[float] = [],
         output_token_throughputs_per_request: list[int] = [],
         num_output_tokens: list[int] = [],
+        num_input_tokens: list[int] = [],
     ) -> None:
         super().__init__(request_throughputs, request_latencies)
         self.time_to_first_tokens = time_to_first_tokens
         self.inter_token_latencies = inter_token_latencies
         self.output_token_throughputs = output_token_throughputs
         self.output_token_throughputs_per_request = output_token_throughputs_per_request
         self.num_output_tokens = num_output_tokens
+        self.num_input_tokens = num_input_tokens

         # add base name mapping
         self._base_names["time_to_first_tokens"] = "time_to_first_token"

@@ -124,6 +131,7 @@ def __init__(
             "output_token_throughputs_per_request"
         ] = "output_token_throughput_per_request"
         self._base_names["num_output_tokens"] = "num_output_token"
+        self._base_names["num_input_tokens"] = "num_input_token"


 class Statistics:

@@ -424,10 +432,15 @@ class LLMProfileDataParser(ProfileDataParser):
     """

     def __init__(
-        self, filename: str, service_kind: str, tokenizer: AutoTokenizer
+        self,
+        filename: str,
+        service_kind: str,
+        output_format: OutputFormat,
+        tokenizer: AutoTokenizer,
     ) -> None:
         self._tokenizer = tokenizer
         self._service_kind = service_kind
+        self._output_format = output_format
         super().__init__(filename)

     def _parse_requests(self, requests: dict) -> LLMMetrics:

@@ -437,14 +450,21 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
         time_to_first_tokens = []
         inter_token_latencies = []
         output_token_throughputs_per_request = []
+        num_input_tokens = []
         num_generated_tokens = []
         for request in requests:
             req_timestamp = request["timestamp"]
+            req_inputs = request["request_inputs"]
             res_timestamps = request["response_timestamps"]
             res_outputs = request["response_outputs"]

             self._preprocess_response(res_timestamps, res_outputs)

+            # Skip requests with empty response. This happens sometimes when the
+            # model returns a single response with empty string.
+            if not res_timestamps:
+                continue
+
             # track entire benchmark duration
             min_req_timestamp = min(min_req_timestamp, req_timestamp)
             max_res_timestamp = max(max_res_timestamp, res_timestamps[-1])

@@ -457,6 +477,10 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             # time to first token
             time_to_first_tokens.append(res_timestamps[0] - req_timestamp)

+            # number of input tokens
+            input_tokens = self._tokenize_request_inputs(req_inputs)
+            num_input_tokens.append(len(input_tokens))
+
             # output token throughput per request
             output_tokens = self._tokenize_response_outputs(res_outputs)
             num_output_tokens = list(map(len, output_tokens))

@@ -490,6 +514,7 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             output_token_throughputs,
             output_token_throughputs_per_request,
             num_generated_tokens,
+            num_input_tokens,
         )

     def _preprocess_response(

@@ -513,6 +538,32 @@ def _preprocess_response(
             res_timestamps.pop()
             res_outputs.pop()

+    def _tokenize_request_inputs(self, req_inputs: dict) -> list[list[int]]:
+        """Deserialize the request input and return tokenized inputs."""
+        if self._service_kind == "triton":
+            return self._tokenize_triton_request_input(req_inputs)
+        elif self._service_kind == "openai":
+            return self._tokenize_openai_request_input(req_inputs)
+        else:
+            raise ValueError(f"Unknown service kind: '{self._service_kind}'.")
+
+    def _tokenize_triton_request_input(self, req_inputs: dict) -> list[list[int]]:
+        """Tokenize the Triton request input texts."""
+        return self._tokenizer(req_inputs["text_input"])["input_ids"]
+
+    def _tokenize_openai_request_input(self, req_inputs: dict) -> list[list[int]]:
+        """Tokenize the OpenAI request input texts."""
+        payload = json.loads(req_inputs["payload"])
+        if self._output_format == _OPENAI_CHAT_COMPLETIONS:
+            input_text = payload["messages"][0]["content"]
+        elif self._output_format == _OPENAI_COMPLETIONS:
+            input_text = payload["prompt"][0]
+        else:
+            raise ValueError(
+                "Failed to parse OpenAI request input in profile export file."
+            )
+        return self._tokenizer(input_text)["input_ids"]
+
     def _tokenize_response_outputs(self, res_outputs: dict) -> list[list[int]]:
         """Deserialize the response output and return tokenized outputs."""
         if self._service_kind == "triton":
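For context, the new _tokenize_request_inputs path reads different fields depending on the backend. The records below are illustrative sketches built only from the keys the diff accesses (text_input for Triton, and a serialized JSON payload holding either messages or prompt for OpenAI); actual profile export entries carry additional fields such as timestamp, response_timestamps, and response_outputs.

# Triton: the prompt text is stored directly under "text_input".
triton_request_inputs = {"text_input": "What is machine learning?"}

# OpenAI chat completions: the prompt is inside a serialized JSON payload.
openai_chat_request_inputs = {
    "payload": '{"messages": [{"role": "user", "content": "What is machine learning?"}]}'
}

# OpenAI completions: the prompt list is inside the same serialized payload.
openai_completions_request_inputs = {
    "payload": '{"prompt": ["What is machine learning?"]}'
}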

src/c++/perf_analyzer/genai-perf/genai_perf/main.py
8 additions & 5 deletions

@@ -71,9 +71,14 @@ def generate_inputs(args: ArgumentParser, tokenizer: AutoTokenizer) -> None:


 def calculate_metrics(
-    file: str, service_kind: str, tokenizer: AutoTokenizer
+    args: ArgumentParser, tokenizer: AutoTokenizer
 ) -> LLMProfileDataParser:
-    return LLMProfileDataParser(file, service_kind, tokenizer)
+    return LLMProfileDataParser(
+        filename=args.profile_export_file,
+        service_kind=args.service_kind,
+        output_format=args.output_format,
+        tokenizer=tokenizer,
+    )


 def report_output(metrics: LLMProfileDataParser, args):

@@ -99,9 +104,7 @@ def run():
         tokenizer = get_tokenizer(args.tokenizer)
         generate_inputs(args, tokenizer)
         args.func(args, extra_args)
-        metrics = calculate_metrics(
-            args.profile_export_file, args.service_kind, tokenizer
-        )
+        metrics = calculate_metrics(args, tokenizer)
         report_output(metrics, args)
     except Exception as e:
         raise GenAIPerfException(e)
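Taken together, a caller constructing the parser directly would now pass the output format alongside the service kind. The sketch below assumes get_tokenizer lives in genai_perf.tokenizer (main.py calls it, but its import is not shown in this diff); the file path and tokenizer name are illustrative.

from genai_perf.llm_inputs.llm_inputs import OutputFormat
from genai_perf.llm_metrics import LLMProfileDataParser
from genai_perf.tokenizer import get_tokenizer  # assumed location of the helper used in run()

parser = LLMProfileDataParser(
    filename="profile_export.json",                      # illustrative export path
    service_kind="openai",
    output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS,
    tokenizer=get_tokenizer("gpt2"),                     # illustrative tokenizer name
)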
