2828
2929import json
3030from io import StringIO
31- from pathlib import Path
3231
3332import numpy as np
3433import pytest
3534from genai_perf .llm_metrics import LLMMetrics , LLMProfileDataParser
36- from genai_perf .utils import remove_file
3735from transformers import AutoTokenizer
3836
3937
@@ -122,12 +120,12 @@ def test_triton_llm_profile_data(self, mock_read_write) -> None:
122120 - experiment 1: [3 - 1, 4 - 2] = [2, 2]
123121 - experiment 2: [7 - 5, 6 - 3] = [2, 3]
124122 * inter token latencies
125- - experiment 1: [(5 - 3)/1, (8 - 5)/1, (7 - 4)/2, (11 - 7)/2]
126- : [2, 3, 3/2, 2]
127- : [2, 3, 2, 2]
128- - experiment 2: [(8 - 7)/1, (13 - 8)/1, (18 - 13)/1, (8 - 6)/1, (11 - 8)/2]
129- : [1, 5, 5, 2, 3/2]
130- : [1, 5, 5, 2, 2]
123+ - experiment 1: [[ (5 - 3)/1, (8 - 5)/1], [ (7 - 4)/2, (11 - 7)/2] ]
124+ : [[ 2, 3], [ 3/2, 2] ]
125+ : [[ 2, 3], [ 2, 2]] # rounded up
126+ - experiment 2: [[ (8 - 7)/1, (13 - 8)/1, (18 - 13)/1], [ (8 - 6)/1, (11 - 8)/2] ]
127+ : [[ 1, 5, 5], [ 2, 3/2] ]
128+ : [[ 1, 5, 5], [ 2, 2]] # rounded up
131129 * output token throughputs per request
132130 - experiment 1: [3/(8 - 1), 5/(11 - 2)] = [3/7, 5/9]
133131 - experiment 2: [4/(18 - 5), 5/(11 - 3)] = [4/13, 5/8]
@@ -145,8 +143,17 @@ def test_triton_llm_profile_data(self, mock_read_write) -> None:
145143 tokenizer = tokenizer ,
146144 )
147145
148- # experiment 1 statistics
146+ # experiment 1 metrics & statistics
149147 stat = pd .get_statistics (infer_mode = "concurrency" , load_level = "10" )
148+ metrics = stat .metrics
149+
150+ assert metrics .time_to_first_tokens == [2 , 2 ]
151+ assert metrics .inter_token_latencies == [[2 , 3 ], [2 , 2 ]]
152+ ottpr = [3 / ns_to_sec (7 ), 5 / ns_to_sec (9 )]
153+ assert metrics .output_token_throughputs_per_request == pytest .approx (ottpr )
154+ ott = [8 / ns_to_sec (10 )]
155+ assert metrics .output_token_throughputs == pytest .approx (ott )
156+ assert metrics .num_output_tokens == [3 , 5 ]
150157
151158 assert stat .avg_time_to_first_token == 2
152159 assert stat .avg_inter_token_latency == 2.25
@@ -186,6 +193,15 @@ def test_triton_llm_profile_data(self, mock_read_write) -> None:
186193
187194 # experiment 2 statistics
188195 stat = pd .get_statistics (infer_mode = "request_rate" , load_level = "2.0" )
196+ metrics = stat .metrics
197+
198+ assert metrics .time_to_first_tokens == [2 , 3 ]
199+ assert metrics .inter_token_latencies == [[1 , 5 , 5 ], [2 , 2 ]]
200+ ottpr = [4 / ns_to_sec (13 ), 5 / ns_to_sec (8 )]
201+ assert metrics .output_token_throughputs_per_request == pytest .approx (ottpr )
202+ ott = [3 / ns_to_sec (5 )]
203+ assert metrics .output_token_throughputs == pytest .approx (ott )
204+ assert metrics .num_output_tokens == [4 , 5 ]
189205
190206 assert stat .avg_time_to_first_token == 2.5
191207 assert stat .avg_inter_token_latency == 3
@@ -234,8 +250,8 @@ def test_openai_llm_profile_data(self, mock_read_write) -> None:
234250 * time to first tokens
235251 - experiment 1: [3 - 1, 4 - 2] = [2, 2]
236252 * inter token latencies
237- - experiment 1: [(5 - 3)/1, (8 - 5)/1, (12 - 8)/1, (7 - 4)/1, (11 - 7)/2, (15 - 11)/2]
238- : [2, 3, 4, 3, 2, 2]
253+ - experiment 1: [[ (5 - 3)/1, (8 - 5)/1, (12 - 8)/1], [ (7 - 4)/1, (11 - 7)/2, (15 - 11)/2] ]
254+ : [[ 2, 3, 4], [ 3, 2, 2] ]
239255 * output token throughputs per request
240256 - experiment 1: [3/(12 - 1), 5/(15 - 2)] = [3/11, 5/13]
241257 * output token throughputs
@@ -252,6 +268,15 @@ def test_openai_llm_profile_data(self, mock_read_write) -> None:
252268
253269 # experiment 1 statistics
254270 stat = pd .get_statistics (infer_mode = "concurrency" , load_level = "10" )
271+ metrics = stat .metrics
272+
273+ assert metrics .time_to_first_tokens == [2 , 2 ]
274+ assert metrics .inter_token_latencies == [[2 , 3 , 4 ], [3 , 2 , 2 ]]
275+ ottpr = [3 / ns_to_sec (11 ), 5 / ns_to_sec (13 )]
276+ assert metrics .output_token_throughputs_per_request == pytest .approx (ottpr )
277+ ott = [4 / ns_to_sec (7 )]
278+ assert metrics .output_token_throughputs == pytest .approx (ott )
279+ assert metrics .num_output_tokens == [3 , 5 ]
255280
256281 assert stat .avg_time_to_first_token == 2
257282 assert stat .avg_inter_token_latency == 8 / 3
0 commit comments