Commit 28e9746

Extend Statistics and Metrics for visualization (#556)
* store ITL per request and add data getter
* Remove unused import
* check data is empty after preprocessing
1 parent 6259cda commit 28e9746
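
A minimal sketch of the data-shape change this commit introduces (illustrative values, not code from the diff): inter-token latencies are now stored as one list per request, and Statistics flattens them before aggregating.

    # Hypothetical values; the real lists are produced by LLMProfileDataParser.
    inter_token_latencies = [[2, 3], [2, 2]]  # one inner list per request
    flattened = [itl for request in inter_token_latencies for itl in request]
    assert flattened == [2, 3, 2, 2]          # what the statistics are computed over
    assert sum(flattened) / len(flattened) == 2.25

The same 2.25 average appears as avg_inter_token_latency in the Triton test below.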

File tree (2 files changed: +75 additions, -18 deletions)

  src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
  src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py

src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py

Lines changed: 39 additions & 7 deletions

@@ -26,9 +26,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import contextlib
 import csv
-import io
 import json
 from itertools import pairwise
 

@@ -78,6 +76,13 @@ def __init__(
             "request_latencies": "request_latency",
         }
 
+    def __repr__(self):
+        attr_strs = []
+        for k, v in self.__dict__.items():
+            if not k.startswith("_"):
+                attr_strs.append(f"{k}={v}")
+        return f"Metrics({','.join(attr_strs)})"
+
     @property
     def data(self) -> dict:
         """Returns all the metrics."""

@@ -99,7 +104,7 @@ def __init__(
         request_throughputs: list[float] = [],
         request_latencies: list[int] = [],
         time_to_first_tokens: list[int] = [],
-        inter_token_latencies: list[int] = [],
+        inter_token_latencies: list[list[int]] = [[]],
         output_token_throughputs: list[float] = [],
         output_token_throughputs_per_request: list[int] = [],
         num_output_tokens: list[int] = [],

@@ -141,14 +146,26 @@ class Statistics:
 
     def __init__(self, metrics: Metrics):
         # iterate through Metrics to calculate statistics and set attributes
+        self._metrics = metrics
         for attr, data in metrics.data.items():
+            attr = metrics.get_base_name(attr)
+            data = self._preprocess_data(data, attr)
             if data:
-                attr = metrics.get_base_name(attr)
                 self._calculate_mean(data, attr)
                 self._calculate_percentiles(data, attr)
                 self._calculate_minmax(data, attr)
                 self._calculate_std(data, attr)
 
+    def _preprocess_data(self, data: list, attr: str) -> list[int | float]:
+        new_data = []
+        if attr == "inter_token_latency":
+            # flatten inter token latencies to 1D
+            for d in data:
+                new_data += d
+        else:
+            new_data = data
+        return new_data
+
     def _calculate_mean(self, data: list[int | float], attr: str):
         avg = np.mean(data)
         setattr(self, "avg_" + attr, avg)

@@ -173,8 +190,21 @@ def _calculate_std(self, data: list[int | float], attr: str):
         setattr(self, "std_" + attr, std)
 
     def __repr__(self):
-        attr_strs = ",".join([f"{k}={v}" for k, v in self.__dict__.items()])
-        return f"Statistics({attr_strs})"
+        attr_strs = []
+        for k, v in self.__dict__.items():
+            if not k.startswith("_"):
+                attr_strs.append(f"{k}={v}")
+        return f"Statistics({','.join(attr_strs)})"
+
+    @property
+    def data(self) -> dict:
+        """Return all the aggregated statistics."""
+        return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}
+
+    @property
+    def metrics(self) -> Metrics:
+        """Return the underlying metrics used to calculate the statistics."""
+        return self._metrics
 
     def _is_throughput_field(self, field: str):
         return field in Metrics.throughput_fields

@@ -437,13 +467,15 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             num_generated_tokens.append(total_output_tokens)
 
             # inter token latency
+            itl_per_request = []
             for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, num_output_tokens)):
                 # TMA-1676: handle empty first/last responses
                 # if the latter response has zero token (e.g. empty string),
                 # then set it default to one for the sake of inter token latency
                 # calculation and to avoid divide by zero.
                 num_token = 1 if n2 == 0 else n2
-                inter_token_latencies.append(round((t2 - t1) / num_token))
+                itl_per_request.append(round((t2 - t1) / num_token))
+            inter_token_latencies.append(itl_per_request)
 
         # request & output token throughput
         benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9  # nanosec

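The data and metrics getters added above are what downstream visualization code is expected to read. A hedged usage sketch, assuming pd is an LLMProfileDataParser constructed as in the tests below (the infer mode and load level are placeholder values):

    stat = pd.get_statistics(infer_mode="concurrency", load_level="10")

    aggregated = stat.data                                 # dict of avg_*, percentile, min/max, std entries set in __init__
    per_request_itl = stat.metrics.inter_token_latencies   # nested lists, e.g. [[2, 3], [2, 2]]
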
src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py

Lines changed: 36 additions & 11 deletions

@@ -28,12 +28,10 @@
 
 import json
 from io import StringIO
-from pathlib import Path
 
 import numpy as np
 import pytest
 from genai_perf.llm_metrics import LLMMetrics, LLMProfileDataParser
-from genai_perf.utils import remove_file
 from transformers import AutoTokenizer
 
 

@@ -122,12 +120,12 @@ def test_triton_llm_profile_data(self, mock_read_write) -> None:
           - experiment 1: [3 - 1, 4 - 2] = [2, 2]
           - experiment 2: [7 - 5, 6 - 3] = [2, 3]
         * inter token latencies
-          - experiment 1: [(5 - 3)/1, (8 - 5)/1, (7 - 4)/2, (11 - 7)/2]
-                        : [2, 3, 3/2, 2]
-                        : [2, 3, 2, 2]
-          - experiment 2: [(8 - 7)/1, (13 - 8)/1, (18 - 13)/1, (8 - 6)/1, (11 - 8)/2]
-                        : [1, 5, 5, 2, 3/2]
-                        : [1, 5, 5, 2, 2]
+          - experiment 1: [[(5 - 3)/1, (8 - 5)/1], [(7 - 4)/2, (11 - 7)/2]]
+                        : [[2, 3], [3/2, 2]]
+                        : [[2, 3], [2, 2]]  # rounded up
+          - experiment 2: [[(8 - 7)/1, (13 - 8)/1, (18 - 13)/1], [(8 - 6)/1, (11 - 8)/2]]
+                        : [[1, 5, 5], [2, 3/2]]
+                        : [[1, 5, 5], [2, 2]]  # rounded up
         * output token throughputs per request
           - experiment 1: [3/(8 - 1), 5/(11 - 2)] = [3/7, 5/9]
           - experiment 2: [4/(18 - 5), 5/(11 - 3)] = [4/13, 5/8]

@@ -145,8 +143,17 @@ def test_triton_llm_profile_data(self, mock_read_write) -> None:
             tokenizer=tokenizer,
         )
 
-        # experiment 1 statistics
+        # experiment 1 metrics & statistics
         stat = pd.get_statistics(infer_mode="concurrency", load_level="10")
+        metrics = stat.metrics
+
+        assert metrics.time_to_first_tokens == [2, 2]
+        assert metrics.inter_token_latencies == [[2, 3], [2, 2]]
+        ottpr = [3 / ns_to_sec(7), 5 / ns_to_sec(9)]
+        assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr)
+        ott = [8 / ns_to_sec(10)]
+        assert metrics.output_token_throughputs == pytest.approx(ott)
+        assert metrics.num_output_tokens == [3, 5]
 
         assert stat.avg_time_to_first_token == 2
         assert stat.avg_inter_token_latency == 2.25

@@ -186,6 +193,15 @@ def test_triton_llm_profile_data(self, mock_read_write) -> None:
 
         # experiment 2 statistics
        stat = pd.get_statistics(infer_mode="request_rate", load_level="2.0")
+        metrics = stat.metrics
+
+        assert metrics.time_to_first_tokens == [2, 3]
+        assert metrics.inter_token_latencies == [[1, 5, 5], [2, 2]]
+        ottpr = [4 / ns_to_sec(13), 5 / ns_to_sec(8)]
+        assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr)
+        ott = [3 / ns_to_sec(5)]
+        assert metrics.output_token_throughputs == pytest.approx(ott)
+        assert metrics.num_output_tokens == [4, 5]
 
         assert stat.avg_time_to_first_token == 2.5
         assert stat.avg_inter_token_latency == 3

@@ -234,8 +250,8 @@ def test_openai_llm_profile_data(self, mock_read_write) -> None:
         * time to first tokens
           - experiment 1: [3 - 1, 4 - 2] = [2, 2]
         * inter token latencies
-          - experiment 1: [(5 - 3)/1, (8 - 5)/1, (12 - 8)/1, (7 - 4)/1, (11 - 7)/2, (15 - 11)/2]
-                        : [2, 3, 4, 3, 2, 2]
+          - experiment 1: [[(5 - 3)/1, (8 - 5)/1, (12 - 8)/1], [(7 - 4)/1, (11 - 7)/2, (15 - 11)/2]]
+                        : [[2, 3, 4], [3, 2, 2]]
         * output token throughputs per request
           - experiment 1: [3/(12 - 1), 5/(15 - 2)] = [3/11, 5/13]
         * output token throughputs

@@ -252,6 +268,15 @@ def test_openai_llm_profile_data(self, mock_read_write) -> None:
 
         # experiment 1 statistics
         stat = pd.get_statistics(infer_mode="concurrency", load_level="10")
+        metrics = stat.metrics
+
+        assert metrics.time_to_first_tokens == [2, 2]
+        assert metrics.inter_token_latencies == [[2, 3, 4], [3, 2, 2]]
+        ottpr = [3 / ns_to_sec(11), 5 / ns_to_sec(13)]
+        assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr)
+        ott = [4 / ns_to_sec(7)]
+        assert metrics.output_token_throughputs == pytest.approx(ott)
+        assert metrics.num_output_tokens == [3, 5]
 
         assert stat.avg_time_to_first_token == 2
         assert stat.avg_inter_token_latency == 8 / 3

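The per-request inter-token-latency arithmetic spelled out in the test docstrings can be reproduced in isolation. A standalone sketch (a hypothetical helper, not part of the test suite) mirroring the pairwise loop in _parse_requests, applied to the first Triton request from the docstring above (responses at timestamps 3, 5, 8 carrying one token each):

    from itertools import pairwise

    def itl_for_one_request(res_timestamps, tokens_per_response):
        # One ITL value per consecutive pair of responses within a single request.
        itls = []
        for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, tokens_per_response)):
            num_token = 1 if n2 == 0 else n2  # avoid divide-by-zero on empty responses
            itls.append(round((t2 - t1) / num_token))
        return itls

    assert itl_for_one_request([3, 5, 8], [1, 1, 1]) == [2, 3]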