Extend Statistics and Metrics for visualization (#556)

nv-hwoo · web-flow · commit 28e974605275 · 2024-04-04T13:31:22.000-07:00
* store ITL (inter-token latency) per request and add data getter

* Remove unused import

* check whether data is empty after preprocessing
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
@@ -26,9 +26,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import contextlib
 import csv
-import io
 import json
 from itertools import pairwise
 
@@ -78,6 +76,13 @@ def __init__(
             "request_latencies": "request_latency",
         }
 
+    def __repr__(self):
+        # show only public attributes; names starting with "_" are skipped
+        public_items = [
+            f"{k}={v}" for k, v in self.__dict__.items() if not k.startswith("_")
+        ]
+        return f"Metrics({','.join(public_items)})"
+
     @property
     def data(self) -> dict:
         """Returns all the metrics."""
@@ -99,7 +104,7 @@ def __init__(
         request_throughputs: list[float] = [],
         request_latencies: list[int] = [],
         time_to_first_tokens: list[int] = [],
-        inter_token_latencies: list[int] = [],
+        inter_token_latencies: list[list[int]] = [[]],
         output_token_throughputs: list[float] = [],
         output_token_throughputs_per_request: list[int] = [],
         num_output_tokens: list[int] = [],
@@ -141,14 +146,26 @@ class Statistics:
 
     def __init__(self, metrics: Metrics):
         # iterate through Metrics to calculate statistics and set attributes
+        self._metrics = metrics  # exposed via the `metrics` property
         for attr, data in metrics.data.items():
+            attr = metrics.get_base_name(attr)
+            data = self._preprocess_data(data, attr)  # flattens nested ITL lists
             if data:
-                attr = metrics.get_base_name(attr)
                 self._calculate_mean(data, attr)
                 self._calculate_percentiles(data, attr)
                 self._calculate_minmax(data, attr)
                 self._calculate_std(data, attr)
 
+    def _preprocess_data(self, data: list, attr: str) -> list[int | float]:
+        # only inter token latencies arrive nested (one list per request)
+        if attr != "inter_token_latency":
+            return data
+        # flatten inter token latencies to 1D
+        flattened = []
+        for per_request in data:
+            flattened += per_request
+        return flattened
+
     def _calculate_mean(self, data: list[int | float], attr: str):
         avg = np.mean(data)
         setattr(self, "avg_" + attr, avg)
@@ -173,8 +190,21 @@ def _calculate_std(self, data: list[int | float], attr: str):
         setattr(self, "std_" + attr, std)
 
     def __repr__(self):
-        attr_strs = ",".join([f"{k}={v}" for k, v in self.__dict__.items()])
-        return f"Statistics({attr_strs})"
+        # show only public attributes; names starting with "_" are skipped
+        public_items = [
+            f"{k}={v}" for k, v in self.__dict__.items() if not k.startswith("_")
+        ]
+        return f"Statistics({','.join(public_items)})"
+
+    @property
+    def data(self) -> dict:
+        """Return all public (non-underscore) attributes as a dict."""
+        return {k: v for k, v in vars(self).items() if not k.startswith("_")}
+
+    @property
+    def metrics(self) -> Metrics:
+        """Return the Metrics instance this object was constructed from."""
+        return self._metrics
 
     def _is_throughput_field(self, field: str):
         return field in Metrics.throughput_fields
@@ -437,13 +467,15 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             num_generated_tokens.append(total_output_tokens)
 
             # inter token latency
+            itl_per_request = []
             for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, num_output_tokens)):
                 # TMA-1676: handle empty first/last responses
                 # if the latter response has zero token (e.g. empty string),
                 # then set it default to one for the sake of inter token latency
                 # calculation and to avoid divide by zero.
                 num_token = 1 if n2 == 0 else n2
-                inter_token_latencies.append(round((t2 - t1) / num_token))
+                itl_per_request.append(round((t2 - t1) / num_token))
+            inter_token_latencies.append(itl_per_request)
 
         # request & output token throughput
         benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9  # nanosec
diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
@@ -28,12 +28,10 @@
 
 import json
 from io import StringIO
-from pathlib import Path
 
 import numpy as np
 import pytest
 from genai_perf.llm_metrics import LLMMetrics, LLMProfileDataParser
-from genai_perf.utils import remove_file
 from transformers import AutoTokenizer
 
 
@@ -122,12 +120,12 @@ def test_triton_llm_profile_data(self, mock_read_write) -> None:
             - experiment 1: [3 - 1, 4 - 2] = [2, 2]
             - experiment 2: [7 - 5, 6 - 3] = [2, 3]
         * inter token latencies
-            - experiment 1: [(5 - 3)/1, (8 - 5)/1, (7 - 4)/2, (11 - 7)/2]
-                          : [2, 3, 3/2, 2]
-                          : [2, 3, 2, 2]
-            - experiment 2: [(8 - 7)/1, (13 - 8)/1, (18 - 13)/1, (8 - 6)/1, (11 - 8)/2]
-                          : [1, 5, 5, 2, 3/2]
-                          : [1, 5, 5, 2, 2]
+            - experiment 1: [[(5 - 3)/1, (8 - 5)/1], [(7 - 4)/2, (11 - 7)/2]]
+                          : [[2, 3], [3/2, 2]]
+                          : [[2, 3], [2, 2]]  # round(3/2) == 2
+            - experiment 2: [[(8 - 7)/1, (13 - 8)/1, (18 - 13)/1], [(8 - 6)/1, (11 - 8)/2]]
+                          : [[1, 5, 5], [2, 3/2]]
+                          : [[1, 5, 5], [2, 2]]  # round(3/2) == 2
         * output token throughputs per request
             - experiment 1: [3/(8 - 1), 5/(11 - 2)] = [3/7, 5/9]
             - experiment 2: [4/(18 - 5), 5/(11 - 3)] = [4/13, 5/8]
@@ -145,8 +143,17 @@ def test_triton_llm_profile_data(self, mock_read_write) -> None:
             tokenizer=tokenizer,
         )
 
-        # experiment 1 statistics
+        # experiment 1 metrics & statistics
         stat = pd.get_statistics(infer_mode="concurrency", load_level="10")
+        metrics = stat.metrics
+
+        assert metrics.time_to_first_tokens == [2, 2]
+        assert metrics.inter_token_latencies == [[2, 3], [2, 2]]
+        ottpr = [3 / ns_to_sec(7), 5 / ns_to_sec(9)]
+        assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr)
+        ott = [8 / ns_to_sec(10)]
+        assert metrics.output_token_throughputs == pytest.approx(ott)
+        assert metrics.num_output_tokens == [3, 5]
 
         assert stat.avg_time_to_first_token == 2
         assert stat.avg_inter_token_latency == 2.25
@@ -186,6 +193,15 @@ def test_triton_llm_profile_data(self, mock_read_write) -> None:
 
         # experiment 2 statistics
         stat = pd.get_statistics(infer_mode="request_rate", load_level="2.0")
+        metrics = stat.metrics
+
+        assert metrics.time_to_first_tokens == [2, 3]
+        assert metrics.inter_token_latencies == [[1, 5, 5], [2, 2]]
+        ottpr = [4 / ns_to_sec(13), 5 / ns_to_sec(8)]
+        assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr)
+        ott = [3 / ns_to_sec(5)]
+        assert metrics.output_token_throughputs == pytest.approx(ott)
+        assert metrics.num_output_tokens == [4, 5]
 
         assert stat.avg_time_to_first_token == 2.5
         assert stat.avg_inter_token_latency == 3
@@ -234,8 +250,8 @@ def test_openai_llm_profile_data(self, mock_read_write) -> None:
         * time to first tokens
             - experiment 1: [3 - 1, 4 - 2] = [2, 2]
         * inter token latencies
-            - experiment 1: [(5 - 3)/1, (8 - 5)/1, (12 - 8)/1, (7 - 4)/1, (11 - 7)/2, (15 - 11)/2]
-                          : [2, 3, 4, 3, 2, 2]
+            - experiment 1: [[(5 - 3)/1, (8 - 5)/1, (12 - 8)/1], [(7 - 4)/1, (11 - 7)/2, (15 - 11)/2]]
+                          : [[2, 3, 4], [3, 2, 2]]
         * output token throughputs per request
             - experiment 1: [3/(12 - 1), 5/(15 - 2)] = [3/11, 5/13]
         * output token throughputs
@@ -252,6 +268,15 @@ def test_openai_llm_profile_data(self, mock_read_write) -> None:
 
         # experiment 1 statistics
         stat = pd.get_statistics(infer_mode="concurrency", load_level="10")
+        metrics = stat.metrics
+
+        assert metrics.time_to_first_tokens == [2, 2]
+        assert metrics.inter_token_latencies == [[2, 3, 4], [3, 2, 2]]
+        ottpr = [3 / ns_to_sec(11), 5 / ns_to_sec(13)]
+        assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr)
+        ott = [4 / ns_to_sec(7)]
+        assert metrics.output_token_throughputs == pytest.approx(ott)
+        assert metrics.num_output_tokens == [3, 5]
 
         assert stat.avg_time_to_first_token == 2
         assert stat.avg_inter_token_latency == 8 / 3