@@ -29,9 +29,9 @@
 import csv
 import json
 from enum import Enum, auto
-from itertools import pairwise
+from itertools import tee
 from pathlib import Path
-from typing import List
+from typing import Dict, List, Tuple, Union
 
 import numpy as np
 import pandas as pd
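
Note: this import swap is the heart of the patch. itertools.pairwise only exists on Python 3.10+, and the annotation styles being replaced below need recent interpreters too (PEP 585 builtin generics like list[int] require 3.9, PEP 604 unions like int | float require 3.10), so the diff falls back to typing-module generics plus a tee-based pairwise. A minimal sketch of the 3.8-safe spelling (the function name f is illustrative, not from the patch):

    # On Python 3.8, these raise at import/definition time:
    #   from itertools import pairwise         # ImportError (3.10+ only)
    #   def f(x: list[int | float]) -> None:   # TypeError (PEP 585/604 syntax)
    from typing import List, Union

    def f(x: List[Union[int, float]]) -> None:
        """Same annotation, spelled with typing generics that work on 3.8."""
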
@@ -115,7 +115,7 @@ def __init__(
         request_throughputs: List[float] = [],
         request_latencies: List[int] = [],
         time_to_first_tokens: List[int] = [],
-        inter_token_latencies: List[list[int]] = [[]],
+        inter_token_latencies: List[List[int]] = [[]],
         output_token_throughputs: List[float] = [],
         output_token_throughputs_per_request: List[int] = [],
         num_output_tokens: List[int] = [],
@@ -170,7 +170,7 @@ def __init__(self, metrics: Metrics):
             self._calculate_minmax(data, attr)
             self._calculate_std(data, attr)
 
-    def _preprocess_data(self, data: list, attr: str) -> list[int | float]:
+    def _preprocess_data(self, data: List, attr: str) -> List[Union[int, float]]:
         new_data = []
         if attr == "inter_token_latency":
             # flatten inter token latencies to 1D
@@ -180,11 +180,11 @@ def _preprocess_data(self, data: list, attr: str) -> list[int | float]:
             new_data = data
         return new_data
 
-    def _calculate_mean(self, data: list[int | float], attr: str) -> None:
+    def _calculate_mean(self, data: List[Union[int, float]], attr: str) -> None:
         avg = np.mean(data)
         setattr(self, "avg_" + attr, avg)
 
-    def _calculate_percentiles(self, data: list[int | float], attr: str) -> None:
+    def _calculate_percentiles(self, data: List[Union[int, float]], attr: str) -> None:
         p25, p50, p75 = np.percentile(data, [25, 50, 75])
         p90, p95, p99 = np.percentile(data, [90, 95, 99])
         setattr(self, "p25_" + attr, p25)
@@ -194,12 +194,12 @@ def _calculate_percentiles(self, data: list[int | float], attr: str) -> None:
         setattr(self, "p95_" + attr, p95)
         setattr(self, "p99_" + attr, p99)
 
-    def _calculate_minmax(self, data: list[int | float], attr: str) -> None:
+    def _calculate_minmax(self, data: List[Union[int, float]], attr: str) -> None:
         min, max = np.min(data), np.max(data)
         setattr(self, "min_" + attr, min)
         setattr(self, "max_" + attr, max)
 
-    def _calculate_std(self, data: list[int | float], attr: str) -> None:
+    def _calculate_std(self, data: List[Union[int, float]], attr: str) -> None:
         std = np.std(data)
         setattr(self, "std_" + attr, std)
 
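
For reference, _calculate_percentiles leans on np.percentile's default linear interpolation, and setattr then exposes the results as attributes such as p50_request_latency. A quick worked example with hypothetical latency values (not from this repo):

    import numpy as np

    latencies = [10, 20, 30, 40, 100]  # hypothetical request latencies
    p50, p99 = np.percentile(latencies, [50, 99])
    print(p50, p99)  # 30.0 97.6
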
@@ -460,7 +460,7 @@ def get_statistics(self, infer_mode: str, load_level: str) -> Statistics:
             raise KeyError(f"Profile with {infer_mode}={load_level} does not exist.")
         return self._profile_results[(infer_mode, load_level)]
 
-    def get_profile_load_info(self) -> list[tuple[str, str]]:
+    def get_profile_load_info(self) -> List[Tuple[str, str]]:
         """Return available (infer_mode, load_level) tuple keys."""
         return [k for k, _ in self._profile_results.items()]
 
@@ -547,7 +547,9 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
 
         # inter token latency
         itl_per_request = []
-        for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, num_output_tokens)):
+        for (t1, _), (t2, n2) in self._pairwise(
+            zip(res_timestamps, num_output_tokens)
+        ):
             # TMA-1676: handle empty first/last responses
             # if the latter response has zero token (e.g. empty string),
             # then set it default to one for the sake of inter token latency
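
The loop body that consumes these pairs is elided from the hunk, but per the comment it divides each timestamp delta by the latter response's token count, defaulting zero-token responses to one. A hedged sketch of that arithmetic, using the same tee-based pairwise the patch adds below:

    from itertools import tee

    def pairwise(it):
        a, b = tee(it)
        next(b, None)
        return zip(a, b)

    res_timestamps = [0, 5_000_000, 9_000_000]  # hypothetical ns timestamps
    num_output_tokens = [1, 2, 2]
    itl = []
    for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, num_output_tokens)):
        n2 = max(n2, 1)  # empty responses count as one token
        itl.append((t2 - t1) / n2)
    print(itl)  # [2500000.0, 2000000.0]
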
@@ -572,8 +574,14 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
             num_input_tokens,
         )
 
+    def _pairwise(self, iterable):
+        """Generate pairs of consecutive elements from the given iterable."""
+        a, b = tee(iterable)
+        next(b, None)
+        return zip(a, b)
+
     def _preprocess_response(
-        self, res_timestamps: list[int], res_outputs: list[dict[str, str]]
+        self, res_timestamps: List[int], res_outputs: List[Dict[str, str]]
     ) -> None:
         """Helper function to preprocess responses of a request."""
         if self._service_kind == "openai":
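
The _pairwise helper added above is the classic tee recipe from the itertools documentation, and it matches itertools.pairwise on 3.10+: tee duplicates the iterator, next(b, None) advances the copy by one element, and zip truncates at the shorter stream. The mechanism, step by step:

    >>> from itertools import tee
    >>> a, b = tee([1, 2, 3, 4])
    >>> next(b, None)  # skip the first element of the copy
    1
    >>> list(zip(a, b))
    [(1, 2), (2, 3), (3, 4)]
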
@@ -604,7 +612,7 @@ def _preprocess_response(
                 res_timestamps.pop()
                 res_outputs.pop()
 
-    def _tokenize_request_inputs(self, req_inputs: dict) -> list[int]:
+    def _tokenize_request_inputs(self, req_inputs: dict) -> List[int]:
         """Deserialize the request input and return tokenized inputs."""
         if self._service_kind == "triton":
             return self._tokenize_triton_request_input(req_inputs)
@@ -613,12 +621,12 @@ def _tokenize_request_inputs(self, req_inputs: dict) -> list[int]:
         else:
             raise ValueError(f"Unknown service kind: '{self._service_kind}'.")
 
-    def _tokenize_triton_request_input(self, req_inputs: dict) -> list[int]:
+    def _tokenize_triton_request_input(self, req_inputs: dict) -> List[int]:
         """Tokenize the Triton request input texts."""
         encodings = self._tokenizer(req_inputs["text_input"])
         return encodings.data["input_ids"]
 
-    def _tokenize_openai_request_input(self, req_inputs: dict) -> list[int]:
+    def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]:
         """Tokenize the OpenAI request input texts."""
         payload = json.loads(req_inputs["payload"])
         if self._response_format == ResponseFormat.OPENAI_CHAT_COMPLETIONS:
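
Both request-input paths end by calling self._tokenizer and reading encodings.data["input_ids"]. A hedged sketch of what that assumes about the tokenizer object (that it is a Hugging Face tokenizer returning a BatchEncoding is my assumption, not stated in the hunk):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed HF tokenizer
    encodings = tokenizer("hello world")
    # BatchEncoding.data is a plain dict holding the token ids
    print(encodings.data["input_ids"])  # e.g. [31373, 995]
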
@@ -632,7 +640,7 @@ def _tokenize_openai_request_input(self, req_inputs: dict) -> list[int]:
         encodings = self._tokenizer(input_text)
         return encodings.data["input_ids"]
 
-    def _tokenize_response_outputs(self, res_outputs: dict) -> list[list[int]]:
+    def _tokenize_response_outputs(self, res_outputs: dict) -> List[List[int]]:
         """Deserialize the response output and return tokenized outputs."""
         if self._service_kind == "triton":
             return self._tokenize_triton_response_output(res_outputs)
@@ -641,22 +649,22 @@ def _tokenize_response_outputs(self, res_outputs: dict) -> list[list[int]]:
         else:
             raise ValueError(f"Unknown service kind: '{self._service_kind}'.")
 
-    def _tokenize_triton_response_output(self, res_outputs: dict) -> list[list[int]]:
+    def _tokenize_triton_response_output(self, res_outputs: dict) -> List[List[int]]:
         """Tokenize the Triton response output texts."""
         output_texts = []
         for output in res_outputs:
             output_texts.append(output["text_output"])
         return self._run_tokenizer(output_texts)
 
-    def _tokenize_openai_response_output(self, res_outputs: dict) -> list[list[int]]:
+    def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]]:
         """Tokenize the OpenAI response output texts."""
         output_texts = []
         for output in res_outputs:
             text = self._extract_openai_text_output(output["response"])
             output_texts.append(text)
         return self._run_tokenizer(output_texts)
 
-    def _run_tokenizer(self, output_texts: list[str]) -> list[list[int]]:
+    def _run_tokenizer(self, output_texts: List[str]) -> List[List[int]]:
         # exclamation mark trick forces the llama tokenization to consistently
         # start each output with a specific token which allows us to safely skip
         # the first token of every tokenized output and get only the ones that
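
The comment describes the trick but the method body is cut off in this view. A hedged sketch of how such a trick is commonly implemented (my reconstruction under the assumptions above, not necessarily the patch's exact code):

    def run_tokenizer(tokenizer, output_texts):
        # Prepend "!" so the tokenizer emits a fixed, known first token for
        # every output, then drop that first token from each result.
        encodings = tokenizer(["!" + text for text in output_texts])
        return [ids[1:] for ids in encodings.data["input_ids"]]
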