Commit c2bf669

noemotiovon and Tcc0403 committed
[Benchmark]: Add --gpu-filter to visualizer and simplify D2 guidelines
benchmarks_visualizer.py:
- Add `--gpu-filter` CLI flag to select a specific GPU when benchmark data contains results from multiple devices; falls back to the most recent device with a warning when omitted or unmatched.
- Extract `gpu_name_filter()` and `extra_config_filter()` as standalone helpers; `load_data()` now applies filters in explicit order: kernel/metric/mode → sweep-mode → GPU → extra config.

BENCHMARK_GUIDELINES.md:
- Add guideline: import baseline kernels from the test suite instead of duplicating reference implementations in benchmark scripts.
- Remove the continuous hidden-size sweep variant (D2.1) and the `compute_hidden_size_sweep_config()` reference; D2 now covers only the discrete model-config sweep.

Co-authored-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com>
1 parent fc01015 commit c2bf669

File tree

2 files changed: +120, -54 lines changed


benchmark/BENCHMARK_GUIDELINES.md (9 additions, 20 deletions)

````diff
@@ -5,6 +5,11 @@
 - **Location**: `benchmark/scripts/`
 - **Naming**: `benchmark_<kernel_name>.py` (e.g. `benchmark_geglu.py`, `benchmark_dyt.py`)
 
+> **Baseline implementations**: Import reference (non-Liger) kernels from the
+> test suite (e.g. `test/transformers/test_<kernel>.py`) to use as baselines.
+> This keeps benchmark and test implementations in sync and avoids duplicating
+> reference code in benchmark scripts.
+
 ## 2. Shared infrastructure
 
 Do **not** hardcode batch size, sequence length, or model dimensions. All benchmark scripts share the following:
@@ -13,7 +18,7 @@ Do **not** hardcode batch size, sequence length, or model dimensions. All benchm
 |------|-----|
 | Model dimensions (hidden_size, vocab_size, etc.) | `benchmark_model_configs.py`: `ModelConfig`, `MODEL_REGISTRY`, `get_benchmark_model_config()` |
 | Memory probing | `benchmark_model_configs.py`: `estimate_kernel_peak_memory()` |
-| Safe sweep configs | `compute_seq_len_sweep_config()`, `compute_hidden_size_sweep_config()`, `compute_model_config_sweep_config()` |
+| Safe sweep configs | `compute_seq_len_sweep_config()`, `compute_model_config_sweep_config()` |
 | Speed / memory measurement | `utils.py`: `run_speed_benchmark()`, `run_memory_benchmark()` |
 | Running the grid and writing CSV | `utils.py`: `run_benchmarks()` |
 | CLI arguments | `utils.py`: `parse_benchmark_script_args()` — provides `--model`, `--overwrite`, `--sweep-mode`, `--bt` |
@@ -94,25 +99,9 @@ python benchmark_geglu.py --model llama_3_8b --overwrite
 
 ## 4. D2 — Model dimension sweep
 
-Sweep model-related dimensions (e.g. hidden_size, or discrete model configs from `MODEL_REGISTRY`) with a **fixed token count**. Use `--bt` to set the token count.
-
-D2 has two variants:
-
-### 4.1 Continuous sweep (e.g. hidden_size)
-
-Sweep a single model parameter (like hidden_size) in a continuous range with fixed BT.
-
-**How to implement:**
-
-1. Probe: measure peak memory at `(BT, model.hidden_size)`.
-2. `config = compute_hidden_size_sweep_config(model, kernel_peak_bytes=peak_bytes, bt=BT)`. Returns `HiddenSizeSweepConfig` with `bt` and `max_hidden_size`.
-3. Build `x_values` from `config.max_hidden_size` (e.g. `[1024 * i for i in range(1, 17) if 1024 * i <= config.max_hidden_size]`).
-4. Build `extra_benchmark_configs` with `BT=config.bt`, `dtype=model.dtype`, etc.
-5. Call `run_benchmarks(...)`.
-
-**Reference**: `benchmark_dyt.py` — hidden_size sweep with `compute_hidden_size_sweep_config()`.
+Sweep across discrete model configs from `MODEL_REGISTRY` with a **fixed token count**. Use `--bt` to set the token count.
 
-### 4.2 Discrete model-config sweep
+### 4.1 Discrete model-config sweep
 
 Sweep across all `MODEL_REGISTRY` entries as discrete data points. Activated by `--sweep-mode model_config`.
 
@@ -155,7 +144,7 @@ def bench_speed_geglu_model_config(input):
 
 **Reference**: `benchmark_geglu.py`, `benchmark_swiglu.py`, `benchmark_dyt.py` — all support `--sweep-mode model_config`.
 
-### 4.3 How to run
+### 4.2 How to run
 
 ```bash
 # Discrete model-config sweep with default bt=2048
````
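The new baseline-implementations guideline boils down to: define the reference kernel once (in the test suite) and import it from the benchmark script, instead of re-implementing it. A minimal self-contained sketch of the pattern; the `baseline_geglu` helper and its import path are illustrative, not the repo's actual API:

```python
import math

# In a real benchmark script the baseline would be imported from the
# test suite rather than redefined, e.g. (hypothetical path):
#   from test.transformers.test_geglu import baseline_geglu
# The stand-in below keeps this sketch self-contained.

def gelu(v: float) -> float:
    # Exact (erf-based) GELU on a single scalar.
    return 0.5 * v * (1.0 + math.erf(v / math.sqrt(2.0)))

def baseline_geglu(row: list) -> list:
    # GeGLU reference: split the row in half, gate the second half
    # through GELU, and multiply elementwise.
    half = len(row) // 2
    return [a * gelu(b) for a, b in zip(row[:half], row[half:])]

out = baseline_geglu([1.0, 2.0, 0.0, 1.0])
print([round(v, 4) for v in out])  # [0.0, 1.6827]
```

Because the benchmark imports the same function the tests verify, a fix to the reference implementation propagates to both places automatically.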

benchmark/benchmarks_visualizer.py (111 additions, 34 deletions)

```diff
@@ -44,6 +44,7 @@ class VisualizationsConfig:
     kernel_operation_mode: str = "full"
     sweep_mode: str = "token_length"
     extra_config_filter: str | None = None
+    gpu_filter: str | None = None
     display: bool = False
     overwrite: bool = False
 
@@ -86,6 +87,14 @@ def parse_args() -> VisualizationsConfig:
         "Can be a substring to match or a JSON-like 'key=value' pair (e.g., \"'H': 4096\" or \"H=4096\" for simple cases). "
         "Defaults to None (first available config if multiple exist).",
     )
+    parser.add_argument(
+        "--gpu-filter",
+        type=str,
+        default=None,
+        help="Filter by GPU name. When multiple devices are present, selects "
+        "the matching GPU (uses most recent match if multiple found). "
+        "If omitted, the most recent device is used automatically.",
+    )
     parser.add_argument("--display", action="store_true", help="Display the visualization")
     parser.add_argument(
         "--overwrite",
@@ -97,51 +106,77 @@ def parse_args() -> VisualizationsConfig:
     return args
 
 
-def load_data(config: VisualizationsConfig) -> pd.DataFrame:
-    """Loads the benchmark data from the CSV file and filters it based on the configuration.
+def gpu_name_filter(df: pd.DataFrame, gpu_filter: str | None = None) -> pd.DataFrame:
+    """Filter benchmark data by GPU name when multiple devices are present.
 
     Args:
-        config (VisualizationsConfig): Configuration object for the visualizations script.
-
-    Raises:
-        ValueError: If no data is found for the given filters.
+        df: Pre-filtered benchmark dataframe.
+        gpu_filter: Optional GPU name substring to match. If provided, selects
+            the matching GPU (uses most recent if multiple match). If None,
+            automatically picks the most recent device.
 
     Returns:
-        pd.DataFrame: Filtered benchmark dataframe.
+        pd.DataFrame: Dataframe filtered to a single GPU.
     """
-    df = pd.read_csv(DATA_PATH)
-    df["extra_benchmark_config"] = df["extra_benchmark_config_str"].apply(json.loads)
+    if "gpu_name" not in df.columns or df.empty:
+        return df
+
+    unique_gpus = df["gpu_name"].unique()
+    if len(unique_gpus) <= 1:
+        return df
+
+    if gpu_filter:
+        matched = [g for g in unique_gpus if gpu_filter in g]
+        if matched:
+            if len(matched) > 1:
+                # Multiple matches — pick the most recent
+                matched_df = df[df["gpu_name"].isin(matched)]
+                selected = matched_df.sort_values("timestamp", ascending=False)["gpu_name"].iloc[0]
+                print(
+                    f"Warning: Multiple GPUs match filter '{gpu_filter}': {matched}. "
+                    f"Using the most recent: '{selected}'."
+                )
+            else:
+                selected = matched[0]
+        else:
+            # No match — fall back to most recent GPU
+            selected = df.sort_values("timestamp", ascending=False)["gpu_name"].iloc[0]
+            print(
+                f"Warning: No GPU matches filter '{gpu_filter}'. "
+                f"Available GPUs: {list(unique_gpus)}. "
+                f"Falling back to most recent device: '{selected}'."
+            )
+    else:
+        # No filter provided — pick the most recent device
+        selected = df.sort_values("timestamp", ascending=False)["gpu_name"].iloc[0]
+        print(
+            f"Warning: Data contains entries from multiple devices: {list(unique_gpus)}. "
+            f"Using data from the most recent device: '{selected}'. "
+            f"Use --gpu-filter to select a specific device."
        )
 
-    mask = (
-        (df["kernel_name"] == config.kernel_name)
-        & (df["metric_name"] == config.metric_name)
-        & (df["kernel_operation_mode"] == config.kernel_operation_mode)
-    )
+    return df[df["gpu_name"] == selected]
 
-    # Filter by sweep mode early, before extra_benchmark_config resolution.
-    if config.sweep_mode == "model_config":
-        mask = mask & (df["x_name"] == SWEEP_MODE_X_NAME)
-    elif config.sweep_mode == "token_length":
-        mask = mask & (df["x_name"] != SWEEP_MODE_X_NAME)
 
-    base_filtered_df = df[mask]
+def extra_config_filter(df: pd.DataFrame, config: VisualizationsConfig) -> pd.DataFrame:
+    """Filter benchmark data by extra_benchmark_config.
 
-    if base_filtered_df.empty:
-        raise ValueError(
-            f"No data found for kernel_name='{config.kernel_name}', "
-            f"metric_name='{config.metric_name}', "
-            f"kernel_operation_mode='{config.kernel_operation_mode}'."
-        )
+    Args:
+        df: Pre-filtered benchmark dataframe (already filtered by kernel, metric, etc.).
+        config: Visualization configuration with optional extra_config_filter.
 
-    unique_extra_configs_str = base_filtered_df["extra_benchmark_config_str"].unique()
+    Returns:
+        pd.DataFrame: Dataframe filtered to a single extra_benchmark_config.
+    """
+    unique_extra_configs_str = df["extra_benchmark_config_str"].unique()
     selected_extra_config_str = None
 
     if len(unique_extra_configs_str) == 0:
         print(
             "Warning: No extra_benchmark_config found for the initial filters. "
             "Proceeding with all data from initial filter."
         )
-        return base_filtered_df
+        return df
 
     if config.extra_config_filter:
         matched_configs = []
@@ -196,14 +231,12 @@ def load_data(config: VisualizationsConfig) -> pd.DataFrame:
         print(f"Using unique extra_benchmark_config: {selected_extra_config_str}")
 
     if selected_extra_config_str:
-        final_filtered_df = base_filtered_df[
-            base_filtered_df["extra_benchmark_config_str"] == selected_extra_config_str
-        ]
+        result_df = df[df["extra_benchmark_config_str"] == selected_extra_config_str]
     else:
         print("Warning: Could not select an extra_benchmark_config. Using data from initial filter if any.")
-        final_filtered_df = base_filtered_df
+        result_df = df
 
-    if final_filtered_df.empty:
+    if result_df.empty:
         raise ValueError(
             f"No data found after attempting to filter by extra_benchmark_config. "
             f"Selected/Defaulted extra_config_str: {selected_extra_config_str}"
@@ -214,7 +247,50 @@ def load_data(config: VisualizationsConfig) -> pd.DataFrame:
     print(
         f"Plotting data for extra_benchmark_config: {json.loads(selected_extra_config_str if selected_extra_config_str else '{}')}"
     )
-    return final_filtered_df
+    return result_df
+
+
+def load_data(config: VisualizationsConfig) -> pd.DataFrame:
+    """Loads the benchmark data from the CSV file and filters it based on the configuration.
+
+    Applies filters in order: kernel/metric/mode → sweep mode → GPU → extra config.
+
+    Args:
+        config (VisualizationsConfig): Configuration object for the visualizations script.
+
+    Raises:
+        ValueError: If no data is found for the given filters.
+
+    Returns:
+        pd.DataFrame: Filtered benchmark dataframe.
+    """
+    df = pd.read_csv(DATA_PATH)
+    df["extra_benchmark_config"] = df["extra_benchmark_config_str"].apply(json.loads)
+
+    mask = (
+        (df["kernel_name"] == config.kernel_name)
+        & (df["metric_name"] == config.metric_name)
+        & (df["kernel_operation_mode"] == config.kernel_operation_mode)
+    )
+
+    # Filter by sweep mode early, before extra_benchmark_config resolution.
+    if config.sweep_mode == "model_config":
+        mask = mask & (df["x_name"] == SWEEP_MODE_X_NAME)
+    elif config.sweep_mode == "token_length":
+        mask = mask & (df["x_name"] != SWEEP_MODE_X_NAME)
+
+    base_filtered_df = df[mask]
+
+    if base_filtered_df.empty:
+        raise ValueError(
+            f"No data found for kernel_name='{config.kernel_name}', "
+            f"metric_name='{config.metric_name}', "
+            f"kernel_operation_mode='{config.kernel_operation_mode}'."
+        )
+
+    # Apply GPU filter, then extra config filter
+    base_filtered_df = gpu_name_filter(base_filtered_df, config.gpu_filter)
+    return extra_config_filter(base_filtered_df, config)
 
 
 def plot_data(df: pd.DataFrame, config: VisualizationsConfig):
@@ -331,6 +407,7 @@ def main():
         kernel_operation_mode=mode,
         sweep_mode=args.sweep_mode,
         extra_config_filter=args.extra_config_filter,
+        gpu_filter=args.gpu_filter,
         display=args.display,
         overwrite=args.overwrite,
     )
```
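The most-recent-device fallback in `gpu_name_filter()` reduces to sorting the benchmark rows by their timestamp column and taking the GPU of the newest row. A minimal sketch on a toy frame; the column names mirror the benchmark CSV schema, while the GPU names and timestamps are made up:

```python
import pandas as pd

# Toy benchmark data spanning two devices; "gpu_name" and "timestamp"
# mirror the columns gpu_name_filter() reads.
df = pd.DataFrame(
    {
        "gpu_name": ["NVIDIA A100", "NVIDIA H100", "NVIDIA A100"],
        "timestamp": ["2024-01-01", "2024-03-01", "2024-02-01"],
        "y_value": [1.0, 2.0, 3.0],
    }
)

# No --gpu-filter given: fall back to the device with the newest row.
selected = df.sort_values("timestamp", ascending=False)["gpu_name"].iloc[0]
filtered = df[df["gpu_name"] == selected]
print(selected, len(filtered))  # NVIDIA H100 1
```

Note that ISO-8601 timestamp strings sort correctly lexicographically, so no datetime parsing is needed for the tie-break.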
