From 9eac945752577427aedbaa61061b58e850a2025b Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Tue, 10 Mar 2026 15:29:04 +0000
Subject: [PATCH 01/12] Add JIT perf CI workflow, perf tests, and summarizer

---
 .github/workflows/call-jit-perf-test.yml      | 158 +++++++++++
 .github/workflows/schedule-nightly.yml        |  18 +-
 .../workflows/workflow-run-collect-data.yml   |   2 +
 test/ttnn-jit/perf_ci/perf_tests.py           | 101 +++++++
 test/ttnn-jit/perf_ci/run_perf_collect.sh     |  72 +++++
 .../perf_ci/summarize_perf_results.py         | 254 ++++++++++++++++++
 6 files changed, 602 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/call-jit-perf-test.yml
 create mode 100644 test/ttnn-jit/perf_ci/perf_tests.py
 create mode 100755 test/ttnn-jit/perf_ci/run_perf_collect.sh
 create mode 100755 test/ttnn-jit/perf_ci/summarize_perf_results.py

diff --git a/.github/workflows/call-jit-perf-test.yml b/.github/workflows/call-jit-perf-test.yml
new file mode 100644
index 00000000000..dc82d50abd4
--- /dev/null
+++ b/.github/workflows/call-jit-perf-test.yml
@@ -0,0 +1,158 @@
+name: JIT Perf Test
+
+on:
+  workflow_call:
+    inputs:
+      docker_image:
+        description: 'Docker image for the build'
+        required: true
+        type: string
+
+permissions:
+  checks: write
+  packages: write
+
+env:
+  TRACY_NO_INVARIANT_CHECK: 1
+  TRACY_NO_ISA_EXTENSIONS: 1
+
+jobs:
+  run-jit-perf:
+    timeout-minutes: 120
+    name: "JIT Perf Collection"
+
+    runs-on:
+      - n150
+      - in-service
+
+    container:
+      image: ${{ inputs.docker_image }}
+      options: --device /dev/tenstorrent
+      volumes:
+        - /dev/hugepages:/dev/hugepages
+        - /dev/hugepages-1G:/dev/hugepages-1G
+        - /etc/udev/rules.d:/etc/udev/rules.d
+        - /lib/modules:/lib/modules
+        - /opt/tt_metal_infra/provisioning/provisioning_env:/opt/tt_metal_infra/provisioning/provisioning_env
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+
+    - name: Fetch job id
+      id: fetch-job-id
+      uses: tenstorrent/tt-github-actions/.github/actions/job_id@main
+      with:
+        job_name: "JIT Perf Collection"
+
+    - name: Set reusable strings
+      id: strings
+      shell: bash
+      env:
+        JOB_ID: ${{ steps.fetch-job-id.outputs.job_id }}
+      run: |
+        echo "work-dir=$(pwd)" >> "$GITHUB_OUTPUT"
+        echo "build-output-dir=$(pwd)/build" >> "$GITHUB_OUTPUT"
+        echo "install-output-dir=$(pwd)/install" >> "$GITHUB_OUTPUT"
+        echo "perf-output-dir=$(pwd)/jit_perf_results" >> "$GITHUB_OUTPUT"
+
+    - name: Git safe dir
+      run: git config --global --add safe.directory ${{ steps.strings.outputs.work-dir }}
+
+    - name: Use install artifacts
+      uses: tenstorrent/tt-forge/.github/actions/download-artifact@main
+      with:
+        name: install-artifacts-tracy
+        path: install
+        github_token: ${{ secrets.GITHUB_TOKEN }}
+
+    - name: Remove existing whls files
+      shell: bash
+      run: |
+        rm -f *.whl
+
+    - name: Download ttrt whls
+      uses: actions/download-artifact@v4
+      with:
+        name: ttrt-whl-tracy
+
+    - name: Install ttrt whls
+      shell: bash
+      run: |
+        source env/activate
+        pip show ttrt && pip uninstall -y ttrt
+        pip install ttrt*.whl --upgrade
+
+    - name: Download Build Artifacts
+      uses: tenstorrent/tt-forge/.github/actions/download-artifact@main
+      with:
+        name: build-artifacts-tracy
+        path: build
+        github_token: ${{ secrets.GITHUB_TOKEN }}
+
+    - name: Generate system descriptor
+      shell: bash
+      run: |
+        source env/activate
+        ttrt query --save-artifacts
+
+    - name: Download and install ttmlir and ttnn-jit wheels
+      shell: bash
+      env:
+        GH_TOKEN: ${{ secrets.GH_TOKEN || github.token }}
+      run: |
+        source env/activate
+        rm -f ttmlir*.whl ttnn_jit*.whl
+
+        gh run download ${{ github.run_id }} --repo ${{ github.repository }} --name ttmlir-whl-tracy
+        gh run download ${{ github.run_id }} --repo ${{ github.repository }} --name ttnn-jit-whl-tracy
+
+        pip show ttmlir &> /dev/null && pip uninstall -y ttmlir
+        pip show ttnn-jit &> /dev/null && pip uninstall -y ttnn-jit
+        pip install ttnn_jit*.whl --find-links . --upgrade
+
+    - name: Set up tt-triage
+      shell: bash
+      run: |
+        TT_METAL_VERSION=$(grep 'set(TT_METAL_VERSION' third_party/CMakeLists.txt | sed 's/.*"\(.*\)".*/\1/')
+
+        mkdir -p tt-triage
+        curl -L "https://github.com/tenstorrent/tt-metal/archive/${TT_METAL_VERSION}.tar.gz" \
+          | tar -xz -C tt-triage --strip-components=1 \
+              tt-metal-${TT_METAL_VERSION}/scripts/ttexalens_ref.txt \
+              tt-metal-${TT_METAL_VERSION}/tools/tt-triage.py \
+              tt-metal-${TT_METAL_VERSION}/tools/triage \
+              tt-metal-${TT_METAL_VERSION}/tt_metal
+
+        echo "TT_METAL_OPERATION_TIMEOUT_SECONDS=${{ vars.TT_METAL_OPERATION_TIMEOUT_SECONDS || 300 }}" >> $GITHUB_ENV
+        echo "TT_METAL_DISPATCH_TIMEOUT_COMMAND_TO_EXECUTE=python $(pwd)/tt-triage/tools/tt-triage.py 1>&2" >> $GITHUB_ENV
+
+    - name: Run JIT perf collection
+      shell: bash
+      run: |
+        source env/activate
+        export PYTHONPATH="${{ steps.strings.outputs.install-output-dir }}/tt-metal/ttnn:${{ steps.strings.outputs.install-output-dir }}/tt-metal"
+        export LD_LIBRARY_PATH="${{ steps.strings.outputs.install-output-dir }}/lib:${TTMLIR_TOOLCHAIN_DIR}/lib:${LD_LIBRARY_PATH}"
+        export SYSTEM_DESC_PATH="${GITHUB_WORKSPACE}/ttrt-artifacts/system_desc.ttsys"
+        export TT_METAL_RUNTIME_ROOT="${{ steps.strings.outputs.install-output-dir }}/tt-metal"
+        export TT_METAL_HOME="${{ steps.strings.outputs.work-dir }}/third_party/tt-metal/src/tt-metal"
+        ln -sf ${{ steps.strings.outputs.install-output-dir }} ${{ steps.strings.outputs.build-output-dir }}
+
+        test/ttnn-jit/perf_ci/run_perf_collect.sh ${{ steps.strings.outputs.perf-output-dir }}
+
+    - name: Upload JIT perf summary
+      uses: actions/upload-artifact@v4
+      if: success() || failure()
+      with:
+        name: jit-perf-summary-${{ steps.fetch-job-id.outputs.job_id }}
+        path: ${{ steps.strings.outputs.perf-output-dir }}/jit_perf_summary.json
+        if-no-files-found: warn
+
+    - name: Upload JIT perf reports
+      uses: ./.github/actions/collect-and-upload-perf-reports
+      if: success() || failure()
+      with:
+        reports_dir: ${{ steps.strings.outputs.perf-output-dir }}
+        perf_report_path: ${{ steps.strings.outputs.work-dir }}/perf_reports
+        artifact_name: jit-perf-reports-${{ steps.fetch-job-id.outputs.job_id }}
diff --git a/.github/workflows/schedule-nightly.yml b/.github/workflows/schedule-nightly.yml
index e367d7da44a..37986f428ed 100644
--- a/.github/workflows/schedule-nightly.yml
+++ b/.github/workflows/schedule-nightly.yml
@@ -32,16 +32,19 @@ jobs:
       runner: ${{ needs.prepare-run.outputs.runner }}
       sh_builder: ${{ fromJson(needs.prepare-run.outputs.sh_builder) }}
       component_matrix: ${{ needs.prepare-run.outputs.build_matrix }}
+  # TODO: Re-enable wheels-build after JIT perf CI debugging is complete.
   wheels-build:
+    if: false
     needs: [ prepare-run, build-image, release-build ] # release-build required so ttnn-jit wheel is built
     uses: ./.github/workflows/call-build-wheels.yml
     secrets: inherit
     with:
       docker-tag: ${{ needs.build-image.outputs.docker-tag }}
       docker_image: ${{ needs.build-image.outputs.docker-image }}
+  # TODO: Re-enable test after JIT perf CI debugging is complete.
   test:
+    if: false
     needs: [ prepare-run, build-image, release-build ]
-    if: needs.prepare-run.outputs.skip_build != 'true'
     uses: ./.github/workflows/call-test.yml
     secrets: inherit
     with:
@@ -49,8 +52,16 @@ jobs:
       test_matrix: ${{ needs.prepare-run.outputs.test_matrix }}
       timeout: ${{ fromJson(needs.prepare-run.outputs.test_timeout) }}
 
+  jit-perf-test:
+    needs: [ build-image, release-build ]
+    uses: ./.github/workflows/call-jit-perf-test.yml
+    secrets: inherit
+    with:
+      docker_image: ${{ needs.build-image.outputs.docker-image }}
+
+  # TODO: Re-enable fail-notify after JIT perf CI debugging is complete.
   fail-notify:
-    if: always()
+    if: false
     needs:
       - prepare-run
       - build-image
@@ -71,8 +82,9 @@ jobs:
         with:
           jobs: ${{ toJSON(needs) }}
 
+  # TODO: Re-enable fail-send-msg after JIT perf CI debugging is complete.
   fail-send-msg:
-    if: always()
+    if: false
     needs:
       - fail-notify
       - test
diff --git a/.github/workflows/workflow-run-collect-data.yml b/.github/workflows/workflow-run-collect-data.yml
index 2567ad1dd78..add2cac7cd6 100644
--- a/.github/workflows/workflow-run-collect-data.yml
+++ b/.github/workflows/workflow-run-collect-data.yml
@@ -26,6 +26,8 @@ jobs:
           run_attempt: ${{ github.event.workflow_run.run_attempt }}
           sftp_host: ${{ secrets.SFTP_CICD_WRITER_HOSTNAME }}
           sftp_user: ${{ secrets.SFTP_CICD_WRITER_USERNAME }}
+          sftp_perf_host: ${{ secrets.SFTP_PERF_WRITER_HOSTNAME }}
+          sftp_perf_user: ${{ secrets.SFTP_PERF_WRITER_USERNAME }}
           sftp_optest_host: ${{ secrets.SFTP_OP_TEST_WRITER_HOSTNAME }}
           sftp_optest_user: ${{ secrets.SFTP_OP_TEST_WRITER_USERNAME }}
           ssh-private-key: ${{ secrets.SFTP_CICD_WRITER_KEY }}
diff --git a/test/ttnn-jit/perf_ci/perf_tests.py b/test/ttnn-jit/perf_ci/perf_tests.py
new file mode 100644
index 00000000000..b157cc31035
--- /dev/null
+++ b/test/ttnn-jit/perf_ci/perf_tests.py
@@ -0,0 +1,101 @@
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import ttnn
+import ttnn_jit
+import torch
+
+import pytest
+
+from op_definitions import abs, exp, add, mul, matmul
+
+# Memory configs that pass for all ops and both JIT and non-JIT.
+# DRAM interleaved works for matmul (requires interleaved) and all elementwise ops.
+# L1 interleaved is not used: JIT runtime fails with RuntimeError on L1 interleaved
+# inputs (submit path), so we only test DRAM interleaved for paired JIT vs TTNN comparison.
+MEMORY_CONFIGS = [
+    (ttnn.DRAM_MEMORY_CONFIG, "dram_interleaved"),
+]
+
+
+def is_unary(op):
+    return op == abs or op == exp
+
+
+@pytest.mark.parametrize(
+    "h, w",
+    [
+        (256, 256),
+    ],
+)
+@pytest.mark.parametrize(
+    "op",
+    [
+        abs,
+        exp,
+        add,
+        mul,
+        matmul,
+    ],
+    ids=[
+        "abs",
+        "exp",
+        "add",
+        "mul",
+        "matmul",
+    ],
+)
+@pytest.mark.parametrize(
+    "dtype, ttnn_dtype",
+    [
+        (torch.bfloat16, ttnn.DataType.BFLOAT16),
+        (torch.bfloat16, ttnn.DataType.BFLOAT8_B),
+    ],
+    ids=["bf16", "bfp8"],
+)
+@pytest.mark.parametrize(
+    "memory_config, memory_config_id",
+    MEMORY_CONFIGS,
+    ids=[id for _, id in MEMORY_CONFIGS],
+)
+@pytest.mark.parametrize(
+    "jit_enabled",
+    [
+        True,
+        False,
+    ],
+)
+def test_op_compare(
+    h, w, op, dtype, ttnn_dtype, memory_config, memory_config_id, jit_enabled
+):
+    device = ttnn.open_device(device_id=0)
+    torch_tensor_a = torch.rand((h, w), dtype=dtype) * 100
+    torch_tensor_b = torch.rand((h, w), dtype=dtype) * 100
+
+    input_a = ttnn.from_torch(
+        torch_tensor_a,
+        dtype=ttnn_dtype,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,
+        memory_config=memory_config,
+    )
+    input_b = ttnn.from_torch(
+        torch_tensor_b,
+        dtype=ttnn_dtype,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,
+        memory_config=memory_config,
+    )
+
+    function_to_test = (
+        ttnn_jit.jit(debug=True, enable_cache=True)(op) if jit_enabled else op
+    )
+    output_tensor = (
+        function_to_test(input_a)
+        if is_unary(op)
+        else function_to_test(input_a, input_b)
+    )
+
+    print(f"output_tensor\n: {output_tensor}")
+    ttnn.close_device(device)
diff --git a/test/ttnn-jit/perf_ci/run_perf_collect.sh b/test/ttnn-jit/perf_ci/run_perf_collect.sh
new file mode 100755
index 00000000000..c7fb07a5283
--- /dev/null
+++ b/test/ttnn-jit/perf_ci/run_perf_collect.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Run each parametrized test in perf_tests.py under the device profiler (tracy)
+# and dump results into a directory per test. Use TT_METAL_PROFILER_DIR so each
+# run writes to a known subdir. At the end, runs summarize_perf_results.py to
+# produce OUT_DIR/jit_perf_summary.json.
+#
+# Usage:
+#   ./test/ttnn-jit/perf_ci/run_perf_collect.sh [OUT_DIR]
+#
+# Example:
+#   ./test/ttnn-jit/perf_ci/run_perf_collect.sh
+#   ./test/ttnn-jit/perf_ci/run_perf_collect.sh generated/jit_perf_reports/my_run
+
+set -e
+
+# Script lives in test/ttnn-jit/perf_ci/; go up three levels to repo root
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
+cd "$REPO_ROOT"
+
+# Optional: activate venv if present
+if [ -f env/activate ]; then
+  # shellcheck source=/dev/null
+  source env/activate
+fi
+
+OUT_DIR="${1:-generated/jit_perf_reports/run_$(date +%Y%m%d_%H%M%S)}"
+mkdir -p "$OUT_DIR"
+OUT_DIR="$(cd "$OUT_DIR" && pwd)"
+
+# Collect test ids from perf_tests.py (whatever is parametrized there)
+collect_out=$(mktemp)
+if ! pytest test/ttnn-jit/perf_ci/perf_tests.py --collect-only -q >"$collect_out" 2>&1; then
+  echo "Error: pytest collect failed:" >&2
+  cat "$collect_out" >&2
+  rm -f "$collect_out"
+  exit 1
+fi
+TESTS=($(sed -n 's/.*test_op_compare\[\(.*\)\]/\1/p' <"$collect_out"))
+if [ ${#TESTS[@]} -eq 0 ]; then
+  echo "Error: no test_op_compare[*] tests found in test/ttnn-jit/perf_ci/perf_tests.py. Pytest collect output:" >&2
+  cat "$collect_out" >&2
+  rm -f "$collect_out"
+  exit 1
+fi
+rm -f "$collect_out"
+echo "Collected ${#TESTS[@]} tests from perf_tests.py"
+
+export TT_METAL_DEVICE_PROFILER=1
+
+for tid in "${TESTS[@]}"; do
+  echo "=============================================="
+  echo "Running test_op_compare[$tid] ..."
+  echo "=============================================="
+  export TT_METAL_PROFILER_DIR="$OUT_DIR/$tid"
+  mkdir -p "$TT_METAL_PROFILER_DIR"
+  if ! python -m tracy -m -r -p "pytest test/ttnn-jit/perf_ci/perf_tests.py::test_op_compare[$tid]"; then
+    echo "Warning: test_op_compare[$tid] exited with non-zero status (results may still be present)."
+  fi
+done
+
+echo ""
+echo "Results written under: $OUT_DIR"
+echo "Summarizing..."
+if python test/ttnn-jit/perf_ci/summarize_perf_results.py "$OUT_DIR" -o "$OUT_DIR/jit_perf_summary.json"; then
+  echo "Summary written to $OUT_DIR/jit_perf_summary.json"
+else
+  echo "Warning: summarizer exited with an error (run dir may be partial)." >&2
+fi
diff --git a/test/ttnn-jit/perf_ci/summarize_perf_results.py b/test/ttnn-jit/perf_ci/summarize_perf_results.py
new file mode 100755
index 00000000000..30a74d389a5
--- /dev/null
+++ b/test/ttnn-jit/perf_ci/summarize_perf_results.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Read all ops_perf_results_*.csv under a run directory (from run_perf_collect.sh),
+# group JIT vs non-JIT by case (op, shape, dtype, memory_config_id) and write one
+# entry per case with jit_duration_ns, ttnn_duration_ns, and perf_pct_ttnn
+# (100 = same, <100 = JIT slower, >100 = JIT faster). Suitable for Superset.
+# math_fidelity is not part of the key so JIT (e.g. HiFi4) and TTNN (e.g. HiFi2) pair.
+#
+# Usage:
+#   python test/ttnn-jit/perf_ci/summarize_perf_results.py RUN_DIR [-o OUTPUT.json]
+#
+# Example:
+#   python test/ttnn-jit/perf_ci/summarize_perf_results.py generated/jit_perf_reports/run_20250309_123456 -o jit_perf_summary.json
+
+import argparse
+import csv
+import json
+import sys
+from pathlib import Path
+from typing import Any, Optional
+
+DEVICE_KERNEL_DURATION_COL = "DEVICE KERNEL DURATION [ns]"
+MATH_FIDELITY_COL = "MATH FIDELITY"
+OUTPUT_0_DATATYPE_COL = "OUTPUT_0_DATATYPE"
+INPUT_0_DATATYPE_COL = "INPUT_0_DATATYPE"
+
+
+def find_result_csvs(run_dir: Path):
+    """Yield (test_id, csv_path) per run_dir/<test_id>/reports/*/ops_perf_results_*.csv."""
+    run_dir = run_dir.resolve()
+    if not run_dir.is_dir():
+        return
+    for test_dir in run_dir.iterdir():
+        if not test_dir.is_dir():
+            continue
+        test_id = test_dir.name
+        reports_dir = test_dir / "reports"
+        if not reports_dir.is_dir():
+            continue
+        for ts_dir in reports_dir.iterdir():
+            if not ts_dir.is_dir():
+                continue
+            for csv_path in ts_dir.glob("ops_perf_results_*.csv"):
+                yield test_id, csv_path
+
+
+# Known memory_config ids, matched as the 2nd test_id component (e.g. True-dram_interleaved-...).
+MEMORY_CONFIG_IDS = ("dram_interleaved", "l1_interleaved")
+
+
+def parse_test_id(test_id: str) -> Optional[dict]:
+    """
+    Parse test_id into jit, op, h, w, memory_config_id (None if absent); None if malformed.
+    Supports: 'True-abs-256-256' (4 parts), 'True-bf16-abs-256-256' (5, dtype skipped),
+    'True-dram_interleaved-bf16-abs-256-256' (6); anything else joins parts[1:-2] as op.
+    """
+    parts = test_id.split("-")
+    if len(parts) < 4:
+        return None
+    jit = parts[0].lower() == "true"
+    memory_config_id: Optional[str] = None
+    if len(parts) == 6 and parts[1] in MEMORY_CONFIG_IDS:
+        memory_config_id = parts[1]
+        op = parts[3]
+        try:
+            h, w = int(parts[4]), int(parts[5])
+        except (ValueError, IndexError):
+            return None
+    elif len(parts) == 5:
+        op = parts[2]
+        try:
+            h, w = int(parts[3]), int(parts[4])
+        except (ValueError, IndexError):
+            return None
+    elif len(parts) == 4:
+        op = parts[1]
+        try:
+            h, w = int(parts[2]), int(parts[3])
+        except (ValueError, IndexError):
+            return None
+    else:
+        try:
+            h, w = int(parts[-2]), int(parts[-1])
+        except (ValueError, IndexError):
+            return None
+        op = "-".join(parts[1:-2])
+    return {"jit": jit, "op": op, "h": h, "w": w, "memory_config_id": memory_config_id}
+
+
+def read_csv_duration_and_meta(csv_path: Path) -> Optional[tuple[int, str, str]]:
+    """
+    Sum DEVICE KERNEL DURATION [ns] over all rows and return (duration_ns, dtype,
+    math_fidelity); dtype/math_fidelity come from the first row with a non-empty
+    value and stay "" if the column is missing. Returns None if no duration parsed.
+    """
+    total = 0
+    found_duration = False
+    dtype = ""
+    math_fidelity = ""
+    with open(csv_path, newline="", encoding="utf-8") as f:
+        reader = csv.DictReader(f)
+        fieldnames = reader.fieldnames or []
+        for row in reader:
+            if DEVICE_KERNEL_DURATION_COL in fieldnames:
+                val = row.get(DEVICE_KERNEL_DURATION_COL, "").strip()
+                if val and val != "-":
+                    try:
+                        total += int(float(val))
+                        found_duration = True
+                    except (ValueError, TypeError):
+                        pass
+            if not dtype and (
+                OUTPUT_0_DATATYPE_COL in fieldnames
+                or INPUT_0_DATATYPE_COL in fieldnames
+            ):
+                dtype = (
+                    row.get(OUTPUT_0_DATATYPE_COL)
+                    or row.get(INPUT_0_DATATYPE_COL)
+                    or ""
+                ).strip()
+            if not math_fidelity and MATH_FIDELITY_COL in fieldnames:
+                math_fidelity = (row.get(MATH_FIDELITY_COL) or "").strip()
+    if not found_duration:
+        return None
+    return (total, dtype, math_fidelity)
+
+
+def make_case_key(
+    op: str, h: int, w: int, dtype: str, memory_config_id: Optional[str]
+) -> tuple:
+    """Immutable key to group JIT and non-JIT runs of the same case. Excludes math_fidelity so JIT and TTNN runs (which may report different fidelities) pair into one entry."""
+    return (op, h, w, dtype, memory_config_id or "")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Summarize JIT perf run CSVs into one entry per (op, shape, dtype, memory_config) with JIT vs TTNN comparison."
+    )
+    parser.add_argument(
+        "run_dir",
+        type=Path,
+        help="Directory produced by run_perf_collect.sh (contains test_id/reports/...)",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=Path,
+        default=None,
+        help="Output JSON path (default: RUN_DIR/jit_perf_summary.json)",
+    )
+    parser.add_argument(
+        "-q",
+        "--quiet",
+        action="store_true",
+        help="Do not print progress",
+    )
+    args = parser.parse_args()
+
+    run_dir = args.run_dir.resolve()
+    if not run_dir.is_dir():
+        print(f"Error: not a directory: {run_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    out_path = args.output or (run_dir / "jit_perf_summary.json")
+
+    # Raw rows: one per CSV (test_id, jit, op, h, w, memory_config_id, duration_ns, dtype, math_fidelity, csv_path)
+    raw: list[dict[str, Any]] = []
+    for test_id, csv_path in find_result_csvs(run_dir):
+        parsed = parse_test_id(test_id)
+        if not parsed:
+            if not args.quiet:
+                print(f"Skip (bad test_id): {test_id}", file=sys.stderr)
+            continue
+        result = read_csv_duration_and_meta(csv_path)
+        if result is None:
+            if not args.quiet:
+                print(f"Skip (no duration): {csv_path}", file=sys.stderr)
+            continue
+        duration_ns, dtype, math_fidelity = result
+        raw.append(
+            {
+                "test_id": test_id,
+                "jit": parsed["jit"],
+                "op": parsed["op"],
+                "h": parsed["h"],
+                "w": parsed["w"],
+                "memory_config_id": parsed.get("memory_config_id"),
+                "duration_ns": duration_ns,
+                "dtype": dtype,
+                "math_fidelity": math_fidelity,
+                "csv_path": str(csv_path),
+            }
+        )
+        if not args.quiet:
+            print(
+                f"  {test_id}: {duration_ns} ns (dtype={dtype!r}, math_fidelity={math_fidelity!r})"
+            )
+
+    # Group by case key (op, h, w, dtype, memory_config_id) so JIT and TTNN pair even when math_fidelity differs (e.g. matmul HiFi4 vs HiFi2)
+    groups: dict[tuple, dict[str, Any]] = {}
+    for r in raw:
+        key = make_case_key(
+            r["op"], r["h"], r["w"], r["dtype"], r.get("memory_config_id")
+        )
+        if key not in groups:
+            groups[key] = {
+                "op": r["op"],
+                "h": r["h"],
+                "w": r["w"],
+                "shape": f"{r['h']}x{r['w']}",
+                "dtype": r["dtype"],
+                "math_fidelity": r["math_fidelity"],
+                "math_fidelity_ttnn": None,
+                "memory_config_id": r.get("memory_config_id") or "",
+                "jit_duration_ns": None,
+                "ttnn_duration_ns": None,
+                "perf_pct_ttnn": None,
+                "jit_csv_path": None,
+                "ttnn_csv_path": None,
+            }
+        g = groups[key]
+        if r["jit"]:
+            g["jit_duration_ns"] = r["duration_ns"]
+            g["jit_csv_path"] = r["csv_path"]
+            g["math_fidelity"] = r["math_fidelity"]
+        else:
+            g["ttnn_duration_ns"] = r["duration_ns"]
+            g["ttnn_csv_path"] = r["csv_path"]
+            g["math_fidelity_ttnn"] = r["math_fidelity"]
+
+    # Compute perf_pct_ttnn: (ttnn_duration / jit_duration) * 100 → 100 = same, <100 = JIT slower, >100 = JIT faster
+    out_rows: list[dict[str, Any]] = []
+    for key in sorted(groups.keys()):
+        g = groups[key]
+        jit_ns = g["jit_duration_ns"]
+        ttnn_ns = g["ttnn_duration_ns"]
+        if jit_ns is not None and ttnn_ns is not None and jit_ns > 0:
+            g["perf_pct_ttnn"] = round((ttnn_ns / jit_ns) * 100.0, 2)
+        out_rows.append(g)
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(out_rows, f, indent=2)
+
+    if not args.quiet:
+        print(f"Wrote {len(out_rows)} case(s) to {out_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 1da63077d00a0eeaecbea8a6c211f796a4a98f7c Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Tue, 10 Mar 2026 16:06:56 +0000
Subject: [PATCH 02/12] run_perf_collect.sh: skip venv activation if already active

---
 test/ttnn-jit/perf_ci/run_perf_collect.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/ttnn-jit/perf_ci/run_perf_collect.sh b/test/ttnn-jit/perf_ci/run_perf_collect.sh
index c7fb07a5283..647f1397f24 100755
--- a/test/ttnn-jit/perf_ci/run_perf_collect.sh
+++ b/test/ttnn-jit/perf_ci/run_perf_collect.sh
@@ -21,8 +21,8 @@ set -e
 REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
 cd "$REPO_ROOT"
 
-# Optional: activate venv if present
-if [ -f env/activate ]; then
+# Activate venv if not already active
+if [ -z "$VIRTUAL_ENV" ] && [ -f env/activate ]; then
   # shellcheck source=/dev/null
   source env/activate
 fi

From 81f47dfeb2d7e8c5adea30aa7f915d6a1b92f576 Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Tue, 10 Mar 2026 16:36:13 +0000
Subject: [PATCH 03/12] fix ttnn not found

---
 .github/workflows/call-jit-perf-test.yml  |  3 ++-
 test/ttnn-jit/perf_ci/run_perf_collect.sh | 10 ++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/call-jit-perf-test.yml b/.github/workflows/call-jit-perf-test.yml
index dc82d50abd4..ed35711565a 100644
--- a/.github/workflows/call-jit-perf-test.yml
+++ b/.github/workflows/call-jit-perf-test.yml
@@ -130,9 +130,10 @@ jobs:
 
     - name: Run JIT perf collection
       shell: bash
+      env:
+        INSTALL_DIR: ${{ steps.strings.outputs.install-output-dir }}
       run: |
         source env/activate
-        export PYTHONPATH="${{ steps.strings.outputs.install-output-dir }}/tt-metal/ttnn:${{ steps.strings.outputs.install-output-dir }}/tt-metal"
         export LD_LIBRARY_PATH="${{ steps.strings.outputs.install-output-dir }}/lib:${TTMLIR_TOOLCHAIN_DIR}/lib:${LD_LIBRARY_PATH}"
         export SYSTEM_DESC_PATH="${GITHUB_WORKSPACE}/ttrt-artifacts/system_desc.ttsys"
         export TT_METAL_RUNTIME_ROOT="${{ steps.strings.outputs.install-output-dir }}/tt-metal"
diff --git a/test/ttnn-jit/perf_ci/run_perf_collect.sh b/test/ttnn-jit/perf_ci/run_perf_collect.sh
index 647f1397f24..5f74aa9c07e 100755
--- a/test/ttnn-jit/perf_ci/run_perf_collect.sh
+++ b/test/ttnn-jit/perf_ci/run_perf_collect.sh
@@ -21,12 +21,18 @@ set -e
 REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
 cd "$REPO_ROOT"
 
-# Activate venv if not already active
-if [ -z "$VIRTUAL_ENV" ] && [ -f env/activate ]; then
+# Activate venv if present
+if [ -f env/activate ]; then
   # shellcheck source=/dev/null
   source env/activate
 fi
 
+# In CI, INSTALL_DIR points to extracted install artifacts containing tt-metal.
+# Set PYTHONPATH so pytest can find the ttnn module (same as ttnn_jit.sh).
+if [ -n "$INSTALL_DIR" ]; then
+  export PYTHONPATH="$INSTALL_DIR/tt-metal/ttnn:$INSTALL_DIR/tt-metal"
+fi
+
 OUT_DIR="${1:-generated/jit_perf_reports/run_$(date +%Y%m%d_%H%M%S)}"
 mkdir -p "$OUT_DIR"
 OUT_DIR="$(cd "$OUT_DIR" && pwd)"

From 82d7c9806297cfb2c0bde3384409e0dc19911e0c Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Tue, 10 Mar 2026 16:39:06 +0000
Subject: [PATCH 04/12] Revert "fix ttnn not found"

This reverts commit 81f47dfeb2d7e8c5adea30aa7f915d6a1b92f576.
---
 .github/workflows/call-jit-perf-test.yml  |  3 +--
 test/ttnn-jit/perf_ci/run_perf_collect.sh | 10 ++--------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/call-jit-perf-test.yml b/.github/workflows/call-jit-perf-test.yml
index ed35711565a..dc82d50abd4 100644
--- a/.github/workflows/call-jit-perf-test.yml
+++ b/.github/workflows/call-jit-perf-test.yml
@@ -130,10 +130,9 @@ jobs:
 
     - name: Run JIT perf collection
       shell: bash
-      env:
-        INSTALL_DIR: ${{ steps.strings.outputs.install-output-dir }}
       run: |
         source env/activate
+        export PYTHONPATH="${{ steps.strings.outputs.install-output-dir }}/tt-metal/ttnn:${{ steps.strings.outputs.install-output-dir }}/tt-metal"
         export LD_LIBRARY_PATH="${{ steps.strings.outputs.install-output-dir }}/lib:${TTMLIR_TOOLCHAIN_DIR}/lib:${LD_LIBRARY_PATH}"
         export SYSTEM_DESC_PATH="${GITHUB_WORKSPACE}/ttrt-artifacts/system_desc.ttsys"
         export TT_METAL_RUNTIME_ROOT="${{ steps.strings.outputs.install-output-dir }}/tt-metal"
diff --git a/test/ttnn-jit/perf_ci/run_perf_collect.sh b/test/ttnn-jit/perf_ci/run_perf_collect.sh
index 5f74aa9c07e..647f1397f24 100755
--- a/test/ttnn-jit/perf_ci/run_perf_collect.sh
+++ b/test/ttnn-jit/perf_ci/run_perf_collect.sh
@@ -21,18 +21,12 @@ set -e
 REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
 cd "$REPO_ROOT"
 
-# Activate venv if present
-if [ -f env/activate ]; then
+# Activate venv if not already active
+if [ -z "$VIRTUAL_ENV" ] && [ -f env/activate ]; then
   # shellcheck source=/dev/null
   source env/activate
 fi
 
-# In CI, INSTALL_DIR points to extracted install artifacts containing tt-metal.
-# Set PYTHONPATH so pytest can find the ttnn module (same as ttnn_jit.sh).
-if [ -n "$INSTALL_DIR" ]; then
-  export PYTHONPATH="$INSTALL_DIR/tt-metal/ttnn:$INSTALL_DIR/tt-metal"
-fi
-
 OUT_DIR="${1:-generated/jit_perf_reports/run_$(date +%Y%m%d_%H%M%S)}"
 mkdir -p "$OUT_DIR"
 OUT_DIR="$(cd "$OUT_DIR" && pwd)"

From f93edfed5fdcf134a8aeee9e749ca4c28dd9b3e6 Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Tue, 10 Mar 2026 16:46:23 +0000
Subject: [PATCH 05/12] Copy tracy capture/csvexport tools into tt-metal profiler bin

---
 .github/workflows/call-jit-perf-test.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/call-jit-perf-test.yml b/.github/workflows/call-jit-perf-test.yml
index dc82d50abd4..836d0a66e72 100644
--- a/.github/workflows/call-jit-perf-test.yml
+++ b/.github/workflows/call-jit-perf-test.yml
@@ -128,6 +128,14 @@ jobs:
         echo "TT_METAL_OPERATION_TIMEOUT_SECONDS=${{ vars.TT_METAL_OPERATION_TIMEOUT_SECONDS || 300 }}" >> $GITHUB_ENV
         echo "TT_METAL_DISPATCH_TIMEOUT_COMMAND_TO_EXECUTE=python $(pwd)/tt-triage/tools/tt-triage.py 1>&2" >> $GITHUB_ENV
 
+    - name: Set up tracy profiler tools
+      shell: bash
+      run: |
+        TRACY_BIN_DIR="${{ steps.strings.outputs.work-dir }}/third_party/tt-metal/src/tt-metal/build/tools/profiler/bin"
+        mkdir -p "$TRACY_BIN_DIR"
+        cp ${{ steps.strings.outputs.build-output-dir }}/python_packages/ttrt/runtime/capture-release "$TRACY_BIN_DIR/"
+        cp ${{ steps.strings.outputs.build-output-dir }}/python_packages/ttrt/runtime/csvexport-release "$TRACY_BIN_DIR/"
+
     - name: Run JIT perf collection
       shell: bash
       run: |

From 802d5a9278cbac67a2a6284bf15712e45db654fd Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Tue, 10 Mar 2026 18:48:25 +0000
Subject: [PATCH 06/12] Upload perf reports via upload-artifact; test 2048x2048

---
 .github/workflows/call-jit-perf-test.yml | 8 ++++----
 test/ttnn-jit/perf_ci/perf_tests.py      | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/call-jit-perf-test.yml b/.github/workflows/call-jit-perf-test.yml
index 836d0a66e72..09d4fcc43b2 100644
--- a/.github/workflows/call-jit-perf-test.yml
+++ b/.github/workflows/call-jit-perf-test.yml
@@ -158,9 +158,9 @@ jobs:
         if-no-files-found: warn
 
     - name: Upload JIT perf reports
-      uses: ./.github/actions/collect-and-upload-perf-reports
+      uses: actions/upload-artifact@v4
       if: success() || failure()
       with:
-        reports_dir: ${{ steps.strings.outputs.perf-output-dir }}
-        perf_report_path: ${{ steps.strings.outputs.work-dir }}/perf_reports
-        artifact_name: jit-perf-reports-${{ steps.fetch-job-id.outputs.job_id }}
+        name: jit-perf-reports-${{ steps.fetch-job-id.outputs.job_id }}
+        path: ${{ steps.strings.outputs.perf-output-dir }}
+        if-no-files-found: warn
diff --git a/test/ttnn-jit/perf_ci/perf_tests.py b/test/ttnn-jit/perf_ci/perf_tests.py
index b157cc31035..fb2e2f7ffe4 100644
--- a/test/ttnn-jit/perf_ci/perf_tests.py
+++ b/test/ttnn-jit/perf_ci/perf_tests.py
@@ -26,7 +26,7 @@ def is_unary(op):
 @pytest.mark.parametrize(
     "h, w",
     [
-        (256, 256),
+        (2048, 2048),
     ],
 )
 @pytest.mark.parametrize(

From aa8a7653202f8b9c4828960ce83def10466b64ca Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Wed, 11 Mar 2026 15:24:17 +0000
Subject: [PATCH 07/12] fix json file naming

---
 .github/workflows/call-jit-perf-test.yml  | 4 +++-
 test/ttnn-jit/perf_ci/run_perf_collect.sh | 8 +++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/call-jit-perf-test.yml b/.github/workflows/call-jit-perf-test.yml
index 09d4fcc43b2..cb34ec6c185 100644
--- a/.github/workflows/call-jit-perf-test.yml
+++ b/.github/workflows/call-jit-perf-test.yml
@@ -138,6 +138,8 @@ jobs:
 
     - name: Run JIT perf collection
       shell: bash
+      env:
+        JOB_ID: ${{ steps.fetch-job-id.outputs.job_id }}
       run: |
         source env/activate
         export PYTHONPATH="${{ steps.strings.outputs.install-output-dir }}/tt-metal/ttnn:${{ steps.strings.outputs.install-output-dir }}/tt-metal"
@@ -154,7 +156,7 @@ jobs:
       if: success() || failure()
       with:
         name: jit-perf-summary-${{ steps.fetch-job-id.outputs.job_id }}
-        path: ${{ steps.strings.outputs.perf-output-dir }}/jit_perf_summary.json
+        path: ${{ steps.strings.outputs.perf-output-dir }}/perf_jit_summary_${{ steps.fetch-job-id.outputs.job_id }}.json
         if-no-files-found: warn
 
     - name: Upload JIT perf reports
diff --git a/test/ttnn-jit/perf_ci/run_perf_collect.sh b/test/ttnn-jit/perf_ci/run_perf_collect.sh
index 647f1397f24..4a7a0118e45 100755
--- a/test/ttnn-jit/perf_ci/run_perf_collect.sh
+++ b/test/ttnn-jit/perf_ci/run_perf_collect.sh
@@ -6,7 +6,8 @@
 # Run each parametrized test in perf_tests.py under the device profiler (tracy)
 # and dump results into a directory per test. Use TT_METAL_PROFILER_DIR so each
 # run writes to a known subdir. At the end, runs summarize_perf_results.py to
-# produce OUT_DIR/jit_perf_summary.json.
+# produce OUT_DIR/perf_jit_summary[_JOB_ID].json.
+# Set JOB_ID env var to include the job ID in the filename (required for CI).
 #
 # Usage:
 #   ./test/ttnn-jit/perf_ci/run_perf_collect.sh [OUT_DIR]
@@ -64,9 +65,10 @@ done
 
 echo ""
 echo "Results written under: $OUT_DIR"
+SUMMARY_FILE="$OUT_DIR/perf_jit_summary${JOB_ID:+_$JOB_ID}.json"
 echo "Summarizing..."
-if python test/ttnn-jit/perf_ci/summarize_perf_results.py "$OUT_DIR" -o "$OUT_DIR/jit_perf_summary.json"; then
-  echo "Summary written to $OUT_DIR/jit_perf_summary.json"
+if python test/ttnn-jit/perf_ci/summarize_perf_results.py "$OUT_DIR" -o "$SUMMARY_FILE"; then
+  echo "Summary written to $SUMMARY_FILE"
 else
   echo "Warning: summarizer exited with an error (run dir may be partial)." >&2
 fi

From 6d640805c087f206870a001c4368a82442d665ab Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Thu, 12 Mar 2026 14:07:24 +0000
Subject: [PATCH 08/12] emit perf summary as a report object with a measurements list

---
 .../perf_ci/summarize_perf_results.py         | 37 ++++++++++++++++---
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/test/ttnn-jit/perf_ci/summarize_perf_results.py b/test/ttnn-jit/perf_ci/summarize_perf_results.py
index 30a74d389a5..a478412462d 100755
--- a/test/ttnn-jit/perf_ci/summarize_perf_results.py
+++ b/test/ttnn-jit/perf_ci/summarize_perf_results.py
@@ -231,22 +231,49 @@ def main():
             g["ttnn_csv_path"] = r["csv_path"]
             g["math_fidelity_ttnn"] = r["math_fidelity"]
 
-    # Compute perf_pct_ttnn: (ttnn_duration / jit_duration) * 100 → 100 = same, <100 = JIT slower, >100 = JIT faster
-    out_rows: list[dict[str, Any]] = []
+    # Compute perf_pct_ttnn: (ttnn_duration / jit_duration) * 100
+    # 100 = same, <100 = JIT slower, >100 = JIT faster
+    measurements: list[dict[str, Any]] = []
     for key in sorted(groups.keys()):
         g = groups[key]
         jit_ns = g["jit_duration_ns"]
         ttnn_ns = g["ttnn_duration_ns"]
         if jit_ns is not None and ttnn_ns is not None and jit_ns > 0:
             g["perf_pct_ttnn"] = round((ttnn_ns / jit_ns) * 100.0, 2)
-        out_rows.append(g)
+
+        prefix = f"{g['op']}_{g['dtype']}_{g['memory_config_id']}"
+        if jit_ns is not None:
+            measurements.append(
+                {"measurement_name": f"{prefix}_jit_duration_ns", "value": jit_ns}
+            )
+        if ttnn_ns is not None:
+            measurements.append(
+                {"measurement_name": f"{prefix}_ttnn_duration_ns", "value": ttnn_ns}
+            )
+        if g["perf_pct_ttnn"] is not None:
+            measurements.append(
+                {
+                    "measurement_name": f"{prefix}_perf_pct_ttnn",
+                    "value": g["perf_pct_ttnn"],
+                }
+            )
+
+    report = {
+        "project": "tt-mlir",
+        "model": "ttnn_jit_perf",
+        "model_type": "jit_vs_ttnn",
+        "run_type": "benchmark",
+        "measurements": measurements,
+    }
 
     out_path.parent.mkdir(parents=True, exist_ok=True)
     with open(out_path, "w", encoding="utf-8") as f:
-        json.dump(out_rows, f, indent=2)
+        json.dump(report, f, indent=2)
 
     if not args.quiet:
-        print(f"Wrote {len(out_rows)} case(s) to {out_path}")
+        print(
+            f"Wrote {len(measurements)} measurement(s) from {len(groups)} case(s) to {out_path}"
+        )
     return 0
 
 

From d563c3a9b20b510c7dd4f7d57679046761c34171 Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Thu, 12 Mar 2026 15:44:32 +0000
Subject: [PATCH 09/12] add per-measurement metadata fields to summary JSON

---
 .../perf_ci/summarize_perf_results.py         | 25 +++++++++++++------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/test/ttnn-jit/perf_ci/summarize_perf_results.py b/test/ttnn-jit/perf_ci/summarize_perf_results.py
index a478412462d..c2eaa1f60ae 100755
--- a/test/ttnn-jit/perf_ci/summarize_perf_results.py
+++ b/test/ttnn-jit/perf_ci/summarize_perf_results.py
@@ -135,6 +135,19 @@ def make_case_key(
     return (op, h, w, dtype, memory_config_id or "")
 
 
+def _measurement(name: str, value: float, step_name: str) -> dict[str, Any]:
+    return {
+        "measurement_name": name,
+        "value": value,
+        "iteration": 1,
+        "step_name": step_name,
+        "step_warm_up_num_iterations": 0,
+        "target": -1,
+        "device_power": -1.0,
+        "device_temperature": -1.0,
+    }
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Summarize JIT perf run CSVs into one entry per (op, shape, dtype, memory_config) with JIT vs TTNN comparison."
@@ -242,20 +255,16 @@ def main():
             g["perf_pct_ttnn"] = round((ttnn_ns / jit_ns) * 100.0, 2)
 
         prefix = f"{g['op']}_{g['dtype']}_{g['memory_config_id']}"
+        step = f"{g['op']}_{g['shape']}_{g['dtype']}"
         if jit_ns is not None:
-            measurements.append(
-                {"measurement_name": f"{prefix}_jit_duration_ns", "value": jit_ns}
-            )
+            measurements.append(_measurement(f"{prefix}_jit_duration_ns", jit_ns, step))
         if ttnn_ns is not None:
             measurements.append(
-                {"measurement_name": f"{prefix}_ttnn_duration_ns", "value": ttnn_ns}
+                _measurement(f"{prefix}_ttnn_duration_ns", ttnn_ns, step)
             )
         if g["perf_pct_ttnn"] is not None:
             measurements.append(
-                {
-                    "measurement_name": f"{prefix}_perf_pct_ttnn",
-                    "value": g["perf_pct_ttnn"],
-                }
+                _measurement(f"{prefix}_perf_pct_ttnn", g["perf_pct_ttnn"], step)
             )
 
     report = {

From 99fb6c07aba871616eebf8e4b5e2642a6f97b631 Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Fri, 13 Mar 2026 16:29:24 +0000
Subject: [PATCH 10/12] write one JSON report per test case

---
 .github/workflows/call-jit-perf-test.yml      |   8 --
 test/ttnn-jit/perf_ci/run_perf_collect.sh     |  13 +-
 .../perf_ci/summarize_perf_results.py         | 120 ++++++++++--------
 3 files changed, 74 insertions(+), 67 deletions(-)

diff --git a/.github/workflows/call-jit-perf-test.yml b/.github/workflows/call-jit-perf-test.yml
index cb34ec6c185..5b9cd144d39 100644
--- a/.github/workflows/call-jit-perf-test.yml
+++ b/.github/workflows/call-jit-perf-test.yml
@@ -151,14 +151,6 @@ jobs:
 
         test/ttnn-jit/perf_ci/run_perf_collect.sh ${{ steps.strings.outputs.perf-output-dir }}
 
-    - name: Upload JIT perf summary
-      uses: actions/upload-artifact@v4
-      if: success() || failure()
-      with:
-        name: jit-perf-summary-${{ steps.fetch-job-id.outputs.job_id }}
-        path: ${{ steps.strings.outputs.perf-output-dir }}/perf_jit_summary_${{ steps.fetch-job-id.outputs.job_id }}.json
-        if-no-files-found: warn
-
     - name: Upload JIT perf reports
       uses: actions/upload-artifact@v4
       if: success() || failure()
diff --git a/test/ttnn-jit/perf_ci/run_perf_collect.sh b/test/ttnn-jit/perf_ci/run_perf_collect.sh
index 4a7a0118e45..b30af2bdeee 100755
--- a/test/ttnn-jit/perf_ci/run_perf_collect.sh
+++ b/test/ttnn-jit/perf_ci/run_perf_collect.sh
@@ -6,8 +6,8 @@
 # Run each parametrized test in perf_tests.py under the device profiler (tracy)
 # and dump results into a directory per test. Use TT_METAL_PROFILER_DIR so each
 # run writes to a known subdir. At the end, runs summarize_perf_results.py to
-# produce OUT_DIR/perf_jit_summary[_JOB_ID].json.
-# Set JOB_ID env var to include the job ID in the filename (required for CI).
+# produce one JSON report per test case in OUT_DIR (perf_<op>_<dtype>_<mem>[_<JOB_ID>].json).
+# Set JOB_ID env var to include the job ID in filenames (required for CI).
 #
 # Usage:
 #   ./test/ttnn-jit/perf_ci/run_perf_collect.sh [OUT_DIR]
@@ -65,10 +65,13 @@ done
 
 echo ""
 echo "Results written under: $OUT_DIR"
-SUMMARY_FILE="$OUT_DIR/perf_jit_summary${JOB_ID:+_$JOB_ID}.json"
 echo "Summarizing..."
-if python test/ttnn-jit/perf_ci/summarize_perf_results.py "$OUT_DIR" -o "$SUMMARY_FILE"; then
-  echo "Summary written to $SUMMARY_FILE"
+JOB_ID_ARG=""
+if [ -n "$JOB_ID" ]; then
+  JOB_ID_ARG="--job-id $JOB_ID"
+fi
+if python test/ttnn-jit/perf_ci/summarize_perf_results.py "$OUT_DIR" --output-dir "$OUT_DIR" $JOB_ID_ARG; then
+  echo "Summary reports written to $OUT_DIR"
 else
   echo "Warning: summarizer exited with an error (run dir may be partial)." >&2
 fi
diff --git a/test/ttnn-jit/perf_ci/summarize_perf_results.py b/test/ttnn-jit/perf_ci/summarize_perf_results.py
index c2eaa1f60ae..057baa5c12b 100755
--- a/test/ttnn-jit/perf_ci/summarize_perf_results.py
+++ b/test/ttnn-jit/perf_ci/summarize_perf_results.py
@@ -5,15 +5,18 @@
 #
 # Read all ops_perf_results_*.csv under a run directory (from run_perf_collect.sh),
 # group JIT vs non-JIT by case (op, shape, dtype, memory_config_id) and write one
-# entry per case with jit_duration_ns, ttnn_duration_ns, and perf_pct_ttnn.
-# math_fidelity is not part of the key so JIT (e.g. HiFi4) and TTNN (e.g. HiFi2) pair.
-# (100 = same, <100 = JIT slower, >100 = JIT faster). Suitable for Superset.
+# JSON report per case with structured fields for Superset ingestion.
+#
+# Each report becomes its own benchmark_run row in Superset with clean filterable
+# columns (model=op, precision=dtype, config=memory/shape/fidelity) and simple
+# measurement names (jit_kernel_duration_ns, ttnn_kernel_duration_ns, perf_ratio = ttnn/jit, >1 = JIT faster).
 #
 # Usage:
-#   python test/ttnn-jit/perf_ci/summarize_perf_results.py RUN_DIR [-o OUTPUT.json]
+#   python test/ttnn-jit/perf_ci/summarize_perf_results.py RUN_DIR [--output-dir DIR] [--job-id ID]
 #
 # Example:
-#   python test/ttnn-jit/perf_ci/summarize_perf_results.py generated/jit_perf_reports/run_20250309_123456 -o jit_perf_summary.json
+#   python test/ttnn-jit/perf_ci/summarize_perf_results.py generated/jit_perf_reports/run_20250309_123456
+#   python test/ttnn-jit/perf_ci/summarize_perf_results.py generated/jit_perf_reports/run_20250309_123456 --job-id 66822899875
 
 import argparse
 import csv
@@ -27,6 +30,10 @@
 OUTPUT_0_DATATYPE_COL = "OUTPUT_0_DATATYPE"
 INPUT_0_DATATYPE_COL = "INPUT_0_DATATYPE"
 
+UNARY_OPS = frozenset({"abs", "exp"})
+
+MEMORY_CONFIG_IDS = ("dram_interleaved", "l1_interleaved")
+
 
 def find_result_csvs(run_dir: Path):
     """Yield (test_id, csv_path) for each ops_perf_results_*.csv under run_dir."""
@@ -47,10 +54,6 @@ def find_result_csvs(run_dir: Path):
                 yield test_id, csv_path
 
 
-# Known memory_config suffixes in test_id (e.g. ...-dram_interleaved).
-MEMORY_CONFIG_IDS = ("dram_interleaved", "l1_interleaved")
-
-
 def parse_test_id(test_id: str) -> Optional[dict]:
     """
     Parse test_id into jit, op, h, w, and optionally memory_config_id.
@@ -131,7 +134,7 @@ def read_csv_duration_and_meta(csv_path: Path) -> Optional[tuple[int, str, str]]
 def make_case_key(
     op: str, h: int, w: int, dtype: str, memory_config_id: Optional[str]
 ) -> tuple:
-    """Immutable key to group JIT and non-JIT runs of the same case. Excludes math_fidelity so JIT and TTNN runs (which may report different fidelities) pair into one entry."""
+    """Immutable key to group JIT and non-JIT runs of the same case."""
     return (op, h, w, dtype, memory_config_id or "")
 
 
@@ -150,7 +153,7 @@ def _measurement(name: str, value: float, step_name: str) -> dict[str, Any]:
 
 def main():
     parser = argparse.ArgumentParser(
-        description="Summarize JIT perf run CSVs into one entry per (op, shape, dtype, memory_config) with JIT vs TTNN comparison."
+        description="Summarize JIT perf run CSVs into one JSON report per (op, dtype, memory_config) test case for Superset."
     )
     parser.add_argument(
         "run_dir",
@@ -158,11 +161,16 @@ def main():
         help="Directory produced by run_perf_collect.sh (contains test_id/reports/...)",
     )
     parser.add_argument(
-        "-o",
-        "--output",
+        "--output-dir",
         type=Path,
         default=None,
-        help="Output JSON path (default: RUN_DIR/jit_perf_summary.json)",
+        help="Directory to write individual JSON reports (default: run_dir)",
+    )
+    parser.add_argument(
+        "--job-id",
+        type=str,
+        default=None,
+        help="GitHub job ID to append to filenames (required for CI collect_data)",
     )
     parser.add_argument(
         "-q",
@@ -177,9 +185,10 @@ def main():
         print(f"Error: not a directory: {run_dir}", file=sys.stderr)
         sys.exit(1)
 
-    out_path = args.output or (run_dir / "jit_perf_summary.json")
+    out_dir = (args.output_dir or run_dir).resolve()
+    out_dir.mkdir(parents=True, exist_ok=True)
+    job_suffix = f"_{args.job_id}" if args.job_id else ""
 
-    # Raw rows: one per CSV (test_id, jit, op, h, w, duration_ns, dtype, math_fidelity)
     raw: list[dict[str, Any]] = []
     for test_id, csv_path in find_result_csvs(run_dir):
         parsed = parse_test_id(test_id)
@@ -204,7 +213,6 @@ def main():
                 "duration_ns": duration_ns,
                 "dtype": dtype,
                 "math_fidelity": math_fidelity,
-                "csv_path": str(csv_path),
             }
         )
         if not args.quiet:
@@ -212,7 +220,6 @@ def main():
                 f"  {test_id}: {duration_ns} ns (dtype={dtype!r}, math_fidelity={math_fidelity!r})"
             )
 
-    # Group by case key (op, h, w, dtype, memory_config_id) so JIT and TTNN pair even when math_fidelity differs (e.g. matmul HiFi4 vs HiFi2)
     groups: dict[tuple, dict[str, Any]] = {}
     for r in raw:
         key = make_case_key(
@@ -225,64 +232,69 @@ def main():
                 "w": r["w"],
                 "shape": f"{r['h']}x{r['w']}",
                 "dtype": r["dtype"],
-                "math_fidelity": r["math_fidelity"],
-                "math_fidelity_ttnn": None,
+                "math_fidelity_jit": "",
+                "math_fidelity_ttnn": "",
                 "memory_config_id": r.get("memory_config_id") or "",
                 "jit_duration_ns": None,
                 "ttnn_duration_ns": None,
-                "perf_pct_ttnn": None,
-                "jit_csv_path": None,
-                "ttnn_csv_path": None,
             }
         g = groups[key]
         if r["jit"]:
             g["jit_duration_ns"] = r["duration_ns"]
-            g["jit_csv_path"] = r["csv_path"]
-            g["math_fidelity"] = r["math_fidelity"]
+            g["math_fidelity_jit"] = r["math_fidelity"]
         else:
             g["ttnn_duration_ns"] = r["duration_ns"]
-            g["ttnn_csv_path"] = r["csv_path"]
             g["math_fidelity_ttnn"] = r["math_fidelity"]
 
-    # Compute perf_pct_ttnn: (ttnn_duration / jit_duration) * 100
-    # 100 = same, <100 = JIT slower, >100 = JIT faster
-    measurements: list[dict[str, Any]] = []
+    file_count = 0
     for key in sorted(groups.keys()):
         g = groups[key]
+        op = g["op"]
+        dtype = g["dtype"]
+        mem_cfg = g["memory_config_id"]
+        shape = g["shape"]
         jit_ns = g["jit_duration_ns"]
         ttnn_ns = g["ttnn_duration_ns"]
-        if jit_ns is not None and ttnn_ns is not None and jit_ns > 0:
-            g["perf_pct_ttnn"] = round((ttnn_ns / jit_ns) * 100.0, 2)
+        is_unary = op in UNARY_OPS
 
-        prefix = f"{g['op']}_{g['dtype']}_{g['memory_config_id']}"
-        step = f"{g['op']}_{g['shape']}_{g['dtype']}"
+        measurements = []
         if jit_ns is not None:
-            measurements.append(_measurement(f"{prefix}_jit_duration_ns", jit_ns, step))
+            measurements.append(_measurement("jit_kernel_duration_ns", jit_ns, op))
         if ttnn_ns is not None:
-            measurements.append(
-                _measurement(f"{prefix}_ttnn_duration_ns", ttnn_ns, step)
-            )
-        if g["perf_pct_ttnn"] is not None:
-            measurements.append(
-                _measurement(f"{prefix}_perf_pct_ttnn", g["perf_pct_ttnn"], step)
-            )
+            measurements.append(_measurement("ttnn_kernel_duration_ns", ttnn_ns, op))
+        if jit_ns is not None and ttnn_ns is not None and jit_ns > 0:
+            ratio = round(ttnn_ns / jit_ns, 4)
+            measurements.append(_measurement("perf_ratio", ratio, op))
 
-    report = {
-        "project": "tt-mlir",
-        "model": "ttnn_jit_perf",
-        "model_type": "jit_vs_ttnn",
-        "run_type": "benchmark",
-        "measurements": measurements,
-    }
+        config = {
+            "input_a_shape": shape,
+            "input_b_shape": None if is_unary else shape,
+            "input_a_memory_config": mem_cfg,
+            "input_b_memory_config": None if is_unary else mem_cfg,
+            "math_fidelity_jit": g["math_fidelity_jit"],
+            "math_fidelity_ttnn": g["math_fidelity_ttnn"],
+        }
+
+        report = {
+            "project": "tt-mlir",
+            "model": op,
+            "model_type": "jit_vs_ttnn",
+            "run_type": "op_benchmark",
+            "precision": dtype,
+            "config": config,
+            "measurements": measurements,
+        }
 
-    out_path.parent.mkdir(parents=True, exist_ok=True)
-    with open(out_path, "w", encoding="utf-8") as f:
-        json.dump(report, f, indent=2)
+        filename = f"perf_{op}_{dtype}_{mem_cfg}{job_suffix}.json"
+        filepath = out_dir / filename
+        with open(filepath, "w", encoding="utf-8") as f:
+            json.dump(report, f, indent=2)
+        file_count += 1
+        if not args.quiet:
+            print(f"  Wrote {filepath.name} ({len(measurements)} measurements)")
 
     if not args.quiet:
-        print(
-            f"Wrote {len(measurements)} measurement(s) from {len(groups)} case(s) to {out_path}"
-        )
+        print(f"Wrote {file_count} report(s) from {len(groups)} case(s) to {out_dir}")
     return 0
 
 

From bb2f4e587a0bc274ad41822c66dd83ed65ee54e8 Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Fri, 13 Mar 2026 20:19:20 +0000
Subject: [PATCH 11/12] enable other nightly workflows

---
 .github/workflows/schedule-nightly.yml          | 9 +++------
 .github/workflows/workflow-run-collect-data.yml | 2 --
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/schedule-nightly.yml b/.github/workflows/schedule-nightly.yml
index 37986f428ed..cd50ae5833f 100644
--- a/.github/workflows/schedule-nightly.yml
+++ b/.github/workflows/schedule-nightly.yml
@@ -32,18 +32,16 @@ jobs:
       runner: ${{ needs.prepare-run.outputs.runner }}
       sh_builder: ${{ fromJson(needs.prepare-run.outputs.sh_builder) }}
       component_matrix: ${{ needs.prepare-run.outputs.build_matrix }}
-  # TODO: Re-enable wheels-build after JIT perf CI debugging is complete.
   wheels-build:
-    if: false
     needs: [ prepare-run, build-image, release-build ] # release-build required so ttnn-jit wheel is built
+    if: needs.prepare-run.outputs.skip_build != 'true'
     uses: ./.github/workflows/call-build-wheels.yml
     secrets: inherit
     with:
       docker-tag: ${{ needs.build-image.outputs.docker-tag }}
       docker_image: ${{ needs.build-image.outputs.docker-image }}
-  # TODO: Re-enable test after JIT perf CI debugging is complete.
   test:
-    if: false
+    if: always()
     needs: [ prepare-run, build-image, release-build ]
     uses: ./.github/workflows/call-test.yml
     secrets: inherit
@@ -59,9 +57,8 @@ jobs:
     with:
       docker_image: ${{ needs.build-image.outputs.docker-image }}
 
-  # TODO: Re-enable fail-notify after JIT perf CI debugging is complete.
   fail-notify:
-    if: false
+    if: always()
     needs:
       - prepare-run
       - build-image
diff --git a/.github/workflows/workflow-run-collect-data.yml b/.github/workflows/workflow-run-collect-data.yml
index add2cac7cd6..2567ad1dd78 100644
--- a/.github/workflows/workflow-run-collect-data.yml
+++ b/.github/workflows/workflow-run-collect-data.yml
@@ -26,8 +26,6 @@ jobs:
           run_attempt: ${{ github.event.workflow_run.run_attempt }}
           sftp_host: ${{ secrets.SFTP_CICD_WRITER_HOSTNAME }}
           sftp_user: ${{ secrets.SFTP_CICD_WRITER_USERNAME }}
-          sftp_perf_host: ${{ secrets.SFTP_PERF_WRITER_HOSTNAME }}
-          sftp_perf_user: ${{ secrets.SFTP_PERF_WRITER_USERNAME }}
           sftp_optest_host: ${{ secrets.SFTP_OP_TEST_WRITER_HOSTNAME }}
           sftp_optest_user: ${{ secrets.SFTP_OP_TEST_WRITER_USERNAME }}
           ssh-private-key: ${{ secrets.SFTP_CICD_WRITER_KEY }}

From 081e865d77f727245e491a4ffccc22ff911cb18f Mon Sep 17 00:00:00 2001
From: Saber Gholami <sgholami@tenstorrent.com>
Date: Fri, 13 Mar 2026 20:20:50 +0000
Subject: [PATCH 12/12] fix nightly job conditions and re-enable fail-send-msg

---
 .github/workflows/schedule-nightly.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/schedule-nightly.yml b/.github/workflows/schedule-nightly.yml
index cd50ae5833f..5d6af9c394c 100644
--- a/.github/workflows/schedule-nightly.yml
+++ b/.github/workflows/schedule-nightly.yml
@@ -34,15 +34,14 @@ jobs:
       component_matrix: ${{ needs.prepare-run.outputs.build_matrix }}
   wheels-build:
     needs: [ prepare-run, build-image, release-build ] # release-build required so ttnn-jit wheel is built
-    if: needs.prepare-run.outputs.skip_build != 'true'
     uses: ./.github/workflows/call-build-wheels.yml
     secrets: inherit
     with:
       docker-tag: ${{ needs.build-image.outputs.docker-tag }}
       docker_image: ${{ needs.build-image.outputs.docker-image }}
   test:
-    if: always()
     needs: [ prepare-run, build-image, release-build ]
+    if: needs.prepare-run.outputs.skip_build != 'true'
     uses: ./.github/workflows/call-test.yml
     secrets: inherit
     with:
@@ -57,6 +56,7 @@ jobs:
     with:
       docker_image: ${{ needs.build-image.outputs.docker-image }}
 
+
   fail-notify:
     if: always()
     needs:
@@ -79,9 +79,8 @@ jobs:
         with:
           jobs: ${{ toJSON(needs) }}
 
-  # TODO: Re-enable fail-send-msg after JIT perf CI debugging is complete.
   fail-send-msg:
-    if: false
+    if: always()
     needs:
       - fail-notify
       - test