
Commit d8b41f2

[ci] Add amd pr testing (#475)

1 parent 182b103 commit d8b41f2

File tree

11 files changed, +182 -21 lines changed
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
name: linux-benchmark-mi350
on:
  workflow_call:
    secrets:
      TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: True
        description: |
          Tritonbench Scribe Graph Access Token
    inputs:
      benchmark_name:
        required: True
        type: string
        description: |
          Benchmark name
      conda_env:
        required: True
        type: string
        description: |
          Conda environment to activate when testing Triton
      side_a_triton:
        required: False
        type: string
        description: |
          Triton repo name
      side_a_commit:
        required: False
        type: string
        description: |
          Triton repo commit

jobs:
  linux-benchmark-mi350:
    if: github.repository_owner == 'meta-pytorch'
    runs-on: [amd-mi350-runner]
    timeout-minutes: 240
    environment: docker-s3-upload
    permissions:
      id-token: write
      contents: read
    env:
      SETUP_SCRIPT: "/workspace/setup_instance.sh"
      CONDA_ENV: ${{ inputs.conda_env }}
      RUNNER_TYPE: "amd-mi350-runner"
      DOCKER_IMAGE: "ghcr.io/meta-pytorch/tritonbench:rocm-latest"
      JOB_NAME: tritonbench-mi350-${{ inputs.conda_env }}-${{ inputs.benchmark_name }}
      TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    steps:
      - name: Checkout Tritonbench
        uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The max duration enforced by the server side
          role-duration-seconds: 18000
          aws-region: us-east-1
      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ env.DOCKER_IMAGE }}
      - name: Start docker container
        run: |
          set -eux

          GPU_FLAG="--device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined "

          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e CONDA_ENV \
            --ipc=host \
            --tty \
            --detach \
            --security-opt seccomp=unconfined \
            --shm-size=32g \
            --cap-add=SYS_PTRACE \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}"
          )
          # write container id to env
          echo "TRITONBENCH_CONTAINER_ID=${container_name}" >> $GITHUB_ENV
      - name: Compile Triton (On Demand)
        if: ${{ inputs.side_a_triton && inputs.side_a_commit }}
        run: |
          docker exec -t -w /tmp/workspace "${TRITONBENCH_CONTAINER_ID}" bash -c "
            set -eux
            bash ./.ci/triton/compile.sh --repo ${{ inputs.side_a_triton }} --commit ${{ inputs.side_a_commit }} --side a
          "
      - name: Benchmarking
        run: |
          if [ -n "${{ inputs.side_a_triton }}" ] && [ -n "${{ inputs.side_a_commit }}" ]; then
            docker exec -t -w /tmp/workspace "${TRITONBENCH_CONTAINER_ID}" bash -c "
              set -eux
              bash .ci/tritonbench/run-benchmark.sh ${{ inputs.benchmark_name }} --conda-env triton-side-a
            "
          else
            docker exec -t -w /tmp/workspace "${TRITONBENCH_CONTAINER_ID}" bash -c "
              set -eux
              bash .ci/tritonbench/run-benchmark.sh ${{ inputs.benchmark_name }}
            "
          fi
          cp -r ".benchmarks/${{ inputs.benchmark_name }}" benchmark-output
      - name: Upload result to GH Actions Artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.JOB_NAME }}
          path: benchmark-output/
      - name: Setup uploader dependencies
        run: |
          sudo apt-get install -y python3-pip
          pip3 install pyyaml
      - name: Upload result to Scribe
        run: |
          . "${SETUP_SCRIPT}"
          latest_result_json=$(find ./benchmark-output -name "result.json" | sort -r | head -n 1)
          python ./.ci/upload/scribe.py --json ${latest_result_json}
      - name: Rewrite Tritonbench json to ClickHouse style
        run: |
          . "${SETUP_SCRIPT}"
          latest_result_json=$(find ./benchmark-output -name "result.json" | sort -r | head -n 1)
          python ./.ci/test_infra/oss_ci_benchmark_v3.py --json ${latest_result_json} \
            --output benchmark-output/results/result.json
      - name: Upload result to ClickHouse
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: benchmark-output/results
          dry-run: false
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/_linux-test-mi350.yml

Lines changed: 2 additions & 2 deletions
@@ -30,15 +30,15 @@ jobs:
       run: |
         set -eux
 
-        GPU_FLAG="--device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined "
+        GPU_FLAG="--device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined --group-add video"
 
         container_name=$(docker run \
           ${GPU_FLAG:-} \
+          --env-file /etc/podinfo/gha-gpu-isolation-settings \
           -e CONDA_ENV \
           --ipc=host \
           --tty \
           --detach \
-          --security-opt seccomp=unconfined \
           --shm-size=32g \
           --cap-add=SYS_PTRACE \
           -v "${GITHUB_WORKSPACE}:/tmp/workspace" \

.github/workflows/pr.yaml

Lines changed: 5 additions & 0 deletions
@@ -14,6 +14,11 @@ jobs:
     uses: ./.github/workflows/_linux-test-h100.yml
     with:
       conda_env: "triton-main"
+  mi350-triton-main-test:
+    uses: ./.github/workflows/_linux-test-mi350.yml
+    with:
+      conda_env: "triton-main"
+
 
 
 concurrency:

test/test_gpu/main.py

Lines changed: 6 additions & 1 deletion
@@ -13,6 +13,7 @@
 )
 from tritonbench.utils.env_utils import (
     is_fbcode,  # @manual=//pytorch/tritonbench:tritonbench
+    is_hip,  # @manual=//pytorch/tritonbench:tritonbench
 )
 
 from tritonbench.utils.parser import get_parser
@@ -28,7 +29,11 @@
 if "site-packages" in triton.__file__:
     SKIP_FILE_NAME = "skip_tests_h100_pytorch.yaml"
 else:
-    SKIP_FILE_NAME = "skip_tests_h100_triton_main.yaml"
+    SKIP_FILE_NAME = (
+        "skip_tests_mi350_triton_main.yaml"
+        if is_hip()
+        else "skip_tests_h100_triton_main.yaml"
+    )
 import os
 
 SKIP_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), SKIP_FILE_NAME))
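
Note: is_hip() is imported above but defined elsewhere (tritonbench/utils/env_utils.py), so its body is not part of this diff. A minimal sketch of what such a probe plausibly looks like, assuming it keys off torch.version the way ROCm builds of PyTorch expose it; is_cuda(), used in the grouped_gemm diff below, would be the mirror check:

import torch

def is_hip() -> bool:
    # ROCm/HIP builds of PyTorch carry a HIP version string; CUDA builds report None
    return torch.version.hip is not None

def is_cuda() -> bool:
    # the mirror probe: only CUDA builds report a CUDA version
    return torch.version.cuda is not None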
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
# Disable kernels that hard-depend on fbgemm_gpu build
# TODO: enable fbgemm_gpu build in docker to re-enable these tests
fp32_to_mx4:
mx4_to_fp32:
fp8_fused_quant_gemm_rowwise:
# TODO: gdpa backward is not supported on MI350
fwd_only_ops:
  gdpa:

tritonbench/operators/fp32_to_mx4/operator.py

Lines changed: 4 additions & 2 deletions
@@ -2,9 +2,11 @@
 from typing import Callable, Generator, List, Optional, Tuple
 
 import torch
+from tritonbench.utils.python_utils import try_import
 
 # We are benchmarking the kernel used inside quantize_comm. Insofar, we are using the fp32_to_mx4 fbgemm API rather than the quantize_mx API.
-from fbgemm_gpu.quantize_utils import fp32_to_mx4, RoundingMode
+with try_import("HAS_FBGEMM"):
+    from fbgemm_gpu.quantize_utils import fp32_to_mx4, RoundingMode
 
 from tritonbench.utils.triton_op import (
     BenchmarkOperator,
@@ -26,7 +28,7 @@ def get_input_iter(self) -> Generator:
         _input = torch.randn((sz,), device=self.device, dtype=torch.float32)
         yield _input, 32, 2, 1, RoundingMode.even, False
 
-    @register_benchmark(baseline=True, fwd_only=True)
+    @register_benchmark(baseline=True, fwd_only=True, enabled=HAS_FBGEMM)
     def fbgemm_fp32_to_mx4(self, *args) -> Callable:
         return lambda: fp32_to_mx4(*args, use_triton=True)
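
Note: try_import, used here and again in fp8_fused_quant_gemm_rowwise below, is a tritonbench helper whose definition is not part of this diff. A plausible sketch, assuming it is a context manager that swallows ImportError and publishes the outcome under the flag name it is given:

import sys

class try_import:
    # Hypothetical reconstruction of tritonbench.utils.python_utils.try_import:
    # run the with-block, set <flag_name> (e.g. HAS_FBGEMM) to True in the
    # calling module's globals on success, False if an import fails.
    def __init__(self, flag_name: str):
        self.flag_name = flag_name
        # globals of the module that contains the `with` statement
        self.caller_globals = sys._getframe(1).f_globals

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.caller_globals[self.flag_name] = exc_type is None
        # suppress only ImportError; anything else propagates
        return exc_type is not None and issubclass(exc_type, ImportError)

Under that assumption, HAS_FBGEMM ends up False on machines without fbgemm_gpu, which is what lets @register_benchmark(..., enabled=HAS_FBGEMM) disable the benchmark instead of crashing at import time.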

tritonbench/operators/fp8_fused_quant_gemm_rowwise/operator.py

Lines changed: 7 additions & 5 deletions
@@ -1,7 +1,13 @@
 import argparse
 from typing import Any, Callable, Generator, List, Optional, Tuple
 
-import fbgemm_gpu.experimental.gen_ai  # noqa: F401
+from tritonbench.utils.python_utils import try_import
+
+with try_import("HAS_FBGEMM"):
+    import fbgemm_gpu.experimental.gen_ai  # noqa: F401
+    from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
+        matmul_fp8_row as triton_fp8_row,
+    )
 
 import torch
 import triton
@@ -41,10 +47,6 @@ def parse_args(args: List[str]) -> argparse.Namespace:
     return args
 
 
-from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
-    matmul_fp8_row as triton_fp8_row,
-)
-
 BUILDIN_SHAPES = [
     (1, 2304, 2048),
     (1, 8192, 16384),

tritonbench/operators/fp8_gemm_blockwise/operator.py

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,6 @@
 import argparse
 from typing import Any, Callable, Generator, List, Optional, Tuple
 
-import fbgemm_gpu.experimental.gen_ai  # noqa: F401
-
 import torch
 import triton
 
@@ -44,6 +42,8 @@ def parse_args(args: List[str]) -> argparse.Namespace:
 HAS_CUTLASS = False
 if is_cuda():
     try:
+        import fbgemm_gpu.experimental.gen_ai
+
         cutlass_fp8_block = torch.ops.llama_cpp.fp8_blockwise_matmul
         HAS_CUTLASS = True
     except:

tritonbench/operators/gdpa/gdpa.py

Lines changed: 6 additions & 5 deletions
@@ -27,8 +27,6 @@
 from torch._library.triton import capture_triton
 from triton.tools.tensor_descriptor import TensorDescriptor
 
-from .gdpa_blackwell_tlx import gdpa_backward_tlx, get_tlx_bwd_autotune_config
-
 from .gdpa_utils import (
     custom_triton_op,
     get_autotune_kernel,
@@ -48,6 +46,8 @@
     # @manual=//triton:triton
     import triton.language.extra.tlx as tlx  # type: ignore
 
+    from .gdpa_blackwell_tlx import gdpa_backward_tlx, get_tlx_bwd_autotune_config
+
     HAS_TLX = True
 except ImportError:
     # suppress type checking errors
@@ -1082,9 +1082,10 @@ def expect_contiguous(x: torch.Tensor) -> torch.Tensor:
     "default": tuple(bwd_configs_ws),
 }
 
-bwd_autotune_configs_tlx = {
-    "default": tuple(get_tlx_bwd_autotune_config()),
-}
+if HAS_TLX:
+    bwd_autotune_configs_tlx = {
+        "default": tuple(get_tlx_bwd_autotune_config()),
+    }
 
 
 @lru_cache
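
Taken in isolation, the gdpa change is the standard guarded-import pattern: the tlx-dependent import and the autotune table built from it both move behind the existing HAS_TLX flag, roughly:

try:
    import triton.language.extra.tlx as tlx  # optional Triton extra
    from .gdpa_blackwell_tlx import gdpa_backward_tlx, get_tlx_bwd_autotune_config
    HAS_TLX = True
except ImportError:
    HAS_TLX = False

# only build the tlx config table when the import actually succeeded
if HAS_TLX:
    bwd_autotune_configs_tlx = {
        "default": tuple(get_tlx_bwd_autotune_config()),
    }

Before this change, the top-level import of gdpa_blackwell_tlx and the unconditional call to get_tlx_bwd_autotune_config() made the module fail to import on Triton builds without the tlx extra.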

tritonbench/operators/grouped_gemm/operator.py

Lines changed: 4 additions & 2 deletions
@@ -87,7 +87,8 @@ def _inner():
 
         return _inner
 
-    @register_benchmark()
+    # TODO: Does not work on hip
+    @register_benchmark(enabled=is_cuda())
     def torch_compile_grouped_gemm(self, group_A, group_B):
         def _inner():
             torch._dynamo.reset()
@@ -104,7 +105,8 @@ def _inner():
         return _inner
 
     # Version of the Inductor Triton benchmark that doesn't time input preprocessing
-    @register_benchmark()
+    # TODO: Does not work on hip
+    @register_benchmark(enabled=is_cuda())
     def preprocessed_pt2_triton_grouped_mm(self, group_A, group_B):
         def _inner():
             torch._dynamo.reset()
