
Commit d8b41f2

[ci] Add amd pr testing (#475)

1 parent 182b103 commit d8b41f2

File tree

11 files changed, +182 -21 lines changed
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
name: linux-benchmark-mi350
on:
  workflow_call:
    secrets:
      TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: True
        description: |
          Tritonbench Scribe Graph Access Token
    inputs:
      benchmark_name:
        required: True
        type: string
        description: |
          Benchmark name
      conda_env:
        required: True
        type: string
        description: |
          Conda environment to activate when testing Triton
      side_a_triton:
        required: False
        type: string
        description: |
          Triton repo name
      side_a_commit:
        required: False
        type: string
        description: |
          Triton repo commit

jobs:
  linux-benchmark-mi350:
    if: github.repository_owner == 'meta-pytorch'
    runs-on: [amd-mi350-runner]
    timeout-minutes: 240
    environment: docker-s3-upload
    permissions:
      id-token: write
      contents: read
    env:
      SETUP_SCRIPT: "/workspace/setup_instance.sh"
      CONDA_ENV: ${{ inputs.conda_env }}
      RUNNER_TYPE: "amd-mi350-runner"
      DOCKER_IMAGE: "ghcr.io/meta-pytorch/tritonbench:rocm-latest"
      JOB_NAME: tritonbench-mi350-${{ inputs.conda_env }}-${{ inputs.benchmark_name }}
      TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    steps:
      - name: Checkout Tritonbench
        uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The max duration enforced by the server side
          role-duration-seconds: 18000
          aws-region: us-east-1
      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ env.DOCKER_IMAGE }}
      - name: Start docker container
        run: |
          set -eux

          GPU_FLAG="--device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined "

          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e CONDA_ENV \
            --ipc=host \
            --tty \
            --detach \
            --security-opt seccomp=unconfined \
            --shm-size=32g \
            --cap-add=SYS_PTRACE \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}"
          )
          # write container id to env
          echo "TRITONBENCH_CONTAINER_ID=${container_name}" >> $GITHUB_ENV
      - name: Compile Triton (On Demand)
        if: ${{ inputs.side_a_triton && inputs.side_a_commit }}
        run: |
          docker exec -t -w /tmp/workspace "${TRITONBENCH_CONTAINER_ID}" bash -c "
            set -eux
            bash ./.ci/triton/compile.sh --repo ${{ inputs.side_a_triton }} --commit ${{ inputs.side_a_commit }} --side a
          "
      - name: Benchmarking
        run: |
          if [ -n "${{ inputs.side_a_triton }}" ] && [ -n "${{ inputs.side_a_commit }}" ]; then
            docker exec -t -w /tmp/workspace "${TRITONBENCH_CONTAINER_ID}" bash -c "
              set -eux
              bash .ci/tritonbench/run-benchmark.sh ${{ inputs.benchmark_name }} --conda-env triton-side-a
            "
          else
            docker exec -t -w /tmp/workspace "${TRITONBENCH_CONTAINER_ID}" bash -c "
              set -eux
              bash .ci/tritonbench/run-benchmark.sh ${{ inputs.benchmark_name }}
            "
          fi
          cp -r ".benchmarks/${{ inputs.benchmark_name }}" benchmark-output
      - name: Upload result to GH Actions Artifact
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.JOB_NAME }}
          path: benchmark-output/
      - name: Setup uploader dependencies
        run: |
          sudo apt-get install -y python3-pip
          pip3 install pyyaml
      - name: Upload result to Scribe
        run: |
          . "${SETUP_SCRIPT}"
          latest_result_json=$(find ./benchmark-output -name "result.json" | sort -r | head -n 1)
          python ./.ci/upload/scribe.py --json ${latest_result_json}
      - name: Rewrite Tritonbench json to ClickHouse style
        run: |
          . "${SETUP_SCRIPT}"
          latest_result_json=$(find ./benchmark-output -name "result.json" | sort -r | head -n 1)
          python ./.ci/test_infra/oss_ci_benchmark_v3.py --json ${latest_result_json} \
            --output benchmark-output/results/result.json
      - name: Upload result to ClickHouse
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: benchmark-output/results
          dry-run: false
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/_linux-test-mi350.yml

Lines changed: 2 additions & 2 deletions
@@ -30,15 +30,15 @@ jobs:
       run: |
         set -eux
 
-        GPU_FLAG="--device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined "
+        GPU_FLAG="--device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined --group-add video"
 
         container_name=$(docker run \
           ${GPU_FLAG:-} \
+          --env-file /etc/podinfo/gha-gpu-isolation-settings \
           -e CONDA_ENV \
           --ipc=host \
           --tty \
           --detach \
-          --security-opt seccomp=unconfined \
           --shm-size=32g \
           --cap-add=SYS_PTRACE \
           -v "${GITHUB_WORKSPACE}:/tmp/workspace" \

.github/workflows/pr.yaml

Lines changed: 5 additions & 0 deletions
@@ -14,6 +14,11 @@ jobs:
     uses: ./.github/workflows/_linux-test-h100.yml
     with:
       conda_env: "triton-main"
+  mi350-triton-main-test:
+    uses: ./.github/workflows/_linux-test-mi350.yml
+    with:
+      conda_env: "triton-main"
+
 
 
 concurrency:

test/test_gpu/main.py

Lines changed: 6 additions & 1 deletion
@@ -13,6 +13,7 @@
 )
 from tritonbench.utils.env_utils import (
     is_fbcode,  # @manual=//pytorch/tritonbench:tritonbench
+    is_hip,  # @manual=//pytorch/tritonbench:tritonbench
 )
 
 from tritonbench.utils.parser import get_parser
@@ -28,7 +29,11 @@
 if "site-packages" in triton.__file__:
     SKIP_FILE_NAME = "skip_tests_h100_pytorch.yaml"
 else:
-    SKIP_FILE_NAME = "skip_tests_h100_triton_main.yaml"
+    SKIP_FILE_NAME = (
+        "skip_tests_mi350_triton_main.yaml"
+        if is_hip()
+        else "skip_tests_h100_triton_main.yaml"
+    )
 import os
 
 SKIP_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), SKIP_FILE_NAME))
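
Note: is_hip() is imported above but defined elsewhere (tritonbench/utils/env_utils.py), so its body is not part of this diff. A minimal sketch of what such a probe plausibly looks like, assuming it keys off torch.version the way ROCm builds of PyTorch expose it; is_cuda(), used in the grouped_gemm diff below, would be the mirror check:

import torch

def is_hip() -> bool:
    # ROCm/HIP builds of PyTorch carry a HIP version string; CUDA builds report None
    return torch.version.hip is not None

def is_cuda() -> bool:
    # the mirror probe: only CUDA builds report a CUDA version
    return torch.version.cuda is not None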
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
# Disable kernels that hard-depend on fbgemm_gpu build
# TODO: enable fbgemm_gpu build in docker to re-enable these tests
fp32_to_mx4:
mx4_to_fp32:
fp8_fused_quant_gemm_rowwise:
# TODO: gdpa backward is not supported on MI350
fwd_only_ops:
  gdpa:

tritonbench/operators/fp32_to_mx4/operator.py

Lines changed: 4 additions & 2 deletions
@@ -2,9 +2,11 @@
 from typing import Callable, Generator, List, Optional, Tuple
 
 import torch
+from tritonbench.utils.python_utils import try_import
 
 # We are benchmarking the kernel used inside quantize_comm. Insofar, we are using the fp32_to_mx4 fbgemm API rather than the quantize_mx API.
-from fbgemm_gpu.quantize_utils import fp32_to_mx4, RoundingMode
+with try_import("HAS_FBGEMM"):
+    from fbgemm_gpu.quantize_utils import fp32_to_mx4, RoundingMode
 
 from tritonbench.utils.triton_op import (
     BenchmarkOperator,
@@ -26,7 +28,7 @@ def get_input_iter(self) -> Generator:
         _input = torch.randn((sz,), device=self.device, dtype=torch.float32)
         yield _input, 32, 2, 1, RoundingMode.even, False
 
-    @register_benchmark(baseline=True, fwd_only=True)
+    @register_benchmark(baseline=True, fwd_only=True, enabled=HAS_FBGEMM)
     def fbgemm_fp32_to_mx4(self, *args) -> Callable:
         return lambda: fp32_to_mx4(*args, use_triton=True)
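
Note: try_import, used here and again in fp8_fused_quant_gemm_rowwise below, is a tritonbench helper whose definition is not part of this diff. A plausible sketch, assuming it is a context manager that swallows ImportError and publishes the outcome under the flag name it is given:

import sys

class try_import:
    # Hypothetical reconstruction of tritonbench.utils.python_utils.try_import:
    # run the with-block, set <flag_name> (e.g. HAS_FBGEMM) to True in the
    # calling module's globals on success, False if an import fails.
    def __init__(self, flag_name: str):
        self.flag_name = flag_name
        # globals of the module that contains the `with` statement
        self.caller_globals = sys._getframe(1).f_globals

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.caller_globals[self.flag_name] = exc_type is None
        # suppress only ImportError; anything else propagates
        return exc_type is not None and issubclass(exc_type, ImportError)

Under that assumption, HAS_FBGEMM ends up False on machines without fbgemm_gpu, which is what lets @register_benchmark(..., enabled=HAS_FBGEMM) disable the benchmark instead of crashing at import time.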

tritonbench/operators/fp8_fused_quant_gemm_rowwise/operator.py

Lines changed: 7 additions & 5 deletions
@@ -1,7 +1,13 @@
 import argparse
 from typing import Any, Callable, Generator, List, Optional, Tuple
 
-import fbgemm_gpu.experimental.gen_ai  # noqa: F401
+from tritonbench.utils.python_utils import try_import
+
+with try_import("HAS_FBGEMM"):
+    import fbgemm_gpu.experimental.gen_ai  # noqa: F401
+    from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
+        matmul_fp8_row as triton_fp8_row,
+    )
 
 import torch
 import triton
@@ -41,10 +47,6 @@ def parse_args(args: List[str]) -> argparse.Namespace:
     return args
 
 
-from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
-    matmul_fp8_row as triton_fp8_row,
-)
-
 BUILDIN_SHAPES = [
     (1, 2304, 2048),
     (1, 8192, 16384),

tritonbench/operators/fp8_gemm_blockwise/operator.py

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,6 @@
 import argparse
 from typing import Any, Callable, Generator, List, Optional, Tuple
 
-import fbgemm_gpu.experimental.gen_ai  # noqa: F401
-
 import torch
 import triton
 
@@ -44,6 +42,8 @@ def parse_args(args: List[str]) -> argparse.Namespace:
 HAS_CUTLASS = False
 if is_cuda():
     try:
+        import fbgemm_gpu.experimental.gen_ai
+
         cutlass_fp8_block = torch.ops.llama_cpp.fp8_blockwise_matmul
         HAS_CUTLASS = True
     except:

tritonbench/operators/gdpa/gdpa.py

Lines changed: 6 additions & 5 deletions
@@ -27,8 +27,6 @@
 from torch._library.triton import capture_triton
 from triton.tools.tensor_descriptor import TensorDescriptor
 
-from .gdpa_blackwell_tlx import gdpa_backward_tlx, get_tlx_bwd_autotune_config
-
 from .gdpa_utils import (
     custom_triton_op,
     get_autotune_kernel,
@@ -48,6 +46,8 @@
     # @manual=//triton:triton
     import triton.language.extra.tlx as tlx  # type: ignore
 
+    from .gdpa_blackwell_tlx import gdpa_backward_tlx, get_tlx_bwd_autotune_config
+
     HAS_TLX = True
 except ImportError:
     # suppress type checking errors
@@ -1082,9 +1082,10 @@ def expect_contiguous(x: torch.Tensor) -> torch.Tensor:
     "default": tuple(bwd_configs_ws),
 }
 
-bwd_autotune_configs_tlx = {
-    "default": tuple(get_tlx_bwd_autotune_config()),
-}
+if HAS_TLX:
+    bwd_autotune_configs_tlx = {
+        "default": tuple(get_tlx_bwd_autotune_config()),
+    }
 
 
 @lru_cache
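
Taken in isolation, the gdpa change is the standard guarded-import pattern: the tlx-dependent import and the autotune table built from it both move behind the existing HAS_TLX flag, roughly:

try:
    import triton.language.extra.tlx as tlx  # optional Triton extra
    from .gdpa_blackwell_tlx import gdpa_backward_tlx, get_tlx_bwd_autotune_config
    HAS_TLX = True
except ImportError:
    HAS_TLX = False

# only build the tlx config table when the import actually succeeded
if HAS_TLX:
    bwd_autotune_configs_tlx = {
        "default": tuple(get_tlx_bwd_autotune_config()),
    }

Before this change, the top-level import of gdpa_blackwell_tlx and the unconditional call to get_tlx_bwd_autotune_config() made the module fail to import on Triton builds without the tlx extra.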

tritonbench/operators/grouped_gemm/operator.py

Lines changed: 4 additions & 2 deletions
@@ -87,7 +87,8 @@ def _inner():
 
         return _inner
 
-    @register_benchmark()
+    # TODO: Does not work on hip
+    @register_benchmark(enabled=is_cuda())
     def torch_compile_grouped_gemm(self, group_A, group_B):
         def _inner():
             torch._dynamo.reset()
@@ -104,7 +105,8 @@ def _inner():
         return _inner
 
     # Version of the Inductor Triton benchmark that doesn't time input preprocessing
-    @register_benchmark()
+    # TODO: Does not work on hip
+    @register_benchmark(enabled=is_cuda())
     def preprocessed_pt2_triton_grouped_mm(self, group_A, group_B):
         def _inner():
             torch._dynamo.reset()
