Skip to content

Commit 008acd8

Browse files
authored
[CI][AMD] Run the benchmark on MI350 (#502)
1 parent d562ae2 commit 008acd8

File tree

9 files changed

+111
-43
lines changed

9 files changed

+111
-43
lines changed

.ci/test_infra/oss_ci_benchmark_v3.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55

66
import argparse
77
import json
8+
import os
89
import re
10+
import sys
11+
from os.path import abspath, exists
912
from pathlib import Path
1013

1114
from typing import Any, Dict, List, Tuple
@@ -15,10 +18,37 @@
1518
"name": "gcp-h100-runner",
1619
"gpu_count": 1,
1720
"avail_gpu_mem_in_gb": 80,
18-
}
21+
},
22+
"amd-mi350-runner": {
23+
"name": "amd-mi350-runner",
24+
"gpu_count": 1,
25+
"avail_gpu_mem_in_gb": 288,
26+
},
1927
}
2028

2129

30+
def setup_tritonbench_cwd():
    """Switch to the tritonbench repo root and make the package importable.

    Candidate directories are resolved relative to the current working
    directory. The first candidate that contains a ``tritonbench``
    subdirectory becomes the new working directory and is appended to
    ``sys.path`` (once). When no candidate qualifies, the working directory
    and ``sys.path`` are left untouched.

    Returns:
        The absolute path of the working directory at the time of the call.
    """
    original_dir = abspath(os.getcwd())

    # A bare existence test on "." always succeeds, which would make every
    # later candidate unreachable; require the package directory instead.
    # NOTE(review): "../.." covers being invoked from two levels below the
    # repo root (e.g. .ci/test_infra); "../../tritonbench" is kept from the
    # previous candidate list, presumably a checkout named tritonbench.
    for candidate in (".", "../..", "../../tritonbench"):
        if os.path.isdir(os.path.join(candidate, "tritonbench")):
            repo_root = abspath(candidate)
            os.chdir(repo_root)
            if repo_root not in sys.path:
                sys.path.append(repo_root)
            break

    return original_dir
45+
46+
47+
setup_tritonbench_cwd()
48+
49+
from tritonbench.utils.scuba_utils import get_github_env
50+
51+
2252
def parse_runners(
2353
runner_name: str, runner_type: str, envs: Dict[str, str]
2454
) -> List[Dict[str, Any]]:
@@ -130,6 +160,11 @@ def v3_json_to_str(v3_json: List[Dict[str, Any]], to_lines: bool = True) -> str:
130160
required=True,
131161
help="Upload benchmark result json file.",
132162
)
163+
parser.add_argument(
164+
"--add-github-env",
165+
action="store_true",
166+
help="Add github env to the result json file.",
167+
)
133168
parser.add_argument("--output", required=True, help="output json.")
134169
args = parser.parse_args()
135170
upload_file_path = Path(args.json)
@@ -138,8 +173,13 @@ def v3_json_to_str(v3_json: List[Dict[str, Any]], to_lines: bool = True) -> str:
138173
), f"Specified result json path {args.json} does not exist."
139174
with open(upload_file_path, "r") as fp:
140175
benchmark_result = json.load(fp)
141-
oss_ci_v3_json = generate_oss_ci_benchmark_v3_json(benchmark_result)
142-
out_str = v3_json_to_str(oss_ci_v3_json)
176+
if args.add_github_env:
177+
github_env = get_github_env()
178+
benchmark_result["github"] = github_env
179+
out_str = v3_json_to_str(benchmark_result, to_lines=False)
180+
else:
181+
oss_ci_v3_json = generate_oss_ci_benchmark_v3_json(benchmark_result)
182+
out_str = v3_json_to_str(oss_ci_v3_json)
143183
output_dir = Path(args.output).parent
144184
output_dir.mkdir(parents=True, exist_ok=True)
145185
with open(args.output, "w") as fp:

.ci/tritonbench/run-benchmark.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/bin/bash
2-
set -x
2+
set -xeuo pipefail
33

44
if [ -z "${SETUP_SCRIPT}" ]; then
55
echo "ERROR: SETUP_SCRIPT is not set"
@@ -31,6 +31,10 @@ fi
3131
tritonbench_dir=$(dirname "$(readlink -f "$0")")/../..
3232
cd "${tritonbench_dir}"
3333

34+
# Trust every directory so git's "dubious ownership" check cannot fail, then verify that HEAD resolves
35+
git config --global --add safe.directory '*'
36+
git rev-parse --verify HEAD
37+
3438
echo "Running ${BENCHMARK_NAME} benchmark under conda env ${CONDA_ENV}"
3539

3640
. "${SETUP_SCRIPT}"

.ci/upload/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
boto3
2+
pyyaml
3+
requests

.github/workflows/_linux-benchmark-mi350.yml

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ jobs:
5858
# The max duration enforced by the server side
5959
role-duration-seconds: 18000
6060
aws-region: us-east-1
61+
- name: Setup uploader dependencies
62+
run: |
63+
sudo apt-get install -y python3-pip
64+
pip3 install -r .ci/upload/requirements.txt
6165
- name: Pull docker image
6266
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
6367
with:
@@ -77,21 +81,24 @@ jobs:
7781
--security-opt seccomp=unconfined \
7882
--shm-size=32g \
7983
--cap-add=SYS_PTRACE \
84+
--user root \
8085
-v "${GITHUB_WORKSPACE}:/tmp/workspace" \
8186
-w /tmp/workspace \
8287
"${DOCKER_IMAGE}"
8388
)
89+
8490
# write container id to env
8591
echo "TRITONBENCH_CONTAINER_ID=${container_name}" >> $GITHUB_ENV
8692
- name: Compile Triton (On Demand)
8793
if: ${{ inputs.side_a_triton && inputs.side_a_commit }}
8894
run: |
8995
docker exec -t -w /tmp/workspace "${TRITONBENCH_CONTAINER_ID}" bash -c "
9096
set -eux
91-
bash ./.ci/triton/compile.sh --repo ${{ inputs.side_a_triton }} --commit ${{ inputs.side_a_commit }} --side a
97+
bash ./.ci/triton/install.sh --repo ${{ inputs.side_a_triton }} --commit ${{ inputs.side_a_commit }} --side a
9298
"
9399
- name: Benchmarking
94100
run: |
101+
set -eux
95102
if [ -n "${{ inputs.side_a_triton }}" ] && [ -n "${{ inputs.side_a_commit }}" ]; then
96103
docker exec -t -w /tmp/workspace "${TRITONBENCH_CONTAINER_ID}" bash -c "
97104
set -eux
@@ -104,25 +111,23 @@ jobs:
104111
"
105112
fi
106113
cp -r ".benchmarks/${{ inputs.benchmark_name }}" benchmark-output
114+
# post-process result.json
115+
latest_result_json=$(find ./benchmark-output -name "result.json" | sort -r | head -n 1)
116+
python3 ./.ci/test_infra/oss_ci_benchmark_v3.py --json ${latest_result_json} \
117+
--add-github-env --output ${latest_result_json}
107118
- name: Upload result to GH Actions Artifact
108119
uses: actions/upload-artifact@v4
109120
with:
110121
name: ${{ env.JOB_NAME }}
111122
path: benchmark-output/
112-
- name: Setup uploader dependencies
113-
run: |
114-
sudo apt-get install -y python3-pip
115-
pip3 install -y pyyaml
116123
- name: Upload result to Scribe
117124
run: |
118-
. "${SETUP_SCRIPT}"
119125
latest_result_json=$(find ./benchmark-output -name "result.json" | sort -r | head -n 1)
120-
python ./.ci/upload/scribe.py --json ${latest_result_json}
126+
python3 ./.ci/upload/scribe.py --json ${latest_result_json}
121127
- name: Rewrite Tritonbench json to ClickHouse style
122128
run: |
123-
. "${SETUP_SCRIPT}"
124129
latest_result_json=$(find ./benchmark-output -name "result.json" | sort -r | head -n 1)
125-
python ./.ci/test_infra/oss_ci_benchmark_v3.py --json ${latest_result_json} \
130+
python3 ./.ci/test_infra/oss_ci_benchmark_v3.py --json ${latest_result_json} \
126131
--output benchmark-output/results/result.json
127132
- name: Upload result to ClickHouse
128133
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
@@ -131,3 +136,7 @@ jobs:
131136
dry-run: false
132137
schema-version: v3
133138
github-token: ${{ secrets.GITHUB_TOKEN }}
139+
- name: Kill the container
140+
if: always()
141+
run: |
142+
docker kill "${TRITONBENCH_CONTAINER_ID}" || true

.github/workflows/nightly.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ on:
2828
pull_request:
2929
paths:
3030
- benchmarks/nightly/**
31-
- .github/workflows/_linux-benchmark-h100.yml
31+
- .github/workflows/_linux-benchmark-*.yml
3232
- .github/workflows/nightly.yml
3333

3434
jobs:
@@ -42,6 +42,16 @@ jobs:
4242
side_a_commit: ${{ inputs.side_a_commit }}
4343
secrets:
4444
TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
45+
mi350-triton-main-nightly-periodic:
46+
uses: ./.github/workflows/_linux-benchmark-mi350.yml
47+
if: ${{ inputs.test_type != 'abtest' }}
48+
with:
49+
conda_env: "triton-main"
50+
benchmark_name: "nightly"
51+
side_a_triton: ${{ inputs.side_a_triton }}
52+
side_a_commit: ${{ inputs.side_a_commit }}
53+
secrets:
54+
TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
4555
h100-triton-nightly-abtest:
4656
uses: ./.github/workflows/_linux-benchmark-abtest-h100.yml
4757
if: ${{ inputs.test_type == 'abtest' }}

.github/workflows/pr.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
name: TritonBench PR Test
22
on:
33
pull_request:
4+
paths:
5+
- .github/workflows/pr.yaml
6+
- .github/workflows/_linux-test-*.yml
7+
- tritonbench/**
48
push:
59
branches:
610
- main

tritonbench/utils/gpu_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,5 +229,5 @@ def has_nvidia_smi() -> bool:
229229
try:
230230
subprocess.check_output("nvidia-smi")
231231
return True
232-
except subprocess.SubprocessError:
232+
except (subprocess.SubprocessError, FileNotFoundError):
233233
return False

tritonbench/utils/run_utils.py

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
from typing import Dict, List, Optional
1212

13+
import torch
14+
1315
import yaml
1416

1517
from tritonbench.utils.env_utils import is_fbcode
@@ -44,8 +46,6 @@ def get_run_env(
4446
Gather environment of the benchmark.
4547
repo_locs: Git repository dict of the repositories.
4648
"""
47-
import torch
48-
4949
run_env = {}
5050
run_env["benchmark_date"] = run_timestamp
5151
run_env["cuda_version"] = torch.version.cuda if torch.version.cuda else "unknown"
@@ -73,31 +73,6 @@ def get_run_env(
7373
return run_env
7474

7575

76-
def get_github_env() -> Dict[str, str]:
77-
assert (
78-
"GITHUB_RUN_ID" in os.environ
79-
), "GITHUB_RUN_ID environ must exist to obtain GitHub env"
80-
out = {}
81-
out["GITHUB_ACTION"] = os.environ["GITHUB_ACTION"]
82-
out["GITHUB_ACTOR"] = os.environ["GITHUB_ACTOR"]
83-
out["GITHUB_BASE_REF"] = os.environ["GITHUB_BASE_REF"]
84-
out["GITHUB_REF"] = os.environ["GITHUB_REF"]
85-
out["GITHUB_REF_PROTECTED"] = os.environ["GITHUB_REF_PROTECTED"]
86-
out["GITHUB_REPOSITORY"] = os.environ["GITHUB_REPOSITORY"]
87-
out["GITHUB_RUN_ATTEMPT"] = os.environ["GITHUB_RUN_ATTEMPT"]
88-
out["GITHUB_RUN_ID"] = os.environ["GITHUB_RUN_ID"]
89-
out["GITHUB_RUN_NUMBER"] = os.environ["GITHUB_RUN_NUMBER"]
90-
out["GITHUB_WORKFLOW"] = os.environ["GITHUB_WORKFLOW"]
91-
out["GITHUB_WORKFLOW_REF"] = os.environ["GITHUB_WORKFLOW_REF"]
92-
out["GITHUB_WORKFLOW_SHA"] = os.environ["GITHUB_WORKFLOW_SHA"]
93-
out["JOB_NAME"] = os.environ["JOB_NAME"]
94-
out["RUNNER_ARCH"] = os.environ["RUNNER_ARCH"]
95-
out["RUNNER_TYPE"] = os.environ["RUNNER_TYPE"]
96-
out["RUNNER_NAME"] = os.environ["RUNNER_NAME"]
97-
out["RUNNER_OS"] = os.environ["RUNNER_OS"]
98-
return out
99-
100-
10176
def run_config(config_file: str, args: List[str]):
10277
assert Path(config_file).exists(), f"Config file {config_file} must exist."
10378
with open(config_file, "r") as fp:

tritonbench/utils/scuba_utils.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
from tritonbench.utils.gpu_utils import get_nvidia_gpu_states, has_nvidia_smi
1515
from tritonbench.utils.path_utils import REPO_PATH
16-
from tritonbench.utils.run_utils import get_github_env, get_run_env
1716

1817
CATEGORY_NAME = "perfpipe_pytorch_user_benchmarks"
1918

@@ -58,6 +57,30 @@
5857
}
5958

6059

60+
def get_github_env() -> Dict[str, str]:
    """Collect the GitHub-related environment variables recorded with a run.

    Returns:
        An empty dict when ``GITHUB_RUN_ID`` is not set (presumably when not
        running under GitHub Actions); otherwise a dict mapping each
        recorded variable name to its value, in the order listed below.

    Raises:
        KeyError: ``GITHUB_RUN_ID`` is set but at least one other recorded
            variable is missing. The error lists every missing name.
    """
    if "GITHUB_RUN_ID" not in os.environ:
        return {}

    recorded_vars = (
        "GITHUB_ACTION",
        "GITHUB_ACTOR",
        "GITHUB_BASE_REF",
        "GITHUB_REF",
        "GITHUB_REF_PROTECTED",
        "GITHUB_REPOSITORY",
        "GITHUB_RUN_ATTEMPT",
        "GITHUB_RUN_ID",
        "GITHUB_RUN_NUMBER",
        "GITHUB_WORKFLOW",
        "GITHUB_WORKFLOW_REF",
        "GITHUB_WORKFLOW_SHA",
        "JOB_NAME",
        "RUNNER_ARCH",
        "RUNNER_TYPE",
        "RUNNER_NAME",
        "RUNNER_OS",
    )

    # Report all missing variables at once rather than only the first one,
    # so a misconfigured workflow can be fixed in a single pass.
    missing = [name for name in recorded_vars if name not in os.environ]
    if missing:
        raise KeyError("missing environment variables: " + ", ".join(missing))

    return {name: os.environ[name] for name in recorded_vars}
82+
83+
6184
class ScribeUploader:
6285
def __init__(self, category, schema):
6386
self.category = category
@@ -133,6 +156,7 @@ def decorate_benchmark_data(
133156
name, run_timestamp, ci: bool, benchmark_data: List[Dict[str, Any]]
134157
):
135158
"""aggregate benchmark_data into a single object"""
159+
from tritonbench.utils.run_utils import get_run_env
136160

137161
repo_locs = {
138162
"tritonbench": REPO_PATH,

0 commit comments

Comments
 (0)