
Commit 4852fd3

Merge branch 'main' into fix/try-disabling-fp64-patch
2 parents 6cbdd42 + c943b2b

File tree: 180 files changed (+6849, -3448 lines)


.github/actions/setup-pytorch/action.yml

Lines changed: 3 additions & 9 deletions
@@ -45,14 +45,8 @@ runs:
       if: inputs.ref != ''
       shell: bash
       run: |
-        if [[ "${{ inputs.repository }}" = "liangan1/pytorch" ]]; then
-          PYTORCH_COMMIT_ID="$(<.github/pins/pytorchFlexAttention.txt)"
-          echo "PYTORCH_REPO=${{ inputs.repository }}" | tee -a "$GITHUB_ENV"
-          echo "PYTORCH_COMMIT_ID=$PYTORCH_COMMIT_ID" | tee -a "$GITHUB_ENV"
-        else
-          echo "PYTORCH_REPO=${{ inputs.repository }}" | tee -a "$GITHUB_ENV"
-          echo "PYTORCH_COMMIT_ID=${{ steps.commit-id.outputs.commit_id }}" | tee -a "$GITHUB_ENV"
-        fi
+        echo "PYTORCH_REPO=${{ inputs.repository }}" | tee -a "$GITHUB_ENV"
+        echo "PYTORCH_COMMIT_ID=${{ steps.commit-id.outputs.commit_id }}" | tee -a "$GITHUB_ENV"

     - name: Identify Python version
       shell: bash
@@ -105,7 +99,7 @@ runs:
         path: pytorch

     - name: Apply additional PR patches
-      if: ${{ steps.pytorch-cache.outputs.status == 'miss' && inputs.mode == 'source' && (inputs.repository == 'pytorch/pytorch' || inputs.repository == 'liangan1/pytorch') }}
+      if: ${{ steps.pytorch-cache.outputs.status == 'miss' && inputs.repository == 'pytorch/pytorch' && inputs.mode == 'source' }}
       shell: bash
       run: |
         cd pytorch
.github/pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-6f6ff8837a0a5ef95bec63d52e24db288f1db8b0
+a94483329c1e5de7237ee8c6be68c3bf42743a10

.github/pins/pytorchFlexAttention.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

.github/workflows/build-test-reusable.yml

Lines changed: 3 additions & 3 deletions
@@ -124,7 +124,7 @@ jobs:

       - name: Install pass_rate dependencies
         run: |
-          pip install defusedxml
+          pip install defusedxml setproctitle==1.3.5

       - name: Setup Triton
         uses: ./.github/actions/setup-triton
@@ -169,14 +169,14 @@ jobs:

           {
             echo SKIPLIST="$skiplist"
-            echo TRITON_TEST_CMD="bash -v -x scripts/test-triton.sh --warning-reports --skip-pytorch-install --reports-dir $GITHUB_WORKSPACE/reports ${{ inputs.ignore_errors && '--ignore-errors' || '' }} $skiplist"
+            echo TRITON_TEST_CMD="bash -x scripts/test-triton.sh --warning-reports --skip-pytorch-install --reports-dir $GITHUB_WORKSPACE/reports ${{ inputs.ignore_errors && '--ignore-errors' || '' }} $skiplist"
           } | tee -a $GITHUB_ENV

       - name: Run Proton tests
         if: ${{ inputs.driver_version == 'rolling' && inputs.device == 'max1100' }}
         run: |
           cd third_party/proton/test
-          pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v
+          pytest test_api.py test_lib.py test_profile.py test_viewer.py test_record.py -s -v
           cd ..

       - name: Run unit tests

.github/workflows/build-test.yml

Lines changed: 4 additions & 26 deletions
@@ -96,36 +96,14 @@ jobs:
           path: ${{ steps.pip-cache.outputs.path }}
           dest: ${{ steps.pip-cache.outputs.dest }}

-  prepare:
-    name: Prepare
-    runs-on: Linux
-
-    outputs:
-      matrix: ${{ steps.matrix.outputs.matrix }}
-
-    steps:
-      - name: Inputs
-        run: |
-          cat <<EOF
-          ${{ toJSON(inputs) }}
-          EOF
-
-      - name: Matrix
-        id: matrix
-        run: |
-          if [[ -n "${{ inputs.runner_label }}" ]]; then
-            matrix='{"python": ["3.9"]}'
-          else
-            matrix='{"python": ["3.9"], "driver": ["rolling", "lts"]}'
-          fi
-          echo "matrix=$matrix" | tee -a $GITHUB_OUTPUT
-
   integration-tests:
     name: Integration tests matrix
-    needs: prepare

     strategy:
-      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
+      matrix:
+        python:
+          - "3.9"
+        driver: ${{ fromJson((inputs.runner_label || '') == '' && '["rolling", "lts"]' || '["rolling"]') }}

     uses: ./.github/workflows/build-test-reusable.yml
     with:
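
GitHub Actions expressions have no ternary operator, so the inlined driver matrix above uses the `cond && a || b` idiom, which acts like a ternary whenever `a` is truthy (a non-empty JSON string always is). A small Python sketch of the selection logic, with a hypothetical label value:

    import json

    def driver_matrix(runner_label: str) -> list[str]:
        # Mirrors fromJson((inputs.runner_label || '') == '' && '["rolling", "lts"]' || '["rolling"]'):
        # no explicit runner label -> test both driver flavors, else only "rolling".
        return json.loads('["rolling", "lts"]' if runner_label == "" else '["rolling"]')

    assert driver_matrix("") == ["rolling", "lts"]
    assert driver_matrix("some-runner") == ["rolling"]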

.github/workflows/integration-tests.yml

Lines changed: 6 additions & 27 deletions
@@ -31,7 +31,7 @@ env:
 jobs:
   Runner-Preparation:
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 45
     outputs:
       matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
       matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
@@ -115,7 +115,7 @@ jobs:
         run: |
           if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
             echo '::set-output name=matrix-CUDA::[["a100-runner-set"], ["h100-runner-set"], ["gb200-runner-set"]]'
-            echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"], ["self-hosted", "gfx942"]]'
+            echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"], ["amd-gfx942"]]'
             echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           else
             echo '::set-output name=matrix-CUDA::["ubuntu-latest"]'
@@ -198,12 +198,7 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
-      - # Cache ~/.triton/cache because the vast majority of unit test time is
-        # spent compiling. Triton won't (well, should not) use these cached files
-        # if something internal to Triton changes, because Triton's internal
-        # source code is part of the cache key.
-        #
-        # Similarly, cache ~/.cache/ccache to speed up compilation.
+      - # Cache ~/.cache/ccache to speed up compilation.
         #
         # On branch `main` we always start from an empty cache, i.e. we skip the
         # "restore" step. This is to prevent the caches from accumulating stale
@@ -214,7 +209,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -285,14 +279,13 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
   Integration-Tests-AMD:
     needs: Runner-Preparation
     if: needs.Runner-Preparation.outputs.matrix-HIP != ''
     runs-on: ${{ matrix.runner }}
-    timeout-minutes: 30
+    timeout-minutes: 45
     env:
       RUNNER_TYPE: ${{ matrix.runner[1] }}
     strategy:
@@ -336,12 +329,7 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
-      - # Cache ~/.triton/cache because the vast majority of unit test time is
-        # spent compiling. Triton won't (well, should not) use these cached files
-        # if something internal to Triton changes, because Triton's internal
-        # source code is part of the cache key.
-        #
-        # Similarly, cache ~/.cache/ccache to speed up compilation.
+      - # Cache ~/.cache/ccache to speed up compilation.
         #
         # On branch `main` we always start from an empty cache, i.e. we skip the
         # "restore" step. This is to prevent the caches from accumulating stale
@@ -352,7 +340,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -443,7 +430,6 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
       - name: Clean up caches
@@ -500,12 +486,7 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
-      - # Cache ~/.triton/cache because the vast majority of unit test time is
-        # spent compiling. Triton won't (well, should not) use these cached files
-        # if something internal to Triton changes, because Triton's internal
-        # source code is part of the cache key.
-        #
-        # Similarly, cache ~/.cache/ccache to speed up compilation.
+      - # Cache ~/.cache/ccache to speed up compilation.
         #
         # On branch `main` we always start from an empty cache, i.e. we skip the
         # "restore" step. This is to prevent the caches from accumulating stale
@@ -516,7 +497,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -572,6 +552,5 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
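
The save steps above write under a key ending in a unique datetime stamp, while the restore steps match on restore-keys prefixes; actions/cache tries the exact key first, then falls back to the most recently created cache whose key starts with a listed prefix. A rough Python sketch of that resolution order:

    from datetime import datetime

    # Hedged sketch of actions/cache key resolution for the save/restore pair above.
    def resolve_cache(saved: dict[str, datetime], exact_key: str,
                      restore_prefixes: list[str]) -> str | None:
        if exact_key in saved:
            return exact_key
        for prefix in restore_prefixes:  # prefixes are tried in order
            matches = [k for k in saved if k.startswith(prefix)]
            if matches:
                return max(matches, key=saved.__getitem__)  # newest entry wins
        return None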

.github/workflows/integration-tests.yml.in

Lines changed: 4 additions & 11 deletions
@@ -34,7 +34,7 @@ env:
 jobs:
   Runner-Preparation:
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 45
     outputs:
       matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
       matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
@@ -124,7 +124,7 @@ jobs:
         run: |
           if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
             echo '::set-output name=matrix-CUDA::[["a100-runner-set"], ["h100-runner-set"], ["gb200-runner-set"]]'
-            echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"], ["self-hosted", "gfx942"]]'
+            echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"], ["amd-gfx942"]]'
             echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           else
             echo '::set-output name=matrix-CUDA::["ubuntu-latest"]'
@@ -225,12 +225,7 @@ jobs:
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}

-      # Cache ~/.triton/cache because the vast majority of unit test time is
-      # spent compiling. Triton won't (well, should not) use these cached files
-      # if something internal to Triton changes, because Triton's internal
-      # source code is part of the cache key.
-      #
-      # Similarly, cache ~/.cache/ccache to speed up compilation.
+      # Cache ~/.cache/ccache to speed up compilation.
       #
       # On branch `main` we always start from an empty cache, i.e. we skip the
       # "restore" step. This is to prevent the caches from accumulating stale
@@ -242,7 +237,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -325,7 +319,6 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}

@@ -334,7 +327,7 @@ jobs:
     if: needs.Runner-Preparation.outputs.matrix-HIP != ''

     runs-on: ${{ matrix.runner }}
-    timeout-minutes: 30
+    timeout-minutes: 45

     env:
       RUNNER_TYPE: ${{ matrix.runner[1] }}

.github/workflows/triton-benchmarks.yml

Lines changed: 0 additions & 8 deletions
@@ -274,14 +274,6 @@ jobs:
           cd benchmarks/micro_benchmarks
           python run_benchmarks.py --reports $REPORTS

-      # Install Pytorch with FlexAttention XPU support enabled
-      - name: Setup PyTorch
-        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
-        uses: ./.github/actions/setup-pytorch
-        with:
-          repository: liangan1/pytorch
-          ref: liangan1/flex_attention
-
       - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
         run: |

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ benchmarks/**/*.so
 inductor_log/

 # Backends copied from submodules
-python/triton/backends/
+python/triton/backends/*
 !python/triton/backends/__init__.py
 !python/triton/backends/compiler.py
 !python/triton/backends/driver.py
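
The trailing `*` matters here: Git does not descend into a directory that is excluded outright (`python/triton/backends/`), so the `!` re-include rules beneath it were dead; ignoring only the directory's contents (`python/triton/backends/*`) leaves the directory itself unignored, letting the three negated files stay tracked.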

bench/bench/bench_mlp.py

Lines changed: 24 additions & 10 deletions
@@ -1,15 +1,22 @@
 from pathlib import Path
 import json
+import triton
 import triton.profiler as proton
 import torch
 import triton_bench.swiglu
 from triton_bench.mxfp import downcast_to_mxfp
 from triton_bench.matmul_ogs import MicroscalingCtx, matmul_ogs, PrecisionConfig, FlexCtx
 from triton_bench.numerics import InFlexData
-from triton_bench.routing import routing_torch, simulate_expert_sharded_routing
+from triton_bench.routing import routing, simulate_expert_sharded_routing
 from triton_bench.meta import cuda_capability_geq

-if torch.cuda.is_available():
+
+def is_hip_cdna4():
+    target = triton.runtime.driver.active.get_current_target()
+    return target.backend == 'hip' and target.arch == 'gfx950'
+
+
+if torch.cuda.is_available() and not is_hip_cdna4():
     from triton._C.libtriton import nvidia
     cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
     cublas = nvidia.cublas.CublasLt(cublas_workspace)
@@ -18,6 +25,9 @@


 def _query_gpu_specs():
+    if is_hip_cdna4():
+        # no spec data yet.
+        return None
     import subprocess
     cmd = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader", "-i=0"]
     output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode().strip()
@@ -86,17 +96,19 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
     for i in range(100):
         x = torch.randn((batch, dim1), device=dev)
         x = x.to(wg.dtype if n_expts_tot > 1 else x_dtype)
-        # TODO: activate proton here when fast routing is done
+        proton.activate()
         if n_expts_tot > 1:
             logits = matmul_ogs(x, wg, bg, precision_config=pcg)
-            rdata, gather_indx, scatter_indx = routing_torch(logits, n_expts_act)
+            rdata, gather_indx, scatter_indx = routing(logits, n_expts_act)
             if EP > 1:
+                proton.deactivate()
+                # TODO: activate proton here when fast expert parallelism simulation is done
                 m = logits.shape[0] * EP
                 _, rdata, gather_indx, scatter_indx = simulate_expert_sharded_routing(m, rdata, EP, device=dev)
+                proton.activate()
             x = x.to(x_dtype)
         else:
             rdata, gather_indx, scatter_indx = None, None, None
-        proton.activate()
         # c0 = torch.empty((x.shape[0], w1.shape[-1]), device=dev, dtype=x.dtype)
         # c1 = torch.empty((x.shape[0], w2.shape[-1]), device=dev, dtype=x.dtype)
         # cublas.matmul(x, w1.squeeze(0), c0)
@@ -119,8 +131,10 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
     # TODO: proton should really be recording that in the json instead of
     # relying on the user to aggregate
     tot_time = sum(x["metrics"].get("time (ns)", 0) for x in data[0]["children"])
-    min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
-    min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
+    min_time_flops = min_time_bytes = 0
+    if SPECS is not None:
+        min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
+        min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
     min_time = max(min_time_flops, min_time_bytes)
     util = min_time / tot_time
     tflops = sum([tot_flops[w] for w in [8, 16]]) / tot_time * 1e-3
@@ -130,9 +144,9 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,


 if __name__ == "__main__":
-    has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10
+    has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10 or is_hip_cdna4()
     qxdtype = "fp8" if has_native_mx4 else "bf16"
     print(bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense"))
     print(bench_mlp(8192, 8192, 8192, 1, 1, qxdtype, "mx4", TP=1, EP=1, name="dense"))
-    print(bench_mlp(1024, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=2, name="llama4"))
-    print(bench_mlp(1024, 5120, 8192, 128, 4, qxdtype, "mx4", TP=4, EP=2, name="llama4"))
+    print(bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4"))
+    print(bench_mlp(2048, 5120, 8192, 128, 4, qxdtype, "mx4", TP=4, EP=1, name="llama4"))
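
With `_query_gpu_specs()` now returning None on CDNA4, the roofline lower bound degenerates to zero and the utilization figure stops being meaningful for that target. A minimal sketch of the guarded computation, with the spec-dict shape inferred from the keys used above:

    # Hedged sketch of the roofline bound guarded by the patch.
    def roofline_min_time(tot_flops: dict, tot_bytes: float, specs: dict | None) -> float:
        if specs is None:  # no spec data for this GPU yet
            return 0.0
        min_time_flops = sum(tot_flops[w] / specs[f"MAX_TFLOPS{w}"] for w in (8, 16)) * 1e-3
        min_time_bytes = tot_bytes / specs["MAX_TBPS"] * 1e-3
        return max(min_time_flops, min_time_bytes)  # compute- or bandwidth-bound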
