Skip to content

Commit 09e0a29

Browse files
Merge OpenAI Triton commit aac457e (#3943)
This PR changes the Triton base from e1162ee to aac457e (Apr 11). Pass rate: 88.69% -> 88.73%
2 parents 51cef7b + c973b79 commit 09e0a29

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1146
-606
lines changed

.github/workflows/integration-tests.yml

Lines changed: 4 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ env:
3131
jobs:
3232
Runner-Preparation:
3333
runs-on: ubuntu-latest
34-
timeout-minutes: 30
34+
timeout-minutes: 45
3535
outputs:
3636
matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
3737
matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
@@ -198,12 +198,7 @@ jobs:
198198
~/.triton/nvidia
199199
~/.triton/json
200200
key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
201-
- # Cache ~/.triton/cache because the vast majority of unit test time is
202-
# spent compiling. Triton won't (well, should not) use these cached files
203-
# if something internal to Triton changes, because Triton's internal
204-
# source code is part of the cache key.
205-
#
206-
# Similarly, cache ~/.cache/ccache to speed up compilation.
201+
- # Cache ~/.cache/ccache to speed up compilation.
207202
#
208203
# On branch `main` we always start from an empty cache, i.e. we skip the
209204
# "restore" step. This is to prevent the caches from accumulating stale
@@ -214,7 +209,6 @@ jobs:
214209
uses: actions/cache/restore@v4
215210
with:
216211
path: |
217-
~/.triton/cache
218212
~/.ccache
219213
# Restore the most recent cache entry.
220214
restore-keys: |
@@ -285,7 +279,6 @@ jobs:
285279
uses: actions/cache/save@v4
286280
with:
287281
path: |
288-
~/.triton/cache
289282
~/.ccache
290283
key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
291284
Integration-Tests-AMD:
@@ -336,12 +329,7 @@ jobs:
336329
~/.triton/nvidia
337330
~/.triton/json
338331
key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
339-
- # Cache ~/.triton/cache because the vast majority of unit test time is
340-
# spent compiling. Triton won't (well, should not) use these cached files
341-
# if something internal to Triton changes, because Triton's internal
342-
# source code is part of the cache key.
343-
#
344-
# Similarly, cache ~/.cache/ccache to speed up compilation.
332+
- # Cache ~/.cache/ccache to speed up compilation.
345333
#
346334
# On branch `main` we always start from an empty cache, i.e. we skip the
347335
# "restore" step. This is to prevent the caches from accumulating stale
@@ -352,7 +340,6 @@ jobs:
352340
uses: actions/cache/restore@v4
353341
with:
354342
path: |
355-
~/.triton/cache
356343
~/.ccache
357344
# Restore the most recent cache entry.
358345
restore-keys: |
@@ -443,7 +430,6 @@ jobs:
443430
uses: actions/cache/save@v4
444431
with:
445432
path: |
446-
~/.triton/cache
447433
~/.ccache
448434
key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
449435
- name: Clean up caches
@@ -500,12 +486,7 @@ jobs:
500486
~/.triton/nvidia
501487
~/.triton/json
502488
key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
503-
- # Cache ~/.triton/cache because the vast majority of unit test time is
504-
# spent compiling. Triton won't (well, should not) use these cached files
505-
# if something internal to Triton changes, because Triton's internal
506-
# source code is part of the cache key.
507-
#
508-
# Similarly, cache ~/.cache/ccache to speed up compilation.
489+
- # Cache ~/.cache/ccache to speed up compilation.
509490
#
510491
# On branch `main` we always start from an empty cache, i.e. we skip the
511492
# "restore" step. This is to prevent the caches from accumulating stale
@@ -516,7 +497,6 @@ jobs:
516497
uses: actions/cache/restore@v4
517498
with:
518499
path: |
519-
~/.triton/cache
520500
~/.ccache
521501
# Restore the most recent cache entry.
522502
restore-keys: |
@@ -572,6 +552,5 @@ jobs:
572552
uses: actions/cache/save@v4
573553
with:
574554
path: |
575-
~/.triton/cache
576555
~/.ccache
577556
key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}

.github/workflows/integration-tests.yml.in

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ env:
3434
jobs:
3535
Runner-Preparation:
3636
runs-on: ubuntu-latest
37-
timeout-minutes: 30
37+
timeout-minutes: 45
3838
outputs:
3939
matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
4040
matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
@@ -225,12 +225,7 @@ jobs:
225225
~/.triton/json
226226
key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
227227

228-
# Cache ~/.triton/cache because the vast majority of unit test time is
229-
# spent compiling. Triton won't (well, should not) use these cached files
230-
# if something internal to Triton changes, because Triton's internal
231-
# source code is part of the cache key.
232-
#
233-
# Similarly, cache ~/.cache/ccache to speed up compilation.
228+
# Cache ~/.cache/ccache to speed up compilation.
234229
#
235230
# On branch `main` we always start from an empty cache, i.e. we skip the
236231
# "restore" step. This is to prevent the caches from accumulating stale
@@ -242,7 +237,6 @@ jobs:
242237
uses: actions/cache/restore@v4
243238
with:
244239
path: |
245-
~/.triton/cache
246240
~/.ccache
247241
# Restore the most recent cache entry.
248242
restore-keys: |
@@ -325,7 +319,6 @@ jobs:
325319
uses: actions/cache/save@v4
326320
with:
327321
path: |
328-
~/.triton/cache
329322
~/.ccache
330323
key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
331324

bench/bench/bench_mlp.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from pathlib import Path
22
import json
3+
import triton
34
import triton.profiler as proton
45
import torch
56
import triton_bench.swiglu
@@ -9,7 +10,13 @@
910
from triton_bench.routing import routing_torch, simulate_expert_sharded_routing
1011
from triton_bench.meta import cuda_capability_geq
1112

12-
if torch.cuda.is_available():
13+
14+
def is_hip_cdna4():
15+
target = triton.runtime.driver.active.get_current_target()
16+
return target.backend == 'hip' and target.arch == 'gfx950'
17+
18+
19+
if torch.cuda.is_available() and not is_hip_cdna4():
1320
from triton._C.libtriton import nvidia
1421
cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
1522
cublas = nvidia.cublas.CublasLt(cublas_workspace)
@@ -18,6 +25,9 @@
1825

1926

2027
def _query_gpu_specs():
28+
if is_hip_cdna4():
29+
# no spec data yet.
30+
return None
2131
import subprocess
2232
cmd = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader", "-i=0"]
2333
output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode().strip()
@@ -119,8 +129,10 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
119129
# TODO: proton should really be recording that in the json instead of
120130
# relying on the user to aggregate
121131
tot_time = sum(x["metrics"].get("time (ns)", 0) for x in data[0]["children"])
122-
min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
123-
min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
132+
min_time_flops = min_time_bytes = 0
133+
if SPECS is not None:
134+
min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
135+
min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
124136
min_time = max(min_time_flops, min_time_bytes)
125137
util = min_time / tot_time
126138
tflops = sum([tot_flops[w] for w in [8, 16]]) / tot_time * 1e-3
@@ -130,7 +142,7 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
130142

131143

132144
if __name__ == "__main__":
133-
has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10
145+
has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10 or is_hip_cdna4()
134146
qxdtype = "fp8" if has_native_mx4 else "bf16"
135147
print(bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense"))
136148
print(bench_mlp(8192, 8192, 8192, 1, 1, qxdtype, "mx4", TP=1, EP=1, name="dense"))

bench/triton_bench/matmul_ogs_details/opt_flags.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def make_default_opt_flags_amd(
6161
block_m = 128
6262
elif tokens_per_expt >= 512 and n >= 2048:
6363
block_m = 128
64+
6465
else:
6566
block_m = max(32, min(triton.next_power_of_2(tokens_per_expt), 64))
6667
if routing_data is not None:
@@ -90,7 +91,7 @@ def make_default_opt_flags_amd(
9091
num_stages = 2
9192
is_persistent = False
9293
# AMD-specific
93-
target_kernel_kwargs = {"waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2}
94+
target_kernel_kwargs = {"waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1}
9495
return OptFlags(
9596
block_m=block_m,
9697
block_n=block_n,

bench/triton_bench/matmul_ogs_details/opt_flags_amd.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
import triton
33

44

5+
def is_hip_cdna4():
6+
target = triton.runtime.driver.active.get_current_target()
7+
return target.backend == 'hip' and target.arch == 'gfx950'
8+
9+
510
def compute_block_nk(n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, microscaling_ctx):
611
lhs_width = lhs_dtype.itemsize
712
rhs_width = rhs_dtype.itemsize if microscaling_ctx.weight_scale is None else 0.5
@@ -18,6 +23,9 @@ def compute_block_nk(n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, microsc
1823
else:
1924
block_n = 128
2025

26+
if is_hip_cdna4() and block_m == 128:
27+
block_n = 512
28+
2129
# block_k needs to match the cacheline size (128B)
2230
block_k = int(128 // min(lhs_width, rhs_width))
2331

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,15 @@ class TargetInfoBase {
9494

9595
virtual bool supportVectorizedAtomics() const = 0;
9696

97-
// Helper used by targets to annotate store operations during lowering to
98-
// llvm.
99-
virtual void storeOpAnnotation(triton::gpu::LocalStoreOp op,
100-
size_t localStoreOpCount, Type type) const {}
97+
// Attach target-specific annotations to local store operations during
98+
// lowering to LLVM.
99+
virtual void localStoreOpAnnotation(triton::gpu::LocalStoreOp op,
100+
size_t localStoreOpCount,
101+
Type type) const {}
102+
// Attach target-specific annotations to local load operations during
103+
// lowering to LLVM. `llLoadOp` is the generated LLVM load op.
104+
virtual void localLoadOpAnnotation(triton::gpu::LocalLoadOp localLoadOp,
105+
Operation *llLoadOp) const {}
101106

102107
virtual ~TargetInfoBase() {}
103108
};

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -706,8 +706,7 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
706706
Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
707707
std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
708708

709-
SmallVector<Value> loadSharedToDistributed(RankedTensorType dstTy,
710-
triton::gpu::MemDescType srcTy,
709+
SmallVector<Value> loadSharedToDistributed(triton::gpu::LocalLoadOp localLoadOp,
711710
Type elemLlvmTy,
712711
const SharedMemoryObject &smemObj,
713712
Location loc, RewriterBase &rewriter,

0 commit comments

Comments
 (0)