Commit 2caaac0

Merge commit '449e01478694e35e0654fee3c8525d32cb0e3a5c'
2 parents: 334be65 + 449e014

File tree: 12 files changed (+253, -428 lines)

test/Proton/scope_id.mlir

Lines changed: 23 additions & 4 deletions
@@ -151,6 +151,25 @@ module {
 
 // -----
 
+module {
+  tt.func @scf_loop() {
+    %c0 = arith.constant 0 : index
+    // expected-remark @below {{scope id = 0}}
+    // expected-remark @below {{scope parent id = -1}}
+    proton.record start "loop"
+    scf.for %i = %c0 to %c0 step %c0 {
+      // expected-remark @below {{scope id = 1}}
+      // expected-remark @below {{scope parent id = 0}}
+      proton.record start "loop_body"
+      proton.record end "loop_body"
+    }
+    proton.record end "loop"
+    tt.return
+  }
+}
+
+// -----
+
 module {
   tt.func @scf_loop_if(%cond: i1) {
     %c0 = arith.constant 0 : index
@@ -203,19 +222,19 @@ module {
     ttg.warp_specialize()
     default {
       // expected-remark @below {{scope id = 1}}
-      // expected-remark @below {{scope parent id = -1}}
+      // expected-remark @below {{scope parent id = 0}}
       proton.record start "default"
       // expected-remark @below {{scope id = 1}}
-      // expected-remark @below {{scope parent id = -1}}
+      // expected-remark @below {{scope parent id = 0}}
       proton.record end "default"
       ttg.warp_yield
     }
     partition0() num_warps(1) {
       // expected-remark @below {{scope id = 2}}
-      // expected-remark @below {{scope parent id = -1}}
+      // expected-remark @below {{scope parent id = 0}}
       proton.record start "partition"
       // expected-remark @below {{scope id = 2}}
-      // expected-remark @below {{scope parent id = -1}}
+      // expected-remark @below {{scope parent id = 0}}
       proton.record end "partition"
       ttg.warp_return
     } : () -> ()
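
Note on the expectation change above: the updated remarks make a proton.record inside a ttg.warp_specialize region report the enclosing proton scope (id 0) as its parent rather than -1, consistent with the nesting that the new @scf_loop test checks. Below is a minimal sketch of the implied nesting; the @warp_specialize_nesting function name and the "kernel" record are hypothetical stand-ins for the scope-0 record that sits above this hunk in the real test, not content from this commit.

module {
  tt.func @warp_specialize_nesting() {
    // expected-remark @below {{scope id = 0}}
    // expected-remark @below {{scope parent id = -1}}
    proton.record start "kernel"        // outermost record: no parent scope
    ttg.warp_specialize()
    default {
      // expected-remark @below {{scope id = 1}}
      // expected-remark @below {{scope parent id = 0}}
      proton.record start "default"     // region record is parented to "kernel"
      proton.record end "default"
      ttg.warp_yield
    }
    partition0() num_warps(1) {
      // expected-remark @below {{scope id = 2}}
      // expected-remark @below {{scope parent id = 0}}
      proton.record start "partition"   // sibling region: same parent, distinct scope id
      proton.record end "partition"
      ttg.warp_return
    } : () -> ()
    proton.record end "kernel"
    tt.return
  }
}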

test/TritonGPU/amd/amd-optimize-dot-operands.mlir

Lines changed: 1 addition & 126 deletions
@@ -1,129 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-optimize-dot-operands="arch-generation-name=gfx950" | FileCheck %s --check-prefixes CHECK,GFX950
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-optimize-dot-operands="arch-generation-name=gfx942" | FileCheck %s
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
-#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
-#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
-#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
-// CHECK{LITERAL}: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
-// CHECK{LITERAL}: #smem = #ttg.shared_memory
-// CHECK-LABEL: test_local_load_transposed
-// CHECK: %[[LOAD:.+]] = tt.load {{.*}} : tensor<64x16x!tt.ptr<f16>, #blocked>
-// CHECK: %[[ALLOC:.+]] = ttg.local_alloc %[[LOAD]] : (tensor<64x16xf16, #blocked>) -> !ttg.memdesc<64x16xf16, #shared, #smem>
-// CHECK: %[[LOCAL_LOAD_TRANS:.+]] = ttg.local_load %[[ALLOC]] : !ttg.memdesc<64x16xf16, #shared, #smem> -> tensor<64x16xf16, #linear>
-// CHECK: %[[LOCAL_LOAD_DIRECT:.+]] = ttg.local_load %[[ALLOC]] : !ttg.memdesc<64x16xf16, #shared, #smem> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-// CHECK: tt.dot {{.+}}, %[[LOCAL_LOAD_DIRECT]], {{.+}}: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x16xf32, #mma>
-// CHECK: %[[TRANS:.+]] = tt.trans %[[LOCAL_LOAD_TRANS]] {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-// CHECK: tt.dot {{.+}}, %[[TRANS]], {{.+}} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x64xf32, #mma1>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_local_load_transposed(
-    %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
-    %out0 : tensor<128x16x!tt.ptr<f32>, #blocked>,
-    %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
-  ) {
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
-    %cst_1 = arith.constant dense<0.693147182> : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>>
-    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
-    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
-
-    %0 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
-    %1 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
-    %2 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-    %3 = tt.dot %cst_1, %2, %cst_0 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x16xf32, #mma1>
-    %4 = tt.trans %1 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-    %5 = tt.dot %cst_2, %4, %cst_3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
-
-    %6 = ttg.convert_layout %3 : tensor<128x16xf32, #mma1> -> tensor<128x16xf32, #blocked>
-    %7 = ttg.convert_layout %5 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
-    tt.store %out0, %6 : tensor<128x16x!tt.ptr<f32>, #blocked>
-    tt.store %out1, %7 : tensor<128x64x!tt.ptr<f32>, #blocked>
-    tt.return
-  }
-}
-// -----
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
-#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
-#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
-#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
-// CHECK-NOT: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
-// CHECK-NOT: #smem = #ttg.shared_memory
-// CHECK-LABEL: test_not_local_load_transposed_kWidth_mismatch
-// CHECK: tt.load {{.*}} : tensor<64x16x!tt.ptr<f16>, #blocked>
-// CHECK-NOT: ttg.local_alloc
-// CHECK-NOT: ttg.local_load
-// CHECK-NOT: ttg.local_load
-// CHECK: tt.dot {{.+}}: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<128x16xf32, #mma>
-// CHECK: tt.trans {{.+}} {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-// CHECK: tt.dot {{.+}} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x64xf32, #mma1>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_not_local_load_transposed_kWidth_mismatch(
-    %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
-    %out0 : tensor<128x16x!tt.ptr<f32>, #blocked>,
-    %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
-  ) {
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
-    %cst_1 = arith.constant dense<0.693147182> : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>>
-    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
-    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
-
-    %0 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
-    %1 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
-    %2 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>>
-    %3 = tt.dot %cst_1, %2, %cst_0 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>> -> tensor<128x16xf32, #mma1>
-    %4 = tt.trans %1 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-    %5 = tt.dot %cst_2, %4, %cst_3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
-
-    %6 = ttg.convert_layout %3 : tensor<128x16xf32, #mma1> -> tensor<128x16xf32, #blocked>
-    %7 = ttg.convert_layout %5 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
-    tt.store %out0, %6 : tensor<128x16x!tt.ptr<f32>, #blocked>
-    tt.store %out1, %7 : tensor<128x64x!tt.ptr<f32>, #blocked>
-    tt.return
-  }
-}
-// -----
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
-#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
-#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
-#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
-// CHECK-NOT: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
-// CHECK-NOT: #smem = #ttg.shared_memory
-// CHECK-LABEL: test_not_local_load_transposed_opIdx_mismatch
-// CHECK: tt.load {{.*}} : tensor<64x16x!tt.ptr<f16>, #blocked>
-// CHECK-NOT: ttg.local_alloc
-// CHECK-NOT: ttg.local_load
-// CHECK-NOT: ttg.local_load
-// CHECK: tt.dot {{.+}}: tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<64x64xf32, #mma>
-// CHECK: tt.trans {{.+}} {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-// CHECK: tt.dot {{.+}} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x64xf32, #mma1>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_not_local_load_transposed_opIdx_mismatch(
-    %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
-    %out0 : tensor<64x64x!tt.ptr<f32>, #blocked>,
-    %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
-  ) {
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma1>
-    %cst_1 = arith.constant dense<0.693147182> : tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
-    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
-
-    %0 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
-    %1 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
-    %2 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>>
-    %3 = tt.dot %2, %cst_1, %cst_0 : tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<64x64xf32, #mma1>
-    %4 = tt.trans %1 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-    %5 = tt.dot %cst_2, %4, %cst_3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
-
-    %6 = ttg.convert_layout %3 : tensor<64x64xf32, #mma1> -> tensor<64x64xf32, #blocked>
-    %7 = ttg.convert_layout %5 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
-    tt.store %out0, %6 : tensor<64x64x!tt.ptr<f32>, #blocked>
-    tt.store %out1, %7 : tensor<128x64x!tt.ptr<f32>, #blocked>
-    tt.return
-  }
-}
-
-// -----
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-optimize-dot-operands="arch-generation-name=gfx950" | FileCheck %s --check-prefixes GFX950
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [2, 1], order = [1, 0]}>
 #linear = #ttg.linear<{register = [[0, 0, 1], [0, 0, 2], [0, 0, 4], [1, 0, 0], [2, 0, 0], [0, 32, 0], [0, 64, 0]], lane = [[0, 1, 0], [0, 2, 0], [0, 4, 0], [0, 8, 0], [0, 0, 8], [0, 0, 16]], warp = [[0, 16, 0]], block = []}>
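
Note on the RUN-line change above: the gfx942 invocation and the shared CHECK prefix are removed, so the checks that remain in this file are matched only under the GFX950 prefix of the single gfx950 invocation. A minimal sketch of that pattern follows; the label and the ttg.local_alloc check line are illustrative placeholders, not lines from this commit.

// RUN: triton-opt %s -split-input-file -tritonamdgpu-optimize-dot-operands="arch-generation-name=gfx950" | FileCheck %s --check-prefixes GFX950
// GFX950-LABEL: some_gfx950_only_test
// GFX950: ttg.local_alloc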
