Skip to content

Commit 6647f59

Browse files
victor-eds and etiotto authored
[TritonIntelGPUToLLVM] Detect basic sub-group shuffle convert_layout cases (#2531)
Detect basic shuffles and lower them to `gpu.shuffle` operations. Basically, support cases in which we go from each work-item having a single tensor element to each work-item having `sub_group_size` tensor elements, such that element `i` corresponds to the element originally held by work-item `i` in the sub-group. The upstream MLIR pass should handle all integer and floating-point types; drop the code handling type legalization for such types once that is done. Pointer types should still be handled in this project. The code should be extended to support other kinds of shuffles. The multi-sub-group case is not yet implemented. --------- Signed-off-by: victor-eds <[email protected]> Co-authored-by: Ettore Tiotto <[email protected]>
1 parent 988b62b commit 6647f59

File tree

2 files changed

+414
-0
lines changed

2 files changed

+414
-0
lines changed
Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s
2+
3+
// Basic 16x16 shuffle test
4+
5+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
6+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
7+
#sliced = #triton_gpu.slice<{dim = 1, parent = #blocked}>
8+
#sliced1 = #triton_gpu.slice<{dim = 1, parent = #blocked1}>
9+
10+
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
11+
// CHECK-LABEL: llvm.func spir_kernelcc @test_f16(
12+
// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f16)>,
13+
// CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f16)>
14+
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
15+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_4]])
16+
// CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(1 : i32) : i32
17+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_7]])
18+
// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(2 : i32) : i32
19+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_10]])
20+
// CHECK: %[[VAL_13:.*]] = llvm.mlir.constant(3 : i32) : i32
21+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_13]])
22+
// CHECK: %[[VAL_16:.*]] = llvm.mlir.constant(4 : i32) : i32
23+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_16]])
24+
// CHECK: %[[VAL_19:.*]] = llvm.mlir.constant(5 : i32) : i32
25+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_19]])
26+
// CHECK: %[[VAL_22:.*]] = llvm.mlir.constant(6 : i32) : i32
27+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_22]])
28+
// CHECK: %[[VAL_25:.*]] = llvm.mlir.constant(7 : i32) : i32
29+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_25]])
30+
// CHECK: %[[VAL_28:.*]] = llvm.mlir.constant(8 : i32) : i32
31+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_28]])
32+
// CHECK: %[[VAL_31:.*]] = llvm.mlir.constant(9 : i32) : i32
33+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_31]])
34+
// CHECK: %[[VAL_34:.*]] = llvm.mlir.constant(10 : i32) : i32
35+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_34]])
36+
// CHECK: %[[VAL_37:.*]] = llvm.mlir.constant(11 : i32) : i32
37+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_37]])
38+
// CHECK: %[[VAL_40:.*]] = llvm.mlir.constant(12 : i32) : i32
39+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_40]])
40+
// CHECK: %[[VAL_43:.*]] = llvm.mlir.constant(13 : i32) : i32
41+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_43]])
42+
// CHECK: %[[VAL_46:.*]] = llvm.mlir.constant(14 : i32) : i32
43+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_46]])
44+
// CHECK: %[[VAL_49:.*]] = llvm.mlir.constant(15 : i32) : i32
45+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_49]])
46+
// Converts a 16-element f16 tensor from layout #sliced to layout #sliced1.
// The FileCheck directives above expect the single f16 extracted from the
// argument struct to be passed to @_Z17sub_group_shuffleDhj once for each
// constant index 0 through 15, with no type conversion around the calls.
tt.func @test_f16(%arg0: tensor<16xf16, #sliced>) -> tensor<16xf16, #sliced1> {
47+
%0 = triton_gpu.convert_layout %arg0 : tensor<16xf16, #sliced> -> tensor<16xf16, #sliced1>
48+
tt.return %0 : tensor<16xf16, #sliced1>
49+
}
50+
51+
// CHECK-LABEL: llvm.func spir_kernelcc @test_bf16(
52+
// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(bf16)>,
53+
// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(bf16)>
54+
// CHECK: %[[VAL_2:.*]] = llvm.bitcast %[[VAL_1]] : bf16 to i16
55+
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
56+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_4]])
57+
// CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(1 : i32) : i32
58+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_7]])
59+
// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(2 : i32) : i32
60+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_10]])
61+
// CHECK: %[[VAL_13:.*]] = llvm.mlir.constant(3 : i32) : i32
62+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_13]])
63+
// CHECK: %[[VAL_16:.*]] = llvm.mlir.constant(4 : i32) : i32
64+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_16]])
65+
// CHECK: %[[VAL_19:.*]] = llvm.mlir.constant(5 : i32) : i32
66+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_19]])
67+
// CHECK: %[[VAL_22:.*]] = llvm.mlir.constant(6 : i32) : i32
68+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_22]])
69+
// CHECK: %[[VAL_25:.*]] = llvm.mlir.constant(7 : i32) : i32
70+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_25]])
71+
// CHECK: %[[VAL_28:.*]] = llvm.mlir.constant(8 : i32) : i32
72+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_28]])
73+
// CHECK: %[[VAL_31:.*]] = llvm.mlir.constant(9 : i32) : i32
74+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_31]])
75+
// CHECK: %[[VAL_34:.*]] = llvm.mlir.constant(10 : i32) : i32
76+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_34]])
77+
// CHECK: %[[VAL_37:.*]] = llvm.mlir.constant(11 : i32) : i32
78+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_37]])
79+
// CHECK: %[[VAL_40:.*]] = llvm.mlir.constant(12 : i32) : i32
80+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_40]])
81+
// CHECK: %[[VAL_43:.*]] = llvm.mlir.constant(13 : i32) : i32
82+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_43]])
83+
// CHECK: %[[VAL_46:.*]] = llvm.mlir.constant(14 : i32) : i32
84+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_46]])
85+
// CHECK: %[[VAL_49:.*]] = llvm.mlir.constant(15 : i32) : i32
86+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflesj(%[[VAL_2]], %[[VAL_49]])
87+
// CHECK-COUNT-16: llvm.bitcast %{{.*}} : i16 to bf16
88+
// Converts a 16-element bf16 tensor from layout #sliced to layout #sliced1.
// The FileCheck directives above expect the bf16 value to be bitcast to i16,
// shuffled through @_Z17sub_group_shufflesj with constant indices 0 through
// 15, and then 16 bitcasts from i16 back to bf16.
tt.func @test_bf16(%arg0: tensor<16xbf16, #sliced>) -> tensor<16xbf16, #sliced1> {
89+
%0 = triton_gpu.convert_layout %arg0 : tensor<16xbf16, #sliced> -> tensor<16xbf16, #sliced1>
90+
tt.return %0 : tensor<16xbf16, #sliced1>
91+
}
92+
93+
// CHECK-LABEL: llvm.func spir_kernelcc @test_i1(
94+
// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(i1)>,
95+
// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(i1)>
96+
// CHECK: %[[VAL_2:.*]] = llvm.zext %[[VAL_1]] : i1 to i8
97+
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
98+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_4]])
99+
// CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(1 : i32) : i32
100+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_7]])
101+
// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(2 : i32) : i32
102+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_10]])
103+
// CHECK: %[[VAL_13:.*]] = llvm.mlir.constant(3 : i32) : i32
104+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_13]])
105+
// CHECK: %[[VAL_16:.*]] = llvm.mlir.constant(4 : i32) : i32
106+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_16]])
107+
// CHECK: %[[VAL_19:.*]] = llvm.mlir.constant(5 : i32) : i32
108+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_19]])
109+
// CHECK: %[[VAL_22:.*]] = llvm.mlir.constant(6 : i32) : i32
110+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_22]])
111+
// CHECK: %[[VAL_25:.*]] = llvm.mlir.constant(7 : i32) : i32
112+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_25]])
113+
// CHECK: %[[VAL_28:.*]] = llvm.mlir.constant(8 : i32) : i32
114+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_28]])
115+
// CHECK: %[[VAL_31:.*]] = llvm.mlir.constant(9 : i32) : i32
116+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_31]])
117+
// CHECK: %[[VAL_34:.*]] = llvm.mlir.constant(10 : i32) : i32
118+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_34]])
119+
// CHECK: %[[VAL_37:.*]] = llvm.mlir.constant(11 : i32) : i32
120+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_37]])
121+
// CHECK: %[[VAL_40:.*]] = llvm.mlir.constant(12 : i32) : i32
122+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_40]])
123+
// CHECK: %[[VAL_43:.*]] = llvm.mlir.constant(13 : i32) : i32
124+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_43]])
125+
// CHECK: %[[VAL_46:.*]] = llvm.mlir.constant(14 : i32) : i32
126+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_46]])
127+
// CHECK: %[[VAL_49:.*]] = llvm.mlir.constant(15 : i32) : i32
128+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[VAL_2]], %[[VAL_49]])
129+
// CHECK-COUNT-16: llvm.trunc %{{.*}} : i8 to i1
130+
// Converts a 16-element i1 tensor from layout #sliced to layout #sliced1.
// The FileCheck directives above expect the i1 value to be zero-extended to
// i8, shuffled through @_Z17sub_group_shufflecj with constant indices 0
// through 15, and then 16 truncations from i8 back to i1.
tt.func @test_i1(%arg0: tensor<16xi1, #sliced>) -> tensor<16xi1, #sliced1> {
131+
%0 = triton_gpu.convert_layout %arg0 : tensor<16xi1, #sliced> -> tensor<16xi1, #sliced1>
132+
tt.return %0 : tensor<16xi1, #sliced1>
133+
}
134+
135+
// CHECK-LABEL: llvm.func spir_kernelcc @test_ptr(
136+
// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(ptr<1>)>,
137+
// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(ptr<1>)>
138+
// CHECK: %[[VAL_2:.*]] = llvm.ptrtoint %[[VAL_1]] : !llvm.ptr<1> to i64
139+
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
140+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_4]])
141+
// CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(1 : i32) : i32
142+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_7]])
143+
// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(2 : i32) : i32
144+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_10]])
145+
// CHECK: %[[VAL_13:.*]] = llvm.mlir.constant(3 : i32) : i32
146+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_13]])
147+
// CHECK: %[[VAL_16:.*]] = llvm.mlir.constant(4 : i32) : i32
148+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_16]])
149+
// CHECK: %[[VAL_19:.*]] = llvm.mlir.constant(5 : i32) : i32
150+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_19]])
151+
// CHECK: %[[VAL_22:.*]] = llvm.mlir.constant(6 : i32) : i32
152+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_22]])
153+
// CHECK: %[[VAL_25:.*]] = llvm.mlir.constant(7 : i32) : i32
154+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_25]])
155+
// CHECK: %[[VAL_28:.*]] = llvm.mlir.constant(8 : i32) : i32
156+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_28]])
157+
// CHECK: %[[VAL_31:.*]] = llvm.mlir.constant(9 : i32) : i32
158+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_31]])
159+
// CHECK: %[[VAL_34:.*]] = llvm.mlir.constant(10 : i32) : i32
160+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_34]])
161+
// CHECK: %[[VAL_37:.*]] = llvm.mlir.constant(11 : i32) : i32
162+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_37]])
163+
// CHECK: %[[VAL_40:.*]] = llvm.mlir.constant(12 : i32) : i32
164+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_40]])
165+
// CHECK: %[[VAL_43:.*]] = llvm.mlir.constant(13 : i32) : i32
166+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_43]])
167+
// CHECK: %[[VAL_46:.*]] = llvm.mlir.constant(14 : i32) : i32
168+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_46]])
169+
// CHECK: %[[VAL_49:.*]] = llvm.mlir.constant(15 : i32) : i32
170+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_2]], %[[VAL_49]])
171+
// CHECK-COUNT-16: llvm.inttoptr %{{.*}} : i64 to !llvm.ptr<1>
172+
// Converts a 16-element tensor of !tt.ptr<f32> from layout #sliced to layout
// #sliced1. The FileCheck directives above expect the !llvm.ptr<1> value to be
// converted to i64 with ptrtoint, shuffled through @_Z17sub_group_shufflelj
// with constant indices 0 through 15, and then 16 inttoptr conversions back
// to !llvm.ptr<1>.
tt.func @test_ptr(%arg0: tensor<16x!tt.ptr<f32>, #sliced>) -> tensor<16x!tt.ptr<f32>, #sliced1> {
173+
%0 = triton_gpu.convert_layout %arg0 : tensor<16x!tt.ptr<f32>, #sliced> -> tensor<16x!tt.ptr<f32>, #sliced1>
174+
tt.return %0 : tensor<16x!tt.ptr<f32>, #sliced1>
175+
}
176+
}
177+
178+
// -----
179+
180+
// Sub-group size 32 variant.
181+
182+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [32, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
183+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [32, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 1], order = [0, 1]}>
184+
#sliced = #triton_gpu.slice<{dim = 1, parent = #blocked}>
185+
#sliced1 = #triton_gpu.slice<{dim = 1, parent = #blocked1}>
186+
187+
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
188+
// CHECK-LABEL: llvm.func spir_kernelcc @test_f32(
189+
// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f32)>,
190+
// CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f32)>
191+
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
192+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_4]])
193+
// CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(1 : i32) : i32
194+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_7]])
195+
// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(2 : i32) : i32
196+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_10]])
197+
// CHECK: %[[VAL_13:.*]] = llvm.mlir.constant(3 : i32) : i32
198+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_13]])
199+
// CHECK: %[[VAL_16:.*]] = llvm.mlir.constant(4 : i32) : i32
200+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_16]])
201+
// CHECK: %[[VAL_19:.*]] = llvm.mlir.constant(5 : i32) : i32
202+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_19]])
203+
// CHECK: %[[VAL_22:.*]] = llvm.mlir.constant(6 : i32) : i32
204+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_22]])
205+
// CHECK: %[[VAL_25:.*]] = llvm.mlir.constant(7 : i32) : i32
206+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_25]])
207+
// CHECK: %[[VAL_28:.*]] = llvm.mlir.constant(8 : i32) : i32
208+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_28]])
209+
// CHECK: %[[VAL_31:.*]] = llvm.mlir.constant(9 : i32) : i32
210+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_31]])
211+
// CHECK: %[[VAL_34:.*]] = llvm.mlir.constant(10 : i32) : i32
212+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_34]])
213+
// CHECK: %[[VAL_37:.*]] = llvm.mlir.constant(11 : i32) : i32
214+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_37]])
215+
// CHECK: %[[VAL_40:.*]] = llvm.mlir.constant(12 : i32) : i32
216+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_40]])
217+
// CHECK: %[[VAL_43:.*]] = llvm.mlir.constant(13 : i32) : i32
218+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_43]])
219+
// CHECK: %[[VAL_46:.*]] = llvm.mlir.constant(14 : i32) : i32
220+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_46]])
221+
// CHECK: %[[VAL_49:.*]] = llvm.mlir.constant(15 : i32) : i32
222+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_49]])
223+
// CHECK: %[[VAL_52:.*]] = llvm.mlir.constant(16 : i32) : i32
224+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_52]])
225+
// CHECK: %[[VAL_55:.*]] = llvm.mlir.constant(17 : i32) : i32
226+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_55]])
227+
// CHECK: %[[VAL_58:.*]] = llvm.mlir.constant(18 : i32) : i32
228+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_58]])
229+
// CHECK: %[[VAL_61:.*]] = llvm.mlir.constant(19 : i32) : i32
230+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_61]])
231+
// CHECK: %[[VAL_64:.*]] = llvm.mlir.constant(20 : i32) : i32
232+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_64]])
233+
// CHECK: %[[VAL_67:.*]] = llvm.mlir.constant(21 : i32) : i32
234+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_67]])
235+
// CHECK: %[[VAL_70:.*]] = llvm.mlir.constant(22 : i32) : i32
236+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_70]])
237+
// CHECK: %[[VAL_73:.*]] = llvm.mlir.constant(23 : i32) : i32
238+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_73]])
239+
// CHECK: %[[VAL_76:.*]] = llvm.mlir.constant(24 : i32) : i32
240+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_76]])
241+
// CHECK: %[[VAL_79:.*]] = llvm.mlir.constant(25 : i32) : i32
242+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_79]])
243+
// CHECK: %[[VAL_82:.*]] = llvm.mlir.constant(26 : i32) : i32
244+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_82]])
245+
// CHECK: %[[VAL_85:.*]] = llvm.mlir.constant(27 : i32) : i32
246+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_85]])
247+
// CHECK: %[[VAL_88:.*]] = llvm.mlir.constant(28 : i32) : i32
248+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_88]])
249+
// CHECK: %[[VAL_91:.*]] = llvm.mlir.constant(29 : i32) : i32
250+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_91]])
251+
// CHECK: %[[VAL_94:.*]] = llvm.mlir.constant(30 : i32) : i32
252+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_94]])
253+
// CHECK: %[[VAL_97:.*]] = llvm.mlir.constant(31 : i32) : i32
254+
// CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_97]])
255+
// Converts a 32-element f32 tensor from layout #sliced to layout #sliced1 in
// a module declared with "triton_gpu.threads-per-warp" = 32. The FileCheck
// directives above expect the single f32 extracted from the argument struct
// to be passed to @_Z17sub_group_shufflefj once for each constant index 0
// through 31.
tt.func @test_f32(%arg0: tensor<32xf32, #sliced>) -> tensor<32xf32, #sliced1> {
256+
%0 = triton_gpu.convert_layout %arg0 : tensor<32xf32, #sliced> -> tensor<32xf32, #sliced1>
257+
tt.return %0 : tensor<32xf32, #sliced1>
258+
}
259+
}

0 commit comments

Comments
 (0)