@@ -124,21 +124,28 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
124
124
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ], A = [8 , 16 ], B = [16 , 16 ], C = [8 , 16 ]}>
125
125
module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
126
126
// CHECK-LABEL: llvm.func spir_kernelcc @prefetch_tensor_of_pointers
127
- tt.func public @prefetch_tensor_of_pointers (%tensor_of_ptr: tensor <64 x32 x!tt.ptr <f16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>) {
127
+ tt.func public @prefetch_tensor_of_pointers (%arg0: !tt.ptr <f16 >) {
128
+ %0 = tt.make_range {end = 64 : i32 , start = 0 : i32 } : tensor <64 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
129
+ %1 = tt.expand_dims %0 {axis = 1 : i32 } : tensor <64 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>> -> tensor <64 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
130
+ %2 = arith.constant dense <64 > : tensor <64 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
131
+ %3 = arith.muli %1 , %2 : tensor <64 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
132
+ %4 = tt.make_range {end = 32 : i32 , start = 0 : i32 } : tensor <32 xi32 , #ttg.slice <{dim = 0 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
133
+ %5 = tt.expand_dims %4 {axis = 0 : i32 } : tensor <32 xi32 , #ttg.slice <{dim = 0 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>> -> tensor <1 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
134
+ %6 = tt.broadcast %3 : tensor <64 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>> -> tensor <64 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
135
+ %7 = tt.broadcast %5 : tensor <1 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>> -> tensor <64 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
136
+ %8 = arith.addi %6 , %7 : tensor <64 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
137
+ %9 = tt.splat %arg0 : !tt.ptr <f16 > -> tensor <64 x32 x!tt.ptr <f16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
138
+ %tensor_of_ptr = tt.addptr %9 , %8 : tensor <64 x32 x!tt.ptr <f16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>, tensor <64 x32 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
139
+
140
+ // CHECK: %[[ADDR_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
128
141
// CHECK: %[[ADDR_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
129
142
// CHECK: %[[ADDR_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
130
143
// CHECK: %[[ADDR_16:.*]] = llvm.extractvalue {{.*}}[16] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
131
144
// CHECK: %[[ADDR_32:.*]] = llvm.extractvalue {{.*}}[32] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
132
145
// CHECK: %[[ADDR_48:.*]] = llvm.extractvalue {{.*}}[48] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)>
133
146
// CHECK: %[[BASE_WIDTH:.*]] = llvm.mlir.constant(64 : i32) : i32
134
- // CHECK: %[[VAL_13:.*]] = llvm.ptrtoint %[[ADDR_0]] : !llvm.ptr<1> to i64
135
- // CHECK: %[[VAL_14:.*]] = llvm.ptrtoint %[[ADDR_1]] : !llvm.ptr<1> to i64
136
- // CHECK: %[[PITCH:.*]] = llvm.sub %[[VAL_14]], %[[VAL_13]] : i64
137
- // CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
138
- // CHECK: %[[UNIFIED_PITCH:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[PITCH]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64
139
- // CHECK: %[[UNIFIED_PITCH_I32:.*]] = llvm.trunc %[[UNIFIED_PITCH]] : i64 to i32
140
- // CHECK: %[[PITCH_IN_BYTES_I32:.*]] = llvm.intr.umax(%[[UNIFIED_PITCH_I32]], %[[BASE_WIDTH]]) : (i32, i32) -> i32
141
- // CHECK-DAG: %[[BASE_HEIGHT:.*]] = llvm.mlir.constant(8 : i32) : i32
147
+ // CHECK: %[[PITCH:.*]] = llvm.mlir.constant(128 : i32) : i32
148
+ // CHECK: %[[BASE_HEIGHT:.*]] = llvm.mlir.constant(8 : i32) : i32
142
149
// CHECK: %[[CST_0_:.*]] = llvm.mlir.constant(0 : i32) : i32
143
150
// CHECK: llvm.mlir.constant(0 : i32) : i32
144
151
@@ -151,7 +158,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
151
158
// CHECK: %[[VAL_13:.*]] = llvm.ptrtoint %[[ADDR_0]] : !llvm.ptr<1> to i64
152
159
// CHECK: %[[UNIFIED_BASE:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_13]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64
153
160
// CHECK: %[[VAL_26:.*]] = llvm.inttoptr %[[UNIFIED_BASE]] : i64 to !llvm.ptr<1>
154
- // CHECK: triton_gen.2Dblockprefetch %[[VAL_26]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32 ]], %[[CST_0_]], %[[OFFSET_Y]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
161
+ // CHECK: triton_gen.2Dblockprefetch %[[VAL_26]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH]], %[[CST_0_]], %[[OFFSET_Y]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
155
162
156
163
// CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
157
164
// CHECK: %[[VAL_29:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%{{.*}}, %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8
@@ -162,7 +169,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
162
169
// CHECK: %[[VAL_32:.*]] = llvm.ptrtoint %[[ADDR_16]] : !llvm.ptr<1> to i64
163
170
// CHECK: %[[VAL_33:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_32]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64
164
171
// CHECK: %[[VAL_34:.*]] = llvm.inttoptr %[[VAL_33]] : i64 to !llvm.ptr<1>
165
- // CHECK: triton_gen.2Dblockprefetch %[[VAL_34]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32 ]], %[[CST_0_]], %[[VAL_31]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
172
+ // CHECK: triton_gen.2Dblockprefetch %[[VAL_34]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH]], %[[CST_0_]], %[[VAL_31]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
166
173
167
174
// CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
168
175
// CHECK: %[[VAL_36:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%{{.*}}, %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8
@@ -173,7 +180,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
173
180
// CHECK: %[[VAL_39:.*]] = llvm.ptrtoint %[[ADDR_32]] : !llvm.ptr<1> to i64
174
181
// CHECK: %[[VAL_40:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_39]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64
175
182
// CHECK: %[[VAL_41:.*]] = llvm.inttoptr %[[VAL_40]] : i64 to !llvm.ptr<1>
176
- // CHECK: triton_gen.2Dblockprefetch %[[VAL_41]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32 ]], %[[CST_0_]], %[[VAL_38]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
183
+ // CHECK: triton_gen.2Dblockprefetch %[[VAL_41]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH]], %[[CST_0_]], %[[VAL_38]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
177
184
178
185
// CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
179
186
// CHECK: %[[VAL_43:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%{{.*}}, %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8
@@ -184,7 +191,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
184
191
// CHECK: %[[VAL_46:.*]] = llvm.ptrtoint %[[ADDR_48]] : !llvm.ptr<1> to i64
185
192
// CHECK: %[[VAL_47:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_46]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64
186
193
// CHECK: %[[VAL_48:.*]] = llvm.inttoptr %[[VAL_47]] : i64 to !llvm.ptr<1>
187
- // CHECK: triton_gen.2Dblockprefetch %[[VAL_48]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32 ]], %[[CST_0_]], %[[VAL_45]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
194
+ // CHECK: triton_gen.2Dblockprefetch %[[VAL_48]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH]], %[[CST_0_]], %[[VAL_45]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
188
195
189
196
%mask_tensor = arith.constant dense <1 > : tensor <64 x32 xi1 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
190
197
ttig.prefetch %tensor_of_ptr , %mask_tensor {boundaryCheck = array<i32 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , operandSegmentSizes = array<i32 : 1 , 1 , 1 >, ttig.block_io = " row_major" } : tensor <64 x32 x!tt.ptr <f16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
@@ -199,34 +206,6 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
199
206
200
207
// -----
201
208
202
- // COM: Check that pitch is a constant calculated by AxisInfo analysis, instead of calculating dynamically.
203
- #dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ], A = [8 , 16 ], B = [16 , 16 ], C = [8 , 16 ]}>
204
- module attributes {" ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
205
- // CHECK-LABEL: llvm.func spir_kernelcc @prefetch_tensor_of_pointers
206
- tt.func public @prefetch_tensor_of_pointers (%arg0: i32 , %arg1: !tt.ptr <bf16 >) {
207
- %cst_0 = arith.constant dense <512 > : tensor <128 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
208
- %cst_1 = arith.constant dense <512 > : tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
209
- %c128_i32 = arith.constant 128 : i32
210
- %0 = arith.muli %arg0 , %c128_i32 : i32
211
- %1 = tt.splat %0 : i32 -> tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
212
- %2 = tt.make_range {end = 128 : i32 , start = 0 : i32 } : tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
213
- %3 = arith.addi %1 , %2 : tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
214
- %4 = arith.remsi %3 , %cst_1 : tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>>
215
- %5 = tt.expand_dims %4 {axis = 1 : i32 } : tensor <128 xi32 , #ttg.slice <{dim = 1 , parent = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>}>> -> tensor <128 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
216
- %6 = arith.muli %5 , %cst_0 : tensor <128 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
217
- %7 = tt.broadcast %6 : tensor <128 x1 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>> -> tensor <128 x64 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
218
- %8 = tt.splat %arg1 : !tt.ptr <bf16 > -> tensor <128 x64 x!tt.ptr <bf16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
219
- %9 = tt.addptr %8 , %7 : tensor <128 x64 x!tt.ptr <bf16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>, tensor <128 x64 xi32 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
220
-
221
- // CHECK-DAG: %[[PITCH:.*]] = llvm.mlir.constant(1024 : i32) : i32
222
- // CHECK-COUNT-4: triton_gen.2Dblockprefetch {{.*}}, %[[PITCH]], {{.*}} {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 2, cache_control = L1C_L3C}
223
- ttig.prefetch %9 {boundaryCheck = array<i32 >, cache = 1 : i32 , evict = 1 : i32 , isVolatile = false , operandSegmentSizes = array<i32 : 1 , 1 , 1 >, ttig.block_io = " row_major" } : tensor <128 x64 x!tt.ptr <bf16 >, #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>
224
- tt.return
225
- }
226
- }
227
-
228
- // -----
229
-
230
209
// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and no 2D block prefetch operation gets generated.
231
210
#mma = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 1 , threadsPerWarp = 16 , warpsPerCTA = [4 , 1 ], repCluster = [4 , 1 ], A = [32 , 8 ], B = [8 , 16 ], C = [32 , 16 ]}>
232
211
module attributes {ttig.min_sg_size = 16 : i32 , ttig.support_sg_2d_block , ttig.target_arch = " spir64" , " ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , ttg.target = " xpu" , " ttg.threads-per-warp" = 16 : i32 } {
0 commit comments