Skip to content

Commit 34379e6

Browse files
authored
Give `inner_tiled` a strict verifier and explicit semantics with boolean parameters `distributed` and `opaque` (iree-org#22369)
Fixes iree-org#22336. Discussed on Discord around https://discord.com/channels/689900678990135345/1254843174111678555/1430237660781215794

Signed-off-by: Benoit Jacob <[email protected]>
1 parent 20be10d commit 34379e6

36 files changed: +805 / -336 lines changed

compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMBuiltinManager.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,8 @@ FailureOr<ModuleOp> ROCMDialect::getOrLoadBuiltinModule(StringRef path) {
3232
// succeeds so that other threads don't have to retry.
3333
OwningOpRef<ModuleOp> &parsedLibrary = builtinModules[path];
3434

35-
parsedLibrary = parseSourceString<mlir::ModuleOp>(maybeBuiltin.value(), ctx);
35+
parsedLibrary = parseSourceString<mlir::ModuleOp>(maybeBuiltin.value(), ctx,
36+
/*sourceName=*/path);
3637
if (!parsedLibrary) {
3738
return failure();
3839
}

compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -411,7 +411,8 @@ module attributes {
411411
%2 = iree_codegen.inner_tiled ins(%arg0, %arg1) outs(%1){
412412
indexing_maps = [#map1, #map2, #map3],
413413
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
414-
kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>
414+
kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>,
415+
semantics = #iree_gpu.mma_semantics<distributed = false, opaque = false>
415416
} : tensor<1x128x2x8x4x16x8xf8E4M3FNUZ>, tensor<16x128x4x4x4x16x8xf8E4M3FNUZ> into tensor<1x16x2x4x8x4x4x16x4xf32>
416417
return %2 : tensor<1x16x2x4x8x4x4x16x4xf32>
417418
}
@@ -445,7 +446,8 @@ module attributes {
445446
%2 = iree_codegen.inner_tiled ins(%arg0, %arg1) outs(%1){
446447
indexing_maps = [#map1, #map2, #map3],
447448
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
448-
kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, intrinsics_m = 8, intrinsics_n = 2, subgroups_n = 8, intrinsics_k = 2>
449+
kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, intrinsics_m = 8, intrinsics_n = 2, subgroups_n = 8, intrinsics_k = 2>,
450+
semantics = #iree_gpu.mma_semantics<distributed = false, opaque = false>
449451
} : tensor<1x64x8x4x16x2x8xf8E4M3FNUZ>, tensor<4x64x8x2x4x16x2x8xf8E4M3FNUZ> into tensor<1x4x8x8x2x4x16x4xf32>
450452
return %2 : tensor<1x4x8x8x2x4x16x4xf32>
451453
}
@@ -479,7 +481,8 @@ module attributes {
479481
%2 = iree_codegen.inner_tiled ins(%arg0, %arg1) outs(%1){
480482
indexing_maps = [#map1, #map2, #map3],
481483
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
482-
kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>
484+
kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>,
485+
semantics = #iree_gpu.mma_semantics<distributed = false, opaque = false>
483486
} : tensor<1x256x2x8x4x16x4xf16>, tensor<501x256x4x4x4x16x4xf16> into tensor<1x501x2x4x8x4x4x16x4xf32>
484487
return %2 : tensor<1x501x2x4x8x4x4x16x4xf32>
485488
}

compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_dt_matmul_f16.mlir

Lines changed: 16 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,8 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
118118
%dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0_t, %rhs_vec_0_t) outs(%iter) {
119119
indexing_maps = #contraction_accesses,
120120
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
121-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
121+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
122+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
122123
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
123124

124125
rocdl.s.setprio 0
@@ -152,7 +153,8 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
152153
%dot1 = iree_codegen.inner_tiled ins(%lhs_vec_1_t, %rhs_vec_1_t) outs(%dot0) {
153154
indexing_maps = #contraction_accesses,
154155
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
155-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
156+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
157+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
156158
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
157159

158160
rocdl.s.setprio 0
@@ -177,7 +179,8 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
177179
%dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2_t, %rhs_vec_2_t) outs(%dot1) {
178180
indexing_maps = #contraction_accesses,
179181
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
180-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
182+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
183+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
181184
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
182185

183186
rocdl.s.setprio 0
@@ -202,7 +205,8 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
202205
%dot3 = iree_codegen.inner_tiled ins(%lhs_vec_3_t, %rhs_vec_3_t) outs(%dot2) {
203206
indexing_maps = #contraction_accesses,
204207
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
205-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
208+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
209+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
206210
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
207211

208212
rocdl.s.setprio 0
@@ -224,7 +228,8 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
224228
%dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0_t, %rhs_vec_0_t) outs(%3) {
225229
indexing_maps = #contraction_accesses,
226230
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
227-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
231+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
232+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
228233
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
229234

230235
%lhs_vec_1 = vector.transfer_read %lhs_shared[%c1, %m_outer, %ids#2, %c0], %cst {in_bounds = [true, true, true, true]} : !shared_ty, vector<1x8x1x4xf16>
@@ -235,7 +240,8 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
235240
%dot1 = iree_codegen.inner_tiled ins(%lhs_vec_1_t, %rhs_vec_1_t) outs(%dot0) {
236241
indexing_maps = #contraction_accesses,
237242
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
238-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
243+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
244+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
239245
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
240246

241247
%lhs_vec_2 = vector.transfer_read %lhs_shared[%c2, %m_outer, %ids#2, %c0], %cst {in_bounds = [true, true, true, true]} : !shared_ty, vector<1x8x1x4xf16>
@@ -246,7 +252,8 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
246252
%dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2_t, %rhs_vec_2_t) outs(%dot1) {
247253
indexing_maps = #contraction_accesses,
248254
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
249-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
255+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
256+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
250257
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
251258

252259
%lhs_vec_3 = vector.transfer_read %lhs_shared[%c3, %m_outer, %ids#2, %c0], %cst {in_bounds = [true, true, true, true]} : !shared_ty, vector<1x8x1x4xf16>
@@ -257,7 +264,8 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
257264
%dot3 = iree_codegen.inner_tiled ins(%lhs_vec_3_t, %rhs_vec_3_t) outs(%dot2) {
258265
indexing_maps = #contraction_accesses,
259266
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
260-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
267+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
268+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
261269
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
262270

263271
%empty = tensor.empty() : tensor<1x1x1x1x8x4x1x1x4xf32>

compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_dt_matmul_f8E4M3FNUZ.mlir

Lines changed: 24 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
134134
%dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0_t, %rhs_vec_0_t) outs(%iter) {
135135
indexing_maps = #contraction_accesses,
136136
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
137-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
137+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
138+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
138139
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
139140

140141
rocdl.s.setprio 0
@@ -168,7 +169,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
168169
%dot1 = iree_codegen.inner_tiled ins(%lhs_vec_1_t, %rhs_vec_1_t) outs(%dot0) {
169170
indexing_maps = #contraction_accesses,
170171
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
171-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
172+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
173+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
172174
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
173175

174176
rocdl.s.setprio 0
@@ -194,7 +196,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
194196
%dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2_t, %rhs_vec_2_t) outs(%dot1) {
195197
indexing_maps = #contraction_accesses,
196198
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
197-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
199+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
200+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
198201
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
199202

200203
rocdl.s.setprio 0
@@ -219,7 +222,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
219222
%dot3 = iree_codegen.inner_tiled ins(%lhs_vec_3_t, %rhs_vec_3_t) outs(%dot2) {
220223
indexing_maps = #contraction_accesses,
221224
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
222-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
225+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
226+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
223227
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
224228

225229
rocdl.s.setprio 0
@@ -241,7 +245,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
241245
%dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0_t, %rhs_vec_0_t) outs(%3) {
242246
indexing_maps = #contraction_accesses,
243247
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
244-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
248+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
249+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
245250
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
246251

247252
%lhs_vec_1 = vector.transfer_read %lhs_shared[%c1, %m_outer, %ids#2, %c0], %cst {in_bounds = [true, true, true, true]} : !shared_ty, vector<1x8x1x8xf8E4M3FNUZ>
@@ -252,7 +257,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
252257
%dot1 = iree_codegen.inner_tiled ins(%lhs_vec_1_t, %rhs_vec_1_t) outs(%dot0) {
253258
indexing_maps = #contraction_accesses,
254259
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
255-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
260+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
261+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
256262
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
257263

258264
%lhs_vec_2 = vector.transfer_read %lhs_shared[%c2, %m_outer, %ids#2, %c0], %cst {in_bounds = [true, true, true, true]} : !shared_ty, vector<1x8x1x8xf8E4M3FNUZ>
@@ -263,7 +269,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
263269
%dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2_t, %rhs_vec_2_t) outs(%dot1) {
264270
indexing_maps = #contraction_accesses,
265271
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
266-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
272+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
273+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
267274
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
268275

269276
%lhs_vec_3 = vector.transfer_read %lhs_shared[%c3, %m_outer, %ids#2, %c0], %cst {in_bounds = [true, true, true, true]} : !shared_ty, vector<1x8x1x8xf8E4M3FNUZ>
@@ -274,7 +281,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
274281
%dot3 = iree_codegen.inner_tiled ins(%lhs_vec_3_t, %rhs_vec_3_t) outs(%dot2) {
275282
indexing_maps = #contraction_accesses,
276283
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
277-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
284+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
285+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
278286
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
279287

280288
%empty = tensor.empty() : tensor<1x1x1x1x8x4x1x1x4xf32>
@@ -401,7 +409,8 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
401409
%dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0_t, %rhs_vec_0_t) outs(%iter) {
402410
indexing_maps = #contraction_accesses,
403411
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
404-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
412+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
413+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
405414
} : vector<8x2x1x8xf8E4M3FNUZ>, vector<2x2x1x8xf8E4M3FNUZ> into vector<8x2x1x4xf32>
406415

407416
rocdl.s.setprio 0
@@ -424,7 +433,8 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
424433
%dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2_t, %rhs_vec_2_t) outs(%dot0) {
425434
indexing_maps = #contraction_accesses,
426435
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
427-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
436+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
437+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
428438
} : vector<8x2x1x8xf8E4M3FNUZ>, vector<2x2x1x8xf8E4M3FNUZ> into vector<8x2x1x4xf32>
429439

430440
rocdl.s.setprio 0
@@ -453,13 +463,15 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
453463
%dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0_t, %rhs_vec_0_t) outs(%3) {
454464
indexing_maps = #contraction_accesses,
455465
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
456-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
466+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
467+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
457468
} : vector<8x2x1x8xf8E4M3FNUZ>, vector<2x2x1x8xf8E4M3FNUZ> into vector<8x2x1x4xf32>
458469

459470
%dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2_t, %rhs_vec_2_t) outs(%dot0) {
460471
indexing_maps = #contraction_accesses,
461472
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
462-
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>
473+
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
474+
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
463475
} : vector<8x2x1x8xf8E4M3FNUZ>, vector<2x2x1x8xf8E4M3FNUZ> into vector<8x2x1x4xf32>
464476

465477
%empty = tensor.empty() : tensor<1x1x1x8x2x1x1x4xf32>

0 commit comments

Comments
 (0)