@@ -134,7 +134,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
134134 %dot0 = iree_codegen.inner_tiled ins (%lhs_vec_0_t , %rhs_vec_0_t ) outs (%iter ) {
135135 indexing_maps = #contraction_accesses ,
136136 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
137- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
137+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
138+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
138139 } : vector <8 x1 x1 x8 xf8 E4 M3 FNUZ>, vector <4 x1 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x4 x1 x4 xf32 >
139140
140141 rocdl.s.setprio 0
@@ -168,7 +169,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
168169 %dot1 = iree_codegen.inner_tiled ins (%lhs_vec_1_t , %rhs_vec_1_t ) outs (%dot0 ) {
169170 indexing_maps = #contraction_accesses ,
170171 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
171- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
172+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
173+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
172174 } : vector <8 x1 x1 x8 xf8 E4 M3 FNUZ>, vector <4 x1 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x4 x1 x4 xf32 >
173175
174176 rocdl.s.setprio 0
@@ -194,7 +196,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
194196 %dot2 = iree_codegen.inner_tiled ins (%lhs_vec_2_t , %rhs_vec_2_t ) outs (%dot1 ) {
195197 indexing_maps = #contraction_accesses ,
196198 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
197- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
199+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
200+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
198201 } : vector <8 x1 x1 x8 xf8 E4 M3 FNUZ>, vector <4 x1 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x4 x1 x4 xf32 >
199202
200203 rocdl.s.setprio 0
@@ -219,7 +222,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
219222 %dot3 = iree_codegen.inner_tiled ins (%lhs_vec_3_t , %rhs_vec_3_t ) outs (%dot2 ) {
220223 indexing_maps = #contraction_accesses ,
221224 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
222- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
225+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
226+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
223227 } : vector <8 x1 x1 x8 xf8 E4 M3 FNUZ>, vector <4 x1 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x4 x1 x4 xf32 >
224228
225229 rocdl.s.setprio 0
@@ -241,7 +245,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
241245 %dot0 = iree_codegen.inner_tiled ins (%lhs_vec_0_t , %rhs_vec_0_t ) outs (%3 ) {
242246 indexing_maps = #contraction_accesses ,
243247 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
244- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
248+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
249+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
245250 } : vector <8 x1 x1 x8 xf8 E4 M3 FNUZ>, vector <4 x1 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x4 x1 x4 xf32 >
246251
247252 %lhs_vec_1 = vector.transfer_read %lhs_shared [%c1 , %m_outer , %ids#2 , %c0 ], %cst {in_bounds = [true , true , true , true ]} : !shared_ty , vector <1 x8 x1 x8 xf8 E4 M3 FNUZ>
@@ -252,7 +257,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
252257 %dot1 = iree_codegen.inner_tiled ins (%lhs_vec_1_t , %rhs_vec_1_t ) outs (%dot0 ) {
253258 indexing_maps = #contraction_accesses ,
254259 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
255- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
260+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
261+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
256262 } : vector <8 x1 x1 x8 xf8 E4 M3 FNUZ>, vector <4 x1 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x4 x1 x4 xf32 >
257263
258264 %lhs_vec_2 = vector.transfer_read %lhs_shared [%c2 , %m_outer , %ids#2 , %c0 ], %cst {in_bounds = [true , true , true , true ]} : !shared_ty , vector <1 x8 x1 x8 xf8 E4 M3 FNUZ>
@@ -263,7 +269,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
263269 %dot2 = iree_codegen.inner_tiled ins (%lhs_vec_2_t , %rhs_vec_2_t ) outs (%dot1 ) {
264270 indexing_maps = #contraction_accesses ,
265271 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
266- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
272+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
273+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
267274 } : vector <8 x1 x1 x8 xf8 E4 M3 FNUZ>, vector <4 x1 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x4 x1 x4 xf32 >
268275
269276 %lhs_vec_3 = vector.transfer_read %lhs_shared [%c3 , %m_outer , %ids#2 , %c0 ], %cst {in_bounds = [true , true , true , true ]} : !shared_ty , vector <1 x8 x1 x8 xf8 E4 M3 FNUZ>
@@ -274,7 +281,8 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
274281 %dot3 = iree_codegen.inner_tiled ins (%lhs_vec_3_t , %rhs_vec_3_t ) outs (%dot2 ) {
275282 indexing_maps = #contraction_accesses ,
276283 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
277- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
284+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
285+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
278286 } : vector <8 x1 x1 x8 xf8 E4 M3 FNUZ>, vector <4 x1 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x4 x1 x4 xf32 >
279287
280288 %empty = tensor.empty () : tensor <1 x1 x1 x1 x8 x4 x1 x1 x4 xf32 >
@@ -401,7 +409,8 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
401409 %dot0 = iree_codegen.inner_tiled ins (%lhs_vec_0_t , %rhs_vec_0_t ) outs (%iter ) {
402410 indexing_maps = #contraction_accesses ,
403411 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
404- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
412+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
413+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
405414 } : vector <8 x2 x1 x8 xf8 E4 M3 FNUZ>, vector <2 x2 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x2 x1 x4 xf32 >
406415
407416 rocdl.s.setprio 0
@@ -424,7 +433,8 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
424433 %dot2 = iree_codegen.inner_tiled ins (%lhs_vec_2_t , %rhs_vec_2_t ) outs (%dot0 ) {
425434 indexing_maps = #contraction_accesses ,
426435 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
427- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
436+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
437+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
428438 } : vector <8 x2 x1 x8 xf8 E4 M3 FNUZ>, vector <2 x2 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x2 x1 x4 xf32 >
429439
430440 rocdl.s.setprio 0
@@ -453,13 +463,15 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
453463 %dot0 = iree_codegen.inner_tiled ins (%lhs_vec_0_t , %rhs_vec_0_t ) outs (%3 ) {
454464 indexing_maps = #contraction_accesses ,
455465 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
456- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
466+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
467+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
457468 } : vector <8 x2 x1 x8 xf8 E4 M3 FNUZ>, vector <2 x2 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x2 x1 x4 xf32 >
458469
459470 %dot2 = iree_codegen.inner_tiled ins (%lhs_vec_2_t , %rhs_vec_2_t ) outs (%dot0 ) {
460471 indexing_maps = #contraction_accesses ,
461472 iterator_types = [#linalg.iterator_type <parallel >, #linalg.iterator_type <parallel >, #linalg.iterator_type <reduction >],
462- kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >
473+ kind = #iree_gpu.mma_layout <MFMA_F32_16x16x32_F8E4M3FNUZ >,
474+ semantics = #iree_gpu.mma_semantics <distributed = true , opaque = false >
463475 } : vector <8 x2 x1 x8 xf8 E4 M3 FNUZ>, vector <2 x2 x1 x8 xf8 E4 M3 FNUZ> into vector <8 x2 x1 x4 xf32 >
464476
465477 %empty = tensor.empty () : tensor <1 x1 x1 x8 x2 x1 x1 x4 xf32 >