@@ -393,22 +393,20 @@ func.func @data_tiled_1x1x1_tensor_multi_mma(%lhs: tensor<1x1x4x16xf32>, %rhs: t
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]
-// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
 // CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
 // CHECK: scf.forall (%[[THREAD_ID:.+]]) in (64) shared_outs(%[[ACC_ARG:.+]] = %[[ACC]]) -> (tensor<1x1x4x16x4xf32>)
 // CHECK: %[[ID_CLAMPED:.+]] = affine.apply #[[$MAP]](%[[THREAD_ID]])
 // CHECK-DAG: %[[IN_IDS:.+]]:2 = affine.delinearize_index %[[ID_CLAMPED]] into (%[[C4]], %[[C16]])
 // CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1] [1, 1, 1, 1] [1, 1, 1, 1]
 // CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1] [1, 1, 1, 1] [1, 1, 1, 1]
-// CHECK-DAG: %[[ACC_IDS:.+]]:3 = affine.delinearize_index %[[ID_CLAMPED]] into (%[[C4]], %[[C16]], %[[C1]])
 // CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC_ARG]]
-// CHECK-SAME: [0, 0, %[[ACC_IDS]]#0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2] [1, 1, 1, 1, 4] [1, 1, 1, 1, 1]
+// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 1, 1, 4] [1, 1, 1, 1, 1]
 // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]]
 // CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32>
 // CHECK-SAME: : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> into tensor<1x1x1x1x4xf32>
 // CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC_ARG]]
-// CHECK-SAME: [0, 0, %[[ACC_IDS]]#0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2] [1, 1, 1, 1, 4] [1, 1, 1, 1, 1]
+// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 1, 1, 4] [1, 1, 1, 1, 1]
 // CHECK: mapping = [#gpu.thread<linear_dim_0>]

 // -----
@@ -434,24 +432,22 @@ func.func @data_tiled_2x2x4_tensor_multi_mma_unrolled(%lhs: tensor<1x1x2x4x16x4x
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]
-// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
 // CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
 // CHECK: scf.forall (%[[THREAD_ID:.+]]) in (64) shared_outs(%[[ACC_ARG:.+]] = %[[ACC]]) -> (tensor<1x1x2x2x4x16x4xf32>)
 // CHECK: %[[ID_CLAMPED:.+]] = affine.apply #[[$MAP]](%[[THREAD_ID]])
-// CHECK-DAG: %[[IN_IDS:.+]]:4 = affine.delinearize_index %[[ID_CLAMPED]] into (%[[C1]], %[[C4]], %[[C16]], %[[C1]])
+// CHECK-DAG: %[[IN_IDS:.+]]:2 = affine.delinearize_index %[[ID_CLAMPED]] into (%[[C4]], %[[C16]])
 // CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]]
-// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, %[[IN_IDS]]#3] [1, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1]
+// CHECK-SAME: [0, 0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1]
 // CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]]
-// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, %[[IN_IDS]]#3] [1, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1]
-// CHECK-DAG: %[[ACC_IDS:.+]]:5 = affine.delinearize_index %[[ID_CLAMPED]] into (%[[C1]], %[[C1]], %[[C4]], %[[C16]], %[[C1]])
+// CHECK-SAME: [0, 0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1]
 // CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC_ARG]]
-// CHECK-SAME: [0, 0, %[[ACC_IDS]]#0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2, %[[ACC_IDS]]#3, %[[ACC_IDS]]#4] [1, 1, 2, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1]
+// CHECK-SAME: [0, 0, 0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 2, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1]
 // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]]
 // CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32, unroll_m = 2, unroll_n = 2, unroll_k = 4>
 // CHECK-SAME: : tensor<1x1x2x1x1x4xf32>, tensor<1x1x2x1x1x4xf32> into tensor<1x1x2x2x1x1x4xf32>
 // CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC_ARG]]
-// CHECK-SAME: [0, 0, %[[ACC_IDS]]#0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2, %[[ACC_IDS]]#3, %[[ACC_IDS]]#4] [1, 1, 2, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1]
+// CHECK-SAME: [0, 0, 0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 2, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1]
 // CHECK: mapping = [#gpu.thread<linear_dim_0>]

 // -----
@@ -478,25 +474,24 @@ func.func @data_tiled_2x2x4_tensor_multi_mma_unrolled_to_subgroups(%lhs: tensor<
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]
-// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
 // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
 // CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
 // CHECK: scf.forall (%[[THREAD_ID:.+]]) in (256) shared_outs(%[[ACC_ARG:.+]] = %[[ACC]]) -> (tensor<1x1x2x2x4x16x4xf32>)
 // CHECK: %[[ID_CLAMPED_128:.+]] = affine.apply #[[$MAP]](%[[THREAD_ID]])
-// CHECK-DAG: %[[IN_IDS:.+]]:4 = affine.delinearize_index %[[ID_CLAMPED_128]] into (%[[C2]], %[[C4]], %[[C16]], %[[C1]])
+// CHECK-DAG: %[[IN_IDS:.+]]:3 = affine.delinearize_index %[[ID_CLAMPED_128]] into (%[[C2]], %[[C4]], %[[C16]])
 // CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]]
-// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, %[[IN_IDS]]#3] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1]
+// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, 0] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1]
 // CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]]
-// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, %[[IN_IDS]]#3] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1]
+// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, 0] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1]
 // CHECK: %[[ID_CLAMPED_256:.+]] = affine.apply #[[$MAP1]](%[[THREAD_ID]])
-// CHECK-DAG: %[[ACC_IDS:.+]]:5 = affine.delinearize_index %[[ID_CLAMPED_256]] into (%[[C2]], %[[C2]], %[[C4]], %[[C16]], %[[C1]])
+// CHECK-DAG: %[[ACC_IDS:.+]]:4 = affine.delinearize_index %[[ID_CLAMPED_256]] into (%[[C2]], %[[C2]], %[[C4]], %[[C16]])
 // CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC_ARG]]
-// CHECK-SAME: [0, 0, %[[ACC_IDS]]#0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2, %[[ACC_IDS]]#3, %[[ACC_IDS]]#4] [1, 1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1]
+// CHECK-SAME: [0, 0, %[[ACC_IDS]]#0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2, %[[ACC_IDS]]#3, 0] [1, 1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1]
 // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]]
 // CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x4_F32,
 // CHECK-SAME: unroll_m_to_subgroups = 2, unroll_n_to_subgroups = 2, unroll_k = 4>}
 // CHECK-SAME: : tensor<1x1x1x1x1x4xf32>, tensor<1x1x1x1x1x4xf32> into tensor<1x1x1x1x1x1x4xf32>
 // CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC_ARG]]
-// CHECK-SAME: [0, 0, %[[ACC_IDS]]#0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2, %[[ACC_IDS]]#3, %[[ACC_IDS]]#4] [1, 1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1]
+// CHECK-SAME: [0, 0, %[[ACC_IDS]]#0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2, %[[ACC_IDS]]#3, 0] [1, 1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1]
 // CHECK: mapping = [#gpu.thread<linear_dim_0>]