
Commit fcae3fc

[ROCM][DT] Update ukernel data layout (#22350)
#22284 changes the data tiling layout by removing `moveCrossThreadOutermost`. This PR updates the ukernels accordingly to ensure correct matching. Numerical correctness and performance have been verified locally on Llama 8B prefill.

Closes: #22349

Signed-off-by: Yu-Zhewen <[email protected]>
1 parent cc164ae

File tree: 4 files changed, +15 -15 lines

compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir
compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_dt_matmul_f16.mlir
compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_dt_matmul_f8E4M3FNUZ.mlir
compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942.mlir
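All four files make the same substitution: the two adjacent size-4 dims in the old LHS layouts (presumably the cross-thread dims, given the removed `moveCrossThreadOutermost` step) fold into a single dim of size 16. A minimal shape-only sketch for the f8 large LHS, using the tensor types from the diffs below; it illustrates only the dimension arithmetic (4 x 4 = 16), not the thread-to-element remapping that #22284 actually performs:

// Shape-only sketch: the old 8-D LHS relates to the new 7-D LHS by merging
// the two adjacent size-4 dims (d5, d6) into one size-16 dim.
%new_lhs = tensor.collapse_shape %old_lhs [[0], [1], [2], [3], [4], [5, 6], [7]]
    : tensor<1x128x2x8x4x4x4x8xf8E4M3FNUZ> into tensor<1x128x2x8x4x16x8xf8E4M3FNUZ>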

compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir

Lines changed: 6 additions & 6 deletions
@@ -362,15 +362,15 @@ module attributes {
 module attributes {
   hal.executable.target = #executable_target_rocm_hsaco_fb
 } {
-  func.func @inner_tiled_f8_large(%arg0: tensor<1x128x2x8x4x4x4x8xf8E4M3FNUZ>, %arg1: tensor<16x128x4x4x4x16x8xf8E4M3FNUZ>) -> tensor<1x16x2x4x8x4x4x16x4xf32> {
+  func.func @inner_tiled_f8_large(%arg0: tensor<1x128x2x8x4x16x8xf8E4M3FNUZ>, %arg1: tensor<16x128x4x4x4x16x8xf8E4M3FNUZ>) -> tensor<1x16x2x4x8x4x4x16x4xf32> {
     %cst = arith.constant 0.000000e+00 : f32
     %0 = tensor.empty() : tensor<1x16x2x4x8x4x4x16x4xf32>
     %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x16x2x4x8x4x4x16x4xf32>) -> tensor<1x16x2x4x8x4x4x16x4xf32>
     %2 = iree_codegen.inner_tiled ins(%arg0, %arg1) outs(%1){
       indexing_maps = [#map1, #map2, #map3],
       iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
       kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>
-    } : tensor<1x128x2x8x4x4x4x8xf8E4M3FNUZ>, tensor<16x128x4x4x4x16x8xf8E4M3FNUZ> into tensor<1x16x2x4x8x4x4x16x4xf32>
+    } : tensor<1x128x2x8x4x16x8xf8E4M3FNUZ>, tensor<16x128x4x4x4x16x8xf8E4M3FNUZ> into tensor<1x16x2x4x8x4x4x16x4xf32>
     return %2 : tensor<1x16x2x4x8x4x4x16x4xf32>
   }
 }
@@ -396,15 +396,15 @@ module attributes {
 module attributes {
   hal.executable.target = #executable_target_rocm_hsaco_fb
 } {
-  func.func @inner_tiled_f8_medium(%arg0: tensor<1x64x8x4x4x4x2x8xf8E4M3FNUZ>, %arg1: tensor<4x64x8x2x4x16x2x8xf8E4M3FNUZ>) -> tensor<1x4x8x8x2x4x16x4xf32> {
+  func.func @inner_tiled_f8_medium(%arg0: tensor<1x64x8x4x16x2x8xf8E4M3FNUZ>, %arg1: tensor<4x64x8x2x4x16x2x8xf8E4M3FNUZ>) -> tensor<1x4x8x8x2x4x16x4xf32> {
     %cst = arith.constant 0.000000e+00 : f32
     %0 = tensor.empty() : tensor<1x4x8x8x2x4x16x4xf32>
     %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x4x8x8x2x4x16x4xf32>) -> tensor<1x4x8x8x2x4x16x4xf32>
     %2 = iree_codegen.inner_tiled ins(%arg0, %arg1) outs(%1){
       indexing_maps = [#map1, #map2, #map3],
       iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
       kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, intrinsics_m = 8, intrinsics_n = 2, subgroups_n = 8, intrinsics_k = 2>
-    } : tensor<1x64x8x4x4x4x2x8xf8E4M3FNUZ>, tensor<4x64x8x2x4x16x2x8xf8E4M3FNUZ> into tensor<1x4x8x8x2x4x16x4xf32>
+    } : tensor<1x64x8x4x16x2x8xf8E4M3FNUZ>, tensor<4x64x8x2x4x16x2x8xf8E4M3FNUZ> into tensor<1x4x8x8x2x4x16x4xf32>
     return %2 : tensor<1x4x8x8x2x4x16x4xf32>
   }
 }
@@ -430,15 +430,15 @@ module attributes {
 module attributes {
   hal.executable.target = #executable_target_rocm_hsaco_fb
 } {
-  func.func @inner_tiled_f16_large(%arg0: tensor<1x256x2x8x4x4x4x4xf16>, %arg1: tensor<501x256x4x4x4x16x4xf16>) -> tensor<1x501x2x4x8x4x4x16x4xf32> {
+  func.func @inner_tiled_f16_large(%arg0: tensor<1x256x2x8x4x16x4xf16>, %arg1: tensor<501x256x4x4x4x16x4xf16>) -> tensor<1x501x2x4x8x4x4x16x4xf32> {
     %cst = arith.constant 0.000000e+00 : f32
     %0 = tensor.empty() : tensor<1x501x2x4x8x4x4x16x4xf32>
     %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x501x2x4x8x4x4x16x4xf32>) -> tensor<1x501x2x4x8x4x4x16x4xf32>
     %2 = iree_codegen.inner_tiled ins(%arg0, %arg1) outs(%1){
       indexing_maps = [#map1, #map2, #map3],
       iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
       kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>
-    } : tensor<1x256x2x8x4x4x4x4xf16>, tensor<501x256x4x4x4x16x4xf16> into tensor<1x501x2x4x8x4x4x16x4xf32>
+    } : tensor<1x256x2x8x4x16x4xf16>, tensor<501x256x4x4x4x16x4xf16> into tensor<1x501x2x4x8x4x4x16x4xf32>
     return %2 : tensor<1x501x2x4x8x4x4x16x4xf32>
   }
 }
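In all three tests only the LHS type changes; the RHS and accumulator types are untouched. Old vs. new LHS layouts, as updated above:

f8 large:  tensor<1x128x2x8x4x4x4x8xf8E4M3FNUZ> -> tensor<1x128x2x8x4x16x8xf8E4M3FNUZ>
f8 medium: tensor<1x64x8x4x4x4x2x8xf8E4M3FNUZ> -> tensor<1x64x8x4x16x2x8xf8E4M3FNUZ>
f16 large: tensor<1x256x2x8x4x4x4x4xf16> -> tensor<1x256x2x8x4x16x4xf16>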

compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_dt_matmul_f16.mlir

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 // RUN: iree-opt %s

 !acc_base_ty = tensor<1x1x2x4x8x4x4x16x4xf32>
-!lhs_base_ty = tensor<1x?x2x8x4x4x4x4xf16>
+!lhs_base_ty = tensor<1x?x2x8x4x16x4xf16>
 !lhs_expand_ty = tensor<1x?x4x2x8x4x4x2x2x4xf16>
 !rhs_base_ty = tensor<1x?x4x4x4x16x4xf16>
 !rhs_expand_ty = tensor<1x?x4x4x4x4x8x2x4xf16>
@@ -44,7 +44,7 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
   %dim = tensor.dim %rhs_base, %c1 : !rhs_base_ty
   %nDim = arith.divui %dim, %c4 : index

-  %lhs_expand = tensor.expand_shape %lhs_base [[0], [1, 2], [3], [4], [5], [6], [7, 8], [9]] output_shape [1, %nDim, 2, 2, 8, 4, 4, 2, 2, 4] : !lhs_base_ty into !lhs_expand_ty
+  %lhs_expand = tensor.expand_shape %lhs_base [[0], [1, 2], [3], [4], [5], [6, 7, 8], [9]] output_shape [1, %nDim, 2, 2, 8, 4, 4, 2, 2, 4] : !lhs_base_ty into !lhs_expand_ty
   %rhs_expand = tensor.expand_shape %rhs_base [[0], [1, 2], [3], [4], [5], [6, 7], [8]] output_shape [1, %nDim, 2, 4, 4, 4, 8, 2, 4] : !rhs_base_ty into !rhs_expand_ty

   %lhs = tensor.collapse_shape %lhs_expand [[0, 1], [2], [3, 4], [5, 6, 7], [8, 9]] : !lhs_expand_ty into !in_ty
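Because the new `!lhs_base_ty` has one dim fewer, the reassociation gains a three-way group: source dim 5 (size 16) now expands through [6, 7, 8] into 4 x 2 x 2. A hypothetical static-shape instance of the updated expand (K tile count fixed so that %nDim = 1; the function name is illustrative only), which should verify under iree-opt:

func.func @check_lhs_expand_f16(%lhs: tensor<1x4x2x8x4x16x4xf16>) -> tensor<1x1x4x2x8x4x4x2x2x4xf16> {
  // Same reassociation as the ukernel above, on fully static shapes:
  // dim 5 (16) expands into 4 x 2 x 2 via group [6, 7, 8].
  %0 = tensor.expand_shape %lhs [[0], [1, 2], [3], [4], [5], [6, 7, 8], [9]]
      output_shape [1, 1, 4, 2, 8, 4, 4, 2, 2, 4]
      : tensor<1x4x2x8x4x16x4xf16> into tensor<1x1x4x2x8x4x4x2x2x4xf16>
  return %0 : tensor<1x1x4x2x8x4x4x2x2x4xf16>
}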

compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_dt_matmul_f8E4M3FNUZ.mlir

Lines changed: 4 additions & 4 deletions
@@ -1,15 +1,15 @@
 // RUN: iree-opt %s

 !acc_base_ty = tensor<1x1x2x4x8x4x4x16x4xf32>
-!lhs_base_ty = tensor<1x?x2x8x4x4x4x8xf8E4M3FNUZ>
+!lhs_base_ty = tensor<1x?x2x8x4x16x8xf8E4M3FNUZ>
 !lhs_expand_ty = tensor<1x?x4x2x8x4x4x2x2x8xf8E4M3FNUZ>
 !rhs_base_ty = tensor<1x?x4x4x4x16x8xf8E4M3FNUZ>
 !rhs_expand_ty = tensor<1x?x4x4x4x4x8x2x8xf8E4M3FNUZ>
 !in_ty = tensor<?x4x16x32x16xf8E4M3FNUZ>
 !shared_ty = memref<4x16x64x8xf8E4M3FNUZ, #gpu.address_space<workgroup>>

 !m_acc_base_ty = tensor<1x1x8x8x2x4x16x4xf32>
-!m_lhs_base_ty = tensor<1x?x8x4x4x4x2x8xf8E4M3FNUZ>
+!m_lhs_base_ty = tensor<1x?x8x4x16x2x8xf8E4M3FNUZ>
 !m_lhs_expand_ty = tensor<1x?x2x8x4x4x4x2x8xf8E4M3FNUZ>
 !m_rhs_base_ty = tensor<1x?x8x2x4x16x2x8xf8E4M3FNUZ>
 !m_rhs_expand_ty = tensor<1x?x2x8x2x4x16x2x8xf8E4M3FNUZ>
@@ -61,7 +61,7 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
   %dim = tensor.dim %rhs_base, %c1 : !rhs_base_ty
   %nDim = arith.divui %dim, %c4 : index

-  %lhs_expand = tensor.expand_shape %lhs_base [[0], [1, 2], [3], [4], [5], [6], [7, 8], [9]] output_shape [1, %nDim, 4, 2, 8, 4, 4, 2, 2, 8] : !lhs_base_ty into !lhs_expand_ty
+  %lhs_expand = tensor.expand_shape %lhs_base [[0], [1, 2], [3], [4], [5], [6, 7, 8], [9]] output_shape [1, %nDim, 4, 2, 8, 4, 4, 2, 2, 8] : !lhs_base_ty into !lhs_expand_ty
   %rhs_expand = tensor.expand_shape %rhs_base [[0], [1, 2], [3], [4], [5], [6, 7], [8]] output_shape [1, %nDim, 4, 4, 4, 4, 8, 2, 8] : !rhs_base_ty into !rhs_expand_ty

   %lhs = tensor.collapse_shape %lhs_expand [[0, 1], [2], [3, 4], [5, 6, 7], [8, 9]] : !lhs_expand_ty into !in_ty
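The large f8 variant takes the same reassociation update as the f16 ukernel; a hypothetical static-shape instance (%nDim fixed to 1, illustrative name) with dim 5 (size 16) expanding through [6, 7, 8] into 4 x 2 x 2:

func.func @check_lhs_expand_f8_large(%lhs: tensor<1x4x2x8x4x16x8xf8E4M3FNUZ>) -> tensor<1x1x4x2x8x4x4x2x2x8xf8E4M3FNUZ> {
  // Mirrors the updated !lhs_base_ty -> !lhs_expand_ty expand with static dims.
  %0 = tensor.expand_shape %lhs [[0], [1, 2], [3], [4], [5], [6, 7, 8], [9]]
      output_shape [1, 1, 4, 2, 8, 4, 4, 2, 2, 8]
      : tensor<1x4x2x8x4x16x8xf8E4M3FNUZ> into tensor<1x1x4x2x8x4x4x2x2x8xf8E4M3FNUZ>
  return %0 : tensor<1x1x4x2x8x4x4x2x2x8xf8E4M3FNUZ>
}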
@@ -319,7 +319,7 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
   %dim = tensor.dim %rhs_base, %c1 : !m_rhs_base_ty
   %nDim = arith.divui %dim, %c2 : index

-  %lhs_expand = tensor.expand_shape %lhs_base [[0], [1, 2], [3], [4], [5], [6], [7], [8]] output_shape [1, %nDim, 2, 8, 4, 4, 4, 2, 8] : !m_lhs_base_ty into !m_lhs_expand_ty
+  %lhs_expand = tensor.expand_shape %lhs_base [[0], [1, 2], [3], [4], [5, 6], [7], [8]] output_shape [1, %nDim, 2, 8, 4, 4, 4, 2, 8] : !m_lhs_base_ty into !m_lhs_expand_ty
   %rhs_expand = tensor.expand_shape %rhs_base [[0], [1, 2], [3], [4], [5], [6], [7], [8]] output_shape [1, %nDim, 2, 8, 2, 4, 16, 2, 8] : !m_rhs_base_ty into !m_rhs_expand_ty

   %lhs = tensor.collapse_shape %lhs_expand [[0, 1], [2], [3], [4, 5, 6], [7, 8]] : !m_lhs_expand_ty into !m_lhs_ty
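For the medium variant the folded dim splits back as a plain 4 x 4: group [5, 6] replaces the former one-to-one groups [5] and [6]. A hypothetical static-shape instance (%nDim fixed to 1, so the source K dim is 2; illustrative name):

func.func @check_m_lhs_expand_f8(%lhs: tensor<1x2x8x4x16x2x8xf8E4M3FNUZ>) -> tensor<1x1x2x8x4x4x4x2x8xf8E4M3FNUZ> {
  // Mirrors the updated !m_lhs_base_ty -> !m_lhs_expand_ty expand with static
  // dims: dim 4 (16) expands into 4 x 4 via group [5, 6].
  %0 = tensor.expand_shape %lhs [[0], [1, 2], [3], [4], [5, 6], [7], [8]]
      output_shape [1, 1, 2, 8, 4, 4, 4, 2, 8]
      : tensor<1x2x8x4x16x2x8xf8E4M3FNUZ> into tensor<1x1x2x8x4x4x4x2x8xf8E4M3FNUZ>
  return %0 : tensor<1x1x2x8x4x4x4x2x8xf8E4M3FNUZ>
}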

compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942.mlir

Lines changed: 3 additions & 3 deletions
@@ -720,7 +720,7 @@ pdl.pattern @annotate_inner_tiled_f8E4M3FNUZ_medium : benefit(1) {
   %attr_name = pdl.attribute = "iree_codegen.ukernel"
   pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}

-  %lhs_cast_type = pdl.type : tensor<?x?x8x4x4x4x2x8xf8E4M3FNUZ>
+  %lhs_cast_type = pdl.type : tensor<?x?x8x4x16x2x8xf8E4M3FNUZ>
   pdl.apply_native_constraint "matchCastCompatibleType"(%lhs, %lhs_cast_type : !pdl.value, !pdl.type)
   %rhs_cast_type = pdl.type : tensor<?x?x8x2x4x16x2x8xf8E4M3FNUZ>
   pdl.apply_native_constraint "matchCastCompatibleType"(%rhs, %rhs_cast_type : !pdl.value, !pdl.type)
@@ -777,7 +777,7 @@ pdl.pattern @annotate_inner_tiled_f8E4M3FNUZ_large : benefit(2) {
   %attr_name = pdl.attribute = "iree_codegen.ukernel"
   pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}

-  %lhs_cast_type = pdl.type : tensor<?x?x2x8x4x4x4x8xf8E4M3FNUZ>
+  %lhs_cast_type = pdl.type : tensor<?x?x2x8x4x16x8xf8E4M3FNUZ>
   pdl.apply_native_constraint "matchCastCompatibleType"(%lhs, %lhs_cast_type : !pdl.value, !pdl.type)
   %rhs_cast_type = pdl.type : tensor<?x?x4x4x4x16x8xf8E4M3FNUZ>
   pdl.apply_native_constraint "matchCastCompatibleType"(%rhs, %rhs_cast_type : !pdl.value, !pdl.type)
@@ -834,7 +834,7 @@ pdl.pattern @annotate_inner_tiled_f16_large : benefit(1) {
   %attr_name = pdl.attribute = "iree_codegen.ukernel"
   pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}

-  %lhs_cast_type = pdl.type : tensor<?x?x2x8x4x4x4x4xf16>
+  %lhs_cast_type = pdl.type : tensor<?x?x2x8x4x16x4xf16>
   pdl.apply_native_constraint "matchCastCompatibleType"(%lhs, %lhs_cast_type : !pdl.value, !pdl.type)
   %rhs_cast_type = pdl.type : tensor<?x?x4x4x4x16x4xf16>
   pdl.apply_native_constraint "matchCastCompatibleType"(%rhs, %rhs_cast_type : !pdl.value, !pdl.type)
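With leading `?` dims, each updated cast type still matches any batch size and K-tile count while pinning the inner data-tiled layout; e.g. the `tensor<1x256x2x8x4x16x4xf16>` LHS from the test file above is presumably cast-compatible with `tensor<?x?x2x8x4x16x4xf16>`, whereas operands in the old 8-D layout no longer match, which keeps these ukernel annotations off the stale layout.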
