Commit bb77caf

[GPU][DT] Update data layout strategy for pingpong ukernels (#21957)
As a follow-up to #21914 and #21919, this specializes data layout selection for the newly added pingpong ukernels. It also adds end-to-end data-tiling matmul tests with ukernels enabled.

Signed-off-by: Yu-Zhewen <[email protected]>
1 parent af6366a commit bb77caf

File tree

3 files changed: +292 −8 lines changed


compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMAttrs.cpp

Lines changed: 28 additions & 6 deletions
@@ -496,15 +496,37 @@ Attribute TensorUKernelProviderAttr::getDataLayoutForUKernel(
     return {};
   }
   SmallVector<Type> types = encodingAttr.getElementTypesArray();
+  SmallVector<int64_t> iterationSizes = encodingAttr.getIterationSizesArray();
+  if (types.size() != 3 || iterationSizes.size() != 3) {
+    return {};
+  }
+  // Match the layouts based on UKernels implementation:
+  // https://github.com/iree-org/iree/tree/main/compiler/plugins/target/ROCM/builtins/mlir_ukernel
   Type f16 = Float16Type::get(encoding.getContext());
   Type f32 = Float32Type::get(encoding.getContext());
-  if (types.size() != 3 || types[0] != f16 || types[1] != f16 ||
-      types[2] != f32) {
-    return {};
+  Type f8E4M3FNUZ = Float8E4M3FNUZType::get(encoding.getContext());
+  if (types[0] == f16 && types[1] == f16 && types[2] == f32) {
+    // UKernel: pingpong_dt_large_f16.
+    return IREE::GPU::DataTiledMMAAttr::get(
+        encoding.getContext(), IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16,
+        8, 2, 4, 4, 1);
+  }
+  if (types[0] == f8E4M3FNUZ && types[1] == f8E4M3FNUZ && types[2] == f32) {
+    /// TODO(#21865): Remove the upper bound (8192) once the scratch memory
+    /// issue is resolved.
+    if (iterationSizes[1] >= 2048 && iterationSizes[1] <= 8192) {
+      // UKernel: pingpong_dt_large_f8E4M3FNUZ.
+      return IREE::GPU::DataTiledMMAAttr::get(
+          encoding.getContext(),
+          IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ, 8, 2, 4, 4, 1);
+    } else {
+      // UKernel: pingpong_dt_medium_f8E4M3FNUZ.
+      return IREE::GPU::DataTiledMMAAttr::get(
+          encoding.getContext(),
+          IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ, 8, 1, 2, 8, 2);
+    }
   }
-  return IREE::GPU::DataTiledMMAAttr::get(
-      encoding.getContext(), IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16, 8,
-      2, 4, 4, 1);
+  return {};
 }
 
 //===----------------------------------------------------------------------===//
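
To make the new selection policy easier to follow, here is a minimal, self-contained C++ sketch of the branching the hunk above implements. The enum, struct, and function names below are illustrative stand-ins, not the real IREE API; the actual code returns IREE::GPU::DataTiledMMAAttr values rather than the simplified PingpongLayout struct used here.

// Sketch of the pingpong ukernel layout selection (stand-in types only).
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

enum class ElemType { F16, F8E4M3FNUZ, F32, Other };

struct PingpongLayout {
  std::string intrinsic;  // MFMA intrinsic name.
  int intrinsicsM, subgroupsM, intrinsicsN, subgroupsN, intrinsicsK;
};

// `types` is [lhs, rhs, acc]; `iterationSizes` is [M, N, K]. In this sketch a
// dynamic dimension is represented by a negative sentinel, so a dynamic N
// fails the range check and falls back to the medium f8 layout.
std::optional<PingpongLayout>
selectPingpongLayout(const std::vector<ElemType> &types,
                     const std::vector<int64_t> &iterationSizes) {
  if (types.size() != 3 || iterationSizes.size() != 3)
    return std::nullopt;
  if (types[0] == ElemType::F16 && types[1] == ElemType::F16 &&
      types[2] == ElemType::F32) {
    // UKernel: pingpong_dt_large_f16.
    return PingpongLayout{"MFMA_F32_16x16x16_F16", 8, 2, 4, 4, 1};
  }
  if (types[0] == ElemType::F8E4M3FNUZ && types[1] == ElemType::F8E4M3FNUZ &&
      types[2] == ElemType::F32) {
    // TODO(#21865): the 8192 upper bound works around a scratch-memory issue.
    if (iterationSizes[1] >= 2048 && iterationSizes[1] <= 8192) {
      // UKernel: pingpong_dt_large_f8E4M3FNUZ.
      return PingpongLayout{"MFMA_F32_16x16x32_F8E4M3FNUZ", 8, 2, 4, 4, 1};
    }
    // UKernel: pingpong_dt_medium_f8E4M3FNUZ.
    return PingpongLayout{"MFMA_F32_16x16x32_F8E4M3FNUZ", 8, 1, 2, 8, 2};
  }
  return std::nullopt;  // No matching pingpong ukernel; caller falls back.
}

In short, f16/f16/f32 always gets the large f16 layout, while f8E4M3FNUZ picks between the large and medium variants based on the static N size, with the medium variant trading subgroup distribution along M for more unrolling along N and K.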

compiler/plugins/target/ROCM/test/materialize_encoding_ukernel_gfx942.mlir

Lines changed: 164 additions & 2 deletions
@@ -45,8 +45,9 @@
   #hal.pipeline.binding<storage_buffer>
 ]>
 
-func.func @matmul_lowering_ukernel_provider() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @matmul_f16_f16_f32_large_lowering_ukernel_provider() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
   %c0 = arith.constant 0 : index
+  // M, N, K are dynamic.
   %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index
   %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index
   %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index
@@ -75,7 +76,168 @@ func.func @matmul_lowering_ukernel_provider() attributes {hal.executable.target
       -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
   return
 }
-// CHECK-LABEL: matmul_lowering_ukernel_provider
+// CHECK-LABEL: matmul_f16_f16_f32_large_lowering_ukernel_provider
 // CHECK: iree_codegen.inner_tiled
 // CHECK-SAME: iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]
 // CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>
+
+// -----
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {
+  abi = "hip",
+  iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>,
+  iree_codegen.target_info = #iree_gpu.target<
+    arch = "gfx942",
+    features = "",
+    wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8,
+      storage = b64|b32|b16|b8,
+      subgroup = shuffle|arithmetic,
+      dot = dp4xi8toi32,
+      mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>,
+        <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>,
+        <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>,
+        <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>,
+        <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>,
+        <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>,
+        <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>
+      ],
+      subgroup_size_choices = [64],
+      max_workgroup_sizes = [1024, 1024, 1024],
+      max_thread_count_per_workgroup = 1024,
+      max_workgroup_memory_bytes = 65536,
+      max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+      max_load_instruction_bits = 128,
+      simds_per_wgp = 4,
+      vgpr_space_bits = 16384>
+  >,
+  iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider,
+  ukernels = "none"
+}>
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#encoding_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, ?, ?]>
+#encoding_rhs = #iree_encoding.encoding<operand_index = 1, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, ?, ?]>
+#encoding_result = #iree_encoding.encoding<operand_index = 2, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, ?, ?]>
+#pipeline_layout_3 = #hal.pipeline.layout<constants = 3, bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+
+func.func @matmul_f8_f8_f32_medium_lowering_ukernel_provider() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  %c0 = arith.constant 0 : index
+  // M, N, K are dynamic.
+  %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index
+  %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index
+  %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_lhs>>{%M, %K}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_rhs>>{%K, %N}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
+  %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_lhs>>{%M, %K}
+      -> tensor<?x?xf8E4M3FNUZ, #encoding_lhs>
+  %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_rhs>>{%K, %N}
+      -> tensor<?x?xf8E4M3FNUZ, #encoding_rhs>
+  %5 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
+      -> tensor<?x?xf32, #encoding_result>
+  %6 = linalg.matmul
+      ins(%3, %4 : tensor<?x?xf8E4M3FNUZ, #encoding_lhs>,
+                   tensor<?x?xf8E4M3FNUZ, #encoding_rhs>)
+      outs(%5 : tensor<?x?xf32, #encoding_result>)
+      -> tensor<?x?xf32, #encoding_result>
+  iree_tensor_ext.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
+      : tensor<?x?xf32, #encoding_result>
+      -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
+  return
+}
+// CHECK-LABEL: matmul_f8_f8_f32_medium_lowering_ukernel_provider
+// CHECK: iree_codegen.inner_tiled
+// CHECK-SAME: iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]
+// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, intrinsics_m = 8, intrinsics_n = 2, subgroups_n = 8, intrinsics_k = 2>
+
+// -----
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {
+  abi = "hip",
+  iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>,
+  iree_codegen.target_info = #iree_gpu.target<
+    arch = "gfx942",
+    features = "",
+    wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8,
+      storage = b64|b32|b16|b8,
+      subgroup = shuffle|arithmetic,
+      dot = dp4xi8toi32,
+      mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>,
+        <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>,
+        <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>,
+        <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>,
+        <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>,
+        <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>,
+        <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>
+      ],
+      subgroup_size_choices = [64],
+      max_workgroup_sizes = [1024, 1024, 1024],
+      max_thread_count_per_workgroup = 1024,
+      max_workgroup_memory_bytes = 65536,
+      max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+      max_load_instruction_bits = 128,
+      simds_per_wgp = 4,
+      vgpr_space_bits = 16384>
+  >,
+  iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider,
+  ukernels = "none"
+}>
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#encoding_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, 2048, ?]>
+#encoding_rhs = #iree_encoding.encoding<operand_index = 1, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, 2048, ?]>
+#encoding_result = #iree_encoding.encoding<operand_index = 2, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, 2048, ?]>
+#pipeline_layout_3 = #hal.pipeline.layout<constants = 2, bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+
+func.func @matmul_f8_f8_f32_large_lowering_ukernel_provider() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  %c0 = arith.constant 0 : index
+  // M, K are dynamic, and N is static as 2048.
+  %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index
+  %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_lhs>>{%M, %K}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x2048xf8E4M3FNUZ, #encoding_rhs>>{%K}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x2048xf32, #encoding_result>>{%M}
+  %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_lhs>>{%M, %K}
+      -> tensor<?x?xf8E4M3FNUZ, #encoding_lhs>
+  %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, 2048], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x2048xf8E4M3FNUZ, #encoding_rhs>>{%K}
+      -> tensor<?x2048xf8E4M3FNUZ, #encoding_rhs>
+  %5 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, 2048], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x2048xf32, #encoding_result>>{%M}
+      -> tensor<?x2048xf32, #encoding_result>
+  %6 = linalg.matmul
+      ins(%3, %4 : tensor<?x?xf8E4M3FNUZ, #encoding_lhs>,
+                   tensor<?x2048xf8E4M3FNUZ, #encoding_rhs>)
+      outs(%5 : tensor<?x2048xf32, #encoding_result>)
+      -> tensor<?x2048xf32, #encoding_result>
+  iree_tensor_ext.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, 2048], strides = [1, 1]
+      : tensor<?x2048xf32, #encoding_result>
+      -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x2048xf32, #encoding_result>>{%M}
+  return
+}
+// CHECK-LABEL: matmul_f8_f8_f32_large_lowering_ukernel_provider
+// CHECK: iree_codegen.inner_tiled
+// CHECK-SAME: iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]
+// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>
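To see what the three CHECK'd layouts mean in practice, here is a small, hypothetical C++ back-of-the-envelope calculation of the per-workgroup tile each one implies. It assumes (this is an assumption, not something the diff states) that every dimension of the intrinsic shape is scaled by its corresponding intrinsics_* and subgroups_* factors.

// Hypothetical tile-shape calculation for the data_tiled_mma_layout attrs
// checked above, under the scaling assumption described in the lead-in.
#include <array>
#include <cstdio>

struct MmaLayout {
  std::array<int, 3> intrinsicMNK;  // Intrinsic shape, e.g. {16, 16, 32}.
  int intrinsicsM, subgroupsM, intrinsicsN, subgroupsN, intrinsicsK;
};

std::array<int, 3> tileShape(const MmaLayout &l) {
  return {l.intrinsicMNK[0] * l.intrinsicsM * l.subgroupsM,
          l.intrinsicMNK[1] * l.intrinsicsN * l.subgroupsN,
          l.intrinsicMNK[2] * l.intrinsicsK};
}

int main() {
  // pingpong_dt_large_f16: MFMA_F32_16x16x16_F16 with 8, 2, 4, 4, 1
  // -> 256x256x16 under the stated assumption.
  auto f16Large = tileShape({{16, 16, 16}, 8, 2, 4, 4, 1});
  // pingpong_dt_medium_f8E4M3FNUZ: MFMA_F32_16x16x32_F8E4M3FNUZ with
  // 8, 1, 2, 8, 2 -> 128x256x64.
  auto f8Medium = tileShape({{16, 16, 32}, 8, 1, 2, 8, 2});
  // pingpong_dt_large_f8E4M3FNUZ: MFMA_F32_16x16x32_F8E4M3FNUZ with
  // 8, 2, 4, 4, 1 -> 256x256x32.
  auto f8Large = tileShape({{16, 16, 32}, 8, 2, 4, 4, 1});
  std::printf("f16 large: %dx%dx%d, f8 medium: %dx%dx%d, f8 large: %dx%dx%d\n",
              f16Large[0], f16Large[1], f16Large[2], f8Medium[0], f8Medium[1],
              f8Medium[2], f8Large[0], f8Large[1], f8Large[2]);
  return 0;
}

Read this way, the medium f8 variant halves the M tile and doubles the K unrolling relative to the large variant, which matches its use for the fully dynamic N case tested above.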

tests/e2e/matmul/CMakeLists.txt

Lines changed: 100 additions & 0 deletions
@@ -1510,6 +1510,72 @@ iree_generated_e2e_runner_test(
     "requires-gpu-cdna3"
 )
 
+iree_generated_e2e_runner_test(
+  NAME
+    e2e_matmul_cdna3_dt_f8E4M3FNUZ_tensor_ukernel_medium
+  TEST_TYPE
+    matmul
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f8E4M3FNUZ"
+    "--acc_type=f32"
+    "--shapes=custom_mnk"
+    "--mnk=1024,1024,1024"
+  TEST_RUNNER
+    iree_tools_testing_e2e_iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "rocm"
+  DRIVERS
+    "hip"
+  COMPILER_FLAGS
+    ${IREE_HIP_TEST_COMPILER_FLAGS}
+    "--iree-opt-data-tiling=false"
+    "--iree-dispatch-creation-data-tiling"
+    "--iree-hip-encoding-layout-resolver=data-tiling"
+    "--iree-llvmgpu-test-combine-layout-transformation=true"
+    "--iree-hip-enable-tensor-ukernels"
+  LABELS
+    "noasan"
+    "nomsan"
+    "notsan"
+    "noubsan"
+    "requires-gpu-cdna3"
+)
+
+iree_generated_e2e_runner_test(
+  NAME
+    e2e_matmul_cdna3_dt_f8E4M3FNUZ_tensor_ukernel_large
+  TEST_TYPE
+    matmul
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f8E4M3FNUZ"
+    "--acc_type=f32"
+    "--shapes=custom_mnk"
+    "--mnk=2048,2048,2048"
+  TEST_RUNNER
+    iree_tools_testing_e2e_iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "rocm"
+  DRIVERS
+    "hip"
+  COMPILER_FLAGS
+    ${IREE_HIP_TEST_COMPILER_FLAGS}
+    "--iree-opt-data-tiling=false"
+    "--iree-dispatch-creation-data-tiling"
+    "--iree-hip-encoding-layout-resolver=data-tiling"
+    "--iree-llvmgpu-test-combine-layout-transformation=true"
+    "--iree-hip-enable-tensor-ukernels"
+  LABELS
+    "noasan"
+    "nomsan"
+    "notsan"
+    "noubsan"
+    "requires-gpu-cdna3"
+)
+
 iree_generated_e2e_runner_test(
   NAME
     e2e_matmul_cdna3_dt_f64
@@ -1631,6 +1697,40 @@ iree_generated_e2e_runner_test(
     "requires-gpu-cdna3"
 )
 
+
+iree_generated_e2e_runner_test(
+  NAME
+    e2e_matmul_dt_tensor_ukernel_f16f16f32_large
+  TEST_TYPE
+    matmul
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f16"
+    "--acc_type=f32"
+    "--shapes=custom_mnk"
+    "--mnk=1024,1024,1024"
+  TEST_RUNNER
+    iree_tools_testing_e2e_iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "rocm"
+  DRIVERS
+    "hip"
+  COMPILER_FLAGS
+    ${IREE_HIP_TEST_COMPILER_FLAGS}
+    "--iree-opt-data-tiling=false"
+    "--iree-dispatch-creation-data-tiling"
+    "--iree-hip-encoding-layout-resolver=data-tiling"
+    "--iree-llvmgpu-test-combine-layout-transformation=true"
+    "--iree-hip-enable-tensor-ukernels"
+  LABELS
+    "noasan"
+    "nomsan"
+    "notsan"
+    "noubsan"
+    "requires-gpu-cdna3"
+)
+
 iree_generated_e2e_runner_test(
   NAME
     e2e_matmul_tensor_ukernel_bf16bf16f32_large
