
Commit a9eef59

newling authored and hhkit committed
[Codegen] Rewrite test so LLVMGPUWarpReduction is not used (iree-org#21770)

There is some logic (introduced in iree-org#20310) that relies on finding an `iree_tensor_ext.dispatch.tensor.store` op before the LLVMGPUVectorDistribute configuration kicks in. If no such 'beacon' op is found, we currently fall through to LLVMGPUWarpReduction. This PR just rejiggers the test so that the IR is already in the expected state (the state it will be in when the full pipeline is run), and we therefore take the right path and use LLVMGPUVectorDistribute. I can follow this PR up with a refactoring of the logic to make it more robust.

Signed-off-by: James Newling <[email protected]>
Signed-off-by: Ivan Ho <[email protected]>
1 parent: 3df1247
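For context, here is a minimal C++ sketch of the kind of 'beacon' check the commit message describes. It is illustrative only and is not the code from iree-org#20310: the names hasDispatchTensorStore, selectReductionPipeline, and ReductionPipeline are hypothetical, and only generic MLIR APIs (Operation::walk, OperationName::getStringRef) are used to show the idea of choosing LLVMGPUVectorDistribute only when an iree_tensor_ext.dispatch.tensor.store op is present, and falling back to LLVMGPUWarpReduction otherwise.

// Illustrative sketch only -- not the actual logic added in iree-org#20310.
// It mirrors the heuristic the commit message describes: pick the
// vector-distribute pipeline only when the dispatch.tensor.store "beacon"
// op is present in the dispatch function, otherwise fall back to the
// warp-reduction pipeline.
#include "mlir/IR/Operation.h"
#include "mlir/IR/Visitors.h"

enum class ReductionPipeline { LLVMGPUVectorDistribute, LLVMGPUWarpReduction };

// Walks the dispatch function and reports whether the beacon op is present.
// (Hypothetical helper; IREE's real configuration logic is structured
// differently.)
static bool hasDispatchTensorStore(mlir::Operation *funcOp) {
  mlir::WalkResult result = funcOp->walk([](mlir::Operation *op) {
    if (op->getName().getStringRef() == "iree_tensor_ext.dispatch.tensor.store")
      return mlir::WalkResult::interrupt();
    return mlir::WalkResult::advance();
  });
  return result.wasInterrupted();
}

static ReductionPipeline selectReductionPipeline(mlir::Operation *funcOp) {
  return hasDispatchTensorStore(funcOp)
             ? ReductionPipeline::LLVMGPUVectorDistribute
             : ReductionPipeline::LLVMGPUWarpReduction;
}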

File tree (1 file changed: +6, -12 lines)

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 6 additions & 12 deletions
@@ -216,27 +216,21 @@ module {
 
 // -----
 
-module {
-  func.func @matmul_dynamic_dim(%11: tensor<?x256xf16>, %12: tensor<256x256xf16>) -> tensor<?x256xf32> {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %8 = tensor.dim %11, %c0 : tensor<?x256xf16>
-    %13 = tensor.empty(%8) : tensor<?x256xf32>
-    %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<?x256xf32>) -> tensor<?x256xf32>
-    %15 = linalg.matmul ins(%11, %12 : tensor<?x256xf16>, tensor<256x256xf16>) outs(%14 : tensor<?x256xf32>) -> tensor<?x256xf32>
-    return %15 : tensor<?x256xf32>
-  }
+func.func @matmul_dynamic_M(%arg0: tensor<?x256xf32>, %arg1: tensor<256x256xf32>, %arg2: tensor<?x256xf32>, %arg3: !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x256xf32>>, %arg4 : index) {
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x256xf32>, tensor<256x256xf32>) outs(%arg2 : tensor<?x256xf32>) -> tensor<?x256xf32>
+  iree_tensor_ext.dispatch.tensor.store %0, %arg3, offsets = [0, 0], sizes = [%arg4, 256], strides = [1, 1] : tensor<?x256xf32> -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x256xf32>>{%arg4}
+  return
 }
 
-// CHECK-LABEL: func.func @matmul_dynamic_dim
+// CHECK-LABEL: func.func @matmul_dynamic_M
 // CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
 // CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 4]
 // CHECK-SAME: thread = [1, 4, 0]
 // CHECK-SAME: workgroup = [1, 256, 0]
 
-// LATE: LLVMGPUWarpReduction
+// LATE: LLVMGPUVectorDistribute
 
 // -----
 
