Commit 6dc81fd

[GPU] Do not do c promotion for unaligned (I)GEMMs (iree-org#21823)
We do not need to do C promotion, and the performance with and without it seems similar. Here are some GEMM shapes and a conv shape I checked.

### GEMMs

| Shape (MxNxK) | No C promotion (this PR) (us) | C promotion (us) |
|---------------|-------------------------------|------------------|
| 1023x512x512  | 8                             | 10               |
| 1023x512x5121 | 302                           | 303              |
| 1023x256x5121 | 305                           | 303              |

### Convs

| Command | No C promotion (this PR) (us) | C promotion (us) |
|---------|-------------------------------|------------------|
| convbfp16 -n 1 -c 77 -H 7 -W 77 -k 77 -y 1 -x 1 -p 3 -q 3 -u 1 -v 1 -l 1 -j 1 -m conv -g 1 -F 1 -t 1 --in_layout NHWC --out_layout NHWC --fil_layout NHWC | 3.4 | 3.5 |

I did not observe any difference above noise thresholds between the two configurations, but skipping C promotion has the advantage that, under the right configuration, we can use larger tile sizes.

Signed-off-by: Nirvedh Meshram <[email protected]>
1 parent 394ddbc commit 6dc81fd
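In practice, "C promotion" roughly means including the accumulator (the matmul's operand 2) in `promote_operands`, so it gets a shared-memory staging buffer alongside the LHS and RHS. The sketch below reuses the tile sizes from the updated `unaligned_to_intrinsic_batched_matmul` test to illustrate the attribute-level change; the values are illustrative, not prescriptive.

```mlir
// Lowering config for an unaligned batched matmul after this change.
// Only the LHS (0) and RHS (1) are promoted; previously the unaligned
// path used promote_operands = [0, 1, 2], i.e. it also promoted C.
#iree_gpu.lowering_config<{
  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
  padding = [1, 16, 16, 4],
  promote_operands = [0, 1],
  reduction = [0, 0, 0, 1],
  subgroup = [0, 1, 1, 0],
  workgroup = [1, 16, 16, 0]
}>
```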

File tree

7 files changed, +174 -42 lines changed

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 16 additions & 26 deletions
@@ -582,7 +582,6 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
      llvm::cast<AffineDimExpr>(maps[1].getResults().back()).getPosition();

  bool mustBeAligned = true;
-  bool doCPromotion = false;
  std::optional<GPUMMASchedule> schedule = getMmaScheduleFromProblemAndTarget(
      target, problem, transposedLhs, transposedRhs, isGemm,
      /*mustBeAligned*/ true,
@@ -595,10 +594,9 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
  if (!schedule && canSupportUnaligned) {
    LDBG() << "Attempting to deduce unaligned TileAndFuse MMA schedulee";
    mustBeAligned = false;
-    doCPromotion = true;
    schedule = getMmaScheduleFromProblemAndTarget(
        target, problem, transposedLhs, transposedRhs, isGemm, mustBeAligned,
-        doCPromotion, scaled);
+        /*doCPromotion=*/false, scaled);
  }

  if (!schedule) {
@@ -674,27 +672,19 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
  attrs.emplace_back(StringAttr::get(context, "subgroup"),
                     b.getI64ArrayAttr(subgroupTileSizes));
  attrs.emplace_back(StringAttr::get(context, "mma_kind"), kind);
-  if (mustBeAligned) {
-    Attribute useGlobalDma = IREE::GPU::UseGlobalLoadDMAAttr::get(context);
-    SmallVector<Attribute> promotionArray = {useGlobalDma, useGlobalDma};
-    SmallVector<int64_t> promotionList = {0, 1};
-    if (scaled) {
-      promotionArray.append({useGlobalDma, useGlobalDma});
-      promotionList.append({2, 3});
-    }
-    ArrayRef<Attribute> promotionTypes =
-        useDirectLoad ? ArrayRef<Attribute>(promotionArray)
-                      : ArrayRef<Attribute>{};
-    GPU::appendPromotedOperandsList(context, attrs, promotionList,
-                                    promotionTypes);
-  } else {
-    // TODO (nirvedhmeshram, Max191, jerryyin) : Add support so that unaligned
-    // shapes do not require c promotion.
-    SmallVector<int64_t> promotionList = {0, 1, 2};
-    if (scaled) {
-      promotionList.append({3, 4});
-    }
-    GPU::appendPromotedOperandsList(context, attrs, promotionList);
+  Attribute useGlobalDma = IREE::GPU::UseGlobalLoadDMAAttr::get(context);
+  SmallVector<Attribute> promotionArray = {useGlobalDma, useGlobalDma};
+  SmallVector<int64_t> promotionList = {0, 1};
+  if (scaled) {
+    promotionArray.append({useGlobalDma, useGlobalDma});
+    promotionList.append({2, 3});
+  }
+  ArrayRef<Attribute> promotionTypes = useDirectLoad
+                                           ? ArrayRef<Attribute>(promotionArray)
+                                           : ArrayRef<Attribute>{};
+  GPU::appendPromotedOperandsList(context, attrs, promotionList,
+                                  promotionTypes);
+  if (!mustBeAligned) {
    SmallVector<int64_t> paddingTileSizes = workgroupTileSizes;

    // Initialize inner and outer padding sizes from reductionTileSizes.
@@ -712,8 +702,8 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
    }
    paddingTileSizes[innerKDim] *= kPackFactor;

-    // Create `padding_conv` attribute when padding convolutions before IGEMM is
-    // possible, otherwise fallback to pad IGEMM.
+    // Create `padding_conv` attribute when padding convolutions before IGEMM
+    // is possible, otherwise fallback to pad IGEMM.
    if (auto attr =
            getPaddingConvSizes(b, bounds, paddingTileSizes, workgroupTileSizes,
                                reductionTileSizes, convToIgemmInfo)) {
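
As a concrete reference, the unaligned IGEMM path now emits configs of the following shape, with the accumulator absent from `promote_operands` and the unaligned dimensions handled by `padding`. This is the same lowering config exercised by the new pipeline test added below, reproduced here only as a sketch.

```mlir
// Unaligned IGEMM config from the new conv_nhwc_unaligned_stride_2_nocpromo
// test: LHS/RHS are promoted, the accumulator is not, and padding covers
// the unaligned tile boundaries.
#config = #iree_gpu.lowering_config<{
  padding = [2, 1, 32, 16, 16],
  workgroup = [2, 1, 32, 16, 0],
  reduction = [0, 0, 0, 0, 1],
  subgroup = [1, 1, 1, 1, 0],
  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
  promote_operands = [0, 1]
}>
```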

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir

Lines changed: 11 additions & 11 deletions
@@ -99,13 +99,13 @@ func.func @nhwc_conv_unaligned_mfma() {
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>

// GFX942-SAME: padding = [2, 1, 32, 64, 32]
-// GFX942-SAME: promote_operands = [0, 1, 2]
+// GFX942-SAME: promote_operands = [0, 1]
// GFX942-SAME: reduction = [0, 0, 0, 0, 8]
// GFX942-SAME: subgroup = [2, 1, 1, 1, 0]
// GFX942-SAME: workgroup = [2, 1, 32, 64, 0]

// MI300X-SAME: padding = [2, 1, 32, 32, 32]
-// MI300X-SAME: promote_operands = [0, 1, 2]
+// MI300X-SAME: promote_operands = [0, 1]
// MI300X-SAME: reduction = [0, 0, 0, 0, 8]
// MI300X-SAME: subgroup = [1, 1, 1, 1, 0]
// MI300X-SAME: workgroup = [2, 1, 32, 32, 0]
@@ -138,13 +138,13 @@ func.func @nchw_conv_unaligned_mfma() {
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>

// GFX942-SAME: padding = [1, 64, 4, 32, 32]
-// GFX942-SAME: promote_operands = [0, 1, 2]
+// GFX942-SAME: promote_operands = [0, 1]
// GFX942-SAME: reduction = [0, 0, 0, 0, 8]
// GFX942-SAME: subgroup = [1, 2, 2, 1, 0]
// GFX942-SAME: workgroup = [1, 64, 4, 32, 0]

// MI300X-SAME: padding = [1, 32, 2, 32, 32]
-// MI300X-SAME: promote_operands = [0, 1, 2]
+// MI300X-SAME: promote_operands = [0, 1]
// MI300X-SAME: reduction = [0, 0, 0, 0, 8]
// MI300X-SAME: subgroup = [1, 1, 1, 1, 0]
// MI300X-SAME: workgroup = [1, 32, 2, 32, 0]
@@ -177,13 +177,13 @@ func.func @conv_nhwc_fhwc_unaligned_channel(%arg0: tensor<16x26x19x287xf16>, %ar
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>

// GFX942-SAME: padding = [1, 4, 32, 32, 32]
-// GFX942-SAME: promote_operands = [0, 1, 2]
+// GFX942-SAME: promote_operands = [0, 1]
// GFX942-SAME: reduction = [0, 0, 0, 0, 2]
// GFX942-SAME: subgroup = [1, 2, 1, 1, 0]
// GFX942-SAME: workgroup = [1, 4, 32, 32, 0]

// MI300X-SAME: padding = [1, 2, 32, 32, 32]
-// MI300X-SAME: promote_operands = [0, 1, 2]
+// MI300X-SAME: promote_operands = [0, 1]
// MI300X-SAME: reduction = [0, 0, 0, 0, 2]
// MI300X-SAME: subgroup = [1, 1, 1, 1, 0]
// MI300X-SAME: workgroup = [1, 2, 32, 32, 0]
@@ -215,7 +215,7 @@ func.func @conv_chwn_chwf_unaligned_batch(%arg0: tensor<16x193x129x40xbf16>, %ar
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_BF16>
// CHECK-SAME: padding = [16, 1, 1, 16, 64]
-// CHECK-SAME: promote_operands = [0, 1, 2]
+// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 4]
// CHECK-SAME: subgroup = [1, 1, 1, 1, 0]
// CHECK-SAME: workgroup = [16, 1, 1, 16, 0]
@@ -247,13 +247,13 @@ func.func @group_conv_hwgc_gfhwc_unaligned(%arg0: tensor<61x93x16x56xbf16>, %arg
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
// GFX942-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_BF16>
// GFX942-SAME: padding = [1, 32, 1, 64, 64]
-// GFX942-SAME: promote_operands = [0, 1, 2]
+// GFX942-SAME: promote_operands = [0, 1]
// GFX942-SAME: reduction = [0, 0, 0, 0, 4]
// GFX942-SAME: subgroup = [1, 1, 0, 1, 0]
// GFX942-SAME: workgroup = [1, 32, 1, 64, 0]

// MI300X-SAME: padding = [1, 32, 1, 64, 64]
-// MI300X-SAME: promote_operands = [0, 1, 2]
+// MI300X-SAME: promote_operands = [0, 1]
// MI300X-SAME: reduction = [0, 0, 0, 0, 4]
// MI300X-SAME: subgroup = [1, 1, 0, 1, 0]
// MI300X-SAME: workgroup = [1, 32, 1, 64, 0]
@@ -287,13 +287,13 @@ module {
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
// GFX942-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_BF16>
// GFX942-SAME: padding = [2, 2, 32, 64, 64]
-// GFX942-SAME: promote_operands = [0, 1, 2]
+// GFX942-SAME: promote_operands = [0, 1]
// GFX942-SAME: reduction = [0, 0, 0, 0, 4]
// GFX942-SAME: subgroup = [2, 1, 1, 2, 0]
// GFX942-SAME: workgroup = [2, 2, 32, 64, 0]

// MI300X-SAME: padding = [1, 2, 32, 32, 64]
-// MI300X-SAME: promote_operands = [0, 1, 2]
+// MI300X-SAME: promote_operands = [0, 1]
// MI300X-SAME: reduction = [0, 0, 0, 0, 4]
// MI300X-SAME: subgroup = [1, 1, 1, 1, 0]
// MI300X-SAME: workgroup = [1, 2, 32, 32, 0]

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 3 additions & 3 deletions
@@ -330,7 +330,7 @@ func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %r
// LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
// LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
// LATE-SAME: padding = [1, 16, 16, 4]
-// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: promote_operands = [0, 1]
// LATE-SAME: reduction = [0, 0, 0, 1]
// LATE-SAME: subgroup = [0, 1, 1, 0]
// LATE-SAME: workgroup = [1, 16, 16, 0]
@@ -357,7 +357,7 @@ func.func @unaligned_matmul_with_two_reduce_dim(%arg0: tensor<196x9x4xf32>, %arg
// LATE: linalg.generic
// LATE-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
// LATE-SAME: padding = [16, 1, 16, 4]
-// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: promote_operands = [0, 1]
// LATE-SAME: reduction = [0, 1, 0, 1],
// LATE-SAME: subgroup = [1, 0, 1, 0],
// LATE-SAME: workgroup = [16, 0, 16, 0]}
@@ -437,7 +437,7 @@ func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x5
// LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
// LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
// LATE-SAME: padding = [1, 16, 64, 4]
-// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: promote_operands = [0, 1]
// LATE-SAME: reduction = [0, 0, 0, 1]
// LATE-SAME: subgroup = [0, 1, 2, 0]
// LATE-SAME: workgroup = [1, 16, 64, 0]

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ func.func @small_scaled_matmul(
// CHECK-SAME: #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: mma_kind = #iree_gpu.scaled_mma_layout<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>
-// CHECK-SAME: promote_operands = [0, 1, 2, 3, 4]
+// CHECK-SAME: promote_operands = [0, 1, 2, 3]
// CHECK-SAME: reduction = [0, 0, 1, 1]
// CHECK-SAME: subgroup = [1, 1, 0, 0]
// CHECK-SAME: workgroup = [16, 16, 0, 0]

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir

Lines changed: 74 additions & 0 deletions
@@ -163,6 +163,80 @@ hal.executable private @main {

// -----

+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#translation = #iree_codegen.translation_info<pipeline =
+  LLVMGPUTileAndFuse
+  workgroup_size = [256, 1, 1]
+  subgroup_size = 64,
+  {
+    gpu_pipeline_options = #iree_gpu.pipeline_options<
+      prefetch_shared_memory = false,
+      no_reduce_shared_memory_bank_conflicts = false,
+      use_igemm_convolution = true>
+  }>
+#config = #iree_gpu.lowering_config<{
+  padding = [2, 1, 32, 16, 16],
+  workgroup = [2, 1, 32, 16, 0],
+  reduction = [0, 0, 0, 0, 1],
+  subgroup = [1, 1, 1, 1, 0],
+  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+  promote_operands = [0, 1]
+}>
+hal.executable private @main {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @conv_dispatch_0_conv_2d_nhwc_hwcf_2x17x17x1281x3x3x1281_f16xf16xf32 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @conv_nhwc_unaligned_stride_2_nocpromo() attributes {translation_info = #translation} {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1281x1281xf16>>
+        %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
+        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
+        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 1281, 1281], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1281x1281xf16>> -> tensor<3x3x1281x1281xf16>
+        %5 = tensor.empty() : tensor<2x17x17x1281xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x17x17x1281xf32>) -> tensor<2x17x17x1281xf32>
+        %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #config, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<2x35x35x1281xf16>, tensor<3x3x1281x1281xf16>) outs(%6 : tensor<2x17x17x1281xf32>) -> tensor<2x17x17x1281xf32>
+        iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @conv_nhwc_unaligned
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C721:.+]] = arith.constant 721 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-NOT: memref.alloc() {{.*}}xf32
+// CHECK-DAG: memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
+// CHECK-DAG: memref.alloc() : memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
+// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-DAG: %[[ASSUMED_B0:.+]] = memref.assume_alignment %[[B0]], 64
+// CHECK-DAG: %[[BUF0:.+]] = amdgpu.fat_raw_buffer_cast %[[ASSUMED_B0]]
+// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-DAG: %[[ASSUMED_B1:.+]] = memref.assume_alignment %[[B1]], 64
+// CHECK-DAG: %[[BUF1:.+]] = amdgpu.fat_raw_buffer_cast %[[ASSUMED_B1]]
+// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+// CHECK-DAG: %[[ASSUMED_B2:.+]] = memref.assume_alignment %[[B2]], 64
+// CHECK-DAG: %[[BUF2:.+]] = amdgpu.fat_raw_buffer_cast %[[ASSUMED_B2]]
+// CHECK: scf.forall ({{.*}}) in (17, 1, 81) {
+// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C721]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
+// CHECK: gpu.barrier
+// CHECK-DAG: %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<4xf16>
+// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<4xf16>
+// CHECK-COUNT-1: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK-NOT: scf.for
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
+
+// -----
+
#pipeline_layout = #hal.pipeline.layout<bindings = [
  #hal.pipeline.binding<storage_buffer, "ReadOnly">,
  #hal.pipeline.binding<storage_buffer, "ReadOnly">,