Skip to content

Commit 23ddd24

Browse files
sebvincepstarkcdpr
authored and committed
[AMDGPU] Cache_swizzle stride for fat raw buffer loads should be in bytes (iree-org#22314)
Use stride in bytes for L1 Cache_swizzle as described in CDNA3/4 doc. In the case of #iree_gpu.promote_with_cache_swizzle, we set the stride to 0 if it is not a multiple of 8 bits.
1 parent cbfb33a commit 23ddd24

File tree

5 files changed

+130
-23
lines changed

5 files changed

+130
-23
lines changed

compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_bf16.mlir

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,9 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
4343
%rhs_shared_base = memref.alloc() : !bf16_flat_shared
4444

4545
%dim = tensor.dim %lhs_base, %c1 : !bf16_in_ty
46-
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim) : !bf16_in_ty
47-
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim) : !bf16_in_ty
46+
%dim_bytes = arith.muli %dim, %c2 overflow<nsw, nuw>: index
47+
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim_bytes) : !bf16_in_ty
48+
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim_bytes) : !bf16_in_ty
4849

4950
%lhs_shared_swizzle = iree_codegen.swizzle_hint %lhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !bf16_flat_shared
5051
%rhs_shared_swizzle = iree_codegen.swizzle_hint %rhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !bf16_flat_shared
@@ -266,8 +267,9 @@ util.func private @pingpong_medium_bf16_expanded(%lhs_base: !mexp_in_ty_bf16, %r
266267
%rhs_shared_base = memref.alloc() : !flat_shared_bf16
267268

268269
%dim = tensor.dim %rhs_base, %c1 : !in_ty_bf16
269-
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim) : !mexp_in_ty_bf16
270-
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim) : !in_ty_bf16
270+
%dim_bytes = arith.muli %dim, %c2 overflow<nsw, nuw>: index
271+
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim_bytes) : !mexp_in_ty_bf16
272+
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim_bytes) : !in_ty_bf16
271273

272274
%lhs_shared_swizzle = iree_codegen.swizzle_hint %lhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !mflat_shared_bf16
273275
%rhs_shared_swizzle = iree_codegen.swizzle_hint %rhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !flat_shared_bf16
@@ -453,8 +455,9 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
453455
%rhs_shared_base = memref.alloc() : !bf16_flat_shared
454456

455457
%dim = tensor.dim %rhs_base, %c1 : !bf16_in_ty
456-
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim) : !bf16_exp_in_ty
457-
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim) : !bf16_in_ty
458+
%dim_bytes = arith.muli %dim, %c2 overflow<nsw, nuw>: index
459+
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim_bytes) : !bf16_exp_in_ty
460+
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim_bytes) : !bf16_in_ty
458461

459462
%lhs_shared_swizzle = iree_codegen.swizzle_hint %lhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !bf16_flat_shared
460463
%rhs_shared_swizzle = iree_codegen.swizzle_hint %rhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !bf16_flat_shared

compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f16.mlir

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,9 @@ util.func private @pingpong_large_f16(%lhs_base: !in_ty, %rhs_base: !in_ty, %unu
3535
%rhs_shared_base = memref.alloc() : !flat_shared
3636

3737
%dim = tensor.dim %lhs_base, %c1 : !in_ty
38-
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim) : !in_ty
39-
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim) : !in_ty
38+
%dim_bytes = arith.muli %dim, %c2 overflow<nsw, nuw>: index
39+
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim_bytes) : !in_ty
40+
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim_bytes) : !in_ty
4041

4142
%lhs_shared_swizzle = iree_codegen.swizzle_hint %lhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !flat_shared
4243
%rhs_shared_swizzle = iree_codegen.swizzle_hint %rhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !flat_shared
@@ -256,8 +257,9 @@ util.func private @pingpong_medium_f16_expanded(%lhs_base: !mexp_in_ty, %rhs_bas
256257
%rhs_shared_base = memref.alloc() : !flat_shared
257258

258259
%dim = tensor.dim %rhs_base, %c1 : !in_ty
259-
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim) : !mexp_in_ty
260-
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim) : !in_ty
260+
%dim_bytes = arith.muli %dim, %c2 overflow<nsw, nuw>: index
261+
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim_bytes) : !mexp_in_ty
262+
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim_bytes) : !in_ty
261263

262264
%lhs_shared_swizzle = iree_codegen.swizzle_hint %lhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !mflat_shared
263265
%rhs_shared_swizzle = iree_codegen.swizzle_hint %rhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !flat_shared
@@ -443,8 +445,9 @@ util.func private @pingpong_large_f16_expanded(%lhs_base: !exp_in_ty, %rhs_base:
443445
%rhs_shared_base = memref.alloc() : !flat_shared
444446

445447
%dim = tensor.dim %rhs_base, %c1 : !in_ty
446-
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim) : !exp_in_ty
447-
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim) : !in_ty
448+
%dim_bytes = arith.muli %dim, %c2 overflow<nsw, nuw>: index
449+
%lhs = iree_gpu.buffer_resource_cast %lhs_base cacheSwizzleStride(%dim_bytes) : !exp_in_ty
450+
%rhs = iree_gpu.buffer_resource_cast %rhs_base cacheSwizzleStride(%dim_bytes) : !in_ty
448451

449452
%lhs_shared_swizzle = iree_codegen.swizzle_hint %lhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !flat_shared
450453
%rhs_shared_swizzle = iree_codegen.swizzle_hint %rhs_shared_base[#iree_codegen.rotate_rows<64, 4>] : !flat_shared

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir

Lines changed: 83 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-promote-matmul-operands))" | FileCheck %s
1+
// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-promote-matmul-operands),canonicalize)" | FileCheck %s
22

33
#lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1]}>
44

@@ -214,8 +214,88 @@ func.func @promote_with_cache_swizzle(%a: tensor<2x34x34x128xf32>, %b: tensor<2x
214214
// CHECK-LABEL: func.func @promote_with_cache_swizzle
215215
// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: tensor<2x34x34x128xf32>
216216
// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: tensor<2x8x256xf32>
217-
// CHECK-DAG: %[[SWIZZLE_A:.+]] = iree_gpu.buffer_resource_cast %[[A]] cacheSwizzleStride(%c128)
218-
// CHECK-DAG: %[[SWIZZLE_B:.+]] = iree_gpu.buffer_resource_cast %[[B]] cacheSwizzleStride(%c256)
217+
// CHECK-DAG: %[[SWIZZLE_A:.+]] = iree_gpu.buffer_resource_cast %[[A]] cacheSwizzleStride(%c512)
218+
// CHECK-DAG: %[[SWIZZLE_B:.+]] = iree_gpu.buffer_resource_cast %[[B]] cacheSwizzleStride(%c1024)
219+
// CHECK: %[[PA:.+]] = iree_linalg_ext.im2col
220+
// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
221+
// CHECK-SAME: ins(%[[SWIZZLE_A]]
222+
// CHECK: %[[PB:.+]] = linalg.copy
223+
// CHECK-SAME: lowering_config = #iree_gpu.use_global_load_dma
224+
// CHECK-SAME: ins(%[[SWIZZLE_B]]
225+
// CHECK: linalg.batch_matmul {{.*}} ins(%[[PA]], %[[PB]]
226+
227+
228+
// -----
229+
230+
#lowering_config = #iree_gpu.lowering_config<{
231+
promote_operands = [0, 1],
232+
promotion_types = [
233+
#iree_gpu.promote_with_cache_swizzle<#iree_gpu.derived_thread_config>,
234+
#iree_gpu.promote_with_cache_swizzle<#iree_gpu.use_global_load_dma>]}>
235+
236+
func.func @promote_with_cache_swizzle_f4(%a: tensor<2x34x34x128xf4E2M1FN>, %b: tensor<2x8x256xf4E2M1FN>) -> tensor<2x128x256xf32> {
237+
%cst = arith.constant 0.000000e+00 : f32
238+
%empty = tensor.empty() : tensor<2x128x256xf32>
239+
%im2col_empty = tensor.empty() : tensor<2x128x8xf4E2M1FN>
240+
241+
%im2col = iree_linalg_ext.im2col
242+
strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
243+
m_offset = [0] * [1] k_offset = [0] * [1]
244+
batch_pos = [0] m_pos = [2, 3] k_pos = [1]
245+
input_k_perm = [0, 1, 2]
246+
ins(%a : tensor<2x34x34x128xf4E2M1FN>)
247+
outs(%im2col_empty : tensor<2x128x8xf4E2M1FN>) -> tensor<2x128x8xf4E2M1FN>
248+
249+
%fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<2x128x256xf32>) -> tensor<2x128x256xf32>
250+
%mm = linalg.batch_matmul {lowering_config = #lowering_config}
251+
ins(%im2col, %b : tensor<2x128x8xf4E2M1FN>, tensor<2x8x256xf4E2M1FN>) outs(%fill : tensor<2x128x256xf32>) -> tensor<2x128x256xf32>
252+
return %mm : tensor<2x128x256xf32>
253+
}
254+
255+
// CHECK-LABEL: func.func @promote_with_cache_swizzle_f4
256+
// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: tensor<2x34x34x128xf4E2M1FN>
257+
// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: tensor<2x8x256xf4E2M1FN>
258+
// CHECK-DAG: %[[SWIZZLE_A:.+]] = iree_gpu.buffer_resource_cast %[[A]] cacheSwizzleStride(%c64)
259+
// CHECK-DAG: %[[SWIZZLE_B:.+]] = iree_gpu.buffer_resource_cast %[[B]] cacheSwizzleStride(%c128)
260+
// CHECK: %[[PA:.+]] = iree_linalg_ext.im2col
261+
// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
262+
// CHECK-SAME: ins(%[[SWIZZLE_A]]
263+
// CHECK: %[[PB:.+]] = linalg.copy
264+
// CHECK-SAME: lowering_config = #iree_gpu.use_global_load_dma
265+
// CHECK-SAME: ins(%[[SWIZZLE_B]]
266+
// CHECK: linalg.batch_matmul {{.*}} ins(%[[PA]], %[[PB]]
267+
268+
// -----
269+
#lowering_config = #iree_gpu.lowering_config<{
270+
promote_operands = [0, 1],
271+
promotion_types = [
272+
#iree_gpu.promote_with_cache_swizzle<#iree_gpu.derived_thread_config>,
273+
#iree_gpu.promote_with_cache_swizzle<#iree_gpu.use_global_load_dma>]}>
274+
275+
func.func @promote_with_cache_swizzle_f4_no_stride(%a: tensor<2x34x34x129xf4E2M1FN>, %b: tensor<2x8x256xf4E2M1FN>) -> tensor<2x129x256xf32> {
276+
%cst = arith.constant 0.000000e+00 : f32
277+
%empty = tensor.empty() : tensor<2x129x256xf32>
278+
%im2col_empty = tensor.empty() : tensor<2x129x8xf4E2M1FN>
279+
280+
%im2col = iree_linalg_ext.im2col
281+
strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
282+
m_offset = [0] * [1] k_offset = [0] * [1]
283+
batch_pos = [0] m_pos = [2, 3] k_pos = [1]
284+
input_k_perm = [0, 1, 2]
285+
ins(%a : tensor<2x34x34x129xf4E2M1FN>)
286+
outs(%im2col_empty : tensor<2x129x8xf4E2M1FN>) -> tensor<2x129x8xf4E2M1FN>
287+
288+
%fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<2x129x256xf32>) -> tensor<2x129x256xf32>
289+
%mm = linalg.batch_matmul {lowering_config = #lowering_config}
290+
ins(%im2col, %b : tensor<2x129x8xf4E2M1FN>, tensor<2x8x256xf4E2M1FN>) outs(%fill : tensor<2x129x256xf32>) -> tensor<2x129x256xf32>
291+
return %mm : tensor<2x129x256xf32>
292+
}
293+
294+
// CHECK-LABEL: func.func @promote_with_cache_swizzle_f4_no_stride
295+
// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: tensor<2x34x34x129xf4E2M1FN>
296+
// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: tensor<2x8x256xf4E2M1FN>
297+
// CHECK-DAG: %[[SWIZZLE_A:.+]] = iree_gpu.buffer_resource_cast %[[A]] cacheSwizzleStride(%c0)
298+
// CHECK-DAG: %[[SWIZZLE_B:.+]] = iree_gpu.buffer_resource_cast %[[B]] cacheSwizzleStride(%c128)
219299
// CHECK: %[[PA:.+]] = iree_linalg_ext.im2col
220300
// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
221301
// CHECK-SAME: ins(%[[SWIZZLE_A]]

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,14 +111,14 @@ def IREEGPU_PromoteWithCacheSwizzle :
111111
swizzle inserted if possible. For example,
112112

113113
```
114-
%0 = tensor_ext.dispatch.tensor.load : tensor<?x8192>
114+
%0 = tensor_ext.dispatch.tensor.load : tensor<?x4096xf16>
115115
%1 = linalg.matmul ins(%0, ...)
116116
```
117117

118118
Becomes with `#iree_gpu.promote_with_cache_swizzle<#iree_gpu.derived_thread_config>`
119119

120120
```
121-
%0 = tensor_ext.dispatch.tensor.load : tensor<?x8192>
121+
%0 = tensor_ext.dispatch.tensor.load : tensor<?x4096xf16>
122122
%1 = iree_gpu.buffer_resource_cast cache_swizzle(8192)
123123
%2 = linalg.copy lowering_config = #iree_gpu.derived_thread_config
124124
%3 = linalg.matmul ins(%2, ...)

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/PromotionImpls.cpp

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
1212
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtInterfaces.h"
1313
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
14+
#include "mlir/Dialect/Affine/IR/AffineOps.h"
1415
#include "mlir/Dialect/Arith/Utils/Utils.h"
1516
#include "mlir/Dialect/Linalg/IR/Linalg.h"
1617
#include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -111,14 +112,34 @@ Value cacheSwizzlePromotionImpl(OpBuilder &builder, OpOperand &operand,
111112
}
112113

113114
Location loc = promotedValue.getLoc();
114-
// Use the size of the inner most dimension as the cache swizzle value.
115-
// This is a very rudimentary choice, but functions well enough as a
115+
// Use the size in bytes of the inner most dimension as the cache swizzle
116+
// value. This is a very rudimentary choice, but functions well enough as a
116117
// default.
117-
Value cacheSwizzleVal = getValueOrCreateConstantIndexOp(
118+
AffineExpr s0, s1;
119+
bindSymbols(builder.getContext(), s0, s1);
120+
Value dtype =
121+
arith::ConstantIndexOp::create(
122+
builder, loc, tensorType.getElementType().getIntOrFloatBitWidth())
123+
->getResult(0);
124+
125+
OpFoldResult dim = tensor::getMixedSize(builder, loc, bufferCastValue,
126+
tensorType.getRank() - 1);
127+
Value zero =
128+
getValueOrCreateConstantIntOp(builder, loc, builder.getIndexAttr(0));
129+
Value strideBytes = getValueOrCreateConstantIndexOp(
118130
builder, loc,
119-
tensor::getMixedSize(builder, loc, bufferCastValue,
120-
tensorType.getRank() - 1));
121-
131+
affine::makeComposedFoldedAffineApply(builder, loc, (s0 * s1).ceilDiv(8),
132+
{dim, dtype}));
133+
Value strideBitsMod8 = getValueOrCreateConstantIntOp(
134+
builder, loc,
135+
affine::makeComposedFoldedAffineApply(builder, loc, (s0 * s1) % 8,
136+
{dim, dtype}));
137+
// If the stride in bits is not a multiple of 8, set the value to 0. This will
138+
// be ignored by cacheSwizzleStride.
139+
Value cmp = arith::CmpIOp::create(
140+
builder, loc, mlir::arith::CmpIPredicate::eq, strideBitsMod8, zero);
141+
Value cacheSwizzleVal =
142+
arith::SelectOp::create(builder, loc, cmp, strideBytes, zero).getResult();
122143
// Insert the resource cast optimistically. If the input is not castable
123144
// (e.g. another producer) later patterns will drop it anyway as it is treated
124145
// like a hint.

0 commit comments

Comments
 (0)