
Commit aef6e1f

[GPU] Bail out in GPUReduceBankConflicts if we have a collapse_shape user (#18863)

Padding an allocation that has a memref.collapse_shape user is unsupported
by upstream and can lead to a compiler error: llvm/llvm-project#112994

Progress towards: #18858

Signed-off-by: Nirvedh <[email protected]>
Parent commit: 8ce8bed
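Why padding breaks here: the pass pads the inner dimension of the alloc and then recovers the original shape through a memref.subview, which leaves a strided, non-contiguous layout. A rough sketch of the IR the pass would otherwise produce for the new test case, assuming (for illustration only) the inner dimension were padded from 8 to 12 elements; the actual padded size comes from the pass's heuristics:

  // Hypothetical post-padding IR (illustrative only; not produced after this commit):
  %padded = memref.alloc() : memref<4x2x16x8x12xf32, #gpu.address_space<workgroup>>
  // The original shape is recovered via a subview whose row-major strides
  // come from the padded buffer:
  %view = memref.subview %padded[0, 0, 0, 0, 0] [4, 2, 16, 8, 8] [1, 1, 1, 1, 1]
      : memref<4x2x16x8x12xf32, #gpu.address_space<workgroup>> to
        memref<4x2x16x8x8xf32, strided<[3072, 1536, 96, 12, 1]>, #gpu.address_space<workgroup>>
  // Collapsing the [3, 4] group would fold a dimension of stride 12 with a
  // dimension of size 8 and stride 1; since 8 * 1 != 12 the group is not
  // contiguous, so memref.collapse_shape cannot express the result and
  // compilation fails (llvm/llvm-project#112994):
  // %1 = memref.collapse_shape %view [[0], [1, 2], [3, 4]] : ...  // rejected upstream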

2 files changed (+83, −0)

compiler/src/iree/compiler/Codegen/Common/GPU/GPUReduceBankConflicts.cpp

Lines changed: 23 additions & 0 deletions
@@ -18,6 +18,23 @@ namespace mlir::iree_compiler {
 
 namespace {
 
+/// Check if the AllocOp has a CollapseShapeOp user (looking through view-like ops).
+static bool hasCollapseShapeUser(memref::AllocOp allocOp) {
+  SmallVector<Operation *> users(allocOp->getUsers());
+  while (!users.empty()) {
+    auto user = users.pop_back_val();
+    if (isa<memref::CollapseShapeOp>(user)) {
+      return true;
+    }
+    if (isa<ViewLikeOpInterface>(user)) {
+      for (auto u : user->getUsers()) {
+        users.push_back(u);
+      }
+    }
+  }
+  return false;
+}
+
 /// Pad out the inner dimension of the `memref.alloc` op in order to reduce
 /// the chances of bank conflicts when reading 2D shapes within shared memory.
 static void padAlloc(MLIRContext *context, memref::AllocOp allocOp,
@@ -28,6 +45,12 @@ static void padAlloc(MLIRContext *context, memref::AllocOp allocOp,
   int64_t innerDim = allocOpShape.back();
   if (ShapedType::isDynamic(innerDim))
     return;
+
+  // Return if we have a CollapseShapeOp user, as padding is unsupported in
+  // that case.
+  if (hasCollapseShapeUser(allocOp))
+    return;
+
   Type elType = allocOp.getType().getElementType();
   unsigned bitwidth =
       mlir::DataLayout::closest(allocOp).getTypeSizeInBits(elType);

compiler/src/iree/compiler/Codegen/Common/GPU/test/reduce_bank_conflicts.mlir

Lines changed: 60 additions & 0 deletions
@@ -47,6 +47,66 @@ func.func @pad_alloc_expand_shape(%a: memref<1024x1024xf32>) {
   return
 }
 
+// -----
+// CHECK-LABEL: func.func @no_pad_alloc_collapse_shape
+// CHECK: %[[A:.*]] = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C:.*]] = memref.collapse_shape %[[A]] {{\[}}[0], [1, 2], [3, 4]]
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into
+// CHECK-SAME: memref<4x32x64xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VEC_READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST_0]] {in_bounds = [true]} :
+// CHECK-SAME: memref<1024x1024xf32>, vector<4xf32>
+// CHECK: vector.transfer_write %[[VEC_READ]], %[[C]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true]} :
+// CHECK-SAME: vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+
+
+func.func @no_pad_alloc_collapse_shape(%a: memref<1024x1024xf32>) {
+  %0 = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+  %1 = memref.collapse_shape %0 [[0], [1, 2], [3, 4]]
+      : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into memref<4x32x64xf32, #gpu.address_space<workgroup>>
+  %c0 = arith.constant 0 : index
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %3 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} :
+      memref<1024x1024xf32>, vector<4xf32>
+  vector.transfer_write %3, %1[%c0, %c0, %c0] {in_bounds = [true]} :
+      vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @no_pad_alloc_collapse_shape_throughsubview
+// CHECK: %[[A:.*]] = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[S:.*]] = memref.subview %[[A]][0, 0, 0, 0, 0] [4, 2, 16, 8, 8] [1, 1, 1, 1, 1] :
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> to
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C:.*]] = memref.collapse_shape %[[S]] {{\[}}[0], [1, 2], [3, 4]]
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into
+// CHECK-SAME: memref<4x32x64xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VEC_READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST]] {in_bounds = [true]} :
+// CHECK-SAME: memref<1024x1024xf32>, vector<4xf32>
+// CHECK: vector.transfer_write %[[VEC_READ]], %[[C]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true]} :
+// CHECK-SAME: vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+
+
+func.func @no_pad_alloc_collapse_shape_throughsubview(%a: memref<1024x1024xf32>) {
+  %0 = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+  %subview = memref.subview %0[0, 0, 0, 0, 0] [4, 2, 16, 8, 8] [1, 1, 1, 1, 1]
+      : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> to memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+  %1 = memref.collapse_shape %subview [[0], [1, 2], [3, 4]]
+      : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into memref<4x32x64xf32, #gpu.address_space<workgroup>>
+  %c0 = arith.constant 0 : index
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %3 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} :
+      memref<1024x1024xf32>, vector<4xf32>
+  vector.transfer_write %3, %1[%c0, %c0, %c0] {in_bounds = [true]} :
+      vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+  return
+}
+
 // -----
 
 // CHECK-LABEL: func.func @pad_alloc_negative
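Both new cases use the file's // ----- split-input-file separators, so they are picked up by the file's existing RUN line, which is not shown in this hunk. As a rough sketch of how such a FileCheck test is typically driven, assuming the pass is exposed under the flag --iree-codegen-gpu-reduce-bank-conflicts (an assumption; the authoritative flag is whatever the file's RUN line uses):

  // RUN: iree-opt --split-input-file --iree-codegen-gpu-reduce-bank-conflicts %s | FileCheck %s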
