
Commit 746ad1e

[GPU] Add C promotion capability in promote matmul operands pass (#19256)
This PR sets up the convention that when the operand index for promotion is beyond the DPS inputs, we promote the tied result of the corresponding DPS init. Result promotion is implemented in this PR.

Co-authored-by: Quinn Dawkins <[email protected]>
Signed-off-by: Nirvedh <[email protected]>
1 parent cef4178 commit 746ad1e
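
For illustration, here is a minimal MLIR sketch of the new convention, based on the promote_result lit test added in this commit. Value names (%a, %b, %fill, %d0, %d1, %empty) and the exact attribute printing are illustrative assumptions, not verbatim pass output. With promote_operands = [2] on a matmul that has two DPS inputs, index 2 falls past the inputs, so the result tied to the single DPS init is promoted:

  // Input: index 2 is beyond the two DPS inputs, so it selects the matmul result.
  %mm = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{promote_operands = [2]}>}
          ins(%a, %b : tensor<?x?xf32>, tensor<?x?xf32>)
          outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>

  // After the pass (sketch): the result is first staged in a workgroup-memory
  // allocation, then copied out again with a derived thread config.
  %alloc = bufferization.alloc_tensor(%d0, %d1) {memory_space = #gpu.address_space<workgroup>} : tensor<?x?xf32>
  %staged = linalg.copy ins(%mm : tensor<?x?xf32>) outs(%alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
  %promoted = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
                ins(%staged : tensor<?x?xf32>) outs(%empty : tensor<?x?xf32>) -> tensor<?x?xf32>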

4 files changed, +221 −9 lines changed


compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp

Lines changed: 96 additions & 9 deletions
@@ -9,8 +9,12 @@
 #include "iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h"
 #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -25,11 +29,83 @@ namespace mlir::iree_compiler {
 #include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

 namespace {
+/// Helper to insert a copy with a derived thread config.
+Value promoteValue(OpBuilder &builder, Location loc, Value v) {
+  auto tensorType = cast<RankedTensorType>(v.getType());
+  SmallVector<OpFoldResult> mixedSizes = tensor::getMixedSizes(builder, loc, v);
+  Value empty = builder.create<tensor::EmptyOp>(loc, mixedSizes,
+                                                tensorType.getElementType());
+  auto copy = builder.create<linalg::CopyOp>(loc, v, empty);
+  setLoweringConfig(
+      copy, IREE::GPU::DerivedThreadConfigAttr::get(builder.getContext()));
+  return copy.getResult(0);
+}
+
+/// Helper to promote results. If the target value is consumed only by a
+/// `tensor.extract_slice`, this will promote the result of the slice instead.
+void promoteResult(OpBuilder &builder, Operation *op, Value valToMakeShared) {
+  IRRewriter rewriter(builder);
+  Location loc = op->getLoc();
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPointAfterValue(valToMakeShared);
+  tensor::ExtractSliceOp extractSliceOp;
+  SetVector<Operation *> opsToReplaceUseIn;
+  Value valueToReplace = valToMakeShared;
+  for (auto user : valToMakeShared.getUsers()) {
+    extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(user);
+    if (extractSliceOp) {
+      // If the result is consumed by an extract_slice, we expect there to
+      // be exactly one extract_slice that is then consumed.
+      // TODO(nirvedhmeshram): This is a fairly special case. Instead we should
+      // just promote results before doing the padding that introduces the
+      // extract_slice.
+      if (!valToMakeShared.hasOneUse())
+        return;
+      valueToReplace = extractSliceOp.getResult();
+      for (auto user : extractSliceOp->getUsers()) {
+        opsToReplaceUseIn.insert(user);
+      }
+      break;
+    }
+    opsToReplaceUseIn.insert(user);
+  }
+  auto tensorType = cast<RankedTensorType>(valToMakeShared.getType());
+  if (!tensorType) {
+    return;
+  }
+  SmallVector<Value> dynamicSizes;
+  for (auto [idx, size] : llvm::enumerate(tensorType.getShape())) {
+    if (ShapedType::isDynamic(size)) {
+      dynamicSizes.push_back(
+          rewriter.create<tensor::DimOp>(loc, valToMakeShared, idx));
+    }
+  }
+  Attribute addressSpace = gpu::AddressSpaceAttr::get(
+      rewriter.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());
+  auto alloc = rewriter.create<bufferization::AllocTensorOp>(loc, tensorType,
+                                                             dynamicSizes);
+  alloc.setMemorySpaceAttr(addressSpace);
+  auto copy =
+      rewriter.create<linalg::CopyOp>(loc, valToMakeShared, alloc.getResult());
+
+  Value replacement = copy.getResult(0);
+  // If an extract_slice is present, we make it consume the new copy.
+  if (extractSliceOp) {
+    extractSliceOp.getSourceMutable().assign(replacement);
+    replacement = valueToReplace;
+  }
+
+  rewriter.setInsertionPointAfterValue(replacement);
+  replacement = promoteValue(rewriter, loc, replacement);
+  valueToReplace.replaceUsesWithIf(replacement, [&](OpOperand &use) {
+    return opsToReplaceUseIn.contains(use.getOwner());
+  });
+}

 /// Inserts a `linalg.copy` directly before the given operation on the
 /// specified operand, for example with operand index = 1:
 ///
-///   linalg.matmul ins(%0, %1)
+///   %2 = linalg.matmul ins(%0, %1)
 ///
 /// becomes
 ///
@@ -41,7 +117,24 @@ namespace {
 /// If the producer is already a tilable op, the producer is just annotated with
 /// #iree_gpu.derived_thread_config to indicate that it should be distributed
 /// to threads independently of the matmul.
+/// Additionally, we can also promote results, so in the above example, for
+/// index = 2, we will generate:
+///   %out_buffer = bufferization.alloc_tensor
+///   %copy1 = linalg.copy %2 to %out_buffer
+///   %copy2 = linalg.copy %copy1 to %empty {
+///     lowering_config = #iree_gpu.derived_thread_config}
 void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) {
+  auto dpsOp = dyn_cast<DestinationStyleOpInterface>(op);
+  if (!dpsOp)
+    return;
+  // We use the convention that if the index passed is beyond the inputs,
+  // then we promote the result of the corresponding dps init.
+  if (index >= dpsOp.getNumDpsInputs()) {
+    index -= dpsOp.getNumDpsInputs();
+    assert(index < op->getNumResults() &&
+           "trying to promote an out-of-bounds result index");
+    return promoteResult(builder, op, op->getResult(index));
+  }
   Value operand = op->getOperand(index);

   if (auto producer = operand.getDefiningOp<TilingInterface>()) {
@@ -70,14 +163,8 @@ void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) {
     return;
   }

-  SmallVector<OpFoldResult> mixedSizes =
-      tensor::getMixedSizes(builder, op->getLoc(), operand);
-  Value empty = builder.create<tensor::EmptyOp>(op->getLoc(), mixedSizes,
-                                                tensorType.getElementType());
-  auto copy = builder.create<linalg::CopyOp>(op->getLoc(), operand, empty);
-  setLoweringConfig(
-      copy, IREE::GPU::DerivedThreadConfigAttr::get(builder.getContext()));
-  op->setOperand(index, copy.getResult(0));
+  auto replacement = promoteValue(builder, op->getLoc(), operand);
+  op->setOperand(index, replacement);
 }

 struct GPUPromoteMatmulOperandsPass final

compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td

Lines changed: 2 additions & 0 deletions
@@ -161,6 +161,8 @@ def GPUPromoteMatmulOperandsPass :
   let summary = "Pass to insert copies with a different thread configuration "
                 "on matmul operands";
   let dependentDialects = [
+    "::mlir::bufferization::BufferizationDialect",
+    "::mlir::gpu::GPUDialect",
     "::mlir::linalg::LinalgDialect",
     "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect"
   ];

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir

Lines changed: 51 additions & 0 deletions
@@ -106,3 +106,54 @@ func.func @promote_pad(%a : tensor<4x127xf32>, %b: tensor<128x128xf32>) -> tenso
 // CHECK: linalg.copy
 // CHECK-SAME: derived_thread_config
 // CHECK: return
+
+// -----
+
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [2]}>
+func.func @promote_result(%a : tensor<?x?xf32>, %b : tensor<?x?xf32>, %mdim : index, %ndim : index) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty(%mdim, %ndim) : tensor<?x?xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+    ins(%a, %b : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %mm : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: func @promote_result(
+// CHECK: %[[MATMUL:.+]] = linalg.matmul
+// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor
+// CHECK: %[[COPY1:.+]] = linalg.copy
+// CHECK-SAME: ins(%[[MATMUL]] : tensor<?x?xf32>) outs(%[[ALLOC]] : tensor<?x?xf32>)
+// CHECK-SAME: -> tensor<?x?xf32>
+// CHECK: %[[COPY2:.+]] = linalg.copy
+// CHECK-SAME: {lowering_config = #iree_gpu.derived_thread_config}
+// CHECK-SAME: ins(%[[COPY1]] : tensor<?x?xf32>)
+// CHECK: return %[[COPY2]] : tensor<?x?xf32>
+
+// -----
+
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [2]}>
+func.func @promote_padded_result(%a : tensor<?x?xf32>, %b : tensor<?x?xf32>, %mdim : index, %ndim : index, %pad : index, %slice : index) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty(%mdim, %ndim) : tensor<?x?xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %padded_fill = tensor.pad %fill low[0, 0] high[%pad, %pad] {
+  ^bb0(%arg3: index, %arg4: index):
+    tensor.yield %cst : f32
+  } : tensor<?x?xf32> to tensor<?x?xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+    ins(%a, %b : tensor<?x?xf32>, tensor<?x?xf32>) outs(%padded_fill : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %mm_slice = tensor.extract_slice %mm [0, 0] [%slice, %slice] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  return %mm_slice : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: func @promote_padded_result(
+// CHECK: %[[MATMUL:.+]] = linalg.matmul
+// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor
+// CHECK: %[[COPY1:.+]] = linalg.copy
+// CHECK-SAME: ins(%[[MATMUL]] : tensor<?x?xf32>) outs(%[[ALLOC]] : tensor<?x?xf32>)
+// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[COPY1]]
+// CHECK: %[[COPY2:.+]] = linalg.copy
+// CHECK-SAME: {lowering_config = #iree_gpu.derived_thread_config}
+// CHECK-SAME: ins(%[[EXTRACT]] : tensor<?x?xf32>)
+// CHECK: return %[[COPY2]] : tensor<?x?xf32>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 72 additions & 0 deletions
@@ -1019,3 +1019,75 @@ hal.executable public @main {
 // CHECK: scf.for
 // CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32>
 // CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type<storage_buffer>>
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [64, 64, 0],
+  reduction = [0, 0, 2],
+  subgroup = [2, 2],
+  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+  promote_operands = [0, 1, 2]
+}>
+hal.executable public @main {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @matmul_transpose_b_promote_result ordinal(0) layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @matmul_transpose_b_promote_result()
+          attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [128, 2, 1] subgroup_size = 64>} {
+        %cst = arith.constant 0.000000e+00 : f16
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
+        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>> -> tensor<2048x1280xf16>
+        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>> -> tensor<10240x1280xf16>
+        %5 = tensor.empty() : tensor<2048x10240xf32>
+        %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32>
+        %7 = linalg.matmul_transpose_b {lowering_config = #config}
+          ins(%3, %4 : tensor<2048x1280xf16>, tensor<10240x1280xf16>)
+          outs(%6 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32>
+        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @matmul_transpose_b_promote_result
+// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+// CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
+// CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
+// CHECK-DAG: memref.alloc() : memref<4x16x4x16xf32, #gpu.address_space<workgroup>>
+// CHECK: scf.forall ({{.*}}) in (32, 160) {
+// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
+// CHECK: gpu.barrier
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
+// CHECK: gpu.barrier
+// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16>
+// CHECK-COUNT-4: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK: scf.yield
+// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 1, 3] : vector<2x2x4x1xf32> to vector<2x4x2x1xf32>
+// CHECK: vector.transfer_write %[[LOOP_T]]
+// CHECK: scf.for {{.*}} {
+// CHECK: %[[SHARED_READ:.+]] = vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<4xf32>
+// CHECK: vector.transfer_write %[[SHARED_READ]], %[[B2]]
+// CHECK: }
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
