
Commit 658b5b2

[AMD] Add scheduling hint for attention optimizations (#6290)
This PR cleans up the iglp scheduling variants and introduces a new scheduling variant, `attention`, which groups attention-related optimizations together to improve usability. It includes:

- the `sink-insts-to-avoid-spills` LLVM command-line option, to avoid register spills;
- the `ROCDL::IglpOpt` intrinsic, with a `ROCDL::SchedBarrier` around it, to reschedule instructions; specifically, iglp variant 2 is used to interleave mfma and exp instructions.

This is an experimental feature for now and may change in the future.
1 parent 6fa33ef commit 658b5b2
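For context, a minimal usage sketch (not part of the commit): like other HIP backend options, the renamed `schedule_hint` knob is expected to be accepted as a kernel-launch keyword argument, the same way `num_stages` or `waves_per_eu` are, though whether launch kwargs reach backend options this way depends on the Triton version. The kernel below is hypothetical and shape-only; it just exhibits the chained dot -> exp2 -> dot pattern inside a loop, which is what the new variant targets.

import torch
import triton
import triton.language as tl

@triton.jit
def chained_dot_kernel(q_ptr, k_ptr, v_ptr, o_ptr, N, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    idx = offs[:, None] * BLOCK + offs[None, :]
    q = tl.load(q_ptr + idx)
    acc = tl.zeros((BLOCK, BLOCK), dtype=tl.float32)
    for i in range(0, N, BLOCK):
        k = tl.load(k_ptr + idx + i * BLOCK)    # next K tile
        v = tl.load(v_ptr + idx + i * BLOCK)    # next V tile
        s = tl.dot(q, k)                        # first dot: the chain head
        p = tl.math.exp2(s)                     # exp between the dots, as in attention
        acc += tl.dot(p.to(tl.float16), v)      # second dot consumes the first: the chain tail
    tl.store(o_ptr + idx, acc)

BLOCK, N = 64, 256
q = torch.randn(BLOCK, BLOCK, device="cuda", dtype=torch.float16)
k = torch.randn(N, BLOCK, device="cuda", dtype=torch.float16)
v = torch.randn(N, BLOCK, device="cuda", dtype=torch.float16)
o = torch.empty(BLOCK, BLOCK, device="cuda", dtype=torch.float32)
# 'attention' inserts the iglp-2 hint into the loop with chained dots and turns
# on the sink-insts-to-avoid-spills LLVM flag when the assembly is emitted.
chained_dot_kernel[(1,)](q, k, v, o, N, BLOCK=BLOCK, schedule_hint="attention")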

File tree

8 files changed: +138 −72 lines


test/TritonGPU/amd/amd-instruction-sched.mlir

Lines changed: 0 additions & 7 deletions
@@ -1,14 +1,10 @@
-// RUN: triton-opt %s -triton-amdgpu-insert-instruction-sched-hints="variant=llvm_iglp_0" -triton-amdgpu-lower-insert-instruction-sched-hints -verify-diagnostics | FileCheck %s -check-prefix=INSERT_IGLP0
-// RUN: triton-opt %s -triton-amdgpu-insert-instruction-sched-hints="variant=llvm_iglp_1" -triton-amdgpu-lower-insert-instruction-sched-hints -verify-diagnostics | FileCheck %s -check-prefix=INSERT_IGLP1
 // RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1" -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=1" -triton-amdgpu-insert-instruction-sched-hints="variant=local_prefetch" -tritongpu-reduce-data-duplication -optimize-amd-lds-usage="target-arch=gfx942" -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm="arch=gfx942" -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS1
 // RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1" -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=2" -triton-amdgpu-insert-instruction-sched-hints="variant=local_prefetch" -tritongpu-reduce-data-duplication -optimize-amd-lds-usage="target-arch=gfx942" -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm="arch=gfx942" -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS2
 // RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=16 kPack=1" -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=2" -triton-amdgpu-insert-instruction-sched-hints="variant=local_prefetch" -tritongpu-reduce-data-duplication -optimize-amd-lds-usage="target-arch=gfx942" -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm="arch=gfx942" -triton-amdgpu-lower-insert-instruction-sched-hints="arch=gfx942 num_stages=2" -debug-only="lower-insert-instruction-sched-hints" -verify-diagnostics 2>&1 | FileCheck %s -check-prefix=USE_LOCAL_PREFETCH_GLOBAL_LOAD
 // RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=1" | FileCheck %s -check-prefix=LABELING_PS_1
 // RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=2" | FileCheck %s -check-prefix=LABELING_PS_2

 module {
-  // INSERT_IGLP0-LABEL: @test_dot_op
-  // INSERT_IGLP1-LABEL: @test_dot_op
   // INSTR_COUNT_NS1-LABEL: @test_dot_op
   // INSTR_COUNT_NS2-LABEL: @test_dot_op
   // USE_LOCAL_PREFETCH_GLOBAL_LOAD: @test_dot_op
@@ -44,9 +40,6 @@ module {
     %a = tt.load %a_ptr : tensor<128x32x!tt.ptr<f16>>
     %b = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr<f16>>

-    // INSERT_IGLP0: rocdl.iglp.opt 0
-    // INSERT_IGLP1: rocdl.iglp.opt 1
-
     // INSTR_COUNT_NS1: amdgpu.instruction_sched_hint
     // INSTR_COUNT_NS1-SAME: isBufferLoadsAEnabled = false
     // INSTR_COUNT_NS1-SAME: isBufferLoadsBEnabled = false
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+// RUN: triton-opt %s -split-input-file -triton-amdgpu-insert-instruction-sched-hints="variant=attention" | FileCheck %s -check-prefix=INSTR_HINT
+// RUN: triton-opt %s -split-input-file -triton-amdgpu-insert-instruction-sched-hints="variant=attention" -triton-amdgpu-lower-insert-instruction-sched-hints -verify-diagnostics | FileCheck %s -check-prefix=LOWER_HINT
+
+#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [8, 8], warpsPerCTA = [2, 4], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [32, 32], isTransposed = true}>
+#dot_op_a = #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>
+#dot_op_b = #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>
+// INSTR_HINT-LABEL: @insert_schedule_hint
+// LOWER_HINT-LABEL: @insert_schedule_hint
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @insert_schedule_hint(
+      %lb : index, %ub : index, %step : index,
+      %arg0: tensor<128x128xf32, #dot_op_a>,
+      %arg1: tensor<128x128xf32, #dot_op_b>,
+      %arg2: tensor<128x128x!tt.ptr<f32>, #blocked>
+  ) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
+    // INSTR_HINT: scf.for
+    // INSTR_HINT-NEXT: amdgpu.instruction_sched_hint
+    // INSTR_HINT-SAME: variant = #amdgpu.SchedHintVariant<attention>
+
+    // LOWER_HINT: scf.for
+    // LOWER_HINT-NEXT: rocdl.sched.barrier 0
+    // LOWER_HINT-COUNT-2: tt.dot
+    // LOWER_HINT: rocdl.iglp.opt 2
+    // LOWER_HINT-NEXT: rocdl.sched.barrier 0
+    // LOWER_HINT-NEXT: scf.yield
+    %loop = scf.for %iv = %lb to %ub step %step iter_args(%c = %cst) -> (tensor<128x128xf32, #mma>) {
+      %4 = tt.dot %arg0, %arg1, %c : tensor<128x128xf32, #dot_op_a> * tensor<128x128xf32, #dot_op_b> -> tensor<128x128xf32, #mma>
+      %5 = math.exp2 %4 : tensor<128x128xf32, #mma>
+      %6 = ttg.convert_layout %5 : tensor<128x128xf32, #mma> -> tensor<128x128xf32, #dot_op_a>
+      %7 = tt.dot %6, %arg1, %c : tensor<128x128xf32, #dot_op_a> * tensor<128x128xf32, #dot_op_b> -> tensor<128x128xf32, #mma>
+      scf.yield %7 : tensor<128x128xf32, #mma>
+    }
+    %8 = ttg.convert_layout %loop : tensor<128x128xf32, #mma> -> tensor<128x128xf32, #blocked>
+    tt.store %arg2, %8 : tensor<128x128x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}

third_party/amd/backend/compiler.py

Lines changed: 16 additions & 11 deletions
@@ -53,16 +53,14 @@ class HIPOptions:
     #
     # Current experimental scheduling variants:
     #
-    # llvm-iglp-0: injects `llvm.amdgcn.iglp_opt` intrinsic call with value `0` to the GEMM's
-    #              k-loop; i.e., "interleave DS and MFMA instructions for small GEMM kernels".
-    # llvm-iglp-1: injects `llvm.amdgcn.iglp_opt` intrinsic call with value `1` to the GEMM's
-    #              k-loop; i.e., "interleave DS and MFMA instructions for single wave small
-    #              GEMM kernels.".
     # local-prefetch: implements instruction scheduling similar to the one from the ROCm Composable
     #                 Kernel library. Note, this variant requires the use of buffer load/store ops
     #                 and a special software pipelining style - i.e., 1x LDS and 1x register
     #                 prefetch buffers for each GEMM tile.
-    instruction_sched_variant: str = 'none'
+    # attention: enables a bunch of optimizations for attention kernels, including:
+    #            - iglp 2 and sched.barrier around it
+    #            - sink-insts-to-avoid-spills flag to avoid register spills
+    schedule_hint: str = 'none'

     def __post_init__(self):
         default_libdir = Path(__file__).parent / 'lib'
@@ -242,7 +240,7 @@ def make_ttgir(mod, metadata, options):
     use_async_copy = int(os.getenv("TRITON_HIP_USE_ASYNC_COPY", "0")) == 1

     # The `local-prefetch` scheduling variant requires turning on buffer ops.
-    if options.instruction_sched_variant == "local-prefetch":
+    if options.schedule_hint == "local-prefetch":
         global_prefetch = local_prefetch = 1

     if amd.has_matrix_core_feature(options.arch):
@@ -256,8 +254,8 @@ def make_ttgir(mod, metadata, options):
         if use_async_copy:
             amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch)
     passes.common.add_canonicalizer(pm)
-    if options.instruction_sched_variant.lower() != "none":
-        amd.passes.ttgpuir.insert_instruction_sched_hints(pm, options.instruction_sched_variant)
+    if options.schedule_hint.lower() != "none":
+        amd.passes.ttgpuir.insert_instruction_sched_hints(pm, options.schedule_hint)
     passes.ttgpuir.add_optimize_dot_operands(pm, True)
     passes.ttgpuir.add_remove_layout_conversions(pm)
     passes.ttgpuir.add_reduce_data_duplication(pm)
@@ -314,7 +312,7 @@ def make_llir(src, metadata, options):
     passes.common.add_canonicalizer(pm)
     passes.common.add_cse(pm)
     passes.common.add_symbol_dce(pm)
-    if options.instruction_sched_variant.lower() != "none":
+    if options.schedule_hint.lower() != "none":
         amd.passes.ttgpuir.lower_instruction_sched_hints(pm, options.arch, options.num_stages)
     if os.environ.get("TRITON_DISABLE_LINE_INFO", "0") == "0":
         passes.llvmir.add_di_scope(pm)
@@ -396,7 +394,14 @@ def make_amdgcn(src, metadata, options):
     assert len(names) == 1
     metadata["name"] = names[0]
     # llvm -> hsaco
-    amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, '', [], options.enable_fp_fusion, False)
+    flags = []
+    # The sink-insts-to-avoid-spills flag asks LLVM backend to sink instructions
+    # into loops to avoid register spills in the MachineSinking pass, while it
+    # can also lead to regression in some cases. But from current observation,
+    # the regression is not significant. It would be better to have some heuristics.
+    if options.schedule_hint == 'attention':
+        flags.append('sink-insts-to-avoid-spills')
+    amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, '', flags, options.enable_fp_fusion, False)
     if os.environ.get("AMDGCN_ENABLE_DUMP", "0") == "1":
         print("// -----// AMDGCN Dump //----- //")
         print(amdgcn)
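A quick way to check that the hint and the new flag actually took effect is the existing `AMDGCN_ENABLE_DUMP` escape hatch that `make_amdgcn` reads above. A small sketch follows; the expected instruction mix is an assumption based on the iglp 2 description, not a guaranteed output.

import os

# Must be set before the kernel is first compiled; make_amdgcn (above) checks it
# and prints the final AMDGCN assembly.
os.environ["AMDGCN_ENABLE_DUMP"] = "1"

# ... compile/launch a kernel with schedule_hint="attention" (e.g. the sketch
# near the top of this page) ...
#
# With the hint active, v_mfma_* and v_exp_f32 instructions should appear
# interleaved inside the loop body; the rocdl.sched.barrier / rocdl.iglp.opt
# pair exists only at the LLVM IR level and leaves no marker in the assembly.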

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUAttrDefs.td

Lines changed: 3 additions & 5 deletions
@@ -73,16 +73,14 @@ class TritonAMDGPU_I32EnumAttr<string mnemonic, TritonAMDGPU_I32Enum enumInfo> :
 }

 def SchedHintCaseNone : I32EnumAttrCase<"none", 0>;
-def SchedHintCaseLLVMIglp0 : I32EnumAttrCase<"llvm_iglp_0", 1>;
-def SchedHintCaseLLVMIglp1 : I32EnumAttrCase<"llvm_iglp_1", 2>;
-def SchedHintCaseLocalPrefetch : I32EnumAttrCase<"local_prefetch", 3>;
+def SchedHintCaseLocalPrefetch : I32EnumAttrCase<"local_prefetch", 1>;
+def SchedHintCaseAttention : I32EnumAttrCase<"attention", 2>;

 def TritonAMDGPU_SchedHintsEnum : TritonAMDGPU_I32Enum<
   "SchedHint", "Instruction Scheduling Hints for AMD GPUs", [
     SchedHintCaseNone,
-    SchedHintCaseLLVMIglp0,
-    SchedHintCaseLLVMIglp1,
     SchedHintCaseLocalPrefetch,
+    SchedHintCaseAttention,
   ]>;

 def TritonAMDGPU_SchedHintVariantAttr :

third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.cpp

Lines changed: 31 additions & 6 deletions
@@ -1,6 +1,7 @@
 #include "SchedInstructions.h"
 #include "TritonAMDGPUToLLVM/Passes.h"
 #include "TritonAMDGPUToLLVM/TargetUtils.h"
+#include "Utility.h"
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -19,6 +20,7 @@ namespace mlir::triton {
 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

 using namespace mlir;
+using ::mlir::LLVM::AMD::isChainDotHead;

 // TODO: The following passes/algorithms are applicable only for a single
 // `tt.dot` op in a `scf.for` block -i.e., a single schedule hint op per block.
@@ -425,7 +427,8 @@ struct InstructionSchedHintsRewriter
     // not supposed to be used together with IGLP OPT according to the AMDGPU
     // backend documentation.
     const bool limitSchedulingRange =
-        schedVariant == mlir::triton::amdgpu::SchedHint::local_prefetch;
+        schedVariant == mlir::triton::amdgpu::SchedHint::local_prefetch ||
+        schedVariant == mlir::triton::amdgpu::SchedHint::attention;
     ;
     Location loc = instructionSchedHint->getLoc();
     Block *block = instructionSchedHint->getBlock();
@@ -438,13 +441,12 @@ struct InstructionSchedHintsRewriter
     rewriter.setInsertionPoint(block, std::prev(block->end()));

     switch (schedVariant) {
-    case mlir::triton::amdgpu::SchedHint::llvm_iglp_0:
-    case mlir::triton::amdgpu::SchedHint::llvm_iglp_1:
-      createIglpOpt(rewriter, loc, static_cast<int>(schedVariant) - 1);
-      break;
     case mlir::triton::amdgpu::SchedHint::local_prefetch:
       createLocalPrefetchSchedule(rewriter, loc, instructionSchedHint);
       break;
+    case mlir::triton::amdgpu::SchedHint::attention:
+      createIglpOpt(rewriter, loc, 2);
+      break;
     case mlir::triton::amdgpu::SchedHint::none:
     default:
       break;
@@ -520,7 +522,8 @@ struct TritonAMDGPUInsertInstructionSchedHints
       return;
     }

-    if (schedHint != mlir::triton::amdgpu::SchedHint::none) {
+    switch (schedHint) {
+    case mlir::triton::amdgpu::SchedHint::local_prefetch:
       mod.walk([&](scf::ForOp forOp) {
         // Note, instruction schedule barriers are inserted only in the case of
         // a single `tt.dot` op in a `scf::ForOp` scope in the current
@@ -532,6 +535,28 @@ struct TritonAMDGPUInsertInstructionSchedHints
                                                                 schedHint);
         }
       });
+      break;
+    case mlir::triton::amdgpu::SchedHint::attention:
+      mod.walk([&](scf::ForOp forOp) {
+        // The attention schedule hint is inserted to the beginning of a
+        // for-loop with chained dots.
+        auto result = forOp->walk([](triton::DotOp op) {
+          if (isChainDotHead(op))
+            return WalkResult::interrupt();
+          return WalkResult::advance();
+        });
+
+        if (result.wasInterrupted()) {
+          OpBuilder rewriter(ctx);
+          rewriter.setInsertionPointToStart(forOp.getBody());
+          rewriter.create<triton::amdgpu::InstructionSchedHint>(forOp->getLoc(),
+                                                                schedHint);
+        }
+      });
+      break;
+    case mlir::triton::amdgpu::SchedHint::none:
+    default:
+      break;
     }
   }
 };

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp

Lines changed: 40 additions & 0 deletions
@@ -8,6 +8,7 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

+namespace tt = mlir::triton;
 using mlir::triton::ModuleAxisInfoAnalysis;
 using mlir::triton::AMD::DppCtrl;
 using mlir::triton::AMD::ISAFamily;
@@ -641,4 +642,43 @@ bool isUsedByDotScaledOp(Operation *op) {
   });
 }

+bool isChainDotHead(tt::DotOpInterface dotOp) {
+  auto isInSameRegion = [&dotOp](Operation *op) {
+    return op->getParentRegion() == dotOp->getParentRegion();
+  };
+  ForwardSliceOptions fwdOpt;
+  fwdOpt.filter = isInSameRegion;
+  SetVector<mlir::Operation *> fwdSlices;
+  getForwardSlice(dotOp, &fwdSlices, fwdOpt);
+  for (Operation *op : fwdSlices) {
+    if (auto dOp = dyn_cast<tt::DotOpInterface>(op)) {
+      assert(dOp != dotOp);
+      auto opA = dOp.getA().getDefiningOp();
+      if (opA && fwdSlices.contains(opA)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool isChainDotTail(tt::DotOpInterface dotOp) {
+  auto isInSameRegion = [&dotOp](Operation *op) {
+    return op->getParentRegion() == dotOp->getParentRegion();
+  };
+  BackwardSliceOptions bwdOpt;
+  bwdOpt.omitBlockArguments = true;
+  bwdOpt.filter = isInSameRegion;
+  SetVector<Operation *> bwdSlices;
+  Operation *opA = dotOp.getA().getDefiningOp();
+  if (!opA)
+    return false;
+  getBackwardSlice(opA, &bwdSlices, bwdOpt);
+  if (llvm::find_if(bwdSlices, [](Operation *op) {
+        return isa<tt::DotOpInterface>(op);
+      }) != bwdSlices.end())
+    return true;
+  return false;
+}
+
 } // namespace mlir::LLVM::AMD

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h

Lines changed: 7 additions & 0 deletions
@@ -95,6 +95,13 @@ bool canCoalesceWriteIntoSharedMemory(RewriterBase &rewriter,
 // Return true if op is used by DotScaledOp or UpcastMXFPOp ops.
 bool isUsedByDotScaledOp(Operation *op);

+// Check if the result of this tl.dot is used as opA of another tl.dot
+// in the same region
+bool isChainDotHead(mlir::triton::DotOpInterface dotOp);
+
+// Check if the opA of this tl.dot is the result of another tl.dot
+// in the same region
+bool isChainDotTail(mlir::triton::DotOpInterface dotOp);
 } // namespace mlir::LLVM::AMD

 #endif // TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_UTILITY_H_

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 2 additions & 43 deletions
@@ -15,6 +15,8 @@
 using namespace mlir;
 namespace tt = mlir::triton;
 namespace ttg = mlir::triton::gpu;
+using ::mlir::LLVM::AMD::isChainDotHead;
+using ::mlir::LLVM::AMD::isChainDotTail;
 using ::mlir::LLVM::AMD::scaleDotElemTypeToMLIRType;
 using mlir::triton::gpu::chooseScaledMfmaScaleLayout;

@@ -55,49 +57,6 @@ FailureOr<ScaleDotElemType> mlirTypeToScaledElemType(Type type) {
       .Default([](Type) { return failure(); });
 }

-// Check if the result of this tl.dot is used as opA of another tl.dot
-// in the same region
-bool isChainDotHead(tt::DotOpInterface dotOp) {
-  auto isInSameRegion = [&dotOp](Operation *op) {
-    return op->getParentRegion() == dotOp->getParentRegion();
-  };
-  ForwardSliceOptions fwdOpt;
-  fwdOpt.filter = isInSameRegion;
-  SetVector<mlir::Operation *> fwdSlices;
-  getForwardSlice(dotOp, &fwdSlices, fwdOpt);
-  for (Operation *op : fwdSlices) {
-    if (auto dOp = dyn_cast<tt::DotOpInterface>(op)) {
-      assert(dOp != dotOp);
-      auto opA = dOp.getA().getDefiningOp();
-      if (opA && fwdSlices.contains(opA)) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-// Check if the opA of this tl.dot is the result of another tl.dot
-// in the same region
-bool isChainDotTail(tt::DotOpInterface dotOp) {
-  auto isInSameRegion = [&dotOp](Operation *op) {
-    return op->getParentRegion() == dotOp->getParentRegion();
-  };
-  BackwardSliceOptions bwdOpt;
-  bwdOpt.omitBlockArguments = true;
-  bwdOpt.filter = isInSameRegion;
-  SetVector<Operation *> bwdSlices;
-  Operation *opA = dotOp.getA().getDefiningOp();
-  if (!opA)
-    return false;
-  getBackwardSlice(opA, &bwdSlices, bwdOpt);
-  if (llvm::find_if(bwdSlices, [](Operation *op) {
-        return isa<tt::DotOpInterface>(op);
-      }) != bwdSlices.end())
-    return true;
-  return false;
-}
-
 SmallVector<unsigned, 3>
 warpsPerTile(Operation *dotOp, ArrayRef<int64_t> shape, int numWarps,
              std::pair<int64_t, int64_t> shapePerWarp) {
