
Commit d0e80f3

Annotate load containing chained dot operations (#4923)
When a loop contains chained dot operations (the result of one dot operation is used by another), an attribute is added to the load so that subsequent passes can query it.

Signed-off-by: Tiotto, Ettore <[email protected]>
Co-authored-by: Whitney Tsang <[email protected]>
1 parent 6994a1a commit d0e80f3
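For context, "chained dot operations" means a loop body in which the result of one tl.dot is consumed by another tl.dot, the shape of the flash attention inner loop. A minimal hedged sketch of the pattern (kernel name, shapes, and pointer arithmetic are illustrative only, not taken from this commit):

import triton
import triton.language as tl


@triton.jit
def chained_dot_kernel(q_ptr, k_ptr, v_ptr, out_ptr, N_CTX: tl.constexpr, BLOCK: tl.constexpr):
    offs_m = tl.arange(0, BLOCK)
    offs_n = tl.arange(0, BLOCK)
    # The Q tile is loaded once, outside the loop.
    q = tl.load(q_ptr + offs_m[:, None] * BLOCK + offs_n[None, :])
    acc = tl.zeros((BLOCK, BLOCK), dtype=tl.float32)
    for start in range(0, N_CTX, BLOCK):
        # Loads inside the loop feed chained dots; per the commit message these
        # are the loads that would be annotated so later passes can query the attribute.
        k = tl.load(k_ptr + (start + offs_m[:, None]) * BLOCK + offs_n[None, :])
        v = tl.load(v_ptr + (start + offs_m[:, None]) * BLOCK + offs_n[None, :])
        s = tl.dot(q, tl.trans(k))       # first dot (B operand is transposed)
        acc += tl.dot(s.to(v.dtype), v)  # its result is used by a second dot
    tl.store(out_ptr + offs_m[:, None] * BLOCK + offs_n[None, :], acc)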

14 files changed, +57 -42 lines changed

benchmarks/triton_kernels_benchmark/flash_attention_benchmark.py

Lines changed: 1 addition & 1 deletion
@@ -158,7 +158,7 @@ def _attn_fwd_with_block_pointers(Q, K, V, sm_scale, M, Out, #
 configs = [
-    triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large', 'one_matrix_per_load_for_bt': True}, num_stages=s, num_warps=w) \
+    triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large'}, num_stages=s, num_warps=w) \
     for BM in [128, 256] \
     for BN in [32, 64] \
     for s in [2, 3, 4] \

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_INTEL_ENABLE_FIRST_LOAD_TO_SLM",
     "TRITON_INTEL_ENABLE_INSTR_SCHED",
     "TRITON_INTEL_FAST_MATH",
+    "TRITON_INTEL_ONE_MATRIX_PER_LOAD_BT",
     "TRITON_INTEL_REDUCE_TRANSPOSE",
     // clang-format on
 };
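With the compiler option removed (see third_party/intel/backend/compiler.py below), this environment variable becomes the manual override, and when it is unset the lowering falls back to the new per-load attribute (see the LoadStoreOpToLLVM.cpp hunk). A hedged usage sketch, assuming the variable is read at compile time like the other cache-invalidating variables in this list:

import os

# Assumption: set before the kernel is compiled; "1" forces the
# one-matrix-per-load path for transposed B loads, "0" suppresses it even
# when a load carries the ttig.one_matrix_per_load attribute.
os.environ["TRITON_INTEL_ONE_MATRIX_PER_LOAD_BT"] = "1"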

scripts/flash_attention.py

Lines changed: 2 additions & 5 deletions
@@ -42,11 +42,8 @@ def get_configs(options):
     warps_values = options.warps if options.warps else [8, 16, 32]
     split_barriers_scope = options.split_barriers_scope if options.split_barriers_scope else 'None'
     return [
-        triton.Config(
-            {
-                'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large', 'one_matrix_per_load_for_bt': True,
-                'split_barriers_scope': split_barriers_scope
-            }, num_stages=s, num_warps=w)
+        triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large', 'split_barriers_scope': split_barriers_scope},
+                      num_stages=s, num_warps=w)
         for BM in bm_values
         for BN in bn_values
         for s in stages_values

test/TritonIntelGPU/blockptr_load.mlir

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,LARGE-BLOCK-SIZE-TRANS-B
-// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm=one_matrix_per_load_for_bt=1 | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,SMALL-BLOCK-SIZE-TRANS-B
+// RUN: TRITON_INTEL_ONE_MATRIX_PER_LOAD_BT=0 triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,LARGE-BLOCK-SIZE-TRANS-B
+// RUN: TRITON_INTEL_ONE_MATRIX_PER_LOAD_BT=1 triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,SMALL-BLOCK-SIZE-TRANS-B
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 4], order = [1, 0]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 2], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>

test/TritonIntelGPU/subgroup-2d-block-io.mlir

Lines changed: 2 additions & 3 deletions
@@ -1,6 +1,5 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --check-prefixes=STD-CHECK,CHECK
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm=one_matrix_per_load_for_bt=1 | FileCheck %s --check-prefixes=ONE-MATRIX-CHECK
-
+// RUN: TRITON_INTEL_ONE_MATRIX_PER_LOAD_BT=0 triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --check-prefixes=STD-CHECK,CHECK
+// RUN: TRITON_INTEL_ONE_MATRIX_PER_LOAD_BT=1 triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --check-prefixes=ONE-MATRIX-CHECK
 
 // COM: A matrix, 16x16 block size, 1 warp w/ repCluster=1
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1]}>

third_party/intel/backend/compiler.py

Lines changed: 1 addition & 3 deletions
@@ -41,7 +41,6 @@ class XPUOptions:
     sanitize_overflow: bool = False
     generate_native_code: bool = False
     advanced_path: bool = False
-    one_matrix_per_load_for_bt: bool = False
     enable_tile_load_linear_layout: bool = True
 
     def __post_init__(self):
@@ -317,8 +316,7 @@ def make_llir(src, metadata, options):
     if not knobs.intel.reduce_transpose:
         intel.passes.ttgpuir.add_allocate_shared_memory(pm)
     passes.ttgpuir.add_allocate_global_scratch_memory(pm)
-    intel.passes.ttgpuir.add_to_llvmir(pm, options.advanced_path, options.one_matrix_per_load_for_bt,
-                                       options.enable_tile_load_linear_layout)
+    intel.passes.ttgpuir.add_to_llvmir(pm, options.advanced_path, options.enable_tile_load_linear_layout)
     intel.passes.ttgpuir.add_gen_to_llvm(pm)
     passes.common.add_canonicalizer(pm)
     intel.passes.ttgpuir.add_rewrite_stack_ptr(pm)

third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td

Lines changed: 7 additions & 0 deletions
@@ -62,6 +62,13 @@ def TritonIntelGPU_Dialect : Dialect {
     static constexpr llvm::StringRef getSupport16BitAtomicsAttrName() {
      return "ttig.support_16bit_atomics";
    }
+
+    /// FIXME: Remove once IGC can split large 2D block loads.
+    /// Get the name of the attribute used to indicate that a load operation
+    /// should use 'one matrix per load'.
+    static constexpr llvm::StringRef getOneMatrixPerLoadAttrName() {
+      return "ttig.one_matrix_per_load";
+    }
  }];
 
   let useDefaultAttributePrinterParser = 1;

third_party/intel/include/TritonIntelGPUToLLVM/Passes.td

Lines changed: 0 additions & 3 deletions
@@ -25,9 +25,6 @@ def ConvertTritonIntelGPUToLLVM
        Option<"advancedPath", "advanced_path",
               "bool", /*default*/"false",
               "enable advanced path">,
-       Option<"oneMatrixPerLoadForBT", "one_matrix_per_load_for_bt",
-              "bool", /*default*/"false",
-              "Only load one DPAS operands per load for transposed B matrix">,
        Option<"useTileLoadLinearLayout", "use_tile_load_linear_layout",
               "bool", /*default*/"true",
               "Use linear layouts to generate the tile load sizes and offsets">

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 14 additions & 9 deletions
@@ -977,16 +977,23 @@ struct LoadOpToBlockIOConversion
   LoadOpToBlockIOConversion(
       LLVMTypeConverter &converter, const triton::intel::TargetInfo &targetInfo,
       const triton::intel::ModuleAxisInfoAnalysis &axisAnalysisPass,
-      PatternBenefit benefit, bool oneMatrixPerLoadForBT,
-      bool useTileLoadLinearLayout)
+      PatternBenefit benefit, bool useTileLoadLinearLayout)
       : ConvertTritonGPUOpToLLVMPattern<triton::LoadOp>(converter, benefit),
         BlockIOConversionBase(targetInfo, axisAnalysisPass),
-        oneMatrixPerLoadForBT(oneMatrixPerLoadForBT),
         useTileLoadLinearLayout(useTileLoadLinearLayout) {}
 
   LogicalResult
   rewriteTensorPointerLoad(triton::LoadOp op, OpAdaptor adaptor,
                            ConversionPatternRewriter &rewriter) const {
+    // FIXME: Remove once IGC can split large 2D block loads.
+    std::optional<bool> oneMatrixPerLoadForBT =
+        mlir::triton::tools::isEnvValueBool(mlir::triton::tools::getStrEnv(
+            "TRITON_INTEL_ONE_MATRIX_PER_LOAD_BT"));
+    if (!oneMatrixPerLoadForBT.has_value())
+      oneMatrixPerLoadForBT =
+          op->hasAttr(triton::gpu::intel::TritonIntelGPUDialect::
+                          getOneMatrixPerLoadAttrName());
+
     Value ptr = op.getPtr();
     assert(isTensorPointerType(ptr.getType()) &&
            "Expecting tensor pointer type");
@@ -1342,7 +1349,7 @@ struct LoadOpToBlockIOConversion
       if (!usePackedType)
         return failure();
 
-      if (oneMatrixPerLoadForBT) {
+      if (*oneMatrixPerLoadForBT) {
         // Only load 1 operand per inst on row.
         numOperandsPer2DLoadM = 1;
         tileHeight = elemsPerDPASInst[threadOrder[rank - 2]];
@@ -1391,7 +1398,7 @@ struct LoadOpToBlockIOConversion
       tileLayout *= LinearLayout::identity1D(numOperandsOuterDimPerLoad,
                                              kIteration, dimOuterStr);
       tileLayout *=
-          LinearLayout::identity1D(isTransposeRequired && oneMatrixPerLoadForBT
+          LinearLayout::identity1D(isTransposeRequired && *oneMatrixPerLoadForBT
                                        ? 1
                                        : numOperandsInnerDimPerLoad,
                                    kIteration, dimInnerStr);
@@ -2466,7 +2473,6 @@ struct LoadOpToBlockIOConversion
   }
 
 private:
-  bool oneMatrixPerLoadForBT;
   bool useTileLoadLinearLayout;
 };
 
@@ -3498,15 +3504,14 @@ void mlir::triton::intel::populateLoadStoreOpToLLVMPatterns(
     LLVMTypeConverter &typeConverter, const TargetInfo &targetInfo,
     RewritePatternSet &patterns,
     const intel::ModuleAxisInfoAnalysis &axisInfoAnalysis,
-    PatternBenefit benefit, bool oneMatrixPerLoadForBT,
-    bool useTileLoadLinearLayout) {
+    PatternBenefit benefit, bool useTileLoadLinearLayout) {
   patterns.add<AtomicCASOpConversion, AtomicRMWOpConversion, LoadOpConversion,
                StoreOpConversion, PrefetchOpConversion>(
       typeConverter, targetInfo, axisInfoAnalysis, benefit);
   // BlockIO is more efficient than gather load or scatter store.
   patterns.add<LoadOpToBlockIOConversion>(
       typeConverter, targetInfo, axisInfoAnalysis, benefit.getBenefit() + 2,
-      oneMatrixPerLoadForBT, useTileLoadLinearLayout);
+      useTileLoadLinearLayout);
   patterns.add<StoreOpToBlockIOConversion>(
       typeConverter, targetInfo, axisInfoAnalysis, benefit.getBenefit() + 2);
 }

third_party/intel/lib/TritonIntelGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 1 addition & 2 deletions
@@ -77,8 +77,7 @@ void populateFp4ToFpToLLVMPatterns(LLVMTypeConverter &typeConverter,
 void populateLoadStoreOpToLLVMPatterns(
     LLVMTypeConverter &typeConverter, const TargetInfo &targetInfo,
     RewritePatternSet &patterns, const ModuleAxisInfoAnalysis &axisInfoAnalysis,
-    PatternBenefit benefit, bool oneMatrixPerLoadForBT,
-    bool useTileLoadLinearLayout);
+    PatternBenefit benefit, bool useTileLoadLinearLayout);
 
 void populateTensorPtrOpsToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                         RewritePatternSet &patterns,
