[TritonGPU] Add Loop Aware CSE pass (#6809)

Mogball · web-flow · commit 2c57e205474d · 2025-05-15T10:54:32.000-07:00
I thought LLVM would be able to do this, but apparently not. This
recursively analyzes the computations of loop iteration arguments to
check if two iter args always have the same value and replaces one with
the other.

This reduces the register usage of pipelined/warp specialized loops by
crushing the number of phase and index arguments. Sometimes up to 10
registers can be saved by this, which can be significant in
high-pressure areas.

This replaces uses of the normal CSE pass in make_ttgir and adds
canonicalize+loop_aware_cse after the pipeliner.

TODO:
- [x] write tests
- [x] manually check GB200 pytests
diff --git a/include/triton/Dialect/Triton/Transforms/Passes.td b/include/triton/Dialect/Triton/Transforms/Passes.td
@@ -79,4 +79,15 @@ def TritonLoopInvariantCodeMotion : Pass</*cli-arg*/"triton-licm", /*Op*/"mlir::
   let dependentDialects = ["mlir::triton::TritonDialect"];
 }
 
+// Declares the pass exposed on the command line as `triton-loop-aware-cse`,
+// anchored on `mlir::ModuleOp`. The `summary` and `description` strings below
+// are the user-facing documentation of the pass.
+def TritonLoopAwareCSE : Pass<"triton-loop-aware-cse", "mlir::ModuleOp"> {
+  let summary = "CSE within loop bodies";
+
+  let description = [{
+    The `triton-loop-aware-cse` pass performs recursive common subexpression
+    elimination within loop bodies. Unlike regular CSE, which is a single-pass
+    greedy algorithm, this pass can recursively eliminate loop iteration
+    arguments and subcomputations that always have the same value.
+  }];
+}
+
 #endif
diff --git a/lib/Dialect/Triton/Transforms/CMakeLists.txt b/lib/Dialect/Triton/Transforms/CMakeLists.txt
@@ -4,6 +4,7 @@ add_public_tablegen_target(TritonCombineIncGen)
 
 add_triton_library(TritonTransforms
   Combine.cpp
+  LoopAwareCSE.cpp
   LoopInvariantCodeMotion.cpp
   LoopUnroll.cpp
   ReorderBroadcast.cpp
diff --git a/lib/Dialect/Triton/Transforms/LoopAwareCSE.cpp b/lib/Dialect/Triton/Transforms/LoopAwareCSE.cpp
@@ -0,0 +1,181 @@
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/CSE.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+
+using namespace mlir;
+
+namespace mlir::triton {
+#define GEN_PASS_DEF_TRITONLOOPAWARECSE
+#include "triton/Dialect/Triton/Transforms/Passes.h.inc"
+} // namespace mlir::triton
+
+namespace {
+/// Symmetric cache that records, for unordered pairs of values, whether the
+/// two values have been determined to be equal.
+class ValueEquivalence {
+public:
+  /// Returns the answer recorded for the pair {a, b}, or `std::nullopt` when
+  /// nothing has been recorded for that pair.
+  std::optional<bool> getKnownEquivalence(Value a, Value b) {
+    auto entry = equalValues.find(normalizeKey(a, b));
+    if (entry == equalValues.end())
+      return std::nullopt;
+    return entry->second;
+  }
+
+  /// Records `eq` for the pair {a, b}, replacing any earlier answer.
+  void setKnownEquivalence(Value a, Value b, bool eq) {
+    equalValues[normalizeKey(a, b)] = eq;
+  }
+
+private:
+  /// Orders the two values by their opaque pointer so that {a, b} and {b, a}
+  /// produce the same map key.
+  std::pair<Value, Value> normalizeKey(Value a, Value b) {
+    auto aAddr = reinterpret_cast<uintptr_t>(a.getAsOpaquePointer());
+    auto bAddr = reinterpret_cast<uintptr_t>(b.getAsOpaquePointer());
+    if (aAddr < bAddr)
+      return {a, b};
+    return {b, a};
+  }
+
+  DenseMap<std::pair<Value, Value>, bool> equalValues;
+};
+
+/// Decides, for a single `scf.for`, whether two values are guaranteed to hold
+/// the same value on every iteration of the loop.
+///
+/// Iter args are compared coinductively: while the values yielded for a pair of
+/// iter args are being compared, the pair itself is assumed to be equal. A
+/// positive answer computed under such an assumption is only meaningful if the
+/// assumption ends up holding, so positive answers are never memoized;
+/// otherwise a later query could observe a stale "equal" left behind by a
+/// query that ultimately failed. Negative answers always stem from a concrete
+/// mismatch, never from an assumption, and are therefore safe to cache.
+struct LoopCSEDriver {
+  LoopCSEDriver(scf::ForOp loop) : loop(loop) {}
+
+  /// Returns true if the iter args with zero-based indices `i` and `j` can be
+  /// shown to always carry the same value.
+  bool areIterArgsEqual(int i, int j);
+  /// Returns true if `a` and `b` can be shown to always carry the same value
+  /// inside the body of `loop`.
+  bool areEqualInLoop(Value a, Value b);
+
+  scf::ForOp loop;
+  // Only ever stores `false`: pairs for which a concrete mismatch was found.
+  ValueEquivalence equalValues;
+  // Iter arg index pairs (smaller index first) that are currently assumed to
+  // be equal because their yielded values are being compared further up the
+  // call stack. This is what breaks the recursion through loop-carried values.
+  SmallVector<std::pair<int, int>> argStack;
+};
+} // namespace
+
+bool LoopCSEDriver::areIterArgsEqual(int i, int j) {
+  if (i == j)
+    return true;
+  // Iter args that start from different values cannot always be equal.
+  if (loop.getInitArgs()[i] != loop.getInitArgs()[j])
+    return false;
+
+  // Equality is symmetric, so track the pair independent of argument order.
+  std::pair<int, int> key = i < j ? std::make_pair(i, j) : std::make_pair(j, i);
+  // The pair is already being compared further up the stack: assume it holds.
+  if (llvm::is_contained(argStack, key))
+    return true;
+
+  BlockArgument aArg = loop.getRegionIterArg(i);
+  BlockArgument bArg = loop.getRegionIterArg(j);
+  if (std::optional<bool> known = equalValues.getKnownEquivalence(aArg, bArg);
+      known && !*known)
+    return false;
+
+  // Compare the yielded values under the assumption that the args are equal.
+  argStack.push_back(key);
+  bool result =
+      areEqualInLoop(loop.getYieldedValues()[i], loop.getYieldedValues()[j]);
+  argStack.pop_back();
+
+  // A mismatch does not depend on any assumption, so it can be remembered. A
+  // match may depend on assumptions that are still pending and is not cached.
+  if (!result)
+    equalValues.setKnownEquivalence(aArg, bArg, false);
+  return result;
+}
+
+bool LoopCSEDriver::areEqualInLoop(Value a, Value b) {
+  // Identical SSA values are trivially equal.
+  if (a == b)
+    return true;
+  if (a.getType() != b.getType())
+    return false;
+
+  // Distinct values are only analyzed when both are defined directly in the
+  // loop body; anything else is equal only if it is the same SSA value, which
+  // was handled above.
+  if (a.getParentBlock() != loop.getBody() ||
+      b.getParentBlock() != loop.getBody())
+    return false;
+  // Both must be block arguments or both must be operation results.
+  if (isa<BlockArgument>(a) != isa<BlockArgument>(b))
+    return false;
+  // The induction variable is only equal to itself.
+  if (a == loop.getInductionVar() || b == loop.getInductionVar())
+    return false;
+
+  if (auto aArg = dyn_cast<BlockArgument>(a)) {
+    auto bArg = cast<BlockArgument>(b);
+    // Block argument 0 was excluded above as the induction variable, so
+    // subtracting one yields the iter arg index.
+    return areIterArgsEqual(aArg.getArgNumber() - 1, bArg.getArgNumber() - 1);
+  }
+
+  // Operation equivalence says nothing about *which* result is meant, so
+  // different results of the same or of equivalent operations must not be
+  // treated as equal.
+  if (cast<OpResult>(a).getResultNumber() !=
+      cast<OpResult>(b).getResultNumber())
+    return false;
+
+  if (std::optional<bool> known = equalValues.getKnownEquivalence(a, b);
+      known && !*known)
+    return false;
+
+  Operation *aDef = a.getDefiningOp();
+  Operation *bDef = b.getDefiningOp();
+  // For it to be known that the operation results have the same value, they
+  // must be side effect free.
+  if (!isMemoryEffectFree(aDef) || !isMemoryEffectFree(bDef))
+    return false;
+  // Don't bother with operations with regions.
+  if (aDef->getNumRegions() || bDef->getNumRegions())
+    return false;
+
+  // Operands are compared recursively. Results are intentionally not recorded
+  // as equivalent here, since the comparison may rely on pending assumptions.
+  bool result = OperationEquivalence::isEquivalentTo(
+      aDef, bDef,
+      [&](Value lhs, Value rhs) { return success(areEqualInLoop(lhs, rhs)); },
+      [](Value, Value) {}, OperationEquivalence::IgnoreLocations);
+  if (!result)
+    equalValues.setKnownEquivalence(a, b, false);
+  return result;
+}
+
+/// Merges iter args of `loop` that the driver reports as equal: within each
+/// group, the arg with the smallest index replaces all the others.
+static void loopCSE(scf::ForOp loop) {
+  LoopCSEDriver driver(loop);
+  llvm::EquivalenceClasses<int> argClasses;
+  int argCount = loop.getNumRegionIterArgs();
+
+  // Compare every unordered pair of iter args and union the equal ones.
+  for (int lhs = 0; lhs < argCount; ++lhs) {
+    for (int rhs = lhs + 1; rhs < argCount; ++rhs) {
+      if (!driver.areIterArgsEqual(lhs, rhs))
+        continue;
+      argClasses.unionSets(lhs, rhs);
+    }
+  }
+
+  // Visit each class once, through its leader.
+  for (auto classIt = argClasses.begin(), classEnd = argClasses.end();
+       classIt != classEnd; ++classIt) {
+    if (!(*classIt)->isLeader())
+      continue;
+
+    SmallVector<int> members;
+    auto memberIt = argClasses.member_begin(**classIt);
+    while (memberIt != argClasses.member_end()) {
+      members.push_back(*memberIt);
+      ++memberIt;
+    }
+    assert(members.size() > 1);
+    // Sorting makes the choice of the surviving arg deterministic.
+    llvm::sort(members);
+
+    int keptIndex = members.front();
+    BlockArgument keptArg = loop.getRegionIterArg(keptIndex);
+    Value keptResult = loop.getResult(keptIndex);
+    for (int dupIndex : llvm::drop_begin(members)) {
+      BlockArgument dupArg = loop.getRegionIterArg(dupIndex);
+      dupArg.replaceAllUsesWith(keptArg);
+      // Make the duplicate arg yield itself so that it becomes a pass-through
+      // value; the canonicalizer removes it and regular CSE cleans up the
+      // computations that fed it.
+      (*loop.getYieldedValuesMutable())[dupIndex].set(dupArg);
+      loop.getResult(dupIndex).replaceAllUsesWith(keptResult);
+    }
+  }
+}
+
+namespace {
+/// Pass driver: regular CSE, then iter arg merging on every `scf.for`, then
+/// regular CSE again, and finally the `scf.for` canonicalization patterns.
+struct LoopAwareCSE
+    : public triton::impl::TritonLoopAwareCSEBase<LoopAwareCSE> {
+  using TritonLoopAwareCSEBase::TritonLoopAwareCSEBase;
+
+  void runOnOperation() override {
+    auto *ctx = &getContext();
+    auto root = getOperation();
+
+    // The loop analysis only treats values defined outside a loop as equal
+    // when they are the same SSA value, so run CSE up front to make equivalent
+    // outside values pointer equal.
+    IRRewriter cseRewriter(ctx);
+    auto &dominance = getAnalysis<DominanceInfo>();
+    eliminateCommonSubExpressions(cseRewriter, dominance, root);
+
+    // Merge equivalent region iter args of every loop.
+    root.walk(loopCSE);
+
+    // With equivalent iter args now pointer equal, a second CSE run removes
+    // the duplicated computations inside the loop bodies.
+    eliminateCommonSubExpressions(cseRewriter, dominance, root);
+
+    // Let the `scf.for` canonicalization patterns clean up the loops
+    // (short-circuited values, unused results, etc.).
+    RewritePatternSet forPatterns(ctx);
+    scf::ForOp::getCanonicalizationPatterns(forPatterns, ctx);
+    if (failed(applyPatternsGreedily(root, std::move(forPatterns))))
+      signalPassFailure();
+  }
+};
+} // namespace
diff --git a/python/src/passes.cc b/python/src/passes.cc
@@ -44,6 +44,7 @@ void init_triton_passes_ttir(py::module &&m) {
                      createTritonRewriteTensorDescriptorToPointer);
   ADD_PASS_WRAPPER_0("add_loop_unroll", createTritonLoopUnroll);
   ADD_PASS_WRAPPER_0("add_triton_licm", createTritonLoopInvariantCodeMotion);
+  ADD_PASS_WRAPPER_0("add_loop_aware_cse", createTritonLoopAwareCSE);
   ADD_PASS_OPTION_WRAPPER_4("add_convert_to_ttgpuir",
                             createConvertTritonToTritonGPU, const std::string &,
                             int, int, int);
diff --git a/test/Triton/loop_cse.mlir b/test/Triton/loop_cse.mlir
@@ -0,0 +1,47 @@
+// RUN: triton-opt %s -triton-loop-aware-cse -allow-unregistered-dialect | FileCheck %s
+
+// The input loop carries ten i32 iter args that are all initialized from
+// %c0_i32. The expected output loop has three results, whose iter args are
+// captured below as M2_INDEX, M2_PHASE and M1_PHASE.
+// CHECK-LABEL: @loop_buffer_phase_args
+tt.func @loop_buffer_phase_args(%arg0: i32) {
+  %c2_i32 = arith.constant 2 : i32
+  %c128_i32 = arith.constant 128 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  // CHECK: [[LOOP_RES:%.*]]:3 = scf.for {{.*}} iter_args
+  // CHECK-SAME: [[M2_INDEX:%arg[0-9]+]] = %c0_i32
+  // CHECK-SAME: [[M2_PHASE:%arg[0-9]+]] = %c0_i32
+  // CHECK-SAME: [[M1_PHASE:%arg[0-9]+]] = %c0_i32
+  %0:10 = scf.for %arg1 = %c0_i32 to %arg0 step %c128_i32 iter_args(%arg2 = %c0_i32, %arg3 = %c0_i32, %arg4 = %c0_i32, %arg5 = %c0_i32, %arg6 = %c0_i32, %arg7 = %c0_i32, %arg8 = %c0_i32, %arg9 = %c0_i32, %arg10 = %c0_i32, %arg11 = %c0_i32) -> (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)  : i32 {
+    %1 = arith.subi %arg0, %c128_i32 : i32
+    %2 = arith.cmpi slt, %arg1, %1 : i32
+    // CHECK: [[M1_PHASE_INCR:%.*]] = arith.xori [[M1_PHASE]], %c1_i32
+    %3 = arith.xori %arg7, %c1_i32 : i32
+    // CHECK: "index_phase_use"([[M2_INDEX]], [[M2_PHASE]], [[M1_PHASE_INCR]], [[M1_PHASE]])
+    "index_phase_use"(%arg4, %arg5, %3, %arg8) : (i32, i32, i32, i32) -> ()
+    %4 = arith.addi %arg4, %c1_i32 : i32
+    %5 = arith.xori %arg5, %c1_i32 : i32
+    %6 = arith.cmpi eq, %4, %c2_i32 : i32
+    // CHECK: [[M2_INDEX_INCR:%.*]] = arith.select %{{.*}}, %c0_i32
+    // CHECK-NEXT: [[M2_PHASE_INCR:%.*]] = arith.select %{{.*}}, %{{.*}}, [[M2_PHASE]]
+    // CHECK-NOT: arith.select
+    %7 = arith.select %6, %c0_i32, %4 : i32
+    %8 = arith.select %6, %5, %arg5 : i32
+    %9 = arith.xori %arg8, %c1_i32 : i32
+    %10 = arith.xori %arg11, %c1_i32 : i32
+    %11 = arith.xori %arg6, %c1_i32 : i32
+    %12 = arith.addi %arg2, %c1_i32 : i32
+    %13 = arith.xori %arg3, %c1_i32 : i32
+    %14 = arith.cmpi eq, %12, %c2_i32 : i32
+    %15 = arith.select %14, %c0_i32, %12 : i32
+    %16 = arith.select %14, %13, %arg3 : i32
+    // CHECK: "index_phase_use"([[M2_INDEX_INCR]], [[M2_PHASE_INCR]], [[M1_PHASE_INCR]],
+    "index_phase_use"(%15, %16, %11, %2) : (i32, i32, i32, i1) -> ()
+    %17 = arith.xori %arg10, %c1_i32 : i32
+    // CHECK: "index_phase_use"([[M1_PHASE_INCR]], [[M1_PHASE]])
+    "index_phase_use"(%17, %arg11) : (i32, i32) -> ()
+    // NOTE(review): the use below repeats (%17, %arg11); %18 is only consumed
+    // by the scf.yield. Confirm whether (%18, %arg9) was intended.
+    %18 = arith.xori %arg9, %c1_i32 : i32
+    // CHECK: "index_phase_use"([[M1_PHASE_INCR]], [[M1_PHASE]])
+    "index_phase_use"(%17, %arg11) : (i32, i32) -> ()
+    scf.yield %15, %16, %7, %8, %11, %3, %9, %18, %17, %10 : i32, i32, i32, i32, i32, i32, i32, i32, i32, i32
+  }
+  tt.return
+}
diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py
@@ -246,7 +246,7 @@ def make_ttgir(mod, metadata, opt, capability):
         passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, capability >= 80)
         nvidia.passes.ttnvgpuir.add_optimize_descriptor_encoding(pm)
-        passes.common.add_cse(pm)
+        passes.ttir.add_loop_aware_cse(pm)
         if capability // 10 in [8, 9]:
             passes.ttgpuir.add_fuse_nested_loops(pm)
             passes.common.add_canonicalizer(pm)
@@ -265,9 +265,10 @@ def make_ttgir(mod, metadata, opt, capability):
             passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled)
             passes.ttgpuir.add_combine_tensor_select_and_if(pm)
             nvidia.passes.ttnvgpuir.add_remove_tmem_tokens(pm)
-            passes.common.add_canonicalizer(pm)
         else:
             passes.ttir.add_triton_licm(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.ttir.add_loop_aware_cse(pm)
         passes.ttgpuir.add_prefetch(pm)
         passes.ttgpuir.add_WGMMAPrefetch(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, capability >= 80)
@@ -277,7 +278,7 @@ def make_ttgir(mod, metadata, opt, capability):
         nvidia.passes.ttnvgpuir.add_interleave_tmem(pm)
         passes.ttgpuir.add_reduce_data_duplication(pm)
         passes.ttgpuir.add_reorder_instructions(pm)
-        passes.common.add_cse(pm)
+        passes.ttir.add_loop_aware_cse(pm)
         passes.common.add_symbol_dce(pm)
         if capability // 10 >= 9:
             nvidia.passes.ttnvgpuir.add_tma_lowering(pm)