[TritonGPU] LICM outer loop before flattening (triton-lang#6010)

Mogball · loislo · commit 44a64f07205b · 2025-03-04T16:28:38.000+01:00
Ops in the prologue/epilogue can't be hoisted by LICM after the loop is
flattened, so run LICM on the outer loop before flattening. We still don't
want to run LICM on the inner loop because it can significantly increase
live ranges.
diff --git a/lib/Dialect/TritonGPU/Transforms/FuseNestedLoops.cpp b/lib/Dialect/TritonGPU/Transforms/FuseNestedLoops.cpp
@@ -2,6 +2,7 @@
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
@@ -1053,6 +1054,7 @@ static LogicalResult preprocessLoopNest(const LoopNest &nest,
   scf::ForOp &outerLoop = nest.root->loop;
   scf::ForOp &innerLoop = nest.root->children.front()->loop;
 
+  moveLoopInvariantCode(outerLoop);
   optimizeEpilogueDependencies(outerLoop, innerLoop, domInfo);
   return speculateInnerLoopLength(outerLoop, innerLoop, domInfo);
 }