
Commit 2bc0153

aakhundov authored and jbdalido committed
[BACKEND] Fix ProgramPoint passing in AxisInfoAnalysis (triton-lang#5181)
Fixes triton-lang#5122.

The `ProgramPoint` [here](https://github.com/triton-lang/triton/blob/0bd30a2f3192204c5a50d5ffde27ad8493f6c026/lib/Analysis/AxisInfo.cpp#L1087) is created on the stack. Its address is then [passed](https://github.com/triton-lang/triton/blob/0bd30a2f3192204c5a50d5ffde27ad8493f6c026/lib/Analysis/AxisInfo.cpp#L1088-L1089) to the MLIR `SparseAnalysis` code, where it is [added as a dependency](https://github.com/llvm/llvm-project/blob/33ff9e43b4c5bdc3da31c6b11ad51d35a69bec5f/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp#L311) and later [dereferenced](https://github.com/llvm/llvm-project/blob/33ff9e43b4c5bdc3da31c6b11ad51d35a69bec5f/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp#L90). By the time the `ProgramPoint` is dereferenced in `AbstractSparseForwardDataFlowAnalysis::visit`, `AxisInfoAnalysis::visitForOpInductionVar` has already returned and the `ProgramPoint` stack variable has been destroyed. This leads to a segfault (which can be reproduced on the base rev with the lit test added in this PR).

The code modified in this PR was originally added in triton-lang#4927, in conjunction with updating the `llvm-project` hash to `b5cc222d7429`. However, as noted in llvm/llvm-project#110344 (the `llvm-project` PR that made the refactoring prompting the `AxisInfo.cpp` change in triton-lang#4927):

> For dense forward data-flow analysis and other analysis (except dense backward data-flow analysis), the program point corresponding to the original operation can be obtained by `getProgramPointAfter(op)`

As `AxisInfoAnalysis` (in Triton) inherits from `SparseForwardDataFlowAnalysis` (in MLIR), this PR follows the guidance above, which resolves the segfault: the `ProgramPoint` is now stored in the instance-level state of the pass rather than on the stack.

P.S. The lit test added in this PR is not exactly minimal. However, I did my best to minimize it, starting from the 400-line repro TTGIR in the original issue (triton-lang#5122).
1 parent ba5764a commit 2bc0153
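
The lifetime bug described above is easiest to see stripped of the MLIR specifics. Below is a minimal, self-contained C++ sketch of the same pattern; `MiniSolver`, `buggyVisit`, and `fixedVisit` are invented stand-ins for the MLIR dataflow solver and the Triton visitor, not the actual APIs:

```cpp
#include <iostream>
#include <vector>

// MiniSolver stands in for MLIR's DataFlowSolver, which records ProgramPoint
// pointers as dependencies and dereferences them later, long after the visit
// call that supplied them has returned.
struct MiniSolver {
  std::vector<const int *> deps; // stand-in for the stored ProgramPoint*
  int ownedPoint = 0;            // solver-owned storage (the fix)

  void addDependency(const int *point) { deps.push_back(point); }

  // Like getProgramPointAfter(op): returns a pointer into solver-owned state.
  const int *getProgramPointAfter(int value) {
    ownedPoint = value;
    return &ownedPoint;
  }

  void visitLater() {
    for (const int *point : deps)
      std::cout << *point << '\n'; // UB if *point was a dead stack slot
  }
};

// The triton-lang#4927 pattern: the "program point" is a local variable whose
// address escapes into the solver and outlives this stack frame.
void buggyVisit(MiniSolver &solver) {
  int programPoint = 42;               // destroyed when this function returns
  solver.addDependency(&programPoint); // dangling once we return
}

// The triton-lang#5181 pattern: ask the framework for a pointer it owns.
void fixedVisit(MiniSolver &solver) {
  solver.addDependency(solver.getProgramPointAfter(42)); // stays valid
}

int main() {
  MiniSolver solver;
  fixedVisit(solver);  // swap in buggyVisit(solver) and run under ASan to
  solver.visitLater(); // observe the use-after-return it can diagnose
}
```

With `buggyVisit` swapped in, the deferred dereference hits a dead stack slot, which is the same class of failure the lit test reproduced on the base rev.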

File tree

2 files changed: +5 −39 lines changed

lib/Analysis/AxisInfo.cpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -1102,10 +1102,15 @@ LogicalResult AxisInfoAnalysis::visitOperation(
 void AxisInfoAnalysis::visitForOpInductionVar(
     scf::ForOp op, ArrayRef<dataflow::Lattice<AxisInfo> *> argLattices) {
   ProgramPoint *programPoint = getProgramPointAfter(op);
+<<<<<<< HEAD
   const auto &lb =
       getLatticeElementFor(programPoint, op.getLowerBound())->getValue();
   const auto &step =
       getLatticeElementFor(programPoint, op.getStep())->getValue();
+=======
+  auto lb = getLatticeElementFor(programPoint, op.getLowerBound())->getValue();
+  auto step = getLatticeElementFor(programPoint, op.getStep())->getValue();
+>>>>>>> 24c0fe47a ([BACKEND] Fix ProgramPoint passing in AxisInfoAnalysis (#5181))

   AxisInfo::DimVectorT knownContiguity(1, 1);
   AxisInfo::DimVectorT knownDivisibility(1, 1);
```
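
The hunk above lands with unresolved merge-conflict markers; note that both sides of the conflict already pass the framework-owned `programPoint` to `getLatticeElementFor`. For reference, a sketch of the function as the commit title intends it, assuming the incoming `24c0fe47a` side is the one kept and with the rest of the body elided:

```cpp
void AxisInfoAnalysis::visitForOpInductionVar(
    scf::ForOp op, ArrayRef<dataflow::Lattice<AxisInfo> *> argLattices) {
  // getProgramPointAfter(op) returns a ProgramPoint owned by the analysis
  // framework, so the pointer the solver records as a dependency stays valid
  // after this function returns (unlike a stack-allocated ProgramPoint).
  ProgramPoint *programPoint = getProgramPointAfter(op);
  auto lb = getLatticeElementFor(programPoint, op.getLowerBound())->getValue();
  auto step = getLatticeElementFor(programPoint, op.getStep())->getValue();

  AxisInfo::DimVectorT knownContiguity(1, 1);
  AxisInfo::DimVectorT knownDivisibility(1, 1);
  // ... contiguity/divisibility computation continues as before ...
}
```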

test/TritonGPU/coalesce.mlir

Lines changed: 0 additions & 39 deletions
```diff
@@ -160,42 +160,3 @@ module {
     tt.return
   }
 }
-
-// -----
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
-#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
-#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
-#blocked3 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
-
-// CHECK: [[COALESCED_LAYOUT:#.*]] = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}>
-
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
-
-  // CHECK: @coalesce_poison
-  tt.func @coalesce_poison(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: i32, %arg2: i1) {
-    %c0_i32 = arith.constant 0 : i32
-    %c1_i32 = arith.constant 1 : i32
-    %0 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<128x64x!tt.ptr<f16>, #blocked>
-    %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked1>
-    %2 = ttg.convert_layout %1 : tensor<128xi32, #blocked1> -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
-    %3 = tt.expand_dims %2 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<128x1xi32, #blocked2>
-    %4 = ttg.convert_layout %3 : tensor<128x1xi32, #blocked2> -> tensor<128x1xi32, #blocked3>
-    %5 = tt.broadcast %4 {axis = 1 : i32} : tensor<128x1xi32, #blocked3> -> tensor<128x64xi32, #blocked3>
-    %6 = ttg.convert_layout %5 : tensor<128x64xi32, #blocked3> -> tensor<128x64xi32, #blocked>
-    %7 = tt.addptr %0, %6 : tensor<128x64x!tt.ptr<f16>, #blocked>, tensor<128x64xi32, #blocked>
-
-    %8 = ub.poison : tensor<128x64x!tt.ptr<f16>, #blocked>
-    // CHECK: scf.if
-    %9 = scf.if %arg2 -> (tensor<128x64x!tt.ptr<f16>, #blocked>) {
-      scf.yield %8 : tensor<128x64x!tt.ptr<f16>, #blocked>
-    } else {
-      scf.yield %7 : tensor<128x64x!tt.ptr<f16>, #blocked>
-    }
-    // CHECK: [[PTR:%.*]] = ttg.convert_layout %{{.*}} : tensor<128x64x!tt.ptr<f16>, #{{.*}}> -> tensor<128x64x!tt.ptr<f16>, [[COALESCED_LAYOUT]]>
-    // CHECK-NEXT: tt.load [[PTR]]
-    %10 = tt.load %9 : tensor<128x64x!tt.ptr<f16>, #blocked>
-    tt.return
-  }
-
-}
```
