Fix incorrect codegen for masks dependent on loop induction variable (#310)

nhat-nguyen · web-flow · commit 427d7749dff4 · 2025-08-01T11:43:53.000-04:00
Currently, for a mask dependent on a loop induction variable, we compute
the mask offset by adding the mask offset *before* coming into the loop to
the loop iter-arg. This is not correct when the offset has an initial
value other than 0 because then the value of the offset will always be
one iteration *after* the current iteration. This patch fixes the
codegen and adds tests for these scenarios.
diff --git a/lib/Analysis/MaskAnalysis.cpp b/lib/Analysis/MaskAnalysis.cpp
@@ -8,6 +8,7 @@
 #include "triton-shared/Analysis/MaskAnalysis.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/Support/LogicalResult.h"
 
 #include "triton-shared/Analysis/OpFoldResultUtils.h"
@@ -452,32 +453,65 @@ LogicalResult MaskState::parseLoopIterArg(Value v, const Location loc,
     return failure();
   }
 
+  // This is a bit of a hack!!
+  //
+  // The offset (MaskState::start) of a mask can now depend on a loop's
+  // iter-arg like the following example:
+  //
+  // idx = offset + tl.arange(0, 4)
+  // for it in range(n):
+  //   mask = idx < size
+  //   x = tl.load(x_ptr + idx, mask=mask)
+  //   tl.store(y_ptr + idx, x, mask=mask)
+  //   idx += 4
+  //
+  // See
+  // test/Conversion/TritonToStructured/mask_loop_iter_arg.mlir
+  // and
+  // python/examples/test_mask_loop_iter_arg.py
+  // for IR and full triton code.
+  //
+  // To support this case, we first make the following assumptions:
+  //  - MaskAnalysis runs after PtrAnalysis's prepass finishes, which means
+  //    the offsets for the load and store pointers have already been set up
+  //    at `argIndex + 1`
+  //  - The tensor of indices used by the load / store and the mask are the same
+  //    (see above where `idx` appears in both the mask and the pointer
+  //    arithmetic). This allows us to use the offset at `argIndex + 1` in the
+  //    above assumption. In the future, to make this more robust, we need to
+  //    verify that the offsets are indeed the same. Or alternatively, make sure
+  //    to generate a separate start and end offset for each mask that is being
+  //    updated in loops.
+  //
+  // Now to generate the mask state in each loop iteration, we first construct
+  // the mask state *before* coming into the loop by parsing the init-arg. A
+  // mask's dimensions stay consistent throughout each loop iteration, but its
+  // starting offset (`MaskState::start`) will change. So to construct the mask
+  // state for each iteration, we need to make MaskState::start be the offset
+  // iter-arg at `argIndex + 1`. Now for `MaskState::end`, we can first compute
+  // the distance between `start` and `end` before coming into the loop, then
+  // use this distance to compute the actual `end` in each loop iteration.
   auto argIndex = std::distance(forOp.getRegionIterArgs().begin(), it);
   auto initArg = forOp.getInitArgs()[argIndex];
   if (auto getStateOp = initArg.getDefiningOp<tts::GetStructuredStateOp>()) {
     auto tritonValue = getStateOp->getOperand(0);
     MaskState lhsState;
-    if (failed(lhsState.parse(tritonValue, loc, builder))) {
-      return failure();
-    }
 
-    // This is a bit of a hack!!
-    //
-    // The offsets and dimensions of a MaskState can now depend on a loop's
-    // iter-arg.
-    //
-    // Because the PtrAnalysis's pre-pass already sets up the offsets,
-    // we can create a new MaskState for each loop iteration by adding the
-    // original MaskState with the current iter-arg, which is at `argIndex +
-    // 1`.
-    //
-    // This will not work for nested loop scenarios, which would need a
-    // more robust implementation.
-    if (failed(this->addStateScalar(
-            lhsState, forOp.getRegionIterArgs()[argIndex + 1], loc, builder))) {
-      return failure();
+    {
+      OpBuilder::InsertionGuard guard(builder);
+      // Make sure all ops generated for the mask state are inserted before
+      // the current loop
+      builder.setInsertionPoint(forOp);
+      if (failed(lhsState.parse(tritonValue, loc, builder))) {
+        return failure();
+      }
     }
 
+    auto dist = subOFRs(lhsState.end, lhsState.start, loc, builder);
+    this->start = forOp.getRegionIterArg(argIndex + 1);
+    this->end = addOFRs(this->start, dist, loc, builder);
+    this->dims = lhsState.dims;
+
     return success();
   }
 
diff --git a/python/examples/test_mask_loop_iter_arg.py b/python/examples/test_mask_loop_iter_arg.py
@@ -0,0 +1,69 @@
+import torch
+import triton
+import pytest
+
+import triton.language as tl
+
+@triton.jit
+def mask_loop(
+    y_ptr,
+    x_ptr,
+    scale_ptr,
+    size: torch.int64,
+    BLOCK_SIZE: tl.constexpr,
+):
+    bidx = tl.program_id(0)
+    tidx = tl.arange(0, BLOCK_SIZE)
+
+    grid_stride = tl.num_programs(0) * BLOCK_SIZE
+    iterations = tl.cdiv(size, 4)
+
+    idx = bidx * BLOCK_SIZE + tidx
+    idy = idx + 1
+    for it in range(iterations):
+        mask = idx < size
+        x = tl.load(x_ptr + idx, mask=mask).to(tl.float32)
+        tl.store(y_ptr + idx, x, mask=mask)
+        idx += grid_stride
+
+
+@pytest.mark.parametrize(
+    "b",
+    [
+        1,
+        2,
+        3,
+        8,
+        2048,
+        4096,
+    ],
+)
+@pytest.mark.parametrize(
+    "h",
+    [
+        16,
+        128,
+        1024,
+        5120,
+        7680,
+        8192,
+    ],
+)
+def test_mask_loop(b, h, device):
+    x = torch.randn((b, h), dtype=torch.float32, device=device)
+    y = torch.empty_like(x, dtype=torch.float32, device=device)
+    scale_ones = torch.ones(1, dtype=torch.float32, device=device)
+
+    BLOCK_SIZE = 2
+
+    grid = (2,)
+
+    compiled = mask_loop[grid](
+        y,
+        x,
+        scale_ones,
+        x.numel(),
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+
+    torch.testing.assert_close(x, y)
diff --git a/test/Conversion/TritonToStructured/mask_loop_iter_arg.mlir b/test/Conversion/TritonToStructured/mask_loop_iter_arg.mlir
@@ -0,0 +1,43 @@
+// RUN: triton-shared-opt --triton-to-structured --remove-dead-values --canonicalize %s | FileCheck %s
+
+module {
+  tt.func public @mask_loop(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %c3_i32 = arith.constant 3 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c2_i32 = arith.constant 2 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32>
+    %2 = tt.get_num_programs x : i32
+    %3 = arith.muli %2, %c2_i32 : i32
+    %4 = arith.addi %arg3, %c3_i32 : i32
+    %5 = arith.divsi %4, %c4_i32 : i32
+    %6 = arith.muli %0, %c2_i32 : i32
+    %7 = tt.splat %6 : i32 -> tensor<2xi32>
+    %8 = arith.addi %7, %1 : tensor<2xi32>
+    %9 = tt.splat %arg3 : i32 -> tensor<2xi32>
+    %10 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<2x!tt.ptr<f32>>
+    %11 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<2x!tt.ptr<f32>>
+    %12 = tt.splat %3 : i32 -> tensor<2xi32>
+    %13 = scf.for %arg4 = %c0_i32 to %5 step %c1_i32 iter_args(%arg5 = %8) -> (tensor<2xi32>)  : i32 {
+      %14 = arith.cmpi slt, %arg5, %9 : tensor<2xi32>
+      %15 = tt.addptr %10, %arg5 : tensor<2x!tt.ptr<f32>>, tensor<2xi32>
+      %16 = tt.load %15, %14 : tensor<2x!tt.ptr<f32>>
+      %17 = tt.addptr %11, %arg5 : tensor<2x!tt.ptr<f32>>, tensor<2xi32>
+      tt.store %17, %16, %14 : tensor<2x!tt.ptr<f32>>
+      %18 = arith.addi %arg5, %12 : tensor<2xi32>
+      scf.yield %18 : tensor<2xi32>
+    }
+    tt.return
+  }
+}
+
+// CHECK: %8 = scf.for %arg4 = %c0_i32 to %5 step %c1_i32 iter_args(%arg5 = %7) -> (index)  : i32 {
+// CHECK:     %9 = tts.make_tptr %arg1
+// CHECK:     %10 = arith.addi %arg5, %c2 : index
+// CHECK:     %11 = arith.index_cast %arg3 : i32 to index
+// CHECK:     %12 = arith.minsi %10, %11 : index
+// CHECK:     %13 = arith.maxsi %12, %arg5 : index
+// CHECK:     %14 = arith.subi %13, %arg5 : index
+// CHECK:     %15 = "tts.load"(%9, %14)