Skip to content

Commit 9ef5016

Browse files
dbd64 and Daniel Donenfeld authored
Fix for compiler crash when analyzing loop scalar mask (#314)
It was previously assumed that start and end would be set when analyzing the mask state in a loop. When the mask state is set to scalar, compute the start and end for use in the rest of the analysis. --------- Co-authored-by: Daniel Donenfeld <[email protected]>
1 parent 2b24609 commit 9ef5016

File tree

2 files changed

+62
-11
lines changed

2 files changed

+62
-11
lines changed

lib/Analysis/MaskAnalysis.cpp

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,8 @@ LogicalResult MaskState::minStateScalar(const MaskState &lhsState,
235235
}
236236
} else {
237237
InFlightDiagnostic diag =
238-
emitError(loc) << "Unexpected case where both lhs and rhs are not scalars";
238+
emitError(loc)
239+
<< "Unexpected case where both lhs and rhs are not scalars";
239240
return failure();
240241
}
241242
return success();
@@ -329,7 +330,7 @@ LogicalResult MaskState::parseAnd(arith::AndIOp andOp, const Location loc,
329330
if (failed(rhsState.parse(andOp.getRhs(), loc, builder)))
330331
return failure();
331332

332-
if(!lhsState.isMask() || !rhsState.isMask()) {
333+
if (!lhsState.isMask() || !rhsState.isMask()) {
333334
return this->minStateScalar(lhsState, rhsState, loc, builder);
334335
}
335336
return this->minStates(lhsState, rhsState, loc, builder);
@@ -363,8 +364,8 @@ LogicalResult MaskState::parseCmp(arith::CmpIOp cmpOp, const Location loc,
363364
// We only support sge against 0 for lower bounds. Dims already has an
364365
// implicit assumption that the lower bound is 0, so if we see this, assume
365366
// the comparison evaluates to true.
366-
if (cmpOp.getPredicate() == arith::CmpIPredicate::sge
367-
&& !(rhsState.scalar && hasConstZero(rhsState.scalar))) {
367+
if (cmpOp.getPredicate() == arith::CmpIPredicate::sge &&
368+
!(rhsState.scalar && hasConstZero(rhsState.scalar))) {
368369
InFlightDiagnostic diag = emitError(loc)
369370
<< "Unsupported cmpi with rhs not equal to 0";
370371
return failure();
@@ -383,8 +384,11 @@ LogicalResult MaskState::parseCmp(arith::CmpIOp cmpOp, const Location loc,
383384
cmpDim = i;
384385
}
385386
}
386-
assert(cmpDim != -1 &&
387-
"Unexpected case where no dimension has size larger than 1");
387+
assert(
388+
cmpDim != -1 ||
389+
(!lhsState.scalar && cmpOp.getPredicate() == arith::CmpIPredicate::slt ||
390+
cmpOp.getPredicate() == arith::CmpIPredicate::ult) &&
391+
"Unexpected case where no dimension has size larger than 1");
388392

389393
OpFoldResult newDim;
390394
if (lhsState.scalar) {
@@ -397,10 +401,10 @@ LogicalResult MaskState::parseCmp(arith::CmpIOp cmpOp, const Location loc,
397401
// should be loaded/stored by inserting a comparison + select:
398402
// dim = lhs < rhs ? lhs.dim : 0
399403
newDim = compareOFRs(lhsState.scalar, rhsState.scalar, cmpOp.getPredicate(),
400-
lhsState.dims[cmpDim], builder.getIndexAttr(0),
401-
loc, builder);
404+
lhsState.dims[cmpDim], builder.getIndexAttr(0), loc,
405+
builder);
402406
} else if (cmpOp.getPredicate() == arith::CmpIPredicate::slt ||
403-
cmpOp.getPredicate() == arith::CmpIPredicate::ult) {
407+
cmpOp.getPredicate() == arith::CmpIPredicate::ult) {
404408
// Important:
405409
// In the case where the values we are loading are entirely masked off like
406410
// the following:
@@ -418,8 +422,8 @@ LogicalResult MaskState::parseCmp(arith::CmpIOp cmpOp, const Location loc,
418422
newEnd = maxOFRs(newEnd, lhsState.start, loc, builder);
419423
newDim = subOFRs(newEnd, lhsState.start, loc, builder);
420424
} else {
421-
assert(cmpOp.getPredicate() == arith::CmpIPredicate::sge && rhsState.scalar
422-
&& hasConstZero(rhsState.scalar));
425+
assert(cmpOp.getPredicate() == arith::CmpIPredicate::sge &&
426+
rhsState.scalar && hasConstZero(rhsState.scalar));
423427
newDim = lhsState.dims[cmpDim];
424428
}
425429

@@ -507,6 +511,12 @@ LogicalResult MaskState::parseLoopIterArg(Value v, const Location loc,
507511
}
508512
}
509513

514+
if (!lhsState.start && !lhsState.end) {
515+
assert(lhsState.scalar && "MaskState must have a scalar");
516+
lhsState.start = builder.getIndexAttr(0);
517+
lhsState.end = lhsState.scalar;
518+
}
519+
510520
auto dist = subOFRs(lhsState.end, lhsState.start, loc, builder);
511521
this->start = forOp.getRegionIterArg(argIndex + 1);
512522
this->end = addOFRs(this->start, dist, loc, builder);
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// RUN: triton-shared-opt --triton-to-structured --remove-dead-values --canonicalize %s | FileCheck %s
2+
3+
module {
4+
tt.func public @scalar_mask_loop(%arg0: !tt.ptr<f8E4M3FN> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
5+
%cst = arith.constant dense<4.480000e+02> : tensor<1xf32>
6+
%cst_0 = arith.constant dense<-4.480000e+02> : tensor<1xf32>
7+
%c1_i32 = arith.constant 1 : i32
8+
%c0_i32 = arith.constant 0 : i32
9+
%0 = tt.get_program_id x : i32
10+
%1 = tt.get_num_programs x : i32
11+
%2 = arith.addi %arg3, %1 : i32
12+
%3 = arith.subi %2, %c1_i32 : i32
13+
%4 = arith.divsi %3, %1 : i32
14+
%5 = tt.load %arg2 : !tt.ptr<f32>
15+
%6 = tt.splat %0 : i32 -> tensor<1xi32>
16+
%7 = scf.for %arg4 = %c0_i32 to %4 step %c1_i32 iter_args(%arg5 = %6) -> (tensor<1xi32>) : i32 {
17+
%8 = tt.splat %arg3 : i32 -> tensor<1xi32>
18+
%9 = arith.cmpi slt, %arg5, %8 : tensor<1xi32>
19+
%10 = tt.splat %arg1 : !tt.ptr<bf16> -> tensor<1x!tt.ptr<bf16>>
20+
%11 = tt.addptr %10, %arg5 : tensor<1x!tt.ptr<bf16>>, tensor<1xi32>
21+
%12 = tt.load %11, %9 : tensor<1x!tt.ptr<bf16>>
22+
%13 = arith.extf %12 : tensor<1xbf16> to tensor<1xf32>
23+
%14 = tt.splat %5 : f32 -> tensor<1xf32>
24+
%15 = arith.mulf %13, %14 : tensor<1xf32>
25+
%16 = tt.clampf %15, %cst_0, %cst, propagateNan = none : tensor<1xf32>
26+
%17 = tt.fp_to_fp %16, rounding = rtne : tensor<1xf32> -> tensor<1xf8E4M3FN>
27+
%18 = tt.splat %arg0 : !tt.ptr<f8E4M3FN> -> tensor<1x!tt.ptr<f8E4M3FN>>
28+
%19 = tt.addptr %18, %arg5 : tensor<1x!tt.ptr<f8E4M3FN>>, tensor<1xi32>
29+
tt.store %19, %17, %9 : tensor<1x!tt.ptr<f8E4M3FN>>
30+
%20 = tt.splat %1 : i32 -> tensor<1xi32>
31+
%21 = arith.addi %arg5, %20 : tensor<1xi32>
32+
scf.yield %21 : tensor<1xi32>
33+
}
34+
tt.return
35+
}
36+
}
37+
38+
39+
// CHECK: %8 = scf.for %arg4 = %c0_i32 to %6 step %c1_i32 iter_args(%arg5 = %1) -> (index) : i32 {
40+
// CHECK: %9 = tts.make_tptr %arg1 to sizes: [1], strides: [%c0], offsets: [%arg5], shape: [0], order: [] : <bf16> to tensor<1x!tt.ptr<bf16>>
41+
// CHECK: %10 = "tts.load"(%9) <{operandSegmentSizes = array<i32: 1, 0, 0>, static_mask_dims = array<i64: 1>}> : (tensor<1x!tt.ptr<bf16>>) -> tensor<1xbf16>

0 commit comments

Comments
 (0)