Commit eaeb554
[MaskAnalysis] Fix implementation of scalar conjunction in mask analysis. (#318)

The original implementation of conjunction with a scalar mistakenly assumed that the scalar argument would supply its own dimension as the mask dimension. This does not work because a scalar argument is a binary condition, not a dimension bound. To support scalar arguments, we must either keep the dimensions of the mask argument of the conjunction or zero the mask out completely. This PR changes code generation to emit select statements that implement this pattern.
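To make the intended semantics concrete, here is a minimal Python sketch (an editorial illustration, not code from this PR; the function names old_conjunction/new_conjunction are hypothetical) of the dimension computation before and after the fix, for a mask whose dimensions count the enabled elements:

    # Hypothetical model of the mask-dimension math for `scalar & mask`.
    def old_conjunction(scalar_dim, mask_dims):
        # Old behavior: treat the scalar as if it carried a dimension and
        # take an elementwise min -- wrong, since a scalar condition is
        # all-or-nothing, not an extent along an axis.
        return [min(scalar_dim, d) for d in mask_dims]

    def new_conjunction(scalar_cond, mask_dims):
        # New behavior: a per-dimension select -- keep the mask's extent
        # when the scalar condition holds, otherwise zero out the access.
        return [d if scalar_cond else 0 for d in mask_dims]

    assert new_conjunction(True, [4]) == [4]   # scalar true: mask wins
    assert new_conjunction(False, [4]) == [0]  # scalar false: load nothing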
1 parent 38b93c1 commit eaeb554

File tree: 5 files changed, +126 -53 lines

include/triton-shared/Analysis/OpFoldResultUtils.h

Lines changed: 10 additions & 4 deletions

@@ -8,9 +8,9 @@
 #ifndef TRITON_ANALYSIS_OPFOLDRESULT_UTILS_H
 #define TRITON_ANALYSIS_OPFOLDRESULT_UTILS_H

+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/IR/Location.h"
 #include "mlir/IR/OpDefinition.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"

 #include <optional>

@@ -55,17 +55,23 @@ OpFoldResult subOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
 // result is an Integer Attribtue. Otherwise, insert the arith.muli
 // instruction if needed and use its result Value.
 OpFoldResult mulOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
-                      const Location loc, OpBuilder &b);
+                     const Location loc, OpBuilder &b);

 OpFoldResult minOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
                      const Location loc, OpBuilder &b);

 OpFoldResult maxOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
                      const Location loc, OpBuilder &b);

+OpFoldResult selectOFRs(const OpFoldResult cond, const OpFoldResult trueOFR,
+                        const OpFoldResult falseOFR, const Location loc,
+                        OpBuilder &b);
+
 OpFoldResult compareOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
-                         const arith::CmpIPredicate pred, const OpFoldResult trueVal,
-                         const OpFoldResult falseVal, const Location loc, OpBuilder &b);
+                         const arith::CmpIPredicate pred,
+                         const OpFoldResult trueVal,
+                         const OpFoldResult falseVal, const Location loc,
+                         OpBuilder &b);
 } // namespace mlir

 #endif
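Editorial note: the new selectOFRs helper follows the same OpFoldResult-in, OpFoldResult-out convention as minOFRs and maxOFRs above; as its implementation in OpFoldResultUtils.cpp below shows, it materializes its operands as values and emits an arith.select rather than attempting constant folding.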

lib/Analysis/MaskAnalysis.cpp

Lines changed: 30 additions & 14 deletions

@@ -219,26 +219,42 @@ LogicalResult MaskState::addStates(const MaskState &lhsState,
 LogicalResult MaskState::minStateScalar(const MaskState &lhsState,
                                         const MaskState &rhsState, Location loc,
                                         OpBuilder &builder) {
+  // Conjunction where both sides are scalar should not be done after splats. We
+  // should ensure that code generation pushes the splat as late as possible.
   if (lhsState.scalar && rhsState.scalar) {
-    dims.push_back(minOFRs(lhsState.dims[0], rhsState.dims[0], loc, builder));
-  } else if (lhsState.scalar) {
-    for (uint32_t i = 0; i < rhsState.getRank(); i++) {
-      auto lhsDim = lhsState.dims[0];
-      auto rhsDim = rhsState.dims[i];
-      dims.push_back(minOFRs(lhsDim, rhsDim, loc, builder));
-    }
-  } else if (rhsState.scalar) {
-    for (uint32_t i = 0; i < lhsState.getRank(); i++) {
-      auto lhsDim = lhsState.dims[i];
-      auto rhsDim = rhsState.dims[0];
-      dims.push_back(minOFRs(lhsDim, rhsDim, loc, builder));
-    }
-  } else {
+    InFlightDiagnostic diag =
+        emitError(loc) << "Unexpected case where both lhs and rhs are scalars";
+    return failure();
+  }
+
+  // Caller should ensure that at least one side is scalar.
+  if (!lhsState.scalar && !rhsState.scalar) {
     InFlightDiagnostic diag =
         emitError(loc)
        << "Unexpected case where both lhs and rhs are not scalars";
     return failure();
   }
+
+  // If we see a scalar condition in a conjunction with a mask, this means we
+  // are either going to take the mask dimension or take nothing at all. To do
+  // that we use a select on the scalar value with the mask dimension in the
+  // true case and zero in the false case.
+  //
+  // Example:
+  //   def kernel(..., index: i32, ...):
+  //     ...
+  //     offs = tl.arange(0, 8)
+  //     mask = offs < 4
+  //     scalar = index < 4
+  //     ... = tl.load(some_ptr, mask=scalar & mask, other=0)
+  auto &scalarState = lhsState.scalar ? lhsState : rhsState;
+  auto &nonScalarState = lhsState.scalar ? rhsState : lhsState;
+  for (uint32_t i = 0; i < nonScalarState.getRank(); i++) {
+    auto nonScalarDim = nonScalarState.dims[i];
+    dims.push_back(selectOFRs(scalarState.scalar, nonScalarDim,
+                              builder.getZeroAttr(builder.getIndexType()), loc,
+                              builder));
+  }
   return success();
 }
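For reference, the control flow of the rewritten minStateScalar can be modeled in a few lines of Python (an editorial sketch of the logic above, not the pass itself; min_state_scalar is a hypothetical stand-in and None marks a non-scalar side):

    def min_state_scalar(lhs_scalar, lhs_dims, rhs_scalar, rhs_dims):
        # Mirrors the two emitError paths: exactly one side may be scalar.
        if lhs_scalar is not None and rhs_scalar is not None:
            raise ValueError("both lhs and rhs are scalars")
        if lhs_scalar is None and rhs_scalar is None:
            raise ValueError("both lhs and rhs are not scalars")
        scalar = lhs_scalar if lhs_scalar is not None else rhs_scalar
        dims = rhs_dims if lhs_scalar is not None else lhs_dims
        # selectOFRs(scalar, dim, 0): keep each mask dim, or take nothing.
        return [d if scalar else 0 for d in dims]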

lib/Analysis/OpFoldResultUtils.cpp

Lines changed: 54 additions & 30 deletions

@@ -104,7 +104,8 @@ OpFoldResult expandOFRIndex(OpFoldResult ofr, OpFoldResult targetForTy,

   Value v = dyn_cast<Value>(ofr);
   if (!v)
-    v = b.create<arith::ConstantOp>(loc, cast<IntegerAttr>(cast<Attribute>(ofr)));
+    v = b.create<arith::ConstantOp>(loc,
+                                    cast<IntegerAttr>(cast<Attribute>(ofr)));

   Type ty = v.getType();
   if (targetTy == ty)
@@ -126,7 +127,8 @@ OpFoldResult expandOFRIndex(OpFoldResult ofr, OpFoldResult targetForTy,
     // This path is for case like:
     //   input_ptr + (row_indices[:, None] + row_offsets[:,None] % mod_offset) *
     //   stride_m + col_offsets[None, :] * stride_n
-    // The modulo will be in shape of [ROW_SIZE, 1] while row_indices is in shape of [ROW_SIZE,].
+    // The modulo will be in shape of [ROW_SIZE, 1] while row_indices is in
+    // shape of [ROW_SIZE,].
     LLVM_DEBUG({
       llvm::dbgs() << "Reshaping ";
       shapedTy.dump();
@@ -135,14 +137,15 @@ OpFoldResult expandOFRIndex(OpFoldResult ofr, OpFoldResult targetForTy,
     });
     SmallVector<Value> shapeValues;
     for (auto dim : targetShapedTy.getShape()) {
-      shapeValues.push_back(b.create<arith::ConstantOp>(
-          loc, b.getIndexAttr(dim)));
+      shapeValues.push_back(
+          b.create<arith::ConstantOp>(loc, b.getIndexAttr(dim)));
     }
     RankedTensorType targetShapeTensorTy = RankedTensorType::get(
         targetShapedTy.getShape().size(), b.getIndexType());
     auto shapeTensor = b.create<tensor::FromElementsOp>(
         loc, targetShapeTensorTy, shapeValues);
-    return b.create<triton::ReshapeOp>(loc, targetTy, v, shapeTensor).getResult();
+    return b.create<triton::ReshapeOp>(loc, targetTy, v, shapeTensor)
+        .getResult();
   }
   if (isa<IndexType>(targetEltTy) || isa<IndexType>(eltTy)) {
     assert((isa<IntegerType>(targetEltTy) || isa<IntegerType>(eltTy)) &&
@@ -228,7 +231,7 @@ OpFoldResult subOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
 }

 OpFoldResult mulOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
-                      const Location loc, OpBuilder &b) {
+                     const Location loc, OpBuilder &b) {
   auto lhsIntAttr = getIntAttr(lhs);
   auto rhsIntAttr = getIntAttr(rhs);

@@ -336,44 +339,65 @@ OpFoldResult maxOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
   return maxOp.getResult();
 }

+OpFoldResult selectOFRs(const OpFoldResult condOFR, const OpFoldResult trueOFR,
+                        const OpFoldResult falseOFR, const Location loc,
+                        OpBuilder &b) {
+  auto trueValue = ofrToIndexValue(trueOFR, loc, b);
+  auto falseValue = ofrToIndexValue(falseOFR, loc, b);
+  auto condValue = ofrToIndexValue(condOFR, loc, b);
+
+  // Ideally we should not be passing around everything as index type since mask
+  // analysis can come across i1 values, but that improvement is being left for
+  // future work. For now we just unwrap an index back into it's i1 value if
+  // necessary.
+  if (!condValue.getType().isInteger(1)) {
+    assert(condValue.getDefiningOp<arith::IndexCastOp>());
+    condValue = condValue.getDefiningOp<arith::IndexCastOp>().getOperand();
+    assert(condValue.getType().isInteger(1));
+  }
+
+  auto selectOp =
+      b.create<arith::SelectOp>(loc, condValue, trueValue, falseValue);
+  return selectOp.getResult();
+}
+
 OpFoldResult compareOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
-                         const arith::CmpIPredicate pred, const OpFoldResult trueOFR,
-                         const OpFoldResult falseOFR, const Location loc, OpBuilder &b) {
+                         const arith::CmpIPredicate pred,
+                         const OpFoldResult trueOFR,
+                         const OpFoldResult falseOFR, const Location loc,
+                         OpBuilder &b) {
   auto lhsIntAttr = getIntAttr(lhs);
   auto rhsIntAttr = getIntAttr(rhs);

   // both lhs and rhs are constants, return the result directly
   if (lhsIntAttr && rhsIntAttr) {
     switch (pred) {
-      case arith::CmpIPredicate::eq:
-        return *lhsIntAttr == *rhsIntAttr ? trueOFR : falseOFR;
-      case arith::CmpIPredicate::ne:
-        return *lhsIntAttr != *rhsIntAttr ? trueOFR : falseOFR;
-      case arith::CmpIPredicate::slt:
-      case arith::CmpIPredicate::ult:
-        return *lhsIntAttr < *rhsIntAttr ? trueOFR : falseOFR;
-      case arith::CmpIPredicate::sle:
-      case arith::CmpIPredicate::ule:
-        return *lhsIntAttr <= *rhsIntAttr ? trueOFR : falseOFR;
-      case arith::CmpIPredicate::sgt:
-      case arith::CmpIPredicate::ugt:
-        return *lhsIntAttr > *rhsIntAttr ? trueOFR : falseOFR;
-      case arith::CmpIPredicate::sge:
-      case arith::CmpIPredicate::uge:
-        return *lhsIntAttr >= *rhsIntAttr ? trueOFR : falseOFR;
-      default:
-        llvm_unreachable("Unsupported predicate");
+    case arith::CmpIPredicate::eq:
+      return *lhsIntAttr == *rhsIntAttr ? trueOFR : falseOFR;
+    case arith::CmpIPredicate::ne:
+      return *lhsIntAttr != *rhsIntAttr ? trueOFR : falseOFR;
+    case arith::CmpIPredicate::slt:
+    case arith::CmpIPredicate::ult:
+      return *lhsIntAttr < *rhsIntAttr ? trueOFR : falseOFR;
+    case arith::CmpIPredicate::sle:
+    case arith::CmpIPredicate::ule:
+      return *lhsIntAttr <= *rhsIntAttr ? trueOFR : falseOFR;
+    case arith::CmpIPredicate::sgt:
+    case arith::CmpIPredicate::ugt:
+      return *lhsIntAttr > *rhsIntAttr ? trueOFR : falseOFR;
+    case arith::CmpIPredicate::sge:
+    case arith::CmpIPredicate::uge:
+      return *lhsIntAttr >= *rhsIntAttr ? trueOFR : falseOFR;
+    default:
+      llvm_unreachable("Unsupported predicate");
     }
   }

   auto lhsValue = ofrToIndexValue(lhs, loc, b);
   auto rhsValue = ofrToIndexValue(rhs, loc, b);
-  auto trueValue = ofrToIndexValue(trueOFR, loc, b);
-  auto falseValue = ofrToIndexValue(falseOFR, loc, b);

   auto cmpOp = b.create<arith::CmpIOp>(loc, pred, lhsValue, rhsValue);
-  auto selectOp = b.create<arith::SelectOp>(loc, cmpOp, trueValue, falseValue);
-  return selectOp.getResult();
+  return selectOFRs(cmpOp.getResult(), trueOFR, falseOFR, loc, b);
 }

 } // namespace mlir
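With selectOFRs factored out, compareOFRs now delegates select construction to it instead of building its own arith.select, so the constant-folding fast path is unchanged while the value path shares the i1-unwrapping logic.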

python/examples/test_mask.py

Lines changed: 26 additions & 0 deletions

@@ -37,3 +37,29 @@ def test(in0, out0):
     print(input)
     print(output)
     torch.testing.assert_close(output, torch.tensor([-1, -1, -1, -1, -2, -2, -2, -2], device=device, dtype=torch.int32))
+
+
+def test_mask_with_scalar_in_conjunction(device):
+    if device == 'cpu':
+        triton.runtime.driver.set_active(CPUDriver())
+
+    @triton.jit
+    def kernel(in0, out0, mask, value):
+        offs = tl.arange(0, 8)
+        out_offs = tl.arange(0, 8)
+        a = tl.load(in0 + offs, mask=(value < 5) & (offs < mask), other=-1)
+        tl.store(out0 + out_offs, a)
+
+    # Test scalar mask evaluate to True
+    SIZE = 8
+    input = torch.arange(0, SIZE, device=device, dtype=torch.int32)
+    output = torch.full((SIZE,), -2, device=device, dtype=torch.int32)
+    kernel[(1,)](input, output, 4, 3)
+    torch.testing.assert_close(output, torch.tensor([0, 1, 2, 3, -1, -1, -1, -1], device=device, dtype=torch.int32))
+
+    # Test scalar mask evaluate to False
+    SIZE = 8
+    input = torch.arange(0, SIZE, device=device, dtype=torch.int32)
+    output = torch.full((SIZE,), -2, device=device, dtype=torch.int32)
+    kernel[(1,)](input, output, 4, 8)
+    torch.testing.assert_close(output, torch.tensor([-1, -1, -1, -1, -1, -1, -1, -1], device=device, dtype=torch.int32))
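Both launches pass mask=4, so the tensor-side condition offs < mask always covers the first four lanes; only the scalar operand value changes between the two calls (3 < 5 holds, 8 < 5 does not), exercising both arms of the generated select.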

test/Conversion/TritonToStructured/mask_ld_st_scalar_dim.mlir

Lines changed: 6 additions & 5 deletions

@@ -1,4 +1,4 @@
-// RUN: triton-shared-opt --triton-to-structured --remove-dead-values --canonicalize %s | FileCheck %s
+// RUN: triton-shared-opt --triton-to-structured --remove-dead-values --canonicalize --cse %s | FileCheck %s

 module {
   tt.func @mask_ld_st_scalar(
@@ -38,10 +38,11 @@ module {
   }
 }

-// CHECK: %{{.*}} = "tts.load"(%{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 1, 0>, static_mask_dims = array<i64: -9223372036854775808, 1>}> : (tensor<2x1x!tt.ptr<f32>>, index) -> tensor<2x1xf32>
-// CHECK: %{{.*}} = "tts.load"(%{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 1, 0>, static_mask_dims = array<i64: -9223372036854775808, 1>}> : (tensor<2x1x!tt.ptr<f32>>, index) -> tensor<2x1xf32>
-// CHECK: "tts.store"(%{{.*}}, %{{.*}}, %{{.*}}) <{static_mask_dims = array<i64: -9223372036854775808, 1>}> : (tensor<2x1x!tt.ptr<f32>>, tensor<2x1xf32>, index) -> ()
-// CHECK: "tts.store"(%{{.*}}, %{{.*}}, %{{.*}}) <{static_mask_dims = array<i64: -9223372036854775808, 1>}> : (tensor<2x1x!tt.ptr<f32>>, tensor<2x1xf32>, index) -> ()
+// CHECK: %{{.*}} = "tts.load"(%{{.*}}, %{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 2, 0>, static_mask_dims = array<i64: -9223372036854775808, -9223372036854775808>}> : (tensor<2x1x!tt.ptr<f32>>, index, index) -> tensor<2x1xf32>
+// CHECK: %{{.*}} = "tts.load"(%{{.*}}, %{{.*}}, %{{.*}}) <{operandSegmentSizes = array<i32: 1, 2, 0>, static_mask_dims = array<i64: -9223372036854775808, -9223372036854775808>}> : (tensor<2x1x!tt.ptr<f32>>, index, index) -> tensor<2x1xf32>
+// CHECK: "tts.store"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) <{static_mask_dims = array<i64: -9223372036854775808, -9223372036854775808>}> : (tensor<2x1x!tt.ptr<f32>>, tensor<2x1xf32>, index, index) -> ()
+// CHECK: "tts.store"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) <{static_mask_dims = array<i64: -9223372036854775808, -9223372036854775808>}> : (tensor<2x1x!tt.ptr<f32>>, tensor<2x1xf32>, index, index) -> ()
+

 // Original Triton Function:
 // def test_masked_ld_st(
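The updated CHECK lines reflect the new lowering: -9223372036854775808 is MLIR's ShapedType::kDynamic sentinel, so both mask dimensions are now dynamic, and the loads carry two mask index operands (operandSegmentSizes = array<i32: 1, 2, 0>) because the select-derived dimension is a runtime value rather than the static 1.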
