Skip to content

Commit 72e73a0

Browse files
authored
Support mask analysis against 0, and between two scalars (#202)
Support was added for two new scenarios: `arith.cmpi ge %scalar, %c0`: aka offset comparison to the lower bound of 0. Mask analysis already has an implicit assumption that the beginning of a mask starts at 0, so support was added to allow this case through and assumes that this comparison evaluates to true. `arith.cmpi slt, %scalar_1, %scalar_2`: offset comparison between two scalars. E.g.: ``` %11 = tt.expand_dims %offset %cst_4 = arith.constant dense<324> : tensor<16x1xi64> %23 = arith.cmpi slt, %11, %cst_4 : tensor<16x1xi64> ``` This example is notable in that we cannot take the normal approach of computing the minimum of the lhs and rhs as the new dimension (the lhs offset may be 0). To handle this, a ternary operator is inserted to evaluate the comparison at runtime. If it succeeds, we keep the existing dimensions from the lhs, otherwise we assume nothing should be loaded/stored. This change also adds a dump method to both `MaskState` and `PtrState` as a small QOL improvement.
1 parent a7ffd7d commit 72e73a0

File tree

7 files changed

+218
-25
lines changed

7 files changed

+218
-25
lines changed

include/triton-shared/Analysis/MaskAnalysis.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ struct MaskState {
5151
OpFoldResult scalar;
5252
const bool useUnsafeMask;
5353

54+
void dump() const;
55+
5456
MaskState(bool useUnsafeMask = false) : useUnsafeMask(useUnsafeMask) {}
5557

5658
int64_t getRank() const { return dims.size(); }
@@ -118,9 +120,17 @@ struct MaskState {
118120
OpBuilder &builder);
119121

120122
// Operand is the result of cmpi
121-
// Assume only of the dimensions have size > 1. Only support slt for now.
122-
// For that dimension, calculate this new dim as: dim = min(end, value) -
123-
// start
123+
// Assume only one of the dimensions has size > 1. Only support slt/ult, and
124+
// sge against 0 for now. For that dimension, we have three cases:
125+
// 1. Constant comparison with both left and right-hand sides being scalars.
126+
// Calculate this new dim as a compare and select.
127+
// I.e. dim = lhs < rhs ? end : 0
128+
// 2. Left-hand side is not a scalar, and the right-hand side is.
129+
// 2.a. Predicate is slt/ult. Calculate this new dim as:
130+
// dim = max(min(end, value), start) - start
131+
// 2.b. Predicate is sge against 0. Mask analysis already has an
132+
// assumption that the mask starts at 0, so evaluate this to true
133+
// and calculate this new dim as: dim = end
124134
LogicalResult parseCmp(arith::CmpIOp cmpOp, const Location loc,
125135
OpBuilder &builder);
126136
// Operand is the result of make_range

include/triton-shared/Analysis/OpFoldResultUtils.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include "mlir/IR/Location.h"
1212
#include "mlir/IR/OpDefinition.h"
13+
#include "mlir/Dialect/Arith/IR/Arith.h"
1314

1415
#include <optional>
1516

@@ -57,6 +58,10 @@ OpFoldResult minOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
5758

5859
OpFoldResult maxOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
5960
const Location loc, OpBuilder &b);
61+
62+
OpFoldResult compareOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
63+
const arith::CmpIPredicate pred, const OpFoldResult trueVal,
64+
const OpFoldResult falseVal, const Location loc, OpBuilder &b);
6065
} // namespace mlir
6166

6267
#endif

include/triton-shared/AnalysisStructured/PtrAnalysis.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ struct PtrState {
5959

6060
bool isBlockPtr() const;
6161

62+
void dump() const;
63+
6264
// Process addition of two PtrStates.
6365
LogicalResult addState(const PtrState &lhsState, const PtrState &rhsState,
6466
Operation *op, OpBuilder &builder);

lib/Analysis/MaskAnalysis.cpp

Lines changed: 59 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,17 @@ LogicalResult MaskState::parseIntScalar(Value scalar, const Location loc,
265265
return success();
266266
}
267267

268+
void MaskState::dump() const {
269+
llvm::dbgs() << "start: " << start << "\n";
270+
llvm::dbgs() << "end: " << end << "\n";
271+
llvm::dbgs() << "scalar: " << scalar << "\n";
272+
llvm::dbgs() << "useUnsafeMask: " << useUnsafeMask << "\n";
273+
llvm::dbgs() << "dims: ";
274+
for (auto dim : dims)
275+
llvm::dbgs() << "\t" << dim << "\n";
276+
llvm::dbgs() << "\n";
277+
}
278+
268279
LogicalResult MaskState::parseAdd(arith::AddIOp addOp, const Location loc,
269280
OpBuilder &builder) {
270281
assert(this->isEmpty());
@@ -308,7 +319,8 @@ LogicalResult MaskState::parseCmp(arith::CmpIOp cmpOp, const Location loc,
308319
assert(this->isEmpty());
309320

310321
if (cmpOp.getPredicate() != arith::CmpIPredicate::slt &&
311-
cmpOp.getPredicate() != arith::CmpIPredicate::ult) {
322+
cmpOp.getPredicate() != arith::CmpIPredicate::ult &&
323+
cmpOp.getPredicate() != arith::CmpIPredicate::sge) {
312324
InFlightDiagnostic diag = emitError(loc) << "Unsupported cmpi";
313325
return failure();
314326
}
@@ -321,9 +333,17 @@ LogicalResult MaskState::parseCmp(arith::CmpIOp cmpOp, const Location loc,
321333
if (failed(rhsState.parse(cmpOp.getRhs(), loc, builder)))
322334
return failure();
323335

324-
assert((!lhsState.scalar && rhsState.scalar) && "Unsupported cmpi scenario");
336+
// We only support sge against 0 for lower bounds. Dims already has an
337+
// implicit assumption that the lower bound is 0, so if we see this, assume
338+
// the comparison evaluates to true.
339+
if (cmpOp.getPredicate() == arith::CmpIPredicate::sge
340+
&& !(rhsState.scalar && hasConstZero(rhsState.scalar))) {
341+
InFlightDiagnostic diag = emitError(loc)
342+
<< "Unsupported cmpi with rhs not equal to 0";
343+
return failure();
344+
}
325345

326-
int32_t cmpDim = -1;
346+
int32_t cmpDim = lhsState.scalar && rhsState.scalar ? 0 : -1;
327347
for (int32_t i = 0; i < lhsState.getRank(); i++) {
328348
auto dimIntAttr = getIntAttr(lhsState.dims[i]);
329349
if (!dimIntAttr || dimIntAttr.value() != 1) {
@@ -339,22 +359,42 @@ LogicalResult MaskState::parseCmp(arith::CmpIOp cmpOp, const Location loc,
339359
assert(cmpDim != -1 &&
340360
"Unexpected case where no dimension has size larger than 1");
341361

342-
// Important:
343-
// In the case where the values we are loading are entirely masked off like
344-
// the following:
345-
//
346-
// ---|-------|-----------|
347-
// ^ ^ ^
348-
// scalar start end
349-
//
350-
// newEnd = min(end, scalar) = scalar
351-
// Now scalar < start, so simply doing dim = newEnd - start is incorrect.
352-
//
353-
// The correct formula is to optionally move `newDim` back to `start` using
354-
// max(newEnd, start).
355-
auto newEnd = minOFRs(lhsState.end, rhsState.scalar, loc, builder);
356-
newEnd = maxOFRs(newEnd, lhsState.start, loc, builder);
357-
auto newDim = subOFRs(newEnd, lhsState.start, loc, builder);
362+
OpFoldResult newDim;
363+
if (lhsState.scalar) {
364+
assert(rhsState.scalar && "Unexpected case where rhs is not a scalar");
365+
// If both lhs and rhs are scalars, we can't just derive the dimension of
366+
// the mask as the minimum value: lhs/rhs could be 0 and then we don't
367+
// load/store anything.
368+
//
369+
// Instead treat the comparison as a scalar that determines if anything
370+
// should be loaded/stored by inserting a comparison + select:
371+
// dim = lhs < rhs ? lhs.dim : 0
372+
newDim = compareOFRs(lhsState.scalar, rhsState.scalar, cmpOp.getPredicate(),
373+
lhsState.dims[cmpDim], builder.getIndexAttr(0),
374+
loc, builder);
375+
} else if (cmpOp.getPredicate() == arith::CmpIPredicate::slt ||
376+
cmpOp.getPredicate() == arith::CmpIPredicate::ult) {
377+
// Important:
378+
// In the case where the values we are loading are entirely masked off like
379+
// the following:
380+
//
381+
// ---|-------|-----------|
382+
// ^ ^ ^
383+
// scalar start end
384+
//
385+
// newEnd = min(end, scalar) = scalar
386+
// Now scalar < start, so simply doing dim = newEnd - start is incorrect.
387+
//
388+
// The correct formula is to optionally move `newDim` back to `start` using
389+
// max(newEnd, start).
390+
auto newEnd = minOFRs(lhsState.end, rhsState.scalar, loc, builder);
391+
newEnd = maxOFRs(newEnd, lhsState.start, loc, builder);
392+
newDim = subOFRs(newEnd, lhsState.start, loc, builder);
393+
} else {
394+
assert(cmpOp.getPredicate() == arith::CmpIPredicate::sge && rhsState.scalar
395+
&& hasConstZero(rhsState.scalar));
396+
newDim = lhsState.dims[cmpDim];
397+
}
358398

359399
for (int32_t i = 0; i < lhsState.getRank(); i++) {
360400
if (i == cmpDim)

lib/Analysis/OpFoldResultUtils.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,4 +245,43 @@ OpFoldResult maxOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
245245
return maxOp.getResult();
246246
}
247247

248+
OpFoldResult compareOFRs(const OpFoldResult lhs, const OpFoldResult rhs,
249+
const arith::CmpIPredicate pred, const OpFoldResult trueOFR,
250+
const OpFoldResult falseOFR, const Location loc, OpBuilder &b) {
251+
auto lhsIntAttr = getIntAttr(lhs);
252+
auto rhsIntAttr = getIntAttr(rhs);
253+
254+
// both lhs and rhs are constants, return the result directly
255+
if (lhsIntAttr && rhsIntAttr) {
256+
switch (pred) {
257+
case arith::CmpIPredicate::eq:
258+
return *lhsIntAttr == *rhsIntAttr ? trueOFR : falseOFR;
259+
case arith::CmpIPredicate::ne:
260+
return *lhsIntAttr != *rhsIntAttr ? trueOFR : falseOFR;
261+
case arith::CmpIPredicate::slt:
262+
case arith::CmpIPredicate::ult:
263+
return *lhsIntAttr < *rhsIntAttr ? trueOFR : falseOFR;
264+
case arith::CmpIPredicate::sle:
265+
case arith::CmpIPredicate::ule:
266+
return *lhsIntAttr <= *rhsIntAttr ? trueOFR : falseOFR;
267+
case arith::CmpIPredicate::sgt:
268+
case arith::CmpIPredicate::ugt:
269+
return *lhsIntAttr > *rhsIntAttr ? trueOFR : falseOFR;
270+
case arith::CmpIPredicate::sge:
271+
case arith::CmpIPredicate::uge:
272+
return *lhsIntAttr >= *rhsIntAttr ? trueOFR : falseOFR;
273+
default:
274+
llvm_unreachable("Unsupported predicate");
275+
}
276+
}
277+
278+
auto lhsValue = ofrToIndexValue(lhs, loc, b);
279+
auto rhsValue = ofrToIndexValue(rhs, loc, b);
280+
auto trueValue = ofrToIndexValue(trueOFR, loc, b);
281+
auto falseValue = ofrToIndexValue(falseOFR, loc, b);
282+
283+
auto cmpOp = b.create<arith::CmpIOp>(loc, pred, lhsValue, rhsValue);
284+
auto selectOp = b.create<arith::SelectOp>(loc, cmpOp, trueValue, falseValue);
285+
return selectOp.getResult();
286+
}
248287
} // namespace mlir

lib/AnalysisStructured/PtrAnalysis.cpp

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,28 @@ LogicalResult PtrState::addState(const PtrState &lhsState,
251251
return success();
252252
}
253253

254+
void PtrState::dump() const {
255+
llvm::dbgs() << "PtrState: ";
256+
if (source) {
257+
llvm::dbgs() << "source: " << source << "\n";
258+
}
259+
if (scalar) {
260+
llvm::dbgs() << "scalar: " << scalar << "\n";
261+
}
262+
263+
llvm::dbgs() << "offsets: ";
264+
llvm::interleave(offsets, llvm::dbgs(), "\n");
265+
llvm::dbgs() << "\nstrides: ";
266+
llvm::interleave(strides, llvm::dbgs(), "\n");
267+
llvm::dbgs() << "\nsizes: ";
268+
llvm::interleave(sizes, llvm::dbgs(), "\n");
269+
llvm::dbgs() << "\nshape: ";
270+
llvm::interleave(shape, llvm::dbgs(), "\n");
271+
llvm::dbgs() << "\norder: ";
272+
llvm::interleave(order, llvm::dbgs(), "\n");
273+
llvm::dbgs() << "\n";
274+
}
275+
254276
LogicalResult PtrState::mulState(const PtrState &lhsState,
255277
const PtrState &rhsState, Operation *op,
256278
OpBuilder &builder) {
@@ -265,9 +287,6 @@ LogicalResult PtrState::mulState(const PtrState &lhsState,
265287
return failure();
266288
}
267289

268-
assert(!(lhsState.scalar && rhsState.scalar) &&
269-
"do not expect to see both lhs and rhs are scalars");
270-
271290
// currently do not support both tensors are effectively non-scalar
272291
if (!lhsState.scalar && !rhsState.scalar) {
273292
op->emitRemark(
@@ -283,6 +302,11 @@ LogicalResult PtrState::mulState(const PtrState &lhsState,
283302
std::swap(lhs, rhs);
284303
}
285304

305+
if (lhsState.scalar && rhsState.scalar) {
306+
scalar = builder.create<arith::MulIOp>(
307+
loc, lhsState.scalar, rhsState.scalar);
308+
}
309+
286310
for (uint64_t i = 0; i < lhs->sizes.size(); i++) {
287311
OpFoldResult newOffset =
288312
mulOFRValue(lhs->offsets[i], rhs->scalar, loc, builder);
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
// RUN: triton-shared-opt --triton-to-structured --split-input-file %s | FileCheck %s
2+
3+
// These tests check that loads/stores that exhibit a cmp ge against 0 work
4+
// correctly with the pointer analysis pass
5+
6+
// Example of the triton kernel that generates the loads/stores with cmp ge 0.
7+
// The boundary_check fields of the load/stores, along with preprocessing the
8+
// kernel through --triton-rewrite-tensor-pointer before calling the
9+
// --triton-to-structured pass results in those cmp ge 0 instructions.
10+
//
11+
// def kernel(in_ptr0, out_ptr0, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
12+
// yoffset = tl.program_id(1) * YBLOCK
13+
// xoffset = tl.program_id(0) * XBLOCK
14+
// tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[16640, 10],
15+
// strides=[1, 16640], block_shape=[XBLOCK, YBLOCK],
16+
// order=[1, 0], offsets=[xoffset, yoffset]),
17+
// boundary_check=[0, 1])
18+
// tl.store(tl.make_block_ptr(out_ptr0, shape=[16640, 10],
19+
// strides=[1, 16640], block_shape=[XBLOCK, YBLOCK],
20+
// order=[1, 0], offsets=[xoffset, yoffset]),
21+
// tl.broadcast_to(tmp0, [XBLOCK, YBLOCK]).to(tl.float16),
22+
// boundary_check=[0, 1])
23+
24+
tt.func public @test_masked_load(%arg0: !tt.ptr<f16>) -> tensor<16x16xf16> {
25+
%cst = arith.constant dense<0> : tensor<1x16xi64>
26+
%c16_i32 = arith.constant 16 : i32
27+
%0 = tt.get_program_id y : i32
28+
%1 = arith.muli %0, %c16_i32 : i32
29+
%2 = arith.extsi %1 : i32 to i64
30+
%3 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<16x16x!tt.ptr<f16>>
31+
%4 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
32+
%5 = arith.extsi %4 : tensor<16xi32> to tensor<16xi64>
33+
%6 = tt.expand_dims %5 {axis = 1 : i32} : tensor<16xi64> -> tensor<16x1xi64>
34+
%7 = tt.broadcast %6 : tensor<16x1xi64> -> tensor<16x16xi64>
35+
%8 = tt.addptr %3, %7 : tensor<16x16x!tt.ptr<f16>>, tensor<16x16xi64>
36+
%9 = tt.splat %2 : i64 -> tensor<16xi64>
37+
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
38+
%11 = arith.extsi %10 : tensor<16xi32> to tensor<16xi64>
39+
%12 = arith.addi %9, %11 : tensor<16xi64>
40+
%13 = tt.expand_dims %12 {axis = 0 : i32} : tensor<16xi64> -> tensor<1x16xi64>
41+
%14 = arith.cmpi sge, %13, %cst : tensor<1x16xi64>
42+
%15 = tt.broadcast %14 : tensor<1x16xi1> -> tensor<16x16xi1>
43+
%16 = tt.load %8, %15 evictionPolicy = evict_last : tensor<16x16x!tt.ptr<f16>>
44+
tt.return %16 : tensor<16x16xf16>
45+
}
46+
47+
// CHECK: tt.func public @test_masked_load([[arg0_:%.+]]: !tt.ptr<f16>) -> tensor<16x16xf16> {
48+
// CHECK: [[VAR_0_:%.+]] = tts.make_tptr [[arg0_]] to sizes: [16, 16], strides: [1, 0], offsets: [0, 0], shape: [0, 0], order: [] : <f16> to tensor<16x16x!tt.ptr<f16>>
49+
// CHECK: [[VAR_1_:%.+]] = "tts.load"([[VAR_0_]]) <{operandSegmentSizes = array<i32: 1, 0, 0>, static_mask_dims = array<i64: 16, 16>}> : (tensor<16x16x!tt.ptr<f16>>) -> tensor<16x16xf16>
50+
// CHECK: }
51+
52+
// -----
53+
54+
tt.func public @test_masked_store(%arg0: !tt.ptr<f16>) {
55+
%cst = arith.constant dense<0> : tensor<16x1xi64>
56+
%cst_0 = arith.constant dense<1.500000e+01> : tensor<16x16xf16>
57+
%0 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<16x16x!tt.ptr<f16>>
58+
%1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
59+
%2 = arith.extsi %1 : tensor<16xi32> to tensor<16xi64>
60+
%3 = tt.expand_dims %2 {axis = 1 : i32} : tensor<16xi64> -> tensor<16x1xi64>
61+
%4 = tt.broadcast %3 : tensor<16x1xi64> -> tensor<16x16xi64>
62+
%5 = tt.addptr %0, %4 : tensor<16x16x!tt.ptr<f16>>, tensor<16x16xi64>
63+
%6 = arith.cmpi sge, %3, %cst : tensor<16x1xi64>
64+
%7 = tt.broadcast %6 : tensor<16x1xi1> -> tensor<16x16xi1>
65+
tt.store %5, %cst_0, %7 : tensor<16x16x!tt.ptr<f16>>
66+
tt.return
67+
}
68+
69+
// CHECK: tt.func public @test_masked_store([[arg0_:%.+]]: !tt.ptr<f16>) {
70+
// CHECK-DAG: [[VAR_cst_:%.+]] = arith.constant dense<1.500000e+01> : tensor<16x16xf16>
71+
// CHECK-DAG: [[VAR_0_:%.+]] = tts.make_tptr [[arg0_]] to sizes: [16, 16], strides: [1, 0], offsets: [0, 0], shape: [0, 0], order: [] : <f16> to tensor<16x16x!tt.ptr<f16>>
72+
// CHECK: "tts.store"([[VAR_0_]], [[VAR_cst_]]) <{static_mask_dims = array<i64: 16, 16>}> : (tensor<16x16x!tt.ptr<f16>>, tensor<16x16xf16>) -> ()
73+
// CHECK: }

0 commit comments

Comments
 (0)