
Commit 737e7b3

[triton-raise-block-ptr]: Fix lowering of tt.addptr where the ptr operand is yielded by a previous tt.advance operation (#3296)

Fixes issue #3295.

Signed-off-by: Tiotto, Ettore <[email protected]>
1 parent c74b88e commit 737e7b3
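
At its core, the change teaches rewriteAddPtrOp to handle a tt.addptr whose pointer operand has already been rewritten into a tt.advance: the pass walks back through the chain of tt.advance operations to the tt.make_tensor_ptr that created the block pointer, then emits a new tt.advance on top of the previous one (see the C++ diff below). A minimal, self-contained sketch of that walk, for illustration only; it assumes the Triton dialect headers already used by the pass, and the helper name is invented here:

#include "mlir/IR/Value.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

namespace tt = mlir::triton;

// Follow a chain of tt.advance ops from `ptr` back to the tt.make_tensor_ptr
// that created the block pointer; returns a null op if the chain is not
// rooted at a tt.make_tensor_ptr.
static tt::MakeTensorPtrOp findRootMakeTensorPtr(mlir::Value ptr) {
  while (auto advanceOp = ptr.getDefiningOp<tt::AdvanceOp>())
    ptr = advanceOp.getPtr();
  return ptr.getDefiningOp<tt::MakeTensorPtrOp>();
}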

File tree

2 files changed: +112, -47 lines
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
// RUN: triton-opt %s -triton-raise-block-pointer -canonicalize | FileCheck %s

module {
  tt.func @kernel(
    %arg0 : !tt.ptr<bf16>,
    %arg1 : i32
  )
  {
    %0 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
    // offset = 0, size = 4, stride = 1
    %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32>
    // offset = [0,0], size = [4,1], stride = [1,0]
    %2 = tt.broadcast %1 : tensor<4x1xi32> -> tensor<4x256xi32>
    // offset = [0,0], size = [4,256], stride = [1,0]
    %arg1splat = tt.splat %arg1 : i32 -> tensor<4x256xi32>
    %offset3 = arith.addi %2, %arg1splat : tensor<4x256xi32>
    // offset = [%arg1,0], size = [4,256], stride = [1,0]
    %3 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
    // offset = 0, size = 256, stride = 1
    %4 = tt.expand_dims %3 {axis = 0 : i32} : tensor<256xi32> -> tensor<1x256xi32>
    // offset = [0,0], size = [1,256], stride = [0,1]
    %5 = tt.broadcast %4 : tensor<1x256xi32> -> tensor<4x256xi32>
    // offset = [0,0], size = [4,256], stride = [0,1]
    %6 = arith.constant 5 : i32
    %splat6 = tt.splat %6 : i32 -> tensor<4x256xi32>
    %scale5 = arith.muli %5, %splat6 : tensor<4x256xi32>
    // offset = [0,0], size = [4,256], stride = [0,5]
    %7 = arith.addi %offset3, %scale5 : tensor<4x256xi32>
    // offset = [%arg1, 0], size = [4, 256], stride = [1, 5]
    %8 = tt.splat %arg0 : !tt.ptr<bf16> -> tensor<4x256x!tt.ptr<bf16>>
    %9 = tt.addptr %8, %7 : tensor<4x256x!tt.ptr<bf16>>, tensor<4x256xi32>
    // source: %arg0, offset = [%arg1, 0], size = [4, 256], stride = [1, 5]
    %10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<4x256x!tt.ptr<bf16>>
    %12 = tt.addptr %9, %7 : tensor<4x256x!tt.ptr<bf16>>, tensor<4x256xi32>
    // source: %arg0, offset = [%arg1+%arg1, 0], size = [4, 256], stride = [2, 10]
    %13 = tt.load %12 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<4x256x!tt.ptr<bf16>>
    %14 = arith.addf %10, %13 : tensor<4x256xbf16>
    %16 = tt.addptr %12, %7 : tensor<4x256x!tt.ptr<bf16>>, tensor<4x256xi32>
    // source: %arg0, offset = [%arg1+%arg1+%arg1, 0], size = [4, 256], stride = [3, 15]
    tt.store %16, %14 : tensor<4x256x!tt.ptr<bf16>>
    tt.return
  }
}

// CHECK: tt.func @kernel([[PARAM_0_:%.+]]: !tt.ptr<bf16>, [[PARAM_1_:%.+]]: i32) {
// CHECK-DAG: [[CST_1_i64:%.+]] = arith.constant 1 : i64
// CHECK-DAG: [[CST_0_i64:%.+]] = arith.constant 0 : i64
// CHECK-DAG: [[CST_0_i32:%.+]] = arith.constant 0 : i32
// CHECK-DAG: [[CST_5_i64:%.+]] = arith.constant 5 : i64
// CHECK: [[VAR_0_:%.+]] = tt.make_tensor_ptr [[PARAM_0_]], {{\[}}[[CST_0_i64]], [[CST_0_i64]]], {{\[}}[[CST_1_i64]], [[CST_5_i64]]], {{\[}}[[PARAM_1_]], [[CST_0_i32]]] {{.*}} : <tensor<4x256xbf16>>
// CHECK: [[VAR_1_:%.+]] = tt.load [[VAR_0_]] : !tt.ptr<tensor<4x256xbf16>>
// CHECK: [[VAR_2_:%.+]] = tt.advance [[VAR_0_]], {{\[}}[[PARAM_1_]], [[CST_0_i32]]] : <tensor<4x256xbf16>>
// CHECK: [[VAR_3_:%.+]] = tt.load [[VAR_2_]] : !tt.ptr<tensor<4x256xbf16>>
// CHECK-DAG: [[VAR_4_:%.+]] = arith.addf [[VAR_1_]], [[VAR_3_]] : tensor<4x256xbf16>
// CHECK-DAG: [[VAR_5_:%.+]] = tt.advance [[VAR_2_]], {{\[}}[[PARAM_1_]], [[CST_0_i32]]] : <tensor<4x256xbf16>>
// CHECK: tt.store [[VAR_5_]], [[VAR_4_]] : !tt.ptr<tensor<4x256xbf16>>
// CHECK: tt.return
// CHECK: }
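
The interesting contrast in this test is between the input, which addresses memory through a tensor of scalar pointers (tensor<4x256x!tt.ptr<bf16>>), and the expected output, which uses a single block pointer (!tt.ptr<tensor<4x256xbf16>>) produced by tt.make_tensor_ptr and advanced twice. A small sketch of how that distinction can be checked, assuming only the triton::isTensorPointerType helper that the pass itself relies on in the change below; the wrapper name is invented for illustration:

#include "mlir/IR/Value.h"
#include "triton/Dialect/Triton/IR/Types.h"

// True for block pointers such as !tt.ptr<tensor<4x256xbf16>> (the type
// produced by tt.make_tensor_ptr / tt.advance); false for ordinary pointer
// tensors such as tensor<4x256x!tt.ptr<bf16>>.
static bool isBlockPointer(mlir::Value v) {
  return mlir::triton::isTensorPointerType(v.getType());
}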

third_party/intel/lib/TritonRaiseBlockPointer/TritonRaiseBlockPointer.cpp

Lines changed: 54 additions & 47 deletions
@@ -12,6 +12,7 @@
 #include "mlir/IR/Verifier.h"
 #include "mlir/Support/LLVM.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/Triton/IR/Types.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -373,6 +374,8 @@ struct PtrState {

   Value createTTAdvanceOp(Value ptr, tt::MakeTensorPtrOp makeTPtrOp,
                           OpBuilder &builder, Location loc) const {
+    assert(triton::isTensorPointerType(ptr.getType()) &&
+           "Expecting a block ptr");
     SmallVector<Value> newOffsets;
     for (const auto &[offset, stride] :
          llvm::zip(offsets, makeTPtrOp.getStrides()))
@@ -676,44 +679,13 @@ struct TritonRaiseBlockPointer
   }

   LogicalResult rewriteAddPtrOp(tt::AddPtrOp op) {
-    LLVM_DEBUG(llvm::dbgs() << "Rewriting: " << *op << "\n");
-
     OpBuilder builder(op);
     Location loc = op.getLoc();
     Value ptr = op.getPtr();

-    auto fillOffsets = [&](Value offset, unsigned rank,
-                           SmallVector<Value> &offsets) {
-      switch (rank) {
-      case 1:
-        offsets.push_back(offset);
-        break;
-      case 2:
-        offsets.push_back(
-            findOrCreateConstant(loc, 0, offsetBitwidth, builder));
-        offsets.push_back(offset);
-        break;
-      default:
-        llvm_unreachable("unexpected rank");
-      }
-    };
-
-    auto getConstantValue = [](arith::ConstantOp cstOp) {
-      TypedAttr cstVal = cstOp.getValue();
-      APInt val;
-      if (auto attr = dyn_cast<DenseIntElementsAttr>(cstVal))
-        val = attr.getSplatValue<APInt>();
-      else if (auto attr = dyn_cast<IntegerAttr>(cstVal))
-        val = attr.getValue();
-      else
-        assert(false && "unexpected constant type");
-
-      return val;
-    };
-
-    // If the ptr has already been mapped (i.e. rewritten into a block
-    // pointer), rewrite the AddPtrOp using and AdvanceOp.
+    // Case 1: the ptr has already been mapped.
     if (Value mappedV = ptrMap.lookupOrNull(ptr)) {
+      // Case 1a: the ptr has been mapped to a make_tensor_ptr operation.
       if (auto makeTPtrOp = mappedV.getDefiningOp<tt::MakeTensorPtrOp>()) {
         PtrState state;
         if (failed(visitOperand(op.getOffset(), state, loc, builder)))
@@ -726,20 +698,60 @@ struct TritonRaiseBlockPointer
         cleanUp.insert(op);
         ptrMap.map(op.getResult(), advanceOp);

-        LLVM_DEBUG(llvm::dbgs()
-                   << "Rewrote:\n\t" << op << "\nto:\n\t" << advanceOp << "\n");
+        LLVM_DEBUG({
+          auto modOp =
+              builder.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
+          llvm::dbgs() << "Module:\n" << modOp << "\n";
+          llvm::dbgs() << "Rewrote:\n\t" << op << "\nto:\n\t" << advanceOp
+                       << "\n";
+        });
+
+        return success();
+      }
+
+      // Case 1b: the ptr has been mapped to a tt.advance operation.
+      if (auto advanceOp = mappedV.getDefiningOp<tt::AdvanceOp>()) {
+        PtrState state;
+        if (failed(visitOperand(op.getOffset(), state, loc, builder)))
+          return failure();
+
+        // Skip through a chain of tt.advance operations...
+        Value ptr = advanceOp.getPtr();
+        while (auto advanceOp = ptr.getDefiningOp<tt::AdvanceOp>())
+          ptr = advanceOp.getPtr();
+
+        // ... until we find the make_tensor_ptr operation defining the block
+        // ptr feeding the first tt.advance operation.
+        auto makeTPtrOp = ptr.getDefiningOp<tt::MakeTensorPtrOp>();
+        assert(makeTPtrOp && "Expected a MakeTensorPtrOp");
+
+        Value newAdvanceOp = state.createTTAdvanceOp(advanceOp.getResult(),
+                                                     makeTPtrOp, builder, loc);
+
+        cleanUp.insert(op);
+        ptrMap.map(op.getResult(), newAdvanceOp);
+
+        LLVM_DEBUG({
+          llvm::dbgs() << "Rewrote:\n\t" << op << "\nto:\n\t" << newAdvanceOp
+                       << "\n";
+          auto modOp =
+              builder.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
+          llvm::dbgs() << "Module:\n" << modOp << "\n";
+        });
+
         return success();
-      } else {
-        llvm_unreachable("Did not find tt::MakeTensorPtrOp");
       }
+
+      llvm_unreachable("Unexpected mappedV defining operation");
     }

+    // Case 2: the ptr has not previously been mapped.
     // If the addptr operation increments a scalar pointer, give up.
     Value result = op.getResult();
     if (!isa<RankedTensorType>(result.getType()))
       return failure();

-    // Otherwise, rewrite the AddPtrOp using PtrState.
+    // Otherwise, rewrite the AddPtrOp.
     PtrState state;
     if (failed(visitOperandAddptr(op, state, loc, builder)))
       return failure();
@@ -750,16 +762,11 @@ struct TritonRaiseBlockPointer
     Value makePtrOp = state.createTTMakeTensorPtrOp(builder, loc);
     knownPtrs[makePtrOp] = std::move(state);

-    ptrMap.map(result, makePtrOp);
-
-    LLVM_DEBUG(llvm::dbgs()
-               << "Rewrote:\n\t" << op << "\nto:\n\t" << makePtrOp << "\n");
-
-    // AddPtrOps that have been rewritten and no longer used in the code must
-    // be removed in the pass to avoid type matching issue.
     cleanUp.insert(op);
+    ptrMap.map(result, makePtrOp);

     LLVM_DEBUG({
+      llvm::dbgs() << "Rewrote:\n\t" << op << "\nto:\n\t" << makePtrOp << "\n";
       auto modOp =
           builder.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
       llvm::dbgs() << "Module:\n" << modOp << "\n";
@@ -915,8 +922,8 @@ struct TritonRaiseBlockPointer
     }

     // This operand must be an iter-arg of an inner-loop in a multiple-level
-    // nested loop, which means its PtrState must have already been populated
-    // during rewriteForOp of the parent loop.
+    // nested loop, which means its PtrState must have already been
+    // populated during rewriteForOp of the parent loop.
     state = knownPtrs[operand];
     return success();
   }
