Fix getPredMask to handle loads through pointers to tensors. (#4582)

etiotto · web-flow · commit 92c789c056a2 · 2025-06-26T14:35:35.000-04:00
Fixes issue #4580. Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
diff --git a/lib/Dialect/Triton/IR/Utility.cpp b/lib/Dialect/Triton/IR/Utility.cpp
@@ -2,13 +2,14 @@
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/Triton/IR/Types.h"
 
 using namespace mlir;
 namespace tt = mlir::triton;
 
 Value tt::getPredMask(RewriterBase &rewriter, Type typeLike, Value currentMask,
                       Value pred) {
-  Type maskType = tt::getI1SameShape(typeLike);
+  Type maskType = tt::getI1SameShape(tt::getPointeeType(typeLike));
   Location loc = pred.getLoc();
   Value mask = pred;
   if (isa<RankedTensorType>(maskType)) {
diff --git a/test/Triton/loop-invariant-code-motion.mlir b/test/Triton/loop-invariant-code-motion.mlir
@@ -1,10 +1,10 @@
 // RUN: triton-opt --split-input-file %s -triton-licm | FileCheck %s
 
-tt.func @hoist_load_without_mask(%arg0: tensor<1024x!tt.ptr<f32>>, %arg1: tensor<1024xi32>, %arg2: tensor<1024xi32>, %arg3: i32, %arg4 : i32, %arg5: tensor<1024x!tt.ptr<f32>>) {
+tt.func @hoist_load_without_mask1(%arg0: tensor<1024x!tt.ptr<f32>>, %arg1: tensor<1024xi32>, %arg2: tensor<1024xi32>, %arg3: i32, %arg4 : i32, %arg5: tensor<1024x!tt.ptr<f32>>) {
   %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
   %c1_i32 = arith.constant 1 : i32
   // Check if the load is hoisted
-  // CHECK-LABEL: hoist_load_without_mask
+  // CHECK-LABEL: hoist_load_without_mask1
   // CHECK: %[[TRIP_COUNT_CMP:.*]] = arith.cmpi slt, %[[LB:.*]], %[[UB:.*]]
   // CHECK: %[[SPLAT:.*]] = tt.splat %[[TRIP_COUNT_CMP]]
   // CHECK: %[[LOAD:.*]] = tt.load %[[_:.*]], %[[SPLAT]]
@@ -23,6 +23,29 @@ tt.func @hoist_load_without_mask(%arg0: tensor<1024x!tt.ptr<f32>>, %arg1: tensor
 
 // -----
 
+tt.func @hoist_load_without_mask2(%arg0: !tt.ptr<tensor<1024xf32>>, %arg3: i32, %arg4 : i32, %arg5: !tt.ptr<tensor<1024xf32>>) {
+  %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
+  %c1_i32 = arith.constant 1 : i32
+  // Check that the load is hoisted above the loop and masked by the splatted trip-count comparison
+  // CHECK-LABEL: hoist_load_without_mask2
+  // CHECK: %[[TRIP_COUNT_CMP:.*]] = arith.cmpi slt, %[[LB:.*]], %[[UB:.*]]
+  // CHECK: %[[SPLAT:.*]] = tt.splat %[[TRIP_COUNT_CMP]]
+  // CHECK: %[[LOAD:.*]] = tt.load %[[_:.*]], %[[SPLAT]]
+  // CHECK: arith.addf %[[LOAD]], %[[LOAD]]
+  // CHECK: scf.for
+  // CHECK-NOT: tt.load
+  %1 = scf.for %arg7 = %arg3 to %arg4 step %c1_i32 iter_args(%arg6 = %cst) -> (tensor<1024xf32>)  : i32 {
+    %2 = tt.load %arg0 : !tt.ptr<tensor<1024xf32>>
+    %3 = arith.addf %2, %2 : tensor<1024xf32>
+    %4 = arith.addf %arg6, %3 : tensor<1024xf32>
+    scf.yield %4 : tensor<1024xf32>
+  }
+  tt.store %arg5, %1 : !tt.ptr<tensor<1024xf32>>
+  tt.return
+}
+
+// -----
+
 tt.func @hoist_two_loads_without_mask(%arg0: tensor<1024x!tt.ptr<f32>>, %arg1: tensor<1024xi32>, %arg2: tensor<1024xi32>, %arg3: i32, %arg4 : i32, %arg5: tensor<1024x!tt.ptr<f32>>, %arg6: tensor<1024x!tt.ptr<f32>>) {
   %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
   %c1_i32 = arith.constant 1 : i32
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
@@ -161,9 +161,8 @@ static Operation *predicateOp(RewriterBase &rewriter, Operation *op,
   return TypeSwitch<Operation *, Operation *>(op)
       .Case<tt::LoadOp, ttgi::PrefetchOp>([&](auto op) {
         rewriter.setInsertionPoint(op);
-        Value mask =
-            tt::getPredMask(rewriter, tt::getPointeeType(op.getPtr().getType()),
-                            op.getMask(), pred);
+        Value mask = tt::getPredMask(rewriter, op.getPtr().getType(),
+                                     op.getMask(), pred);
         op.getMaskMutable().assign(mask);
         return op;
       });