Commit ef8ded8
Generalize Intel coalescing pass to handle users of scf.for with coalesced load (#2856)
Ensure that loads of a `scf.for`-yielded value with block pointer type can be coalesced.

Signed-off-by: Tiotto, Ettore <[email protected]>
Co-authored-by: Whitney Tsang <[email protected]>
1 parent 61e33eb commit ef8ded8
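
The pattern this enables, in brief: a block (tensor) pointer created by tt.make_tensor_ptr, carried through scf.for iter_args, and loaded only after the loop. A minimal sketch of that IR shape (illustrative only; simplified from the test added below, with hypothetical shapes and value names):

// A block pointer threaded through a loop and loaded afterwards.
%ptr = tt.make_tensor_ptr %base, ... : <tensor<32x128xf32, #blocked>>
%out = scf.for %i = %lb to %ub step %c1
    iter_args(%p = %ptr) -> (!tt.ptr<tensor<32x128xf32, #blocked>>) : i32 {
  scf.yield %p : !tt.ptr<tensor<32x128xf32, #blocked>>
}
// The load consumes a result of the scf.for; before this change the pass
// could not trace %out back to the defining tt.make_tensor_ptr, so this
// load was not coalesced.
%v = tt.load %out : !tt.ptr<tensor<32x128xf32, #blocked>>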

2 files changed: +42 -4 lines

test/TritonIntelGPU/coalesce.mlir

Lines changed: 26 additions & 0 deletions
@@ -382,4 +382,30 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, "ttg.th
     }) : (tensor<32x128xf32, #blocked>) -> tensor<32xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
     tt.return
   }
+
+  // CHECK: @issue_2762
+  tt.func public @issue_2762(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
+    %c128_i32 = arith.constant 128 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c262144_i64 = arith.constant 262144 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c512_i64 = arith.constant 512 : i64
+    %c32_i32 = arith.constant 32 : i32
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c32_i32 : i32
+    %4 = arith.divsi %1, %c512_i32 : i32
+    %5 = arith.remsi %1, %c512_i32 : i32
+    // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr %arg0, {{.*}} : <tensor<1x32x128xf32, [[BLOCKED_LAYOUT1]]>>
+    %y = tt.make_tensor_ptr %arg0, [%c512_i64, %c512_i64, %c512_i64], [%c1_i64, %c512_i64, %c262144_i64], [%4, %5, %c0_i32] {order = array<i32: 2, 1, 0>} : <tensor<1x32x128xf32, #blocked1>>
+    // CHECK: [[RES:%.*]] = scf.for {{.*}} iter_args([[ARG1:%.*]] = [[PTR1]]) -> (!tt.ptr<tensor<1x32x128xf32, [[BLOCKED_LAYOUT1]]>>)
+    %8:1 = scf.for %arg5 = %c0_i32 to %c512_i32 step %c128_i32 iter_args(%arg7 = %y) -> (!tt.ptr<tensor<1x32x128xf32, #blocked1>>) : i32 {
+      // CHECK: scf.yield [[ARG1]] : !tt.ptr<tensor<1x32x128xf32, [[BLOCKED_LAYOUT1]]>>
+      scf.yield %arg7 : !tt.ptr<tensor<1x32x128xf32, #blocked1>>
+    }
+    // CHECK: [[LOAD_RES:%.*]] = tt.load [[RES]] : !tt.ptr<tensor<1x32x128xf32, [[BLOCKED_LAYOUT1]]>>
+    // CHECK: ttg.convert_layout [[LOAD_RES]] : tensor<1x32x128xf32, [[BLOCKED_LAYOUT1]]> -> tensor<1x32x128xf32, [[BLOCKED_LAYOUT2]]>
+    %res = tt.load %8#0 : !tt.ptr<tensor<1x32x128xf32, #blocked1>>
+    tt.return
+  }
 }
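
A note on reading the test: this is a lit/FileCheck test, and [[BLOCKED_LAYOUT1]] and [[BLOCKED_LAYOUT2]] are FileCheck variables bound by CHECK lines earlier in the file, outside this hunk. The RUN line at the top of the file is likewise not part of the diff; it presumably looks roughly like the line below (the exact triton-opt flag is an assumption inferred from the pass name, not taken from the commit):

// RUN: triton-opt %s -split-input-file --tritonintelgpu-coalesce | FileCheck %s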

third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp

Lines changed: 16 additions & 4 deletions
@@ -122,6 +122,8 @@ struct CoalescePass
   // Find the defining makeTensorPtrOp operation of the given value.
   static std::optional<tt::MakeTensorPtrOp>
   findDefiningMakeTensorPtrOp(Value val) {
+    LDBG("Attempting to find `makeTensorPtrOp` defining: " << val);
+
     if (auto arg = dyn_cast<BlockArgument>(val)) {
       Operation *parentOp = val.getParentBlock()->getParentOp();
       assert(isa<scf::ForOp>(parentOp) && "Expected a scf::ForOp");
@@ -134,6 +136,14 @@ struct CoalescePass
       return findDefiningMakeTensorPtrOp(advanceOp.getPtr());
     if (auto makePtrOp = val.getDefiningOp<tt::MakeTensorPtrOp>())
       return makePtrOp;
+    if (auto opRes = dyn_cast<OpResult>(val)) {
+      Operation *defOp = opRes.getOwner();
+      if (auto forOp = dyn_cast<scf::ForOp>(defOp)) {
+        Value val = forOp.getYieldedValues()[opRes.getResultNumber()];
+        return findDefiningMakeTensorPtrOp(val);
+      }
+      assert(false && "unhandled operation");
+    }

     return std::nullopt;
   }
@@ -369,12 +379,14 @@ struct CoalescePass
     });

     LLVM_DEBUG({
-      DBGS() << "\nlayoutMap:\n";
+      DBGS() << "layoutMap:\n";
+      if (layoutMap.empty())
+        DBGS() << "\t<empty>";
       for (auto [op, encoding] : layoutMap) {
-        DBGS() << "op: " << *op << "\n";
-        DBGS() << "encoding: " << encoding << "\n\n";
+        DBGS() << "\top: " << *op << "\n";
+        DBGS() << "\tencoding: " << encoding << "\n";
       }
-      llvm::errs() << "\n\n";
+      llvm::errs() << "\n";
     });

     // For each memory op that has a layout L1:
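
How the new OpResult case resolves the test above: findDefiningMakeTensorPtrOp now looks through a scf.for result by recursing on the value yielded at the same result index, and the pre-existing BlockArgument case then (presumably; its body is truncated in this hunk) steps from the loop-carried argument to the corresponding loop operand. Annotated against the value names in the test (an illustrative trace, not pass output):

%res = tt.load %8#0   // %8#0: OpResult of the scf.for
                      //   -> recurse on the yielded value %arg7
                      // %arg7: BlockArgument (iter_arg of the loop)
                      //   -> recurse on the loop init operand %y
                      // %y: defined by tt.make_tensor_ptr -> found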
