Skip to content

Commit 22b7ec9

Browse files
committed
Reduce PR footprint
Signed-off-by: Tiotto, Ettore <[email protected]>
1 parent ae3d625 commit 22b7ec9

File tree

2 files changed

+10
-74
lines changed

third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp

Lines changed: 2 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -305,29 +305,8 @@ bool hasConvertToMMATransisitiveUse(Operation *op, Attribute encoding) {
305305
// Return true if the op is an op with a layout we don't want to change. We will
306306
// propagate the layout starting from anchor ops.
307307
bool isLayoutAnchor(Operation *op) {
308-
if (isa<LoadOp>(op)) {
309-
#ifdef HACK
310-
// Note: currently block ptr loads are always considered not expensive and
311-
// therefore they are never layout anchors.
312-
Value base = op->getOperand(0);
313-
auto parentLoop = op->getParentOfType<scf::ForOp>();
314-
bool isInLoop = parentLoop != nullptr;
315-
bool isTensorPtrLoad = mlir::triton::isTensorPointerType(base.getType());
316-
317-
if (!isTensorPtrLoad)
318-
ttgi::isExpensiveLoadOrStore(op);
319-
320-
// HACK: consider block ptr loads expensive if they are in a loop.
321-
return isInLoop;
322-
#else
308+
if (isa<LoadOp, StoreOp>(op))
323309
return ttgi::isExpensiveLoadOrStore(op);
324-
#endif
325-
}
326-
327-
if (isa<StoreOp>(op)) {
328-
return ttgi::isExpensiveLoadOrStore(op);
329-
}
330-
331310
if (isa<DotOp, AtomicCASOp>(op))
332311
return true;
333312
if (isa<AtomicRMWOp>(op))
@@ -377,17 +356,6 @@ void LayoutPropagation::initAnchorLayout() {
377356
}
378357
}
379358
});
380-
381-
#if 0
382-
llvm::errs() << "Initial layouts:\n";
383-
for (auto &entry : layouts) {
384-
llvm::errs() << entry.first << "\n";
385-
for (auto &layout : entry.second.encodings) {
386-
llvm::errs() << " " << layout << "\n";
387-
}
388-
}
389-
llvm::errs() << "\n\n";
390-
#endif
391359
}
392360

393361
void LayoutPropagation::setEncoding(ValueRange values, LayoutInfo &info,
@@ -1001,28 +969,8 @@ Operation *LayoutPropagation::rewriteOp(Operation *op) {
1001969
}
1002970

1003971
bool canBeRemat(Operation *op) {
1004-
if (isa<LoadOp>(op)) {
1005-
#ifdef HACK
1006-
// Note: currently block ptr loads are always considered not expensive and
1007-
// therefore rematerializable.
1008-
Value base = op->getOperand(0);
1009-
auto parentLoop = op->getParentOfType<scf::ForOp>();
1010-
bool isInLoop = parentLoop != nullptr;
1011-
bool isTensorPtrLoad = mlir::triton::isTensorPointerType(base.getType());
1012-
1013-
if (!isTensorPtrLoad)
1014-
return !ttgi::isExpensiveLoadOrStore(op);
1015-
1016-
// HACK: consider block ptr loads expensive if they are in a loop.
1017-
return !isInLoop;
1018-
#else
1019-
return !ttgi::isExpensiveLoadOrStore(op);
1020-
#endif
1021-
}
1022-
1023-
if (isa<StoreOp>(op))
972+
if (isa<LoadOp, StoreOp>(op))
1024973
return !ttgi::isExpensiveLoadOrStore(op);
1025-
1026974
if (isa<AtomicRMWOp, AtomicCASOp, DotOp>(op))
1027975
return false;
1028976
if (isa<scf::WhileOp, scf::ConditionOp>(op))

third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -86,34 +86,22 @@ bool isExpensiveLoadOrStore(Operation *op) {
8686
"Expecting Triton LoadOp or StoreOp");
8787
Value base = op->getOperand(0);
8888

89-
// Case 1: A size 1 tensor is not expensive since all threads will load the
90-
// same
89+
// A size 1 tensor is not expensive since all threads will load the same
90+
// value.
9191
if (isSingleValue(base))
9292
return false;
9393

94-
// Case 2: Tensor of pointers has more threads than elements
95-
// we can presume a high hit-rate that makes it cheap to load
96-
97-
// IDEA: Block pointers loads are expensive if:
98-
// - they cannot be lowered to 2D block reads (they feed a dot operation)
99-
// - temporarily we can look at the "triton_intel_gpu.block_io" attribute,
100-
// if it has it it can be lowered to 2D block reads
101-
//
102-
//
103-
104-
#define NEW 1
105-
#ifdef NEW
94+
// Loads that use a block pointer are expensive if they cannot be lowered to
95+
// 2D block read operations. Temporarily leverage the
96+
// "triton_intel_gpu.block_io" attribute to filter out inexpensive loads.
10697
Attribute blockIOAttr =
10798
op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName());
108-
if (blockIOAttr) {
109-
llvm::errs() << "load op: " << *op << " is not expensive\n";
99+
if (blockIOAttr)
110100
return false;
111-
}
112101

102+
// Loads that use more threads than elements can be presumed to have a high
103+
// hit-rate that makes them cheap to load.
113104
if (auto ptrType = getRankedTensorType(base.getType())) {
114-
#else
115-
if (auto ptrType = dyn_cast<RankedTensorType>(base.getType())) {
116-
#endif
117105
auto mod = op->getParentOfType<ModuleOp>();
118106
int numWarps = ttg::TritonGPUDialect::getNumWarps(mod);
119107
int threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(mod);

0 commit comments

Comments (0)