@@ -118,70 +118,13 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
118118 return scratchConfig;
119119}
120120
121- unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
122-   if (auto reduceOp = dyn_cast<ReduceOp>(op)) {
123-     ReduceOpHelper helper(reduceOp);
124-     return helper.getScratchSizeInBytes();
125-   }
126-   if (auto scanOp = dyn_cast<ScanOp>(op)) {
127-     ScanLoweringHelper helper(scanOp);
128-     return helper.getScratchSizeInBytes();
129-   }
130-   if (auto histogram = dyn_cast<HistogramOp>(op)) {
131-     auto dstTy = histogram.getType();
132-     int threadsPerWarp = gpu::TritonGPUDialect::getThreadsPerWarp(
133-         op->getParentOfType<ModuleOp>());
134-     return std::max<int>(dstTy.getNumElements(), threadsPerWarp) *
135-            std::max<int>(8, dstTy.getElementTypeBitWidth()) / 8;
136-   }
137-   if (auto cvtLayout = dyn_cast<gpu::ConvertLayoutOp>(op)) {
138-     auto srcTy = cvtLayout.getSrc().getType();
139-     auto dstTy = cvtLayout.getType();
140-     auto srcEncoding = srcTy.getEncoding();
141-     auto dstEncoding = dstTy.getEncoding();
142-     if (mlir::isa<gpu::SharedEncodingAttr>(srcEncoding) ||
143-         mlir::isa<gpu::SharedEncodingAttr>(dstEncoding)) {
144-       // Conversions from/to shared memory do not need scratch memory.
145-       return 0;
146-     }
147-     // ConvertLayoutOp with both input/output non-shared_layout
148-     // TODO: Besides of implementing ConvertLayoutOp via shared memory, it's
149-     // also possible to realize it with other approaches in restricted
150-     // conditions, such as warp-shuffle
151-     auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
152-     auto elems = getNumScratchElements(scratchConfig.paddedRepShape);
153-     return isa<PointerType>(srcTy.getElementType())
154-                ? elems * kPtrBitWidth / 8
155-                : elems * std::max<int>(8, srcTy.getElementTypeBitWidth()) / 8;
156-   }
157-   if (isa<AtomicRMWOp, AtomicCASOp>(op)) {
158-     auto value = op->getOperand(0);
159-     // only scalar requires scratch memory
160-     // make it explicit for readability
161-     if (dyn_cast<RankedTensorType>(value.getType())) {
162-       return 0;
163-     }
164-     auto smemShape = getRepShapeForAtomic(op->getResult(0));
165-     auto elems = getNumScratchElements(smemShape);
166-     auto elemTy = cast<PointerType>(value.getType()).getPointeeType();
167-     assert(!isa<PointerType>(elemTy) && "unexpected pointer type");
168-     return elems * std::max<int>(8, elemTy.getIntOrFloatBitWidth()) / 8;
169-   }
170-   if (auto createTensormap = dyn_cast<ExperimentalTensormapCreateOp>(op)) {
171-     constexpr int32_t kTMASize = 128;
172-     return kTMASize;
173-   }
174-   return 0;
175- }
176-
177121class AllocationAnalysis {
178122public:
179123   AllocationAnalysis(Operation *operation,
180124                      Allocation::FuncAllocMapT *funcAllocMap,
181-                        Allocation *allocation,
182-                        AllocationAnalysisScratchSizeFn scratchSizeGetter)
125+                        Allocation *allocation)
183126       : operation(operation), funcAllocMap(funcAllocMap),
184-         allocation(allocation), scratchSizeGetter(scratchSizeGetter) {
127+         allocation(allocation) {
185128     run();
186129 }
187130
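
For context: the hunk above removes the pluggable `AllocationAnalysisScratchSizeFn` hook, so `AllocationAnalysis` now computes scratch sizes itself (see the next hunk) and its constructor drops the callback argument. A minimal before/after sketch of a call site; `op`, `funcAllocMap`, and `allocation` are illustrative locals, not lines taken from this diff:

```cpp
// Before: the caller had to thread a scratch-size callback through
// (defaultAllocationAnalysisScratchSizeFn or a backend-specific override).
// triton::AllocationAnalysis(op, &funcAllocMap, &allocation,
//                            defaultAllocationAnalysisScratchSizeFn);

// After: the analysis owns the scratch-size computation internally.
triton::AllocationAnalysis(op, &funcAllocMap, &allocation);
```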
@@ -234,19 +177,77 @@ class AllocationAnalysis {
234177
235178   /// Initializes temporary shared memory for a given operation.
236179   void getScratchValueSize(Operation *op) {
237-     constexpr size_t scratchAlignment = 128;
238-     if (auto callOp = dyn_cast<CallOpInterface>(op)) {
180+     const size_t scratchAlignment = 128;
181+     if (auto reduceOp = dyn_cast<ReduceOp>(op)) {
182+       ReduceOpHelper helper(reduceOp);
183+       unsigned bytes = helper.getScratchSizeInBytes();
184+       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
185+                                                            scratchAlignment);
186+     } else if (auto scanOp = dyn_cast<ScanOp>(op)) {
187+       ScanLoweringHelper helper(scanOp);
188+       unsigned bytes = helper.getScratchSizeInBytes();
189+       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
190+                                                            scratchAlignment);
191+     } else if (auto histogram = dyn_cast<HistogramOp>(op)) {
192+       auto dstTy = histogram.getType();
193+       int threadsPerWarp = gpu::TritonGPUDialect::getThreadsPerWarp(
194+           op->getParentOfType<ModuleOp>());
195+       auto bytes = std::max<int>(dstTy.getNumElements(), threadsPerWarp) *
196+                    std::max<int>(8, dstTy.getElementTypeBitWidth()) / 8;
197+       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
198+                                                            scratchAlignment);
199+     } else if (auto cvtLayout = dyn_cast<gpu::ConvertLayoutOp>(op)) {
200+       auto srcTy = cvtLayout.getSrc().getType();
201+       auto dstTy = cvtLayout.getType();
202+       auto srcEncoding = srcTy.getEncoding();
203+       auto dstEncoding = dstTy.getEncoding();
204+       if (mlir::isa<gpu::SharedEncodingAttr>(srcEncoding) ||
205+           mlir::isa<gpu::SharedEncodingAttr>(dstEncoding)) {
206+         // Conversions from/to shared memory do not need scratch memory.
207+         return;
208+       }
209+       // ConvertLayoutOp with both input/output non-shared_layout
210+       // TODO: Besides of implementing ConvertLayoutOp via shared memory, it's
211+       // also possible to realize it with other approaches in restricted
212+       // conditions, such as warp-shuffle
213+       auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
214+       auto elems = getNumScratchElements(scratchConfig.paddedRepShape);
215+       auto bytes =
216+           isa<PointerType>(srcTy.getElementType())
217+               ? elems * kPtrBitWidth / 8
218+               : elems * std::max<int>(8, srcTy.getElementTypeBitWidth()) / 8;
219+       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
220+                                                            scratchAlignment);
221+     } else if (isa<AtomicRMWOp, AtomicCASOp>(op)) {
222+       auto value = op->getOperand(0);
223+       // only scalar requires scratch memory
224+       // make it explicit for readability
225+       if (dyn_cast<RankedTensorType>(value.getType())) {
226+         // nothing to do
227+       } else {
228+         auto smemShape = getRepShapeForAtomic(op->getResult(0));
229+         auto elems = getNumScratchElements(smemShape);
230+         auto elemTy = cast<PointerType>(value.getType()).getPointeeType();
231+         assert(!isa<PointerType>(elemTy) && "unexpected pointer type");
232+         auto bytes =
233+             elems * std::max<int>(8, elemTy.getIntOrFloatBitWidth()) / 8;
234+         maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
235+                                                              scratchAlignment);
236+       }
237+     } else if (auto callOp = dyn_cast<CallOpInterface>(op)) {
239238       auto callable = callOp.resolveCallable();
240239       auto funcOp = dyn_cast<FunctionOpInterface>(callable);
241240       auto *funcAlloc = &(*funcAllocMap)[funcOp];
242241       auto bytes = funcAlloc->getSharedMemorySize();
243242       maybeAddScratchBuffer<BufferT::BufferKind::Virtual>(op, bytes,
244243                                                            scratchAlignment);
245-       return;
244+     } else if (auto createTensormap =
245+                    dyn_cast<ExperimentalTensormapCreateOp>(op)) {
246+       constexpr int32_t kTMASize = 128;
247+       constexpr int32_t kTMAAlign = 128;
248+       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, kTMASize,
249+                                                            kTMAAlign);
246250     }
247-     unsigned bytes = scratchSizeGetter(op);
248-     maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
249-                                                          scratchAlignment);
250251   }
251252
252253   void getValueAlias(Value value, SharedMemoryAliasAnalysis &analysis) {
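
Every non-trivial branch in the hunk above converts an element count to bytes the same way: `elems * std::max(8, bitWidth) / 8`, so a sub-byte element type (e.g. i1) is still charged one full byte of scratch per element. A self-contained sketch of that rounding rule; `scratchBytes` is an illustrative helper, not a function from this file:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// Illustrative helper mirroring the rounding used in the branches above:
// each element is charged at least one full byte of scratch memory.
static int64_t scratchBytes(int64_t numElements, int64_t elemBitWidth) {
  return numElements * std::max<int64_t>(8, elemBitWidth) / 8;
}

int main() {
  assert(scratchBytes(1024, 16) == 2048); // fp16: two bytes per element
  assert(scratchBytes(1024, 1) == 1024);  // i1: still one byte per element
  assert(scratchBytes(256, 64) == 2048);  // 64-bit elements: eight bytes each
  return 0;
}
```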
@@ -546,16 +547,13 @@ class AllocationAnalysis {
546547 Allocation::FuncAllocMapT *funcAllocMap;
547548 Allocation *allocation;
548549 BufferRangeMapT bufferRange;
549- AllocationAnalysisScratchSizeFn scratchSizeGetter;
550550};
551551
552552} // namespace triton
553553
554- void Allocation::run(
555-     FuncAllocMapT &funcAllocMap,
556-     triton::AllocationAnalysisScratchSizeFn scratchSizeGetter) {
557-   triton::AllocationAnalysis(getOperation(), &funcAllocMap, this,
558-                              scratchSizeGetter);
554+ template <>
555+ void Allocation::run<triton::AllocationAnalysis>(FuncAllocMapT &funcAllocMap) {
556+   triton::AllocationAnalysis(getOperation(), &funcAllocMap, this);
559557}
560558
561559std::map<Operation *, SmallVector<Allocation::BufferId>>
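
With `Allocation::run` now specialized on the analysis type, a caller selects the analysis through the template argument instead of passing a callback. A rough sketch of a call site, assuming `run` is declared as a member template in the header and that `funcOp` and `funcAllocMap` are set up by a surrounding module-level driver (both names are assumptions, not lines from this diff):

```cpp
// Sketch only: analyze one function and record its buffers.
Allocation allocation(funcOp); // funcOp: assumed to be the function being analyzed
allocation.run<triton::AllocationAnalysis>(funcAllocMap);
// A backend that needs different scratch-size rules would now supply its own
// analysis class (and run<> specialization) rather than a scratch-size callback.
```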