Skip to content

Commit b3fc429

Browse files
Jokeren authored and meta-codesync[bot] committed
[Cherry-pick] [PROTON] Filter out all intrinsics when counting the number of triton functions (#8021) (#556)
Summary: Cherry-picked from upstream OAI repository. Original Commit: 23a4421 Original Author: Keren Zhou Original Date: 2025-09-01 18:27:52 -0400 Original commit message: ``` [PROTON] Filter out all intrinsics when counting the number of triton functions (#8021) ``` This PR was automatically cherry-picked from the upstream triton-lang/triton repository. Pull Request resolved: #556 Reviewed By: agron911 Differential Revision: D85909904 Pulled By: dshi7 fbshipit-source-id: 60f11154a4ffb226fafd20b347192dda2ac32a3c
1 parent 2e51006 commit b3fc429

File tree

5 files changed

+37
-32
lines changed

5 files changed

+37
-32
lines changed

test/Proton/amd/add_sched_barriers.mlir

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
// RUN: triton-opt %s -split-input-file -add-sched-barriers --verify-diagnostics | FileCheck --check-prefix=CHECK %s
1+
// RUN: triton-opt %s -split-input-file -add-sched-barriers --verify-diagnostics | FileCheck --check-prefix=CHECK %s
22

33
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
44
#smem = #ttg.shared_memory
@@ -75,3 +75,14 @@ module attributes {"ttg.num-warps" = 8 : i32, ttg.profile_scratch_memory_alignme
7575
llvm.return
7676
}
7777
}
78+
79+
// -----
80+
81+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 3072 : i32, ttg.target = "hip:gfx90a", "ttg.threads-per-warp" = 64 : i32} {
82+
llvm.func @llvm.exp2.f32(f32) -> f32 attributes {libname = "", libpath = ""}
83+
// CHECK-LABEL: two_functions
84+
llvm.func @two_functions(%arg: f32) -> f32 {
85+
%1 = llvm.call @llvm.exp2.f32(%arg) : (f32) -> f32
86+
llvm.return %1 : f32
87+
}
88+
}

third_party/proton/Dialect/include/Conversion/ProtonGPUToLLVM/Utility.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,8 @@ CircularStoreDataPack
4545
lowerCircularStoreOpHelper(CircularStoreOp op, Value segmentStruct,
4646
ConversionPatternRewriter &rewriter);
4747

48+
SmallVector<FunctionOpInterface> getTritonFunctions(ModuleOp mod);
49+
4850
} // namespace proton::gpu
4951
} // namespace triton
5052

third_party/proton/Dialect/lib/ProtonGPUToLLVM/AllocateProtonGlobalScratchBuffer.cpp

Lines changed: 4 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
#include "Conversion/ProtonGPUToLLVM/Passes.h"
2+
#include "Conversion/ProtonGPUToLLVM/Utility.h"
23
#include "Dialect/ProtonGPU/IR/Dialect.h"
34
#include "mlir/Pass/Pass.h"
45
#include "triton/Dialect/Triton/IR/Dialect.h"
@@ -17,25 +18,13 @@ struct AllocateProtonGlobalScratchBufferPass
1718
MLIRContext *ctx = &getContext();
1819
OpBuilder builder(ctx);
1920

20-
int numFuncOps = 0;
21-
FunctionOpInterface func;
22-
mod.walk([&](FunctionOpInterface op) {
23-
// Ignore any intrinsic functions. On AMD the predicate load/store ops
24-
// are currently pseduo instrunctions at this point and will get picked up
25-
// here and trigger the FunctionOpInterface range based assert below
26-
StringRef funcName(op.getNameAttr());
27-
if (!funcName.contains("__")) {
28-
numFuncOps += 1;
29-
func = op;
30-
}
31-
});
32-
33-
assert(numFuncOps == 1);
21+
auto funcOps = triton::proton::gpu::getTritonFunctions(mod);
22+
assert(funcOps.size() == 1 && "Expected exactly one funcOp");
3423

3524
int32_t cumulativeMemorySize = 0; // bytes
3625
std::vector<uint32_t> alignments;
3726

38-
func.walk([&](proton::gpu::GlobalScratchAllocOp op) {
27+
funcOps[0].walk([&](proton::gpu::GlobalScratchAllocOp op) {
3928
int offset = llvm::alignTo(cumulativeMemorySize,
4029
proton::gpu::getBytesPerClockEntry());
4130
op->setAttr("offset",

third_party/proton/Dialect/lib/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/AddSchedBarriers.cpp

Lines changed: 5 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
#include "Conversion/ProtonGPUToLLVM/Passes.h"
2+
#include "Conversion/ProtonGPUToLLVM/Utility.h"
23
#include "Dialect/ProtonGPU/IR/Dialect.h"
34
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
45
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -28,33 +29,21 @@ struct AddSchedBarriers
2829
MLIRContext *ctx = &getContext();
2930
OpBuilder builder(ctx);
3031

31-
int numFuncOps = 0;
32-
FunctionOpInterface func;
33-
mod.walk([&](FunctionOpInterface op) {
34-
// Ignore any intrinsic functions. On AMD the predicate load/store ops
35-
// are currently pseduo instrunctions at this point and may get picked up
36-
// here and trigger the FunctionOpInterface range based assert below
37-
StringRef funcName(op.getNameAttr());
38-
if (!funcName.contains("__")) {
39-
numFuncOps += 1;
40-
func = op;
41-
}
42-
});
43-
44-
assert(numFuncOps == 1);
32+
auto funcOps = triton::proton::gpu::getTritonFunctions(mod);
33+
assert(funcOps.size() == 1 && "Expected exactly one funcOp");
4534

4635
IntegerAttr zeroAttrValue =
4736
builder.getI32IntegerAttr(static_cast<int32_t>(0));
4837

49-
func.walk([&](mlir::triton::proton::gpu::ReadCounterOp op) {
38+
funcOps[0].walk([&](mlir::triton::proton::gpu::ReadCounterOp op) {
5039
auto loc = op.getLoc();
5140
if (!isa_and_nonnull<ROCDL::SchedBarrier>(op->getPrevNode())) {
5241
builder.setInsertionPoint(op);
5342
builder.create<ROCDL::SchedBarrier>(loc, zeroAttrValue);
5443
}
5544
});
5645

57-
func.walk([&](mlir::triton::proton::gpu::CircularStoreOp op) {
46+
funcOps[0].walk([&](mlir::triton::proton::gpu::CircularStoreOp op) {
5847
auto loc = op.getLoc();
5948
if (!isa_and_nonnull<ROCDL::SchedBarrier>(op->getNextNode())) {
6049
builder.setInsertionPointAfter(op);

third_party/proton/Dialect/lib/ProtonGPUToLLVM/Utility.cpp

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -156,6 +156,20 @@ lowerCircularStoreOpHelper(CircularStoreOp op, Value segmentStruct,
156156
return {isWriter, valsVec, vecPtr, addrSpace};
157157
}
158158

159+
SmallVector<FunctionOpInterface> getTritonFunctions(ModuleOp mod) {
160+
SmallVector<FunctionOpInterface> funcOps;
161+
mod.walk([&](FunctionOpInterface funcOp) {
162+
// Ignore any intrinsic functions which have an empty body.
163+
// For example, on AMD the predicate load/store ops are currently pseudo
164+
// instructions at this point and may get picked up here and trigger the
165+
// FunctionOpInterface range based assert below.
166+
if (funcOp.empty())
167+
return;
168+
funcOps.push_back(funcOp);
169+
});
170+
return funcOps;
171+
}
172+
159173
} // namespace proton::gpu
160174
} // namespace triton
161175

0 commit comments

Comments (0)