triton-lang
diff --git a/include/triton/Dialect/TritonInstrument/IR/Utility.h b/include/triton/Dialect/TritonInstrument/IR/Utility.h
Lines changed: 7 additions & 0 deletions
diff --git a/lib/Dialect/TritonInstrument/IR/Utility.cpp b/lib/Dialect/TritonInstrument/IR/Utility.cpp
Lines changed: 11 additions & 4 deletions
diff --git a/lib/Dialect/TritonInstrument/Transforms/FpSanitizer.cpp b/lib/Dialect/TritonInstrument/Transforms/FpSanitizer.cpp
Lines changed: 6 additions & 6 deletions
@@ -8,6 +8,10 @@
 
 #include <array>
 
+namespace mlir::triton::gpu {
+class GlobalScratchAllocOp; // Forward declaration only; this header names it as the return type of createThirdPartyScratchAlloc.
+}
+
 namespace mlir::triton::instrument {
 class FunctionBuilder;
 
@@ -29,6 +33,9 @@ Operation *createStoreScratchMemory(OpBuilder &b, Location loc, Value alloc,
                                     Value tensor, RankedTensorType tensorType);
 Value createLoadScratchMemory(OpBuilder &b, Location loc, Value alloc,
                               RankedTensorType tensorType);
+gpu::GlobalScratchAllocOp
+createThirdPartyScratchAlloc(OpBuilder &b, Location loc, Type ptrType,
+                             int64_t sizeInBytes, int64_t alignment); // Creates a gpu::GlobalScratchAllocOp through `b` at `loc`; the definition always passes b.getUnitAttr() as the final create() argument.
 Value expandOuterSlicedDim(OpBuilder &b, Location loc, Value tensor);
 RankedTensorType getIntTensorType(Region *region, ArrayRef<int64_t> shape,
                                   unsigned bitWidth);
 
@@ -163,7 +163,7 @@ Value createInitializedScratchMemory(ImplicitLocOpBuilder &b,
   int64_t sizeInBytes = numEls * elSize;
   Type ptrType = triton::getPointerType(elType);
   auto alloc =
-      GlobalScratchAllocOp::create(b, ptrType, sizeInBytes, elSize, UnitAttr());
+      createThirdPartyScratchAlloc(b, b.getLoc(), ptrType, sizeInBytes, elSize);
   createStoreScratchMemory(b, b.getLoc(), alloc, tensor, tensor.getType());
   return alloc;
 }
@@ -182,8 +182,8 @@ Value createZeroInitStateTensor(ImplicitLocOpBuilder &b, int m, int n,
   Type ptrType = triton::getPointerType(elType);
   // Allocate scratch buffers with 16-byte alignment so global loads and stores
   // can be vectorized if possible.
-  auto alloc = GlobalScratchAllocOp::create(b, ptrType, sizeInBytes,
-                                            /*alignment=*/16, UnitAttr());
+  auto alloc = createThirdPartyScratchAlloc(b, b.getLoc(), ptrType, sizeInBytes,
+                                            /*alignment=*/16);
   Value cstZero = arith::ConstantIntOp::create(b, 0, bitWidth);
   funcBuilder.createFillGlobalTensorCall(b, alloc, type, cstZero);
   return alloc;
@@ -245,7 +245,7 @@ bool hasTMAStore(ModuleOp module) {
 
 Value createLockVariable(ImplicitLocOpBuilder &b) {
   Type ptrType = triton::getPointerType(b.getI32Type());
-  auto alloc = GlobalScratchAllocOp::create(b, ptrType, 4, 4, UnitAttr());
+  auto alloc = createThirdPartyScratchAlloc(b, b.getLoc(), ptrType, 4, 4);
   Value zero = arith::ConstantOp::create(b, b.getLoc(), b.getI32Type(),
                                          b.getI32IntegerAttr(0));
   triton::AtomicRMWOp::create(b, b.getI32Type(), RMWOp::XCHG, alloc, zero,
@@ -258,6 +258,13 @@ Value createLockVariable(ImplicitLocOpBuilder &b) {
 
 namespace mlir::triton::instrument {
 
+gpu::GlobalScratchAllocOp // Shared helper: the call sites touched by this patch use it instead of calling GlobalScratchAllocOp::create directly.
+createThirdPartyScratchAlloc(OpBuilder &b, Location loc, Type ptrType,
+                             int64_t sizeInBytes, int64_t alignment) {
+  return gpu::GlobalScratchAllocOp::create(b, loc, ptrType, sizeInBytes, // Arguments are forwarded unchanged.
+                                           alignment, b.getUnitAttr()); // NOTE(review): the replaced call sites passed a default-constructed UnitAttr(); presumably this real unit attr is what marks the allocation as third-party -- confirm against the op definition.
+}
+
 void createAssertInThread(ImplicitLocOpBuilder &b, Value condition,
                           StringRef message) {
   if (auto tensorTy = dyn_cast<RankedTensorType>(condition.getType())) {
 
@@ -164,8 +164,8 @@ class TmemScratchManager {
       int64_t alignment = std::max<int64_t>(elSize, 16);
       int64_t sizeInBytes = product(memTy.getShape()) * elSize;
       auto ptrTy = triton::getPointerType(memTy.getElementType());
-      auto allocOp = ttg::GlobalScratchAllocOp::create(
-          rewriter, loc, ptrTy, sizeInBytes, alignment, UnitAttr());
+      auto allocOp = createThirdPartyScratchAlloc(rewriter, loc, ptrTy,
+                                                  sizeInBytes, alignment);
       allocOp->setDiscardableAttr("tt.divisibility",
                                   rewriter.getI64IntegerAttr(alignment));
       Value ptr = allocOp.getResult();
@@ -312,8 +312,8 @@ Value createScratchAndStore(PatternRewriter &rewriter, Location loc, Value val,
   int64_t alignment = std::max<int64_t>(elSize, 16);
   int64_t sizeInBytes = product(tensorTy.getShape()) * elSize;
   auto ptrTy = triton::getPointerType(tensorTy.getElementType());
-  auto allocOp = ttg::GlobalScratchAllocOp::create(
-      rewriter, loc, ptrTy, sizeInBytes, alignment, UnitAttr());
+  auto allocOp = createThirdPartyScratchAlloc(rewriter, loc, ptrTy, sizeInBytes,
+                                              alignment);
   allocOp->setDiscardableAttr("tt.divisibility",
                               rewriter.getI64IntegerAttr(alignment));
   createStoreScratchMemory(rewriter, loc, allocOp.getResult(), val, tensorTy);
@@ -482,8 +482,8 @@ createOperandScratch(PatternRewriter &rewriter, Location loc,
   int64_t alignment = std::max<int64_t>(elSize, 16);
   int64_t sizeInBytes = product(memTy.getShape()) * elSize;
   auto ptrTy = triton::getPointerType(memTy.getElementType());
-  auto allocOp = ttg::GlobalScratchAllocOp::create(
-      rewriter, loc, ptrTy, sizeInBytes, alignment, UnitAttr());
+  auto allocOp = createThirdPartyScratchAlloc(rewriter, loc, ptrTy, sizeInBytes,
+                                              alignment);
   allocOp->setDiscardableAttr("tt.divisibility",
                               rewriter.getI64IntegerAttr(alignment));
   Value ptr = allocOp.getResult();