Commit 762a7d1
[AMD][CanonicalizePointers] Propagate the attributes during the rewrites (#4815)
This fixes an issue where the IR was not properly vectorized in Triton (we were relying on a backend pass that was not always able to do the right thing). The general problem was that we were not propagating the attributes of the operation being rewritten. The specific problem was that block argument attributes take the form `tt.property_argi` for the i-th block argument, so some extra work was needed to propagate those correctly. This PR addresses the problem by adding a vector of attributes to the `FatPtr` structure. We do not propagate attributes unconditionally, but only set them where the original IR had them set.
1 parent 6c3e3ae commit 762a7d1
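
To make the `tt.property_argi` convention concrete: an attribute named `tt.divisibility_arg1` on an operation attaches the `tt.divisibility` hint to that operation's block argument number 1. Below is a minimal, hypothetical C++ sketch of the suffix matching (the helper `stripArgSuffix` is illustrative only; the real logic lives in `collectFatPointerAttributes` in the diff below):

// Hypothetical sketch of the `tt.property_argi` naming convention: an
// "_arg<i>" suffix names the block argument an attribute applies to, and
// stripping it recovers the base attribute name.
#include <optional>
#include <string>

#include "llvm/ADT/StringRef.h"

// stripArgSuffix("tt.divisibility_arg1", 1) == "tt.divisibility"
// stripArgSuffix("tt.divisibility_arg1", 2) == std::nullopt
std::optional<std::string> stripArgSuffix(llvm::StringRef attrName,
                                          unsigned argNumber) {
  std::string suffix = "_arg" + std::to_string(argNumber);
  if (!attrName.ends_with(suffix))
    return std::nullopt;
  return attrName.drop_back(suffix.size()).str();
}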

File tree: 3 files changed, +116 −11 lines changed

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipelineExpander.cpp

Lines changed: 1 addition & 0 deletions
@@ -461,6 +461,7 @@ scf::ForOp LoopPipelinerInternal::createKernelLoop(
   auto newForOp =
       rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb,
                                   forOp.getStep(), newLoopArg);
+  newForOp->setAttrs(forOp->getAttrs());
   // When there are no iter args, the loop body terminator will be created.
   // Since we always create it below, remove the terminator if it was created.
   if (!newForOp.getBody()->empty())
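
This one-line change is load-bearing: `rewriter.create` builds the new loop with an empty attribute dictionary, so any discardable attributes on the original `scf.for` (including the `tt.divisibility_argi` hints this PR starts consuming) would silently disappear when the pipeliner rebuilds the kernel loop. A minimal sketch of the pattern, assuming a hypothetical helper `recreateLoop` in place of the pipeliner's actual loop construction:

#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/PatternMatch.h"

// Hypothetical helper: when an op is recreated rather than updated in
// place, its discardable attributes must be copied over explicitly.
static mlir::scf::ForOp recreateLoop(mlir::RewriterBase &rewriter,
                                     mlir::scf::ForOp forOp,
                                     mlir::Value newUpperBound,
                                     mlir::ValueRange newIterArgs) {
  auto newForOp = rewriter.create<mlir::scf::ForOp>(
      forOp.getLoc(), forOp.getLowerBound(), newUpperBound, forOp.getStep(),
      newIterArgs);
  // The new op starts with no attributes; copy the originals (e.g. the
  // tt.divisibility_arg<i> hints) so later passes can still see them.
  newForOp->setAttrs(forOp->getAttrs());
  return newForOp;
}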

test/TritonGPU/amd/amd-canonicalize-pointers.mlir

Lines changed: 31 additions & 0 deletions
@@ -546,3 +546,34 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     tt.return
   }
 }
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: tt.func @forOpWithHints
+  tt.func @forOpWithHints(%arg0: !tt.ptr<f32>, %init : tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked> {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c128 = arith.constant 128 : index
+    %0 = tt.get_program_id x : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %3 = tt.splat %0 : i32 -> tensor<1024xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
+    %5 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
+    %52:2 = scf.for %arg9 = %c0 to %c128 step %c1 iter_args(%arg1 = %6, %arg2 = %init) -> (tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xf32, #blocked>) {
+      %9 = tt.load %arg1 : tensor<1024x!tt.ptr<f32>, #blocked>
+      // CHECK: tt.addptr {{.*}}, {{.*}} {tt.divisibility = dense<16> : tensor<1xi32>}
+      %11 = tt.addptr %arg1, %4 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
+      %12 = tt.addptr %11, %3 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
+      %10 = arith.addf %9, %arg2 : tensor<1024xf32, #blocked>
+      scf.yield %12, %10 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xf32, #blocked>
+    } {"tt.divisibility_arg1" = dense<[16]> : tensor<1xi32>}
+    // CHECK: tt.divisibility_arg1
+    // CHECK-SAME: tt.divisibility_arg4
+    %8 = tt.addptr %52#0, %4 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
+    %11 = tt.load %8 : tensor<1024x!tt.ptr<f32>, #blocked>
+    tt.return %11 : tensor<1024xf32, #blocked>
+  }
+}

third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp

Lines changed: 84 additions & 11 deletions
@@ -1,11 +1,14 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Block.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/Matchers.h"
+#include "mlir/IR/OperationSupport.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Value.h"
@@ -15,6 +18,8 @@
 #include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
@@ -85,6 +90,8 @@ class PointerCanonicalizer {
   Value offset;
   // Flag to express if we can narrow the uses of the offset down to 32 bits
   bool canNarrow = false;
+  // Collection of attributes that need to be applied to the pointer
+  SmallVector<NamedAttribute> attributes;
 
   // Utility copy functions
   FatPtr copy(Value newBasePtr, Value newOffset) {
@@ -96,6 +103,11 @@ class PointerCanonicalizer {
   FatPtr copyWithOffset(Value newBase) {
     return FatPtr{newBase, offset, canNarrow};
   }
+  // Attribute functions
+  void setAttr(NamedAttribute attr) { attributes.push_back(attr); }
+  void setAttrs(ArrayRef<NamedAttribute> attrs) {
+    llvm::append_range(attributes, attrs);
+  }
 };
 
 // Rewrite any operation that needs a pointer
@@ -104,8 +116,15 @@ class PointerCanonicalizer {
   // Start from an argument of a function and propagate its fat pointers
   LogicalResult rewritePointer(Value argPtr);
 
+  // Create a tensor pointer from a fat pointer `fatPtr`. The tensor pointer is
+  // obtained by splatting the `fatPtr.basePtr` using the `fatPtr.offset` shape
+  // and adding the offset to it.
   Value createTensorPointer(FatPtr fatPtr, Location loc);
 
+  // Push the attributes of the given operation `op` to the fat pointer
+  // corresponding to `val`
+  void collectFatPointerAttributes(Operation *op, Value val);
+
   // Rewrite a given function, canonicalizing the different pointer arguments of
   // the region
   LogicalResult rewriteFunction(triton::FuncOp funcOp);
@@ -269,6 +288,46 @@ Value createTensorZero(IRRewriter &rw, Location loc, RankedTensorType type) {
 
 } // namespace
 
+void PointerCanonicalizer::collectFatPointerAttributes(Operation *op,
+                                                       Value val) {
+  auto addBlockArgumentAttr = [&](BlockArgument arg) {
+    // If the value is a block parameter, the operation can specify
+    // an attribute for the given parameter by using `tt.property_argi`
+    // where `argi` refers to the arg number of the given parameter.
+    // So we need to iterate through the property, find the right one
+    // and push the property onto the pointers attributes.
+    llvm::SmallString<8> scratchStr;
+    for (NamedAttribute namedAttr : op->getAttrs()) {
+      scratchStr.clear();
+      llvm::raw_svector_ostream sstream(scratchStr);
+      sstream << "_arg" << arg.getArgNumber();
+      StringRef attrName = namedAttr.getName().getValue();
+      if (attrName.ends_with(scratchStr)) {
+        StringRef newAttrName = attrName.drop_back(scratchStr.size());
+        namedAttr.setName(rewriter.getStringAttr(newAttrName));
+        pointers[val].setAttr(namedAttr);
+        // Propagate the argument to the offset if it is also a block argument
+        if (auto offsetArg = dyn_cast<BlockArgument>(pointers[val].offset)) {
+          scratchStr.clear();
+          sstream << newAttrName << "_arg" << offsetArg.getArgNumber();
+          op->setAttr(scratchStr, namedAttr.getValue());
+        }
+      }
+    }
+  };
+
+  // If it is the i-th block argument, then look if the operation defined some
+  // _argi attribute and add it to the fat pointer attributes
+  if (auto arg = dyn_cast<BlockArgument>(val)) {
+    addBlockArgumentAttr(arg);
+    return;
+  }
+
+  // Otherwise add the attributes of the operation to the fat pointer
+  for (NamedAttribute attr : op->getAttrs())
+    pointers[val].setAttr(attr);
+}
+
 // Offset extraction logic for an addition op:
 // decompose(A+B) = {U(A)+U(B), NU(A)+NU(B)}
 std::pair<Value, Value>
@@ -372,9 +431,6 @@ PointerCanonicalizer::decomposeOffsetFromExpr(Location loc, Value expr,
   return offsets;
 }
 
-// Create a tensor pointer from a fat pointer `fatPtr`. The tensor pointer is
-// obtained by splatting the `fatPtr.basePtr` using the `fatPtr.offset` shape
-// and adding the offset to it.
 Value PointerCanonicalizer::createTensorPointer(FatPtr fatPtr, Location loc) {
   Value basePtr = fatPtr.basePtr;
   Value offset = fatPtr.offset;
@@ -390,9 +446,12 @@ Value PointerCanonicalizer::createTensorPointer(FatPtr fatPtr, Location loc) {
   Value tensorPtr =
       rewriter.create<triton::SplatOp>(loc, tensorPtrType, basePtr);
 
-  tensorPtr =
+  auto addPtrOp =
       rewriter.create<triton::AddPtrOp>(loc, tensorPtrType, tensorPtr, offset);
-  return tensorPtr;
+
+  addPtrOp->setAttrs(fatPtr.attributes);
+
+  return addPtrOp.getResult();
 }
 
 // Rewrite a memory operation
@@ -477,6 +536,9 @@ LogicalResult PointerCanonicalizer::rewriteAddPtrOp(triton::AddPtrOp addPtrOp,
   newPtr = rewriter.create<triton::AddPtrOp>(curLoc, newPtr.getType(), newPtr,
                                              scalarConst);
   pointers[nextPtr] = fatPtr.copyWithOffset(newPtr);
+  // If we are updating the tensor pointer with a uniform value, we can
+  // propagate the attributes of the tensor pointer to the fat pointer.
+  pointers[nextPtr].setAttrs(fatPtr.attributes);
   opToDelete.insert(addPtrOp);
   return success();
 }
@@ -496,6 +558,7 @@ LogicalResult PointerCanonicalizer::rewriteAddPtrOp(triton::AddPtrOp addPtrOp,
   Value fatPtrOffset = fatPtr.offset;
   bool canNarrow = fatPtr.canNarrow;
   Value newOffset = fatPtrOffset;
+  bool propagateAtrs = true;
   if (!isZeroConst(nonUniformOffset)) {
     Type addPtrOffsetType = getElementTypeOrSelf(nonUniformOffset);
     canNarrow = canNarrow && canNarrowOffset(fatPtrOffset, nonUniformOffset);
@@ -507,9 +570,15 @@ LogicalResult PointerCanonicalizer::rewriteAddPtrOp(triton::AddPtrOp addPtrOp,
 
     newOffset =
         rewriter.create<arith::AddIOp>(curLoc, nonUniformOffset, fatPtrOffset);
+    propagateAtrs = false;
   }
   opToDelete.insert(addPtrOp);
   pointers[nextPtr] = FatPtr{newPtr, newOffset, canNarrow};
+
+  // If we are updating the tensor pointer with a uniform value, we can
+  // propagate the attributes of the tensor pointer to the fat pointer.
+  if (propagateAtrs)
+    pointers[nextPtr].setAttrs(fatPtr.attributes);
   return success();
 }
 
@@ -537,9 +606,12 @@ LogicalResult PointerCanonicalizer::rewriteForOp(scf::ForOp forOp,
   // This is making sure we visit the uses within the forOp region
   Value arg = newForOp.getTiedLoopRegionIterArg(forOperand);
   size_t numIterArgs = newForOp.getNumRegionIterArgs();
-  pointers[arg] =
-      FatPtr{newForOp.getRegionIterArg(numIterArgs - 2),
-             newForOp.getRegionIterArg(numIterArgs - 1), fatPtr.canNarrow};
+  pointers[arg] = fatPtr.copy(newForOp.getRegionIterArg(numIterArgs - 2),
+                              newForOp.getRegionIterArg(numIterArgs - 1));
+
+  // Collect attributes before continuing the visit
+  collectFatPointerAttributes(newForOp, arg);
+
   for (OpOperand &use : arg.getUses())
     queue.push_back(&use);
 
@@ -548,7 +620,6 @@
   size_t numResults = newForOp->getNumResults();
   pointers[nextPtr] = fatPtr.copy(newForOp->getResult(numResults - 2),
                                   newForOp.getResult(numResults - 1));
-
  opToDelete.insert(forOp);
  return success();
}
@@ -864,11 +935,13 @@ LogicalResult PointerCanonicalizer::rewritePointer(Value argPtr) {
       res = materializeFatPointer(op, curLoc, curOperand->get());
     });
 
-    // Keep propagating the fat pointer down the IR
-    if (nextPtr)
+    // Collect the attributes and keep propagating the fat pointer down the IR
+    if (nextPtr) {
+      collectFatPointerAttributes(curOp, nextPtr);
       for (OpOperand &use : nextPtr.getUses())
         if (!opToDelete.contains(use.getOwner()))
           queue.push_back(&use);
+    }
   }
   return success();
 }
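
Note the `propagateAtrs` flag above: the fat pointer's attributes are carried over only when the update is uniform or the non-uniform part is a known zero. One plausible reading of this rule is that folding in a genuinely non-uniform offset can invalidate per-pointer hints such as divisibility, so they are dropped in that case. A hypothetical standalone sketch of the rule (names are illustrative, not from the pass):

#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Attributes.h"

// Stand-in for the pass's FatPtr: a pointer plus accumulated hints.
struct FatPtrSketch {
  llvm::SmallVector<mlir::NamedAttribute> attributes;
};

// Mirrors the `propagateAtrs` flag: copy the hints to the rewritten
// pointer only while the offset update is uniform.
void propagateHints(FatPtrSketch &next, const FatPtrSketch &cur,
                    bool offsetIsNonUniform) {
  if (!offsetIsNonUniform)
    next.attributes.append(cur.attributes.begin(), cur.attributes.end());
}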
