Skip to content

Commit dc5b0c6

Browse files
authored
[AMD] Pipeline small tensors w/ registers only on GFX950 (#7171)
Fixes a perf regression on gfx942 but preserves functionality for gfx950 (and above).
1 parent 5389ed7 commit dc5b0c6

File tree

5 files changed

+28
-10
lines changed

5 files changed

+28
-10
lines changed

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ bool isPureUnaryInlineAsm(Operation *op);
208208
int getNVIDIAComputeCapability(Operation *module);
209209

210210
// Read the amd target from the module attributes
211-
StringRef getAMDArch(Operation *module);
211+
std::optional<StringRef> getAMDArch(Operation *module);
212212

213213
std::optional<mlir::triton::gpu::SwizzledSharedEncodingAttr>
214214
getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible);

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,14 +1055,19 @@ int getNVIDIAComputeCapability(Operation *module) {
10551055
return computeCapability;
10561056
}
10571057

1058-
StringRef getAMDArch(Operation *module) {
1058+
std::optional<StringRef> getAMDArch(Operation *module) {
10591059
StringAttr targetAttr =
10601060
module->getAttrOfType<StringAttr>(triton::gpu::AttrTargetName);
1061-
assert(targetAttr && "Expected a target attribute on the module operation");
1061+
if (!targetAttr) {
1062+
LDBG("Expected a target attribute on the module operation");
1063+
return {};
1064+
}
10621065

10631066
StringRef ref = targetAttr.strref();
1064-
assert(ref.starts_with("hip:") &&
1065-
"expected target attribute to be prefixed with \"hip:\"");
1067+
if (!ref.starts_with("hip:")) {
1068+
LDBG("expected target attribute to be prefixed with \"hip:\"");
1069+
return {};
1070+
}
10661071

10671072
return ref.drop_front(4); // drop the "hip:"
10681073
}

test/TritonGPU/loop-pipeline-hip.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
582582
#blocked4 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [32, 2], warpsPerCTA = [8, 1], order = [1, 0]}>
583583
#blocked5 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [8, 1], order = [1, 0]}>
584584
#blocked6 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 2], order = [1, 0]}>
585-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
585+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
586586
tt.func public @pipeline_small_vector(%arg0: !tt.ptr<f8E5M2>, %arg1: !tt.ptr<f8E5M2>, %arg2: !tt.ptr<f32>, %arg3: !tt.ptr<i8>, %arg4: !tt.ptr<i8>, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32) -> tensor<128x256xf32, #blocked3> {
587587
%c128_i32 = arith.constant 128 : i32
588588
%c256_i32 = arith.constant 256 : i32

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/FMA.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,10 @@ class AMDFMAVectorMultiplier : public FMAVectorMultiplier {
3131
auto dElemTy = dOpTy.getElementType();
3232
auto mod = op->getParentOfType<ModuleOp>();
3333
auto arch = getAMDArch(mod);
34+
assert(arch.has_value() && "expected arch");
3435
DotIntrinsic chosenOp;
3536

36-
bool dotAvailable = AMD::supportsVDot(arch);
37+
bool dotAvailable = AMD::supportsVDot(*arch);
3738
auto b = TritonLLVMOpBuilder(loc, rewriter);
3839
if (dotAvailable) {
3940
if ((aElemTy.isF16() || aElemTy.isBF16()) && dElemTy.isF32()) {

third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "TritonAMDGPUTransforms/Passes.h"
2+
#include "amd/lib/TritonAMDGPUToLLVM/TargetInfo.h"
23
#include "mlir/Support/LLVM.h"
34
#include "third_party/amd/include/Analysis/AxisInfoExt.h"
45
#include "third_party/amd/include/Dialect/TritonAMDGPU/IR/Dialect.h"
@@ -485,6 +486,11 @@ findPipelineableLoads(scf::ForOp forOp,
485486
DenseSet<Operation *> seen;
486487
// Recursively visit the given op and its operands to discover all load ops
487488
// and collect their distances and uses.
489+
490+
auto arch = getAMDArch(forOp->getParentOfType<ModuleOp>());
491+
triton::AMD::ISAFamily isaFamily = triton::AMD::ISAFamily::Unknown;
492+
if (arch)
493+
isaFamily = triton::AMD::deduceISAFamily(*arch);
488494
std::function<void(Operation * op, int distance, Operation *use)> dfs =
489495
[&](Operation *op, int distance, Operation *use) {
490496
// Skip previously visited load ops.
@@ -507,12 +513,17 @@ findPipelineableLoads(scf::ForOp forOp,
507513
}
508514
auto pointeeTy = cast<tt::PointerType>(tensorTy.getElementType())
509515
.getPointeeType();
510-
// If the max contiguous bits we can read is < 32, buffer in
511-
// registers.
512-
if (vecContiguity * pointeeTy.getIntOrFloatBitWidth() >= 32) {
516+
unsigned width =
517+
vecContiguity * pointeeTy.getIntOrFloatBitWidth();
518+
// Limit shared memory sharing to width >= 32 bits.
519+
LDBG("Load " << *loadOp << " has width " << width);
520+
if (width >= 32) {
513521
sharedEncoding =
514522
getSharedEncIfAllUsersAreDotEnc(op->getResult(0))
515523
.value_or(nullptr);
524+
} else if (isaFamily != triton::AMD::ISAFamily::CDNA4) {
525+
LDBG("Skip width<32 load " << loadOp << " for arch " << arch);
526+
return;
516527
}
517528
} else if (auto useOp = dyn_cast<tt::LoadOp>(use)) {
518529
// The use of this loadOp is another loadOp. If the use is not in
@@ -790,6 +801,7 @@ LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages,
790801
int numBuffers = 1;
791802
std::array<tt::CoarseSchedule::Cluster, SCHED_SIZE> clusters;
792803
tt::CoarseSchedule schedule(numStages);
804+
793805
// Schedule the loads and root ops (dot ops) in the loop. This will give us
794806
// a scaffold for the final schedule.
795807
FailureOr<llvm::MapVector<Operation *, LoadInfo>> loadToInfo =

0 commit comments

Comments
 (0)