Skip to content

Commit a6ecc75

Browse files
davidberard98 and htyu authored
[AMD] StreamPipeline V1: fix depArg return mapping (#4832)
Previously, if an arg inside the loop was marked as a depArg, then a new iter_arg would be added to the for loop to handle the arg; but any usages of these variables _after_ the for loop would not be updated; those usages would get the wrong value. This PR fixes this by updating the return mapping. See the comment added in StreamPipeline.cpp for an example. Co-authored-by: Hongtao Yu <[email protected]>
1 parent 256ef34 commit a6ecc75

File tree

2 files changed

+75
-1
lines changed

2 files changed

+75
-1
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline | FileCheck %s

// Regression test for the depArg result-mapping fix in StreamPipeline:
// after pipelining, values yielded by the original scf.for that were marked
// as depArgs (here: the two pointer iter_args %arg8/%arg9) are carried in
// *new* iter_args of the pipelined loop. Uses of those results *after* the
// loop (the tt.return below) must be remapped to the new result indices
// (#3 and #4), not the original ones (#1 and #2).
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#loc = loc("/data/users/dberard/triton-env/scripts/matmul.py":6:0)
#mma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [32, 32], isTransposed = false}>
module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
// CHECK-LABEL: tt.func @use_dep_args
tt.func @use_dep_args(%a_ptrs: tensor<64x32x!tt.ptr<bf16>, #blocked>, %b_ptrs: tensor<32x64x!tt.ptr<bf16>, #blocked1>, %loop_range: i32) -> (tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr<bf16>, #blocked>, tensor<32x64x!tt.ptr<bf16>, #blocked1>) {
%cst = arith.constant dense<32> : tensor<64x32xi32, #blocked>
%cst2 = arith.constant dense<2048> : tensor<32x64xi32, #blocked1>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma>
%c0_i32 = arith.constant 0 : i32
%c8_i32 = arith.constant 8 : i32
%c32_i32 = arith.constant 32 : i32
// The prologue load must be hoisted out of the loop by the pipeliner.
// CHECK: tt.load
// CHECK: [[FOR_OUT:%[a-z0-9_]+]]:{{[0-9]+}} = scf.for
%for:3 = scf.for %arg6 = %c0_i32 to %loop_range step %c32_i32 iter_args(%arg7 = %cst_0, %arg8 = %a_ptrs, %arg9 = %b_ptrs) -> (tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr<bf16>, #blocked>, tensor<32x64x!tt.ptr<bf16>, #blocked1>) : i32 {
%63 = tt.load %arg8 : tensor<64x32x!tt.ptr<bf16>, #blocked>
%64 = tt.load %arg9 : tensor<32x64x!tt.ptr<bf16>, #blocked1>
%65 = triton_gpu.convert_layout %63 : tensor<64x32xbf16, #blocked> -> tensor<64x32xbf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
%66 = triton_gpu.convert_layout %64 : tensor<32x64xbf16, #blocked1> -> tensor<32x64xbf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>
%67 = tt.dot %65, %66, %arg7 : tensor<64x32xbf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<32x64xbf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<64x64xf32, #mma>
%68 = tt.addptr %arg8, %cst : tensor<64x32x!tt.ptr<bf16>, #blocked>, tensor<64x32xi32, #blocked>
%69 = tt.addptr %arg9, %cst2 : tensor<32x64x!tt.ptr<bf16>, #blocked1>, tensor<32x64xi32, #blocked1>
scf.yield %67, %68, %69 : tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr<bf16>, #blocked>, tensor<32x64x!tt.ptr<bf16>, #blocked1>
}
// The pointer results must come from the remapped (depArg) result slots #3/#4.
// CHECK: tt.return {{[^,]+}}, [[FOR_OUT]]#3, [[FOR_OUT]]#4
tt.return %for#0, %for#1, %for#2 : tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr<bf16>, #blocked>, tensor<32x64x!tt.ptr<bf16>, #blocked1>
}
}

third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class LoopPipeliner {
7171
/// shared mem and a next buffer stored in regs.
7272
int numStages = 2;
7373

74-
/// Arg indicies
74+
/// Arg indices in pplForOp
7575
size_t depArgsBeginIdx;
7676
DenseMap<BlockArgument, size_t> depArgsIdx;
7777

@@ -165,6 +165,9 @@ class LoopPipeliner {
165165
/// Collect loads to pipeline. Return success if we can pipeline this loop
166166
LogicalResult initialize();
167167

168+
// Update mapping from old forOp results to new pplForOp results
169+
void setResultMapping(DenseMap<Value, Value> &newResults);
170+
168171
/// Emit pipelined loads (before loop body)
169172
void emitPrologue();
170173

@@ -548,6 +551,45 @@ void LoopPipeliner::emitPrologue() {
548551
} // for (Operation *op : orderedDeps)
549552
}
550553

554+
/// Remap results of the original loop to the pipelined loop's results.
///
/// Pipelining may carry a loop-carried dependency (a "depArg") in a *new*
/// iter_arg of pplForOp rather than in the iter_arg slot it occupied in the
/// original forOp. Any use of such a result *after* the loop must then read
/// the new result index, not the old one, or it observes a stale value.
///
/// For example, starting from
///
///   ptr = ...
///   c = [zeros]
///   ret = scf.for iter_args(a_ptr=ptr, c=c)
///     a = load(a_ptr)
///     c += dot(a, ...)
///     a_ptr_new = a_ptr + N
///     scf.yield %a_ptr_new, %c
///
/// the pipeliner rewrites the loop so the pointer advances through a fresh
/// iter_arg:
///
///   ptr = ...
///   c = [zeros]
///   load_pre = load(ptr)
///   ptr_new = ptr + N
///   ret = scf.for iter_args(a_ptr=ptr, c=c, ld=load_pre, A_ptr_1=ptr_new)
///     a_next = load(A_ptr_1)
///     c += dot(ld, ...)
///     A_ptr_new = A_ptr_1 + N
///     scf.yield a_ptr, c, a_next, A_ptr_new
///
/// so downstream users of a_ptr must now reference ret#3 instead of ret#0.
void LoopPipeliner::setResultMapping(DenseMap<Value, Value> &newResults) {
  auto iterArgs = forOp.getRegionIterArgs();
  for (size_t oldIdx = 0, numArgs = iterArgs.size(); oldIdx < numArgs;
       ++oldIdx) {
    BlockArgument iterArg = iterArgs[oldIdx];
    // Only depArgs were relocated; other results keep their original index.
    if (!depArgs.contains(iterArg))
      continue;
    size_t newIdx = depArgsIdx[iterArg];
    newResults[forOp->getResult(oldIdx)] = pplForOp->getResult(newIdx);
  }
}
592+
551593
void LoopPipeliner::emitEpilogue(DenseMap<Value, Value> &newResults) {
552594
if (!peelLastIter)
553595
return;
@@ -846,6 +888,7 @@ struct PipelinePass : public TritonAMDGPUStreamPipelineBase<PipelinePass> {
846888
DenseMap<Value, Value> newResults;
847889
for (unsigned i = 0; i < forOp->getNumResults(); ++i)
848890
newResults[forOp->getResult(i)] = pplForOp->getResult(i);
891+
pipeliner.setResultMapping(newResults);
849892
pipeliner.emitEpilogue(newResults);
850893

851894
// Replace the original loop

0 commit comments

Comments
 (0)