
Commit 51021fb

[AMD] NFC: Split Loop Expansion from StreamPipeliner (#8307)
This PR refactors StreamPipeliner with the following changes:
- Exposed some structures in the header.
- Split StreamPipeliner into two passes:
  - `TritonAMDGPUScheduleLoops`: generates the schedule. It keeps most of the functionality of StreamPipeliner; a follow-up PR will trim it further so it only schedules global loads and compute.
  - `TritonAMDGPUPipeline`: will eventually contain `lowerLoops`, to allocate LDS and schedule async loads, and `expandLoops`, to mechanically expand loops according to the schedule. As of this PR it contains only `expandLoops`.
- Split `expandLoops` out of StreamPipeliner into the separate Pipeline pass.
- Renamed StreamPipeline.cpp to ScheduleLoops.cpp to better reflect its purpose.
- Updated the pass names in the lit tests accordingly.

This is part of a series of PRs refactoring StreamPipeliner. Previous PRs:
- triton-lang/triton#8295
1 parent 0532264 commit 51021fb
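
Concretely, the single stream-pipeline invocation is replaced by running the two new passes back to back, as shown by the lit-test RUN-line updates in this commit (a representative before/after, options abbreviated):

Before: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=2" -canonicalize
After:  triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=2" -tritonamdgpu-pipeline -canonicalize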

18 files changed: +269 −185 lines

bin/RegisterTritonDialects.h

Lines changed: 2 additions & 1 deletion
@@ -108,7 +108,8 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUHoistLayoutConversions();
   mlir::registerTritonAMDGPUReorderInstructions();
   mlir::registerTritonAMDGPUBlockPingpong();
-  mlir::registerTritonAMDGPUStreamPipeline();
+  mlir::registerTritonAMDGPUPipeline();
+  mlir::registerTritonAMDGPUScheduleLoops();
   mlir::registerTritonAMDGPUCanonicalizePointers();
   mlir::registerTritonAMDGPUConvertToBufferOps();
   mlir::registerTritonAMDGPUInThreadTranspose();

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 2 additions & 0 deletions
@@ -182,6 +182,8 @@ getLastUseOfPipelinedOp(ArrayRef<Operation *> ops, scf::ForOp forOp,
                          CoarseSchedule &schedule,
                          std::function<bool(Operation *)> filterUse = nullptr);
 
+// Clean up attributes passing over schedules across stages in pipelining
+void removePipeliningAttributes(ModuleOp moduleOp);
 } // namespace triton
 } // namespace mlir

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 8 additions & 0 deletions
@@ -925,3 +925,11 @@ triton::getLastUseOfPipelinedOp(ArrayRef<Operation *> ops, scf::ForOp forOp,
                        current->isBeforeInBlock(candidate));
   });
 }
+
+void triton::removePipeliningAttributes(ModuleOp moduleOp) {
+  moduleOp->walk([&](Operation *op) {
+    op->removeAttr(mlir::triton::kLoopStageAttrName);
+    op->removeAttr(mlir::triton::kLoopClusterAttrName);
+    op->removeAttr(mlir::triton::kScheduledMaxStageAttrName);
+  });
+}
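
For illustration, a minimal usage sketch of the now-shared helper (the wrapper function name below is hypothetical; the actual call site added in this commit is the SoftwarePipeliner.cpp change that follows):

#include "mlir/IR/BuiltinOps.h"
#include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"

// Hypothetical cleanup step a pipelining pass could run once loop expansion is
// done: strip the stage/cluster/max-stage scheduling attributes module-wide.
static void cleanupAfterExpansion(mlir::ModuleOp moduleOp) {
  mlir::triton::removePipeliningAttributes(moduleOp);
}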

lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp

Lines changed: 1 addition & 9 deletions
@@ -148,14 +148,6 @@ static void expandLoops(ModuleOp moduleOp) {
   resolveMaskOp(moduleOp, peeledMaskOps);
 }
 
-static void removeAttributes(ModuleOp moduleOp) {
-  moduleOp->walk([&](Operation *op) {
-    op->removeAttr(mlir::triton::kLoopStageAttrName);
-    op->removeAttr(mlir::triton::kLoopClusterAttrName);
-    op->removeAttr(mlir::triton::kScheduledMaxStageAttrName);
-  });
-}
-
 struct PipelinePass : public impl::TritonGPUPipelineBase<PipelinePass> {
 
   using impl::TritonGPUPipelineBase<PipelinePass>::TritonGPUPipelineBase;
@@ -180,7 +172,7 @@ struct PipelinePass : public impl::TritonGPUPipelineBase<PipelinePass> {
     }
 
     // Cleanup the IR from the pipeline attributes.
-    removeAttributes(moduleOp);
+    removePipeliningAttributes(moduleOp);
 
     pipelineWgmma(moduleOp, numStages);

test/TritonGPU/amd/amd-pipeline-chained-dots.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=4 use_async_copy=1" -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=4 use_async_copy=1" -tritonamdgpu-pipeline="use_async_copy=1" -canonicalize | FileCheck %s
 
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
 #mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>

test/TritonGPU/amd/amd-stream-lds-layout-selection.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=2" -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=2" -tritonamdgpu-pipeline -canonicalize | FileCheck %s
 
 // Pick a common shared memory layout with vec = max kWidth of all users.
 // CHECK{LITERAL}: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>

test/TritonGPU/amd/amd-stream-loop-assume.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=2" -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=2" -tritonamdgpu-pipeline -canonicalize | FileCheck %s
 
 // matmul: 128x32 @ 32x128 -> 128x128
 #AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>

test/TritonGPU/loop-pipeline-combine-waits.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=3 use_async_copy=1 use_pingpong=1" | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=3 use_async_copy=1 use_pingpong=1" -tritonamdgpu-pipeline="use_async_copy=1" | FileCheck %s
 
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
 #mma = #ttg.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>

test/TritonGPU/loop-pipeline-hip.mlir

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline=num_stages=2 -canonicalize | FileCheck %s --check-prefixes=COMMON,SYNC
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=2 use_async_copy=1" -canonicalize | FileCheck %s --check-prefixes=COMMON,ASYNC
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops=num_stages=2 -tritonamdgpu-pipeline -canonicalize | FileCheck %s --check-prefixes=COMMON,SYNC
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=2 use_async_copy=1" -tritonamdgpu-pipeline="use_async_copy=1" -canonicalize | FileCheck %s --check-prefixes=COMMON,ASYNC
 
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>

test/TritonGPU/loop-pipeline.mlir

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 // RUN: triton-opt %s -split-input-file -tritongpu-assign-latencies -tritongpu-schedule-loops -tritongpu-pipeline=num-stages=3 -canonicalize | FileCheck %s --check-prefixes=COMMON,CHECK
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline=num_stages=2 -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=3" -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD_3_STAGES
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops=num_stages=2 -tritonamdgpu-pipeline -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-schedule-loops="num_stages=3" -tritonamdgpu-pipeline -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD_3_STAGES
 
 // 4 warps
 // matmul: 128x32 @ 32x128 -> 128x128
