
Commit 44e830e

masahi and Mogball authored
[WS] Use aref for TMA load pipelining and lowering (triton-lang#7826)
A follow-up to triton-lang#7581 which actually starts to use aref for TMA load. It replaces one half of `LoadMMASpecialization`, but the code there is not removed to keep the pass self-contained and its lit tests functional. I verified that all tests pass if I remove the TMA code in `LoadMMASpecialization`. `LowerAref` is updated to add:

* Lowering for NVWS desc load ops
* Aref combining optimization, to coalesce barrier operations on MMA operands into one
* Aref multi-buffering, only enabled for arefs whose producer is TMA

---------

Co-authored-by: Jeff Niu <[email protected]>
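For readers new to the multi-buffering mentioned above, the idea is to rotate producers and consumers through `numStages` copies of a buffer and to flip the expected mbarrier parity each time the slot index wraps. Below is a minimal conceptual sketch of that bookkeeping in plain C++; it is illustrative only, and the names and structure are not taken from the pass.

#include <cstdint>

// Conceptual multi-buffering bookkeeping: advance to the next buffer slot and
// flip the barrier phase whenever the slot index wraps past numStages.
struct BufferState {
  int32_t index = 0; // which of the numStages buffer copies is in use
  int32_t phase = 0; // mbarrier parity expected by the next wait
};

inline BufferState advance(BufferState s, int32_t numStages) {
  s.index += 1;
  if (s.index == numStages) {
    s.index = 0;
    s.phase ^= 1; // parity flips once per full rotation through the buffers
  }
  return s;
}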
1 parent 98147ef commit 44e830e

File tree: 7 files changed, +680 -177 lines

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 10 additions & 0 deletions
@@ -257,6 +257,16 @@ Operation *mlir::triton::predicateOp(RewriterBase &rewriter, Operation *op,
     arriveBarrier.getPredMutable().assign(mask);
     return op;
   }
+  if (auto commit = dyn_cast<ttng::TCGen5CommitOp>(op)) {
+    rewriter.setInsertionPoint(commit);
+    Value mask = pred;
+    Value currentPred = commit.getPred();
+    if (currentPred) {
+      mask = getPredMask(rewriter, currentPred.getType(), currentPred, pred);
+    }
+    commit.getPredMutable().assign(mask);
+    return op;
+  }
   if (auto storeOp = dyn_cast<tt::StoreOp>(op)) {
     rewriter.setInsertionPoint(storeOp);
     Value mask = getPredMask(rewriter, storeOp.getPtr().getType(),
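The new TCGen5CommitOp case follows the same shape as the arriveBarrier case above it: if the op already carries a predicate, the loop mask is folded into it rather than overwriting it. Below is a standalone sketch of that combine, assuming plain i1 predicates and using hypothetical helper names; it is not the getPredMask implementation from the pass.

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

// Hypothetical illustration: fold a new i1 mask into an op's existing
// predicate. If there is no existing predicate, use the mask directly;
// otherwise require both conditions to hold.
static Value combinePredicate(RewriterBase &rewriter, Location loc,
                              Value existingPred, Value mask) {
  if (!existingPred)
    return mask;
  return rewriter.create<arith::AndIOp>(loc, existingPred, mask);
}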

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/AutomaticWarpSpecialization.cpp

Lines changed: 2 additions & 2 deletions
@@ -35,15 +35,15 @@ struct AutomaticWarpSpecialization
 void AutomaticWarpSpecialization::runOnOperation() {
   OpPassManager pm;
   pm.addPass(createTritonGPUPartitionScheduling());
+  pm.addPass(createNVWSInsertAref());
   pm.addPass(createTritonGPULoadMMASpecialization({numStages}));
   pm.addPass(createTritonGPURewritePartitionDependencies());
   // `int-range-optimizations` and SCCP are good at cleaning up loop arithmetic.
   // FIXME: Re-enable integer range analysis once it is fixed.
   // pm.addPass(arith::createIntRangeOptimizationsPass());
   pm.addPass(createSCCPPass());
   pm.addPass(createCSEPass());
-  pm.addPass(createNVWSAssignStagePhase());
-  pm.addPass(createNVWSLowerAref());
+  pm.addPass(createNVWSLowerAref({numStages}));
   pm.addPass(createTritonGPUPartitionLoops());
   pm.addPass(createNVWSLowerWarpGroup());
   if (failed(runPipeline(pm, getOperation())))

test/NVWS/lower_aref.mlir

Lines changed: 172 additions & 71 deletions
Large diffs are not rendered by default.

test/TritonGPU/load-mma-specialization.mlir

Lines changed: 12 additions & 12 deletions
@@ -1,6 +1,6 @@
 // RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -tritongpu-hoist-tmem-alloc | FileCheck %s --check-prefix=TMEM --check-prefix=FUNC
 // RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -verify-diagnostics --tritongpu-hoist-tmem-alloc -tritongpu-partition-scheduling -tritongpu-load-mma-specialization -sccp -int-range-optimizations -canonicalize -cse -tritongpu-remove-layout-conversions | FileCheck %s
-// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -verify-diagnostics --tritongpu-hoist-tmem-alloc -tritongpu-automatic-warp-specialization | FileCheck %s --check-prefix=AWS --check-prefix=FUNC
+// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -verify-diagnostics --tritongpu-hoist-tmem-alloc -tritongpu-assign-latencies -tritongpu-schedule-loops -tritongpu-automatic-warp-specialization | FileCheck %s --check-prefix=AWS --check-prefix=FUNC

 #acc_layout = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
 #oper_layout = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
@@ -768,7 +768,7 @@ tt.func @matmul_scaled_rhs_scales_tma(
     %off_n: i32,
     %a_desc: !tt.tensordesc<tensor<128x64xf8E4M3FN, #nvmma_smem>>,
     %b_desc: !tt.tensordesc<tensor<128x64xf8E4M3FN, #nvmma_smem>>,
-    %b_scale_desc: !tt.tensordesc<tensor<128x8xi8, #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [4, 3, 2, 1, 0]}>>>
+    %b_scale_desc: !tt.tensordesc<tensor<128x8xi8, #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>>>
 ) {
   %true = arith.constant true
   %c0_i32 = arith.constant 0 : i32
@@ -791,7 +791,7 @@ tt.func @matmul_scaled_rhs_scales_tma(
   // CHECK-COUNT-3: async_tma_copy_global_to_local {{.*}} {ttg.partition = 2 : i32}
   %a_reg = tt.descriptor_load %a_desc[%off_m, %off_k] : !tt.tensordesc<tensor<128x64xf8E4M3FN, #nvmma_smem>> -> tensor<128x64xf8E4M3FN, #oper_layout>
   %b_reg = tt.descriptor_load %b_desc[%off_n, %off_k] : !tt.tensordesc<tensor<128x64xf8E4M3FN, #nvmma_smem>> -> tensor<128x64xf8E4M3FN, #oper_layout>
-  %b_scales_reg = tt.descriptor_load %b_scale_desc[%off_m, %c0_i32] : !tt.tensordesc<tensor<128x8xi8, #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [4, 3, 2, 1, 0]}>>> -> tensor<128x8xi8, #scales>
+  %b_scales_reg = tt.descriptor_load %b_scale_desc[%off_m, %c0_i32] : !tt.tensordesc<tensor<128x8xi8, #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>>> -> tensor<128x8xi8, #scales>

   %a_sh = ttg.local_alloc %a_reg : (tensor<128x64xf8E4M3FN, #oper_layout>) -> !ttg.memdesc<128x64xf8E4M3FN, #nvmma_smem, #smem>
   %b_sh_raw = ttg.local_alloc %b_reg : (tensor<128x64xf8E4M3FN, #oper_layout>) -> !ttg.memdesc<128x64xf8E4M3FN, #nvmma_smem, #smem>
@@ -1023,13 +1023,13 @@ tt.func @specialize_load_only(%desc: !tt.tensordesc<tensor<128x64xf16, #shared>>
   %c1_i32 = arith.constant 1 : i32
   // CHECK: local_alloc : () -> !ttg.memdesc<3x128x64xf16,
   scf.for %i = %c0_i32 to %ub step %c1_i32 : i32 {
-    // CHECK: wait_barrier {{.*}} {ttg.partition = 0 : i32}
-    // CHECK-NEXT: local_load {{.*}} {ttg.partition = 0 : i32}
+    // CHECK: wait_barrier {{.*}} {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 0 : i32}
+    // CHECK-NEXT: local_load {{.*}} {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 0 : i32}
     // CHECK-NEXT: fence_async_shared {{.*}}partition = 0
-    // CHECK-NEXT: arrive_barrier {{.*}} {ttg.partition = 0 : i32}
-    %val = tt.descriptor_load %desc[%i, %i] : !tt.tensordesc<tensor<128x64xf16, #shared>> -> tensor<128x64xf16, #oper_layout>
-    "use"(%val) : (tensor<128x64xf16, #oper_layout>) -> ()
-  } {tt.warp_specialize}
+    // CHECK-NEXT: arrive_barrier {{.*}} {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 0 : i32}
+    %val = tt.descriptor_load %desc[%i, %i] {loop.cluster = 1 : i32, loop.stage = 0}: !tt.tensordesc<tensor<128x64xf16, #shared>> -> tensor<128x64xf16, #oper_layout>
+    "use"(%val) {loop.cluster = 0 : i32, loop.stage = 1 : i32} : (tensor<128x64xf16, #oper_layout>) -> ()
+  } {tt.num_stages = 3 : i32, tt.scheduled_max_stage = 1 : i32, tt.warp_specialize}
   tt.return
 }

@@ -1041,9 +1041,9 @@ tt.func @fp4_padded_load(%desc: !tt.tensordesc<tensor<1x256x64xui8, #fp4_padded_
   scf.for %i = %c0_i32 to %ub step %c1_i32 : i32 {
     // CHECK: [[IDX:%.*]] = arith.muli [[I]], %c2_i32 : i32
     // CHECK: async_tma_copy_global_to_local %arg{{[0-9]+}}[[[I]], [[IDX]]]
-    %val = tt.descriptor_load %desc[%i, %i] : !tt.tensordesc<tensor<1x256x64xui8, #fp4_padded_shared>> -> tensor<256x64xi8, #oper_layout>
-    "use"(%val) : (tensor<256x64xi8, #oper_layout>) -> ()
-  } {tt.warp_specialize}
+    %val = tt.descriptor_load %desc[%i, %i] {loop.cluster = 1 : i32, loop.stage = 0, ttg.partition = 2 : i32} : !tt.tensordesc<tensor<1x256x64xui8, #fp4_padded_shared>> -> tensor<256x64xi8, #oper_layout>
+    "use"(%val) {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 0 : i32} : (tensor<256x64xi8, #oper_layout>) -> ()
+  } {tt.num_stages = 2 : i32, tt.scheduled_max_stage = 1 : i32, tt.warp_specialize}
   tt.return
 }

third_party/nvidia/include/Dialect/NVWS/Transforms/Passes.td

Lines changed: 5 additions & 0 deletions
@@ -80,6 +80,11 @@ def NVWSLowerAref : Pass<"nvws-lower-aref", "mlir::ModuleOp"> {
     "mlir::triton::gpu::TritonGPUDialect",
     "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"
   ];
+
+  let options = [
+    Option<"numStages", "num-stages", "int32_t", /*default*/"3",
+           "number of pipeline stages">
+  ];
 }

 def NVWSInsertAref: Pass<"nvws-insert-aref", "mlir::ModuleOp"> {
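Assuming the new option behaves like other tablegen-defined pass options, the lowering pass should be invokable in isolation with the standard MLIR pass-option syntax, for example in a lit RUN line such as the one below (an illustrative invocation, not taken from this commit's tests):

// RUN: triton-opt %s -split-input-file -nvws-lower-aref='num-stages=3' | FileCheck %s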

third_party/nvidia/lib/Dialect/NVWS/Transforms/InsertAref.cpp

Lines changed: 8 additions & 3 deletions
@@ -95,7 +95,7 @@ ArefCreateOp createAref(OpBuilder &builder, ProducedValueInfo &producedValue) {
   };

   MemDescType memDescType;
-  if (isDescLoadAndAlloc<LocalAllocOp>(result)) {
+  if (result.getDefiningOp<LocalAllocOp>()) {
     memDescType = dyn_cast<MemDescType>(result.getType());
   } else if (auto opt = isDescLoadAndAlloc<TMEMAllocOp>(result)) {
     auto descLoadResult = opt->first.getSrc();
@@ -206,6 +206,10 @@ SmallVector<Operation *> createArefPut(PartitionBuilder &builder,
   } else if (isGlobalLoadAndAlloc<LocalAllocOp>(result) ||
              isGlobalLoadAndAlloc<TMEMAllocOp>(result)) {
     llvm_unreachable("cpasync not supported yet");
+  } else if (auto alloc = result.getDefiningOp<LocalAllocOp>()) {
+    builder.createInto<LocalStoreOp>(*producerPartition, stageCluster,
+                                     alloc.getSrc(), dataBuf);
+    staleOps.push_back(alloc);
   } else if (auto tensorType = dyn_cast<RankedTensorType>(result.getType())) {
     if (auto descOp = result.getDefiningOp<triton::DescriptorOpInterface>()) {
       createNVWSDescriptorLoadOp(builder, descOp, dataBuf, producerPartition,
@@ -296,8 +300,7 @@ getEnterAndExitStageClustersOfUses(const SetVector<Value> &producedResults,
                                    scf::ForOp forOp) {
   CoarseSchedule coarseSchedule;
   if (failed(coarseSchedule.deSerialize(forOp))) {
-    llvm::report_fatal_error(
-        "Failed to deserialze stage and cluster annotations.");
+    return std::make_pair(std::nullopt, std::nullopt);
   }

   SmallVector<Operation *> ops;
@@ -485,6 +488,8 @@ class NVWSArefInsertion
         (allowDescLoadRegUse &&
          (isa<triton::DescriptorOpInterface>(op)))) {
       ops.push_back(op);
+    } else if (isa<LocalAllocOp>(op)) {
+      ops.push_back(op);
     }
     return WalkResult::advance();
   });
