
Commit 71e0eb7

Merge commit '711caa4d78a56cc7eb539b61501b904a94bba2db'
2 parents 65f7671 + 711caa4

File tree

13 files changed: +249 −38 lines


include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -39,6 +39,7 @@ template <> struct hash<CacheKey> {
 
 namespace mlir::triton::gpu {
 
+constexpr static char AttrMaxRegistersName[] = "ttg.maxnreg";
 constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
 constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
 constexpr static char AttrTargetName[] = "ttg.target";
```

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 3 additions & 1 deletion
```diff
@@ -376,7 +376,9 @@ def TTG_WarpSpecializeOp : TTG_Op<"warp_specialize", [
   let arguments = (ins
     Variadic<AnyType>:$explicitCaptures,
     DenseI32ArrayAttr:$partitionNumWarps,
-    OptionalAttr<DenseI32ArrayAttr>:$warpGroupStartIds
+    OptionalAttr<DenseI32ArrayAttr>:$warpGroupStartIds,
+    OptionalAttr<DenseI32ArrayAttr>:$requestedRegisters,
+    OptionalAttr<DenseI32ArrayAttr>:$actualRegisters
   );
   let results = (outs Variadic<AnyType>:$defaultPassthrough);
```
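The two new optional attributes form a small handshake between the warp-specialization passes in this commit: `OptimizePartitionWarps` records a per-partition register estimate in `requestedRegisters`, and `AllocateWarpGroups` reads that estimate and writes the final per-warpgroup budgets into `actualRegisters` (index 0 is the default warp group). A hedged sketch of the ODS-generated accessors in use (signatures inferred from how the passes below call them, not copied from the generated header):

```cpp
// Illustrative only: mirrors how the passes in this commit use the new
// accessors; the include and function here are hypothetical scaffolding.
#include "triton/Dialect/TritonGPU/IR/Dialect.h"

using namespace mlir;

void registerHandshake(triton::gpu::WarpSpecializeOp wsOp) {
  // Producer (OptimizePartitionWarps): one estimate per partition region.
  SmallVector<int32_t> estimates = {48, 80, 48};
  wsOp.setRequestedRegisters(estimates);

  // Consumer (AllocateWarpGroups): read the estimates back; the optional is
  // empty if no estimate was ever recorded.
  if (auto requested = wsOp.getRequestedRegisters()) {
    // Final decision, with index 0 reserved for the default warp group.
    SmallVector<int32_t> budgets = {208, 80, 80, 80};
    wsOp.setActualRegisters(budgets);
  }
}
```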

lib/Conversion/TritonGPUToLLVM/AllocateWarpGroups.cpp

Lines changed: 89 additions & 0 deletions
```diff
@@ -18,6 +18,20 @@ struct AllocateWarpGroups
   void runOnOperation() override {
     ModuleOp mod = getOperation();
 
+    int threadsPerWarp = TritonGPUDialect::getThreadsPerWarp(mod);
+
+    struct WarpGroupInfo {
+      SmallVector<Region *> partitions;
+      int maxRequestedRegs = 0;
+      unsigned numWarps = 0;
+    };
+    struct WarpGroupPartition {
+      int startId;
+      Region *partition;
+      int32_t estRegs;
+      int numWarps;
+    };
+
     // Compute the total number of warps required at any given time.
     int baseNumWarps = lookupNumWarps(mod);
     int maxExtraWarps = 0;
@@ -42,6 +56,81 @@ struct AllocateWarpGroups
         startId += size;
       }
       op.setWarpGroupStartIds(startIds);
+
+      // Require that an estimate has been set and that the partitions form
+      // whole warpgroups.
+      auto regsAttr = op.getRequestedRegisters();
+      if (!regsAttr || op.getTotalPartitionWarps() % 4 != 0)
+        return;
+
+      // Group the partitions into warpgroups.
+      SmallVector<WarpGroupPartition> orderedPartitions;
+      for (auto [startId, partition, estRegs, numWarps] :
+           llvm::zip(startIds, op.getPartitionRegions(), *regsAttr, arr))
+        orderedPartitions.push_back({startId, partition, estRegs, numWarps});
+      llvm::sort(orderedPartitions,
+                 [&](auto lhs, auto rhs) { return lhs.startId < rhs.startId; });
+
+      // Iterate over the partitions and assign them to warp groups. Determine
+      // the maximum number of requested registers per warp group.
+      SmallVector<WarpGroupInfo> warpGroups;
+      for (auto [startId, partition, estRegs, numWarps] : orderedPartitions) {
+        if (startId % 4 == 0) {
+          warpGroups.push_back(WarpGroupInfo{});
+        }
+        warpGroups.back().partitions.push_back(partition);
+        // Round up to the nearest multiple of 8.
+        int estRegsCeil8 = llvm::divideCeil(estRegs, 8) * 8;
+        warpGroups.back().maxRequestedRegs =
+            std::max<int>(warpGroups.back().maxRequestedRegs, estRegsCeil8);
+        warpGroups.back().numWarps += numWarps;
+      }
+
+      // Determine the maximum number of registers per thread. This may have
+      // been set by the user.
+      int maxnreg;
+      if (auto maxnregAttr =
+              op->getAttrOfType<IntegerAttr>(AttrMaxRegistersName)) {
+        maxnreg = maxnregAttr.getInt();
+      } else {
+        maxnreg = (1 << 16) / (baseNumWarps + op.getTotalPartitionWarps()) /
+                  threadsPerWarp;
+        maxnreg = maxnreg / 8 * 8;
+      }
+
+      // Compute the register deficit over the partition warp groups.
+      int registerDeficit = 0;
+      for (const WarpGroupInfo &wg : warpGroups) {
+        assert(wg.numWarps % 4 == 0);
+        registerDeficit +=
+            (maxnreg - wg.maxRequestedRegs) * wg.numWarps * threadsPerWarp;
+      }
+      if (registerDeficit <= 0)
+        return;
+
+      // Determine the number of extra registers that we can distribute to the
+      // default warp group.
+      int leftover =
+          ((baseNumWarps * threadsPerWarp * maxnreg) + registerDeficit) /
+          baseNumWarps / threadsPerWarp;
+      // Round down to the nearest multiple of 8.
+      leftover = leftover / 8 * 8;
+
+      // Generate setmaxnreg in each partition according to its warp group.
+      SmallVector<int32_t> maxnregsPerPartition(1 + arr.size());
+      for (const WarpGroupInfo &wg : warpGroups) {
+        for (Region *region : wg.partitions) {
+          maxnregsPerPartition[1 + region->getRegionNumber()] =
+              wg.maxRequestedRegs;
+        }
+      }
+      // Set the register usage for the default warp group.
+      maxnregsPerPartition.front() = leftover;
+      op.setActualRegisters(maxnregsPerPartition);
+
+      // Set the initial max number of registers. This is needed for PTXAS to
+      // cooperate.
+      mod->setAttr(AttrMaxRegistersName,
+                   Builder(op.getContext()).getI32IntegerAttr(maxnreg));
     });
 
     Builder b(&getContext());
```
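To make the arithmetic above concrete, the following minimal, self-contained C++ sketch replays the register accounting on the values from the `setmaxnreg` test added later in this commit (8 default warps, 32 threads per warp, partitions of 1 + 2 + 1 warps requesting 48/80/48 registers). Variable names are illustrative, not part of the pass:

```cpp
#include <cassert>

int main() {
  const int threadsPerWarp = 32;
  const int baseNumWarps = 8;        // "ttg.num-warps" on the module
  const int totalPartitionWarps = 4; // partitions of 1 + 2 + 1 warps

  // No user-provided ttg.maxnreg: split the 64K-register file evenly across
  // all warps, then round down to a multiple of 8.
  int maxnreg =
      (1 << 16) / (baseNumWarps + totalPartitionWarps) / threadsPerWarp;
  maxnreg = maxnreg / 8 * 8;
  assert(maxnreg == 168); // CHECK: ttg.maxnreg = 168

  // The four partition warps form one warpgroup; its budget is the largest
  // request rounded up to a multiple of 8: max(48, 80, 48) -> 80.
  const int maxRequestedRegs = 80;
  const int wgNumWarps = 4;
  const int registerDeficit =
      (maxnreg - maxRequestedRegs) * wgNumWarps * threadsPerWarp;
  assert(registerDeficit == 11264);

  // The deficit is redistributed to the default warp group, rounded down to
  // a multiple of 8.
  int leftover = ((baseNumWarps * threadsPerWarp * maxnreg) + registerDeficit) /
                 baseNumWarps / threadsPerWarp;
  leftover = leftover / 8 * 8;
  assert(leftover == 208); // CHECK: actualRegisters = array<i32: 208, 80, 80, 80>
  return 0;
}
```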

lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 14 additions & 6 deletions
```diff
@@ -125,11 +125,11 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
           mlir::IntegerType::get(llvmFuncOp.getContext(), 8);
       const auto arrayType = mlir::LLVM::LLVMArrayType::get(
           llvmFuncOp.getContext(), byteType, 128);
-      llvmFuncOp.setArgAttr(i, "llvm.byval",
+      llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getByValAttrName(),
                             mlir::TypeAttr::get(arrayType));
-      llvmFuncOp.setArgAttr(i, "nvvm.grid_constant",
+      llvmFuncOp.setArgAttr(i, NVVM::NVVMDialect::getGridConstantAttrName(),
                             mlir::UnitAttr::get(llvmFuncOp.getContext()));
-      llvmFuncOp.setArgAttr(i, "llvm.align",
+      llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getAlignAttrName(),
                             mlir::IntegerAttr::get(i32_type, 64));
     }
   }
@@ -155,7 +155,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
 
     if (LLVM::isKernel(funcOp)) {
       // Set an attribute to indicate this function is a kernel entry.
-      newFuncOp->setAttr("nvvm.kernel",
+      newFuncOp->setAttr(NVVM::NVVMDialect::getKernelFuncAttrName(),
                          rewriter.getIntegerAttr(type::u1Ty(ctx), 1));
       newFuncOp.setLinkage(LLVM::Linkage::External);
     } else {
@@ -166,12 +166,20 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
           ArrayAttr::get(ctx, rewriter.getStringAttr("noinline")));
       newFuncOp.setLinkage(LLVM::Linkage::Internal);
     }
-    // Set an attribute for reqntidx, it could be used in latter LLVM codegen
-    // for `nvvm.annotation` metadata.
+
+    // Determine the actual number of required warps.
     int numWarps = triton::gpu::lookupNumWarps(funcOp);
     if (auto totalNumWarps = funcOp.getParentOp()->getAttrOfType<IntegerAttr>(
             "ttg.total-num-warps"))
       numWarps = totalNumWarps.getInt();
+
+    // Set `nvvm.maxnreg` if it was specified on the module.
+    if (Attribute maxnregAttr =
+            funcOp.getParentOp()->getAttr(triton::gpu::AttrMaxRegistersName))
+      newFuncOp->setAttr(NVVM::NVVMDialect::getMaxnregAttrName(), maxnregAttr);
+
+    // Set an attribute for reqntid; it can be used in later LLVM codegen
+    // for `nvvm.annotation` metadata.
     newFuncOp->setAttr(NVVM::NVVMDialect::getReqntidAttrName(),
                        rewriter.getDenseI32ArrayAttr(32 * numWarps));
```

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -771,7 +771,7 @@ void WarpSpecializeOp::build(OpBuilder &builder, OperationState &state,
                              ArrayRef<int32_t> partitionNumWarps,
                              unsigned partitionNumRegions) {
   build(builder, state, resultTypes, /*explicitCaptures=*/ValueRange(),
-        partitionNumWarps, /*warpGroupStartIds=*/{});
+        partitionNumWarps, {}, {}, {});
   OpBuilder::InsertionGuard guard(builder);
   Block *container = builder.createBlock(state.regions.back().get());
   builder.create<WarpSpecializePartitionsOp>(state.location,
```

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp

Lines changed: 14 additions & 2 deletions
```diff
@@ -381,17 +381,29 @@ LogicalResult triton::gpu::specializeLoadMMADependencies(scf::ForOp &loop,
            donePt.getPoint()->isBeforeInBlock(&*b.getInsertionPoint()));
     donePt = b.saveInsertionPoint();
 
-    // Acquire and get the accumulator result.
-    b.setInsertionPoint(domOp);
     Partition *userPartition = schedule.addPartition(numStages + numMmaStages);
+    // Acquire and get the accumulator result. Normally, we want to acquire
+    // the accumulator for as small a critical section as possible to unblock
+    // dependents, but if the most dominating user is inside a conditional,
+    // acquire the accumulator for the whole branch. This will improve
+    // instruction scheduling and interleaving of the TMEM load.
+    bool userInConditional = isa<scf::IfOp>(domOp->getParentOp());
+    b.setInsertionPoint(domOp);
+    if (userInConditional)
+      b.setInsertionPointToStart(domOp->getBlock());
     createInPartition<ttng::WaitBarrierOp>(b, *userPartition, curAccReadyBar,
                                            accPhase);
+
+    b.setInsertionPoint(domOp);
     Value acc = createInPartition<ttng::TMEMLoadOp>(
         b, *userPartition, info.accLoad.getType(), curAccBuf);
     for (Operation *user : accUses)
       user->replaceUsesOfWith(info.accLoad, acc);
+
     // Signal the accumulator buffer is ready for the next iteration. Because
     // the mbarriers got shifted over by 1, we have to signal the next mbarrier.
+    if (userInConditional)
+      b.setInsertionPoint(domOp->getBlock()->getTerminator());
     Value nextIndex =
         b.create<arith::AddIOp>(accIndex, intCst(numMmaStages - 1));
     nextIndex = b.create<arith::RemUIOp>(nextIndex, intCst(numMmaStages));
```

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/OptimizePartitionWarps.cpp

Lines changed: 29 additions & 6 deletions
```diff
@@ -5,11 +5,13 @@
 #include "triton/Analysis/AxisInfo.h"
 #include "triton/Conversion/TritonToTritonGPU/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 #include "llvm/ADT/ScopeExit.h"
 
 using namespace mlir;
 using namespace triton;
 using namespace triton::gpu;
+namespace ttng = triton::nvidia_gpu;
 
 //===----------------------------------------------------------------------===//
 // relayoutWarps
@@ -182,14 +184,28 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
   // If the compiler could control that, then we could allow non-uniform
   // register distributions, mostly beneficial for single-warp warpgroups that
   // just do some arithmetic.
-  constexpr unsigned nTotalRegs = 65536; // for Blackwell SMs
+  constexpr unsigned nTotalRegs = 1 << 16; // for Blackwell SMs
   const unsigned threadsPerWarp =
       TritonGPUDialect::getThreadsPerWarp(axisInfo.getModuleOp());
   const unsigned defaultNumWarps = lookupNumWarps(wsOp);
 
   SmallVector<int32_t> partitionNumWarps =
       llvm::to_vector(wsOp.getPartitionNumWarps());
 
+  // Some instructions are throughput-critical despite low register usage.
+  // Make sure there are enough warps for these ops to execute quickly.
+  SmallVector<int32_t> minWarpsForPartition(partitionNumWarps.size(), 1);
+  for (auto [minWarps, region] :
+       llvm::zip(minWarpsForPartition, wsOp.getPartitionRegions())) {
+    region->walk([minWarps = &minWarps](Operation *op) {
+      if (!isa<scf::ForOp>(op->getParentOp()))
+        return;
+      if (isa<ttng::AsyncTMAGatherOp, ttng::AsyncTMAScatterOp,
+              ttng::AsyncTMACopyGlobalToLocalOp>(op))
+        *minWarps = 2;
+    });
+  }
+
   bool changed;
   do {
     changed = false;
@@ -215,9 +231,9 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
     int32_t curTotalNumWarps = std::accumulate(
         partitionNumWarps.begin(), partitionNumWarps.end(), defaultNumWarps);
 
-    for (auto [numWarps, tensorRegs] :
-         llvm::zip(partitionNumWarps, maxTensorRegs)) {
-      if (numWarps == 1)
+    for (auto [minWarps, numWarps, tensorRegs] :
+         llvm::zip(minWarpsForPartition, partitionNumWarps, maxTensorRegs)) {
+      if (numWarps <= minWarps)
         continue;
       // Check if reducing the number of warps will still fit the tensor. If it
       // didn't fit to begin with, it won't fit after shrinking.
@@ -233,16 +249,23 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
     }
   } while (changed);
 
-  for (auto [partition, newNumWarps, prevNumWarps, tensorRegs] :
+  SmallVector<int32_t> estRegUsage(partitionNumWarps.size());
+  for (auto [partition, newNumWarps, prevNumWarps, tensorRegs, estRegs] :
       llvm::zip(wsOp.getPartitionRegions(), partitionNumWarps,
-                 wsOp.getPartitionNumWarps(), maxTensorRegs)) {
+                 wsOp.getPartitionNumWarps(), maxTensorRegs, estRegUsage)) {
+    // "Guess" the register usage for each partition.
+    estRegs = tensorRegs ? 80 : 48;
+
+    // Layouts need to be reassigned if the number of warps changed and there
+    // are tensor computations.
     if (newNumWarps == prevNumWarps || !tensorRegs)
       continue;
     // We need to reassign layouts.
     if (failed(relayoutWarps(axisInfo, partition, prevNumWarps, newNumWarps,
                              runPipeline)))
       return failure();
   }
+  wsOp.setRequestedRegisters(estRegUsage);
   wsOp.setPartitionNumWarps(partitionNumWarps);
   return success();
 }
```
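Two heuristics introduced here drive the test updates below. A minimal standalone sketch of both (helper names are illustrative, not part of the pass):

```cpp
#include <cstdint>

// Partitions that issue TMA gather/scatter or async global-to-local copies
// inside a loop are pinned to at least 2 warps, so these low-register,
// throughput-critical ops are not starved. This is why partition1 in the
// tests below now expects num_warps(2) instead of num_warps(1).
int32_t minWarpsForPartition(bool hasLoopTMAOps) {
  return hasLoopTMAOps ? 2 : 1;
}

// Per-partition register estimate recorded as `requestedRegisters`: 80 if the
// partition performs tensor computations, 48 otherwise. AllocateWarpGroups
// later rounds these up to multiples of 8 and balances them per warpgroup.
int32_t estimateRegisters(bool hasTensorRegs) {
  return hasTensorRegs ? 80 : 48;
}
```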

test/Conversion/allocate_warp_groups.mlir

Lines changed: 25 additions & 0 deletions
```diff
@@ -63,3 +63,28 @@ tt.func @two_warp_specialize() {
 }
 
 }
+
+// -----
+
+// CHECK: module attributes {ttg.maxnreg = 168 : i32
+module attributes {"ttg.num-warps" = 8 : i32} {
+
+tt.func @setmaxnreg() {
+  // CHECK: actualRegisters = array<i32: 208, 80, 80, 80>
+  ttg.warp_specialize() attributes {requestedRegisters = array<i32: 48, 80, 48>}
+  default {
+    ttg.warp_yield
+  }
+  partition0() num_warps(1) {
+    ttg.warp_return
+  }
+  partition1() num_warps(2) {
+    ttg.warp_return
+  }
+  partition2() num_warps(1) {
+    ttg.warp_return
+  } : () -> ()
+  tt.return
+}
+
+}
```

test/TritonGPU/automatic-warp-specialization.mlir

Lines changed: 2 additions & 2 deletions
```diff
@@ -32,7 +32,7 @@ tt.func @matmul_change_desc_in_prologue(
 // BASE-NOT: tt.make_tensor_descriptor
 // PIPELINE-NOT: tt.experimental_tensormap_create
 // CHECK-LABEL: partition1
-// CHECK-SAME: num_warps(1)
+// CHECK-SAME: num_warps(2)
 // BASE-COUNT-2: tt.make_tensor_descriptor
 // PIPELINE-COUNT-2: ttg.global_scratch_alloc {alignment = 128 : i32, nbytes = 512 : i32}
 // PIPELINE-COUNT-2: tt.experimental_tensormap_create
@@ -87,7 +87,7 @@ tt.func @matmul_tma_acc_with_conditional_def_and_use(
 // CHECK-LABEL: partition0
 // CHECK-SAME: num_warps(1)
 // CHECK-LABEL: partition1
-// CHECK-SAME: num_warps(1)
+// CHECK-SAME: num_warps(2)
 // CHECK: [[INDICES:%.*]] = tt.splat %{{.*}} : i32 -> tensor<128xi32,
 // CHECK: ttng.async_tma_gather %{{.*}}[[[INDICES]],
 // CHECK-LABEL: partition2
```
