
Commit 711caa4

[TritonGPU] Control dynamic register allocation from triton (#6407)
This pipes through the compiler the ability to set dynamic register usage in warp specialization. The middle-end estimates how many registers each partition will use as part of deciding how many warps each partition gets. The warpgroup allocator then groups partitions into warpgroups and works out the actual distribution of registers. If registers can be redistributed, it records a final register count per warpgroup, which in turn generates `nvvm.setmaxregister` directives. Estimating register usage from TTGIR is in general not possible, so for now the estimates are hard-coded.
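To make the redistribution concrete, below is a minimal standalone sketch of the allocator's arithmetic (an editorial illustration mirroring the AllocateWarpGroups.cpp change in this commit, not code from it), using the numbers from the new allocate_warp_groups.mlir test: 8 default warps plus partitions of 1 + 2 + 1 warps requesting 48/80/48 registers, at 32 threads per warp.

#include <cstdio>

int main() {
  // Per-thread budget: 64K registers per SM divided across all 12 warps,
  // snapped down to a multiple of 8 (the setmaxnreg granularity).
  int maxnreg = (1 << 16) / (8 + 4) / 32 / 8 * 8; // 170 -> 168
  // The partitions form one 4-warp warpgroup requesting max(48, 80, 48) = 80.
  int deficit = (maxnreg - 80) * 4 * 32; // 11264 registers freed
  // The freed registers are handed to the 8-warp default warpgroup.
  int leftover = (8 * 32 * maxnreg + deficit) / (8 * 32) / 8 * 8; // 212 -> 208
  std::printf("maxnreg=%d, actualRegisters=[%d, 80, 80, 80]\n", maxnreg, leftover);
}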
1 parent 366de71 commit 711caa4

File tree: 10 files changed, +203 −22 lines

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ template <> struct hash<CacheKey> {
 
 namespace mlir::triton::gpu {
 
+constexpr static char AttrMaxRegistersName[] = "ttg.maxnreg";
 constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
 constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
 constexpr static char AttrTargetName[] = "ttg.target";

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 3 additions & 1 deletion
@@ -376,7 +376,9 @@ def TTG_WarpSpecializeOp : TTG_Op<"warp_specialize", [
   let arguments = (ins
     Variadic<AnyType>:$explicitCaptures,
     DenseI32ArrayAttr:$partitionNumWarps,
-    OptionalAttr<DenseI32ArrayAttr>:$warpGroupStartIds
+    OptionalAttr<DenseI32ArrayAttr>:$warpGroupStartIds,
+    OptionalAttr<DenseI32ArrayAttr>:$requestedRegisters,
+    OptionalAttr<DenseI32ArrayAttr>:$actualRegisters
   );
   let results = (outs Variadic<AnyType>:$defaultPassthrough);

lib/Conversion/TritonGPUToLLVM/AllocateWarpGroups.cpp

Lines changed: 89 additions & 0 deletions
@@ -18,6 +18,20 @@ struct AllocateWarpGroups
   void runOnOperation() override {
     ModuleOp mod = getOperation();
 
+    int threadsPerWarp = TritonGPUDialect::getThreadsPerWarp(mod);
+
+    struct WarpGroupInfo {
+      SmallVector<Region *> partitions;
+      int maxRequestedRegs = 0;
+      unsigned numWarps = 0;
+    };
+    struct WarpGroupPartition {
+      int startId;
+      Region *partition;
+      int32_t estRegs;
+      int numWarps;
+    };
+
     // Compute the total number of warps required at any given time.
     int baseNumWarps = lookupNumWarps(mod);
     int maxExtraWarps = 0;
@@ -42,6 +56,81 @@ struct AllocateWarpGroups
         startId += size;
      }
      op.setWarpGroupStartIds(startIds);
+
+      // Require that an estimate has been set and that we have even warpgroups.
+      auto regsAttr = op.getRequestedRegisters();
+      if (!regsAttr || op.getTotalPartitionWarps() % 4 != 0)
+        return;
+
+      // Group the partitions into warpgroups.
+      SmallVector<WarpGroupPartition> orderedPartitions;
+      for (auto [startId, partition, estRegs, numWarps] :
+           llvm::zip(startIds, op.getPartitionRegions(), *regsAttr, arr))
+        orderedPartitions.push_back({startId, partition, estRegs, numWarps});
+      llvm::sort(orderedPartitions,
+                 [&](auto lhs, auto rhs) { return lhs.startId < rhs.startId; });
+
+      // Iterate over the partitions and assign them to warp groups. Determine
+      // the maximum number of requested registers per warp group.
+      SmallVector<WarpGroupInfo> warpGroups;
+      for (auto [startId, partition, estRegs, numWarps] : orderedPartitions) {
+        if (startId % 4 == 0) {
+          warpGroups.push_back(WarpGroupInfo{});
+        }
+        warpGroups.back().partitions.push_back(partition);
+        // Round up to the nearest multiple of 8.
+        int estRegsCeil8 = llvm::divideCeil(estRegs, 8) * 8;
+        warpGroups.back().maxRequestedRegs =
+            std::max<int>(warpGroups.back().maxRequestedRegs, estRegsCeil8);
+        warpGroups.back().numWarps += numWarps;
+      }
+
+      // Determine the maximum number of registers per thread. This may have
+      // been set by the user.
+      int maxnreg;
+      if (auto maxnregAttr =
+              op->getAttrOfType<IntegerAttr>(AttrMaxRegistersName)) {
+        maxnreg = maxnregAttr.getInt();
+      } else {
+        maxnreg = (1 << 16) / (baseNumWarps + op.getTotalPartitionWarps()) /
+                  threadsPerWarp;
+        maxnreg = maxnreg / 8 * 8;
+      }
+
+      // Compute the register deficit over the partition warp groups.
+      int registerDeficit = 0;
+      for (const WarpGroupInfo &wg : warpGroups) {
+        assert(wg.numWarps % 4 == 0);
+        registerDeficit +=
+            (maxnreg - wg.maxRequestedRegs) * wg.numWarps * threadsPerWarp;
+      }
+      if (registerDeficit <= 0)
+        return;
+
+      // Determine the number of extra registers that we can distribute to the
+      // default warp group.
+      int leftover =
+          ((baseNumWarps * threadsPerWarp * maxnreg) + registerDeficit) /
+          baseNumWarps / threadsPerWarp;
+      // Round down to the nearest multiple of 8.
+      leftover = leftover / 8 * 8;
+
+      // Generate setmaxnreg in each partition according to its warp group.
+      SmallVector<int32_t> maxnregsPerPartition(1 + arr.size());
+      for (const WarpGroupInfo &wg : warpGroups) {
+        for (Region *region : wg.partitions) {
+          maxnregsPerPartition[1 + region->getRegionNumber()] =
+              wg.maxRequestedRegs;
+        }
+      }
+      // Set the register usage for the default warp group.
+      maxnregsPerPartition.front() = leftover;
+      op.setActualRegisters(maxnregsPerPartition);
+
+      // Set the initial max number of registers. This is needed for PTXAS to
+      // cooperate.
+      mod->setAttr(AttrMaxRegistersName,
+                   Builder(op.getContext()).getI32IntegerAttr(maxnreg));
    });
 
    Builder b(&getContext());
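Two details worth noting in the hunk above: register counts are snapped to multiples of 8 because the PTX `setmaxnreg` directive only accepts multiples of 8 in the range 24–256, and the redistribution only fires when the partition warps form whole warpgroups (`getTotalPartitionWarps() % 4 == 0`), since `setmaxnreg` takes effect at warpgroup granularity.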

lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 14 additions & 6 deletions
@@ -125,11 +125,11 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
           mlir::IntegerType::get(llvmFuncOp.getContext(), 8);
       const auto arrayType = mlir::LLVM::LLVMArrayType::get(
           llvmFuncOp.getContext(), byteType, 128);
-      llvmFuncOp.setArgAttr(i, "llvm.byval",
+      llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getByValAttrName(),
                            mlir::TypeAttr::get(arrayType));
-      llvmFuncOp.setArgAttr(i, "nvvm.grid_constant",
+      llvmFuncOp.setArgAttr(i, NVVM::NVVMDialect::getGridConstantAttrName(),
                            mlir::UnitAttr::get(llvmFuncOp.getContext()));
-      llvmFuncOp.setArgAttr(i, "llvm.align",
+      llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getAlignAttrName(),
                            mlir::IntegerAttr::get(i32_type, 64));
     }
   }
@@ -155,7 +155,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
 
     if (LLVM::isKernel(funcOp)) {
       // Set an attribute to indicate this function is a kernel entry.
-      newFuncOp->setAttr("nvvm.kernel",
+      newFuncOp->setAttr(NVVM::NVVMDialect::getKernelFuncAttrName(),
                          rewriter.getIntegerAttr(type::u1Ty(ctx), 1));
       newFuncOp.setLinkage(LLVM::Linkage::External);
     } else {
@@ -166,12 +166,20 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
           ArrayAttr::get(ctx, rewriter.getStringAttr("noinline")));
       newFuncOp.setLinkage(LLVM::Linkage::Internal);
     }
-    // Set an attribute for reqntidx, it could be used in latter LLVM codegen
-    // for `nvvm.annotation` metadata.
+
+    // Determine the actual number of required warps.
     int numWarps = triton::gpu::lookupNumWarps(funcOp);
     if (auto totalNumWarps = funcOp.getParentOp()->getAttrOfType<IntegerAttr>(
            "ttg.total-num-warps"))
      numWarps = totalNumWarps.getInt();
+
+    // Set `nvvm.maxnreg` if it was specified on the module.
+    if (Attribute maxnregAttr =
+            funcOp.getParentOp()->getAttr(triton::gpu::AttrMaxRegistersName))
+      newFuncOp->setAttr(NVVM::NVVMDialect::getMaxnregAttrName(), maxnregAttr);
+
+    // Set an attribute for reqntidx, it could be used in latter LLVM codegen
+    // for `nvvm.annotation` metadata.
     newFuncOp->setAttr(NVVM::NVVMDialect::getReqntidAttrName(),
                        rewriter.getDenseI32ArrayAttr(32 * numWarps));
 
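For the warp-specialized example in the new test, `ttg.total-num-warps` would be 12, so the kernel is annotated with `reqntid = 384`, and the module's `ttg.maxnreg = 168` is forwarded as `nvvm.maxnreg`, giving PTXAS the static per-thread cap that the runtime `setmaxnreg` directives later adjust per warpgroup.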

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 1 addition & 1 deletion
@@ -769,7 +769,7 @@ void WarpSpecializeOp::build(OpBuilder &builder, OperationState &state,
                              ArrayRef<int32_t> partitionNumWarps,
                              unsigned partitionNumRegions) {
   build(builder, state, resultTypes, /*explicitCaptures=*/ValueRange(),
-        partitionNumWarps, /*warpGroupStartIds=*/{});
+        partitionNumWarps, {}, {}, {});
   OpBuilder::InsertionGuard guard(builder);
   Block *container = builder.createBlock(state.regions.back().get());
   builder.create<WarpSpecializePartitionsOp>(state.location,

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/OptimizePartitionWarps.cpp

Lines changed: 9 additions & 2 deletions
@@ -249,16 +249,23 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
     }
   } while (changed);
 
-  for (auto [partition, newNumWarps, prevNumWarps, tensorRegs] :
+  SmallVector<int32_t> estRegUsage(partitionNumWarps.size());
+  for (auto [partition, newNumWarps, prevNumWarps, tensorRegs, estRegs] :
        llvm::zip(wsOp.getPartitionRegions(), partitionNumWarps,
-                 wsOp.getPartitionNumWarps(), maxTensorRegs)) {
+                 wsOp.getPartitionNumWarps(), maxTensorRegs, estRegUsage)) {
+    // "Guess" the register usage for each partition.
+    estRegs = tensorRegs ? 80 : 48;
+
+    // Layouts need to be reassigned if the number of warps changed and there
+    // are tensor computations.
     if (newNumWarps == prevNumWarps || !tensorRegs)
       continue;
     // We need to reassign layouts.
     if (failed(relayoutWarps(axisInfo, partition, prevNumWarps, newNumWarps,
                             runPipeline)))
      return failure();
  }
+  wsOp.setRequestedRegisters(estRegUsage);
  wsOp.setPartitionNumWarps(partitionNumWarps);
  return success();
}
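The 80-versus-48 split is the hard-coding the commit message refers to: a partition that materializes tensor values is assumed to need roughly 80 registers per thread, while one that only does control flow and data movement gets by on about 48. These guesses become the `requestedRegisters` array that AllocateWarpGroups consumes, e.g. `array<i32: 48, 80, 48>` in the test below.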

test/Conversion/allocate_warp_groups.mlir

Lines changed: 25 additions & 0 deletions
@@ -63,3 +63,28 @@ tt.func @two_warp_specialize() {
 }
 
 }
+
+// -----
+
+// CHECK: module attributes {ttg.maxnreg = 168 : i32
+module attributes {"ttg.num-warps" = 8 : i32} {
+
+tt.func @setmaxnreg() {
+  // CHECK: actualRegisters = array<i32: 208, 80, 80, 80>
+  ttg.warp_specialize() attributes {requestedRegisters = array<i32: 48, 80, 48>}
+  default {
+    ttg.warp_yield
+  }
+  partition0() num_warps(1) {
+    ttg.warp_return
+  }
+  partition1() num_warps(2) {
+    ttg.warp_return
+  }
+  partition2() num_warps(1) {
+    ttg.warp_return
+  } : () -> ()
+  tt.return
+}
+
+}

third_party/nvidia/backend/compiler.py

Lines changed: 4 additions & 6 deletions
@@ -238,6 +238,10 @@ def make_ttir(mod, metadata, opt):
 
     @staticmethod
     def make_ttgir(mod, metadata, opt, capability):
+        # Set maxnreg on all kernels, if it was provided.
+        if opt.maxnreg is not None:
+            mod.set_attr("ttg.maxnreg", ir.builder(mod.context).get_int32_attr(opt.maxnreg))
+
         cluster_info = nvidia.ClusterInfo()
         if opt.cluster_dims is not None:
             cluster_info.clusterDimX = opt.cluster_dims[0]
@@ -335,12 +339,6 @@ def make_llir(self, src, metadata, options, capability):
         llvm.attach_datalayout(llvm_mod, triple, proc, features)
         nvidia.set_nvvm_reflect_ftz(llvm_mod)
 
-        # Set maxnreg on all kernels, if it was provided.
-        if options.maxnreg is not None:
-            for k in llvm_mod.get_functions():
-                if not k.is_declaration() and k.is_external_linkage():
-                    k.set_nvvm_maxnreg(options.maxnreg)
-
         if options.extern_libs:
             paths = [path for (name, path) in options.extern_libs]
             llvm.link_extern_libs(llvm_mod, paths)

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp

Lines changed: 20 additions & 0 deletions
@@ -216,6 +216,26 @@ static LogicalResult rewriteWarpGroupBarriers(LLVM::LLVMFuncOp func,
         bar.erase();
       });
     }
+
+    if (auto actRegisters = op.getActualRegisters()) {
+      int maxnreg = func->getParentOfType<ModuleOp>()
+                        ->getAttrOfType<IntegerAttr>(AttrMaxRegistersName)
+                        .getInt();
+      auto b = OpBuilder::atBlockBegin(&op.getDefaultRegion().front());
+      b.create<NVVM::SetMaxRegisterOp>(op.getLoc(),
+                                       std::min(256, actRegisters->front()),
+                                       NVVM::SetMaxRegisterAction::increase);
+      for (auto [actRegs, region] :
+           llvm::zip(actRegisters->drop_front(), op.getPartitionRegions())) {
+        if (actRegs == maxnreg)
+          continue;
+        auto action = actRegs < maxnreg ? NVVM::SetMaxRegisterAction::decrease
+                                        : NVVM::SetMaxRegisterAction::increase;
+        b.setInsertionPointToStart(&region->front());
+        b.create<NVVM::SetMaxRegisterOp>(op.getLoc(), std::min(256, actRegs),
+                                         action);
+      }
+    }
   }
 
   return success();
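For the test case above, this emits a single `nvvm.setmaxregister` increase to 208 at the top of the default region and a decrease to 80 at the top of each partition region: the module-level maxnreg of 168 serves as the reference point that picks the direction, and the `std::min(256, ...)` clamps requests to the 256-register ceiling that `setmaxnreg` accepts.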

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TensorMemoryToLLVM.cpp

Lines changed: 37 additions & 6 deletions
@@ -100,8 +100,8 @@ TMemMessageTraits getTMemMessageFromAtom(const TMemAccessAtom &atom,
 // Only allows half of the thread registers to be used for tensor memory access
 // to avoid register pressure. This ensures the largest tmem message width is
 // used for the workload without inducing spills.
-int getTMemMessageNarrowingFactor(int workloadThreadRegs) {
-  const int allowedRegUsage = maxRegisters / 2;
+int getTMemMessageNarrowingFactor(int workloadThreadRegs, int maxnreg) {
+  const int allowedRegUsage = maxnreg / 2;
   int narrowingFactor = 1;
   while (workloadThreadRegs > allowedRegUsage) {
     workloadThreadRegs /= 2;
@@ -338,13 +338,13 @@ void createWaitOpSt(Location loc, ConversionPatternRewriter &rewriter) {
   ptxBuilder.launch(rewriter, loc, void_ty(rewriter.getContext()));
 }
 
-TMemMessageTraits selectTMemMessage(const TMemRuntimeInfo &info) {
+TMemMessageTraits selectTMemMessage(const TMemRuntimeInfo &info, int maxnreg) {
   auto atom = info.useStridedMessage ? TMemAccess16x32bx2 : TMemAccess32x32b;
 
   int totalRegsNeeded =
       getEffectiveRegs(info.unpackedb16, info.useStridedMessage,
                        info.numCols / info.numWarpGroups);
-  int narrowingFactor = getTMemMessageNarrowingFactor(totalRegsNeeded);
+  int narrowingFactor = getTMemMessageNarrowingFactor(totalRegsNeeded, maxnreg);
   auto narrowedMessage = getTMemMessageFromAtom(atom, narrowingFactor);
   narrowedMessage = constrainMessageFromWorkload(narrowedMessage, info,
                                                  narrowedMessage.numRegs);
@@ -355,6 +355,35 @@ TMemMessageTraits selectTMemMessage(const TMemRuntimeInfo &info, int maxnreg) {
   return std::min(narrowedMessage, maxWidthMessage);
 }
 
+// Get the maximum number of registers per thread based on the context. This is
+// by default 256, but it can be overridden by `ttg.maxnreg` set on the module.
+// Alternatively, warp groups within warp specialized regions can have a
+// different number of registers allocated.
+static int getContextualMaxNReg(Operation *op) {
+  if (auto mod = dyn_cast<ModuleOp>(op)) {
+    // Check for a maxnreg attribute.
+    if (auto attr = op->getAttrOfType<IntegerAttr>(AttrMaxRegistersName))
+      return std::max<int>(maxRegisters, attr.getInt());
+
+  } else if (auto partitions =
+                 dyn_cast<WarpSpecializePartitionsOp>(op->getParentOp())) {
+    // Check if the partition has reduced registers.
+    unsigned idx = op->getParentRegion()->getRegionNumber();
+    if (auto actRegisters = partitions.getParentOp().getActualRegisters())
+      return std::max<int>(maxRegisters, (*actRegisters)[1 + idx]);
+    return getContextualMaxNReg(partitions.getParentOp());
+
+  } else if (auto wsOp = dyn_cast<WarpSpecializeOp>(op->getParentOp())) {
+    // Check the register usage of the default warpgroup.
+    if (auto actRegisters = wsOp.getActualRegisters())
+      return std::max<int>(maxRegisters, actRegisters->front());
+  }
+
+  if (Operation *parent = op->getParentOp())
+    return getContextualMaxNReg(parent);
+  return maxRegisters;
+}
+
 static void lowerStoreToTensorMemory(Location loc, Operation *op, Value src,
                                      Value dest, Value llSrc, Value pred,
                                      Value tmemBase,
@@ -365,7 +394,8 @@ static void lowerStoreToTensorMemory(Location loc, Operation *op, Value src,
   auto dstType = cast<MemDescType>(dest.getType());
   auto info = getTMemRuntimeInfo(op, cast<RankedTensorType>(src.getType()),
                                  cast<MemDescType>(dest.getType()));
-  const TMemMessageTraits message = selectTMemMessage(info);
+  const TMemMessageTraits message =
+      selectTMemMessage(info, getContextualMaxNReg(op));
   int regIdx = 0;
   calculateAddressAndEmitTmemMessage(
       loc, tmemBase, info, message, rewriter,
@@ -503,7 +533,8 @@ struct TensorMemoryLoadOpConversion
 
     auto info = getTMemRuntimeInfo(op, cast<RankedTensorType>(op.getType()),
                                    cast<MemDescType>(op.getSrc().getType()));
-    const TMemMessageTraits message = selectTMemMessage(info);
+    const TMemMessageTraits message =
+        selectTMemMessage(info, getContextualMaxNReg(op));
     SmallVector<Value> resultVals;
     calculateAddressAndEmitTmemMessage(
         loc, tmemBase, info, message, rewriter,
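The net effect is that tmem message selection now keys off the contextual register budget rather than a single global constant: an op inside a warp-specialized partition consults that partition's entry in `actualRegisters`, an op in the default region consults the boosted front entry, and anything else falls back to the module's `ttg.maxnreg`, so a warpgroup running with a reduced budget can be steered toward narrower tensor-memory messages.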
