Commit 31e21cc

Merge OpenAI Triton commit 711caa4 (#3845)

This PR changes the Triton base from ff4f0bd to 711caa4 (Apr 4). Pass rate: 90.8%.

2 parents 1d6ae76 + 71e0eb7; commit 31e21cc

62 files changed, +1043 -472 lines

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 6 additions & 3 deletions
@@ -1314,7 +1314,7 @@ def TT_DescriptorStoreOp : TT_Op<"descriptor_store", [
 def TT_DescriptorGatherOp : TT_Op<"descriptor_gather", [MemoryEffects<[MemRead<GlobalMemory>]>]> {
   let summary = "gather multiple rows from a descriptor into a single tensor";
   let description = [{
-    The `tt.desciptor_gather` op will be lowered to NVIDIA TMA
+    The `tt.descriptor_gather` op will be lowered to NVIDIA TMA
     load operations on targets that support it.

     `desc_ptr` is a pointer to the TMA descriptor allocated in global memory.
@@ -1340,9 +1340,10 @@ def TT_DescriptorGatherOp : TT_Op<"descriptor_gather", [MemoryEffects<[MemRead<G
   let hasVerifier = 1;

   let extraClassDeclaration = [{
-    // TMA gathers have resstrictions on the minimum size of the gather result.
+    // TMA gathers have restrictions on the minimum size of the gather result.
     // This function verifies the result type.
-    static LogicalResult verifyResultType(Operation *op, mlir::ShapedType type);
+    static LogicalResult verifyResultType(Operation *op, ShapedType resultType,
+                                          RankedTensorType indicesType);
   }];
 }

@@ -1360,6 +1361,8 @@ def TT_DescriptorScatterOp : TT_Op<"descriptor_scatter", [
     $desc `[` $x_offsets `,` $y_offset `]` `,` $src
     attr-dict `:` type(operands)
   }];
+
+  let hasVerifier = 1;
 }

 def TT_ExperimentalTensormapCreateOp: TT_Op<
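
The widened `verifyResultType` signature above suggests the verifier now cross-checks the gather result against the index tensor rather than inspecting the result type alone. A minimal sketch of what such a check could look like, assuming (purely for illustration) a rule that ties the result's leading dimension to the number of indices; the helper and constraint here are hypothetical stand-ins, not the commit's actual logic:

#include <cstdint>
#include <vector>

// Illustrative stand-in for an MLIR shaped type.
struct Shape {
  std::vector<int64_t> dims;
};

// Assumed rule for illustration only: gathering N row indices must produce a
// 2-D result whose leading dimension is N.
bool verifyGatherResult(const Shape &result, const Shape &indices) {
  if (indices.dims.size() != 1 || result.dims.size() != 2)
    return false;
  return result.dims[0] == indices.dims[0];
}

int main() {
  Shape indices{{32}};    // 32 gathered rows
  Shape ok{{32, 128}};    // leading dim matches the index count
  Shape bad{{16, 128}};   // mismatch: would be rejected
  return (verifyGatherResult(ok, indices) && !verifyGatherResult(bad, indices))
             ? 0
             : 1;
}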

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ template <> struct hash<CacheKey> {

 namespace mlir::triton::gpu {

+constexpr static char AttrMaxRegistersName[] = "ttg.maxnreg";
 constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
 constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
 constexpr static char AttrTargetName[] = "ttg.target";

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 3 additions & 1 deletion
@@ -376,7 +376,9 @@ def TTG_WarpSpecializeOp : TTG_Op<"warp_specialize", [
   let arguments = (ins
     Variadic<AnyType>:$explicitCaptures,
     DenseI32ArrayAttr:$partitionNumWarps,
-    OptionalAttr<DenseI32ArrayAttr>:$warpGroupStartIds
+    OptionalAttr<DenseI32ArrayAttr>:$warpGroupStartIds,
+    OptionalAttr<DenseI32ArrayAttr>:$requestedRegisters,
+    OptionalAttr<DenseI32ArrayAttr>:$actualRegisters
   );
   let results = (outs Variadic<AnyType>:$defaultPassthrough);

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ std::pair<OpResult, int64_t> getDefinitionAndDistance(scf::ForOp forOp,
 std::pair<Operation *, int64_t> getDefiningOpAndDistance(scf::ForOp forOp,
                                                          Value value);

-// Return maxumum length of the vectorized copy between registers and shared
+// Return maximum length of the vectorized copy between registers and shared
 // memory for the given tensor type and shared encoding.
 int getCopyVecBytes(RankedTensorType registerTy,
                     gpu::SharedEncodingTrait sharedEnc);

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 2 additions & 3 deletions
@@ -28,9 +28,6 @@ void lowerLoops(ModuleOp moduleOp);
 /// Pipeline the TMA stores in the loop.
 bool pipelineTMAStores(scf::ForOp forOp);

-/// Simple pipelining for the MMA ops which accumulator is modified in the loop.
-scf::ForOp pipelineMMAWithScaledAcc(scf::ForOp forOp);
-
 /// This does post-processing on the pipelined loop to try to pipeline wgmma
 /// ops.
 // TODO: this should be included as part of the pipeline but currently the wgmma
@@ -75,6 +72,8 @@ class CoarseSchedule {
   }

   bool isBefore(iterator a, iterator b) const {
+    if (a == b)
+      return false;
     for (auto it = begin(); it != end(); ++it) {
       if (it == a)
         return true;
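
A note on the `isBefore` change: without the new guard, the linear scan returns true as soon as it reaches `a`, so `isBefore(a, a)` would report true; the early exit makes the relation irreflexive, as a strict "comes before" check must be. A self-contained sketch of the fixed contract, using a plain vector in place of the schedule's internal order (the names here are illustrative, not from the commit):

#include <cassert>
#include <vector>

// Strict "comes before" check over an ordered list, mirroring the early exit
// added to CoarseSchedule::isBefore.
bool isBefore(const std::vector<int> &order, int a, int b) {
  if (a == b)
    return false; // never report an element as before itself
  for (int it : order) {
    if (it == a)
      return true; // reached a first: a comes before b
    if (it == b)
      return false; // reached b first: a does not come before b
  }
  return false;
}

int main() {
  std::vector<int> order = {3, 1, 2};
  assert(isBefore(order, 3, 2));
  assert(!isBefore(order, 2, 3));
  assert(!isBefore(order, 1, 1)); // the case the new guard fixes
}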

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 5 additions & 0 deletions
@@ -237,6 +237,11 @@ SetVector<Value> getNestedOperands(Operation *op);
 // Erase the given loop carried values from the loop, where `loop` is replaced
 // with a new loop.
 void eraseLoopCarriedValues(scf::ForOp &loop, llvm::BitVector indices);
+
+// Return true if two value sets may refer to the same allocation.
+bool mayAliasAllocations(const DenseSet<Value> &lhs,
+                         const DenseSet<Value> &rhs);
+
 } // namespace mlir

 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 2 additions & 0 deletions
@@ -366,6 +366,8 @@ def TTNG_AsyncTMAScatterOp : TTNG_Op<"async_tma_scatter", [DeclareOpInterfaceMet
     $desc_ptr `[` $x_offsets `,` $y_offset `]` $src
     attr-dict `:` type(operands)
   }];
+
+  let hasVerifier = 1;
 }

 def TTNG_TMAStoreWaitOp : TTNG_Op<"async_tma_store_wait"> {

lib/Conversion/TritonGPUToLLVM/AllocateWarpGroups.cpp

Lines changed: 89 additions & 0 deletions
@@ -18,6 +18,20 @@ struct AllocateWarpGroups
   void runOnOperation() override {
     ModuleOp mod = getOperation();

+    int threadsPerWarp = TritonGPUDialect::getThreadsPerWarp(mod);
+
+    struct WarpGroupInfo {
+      SmallVector<Region *> partitions;
+      int maxRequestedRegs = 0;
+      unsigned numWarps = 0;
+    };
+    struct WarpGroupPartition {
+      int startId;
+      Region *partition;
+      int32_t estRegs;
+      int numWarps;
+    };
+
     // Compute the total number of warps required at any given time.
     int baseNumWarps = lookupNumWarps(mod);
     int maxExtraWarps = 0;
@@ -42,6 +56,81 @@ struct AllocateWarpGroups
         startId += size;
       }
       op.setWarpGroupStartIds(startIds);
+
+      // Require that an estimate has been set and that we have even warpgroups.
+      auto regsAttr = op.getRequestedRegisters();
+      if (!regsAttr || op.getTotalPartitionWarps() % 4 != 0)
+        return;
+
+      // Group the partitions into warpgroups.
+      SmallVector<WarpGroupPartition> orderedPartitions;
+      for (auto [startId, partition, estRegs, numWarps] :
+           llvm::zip(startIds, op.getPartitionRegions(), *regsAttr, arr))
+        orderedPartitions.push_back({startId, partition, estRegs, numWarps});
+      llvm::sort(orderedPartitions,
+                 [&](auto lhs, auto rhs) { return lhs.startId < rhs.startId; });
+
+      // Iterate over the partitions and assign them to warp groups. Determine
+      // the maximum number of requested registers per warp group.
+      SmallVector<WarpGroupInfo> warpGroups;
+      for (auto [startId, partition, estRegs, numWarps] : orderedPartitions) {
+        if (startId % 4 == 0) {
+          warpGroups.push_back(WarpGroupInfo{});
+        }
+        warpGroups.back().partitions.push_back(partition);
+        // Round up to the nearest multiple of 8.
+        int estRegsCeil8 = llvm::divideCeil(estRegs, 8) * 8;
+        warpGroups.back().maxRequestedRegs =
+            std::max<int>(warpGroups.back().maxRequestedRegs, estRegsCeil8);
+        warpGroups.back().numWarps += numWarps;
+      }
+
+      // Determine the maximum number of registers per thread. This may have
+      // been set by the user.
+      int maxnreg;
+      if (auto maxnregAttr =
+              op->getAttrOfType<IntegerAttr>(AttrMaxRegistersName)) {
+        maxnreg = maxnregAttr.getInt();
+      } else {
+        maxnreg = (1 << 16) / (baseNumWarps + op.getTotalPartitionWarps()) /
                  threadsPerWarp;
+        maxnreg = maxnreg / 8 * 8;
+      }
+
+      // Compute the register deficit over the partition warp groups.
+      int registerDeficit = 0;
+      for (const WarpGroupInfo &wg : warpGroups) {
+        assert(wg.numWarps % 4 == 0);
+        registerDeficit +=
+            (maxnreg - wg.maxRequestedRegs) * wg.numWarps * threadsPerWarp;
+      }
+      if (registerDeficit <= 0)
+        return;
+
+      // Determine the number of extra registers that we can distribute to the
+      // default warp group.
+      int leftover =
+          ((baseNumWarps * threadsPerWarp * maxnreg) + registerDeficit) /
+          baseNumWarps / threadsPerWarp;
+      // Round down to the nearest multiple of 8.
+      leftover = leftover / 8 * 8;
+
+      // Generate setmaxnreg in each partition according to its warp group.
+      SmallVector<int32_t> maxnregsPerPartition(1 + arr.size());
+      for (const WarpGroupInfo &wg : warpGroups) {
+        for (Region *region : wg.partitions) {
+          maxnregsPerPartition[1 + region->getRegionNumber()] =
+              wg.maxRequestedRegs;
+        }
+      }
+      // Set the register usage for the default warp group.
+      maxnregsPerPartition.front() = leftover;
+      op.setActualRegisters(maxnregsPerPartition);
+
+      // Set the initial max number of registers. This is needed for PTXAS to
+      // cooperate.
+      mod->setAttr(AttrMaxRegistersName,
+                   Builder(op.getContext()).getI32IntegerAttr(maxnreg));
     });

     Builder b(&getContext());
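
To make the register-balancing arithmetic above concrete, here is a hedged worked example. The inputs (4 default warps, two partition warpgroups totaling 8 warps, each estimating 48 registers per thread, 32 threads per warp) are invented for illustration; only the formulas mirror the pass.

#include <cstdio>

int main() {
  const int threadsPerWarp = 32;
  const int baseNumWarps = 4;   // default warp group
  const int partitionWarps = 8; // two partition warpgroups of 4 warps
  const int requestedRegs = 48; // per-thread estimate for each partition

  // Default per-thread cap: 64K registers split evenly over all warps,
  // rounded down to a multiple of 8: 65536 / 12 / 32 = 170 -> 168.
  int maxnreg = (1 << 16) / (baseNumWarps + partitionWarps) / threadsPerWarp;
  maxnreg = maxnreg / 8 * 8;

  // Each partition warpgroup is capped at its estimate (rounded up to a
  // multiple of 8), freeing registers relative to the even split:
  // (168 - 48) * 8 * 32 = 30720.
  int maxRequestedRegs = (requestedRegs + 7) / 8 * 8;
  int registerDeficit =
      (maxnreg - maxRequestedRegs) * partitionWarps * threadsPerWarp;

  // The freed registers are redistributed to the default warp group:
  // (4 * 32 * 168 + 30720) / 4 / 32 = 408, already a multiple of 8.
  int leftover = (baseNumWarps * threadsPerWarp * maxnreg + registerDeficit) /
                 baseNumWarps / threadsPerWarp;
  leftover = leftover / 8 * 8;

  std::printf("maxnreg=%d deficit=%d default-group=%d\n", maxnreg,
              registerDeficit, leftover);
  return 0;
}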

lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 16 additions & 7 deletions
@@ -1,3 +1,4 @@
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
@@ -124,11 +125,11 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
           mlir::IntegerType::get(llvmFuncOp.getContext(), 8);
       const auto arrayType = mlir::LLVM::LLVMArrayType::get(
           llvmFuncOp.getContext(), byteType, 128);
-      llvmFuncOp.setArgAttr(i, "llvm.byval",
+      llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getByValAttrName(),
                             mlir::TypeAttr::get(arrayType));
-      llvmFuncOp.setArgAttr(i, "nvvm.grid_constant",
+      llvmFuncOp.setArgAttr(i, NVVM::NVVMDialect::getGridConstantAttrName(),
                             mlir::UnitAttr::get(llvmFuncOp.getContext()));
-      llvmFuncOp.setArgAttr(i, "llvm.align",
+      llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getAlignAttrName(),
                             mlir::IntegerAttr::get(i32_type, 64));
     }
   }
@@ -154,7 +155,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {

     if (LLVM::isKernel(funcOp)) {
       // Set an attribute to indicate this function is a kernel entry.
-      newFuncOp->setAttr("nvvm.kernel",
+      newFuncOp->setAttr(NVVM::NVVMDialect::getKernelFuncAttrName(),
                          rewriter.getIntegerAttr(type::u1Ty(ctx), 1));
       newFuncOp.setLinkage(LLVM::Linkage::External);
     } else {
@@ -165,13 +166,21 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
         ArrayAttr::get(ctx, rewriter.getStringAttr("noinline")));
       newFuncOp.setLinkage(LLVM::Linkage::Internal);
     }
-    // Set an attribute for reqntidx, it could be used in latter LLVM codegen
-    // for `nvvm.annotation` metadata.
+
+    // Determine the actual number of required warps.
     int numWarps = triton::gpu::lookupNumWarps(funcOp);
     if (auto totalNumWarps = funcOp.getParentOp()->getAttrOfType<IntegerAttr>(
             "ttg.total-num-warps"))
       numWarps = totalNumWarps.getInt();
-    newFuncOp->setAttr("nvvm.reqntid",
+
+    // Set `nvvm.maxnreg` if it was specified on the module.
+    if (Attribute maxnregAttr =
+            funcOp.getParentOp()->getAttr(triton::gpu::AttrMaxRegistersName))
+      newFuncOp->setAttr(NVVM::NVVMDialect::getMaxnregAttrName(), maxnregAttr);
+
+    // Set an attribute for reqntidx; it can be used in later LLVM codegen
+    // for `nvvm.annotation` metadata.
+    newFuncOp->setAttr(NVVM::NVVMDialect::getReqntidAttrName(),
                        rewriter.getDenseI32ArrayAttr(32 * numWarps));

     rewriter.eraseOp(funcOp);
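
A small aside on the attribute-name cleanup in this file: fetching names like `nvvm.kernel` through dialect-provided getters instead of hard-coding strings means a rename or typo fails at compile time rather than producing a silently ignored attribute. A toy sketch of the pattern (the struct and names below are stand-ins, not MLIR's API):

#include <cassert>
#include <string>

// Stand-in for a dialect that owns its attribute names.
struct ToyDialect {
  static constexpr const char *getKernelAttrName() { return "toy.kernel"; }
};

int main() {
  // A typo in a raw literal like "toy.kernl" would only surface at runtime;
  // a typo in getKernelAttrName() fails to compile.
  assert(std::string(ToyDialect::getKernelAttrName()) == "toy.kernel");
}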

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -358,7 +358,7 @@ struct ReduceOpConversion
          resultIdx < resultDim; ++resultIdx) {
       auto smemIdx = resultIdx < op.getAxis() ? resultIdx : resultIdx + 1;
       if (resultShape[resultIdx] > smemShape[smemIdx]) {
-        // When srcShape smaller then src sizePerThread, only srcShape
+        // When srcShape smaller than src sizePerThread, only srcShape
         // elements is accumulated in smem. Modulo smemShape effectively
         // replicates srcShape elements to src sizePerThread.
         readIdx[smemIdx] =
